From 97e892f68c6aeea0ebaa08bdd223fac5ef615c34 Mon Sep 17 00:00:00 2001 From: Avihai Horon Date: Sun, 31 Dec 2023 11:30:06 +0200 Subject: [PATCH 001/134] migration: Remove migrate_max_downtime() declaration commit 17b9483baae595aa1b9d25adceb240bf74fdb1ee upstream. migrate_max_downtime() has been removed long ago, but its declaration was mistakenly left. Remove it. Intel-SIG: commit 17b9483baae5 migration: Remove migrate_max_downtime() declaration Signed-off-by: Avihai Horon Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20231231093016.14204-2-avihaih@nvidia.com Signed-off-by: Peter Xu Conflicts: migration/migration.h [jz: resolve context conflict] Signed-off-by: Jason Zeng --- migration/migration.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/migration/migration.h b/migration/migration.h index eeddb7c0bd..7e461fff24 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -480,8 +480,6 @@ void migration_incoming_process(void); bool migration_has_all_channels(void); -uint64_t migrate_max_downtime(void); - void migrate_fd_error(MigrationState *s, const Error *error); void migrate_set_error(MigrationState *s, const Error *error); bool migrate_has_error(MigrationState *s); -- Gitee From d67c1200d467f2105c02ee37a8519cbe04b57b3b Mon Sep 17 00:00:00 2001 From: Avihai Horon Date: Sun, 31 Dec 2023 11:30:07 +0200 Subject: [PATCH 002/134] migration: Remove nulling of hostname in migrate_init() commit 80caba6955c422ac7f848999cb961846aa2781d1 upstream. MigrationState->hostname is set to NULL in migrate_init(). This is redundant because it is already freed and set to NULL in migrade_fd_cleanup(). Remove it. Intel-SIG: commit 80caba6955c4 migration: Remove nulling of hostname in migrate_init() Signed-off-by: Avihai Horon Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20231231093016.14204-3-avihaih@nvidia.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/migration.c | 1 - 1 file changed, 1 deletion(-) diff --git a/migration/migration.c b/migration/migration.c index dce22c2da5..0863a6ba40 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -1587,7 +1587,6 @@ int migrate_init(MigrationState *s, Error **errp) s->migration_thread_running = false; error_free(s->error); s->error = NULL; - s->hostname = NULL; s->vmdesc = NULL; migrate_set_state(&s->state, MIGRATION_STATUS_NONE, MIGRATION_STATUS_SETUP); -- Gitee From d56b24f726d688ae91faf1733add6e790f2b5285 Mon Sep 17 00:00:00 2001 From: Avihai Horon Date: Sun, 31 Dec 2023 11:30:08 +0200 Subject: [PATCH 003/134] migration: Refactor migration_incoming_setup() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit c606f3baa2de0ee40a3cc69142a23fff130debc3 upstream. Commit 6720c2b32725 ("migration: check magic value for deciding the mapping of channels") extracted the only code that could fail in migration_incoming_setup(). Now migration_incoming_setup() can't fail, so refactor it to return void and remove errp parameter. 
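Condensed from the hunks below, the externally visible shape of the refactor is just the prototype and its call sites (a sketch, not the full function):

    /* before: a bool return that can no longer ever be false */
    static bool migration_incoming_setup(QEMUFile *f, Error **errp);

        if (!migration_incoming_setup(f, errp)) {
            return;
        }

    /* after: the setup step cannot fail, so no return value and no errp */
    static void migration_incoming_setup(QEMUFile *f);

        migration_incoming_setup(f);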
Intel-SIG: commit c606f3baa2de migration: Refactor migration_incoming_setup() Signed-off-by: Avihai Horon Reviewed-by: Fabiano Rosas Reviewed-by: Philippe Mathieu-Daudé Link: https://lore.kernel.org/r/20231231093016.14204-4-avihaih@nvidia.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/migration.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index 0863a6ba40..8e2c2ba034 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -723,11 +723,8 @@ fail: /** * migration_incoming_setup: Setup incoming migration * @f: file for main migration channel - * @errp: where to put errors - * - * Returns: %true on success, %false on error. */ -static bool migration_incoming_setup(QEMUFile *f, Error **errp) +static void migration_incoming_setup(QEMUFile *f) { MigrationIncomingState *mis = migration_incoming_get_current(); @@ -735,7 +732,6 @@ static bool migration_incoming_setup(QEMUFile *f, Error **errp) mis->from_src_file = f; } qemu_file_set_blocking(f, false); - return true; } void migration_incoming_process(void) @@ -779,9 +775,7 @@ static bool postcopy_try_recover(void) void migration_fd_process_incoming(QEMUFile *f, Error **errp) { - if (!migration_incoming_setup(f, errp)) { - return; - } + migration_incoming_setup(f); if (postcopy_try_recover()) { return; } @@ -854,10 +848,7 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp) if (default_channel) { f = qemu_file_new_input(ioc); - - if (!migration_incoming_setup(f, errp)) { - return; - } + migration_incoming_setup(f); } else { /* Multiple connections */ assert(migration_needs_multiple_sockets()); -- Gitee From 5b54331d30eabd9b990ed65d864e61261c9cf411 Mon Sep 17 00:00:00 2001 From: Avihai Horon Date: Sun, 31 Dec 2023 11:30:09 +0200 Subject: [PATCH 004/134] migration: Remove errp parameter in migration_fd_process_incoming() commit b0cf3bfc69752969314f9e88d68f75f40fcdf274 upstream. Errp parameter in migration_fd_process_incoming() is unused. Remove it. 
Intel-SIG: commit b0cf3bfc6975 migration: Remove errp parameter in migration_fd_process_incoming() Signed-off-by: Avihai Horon Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20231231093016.14204-5-avihaih@nvidia.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/migration.c | 2 +- migration/migration.h | 2 +- migration/rdma.c | 6 +----- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index 8e2c2ba034..1feff3bfd6 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -773,7 +773,7 @@ static bool postcopy_try_recover(void) return false; } -void migration_fd_process_incoming(QEMUFile *f, Error **errp) +void migration_fd_process_incoming(QEMUFile *f) { migration_incoming_setup(f); if (postcopy_try_recover()) { diff --git a/migration/migration.h b/migration/migration.h index 7e461fff24..be99e613d2 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -474,7 +474,7 @@ struct MigrationState { void migrate_set_state(int *state, int old_state, int new_state); -void migration_fd_process_incoming(QEMUFile *f, Error **errp); +void migration_fd_process_incoming(QEMUFile *f); void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp); void migration_incoming_process(void); diff --git a/migration/rdma.c b/migration/rdma.c index 04debab5d9..94c0f871f0 100644 --- a/migration/rdma.c +++ b/migration/rdma.c @@ -4035,7 +4035,6 @@ static void rdma_accept_incoming_migration(void *opaque) { RDMAContext *rdma = opaque; QEMUFile *f; - Error *local_err = NULL; trace_qemu_rdma_accept_incoming_migration(); if (qemu_rdma_accept(rdma) < 0) { @@ -4057,10 +4056,7 @@ static void rdma_accept_incoming_migration(void *opaque) } rdma->migration_started_on_destination = 1; - migration_fd_process_incoming(f, &local_err); - if (local_err) { - error_reportf_err(local_err, "RDMA ERROR:"); - } + migration_fd_process_incoming(f); } void rdma_start_incoming_migration(InetSocketAddress *host_port, -- Gitee From a38c84e1a487d282bb58562a87aac61668ec7d1c Mon Sep 17 00:00:00 2001 From: Avihai Horon Date: Sun, 31 Dec 2023 11:30:10 +0200 Subject: [PATCH 005/134] migration/multifd: Fix error message in multifd_recv_initial_packet() commit c77b40859a5201f01b44dc475258405e289c431f upstream. In multifd_recv_initial_packet(), if MultiFDInit_t->id is greater than the configured number of multifd channels, an irrelevant error message about multifd version is printed. Change the error message to a relevant one about the channel id. 
Intel-SIG: commit c77b40859a52 migration/multifd: Fix error message in multifd_recv_initial_packet() Signed-off-by: Avihai Horon Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20231231093016.14204-6-avihaih@nvidia.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 7d373a245e..647ce08c08 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -229,8 +229,8 @@ static int multifd_recv_initial_packet(QIOChannel *c, Error **errp) } if (msg.id > migrate_multifd_channels()) { - error_setg(errp, "multifd: received channel version %u " - "expected %u", msg.version, MULTIFD_VERSION); + error_setg(errp, "multifd: received channel id %u is greater than " + "number of channels %u", msg.id, migrate_multifd_channels()); return -1; } -- Gitee From 2047996bc711902f86690ea5cd43d18cf15aaa51 Mon Sep 17 00:00:00 2001 From: Avihai Horon Date: Sun, 31 Dec 2023 11:30:11 +0200 Subject: [PATCH 006/134] migration/multifd: Simplify multifd_channel_connect() if else statement MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit a4395f5d3c06472ed70d9ef9f79878f95575be9e upstream. The else branch in multifd_channel_connect() is redundant because when the if branch is taken the function returns. Simplify the code by removing the else branch. Intel-SIG: commit a4395f5d3c06 migration/multifd: Simplify multifd_channel_connect() if else statement Signed-off-by: Avihai Horon Reviewed-by: Philippe Mathieu-Daudé Link: https://lore.kernel.org/r/20231231093016.14204-7-avihaih@nvidia.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 647ce08c08..ea507c5a0d 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -851,14 +851,13 @@ static bool multifd_channel_connect(MultiFDSendParams *p, * so we mustn't call multifd_send_thread until then */ return multifd_tls_channel_connect(p, ioc, errp); - - } else { - migration_ioc_register_yank(ioc); - p->registered_yank = true; - p->c = ioc; - qemu_thread_create(&p->thread, p->name, multifd_send_thread, p, - QEMU_THREAD_JOINABLE); } + + migration_ioc_register_yank(ioc); + p->registered_yank = true; + p->c = ioc; + qemu_thread_create(&p->thread, p->name, multifd_send_thread, p, + QEMU_THREAD_JOINABLE); return true; } -- Gitee From 324e2c39d6cbae1ef26f9bd1e9d5f223e87eaa60 Mon Sep 17 00:00:00 2001 From: Avihai Horon Date: Sun, 31 Dec 2023 11:30:12 +0200 Subject: [PATCH 007/134] migration/multifd: Fix leaking of Error in TLS error flow commit 6ae208ce9656114e428b1a75ac62a6761ed3216c upstream. If there is an error in multifd TLS handshake task, multifd_tls_outgoing_handshake() retrieves the error with qio_task_propagate_error() but never frees it. Fix it by freeing the obtained Error. In addition, the error is not reported at all, so report it with migrate_set_error(). 
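The ownership rule behind this fix, as a minimal sketch (hypothetical callback name; the real handler is multifd_tls_outgoing_handshake(), shown in the hunk below): qio_task_propagate_error() hands the Error to the caller, and migrate_set_error() only stores its own copy, so the local Error still has to be freed explicitly.

    static void tls_handshake_done(QIOTask *task, gpointer opaque)
    {
        Error *err = NULL;

        if (qio_task_propagate_error(task, &err)) {
            /* migrate_set_error() stores a copy of err ... */
            migrate_set_error(migrate_get_current(), err);
            /* ... so ownership stays here and err must still be freed */
            error_free(err);
            /* the real handler also marks the channel as failed (p->quit)
             * and wakes up the threads waiting on it */
        }
    }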
Intel-SIG: commit 6ae208ce9656 migration/multifd: Fix leaking of Error in TLS error flow Fixes: 29647140157a ("migration/tls: add support for multifd tls-handshake") Signed-off-by: Avihai Horon Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20231231093016.14204-8-avihaih@nvidia.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/migration/multifd.c b/migration/multifd.c index ea507c5a0d..4013ba71fd 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -791,6 +791,7 @@ static void multifd_tls_outgoing_handshake(QIOTask *task, trace_multifd_tls_outgoing_handshake_error(ioc, error_get_pretty(err)); + migrate_set_error(migrate_get_current(), err); /* * Error happen, mark multifd_send_thread status as 'quit' although it * is not created, and then tell who pay attention to me. @@ -798,6 +799,7 @@ static void multifd_tls_outgoing_handshake(QIOTask *task, p->quit = true; qemu_sem_post(&multifd_send_state->channels_ready); qemu_sem_post(&p->sem_sync); + error_free(err); } static void *multifd_tls_handshake_thread(void *opaque) -- Gitee From f51304e106e4550e8bc45404ff92981ebfd3695e Mon Sep 17 00:00:00 2001 From: Avihai Horon Date: Sun, 31 Dec 2023 11:30:13 +0200 Subject: [PATCH 008/134] migration/multifd: Remove error_setg() in migration_ioc_process_incoming() commit 1d3886f837d8e972366a8b58ba8afb0e5efbeed7 upstream. If multifd_load_setup() fails in migration_ioc_process_incoming(), error_setg() is called with errp. This will lead to an assert because in that case errp already contains an error. Fix it by removing the redundant error_setg(). Intel-SIG: commit 1d3886f837d8 migration/multifd: Remove error_setg() in migration_ioc_process_incoming() Fixes: 6720c2b32725 ("migration: check magic value for deciding the mapping of channels") Signed-off-by: Avihai Horon Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20231231093016.14204-9-avihaih@nvidia.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/migration.c | 1 - 1 file changed, 1 deletion(-) diff --git a/migration/migration.c b/migration/migration.c index 1feff3bfd6..94c6ed2230 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -842,7 +842,6 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp) } if (multifd_load_setup(errp) != 0) { - error_setg(errp, "Failed to setup multifd channels"); return; } -- Gitee From fea7dd740c5a38ba2bb83ac408276d4603582e2c Mon Sep 17 00:00:00 2001 From: Avihai Horon Date: Sun, 31 Dec 2023 11:30:14 +0200 Subject: [PATCH 009/134] migration: Fix migration_channel_read_peek() error path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 4f8cf323e80c17f7d4b5604f1699591326df6262 upstream. migration_channel_read_peek() calls qio_channel_readv_full() and handles both cases of return value == 0 and return value < 0 the same way, by calling error_setg() with errp. However, if return value < 0, errp is already set, so calling error_setg() with errp will lead to an assert. Fix it by handling these cases separately, calling error_setg() with errp only in return value == 0 case. 
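The general rule being applied: error_setg() must not be called on an errp that may already hold an error (error_setv() asserts *errp == NULL). Condensed from the hunk below, the corrected flow looks like this; only the EOF case, which the channel layer does not treat as an error, gets a new message:

    /* inside migration_channel_read_peek(); errp belongs to the caller */
    len = qio_channel_readv_full(ioc, &iov, 1, NULL, NULL,
                                 QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp);
    if (len < 0 && len != QIO_CHANNEL_ERR_BLOCK) {
        return -1;                        /* errp already set by the callee */
    }
    if (len == 0) {
        error_setg(errp, "Failed to peek at channel");  /* EOF: set it here */
        return -1;
    }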
Intel-SIG: commit 4f8cf323e80c migration: Fix migration_channel_read_peek() error path Fixes: 6720c2b32725 ("migration: check magic value for deciding the mapping of channels") Signed-off-by: Avihai Horon Reviewed-by: Fabiano Rosas Reviewed-by: Philippe Mathieu-Daudé Link: https://lore.kernel.org/r/20231231093016.14204-10-avihaih@nvidia.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/channel.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/migration/channel.c b/migration/channel.c index ca3319a309..f9de064f3b 100644 --- a/migration/channel.c +++ b/migration/channel.c @@ -117,9 +117,12 @@ int migration_channel_read_peek(QIOChannel *ioc, len = qio_channel_readv_full(ioc, &iov, 1, NULL, NULL, QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp); - if (len <= 0 && len != QIO_CHANNEL_ERR_BLOCK) { - error_setg(errp, - "Failed to peek at channel"); + if (len < 0 && len != QIO_CHANNEL_ERR_BLOCK) { + return -1; + } + + if (len == 0) { + error_setg(errp, "Failed to peek at channel"); return -1; } -- Gitee From 9d0cca81d9e7183bb43a82208922f07bb9280b52 Mon Sep 17 00:00:00 2001 From: Avihai Horon Date: Sun, 31 Dec 2023 11:30:15 +0200 Subject: [PATCH 010/134] migration: Remove unnecessary usage of local Error commit b6f4c0c788d5a037197a0ed409e131ed2cec274a upstream. According to Error API, usage of ERRP_GUARD() or a local Error instead of errp is needed if errp is passed to void functions, where it is later dereferenced to see if an error occurred. There are several places in migration.c that use local Error although it is not needed. Change these places to use errp directly. Intel-SIG: commit b6f4c0c788d5 migration: Remove unnecessary usage of local Error Signed-off-by: Avihai Horon Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20231231093016.14204-11-avihaih@nvidia.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/migration.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index 94c6ed2230..ff309b48f7 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -829,10 +829,9 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp) * issue is not possible. */ ret = migration_channel_read_peek(ioc, (void *)&channel_magic, - sizeof(channel_magic), &local_err); + sizeof(channel_magic), errp); if (ret != 0) { - error_propagate(errp, local_err); return; } @@ -1824,8 +1823,6 @@ bool migration_is_blocked(Error **errp) static bool migrate_prepare(MigrationState *s, bool blk, bool blk_inc, bool resume, Error **errp) { - Error *local_err = NULL; - if (blk_inc) { warn_report("parameter 'inc' is deprecated;" " use blockdev-mirror with NBD instead"); @@ -1895,8 +1892,7 @@ static bool migrate_prepare(MigrationState *s, bool blk, bool blk_inc, "current migration capabilities"); return false; } - if (!migrate_cap_set(MIGRATION_CAPABILITY_BLOCK, true, &local_err)) { - error_propagate(errp, local_err); + if (!migrate_cap_set(MIGRATION_CAPABILITY_BLOCK, true, errp)) { return false; } s->must_remove_block_options = true; -- Gitee From 657a6374d4230b6a2d888e92c5363b0903500291 Mon Sep 17 00:00:00 2001 From: Avihai Horon Date: Sun, 31 Dec 2023 11:30:16 +0200 Subject: [PATCH 011/134] migration/multifd: Remove unnecessary usage of local Error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 3fc58efa938338a82e4d5c0c031e7e9c98e9544f upstream. 
According to Error API, usage of ERRP_GUARD() or a local Error instead of errp is needed if errp is passed to void functions, where it is later dereferenced to see if an error occurred. There are several places in multifd.c that use local Error although it is not needed. Change these places to use errp directly. Intel-SIG: commit 3fc58efa9383 migration/multifd: Remove unnecessary usage of local Error Signed-off-by: Avihai Horon Reviewed-by: Philippe Mathieu-Daudé Link: https://lore.kernel.org/r/20231231093016.14204-12-avihaih@nvidia.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 4013ba71fd..91dcb25120 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -955,12 +955,10 @@ int multifd_save_setup(Error **errp) for (i = 0; i < thread_count; i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; - Error *local_err = NULL; int ret; - ret = multifd_send_state->ops->send_setup(p, &local_err); + ret = multifd_send_state->ops->send_setup(p, errp); if (ret) { - error_propagate(errp, local_err); return ret; } } @@ -1199,12 +1197,10 @@ int multifd_load_setup(Error **errp) for (i = 0; i < thread_count; i++) { MultiFDRecvParams *p = &multifd_recv_state->params[i]; - Error *local_err = NULL; int ret; - ret = multifd_recv_state->ops->recv_setup(p, &local_err); + ret = multifd_recv_state->ops->recv_setup(p, errp); if (ret) { - error_propagate(errp, local_err); return ret; } } -- Gitee From df3aa34e806f9c4dd21ac4a0f26e3338551fec7f Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 4 Jan 2024 11:21:38 -0300 Subject: [PATCH 012/134] migration/multifd: Remove MultiFDPages_t::packet_num commit dca1bc7f24d2fa227f0b787f85f3cc67006e67bf upstream. This was introduced by commit 34c55a94b1 ("migration: Create multipage support") and never used. Intel-SIG: commit dca1bc7f24d2 migration/multifd: Remove MultiFDPages_t::packet_num Signed-off-by: Fabiano Rosas Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20240104142144.9680-2-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 1 - migration/multifd.h | 2 -- 2 files changed, 3 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 91dcb25120..2520e2706c 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -251,7 +251,6 @@ static void multifd_pages_clear(MultiFDPages_t *pages) { pages->num = 0; pages->allocated = 0; - pages->packet_num = 0; pages->block = NULL; g_free(pages->offset); pages->offset = NULL; diff --git a/migration/multifd.h b/migration/multifd.h index a835643b48..b0ff610c37 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -58,8 +58,6 @@ typedef struct { uint32_t num; /* number of allocated pages */ uint32_t allocated; - /* global number of generated multifd packets */ - uint64_t packet_num; /* offset of each page */ ram_addr_t *offset; RAMBlock *block; -- Gitee From c24e7cc56e29da1d068c289092bda3507345b172 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 4 Jan 2024 11:21:39 -0300 Subject: [PATCH 013/134] migration/multifd: Remove QEMUFile from where it is not needed commit 9346fa1870784c70618bfd5a9e1f1da89de0c5ec upstream. 
Intel-SIG: commit 9346fa187078 migration/multifd: Remove QEMUFile from where it is not needed Signed-off-by: Fabiano Rosas Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20240104142144.9680-3-farosas@suse.de Signed-off-by: Peter Xu Conflicts: migration/ram.c [jz: resolve context conflict] Signed-off-by: Jason Zeng --- migration/multifd.c | 12 ++++++------ migration/multifd.h | 4 ++-- migration/ram.c | 15 +++++++-------- 3 files changed, 15 insertions(+), 16 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 2520e2706c..03855772f7 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -391,7 +391,7 @@ struct { * false. */ -static int multifd_send_pages(QEMUFile *f) +static int multifd_send_pages(void) { int i; static int next_channel; @@ -437,7 +437,7 @@ static int multifd_send_pages(QEMUFile *f) return 1; } -int multifd_queue_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset) +int multifd_queue_page(RAMBlock *block, ram_addr_t offset) { MultiFDPages_t *pages = multifd_send_state->pages; bool changed = false; @@ -457,12 +457,12 @@ int multifd_queue_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset) changed = true; } - if (multifd_send_pages(f) < 0) { + if (multifd_send_pages() < 0) { return -1; } if (changed) { - return multifd_queue_page(f, block, offset); + return multifd_queue_page(block, offset); } return 1; @@ -584,7 +584,7 @@ static int multifd_zero_copy_flush(QIOChannel *c) return ret; } -int multifd_send_sync_main(QEMUFile *f) +int multifd_send_sync_main(void) { int i; bool flush_zero_copy; @@ -593,7 +593,7 @@ int multifd_send_sync_main(QEMUFile *f) return 0; } if (multifd_send_state->pages->num) { - if (multifd_send_pages(f) < 0) { + if (multifd_send_pages() < 0) { error_report("%s: multifd_send_pages fail", __func__); return -1; } diff --git a/migration/multifd.h b/migration/multifd.h index b0ff610c37..35d11f103c 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -21,8 +21,8 @@ void multifd_load_shutdown(void); bool multifd_recv_all_channels_created(void); void multifd_recv_new_channel(QIOChannel *ioc, Error **errp); void multifd_recv_sync_main(void); -int multifd_send_sync_main(QEMUFile *f); -int multifd_queue_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset); +int multifd_send_sync_main(void); +int multifd_queue_page(RAMBlock *block, ram_addr_t offset); /* Multifd Compression flags */ #define MULTIFD_FLAG_SYNC (1 << 0) diff --git a/migration/ram.c b/migration/ram.c index 1377b9eb37..9ad6e55f13 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -1386,10 +1386,9 @@ static int ram_save_page(RAMState *rs, PageSearchStatus *pss) return pages; } -static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block, - ram_addr_t offset) +static int ram_save_multifd_page(RAMBlock *block, ram_addr_t offset) { - if (multifd_queue_page(file, block, offset) < 0) { + if (multifd_queue_page(block, offset) < 0) { return -1; } stat64_add(&mig_stats.normal_pages, 1); @@ -1472,7 +1471,7 @@ static int find_dirty_block(RAMState *rs, PageSearchStatus *pss) if (migrate_multifd() && !migrate_multifd_flush_after_each_section()) { QEMUFile *f = rs->pss[RAM_CHANNEL_PRECOPY].pss_channel; - int ret = multifd_send_sync_main(f); + int ret = multifd_send_sync_main(); if (ret < 0) { return ret; } @@ -2264,7 +2263,7 @@ static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss) * still see partially copied pages which is data corruption. 
*/ if (migrate_multifd() && !migration_in_postcopy()) { - return ram_save_multifd_page(pss->pss_channel, block, offset); + return ram_save_multifd_page(block, offset); } return ram_save_page(rs, pss); @@ -3319,7 +3318,7 @@ static int ram_save_setup(QEMUFile *f, void *opaque) migration_ops->ram_save_target_page = ram_save_target_page_legacy; qemu_mutex_unlock_iothread(); - ret = multifd_send_sync_main(f); + ret = multifd_send_sync_main(); qemu_mutex_lock_iothread(); if (ret < 0) { return ret; @@ -3443,7 +3442,7 @@ out: if (ret >= 0 && migration_is_setup_or_active(migrate_get_current()->state)) { if (migrate_multifd() && migrate_multifd_flush_after_each_section()) { - ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel); + ret = multifd_send_sync_main(); if (ret < 0) { return ret; } @@ -3539,7 +3538,7 @@ static int ram_save_complete(QEMUFile *f, void *opaque) } } - ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel); + ret = multifd_send_sync_main(); if (ret < 0) { return ret; } -- Gitee From 47733ad42f59c71f1431f4464aaee9f9befbd4cd Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 4 Jan 2024 11:21:40 -0300 Subject: [PATCH 014/134] migration/multifd: Change multifd_pages_init argument commit 6074f81625800743e4c374aecf7dd30774aaf6e0 upstream. The 'size' argument is actually the number of pages that fit in a multifd packet. Change it to uint32_t and rename. Intel-SIG: commit 6074f8162580 migration/multifd: Change multifd_pages_init argument Signed-off-by: Fabiano Rosas Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20240104142144.9680-4-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 03855772f7..eaae772321 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -237,12 +237,12 @@ static int multifd_recv_initial_packet(QIOChannel *c, Error **errp) return msg.id; } -static MultiFDPages_t *multifd_pages_init(size_t size) +static MultiFDPages_t *multifd_pages_init(uint32_t n) { MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1); - pages->allocated = size; - pages->offset = g_new0(ram_addr_t, size); + pages->allocated = n; + pages->offset = g_new0(ram_addr_t, n); return pages; } -- Gitee From 4465d265f3cfab47440fae68e9388669ebe3c768 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 4 Jan 2024 11:21:41 -0300 Subject: [PATCH 015/134] migration: Report error in incoming migration commit e3b8ad5c13714cca5e3fc1445472171fbcd469bc upstream. We're not currently reporting the errors set with migrate_set_error() when incoming migration fails. 
Intel-SIG: commit e3b8ad5c1371 migration: Report error in incoming migration Signed-off-by: Fabiano Rosas Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20240104142144.9680-5-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/migration.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/migration/migration.c b/migration/migration.c index ff309b48f7..a4db2615cc 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -698,6 +698,13 @@ process_incoming_migration_co(void *opaque) } if (ret < 0) { + MigrationState *s = migrate_get_current(); + + if (migrate_has_error(s)) { + WITH_QEMU_LOCK_GUARD(&s->error_mutex) { + error_report_err(s->error); + } + } error_report("load of migration failed: %s", strerror(-ret)); goto fail; } -- Gitee From 8ecf4a07638fb4e06ea6e68a08e63133cbc2ccf0 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 4 Jan 2024 11:21:42 -0300 Subject: [PATCH 016/134] tests/qtest/migration: Print migration incoming errors commit 679a7382a389875c0f7835a1a409ebf4859f8410 upstream. We're currently just asserting when incoming migration fails. Let's print the error message from QMP as well. Intel-SIG: commit 679a7382a389 tests/qtest/migration: Print migration incoming errors Signed-off-by: Fabiano Rosas Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20240104142144.9680-6-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- tests/qtest/migration-helpers.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/qtest/migration-helpers.c b/tests/qtest/migration-helpers.c index 24fb7b3525..f1106128a9 100644 --- a/tests/qtest/migration-helpers.c +++ b/tests/qtest/migration-helpers.c @@ -118,6 +118,12 @@ void migrate_incoming_qmp(QTestState *to, const char *uri, const char *fmt, ...) rsp = qtest_qmp(to, "{ 'execute': 'migrate-incoming', 'arguments': %p}", args); + + if (!qdict_haskey(rsp, "return")) { + g_autoptr(GString) s = qobject_to_json_pretty(QOBJECT(rsp), true); + g_test_message("%s", s->str); + } + g_assert(qdict_haskey(rsp, "return")); qobject_unref(rsp); -- Gitee From e330fd05537c94dc41abf24e6760fc9824c8923c Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 4 Jan 2024 11:21:43 -0300 Subject: [PATCH 017/134] tests/qtest/migration: Add a wrapper to print test names commit e33b6712dba206547a313a6f2608b0fd967ee558 upstream. Our usage of gtest results in us losing the very basic functionality of "knowing which test failed". The issue is that gtest only prints test names ("paths" in gtest parlance) once the test has finished, but we use asserts in the tests and crash gtest itself before it can print anything. We also use a final abort when the result of g_test_run is not 0. Depending on how the test failed/broke we can see the function that trigged the abort, which may be representative of the test, but it could also just be some generic function. We have been relying on the primitive method of looking at the name of the previous successful test and then looking at the code to figure out which test should have come next. Add a wrapper to the test registration that does the job of printing the test name before running. 
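Usage is a drop-in replacement for qtest_add_func(); the conversion of the existing registrations is done in a later patch. A sketch of the effect, with the output format taken from the helper added below:

    /* before */
    qtest_add_func("/migration/bad_dest", test_baddest);

    /* after: emits "Running /x86_64/migration/bad_dest" (arch prefix from
     * qtest_get_arch()) via g_test_message() before test_baddest() runs */
    migration_test_add("/migration/bad_dest", test_baddest);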
Intel-SIG: commit e33b6712dba2 tests/qtest/migration: Add a wrapper to print test names Signed-off-by: Fabiano Rosas Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20240104142144.9680-7-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- tests/qtest/migration-helpers.c | 32 ++++++++++++++++++++++++++++++++ tests/qtest/migration-helpers.h | 1 + 2 files changed, 33 insertions(+) diff --git a/tests/qtest/migration-helpers.c b/tests/qtest/migration-helpers.c index f1106128a9..164e09c299 100644 --- a/tests/qtest/migration-helpers.c +++ b/tests/qtest/migration-helpers.c @@ -298,3 +298,35 @@ char *resolve_machine_version(const char *alias, const char *var1, return find_common_machine_version(machine_name, var1, var2); } + +typedef struct { + char *name; + void (*func)(void); +} MigrationTest; + +static void migration_test_destroy(gpointer data) +{ + MigrationTest *test = (MigrationTest *)data; + + g_free(test->name); + g_free(test); +} + +static void migration_test_wrapper(const void *data) +{ + MigrationTest *test = (MigrationTest *)data; + + g_test_message("Running /%s%s", qtest_get_arch(), test->name); + test->func(); +} + +void migration_test_add(const char *path, void (*fn)(void)) +{ + MigrationTest *test = g_new0(MigrationTest, 1); + + test->func = fn; + test->name = g_strdup(path); + + qtest_add_data_func_full(path, test, migration_test_wrapper, + migration_test_destroy); +} diff --git a/tests/qtest/migration-helpers.h b/tests/qtest/migration-helpers.h index e31dc85cc7..0d9a02edc7 100644 --- a/tests/qtest/migration-helpers.h +++ b/tests/qtest/migration-helpers.h @@ -47,4 +47,5 @@ char *find_common_machine_version(const char *mtype, const char *var1, const char *var2); char *resolve_machine_version(const char *alias, const char *var1, const char *var2); +void migration_test_add(const char *path, void (*fn)(void)); #endif /* MIGRATION_HELPERS_H */ -- Gitee From 47356a0848d75a0f67d59238bbe59f837c1b7947 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 4 Jan 2024 11:21:44 -0300 Subject: [PATCH 018/134] tests/qtest/migration: Use the new migration_test_add commit 6f0771de903bb7623dc85bbf9f94f641979daaaa upstream. Replace the tests registration with the new function that prints tests names. 
Intel-SIG: commit 6f0771de903b tests/qtest/migration: Use the new migration_test_add Signed-off-by: Fabiano Rosas Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20240104142144.9680-8-farosas@suse.de Signed-off-by: Peter Xu Conflicts: tests/qtest/migration-test.c [jz: resolve context conflicts due to live-suspend] Signed-off-by: Jason Zeng --- tests/qtest/migration-test.c | 201 ++++++++++++++++++----------------- 1 file changed, 104 insertions(+), 97 deletions(-) diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c index 0fbaa6a90f..4884a40be2 100644 --- a/tests/qtest/migration-test.c +++ b/tests/qtest/migration-test.c @@ -3339,62 +3339,65 @@ int main(int argc, char **argv) module_call_init(MODULE_INIT_QOM); if (has_uffd) { - qtest_add_func("/migration/postcopy/plain", test_postcopy); - qtest_add_func("/migration/postcopy/recovery/plain", - test_postcopy_recovery); - qtest_add_func("/migration/postcopy/preempt/plain", test_postcopy_preempt); - qtest_add_func("/migration/postcopy/preempt/recovery/plain", - test_postcopy_preempt_recovery); + migration_test_add("/migration/postcopy/plain", test_postcopy); + migration_test_add("/migration/postcopy/recovery/plain", + test_postcopy_recovery); + migration_test_add("/migration/postcopy/preempt/plain", + test_postcopy_preempt); + migration_test_add("/migration/postcopy/preempt/recovery/plain", + test_postcopy_preempt_recovery); if (getenv("QEMU_TEST_FLAKY_TESTS")) { - qtest_add_func("/migration/postcopy/compress/plain", - test_postcopy_compress); - qtest_add_func("/migration/postcopy/recovery/compress/plain", - test_postcopy_recovery_compress); + migration_test_add("/migration/postcopy/compress/plain", + test_postcopy_compress); + migration_test_add("/migration/postcopy/recovery/compress/plain", + test_postcopy_recovery_compress); } #ifndef _WIN32 - qtest_add_func("/migration/postcopy/recovery/double-failures", - test_postcopy_recovery_double_fail); + migration_test_add("/migration/postcopy/recovery/double-failures", + test_postcopy_recovery_double_fail); #endif /* _WIN32 */ } - qtest_add_func("/migration/bad_dest", test_baddest); + migration_test_add("/migration/bad_dest", test_baddest); #ifndef _WIN32 if (!g_str_equal(arch, "s390x")) { - qtest_add_func("/migration/analyze-script", test_analyze_script); + migration_test_add("/migration/analyze-script", test_analyze_script); } #endif - qtest_add_func("/migration/precopy/unix/plain", test_precopy_unix_plain); - qtest_add_func("/migration/precopy/unix/xbzrle", test_precopy_unix_xbzrle); + migration_test_add("/migration/precopy/unix/plain", + test_precopy_unix_plain); + migration_test_add("/migration/precopy/unix/xbzrle", + test_precopy_unix_xbzrle); /* * Compression fails from time to time. * Put test here but don't enable it until everything is fixed. 
*/ if (getenv("QEMU_TEST_FLAKY_TESTS")) { - qtest_add_func("/migration/precopy/unix/compress/wait", - test_precopy_unix_compress); - qtest_add_func("/migration/precopy/unix/compress/nowait", - test_precopy_unix_compress_nowait); + migration_test_add("/migration/precopy/unix/compress/wait", + test_precopy_unix_compress); + migration_test_add("/migration/precopy/unix/compress/nowait", + test_precopy_unix_compress_nowait); } - qtest_add_func("/migration/precopy/file", - test_precopy_file); - qtest_add_func("/migration/precopy/file/offset", - test_precopy_file_offset); - qtest_add_func("/migration/precopy/file/offset/bad", - test_precopy_file_offset_bad); + migration_test_add("/migration/precopy/file", + test_precopy_file); + migration_test_add("/migration/precopy/file/offset", + test_precopy_file_offset); + migration_test_add("/migration/precopy/file/offset/bad", + test_precopy_file_offset_bad); /* * Our CI system has problems with shared memory. * Don't run this test until we find a workaround. */ if (getenv("QEMU_TEST_FLAKY_TESTS")) { - qtest_add_func("/migration/mode/reboot", test_mode_reboot); + migration_test_add("/migration/mode/reboot", test_mode_reboot); } #ifdef CONFIG_GNUTLS - qtest_add_func("/migration/precopy/unix/tls/psk", - test_precopy_unix_tls_psk); + migration_test_add("/migration/precopy/unix/tls/psk", + test_precopy_unix_tls_psk); if (has_uffd) { /* @@ -3402,110 +3405,114 @@ int main(int argc, char **argv) * channels are tested under precopy. Here what we want to test is the * general postcopy path that has TLS channel enabled. */ - qtest_add_func("/migration/postcopy/tls/psk", test_postcopy_tls_psk); - qtest_add_func("/migration/postcopy/recovery/tls/psk", - test_postcopy_recovery_tls_psk); - qtest_add_func("/migration/postcopy/preempt/tls/psk", - test_postcopy_preempt_tls_psk); - qtest_add_func("/migration/postcopy/preempt/recovery/tls/psk", - test_postcopy_preempt_all); + migration_test_add("/migration/postcopy/tls/psk", + test_postcopy_tls_psk); + migration_test_add("/migration/postcopy/recovery/tls/psk", + test_postcopy_recovery_tls_psk); + migration_test_add("/migration/postcopy/preempt/tls/psk", + test_postcopy_preempt_tls_psk); + migration_test_add("/migration/postcopy/preempt/recovery/tls/psk", + test_postcopy_preempt_all); } #ifdef CONFIG_TASN1 - qtest_add_func("/migration/precopy/unix/tls/x509/default-host", - test_precopy_unix_tls_x509_default_host); - qtest_add_func("/migration/precopy/unix/tls/x509/override-host", - test_precopy_unix_tls_x509_override_host); + migration_test_add("/migration/precopy/unix/tls/x509/default-host", + test_precopy_unix_tls_x509_default_host); + migration_test_add("/migration/precopy/unix/tls/x509/override-host", + test_precopy_unix_tls_x509_override_host); #endif /* CONFIG_TASN1 */ #endif /* CONFIG_GNUTLS */ - qtest_add_func("/migration/precopy/tcp/plain", test_precopy_tcp_plain); + migration_test_add("/migration/precopy/tcp/plain", test_precopy_tcp_plain); - qtest_add_func("/migration/precopy/tcp/plain/switchover-ack", - test_precopy_tcp_switchover_ack); + migration_test_add("/migration/precopy/tcp/plain/switchover-ack", + test_precopy_tcp_switchover_ack); #ifdef CONFIG_GNUTLS - qtest_add_func("/migration/precopy/tcp/tls/psk/match", - test_precopy_tcp_tls_psk_match); - qtest_add_func("/migration/precopy/tcp/tls/psk/mismatch", - test_precopy_tcp_tls_psk_mismatch); + migration_test_add("/migration/precopy/tcp/tls/psk/match", + test_precopy_tcp_tls_psk_match); + migration_test_add("/migration/precopy/tcp/tls/psk/mismatch", + 
test_precopy_tcp_tls_psk_mismatch); #ifdef CONFIG_TASN1 - qtest_add_func("/migration/precopy/tcp/tls/x509/default-host", - test_precopy_tcp_tls_x509_default_host); - qtest_add_func("/migration/precopy/tcp/tls/x509/override-host", - test_precopy_tcp_tls_x509_override_host); - qtest_add_func("/migration/precopy/tcp/tls/x509/mismatch-host", - test_precopy_tcp_tls_x509_mismatch_host); - qtest_add_func("/migration/precopy/tcp/tls/x509/friendly-client", - test_precopy_tcp_tls_x509_friendly_client); - qtest_add_func("/migration/precopy/tcp/tls/x509/hostile-client", - test_precopy_tcp_tls_x509_hostile_client); - qtest_add_func("/migration/precopy/tcp/tls/x509/allow-anon-client", - test_precopy_tcp_tls_x509_allow_anon_client); - qtest_add_func("/migration/precopy/tcp/tls/x509/reject-anon-client", - test_precopy_tcp_tls_x509_reject_anon_client); + migration_test_add("/migration/precopy/tcp/tls/x509/default-host", + test_precopy_tcp_tls_x509_default_host); + migration_test_add("/migration/precopy/tcp/tls/x509/override-host", + test_precopy_tcp_tls_x509_override_host); + migration_test_add("/migration/precopy/tcp/tls/x509/mismatch-host", + test_precopy_tcp_tls_x509_mismatch_host); + migration_test_add("/migration/precopy/tcp/tls/x509/friendly-client", + test_precopy_tcp_tls_x509_friendly_client); + migration_test_add("/migration/precopy/tcp/tls/x509/hostile-client", + test_precopy_tcp_tls_x509_hostile_client); + migration_test_add("/migration/precopy/tcp/tls/x509/allow-anon-client", + test_precopy_tcp_tls_x509_allow_anon_client); + migration_test_add("/migration/precopy/tcp/tls/x509/reject-anon-client", + test_precopy_tcp_tls_x509_reject_anon_client); #endif /* CONFIG_TASN1 */ #endif /* CONFIG_GNUTLS */ - /* qtest_add_func("/migration/ignore_shared", test_ignore_shared); */ + /* migration_test_add("/migration/ignore_shared", test_ignore_shared); */ #ifndef _WIN32 - qtest_add_func("/migration/fd_proto", test_migrate_fd_proto); + migration_test_add("/migration/fd_proto", test_migrate_fd_proto); #endif - qtest_add_func("/migration/validate_uuid", test_validate_uuid); - qtest_add_func("/migration/validate_uuid_error", test_validate_uuid_error); - qtest_add_func("/migration/validate_uuid_src_not_set", - test_validate_uuid_src_not_set); - qtest_add_func("/migration/validate_uuid_dst_not_set", - test_validate_uuid_dst_not_set); + migration_test_add("/migration/validate_uuid", test_validate_uuid); + migration_test_add("/migration/validate_uuid_error", + test_validate_uuid_error); + migration_test_add("/migration/validate_uuid_src_not_set", + test_validate_uuid_src_not_set); + migration_test_add("/migration/validate_uuid_dst_not_set", + test_validate_uuid_dst_not_set); /* * See explanation why this test is slow on function definition */ if (g_test_slow()) { - qtest_add_func("/migration/auto_converge", test_migrate_auto_converge); + migration_test_add("/migration/auto_converge", + test_migrate_auto_converge); if (g_str_equal(arch, "x86_64") && has_kvm && kvm_dirty_ring_supported()) { - qtest_add_func("/migration/dirty_limit", test_migrate_dirty_limit); + migration_test_add("/migration/dirty_limit", + test_migrate_dirty_limit); } } - qtest_add_func("/migration/multifd/tcp/plain/none", - test_multifd_tcp_none); + migration_test_add("/migration/multifd/tcp/plain/none", + test_multifd_tcp_none); /* * This test is flaky and sometimes fails in CI and otherwise: * don't run unless user opts in via environment variable. 
*/ if (getenv("QEMU_TEST_FLAKY_TESTS")) { - qtest_add_func("/migration/multifd/tcp/plain/cancel", - test_multifd_tcp_cancel); + migration_test_add("/migration/multifd/tcp/plain/cancel", + test_multifd_tcp_cancel); } - qtest_add_func("/migration/multifd/tcp/plain/zlib", - test_multifd_tcp_zlib); + migration_test_add("/migration/multifd/tcp/plain/zlib", + test_multifd_tcp_zlib); #ifdef CONFIG_ZSTD - qtest_add_func("/migration/multifd/tcp/plain/zstd", - test_multifd_tcp_zstd); + migration_test_add("/migration/multifd/tcp/plain/zstd", + test_multifd_tcp_zstd); #endif #ifdef CONFIG_GNUTLS - qtest_add_func("/migration/multifd/tcp/tls/psk/match", - test_multifd_tcp_tls_psk_match); - qtest_add_func("/migration/multifd/tcp/tls/psk/mismatch", - test_multifd_tcp_tls_psk_mismatch); + migration_test_add("/migration/multifd/tcp/tls/psk/match", + test_multifd_tcp_tls_psk_match); + migration_test_add("/migration/multifd/tcp/tls/psk/mismatch", + test_multifd_tcp_tls_psk_mismatch); #ifdef CONFIG_TASN1 - qtest_add_func("/migration/multifd/tcp/tls/x509/default-host", - test_multifd_tcp_tls_x509_default_host); - qtest_add_func("/migration/multifd/tcp/tls/x509/override-host", - test_multifd_tcp_tls_x509_override_host); - qtest_add_func("/migration/multifd/tcp/tls/x509/mismatch-host", - test_multifd_tcp_tls_x509_mismatch_host); - qtest_add_func("/migration/multifd/tcp/tls/x509/allow-anon-client", - test_multifd_tcp_tls_x509_allow_anon_client); - qtest_add_func("/migration/multifd/tcp/tls/x509/reject-anon-client", - test_multifd_tcp_tls_x509_reject_anon_client); + migration_test_add("/migration/multifd/tcp/tls/x509/default-host", + test_multifd_tcp_tls_x509_default_host); + migration_test_add("/migration/multifd/tcp/tls/x509/override-host", + test_multifd_tcp_tls_x509_override_host); + migration_test_add("/migration/multifd/tcp/tls/x509/mismatch-host", + test_multifd_tcp_tls_x509_mismatch_host); + migration_test_add("/migration/multifd/tcp/tls/x509/allow-anon-client", + test_multifd_tcp_tls_x509_allow_anon_client); + migration_test_add("/migration/multifd/tcp/tls/x509/reject-anon-client", + test_multifd_tcp_tls_x509_reject_anon_client); #endif /* CONFIG_TASN1 */ #endif /* CONFIG_GNUTLS */ if (g_str_equal(arch, "x86_64") && has_kvm && kvm_dirty_ring_supported()) { - qtest_add_func("/migration/dirty_ring", - test_precopy_unix_dirty_ring); - qtest_add_func("/migration/vcpu_dirty_limit", - test_vcpu_dirty_limit); + migration_test_add("/migration/dirty_ring", + test_precopy_unix_dirty_ring); + migration_test_add("/migration/vcpu_dirty_limit", + test_vcpu_dirty_limit); } ret = g_test_run(); -- Gitee From 276406221d08b902103869c4c925de276fd4b56b Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Wed, 11 Oct 2023 15:46:04 -0300 Subject: [PATCH 019/134] tests/qtest: Re-enable multifd cancel test commit 75b1f88cd2dd5eeb1fd817a2f3a291c2670f9c50 upstream. We've found the source of flakiness in this test, so re-enable it. 
Intel-SIG: commit 75b1f88cd2dd tests/qtest: Re-enable multifd cancel test Reviewed-by: Juan Quintela Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20230606144551.24367-4-farosas@suse.de [peterx: rebase to 2a61a6964c, to use migration_test_add()] Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- tests/qtest/migration-test.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c index 4884a40be2..4bdf397828 100644 --- a/tests/qtest/migration-test.c +++ b/tests/qtest/migration-test.c @@ -3475,14 +3475,8 @@ int main(int argc, char **argv) } migration_test_add("/migration/multifd/tcp/plain/none", test_multifd_tcp_none); - /* - * This test is flaky and sometimes fails in CI and otherwise: - * don't run unless user opts in via environment variable. - */ - if (getenv("QEMU_TEST_FLAKY_TESTS")) { - migration_test_add("/migration/multifd/tcp/plain/cancel", - test_multifd_tcp_cancel); - } + migration_test_add("/migration/multifd/tcp/plain/cancel", + test_multifd_tcp_cancel); migration_test_add("/migration/multifd/tcp/plain/zlib", test_multifd_tcp_zlib); #ifdef CONFIG_ZSTD -- Gitee From 795506755080778ba6e3f1c74d1676babc61bfd5 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 9 Jan 2024 14:46:19 +0800 Subject: [PATCH 020/134] docs/migration: Create migration/ directory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 8cb2f8b172e74a7279fabb5d5c20aee32b5b98cd upstream. Migration documentation is growing into a single file too large. Create a sub-directory for it for a split. We also already have separate vfio/virtio documentations, move it all over into the directory. Note that the virtio one is still not yet converted to rST. That is a job for later. Intel-SIG: commit 8cb2f8b172e7 docs/migration: Create migration/ directory Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Alex Williamson Cc: Cédric Le Goater Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/r/20240109064628.595453-2-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- docs/devel/index-internals.rst | 2 +- docs/devel/{migration.rst => migration/main.rst} | 0 docs/devel/{vfio-migration.rst => migration/vfio.rst} | 0 docs/devel/{virtio-migration.txt => migration/virtio.txt} | 0 4 files changed, 1 insertion(+), 1 deletion(-) rename docs/devel/{migration.rst => migration/main.rst} (100%) rename docs/devel/{vfio-migration.rst => migration/vfio.rst} (100%) rename docs/devel/{virtio-migration.txt => migration/virtio.txt} (100%) diff --git a/docs/devel/index-internals.rst b/docs/devel/index-internals.rst index 6f81df92bc..645f0cd7b9 100644 --- a/docs/devel/index-internals.rst +++ b/docs/devel/index-internals.rst @@ -11,7 +11,7 @@ Details about QEMU's various subsystems including how to add features to them. 
block-coroutine-wrapper clocks ebpf_rss - migration + migration/main multi-process reset s390-cpu-topology diff --git a/docs/devel/migration.rst b/docs/devel/migration/main.rst similarity index 100% rename from docs/devel/migration.rst rename to docs/devel/migration/main.rst diff --git a/docs/devel/vfio-migration.rst b/docs/devel/migration/vfio.rst similarity index 100% rename from docs/devel/vfio-migration.rst rename to docs/devel/migration/vfio.rst diff --git a/docs/devel/virtio-migration.txt b/docs/devel/migration/virtio.txt similarity index 100% rename from docs/devel/virtio-migration.txt rename to docs/devel/migration/virtio.txt -- Gitee From 794f71e693a7834f63f444335dea6e31f31a4b32 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 9 Jan 2024 14:46:20 +0800 Subject: [PATCH 021/134] docs/migration: Create index page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit f6bbac985e6df492f2c6be94fb893ada75ffdefa upstream. Create an index page for migration module. Move VFIO migration there too. A trivial touch-up on the title to use lower case there. Since then we'll have "migration" as the top title, make the main doc file renamed to "migration framework". Intel-SIG: commit f6bbac985e6d docs/migration: Create index page Cc: Alex Williamson Cc: Cédric Le Goater Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/r/20240109064628.595453-3-peterx@redhat.com Signed-off-by: Peter Xu Conflicts: docs/devel/index-internals.rst [jz: resolve context conflict due to vfio-iommufd] Signed-off-by: Jason Zeng --- docs/devel/index-internals.rst | 3 +-- docs/devel/migration/index.rst | 11 +++++++++++ docs/devel/migration/main.rst | 6 +++--- docs/devel/migration/vfio.rst | 2 +- 4 files changed, 16 insertions(+), 6 deletions(-) create mode 100644 docs/devel/migration/index.rst diff --git a/docs/devel/index-internals.rst b/docs/devel/index-internals.rst index 645f0cd7b9..4d1502d4b4 100644 --- a/docs/devel/index-internals.rst +++ b/docs/devel/index-internals.rst @@ -11,12 +11,11 @@ Details about QEMU's various subsystems including how to add features to them. block-coroutine-wrapper clocks ebpf_rss - migration/main + migration/index multi-process reset s390-cpu-topology s390-dasd-ipl tracing - vfio-migration writing-monitor-commands virtio-backends diff --git a/docs/devel/migration/index.rst b/docs/devel/migration/index.rst new file mode 100644 index 0000000000..02cfdcc969 --- /dev/null +++ b/docs/devel/migration/index.rst @@ -0,0 +1,11 @@ +Migration +========= + +This is the main entry for QEMU migration documentations. It explains how +QEMU live migration works. + +.. toctree:: + :maxdepth: 2 + + main + vfio diff --git a/docs/devel/migration/main.rst b/docs/devel/migration/main.rst index ec55089b25..82cdb420bf 100644 --- a/docs/devel/migration/main.rst +++ b/docs/devel/migration/main.rst @@ -1,6 +1,6 @@ -========= -Migration -========= +=================== +Migration framework +=================== QEMU has code to load/save the state of the guest that it is running. These are two complementary operations. 
Saving the state just does diff --git a/docs/devel/migration/vfio.rst b/docs/devel/migration/vfio.rst index 605fe60e96..c49482eab6 100644 --- a/docs/devel/migration/vfio.rst +++ b/docs/devel/migration/vfio.rst @@ -1,5 +1,5 @@ ===================== -VFIO device Migration +VFIO device migration ===================== Migration of virtual machine involves saving the state for each device that -- Gitee From eca4a85a6cb288c54b562dbf8ae9c18063767de1 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 9 Jan 2024 14:46:21 +0800 Subject: [PATCH 022/134] docs/migration: Convert virtio.txt into rST MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 4d7a691bcfeb5580e3f7457e1f1c2fbd64572161 upstream. Convert the plain old .txt into .rst, add it into migration/index.rst. Intel-SIG: commit 4d7a691bcfeb docs/migration: Convert virtio.txt into rST Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/r/20240109064628.595453-4-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- docs/devel/migration/index.rst | 1 + .../migration/{virtio.txt => virtio.rst} | 139 +++++++++--------- 2 files changed, 74 insertions(+), 66 deletions(-) rename docs/devel/migration/{virtio.txt => virtio.rst} (37%) diff --git a/docs/devel/migration/index.rst b/docs/devel/migration/index.rst index 02cfdcc969..2cb701c77c 100644 --- a/docs/devel/migration/index.rst +++ b/docs/devel/migration/index.rst @@ -9,3 +9,4 @@ QEMU live migration works. main vfio + virtio diff --git a/docs/devel/migration/virtio.txt b/docs/devel/migration/virtio.rst similarity index 37% rename from docs/devel/migration/virtio.txt rename to docs/devel/migration/virtio.rst index 98a6b0ffb5..611a18b821 100644 --- a/docs/devel/migration/virtio.txt +++ b/docs/devel/migration/virtio.rst @@ -1,5 +1,6 @@ -Virtio devices and migration -============================ +======================= +Virtio device migration +======================= Copyright 2015 IBM Corp. @@ -8,91 +9,97 @@ the COPYING file in the top-level directory. Saving and restoring the state of virtio devices is a bit of a twisty maze, for several reasons: + - state is distributed between several parts: + - virtio core, for common fields like features, number of queues, ... + - virtio transport (pci, ccw, ...), for the different proxy devices and transport specific state (msix vectors, indicators, ...) + - virtio device (net, blk, ...), for the different device types and their state (mac address, request queue, ...) + - most fields are saved via the stream interface; subsequently, subsections have been added to make cross-version migration possible This file attempts to document the current procedure and point out some caveats. 
- Save state procedure ==================== -virtio core virtio transport virtio device ------------ ---------------- ------------- - - save() function registered - via VMState wrapper on - device class -virtio_save() <---------- - ------> save_config() - - save proxy device - - save transport-specific - device fields -- save common device - fields -- save common virtqueue - fields - ------> save_queue() - - save transport-specific - virtqueue fields - ------> save_device() - - save device-specific - fields -- save subsections - - device endianness, - if changed from - default endianness - - 64 bit features, if - any high feature bit - is set - - virtio-1 virtqueue - fields, if VERSION_1 - is set +:: + virtio core virtio transport virtio device + ----------- ---------------- ------------- + + save() function registered + via VMState wrapper on + device class + virtio_save() <---------- + ------> save_config() + - save proxy device + - save transport-specific + device fields + - save common device + fields + - save common virtqueue + fields + ------> save_queue() + - save transport-specific + virtqueue fields + ------> save_device() + - save device-specific + fields + - save subsections + - device endianness, + if changed from + default endianness + - 64 bit features, if + any high feature bit + is set + - virtio-1 virtqueue + fields, if VERSION_1 + is set Load state procedure ==================== -virtio core virtio transport virtio device ------------ ---------------- ------------- - - load() function registered - via VMState wrapper on - device class -virtio_load() <---------- - ------> load_config() - - load proxy device - - load transport-specific - device fields -- load common device - fields -- load common virtqueue - fields - ------> load_queue() - - load transport-specific - virtqueue fields -- notify guest - ------> load_device() - - load device-specific - fields -- load subsections - - device endianness - - 64 bit features - - virtio-1 virtqueue - fields -- sanitize endianness -- sanitize features -- virtqueue index sanity - check - - feature-dependent setup +:: + virtio core virtio transport virtio device + ----------- ---------------- ------------- + + load() function registered + via VMState wrapper on + device class + virtio_load() <---------- + ------> load_config() + - load proxy device + - load transport-specific + device fields + - load common device + fields + - load common virtqueue + fields + ------> load_queue() + - load transport-specific + virtqueue fields + - notify guest + ------> load_device() + - load device-specific + fields + - load subsections + - device endianness + - 64 bit features + - virtio-1 virtqueue + fields + - sanitize endianness + - sanitize features + - virtqueue index sanity + check + - feature-dependent setup Implications of this setup ========================== -- Gitee From a4411dd99fe5212d5443ec76e63277f3f2ce8562 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 9 Jan 2024 14:46:22 +0800 Subject: [PATCH 023/134] docs/migration: Split "Backwards compatibility" separately MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 6cc6a7b98b88f1a7d1d5ed99db0d373a46606aac upstream. Split the section from main.rst into a separate file. Reference it in the index.rst. 
Intel-SIG: commit 6cc6a7b98b88 docs/migration: Split "Backwards compatibility" separately Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/r/20240109064628.595453-5-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- docs/devel/migration/compatibility.rst | 517 ++++++++++++++++++++++++ docs/devel/migration/index.rst | 1 + docs/devel/migration/main.rst | 519 ------------------------- 3 files changed, 518 insertions(+), 519 deletions(-) create mode 100644 docs/devel/migration/compatibility.rst diff --git a/docs/devel/migration/compatibility.rst b/docs/devel/migration/compatibility.rst new file mode 100644 index 0000000000..5a5417ef06 --- /dev/null +++ b/docs/devel/migration/compatibility.rst @@ -0,0 +1,517 @@ +Backwards compatibility +======================= + +How backwards compatibility works +--------------------------------- + +When we do migration, we have two QEMU processes: the source and the +target. There are two cases, they are the same version or they are +different versions. The easy case is when they are the same version. +The difficult one is when they are different versions. + +There are two things that are different, but they have very similar +names and sometimes get confused: + +- QEMU version +- machine type version + +Let's start with a practical example, we start with: + +- qemu-system-x86_64 (v5.2), from now on qemu-5.2. +- qemu-system-x86_64 (v5.1), from now on qemu-5.1. + +Related to this are the "latest" machine types defined on each of +them: + +- pc-q35-5.2 (newer one in qemu-5.2) from now on pc-5.2 +- pc-q35-5.1 (newer one in qemu-5.1) from now on pc-5.1 + +First of all, migration is only supposed to work if you use the same +machine type in both source and destination. The QEMU hardware +configuration needs to be the same also on source and destination. +Most aspects of the backend configuration can be changed at will, +except for a few cases where the backend features influence frontend +device feature exposure. But that is not relevant for this section. + +I am going to list the number of combinations that we can have. Let's +start with the trivial ones, QEMU is the same on source and +destination: + +1 - qemu-5.2 -M pc-5.2 -> migrates to -> qemu-5.2 -M pc-5.2 + + This is the latest QEMU with the latest machine type. + This have to work, and if it doesn't work it is a bug. + +2 - qemu-5.1 -M pc-5.1 -> migrates to -> qemu-5.1 -M pc-5.1 + + Exactly the same case than the previous one, but for 5.1. + Nothing to see here either. + +This are the easiest ones, we will not talk more about them in this +section. + +Now we start with the more interesting cases. Consider the case where +we have the same QEMU version in both sides (qemu-5.2) but we are using +the latest machine type for that version (pc-5.2) but one of an older +QEMU version, in this case pc-5.1. + +3 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1 + + It needs to use the definition of pc-5.1 and the devices as they + were configured on 5.1, but this should be easy in the sense that + both sides are the same QEMU and both sides have exactly the same + idea of what the pc-5.1 machine is. + +4 - qemu-5.1 -M pc-5.2 -> migrates to -> qemu-5.1 -M pc-5.2 + + This combination is not possible as the qemu-5.1 doesn't understand + pc-5.2 machine type. So nothing to worry here. + +Now it comes the interesting ones, when both QEMU processes are +different. Notice also that the machine type needs to be pc-5.1, +because we have the limitation than qemu-5.1 doesn't know pc-5.2. 
So +the possible cases are: + +5 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.1 -M pc-5.1 + + This migration is known as newer to older. We need to make sure + when we are developing 5.2 we need to take care about not to break + migration to qemu-5.1. Notice that we can't make updates to + qemu-5.1 to understand whatever qemu-5.2 decides to change, so it is + in qemu-5.2 side to make the relevant changes. + +6 - qemu-5.1 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1 + + This migration is known as older to newer. We need to make sure + than we are able to receive migrations from qemu-5.1. The problem is + similar to the previous one. + +If qemu-5.1 and qemu-5.2 were the same, there will not be any +compatibility problems. But the reason that we create qemu-5.2 is to +get new features, devices, defaults, etc. + +If we get a device that has a new feature, or change a default value, +we have a problem when we try to migrate between different QEMU +versions. + +So we need a way to tell qemu-5.2 that when we are using machine type +pc-5.1, it needs to **not** use the feature, to be able to migrate to +real qemu-5.1. + +And the equivalent part when migrating from qemu-5.1 to qemu-5.2. +qemu-5.2 has to expect that it is not going to get data for the new +feature, because qemu-5.1 doesn't know about it. + +How do we tell QEMU about these device feature changes? In +hw/core/machine.c:hw_compat_X_Y arrays. + +If we change a default value, we need to put back the old value on +that array. And the device, during initialization needs to look at +that array to see what value it needs to get for that feature. And +what are we going to put in that array, the value of a property. + +To create a property for a device, we need to use one of the +DEFINE_PROP_*() macros. See include/hw/qdev-properties.h to find the +macros that exist. With it, we set the default value for that +property, and that is what it is going to get in the latest released +version. But if we want a different value for a previous version, we +can change that in the hw_compat_X_Y arrays. + +hw_compat_X_Y is an array of registers that have the format: + +- name_device +- name_property +- value + +Let's see a practical example. + +In qemu-5.2 virtio-blk-device got multi queue support. This is a +change that is not backward compatible. In qemu-5.1 it has one +queue. In qemu-5.2 it has the same number of queues as the number of +cpus in the system. + +When we are doing migration, if we migrate from a device that has 4 +queues to a device that have only one queue, we don't know where to +put the extra information for the other 3 queues, and we fail +migration. + +Similar problem when we migrate from qemu-5.1 that has only one queue +to qemu-5.2, we only sent information for one queue, but destination +has 4, and we have 3 queues that are not properly initialized and +anything can happen. + +So, how can we address this problem. Easy, just convince qemu-5.2 +that when it is running pc-5.1, it needs to set the number of queues +for virtio-blk-devices to 1. + +That way we fix the cases 5 and 6. + +5 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.1 -M pc-5.1 + + qemu-5.2 -M pc-5.1 sets number of queues to be 1. + qemu-5.1 -M pc-5.1 expects number of queues to be 1. + + correct. migration works. + +6 - qemu-5.1 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1 + + qemu-5.1 -M pc-5.1 sets number of queues to be 1. + qemu-5.2 -M pc-5.1 expects number of queues to be 1. + + correct. migration works. + +And now the other interesting case, case 3. 
In this case we have: + +3 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1 + + Here we have the same QEMU in both sides. So it doesn't matter a + lot if we have set the number of queues to 1 or not, because + they are the same. + + WRONG! + + Think what happens if we do one of this double migrations: + + A -> migrates -> B -> migrates -> C + + where: + + A: qemu-5.1 -M pc-5.1 + B: qemu-5.2 -M pc-5.1 + C: qemu-5.2 -M pc-5.1 + + migration A -> B is case 6, so number of queues needs to be 1. + + migration B -> C is case 3, so we don't care. But actually we + care because we haven't started the guest in qemu-5.2, it came + migrated from qemu-5.1. So to be in the safe place, we need to + always use number of queues 1 when we are using pc-5.1. + +Now, how was this done in reality? The following commit shows how it +was done:: + + commit 9445e1e15e66c19e42bea942ba810db28052cd05 + Author: Stefan Hajnoczi + Date: Tue Aug 18 15:33:47 2020 +0100 + + virtio-blk-pci: default num_queues to -smp N + +The relevant parts for migration are:: + + @@ -1281,7 +1284,8 @@ static Property virtio_blk_properties[] = { + #endif + DEFINE_PROP_BIT("request-merging", VirtIOBlock, conf.request_merging, 0, + true), + - DEFINE_PROP_UINT16("num-queues", VirtIOBlock, conf.num_queues, 1), + + DEFINE_PROP_UINT16("num-queues", VirtIOBlock, conf.num_queues, + + VIRTIO_BLK_AUTO_NUM_QUEUES), + DEFINE_PROP_UINT16("queue-size", VirtIOBlock, conf.queue_size, 256), + +It changes the default value of num_queues. But it fishes it for old +machine types to have the right value:: + + @@ -31,6 +31,7 @@ + GlobalProperty hw_compat_5_1[] = { + ... + + { "virtio-blk-device", "num-queues", "1"}, + ... + }; + +A device with different features on both sides +---------------------------------------------- + +Let's assume that we are using the same QEMU binary on both sides, +just to make the things easier. But we have a device that has +different features on both sides of the migration. That can be +because the devices are different, because the kernel driver of both +devices have different features, whatever. + +How can we get this to work with migration. The way to do that is +"theoretically" easy. You have to get the features that the device +has in the source of the migration. The features that the device has +on the target of the migration, you get the intersection of the +features of both sides, and that is the way that you should launch +QEMU. + +Notice that this is not completely related to QEMU. The most +important thing here is that this should be handled by the managing +application that launches QEMU. If QEMU is configured correctly, the +migration will succeed. + +That said, actually doing it is complicated. Almost all devices are +bad at being able to be launched with only some features enabled. +With one big exception: cpus. + +You can read the documentation for QEMU x86 cpu models here: + +https://qemu-project.gitlab.io/qemu/system/qemu-cpu-models.html + +See when they talk about migration they recommend that one chooses the +newest cpu model that is supported for all cpus. + +Let's say that we have: + +Host A: + +Device X has the feature Y + +Host B: + +Device X has not the feature Y + +If we try to migrate without any care from host A to host B, it will +fail because when migration tries to load the feature Y on +destination, it will find that the hardware is not there. 
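Tying this back to the property mechanism described earlier: the usual way a device makes such a feature controllable, so that either a management application or a ``hw_compat_X_Y`` entry can switch it off, is a qdev property guarding the feature plus a compat entry that pins the old default for older machine types. The fragment below is only an illustrative sketch; ``mydev``, ``MyDevState``, ``x-feature-y`` and ``MYDEV_FEATURE_Y_BITNR`` are invented names, and the real-world instances of this pattern are the virtio-blk and AER commits quoted in this document::

    /* Hypothetical device code (needs "hw/qdev-properties.h"):
     * expose the feature as a bit property, default "on" so the
     * latest machine type gets the new behaviour. */
    static Property mydev_properties[] = {
        DEFINE_PROP_BIT("x-feature-y", MyDevState, features,
                        MYDEV_FEATURE_Y_BITNR, true),
        DEFINE_PROP_END_OF_LIST(),
    };

    /* Hypothetical hw/core/machine.c entry: older machine types keep
     * the pre-feature behaviour, so their migration stream and the
     * guest-visible device are unchanged. */
    GlobalProperty hw_compat_X_Y[] = {
        { "mydev", "x-feature-y", "off" },
    };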
+ +Doing this would be the equivalent of doing with cpus: + +Host A: + +$ qemu-system-x86_64 -cpu host + +Host B: + +$ qemu-system-x86_64 -cpu host + +When both hosts have different cpu features this is guaranteed to +fail. Especially if Host B has less features than host A. If host A +has less features than host B, sometimes it works. Important word of +last sentence is "sometimes". + +So, forgetting about cpu models and continuing with the -cpu host +example, let's see that the differences of the cpus is that Host A and +B have the following features: + +Features: 'pcid' 'stibp' 'taa-no' +Host A: X X +Host B: X + +And we want to migrate between them, the way configure both QEMU cpu +will be: + +Host A: + +$ qemu-system-x86_64 -cpu host,pcid=off,stibp=off + +Host B: + +$ qemu-system-x86_64 -cpu host,taa-no=off + +And you would be able to migrate between them. It is responsibility +of the management application or of the user to make sure that the +configuration is correct. QEMU doesn't know how to look at this kind +of features in general. + +Notice that we don't recommend to use -cpu host for migration. It is +used in this example because it makes the example simpler. + +Other devices have worse control about individual features. If they +want to be able to migrate between hosts that show different features, +the device needs a way to configure which ones it is going to use. + +In this section we have considered that we are using the same QEMU +binary in both sides of the migration. If we use different QEMU +versions process, then we need to have into account all other +differences and the examples become even more complicated. + +How to mitigate when we have a backward compatibility error +----------------------------------------------------------- + +We broke migration for old machine types continuously during +development. But as soon as we find that there is a problem, we fix +it. The problem is what happens when we detect after we have done a +release that something has gone wrong. + +Let see how it worked with one example. + +After the release of qemu-8.0 we found a problem when doing migration +of the machine type pc-7.2. + +- $ qemu-7.2 -M pc-7.2 -> qemu-7.2 -M pc-7.2 + + This migration works + +- $ qemu-8.0 -M pc-7.2 -> qemu-8.0 -M pc-7.2 + + This migration works + +- $ qemu-8.0 -M pc-7.2 -> qemu-7.2 -M pc-7.2 + + This migration fails + +- $ qemu-7.2 -M pc-7.2 -> qemu-8.0 -M pc-7.2 + + This migration fails + +So clearly something fails when migration between qemu-7.2 and +qemu-8.0 with machine type pc-7.2. The error messages, and git bisect +pointed to this commit. + +In qemu-8.0 we got this commit:: + + commit 010746ae1db7f52700cb2e2c46eb94f299cfa0d2 + Author: Jonathan Cameron + Date: Thu Mar 2 13:37:02 2023 +0000 + + hw/pci/aer: Implement PCI_ERR_UNCOR_MASK register + + +The relevant bits of the commit for our example are this ones:: + + --- a/hw/pci/pcie_aer.c + +++ b/hw/pci/pcie_aer.c + @@ -112,6 +112,10 @@ int pcie_aer_init(PCIDevice *dev, + + pci_set_long(dev->w1cmask + offset + PCI_ERR_UNCOR_STATUS, + PCI_ERR_UNC_SUPPORTED); + + pci_set_long(dev->config + offset + PCI_ERR_UNCOR_MASK, + + PCI_ERR_UNC_MASK_DEFAULT); + + pci_set_long(dev->wmask + offset + PCI_ERR_UNCOR_MASK, + + PCI_ERR_UNC_SUPPORTED); + + pci_set_long(dev->config + offset + PCI_ERR_UNCOR_SEVER, + PCI_ERR_UNC_SEVERITY_DEFAULT); + +The patch changes how we configure PCI space for AER. But QEMU fails +when the PCI space configuration is different between source and +destination. 
+ +The following commit shows how this got fixed:: + + commit 5ed3dabe57dd9f4c007404345e5f5bf0e347317f + Author: Leonardo Bras + Date: Tue May 2 21:27:02 2023 -0300 + + hw/pci: Disable PCI_ERR_UNCOR_MASK register for machine type < 8.0 + + [...] + +The relevant parts of the fix in QEMU are as follow: + +First, we create a new property for the device to be able to configure +the old behaviour or the new behaviour:: + + diff --git a/hw/pci/pci.c b/hw/pci/pci.c + index 8a87ccc8b0..5153ad63d6 100644 + --- a/hw/pci/pci.c + +++ b/hw/pci/pci.c + @@ -79,6 +79,8 @@ static Property pci_props[] = { + DEFINE_PROP_STRING("failover_pair_id", PCIDevice, + failover_pair_id), + DEFINE_PROP_UINT32("acpi-index", PCIDevice, acpi_index, 0), + + DEFINE_PROP_BIT("x-pcie-err-unc-mask", PCIDevice, cap_present, + + QEMU_PCIE_ERR_UNC_MASK_BITNR, true), + DEFINE_PROP_END_OF_LIST() + }; + +Notice that we enable the feature for new machine types. + +Now we see how the fix is done. This is going to depend on what kind +of breakage happens, but in this case it is quite simple:: + + diff --git a/hw/pci/pcie_aer.c b/hw/pci/pcie_aer.c + index 103667c368..374d593ead 100644 + --- a/hw/pci/pcie_aer.c + +++ b/hw/pci/pcie_aer.c + @@ -112,10 +112,13 @@ int pcie_aer_init(PCIDevice *dev, uint8_t cap_ver, + uint16_t offset, + + pci_set_long(dev->w1cmask + offset + PCI_ERR_UNCOR_STATUS, + PCI_ERR_UNC_SUPPORTED); + - pci_set_long(dev->config + offset + PCI_ERR_UNCOR_MASK, + - PCI_ERR_UNC_MASK_DEFAULT); + - pci_set_long(dev->wmask + offset + PCI_ERR_UNCOR_MASK, + - PCI_ERR_UNC_SUPPORTED); + + + + if (dev->cap_present & QEMU_PCIE_ERR_UNC_MASK) { + + pci_set_long(dev->config + offset + PCI_ERR_UNCOR_MASK, + + PCI_ERR_UNC_MASK_DEFAULT); + + pci_set_long(dev->wmask + offset + PCI_ERR_UNCOR_MASK, + + PCI_ERR_UNC_SUPPORTED); + + } + + pci_set_long(dev->config + offset + PCI_ERR_UNCOR_SEVER, + PCI_ERR_UNC_SEVERITY_DEFAULT); + +I.e. If the property bit is enabled, we configure it as we did for +qemu-8.0. If the property bit is not set, we configure it as it was in 7.2. + +And now, everything that is missing is disabling the feature for old +machine types:: + + diff --git a/hw/core/machine.c b/hw/core/machine.c + index 47a34841a5..07f763eb2e 100644 + --- a/hw/core/machine.c + +++ b/hw/core/machine.c + @@ -48,6 +48,7 @@ GlobalProperty hw_compat_7_2[] = { + { "e1000e", "migrate-timadj", "off" }, + { "virtio-mem", "x-early-migration", "false" }, + { "migration", "x-preempt-pre-7-2", "true" }, + + { TYPE_PCI_DEVICE, "x-pcie-err-unc-mask", "off" }, + }; + const size_t hw_compat_7_2_len = G_N_ELEMENTS(hw_compat_7_2); + +And now, when qemu-8.0.1 is released with this fix, all combinations +are going to work as supposed. + +- $ qemu-7.2 -M pc-7.2 -> qemu-7.2 -M pc-7.2 (works) +- $ qemu-8.0.1 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 (works) +- $ qemu-8.0.1 -M pc-7.2 -> qemu-7.2 -M pc-7.2 (works) +- $ qemu-7.2 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 (works) + +So the normality has been restored and everything is ok, no? + +Not really, now our matrix is much bigger. We started with the easy +cases, migration from the same version to the same version always +works: + +- $ qemu-7.2 -M pc-7.2 -> qemu-7.2 -M pc-7.2 +- $ qemu-8.0 -M pc-7.2 -> qemu-8.0 -M pc-7.2 +- $ qemu-8.0.1 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 + +Now the interesting ones. When the QEMU processes versions are +different. For the 1st set, their fail and we can do nothing, both +versions are released and we can't change anything. 
+ +- $ qemu-7.2 -M pc-7.2 -> qemu-8.0 -M pc-7.2 +- $ qemu-8.0 -M pc-7.2 -> qemu-7.2 -M pc-7.2 + +This two are the ones that work. The whole point of making the +change in qemu-8.0.1 release was to fix this issue: + +- $ qemu-7.2 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 +- $ qemu-8.0.1 -M pc-7.2 -> qemu-7.2 -M pc-7.2 + +But now we found that qemu-8.0 neither can migrate to qemu-7.2 not +qemu-8.0.1. + +- $ qemu-8.0 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 +- $ qemu-8.0.1 -M pc-7.2 -> qemu-8.0 -M pc-7.2 + +So, if we start a pc-7.2 machine in qemu-8.0 we can't migrate it to +anything except to qemu-8.0. + +Can we do better? + +Yeap. If we know that we are going to do this migration: + +- $ qemu-8.0 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 + +We can launch the appropriate devices with:: + + --device...,x-pci-e-err-unc-mask=on + +And now we can receive a migration from 8.0. And from now on, we can +do that migration to new machine types if we remember to enable that +property for pc-7.2. Notice that we need to remember, it is not +enough to know that the source of the migration is qemu-8.0. Think of +this example: + +$ qemu-8.0 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 -> qemu-8.2 -M pc-7.2 + +In the second migration, the source is not qemu-8.0, but we still have +that "problem" and have that property enabled. Notice that we need to +continue having this mark/property until we have this machine +rebooted. But it is not a normal reboot (that don't reload QEMU) we +need the machine to poweroff/poweron on a fixed QEMU. And from now +on we can use the proper real machine. diff --git a/docs/devel/migration/index.rst b/docs/devel/migration/index.rst index 2cb701c77c..7fc02b9520 100644 --- a/docs/devel/migration/index.rst +++ b/docs/devel/migration/index.rst @@ -8,5 +8,6 @@ QEMU live migration works. :maxdepth: 2 main + compatibility vfio virtio diff --git a/docs/devel/migration/main.rst b/docs/devel/migration/main.rst index 82cdb420bf..04194414af 100644 --- a/docs/devel/migration/main.rst +++ b/docs/devel/migration/main.rst @@ -993,522 +993,3 @@ In some cases it may be best to tie specific firmware versions to specific versioned machine types to cut down on the combinations that will need support. This is also useful when newer versions of firmware outgrow the padding. - - -Backwards compatibility -======================= - -How backwards compatibility works ---------------------------------- - -When we do migration, we have two QEMU processes: the source and the -target. There are two cases, they are the same version or they are -different versions. The easy case is when they are the same version. -The difficult one is when they are different versions. - -There are two things that are different, but they have very similar -names and sometimes get confused: - -- QEMU version -- machine type version - -Let's start with a practical example, we start with: - -- qemu-system-x86_64 (v5.2), from now on qemu-5.2. -- qemu-system-x86_64 (v5.1), from now on qemu-5.1. - -Related to this are the "latest" machine types defined on each of -them: - -- pc-q35-5.2 (newer one in qemu-5.2) from now on pc-5.2 -- pc-q35-5.1 (newer one in qemu-5.1) from now on pc-5.1 - -First of all, migration is only supposed to work if you use the same -machine type in both source and destination. The QEMU hardware -configuration needs to be the same also on source and destination. -Most aspects of the backend configuration can be changed at will, -except for a few cases where the backend features influence frontend -device feature exposure. 
But that is not relevant for this section. - -I am going to list the number of combinations that we can have. Let's -start with the trivial ones, QEMU is the same on source and -destination: - -1 - qemu-5.2 -M pc-5.2 -> migrates to -> qemu-5.2 -M pc-5.2 - - This is the latest QEMU with the latest machine type. - This have to work, and if it doesn't work it is a bug. - -2 - qemu-5.1 -M pc-5.1 -> migrates to -> qemu-5.1 -M pc-5.1 - - Exactly the same case than the previous one, but for 5.1. - Nothing to see here either. - -This are the easiest ones, we will not talk more about them in this -section. - -Now we start with the more interesting cases. Consider the case where -we have the same QEMU version in both sides (qemu-5.2) but we are using -the latest machine type for that version (pc-5.2) but one of an older -QEMU version, in this case pc-5.1. - -3 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1 - - It needs to use the definition of pc-5.1 and the devices as they - were configured on 5.1, but this should be easy in the sense that - both sides are the same QEMU and both sides have exactly the same - idea of what the pc-5.1 machine is. - -4 - qemu-5.1 -M pc-5.2 -> migrates to -> qemu-5.1 -M pc-5.2 - - This combination is not possible as the qemu-5.1 doesn't understand - pc-5.2 machine type. So nothing to worry here. - -Now it comes the interesting ones, when both QEMU processes are -different. Notice also that the machine type needs to be pc-5.1, -because we have the limitation than qemu-5.1 doesn't know pc-5.2. So -the possible cases are: - -5 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.1 -M pc-5.1 - - This migration is known as newer to older. We need to make sure - when we are developing 5.2 we need to take care about not to break - migration to qemu-5.1. Notice that we can't make updates to - qemu-5.1 to understand whatever qemu-5.2 decides to change, so it is - in qemu-5.2 side to make the relevant changes. - -6 - qemu-5.1 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1 - - This migration is known as older to newer. We need to make sure - than we are able to receive migrations from qemu-5.1. The problem is - similar to the previous one. - -If qemu-5.1 and qemu-5.2 were the same, there will not be any -compatibility problems. But the reason that we create qemu-5.2 is to -get new features, devices, defaults, etc. - -If we get a device that has a new feature, or change a default value, -we have a problem when we try to migrate between different QEMU -versions. - -So we need a way to tell qemu-5.2 that when we are using machine type -pc-5.1, it needs to **not** use the feature, to be able to migrate to -real qemu-5.1. - -And the equivalent part when migrating from qemu-5.1 to qemu-5.2. -qemu-5.2 has to expect that it is not going to get data for the new -feature, because qemu-5.1 doesn't know about it. - -How do we tell QEMU about these device feature changes? In -hw/core/machine.c:hw_compat_X_Y arrays. - -If we change a default value, we need to put back the old value on -that array. And the device, during initialization needs to look at -that array to see what value it needs to get for that feature. And -what are we going to put in that array, the value of a property. - -To create a property for a device, we need to use one of the -DEFINE_PROP_*() macros. See include/hw/qdev-properties.h to find the -macros that exist. With it, we set the default value for that -property, and that is what it is going to get in the latest released -version. 
But if we want a different value for a previous version, we -can change that in the hw_compat_X_Y arrays. - -hw_compat_X_Y is an array of registers that have the format: - -- name_device -- name_property -- value - -Let's see a practical example. - -In qemu-5.2 virtio-blk-device got multi queue support. This is a -change that is not backward compatible. In qemu-5.1 it has one -queue. In qemu-5.2 it has the same number of queues as the number of -cpus in the system. - -When we are doing migration, if we migrate from a device that has 4 -queues to a device that have only one queue, we don't know where to -put the extra information for the other 3 queues, and we fail -migration. - -Similar problem when we migrate from qemu-5.1 that has only one queue -to qemu-5.2, we only sent information for one queue, but destination -has 4, and we have 3 queues that are not properly initialized and -anything can happen. - -So, how can we address this problem. Easy, just convince qemu-5.2 -that when it is running pc-5.1, it needs to set the number of queues -for virtio-blk-devices to 1. - -That way we fix the cases 5 and 6. - -5 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.1 -M pc-5.1 - - qemu-5.2 -M pc-5.1 sets number of queues to be 1. - qemu-5.1 -M pc-5.1 expects number of queues to be 1. - - correct. migration works. - -6 - qemu-5.1 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1 - - qemu-5.1 -M pc-5.1 sets number of queues to be 1. - qemu-5.2 -M pc-5.1 expects number of queues to be 1. - - correct. migration works. - -And now the other interesting case, case 3. In this case we have: - -3 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1 - - Here we have the same QEMU in both sides. So it doesn't matter a - lot if we have set the number of queues to 1 or not, because - they are the same. - - WRONG! - - Think what happens if we do one of this double migrations: - - A -> migrates -> B -> migrates -> C - - where: - - A: qemu-5.1 -M pc-5.1 - B: qemu-5.2 -M pc-5.1 - C: qemu-5.2 -M pc-5.1 - - migration A -> B is case 6, so number of queues needs to be 1. - - migration B -> C is case 3, so we don't care. But actually we - care because we haven't started the guest in qemu-5.2, it came - migrated from qemu-5.1. So to be in the safe place, we need to - always use number of queues 1 when we are using pc-5.1. - -Now, how was this done in reality? The following commit shows how it -was done:: - - commit 9445e1e15e66c19e42bea942ba810db28052cd05 - Author: Stefan Hajnoczi - Date: Tue Aug 18 15:33:47 2020 +0100 - - virtio-blk-pci: default num_queues to -smp N - -The relevant parts for migration are:: - - @@ -1281,7 +1284,8 @@ static Property virtio_blk_properties[] = { - #endif - DEFINE_PROP_BIT("request-merging", VirtIOBlock, conf.request_merging, 0, - true), - - DEFINE_PROP_UINT16("num-queues", VirtIOBlock, conf.num_queues, 1), - + DEFINE_PROP_UINT16("num-queues", VirtIOBlock, conf.num_queues, - + VIRTIO_BLK_AUTO_NUM_QUEUES), - DEFINE_PROP_UINT16("queue-size", VirtIOBlock, conf.queue_size, 256), - -It changes the default value of num_queues. But it fishes it for old -machine types to have the right value:: - - @@ -31,6 +31,7 @@ - GlobalProperty hw_compat_5_1[] = { - ... - + { "virtio-blk-device", "num-queues", "1"}, - ... - }; - -A device with different features on both sides ----------------------------------------------- - -Let's assume that we are using the same QEMU binary on both sides, -just to make the things easier. But we have a device that has -different features on both sides of the migration. 
That can be -because the devices are different, because the kernel driver of both -devices have different features, whatever. - -How can we get this to work with migration. The way to do that is -"theoretically" easy. You have to get the features that the device -has in the source of the migration. The features that the device has -on the target of the migration, you get the intersection of the -features of both sides, and that is the way that you should launch -QEMU. - -Notice that this is not completely related to QEMU. The most -important thing here is that this should be handled by the managing -application that launches QEMU. If QEMU is configured correctly, the -migration will succeed. - -That said, actually doing it is complicated. Almost all devices are -bad at being able to be launched with only some features enabled. -With one big exception: cpus. - -You can read the documentation for QEMU x86 cpu models here: - -https://qemu-project.gitlab.io/qemu/system/qemu-cpu-models.html - -See when they talk about migration they recommend that one chooses the -newest cpu model that is supported for all cpus. - -Let's say that we have: - -Host A: - -Device X has the feature Y - -Host B: - -Device X has not the feature Y - -If we try to migrate without any care from host A to host B, it will -fail because when migration tries to load the feature Y on -destination, it will find that the hardware is not there. - -Doing this would be the equivalent of doing with cpus: - -Host A: - -$ qemu-system-x86_64 -cpu host - -Host B: - -$ qemu-system-x86_64 -cpu host - -When both hosts have different cpu features this is guaranteed to -fail. Especially if Host B has less features than host A. If host A -has less features than host B, sometimes it works. Important word of -last sentence is "sometimes". - -So, forgetting about cpu models and continuing with the -cpu host -example, let's see that the differences of the cpus is that Host A and -B have the following features: - -Features: 'pcid' 'stibp' 'taa-no' -Host A: X X -Host B: X - -And we want to migrate between them, the way configure both QEMU cpu -will be: - -Host A: - -$ qemu-system-x86_64 -cpu host,pcid=off,stibp=off - -Host B: - -$ qemu-system-x86_64 -cpu host,taa-no=off - -And you would be able to migrate between them. It is responsibility -of the management application or of the user to make sure that the -configuration is correct. QEMU doesn't know how to look at this kind -of features in general. - -Notice that we don't recommend to use -cpu host for migration. It is -used in this example because it makes the example simpler. - -Other devices have worse control about individual features. If they -want to be able to migrate between hosts that show different features, -the device needs a way to configure which ones it is going to use. - -In this section we have considered that we are using the same QEMU -binary in both sides of the migration. If we use different QEMU -versions process, then we need to have into account all other -differences and the examples become even more complicated. - -How to mitigate when we have a backward compatibility error ------------------------------------------------------------ - -We broke migration for old machine types continuously during -development. But as soon as we find that there is a problem, we fix -it. The problem is what happens when we detect after we have done a -release that something has gone wrong. - -Let see how it worked with one example. 
- -After the release of qemu-8.0 we found a problem when doing migration -of the machine type pc-7.2. - -- $ qemu-7.2 -M pc-7.2 -> qemu-7.2 -M pc-7.2 - - This migration works - -- $ qemu-8.0 -M pc-7.2 -> qemu-8.0 -M pc-7.2 - - This migration works - -- $ qemu-8.0 -M pc-7.2 -> qemu-7.2 -M pc-7.2 - - This migration fails - -- $ qemu-7.2 -M pc-7.2 -> qemu-8.0 -M pc-7.2 - - This migration fails - -So clearly something fails when migration between qemu-7.2 and -qemu-8.0 with machine type pc-7.2. The error messages, and git bisect -pointed to this commit. - -In qemu-8.0 we got this commit:: - - commit 010746ae1db7f52700cb2e2c46eb94f299cfa0d2 - Author: Jonathan Cameron - Date: Thu Mar 2 13:37:02 2023 +0000 - - hw/pci/aer: Implement PCI_ERR_UNCOR_MASK register - - -The relevant bits of the commit for our example are this ones:: - - --- a/hw/pci/pcie_aer.c - +++ b/hw/pci/pcie_aer.c - @@ -112,6 +112,10 @@ int pcie_aer_init(PCIDevice *dev, - - pci_set_long(dev->w1cmask + offset + PCI_ERR_UNCOR_STATUS, - PCI_ERR_UNC_SUPPORTED); - + pci_set_long(dev->config + offset + PCI_ERR_UNCOR_MASK, - + PCI_ERR_UNC_MASK_DEFAULT); - + pci_set_long(dev->wmask + offset + PCI_ERR_UNCOR_MASK, - + PCI_ERR_UNC_SUPPORTED); - - pci_set_long(dev->config + offset + PCI_ERR_UNCOR_SEVER, - PCI_ERR_UNC_SEVERITY_DEFAULT); - -The patch changes how we configure PCI space for AER. But QEMU fails -when the PCI space configuration is different between source and -destination. - -The following commit shows how this got fixed:: - - commit 5ed3dabe57dd9f4c007404345e5f5bf0e347317f - Author: Leonardo Bras - Date: Tue May 2 21:27:02 2023 -0300 - - hw/pci: Disable PCI_ERR_UNCOR_MASK register for machine type < 8.0 - - [...] - -The relevant parts of the fix in QEMU are as follow: - -First, we create a new property for the device to be able to configure -the old behaviour or the new behaviour:: - - diff --git a/hw/pci/pci.c b/hw/pci/pci.c - index 8a87ccc8b0..5153ad63d6 100644 - --- a/hw/pci/pci.c - +++ b/hw/pci/pci.c - @@ -79,6 +79,8 @@ static Property pci_props[] = { - DEFINE_PROP_STRING("failover_pair_id", PCIDevice, - failover_pair_id), - DEFINE_PROP_UINT32("acpi-index", PCIDevice, acpi_index, 0), - + DEFINE_PROP_BIT("x-pcie-err-unc-mask", PCIDevice, cap_present, - + QEMU_PCIE_ERR_UNC_MASK_BITNR, true), - DEFINE_PROP_END_OF_LIST() - }; - -Notice that we enable the feature for new machine types. - -Now we see how the fix is done. This is going to depend on what kind -of breakage happens, but in this case it is quite simple:: - - diff --git a/hw/pci/pcie_aer.c b/hw/pci/pcie_aer.c - index 103667c368..374d593ead 100644 - --- a/hw/pci/pcie_aer.c - +++ b/hw/pci/pcie_aer.c - @@ -112,10 +112,13 @@ int pcie_aer_init(PCIDevice *dev, uint8_t cap_ver, - uint16_t offset, - - pci_set_long(dev->w1cmask + offset + PCI_ERR_UNCOR_STATUS, - PCI_ERR_UNC_SUPPORTED); - - pci_set_long(dev->config + offset + PCI_ERR_UNCOR_MASK, - - PCI_ERR_UNC_MASK_DEFAULT); - - pci_set_long(dev->wmask + offset + PCI_ERR_UNCOR_MASK, - - PCI_ERR_UNC_SUPPORTED); - + - + if (dev->cap_present & QEMU_PCIE_ERR_UNC_MASK) { - + pci_set_long(dev->config + offset + PCI_ERR_UNCOR_MASK, - + PCI_ERR_UNC_MASK_DEFAULT); - + pci_set_long(dev->wmask + offset + PCI_ERR_UNCOR_MASK, - + PCI_ERR_UNC_SUPPORTED); - + } - - pci_set_long(dev->config + offset + PCI_ERR_UNCOR_SEVER, - PCI_ERR_UNC_SEVERITY_DEFAULT); - -I.e. If the property bit is enabled, we configure it as we did for -qemu-8.0. If the property bit is not set, we configure it as it was in 7.2. 
- -And now, everything that is missing is disabling the feature for old -machine types:: - - diff --git a/hw/core/machine.c b/hw/core/machine.c - index 47a34841a5..07f763eb2e 100644 - --- a/hw/core/machine.c - +++ b/hw/core/machine.c - @@ -48,6 +48,7 @@ GlobalProperty hw_compat_7_2[] = { - { "e1000e", "migrate-timadj", "off" }, - { "virtio-mem", "x-early-migration", "false" }, - { "migration", "x-preempt-pre-7-2", "true" }, - + { TYPE_PCI_DEVICE, "x-pcie-err-unc-mask", "off" }, - }; - const size_t hw_compat_7_2_len = G_N_ELEMENTS(hw_compat_7_2); - -And now, when qemu-8.0.1 is released with this fix, all combinations -are going to work as supposed. - -- $ qemu-7.2 -M pc-7.2 -> qemu-7.2 -M pc-7.2 (works) -- $ qemu-8.0.1 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 (works) -- $ qemu-8.0.1 -M pc-7.2 -> qemu-7.2 -M pc-7.2 (works) -- $ qemu-7.2 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 (works) - -So the normality has been restored and everything is ok, no? - -Not really, now our matrix is much bigger. We started with the easy -cases, migration from the same version to the same version always -works: - -- $ qemu-7.2 -M pc-7.2 -> qemu-7.2 -M pc-7.2 -- $ qemu-8.0 -M pc-7.2 -> qemu-8.0 -M pc-7.2 -- $ qemu-8.0.1 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 - -Now the interesting ones. When the QEMU processes versions are -different. For the 1st set, their fail and we can do nothing, both -versions are released and we can't change anything. - -- $ qemu-7.2 -M pc-7.2 -> qemu-8.0 -M pc-7.2 -- $ qemu-8.0 -M pc-7.2 -> qemu-7.2 -M pc-7.2 - -This two are the ones that work. The whole point of making the -change in qemu-8.0.1 release was to fix this issue: - -- $ qemu-7.2 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 -- $ qemu-8.0.1 -M pc-7.2 -> qemu-7.2 -M pc-7.2 - -But now we found that qemu-8.0 neither can migrate to qemu-7.2 not -qemu-8.0.1. - -- $ qemu-8.0 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 -- $ qemu-8.0.1 -M pc-7.2 -> qemu-8.0 -M pc-7.2 - -So, if we start a pc-7.2 machine in qemu-8.0 we can't migrate it to -anything except to qemu-8.0. - -Can we do better? - -Yeap. If we know that we are going to do this migration: - -- $ qemu-8.0 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 - -We can launch the appropriate devices with:: - - --device...,x-pci-e-err-unc-mask=on - -And now we can receive a migration from 8.0. And from now on, we can -do that migration to new machine types if we remember to enable that -property for pc-7.2. Notice that we need to remember, it is not -enough to know that the source of the migration is qemu-8.0. Think of -this example: - -$ qemu-8.0 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 -> qemu-8.2 -M pc-7.2 - -In the second migration, the source is not qemu-8.0, but we still have -that "problem" and have that property enabled. Notice that we need to -continue having this mark/property until we have this machine -rebooted. But it is not a normal reboot (that don't reload QEMU) we -need the machine to poweroff/poweron on a fixed QEMU. And from now -on we can use the proper real machine. -- Gitee From fd963e58ec02a43b81895db89c8f2f35edf4a6ec Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 9 Jan 2024 14:46:23 +0800 Subject: [PATCH 024/134] docs/migration: Split "Debugging" and "Firmware" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 774ad6b53b9449223115ffa8851eb93de92b0ce7 upstream. Move the two sections into a separate file called "best-practices.rst". Add the entry into index. 
Intel-SIG: commit 774ad6b53b94 docs/migration: Split "Debugging" and "Firmware" Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/r/20240109064628.595453-6-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- docs/devel/migration/best-practices.rst | 48 +++++++++++++++++++++++++ docs/devel/migration/index.rst | 1 + docs/devel/migration/main.rst | 44 ----------------------- 3 files changed, 49 insertions(+), 44 deletions(-) create mode 100644 docs/devel/migration/best-practices.rst diff --git a/docs/devel/migration/best-practices.rst b/docs/devel/migration/best-practices.rst new file mode 100644 index 0000000000..d7c34a3014 --- /dev/null +++ b/docs/devel/migration/best-practices.rst @@ -0,0 +1,48 @@ +============== +Best practices +============== + +Debugging +========= + +The migration stream can be analyzed thanks to ``scripts/analyze-migration.py``. + +Example usage: + +.. code-block:: shell + + $ qemu-system-x86_64 -display none -monitor stdio + (qemu) migrate "exec:cat > mig" + (qemu) q + $ ./scripts/analyze-migration.py -f mig + { + "ram (3)": { + "section sizes": { + "pc.ram": "0x0000000008000000", + ... + +See also ``analyze-migration.py -h`` help for more options. + +Firmware +======== + +Migration migrates the copies of RAM and ROM, and thus when running +on the destination it includes the firmware from the source. Even after +resetting a VM, the old firmware is used. Only once QEMU has been restarted +is the new firmware in use. + +- Changes in firmware size can cause changes in the required RAMBlock size + to hold the firmware and thus migration can fail. In practice it's best + to pad firmware images to convenient powers of 2 with plenty of space + for growth. + +- Care should be taken with device emulation code so that newer + emulation code can work with older firmware to allow forward migration. + +- Care should be taken with newer firmware so that backward migration + to older systems with older device emulation code will work. + +In some cases it may be best to tie specific firmware versions to specific +versioned machine types to cut down on the combinations that will need +support. This is also useful when newer versions of firmware outgrow +the padding. diff --git a/docs/devel/migration/index.rst b/docs/devel/migration/index.rst index 7fc02b9520..9a8fd1ead7 100644 --- a/docs/devel/migration/index.rst +++ b/docs/devel/migration/index.rst @@ -11,3 +11,4 @@ QEMU live migration works. compatibility vfio virtio + best-practices diff --git a/docs/devel/migration/main.rst b/docs/devel/migration/main.rst index 04194414af..7ca3b4dd3f 100644 --- a/docs/devel/migration/main.rst +++ b/docs/devel/migration/main.rst @@ -52,27 +52,6 @@ All these migration protocols use the same infrastructure to save/restore state devices. This infrastructure is shared with the savevm/loadvm functionality. -Debugging -========= - -The migration stream can be analyzed thanks to ``scripts/analyze-migration.py``. - -Example usage: - -.. code-block:: shell - - $ qemu-system-x86_64 -display none -monitor stdio - (qemu) migrate "exec:cat > mig" - (qemu) q - $ ./scripts/analyze-migration.py -f mig - { - "ram (3)": { - "section sizes": { - "pc.ram": "0x0000000008000000", - ... - -See also ``analyze-migration.py -h`` help for more options. - Common infrastructure ===================== @@ -970,26 +949,3 @@ the background migration channel. Anyone who cares about latencies of page faults during a postcopy migration should enable this feature. By default, it's not enabled. 
-Firmware -======== - -Migration migrates the copies of RAM and ROM, and thus when running -on the destination it includes the firmware from the source. Even after -resetting a VM, the old firmware is used. Only once QEMU has been restarted -is the new firmware in use. - -- Changes in firmware size can cause changes in the required RAMBlock size - to hold the firmware and thus migration can fail. In practice it's best - to pad firmware images to convenient powers of 2 with plenty of space - for growth. - -- Care should be taken with device emulation code so that newer - emulation code can work with older firmware to allow forward migration. - -- Care should be taken with newer firmware so that backward migration - to older systems with older device emulation code will work. - -In some cases it may be best to tie specific firmware versions to specific -versioned machine types to cut down on the combinations that will need -support. This is also useful when newer versions of firmware outgrow -the padding. -- Gitee From f317cc2ec642222dc8bfa5678694b85d8407a5cc Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 9 Jan 2024 14:46:24 +0800 Subject: [PATCH 025/134] docs/migration: Split "Postcopy" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit bfb4c7cd99f1c39dedf33381954d03b9f8f244ec upstream. Split postcopy into a separate file. Introduce a head page "features.rst" to keep all the features on top of migration framework. Intel-SIG: commit bfb4c7cd99f1 docs/migration: Split "Postcopy" Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/r/20240109064628.595453-7-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- docs/devel/migration/features.rst | 9 + docs/devel/migration/index.rst | 1 + docs/devel/migration/main.rst | 305 ------------------------------ docs/devel/migration/postcopy.rst | 304 +++++++++++++++++++++++++++++ 4 files changed, 314 insertions(+), 305 deletions(-) create mode 100644 docs/devel/migration/features.rst create mode 100644 docs/devel/migration/postcopy.rst diff --git a/docs/devel/migration/features.rst b/docs/devel/migration/features.rst new file mode 100644 index 0000000000..0054e0c900 --- /dev/null +++ b/docs/devel/migration/features.rst @@ -0,0 +1,9 @@ +Migration features +================== + +Migration has plenty of features to support different use cases. + +.. toctree:: + :maxdepth: 2 + + postcopy diff --git a/docs/devel/migration/index.rst b/docs/devel/migration/index.rst index 9a8fd1ead7..21ad58b189 100644 --- a/docs/devel/migration/index.rst +++ b/docs/devel/migration/index.rst @@ -8,6 +8,7 @@ QEMU live migration works. :maxdepth: 2 main + features compatibility vfio virtio diff --git a/docs/devel/migration/main.rst b/docs/devel/migration/main.rst index 7ca3b4dd3f..1e98e9e40c 100644 --- a/docs/devel/migration/main.rst +++ b/docs/devel/migration/main.rst @@ -644,308 +644,3 @@ algorithm will restrict virtual CPUs as needed to keep their dirty page rate inside the limit. This leads to more steady reading performance during live migration and can aid in improving large guest responsiveness. -Postcopy -======== - -'Postcopy' migration is a way to deal with migrations that refuse to converge -(or take too long to converge) its plus side is that there is an upper bound on -the amount of migration traffic and time it takes, the down side is that during -the postcopy phase, a failure of *either* side causes the guest to be lost. 
- -In postcopy the destination CPUs are started before all the memory has been -transferred, and accesses to pages that are yet to be transferred cause -a fault that's translated by QEMU into a request to the source QEMU. - -Postcopy can be combined with precopy (i.e. normal migration) so that if precopy -doesn't finish in a given time the switch is made to postcopy. - -Enabling postcopy ------------------ - -To enable postcopy, issue this command on the monitor (both source and -destination) prior to the start of migration: - -``migrate_set_capability postcopy-ram on`` - -The normal commands are then used to start a migration, which is still -started in precopy mode. Issuing: - -``migrate_start_postcopy`` - -will now cause the transition from precopy to postcopy. -It can be issued immediately after migration is started or any -time later on. Issuing it after the end of a migration is harmless. - -Blocktime is a postcopy live migration metric, intended to show how -long the vCPU was in state of interruptible sleep due to pagefault. -That metric is calculated both for all vCPUs as overlapped value, and -separately for each vCPU. These values are calculated on destination -side. To enable postcopy blocktime calculation, enter following -command on destination monitor: - -``migrate_set_capability postcopy-blocktime on`` - -Postcopy blocktime can be retrieved by query-migrate qmp command. -postcopy-blocktime value of qmp command will show overlapped blocking -time for all vCPU, postcopy-vcpu-blocktime will show list of blocking -time per vCPU. - -.. note:: - During the postcopy phase, the bandwidth limits set using - ``migrate_set_parameter`` is ignored (to avoid delaying requested pages that - the destination is waiting for). - -Postcopy device transfer ------------------------- - -Loading of device data may cause the device emulation to access guest RAM -that may trigger faults that have to be resolved by the source, as such -the migration stream has to be able to respond with page data *during* the -device load, and hence the device data has to be read from the stream completely -before the device load begins to free the stream up. This is achieved by -'packaging' the device data into a blob that's read in one go. - -Source behaviour ----------------- - -Until postcopy is entered the migration stream is identical to normal -precopy, except for the addition of a 'postcopy advise' command at -the beginning, to tell the destination that postcopy might happen. -When postcopy starts the source sends the page discard data and then -forms the 'package' containing: - - - Command: 'postcopy listen' - - The device state - - A series of sections, identical to the precopy streams device state stream - containing everything except postcopiable devices (i.e. RAM) - - Command: 'postcopy run' - -The 'package' is sent as the data part of a Command: ``CMD_PACKAGED``, and the -contents are formatted in the same way as the main migration stream. - -During postcopy the source scans the list of dirty pages and sends them -to the destination without being requested (in much the same way as precopy), -however when a page request is received from the destination, the dirty page -scanning restarts from the requested location. This causes requested pages -to be sent quickly, and also causes pages directly after the requested page -to be sent quickly in the hope that those pages are likely to be used -by the destination soon. 
- -Destination behaviour ---------------------- - -Initially the destination looks the same as precopy, with a single thread -reading the migration stream; the 'postcopy advise' and 'discard' commands -are processed to change the way RAM is managed, but don't affect the stream -processing. - -:: - - ------------------------------------------------------------------------------ - 1 2 3 4 5 6 7 - main -----DISCARD-CMD_PACKAGED ( LISTEN DEVICE DEVICE DEVICE RUN ) - thread | | - | (page request) - | \___ - v \ - listen thread: --- page -- page -- page -- page -- page -- - - a b c - ------------------------------------------------------------------------------ - -- On receipt of ``CMD_PACKAGED`` (1) - - All the data associated with the package - the ( ... ) section in the diagram - - is read into memory, and the main thread recurses into qemu_loadvm_state_main - to process the contents of the package (2) which contains commands (3,6) and - devices (4...) - -- On receipt of 'postcopy listen' - 3 -(i.e. the 1st command in the package) - - a new thread (a) is started that takes over servicing the migration stream, - while the main thread carries on loading the package. It loads normal - background page data (b) but if during a device load a fault happens (5) - the returned page (c) is loaded by the listen thread allowing the main - threads device load to carry on. - -- The last thing in the ``CMD_PACKAGED`` is a 'RUN' command (6) - - letting the destination CPUs start running. At the end of the - ``CMD_PACKAGED`` (7) the main thread returns to normal running behaviour and - is no longer used by migration, while the listen thread carries on servicing - page data until the end of migration. - -Postcopy Recovery ------------------ - -Comparing to precopy, postcopy is special on error handlings. When any -error happens (in this case, mostly network errors), QEMU cannot easily -fail a migration because VM data resides in both source and destination -QEMU instances. On the other hand, when issue happens QEMU on both sides -will go into a paused state. It'll need a recovery phase to continue a -paused postcopy migration. - -The recovery phase normally contains a few steps: - - - When network issue occurs, both QEMU will go into PAUSED state - - - When the network is recovered (or a new network is provided), the admin - can setup the new channel for migration using QMP command - 'migrate-recover' on destination node, preparing for a resume. - - - On source host, the admin can continue the interrupted postcopy - migration using QMP command 'migrate' with resume=true flag set. - - - After the connection is re-established, QEMU will continue the postcopy - migration on both sides. - -During a paused postcopy migration, the VM can logically still continue -running, and it will not be impacted from any page access to pages that -were already migrated to destination VM before the interruption happens. -However, if any of the missing pages got accessed on destination VM, the VM -thread will be halted waiting for the page to be migrated, it means it can -be halted until the recovery is complete. - -The impact of accessing missing pages can be relevant to different -configurations of the guest. For example, when with async page fault -enabled, logically the guest can proactively schedule out the threads -accessing missing pages. 
- -Postcopy states ---------------- - -Postcopy moves through a series of states (see postcopy_state) from -ADVISE->DISCARD->LISTEN->RUNNING->END - - - Advise - - Set at the start of migration if postcopy is enabled, even - if it hasn't had the start command; here the destination - checks that its OS has the support needed for postcopy, and performs - setup to ensure the RAM mappings are suitable for later postcopy. - The destination will fail early in migration at this point if the - required OS support is not present. - (Triggered by reception of POSTCOPY_ADVISE command) - - - Discard - - Entered on receipt of the first 'discard' command; prior to - the first Discard being performed, hugepages are switched off - (using madvise) to ensure that no new huge pages are created - during the postcopy phase, and to cause any huge pages that - have discards on them to be broken. - - - Listen - - The first command in the package, POSTCOPY_LISTEN, switches - the destination state to Listen, and starts a new thread - (the 'listen thread') which takes over the job of receiving - pages off the migration stream, while the main thread carries - on processing the blob. With this thread able to process page - reception, the destination now 'sensitises' the RAM to detect - any access to missing pages (on Linux using the 'userfault' - system). - - - Running - - POSTCOPY_RUN causes the destination to synchronise all - state and start the CPUs and IO devices running. The main - thread now finishes processing the migration package and - now carries on as it would for normal precopy migration - (although it can't do the cleanup it would do as it - finishes a normal migration). - - - Paused - - Postcopy can run into a paused state (normally on both sides when - happens), where all threads will be temporarily halted mostly due to - network errors. When reaching paused state, migration will make sure - the qemu binary on both sides maintain the data without corrupting - the VM. To continue the migration, the admin needs to fix the - migration channel using the QMP command 'migrate-recover' on the - destination node, then resume the migration using QMP command 'migrate' - again on source node, with resume=true flag set. - - - End - - The listen thread can now quit, and perform the cleanup of migration - state, the migration is now complete. - -Source side page map --------------------- - -The 'migration bitmap' in postcopy is basically the same as in the precopy, -where each of the bit to indicate that page is 'dirty' - i.e. needs -sending. During the precopy phase this is updated as the CPU dirties -pages, however during postcopy the CPUs are stopped and nothing should -dirty anything any more. Instead, dirty bits are cleared when the relevant -pages are sent during postcopy. - -Postcopy with hugepages ------------------------ - -Postcopy now works with hugetlbfs backed memory: - - a) The linux kernel on the destination must support userfault on hugepages. - b) The huge-page configuration on the source and destination VMs must be - identical; i.e. RAMBlocks on both sides must use the same page size. - c) Note that ``-mem-path /dev/hugepages`` will fall back to allocating normal - RAM if it doesn't have enough hugepages, triggering (b) to fail. - Using ``-mem-prealloc`` enforces the allocation using hugepages. 
- d) Care should be taken with the size of hugepage used; postcopy with 2MB - hugepages works well, however 1GB hugepages are likely to be problematic - since it takes ~1 second to transfer a 1GB hugepage across a 10Gbps link, - and until the full page is transferred the destination thread is blocked. - -Postcopy with shared memory ---------------------------- - -Postcopy migration with shared memory needs explicit support from the other -processes that share memory and from QEMU. There are restrictions on the type of -memory that userfault can support shared. - -The Linux kernel userfault support works on ``/dev/shm`` memory and on ``hugetlbfs`` -(although the kernel doesn't provide an equivalent to ``madvise(MADV_DONTNEED)`` -for hugetlbfs which may be a problem in some configurations). - -The vhost-user code in QEMU supports clients that have Postcopy support, -and the ``vhost-user-bridge`` (in ``tests/``) and the DPDK package have changes -to support postcopy. - -The client needs to open a userfaultfd and register the areas -of memory that it maps with userfault. The client must then pass the -userfaultfd back to QEMU together with a mapping table that allows -fault addresses in the clients address space to be converted back to -RAMBlock/offsets. The client's userfaultfd is added to the postcopy -fault-thread and page requests are made on behalf of the client by QEMU. -QEMU performs 'wake' operations on the client's userfaultfd to allow it -to continue after a page has arrived. - -.. note:: - There are two future improvements that would be nice: - a) Some way to make QEMU ignorant of the addresses in the clients - address space - b) Avoiding the need for QEMU to perform ufd-wake calls after the - pages have arrived - -Retro-fitting postcopy to existing clients is possible: - a) A mechanism is needed for the registration with userfault as above, - and the registration needs to be coordinated with the phases of - postcopy. In vhost-user extra messages are added to the existing - control channel. - b) Any thread that can block due to guest memory accesses must be - identified and the implication understood; for example if the - guest memory access is made while holding a lock then all other - threads waiting for that lock will also be blocked. - -Postcopy Preemption Mode ------------------------- - -Postcopy preempt is a new capability introduced in 8.0 QEMU release, it -allows urgent pages (those got page fault requested from destination QEMU -explicitly) to be sent in a separate preempt channel, rather than queued in -the background migration channel. Anyone who cares about latencies of page -faults during a postcopy migration should enable this feature. By default, -it's not enabled. - diff --git a/docs/devel/migration/postcopy.rst b/docs/devel/migration/postcopy.rst new file mode 100644 index 0000000000..d60eec06ab --- /dev/null +++ b/docs/devel/migration/postcopy.rst @@ -0,0 +1,304 @@ +Postcopy +======== + +'Postcopy' migration is a way to deal with migrations that refuse to converge +(or take too long to converge) its plus side is that there is an upper bound on +the amount of migration traffic and time it takes, the down side is that during +the postcopy phase, a failure of *either* side causes the guest to be lost. + +In postcopy the destination CPUs are started before all the memory has been +transferred, and accesses to pages that are yet to be transferred cause +a fault that's translated by QEMU into a request to the source QEMU. + +Postcopy can be combined with precopy (i.e. 
normal migration) so that if precopy +doesn't finish in a given time the switch is made to postcopy. + +Enabling postcopy +----------------- + +To enable postcopy, issue this command on the monitor (both source and +destination) prior to the start of migration: + +``migrate_set_capability postcopy-ram on`` + +The normal commands are then used to start a migration, which is still +started in precopy mode. Issuing: + +``migrate_start_postcopy`` + +will now cause the transition from precopy to postcopy. +It can be issued immediately after migration is started or any +time later on. Issuing it after the end of a migration is harmless. + +Blocktime is a postcopy live migration metric, intended to show how +long the vCPU was in state of interruptible sleep due to pagefault. +That metric is calculated both for all vCPUs as overlapped value, and +separately for each vCPU. These values are calculated on destination +side. To enable postcopy blocktime calculation, enter following +command on destination monitor: + +``migrate_set_capability postcopy-blocktime on`` + +Postcopy blocktime can be retrieved by query-migrate qmp command. +postcopy-blocktime value of qmp command will show overlapped blocking +time for all vCPU, postcopy-vcpu-blocktime will show list of blocking +time per vCPU. + +.. note:: + During the postcopy phase, the bandwidth limits set using + ``migrate_set_parameter`` is ignored (to avoid delaying requested pages that + the destination is waiting for). + +Postcopy device transfer +------------------------ + +Loading of device data may cause the device emulation to access guest RAM +that may trigger faults that have to be resolved by the source, as such +the migration stream has to be able to respond with page data *during* the +device load, and hence the device data has to be read from the stream completely +before the device load begins to free the stream up. This is achieved by +'packaging' the device data into a blob that's read in one go. + +Source behaviour +---------------- + +Until postcopy is entered the migration stream is identical to normal +precopy, except for the addition of a 'postcopy advise' command at +the beginning, to tell the destination that postcopy might happen. +When postcopy starts the source sends the page discard data and then +forms the 'package' containing: + + - Command: 'postcopy listen' + - The device state + + A series of sections, identical to the precopy streams device state stream + containing everything except postcopiable devices (i.e. RAM) + - Command: 'postcopy run' + +The 'package' is sent as the data part of a Command: ``CMD_PACKAGED``, and the +contents are formatted in the same way as the main migration stream. + +During postcopy the source scans the list of dirty pages and sends them +to the destination without being requested (in much the same way as precopy), +however when a page request is received from the destination, the dirty page +scanning restarts from the requested location. This causes requested pages +to be sent quickly, and also causes pages directly after the requested page +to be sent quickly in the hope that those pages are likely to be used +by the destination soon. + +Destination behaviour +--------------------- + +Initially the destination looks the same as precopy, with a single thread +reading the migration stream; the 'postcopy advise' and 'discard' commands +are processed to change the way RAM is managed, but don't affect the stream +processing. 
+ +:: + + ------------------------------------------------------------------------------ + 1 2 3 4 5 6 7 + main -----DISCARD-CMD_PACKAGED ( LISTEN DEVICE DEVICE DEVICE RUN ) + thread | | + | (page request) + | \___ + v \ + listen thread: --- page -- page -- page -- page -- page -- + + a b c + ------------------------------------------------------------------------------ + +- On receipt of ``CMD_PACKAGED`` (1) + + All the data associated with the package - the ( ... ) section in the diagram - + is read into memory, and the main thread recurses into qemu_loadvm_state_main + to process the contents of the package (2) which contains commands (3,6) and + devices (4...) + +- On receipt of 'postcopy listen' - 3 -(i.e. the 1st command in the package) + + a new thread (a) is started that takes over servicing the migration stream, + while the main thread carries on loading the package. It loads normal + background page data (b) but if during a device load a fault happens (5) + the returned page (c) is loaded by the listen thread allowing the main + threads device load to carry on. + +- The last thing in the ``CMD_PACKAGED`` is a 'RUN' command (6) + + letting the destination CPUs start running. At the end of the + ``CMD_PACKAGED`` (7) the main thread returns to normal running behaviour and + is no longer used by migration, while the listen thread carries on servicing + page data until the end of migration. + +Postcopy Recovery +----------------- + +Comparing to precopy, postcopy is special on error handlings. When any +error happens (in this case, mostly network errors), QEMU cannot easily +fail a migration because VM data resides in both source and destination +QEMU instances. On the other hand, when issue happens QEMU on both sides +will go into a paused state. It'll need a recovery phase to continue a +paused postcopy migration. + +The recovery phase normally contains a few steps: + + - When network issue occurs, both QEMU will go into PAUSED state + + - When the network is recovered (or a new network is provided), the admin + can setup the new channel for migration using QMP command + 'migrate-recover' on destination node, preparing for a resume. + + - On source host, the admin can continue the interrupted postcopy + migration using QMP command 'migrate' with resume=true flag set. + + - After the connection is re-established, QEMU will continue the postcopy + migration on both sides. + +During a paused postcopy migration, the VM can logically still continue +running, and it will not be impacted from any page access to pages that +were already migrated to destination VM before the interruption happens. +However, if any of the missing pages got accessed on destination VM, the VM +thread will be halted waiting for the page to be migrated, it means it can +be halted until the recovery is complete. + +The impact of accessing missing pages can be relevant to different +configurations of the guest. For example, when with async page fault +enabled, logically the guest can proactively schedule out the threads +accessing missing pages. + +Postcopy states +--------------- + +Postcopy moves through a series of states (see postcopy_state) from +ADVISE->DISCARD->LISTEN->RUNNING->END + + - Advise + + Set at the start of migration if postcopy is enabled, even + if it hasn't had the start command; here the destination + checks that its OS has the support needed for postcopy, and performs + setup to ensure the RAM mappings are suitable for later postcopy. 
+ The destination will fail early in migration at this point if the + required OS support is not present. + (Triggered by reception of POSTCOPY_ADVISE command) + + - Discard + + Entered on receipt of the first 'discard' command; prior to + the first Discard being performed, hugepages are switched off + (using madvise) to ensure that no new huge pages are created + during the postcopy phase, and to cause any huge pages that + have discards on them to be broken. + + - Listen + + The first command in the package, POSTCOPY_LISTEN, switches + the destination state to Listen, and starts a new thread + (the 'listen thread') which takes over the job of receiving + pages off the migration stream, while the main thread carries + on processing the blob. With this thread able to process page + reception, the destination now 'sensitises' the RAM to detect + any access to missing pages (on Linux using the 'userfault' + system). + + - Running + + POSTCOPY_RUN causes the destination to synchronise all + state and start the CPUs and IO devices running. The main + thread now finishes processing the migration package and + now carries on as it would for normal precopy migration + (although it can't do the cleanup it would do as it + finishes a normal migration). + + - Paused + + Postcopy can run into a paused state (normally on both sides when + happens), where all threads will be temporarily halted mostly due to + network errors. When reaching paused state, migration will make sure + the qemu binary on both sides maintain the data without corrupting + the VM. To continue the migration, the admin needs to fix the + migration channel using the QMP command 'migrate-recover' on the + destination node, then resume the migration using QMP command 'migrate' + again on source node, with resume=true flag set. + + - End + + The listen thread can now quit, and perform the cleanup of migration + state, the migration is now complete. + +Source side page map +-------------------- + +The 'migration bitmap' in postcopy is basically the same as in the precopy, +where each of the bit to indicate that page is 'dirty' - i.e. needs +sending. During the precopy phase this is updated as the CPU dirties +pages, however during postcopy the CPUs are stopped and nothing should +dirty anything any more. Instead, dirty bits are cleared when the relevant +pages are sent during postcopy. + +Postcopy with hugepages +----------------------- + +Postcopy now works with hugetlbfs backed memory: + + a) The linux kernel on the destination must support userfault on hugepages. + b) The huge-page configuration on the source and destination VMs must be + identical; i.e. RAMBlocks on both sides must use the same page size. + c) Note that ``-mem-path /dev/hugepages`` will fall back to allocating normal + RAM if it doesn't have enough hugepages, triggering (b) to fail. + Using ``-mem-prealloc`` enforces the allocation using hugepages. + d) Care should be taken with the size of hugepage used; postcopy with 2MB + hugepages works well, however 1GB hugepages are likely to be problematic + since it takes ~1 second to transfer a 1GB hugepage across a 10Gbps link, + and until the full page is transferred the destination thread is blocked. + +Postcopy with shared memory +--------------------------- + +Postcopy migration with shared memory needs explicit support from the other +processes that share memory and from QEMU. There are restrictions on the type of +memory that userfault can support shared. 
+ +The Linux kernel userfault support works on ``/dev/shm`` memory and on ``hugetlbfs`` +(although the kernel doesn't provide an equivalent to ``madvise(MADV_DONTNEED)`` +for hugetlbfs which may be a problem in some configurations). + +The vhost-user code in QEMU supports clients that have Postcopy support, +and the ``vhost-user-bridge`` (in ``tests/``) and the DPDK package have changes +to support postcopy. + +The client needs to open a userfaultfd and register the areas +of memory that it maps with userfault. The client must then pass the +userfaultfd back to QEMU together with a mapping table that allows +fault addresses in the clients address space to be converted back to +RAMBlock/offsets. The client's userfaultfd is added to the postcopy +fault-thread and page requests are made on behalf of the client by QEMU. +QEMU performs 'wake' operations on the client's userfaultfd to allow it +to continue after a page has arrived. + +.. note:: + There are two future improvements that would be nice: + a) Some way to make QEMU ignorant of the addresses in the clients + address space + b) Avoiding the need for QEMU to perform ufd-wake calls after the + pages have arrived + +Retro-fitting postcopy to existing clients is possible: + a) A mechanism is needed for the registration with userfault as above, + and the registration needs to be coordinated with the phases of + postcopy. In vhost-user extra messages are added to the existing + control channel. + b) Any thread that can block due to guest memory accesses must be + identified and the implication understood; for example if the + guest memory access is made while holding a lock then all other + threads waiting for that lock will also be blocked. + +Postcopy Preemption Mode +------------------------ + +Postcopy preempt is a new capability introduced in 8.0 QEMU release, it +allows urgent pages (those got page fault requested from destination QEMU +explicitly) to be sent in a separate preempt channel, rather than queued in +the background migration channel. Anyone who cares about latencies of page +faults during a postcopy migration should enable this feature. By default, +it's not enabled. -- Gitee From b588513eae559a5f362330f0a0931b3d768ee3e8 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 9 Jan 2024 14:46:25 +0800 Subject: [PATCH 026/134] docs/migration: Split "dirty limit" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 4c6f8a79ae539eeb1f86af6522e4000edde3638b upstream. Split that into a separate file, put under "features". Intel-SIG: commit 4c6f8a79ae53 docs/migration: Split "dirty limit" Cc: Yong Huang Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/r/20240109064628.595453-8-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- docs/devel/migration/dirty-limit.rst | 71 ++++++++++++++++++++++++++++ docs/devel/migration/features.rst | 1 + docs/devel/migration/main.rst | 71 ---------------------------- 3 files changed, 72 insertions(+), 71 deletions(-) create mode 100644 docs/devel/migration/dirty-limit.rst diff --git a/docs/devel/migration/dirty-limit.rst b/docs/devel/migration/dirty-limit.rst new file mode 100644 index 0000000000..8f32329d5f --- /dev/null +++ b/docs/devel/migration/dirty-limit.rst @@ -0,0 +1,71 @@ +Dirty limit +=========== + +The dirty limit, short for dirty page rate upper limit, is a new capability +introduced in the 8.1 QEMU release that uses a new algorithm based on the KVM +dirty ring to throttle down the guest during live migration. 
+ +The algorithm framework is as follows: + +:: + + ------------------------------------------------------------------------------ + main --------------> throttle thread ------------> PREPARE(1) <-------- + thread \ | | + \ | | + \ V | + -\ CALCULATE(2) | + \ | | + \ | | + \ V | + \ SET PENALTY(3) ----- + -\ | + \ | + \ V + -> virtual CPU thread -------> ACCEPT PENALTY(4) + ------------------------------------------------------------------------------ + +When the qmp command qmp_set_vcpu_dirty_limit is called for the first time, +the QEMU main thread starts the throttle thread. The throttle thread, once +launched, executes the loop, which consists of three steps: + + - PREPARE (1) + + The entire work of PREPARE (1) is preparation for the second stage, + CALCULATE(2), as the name implies. It involves preparing the dirty + page rate value and the corresponding upper limit of the VM: + The dirty page rate is calculated via the KVM dirty ring mechanism, + which tells QEMU how many dirty pages a virtual CPU has had since the + last KVM_EXIT_DIRTY_RING_FULL exception; The dirty page rate upper + limit is specified by caller, therefore fetch it directly. + + - CALCULATE (2) + + Calculate a suitable sleep period for each virtual CPU, which will be + used to determine the penalty for the target virtual CPU. The + computation must be done carefully in order to reduce the dirty page + rate progressively down to the upper limit without oscillation. To + achieve this, two strategies are provided: the first is to add or + subtract sleep time based on the ratio of the current dirty page rate + to the limit, which is used when the current dirty page rate is far + from the limit; the second is to add or subtract a fixed time when + the current dirty page rate is close to the limit. + + - SET PENALTY (3) + + Set the sleep time for each virtual CPU that should be penalized based + on the results of the calculation supplied by step CALCULATE (2). + +After completing the three above stages, the throttle thread loops back +to step PREPARE (1) until the dirty limit is reached. + +On the other hand, each virtual CPU thread reads the sleep duration and +sleeps in the path of the KVM_EXIT_DIRTY_RING_FULL exception handler, that +is ACCEPT PENALTY (4). Virtual CPUs tied with writing processes will +obviously exit to the path and get penalized, whereas virtual CPUs involved +with read processes will not. + +In summary, thanks to the KVM dirty ring technology, the dirty limit +algorithm will restrict virtual CPUs as needed to keep their dirty page +rate inside the limit. This leads to more steady reading performance during +live migration and can aid in improving large guest responsiveness. diff --git a/docs/devel/migration/features.rst b/docs/devel/migration/features.rst index 0054e0c900..e257d0d100 100644 --- a/docs/devel/migration/features.rst +++ b/docs/devel/migration/features.rst @@ -7,3 +7,4 @@ Migration has plenty of features to support different use cases. :maxdepth: 2 postcopy + dirty-limit diff --git a/docs/devel/migration/main.rst b/docs/devel/migration/main.rst index 1e98e9e40c..396c7c51ca 100644 --- a/docs/devel/migration/main.rst +++ b/docs/devel/migration/main.rst @@ -573,74 +573,3 @@ path. 
Return path - opened by main thread, written by main thread AND postcopy thread (protected by rp_mutex) -Dirty limit -===================== -The dirty limit, short for dirty page rate upper limit, is a new capability -introduced in the 8.1 QEMU release that uses a new algorithm based on the KVM -dirty ring to throttle down the guest during live migration. - -The algorithm framework is as follows: - -:: - - ------------------------------------------------------------------------------ - main --------------> throttle thread ------------> PREPARE(1) <-------- - thread \ | | - \ | | - \ V | - -\ CALCULATE(2) | - \ | | - \ | | - \ V | - \ SET PENALTY(3) ----- - -\ | - \ | - \ V - -> virtual CPU thread -------> ACCEPT PENALTY(4) - ------------------------------------------------------------------------------ - -When the qmp command qmp_set_vcpu_dirty_limit is called for the first time, -the QEMU main thread starts the throttle thread. The throttle thread, once -launched, executes the loop, which consists of three steps: - - - PREPARE (1) - - The entire work of PREPARE (1) is preparation for the second stage, - CALCULATE(2), as the name implies. It involves preparing the dirty - page rate value and the corresponding upper limit of the VM: - The dirty page rate is calculated via the KVM dirty ring mechanism, - which tells QEMU how many dirty pages a virtual CPU has had since the - last KVM_EXIT_DIRTY_RING_FULL exception; The dirty page rate upper - limit is specified by caller, therefore fetch it directly. - - - CALCULATE (2) - - Calculate a suitable sleep period for each virtual CPU, which will be - used to determine the penalty for the target virtual CPU. The - computation must be done carefully in order to reduce the dirty page - rate progressively down to the upper limit without oscillation. To - achieve this, two strategies are provided: the first is to add or - subtract sleep time based on the ratio of the current dirty page rate - to the limit, which is used when the current dirty page rate is far - from the limit; the second is to add or subtract a fixed time when - the current dirty page rate is close to the limit. - - - SET PENALTY (3) - - Set the sleep time for each virtual CPU that should be penalized based - on the results of the calculation supplied by step CALCULATE (2). - -After completing the three above stages, the throttle thread loops back -to step PREPARE (1) until the dirty limit is reached. - -On the other hand, each virtual CPU thread reads the sleep duration and -sleeps in the path of the KVM_EXIT_DIRTY_RING_FULL exception handler, that -is ACCEPT PENALTY (4). Virtual CPUs tied with writing processes will -obviously exit to the path and get penalized, whereas virtual CPUs involved -with read processes will not. - -In summary, thanks to the KVM dirty ring technology, the dirty limit -algorithm will restrict virtual CPUs as needed to keep their dirty page -rate inside the limit. This leads to more steady reading performance during -live migration and can aid in improving large guest responsiveness. - -- Gitee From 10c1ad602097c95f0414e825d074fa9f9020e6b6 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 9 Jan 2024 14:46:26 +0800 Subject: [PATCH 027/134] docs/migration: Organize "Postcopy" page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 21b17cd011c959c3fd3fdad994389410a02df901 upstream. Reorganize the page, moving things around, and add a few headlines ("Postcopy internals", "Postcopy features") to cover sub-areas. 
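For reference, with the new headlines in place the page is laid out roughly as
follows (the exact sub-section order can be read off the hunks below):

    Postcopy
      Enabling postcopy
      Postcopy internals
        State machine
        Device transfer
        Source behaviour
        Destination behaviour
        Source side page bitmap
      Postcopy features
        Postcopy recovery
        Postcopy with hugepages
        Postcopy with shared memory
        Postcopy preemption mode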
Intel-SIG: commit 21b17cd011c9 docs/migration: Organize "Postcopy" page Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/r/20240109064628.595453-9-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- docs/devel/migration/postcopy.rst | 159 ++++++++++++++++-------------- 1 file changed, 84 insertions(+), 75 deletions(-) diff --git a/docs/devel/migration/postcopy.rst b/docs/devel/migration/postcopy.rst index d60eec06ab..6c51e96d79 100644 --- a/docs/devel/migration/postcopy.rst +++ b/docs/devel/migration/postcopy.rst @@ -1,6 +1,9 @@ +======== Postcopy ======== +.. contents:: + 'Postcopy' migration is a way to deal with migrations that refuse to converge (or take too long to converge) its plus side is that there is an upper bound on the amount of migration traffic and time it takes, the down side is that during @@ -14,7 +17,7 @@ Postcopy can be combined with precopy (i.e. normal migration) so that if precopy doesn't finish in a given time the switch is made to postcopy. Enabling postcopy ------------------ +================= To enable postcopy, issue this command on the monitor (both source and destination) prior to the start of migration: @@ -49,8 +52,71 @@ time per vCPU. ``migrate_set_parameter`` is ignored (to avoid delaying requested pages that the destination is waiting for). -Postcopy device transfer ------------------------- +Postcopy internals +================== + +State machine +------------- + +Postcopy moves through a series of states (see postcopy_state) from +ADVISE->DISCARD->LISTEN->RUNNING->END + + - Advise + + Set at the start of migration if postcopy is enabled, even + if it hasn't had the start command; here the destination + checks that its OS has the support needed for postcopy, and performs + setup to ensure the RAM mappings are suitable for later postcopy. + The destination will fail early in migration at this point if the + required OS support is not present. + (Triggered by reception of POSTCOPY_ADVISE command) + + - Discard + + Entered on receipt of the first 'discard' command; prior to + the first Discard being performed, hugepages are switched off + (using madvise) to ensure that no new huge pages are created + during the postcopy phase, and to cause any huge pages that + have discards on them to be broken. + + - Listen + + The first command in the package, POSTCOPY_LISTEN, switches + the destination state to Listen, and starts a new thread + (the 'listen thread') which takes over the job of receiving + pages off the migration stream, while the main thread carries + on processing the blob. With this thread able to process page + reception, the destination now 'sensitises' the RAM to detect + any access to missing pages (on Linux using the 'userfault' + system). + + - Running + + POSTCOPY_RUN causes the destination to synchronise all + state and start the CPUs and IO devices running. The main + thread now finishes processing the migration package and + now carries on as it would for normal precopy migration + (although it can't do the cleanup it would do as it + finishes a normal migration). + + - Paused + + Postcopy can run into a paused state (normally on both sides when + happens), where all threads will be temporarily halted mostly due to + network errors. When reaching paused state, migration will make sure + the qemu binary on both sides maintain the data without corrupting + the VM. 
To continue the migration, the admin needs to fix the + migration channel using the QMP command 'migrate-recover' on the + destination node, then resume the migration using QMP command 'migrate' + again on source node, with resume=true flag set. + + - End + + The listen thread can now quit, and perform the cleanup of migration + state, the migration is now complete. + +Device transfer +--------------- Loading of device data may cause the device emulation to access guest RAM that may trigger faults that have to be resolved by the source, as such @@ -130,7 +196,20 @@ processing. is no longer used by migration, while the listen thread carries on servicing page data until the end of migration. -Postcopy Recovery +Source side page bitmap +----------------------- + +The 'migration bitmap' in postcopy is basically the same as in the precopy, +where each of the bit to indicate that page is 'dirty' - i.e. needs +sending. During the precopy phase this is updated as the CPU dirties +pages, however during postcopy the CPUs are stopped and nothing should +dirty anything any more. Instead, dirty bits are cleared when the relevant +pages are sent during postcopy. + +Postcopy features +================= + +Postcopy recovery ----------------- Comparing to precopy, postcopy is special on error handlings. When any @@ -166,76 +245,6 @@ configurations of the guest. For example, when with async page fault enabled, logically the guest can proactively schedule out the threads accessing missing pages. -Postcopy states ---------------- - -Postcopy moves through a series of states (see postcopy_state) from -ADVISE->DISCARD->LISTEN->RUNNING->END - - - Advise - - Set at the start of migration if postcopy is enabled, even - if it hasn't had the start command; here the destination - checks that its OS has the support needed for postcopy, and performs - setup to ensure the RAM mappings are suitable for later postcopy. - The destination will fail early in migration at this point if the - required OS support is not present. - (Triggered by reception of POSTCOPY_ADVISE command) - - - Discard - - Entered on receipt of the first 'discard' command; prior to - the first Discard being performed, hugepages are switched off - (using madvise) to ensure that no new huge pages are created - during the postcopy phase, and to cause any huge pages that - have discards on them to be broken. - - - Listen - - The first command in the package, POSTCOPY_LISTEN, switches - the destination state to Listen, and starts a new thread - (the 'listen thread') which takes over the job of receiving - pages off the migration stream, while the main thread carries - on processing the blob. With this thread able to process page - reception, the destination now 'sensitises' the RAM to detect - any access to missing pages (on Linux using the 'userfault' - system). - - - Running - - POSTCOPY_RUN causes the destination to synchronise all - state and start the CPUs and IO devices running. The main - thread now finishes processing the migration package and - now carries on as it would for normal precopy migration - (although it can't do the cleanup it would do as it - finishes a normal migration). - - - Paused - - Postcopy can run into a paused state (normally on both sides when - happens), where all threads will be temporarily halted mostly due to - network errors. When reaching paused state, migration will make sure - the qemu binary on both sides maintain the data without corrupting - the VM. 
To continue the migration, the admin needs to fix the - migration channel using the QMP command 'migrate-recover' on the - destination node, then resume the migration using QMP command 'migrate' - again on source node, with resume=true flag set. - - - End - - The listen thread can now quit, and perform the cleanup of migration - state, the migration is now complete. - -Source side page map --------------------- - -The 'migration bitmap' in postcopy is basically the same as in the precopy, -where each of the bit to indicate that page is 'dirty' - i.e. needs -sending. During the precopy phase this is updated as the CPU dirties -pages, however during postcopy the CPUs are stopped and nothing should -dirty anything any more. Instead, dirty bits are cleared when the relevant -pages are sent during postcopy. - Postcopy with hugepages ----------------------- @@ -293,7 +302,7 @@ Retro-fitting postcopy to existing clients is possible: guest memory access is made while holding a lock then all other threads waiting for that lock will also be blocked. -Postcopy Preemption Mode +Postcopy preemption mode ------------------------ Postcopy preempt is a new capability introduced in 8.0 QEMU release, it -- Gitee From 420ca0837acf15b95a921e51fdc063d4dda1eacb Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 9 Jan 2024 14:46:27 +0800 Subject: [PATCH 028/134] docs/migration: Further move vfio to be feature of migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 66fd3b1a7ab02f7d8c84f92eba23e3ddc955204d upstream. Move it one layer down, so taking VFIO-migration as a feature for migration. Intel-SIG: commit 66fd3b1a7ab0 docs/migration: Further move vfio to be feature of migration Cc: Alex Williamson Cc: Cédric Le Goater Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/r/20240109064628.595453-10-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- docs/devel/migration/features.rst | 1 + docs/devel/migration/index.rst | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/devel/migration/features.rst b/docs/devel/migration/features.rst index e257d0d100..dea016f707 100644 --- a/docs/devel/migration/features.rst +++ b/docs/devel/migration/features.rst @@ -8,3 +8,4 @@ Migration has plenty of features to support different use cases. postcopy dirty-limit + vfio diff --git a/docs/devel/migration/index.rst b/docs/devel/migration/index.rst index 21ad58b189..b1357309e1 100644 --- a/docs/devel/migration/index.rst +++ b/docs/devel/migration/index.rst @@ -10,6 +10,5 @@ QEMU live migration works. main features compatibility - vfio virtio best-practices -- Gitee From 109d5dbc0e1c9d220eb94b3e1b0ae7e791aaa90c Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 9 Jan 2024 14:46:28 +0800 Subject: [PATCH 029/134] docs/migration: Further move virtio to be feature of migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit eb9f6daae49c06bb91e9660908587cc55265e43a upstream. Move it one layer down, so taking Virtio-migration as a feature for migration. Intel-SIG: commit eb9f6daae49c docs/migration: Further move virtio to be feature of migration Cc: "Michael S. 
Tsirkin" Cc: Jason Wang Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/r/20240109064628.595453-11-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- docs/devel/migration/features.rst | 1 + docs/devel/migration/index.rst | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/devel/migration/features.rst b/docs/devel/migration/features.rst index dea016f707..a9acaf618e 100644 --- a/docs/devel/migration/features.rst +++ b/docs/devel/migration/features.rst @@ -9,3 +9,4 @@ Migration has plenty of features to support different use cases. postcopy dirty-limit vfio + virtio diff --git a/docs/devel/migration/index.rst b/docs/devel/migration/index.rst index b1357309e1..2aa294d631 100644 --- a/docs/devel/migration/index.rst +++ b/docs/devel/migration/index.rst @@ -10,5 +10,4 @@ QEMU live migration works. main features compatibility - virtio best-practices -- Gitee From 9af2fd6b64b97e1b8386c13090733ad42b9df3ed Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:35 +0800 Subject: [PATCH 030/134] migration/multifd: Drop stale comment for multifd zero copy commit 8888a552bf7af200e36ff123772547dfb4f133c4 upstream. We've already done that with multifd_flush_after_each_section, for multifd in general. Drop the stale "TODO-like" comment. Intel-SIG: commit 8888a552bf7a migration/multifd: Drop stale comment for multifd zero copy Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-2-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index eaae772321..c0d9fa17ba 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -599,17 +599,6 @@ int multifd_send_sync_main(void) } } - /* - * When using zero-copy, it's necessary to flush the pages before any of - * the pages can be sent again, so we'll make sure the new version of the - * pages will always arrive _later_ than the old pages. - * - * Currently we achieve this by flushing the zero-page requested writes - * per ram iteration, but in the future we could potentially optimize it - * to be less frequent, e.g. only after we finished one whole scanning of - * all the dirty bitmaps. - */ - flush_zero_copy = migrate_zero_copy_send(); for (i = 0; i < migrate_multifd_channels(); i++) { -- Gitee From 78ad9e5e83cf76b3d0d57ce10058bdbba29ebff9 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:36 +0800 Subject: [PATCH 031/134] migration/multifd: multifd_send_kick_main() commit 48c0f5d56fd2ff0a0cda23301637b742c690f59a upstream. When a multifd sender thread hit errors, it always needs to kick the main thread by kicking all the semaphores that it can be waiting upon. Provide a helper for it and deduplicate the code. Intel-SIG: commit 48c0f5d56fd2 migration/multifd: multifd_send_kick_main() Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-3-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index c0d9fa17ba..017be682a5 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -373,6 +373,18 @@ struct { MultiFDMethods *ops; } *multifd_send_state; +/* + * The migration thread can wait on either of the two semaphores. This + * function can be used to kick the main thread out of waiting on either of + * them. 
Should mostly only be called when something wrong happened with + * the current multifd send thread. + */ +static void multifd_send_kick_main(MultiFDSendParams *p) +{ + qemu_sem_post(&p->sem_sync); + qemu_sem_post(&multifd_send_state->channels_ready); +} + /* * How we use multifd_send_state->pages and channel->pages? * @@ -743,8 +755,7 @@ out: assert(local_err); trace_multifd_send_error(p->id); multifd_send_terminate_threads(local_err); - qemu_sem_post(&p->sem_sync); - qemu_sem_post(&multifd_send_state->channels_ready); + multifd_send_kick_main(p); error_free(local_err); } @@ -785,8 +796,7 @@ static void multifd_tls_outgoing_handshake(QIOTask *task, * is not created, and then tell who pay attention to me. */ p->quit = true; - qemu_sem_post(&multifd_send_state->channels_ready); - qemu_sem_post(&p->sem_sync); + multifd_send_kick_main(p); error_free(err); } @@ -856,8 +866,7 @@ static void multifd_new_send_channel_cleanup(MultiFDSendParams *p, { migrate_set_error(migrate_get_current(), err); /* Error happen, we need to tell who pay attention to me */ - qemu_sem_post(&multifd_send_state->channels_ready); - qemu_sem_post(&p->sem_sync); + multifd_send_kick_main(p); /* * Although multifd_send_thread is not created, but main migration * thread need to judge whether it is running, so we need to mark -- Gitee From 67b1b0ddeb844219c88cdd28ad475aa2fb4382e1 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:37 +0800 Subject: [PATCH 032/134] migration/multifd: Drop MultiFDSendParams.quit, cleanup error paths commit 15f3f21d598148895c33b6fc41e29777cf6ad992 upstream. Multifd send side has two fields to indicate error quits: - MultiFDSendParams.quit - &multifd_send_state->exiting Merge them into the global one. The replacement is done by changing all p->quit checks into the global var check. The global check doesn't need any lock. A few more things done on top of this altogether: - multifd_send_terminate_threads() Moving the xchg() of &multifd_send_state->exiting upper, so as to cover the tracepoint, migrate_set_error() and migrate_set_state(). - multifd_send_sync_main() In the 2nd loop, add one more check over the global var to make sure we don't keep the looping if QEMU already decided to quit. - multifd_tls_outgoing_handshake() Use multifd_send_terminate_threads() to set the error state. That has a benefit of updating MigrationState.error to that error too, so we can persist that 1st error we hit in that specific channel. - multifd_new_send_channel_async() Take similar approach like above, drop the migrate_set_error() because multifd_send_terminate_threads() already covers that. Unwrap the helper multifd_new_send_channel_cleanup() along the way; not really needed. Intel-SIG: commit 15f3f21d5981 migration/multifd: Drop MultiFDSendParams.quit, cleanup error paths Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-4-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 85 ++++++++++++++++++--------------------------- migration/multifd.h | 2 -- 2 files changed, 33 insertions(+), 54 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 017be682a5..2c3e347762 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -373,6 +373,11 @@ struct { MultiFDMethods *ops; } *multifd_send_state; +static bool multifd_send_should_exit(void) +{ + return qatomic_read(&multifd_send_state->exiting); +} + /* * The migration thread can wait on either of the two semaphores. 
This * function can be used to kick the main thread out of waiting on either of @@ -410,7 +415,7 @@ static int multifd_send_pages(void) MultiFDSendParams *p = NULL; /* make happy gcc */ MultiFDPages_t *pages = multifd_send_state->pages; - if (qatomic_read(&multifd_send_state->exiting)) { + if (multifd_send_should_exit()) { return -1; } @@ -422,14 +427,11 @@ static int multifd_send_pages(void) */ next_channel %= migrate_multifd_channels(); for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) { - p = &multifd_send_state->params[i]; - - qemu_mutex_lock(&p->mutex); - if (p->quit) { - error_report("%s: channel %d has already quit!", __func__, i); - qemu_mutex_unlock(&p->mutex); + if (multifd_send_should_exit()) { return -1; } + p = &multifd_send_state->params[i]; + qemu_mutex_lock(&p->mutex); if (!p->pending_job) { p->pending_job++; next_channel = (i + 1) % migrate_multifd_channels(); @@ -484,6 +486,16 @@ static void multifd_send_terminate_threads(Error *err) { int i; + /* + * We don't want to exit each threads twice. Depending on where + * we get the error, or if there are two independent errors in two + * threads at the same time, we can end calling this function + * twice. + */ + if (qatomic_xchg(&multifd_send_state->exiting, 1)) { + return; + } + trace_multifd_send_terminate_threads(err != NULL); if (err) { @@ -498,26 +510,13 @@ static void multifd_send_terminate_threads(Error *err) } } - /* - * We don't want to exit each threads twice. Depending on where - * we get the error, or if there are two independent errors in two - * threads at the same time, we can end calling this function - * twice. - */ - if (qatomic_xchg(&multifd_send_state->exiting, 1)) { - return; - } - for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; - qemu_mutex_lock(&p->mutex); - p->quit = true; qemu_sem_post(&p->sem); if (p->c) { qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL); } - qemu_mutex_unlock(&p->mutex); } } @@ -616,16 +615,13 @@ int multifd_send_sync_main(void) for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; - trace_multifd_send_sync_main_signal(p->id); - - qemu_mutex_lock(&p->mutex); - - if (p->quit) { - error_report("%s: channel %d has already quit", __func__, i); - qemu_mutex_unlock(&p->mutex); + if (multifd_send_should_exit()) { return -1; } + trace_multifd_send_sync_main_signal(p->id); + + qemu_mutex_lock(&p->mutex); p->packet_num = multifd_send_state->packet_num++; p->flags |= MULTIFD_FLAG_SYNC; p->pending_job++; @@ -635,6 +631,10 @@ int multifd_send_sync_main(void) for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; + if (multifd_send_should_exit()) { + return -1; + } + qemu_sem_wait(&multifd_send_state->channels_ready); trace_multifd_send_sync_main_wait(p->id); qemu_sem_wait(&p->sem_sync); @@ -675,7 +675,7 @@ static void *multifd_send_thread(void *opaque) qemu_sem_post(&multifd_send_state->channels_ready); qemu_sem_wait(&p->sem); - if (qatomic_read(&multifd_send_state->exiting)) { + if (multifd_send_should_exit()) { break; } qemu_mutex_lock(&p->mutex); @@ -790,12 +790,7 @@ static void multifd_tls_outgoing_handshake(QIOTask *task, trace_multifd_tls_outgoing_handshake_error(ioc, error_get_pretty(err)); - migrate_set_error(migrate_get_current(), err); - /* - * Error happen, mark multifd_send_thread status as 'quit' although it - * is not created, and then tell who pay attention to me. 
- */ - p->quit = true; + multifd_send_terminate_threads(err); multifd_send_kick_main(p); error_free(err); } @@ -861,22 +856,6 @@ static bool multifd_channel_connect(MultiFDSendParams *p, return true; } -static void multifd_new_send_channel_cleanup(MultiFDSendParams *p, - QIOChannel *ioc, Error *err) -{ - migrate_set_error(migrate_get_current(), err); - /* Error happen, we need to tell who pay attention to me */ - multifd_send_kick_main(p); - /* - * Although multifd_send_thread is not created, but main migration - * thread need to judge whether it is running, so we need to mark - * its status. - */ - p->quit = true; - object_unref(OBJECT(ioc)); - error_free(err); -} - static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque) { MultiFDSendParams *p = opaque; @@ -893,7 +872,10 @@ static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque) } trace_multifd_new_send_channel_async_error(p->id, local_err); - multifd_new_send_channel_cleanup(p, ioc, local_err); + multifd_send_terminate_threads(local_err); + multifd_send_kick_main(p); + object_unref(OBJECT(ioc)); + error_free(local_err); } static void multifd_new_send_channel_create(gpointer opaque) @@ -925,7 +907,6 @@ int multifd_save_setup(Error **errp) qemu_mutex_init(&p->mutex); qemu_sem_init(&p->sem, 0); qemu_sem_init(&p->sem_sync, 0); - p->quit = false; p->pending_job = 0; p->id = i; p->pages = multifd_pages_init(page_count); diff --git a/migration/multifd.h b/migration/multifd.h index 35d11f103c..7c040cb85a 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -95,8 +95,6 @@ typedef struct { QemuMutex mutex; /* is this channel thread running */ bool running; - /* should this thread finish */ - bool quit; /* multifd flags for each packet */ uint32_t flags; /* global number of generated multifd packets */ -- Gitee From fc33f91d28971f24b14fbfcb1b0b8e45242eedb3 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:38 +0800 Subject: [PATCH 033/134] migration/multifd: Postpone reset of MultiFDPages_t commit 836eca47f62f9f6d5b8e9b6fedfc3539775c4e2e upstream. Now we reset MultiFDPages_t object in the multifd sender thread in the middle of the sending job. That's not necessary, because the "*pages" struct will not be reused anyway until pending_job is cleared. Move that to the end after the job is completed, provide a helper to reset a "*pages" object. Use that same helper when free the object too. This prepares us to keep using p->pages in the follow up patches, where we may drop p->normal[]. Intel-SIG: commit 836eca47f62f migration/multifd: Postpone reset of MultiFDPages_t Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-5-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 2c3e347762..7de4aacb06 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -173,6 +173,17 @@ void multifd_register_ops(int method, MultiFDMethods *ops) multifd_ops[method] = ops; } +/* Reset a MultiFDPages_t* object for the next use */ +static void multifd_pages_reset(MultiFDPages_t *pages) +{ + /* + * We don't need to touch offset[] array, because it will be + * overwritten later when reused. 
+ */ + pages->num = 0; + pages->block = NULL; +} + static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp) { MultiFDInit_t msg = {}; @@ -249,9 +260,8 @@ static MultiFDPages_t *multifd_pages_init(uint32_t n) static void multifd_pages_clear(MultiFDPages_t *pages) { - pages->num = 0; + multifd_pages_reset(pages); pages->allocated = 0; - pages->block = NULL; g_free(pages->offset); pages->offset = NULL; g_free(pages); @@ -708,8 +718,6 @@ static void *multifd_send_thread(void *opaque) p->flags = 0; p->num_packets++; p->total_normal_pages += p->normal_num; - p->pages->num = 0; - p->pages->block = NULL; qemu_mutex_unlock(&p->mutex); trace_multifd_send(p->id, packet_num, p->normal_num, flags, @@ -736,6 +744,8 @@ static void *multifd_send_thread(void *opaque) stat64_add(&mig_stats.multifd_bytes, p->next_packet_size + p->packet_len); + + multifd_pages_reset(p->pages); p->next_packet_size = 0; qemu_mutex_lock(&p->mutex); p->pending_job--; -- Gitee From 8bc0dc022b008d84848019b49121bf37e06d7283 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:39 +0800 Subject: [PATCH 034/134] migration/multifd: Drop MultiFDSendParams.normal[] array commit efd8c5439db7eaf00f35adc0fcc4f01d916e8619 upstream. This array is redundant when p->pages exists. Now we extended the life of p->pages to the whole period where pending_job is set, it should be safe to always use p->pages->offset[] rather than p->normal[]. Drop the array. Alongside, the normal_num is also redundant, which is the same to p->pages->num. This doesn't apply to recv side, because there's no extra buffering on recv side, so p->normal[] array is still needed. Intel-SIG: commit efd8c5439db7 migration/multifd: Drop MultiFDSendParams.normal[] array Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-6-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd-zlib.c | 7 ++++--- migration/multifd-zstd.c | 7 ++++--- migration/multifd.c | 33 +++++++++++++-------------------- migration/multifd.h | 4 ---- 4 files changed, 21 insertions(+), 30 deletions(-) diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c index 37ce48621e..100809abc1 100644 --- a/migration/multifd-zlib.c +++ b/migration/multifd-zlib.c @@ -116,17 +116,18 @@ static void zlib_send_cleanup(MultiFDSendParams *p, Error **errp) */ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) { + MultiFDPages_t *pages = p->pages; struct zlib_data *z = p->data; z_stream *zs = &z->zs; uint32_t out_size = 0; int ret; uint32_t i; - for (i = 0; i < p->normal_num; i++) { + for (i = 0; i < pages->num; i++) { uint32_t available = z->zbuff_len - out_size; int flush = Z_NO_FLUSH; - if (i == p->normal_num - 1) { + if (i == pages->num - 1) { flush = Z_SYNC_FLUSH; } @@ -135,7 +136,7 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) * with compression. zlib does not guarantee that this is safe, * therefore copy the page before calling deflate(). 
*/ - memcpy(z->buf, p->pages->block->host + p->normal[i], p->page_size); + memcpy(z->buf, p->pages->block->host + pages->offset[i], p->page_size); zs->avail_in = p->page_size; zs->next_in = z->buf; diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c index b471daadcd..2023edd8cc 100644 --- a/migration/multifd-zstd.c +++ b/migration/multifd-zstd.c @@ -113,6 +113,7 @@ static void zstd_send_cleanup(MultiFDSendParams *p, Error **errp) */ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) { + MultiFDPages_t *pages = p->pages; struct zstd_data *z = p->data; int ret; uint32_t i; @@ -121,13 +122,13 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) z->out.size = z->zbuff_len; z->out.pos = 0; - for (i = 0; i < p->normal_num; i++) { + for (i = 0; i < pages->num; i++) { ZSTD_EndDirective flush = ZSTD_e_continue; - if (i == p->normal_num - 1) { + if (i == pages->num - 1) { flush = ZSTD_e_flush; } - z->in.src = p->pages->block->host + p->normal[i]; + z->in.src = p->pages->block->host + pages->offset[i]; z->in.size = p->page_size; z->in.pos = 0; diff --git a/migration/multifd.c b/migration/multifd.c index 7de4aacb06..ee11fc5b4d 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -91,13 +91,13 @@ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp) { MultiFDPages_t *pages = p->pages; - for (int i = 0; i < p->normal_num; i++) { - p->iov[p->iovs_num].iov_base = pages->block->host + p->normal[i]; + for (int i = 0; i < pages->num; i++) { + p->iov[p->iovs_num].iov_base = pages->block->host + pages->offset[i]; p->iov[p->iovs_num].iov_len = p->page_size; p->iovs_num++; } - p->next_packet_size = p->normal_num * p->page_size; + p->next_packet_size = pages->num * p->page_size; p->flags |= MULTIFD_FLAG_NOCOMP; return 0; } @@ -270,21 +270,22 @@ static void multifd_pages_clear(MultiFDPages_t *pages) static void multifd_send_fill_packet(MultiFDSendParams *p) { MultiFDPacket_t *packet = p->packet; + MultiFDPages_t *pages = p->pages; int i; packet->flags = cpu_to_be32(p->flags); packet->pages_alloc = cpu_to_be32(p->pages->allocated); - packet->normal_pages = cpu_to_be32(p->normal_num); + packet->normal_pages = cpu_to_be32(pages->num); packet->next_packet_size = cpu_to_be32(p->next_packet_size); packet->packet_num = cpu_to_be64(p->packet_num); - if (p->pages->block) { - strncpy(packet->ramblock, p->pages->block->idstr, 256); + if (pages->block) { + strncpy(packet->ramblock, pages->block->idstr, 256); } - for (i = 0; i < p->normal_num; i++) { + for (i = 0; i < pages->num; i++) { /* there are architectures where ram_addr_t is 32 bit */ - uint64_t temp = p->normal[i]; + uint64_t temp = pages->offset[i]; packet->offset[i] = cpu_to_be64(temp); } @@ -571,8 +572,6 @@ void multifd_save_cleanup(void) p->packet = NULL; g_free(p->iov); p->iov = NULL; - g_free(p->normal); - p->normal = NULL; multifd_send_state->ops->send_cleanup(p, &local_err); if (local_err) { migrate_set_error(migrate_get_current(), local_err); @@ -692,8 +691,8 @@ static void *multifd_send_thread(void *opaque) if (p->pending_job) { uint64_t packet_num = p->packet_num; + MultiFDPages_t *pages = p->pages; uint32_t flags; - p->normal_num = 0; if (use_zero_copy_send) { p->iovs_num = 0; @@ -701,12 +700,7 @@ static void *multifd_send_thread(void *opaque) p->iovs_num = 1; } - for (int i = 0; i < p->pages->num; i++) { - p->normal[p->normal_num] = p->pages->offset[i]; - p->normal_num++; - } - - if (p->normal_num) { + if (pages->num) { ret = multifd_send_state->ops->send_prepare(p, &local_err); if (ret != 0) 
{ qemu_mutex_unlock(&p->mutex); @@ -717,10 +711,10 @@ static void *multifd_send_thread(void *opaque) flags = p->flags; p->flags = 0; p->num_packets++; - p->total_normal_pages += p->normal_num; + p->total_normal_pages += pages->num; qemu_mutex_unlock(&p->mutex); - trace_multifd_send(p->id, packet_num, p->normal_num, flags, + trace_multifd_send(p->id, packet_num, pages->num, flags, p->next_packet_size); if (use_zero_copy_send) { @@ -928,7 +922,6 @@ int multifd_save_setup(Error **errp) p->name = g_strdup_printf("multifdsend_%d", i); /* We need one extra place for the packet header */ p->iov = g_new0(struct iovec, page_count + 1); - p->normal = g_new0(ram_addr_t, page_count); p->page_size = qemu_target_page_size(); p->page_count = page_count; diff --git a/migration/multifd.h b/migration/multifd.h index 7c040cb85a..3920bdbcf1 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -122,10 +122,6 @@ typedef struct { struct iovec *iov; /* number of iovs used */ uint32_t iovs_num; - /* Pages that are not zero */ - ram_addr_t *normal; - /* num of non zero pages */ - uint32_t normal_num; /* used for compression methods */ void *data; } MultiFDSendParams; -- Gitee From a59f288867c1b624ccc0d1f8d94a98d4220a067e Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:40 +0800 Subject: [PATCH 035/134] migration/multifd: Separate SYNC request with normal jobs commit f5f48a7891cf6664a920ba52f6f4dea1646049a4 upstream. Multifd provide a threaded model for processing jobs. On sender side, there can be two kinds of job: (1) a list of pages to send, or (2) a sync request. The sync request is a very special kind of job. It never contains a page array, but only a multifd packet telling the dest side to synchronize with sent pages. Before this patch, both requests use the pending_job field, no matter what the request is, it will boost pending_job, while multifd sender thread will decrement it after it finishes one job. However this should be racy, because SYNC is special in that it needs to set p->flags with MULTIFD_FLAG_SYNC, showing that this is a sync request. Consider a sequence of operations where: - migration thread enqueue a job to send some pages, pending_job++ (0->1) - [...before the selected multifd sender thread wakes up...] - migration thread enqueue another job to sync, pending_job++ (1->2), setup p->flags=MULTIFD_FLAG_SYNC - multifd sender thread wakes up, found pending_job==2 - send the 1st packet with MULTIFD_FLAG_SYNC and list of pages - send the 2nd packet with flags==0 and no pages This is not expected, because MULTIFD_FLAG_SYNC should hopefully be done after all the pages are received. Meanwhile, the 2nd packet will be completely useless, which contains zero information. I didn't verify above, but I think this issue is still benign in that at least on the recv side we always receive pages before handling MULTIFD_FLAG_SYNC. However that's not always guaranteed and just tricky. One other reason I want to separate it is using p->flags to communicate between the two threads is also not clearly defined, it's very hard to read and understand why accessing p->flags is always safe; see the current impl of multifd_send_thread() where we tried to cache only p->flags. It doesn't need to be that complicated. This patch introduces pending_sync, a separate flag just to show that the requester needs a sync. Alongside, we remove the tricky caching of p->flags now because after this patch p->flags should only be used by multifd sender thread now, which will be crystal clear. 
So it is always thread safe to access p->flags. With that, we can also safely convert the pending_job into a boolean, because we don't support >1 pending jobs anyway. Always use atomic ops to access both flags to make sure no cache effect. When at it, drop the initial setting of "pending_job = 0" because it's always allocated using g_new0(). Intel-SIG: commit f5f48a7891cf migration/multifd: Separate SYNC request with normal jobs Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-7-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 39 +++++++++++++++++++++++++-------------- migration/multifd.h | 13 +++++++++++-- 2 files changed, 36 insertions(+), 16 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index ee11fc5b4d..03c138137e 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -443,8 +443,8 @@ static int multifd_send_pages(void) } p = &multifd_send_state->params[i]; qemu_mutex_lock(&p->mutex); - if (!p->pending_job) { - p->pending_job++; + if (qatomic_read(&p->pending_job) == false) { + qatomic_set(&p->pending_job, true); next_channel = (i + 1) % migrate_multifd_channels(); break; } @@ -632,8 +632,12 @@ int multifd_send_sync_main(void) qemu_mutex_lock(&p->mutex); p->packet_num = multifd_send_state->packet_num++; - p->flags |= MULTIFD_FLAG_SYNC; - p->pending_job++; + /* + * We should be the only user so far, so not possible to be set by + * others concurrently. + */ + assert(qatomic_read(&p->pending_sync) == false); + qatomic_set(&p->pending_sync, true); qemu_mutex_unlock(&p->mutex); qemu_sem_post(&p->sem); } @@ -689,10 +693,9 @@ static void *multifd_send_thread(void *opaque) } qemu_mutex_lock(&p->mutex); - if (p->pending_job) { + if (qatomic_read(&p->pending_job)) { uint64_t packet_num = p->packet_num; MultiFDPages_t *pages = p->pages; - uint32_t flags; if (use_zero_copy_send) { p->iovs_num = 0; @@ -708,13 +711,11 @@ static void *multifd_send_thread(void *opaque) } } multifd_send_fill_packet(p); - flags = p->flags; - p->flags = 0; p->num_packets++; p->total_normal_pages += pages->num; qemu_mutex_unlock(&p->mutex); - trace_multifd_send(p->id, packet_num, pages->num, flags, + trace_multifd_send(p->id, packet_num, pages->num, p->flags, p->next_packet_size); if (use_zero_copy_send) { @@ -742,12 +743,23 @@ static void *multifd_send_thread(void *opaque) multifd_pages_reset(p->pages); p->next_packet_size = 0; qemu_mutex_lock(&p->mutex); - p->pending_job--; + qatomic_set(&p->pending_job, false); qemu_mutex_unlock(&p->mutex); - - if (flags & MULTIFD_FLAG_SYNC) { - qemu_sem_post(&p->sem_sync); + } else if (qatomic_read(&p->pending_sync)) { + p->flags = MULTIFD_FLAG_SYNC; + multifd_send_fill_packet(p); + ret = qio_channel_write_all(p->c, (void *)p->packet, + p->packet_len, &local_err); + if (ret != 0) { + qemu_mutex_unlock(&p->mutex); + break; } + /* p->next_packet_size will always be zero for a SYNC packet */ + stat64_add(&mig_stats.multifd_bytes, p->packet_len); + p->flags = 0; + qatomic_set(&p->pending_sync, false); + qemu_mutex_unlock(&p->mutex); + qemu_sem_post(&p->sem_sync); } else { qemu_mutex_unlock(&p->mutex); /* sometimes there are spurious wakeups */ @@ -911,7 +923,6 @@ int multifd_save_setup(Error **errp) qemu_mutex_init(&p->mutex); qemu_sem_init(&p->sem, 0); qemu_sem_init(&p->sem_sync, 0); - p->pending_job = 0; p->id = i; p->pages = multifd_pages_init(page_count); p->packet_len = sizeof(MultiFDPacket_t) diff --git a/migration/multifd.h b/migration/multifd.h index 3920bdbcf1..08f26ef3fe 
100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -99,8 +99,17 @@ typedef struct { uint32_t flags; /* global number of generated multifd packets */ uint64_t packet_num; - /* thread has work to do */ - int pending_job; + /* + * The sender thread has work to do if either of below boolean is set. + * + * @pending_job: a job is pending + * @pending_sync: a sync request is pending + * + * For both of these fields, they're only set by the requesters, and + * cleared by the multifd sender threads. + */ + bool pending_job; + bool pending_sync; /* array of pages to sent. * The owner of 'pages' depends of 'pending_job' value: * pending_job == 0 -> migration_thread can use it. -- Gitee From d3c041421d253cc389858416384558af64ff9d63 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:41 +0800 Subject: [PATCH 036/134] migration/multifd: Simplify locking in sender thread commit e3cce9af10b06c51434ced4e1a6686f1ce43e124 upstream. The sender thread will yield the p->mutex before IO starts, trying to not block the requester thread. This may be unnecessary lock optimizations, because the requester can already read pending_job safely even without the lock, because the requester is currently the only one who can assign a task. Drop that lock complication on both sides: (1) in the sender thread, always take the mutex until job done (2) in the requester thread, check pending_job clear lockless Intel-SIG: commit e3cce9af10b0 migration/multifd: Simplify locking in sender thread Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-8-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 03c138137e..fc2e6592d6 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -430,7 +430,9 @@ static int multifd_send_pages(void) return -1; } + /* We wait here, until at least one channel is ready */ qemu_sem_wait(&multifd_send_state->channels_ready); + /* * next_channel can remain from a previous migration that was * using more channels, so ensure it doesn't overflow if the @@ -442,17 +444,26 @@ static int multifd_send_pages(void) return -1; } p = &multifd_send_state->params[i]; - qemu_mutex_lock(&p->mutex); + /* + * Lockless read to p->pending_job is safe, because only multifd + * sender thread can clear it. + */ if (qatomic_read(&p->pending_job) == false) { - qatomic_set(&p->pending_job, true); next_channel = (i + 1) % migrate_multifd_channels(); break; } - qemu_mutex_unlock(&p->mutex); } + + qemu_mutex_lock(&p->mutex); assert(!p->pages->num); assert(!p->pages->block); - + /* + * Double check on pending_job==false with the lock. In the future if + * we can have >1 requester thread, we can replace this with a "goto + * retry", but that is for later. 
+ */ + assert(qatomic_read(&p->pending_job) == false); + qatomic_set(&p->pending_job, true); p->packet_num = multifd_send_state->packet_num++; multifd_send_state->pages = p->pages; p->pages = pages; @@ -713,8 +724,6 @@ static void *multifd_send_thread(void *opaque) multifd_send_fill_packet(p); p->num_packets++; p->total_normal_pages += pages->num; - qemu_mutex_unlock(&p->mutex); - trace_multifd_send(p->id, packet_num, pages->num, p->flags, p->next_packet_size); @@ -734,6 +743,7 @@ static void *multifd_send_thread(void *opaque) ret = qio_channel_writev_full_all(p->c, p->iov, p->iovs_num, NULL, 0, p->write_flags, &local_err); if (ret != 0) { + qemu_mutex_unlock(&p->mutex); break; } @@ -742,7 +752,6 @@ static void *multifd_send_thread(void *opaque) multifd_pages_reset(p->pages); p->next_packet_size = 0; - qemu_mutex_lock(&p->mutex); qatomic_set(&p->pending_job, false); qemu_mutex_unlock(&p->mutex); } else if (qatomic_read(&p->pending_sync)) { -- Gitee From 1095a2d1c4862c30edd102ba0247aad9a7b09116 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:42 +0800 Subject: [PATCH 037/134] migration/multifd: Drop pages->num check in sender thread commit 83c560fb4249ee5698652249e0c1730c3d611a9b upstream. Now with a split SYNC handler, we always have pages->num set for pending_job==true. Assert it instead. Intel-SIG: commit 83c560fb4249 migration/multifd: Drop pages->num check in sender thread Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-9-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index fc2e6592d6..fe7f450597 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -714,13 +714,14 @@ static void *multifd_send_thread(void *opaque) p->iovs_num = 1; } - if (pages->num) { - ret = multifd_send_state->ops->send_prepare(p, &local_err); - if (ret != 0) { - qemu_mutex_unlock(&p->mutex); - break; - } + assert(pages->num); + + ret = multifd_send_state->ops->send_prepare(p, &local_err); + if (ret != 0) { + qemu_mutex_unlock(&p->mutex); + break; } + multifd_send_fill_packet(p); p->num_packets++; p->total_normal_pages += pages->num; -- Gitee From 7b180307931d3bb776211adb6d0b1bd0d35adb31 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:43 +0800 Subject: [PATCH 038/134] migration/multifd: Rename p->num_packets and clean it up commit 05b7ec1890158471afb8537a6817a7e0d0a6c938 upstream. This field, no matter whether on src or dest, is only used for debugging purpose. They can even be removed already, unless it still more or less provide some accounting on "how many packets are sent/recved for this thread". The other more important one is called packet_num, which is embeded in the multifd packet headers (MultiFDPacket_t). So let's keep them for now, but make them much easier to understand, by doing below: - Rename both of them to packets_sent / packets_recved, the old name (num_packets) are waaay too confusing when we already have MultiFDPacket_t.packets_num. - Avoid worrying on the "initial packet": we know we will send it, that's good enough. The accounting won't matter a great deal to start with 0 or with 1. - Move them to where we send/recv the packets. They're: - multifd_send_fill_packet() for senders. - multifd_recv_unfill_packet() for receivers. 
Intel-SIG: commit 05b7ec189015 migration/multifd: Rename p->num_packets and clean it up Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-10-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 13 +++++-------- migration/multifd.h | 6 +++--- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index fe7f450597..d1cdfc4d9c 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -289,6 +289,8 @@ static void multifd_send_fill_packet(MultiFDSendParams *p) packet->offset[i] = cpu_to_be64(temp); } + + p->packets_sent++; } static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) @@ -336,6 +338,7 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) p->next_packet_size = be32_to_cpu(packet->next_packet_size); p->packet_num = be64_to_cpu(packet->packet_num); + p->packets_recved++; if (p->normal_num == 0) { return 0; @@ -692,8 +695,6 @@ static void *multifd_send_thread(void *opaque) ret = -1; goto out; } - /* initial packet */ - p->num_packets = 1; while (true) { qemu_sem_post(&multifd_send_state->channels_ready); @@ -723,7 +724,6 @@ static void *multifd_send_thread(void *opaque) } multifd_send_fill_packet(p); - p->num_packets++; p->total_normal_pages += pages->num; trace_multifd_send(p->id, packet_num, pages->num, p->flags, p->next_packet_size); @@ -791,7 +791,7 @@ out: rcu_unregister_thread(); migration_threads_remove(thread); - trace_multifd_send_thread_end(p->id, p->num_packets, p->total_normal_pages); + trace_multifd_send_thread_end(p->id, p->packets_sent, p->total_normal_pages); return NULL; } @@ -1128,7 +1128,6 @@ static void *multifd_recv_thread(void *opaque) p->flags &= ~MULTIFD_FLAG_SYNC; trace_multifd_recv(p->id, p->packet_num, p->normal_num, flags, p->next_packet_size); - p->num_packets++; p->total_normal_pages += p->normal_num; qemu_mutex_unlock(&p->mutex); @@ -1154,7 +1153,7 @@ static void *multifd_recv_thread(void *opaque) qemu_mutex_unlock(&p->mutex); rcu_unregister_thread(); - trace_multifd_recv_thread_end(p->id, p->num_packets, p->total_normal_pages); + trace_multifd_recv_thread_end(p->id, p->packets_recved, p->total_normal_pages); return NULL; } @@ -1256,8 +1255,6 @@ void multifd_recv_new_channel(QIOChannel *ioc, Error **errp) } p->c = ioc; object_ref(OBJECT(ioc)); - /* initial packet */ - p->num_packets = 1; p->running = true; qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p, diff --git a/migration/multifd.h b/migration/multifd.h index 08f26ef3fe..2e4ad0dc56 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -124,7 +124,7 @@ typedef struct { /* size of the next packet that contains pages */ uint32_t next_packet_size; /* packets sent through this channel */ - uint64_t num_packets; + uint64_t packets_sent; /* non zero pages sent through this channel */ uint64_t total_normal_pages; /* buffers to send */ @@ -174,8 +174,8 @@ typedef struct { MultiFDPacket_t *packet; /* size of the next packet that contains pages */ uint32_t next_packet_size; - /* packets sent through this channel */ - uint64_t num_packets; + /* packets received through this channel */ + uint64_t packets_recved; /* ramblock */ RAMBlock *block; /* ramblock host address */ -- Gitee From a461cbb0288fc6e43fd7d86796d8d486925c241e Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:44 +0800 Subject: [PATCH 039/134] migration/multifd: Move total_normal_pages accounting commit db7e1cc5103137743394a939045a17fa2b30a0dc 
upstream. Just like the previous patch, move the accounting for total_normal_pages on both src/dst sides into the packet fill/unfill procedures. Intel-SIG: commit db7e1cc51031 migration/multifd: Move total_normal_pages accounting Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-11-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index d1cdfc4d9c..f15dfd2983 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -291,6 +291,7 @@ static void multifd_send_fill_packet(MultiFDSendParams *p) } p->packets_sent++; + p->total_normal_pages += pages->num; } static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) @@ -339,6 +340,7 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) p->next_packet_size = be32_to_cpu(packet->next_packet_size); p->packet_num = be64_to_cpu(packet->packet_num); p->packets_recved++; + p->total_normal_pages += p->normal_num; if (p->normal_num == 0) { return 0; @@ -724,7 +726,6 @@ static void *multifd_send_thread(void *opaque) } multifd_send_fill_packet(p); - p->total_normal_pages += pages->num; trace_multifd_send(p->id, packet_num, pages->num, p->flags, p->next_packet_size); @@ -1128,7 +1129,6 @@ static void *multifd_recv_thread(void *opaque) p->flags &= ~MULTIFD_FLAG_SYNC; trace_multifd_recv(p->id, p->packet_num, p->normal_num, flags, p->next_packet_size); - p->total_normal_pages += p->normal_num; qemu_mutex_unlock(&p->mutex); if (p->normal_num) { -- Gitee From b38b5ec0aa022dec05df7aa425777cf8f4fcb121 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:45 +0800 Subject: [PATCH 040/134] migration/multifd: Move trace_multifd_send|recv() commit 8a9ef1738037e1d1132f9e1bd3e2f1102bde719f upstream. Move them into fill/unfill of packets. With that, we can further cleanup the send/recv thread procedure, and remove one more temp var. 
Intel-SIG: commit 8a9ef1738037 migration/multifd: Move trace_multifd_send|recv() Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-12-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index f15dfd2983..dd94352f71 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -292,6 +292,9 @@ static void multifd_send_fill_packet(MultiFDSendParams *p) p->packets_sent++; p->total_normal_pages += pages->num; + + trace_multifd_send(p->id, p->packet_num, pages->num, p->flags, + p->next_packet_size); } static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) @@ -342,6 +345,9 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) p->packets_recved++; p->total_normal_pages += p->normal_num; + trace_multifd_recv(p->id, p->packet_num, p->normal_num, p->flags, + p->next_packet_size); + if (p->normal_num == 0) { return 0; } @@ -708,7 +714,6 @@ static void *multifd_send_thread(void *opaque) qemu_mutex_lock(&p->mutex); if (qatomic_read(&p->pending_job)) { - uint64_t packet_num = p->packet_num; MultiFDPages_t *pages = p->pages; if (use_zero_copy_send) { @@ -726,8 +731,6 @@ static void *multifd_send_thread(void *opaque) } multifd_send_fill_packet(p); - trace_multifd_send(p->id, packet_num, pages->num, p->flags, - p->next_packet_size); if (use_zero_copy_send) { /* Send header first, without zerocopy */ @@ -1127,8 +1130,6 @@ static void *multifd_recv_thread(void *opaque) flags = p->flags; /* recv methods don't know how to handle the SYNC flag */ p->flags &= ~MULTIFD_FLAG_SYNC; - trace_multifd_recv(p->id, p->packet_num, p->normal_num, flags, - p->next_packet_size); qemu_mutex_unlock(&p->mutex); if (p->normal_num) { -- Gitee From b0581dadff99f1594abf9cdfc06754ad2b2a5f25 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:46 +0800 Subject: [PATCH 041/134] migration/multifd: multifd_send_prepare_header() commit 452b205702335ddd45554aaf0eb37baf50bdfa00 upstream. Introduce a helper multifd_send_prepare_header() to setup the header packet for multifd sender. It's fine to setup the IOV[0] _before_ send_prepare() because the packet buffer is already ready, even if the content is to be filled in. With this helper, we can already slightly clean up the zero copy path. Note that I explicitly put it into multifd.h, because I want it inlined directly into multifd*.c where necessary later. Intel-SIG: commit 452b20570233 migration/multifd: multifd_send_prepare_header() Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-13-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 16 ++++++++-------- migration/multifd.h | 8 ++++++++ 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index dd94352f71..b9dda72257 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -716,10 +716,14 @@ static void *multifd_send_thread(void *opaque) if (qatomic_read(&p->pending_job)) { MultiFDPages_t *pages = p->pages; - if (use_zero_copy_send) { - p->iovs_num = 0; - } else { - p->iovs_num = 1; + p->iovs_num = 0; + + if (!use_zero_copy_send) { + /* + * Only !zerocopy needs the header in IOV; zerocopy will + * send it separately. 
+ */ + multifd_send_prepare_header(p); } assert(pages->num); @@ -739,10 +743,6 @@ static void *multifd_send_thread(void *opaque) if (ret != 0) { break; } - } else { - /* Send header using the same writev call */ - p->iov[0].iov_len = p->packet_len; - p->iov[0].iov_base = p->packet; } ret = qio_channel_writev_full_all(p->c, p->iov, p->iovs_num, NULL, diff --git a/migration/multifd.h b/migration/multifd.h index 2e4ad0dc56..4ec005f53f 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -209,5 +209,13 @@ typedef struct { void multifd_register_ops(int method, MultiFDMethods *ops); +static inline void multifd_send_prepare_header(MultiFDSendParams *p) +{ + p->iov[0].iov_len = p->packet_len; + p->iov[0].iov_base = p->packet; + p->iovs_num++; +} + + #endif -- Gitee From 8cf9e6ab5e2950bbc89957e1e61697963c317a2f Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:47 +0800 Subject: [PATCH 042/134] migration/multifd: Move header prepare/fill into send_prepare() commit 25a1f8787597f6906b151b2f73ae6cc92a31de57 upstream. This patch redefines the interfacing of ->send_prepare(). It further simplifies multifd_send_thread() especially on zero copy. Now with the new interface, we require the hook to do all the work for preparing the IOVs to send. After it's completed, the IOVs should be ready to be dumped into the specific multifd QIOChannel later. So now the API looks like: p->pages -----------> send_prepare() -------------> IOVs This also prepares for the case where the input can be extended to even not any p->pages. But that's for later. This patch will achieve similar goal of what Fabiano used to propose here: https://lore.kernel.org/r/20240126221943.26628-1-farosas@suse.de However the send() interface may not be necessary. I'm boldly attaching a "Co-developed-by" for Fabiano. 
Intel-SIG: commit 25a1f8787597 migration/multifd: Move header prepare/fill into send_prepare() Co-developed-by: Fabiano Rosas Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-14-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd-zlib.c | 4 +++ migration/multifd-zstd.c | 4 +++ migration/multifd.c | 61 ++++++++++++++++++---------------------- migration/multifd.h | 1 + 4 files changed, 37 insertions(+), 33 deletions(-) diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c index 100809abc1..012e3bdea1 100644 --- a/migration/multifd-zlib.c +++ b/migration/multifd-zlib.c @@ -123,6 +123,8 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) int ret; uint32_t i; + multifd_send_prepare_header(p); + for (i = 0; i < pages->num; i++) { uint32_t available = z->zbuff_len - out_size; int flush = Z_NO_FLUSH; @@ -172,6 +174,8 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) p->next_packet_size = out_size; p->flags |= MULTIFD_FLAG_ZLIB; + multifd_send_fill_packet(p); + return 0; } diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c index 2023edd8cc..dc8fe43e94 100644 --- a/migration/multifd-zstd.c +++ b/migration/multifd-zstd.c @@ -118,6 +118,8 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) int ret; uint32_t i; + multifd_send_prepare_header(p); + z->out.dst = z->zbuff; z->out.size = z->zbuff_len; z->out.pos = 0; @@ -161,6 +163,8 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) p->next_packet_size = z->out.pos; p->flags |= MULTIFD_FLAG_ZSTD; + multifd_send_fill_packet(p); + return 0; } diff --git a/migration/multifd.c b/migration/multifd.c index b9dda72257..b1fe703712 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -51,15 +51,15 @@ typedef struct { /** * nocomp_send_setup: setup send side * - * For no compression this function does nothing. - * - * Returns 0 for success or -1 for error - * * @p: Params for the channel that we are using * @errp: pointer to an error */ static int nocomp_send_setup(MultiFDSendParams *p, Error **errp) { + if (migrate_zero_copy_send()) { + p->write_flags |= QIO_CHANNEL_WRITE_FLAG_ZERO_COPY; + } + return 0; } @@ -89,7 +89,17 @@ static void nocomp_send_cleanup(MultiFDSendParams *p, Error **errp) */ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp) { + bool use_zero_copy_send = migrate_zero_copy_send(); MultiFDPages_t *pages = p->pages; + int ret; + + if (!use_zero_copy_send) { + /* + * Only !zerocopy needs the header in IOV; zerocopy will + * send it separately. 
+ */ + multifd_send_prepare_header(p); + } for (int i = 0; i < pages->num; i++) { p->iov[p->iovs_num].iov_base = pages->block->host + pages->offset[i]; @@ -99,6 +109,18 @@ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp) p->next_packet_size = pages->num * p->page_size; p->flags |= MULTIFD_FLAG_NOCOMP; + + multifd_send_fill_packet(p); + + if (use_zero_copy_send) { + /* Send header first, without zerocopy */ + ret = qio_channel_write_all(p->c, (void *)p->packet, + p->packet_len, errp); + if (ret != 0) { + return -1; + } + } + return 0; } @@ -267,7 +289,7 @@ static void multifd_pages_clear(MultiFDPages_t *pages) g_free(pages); } -static void multifd_send_fill_packet(MultiFDSendParams *p) +void multifd_send_fill_packet(MultiFDSendParams *p) { MultiFDPacket_t *packet = p->packet; MultiFDPages_t *pages = p->pages; @@ -689,7 +711,6 @@ static void *multifd_send_thread(void *opaque) MigrationThread *thread = NULL; Error *local_err = NULL; int ret = 0; - bool use_zero_copy_send = migrate_zero_copy_send(); thread = migration_threads_add(p->name, qemu_get_thread_id()); @@ -717,15 +738,6 @@ static void *multifd_send_thread(void *opaque) MultiFDPages_t *pages = p->pages; p->iovs_num = 0; - - if (!use_zero_copy_send) { - /* - * Only !zerocopy needs the header in IOV; zerocopy will - * send it separately. - */ - multifd_send_prepare_header(p); - } - assert(pages->num); ret = multifd_send_state->ops->send_prepare(p, &local_err); @@ -734,17 +746,6 @@ static void *multifd_send_thread(void *opaque) break; } - multifd_send_fill_packet(p); - - if (use_zero_copy_send) { - /* Send header first, without zerocopy */ - ret = qio_channel_write_all(p->c, (void *)p->packet, - p->packet_len, &local_err); - if (ret != 0) { - break; - } - } - ret = qio_channel_writev_full_all(p->c, p->iov, p->iovs_num, NULL, 0, p->write_flags, &local_err); if (ret != 0) { @@ -949,13 +950,7 @@ int multifd_save_setup(Error **errp) p->iov = g_new0(struct iovec, page_count + 1); p->page_size = qemu_target_page_size(); p->page_count = page_count; - - if (migrate_zero_copy_send()) { - p->write_flags = QIO_CHANNEL_WRITE_FLAG_ZERO_COPY; - } else { - p->write_flags = 0; - } - + p->write_flags = 0; multifd_new_send_channel_create(p); } diff --git a/migration/multifd.h b/migration/multifd.h index 4ec005f53f..34a2ecb9f4 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -208,6 +208,7 @@ typedef struct { } MultiFDMethods; void multifd_register_ops(int method, MultiFDMethods *ops); +void multifd_send_fill_packet(MultiFDSendParams *p); static inline void multifd_send_prepare_header(MultiFDSendParams *p) { -- Gitee From be8ad2f45441caca361ae36fb7c976e661647217 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:48 +0800 Subject: [PATCH 043/134] migration/multifd: Forbid spurious wakeups commit 859ebaf346e8b5dece6cf255c604fe953d8ec9ab upstream. Now multifd's logic is designed to have no spurious wakeup. I still remember a talk to Juan and he seems to agree we should drop it now, and if my memory was right it was there because multifd used to hit that when still debugging. Let's drop it and see what can explode; as long as it's not reaching soft-freeze. 
Intel-SIG: commit 859ebaf346e8 migration/multifd: Forbid spurious wakeups Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-15-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index b1fe703712..3c5a57197b 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -760,7 +760,9 @@ static void *multifd_send_thread(void *opaque) p->next_packet_size = 0; qatomic_set(&p->pending_job, false); qemu_mutex_unlock(&p->mutex); - } else if (qatomic_read(&p->pending_sync)) { + } else { + /* If not a normal job, must be a sync request */ + assert(qatomic_read(&p->pending_sync)); p->flags = MULTIFD_FLAG_SYNC; multifd_send_fill_packet(p); ret = qio_channel_write_all(p->c, (void *)p->packet, @@ -775,9 +777,6 @@ static void *multifd_send_thread(void *opaque) qatomic_set(&p->pending_sync, false); qemu_mutex_unlock(&p->mutex); qemu_sem_post(&p->sem_sync); - } else { - qemu_mutex_unlock(&p->mutex); - /* sometimes there are spurious wakeups */ } } -- Gitee From de0e86733b313a146e65f02152c4cdd36428a587 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:49 +0800 Subject: [PATCH 044/134] migration/multifd: Split multifd_send_terminate_threads() commit 3ab4441d97af59ea09ee015d68c4770704b2b34f upstream. Split multifd_send_terminate_threads() into two functions: - multifd_send_set_error(): used when an error happened on the sender side, set error and quit state only - multifd_send_terminate_threads(): used only by the main thread to kick all multifd send threads out of sleep, for the last recycling. Use multifd_send_set_error() in the three old call sites where only the error will be set. Use multifd_send_terminate_threads() in the last one where the main thread will kick the multifd threads at last in multifd_save_cleanup(). Both helpers will need to set quitting=1. Intel-SIG: commit 3ab4441d97af migration/multifd: Split multifd_send_terminate_threads() Suggested-by: Fabiano Rosas Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-16-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 27 ++++++++++++++++++--------- migration/trace-events | 2 +- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 3c5a57197b..cf0938856b 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -537,10 +537,9 @@ int multifd_queue_page(RAMBlock *block, ram_addr_t offset) return 1; } -static void multifd_send_terminate_threads(Error *err) +/* Multifd send side hit an error; remember it and prepare to quit */ +static void multifd_send_set_error(Error *err) { - int i; - /* * We don't want to exit each threads twice. Depending on where * we get the error, or if there are two independent errors in two @@ -551,8 +550,6 @@ static void multifd_send_terminate_threads(Error *err) return; } - trace_multifd_send_terminate_threads(err != NULL); - if (err) { MigrationState *s = migrate_get_current(); migrate_set_error(s, err); @@ -564,7 +561,19 @@ static void multifd_send_terminate_threads(Error *err) MIGRATION_STATUS_FAILED); } } +} + +static void multifd_send_terminate_threads(void) +{ + int i; + + trace_multifd_send_terminate_threads(); + /* + * Tell everyone we're quitting. No xchg() needed here; we simply + * always set it. 
+ */ + qatomic_set(&multifd_send_state->exiting, 1); for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; @@ -587,7 +596,7 @@ void multifd_save_cleanup(void) if (!migrate_multifd()) { return; } - multifd_send_terminate_threads(NULL); + multifd_send_terminate_threads(); for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; @@ -784,7 +793,7 @@ out: if (ret) { assert(local_err); trace_multifd_send_error(p->id); - multifd_send_terminate_threads(local_err); + multifd_send_set_error(local_err); multifd_send_kick_main(p); error_free(local_err); } @@ -820,7 +829,7 @@ static void multifd_tls_outgoing_handshake(QIOTask *task, trace_multifd_tls_outgoing_handshake_error(ioc, error_get_pretty(err)); - multifd_send_terminate_threads(err); + multifd_send_set_error(err); multifd_send_kick_main(p); error_free(err); } @@ -902,7 +911,7 @@ static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque) } trace_multifd_new_send_channel_async_error(p->id, local_err); - multifd_send_terminate_threads(local_err); + multifd_send_set_error(local_err); multifd_send_kick_main(p); object_unref(OBJECT(ioc)); error_free(local_err); diff --git a/migration/trace-events b/migration/trace-events index de4a743c8a..298ad2b0dd 100644 --- a/migration/trace-events +++ b/migration/trace-events @@ -141,7 +141,7 @@ multifd_send_error(uint8_t id) "channel %u" multifd_send_sync_main(long packet_num) "packet num %ld" multifd_send_sync_main_signal(uint8_t id) "channel %u" multifd_send_sync_main_wait(uint8_t id) "channel %u" -multifd_send_terminate_threads(bool error) "error %d" +multifd_send_terminate_threads(void) "" multifd_send_thread_end(uint8_t id, uint64_t packets, uint64_t normal_pages) "channel %u packets %" PRIu64 " normal pages %" PRIu64 multifd_send_thread_start(uint8_t id) "%u" multifd_tls_outgoing_handshake_start(void *ioc, void *tioc, const char *hostname) "ioc=%p tioc=%p hostname=%s" -- Gitee From 06d84d66328b9fa8c5b02062b3940ef6d417a530 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:50 +0800 Subject: [PATCH 045/134] migration/multifd: Change retval of multifd_queue_page() commit d6556d174a6b9fc443f2320193f18e71eb67052a upstream. Using int is an overkill when there're only two options. Change it to a boolean. 
Intel-SIG: commit d6556d174a6b migration/multifd: Change retval of multifd_queue_page() Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-17-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 9 +++++---- migration/multifd.h | 2 +- migration/ram.c | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index cf0938856b..cff3eaa389 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -506,7 +506,8 @@ static int multifd_send_pages(void) return 1; } -int multifd_queue_page(RAMBlock *block, ram_addr_t offset) +/* Returns true if enqueue successful, false otherwise */ +bool multifd_queue_page(RAMBlock *block, ram_addr_t offset) { MultiFDPages_t *pages = multifd_send_state->pages; bool changed = false; @@ -520,21 +521,21 @@ int multifd_queue_page(RAMBlock *block, ram_addr_t offset) pages->num++; if (pages->num < pages->allocated) { - return 1; + return true; } } else { changed = true; } if (multifd_send_pages() < 0) { - return -1; + return false; } if (changed) { return multifd_queue_page(block, offset); } - return 1; + return true; } /* Multifd send side hit an error; remember it and prepare to quit */ diff --git a/migration/multifd.h b/migration/multifd.h index 34a2ecb9f4..a320c53a6f 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -22,7 +22,7 @@ bool multifd_recv_all_channels_created(void); void multifd_recv_new_channel(QIOChannel *ioc, Error **errp); void multifd_recv_sync_main(void); int multifd_send_sync_main(void); -int multifd_queue_page(RAMBlock *block, ram_addr_t offset); +bool multifd_queue_page(RAMBlock *block, ram_addr_t offset); /* Multifd Compression flags */ #define MULTIFD_FLAG_SYNC (1 << 0) diff --git a/migration/ram.c b/migration/ram.c index 9ad6e55f13..f642cd68c6 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -1388,7 +1388,7 @@ static int ram_save_page(RAMState *rs, PageSearchStatus *pss) static int ram_save_multifd_page(RAMBlock *block, ram_addr_t offset) { - if (multifd_queue_page(block, offset) < 0) { + if (!multifd_queue_page(block, offset)) { return -1; } stat64_add(&mig_stats.normal_pages, 1); -- Gitee From 3266ea3ee4d0f0a24460295f51b8f03d684e7751 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:51 +0800 Subject: [PATCH 046/134] migration/multifd: Change retval of multifd_send_pages() commit 3b40964a863d69121733c8b9794a02347ed0000b upstream. Using int is an overkill when there're only two options. Change it to a boolean. Intel-SIG: commit 3b40964a863d migration/multifd: Change retval of multifd_send_pages() Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-18-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index cff3eaa389..fdeae75aea 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -450,9 +450,10 @@ static void multifd_send_kick_main(MultiFDSendParams *p) * thread is using the channel mutex when changing it, and the channel * have to had finish with its own, otherwise pending_job can't be * false. + * + * Returns true if succeed, false otherwise. 
*/ - -static int multifd_send_pages(void) +static bool multifd_send_pages(void) { int i; static int next_channel; @@ -460,7 +461,7 @@ static int multifd_send_pages(void) MultiFDPages_t *pages = multifd_send_state->pages; if (multifd_send_should_exit()) { - return -1; + return false; } /* We wait here, until at least one channel is ready */ @@ -474,7 +475,7 @@ static int multifd_send_pages(void) next_channel %= migrate_multifd_channels(); for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) { if (multifd_send_should_exit()) { - return -1; + return false; } p = &multifd_send_state->params[i]; /* @@ -503,7 +504,7 @@ static int multifd_send_pages(void) qemu_mutex_unlock(&p->mutex); qemu_sem_post(&p->sem); - return 1; + return true; } /* Returns true if enqueue successful, false otherwise */ @@ -527,7 +528,7 @@ bool multifd_queue_page(RAMBlock *block, ram_addr_t offset) changed = true; } - if (multifd_send_pages() < 0) { + if (!multifd_send_pages()) { return false; } @@ -667,7 +668,7 @@ int multifd_send_sync_main(void) return 0; } if (multifd_send_state->pages->num) { - if (multifd_send_pages() < 0) { + if (!multifd_send_pages()) { error_report("%s: multifd_send_pages fail", __func__); return -1; } -- Gitee From 2d4d1654e7230c204e6731302209e28a3630bf92 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:52 +0800 Subject: [PATCH 047/134] migration/multifd: Rewrite multifd_queue_page() commit f88f86c4ee3fe673b34873e27af2de0a16fe01fd upstream. The current multifd_queue_page() is not easy to read and follow. It is not good with a few reasons: - No helper at all to show what exactly does a condition mean; in short, readability is low. - Rely on pages->ramblock being cleared to detect an empty queue. It's slightly an overload of the ramblock pointer, per Fabiano [1], which I also agree. - Contains a self recursion, even if not necessary.. Rewrite this function. We add some comments to make it even clearer on what it does. 
[1] https://lore.kernel.org/r/87wmrpjzew.fsf@suse.de Intel-SIG: commit f88f86c4ee3f migration/multifd: Rewrite multifd_queue_page() Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-19-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 56 ++++++++++++++++++++++++++++++--------------- 1 file changed, 37 insertions(+), 19 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index fdeae75aea..2094f36eff 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -507,35 +507,53 @@ static bool multifd_send_pages(void) return true; } +static inline bool multifd_queue_empty(MultiFDPages_t *pages) +{ + return pages->num == 0; +} + +static inline bool multifd_queue_full(MultiFDPages_t *pages) +{ + return pages->num == pages->allocated; +} + +static inline void multifd_enqueue(MultiFDPages_t *pages, ram_addr_t offset) +{ + pages->offset[pages->num++] = offset; +} + /* Returns true if enqueue successful, false otherwise */ bool multifd_queue_page(RAMBlock *block, ram_addr_t offset) { - MultiFDPages_t *pages = multifd_send_state->pages; - bool changed = false; + MultiFDPages_t *pages; + +retry: + pages = multifd_send_state->pages; - if (!pages->block) { + /* If the queue is empty, we can already enqueue now */ + if (multifd_queue_empty(pages)) { pages->block = block; + multifd_enqueue(pages, offset); + return true; } - if (pages->block == block) { - pages->offset[pages->num] = offset; - pages->num++; - - if (pages->num < pages->allocated) { - return true; + /* + * Not empty, meanwhile we need a flush. It can because of either: + * + * (1) The page is not on the same ramblock of previous ones, or, + * (2) The queue is full. + * + * After flush, always retry. + */ + if (pages->block != block || multifd_queue_full(pages)) { + if (!multifd_send_pages()) { + return false; } - } else { - changed = true; - } - - if (!multifd_send_pages()) { - return false; - } - - if (changed) { - return multifd_queue_page(block, offset); + goto retry; } + /* Not empty, and we still have space, do it! */ + multifd_enqueue(pages, offset); return true; } -- Gitee From 6266655d5451b4fbd1a558b943521267cd38fa80 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:53 +0800 Subject: [PATCH 048/134] migration/multifd: Cleanup multifd_save_cleanup() commit 12808db3b8c22d26c9bc3da6f41756890ce882e4 upstream. Shrink the function by moving relevant works into helpers: move the thread join()s into multifd_send_terminate_threads(), then create two more helpers to cover channel/state cleanups. Add a TODO entry for the thread terminate process because p->running is still buggy. We need to fix it at some point but not yet covered. Intel-SIG: commit 12808db3b8c2 migration/multifd: Cleanup multifd_save_cleanup() Suggested-by: Fabiano Rosas Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-20-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 91 +++++++++++++++++++++++++++++---------------- 1 file changed, 59 insertions(+), 32 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 2094f36eff..fcba2e8207 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -594,6 +594,11 @@ static void multifd_send_terminate_threads(void) * always set it. */ qatomic_set(&multifd_send_state->exiting, 1); + + /* + * Firstly, kick all threads out; no matter whether they are just idle, + * or blocked in an IO system call. 
+ */ for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; @@ -602,6 +607,21 @@ static void multifd_send_terminate_threads(void) qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL); } } + + /* + * Finally recycle all the threads. + * + * TODO: p->running is still buggy, e.g. we can reach here without the + * corresponding multifd_new_send_channel_async() get invoked yet, + * then a new thread can even be created after this function returns. + */ + for (i = 0; i < migrate_multifd_channels(); i++) { + MultiFDSendParams *p = &multifd_send_state->params[i]; + + if (p->running) { + qemu_thread_join(&p->thread); + } + } } static int multifd_send_channel_destroy(QIOChannel *send) @@ -609,6 +629,41 @@ static int multifd_send_channel_destroy(QIOChannel *send) return socket_send_channel_destroy(send); } +static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) +{ + if (p->registered_yank) { + migration_ioc_unregister_yank(p->c); + } + multifd_send_channel_destroy(p->c); + p->c = NULL; + qemu_mutex_destroy(&p->mutex); + qemu_sem_destroy(&p->sem); + qemu_sem_destroy(&p->sem_sync); + g_free(p->name); + p->name = NULL; + multifd_pages_clear(p->pages); + p->pages = NULL; + p->packet_len = 0; + g_free(p->packet); + p->packet = NULL; + g_free(p->iov); + p->iov = NULL; + multifd_send_state->ops->send_cleanup(p, errp); + + return *errp == NULL; +} + +static void multifd_send_cleanup_state(void) +{ + qemu_sem_destroy(&multifd_send_state->channels_ready); + g_free(multifd_send_state->params); + multifd_send_state->params = NULL; + multifd_pages_clear(multifd_send_state->pages); + multifd_send_state->pages = NULL; + g_free(multifd_send_state); + multifd_send_state = NULL; +} + void multifd_save_cleanup(void) { int i; @@ -616,48 +671,20 @@ void multifd_save_cleanup(void) if (!migrate_multifd()) { return; } + multifd_send_terminate_threads(); - for (i = 0; i < migrate_multifd_channels(); i++) { - MultiFDSendParams *p = &multifd_send_state->params[i]; - if (p->running) { - qemu_thread_join(&p->thread); - } - } for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; Error *local_err = NULL; - if (p->registered_yank) { - migration_ioc_unregister_yank(p->c); - } - multifd_send_channel_destroy(p->c); - p->c = NULL; - qemu_mutex_destroy(&p->mutex); - qemu_sem_destroy(&p->sem); - qemu_sem_destroy(&p->sem_sync); - g_free(p->name); - p->name = NULL; - multifd_pages_clear(p->pages); - p->pages = NULL; - p->packet_len = 0; - g_free(p->packet); - p->packet = NULL; - g_free(p->iov); - p->iov = NULL; - multifd_send_state->ops->send_cleanup(p, &local_err); - if (local_err) { + if (!multifd_send_cleanup_channel(p, &local_err)) { migrate_set_error(migrate_get_current(), local_err); error_free(local_err); } } - qemu_sem_destroy(&multifd_send_state->channels_ready); - g_free(multifd_send_state->params); - multifd_send_state->params = NULL; - multifd_pages_clear(multifd_send_state->pages); - multifd_send_state->pages = NULL; - g_free(multifd_send_state); - multifd_send_state = NULL; + + multifd_send_cleanup_state(); } static int multifd_zero_copy_flush(QIOChannel *c) -- Gitee From d26d8e8fbbb4ccda90bd5a837abf771166e5b3b9 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:54 +0800 Subject: [PATCH 049/134] migration/multifd: Cleanup multifd_load_cleanup() commit 5e6ea8a1d64e72e648b5a5277f08ec7fb09c3b8e upstream. Use similar logic to cleanup the recv side. 
Note that multifd_recv_terminate_threads() may need some similar rework like the sender side, but let's leave that for later. Intel-SIG: commit 5e6ea8a1d64e migration/multifd: Cleanup multifd_load_cleanup() Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-21-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 52 ++++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index fcba2e8207..7f17734a75 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -1074,6 +1074,34 @@ void multifd_load_shutdown(void) } } +static void multifd_recv_cleanup_channel(MultiFDRecvParams *p) +{ + migration_ioc_unregister_yank(p->c); + object_unref(OBJECT(p->c)); + p->c = NULL; + qemu_mutex_destroy(&p->mutex); + qemu_sem_destroy(&p->sem_sync); + g_free(p->name); + p->name = NULL; + p->packet_len = 0; + g_free(p->packet); + p->packet = NULL; + g_free(p->iov); + p->iov = NULL; + g_free(p->normal); + p->normal = NULL; + multifd_recv_state->ops->recv_cleanup(p); +} + +static void multifd_recv_cleanup_state(void) +{ + qemu_sem_destroy(&multifd_recv_state->sem_sync); + g_free(multifd_recv_state->params); + multifd_recv_state->params = NULL; + g_free(multifd_recv_state); + multifd_recv_state = NULL; +} + void multifd_load_cleanup(void) { int i; @@ -1096,29 +1124,9 @@ void multifd_load_cleanup(void) qemu_thread_join(&p->thread); } for (i = 0; i < migrate_multifd_channels(); i++) { - MultiFDRecvParams *p = &multifd_recv_state->params[i]; - - migration_ioc_unregister_yank(p->c); - object_unref(OBJECT(p->c)); - p->c = NULL; - qemu_mutex_destroy(&p->mutex); - qemu_sem_destroy(&p->sem_sync); - g_free(p->name); - p->name = NULL; - p->packet_len = 0; - g_free(p->packet); - p->packet = NULL; - g_free(p->iov); - p->iov = NULL; - g_free(p->normal); - p->normal = NULL; - multifd_recv_state->ops->recv_cleanup(p); + multifd_recv_cleanup_channel(&multifd_recv_state->params[i]); } - qemu_sem_destroy(&multifd_recv_state->sem_sync); - g_free(multifd_recv_state->params); - multifd_recv_state->params = NULL; - g_free(multifd_recv_state); - multifd_recv_state = NULL; + multifd_recv_cleanup_state(); } void multifd_recv_sync_main(void) -- Gitee From 7dea18d7d17a0dde5f59e883d43c0aea22e3b278 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:55 +0800 Subject: [PATCH 050/134] migration/multifd: Stick with send/recv on function names commit cde85c37ca54e4a2dbee8653181938499887f6be upstream. Most of the multifd code uses send/recv to represent the two sides, but some rare cases use save/load. Since send/recv is the majority, replacing the save/load use cases to use send/recv globally. Now we reach a consensus on the naming. 
Intel-SIG: commit cde85c37ca54 migration/multifd: Stick with send/recv on function names Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-22-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/migration.c | 12 ++++++------ migration/multifd.c | 10 +++++----- migration/multifd.h | 10 +++++----- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index a4db2615cc..77429c563f 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -269,7 +269,7 @@ void migration_incoming_state_destroy(void) { struct MigrationIncomingState *mis = migration_incoming_get_current(); - multifd_load_cleanup(); + multifd_recv_cleanup(); compress_threads_load_cleanup(); if (mis->to_src_file) { @@ -622,7 +622,7 @@ static void process_incoming_migration_bh(void *opaque) trace_vmstate_downtime_checkpoint("dst-precopy-bh-announced"); - multifd_load_shutdown(); + multifd_recv_shutdown(); dirty_bitmap_mig_before_vm_start(); @@ -721,7 +721,7 @@ fail: MIGRATION_STATUS_FAILED); qemu_fclose(mis->from_src_file); - multifd_load_cleanup(); + multifd_recv_cleanup(); compress_threads_load_cleanup(); exit(EXIT_FAILURE); @@ -847,7 +847,7 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp) default_channel = !mis->from_src_file; } - if (multifd_load_setup(errp) != 0) { + if (multifd_recv_setup(errp) != 0) { return; } @@ -1296,7 +1296,7 @@ static void migrate_fd_cleanup(MigrationState *s) } qemu_mutex_lock_iothread(); - multifd_save_cleanup(); + multifd_send_shutdown(); qemu_mutex_lock(&s->qemu_file_lock); tmp = s->to_dst_file; s->to_dst_file = NULL; @@ -3624,7 +3624,7 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) return; } - if (multifd_save_setup(&local_err) != 0) { + if (multifd_send_setup(&local_err) != 0) { migrate_set_error(s, local_err); error_report_err(local_err); migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, diff --git a/migration/multifd.c b/migration/multifd.c index 7f17734a75..bb4fb17544 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -664,7 +664,7 @@ static void multifd_send_cleanup_state(void) multifd_send_state = NULL; } -void multifd_save_cleanup(void) +void multifd_send_shutdown(void) { int i; @@ -969,7 +969,7 @@ static void multifd_new_send_channel_create(gpointer opaque) socket_send_channel_create(multifd_new_send_channel_async, opaque); } -int multifd_save_setup(Error **errp) +int multifd_send_setup(Error **errp) { int thread_count; uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size(); @@ -1067,7 +1067,7 @@ static void multifd_recv_terminate_threads(Error *err) } } -void multifd_load_shutdown(void) +void multifd_recv_shutdown(void) { if (migrate_multifd()) { multifd_recv_terminate_threads(NULL); @@ -1102,7 +1102,7 @@ static void multifd_recv_cleanup_state(void) multifd_recv_state = NULL; } -void multifd_load_cleanup(void) +void multifd_recv_cleanup(void) { int i; @@ -1217,7 +1217,7 @@ static void *multifd_recv_thread(void *opaque) return NULL; } -int multifd_load_setup(Error **errp) +int multifd_recv_setup(Error **errp) { int thread_count; uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size(); diff --git a/migration/multifd.h b/migration/multifd.h index a320c53a6f..9b40a53cb6 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -13,11 +13,11 @@ #ifndef QEMU_MIGRATION_MULTIFD_H #define QEMU_MIGRATION_MULTIFD_H -int multifd_save_setup(Error **errp); -void multifd_save_cleanup(void); -int 
multifd_load_setup(Error **errp); -void multifd_load_cleanup(void); -void multifd_load_shutdown(void); +int multifd_send_setup(Error **errp); +void multifd_send_shutdown(void); +int multifd_recv_setup(Error **errp); +void multifd_recv_cleanup(void); +void multifd_recv_shutdown(void); bool multifd_recv_all_channels_created(void); void multifd_recv_new_channel(QIOChannel *ioc, Error **errp); void multifd_recv_sync_main(void); -- Gitee From 24a27ec25fe38ae82614a59989b3f38d329612c5 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:56 +0800 Subject: [PATCH 051/134] migration/multifd: Fix MultiFDSendParams.packet_num race commit 98ea497d8b8a5076be7b6ceb0dcc4a475373eb76 upstream. As reported correctly by Fabiano [1] (while per Fabiano, it sourced back to Elena's initial report in Oct 2023), MultiFDSendParams.packet_num is buggy to be assigned and stored. Consider two consequent operations of: (1) queue a job into multifd send thread X, then (2) queue another sync request to the same send thread X. Then the MultiFDSendParams.packet_num will be assigned twice, and the first assignment can get lost already. To avoid that, we move the packet_num assignment from p->packet_num into where the thread will fill in the packet. Use atomic operations to protect the field, making sure there's no race. Note that atomic fetch_add() may not be good for scaling purposes, however multifd should be fine as number of threads should normally not go beyond 16 threads. Let's leave that concern for later but fix the issue first. There's also a trick on how to make it always work even on 32 bit hosts for uint64_t packet number. Switching to uintptr_t as of now to simply the case. It will cause packet number to overflow easier on 32 bit, but that shouldn't be a major concern for now as 32 bit systems is not the major audience for any performance concerns like what multifd wants to address. We also need to move multifd_send_state definition upper, so that multifd_send_fill_packet() can reference it. [1] https://lore.kernel.org/r/87o7d1jlu5.fsf@suse.de Intel-SIG: commit 98ea497d8b8a migration/multifd: Fix MultiFDSendParams.packet_num race Reported-by: Elena Ufimtseva Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-23-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 56 +++++++++++++++++++++++++++------------------ migration/multifd.h | 2 -- 2 files changed, 34 insertions(+), 24 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index bb4fb17544..3cbd4865f6 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -46,6 +46,35 @@ typedef struct { uint64_t unused2[4]; /* Reserved for future use */ } __attribute__((packed)) MultiFDInit_t; +struct { + MultiFDSendParams *params; + /* array of pages to sent */ + MultiFDPages_t *pages; + /* + * Global number of generated multifd packets. + * + * Note that we used 'uintptr_t' because it'll naturally support atomic + * operations on both 32bit / 64 bits hosts. It means on 32bit systems + * multifd will overflow the packet_num easier, but that should be + * fine. + * + * Another option is to use QEMU's Stat64 then it'll be 64 bits on all + * hosts, however so far it does not support atomic fetch_add() yet. + * Make it easy for now. + */ + uintptr_t packet_num; + /* send channels ready */ + QemuSemaphore channels_ready; + /* + * Have we already run terminate threads. There is a race when it + * happens that we got one error while we are exiting. 
+ * We will use atomic operations. Only valid values are 0 and 1. + */ + int exiting; + /* multifd ops */ + MultiFDMethods *ops; +} *multifd_send_state; + /* Multifd without compression */ /** @@ -293,13 +322,16 @@ void multifd_send_fill_packet(MultiFDSendParams *p) { MultiFDPacket_t *packet = p->packet; MultiFDPages_t *pages = p->pages; + uint64_t packet_num; int i; packet->flags = cpu_to_be32(p->flags); packet->pages_alloc = cpu_to_be32(p->pages->allocated); packet->normal_pages = cpu_to_be32(pages->num); packet->next_packet_size = cpu_to_be32(p->next_packet_size); - packet->packet_num = cpu_to_be64(p->packet_num); + + packet_num = qatomic_fetch_inc(&multifd_send_state->packet_num); + packet->packet_num = cpu_to_be64(packet_num); if (pages->block) { strncpy(packet->ramblock, pages->block->idstr, 256); @@ -315,7 +347,7 @@ void multifd_send_fill_packet(MultiFDSendParams *p) p->packets_sent++; p->total_normal_pages += pages->num; - trace_multifd_send(p->id, p->packet_num, pages->num, p->flags, + trace_multifd_send(p->id, packet_num, pages->num, p->flags, p->next_packet_size); } @@ -399,24 +431,6 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) return 0; } -struct { - MultiFDSendParams *params; - /* array of pages to sent */ - MultiFDPages_t *pages; - /* global number of generated multifd packets */ - uint64_t packet_num; - /* send channels ready */ - QemuSemaphore channels_ready; - /* - * Have we already run terminate threads. There is a race when it - * happens that we got one error while we are exiting. - * We will use atomic operations. Only valid values are 0 and 1. - */ - int exiting; - /* multifd ops */ - MultiFDMethods *ops; -} *multifd_send_state; - static bool multifd_send_should_exit(void) { return qatomic_read(&multifd_send_state->exiting); @@ -498,7 +512,6 @@ static bool multifd_send_pages(void) */ assert(qatomic_read(&p->pending_job) == false); qatomic_set(&p->pending_job, true); - p->packet_num = multifd_send_state->packet_num++; multifd_send_state->pages = p->pages; p->pages = pages; qemu_mutex_unlock(&p->mutex); @@ -731,7 +744,6 @@ int multifd_send_sync_main(void) trace_multifd_send_sync_main_signal(p->id); qemu_mutex_lock(&p->mutex); - p->packet_num = multifd_send_state->packet_num++; /* * We should be the only user so far, so not possible to be set by * others concurrently. diff --git a/migration/multifd.h b/migration/multifd.h index 9b40a53cb6..98876ff94a 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -97,8 +97,6 @@ typedef struct { bool running; /* multifd flags for each packet */ uint32_t flags; - /* global number of generated multifd packets */ - uint64_t packet_num; /* * The sender thread has work to do if either of below boolean is set. * -- Gitee From 2d20a1964c57f7b680b42156765c2c060883838d Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:57 +0800 Subject: [PATCH 052/134] migration/multifd: Optimize sender side to be lockless commit 488c84acb465c21b716c3fd14de27ab5ce388c85 upstream. When reviewing my attempt to refactor send_prepare(), Fabiano suggested we try out with dropping the mutex in multifd code [1]. I thought about that before but I never tried to change the code. Now maybe it's time to give it a stab. This only optimizes the sender side. The trick here is multifd has a clear provider/consumer model, that the migration main thread publishes requests (either pending_job/pending_sync), while the multifd sender threads are consumers. 
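In short, the publish/consume pairing looks roughly like this (a simplified sketch of the code in the diff below; error handling and the sync path are omitted):

    /* requester (migration thread), in multifd_send_pages() */
    p->pages = pages;                               /* publish job data  */
    qatomic_store_release(&p->pending_job, true);   /* then set the flag */
    qemu_sem_post(&p->sem);

    /* consumer (multifd sender thread) */
    if (qatomic_load_acquire(&p->pending_job)) {    /* read the flag first */
        /* ... p->pages is guaranteed visible here; prepare and send it */
        qatomic_store_release(&p->pending_job, false); /* hand pages back */
    }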
Here we don't have a lot of complicated data sharing, and the jobs can logically be submitted lockless. Arm the code with atomic weapons. Two things worth mentioning: - For multifd_send_pages(): we can use qatomic_load_acquire() when trying to find a free channel, but that's expensive if we attach one ACQUIRE per channel. Instead, keep the qatomic_read() on reading the pending_job flag as we do already, meanwhile use one smp_mb_acquire() after the loop to guarantee the memory ordering. - For pending_sync: it doesn't have any extra data required since now p->flags are never touched, it should be safe to not use memory barrier. That's different from pending_job. Provide rich comments for all the lockless operations to state how they are paired. With that, we can remove the mutex. [1] https://lore.kernel.org/r/87o7d1jlu5.fsf@suse.de Intel-SIG: commit 488c84acb465 migration/multifd: Optimize sender side to be lockless Suggested-by: Fabiano Rosas Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-24-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 51 +++++++++++++++++++++++---------------------- migration/multifd.h | 2 -- 2 files changed, 26 insertions(+), 27 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 3cbd4865f6..ede34ec4bf 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -502,19 +502,19 @@ static bool multifd_send_pages(void) } } - qemu_mutex_lock(&p->mutex); - assert(!p->pages->num); - assert(!p->pages->block); /* - * Double check on pending_job==false with the lock. In the future if - * we can have >1 requester thread, we can replace this with a "goto - * retry", but that is for later. + * Make sure we read p->pending_job before all the rest. Pairs with + * qatomic_store_release() in multifd_send_thread(). */ - assert(qatomic_read(&p->pending_job) == false); - qatomic_set(&p->pending_job, true); + smp_mb_acquire(); + assert(!p->pages->num); multifd_send_state->pages = p->pages; p->pages = pages; - qemu_mutex_unlock(&p->mutex); + /* + * Making sure p->pages is setup before marking pending_job=true. Pairs + * with the qatomic_load_acquire() in multifd_send_thread(). + */ + qatomic_store_release(&p->pending_job, true); qemu_sem_post(&p->sem); return true; @@ -649,7 +649,6 @@ static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) } multifd_send_channel_destroy(p->c); p->c = NULL; - qemu_mutex_destroy(&p->mutex); qemu_sem_destroy(&p->sem); qemu_sem_destroy(&p->sem_sync); g_free(p->name); @@ -743,14 +742,12 @@ int multifd_send_sync_main(void) trace_multifd_send_sync_main_signal(p->id); - qemu_mutex_lock(&p->mutex); /* * We should be the only user so far, so not possible to be set by * others concurrently. */ assert(qatomic_read(&p->pending_sync) == false); qatomic_set(&p->pending_sync, true); - qemu_mutex_unlock(&p->mutex); qemu_sem_post(&p->sem); } for (i = 0; i < migrate_multifd_channels(); i++) { @@ -800,9 +797,12 @@ static void *multifd_send_thread(void *opaque) if (multifd_send_should_exit()) { break; } - qemu_mutex_lock(&p->mutex); - if (qatomic_read(&p->pending_job)) { + /* + * Read pending_job flag before p->pages. Pairs with the + * qatomic_store_release() in multifd_send_pages(). 
+ */ + if (qatomic_load_acquire(&p->pending_job)) { MultiFDPages_t *pages = p->pages; p->iovs_num = 0; @@ -810,14 +810,12 @@ static void *multifd_send_thread(void *opaque) ret = multifd_send_state->ops->send_prepare(p, &local_err); if (ret != 0) { - qemu_mutex_unlock(&p->mutex); break; } ret = qio_channel_writev_full_all(p->c, p->iov, p->iovs_num, NULL, 0, p->write_flags, &local_err); if (ret != 0) { - qemu_mutex_unlock(&p->mutex); break; } @@ -826,24 +824,31 @@ static void *multifd_send_thread(void *opaque) multifd_pages_reset(p->pages); p->next_packet_size = 0; - qatomic_set(&p->pending_job, false); - qemu_mutex_unlock(&p->mutex); + + /* + * Making sure p->pages is published before saying "we're + * free". Pairs with the smp_mb_acquire() in + * multifd_send_pages(). + */ + qatomic_store_release(&p->pending_job, false); } else { - /* If not a normal job, must be a sync request */ + /* + * If not a normal job, must be a sync request. Note that + * pending_sync is a standalone flag (unlike pending_job), so + * it doesn't require explicit memory barriers. + */ assert(qatomic_read(&p->pending_sync)); p->flags = MULTIFD_FLAG_SYNC; multifd_send_fill_packet(p); ret = qio_channel_write_all(p->c, (void *)p->packet, p->packet_len, &local_err); if (ret != 0) { - qemu_mutex_unlock(&p->mutex); break; } /* p->next_packet_size will always be zero for a SYNC packet */ stat64_add(&mig_stats.multifd_bytes, p->packet_len); p->flags = 0; qatomic_set(&p->pending_sync, false); - qemu_mutex_unlock(&p->mutex); qemu_sem_post(&p->sem_sync); } } @@ -857,10 +862,7 @@ out: error_free(local_err); } - qemu_mutex_lock(&p->mutex); p->running = false; - qemu_mutex_unlock(&p->mutex); - rcu_unregister_thread(); migration_threads_remove(thread); trace_multifd_send_thread_end(p->id, p->packets_sent, p->total_normal_pages); @@ -1002,7 +1004,6 @@ int multifd_send_setup(Error **errp) for (i = 0; i < thread_count; i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; - qemu_mutex_init(&p->mutex); qemu_sem_init(&p->sem, 0); qemu_sem_init(&p->sem_sync, 0); p->id = i; diff --git a/migration/multifd.h b/migration/multifd.h index 98876ff94a..78a2317263 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -91,8 +91,6 @@ typedef struct { /* syncs main thread and channels */ QemuSemaphore sem_sync; - /* this mutex protects the following parameters */ - QemuMutex mutex; /* is this channel thread running */ bool running; /* multifd flags for each packet */ -- Gitee From 6a8ee3ee37ff581cef0b12ea2b60600320b8d6f0 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Tue, 6 Feb 2024 18:51:13 -0300 Subject: [PATCH 053/134] migration/multifd: Join the TLS thread commit e1921f10d9afe651f4887284e85f6789b37e67d3 upstream. We're currently leaking the resources of the TLS thread by not joining it and also overwriting the p->thread pointer altogether. 
Intel-SIG: commit e1921f10d9af migration/multifd: Join the TLS thread Fixes: a1af605bd5 ("migration/multifd: fix hangup with TLS-Multifd due to blocking handshake") Cc: qemu-stable Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240206215118.6171-2-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 8 +++++++- migration/multifd.h | 2 ++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/migration/multifd.c b/migration/multifd.c index ede34ec4bf..96b1acece2 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -631,6 +631,10 @@ static void multifd_send_terminate_threads(void) for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; + if (p->tls_thread_created) { + qemu_thread_join(&p->tls_thread); + } + if (p->running) { qemu_thread_join(&p->thread); } @@ -925,7 +929,9 @@ static bool multifd_tls_channel_connect(MultiFDSendParams *p, trace_multifd_tls_outgoing_handshake_start(ioc, tioc, hostname); qio_channel_set_name(QIO_CHANNEL(tioc), "multifd-tls-outgoing"); p->c = QIO_CHANNEL(tioc); - qemu_thread_create(&p->thread, "multifd-tls-handshake-worker", + + p->tls_thread_created = true; + qemu_thread_create(&p->tls_thread, "multifd-tls-handshake-worker", multifd_tls_handshake_thread, p, QEMU_THREAD_JOINABLE); return true; diff --git a/migration/multifd.h b/migration/multifd.h index 78a2317263..720c9d50db 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -73,6 +73,8 @@ typedef struct { char *name; /* channel thread id */ QemuThread thread; + QemuThread tls_thread; + bool tls_thread_created; /* communication channel */ QIOChannel *c; /* is the yank function registered */ -- Gitee From 87fd7df69918e4ee671556163c40d4d101959841 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Tue, 6 Feb 2024 18:51:14 -0300 Subject: [PATCH 054/134] migration/multifd: Remove p->running commit a2a63c4abd52f4e3ff4046dcb67fe44ebf0bb8de upstream. We currently only need p->running to avoid calling qemu_thread_join() on a non existent thread if the thread has never been created. However, there are at least two bugs in this logic: 1) On the sending side, p->running is set too early and qemu_thread_create() can be skipped due to an error during TLS handshake, leaving the flag set and leading to a crash when multifd_send_cleanup() calls qemu_thread_join(). 2) During exit, the multifd thread clears the flag while holding the channel lock. The counterpart at multifd_send_cleanup() reads the flag outside of the lock and might free the mutex while the multifd thread still has it locked. Fix the first issue by setting the flag right before creating the thread. Rename it from p->running to p->thread_created to clarify its usage. Fix the second issue by not clearing the flag at the multifd thread exit. We don't have any use for that. Note that these bugs are straight-forward logic issues and not race conditions. There is still a gap for races to affect this code due to multifd_send_cleanup() being allowed to run concurrently with the thread creation loop. This issue is solved in the next patches. 
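The thread bookkeeping after this change can be sketched standalone with plain pthreads; names and types below are simplified stand-ins rather than the QEMU API, and because pthread_create() can fail (unlike qemu_thread_create(), which aborts on failure), the sketch only sets the flag once creation has actually succeeded:

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* Simplified stand-in for one send/recv channel. */
    typedef struct {
        pthread_t thread;
        bool thread_created;
    } Channel;

    static void *worker(void *opaque)
    {
        (void)opaque;
        /* note: the worker never clears thread_created on exit */
        return NULL;
    }

    static bool channel_start(Channel *c)
    {
        /* only mark the flag when a thread really exists */
        if (pthread_create(&c->thread, NULL, worker, c) != 0) {
            return false;
        }
        c->thread_created = true;
        return true;
    }

    static void channel_cleanup(Channel *c)
    {
        /* join only if a thread was ever created */
        if (c->thread_created) {
            pthread_join(c->thread, NULL);
        }
    }

    int main(void)
    {
        Channel c = { .thread_created = false };

        channel_start(&c);
        channel_cleanup(&c);
        return 0;
    }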
Intel-SIG: commit a2a63c4abd52 migration/multifd: Remove p->running Cc: qemu-stable Fixes: 29647140157a ("migration/tls: add support for multifd tls-handshake") Reported-by: Avihai Horon Reported-by: chenyuhui5@huawei.com Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240206215118.6171-3-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 27 ++++++++++++--------------- migration/multifd.h | 7 ++----- 2 files changed, 14 insertions(+), 20 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 96b1acece2..08265cffe9 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -635,7 +635,7 @@ static void multifd_send_terminate_threads(void) qemu_thread_join(&p->tls_thread); } - if (p->running) { + if (p->thread_created) { qemu_thread_join(&p->thread); } } @@ -866,7 +866,6 @@ out: error_free(local_err); } - p->running = false; rcu_unregister_thread(); migration_threads_remove(thread); trace_multifd_send_thread_end(p->id, p->packets_sent, p->total_normal_pages); @@ -957,6 +956,8 @@ static bool multifd_channel_connect(MultiFDSendParams *p, migration_ioc_register_yank(ioc); p->registered_yank = true; p->c = ioc; + + p->thread_created = true; qemu_thread_create(&p->thread, p->name, multifd_send_thread, p, QEMU_THREAD_JOINABLE); return true; @@ -971,7 +972,6 @@ static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque) trace_multifd_new_send_channel_async(p->id); if (!qio_task_propagate_error(task, &local_err)) { qio_channel_set_delay(ioc, false); - p->running = true; if (multifd_channel_connect(p, ioc, &local_err)) { return; } @@ -1132,15 +1132,15 @@ void multifd_recv_cleanup(void) for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDRecvParams *p = &multifd_recv_state->params[i]; - if (p->running) { - /* - * multifd_recv_thread may hung at MULTIFD_FLAG_SYNC handle code, - * however try to wakeup it without harm in cleanup phase. - */ - qemu_sem_post(&p->sem_sync); - } + /* + * multifd_recv_thread may hung at MULTIFD_FLAG_SYNC handle code, + * however try to wakeup it without harm in cleanup phase. 
+ */ + qemu_sem_post(&p->sem_sync); - qemu_thread_join(&p->thread); + if (p->thread_created) { + qemu_thread_join(&p->thread); + } } for (i = 0; i < migrate_multifd_channels(); i++) { multifd_recv_cleanup_channel(&multifd_recv_state->params[i]); @@ -1226,9 +1226,6 @@ static void *multifd_recv_thread(void *opaque) multifd_recv_terminate_threads(local_err); error_free(local_err); } - qemu_mutex_lock(&p->mutex); - p->running = false; - qemu_mutex_unlock(&p->mutex); rcu_unregister_thread(); trace_multifd_recv_thread_end(p->id, p->packets_recved, p->total_normal_pages); @@ -1334,7 +1331,7 @@ void multifd_recv_new_channel(QIOChannel *ioc, Error **errp) p->c = ioc; object_ref(OBJECT(ioc)); - p->running = true; + p->thread_created = true; qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p, QEMU_THREAD_JOINABLE); qatomic_inc(&multifd_recv_state->count); diff --git a/migration/multifd.h b/migration/multifd.h index 720c9d50db..7881980ee6 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -73,6 +73,7 @@ typedef struct { char *name; /* channel thread id */ QemuThread thread; + bool thread_created; QemuThread tls_thread; bool tls_thread_created; /* communication channel */ @@ -93,8 +94,6 @@ typedef struct { /* syncs main thread and channels */ QemuSemaphore sem_sync; - /* is this channel thread running */ - bool running; /* multifd flags for each packet */ uint32_t flags; /* @@ -143,6 +142,7 @@ typedef struct { char *name; /* channel thread id */ QemuThread thread; + bool thread_created; /* communication channel */ QIOChannel *c; /* packet allocated len */ @@ -157,8 +157,6 @@ typedef struct { /* this mutex protects the following parameters */ QemuMutex mutex; - /* is this channel thread running */ - bool running; /* should this thread finish */ bool quit; /* multifd flags for each packet */ @@ -217,4 +215,3 @@ static inline void multifd_send_prepare_header(MultiFDSendParams *p) #endif - -- Gitee From a8a04775ee0f593dbc7e3cd1d0bae9f405d908d0 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Tue, 6 Feb 2024 18:51:15 -0300 Subject: [PATCH 055/134] migration/multifd: Move multifd_send_setup error handling in to the function commit bd8b0a8f82d8fc17aa285ab963ba75675c2fbe7a upstream. Hide the error handling inside multifd_send_setup to make it cleaner for the next patch to move the function around. 
Intel-SIG: commit bd8b0a8f82d8 migration/multifd: Move multifd_send_setup error handling in to the function Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240206215118.6171-4-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/migration.c | 6 +----- migration/multifd.c | 24 +++++++++++++++++------- migration/multifd.h | 2 +- 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index 77429c563f..f797fdba33 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -3624,11 +3624,7 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) return; } - if (multifd_send_setup(&local_err) != 0) { - migrate_set_error(s, local_err); - error_report_err(local_err); - migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, - MIGRATION_STATUS_FAILED); + if (!multifd_send_setup()) { migrate_fd_cleanup(s); return; } diff --git a/migration/multifd.c b/migration/multifd.c index 08265cffe9..b3bd247bbf 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -989,14 +989,16 @@ static void multifd_new_send_channel_create(gpointer opaque) socket_send_channel_create(multifd_new_send_channel_async, opaque); } -int multifd_send_setup(Error **errp) +bool multifd_send_setup(void) { - int thread_count; + MigrationState *s = migrate_get_current(); + Error *local_err = NULL; + int thread_count, ret = 0; uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size(); uint8_t i; if (!migrate_multifd()) { - return 0; + return true; } thread_count = migrate_multifd_channels(); @@ -1030,14 +1032,22 @@ int multifd_send_setup(Error **errp) for (i = 0; i < thread_count; i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; - int ret; - ret = multifd_send_state->ops->send_setup(p, errp); + ret = multifd_send_state->ops->send_setup(p, &local_err); if (ret) { - return ret; + break; } } - return 0; + + if (ret) { + migrate_set_error(s, local_err); + error_report_err(local_err); + migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, + MIGRATION_STATUS_FAILED); + return false; + } + + return true; } struct { diff --git a/migration/multifd.h b/migration/multifd.h index 7881980ee6..8a1cad0996 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -13,7 +13,7 @@ #ifndef QEMU_MIGRATION_MULTIFD_H #define QEMU_MIGRATION_MULTIFD_H -int multifd_send_setup(Error **errp); +bool multifd_send_setup(void); void multifd_send_shutdown(void); int multifd_recv_setup(Error **errp); void multifd_recv_cleanup(void); -- Gitee From 8d3ea10ecba49213651bfa409f87b27a4feab5f2 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Tue, 6 Feb 2024 18:51:16 -0300 Subject: [PATCH 056/134] migration/multifd: Move multifd_send_setup into migration thread commit dd904bc13f2af0c605c3fe72f118ea4e27a6610c upstream. We currently have an unfavorable situation around multifd channels creation and the migration thread execution. We create the multifd channels with qio_channel_socket_connect_async -> qio_task_run_in_thread, but only connect them at the multifd_new_send_channel_async callback, called from qio_task_complete, which is registered as a glib event. So at multifd_send_setup() we create the channels, but they will only be actually usable after the whole multifd_send_setup() calling stack returns back to the main loop. Which means that the migration thread is already up and running without any possibility for the multifd channels to be ready on time. 
We currently rely on the channels-ready semaphore blocking multifd_send_sync_main() until channels start to come up and release it. However there have been bugs recently found when a channel's creation fails and multifd_send_cleanup() is allowed to run while other channels are still being created. Let's start to organize this situation by moving the multifd_send_setup() call into the migration thread. That way we unblock the main-loop to dispatch the completion callbacks and actually have a chance of getting the multifd channels ready for when the migration thread needs them. The next patches will deal with the synchronization aspects. Note that this takes multifd_send_setup() out of the BQL. Intel-SIG: commit dd904bc13f2a migration/multifd: Move multifd_send_setup into migration thread Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240206215118.6171-5-farosas@suse.de Signed-off-by: Peter Xu Conflicts: migration/migration.c [bql] [jz: resolve context conflict due to BQL] Signed-off-by: Jason Zeng --- migration/migration.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index f797fdba33..4aaf280315 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -3300,6 +3300,10 @@ static void *migration_thread(void *opaque) object_ref(OBJECT(s)); update_iteration_initial_status(s); + if (!multifd_send_setup()) { + goto out; + } + qemu_mutex_lock_iothread(); qemu_savevm_state_header(s->to_dst_file); qemu_mutex_unlock_iothread(); @@ -3371,6 +3375,7 @@ static void *migration_thread(void *opaque) urgent = migration_rate_limit(); } +out: trace_migration_thread_after_loop(); migration_iteration_finish(s); object_unref(OBJECT(s)); @@ -3624,11 +3629,6 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) return; } - if (!multifd_send_setup()) { - migrate_fd_cleanup(s); - return; - } - if (migrate_background_snapshot()) { qemu_thread_create(&s->thread, "bg_snapshot", bg_migration_thread, s, QEMU_THREAD_JOINABLE); -- Gitee From 9fc3090a287dba7c65364f1ee562d28fe963e064 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Tue, 6 Feb 2024 18:51:17 -0300 Subject: [PATCH 057/134] migration/multifd: Unify multifd and TLS connection paths commit 2576ae488ef9aa692486157df7d8b410919cd219 upstream. During multifd channel creation (multifd_send_new_channel_async) when TLS is enabled, the multifd_channel_connect function is called twice, once to create the TLS handshake thread and another time after the asynchrounous TLS handshake has finished. This creates a slightly confusing call stack where multifd_channel_connect() is called more times than the number of channels. It also splits error handling between the two callers of multifd_channel_connect() causing some code duplication. Lastly, it gets in the way of having a single point to determine whether all channel creation tasks have been initiated. Refactor the code to move the reentrancy one level up at the multifd_new_send_channel_async() level, de-duplicating the error handling and allowing for the next patch to introduce a synchronization point common to all the multifd channel creation, regardless of TLS. Note that the previous code would never fail once p->c had been set. This patch changes this assumption, which affects refcounting, so add comments around object_unref to explain the situation. 
Intel-SIG: commit 2576ae488ef9 migration/multifd: Unify multifd and TLS connection paths Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240206215118.6171-6-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 83 ++++++++++++++++++++++----------------------- 1 file changed, 40 insertions(+), 43 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index b3bd247bbf..f36d454ec0 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -873,30 +873,7 @@ out: return NULL; } -static bool multifd_channel_connect(MultiFDSendParams *p, - QIOChannel *ioc, - Error **errp); - -static void multifd_tls_outgoing_handshake(QIOTask *task, - gpointer opaque) -{ - MultiFDSendParams *p = opaque; - QIOChannel *ioc = QIO_CHANNEL(qio_task_get_source(task)); - Error *err = NULL; - - if (!qio_task_propagate_error(task, &err)) { - trace_multifd_tls_outgoing_handshake_complete(ioc); - if (multifd_channel_connect(p, ioc, &err)) { - return; - } - } - - trace_multifd_tls_outgoing_handshake_error(ioc, error_get_pretty(err)); - - multifd_send_set_error(err); - multifd_send_kick_main(p); - error_free(err); -} +static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque); static void *multifd_tls_handshake_thread(void *opaque) { @@ -904,7 +881,7 @@ static void *multifd_tls_handshake_thread(void *opaque) QIOChannelTLS *tioc = QIO_CHANNEL_TLS(p->c); qio_channel_tls_handshake(tioc, - multifd_tls_outgoing_handshake, + multifd_new_send_channel_async, p, NULL, NULL); @@ -924,6 +901,10 @@ static bool multifd_tls_channel_connect(MultiFDSendParams *p, return false; } + /* + * Ownership of the socket channel now transfers to the newly + * created TLS channel, which has already taken a reference. + */ object_unref(OBJECT(ioc)); trace_multifd_tls_outgoing_handshake_start(ioc, tioc, hostname); qio_channel_set_name(QIO_CHANNEL(tioc), "multifd-tls-outgoing"); @@ -940,18 +921,7 @@ static bool multifd_channel_connect(MultiFDSendParams *p, QIOChannel *ioc, Error **errp) { - trace_multifd_set_outgoing_channel( - ioc, object_get_typename(OBJECT(ioc)), - migrate_get_current()->hostname); - - if (migrate_channel_requires_tls_upgrade(ioc)) { - /* - * tls_channel_connect will call back to this - * function after the TLS handshake, - * so we mustn't call multifd_send_thread until then - */ - return multifd_tls_channel_connect(p, ioc, errp); - } + qio_channel_set_delay(ioc, false); migration_ioc_register_yank(ioc); p->registered_yank = true; @@ -963,24 +933,51 @@ static bool multifd_channel_connect(MultiFDSendParams *p, return true; } +/* + * When TLS is enabled this function is called once to establish the + * TLS connection and a second time after the TLS handshake to create + * the multifd channel. Without TLS it goes straight into the channel + * creation. 
+ */ static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque) { MultiFDSendParams *p = opaque; QIOChannel *ioc = QIO_CHANNEL(qio_task_get_source(task)); Error *local_err = NULL; + bool ret; trace_multifd_new_send_channel_async(p->id); - if (!qio_task_propagate_error(task, &local_err)) { - qio_channel_set_delay(ioc, false); - if (multifd_channel_connect(p, ioc, &local_err)) { - return; - } + + if (qio_task_propagate_error(task, &local_err)) { + ret = false; + goto out; + } + + trace_multifd_set_outgoing_channel(ioc, object_get_typename(OBJECT(ioc)), + migrate_get_current()->hostname); + + if (migrate_channel_requires_tls_upgrade(ioc)) { + ret = multifd_tls_channel_connect(p, ioc, &local_err); + } else { + ret = multifd_channel_connect(p, ioc, &local_err); } + if (ret) { + return; + } + +out: trace_multifd_new_send_channel_async_error(p->id, local_err); multifd_send_set_error(local_err); multifd_send_kick_main(p); - object_unref(OBJECT(ioc)); + if (!p->c) { + /* + * If no channel has been created, drop the initial + * reference. Otherwise cleanup happens at + * multifd_send_channel_destroy() + */ + object_unref(OBJECT(ioc)); + } error_free(local_err); } -- Gitee From d884cb43f5763eb900fc4f19187a6877e07a0734 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Tue, 6 Feb 2024 18:51:18 -0300 Subject: [PATCH 058/134] migration/multifd: Add a synchronization point for channel creation commit 93fa9dc2e0522c54b813dee0898a5feb98b624c9 upstream. It is possible that one of the multifd channels fails to be created at multifd_new_send_channel_async() while the rest of the channel creation tasks are still in flight. This could lead to multifd_save_cleanup() executing the qemu_thread_join() loop too early and not waiting for the threads which haven't been created yet, leading to the freeing of resources that the newly created threads will try to access and crash. Add a synchronization point after which there will be no attempts at thread creation and therefore calling multifd_save_cleanup() past that point will ensure it properly waits for the threads. A note about performance: Prior to this patch, if a channel took too long to be established, other channels could finish connecting first and already start taking load. Now we're bounded by the slowest-connecting channel. Intel-SIG: commit 93fa9dc2e052 migration/multifd: Add a synchronization point for channel creation Reported-by: Avihai Horon Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240206215118.6171-7-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index f36d454ec0..cdeac3279d 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -63,6 +63,11 @@ struct { * Make it easy for now. */ uintptr_t packet_num; + /* + * Synchronization point past which no more channels will be + * created. + */ + QemuSemaphore channels_created; /* send channels ready */ QemuSemaphore channels_ready; /* @@ -623,10 +628,6 @@ static void multifd_send_terminate_threads(void) /* * Finally recycle all the threads. - * - * TODO: p->running is still buggy, e.g. we can reach here without the - * corresponding multifd_new_send_channel_async() get invoked yet, - * then a new thread can even be created after this function returns. 
*/ for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; @@ -671,6 +672,7 @@ static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) static void multifd_send_cleanup_state(void) { + qemu_sem_destroy(&multifd_send_state->channels_created); qemu_sem_destroy(&multifd_send_state->channels_ready); g_free(multifd_send_state->params); multifd_send_state->params = NULL; @@ -958,18 +960,26 @@ static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque) if (migrate_channel_requires_tls_upgrade(ioc)) { ret = multifd_tls_channel_connect(p, ioc, &local_err); + if (ret) { + return; + } } else { ret = multifd_channel_connect(p, ioc, &local_err); } +out: + /* + * Here we're not interested whether creation succeeded, only that + * it happened at all. + */ + qemu_sem_post(&multifd_send_state->channels_created); + if (ret) { return; } -out: trace_multifd_new_send_channel_async_error(p->id, local_err); multifd_send_set_error(local_err); - multifd_send_kick_main(p); if (!p->c) { /* * If no channel has been created, drop the initial @@ -1002,6 +1012,7 @@ bool multifd_send_setup(void) multifd_send_state = g_malloc0(sizeof(*multifd_send_state)); multifd_send_state->params = g_new0(MultiFDSendParams, thread_count); multifd_send_state->pages = multifd_pages_init(page_count); + qemu_sem_init(&multifd_send_state->channels_created, 0); qemu_sem_init(&multifd_send_state->channels_ready, 0); qatomic_set(&multifd_send_state->exiting, 0); multifd_send_state->ops = multifd_ops[migrate_multifd_compression()]; @@ -1027,6 +1038,15 @@ bool multifd_send_setup(void) multifd_new_send_channel_create(p); } + /* + * Wait until channel creation has started for all channels. The + * creation can still fail, but no more channels will be created + * past this point. + */ + for (i = 0; i < thread_count; i++) { + qemu_sem_wait(&multifd_send_state->channels_created); + } + for (i = 0; i < thread_count; i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; -- Gitee From 1e9bc9e878ed9f9f7037c9f461578bdd5d57267b Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Tue, 20 Feb 2024 19:41:05 -0300 Subject: [PATCH 059/134] docs/devel/migration.rst: Document the file transport commit c35462f19b70afd27420f260aaa62adb30eafe91 upstream. When adding the support for file migration with the file: transport, we missed adding documentation for it. Intel-SIG: commit c35462f19b70 docs/devel/migration.rst: Document the file transport Signed-off-by: Fabiano Rosas Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20240220224138.24759-2-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- docs/devel/migration/main.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/devel/migration/main.rst b/docs/devel/migration/main.rst index 396c7c51ca..9f907a85e6 100644 --- a/docs/devel/migration/main.rst +++ b/docs/devel/migration/main.rst @@ -41,6 +41,10 @@ over any transport. - exec migration: do the migration using the stdin/stdout through a process. - fd migration: do the migration using a file descriptor that is passed to QEMU. QEMU doesn't care how this file descriptor is opened. +- file migration: do the migration using a file that is passed to QEMU + by path. A file offset option is supported to allow a management + application to add its own metadata to the start of the file without + QEMU interference. 
In addition, support is included for migration using RDMA, which transports the page data using ``RDMA``, where the hardware takes care of -- Gitee From ebb19a51a35bbad7942338ff100bb5514a51e43d Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Tue, 20 Feb 2024 19:41:06 -0300 Subject: [PATCH 060/134] tests/qtest/migration: Rename fd_proto test commit 85cf9abd865841878c8d6df91b055aea06795fca upstream. Next patch adds another fd test. Rename the existing one closer to what's used on other tests, with the 'precopy' prefix. Intel-SIG: commit 85cf9abd8658 tests/qtest/migration: Rename fd_proto test Signed-off-by: Fabiano Rosas Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20240220224138.24759-3-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- tests/qtest/migration-test.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c index 4bdf397828..da00bf22c5 100644 --- a/tests/qtest/migration-test.c +++ b/tests/qtest/migration-test.c @@ -2358,7 +2358,7 @@ static void test_migrate_fd_finish_hook(QTestState *from, qobject_unref(rsp); } -static void test_migrate_fd_proto(void) +static void test_migrate_precopy_fd_socket(void) { MigrateCommon args = { .listen_uri = "defer", @@ -3452,7 +3452,8 @@ int main(int argc, char **argv) /* migration_test_add("/migration/ignore_shared", test_ignore_shared); */ #ifndef _WIN32 - migration_test_add("/migration/fd_proto", test_migrate_fd_proto); + migration_test_add("/migration/precopy/fd/tcp", + test_migrate_precopy_fd_socket); #endif migration_test_add("/migration/validate_uuid", test_validate_uuid); migration_test_add("/migration/validate_uuid_error", -- Gitee From 41bd56a1a1ed4c512813229c00ca8a57ba8da04a Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Tue, 20 Feb 2024 19:41:07 -0300 Subject: [PATCH 061/134] tests/qtest/migration: Add a fd + file test commit 6d79bd6818b17bcfc8245f6f8df482ecb03d8e3e upstream. The fd URI supports an fd that is backed by a file. The code should select between QIOChannelFile and QIOChannelSocket, depending on the type of the fd. Add a test for that. 
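As a hedged sketch of the kind of fd classification this relies on (the helper below is made up for illustration; the real choice between QIOChannelSocket and QIOChannelFile is made inside QEMU's fd transport, not by this function):

    #include <stdio.h>
    #include <sys/socket.h>
    #include <sys/stat.h>
    #include <unistd.h>

    /* Illustrative helper only: report what kind of object an fd refers to. */
    static const char *classify_fd(int fd)
    {
        struct stat st;

        if (fstat(fd, &st) < 0) {
            return "unknown";
        }
        if (S_ISSOCK(st.st_mode)) {
            return "socket";        /* stream-style migration channel */
        }
        if (S_ISREG(st.st_mode)) {
            return "regular file";  /* offset-based (file) migration */
        }
        return "other";
    }

    int main(void)
    {
        int sv[2];

        if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == 0) {
            printf("socketpair fd is a %s\n", classify_fd(sv[0]));
            close(sv[0]);
            close(sv[1]);
        }
        printf("stdin is a %s\n", classify_fd(0));
        return 0;
    }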
Intel-SIG: commit 6d79bd6818b1 tests/qtest/migration: Add a fd + file test Signed-off-by: Fabiano Rosas Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20240220224138.24759-4-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- tests/qtest/migration-test.c | 41 ++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c index da00bf22c5..9b244dedb2 100644 --- a/tests/qtest/migration-test.c +++ b/tests/qtest/migration-test.c @@ -2368,6 +2368,45 @@ static void test_migrate_precopy_fd_socket(void) }; test_precopy_common(&args); } + +static void *migrate_precopy_fd_file_start(QTestState *from, QTestState *to) +{ + g_autofree char *file = g_strdup_printf("%s/%s", tmpfs, FILE_TEST_FILENAME); + int src_flags = O_CREAT | O_RDWR; + int dst_flags = O_CREAT | O_RDWR; + int fds[2]; + + fds[0] = open(file, src_flags, 0660); + assert(fds[0] != -1); + + fds[1] = open(file, dst_flags, 0660); + assert(fds[1] != -1); + + + qtest_qmp_fds_assert_success(to, &fds[0], 1, + "{ 'execute': 'getfd'," + " 'arguments': { 'fdname': 'fd-mig' }}"); + + qtest_qmp_fds_assert_success(from, &fds[1], 1, + "{ 'execute': 'getfd'," + " 'arguments': { 'fdname': 'fd-mig' }}"); + + close(fds[0]); + close(fds[1]); + + return NULL; +} + +static void test_migrate_precopy_fd_file(void) +{ + MigrateCommon args = { + .listen_uri = "defer", + .connect_uri = "fd:fd-mig", + .start_hook = migrate_precopy_fd_file_start, + .finish_hook = test_migrate_fd_finish_hook + }; + test_file_common(&args, true); +} #endif /* _WIN32 */ static void do_test_validate_uuid(MigrateStart *args, bool should_fail) @@ -3454,6 +3493,8 @@ int main(int argc, char **argv) #ifndef _WIN32 migration_test_add("/migration/precopy/fd/tcp", test_migrate_precopy_fd_socket); + migration_test_add("/migration/precopy/fd/file", + test_migrate_precopy_fd_file); #endif migration_test_add("/migration/validate_uuid", test_validate_uuid); migration_test_add("/migration/validate_uuid_error", -- Gitee From e8370b17100d31d02636308de4720564ead8db27 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Tue, 20 Feb 2024 19:41:08 -0300 Subject: [PATCH 062/134] migration/multifd: Remove p->quit from recv side commit 11dd7be57524d400652cecf8740a016b3d66b53d upstream. Like we did on the sending side, replace the p->quit per-channel flag with a global atomic 'exiting' flag. 
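A standalone sketch of the single-shot 'exiting' pattern, with C11 atomics standing in for QEMU's qatomic_read()/qatomic_xchg() and made-up function names in place of the real multifd symbols:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* Global flag shared by all recv channels; C11 atomics stand in for
     * QEMU's qatomic_read()/qatomic_xchg(). */
    static atomic_int exiting;

    static bool recv_should_exit(void)
    {
        return atomic_load(&exiting) != 0;
    }

    /* Runs the teardown at most once even if several threads race into it,
     * mirroring the early return on qatomic_xchg(&...->exiting, 1). */
    static void recv_terminate_threads(void)
    {
        if (atomic_exchange(&exiting, 1)) {
            return;   /* someone else already started the teardown */
        }
        printf("tearing down recv channels\n");
    }

    int main(void)
    {
        recv_terminate_threads();
        recv_terminate_threads();   /* no-op the second time */
        printf("should exit: %d\n", recv_should_exit());
        return 0;
    }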
Intel-SIG: commit 11dd7be57524 migration/multifd: Remove p->quit from recv side Signed-off-by: Fabiano Rosas Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20240220224138.24759-5-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 41 ++++++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index cdeac3279d..8b18b9e055 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -80,6 +80,19 @@ struct { MultiFDMethods *ops; } *multifd_send_state; +struct { + MultiFDRecvParams *params; + /* number of created threads */ + int count; + /* syncs main thread and channels */ + QemuSemaphore sem_sync; + /* global number of generated multifd packets */ + uint64_t packet_num; + int exiting; + /* multifd ops */ + MultiFDMethods *ops; +} *multifd_recv_state; + /* Multifd without compression */ /** @@ -441,6 +454,11 @@ static bool multifd_send_should_exit(void) return qatomic_read(&multifd_send_state->exiting); } +static bool multifd_recv_should_exit(void) +{ + return qatomic_read(&multifd_recv_state->exiting); +} + /* * The migration thread can wait on either of the two semaphores. This * function can be used to kick the main thread out of waiting on either of @@ -1067,24 +1085,16 @@ bool multifd_send_setup(void) return true; } -struct { - MultiFDRecvParams *params; - /* number of created threads */ - int count; - /* syncs main thread and channels */ - QemuSemaphore sem_sync; - /* global number of generated multifd packets */ - uint64_t packet_num; - /* multifd ops */ - MultiFDMethods *ops; -} *multifd_recv_state; - static void multifd_recv_terminate_threads(Error *err) { int i; trace_multifd_recv_terminate_threads(err != NULL); + if (qatomic_xchg(&multifd_recv_state->exiting, 1)) { + return; + } + if (err) { MigrationState *s = migrate_get_current(); migrate_set_error(s, err); @@ -1098,8 +1108,6 @@ static void multifd_recv_terminate_threads(Error *err) for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDRecvParams *p = &multifd_recv_state->params[i]; - qemu_mutex_lock(&p->mutex); - p->quit = true; /* * We could arrive here for two reasons: * - normal quit, i.e. everything went fine, just finished @@ -1109,7 +1117,6 @@ static void multifd_recv_terminate_threads(Error *err) if (p->c) { qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL); } - qemu_mutex_unlock(&p->mutex); } } @@ -1214,7 +1221,7 @@ static void *multifd_recv_thread(void *opaque) while (true) { uint32_t flags; - if (p->quit) { + if (multifd_recv_should_exit()) { break; } @@ -1278,6 +1285,7 @@ int multifd_recv_setup(Error **errp) multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state)); multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count); qatomic_set(&multifd_recv_state->count, 0); + qatomic_set(&multifd_recv_state->exiting, 0); qemu_sem_init(&multifd_recv_state->sem_sync, 0); multifd_recv_state->ops = multifd_ops[migrate_multifd_compression()]; @@ -1286,7 +1294,6 @@ int multifd_recv_setup(Error **errp) qemu_mutex_init(&p->mutex); qemu_sem_init(&p->sem_sync, 0); - p->quit = false; p->id = i; p->packet_len = sizeof(MultiFDPacket_t) + sizeof(uint64_t) * page_count; -- Gitee From 6cbca3f5a59bcaab052176fb4ef6bcac11d97996 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Tue, 20 Feb 2024 19:41:09 -0300 Subject: [PATCH 063/134] migration/multifd: Release recv sem_sync earlier commit d13f0026c7a625a5a34a5dea4095a4d9cfa04652 upstream. 
Now that multifd_recv_terminate_threads() is called only once, release the recv side sem_sync earlier like we do for the send side. Intel-SIG: commit d13f0026c7a6 migration/multifd: Release recv sem_sync earlier Signed-off-by: Fabiano Rosas Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20240220224138.24759-6-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 8b18b9e055..315774b2fd 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -1108,6 +1108,12 @@ static void multifd_recv_terminate_threads(Error *err) for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDRecvParams *p = &multifd_recv_state->params[i]; + /* + * multifd_recv_thread may hung at MULTIFD_FLAG_SYNC handle code, + * however try to wakeup it without harm in cleanup phase. + */ + qemu_sem_post(&p->sem_sync); + /* * We could arrive here for two reasons: * - normal quit, i.e. everything went fine, just finished @@ -1166,12 +1172,6 @@ void multifd_recv_cleanup(void) for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDRecvParams *p = &multifd_recv_state->params[i]; - /* - * multifd_recv_thread may hung at MULTIFD_FLAG_SYNC handle code, - * however try to wakeup it without harm in cleanup phase. - */ - qemu_sem_post(&p->sem_sync); - if (p->thread_created) { qemu_thread_join(&p->thread); } -- Gitee From 9cf6a7bef3afaf94b4da83f8718e148c5dab2d77 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 22 Feb 2024 17:52:57 +0800 Subject: [PATCH 064/134] migration/multifd: Cleanup TLS iochannel referencing commit 9221e3c6a237da90ac296adfeb6e99ea9babfc20 upstream. Commit a1af605bd5 ("migration/multifd: fix hangup with TLS-Multifd due to blocking handshake") introduced a thread for TLS channels, which will resolve the issue on blocking the main thread. However in the same commit p->c is slightly abused just to be able to pass over the pointer "p" into the thread. That's the major reason we'll need to conditionally free the io channel in the fault paths. To clean it up, using a separate structure to pass over both "p" and "tioc" in the tls handshake thread. Then we can make it a rule that p->c will never be set until the channel is completely setup. With that, we can drop the tricky conditional unref of the io channel in the error path. 
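The argument-bundling part can be sketched with plain pthreads; the struct and names below are simplified stand-ins for MultiFDTLSThreadArgs and the QEMU thread wrappers:

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Made-up stand-ins for MultiFDSendParams and QIOChannelTLS. */
    typedef struct { int id; } SendParams;
    typedef struct { int fd; } TlsChannel;

    /* Analogous to MultiFDTLSThreadArgs: carry both pointers explicitly
     * instead of smuggling one of them through the other's fields. */
    typedef struct {
        SendParams *p;
        TlsChannel *tioc;
    } ThreadArgs;

    static void *handshake_thread(void *opaque)
    {
        ThreadArgs *args = opaque;

        printf("handshake for channel %d on fd %d\n",
               args->p->id, args->tioc->fd);
        free(args);               /* the thread owns and frees the bundle */
        return NULL;
    }

    int main(void)
    {
        SendParams p = { .id = 0 };
        TlsChannel tioc = { .fd = 3 };
        ThreadArgs *args = malloc(sizeof(*args));
        pthread_t thread;

        if (!args) {
            return 1;
        }
        args->p = &p;
        args->tioc = &tioc;
        pthread_create(&thread, NULL, handshake_thread, args);
        pthread_join(thread, NULL);
        return 0;
    }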
Intel-SIG: commit 9221e3c6a237 migration/multifd: Cleanup TLS iochannel referencing Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240222095301.171137-2-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 315774b2fd..a87c294ee3 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -895,16 +895,22 @@ out: static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque); +typedef struct { + MultiFDSendParams *p; + QIOChannelTLS *tioc; +} MultiFDTLSThreadArgs; + static void *multifd_tls_handshake_thread(void *opaque) { - MultiFDSendParams *p = opaque; - QIOChannelTLS *tioc = QIO_CHANNEL_TLS(p->c); + MultiFDTLSThreadArgs *args = opaque; - qio_channel_tls_handshake(tioc, + qio_channel_tls_handshake(args->tioc, multifd_new_send_channel_async, - p, + args->p, NULL, NULL); + g_free(args); + return NULL; } @@ -914,6 +920,7 @@ static bool multifd_tls_channel_connect(MultiFDSendParams *p, { MigrationState *s = migrate_get_current(); const char *hostname = s->hostname; + MultiFDTLSThreadArgs *args; QIOChannelTLS *tioc; tioc = migration_tls_client_create(ioc, hostname, errp); @@ -928,11 +935,14 @@ static bool multifd_tls_channel_connect(MultiFDSendParams *p, object_unref(OBJECT(ioc)); trace_multifd_tls_outgoing_handshake_start(ioc, tioc, hostname); qio_channel_set_name(QIO_CHANNEL(tioc), "multifd-tls-outgoing"); - p->c = QIO_CHANNEL(tioc); + + args = g_new0(MultiFDTLSThreadArgs, 1); + args->tioc = tioc; + args->p = p; p->tls_thread_created = true; qemu_thread_create(&p->tls_thread, "multifd-tls-handshake-worker", - multifd_tls_handshake_thread, p, + multifd_tls_handshake_thread, args, QEMU_THREAD_JOINABLE); return true; } @@ -945,6 +955,7 @@ static bool multifd_channel_connect(MultiFDSendParams *p, migration_ioc_register_yank(ioc); p->registered_yank = true; + /* Setup p->c only if the channel is completely setup */ p->c = ioc; p->thread_created = true; @@ -998,14 +1009,12 @@ out: trace_multifd_new_send_channel_async_error(p->id, local_err); multifd_send_set_error(local_err); - if (!p->c) { - /* - * If no channel has been created, drop the initial - * reference. Otherwise cleanup happens at - * multifd_send_channel_destroy() - */ - object_unref(OBJECT(ioc)); - } + /* + * For error cases (TLS or non-TLS), IO channel is always freed here + * rather than when cleanup multifd: since p->c is not set, multifd + * cleanup code doesn't even know its existence. + */ + object_unref(OBJECT(ioc)); error_free(local_err); } -- Gitee From d166ede31083a8d3a93fab2917f96cd5a3902d00 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 22 Feb 2024 17:52:58 +0800 Subject: [PATCH 065/134] migration/multifd: Drop registered_yank commit 0518b5d8d30d3a4d0ea4f45d61527bcdc43044d2 upstream. With a clear definition of p->c protocol, where we only set it up if the channel is fully established (TLS or non-TLS), registered_yank boolean will have equal meaning of "p->c != NULL". Drop registered_yank by checking p->c instead. 
Intel-SIG: commit 0518b5d8d30d migration/multifd: Drop registered_yank Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240222095301.171137-3-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 7 +++---- migration/multifd.h | 2 -- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index a87c294ee3..41a2aa7d25 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -667,11 +667,11 @@ static int multifd_send_channel_destroy(QIOChannel *send) static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) { - if (p->registered_yank) { + if (p->c) { migration_ioc_unregister_yank(p->c); + multifd_send_channel_destroy(p->c); + p->c = NULL; } - multifd_send_channel_destroy(p->c); - p->c = NULL; qemu_sem_destroy(&p->sem); qemu_sem_destroy(&p->sem_sync); g_free(p->name); @@ -954,7 +954,6 @@ static bool multifd_channel_connect(MultiFDSendParams *p, qio_channel_set_delay(ioc, false); migration_ioc_register_yank(ioc); - p->registered_yank = true; /* Setup p->c only if the channel is completely setup */ p->c = ioc; diff --git a/migration/multifd.h b/migration/multifd.h index 8a1cad0996..b3fe27ae93 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -78,8 +78,6 @@ typedef struct { bool tls_thread_created; /* communication channel */ QIOChannel *c; - /* is the yank function registered */ - bool registered_yank; /* packet allocated len */ uint32_t packet_len; /* guest page size */ -- Gitee From 01a6b732ee49735858c33895ded41b9c1c9b2e22 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 22 Feb 2024 17:52:59 +0800 Subject: [PATCH 066/134] migration/multifd: Make multifd_channel_connect() return void commit 770de49c00fa9eb262473f282c92979b47b7fd22 upstream. It never fails, drop the retval and also the Error**. Intel-SIG: commit 770de49c00fa migration/multifd: Make multifd_channel_connect() return void Suggested-by: Avihai Horon Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240222095301.171137-4-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 41a2aa7d25..a816232121 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -947,9 +947,7 @@ static bool multifd_tls_channel_connect(MultiFDSendParams *p, return true; } -static bool multifd_channel_connect(MultiFDSendParams *p, - QIOChannel *ioc, - Error **errp) +static void multifd_channel_connect(MultiFDSendParams *p, QIOChannel *ioc) { qio_channel_set_delay(ioc, false); @@ -960,7 +958,6 @@ static bool multifd_channel_connect(MultiFDSendParams *p, p->thread_created = true; qemu_thread_create(&p->thread, p->name, multifd_send_thread, p, QEMU_THREAD_JOINABLE); - return true; } /* @@ -992,7 +989,8 @@ static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque) return; } } else { - ret = multifd_channel_connect(p, ioc, &local_err); + multifd_channel_connect(p, ioc); + ret = true; } out: -- Gitee From 6afcce7802815d7a8e70fa93a7cb37a2d6dfb306 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 22 Feb 2024 17:53:00 +0800 Subject: [PATCH 067/134] migration/multifd: Cleanup outgoing_args in state destroy commit 72b90b96872acc5d00f9c16dfc196543349361da upstream. outgoing_args is a global cache of socket address to be reused in multifd. Freeing the cache in per-channel destructor is more or less a hack. 
Move it to multifd_send_cleanup_state() so it only get checked once. Use a small helper to do so because it's internal of socket.c. Intel-SIG: commit 72b90b96872a migration/multifd: Cleanup outgoing_args in state destroy Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240222095301.171137-5-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 1 + migration/socket.c | 12 ++++++++---- migration/socket.h | 2 ++ 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index a816232121..8caff4580f 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -690,6 +690,7 @@ static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) static void multifd_send_cleanup_state(void) { + socket_cleanup_outgoing_migration(); qemu_sem_destroy(&multifd_send_state->channels_created); qemu_sem_destroy(&multifd_send_state->channels_ready); g_free(multifd_send_state->params); diff --git a/migration/socket.c b/migration/socket.c index 98e3ea1514..3184c7c3c1 100644 --- a/migration/socket.c +++ b/migration/socket.c @@ -64,10 +64,6 @@ int socket_send_channel_destroy(QIOChannel *send) { /* Remove channel */ object_unref(OBJECT(send)); - if (outgoing_args.saddr) { - qapi_free_SocketAddress(outgoing_args.saddr); - outgoing_args.saddr = NULL; - } return 0; } @@ -137,6 +133,14 @@ void socket_start_outgoing_migration(MigrationState *s, NULL); } +void socket_cleanup_outgoing_migration(void) +{ + if (outgoing_args.saddr) { + qapi_free_SocketAddress(outgoing_args.saddr); + outgoing_args.saddr = NULL; + } +} + static void socket_accept_incoming_migration(QIONetListener *listener, QIOChannelSocket *cioc, gpointer opaque) diff --git a/migration/socket.h b/migration/socket.h index 5e4c33b8ea..5f52eddd4c 100644 --- a/migration/socket.h +++ b/migration/socket.h @@ -29,4 +29,6 @@ void socket_start_incoming_migration(SocketAddress *saddr, Error **errp); void socket_start_outgoing_migration(MigrationState *s, SocketAddress *saddr, Error **errp); +void socket_cleanup_outgoing_migration(void); + #endif -- Gitee From e6270e0bf45eeaf5c275301e707cfb5a033f44e0 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 22 Feb 2024 17:53:01 +0800 Subject: [PATCH 068/134] migration/multifd: Drop unnecessary helper to destroy IOC commit c9a7e83c9d64fd5ebc759186789e1b753c919d32 upstream. Both socket_send_channel_destroy() and multifd_send_channel_destroy() are unnecessary wrappers to destroy an IOC, as the only thing to do is to release the final IOC reference. We have plenty of code that destroys an IOC using direct unref() already; keep that style. 
Intel-SIG: commit c9a7e83c9d64 migration/multifd: Drop unnecessary helper to destroy IOC Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240222095301.171137-6-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 7 +------ migration/socket.c | 7 ------- migration/socket.h | 1 - 3 files changed, 1 insertion(+), 14 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 8caff4580f..5e754259b7 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -660,16 +660,11 @@ static void multifd_send_terminate_threads(void) } } -static int multifd_send_channel_destroy(QIOChannel *send) -{ - return socket_send_channel_destroy(send); -} - static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) { if (p->c) { migration_ioc_unregister_yank(p->c); - multifd_send_channel_destroy(p->c); + object_unref(OBJECT(p->c)); p->c = NULL; } qemu_sem_destroy(&p->sem); diff --git a/migration/socket.c b/migration/socket.c index 3184c7c3c1..9ab89b1e08 100644 --- a/migration/socket.c +++ b/migration/socket.c @@ -60,13 +60,6 @@ QIOChannel *socket_send_channel_create_sync(Error **errp) return QIO_CHANNEL(sioc); } -int socket_send_channel_destroy(QIOChannel *send) -{ - /* Remove channel */ - object_unref(OBJECT(send)); - return 0; -} - struct SocketConnectData { MigrationState *s; char *hostname; diff --git a/migration/socket.h b/migration/socket.h index 5f52eddd4c..46c233ecd2 100644 --- a/migration/socket.h +++ b/migration/socket.h @@ -23,7 +23,6 @@ void socket_send_channel_create(QIOTaskFunc f, void *data); QIOChannel *socket_send_channel_create_sync(Error **errp); -int socket_send_channel_destroy(QIOChannel *send); void socket_start_incoming_migration(SocketAddress *saddr, Error **errp); -- Gitee From aaa0e0517f5f3573c62ca6d05e77462fbdc9d419 Mon Sep 17 00:00:00 2001 From: Avihai Horon Date: Thu, 25 Jan 2024 18:25:12 +0200 Subject: [PATCH 069/134] migration: Fix logic of channels and transport compatibility check commit 3205bebd4fc6dd501fb8b10c93ddce9da18e09db upstream. The commit in the fixes line mistakenly modified the channels and transport compatibility check logic so it now checks multi-channel support only for socket transport type. Thus, running multifd migration using a transport other than socket that is incompatible with multi-channels (such as "exec") would lead to a segmentation fault instead of an error message. For example: (qemu) migrate_set_capability multifd on (qemu) migrate -d "exec:cat > /tmp/vm_state" Segmentation fault (core dumped) Fix it by checking multi-channel compatibility for all transport types. 
Intel-SIG: commit 3205bebd4fc6 migration: Fix logic of channels and transport compatibility check Cc: qemu-stable Fixes: d95533e1cdcc ("migration: modify migration_channels_and_uri_compatible() for new QAPI syntax") Signed-off-by: Avihai Horon Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20240125162528.7552-2-avihaih@nvidia.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/migration.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index 4aaf280315..081b656918 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -127,11 +127,17 @@ static bool migration_needs_multiple_sockets(void) return migrate_multifd() || migrate_postcopy_preempt(); } -static bool transport_supports_multi_channels(SocketAddress *saddr) +static bool transport_supports_multi_channels(MigrationAddress *addr) { - return saddr->type == SOCKET_ADDRESS_TYPE_INET || - saddr->type == SOCKET_ADDRESS_TYPE_UNIX || - saddr->type == SOCKET_ADDRESS_TYPE_VSOCK; + if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET) { + SocketAddress *saddr = &addr->u.socket; + + return saddr->type == SOCKET_ADDRESS_TYPE_INET || + saddr->type == SOCKET_ADDRESS_TYPE_UNIX || + saddr->type == SOCKET_ADDRESS_TYPE_VSOCK; + } + + return false; } static bool @@ -139,8 +145,7 @@ migration_channels_and_transport_compatible(MigrationAddress *addr, Error **errp) { if (migration_needs_multiple_sockets() && - (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET) && - !transport_supports_multi_channels(&addr->u.socket)) { + !transport_supports_multi_channels(addr)) { error_setg(errp, "Migration requires multi-channel URIs (e.g. tcp)"); return false; } -- Gitee From 40b7eed0213c0ce2453b86878f9239b1e07b7a24 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 29 Feb 2024 12:29:55 -0300 Subject: [PATCH 070/134] migration/multifd: Cleanup multifd_recv_sync_main commit 4aac6b1e9bd48677c4f24518fe86ffd34c677d5a upstream. Some minor cleanups and documentation for multifd_recv_sync_main. Use thread_count as done in other parts of the code. Remove p->id from the multifd_recv_state sync, since that is global and not tied to a channel. Add documentation for the sync steps. Intel-SIG: commit 4aac6b1e9bd4 migration/multifd: Cleanup multifd_recv_sync_main Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240229153017.2221-2-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 17 +++++++++++++---- migration/trace-events | 2 +- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 5e754259b7..433d2c3ac3 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -1186,18 +1186,27 @@ void multifd_recv_cleanup(void) void multifd_recv_sync_main(void) { + int thread_count = migrate_multifd_channels(); int i; if (!migrate_multifd()) { return; } - for (i = 0; i < migrate_multifd_channels(); i++) { - MultiFDRecvParams *p = &multifd_recv_state->params[i]; - trace_multifd_recv_sync_main_wait(p->id); + /* + * Initiate the synchronization by waiting for all channels. + * For socket-based migration this means each channel has received + * the SYNC packet on the stream. + */ + for (i = 0; i < thread_count; i++) { + trace_multifd_recv_sync_main_wait(i); qemu_sem_wait(&multifd_recv_state->sem_sync); } - for (i = 0; i < migrate_multifd_channels(); i++) { + + /* + * Sync done. Release the channels for the next iteration. 
+ */ + for (i = 0; i < thread_count; i++) { MultiFDRecvParams *p = &multifd_recv_state->params[i]; WITH_QEMU_LOCK_GUARD(&p->mutex) { diff --git a/migration/trace-events b/migration/trace-events index 298ad2b0dd..bf1a069632 100644 --- a/migration/trace-events +++ b/migration/trace-events @@ -132,7 +132,7 @@ multifd_recv(uint8_t id, uint64_t packet_num, uint32_t used, uint32_t flags, uin multifd_recv_new_channel(uint8_t id) "channel %u" multifd_recv_sync_main(long packet_num) "packet num %ld" multifd_recv_sync_main_signal(uint8_t id) "channel %u" -multifd_recv_sync_main_wait(uint8_t id) "channel %u" +multifd_recv_sync_main_wait(uint8_t id) "iter %u" multifd_recv_terminate_threads(bool error) "error %d" multifd_recv_thread_end(uint8_t id, uint64_t packets, uint64_t pages) "channel %u packets %" PRIu64 " pages %" PRIu64 multifd_recv_thread_start(uint8_t id) "%u" -- Gitee From 026990ec2cd64c67a963652827b6756163a9e33c Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 29 Feb 2024 12:29:56 -0300 Subject: [PATCH 071/134] io: add and implement QIO_CHANNEL_FEATURE_SEEKABLE for channel file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 401e311ff72e0a62c834bfe466de68a82cfd90cb upstream. Add a generic QIOChannel feature SEEKABLE which would be used by the qemu_file* apis. For the time being this will be only implemented for file channels. Intel-SIG: commit 401e311ff72e io: add and implement QIO_CHANNEL_FEATURE_SEEKABLE for channel file Signed-off-by: Nikolay Borisov Reviewed-by: "Daniel P. Berrangé" Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240229153017.2221-3-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- include/io/channel.h | 1 + io/channel-file.c | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/include/io/channel.h b/include/io/channel.h index 5f9dbaab65..fcb19fd672 100644 --- a/include/io/channel.h +++ b/include/io/channel.h @@ -44,6 +44,7 @@ enum QIOChannelFeature { QIO_CHANNEL_FEATURE_LISTEN, QIO_CHANNEL_FEATURE_WRITE_ZERO_COPY, QIO_CHANNEL_FEATURE_READ_MSG_PEEK, + QIO_CHANNEL_FEATURE_SEEKABLE, }; diff --git a/io/channel-file.c b/io/channel-file.c index 4a12c61886..f91bf6db1c 100644 --- a/io/channel-file.c +++ b/io/channel-file.c @@ -36,6 +36,10 @@ qio_channel_file_new_fd(int fd) ioc->fd = fd; + if (lseek(fd, 0, SEEK_CUR) != (off_t)-1) { + qio_channel_set_feature(QIO_CHANNEL(ioc), QIO_CHANNEL_FEATURE_SEEKABLE); + } + trace_qio_channel_file_new_fd(ioc, fd); return ioc; @@ -60,6 +64,10 @@ qio_channel_file_new_path(const char *path, return NULL; } + if (lseek(ioc->fd, 0, SEEK_CUR) != (off_t)-1) { + qio_channel_set_feature(QIO_CHANNEL(ioc), QIO_CHANNEL_FEATURE_SEEKABLE); + } + trace_qio_channel_file_new_path(ioc, path, flags, mode, ioc->fd); return ioc; -- Gitee From 11694aca2b4b7fe65152f29f52e6319351abf944 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 29 Feb 2024 12:29:57 -0300 Subject: [PATCH 072/134] io: Add generic pwritev/preadv interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit f1cfe39418a837e8dd52a7e75331f1606949deff upstream. Introduce basic pwritev/preadv support in the generic channel layer. Specific implementation will follow for the file channel as this is required in order to support migration streams with fixed location of each ram page. Intel-SIG: commit f1cfe39418a8 io: Add generic pwritev/preadv interface Signed-off-by: Nikolay Borisov Reviewed-by: "Daniel P. 
Berrangé" Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240229153017.2221-4-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- include/io/channel.h | 82 ++++++++++++++++++++++++++++++++++++++++++++ io/channel.c | 58 +++++++++++++++++++++++++++++++ 2 files changed, 140 insertions(+) diff --git a/include/io/channel.h b/include/io/channel.h index fcb19fd672..7986c49c71 100644 --- a/include/io/channel.h +++ b/include/io/channel.h @@ -131,6 +131,16 @@ struct QIOChannelClass { Error **errp); /* Optional callbacks */ + ssize_t (*io_pwritev)(QIOChannel *ioc, + const struct iovec *iov, + size_t niov, + off_t offset, + Error **errp); + ssize_t (*io_preadv)(QIOChannel *ioc, + const struct iovec *iov, + size_t niov, + off_t offset, + Error **errp); int (*io_shutdown)(QIOChannel *ioc, QIOChannelShutdown how, Error **errp); @@ -529,6 +539,78 @@ void qio_channel_set_follow_coroutine_ctx(QIOChannel *ioc, bool enabled); int qio_channel_close(QIOChannel *ioc, Error **errp); +/** + * qio_channel_pwritev + * @ioc: the channel object + * @iov: the array of memory regions to write data from + * @niov: the length of the @iov array + * @offset: offset in the channel where writes should begin + * @errp: pointer to a NULL-initialized error object + * + * Not all implementations will support this facility, so may report + * an error. To avoid errors, the caller may check for the feature + * flag QIO_CHANNEL_FEATURE_SEEKABLE prior to calling this method. + * + * Behaves as qio_channel_writev_full, apart from not supporting + * sending of file handles as well as beginning the write at the + * passed @offset + * + */ +ssize_t qio_channel_pwritev(QIOChannel *ioc, const struct iovec *iov, + size_t niov, off_t offset, Error **errp); + +/** + * qio_channel_pwrite + * @ioc: the channel object + * @buf: the memory region to write data into + * @buflen: the number of bytes to @buf + * @offset: offset in the channel where writes should begin + * @errp: pointer to a NULL-initialized error object + * + * Not all implementations will support this facility, so may report + * an error. To avoid errors, the caller may check for the feature + * flag QIO_CHANNEL_FEATURE_SEEKABLE prior to calling this method. + * + */ +ssize_t qio_channel_pwrite(QIOChannel *ioc, char *buf, size_t buflen, + off_t offset, Error **errp); + +/** + * qio_channel_preadv + * @ioc: the channel object + * @iov: the array of memory regions to read data into + * @niov: the length of the @iov array + * @offset: offset in the channel where writes should begin + * @errp: pointer to a NULL-initialized error object + * + * Not all implementations will support this facility, so may report + * an error. To avoid errors, the caller may check for the feature + * flag QIO_CHANNEL_FEATURE_SEEKABLE prior to calling this method. + * + * Behaves as qio_channel_readv_full, apart from not supporting + * receiving of file handles as well as beginning the read at the + * passed @offset + * + */ +ssize_t qio_channel_preadv(QIOChannel *ioc, const struct iovec *iov, + size_t niov, off_t offset, Error **errp); + +/** + * qio_channel_pread + * @ioc: the channel object + * @buf: the memory region to write data into + * @buflen: the number of bytes to @buf + * @offset: offset in the channel where writes should begin + * @errp: pointer to a NULL-initialized error object + * + * Not all implementations will support this facility, so may report + * an error. 
To avoid errors, the caller may check for the feature + * flag QIO_CHANNEL_FEATURE_SEEKABLE prior to calling this method. + * + */ +ssize_t qio_channel_pread(QIOChannel *ioc, char *buf, size_t buflen, + off_t offset, Error **errp); + /** * qio_channel_shutdown: * @ioc: the channel object diff --git a/io/channel.c b/io/channel.c index 86c5834510..a1f12f8e90 100644 --- a/io/channel.c +++ b/io/channel.c @@ -454,6 +454,64 @@ GSource *qio_channel_add_watch_source(QIOChannel *ioc, } +ssize_t qio_channel_pwritev(QIOChannel *ioc, const struct iovec *iov, + size_t niov, off_t offset, Error **errp) +{ + QIOChannelClass *klass = QIO_CHANNEL_GET_CLASS(ioc); + + if (!klass->io_pwritev) { + error_setg(errp, "Channel does not support pwritev"); + return -1; + } + + if (!qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_SEEKABLE)) { + error_setg_errno(errp, EINVAL, "Requested channel is not seekable"); + return -1; + } + + return klass->io_pwritev(ioc, iov, niov, offset, errp); +} + +ssize_t qio_channel_pwrite(QIOChannel *ioc, char *buf, size_t buflen, + off_t offset, Error **errp) +{ + struct iovec iov = { + .iov_base = buf, + .iov_len = buflen + }; + + return qio_channel_pwritev(ioc, &iov, 1, offset, errp); +} + +ssize_t qio_channel_preadv(QIOChannel *ioc, const struct iovec *iov, + size_t niov, off_t offset, Error **errp) +{ + QIOChannelClass *klass = QIO_CHANNEL_GET_CLASS(ioc); + + if (!klass->io_preadv) { + error_setg(errp, "Channel does not support preadv"); + return -1; + } + + if (!qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_SEEKABLE)) { + error_setg_errno(errp, EINVAL, "Requested channel is not seekable"); + return -1; + } + + return klass->io_preadv(ioc, iov, niov, offset, errp); +} + +ssize_t qio_channel_pread(QIOChannel *ioc, char *buf, size_t buflen, + off_t offset, Error **errp) +{ + struct iovec iov = { + .iov_base = buf, + .iov_len = buflen + }; + + return qio_channel_preadv(ioc, &iov, 1, offset, errp); +} + int qio_channel_shutdown(QIOChannel *ioc, QIOChannelShutdown how, Error **errp) -- Gitee From 54fe25a253385c85be238135dac532cdba188e42 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 29 Feb 2024 12:29:58 -0300 Subject: [PATCH 073/134] io: implement io_pwritev/preadv for QIOChannelFile MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 0478b030fa2530cbbfc4d6432e8e39a16d06865b upstream. The upcoming 'mapped-ram' feature will require qemu to write data to (and restore from) specific offsets of the migration file. Add a minimal implementation of pwritev/preadv and expose them via the io_pwritev and io_preadv interfaces. Intel-SIG: commit 0478b030fa25 io: implement io_pwritev/preadv for QIOChannelFile Signed-off-by: Nikolay Borisov Reviewed-by: "Daniel P. 
Berrangé" Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240229153017.2221-5-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- io/channel-file.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/io/channel-file.c b/io/channel-file.c index f91bf6db1c..a6ad7770c6 100644 --- a/io/channel-file.c +++ b/io/channel-file.c @@ -146,6 +146,58 @@ static ssize_t qio_channel_file_writev(QIOChannel *ioc, return ret; } +#ifdef CONFIG_PREADV +static ssize_t qio_channel_file_preadv(QIOChannel *ioc, + const struct iovec *iov, + size_t niov, + off_t offset, + Error **errp) +{ + QIOChannelFile *fioc = QIO_CHANNEL_FILE(ioc); + ssize_t ret; + + retry: + ret = preadv(fioc->fd, iov, niov, offset); + if (ret < 0) { + if (errno == EAGAIN) { + return QIO_CHANNEL_ERR_BLOCK; + } + if (errno == EINTR) { + goto retry; + } + + error_setg_errno(errp, errno, "Unable to read from file"); + return -1; + } + + return ret; +} + +static ssize_t qio_channel_file_pwritev(QIOChannel *ioc, + const struct iovec *iov, + size_t niov, + off_t offset, + Error **errp) +{ + QIOChannelFile *fioc = QIO_CHANNEL_FILE(ioc); + ssize_t ret; + + retry: + ret = pwritev(fioc->fd, iov, niov, offset); + if (ret <= 0) { + if (errno == EAGAIN) { + return QIO_CHANNEL_ERR_BLOCK; + } + if (errno == EINTR) { + goto retry; + } + error_setg_errno(errp, errno, "Unable to write to file"); + return -1; + } + return ret; +} +#endif /* CONFIG_PREADV */ + static int qio_channel_file_set_blocking(QIOChannel *ioc, bool enabled, Error **errp) @@ -231,6 +283,10 @@ static void qio_channel_file_class_init(ObjectClass *klass, ioc_klass->io_writev = qio_channel_file_writev; ioc_klass->io_readv = qio_channel_file_readv; ioc_klass->io_set_blocking = qio_channel_file_set_blocking; +#ifdef CONFIG_PREADV + ioc_klass->io_pwritev = qio_channel_file_pwritev; + ioc_klass->io_preadv = qio_channel_file_preadv; +#endif ioc_klass->io_seek = qio_channel_file_seek; ioc_klass->io_close = qio_channel_file_close; ioc_klass->io_create_watch = qio_channel_file_create_watch; -- Gitee From 4999a3977a7cdf6f518be45ed35c6347b7497669 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 29 Feb 2024 12:29:59 -0300 Subject: [PATCH 074/134] io: fsync before closing a file channel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit c05dfcb7f2c5e39fc47c347de305df4e6afb4fa9 upstream. Make sure the data is flushed to disk before closing file channels. This is to ensure data is on disk and not lost in the event of a host crash. This is currently being implemented to affect the migration code when migrating to a file, but all QIOChannelFile users should benefit from the change. Intel-SIG: commit c05dfcb7f2c5 io: fsync before closing a file channel Reviewed-by: "Daniel P. Berrangé" Acked-by: "Daniel P. 
Berrangé" Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240229153017.2221-6-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- io/channel-file.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/io/channel-file.c b/io/channel-file.c index a6ad7770c6..d4706fa592 100644 --- a/io/channel-file.c +++ b/io/channel-file.c @@ -242,6 +242,11 @@ static int qio_channel_file_close(QIOChannel *ioc, { QIOChannelFile *fioc = QIO_CHANNEL_FILE(ioc); + if (qemu_fdatasync(fioc->fd) < 0) { + error_setg_errno(errp, errno, + "Unable to synchronize file data with storage device"); + return -1; + } if (qemu_close(fioc->fd) < 0) { error_setg_errno(errp, errno, "Unable to close file"); -- Gitee From e9c0b94df6734dacab909a84884684b05aaae6a6 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 29 Feb 2024 12:30:00 -0300 Subject: [PATCH 075/134] migration/qemu-file: add utility methods for working with seekable channels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 7f5b50a40181bd75c1f74aeaa7fe94fe10680720 upstream. Add utility methods that will be needed when implementing 'mapped-ram' migration capability. Intel-SIG: commit 7f5b50a40181 migration/qemu-file: add utility methods for working with seekable channels Signed-off-by: Fabiano Rosas Reviewed-by: "Daniel P. Berrangé" Link: https://lore.kernel.org/r/20240229153017.2221-7-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- include/migration/qemu-file-types.h | 2 + migration/qemu-file.c | 106 ++++++++++++++++++++++++++++ migration/qemu-file.h | 6 ++ 3 files changed, 114 insertions(+) diff --git a/include/migration/qemu-file-types.h b/include/migration/qemu-file-types.h index 9ba163f333..adec5abc07 100644 --- a/include/migration/qemu-file-types.h +++ b/include/migration/qemu-file-types.h @@ -50,6 +50,8 @@ unsigned int qemu_get_be16(QEMUFile *f); unsigned int qemu_get_be32(QEMUFile *f); uint64_t qemu_get_be64(QEMUFile *f); +bool qemu_file_is_seekable(QEMUFile *f); + static inline void qemu_put_be64s(QEMUFile *f, const uint64_t *pv) { qemu_put_be64(f, *pv); diff --git a/migration/qemu-file.c b/migration/qemu-file.c index bd1dbc3db1..60304f93fd 100644 --- a/migration/qemu-file.c +++ b/migration/qemu-file.c @@ -33,6 +33,7 @@ #include "options.h" #include "qapi/error.h" #include "rdma.h" +#include "io/channel-file.h" #define IO_BUF_SIZE 32768 #define MAX_IOV_SIZE MIN_CONST(IOV_MAX, 64) @@ -255,6 +256,10 @@ static void qemu_iovec_release_ram(QEMUFile *f) memset(f->may_free, 0, sizeof(f->may_free)); } +bool qemu_file_is_seekable(QEMUFile *f) +{ + return qio_channel_has_feature(f->ioc, QIO_CHANNEL_FEATURE_SEEKABLE); +} /** * Flushes QEMUFile buffer @@ -447,6 +452,107 @@ void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, size_t size) } } +void qemu_put_buffer_at(QEMUFile *f, const uint8_t *buf, size_t buflen, + off_t pos) +{ + Error *err = NULL; + size_t ret; + + if (f->last_error) { + return; + } + + qemu_fflush(f); + ret = qio_channel_pwrite(f->ioc, (char *)buf, buflen, pos, &err); + + if (err) { + qemu_file_set_error_obj(f, -EIO, err); + return; + } + + if ((ssize_t)ret == QIO_CHANNEL_ERR_BLOCK) { + qemu_file_set_error_obj(f, -EAGAIN, NULL); + return; + } + + if (ret != buflen) { + error_setg(&err, "Partial write of size %zu, expected %zu", ret, + buflen); + qemu_file_set_error_obj(f, -EIO, err); + return; + } + + stat64_add(&mig_stats.qemu_file_transferred, buflen); + + return; +} + + +size_t qemu_get_buffer_at(QEMUFile *f, const uint8_t *buf, size_t 
buflen, + off_t pos) +{ + Error *err = NULL; + size_t ret; + + if (f->last_error) { + return 0; + } + + ret = qio_channel_pread(f->ioc, (char *)buf, buflen, pos, &err); + + if ((ssize_t)ret == -1 || err) { + qemu_file_set_error_obj(f, -EIO, err); + return 0; + } + + if ((ssize_t)ret == QIO_CHANNEL_ERR_BLOCK) { + qemu_file_set_error_obj(f, -EAGAIN, NULL); + return 0; + } + + if (ret != buflen) { + error_setg(&err, "Partial read of size %zu, expected %zu", ret, buflen); + qemu_file_set_error_obj(f, -EIO, err); + return 0; + } + + return ret; +} + +void qemu_set_offset(QEMUFile *f, off_t off, int whence) +{ + Error *err = NULL; + off_t ret; + + if (qemu_file_is_writable(f)) { + qemu_fflush(f); + } else { + /* Drop all cached buffers if existed; will trigger a re-fill later */ + f->buf_index = 0; + f->buf_size = 0; + } + + ret = qio_channel_io_seek(f->ioc, off, whence, &err); + if (ret == (off_t)-1) { + qemu_file_set_error_obj(f, -EIO, err); + } +} + +off_t qemu_get_offset(QEMUFile *f) +{ + Error *err = NULL; + off_t ret; + + qemu_fflush(f); + + ret = qio_channel_io_seek(f->ioc, 0, SEEK_CUR, &err); + if (ret == (off_t)-1) { + qemu_file_set_error_obj(f, -EIO, err); + } + return ret; +} + + void qemu_put_byte(QEMUFile *f, int v) { if (f->last_error) { diff --git a/migration/qemu-file.h b/migration/qemu-file.h index 8afa95732b..5d8e80f996 100644 --- a/migration/qemu-file.h +++ b/migration/qemu-file.h @@ -75,6 +75,12 @@ QEMUFile *qemu_file_get_return_path(QEMUFile *f); int qemu_fflush(QEMUFile *f); void qemu_file_set_blocking(QEMUFile *f, bool block); int qemu_file_get_to_fd(QEMUFile *f, int fd, size_t size); +void qemu_set_offset(QEMUFile *f, off_t off, int whence); +off_t qemu_get_offset(QEMUFile *f); +void qemu_put_buffer_at(QEMUFile *f, const uint8_t *buf, size_t buflen, + off_t pos); +size_t qemu_get_buffer_at(QEMUFile *f, const uint8_t *buf, size_t buflen, + off_t pos); QIOChannel *qemu_file_get_ioc(QEMUFile *file); -- Gitee From 787a7b4f9d6e4b4756b361f3e1354b73fa6b9668 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 29 Feb 2024 12:30:01 -0300 Subject: [PATCH 076/134] migration/ram: Introduce 'mapped-ram' migration capability commit 4ed49feb4449f6959256c083afcb94116561b507 upstream. Add a new migration capability 'mapped-ram'. The core of the feature is to ensure that RAM pages are mapped directly to offsets in the resulting migration file instead of being streamed at arbitrary points. The reasons why we'd want such behavior are: - The resulting file will have a bounded size, since pages which are dirtied multiple times will always go to a fixed location in the file, rather than constantly being added to a sequential stream. This eliminates cases where a VM with, say, 1G of RAM can result in a migration file that's 10s of GBs, provided that the workload constantly redirties memory. - It paves the way to implement O_DIRECT-enabled save/restore of the migration stream as the pages are ensured to be written at aligned offsets. - It allows the usage of multifd so we can write RAM pages to the migration file in parallel. For now, enabling the capability has no effect. The next couple of patches implement the core functionality. 
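As an illustration of the fixed-offset idea above (this sketch is not part of the patch; it uses assumed values: a 1 MiB pages-region alignment matching the MAPPED_RAM_FILE_OFFSET_ALIGNMENT introduced later in the series, a 4 KiB target page, and hypothetical helper names), the location of a page in the file depends only on where the ramblock's pages region starts and on the page's index, so rewriting a redirtied page never grows the file:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    #define FILE_OFFSET_ALIGNMENT 0x100000ULL  /* assumed 1 MiB alignment */
    #define PAGE_SIZE             0x1000ULL    /* assumed 4 KiB target page */

    /* Align the start of the pages region and add the page's own offset. */
    static uint64_t page_file_offset(uint64_t header_end, uint64_t page_index)
    {
        uint64_t pages_offset = (header_end + FILE_OFFSET_ALIGNMENT - 1) &
                                ~(FILE_OFFSET_ALIGNMENT - 1);
        return pages_offset + page_index * PAGE_SIZE;
    }

    int main(void)
    {
        /* The same page always maps to the same offset, however often it
         * is dirtied, so the file size stays bounded. */
        printf("page 42 -> 0x%" PRIx64 "\n", page_file_offset(0x2345, 42));
        printf("page 42 -> 0x%" PRIx64 "\n", page_file_offset(0x2345, 42));
        return 0;
    }
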
Intel-SIG: commit 4ed49feb4449 migration/ram: Introduce 'mapped-ram' migration capability Acked-by: Markus Armbruster Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240229153017.2221-8-farosas@suse.de Signed-off-by: Peter Xu Conflicts: migration/migration.c [jz: resolve context conflict due to CPR migration mode] Signed-off-by: Jason Zeng --- docs/devel/migration/features.rst | 1 + docs/devel/migration/mapped-ram.rst | 138 ++++++++++++++++++++++++++++ migration/migration.c | 7 ++ migration/options.c | 34 +++++++ migration/options.h | 1 + migration/savevm.c | 1 + qapi/migration.json | 6 +- 7 files changed, 187 insertions(+), 1 deletion(-) create mode 100644 docs/devel/migration/mapped-ram.rst diff --git a/docs/devel/migration/features.rst b/docs/devel/migration/features.rst index a9acaf618e..9d1abd2587 100644 --- a/docs/devel/migration/features.rst +++ b/docs/devel/migration/features.rst @@ -10,3 +10,4 @@ Migration has plenty of features to support different use cases. dirty-limit vfio virtio + mapped-ram diff --git a/docs/devel/migration/mapped-ram.rst b/docs/devel/migration/mapped-ram.rst new file mode 100644 index 0000000000..fa4cefd9fc --- /dev/null +++ b/docs/devel/migration/mapped-ram.rst @@ -0,0 +1,138 @@ +Mapped-ram +========== + +Mapped-ram is a new stream format for the RAM section designed to +supplement the existing ``file:`` migration and make it compatible +with ``multifd``. This enables parallel migration of a guest's RAM to +a file. + +The core of the feature is to ensure that RAM pages are mapped +directly to offsets in the resulting migration file. This enables the +``multifd`` threads to write exclusively to those offsets even if the +guest is constantly dirtying pages (i.e. live migration). Another +benefit is that the resulting file will have a bounded size, since +pages which are dirtied multiple times will always go to a fixed +location in the file, rather than constantly being added to a +sequential stream. Having the pages at fixed offsets also allows the +usage of O_DIRECT for save/restore of the migration stream as the +pages are ensured to be written respecting O_DIRECT alignment +restrictions (direct-io support not yet implemented). + +Usage +----- + +On both source and destination, enable the ``multifd`` and +``mapped-ram`` capabilities: + + ``migrate_set_capability multifd on`` + + ``migrate_set_capability mapped-ram on`` + +Use a ``file:`` URL for migration: + + ``migrate file:/path/to/migration/file`` + +Mapped-ram migration is best done non-live, i.e. by stopping the VM on +the source side before migrating. + +Use-cases +--------- + +The mapped-ram feature was designed for use cases where the migration +stream will be directed to a file in the filesystem and not +immediately restored on the destination VM [#]_. These could be +thought of as snapshots. We can further categorize them into live and +non-live. + +- Non-live snapshot + +If the use case requires a VM to be stopped before taking a snapshot, +that's the ideal scenario for mapped-ram migration. Not having to +track dirty pages, the migration will write the RAM pages to the disk +as fast as it can. + +Note: if a snapshot is taken of a running VM, but the VM will be +stopped after the snapshot by the admin, then consider stopping it +right before the snapshot to take benefit of the performance gains +mentioned above. 
+ +- Live snapshot + +If the use case requires that the VM keeps running during and after +the snapshot operation, then mapped-ram migration can still be used, +but will be less performant. Other strategies such as +background-snapshot should be evaluated as well. One benefit of +mapped-ram in this scenario is portability since background-snapshot +depends on async dirty tracking (KVM_GET_DIRTY_LOG) which is not +supported outside of Linux. + +.. [#] While this same effect could be obtained with the usage of + snapshots or the ``file:`` migration alone, mapped-ram provides + a performance increase for VMs with larger RAM sizes (10s to + 100s of GiBs), specially if the VM has been stopped beforehand. + +RAM section format +------------------ + +Instead of having a sequential stream of pages that follow the +RAMBlock headers, the dirty pages for a RAMBlock follow its header +instead. This ensures that each RAM page has a fixed offset in the +resulting migration file. + +A bitmap is introduced to track which pages have been written in the +migration file. Pages are written at a fixed location for every +ramblock. Zero pages are ignored as they'd be zero in the destination +migration as well. + +:: + + Without mapped-ram: With mapped-ram: + + --------------------- -------------------------------- + | ramblock 1 header | | ramblock 1 header | + --------------------- -------------------------------- + | ramblock 2 header | | ramblock 1 mapped-ram header | + --------------------- -------------------------------- + | ... | | padding to next 1MB boundary | + --------------------- | ... | + | ramblock n header | -------------------------------- + --------------------- | ramblock 1 pages | + | RAM_SAVE_FLAG_EOS | | ... | + --------------------- -------------------------------- + | stream of pages | | ramblock 2 header | + | (iter 1) | -------------------------------- + | ... | | ramblock 2 mapped-ram header | + --------------------- -------------------------------- + | RAM_SAVE_FLAG_EOS | | padding to next 1MB boundary | + --------------------- | ... | + | stream of pages | -------------------------------- + | (iter 2) | | ramblock 2 pages | + | ... | | ... | + --------------------- -------------------------------- + | ... | | ... | + --------------------- -------------------------------- + | RAM_SAVE_FLAG_EOS | + -------------------------------- + | ... | + -------------------------------- + +where: + - ramblock header: the generic information for a ramblock, such as + idstr, used_len, etc. + + - ramblock mapped-ram header: the information added by this feature: + bitmap of pages written, bitmap size and offset of pages in the + migration file. + +Restrictions +------------ + +Since pages are written to their relative offsets and out of order +(due to the memory dirtying patterns), streaming channels such as +sockets are not supported. A seekable channel such as a file is +required. This can be verified in the QIOChannel by the presence of +the QIO_CHANNEL_FEATURE_SEEKABLE. + +The improvements brought by this feature apply only to guest physical +RAM. Other types of memory such as VRAM are migrated as part of device +states. 
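To make the header/bitmap/pages layout described above more concrete, here is a standalone sketch (not part of the patch) of how a loader can consume it: walk runs of set bits in the written-pages bitmap and read each contiguous run from its fixed offset. The incoming side added later in this series does essentially this with find_first_bit()/find_next_zero_bit() and qemu_get_buffer_at(); the helper names, the plain pread() on a file descriptor, and the 4 KiB page size here are assumptions of the sketch:

    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    #define PAGE_SIZE 0x1000UL  /* assumed 4 KiB target page */

    static int bit_set(const unsigned long *bm, long i)
    {
        return (bm[i / (8 * sizeof(unsigned long))] >>
                (i % (8 * sizeof(unsigned long)))) & 1;
    }

    /* Restore every page marked in the bitmap from its fixed file offset. */
    static int load_mapped_pages(int fd, uint64_t pages_offset,
                                 const unsigned long *bitmap, long num_pages,
                                 uint8_t *ram_base)
    {
        long i = 0;

        while (i < num_pages) {
            long start, run;

            while (i < num_pages && !bit_set(bitmap, i)) {
                i++;                    /* zero/unwritten pages are skipped */
            }
            if (i == num_pages) {
                break;
            }
            start = i;
            while (i < num_pages && bit_set(bitmap, i)) {
                i++;                    /* extend the contiguous run */
            }
            run = i - start;

            if (pread(fd, ram_base + start * PAGE_SIZE, run * PAGE_SIZE,
                      pages_offset + start * PAGE_SIZE) !=
                (ssize_t)(run * PAGE_SIZE)) {
                perror("pread");
                return -1;
            }
        }
        return 0;
    }

Reading in contiguous runs keeps the number of I/O operations low even when the set of written pages is sparse in the bitmap.
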
diff --git a/migration/migration.c b/migration/migration.c index 081b656918..063a92a15e 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -1894,6 +1894,13 @@ static bool migrate_prepare(MigrationState *s, bool blk, bool blk_inc, return false; } + if (migrate_mapped_ram()) { + if (migrate_tls()) { + error_setg(errp, "Cannot use TLS with mapped-ram"); + return false; + } + } + if (blk || blk_inc) { if (migrate_colo()) { error_setg(errp, "No disk migration is required in COLO mode"); diff --git a/migration/options.c b/migration/options.c index 71e71ea801..4f8ac90367 100644 --- a/migration/options.c +++ b/migration/options.c @@ -211,6 +211,7 @@ Property migration_properties[] = { DEFINE_PROP_MIG_CAP("x-switchover-ack", MIGRATION_CAPABILITY_SWITCHOVER_ACK), DEFINE_PROP_MIG_CAP("x-dirty-limit", MIGRATION_CAPABILITY_DIRTY_LIMIT), + DEFINE_PROP_MIG_CAP("mapped-ram", MIGRATION_CAPABILITY_MAPPED_RAM), DEFINE_PROP_END_OF_LIST(), }; @@ -270,6 +271,13 @@ bool migrate_events(void) return s->capabilities[MIGRATION_CAPABILITY_EVENTS]; } +bool migrate_mapped_ram(void) +{ + MigrationState *s = migrate_get_current(); + + return s->capabilities[MIGRATION_CAPABILITY_MAPPED_RAM]; +} + bool migrate_ignore_shared(void) { MigrationState *s = migrate_get_current(); @@ -652,6 +660,32 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp) } } + if (new_caps[MIGRATION_CAPABILITY_MAPPED_RAM]) { + if (new_caps[MIGRATION_CAPABILITY_MULTIFD]) { + error_setg(errp, + "Mapped-ram migration is incompatible with multifd"); + return false; + } + + if (new_caps[MIGRATION_CAPABILITY_XBZRLE]) { + error_setg(errp, + "Mapped-ram migration is incompatible with xbzrle"); + return false; + } + + if (new_caps[MIGRATION_CAPABILITY_COMPRESS]) { + error_setg(errp, + "Mapped-ram migration is incompatible with compression"); + return false; + } + + if (new_caps[MIGRATION_CAPABILITY_POSTCOPY_RAM]) { + error_setg(errp, + "Mapped-ram migration is incompatible with postcopy"); + return false; + } + } + return true; } diff --git a/migration/options.h b/migration/options.h index 9aca5e41ad..fabfadc3b0 100644 --- a/migration/options.h +++ b/migration/options.h @@ -31,6 +31,7 @@ bool migrate_compress(void); bool migrate_dirty_bitmaps(void); bool migrate_dirty_limit(void); bool migrate_events(void); +bool migrate_mapped_ram(void); bool migrate_ignore_shared(void); bool migrate_late_block_activate(void); bool migrate_multifd(void); diff --git a/migration/savevm.c b/migration/savevm.c index cc65da605e..0dc1327d8b 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -246,6 +246,7 @@ static bool should_validate_capability(int capability) /* Validate only new capabilities to keep compatibility. */ switch (capability) { case MIGRATION_CAPABILITY_X_IGNORE_SHARED: + case MIGRATION_CAPABILITY_MAPPED_RAM: return true; default: return false; diff --git a/qapi/migration.json b/qapi/migration.json index 3aed216c3b..9e812a70c6 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -531,6 +531,10 @@ # and can result in more stable read performance. Requires KVM # with accelerator property "dirty-ring-size" set. (Since 8.1) # +# @mapped-ram: Migrate using fixed offsets in the migration file for +# each RAM page. Requires a migration URI that supports seeking, +# such as a file. (since 9.0) +# # Features: # # @deprecated: Member @block is deprecated. 
Use blockdev-mirror with @@ -555,7 +559,7 @@ { 'name': 'x-ignore-shared', 'features': [ 'unstable' ] }, 'validate-uuid', 'background-snapshot', 'zero-copy-send', 'postcopy-preempt', 'switchover-ack', - 'dirty-limit'] } + 'dirty-limit', 'mapped-ram'] } ## # @MigrationCapabilityStatus: -- Gitee From c7080a4db8c5e1ef49fb5816e7c0de2bc628119d Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 29 Feb 2024 12:30:02 -0300 Subject: [PATCH 077/134] migration: Add mapped-ram URI compatibility check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 8d9e0d41006f094a8a377f0a12bef1e34b0a70b7 upstream. The mapped-ram migration format needs a channel that supports seeking to be able to write each page to an arbitrary offset in the migration stream. Intel-SIG: commit 8d9e0d41006f migration: Add mapped-ram URI compatibility check Reviewed-by: "Daniel P. Berrangé" Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240229153017.2221-9-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/migration.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/migration/migration.c b/migration/migration.c index 063a92a15e..b97a1d2993 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -140,10 +140,39 @@ static bool transport_supports_multi_channels(MigrationAddress *addr) return false; } +static bool migration_needs_seekable_channel(void) +{ + return migrate_mapped_ram(); +} + +static bool transport_supports_seeking(MigrationAddress *addr) +{ + if (addr->transport == MIGRATION_ADDRESS_TYPE_FILE) { + return true; + } + + /* + * At this point, the user might not yet have passed the file + * descriptor to QEMU, so we cannot know for sure whether it + * refers to a plain file or a socket. Let it through anyway. + */ + if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET) { + return addr->u.socket.type == SOCKET_ADDRESS_TYPE_FD; + } + + return false; +} + static bool migration_channels_and_transport_compatible(MigrationAddress *addr, Error **errp) { + if (migration_needs_seekable_channel() && + !transport_supports_seeking(addr)) { + error_setg(errp, "Migration requires seekable transport (e.g. file)"); + return false; + } + if (migration_needs_multiple_sockets() && !transport_supports_multi_channels(addr)) { error_setg(errp, "Migration requires multi-channel URIs (e.g. tcp)"); -- Gitee From f2623d03f7000a4db9fa93f0d71efb9c4be89973 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 29 Feb 2024 12:30:03 -0300 Subject: [PATCH 078/134] migration/ram: Add outgoing 'mapped-ram' migration commit c2d5c4a7cb7f59125c34e5b883d5d91336761661 upstream. Implement the outgoing migration side for the 'mapped-ram' capability. A bitmap is introduced to track which pages have been written in the migration file. Pages are written at a fixed location for every ramblock. Zero pages are ignored as they'd be zero in the destination migration as well. The migration stream is altered to put the dirty pages for a ramblock after its header instead of having a sequential stream of pages that follow the ramblock headers. Without mapped-ram (current): With mapped-ram (new): --------------------- -------------------------------- | ramblock 1 header | | ramblock 1 header | --------------------- -------------------------------- | ramblock 2 header | | ramblock 1 mapped-ram header | --------------------- -------------------------------- | ... 
| | padding to next 1MB boundary | --------------------- | ... | | ramblock n header | -------------------------------- --------------------- | ramblock 1 pages | | RAM_SAVE_FLAG_EOS | | ... | --------------------- -------------------------------- | stream of pages | | ramblock 2 header | | (iter 1) | -------------------------------- | ... | | ramblock 2 mapped-ram header | --------------------- -------------------------------- | RAM_SAVE_FLAG_EOS | | padding to next 1MB boundary | --------------------- | ... | | stream of pages | -------------------------------- | (iter 2) | | ramblock 2 pages | | ... | | ... | --------------------- -------------------------------- | ... | | ... | --------------------- -------------------------------- | RAM_SAVE_FLAG_EOS | -------------------------------- | ... | -------------------------------- where: - ramblock header: the generic information for a ramblock, such as idstr, used_len, etc. - ramblock mapped-ram header: the new information added by this feature: bitmap of pages written, bitmap size and offset of pages in the migration file. Intel-SIG: commit c2d5c4a7cb7f migration/ram: Add outgoing 'mapped-ram' migration Signed-off-by: Nikolay Borisov Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240229153017.2221-10-farosas@suse.de Signed-off-by: Peter Xu Conflicts: migration/ram.c [jz: resolve context conflict due to openeuler customization] Signed-off-by: Jason Zeng --- include/exec/ramblock.h | 13 ++++ migration/ram.c | 131 +++++++++++++++++++++++++++++++++++++--- 2 files changed, 135 insertions(+), 9 deletions(-) diff --git a/include/exec/ramblock.h b/include/exec/ramblock.h index 69c6a53902..e8d82b4228 100644 --- a/include/exec/ramblock.h +++ b/include/exec/ramblock.h @@ -44,6 +44,19 @@ struct RAMBlock { size_t page_size; /* dirty bitmap used during migration */ unsigned long *bmap; + + /* + * Below fields are only used by mapped-ram migration + */ + /* bitmap of pages present in the migration file */ + unsigned long *file_bmap; + /* + * offset in the file pages belonging to this ramblock are saved, + * used only during migration to a file. + */ + off_t bitmap_offset; + uint64_t pages_offset; + /* bitmap of already received pages in postcopy */ unsigned long *receivedmap; diff --git a/migration/ram.c b/migration/ram.c index f642cd68c6..e3664a4eef 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -109,6 +109,18 @@ bool memcrypt_enabled(void) return false; } +/* + * mapped-ram migration supports O_DIRECT, so we need to make sure the + * userspace buffer, the IO operation size and the file offset are + * aligned according to the underlying device's block size. The first + * two are already aligned to page size, but we need to add padding to + * the file to align the offset. We cannot read the block size + * dynamically because the migration file can be moved between + * different systems, so use 1M to cover most block sizes and to keep + * the file offset aligned at page size as well. 
+ */ +#define MAPPED_RAM_FILE_OFFSET_ALIGNMENT 0x100000 + XBZRLECacheStats xbzrle_counters; extern MigrationDecompressOps *decompress_ops; @@ -1144,12 +1156,18 @@ static int save_zero_page(RAMState *rs, PageSearchStatus *pss, return 0; } + stat64_add(&mig_stats.zero_pages, 1); + + if (migrate_mapped_ram()) { + /* zero pages are not transferred with mapped-ram */ + clear_bit(offset >> TARGET_PAGE_BITS, pss->block->file_bmap); + return 1; + } + len += save_page_header(pss, file, pss->block, offset | RAM_SAVE_FLAG_ZERO); qemu_put_byte(file, 0); len += 1; ram_release_page(pss->block->idstr, offset); - - stat64_add(&mig_stats.zero_pages, 1); ram_transferred_add(len); /* @@ -1207,14 +1225,20 @@ static int save_normal_page(PageSearchStatus *pss, RAMBlock *block, { QEMUFile *file = pss->pss_channel; - ram_transferred_add(save_page_header(pss, pss->pss_channel, block, - offset | RAM_SAVE_FLAG_PAGE)); - if (async) { - qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE, - migrate_release_ram() && - migration_in_postcopy()); + if (migrate_mapped_ram()) { + qemu_put_buffer_at(file, buf, TARGET_PAGE_SIZE, + block->pages_offset + offset); + set_bit(offset >> TARGET_PAGE_BITS, block->file_bmap); } else { - qemu_put_buffer(file, buf, TARGET_PAGE_SIZE); + ram_transferred_add(save_page_header(pss, pss->pss_channel, block, + offset | RAM_SAVE_FLAG_PAGE)); + if (async) { + qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE, + migrate_release_ram() && + migration_in_postcopy()); + } else { + qemu_put_buffer(file, buf, TARGET_PAGE_SIZE); + } } ram_transferred_add(TARGET_PAGE_SIZE); stat64_add(&mig_stats.normal_pages, 1); @@ -2727,6 +2751,8 @@ static void ram_save_cleanup(void *opaque) block->clear_bmap = NULL; g_free(block->bmap); block->bmap = NULL; + g_free(block->file_bmap); + block->file_bmap = NULL; } xbzrle_cleanup(); @@ -3094,6 +3120,9 @@ static void ram_list_init_bitmaps(void) */ block->bmap = bitmap_new(pages); bitmap_set(block->bmap, 0, pages); + if (migrate_mapped_ram()) { + block->file_bmap = bitmap_new(pages); + } block->clear_bmap_shift = shift; block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift)); } @@ -3243,6 +3272,60 @@ static int ram_encrypted_save_setup(void) return ops->save_setup(p->sev_pdh, p->sev_plat_cert, p->sev_amd_cert); } +#define MAPPED_RAM_HDR_VERSION 1 +struct MappedRamHeader { + uint32_t version; + /* + * The target's page size, so we know how many pages are in the + * bitmap. + */ + uint64_t page_size; + /* + * The offset in the migration file where the pages bitmap is + * stored. + */ + uint64_t bitmap_offset; + /* + * The offset in the migration file where the actual pages (data) + * are stored. + */ + uint64_t pages_offset; +} QEMU_PACKED; +typedef struct MappedRamHeader MappedRamHeader; + +static void mapped_ram_setup_ramblock(QEMUFile *file, RAMBlock *block) +{ + g_autofree MappedRamHeader *header = NULL; + size_t header_size, bitmap_size; + long num_pages; + + header = g_new0(MappedRamHeader, 1); + header_size = sizeof(MappedRamHeader); + + num_pages = block->used_length >> TARGET_PAGE_BITS; + bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long); + + /* + * Save the file offsets of where the bitmap and the pages should + * go as they are written at the end of migration and during the + * iterative phase, respectively. 
+ */ + block->bitmap_offset = qemu_get_offset(file) + header_size; + block->pages_offset = ROUND_UP(block->bitmap_offset + + bitmap_size, + MAPPED_RAM_FILE_OFFSET_ALIGNMENT); + + header->version = cpu_to_be32(MAPPED_RAM_HDR_VERSION); + header->page_size = cpu_to_be64(TARGET_PAGE_SIZE); + header->bitmap_offset = cpu_to_be64(block->bitmap_offset); + header->pages_offset = cpu_to_be64(block->pages_offset); + + qemu_put_buffer(file, (uint8_t *) header, header_size); + + /* prepare offset for next ramblock */ + qemu_set_offset(file, block->pages_offset + block->used_length, SEEK_SET); +} + /* * Each of ram_save_setup, ram_save_iterate and ram_save_complete has * long-running RCU critical section. When rcu-reclaims in the code @@ -3299,6 +3382,10 @@ static int ram_save_setup(QEMUFile *f, void *opaque) if (migrate_ignore_shared()) { qemu_put_be64(f, block->mr->addr); } + + if (migrate_mapped_ram()) { + mapped_ram_setup_ramblock(f, block); + } } } @@ -3332,6 +3419,20 @@ static int ram_save_setup(QEMUFile *f, void *opaque) return qemu_fflush(f); } +static void ram_save_file_bmap(QEMUFile *f) +{ + RAMBlock *block; + + RAMBLOCK_FOREACH_MIGRATABLE(block) { + long num_pages = block->used_length >> TARGET_PAGE_BITS; + long bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long); + + qemu_put_buffer_at(f, (uint8_t *)block->file_bmap, bitmap_size, + block->bitmap_offset); + ram_transferred_add(bitmap_size); + } +} + /** * ram_save_iterate: iterative stage for migration * @@ -3543,6 +3644,18 @@ static int ram_save_complete(QEMUFile *f, void *opaque) return ret; } + if (migrate_mapped_ram()) { + ram_save_file_bmap(f); + + if (qemu_file_get_error(f)) { + Error *local_err = NULL; + int err = qemu_file_get_error_obj(f, &local_err); + + error_reportf_err(local_err, "Failed to write bitmap to file: "); + return -err; + } + } + if (migrate_multifd() && !migrate_multifd_flush_after_each_section()) { qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH); } -- Gitee From 3ff6476b5b33a1323f3f37aa5018dee7eb841d0c Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 29 Feb 2024 12:30:04 -0300 Subject: [PATCH 079/134] migration/ram: Add incoming 'mapped-ram' migration commit 2f6b8826a542a5a2e8eb3fbf097ea6982ba5bf31 upstream. Add the necessary code to parse the format changes for the 'mapped-ram' capability. One of the more notable changes in behavior is that in the 'mapped-ram' case ram pages are restored in one go rather than constantly looping through the migration stream. Intel-SIG: commit 2f6b8826a542 migration/ram: Add incoming 'mapped-ram' migration Signed-off-by: Nikolay Borisov Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240229153017.2221-11-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/ram.c | 143 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 141 insertions(+), 2 deletions(-) diff --git a/migration/ram.c b/migration/ram.c index e3664a4eef..bb7fde4695 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -121,6 +121,12 @@ bool memcrypt_enabled(void) */ #define MAPPED_RAM_FILE_OFFSET_ALIGNMENT 0x100000 +/* + * When doing mapped-ram migration, this is the amount we read from + * the pages region in the migration file at a time. 
+ */ +#define MAPPED_RAM_LOAD_BUF_SIZE 0x100000 + XBZRLECacheStats xbzrle_counters; extern MigrationDecompressOps *decompress_ops; @@ -3326,6 +3332,35 @@ static void mapped_ram_setup_ramblock(QEMUFile *file, RAMBlock *block) qemu_set_offset(file, block->pages_offset + block->used_length, SEEK_SET); } +static bool mapped_ram_read_header(QEMUFile *file, MappedRamHeader *header, + Error **errp) +{ + size_t ret, header_size = sizeof(MappedRamHeader); + + ret = qemu_get_buffer(file, (uint8_t *)header, header_size); + if (ret != header_size) { + error_setg(errp, "Could not read whole mapped-ram migration header " + "(expected %zd, got %zd bytes)", header_size, ret); + return false; + } + + /* migration stream is big-endian */ + header->version = be32_to_cpu(header->version); + + if (header->version > MAPPED_RAM_HDR_VERSION) { + error_setg(errp, "Migration mapped-ram capability version not " + "supported (expected <= %d, got %d)", MAPPED_RAM_HDR_VERSION, + header->version); + return false; + } + + header->page_size = be64_to_cpu(header->page_size); + header->bitmap_offset = be64_to_cpu(header->bitmap_offset); + header->pages_offset = be64_to_cpu(header->pages_offset); + + return true; +} + /* * Each of ram_save_setup, ram_save_iterate and ram_save_complete has * long-running RCU critical section. When rcu-reclaims in the code @@ -4257,22 +4292,126 @@ void colo_flush_ram_cache(void) trace_colo_flush_ram_cache_end(); } +static bool read_ramblock_mapped_ram(QEMUFile *f, RAMBlock *block, + long num_pages, unsigned long *bitmap, + Error **errp) +{ + ERRP_GUARD(); + unsigned long set_bit_idx, clear_bit_idx; + ram_addr_t offset; + void *host; + size_t read, unread, size; + + for (set_bit_idx = find_first_bit(bitmap, num_pages); + set_bit_idx < num_pages; + set_bit_idx = find_next_bit(bitmap, num_pages, clear_bit_idx + 1)) { + + clear_bit_idx = find_next_zero_bit(bitmap, num_pages, set_bit_idx + 1); + + unread = TARGET_PAGE_SIZE * (clear_bit_idx - set_bit_idx); + offset = set_bit_idx << TARGET_PAGE_BITS; + + while (unread > 0) { + host = host_from_ram_block_offset(block, offset); + if (!host) { + error_setg(errp, "page outside of ramblock %s range", + block->idstr); + return false; + } + + size = MIN(unread, MAPPED_RAM_LOAD_BUF_SIZE); + + read = qemu_get_buffer_at(f, host, size, + block->pages_offset + offset); + if (!read) { + goto err; + } + offset += read; + unread -= read; + } + } + + return true; + +err: + qemu_file_get_error_obj(f, errp); + error_prepend(errp, "(%s) failed to read page " RAM_ADDR_FMT + "from file offset %" PRIx64 ": ", block->idstr, offset, + block->pages_offset + offset); + return false; +} + +static void parse_ramblock_mapped_ram(QEMUFile *f, RAMBlock *block, + ram_addr_t length, Error **errp) +{ + g_autofree unsigned long *bitmap = NULL; + MappedRamHeader header; + size_t bitmap_size; + long num_pages; + + if (!mapped_ram_read_header(f, &header, errp)) { + return; + } + + block->pages_offset = header.pages_offset; + + /* + * Check the alignment of the file region that contains pages. We + * don't enforce MAPPED_RAM_FILE_OFFSET_ALIGNMENT to allow that + * value to change in the future. Do only a sanity check with page + * size alignment. 
+ */ + if (!QEMU_IS_ALIGNED(block->pages_offset, TARGET_PAGE_SIZE)) { + error_setg(errp, + "Error reading ramblock %s pages, region has bad alignment", + block->idstr); + return; + } + + num_pages = length / header.page_size; + bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long); + + bitmap = g_malloc0(bitmap_size); + if (qemu_get_buffer_at(f, (uint8_t *)bitmap, bitmap_size, + header.bitmap_offset) != bitmap_size) { + error_setg(errp, "Error reading dirty bitmap"); + return; + } + + if (!read_ramblock_mapped_ram(f, block, num_pages, bitmap, errp)) { + return; + } + + /* Skip pages array */ + qemu_set_offset(f, block->pages_offset + length, SEEK_SET); + + return; +} + static int parse_ramblock(QEMUFile *f, RAMBlock *block, ram_addr_t length) { int ret = 0; /* ADVISE is earlier, it shows the source has the postcopy capability on */ bool postcopy_advised = migration_incoming_postcopy_advised(); + Error *local_err = NULL; assert(block); + if (migrate_mapped_ram()) { + parse_ramblock_mapped_ram(f, block, length, &local_err); + if (local_err) { + error_report_err(local_err); + return -EINVAL; + } + return 0; + } + if (!qemu_ram_is_migratable(block)) { error_report("block %s should not be migrated !", block->idstr); return -EINVAL; } if (length != block->used_length) { - Error *local_err = NULL; - ret = qemu_ram_resize(block, length, &local_err); if (local_err) { error_report_err(local_err); -- Gitee From 8b28b80cb61e6349c38ee3a78f65c875ff710986 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 29 Feb 2024 12:30:05 -0300 Subject: [PATCH 080/134] tests/qtest/migration: Add tests for mapped-ram file-based migration commit c7076ec350421706e4dfe048a9f37b603f5a40b9 upstream. Intel-SIG: commit c7076ec35042 tests/qtest/migration: Add tests for mapped-ram file-based migration Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240229153017.2221-12-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- tests/qtest/migration-test.c | 59 ++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c index 9b244dedb2..4d8f9943a8 100644 --- a/tests/qtest/migration-test.c +++ b/tests/qtest/migration-test.c @@ -2135,6 +2135,14 @@ static void *test_mode_reboot_start(QTestState *from, QTestState *to) return NULL; } +static void *migrate_mapped_ram_start(QTestState *from, QTestState *to) +{ + migrate_set_capability(from, "mapped-ram", true); + migrate_set_capability(to, "mapped-ram", true); + + return NULL; +} + static void test_mode_reboot(void) { g_autofree char *uri = g_strdup_printf("file:%s/%s", tmpfs, @@ -2149,6 +2157,32 @@ static void test_mode_reboot(void) test_file_common(&args, true); } +static void test_precopy_file_mapped_ram_live(void) +{ + g_autofree char *uri = g_strdup_printf("file:%s/%s", tmpfs, + FILE_TEST_FILENAME); + MigrateCommon args = { + .connect_uri = uri, + .listen_uri = "defer", + .start_hook = migrate_mapped_ram_start, + }; + + test_file_common(&args, false); +} + +static void test_precopy_file_mapped_ram(void) +{ + g_autofree char *uri = g_strdup_printf("file:%s/%s", tmpfs, + FILE_TEST_FILENAME); + MigrateCommon args = { + .connect_uri = uri, + .listen_uri = "defer", + .start_hook = migrate_mapped_ram_start, + }; + + test_file_common(&args, true); +} + static void test_precopy_tcp_plain(void) { MigrateCommon args = { @@ -2397,6 +2431,13 @@ static void *migrate_precopy_fd_file_start(QTestState *from, QTestState *to) return NULL; } +static void 
*migrate_fd_file_mapped_ram_start(QTestState *from, QTestState *to) +{ + migrate_mapped_ram_start(from, to); + + return migrate_precopy_fd_file_start(from, to); +} + static void test_migrate_precopy_fd_file(void) { MigrateCommon args = { @@ -2407,6 +2448,17 @@ static void test_migrate_precopy_fd_file(void) }; test_file_common(&args, true); } + +static void test_migrate_precopy_fd_file_mapped_ram(void) +{ + MigrateCommon args = { + .listen_uri = "defer", + .connect_uri = "fd:fd-mig", + .start_hook = migrate_fd_file_mapped_ram_start, + .finish_hook = test_migrate_fd_finish_hook + }; + test_file_common(&args, true); +} #endif /* _WIN32 */ static void do_test_validate_uuid(MigrateStart *args, bool should_fail) @@ -3434,6 +3486,11 @@ int main(int argc, char **argv) migration_test_add("/migration/mode/reboot", test_mode_reboot); } + migration_test_add("/migration/precopy/file/mapped-ram", + test_precopy_file_mapped_ram); + migration_test_add("/migration/precopy/file/mapped-ram/live", + test_precopy_file_mapped_ram_live); + #ifdef CONFIG_GNUTLS migration_test_add("/migration/precopy/unix/tls/psk", test_precopy_unix_tls_psk); @@ -3495,6 +3552,8 @@ int main(int argc, char **argv) test_migrate_precopy_fd_socket); migration_test_add("/migration/precopy/fd/file", test_migrate_precopy_fd_file); + migration_test_add("/migration/precopy/fd/file/mapped-ram", + test_migrate_precopy_fd_file_mapped_ram); #endif migration_test_add("/migration/validate_uuid", test_validate_uuid); migration_test_add("/migration/validate_uuid_error", -- Gitee From ca612834ae8cec5c3219709e21fb2b9fae866048 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 29 Feb 2024 12:30:06 -0300 Subject: [PATCH 081/134] migration/multifd: Rename MultiFDSend|RecvParams::data to compress_data commit 402dd7ac1c3be44f306c903cdfd2583ffec5e2fd upstream. Use a more specific name for the compression data so we can use the generic for the multifd core code. 
Intel-SIG: commit 402dd7ac1c3b migration/multifd: Rename MultiFDSend|RecvParams::data to compress_data Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240229153017.2221-13-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd-zlib.c | 20 ++++++++++---------- migration/multifd-zstd.c | 20 ++++++++++---------- migration/multifd.h | 4 ++-- 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c index 012e3bdea1..2a8f5fc9a6 100644 --- a/migration/multifd-zlib.c +++ b/migration/multifd-zlib.c @@ -69,7 +69,7 @@ static int zlib_send_setup(MultiFDSendParams *p, Error **errp) err_msg = "out of memory for buf"; goto err_free_zbuff; } - p->data = z; + p->compress_data = z; return 0; err_free_zbuff: @@ -92,15 +92,15 @@ err_free_z: */ static void zlib_send_cleanup(MultiFDSendParams *p, Error **errp) { - struct zlib_data *z = p->data; + struct zlib_data *z = p->compress_data; deflateEnd(&z->zs); g_free(z->zbuff); z->zbuff = NULL; g_free(z->buf); z->buf = NULL; - g_free(p->data); - p->data = NULL; + g_free(p->compress_data); + p->compress_data = NULL; } /** @@ -117,7 +117,7 @@ static void zlib_send_cleanup(MultiFDSendParams *p, Error **errp) static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) { MultiFDPages_t *pages = p->pages; - struct zlib_data *z = p->data; + struct zlib_data *z = p->compress_data; z_stream *zs = &z->zs; uint32_t out_size = 0; int ret; @@ -194,7 +194,7 @@ static int zlib_recv_setup(MultiFDRecvParams *p, Error **errp) struct zlib_data *z = g_new0(struct zlib_data, 1); z_stream *zs = &z->zs; - p->data = z; + p->compress_data = z; zs->zalloc = Z_NULL; zs->zfree = Z_NULL; zs->opaque = Z_NULL; @@ -224,13 +224,13 @@ static int zlib_recv_setup(MultiFDRecvParams *p, Error **errp) */ static void zlib_recv_cleanup(MultiFDRecvParams *p) { - struct zlib_data *z = p->data; + struct zlib_data *z = p->compress_data; inflateEnd(&z->zs); g_free(z->zbuff); z->zbuff = NULL; - g_free(p->data); - p->data = NULL; + g_free(p->compress_data); + p->compress_data = NULL; } /** @@ -246,7 +246,7 @@ static void zlib_recv_cleanup(MultiFDRecvParams *p) */ static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp) { - struct zlib_data *z = p->data; + struct zlib_data *z = p->compress_data; z_stream *zs = &z->zs; uint32_t in_size = p->next_packet_size; /* we measure the change of total_out */ diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c index dc8fe43e94..593cf290ad 100644 --- a/migration/multifd-zstd.c +++ b/migration/multifd-zstd.c @@ -52,7 +52,7 @@ static int zstd_send_setup(MultiFDSendParams *p, Error **errp) struct zstd_data *z = g_new0(struct zstd_data, 1); int res; - p->data = z; + p->compress_data = z; z->zcs = ZSTD_createCStream(); if (!z->zcs) { g_free(z); @@ -90,14 +90,14 @@ static int zstd_send_setup(MultiFDSendParams *p, Error **errp) */ static void zstd_send_cleanup(MultiFDSendParams *p, Error **errp) { - struct zstd_data *z = p->data; + struct zstd_data *z = p->compress_data; ZSTD_freeCStream(z->zcs); z->zcs = NULL; g_free(z->zbuff); z->zbuff = NULL; - g_free(p->data); - p->data = NULL; + g_free(p->compress_data); + p->compress_data = NULL; } /** @@ -114,7 +114,7 @@ static void zstd_send_cleanup(MultiFDSendParams *p, Error **errp) static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) { MultiFDPages_t *pages = p->pages; - struct zstd_data *z = p->data; + struct zstd_data *z = p->compress_data; int ret; uint32_t i; @@ -183,7 
+183,7 @@ static int zstd_recv_setup(MultiFDRecvParams *p, Error **errp) struct zstd_data *z = g_new0(struct zstd_data, 1); int ret; - p->data = z; + p->compress_data = z; z->zds = ZSTD_createDStream(); if (!z->zds) { g_free(z); @@ -221,14 +221,14 @@ static int zstd_recv_setup(MultiFDRecvParams *p, Error **errp) */ static void zstd_recv_cleanup(MultiFDRecvParams *p) { - struct zstd_data *z = p->data; + struct zstd_data *z = p->compress_data; ZSTD_freeDStream(z->zds); z->zds = NULL; g_free(z->zbuff); z->zbuff = NULL; - g_free(p->data); - p->data = NULL; + g_free(p->compress_data); + p->compress_data = NULL; } /** @@ -248,7 +248,7 @@ static int zstd_recv_pages(MultiFDRecvParams *p, Error **errp) uint32_t out_size = 0; uint32_t expected_size = p->normal_num * p->page_size; uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; - struct zstd_data *z = p->data; + struct zstd_data *z = p->compress_data; int ret; int i; diff --git a/migration/multifd.h b/migration/multifd.h index b3fe27ae93..adccd3532f 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -127,7 +127,7 @@ typedef struct { /* number of iovs used */ uint32_t iovs_num; /* used for compression methods */ - void *data; + void *compress_data; } MultiFDSendParams; typedef struct { @@ -183,7 +183,7 @@ typedef struct { /* num of non zero pages */ uint32_t normal_num; /* used for de-compression methods */ - void *data; + void *compress_data; } MultiFDRecvParams; typedef struct { -- Gitee From 911a2d4e42adbafadc9e8eee5bb7a2d5692649a2 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 29 Feb 2024 12:30:07 -0300 Subject: [PATCH 082/134] migration/multifd: Decouple recv method from pages commit 9db191251381c75e57201f7b07330ca982a55d1e upstream. Next patches will abstract the type of data being received by the channels, so do some cleanup now to remove references to pages and dependency on 'normal_num'. Intel-SIG: commit 9db191251381 migration/multifd: Decouple recv method from pages Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240229153017.2221-14-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd-zlib.c | 6 +++--- migration/multifd-zstd.c | 6 +++--- migration/multifd.c | 13 ++++++++----- migration/multifd.h | 4 ++-- 4 files changed, 16 insertions(+), 13 deletions(-) diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c index 2a8f5fc9a6..6120faad65 100644 --- a/migration/multifd-zlib.c +++ b/migration/multifd-zlib.c @@ -234,7 +234,7 @@ static void zlib_recv_cleanup(MultiFDRecvParams *p) } /** - * zlib_recv_pages: read the data from the channel into actual pages + * zlib_recv: read the data from the channel into actual pages * * Read the compressed buffer, and uncompress it into the actual * pages. 
@@ -244,7 +244,7 @@ static void zlib_recv_cleanup(MultiFDRecvParams *p) * @p: Params for the channel that we are using * @errp: pointer to an error */ -static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp) +static int zlib_recv(MultiFDRecvParams *p, Error **errp) { struct zlib_data *z = p->compress_data; z_stream *zs = &z->zs; @@ -319,7 +319,7 @@ static MultiFDMethods multifd_zlib_ops = { .send_prepare = zlib_send_prepare, .recv_setup = zlib_recv_setup, .recv_cleanup = zlib_recv_cleanup, - .recv_pages = zlib_recv_pages + .recv = zlib_recv }; static void multifd_zlib_register(void) diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c index 593cf290ad..cac236833d 100644 --- a/migration/multifd-zstd.c +++ b/migration/multifd-zstd.c @@ -232,7 +232,7 @@ static void zstd_recv_cleanup(MultiFDRecvParams *p) } /** - * zstd_recv_pages: read the data from the channel into actual pages + * zstd_recv: read the data from the channel into actual pages * * Read the compressed buffer, and uncompress it into the actual * pages. @@ -242,7 +242,7 @@ static void zstd_recv_cleanup(MultiFDRecvParams *p) * @p: Params for the channel that we are using * @errp: pointer to an error */ -static int zstd_recv_pages(MultiFDRecvParams *p, Error **errp) +static int zstd_recv(MultiFDRecvParams *p, Error **errp) { uint32_t in_size = p->next_packet_size; uint32_t out_size = 0; @@ -310,7 +310,7 @@ static MultiFDMethods multifd_zstd_ops = { .send_prepare = zstd_send_prepare, .recv_setup = zstd_recv_setup, .recv_cleanup = zstd_recv_cleanup, - .recv_pages = zstd_recv_pages + .recv = zstd_recv }; static void multifd_zstd_register(void) diff --git a/migration/multifd.c b/migration/multifd.c index 433d2c3ac3..d51ea8de18 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -198,7 +198,7 @@ static void nocomp_recv_cleanup(MultiFDRecvParams *p) } /** - * nocomp_recv_pages: read the data from the channel into actual pages + * nocomp_recv: read the data from the channel * * For no compression we just need to read things into the correct place. 
* @@ -207,7 +207,7 @@ static void nocomp_recv_cleanup(MultiFDRecvParams *p) * @p: Params for the channel that we are using * @errp: pointer to an error */ -static int nocomp_recv_pages(MultiFDRecvParams *p, Error **errp) +static int nocomp_recv(MultiFDRecvParams *p, Error **errp) { uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; @@ -229,7 +229,7 @@ static MultiFDMethods multifd_nocomp_ops = { .send_prepare = nocomp_send_prepare, .recv_setup = nocomp_recv_setup, .recv_cleanup = nocomp_recv_cleanup, - .recv_pages = nocomp_recv_pages + .recv = nocomp_recv }; static MultiFDMethods *multifd_ops[MULTIFD_COMPRESSION__MAX] = { @@ -1231,6 +1231,8 @@ static void *multifd_recv_thread(void *opaque) while (true) { uint32_t flags; + bool has_data = false; + p->normal_num = 0; if (multifd_recv_should_exit()) { break; @@ -1252,10 +1254,11 @@ static void *multifd_recv_thread(void *opaque) flags = p->flags; /* recv methods don't know how to handle the SYNC flag */ p->flags &= ~MULTIFD_FLAG_SYNC; + has_data = !!p->normal_num; qemu_mutex_unlock(&p->mutex); - if (p->normal_num) { - ret = multifd_recv_state->ops->recv_pages(p, &local_err); + if (has_data) { + ret = multifd_recv_state->ops->recv(p, &local_err); if (ret != 0) { break; } diff --git a/migration/multifd.h b/migration/multifd.h index adccd3532f..6a54377cc1 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -197,8 +197,8 @@ typedef struct { int (*recv_setup)(MultiFDRecvParams *p, Error **errp); /* Cleanup for receiving side */ void (*recv_cleanup)(MultiFDRecvParams *p); - /* Read all pages */ - int (*recv_pages)(MultiFDRecvParams *p, Error **errp); + /* Read all data */ + int (*recv)(MultiFDRecvParams *p, Error **errp); } MultiFDMethods; void multifd_register_ops(int method, MultiFDMethods *ops); -- Gitee From 5e00a1d6f65e42b5cd5649e5bf09a1a1a975fc1a Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 29 Feb 2024 12:30:08 -0300 Subject: [PATCH 083/134] migration/multifd: Allow multifd without packets commit 06833d83f8978139395da0f1d6a9fad81b9dd024 upstream. For the upcoming support to the new 'mapped-ram' migration stream format, we cannot use multifd packets because each write into the ramblock section in the migration file is expected to contain only the guest pages. They are written at their respective offsets relative to the ramblock section header. There is no space for the packet information and the expected gains from the new approach come partly from being able to write the pages sequentially without extraneous data in between. The new format also simply doesn't need the packets and all necessary information can be taken from the standard migration headers with some (future) changes to multifd code. Use the presence of the mapped-ram capability to decide whether to send packets. This only moves code under multifd_use_packets(), it has no effect for now as mapped-ram cannot yet be enabled with multifd. 
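For comparison, a rough standalone sketch (not part of the patch) of what a packet-less write boils down to once mapped-ram is wired into multifd: the pages of a contiguous run are gathered into an iovec and written at their fixed offset in the file, with nothing framing them. The real code goes through the qio_channel_pwritev() interface added earlier in this series; the helper name, the fixed iovec cap, Linux's pwritev(2), and the 4 KiB page size are assumptions of the sketch:

    #include <stdint.h>
    #include <stdio.h>
    #include <sys/uio.h>

    #define PAGE_SIZE 0x1000UL  /* assumed 4 KiB target page */

    /*
     * Write a run of 'npages' contiguous guest pages, starting at page
     * index 'first', straight to their slot in the migration file.
     * There is no multifd packet header in front of the data.
     */
    static int write_page_run(int fd, uint64_t pages_offset,
                              uint8_t *ram_base, long first, int npages)
    {
        struct iovec iov[64];           /* assumed cap for the sketch */
        ssize_t expected = 0;

        if (npages < 1 || npages > 64) {
            return -1;
        }
        for (int i = 0; i < npages; i++) {
            iov[i].iov_base = ram_base + (first + i) * PAGE_SIZE;
            iov[i].iov_len = PAGE_SIZE;
            expected += PAGE_SIZE;
        }

        if (pwritev(fd, iov, npages, pages_offset + first * PAGE_SIZE)
            != expected) {
            perror("pwritev");
            return -1;
        }
        return 0;
    }

Because every page has a fixed destination offset, several threads can issue such writes in parallel without coordinating on a shared stream position, which is what makes the multifd integration possible.
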
Intel-SIG: commit 06833d83f897 migration/multifd: Allow multifd without packets Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240229153017.2221-15-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 175 +++++++++++++++++++++++++++++--------------- 1 file changed, 114 insertions(+), 61 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index d51ea8de18..8b753d89c6 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -93,6 +93,11 @@ struct { MultiFDMethods *ops; } *multifd_recv_state; +static bool multifd_use_packets(void) +{ + return !migrate_mapped_ram(); +} + /* Multifd without compression */ /** @@ -123,6 +128,19 @@ static void nocomp_send_cleanup(MultiFDSendParams *p, Error **errp) return; } +static void multifd_send_prepare_iovs(MultiFDSendParams *p) +{ + MultiFDPages_t *pages = p->pages; + + for (int i = 0; i < pages->num; i++) { + p->iov[p->iovs_num].iov_base = pages->block->host + pages->offset[i]; + p->iov[p->iovs_num].iov_len = p->page_size; + p->iovs_num++; + } + + p->next_packet_size = pages->num * p->page_size; +} + /** * nocomp_send_prepare: prepare date to be able to send * @@ -137,9 +155,13 @@ static void nocomp_send_cleanup(MultiFDSendParams *p, Error **errp) static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp) { bool use_zero_copy_send = migrate_zero_copy_send(); - MultiFDPages_t *pages = p->pages; int ret; + if (!multifd_use_packets()) { + multifd_send_prepare_iovs(p); + return 0; + } + if (!use_zero_copy_send) { /* * Only !zerocopy needs the header in IOV; zerocopy will @@ -148,13 +170,7 @@ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp) multifd_send_prepare_header(p); } - for (int i = 0; i < pages->num; i++) { - p->iov[p->iovs_num].iov_base = pages->block->host + pages->offset[i]; - p->iov[p->iovs_num].iov_len = p->page_size; - p->iovs_num++; - } - - p->next_packet_size = pages->num * p->page_size; + multifd_send_prepare_iovs(p); p->flags |= MULTIFD_FLAG_NOCOMP; multifd_send_fill_packet(p); @@ -209,7 +225,13 @@ static void nocomp_recv_cleanup(MultiFDRecvParams *p) */ static int nocomp_recv(MultiFDRecvParams *p, Error **errp) { - uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; + uint32_t flags; + + if (!multifd_use_packets()) { + return 0; + } + + flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; if (flags != MULTIFD_FLAG_NOCOMP) { error_setg(errp, "multifd %u: flags received %x flags expected %x", @@ -796,6 +818,7 @@ static void *multifd_send_thread(void *opaque) MigrationThread *thread = NULL; Error *local_err = NULL; int ret = 0; + bool use_packets = multifd_use_packets(); thread = migration_threads_add(p->name, qemu_get_thread_id()); @@ -805,9 +828,11 @@ static void *multifd_send_thread(void *opaque) trace_multifd_send_thread_start(p->id); rcu_register_thread(); - if (multifd_send_initial_packet(p, &local_err) < 0) { - ret = -1; - goto out; + if (use_packets) { + if (multifd_send_initial_packet(p, &local_err) < 0) { + ret = -1; + goto out; + } } while (true) { @@ -858,16 +883,20 @@ static void *multifd_send_thread(void *opaque) * it doesn't require explicit memory barriers. 
*/ assert(qatomic_read(&p->pending_sync)); - p->flags = MULTIFD_FLAG_SYNC; - multifd_send_fill_packet(p); - ret = qio_channel_write_all(p->c, (void *)p->packet, - p->packet_len, &local_err); - if (ret != 0) { - break; + + if (use_packets) { + p->flags = MULTIFD_FLAG_SYNC; + multifd_send_fill_packet(p); + ret = qio_channel_write_all(p->c, (void *)p->packet, + p->packet_len, &local_err); + if (ret != 0) { + break; + } + /* p->next_packet_size will always be zero for a SYNC packet */ + stat64_add(&mig_stats.multifd_bytes, p->packet_len); + p->flags = 0; } - /* p->next_packet_size will always be zero for a SYNC packet */ - stat64_add(&mig_stats.multifd_bytes, p->packet_len); - p->flags = 0; + qatomic_set(&p->pending_sync, false); qemu_sem_post(&p->sem_sync); } @@ -1022,6 +1051,7 @@ bool multifd_send_setup(void) Error *local_err = NULL; int thread_count, ret = 0; uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size(); + bool use_packets = multifd_use_packets(); uint8_t i; if (!migrate_multifd()) { @@ -1044,14 +1074,20 @@ bool multifd_send_setup(void) qemu_sem_init(&p->sem_sync, 0); p->id = i; p->pages = multifd_pages_init(page_count); - p->packet_len = sizeof(MultiFDPacket_t) - + sizeof(uint64_t) * page_count; - p->packet = g_malloc0(p->packet_len); - p->packet->magic = cpu_to_be32(MULTIFD_MAGIC); - p->packet->version = cpu_to_be32(MULTIFD_VERSION); + + if (use_packets) { + p->packet_len = sizeof(MultiFDPacket_t) + + sizeof(uint64_t) * page_count; + p->packet = g_malloc0(p->packet_len); + p->packet->magic = cpu_to_be32(MULTIFD_MAGIC); + p->packet->version = cpu_to_be32(MULTIFD_VERSION); + + /* We need one extra place for the packet header */ + p->iov = g_new0(struct iovec, page_count + 1); + } else { + p->iov = g_new0(struct iovec, page_count); + } p->name = g_strdup_printf("multifdsend_%d", i); - /* We need one extra place for the packet header */ - p->iov = g_new0(struct iovec, page_count + 1); p->page_size = qemu_target_page_size(); p->page_count = page_count; p->write_flags = 0; @@ -1114,7 +1150,9 @@ static void multifd_recv_terminate_threads(Error *err) * multifd_recv_thread may hung at MULTIFD_FLAG_SYNC handle code, * however try to wakeup it without harm in cleanup phase. 
*/ - qemu_sem_post(&p->sem_sync); + if (multifd_use_packets()) { + qemu_sem_post(&p->sem_sync); + } /* * We could arrive here for two reasons: @@ -1189,7 +1227,7 @@ void multifd_recv_sync_main(void) int thread_count = migrate_multifd_channels(); int i; - if (!migrate_multifd()) { + if (!migrate_multifd() || !multifd_use_packets()) { return; } @@ -1224,13 +1262,14 @@ static void *multifd_recv_thread(void *opaque) { MultiFDRecvParams *p = opaque; Error *local_err = NULL; + bool use_packets = multifd_use_packets(); int ret; trace_multifd_recv_thread_start(p->id); rcu_register_thread(); while (true) { - uint32_t flags; + uint32_t flags = 0; bool has_data = false; p->normal_num = 0; @@ -1238,25 +1277,27 @@ static void *multifd_recv_thread(void *opaque) break; } - ret = qio_channel_read_all_eof(p->c, (void *)p->packet, - p->packet_len, &local_err); - if (ret == 0 || ret == -1) { /* 0: EOF -1: Error */ - break; - } + if (use_packets) { + ret = qio_channel_read_all_eof(p->c, (void *)p->packet, + p->packet_len, &local_err); + if (ret == 0 || ret == -1) { /* 0: EOF -1: Error */ + break; + } - qemu_mutex_lock(&p->mutex); - ret = multifd_recv_unfill_packet(p, &local_err); - if (ret) { + qemu_mutex_lock(&p->mutex); + ret = multifd_recv_unfill_packet(p, &local_err); + if (ret) { + qemu_mutex_unlock(&p->mutex); + break; + } + + flags = p->flags; + /* recv methods don't know how to handle the SYNC flag */ + p->flags &= ~MULTIFD_FLAG_SYNC; + has_data = !!p->normal_num; qemu_mutex_unlock(&p->mutex); - break; } - flags = p->flags; - /* recv methods don't know how to handle the SYNC flag */ - p->flags &= ~MULTIFD_FLAG_SYNC; - has_data = !!p->normal_num; - qemu_mutex_unlock(&p->mutex); - if (has_data) { ret = multifd_recv_state->ops->recv(p, &local_err); if (ret != 0) { @@ -1264,9 +1305,11 @@ static void *multifd_recv_thread(void *opaque) } } - if (flags & MULTIFD_FLAG_SYNC) { - qemu_sem_post(&multifd_recv_state->sem_sync); - qemu_sem_wait(&p->sem_sync); + if (use_packets) { + if (flags & MULTIFD_FLAG_SYNC) { + qemu_sem_post(&multifd_recv_state->sem_sync); + qemu_sem_wait(&p->sem_sync); + } } } @@ -1285,6 +1328,7 @@ int multifd_recv_setup(Error **errp) { int thread_count; uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size(); + bool use_packets = multifd_use_packets(); uint8_t i; /* @@ -1309,9 +1353,12 @@ int multifd_recv_setup(Error **errp) qemu_mutex_init(&p->mutex); qemu_sem_init(&p->sem_sync, 0); p->id = i; - p->packet_len = sizeof(MultiFDPacket_t) - + sizeof(uint64_t) * page_count; - p->packet = g_malloc0(p->packet_len); + + if (use_packets) { + p->packet_len = sizeof(MultiFDPacket_t) + + sizeof(uint64_t) * page_count; + p->packet = g_malloc0(p->packet_len); + } p->name = g_strdup_printf("multifdrecv_%d", i); p->iov = g_new0(struct iovec, page_count); p->normal = g_new0(ram_addr_t, page_count); @@ -1355,18 +1402,24 @@ void multifd_recv_new_channel(QIOChannel *ioc, Error **errp) { MultiFDRecvParams *p; Error *local_err = NULL; + bool use_packets = multifd_use_packets(); int id; - id = multifd_recv_initial_packet(ioc, &local_err); - if (id < 0) { - multifd_recv_terminate_threads(local_err); - error_propagate_prepend(errp, local_err, - "failed to receive packet" - " via multifd channel %d: ", - qatomic_read(&multifd_recv_state->count)); - return; + if (use_packets) { + id = multifd_recv_initial_packet(ioc, &local_err); + if (id < 0) { + multifd_recv_terminate_threads(local_err); + error_propagate_prepend(errp, local_err, + "failed to receive packet" + " via multifd channel %d: ", + 
qatomic_read(&multifd_recv_state->count)); + return; + } + trace_multifd_recv_new_channel(id); + } else { + /* next patch gives this a meaningful value */ + id = 0; } - trace_multifd_recv_new_channel(id); p = &multifd_recv_state->params[id]; if (p->c != NULL) { -- Gitee From 187f0b2dfcebe164a9d3ba4f7486b3e6745c1269 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 29 Feb 2024 12:30:09 -0300 Subject: [PATCH 084/134] migration/multifd: Allow receiving pages without packets commit d117ed0699d4117e75b67d76d9678785c676e21c upstream. Currently multifd does not need to have knowledge of pages on the receiving side because all the information needed is within the packets that come in the stream. We're about to add support to mapped-ram migration, which cannot use packets because it expects the ramblock section in the migration file to contain only the guest pages data. Add a data structure to transfer pages between the ram migration code and the multifd receiving threads. We don't want to reuse MultiFDPages_t for two reasons: a) multifd threads don't really need to know about the data they're receiving. b) the receiving side has to be stopped to load the pages, which means we can experiment with larger granularities than page size when transferring data. Intel-SIG: commit d117ed0699d4 migration/multifd: Allow receiving pages without packets Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240229153017.2221-16-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/file.c | 1 + migration/multifd.c | 129 +++++++++++++++++++++++++++++++++++++++++--- migration/multifd.h | 15 ++++++ 3 files changed, 138 insertions(+), 7 deletions(-) diff --git a/migration/file.c b/migration/file.c index 5d4975f43e..22d052a71f 100644 --- a/migration/file.c +++ b/migration/file.c @@ -6,6 +6,7 @@ */ #include "qemu/osdep.h" +#include "exec/ramblock.h" #include "qemu/cutils.h" #include "qapi/error.h" #include "channel.h" diff --git a/migration/multifd.c b/migration/multifd.c index 8b753d89c6..4bdc7f511a 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -82,9 +82,13 @@ struct { struct { MultiFDRecvParams *params; + MultiFDRecvData *data; /* number of created threads */ int count; - /* syncs main thread and channels */ + /* + * This is always posted by the recv threads, the migration thread + * uses it to wait for recv threads to finish assigned tasks. + */ QemuSemaphore sem_sync; /* global number of generated multifd packets */ uint64_t packet_num; @@ -1123,6 +1127,57 @@ bool multifd_send_setup(void) return true; } +bool multifd_recv(void) +{ + int i; + static int next_recv_channel; + MultiFDRecvParams *p = NULL; + MultiFDRecvData *data = multifd_recv_state->data; + + /* + * next_channel can remain from a previous migration that was + * using more channels, so ensure it doesn't overflow if the + * limit is lower now. + */ + next_recv_channel %= migrate_multifd_channels(); + for (i = next_recv_channel;; i = (i + 1) % migrate_multifd_channels()) { + if (multifd_recv_should_exit()) { + return false; + } + + p = &multifd_recv_state->params[i]; + + if (qatomic_read(&p->pending_job) == false) { + next_recv_channel = (i + 1) % migrate_multifd_channels(); + break; + } + } + + /* + * Order pending_job read before manipulating p->data below. Pairs + * with qatomic_store_release() at multifd_recv_thread(). 
+ */ + smp_mb_acquire(); + + assert(!p->data->size); + multifd_recv_state->data = p->data; + p->data = data; + + /* + * Order p->data update before setting pending_job. Pairs with + * qatomic_load_acquire() at multifd_recv_thread(). + */ + qatomic_store_release(&p->pending_job, true); + qemu_sem_post(&p->sem); + + return true; +} + +MultiFDRecvData *multifd_get_recv_data(void) +{ + return multifd_recv_state->data; +} + static void multifd_recv_terminate_threads(Error *err) { int i; @@ -1147,11 +1202,26 @@ static void multifd_recv_terminate_threads(Error *err) MultiFDRecvParams *p = &multifd_recv_state->params[i]; /* - * multifd_recv_thread may hung at MULTIFD_FLAG_SYNC handle code, - * however try to wakeup it without harm in cleanup phase. + * The migration thread and channels interact differently + * depending on the presence of packets. */ if (multifd_use_packets()) { + /* + * The channel receives as long as there are packets. When + * packets end (i.e. MULTIFD_FLAG_SYNC is reached), the + * channel waits for the migration thread to sync. If the + * sync never happens, do it here. + */ qemu_sem_post(&p->sem_sync); + } else { + /* + * The channel waits for the migration thread to give it + * work. When the migration thread runs out of work, it + * releases the channel and waits for any pending work to + * finish. If we reach here (e.g. due to error) before the + * work runs out, release the channel. + */ + qemu_sem_post(&p->sem); } /* @@ -1180,6 +1250,7 @@ static void multifd_recv_cleanup_channel(MultiFDRecvParams *p) p->c = NULL; qemu_mutex_destroy(&p->mutex); qemu_sem_destroy(&p->sem_sync); + qemu_sem_destroy(&p->sem); g_free(p->name); p->name = NULL; p->packet_len = 0; @@ -1197,6 +1268,8 @@ static void multifd_recv_cleanup_state(void) qemu_sem_destroy(&multifd_recv_state->sem_sync); g_free(multifd_recv_state->params); multifd_recv_state->params = NULL; + g_free(multifd_recv_state->data); + multifd_recv_state->data = NULL; g_free(multifd_recv_state); multifd_recv_state = NULL; } @@ -1273,11 +1346,11 @@ static void *multifd_recv_thread(void *opaque) bool has_data = false; p->normal_num = 0; - if (multifd_recv_should_exit()) { - break; - } - if (use_packets) { + if (multifd_recv_should_exit()) { + break; + } + ret = qio_channel_read_all_eof(p->c, (void *)p->packet, p->packet_len, &local_err); if (ret == 0 || ret == -1) { /* 0: EOF -1: Error */ @@ -1296,6 +1369,30 @@ static void *multifd_recv_thread(void *opaque) p->flags &= ~MULTIFD_FLAG_SYNC; has_data = !!p->normal_num; qemu_mutex_unlock(&p->mutex); + } else { + /* + * No packets, so we need to wait for the vmstate code to + * give us work. + */ + qemu_sem_wait(&p->sem); + + if (multifd_recv_should_exit()) { + break; + } + + /* pairs with qatomic_store_release() at multifd_recv() */ + if (!qatomic_load_acquire(&p->pending_job)) { + /* + * Migration thread did not send work, this is + * equivalent to pending_sync on the sending + * side. Post sem_sync to notify we reached this + * point. + */ + qemu_sem_post(&multifd_recv_state->sem_sync); + continue; + } + + has_data = !!p->data->size; } if (has_data) { @@ -1310,6 +1407,15 @@ static void *multifd_recv_thread(void *opaque) qemu_sem_post(&multifd_recv_state->sem_sync); qemu_sem_wait(&p->sem_sync); } + } else { + p->total_normal_pages += p->data->size / qemu_target_page_size(); + p->data->size = 0; + /* + * Order data->size update before clearing + * pending_job. Pairs with smp_mb_acquire() at + * multifd_recv(). 
+ */ + qatomic_store_release(&p->pending_job, false); } } @@ -1342,6 +1448,10 @@ int multifd_recv_setup(Error **errp) thread_count = migrate_multifd_channels(); multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state)); multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count); + + multifd_recv_state->data = g_new0(MultiFDRecvData, 1); + multifd_recv_state->data->size = 0; + qatomic_set(&multifd_recv_state->count, 0); qatomic_set(&multifd_recv_state->exiting, 0); qemu_sem_init(&multifd_recv_state->sem_sync, 0); @@ -1352,8 +1462,13 @@ int multifd_recv_setup(Error **errp) qemu_mutex_init(&p->mutex); qemu_sem_init(&p->sem_sync, 0); + qemu_sem_init(&p->sem, 0); + p->pending_job = false; p->id = i; + p->data = g_new0(MultiFDRecvData, 1); + p->data->size = 0; + if (use_packets) { p->packet_len = sizeof(MultiFDPacket_t) + sizeof(uint64_t) * page_count; diff --git a/migration/multifd.h b/migration/multifd.h index 6a54377cc1..1be985978e 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -13,6 +13,8 @@ #ifndef QEMU_MIGRATION_MULTIFD_H #define QEMU_MIGRATION_MULTIFD_H +typedef struct MultiFDRecvData MultiFDRecvData; + bool multifd_send_setup(void); void multifd_send_shutdown(void); int multifd_recv_setup(Error **errp); @@ -23,6 +25,8 @@ void multifd_recv_new_channel(QIOChannel *ioc, Error **errp); void multifd_recv_sync_main(void); int multifd_send_sync_main(void); bool multifd_queue_page(RAMBlock *block, ram_addr_t offset); +bool multifd_recv(void); +MultiFDRecvData *multifd_get_recv_data(void); /* Multifd Compression flags */ #define MULTIFD_FLAG_SYNC (1 << 0) @@ -63,6 +67,13 @@ typedef struct { RAMBlock *block; } MultiFDPages_t; +struct MultiFDRecvData { + void *opaque; + size_t size; + /* for preadv */ + off_t file_offset; +}; + typedef struct { /* Fields are only written at creating/deletion time */ /* No lock required for them, they are read only */ @@ -152,6 +163,8 @@ typedef struct { /* syncs main thread and channels */ QemuSemaphore sem_sync; + /* sem where to wait for more work */ + QemuSemaphore sem; /* this mutex protects the following parameters */ QemuMutex mutex; @@ -161,6 +174,8 @@ typedef struct { uint32_t flags; /* global number of generated multifd packets */ uint64_t packet_num; + int pending_job; + MultiFDRecvData *data; /* thread local variables. No locking required */ -- Gitee From 88710de00b058d874b0f8e24ec92cdf2e5c5bd83 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 29 Feb 2024 12:30:10 -0300 Subject: [PATCH 085/134] migration/multifd: Add a wrapper for channels_created commit a8a3e7102ccf65ee57fb7427952b98ebb460bec9 upstream. We'll need to access multifd_send_state->channels_created from outside multifd.c, so introduce a helper for that. 
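The helper only re-exports the semaphore post, so callers outside multifd.c (the file channel code added in the following patch) can signal channel creation without reaching into multifd_send_state. Condensed from the hunk below:

    void multifd_send_channel_created(void)
    {
        qemu_sem_post(&multifd_send_state->channels_created);
    }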
Intel-SIG: commit a8a3e7102ccf migration/multifd: Add a wrapper for channels_created Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240229153017.2221-17-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 7 ++++++- migration/multifd.h | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/migration/multifd.c b/migration/multifd.c index 4bdc7f511a..3dc8d5328f 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -102,6 +102,11 @@ static bool multifd_use_packets(void) return !migrate_mapped_ram(); } +void multifd_send_channel_created(void) +{ + qemu_sem_post(&multifd_send_state->channels_created); +} + /* Multifd without compression */ /** @@ -1027,7 +1032,7 @@ out: * Here we're not interested whether creation succeeded, only that * it happened at all. */ - qemu_sem_post(&multifd_send_state->channels_created); + multifd_send_channel_created(); if (ret) { return; diff --git a/migration/multifd.h b/migration/multifd.h index 1be985978e..1d8bbaf96b 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -17,6 +17,7 @@ typedef struct MultiFDRecvData MultiFDRecvData; bool multifd_send_setup(void); void multifd_send_shutdown(void); +void multifd_send_channel_created(void); int multifd_recv_setup(Error **errp); void multifd_recv_cleanup(void); void multifd_recv_shutdown(void); -- Gitee From 9cc4b9f7fac3dbfcc0874131f4e7ad781bb8d44e Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 29 Feb 2024 12:30:11 -0300 Subject: [PATCH 086/134] migration/multifd: Add outgoing QIOChannelFile support commit b7b03eb614d01079f9b1f7a1d342140b5055a559 upstream. Allow multifd to open file-backed channels. This will be used when enabling the mapped-ram migration stream format which expects a seekable transport. The QIOChannel read and write methods will use the preadv/pwritev versions which don't update the file offset at each call so we can reuse the fd without re-opening for every channel. Contrary to the socket migration, the file migration doesn't need an asynchronous channel creation process, so expose multifd_channel_connect() and call it directly. Note that this is just setup code and multifd cannot yet make use of the file channels. Intel-SIG: commit b7b03eb614d0 migration/multifd: Add outgoing QIOChannelFile support Signed-off-by: Fabiano Rosas Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20240229153017.2221-18-farosas@suse.de Signed-off-by: Peter Xu Conflicts: migration/multifd.c [jz: resolve context conflict] Signed-off-by: Jason Zeng --- migration/file.c | 37 +++++++++++++++++++++++++++++++++++++ migration/file.h | 4 ++++ migration/multifd.c | 18 +++++++++++++++--- migration/multifd.h | 1 + 4 files changed, 57 insertions(+), 3 deletions(-) diff --git a/migration/file.c b/migration/file.c index 22d052a71f..a350dd61f0 100644 --- a/migration/file.c +++ b/migration/file.c @@ -12,12 +12,17 @@ #include "channel.h" #include "file.h" #include "migration.h" +#include "multifd.h" #include "io/channel-file.h" #include "io/channel-util.h" #include "trace.h" #define OFFSET_OPTION ",offset=" +static struct FileOutgoingArgs { + char *fname; +} outgoing_args; + /* Remove the offset option from @filespec and return it in @offsetp. 
*/ int file_parse_offset(char *filespec, uint64_t *offsetp, Error **errp) @@ -37,6 +42,36 @@ int file_parse_offset(char *filespec, uint64_t *offsetp, Error **errp) return 0; } +void file_cleanup_outgoing_migration(void) +{ + g_free(outgoing_args.fname); + outgoing_args.fname = NULL; +} + +bool file_send_channel_create(gpointer opaque, Error **errp) +{ + QIOChannelFile *ioc; + int flags = O_WRONLY; + bool ret = true; + + ioc = qio_channel_file_new_path(outgoing_args.fname, flags, 0, errp); + if (!ioc) { + ret = false; + goto out; + } + + multifd_channel_connect(opaque, QIO_CHANNEL(ioc)); + +out: + /* + * File channel creation is synchronous. However posting this + * semaphore here is simpler than adding a special case. + */ + multifd_send_channel_created(); + + return ret; +} + void file_start_outgoing_migration(MigrationState *s, FileMigrationArgs *file_args, Error **errp) { @@ -53,6 +88,8 @@ void file_start_outgoing_migration(MigrationState *s, return; } + outgoing_args.fname = g_strdup(filename); + ioc = QIO_CHANNEL(fioc); if (offset && qio_channel_io_seek(ioc, offset, SEEK_SET, errp) < 0) { return; diff --git a/migration/file.h b/migration/file.h index 37d6a08bfc..4577f9efdd 100644 --- a/migration/file.h +++ b/migration/file.h @@ -9,10 +9,14 @@ #define QEMU_MIGRATION_FILE_H #include "qapi/qapi-types-migration.h" +#include "io/task.h" +#include "channel.h" void file_start_incoming_migration(FileMigrationArgs *file_args, Error **errp); void file_start_outgoing_migration(MigrationState *s, FileMigrationArgs *file_args, Error **errp); int file_parse_offset(char *filespec, uint64_t *offsetp, Error **errp); +void file_cleanup_outgoing_migration(void); +bool file_send_channel_create(gpointer opaque, Error **errp); #endif diff --git a/migration/multifd.c b/migration/multifd.c index 3dc8d5328f..d3728343de 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -18,6 +18,7 @@ #include "qemu/error-report.h" #include "qapi/error.h" #include "qapi/qapi-events-migration.h" +#include "file.h" #include "ram.h" #include "migration.h" #include "migration-stats.h" @@ -29,6 +30,7 @@ #include "threadinfo.h" #include "options.h" #include "qemu/yank.h" +#include "io/channel-file.h" #include "io/channel-socket.h" #include "yank_functions.h" @@ -695,6 +697,7 @@ static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) { if (p->c) { migration_ioc_unregister_yank(p->c); + qio_channel_close(p->c, &error_abort); object_unref(OBJECT(p->c)); p->c = NULL; } @@ -716,6 +719,7 @@ static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) static void multifd_send_cleanup_state(void) { + file_cleanup_outgoing_migration(); socket_cleanup_outgoing_migration(); qemu_sem_destroy(&multifd_send_state->channels_created); qemu_sem_destroy(&multifd_send_state->channels_ready); @@ -981,7 +985,7 @@ static bool multifd_tls_channel_connect(MultiFDSendParams *p, return true; } -static void multifd_channel_connect(MultiFDSendParams *p, QIOChannel *ioc) +void multifd_channel_connect(MultiFDSendParams *p, QIOChannel *ioc) { qio_channel_set_delay(ioc, false); @@ -1049,9 +1053,14 @@ out: error_free(local_err); } -static void multifd_new_send_channel_create(gpointer opaque) +static bool multifd_new_send_channel_create(gpointer opaque, Error **errp) { + if (!multifd_use_packets()) { + return file_send_channel_create(opaque, errp); + } + socket_send_channel_create(multifd_new_send_channel_async, opaque); + return true; } bool multifd_send_setup(void) @@ -1100,7 +1109,10 @@ bool multifd_send_setup(void) 
p->page_size = qemu_target_page_size(); p->page_count = page_count; p->write_flags = 0; - multifd_new_send_channel_create(p); + + if (!multifd_new_send_channel_create(p, &local_err)) { + return false; + } } /* diff --git a/migration/multifd.h b/migration/multifd.h index 1d8bbaf96b..db8887f088 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -227,5 +227,6 @@ static inline void multifd_send_prepare_header(MultiFDSendParams *p) p->iovs_num++; } +void multifd_channel_connect(MultiFDSendParams *p, QIOChannel *ioc); #endif -- Gitee From dd7a7719d250453533fd9e6a6d0bdd2bdef94332 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 29 Feb 2024 12:30:12 -0300 Subject: [PATCH 087/134] migration/multifd: Add incoming QIOChannelFile support commit 2dd7ee7a51bc0a90abe410d6223b297cf2925e4f upstream. On the receiving side we don't need to differentiate between main channel and threads, so whichever channel is defined first gets to be the main one. And since there are no packets, use the atomic channel count to index into the params array. Intel-SIG: commit 2dd7ee7a51bc migration/multifd: Add incoming QIOChannelFile support Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240229153017.2221-19-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/file.c | 35 +++++++++++++++++++++++++++-------- migration/migration.c | 3 ++- migration/multifd.c | 3 +-- 3 files changed, 30 insertions(+), 11 deletions(-) diff --git a/migration/file.c b/migration/file.c index a350dd61f0..2f8b626b27 100644 --- a/migration/file.c +++ b/migration/file.c @@ -8,6 +8,7 @@ #include "qemu/osdep.h" #include "exec/ramblock.h" #include "qemu/cutils.h" +#include "qemu/error-report.h" #include "qapi/error.h" #include "channel.h" #include "file.h" @@ -15,6 +16,7 @@ #include "multifd.h" #include "io/channel-file.h" #include "io/channel-util.h" +#include "options.h" #include "trace.h" #define OFFSET_OPTION ",offset=" @@ -112,7 +114,8 @@ void file_start_incoming_migration(FileMigrationArgs *file_args, Error **errp) g_autofree char *filename = g_strdup(file_args->filename); QIOChannelFile *fioc = NULL; uint64_t offset = file_args->offset; - QIOChannel *ioc; + int channels = 1; + int i = 0; trace_migration_file_incoming(filename); @@ -121,13 +124,29 @@ void file_start_incoming_migration(FileMigrationArgs *file_args, Error **errp) return; } - ioc = QIO_CHANNEL(fioc); - if (offset && qio_channel_io_seek(ioc, offset, SEEK_SET, errp) < 0) { + if (offset && + qio_channel_io_seek(QIO_CHANNEL(fioc), offset, SEEK_SET, errp) < 0) { return; } - qio_channel_set_name(QIO_CHANNEL(ioc), "migration-file-incoming"); - qio_channel_add_watch_full(ioc, G_IO_IN, - file_accept_incoming_migration, - NULL, NULL, - g_main_context_get_thread_default()); + + if (migrate_multifd()) { + channels += migrate_multifd_channels(); + } + + do { + QIOChannel *ioc = QIO_CHANNEL(fioc); + + qio_channel_set_name(ioc, "migration-file-incoming"); + qio_channel_add_watch_full(ioc, G_IO_IN, + file_accept_incoming_migration, + NULL, NULL, + g_main_context_get_thread_default()); + + fioc = qio_channel_file_new_fd(dup(fioc->fd)); + + if (!fioc || fioc->fd == -1) { + error_setg(errp, "Error creating migration incoming channel"); + break; + } + } while (++i < channels); } diff --git a/migration/migration.c b/migration/migration.c index b97a1d2993..dddbe56111 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -857,7 +857,8 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp) uint32_t 
channel_magic = 0; int ret = 0; - if (migrate_multifd() && !migrate_postcopy_ram() && + if (migrate_multifd() && !migrate_mapped_ram() && + !migrate_postcopy_ram() && qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) { /* * With multiple channels, it is possible that we receive channels diff --git a/migration/multifd.c b/migration/multifd.c index d3728343de..da0a0dcfe1 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -1549,8 +1549,7 @@ void multifd_recv_new_channel(QIOChannel *ioc, Error **errp) } trace_multifd_recv_new_channel(id); } else { - /* next patch gives this a meaningful value */ - id = 0; + id = qatomic_read(&multifd_recv_state->count); } p = &multifd_recv_state->params[id]; -- Gitee From 5a3277ffe24ce59975b2c45713684f640316013d Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 29 Feb 2024 12:30:13 -0300 Subject: [PATCH 088/134] migration/multifd: Prepare multifd sync for mapped-ram migration commit 9d01778af8e39f5b0f101d0a9f32215717604bef upstream. The mapped-ram migration can be performed live or non-live, but it is always asynchronous, i.e. the source machine and the destination machine are not migrating at the same time. We only need some pieces of the multifd sync operations. multifd_send_sync_main() ------------------------ Issued by the ram migration code on the migration thread, causes the multifd send channels to synchronize with the migration thread and makes the sending side emit a packet with the MULTIFD_FLUSH flag. With mapped-ram we want to maintain the sync on the sending side because that provides ordering between the rounds of dirty pages when migrating live. MULTIFD_FLUSH ------------- On the receiving side, the presence of the MULTIFD_FLUSH flag on a packet causes the receiving channels to start synchronizing with the main thread. We're not using packets with mapped-ram, so there's no MULTIFD_FLUSH flag and therefore no channel sync on the receiving side. multifd_recv_sync_main() ------------------------ Issued by the migration thread when the ram migration flag RAM_SAVE_FLAG_MULTIFD_FLUSH is received, causes the migration thread on the receiving side to start synchronizing with the recv channels. Due to compatibility, this is also issued when RAM_SAVE_FLAG_EOS is received. For mapped-ram we only need to synchronize the channels at the end of migration to avoid doing cleanup before the channels have finished their IO. Make sure the multifd syncs are only issued at the appropriate times. Note that due to pre-existing backward compatibility issues, we have the multifd_flush_after_each_section property that can cause a sync to happen at EOS. Since the EOS flag is needed on the stream, allow mapped-ram to just ignore it. Also emit an error if any other unexpected flags are found on the stream. 
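Condensed from the ram.c hunks below, the net effect is: the sending side keeps its per-round sync but stops writing the MULTIFD_FLUSH flag, and the receiving side stops treating EOS as a sync point when mapped-ram is in use:

    /* sender, at the end of a dirty-page round */
    if (migrate_multifd() &&
        (!migrate_multifd_flush_after_each_section() ||
         migrate_mapped_ram())) {
        int ret = multifd_send_sync_main();

        if (ret < 0) {
            return ret;
        }

        if (!migrate_mapped_ram()) {
            qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
            qemu_fflush(f);
        }
    }

    /* receiver, RAM_SAVE_FLAG_EOS: ignore it for mapped-ram */
    if (migrate_multifd() &&
        migrate_multifd_flush_after_each_section() &&
        !migrate_mapped_ram()) {
        multifd_recv_sync_main();
    }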
Intel-SIG: commit 9d01778af8e3 migration/multifd: Prepare multifd sync for mapped-ram migration Signed-off-by: Fabiano Rosas Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20240229153017.2221-20-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/ram.c | 38 +++++++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/migration/ram.c b/migration/ram.c index bb7fde4695..bcbfc87bf8 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -1499,14 +1499,18 @@ static int find_dirty_block(RAMState *rs, PageSearchStatus *pss) pss->block = QLIST_NEXT_RCU(pss->block, next); if (!pss->block) { if (migrate_multifd() && - !migrate_multifd_flush_after_each_section()) { + (!migrate_multifd_flush_after_each_section() || + migrate_mapped_ram())) { QEMUFile *f = rs->pss[RAM_CHANNEL_PRECOPY].pss_channel; int ret = multifd_send_sync_main(); if (ret < 0) { return ret; } - qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH); - qemu_fflush(f); + + if (!migrate_mapped_ram()) { + qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH); + qemu_fflush(f); + } } /* * If memory migration starts over, we will meet a dirtied page @@ -3446,7 +3450,8 @@ static int ram_save_setup(QEMUFile *f, void *opaque) return ret; } - if (migrate_multifd() && !migrate_multifd_flush_after_each_section()) { + if (migrate_multifd() && !migrate_multifd_flush_after_each_section() + && !migrate_mapped_ram()) { qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH); } @@ -3577,7 +3582,8 @@ static int ram_save_iterate(QEMUFile *f, void *opaque) out: if (ret >= 0 && migration_is_setup_or_active(migrate_get_current()->state)) { - if (migrate_multifd() && migrate_multifd_flush_after_each_section()) { + if (migrate_multifd() && migrate_multifd_flush_after_each_section() && + !migrate_mapped_ram()) { ret = multifd_send_sync_main(); if (ret < 0) { return ret; @@ -3691,7 +3697,8 @@ static int ram_save_complete(QEMUFile *f, void *opaque) } } - if (migrate_multifd() && !migrate_multifd_flush_after_each_section()) { + if (migrate_multifd() && !migrate_multifd_flush_after_each_section() && + !migrate_mapped_ram()) { qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH); } qemu_put_be64(f, RAM_SAVE_FLAG_EOS); @@ -4495,6 +4502,12 @@ static int ram_load_precopy(QEMUFile *f) invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE; } + if (migrate_mapped_ram()) { + invalid_flags |= (RAM_SAVE_FLAG_HOOK | RAM_SAVE_FLAG_MULTIFD_FLUSH | + RAM_SAVE_FLAG_PAGE | RAM_SAVE_FLAG_XBZRLE | + RAM_SAVE_FLAG_ZERO); + } + while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { ram_addr_t addr; void *host = NULL, *host_bak = NULL; @@ -4516,6 +4529,8 @@ static int ram_load_precopy(QEMUFile *f) addr &= TARGET_PAGE_MASK; if (flags & invalid_flags) { + error_report("Unexpected RAM flags: %d", flags & invalid_flags); + if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) { error_report("Received an unexpected compressed page"); } @@ -4569,6 +4584,10 @@ static int ram_load_precopy(QEMUFile *f) switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { case RAM_SAVE_FLAG_MEM_SIZE: ret = parse_ramblocks(f, addr); + + if (migrate_mapped_ram()) { + multifd_recv_sync_main(); + } break; case RAM_SAVE_FLAG_ZERO: @@ -4609,7 +4628,12 @@ static int ram_load_precopy(QEMUFile *f) case RAM_SAVE_FLAG_EOS: /* normal exit */ if (migrate_multifd() && - migrate_multifd_flush_after_each_section()) { + migrate_multifd_flush_after_each_section() && + /* + * Mapped-ram migration flushes once and for all after + * parsing ramblocks. Always ignore EOS for it. 
+ */ + !migrate_mapped_ram()) { multifd_recv_sync_main(); } break; -- Gitee From 898a38310e983786e410100ecebb7eacb8e67f4e Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 29 Feb 2024 12:30:14 -0300 Subject: [PATCH 089/134] migration/multifd: Support outgoing mapped-ram stream format commit f427d90b9898dd7a718b645eeccd9d0ee75d4295 upstream. The new mapped-ram stream format uses a file transport and puts ram pages in the migration file at their respective offsets and can be done in parallel by using the pwritev system call which takes iovecs and an offset. Add support to enabling the new format along with multifd to make use of the threading and page handling already in place. This requires multifd to stop sending headers and leaving the stream format to the mapped-ram code. When it comes time to write the data, we need to call a version of qio_channel_write that can take an offset. Usage on HMP is: (qemu) stop (qemu) migrate_set_capability multifd on (qemu) migrate_set_capability mapped-ram on (qemu) migrate_set_parameter max-bandwidth 0 (qemu) migrate_set_parameter multifd-channels 8 (qemu) migrate file:migfile Intel-SIG: commit f427d90b9898 migration/multifd: Support outgoing mapped-ram stream format Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240229153017.2221-21-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- include/qemu/bitops.h | 13 +++++++++++ migration/file.c | 54 +++++++++++++++++++++++++++++++++++++++++++ migration/file.h | 2 ++ migration/migration.c | 17 ++++++++++---- migration/multifd.c | 24 +++++++++++++++++-- migration/options.c | 13 ++++++----- migration/ram.c | 17 +++++++++++--- migration/ram.h | 1 + 8 files changed, 125 insertions(+), 16 deletions(-) diff --git a/include/qemu/bitops.h b/include/qemu/bitops.h index cb3526d1f4..2c0a2fe751 100644 --- a/include/qemu/bitops.h +++ b/include/qemu/bitops.h @@ -67,6 +67,19 @@ static inline void clear_bit(long nr, unsigned long *addr) *p &= ~mask; } +/** + * clear_bit_atomic - Clears a bit in memory atomically + * @nr: Bit to clear + * @addr: Address to start counting from + */ +static inline void clear_bit_atomic(long nr, unsigned long *addr) +{ + unsigned long mask = BIT_MASK(nr); + unsigned long *p = addr + BIT_WORD(nr); + + return qatomic_and(p, ~mask); +} + /** * change_bit - Toggle a bit in memory * @nr: Bit to change diff --git a/migration/file.c b/migration/file.c index 2f8b626b27..d949a941d0 100644 --- a/migration/file.c +++ b/migration/file.c @@ -150,3 +150,57 @@ void file_start_incoming_migration(FileMigrationArgs *file_args, Error **errp) } } while (++i < channels); } + +int file_write_ramblock_iov(QIOChannel *ioc, const struct iovec *iov, + int niov, RAMBlock *block, Error **errp) +{ + ssize_t ret = -1; + int i, slice_idx, slice_num; + uintptr_t base, next, offset; + size_t len; + + slice_idx = 0; + slice_num = 1; + + /* + * If the iov array doesn't have contiguous elements, we need to + * split it in slices because we only have one file offset for the + * whole iov. Do this here so callers don't need to break the iov + * array themselves. + */ + for (i = 0; i < niov; i++, slice_num++) { + base = (uintptr_t) iov[i].iov_base; + + if (i != niov - 1) { + len = iov[i].iov_len; + next = (uintptr_t) iov[i + 1].iov_base; + + if (base + len == next) { + continue; + } + } + + /* + * Use the offset of the first element of the segment that + * we're sending. 
+ */ + offset = (uintptr_t) iov[slice_idx].iov_base - (uintptr_t) block->host; + if (offset >= block->used_length) { + error_setg(errp, "offset " RAM_ADDR_FMT + "outside of ramblock %s range", offset, block->idstr); + ret = -1; + break; + } + + ret = qio_channel_pwritev(ioc, &iov[slice_idx], slice_num, + block->pages_offset + offset, errp); + if (ret < 0) { + break; + } + + slice_idx += slice_num; + slice_num = 0; + } + + return (ret < 0) ? ret : 0; +} diff --git a/migration/file.h b/migration/file.h index 4577f9efdd..01a338cac7 100644 --- a/migration/file.h +++ b/migration/file.h @@ -19,4 +19,6 @@ void file_start_outgoing_migration(MigrationState *s, int file_parse_offset(char *filespec, uint64_t *offsetp, Error **errp); void file_cleanup_outgoing_migration(void); bool file_send_channel_create(gpointer opaque, Error **errp); +int file_write_ramblock_iov(QIOChannel *ioc, const struct iovec *iov, + int niov, RAMBlock *block, Error **errp); #endif diff --git a/migration/migration.c b/migration/migration.c index dddbe56111..362720734d 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -132,12 +132,14 @@ static bool transport_supports_multi_channels(MigrationAddress *addr) if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET) { SocketAddress *saddr = &addr->u.socket; - return saddr->type == SOCKET_ADDRESS_TYPE_INET || - saddr->type == SOCKET_ADDRESS_TYPE_UNIX || - saddr->type == SOCKET_ADDRESS_TYPE_VSOCK; + return (saddr->type == SOCKET_ADDRESS_TYPE_INET || + saddr->type == SOCKET_ADDRESS_TYPE_UNIX || + saddr->type == SOCKET_ADDRESS_TYPE_VSOCK); + } else if (addr->transport == MIGRATION_ADDRESS_TYPE_FILE) { + return migrate_mapped_ram(); + } else { + return false; } - - return false; } static bool migration_needs_seekable_channel(void) @@ -1929,6 +1931,11 @@ static bool migrate_prepare(MigrationState *s, bool blk, bool blk_inc, error_setg(errp, "Cannot use TLS with mapped-ram"); return false; } + + if (migrate_multifd_compression()) { + error_setg(errp, "Cannot use compression with mapped-ram"); + return false; + } } if (blk || blk_inc) { diff --git a/migration/multifd.c b/migration/multifd.c index da0a0dcfe1..256b798ad7 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -109,6 +109,17 @@ void multifd_send_channel_created(void) qemu_sem_post(&multifd_send_state->channels_created); } +static void multifd_set_file_bitmap(MultiFDSendParams *p) +{ + MultiFDPages_t *pages = p->pages; + + assert(pages->block); + + for (int i = 0; i < p->pages->num; i++) { + ramblock_set_file_bmap_atomic(pages->block, pages->offset[i]); + } +} + /* Multifd without compression */ /** @@ -170,6 +181,8 @@ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp) if (!multifd_use_packets()) { multifd_send_prepare_iovs(p); + multifd_set_file_bitmap(p); + return 0; } @@ -871,8 +884,15 @@ static void *multifd_send_thread(void *opaque) break; } - ret = qio_channel_writev_full_all(p->c, p->iov, p->iovs_num, NULL, - 0, p->write_flags, &local_err); + if (migrate_mapped_ram()) { + ret = file_write_ramblock_iov(p->c, p->iov, p->iovs_num, + p->pages->block, &local_err); + } else { + ret = qio_channel_writev_full_all(p->c, p->iov, p->iovs_num, + NULL, 0, p->write_flags, + &local_err); + } + if (ret != 0) { break; } diff --git a/migration/options.c b/migration/options.c index 4f8ac90367..f9cc1a2666 100644 --- a/migration/options.c +++ b/migration/options.c @@ -661,12 +661,6 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp) } if (new_caps[MIGRATION_CAPABILITY_MAPPED_RAM]) { - 
if (new_caps[MIGRATION_CAPABILITY_MULTIFD]) { - error_setg(errp, - "Mapped-ram migration is incompatible with multifd"); - return false; - } - if (new_caps[MIGRATION_CAPABILITY_XBZRLE]) { error_setg(errp, "Mapped-ram migration is incompatible with xbzrle"); @@ -1300,6 +1294,13 @@ bool migrate_params_check(MigrationParameters *params, Error **errp) } #endif + if (migrate_mapped_ram() && + (migrate_multifd_compression() || migrate_tls())) { + error_setg(errp, + "Mapped-ram only available for non-compressed non-TLS multifd migration"); + return false; + } + if (params->has_x_vcpu_dirty_limit_period && (params->x_vcpu_dirty_limit_period < 1 || params->x_vcpu_dirty_limit_period > 1000)) { diff --git a/migration/ram.c b/migration/ram.c index bcbfc87bf8..350de46a50 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -1166,7 +1166,7 @@ static int save_zero_page(RAMState *rs, PageSearchStatus *pss, if (migrate_mapped_ram()) { /* zero pages are not transferred with mapped-ram */ - clear_bit(offset >> TARGET_PAGE_BITS, pss->block->file_bmap); + clear_bit_atomic(offset >> TARGET_PAGE_BITS, pss->block->file_bmap); return 1; } @@ -2761,8 +2761,6 @@ static void ram_save_cleanup(void *opaque) block->clear_bmap = NULL; g_free(block->bmap); block->bmap = NULL; - g_free(block->file_bmap); - block->file_bmap = NULL; } xbzrle_cleanup(); @@ -3470,9 +3468,22 @@ static void ram_save_file_bmap(QEMUFile *f) qemu_put_buffer_at(f, (uint8_t *)block->file_bmap, bitmap_size, block->bitmap_offset); ram_transferred_add(bitmap_size); + + /* + * Free the bitmap here to catch any synchronization issues + * with multifd channels. No channels should be sending pages + * after we've written the bitmap to file. + */ + g_free(block->file_bmap); + block->file_bmap = NULL; } } +void ramblock_set_file_bmap_atomic(RAMBlock *block, ram_addr_t offset) +{ + set_bit_atomic(offset >> TARGET_PAGE_BITS, block->file_bmap); +} + /** * ram_save_iterate: iterative stage for migration * diff --git a/migration/ram.h b/migration/ram.h index 9b937a446b..b9ac0da587 100644 --- a/migration/ram.h +++ b/migration/ram.h @@ -75,6 +75,7 @@ bool ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *rb, Error **errp); bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start); void postcopy_preempt_shutdown_file(MigrationState *s); void *postcopy_preempt_thread(void *opaque); +void ramblock_set_file_bmap_atomic(RAMBlock *block, ram_addr_t offset); /* ram cache */ int colo_init_ram_cache(void); -- Gitee From 27298d1804a3a01e6ee059956fd98f07c50f0394 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 29 Feb 2024 12:30:15 -0300 Subject: [PATCH 090/134] migration/multifd: Support incoming mapped-ram stream format commit a49d15a38d3db0aca7e55850c036d1abbc09a0ea upstream. For the incoming mapped-ram migration we need to read the ramblock headers, get the pages bitmap and send the host address of each non-zero page to the multifd channel thread for writing. Usage on HMP is: (qemu) migrate_set_capability multifd on (qemu) migrate_set_capability mapped-ram on (qemu) migrate_incoming file:migfile (the ram.h include needs to move because we've been previously relying on it being included from migration.c. 
Now file.h will start including multifd.h before migration.o is processed) Intel-SIG: commit a49d15a38d3d migration/multifd: Support incoming mapped-ram stream format Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240229153017.2221-22-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/file.c | 18 +++++++++++++++++- migration/file.h | 2 ++ migration/multifd.c | 31 ++++++++++++++++++++++++++++--- migration/multifd.h | 2 ++ migration/ram.c | 26 ++++++++++++++++++++++++-- 5 files changed, 73 insertions(+), 6 deletions(-) diff --git a/migration/file.c b/migration/file.c index d949a941d0..499d2782fe 100644 --- a/migration/file.c +++ b/migration/file.c @@ -13,7 +13,6 @@ #include "channel.h" #include "file.h" #include "migration.h" -#include "multifd.h" #include "io/channel-file.h" #include "io/channel-util.h" #include "options.h" @@ -204,3 +203,20 @@ int file_write_ramblock_iov(QIOChannel *ioc, const struct iovec *iov, return (ret < 0) ? ret : 0; } + +int multifd_file_recv_data(MultiFDRecvParams *p, Error **errp) +{ + MultiFDRecvData *data = p->data; + size_t ret; + + ret = qio_channel_pread(p->c, (char *) data->opaque, + data->size, data->file_offset, errp); + if (ret != data->size) { + error_prepend(errp, + "multifd recv (%u): read 0x%zx, expected 0x%zx", + p->id, ret, data->size); + return -1; + } + + return 0; +} diff --git a/migration/file.h b/migration/file.h index 01a338cac7..9f71e87f74 100644 --- a/migration/file.h +++ b/migration/file.h @@ -11,6 +11,7 @@ #include "qapi/qapi-types-migration.h" #include "io/task.h" #include "channel.h" +#include "multifd.h" void file_start_incoming_migration(FileMigrationArgs *file_args, Error **errp); @@ -21,4 +22,5 @@ void file_cleanup_outgoing_migration(void); bool file_send_channel_create(gpointer opaque, Error **errp); int file_write_ramblock_iov(QIOChannel *ioc, const struct iovec *iov, int niov, RAMBlock *block, Error **errp); +int multifd_file_recv_data(MultiFDRecvParams *p, Error **errp); #endif diff --git a/migration/multifd.c b/migration/multifd.c index 256b798ad7..4ee9d556a3 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -19,7 +19,6 @@ #include "qapi/error.h" #include "qapi/qapi-events-migration.h" #include "file.h" -#include "ram.h" #include "migration.h" #include "migration-stats.h" #include "socket.h" @@ -252,7 +251,7 @@ static int nocomp_recv(MultiFDRecvParams *p, Error **errp) uint32_t flags; if (!multifd_use_packets()) { - return 0; + return multifd_file_recv_data(p, errp); } flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; @@ -1335,22 +1334,48 @@ void multifd_recv_cleanup(void) void multifd_recv_sync_main(void) { int thread_count = migrate_multifd_channels(); + bool file_based = !multifd_use_packets(); int i; - if (!migrate_multifd() || !multifd_use_packets()) { + if (!migrate_multifd()) { return; } + /* + * File-based channels don't use packets and therefore need to + * wait for more work. Release them to start the sync. + */ + if (file_based) { + for (i = 0; i < thread_count; i++) { + MultiFDRecvParams *p = &multifd_recv_state->params[i]; + + trace_multifd_recv_sync_main_signal(p->id); + qemu_sem_post(&p->sem); + } + } + /* * Initiate the synchronization by waiting for all channels. + * * For socket-based migration this means each channel has received * the SYNC packet on the stream. + * + * For file-based migration this means each channel is done with + * the work (pending_job=false). 
*/ for (i = 0; i < thread_count; i++) { trace_multifd_recv_sync_main_wait(i); qemu_sem_wait(&multifd_recv_state->sem_sync); } + if (file_based) { + /* + * For file-based loading is done in one iteration. We're + * done. + */ + return; + } + /* * Sync done. Release the channels for the next iteration. */ diff --git a/migration/multifd.h b/migration/multifd.h index db8887f088..7447c2bea3 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -13,6 +13,8 @@ #ifndef QEMU_MIGRATION_MULTIFD_H #define QEMU_MIGRATION_MULTIFD_H +#include "ram.h" + typedef struct MultiFDRecvData MultiFDRecvData; bool multifd_send_setup(void); diff --git a/migration/ram.c b/migration/ram.c index 350de46a50..13cefb11ea 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -4310,6 +4310,22 @@ void colo_flush_ram_cache(void) trace_colo_flush_ram_cache_end(); } +static size_t ram_load_multifd_pages(void *host_addr, size_t size, + uint64_t offset) +{ + MultiFDRecvData *data = multifd_get_recv_data(); + + data->opaque = host_addr; + data->file_offset = offset; + data->size = size; + + if (!multifd_recv()) { + return 0; + } + + return size; +} + static bool read_ramblock_mapped_ram(QEMUFile *f, RAMBlock *block, long num_pages, unsigned long *bitmap, Error **errp) @@ -4339,8 +4355,14 @@ static bool read_ramblock_mapped_ram(QEMUFile *f, RAMBlock *block, size = MIN(unread, MAPPED_RAM_LOAD_BUF_SIZE); - read = qemu_get_buffer_at(f, host, size, - block->pages_offset + offset); + if (migrate_multifd()) { + read = ram_load_multifd_pages(host, size, + block->pages_offset + offset); + } else { + read = qemu_get_buffer_at(f, host, size, + block->pages_offset + offset); + } + if (!read) { goto err; } -- Gitee From 8fea810e9efbb518cc3899a17090e5c2fd922cca Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 29 Feb 2024 12:30:16 -0300 Subject: [PATCH 091/134] migration/multifd: Add mapped-ram support to fd: URI commit decdc76772c453ff1444612e910caa0d45cd8eac upstream. If we receive a file descriptor that points to a regular file, there's nothing stopping us from doing multifd migration with mapped-ram to that file. Enable the fd: URI to work with multifd + mapped-ram. Note that the fds passed into multifd are duplicated because we want to avoid cross-thread effects when doing cleanup (i.e. close(fd)). The original fd doesn't need to be duplicated because monitor_get_fd() transfers ownership to the caller. 
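Both directions take the same shape: the original fd is kept as-is and each extra multifd channel gets its own dup() of it. Condensed from the fd.c and file.c hunks below:

    /* incoming: one extra file channel per multifd channel */
    if (migrate_multifd()) {
        int channels = migrate_multifd_channels();

        while (channels--) {
            ioc = QIO_CHANNEL(qio_channel_file_new_fd(dup(fd)));

            if (QIO_CHANNEL_FILE(ioc)->fd == -1) {
                error_setg(errp, "Failed to duplicate fd %d", fd);
                return;
            }
            /* ... then register the usual accept handler on ioc ... */
        }
    }

    /* outgoing: multifd send channels also reuse the monitor fd via dup() */
    int fd = fd_args_get_fd();

    if (fd && fd != -1) {
        ioc = qio_channel_file_new_fd(dup(fd));
    }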
Intel-SIG: commit decdc76772c4 migration/multifd: Add mapped-ram support to fd: URI Signed-off-by: Fabiano Rosas Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20240229153017.2221-23-farosas@suse.de Signed-off-by: Peter Xu Conflicts: migration/multifd.c [jz: resolve context conflict] Signed-off-by: Jason Zeng --- migration/fd.c | 44 +++++++++++++++++++++++++++++++++++++++++++ migration/fd.h | 2 ++ migration/file.c | 18 ++++++++++++------ migration/migration.c | 4 ++++ migration/multifd.c | 2 ++ 5 files changed, 64 insertions(+), 6 deletions(-) diff --git a/migration/fd.c b/migration/fd.c index 0eb677dcae..d4ae72d132 100644 --- a/migration/fd.c +++ b/migration/fd.c @@ -15,18 +15,41 @@ */ #include "qemu/osdep.h" +#include "qapi/error.h" #include "channel.h" #include "fd.h" #include "migration.h" #include "monitor/monitor.h" +#include "io/channel-file.h" #include "io/channel-util.h" +#include "options.h" #include "trace.h" +static struct FdOutgoingArgs { + int fd; +} outgoing_args; + +int fd_args_get_fd(void) +{ + return outgoing_args.fd; +} + +void fd_cleanup_outgoing_migration(void) +{ + if (outgoing_args.fd > 0) { + close(outgoing_args.fd); + outgoing_args.fd = -1; + } +} + void fd_start_outgoing_migration(MigrationState *s, const char *fdname, Error **errp) { QIOChannel *ioc; int fd = monitor_get_fd(monitor_cur(), fdname, errp); + + outgoing_args.fd = -1; + if (fd == -1) { return; } @@ -38,6 +61,8 @@ void fd_start_outgoing_migration(MigrationState *s, const char *fdname, Error ** return; } + outgoing_args.fd = fd; + qio_channel_set_name(ioc, "migration-fd-outgoing"); migration_channel_connect(s, ioc, NULL, NULL); object_unref(OBJECT(ioc)); @@ -73,4 +98,23 @@ void fd_start_incoming_migration(const char *fdname, Error **errp) fd_accept_incoming_migration, NULL, NULL, g_main_context_get_thread_default()); + + if (migrate_multifd()) { + int channels = migrate_multifd_channels(); + + while (channels--) { + ioc = QIO_CHANNEL(qio_channel_file_new_fd(dup(fd))); + + if (QIO_CHANNEL_FILE(ioc)->fd == -1) { + error_setg(errp, "Failed to duplicate fd %d", fd); + return; + } + + qio_channel_set_name(ioc, "migration-fd-incoming"); + qio_channel_add_watch_full(ioc, G_IO_IN, + fd_accept_incoming_migration, + NULL, NULL, + g_main_context_get_thread_default()); + } + } } diff --git a/migration/fd.h b/migration/fd.h index b901bc014e..0c0a18d9e7 100644 --- a/migration/fd.h +++ b/migration/fd.h @@ -20,4 +20,6 @@ void fd_start_incoming_migration(const char *fdname, Error **errp); void fd_start_outgoing_migration(MigrationState *s, const char *fdname, Error **errp); +void fd_cleanup_outgoing_migration(void); +int fd_args_get_fd(void); #endif diff --git a/migration/file.c b/migration/file.c index 499d2782fe..164b079966 100644 --- a/migration/file.c +++ b/migration/file.c @@ -11,6 +11,7 @@ #include "qemu/error-report.h" #include "qapi/error.h" #include "channel.h" +#include "fd.h" #include "file.h" #include "migration.h" #include "io/channel-file.h" @@ -53,15 +54,20 @@ bool file_send_channel_create(gpointer opaque, Error **errp) { QIOChannelFile *ioc; int flags = O_WRONLY; - bool ret = true; - - ioc = qio_channel_file_new_path(outgoing_args.fname, flags, 0, errp); - if (!ioc) { - ret = false; - goto out; + bool ret = false; + int fd = fd_args_get_fd(); + + if (fd && fd != -1) { + ioc = qio_channel_file_new_fd(dup(fd)); + } else { + ioc = qio_channel_file_new_path(outgoing_args.fname, flags, 0, errp); + if (!ioc) { + goto out; + } } multifd_channel_connect(opaque, QIO_CHANNEL(ioc)); + ret = true; out: /* 
diff --git a/migration/migration.c b/migration/migration.c index 362720734d..11cd4c7f67 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -132,6 +132,10 @@ static bool transport_supports_multi_channels(MigrationAddress *addr) if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET) { SocketAddress *saddr = &addr->u.socket; + if (saddr->type == SOCKET_ADDRESS_TYPE_FD) { + return migrate_mapped_ram(); + } + return (saddr->type == SOCKET_ADDRESS_TYPE_INET || saddr->type == SOCKET_ADDRESS_TYPE_UNIX || saddr->type == SOCKET_ADDRESS_TYPE_VSOCK); diff --git a/migration/multifd.c b/migration/multifd.c index 4ee9d556a3..b00a0124f6 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -18,6 +18,7 @@ #include "qemu/error-report.h" #include "qapi/error.h" #include "qapi/qapi-events-migration.h" +#include "fd.h" #include "file.h" #include "migration.h" #include "migration-stats.h" @@ -732,6 +733,7 @@ static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) static void multifd_send_cleanup_state(void) { file_cleanup_outgoing_migration(); + fd_cleanup_outgoing_migration(); socket_cleanup_outgoing_migration(); qemu_sem_destroy(&multifd_send_state->channels_created); qemu_sem_destroy(&multifd_send_state->channels_ready); -- Gitee From e605b6b87b7db8995efa9c97054de799bc4f4a07 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 29 Feb 2024 12:30:17 -0300 Subject: [PATCH 092/134] tests/qtest/migration: Add a multifd + mapped-ram migration test commit 7a09f092834641b7a793d50a3a261073bbb404a6 upstream. Intel-SIG: commit 7a09f0928346 tests/qtest/migration: Add a multifd + mapped-ram migration test Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240229153017.2221-24-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- tests/qtest/migration-test.c | 68 ++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c index 4d8f9943a8..7da512725b 100644 --- a/tests/qtest/migration-test.c +++ b/tests/qtest/migration-test.c @@ -2183,6 +2183,46 @@ static void test_precopy_file_mapped_ram(void) test_file_common(&args, true); } +static void *migrate_multifd_mapped_ram_start(QTestState *from, QTestState *to) +{ + migrate_mapped_ram_start(from, to); + + migrate_set_parameter_int(from, "multifd-channels", 4); + migrate_set_parameter_int(to, "multifd-channels", 4); + + migrate_set_capability(from, "multifd", true); + migrate_set_capability(to, "multifd", true); + + return NULL; +} + +static void test_multifd_file_mapped_ram_live(void) +{ + g_autofree char *uri = g_strdup_printf("file:%s/%s", tmpfs, + FILE_TEST_FILENAME); + MigrateCommon args = { + .connect_uri = uri, + .listen_uri = "defer", + .start_hook = migrate_multifd_mapped_ram_start, + }; + + test_file_common(&args, false); +} + +static void test_multifd_file_mapped_ram(void) +{ + g_autofree char *uri = g_strdup_printf("file:%s/%s", tmpfs, + FILE_TEST_FILENAME); + MigrateCommon args = { + .connect_uri = uri, + .listen_uri = "defer", + .start_hook = migrate_multifd_mapped_ram_start, + }; + + test_file_common(&args, true); +} + + static void test_precopy_tcp_plain(void) { MigrateCommon args = { @@ -2459,6 +2499,25 @@ static void test_migrate_precopy_fd_file_mapped_ram(void) }; test_file_common(&args, true); } + +static void *migrate_multifd_fd_mapped_ram_start(QTestState *from, + QTestState *to) +{ + migrate_multifd_mapped_ram_start(from, to); + return migrate_precopy_fd_file_start(from, to); +} 
+ +static void test_multifd_fd_mapped_ram(void) +{ + MigrateCommon args = { + .connect_uri = "fd:fd-mig", + .listen_uri = "defer", + .start_hook = migrate_multifd_fd_mapped_ram_start, + .finish_hook = test_migrate_fd_finish_hook + }; + + test_file_common(&args, true); +} #endif /* _WIN32 */ static void do_test_validate_uuid(MigrateStart *args, bool should_fail) @@ -3491,6 +3550,15 @@ int main(int argc, char **argv) migration_test_add("/migration/precopy/file/mapped-ram/live", test_precopy_file_mapped_ram_live); + migration_test_add("/migration/multifd/file/mapped-ram", + test_multifd_file_mapped_ram); + migration_test_add("/migration/multifd/file/mapped-ram/live", + test_multifd_file_mapped_ram_live); +#ifndef _WIN32 + migration_test_add("/migration/multifd/fd/mapped-ram", + test_multifd_fd_mapped_ram); +#endif + #ifdef CONFIG_GNUTLS migration_test_add("/migration/precopy/unix/tls/psk", test_precopy_unix_tls_psk); -- Gitee From a90df8375a2b0952eba092afaff41f23a99dcc7d Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 1 Mar 2024 17:15:24 +0800 Subject: [PATCH 093/134] migration/multifd: Document two places for mapped-ram commit 1a6e217c35b6dbab10fdc1e02640b8d60b2dc663 upstream. Add two documentations for mapped-ram migration on two spots that may not be extremely clear. Intel-SIG: commit 1a6e217c35b6 migration/multifd: Document two places for mapped-ram Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240301091524.39900-1-peterx@redhat.com Cc: Prasad Pandit [peterx: fix two English errors per Prasad] Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 12 ++++++++++++ migration/ram.c | 8 +++++++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/migration/multifd.c b/migration/multifd.c index b00a0124f6..6db2f0d1c5 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -710,6 +710,18 @@ static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) { if (p->c) { migration_ioc_unregister_yank(p->c); + /* + * An explicit close() on the channel here is normally not + * required, but can be helpful for "file:" iochannels, where it + * will include fdatasync() to make sure the data is flushed to the + * disk backend. + * + * The object_unref() cannot guarantee that because: (1) finalize() + * of the iochannel is only triggered on the last reference, and + * it's not guaranteed that we always hold the last refcount when + * reaching here, and, (2) even if finalize() is invoked, it only + * does a close(fd) without data flush. + */ qio_channel_close(p->c, &error_abort); object_unref(OBJECT(p->c)); p->c = NULL; diff --git a/migration/ram.c b/migration/ram.c index 13cefb11ea..ca1e84553b 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -4617,7 +4617,13 @@ static int ram_load_precopy(QEMUFile *f) switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { case RAM_SAVE_FLAG_MEM_SIZE: ret = parse_ramblocks(f, addr); - + /* + * For mapped-ram migration (to a file) using multifd, we sync + * once and for all here to make sure all tasks we queued to + * multifd threads are completed, so that all the ramblocks + * (including all the guest memory pages within) are fully + * loaded after this sync returns. 
+ */ if (migrate_mapped_ram()) { multifd_recv_sync_main(); } -- Gitee From aa1fd28ab3f250b41ed38e13b0981688cacd0938 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Tue, 5 Mar 2024 16:56:29 -0300 Subject: [PATCH 094/134] migration/multifd: Don't fsync when closing QIOChannelFile MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 61dec060821de5d81a0c5f6b827e4f0282cd92ca upstream. Commit bc38feddeb ("io: fsync before closing a file channel") added a fsync/fdatasync at the closing point of the QIOChannelFile to ensure integrity of the migration stream in case of QEMU crash. The decision to do the sync at qio_channel_close() was not the best since that function runs in the main thread and the fsync can cause QEMU to hang for several minutes, depending on the migration size and disk speed. To fix the hang, remove the fsync from qio_channel_file_close(). At this moment, the migration code is the only user of the fsync and we're taking the tradeoff of not having a sync at all, leaving the responsibility to the upper layers. Intel-SIG: commit 61dec060821d migration/multifd: Don't fsync when closing QIOChannelFile Fixes: bc38feddeb ("io: fsync before closing a file channel") Reviewed-by: "Daniel P. Berrangé" Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240305195629.9922-1-farosas@suse.de Link: https://lore.kernel.org/r/20240305174332.2553-1-farosas@suse.de [peterx: add more comment to the qio_channel_close()] Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- docs/devel/migration/main.rst | 3 ++- io/channel-file.c | 5 ----- migration/multifd.c | 28 +++++++++++++++++++--------- 3 files changed, 21 insertions(+), 15 deletions(-) diff --git a/docs/devel/migration/main.rst b/docs/devel/migration/main.rst index 9f907a85e6..a3f6f2ec9c 100644 --- a/docs/devel/migration/main.rst +++ b/docs/devel/migration/main.rst @@ -44,7 +44,8 @@ over any transport. - file migration: do the migration using a file that is passed to QEMU by path. A file offset option is supported to allow a management application to add its own metadata to the start of the file without - QEMU interference. + QEMU interference. Note that QEMU does not flush cached file + data/metadata at the end of migration. In addition, support is included for migration using RDMA, which transports the page data using ``RDMA``, where the hardware takes care of diff --git a/io/channel-file.c b/io/channel-file.c index d4706fa592..a6ad7770c6 100644 --- a/io/channel-file.c +++ b/io/channel-file.c @@ -242,11 +242,6 @@ static int qio_channel_file_close(QIOChannel *ioc, { QIOChannelFile *fioc = QIO_CHANNEL_FILE(ioc); - if (qemu_fdatasync(fioc->fd) < 0) { - error_setg_errno(errp, errno, - "Unable to synchronize file data with storage device"); - return -1; - } if (qemu_close(fioc->fd) < 0) { error_setg_errno(errp, errno, "Unable to close file"); diff --git a/migration/multifd.c b/migration/multifd.c index 6db2f0d1c5..c5f018e8ab 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -711,16 +711,26 @@ static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) if (p->c) { migration_ioc_unregister_yank(p->c); /* - * An explicit close() on the channel here is normally not - * required, but can be helpful for "file:" iochannels, where it - * will include fdatasync() to make sure the data is flushed to the - * disk backend. 
+ * The object_unref() cannot guarantee the fd will always be + * released because finalize() of the iochannel is only + * triggered on the last reference and it's not guaranteed + * that we always hold the last refcount when reaching here. * - * The object_unref() cannot guarantee that because: (1) finalize() - * of the iochannel is only triggered on the last reference, and - * it's not guaranteed that we always hold the last refcount when - * reaching here, and, (2) even if finalize() is invoked, it only - * does a close(fd) without data flush. + * Closing the fd explicitly has the benefit that if there is any + * registered I/O handler callbacks on such fd, that will get a + * POLLNVAL event and will further trigger the cleanup to finally + * release the IOC. + * + * FIXME: It should logically be guaranteed that all multifd + * channels have no I/O handler callback registered when reaching + * here, because migration thread will wait for all multifd channel + * establishments to complete during setup. Since + * migrate_fd_cleanup() will be scheduled in main thread too, all + * previous callbacks should guarantee to be completed when + * reaching here. See multifd_send_state.channels_created and its + * usage. In the future, we could replace this with an assert + * making sure we're the last reference, or simply drop it if above + * is more clear to be justified. */ qio_channel_close(p->c, &error_abort); object_unref(OBJECT(p->c)); -- Gitee From a97fbe683e57ce79c2d678a8f69799a8e4e2bb5a Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Mon, 11 Mar 2024 18:00:09 +0000 Subject: [PATCH 095/134] migration/multifd: Allow zero pages in file migration commit 44fe138edc0f08b8568fe4ee90be6a2a56c67daf upstream. Currently, it's an error to have no data pages in the multifd file migration because zero page detection is done in the migration thread and zero pages don't reach multifd. This is enforced with the pages->num assert. We're about to add zero page detection on the multifd thread. Fix the file_write_ramblock_iov() to stop considering p->iovs_num=0 an error. Intel-SIG: commit 44fe138edc0f migration/multifd: Allow zero pages in file migration Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240311180015.3359271-2-hao.xiang@linux.dev Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/migration/file.c b/migration/file.c index 164b079966..5075f9526f 100644 --- a/migration/file.c +++ b/migration/file.c @@ -159,7 +159,7 @@ void file_start_incoming_migration(FileMigrationArgs *file_args, Error **errp) int file_write_ramblock_iov(QIOChannel *ioc, const struct iovec *iov, int niov, RAMBlock *block, Error **errp) { - ssize_t ret = -1; + ssize_t ret = 0; int i, slice_idx, slice_num; uintptr_t base, next, offset; size_t len; -- Gitee From fda6a84534c27cb1440fe447c4a56ad83c1a7931 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Mon, 11 Mar 2024 18:00:10 +0000 Subject: [PATCH 096/134] migration/multifd: Allow clearing of the file_bmap from multifd commit c3cdf3fb18e5ecf8a3b4dc7223ddbfc53c418eb8 upstream. We currently only need to clear the mapped-ram file bitmap from the migration thread during save_zero_page. We're about to add support for zero page detection on the multifd thread, so allow ramblock_set_file_bmap_atomic() to also clear the bits. 
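As an illustration of the new interface (a sketch only, not part of the hunk below), a multifd sender that later detects a queued page as zero can now undo the bit it had set:

    /* page will be written to the file: mark it present */
    ramblock_set_file_bmap_atomic(block, offset, true);

    /* page turned out to be zero: drop it from the bitmap again */
    ramblock_set_file_bmap_atomic(block, offset, false);
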
Intel-SIG: commit c3cdf3fb18e5 migration/multifd: Allow clearing of the file_bmap from multifd Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240311180015.3359271-3-hao.xiang@linux.dev Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 2 +- migration/ram.c | 8 ++++++-- migration/ram.h | 3 ++- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index c5f018e8ab..8d10ce5153 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -116,7 +116,7 @@ static void multifd_set_file_bitmap(MultiFDSendParams *p) assert(pages->block); for (int i = 0; i < p->pages->num; i++) { - ramblock_set_file_bmap_atomic(pages->block, pages->offset[i]); + ramblock_set_file_bmap_atomic(pages->block, pages->offset[i], true); } } diff --git a/migration/ram.c b/migration/ram.c index ca1e84553b..4330375be5 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -3479,9 +3479,13 @@ static void ram_save_file_bmap(QEMUFile *f) } } -void ramblock_set_file_bmap_atomic(RAMBlock *block, ram_addr_t offset) +void ramblock_set_file_bmap_atomic(RAMBlock *block, ram_addr_t offset, bool set) { - set_bit_atomic(offset >> TARGET_PAGE_BITS, block->file_bmap); + if (set) { + set_bit_atomic(offset >> TARGET_PAGE_BITS, block->file_bmap); + } else { + clear_bit_atomic(offset >> TARGET_PAGE_BITS, block->file_bmap); + } } /** diff --git a/migration/ram.h b/migration/ram.h index b9ac0da587..08feecaf51 100644 --- a/migration/ram.h +++ b/migration/ram.h @@ -75,7 +75,8 @@ bool ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *rb, Error **errp); bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start); void postcopy_preempt_shutdown_file(MigrationState *s); void *postcopy_preempt_thread(void *opaque); -void ramblock_set_file_bmap_atomic(RAMBlock *block, ram_addr_t offset); +void ramblock_set_file_bmap_atomic(RAMBlock *block, ram_addr_t offset, + bool set); /* ram cache */ int colo_init_ram_cache(void); -- Gitee From 6b364182ba9fe9c8c9afe9e761c88dba23d6b978 Mon Sep 17 00:00:00 2001 From: Hao Xiang Date: Mon, 11 Mar 2024 18:00:11 +0000 Subject: [PATCH 097/134] migration/multifd: Add new migration option zero-page-detection. commit 5fdbb1dfccfd59661c95cae760b8e276c5b8e65c upstream. This new parameter controls where the zero page checking is running. 1. If this parameter is set to 'legacy', zero page checking is done in the migration main thread. 2. If this parameter is set to 'none', zero page checking is disabled. Intel-SIG: commit 5fdbb1dfccfd migration/multifd: Add new migration option zero-page-detection. 
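For a usage illustration (a sketch reusing the qtest helpers that appear later in this series, not part of this patch), the parameter is selected by name on the source side:

    /* keep zero page checking on the migration main thread */
    migrate_set_parameter_str(from, "zero-page-detection", "legacy");

    /* or skip zero page checking altogether */
    migrate_set_parameter_str(from, "zero-page-detection", "none");
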
Signed-off-by: Hao Xiang Reviewed-by: Peter Xu Acked-by: Markus Armbruster Link: https://lore.kernel.org/r/20240311180015.3359271-4-hao.xiang@linux.dev Signed-off-by: Peter Xu Conflicts: hw/core/qdev-properties-system.c include/hw/qdev-properties-system.h migration/options.c qapi/migration.json [jz: resolve context conflicts] Signed-off-by: Jason Zeng --- hw/core/qdev-properties-system.c | 10 ++++++++++ include/hw/qdev-properties-system.h | 4 ++++ migration/migration-hmp-cmds.c | 9 +++++++++ migration/options.c | 21 +++++++++++++++++++++ migration/options.h | 1 + migration/ram.c | 4 ++++ qapi/migration.json | 28 +++++++++++++++++++++++++++- 7 files changed, 76 insertions(+), 1 deletion(-) diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-system.c index c581d46f2e..cad1e04150 100644 --- a/hw/core/qdev-properties-system.c +++ b/hw/core/qdev-properties-system.c @@ -732,6 +732,16 @@ const PropertyInfo qdev_prop_mig_mode = { .set_default_value = qdev_propinfo_set_default_value_enum, }; +const PropertyInfo qdev_prop_zero_page_detection = { + .name = "ZeroPageDetection", + .description = "zero_page_detection values, " + "none,legacy", + .enum_table = &ZeroPageDetection_lookup, + .get = qdev_propinfo_get_enum, + .set = qdev_propinfo_set_enum, + .set_default_value = qdev_propinfo_set_default_value_enum, +}; + /* --- Reserved Region --- */ /* diff --git a/include/hw/qdev-properties-system.h b/include/hw/qdev-properties-system.h index 7cf27e51b9..63dcf69978 100644 --- a/include/hw/qdev-properties-system.h +++ b/include/hw/qdev-properties-system.h @@ -8,6 +8,7 @@ extern const PropertyInfo qdev_prop_macaddr; extern const PropertyInfo qdev_prop_reserved_region; extern const PropertyInfo qdev_prop_multifd_compression; extern const PropertyInfo qdev_prop_mig_mode; +extern const PropertyInfo qdev_prop_zero_page_detection; extern const PropertyInfo qdev_prop_losttickpolicy; extern const PropertyInfo qdev_prop_blockdev_on_error; extern const PropertyInfo qdev_prop_blockdev_retry_interval; @@ -48,6 +49,9 @@ extern const PropertyInfo qdev_prop_cpus390entitlement; #define DEFINE_PROP_MIG_MODE(_n, _s, _f, _d) \ DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_mig_mode, \ MigMode) +#define DEFINE_PROP_ZERO_PAGE_DETECTION(_n, _s, _f, _d) \ + DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_zero_page_detection, \ + ZeroPageDetection) #define DEFINE_PROP_LOSTTICKPOLICY(_n, _s, _f, _d) \ DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_losttickpolicy, \ LostTickPolicy) diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c index aac5e7a73a..71c485edbc 100644 --- a/migration/migration-hmp-cmds.c +++ b/migration/migration-hmp-cmds.c @@ -348,6 +348,11 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict) monitor_printf(mon, "%s: %s\n", MigrationParameter_str(MIGRATION_PARAMETER_MULTIFD_COMPRESSION), MultiFDCompression_str(params->multifd_compression)); + assert(params->has_zero_page_detection); + monitor_printf(mon, "%s: %s\n", + MigrationParameter_str(MIGRATION_PARAMETER_ZERO_PAGE_DETECTION), + qapi_enum_lookup(&ZeroPageDetection_lookup, + params->zero_page_detection)); monitor_printf(mon, "%s: %" PRIu64 " bytes\n", MigrationParameter_str(MIGRATION_PARAMETER_XBZRLE_CACHE_SIZE), params->xbzrle_cache_size); @@ -663,6 +668,10 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) p->has_multifd_zstd_level = true; visit_type_uint8(v, param, &p->multifd_zstd_level, &err); break; + case MIGRATION_PARAMETER_ZERO_PAGE_DETECTION: + p->has_zero_page_detection = true; + 
visit_type_ZeroPageDetection(v, param, &p->zero_page_detection, &err); + break; case MIGRATION_PARAMETER_XBZRLE_CACHE_SIZE: p->has_xbzrle_cache_size = true; if (!visit_type_size(v, param, &cache_size, &err)) { diff --git a/migration/options.c b/migration/options.c index f9cc1a2666..e1b3aa38ea 100644 --- a/migration/options.c +++ b/migration/options.c @@ -183,6 +183,9 @@ Property migration_properties[] = { DEFINE_PROP_MIG_MODE("mode", MigrationState, parameters.mode, MIG_MODE_NORMAL), + DEFINE_PROP_ZERO_PAGE_DETECTION("zero-page-detection", MigrationState, + parameters.zero_page_detection, + ZERO_PAGE_DETECTION_LEGACY), DEFINE_PROP_STRING("sev-pdh", MigrationState, parameters.sev_pdh), DEFINE_PROP_STRING("sev-plat-cert", MigrationState, parameters.sev_plat_cert), DEFINE_PROP_STRING("sev-amd-cert", MigrationState, parameters.sev_amd_cert), @@ -945,6 +948,13 @@ uint64_t migrate_xbzrle_cache_size(void) return s->parameters.xbzrle_cache_size; } +ZeroPageDetection migrate_zero_page_detection(void) +{ + MigrationState *s = migrate_get_current(); + + return s->parameters.zero_page_detection; +} + /* parameter setters */ void migrate_set_block_incremental(bool value) @@ -1060,6 +1070,8 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp) params->vcpu_dirty_limit = s->parameters.vcpu_dirty_limit; params->has_mode = true; params->mode = s->parameters.mode; + params->has_zero_page_detection = true; + params->zero_page_detection = s->parameters.zero_page_detection; return params; } @@ -1097,6 +1109,7 @@ void migrate_params_init(MigrationParameters *params) params->has_x_vcpu_dirty_limit_period = true; params->has_vcpu_dirty_limit = true; params->has_mode = true; + params->has_zero_page_detection = true; params->sev_pdh = g_strdup(""); params->sev_plat_cert = g_strdup(""); @@ -1438,6 +1451,10 @@ static void migrate_params_test_apply(MigrateSetParameters *params, dest->mode = params->mode; } + if (params->has_zero_page_detection) { + dest->zero_page_detection = params->zero_page_detection; + } + if (params->sev_pdh) { assert(params->sev_pdh->type == QTYPE_QSTRING); dest->sev_pdh = params->sev_pdh->u.s; @@ -1599,6 +1616,10 @@ static void migrate_params_apply(MigrateSetParameters *params, Error **errp) s->parameters.mode = params->mode; } + if (params->has_zero_page_detection) { + s->parameters.zero_page_detection = params->zero_page_detection; + } + if (params->sev_pdh) { g_free(s->parameters.sev_pdh); assert(params->sev_pdh->type == QTYPE_QSTRING); diff --git a/migration/options.h b/migration/options.h index fabfadc3b0..bc5c5bfeaf 100644 --- a/migration/options.h +++ b/migration/options.h @@ -95,6 +95,7 @@ const char *migrate_tls_authz(void); const char *migrate_tls_creds(void); const char *migrate_tls_hostname(void); uint64_t migrate_xbzrle_cache_size(void); +ZeroPageDetection migrate_zero_page_detection(void); /* parameters setters */ diff --git a/migration/ram.c b/migration/ram.c index 4330375be5..30b848bd79 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -1158,6 +1158,10 @@ static int save_zero_page(RAMState *rs, PageSearchStatus *pss, QEMUFile *file = pss->pss_channel; int len = 0; + if (migrate_zero_page_detection() == ZERO_PAGE_DETECTION_NONE) { + return 0; + } + if (!buffer_is_zero(p, TARGET_PAGE_SIZE)) { return 0; } diff --git a/qapi/migration.json b/qapi/migration.json index 9e812a70c6..c303ca4452 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -657,6 +657,18 @@ { 'enum': 'MigMode', 'data': [ 'normal', 'cpr-reboot' ] } +## +# @ZeroPageDetection: +# +# @none: Do 
not perform zero page checking. +# +# @legacy: Perform zero page checking in main migration thread. +# +# Since: 9.0 +## +{ 'enum': 'ZeroPageDetection', + 'data': [ 'none', 'legacy' ] } + ## # @BitmapMigrationBitmapAliasTransform: # @@ -895,6 +907,10 @@ # @mode: Migration mode. See description in @MigMode. Default is 'normal'. # (Since 8.2) # +# @zero-page-detection: Whether and how to detect zero pages. +# See description in @ZeroPageDetection. Default is 'legacy'. +# (since 9.0) +# # @sev-pdh: The target host platform diffie-hellman key encoded in base64, or # pdh filename for hygon # (Since 4.2) @@ -941,6 +957,7 @@ { 'name': 'x-vcpu-dirty-limit-period', 'features': ['unstable'] }, 'vcpu-dirty-limit', 'mode', + 'zero-page-detection', 'sev-pdh', 'sev-plat-cert', 'sev-amd-cert'] } ## @@ -1099,6 +1116,10 @@ # @mode: Migration mode. See description in @MigMode. Default is 'normal'. # (Since 8.2) # +# @zero-page-detection: Whether and how to detect zero pages. +# See description in @ZeroPageDetection. Default is 'legacy'. +# (since 9.0) +# # @sev-pdh: The target host platform diffie-hellman key encoded in base64, or # pdh filename for hygon # (Since 4.2) @@ -1167,11 +1188,11 @@ 'features': [ 'unstable' ] }, '*vcpu-dirty-limit': 'uint64', '*mode': 'MigMode', + '*zero-page-detection': 'ZeroPageDetection', '*sev-pdh': 'StrOrNull', '*sev-plat-cert': 'StrOrNull', '*sev-amd-cert' : 'StrOrNull' } } - ## # @migrate-set-parameters: # @@ -1348,6 +1369,10 @@ # @mode: Migration mode. See description in @MigMode. Default is 'normal'. # (Since 8.2) # +# @zero-page-detection: Whether and how to detect zero pages. +# See description in @ZeroPageDetection. Default is 'legacy'. +# (since 9.0) +# # @sev-pdh: The target host platform diffie-hellman key encoded in base64, or # pdh filename for hygon # (Since 4.2) @@ -1412,6 +1437,7 @@ 'features': [ 'unstable' ] }, '*vcpu-dirty-limit': 'uint64', '*mode': 'MigMode', + '*zero-page-detection': 'ZeroPageDetection', '*sev-pdh': 'str', '*sev-plat-cert': 'str', '*sev-amd-cert' : 'str'} } -- Gitee From b35279c1fb30a9e2af8f65c2ce013a8409f20754 Mon Sep 17 00:00:00 2001 From: Hao Xiang Date: Mon, 11 Mar 2024 18:00:12 +0000 Subject: [PATCH 098/134] migration/multifd: Implement zero page transmission on the multifd thread. commit 303e6f54f9657be76ee060006ee2d4cacff263a0 upstream. 1. Add zero_pages field in MultiFDPacket_t. 2. Implements the zero page detection and handling on the multifd threads for non-compression, zlib and zstd compression backends. 3. Added a new value 'multifd' in ZeroPageDetection enumeration. 4. Adds zero page counters and updates multifd send/receive tracing format to track the newly added counters. Intel-SIG: commit 303e6f54f965 migration/multifd: Implement zero page transmission on the multifd thread. 
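As a sketch of the resulting wire format (names taken from the diff below, simplified for illustration), the receiver now unpacks two counters and treats the offset array as two consecutive regions:

    uint32_t normal = be32_to_cpu(packet->normal_pages);
    uint32_t zero   = be32_to_cpu(packet->zero_pages);

    /* offset[0 .. normal) hold normal-page offsets, whose payload follows;
     * offset[normal .. normal + zero) hold zero-page offsets, which carry
     * no payload and are cleared on the destination instead. */
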
Signed-off-by: Hao Xiang Acked-by: Markus Armbruster Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240311180015.3359271-5-hao.xiang@linux.dev Signed-off-by: Peter Xu Conflicts: migration/meson.build [jz: resolve context conflict] Signed-off-by: Jason Zeng --- hw/core/qdev-properties-system.c | 2 +- migration/meson.build | 1 + migration/multifd-zero-page.c | 87 ++++++++++++++++++++++++++++++ migration/multifd-zlib.c | 21 ++++++-- migration/multifd-zstd.c | 20 +++++-- migration/multifd.c | 90 +++++++++++++++++++++++++++----- migration/multifd.h | 23 +++++++- migration/ram.c | 1 - migration/trace-events | 8 +-- qapi/migration.json | 7 ++- 10 files changed, 228 insertions(+), 32 deletions(-) create mode 100644 migration/multifd-zero-page.c diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-system.c index cad1e04150..b3b9238b65 100644 --- a/hw/core/qdev-properties-system.c +++ b/hw/core/qdev-properties-system.c @@ -735,7 +735,7 @@ const PropertyInfo qdev_prop_mig_mode = { const PropertyInfo qdev_prop_zero_page_detection = { .name = "ZeroPageDetection", .description = "zero_page_detection values, " - "none,legacy", + "none,legacy,multifd", .enum_table = &ZeroPageDetection_lookup, .get = qdev_propinfo_get_enum, .set = qdev_propinfo_set_enum, diff --git a/migration/meson.build b/migration/meson.build index d9b46ef0df..d619ebf238 100644 --- a/migration/meson.build +++ b/migration/meson.build @@ -22,6 +22,7 @@ system_ss.add(files( 'migration.c', 'multifd.c', 'multifd-zlib.c', + 'multifd-zero-page.c', 'options.c', 'postcopy-ram.c', 'savevm.c', diff --git a/migration/multifd-zero-page.c b/migration/multifd-zero-page.c new file mode 100644 index 0000000000..1ba38be636 --- /dev/null +++ b/migration/multifd-zero-page.c @@ -0,0 +1,87 @@ +/* + * Multifd zero page detection implementation. + * + * Copyright (c) 2024 Bytedance Inc + * + * Authors: + * Hao Xiang + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qemu/cutils.h" +#include "exec/ramblock.h" +#include "migration.h" +#include "multifd.h" +#include "options.h" +#include "ram.h" + +static bool multifd_zero_page_enabled(void) +{ + return migrate_zero_page_detection() == ZERO_PAGE_DETECTION_MULTIFD; +} + +static void swap_page_offset(ram_addr_t *pages_offset, int a, int b) +{ + ram_addr_t temp; + + if (a == b) { + return; + } + + temp = pages_offset[a]; + pages_offset[a] = pages_offset[b]; + pages_offset[b] = temp; +} + +/** + * multifd_send_zero_page_detect: Perform zero page detection on all pages. + * + * Sorts normal pages before zero pages in p->pages->offset and updates + * p->pages->normal_num. + * + * @param p A pointer to the send params. + */ +void multifd_send_zero_page_detect(MultiFDSendParams *p) +{ + MultiFDPages_t *pages = p->pages; + RAMBlock *rb = pages->block; + int i = 0; + int j = pages->num - 1; + + if (!multifd_zero_page_enabled()) { + pages->normal_num = pages->num; + return; + } + + /* + * Sort the page offset array by moving all normal pages to + * the left and all zero pages to the right of the array. 
+ */ + while (i <= j) { + uint64_t offset = pages->offset[i]; + + if (!buffer_is_zero(rb->host + offset, p->page_size)) { + i++; + continue; + } + + swap_page_offset(pages->offset, i, j); + ram_release_page(rb->idstr, offset); + j--; + } + + pages->normal_num = i; +} + +void multifd_recv_zero_page_process(MultiFDRecvParams *p) +{ + for (int i = 0; i < p->zero_num; i++) { + void *page = p->host + p->zero[i]; + if (!buffer_is_zero(page, p->page_size)) { + memset(page, 0, p->page_size); + } + } +} diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c index 6120faad65..83c0374380 100644 --- a/migration/multifd-zlib.c +++ b/migration/multifd-zlib.c @@ -123,13 +123,15 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) int ret; uint32_t i; - multifd_send_prepare_header(p); + if (!multifd_send_prepare_common(p)) { + goto out; + } - for (i = 0; i < pages->num; i++) { + for (i = 0; i < pages->normal_num; i++) { uint32_t available = z->zbuff_len - out_size; int flush = Z_NO_FLUSH; - if (i == pages->num - 1) { + if (i == pages->normal_num - 1) { flush = Z_SYNC_FLUSH; } @@ -172,10 +174,10 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) p->iov[p->iovs_num].iov_len = out_size; p->iovs_num++; p->next_packet_size = out_size; - p->flags |= MULTIFD_FLAG_ZLIB; +out: + p->flags |= MULTIFD_FLAG_ZLIB; multifd_send_fill_packet(p); - return 0; } @@ -261,6 +263,14 @@ static int zlib_recv(MultiFDRecvParams *p, Error **errp) p->id, flags, MULTIFD_FLAG_ZLIB); return -1; } + + multifd_recv_zero_page_process(p); + + if (!p->normal_num) { + assert(in_size == 0); + return 0; + } + ret = qio_channel_read_all(p->c, (void *)z->zbuff, in_size, errp); if (ret != 0) { @@ -310,6 +320,7 @@ static int zlib_recv(MultiFDRecvParams *p, Error **errp) p->id, out_size, expected_size); return -1; } + return 0; } diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c index cac236833d..02112255ad 100644 --- a/migration/multifd-zstd.c +++ b/migration/multifd-zstd.c @@ -118,16 +118,18 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) int ret; uint32_t i; - multifd_send_prepare_header(p); + if (!multifd_send_prepare_common(p)) { + goto out; + } z->out.dst = z->zbuff; z->out.size = z->zbuff_len; z->out.pos = 0; - for (i = 0; i < pages->num; i++) { + for (i = 0; i < pages->normal_num; i++) { ZSTD_EndDirective flush = ZSTD_e_continue; - if (i == pages->num - 1) { + if (i == pages->normal_num - 1) { flush = ZSTD_e_flush; } z->in.src = p->pages->block->host + pages->offset[i]; @@ -161,10 +163,10 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) p->iov[p->iovs_num].iov_len = z->out.pos; p->iovs_num++; p->next_packet_size = z->out.pos; - p->flags |= MULTIFD_FLAG_ZSTD; +out: + p->flags |= MULTIFD_FLAG_ZSTD; multifd_send_fill_packet(p); - return 0; } @@ -257,6 +259,14 @@ static int zstd_recv(MultiFDRecvParams *p, Error **errp) p->id, flags, MULTIFD_FLAG_ZSTD); return -1; } + + multifd_recv_zero_page_process(p); + + if (!p->normal_num) { + assert(in_size == 0); + return 0; + } + ret = qio_channel_read_all(p->c, (void *)z->zbuff, in_size, errp); if (ret != 0) { diff --git a/migration/multifd.c b/migration/multifd.c index 8d10ce5153..4d38e27695 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -11,6 +11,7 @@ */ #include "qemu/osdep.h" +#include "qemu/cutils.h" #include "qemu/rcu.h" #include "exec/target_page.h" #include "sysemu/sysemu.h" @@ -112,12 +113,17 @@ void multifd_send_channel_created(void) static void 
multifd_set_file_bitmap(MultiFDSendParams *p) { MultiFDPages_t *pages = p->pages; + uint32_t zero_num = p->pages->num - p->pages->normal_num; assert(pages->block); - for (int i = 0; i < p->pages->num; i++) { + for (int i = 0; i < p->pages->normal_num; i++) { ramblock_set_file_bmap_atomic(pages->block, pages->offset[i], true); } + + for (int i = p->pages->num; i < zero_num; i++) { + ramblock_set_file_bmap_atomic(pages->block, pages->offset[i], false); + } } /* Multifd without compression */ @@ -154,13 +160,13 @@ static void multifd_send_prepare_iovs(MultiFDSendParams *p) { MultiFDPages_t *pages = p->pages; - for (int i = 0; i < pages->num; i++) { + for (int i = 0; i < pages->normal_num; i++) { p->iov[p->iovs_num].iov_base = pages->block->host + pages->offset[i]; p->iov[p->iovs_num].iov_len = p->page_size; p->iovs_num++; } - p->next_packet_size = pages->num * p->page_size; + p->next_packet_size = pages->normal_num * p->page_size; } /** @@ -179,6 +185,8 @@ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp) bool use_zero_copy_send = migrate_zero_copy_send(); int ret; + multifd_send_zero_page_detect(p); + if (!multifd_use_packets()) { multifd_send_prepare_iovs(p); multifd_set_file_bitmap(p); @@ -262,6 +270,13 @@ static int nocomp_recv(MultiFDRecvParams *p, Error **errp) p->id, flags, MULTIFD_FLAG_NOCOMP); return -1; } + + multifd_recv_zero_page_process(p); + + if (!p->normal_num) { + return 0; + } + for (int i = 0; i < p->normal_num; i++) { p->iov[i].iov_base = p->host + p->normal[i]; p->iov[i].iov_len = p->page_size; @@ -296,6 +311,7 @@ static void multifd_pages_reset(MultiFDPages_t *pages) * overwritten later when reused. */ pages->num = 0; + pages->normal_num = 0; pages->block = NULL; } @@ -387,11 +403,13 @@ void multifd_send_fill_packet(MultiFDSendParams *p) MultiFDPacket_t *packet = p->packet; MultiFDPages_t *pages = p->pages; uint64_t packet_num; + uint32_t zero_num = pages->num - pages->normal_num; int i; packet->flags = cpu_to_be32(p->flags); packet->pages_alloc = cpu_to_be32(p->pages->allocated); - packet->normal_pages = cpu_to_be32(pages->num); + packet->normal_pages = cpu_to_be32(pages->normal_num); + packet->zero_pages = cpu_to_be32(zero_num); packet->next_packet_size = cpu_to_be32(p->next_packet_size); packet_num = qatomic_fetch_inc(&multifd_send_state->packet_num); @@ -409,10 +427,11 @@ void multifd_send_fill_packet(MultiFDSendParams *p) } p->packets_sent++; - p->total_normal_pages += pages->num; + p->total_normal_pages += pages->normal_num; + p->total_zero_pages += zero_num; - trace_multifd_send(p->id, packet_num, pages->num, p->flags, - p->next_packet_size); + trace_multifd_send(p->id, packet_num, pages->normal_num, zero_num, + p->flags, p->next_packet_size); } static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) @@ -453,20 +472,29 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) p->normal_num = be32_to_cpu(packet->normal_pages); if (p->normal_num > packet->pages_alloc) { error_setg(errp, "multifd: received packet " - "with %u pages and expected maximum pages are %u", + "with %u normal pages and expected maximum pages are %u", p->normal_num, packet->pages_alloc) ; return -1; } + p->zero_num = be32_to_cpu(packet->zero_pages); + if (p->zero_num > packet->pages_alloc - p->normal_num) { + error_setg(errp, "multifd: received packet " + "with %u zero pages and expected maximum zero pages are %u", + p->zero_num, packet->pages_alloc - p->normal_num) ; + return -1; + } + p->next_packet_size = 
be32_to_cpu(packet->next_packet_size); p->packet_num = be64_to_cpu(packet->packet_num); p->packets_recved++; p->total_normal_pages += p->normal_num; + p->total_zero_pages += p->zero_num; - trace_multifd_recv(p->id, p->packet_num, p->normal_num, p->flags, - p->next_packet_size); + trace_multifd_recv(p->id, p->packet_num, p->normal_num, p->zero_num, + p->flags, p->next_packet_size); - if (p->normal_num == 0) { + if (p->normal_num == 0 && p->zero_num == 0) { return 0; } @@ -492,6 +520,18 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) p->normal[i] = offset; } + for (i = 0; i < p->zero_num; i++) { + uint64_t offset = be64_to_cpu(packet->offset[p->normal_num + i]); + + if (offset > (p->block->used_length - p->page_size)) { + error_setg(errp, "multifd: offset too long %" PRIu64 + " (max " RAM_ADDR_FMT ")", + offset, p->block->used_length); + return -1; + } + p->zero[i] = offset; + } + return 0; } @@ -922,6 +962,8 @@ static void *multifd_send_thread(void *opaque) stat64_add(&mig_stats.multifd_bytes, p->next_packet_size + p->packet_len); + stat64_add(&mig_stats.normal_pages, pages->normal_num); + stat64_add(&mig_stats.zero_pages, pages->num - pages->normal_num); multifd_pages_reset(p->pages); p->next_packet_size = 0; @@ -969,7 +1011,8 @@ out: rcu_unregister_thread(); migration_threads_remove(thread); - trace_multifd_send_thread_end(p->id, p->packets_sent, p->total_normal_pages); + trace_multifd_send_thread_end(p->id, p->packets_sent, p->total_normal_pages, + p->total_zero_pages); return NULL; } @@ -1320,6 +1363,8 @@ static void multifd_recv_cleanup_channel(MultiFDRecvParams *p) p->iov = NULL; g_free(p->normal); p->normal = NULL; + g_free(p->zero); + p->zero = NULL; multifd_recv_state->ops->recv_cleanup(p); } @@ -1453,7 +1498,7 @@ static void *multifd_recv_thread(void *opaque) flags = p->flags; /* recv methods don't know how to handle the SYNC flag */ p->flags &= ~MULTIFD_FLAG_SYNC; - has_data = !!p->normal_num; + has_data = p->normal_num || p->zero_num; qemu_mutex_unlock(&p->mutex); } else { /* @@ -1511,7 +1556,9 @@ static void *multifd_recv_thread(void *opaque) } rcu_unregister_thread(); - trace_multifd_recv_thread_end(p->id, p->packets_recved, p->total_normal_pages); + trace_multifd_recv_thread_end(p->id, p->packets_recved, + p->total_normal_pages, + p->total_zero_pages); return NULL; } @@ -1563,6 +1610,7 @@ int multifd_recv_setup(Error **errp) p->name = g_strdup_printf("multifdrecv_%d", i); p->iov = g_new0(struct iovec, page_count); p->normal = g_new0(ram_addr_t, page_count); + p->zero = g_new0(ram_addr_t, page_count); p->page_count = page_count; p->page_size = qemu_target_page_size(); } @@ -1637,3 +1685,17 @@ void multifd_recv_new_channel(QIOChannel *ioc, Error **errp) QEMU_THREAD_JOINABLE); qatomic_inc(&multifd_recv_state->count); } + +bool multifd_send_prepare_common(MultiFDSendParams *p) +{ + multifd_send_zero_page_detect(p); + + if (!p->pages->normal_num) { + p->next_packet_size = 0; + return false; + } + + multifd_send_prepare_header(p); + + return true; +} diff --git a/migration/multifd.h b/migration/multifd.h index 7447c2bea3..c9d9b09239 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -55,14 +55,24 @@ typedef struct { /* size of the next packet that contains pages */ uint32_t next_packet_size; uint64_t packet_num; - uint64_t unused[4]; /* Reserved for future use */ + /* zero pages */ + uint32_t zero_pages; + uint32_t unused32[1]; /* Reserved for future use */ + uint64_t unused64[3]; /* Reserved for future use */ char ramblock[256]; + /* + * 
This array contains the pointers to: + * - normal pages (initial normal_pages entries) + * - zero pages (following zero_pages entries) + */ uint64_t offset[]; } __attribute__((packed)) MultiFDPacket_t; typedef struct { /* number of used pages */ uint32_t num; + /* number of normal pages */ + uint32_t normal_num; /* number of allocated pages */ uint32_t allocated; /* offset of each page */ @@ -136,6 +146,8 @@ typedef struct { uint64_t packets_sent; /* non zero pages sent through this channel */ uint64_t total_normal_pages; + /* zero pages sent through this channel */ + uint64_t total_zero_pages; /* buffers to send */ struct iovec *iov; /* number of iovs used */ @@ -194,12 +206,18 @@ typedef struct { uint8_t *host; /* non zero pages recv through this channel */ uint64_t total_normal_pages; + /* zero pages recv through this channel */ + uint64_t total_zero_pages; /* buffers to recv */ struct iovec *iov; /* Pages that are not zero */ ram_addr_t *normal; /* num of non zero pages */ uint32_t normal_num; + /* Pages that are zero */ + ram_addr_t *zero; + /* num of zero pages */ + uint32_t zero_num; /* used for de-compression methods */ void *compress_data; } MultiFDRecvParams; @@ -221,6 +239,9 @@ typedef struct { void multifd_register_ops(int method, MultiFDMethods *ops); void multifd_send_fill_packet(MultiFDSendParams *p); +bool multifd_send_prepare_common(MultiFDSendParams *p); +void multifd_send_zero_page_detect(MultiFDSendParams *p); +void multifd_recv_zero_page_process(MultiFDRecvParams *p); static inline void multifd_send_prepare_header(MultiFDSendParams *p) { diff --git a/migration/ram.c b/migration/ram.c index 30b848bd79..9347b3ee1a 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -1425,7 +1425,6 @@ static int ram_save_multifd_page(RAMBlock *block, ram_addr_t offset) if (!multifd_queue_page(block, offset)) { return -1; } - stat64_add(&mig_stats.normal_pages, 1); return 1; } diff --git a/migration/trace-events b/migration/trace-events index bf1a069632..f0e1cb80c7 100644 --- a/migration/trace-events +++ b/migration/trace-events @@ -128,21 +128,21 @@ postcopy_preempt_reset_channel(void) "" # multifd.c multifd_new_send_channel_async(uint8_t id) "channel %u" multifd_new_send_channel_async_error(uint8_t id, void *err) "channel=%u err=%p" -multifd_recv(uint8_t id, uint64_t packet_num, uint32_t used, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " pages %u flags 0x%x next packet size %u" +multifd_recv(uint8_t id, uint64_t packet_num, uint32_t normal, uint32_t zero, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " normal pages %u zero pages %u flags 0x%x next packet size %u" multifd_recv_new_channel(uint8_t id) "channel %u" multifd_recv_sync_main(long packet_num) "packet num %ld" multifd_recv_sync_main_signal(uint8_t id) "channel %u" multifd_recv_sync_main_wait(uint8_t id) "iter %u" multifd_recv_terminate_threads(bool error) "error %d" -multifd_recv_thread_end(uint8_t id, uint64_t packets, uint64_t pages) "channel %u packets %" PRIu64 " pages %" PRIu64 +multifd_recv_thread_end(uint8_t id, uint64_t packets, uint64_t normal_pages, uint64_t zero_pages) "channel %u packets %" PRIu64 " normal pages %" PRIu64 " zero pages %" PRIu64 multifd_recv_thread_start(uint8_t id) "%u" -multifd_send(uint8_t id, uint64_t packet_num, uint32_t normal, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " normal pages %u flags 0x%x next packet size %u" +multifd_send(uint8_t id, uint64_t packet_num, uint32_t normal_pages, uint32_t 
zero_pages, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " normal pages %u zero pages %u flags 0x%x next packet size %u" multifd_send_error(uint8_t id) "channel %u" multifd_send_sync_main(long packet_num) "packet num %ld" multifd_send_sync_main_signal(uint8_t id) "channel %u" multifd_send_sync_main_wait(uint8_t id) "channel %u" multifd_send_terminate_threads(void) "" -multifd_send_thread_end(uint8_t id, uint64_t packets, uint64_t normal_pages) "channel %u packets %" PRIu64 " normal pages %" PRIu64 +multifd_send_thread_end(uint8_t id, uint64_t packets, uint64_t normal_pages, uint64_t zero_pages) "channel %u packets %" PRIu64 " normal pages %" PRIu64 " zero pages %" PRIu64 multifd_send_thread_start(uint8_t id) "%u" multifd_tls_outgoing_handshake_start(void *ioc, void *tioc, const char *hostname) "ioc=%p tioc=%p hostname=%s" multifd_tls_outgoing_handshake_error(void *ioc, const char *err) "ioc=%p err=%s" diff --git a/qapi/migration.json b/qapi/migration.json index c303ca4452..abf70f5925 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -664,10 +664,15 @@ # # @legacy: Perform zero page checking in main migration thread. # +# @multifd: Perform zero page checking in multifd sender thread if +# multifd migration is enabled, else in the main migration +# thread as for @legacy. +# # Since: 9.0 +# ## { 'enum': 'ZeroPageDetection', - 'data': [ 'none', 'legacy' ] } + 'data': [ 'none', 'legacy', 'multifd' ] } ## # @BitmapMigrationBitmapAliasTransform: -- Gitee From ed6e7ebbc9a93e407a189f457858999e2358f6ec Mon Sep 17 00:00:00 2001 From: Hao Xiang Date: Mon, 11 Mar 2024 18:00:13 +0000 Subject: [PATCH 099/134] migration/multifd: Implement ram_save_target_page_multifd to handle multifd version of MigrationOps::ram_save_target_page. commit 9ae90f73e623c8b8c7ec1fccd8ca493805df8fbd upstream. 1. Add a dedicated handler for MigrationOps::ram_save_target_page in multifd live migration. 2. Refactor ram_save_target_page_legacy so that the legacy and multifd handlers don't have internal functions calling into each other. Intel-SIG: commit 9ae90f73e623 migration/multifd: Implement ram_save_target_page_multifd to handle multifd version of MigrationOps::ram_save_target_page. Signed-off-by: Hao Xiang Reviewed-by: Fabiano Rosas Message-Id: <20240226195654.934709-4-hao.xiang@bytedance.com> Link: https://lore.kernel.org/r/20240311180015.3359271-6-hao.xiang@linux.dev Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/ram.c | 38 +++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/migration/ram.c b/migration/ram.c index 9347b3ee1a..a31cda3c64 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -2266,7 +2266,6 @@ static bool encrypted_test_list(RAMState *rs, RAMBlock *block, */ static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss) { - RAMBlock *block = pss->block; ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; int res; @@ -2293,17 +2292,33 @@ static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss) return 1; } + return ram_save_page(rs, pss); +} + +/** + * ram_save_target_page_multifd: send one target page to multifd workers + * + * Returns 1 if the page was queued, -1 otherwise. 
+ * + * @rs: current RAM state + * @pss: data about the page we want to send + */ +static int ram_save_target_page_multifd(RAMState *rs, PageSearchStatus *pss) +{ + RAMBlock *block = pss->block; + ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; + /* - * Do not use multifd in postcopy as one whole host page should be - * placed. Meanwhile postcopy requires atomic update of pages, so even - * if host page size == guest page size the dest guest during run may - * still see partially copied pages which is data corruption. + * While using multifd live migration, we still need to handle zero + * page checking on the migration main thread. */ - if (migrate_multifd() && !migration_in_postcopy()) { - return ram_save_multifd_page(block, offset); + if (migrate_zero_page_detection() == ZERO_PAGE_DETECTION_LEGACY) { + if (save_zero_page(rs, pss, offset)) { + return 1; + } } - return ram_save_page(rs, pss); + return ram_save_multifd_page(block, offset); } /* Should be called before sending a host page */ @@ -3442,7 +3457,12 @@ static int ram_save_setup(QEMUFile *f, void *opaque) } migration_ops = g_malloc0(sizeof(MigrationOps)); - migration_ops->ram_save_target_page = ram_save_target_page_legacy; + + if (migrate_multifd()) { + migration_ops->ram_save_target_page = ram_save_target_page_multifd; + } else { + migration_ops->ram_save_target_page = ram_save_target_page_legacy; + } qemu_mutex_unlock_iothread(); ret = multifd_send_sync_main(); -- Gitee From 109951aa484436587e3fb87e1687344f7fd0642f Mon Sep 17 00:00:00 2001 From: Hao Xiang Date: Mon, 11 Mar 2024 18:00:14 +0000 Subject: [PATCH 100/134] migration/multifd: Enable multifd zero page checking by default. commit 70c25c92e602f393d3c33596530c5f2b18491e55 upstream. 1. Set default "zero-page-detection" option to "multifd". Now zero page checking can be done in the multifd threads and this becomes the default configuration. 2. Handle migration QEMU9.0 -> QEMU8.2 compatibility. We provide backward compatibility where zero page checking is done from the migration main thread. Intel-SIG: commit 70c25c92e602 migration/multifd: Enable multifd zero page checking by default. Signed-off-by: Hao Xiang Reviewed-by: Fabiano Rosas Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20240311180015.3359271-7-hao.xiang@linux.dev Signed-off-by: Peter Xu Conflicts: migration/options.c [jz: no compatibility property and resolve context conflict] Signed-off-by: Jason Zeng --- migration/options.c | 2 +- qapi/migration.json | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/migration/options.c b/migration/options.c index e1b3aa38ea..710707acea 100644 --- a/migration/options.c +++ b/migration/options.c @@ -185,7 +185,7 @@ Property migration_properties[] = { MIG_MODE_NORMAL), DEFINE_PROP_ZERO_PAGE_DETECTION("zero-page-detection", MigrationState, parameters.zero_page_detection, - ZERO_PAGE_DETECTION_LEGACY), + ZERO_PAGE_DETECTION_MULTIFD), DEFINE_PROP_STRING("sev-pdh", MigrationState, parameters.sev_pdh), DEFINE_PROP_STRING("sev-plat-cert", MigrationState, parameters.sev_plat_cert), DEFINE_PROP_STRING("sev-amd-cert", MigrationState, parameters.sev_amd_cert), diff --git a/qapi/migration.json b/qapi/migration.json index abf70f5925..f55faadd2f 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -913,7 +913,7 @@ # (Since 8.2) # # @zero-page-detection: Whether and how to detect zero pages. -# See description in @ZeroPageDetection. Default is 'legacy'. +# See description in @ZeroPageDetection. Default is 'multifd'. 
# (since 9.0) # # @sev-pdh: The target host platform diffie-hellman key encoded in base64, or @@ -1122,7 +1122,7 @@ # (Since 8.2) # # @zero-page-detection: Whether and how to detect zero pages. -# See description in @ZeroPageDetection. Default is 'legacy'. +# See description in @ZeroPageDetection. Default is 'multifd'. # (since 9.0) # # @sev-pdh: The target host platform diffie-hellman key encoded in base64, or @@ -1375,7 +1375,7 @@ # (Since 8.2) # # @zero-page-detection: Whether and how to detect zero pages. -# See description in @ZeroPageDetection. Default is 'legacy'. +# See description in @ZeroPageDetection. Default is 'multifd'. # (since 9.0) # # @sev-pdh: The target host platform diffie-hellman key encoded in base64, or -- Gitee From a0ef281001a06f26b4f90d2cd937bbf4f03400e9 Mon Sep 17 00:00:00 2001 From: Hao Xiang Date: Mon, 11 Mar 2024 18:00:15 +0000 Subject: [PATCH 101/134] migration/multifd: Add new migration test cases for legacy zero page checking. commit 1815338df00fd0a3fe25085564c6966f74c8f43d upstream. Now that zero page checking is done on the multifd sender threads by default, we still provide an option for backward compatibility. This change adds a qtest migration test case to set the zero-page-detection option to "legacy" and run multifd migration with zero page checking on the migration main thread. Intel-SIG: commit 1815338df00f migration/multifd: Add new migration test cases for legacy zero page checking. Signed-off-by: Hao Xiang Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20240311180015.3359271-8-hao.xiang@linux.dev Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- tests/qtest/migration-test.c | 52 ++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c index 7da512725b..ce8afde236 100644 --- a/tests/qtest/migration-test.c +++ b/tests/qtest/migration-test.c @@ -2706,6 +2706,24 @@ test_migrate_precopy_tcp_multifd_start(QTestState *from, return test_migrate_precopy_tcp_multifd_start_common(from, to, "none"); } +static void * +test_migrate_precopy_tcp_multifd_start_zero_page_legacy(QTestState *from, + QTestState *to) +{ + test_migrate_precopy_tcp_multifd_start_common(from, to, "none"); + migrate_set_parameter_str(from, "zero-page-detection", "legacy"); + return NULL; +} + +static void * +test_migration_precopy_tcp_multifd_start_no_zero_page(QTestState *from, + QTestState *to) +{ + test_migrate_precopy_tcp_multifd_start_common(from, to, "none"); + migrate_set_parameter_str(from, "zero-page-detection", "none"); + return NULL; +} + static void * test_migrate_precopy_tcp_multifd_zlib_start(QTestState *from, QTestState *to) @@ -2737,6 +2755,36 @@ static void test_multifd_tcp_none(void) test_precopy_common(&args); } +static void test_multifd_tcp_zero_page_legacy(void) +{ + MigrateCommon args = { + .listen_uri = "defer", + .start_hook = test_migrate_precopy_tcp_multifd_start_zero_page_legacy, + /* + * Multifd is more complicated than most of the features, it + * directly takes guest page buffers when sending, make sure + * everything will work alright even if guest page is changing. 
+ */ + .live = true, + }; + test_precopy_common(&args); +} + +static void test_multifd_tcp_no_zero_page(void) +{ + MigrateCommon args = { + .listen_uri = "defer", + .start_hook = test_migration_precopy_tcp_multifd_start_no_zero_page, + /* + * Multifd is more complicated than most of the features, it + * directly takes guest page buffers when sending, make sure + * everything will work alright even if guest page is changing. + */ + .live = true, + }; + test_precopy_common(&args); +} + static void test_multifd_tcp_zlib(void) { MigrateCommon args = { @@ -3644,6 +3692,10 @@ int main(int argc, char **argv) } migration_test_add("/migration/multifd/tcp/plain/none", test_multifd_tcp_none); + migration_test_add("/migration/multifd/tcp/plain/zero-page/legacy", + test_multifd_tcp_zero_page_legacy); + migration_test_add("/migration/multifd/tcp/plain/zero-page/none", + test_multifd_tcp_no_zero_page); migration_test_add("/migration/multifd/tcp/plain/cancel", test_multifd_tcp_cancel); migration_test_add("/migration/multifd/tcp/plain/zlib", -- Gitee From 99bef5741b52d6bd2dd750f102b4dbe2c4984692 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Mon, 11 Mar 2024 20:33:34 -0300 Subject: [PATCH 102/134] io: Introduce qio_channel_file_new_dupfd MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 4760cedc61328e47bf7f1fabceb9937facfa4cdd upstream. Add a new helper function for creating a QIOChannelFile channel with a duplicated file descriptor. This saves the calling code from having to do error checking on the dup() call. Intel-SIG: commit 4760cedc6132 io: Introduce qio_channel_file_new_dupfd Suggested-by: "Daniel P. Berrangé" Signed-off-by: Fabiano Rosas Reviewed-by: "Daniel P. Berrangé" Link: https://lore.kernel.org/r/20240311233335.17299-2-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- include/io/channel-file.h | 18 ++++++++++++++++++ io/channel-file.c | 12 ++++++++++++ 2 files changed, 30 insertions(+) diff --git a/include/io/channel-file.h b/include/io/channel-file.h index 50e8eb1138..d373a4e44d 100644 --- a/include/io/channel-file.h +++ b/include/io/channel-file.h @@ -68,6 +68,24 @@ struct QIOChannelFile { QIOChannelFile * qio_channel_file_new_fd(int fd); +/** + * qio_channel_file_new_dupfd: + * @fd: the file descriptor + * @errp: pointer to initialized error object + * + * Create a new IO channel object for a file represented by the @fd + * parameter. Like qio_channel_file_new_fd(), but the @fd is first + * duplicated with dup(). + * + * The channel will own the duplicated file descriptor and will take + * responsibility for closing it, the original FD is owned by the + * caller. 
+ * + * Returns: the new channel object + */ +QIOChannelFile * +qio_channel_file_new_dupfd(int fd, Error **errp); + /** * qio_channel_file_new_path: * @path: the file path diff --git a/io/channel-file.c b/io/channel-file.c index a6ad7770c6..6436cfb6ae 100644 --- a/io/channel-file.c +++ b/io/channel-file.c @@ -45,6 +45,18 @@ qio_channel_file_new_fd(int fd) return ioc; } +QIOChannelFile * +qio_channel_file_new_dupfd(int fd, Error **errp) +{ + int newfd = dup(fd); + + if (newfd < 0) { + error_setg_errno(errp, errno, "Could not dup FD %d", fd); + return NULL; + } + + return qio_channel_file_new_fd(newfd); +} QIOChannelFile * qio_channel_file_new_path(const char *path, -- Gitee From ce76a3e9abf21655ada6a8507106d8774ec8de3e Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Mon, 11 Mar 2024 20:33:35 -0300 Subject: [PATCH 103/134] migration: Fix error handling after dup in file migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit c827fafcaad3e8b3dcf7eeb5944b03f6b63dfc44 upstream. The file migration code was allowing a possible -1 from a failed call to dup() to propagate into the new QIOFileChannel::fd before checking for validity. Coverity doesn't like that, possibly due to the the lseek(-1, ...) call that would ensue before returning from the channel creation routine. Use the newly introduced qio_channel_file_dupfd() to properly check the return of dup() before proceeding. Intel-SIG: commit c827fafcaad3 migration: Fix error handling after dup in file migration Fixes: CID 1539961 Fixes: CID 1539965 Fixes: CID 1539960 Fixes: 2dd7ee7a51 ("migration/multifd: Add incoming QIOChannelFile support") Fixes: decdc76772 ("migration/multifd: Add mapped-ram support to fd: URI") Reported-by: Peter Maydell Signed-off-by: Fabiano Rosas Reviewed-by: "Daniel P. 
Berrangé" Link: https://lore.kernel.org/r/20240311233335.17299-3-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/fd.c | 9 ++++----- migration/file.c | 14 +++++++------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/migration/fd.c b/migration/fd.c index d4ae72d132..4e2a63a73d 100644 --- a/migration/fd.c +++ b/migration/fd.c @@ -80,6 +80,7 @@ static gboolean fd_accept_incoming_migration(QIOChannel *ioc, void fd_start_incoming_migration(const char *fdname, Error **errp) { QIOChannel *ioc; + QIOChannelFile *fioc; int fd = monitor_fd_param(monitor_cur(), fdname, errp); if (fd == -1) { return; @@ -103,15 +104,13 @@ void fd_start_incoming_migration(const char *fdname, Error **errp) int channels = migrate_multifd_channels(); while (channels--) { - ioc = QIO_CHANNEL(qio_channel_file_new_fd(dup(fd))); - - if (QIO_CHANNEL_FILE(ioc)->fd == -1) { - error_setg(errp, "Failed to duplicate fd %d", fd); + fioc = qio_channel_file_new_dupfd(fd, errp); + if (!fioc) { return; } qio_channel_set_name(ioc, "migration-fd-incoming"); - qio_channel_add_watch_full(ioc, G_IO_IN, + qio_channel_add_watch_full(QIO_CHANNEL(fioc), G_IO_IN, fd_accept_incoming_migration, NULL, NULL, g_main_context_get_thread_default()); diff --git a/migration/file.c b/migration/file.c index 5075f9526f..af3ada7ec0 100644 --- a/migration/file.c +++ b/migration/file.c @@ -58,12 +58,13 @@ bool file_send_channel_create(gpointer opaque, Error **errp) int fd = fd_args_get_fd(); if (fd && fd != -1) { - ioc = qio_channel_file_new_fd(dup(fd)); + ioc = qio_channel_file_new_dupfd(fd, errp); } else { ioc = qio_channel_file_new_path(outgoing_args.fname, flags, 0, errp); - if (!ioc) { - goto out; - } + } + + if (!ioc) { + goto out; } multifd_channel_connect(opaque, QIO_CHANNEL(ioc)); @@ -147,10 +148,9 @@ void file_start_incoming_migration(FileMigrationArgs *file_args, Error **errp) NULL, NULL, g_main_context_get_thread_default()); - fioc = qio_channel_file_new_fd(dup(fioc->fd)); + fioc = qio_channel_file_new_dupfd(fioc->fd, errp); - if (!fioc || fioc->fd == -1) { - error_setg(errp, "Error creating migration incoming channel"); + if (!fioc) { break; } } while (++i < channels); -- Gitee From ce0f9cdfe2c9bf0442abb21d2fbab52c72cac42b Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Wed, 13 Mar 2024 18:28:23 -0300 Subject: [PATCH 104/134] migration: Fix iocs leaks during file and fd migration commit 74228c598f139bd9ce7839794be9a3ccc180fb27 upstream. The memory for the io channels is being leaked in three different ways during file migration: 1) if the offset check fails we never drop the ioc reference; 2) we allocate an extra channel for no reason; 3) if multifd is enabled but channel creation fails when calling dup(), we leave the previous channels around along with the glib polling; Fix all issues by restructuring the code to first allocate the channels and only register the watches when all channels have been created. For multifd, the file and fd migrations can share code because both are backed by a QIOChannelFile. For the non-multifd case, the fd needs to be separate because it is backed by a QIOChannelSocket. 
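The structure of the fix (shown in full in the diff below) follows a common pattern worth spelling out: create every resource first, unwind on partial failure, and only then attach callbacks. A generic sketch, not QEMU code:

    static bool create_all(void *out[], int n,
                           void *(*make)(void), void (*destroy)(void *))
    {
        for (int i = 0; i < n; i++) {
            out[i] = make();
            if (!out[i]) {
                while (i) {
                    destroy(out[--i]);   /* undo what was created so far */
                }
                return false;
            }
        }
        /* the caller registers its watches only after this succeeds */
        return true;
    }
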
Intel-SIG: commit 74228c598f13 migration: Fix iocs leaks during file and fd migration Fixes: 2dd7ee7a51 ("migration/multifd: Add incoming QIOChannelFile support") Fixes: decdc76772 ("migration/multifd: Add mapped-ram support to fd: URI") Reported-by: Peter Xu Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240313212824.16974-2-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/fd.c | 29 +++++++----------------- migration/file.c | 58 ++++++++++++++++++++++++++++++------------------ migration/file.h | 1 + 3 files changed, 46 insertions(+), 42 deletions(-) diff --git a/migration/fd.c b/migration/fd.c index 4e2a63a73d..39a52e5c90 100644 --- a/migration/fd.c +++ b/migration/fd.c @@ -18,6 +18,7 @@ #include "qapi/error.h" #include "channel.h" #include "fd.h" +#include "file.h" #include "migration.h" #include "monitor/monitor.h" #include "io/channel-file.h" @@ -80,7 +81,6 @@ static gboolean fd_accept_incoming_migration(QIOChannel *ioc, void fd_start_incoming_migration(const char *fdname, Error **errp) { QIOChannel *ioc; - QIOChannelFile *fioc; int fd = monitor_fd_param(monitor_cur(), fdname, errp); if (fd == -1) { return; @@ -94,26 +94,13 @@ void fd_start_incoming_migration(const char *fdname, Error **errp) return; } - qio_channel_set_name(ioc, "migration-fd-incoming"); - qio_channel_add_watch_full(ioc, G_IO_IN, - fd_accept_incoming_migration, - NULL, NULL, - g_main_context_get_thread_default()); - if (migrate_multifd()) { - int channels = migrate_multifd_channels(); - - while (channels--) { - fioc = qio_channel_file_new_dupfd(fd, errp); - if (!fioc) { - return; - } - - qio_channel_set_name(ioc, "migration-fd-incoming"); - qio_channel_add_watch_full(QIO_CHANNEL(fioc), G_IO_IN, - fd_accept_incoming_migration, - NULL, NULL, - g_main_context_get_thread_default()); - } + file_create_incoming_channels(ioc, errp); + } else { + qio_channel_set_name(ioc, "migration-fd-incoming"); + qio_channel_add_watch_full(ioc, G_IO_IN, + fd_accept_incoming_migration, + NULL, NULL, + g_main_context_get_thread_default()); } } diff --git a/migration/file.c b/migration/file.c index af3ada7ec0..9aa5576937 100644 --- a/migration/file.c +++ b/migration/file.c @@ -115,13 +115,46 @@ static gboolean file_accept_incoming_migration(QIOChannel *ioc, return G_SOURCE_REMOVE; } +void file_create_incoming_channels(QIOChannel *ioc, Error **errp) +{ + int i, fd, channels = 1; + g_autofree QIOChannel **iocs = NULL; + + if (migrate_multifd()) { + channels += migrate_multifd_channels(); + } + + iocs = g_new0(QIOChannel *, channels); + fd = QIO_CHANNEL_FILE(ioc)->fd; + iocs[0] = ioc; + + for (i = 1; i < channels; i++) { + QIOChannelFile *fioc = qio_channel_file_new_dupfd(fd, errp); + + if (!fioc) { + while (i) { + object_unref(iocs[--i]); + } + return; + } + + iocs[i] = QIO_CHANNEL(fioc); + } + + for (i = 0; i < channels; i++) { + qio_channel_set_name(iocs[i], "migration-file-incoming"); + qio_channel_add_watch_full(iocs[i], G_IO_IN, + file_accept_incoming_migration, + NULL, NULL, + g_main_context_get_thread_default()); + } +} + void file_start_incoming_migration(FileMigrationArgs *file_args, Error **errp) { g_autofree char *filename = g_strdup(file_args->filename); QIOChannelFile *fioc = NULL; uint64_t offset = file_args->offset; - int channels = 1; - int i = 0; trace_migration_file_incoming(filename); @@ -132,28 +165,11 @@ void file_start_incoming_migration(FileMigrationArgs *file_args, Error **errp) if (offset && qio_channel_io_seek(QIO_CHANNEL(fioc), offset, SEEK_SET, errp) < 0) { + 
object_unref(OBJECT(fioc)); return; } - if (migrate_multifd()) { - channels += migrate_multifd_channels(); - } - - do { - QIOChannel *ioc = QIO_CHANNEL(fioc); - - qio_channel_set_name(ioc, "migration-file-incoming"); - qio_channel_add_watch_full(ioc, G_IO_IN, - file_accept_incoming_migration, - NULL, NULL, - g_main_context_get_thread_default()); - - fioc = qio_channel_file_new_dupfd(fioc->fd, errp); - - if (!fioc) { - break; - } - } while (++i < channels); + file_create_incoming_channels(QIO_CHANNEL(fioc), errp); } int file_write_ramblock_iov(QIOChannel *ioc, const struct iovec *iov, diff --git a/migration/file.h b/migration/file.h index 9f71e87f74..7699c04677 100644 --- a/migration/file.h +++ b/migration/file.h @@ -20,6 +20,7 @@ void file_start_outgoing_migration(MigrationState *s, int file_parse_offset(char *filespec, uint64_t *offsetp, Error **errp); void file_cleanup_outgoing_migration(void); bool file_send_channel_create(gpointer opaque, Error **errp); +void file_create_incoming_channels(QIOChannel *ioc, Error **errp); int file_write_ramblock_iov(QIOChannel *ioc, const struct iovec *iov, int niov, RAMBlock *block, Error **errp); int multifd_file_recv_data(MultiFDRecvParams *p, Error **errp); -- Gitee From 90f1aed4a4c5a4fcff66e0ddfe7b6a3a867fc854 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Fri, 15 Mar 2024 00:20:38 -0300 Subject: [PATCH 105/134] migration/multifd: Ensure we're not given a socket for file migration commit 73f6f9a12fb4a3afe01e18690ebd6a6e4283c1a6 upstream. When doing migration using the fd: URI, QEMU will fetch the file descriptor passed in via the monitor at fd_start_outgoing|incoming_migration(), which means the checks at migration_channels_and_transport_compatible() happen too soon and we don't know at that point whether the FD refers to a plain file or a socket. For this reason, we've been allowing a migration channel of type SOCKET_ADDRESS_TYPE_FD to pass the initial verifications in scenarios where the socket migration is not supported, such as with fd + multifd. The commit decdc76772 ("migration/multifd: Add mapped-ram support to fd: URI") was supposed to add a second check prior to starting migration to make sure a socket fd is not passed instead of a file fd, but failed to do so. Add the missing verification and update the comment explaining this situation which is currently incorrect. 
Intel-SIG: commit 73f6f9a12fb4 migration/multifd: Ensure we're not given a socket for file migration Fixes: decdc76772 ("migration/multifd: Add mapped-ram support to fd: URI") Signed-off-by: Fabiano Rosas Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20240315032040.7974-2-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/fd.c | 8 ++++++++ migration/file.c | 7 +++++++ migration/migration.c | 6 +++--- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/migration/fd.c b/migration/fd.c index 39a52e5c90..c07030f715 100644 --- a/migration/fd.c +++ b/migration/fd.c @@ -22,6 +22,7 @@ #include "migration.h" #include "monitor/monitor.h" #include "io/channel-file.h" +#include "io/channel-socket.h" #include "io/channel-util.h" #include "options.h" #include "trace.h" @@ -95,6 +96,13 @@ void fd_start_incoming_migration(const char *fdname, Error **errp) } if (migrate_multifd()) { + if (fd_is_socket(fd)) { + error_setg(errp, + "Multifd migration to a socket FD is not supported"); + object_unref(ioc); + return; + } + file_create_incoming_channels(ioc, errp); } else { qio_channel_set_name(ioc, "migration-fd-incoming"); diff --git a/migration/file.c b/migration/file.c index 9aa5576937..579e0df6de 100644 --- a/migration/file.c +++ b/migration/file.c @@ -15,6 +15,7 @@ #include "file.h" #include "migration.h" #include "io/channel-file.h" +#include "io/channel-socket.h" #include "io/channel-util.h" #include "options.h" #include "trace.h" @@ -58,6 +59,12 @@ bool file_send_channel_create(gpointer opaque, Error **errp) int fd = fd_args_get_fd(); if (fd && fd != -1) { + if (fd_is_socket(fd)) { + error_setg(errp, + "Multifd migration to a socket FD is not supported"); + goto out; + } + ioc = qio_channel_file_new_dupfd(fd, errp); } else { ioc = qio_channel_file_new_path(outgoing_args.fname, flags, 0, errp); diff --git a/migration/migration.c b/migration/migration.c index 11cd4c7f67..c5ba446f50 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -158,9 +158,9 @@ static bool transport_supports_seeking(MigrationAddress *addr) } /* - * At this point, the user might not yet have passed the file - * descriptor to QEMU, so we cannot know for sure whether it - * refers to a plain file or a socket. Let it through anyway. + * At this point QEMU has not yet fetched the fd passed in by the + * user, so we cannot know for sure whether it refers to a plain + * file or a socket. Let it through anyway and check at fd.c. */ if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET) { return addr->u.socket.type == SOCKET_ADDRESS_TYPE_FD; -- Gitee From 205874e67677b4a8f7b6b4aa0bc9900a9306e5c4 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Fri, 15 Mar 2024 00:20:39 -0300 Subject: [PATCH 106/134] migration/multifd: Duplicate the fd for the outgoing_args commit 9adfb308c1513562d6acec02aa780c5ef9b0193d upstream. We currently store the file descriptor used during the main outgoing channel creation to use it again when creating the multifd channels. Since this fd is used for the first iochannel, there's risk that the QIOChannel gets freed and the fd closed while outgoing_args.fd still has it available. This could lead to an fd-reuse bug. Duplicate the outgoing_args fd to avoid this issue. 
Intel-SIG: commit 9adfb308c151 migration/multifd: Duplicate the fd for the outgoing_args Suggested-by: Peter Xu Signed-off-by: Fabiano Rosas Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20240315032040.7974-3-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/fd.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/migration/fd.c b/migration/fd.c index c07030f715..fe0d096abd 100644 --- a/migration/fd.c +++ b/migration/fd.c @@ -49,8 +49,7 @@ void fd_start_outgoing_migration(MigrationState *s, const char *fdname, Error ** { QIOChannel *ioc; int fd = monitor_get_fd(monitor_cur(), fdname, errp); - - outgoing_args.fd = -1; + int newfd; if (fd == -1) { return; @@ -63,7 +62,17 @@ void fd_start_outgoing_migration(MigrationState *s, const char *fdname, Error ** return; } - outgoing_args.fd = fd; + /* + * This is dup()ed just to avoid referencing an fd that might + * be already closed by the iochannel. + */ + newfd = dup(fd); + if (newfd == -1) { + error_setg_errno(errp, errno, "Could not dup FD %d", fd); + object_unref(ioc); + return; + } + outgoing_args.fd = newfd; qio_channel_set_name(ioc, "migration-fd-outgoing"); migration_channel_connect(s, ioc, NULL, NULL); -- Gitee From 03ce8adfce9d4f5af528a5e39b960255763c273c Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Tue, 19 Mar 2024 18:09:41 -0300 Subject: [PATCH 107/134] migration: Revert mapped-ram multifd support to fd: URI commit bd4480b0d02ee760b51aa82cc0915174753716ee upstream. This reverts commit decdc76772c453ff1444612e910caa0d45cd8eac in full and also the relevant migration-tests from 7a09f092834641b7a793d50a3a261073bbb404a6. After the addition of the new QAPI-based migration address API in 8.2 we've been converting an "fd:" URI into a SocketAddress, missing the fact that the "fd:" syntax could also be used for a plain file instead of a socket. This is a problem because the SocketAddress is part of the API, so we're effectively asking users to create a "socket" channel to pass in a plain file. The easiest way to fix this situation is to deprecate the usage of both SocketAddress and "fd:" when used with a plain file for migration. Since this has been possible since 8.2, we can wait until 9.1 to deprecate it. For 9.0, however, we should avoid adding further support to migration to a plain file using the old "fd:" syntax or the new SocketAddress API, and instead require the usage of either the old-style "file:" URI or the FileMigrationArgs::filename field of the new API with the "/dev/fdset/NN" syntax, both of which are already supported. 
Intel-SIG: commit bd4480b0d02e migration: Revert mapped-ram multifd support to fd: URI Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240319210941.1907-1-farosas@suse.de Signed-off-by: Peter Xu Conflicts: migration/multifd.c [jz: resolve context conflict] Signed-off-by: Jason Zeng --- migration/fd.c | 56 ++++-------------------------------- migration/fd.h | 2 -- migration/file.c | 19 ++---------- migration/migration.c | 13 --------- migration/multifd.c | 2 -- tests/qtest/migration-test.c | 43 --------------------------- 6 files changed, 8 insertions(+), 127 deletions(-) diff --git a/migration/fd.c b/migration/fd.c index fe0d096abd..449adaa2de 100644 --- a/migration/fd.c +++ b/migration/fd.c @@ -15,42 +15,19 @@ */ #include "qemu/osdep.h" -#include "qapi/error.h" #include "channel.h" #include "fd.h" #include "file.h" #include "migration.h" #include "monitor/monitor.h" -#include "io/channel-file.h" -#include "io/channel-socket.h" #include "io/channel-util.h" -#include "options.h" #include "trace.h" -static struct FdOutgoingArgs { - int fd; -} outgoing_args; - -int fd_args_get_fd(void) -{ - return outgoing_args.fd; -} - -void fd_cleanup_outgoing_migration(void) -{ - if (outgoing_args.fd > 0) { - close(outgoing_args.fd); - outgoing_args.fd = -1; - } -} - void fd_start_outgoing_migration(MigrationState *s, const char *fdname, Error **errp) { QIOChannel *ioc; int fd = monitor_get_fd(monitor_cur(), fdname, errp); - int newfd; - if (fd == -1) { return; } @@ -62,18 +39,6 @@ void fd_start_outgoing_migration(MigrationState *s, const char *fdname, Error ** return; } - /* - * This is dup()ed just to avoid referencing an fd that might - * be already closed by the iochannel. - */ - newfd = dup(fd); - if (newfd == -1) { - error_setg_errno(errp, errno, "Could not dup FD %d", fd); - object_unref(ioc); - return; - } - outgoing_args.fd = newfd; - qio_channel_set_name(ioc, "migration-fd-outgoing"); migration_channel_connect(s, ioc, NULL, NULL); object_unref(OBJECT(ioc)); @@ -104,20 +69,9 @@ void fd_start_incoming_migration(const char *fdname, Error **errp) return; } - if (migrate_multifd()) { - if (fd_is_socket(fd)) { - error_setg(errp, - "Multifd migration to a socket FD is not supported"); - object_unref(ioc); - return; - } - - file_create_incoming_channels(ioc, errp); - } else { - qio_channel_set_name(ioc, "migration-fd-incoming"); - qio_channel_add_watch_full(ioc, G_IO_IN, - fd_accept_incoming_migration, - NULL, NULL, - g_main_context_get_thread_default()); - } + qio_channel_set_name(ioc, "migration-fd-incoming"); + qio_channel_add_watch_full(ioc, G_IO_IN, + fd_accept_incoming_migration, + NULL, NULL, + g_main_context_get_thread_default()); } diff --git a/migration/fd.h b/migration/fd.h index 0c0a18d9e7..b901bc014e 100644 --- a/migration/fd.h +++ b/migration/fd.h @@ -20,6 +20,4 @@ void fd_start_incoming_migration(const char *fdname, Error **errp); void fd_start_outgoing_migration(MigrationState *s, const char *fdname, Error **errp); -void fd_cleanup_outgoing_migration(void); -int fd_args_get_fd(void); #endif diff --git a/migration/file.c b/migration/file.c index 579e0df6de..8bd7d7eeba 100644 --- a/migration/file.c +++ b/migration/file.c @@ -11,7 +11,6 @@ #include "qemu/error-report.h" #include "qapi/error.h" #include "channel.h" -#include "fd.h" #include "file.h" #include "migration.h" #include "io/channel-file.h" @@ -55,27 +54,15 @@ bool file_send_channel_create(gpointer opaque, Error **errp) { QIOChannelFile *ioc; int flags = O_WRONLY; - bool ret = false; - int fd = fd_args_get_fd(); - - 
if (fd && fd != -1) { - if (fd_is_socket(fd)) { - error_setg(errp, - "Multifd migration to a socket FD is not supported"); - goto out; - } - - ioc = qio_channel_file_new_dupfd(fd, errp); - } else { - ioc = qio_channel_file_new_path(outgoing_args.fname, flags, 0, errp); - } + bool ret = true; + ioc = qio_channel_file_new_path(outgoing_args.fname, flags, 0, errp); if (!ioc) { + ret = false; goto out; } multifd_channel_connect(opaque, QIO_CHANNEL(ioc)); - ret = true; out: /* diff --git a/migration/migration.c b/migration/migration.c index c5ba446f50..cd22d15a34 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -132,10 +132,6 @@ static bool transport_supports_multi_channels(MigrationAddress *addr) if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET) { SocketAddress *saddr = &addr->u.socket; - if (saddr->type == SOCKET_ADDRESS_TYPE_FD) { - return migrate_mapped_ram(); - } - return (saddr->type == SOCKET_ADDRESS_TYPE_INET || saddr->type == SOCKET_ADDRESS_TYPE_UNIX || saddr->type == SOCKET_ADDRESS_TYPE_VSOCK); @@ -157,15 +153,6 @@ static bool transport_supports_seeking(MigrationAddress *addr) return true; } - /* - * At this point QEMU has not yet fetched the fd passed in by the - * user, so we cannot know for sure whether it refers to a plain - * file or a socket. Let it through anyway and check at fd.c. - */ - if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET) { - return addr->u.socket.type == SOCKET_ADDRESS_TYPE_FD; - } - return false; } diff --git a/migration/multifd.c b/migration/multifd.c index 4d38e27695..b64c3c106e 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -19,7 +19,6 @@ #include "qemu/error-report.h" #include "qapi/error.h" #include "qapi/qapi-events-migration.h" -#include "fd.h" #include "file.h" #include "migration.h" #include "migration-stats.h" @@ -795,7 +794,6 @@ static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) static void multifd_send_cleanup_state(void) { file_cleanup_outgoing_migration(); - fd_cleanup_outgoing_migration(); socket_cleanup_outgoing_migration(); qemu_sem_destroy(&multifd_send_state->channels_created); qemu_sem_destroy(&multifd_send_state->channels_ready); diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c index ce8afde236..9149469d37 100644 --- a/tests/qtest/migration-test.c +++ b/tests/qtest/migration-test.c @@ -2471,13 +2471,6 @@ static void *migrate_precopy_fd_file_start(QTestState *from, QTestState *to) return NULL; } -static void *migrate_fd_file_mapped_ram_start(QTestState *from, QTestState *to) -{ - migrate_mapped_ram_start(from, to); - - return migrate_precopy_fd_file_start(from, to); -} - static void test_migrate_precopy_fd_file(void) { MigrateCommon args = { @@ -2488,36 +2481,6 @@ static void test_migrate_precopy_fd_file(void) }; test_file_common(&args, true); } - -static void test_migrate_precopy_fd_file_mapped_ram(void) -{ - MigrateCommon args = { - .listen_uri = "defer", - .connect_uri = "fd:fd-mig", - .start_hook = migrate_fd_file_mapped_ram_start, - .finish_hook = test_migrate_fd_finish_hook - }; - test_file_common(&args, true); -} - -static void *migrate_multifd_fd_mapped_ram_start(QTestState *from, - QTestState *to) -{ - migrate_multifd_mapped_ram_start(from, to); - return migrate_precopy_fd_file_start(from, to); -} - -static void test_multifd_fd_mapped_ram(void) -{ - MigrateCommon args = { - .connect_uri = "fd:fd-mig", - .listen_uri = "defer", - .start_hook = migrate_multifd_fd_mapped_ram_start, - .finish_hook = test_migrate_fd_finish_hook - }; - - 
test_file_common(&args, true); -} #endif /* _WIN32 */ static void do_test_validate_uuid(MigrateStart *args, bool should_fail) @@ -3602,10 +3565,6 @@ int main(int argc, char **argv) test_multifd_file_mapped_ram); migration_test_add("/migration/multifd/file/mapped-ram/live", test_multifd_file_mapped_ram_live); -#ifndef _WIN32 - migration_test_add("/migration/multifd/fd/mapped-ram", - test_multifd_fd_mapped_ram); -#endif #ifdef CONFIG_GNUTLS migration_test_add("/migration/precopy/unix/tls/psk", @@ -3668,8 +3627,6 @@ int main(int argc, char **argv) test_migrate_precopy_fd_socket); migration_test_add("/migration/precopy/fd/file", test_migrate_precopy_fd_file); - migration_test_add("/migration/precopy/fd/file/mapped-ram", - test_migrate_precopy_fd_file_mapped_ram); #endif migration_test_add("/migration/validate_uuid", test_validate_uuid); migration_test_add("/migration/validate_uuid_error", -- Gitee From fc0be897adbd783bb796a4ca179cdec0fd0160a5 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 20 Mar 2024 17:44:53 -0400 Subject: [PATCH 108/134] migration/postcopy: Fix high frequency sync commit 910c1647364f0c8c1f680f260a2b7d52587bc38e upstream. With current code base I can observe extremely high sync count during precopy, as long as one enables postcopy-ram=on before switchover to postcopy. To provide some context of when QEMU decides to do a full sync: it checks must_precopy (which implies "data must be sent during precopy phase"), and as long as it is lower than the threshold size we calculated (out of bandwidth and expected downtime) QEMU will kick off the slow/exact sync. However, when postcopy is enabled (even if still during precopy phase), RAM only reports all pages as can_postcopy, and report must_precopy==0. Then "must_precopy <= threshold_size" mostly always triggers and enforces a slow sync for every call to migration_iteration_run() when postcopy is enabled even if not used. That is insane. It turns out it was a regress bug introduced in the previous refactoring in 8.0 as reported by Nina [1]: (a) c8df4a7aef ("migration: Split save_live_pending() into state_pending_*") Then a workaround patch is applied at the end of release (8.0-rc4) to fix it: (b) 28ef5339c3 ("migration: fix ram_state_pending_exact()") However that "workaround" was overlooked when during the cleanup in this 9.0 release in this commit.. (c) b0504edd40 ("migration: Drop unnecessary check in ram's pending_exact()") Then the issue was re-exposed as reported by Nina [1]. The problem with (b) is that it only fixed the case for RAM, rather than all the rest of iterators. Here a slow sync should only be required if all dirty data (precopy+postcopy) is less than the threshold_size that QEMU calculated. It is even debatable whether a sync is needed when switched to postcopy. Currently ram_state_pending_exact() will be mostly noop if switched to postcopy, and that logic seems to apply too for all the rest of iterators, as sync dirty bitmap during a postcopy doesn't make much sense. However let's leave such change for later, as we're in rc phase. So rather than reusing commit (b), this patch provides the complete fix for all iterators. When at it, cleanup a little bit on the lines around. 
[1] https://gitlab.com/qemu-project/qemu/-/issues/1565

Intel-SIG: commit 910c1647364f migration/postcopy: Fix high frequency sync
Reported-by: Nina Schoetterl-Glausch
Fixes: b0504edd40 ("migration: Drop unnecessary check in ram's pending_exact()")
Reviewed-by: Fabiano Rosas
Link: https://lore.kernel.org/r/20240320214453.584374-1-peterx@redhat.com
Signed-off-by: Peter Xu
Signed-off-by: Jason Zeng
---
 migration/migration.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index cd22d15a34..8e01355b9b 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -3111,17 +3111,16 @@ typedef enum {
  */
 static MigIterateState migration_iteration_run(MigrationState *s)
 {
-    uint64_t must_precopy, can_postcopy;
+    uint64_t must_precopy, can_postcopy, pending_size;
     Error *local_err = NULL;
     bool in_postcopy = s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE;
     bool can_switchover = migration_can_switchover(s);
 
     qemu_savevm_state_pending_estimate(&must_precopy, &can_postcopy);
-    uint64_t pending_size = must_precopy + can_postcopy;
-
+    pending_size = must_precopy + can_postcopy;
     trace_migrate_pending_estimate(pending_size, must_precopy, can_postcopy);
 
-    if (must_precopy <= s->threshold_size) {
+    if (pending_size < s->threshold_size) {
         qemu_savevm_state_pending_exact(&must_precopy, &can_postcopy);
         pending_size = must_precopy + can_postcopy;
         trace_migrate_pending_exact(pending_size, must_precopy, can_postcopy);
--
Gitee

From 38efeb27a9500ce0a4636b6b7c27cf182db6637d Mon Sep 17 00:00:00 2001
From: Fabiano Rosas
Date: Thu, 21 Mar 2024 17:12:42 -0300
Subject: [PATCH 109/134] migration/multifd: Fix clearing of mapped-ram zero pages

commit 8fa1a21c6edc2bf7de85984944848ab9ac49e937 upstream.

When the zero page detection is done in the multifd threads, we need
to iterate the second part of the pages->offset array and clear the
file bitmap for each zero page. The piece of code we merged to do that
is wrong.

The reason this has passed all the tests is because the bitmap is
initialized with zeroes already, so clearing the bits only really has
an effect during live migration and when a data page goes from having
data to no data.
Intel-SIG: commit 8fa1a21c6edc migration/multifd: Fix clearing of mapped-ram zero pages
Fixes: 303e6f54f9 ("migration/multifd: Implement zero page transmission on the multifd thread.")
Signed-off-by: Fabiano Rosas
Link: https://lore.kernel.org/r/20240321201242.6009-1-farosas@suse.de
Signed-off-by: Peter Xu
Signed-off-by: Jason Zeng
---
 migration/multifd.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/migration/multifd.c b/migration/multifd.c
index b64c3c106e..b089a45c96 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -112,7 +112,6 @@ void multifd_send_channel_created(void)
 static void multifd_set_file_bitmap(MultiFDSendParams *p)
 {
     MultiFDPages_t *pages = p->pages;
-    uint32_t zero_num = p->pages->num - p->pages->normal_num;
 
     assert(pages->block);
 
@@ -120,7 +119,7 @@ static void multifd_set_file_bitmap(MultiFDSendParams *p)
         ramblock_set_file_bmap_atomic(pages->block, pages->offset[i], true);
     }
 
-    for (int i = p->pages->num; i < zero_num; i++) {
+    for (int i = p->pages->normal_num; i < p->pages->num; i++) {
         ramblock_set_file_bmap_atomic(pages->block, pages->offset[i], false);
     }
 }
--
Gitee

From da29ef5eab5e314d71e978305e9843eb7ee4858a Mon Sep 17 00:00:00 2001
From: Yuan Liu
Date: Mon, 1 Apr 2024 23:41:10 +0800
Subject: [PATCH 110/134] migration/multifd: solve zero page causing multiple page faults

commit 5ef7e26bdb7eda10d6d5e1b77121be9945e5e550 upstream.

Implemented recvbitmap tracking of received pages in multifd.

If a zero page appears in the recvbitmap for the first time, the page
data is not touched and its bit in the recvbitmap is set. If the zero
page has already appeared in the recvbitmap, there is no need to check
the data; it is directly set to 0, because it is unlikely that the
zero page will be migrated multiple times.

Intel-SIG: commit 5ef7e26bdb7e migration/multifd: solve zero page causing multiple page faults

Signed-off-by: Yuan Liu
Reviewed-by: Fabiano Rosas
Link: https://lore.kernel.org/r/20240401154110.2028453-2-yuan1.liu@intel.com
[peterx: touch up the comment, as the bitmap is used outside postcopy now]
Signed-off-by: Peter Xu
Signed-off-by: Jason Zeng
---
 include/exec/ramblock.h       | 2 +-
 migration/multifd-zero-page.c | 4 +++-
 migration/multifd-zlib.c      | 1 +
 migration/multifd-zstd.c      | 1 +
 migration/multifd.c           | 1 +
 migration/ram.c               | 4 ++++
 migration/ram.h               | 1 +
 7 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/include/exec/ramblock.h b/include/exec/ramblock.h
index e8d82b4228..55d7f83927 100644
--- a/include/exec/ramblock.h
+++ b/include/exec/ramblock.h
@@ -57,7 +57,7 @@ struct RAMBlock {
     off_t bitmap_offset;
     uint64_t pages_offset;
 
-    /* bitmap of already received pages in postcopy */
+    /* Bitmap of already received pages. Only used on destination side.
*/ unsigned long *receivedmap; /* diff --git a/migration/multifd-zero-page.c b/migration/multifd-zero-page.c index 1ba38be636..e1b8370f88 100644 --- a/migration/multifd-zero-page.c +++ b/migration/multifd-zero-page.c @@ -80,8 +80,10 @@ void multifd_recv_zero_page_process(MultiFDRecvParams *p) { for (int i = 0; i < p->zero_num; i++) { void *page = p->host + p->zero[i]; - if (!buffer_is_zero(page, p->page_size)) { + if (ramblock_recv_bitmap_test_byte_offset(p->block, p->zero[i])) { memset(page, 0, p->page_size); + } else { + ramblock_recv_bitmap_set_offset(p->block, p->zero[i]); } } } diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c index 83c0374380..b210725f6e 100644 --- a/migration/multifd-zlib.c +++ b/migration/multifd-zlib.c @@ -284,6 +284,7 @@ static int zlib_recv(MultiFDRecvParams *p, Error **errp) int flush = Z_NO_FLUSH; unsigned long start = zs->total_out; + ramblock_recv_bitmap_set_offset(p->block, p->normal[i]); if (i == p->normal_num - 1) { flush = Z_SYNC_FLUSH; } diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c index 02112255ad..256858df0a 100644 --- a/migration/multifd-zstd.c +++ b/migration/multifd-zstd.c @@ -278,6 +278,7 @@ static int zstd_recv(MultiFDRecvParams *p, Error **errp) z->in.pos = 0; for (i = 0; i < p->normal_num; i++) { + ramblock_recv_bitmap_set_offset(p->block, p->normal[i]); z->out.dst = p->host + p->normal[i]; z->out.size = p->page_size; z->out.pos = 0; diff --git a/migration/multifd.c b/migration/multifd.c index b089a45c96..e411e2ba07 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -278,6 +278,7 @@ static int nocomp_recv(MultiFDRecvParams *p, Error **errp) for (int i = 0; i < p->normal_num; i++) { p->iov[i].iov_base = p->host + p->normal[i]; p->iov[i].iov_len = p->page_size; + ramblock_recv_bitmap_set_offset(p->block, p->normal[i]); } return qio_channel_readv_all(p->c, p->iov, p->normal_num, errp); } diff --git a/migration/ram.c b/migration/ram.c index a31cda3c64..d7e84a44ee 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -292,6 +292,10 @@ void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr, nr); } +void ramblock_recv_bitmap_set_offset(RAMBlock *rb, uint64_t byte_offset) +{ + set_bit_atomic(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap); +} #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL) /* diff --git a/migration/ram.h b/migration/ram.h index 08feecaf51..bc0318b834 100644 --- a/migration/ram.h +++ b/migration/ram.h @@ -69,6 +69,7 @@ int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr); bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset); void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr); void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr, size_t nr); +void ramblock_recv_bitmap_set_offset(RAMBlock *rb, uint64_t byte_offset); int64_t ramblock_recv_bitmap_send(QEMUFile *file, const char *block_name); bool ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *rb, Error **errp); -- Gitee From 9573dc0fbced6c669821ff4707561fb818e25d95 Mon Sep 17 00:00:00 2001 From: Yuan Liu Date: Mon, 10 Jun 2024 18:21:04 +0800 Subject: [PATCH 111/134] docs/migration: add qpl compression feature commit 0d40b3d76ced77c1c82c77a636af703fabdb407c upstream. 
add Intel Query Processing Library (QPL) compression method introduction Intel-SIG: commit 0d40b3d76ced docs/migration: add qpl compression feature Signed-off-by: Yuan Liu Reviewed-by: Nanhai Zou Reviewed-by: Fabiano Rosas Acked-by: Peter Xu Signed-off-by: Fabiano Rosas Conflicts: docs/devel/migration/features.rst [jz: resolve context conflict due to CPR] Signed-off-by: Jason Zeng --- docs/devel/migration/features.rst | 1 + docs/devel/migration/qpl-compression.rst | 260 +++++++++++++++++++++++ 2 files changed, 261 insertions(+) create mode 100644 docs/devel/migration/qpl-compression.rst diff --git a/docs/devel/migration/features.rst b/docs/devel/migration/features.rst index 9d1abd2587..2c2aa736c3 100644 --- a/docs/devel/migration/features.rst +++ b/docs/devel/migration/features.rst @@ -11,3 +11,4 @@ Migration has plenty of features to support different use cases. vfio virtio mapped-ram + qpl-compression diff --git a/docs/devel/migration/qpl-compression.rst b/docs/devel/migration/qpl-compression.rst new file mode 100644 index 0000000000..990992d786 --- /dev/null +++ b/docs/devel/migration/qpl-compression.rst @@ -0,0 +1,260 @@ +=============== +QPL Compression +=============== +The Intel Query Processing Library (Intel ``QPL``) is an open-source library to +provide compression and decompression features and it is based on deflate +compression algorithm (RFC 1951). + +The ``QPL`` compression relies on Intel In-Memory Analytics Accelerator(``IAA``) +and Shared Virtual Memory(``SVM``) technology, they are new features supported +from Intel 4th Gen Intel Xeon Scalable processors, codenamed Sapphire Rapids +processor(``SPR``). + +For more ``QPL`` introduction, please refer to `QPL Introduction +`_ + +QPL Compression Framework +========================= + +:: + + +----------------+ +------------------+ + | MultiFD Thread | |accel-config tool | + +-------+--------+ +--------+---------+ + | | + | | + |compress/decompress | + +-------+--------+ | Setup IAA + | QPL library | | Resources + +-------+---+----+ | + | | | + | +-------------+-------+ + | Open IAA | + | Devices +-----+-----+ + | |idxd driver| + | +-----+-----+ + | | + | | + | +-----+-----+ + +-----------+IAA Devices| + Submit jobs +-----------+ + via enqcmd + + +QPL Build And Installation +-------------------------- + +.. code-block:: shell + + $git clone --recursive https://github.com/intel/qpl.git qpl + $mkdir qpl/build + $cd qpl/build + $cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr -DQPL_LIBRARY_TYPE=SHARED .. + $sudo cmake --build . --target install + +For more details about ``QPL`` installation, please refer to `QPL Installation +`_ + +IAA Device Management +--------------------- + +The number of ``IAA`` devices will vary depending on the Xeon product model. +On a ``SPR`` server, there can be a maximum of 8 ``IAA`` devices, with up to +4 devices per socket. + +By default, all ``IAA`` devices are disabled and need to be configured and +enabled by users manually. + +Check the number of devices through the following command + +.. 
code-block:: shell + + #lspci -d 8086:0cfe + 6a:02.0 System peripheral: Intel Corporation Device 0cfe + 6f:02.0 System peripheral: Intel Corporation Device 0cfe + 74:02.0 System peripheral: Intel Corporation Device 0cfe + 79:02.0 System peripheral: Intel Corporation Device 0cfe + e7:02.0 System peripheral: Intel Corporation Device 0cfe + ec:02.0 System peripheral: Intel Corporation Device 0cfe + f1:02.0 System peripheral: Intel Corporation Device 0cfe + f6:02.0 System peripheral: Intel Corporation Device 0cfe + +IAA Device Configuration And Enabling +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The ``accel-config`` tool is used to enable ``IAA`` devices and configure +``IAA`` hardware resources(work queues and engines). One ``IAA`` device +has 8 work queues and 8 processing engines, multiple engines can be assigned +to a work queue via ``group`` attribute. + +For ``accel-config`` installation, please refer to `accel-config installation +`_ + +One example of configuring and enabling an ``IAA`` device. + +.. code-block:: shell + + #accel-config config-engine iax1/engine1.0 -g 0 + #accel-config config-engine iax1/engine1.1 -g 0 + #accel-config config-engine iax1/engine1.2 -g 0 + #accel-config config-engine iax1/engine1.3 -g 0 + #accel-config config-engine iax1/engine1.4 -g 0 + #accel-config config-engine iax1/engine1.5 -g 0 + #accel-config config-engine iax1/engine1.6 -g 0 + #accel-config config-engine iax1/engine1.7 -g 0 + #accel-config config-wq iax1/wq1.0 -g 0 -s 128 -p 10 -b 1 -t 128 -m shared -y user -n app1 -d user + #accel-config enable-device iax1 + #accel-config enable-wq iax1/wq1.0 + +.. note:: + IAX is an early name for IAA + +- The ``IAA`` device index is 1, use ``ls -lh /sys/bus/dsa/devices/iax*`` + command to query the ``IAA`` device index. + +- 8 engines and 1 work queue are configured in group 0, so all compression jobs + submitted to this work queue can be processed by all engines at the same time. + +- Set work queue attributes including the work mode, work queue size and so on. + +- Enable the ``IAA1`` device and work queue 1.0 + +.. note:: + + Set work queue mode to shared mode, since ``QPL`` library only supports + shared mode + +For more detailed configuration, please refer to `IAA Configuration Samples +`_ + +IAA Unit Test +^^^^^^^^^^^^^ + +- Enabling ``IAA`` devices for Xeon platform, please refer to `IAA User Guide + `_ + +- ``IAA`` device driver is Intel Data Accelerator Driver (idxd), it is + recommended that the minimum version of Linux kernel is 5.18. + +- Add ``"intel_iommu=on,sm_on"`` parameter to kernel command line + for ``SVM`` feature enabling. + +Here is an easy way to verify ``IAA`` device driver and ``SVM`` with `iaa_test +`_ + +.. code-block:: shell + + #./test/iaa_test + [ info] alloc wq 0 shared size 128 addr 0x7f26cebe5000 batch sz 0xfffffffe xfer sz 0x80000000 + [ info] test noop: tflags 0x1 num_desc 1 + [ info] preparing descriptor for noop + [ info] Submitted all noop jobs + [ info] verifying task result for 0x16f7e20 + [ info] test with op 0 passed + + +IAA Resources Allocation For Migration +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +There is no ``IAA`` resource configuration parameters for migration and +``accel-config`` tool configuration cannot directly specify the ``IAA`` +resources used for migration. + +The multifd migration with ``QPL`` compression method will use all work +queues that are enabled and shared mode. + +.. note:: + + Accessing IAA resources requires ``sudo`` command or ``root`` privileges + by default. 
Administrators can modify the IAA device node ownership + so that QEMU can use IAA with specified user permissions. + + For example + + #chown -R qemu /dev/iax + +Shared Virtual Memory(SVM) Introduction +======================================= + +An ability for an accelerator I/O device to operate in the same virtual +memory space of applications on host processors. It also implies the +ability to operate from pageable memory, avoiding functional requirements +to pin memory for DMA operations. + +When using ``SVM`` technology, users do not need to reserve memory for the +``IAA`` device and perform pin memory operation. The ``IAA`` device can +directly access data using the virtual address of the process. + +For more ``SVM`` technology, please refer to +`Shared Virtual Addressing (SVA) with ENQCMD +`_ + + +How To Use QPL Compression In Migration +======================================= + +1 - Installation of ``QPL`` library and ``accel-config`` library if using IAA + +2 - Configure and enable ``IAA`` devices and work queues via ``accel-config`` + +3 - Build ``QEMU`` with ``--enable-qpl`` parameter + + E.g. configure --target-list=x86_64-softmmu --enable-kvm ``--enable-qpl`` + +4 - Enable ``QPL`` compression during migration + + Set ``migrate_set_parameter multifd-compression qpl`` when migrating, the + ``QPL`` compression does not support configuring the compression level, it + only supports one compression level. + +The Difference Between QPL And ZLIB +=================================== + +Although both ``QPL`` and ``ZLIB`` are based on the deflate compression +algorithm, and ``QPL`` can support the header and tail of ``ZLIB``, ``QPL`` +is still not fully compatible with the ``ZLIB`` compression in the migration. + +``QPL`` only supports 4K history buffer, and ``ZLIB`` is 32K by default. +``ZLIB`` compresses data that ``QPL`` may not decompress correctly and +vice versa. + +``QPL`` does not support the ``Z_SYNC_FLUSH`` operation in ``ZLIB`` streaming +compression, current ``ZLIB`` implementation uses ``Z_SYNC_FLUSH``, so each +``multifd`` thread has a ``ZLIB`` streaming context, and all page compression +and decompression are based on this stream. ``QPL`` cannot decompress such data +and vice versa. + +The introduction for ``Z_SYNC_FLUSH``, please refer to `Zlib Manual +`_ + +The Best Practices +================== +When user enables the IAA device for ``QPL`` compression, it is recommended +to add ``-mem-prealloc`` parameter to the destination boot parameters. This +parameter can avoid the occurrence of I/O page fault and reduce the overhead +of IAA compression and decompression. + +The example of booting with ``-mem-prealloc`` parameter + +.. code-block:: shell + + $qemu-system-x86_64 --enable-kvm -cpu host --mem-prealloc ... + + +An example about I/O page fault measurement of destination without +``-mem-prealloc``, the ``svm_prq`` indicates the number of I/O page fault +occurrences and processing time. + +.. 
code-block:: shell + + #echo 1 > /sys/kernel/debug/iommu/intel/dmar_perf_latency + #echo 2 > /sys/kernel/debug/iommu/intel/dmar_perf_latency + #echo 3 > /sys/kernel/debug/iommu/intel/dmar_perf_latency + #echo 4 > /sys/kernel/debug/iommu/intel/dmar_perf_latency + #cat /sys/kernel/debug/iommu/intel/dmar_perf_latency + IOMMU: dmar18 Register Base Address: c87fc000 + <0.1us 0.1us-1us 1us-10us 10us-100us 100us-1ms 1ms-10ms >=10ms min(us) max(us) average(us) + inv_iotlb 0 286 123 0 0 0 0 0 1 0 + inv_devtlb 0 276 133 0 0 0 0 0 2 0 + inv_iec 0 0 0 0 0 0 0 0 0 0 + svm_prq 0 0 25206 364 395 0 0 1 556 9 -- Gitee From eb9435d5919e2f09b4476fc149005ca7683ca72d Mon Sep 17 00:00:00 2001 From: Yuan Liu Date: Mon, 10 Jun 2024 18:21:05 +0800 Subject: [PATCH 112/134] migration/multifd: put IOV initialization into compression method commit d9d3e4f243214f742425d9d8360f0794bb05c999 upstream. Different compression methods may require different numbers of IOVs. Based on streaming compression of zlib and zstd, all pages will be compressed to a data block, so two IOVs are needed for packet header and compressed data block. Intel-SIG: commit d9d3e4f24321 migration/multifd: put IOV initialization into compression method Signed-off-by: Yuan Liu Reviewed-by: Nanhai Zou Reviewed-by: Fabiano Rosas Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Signed-off-by: Jason Zeng --- migration/multifd-zlib.c | 7 +++++++ migration/multifd-zstd.c | 8 +++++++- migration/multifd.c | 22 ++++++++++++---------- 3 files changed, 26 insertions(+), 11 deletions(-) diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c index b210725f6e..2df4983780 100644 --- a/migration/multifd-zlib.c +++ b/migration/multifd-zlib.c @@ -70,6 +70,10 @@ static int zlib_send_setup(MultiFDSendParams *p, Error **errp) goto err_free_zbuff; } p->compress_data = z; + + /* Needs 2 IOVs, one for packet header and one for compressed data */ + p->iov = g_new0(struct iovec, 2); + return 0; err_free_zbuff: @@ -101,6 +105,9 @@ static void zlib_send_cleanup(MultiFDSendParams *p, Error **errp) z->buf = NULL; g_free(p->compress_data); p->compress_data = NULL; + + g_free(p->iov); + p->iov = NULL; } /** diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c index 256858df0a..ca17b7e310 100644 --- a/migration/multifd-zstd.c +++ b/migration/multifd-zstd.c @@ -52,7 +52,6 @@ static int zstd_send_setup(MultiFDSendParams *p, Error **errp) struct zstd_data *z = g_new0(struct zstd_data, 1); int res; - p->compress_data = z; z->zcs = ZSTD_createCStream(); if (!z->zcs) { g_free(z); @@ -77,6 +76,10 @@ static int zstd_send_setup(MultiFDSendParams *p, Error **errp) error_setg(errp, "multifd %u: out of memory for zbuff", p->id); return -1; } + p->compress_data = z; + + /* Needs 2 IOVs, one for packet header and one for compressed data */ + p->iov = g_new0(struct iovec, 2); return 0; } @@ -98,6 +101,9 @@ static void zstd_send_cleanup(MultiFDSendParams *p, Error **errp) z->zbuff = NULL; g_free(p->compress_data); p->compress_data = NULL; + + g_free(p->iov); + p->iov = NULL; } /** diff --git a/migration/multifd.c b/migration/multifd.c index e411e2ba07..d27a84b73c 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -138,6 +138,13 @@ static int nocomp_send_setup(MultiFDSendParams *p, Error **errp) p->write_flags |= QIO_CHANNEL_WRITE_FLAG_ZERO_COPY; } + if (multifd_use_packets()) { + /* We need one extra place for the packet header */ + p->iov = g_new0(struct iovec, p->page_count + 1); + } else { + p->iov = g_new0(struct iovec, p->page_count); + } + return 0; } @@ 
-151,6 +158,8 @@ static int nocomp_send_setup(MultiFDSendParams *p, Error **errp)
  */
 static void nocomp_send_cleanup(MultiFDSendParams *p, Error **errp)
 {
+    g_free(p->iov);
+    p->iov = NULL;
     return;
 }
 
@@ -229,6 +238,7 @@ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp)
  */
 static int nocomp_recv_setup(MultiFDRecvParams *p, Error **errp)
 {
+    p->iov = g_new0(struct iovec, p->page_count);
     return 0;
 }
 
@@ -241,6 +251,8 @@ static int nocomp_recv_setup(MultiFDRecvParams *p, Error **errp)
  */
 static void nocomp_recv_cleanup(MultiFDRecvParams *p)
 {
+    g_free(p->iov);
+    p->iov = NULL;
 }
 
 /**
@@ -784,8 +796,6 @@ static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp)
     p->packet_len = 0;
     g_free(p->packet);
     p->packet = NULL;
-    g_free(p->iov);
-    p->iov = NULL;
     multifd_send_state->ops->send_cleanup(p, errp);
 
     return *errp == NULL;
@@ -1183,11 +1193,6 @@ bool multifd_send_setup(void)
             p->packet = g_malloc0(p->packet_len);
             p->packet->magic = cpu_to_be32(MULTIFD_MAGIC);
             p->packet->version = cpu_to_be32(MULTIFD_VERSION);
-
-            /* We need one extra place for the packet header */
-            p->iov = g_new0(struct iovec, page_count + 1);
-        } else {
-            p->iov = g_new0(struct iovec, page_count);
         }
         p->name = g_strdup_printf("multifdsend_%d", i);
         p->page_size = qemu_target_page_size();
@@ -1357,8 +1362,6 @@ static void multifd_recv_cleanup_channel(MultiFDRecvParams *p)
     p->packet_len = 0;
     g_free(p->packet);
     p->packet = NULL;
-    g_free(p->iov);
-    p->iov = NULL;
     g_free(p->normal);
     p->normal = NULL;
     g_free(p->zero);
@@ -1606,7 +1609,6 @@ int multifd_recv_setup(Error **errp)
             p->packet = g_malloc0(p->packet_len);
         }
         p->name = g_strdup_printf("multifdrecv_%d", i);
-        p->iov = g_new0(struct iovec, page_count);
         p->normal = g_new0(ram_addr_t, page_count);
         p->zero = g_new0(ram_addr_t, page_count);
         p->page_count = page_count;
--
Gitee

From 39b38535aae809a895b3208420a466fe1892373e Mon Sep 17 00:00:00 2001
From: Yuan Liu
Date: Mon, 10 Jun 2024 18:21:06 +0800
Subject: [PATCH 113/134] configure: add --enable-qpl build option

commit b844a2c7cc7f7c7756a27d372e64f6688d67c4eb upstream.

add --enable-qpl and --disable-qpl options to enable and disable the
QPL compression method for multifd migration.

The Query Processing Library (QPL) is an open-source library that
supports data compression and decompression features. It is based on
the deflate compression algorithm and uses Intel In-Memory Analytics
Accelerator(IAA) hardware for compression and decompression
acceleration.
For more live migration with IAA, please refer to the document docs/devel/migration/qpl-compression.rst Intel-SIG: commit b844a2c7cc7f configure: add --enable-qpl build option Signed-off-by: Yuan Liu Reviewed-by: Nanhai Zou Reviewed-by: Fabiano Rosas Signed-off-by: Fabiano Rosas Signed-off-by: Jason Zeng --- meson.build | 8 ++++++++ meson_options.txt | 2 ++ scripts/meson-buildoptions.sh | 3 +++ 3 files changed, 13 insertions(+) diff --git a/meson.build b/meson.build index 4078f2aced..8f227d8f5c 100644 --- a/meson.build +++ b/meson.build @@ -1043,6 +1043,12 @@ if not get_option('zstd').auto() or have_block required: get_option('zstd'), method: 'pkg-config') endif +qpl = not_found +if not get_option('qpl').auto() or have_system + qpl = dependency('qpl', version: '>=1.5.0', + required: get_option('qpl'), + method: 'pkg-config') +endif virgl = not_found have_vhost_user_gpu = have_tools and targetos == 'linux' and pixman.found() @@ -2275,6 +2281,7 @@ config_host_data.set('CONFIG_MALLOC_TRIM', has_malloc_trim) config_host_data.set('CONFIG_STATX', has_statx) config_host_data.set('CONFIG_STATX_MNT_ID', has_statx_mnt_id) config_host_data.set('CONFIG_ZSTD', zstd.found()) +config_host_data.set('CONFIG_QPL', qpl.found()) config_host_data.set('CONFIG_FUSE', fuse.found()) config_host_data.set('CONFIG_FUSE_LSEEK', fuse_lseek.found()) config_host_data.set('CONFIG_SPICE_PROTOCOL', spice_protocol.found()) @@ -4449,6 +4456,7 @@ summary_info += {'snappy support': snappy} summary_info += {'bzip2 support': libbzip2} summary_info += {'lzfse support': liblzfse} summary_info += {'zstd support': zstd} +summary_info += {'Query Processing Library support': qpl} summary_info += {'NUMA host support': numa} summary_info += {'capstone': capstone} summary_info += {'libpmem support': libpmem} diff --git a/meson_options.txt b/meson_options.txt index cf9706c411..82f73d51ce 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -259,6 +259,8 @@ option('xkbcommon', type : 'feature', value : 'auto', description: 'xkbcommon support') option('zstd', type : 'feature', value : 'auto', description: 'zstd compression support') +option('qpl', type : 'feature', value : 'auto', + description: 'Query Processing Library support') option('fuse', type: 'feature', value: 'auto', description: 'FUSE block device export') option('fuse_lseek', type : 'feature', value : 'auto', diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh index 680fa3f581..784f74fde9 100644 --- a/scripts/meson-buildoptions.sh +++ b/scripts/meson-buildoptions.sh @@ -222,6 +222,7 @@ meson_options_help() { printf "%s\n" ' Xen PCI passthrough support' printf "%s\n" ' xkbcommon xkbcommon support' printf "%s\n" ' zstd zstd compression support' + printf "%s\n" ' qpl Query Processing Library support' } _meson_option_parse() { case $1 in @@ -562,6 +563,8 @@ _meson_option_parse() { --disable-xkbcommon) printf "%s" -Dxkbcommon=disabled ;; --enable-zstd) printf "%s" -Dzstd=enabled ;; --disable-zstd) printf "%s" -Dzstd=disabled ;; + --enable-qpl) printf "%s" -Dqpl=enabled ;; + --disable-qpl) printf "%s" -Dqpl=disabled ;; *) return 1 ;; esac } -- Gitee From 75d86516682245147eaa5d6e82b970f3f7bb1ee2 Mon Sep 17 00:00:00 2001 From: Yuan Liu Date: Mon, 10 Jun 2024 18:21:07 +0800 Subject: [PATCH 114/134] migration/multifd: add qpl compression method commit 354cac2859e48ec5f7ee72a2a071da6c60a462d0 upstream. 
add the Query Processing Library (QPL) compression method Introduce the qpl as a new multifd migration compression method, it can use In-Memory Analytics Accelerator(IAA) to accelerate compression and decompression, which can not only reduce network bandwidth requirement but also reduce host compression and decompression CPU overhead. How to enable qpl compression during migration: migrate_set_parameter multifd-compression qpl There is no qpl compression level parameter added since it only supports level one, users do not need to specify the qpl compression level. Intel-SIG: commit 354cac2859e4 migration/multifd: add qpl compression method Signed-off-by: Yuan Liu Reviewed-by: Nanhai Zou Reviewed-by: Peter Xu Reviewed-by: Fabiano Rosas [fixed docs spacing in migration.json] Signed-off-by: Fabiano Rosas Signed-off-by: Jason Zeng --- hw/core/qdev-properties-system.c | 2 +- migration/meson.build | 1 + migration/multifd-qpl.c | 20 ++++++++++++++++++++ migration/multifd.h | 1 + qapi/migration.json | 8 +++++++- 5 files changed, 30 insertions(+), 2 deletions(-) create mode 100644 migration/multifd-qpl.c diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-system.c index b3b9238b65..6ee9744e00 100644 --- a/hw/core/qdev-properties-system.c +++ b/hw/core/qdev-properties-system.c @@ -711,7 +711,7 @@ const PropertyInfo qdev_prop_fdc_drive_type = { const PropertyInfo qdev_prop_multifd_compression = { .name = "MultiFDCompression", .description = "multifd_compression values, " - "none/zlib/zstd", + "none/zlib/zstd/qpl", .enum_table = &MultiFDCompression_lookup, .get = qdev_propinfo_get_enum, .set = qdev_propinfo_set_enum, diff --git a/migration/meson.build b/migration/meson.build index d619ebf238..6652f68d32 100644 --- a/migration/meson.build +++ b/migration/meson.build @@ -40,6 +40,7 @@ if get_option('live_block_migration').allowed() system_ss.add(files('block.c')) endif system_ss.add(when: zstd, if_true: files('multifd-zstd.c')) +system_ss.add(when: qpl, if_true: files('multifd-qpl.c')) specific_ss.add(when: 'CONFIG_SYSTEM_ONLY', if_true: files('ram.c', diff --git a/migration/multifd-qpl.c b/migration/multifd-qpl.c new file mode 100644 index 0000000000..056a68a060 --- /dev/null +++ b/migration/multifd-qpl.c @@ -0,0 +1,20 @@ +/* + * Multifd qpl compression accelerator implementation + * + * Copyright (c) 2023 Intel Corporation + * + * Authors: + * Yuan Liu + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ +#include "qemu/osdep.h" +#include "qemu/module.h" + +static void multifd_qpl_register(void) +{ + /* noop */ +} + +migration_init(multifd_qpl_register); diff --git a/migration/multifd.h b/migration/multifd.h index c9d9b09239..5b7d9b15f8 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -40,6 +40,7 @@ MultiFDRecvData *multifd_get_recv_data(void); #define MULTIFD_FLAG_NOCOMP (0 << 1) #define MULTIFD_FLAG_ZLIB (1 << 1) #define MULTIFD_FLAG_ZSTD (2 << 1) +#define MULTIFD_FLAG_QPL (4 << 1) /* This value needs to be a multiple of qemu_target_page_size() */ #define MULTIFD_PACKET_SIZE (512 * 1024) diff --git a/qapi/migration.json b/qapi/migration.json index f55faadd2f..994416d1be 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -629,11 +629,17 @@ # # @zstd: use zstd compression method. # +# @qpl: use qpl compression method. 
Query Processing Library(qpl) is +# based on the deflate compression algorithm and use the Intel +# In-Memory Analytics Accelerator(IAA) accelerated compression +# and decompression. (Since 9.1) +# # Since: 5.0 ## { 'enum': 'MultiFDCompression', 'data': [ 'none', 'zlib', - { 'name': 'zstd', 'if': 'CONFIG_ZSTD' } ] } + { 'name': 'zstd', 'if': 'CONFIG_ZSTD' }, + { 'name': 'qpl', 'if': 'CONFIG_QPL' } ] } ## # @MigMode: -- Gitee From da5325eb915b07a8ed517e45288adff98480241c Mon Sep 17 00:00:00 2001 From: Yuan Liu Date: Mon, 10 Jun 2024 18:21:08 +0800 Subject: [PATCH 115/134] migration/multifd: implement initialization of qpl compression commit 34e104b897da6e144a5f34e7c5eebf8a4c4d9d59 upstream. during initialization, a software job is allocated to each channel for software path fallabck when the IAA hardware is unavailable or the hardware job submission fails. If the IAA hardware is available, multiple hardware jobs are allocated for batch processing. Intel-SIG: commit 34e104b897da migration/multifd: implement initialization of qpl compression Signed-off-by: Yuan Liu Reviewed-by: Nanhai Zou Reviewed-by: Fabiano Rosas Signed-off-by: Fabiano Rosas Signed-off-by: Jason Zeng --- migration/multifd-qpl.c | 328 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 327 insertions(+), 1 deletion(-) diff --git a/migration/multifd-qpl.c b/migration/multifd-qpl.c index 056a68a060..6791a204d5 100644 --- a/migration/multifd-qpl.c +++ b/migration/multifd-qpl.c @@ -9,12 +9,338 @@ * This work is licensed under the terms of the GNU GPL, version 2 or later. * See the COPYING file in the top-level directory. */ + #include "qemu/osdep.h" #include "qemu/module.h" +#include "qapi/error.h" +#include "multifd.h" +#include "qpl/qpl.h" + +typedef struct { + /* the QPL hardware path job */ + qpl_job *job; + /* indicates if fallback to software path is required */ + bool fallback_sw_path; + /* output data from the software path */ + uint8_t *sw_output; + /* output data length from the software path */ + uint32_t sw_output_len; +} QplHwJob; + +typedef struct { + /* array of hardware jobs, the number of jobs equals the number pages */ + QplHwJob *hw_jobs; + /* the QPL software job for the slow path and software fallback */ + qpl_job *sw_job; + /* the number of pages that the QPL needs to process at one time */ + uint32_t page_num; + /* array of compressed page buffers */ + uint8_t *zbuf; + /* array of compressed page lengths */ + uint32_t *zlen; + /* the status of the hardware device */ + bool hw_avail; +} QplData; + +/** + * check_hw_avail: check if IAA hardware is available + * + * If the IAA hardware does not exist or is unavailable, + * the QPL hardware job initialization will fail. + * + * Returns true if IAA hardware is available, otherwise false. + * + * @job_size: indicates the hardware job size if hardware is available + */ +static bool check_hw_avail(uint32_t *job_size) +{ + qpl_path_t path = qpl_path_hardware; + uint32_t size = 0; + qpl_job *job; + + if (qpl_get_job_size(path, &size) != QPL_STS_OK) { + return false; + } + assert(size > 0); + job = g_malloc0(size); + if (qpl_init_job(path, job) != QPL_STS_OK) { + g_free(job); + return false; + } + g_free(job); + *job_size = size; + return true; +} + +/** + * multifd_qpl_free_sw_job: clean up software job + * + * Free the software job resources. 
+ * + * @qpl: pointer to the QplData structure + */ +static void multifd_qpl_free_sw_job(QplData *qpl) +{ + assert(qpl); + if (qpl->sw_job) { + qpl_fini_job(qpl->sw_job); + g_free(qpl->sw_job); + qpl->sw_job = NULL; + } +} + +/** + * multifd_qpl_free_jobs: clean up hardware jobs + * + * Free all hardware job resources. + * + * @qpl: pointer to the QplData structure + */ +static void multifd_qpl_free_hw_job(QplData *qpl) +{ + assert(qpl); + if (qpl->hw_jobs) { + for (int i = 0; i < qpl->page_num; i++) { + qpl_fini_job(qpl->hw_jobs[i].job); + g_free(qpl->hw_jobs[i].job); + qpl->hw_jobs[i].job = NULL; + } + g_free(qpl->hw_jobs); + qpl->hw_jobs = NULL; + } +} + +/** + * multifd_qpl_init_sw_job: initialize a software job + * + * Use the QPL software path to initialize a job + * + * @qpl: pointer to the QplData structure + * @errp: pointer to an error + */ +static int multifd_qpl_init_sw_job(QplData *qpl, Error **errp) +{ + qpl_path_t path = qpl_path_software; + uint32_t size = 0; + qpl_job *job = NULL; + qpl_status status; + + status = qpl_get_job_size(path, &size); + if (status != QPL_STS_OK) { + error_setg(errp, "qpl_get_job_size failed with error %d", status); + return -1; + } + job = g_malloc0(size); + status = qpl_init_job(path, job); + if (status != QPL_STS_OK) { + error_setg(errp, "qpl_init_job failed with error %d", status); + g_free(job); + return -1; + } + qpl->sw_job = job; + return 0; +} + +/** + * multifd_qpl_init_jobs: initialize hardware jobs + * + * Use the QPL hardware path to initialize jobs + * + * @qpl: pointer to the QplData structure + * @size: the size of QPL hardware path job + * @errp: pointer to an error + */ +static void multifd_qpl_init_hw_job(QplData *qpl, uint32_t size, Error **errp) +{ + qpl_path_t path = qpl_path_hardware; + qpl_job *job = NULL; + qpl_status status; + + qpl->hw_jobs = g_new0(QplHwJob, qpl->page_num); + for (int i = 0; i < qpl->page_num; i++) { + job = g_malloc0(size); + status = qpl_init_job(path, job); + /* the job initialization should succeed after check_hw_avail */ + assert(status == QPL_STS_OK); + qpl->hw_jobs[i].job = job; + } +} + +/** + * multifd_qpl_init: initialize QplData structure + * + * Allocate and initialize a QplData structure + * + * Returns a QplData pointer on success or NULL on error + * + * @num: the number of pages + * @size: the page size + * @errp: pointer to an error + */ +static QplData *multifd_qpl_init(uint32_t num, uint32_t size, Error **errp) +{ + uint32_t job_size = 0; + QplData *qpl; + + qpl = g_new0(QplData, 1); + qpl->page_num = num; + if (multifd_qpl_init_sw_job(qpl, errp) != 0) { + g_free(qpl); + return NULL; + } + qpl->hw_avail = check_hw_avail(&job_size); + if (qpl->hw_avail) { + multifd_qpl_init_hw_job(qpl, job_size, errp); + } + qpl->zbuf = g_malloc0(size * num); + qpl->zlen = g_new0(uint32_t, num); + return qpl; +} + +/** + * multifd_qpl_deinit: clean up QplData structure + * + * Free jobs, buffers and the QplData structure + * + * @qpl: pointer to the QplData structure + */ +static void multifd_qpl_deinit(QplData *qpl) +{ + if (qpl) { + multifd_qpl_free_sw_job(qpl); + multifd_qpl_free_hw_job(qpl); + g_free(qpl->zbuf); + g_free(qpl->zlen); + g_free(qpl); + } +} + +/** + * multifd_qpl_send_setup: set up send side + * + * Set up the channel with QPL compression. 
+ * + * Returns 0 on success or -1 on error + * + * @p: Params for the channel being used + * @errp: pointer to an error + */ +static int multifd_qpl_send_setup(MultiFDSendParams *p, Error **errp) +{ + QplData *qpl; + + qpl = multifd_qpl_init(p->page_count, p->page_size, errp); + if (!qpl) { + return -1; + } + p->compress_data = qpl; + + /* + * the page will be compressed independently and sent using an IOV. The + * additional two IOVs are used to store packet header and compressed data + * length + */ + p->iov = g_new0(struct iovec, p->page_count + 2); + return 0; +} + +/** + * multifd_qpl_send_cleanup: clean up send side + * + * Close the channel and free memory. + * + * @p: Params for the channel being used + * @errp: pointer to an error + */ +static void multifd_qpl_send_cleanup(MultiFDSendParams *p, Error **errp) +{ + multifd_qpl_deinit(p->compress_data); + p->compress_data = NULL; + g_free(p->iov); + p->iov = NULL; +} + +/** + * multifd_qpl_send_prepare: prepare data to be able to send + * + * Create a compressed buffer with all the pages that we are going to + * send. + * + * Returns 0 on success or -1 on error + * + * @p: Params for the channel being used + * @errp: pointer to an error + */ +static int multifd_qpl_send_prepare(MultiFDSendParams *p, Error **errp) +{ + /* Implement in next patch */ + return -1; +} + +/** + * multifd_qpl_recv_setup: set up receive side + * + * Create the compressed channel and buffer. + * + * Returns 0 on success or -1 on error + * + * @p: Params for the channel being used + * @errp: pointer to an error + */ +static int multifd_qpl_recv_setup(MultiFDRecvParams *p, Error **errp) +{ + QplData *qpl; + + qpl = multifd_qpl_init(p->page_count, p->page_size, errp); + if (!qpl) { + return -1; + } + p->compress_data = qpl; + return 0; +} + +/** + * multifd_qpl_recv_cleanup: set up receive side + * + * Close the channel and free memory. + * + * @p: Params for the channel being used + */ +static void multifd_qpl_recv_cleanup(MultiFDRecvParams *p) +{ + multifd_qpl_deinit(p->compress_data); + p->compress_data = NULL; +} + +/** + * multifd_qpl_recv: read the data from the channel into actual pages + * + * Read the compressed buffer, and uncompress it into the actual + * pages. + * + * Returns 0 on success or -1 on error + * + * @p: Params for the channel being used + * @errp: pointer to an error + */ +static int multifd_qpl_recv(MultiFDRecvParams *p, Error **errp) +{ + /* Implement in next patch */ + return -1; +} + +static MultiFDMethods multifd_qpl_ops = { + .send_setup = multifd_qpl_send_setup, + .send_cleanup = multifd_qpl_send_cleanup, + .send_prepare = multifd_qpl_send_prepare, + .recv_setup = multifd_qpl_recv_setup, + .recv_cleanup = multifd_qpl_recv_cleanup, + .recv = multifd_qpl_recv, +}; static void multifd_qpl_register(void) { - /* noop */ + multifd_register_ops(MULTIFD_COMPRESSION_QPL, &multifd_qpl_ops); } migration_init(multifd_qpl_register); -- Gitee From 020cfa4e8e717e5f0aa00b3bc5b4dfef1e440b19 Mon Sep 17 00:00:00 2001 From: Yuan Liu Date: Mon, 10 Jun 2024 18:21:09 +0800 Subject: [PATCH 116/134] migration/multifd: implement qpl compression and decompression commit f6fe9fea995249ecc2cd72975d803fbf4d512c02 upstream. QPL compression and decompression will use IAA hardware path if the IAA hardware is available. Otherwise the QPL library software path is used. The hardware path will automatically fall back to QPL software path if the IAA queues are busy. 
In some scenarios, this may happen frequently, such as configuring 4 channels but only one IAA device is available. In the case of insufficient IAA hardware resources, retry and fallback can help optimize performance: 1. Retry + SW fallback: total time: 14649 ms downtime: 25 ms throughput: 17666.57 mbps pages-per-second: 1509647 2. No fallback, always wait for work queues to become available total time: 18381 ms downtime: 25 ms throughput: 13698.65 mbps pages-per-second: 859607 If both the hardware and software paths fail, the uncompressed page is sent directly. Intel-SIG: commit f6fe9fea9952 migration/multifd: implement qpl compression and decompression Signed-off-by: Yuan Liu Reviewed-by: Nanhai Zou Reviewed-by: Fabiano Rosas Signed-off-by: Fabiano Rosas Signed-off-by: Jason Zeng --- migration/multifd-qpl.c | 424 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 420 insertions(+), 4 deletions(-) diff --git a/migration/multifd-qpl.c b/migration/multifd-qpl.c index 6791a204d5..9265098ee7 100644 --- a/migration/multifd-qpl.c +++ b/migration/multifd-qpl.c @@ -13,9 +13,14 @@ #include "qemu/osdep.h" #include "qemu/module.h" #include "qapi/error.h" +#include "qapi/qapi-types-migration.h" +#include "exec/ramblock.h" #include "multifd.h" #include "qpl/qpl.h" +/* Maximum number of retries to resubmit a job if IAA work queues are full */ +#define MAX_SUBMIT_RETRY_NUM (3) + typedef struct { /* the QPL hardware path job */ qpl_job *job; @@ -260,6 +265,225 @@ static void multifd_qpl_send_cleanup(MultiFDSendParams *p, Error **errp) p->iov = NULL; } +/** + * multifd_qpl_prepare_job: prepare the job + * + * Set the QPL job parameters and properties. + * + * @job: pointer to the qpl_job structure + * @is_compression: indicates compression and decompression + * @input: pointer to the input data buffer + * @input_len: the length of the input data + * @output: pointer to the output data buffer + * @output_len: the length of the output data + */ +static void multifd_qpl_prepare_job(qpl_job *job, bool is_compression, + uint8_t *input, uint32_t input_len, + uint8_t *output, uint32_t output_len) +{ + job->op = is_compression ? qpl_op_compress : qpl_op_decompress; + job->next_in_ptr = input; + job->next_out_ptr = output; + job->available_in = input_len; + job->available_out = output_len; + job->flags = QPL_FLAG_FIRST | QPL_FLAG_LAST | QPL_FLAG_OMIT_VERIFY; + /* only supports compression level 1 */ + job->level = 1; +} + +/** + * multifd_qpl_prepare_comp_job: prepare the compression job + * + * Set the compression job parameters and properties. + * + * @job: pointer to the qpl_job structure + * @input: pointer to the input data buffer + * @output: pointer to the output data buffer + * @size: the page size + */ +static void multifd_qpl_prepare_comp_job(qpl_job *job, uint8_t *input, + uint8_t *output, uint32_t size) +{ + /* + * Set output length to less than the page size to force the job to + * fail in case it compresses to a larger size. We'll send that page + * without compression and skip the decompression operation on the + * destination. + */ + multifd_qpl_prepare_job(job, true, input, size, output, size - 1); +} + +/** + * multifd_qpl_prepare_decomp_job: prepare the decompression job + * + * Set the decompression job parameters and properties. 
+ * + * @job: pointer to the qpl_job structure + * @input: pointer to the input data buffer + * @len: the length of the input data + * @output: pointer to the output data buffer + * @size: the page size + */ +static void multifd_qpl_prepare_decomp_job(qpl_job *job, uint8_t *input, + uint32_t len, uint8_t *output, + uint32_t size) +{ + multifd_qpl_prepare_job(job, false, input, len, output, size); +} + +/** + * multifd_qpl_fill_iov: fill in the IOV + * + * Fill in the QPL packet IOV + * + * @p: Params for the channel being used + * @data: pointer to the IOV data + * @len: The length of the IOV data + */ +static void multifd_qpl_fill_iov(MultiFDSendParams *p, uint8_t *data, + uint32_t len) +{ + p->iov[p->iovs_num].iov_base = data; + p->iov[p->iovs_num].iov_len = len; + p->iovs_num++; + p->next_packet_size += len; +} + +/** + * multifd_qpl_fill_packet: fill the compressed page into the QPL packet + * + * Fill the compressed page length and IOV into the QPL packet + * + * @idx: The index of the compressed length array + * @p: Params for the channel being used + * @data: pointer to the compressed page buffer + * @len: The length of the compressed page + */ +static void multifd_qpl_fill_packet(uint32_t idx, MultiFDSendParams *p, + uint8_t *data, uint32_t len) +{ + QplData *qpl = p->compress_data; + + qpl->zlen[idx] = cpu_to_be32(len); + multifd_qpl_fill_iov(p, data, len); +} + +/** + * multifd_qpl_submit_job: submit a job to the hardware + * + * Submit a QPL hardware job to the IAA device + * + * Returns true if the job is submitted successfully, otherwise false. + * + * @job: pointer to the qpl_job structure + */ +static bool multifd_qpl_submit_job(qpl_job *job) +{ + qpl_status status; + uint32_t num = 0; + +retry: + status = qpl_submit_job(job); + if (status == QPL_STS_QUEUES_ARE_BUSY_ERR) { + if (num < MAX_SUBMIT_RETRY_NUM) { + num++; + goto retry; + } + } + return (status == QPL_STS_OK); +} + +/** + * multifd_qpl_compress_pages_slow_path: compress pages using slow path + * + * Compress the pages using software. If compression fails, the uncompressed + * page will be sent. + * + * @p: Params for the channel being used + */ +static void multifd_qpl_compress_pages_slow_path(MultiFDSendParams *p) +{ + QplData *qpl = p->compress_data; + uint32_t size = p->page_size; + qpl_job *job = qpl->sw_job; + uint8_t *zbuf = qpl->zbuf; + uint8_t *buf; + + for (int i = 0; i < p->pages->normal_num; i++) { + buf = p->pages->block->host + p->pages->offset[i]; + multifd_qpl_prepare_comp_job(job, buf, zbuf, size); + if (qpl_execute_job(job) == QPL_STS_OK) { + multifd_qpl_fill_packet(i, p, zbuf, job->total_out); + } else { + /* send the uncompressed page */ + multifd_qpl_fill_packet(i, p, buf, size); + } + zbuf += size; + } +} + +/** + * multifd_qpl_compress_pages: compress pages + * + * Submit the pages to the IAA hardware for compression. If hardware + * compression fails, it falls back to software compression. If software + * compression also fails, the uncompressed page is sent. 
+ * + * @p: Params for the channel being used + */ +static void multifd_qpl_compress_pages(MultiFDSendParams *p) +{ + QplData *qpl = p->compress_data; + MultiFDPages_t *pages = p->pages; + uint32_t size = p->page_size; + QplHwJob *hw_job; + uint8_t *buf; + uint8_t *zbuf; + + for (int i = 0; i < pages->normal_num; i++) { + buf = pages->block->host + pages->offset[i]; + zbuf = qpl->zbuf + (size * i); + hw_job = &qpl->hw_jobs[i]; + multifd_qpl_prepare_comp_job(hw_job->job, buf, zbuf, size); + if (multifd_qpl_submit_job(hw_job->job)) { + hw_job->fallback_sw_path = false; + } else { + /* + * The IAA work queue is full, any immediate subsequent job + * submission is likely to fail, sending the page via the QPL + * software path at this point gives us a better chance of + * finding the queue open for the next pages. + */ + hw_job->fallback_sw_path = true; + multifd_qpl_prepare_comp_job(qpl->sw_job, buf, zbuf, size); + if (qpl_execute_job(qpl->sw_job) == QPL_STS_OK) { + hw_job->sw_output = zbuf; + hw_job->sw_output_len = qpl->sw_job->total_out; + } else { + hw_job->sw_output = buf; + hw_job->sw_output_len = size; + } + } + } + + for (int i = 0; i < pages->normal_num; i++) { + buf = pages->block->host + pages->offset[i]; + zbuf = qpl->zbuf + (size * i); + hw_job = &qpl->hw_jobs[i]; + if (hw_job->fallback_sw_path) { + multifd_qpl_fill_packet(i, p, hw_job->sw_output, + hw_job->sw_output_len); + continue; + } + if (qpl_wait_job(hw_job->job) == QPL_STS_OK) { + multifd_qpl_fill_packet(i, p, zbuf, hw_job->job->total_out); + } else { + /* send the uncompressed page */ + multifd_qpl_fill_packet(i, p, buf, size); + } + } +} + /** * multifd_qpl_send_prepare: prepare data to be able to send * @@ -273,8 +497,26 @@ static void multifd_qpl_send_cleanup(MultiFDSendParams *p, Error **errp) */ static int multifd_qpl_send_prepare(MultiFDSendParams *p, Error **errp) { - /* Implement in next patch */ - return -1; + QplData *qpl = p->compress_data; + uint32_t len = 0; + + if (!multifd_send_prepare_common(p)) { + goto out; + } + + /* The first IOV is used to store the compressed page lengths */ + len = p->pages->normal_num * sizeof(uint32_t); + multifd_qpl_fill_iov(p, (uint8_t *) qpl->zlen, len); + if (qpl->hw_avail) { + multifd_qpl_compress_pages(p); + } else { + multifd_qpl_compress_pages_slow_path(p); + } + +out: + p->flags |= MULTIFD_FLAG_QPL; + multifd_send_fill_packet(p); + return 0; } /** @@ -312,6 +554,140 @@ static void multifd_qpl_recv_cleanup(MultiFDRecvParams *p) p->compress_data = NULL; } +/** + * multifd_qpl_process_and_check_job: process and check a QPL job + * + * Process the job and check whether the job output length is the + * same as the specified length + * + * Returns true if the job execution succeeded and the output length + * is equal to the specified length, otherwise false. + * + * @job: pointer to the qpl_job structure + * @is_hardware: indicates whether the job is a hardware job + * @len: Specified output length + * @errp: pointer to an error + */ +static bool multifd_qpl_process_and_check_job(qpl_job *job, bool is_hardware, + uint32_t len, Error **errp) +{ + qpl_status status; + + status = (is_hardware ? 
qpl_wait_job(job) : qpl_execute_job(job)); + if (status != QPL_STS_OK) { + error_setg(errp, "qpl job failed with error %d", status); + return false; + } + if (job->total_out != len) { + error_setg(errp, "qpl decompressed len %u, expected len %u", + job->total_out, len); + return false; + } + return true; +} + +/** + * multifd_qpl_decompress_pages_slow_path: decompress pages using slow path + * + * Decompress the pages using software + * + * Returns 0 on success or -1 on error + * + * @p: Params for the channel being used + * @errp: pointer to an error + */ +static int multifd_qpl_decompress_pages_slow_path(MultiFDRecvParams *p, + Error **errp) +{ + QplData *qpl = p->compress_data; + uint32_t size = p->page_size; + qpl_job *job = qpl->sw_job; + uint8_t *zbuf = qpl->zbuf; + uint8_t *addr; + uint32_t len; + + for (int i = 0; i < p->normal_num; i++) { + len = qpl->zlen[i]; + addr = p->host + p->normal[i]; + /* the page is uncompressed, load it */ + if (len == size) { + memcpy(addr, zbuf, size); + zbuf += size; + continue; + } + multifd_qpl_prepare_decomp_job(job, zbuf, len, addr, size); + if (!multifd_qpl_process_and_check_job(job, false, size, errp)) { + return -1; + } + zbuf += len; + } + return 0; +} + +/** + * multifd_qpl_decompress_pages: decompress pages + * + * Decompress the pages using the IAA hardware. If hardware + * decompression fails, it falls back to software decompression. + * + * Returns 0 on success or -1 on error + * + * @p: Params for the channel being used + * @errp: pointer to an error + */ +static int multifd_qpl_decompress_pages(MultiFDRecvParams *p, Error **errp) +{ + QplData *qpl = p->compress_data; + uint32_t size = p->page_size; + uint8_t *zbuf = qpl->zbuf; + uint8_t *addr; + uint32_t len; + qpl_job *job; + + for (int i = 0; i < p->normal_num; i++) { + addr = p->host + p->normal[i]; + len = qpl->zlen[i]; + /* the page is uncompressed if received length equals the page size */ + if (len == size) { + memcpy(addr, zbuf, size); + zbuf += size; + continue; + } + + job = qpl->hw_jobs[i].job; + multifd_qpl_prepare_decomp_job(job, zbuf, len, addr, size); + if (multifd_qpl_submit_job(job)) { + qpl->hw_jobs[i].fallback_sw_path = false; + } else { + /* + * The IAA work queue is full, any immediate subsequent job + * submission is likely to fail, sending the page via the QPL + * software path at this point gives us a better chance of + * finding the queue open for the next pages. 
+ */ + qpl->hw_jobs[i].fallback_sw_path = true; + job = qpl->sw_job; + multifd_qpl_prepare_decomp_job(job, zbuf, len, addr, size); + if (!multifd_qpl_process_and_check_job(job, false, size, errp)) { + return -1; + } + } + zbuf += len; + } + + for (int i = 0; i < p->normal_num; i++) { + /* ignore pages that have already been processed */ + if (qpl->zlen[i] == size || qpl->hw_jobs[i].fallback_sw_path) { + continue; + } + + job = qpl->hw_jobs[i].job; + if (!multifd_qpl_process_and_check_job(job, true, size, errp)) { + return -1; + } + } + return 0; +} /** * multifd_qpl_recv: read the data from the channel into actual pages * @@ -325,8 +701,48 @@ static void multifd_qpl_recv_cleanup(MultiFDRecvParams *p) */ static int multifd_qpl_recv(MultiFDRecvParams *p, Error **errp) { - /* Implement in next patch */ - return -1; + QplData *qpl = p->compress_data; + uint32_t in_size = p->next_packet_size; + uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; + uint32_t len = 0; + uint32_t zbuf_len = 0; + int ret; + + if (flags != MULTIFD_FLAG_QPL) { + error_setg(errp, "multifd %u: flags received %x flags expected %x", + p->id, flags, MULTIFD_FLAG_QPL); + return -1; + } + multifd_recv_zero_page_process(p); + if (!p->normal_num) { + assert(in_size == 0); + return 0; + } + + /* read compressed page lengths */ + len = p->normal_num * sizeof(uint32_t); + assert(len < in_size); + ret = qio_channel_read_all(p->c, (void *) qpl->zlen, len, errp); + if (ret != 0) { + return ret; + } + for (int i = 0; i < p->normal_num; i++) { + qpl->zlen[i] = be32_to_cpu(qpl->zlen[i]); + assert(qpl->zlen[i] <= p->page_size); + zbuf_len += qpl->zlen[i]; + } + + /* read compressed pages */ + assert(in_size == len + zbuf_len); + ret = qio_channel_read_all(p->c, (void *) qpl->zbuf, zbuf_len, errp); + if (ret != 0) { + return ret; + } + + if (qpl->hw_avail) { + return multifd_qpl_decompress_pages(p, errp); + } + return multifd_qpl_decompress_pages_slow_path(p, errp); } static MultiFDMethods multifd_qpl_ops = { -- Gitee From bf383766fcd2407a0e415634139ecbaf903548bb Mon Sep 17 00:00:00 2001 From: Yuan Liu Date: Mon, 10 Jun 2024 18:21:10 +0800 Subject: [PATCH 117/134] tests/migration-test: add qpl compression test commit 08b82d207d138173ddd334c91b387213508a6e13 upstream. add qpl to compression method test for multifd migration the qpl compression supports software path and hardware path(IAA device), and the hardware path is used first by default. If the hardware path is unavailable, it will automatically fallback to the software path for testing. 
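Outside the qtest harness, the same setup can be reproduced by hand. A minimal sketch using the HMP monitor (channel count and destination address are only examples; both sides need a QEMU built with --enable-qpl, the destination started with -incoming, and the same capability/parameters set before the migration starts):

    (qemu) migrate_set_capability multifd on
    (qemu) migrate_set_parameter multifd-channels 4
    (qemu) migrate_set_parameter multifd-compression qpl
    (qemu) migrate -d tcp:192.168.1.10:4444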
Intel-SIG: commit 08b82d207d13 tests/migration-test: add qpl compression test Signed-off-by: Yuan Liu Reviewed-by: Nanhai Zou Reviewed-by: Peter Xu Reviewed-by: Fabiano Rosas Signed-off-by: Fabiano Rosas Conflicts: tests/qtest/migration-test.c [jz: resolve context conflict] Signed-off-by: Jason Zeng --- tests/qtest/migration-test.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c index 9149469d37..b6c91460ba 100644 --- a/tests/qtest/migration-test.c +++ b/tests/qtest/migration-test.c @@ -2703,6 +2703,15 @@ test_migrate_precopy_tcp_multifd_zstd_start(QTestState *from, } #endif /* CONFIG_ZSTD */ +#ifdef CONFIG_QPL +static void * +test_migrate_precopy_tcp_multifd_qpl_start(QTestState *from, + QTestState *to) +{ + return test_migrate_precopy_tcp_multifd_start_common(from, to, "qpl"); +} +#endif /* CONFIG_QPL */ + static void test_multifd_tcp_none(void) { MigrateCommon args = { @@ -2768,6 +2777,17 @@ static void test_multifd_tcp_zstd(void) } #endif +#ifdef CONFIG_QPL +static void test_multifd_tcp_qpl(void) +{ + MigrateCommon args = { + .listen_uri = "defer", + .start_hook = test_migrate_precopy_tcp_multifd_qpl_start, + }; + test_precopy_common(&args); +} +#endif + #ifdef CONFIG_GNUTLS static void * test_migrate_multifd_tcp_tls_psk_start_match(QTestState *from, @@ -3661,6 +3681,10 @@ int main(int argc, char **argv) migration_test_add("/migration/multifd/tcp/plain/zstd", test_multifd_tcp_zstd); #endif +#ifdef CONFIG_QPL + migration_test_add("/migration/multifd/tcp/plain/qpl", + test_multifd_tcp_qpl); +#endif #ifdef CONFIG_GNUTLS migration_test_add("/migration/multifd/tcp/tls/psk/match", test_multifd_tcp_tls_psk_match); -- Gitee From 4a5c8271ec44590f0d2932a6b9005f0146132d81 Mon Sep 17 00:00:00 2001 From: Bryan Zhang Date: Fri, 1 Mar 2024 03:59:00 +0000 Subject: [PATCH 118/134] migration: Properly apply migration compression level parameters commit b4014a2bf57ce08e2f6458cd82e9f968facf25c8 upstream. Some glue code was missing, so that using `qmp_migrate_set_parameters` to set `multifd-zstd-level` or `multifd-zlib-level` did not work. This commit adds the glue code to fix that. 
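For example, after this change a QMP request like the following (sent from any QMP client; level 2 mirrors the value used by the test added in a later patch) actually takes effect instead of being silently ignored:

    { "execute": "migrate-set-parameters",
      "arguments": { "multifd-zlib-level": 2,
                     "multifd-zstd-level": 2 } }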
Intel-SIG: commit b4014a2bf57c migration: Properly apply migration compression level parameters Signed-off-by: Bryan Zhang Link: https://lore.kernel.org/r/20240301035901.4006936-2-bryan.zhang@bytedance.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/options.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/migration/options.c b/migration/options.c index 710707acea..6969beab63 100644 --- a/migration/options.c +++ b/migration/options.c @@ -1412,6 +1412,12 @@ static void migrate_params_test_apply(MigrateSetParameters *params, if (params->has_multifd_compression) { dest->multifd_compression = params->multifd_compression; } + if (params->has_multifd_zlib_level) { + dest->multifd_zlib_level = params->multifd_zlib_level; + } + if (params->has_multifd_zstd_level) { + dest->multifd_zstd_level = params->multifd_zstd_level; + } if (params->has_xbzrle_cache_size) { dest->xbzrle_cache_size = params->xbzrle_cache_size; } @@ -1568,6 +1574,12 @@ static void migrate_params_apply(MigrateSetParameters *params, Error **errp) if (params->has_multifd_compression) { s->parameters.multifd_compression = params->multifd_compression; } + if (params->has_multifd_zlib_level) { + s->parameters.multifd_zlib_level = params->multifd_zlib_level; + } + if (params->has_multifd_zstd_level) { + s->parameters.multifd_zstd_level = params->multifd_zstd_level; + } if (params->has_xbzrle_cache_size) { s->parameters.xbzrle_cache_size = params->xbzrle_cache_size; xbzrle_cache_resize(params->xbzrle_cache_size, errp); -- Gitee From beb76af7ae2eaf5f5adaa73b8bd5246fd9fed857 Mon Sep 17 00:00:00 2001 From: Bryan Zhang Date: Fri, 1 Mar 2024 03:59:01 +0000 Subject: [PATCH 119/134] tests/migration: Set compression level in migration tests commit 2b571432314ab42da742fbb578f4174166ecd7f5 upstream. Adds calls to set compression level for `zstd` and `zlib` migration tests, just to make sure that the calls work. Intel-SIG: commit 2b571432314a tests/migration: Set compression level in migration tests Signed-off-by: Bryan Zhang Link: https://lore.kernel.org/r/20240301035901.4006936-3-bryan.zhang@bytedance.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- tests/qtest/migration-test.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c index b6c91460ba..5b28d09413 100644 --- a/tests/qtest/migration-test.c +++ b/tests/qtest/migration-test.c @@ -2691,6 +2691,13 @@ static void * test_migrate_precopy_tcp_multifd_zlib_start(QTestState *from, QTestState *to) { + /* + * Overloading this test to also check that set_parameter does not error. + * This is also done in the tests for the other compression methods. + */ + migrate_set_parameter_int(from, "multifd-zlib-level", 2); + migrate_set_parameter_int(to, "multifd-zlib-level", 2); + return test_migrate_precopy_tcp_multifd_start_common(from, to, "zlib"); } @@ -2699,6 +2706,9 @@ static void * test_migrate_precopy_tcp_multifd_zstd_start(QTestState *from, QTestState *to) { + migrate_set_parameter_int(from, "multifd-zstd-level", 2); + migrate_set_parameter_int(to, "multifd-zstd-level", 2); + return test_migrate_precopy_tcp_multifd_start_common(from, to, "zstd"); } #endif /* CONFIG_ZSTD */ -- Gitee From 1a9896170249e56f17b5ae840b888cf55eea6af2 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Fri, 7 Jun 2024 14:53:04 +0100 Subject: [PATCH 120/134] docs/migration: add uadk compression feature commit 3ae9bd97829213808298ae6d35ea26f8def15dc1 upstream. 
Document UADK(User Space Accelerator Development Kit) library details and how to use that for migration. Intel-SIG: commit 3ae9bd978292 docs/migration: add uadk compression feature Signed-off-by: Shameer Kolothum Reviewed-by: Zhangfei Gao [s/Qemu/QEMU in docs] Signed-off-by: Fabiano Rosas Signed-off-by: Jason Zeng --- docs/devel/migration/features.rst | 1 + docs/devel/migration/uadk-compression.rst | 144 ++++++++++++++++++++++ 2 files changed, 145 insertions(+) create mode 100644 docs/devel/migration/uadk-compression.rst diff --git a/docs/devel/migration/features.rst b/docs/devel/migration/features.rst index 2c2aa736c3..02aef9839a 100644 --- a/docs/devel/migration/features.rst +++ b/docs/devel/migration/features.rst @@ -12,3 +12,4 @@ Migration has plenty of features to support different use cases. virtio mapped-ram qpl-compression + uadk-compression diff --git a/docs/devel/migration/uadk-compression.rst b/docs/devel/migration/uadk-compression.rst new file mode 100644 index 0000000000..3f73345dd5 --- /dev/null +++ b/docs/devel/migration/uadk-compression.rst @@ -0,0 +1,144 @@ +========================================================= +User Space Accelerator Development Kit (UADK) Compression +========================================================= +UADK is a general-purpose user space accelerator framework that uses shared +virtual addressing (SVA) to provide a unified programming interface for +hardware acceleration of cryptographic and compression algorithms. + +UADK includes Unified/User-space-access-intended Accelerator Framework (UACCE), +which enables hardware accelerators from different vendors that support SVA to +adapt to UADK. + +Currently, HiSilicon Kunpeng hardware accelerators have been registered with +UACCE. Through the UADK framework, users can run cryptographic and compression +algorithms using hardware accelerators instead of CPUs, freeing up CPU +computing power and improving computing performance. + +https://github.com/Linaro/uadk/tree/master/docs + +UADK Framework +============== +UADK consists of UACCE, vendors' drivers, and an algorithm layer. UADK requires +the hardware accelerator to support SVA, and the operating system to support +IOMMU and SVA. Hardware accelerators from different vendors are registered as +different character devices with UACCE by using kernel-mode drivers of the +vendors. A user can access the hardware accelerators by performing user-mode +operations on the character devices. + +:: + + +----------------------------------+ + | apps | + +----+------------------------+----+ + | | + | | + +-------+--------+ +-------+-------+ + | scheduler | | alg libraries | + +-------+--------+ +-------+-------+ + | | + | | + | | + | +--------+------+ + | | vendor drivers| + | +-+-------------+ + | | + | | + +--+------------------+--+ + | libwd | + User +----+-------------+-----+ + -------------------------------------------------- + Kernel +--+-----+ +------+ + | uacce | | smmu | + +---+----+ +------+ + | + +---+------------------+ + | vendor kernel driver | + +----------------------+ + -------------------------------------------------- + +----------------------+ + | HW Accelerators | + +----------------------+ + +UADK Installation +----------------- +Build UADK +^^^^^^^^^^ + +.. code-block:: shell + + git clone https://github.com/Linaro/uadk.git + cd uadk + mkdir build + ./autogen.sh + ./configure --prefix=$PWD/build + make + make install + +Without --prefix, UADK will be installed to /usr/local/lib by default. 
+If get error:"cannot find -lnuma", please install the libnuma-dev + +Run pkg-config libwd to ensure env is setup correctly +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +* export PKG_CONFIG_PATH=$PWD/build/lib/pkgconfig +* pkg-config libwd --cflags --libs + -I/usr/local/include -L/usr/local/lib -lwd + +* export PKG_CONFIG_PATH is required on demand. + Not required if UADK is installed to /usr/local/lib + +UADK Host Kernel Requirements +----------------------------- +User needs to make sure that ``UACCE`` is already supported in Linux kernel. +The kernel version should be at least v5.9 with SVA (Shared Virtual +Addressing) enabled. + +Kernel Configuration +^^^^^^^^^^^^^^^^^^^^ + +``UACCE`` could be built as module or built-in. + +Here's an example to enable UACCE with hardware accelerator in HiSilicon +Kunpeng platform. + +* CONFIG_IOMMU_SVA_LIB=y +* CONFIG_ARM_SMMU=y +* CONFIG_ARM_SMMU_V3=y +* CONFIG_ARM_SMMU_V3_SVA=y +* CONFIG_PCI_PASID=y +* CONFIG_UACCE=y +* CONFIG_CRYPTO_DEV_HISI_QM=y +* CONFIG_CRYPTO_DEV_HISI_ZIP=y + +Make sure all these above kernel configurations are selected. + +Accelerator dev node permissions +-------------------------------- +Harware accelerators(eg: HiSilicon Kunpeng Zip accelerator) gets registered to +UADK and char devices are created in dev directory. In order to access resources +on hardware accelerator devices, write permission should be provided to user. + +.. code-block:: shell + + $ sudo chmod 777 /dev/hisi_zip-* + +How To Use UADK Compression In QEMU Migration +--------------------------------------------- +* Make sure UADK is installed as above +* Build ``QEMU`` with ``--enable-uadk`` parameter + + E.g. configure --target-list=aarch64-softmmu --enable-kvm ``--enable-uadk`` + +* Enable ``UADK`` compression during migration + + Set ``migrate_set_parameter multifd-compression uadk`` + +Since UADK uses Shared Virtual Addressing(SVA) and device access virtual memory +directly it is possible that SMMUv3 may enounter page faults while walking the +IO page tables. This may impact the performance. In order to mitigate this, +please make sure to specify ``-mem-prealloc`` parameter to the destination VM +boot parameters. + +Though both UADK and ZLIB are based on the deflate compression algorithm, UADK +is not fully compatible with ZLIB. Hence, please make sure to use ``uadk`` on +both source and destination during migration. -- Gitee From 404fe2c0963da4b1e012571a8d47f4c51dfd49c3 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Fri, 7 Jun 2024 14:53:05 +0100 Subject: [PATCH 121/134] configure: Add uadk option commit cfc589a89b31930d9d658f4b0b6c4e6f33280e10 upstream. Add --enable-uadk and --disable-uadk options to enable and disable UADK compression accelerator. This is for using UADK based hardware accelerators for live migration. 
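A minimal build sketch (the target list and UADK install prefix are only examples; see the documentation added in the previous patch for the full UADK setup):

    $ export PKG_CONFIG_PATH=/path/to/uadk/build/lib/pkgconfig  # only if UADK is not in a default location
    $ ./configure --target-list=aarch64-softmmu --enable-kvm --enable-uadk
    $ make

With --enable-uadk, configure fails unless pkg-config can find libwd and libwd_comp version 2.6 or newer; with the default 'auto' setting the feature is simply left out when the library is absent.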
Intel-SIG: commit cfc589a89b31 configure: Add uadk option Reviewed-by: Fabiano Rosas Signed-off-by: Shameer Kolothum Reviewed-by: Zhangfei Gao Signed-off-by: Fabiano Rosas Signed-off-by: Jason Zeng --- meson.build | 14 ++++++++++++++ meson_options.txt | 2 ++ scripts/meson-buildoptions.sh | 3 +++ 3 files changed, 19 insertions(+) diff --git a/meson.build b/meson.build index 8f227d8f5c..e2b6c2a33b 100644 --- a/meson.build +++ b/meson.build @@ -1049,6 +1049,18 @@ if not get_option('qpl').auto() or have_system required: get_option('qpl'), method: 'pkg-config') endif +uadk = not_found +if not get_option('uadk').auto() or have_system + libwd = dependency('libwd', version: '>=2.6', + required: get_option('uadk'), + method: 'pkg-config') + libwd_comp = dependency('libwd_comp', version: '>=2.6', + required: get_option('uadk'), + method: 'pkg-config') + if libwd.found() and libwd_comp.found() + uadk = declare_dependency(dependencies: [libwd, libwd_comp]) + endif +endif virgl = not_found have_vhost_user_gpu = have_tools and targetos == 'linux' and pixman.found() @@ -2282,6 +2294,7 @@ config_host_data.set('CONFIG_STATX', has_statx) config_host_data.set('CONFIG_STATX_MNT_ID', has_statx_mnt_id) config_host_data.set('CONFIG_ZSTD', zstd.found()) config_host_data.set('CONFIG_QPL', qpl.found()) +config_host_data.set('CONFIG_UADK', uadk.found()) config_host_data.set('CONFIG_FUSE', fuse.found()) config_host_data.set('CONFIG_FUSE_LSEEK', fuse_lseek.found()) config_host_data.set('CONFIG_SPICE_PROTOCOL', spice_protocol.found()) @@ -4457,6 +4470,7 @@ summary_info += {'bzip2 support': libbzip2} summary_info += {'lzfse support': liblzfse} summary_info += {'zstd support': zstd} summary_info += {'Query Processing Library support': qpl} +summary_info += {'UADK Library support': uadk} summary_info += {'NUMA host support': numa} summary_info += {'capstone': capstone} summary_info += {'libpmem support': libpmem} diff --git a/meson_options.txt b/meson_options.txt index 82f73d51ce..709678fa18 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -261,6 +261,8 @@ option('zstd', type : 'feature', value : 'auto', description: 'zstd compression support') option('qpl', type : 'feature', value : 'auto', description: 'Query Processing Library support') +option('uadk', type : 'feature', value : 'auto', + description: 'UADK Library support') option('fuse', type: 'feature', value: 'auto', description: 'FUSE block device export') option('fuse_lseek', type : 'feature', value : 'auto', diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh index 784f74fde9..833b996818 100644 --- a/scripts/meson-buildoptions.sh +++ b/scripts/meson-buildoptions.sh @@ -223,6 +223,7 @@ meson_options_help() { printf "%s\n" ' xkbcommon xkbcommon support' printf "%s\n" ' zstd zstd compression support' printf "%s\n" ' qpl Query Processing Library support' + printf "%s\n" ' uadk UADK Library support' } _meson_option_parse() { case $1 in @@ -565,6 +566,8 @@ _meson_option_parse() { --disable-zstd) printf "%s" -Dzstd=disabled ;; --enable-qpl) printf "%s" -Dqpl=enabled ;; --disable-qpl) printf "%s" -Dqpl=disabled ;; + --enable-uadk) printf "%s" -Duadk=enabled ;; + --disable-uadk) printf "%s" -Duadk=disabled ;; *) return 1 ;; esac } -- Gitee From e8025ef6d127843514ca3cc3a5603827c040ffd1 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Fri, 7 Jun 2024 14:53:06 +0100 Subject: [PATCH 122/134] migration/multifd: add uadk compression framework commit f3d8bb759d13a2e33389f00fa338d0761309029a upstream. 
Adds the skeleton to support uadk compression method. Complete functionality will be added in subsequent patches. Intel-SIG: commit f3d8bb759d13 migration/multifd: add uadk compression framework Acked-by: Markus Armbruster Reviewed-by: Fabiano Rosas Signed-off-by: Shameer Kolothum Reviewed-by: Zhangfei Gao Signed-off-by: Fabiano Rosas Signed-off-by: Jason Zeng --- hw/core/qdev-properties-system.c | 2 +- migration/meson.build | 1 + migration/multifd-uadk.c | 20 ++++++++++++++++++++ migration/multifd.h | 5 +++-- qapi/migration.json | 5 ++++- 5 files changed, 29 insertions(+), 4 deletions(-) create mode 100644 migration/multifd-uadk.c diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-system.c index 6ee9744e00..650c42eaf8 100644 --- a/hw/core/qdev-properties-system.c +++ b/hw/core/qdev-properties-system.c @@ -711,7 +711,7 @@ const PropertyInfo qdev_prop_fdc_drive_type = { const PropertyInfo qdev_prop_multifd_compression = { .name = "MultiFDCompression", .description = "multifd_compression values, " - "none/zlib/zstd/qpl", + "none/zlib/zstd/qpl/uadk", .enum_table = &MultiFDCompression_lookup, .get = qdev_propinfo_get_enum, .set = qdev_propinfo_set_enum, diff --git a/migration/meson.build b/migration/meson.build index 6652f68d32..264d04657f 100644 --- a/migration/meson.build +++ b/migration/meson.build @@ -41,6 +41,7 @@ if get_option('live_block_migration').allowed() endif system_ss.add(when: zstd, if_true: files('multifd-zstd.c')) system_ss.add(when: qpl, if_true: files('multifd-qpl.c')) +system_ss.add(when: uadk, if_true: files('multifd-uadk.c')) specific_ss.add(when: 'CONFIG_SYSTEM_ONLY', if_true: files('ram.c', diff --git a/migration/multifd-uadk.c b/migration/multifd-uadk.c new file mode 100644 index 0000000000..c2bb07535b --- /dev/null +++ b/migration/multifd-uadk.c @@ -0,0 +1,20 @@ +/* + * Multifd UADK compression accelerator implementation + * + * Copyright (c) 2024 Huawei Technologies R & D (UK) Ltd + * + * Authors: + * Shameer Kolothum + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qemu/module.h" + +static void multifd_uadk_register(void) +{ + /* noop for now */ +} +migration_init(multifd_uadk_register); diff --git a/migration/multifd.h b/migration/multifd.h index 5b7d9b15f8..0ecd6f47d7 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -34,13 +34,14 @@ MultiFDRecvData *multifd_get_recv_data(void); /* Multifd Compression flags */ #define MULTIFD_FLAG_SYNC (1 << 0) -/* We reserve 3 bits for compression methods */ -#define MULTIFD_FLAG_COMPRESSION_MASK (7 << 1) +/* We reserve 4 bits for compression methods */ +#define MULTIFD_FLAG_COMPRESSION_MASK (0xf << 1) /* we need to be compatible. Before compression value was 0 */ #define MULTIFD_FLAG_NOCOMP (0 << 1) #define MULTIFD_FLAG_ZLIB (1 << 1) #define MULTIFD_FLAG_ZSTD (2 << 1) #define MULTIFD_FLAG_QPL (4 << 1) +#define MULTIFD_FLAG_UADK (8 << 1) /* This value needs to be a multiple of qemu_target_page_size() */ #define MULTIFD_PACKET_SIZE (512 * 1024) diff --git a/qapi/migration.json b/qapi/migration.json index 994416d1be..06c1cebcc1 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -634,12 +634,15 @@ # In-Memory Analytics Accelerator(IAA) accelerated compression # and decompression. (Since 9.1) # +# @uadk: use UADK library compression method. 
(Since 9.1) +# # Since: 5.0 ## { 'enum': 'MultiFDCompression', 'data': [ 'none', 'zlib', { 'name': 'zstd', 'if': 'CONFIG_ZSTD' }, - { 'name': 'qpl', 'if': 'CONFIG_QPL' } ] } + { 'name': 'qpl', 'if': 'CONFIG_QPL' }, + { 'name': 'uadk', 'if': 'CONFIG_UADK' } ] } ## # @MigMode: -- Gitee From 129f46085aef5fdb1249ba35f30c0dc49fb68b28 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Fri, 7 Jun 2024 14:53:07 +0100 Subject: [PATCH 123/134] migration/multifd: Add UADK initialization commit 819dd20636d51d5dc9d42aa28edb3dd9c1b8b863 upstream. Initialize UADK session and allocate buffers required. The actual compression/decompression will only be done in a subsequent patch. Intel-SIG: commit 819dd20636d5 migration/multifd: Add UADK initialization Signed-off-by: Shameer Kolothum Reviewed-by: Fabiano Rosas Reviewed-by: Zhangfei Gao Signed-off-by: Fabiano Rosas Signed-off-by: Jason Zeng --- migration/multifd-uadk.c | 209 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 208 insertions(+), 1 deletion(-) diff --git a/migration/multifd-uadk.c b/migration/multifd-uadk.c index c2bb07535b..535411a405 100644 --- a/migration/multifd-uadk.c +++ b/migration/multifd-uadk.c @@ -12,9 +12,216 @@ #include "qemu/osdep.h" #include "qemu/module.h" +#include "qapi/error.h" +#include "migration.h" +#include "multifd.h" +#include "options.h" +#include "uadk/wd_comp.h" +#include "uadk/wd_sched.h" + +struct wd_data { + handle_t handle; + uint8_t *buf; + uint32_t *buf_hdr; +}; + +static bool uadk_hw_init(void) +{ + char alg[] = "zlib"; + int ret; + + ret = wd_comp_init2(alg, SCHED_POLICY_RR, TASK_HW); + if (ret && ret != -WD_EEXIST) { + return false; + } else { + return true; + } +} + +static struct wd_data *multifd_uadk_init_sess(uint32_t count, + uint32_t page_size, + bool compress, Error **errp) +{ + struct wd_comp_sess_setup ss = {0}; + struct sched_params param = {0}; + uint32_t size = count * page_size; + struct wd_data *wd; + + if (!uadk_hw_init()) { + error_setg(errp, "multifd: UADK hardware not available"); + return NULL; + } + + wd = g_new0(struct wd_data, 1); + ss.alg_type = WD_ZLIB; + if (compress) { + ss.op_type = WD_DIR_COMPRESS; + /* Add an additional page for handling output > input */ + size += page_size; + } else { + ss.op_type = WD_DIR_DECOMPRESS; + } + + /* We use default level 1 compression and 4K window size */ + param.type = ss.op_type; + ss.sched_param = ¶m; + + wd->handle = wd_comp_alloc_sess(&ss); + if (!wd->handle) { + error_setg(errp, "multifd: failed wd_comp_alloc_sess"); + goto out; + } + + wd->buf = g_try_malloc(size); + if (!wd->buf) { + error_setg(errp, "multifd: out of mem for uadk buf"); + goto out_free_sess; + } + wd->buf_hdr = g_new0(uint32_t, count); + return wd; + +out_free_sess: + wd_comp_free_sess(wd->handle); +out: + wd_comp_uninit2(); + g_free(wd); + return NULL; +} + +static void multifd_uadk_uninit_sess(struct wd_data *wd) +{ + wd_comp_free_sess(wd->handle); + wd_comp_uninit2(); + g_free(wd->buf); + g_free(wd->buf_hdr); + g_free(wd); +} + +/** + * multifd_uadk_send_setup: setup send side + * + * Returns 0 for success or -1 for error + * + * @p: Params for the channel that we are using + * @errp: pointer to an error + */ +static int multifd_uadk_send_setup(MultiFDSendParams *p, Error **errp) +{ + struct wd_data *wd; + + wd = multifd_uadk_init_sess(p->page_count, p->page_size, true, errp); + if (!wd) { + return -1; + } + + p->compress_data = wd; + assert(p->iov == NULL); + /* + * Each page will be compressed independently and sent using an IOV. 
The + * additional two IOVs are used to store packet header and compressed data + * length + */ + + p->iov = g_new0(struct iovec, p->page_count + 2); + return 0; +} + +/** + * multifd_uadk_send_cleanup: cleanup send side + * + * Close the channel and return memory. + * + * @p: Params for the channel that we are using + * @errp: pointer to an error + */ +static void multifd_uadk_send_cleanup(MultiFDSendParams *p, Error **errp) +{ + struct wd_data *wd = p->compress_data; + + multifd_uadk_uninit_sess(wd); + p->compress_data = NULL; +} + +/** + * multifd_uadk_send_prepare: prepare data to be able to send + * + * Create a compressed buffer with all the pages that we are going to + * send. + * + * Returns 0 for success or -1 for error + * + * @p: Params for the channel that we are using + * @errp: pointer to an error + */ +static int multifd_uadk_send_prepare(MultiFDSendParams *p, Error **errp) +{ + return -1; +} + +/** + * multifd_uadk_recv_setup: setup receive side + * + * Create the compressed channel and buffer. + * + * Returns 0 for success or -1 for error + * + * @p: Params for the channel that we are using + * @errp: pointer to an error + */ +static int multifd_uadk_recv_setup(MultiFDRecvParams *p, Error **errp) +{ + struct wd_data *wd; + + wd = multifd_uadk_init_sess(p->page_count, p->page_size, false, errp); + if (!wd) { + return -1; + } + p->compress_data = wd; + return 0; +} + +/** + * multifd_uadk_recv_cleanup: cleanup receive side + * + * Close the channel and return memory. + * + * @p: Params for the channel that we are using + */ +static void multifd_uadk_recv_cleanup(MultiFDRecvParams *p) +{ + struct wd_data *wd = p->compress_data; + + multifd_uadk_uninit_sess(wd); + p->compress_data = NULL; +} + +/** + * multifd_uadk_recv: read the data from the channel into actual pages + * + * Read the compressed buffer, and uncompress it into the actual + * pages. + * + * Returns 0 for success or -1 for error + * + * @p: Params for the channel that we are using + * @errp: pointer to an error + */ +static int multifd_uadk_recv(MultiFDRecvParams *p, Error **errp) +{ + return -1; +} + +static MultiFDMethods multifd_uadk_ops = { + .send_setup = multifd_uadk_send_setup, + .send_cleanup = multifd_uadk_send_cleanup, + .send_prepare = multifd_uadk_send_prepare, + .recv_setup = multifd_uadk_recv_setup, + .recv_cleanup = multifd_uadk_recv_cleanup, + .recv = multifd_uadk_recv, +}; static void multifd_uadk_register(void) { - /* noop for now */ + multifd_register_ops(MULTIFD_COMPRESSION_UADK, &multifd_uadk_ops); } migration_init(multifd_uadk_register); -- Gitee From 83809f90510996749992287018274d000f4ff0a5 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Fri, 7 Jun 2024 14:53:08 +0100 Subject: [PATCH 124/134] migration/multifd: Add UADK based compression and decompression commit 3c49191a0d011d941b347fda8fdadd88c988e753 upstream. Uses UADK wd_do_comp_sync() API to (de)compress a normal page using hardware accelerator. 
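For readers unfamiliar with the UADK API, the shape of a single synchronous compression call is roughly the following self-contained sketch (the helper name is made up for illustration and session setup is omitted; the real per-page bookkeeping lives in multifd_uadk_send_prepare() and multifd_uadk_recv() below):

    #include <stdint.h>
    #include <uadk/wd_comp.h>

    /* Illustrative helper, not part of the patch: compress one page into dst
     * and return the produced length, or -1 on failure (the caller would then
     * send the raw page instead). */
    static int uadk_compress_page(handle_t h, void *page, uint32_t page_size,
                                  void *dst, uint32_t dst_room, uint32_t *out_len)
    {
        struct wd_comp_req creq = {
            .op_type = WD_DIR_COMPRESS,
            .src     = page,
            .src_len = page_size,
            .dst     = dst,
            .dst_len = dst_room,    /* the patch passes 2 * page_size here */
        };

        if (wd_do_comp_sync(h, &creq) || creq.status) {
            return -1;
        }
        *out_len = creq.dst_len;    /* updated by UADK with the produced size */
        return 0;
    }

Decompression is symmetric: op_type is WD_DIR_DECOMPRESS, src/src_len carry the compressed data and dst/dst_len the page-sized output buffer.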
Intel-SIG: commit 3c49191a0d01 migration/multifd: Add UADK based compression and decompression Reviewed-by: Fabiano Rosas Signed-off-by: Shameer Kolothum Reviewed-by: Zhangfei Gao Signed-off-by: Fabiano Rosas Signed-off-by: Jason Zeng --- migration/multifd-uadk.c | 132 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 130 insertions(+), 2 deletions(-) diff --git a/migration/multifd-uadk.c b/migration/multifd-uadk.c index 535411a405..70bba92eaa 100644 --- a/migration/multifd-uadk.c +++ b/migration/multifd-uadk.c @@ -13,6 +13,7 @@ #include "qemu/osdep.h" #include "qemu/module.h" #include "qapi/error.h" +#include "exec/ramblock.h" #include "migration.h" #include "multifd.h" #include "options.h" @@ -142,6 +143,15 @@ static void multifd_uadk_send_cleanup(MultiFDSendParams *p, Error **errp) p->compress_data = NULL; } +static inline void prepare_next_iov(MultiFDSendParams *p, void *base, + uint32_t len) +{ + p->iov[p->iovs_num].iov_base = (uint8_t *)base; + p->iov[p->iovs_num].iov_len = len; + p->next_packet_size += len; + p->iovs_num++; +} + /** * multifd_uadk_send_prepare: prepare data to be able to send * @@ -155,7 +165,56 @@ static void multifd_uadk_send_cleanup(MultiFDSendParams *p, Error **errp) */ static int multifd_uadk_send_prepare(MultiFDSendParams *p, Error **errp) { - return -1; + struct wd_data *uadk_data = p->compress_data; + uint32_t hdr_size; + uint8_t *buf = uadk_data->buf; + int ret = 0; + + if (!multifd_send_prepare_common(p)) { + goto out; + } + + hdr_size = p->pages->normal_num * sizeof(uint32_t); + /* prepare the header that stores the lengths of all compressed data */ + prepare_next_iov(p, uadk_data->buf_hdr, hdr_size); + + for (int i = 0; i < p->pages->normal_num; i++) { + struct wd_comp_req creq = { + .op_type = WD_DIR_COMPRESS, + .src = p->pages->block->host + p->pages->offset[i], + .src_len = p->page_size, + .dst = buf, + /* Set dst_len to double the src in case compressed out >= page_size */ + .dst_len = p->page_size * 2, + }; + + ret = wd_do_comp_sync(uadk_data->handle, &creq); + if (ret || creq.status) { + error_setg(errp, "multifd %u: failed compression, ret %d status %d", + p->id, ret, creq.status); + return -1; + } + if (creq.dst_len < p->page_size) { + uadk_data->buf_hdr[i] = cpu_to_be32(creq.dst_len); + prepare_next_iov(p, buf, creq.dst_len); + buf += creq.dst_len; + } else { + /* + * Send raw data if compressed out >= page_size. We might be better + * off sending raw data if output is slightly less than page_size + * as well because at the receive end we can skip the decompression. + * But it is tricky to find the right number here. 
+ */ + uadk_data->buf_hdr[i] = cpu_to_be32(p->page_size); + prepare_next_iov(p, p->pages->block->host + p->pages->offset[i], + p->page_size); + buf += p->page_size; + } + } +out: + p->flags |= MULTIFD_FLAG_UADK; + multifd_send_fill_packet(p); + return 0; } /** @@ -208,7 +267,76 @@ static void multifd_uadk_recv_cleanup(MultiFDRecvParams *p) */ static int multifd_uadk_recv(MultiFDRecvParams *p, Error **errp) { - return -1; + struct wd_data *uadk_data = p->compress_data; + uint32_t in_size = p->next_packet_size; + uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; + uint32_t hdr_len = p->normal_num * sizeof(uint32_t); + uint32_t data_len = 0; + uint8_t *buf = uadk_data->buf; + int ret = 0; + + if (flags != MULTIFD_FLAG_UADK) { + error_setg(errp, "multifd %u: flags received %x flags expected %x", + p->id, flags, MULTIFD_FLAG_ZLIB); + return -1; + } + + multifd_recv_zero_page_process(p); + if (!p->normal_num) { + assert(in_size == 0); + return 0; + } + + /* read compressed data lengths */ + assert(hdr_len < in_size); + ret = qio_channel_read_all(p->c, (void *) uadk_data->buf_hdr, + hdr_len, errp); + if (ret != 0) { + return ret; + } + + for (int i = 0; i < p->normal_num; i++) { + uadk_data->buf_hdr[i] = be32_to_cpu(uadk_data->buf_hdr[i]); + data_len += uadk_data->buf_hdr[i]; + assert(uadk_data->buf_hdr[i] <= p->page_size); + } + + /* read compressed data */ + assert(in_size == hdr_len + data_len); + ret = qio_channel_read_all(p->c, (void *)buf, data_len, errp); + if (ret != 0) { + return ret; + } + + for (int i = 0; i < p->normal_num; i++) { + struct wd_comp_req creq = { + .op_type = WD_DIR_DECOMPRESS, + .src = buf, + .src_len = uadk_data->buf_hdr[i], + .dst = p->host + p->normal[i], + .dst_len = p->page_size, + }; + + if (uadk_data->buf_hdr[i] == p->page_size) { + memcpy(p->host + p->normal[i], buf, p->page_size); + buf += p->page_size; + continue; + } + + ret = wd_do_comp_sync(uadk_data->handle, &creq); + if (ret || creq.status) { + error_setg(errp, "multifd %u: failed decompression, ret %d status %d", + p->id, ret, creq.status); + return -1; + } + if (creq.dst_len != p->page_size) { + error_setg(errp, "multifd %u: decompressed length error", p->id); + return -1; + } + buf += uadk_data->buf_hdr[i]; + } + + return 0; } static MultiFDMethods multifd_uadk_ops = { -- Gitee From 627affb4b391feef7e666a5506fb7d1ac985ea79 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Fri, 7 Jun 2024 14:53:09 +0100 Subject: [PATCH 125/134] migration/multifd: Switch to no compression when no hardware support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit c1dfd12168e1be0a940e97f85044098e18d18178 upstream. Send raw packets over if UADK hardware support is not available. This is to satisfy  Qemu qtest CI which may run on platforms that don't have UADK hardware support. Subsequent patch will add support for uadk migration qtest. 
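Once the qtest added by a later patch is in place, this no-hardware path can be exercised straight from a build tree on a machine without any UADK-capable accelerator; a sketch of a manual run (build directory and binary name are the usual defaults):

    $ cd build
    $ QTEST_QEMU_BINARY=./qemu-system-aarch64 \
          ./tests/qtest/migration-test -p /migration/multifd/tcp/plain/uadk

In that case the sender warns once via warn_report_once() and emits only raw pages, which the receiver copies straight into guest memory without touching the (absent) hardware.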
Intel-SIG: commit c1dfd12168e1 migration/multifd: Switch to no compression when no hardware support Reviewed-by: Fabiano Rosas Signed-off-by: Shameer Kolothum Reviewed-by: Zhangfei Gao Signed-off-by: Fabiano Rosas Signed-off-by: Jason Zeng --- migration/multifd-uadk.c | 92 +++++++++++++++++++++++----------------- 1 file changed, 53 insertions(+), 39 deletions(-) diff --git a/migration/multifd-uadk.c b/migration/multifd-uadk.c index 70bba92eaa..d12353fb21 100644 --- a/migration/multifd-uadk.c +++ b/migration/multifd-uadk.c @@ -17,6 +17,7 @@ #include "migration.h" #include "multifd.h" #include "options.h" +#include "qemu/error-report.h" #include "uadk/wd_comp.h" #include "uadk/wd_sched.h" @@ -48,29 +49,29 @@ static struct wd_data *multifd_uadk_init_sess(uint32_t count, uint32_t size = count * page_size; struct wd_data *wd; - if (!uadk_hw_init()) { - error_setg(errp, "multifd: UADK hardware not available"); - return NULL; - } - wd = g_new0(struct wd_data, 1); - ss.alg_type = WD_ZLIB; - if (compress) { - ss.op_type = WD_DIR_COMPRESS; - /* Add an additional page for handling output > input */ - size += page_size; - } else { - ss.op_type = WD_DIR_DECOMPRESS; - } - - /* We use default level 1 compression and 4K window size */ - param.type = ss.op_type; - ss.sched_param = ¶m; - wd->handle = wd_comp_alloc_sess(&ss); - if (!wd->handle) { - error_setg(errp, "multifd: failed wd_comp_alloc_sess"); - goto out; + if (uadk_hw_init()) { + ss.alg_type = WD_ZLIB; + if (compress) { + ss.op_type = WD_DIR_COMPRESS; + /* Add an additional page for handling output > input */ + size += page_size; + } else { + ss.op_type = WD_DIR_DECOMPRESS; + } + /* We use default level 1 compression and 4K window size */ + param.type = ss.op_type; + ss.sched_param = ¶m; + + wd->handle = wd_comp_alloc_sess(&ss); + if (!wd->handle) { + error_setg(errp, "multifd: failed wd_comp_alloc_sess"); + goto out; + } + } else { + /* For CI test use */ + warn_report_once("UADK hardware not available. Switch to no compression mode"); } wd->buf = g_try_malloc(size); @@ -82,7 +83,9 @@ static struct wd_data *multifd_uadk_init_sess(uint32_t count, return wd; out_free_sess: - wd_comp_free_sess(wd->handle); + if (wd->handle) { + wd_comp_free_sess(wd->handle); + } out: wd_comp_uninit2(); g_free(wd); @@ -91,7 +94,9 @@ out: static void multifd_uadk_uninit_sess(struct wd_data *wd) { - wd_comp_free_sess(wd->handle); + if (wd->handle) { + wd_comp_free_sess(wd->handle); + } wd_comp_uninit2(); g_free(wd->buf); g_free(wd->buf_hdr); @@ -188,23 +193,26 @@ static int multifd_uadk_send_prepare(MultiFDSendParams *p, Error **errp) .dst_len = p->page_size * 2, }; - ret = wd_do_comp_sync(uadk_data->handle, &creq); - if (ret || creq.status) { - error_setg(errp, "multifd %u: failed compression, ret %d status %d", - p->id, ret, creq.status); - return -1; + if (uadk_data->handle) { + ret = wd_do_comp_sync(uadk_data->handle, &creq); + if (ret || creq.status) { + error_setg(errp, "multifd %u: failed compression, ret %d status %d", + p->id, ret, creq.status); + return -1; + } + if (creq.dst_len < p->page_size) { + uadk_data->buf_hdr[i] = cpu_to_be32(creq.dst_len); + prepare_next_iov(p, buf, creq.dst_len); + buf += creq.dst_len; + } } - if (creq.dst_len < p->page_size) { - uadk_data->buf_hdr[i] = cpu_to_be32(creq.dst_len); - prepare_next_iov(p, buf, creq.dst_len); - buf += creq.dst_len; - } else { - /* - * Send raw data if compressed out >= page_size. 
We might be better - * off sending raw data if output is slightly less than page_size - * as well because at the receive end we can skip the decompression. - * But it is tricky to find the right number here. - */ + /* + * Send raw data if no UADK hardware or if compressed out >= page_size. + * We might be better off sending raw data if output is slightly less + * than page_size as well because at the receive end we can skip the + * decompression. But it is tricky to find the right number here. + */ + if (!uadk_data->handle || creq.dst_len >= p->page_size) { uadk_data->buf_hdr[i] = cpu_to_be32(p->page_size); prepare_next_iov(p, p->pages->block->host + p->pages->offset[i], p->page_size); @@ -323,6 +331,12 @@ static int multifd_uadk_recv(MultiFDRecvParams *p, Error **errp) continue; } + if (unlikely(!uadk_data->handle)) { + error_setg(errp, "multifd %u: UADK HW not available for decompression", + p->id); + return -1; + } + ret = wd_do_comp_sync(uadk_data->handle, &creq); if (ret || creq.status) { error_setg(errp, "multifd %u: failed decompression, ret %d status %d", -- Gitee From 73238a34a3b4d94ec0b139e8a420ca2719b1e16b Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Fri, 7 Jun 2024 14:53:10 +0100 Subject: [PATCH 126/134] tests/migration-test: add uadk compression test commit c519caa825f5eba6e204bed5a464df167a5421d0 upstream. Intel-SIG: commit c519caa825f5 tests/migration-test: add uadk compression test Reviewed-by: Fabiano Rosas Signed-off-by: Shameer Kolothum Signed-off-by: Fabiano Rosas Signed-off-by: Jason Zeng --- tests/qtest/migration-test.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c index 5b28d09413..d79978ac38 100644 --- a/tests/qtest/migration-test.c +++ b/tests/qtest/migration-test.c @@ -2721,6 +2721,14 @@ test_migrate_precopy_tcp_multifd_qpl_start(QTestState *from, return test_migrate_precopy_tcp_multifd_start_common(from, to, "qpl"); } #endif /* CONFIG_QPL */ +#ifdef CONFIG_UADK +static void * +test_migrate_precopy_tcp_multifd_uadk_start(QTestState *from, + QTestState *to) +{ + return test_migrate_precopy_tcp_multifd_start_common(from, to, "uadk"); +} +#endif /* CONFIG_UADK */ static void test_multifd_tcp_none(void) { @@ -2798,6 +2806,17 @@ static void test_multifd_tcp_qpl(void) } #endif +#ifdef CONFIG_UADK +static void test_multifd_tcp_uadk(void) +{ + MigrateCommon args = { + .listen_uri = "defer", + .start_hook = test_migrate_precopy_tcp_multifd_uadk_start, + }; + test_precopy_common(&args); +} +#endif + #ifdef CONFIG_GNUTLS static void * test_migrate_multifd_tcp_tls_psk_start_match(QTestState *from, @@ -3695,6 +3714,10 @@ int main(int argc, char **argv) migration_test_add("/migration/multifd/tcp/plain/qpl", test_multifd_tcp_qpl); #endif +#ifdef CONFIG_UADK + migration_test_add("/migration/multifd/tcp/plain/uadk", + test_multifd_tcp_uadk); +#endif #ifdef CONFIG_GNUTLS migration_test_add("/migration/multifd/tcp/tls/psk/match", test_multifd_tcp_tls_psk_match); -- Gitee From 2257fb170014e316d45f7f3c8e4fefa2e0d6212b Mon Sep 17 00:00:00 2001 From: Yuan Liu Date: Fri, 30 Aug 2024 16:27:18 -0700 Subject: [PATCH 127/134] docs/migration: add qatzip compression feature commit 85da4cbe6e5eb6ba6f31c8b30ee4582625546da7 upstream. 
add Intel QATzip compression method introduction Intel-SIG: commit 85da4cbe6e5e docs/migration: add qatzip compression feature Reviewed-by: Nanhai Zou Reviewed-by: Peter Xu Reviewed-by: Fabiano Rosas Signed-off-by: Yuan Liu Signed-off-by: Yichen Wang Link: https://lore.kernel.org/r/20240830232722.58272-2-yichen.wang@bytedance.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- docs/devel/migration/features.rst | 1 + docs/devel/migration/qatzip-compression.rst | 165 ++++++++++++++++++++ 2 files changed, 166 insertions(+) create mode 100644 docs/devel/migration/qatzip-compression.rst diff --git a/docs/devel/migration/features.rst b/docs/devel/migration/features.rst index 02aef9839a..be3d8fec9f 100644 --- a/docs/devel/migration/features.rst +++ b/docs/devel/migration/features.rst @@ -13,3 +13,4 @@ Migration has plenty of features to support different use cases. mapped-ram qpl-compression uadk-compression + qatzip-compression diff --git a/docs/devel/migration/qatzip-compression.rst b/docs/devel/migration/qatzip-compression.rst new file mode 100644 index 0000000000..862b383164 --- /dev/null +++ b/docs/devel/migration/qatzip-compression.rst @@ -0,0 +1,165 @@ +================== +QATzip Compression +================== +In scenarios with limited network bandwidth, the ``QATzip`` solution can help +users save a lot of host CPU resources by accelerating compression and +decompression through the Intel QuickAssist Technology(``QAT``) hardware. + + +The following test was conducted using 8 multifd channels and 10Gbps network +bandwidth. The results show that, compared to zstd, ``QATzip`` significantly +saves CPU resources on the sender and reduces migration time. Compared to the +uncompressed solution, ``QATzip`` greatly improves the dirty page processing +capability, indicated by the Pages per Second metric, and also reduces the +total migration time. + +:: + + VM Configuration: 16 vCPU and 64G memory + VM Workload: all vCPUs are idle and 54G memory is filled with Silesia data. + QAT Devices: 4 + |-----------|--------|---------|----------|----------|------|------| + |8 Channels |Total |down |throughput|pages per | send | recv | + | |time(ms)|time(ms) |(mbps) |second | cpu %| cpu% | + |-----------|--------|---------|----------|----------|------|------| + |qatzip | 16630| 28| 10467| 2940235| 160| 360| + |-----------|--------|---------|----------|----------|------|------| + |zstd | 20165| 24| 8579| 2391465| 810| 340| + |-----------|--------|---------|----------|----------|------|------| + |none | 46063| 40| 10848| 330240| 45| 85| + |-----------|--------|---------|----------|----------|------|------| + + +QATzip Compression Framework +============================ + +``QATzip`` is a user space library which builds on top of the Intel QuickAssist +Technology to provide extended accelerated compression and decompression +services. + +For more ``QATzip`` introduction, please refer to `QATzip Introduction +`_ + +:: + + +----------------+ + | MultiFd Thread | + +-------+--------+ + | + | compress/decompress + +-------+--------+ + | QATzip library | + +-------+--------+ + | + +-------+--------+ + | QAT library | + +-------+--------+ + | user space + --------+--------------------- + | kernel space + +------+-------+ + | QAT Driver | + +------+-------+ + | + +------+-------+ + | QAT Devices | + +--------------+ + + +QATzip Installation +------------------- + +The ``QATzip`` installation package has been integrated into some Linux +distributions and can be installed directly. 
For example, the Ubuntu Server +24.04 LTS system can be installed using below command + +.. code-block:: shell + + #apt search qatzip + libqatzip-dev/noble 1.2.0-0ubuntu3 amd64 + Intel QuickAssist user space library development files + + libqatzip3/noble 1.2.0-0ubuntu3 amd64 + Intel QuickAssist user space library + + qatzip/noble,now 1.2.0-0ubuntu3 amd64 [installed] + Compression user-space tool for Intel QuickAssist Technology + + #sudo apt install libqatzip-dev libqatzip3 qatzip + +If your system does not support the ``QATzip`` installation package, you can +use the source code to build and install, please refer to `QATzip source code installation +`_ + +QAT Hardware Deployment +----------------------- + +``QAT`` supports physical functions(PFs) and virtual functions(VFs) for +deployment, and users can configure ``QAT`` resources for migration according +to actual needs. For more details about ``QAT`` deployment, please refer to +`Intel QuickAssist Technology Documentation +`_ + +For more ``QAT`` hardware introduction, please refer to `intel-quick-assist-technology-overview +`_ + +How To Use QATzip Compression +============================= + +1 - Install ``QATzip`` library + +2 - Build ``QEMU`` with ``--enable-qatzip`` parameter + + E.g. configure --target-list=x86_64-softmmu --enable-kvm ``--enable-qatzip`` + +3 - Set ``migrate_set_parameter multifd-compression qatzip`` + +4 - Set ``migrate_set_parameter multifd-qatzip-level comp_level``, the default +comp_level value is 1, and it supports levels from 1 to 9 + +QAT Memory Requirements +======================= + +The user needs to reserve system memory for the QAT memory management to +allocate DMA memory. The size of the reserved system memory depends on the +number of devices used for migration and the number of multifd channels. + +Because memory usage depends on QAT configuration, please refer to `QAT Memory +Driver Queries +`_ +for memory usage calculation. + +.. list-table:: An example of a PF used for migration + :header-rows: 1 + + * - Number of channels + - Sender memory usage + - Receiver memory usage + * - 2 + - 10M + - 10M + * - 4 + - 12M + - 14M + * - 8 + - 16M + - 20M + +How To Choose Between QATzip and QPL +==================================== +Starting from 4th Gen Intel Xeon Scalable processors, codenamed Sapphire Rapids +processor(``SPR``), multiple built-in accelerators are supported including +``QAT`` and ``IAA``. The former can accelerate ``QATzip`` and the latter is +used to accelerate ``QPL``. + +Here are some suggestions: + +1 - If the live migration scenario is limited by network bandwidth and ``QAT`` +hardware resources exceed ``IAA``, use the ``QATzip`` method, which can save a +lot of host CPU resources for compression. + +2 - If the system cannot support shared virtual memory (SVM) technology, use +the ``QATzip`` method because ``QPL`` performance is not good without SVM +support. + +3 - For other scenarios, use the ``QPL`` method first. -- Gitee From 39d2941c51cfc65fafb35349fb491184e51ce52b Mon Sep 17 00:00:00 2001 From: Bryan Zhang Date: Fri, 30 Aug 2024 16:27:19 -0700 Subject: [PATCH 128/134] meson: Introduce 'qatzip' feature to the build system commit e28ed313c268aeb4e0cefb66dcd215c30e4443fe upstream. Add a 'qatzip' feature, which is automatically disabled, and which depends on the QATzip library if enabled. 
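A build sketch: with the default setting the feature stays off when the QATzip library (pkg-config name 'qatzip', version 1.1.2 or newer) is not found, and it can be forced on either through configure or directly through meson (build directory name is the usual default):

    $ ./configure --target-list=x86_64-softmmu --enable-qatzip
    # or, on an existing build directory:
    $ meson configure build -Dqatzip=enabled

When enabled, the summary printed at the end of configure gains a 'qatzip support' line.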
Intel-SIG: commit e28ed313c268 meson: Introduce 'qatzip' feature to the build system Reviewed-by: Fabiano Rosas Signed-off-by: Bryan Zhang Signed-off-by: Hao Xiang Signed-off-by: Yichen Wang Link: https://lore.kernel.org/r/20240830232722.58272-3-yichen.wang@bytedance.com Signed-off-by: Peter Xu Conflicts: scripts/meson-buildoptions.sh [jz: resolve context conflict due to pvrdma] Signed-off-by: Jason Zeng --- meson.build | 10 ++++++++++ meson_options.txt | 2 ++ scripts/meson-buildoptions.sh | 3 +++ 3 files changed, 15 insertions(+) diff --git a/meson.build b/meson.build index e2b6c2a33b..22444bdb87 100644 --- a/meson.build +++ b/meson.build @@ -1061,6 +1061,14 @@ if not get_option('uadk').auto() or have_system uadk = declare_dependency(dependencies: [libwd, libwd_comp]) endif endif + +qatzip = not_found +if not get_option('qatzip').auto() or have_system + qatzip = dependency('qatzip', version: '>=1.1.2', + required: get_option('qatzip'), + method: 'pkg-config') +endif + virgl = not_found have_vhost_user_gpu = have_tools and targetos == 'linux' and pixman.found() @@ -2295,6 +2303,7 @@ config_host_data.set('CONFIG_STATX_MNT_ID', has_statx_mnt_id) config_host_data.set('CONFIG_ZSTD', zstd.found()) config_host_data.set('CONFIG_QPL', qpl.found()) config_host_data.set('CONFIG_UADK', uadk.found()) +config_host_data.set('CONFIG_QATZIP', qatzip.found()) config_host_data.set('CONFIG_FUSE', fuse.found()) config_host_data.set('CONFIG_FUSE_LSEEK', fuse_lseek.found()) config_host_data.set('CONFIG_SPICE_PROTOCOL', spice_protocol.found()) @@ -4471,6 +4480,7 @@ summary_info += {'lzfse support': liblzfse} summary_info += {'zstd support': zstd} summary_info += {'Query Processing Library support': qpl} summary_info += {'UADK Library support': uadk} +summary_info += {'qatzip support': qatzip} summary_info += {'NUMA host support': numa} summary_info += {'capstone': capstone} summary_info += {'libpmem support': libpmem} diff --git a/meson_options.txt b/meson_options.txt index 709678fa18..61996300d5 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -263,6 +263,8 @@ option('qpl', type : 'feature', value : 'auto', description: 'Query Processing Library support') option('uadk', type : 'feature', value : 'auto', description: 'UADK Library support') +option('qatzip', type: 'feature', value: 'auto', + description: 'QATzip compression support') option('fuse', type: 'feature', value: 'auto', description: 'FUSE block device export') option('fuse_lseek', type : 'feature', value : 'auto', diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh index 833b996818..e2dd950149 100644 --- a/scripts/meson-buildoptions.sh +++ b/scripts/meson-buildoptions.sh @@ -164,6 +164,7 @@ meson_options_help() { printf "%s\n" ' plugins TCG plugins via shared library loading' printf "%s\n" ' png PNG support with libpng' printf "%s\n" ' pvrdma Enable PVRDMA support' + printf "%s\n" ' qatzip QATzip compression support' printf "%s\n" ' qcow1 qcow1 image format support' printf "%s\n" ' qed qed image format support' printf "%s\n" ' qga-vss build QGA VSS support (broken with MinGW)' @@ -432,6 +433,8 @@ _meson_option_parse() { --prefix=*) quote_sh "-Dprefix=$2" ;; --enable-pvrdma) printf "%s" -Dpvrdma=enabled ;; --disable-pvrdma) printf "%s" -Dpvrdma=disabled ;; + --enable-qatzip) printf "%s" -Dqatzip=enabled ;; + --disable-qatzip) printf "%s" -Dqatzip=disabled ;; --enable-qcow1) printf "%s" -Dqcow1=enabled ;; --disable-qcow1) printf "%s" -Dqcow1=disabled ;; --enable-qed) printf "%s" -Dqed=enabled ;; -- Gitee From 
815c88722098f1300a5afc1b7f69db2352538abf Mon Sep 17 00:00:00 2001 From: Bryan Zhang Date: Fri, 30 Aug 2024 16:27:20 -0700 Subject: [PATCH 129/134] migration: Add migration parameters for QATzip commit 86c6eb1f39cbb7eb0467c114469e98ef699fb515 upstream. Adds support for migration parameters to control QATzip compression level. Intel-SIG: commit 86c6eb1f39cb migration: Add migration parameters for QATzip Acked-by: Markus Armbruster Signed-off-by: Bryan Zhang Signed-off-by: Hao Xiang Signed-off-by: Yichen Wang Reviewed-by: Fabiano Rosas Reviewed-by: Prasad Pandit Link: https://lore.kernel.org/r/20240830232722.58272-4-yichen.wang@bytedance.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/migration-hmp-cmds.c | 4 ++++ migration/options.c | 34 ++++++++++++++++++++++++++++++++++ migration/options.h | 1 + qapi/migration.json | 18 ++++++++++++++++++ 4 files changed, 57 insertions(+) diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c index 71c485edbc..42faa28a19 100644 --- a/migration/migration-hmp-cmds.c +++ b/migration/migration-hmp-cmds.c @@ -664,6 +664,10 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) p->has_multifd_zlib_level = true; visit_type_uint8(v, param, &p->multifd_zlib_level, &err); break; + case MIGRATION_PARAMETER_MULTIFD_QATZIP_LEVEL: + p->has_multifd_qatzip_level = true; + visit_type_uint8(v, param, &p->multifd_qatzip_level, &err); + break; case MIGRATION_PARAMETER_MULTIFD_ZSTD_LEVEL: p->has_multifd_zstd_level = true; visit_type_uint8(v, param, &p->multifd_zstd_level, &err); diff --git a/migration/options.c b/migration/options.c index 6969beab63..b6746e98f5 100644 --- a/migration/options.c +++ b/migration/options.c @@ -63,6 +63,13 @@ #define DEFAULT_MIGRATE_MULTIFD_COMPRESSION MULTIFD_COMPRESSION_NONE /* 0: means nocompress, 1: best speed, ... 9: best compress ratio */ #define DEFAULT_MIGRATE_MULTIFD_ZLIB_LEVEL 1 +/* + * 1: best speed, ... 9: best compress ratio + * There is some nuance here. Refer to QATzip documentation to understand + * the mapping of QATzip levels to standard deflate levels. + */ +#define DEFAULT_MIGRATE_MULTIFD_QATZIP_LEVEL 1 + /* 0: means nocompress, 1: best speed, ... 
20: best compress ratio */ #define DEFAULT_MIGRATE_MULTIFD_ZSTD_LEVEL 1 @@ -147,6 +154,9 @@ Property migration_properties[] = { DEFINE_PROP_UINT8("multifd-zlib-level", MigrationState, parameters.multifd_zlib_level, DEFAULT_MIGRATE_MULTIFD_ZLIB_LEVEL), + DEFINE_PROP_UINT8("multifd-qatzip-level", MigrationState, + parameters.multifd_qatzip_level, + DEFAULT_MIGRATE_MULTIFD_QATZIP_LEVEL), DEFINE_PROP_UINT8("multifd-zstd-level", MigrationState, parameters.multifd_zstd_level, DEFAULT_MIGRATE_MULTIFD_ZSTD_LEVEL), @@ -906,6 +916,13 @@ int migrate_multifd_zlib_level(void) return s->parameters.multifd_zlib_level; } +int migrate_multifd_qatzip_level(void) +{ + MigrationState *s = migrate_get_current(); + + return s->parameters.multifd_qatzip_level; +} + int migrate_multifd_zstd_level(void) { MigrationState *s = migrate_get_current(); @@ -1037,6 +1054,8 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp) params->multifd_compression = s->parameters.multifd_compression; params->has_multifd_zlib_level = true; params->multifd_zlib_level = s->parameters.multifd_zlib_level; + params->has_multifd_qatzip_level = true; + params->multifd_qatzip_level = s->parameters.multifd_qatzip_level; params->has_multifd_zstd_level = true; params->multifd_zstd_level = s->parameters.multifd_zstd_level; params->has_xbzrle_cache_size = true; @@ -1098,6 +1117,7 @@ void migrate_params_init(MigrationParameters *params) params->has_multifd_channels = true; params->has_multifd_compression = true; params->has_multifd_zlib_level = true; + params->has_multifd_qatzip_level = true; params->has_multifd_zstd_level = true; params->has_xbzrle_cache_size = true; params->has_max_postcopy_bandwidth = true; @@ -1236,6 +1256,14 @@ bool migrate_params_check(MigrationParameters *params, Error **errp) return false; } + if (params->has_multifd_qatzip_level && + ((params->multifd_qatzip_level > 9) || + (params->multifd_qatzip_level < 1))) { + error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "multifd_qatzip_level", + "a value between 1 and 9"); + return false; + } + if (params->has_multifd_zstd_level && (params->multifd_zstd_level > 20)) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "multifd_zstd_level", @@ -1412,6 +1440,9 @@ static void migrate_params_test_apply(MigrateSetParameters *params, if (params->has_multifd_compression) { dest->multifd_compression = params->multifd_compression; } + if (params->has_multifd_qatzip_level) { + dest->multifd_qatzip_level = params->multifd_qatzip_level; + } if (params->has_multifd_zlib_level) { dest->multifd_zlib_level = params->multifd_zlib_level; } @@ -1574,6 +1605,9 @@ static void migrate_params_apply(MigrateSetParameters *params, Error **errp) if (params->has_multifd_compression) { s->parameters.multifd_compression = params->multifd_compression; } + if (params->has_multifd_qatzip_level) { + s->parameters.multifd_qatzip_level = params->multifd_qatzip_level; + } if (params->has_multifd_zlib_level) { s->parameters.multifd_zlib_level = params->multifd_zlib_level; } diff --git a/migration/options.h b/migration/options.h index bc5c5bfeaf..84fd2004f3 100644 --- a/migration/options.h +++ b/migration/options.h @@ -89,6 +89,7 @@ MigMode migrate_mode(void); int migrate_multifd_channels(void); MultiFDCompression migrate_multifd_compression(void); int migrate_multifd_zlib_level(void); +int migrate_multifd_qatzip_level(void); int migrate_multifd_zstd_level(void); uint8_t migrate_throttle_trigger_threshold(void); const char *migrate_tls_authz(void); diff --git a/qapi/migration.json b/qapi/migration.json index 
06c1cebcc1..6917cf9985 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -889,6 +889,11 @@ # speed, and 9 means best compression ratio which will consume # more CPU. Defaults to 1. (Since 5.0) # +# @multifd-qatzip-level: Set the compression level to be used in live +# migration. The level is an integer between 1 and 9, where 1 means +# the best compression speed, and 9 means the best compression +# ratio which will consume more CPU. Defaults to 1. (Since 9.2) +# # @multifd-zstd-level: Set the compression level to be used in live # migration, the compression level is an integer between 0 and 20, # where 0 means no compression, 1 means the best compression @@ -967,6 +972,7 @@ 'xbzrle-cache-size', 'max-postcopy-bandwidth', 'max-cpu-throttle', 'multifd-compression', 'multifd-zlib-level', 'multifd-zstd-level', + 'multifd-qatzip-level', 'block-bitmap-mapping', { 'name': 'x-vcpu-dirty-limit-period', 'features': ['unstable'] }, 'vcpu-dirty-limit', @@ -1098,6 +1104,11 @@ # speed, and 9 means best compression ratio which will consume # more CPU. Defaults to 1. (Since 5.0) # +# @multifd-qatzip-level: Set the compression level to be used in live +# migration. The level is an integer between 1 and 9, where 1 means +# the best compression speed, and 9 means the best compression +# ratio which will consume more CPU. Defaults to 1. (Since 9.2) +# # @multifd-zstd-level: Set the compression level to be used in live # migration, the compression level is an integer between 0 and 20, # where 0 means no compression, 1 means the best compression @@ -1196,6 +1207,7 @@ '*max-cpu-throttle': 'uint8', '*multifd-compression': 'MultiFDCompression', '*multifd-zlib-level': 'uint8', + '*multifd-qatzip-level': 'uint8', '*multifd-zstd-level': 'uint8', '*block-bitmap-mapping': [ 'BitmapMigrationNodeAlias' ], '*x-vcpu-dirty-limit-period': { 'type': 'uint64', @@ -1351,6 +1363,11 @@ # speed, and 9 means best compression ratio which will consume # more CPU. Defaults to 1. (Since 5.0) # +# @multifd-qatzip-level: Set the compression level to be used in live +# migration. The level is an integer between 1 and 9, where 1 means +# the best compression speed, and 9 means the best compression +# ratio which will consume more CPU. Defaults to 1. (Since 9.2) +# # @multifd-zstd-level: Set the compression level to be used in live # migration, the compression level is an integer between 0 and 20, # where 0 means no compression, 1 means the best compression @@ -1445,6 +1462,7 @@ '*max-cpu-throttle': 'uint8', '*multifd-compression': 'MultiFDCompression', '*multifd-zlib-level': 'uint8', + '*multifd-qatzip-level': 'uint8', '*multifd-zstd-level': 'uint8', '*block-bitmap-mapping': [ 'BitmapMigrationNodeAlias' ], '*x-vcpu-dirty-limit-period': { 'type': 'uint64', -- Gitee From 42e63c3f40e2be1a14954f84443a3a543c3737bb Mon Sep 17 00:00:00 2001 From: Bryan Zhang Date: Fri, 30 Aug 2024 16:27:21 -0700 Subject: [PATCH 130/134] migration: Introduce 'qatzip' compression method commit 80484f945989988091c5cd729c3e8bde6c14907a upstream. Adds support for 'qatzip' as an option for the multifd compression method parameter, and implements using QAT for 'qatzip' compression and decompression. 
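The backend added below plugs into multifd through an ops table (multifd_qatzip_ops) registered for its MultiFDCompression value via multifd_register_ops(). The standalone C sketch that follows illustrates that registration pattern with simplified stand-in types; it is not the actual QEMU API:

    #include <stdio.h>

    /* Simplified stand-ins for MultiFDMethods and the ops registry. */
    typedef struct {
        int (*send_setup)(void);
        int (*send_prepare)(void);
        void (*send_cleanup)(void);
    } Methods;

    enum { COMP_NONE, COMP_QATZIP, COMP__MAX };

    static const Methods *registry[COMP__MAX];

    static int sketch_setup(void)    { puts("setup");   return 0; }
    static int sketch_prepare(void)  { puts("prepare"); return 0; }
    static void sketch_cleanup(void) { puts("cleanup"); }

    static const Methods qatzip_ops = {
        .send_setup   = sketch_setup,
        .send_prepare = sketch_prepare,
        .send_cleanup = sketch_cleanup,
    };

    int main(void)
    {
        registry[COMP_QATZIP] = &qatzip_ops;   /* like multifd_register_ops() */

        const Methods *m = registry[COMP_QATZIP];
        m->send_setup();
        m->send_prepare();
        m->send_cleanup();
        return 0;
    }
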
Intel-SIG: commit 80484f945989 migration: Introduce 'qatzip' compression method Acked-by: Markus Armbruster Reviewed-by: Fabiano Rosas Reviewed-by: Prasad Pandit Signed-off-by: Bryan Zhang Signed-off-by: Hao Xiang Signed-off-by: Yichen Wang Link: https://lore.kernel.org/r/20240830232722.58272-5-yichen.wang@bytedance.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- hw/core/qdev-properties-system.c | 2 +- migration/meson.build | 1 + migration/multifd-qatzip.c | 394 +++++++++++++++++++++++++++++++ migration/multifd.h | 5 +- qapi/migration.json | 3 + 5 files changed, 402 insertions(+), 3 deletions(-) create mode 100644 migration/multifd-qatzip.c diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-system.c index 650c42eaf8..9cc2e38aba 100644 --- a/hw/core/qdev-properties-system.c +++ b/hw/core/qdev-properties-system.c @@ -711,7 +711,7 @@ const PropertyInfo qdev_prop_fdc_drive_type = { const PropertyInfo qdev_prop_multifd_compression = { .name = "MultiFDCompression", .description = "multifd_compression values, " - "none/zlib/zstd/qpl/uadk", + "none/zlib/zstd/qpl/uadk/qatzip", .enum_table = &MultiFDCompression_lookup, .get = qdev_propinfo_get_enum, .set = qdev_propinfo_set_enum, diff --git a/migration/meson.build b/migration/meson.build index 264d04657f..aba2581705 100644 --- a/migration/meson.build +++ b/migration/meson.build @@ -42,6 +42,7 @@ endif system_ss.add(when: zstd, if_true: files('multifd-zstd.c')) system_ss.add(when: qpl, if_true: files('multifd-qpl.c')) system_ss.add(when: uadk, if_true: files('multifd-uadk.c')) +system_ss.add(when: qatzip, if_true: files('multifd-qatzip.c')) specific_ss.add(when: 'CONFIG_SYSTEM_ONLY', if_true: files('ram.c', diff --git a/migration/multifd-qatzip.c b/migration/multifd-qatzip.c new file mode 100644 index 0000000000..3c787ed879 --- /dev/null +++ b/migration/multifd-qatzip.c @@ -0,0 +1,394 @@ +/* + * Multifd QATzip compression implementation + * + * Copyright (c) Bytedance + * + * Authors: + * Bryan Zhang + * Hao Xiang + * Yichen Wang + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "exec/ramblock.h" +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "qapi/qapi-types-migration.h" +#include "options.h" +#include "multifd.h" +#include + +typedef struct { + /* + * Unique session for use with QATzip API + */ + QzSession_T sess; + + /* + * For compression: Buffer for pages to compress + * For decompression: Buffer for data to decompress + */ + uint8_t *in_buf; + uint32_t in_len; + + /* + * For compression: Output buffer of compressed data + * For decompression: Output buffer of decompressed data + */ + uint8_t *out_buf; + uint32_t out_len; +} QatzipData; + +/** + * qatzip_send_setup: Set up QATzip session and private buffers. + * + * @param p Multifd channel params + * @param errp Pointer to error, which will be set in case of error + * @return 0 on success, -1 on error (and *errp will be set) + */ +static int qatzip_send_setup(MultiFDSendParams *p, Error **errp) +{ + QatzipData *q; + QzSessionParamsDeflate_T params; + const char *err_msg; + int ret; + + q = g_new0(QatzipData, 1); + p->compress_data = q; + /* We need one extra place for the packet header */ + p->iov = g_new0(struct iovec, 2); + + /* + * Initialize QAT device with software fallback by default. This allows + * QATzip to use CPU path when QAT hardware reaches maximum throughput. 
+ */ + ret = qzInit(&q->sess, true); + if (ret != QZ_OK && ret != QZ_DUPLICATE) { + err_msg = "qzInit failed"; + goto err; + } + + ret = qzGetDefaultsDeflate(¶ms); + if (ret != QZ_OK) { + err_msg = "qzGetDefaultsDeflate failed"; + goto err; + } + + /* Make sure to use configured QATzip compression level. */ + params.common_params.comp_lvl = migrate_multifd_qatzip_level(); + ret = qzSetupSessionDeflate(&q->sess, ¶ms); + if (ret != QZ_OK && ret != QZ_DUPLICATE) { + err_msg = "qzSetupSessionDeflate failed"; + goto err; + } + + if (MULTIFD_PACKET_SIZE > UINT32_MAX) { + err_msg = "packet size too large for QAT"; + goto err; + } + + q->in_len = MULTIFD_PACKET_SIZE; + /* + * PINNED_MEM is an enum from qatzip headers, which means to use + * kzalloc_node() to allocate memory for QAT DMA purposes. When QAT device + * is not available or software fallback is used, the malloc flag needs to + * be set as COMMON_MEM. + */ + q->in_buf = qzMalloc(q->in_len, 0, PINNED_MEM); + if (!q->in_buf) { + q->in_buf = qzMalloc(q->in_len, 0, COMMON_MEM); + if (!q->in_buf) { + err_msg = "qzMalloc failed"; + goto err; + } + } + + q->out_len = qzMaxCompressedLength(MULTIFD_PACKET_SIZE, &q->sess); + q->out_buf = qzMalloc(q->out_len, 0, PINNED_MEM); + if (!q->out_buf) { + q->out_buf = qzMalloc(q->out_len, 0, COMMON_MEM); + if (!q->out_buf) { + err_msg = "qzMalloc failed"; + goto err; + } + } + + return 0; + +err: + error_setg(errp, "multifd %u: [sender] %s", p->id, err_msg); + return -1; +} + +/** + * qatzip_send_cleanup: Tear down QATzip session and release private buffers. + * + * @param p Multifd channel params + * @param errp Pointer to error, which will be set in case of error + * @return None + */ +static void qatzip_send_cleanup(MultiFDSendParams *p, Error **errp) +{ + QatzipData *q = p->compress_data; + + if (q) { + if (q->in_buf) { + qzFree(q->in_buf); + } + if (q->out_buf) { + qzFree(q->out_buf); + } + (void)qzTeardownSession(&q->sess); + (void)qzClose(&q->sess); + g_free(q); + } + + g_free(p->iov); + p->iov = NULL; + p->compress_data = NULL; +} + +/** + * qatzip_send_prepare: Compress pages and update IO channel info. + * + * @param p Multifd channel params + * @param errp Pointer to error, which will be set in case of error + * @return 0 on success, -1 on error (and *errp will be set) + */ +static int qatzip_send_prepare(MultiFDSendParams *p, Error **errp) +{ + MultiFDPages_t *pages = p->pages; + QatzipData *q = p->compress_data; + int ret; + unsigned int in_len, out_len; + + if (!multifd_send_prepare_common(p)) { + goto out; + } + + /* + * Unlike other multifd compression implementations, we use a non-streaming + * API and place all the data into one buffer, rather than sending each + * page to the compression API at a time. Based on initial benchmarks, the + * non-streaming API outperforms the streaming API. Plus, the logic in QEMU + * is friendly to using the non-streaming API anyway. If either of these + * statements becomes no longer true, we can revisit adding a streaming + * implementation. 
+ */ + for (int i = 0; i < pages->normal_num; i++) { + memcpy(q->in_buf + (i * p->page_size), + pages->block->host + pages->offset[i], + p->page_size); + } + + in_len = pages->normal_num * p->page_size; + if (in_len > q->in_len) { + error_setg(errp, "multifd %u: unexpectedly large input", p->id); + return -1; + } + out_len = q->out_len; + + ret = qzCompress(&q->sess, q->in_buf, &in_len, q->out_buf, &out_len, 1); + if (ret != QZ_OK) { + error_setg(errp, "multifd %u: QATzip returned %d instead of QZ_OK", + p->id, ret); + return -1; + } + if (in_len != pages->normal_num * p->page_size) { + error_setg(errp, "multifd %u: QATzip failed to compress all input", + p->id); + return -1; + } + + p->iov[p->iovs_num].iov_base = q->out_buf; + p->iov[p->iovs_num].iov_len = out_len; + p->iovs_num++; + p->next_packet_size = out_len; + +out: + p->flags |= MULTIFD_FLAG_QATZIP; + multifd_send_fill_packet(p); + return 0; +} + +/** + * qatzip_recv_setup: Set up QATzip session and allocate private buffers. + * + * @param p Multifd channel params + * @param errp Pointer to error, which will be set in case of error + * @return 0 on success, -1 on error (and *errp will be set) + */ +static int qatzip_recv_setup(MultiFDRecvParams *p, Error **errp) +{ + QatzipData *q; + QzSessionParamsDeflate_T params; + const char *err_msg; + int ret; + + q = g_new0(QatzipData, 1); + p->compress_data = q; + + /* + * Initialize QAT device with software fallback by default. This allows + * QATzip to use CPU path when QAT hardware reaches maximum throughput. + */ + ret = qzInit(&q->sess, true); + if (ret != QZ_OK && ret != QZ_DUPLICATE) { + err_msg = "qzInit failed"; + goto err; + } + + ret = qzGetDefaultsDeflate(¶ms); + if (ret != QZ_OK) { + err_msg = "qzGetDefaultsDeflate failed"; + goto err; + } + + ret = qzSetupSessionDeflate(&q->sess, ¶ms); + if (ret != QZ_OK && ret != QZ_DUPLICATE) { + err_msg = "qzSetupSessionDeflate failed"; + goto err; + } + + /* + * Reserve extra spaces for the incoming packets. Current implementation + * doesn't send uncompressed pages in case the compression gets too big. + */ + q->in_len = MULTIFD_PACKET_SIZE * 2; + /* + * PINNED_MEM is an enum from qatzip headers, which means to use + * kzalloc_node() to allocate memory for QAT DMA purposes. When QAT device + * is not available or software fallback is used, the malloc flag needs to + * be set as COMMON_MEM. + */ + q->in_buf = qzMalloc(q->in_len, 0, PINNED_MEM); + if (!q->in_buf) { + q->in_buf = qzMalloc(q->in_len, 0, COMMON_MEM); + if (!q->in_buf) { + err_msg = "qzMalloc failed"; + goto err; + } + } + + q->out_len = MULTIFD_PACKET_SIZE; + q->out_buf = qzMalloc(q->out_len, 0, PINNED_MEM); + if (!q->out_buf) { + q->out_buf = qzMalloc(q->out_len, 0, COMMON_MEM); + if (!q->out_buf) { + err_msg = "qzMalloc failed"; + goto err; + } + } + + return 0; + +err: + error_setg(errp, "multifd %u: [receiver] %s", p->id, err_msg); + return -1; +} + +/** + * qatzip_recv_cleanup: Tear down QATzip session and release private buffers. + * + * @param p Multifd channel params + * @return None + */ +static void qatzip_recv_cleanup(MultiFDRecvParams *p) +{ + QatzipData *q = p->compress_data; + + if (q) { + if (q->in_buf) { + qzFree(q->in_buf); + } + if (q->out_buf) { + qzFree(q->out_buf); + } + (void)qzTeardownSession(&q->sess); + (void)qzClose(&q->sess); + g_free(q); + } + p->compress_data = NULL; +} + + +/** + * qatzip_recv: Decompress pages and copy them to the appropriate + * locations. 
+ * + * @param p Multifd channel params + * @param errp Pointer to error, which will be set in case of error + * @return 0 on success, -1 on error (and *errp will be set) + */ +static int qatzip_recv(MultiFDRecvParams *p, Error **errp) +{ + QatzipData *q = p->compress_data; + int ret; + unsigned int in_len, out_len; + uint32_t in_size = p->next_packet_size; + uint32_t expected_size = p->normal_num * p->page_size; + uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; + + if (in_size > q->in_len) { + error_setg(errp, "multifd %u: received unexpectedly large packet", + p->id); + return -1; + } + + if (flags != MULTIFD_FLAG_QATZIP) { + error_setg(errp, "multifd %u: flags received %x flags expected %x", + p->id, flags, MULTIFD_FLAG_QATZIP); + return -1; + } + + multifd_recv_zero_page_process(p); + if (!p->normal_num) { + assert(in_size == 0); + return 0; + } + + ret = qio_channel_read_all(p->c, (void *)q->in_buf, in_size, errp); + if (ret != 0) { + return ret; + } + + in_len = in_size; + out_len = q->out_len; + ret = qzDecompress(&q->sess, q->in_buf, &in_len, q->out_buf, &out_len); + if (ret != QZ_OK) { + error_setg(errp, "multifd %u: qzDecompress failed", p->id); + return -1; + } + if (out_len != expected_size) { + error_setg(errp, "multifd %u: packet size received %u size expected %u", + p->id, out_len, expected_size); + return -1; + } + + /* Copy each page to its appropriate location. */ + for (int i = 0; i < p->normal_num; i++) { + memcpy(p->host + p->normal[i], + q->out_buf + p->page_size * i, + p->page_size); + } + return 0; +} + +static MultiFDMethods multifd_qatzip_ops = { + .send_setup = qatzip_send_setup, + .send_cleanup = qatzip_send_cleanup, + .send_prepare = qatzip_send_prepare, + .recv_setup = qatzip_recv_setup, + .recv_cleanup = qatzip_recv_cleanup, + .recv = qatzip_recv +}; + +static void multifd_qatzip_register(void) +{ + multifd_register_ops(MULTIFD_COMPRESSION_QATZIP, &multifd_qatzip_ops); +} + +migration_init(multifd_qatzip_register); diff --git a/migration/multifd.h b/migration/multifd.h index 0ecd6f47d7..adceb65050 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -34,14 +34,15 @@ MultiFDRecvData *multifd_get_recv_data(void); /* Multifd Compression flags */ #define MULTIFD_FLAG_SYNC (1 << 0) -/* We reserve 4 bits for compression methods */ -#define MULTIFD_FLAG_COMPRESSION_MASK (0xf << 1) +/* We reserve 5 bits for compression methods */ +#define MULTIFD_FLAG_COMPRESSION_MASK (0x1f << 1) /* we need to be compatible. Before compression value was 0 */ #define MULTIFD_FLAG_NOCOMP (0 << 1) #define MULTIFD_FLAG_ZLIB (1 << 1) #define MULTIFD_FLAG_ZSTD (2 << 1) #define MULTIFD_FLAG_QPL (4 << 1) #define MULTIFD_FLAG_UADK (8 << 1) +#define MULTIFD_FLAG_QATZIP (16 << 1) /* This value needs to be a multiple of qemu_target_page_size() */ #define MULTIFD_PACKET_SIZE (512 * 1024) diff --git a/qapi/migration.json b/qapi/migration.json index 6917cf9985..acb9f03409 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -629,6 +629,8 @@ # # @zstd: use zstd compression method. # +# @qatzip: use qatzip compression method. (Since 9.2) +# # @qpl: use qpl compression method. 
Query Processing Library(qpl) is # based on the deflate compression algorithm and use the Intel # In-Memory Analytics Accelerator(IAA) accelerated compression @@ -641,6 +643,7 @@ { 'enum': 'MultiFDCompression', 'data': [ 'none', 'zlib', { 'name': 'zstd', 'if': 'CONFIG_ZSTD' }, + { 'name': 'qatzip', 'if': 'CONFIG_QATZIP'}, { 'name': 'qpl', 'if': 'CONFIG_QPL' }, { 'name': 'uadk', 'if': 'CONFIG_UADK' } ] } -- Gitee From 00be1d4c760dc5cdbd58d1e7275a79ca696fbfb1 Mon Sep 17 00:00:00 2001 From: Bryan Zhang Date: Fri, 30 Aug 2024 16:27:22 -0700 Subject: [PATCH 131/134] tests/migration: Add integration test for 'qatzip' compression method commit afe166d4e8bc33bc448cd573b55d0ac094187d48 upstream. Adds an integration test for 'qatzip'. Intel-SIG: commit afe166d4e8bc tests/migration: Add integration test for 'qatzip' compression method Reviewed-by: Fabiano Rosas Signed-off-by: Bryan Zhang Signed-off-by: Hao Xiang Signed-off-by: Yichen Wang Link: https://lore.kernel.org/r/20240830232722.58272-6-yichen.wang@bytedance.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- tests/qtest/migration-test.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c index d79978ac38..e9807447c2 100644 --- a/tests/qtest/migration-test.c +++ b/tests/qtest/migration-test.c @@ -2713,6 +2713,18 @@ test_migrate_precopy_tcp_multifd_zstd_start(QTestState *from, } #endif /* CONFIG_ZSTD */ +#ifdef CONFIG_QATZIP +static void * +test_migrate_precopy_tcp_multifd_qatzip_start(QTestState *from, + QTestState *to) +{ + migrate_set_parameter_int(from, "multifd-qatzip-level", 2); + migrate_set_parameter_int(to, "multifd-qatzip-level", 2); + + return test_migrate_precopy_tcp_multifd_start_common(from, to, "qatzip"); +} +#endif + #ifdef CONFIG_QPL static void * test_migrate_precopy_tcp_multifd_qpl_start(QTestState *from, @@ -2795,6 +2807,17 @@ static void test_multifd_tcp_zstd(void) } #endif +#ifdef CONFIG_QATZIP +static void test_multifd_tcp_qatzip(void) +{ + MigrateCommon args = { + .listen_uri = "defer", + .start_hook = test_migrate_precopy_tcp_multifd_qatzip_start, + }; + test_precopy_common(&args); +} +#endif + #ifdef CONFIG_QPL static void test_multifd_tcp_qpl(void) { @@ -3710,6 +3733,10 @@ int main(int argc, char **argv) migration_test_add("/migration/multifd/tcp/plain/zstd", test_multifd_tcp_zstd); #endif +#ifdef CONFIG_QATZIP + migration_test_add("/migration/multifd/tcp/plain/qatzip", + test_multifd_tcp_qatzip); +#endif #ifdef CONFIG_QPL migration_test_add("/migration/multifd/tcp/plain/qpl", test_multifd_tcp_qpl); -- Gitee From 24f190e5e8396cd8d604b46c6e9ead6817c027b8 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Tue, 17 Sep 2024 15:58:02 -0300 Subject: [PATCH 132/134] migration/multifd: Fix rb->receivedmap cleanup race commit 4ce56229087860805877075ddb29dd44578365a9 upstream. Fix a segmentation fault in multifd when rb->receivedmap is cleared too early. After commit 5ef7e26bdb ("migration/multifd: solve zero page causing multiple page faults"), multifd started using the rb->receivedmap bitmap, which belongs to ram.c and is initialized and *freed* from the ram SaveVMHandlers. Multifd threads are live until migration_incoming_state_destroy(), which is called after qemu_loadvm_state_cleanup(), leading to a crash when accessing rb->receivedmap. process_incoming_migration_co() ... 
qemu_loadvm_state() multifd_nocomp_recv() qemu_loadvm_state_cleanup() ramblock_recv_bitmap_set_offset() rb->receivedmap = NULL set_bit_atomic(..., rb->receivedmap) ... migration_incoming_state_destroy() multifd_recv_cleanup() multifd_recv_terminate_threads(NULL) Move the loadvm cleanup into migration_incoming_state_destroy(), after multifd_recv_cleanup() to ensure multifd threads have already exited when rb->receivedmap is cleared. Adjust the postcopy listen thread comment to indicate that we still want to skip the cpu synchronization. Intel-SIG: commit 4ce562290878 migration/multifd: Fix rb->receivedmap cleanup race CC: qemu-stable@nongnu.org Fixes: 5ef7e26bdb ("migration/multifd: solve zero page causing multiple page faults") Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240917185802.15619-3-farosas@suse.de [peterx: added comment in migration_incoming_state_destroy()] Signed-off-by: Peter Xu Conflicts: migration/migration.c [jz: resolve context conflict] Signed-off-by: Jason Zeng --- migration/migration.c | 5 +++++ migration/savevm.c | 6 ++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index 8e01355b9b..dc8ca1a7d7 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -298,6 +298,11 @@ void migration_incoming_state_destroy(void) multifd_recv_cleanup(); compress_threads_load_cleanup(); + /* + * RAM state cleanup needs to happen after multifd cleanup, because + * multifd threads can use some of its states (receivedmap). + */ + qemu_loadvm_state_cleanup(); if (mis->to_src_file) { /* Tell source that we are done */ diff --git a/migration/savevm.c b/migration/savevm.c index 0dc1327d8b..a5b9a70d13 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -2960,7 +2960,10 @@ int qemu_loadvm_state(QEMUFile *f) trace_qemu_loadvm_state_post_main(ret); if (mis->have_listen_thread) { - /* Listen thread still going, can't clean up yet */ + /* + * Postcopy listen thread still going, don't synchronize the + * cpus yet. + */ return ret; } @@ -3003,7 +3006,6 @@ int qemu_loadvm_state(QEMUFile *f) } } - qemu_loadvm_state_cleanup(); cpu_synchronize_all_post_init(); return ret; -- Gitee From 12061d7ade72fe5b0ea89607be3e776244672274 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 1 Aug 2024 14:41:01 -0300 Subject: [PATCH 133/134] migration/multifd: Fix multifd_send_setup cleanup when channel creation fails commit 0bd5b9284fa94a6242a0d27a46380d93e753488b upstream. When a channel fails to create, the code currently just returns. This is wrong for two reasons: 1) Channel n+1 will not get to initialize it's semaphores, leading to an assert when terminate_threads tries to post to it: qemu-system-x86_64: ../util/qemu-thread-posix.c:92: qemu_mutex_lock_impl: Assertion `mutex->initialized' failed. 2) (theoretical) If channel n-1 already started creation it will defeat the purpose of the channels_created logic which is in place to avoid migrate_fd_cleanup() to run while channels are still being created. This cannot really happen today because the current failure cases for multifd_new_send_channel_create() are all synchronous, resulting from qio_channel_file_new_path() getting a bad filename. This would hit all channels equally. But I don't want to set a trap for future people, so have all channels try to create (even if failing), and only fail after the channels_created semaphore has been posted. While here, remove the error_report_err call. There's one already at migrate_fd_cleanup later on. 
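To make the ordering concrete, the standalone C sketch below mirrors the structure the fix gives multifd_send_setup(): every channel runs its per-channel initialization and creation attempt, a failure is only recorded, and the common error path is taken only after the loop completes (in QEMU, after channels_created has also been waited on). Names are illustrative, not the QEMU code:

    #include <stdbool.h>
    #include <stdio.h>

    #define NCHANNELS 4

    /* Pretend channel 2 fails to create, the way a bad filename would. */
    static bool create_channel(int id)
    {
        return id != 2;
    }

    int main(void)
    {
        bool failed = false;
        int initialized = 0;   /* stands in for the per-channel sem_init calls */

        for (int i = 0; i < NCHANNELS; i++) {
            initialized++;                 /* always runs, even after a failure */
            if (!create_channel(i)) {
                failed = true;             /* record the error, keep looping */
            }
        }

        /* Only now, with every channel initialized, act on the failure. */
        printf("initialized=%d failed=%s\n", initialized, failed ? "yes" : "no");
        return failed ? 1 : 0;
    }
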
Intel-SIG: commit 0bd5b9284fa9 migration/multifd: Fix multifd_send_setup cleanup when channel creation fails Cc: qemu-stable@nongnu.org Reported-by: Jim Fehlig Fixes: b7b03eb614 ("migration/multifd: Add outgoing QIOChannelFile support") Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Signed-off-by: Jason Zeng --- migration/multifd.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index d27a84b73c..d352aa2016 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -1160,7 +1160,6 @@ static bool multifd_new_send_channel_create(gpointer opaque, Error **errp) bool multifd_send_setup(void) { MigrationState *s = migrate_get_current(); - Error *local_err = NULL; int thread_count, ret = 0; uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size(); bool use_packets = multifd_use_packets(); @@ -1181,6 +1180,7 @@ bool multifd_send_setup(void) for (i = 0; i < thread_count; i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; + Error *local_err = NULL; qemu_sem_init(&p->sem, 0); qemu_sem_init(&p->sem_sync, 0); @@ -1200,7 +1200,8 @@ bool multifd_send_setup(void) p->write_flags = 0; if (!multifd_new_send_channel_create(p, &local_err)) { - return false; + migrate_set_error(s, local_err); + ret = -1; } } @@ -1213,24 +1214,27 @@ bool multifd_send_setup(void) qemu_sem_wait(&multifd_send_state->channels_created); } + if (ret) { + goto err; + } + for (i = 0; i < thread_count; i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; + Error *local_err = NULL; ret = multifd_send_state->ops->send_setup(p, &local_err); if (ret) { - break; + migrate_set_error(s, local_err); + goto err; } } - if (ret) { - migrate_set_error(s, local_err); - error_report_err(local_err); - migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, - MIGRATION_STATUS_FAILED); - return false; - } - return true; + +err: + migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, + MIGRATION_STATUS_FAILED); + return false; } bool multifd_recv(void) -- Gitee From b32bdcac29274983f34215f940771534ab898268 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Wed, 28 Aug 2024 11:56:48 -0300 Subject: [PATCH 134/134] migration/multifd: Fix p->iov leak in multifd-uadk.c commit 405e352d28c24991cacfdebccf67d56c4795cf6e upstream. The send_cleanup() hook should free the p->iov that was allocated at send_setup(). This was missed because the UADK code is conditional on the presence of the accelerator, so it's not tested by default. Intel-SIG: commit 405e352d28c2 migration/multifd: Fix p->iov leak in multifd-uadk.c Fixes: 819dd20636 ("migration/multifd: Add UADK initialization") Reported-by: Peter Xu Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Signed-off-by: Jason Zeng --- migration/multifd-uadk.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/migration/multifd-uadk.c b/migration/multifd-uadk.c index d12353fb21..9a582fc919 100644 --- a/migration/multifd-uadk.c +++ b/migration/multifd-uadk.c @@ -146,6 +146,8 @@ static void multifd_uadk_send_cleanup(MultiFDSendParams *p, Error **errp) multifd_uadk_uninit_sess(wd); p->compress_data = NULL; + g_free(p->iov); + p->iov = NULL; } static inline void prepare_next_iov(MultiFDSendParams *p, void *base, -- Gitee
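As a closing illustration of the invariant restored by the last patch: whatever a channel's send_setup() allocates must be released in send_cleanup(), otherwise every channel leaks it on teardown. The standalone C sketch below pairs the allocation and the free on a stand-in structure; it is not the QEMU multifd API:

    #include <stdlib.h>
    #include <sys/uio.h>

    /* Stand-in for MultiFDSendParams, reduced to the leaked member. */
    struct chan {
        struct iovec *iov;
    };

    static int chan_send_setup(struct chan *c, size_t n)
    {
        c->iov = calloc(n, sizeof(*c->iov));
        return c->iov ? 0 : -1;
    }

    static void chan_send_cleanup(struct chan *c)
    {
        free(c->iov);      /* the release that was missing in multifd-uadk.c */
        c->iov = NULL;
    }

    int main(void)
    {
        struct chan c = { 0 };

        if (chan_send_setup(&c, 2) == 0) {
            chan_send_cleanup(&c);
        }
        return 0;
    }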