From 1cbdefc2a7ecc58165e0e233dcdc479d211081d3 Mon Sep 17 00:00:00 2001 From: build Date: Tue, 15 Jul 2025 13:08:33 +0800 Subject: [PATCH 1/3] Sync upstream changes --- ...d-fix-error-message-in-multifd-recv-.patch | 43 + ...d-simplify-multifd-channel-connect-i.patch | 55 + ...d-fix-leaking-of-error-in-tls-error-.patch | 51 + ...d-remove-error-setg-in-migration-ioc.patch | 41 + ...gration-channel-read-peek-error-path.patch | 54 + ...d-remove-unnecessary-usage-of-local-.patch | 63 + ...ifd-remove-multifdpages-t-packet-num.patch | 50 + ...d-remove-qemufile-from-where-it-is-n.patch | 160 +++ ...d-change-multifd-pages-init-argument.patch | 44 + ...n-report-error-in-incoming-migration.patch | 42 + ...ation-print-migration-incoming-error.patch | 41 + ...ation-add-a-wrapper-to-print-test-na.patch | 90 ++ ...ation-use-the-new-migration-test-add.patch | 310 +++++ ...-qtest-re-enable-multifd-cancel-test.patch | 45 + ...migration-create-migration-directory.patch | 67 + 0370-docs-migration-create-index-page.patch | 99 ++ ...igration-convert-virtio-txt-into-rst.patch | 273 +++++ ...plit-backwards-compatibility-separat.patch | 1090 +++++++++++++++++ ...gration-split-debugging-and-firmware.patch | 151 +++ 0374-docs-migration-split-postcopy.patch | 681 ++++++++++ 0375-docs-migration-split-dirty-limit.patch | 194 +++ ...ocs-migration-organize-postcopy-page.patch | 231 ++++ ...urther-move-vfio-to-be-feature-of-mi.patch | 49 + ...urther-move-virtio-to-be-feature-of-.patch | 49 + ...d-drop-stale-comment-for-multifd-zer.patch | 45 + ...ation-multifd-multifd-send-kick-main.patch | 78 ++ ...d-drop-multifdsendparams-quit-cleanu.patch | 253 ++++ ...ifd-postpone-reset-of-multifdpages-t.patch | 81 ++ ...d-drop-multifdsendparams-normal-arra.patch | 214 ++++ ...d-separate-sync-request-with-normal-.patch | 192 +++ ...fd-simplify-locking-in-sender-thread.patch | 101 ++ ...d-drop-pages-num-check-in-sender-thr.patch | 48 + ...d-rename-p-num-packets-and-clean-it-.patch | 142 +++ ...d-move-total-normal-pages-accounting.patch | 59 + ...multifd-move-trace-multifd-send-recv.patch | 73 ++ ...-multifd-multifd-send-prepare-header.patch | 84 ++ ...d-move-header-prepare-fill-into-send.patch | 229 ++++ ...tion-multifd-forbid-spurious-wakeups.patch | 53 + ...d-split-multifd-send-terminate-threa.patch | 133 ++ ...d-change-retval-of-multifd-queue-pag.patch | 90 ++ ...d-change-retval-of-multifd-send-page.patch | 85 ++ ...n-multifd-rewrite-multifd-queue-page.patch | 114 ++ ...multifd-cleanup-multifd-save-cleanup.patch | 161 +++ ...multifd-cleanup-multifd-load-cleanup.patch | 96 ++ ...d-stick-with-send-recv-on-function-n.patch | 156 +++ ...d-fix-multifdsendparams-packet-num-r.patch | 169 +++ ...d-optimize-sender-side-to-be-lockles.patch | 206 ++++ ...igration-multifd-join-the-tls-thread.patch | 66 + 0403-migration-multifd-remove-p-running.patch | 177 +++ ...d-move-multifd-send-setup-error-hand.patch | 108 ++ ...d-move-multifd-send-setup-into-migra.patch | 92 ++ ...d-unify-multifd-and-tls-connection-p.patch | 177 +++ ...d-add-a-synchronization-point-for-ch.patch | 129 ++ ...multifd-remove-p-quit-from-recv-side.patch | 131 ++ ...ultifd-release-recv-sem-sync-earlier.patch | 54 + ...fd-cleanup-tls-iochannel-referencing.patch | 119 ++ ...gration-multifd-drop-registered-yank.patch | 67 + ...d-make-multifd-channel-connect-retur.patch | 56 + ...d-cleanup-outgoing-args-in-state-des.patch | 80 ++ ...d-drop-unnecessary-helper-to-destroy.patch | 79 ++ ...ltifd-cleanup-multifd-recv-sync-main.patch | 77 ++ ...d-rename-multifdsend-recvparams-data.patch | 
201 +++ ...tifd-decouple-recv-method-from-pages.patch | 159 +++ ...ultifd-allow-multifd-without-packets.patch | 364 ++++++ ...d-add-new-migration-option-zero-page.patch | 288 +++++ ...d-implement-zero-page-transmission-o.patch | 623 ++++++++++ ...d-implement-ram-save-target-page-mul.patch | 95 ++ ...d-solve-zero-page-causing-multiple-p.patch | 134 ++ ...igration-add-qpl-compression-feature.patch | 306 +++++ ...d-put-iov-initialization-into-compre.patch | 170 +++ ...onfigure-add-enable-qpl-build-option.patch | 102 ++ ...n-multifd-add-qpl-compression-method.patch | 127 ++ ...n-multifd-include-ram-h-in-multifd-h.patch | 33 + ...d-implement-initialization-of-qpl-co.patch | 371 ++++++ ...d-implement-qpl-compression-and-deco.patch | 512 ++++++++ ...ration-test-add-qpl-compression-test.patch | 82 ++ ...ly-apply-migration-compression-level.patch | 55 + ...set-compression-level-in-migration-t.patch | 51 + ...ation-add-qatzip-compression-feature.patch | 212 ++++ ...e-qatzip-feature-to-the-build-system.patch | 102 ++ ...-add-migration-parameters-for-qatzip.patch | 216 ++++ ...-introduce-qatzip-compression-method.patch | 508 ++++++++ ...add-integration-test-for-qatzip-comp.patch | 78 ++ ...tifd-fix-rb-receivedmap-cleanup-race.patch | 97 ++ ...d-fix-loop-conditions-in-multifd-zst.patch | 59 + ...d-ensure-packet-ramblock-is-null-ter.patch | 71 ++ ...d-zero-p-flags-before-starting-filli.patch | 52 + ...or-migration-using-compression-metho.patch | 63 + ...or-incorrect-migration-data-with-qpl.patch | 47 + ...or-incorrect-migration-data-with-qat.patch | 51 + ...o-virtio-snd-fix-invalid-param-check.patch | 49 + ...id-per-nbdrequest-nbd-client-get-put.patch | 46 + ...traverse-nbdexport-clients-from-main.patch | 169 +++ ...duce-nbdclient-lock-to-protect-field.patch | 365 ++++++ 0449-nbd-minor-style-and-typo-fixes.patch | 49 + ...-plumb-in-new-args-to-nbd-client-add.patch | 165 +++ ...024-7409-cap-default-max-connections.patch | 173 +++ ...024-7409-drop-non-negotiating-client.patch | 124 ++ ...024-7409-close-stray-clients-at-serv.patch | 162 +++ ...024-7409-avoid-use-after-free-when-c.patch | 90 ++ 0455-add-rtc-acpi-table.patch | 57 + ...net-ensure-queue-index-fits-with-rss.patch | 36 + ...troduce-virtio-bh-new-guarded-helper.patch | 67 + ...o-gpu-protect-from-dma-re-entrancy-b.patch | 141 +++ ...erial-bus-protect-from-dma-re-entran.patch | 41 + ...-crypto-protect-from-dma-re-entrancy.patch | 42 + qemu.spec | 115 +- 107 files changed, 15366 insertions(+), 1 deletion(-) create mode 100644 0355-migration-multifd-fix-error-message-in-multifd-recv-.patch create mode 100644 0356-migration-multifd-simplify-multifd-channel-connect-i.patch create mode 100644 0357-migration-multifd-fix-leaking-of-error-in-tls-error-.patch create mode 100644 0358-migration-multifd-remove-error-setg-in-migration-ioc.patch create mode 100644 0359-migration-fix-migration-channel-read-peek-error-path.patch create mode 100644 0360-migration-multifd-remove-unnecessary-usage-of-local-.patch create mode 100644 0361-migration-multifd-remove-multifdpages-t-packet-num.patch create mode 100644 0362-migration-multifd-remove-qemufile-from-where-it-is-n.patch create mode 100644 0363-migration-multifd-change-multifd-pages-init-argument.patch create mode 100644 0364-migration-report-error-in-incoming-migration.patch create mode 100644 0365-tests-qtest-migration-print-migration-incoming-error.patch create mode 100644 0366-tests-qtest-migration-add-a-wrapper-to-print-test-na.patch create mode 100644 
0367-tests-qtest-migration-use-the-new-migration-test-add.patch create mode 100644 0368-tests-qtest-re-enable-multifd-cancel-test.patch create mode 100644 0369-docs-migration-create-migration-directory.patch create mode 100644 0370-docs-migration-create-index-page.patch create mode 100644 0371-docs-migration-convert-virtio-txt-into-rst.patch create mode 100644 0372-docs-migration-split-backwards-compatibility-separat.patch create mode 100644 0373-docs-migration-split-debugging-and-firmware.patch create mode 100644 0374-docs-migration-split-postcopy.patch create mode 100644 0375-docs-migration-split-dirty-limit.patch create mode 100644 0376-docs-migration-organize-postcopy-page.patch create mode 100644 0377-docs-migration-further-move-vfio-to-be-feature-of-mi.patch create mode 100644 0378-docs-migration-further-move-virtio-to-be-feature-of-.patch create mode 100644 0379-migration-multifd-drop-stale-comment-for-multifd-zer.patch create mode 100644 0380-migration-multifd-multifd-send-kick-main.patch create mode 100644 0381-migration-multifd-drop-multifdsendparams-quit-cleanu.patch create mode 100644 0382-migration-multifd-postpone-reset-of-multifdpages-t.patch create mode 100644 0383-migration-multifd-drop-multifdsendparams-normal-arra.patch create mode 100644 0384-migration-multifd-separate-sync-request-with-normal-.patch create mode 100644 0385-migration-multifd-simplify-locking-in-sender-thread.patch create mode 100644 0386-migration-multifd-drop-pages-num-check-in-sender-thr.patch create mode 100644 0387-migration-multifd-rename-p-num-packets-and-clean-it-.patch create mode 100644 0388-migration-multifd-move-total-normal-pages-accounting.patch create mode 100644 0389-migration-multifd-move-trace-multifd-send-recv.patch create mode 100644 0390-migration-multifd-multifd-send-prepare-header.patch create mode 100644 0391-migration-multifd-move-header-prepare-fill-into-send.patch create mode 100644 0392-migration-multifd-forbid-spurious-wakeups.patch create mode 100644 0393-migration-multifd-split-multifd-send-terminate-threa.patch create mode 100644 0394-migration-multifd-change-retval-of-multifd-queue-pag.patch create mode 100644 0395-migration-multifd-change-retval-of-multifd-send-page.patch create mode 100644 0396-migration-multifd-rewrite-multifd-queue-page.patch create mode 100644 0397-migration-multifd-cleanup-multifd-save-cleanup.patch create mode 100644 0398-migration-multifd-cleanup-multifd-load-cleanup.patch create mode 100644 0399-migration-multifd-stick-with-send-recv-on-function-n.patch create mode 100644 0400-migration-multifd-fix-multifdsendparams-packet-num-r.patch create mode 100644 0401-migration-multifd-optimize-sender-side-to-be-lockles.patch create mode 100644 0402-migration-multifd-join-the-tls-thread.patch create mode 100644 0403-migration-multifd-remove-p-running.patch create mode 100644 0404-migration-multifd-move-multifd-send-setup-error-hand.patch create mode 100644 0405-migration-multifd-move-multifd-send-setup-into-migra.patch create mode 100644 0406-migration-multifd-unify-multifd-and-tls-connection-p.patch create mode 100644 0407-migration-multifd-add-a-synchronization-point-for-ch.patch create mode 100644 0408-migration-multifd-remove-p-quit-from-recv-side.patch create mode 100644 0409-migration-multifd-release-recv-sem-sync-earlier.patch create mode 100644 0410-migration-multifd-cleanup-tls-iochannel-referencing.patch create mode 100644 0411-migration-multifd-drop-registered-yank.patch create mode 100644 
0412-migration-multifd-make-multifd-channel-connect-retur.patch create mode 100644 0413-migration-multifd-cleanup-outgoing-args-in-state-des.patch create mode 100644 0414-migration-multifd-drop-unnecessary-helper-to-destroy.patch create mode 100644 0415-migration-multifd-cleanup-multifd-recv-sync-main.patch create mode 100644 0416-migration-multifd-rename-multifdsend-recvparams-data.patch create mode 100644 0417-migration-multifd-decouple-recv-method-from-pages.patch create mode 100644 0418-migration-multifd-allow-multifd-without-packets.patch create mode 100644 0419-migration-multifd-add-new-migration-option-zero-page.patch create mode 100644 0420-migration-multifd-implement-zero-page-transmission-o.patch create mode 100644 0421-migration-multifd-implement-ram-save-target-page-mul.patch create mode 100644 0422-migration-multifd-solve-zero-page-causing-multiple-p.patch create mode 100644 0423-docs-migration-add-qpl-compression-feature.patch create mode 100644 0424-migration-multifd-put-iov-initialization-into-compre.patch create mode 100644 0425-configure-add-enable-qpl-build-option.patch create mode 100644 0426-migration-multifd-add-qpl-compression-method.patch create mode 100644 0427-migration-multifd-include-ram-h-in-multifd-h.patch create mode 100644 0428-migration-multifd-implement-initialization-of-qpl-co.patch create mode 100644 0429-migration-multifd-implement-qpl-compression-and-deco.patch create mode 100644 0430-tests-migration-test-add-qpl-compression-test.patch create mode 100644 0431-migration-properly-apply-migration-compression-level.patch create mode 100644 0432-tests-migration-set-compression-level-in-migration-t.patch create mode 100644 0433-docs-migration-add-qatzip-compression-feature.patch create mode 100644 0434-meson-introduce-qatzip-feature-to-the-build-system.patch create mode 100644 0435-migration-add-migration-parameters-for-qatzip.patch create mode 100644 0436-migration-introduce-qatzip-compression-method.patch create mode 100644 0437-tests-migration-add-integration-test-for-qatzip-comp.patch create mode 100644 0438-migration-multifd-fix-rb-receivedmap-cleanup-race.patch create mode 100644 0439-migration-multifd-fix-loop-conditions-in-multifd-zst.patch create mode 100644 0440-migration-multifd-ensure-packet-ramblock-is-null-ter.patch create mode 100644 0441-migration-multifd-zero-p-flags-before-starting-filli.patch create mode 100644 0442-multifd-bugfix-for-migration-using-compression-metho.patch create mode 100644 0443-multifd-bugfix-for-incorrect-migration-data-with-qpl.patch create mode 100644 0444-multifd-bugfix-for-incorrect-migration-data-with-qat.patch create mode 100644 0445-hw-audio-virtio-snd-fix-invalid-param-check.patch create mode 100644 0446-nbd-server-avoid-per-nbdrequest-nbd-client-get-put.patch create mode 100644 0447-nbd-server-only-traverse-nbdexport-clients-from-main.patch create mode 100644 0448-nbd-server-introduce-nbdclient-lock-to-protect-field.patch create mode 100644 0449-nbd-minor-style-and-typo-fixes.patch create mode 100644 0450-nbd-server-plumb-in-new-args-to-nbd-client-add.patch create mode 100644 0451-nbd-server-cve-2024-7409-cap-default-max-connections.patch create mode 100644 0452-nbd-server-cve-2024-7409-drop-non-negotiating-client.patch create mode 100644 0453-nbd-server-cve-2024-7409-close-stray-clients-at-serv.patch create mode 100644 0454-nbd-server-cve-2024-7409-avoid-use-after-free-when-c.patch create mode 100644 0455-add-rtc-acpi-table.patch create mode 100644 0456-virtio-net-ensure-queue-index-fits-with-rss.patch create 
mode 100644 0457-hw-virtio-introduce-virtio-bh-new-guarded-helper.patch create mode 100644 0458-hw-display-virtio-gpu-protect-from-dma-re-entrancy-b.patch create mode 100644 0459-hw-char-virtio-serial-bus-protect-from-dma-re-entran.patch create mode 100644 0460-hw-virtio-virtio-crypto-protect-from-dma-re-entrancy.patch diff --git a/0355-migration-multifd-fix-error-message-in-multifd-recv-.patch b/0355-migration-multifd-fix-error-message-in-multifd-recv-.patch new file mode 100644 index 0000000..6425f22 --- /dev/null +++ b/0355-migration-multifd-fix-error-message-in-multifd-recv-.patch @@ -0,0 +1,43 @@ +From ead67116389ee0d42529cb050f06526e7cf6edbc Mon Sep 17 00:00:00 2001 +From: Avihai Horon +Date: Sun, 31 Dec 2023 11:30:10 +0200 +Subject: [PATCH] migration/multifd: Fix error message in + multifd_recv_initial_packet() + +commit c77b40859a5201f01b44dc475258405e289c431f upstream. + +In multifd_recv_initial_packet(), if MultiFDInit_t->id is greater than +the configured number of multifd channels, an irrelevant error message +about multifd version is printed. + +Change the error message to a relevant one about the channel id. + +Intel-SIG: commit c77b40859a52 migration/multifd: Fix error message in multifd_recv_initial_packet() + +Signed-off-by: Avihai Horon +Reviewed-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20231231093016.14204-6-avihaih@nvidia.com +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/multifd.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index 409460684f..a6204fc72f 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -228,8 +228,8 @@ static int multifd_recv_initial_packet(QIOChannel *c, Error **errp) + } + + if (msg.id > migrate_multifd_channels()) { +- error_setg(errp, "multifd: received channel version %u " +- "expected %u", msg.version, MULTIFD_VERSION); ++ error_setg(errp, "multifd: received channel id %u is greater than " ++ "number of channels %u", msg.id, migrate_multifd_channels()); + return -1; + } + +-- +2.43.0 + diff --git a/0356-migration-multifd-simplify-multifd-channel-connect-i.patch b/0356-migration-multifd-simplify-multifd-channel-connect-i.patch new file mode 100644 index 0000000..bd0211c --- /dev/null +++ b/0356-migration-multifd-simplify-multifd-channel-connect-i.patch @@ -0,0 +1,55 @@ +From cc2c19aa5561a325ff21132b3cec7e6b3ddd155c Mon Sep 17 00:00:00 2001 +From: Avihai Horon +Date: Sun, 31 Dec 2023 11:30:11 +0200 +Subject: [PATCH] migration/multifd: Simplify multifd_channel_connect() if else + statement +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +commit a4395f5d3c06472ed70d9ef9f79878f95575be9e upstream. + +The else branch in multifd_channel_connect() is redundant because when +the if branch is taken the function returns. + +Simplify the code by removing the else branch. 
+ +Intel-SIG: commit a4395f5d3c06 migration/multifd: Simplify multifd_channel_connect() if else statement + +Signed-off-by: Avihai Horon +Reviewed-by: Philippe Mathieu-Daudé +Link: https://lore.kernel.org/r/20231231093016.14204-7-avihaih@nvidia.com +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/multifd.c | 13 ++++++------- + 1 file changed, 6 insertions(+), 7 deletions(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index a6204fc72f..55d5fd55f8 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -847,14 +847,13 @@ static bool multifd_channel_connect(MultiFDSendParams *p, + * so we mustn't call multifd_send_thread until then + */ + return multifd_tls_channel_connect(p, ioc, errp); +- +- } else { +- migration_ioc_register_yank(ioc); +- p->registered_yank = true; +- p->c = ioc; +- qemu_thread_create(&p->thread, p->name, multifd_send_thread, p, +- QEMU_THREAD_JOINABLE); + } ++ ++ migration_ioc_register_yank(ioc); ++ p->registered_yank = true; ++ p->c = ioc; ++ qemu_thread_create(&p->thread, p->name, multifd_send_thread, p, ++ QEMU_THREAD_JOINABLE); + return true; + } + +-- +2.43.0 + diff --git a/0357-migration-multifd-fix-leaking-of-error-in-tls-error-.patch b/0357-migration-multifd-fix-leaking-of-error-in-tls-error-.patch new file mode 100644 index 0000000..d617fed --- /dev/null +++ b/0357-migration-multifd-fix-leaking-of-error-in-tls-error-.patch @@ -0,0 +1,51 @@ +From fa07f8ae871ae7628a4b14915e18e54e82f75c9f Mon Sep 17 00:00:00 2001 +From: Avihai Horon +Date: Sun, 31 Dec 2023 11:30:12 +0200 +Subject: [PATCH] migration/multifd: Fix leaking of Error in TLS error flow + +commit 6ae208ce9656114e428b1a75ac62a6761ed3216c upstream. + +If there is an error in multifd TLS handshake task, +multifd_tls_outgoing_handshake() retrieves the error with +qio_task_propagate_error() but never frees it. + +Fix it by freeing the obtained Error. + +In addition, the error is not reported at all, so report it with +migrate_set_error(). + +Intel-SIG: commit 6ae208ce9656 migration/multifd: Fix leaking of Error in TLS error flow + +Fixes: 29647140157a ("migration/tls: add support for multifd tls-handshake") +Signed-off-by: Avihai Horon +Reviewed-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20231231093016.14204-8-avihaih@nvidia.com +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/multifd.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/migration/multifd.c b/migration/multifd.c +index 55d5fd55f8..9ac24866ad 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -787,6 +787,7 @@ static void multifd_tls_outgoing_handshake(QIOTask *task, + + trace_multifd_tls_outgoing_handshake_error(ioc, error_get_pretty(err)); + ++ migrate_set_error(migrate_get_current(), err); + /* + * Error happen, mark multifd_send_thread status as 'quit' although it + * is not created, and then tell who pay attention to me. 
+@@ -794,6 +795,7 @@ static void multifd_tls_outgoing_handshake(QIOTask *task, + p->quit = true; + qemu_sem_post(&multifd_send_state->channels_ready); + qemu_sem_post(&p->sem_sync); ++ error_free(err); + } + + static void *multifd_tls_handshake_thread(void *opaque) +-- +2.43.0 + diff --git a/0358-migration-multifd-remove-error-setg-in-migration-ioc.patch b/0358-migration-multifd-remove-error-setg-in-migration-ioc.patch new file mode 100644 index 0000000..79edb88 --- /dev/null +++ b/0358-migration-multifd-remove-error-setg-in-migration-ioc.patch @@ -0,0 +1,41 @@ +From 5e6fcd7351041321a2bbbe22af763d223b9b3937 Mon Sep 17 00:00:00 2001 +From: Avihai Horon +Date: Sun, 31 Dec 2023 11:30:13 +0200 +Subject: [PATCH] migration/multifd: Remove error_setg() in + migration_ioc_process_incoming() + +commit 1d3886f837d8e972366a8b58ba8afb0e5efbeed7 upstream. + +If multifd_load_setup() fails in migration_ioc_process_incoming(), +error_setg() is called with errp. This will lead to an assert because in +that case errp already contains an error. + +Fix it by removing the redundant error_setg(). + +Intel-SIG: commit 1d3886f837d8 migration/multifd: Remove error_setg() in migration_ioc_process_incoming() + +Fixes: 6720c2b32725 ("migration: check magic value for deciding the mapping of channels") +Signed-off-by: Avihai Horon +Reviewed-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20231231093016.14204-9-avihaih@nvidia.com +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/migration.c | 1 - + 1 file changed, 1 deletion(-) + +diff --git a/migration/migration.c b/migration/migration.c +index 8b1b47836f..31abc7c3a3 100644 +--- a/migration/migration.c ++++ b/migration/migration.c +@@ -857,7 +857,6 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp) + } + + if (multifd_load_setup(errp) != 0) { +- error_setg(errp, "Failed to setup multifd channels"); + return; + } + +-- +2.43.0 + diff --git a/0359-migration-fix-migration-channel-read-peek-error-path.patch b/0359-migration-fix-migration-channel-read-peek-error-path.patch new file mode 100644 index 0000000..ccf2eab --- /dev/null +++ b/0359-migration-fix-migration-channel-read-peek-error-path.patch @@ -0,0 +1,54 @@ +From 83629f8df6c3b1fbdd75962ecf4209c34b6828a5 Mon Sep 17 00:00:00 2001 +From: Avihai Horon +Date: Sun, 31 Dec 2023 11:30:14 +0200 +Subject: [PATCH] migration: Fix migration_channel_read_peek() error path +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +commit 4f8cf323e80c17f7d4b5604f1699591326df6262 upstream. + +migration_channel_read_peek() calls qio_channel_readv_full() and handles +both cases of return value == 0 and return value < 0 the same way, by +calling error_setg() with errp. However, if return value < 0, errp is +already set, so calling error_setg() with errp will lead to an assert. + +Fix it by handling these cases separately, calling error_setg() with +errp only in return value == 0 case. 
+ +Intel-SIG: commit 4f8cf323e80c migration: Fix migration_channel_read_peek() error path + +Fixes: 6720c2b32725 ("migration: check magic value for deciding the mapping of channels") +Signed-off-by: Avihai Horon +Reviewed-by: Fabiano Rosas +Reviewed-by: Philippe Mathieu-Daudé +Link: https://lore.kernel.org/r/20231231093016.14204-10-avihaih@nvidia.com +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/channel.c | 9 ++++++--- + 1 file changed, 6 insertions(+), 3 deletions(-) + +diff --git a/migration/channel.c b/migration/channel.c +index ca3319a309..f9de064f3b 100644 +--- a/migration/channel.c ++++ b/migration/channel.c +@@ -117,9 +117,12 @@ int migration_channel_read_peek(QIOChannel *ioc, + len = qio_channel_readv_full(ioc, &iov, 1, NULL, NULL, + QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp); + +- if (len <= 0 && len != QIO_CHANNEL_ERR_BLOCK) { +- error_setg(errp, +- "Failed to peek at channel"); ++ if (len < 0 && len != QIO_CHANNEL_ERR_BLOCK) { ++ return -1; ++ } ++ ++ if (len == 0) { ++ error_setg(errp, "Failed to peek at channel"); + return -1; + } + +-- +2.43.0 + diff --git a/0360-migration-multifd-remove-unnecessary-usage-of-local-.patch b/0360-migration-multifd-remove-unnecessary-usage-of-local-.patch new file mode 100644 index 0000000..757c52a --- /dev/null +++ b/0360-migration-multifd-remove-unnecessary-usage-of-local-.patch @@ -0,0 +1,63 @@ +From 0ad9ed1745feb691cd2e2f14840b10d0f6421da5 Mon Sep 17 00:00:00 2001 +From: Avihai Horon +Date: Sun, 31 Dec 2023 11:30:16 +0200 +Subject: [PATCH] migration/multifd: Remove unnecessary usage of local Error +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +commit 3fc58efa938338a82e4d5c0c031e7e9c98e9544f upstream. + +According to Error API, usage of ERRP_GUARD() or a local Error instead +of errp is needed if errp is passed to void functions, where it is later +dereferenced to see if an error occurred. + +There are several places in multifd.c that use local Error although it +is not needed. Change these places to use errp directly. 
+ +Intel-SIG: commit 3fc58efa9383 migration/multifd: Remove unnecessary usage of local Error + +Signed-off-by: Avihai Horon +Reviewed-by: Philippe Mathieu-Daudé +Link: https://lore.kernel.org/r/20231231093016.14204-12-avihaih@nvidia.com +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/multifd.c | 8 ++------ + 1 file changed, 2 insertions(+), 6 deletions(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index 9ac24866ad..9f353aecfa 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -951,12 +951,10 @@ int multifd_save_setup(Error **errp) + + for (i = 0; i < thread_count; i++) { + MultiFDSendParams *p = &multifd_send_state->params[i]; +- Error *local_err = NULL; + int ret; + +- ret = multifd_send_state->ops->send_setup(p, &local_err); ++ ret = multifd_send_state->ops->send_setup(p, errp); + if (ret) { +- error_propagate(errp, local_err); + return ret; + } + } +@@ -1195,12 +1193,10 @@ int multifd_load_setup(Error **errp) + + for (i = 0; i < thread_count; i++) { + MultiFDRecvParams *p = &multifd_recv_state->params[i]; +- Error *local_err = NULL; + int ret; + +- ret = multifd_recv_state->ops->recv_setup(p, &local_err); ++ ret = multifd_recv_state->ops->recv_setup(p, errp); + if (ret) { +- error_propagate(errp, local_err); + return ret; + } + } +-- +2.43.0 + diff --git a/0361-migration-multifd-remove-multifdpages-t-packet-num.patch b/0361-migration-multifd-remove-multifdpages-t-packet-num.patch new file mode 100644 index 0000000..a19597d --- /dev/null +++ b/0361-migration-multifd-remove-multifdpages-t-packet-num.patch @@ -0,0 +1,50 @@ +From c7101785540290e77797ff8447463bd43749ef5d Mon Sep 17 00:00:00 2001 +From: Fabiano Rosas +Date: Thu, 4 Jan 2024 11:21:38 -0300 +Subject: [PATCH] migration/multifd: Remove MultiFDPages_t::packet_num + +commit dca1bc7f24d2fa227f0b787f85f3cc67006e67bf upstream. + +This was introduced by commit 34c55a94b1 ("migration: Create multipage +support") and never used. 
+ +Intel-SIG: commit dca1bc7f24d2 migration/multifd: Remove MultiFDPages_t::packet_num + +Signed-off-by: Fabiano Rosas +Reviewed-by: Peter Xu +Link: https://lore.kernel.org/r/20240104142144.9680-2-farosas@suse.de +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/multifd.c | 1 - + migration/multifd.h | 2 -- + 2 files changed, 3 deletions(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index 9f353aecfa..3e650f5da0 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -250,7 +250,6 @@ static void multifd_pages_clear(MultiFDPages_t *pages) + { + pages->num = 0; + pages->allocated = 0; +- pages->packet_num = 0; + pages->block = NULL; + g_free(pages->offset); + pages->offset = NULL; +diff --git a/migration/multifd.h b/migration/multifd.h +index a835643b48..b0ff610c37 100644 +--- a/migration/multifd.h ++++ b/migration/multifd.h +@@ -58,8 +58,6 @@ typedef struct { + uint32_t num; + /* number of allocated pages */ + uint32_t allocated; +- /* global number of generated multifd packets */ +- uint64_t packet_num; + /* offset of each page */ + ram_addr_t *offset; + RAMBlock *block; +-- +2.43.0 + diff --git a/0362-migration-multifd-remove-qemufile-from-where-it-is-n.patch b/0362-migration-multifd-remove-qemufile-from-where-it-is-n.patch new file mode 100644 index 0000000..2d30359 --- /dev/null +++ b/0362-migration-multifd-remove-qemufile-from-where-it-is-n.patch @@ -0,0 +1,160 @@ +From 51253771b29826ec2e0b6620c6df8847c1550883 Mon Sep 17 00:00:00 2001 +From: Fabiano Rosas +Date: Thu, 4 Jan 2024 11:21:39 -0300 +Subject: [PATCH] migration/multifd: Remove QEMUFile from where it is not + needed + +commit 9346fa1870784c70618bfd5a9e1f1da89de0c5ec upstream. + +Intel-SIG: commit 9346fa187078 migration/multifd: Remove QEMUFile from where it is not needed + +Signed-off-by: Fabiano Rosas +Reviewed-by: Peter Xu +Link: https://lore.kernel.org/r/20240104142144.9680-3-farosas@suse.de +Signed-off-by: Peter Xu + + Conflicts: + migration/ram.c +[jz: resolve context conflict] +Signed-off-by: Jason Zeng +--- + migration/multifd.c | 12 ++++++------ + migration/multifd.h | 4 ++-- + migration/ram.c | 15 +++++++-------- + 3 files changed, 15 insertions(+), 16 deletions(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index 3e650f5da0..2dbc3ba836 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -390,7 +390,7 @@ struct { + * false. 
+ */ + +-static int multifd_send_pages(QEMUFile *f) ++static int multifd_send_pages(void) + { + int i; + static int next_channel; +@@ -436,7 +436,7 @@ static int multifd_send_pages(QEMUFile *f) + return 1; + } + +-int multifd_queue_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset) ++int multifd_queue_page(RAMBlock *block, ram_addr_t offset) + { + MultiFDPages_t *pages = multifd_send_state->pages; + bool changed = false; +@@ -456,12 +456,12 @@ int multifd_queue_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset) + changed = true; + } + +- if (multifd_send_pages(f) < 0) { ++ if (multifd_send_pages() < 0) { + return -1; + } + + if (changed) { +- return multifd_queue_page(f, block, offset); ++ return multifd_queue_page(block, offset); + } + + return 1; +@@ -583,7 +583,7 @@ static int multifd_zero_copy_flush(QIOChannel *c) + return ret; + } + +-int multifd_send_sync_main(QEMUFile *f) ++int multifd_send_sync_main(void) + { + int i; + bool flush_zero_copy; +@@ -592,7 +592,7 @@ int multifd_send_sync_main(QEMUFile *f) + return 0; + } + if (multifd_send_state->pages->num) { +- if (multifd_send_pages(f) < 0) { ++ if (multifd_send_pages() < 0) { + error_report("%s: multifd_send_pages fail", __func__); + return -1; + } +diff --git a/migration/multifd.h b/migration/multifd.h +index b0ff610c37..35d11f103c 100644 +--- a/migration/multifd.h ++++ b/migration/multifd.h +@@ -21,8 +21,8 @@ void multifd_load_shutdown(void); + bool multifd_recv_all_channels_created(void); + void multifd_recv_new_channel(QIOChannel *ioc, Error **errp); + void multifd_recv_sync_main(void); +-int multifd_send_sync_main(QEMUFile *f); +-int multifd_queue_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset); ++int multifd_send_sync_main(void); ++int multifd_queue_page(RAMBlock *block, ram_addr_t offset); + + /* Multifd Compression flags */ + #define MULTIFD_FLAG_SYNC (1 << 0) +diff --git a/migration/ram.c b/migration/ram.c +index 71353bc90b..4706dcda7d 100644 +--- a/migration/ram.c ++++ b/migration/ram.c +@@ -1384,10 +1384,9 @@ static int ram_save_page(RAMState *rs, PageSearchStatus *pss) + return pages; + } + +-static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block, +- ram_addr_t offset) ++static int ram_save_multifd_page(RAMBlock *block, ram_addr_t offset) + { +- if (multifd_queue_page(file, block, offset) < 0) { ++ if (multifd_queue_page(block, offset) < 0) { + return -1; + } + stat64_add(&mig_stats.normal_pages, 1); +@@ -1470,7 +1469,7 @@ static int find_dirty_block(RAMState *rs, PageSearchStatus *pss) + if (migrate_multifd() && + !migrate_multifd_flush_after_each_section()) { + QEMUFile *f = rs->pss[RAM_CHANNEL_PRECOPY].pss_channel; +- int ret = multifd_send_sync_main(f); ++ int ret = multifd_send_sync_main(); + if (ret < 0) { + return ret; + } +@@ -2262,7 +2261,7 @@ static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss) + * still see partially copied pages which is data corruption. 
+ */ + if (migrate_multifd() && !migration_in_postcopy()) { +- return ram_save_multifd_page(pss->pss_channel, block, offset); ++ return ram_save_multifd_page(block, offset); + } + + return ram_save_page(rs, pss); +@@ -3404,7 +3403,7 @@ static int ram_save_setup(QEMUFile *f, void *opaque) + migration_ops->ram_save_target_page = ram_save_target_page_legacy; + + qemu_mutex_unlock_iothread(); +- ret = multifd_send_sync_main(f); ++ ret = multifd_send_sync_main(); + qemu_mutex_lock_iothread(); + if (ret < 0) { + return ret; +@@ -3528,7 +3527,7 @@ out: + if (ret >= 0 + && migration_is_setup_or_active(migrate_get_current()->state)) { + if (migrate_multifd() && migrate_multifd_flush_after_each_section()) { +- ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel); ++ ret = multifd_send_sync_main(); + if (ret < 0) { + return ret; + } +@@ -3624,7 +3623,7 @@ static int ram_save_complete(QEMUFile *f, void *opaque) + } + } + +- ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel); ++ ret = multifd_send_sync_main(); + if (ret < 0) { + return ret; + } +-- +2.43.0 + diff --git a/0363-migration-multifd-change-multifd-pages-init-argument.patch b/0363-migration-multifd-change-multifd-pages-init-argument.patch new file mode 100644 index 0000000..71c7a63 --- /dev/null +++ b/0363-migration-multifd-change-multifd-pages-init-argument.patch @@ -0,0 +1,44 @@ +From d507eb044f0c0113339f523df9481a761bfbc773 Mon Sep 17 00:00:00 2001 +From: Fabiano Rosas +Date: Thu, 4 Jan 2024 11:21:40 -0300 +Subject: [PATCH] migration/multifd: Change multifd_pages_init argument + +commit 6074f81625800743e4c374aecf7dd30774aaf6e0 upstream. + +The 'size' argument is actually the number of pages that fit in a +multifd packet. Change it to uint32_t and rename. + +Intel-SIG: commit 6074f8162580 migration/multifd: Change multifd_pages_init argument + +Signed-off-by: Fabiano Rosas +Reviewed-by: Peter Xu +Link: https://lore.kernel.org/r/20240104142144.9680-4-farosas@suse.de +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/multifd.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index 2dbc3ba836..25cbc6dc6b 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -236,12 +236,12 @@ static int multifd_recv_initial_packet(QIOChannel *c, Error **errp) + return msg.id; + } + +-static MultiFDPages_t *multifd_pages_init(size_t size) ++static MultiFDPages_t *multifd_pages_init(uint32_t n) + { + MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1); + +- pages->allocated = size; +- pages->offset = g_new0(ram_addr_t, size); ++ pages->allocated = n; ++ pages->offset = g_new0(ram_addr_t, n); + + return pages; + } +-- +2.43.0 + diff --git a/0364-migration-report-error-in-incoming-migration.patch b/0364-migration-report-error-in-incoming-migration.patch new file mode 100644 index 0000000..bd7a7d9 --- /dev/null +++ b/0364-migration-report-error-in-incoming-migration.patch @@ -0,0 +1,42 @@ +From 01f62a6ca150f6ac7066f1eb007f4e83d4bc5e01 Mon Sep 17 00:00:00 2001 +From: Fabiano Rosas +Date: Thu, 4 Jan 2024 11:21:41 -0300 +Subject: [PATCH] migration: Report error in incoming migration + +commit e3b8ad5c13714cca5e3fc1445472171fbcd469bc upstream. + +We're not currently reporting the errors set with migrate_set_error() +when incoming migration fails. 
+ +Intel-SIG: commit e3b8ad5c1371 migration: Report error in incoming migration + +Signed-off-by: Fabiano Rosas +Reviewed-by: Peter Xu +Link: https://lore.kernel.org/r/20240104142144.9680-5-farosas@suse.de +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/migration.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +diff --git a/migration/migration.c b/migration/migration.c +index 31abc7c3a3..699ba0c834 100644 +--- a/migration/migration.c ++++ b/migration/migration.c +@@ -706,6 +706,13 @@ process_incoming_migration_co(void *opaque) + } + + if (ret < 0) { ++ MigrationState *s = migrate_get_current(); ++ ++ if (migrate_has_error(s)) { ++ WITH_QEMU_LOCK_GUARD(&s->error_mutex) { ++ error_report_err(s->error); ++ } ++ } + error_report("load of migration failed: %s", strerror(-ret)); + goto fail; + } +-- +2.43.0 + diff --git a/0365-tests-qtest-migration-print-migration-incoming-error.patch b/0365-tests-qtest-migration-print-migration-incoming-error.patch new file mode 100644 index 0000000..9dc3ed1 --- /dev/null +++ b/0365-tests-qtest-migration-print-migration-incoming-error.patch @@ -0,0 +1,41 @@ +From 55820b4c4f610dd41ea38b1ce941ac4b07b00110 Mon Sep 17 00:00:00 2001 +From: Fabiano Rosas +Date: Thu, 4 Jan 2024 11:21:42 -0300 +Subject: [PATCH] tests/qtest/migration: Print migration incoming errors + +commit 679a7382a389875c0f7835a1a409ebf4859f8410 upstream. + +We're currently just asserting when incoming migration fails. Let's +print the error message from QMP as well. + +Intel-SIG: commit 679a7382a389 tests/qtest/migration: Print migration incoming errors + +Signed-off-by: Fabiano Rosas +Reviewed-by: Peter Xu +Link: https://lore.kernel.org/r/20240104142144.9680-6-farosas@suse.de +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + tests/qtest/migration-helpers.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/tests/qtest/migration-helpers.c b/tests/qtest/migration-helpers.c +index 24fb7b3525..f1106128a9 100644 +--- a/tests/qtest/migration-helpers.c ++++ b/tests/qtest/migration-helpers.c +@@ -118,6 +118,12 @@ void migrate_incoming_qmp(QTestState *to, const char *uri, const char *fmt, ...) + + rsp = qtest_qmp(to, "{ 'execute': 'migrate-incoming', 'arguments': %p}", + args); ++ ++ if (!qdict_haskey(rsp, "return")) { ++ g_autoptr(GString) s = qobject_to_json_pretty(QOBJECT(rsp), true); ++ g_test_message("%s", s->str); ++ } ++ + g_assert(qdict_haskey(rsp, "return")); + qobject_unref(rsp); + +-- +2.43.0 + diff --git a/0366-tests-qtest-migration-add-a-wrapper-to-print-test-na.patch b/0366-tests-qtest-migration-add-a-wrapper-to-print-test-na.patch new file mode 100644 index 0000000..e77d0db --- /dev/null +++ b/0366-tests-qtest-migration-add-a-wrapper-to-print-test-na.patch @@ -0,0 +1,90 @@ +From e25c6b1ea8a5819db2458194252a8eb4347dec6f Mon Sep 17 00:00:00 2001 +From: Fabiano Rosas +Date: Thu, 4 Jan 2024 11:21:43 -0300 +Subject: [PATCH] tests/qtest/migration: Add a wrapper to print test names + +commit e33b6712dba206547a313a6f2608b0fd967ee558 upstream. + +Our usage of gtest results in us losing the very basic functionality +of "knowing which test failed". The issue is that gtest only prints +test names ("paths" in gtest parlance) once the test has finished, but +we use asserts in the tests and crash gtest itself before it can print +anything. We also use a final abort when the result of g_test_run is +not 0. 
+ +Depending on how the test failed/broke we can see the function that +trigged the abort, which may be representative of the test, but it +could also just be some generic function. + +We have been relying on the primitive method of looking at the name of +the previous successful test and then looking at the code to figure +out which test should have come next. + +Add a wrapper to the test registration that does the job of printing +the test name before running. + +Intel-SIG: commit e33b6712dba2 tests/qtest/migration: Add a wrapper to print test names + +Signed-off-by: Fabiano Rosas +Reviewed-by: Peter Xu +Link: https://lore.kernel.org/r/20240104142144.9680-7-farosas@suse.de +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + tests/qtest/migration-helpers.c | 32 ++++++++++++++++++++++++++++++++ + tests/qtest/migration-helpers.h | 1 + + 2 files changed, 33 insertions(+) + +diff --git a/tests/qtest/migration-helpers.c b/tests/qtest/migration-helpers.c +index f1106128a9..164e09c299 100644 +--- a/tests/qtest/migration-helpers.c ++++ b/tests/qtest/migration-helpers.c +@@ -298,3 +298,35 @@ char *resolve_machine_version(const char *alias, const char *var1, + + return find_common_machine_version(machine_name, var1, var2); + } ++ ++typedef struct { ++ char *name; ++ void (*func)(void); ++} MigrationTest; ++ ++static void migration_test_destroy(gpointer data) ++{ ++ MigrationTest *test = (MigrationTest *)data; ++ ++ g_free(test->name); ++ g_free(test); ++} ++ ++static void migration_test_wrapper(const void *data) ++{ ++ MigrationTest *test = (MigrationTest *)data; ++ ++ g_test_message("Running /%s%s", qtest_get_arch(), test->name); ++ test->func(); ++} ++ ++void migration_test_add(const char *path, void (*fn)(void)) ++{ ++ MigrationTest *test = g_new0(MigrationTest, 1); ++ ++ test->func = fn; ++ test->name = g_strdup(path); ++ ++ qtest_add_data_func_full(path, test, migration_test_wrapper, ++ migration_test_destroy); ++} +diff --git a/tests/qtest/migration-helpers.h b/tests/qtest/migration-helpers.h +index e31dc85cc7..0d9a02edc7 100644 +--- a/tests/qtest/migration-helpers.h ++++ b/tests/qtest/migration-helpers.h +@@ -47,4 +47,5 @@ char *find_common_machine_version(const char *mtype, const char *var1, + const char *var2); + char *resolve_machine_version(const char *alias, const char *var1, + const char *var2); ++void migration_test_add(const char *path, void (*fn)(void)); + #endif /* MIGRATION_HELPERS_H */ +-- +2.43.0 + diff --git a/0367-tests-qtest-migration-use-the-new-migration-test-add.patch b/0367-tests-qtest-migration-use-the-new-migration-test-add.patch new file mode 100644 index 0000000..7e1c1a7 --- /dev/null +++ b/0367-tests-qtest-migration-use-the-new-migration-test-add.patch @@ -0,0 +1,310 @@ +From 73550f62a21c2993fb6add5eef109b88f3b2e7f1 Mon Sep 17 00:00:00 2001 +From: Fabiano Rosas +Date: Thu, 4 Jan 2024 11:21:44 -0300 +Subject: [PATCH] tests/qtest/migration: Use the new migration_test_add + +commit 6f0771de903bb7623dc85bbf9f94f641979daaaa upstream. + +Replace the tests registration with the new function that prints tests +names. 
+ +Intel-SIG: commit 6f0771de903b tests/qtest/migration: Use the new migration_test_add + +Signed-off-by: Fabiano Rosas +Reviewed-by: Peter Xu +Link: https://lore.kernel.org/r/20240104142144.9680-8-farosas@suse.de +Signed-off-by: Peter Xu + + Conflicts: + tests/qtest/migration-test.c +[jz: resolve context conflicts due to live-suspend which is not backported] +Signed-off-by: Jason Zeng +--- + tests/qtest/migration-test.c | 201 ++++++++++++++++++----------------- + 1 file changed, 104 insertions(+), 97 deletions(-) + +diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c +index 0fbaa6a90f..4884a40be2 100644 +--- a/tests/qtest/migration-test.c ++++ b/tests/qtest/migration-test.c +@@ -3339,62 +3339,65 @@ int main(int argc, char **argv) + module_call_init(MODULE_INIT_QOM); + + if (has_uffd) { +- qtest_add_func("/migration/postcopy/plain", test_postcopy); +- qtest_add_func("/migration/postcopy/recovery/plain", +- test_postcopy_recovery); +- qtest_add_func("/migration/postcopy/preempt/plain", test_postcopy_preempt); +- qtest_add_func("/migration/postcopy/preempt/recovery/plain", +- test_postcopy_preempt_recovery); ++ migration_test_add("/migration/postcopy/plain", test_postcopy); ++ migration_test_add("/migration/postcopy/recovery/plain", ++ test_postcopy_recovery); ++ migration_test_add("/migration/postcopy/preempt/plain", ++ test_postcopy_preempt); ++ migration_test_add("/migration/postcopy/preempt/recovery/plain", ++ test_postcopy_preempt_recovery); + if (getenv("QEMU_TEST_FLAKY_TESTS")) { +- qtest_add_func("/migration/postcopy/compress/plain", +- test_postcopy_compress); +- qtest_add_func("/migration/postcopy/recovery/compress/plain", +- test_postcopy_recovery_compress); ++ migration_test_add("/migration/postcopy/compress/plain", ++ test_postcopy_compress); ++ migration_test_add("/migration/postcopy/recovery/compress/plain", ++ test_postcopy_recovery_compress); + } + #ifndef _WIN32 +- qtest_add_func("/migration/postcopy/recovery/double-failures", +- test_postcopy_recovery_double_fail); ++ migration_test_add("/migration/postcopy/recovery/double-failures", ++ test_postcopy_recovery_double_fail); + #endif /* _WIN32 */ + + } + +- qtest_add_func("/migration/bad_dest", test_baddest); ++ migration_test_add("/migration/bad_dest", test_baddest); + #ifndef _WIN32 + if (!g_str_equal(arch, "s390x")) { +- qtest_add_func("/migration/analyze-script", test_analyze_script); ++ migration_test_add("/migration/analyze-script", test_analyze_script); + } + #endif +- qtest_add_func("/migration/precopy/unix/plain", test_precopy_unix_plain); +- qtest_add_func("/migration/precopy/unix/xbzrle", test_precopy_unix_xbzrle); ++ migration_test_add("/migration/precopy/unix/plain", ++ test_precopy_unix_plain); ++ migration_test_add("/migration/precopy/unix/xbzrle", ++ test_precopy_unix_xbzrle); + /* + * Compression fails from time to time. + * Put test here but don't enable it until everything is fixed. 
+ */ + if (getenv("QEMU_TEST_FLAKY_TESTS")) { +- qtest_add_func("/migration/precopy/unix/compress/wait", +- test_precopy_unix_compress); +- qtest_add_func("/migration/precopy/unix/compress/nowait", +- test_precopy_unix_compress_nowait); ++ migration_test_add("/migration/precopy/unix/compress/wait", ++ test_precopy_unix_compress); ++ migration_test_add("/migration/precopy/unix/compress/nowait", ++ test_precopy_unix_compress_nowait); + } + +- qtest_add_func("/migration/precopy/file", +- test_precopy_file); +- qtest_add_func("/migration/precopy/file/offset", +- test_precopy_file_offset); +- qtest_add_func("/migration/precopy/file/offset/bad", +- test_precopy_file_offset_bad); ++ migration_test_add("/migration/precopy/file", ++ test_precopy_file); ++ migration_test_add("/migration/precopy/file/offset", ++ test_precopy_file_offset); ++ migration_test_add("/migration/precopy/file/offset/bad", ++ test_precopy_file_offset_bad); + + /* + * Our CI system has problems with shared memory. + * Don't run this test until we find a workaround. + */ + if (getenv("QEMU_TEST_FLAKY_TESTS")) { +- qtest_add_func("/migration/mode/reboot", test_mode_reboot); ++ migration_test_add("/migration/mode/reboot", test_mode_reboot); + } + + #ifdef CONFIG_GNUTLS +- qtest_add_func("/migration/precopy/unix/tls/psk", +- test_precopy_unix_tls_psk); ++ migration_test_add("/migration/precopy/unix/tls/psk", ++ test_precopy_unix_tls_psk); + + if (has_uffd) { + /* +@@ -3402,110 +3405,114 @@ int main(int argc, char **argv) + * channels are tested under precopy. Here what we want to test is the + * general postcopy path that has TLS channel enabled. + */ +- qtest_add_func("/migration/postcopy/tls/psk", test_postcopy_tls_psk); +- qtest_add_func("/migration/postcopy/recovery/tls/psk", +- test_postcopy_recovery_tls_psk); +- qtest_add_func("/migration/postcopy/preempt/tls/psk", +- test_postcopy_preempt_tls_psk); +- qtest_add_func("/migration/postcopy/preempt/recovery/tls/psk", +- test_postcopy_preempt_all); ++ migration_test_add("/migration/postcopy/tls/psk", ++ test_postcopy_tls_psk); ++ migration_test_add("/migration/postcopy/recovery/tls/psk", ++ test_postcopy_recovery_tls_psk); ++ migration_test_add("/migration/postcopy/preempt/tls/psk", ++ test_postcopy_preempt_tls_psk); ++ migration_test_add("/migration/postcopy/preempt/recovery/tls/psk", ++ test_postcopy_preempt_all); + } + #ifdef CONFIG_TASN1 +- qtest_add_func("/migration/precopy/unix/tls/x509/default-host", +- test_precopy_unix_tls_x509_default_host); +- qtest_add_func("/migration/precopy/unix/tls/x509/override-host", +- test_precopy_unix_tls_x509_override_host); ++ migration_test_add("/migration/precopy/unix/tls/x509/default-host", ++ test_precopy_unix_tls_x509_default_host); ++ migration_test_add("/migration/precopy/unix/tls/x509/override-host", ++ test_precopy_unix_tls_x509_override_host); + #endif /* CONFIG_TASN1 */ + #endif /* CONFIG_GNUTLS */ + +- qtest_add_func("/migration/precopy/tcp/plain", test_precopy_tcp_plain); ++ migration_test_add("/migration/precopy/tcp/plain", test_precopy_tcp_plain); + +- qtest_add_func("/migration/precopy/tcp/plain/switchover-ack", +- test_precopy_tcp_switchover_ack); ++ migration_test_add("/migration/precopy/tcp/plain/switchover-ack", ++ test_precopy_tcp_switchover_ack); + + #ifdef CONFIG_GNUTLS +- qtest_add_func("/migration/precopy/tcp/tls/psk/match", +- test_precopy_tcp_tls_psk_match); +- qtest_add_func("/migration/precopy/tcp/tls/psk/mismatch", +- test_precopy_tcp_tls_psk_mismatch); ++ 
migration_test_add("/migration/precopy/tcp/tls/psk/match", ++ test_precopy_tcp_tls_psk_match); ++ migration_test_add("/migration/precopy/tcp/tls/psk/mismatch", ++ test_precopy_tcp_tls_psk_mismatch); + #ifdef CONFIG_TASN1 +- qtest_add_func("/migration/precopy/tcp/tls/x509/default-host", +- test_precopy_tcp_tls_x509_default_host); +- qtest_add_func("/migration/precopy/tcp/tls/x509/override-host", +- test_precopy_tcp_tls_x509_override_host); +- qtest_add_func("/migration/precopy/tcp/tls/x509/mismatch-host", +- test_precopy_tcp_tls_x509_mismatch_host); +- qtest_add_func("/migration/precopy/tcp/tls/x509/friendly-client", +- test_precopy_tcp_tls_x509_friendly_client); +- qtest_add_func("/migration/precopy/tcp/tls/x509/hostile-client", +- test_precopy_tcp_tls_x509_hostile_client); +- qtest_add_func("/migration/precopy/tcp/tls/x509/allow-anon-client", +- test_precopy_tcp_tls_x509_allow_anon_client); +- qtest_add_func("/migration/precopy/tcp/tls/x509/reject-anon-client", +- test_precopy_tcp_tls_x509_reject_anon_client); ++ migration_test_add("/migration/precopy/tcp/tls/x509/default-host", ++ test_precopy_tcp_tls_x509_default_host); ++ migration_test_add("/migration/precopy/tcp/tls/x509/override-host", ++ test_precopy_tcp_tls_x509_override_host); ++ migration_test_add("/migration/precopy/tcp/tls/x509/mismatch-host", ++ test_precopy_tcp_tls_x509_mismatch_host); ++ migration_test_add("/migration/precopy/tcp/tls/x509/friendly-client", ++ test_precopy_tcp_tls_x509_friendly_client); ++ migration_test_add("/migration/precopy/tcp/tls/x509/hostile-client", ++ test_precopy_tcp_tls_x509_hostile_client); ++ migration_test_add("/migration/precopy/tcp/tls/x509/allow-anon-client", ++ test_precopy_tcp_tls_x509_allow_anon_client); ++ migration_test_add("/migration/precopy/tcp/tls/x509/reject-anon-client", ++ test_precopy_tcp_tls_x509_reject_anon_client); + #endif /* CONFIG_TASN1 */ + #endif /* CONFIG_GNUTLS */ + +- /* qtest_add_func("/migration/ignore_shared", test_ignore_shared); */ ++ /* migration_test_add("/migration/ignore_shared", test_ignore_shared); */ + #ifndef _WIN32 +- qtest_add_func("/migration/fd_proto", test_migrate_fd_proto); ++ migration_test_add("/migration/fd_proto", test_migrate_fd_proto); + #endif +- qtest_add_func("/migration/validate_uuid", test_validate_uuid); +- qtest_add_func("/migration/validate_uuid_error", test_validate_uuid_error); +- qtest_add_func("/migration/validate_uuid_src_not_set", +- test_validate_uuid_src_not_set); +- qtest_add_func("/migration/validate_uuid_dst_not_set", +- test_validate_uuid_dst_not_set); ++ migration_test_add("/migration/validate_uuid", test_validate_uuid); ++ migration_test_add("/migration/validate_uuid_error", ++ test_validate_uuid_error); ++ migration_test_add("/migration/validate_uuid_src_not_set", ++ test_validate_uuid_src_not_set); ++ migration_test_add("/migration/validate_uuid_dst_not_set", ++ test_validate_uuid_dst_not_set); + /* + * See explanation why this test is slow on function definition + */ + if (g_test_slow()) { +- qtest_add_func("/migration/auto_converge", test_migrate_auto_converge); ++ migration_test_add("/migration/auto_converge", ++ test_migrate_auto_converge); + if (g_str_equal(arch, "x86_64") && + has_kvm && kvm_dirty_ring_supported()) { +- qtest_add_func("/migration/dirty_limit", test_migrate_dirty_limit); ++ migration_test_add("/migration/dirty_limit", ++ test_migrate_dirty_limit); + } + } +- qtest_add_func("/migration/multifd/tcp/plain/none", +- test_multifd_tcp_none); ++ migration_test_add("/migration/multifd/tcp/plain/none", ++ 
test_multifd_tcp_none); + /* + * This test is flaky and sometimes fails in CI and otherwise: + * don't run unless user opts in via environment variable. + */ + if (getenv("QEMU_TEST_FLAKY_TESTS")) { +- qtest_add_func("/migration/multifd/tcp/plain/cancel", +- test_multifd_tcp_cancel); ++ migration_test_add("/migration/multifd/tcp/plain/cancel", ++ test_multifd_tcp_cancel); + } +- qtest_add_func("/migration/multifd/tcp/plain/zlib", +- test_multifd_tcp_zlib); ++ migration_test_add("/migration/multifd/tcp/plain/zlib", ++ test_multifd_tcp_zlib); + #ifdef CONFIG_ZSTD +- qtest_add_func("/migration/multifd/tcp/plain/zstd", +- test_multifd_tcp_zstd); ++ migration_test_add("/migration/multifd/tcp/plain/zstd", ++ test_multifd_tcp_zstd); + #endif + #ifdef CONFIG_GNUTLS +- qtest_add_func("/migration/multifd/tcp/tls/psk/match", +- test_multifd_tcp_tls_psk_match); +- qtest_add_func("/migration/multifd/tcp/tls/psk/mismatch", +- test_multifd_tcp_tls_psk_mismatch); ++ migration_test_add("/migration/multifd/tcp/tls/psk/match", ++ test_multifd_tcp_tls_psk_match); ++ migration_test_add("/migration/multifd/tcp/tls/psk/mismatch", ++ test_multifd_tcp_tls_psk_mismatch); + #ifdef CONFIG_TASN1 +- qtest_add_func("/migration/multifd/tcp/tls/x509/default-host", +- test_multifd_tcp_tls_x509_default_host); +- qtest_add_func("/migration/multifd/tcp/tls/x509/override-host", +- test_multifd_tcp_tls_x509_override_host); +- qtest_add_func("/migration/multifd/tcp/tls/x509/mismatch-host", +- test_multifd_tcp_tls_x509_mismatch_host); +- qtest_add_func("/migration/multifd/tcp/tls/x509/allow-anon-client", +- test_multifd_tcp_tls_x509_allow_anon_client); +- qtest_add_func("/migration/multifd/tcp/tls/x509/reject-anon-client", +- test_multifd_tcp_tls_x509_reject_anon_client); ++ migration_test_add("/migration/multifd/tcp/tls/x509/default-host", ++ test_multifd_tcp_tls_x509_default_host); ++ migration_test_add("/migration/multifd/tcp/tls/x509/override-host", ++ test_multifd_tcp_tls_x509_override_host); ++ migration_test_add("/migration/multifd/tcp/tls/x509/mismatch-host", ++ test_multifd_tcp_tls_x509_mismatch_host); ++ migration_test_add("/migration/multifd/tcp/tls/x509/allow-anon-client", ++ test_multifd_tcp_tls_x509_allow_anon_client); ++ migration_test_add("/migration/multifd/tcp/tls/x509/reject-anon-client", ++ test_multifd_tcp_tls_x509_reject_anon_client); + #endif /* CONFIG_TASN1 */ + #endif /* CONFIG_GNUTLS */ + + if (g_str_equal(arch, "x86_64") && has_kvm && kvm_dirty_ring_supported()) { +- qtest_add_func("/migration/dirty_ring", +- test_precopy_unix_dirty_ring); +- qtest_add_func("/migration/vcpu_dirty_limit", +- test_vcpu_dirty_limit); ++ migration_test_add("/migration/dirty_ring", ++ test_precopy_unix_dirty_ring); ++ migration_test_add("/migration/vcpu_dirty_limit", ++ test_vcpu_dirty_limit); + } + + ret = g_test_run(); +-- +2.43.0 + diff --git a/0368-tests-qtest-re-enable-multifd-cancel-test.patch b/0368-tests-qtest-re-enable-multifd-cancel-test.patch new file mode 100644 index 0000000..4455816 --- /dev/null +++ b/0368-tests-qtest-re-enable-multifd-cancel-test.patch @@ -0,0 +1,45 @@ +From 442304ca0d2d08412faa90834a2684d57c4b0f03 Mon Sep 17 00:00:00 2001 +From: Fabiano Rosas +Date: Wed, 11 Oct 2023 15:46:04 -0300 +Subject: [PATCH] tests/qtest: Re-enable multifd cancel test + +commit 75b1f88cd2dd5eeb1fd817a2f3a291c2670f9c50 upstream. + +We've found the source of flakiness in this test, so re-enable it. 
+ +Intel-SIG: commit 75b1f88cd2dd tests/qtest: Re-enable multifd cancel test + +Reviewed-by: Juan Quintela +Signed-off-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20230606144551.24367-4-farosas@suse.de +[peterx: rebase to 2a61a6964c, to use migration_test_add()] +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + tests/qtest/migration-test.c | 10 ++-------- + 1 file changed, 2 insertions(+), 8 deletions(-) + +diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c +index 4884a40be2..4bdf397828 100644 +--- a/tests/qtest/migration-test.c ++++ b/tests/qtest/migration-test.c +@@ -3475,14 +3475,8 @@ int main(int argc, char **argv) + } + migration_test_add("/migration/multifd/tcp/plain/none", + test_multifd_tcp_none); +- /* +- * This test is flaky and sometimes fails in CI and otherwise: +- * don't run unless user opts in via environment variable. +- */ +- if (getenv("QEMU_TEST_FLAKY_TESTS")) { +- migration_test_add("/migration/multifd/tcp/plain/cancel", +- test_multifd_tcp_cancel); +- } ++ migration_test_add("/migration/multifd/tcp/plain/cancel", ++ test_multifd_tcp_cancel); + migration_test_add("/migration/multifd/tcp/plain/zlib", + test_multifd_tcp_zlib); + #ifdef CONFIG_ZSTD +-- +2.43.0 + diff --git a/0369-docs-migration-create-migration-directory.patch b/0369-docs-migration-create-migration-directory.patch new file mode 100644 index 0000000..00770c1 --- /dev/null +++ b/0369-docs-migration-create-migration-directory.patch @@ -0,0 +1,67 @@ +From 46fe412a0ed405e6943ebbb00831bc1858cc2675 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Tue, 9 Jan 2024 14:46:19 +0800 +Subject: [PATCH] docs/migration: Create migration/ directory +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +commit 8cb2f8b172e74a7279fabb5d5c20aee32b5b98cd upstream. + +Migration documentation is growing into a single file too large. Create a +sub-directory for it for a split. + +We also already have separate vfio/virtio documentations, move it all over +into the directory. + +Note that the virtio one is still not yet converted to rST. That is a job +for later. + +Intel-SIG: commit 8cb2f8b172e7 docs/migration: Create migration/ directory + +Cc: "Michael S. Tsirkin" +Cc: Jason Wang +Cc: Alex Williamson +Cc: Cédric Le Goater +Reviewed-by: Cédric Le Goater +Link: https://lore.kernel.org/r/20240109064628.595453-2-peterx@redhat.com +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + docs/devel/index-internals.rst | 2 +- + docs/devel/{migration.rst => migration/main.rst} | 0 + docs/devel/{vfio-migration.rst => migration/vfio.rst} | 0 + docs/devel/{virtio-migration.txt => migration/virtio.txt} | 0 + 4 files changed, 1 insertion(+), 1 deletion(-) + rename docs/devel/{migration.rst => migration/main.rst} (100%) + rename docs/devel/{vfio-migration.rst => migration/vfio.rst} (100%) + rename docs/devel/{virtio-migration.txt => migration/virtio.txt} (100%) + +diff --git a/docs/devel/index-internals.rst b/docs/devel/index-internals.rst +index 6f81df92bc..645f0cd7b9 100644 +--- a/docs/devel/index-internals.rst ++++ b/docs/devel/index-internals.rst +@@ -11,7 +11,7 @@ Details about QEMU's various subsystems including how to add features to them. 
+ block-coroutine-wrapper + clocks + ebpf_rss +- migration ++ migration/main + multi-process + reset + s390-cpu-topology +diff --git a/docs/devel/migration.rst b/docs/devel/migration/main.rst +similarity index 100% +rename from docs/devel/migration.rst +rename to docs/devel/migration/main.rst +diff --git a/docs/devel/vfio-migration.rst b/docs/devel/migration/vfio.rst +similarity index 100% +rename from docs/devel/vfio-migration.rst +rename to docs/devel/migration/vfio.rst +diff --git a/docs/devel/virtio-migration.txt b/docs/devel/migration/virtio.txt +similarity index 100% +rename from docs/devel/virtio-migration.txt +rename to docs/devel/migration/virtio.txt +-- +2.43.0 + diff --git a/0370-docs-migration-create-index-page.patch b/0370-docs-migration-create-index-page.patch new file mode 100644 index 0000000..7508b52 --- /dev/null +++ b/0370-docs-migration-create-index-page.patch @@ -0,0 +1,99 @@ +From 71a80ee5ea5df54a5cc5c1f49fd02d443e0b2399 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Tue, 9 Jan 2024 14:46:20 +0800 +Subject: [PATCH] docs/migration: Create index page +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +commit f6bbac985e6df492f2c6be94fb893ada75ffdefa upstream. + +Create an index page for migration module. Move VFIO migration there too. +A trivial touch-up on the title to use lower case there. + +Since then we'll have "migration" as the top title, make the main doc file +renamed to "migration framework". + +Intel-SIG: commit f6bbac985e6d docs/migration: Create index page + +Cc: Alex Williamson +Cc: Cédric Le Goater +Reviewed-by: Cédric Le Goater +Link: https://lore.kernel.org/r/20240109064628.595453-3-peterx@redhat.com +Signed-off-by: Peter Xu + +Conflicts: + docs/devel/index-internals.rst +[jz: resolve simple context conflict] +Signed-off-by: Jason Zeng +--- + docs/devel/index-internals.rst | 3 +-- + docs/devel/migration/index.rst | 11 +++++++++++ + docs/devel/migration/main.rst | 6 +++--- + docs/devel/migration/vfio.rst | 2 +- + 4 files changed, 16 insertions(+), 6 deletions(-) + create mode 100644 docs/devel/migration/index.rst + +diff --git a/docs/devel/index-internals.rst b/docs/devel/index-internals.rst +index 645f0cd7b9..4d1502d4b4 100644 +--- a/docs/devel/index-internals.rst ++++ b/docs/devel/index-internals.rst +@@ -11,12 +11,11 @@ Details about QEMU's various subsystems including how to add features to them. + block-coroutine-wrapper + clocks + ebpf_rss +- migration/main ++ migration/index + multi-process + reset + s390-cpu-topology + s390-dasd-ipl + tracing +- vfio-migration + writing-monitor-commands + virtio-backends +diff --git a/docs/devel/migration/index.rst b/docs/devel/migration/index.rst +new file mode 100644 +index 0000000000..02cfdcc969 +--- /dev/null ++++ b/docs/devel/migration/index.rst +@@ -0,0 +1,11 @@ ++Migration ++========= ++ ++This is the main entry for QEMU migration documentations. It explains how ++QEMU live migration works. ++ ++.. toctree:: ++ :maxdepth: 2 ++ ++ main ++ vfio +diff --git a/docs/devel/migration/main.rst b/docs/devel/migration/main.rst +index ec55089b25..82cdb420bf 100644 +--- a/docs/devel/migration/main.rst ++++ b/docs/devel/migration/main.rst +@@ -1,6 +1,6 @@ +-========= +-Migration +-========= ++=================== ++Migration framework ++=================== + + QEMU has code to load/save the state of the guest that it is running. + These are two complementary operations. 
Saving the state just does +diff --git a/docs/devel/migration/vfio.rst b/docs/devel/migration/vfio.rst +index 605fe60e96..c49482eab6 100644 +--- a/docs/devel/migration/vfio.rst ++++ b/docs/devel/migration/vfio.rst +@@ -1,5 +1,5 @@ + ===================== +-VFIO device Migration ++VFIO device migration + ===================== + + Migration of virtual machine involves saving the state for each device that +-- +2.43.0 + diff --git a/0371-docs-migration-convert-virtio-txt-into-rst.patch b/0371-docs-migration-convert-virtio-txt-into-rst.patch new file mode 100644 index 0000000..4732f32 --- /dev/null +++ b/0371-docs-migration-convert-virtio-txt-into-rst.patch @@ -0,0 +1,273 @@ +From e086b71497bfb223b9a719af4a5a695fe179bbab Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Tue, 9 Jan 2024 14:46:21 +0800 +Subject: [PATCH] docs/migration: Convert virtio.txt into rST +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +commit 4d7a691bcfeb5580e3f7457e1f1c2fbd64572161 upstream. + +Convert the plain old .txt into .rst, add it into migration/index.rst. + +Intel-SIG: commit 4d7a691bcfeb docs/migration: Convert virtio.txt into rST + +Reviewed-by: Cédric Le Goater +Link: https://lore.kernel.org/r/20240109064628.595453-4-peterx@redhat.com +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + docs/devel/migration/index.rst | 1 + + docs/devel/migration/virtio.rst | 115 ++++++++++++++++++++++++++++++++ + docs/devel/migration/virtio.txt | 108 ------------------------------ + 3 files changed, 116 insertions(+), 108 deletions(-) + create mode 100644 docs/devel/migration/virtio.rst + delete mode 100644 docs/devel/migration/virtio.txt + +diff --git a/docs/devel/migration/index.rst b/docs/devel/migration/index.rst +index 02cfdcc969..2cb701c77c 100644 +--- a/docs/devel/migration/index.rst ++++ b/docs/devel/migration/index.rst +@@ -9,3 +9,4 @@ QEMU live migration works. + + main + vfio ++ virtio +diff --git a/docs/devel/migration/virtio.rst b/docs/devel/migration/virtio.rst +new file mode 100644 +index 0000000000..611a18b821 +--- /dev/null ++++ b/docs/devel/migration/virtio.rst +@@ -0,0 +1,115 @@ ++======================= ++Virtio device migration ++======================= ++ ++Copyright 2015 IBM Corp. ++ ++This work is licensed under the terms of the GNU GPL, version 2 or later. See ++the COPYING file in the top-level directory. ++ ++Saving and restoring the state of virtio devices is a bit of a twisty maze, ++for several reasons: ++ ++- state is distributed between several parts: ++ ++ - virtio core, for common fields like features, number of queues, ... ++ ++ - virtio transport (pci, ccw, ...), for the different proxy devices and ++ transport specific state (msix vectors, indicators, ...) ++ ++ - virtio device (net, blk, ...), for the different device types and their ++ state (mac address, request queue, ...) ++ ++- most fields are saved via the stream interface; subsequently, subsections ++ have been added to make cross-version migration possible ++ ++This file attempts to document the current procedure and point out some ++caveats. 
++ ++Save state procedure ++==================== ++ ++:: ++ ++ virtio core virtio transport virtio device ++ ----------- ---------------- ------------- ++ ++ save() function registered ++ via VMState wrapper on ++ device class ++ virtio_save() <---------- ++ ------> save_config() ++ - save proxy device ++ - save transport-specific ++ device fields ++ - save common device ++ fields ++ - save common virtqueue ++ fields ++ ------> save_queue() ++ - save transport-specific ++ virtqueue fields ++ ------> save_device() ++ - save device-specific ++ fields ++ - save subsections ++ - device endianness, ++ if changed from ++ default endianness ++ - 64 bit features, if ++ any high feature bit ++ is set ++ - virtio-1 virtqueue ++ fields, if VERSION_1 ++ is set ++ ++Load state procedure ++==================== ++ ++:: ++ ++ virtio core virtio transport virtio device ++ ----------- ---------------- ------------- ++ ++ load() function registered ++ via VMState wrapper on ++ device class ++ virtio_load() <---------- ++ ------> load_config() ++ - load proxy device ++ - load transport-specific ++ device fields ++ - load common device ++ fields ++ - load common virtqueue ++ fields ++ ------> load_queue() ++ - load transport-specific ++ virtqueue fields ++ - notify guest ++ ------> load_device() ++ - load device-specific ++ fields ++ - load subsections ++ - device endianness ++ - 64 bit features ++ - virtio-1 virtqueue ++ fields ++ - sanitize endianness ++ - sanitize features ++ - virtqueue index sanity ++ check ++ - feature-dependent setup ++ ++Implications of this setup ++========================== ++ ++Devices need to be careful in their state processing during load: The ++load_device() procedure is invoked by the core before subsections have ++been loaded. Any code that depends on information transmitted in subsections ++therefore has to be invoked in the device's load() function _after_ ++virtio_load() returned (like e.g. code depending on features). ++ ++Any extension of the state being migrated should be done in subsections ++added to the core for compatibility reasons. If transport or device specific ++state is added, core needs to invoke a callback from the new subsection. +diff --git a/docs/devel/migration/virtio.txt b/docs/devel/migration/virtio.txt +deleted file mode 100644 +index 98a6b0ffb5..0000000000 +--- a/docs/devel/migration/virtio.txt ++++ /dev/null +@@ -1,108 +0,0 @@ +-Virtio devices and migration +-============================ +- +-Copyright 2015 IBM Corp. +- +-This work is licensed under the terms of the GNU GPL, version 2 or later. See +-the COPYING file in the top-level directory. +- +-Saving and restoring the state of virtio devices is a bit of a twisty maze, +-for several reasons: +-- state is distributed between several parts: +- - virtio core, for common fields like features, number of queues, ... +- - virtio transport (pci, ccw, ...), for the different proxy devices and +- transport specific state (msix vectors, indicators, ...) +- - virtio device (net, blk, ...), for the different device types and their +- state (mac address, request queue, ...) +-- most fields are saved via the stream interface; subsequently, subsections +- have been added to make cross-version migration possible +- +-This file attempts to document the current procedure and point out some +-caveats. 
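As an editorial illustration of the "Implications of this setup" note above: feature-dependent work has to wait until virtio_load() has returned, because load_device() runs before the subsections (64-bit features, virtio-1 virtqueue fields, ...) are read. In the sketch below the device type and feature bit are invented for the example; only virtio_load() and virtio_vdev_has_feature() are the real core helpers, and the surrounding boilerplate is a simplified assumption::

    #include "qemu/osdep.h"
    #include "hw/virtio/virtio.h"

    /* Hypothetical device state and feature bit, for illustration only. */
    #define MYDEV_F_EXTRA_QUEUES 40

    typedef struct MyVirtIODevice {
        VirtIODevice parent_obj;
        bool extra_queues_enabled;
    } MyVirtIODevice;

    /* load() entry point registered via the VMState wrapper on the class. */
    static int mydev_load(void *opaque, QEMUFile *f, int version_id)
    {
        MyVirtIODevice *s = opaque;
        VirtIODevice *vdev = &s->parent_obj;
        int ret;

        /*
         * virtio_load() drives load_config()/load_queue()/load_device()
         * and only then parses the subsections, so inside load_device()
         * the negotiated feature bits are not final yet.
         */
        ret = virtio_load(vdev, f, version_id);
        if (ret) {
            return ret;
        }

        /* Only after virtio_load() returns is it safe to act on features. */
        s->extra_queues_enabled = virtio_vdev_has_feature(vdev,
                                                          MYDEV_F_EXTRA_QUEUES);
        return 0;
    }

Any new state the device wants to migrate would still go into a subsection added to the core, as the text above requires, rather than into ad-hoc fields in load_device().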
+- +- +-Save state procedure +-==================== +- +-virtio core virtio transport virtio device +------------ ---------------- ------------- +- +- save() function registered +- via VMState wrapper on +- device class +-virtio_save() <---------- +- ------> save_config() +- - save proxy device +- - save transport-specific +- device fields +-- save common device +- fields +-- save common virtqueue +- fields +- ------> save_queue() +- - save transport-specific +- virtqueue fields +- ------> save_device() +- - save device-specific +- fields +-- save subsections +- - device endianness, +- if changed from +- default endianness +- - 64 bit features, if +- any high feature bit +- is set +- - virtio-1 virtqueue +- fields, if VERSION_1 +- is set +- +- +-Load state procedure +-==================== +- +-virtio core virtio transport virtio device +------------ ---------------- ------------- +- +- load() function registered +- via VMState wrapper on +- device class +-virtio_load() <---------- +- ------> load_config() +- - load proxy device +- - load transport-specific +- device fields +-- load common device +- fields +-- load common virtqueue +- fields +- ------> load_queue() +- - load transport-specific +- virtqueue fields +-- notify guest +- ------> load_device() +- - load device-specific +- fields +-- load subsections +- - device endianness +- - 64 bit features +- - virtio-1 virtqueue +- fields +-- sanitize endianness +-- sanitize features +-- virtqueue index sanity +- check +- - feature-dependent setup +- +- +-Implications of this setup +-========================== +- +-Devices need to be careful in their state processing during load: The +-load_device() procedure is invoked by the core before subsections have +-been loaded. Any code that depends on information transmitted in subsections +-therefore has to be invoked in the device's load() function _after_ +-virtio_load() returned (like e.g. code depending on features). +- +-Any extension of the state being migrated should be done in subsections +-added to the core for compatibility reasons. If transport or device specific +-state is added, core needs to invoke a callback from the new subsection. +-- +2.43.0 + diff --git a/0372-docs-migration-split-backwards-compatibility-separat.patch b/0372-docs-migration-split-backwards-compatibility-separat.patch new file mode 100644 index 0000000..de211fe --- /dev/null +++ b/0372-docs-migration-split-backwards-compatibility-separat.patch @@ -0,0 +1,1090 @@ +From c127516b35a29d8d4e66b435fc16f500e1e976b8 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Tue, 9 Jan 2024 14:46:22 +0800 +Subject: [PATCH] docs/migration: Split "Backwards compatibility" separately +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +commit 6cc6a7b98b88f1a7d1d5ed99db0d373a46606aac upstream. + +Split the section from main.rst into a separate file. Reference it in the +index.rst. 
+ +Intel-SIG: commit 6cc6a7b98b88 docs/migration: Split "Backwards compatibility" separately + +Reviewed-by: Cédric Le Goater +Link: https://lore.kernel.org/r/20240109064628.595453-5-peterx@redhat.com +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + docs/devel/migration/compatibility.rst | 517 ++++++++++++++++++++++++ + docs/devel/migration/index.rst | 1 + + docs/devel/migration/main.rst | 519 ------------------------- + 3 files changed, 518 insertions(+), 519 deletions(-) + create mode 100644 docs/devel/migration/compatibility.rst + +diff --git a/docs/devel/migration/compatibility.rst b/docs/devel/migration/compatibility.rst +new file mode 100644 +index 0000000000..5a5417ef06 +--- /dev/null ++++ b/docs/devel/migration/compatibility.rst +@@ -0,0 +1,517 @@ ++Backwards compatibility ++======================= ++ ++How backwards compatibility works ++--------------------------------- ++ ++When we do migration, we have two QEMU processes: the source and the ++target. There are two cases, they are the same version or they are ++different versions. The easy case is when they are the same version. ++The difficult one is when they are different versions. ++ ++There are two things that are different, but they have very similar ++names and sometimes get confused: ++ ++- QEMU version ++- machine type version ++ ++Let's start with a practical example, we start with: ++ ++- qemu-system-x86_64 (v5.2), from now on qemu-5.2. ++- qemu-system-x86_64 (v5.1), from now on qemu-5.1. ++ ++Related to this are the "latest" machine types defined on each of ++them: ++ ++- pc-q35-5.2 (newer one in qemu-5.2) from now on pc-5.2 ++- pc-q35-5.1 (newer one in qemu-5.1) from now on pc-5.1 ++ ++First of all, migration is only supposed to work if you use the same ++machine type in both source and destination. The QEMU hardware ++configuration needs to be the same also on source and destination. ++Most aspects of the backend configuration can be changed at will, ++except for a few cases where the backend features influence frontend ++device feature exposure. But that is not relevant for this section. ++ ++I am going to list the number of combinations that we can have. Let's ++start with the trivial ones, QEMU is the same on source and ++destination: ++ ++1 - qemu-5.2 -M pc-5.2 -> migrates to -> qemu-5.2 -M pc-5.2 ++ ++ This is the latest QEMU with the latest machine type. ++ This have to work, and if it doesn't work it is a bug. ++ ++2 - qemu-5.1 -M pc-5.1 -> migrates to -> qemu-5.1 -M pc-5.1 ++ ++ Exactly the same case than the previous one, but for 5.1. ++ Nothing to see here either. ++ ++This are the easiest ones, we will not talk more about them in this ++section. ++ ++Now we start with the more interesting cases. Consider the case where ++we have the same QEMU version in both sides (qemu-5.2) but we are using ++the latest machine type for that version (pc-5.2) but one of an older ++QEMU version, in this case pc-5.1. ++ ++3 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1 ++ ++ It needs to use the definition of pc-5.1 and the devices as they ++ were configured on 5.1, but this should be easy in the sense that ++ both sides are the same QEMU and both sides have exactly the same ++ idea of what the pc-5.1 machine is. ++ ++4 - qemu-5.1 -M pc-5.2 -> migrates to -> qemu-5.1 -M pc-5.2 ++ ++ This combination is not possible as the qemu-5.1 doesn't understand ++ pc-5.2 machine type. So nothing to worry here. ++ ++Now it comes the interesting ones, when both QEMU processes are ++different. 
Notice also that the machine type needs to be pc-5.1, ++because we have the limitation than qemu-5.1 doesn't know pc-5.2. So ++the possible cases are: ++ ++5 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.1 -M pc-5.1 ++ ++ This migration is known as newer to older. We need to make sure ++ when we are developing 5.2 we need to take care about not to break ++ migration to qemu-5.1. Notice that we can't make updates to ++ qemu-5.1 to understand whatever qemu-5.2 decides to change, so it is ++ in qemu-5.2 side to make the relevant changes. ++ ++6 - qemu-5.1 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1 ++ ++ This migration is known as older to newer. We need to make sure ++ than we are able to receive migrations from qemu-5.1. The problem is ++ similar to the previous one. ++ ++If qemu-5.1 and qemu-5.2 were the same, there will not be any ++compatibility problems. But the reason that we create qemu-5.2 is to ++get new features, devices, defaults, etc. ++ ++If we get a device that has a new feature, or change a default value, ++we have a problem when we try to migrate between different QEMU ++versions. ++ ++So we need a way to tell qemu-5.2 that when we are using machine type ++pc-5.1, it needs to **not** use the feature, to be able to migrate to ++real qemu-5.1. ++ ++And the equivalent part when migrating from qemu-5.1 to qemu-5.2. ++qemu-5.2 has to expect that it is not going to get data for the new ++feature, because qemu-5.1 doesn't know about it. ++ ++How do we tell QEMU about these device feature changes? In ++hw/core/machine.c:hw_compat_X_Y arrays. ++ ++If we change a default value, we need to put back the old value on ++that array. And the device, during initialization needs to look at ++that array to see what value it needs to get for that feature. And ++what are we going to put in that array, the value of a property. ++ ++To create a property for a device, we need to use one of the ++DEFINE_PROP_*() macros. See include/hw/qdev-properties.h to find the ++macros that exist. With it, we set the default value for that ++property, and that is what it is going to get in the latest released ++version. But if we want a different value for a previous version, we ++can change that in the hw_compat_X_Y arrays. ++ ++hw_compat_X_Y is an array of registers that have the format: ++ ++- name_device ++- name_property ++- value ++ ++Let's see a practical example. ++ ++In qemu-5.2 virtio-blk-device got multi queue support. This is a ++change that is not backward compatible. In qemu-5.1 it has one ++queue. In qemu-5.2 it has the same number of queues as the number of ++cpus in the system. ++ ++When we are doing migration, if we migrate from a device that has 4 ++queues to a device that have only one queue, we don't know where to ++put the extra information for the other 3 queues, and we fail ++migration. ++ ++Similar problem when we migrate from qemu-5.1 that has only one queue ++to qemu-5.2, we only sent information for one queue, but destination ++has 4, and we have 3 queues that are not properly initialized and ++anything can happen. ++ ++So, how can we address this problem. Easy, just convince qemu-5.2 ++that when it is running pc-5.1, it needs to set the number of queues ++for virtio-blk-devices to 1. ++ ++That way we fix the cases 5 and 6. ++ ++5 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.1 -M pc-5.1 ++ ++ qemu-5.2 -M pc-5.1 sets number of queues to be 1. ++ qemu-5.1 -M pc-5.1 expects number of queues to be 1. ++ ++ correct. migration works. 
++ ++6 - qemu-5.1 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1 ++ ++ qemu-5.1 -M pc-5.1 sets number of queues to be 1. ++ qemu-5.2 -M pc-5.1 expects number of queues to be 1. ++ ++ correct. migration works. ++ ++And now the other interesting case, case 3. In this case we have: ++ ++3 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1 ++ ++ Here we have the same QEMU in both sides. So it doesn't matter a ++ lot if we have set the number of queues to 1 or not, because ++ they are the same. ++ ++ WRONG! ++ ++ Think what happens if we do one of this double migrations: ++ ++ A -> migrates -> B -> migrates -> C ++ ++ where: ++ ++ A: qemu-5.1 -M pc-5.1 ++ B: qemu-5.2 -M pc-5.1 ++ C: qemu-5.2 -M pc-5.1 ++ ++ migration A -> B is case 6, so number of queues needs to be 1. ++ ++ migration B -> C is case 3, so we don't care. But actually we ++ care because we haven't started the guest in qemu-5.2, it came ++ migrated from qemu-5.1. So to be in the safe place, we need to ++ always use number of queues 1 when we are using pc-5.1. ++ ++Now, how was this done in reality? The following commit shows how it ++was done:: ++ ++ commit 9445e1e15e66c19e42bea942ba810db28052cd05 ++ Author: Stefan Hajnoczi ++ Date: Tue Aug 18 15:33:47 2020 +0100 ++ ++ virtio-blk-pci: default num_queues to -smp N ++ ++The relevant parts for migration are:: ++ ++ @@ -1281,7 +1284,8 @@ static Property virtio_blk_properties[] = { ++ #endif ++ DEFINE_PROP_BIT("request-merging", VirtIOBlock, conf.request_merging, 0, ++ true), ++ - DEFINE_PROP_UINT16("num-queues", VirtIOBlock, conf.num_queues, 1), ++ + DEFINE_PROP_UINT16("num-queues", VirtIOBlock, conf.num_queues, ++ + VIRTIO_BLK_AUTO_NUM_QUEUES), ++ DEFINE_PROP_UINT16("queue-size", VirtIOBlock, conf.queue_size, 256), ++ ++It changes the default value of num_queues. But it fishes it for old ++machine types to have the right value:: ++ ++ @@ -31,6 +31,7 @@ ++ GlobalProperty hw_compat_5_1[] = { ++ ... ++ + { "virtio-blk-device", "num-queues", "1"}, ++ ... ++ }; ++ ++A device with different features on both sides ++---------------------------------------------- ++ ++Let's assume that we are using the same QEMU binary on both sides, ++just to make the things easier. But we have a device that has ++different features on both sides of the migration. That can be ++because the devices are different, because the kernel driver of both ++devices have different features, whatever. ++ ++How can we get this to work with migration. The way to do that is ++"theoretically" easy. You have to get the features that the device ++has in the source of the migration. The features that the device has ++on the target of the migration, you get the intersection of the ++features of both sides, and that is the way that you should launch ++QEMU. ++ ++Notice that this is not completely related to QEMU. The most ++important thing here is that this should be handled by the managing ++application that launches QEMU. If QEMU is configured correctly, the ++migration will succeed. ++ ++That said, actually doing it is complicated. Almost all devices are ++bad at being able to be launched with only some features enabled. ++With one big exception: cpus. ++ ++You can read the documentation for QEMU x86 cpu models here: ++ ++https://qemu-project.gitlab.io/qemu/system/qemu-cpu-models.html ++ ++See when they talk about migration they recommend that one chooses the ++newest cpu model that is supported for all cpus. 
++ ++Let's say that we have: ++ ++Host A: ++ ++Device X has the feature Y ++ ++Host B: ++ ++Device X has not the feature Y ++ ++If we try to migrate without any care from host A to host B, it will ++fail because when migration tries to load the feature Y on ++destination, it will find that the hardware is not there. ++ ++Doing this would be the equivalent of doing with cpus: ++ ++Host A: ++ ++$ qemu-system-x86_64 -cpu host ++ ++Host B: ++ ++$ qemu-system-x86_64 -cpu host ++ ++When both hosts have different cpu features this is guaranteed to ++fail. Especially if Host B has less features than host A. If host A ++has less features than host B, sometimes it works. Important word of ++last sentence is "sometimes". ++ ++So, forgetting about cpu models and continuing with the -cpu host ++example, let's see that the differences of the cpus is that Host A and ++B have the following features: ++ ++Features: 'pcid' 'stibp' 'taa-no' ++Host A: X X ++Host B: X ++ ++And we want to migrate between them, the way configure both QEMU cpu ++will be: ++ ++Host A: ++ ++$ qemu-system-x86_64 -cpu host,pcid=off,stibp=off ++ ++Host B: ++ ++$ qemu-system-x86_64 -cpu host,taa-no=off ++ ++And you would be able to migrate between them. It is responsibility ++of the management application or of the user to make sure that the ++configuration is correct. QEMU doesn't know how to look at this kind ++of features in general. ++ ++Notice that we don't recommend to use -cpu host for migration. It is ++used in this example because it makes the example simpler. ++ ++Other devices have worse control about individual features. If they ++want to be able to migrate between hosts that show different features, ++the device needs a way to configure which ones it is going to use. ++ ++In this section we have considered that we are using the same QEMU ++binary in both sides of the migration. If we use different QEMU ++versions process, then we need to have into account all other ++differences and the examples become even more complicated. ++ ++How to mitigate when we have a backward compatibility error ++----------------------------------------------------------- ++ ++We broke migration for old machine types continuously during ++development. But as soon as we find that there is a problem, we fix ++it. The problem is what happens when we detect after we have done a ++release that something has gone wrong. ++ ++Let see how it worked with one example. ++ ++After the release of qemu-8.0 we found a problem when doing migration ++of the machine type pc-7.2. ++ ++- $ qemu-7.2 -M pc-7.2 -> qemu-7.2 -M pc-7.2 ++ ++ This migration works ++ ++- $ qemu-8.0 -M pc-7.2 -> qemu-8.0 -M pc-7.2 ++ ++ This migration works ++ ++- $ qemu-8.0 -M pc-7.2 -> qemu-7.2 -M pc-7.2 ++ ++ This migration fails ++ ++- $ qemu-7.2 -M pc-7.2 -> qemu-8.0 -M pc-7.2 ++ ++ This migration fails ++ ++So clearly something fails when migration between qemu-7.2 and ++qemu-8.0 with machine type pc-7.2. The error messages, and git bisect ++pointed to this commit. 
++ ++In qemu-8.0 we got this commit:: ++ ++ commit 010746ae1db7f52700cb2e2c46eb94f299cfa0d2 ++ Author: Jonathan Cameron ++ Date: Thu Mar 2 13:37:02 2023 +0000 ++ ++ hw/pci/aer: Implement PCI_ERR_UNCOR_MASK register ++ ++ ++The relevant bits of the commit for our example are this ones:: ++ ++ --- a/hw/pci/pcie_aer.c ++ +++ b/hw/pci/pcie_aer.c ++ @@ -112,6 +112,10 @@ int pcie_aer_init(PCIDevice *dev, ++ ++ pci_set_long(dev->w1cmask + offset + PCI_ERR_UNCOR_STATUS, ++ PCI_ERR_UNC_SUPPORTED); ++ + pci_set_long(dev->config + offset + PCI_ERR_UNCOR_MASK, ++ + PCI_ERR_UNC_MASK_DEFAULT); ++ + pci_set_long(dev->wmask + offset + PCI_ERR_UNCOR_MASK, ++ + PCI_ERR_UNC_SUPPORTED); ++ ++ pci_set_long(dev->config + offset + PCI_ERR_UNCOR_SEVER, ++ PCI_ERR_UNC_SEVERITY_DEFAULT); ++ ++The patch changes how we configure PCI space for AER. But QEMU fails ++when the PCI space configuration is different between source and ++destination. ++ ++The following commit shows how this got fixed:: ++ ++ commit 5ed3dabe57dd9f4c007404345e5f5bf0e347317f ++ Author: Leonardo Bras ++ Date: Tue May 2 21:27:02 2023 -0300 ++ ++ hw/pci: Disable PCI_ERR_UNCOR_MASK register for machine type < 8.0 ++ ++ [...] ++ ++The relevant parts of the fix in QEMU are as follow: ++ ++First, we create a new property for the device to be able to configure ++the old behaviour or the new behaviour:: ++ ++ diff --git a/hw/pci/pci.c b/hw/pci/pci.c ++ index 8a87ccc8b0..5153ad63d6 100644 ++ --- a/hw/pci/pci.c ++ +++ b/hw/pci/pci.c ++ @@ -79,6 +79,8 @@ static Property pci_props[] = { ++ DEFINE_PROP_STRING("failover_pair_id", PCIDevice, ++ failover_pair_id), ++ DEFINE_PROP_UINT32("acpi-index", PCIDevice, acpi_index, 0), ++ + DEFINE_PROP_BIT("x-pcie-err-unc-mask", PCIDevice, cap_present, ++ + QEMU_PCIE_ERR_UNC_MASK_BITNR, true), ++ DEFINE_PROP_END_OF_LIST() ++ }; ++ ++Notice that we enable the feature for new machine types. ++ ++Now we see how the fix is done. This is going to depend on what kind ++of breakage happens, but in this case it is quite simple:: ++ ++ diff --git a/hw/pci/pcie_aer.c b/hw/pci/pcie_aer.c ++ index 103667c368..374d593ead 100644 ++ --- a/hw/pci/pcie_aer.c ++ +++ b/hw/pci/pcie_aer.c ++ @@ -112,10 +112,13 @@ int pcie_aer_init(PCIDevice *dev, uint8_t cap_ver, ++ uint16_t offset, ++ ++ pci_set_long(dev->w1cmask + offset + PCI_ERR_UNCOR_STATUS, ++ PCI_ERR_UNC_SUPPORTED); ++ - pci_set_long(dev->config + offset + PCI_ERR_UNCOR_MASK, ++ - PCI_ERR_UNC_MASK_DEFAULT); ++ - pci_set_long(dev->wmask + offset + PCI_ERR_UNCOR_MASK, ++ - PCI_ERR_UNC_SUPPORTED); ++ + ++ + if (dev->cap_present & QEMU_PCIE_ERR_UNC_MASK) { ++ + pci_set_long(dev->config + offset + PCI_ERR_UNCOR_MASK, ++ + PCI_ERR_UNC_MASK_DEFAULT); ++ + pci_set_long(dev->wmask + offset + PCI_ERR_UNCOR_MASK, ++ + PCI_ERR_UNC_SUPPORTED); ++ + } ++ ++ pci_set_long(dev->config + offset + PCI_ERR_UNCOR_SEVER, ++ PCI_ERR_UNC_SEVERITY_DEFAULT); ++ ++I.e. If the property bit is enabled, we configure it as we did for ++qemu-8.0. If the property bit is not set, we configure it as it was in 7.2. 
++ ++And now, everything that is missing is disabling the feature for old ++machine types:: ++ ++ diff --git a/hw/core/machine.c b/hw/core/machine.c ++ index 47a34841a5..07f763eb2e 100644 ++ --- a/hw/core/machine.c ++ +++ b/hw/core/machine.c ++ @@ -48,6 +48,7 @@ GlobalProperty hw_compat_7_2[] = { ++ { "e1000e", "migrate-timadj", "off" }, ++ { "virtio-mem", "x-early-migration", "false" }, ++ { "migration", "x-preempt-pre-7-2", "true" }, ++ + { TYPE_PCI_DEVICE, "x-pcie-err-unc-mask", "off" }, ++ }; ++ const size_t hw_compat_7_2_len = G_N_ELEMENTS(hw_compat_7_2); ++ ++And now, when qemu-8.0.1 is released with this fix, all combinations ++are going to work as supposed. ++ ++- $ qemu-7.2 -M pc-7.2 -> qemu-7.2 -M pc-7.2 (works) ++- $ qemu-8.0.1 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 (works) ++- $ qemu-8.0.1 -M pc-7.2 -> qemu-7.2 -M pc-7.2 (works) ++- $ qemu-7.2 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 (works) ++ ++So the normality has been restored and everything is ok, no? ++ ++Not really, now our matrix is much bigger. We started with the easy ++cases, migration from the same version to the same version always ++works: ++ ++- $ qemu-7.2 -M pc-7.2 -> qemu-7.2 -M pc-7.2 ++- $ qemu-8.0 -M pc-7.2 -> qemu-8.0 -M pc-7.2 ++- $ qemu-8.0.1 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 ++ ++Now the interesting ones. When the QEMU processes versions are ++different. For the 1st set, their fail and we can do nothing, both ++versions are released and we can't change anything. ++ ++- $ qemu-7.2 -M pc-7.2 -> qemu-8.0 -M pc-7.2 ++- $ qemu-8.0 -M pc-7.2 -> qemu-7.2 -M pc-7.2 ++ ++This two are the ones that work. The whole point of making the ++change in qemu-8.0.1 release was to fix this issue: ++ ++- $ qemu-7.2 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 ++- $ qemu-8.0.1 -M pc-7.2 -> qemu-7.2 -M pc-7.2 ++ ++But now we found that qemu-8.0 neither can migrate to qemu-7.2 not ++qemu-8.0.1. ++ ++- $ qemu-8.0 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 ++- $ qemu-8.0.1 -M pc-7.2 -> qemu-8.0 -M pc-7.2 ++ ++So, if we start a pc-7.2 machine in qemu-8.0 we can't migrate it to ++anything except to qemu-8.0. ++ ++Can we do better? ++ ++Yeap. If we know that we are going to do this migration: ++ ++- $ qemu-8.0 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 ++ ++We can launch the appropriate devices with:: ++ ++ --device...,x-pci-e-err-unc-mask=on ++ ++And now we can receive a migration from 8.0. And from now on, we can ++do that migration to new machine types if we remember to enable that ++property for pc-7.2. Notice that we need to remember, it is not ++enough to know that the source of the migration is qemu-8.0. Think of ++this example: ++ ++$ qemu-8.0 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 -> qemu-8.2 -M pc-7.2 ++ ++In the second migration, the source is not qemu-8.0, but we still have ++that "problem" and have that property enabled. Notice that we need to ++continue having this mark/property until we have this machine ++rebooted. But it is not a normal reboot (that don't reload QEMU) we ++need the machine to poweroff/poweron on a fixed QEMU. And from now ++on we can use the proper real machine. +diff --git a/docs/devel/migration/index.rst b/docs/devel/migration/index.rst +index 2cb701c77c..7fc02b9520 100644 +--- a/docs/devel/migration/index.rst ++++ b/docs/devel/migration/index.rst +@@ -8,5 +8,6 @@ QEMU live migration works. 
+ :maxdepth: 2 + + main ++ compatibility + vfio + virtio +diff --git a/docs/devel/migration/main.rst b/docs/devel/migration/main.rst +index 82cdb420bf..04194414af 100644 +--- a/docs/devel/migration/main.rst ++++ b/docs/devel/migration/main.rst +@@ -993,522 +993,3 @@ In some cases it may be best to tie specific firmware versions to specific + versioned machine types to cut down on the combinations that will need + support. This is also useful when newer versions of firmware outgrow + the padding. +- +- +-Backwards compatibility +-======================= +- +-How backwards compatibility works +---------------------------------- +- +-When we do migration, we have two QEMU processes: the source and the +-target. There are two cases, they are the same version or they are +-different versions. The easy case is when they are the same version. +-The difficult one is when they are different versions. +- +-There are two things that are different, but they have very similar +-names and sometimes get confused: +- +-- QEMU version +-- machine type version +- +-Let's start with a practical example, we start with: +- +-- qemu-system-x86_64 (v5.2), from now on qemu-5.2. +-- qemu-system-x86_64 (v5.1), from now on qemu-5.1. +- +-Related to this are the "latest" machine types defined on each of +-them: +- +-- pc-q35-5.2 (newer one in qemu-5.2) from now on pc-5.2 +-- pc-q35-5.1 (newer one in qemu-5.1) from now on pc-5.1 +- +-First of all, migration is only supposed to work if you use the same +-machine type in both source and destination. The QEMU hardware +-configuration needs to be the same also on source and destination. +-Most aspects of the backend configuration can be changed at will, +-except for a few cases where the backend features influence frontend +-device feature exposure. But that is not relevant for this section. +- +-I am going to list the number of combinations that we can have. Let's +-start with the trivial ones, QEMU is the same on source and +-destination: +- +-1 - qemu-5.2 -M pc-5.2 -> migrates to -> qemu-5.2 -M pc-5.2 +- +- This is the latest QEMU with the latest machine type. +- This have to work, and if it doesn't work it is a bug. +- +-2 - qemu-5.1 -M pc-5.1 -> migrates to -> qemu-5.1 -M pc-5.1 +- +- Exactly the same case than the previous one, but for 5.1. +- Nothing to see here either. +- +-This are the easiest ones, we will not talk more about them in this +-section. +- +-Now we start with the more interesting cases. Consider the case where +-we have the same QEMU version in both sides (qemu-5.2) but we are using +-the latest machine type for that version (pc-5.2) but one of an older +-QEMU version, in this case pc-5.1. +- +-3 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1 +- +- It needs to use the definition of pc-5.1 and the devices as they +- were configured on 5.1, but this should be easy in the sense that +- both sides are the same QEMU and both sides have exactly the same +- idea of what the pc-5.1 machine is. +- +-4 - qemu-5.1 -M pc-5.2 -> migrates to -> qemu-5.1 -M pc-5.2 +- +- This combination is not possible as the qemu-5.1 doesn't understand +- pc-5.2 machine type. So nothing to worry here. +- +-Now it comes the interesting ones, when both QEMU processes are +-different. Notice also that the machine type needs to be pc-5.1, +-because we have the limitation than qemu-5.1 doesn't know pc-5.2. So +-the possible cases are: +- +-5 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.1 -M pc-5.1 +- +- This migration is known as newer to older. 
We need to make sure +- when we are developing 5.2 we need to take care about not to break +- migration to qemu-5.1. Notice that we can't make updates to +- qemu-5.1 to understand whatever qemu-5.2 decides to change, so it is +- in qemu-5.2 side to make the relevant changes. +- +-6 - qemu-5.1 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1 +- +- This migration is known as older to newer. We need to make sure +- than we are able to receive migrations from qemu-5.1. The problem is +- similar to the previous one. +- +-If qemu-5.1 and qemu-5.2 were the same, there will not be any +-compatibility problems. But the reason that we create qemu-5.2 is to +-get new features, devices, defaults, etc. +- +-If we get a device that has a new feature, or change a default value, +-we have a problem when we try to migrate between different QEMU +-versions. +- +-So we need a way to tell qemu-5.2 that when we are using machine type +-pc-5.1, it needs to **not** use the feature, to be able to migrate to +-real qemu-5.1. +- +-And the equivalent part when migrating from qemu-5.1 to qemu-5.2. +-qemu-5.2 has to expect that it is not going to get data for the new +-feature, because qemu-5.1 doesn't know about it. +- +-How do we tell QEMU about these device feature changes? In +-hw/core/machine.c:hw_compat_X_Y arrays. +- +-If we change a default value, we need to put back the old value on +-that array. And the device, during initialization needs to look at +-that array to see what value it needs to get for that feature. And +-what are we going to put in that array, the value of a property. +- +-To create a property for a device, we need to use one of the +-DEFINE_PROP_*() macros. See include/hw/qdev-properties.h to find the +-macros that exist. With it, we set the default value for that +-property, and that is what it is going to get in the latest released +-version. But if we want a different value for a previous version, we +-can change that in the hw_compat_X_Y arrays. +- +-hw_compat_X_Y is an array of registers that have the format: +- +-- name_device +-- name_property +-- value +- +-Let's see a practical example. +- +-In qemu-5.2 virtio-blk-device got multi queue support. This is a +-change that is not backward compatible. In qemu-5.1 it has one +-queue. In qemu-5.2 it has the same number of queues as the number of +-cpus in the system. +- +-When we are doing migration, if we migrate from a device that has 4 +-queues to a device that have only one queue, we don't know where to +-put the extra information for the other 3 queues, and we fail +-migration. +- +-Similar problem when we migrate from qemu-5.1 that has only one queue +-to qemu-5.2, we only sent information for one queue, but destination +-has 4, and we have 3 queues that are not properly initialized and +-anything can happen. +- +-So, how can we address this problem. Easy, just convince qemu-5.2 +-that when it is running pc-5.1, it needs to set the number of queues +-for virtio-blk-devices to 1. +- +-That way we fix the cases 5 and 6. +- +-5 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.1 -M pc-5.1 +- +- qemu-5.2 -M pc-5.1 sets number of queues to be 1. +- qemu-5.1 -M pc-5.1 expects number of queues to be 1. +- +- correct. migration works. +- +-6 - qemu-5.1 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1 +- +- qemu-5.1 -M pc-5.1 sets number of queues to be 1. +- qemu-5.2 -M pc-5.1 expects number of queues to be 1. +- +- correct. migration works. +- +-And now the other interesting case, case 3. 
In this case we have: +- +-3 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1 +- +- Here we have the same QEMU in both sides. So it doesn't matter a +- lot if we have set the number of queues to 1 or not, because +- they are the same. +- +- WRONG! +- +- Think what happens if we do one of this double migrations: +- +- A -> migrates -> B -> migrates -> C +- +- where: +- +- A: qemu-5.1 -M pc-5.1 +- B: qemu-5.2 -M pc-5.1 +- C: qemu-5.2 -M pc-5.1 +- +- migration A -> B is case 6, so number of queues needs to be 1. +- +- migration B -> C is case 3, so we don't care. But actually we +- care because we haven't started the guest in qemu-5.2, it came +- migrated from qemu-5.1. So to be in the safe place, we need to +- always use number of queues 1 when we are using pc-5.1. +- +-Now, how was this done in reality? The following commit shows how it +-was done:: +- +- commit 9445e1e15e66c19e42bea942ba810db28052cd05 +- Author: Stefan Hajnoczi +- Date: Tue Aug 18 15:33:47 2020 +0100 +- +- virtio-blk-pci: default num_queues to -smp N +- +-The relevant parts for migration are:: +- +- @@ -1281,7 +1284,8 @@ static Property virtio_blk_properties[] = { +- #endif +- DEFINE_PROP_BIT("request-merging", VirtIOBlock, conf.request_merging, 0, +- true), +- - DEFINE_PROP_UINT16("num-queues", VirtIOBlock, conf.num_queues, 1), +- + DEFINE_PROP_UINT16("num-queues", VirtIOBlock, conf.num_queues, +- + VIRTIO_BLK_AUTO_NUM_QUEUES), +- DEFINE_PROP_UINT16("queue-size", VirtIOBlock, conf.queue_size, 256), +- +-It changes the default value of num_queues. But it fishes it for old +-machine types to have the right value:: +- +- @@ -31,6 +31,7 @@ +- GlobalProperty hw_compat_5_1[] = { +- ... +- + { "virtio-blk-device", "num-queues", "1"}, +- ... +- }; +- +-A device with different features on both sides +----------------------------------------------- +- +-Let's assume that we are using the same QEMU binary on both sides, +-just to make the things easier. But we have a device that has +-different features on both sides of the migration. That can be +-because the devices are different, because the kernel driver of both +-devices have different features, whatever. +- +-How can we get this to work with migration. The way to do that is +-"theoretically" easy. You have to get the features that the device +-has in the source of the migration. The features that the device has +-on the target of the migration, you get the intersection of the +-features of both sides, and that is the way that you should launch +-QEMU. +- +-Notice that this is not completely related to QEMU. The most +-important thing here is that this should be handled by the managing +-application that launches QEMU. If QEMU is configured correctly, the +-migration will succeed. +- +-That said, actually doing it is complicated. Almost all devices are +-bad at being able to be launched with only some features enabled. +-With one big exception: cpus. +- +-You can read the documentation for QEMU x86 cpu models here: +- +-https://qemu-project.gitlab.io/qemu/system/qemu-cpu-models.html +- +-See when they talk about migration they recommend that one chooses the +-newest cpu model that is supported for all cpus. +- +-Let's say that we have: +- +-Host A: +- +-Device X has the feature Y +- +-Host B: +- +-Device X has not the feature Y +- +-If we try to migrate without any care from host A to host B, it will +-fail because when migration tries to load the feature Y on +-destination, it will find that the hardware is not there. 
+- +-Doing this would be the equivalent of doing with cpus: +- +-Host A: +- +-$ qemu-system-x86_64 -cpu host +- +-Host B: +- +-$ qemu-system-x86_64 -cpu host +- +-When both hosts have different cpu features this is guaranteed to +-fail. Especially if Host B has less features than host A. If host A +-has less features than host B, sometimes it works. Important word of +-last sentence is "sometimes". +- +-So, forgetting about cpu models and continuing with the -cpu host +-example, let's see that the differences of the cpus is that Host A and +-B have the following features: +- +-Features: 'pcid' 'stibp' 'taa-no' +-Host A: X X +-Host B: X +- +-And we want to migrate between them, the way configure both QEMU cpu +-will be: +- +-Host A: +- +-$ qemu-system-x86_64 -cpu host,pcid=off,stibp=off +- +-Host B: +- +-$ qemu-system-x86_64 -cpu host,taa-no=off +- +-And you would be able to migrate between them. It is responsibility +-of the management application or of the user to make sure that the +-configuration is correct. QEMU doesn't know how to look at this kind +-of features in general. +- +-Notice that we don't recommend to use -cpu host for migration. It is +-used in this example because it makes the example simpler. +- +-Other devices have worse control about individual features. If they +-want to be able to migrate between hosts that show different features, +-the device needs a way to configure which ones it is going to use. +- +-In this section we have considered that we are using the same QEMU +-binary in both sides of the migration. If we use different QEMU +-versions process, then we need to have into account all other +-differences and the examples become even more complicated. +- +-How to mitigate when we have a backward compatibility error +------------------------------------------------------------ +- +-We broke migration for old machine types continuously during +-development. But as soon as we find that there is a problem, we fix +-it. The problem is what happens when we detect after we have done a +-release that something has gone wrong. +- +-Let see how it worked with one example. +- +-After the release of qemu-8.0 we found a problem when doing migration +-of the machine type pc-7.2. +- +-- $ qemu-7.2 -M pc-7.2 -> qemu-7.2 -M pc-7.2 +- +- This migration works +- +-- $ qemu-8.0 -M pc-7.2 -> qemu-8.0 -M pc-7.2 +- +- This migration works +- +-- $ qemu-8.0 -M pc-7.2 -> qemu-7.2 -M pc-7.2 +- +- This migration fails +- +-- $ qemu-7.2 -M pc-7.2 -> qemu-8.0 -M pc-7.2 +- +- This migration fails +- +-So clearly something fails when migration between qemu-7.2 and +-qemu-8.0 with machine type pc-7.2. The error messages, and git bisect +-pointed to this commit. +- +-In qemu-8.0 we got this commit:: +- +- commit 010746ae1db7f52700cb2e2c46eb94f299cfa0d2 +- Author: Jonathan Cameron +- Date: Thu Mar 2 13:37:02 2023 +0000 +- +- hw/pci/aer: Implement PCI_ERR_UNCOR_MASK register +- +- +-The relevant bits of the commit for our example are this ones:: +- +- --- a/hw/pci/pcie_aer.c +- +++ b/hw/pci/pcie_aer.c +- @@ -112,6 +112,10 @@ int pcie_aer_init(PCIDevice *dev, +- +- pci_set_long(dev->w1cmask + offset + PCI_ERR_UNCOR_STATUS, +- PCI_ERR_UNC_SUPPORTED); +- + pci_set_long(dev->config + offset + PCI_ERR_UNCOR_MASK, +- + PCI_ERR_UNC_MASK_DEFAULT); +- + pci_set_long(dev->wmask + offset + PCI_ERR_UNCOR_MASK, +- + PCI_ERR_UNC_SUPPORTED); +- +- pci_set_long(dev->config + offset + PCI_ERR_UNCOR_SEVER, +- PCI_ERR_UNC_SEVERITY_DEFAULT); +- +-The patch changes how we configure PCI space for AER. 
But QEMU fails +-when the PCI space configuration is different between source and +-destination. +- +-The following commit shows how this got fixed:: +- +- commit 5ed3dabe57dd9f4c007404345e5f5bf0e347317f +- Author: Leonardo Bras +- Date: Tue May 2 21:27:02 2023 -0300 +- +- hw/pci: Disable PCI_ERR_UNCOR_MASK register for machine type < 8.0 +- +- [...] +- +-The relevant parts of the fix in QEMU are as follow: +- +-First, we create a new property for the device to be able to configure +-the old behaviour or the new behaviour:: +- +- diff --git a/hw/pci/pci.c b/hw/pci/pci.c +- index 8a87ccc8b0..5153ad63d6 100644 +- --- a/hw/pci/pci.c +- +++ b/hw/pci/pci.c +- @@ -79,6 +79,8 @@ static Property pci_props[] = { +- DEFINE_PROP_STRING("failover_pair_id", PCIDevice, +- failover_pair_id), +- DEFINE_PROP_UINT32("acpi-index", PCIDevice, acpi_index, 0), +- + DEFINE_PROP_BIT("x-pcie-err-unc-mask", PCIDevice, cap_present, +- + QEMU_PCIE_ERR_UNC_MASK_BITNR, true), +- DEFINE_PROP_END_OF_LIST() +- }; +- +-Notice that we enable the feature for new machine types. +- +-Now we see how the fix is done. This is going to depend on what kind +-of breakage happens, but in this case it is quite simple:: +- +- diff --git a/hw/pci/pcie_aer.c b/hw/pci/pcie_aer.c +- index 103667c368..374d593ead 100644 +- --- a/hw/pci/pcie_aer.c +- +++ b/hw/pci/pcie_aer.c +- @@ -112,10 +112,13 @@ int pcie_aer_init(PCIDevice *dev, uint8_t cap_ver, +- uint16_t offset, +- +- pci_set_long(dev->w1cmask + offset + PCI_ERR_UNCOR_STATUS, +- PCI_ERR_UNC_SUPPORTED); +- - pci_set_long(dev->config + offset + PCI_ERR_UNCOR_MASK, +- - PCI_ERR_UNC_MASK_DEFAULT); +- - pci_set_long(dev->wmask + offset + PCI_ERR_UNCOR_MASK, +- - PCI_ERR_UNC_SUPPORTED); +- + +- + if (dev->cap_present & QEMU_PCIE_ERR_UNC_MASK) { +- + pci_set_long(dev->config + offset + PCI_ERR_UNCOR_MASK, +- + PCI_ERR_UNC_MASK_DEFAULT); +- + pci_set_long(dev->wmask + offset + PCI_ERR_UNCOR_MASK, +- + PCI_ERR_UNC_SUPPORTED); +- + } +- +- pci_set_long(dev->config + offset + PCI_ERR_UNCOR_SEVER, +- PCI_ERR_UNC_SEVERITY_DEFAULT); +- +-I.e. If the property bit is enabled, we configure it as we did for +-qemu-8.0. If the property bit is not set, we configure it as it was in 7.2. +- +-And now, everything that is missing is disabling the feature for old +-machine types:: +- +- diff --git a/hw/core/machine.c b/hw/core/machine.c +- index 47a34841a5..07f763eb2e 100644 +- --- a/hw/core/machine.c +- +++ b/hw/core/machine.c +- @@ -48,6 +48,7 @@ GlobalProperty hw_compat_7_2[] = { +- { "e1000e", "migrate-timadj", "off" }, +- { "virtio-mem", "x-early-migration", "false" }, +- { "migration", "x-preempt-pre-7-2", "true" }, +- + { TYPE_PCI_DEVICE, "x-pcie-err-unc-mask", "off" }, +- }; +- const size_t hw_compat_7_2_len = G_N_ELEMENTS(hw_compat_7_2); +- +-And now, when qemu-8.0.1 is released with this fix, all combinations +-are going to work as supposed. +- +-- $ qemu-7.2 -M pc-7.2 -> qemu-7.2 -M pc-7.2 (works) +-- $ qemu-8.0.1 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 (works) +-- $ qemu-8.0.1 -M pc-7.2 -> qemu-7.2 -M pc-7.2 (works) +-- $ qemu-7.2 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 (works) +- +-So the normality has been restored and everything is ok, no? +- +-Not really, now our matrix is much bigger. We started with the easy +-cases, migration from the same version to the same version always +-works: +- +-- $ qemu-7.2 -M pc-7.2 -> qemu-7.2 -M pc-7.2 +-- $ qemu-8.0 -M pc-7.2 -> qemu-8.0 -M pc-7.2 +-- $ qemu-8.0.1 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 +- +-Now the interesting ones. 
When the QEMU processes versions are +-different. For the 1st set, their fail and we can do nothing, both +-versions are released and we can't change anything. +- +-- $ qemu-7.2 -M pc-7.2 -> qemu-8.0 -M pc-7.2 +-- $ qemu-8.0 -M pc-7.2 -> qemu-7.2 -M pc-7.2 +- +-This two are the ones that work. The whole point of making the +-change in qemu-8.0.1 release was to fix this issue: +- +-- $ qemu-7.2 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 +-- $ qemu-8.0.1 -M pc-7.2 -> qemu-7.2 -M pc-7.2 +- +-But now we found that qemu-8.0 neither can migrate to qemu-7.2 not +-qemu-8.0.1. +- +-- $ qemu-8.0 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 +-- $ qemu-8.0.1 -M pc-7.2 -> qemu-8.0 -M pc-7.2 +- +-So, if we start a pc-7.2 machine in qemu-8.0 we can't migrate it to +-anything except to qemu-8.0. +- +-Can we do better? +- +-Yeap. If we know that we are going to do this migration: +- +-- $ qemu-8.0 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 +- +-We can launch the appropriate devices with:: +- +- --device...,x-pci-e-err-unc-mask=on +- +-And now we can receive a migration from 8.0. And from now on, we can +-do that migration to new machine types if we remember to enable that +-property for pc-7.2. Notice that we need to remember, it is not +-enough to know that the source of the migration is qemu-8.0. Think of +-this example: +- +-$ qemu-8.0 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 -> qemu-8.2 -M pc-7.2 +- +-In the second migration, the source is not qemu-8.0, but we still have +-that "problem" and have that property enabled. Notice that we need to +-continue having this mark/property until we have this machine +-rebooted. But it is not a normal reboot (that don't reload QEMU) we +-need the machine to poweroff/poweron on a fixed QEMU. And from now +-on we can use the proper real machine. +-- +2.43.0 + diff --git a/0373-docs-migration-split-debugging-and-firmware.patch b/0373-docs-migration-split-debugging-and-firmware.patch new file mode 100644 index 0000000..e31ccaa --- /dev/null +++ b/0373-docs-migration-split-debugging-and-firmware.patch @@ -0,0 +1,151 @@ +From 4c7b89093ddc95842153f45251fc14b6a56ab597 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Tue, 9 Jan 2024 14:46:23 +0800 +Subject: [PATCH] docs/migration: Split "Debugging" and "Firmware" +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +commit 774ad6b53b9449223115ffa8851eb93de92b0ce7 upstream. + +Move the two sections into a separate file called "best-practices.rst". +Add the entry into index. + +Intel-SIG: commit 774ad6b53b94 docs/migration: Split "Debugging" and "Firmware" + +Reviewed-by: Cédric Le Goater +Link: https://lore.kernel.org/r/20240109064628.595453-6-peterx@redhat.com +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + docs/devel/migration/best-practices.rst | 48 +++++++++++++++++++++++++ + docs/devel/migration/index.rst | 1 + + docs/devel/migration/main.rst | 44 ----------------------- + 3 files changed, 49 insertions(+), 44 deletions(-) + create mode 100644 docs/devel/migration/best-practices.rst + +diff --git a/docs/devel/migration/best-practices.rst b/docs/devel/migration/best-practices.rst +new file mode 100644 +index 0000000000..d7c34a3014 +--- /dev/null ++++ b/docs/devel/migration/best-practices.rst +@@ -0,0 +1,48 @@ ++============== ++Best practices ++============== ++ ++Debugging ++========= ++ ++The migration stream can be analyzed thanks to ``scripts/analyze-migration.py``. ++ ++Example usage: ++ ++.. 
code-block:: shell ++ ++ $ qemu-system-x86_64 -display none -monitor stdio ++ (qemu) migrate "exec:cat > mig" ++ (qemu) q ++ $ ./scripts/analyze-migration.py -f mig ++ { ++ "ram (3)": { ++ "section sizes": { ++ "pc.ram": "0x0000000008000000", ++ ... ++ ++See also ``analyze-migration.py -h`` help for more options. ++ ++Firmware ++======== ++ ++Migration migrates the copies of RAM and ROM, and thus when running ++on the destination it includes the firmware from the source. Even after ++resetting a VM, the old firmware is used. Only once QEMU has been restarted ++is the new firmware in use. ++ ++- Changes in firmware size can cause changes in the required RAMBlock size ++ to hold the firmware and thus migration can fail. In practice it's best ++ to pad firmware images to convenient powers of 2 with plenty of space ++ for growth. ++ ++- Care should be taken with device emulation code so that newer ++ emulation code can work with older firmware to allow forward migration. ++ ++- Care should be taken with newer firmware so that backward migration ++ to older systems with older device emulation code will work. ++ ++In some cases it may be best to tie specific firmware versions to specific ++versioned machine types to cut down on the combinations that will need ++support. This is also useful when newer versions of firmware outgrow ++the padding. +diff --git a/docs/devel/migration/index.rst b/docs/devel/migration/index.rst +index 7fc02b9520..9a8fd1ead7 100644 +--- a/docs/devel/migration/index.rst ++++ b/docs/devel/migration/index.rst +@@ -11,3 +11,4 @@ QEMU live migration works. + compatibility + vfio + virtio ++ best-practices +diff --git a/docs/devel/migration/main.rst b/docs/devel/migration/main.rst +index 04194414af..7ca3b4dd3f 100644 +--- a/docs/devel/migration/main.rst ++++ b/docs/devel/migration/main.rst +@@ -52,27 +52,6 @@ All these migration protocols use the same infrastructure to + save/restore state devices. This infrastructure is shared with the + savevm/loadvm functionality. + +-Debugging +-========= +- +-The migration stream can be analyzed thanks to ``scripts/analyze-migration.py``. +- +-Example usage: +- +-.. code-block:: shell +- +- $ qemu-system-x86_64 -display none -monitor stdio +- (qemu) migrate "exec:cat > mig" +- (qemu) q +- $ ./scripts/analyze-migration.py -f mig +- { +- "ram (3)": { +- "section sizes": { +- "pc.ram": "0x0000000008000000", +- ... +- +-See also ``analyze-migration.py -h`` help for more options. +- + Common infrastructure + ===================== + +@@ -970,26 +949,3 @@ the background migration channel. Anyone who cares about latencies of page + faults during a postcopy migration should enable this feature. By default, + it's not enabled. + +-Firmware +-======== +- +-Migration migrates the copies of RAM and ROM, and thus when running +-on the destination it includes the firmware from the source. Even after +-resetting a VM, the old firmware is used. Only once QEMU has been restarted +-is the new firmware in use. +- +-- Changes in firmware size can cause changes in the required RAMBlock size +- to hold the firmware and thus migration can fail. In practice it's best +- to pad firmware images to convenient powers of 2 with plenty of space +- for growth. +- +-- Care should be taken with device emulation code so that newer +- emulation code can work with older firmware to allow forward migration. +- +-- Care should be taken with newer firmware so that backward migration +- to older systems with older device emulation code will work. 
+- +-In some cases it may be best to tie specific firmware versions to specific +-versioned machine types to cut down on the combinations that will need +-support. This is also useful when newer versions of firmware outgrow +-the padding. +-- +2.43.0 + diff --git a/0374-docs-migration-split-postcopy.patch b/0374-docs-migration-split-postcopy.patch new file mode 100644 index 0000000..dd84417 --- /dev/null +++ b/0374-docs-migration-split-postcopy.patch @@ -0,0 +1,681 @@ +From 84decefd3fecc3b226ba339f42c91d5e57e86b24 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Tue, 9 Jan 2024 14:46:24 +0800 +Subject: [PATCH] docs/migration: Split "Postcopy" +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +commit bfb4c7cd99f1c39dedf33381954d03b9f8f244ec upstream. + +Split postcopy into a separate file. Introduce a head page "features.rst" +to keep all the features on top of migration framework. + +Intel-SIG: commit bfb4c7cd99f1 docs/migration: Split "Postcopy" + +Reviewed-by: Cédric Le Goater +Link: https://lore.kernel.org/r/20240109064628.595453-7-peterx@redhat.com +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + docs/devel/migration/features.rst | 9 + + docs/devel/migration/index.rst | 1 + + docs/devel/migration/main.rst | 305 ------------------------------ + docs/devel/migration/postcopy.rst | 304 +++++++++++++++++++++++++++++ + 4 files changed, 314 insertions(+), 305 deletions(-) + create mode 100644 docs/devel/migration/features.rst + create mode 100644 docs/devel/migration/postcopy.rst + +diff --git a/docs/devel/migration/features.rst b/docs/devel/migration/features.rst +new file mode 100644 +index 0000000000..0054e0c900 +--- /dev/null ++++ b/docs/devel/migration/features.rst +@@ -0,0 +1,9 @@ ++Migration features ++================== ++ ++Migration has plenty of features to support different use cases. ++ ++.. toctree:: ++ :maxdepth: 2 ++ ++ postcopy +diff --git a/docs/devel/migration/index.rst b/docs/devel/migration/index.rst +index 9a8fd1ead7..21ad58b189 100644 +--- a/docs/devel/migration/index.rst ++++ b/docs/devel/migration/index.rst +@@ -8,6 +8,7 @@ QEMU live migration works. + :maxdepth: 2 + + main ++ features + compatibility + vfio + virtio +diff --git a/docs/devel/migration/main.rst b/docs/devel/migration/main.rst +index 7ca3b4dd3f..1e98e9e40c 100644 +--- a/docs/devel/migration/main.rst ++++ b/docs/devel/migration/main.rst +@@ -644,308 +644,3 @@ algorithm will restrict virtual CPUs as needed to keep their dirty page + rate inside the limit. This leads to more steady reading performance during + live migration and can aid in improving large guest responsiveness. + +-Postcopy +-======== +- +-'Postcopy' migration is a way to deal with migrations that refuse to converge +-(or take too long to converge) its plus side is that there is an upper bound on +-the amount of migration traffic and time it takes, the down side is that during +-the postcopy phase, a failure of *either* side causes the guest to be lost. +- +-In postcopy the destination CPUs are started before all the memory has been +-transferred, and accesses to pages that are yet to be transferred cause +-a fault that's translated by QEMU into a request to the source QEMU. +- +-Postcopy can be combined with precopy (i.e. normal migration) so that if precopy +-doesn't finish in a given time the switch is made to postcopy. 
+- +-Enabling postcopy +------------------ +- +-To enable postcopy, issue this command on the monitor (both source and +-destination) prior to the start of migration: +- +-``migrate_set_capability postcopy-ram on`` +- +-The normal commands are then used to start a migration, which is still +-started in precopy mode. Issuing: +- +-``migrate_start_postcopy`` +- +-will now cause the transition from precopy to postcopy. +-It can be issued immediately after migration is started or any +-time later on. Issuing it after the end of a migration is harmless. +- +-Blocktime is a postcopy live migration metric, intended to show how +-long the vCPU was in state of interruptible sleep due to pagefault. +-That metric is calculated both for all vCPUs as overlapped value, and +-separately for each vCPU. These values are calculated on destination +-side. To enable postcopy blocktime calculation, enter following +-command on destination monitor: +- +-``migrate_set_capability postcopy-blocktime on`` +- +-Postcopy blocktime can be retrieved by query-migrate qmp command. +-postcopy-blocktime value of qmp command will show overlapped blocking +-time for all vCPU, postcopy-vcpu-blocktime will show list of blocking +-time per vCPU. +- +-.. note:: +- During the postcopy phase, the bandwidth limits set using +- ``migrate_set_parameter`` is ignored (to avoid delaying requested pages that +- the destination is waiting for). +- +-Postcopy device transfer +------------------------- +- +-Loading of device data may cause the device emulation to access guest RAM +-that may trigger faults that have to be resolved by the source, as such +-the migration stream has to be able to respond with page data *during* the +-device load, and hence the device data has to be read from the stream completely +-before the device load begins to free the stream up. This is achieved by +-'packaging' the device data into a blob that's read in one go. +- +-Source behaviour +----------------- +- +-Until postcopy is entered the migration stream is identical to normal +-precopy, except for the addition of a 'postcopy advise' command at +-the beginning, to tell the destination that postcopy might happen. +-When postcopy starts the source sends the page discard data and then +-forms the 'package' containing: +- +- - Command: 'postcopy listen' +- - The device state +- +- A series of sections, identical to the precopy streams device state stream +- containing everything except postcopiable devices (i.e. RAM) +- - Command: 'postcopy run' +- +-The 'package' is sent as the data part of a Command: ``CMD_PACKAGED``, and the +-contents are formatted in the same way as the main migration stream. +- +-During postcopy the source scans the list of dirty pages and sends them +-to the destination without being requested (in much the same way as precopy), +-however when a page request is received from the destination, the dirty page +-scanning restarts from the requested location. This causes requested pages +-to be sent quickly, and also causes pages directly after the requested page +-to be sent quickly in the hope that those pages are likely to be used +-by the destination soon. +- +-Destination behaviour +---------------------- +- +-Initially the destination looks the same as precopy, with a single thread +-reading the migration stream; the 'postcopy advise' and 'discard' commands +-are processed to change the way RAM is managed, but don't affect the stream +-processing. 
+- +-:: +- +- ------------------------------------------------------------------------------ +- 1 2 3 4 5 6 7 +- main -----DISCARD-CMD_PACKAGED ( LISTEN DEVICE DEVICE DEVICE RUN ) +- thread | | +- | (page request) +- | \___ +- v \ +- listen thread: --- page -- page -- page -- page -- page -- +- +- a b c +- ------------------------------------------------------------------------------ +- +-- On receipt of ``CMD_PACKAGED`` (1) +- +- All the data associated with the package - the ( ... ) section in the diagram - +- is read into memory, and the main thread recurses into qemu_loadvm_state_main +- to process the contents of the package (2) which contains commands (3,6) and +- devices (4...) +- +-- On receipt of 'postcopy listen' - 3 -(i.e. the 1st command in the package) +- +- a new thread (a) is started that takes over servicing the migration stream, +- while the main thread carries on loading the package. It loads normal +- background page data (b) but if during a device load a fault happens (5) +- the returned page (c) is loaded by the listen thread allowing the main +- threads device load to carry on. +- +-- The last thing in the ``CMD_PACKAGED`` is a 'RUN' command (6) +- +- letting the destination CPUs start running. At the end of the +- ``CMD_PACKAGED`` (7) the main thread returns to normal running behaviour and +- is no longer used by migration, while the listen thread carries on servicing +- page data until the end of migration. +- +-Postcopy Recovery +------------------ +- +-Comparing to precopy, postcopy is special on error handlings. When any +-error happens (in this case, mostly network errors), QEMU cannot easily +-fail a migration because VM data resides in both source and destination +-QEMU instances. On the other hand, when issue happens QEMU on both sides +-will go into a paused state. It'll need a recovery phase to continue a +-paused postcopy migration. +- +-The recovery phase normally contains a few steps: +- +- - When network issue occurs, both QEMU will go into PAUSED state +- +- - When the network is recovered (or a new network is provided), the admin +- can setup the new channel for migration using QMP command +- 'migrate-recover' on destination node, preparing for a resume. +- +- - On source host, the admin can continue the interrupted postcopy +- migration using QMP command 'migrate' with resume=true flag set. +- +- - After the connection is re-established, QEMU will continue the postcopy +- migration on both sides. +- +-During a paused postcopy migration, the VM can logically still continue +-running, and it will not be impacted from any page access to pages that +-were already migrated to destination VM before the interruption happens. +-However, if any of the missing pages got accessed on destination VM, the VM +-thread will be halted waiting for the page to be migrated, it means it can +-be halted until the recovery is complete. +- +-The impact of accessing missing pages can be relevant to different +-configurations of the guest. For example, when with async page fault +-enabled, logically the guest can proactively schedule out the threads +-accessing missing pages. 
+- +-Postcopy states +---------------- +- +-Postcopy moves through a series of states (see postcopy_state) from +-ADVISE->DISCARD->LISTEN->RUNNING->END +- +- - Advise +- +- Set at the start of migration if postcopy is enabled, even +- if it hasn't had the start command; here the destination +- checks that its OS has the support needed for postcopy, and performs +- setup to ensure the RAM mappings are suitable for later postcopy. +- The destination will fail early in migration at this point if the +- required OS support is not present. +- (Triggered by reception of POSTCOPY_ADVISE command) +- +- - Discard +- +- Entered on receipt of the first 'discard' command; prior to +- the first Discard being performed, hugepages are switched off +- (using madvise) to ensure that no new huge pages are created +- during the postcopy phase, and to cause any huge pages that +- have discards on them to be broken. +- +- - Listen +- +- The first command in the package, POSTCOPY_LISTEN, switches +- the destination state to Listen, and starts a new thread +- (the 'listen thread') which takes over the job of receiving +- pages off the migration stream, while the main thread carries +- on processing the blob. With this thread able to process page +- reception, the destination now 'sensitises' the RAM to detect +- any access to missing pages (on Linux using the 'userfault' +- system). +- +- - Running +- +- POSTCOPY_RUN causes the destination to synchronise all +- state and start the CPUs and IO devices running. The main +- thread now finishes processing the migration package and +- now carries on as it would for normal precopy migration +- (although it can't do the cleanup it would do as it +- finishes a normal migration). +- +- - Paused +- +- Postcopy can run into a paused state (normally on both sides when +- happens), where all threads will be temporarily halted mostly due to +- network errors. When reaching paused state, migration will make sure +- the qemu binary on both sides maintain the data without corrupting +- the VM. To continue the migration, the admin needs to fix the +- migration channel using the QMP command 'migrate-recover' on the +- destination node, then resume the migration using QMP command 'migrate' +- again on source node, with resume=true flag set. +- +- - End +- +- The listen thread can now quit, and perform the cleanup of migration +- state, the migration is now complete. +- +-Source side page map +--------------------- +- +-The 'migration bitmap' in postcopy is basically the same as in the precopy, +-where each of the bit to indicate that page is 'dirty' - i.e. needs +-sending. During the precopy phase this is updated as the CPU dirties +-pages, however during postcopy the CPUs are stopped and nothing should +-dirty anything any more. Instead, dirty bits are cleared when the relevant +-pages are sent during postcopy. +- +-Postcopy with hugepages +------------------------ +- +-Postcopy now works with hugetlbfs backed memory: +- +- a) The linux kernel on the destination must support userfault on hugepages. +- b) The huge-page configuration on the source and destination VMs must be +- identical; i.e. RAMBlocks on both sides must use the same page size. +- c) Note that ``-mem-path /dev/hugepages`` will fall back to allocating normal +- RAM if it doesn't have enough hugepages, triggering (b) to fail. +- Using ``-mem-prealloc`` enforces the allocation using hugepages. 
+- d) Care should be taken with the size of hugepage used; postcopy with 2MB +- hugepages works well, however 1GB hugepages are likely to be problematic +- since it takes ~1 second to transfer a 1GB hugepage across a 10Gbps link, +- and until the full page is transferred the destination thread is blocked. +- +-Postcopy with shared memory +---------------------------- +- +-Postcopy migration with shared memory needs explicit support from the other +-processes that share memory and from QEMU. There are restrictions on the type of +-memory that userfault can support shared. +- +-The Linux kernel userfault support works on ``/dev/shm`` memory and on ``hugetlbfs`` +-(although the kernel doesn't provide an equivalent to ``madvise(MADV_DONTNEED)`` +-for hugetlbfs which may be a problem in some configurations). +- +-The vhost-user code in QEMU supports clients that have Postcopy support, +-and the ``vhost-user-bridge`` (in ``tests/``) and the DPDK package have changes +-to support postcopy. +- +-The client needs to open a userfaultfd and register the areas +-of memory that it maps with userfault. The client must then pass the +-userfaultfd back to QEMU together with a mapping table that allows +-fault addresses in the clients address space to be converted back to +-RAMBlock/offsets. The client's userfaultfd is added to the postcopy +-fault-thread and page requests are made on behalf of the client by QEMU. +-QEMU performs 'wake' operations on the client's userfaultfd to allow it +-to continue after a page has arrived. +- +-.. note:: +- There are two future improvements that would be nice: +- a) Some way to make QEMU ignorant of the addresses in the clients +- address space +- b) Avoiding the need for QEMU to perform ufd-wake calls after the +- pages have arrived +- +-Retro-fitting postcopy to existing clients is possible: +- a) A mechanism is needed for the registration with userfault as above, +- and the registration needs to be coordinated with the phases of +- postcopy. In vhost-user extra messages are added to the existing +- control channel. +- b) Any thread that can block due to guest memory accesses must be +- identified and the implication understood; for example if the +- guest memory access is made while holding a lock then all other +- threads waiting for that lock will also be blocked. +- +-Postcopy Preemption Mode +------------------------- +- +-Postcopy preempt is a new capability introduced in 8.0 QEMU release, it +-allows urgent pages (those got page fault requested from destination QEMU +-explicitly) to be sent in a separate preempt channel, rather than queued in +-the background migration channel. Anyone who cares about latencies of page +-faults during a postcopy migration should enable this feature. By default, +-it's not enabled. +- +diff --git a/docs/devel/migration/postcopy.rst b/docs/devel/migration/postcopy.rst +new file mode 100644 +index 0000000000..d60eec06ab +--- /dev/null ++++ b/docs/devel/migration/postcopy.rst +@@ -0,0 +1,304 @@ ++Postcopy ++======== ++ ++'Postcopy' migration is a way to deal with migrations that refuse to converge ++(or take too long to converge) its plus side is that there is an upper bound on ++the amount of migration traffic and time it takes, the down side is that during ++the postcopy phase, a failure of *either* side causes the guest to be lost. 
++ ++In postcopy the destination CPUs are started before all the memory has been ++transferred, and accesses to pages that are yet to be transferred cause ++a fault that's translated by QEMU into a request to the source QEMU. ++ ++Postcopy can be combined with precopy (i.e. normal migration) so that if precopy ++doesn't finish in a given time the switch is made to postcopy. ++ ++Enabling postcopy ++----------------- ++ ++To enable postcopy, issue this command on the monitor (both source and ++destination) prior to the start of migration: ++ ++``migrate_set_capability postcopy-ram on`` ++ ++The normal commands are then used to start a migration, which is still ++started in precopy mode. Issuing: ++ ++``migrate_start_postcopy`` ++ ++will now cause the transition from precopy to postcopy. ++It can be issued immediately after migration is started or any ++time later on. Issuing it after the end of a migration is harmless. ++ ++Blocktime is a postcopy live migration metric, intended to show how ++long the vCPU was in state of interruptible sleep due to pagefault. ++That metric is calculated both for all vCPUs as overlapped value, and ++separately for each vCPU. These values are calculated on destination ++side. To enable postcopy blocktime calculation, enter following ++command on destination monitor: ++ ++``migrate_set_capability postcopy-blocktime on`` ++ ++Postcopy blocktime can be retrieved by query-migrate qmp command. ++postcopy-blocktime value of qmp command will show overlapped blocking ++time for all vCPU, postcopy-vcpu-blocktime will show list of blocking ++time per vCPU. ++ ++.. note:: ++ During the postcopy phase, the bandwidth limits set using ++ ``migrate_set_parameter`` is ignored (to avoid delaying requested pages that ++ the destination is waiting for). ++ ++Postcopy device transfer ++------------------------ ++ ++Loading of device data may cause the device emulation to access guest RAM ++that may trigger faults that have to be resolved by the source, as such ++the migration stream has to be able to respond with page data *during* the ++device load, and hence the device data has to be read from the stream completely ++before the device load begins to free the stream up. This is achieved by ++'packaging' the device data into a blob that's read in one go. ++ ++Source behaviour ++---------------- ++ ++Until postcopy is entered the migration stream is identical to normal ++precopy, except for the addition of a 'postcopy advise' command at ++the beginning, to tell the destination that postcopy might happen. ++When postcopy starts the source sends the page discard data and then ++forms the 'package' containing: ++ ++ - Command: 'postcopy listen' ++ - The device state ++ ++ A series of sections, identical to the precopy streams device state stream ++ containing everything except postcopiable devices (i.e. RAM) ++ - Command: 'postcopy run' ++ ++The 'package' is sent as the data part of a Command: ``CMD_PACKAGED``, and the ++contents are formatted in the same way as the main migration stream. ++ ++During postcopy the source scans the list of dirty pages and sends them ++to the destination without being requested (in much the same way as precopy), ++however when a page request is received from the destination, the dirty page ++scanning restarts from the requested location. This causes requested pages ++to be sent quickly, and also causes pages directly after the requested page ++to be sent quickly in the hope that those pages are likely to be used ++by the destination soon. 
++ ++Destination behaviour ++--------------------- ++ ++Initially the destination looks the same as precopy, with a single thread ++reading the migration stream; the 'postcopy advise' and 'discard' commands ++are processed to change the way RAM is managed, but don't affect the stream ++processing. ++ ++:: ++ ++ ------------------------------------------------------------------------------ ++ 1 2 3 4 5 6 7 ++ main -----DISCARD-CMD_PACKAGED ( LISTEN DEVICE DEVICE DEVICE RUN ) ++ thread | | ++ | (page request) ++ | \___ ++ v \ ++ listen thread: --- page -- page -- page -- page -- page -- ++ ++ a b c ++ ------------------------------------------------------------------------------ ++ ++- On receipt of ``CMD_PACKAGED`` (1) ++ ++ All the data associated with the package - the ( ... ) section in the diagram - ++ is read into memory, and the main thread recurses into qemu_loadvm_state_main ++ to process the contents of the package (2) which contains commands (3,6) and ++ devices (4...) ++ ++- On receipt of 'postcopy listen' - 3 -(i.e. the 1st command in the package) ++ ++ a new thread (a) is started that takes over servicing the migration stream, ++ while the main thread carries on loading the package. It loads normal ++ background page data (b) but if during a device load a fault happens (5) ++ the returned page (c) is loaded by the listen thread allowing the main ++ threads device load to carry on. ++ ++- The last thing in the ``CMD_PACKAGED`` is a 'RUN' command (6) ++ ++ letting the destination CPUs start running. At the end of the ++ ``CMD_PACKAGED`` (7) the main thread returns to normal running behaviour and ++ is no longer used by migration, while the listen thread carries on servicing ++ page data until the end of migration. ++ ++Postcopy Recovery ++----------------- ++ ++Comparing to precopy, postcopy is special on error handlings. When any ++error happens (in this case, mostly network errors), QEMU cannot easily ++fail a migration because VM data resides in both source and destination ++QEMU instances. On the other hand, when issue happens QEMU on both sides ++will go into a paused state. It'll need a recovery phase to continue a ++paused postcopy migration. ++ ++The recovery phase normally contains a few steps: ++ ++ - When network issue occurs, both QEMU will go into PAUSED state ++ ++ - When the network is recovered (or a new network is provided), the admin ++ can setup the new channel for migration using QMP command ++ 'migrate-recover' on destination node, preparing for a resume. ++ ++ - On source host, the admin can continue the interrupted postcopy ++ migration using QMP command 'migrate' with resume=true flag set. ++ ++ - After the connection is re-established, QEMU will continue the postcopy ++ migration on both sides. ++ ++During a paused postcopy migration, the VM can logically still continue ++running, and it will not be impacted from any page access to pages that ++were already migrated to destination VM before the interruption happens. ++However, if any of the missing pages got accessed on destination VM, the VM ++thread will be halted waiting for the page to be migrated, it means it can ++be halted until the recovery is complete. ++ ++The impact of accessing missing pages can be relevant to different ++configurations of the guest. For example, when with async page fault ++enabled, logically the guest can proactively schedule out the threads ++accessing missing pages. 
++ ++Postcopy states ++--------------- ++ ++Postcopy moves through a series of states (see postcopy_state) from ++ADVISE->DISCARD->LISTEN->RUNNING->END ++ ++ - Advise ++ ++ Set at the start of migration if postcopy is enabled, even ++ if it hasn't had the start command; here the destination ++ checks that its OS has the support needed for postcopy, and performs ++ setup to ensure the RAM mappings are suitable for later postcopy. ++ The destination will fail early in migration at this point if the ++ required OS support is not present. ++ (Triggered by reception of POSTCOPY_ADVISE command) ++ ++ - Discard ++ ++ Entered on receipt of the first 'discard' command; prior to ++ the first Discard being performed, hugepages are switched off ++ (using madvise) to ensure that no new huge pages are created ++ during the postcopy phase, and to cause any huge pages that ++ have discards on them to be broken. ++ ++ - Listen ++ ++ The first command in the package, POSTCOPY_LISTEN, switches ++ the destination state to Listen, and starts a new thread ++ (the 'listen thread') which takes over the job of receiving ++ pages off the migration stream, while the main thread carries ++ on processing the blob. With this thread able to process page ++ reception, the destination now 'sensitises' the RAM to detect ++ any access to missing pages (on Linux using the 'userfault' ++ system). ++ ++ - Running ++ ++ POSTCOPY_RUN causes the destination to synchronise all ++ state and start the CPUs and IO devices running. The main ++ thread now finishes processing the migration package and ++ now carries on as it would for normal precopy migration ++ (although it can't do the cleanup it would do as it ++ finishes a normal migration). ++ ++ - Paused ++ ++ Postcopy can run into a paused state (normally on both sides when ++ happens), where all threads will be temporarily halted mostly due to ++ network errors. When reaching paused state, migration will make sure ++ the qemu binary on both sides maintain the data without corrupting ++ the VM. To continue the migration, the admin needs to fix the ++ migration channel using the QMP command 'migrate-recover' on the ++ destination node, then resume the migration using QMP command 'migrate' ++ again on source node, with resume=true flag set. ++ ++ - End ++ ++ The listen thread can now quit, and perform the cleanup of migration ++ state, the migration is now complete. ++ ++Source side page map ++-------------------- ++ ++The 'migration bitmap' in postcopy is basically the same as in the precopy, ++where each of the bit to indicate that page is 'dirty' - i.e. needs ++sending. During the precopy phase this is updated as the CPU dirties ++pages, however during postcopy the CPUs are stopped and nothing should ++dirty anything any more. Instead, dirty bits are cleared when the relevant ++pages are sent during postcopy. ++ ++Postcopy with hugepages ++----------------------- ++ ++Postcopy now works with hugetlbfs backed memory: ++ ++ a) The linux kernel on the destination must support userfault on hugepages. ++ b) The huge-page configuration on the source and destination VMs must be ++ identical; i.e. RAMBlocks on both sides must use the same page size. ++ c) Note that ``-mem-path /dev/hugepages`` will fall back to allocating normal ++ RAM if it doesn't have enough hugepages, triggering (b) to fail. ++ Using ``-mem-prealloc`` enforces the allocation using hugepages. 
++ d) Care should be taken with the size of hugepage used; postcopy with 2MB ++ hugepages works well, however 1GB hugepages are likely to be problematic ++ since it takes ~1 second to transfer a 1GB hugepage across a 10Gbps link, ++ and until the full page is transferred the destination thread is blocked. ++ ++Postcopy with shared memory ++--------------------------- ++ ++Postcopy migration with shared memory needs explicit support from the other ++processes that share memory and from QEMU. There are restrictions on the type of ++memory that userfault can support shared. ++ ++The Linux kernel userfault support works on ``/dev/shm`` memory and on ``hugetlbfs`` ++(although the kernel doesn't provide an equivalent to ``madvise(MADV_DONTNEED)`` ++for hugetlbfs which may be a problem in some configurations). ++ ++The vhost-user code in QEMU supports clients that have Postcopy support, ++and the ``vhost-user-bridge`` (in ``tests/``) and the DPDK package have changes ++to support postcopy. ++ ++The client needs to open a userfaultfd and register the areas ++of memory that it maps with userfault. The client must then pass the ++userfaultfd back to QEMU together with a mapping table that allows ++fault addresses in the clients address space to be converted back to ++RAMBlock/offsets. The client's userfaultfd is added to the postcopy ++fault-thread and page requests are made on behalf of the client by QEMU. ++QEMU performs 'wake' operations on the client's userfaultfd to allow it ++to continue after a page has arrived. ++ ++.. note:: ++ There are two future improvements that would be nice: ++ a) Some way to make QEMU ignorant of the addresses in the clients ++ address space ++ b) Avoiding the need for QEMU to perform ufd-wake calls after the ++ pages have arrived ++ ++Retro-fitting postcopy to existing clients is possible: ++ a) A mechanism is needed for the registration with userfault as above, ++ and the registration needs to be coordinated with the phases of ++ postcopy. In vhost-user extra messages are added to the existing ++ control channel. ++ b) Any thread that can block due to guest memory accesses must be ++ identified and the implication understood; for example if the ++ guest memory access is made while holding a lock then all other ++ threads waiting for that lock will also be blocked. ++ ++Postcopy Preemption Mode ++------------------------ ++ ++Postcopy preempt is a new capability introduced in 8.0 QEMU release, it ++allows urgent pages (those got page fault requested from destination QEMU ++explicitly) to be sent in a separate preempt channel, rather than queued in ++the background migration channel. Anyone who cares about latencies of page ++faults during a postcopy migration should enable this feature. By default, ++it's not enabled. +-- +2.43.0 + diff --git a/0375-docs-migration-split-dirty-limit.patch b/0375-docs-migration-split-dirty-limit.patch new file mode 100644 index 0000000..c66217a --- /dev/null +++ b/0375-docs-migration-split-dirty-limit.patch @@ -0,0 +1,194 @@ +From 995b2226249e66175445fc9e577127b3ed896987 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Tue, 9 Jan 2024 14:46:25 +0800 +Subject: [PATCH] docs/migration: Split "dirty limit" +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +commit 4c6f8a79ae539eeb1f86af6522e4000edde3638b upstream. + +Split that into a separate file, put under "features". 
+ +Intel-SIG: commit 4c6f8a79ae53 docs/migration: Split "dirty limit" + +Cc: Yong Huang +Reviewed-by: Cédric Le Goater +Link: https://lore.kernel.org/r/20240109064628.595453-8-peterx@redhat.com +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + docs/devel/migration/dirty-limit.rst | 71 ++++++++++++++++++++++++++++ + docs/devel/migration/features.rst | 1 + + docs/devel/migration/main.rst | 71 ---------------------------- + 3 files changed, 72 insertions(+), 71 deletions(-) + create mode 100644 docs/devel/migration/dirty-limit.rst + +diff --git a/docs/devel/migration/dirty-limit.rst b/docs/devel/migration/dirty-limit.rst +new file mode 100644 +index 0000000000..8f32329d5f +--- /dev/null ++++ b/docs/devel/migration/dirty-limit.rst +@@ -0,0 +1,71 @@ ++Dirty limit ++=========== ++ ++The dirty limit, short for dirty page rate upper limit, is a new capability ++introduced in the 8.1 QEMU release that uses a new algorithm based on the KVM ++dirty ring to throttle down the guest during live migration. ++ ++The algorithm framework is as follows: ++ ++:: ++ ++ ------------------------------------------------------------------------------ ++ main --------------> throttle thread ------------> PREPARE(1) <-------- ++ thread \ | | ++ \ | | ++ \ V | ++ -\ CALCULATE(2) | ++ \ | | ++ \ | | ++ \ V | ++ \ SET PENALTY(3) ----- ++ -\ | ++ \ | ++ \ V ++ -> virtual CPU thread -------> ACCEPT PENALTY(4) ++ ------------------------------------------------------------------------------ ++ ++When the qmp command qmp_set_vcpu_dirty_limit is called for the first time, ++the QEMU main thread starts the throttle thread. The throttle thread, once ++launched, executes the loop, which consists of three steps: ++ ++ - PREPARE (1) ++ ++ The entire work of PREPARE (1) is preparation for the second stage, ++ CALCULATE(2), as the name implies. It involves preparing the dirty ++ page rate value and the corresponding upper limit of the VM: ++ The dirty page rate is calculated via the KVM dirty ring mechanism, ++ which tells QEMU how many dirty pages a virtual CPU has had since the ++ last KVM_EXIT_DIRTY_RING_FULL exception; The dirty page rate upper ++ limit is specified by caller, therefore fetch it directly. ++ ++ - CALCULATE (2) ++ ++ Calculate a suitable sleep period for each virtual CPU, which will be ++ used to determine the penalty for the target virtual CPU. The ++ computation must be done carefully in order to reduce the dirty page ++ rate progressively down to the upper limit without oscillation. To ++ achieve this, two strategies are provided: the first is to add or ++ subtract sleep time based on the ratio of the current dirty page rate ++ to the limit, which is used when the current dirty page rate is far ++ from the limit; the second is to add or subtract a fixed time when ++ the current dirty page rate is close to the limit. ++ ++ - SET PENALTY (3) ++ ++ Set the sleep time for each virtual CPU that should be penalized based ++ on the results of the calculation supplied by step CALCULATE (2). ++ ++After completing the three above stages, the throttle thread loops back ++to step PREPARE (1) until the dirty limit is reached. ++ ++On the other hand, each virtual CPU thread reads the sleep duration and ++sleeps in the path of the KVM_EXIT_DIRTY_RING_FULL exception handler, that ++is ACCEPT PENALTY (4). Virtual CPUs tied with writing processes will ++obviously exit to the path and get penalized, whereas virtual CPUs involved ++with read processes will not. 
++ ++In summary, thanks to the KVM dirty ring technology, the dirty limit ++algorithm will restrict virtual CPUs as needed to keep their dirty page ++rate inside the limit. This leads to more steady reading performance during ++live migration and can aid in improving large guest responsiveness. +diff --git a/docs/devel/migration/features.rst b/docs/devel/migration/features.rst +index 0054e0c900..e257d0d100 100644 +--- a/docs/devel/migration/features.rst ++++ b/docs/devel/migration/features.rst +@@ -7,3 +7,4 @@ Migration has plenty of features to support different use cases. + :maxdepth: 2 + + postcopy ++ dirty-limit +diff --git a/docs/devel/migration/main.rst b/docs/devel/migration/main.rst +index 1e98e9e40c..396c7c51ca 100644 +--- a/docs/devel/migration/main.rst ++++ b/docs/devel/migration/main.rst +@@ -573,74 +573,3 @@ path. + Return path - opened by main thread, written by main thread AND postcopy + thread (protected by rp_mutex) + +-Dirty limit +-===================== +-The dirty limit, short for dirty page rate upper limit, is a new capability +-introduced in the 8.1 QEMU release that uses a new algorithm based on the KVM +-dirty ring to throttle down the guest during live migration. +- +-The algorithm framework is as follows: +- +-:: +- +- ------------------------------------------------------------------------------ +- main --------------> throttle thread ------------> PREPARE(1) <-------- +- thread \ | | +- \ | | +- \ V | +- -\ CALCULATE(2) | +- \ | | +- \ | | +- \ V | +- \ SET PENALTY(3) ----- +- -\ | +- \ | +- \ V +- -> virtual CPU thread -------> ACCEPT PENALTY(4) +- ------------------------------------------------------------------------------ +- +-When the qmp command qmp_set_vcpu_dirty_limit is called for the first time, +-the QEMU main thread starts the throttle thread. The throttle thread, once +-launched, executes the loop, which consists of three steps: +- +- - PREPARE (1) +- +- The entire work of PREPARE (1) is preparation for the second stage, +- CALCULATE(2), as the name implies. It involves preparing the dirty +- page rate value and the corresponding upper limit of the VM: +- The dirty page rate is calculated via the KVM dirty ring mechanism, +- which tells QEMU how many dirty pages a virtual CPU has had since the +- last KVM_EXIT_DIRTY_RING_FULL exception; The dirty page rate upper +- limit is specified by caller, therefore fetch it directly. +- +- - CALCULATE (2) +- +- Calculate a suitable sleep period for each virtual CPU, which will be +- used to determine the penalty for the target virtual CPU. The +- computation must be done carefully in order to reduce the dirty page +- rate progressively down to the upper limit without oscillation. To +- achieve this, two strategies are provided: the first is to add or +- subtract sleep time based on the ratio of the current dirty page rate +- to the limit, which is used when the current dirty page rate is far +- from the limit; the second is to add or subtract a fixed time when +- the current dirty page rate is close to the limit. +- +- - SET PENALTY (3) +- +- Set the sleep time for each virtual CPU that should be penalized based +- on the results of the calculation supplied by step CALCULATE (2). +- +-After completing the three above stages, the throttle thread loops back +-to step PREPARE (1) until the dirty limit is reached. +- +-On the other hand, each virtual CPU thread reads the sleep duration and +-sleeps in the path of the KVM_EXIT_DIRTY_RING_FULL exception handler, that +-is ACCEPT PENALTY (4). 
Virtual CPUs tied with writing processes will +-obviously exit to the path and get penalized, whereas virtual CPUs involved +-with read processes will not. +- +-In summary, thanks to the KVM dirty ring technology, the dirty limit +-algorithm will restrict virtual CPUs as needed to keep their dirty page +-rate inside the limit. This leads to more steady reading performance during +-live migration and can aid in improving large guest responsiveness. +- +-- +2.43.0 + diff --git a/0376-docs-migration-organize-postcopy-page.patch b/0376-docs-migration-organize-postcopy-page.patch new file mode 100644 index 0000000..18eca20 --- /dev/null +++ b/0376-docs-migration-organize-postcopy-page.patch @@ -0,0 +1,231 @@ +From e38c8e0acb77186e4cbdd2c489b08c4567042548 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Tue, 9 Jan 2024 14:46:26 +0800 +Subject: [PATCH] docs/migration: Organize "Postcopy" page +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +commit 21b17cd011c959c3fd3fdad994389410a02df901 upstream. + +Reorganize the page, moving things around, and add a few +headlines ("Postcopy internals", "Postcopy features") to cover sub-areas. + +Intel-SIG: commit 21b17cd011c9 docs/migration: Organize "Postcopy" page + +Reviewed-by: Cédric Le Goater +Link: https://lore.kernel.org/r/20240109064628.595453-9-peterx@redhat.com +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + docs/devel/migration/postcopy.rst | 159 ++++++++++++++++-------------- + 1 file changed, 84 insertions(+), 75 deletions(-) + +diff --git a/docs/devel/migration/postcopy.rst b/docs/devel/migration/postcopy.rst +index d60eec06ab..6c51e96d79 100644 +--- a/docs/devel/migration/postcopy.rst ++++ b/docs/devel/migration/postcopy.rst +@@ -1,6 +1,9 @@ ++======== + Postcopy + ======== + ++.. contents:: ++ + 'Postcopy' migration is a way to deal with migrations that refuse to converge + (or take too long to converge) its plus side is that there is an upper bound on + the amount of migration traffic and time it takes, the down side is that during +@@ -14,7 +17,7 @@ Postcopy can be combined with precopy (i.e. normal migration) so that if precopy + doesn't finish in a given time the switch is made to postcopy. + + Enabling postcopy +------------------ ++================= + + To enable postcopy, issue this command on the monitor (both source and + destination) prior to the start of migration: +@@ -49,8 +52,71 @@ time per vCPU. + ``migrate_set_parameter`` is ignored (to avoid delaying requested pages that + the destination is waiting for). + +-Postcopy device transfer +------------------------- ++Postcopy internals ++================== ++ ++State machine ++------------- ++ ++Postcopy moves through a series of states (see postcopy_state) from ++ADVISE->DISCARD->LISTEN->RUNNING->END ++ ++ - Advise ++ ++ Set at the start of migration if postcopy is enabled, even ++ if it hasn't had the start command; here the destination ++ checks that its OS has the support needed for postcopy, and performs ++ setup to ensure the RAM mappings are suitable for later postcopy. ++ The destination will fail early in migration at this point if the ++ required OS support is not present. 
++ (Triggered by reception of POSTCOPY_ADVISE command) ++ ++ - Discard ++ ++ Entered on receipt of the first 'discard' command; prior to ++ the first Discard being performed, hugepages are switched off ++ (using madvise) to ensure that no new huge pages are created ++ during the postcopy phase, and to cause any huge pages that ++ have discards on them to be broken. ++ ++ - Listen ++ ++ The first command in the package, POSTCOPY_LISTEN, switches ++ the destination state to Listen, and starts a new thread ++ (the 'listen thread') which takes over the job of receiving ++ pages off the migration stream, while the main thread carries ++ on processing the blob. With this thread able to process page ++ reception, the destination now 'sensitises' the RAM to detect ++ any access to missing pages (on Linux using the 'userfault' ++ system). ++ ++ - Running ++ ++ POSTCOPY_RUN causes the destination to synchronise all ++ state and start the CPUs and IO devices running. The main ++ thread now finishes processing the migration package and ++ now carries on as it would for normal precopy migration ++ (although it can't do the cleanup it would do as it ++ finishes a normal migration). ++ ++ - Paused ++ ++ Postcopy can run into a paused state (normally on both sides when ++ happens), where all threads will be temporarily halted mostly due to ++ network errors. When reaching paused state, migration will make sure ++ the qemu binary on both sides maintain the data without corrupting ++ the VM. To continue the migration, the admin needs to fix the ++ migration channel using the QMP command 'migrate-recover' on the ++ destination node, then resume the migration using QMP command 'migrate' ++ again on source node, with resume=true flag set. ++ ++ - End ++ ++ The listen thread can now quit, and perform the cleanup of migration ++ state, the migration is now complete. ++ ++Device transfer ++--------------- + + Loading of device data may cause the device emulation to access guest RAM + that may trigger faults that have to be resolved by the source, as such +@@ -130,7 +196,20 @@ processing. + is no longer used by migration, while the listen thread carries on servicing + page data until the end of migration. + +-Postcopy Recovery ++Source side page bitmap ++----------------------- ++ ++The 'migration bitmap' in postcopy is basically the same as in the precopy, ++where each of the bit to indicate that page is 'dirty' - i.e. needs ++sending. During the precopy phase this is updated as the CPU dirties ++pages, however during postcopy the CPUs are stopped and nothing should ++dirty anything any more. Instead, dirty bits are cleared when the relevant ++pages are sent during postcopy. ++ ++Postcopy features ++================= ++ ++Postcopy recovery + ----------------- + + Comparing to precopy, postcopy is special on error handlings. When any +@@ -166,76 +245,6 @@ configurations of the guest. For example, when with async page fault + enabled, logically the guest can proactively schedule out the threads + accessing missing pages. + +-Postcopy states +---------------- +- +-Postcopy moves through a series of states (see postcopy_state) from +-ADVISE->DISCARD->LISTEN->RUNNING->END +- +- - Advise +- +- Set at the start of migration if postcopy is enabled, even +- if it hasn't had the start command; here the destination +- checks that its OS has the support needed for postcopy, and performs +- setup to ensure the RAM mappings are suitable for later postcopy. 
+- The destination will fail early in migration at this point if the +- required OS support is not present. +- (Triggered by reception of POSTCOPY_ADVISE command) +- +- - Discard +- +- Entered on receipt of the first 'discard' command; prior to +- the first Discard being performed, hugepages are switched off +- (using madvise) to ensure that no new huge pages are created +- during the postcopy phase, and to cause any huge pages that +- have discards on them to be broken. +- +- - Listen +- +- The first command in the package, POSTCOPY_LISTEN, switches +- the destination state to Listen, and starts a new thread +- (the 'listen thread') which takes over the job of receiving +- pages off the migration stream, while the main thread carries +- on processing the blob. With this thread able to process page +- reception, the destination now 'sensitises' the RAM to detect +- any access to missing pages (on Linux using the 'userfault' +- system). +- +- - Running +- +- POSTCOPY_RUN causes the destination to synchronise all +- state and start the CPUs and IO devices running. The main +- thread now finishes processing the migration package and +- now carries on as it would for normal precopy migration +- (although it can't do the cleanup it would do as it +- finishes a normal migration). +- +- - Paused +- +- Postcopy can run into a paused state (normally on both sides when +- happens), where all threads will be temporarily halted mostly due to +- network errors. When reaching paused state, migration will make sure +- the qemu binary on both sides maintain the data without corrupting +- the VM. To continue the migration, the admin needs to fix the +- migration channel using the QMP command 'migrate-recover' on the +- destination node, then resume the migration using QMP command 'migrate' +- again on source node, with resume=true flag set. +- +- - End +- +- The listen thread can now quit, and perform the cleanup of migration +- state, the migration is now complete. +- +-Source side page map +--------------------- +- +-The 'migration bitmap' in postcopy is basically the same as in the precopy, +-where each of the bit to indicate that page is 'dirty' - i.e. needs +-sending. During the precopy phase this is updated as the CPU dirties +-pages, however during postcopy the CPUs are stopped and nothing should +-dirty anything any more. Instead, dirty bits are cleared when the relevant +-pages are sent during postcopy. +- + Postcopy with hugepages + ----------------------- + +@@ -293,7 +302,7 @@ Retro-fitting postcopy to existing clients is possible: + guest memory access is made while holding a lock then all other + threads waiting for that lock will also be blocked. + +-Postcopy Preemption Mode ++Postcopy preemption mode + ------------------------ + + Postcopy preempt is a new capability introduced in 8.0 QEMU release, it +-- +2.43.0 + diff --git a/0377-docs-migration-further-move-vfio-to-be-feature-of-mi.patch b/0377-docs-migration-further-move-vfio-to-be-feature-of-mi.patch new file mode 100644 index 0000000..f850a47 --- /dev/null +++ b/0377-docs-migration-further-move-vfio-to-be-feature-of-mi.patch @@ -0,0 +1,49 @@ +From bbff394df400f085459f96cb8f29aa69633ee0e2 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Tue, 9 Jan 2024 14:46:27 +0800 +Subject: [PATCH] docs/migration: Further move vfio to be feature of migration +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +commit 66fd3b1a7ab02f7d8c84f92eba23e3ddc955204d upstream. 
+ +Move it one layer down, so taking VFIO-migration as a feature for +migration. + +Intel-SIG: commit 66fd3b1a7ab0 docs/migration: Further move vfio to be feature of migration + +Cc: Alex Williamson +Cc: Cédric Le Goater +Reviewed-by: Cédric Le Goater +Link: https://lore.kernel.org/r/20240109064628.595453-10-peterx@redhat.com +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + docs/devel/migration/features.rst | 1 + + docs/devel/migration/index.rst | 1 - + 2 files changed, 1 insertion(+), 1 deletion(-) + +diff --git a/docs/devel/migration/features.rst b/docs/devel/migration/features.rst +index e257d0d100..dea016f707 100644 +--- a/docs/devel/migration/features.rst ++++ b/docs/devel/migration/features.rst +@@ -8,3 +8,4 @@ Migration has plenty of features to support different use cases. + + postcopy + dirty-limit ++ vfio +diff --git a/docs/devel/migration/index.rst b/docs/devel/migration/index.rst +index 21ad58b189..b1357309e1 100644 +--- a/docs/devel/migration/index.rst ++++ b/docs/devel/migration/index.rst +@@ -10,6 +10,5 @@ QEMU live migration works. + main + features + compatibility +- vfio + virtio + best-practices +-- +2.43.0 + diff --git a/0378-docs-migration-further-move-virtio-to-be-feature-of-.patch b/0378-docs-migration-further-move-virtio-to-be-feature-of-.patch new file mode 100644 index 0000000..020f83a --- /dev/null +++ b/0378-docs-migration-further-move-virtio-to-be-feature-of-.patch @@ -0,0 +1,49 @@ +From cce4501dfab77e6156cb9d08026a1a87b795da48 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Tue, 9 Jan 2024 14:46:28 +0800 +Subject: [PATCH] docs/migration: Further move virtio to be feature of + migration +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +commit eb9f6daae49c06bb91e9660908587cc55265e43a upstream. + +Move it one layer down, so taking Virtio-migration as a feature for +migration. + +Intel-SIG: commit eb9f6daae49c docs/migration: Further move virtio to be feature of migration + +Cc: "Michael S. Tsirkin" +Cc: Jason Wang +Reviewed-by: Cédric Le Goater +Link: https://lore.kernel.org/r/20240109064628.595453-11-peterx@redhat.com +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + docs/devel/migration/features.rst | 1 + + docs/devel/migration/index.rst | 1 - + 2 files changed, 1 insertion(+), 1 deletion(-) + +diff --git a/docs/devel/migration/features.rst b/docs/devel/migration/features.rst +index dea016f707..a9acaf618e 100644 +--- a/docs/devel/migration/features.rst ++++ b/docs/devel/migration/features.rst +@@ -9,3 +9,4 @@ Migration has plenty of features to support different use cases. + postcopy + dirty-limit + vfio ++ virtio +diff --git a/docs/devel/migration/index.rst b/docs/devel/migration/index.rst +index b1357309e1..2aa294d631 100644 +--- a/docs/devel/migration/index.rst ++++ b/docs/devel/migration/index.rst +@@ -10,5 +10,4 @@ QEMU live migration works. + main + features + compatibility +- virtio + best-practices +-- +2.43.0 + diff --git a/0379-migration-multifd-drop-stale-comment-for-multifd-zer.patch b/0379-migration-multifd-drop-stale-comment-for-multifd-zer.patch new file mode 100644 index 0000000..bd322ab --- /dev/null +++ b/0379-migration-multifd-drop-stale-comment-for-multifd-zer.patch @@ -0,0 +1,45 @@ +From ddaedcd6ca969b2d62442a7d2027dda42460f0c1 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Fri, 2 Feb 2024 18:28:35 +0800 +Subject: [PATCH] migration/multifd: Drop stale comment for multifd zero copy + +commit 8888a552bf7af200e36ff123772547dfb4f133c4 upstream. 
+ +We've already done that with multifd_flush_after_each_section, for multifd +in general. Drop the stale "TODO-like" comment. + +Intel-SIG: commit 8888a552bf7a migration/multifd: Drop stale comment for multifd zero copy + +Reviewed-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20240202102857.110210-2-peterx@redhat.com +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/multifd.c | 11 ----------- + 1 file changed, 11 deletions(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index 25cbc6dc6b..eee2586770 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -598,17 +598,6 @@ int multifd_send_sync_main(void) + } + } + +- /* +- * When using zero-copy, it's necessary to flush the pages before any of +- * the pages can be sent again, so we'll make sure the new version of the +- * pages will always arrive _later_ than the old pages. +- * +- * Currently we achieve this by flushing the zero-page requested writes +- * per ram iteration, but in the future we could potentially optimize it +- * to be less frequent, e.g. only after we finished one whole scanning of +- * all the dirty bitmaps. +- */ +- + flush_zero_copy = migrate_zero_copy_send(); + + for (i = 0; i < migrate_multifd_channels(); i++) { +-- +2.43.0 + diff --git a/0380-migration-multifd-multifd-send-kick-main.patch b/0380-migration-multifd-multifd-send-kick-main.patch new file mode 100644 index 0000000..dbd7eb8 --- /dev/null +++ b/0380-migration-multifd-multifd-send-kick-main.patch @@ -0,0 +1,78 @@ +From e9d0d1e2f6dda97ba8b6358af6050feb8dd5ede8 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Fri, 2 Feb 2024 18:28:36 +0800 +Subject: [PATCH] migration/multifd: multifd_send_kick_main() + +commit 48c0f5d56fd2ff0a0cda23301637b742c690f59a upstream. + +When a multifd sender thread hit errors, it always needs to kick the main +thread by kicking all the semaphores that it can be waiting upon. + +Provide a helper for it and deduplicate the code. + +Intel-SIG: commit 48c0f5d56fd2 migration/multifd: multifd_send_kick_main() + +Reviewed-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20240202102857.110210-3-peterx@redhat.com +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/multifd.c | 21 +++++++++++++++------ + 1 file changed, 15 insertions(+), 6 deletions(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index eee2586770..b8d2c96533 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -372,6 +372,18 @@ struct { + MultiFDMethods *ops; + } *multifd_send_state; + ++/* ++ * The migration thread can wait on either of the two semaphores. This ++ * function can be used to kick the main thread out of waiting on either of ++ * them. Should mostly only be called when something wrong happened with ++ * the current multifd send thread. ++ */ ++static void multifd_send_kick_main(MultiFDSendParams *p) ++{ ++ qemu_sem_post(&p->sem_sync); ++ qemu_sem_post(&multifd_send_state->channels_ready); ++} ++ + /* + * How we use multifd_send_state->pages and channel->pages? + * +@@ -739,8 +751,7 @@ out: + assert(local_err); + trace_multifd_send_error(p->id); + multifd_send_terminate_threads(local_err); +- qemu_sem_post(&p->sem_sync); +- qemu_sem_post(&multifd_send_state->channels_ready); ++ multifd_send_kick_main(p); + error_free(local_err); + } + +@@ -781,8 +792,7 @@ static void multifd_tls_outgoing_handshake(QIOTask *task, + * is not created, and then tell who pay attention to me. 
+ */ + p->quit = true; +- qemu_sem_post(&multifd_send_state->channels_ready); +- qemu_sem_post(&p->sem_sync); ++ multifd_send_kick_main(p); + error_free(err); + } + +@@ -852,8 +862,7 @@ static void multifd_new_send_channel_cleanup(MultiFDSendParams *p, + { + migrate_set_error(migrate_get_current(), err); + /* Error happen, we need to tell who pay attention to me */ +- qemu_sem_post(&multifd_send_state->channels_ready); +- qemu_sem_post(&p->sem_sync); ++ multifd_send_kick_main(p); + /* + * Although multifd_send_thread is not created, but main migration + * thread need to judge whether it is running, so we need to mark +-- +2.43.0 + diff --git a/0381-migration-multifd-drop-multifdsendparams-quit-cleanu.patch b/0381-migration-multifd-drop-multifdsendparams-quit-cleanu.patch new file mode 100644 index 0000000..d2eabf5 --- /dev/null +++ b/0381-migration-multifd-drop-multifdsendparams-quit-cleanu.patch @@ -0,0 +1,253 @@ +From ab2ecf32c3592cf5d84ec98db3df4329c61afb85 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Fri, 2 Feb 2024 18:28:37 +0800 +Subject: [PATCH] migration/multifd: Drop MultiFDSendParams.quit, cleanup error + paths + +commit 15f3f21d598148895c33b6fc41e29777cf6ad992 upstream. + +Multifd send side has two fields to indicate error quits: + + - MultiFDSendParams.quit + - &multifd_send_state->exiting + +Merge them into the global one. The replacement is done by changing all +p->quit checks into the global var check. The global check doesn't need +any lock. + +A few more things done on top of this altogether: + + - multifd_send_terminate_threads() + + Moving the xchg() of &multifd_send_state->exiting upper, so as to cover + the tracepoint, migrate_set_error() and migrate_set_state(). + + - multifd_send_sync_main() + + In the 2nd loop, add one more check over the global var to make sure we + don't keep the looping if QEMU already decided to quit. + + - multifd_tls_outgoing_handshake() + + Use multifd_send_terminate_threads() to set the error state. That has + a benefit of updating MigrationState.error to that error too, so we can + persist that 1st error we hit in that specific channel. + + - multifd_new_send_channel_async() + + Take similar approach like above, drop the migrate_set_error() because + multifd_send_terminate_threads() already covers that. Unwrap the helper + multifd_new_send_channel_cleanup() along the way; not really needed. + +Intel-SIG: commit 15f3f21d5981 migration/multifd: Drop MultiFDSendParams.quit, cleanup error paths + +Reviewed-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20240202102857.110210-4-peterx@redhat.com +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/multifd.c | 85 ++++++++++++++++++--------------------------- + migration/multifd.h | 2 -- + 2 files changed, 33 insertions(+), 54 deletions(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index b8d2c96533..2c98023d67 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -372,6 +372,11 @@ struct { + MultiFDMethods *ops; + } *multifd_send_state; + ++static bool multifd_send_should_exit(void) ++{ ++ return qatomic_read(&multifd_send_state->exiting); ++} ++ + /* + * The migration thread can wait on either of the two semaphores. 
This + * function can be used to kick the main thread out of waiting on either of +@@ -409,7 +414,7 @@ static int multifd_send_pages(void) + MultiFDSendParams *p = NULL; /* make happy gcc */ + MultiFDPages_t *pages = multifd_send_state->pages; + +- if (qatomic_read(&multifd_send_state->exiting)) { ++ if (multifd_send_should_exit()) { + return -1; + } + +@@ -421,14 +426,11 @@ static int multifd_send_pages(void) + */ + next_channel %= migrate_multifd_channels(); + for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) { +- p = &multifd_send_state->params[i]; +- +- qemu_mutex_lock(&p->mutex); +- if (p->quit) { +- error_report("%s: channel %d has already quit!", __func__, i); +- qemu_mutex_unlock(&p->mutex); ++ if (multifd_send_should_exit()) { + return -1; + } ++ p = &multifd_send_state->params[i]; ++ qemu_mutex_lock(&p->mutex); + if (!p->pending_job) { + p->pending_job++; + next_channel = (i + 1) % migrate_multifd_channels(); +@@ -483,6 +485,16 @@ static void multifd_send_terminate_threads(Error *err) + { + int i; + ++ /* ++ * We don't want to exit each threads twice. Depending on where ++ * we get the error, or if there are two independent errors in two ++ * threads at the same time, we can end calling this function ++ * twice. ++ */ ++ if (qatomic_xchg(&multifd_send_state->exiting, 1)) { ++ return; ++ } ++ + trace_multifd_send_terminate_threads(err != NULL); + + if (err) { +@@ -497,26 +509,13 @@ static void multifd_send_terminate_threads(Error *err) + } + } + +- /* +- * We don't want to exit each threads twice. Depending on where +- * we get the error, or if there are two independent errors in two +- * threads at the same time, we can end calling this function +- * twice. +- */ +- if (qatomic_xchg(&multifd_send_state->exiting, 1)) { +- return; +- } +- + for (i = 0; i < migrate_multifd_channels(); i++) { + MultiFDSendParams *p = &multifd_send_state->params[i]; + +- qemu_mutex_lock(&p->mutex); +- p->quit = true; + qemu_sem_post(&p->sem); + if (p->c) { + qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL); + } +- qemu_mutex_unlock(&p->mutex); + } + } + +@@ -615,16 +614,13 @@ int multifd_send_sync_main(void) + for (i = 0; i < migrate_multifd_channels(); i++) { + MultiFDSendParams *p = &multifd_send_state->params[i]; + +- trace_multifd_send_sync_main_signal(p->id); +- +- qemu_mutex_lock(&p->mutex); +- +- if (p->quit) { +- error_report("%s: channel %d has already quit", __func__, i); +- qemu_mutex_unlock(&p->mutex); ++ if (multifd_send_should_exit()) { + return -1; + } + ++ trace_multifd_send_sync_main_signal(p->id); ++ ++ qemu_mutex_lock(&p->mutex); + p->packet_num = multifd_send_state->packet_num++; + p->flags |= MULTIFD_FLAG_SYNC; + p->pending_job++; +@@ -634,6 +630,10 @@ int multifd_send_sync_main(void) + for (i = 0; i < migrate_multifd_channels(); i++) { + MultiFDSendParams *p = &multifd_send_state->params[i]; + ++ if (multifd_send_should_exit()) { ++ return -1; ++ } ++ + qemu_sem_wait(&multifd_send_state->channels_ready); + trace_multifd_send_sync_main_wait(p->id); + qemu_sem_wait(&p->sem_sync); +@@ -671,7 +671,7 @@ static void *multifd_send_thread(void *opaque) + qemu_sem_post(&multifd_send_state->channels_ready); + qemu_sem_wait(&p->sem); + +- if (qatomic_read(&multifd_send_state->exiting)) { ++ if (multifd_send_should_exit()) { + break; + } + qemu_mutex_lock(&p->mutex); +@@ -786,12 +786,7 @@ static void multifd_tls_outgoing_handshake(QIOTask *task, + + trace_multifd_tls_outgoing_handshake_error(ioc, error_get_pretty(err)); + +- migrate_set_error(migrate_get_current(), 
err); +- /* +- * Error happen, mark multifd_send_thread status as 'quit' although it +- * is not created, and then tell who pay attention to me. +- */ +- p->quit = true; ++ multifd_send_terminate_threads(err); + multifd_send_kick_main(p); + error_free(err); + } +@@ -857,22 +852,6 @@ static bool multifd_channel_connect(MultiFDSendParams *p, + return true; + } + +-static void multifd_new_send_channel_cleanup(MultiFDSendParams *p, +- QIOChannel *ioc, Error *err) +-{ +- migrate_set_error(migrate_get_current(), err); +- /* Error happen, we need to tell who pay attention to me */ +- multifd_send_kick_main(p); +- /* +- * Although multifd_send_thread is not created, but main migration +- * thread need to judge whether it is running, so we need to mark +- * its status. +- */ +- p->quit = true; +- object_unref(OBJECT(ioc)); +- error_free(err); +-} +- + static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque) + { + MultiFDSendParams *p = opaque; +@@ -889,7 +868,10 @@ static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque) + } + + trace_multifd_new_send_channel_async_error(p->id, local_err); +- multifd_new_send_channel_cleanup(p, ioc, local_err); ++ multifd_send_terminate_threads(local_err); ++ multifd_send_kick_main(p); ++ object_unref(OBJECT(ioc)); ++ error_free(local_err); + } + + static void multifd_new_send_channel_create(gpointer opaque) +@@ -921,7 +903,6 @@ int multifd_save_setup(Error **errp) + qemu_mutex_init(&p->mutex); + qemu_sem_init(&p->sem, 0); + qemu_sem_init(&p->sem_sync, 0); +- p->quit = false; + p->pending_job = 0; + p->id = i; + p->pages = multifd_pages_init(page_count); +diff --git a/migration/multifd.h b/migration/multifd.h +index 35d11f103c..7c040cb85a 100644 +--- a/migration/multifd.h ++++ b/migration/multifd.h +@@ -95,8 +95,6 @@ typedef struct { + QemuMutex mutex; + /* is this channel thread running */ + bool running; +- /* should this thread finish */ +- bool quit; + /* multifd flags for each packet */ + uint32_t flags; + /* global number of generated multifd packets */ +-- +2.43.0 + diff --git a/0382-migration-multifd-postpone-reset-of-multifdpages-t.patch b/0382-migration-multifd-postpone-reset-of-multifdpages-t.patch new file mode 100644 index 0000000..3e039cf --- /dev/null +++ b/0382-migration-multifd-postpone-reset-of-multifdpages-t.patch @@ -0,0 +1,81 @@ +From 859aab983d4a4650f5e7a9824d58d6e6a0521381 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Fri, 2 Feb 2024 18:28:38 +0800 +Subject: [PATCH] migration/multifd: Postpone reset of MultiFDPages_t + +commit 836eca47f62f9f6d5b8e9b6fedfc3539775c4e2e upstream. + +Now we reset MultiFDPages_t object in the multifd sender thread in the +middle of the sending job. That's not necessary, because the "*pages" +struct will not be reused anyway until pending_job is cleared. + +Move that to the end after the job is completed, provide a helper to reset +a "*pages" object. Use that same helper when free the object too. + +This prepares us to keep using p->pages in the follow up patches, where we +may drop p->normal[]. 
+ +Intel-SIG: commit 836eca47f62f migration/multifd: Postpone reset of MultiFDPages_t + +Reviewed-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20240202102857.110210-5-peterx@redhat.com +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/multifd.c | 18 ++++++++++++++---- + 1 file changed, 14 insertions(+), 4 deletions(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index 2c98023d67..5633ac245a 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -172,6 +172,17 @@ void multifd_register_ops(int method, MultiFDMethods *ops) + multifd_ops[method] = ops; + } + ++/* Reset a MultiFDPages_t* object for the next use */ ++static void multifd_pages_reset(MultiFDPages_t *pages) ++{ ++ /* ++ * We don't need to touch offset[] array, because it will be ++ * overwritten later when reused. ++ */ ++ pages->num = 0; ++ pages->block = NULL; ++} ++ + static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp) + { + MultiFDInit_t msg = {}; +@@ -248,9 +259,8 @@ static MultiFDPages_t *multifd_pages_init(uint32_t n) + + static void multifd_pages_clear(MultiFDPages_t *pages) + { +- pages->num = 0; ++ multifd_pages_reset(pages); + pages->allocated = 0; +- pages->block = NULL; + g_free(pages->offset); + pages->offset = NULL; + g_free(pages); +@@ -704,8 +714,6 @@ static void *multifd_send_thread(void *opaque) + p->flags = 0; + p->num_packets++; + p->total_normal_pages += p->normal_num; +- p->pages->num = 0; +- p->pages->block = NULL; + qemu_mutex_unlock(&p->mutex); + + trace_multifd_send(p->id, packet_num, p->normal_num, flags, +@@ -732,6 +740,8 @@ static void *multifd_send_thread(void *opaque) + + stat64_add(&mig_stats.multifd_bytes, + p->next_packet_size + p->packet_len); ++ ++ multifd_pages_reset(p->pages); + p->next_packet_size = 0; + qemu_mutex_lock(&p->mutex); + p->pending_job--; +-- +2.43.0 + diff --git a/0383-migration-multifd-drop-multifdsendparams-normal-arra.patch b/0383-migration-multifd-drop-multifdsendparams-normal-arra.patch new file mode 100644 index 0000000..c7b1c45 --- /dev/null +++ b/0383-migration-multifd-drop-multifdsendparams-normal-arra.patch @@ -0,0 +1,214 @@ +From 8fc040aff86bf9be0dedc388328448d51067b4de Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Fri, 2 Feb 2024 18:28:39 +0800 +Subject: [PATCH] migration/multifd: Drop MultiFDSendParams.normal[] array + +commit efd8c5439db7eaf00f35adc0fcc4f01d916e8619 upstream. + +This array is redundant when p->pages exists. Now we extended the life of +p->pages to the whole period where pending_job is set, it should be safe to +always use p->pages->offset[] rather than p->normal[]. Drop the array. + +Alongside, the normal_num is also redundant, which is the same to +p->pages->num. + +This doesn't apply to recv side, because there's no extra buffering on recv +side, so p->normal[] array is still needed. 
+ +Intel-SIG: commit efd8c5439db7 migration/multifd: Drop MultiFDSendParams.normal[] array + +Reviewed-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20240202102857.110210-6-peterx@redhat.com +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/multifd-zlib.c | 7 ++++--- + migration/multifd-zstd.c | 7 ++++--- + migration/multifd.c | 33 +++++++++++++-------------------- + migration/multifd.h | 4 ---- + 4 files changed, 21 insertions(+), 30 deletions(-) + +diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c +index 37ce48621e..100809abc1 100644 +--- a/migration/multifd-zlib.c ++++ b/migration/multifd-zlib.c +@@ -116,17 +116,18 @@ static void zlib_send_cleanup(MultiFDSendParams *p, Error **errp) + */ + static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) + { ++ MultiFDPages_t *pages = p->pages; + struct zlib_data *z = p->data; + z_stream *zs = &z->zs; + uint32_t out_size = 0; + int ret; + uint32_t i; + +- for (i = 0; i < p->normal_num; i++) { ++ for (i = 0; i < pages->num; i++) { + uint32_t available = z->zbuff_len - out_size; + int flush = Z_NO_FLUSH; + +- if (i == p->normal_num - 1) { ++ if (i == pages->num - 1) { + flush = Z_SYNC_FLUSH; + } + +@@ -135,7 +136,7 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) + * with compression. zlib does not guarantee that this is safe, + * therefore copy the page before calling deflate(). + */ +- memcpy(z->buf, p->pages->block->host + p->normal[i], p->page_size); ++ memcpy(z->buf, p->pages->block->host + pages->offset[i], p->page_size); + zs->avail_in = p->page_size; + zs->next_in = z->buf; + +diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c +index b471daadcd..2023edd8cc 100644 +--- a/migration/multifd-zstd.c ++++ b/migration/multifd-zstd.c +@@ -113,6 +113,7 @@ static void zstd_send_cleanup(MultiFDSendParams *p, Error **errp) + */ + static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) + { ++ MultiFDPages_t *pages = p->pages; + struct zstd_data *z = p->data; + int ret; + uint32_t i; +@@ -121,13 +122,13 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) + z->out.size = z->zbuff_len; + z->out.pos = 0; + +- for (i = 0; i < p->normal_num; i++) { ++ for (i = 0; i < pages->num; i++) { + ZSTD_EndDirective flush = ZSTD_e_continue; + +- if (i == p->normal_num - 1) { ++ if (i == pages->num - 1) { + flush = ZSTD_e_flush; + } +- z->in.src = p->pages->block->host + p->normal[i]; ++ z->in.src = p->pages->block->host + pages->offset[i]; + z->in.size = p->page_size; + z->in.pos = 0; + +diff --git a/migration/multifd.c b/migration/multifd.c +index 5633ac245a..8bb1fd95cf 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -90,13 +90,13 @@ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp) + { + MultiFDPages_t *pages = p->pages; + +- for (int i = 0; i < p->normal_num; i++) { +- p->iov[p->iovs_num].iov_base = pages->block->host + p->normal[i]; ++ for (int i = 0; i < pages->num; i++) { ++ p->iov[p->iovs_num].iov_base = pages->block->host + pages->offset[i]; + p->iov[p->iovs_num].iov_len = p->page_size; + p->iovs_num++; + } + +- p->next_packet_size = p->normal_num * p->page_size; ++ p->next_packet_size = pages->num * p->page_size; + p->flags |= MULTIFD_FLAG_NOCOMP; + return 0; + } +@@ -269,21 +269,22 @@ static void multifd_pages_clear(MultiFDPages_t *pages) + static void multifd_send_fill_packet(MultiFDSendParams *p) + { + MultiFDPacket_t *packet = p->packet; ++ MultiFDPages_t *pages = p->pages; + int i; + + packet->flags = 
cpu_to_be32(p->flags); + packet->pages_alloc = cpu_to_be32(p->pages->allocated); +- packet->normal_pages = cpu_to_be32(p->normal_num); ++ packet->normal_pages = cpu_to_be32(pages->num); + packet->next_packet_size = cpu_to_be32(p->next_packet_size); + packet->packet_num = cpu_to_be64(p->packet_num); + +- if (p->pages->block) { +- strncpy(packet->ramblock, p->pages->block->idstr, 256); ++ if (pages->block) { ++ strncpy(packet->ramblock, pages->block->idstr, 256); + } + +- for (i = 0; i < p->normal_num; i++) { ++ for (i = 0; i < pages->num; i++) { + /* there are architectures where ram_addr_t is 32 bit */ +- uint64_t temp = p->normal[i]; ++ uint64_t temp = pages->offset[i]; + + packet->offset[i] = cpu_to_be64(temp); + } +@@ -570,8 +571,6 @@ void multifd_save_cleanup(void) + p->packet = NULL; + g_free(p->iov); + p->iov = NULL; +- g_free(p->normal); +- p->normal = NULL; + multifd_send_state->ops->send_cleanup(p, &local_err); + if (local_err) { + migrate_set_error(migrate_get_current(), local_err); +@@ -688,8 +687,8 @@ static void *multifd_send_thread(void *opaque) + + if (p->pending_job) { + uint64_t packet_num = p->packet_num; ++ MultiFDPages_t *pages = p->pages; + uint32_t flags; +- p->normal_num = 0; + + if (use_zero_copy_send) { + p->iovs_num = 0; +@@ -697,12 +696,7 @@ static void *multifd_send_thread(void *opaque) + p->iovs_num = 1; + } + +- for (int i = 0; i < p->pages->num; i++) { +- p->normal[p->normal_num] = p->pages->offset[i]; +- p->normal_num++; +- } +- +- if (p->normal_num) { ++ if (pages->num) { + ret = multifd_send_state->ops->send_prepare(p, &local_err); + if (ret != 0) { + qemu_mutex_unlock(&p->mutex); +@@ -713,10 +707,10 @@ static void *multifd_send_thread(void *opaque) + flags = p->flags; + p->flags = 0; + p->num_packets++; +- p->total_normal_pages += p->normal_num; ++ p->total_normal_pages += pages->num; + qemu_mutex_unlock(&p->mutex); + +- trace_multifd_send(p->id, packet_num, p->normal_num, flags, ++ trace_multifd_send(p->id, packet_num, pages->num, flags, + p->next_packet_size); + + if (use_zero_copy_send) { +@@ -924,7 +918,6 @@ int multifd_save_setup(Error **errp) + p->name = g_strdup_printf("multifdsend_%d", i); + /* We need one extra place for the packet header */ + p->iov = g_new0(struct iovec, page_count + 1); +- p->normal = g_new0(ram_addr_t, page_count); + p->page_size = qemu_target_page_size(); + p->page_count = page_count; + +diff --git a/migration/multifd.h b/migration/multifd.h +index 7c040cb85a..3920bdbcf1 100644 +--- a/migration/multifd.h ++++ b/migration/multifd.h +@@ -122,10 +122,6 @@ typedef struct { + struct iovec *iov; + /* number of iovs used */ + uint32_t iovs_num; +- /* Pages that are not zero */ +- ram_addr_t *normal; +- /* num of non zero pages */ +- uint32_t normal_num; + /* used for compression methods */ + void *data; + } MultiFDSendParams; +-- +2.43.0 + diff --git a/0384-migration-multifd-separate-sync-request-with-normal-.patch b/0384-migration-multifd-separate-sync-request-with-normal-.patch new file mode 100644 index 0000000..30cd56d --- /dev/null +++ b/0384-migration-multifd-separate-sync-request-with-normal-.patch @@ -0,0 +1,192 @@ +From 9f8ca0c64ae41989ed439a18245fb12f7ce6ac40 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Fri, 2 Feb 2024 18:28:40 +0800 +Subject: [PATCH] migration/multifd: Separate SYNC request with normal jobs + +commit f5f48a7891cf6664a920ba52f6f4dea1646049a4 upstream. + +Multifd provide a threaded model for processing jobs. 
On sender side, +there can be two kinds of job: (1) a list of pages to send, or (2) a sync +request. + +The sync request is a very special kind of job. It never contains a page +array, but only a multifd packet telling the dest side to synchronize with +sent pages. + +Before this patch, both requests use the pending_job field, no matter what +the request is, it will boost pending_job, while multifd sender thread will +decrement it after it finishes one job. + +However this should be racy, because SYNC is special in that it needs to +set p->flags with MULTIFD_FLAG_SYNC, showing that this is a sync request. +Consider a sequence of operations where: + + - migration thread enqueue a job to send some pages, pending_job++ (0->1) + + - [...before the selected multifd sender thread wakes up...] + + - migration thread enqueue another job to sync, pending_job++ (1->2), + setup p->flags=MULTIFD_FLAG_SYNC + + - multifd sender thread wakes up, found pending_job==2 + - send the 1st packet with MULTIFD_FLAG_SYNC and list of pages + - send the 2nd packet with flags==0 and no pages + +This is not expected, because MULTIFD_FLAG_SYNC should hopefully be done +after all the pages are received. Meanwhile, the 2nd packet will be +completely useless, which contains zero information. + +I didn't verify above, but I think this issue is still benign in that at +least on the recv side we always receive pages before handling +MULTIFD_FLAG_SYNC. However that's not always guaranteed and just tricky. + +One other reason I want to separate it is using p->flags to communicate +between the two threads is also not clearly defined, it's very hard to read +and understand why accessing p->flags is always safe; see the current impl +of multifd_send_thread() where we tried to cache only p->flags. It doesn't +need to be that complicated. + +This patch introduces pending_sync, a separate flag just to show that the +requester needs a sync. Alongside, we remove the tricky caching of +p->flags now because after this patch p->flags should only be used by +multifd sender thread now, which will be crystal clear. So it is always +thread safe to access p->flags. + +With that, we can also safely convert the pending_job into a boolean, +because we don't support >1 pending jobs anyway. + +Always use atomic ops to access both flags to make sure no cache effect. +When at it, drop the initial setting of "pending_job = 0" because it's +always allocated using g_new0(). 
+ +Intel-SIG: commit f5f48a7891cf migration/multifd: Separate SYNC request with normal jobs + +Reviewed-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20240202102857.110210-7-peterx@redhat.com +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/multifd.c | 39 +++++++++++++++++++++++++-------------- + migration/multifd.h | 13 +++++++++++-- + 2 files changed, 36 insertions(+), 16 deletions(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index 8bb1fd95cf..ea25bbe6bd 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -442,8 +442,8 @@ static int multifd_send_pages(void) + } + p = &multifd_send_state->params[i]; + qemu_mutex_lock(&p->mutex); +- if (!p->pending_job) { +- p->pending_job++; ++ if (qatomic_read(&p->pending_job) == false) { ++ qatomic_set(&p->pending_job, true); + next_channel = (i + 1) % migrate_multifd_channels(); + break; + } +@@ -631,8 +631,12 @@ int multifd_send_sync_main(void) + + qemu_mutex_lock(&p->mutex); + p->packet_num = multifd_send_state->packet_num++; +- p->flags |= MULTIFD_FLAG_SYNC; +- p->pending_job++; ++ /* ++ * We should be the only user so far, so not possible to be set by ++ * others concurrently. ++ */ ++ assert(qatomic_read(&p->pending_sync) == false); ++ qatomic_set(&p->pending_sync, true); + qemu_mutex_unlock(&p->mutex); + qemu_sem_post(&p->sem); + } +@@ -685,10 +689,9 @@ static void *multifd_send_thread(void *opaque) + } + qemu_mutex_lock(&p->mutex); + +- if (p->pending_job) { ++ if (qatomic_read(&p->pending_job)) { + uint64_t packet_num = p->packet_num; + MultiFDPages_t *pages = p->pages; +- uint32_t flags; + + if (use_zero_copy_send) { + p->iovs_num = 0; +@@ -704,13 +707,11 @@ static void *multifd_send_thread(void *opaque) + } + } + multifd_send_fill_packet(p); +- flags = p->flags; +- p->flags = 0; + p->num_packets++; + p->total_normal_pages += pages->num; + qemu_mutex_unlock(&p->mutex); + +- trace_multifd_send(p->id, packet_num, pages->num, flags, ++ trace_multifd_send(p->id, packet_num, pages->num, p->flags, + p->next_packet_size); + + if (use_zero_copy_send) { +@@ -738,12 +739,23 @@ static void *multifd_send_thread(void *opaque) + multifd_pages_reset(p->pages); + p->next_packet_size = 0; + qemu_mutex_lock(&p->mutex); +- p->pending_job--; ++ qatomic_set(&p->pending_job, false); + qemu_mutex_unlock(&p->mutex); +- +- if (flags & MULTIFD_FLAG_SYNC) { +- qemu_sem_post(&p->sem_sync); ++ } else if (qatomic_read(&p->pending_sync)) { ++ p->flags = MULTIFD_FLAG_SYNC; ++ multifd_send_fill_packet(p); ++ ret = qio_channel_write_all(p->c, (void *)p->packet, ++ p->packet_len, &local_err); ++ if (ret != 0) { ++ qemu_mutex_unlock(&p->mutex); ++ break; + } ++ /* p->next_packet_size will always be zero for a SYNC packet */ ++ stat64_add(&mig_stats.multifd_bytes, p->packet_len); ++ p->flags = 0; ++ qatomic_set(&p->pending_sync, false); ++ qemu_mutex_unlock(&p->mutex); ++ qemu_sem_post(&p->sem_sync); + } else { + qemu_mutex_unlock(&p->mutex); + /* sometimes there are spurious wakeups */ +@@ -907,7 +919,6 @@ int multifd_save_setup(Error **errp) + qemu_mutex_init(&p->mutex); + qemu_sem_init(&p->sem, 0); + qemu_sem_init(&p->sem_sync, 0); +- p->pending_job = 0; + p->id = i; + p->pages = multifd_pages_init(page_count); + p->packet_len = sizeof(MultiFDPacket_t) +diff --git a/migration/multifd.h b/migration/multifd.h +index 3920bdbcf1..08f26ef3fe 100644 +--- a/migration/multifd.h ++++ b/migration/multifd.h +@@ -99,8 +99,17 @@ typedef struct { + uint32_t flags; + /* global number of generated multifd packets */ + uint64_t packet_num; 
+- /* thread has work to do */ +- int pending_job; ++ /* ++ * The sender thread has work to do if either of below boolean is set. ++ * ++ * @pending_job: a job is pending ++ * @pending_sync: a sync request is pending ++ * ++ * For both of these fields, they're only set by the requesters, and ++ * cleared by the multifd sender threads. ++ */ ++ bool pending_job; ++ bool pending_sync; + /* array of pages to sent. + * The owner of 'pages' depends of 'pending_job' value: + * pending_job == 0 -> migration_thread can use it. +-- +2.43.0 + diff --git a/0385-migration-multifd-simplify-locking-in-sender-thread.patch b/0385-migration-multifd-simplify-locking-in-sender-thread.patch new file mode 100644 index 0000000..fb4b7e5 --- /dev/null +++ b/0385-migration-multifd-simplify-locking-in-sender-thread.patch @@ -0,0 +1,101 @@ +From 15c0a596e2dc180d83b3eb5d5b75fe7afab2300e Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Fri, 2 Feb 2024 18:28:41 +0800 +Subject: [PATCH] migration/multifd: Simplify locking in sender thread + +commit e3cce9af10b06c51434ced4e1a6686f1ce43e124 upstream. + +The sender thread will yield the p->mutex before IO starts, trying to not +block the requester thread. This may be unnecessary lock optimizations, +because the requester can already read pending_job safely even without the +lock, because the requester is currently the only one who can assign a +task. + +Drop that lock complication on both sides: + + (1) in the sender thread, always take the mutex until job done + (2) in the requester thread, check pending_job clear lockless + +Intel-SIG: commit e3cce9af10b0 migration/multifd: Simplify locking in sender thread + +Reviewed-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20240202102857.110210-8-peterx@redhat.com +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/multifd.c | 23 ++++++++++++++++------- + 1 file changed, 16 insertions(+), 7 deletions(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index ea25bbe6bd..4d5a01ed93 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -429,7 +429,9 @@ static int multifd_send_pages(void) + return -1; + } + ++ /* We wait here, until at least one channel is ready */ + qemu_sem_wait(&multifd_send_state->channels_ready); ++ + /* + * next_channel can remain from a previous migration that was + * using more channels, so ensure it doesn't overflow if the +@@ -441,17 +443,26 @@ static int multifd_send_pages(void) + return -1; + } + p = &multifd_send_state->params[i]; +- qemu_mutex_lock(&p->mutex); ++ /* ++ * Lockless read to p->pending_job is safe, because only multifd ++ * sender thread can clear it. ++ */ + if (qatomic_read(&p->pending_job) == false) { +- qatomic_set(&p->pending_job, true); + next_channel = (i + 1) % migrate_multifd_channels(); + break; + } +- qemu_mutex_unlock(&p->mutex); + } ++ ++ qemu_mutex_lock(&p->mutex); + assert(!p->pages->num); + assert(!p->pages->block); +- ++ /* ++ * Double check on pending_job==false with the lock. In the future if ++ * we can have >1 requester thread, we can replace this with a "goto ++ * retry", but that is for later. 
++ */ ++ assert(qatomic_read(&p->pending_job) == false); ++ qatomic_set(&p->pending_job, true); + p->packet_num = multifd_send_state->packet_num++; + multifd_send_state->pages = p->pages; + p->pages = pages; +@@ -709,8 +720,6 @@ static void *multifd_send_thread(void *opaque) + multifd_send_fill_packet(p); + p->num_packets++; + p->total_normal_pages += pages->num; +- qemu_mutex_unlock(&p->mutex); +- + trace_multifd_send(p->id, packet_num, pages->num, p->flags, + p->next_packet_size); + +@@ -730,6 +739,7 @@ static void *multifd_send_thread(void *opaque) + ret = qio_channel_writev_full_all(p->c, p->iov, p->iovs_num, NULL, + 0, p->write_flags, &local_err); + if (ret != 0) { ++ qemu_mutex_unlock(&p->mutex); + break; + } + +@@ -738,7 +748,6 @@ static void *multifd_send_thread(void *opaque) + + multifd_pages_reset(p->pages); + p->next_packet_size = 0; +- qemu_mutex_lock(&p->mutex); + qatomic_set(&p->pending_job, false); + qemu_mutex_unlock(&p->mutex); + } else if (qatomic_read(&p->pending_sync)) { +-- +2.43.0 + diff --git a/0386-migration-multifd-drop-pages-num-check-in-sender-thr.patch b/0386-migration-multifd-drop-pages-num-check-in-sender-thr.patch new file mode 100644 index 0000000..7a9b6e2 --- /dev/null +++ b/0386-migration-multifd-drop-pages-num-check-in-sender-thr.patch @@ -0,0 +1,48 @@ +From 8169d2d73abc117db0b29f129fb5cceef6f927d0 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Fri, 2 Feb 2024 18:28:42 +0800 +Subject: [PATCH] migration/multifd: Drop pages->num check in sender thread + +commit 83c560fb4249ee5698652249e0c1730c3d611a9b upstream. + +Now with a split SYNC handler, we always have pages->num set for +pending_job==true. Assert it instead. + +Intel-SIG: commit 83c560fb4249 migration/multifd: Drop pages->num check in sender thread + +Reviewed-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20240202102857.110210-9-peterx@redhat.com +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/multifd.c | 13 +++++++------ + 1 file changed, 7 insertions(+), 6 deletions(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index 4d5a01ed93..518f9de723 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -710,13 +710,14 @@ static void *multifd_send_thread(void *opaque) + p->iovs_num = 1; + } + +- if (pages->num) { +- ret = multifd_send_state->ops->send_prepare(p, &local_err); +- if (ret != 0) { +- qemu_mutex_unlock(&p->mutex); +- break; +- } ++ assert(pages->num); ++ ++ ret = multifd_send_state->ops->send_prepare(p, &local_err); ++ if (ret != 0) { ++ qemu_mutex_unlock(&p->mutex); ++ break; + } ++ + multifd_send_fill_packet(p); + p->num_packets++; + p->total_normal_pages += pages->num; +-- +2.43.0 + diff --git a/0387-migration-multifd-rename-p-num-packets-and-clean-it-.patch b/0387-migration-multifd-rename-p-num-packets-and-clean-it-.patch new file mode 100644 index 0000000..1033c1b --- /dev/null +++ b/0387-migration-multifd-rename-p-num-packets-and-clean-it-.patch @@ -0,0 +1,142 @@ +From 506b2e87fa8656e97879f8b1cee46868c87b247a Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Fri, 2 Feb 2024 18:28:43 +0800 +Subject: [PATCH] migration/multifd: Rename p->num_packets and clean it up + +commit 05b7ec1890158471afb8537a6817a7e0d0a6c938 upstream. + +This field, no matter whether on src or dest, is only used for debugging +purpose. + +They can even be removed already, unless it still more or less provide some +accounting on "how many packets are sent/recved for this thread". 
The +other more important one is called packet_num, which is embeded in the +multifd packet headers (MultiFDPacket_t). + +So let's keep them for now, but make them much easier to understand, by +doing below: + + - Rename both of them to packets_sent / packets_recved, the old + name (num_packets) are waaay too confusing when we already have + MultiFDPacket_t.packets_num. + + - Avoid worrying on the "initial packet": we know we will send it, that's + good enough. The accounting won't matter a great deal to start with 0 or + with 1. + + - Move them to where we send/recv the packets. They're: + + - multifd_send_fill_packet() for senders. + - multifd_recv_unfill_packet() for receivers. + +Intel-SIG: commit 05b7ec189015 migration/multifd: Rename p->num_packets and clean it up + +Reviewed-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20240202102857.110210-10-peterx@redhat.com +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/multifd.c | 13 +++++-------- + migration/multifd.h | 6 +++--- + 2 files changed, 8 insertions(+), 11 deletions(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index 518f9de723..eca76e2c18 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -288,6 +288,8 @@ static void multifd_send_fill_packet(MultiFDSendParams *p) + + packet->offset[i] = cpu_to_be64(temp); + } ++ ++ p->packets_sent++; + } + + static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) +@@ -335,6 +337,7 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) + + p->next_packet_size = be32_to_cpu(packet->next_packet_size); + p->packet_num = be64_to_cpu(packet->packet_num); ++ p->packets_recved++; + + if (p->normal_num == 0) { + return 0; +@@ -688,8 +691,6 @@ static void *multifd_send_thread(void *opaque) + ret = -1; + goto out; + } +- /* initial packet */ +- p->num_packets = 1; + + while (true) { + qemu_sem_post(&multifd_send_state->channels_ready); +@@ -719,7 +720,6 @@ static void *multifd_send_thread(void *opaque) + } + + multifd_send_fill_packet(p); +- p->num_packets++; + p->total_normal_pages += pages->num; + trace_multifd_send(p->id, packet_num, pages->num, p->flags, + p->next_packet_size); +@@ -787,7 +787,7 @@ out: + + rcu_unregister_thread(); + migration_threads_remove(thread); +- trace_multifd_send_thread_end(p->id, p->num_packets, p->total_normal_pages); ++ trace_multifd_send_thread_end(p->id, p->packets_sent, p->total_normal_pages); + + return NULL; + } +@@ -1124,7 +1124,6 @@ static void *multifd_recv_thread(void *opaque) + p->flags &= ~MULTIFD_FLAG_SYNC; + trace_multifd_recv(p->id, p->packet_num, p->normal_num, flags, + p->next_packet_size); +- p->num_packets++; + p->total_normal_pages += p->normal_num; + qemu_mutex_unlock(&p->mutex); + +@@ -1150,7 +1149,7 @@ static void *multifd_recv_thread(void *opaque) + qemu_mutex_unlock(&p->mutex); + + rcu_unregister_thread(); +- trace_multifd_recv_thread_end(p->id, p->num_packets, p->total_normal_pages); ++ trace_multifd_recv_thread_end(p->id, p->packets_recved, p->total_normal_pages); + + return NULL; + } +@@ -1252,8 +1251,6 @@ void multifd_recv_new_channel(QIOChannel *ioc, Error **errp) + } + p->c = ioc; + object_ref(OBJECT(ioc)); +- /* initial packet */ +- p->num_packets = 1; + + p->running = true; + qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p, +diff --git a/migration/multifd.h b/migration/multifd.h +index 08f26ef3fe..2e4ad0dc56 100644 +--- a/migration/multifd.h ++++ b/migration/multifd.h +@@ -124,7 +124,7 @@ typedef struct { + /* size of the next packet 
that contains pages */ + uint32_t next_packet_size; + /* packets sent through this channel */ +- uint64_t num_packets; ++ uint64_t packets_sent; + /* non zero pages sent through this channel */ + uint64_t total_normal_pages; + /* buffers to send */ +@@ -174,8 +174,8 @@ typedef struct { + MultiFDPacket_t *packet; + /* size of the next packet that contains pages */ + uint32_t next_packet_size; +- /* packets sent through this channel */ +- uint64_t num_packets; ++ /* packets received through this channel */ ++ uint64_t packets_recved; + /* ramblock */ + RAMBlock *block; + /* ramblock host address */ +-- +2.43.0 + diff --git a/0388-migration-multifd-move-total-normal-pages-accounting.patch b/0388-migration-multifd-move-total-normal-pages-accounting.patch new file mode 100644 index 0000000..cde58c9 --- /dev/null +++ b/0388-migration-multifd-move-total-normal-pages-accounting.patch @@ -0,0 +1,59 @@ +From ed2465ec8687cdf13b3ce821d1f52dea786ffaa1 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Fri, 2 Feb 2024 18:28:44 +0800 +Subject: [PATCH] migration/multifd: Move total_normal_pages accounting + +commit db7e1cc5103137743394a939045a17fa2b30a0dc upstream. + +Just like the previous patch, move the accounting for total_normal_pages on +both src/dst sides into the packet fill/unfill procedures. + +Intel-SIG: commit db7e1cc51031 migration/multifd: Move total_normal_pages accounting + +Reviewed-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20240202102857.110210-11-peterx@redhat.com +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/multifd.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index eca76e2c18..94a0124934 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -290,6 +290,7 @@ static void multifd_send_fill_packet(MultiFDSendParams *p) + } + + p->packets_sent++; ++ p->total_normal_pages += pages->num; + } + + static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) +@@ -338,6 +339,7 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) + p->next_packet_size = be32_to_cpu(packet->next_packet_size); + p->packet_num = be64_to_cpu(packet->packet_num); + p->packets_recved++; ++ p->total_normal_pages += p->normal_num; + + if (p->normal_num == 0) { + return 0; +@@ -720,7 +722,6 @@ static void *multifd_send_thread(void *opaque) + } + + multifd_send_fill_packet(p); +- p->total_normal_pages += pages->num; + trace_multifd_send(p->id, packet_num, pages->num, p->flags, + p->next_packet_size); + +@@ -1124,7 +1125,6 @@ static void *multifd_recv_thread(void *opaque) + p->flags &= ~MULTIFD_FLAG_SYNC; + trace_multifd_recv(p->id, p->packet_num, p->normal_num, flags, + p->next_packet_size); +- p->total_normal_pages += p->normal_num; + qemu_mutex_unlock(&p->mutex); + + if (p->normal_num) { +-- +2.43.0 + diff --git a/0389-migration-multifd-move-trace-multifd-send-recv.patch b/0389-migration-multifd-move-trace-multifd-send-recv.patch new file mode 100644 index 0000000..a1f3651 --- /dev/null +++ b/0389-migration-multifd-move-trace-multifd-send-recv.patch @@ -0,0 +1,73 @@ +From 863fe85a04f3b1ecf91895803eb876d4a9d90a33 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Fri, 2 Feb 2024 18:28:45 +0800 +Subject: [PATCH] migration/multifd: Move trace_multifd_send|recv() + +commit 8a9ef1738037e1d1132f9e1bd3e2f1102bde719f upstream. + +Move them into fill/unfill of packets. With that, we can further cleanup +the send/recv thread procedure, and remove one more temp var. 
+ +Intel-SIG: commit 8a9ef1738037 migration/multifd: Move trace_multifd_send|recv() + +Reviewed-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20240202102857.110210-12-peterx@redhat.com +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/multifd.c | 11 ++++++----- + 1 file changed, 6 insertions(+), 5 deletions(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index 94a0124934..44163e4e28 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -291,6 +291,9 @@ static void multifd_send_fill_packet(MultiFDSendParams *p) + + p->packets_sent++; + p->total_normal_pages += pages->num; ++ ++ trace_multifd_send(p->id, p->packet_num, pages->num, p->flags, ++ p->next_packet_size); + } + + static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) +@@ -341,6 +344,9 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) + p->packets_recved++; + p->total_normal_pages += p->normal_num; + ++ trace_multifd_recv(p->id, p->packet_num, p->normal_num, p->flags, ++ p->next_packet_size); ++ + if (p->normal_num == 0) { + return 0; + } +@@ -704,7 +710,6 @@ static void *multifd_send_thread(void *opaque) + qemu_mutex_lock(&p->mutex); + + if (qatomic_read(&p->pending_job)) { +- uint64_t packet_num = p->packet_num; + MultiFDPages_t *pages = p->pages; + + if (use_zero_copy_send) { +@@ -722,8 +727,6 @@ static void *multifd_send_thread(void *opaque) + } + + multifd_send_fill_packet(p); +- trace_multifd_send(p->id, packet_num, pages->num, p->flags, +- p->next_packet_size); + + if (use_zero_copy_send) { + /* Send header first, without zerocopy */ +@@ -1123,8 +1126,6 @@ static void *multifd_recv_thread(void *opaque) + flags = p->flags; + /* recv methods don't know how to handle the SYNC flag */ + p->flags &= ~MULTIFD_FLAG_SYNC; +- trace_multifd_recv(p->id, p->packet_num, p->normal_num, flags, +- p->next_packet_size); + qemu_mutex_unlock(&p->mutex); + + if (p->normal_num) { +-- +2.43.0 + diff --git a/0390-migration-multifd-multifd-send-prepare-header.patch b/0390-migration-multifd-multifd-send-prepare-header.patch new file mode 100644 index 0000000..0ca3ac1 --- /dev/null +++ b/0390-migration-multifd-multifd-send-prepare-header.patch @@ -0,0 +1,84 @@ +From fa79734d8ab99dd6a96801c1bb550507694af2b6 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Fri, 2 Feb 2024 18:28:46 +0800 +Subject: [PATCH] migration/multifd: multifd_send_prepare_header() + +commit 452b205702335ddd45554aaf0eb37baf50bdfa00 upstream. + +Introduce a helper multifd_send_prepare_header() to setup the header packet +for multifd sender. + +It's fine to setup the IOV[0] _before_ send_prepare() because the packet +buffer is already ready, even if the content is to be filled in. + +With this helper, we can already slightly clean up the zero copy path. + +Note that I explicitly put it into multifd.h, because I want it inlined +directly into multifd*.c where necessary later. 
+ +Intel-SIG: commit 452b20570233 migration/multifd: multifd_send_prepare_header() + +Reviewed-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20240202102857.110210-13-peterx@redhat.com +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/multifd.c | 16 ++++++++-------- + migration/multifd.h | 8 ++++++++ + 2 files changed, 16 insertions(+), 8 deletions(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index 44163e4e28..cd4467aff4 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -712,10 +712,14 @@ static void *multifd_send_thread(void *opaque) + if (qatomic_read(&p->pending_job)) { + MultiFDPages_t *pages = p->pages; + +- if (use_zero_copy_send) { +- p->iovs_num = 0; +- } else { +- p->iovs_num = 1; ++ p->iovs_num = 0; ++ ++ if (!use_zero_copy_send) { ++ /* ++ * Only !zerocopy needs the header in IOV; zerocopy will ++ * send it separately. ++ */ ++ multifd_send_prepare_header(p); + } + + assert(pages->num); +@@ -735,10 +739,6 @@ static void *multifd_send_thread(void *opaque) + if (ret != 0) { + break; + } +- } else { +- /* Send header using the same writev call */ +- p->iov[0].iov_len = p->packet_len; +- p->iov[0].iov_base = p->packet; + } + + ret = qio_channel_writev_full_all(p->c, p->iov, p->iovs_num, NULL, +diff --git a/migration/multifd.h b/migration/multifd.h +index 2e4ad0dc56..4ec005f53f 100644 +--- a/migration/multifd.h ++++ b/migration/multifd.h +@@ -209,5 +209,13 @@ typedef struct { + + void multifd_register_ops(int method, MultiFDMethods *ops); + ++static inline void multifd_send_prepare_header(MultiFDSendParams *p) ++{ ++ p->iov[0].iov_len = p->packet_len; ++ p->iov[0].iov_base = p->packet; ++ p->iovs_num++; ++} ++ ++ + #endif + +-- +2.43.0 + diff --git a/0391-migration-multifd-move-header-prepare-fill-into-send.patch b/0391-migration-multifd-move-header-prepare-fill-into-send.patch new file mode 100644 index 0000000..995e921 --- /dev/null +++ b/0391-migration-multifd-move-header-prepare-fill-into-send.patch @@ -0,0 +1,229 @@ +From b69bacc9e2d6c2d9d5c9659cea5e03694cf4a562 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Fri, 2 Feb 2024 18:28:47 +0800 +Subject: [PATCH] migration/multifd: Move header prepare/fill into + send_prepare() + +commit 25a1f8787597f6906b151b2f73ae6cc92a31de57 upstream. + +This patch redefines the interfacing of ->send_prepare(). It further +simplifies multifd_send_thread() especially on zero copy. + +Now with the new interface, we require the hook to do all the work for +preparing the IOVs to send. After it's completed, the IOVs should be ready +to be dumped into the specific multifd QIOChannel later. + +So now the API looks like: + + p->pages -----------> send_prepare() -------------> IOVs + +This also prepares for the case where the input can be extended to even not +any p->pages. But that's for later. + +This patch will achieve similar goal of what Fabiano used to propose here: + +https://lore.kernel.org/r/20240126221943.26628-1-farosas@suse.de + +However the send() interface may not be necessary. I'm boldly attaching a +"Co-developed-by" for Fabiano. 
+ +Intel-SIG: commit 25a1f8787597 migration/multifd: Move header prepare/fill into send_prepare() + +Co-developed-by: Fabiano Rosas +Reviewed-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20240202102857.110210-14-peterx@redhat.com +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/multifd-zlib.c | 4 +++ + migration/multifd-zstd.c | 4 +++ + migration/multifd.c | 61 ++++++++++++++++++---------------------- + migration/multifd.h | 1 + + 4 files changed, 37 insertions(+), 33 deletions(-) + +diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c +index 100809abc1..012e3bdea1 100644 +--- a/migration/multifd-zlib.c ++++ b/migration/multifd-zlib.c +@@ -123,6 +123,8 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) + int ret; + uint32_t i; + ++ multifd_send_prepare_header(p); ++ + for (i = 0; i < pages->num; i++) { + uint32_t available = z->zbuff_len - out_size; + int flush = Z_NO_FLUSH; +@@ -172,6 +174,8 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) + p->next_packet_size = out_size; + p->flags |= MULTIFD_FLAG_ZLIB; + ++ multifd_send_fill_packet(p); ++ + return 0; + } + +diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c +index 2023edd8cc..dc8fe43e94 100644 +--- a/migration/multifd-zstd.c ++++ b/migration/multifd-zstd.c +@@ -118,6 +118,8 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) + int ret; + uint32_t i; + ++ multifd_send_prepare_header(p); ++ + z->out.dst = z->zbuff; + z->out.size = z->zbuff_len; + z->out.pos = 0; +@@ -161,6 +163,8 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) + p->next_packet_size = z->out.pos; + p->flags |= MULTIFD_FLAG_ZSTD; + ++ multifd_send_fill_packet(p); ++ + return 0; + } + +diff --git a/migration/multifd.c b/migration/multifd.c +index cd4467aff4..6aa44340de 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -50,15 +50,15 @@ typedef struct { + /** + * nocomp_send_setup: setup send side + * +- * For no compression this function does nothing. +- * +- * Returns 0 for success or -1 for error +- * + * @p: Params for the channel that we are using + * @errp: pointer to an error + */ + static int nocomp_send_setup(MultiFDSendParams *p, Error **errp) + { ++ if (migrate_zero_copy_send()) { ++ p->write_flags |= QIO_CHANNEL_WRITE_FLAG_ZERO_COPY; ++ } ++ + return 0; + } + +@@ -88,7 +88,17 @@ static void nocomp_send_cleanup(MultiFDSendParams *p, Error **errp) + */ + static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp) + { ++ bool use_zero_copy_send = migrate_zero_copy_send(); + MultiFDPages_t *pages = p->pages; ++ int ret; ++ ++ if (!use_zero_copy_send) { ++ /* ++ * Only !zerocopy needs the header in IOV; zerocopy will ++ * send it separately. 
++ */ ++ multifd_send_prepare_header(p); ++ } + + for (int i = 0; i < pages->num; i++) { + p->iov[p->iovs_num].iov_base = pages->block->host + pages->offset[i]; +@@ -98,6 +108,18 @@ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp) + + p->next_packet_size = pages->num * p->page_size; + p->flags |= MULTIFD_FLAG_NOCOMP; ++ ++ multifd_send_fill_packet(p); ++ ++ if (use_zero_copy_send) { ++ /* Send header first, without zerocopy */ ++ ret = qio_channel_write_all(p->c, (void *)p->packet, ++ p->packet_len, errp); ++ if (ret != 0) { ++ return -1; ++ } ++ } ++ + return 0; + } + +@@ -266,7 +288,7 @@ static void multifd_pages_clear(MultiFDPages_t *pages) + g_free(pages); + } + +-static void multifd_send_fill_packet(MultiFDSendParams *p) ++void multifd_send_fill_packet(MultiFDSendParams *p) + { + MultiFDPacket_t *packet = p->packet; + MultiFDPages_t *pages = p->pages; +@@ -688,7 +710,6 @@ static void *multifd_send_thread(void *opaque) + MigrationThread *thread = NULL; + Error *local_err = NULL; + int ret = 0; +- bool use_zero_copy_send = migrate_zero_copy_send(); + + thread = migration_threads_add(p->name, qemu_get_thread_id()); + +@@ -713,15 +734,6 @@ static void *multifd_send_thread(void *opaque) + MultiFDPages_t *pages = p->pages; + + p->iovs_num = 0; +- +- if (!use_zero_copy_send) { +- /* +- * Only !zerocopy needs the header in IOV; zerocopy will +- * send it separately. +- */ +- multifd_send_prepare_header(p); +- } +- + assert(pages->num); + + ret = multifd_send_state->ops->send_prepare(p, &local_err); +@@ -730,17 +742,6 @@ static void *multifd_send_thread(void *opaque) + break; + } + +- multifd_send_fill_packet(p); +- +- if (use_zero_copy_send) { +- /* Send header first, without zerocopy */ +- ret = qio_channel_write_all(p->c, (void *)p->packet, +- p->packet_len, &local_err); +- if (ret != 0) { +- break; +- } +- } +- + ret = qio_channel_writev_full_all(p->c, p->iov, p->iovs_num, NULL, + 0, p->write_flags, &local_err); + if (ret != 0) { +@@ -945,13 +946,7 @@ int multifd_save_setup(Error **errp) + p->iov = g_new0(struct iovec, page_count + 1); + p->page_size = qemu_target_page_size(); + p->page_count = page_count; +- +- if (migrate_zero_copy_send()) { +- p->write_flags = QIO_CHANNEL_WRITE_FLAG_ZERO_COPY; +- } else { +- p->write_flags = 0; +- } +- ++ p->write_flags = 0; + multifd_new_send_channel_create(p); + } + +diff --git a/migration/multifd.h b/migration/multifd.h +index 4ec005f53f..34a2ecb9f4 100644 +--- a/migration/multifd.h ++++ b/migration/multifd.h +@@ -208,6 +208,7 @@ typedef struct { + } MultiFDMethods; + + void multifd_register_ops(int method, MultiFDMethods *ops); ++void multifd_send_fill_packet(MultiFDSendParams *p); + + static inline void multifd_send_prepare_header(MultiFDSendParams *p) + { +-- +2.43.0 + diff --git a/0392-migration-multifd-forbid-spurious-wakeups.patch b/0392-migration-multifd-forbid-spurious-wakeups.patch new file mode 100644 index 0000000..597e04b --- /dev/null +++ b/0392-migration-multifd-forbid-spurious-wakeups.patch @@ -0,0 +1,53 @@ +From 88febde3da24bdafa3d603cf678611fa14bfd0e9 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Fri, 2 Feb 2024 18:28:48 +0800 +Subject: [PATCH] migration/multifd: Forbid spurious wakeups + +commit 859ebaf346e8b5dece6cf255c604fe953d8ec9ab upstream. + +Now multifd's logic is designed to have no spurious wakeup. I still +remember a talk to Juan and he seems to agree we should drop it now, and if +my memory was right it was there because multifd used to hit that when +still debugging. 
+ +Let's drop it and see what can explode; as long as it's not reaching +soft-freeze. + +Intel-SIG: commit 859ebaf346e8 migration/multifd: Forbid spurious wakeups + +Reviewed-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20240202102857.110210-15-peterx@redhat.com +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/multifd.c | 7 +++---- + 1 file changed, 3 insertions(+), 4 deletions(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index 6aa44340de..28b54100cd 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -756,7 +756,9 @@ static void *multifd_send_thread(void *opaque) + p->next_packet_size = 0; + qatomic_set(&p->pending_job, false); + qemu_mutex_unlock(&p->mutex); +- } else if (qatomic_read(&p->pending_sync)) { ++ } else { ++ /* If not a normal job, must be a sync request */ ++ assert(qatomic_read(&p->pending_sync)); + p->flags = MULTIFD_FLAG_SYNC; + multifd_send_fill_packet(p); + ret = qio_channel_write_all(p->c, (void *)p->packet, +@@ -771,9 +773,6 @@ static void *multifd_send_thread(void *opaque) + qatomic_set(&p->pending_sync, false); + qemu_mutex_unlock(&p->mutex); + qemu_sem_post(&p->sem_sync); +- } else { +- qemu_mutex_unlock(&p->mutex); +- /* sometimes there are spurious wakeups */ + } + } + +-- +2.43.0 + diff --git a/0393-migration-multifd-split-multifd-send-terminate-threa.patch b/0393-migration-multifd-split-multifd-send-terminate-threa.patch new file mode 100644 index 0000000..851c205 --- /dev/null +++ b/0393-migration-multifd-split-multifd-send-terminate-threa.patch @@ -0,0 +1,133 @@ +From 1ccb3e5c3229fcde7e1d3b16fa3d8a0b1c704d45 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Fri, 2 Feb 2024 18:28:49 +0800 +Subject: [PATCH] migration/multifd: Split multifd_send_terminate_threads() + +commit 3ab4441d97af59ea09ee015d68c4770704b2b34f upstream. + +Split multifd_send_terminate_threads() into two functions: + + - multifd_send_set_error(): used when an error happened on the sender + side, set error and quit state only + + - multifd_send_terminate_threads(): used only by the main thread to kick + all multifd send threads out of sleep, for the last recycling. + +Use multifd_send_set_error() in the three old call sites where only the +error will be set. + +Use multifd_send_terminate_threads() in the last one where the main thread +will kick the multifd threads at last in multifd_save_cleanup(). + +Both helpers will need to set quitting=1. + +Intel-SIG: commit 3ab4441d97af migration/multifd: Split multifd_send_terminate_threads() + +Suggested-by: Fabiano Rosas +Reviewed-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20240202102857.110210-16-peterx@redhat.com +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/multifd.c | 27 ++++++++++++++++++--------- + migration/trace-events | 2 +- + 2 files changed, 19 insertions(+), 10 deletions(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index 28b54100cd..ba86f9dda5 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -536,10 +536,9 @@ int multifd_queue_page(RAMBlock *block, ram_addr_t offset) + return 1; + } + +-static void multifd_send_terminate_threads(Error *err) ++/* Multifd send side hit an error; remember it and prepare to quit */ ++static void multifd_send_set_error(Error *err) + { +- int i; +- + /* + * We don't want to exit each threads twice. 
Depending on where + * we get the error, or if there are two independent errors in two +@@ -550,8 +549,6 @@ static void multifd_send_terminate_threads(Error *err) + return; + } + +- trace_multifd_send_terminate_threads(err != NULL); +- + if (err) { + MigrationState *s = migrate_get_current(); + migrate_set_error(s, err); +@@ -563,7 +560,19 @@ static void multifd_send_terminate_threads(Error *err) + MIGRATION_STATUS_FAILED); + } + } ++} ++ ++static void multifd_send_terminate_threads(void) ++{ ++ int i; ++ ++ trace_multifd_send_terminate_threads(); + ++ /* ++ * Tell everyone we're quitting. No xchg() needed here; we simply ++ * always set it. ++ */ ++ qatomic_set(&multifd_send_state->exiting, 1); + for (i = 0; i < migrate_multifd_channels(); i++) { + MultiFDSendParams *p = &multifd_send_state->params[i]; + +@@ -586,7 +595,7 @@ void multifd_save_cleanup(void) + if (!migrate_multifd()) { + return; + } +- multifd_send_terminate_threads(NULL); ++ multifd_send_terminate_threads(); + for (i = 0; i < migrate_multifd_channels(); i++) { + MultiFDSendParams *p = &multifd_send_state->params[i]; + +@@ -780,7 +789,7 @@ out: + if (ret) { + assert(local_err); + trace_multifd_send_error(p->id); +- multifd_send_terminate_threads(local_err); ++ multifd_send_set_error(local_err); + multifd_send_kick_main(p); + error_free(local_err); + } +@@ -816,7 +825,7 @@ static void multifd_tls_outgoing_handshake(QIOTask *task, + + trace_multifd_tls_outgoing_handshake_error(ioc, error_get_pretty(err)); + +- multifd_send_terminate_threads(err); ++ multifd_send_set_error(err); + multifd_send_kick_main(p); + error_free(err); + } +@@ -898,7 +907,7 @@ static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque) + } + + trace_multifd_new_send_channel_async_error(p->id, local_err); +- multifd_send_terminate_threads(local_err); ++ multifd_send_set_error(local_err); + multifd_send_kick_main(p); + object_unref(OBJECT(ioc)); + error_free(local_err); +diff --git a/migration/trace-events b/migration/trace-events +index de4a743c8a..298ad2b0dd 100644 +--- a/migration/trace-events ++++ b/migration/trace-events +@@ -141,7 +141,7 @@ multifd_send_error(uint8_t id) "channel %u" + multifd_send_sync_main(long packet_num) "packet num %ld" + multifd_send_sync_main_signal(uint8_t id) "channel %u" + multifd_send_sync_main_wait(uint8_t id) "channel %u" +-multifd_send_terminate_threads(bool error) "error %d" ++multifd_send_terminate_threads(void) "" + multifd_send_thread_end(uint8_t id, uint64_t packets, uint64_t normal_pages) "channel %u packets %" PRIu64 " normal pages %" PRIu64 + multifd_send_thread_start(uint8_t id) "%u" + multifd_tls_outgoing_handshake_start(void *ioc, void *tioc, const char *hostname) "ioc=%p tioc=%p hostname=%s" +-- +2.43.0 + diff --git a/0394-migration-multifd-change-retval-of-multifd-queue-pag.patch b/0394-migration-multifd-change-retval-of-multifd-queue-pag.patch new file mode 100644 index 0000000..9bcc3a1 --- /dev/null +++ b/0394-migration-multifd-change-retval-of-multifd-queue-pag.patch @@ -0,0 +1,90 @@ +From 7a3062b1dd86de6a03e48317105bcd5b1e977205 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Fri, 2 Feb 2024 18:28:50 +0800 +Subject: [PATCH] migration/multifd: Change retval of multifd_queue_page() + +commit d6556d174a6b9fc443f2320193f18e71eb67052a upstream. + +Using int is an overkill when there're only two options. Change it to a +boolean. 
+ +Intel-SIG: commit d6556d174a6b migration/multifd: Change retval of multifd_queue_page() + +Reviewed-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20240202102857.110210-17-peterx@redhat.com +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/multifd.c | 9 +++++---- + migration/multifd.h | 2 +- + migration/ram.c | 2 +- + 3 files changed, 7 insertions(+), 6 deletions(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index ba86f9dda5..12e587fda8 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -505,7 +505,8 @@ static int multifd_send_pages(void) + return 1; + } + +-int multifd_queue_page(RAMBlock *block, ram_addr_t offset) ++/* Returns true if enqueue successful, false otherwise */ ++bool multifd_queue_page(RAMBlock *block, ram_addr_t offset) + { + MultiFDPages_t *pages = multifd_send_state->pages; + bool changed = false; +@@ -519,21 +520,21 @@ int multifd_queue_page(RAMBlock *block, ram_addr_t offset) + pages->num++; + + if (pages->num < pages->allocated) { +- return 1; ++ return true; + } + } else { + changed = true; + } + + if (multifd_send_pages() < 0) { +- return -1; ++ return false; + } + + if (changed) { + return multifd_queue_page(block, offset); + } + +- return 1; ++ return true; + } + + /* Multifd send side hit an error; remember it and prepare to quit */ +diff --git a/migration/multifd.h b/migration/multifd.h +index 34a2ecb9f4..a320c53a6f 100644 +--- a/migration/multifd.h ++++ b/migration/multifd.h +@@ -22,7 +22,7 @@ bool multifd_recv_all_channels_created(void); + void multifd_recv_new_channel(QIOChannel *ioc, Error **errp); + void multifd_recv_sync_main(void); + int multifd_send_sync_main(void); +-int multifd_queue_page(RAMBlock *block, ram_addr_t offset); ++bool multifd_queue_page(RAMBlock *block, ram_addr_t offset); + + /* Multifd Compression flags */ + #define MULTIFD_FLAG_SYNC (1 << 0) +diff --git a/migration/ram.c b/migration/ram.c +index 4706dcda7d..9d17628100 100644 +--- a/migration/ram.c ++++ b/migration/ram.c +@@ -1386,7 +1386,7 @@ static int ram_save_page(RAMState *rs, PageSearchStatus *pss) + + static int ram_save_multifd_page(RAMBlock *block, ram_addr_t offset) + { +- if (multifd_queue_page(block, offset) < 0) { ++ if (!multifd_queue_page(block, offset)) { + return -1; + } + stat64_add(&mig_stats.normal_pages, 1); +-- +2.43.0 + diff --git a/0395-migration-multifd-change-retval-of-multifd-send-page.patch b/0395-migration-multifd-change-retval-of-multifd-send-page.patch new file mode 100644 index 0000000..1bbbc82 --- /dev/null +++ b/0395-migration-multifd-change-retval-of-multifd-send-page.patch @@ -0,0 +1,85 @@ +From 8732ea793741e572d978e2373c22c42f0a080cfc Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Fri, 2 Feb 2024 18:28:51 +0800 +Subject: [PATCH] migration/multifd: Change retval of multifd_send_pages() + +commit 3b40964a863d69121733c8b9794a02347ed0000b upstream. + +Using int is an overkill when there're only two options. Change it to a +boolean. 
+ +Intel-SIG: commit 3b40964a863d migration/multifd: Change retval of multifd_send_pages() + +Reviewed-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20240202102857.110210-18-peterx@redhat.com +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/multifd.c | 15 ++++++++------- + 1 file changed, 8 insertions(+), 7 deletions(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index 12e587fda8..35d4e8ad1f 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -449,9 +449,10 @@ static void multifd_send_kick_main(MultiFDSendParams *p) + * thread is using the channel mutex when changing it, and the channel + * have to had finish with its own, otherwise pending_job can't be + * false. ++ * ++ * Returns true if succeed, false otherwise. + */ +- +-static int multifd_send_pages(void) ++static bool multifd_send_pages(void) + { + int i; + static int next_channel; +@@ -459,7 +460,7 @@ static int multifd_send_pages(void) + MultiFDPages_t *pages = multifd_send_state->pages; + + if (multifd_send_should_exit()) { +- return -1; ++ return false; + } + + /* We wait here, until at least one channel is ready */ +@@ -473,7 +474,7 @@ static int multifd_send_pages(void) + next_channel %= migrate_multifd_channels(); + for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) { + if (multifd_send_should_exit()) { +- return -1; ++ return false; + } + p = &multifd_send_state->params[i]; + /* +@@ -502,7 +503,7 @@ static int multifd_send_pages(void) + qemu_mutex_unlock(&p->mutex); + qemu_sem_post(&p->sem); + +- return 1; ++ return true; + } + + /* Returns true if enqueue successful, false otherwise */ +@@ -526,7 +527,7 @@ bool multifd_queue_page(RAMBlock *block, ram_addr_t offset) + changed = true; + } + +- if (multifd_send_pages() < 0) { ++ if (!multifd_send_pages()) { + return false; + } + +@@ -666,7 +667,7 @@ int multifd_send_sync_main(void) + return 0; + } + if (multifd_send_state->pages->num) { +- if (multifd_send_pages() < 0) { ++ if (!multifd_send_pages()) { + error_report("%s: multifd_send_pages fail", __func__); + return -1; + } +-- +2.43.0 + diff --git a/0396-migration-multifd-rewrite-multifd-queue-page.patch b/0396-migration-multifd-rewrite-multifd-queue-page.patch new file mode 100644 index 0000000..43aff4c --- /dev/null +++ b/0396-migration-multifd-rewrite-multifd-queue-page.patch @@ -0,0 +1,114 @@ +From 2a25c76aa24e0c9b89977ba524dcc7632117bae6 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Fri, 2 Feb 2024 18:28:52 +0800 +Subject: [PATCH] migration/multifd: Rewrite multifd_queue_page() + +commit f88f86c4ee3fe673b34873e27af2de0a16fe01fd upstream. + +The current multifd_queue_page() is not easy to read and follow. It is not +good with a few reasons: + + - No helper at all to show what exactly does a condition mean; in short, + readability is low. + + - Rely on pages->ramblock being cleared to detect an empty queue. It's + slightly an overload of the ramblock pointer, per Fabiano [1], which I + also agree. + + - Contains a self recursion, even if not necessary.. + +Rewrite this function. We add some comments to make it even clearer on +what it does. 
+ +[1] https://lore.kernel.org/r/87wmrpjzew.fsf@suse.de + +Intel-SIG: commit f88f86c4ee3f migration/multifd: Rewrite multifd_queue_page() + +Reviewed-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20240202102857.110210-19-peterx@redhat.com +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/multifd.c | 56 ++++++++++++++++++++++++++++++--------------- + 1 file changed, 37 insertions(+), 19 deletions(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index 35d4e8ad1f..4ab8e6eff2 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -506,35 +506,53 @@ static bool multifd_send_pages(void) + return true; + } + ++static inline bool multifd_queue_empty(MultiFDPages_t *pages) ++{ ++ return pages->num == 0; ++} ++ ++static inline bool multifd_queue_full(MultiFDPages_t *pages) ++{ ++ return pages->num == pages->allocated; ++} ++ ++static inline void multifd_enqueue(MultiFDPages_t *pages, ram_addr_t offset) ++{ ++ pages->offset[pages->num++] = offset; ++} ++ + /* Returns true if enqueue successful, false otherwise */ + bool multifd_queue_page(RAMBlock *block, ram_addr_t offset) + { +- MultiFDPages_t *pages = multifd_send_state->pages; +- bool changed = false; ++ MultiFDPages_t *pages; ++ ++retry: ++ pages = multifd_send_state->pages; + +- if (!pages->block) { ++ /* If the queue is empty, we can already enqueue now */ ++ if (multifd_queue_empty(pages)) { + pages->block = block; ++ multifd_enqueue(pages, offset); ++ return true; + } + +- if (pages->block == block) { +- pages->offset[pages->num] = offset; +- pages->num++; +- +- if (pages->num < pages->allocated) { +- return true; ++ /* ++ * Not empty, meanwhile we need a flush. It can because of either: ++ * ++ * (1) The page is not on the same ramblock of previous ones, or, ++ * (2) The queue is full. ++ * ++ * After flush, always retry. ++ */ ++ if (pages->block != block || multifd_queue_full(pages)) { ++ if (!multifd_send_pages()) { ++ return false; + } +- } else { +- changed = true; +- } +- +- if (!multifd_send_pages()) { +- return false; +- } +- +- if (changed) { +- return multifd_queue_page(block, offset); ++ goto retry; + } + ++ /* Not empty, and we still have space, do it! */ ++ multifd_enqueue(pages, offset); + return true; + } + +-- +2.43.0 + diff --git a/0397-migration-multifd-cleanup-multifd-save-cleanup.patch b/0397-migration-multifd-cleanup-multifd-save-cleanup.patch new file mode 100644 index 0000000..6e85d13 --- /dev/null +++ b/0397-migration-multifd-cleanup-multifd-save-cleanup.patch @@ -0,0 +1,161 @@ +From e5d8660ef831f8c6217ccadba32c71722a06e593 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Fri, 2 Feb 2024 18:28:53 +0800 +Subject: [PATCH] migration/multifd: Cleanup multifd_save_cleanup() + +commit 12808db3b8c22d26c9bc3da6f41756890ce882e4 upstream. + +Shrink the function by moving relevant works into helpers: move the thread +join()s into multifd_send_terminate_threads(), then create two more helpers +to cover channel/state cleanups. + +Add a TODO entry for the thread terminate process because p->running is +still buggy. We need to fix it at some point but not yet covered. 
+ +Intel-SIG: commit 12808db3b8c2 migration/multifd: Cleanup multifd_save_cleanup() + +Suggested-by: Fabiano Rosas +Reviewed-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20240202102857.110210-20-peterx@redhat.com +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/multifd.c | 91 +++++++++++++++++++++++++++++---------------- + 1 file changed, 59 insertions(+), 32 deletions(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index 4ab8e6eff2..4cb0d2cc17 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -593,6 +593,11 @@ static void multifd_send_terminate_threads(void) + * always set it. + */ + qatomic_set(&multifd_send_state->exiting, 1); ++ ++ /* ++ * Firstly, kick all threads out; no matter whether they are just idle, ++ * or blocked in an IO system call. ++ */ + for (i = 0; i < migrate_multifd_channels(); i++) { + MultiFDSendParams *p = &multifd_send_state->params[i]; + +@@ -601,6 +606,21 @@ static void multifd_send_terminate_threads(void) + qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL); + } + } ++ ++ /* ++ * Finally recycle all the threads. ++ * ++ * TODO: p->running is still buggy, e.g. we can reach here without the ++ * corresponding multifd_new_send_channel_async() get invoked yet, ++ * then a new thread can even be created after this function returns. ++ */ ++ for (i = 0; i < migrate_multifd_channels(); i++) { ++ MultiFDSendParams *p = &multifd_send_state->params[i]; ++ ++ if (p->running) { ++ qemu_thread_join(&p->thread); ++ } ++ } + } + + static int multifd_send_channel_destroy(QIOChannel *send) +@@ -608,6 +628,41 @@ static int multifd_send_channel_destroy(QIOChannel *send) + return socket_send_channel_destroy(send); + } + ++static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) ++{ ++ if (p->registered_yank) { ++ migration_ioc_unregister_yank(p->c); ++ } ++ multifd_send_channel_destroy(p->c); ++ p->c = NULL; ++ qemu_mutex_destroy(&p->mutex); ++ qemu_sem_destroy(&p->sem); ++ qemu_sem_destroy(&p->sem_sync); ++ g_free(p->name); ++ p->name = NULL; ++ multifd_pages_clear(p->pages); ++ p->pages = NULL; ++ p->packet_len = 0; ++ g_free(p->packet); ++ p->packet = NULL; ++ g_free(p->iov); ++ p->iov = NULL; ++ multifd_send_state->ops->send_cleanup(p, errp); ++ ++ return *errp == NULL; ++} ++ ++static void multifd_send_cleanup_state(void) ++{ ++ qemu_sem_destroy(&multifd_send_state->channels_ready); ++ g_free(multifd_send_state->params); ++ multifd_send_state->params = NULL; ++ multifd_pages_clear(multifd_send_state->pages); ++ multifd_send_state->pages = NULL; ++ g_free(multifd_send_state); ++ multifd_send_state = NULL; ++} ++ + void multifd_save_cleanup(void) + { + int i; +@@ -615,48 +670,20 @@ void multifd_save_cleanup(void) + if (!migrate_multifd()) { + return; + } ++ + multifd_send_terminate_threads(); +- for (i = 0; i < migrate_multifd_channels(); i++) { +- MultiFDSendParams *p = &multifd_send_state->params[i]; + +- if (p->running) { +- qemu_thread_join(&p->thread); +- } +- } + for (i = 0; i < migrate_multifd_channels(); i++) { + MultiFDSendParams *p = &multifd_send_state->params[i]; + Error *local_err = NULL; + +- if (p->registered_yank) { +- migration_ioc_unregister_yank(p->c); +- } +- multifd_send_channel_destroy(p->c); +- p->c = NULL; +- qemu_mutex_destroy(&p->mutex); +- qemu_sem_destroy(&p->sem); +- qemu_sem_destroy(&p->sem_sync); +- g_free(p->name); +- p->name = NULL; +- multifd_pages_clear(p->pages); +- p->pages = NULL; +- p->packet_len = 0; +- g_free(p->packet); +- p->packet = NULL; +- 
g_free(p->iov); +- p->iov = NULL; +- multifd_send_state->ops->send_cleanup(p, &local_err); +- if (local_err) { ++ if (!multifd_send_cleanup_channel(p, &local_err)) { + migrate_set_error(migrate_get_current(), local_err); + error_free(local_err); + } + } +- qemu_sem_destroy(&multifd_send_state->channels_ready); +- g_free(multifd_send_state->params); +- multifd_send_state->params = NULL; +- multifd_pages_clear(multifd_send_state->pages); +- multifd_send_state->pages = NULL; +- g_free(multifd_send_state); +- multifd_send_state = NULL; ++ ++ multifd_send_cleanup_state(); + } + + static int multifd_zero_copy_flush(QIOChannel *c) +-- +2.43.0 + diff --git a/0398-migration-multifd-cleanup-multifd-load-cleanup.patch b/0398-migration-multifd-cleanup-multifd-load-cleanup.patch new file mode 100644 index 0000000..7e6e9d6 --- /dev/null +++ b/0398-migration-multifd-cleanup-multifd-load-cleanup.patch @@ -0,0 +1,96 @@ +From 93214c188f5aea75e5b3a4b168436473a831b6bf Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Fri, 2 Feb 2024 18:28:54 +0800 +Subject: [PATCH] migration/multifd: Cleanup multifd_load_cleanup() + +commit 5e6ea8a1d64e72e648b5a5277f08ec7fb09c3b8e upstream. + +Use similar logic to cleanup the recv side. + +Note that multifd_recv_terminate_threads() may need some similar rework +like the sender side, but let's leave that for later. + +Intel-SIG: commit 5e6ea8a1d64e migration/multifd: Cleanup multifd_load_cleanup() + +Reviewed-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20240202102857.110210-21-peterx@redhat.com +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/multifd.c | 52 ++++++++++++++++++++++++++------------------- + 1 file changed, 30 insertions(+), 22 deletions(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index 4cb0d2cc17..e2dd2f6e04 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -1070,6 +1070,34 @@ void multifd_load_shutdown(void) + } + } + ++static void multifd_recv_cleanup_channel(MultiFDRecvParams *p) ++{ ++ migration_ioc_unregister_yank(p->c); ++ object_unref(OBJECT(p->c)); ++ p->c = NULL; ++ qemu_mutex_destroy(&p->mutex); ++ qemu_sem_destroy(&p->sem_sync); ++ g_free(p->name); ++ p->name = NULL; ++ p->packet_len = 0; ++ g_free(p->packet); ++ p->packet = NULL; ++ g_free(p->iov); ++ p->iov = NULL; ++ g_free(p->normal); ++ p->normal = NULL; ++ multifd_recv_state->ops->recv_cleanup(p); ++} ++ ++static void multifd_recv_cleanup_state(void) ++{ ++ qemu_sem_destroy(&multifd_recv_state->sem_sync); ++ g_free(multifd_recv_state->params); ++ multifd_recv_state->params = NULL; ++ g_free(multifd_recv_state); ++ multifd_recv_state = NULL; ++} ++ + void multifd_load_cleanup(void) + { + int i; +@@ -1092,29 +1120,9 @@ void multifd_load_cleanup(void) + qemu_thread_join(&p->thread); + } + for (i = 0; i < migrate_multifd_channels(); i++) { +- MultiFDRecvParams *p = &multifd_recv_state->params[i]; +- +- migration_ioc_unregister_yank(p->c); +- object_unref(OBJECT(p->c)); +- p->c = NULL; +- qemu_mutex_destroy(&p->mutex); +- qemu_sem_destroy(&p->sem_sync); +- g_free(p->name); +- p->name = NULL; +- p->packet_len = 0; +- g_free(p->packet); +- p->packet = NULL; +- g_free(p->iov); +- p->iov = NULL; +- g_free(p->normal); +- p->normal = NULL; +- multifd_recv_state->ops->recv_cleanup(p); ++ multifd_recv_cleanup_channel(&multifd_recv_state->params[i]); + } +- qemu_sem_destroy(&multifd_recv_state->sem_sync); +- g_free(multifd_recv_state->params); +- multifd_recv_state->params = NULL; +- g_free(multifd_recv_state); +- multifd_recv_state = NULL; ++ 
multifd_recv_cleanup_state(); + } + + void multifd_recv_sync_main(void) +-- +2.43.0 + diff --git a/0399-migration-multifd-stick-with-send-recv-on-function-n.patch b/0399-migration-multifd-stick-with-send-recv-on-function-n.patch new file mode 100644 index 0000000..ec077bc --- /dev/null +++ b/0399-migration-multifd-stick-with-send-recv-on-function-n.patch @@ -0,0 +1,156 @@ +From bee6f0078bc23a74df7be48763049942ca3024ae Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Fri, 2 Feb 2024 18:28:55 +0800 +Subject: [PATCH] migration/multifd: Stick with send/recv on function names + +commit cde85c37ca54e4a2dbee8653181938499887f6be upstream. + +Most of the multifd code uses send/recv to represent the two sides, but +some rare cases use save/load. + +Since send/recv is the majority, replacing the save/load use cases to use +send/recv globally. Now we reach a consensus on the naming. + +Intel-SIG: commit cde85c37ca54 migration/multifd: Stick with send/recv on function names + +Reviewed-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20240202102857.110210-22-peterx@redhat.com +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/migration.c | 12 ++++++------ + migration/multifd.c | 10 +++++----- + migration/multifd.h | 10 +++++----- + 3 files changed, 16 insertions(+), 16 deletions(-) + +diff --git a/migration/migration.c b/migration/migration.c +index 699ba0c834..0765b515ea 100644 +--- a/migration/migration.c ++++ b/migration/migration.c +@@ -276,7 +276,7 @@ void migration_incoming_state_destroy(void) + { + struct MigrationIncomingState *mis = migration_incoming_get_current(); + +- multifd_load_cleanup(); ++ multifd_recv_cleanup(); + compress_threads_load_cleanup(); + + if (mis->to_src_file) { +@@ -629,7 +629,7 @@ static void process_incoming_migration_bh(void *opaque) + + trace_vmstate_downtime_checkpoint("dst-precopy-bh-announced"); + +- multifd_load_shutdown(); ++ multifd_recv_shutdown(); + + dirty_bitmap_mig_before_vm_start(); + +@@ -730,7 +730,7 @@ fail: + MIGRATION_STATUS_FAILED); + qemu_fclose(mis->from_src_file); + +- multifd_load_cleanup(); ++ multifd_recv_cleanup(); + compress_threads_load_cleanup(); + + exit(EXIT_FAILURE); +@@ -863,7 +863,7 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp) + default_channel = !mis->from_src_file; + } + +- if (multifd_load_setup(errp) != 0) { ++ if (multifd_recv_setup(errp) != 0) { + return; + } + +@@ -1315,7 +1315,7 @@ static void migrate_fd_cleanup(MigrationState *s) + } + qemu_mutex_lock_iothread(); + +- multifd_save_cleanup(); ++ multifd_send_shutdown(); + qemu_mutex_lock(&s->qemu_file_lock); + tmp = s->to_dst_file; + s->to_dst_file = NULL; +@@ -3650,7 +3650,7 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) + return; + } + +- if (multifd_save_setup(&local_err) != 0) { ++ if (multifd_send_setup(&local_err) != 0) { + migrate_set_error(s, local_err); + error_report_err(local_err); + migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, +diff --git a/migration/multifd.c b/migration/multifd.c +index e2dd2f6e04..130f86a1fb 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -663,7 +663,7 @@ static void multifd_send_cleanup_state(void) + multifd_send_state = NULL; + } + +-void multifd_save_cleanup(void) ++void multifd_send_shutdown(void) + { + int i; + +@@ -965,7 +965,7 @@ static void multifd_new_send_channel_create(gpointer opaque) + socket_send_channel_create(multifd_new_send_channel_async, opaque); + } + +-int multifd_save_setup(Error **errp) ++int multifd_send_setup(Error **errp) + { + int 
thread_count; + uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size(); +@@ -1063,7 +1063,7 @@ static void multifd_recv_terminate_threads(Error *err) + } + } + +-void multifd_load_shutdown(void) ++void multifd_recv_shutdown(void) + { + if (migrate_multifd()) { + multifd_recv_terminate_threads(NULL); +@@ -1098,7 +1098,7 @@ static void multifd_recv_cleanup_state(void) + multifd_recv_state = NULL; + } + +-void multifd_load_cleanup(void) ++void multifd_recv_cleanup(void) + { + int i; + +@@ -1213,7 +1213,7 @@ static void *multifd_recv_thread(void *opaque) + return NULL; + } + +-int multifd_load_setup(Error **errp) ++int multifd_recv_setup(Error **errp) + { + int thread_count; + uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size(); +diff --git a/migration/multifd.h b/migration/multifd.h +index a320c53a6f..9b40a53cb6 100644 +--- a/migration/multifd.h ++++ b/migration/multifd.h +@@ -13,11 +13,11 @@ + #ifndef QEMU_MIGRATION_MULTIFD_H + #define QEMU_MIGRATION_MULTIFD_H + +-int multifd_save_setup(Error **errp); +-void multifd_save_cleanup(void); +-int multifd_load_setup(Error **errp); +-void multifd_load_cleanup(void); +-void multifd_load_shutdown(void); ++int multifd_send_setup(Error **errp); ++void multifd_send_shutdown(void); ++int multifd_recv_setup(Error **errp); ++void multifd_recv_cleanup(void); ++void multifd_recv_shutdown(void); + bool multifd_recv_all_channels_created(void); + void multifd_recv_new_channel(QIOChannel *ioc, Error **errp); + void multifd_recv_sync_main(void); +-- +2.43.0 + diff --git a/0400-migration-multifd-fix-multifdsendparams-packet-num-r.patch b/0400-migration-multifd-fix-multifdsendparams-packet-num-r.patch new file mode 100644 index 0000000..49b3dff --- /dev/null +++ b/0400-migration-multifd-fix-multifdsendparams-packet-num-r.patch @@ -0,0 +1,169 @@ +From 279c33ea29c643530c7c8a0823225a0ff7dc2957 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Fri, 2 Feb 2024 18:28:56 +0800 +Subject: [PATCH] migration/multifd: Fix MultiFDSendParams.packet_num race + +commit 98ea497d8b8a5076be7b6ceb0dcc4a475373eb76 upstream. + +As reported correctly by Fabiano [1] (while per Fabiano, it sourced back to +Elena's initial report in Oct 2023), MultiFDSendParams.packet_num is buggy +to be assigned and stored. Consider two consequent operations of: (1) +queue a job into multifd send thread X, then (2) queue another sync request +to the same send thread X. Then the MultiFDSendParams.packet_num will be +assigned twice, and the first assignment can get lost already. + +To avoid that, we move the packet_num assignment from p->packet_num into +where the thread will fill in the packet. Use atomic operations to protect +the field, making sure there's no race. + +Note that atomic fetch_add() may not be good for scaling purposes, however +multifd should be fine as number of threads should normally not go beyond +16 threads. Let's leave that concern for later but fix the issue first. + +There's also a trick on how to make it always work even on 32 bit hosts for +uint64_t packet number. Switching to uintptr_t as of now to simply the +case. It will cause packet number to overflow easier on 32 bit, but that +shouldn't be a major concern for now as 32 bit systems is not the major +audience for any performance concerns like what multifd wants to address. + +We also need to move multifd_send_state definition upper, so that +multifd_send_fill_packet() can reference it. 
+ +[1] https://lore.kernel.org/r/87o7d1jlu5.fsf@suse.de + +Intel-SIG: commit 98ea497d8b8a migration/multifd: Fix MultiFDSendParams.packet_num race + +Reported-by: Elena Ufimtseva +Reviewed-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20240202102857.110210-23-peterx@redhat.com +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/multifd.c | 56 +++++++++++++++++++++++++++------------------ + migration/multifd.h | 2 -- + 2 files changed, 34 insertions(+), 24 deletions(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index 130f86a1fb..b317d57d61 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -45,6 +45,35 @@ typedef struct { + uint64_t unused2[4]; /* Reserved for future use */ + } __attribute__((packed)) MultiFDInit_t; + ++struct { ++ MultiFDSendParams *params; ++ /* array of pages to sent */ ++ MultiFDPages_t *pages; ++ /* ++ * Global number of generated multifd packets. ++ * ++ * Note that we used 'uintptr_t' because it'll naturally support atomic ++ * operations on both 32bit / 64 bits hosts. It means on 32bit systems ++ * multifd will overflow the packet_num easier, but that should be ++ * fine. ++ * ++ * Another option is to use QEMU's Stat64 then it'll be 64 bits on all ++ * hosts, however so far it does not support atomic fetch_add() yet. ++ * Make it easy for now. ++ */ ++ uintptr_t packet_num; ++ /* send channels ready */ ++ QemuSemaphore channels_ready; ++ /* ++ * Have we already run terminate threads. There is a race when it ++ * happens that we got one error while we are exiting. ++ * We will use atomic operations. Only valid values are 0 and 1. ++ */ ++ int exiting; ++ /* multifd ops */ ++ MultiFDMethods *ops; ++} *multifd_send_state; ++ + /* Multifd without compression */ + + /** +@@ -292,13 +321,16 @@ void multifd_send_fill_packet(MultiFDSendParams *p) + { + MultiFDPacket_t *packet = p->packet; + MultiFDPages_t *pages = p->pages; ++ uint64_t packet_num; + int i; + + packet->flags = cpu_to_be32(p->flags); + packet->pages_alloc = cpu_to_be32(p->pages->allocated); + packet->normal_pages = cpu_to_be32(pages->num); + packet->next_packet_size = cpu_to_be32(p->next_packet_size); +- packet->packet_num = cpu_to_be64(p->packet_num); ++ ++ packet_num = qatomic_fetch_inc(&multifd_send_state->packet_num); ++ packet->packet_num = cpu_to_be64(packet_num); + + if (pages->block) { + strncpy(packet->ramblock, pages->block->idstr, 256); +@@ -314,7 +346,7 @@ void multifd_send_fill_packet(MultiFDSendParams *p) + p->packets_sent++; + p->total_normal_pages += pages->num; + +- trace_multifd_send(p->id, p->packet_num, pages->num, p->flags, ++ trace_multifd_send(p->id, packet_num, pages->num, p->flags, + p->next_packet_size); + } + +@@ -398,24 +430,6 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) + return 0; + } + +-struct { +- MultiFDSendParams *params; +- /* array of pages to sent */ +- MultiFDPages_t *pages; +- /* global number of generated multifd packets */ +- uint64_t packet_num; +- /* send channels ready */ +- QemuSemaphore channels_ready; +- /* +- * Have we already run terminate threads. There is a race when it +- * happens that we got one error while we are exiting. +- * We will use atomic operations. Only valid values are 0 and 1. 
+- */ +- int exiting; +- /* multifd ops */ +- MultiFDMethods *ops; +-} *multifd_send_state; +- + static bool multifd_send_should_exit(void) + { + return qatomic_read(&multifd_send_state->exiting); +@@ -497,7 +511,6 @@ static bool multifd_send_pages(void) + */ + assert(qatomic_read(&p->pending_job) == false); + qatomic_set(&p->pending_job, true); +- p->packet_num = multifd_send_state->packet_num++; + multifd_send_state->pages = p->pages; + p->pages = pages; + qemu_mutex_unlock(&p->mutex); +@@ -730,7 +743,6 @@ int multifd_send_sync_main(void) + trace_multifd_send_sync_main_signal(p->id); + + qemu_mutex_lock(&p->mutex); +- p->packet_num = multifd_send_state->packet_num++; + /* + * We should be the only user so far, so not possible to be set by + * others concurrently. +diff --git a/migration/multifd.h b/migration/multifd.h +index 9b40a53cb6..98876ff94a 100644 +--- a/migration/multifd.h ++++ b/migration/multifd.h +@@ -97,8 +97,6 @@ typedef struct { + bool running; + /* multifd flags for each packet */ + uint32_t flags; +- /* global number of generated multifd packets */ +- uint64_t packet_num; + /* + * The sender thread has work to do if either of below boolean is set. + * +-- +2.43.0 + diff --git a/0401-migration-multifd-optimize-sender-side-to-be-lockles.patch b/0401-migration-multifd-optimize-sender-side-to-be-lockles.patch new file mode 100644 index 0000000..bb9e385 --- /dev/null +++ b/0401-migration-multifd-optimize-sender-side-to-be-lockles.patch @@ -0,0 +1,206 @@ +From 218acbd2b14cfdb4ca5f9df7146955c08c33054d Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Fri, 2 Feb 2024 18:28:57 +0800 +Subject: [PATCH] migration/multifd: Optimize sender side to be lockless + +commit 488c84acb465c21b716c3fd14de27ab5ce388c85 upstream. + +When reviewing my attempt to refactor send_prepare(), Fabiano suggested we +try out with dropping the mutex in multifd code [1]. + +I thought about that before but I never tried to change the code. Now +maybe it's time to give it a stab. This only optimizes the sender side. + +The trick here is multifd has a clear provider/consumer model, that the +migration main thread publishes requests (either pending_job/pending_sync), +while the multifd sender threads are consumers. Here we don't have a lot +of complicated data sharing, and the jobs can logically be submitted +lockless. + +Arm the code with atomic weapons. Two things worth mentioning: + + - For multifd_send_pages(): we can use qatomic_load_acquire() when trying + to find a free channel, but that's expensive if we attach one ACQUIRE per + channel. Instead, keep the qatomic_read() on reading the pending_job + flag as we do already, meanwhile use one smp_mb_acquire() after the loop + to guarantee the memory ordering. + + - For pending_sync: it doesn't have any extra data required since now + p->flags are never touched, it should be safe to not use memory barrier. + That's different from pending_job. + +Provide rich comments for all the lockless operations to state how they are +paired. With that, we can remove the mutex. 
+ +[1] https://lore.kernel.org/r/87o7d1jlu5.fsf@suse.de + +Intel-SIG: commit 488c84acb465 migration/multifd: Optimize sender side to be lockless + +Suggested-by: Fabiano Rosas +Reviewed-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20240202102857.110210-24-peterx@redhat.com +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/multifd.c | 51 +++++++++++++++++++++++---------------------- + migration/multifd.h | 2 -- + 2 files changed, 26 insertions(+), 27 deletions(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index b317d57d61..fbdb129088 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -501,19 +501,19 @@ static bool multifd_send_pages(void) + } + } + +- qemu_mutex_lock(&p->mutex); +- assert(!p->pages->num); +- assert(!p->pages->block); + /* +- * Double check on pending_job==false with the lock. In the future if +- * we can have >1 requester thread, we can replace this with a "goto +- * retry", but that is for later. ++ * Make sure we read p->pending_job before all the rest. Pairs with ++ * qatomic_store_release() in multifd_send_thread(). + */ +- assert(qatomic_read(&p->pending_job) == false); +- qatomic_set(&p->pending_job, true); ++ smp_mb_acquire(); ++ assert(!p->pages->num); + multifd_send_state->pages = p->pages; + p->pages = pages; +- qemu_mutex_unlock(&p->mutex); ++ /* ++ * Making sure p->pages is setup before marking pending_job=true. Pairs ++ * with the qatomic_load_acquire() in multifd_send_thread(). ++ */ ++ qatomic_store_release(&p->pending_job, true); + qemu_sem_post(&p->sem); + + return true; +@@ -648,7 +648,6 @@ static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) + } + multifd_send_channel_destroy(p->c); + p->c = NULL; +- qemu_mutex_destroy(&p->mutex); + qemu_sem_destroy(&p->sem); + qemu_sem_destroy(&p->sem_sync); + g_free(p->name); +@@ -742,14 +741,12 @@ int multifd_send_sync_main(void) + + trace_multifd_send_sync_main_signal(p->id); + +- qemu_mutex_lock(&p->mutex); + /* + * We should be the only user so far, so not possible to be set by + * others concurrently. + */ + assert(qatomic_read(&p->pending_sync) == false); + qatomic_set(&p->pending_sync, true); +- qemu_mutex_unlock(&p->mutex); + qemu_sem_post(&p->sem); + } + for (i = 0; i < migrate_multifd_channels(); i++) { +@@ -796,9 +793,12 @@ static void *multifd_send_thread(void *opaque) + if (multifd_send_should_exit()) { + break; + } +- qemu_mutex_lock(&p->mutex); + +- if (qatomic_read(&p->pending_job)) { ++ /* ++ * Read pending_job flag before p->pages. Pairs with the ++ * qatomic_store_release() in multifd_send_pages(). ++ */ ++ if (qatomic_load_acquire(&p->pending_job)) { + MultiFDPages_t *pages = p->pages; + + p->iovs_num = 0; +@@ -806,14 +806,12 @@ static void *multifd_send_thread(void *opaque) + + ret = multifd_send_state->ops->send_prepare(p, &local_err); + if (ret != 0) { +- qemu_mutex_unlock(&p->mutex); + break; + } + + ret = qio_channel_writev_full_all(p->c, p->iov, p->iovs_num, NULL, + 0, p->write_flags, &local_err); + if (ret != 0) { +- qemu_mutex_unlock(&p->mutex); + break; + } + +@@ -822,24 +820,31 @@ static void *multifd_send_thread(void *opaque) + + multifd_pages_reset(p->pages); + p->next_packet_size = 0; +- qatomic_set(&p->pending_job, false); +- qemu_mutex_unlock(&p->mutex); ++ ++ /* ++ * Making sure p->pages is published before saying "we're ++ * free". Pairs with the smp_mb_acquire() in ++ * multifd_send_pages(). 
++ */ ++ qatomic_store_release(&p->pending_job, false); + } else { +- /* If not a normal job, must be a sync request */ ++ /* ++ * If not a normal job, must be a sync request. Note that ++ * pending_sync is a standalone flag (unlike pending_job), so ++ * it doesn't require explicit memory barriers. ++ */ + assert(qatomic_read(&p->pending_sync)); + p->flags = MULTIFD_FLAG_SYNC; + multifd_send_fill_packet(p); + ret = qio_channel_write_all(p->c, (void *)p->packet, + p->packet_len, &local_err); + if (ret != 0) { +- qemu_mutex_unlock(&p->mutex); + break; + } + /* p->next_packet_size will always be zero for a SYNC packet */ + stat64_add(&mig_stats.multifd_bytes, p->packet_len); + p->flags = 0; + qatomic_set(&p->pending_sync, false); +- qemu_mutex_unlock(&p->mutex); + qemu_sem_post(&p->sem_sync); + } + } +@@ -853,10 +858,7 @@ out: + error_free(local_err); + } + +- qemu_mutex_lock(&p->mutex); + p->running = false; +- qemu_mutex_unlock(&p->mutex); +- + rcu_unregister_thread(); + migration_threads_remove(thread); + trace_multifd_send_thread_end(p->id, p->packets_sent, p->total_normal_pages); +@@ -998,7 +1000,6 @@ int multifd_send_setup(Error **errp) + for (i = 0; i < thread_count; i++) { + MultiFDSendParams *p = &multifd_send_state->params[i]; + +- qemu_mutex_init(&p->mutex); + qemu_sem_init(&p->sem, 0); + qemu_sem_init(&p->sem_sync, 0); + p->id = i; +diff --git a/migration/multifd.h b/migration/multifd.h +index 98876ff94a..78a2317263 100644 +--- a/migration/multifd.h ++++ b/migration/multifd.h +@@ -91,8 +91,6 @@ typedef struct { + /* syncs main thread and channels */ + QemuSemaphore sem_sync; + +- /* this mutex protects the following parameters */ +- QemuMutex mutex; + /* is this channel thread running */ + bool running; + /* multifd flags for each packet */ +-- +2.43.0 + diff --git a/0402-migration-multifd-join-the-tls-thread.patch b/0402-migration-multifd-join-the-tls-thread.patch new file mode 100644 index 0000000..98584f8 --- /dev/null +++ b/0402-migration-multifd-join-the-tls-thread.patch @@ -0,0 +1,66 @@ +From 698dd961a54f9147753385671e6e5b3f991df36d Mon Sep 17 00:00:00 2001 +From: Fabiano Rosas +Date: Tue, 6 Feb 2024 18:51:13 -0300 +Subject: [PATCH] migration/multifd: Join the TLS thread + +commit e1921f10d9afe651f4887284e85f6789b37e67d3 upstream. + +We're currently leaking the resources of the TLS thread by not joining +it and also overwriting the p->thread pointer altogether. 
+ +Intel-SIG: commit e1921f10d9af migration/multifd: Join the TLS thread + +Fixes: a1af605bd5 ("migration/multifd: fix hangup with TLS-Multifd due to blocking handshake") +Cc: qemu-stable +Reviewed-by: Peter Xu +Signed-off-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20240206215118.6171-2-farosas@suse.de +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/multifd.c | 8 +++++++- + migration/multifd.h | 2 ++ + 2 files changed, 9 insertions(+), 1 deletion(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index fbdb129088..5551711a2a 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -630,6 +630,10 @@ static void multifd_send_terminate_threads(void) + for (i = 0; i < migrate_multifd_channels(); i++) { + MultiFDSendParams *p = &multifd_send_state->params[i]; + ++ if (p->tls_thread_created) { ++ qemu_thread_join(&p->tls_thread); ++ } ++ + if (p->running) { + qemu_thread_join(&p->thread); + } +@@ -921,7 +925,9 @@ static bool multifd_tls_channel_connect(MultiFDSendParams *p, + trace_multifd_tls_outgoing_handshake_start(ioc, tioc, hostname); + qio_channel_set_name(QIO_CHANNEL(tioc), "multifd-tls-outgoing"); + p->c = QIO_CHANNEL(tioc); +- qemu_thread_create(&p->thread, "multifd-tls-handshake-worker", ++ ++ p->tls_thread_created = true; ++ qemu_thread_create(&p->tls_thread, "multifd-tls-handshake-worker", + multifd_tls_handshake_thread, p, + QEMU_THREAD_JOINABLE); + return true; +diff --git a/migration/multifd.h b/migration/multifd.h +index 78a2317263..720c9d50db 100644 +--- a/migration/multifd.h ++++ b/migration/multifd.h +@@ -73,6 +73,8 @@ typedef struct { + char *name; + /* channel thread id */ + QemuThread thread; ++ QemuThread tls_thread; ++ bool tls_thread_created; + /* communication channel */ + QIOChannel *c; + /* is the yank function registered */ +-- +2.43.0 + diff --git a/0403-migration-multifd-remove-p-running.patch b/0403-migration-multifd-remove-p-running.patch new file mode 100644 index 0000000..0fc50d2 --- /dev/null +++ b/0403-migration-multifd-remove-p-running.patch @@ -0,0 +1,177 @@ +From 7e8dc70f14528f540f5f49b212968ceef67fa847 Mon Sep 17 00:00:00 2001 +From: Fabiano Rosas +Date: Tue, 6 Feb 2024 18:51:14 -0300 +Subject: [PATCH] migration/multifd: Remove p->running + +commit a2a63c4abd52f4e3ff4046dcb67fe44ebf0bb8de upstream. + +We currently only need p->running to avoid calling qemu_thread_join() +on a non existent thread if the thread has never been created. + +However, there are at least two bugs in this logic: + +1) On the sending side, p->running is set too early and +qemu_thread_create() can be skipped due to an error during TLS +handshake, leaving the flag set and leading to a crash when +multifd_send_cleanup() calls qemu_thread_join(). + +2) During exit, the multifd thread clears the flag while holding the +channel lock. The counterpart at multifd_send_cleanup() reads the flag +outside of the lock and might free the mutex while the multifd thread +still has it locked. + +Fix the first issue by setting the flag right before creating the +thread. Rename it from p->running to p->thread_created to clarify its +usage. + +Fix the second issue by not clearing the flag at the multifd thread +exit. We don't have any use for that. + +Note that these bugs are straight-forward logic issues and not race +conditions. There is still a gap for races to affect this code due to +multifd_send_cleanup() being allowed to run concurrently with the +thread creation loop. This issue is solved in the next patches. 
+ +Intel-SIG: commit a2a63c4abd52 migration/multifd: Remove p->running + +Cc: qemu-stable +Fixes: 29647140157a ("migration/tls: add support for multifd tls-handshake") +Reported-by: Avihai Horon +Reported-by: chenyuhui5@huawei.com +Reviewed-by: Peter Xu +Signed-off-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20240206215118.6171-3-farosas@suse.de +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/multifd.c | 27 ++++++++++++--------------- + migration/multifd.h | 7 ++----- + 2 files changed, 14 insertions(+), 20 deletions(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index 5551711a2a..e6ac1ad6dc 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -634,7 +634,7 @@ static void multifd_send_terminate_threads(void) + qemu_thread_join(&p->tls_thread); + } + +- if (p->running) { ++ if (p->thread_created) { + qemu_thread_join(&p->thread); + } + } +@@ -862,7 +862,6 @@ out: + error_free(local_err); + } + +- p->running = false; + rcu_unregister_thread(); + migration_threads_remove(thread); + trace_multifd_send_thread_end(p->id, p->packets_sent, p->total_normal_pages); +@@ -953,6 +952,8 @@ static bool multifd_channel_connect(MultiFDSendParams *p, + migration_ioc_register_yank(ioc); + p->registered_yank = true; + p->c = ioc; ++ ++ p->thread_created = true; + qemu_thread_create(&p->thread, p->name, multifd_send_thread, p, + QEMU_THREAD_JOINABLE); + return true; +@@ -967,7 +968,6 @@ static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque) + trace_multifd_new_send_channel_async(p->id); + if (!qio_task_propagate_error(task, &local_err)) { + qio_channel_set_delay(ioc, false); +- p->running = true; + if (multifd_channel_connect(p, ioc, &local_err)) { + return; + } +@@ -1128,15 +1128,15 @@ void multifd_recv_cleanup(void) + for (i = 0; i < migrate_multifd_channels(); i++) { + MultiFDRecvParams *p = &multifd_recv_state->params[i]; + +- if (p->running) { +- /* +- * multifd_recv_thread may hung at MULTIFD_FLAG_SYNC handle code, +- * however try to wakeup it without harm in cleanup phase. +- */ +- qemu_sem_post(&p->sem_sync); +- } ++ /* ++ * multifd_recv_thread may hung at MULTIFD_FLAG_SYNC handle code, ++ * however try to wakeup it without harm in cleanup phase. 
++ */ ++ qemu_sem_post(&p->sem_sync); + +- qemu_thread_join(&p->thread); ++ if (p->thread_created) { ++ qemu_thread_join(&p->thread); ++ } + } + for (i = 0; i < migrate_multifd_channels(); i++) { + multifd_recv_cleanup_channel(&multifd_recv_state->params[i]); +@@ -1222,9 +1222,6 @@ static void *multifd_recv_thread(void *opaque) + multifd_recv_terminate_threads(local_err); + error_free(local_err); + } +- qemu_mutex_lock(&p->mutex); +- p->running = false; +- qemu_mutex_unlock(&p->mutex); + + rcu_unregister_thread(); + trace_multifd_recv_thread_end(p->id, p->packets_recved, p->total_normal_pages); +@@ -1330,7 +1327,7 @@ void multifd_recv_new_channel(QIOChannel *ioc, Error **errp) + p->c = ioc; + object_ref(OBJECT(ioc)); + +- p->running = true; ++ p->thread_created = true; + qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p, + QEMU_THREAD_JOINABLE); + qatomic_inc(&multifd_recv_state->count); +diff --git a/migration/multifd.h b/migration/multifd.h +index 720c9d50db..7881980ee6 100644 +--- a/migration/multifd.h ++++ b/migration/multifd.h +@@ -73,6 +73,7 @@ typedef struct { + char *name; + /* channel thread id */ + QemuThread thread; ++ bool thread_created; + QemuThread tls_thread; + bool tls_thread_created; + /* communication channel */ +@@ -93,8 +94,6 @@ typedef struct { + /* syncs main thread and channels */ + QemuSemaphore sem_sync; + +- /* is this channel thread running */ +- bool running; + /* multifd flags for each packet */ + uint32_t flags; + /* +@@ -143,6 +142,7 @@ typedef struct { + char *name; + /* channel thread id */ + QemuThread thread; ++ bool thread_created; + /* communication channel */ + QIOChannel *c; + /* packet allocated len */ +@@ -157,8 +157,6 @@ typedef struct { + + /* this mutex protects the following parameters */ + QemuMutex mutex; +- /* is this channel thread running */ +- bool running; + /* should this thread finish */ + bool quit; + /* multifd flags for each packet */ +@@ -217,4 +215,3 @@ static inline void multifd_send_prepare_header(MultiFDSendParams *p) + + + #endif +- +-- +2.43.0 + diff --git a/0404-migration-multifd-move-multifd-send-setup-error-hand.patch b/0404-migration-multifd-move-multifd-send-setup-error-hand.patch new file mode 100644 index 0000000..87ac1c1 --- /dev/null +++ b/0404-migration-multifd-move-multifd-send-setup-error-hand.patch @@ -0,0 +1,108 @@ +From e2052268e060991f5f0d1ce89ac209cdb70ea291 Mon Sep 17 00:00:00 2001 +From: Fabiano Rosas +Date: Tue, 6 Feb 2024 18:51:15 -0300 +Subject: [PATCH] migration/multifd: Move multifd_send_setup error handling in + to the function + +commit bd8b0a8f82d8fc17aa285ab963ba75675c2fbe7a upstream. + +Hide the error handling inside multifd_send_setup to make it cleaner +for the next patch to move the function around. 
+ +Intel-SIG: commit bd8b0a8f82d8 migration/multifd: Move multifd_send_setup error handling in to the function + +Reviewed-by: Peter Xu +Signed-off-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20240206215118.6171-4-farosas@suse.de +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/migration.c | 6 +----- + migration/multifd.c | 24 +++++++++++++++++------- + migration/multifd.h | 2 +- + 3 files changed, 19 insertions(+), 13 deletions(-) + +diff --git a/migration/migration.c b/migration/migration.c +index 0765b515ea..27c7eda222 100644 +--- a/migration/migration.c ++++ b/migration/migration.c +@@ -3650,11 +3650,7 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) + return; + } + +- if (multifd_send_setup(&local_err) != 0) { +- migrate_set_error(s, local_err); +- error_report_err(local_err); +- migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, +- MIGRATION_STATUS_FAILED); ++ if (!multifd_send_setup()) { + migrate_fd_cleanup(s); + return; + } +diff --git a/migration/multifd.c b/migration/multifd.c +index e6ac1ad6dc..cf865edba0 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -985,14 +985,16 @@ static void multifd_new_send_channel_create(gpointer opaque) + socket_send_channel_create(multifd_new_send_channel_async, opaque); + } + +-int multifd_send_setup(Error **errp) ++bool multifd_send_setup(void) + { +- int thread_count; ++ MigrationState *s = migrate_get_current(); ++ Error *local_err = NULL; ++ int thread_count, ret = 0; + uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size(); + uint8_t i; + + if (!migrate_multifd()) { +- return 0; ++ return true; + } + + thread_count = migrate_multifd_channels(); +@@ -1026,14 +1028,22 @@ int multifd_send_setup(Error **errp) + + for (i = 0; i < thread_count; i++) { + MultiFDSendParams *p = &multifd_send_state->params[i]; +- int ret; + +- ret = multifd_send_state->ops->send_setup(p, errp); ++ ret = multifd_send_state->ops->send_setup(p, &local_err); + if (ret) { +- return ret; ++ break; + } + } +- return 0; ++ ++ if (ret) { ++ migrate_set_error(s, local_err); ++ error_report_err(local_err); ++ migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, ++ MIGRATION_STATUS_FAILED); ++ return false; ++ } ++ ++ return true; + } + + struct { +diff --git a/migration/multifd.h b/migration/multifd.h +index 7881980ee6..8a1cad0996 100644 +--- a/migration/multifd.h ++++ b/migration/multifd.h +@@ -13,7 +13,7 @@ + #ifndef QEMU_MIGRATION_MULTIFD_H + #define QEMU_MIGRATION_MULTIFD_H + +-int multifd_send_setup(Error **errp); ++bool multifd_send_setup(void); + void multifd_send_shutdown(void); + int multifd_recv_setup(Error **errp); + void multifd_recv_cleanup(void); +-- +2.43.0 + diff --git a/0405-migration-multifd-move-multifd-send-setup-into-migra.patch b/0405-migration-multifd-move-multifd-send-setup-into-migra.patch new file mode 100644 index 0000000..e9cfe74 --- /dev/null +++ b/0405-migration-multifd-move-multifd-send-setup-into-migra.patch @@ -0,0 +1,92 @@ +From 9a3fa8c00cad1f9c95dd0c67b91d5b849f463935 Mon Sep 17 00:00:00 2001 +From: Fabiano Rosas +Date: Tue, 6 Feb 2024 18:51:16 -0300 +Subject: [PATCH] migration/multifd: Move multifd_send_setup into migration + thread + +commit dd904bc13f2af0c605c3fe72f118ea4e27a6610c upstream. + +We currently have an unfavorable situation around multifd channels +creation and the migration thread execution. 
+ +We create the multifd channels with qio_channel_socket_connect_async +-> qio_task_run_in_thread, but only connect them at the +multifd_new_send_channel_async callback, called from +qio_task_complete, which is registered as a glib event. + +So at multifd_send_setup() we create the channels, but they will only +be actually usable after the whole multifd_send_setup() calling stack +returns back to the main loop. Which means that the migration thread +is already up and running without any possibility for the multifd +channels to be ready on time. + +We currently rely on the channels-ready semaphore blocking +multifd_send_sync_main() until channels start to come up and release +it. However there have been bugs recently found when a channel's +creation fails and multifd_send_cleanup() is allowed to run while +other channels are still being created. + +Let's start to organize this situation by moving the +multifd_send_setup() call into the migration thread. That way we +unblock the main-loop to dispatch the completion callbacks and +actually have a chance of getting the multifd channels ready for when +the migration thread needs them. + +The next patches will deal with the synchronization aspects. + +Note that this takes multifd_send_setup() out of the BQL. + +Intel-SIG: commit dd904bc13f2a migration/multifd: Move multifd_send_setup into migration thread + +Reviewed-by: Peter Xu +Signed-off-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20240206215118.6171-5-farosas@suse.de +Signed-off-by: Peter Xu + + Conflicts: + migration/migration.c [bql] +[jz: upstream renamed qemu_mutex_lock_iothread() to bql_lock(), while + we havenot yet. Resolve context conflict due to this] +Signed-off-by: Jason Zeng +--- + migration/migration.c | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +diff --git a/migration/migration.c b/migration/migration.c +index 27c7eda222..e7a513cb18 100644 +--- a/migration/migration.c ++++ b/migration/migration.c +@@ -3326,6 +3326,10 @@ static void *migration_thread(void *opaque) + object_ref(OBJECT(s)); + update_iteration_initial_status(s); + ++ if (!multifd_send_setup()) { ++ goto out; ++ } ++ + qemu_mutex_lock_iothread(); + qemu_savevm_state_header(s->to_dst_file); + qemu_mutex_unlock_iothread(); +@@ -3397,6 +3401,7 @@ static void *migration_thread(void *opaque) + urgent = migration_rate_limit(); + } + ++out: + trace_migration_thread_after_loop(); + migration_iteration_finish(s); + object_unref(OBJECT(s)); +@@ -3650,11 +3655,6 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) + return; + } + +- if (!multifd_send_setup()) { +- migrate_fd_cleanup(s); +- return; +- } +- + if (migrate_background_snapshot()) { + qemu_thread_create(&s->thread, "bg_snapshot", + bg_migration_thread, s, QEMU_THREAD_JOINABLE); +-- +2.43.0 + diff --git a/0406-migration-multifd-unify-multifd-and-tls-connection-p.patch b/0406-migration-multifd-unify-multifd-and-tls-connection-p.patch new file mode 100644 index 0000000..7b485fb --- /dev/null +++ b/0406-migration-multifd-unify-multifd-and-tls-connection-p.patch @@ -0,0 +1,177 @@ +From 5f4af2f53dfc60a1159e3453e3396f40daa1ef9d Mon Sep 17 00:00:00 2001 +From: Fabiano Rosas +Date: Tue, 6 Feb 2024 18:51:17 -0300 +Subject: [PATCH] migration/multifd: Unify multifd and TLS connection paths + +commit 2576ae488ef9aa692486157df7d8b410919cd219 upstream. 
+ +During multifd channel creation (multifd_send_new_channel_async) when +TLS is enabled, the multifd_channel_connect function is called twice, +once to create the TLS handshake thread and another time after the +asynchrounous TLS handshake has finished. + +This creates a slightly confusing call stack where +multifd_channel_connect() is called more times than the number of +channels. It also splits error handling between the two callers of +multifd_channel_connect() causing some code duplication. Lastly, it +gets in the way of having a single point to determine whether all +channel creation tasks have been initiated. + +Refactor the code to move the reentrancy one level up at the +multifd_new_send_channel_async() level, de-duplicating the error +handling and allowing for the next patch to introduce a +synchronization point common to all the multifd channel creation, +regardless of TLS. + +Note that the previous code would never fail once p->c had been set. +This patch changes this assumption, which affects refcounting, so add +comments around object_unref to explain the situation. + +Intel-SIG: commit 2576ae488ef9 migration/multifd: Unify multifd and TLS connection paths + +Reviewed-by: Peter Xu +Signed-off-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20240206215118.6171-6-farosas@suse.de +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/multifd.c | 83 ++++++++++++++++++++++----------------------- + 1 file changed, 40 insertions(+), 43 deletions(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index cf865edba0..3db18dc79e 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -869,30 +869,7 @@ out: + return NULL; + } + +-static bool multifd_channel_connect(MultiFDSendParams *p, +- QIOChannel *ioc, +- Error **errp); +- +-static void multifd_tls_outgoing_handshake(QIOTask *task, +- gpointer opaque) +-{ +- MultiFDSendParams *p = opaque; +- QIOChannel *ioc = QIO_CHANNEL(qio_task_get_source(task)); +- Error *err = NULL; +- +- if (!qio_task_propagate_error(task, &err)) { +- trace_multifd_tls_outgoing_handshake_complete(ioc); +- if (multifd_channel_connect(p, ioc, &err)) { +- return; +- } +- } +- +- trace_multifd_tls_outgoing_handshake_error(ioc, error_get_pretty(err)); +- +- multifd_send_set_error(err); +- multifd_send_kick_main(p); +- error_free(err); +-} ++static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque); + + static void *multifd_tls_handshake_thread(void *opaque) + { +@@ -900,7 +877,7 @@ static void *multifd_tls_handshake_thread(void *opaque) + QIOChannelTLS *tioc = QIO_CHANNEL_TLS(p->c); + + qio_channel_tls_handshake(tioc, +- multifd_tls_outgoing_handshake, ++ multifd_new_send_channel_async, + p, + NULL, + NULL); +@@ -920,6 +897,10 @@ static bool multifd_tls_channel_connect(MultiFDSendParams *p, + return false; + } + ++ /* ++ * Ownership of the socket channel now transfers to the newly ++ * created TLS channel, which has already taken a reference. 
++ */ + object_unref(OBJECT(ioc)); + trace_multifd_tls_outgoing_handshake_start(ioc, tioc, hostname); + qio_channel_set_name(QIO_CHANNEL(tioc), "multifd-tls-outgoing"); +@@ -936,18 +917,7 @@ static bool multifd_channel_connect(MultiFDSendParams *p, + QIOChannel *ioc, + Error **errp) + { +- trace_multifd_set_outgoing_channel( +- ioc, object_get_typename(OBJECT(ioc)), +- migrate_get_current()->hostname); +- +- if (migrate_channel_requires_tls_upgrade(ioc)) { +- /* +- * tls_channel_connect will call back to this +- * function after the TLS handshake, +- * so we mustn't call multifd_send_thread until then +- */ +- return multifd_tls_channel_connect(p, ioc, errp); +- } ++ qio_channel_set_delay(ioc, false); + + migration_ioc_register_yank(ioc); + p->registered_yank = true; +@@ -959,24 +929,51 @@ static bool multifd_channel_connect(MultiFDSendParams *p, + return true; + } + ++/* ++ * When TLS is enabled this function is called once to establish the ++ * TLS connection and a second time after the TLS handshake to create ++ * the multifd channel. Without TLS it goes straight into the channel ++ * creation. ++ */ + static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque) + { + MultiFDSendParams *p = opaque; + QIOChannel *ioc = QIO_CHANNEL(qio_task_get_source(task)); + Error *local_err = NULL; ++ bool ret; + + trace_multifd_new_send_channel_async(p->id); +- if (!qio_task_propagate_error(task, &local_err)) { +- qio_channel_set_delay(ioc, false); +- if (multifd_channel_connect(p, ioc, &local_err)) { +- return; +- } ++ ++ if (qio_task_propagate_error(task, &local_err)) { ++ ret = false; ++ goto out; ++ } ++ ++ trace_multifd_set_outgoing_channel(ioc, object_get_typename(OBJECT(ioc)), ++ migrate_get_current()->hostname); ++ ++ if (migrate_channel_requires_tls_upgrade(ioc)) { ++ ret = multifd_tls_channel_connect(p, ioc, &local_err); ++ } else { ++ ret = multifd_channel_connect(p, ioc, &local_err); + } + ++ if (ret) { ++ return; ++ } ++ ++out: + trace_multifd_new_send_channel_async_error(p->id, local_err); + multifd_send_set_error(local_err); + multifd_send_kick_main(p); +- object_unref(OBJECT(ioc)); ++ if (!p->c) { ++ /* ++ * If no channel has been created, drop the initial ++ * reference. Otherwise cleanup happens at ++ * multifd_send_channel_destroy() ++ */ ++ object_unref(OBJECT(ioc)); ++ } + error_free(local_err); + } + +-- +2.43.0 + diff --git a/0407-migration-multifd-add-a-synchronization-point-for-ch.patch b/0407-migration-multifd-add-a-synchronization-point-for-ch.patch new file mode 100644 index 0000000..5cdee2f --- /dev/null +++ b/0407-migration-multifd-add-a-synchronization-point-for-ch.patch @@ -0,0 +1,129 @@ +From b0b94a24abb1ddc822b843b928b07414b22c1b0c Mon Sep 17 00:00:00 2001 +From: Fabiano Rosas +Date: Tue, 6 Feb 2024 18:51:18 -0300 +Subject: [PATCH] migration/multifd: Add a synchronization point for channel + creation + +commit 93fa9dc2e0522c54b813dee0898a5feb98b624c9 upstream. + +It is possible that one of the multifd channels fails to be created at +multifd_new_send_channel_async() while the rest of the channel +creation tasks are still in flight. + +This could lead to multifd_save_cleanup() executing the +qemu_thread_join() loop too early and not waiting for the threads +which haven't been created yet, leading to the freeing of resources +that the newly created threads will try to access and crash. 
+ +Add a synchronization point after which there will be no attempts at +thread creation and therefore calling multifd_save_cleanup() past that +point will ensure it properly waits for the threads. + +A note about performance: Prior to this patch, if a channel took too +long to be established, other channels could finish connecting first +and already start taking load. Now we're bounded by the +slowest-connecting channel. + +Intel-SIG: commit 93fa9dc2e052 migration/multifd: Add a synchronization point for channel creation + +Reported-by: Avihai Horon +Reviewed-by: Peter Xu +Signed-off-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20240206215118.6171-7-farosas@suse.de +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/multifd.c | 32 ++++++++++++++++++++++++++------ + 1 file changed, 26 insertions(+), 6 deletions(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index 3db18dc79e..adfe8c9a0a 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -62,6 +62,11 @@ struct { + * Make it easy for now. + */ + uintptr_t packet_num; ++ /* ++ * Synchronization point past which no more channels will be ++ * created. ++ */ ++ QemuSemaphore channels_created; + /* send channels ready */ + QemuSemaphore channels_ready; + /* +@@ -622,10 +627,6 @@ static void multifd_send_terminate_threads(void) + + /* + * Finally recycle all the threads. +- * +- * TODO: p->running is still buggy, e.g. we can reach here without the +- * corresponding multifd_new_send_channel_async() get invoked yet, +- * then a new thread can even be created after this function returns. + */ + for (i = 0; i < migrate_multifd_channels(); i++) { + MultiFDSendParams *p = &multifd_send_state->params[i]; +@@ -670,6 +671,7 @@ static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) + + static void multifd_send_cleanup_state(void) + { ++ qemu_sem_destroy(&multifd_send_state->channels_created); + qemu_sem_destroy(&multifd_send_state->channels_ready); + g_free(multifd_send_state->params); + multifd_send_state->params = NULL; +@@ -954,18 +956,26 @@ static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque) + + if (migrate_channel_requires_tls_upgrade(ioc)) { + ret = multifd_tls_channel_connect(p, ioc, &local_err); ++ if (ret) { ++ return; ++ } + } else { + ret = multifd_channel_connect(p, ioc, &local_err); + } + ++out: ++ /* ++ * Here we're not interested whether creation succeeded, only that ++ * it happened at all. ++ */ ++ qemu_sem_post(&multifd_send_state->channels_created); ++ + if (ret) { + return; + } + +-out: + trace_multifd_new_send_channel_async_error(p->id, local_err); + multifd_send_set_error(local_err); +- multifd_send_kick_main(p); + if (!p->c) { + /* + * If no channel has been created, drop the initial +@@ -998,6 +1008,7 @@ bool multifd_send_setup(void) + multifd_send_state = g_malloc0(sizeof(*multifd_send_state)); + multifd_send_state->params = g_new0(MultiFDSendParams, thread_count); + multifd_send_state->pages = multifd_pages_init(page_count); ++ qemu_sem_init(&multifd_send_state->channels_created, 0); + qemu_sem_init(&multifd_send_state->channels_ready, 0); + qatomic_set(&multifd_send_state->exiting, 0); + multifd_send_state->ops = multifd_ops[migrate_multifd_compression()]; +@@ -1023,6 +1034,15 @@ bool multifd_send_setup(void) + multifd_new_send_channel_create(p); + } + ++ /* ++ * Wait until channel creation has started for all channels. The ++ * creation can still fail, but no more channels will be created ++ * past this point. 
++ */ ++ for (i = 0; i < thread_count; i++) { ++ qemu_sem_wait(&multifd_send_state->channels_created); ++ } ++ + for (i = 0; i < thread_count; i++) { + MultiFDSendParams *p = &multifd_send_state->params[i]; + +-- +2.43.0 + diff --git a/0408-migration-multifd-remove-p-quit-from-recv-side.patch b/0408-migration-multifd-remove-p-quit-from-recv-side.patch new file mode 100644 index 0000000..f909bb7 --- /dev/null +++ b/0408-migration-multifd-remove-p-quit-from-recv-side.patch @@ -0,0 +1,131 @@ +From 85161e97bf979f94cde8a04bb378abcb6dbdfb26 Mon Sep 17 00:00:00 2001 +From: Fabiano Rosas +Date: Tue, 20 Feb 2024 19:41:08 -0300 +Subject: [PATCH] migration/multifd: Remove p->quit from recv side + +commit 11dd7be57524d400652cecf8740a016b3d66b53d upstream. + +Like we did on the sending side, replace the p->quit per-channel flag +with a global atomic 'exiting' flag. + +Intel-SIG: commit 11dd7be57524 migration/multifd: Remove p->quit from recv side + +Signed-off-by: Fabiano Rosas +Reviewed-by: Peter Xu +Link: https://lore.kernel.org/r/20240220224138.24759-5-farosas@suse.de +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/multifd.c | 41 ++++++++++++++++++++++++----------------- + 1 file changed, 24 insertions(+), 17 deletions(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index adfe8c9a0a..fba00b9e8f 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -79,6 +79,19 @@ struct { + MultiFDMethods *ops; + } *multifd_send_state; + ++struct { ++ MultiFDRecvParams *params; ++ /* number of created threads */ ++ int count; ++ /* syncs main thread and channels */ ++ QemuSemaphore sem_sync; ++ /* global number of generated multifd packets */ ++ uint64_t packet_num; ++ int exiting; ++ /* multifd ops */ ++ MultiFDMethods *ops; ++} *multifd_recv_state; ++ + /* Multifd without compression */ + + /** +@@ -440,6 +453,11 @@ static bool multifd_send_should_exit(void) + return qatomic_read(&multifd_send_state->exiting); + } + ++static bool multifd_recv_should_exit(void) ++{ ++ return qatomic_read(&multifd_recv_state->exiting); ++} ++ + /* + * The migration thread can wait on either of the two semaphores. This + * function can be used to kick the main thread out of waiting on either of +@@ -1063,24 +1081,16 @@ bool multifd_send_setup(void) + return true; + } + +-struct { +- MultiFDRecvParams *params; +- /* number of created threads */ +- int count; +- /* syncs main thread and channels */ +- QemuSemaphore sem_sync; +- /* global number of generated multifd packets */ +- uint64_t packet_num; +- /* multifd ops */ +- MultiFDMethods *ops; +-} *multifd_recv_state; +- + static void multifd_recv_terminate_threads(Error *err) + { + int i; + + trace_multifd_recv_terminate_threads(err != NULL); + ++ if (qatomic_xchg(&multifd_recv_state->exiting, 1)) { ++ return; ++ } ++ + if (err) { + MigrationState *s = migrate_get_current(); + migrate_set_error(s, err); +@@ -1094,8 +1104,6 @@ static void multifd_recv_terminate_threads(Error *err) + for (i = 0; i < migrate_multifd_channels(); i++) { + MultiFDRecvParams *p = &multifd_recv_state->params[i]; + +- qemu_mutex_lock(&p->mutex); +- p->quit = true; + /* + * We could arrive here for two reasons: + * - normal quit, i.e. 
everything went fine, just finished +@@ -1105,7 +1113,6 @@ static void multifd_recv_terminate_threads(Error *err) + if (p->c) { + qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL); + } +- qemu_mutex_unlock(&p->mutex); + } + } + +@@ -1210,7 +1217,7 @@ static void *multifd_recv_thread(void *opaque) + while (true) { + uint32_t flags; + +- if (p->quit) { ++ if (multifd_recv_should_exit()) { + break; + } + +@@ -1274,6 +1281,7 @@ int multifd_recv_setup(Error **errp) + multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state)); + multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count); + qatomic_set(&multifd_recv_state->count, 0); ++ qatomic_set(&multifd_recv_state->exiting, 0); + qemu_sem_init(&multifd_recv_state->sem_sync, 0); + multifd_recv_state->ops = multifd_ops[migrate_multifd_compression()]; + +@@ -1282,7 +1290,6 @@ int multifd_recv_setup(Error **errp) + + qemu_mutex_init(&p->mutex); + qemu_sem_init(&p->sem_sync, 0); +- p->quit = false; + p->id = i; + p->packet_len = sizeof(MultiFDPacket_t) + + sizeof(uint64_t) * page_count; +-- +2.43.0 + diff --git a/0409-migration-multifd-release-recv-sem-sync-earlier.patch b/0409-migration-multifd-release-recv-sem-sync-earlier.patch new file mode 100644 index 0000000..4d2dfb9 --- /dev/null +++ b/0409-migration-multifd-release-recv-sem-sync-earlier.patch @@ -0,0 +1,54 @@ +From 6bfdfa7982152eb81aef5c5d911f5ccaa49aab29 Mon Sep 17 00:00:00 2001 +From: Fabiano Rosas +Date: Tue, 20 Feb 2024 19:41:09 -0300 +Subject: [PATCH] migration/multifd: Release recv sem_sync earlier + +commit d13f0026c7a625a5a34a5dea4095a4d9cfa04652 upstream. + +Now that multifd_recv_terminate_threads() is called only once, release +the recv side sem_sync earlier like we do for the send side. + +Intel-SIG: commit d13f0026c7a6 migration/multifd: Release recv sem_sync earlier + +Signed-off-by: Fabiano Rosas +Reviewed-by: Peter Xu +Link: https://lore.kernel.org/r/20240220224138.24759-6-farosas@suse.de +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/multifd.c | 12 ++++++------ + 1 file changed, 6 insertions(+), 6 deletions(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index fba00b9e8f..43f0820996 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -1104,6 +1104,12 @@ static void multifd_recv_terminate_threads(Error *err) + for (i = 0; i < migrate_multifd_channels(); i++) { + MultiFDRecvParams *p = &multifd_recv_state->params[i]; + ++ /* ++ * multifd_recv_thread may hung at MULTIFD_FLAG_SYNC handle code, ++ * however try to wakeup it without harm in cleanup phase. ++ */ ++ qemu_sem_post(&p->sem_sync); ++ + /* + * We could arrive here for two reasons: + * - normal quit, i.e. everything went fine, just finished +@@ -1162,12 +1168,6 @@ void multifd_recv_cleanup(void) + for (i = 0; i < migrate_multifd_channels(); i++) { + MultiFDRecvParams *p = &multifd_recv_state->params[i]; + +- /* +- * multifd_recv_thread may hung at MULTIFD_FLAG_SYNC handle code, +- * however try to wakeup it without harm in cleanup phase. 
+- */ +- qemu_sem_post(&p->sem_sync); +- + if (p->thread_created) { + qemu_thread_join(&p->thread); + } +-- +2.43.0 + diff --git a/0410-migration-multifd-cleanup-tls-iochannel-referencing.patch b/0410-migration-multifd-cleanup-tls-iochannel-referencing.patch new file mode 100644 index 0000000..5dde922 --- /dev/null +++ b/0410-migration-multifd-cleanup-tls-iochannel-referencing.patch @@ -0,0 +1,119 @@ +From 3d4e0bee1809a2ccb87fb186859bba39be265542 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Thu, 22 Feb 2024 17:52:57 +0800 +Subject: [PATCH] migration/multifd: Cleanup TLS iochannel referencing + +commit 9221e3c6a237da90ac296adfeb6e99ea9babfc20 upstream. + +Commit a1af605bd5 ("migration/multifd: fix hangup with TLS-Multifd due to +blocking handshake") introduced a thread for TLS channels, which will +resolve the issue on blocking the main thread. However in the same commit +p->c is slightly abused just to be able to pass over the pointer "p" into +the thread. + +That's the major reason we'll need to conditionally free the io channel in +the fault paths. + +To clean it up, using a separate structure to pass over both "p" and "tioc" +in the tls handshake thread. Then we can make it a rule that p->c will +never be set until the channel is completely setup. With that, we can drop +the tricky conditional unref of the io channel in the error path. + +Intel-SIG: commit 9221e3c6a237 migration/multifd: Cleanup TLS iochannel referencing + +Reviewed-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20240222095301.171137-2-peterx@redhat.com +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/multifd.c | 37 +++++++++++++++++++++++-------------- + 1 file changed, 23 insertions(+), 14 deletions(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index 43f0820996..84a6b9e58f 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -891,16 +891,22 @@ out: + + static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque); + ++typedef struct { ++ MultiFDSendParams *p; ++ QIOChannelTLS *tioc; ++} MultiFDTLSThreadArgs; ++ + static void *multifd_tls_handshake_thread(void *opaque) + { +- MultiFDSendParams *p = opaque; +- QIOChannelTLS *tioc = QIO_CHANNEL_TLS(p->c); ++ MultiFDTLSThreadArgs *args = opaque; + +- qio_channel_tls_handshake(tioc, ++ qio_channel_tls_handshake(args->tioc, + multifd_new_send_channel_async, +- p, ++ args->p, + NULL, + NULL); ++ g_free(args); ++ + return NULL; + } + +@@ -910,6 +916,7 @@ static bool multifd_tls_channel_connect(MultiFDSendParams *p, + { + MigrationState *s = migrate_get_current(); + const char *hostname = s->hostname; ++ MultiFDTLSThreadArgs *args; + QIOChannelTLS *tioc; + + tioc = migration_tls_client_create(ioc, hostname, errp); +@@ -924,11 +931,14 @@ static bool multifd_tls_channel_connect(MultiFDSendParams *p, + object_unref(OBJECT(ioc)); + trace_multifd_tls_outgoing_handshake_start(ioc, tioc, hostname); + qio_channel_set_name(QIO_CHANNEL(tioc), "multifd-tls-outgoing"); +- p->c = QIO_CHANNEL(tioc); ++ ++ args = g_new0(MultiFDTLSThreadArgs, 1); ++ args->tioc = tioc; ++ args->p = p; + + p->tls_thread_created = true; + qemu_thread_create(&p->tls_thread, "multifd-tls-handshake-worker", +- multifd_tls_handshake_thread, p, ++ multifd_tls_handshake_thread, args, + QEMU_THREAD_JOINABLE); + return true; + } +@@ -941,6 +951,7 @@ static bool multifd_channel_connect(MultiFDSendParams *p, + + migration_ioc_register_yank(ioc); + p->registered_yank = true; ++ /* Setup p->c only if the channel is completely setup */ + p->c = ioc; + + 
p->thread_created = true; +@@ -994,14 +1005,12 @@ out: + + trace_multifd_new_send_channel_async_error(p->id, local_err); + multifd_send_set_error(local_err); +- if (!p->c) { +- /* +- * If no channel has been created, drop the initial +- * reference. Otherwise cleanup happens at +- * multifd_send_channel_destroy() +- */ +- object_unref(OBJECT(ioc)); +- } ++ /* ++ * For error cases (TLS or non-TLS), IO channel is always freed here ++ * rather than when cleanup multifd: since p->c is not set, multifd ++ * cleanup code doesn't even know its existence. ++ */ ++ object_unref(OBJECT(ioc)); + error_free(local_err); + } + +-- +2.43.0 + diff --git a/0411-migration-multifd-drop-registered-yank.patch b/0411-migration-multifd-drop-registered-yank.patch new file mode 100644 index 0000000..ca064e6 --- /dev/null +++ b/0411-migration-multifd-drop-registered-yank.patch @@ -0,0 +1,67 @@ +From 1d19243d056a74655130a18fd2e8cacde0a8be83 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Thu, 22 Feb 2024 17:52:58 +0800 +Subject: [PATCH] migration/multifd: Drop registered_yank + +commit 0518b5d8d30d3a4d0ea4f45d61527bcdc43044d2 upstream. + +With a clear definition of p->c protocol, where we only set it up if the +channel is fully established (TLS or non-TLS), registered_yank boolean will +have equal meaning of "p->c != NULL". + +Drop registered_yank by checking p->c instead. + +Intel-SIG: commit 0518b5d8d30d migration/multifd: Drop registered_yank + +Reviewed-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20240222095301.171137-3-peterx@redhat.com +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/multifd.c | 7 +++---- + migration/multifd.h | 2 -- + 2 files changed, 3 insertions(+), 6 deletions(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index 84a6b9e58f..1d039a4840 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -666,11 +666,11 @@ static int multifd_send_channel_destroy(QIOChannel *send) + + static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) + { +- if (p->registered_yank) { ++ if (p->c) { + migration_ioc_unregister_yank(p->c); ++ multifd_send_channel_destroy(p->c); ++ p->c = NULL; + } +- multifd_send_channel_destroy(p->c); +- p->c = NULL; + qemu_sem_destroy(&p->sem); + qemu_sem_destroy(&p->sem_sync); + g_free(p->name); +@@ -950,7 +950,6 @@ static bool multifd_channel_connect(MultiFDSendParams *p, + qio_channel_set_delay(ioc, false); + + migration_ioc_register_yank(ioc); +- p->registered_yank = true; + /* Setup p->c only if the channel is completely setup */ + p->c = ioc; + +diff --git a/migration/multifd.h b/migration/multifd.h +index 8a1cad0996..b3fe27ae93 100644 +--- a/migration/multifd.h ++++ b/migration/multifd.h +@@ -78,8 +78,6 @@ typedef struct { + bool tls_thread_created; + /* communication channel */ + QIOChannel *c; +- /* is the yank function registered */ +- bool registered_yank; + /* packet allocated len */ + uint32_t packet_len; + /* guest page size */ +-- +2.43.0 + diff --git a/0412-migration-multifd-make-multifd-channel-connect-retur.patch b/0412-migration-multifd-make-multifd-channel-connect-retur.patch new file mode 100644 index 0000000..56fece2 --- /dev/null +++ b/0412-migration-multifd-make-multifd-channel-connect-retur.patch @@ -0,0 +1,56 @@ +From 558dd1f9a0ed7b75d7c02b346a1759dbc2f610be Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Thu, 22 Feb 2024 17:52:59 +0800 +Subject: [PATCH] migration/multifd: Make multifd_channel_connect() return void + +commit 770de49c00fa9eb262473f282c92979b47b7fd22 upstream. 
+ +It never fails, drop the retval and also the Error**. + +Intel-SIG: commit 770de49c00fa migration/multifd: Make multifd_channel_connect() return void + +Suggested-by: Avihai Horon +Reviewed-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20240222095301.171137-4-peterx@redhat.com +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/multifd.c | 8 +++----- + 1 file changed, 3 insertions(+), 5 deletions(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index 1d039a4840..af89e05915 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -943,9 +943,7 @@ static bool multifd_tls_channel_connect(MultiFDSendParams *p, + return true; + } + +-static bool multifd_channel_connect(MultiFDSendParams *p, +- QIOChannel *ioc, +- Error **errp) ++static void multifd_channel_connect(MultiFDSendParams *p, QIOChannel *ioc) + { + qio_channel_set_delay(ioc, false); + +@@ -956,7 +954,6 @@ static bool multifd_channel_connect(MultiFDSendParams *p, + p->thread_created = true; + qemu_thread_create(&p->thread, p->name, multifd_send_thread, p, + QEMU_THREAD_JOINABLE); +- return true; + } + + /* +@@ -988,7 +985,8 @@ static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque) + return; + } + } else { +- ret = multifd_channel_connect(p, ioc, &local_err); ++ multifd_channel_connect(p, ioc); ++ ret = true; + } + + out: +-- +2.43.0 + diff --git a/0413-migration-multifd-cleanup-outgoing-args-in-state-des.patch b/0413-migration-multifd-cleanup-outgoing-args-in-state-des.patch new file mode 100644 index 0000000..48f548f --- /dev/null +++ b/0413-migration-multifd-cleanup-outgoing-args-in-state-des.patch @@ -0,0 +1,80 @@ +From d54d3b211fa8524509b7cbcc7997795f462ee3dd Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Thu, 22 Feb 2024 17:53:00 +0800 +Subject: [PATCH] migration/multifd: Cleanup outgoing_args in state destroy + +commit 72b90b96872acc5d00f9c16dfc196543349361da upstream. + +outgoing_args is a global cache of socket address to be reused in multifd. +Freeing the cache in per-channel destructor is more or less a hack. Move +it to multifd_send_cleanup_state() so it only get checked once. Use a +small helper to do so because it's internal of socket.c. 
+ +Intel-SIG: commit 72b90b96872a migration/multifd: Cleanup outgoing_args in state destroy + +Reviewed-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20240222095301.171137-5-peterx@redhat.com +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/multifd.c | 1 + + migration/socket.c | 12 ++++++++---- + migration/socket.h | 2 ++ + 3 files changed, 11 insertions(+), 4 deletions(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index af89e05915..fa33fd98b4 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -689,6 +689,7 @@ static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) + + static void multifd_send_cleanup_state(void) + { ++ socket_cleanup_outgoing_migration(); + qemu_sem_destroy(&multifd_send_state->channels_created); + qemu_sem_destroy(&multifd_send_state->channels_ready); + g_free(multifd_send_state->params); +diff --git a/migration/socket.c b/migration/socket.c +index 98e3ea1514..3184c7c3c1 100644 +--- a/migration/socket.c ++++ b/migration/socket.c +@@ -64,10 +64,6 @@ int socket_send_channel_destroy(QIOChannel *send) + { + /* Remove channel */ + object_unref(OBJECT(send)); +- if (outgoing_args.saddr) { +- qapi_free_SocketAddress(outgoing_args.saddr); +- outgoing_args.saddr = NULL; +- } + return 0; + } + +@@ -137,6 +133,14 @@ void socket_start_outgoing_migration(MigrationState *s, + NULL); + } + ++void socket_cleanup_outgoing_migration(void) ++{ ++ if (outgoing_args.saddr) { ++ qapi_free_SocketAddress(outgoing_args.saddr); ++ outgoing_args.saddr = NULL; ++ } ++} ++ + static void socket_accept_incoming_migration(QIONetListener *listener, + QIOChannelSocket *cioc, + gpointer opaque) +diff --git a/migration/socket.h b/migration/socket.h +index 5e4c33b8ea..5f52eddd4c 100644 +--- a/migration/socket.h ++++ b/migration/socket.h +@@ -29,4 +29,6 @@ void socket_start_incoming_migration(SocketAddress *saddr, Error **errp); + + void socket_start_outgoing_migration(MigrationState *s, + SocketAddress *saddr, Error **errp); ++void socket_cleanup_outgoing_migration(void); ++ + #endif +-- +2.43.0 + diff --git a/0414-migration-multifd-drop-unnecessary-helper-to-destroy.patch b/0414-migration-multifd-drop-unnecessary-helper-to-destroy.patch new file mode 100644 index 0000000..cc99471 --- /dev/null +++ b/0414-migration-multifd-drop-unnecessary-helper-to-destroy.patch @@ -0,0 +1,79 @@ +From b0bb05bc9fe0869dab98eb3a7915f83f2128957b Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Thu, 22 Feb 2024 17:53:01 +0800 +Subject: [PATCH] migration/multifd: Drop unnecessary helper to destroy IOC + +commit c9a7e83c9d64fd5ebc759186789e1b753c919d32 upstream. + +Both socket_send_channel_destroy() and multifd_send_channel_destroy() are +unnecessary wrappers to destroy an IOC, as the only thing to do is to +release the final IOC reference. We have plenty of code that destroys an +IOC using direct unref() already; keep that style. 
+ +Intel-SIG: commit c9a7e83c9d64 migration/multifd: Drop unnecessary helper to destroy IOC + +Reviewed-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20240222095301.171137-6-peterx@redhat.com +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/multifd.c | 7 +------ + migration/socket.c | 7 ------- + migration/socket.h | 1 - + 3 files changed, 1 insertion(+), 14 deletions(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index fa33fd98b4..6c07f19af1 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -659,16 +659,11 @@ static void multifd_send_terminate_threads(void) + } + } + +-static int multifd_send_channel_destroy(QIOChannel *send) +-{ +- return socket_send_channel_destroy(send); +-} +- + static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) + { + if (p->c) { + migration_ioc_unregister_yank(p->c); +- multifd_send_channel_destroy(p->c); ++ object_unref(OBJECT(p->c)); + p->c = NULL; + } + qemu_sem_destroy(&p->sem); +diff --git a/migration/socket.c b/migration/socket.c +index 3184c7c3c1..9ab89b1e08 100644 +--- a/migration/socket.c ++++ b/migration/socket.c +@@ -60,13 +60,6 @@ QIOChannel *socket_send_channel_create_sync(Error **errp) + return QIO_CHANNEL(sioc); + } + +-int socket_send_channel_destroy(QIOChannel *send) +-{ +- /* Remove channel */ +- object_unref(OBJECT(send)); +- return 0; +-} +- + struct SocketConnectData { + MigrationState *s; + char *hostname; +diff --git a/migration/socket.h b/migration/socket.h +index 5f52eddd4c..46c233ecd2 100644 +--- a/migration/socket.h ++++ b/migration/socket.h +@@ -23,7 +23,6 @@ + + void socket_send_channel_create(QIOTaskFunc f, void *data); + QIOChannel *socket_send_channel_create_sync(Error **errp); +-int socket_send_channel_destroy(QIOChannel *send); + + void socket_start_incoming_migration(SocketAddress *saddr, Error **errp); + +-- +2.43.0 + diff --git a/0415-migration-multifd-cleanup-multifd-recv-sync-main.patch b/0415-migration-multifd-cleanup-multifd-recv-sync-main.patch new file mode 100644 index 0000000..af9c2f6 --- /dev/null +++ b/0415-migration-multifd-cleanup-multifd-recv-sync-main.patch @@ -0,0 +1,77 @@ +From 7715bd4fa71993a456b1111dfcf3f43f9f48e0c0 Mon Sep 17 00:00:00 2001 +From: Fabiano Rosas +Date: Thu, 29 Feb 2024 12:29:55 -0300 +Subject: [PATCH] migration/multifd: Cleanup multifd_recv_sync_main + +commit 4aac6b1e9bd48677c4f24518fe86ffd34c677d5a upstream. + +Some minor cleanups and documentation for multifd_recv_sync_main. + +Use thread_count as done in other parts of the code. Remove p->id from +the multifd_recv_state sync, since that is global and not tied to a +channel. Add documentation for the sync steps. 
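+
+The resulting flow, distilled from the hunks below (tracing and the
+per-channel bookkeeping done under p->mutex are elided), is a two-step
+synchronization:
+
+    int thread_count = migrate_multifd_channels();
+
+    /* step 1: wait until every channel has received the SYNC packet */
+    for (i = 0; i < thread_count; i++) {
+        qemu_sem_wait(&multifd_recv_state->sem_sync);
+    }
+
+    /* step 2: sync done, release each channel for the next iteration */
+    for (i = 0; i < thread_count; i++) {
+        qemu_sem_post(&multifd_recv_state->params[i].sem_sync);
+    }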
+ +Intel-SIG: commit 4aac6b1e9bd4 migration/multifd: Cleanup multifd_recv_sync_main + +Reviewed-by: Peter Xu +Signed-off-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20240229153017.2221-2-farosas@suse.de +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/multifd.c | 17 +++++++++++++---- + migration/trace-events | 2 +- + 2 files changed, 14 insertions(+), 5 deletions(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index 6c07f19af1..c7389bf833 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -1182,18 +1182,27 @@ void multifd_recv_cleanup(void) + + void multifd_recv_sync_main(void) + { ++ int thread_count = migrate_multifd_channels(); + int i; + + if (!migrate_multifd()) { + return; + } +- for (i = 0; i < migrate_multifd_channels(); i++) { +- MultiFDRecvParams *p = &multifd_recv_state->params[i]; + +- trace_multifd_recv_sync_main_wait(p->id); ++ /* ++ * Initiate the synchronization by waiting for all channels. ++ * For socket-based migration this means each channel has received ++ * the SYNC packet on the stream. ++ */ ++ for (i = 0; i < thread_count; i++) { ++ trace_multifd_recv_sync_main_wait(i); + qemu_sem_wait(&multifd_recv_state->sem_sync); + } +- for (i = 0; i < migrate_multifd_channels(); i++) { ++ ++ /* ++ * Sync done. Release the channels for the next iteration. ++ */ ++ for (i = 0; i < thread_count; i++) { + MultiFDRecvParams *p = &multifd_recv_state->params[i]; + + WITH_QEMU_LOCK_GUARD(&p->mutex) { +diff --git a/migration/trace-events b/migration/trace-events +index 298ad2b0dd..bf1a069632 100644 +--- a/migration/trace-events ++++ b/migration/trace-events +@@ -132,7 +132,7 @@ multifd_recv(uint8_t id, uint64_t packet_num, uint32_t used, uint32_t flags, uin + multifd_recv_new_channel(uint8_t id) "channel %u" + multifd_recv_sync_main(long packet_num) "packet num %ld" + multifd_recv_sync_main_signal(uint8_t id) "channel %u" +-multifd_recv_sync_main_wait(uint8_t id) "channel %u" ++multifd_recv_sync_main_wait(uint8_t id) "iter %u" + multifd_recv_terminate_threads(bool error) "error %d" + multifd_recv_thread_end(uint8_t id, uint64_t packets, uint64_t pages) "channel %u packets %" PRIu64 " pages %" PRIu64 + multifd_recv_thread_start(uint8_t id) "%u" +-- +2.43.0 + diff --git a/0416-migration-multifd-rename-multifdsend-recvparams-data.patch b/0416-migration-multifd-rename-multifdsend-recvparams-data.patch new file mode 100644 index 0000000..ebce165 --- /dev/null +++ b/0416-migration-multifd-rename-multifdsend-recvparams-data.patch @@ -0,0 +1,201 @@ +From 5dfddf619c3482241fffc8ce4ab48dd2ae36f903 Mon Sep 17 00:00:00 2001 +From: Fabiano Rosas +Date: Thu, 29 Feb 2024 12:30:06 -0300 +Subject: [PATCH] migration/multifd: Rename MultiFDSend|RecvParams::data to + compress_data + +commit 402dd7ac1c3be44f306c903cdfd2583ffec5e2fd upstream. + +Use a more specific name for the compression data so we can use the +generic for the multifd core code. 
+ +Intel-SIG: commit 402dd7ac1c3b migration/multifd: Rename MultiFDSend|RecvParams::data to compress_data + +Reviewed-by: Peter Xu +Signed-off-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20240229153017.2221-13-farosas@suse.de +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/multifd-zlib.c | 20 ++++++++++---------- + migration/multifd-zstd.c | 20 ++++++++++---------- + migration/multifd.h | 4 ++-- + 3 files changed, 22 insertions(+), 22 deletions(-) + +diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c +index 012e3bdea1..2a8f5fc9a6 100644 +--- a/migration/multifd-zlib.c ++++ b/migration/multifd-zlib.c +@@ -69,7 +69,7 @@ static int zlib_send_setup(MultiFDSendParams *p, Error **errp) + err_msg = "out of memory for buf"; + goto err_free_zbuff; + } +- p->data = z; ++ p->compress_data = z; + return 0; + + err_free_zbuff: +@@ -92,15 +92,15 @@ err_free_z: + */ + static void zlib_send_cleanup(MultiFDSendParams *p, Error **errp) + { +- struct zlib_data *z = p->data; ++ struct zlib_data *z = p->compress_data; + + deflateEnd(&z->zs); + g_free(z->zbuff); + z->zbuff = NULL; + g_free(z->buf); + z->buf = NULL; +- g_free(p->data); +- p->data = NULL; ++ g_free(p->compress_data); ++ p->compress_data = NULL; + } + + /** +@@ -117,7 +117,7 @@ static void zlib_send_cleanup(MultiFDSendParams *p, Error **errp) + static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) + { + MultiFDPages_t *pages = p->pages; +- struct zlib_data *z = p->data; ++ struct zlib_data *z = p->compress_data; + z_stream *zs = &z->zs; + uint32_t out_size = 0; + int ret; +@@ -194,7 +194,7 @@ static int zlib_recv_setup(MultiFDRecvParams *p, Error **errp) + struct zlib_data *z = g_new0(struct zlib_data, 1); + z_stream *zs = &z->zs; + +- p->data = z; ++ p->compress_data = z; + zs->zalloc = Z_NULL; + zs->zfree = Z_NULL; + zs->opaque = Z_NULL; +@@ -224,13 +224,13 @@ static int zlib_recv_setup(MultiFDRecvParams *p, Error **errp) + */ + static void zlib_recv_cleanup(MultiFDRecvParams *p) + { +- struct zlib_data *z = p->data; ++ struct zlib_data *z = p->compress_data; + + inflateEnd(&z->zs); + g_free(z->zbuff); + z->zbuff = NULL; +- g_free(p->data); +- p->data = NULL; ++ g_free(p->compress_data); ++ p->compress_data = NULL; + } + + /** +@@ -246,7 +246,7 @@ static void zlib_recv_cleanup(MultiFDRecvParams *p) + */ + static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp) + { +- struct zlib_data *z = p->data; ++ struct zlib_data *z = p->compress_data; + z_stream *zs = &z->zs; + uint32_t in_size = p->next_packet_size; + /* we measure the change of total_out */ +diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c +index dc8fe43e94..593cf290ad 100644 +--- a/migration/multifd-zstd.c ++++ b/migration/multifd-zstd.c +@@ -52,7 +52,7 @@ static int zstd_send_setup(MultiFDSendParams *p, Error **errp) + struct zstd_data *z = g_new0(struct zstd_data, 1); + int res; + +- p->data = z; ++ p->compress_data = z; + z->zcs = ZSTD_createCStream(); + if (!z->zcs) { + g_free(z); +@@ -90,14 +90,14 @@ static int zstd_send_setup(MultiFDSendParams *p, Error **errp) + */ + static void zstd_send_cleanup(MultiFDSendParams *p, Error **errp) + { +- struct zstd_data *z = p->data; ++ struct zstd_data *z = p->compress_data; + + ZSTD_freeCStream(z->zcs); + z->zcs = NULL; + g_free(z->zbuff); + z->zbuff = NULL; +- g_free(p->data); +- p->data = NULL; ++ g_free(p->compress_data); ++ p->compress_data = NULL; + } + + /** +@@ -114,7 +114,7 @@ static void zstd_send_cleanup(MultiFDSendParams *p, Error **errp) + static int 
zstd_send_prepare(MultiFDSendParams *p, Error **errp) + { + MultiFDPages_t *pages = p->pages; +- struct zstd_data *z = p->data; ++ struct zstd_data *z = p->compress_data; + int ret; + uint32_t i; + +@@ -183,7 +183,7 @@ static int zstd_recv_setup(MultiFDRecvParams *p, Error **errp) + struct zstd_data *z = g_new0(struct zstd_data, 1); + int ret; + +- p->data = z; ++ p->compress_data = z; + z->zds = ZSTD_createDStream(); + if (!z->zds) { + g_free(z); +@@ -221,14 +221,14 @@ static int zstd_recv_setup(MultiFDRecvParams *p, Error **errp) + */ + static void zstd_recv_cleanup(MultiFDRecvParams *p) + { +- struct zstd_data *z = p->data; ++ struct zstd_data *z = p->compress_data; + + ZSTD_freeDStream(z->zds); + z->zds = NULL; + g_free(z->zbuff); + z->zbuff = NULL; +- g_free(p->data); +- p->data = NULL; ++ g_free(p->compress_data); ++ p->compress_data = NULL; + } + + /** +@@ -248,7 +248,7 @@ static int zstd_recv_pages(MultiFDRecvParams *p, Error **errp) + uint32_t out_size = 0; + uint32_t expected_size = p->normal_num * p->page_size; + uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; +- struct zstd_data *z = p->data; ++ struct zstd_data *z = p->compress_data; + int ret; + int i; + +diff --git a/migration/multifd.h b/migration/multifd.h +index b3fe27ae93..adccd3532f 100644 +--- a/migration/multifd.h ++++ b/migration/multifd.h +@@ -127,7 +127,7 @@ typedef struct { + /* number of iovs used */ + uint32_t iovs_num; + /* used for compression methods */ +- void *data; ++ void *compress_data; + } MultiFDSendParams; + + typedef struct { +@@ -183,7 +183,7 @@ typedef struct { + /* num of non zero pages */ + uint32_t normal_num; + /* used for de-compression methods */ +- void *data; ++ void *compress_data; + } MultiFDRecvParams; + + typedef struct { +-- +2.43.0 + diff --git a/0417-migration-multifd-decouple-recv-method-from-pages.patch b/0417-migration-multifd-decouple-recv-method-from-pages.patch new file mode 100644 index 0000000..45b7b07 --- /dev/null +++ b/0417-migration-multifd-decouple-recv-method-from-pages.patch @@ -0,0 +1,159 @@ +From f442d8522eecef0c50a8295f61b674e112c81240 Mon Sep 17 00:00:00 2001 +From: Fabiano Rosas +Date: Thu, 29 Feb 2024 12:30:07 -0300 +Subject: [PATCH] migration/multifd: Decouple recv method from pages + +commit 9db191251381c75e57201f7b07330ca982a55d1e upstream. + +Next patches will abstract the type of data being received by the +channels, so do some cleanup now to remove references to pages and +dependency on 'normal_num'. + +Intel-SIG: commit 9db191251381 migration/multifd: Decouple recv method from pages + +Reviewed-by: Peter Xu +Signed-off-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20240229153017.2221-14-farosas@suse.de +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/multifd-zlib.c | 6 +++--- + migration/multifd-zstd.c | 6 +++--- + migration/multifd.c | 13 ++++++++----- + migration/multifd.h | 4 ++-- + 4 files changed, 16 insertions(+), 13 deletions(-) + +diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c +index 2a8f5fc9a6..6120faad65 100644 +--- a/migration/multifd-zlib.c ++++ b/migration/multifd-zlib.c +@@ -234,7 +234,7 @@ static void zlib_recv_cleanup(MultiFDRecvParams *p) + } + + /** +- * zlib_recv_pages: read the data from the channel into actual pages ++ * zlib_recv: read the data from the channel into actual pages + * + * Read the compressed buffer, and uncompress it into the actual + * pages. 
+@@ -244,7 +244,7 @@ static void zlib_recv_cleanup(MultiFDRecvParams *p) + * @p: Params for the channel that we are using + * @errp: pointer to an error + */ +-static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp) ++static int zlib_recv(MultiFDRecvParams *p, Error **errp) + { + struct zlib_data *z = p->compress_data; + z_stream *zs = &z->zs; +@@ -319,7 +319,7 @@ static MultiFDMethods multifd_zlib_ops = { + .send_prepare = zlib_send_prepare, + .recv_setup = zlib_recv_setup, + .recv_cleanup = zlib_recv_cleanup, +- .recv_pages = zlib_recv_pages ++ .recv = zlib_recv + }; + + static void multifd_zlib_register(void) +diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c +index 593cf290ad..cac236833d 100644 +--- a/migration/multifd-zstd.c ++++ b/migration/multifd-zstd.c +@@ -232,7 +232,7 @@ static void zstd_recv_cleanup(MultiFDRecvParams *p) + } + + /** +- * zstd_recv_pages: read the data from the channel into actual pages ++ * zstd_recv: read the data from the channel into actual pages + * + * Read the compressed buffer, and uncompress it into the actual + * pages. +@@ -242,7 +242,7 @@ static void zstd_recv_cleanup(MultiFDRecvParams *p) + * @p: Params for the channel that we are using + * @errp: pointer to an error + */ +-static int zstd_recv_pages(MultiFDRecvParams *p, Error **errp) ++static int zstd_recv(MultiFDRecvParams *p, Error **errp) + { + uint32_t in_size = p->next_packet_size; + uint32_t out_size = 0; +@@ -310,7 +310,7 @@ static MultiFDMethods multifd_zstd_ops = { + .send_prepare = zstd_send_prepare, + .recv_setup = zstd_recv_setup, + .recv_cleanup = zstd_recv_cleanup, +- .recv_pages = zstd_recv_pages ++ .recv = zstd_recv + }; + + static void multifd_zstd_register(void) +diff --git a/migration/multifd.c b/migration/multifd.c +index c7389bf833..3a8520097b 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -197,7 +197,7 @@ static void nocomp_recv_cleanup(MultiFDRecvParams *p) + } + + /** +- * nocomp_recv_pages: read the data from the channel into actual pages ++ * nocomp_recv: read the data from the channel + * + * For no compression we just need to read things into the correct place. 
+ * +@@ -206,7 +206,7 @@ static void nocomp_recv_cleanup(MultiFDRecvParams *p) + * @p: Params for the channel that we are using + * @errp: pointer to an error + */ +-static int nocomp_recv_pages(MultiFDRecvParams *p, Error **errp) ++static int nocomp_recv(MultiFDRecvParams *p, Error **errp) + { + uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; + +@@ -228,7 +228,7 @@ static MultiFDMethods multifd_nocomp_ops = { + .send_prepare = nocomp_send_prepare, + .recv_setup = nocomp_recv_setup, + .recv_cleanup = nocomp_recv_cleanup, +- .recv_pages = nocomp_recv_pages ++ .recv = nocomp_recv + }; + + static MultiFDMethods *multifd_ops[MULTIFD_COMPRESSION__MAX] = { +@@ -1227,6 +1227,8 @@ static void *multifd_recv_thread(void *opaque) + + while (true) { + uint32_t flags; ++ bool has_data = false; ++ p->normal_num = 0; + + if (multifd_recv_should_exit()) { + break; +@@ -1248,10 +1250,11 @@ static void *multifd_recv_thread(void *opaque) + flags = p->flags; + /* recv methods don't know how to handle the SYNC flag */ + p->flags &= ~MULTIFD_FLAG_SYNC; ++ has_data = !!p->normal_num; + qemu_mutex_unlock(&p->mutex); + +- if (p->normal_num) { +- ret = multifd_recv_state->ops->recv_pages(p, &local_err); ++ if (has_data) { ++ ret = multifd_recv_state->ops->recv(p, &local_err); + if (ret != 0) { + break; + } +diff --git a/migration/multifd.h b/migration/multifd.h +index adccd3532f..6a54377cc1 100644 +--- a/migration/multifd.h ++++ b/migration/multifd.h +@@ -197,8 +197,8 @@ typedef struct { + int (*recv_setup)(MultiFDRecvParams *p, Error **errp); + /* Cleanup for receiving side */ + void (*recv_cleanup)(MultiFDRecvParams *p); +- /* Read all pages */ +- int (*recv_pages)(MultiFDRecvParams *p, Error **errp); ++ /* Read all data */ ++ int (*recv)(MultiFDRecvParams *p, Error **errp); + } MultiFDMethods; + + void multifd_register_ops(int method, MultiFDMethods *ops); +-- +2.43.0 + diff --git a/0418-migration-multifd-allow-multifd-without-packets.patch b/0418-migration-multifd-allow-multifd-without-packets.patch new file mode 100644 index 0000000..9695c29 --- /dev/null +++ b/0418-migration-multifd-allow-multifd-without-packets.patch @@ -0,0 +1,364 @@ +From 14903feba798071a3d14b2db29a2fe89e0203a45 Mon Sep 17 00:00:00 2001 +From: Fabiano Rosas +Date: Thu, 29 Feb 2024 12:30:08 -0300 +Subject: [PATCH] migration/multifd: Allow multifd without packets + +commit 06833d83f8978139395da0f1d6a9fad81b9dd024 upstream. + +For the upcoming support to the new 'mapped-ram' migration stream +format, we cannot use multifd packets because each write into the +ramblock section in the migration file is expected to contain only the +guest pages. They are written at their respective offsets relative to +the ramblock section header. + +There is no space for the packet information and the expected gains +from the new approach come partly from being able to write the pages +sequentially without extraneous data in between. + +The new format also simply doesn't need the packets and all necessary +information can be taken from the standard migration headers with some +(future) changes to multifd code. + +Use the presence of the mapped-ram capability to decide whether to +send packets. + +This only moves code under multifd_use_packets(), it has no effect for +now as mapped-ram cannot yet be enabled with multifd. 
+ +Intel-SIG: commit 06833d83f897 migration/multifd: Allow multifd without packets + +Reviewed-by: Peter Xu +Signed-off-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20240229153017.2221-15-farosas@suse.de +Signed-off-by: Peter Xu +[jz: make multifd_use_packet to always return true, since mapped-ram + is not backported] +Signed-off-by: Jason Zeng +--- + migration/multifd.c | 175 +++++++++++++++++++++++++++++--------------- + 1 file changed, 114 insertions(+), 61 deletions(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index 3a8520097b..9e617b608e 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -92,6 +92,11 @@ struct { + MultiFDMethods *ops; + } *multifd_recv_state; + ++static bool multifd_use_packets(void) ++{ ++ return true; ++} ++ + /* Multifd without compression */ + + /** +@@ -122,6 +127,19 @@ static void nocomp_send_cleanup(MultiFDSendParams *p, Error **errp) + return; + } + ++static void multifd_send_prepare_iovs(MultiFDSendParams *p) ++{ ++ MultiFDPages_t *pages = p->pages; ++ ++ for (int i = 0; i < pages->num; i++) { ++ p->iov[p->iovs_num].iov_base = pages->block->host + pages->offset[i]; ++ p->iov[p->iovs_num].iov_len = p->page_size; ++ p->iovs_num++; ++ } ++ ++ p->next_packet_size = pages->num * p->page_size; ++} ++ + /** + * nocomp_send_prepare: prepare date to be able to send + * +@@ -136,9 +154,13 @@ static void nocomp_send_cleanup(MultiFDSendParams *p, Error **errp) + static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp) + { + bool use_zero_copy_send = migrate_zero_copy_send(); +- MultiFDPages_t *pages = p->pages; + int ret; + ++ if (!multifd_use_packets()) { ++ multifd_send_prepare_iovs(p); ++ return 0; ++ } ++ + if (!use_zero_copy_send) { + /* + * Only !zerocopy needs the header in IOV; zerocopy will +@@ -147,13 +169,7 @@ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp) + multifd_send_prepare_header(p); + } + +- for (int i = 0; i < pages->num; i++) { +- p->iov[p->iovs_num].iov_base = pages->block->host + pages->offset[i]; +- p->iov[p->iovs_num].iov_len = p->page_size; +- p->iovs_num++; +- } +- +- p->next_packet_size = pages->num * p->page_size; ++ multifd_send_prepare_iovs(p); + p->flags |= MULTIFD_FLAG_NOCOMP; + + multifd_send_fill_packet(p); +@@ -208,7 +224,13 @@ static void nocomp_recv_cleanup(MultiFDRecvParams *p) + */ + static int nocomp_recv(MultiFDRecvParams *p, Error **errp) + { +- uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; ++ uint32_t flags; ++ ++ if (!multifd_use_packets()) { ++ return 0; ++ } ++ ++ flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; + + if (flags != MULTIFD_FLAG_NOCOMP) { + error_setg(errp, "multifd %u: flags received %x flags expected %x", +@@ -795,15 +817,18 @@ static void *multifd_send_thread(void *opaque) + MigrationThread *thread = NULL; + Error *local_err = NULL; + int ret = 0; ++ bool use_packets = multifd_use_packets(); + + thread = migration_threads_add(p->name, qemu_get_thread_id()); + + trace_multifd_send_thread_start(p->id); + rcu_register_thread(); + +- if (multifd_send_initial_packet(p, &local_err) < 0) { +- ret = -1; +- goto out; ++ if (use_packets) { ++ if (multifd_send_initial_packet(p, &local_err) < 0) { ++ ret = -1; ++ goto out; ++ } + } + + while (true) { +@@ -854,16 +879,20 @@ static void *multifd_send_thread(void *opaque) + * it doesn't require explicit memory barriers. 
+ */ + assert(qatomic_read(&p->pending_sync)); +- p->flags = MULTIFD_FLAG_SYNC; +- multifd_send_fill_packet(p); +- ret = qio_channel_write_all(p->c, (void *)p->packet, +- p->packet_len, &local_err); +- if (ret != 0) { +- break; ++ ++ if (use_packets) { ++ p->flags = MULTIFD_FLAG_SYNC; ++ multifd_send_fill_packet(p); ++ ret = qio_channel_write_all(p->c, (void *)p->packet, ++ p->packet_len, &local_err); ++ if (ret != 0) { ++ break; ++ } ++ /* p->next_packet_size will always be zero for a SYNC packet */ ++ stat64_add(&mig_stats.multifd_bytes, p->packet_len); ++ p->flags = 0; + } +- /* p->next_packet_size will always be zero for a SYNC packet */ +- stat64_add(&mig_stats.multifd_bytes, p->packet_len); +- p->flags = 0; ++ + qatomic_set(&p->pending_sync, false); + qemu_sem_post(&p->sem_sync); + } +@@ -1018,6 +1047,7 @@ bool multifd_send_setup(void) + Error *local_err = NULL; + int thread_count, ret = 0; + uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size(); ++ bool use_packets = multifd_use_packets(); + uint8_t i; + + if (!migrate_multifd()) { +@@ -1040,14 +1070,20 @@ bool multifd_send_setup(void) + qemu_sem_init(&p->sem_sync, 0); + p->id = i; + p->pages = multifd_pages_init(page_count); +- p->packet_len = sizeof(MultiFDPacket_t) +- + sizeof(uint64_t) * page_count; +- p->packet = g_malloc0(p->packet_len); +- p->packet->magic = cpu_to_be32(MULTIFD_MAGIC); +- p->packet->version = cpu_to_be32(MULTIFD_VERSION); ++ ++ if (use_packets) { ++ p->packet_len = sizeof(MultiFDPacket_t) ++ + sizeof(uint64_t) * page_count; ++ p->packet = g_malloc0(p->packet_len); ++ p->packet->magic = cpu_to_be32(MULTIFD_MAGIC); ++ p->packet->version = cpu_to_be32(MULTIFD_VERSION); ++ ++ /* We need one extra place for the packet header */ ++ p->iov = g_new0(struct iovec, page_count + 1); ++ } else { ++ p->iov = g_new0(struct iovec, page_count); ++ } + p->name = g_strdup_printf("multifdsend_%d", i); +- /* We need one extra place for the packet header */ +- p->iov = g_new0(struct iovec, page_count + 1); + p->page_size = qemu_target_page_size(); + p->page_count = page_count; + p->write_flags = 0; +@@ -1110,7 +1146,9 @@ static void multifd_recv_terminate_threads(Error *err) + * multifd_recv_thread may hung at MULTIFD_FLAG_SYNC handle code, + * however try to wakeup it without harm in cleanup phase. 
+ */ +- qemu_sem_post(&p->sem_sync); ++ if (multifd_use_packets()) { ++ qemu_sem_post(&p->sem_sync); ++ } + + /* + * We could arrive here for two reasons: +@@ -1185,7 +1223,7 @@ void multifd_recv_sync_main(void) + int thread_count = migrate_multifd_channels(); + int i; + +- if (!migrate_multifd()) { ++ if (!migrate_multifd() || !multifd_use_packets()) { + return; + } + +@@ -1220,13 +1258,14 @@ static void *multifd_recv_thread(void *opaque) + { + MultiFDRecvParams *p = opaque; + Error *local_err = NULL; ++ bool use_packets = multifd_use_packets(); + int ret; + + trace_multifd_recv_thread_start(p->id); + rcu_register_thread(); + + while (true) { +- uint32_t flags; ++ uint32_t flags = 0; + bool has_data = false; + p->normal_num = 0; + +@@ -1234,25 +1273,27 @@ static void *multifd_recv_thread(void *opaque) + break; + } + +- ret = qio_channel_read_all_eof(p->c, (void *)p->packet, +- p->packet_len, &local_err); +- if (ret == 0 || ret == -1) { /* 0: EOF -1: Error */ +- break; +- } ++ if (use_packets) { ++ ret = qio_channel_read_all_eof(p->c, (void *)p->packet, ++ p->packet_len, &local_err); ++ if (ret == 0 || ret == -1) { /* 0: EOF -1: Error */ ++ break; ++ } + +- qemu_mutex_lock(&p->mutex); +- ret = multifd_recv_unfill_packet(p, &local_err); +- if (ret) { ++ qemu_mutex_lock(&p->mutex); ++ ret = multifd_recv_unfill_packet(p, &local_err); ++ if (ret) { ++ qemu_mutex_unlock(&p->mutex); ++ break; ++ } ++ ++ flags = p->flags; ++ /* recv methods don't know how to handle the SYNC flag */ ++ p->flags &= ~MULTIFD_FLAG_SYNC; ++ has_data = !!p->normal_num; + qemu_mutex_unlock(&p->mutex); +- break; + } + +- flags = p->flags; +- /* recv methods don't know how to handle the SYNC flag */ +- p->flags &= ~MULTIFD_FLAG_SYNC; +- has_data = !!p->normal_num; +- qemu_mutex_unlock(&p->mutex); +- + if (has_data) { + ret = multifd_recv_state->ops->recv(p, &local_err); + if (ret != 0) { +@@ -1260,9 +1301,11 @@ static void *multifd_recv_thread(void *opaque) + } + } + +- if (flags & MULTIFD_FLAG_SYNC) { +- qemu_sem_post(&multifd_recv_state->sem_sync); +- qemu_sem_wait(&p->sem_sync); ++ if (use_packets) { ++ if (flags & MULTIFD_FLAG_SYNC) { ++ qemu_sem_post(&multifd_recv_state->sem_sync); ++ qemu_sem_wait(&p->sem_sync); ++ } + } + } + +@@ -1281,6 +1324,7 @@ int multifd_recv_setup(Error **errp) + { + int thread_count; + uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size(); ++ bool use_packets = multifd_use_packets(); + uint8_t i; + + /* +@@ -1305,9 +1349,12 @@ int multifd_recv_setup(Error **errp) + qemu_mutex_init(&p->mutex); + qemu_sem_init(&p->sem_sync, 0); + p->id = i; +- p->packet_len = sizeof(MultiFDPacket_t) +- + sizeof(uint64_t) * page_count; +- p->packet = g_malloc0(p->packet_len); ++ ++ if (use_packets) { ++ p->packet_len = sizeof(MultiFDPacket_t) ++ + sizeof(uint64_t) * page_count; ++ p->packet = g_malloc0(p->packet_len); ++ } + p->name = g_strdup_printf("multifdrecv_%d", i); + p->iov = g_new0(struct iovec, page_count); + p->normal = g_new0(ram_addr_t, page_count); +@@ -1351,18 +1398,24 @@ void multifd_recv_new_channel(QIOChannel *ioc, Error **errp) + { + MultiFDRecvParams *p; + Error *local_err = NULL; ++ bool use_packets = multifd_use_packets(); + int id; + +- id = multifd_recv_initial_packet(ioc, &local_err); +- if (id < 0) { +- multifd_recv_terminate_threads(local_err); +- error_propagate_prepend(errp, local_err, +- "failed to receive packet" +- " via multifd channel %d: ", +- qatomic_read(&multifd_recv_state->count)); +- return; ++ if (use_packets) { ++ id = multifd_recv_initial_packet(ioc, 
&local_err); ++ if (id < 0) { ++ multifd_recv_terminate_threads(local_err); ++ error_propagate_prepend(errp, local_err, ++ "failed to receive packet" ++ " via multifd channel %d: ", ++ qatomic_read(&multifd_recv_state->count)); ++ return; ++ } ++ trace_multifd_recv_new_channel(id); ++ } else { ++ /* next patch gives this a meaningful value */ ++ id = 0; + } +- trace_multifd_recv_new_channel(id); + + p = &multifd_recv_state->params[id]; + if (p->c != NULL) { +-- +2.43.0 + diff --git a/0419-migration-multifd-add-new-migration-option-zero-page.patch b/0419-migration-multifd-add-new-migration-option-zero-page.patch new file mode 100644 index 0000000..3ca7269 --- /dev/null +++ b/0419-migration-multifd-add-new-migration-option-zero-page.patch @@ -0,0 +1,288 @@ +From 0fa7f00ab17ed6ba75b1fa240eb24f490bd11179 Mon Sep 17 00:00:00 2001 +From: Hao Xiang +Date: Mon, 11 Mar 2024 18:00:11 +0000 +Subject: [PATCH] migration/multifd: Add new migration option + zero-page-detection. + +commit 5fdbb1dfccfd59661c95cae760b8e276c5b8e65c upstream. + +This new parameter controls where the zero page checking is running. +1. If this parameter is set to 'legacy', zero page checking is +done in the migration main thread. +2. If this parameter is set to 'none', zero page checking is disabled. + +Intel-SIG: commit 5fdbb1dfccfd migration/multifd: Add new migration option zero-page-detection. + +Signed-off-by: Hao Xiang +Reviewed-by: Peter Xu +Acked-by: Markus Armbruster +Link: https://lore.kernel.org/r/20240311180015.3359271-4-hao.xiang@linux.dev +Signed-off-by: Peter Xu + + Conflicts: + hw/core/qdev-properties-system.c + include/hw/qdev-properties-system.h +[jz: resolve simple context conflicts] +Signed-off-by: Jason Zeng +--- + hw/core/qdev-properties-system.c | 10 ++++++++++ + include/hw/qdev-properties-system.h | 4 ++++ + migration/migration-hmp-cmds.c | 9 +++++++++ + migration/options.c | 21 +++++++++++++++++++++ + migration/options.h | 1 + + migration/ram.c | 4 ++++ + qapi/migration.json | 28 +++++++++++++++++++++++++++- + 7 files changed, 76 insertions(+), 1 deletion(-) + +diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-system.c +index 1473ab3d5e..4171dd104f 100644 +--- a/hw/core/qdev-properties-system.c ++++ b/hw/core/qdev-properties-system.c +@@ -687,6 +687,16 @@ const PropertyInfo qdev_prop_mig_mode = { + .set_default_value = qdev_propinfo_set_default_value_enum, + }; + ++const PropertyInfo qdev_prop_zero_page_detection = { ++ .name = "ZeroPageDetection", ++ .description = "zero_page_detection values, " ++ "none,legacy", ++ .enum_table = &ZeroPageDetection_lookup, ++ .get = qdev_propinfo_get_enum, ++ .set = qdev_propinfo_set_enum, ++ .set_default_value = qdev_propinfo_set_default_value_enum, ++}; ++ + /* --- Reserved Region --- */ + + /* +diff --git a/include/hw/qdev-properties-system.h b/include/hw/qdev-properties-system.h +index 91f7a2452d..85d4a03389 100644 +--- a/include/hw/qdev-properties-system.h ++++ b/include/hw/qdev-properties-system.h +@@ -8,6 +8,7 @@ extern const PropertyInfo qdev_prop_macaddr; + extern const PropertyInfo qdev_prop_reserved_region; + extern const PropertyInfo qdev_prop_multifd_compression; + extern const PropertyInfo qdev_prop_mig_mode; ++extern const PropertyInfo qdev_prop_zero_page_detection; + extern const PropertyInfo qdev_prop_losttickpolicy; + extern const PropertyInfo qdev_prop_blockdev_on_error; + extern const PropertyInfo qdev_prop_bios_chs_trans; +@@ -46,6 +47,9 @@ extern const PropertyInfo qdev_prop_cpus390entitlement; + #define 
DEFINE_PROP_MIG_MODE(_n, _s, _f, _d) \ + DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_mig_mode, \ + MigMode) ++#define DEFINE_PROP_ZERO_PAGE_DETECTION(_n, _s, _f, _d) \ ++ DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_zero_page_detection, \ ++ ZeroPageDetection) + #define DEFINE_PROP_LOSTTICKPOLICY(_n, _s, _f, _d) \ + DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_losttickpolicy, \ + LostTickPolicy) +diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c +index a2675518e0..16070c5eb3 100644 +--- a/migration/migration-hmp-cmds.c ++++ b/migration/migration-hmp-cmds.c +@@ -345,6 +345,11 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict) + monitor_printf(mon, "%s: %s\n", + MigrationParameter_str(MIGRATION_PARAMETER_MULTIFD_COMPRESSION), + MultiFDCompression_str(params->multifd_compression)); ++ assert(params->has_zero_page_detection); ++ monitor_printf(mon, "%s: %s\n", ++ MigrationParameter_str(MIGRATION_PARAMETER_ZERO_PAGE_DETECTION), ++ qapi_enum_lookup(&ZeroPageDetection_lookup, ++ params->zero_page_detection)); + monitor_printf(mon, "%s: %" PRIu64 " bytes\n", + MigrationParameter_str(MIGRATION_PARAMETER_XBZRLE_CACHE_SIZE), + params->xbzrle_cache_size); +@@ -651,6 +656,10 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) + p->has_multifd_zstd_level = true; + visit_type_uint8(v, param, &p->multifd_zstd_level, &err); + break; ++ case MIGRATION_PARAMETER_ZERO_PAGE_DETECTION: ++ p->has_zero_page_detection = true; ++ visit_type_ZeroPageDetection(v, param, &p->zero_page_detection, &err); ++ break; + case MIGRATION_PARAMETER_XBZRLE_CACHE_SIZE: + p->has_xbzrle_cache_size = true; + if (!visit_type_size(v, param, &cache_size, &err)) { +diff --git a/migration/options.c b/migration/options.c +index 70f6beb83c..38403bf745 100644 +--- a/migration/options.c ++++ b/migration/options.c +@@ -179,6 +179,9 @@ Property migration_properties[] = { + DEFINE_PROP_MIG_MODE("mode", MigrationState, + parameters.mode, + MIG_MODE_NORMAL), ++ DEFINE_PROP_ZERO_PAGE_DETECTION("zero-page-detection", MigrationState, ++ parameters.zero_page_detection, ++ ZERO_PAGE_DETECTION_LEGACY), + DEFINE_PROP_STRING("sev-pdh", MigrationState, parameters.sev_pdh), + DEFINE_PROP_STRING("sev-plat-cert", MigrationState, parameters.sev_plat_cert), + DEFINE_PROP_STRING("sev-amd-cert", MigrationState, parameters.sev_amd_cert), +@@ -904,6 +907,13 @@ uint64_t migrate_xbzrle_cache_size(void) + return s->parameters.xbzrle_cache_size; + } + ++ZeroPageDetection migrate_zero_page_detection(void) ++{ ++ MigrationState *s = migrate_get_current(); ++ ++ return s->parameters.zero_page_detection; ++} ++ + /* parameter setters */ + + void migrate_set_block_incremental(bool value) +@@ -1017,6 +1027,8 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp) + params->vcpu_dirty_limit = s->parameters.vcpu_dirty_limit; + params->has_mode = true; + params->mode = s->parameters.mode; ++ params->has_zero_page_detection = true; ++ params->zero_page_detection = s->parameters.zero_page_detection; + + return params; + } +@@ -1053,6 +1065,7 @@ void migrate_params_init(MigrationParameters *params) + params->has_x_vcpu_dirty_limit_period = true; + params->has_vcpu_dirty_limit = true; + params->has_mode = true; ++ params->has_zero_page_detection = true; + + params->sev_pdh = g_strdup(""); + params->sev_plat_cert = g_strdup(""); +@@ -1359,6 +1372,10 @@ static void migrate_params_test_apply(MigrateSetParameters *params, + dest->mode = params->mode; + } + ++ if (params->has_zero_page_detection) { ++ 
dest->zero_page_detection = params->zero_page_detection; ++ } ++ + if (params->sev_pdh) { + assert(params->sev_pdh->type == QTYPE_QSTRING); + dest->sev_pdh = params->sev_pdh->u.s; +@@ -1516,6 +1533,10 @@ static void migrate_params_apply(MigrateSetParameters *params, Error **errp) + s->parameters.mode = params->mode; + } + ++ if (params->has_zero_page_detection) { ++ s->parameters.zero_page_detection = params->zero_page_detection; ++ } ++ + if (params->sev_pdh) { + g_free(s->parameters.sev_pdh); + assert(params->sev_pdh->type == QTYPE_QSTRING); +diff --git a/migration/options.h b/migration/options.h +index 246c160aee..b7c4fb3861 100644 +--- a/migration/options.h ++++ b/migration/options.h +@@ -93,6 +93,7 @@ const char *migrate_tls_authz(void); + const char *migrate_tls_creds(void); + const char *migrate_tls_hostname(void); + uint64_t migrate_xbzrle_cache_size(void); ++ZeroPageDetection migrate_zero_page_detection(void); + + /* parameters setters */ + +diff --git a/migration/ram.c b/migration/ram.c +index 9d17628100..b350a1e8d6 100644 +--- a/migration/ram.c ++++ b/migration/ram.c +@@ -1138,6 +1138,10 @@ static int save_zero_page(RAMState *rs, PageSearchStatus *pss, + QEMUFile *file = pss->pss_channel; + int len = 0; + ++ if (migrate_zero_page_detection() == ZERO_PAGE_DETECTION_NONE) { ++ return 0; ++ } ++ + if (!buffer_is_zero(p, TARGET_PAGE_SIZE)) { + return 0; + } +diff --git a/qapi/migration.json b/qapi/migration.json +index 3c4724db1b..af0cefce04 100644 +--- a/qapi/migration.json ++++ b/qapi/migration.json +@@ -653,6 +653,18 @@ + { 'enum': 'MigMode', + 'data': [ 'normal', 'cpr-reboot' ] } + ++## ++# @ZeroPageDetection: ++# ++# @none: Do not perform zero page checking. ++# ++# @legacy: Perform zero page checking in main migration thread. ++# ++# Since: 9.0 ++## ++{ 'enum': 'ZeroPageDetection', ++ 'data': [ 'none', 'legacy' ] } ++ + ## + # @BitmapMigrationBitmapAliasTransform: + # +@@ -874,6 +886,10 @@ + # @mode: Migration mode. See description in @MigMode. Default is 'normal'. + # (Since 8.2) + # ++# @zero-page-detection: Whether and how to detect zero pages. ++# See description in @ZeroPageDetection. Default is 'legacy'. ++# (since 9.0) ++# + # @sev-pdh: The target host platform diffie-hellman key encoded in base64, or + # pdh filename for hygon + # (Since 4.2) +@@ -919,6 +935,7 @@ + { 'name': 'x-vcpu-dirty-limit-period', 'features': ['unstable'] }, + 'vcpu-dirty-limit', + 'mode', ++ 'zero-page-detection', + 'sev-pdh', 'sev-plat-cert', 'sev-amd-cert'] } + + ## +@@ -1074,6 +1091,10 @@ + # @mode: Migration mode. See description in @MigMode. Default is 'normal'. + # (Since 8.2) + # ++# @zero-page-detection: Whether and how to detect zero pages. ++# See description in @ZeroPageDetection. Default is 'legacy'. ++# (since 9.0) ++# + # @sev-pdh: The target host platform diffie-hellman key encoded in base64, or + # pdh filename for hygon + # (Since 4.2) +@@ -1139,11 +1160,11 @@ + 'features': [ 'unstable' ] }, + '*vcpu-dirty-limit': 'uint64', + '*mode': 'MigMode', ++ '*zero-page-detection': 'ZeroPageDetection', + '*sev-pdh': 'StrOrNull', + '*sev-plat-cert': 'StrOrNull', + '*sev-amd-cert' : 'StrOrNull' } } + +- + ## + # @migrate-set-parameters: + # +@@ -1317,6 +1338,10 @@ + # @mode: Migration mode. See description in @MigMode. Default is 'normal'. + # (Since 8.2) + # ++# @zero-page-detection: Whether and how to detect zero pages. ++# See description in @ZeroPageDetection. Default is 'legacy'. 
++# (since 9.0) ++# + # @sev-pdh: The target host platform diffie-hellman key encoded in base64, or + # pdh filename for hygon + # (Since 4.2) +@@ -1379,6 +1404,7 @@ + 'features': [ 'unstable' ] }, + '*vcpu-dirty-limit': 'uint64', + '*mode': 'MigMode', ++ '*zero-page-detection': 'ZeroPageDetection', + '*sev-pdh': 'str', + '*sev-plat-cert': 'str', + '*sev-amd-cert' : 'str'} } +-- +2.43.0 + diff --git a/0420-migration-multifd-implement-zero-page-transmission-o.patch b/0420-migration-multifd-implement-zero-page-transmission-o.patch new file mode 100644 index 0000000..3c7c54e --- /dev/null +++ b/0420-migration-multifd-implement-zero-page-transmission-o.patch @@ -0,0 +1,623 @@ +From b857f67d60737602c5d9e28d116fbf7d8b261a08 Mon Sep 17 00:00:00 2001 +From: Hao Xiang +Date: Mon, 11 Mar 2024 18:00:12 +0000 +Subject: [PATCH] migration/multifd: Implement zero page transmission on the + multifd thread. + +commit 303e6f54f9657be76ee060006ee2d4cacff263a0 upstream. + +1. Add zero_pages field in MultiFDPacket_t. +2. Implements the zero page detection and handling on the multifd +threads for non-compression, zlib and zstd compression backends. +3. Added a new value 'multifd' in ZeroPageDetection enumeration. +4. Adds zero page counters and updates multifd send/receive tracing +format to track the newly added counters. + +Intel-SIG: commit 303e6f54f965 migration/multifd: Implement zero page transmission on the multifd thread. + +Signed-off-by: Hao Xiang +Acked-by: Markus Armbruster +Reviewed-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20240311180015.3359271-5-hao.xiang@linux.dev +Signed-off-by: Peter Xu + + Conflicts: + migration/multifd.c +[jz: there is no multifd_set_file_bitmap() because we didn't backport + mapped-ram, so abandon changes in multifd_set_file_bitmap()] +Signed-off-by: Jason Zeng +--- + hw/core/qdev-properties-system.c | 2 +- + migration/meson.build | 1 + + migration/multifd-zero-page.c | 87 ++++++++++++++++++++++++++++++++ + migration/multifd-zlib.c | 21 ++++++-- + migration/multifd-zstd.c | 20 ++++++-- + migration/multifd.c | 83 +++++++++++++++++++++++++----- + migration/multifd.h | 23 ++++++++- + migration/ram.c | 1 - + migration/trace-events | 8 +-- + qapi/migration.json | 7 ++- + 10 files changed, 222 insertions(+), 31 deletions(-) + create mode 100644 migration/multifd-zero-page.c + +diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-system.c +index 4171dd104f..67d40c526c 100644 +--- a/hw/core/qdev-properties-system.c ++++ b/hw/core/qdev-properties-system.c +@@ -690,7 +690,7 @@ const PropertyInfo qdev_prop_mig_mode = { + const PropertyInfo qdev_prop_zero_page_detection = { + .name = "ZeroPageDetection", + .description = "zero_page_detection values, " +- "none,legacy", ++ "none,legacy,multifd", + .enum_table = &ZeroPageDetection_lookup, + .get = qdev_propinfo_get_enum, + .set = qdev_propinfo_set_enum, +diff --git a/migration/meson.build b/migration/meson.build +index 92b1cc4297..1eeb915ff6 100644 +--- a/migration/meson.build ++++ b/migration/meson.build +@@ -22,6 +22,7 @@ system_ss.add(files( + 'migration.c', + 'multifd.c', + 'multifd-zlib.c', ++ 'multifd-zero-page.c', + 'ram-compress.c', + 'options.c', + 'postcopy-ram.c', +diff --git a/migration/multifd-zero-page.c b/migration/multifd-zero-page.c +new file mode 100644 +index 0000000000..1ba38be636 +--- /dev/null ++++ b/migration/multifd-zero-page.c +@@ -0,0 +1,87 @@ ++/* ++ * Multifd zero page detection implementation. 
++ * ++ * Copyright (c) 2024 Bytedance Inc ++ * ++ * Authors: ++ * Hao Xiang ++ * ++ * This work is licensed under the terms of the GNU GPL, version 2 or later. ++ * See the COPYING file in the top-level directory. ++ */ ++ ++#include "qemu/osdep.h" ++#include "qemu/cutils.h" ++#include "exec/ramblock.h" ++#include "migration.h" ++#include "multifd.h" ++#include "options.h" ++#include "ram.h" ++ ++static bool multifd_zero_page_enabled(void) ++{ ++ return migrate_zero_page_detection() == ZERO_PAGE_DETECTION_MULTIFD; ++} ++ ++static void swap_page_offset(ram_addr_t *pages_offset, int a, int b) ++{ ++ ram_addr_t temp; ++ ++ if (a == b) { ++ return; ++ } ++ ++ temp = pages_offset[a]; ++ pages_offset[a] = pages_offset[b]; ++ pages_offset[b] = temp; ++} ++ ++/** ++ * multifd_send_zero_page_detect: Perform zero page detection on all pages. ++ * ++ * Sorts normal pages before zero pages in p->pages->offset and updates ++ * p->pages->normal_num. ++ * ++ * @param p A pointer to the send params. ++ */ ++void multifd_send_zero_page_detect(MultiFDSendParams *p) ++{ ++ MultiFDPages_t *pages = p->pages; ++ RAMBlock *rb = pages->block; ++ int i = 0; ++ int j = pages->num - 1; ++ ++ if (!multifd_zero_page_enabled()) { ++ pages->normal_num = pages->num; ++ return; ++ } ++ ++ /* ++ * Sort the page offset array by moving all normal pages to ++ * the left and all zero pages to the right of the array. ++ */ ++ while (i <= j) { ++ uint64_t offset = pages->offset[i]; ++ ++ if (!buffer_is_zero(rb->host + offset, p->page_size)) { ++ i++; ++ continue; ++ } ++ ++ swap_page_offset(pages->offset, i, j); ++ ram_release_page(rb->idstr, offset); ++ j--; ++ } ++ ++ pages->normal_num = i; ++} ++ ++void multifd_recv_zero_page_process(MultiFDRecvParams *p) ++{ ++ for (int i = 0; i < p->zero_num; i++) { ++ void *page = p->host + p->zero[i]; ++ if (!buffer_is_zero(page, p->page_size)) { ++ memset(page, 0, p->page_size); ++ } ++ } ++} +diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c +index 6120faad65..83c0374380 100644 +--- a/migration/multifd-zlib.c ++++ b/migration/multifd-zlib.c +@@ -123,13 +123,15 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) + int ret; + uint32_t i; + +- multifd_send_prepare_header(p); ++ if (!multifd_send_prepare_common(p)) { ++ goto out; ++ } + +- for (i = 0; i < pages->num; i++) { ++ for (i = 0; i < pages->normal_num; i++) { + uint32_t available = z->zbuff_len - out_size; + int flush = Z_NO_FLUSH; + +- if (i == pages->num - 1) { ++ if (i == pages->normal_num - 1) { + flush = Z_SYNC_FLUSH; + } + +@@ -172,10 +174,10 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) + p->iov[p->iovs_num].iov_len = out_size; + p->iovs_num++; + p->next_packet_size = out_size; +- p->flags |= MULTIFD_FLAG_ZLIB; + ++out: ++ p->flags |= MULTIFD_FLAG_ZLIB; + multifd_send_fill_packet(p); +- + return 0; + } + +@@ -261,6 +263,14 @@ static int zlib_recv(MultiFDRecvParams *p, Error **errp) + p->id, flags, MULTIFD_FLAG_ZLIB); + return -1; + } ++ ++ multifd_recv_zero_page_process(p); ++ ++ if (!p->normal_num) { ++ assert(in_size == 0); ++ return 0; ++ } ++ + ret = qio_channel_read_all(p->c, (void *)z->zbuff, in_size, errp); + + if (ret != 0) { +@@ -310,6 +320,7 @@ static int zlib_recv(MultiFDRecvParams *p, Error **errp) + p->id, out_size, expected_size); + return -1; + } ++ + return 0; + } + +diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c +index cac236833d..02112255ad 100644 +--- a/migration/multifd-zstd.c ++++ b/migration/multifd-zstd.c +@@ -118,16 +118,18 @@ 
static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) + int ret; + uint32_t i; + +- multifd_send_prepare_header(p); ++ if (!multifd_send_prepare_common(p)) { ++ goto out; ++ } + + z->out.dst = z->zbuff; + z->out.size = z->zbuff_len; + z->out.pos = 0; + +- for (i = 0; i < pages->num; i++) { ++ for (i = 0; i < pages->normal_num; i++) { + ZSTD_EndDirective flush = ZSTD_e_continue; + +- if (i == pages->num - 1) { ++ if (i == pages->normal_num - 1) { + flush = ZSTD_e_flush; + } + z->in.src = p->pages->block->host + pages->offset[i]; +@@ -161,10 +163,10 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) + p->iov[p->iovs_num].iov_len = z->out.pos; + p->iovs_num++; + p->next_packet_size = z->out.pos; +- p->flags |= MULTIFD_FLAG_ZSTD; + ++out: ++ p->flags |= MULTIFD_FLAG_ZSTD; + multifd_send_fill_packet(p); +- + return 0; + } + +@@ -257,6 +259,14 @@ static int zstd_recv(MultiFDRecvParams *p, Error **errp) + p->id, flags, MULTIFD_FLAG_ZSTD); + return -1; + } ++ ++ multifd_recv_zero_page_process(p); ++ ++ if (!p->normal_num) { ++ assert(in_size == 0); ++ return 0; ++ } ++ + ret = qio_channel_read_all(p->c, (void *)z->zbuff, in_size, errp); + + if (ret != 0) { +diff --git a/migration/multifd.c b/migration/multifd.c +index 9e617b608e..14f2527708 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -11,6 +11,7 @@ + */ + + #include "qemu/osdep.h" ++#include "qemu/cutils.h" + #include "qemu/rcu.h" + #include "exec/target_page.h" + #include "sysemu/sysemu.h" +@@ -131,13 +132,13 @@ static void multifd_send_prepare_iovs(MultiFDSendParams *p) + { + MultiFDPages_t *pages = p->pages; + +- for (int i = 0; i < pages->num; i++) { ++ for (int i = 0; i < pages->normal_num; i++) { + p->iov[p->iovs_num].iov_base = pages->block->host + pages->offset[i]; + p->iov[p->iovs_num].iov_len = p->page_size; + p->iovs_num++; + } + +- p->next_packet_size = pages->num * p->page_size; ++ p->next_packet_size = pages->normal_num * p->page_size; + } + + /** +@@ -156,6 +157,8 @@ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp) + bool use_zero_copy_send = migrate_zero_copy_send(); + int ret; + ++ multifd_send_zero_page_detect(p); ++ + if (!multifd_use_packets()) { + multifd_send_prepare_iovs(p); + return 0; +@@ -237,6 +240,13 @@ static int nocomp_recv(MultiFDRecvParams *p, Error **errp) + p->id, flags, MULTIFD_FLAG_NOCOMP); + return -1; + } ++ ++ multifd_recv_zero_page_process(p); ++ ++ if (!p->normal_num) { ++ return 0; ++ } ++ + for (int i = 0; i < p->normal_num; i++) { + p->iov[i].iov_base = p->host + p->normal[i]; + p->iov[i].iov_len = p->page_size; +@@ -271,6 +281,7 @@ static void multifd_pages_reset(MultiFDPages_t *pages) + * overwritten later when reused. 
+ */ + pages->num = 0; ++ pages->normal_num = 0; + pages->block = NULL; + } + +@@ -362,11 +373,13 @@ void multifd_send_fill_packet(MultiFDSendParams *p) + MultiFDPacket_t *packet = p->packet; + MultiFDPages_t *pages = p->pages; + uint64_t packet_num; ++ uint32_t zero_num = pages->num - pages->normal_num; + int i; + + packet->flags = cpu_to_be32(p->flags); + packet->pages_alloc = cpu_to_be32(p->pages->allocated); +- packet->normal_pages = cpu_to_be32(pages->num); ++ packet->normal_pages = cpu_to_be32(pages->normal_num); ++ packet->zero_pages = cpu_to_be32(zero_num); + packet->next_packet_size = cpu_to_be32(p->next_packet_size); + + packet_num = qatomic_fetch_inc(&multifd_send_state->packet_num); +@@ -384,10 +397,11 @@ void multifd_send_fill_packet(MultiFDSendParams *p) + } + + p->packets_sent++; +- p->total_normal_pages += pages->num; ++ p->total_normal_pages += pages->normal_num; ++ p->total_zero_pages += zero_num; + +- trace_multifd_send(p->id, packet_num, pages->num, p->flags, +- p->next_packet_size); ++ trace_multifd_send(p->id, packet_num, pages->normal_num, zero_num, ++ p->flags, p->next_packet_size); + } + + static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) +@@ -428,20 +442,29 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) + p->normal_num = be32_to_cpu(packet->normal_pages); + if (p->normal_num > packet->pages_alloc) { + error_setg(errp, "multifd: received packet " +- "with %u pages and expected maximum pages are %u", ++ "with %u normal pages and expected maximum pages are %u", + p->normal_num, packet->pages_alloc) ; + return -1; + } + ++ p->zero_num = be32_to_cpu(packet->zero_pages); ++ if (p->zero_num > packet->pages_alloc - p->normal_num) { ++ error_setg(errp, "multifd: received packet " ++ "with %u zero pages and expected maximum zero pages are %u", ++ p->zero_num, packet->pages_alloc - p->normal_num) ; ++ return -1; ++ } ++ + p->next_packet_size = be32_to_cpu(packet->next_packet_size); + p->packet_num = be64_to_cpu(packet->packet_num); + p->packets_recved++; + p->total_normal_pages += p->normal_num; ++ p->total_zero_pages += p->zero_num; + +- trace_multifd_recv(p->id, p->packet_num, p->normal_num, p->flags, +- p->next_packet_size); ++ trace_multifd_recv(p->id, p->packet_num, p->normal_num, p->zero_num, ++ p->flags, p->next_packet_size); + +- if (p->normal_num == 0) { ++ if (p->normal_num == 0 && p->zero_num == 0) { + return 0; + } + +@@ -467,6 +490,18 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) + p->normal[i] = offset; + } + ++ for (i = 0; i < p->zero_num; i++) { ++ uint64_t offset = be64_to_cpu(packet->offset[p->normal_num + i]); ++ ++ if (offset > (p->block->used_length - p->page_size)) { ++ error_setg(errp, "multifd: offset too long %" PRIu64 ++ " (max " RAM_ADDR_FMT ")", ++ offset, p->block->used_length); ++ return -1; ++ } ++ p->zero[i] = offset; ++ } ++ + return 0; + } + +@@ -862,6 +897,8 @@ static void *multifd_send_thread(void *opaque) + + stat64_add(&mig_stats.multifd_bytes, + p->next_packet_size + p->packet_len); ++ stat64_add(&mig_stats.normal_pages, pages->normal_num); ++ stat64_add(&mig_stats.zero_pages, pages->num - pages->normal_num); + + multifd_pages_reset(p->pages); + p->next_packet_size = 0; +@@ -909,7 +946,8 @@ out: + + rcu_unregister_thread(); + migration_threads_remove(thread); +- trace_multifd_send_thread_end(p->id, p->packets_sent, p->total_normal_pages); ++ trace_multifd_send_thread_end(p->id, p->packets_sent, p->total_normal_pages, ++ p->total_zero_pages); + + return 
NULL; + } +@@ -1185,6 +1223,8 @@ static void multifd_recv_cleanup_channel(MultiFDRecvParams *p) + p->iov = NULL; + g_free(p->normal); + p->normal = NULL; ++ g_free(p->zero); ++ p->zero = NULL; + multifd_recv_state->ops->recv_cleanup(p); + } + +@@ -1290,7 +1330,7 @@ static void *multifd_recv_thread(void *opaque) + flags = p->flags; + /* recv methods don't know how to handle the SYNC flag */ + p->flags &= ~MULTIFD_FLAG_SYNC; +- has_data = !!p->normal_num; ++ has_data = p->normal_num || p->zero_num; + qemu_mutex_unlock(&p->mutex); + } + +@@ -1315,7 +1355,9 @@ static void *multifd_recv_thread(void *opaque) + } + + rcu_unregister_thread(); +- trace_multifd_recv_thread_end(p->id, p->packets_recved, p->total_normal_pages); ++ trace_multifd_recv_thread_end(p->id, p->packets_recved, ++ p->total_normal_pages, ++ p->total_zero_pages); + + return NULL; + } +@@ -1358,6 +1400,7 @@ int multifd_recv_setup(Error **errp) + p->name = g_strdup_printf("multifdrecv_%d", i); + p->iov = g_new0(struct iovec, page_count); + p->normal = g_new0(ram_addr_t, page_count); ++ p->zero = g_new0(ram_addr_t, page_count); + p->page_count = page_count; + p->page_size = qemu_target_page_size(); + } +@@ -1433,3 +1476,17 @@ void multifd_recv_new_channel(QIOChannel *ioc, Error **errp) + QEMU_THREAD_JOINABLE); + qatomic_inc(&multifd_recv_state->count); + } ++ ++bool multifd_send_prepare_common(MultiFDSendParams *p) ++{ ++ multifd_send_zero_page_detect(p); ++ ++ if (!p->pages->normal_num) { ++ p->next_packet_size = 0; ++ return false; ++ } ++ ++ multifd_send_prepare_header(p); ++ ++ return true; ++} +diff --git a/migration/multifd.h b/migration/multifd.h +index 6a54377cc1..d99603c6a4 100644 +--- a/migration/multifd.h ++++ b/migration/multifd.h +@@ -48,14 +48,24 @@ typedef struct { + /* size of the next packet that contains pages */ + uint32_t next_packet_size; + uint64_t packet_num; +- uint64_t unused[4]; /* Reserved for future use */ ++ /* zero pages */ ++ uint32_t zero_pages; ++ uint32_t unused32[1]; /* Reserved for future use */ ++ uint64_t unused64[3]; /* Reserved for future use */ + char ramblock[256]; ++ /* ++ * This array contains the pointers to: ++ * - normal pages (initial normal_pages entries) ++ * - zero pages (following zero_pages entries) ++ */ + uint64_t offset[]; + } __attribute__((packed)) MultiFDPacket_t; + + typedef struct { + /* number of used pages */ + uint32_t num; ++ /* number of normal pages */ ++ uint32_t normal_num; + /* number of allocated pages */ + uint32_t allocated; + /* offset of each page */ +@@ -122,6 +132,8 @@ typedef struct { + uint64_t packets_sent; + /* non zero pages sent through this channel */ + uint64_t total_normal_pages; ++ /* zero pages sent through this channel */ ++ uint64_t total_zero_pages; + /* buffers to send */ + struct iovec *iov; + /* number of iovs used */ +@@ -176,12 +188,18 @@ typedef struct { + uint8_t *host; + /* non zero pages recv through this channel */ + uint64_t total_normal_pages; ++ /* zero pages recv through this channel */ ++ uint64_t total_zero_pages; + /* buffers to recv */ + struct iovec *iov; + /* Pages that are not zero */ + ram_addr_t *normal; + /* num of non zero pages */ + uint32_t normal_num; ++ /* Pages that are zero */ ++ ram_addr_t *zero; ++ /* num of zero pages */ ++ uint32_t zero_num; + /* used for de-compression methods */ + void *compress_data; + } MultiFDRecvParams; +@@ -203,6 +221,9 @@ typedef struct { + + void multifd_register_ops(int method, MultiFDMethods *ops); + void multifd_send_fill_packet(MultiFDSendParams *p); ++bool 
multifd_send_prepare_common(MultiFDSendParams *p); ++void multifd_send_zero_page_detect(MultiFDSendParams *p); ++void multifd_recv_zero_page_process(MultiFDRecvParams *p); + + static inline void multifd_send_prepare_header(MultiFDSendParams *p) + { +diff --git a/migration/ram.c b/migration/ram.c +index b350a1e8d6..273ccec35b 100644 +--- a/migration/ram.c ++++ b/migration/ram.c +@@ -1393,7 +1393,6 @@ static int ram_save_multifd_page(RAMBlock *block, ram_addr_t offset) + if (!multifd_queue_page(block, offset)) { + return -1; + } +- stat64_add(&mig_stats.normal_pages, 1); + + return 1; + } +diff --git a/migration/trace-events b/migration/trace-events +index bf1a069632..f0e1cb80c7 100644 +--- a/migration/trace-events ++++ b/migration/trace-events +@@ -128,21 +128,21 @@ postcopy_preempt_reset_channel(void) "" + # multifd.c + multifd_new_send_channel_async(uint8_t id) "channel %u" + multifd_new_send_channel_async_error(uint8_t id, void *err) "channel=%u err=%p" +-multifd_recv(uint8_t id, uint64_t packet_num, uint32_t used, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " pages %u flags 0x%x next packet size %u" ++multifd_recv(uint8_t id, uint64_t packet_num, uint32_t normal, uint32_t zero, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " normal pages %u zero pages %u flags 0x%x next packet size %u" + multifd_recv_new_channel(uint8_t id) "channel %u" + multifd_recv_sync_main(long packet_num) "packet num %ld" + multifd_recv_sync_main_signal(uint8_t id) "channel %u" + multifd_recv_sync_main_wait(uint8_t id) "iter %u" + multifd_recv_terminate_threads(bool error) "error %d" +-multifd_recv_thread_end(uint8_t id, uint64_t packets, uint64_t pages) "channel %u packets %" PRIu64 " pages %" PRIu64 ++multifd_recv_thread_end(uint8_t id, uint64_t packets, uint64_t normal_pages, uint64_t zero_pages) "channel %u packets %" PRIu64 " normal pages %" PRIu64 " zero pages %" PRIu64 + multifd_recv_thread_start(uint8_t id) "%u" +-multifd_send(uint8_t id, uint64_t packet_num, uint32_t normal, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " normal pages %u flags 0x%x next packet size %u" ++multifd_send(uint8_t id, uint64_t packet_num, uint32_t normal_pages, uint32_t zero_pages, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " normal pages %u zero pages %u flags 0x%x next packet size %u" + multifd_send_error(uint8_t id) "channel %u" + multifd_send_sync_main(long packet_num) "packet num %ld" + multifd_send_sync_main_signal(uint8_t id) "channel %u" + multifd_send_sync_main_wait(uint8_t id) "channel %u" + multifd_send_terminate_threads(void) "" +-multifd_send_thread_end(uint8_t id, uint64_t packets, uint64_t normal_pages) "channel %u packets %" PRIu64 " normal pages %" PRIu64 ++multifd_send_thread_end(uint8_t id, uint64_t packets, uint64_t normal_pages, uint64_t zero_pages) "channel %u packets %" PRIu64 " normal pages %" PRIu64 " zero pages %" PRIu64 + multifd_send_thread_start(uint8_t id) "%u" + multifd_tls_outgoing_handshake_start(void *ioc, void *tioc, const char *hostname) "ioc=%p tioc=%p hostname=%s" + multifd_tls_outgoing_handshake_error(void *ioc, const char *err) "ioc=%p err=%s" +diff --git a/qapi/migration.json b/qapi/migration.json +index af0cefce04..e7249f24f8 100644 +--- a/qapi/migration.json ++++ b/qapi/migration.json +@@ -660,10 +660,15 @@ + # + # @legacy: Perform zero page checking in main migration thread. 
+ # ++# @multifd: Perform zero page checking in multifd sender thread if ++# multifd migration is enabled, else in the main migration ++# thread as for @legacy. ++# + # Since: 9.0 ++# + ## + { 'enum': 'ZeroPageDetection', +- 'data': [ 'none', 'legacy' ] } ++ 'data': [ 'none', 'legacy', 'multifd' ] } + + ## + # @BitmapMigrationBitmapAliasTransform: +-- +2.43.0 + diff --git a/0421-migration-multifd-implement-ram-save-target-page-mul.patch b/0421-migration-multifd-implement-ram-save-target-page-mul.patch new file mode 100644 index 0000000..ea2df72 --- /dev/null +++ b/0421-migration-multifd-implement-ram-save-target-page-mul.patch @@ -0,0 +1,95 @@ +From 92c1f7ae136d950a694bf99fdb34a7f1433ad068 Mon Sep 17 00:00:00 2001 +From: Hao Xiang +Date: Mon, 11 Mar 2024 18:00:13 +0000 +Subject: [PATCH] migration/multifd: Implement ram_save_target_page_multifd to + handle multifd version of MigrationOps::ram_save_target_page. + +commit 9ae90f73e623c8b8c7ec1fccd8ca493805df8fbd upstream. + +1. Add a dedicated handler for MigrationOps::ram_save_target_page in +multifd live migration. +2. Refactor ram_save_target_page_legacy so that the legacy and multifd +handlers don't have internal functions calling into each other. + +Intel-SIG: commit 9ae90f73e623 migration/multifd: Implement ram_save_target_page_multifd to handle multifd version of MigrationOps::ram_save_target_page. + +Signed-off-by: Hao Xiang +Reviewed-by: Fabiano Rosas +Message-Id: <20240226195654.934709-4-hao.xiang@bytedance.com> +Link: https://lore.kernel.org/r/20240311180015.3359271-6-hao.xiang@linux.dev +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/ram.c | 38 +++++++++++++++++++++++++++++--------- + 1 file changed, 29 insertions(+), 9 deletions(-) + +diff --git a/migration/ram.c b/migration/ram.c +index 273ccec35b..0e08d86a46 100644 +--- a/migration/ram.c ++++ b/migration/ram.c +@@ -2230,7 +2230,6 @@ static bool encrypted_test_list(RAMState *rs, RAMBlock *block, + */ + static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss) + { +- RAMBlock *block = pss->block; + ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; + int res; + +@@ -2257,17 +2256,33 @@ static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss) + return 1; + } + ++ return ram_save_page(rs, pss); ++} ++ ++/** ++ * ram_save_target_page_multifd: send one target page to multifd workers ++ * ++ * Returns 1 if the page was queued, -1 otherwise. ++ * ++ * @rs: current RAM state ++ * @pss: data about the page we want to send ++ */ ++static int ram_save_target_page_multifd(RAMState *rs, PageSearchStatus *pss) ++{ ++ RAMBlock *block = pss->block; ++ ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; ++ + /* +- * Do not use multifd in postcopy as one whole host page should be +- * placed. Meanwhile postcopy requires atomic update of pages, so even +- * if host page size == guest page size the dest guest during run may +- * still see partially copied pages which is data corruption. ++ * While using multifd live migration, we still need to handle zero ++ * page checking on the migration main thread. 
+ */ +- if (migrate_multifd() && !migration_in_postcopy()) { +- return ram_save_multifd_page(block, offset); ++ if (migrate_zero_page_detection() == ZERO_PAGE_DETECTION_LEGACY) { ++ if (save_zero_page(rs, pss, offset)) { ++ return 1; ++ } + } + +- return ram_save_page(rs, pss); ++ return ram_save_multifd_page(block, offset); + } + + /* Should be called before sending a host page */ +@@ -3403,7 +3418,12 @@ static int ram_save_setup(QEMUFile *f, void *opaque) + } + + migration_ops = g_malloc0(sizeof(MigrationOps)); +- migration_ops->ram_save_target_page = ram_save_target_page_legacy; ++ ++ if (migrate_multifd()) { ++ migration_ops->ram_save_target_page = ram_save_target_page_multifd; ++ } else { ++ migration_ops->ram_save_target_page = ram_save_target_page_legacy; ++ } + + qemu_mutex_unlock_iothread(); + ret = multifd_send_sync_main(); +-- +2.43.0 + diff --git a/0422-migration-multifd-solve-zero-page-causing-multiple-p.patch b/0422-migration-multifd-solve-zero-page-causing-multiple-p.patch new file mode 100644 index 0000000..dd8ca57 --- /dev/null +++ b/0422-migration-multifd-solve-zero-page-causing-multiple-p.patch @@ -0,0 +1,134 @@ +From de44c2f42e698e4c80d28477f7f058a92a1055b2 Mon Sep 17 00:00:00 2001 +From: Yuan Liu +Date: Mon, 1 Apr 2024 23:41:10 +0800 +Subject: [PATCH] migration/multifd: solve zero page causing multiple page + faults + +commit 5ef7e26bdb7eda10d6d5e1b77121be9945e5e550 upstream. + +Implemented recvbitmap tracking of received pages in multifd. + +If the zero page appears for the first time in the recvbitmap, this +page is not checked and set. + +If the zero page has already appeared in the recvbitmap, there is no +need to check the data but directly set the data to 0, because it is +unlikely that the zero page will be migrated multiple times. + +Intel-SIG: commit 5ef7e26bdb7e migration/multifd: solve zero page causing multiple page faults + +Signed-off-by: Yuan Liu +Reviewed-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20240401154110.2028453-2-yuan1.liu@intel.com +[peterx: touch up the comment, as the bitmap is used outside postcopy now] +Signed-off-by: Peter Xu + + Conflicts: + include/exec/ramblock.h +[jz: resolve context conflict due to mmaped-ram which is not backported] +Signed-off-by: Jason Zeng +--- + include/exec/ramblock.h | 2 +- + migration/multifd-zero-page.c | 4 +++- + migration/multifd-zlib.c | 1 + + migration/multifd-zstd.c | 1 + + migration/multifd.c | 1 + + migration/ram.c | 4 ++++ + migration/ram.h | 1 + + 7 files changed, 12 insertions(+), 2 deletions(-) + +diff --git a/include/exec/ramblock.h b/include/exec/ramblock.h +index 69c6a53902..8f9579ed70 100644 +--- a/include/exec/ramblock.h ++++ b/include/exec/ramblock.h +@@ -44,7 +44,7 @@ struct RAMBlock { + size_t page_size; + /* dirty bitmap used during migration */ + unsigned long *bmap; +- /* bitmap of already received pages in postcopy */ ++ /* Bitmap of already received pages. Only used on destination side. 
*/ + unsigned long *receivedmap; + + /* +diff --git a/migration/multifd-zero-page.c b/migration/multifd-zero-page.c +index 1ba38be636..e1b8370f88 100644 +--- a/migration/multifd-zero-page.c ++++ b/migration/multifd-zero-page.c +@@ -80,8 +80,10 @@ void multifd_recv_zero_page_process(MultiFDRecvParams *p) + { + for (int i = 0; i < p->zero_num; i++) { + void *page = p->host + p->zero[i]; +- if (!buffer_is_zero(page, p->page_size)) { ++ if (ramblock_recv_bitmap_test_byte_offset(p->block, p->zero[i])) { + memset(page, 0, p->page_size); ++ } else { ++ ramblock_recv_bitmap_set_offset(p->block, p->zero[i]); + } + } + } +diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c +index 83c0374380..b210725f6e 100644 +--- a/migration/multifd-zlib.c ++++ b/migration/multifd-zlib.c +@@ -284,6 +284,7 @@ static int zlib_recv(MultiFDRecvParams *p, Error **errp) + int flush = Z_NO_FLUSH; + unsigned long start = zs->total_out; + ++ ramblock_recv_bitmap_set_offset(p->block, p->normal[i]); + if (i == p->normal_num - 1) { + flush = Z_SYNC_FLUSH; + } +diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c +index 02112255ad..256858df0a 100644 +--- a/migration/multifd-zstd.c ++++ b/migration/multifd-zstd.c +@@ -278,6 +278,7 @@ static int zstd_recv(MultiFDRecvParams *p, Error **errp) + z->in.pos = 0; + + for (i = 0; i < p->normal_num; i++) { ++ ramblock_recv_bitmap_set_offset(p->block, p->normal[i]); + z->out.dst = p->host + p->normal[i]; + z->out.size = p->page_size; + z->out.pos = 0; +diff --git a/migration/multifd.c b/migration/multifd.c +index 14f2527708..fbab8c5b72 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -250,6 +250,7 @@ static int nocomp_recv(MultiFDRecvParams *p, Error **errp) + for (int i = 0; i < p->normal_num; i++) { + p->iov[i].iov_base = p->host + p->normal[i]; + p->iov[i].iov_len = p->page_size; ++ ramblock_recv_bitmap_set_offset(p->block, p->normal[i]); + } + return qio_channel_readv_all(p->c, p->iov, p->normal_num, errp); + } +diff --git a/migration/ram.c b/migration/ram.c +index 0e08d86a46..9d49765bcf 100644 +--- a/migration/ram.c ++++ b/migration/ram.c +@@ -272,6 +272,10 @@ void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr, + nr); + } + ++void ramblock_recv_bitmap_set_offset(RAMBlock *rb, uint64_t byte_offset) ++{ ++ set_bit_atomic(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap); ++} + #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL) + + /* +diff --git a/migration/ram.h b/migration/ram.h +index 9b937a446b..cd263df026 100644 +--- a/migration/ram.h ++++ b/migration/ram.h +@@ -69,6 +69,7 @@ int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr); + bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset); + void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr); + void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr, size_t nr); ++void ramblock_recv_bitmap_set_offset(RAMBlock *rb, uint64_t byte_offset); + int64_t ramblock_recv_bitmap_send(QEMUFile *file, + const char *block_name); + bool ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *rb, Error **errp); +-- +2.43.0 + diff --git a/0423-docs-migration-add-qpl-compression-feature.patch b/0423-docs-migration-add-qpl-compression-feature.patch new file mode 100644 index 0000000..1196cff --- /dev/null +++ b/0423-docs-migration-add-qpl-compression-feature.patch @@ -0,0 +1,306 @@ +From e2b570c56eed48279d0677c1152ef46e85fefc6c Mon Sep 17 00:00:00 2001 +From: Yuan Liu +Date: Mon, 10 Jun 2024 18:21:04 +0800 +Subject: [PATCH] docs/migration: 
add qpl compression feature + +commit 0d40b3d76ced77c1c82c77a636af703fabdb407c upstream. + +add Intel Query Processing Library (QPL) compression method +introduction + +Intel-SIG: commit 0d40b3d76ced docs/migration: add qpl compression feature + +Signed-off-by: Yuan Liu +Reviewed-by: Nanhai Zou +Reviewed-by: Fabiano Rosas +Acked-by: Peter Xu +Signed-off-by: Fabiano Rosas + + Conflicts: + docs/devel/migration/features.rst +[jz: resolve simple context conflict] +Signed-off-by: Jason Zeng +--- + docs/devel/migration/features.rst | 1 + + docs/devel/migration/qpl-compression.rst | 260 +++++++++++++++++++++++ + 2 files changed, 261 insertions(+) + create mode 100644 docs/devel/migration/qpl-compression.rst + +diff --git a/docs/devel/migration/features.rst b/docs/devel/migration/features.rst +index a9acaf618e..9819393c12 100644 +--- a/docs/devel/migration/features.rst ++++ b/docs/devel/migration/features.rst +@@ -10,3 +10,4 @@ Migration has plenty of features to support different use cases. + dirty-limit + vfio + virtio ++ qpl-compression +diff --git a/docs/devel/migration/qpl-compression.rst b/docs/devel/migration/qpl-compression.rst +new file mode 100644 +index 0000000000..990992d786 +--- /dev/null ++++ b/docs/devel/migration/qpl-compression.rst +@@ -0,0 +1,260 @@ ++=============== ++QPL Compression ++=============== ++The Intel Query Processing Library (Intel ``QPL``) is an open-source library to ++provide compression and decompression features and it is based on deflate ++compression algorithm (RFC 1951). ++ ++The ``QPL`` compression relies on Intel In-Memory Analytics Accelerator(``IAA``) ++and Shared Virtual Memory(``SVM``) technology, they are new features supported ++from Intel 4th Gen Intel Xeon Scalable processors, codenamed Sapphire Rapids ++processor(``SPR``). ++ ++For more ``QPL`` introduction, please refer to `QPL Introduction ++`_ ++ ++QPL Compression Framework ++========================= ++ ++:: ++ ++ +----------------+ +------------------+ ++ | MultiFD Thread | |accel-config tool | ++ +-------+--------+ +--------+---------+ ++ | | ++ | | ++ |compress/decompress | ++ +-------+--------+ | Setup IAA ++ | QPL library | | Resources ++ +-------+---+----+ | ++ | | | ++ | +-------------+-------+ ++ | Open IAA | ++ | Devices +-----+-----+ ++ | |idxd driver| ++ | +-----+-----+ ++ | | ++ | | ++ | +-----+-----+ ++ +-----------+IAA Devices| ++ Submit jobs +-----------+ ++ via enqcmd ++ ++ ++QPL Build And Installation ++-------------------------- ++ ++.. code-block:: shell ++ ++ $git clone --recursive https://github.com/intel/qpl.git qpl ++ $mkdir qpl/build ++ $cd qpl/build ++ $cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr -DQPL_LIBRARY_TYPE=SHARED .. ++ $sudo cmake --build . --target install ++ ++For more details about ``QPL`` installation, please refer to `QPL Installation ++`_ ++ ++IAA Device Management ++--------------------- ++ ++The number of ``IAA`` devices will vary depending on the Xeon product model. ++On a ``SPR`` server, there can be a maximum of 8 ``IAA`` devices, with up to ++4 devices per socket. ++ ++By default, all ``IAA`` devices are disabled and need to be configured and ++enabled by users manually. ++ ++Check the number of devices through the following command ++ ++.. 
code-block:: shell ++ ++ #lspci -d 8086:0cfe ++ 6a:02.0 System peripheral: Intel Corporation Device 0cfe ++ 6f:02.0 System peripheral: Intel Corporation Device 0cfe ++ 74:02.0 System peripheral: Intel Corporation Device 0cfe ++ 79:02.0 System peripheral: Intel Corporation Device 0cfe ++ e7:02.0 System peripheral: Intel Corporation Device 0cfe ++ ec:02.0 System peripheral: Intel Corporation Device 0cfe ++ f1:02.0 System peripheral: Intel Corporation Device 0cfe ++ f6:02.0 System peripheral: Intel Corporation Device 0cfe ++ ++IAA Device Configuration And Enabling ++^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ++ ++The ``accel-config`` tool is used to enable ``IAA`` devices and configure ++``IAA`` hardware resources(work queues and engines). One ``IAA`` device ++has 8 work queues and 8 processing engines, multiple engines can be assigned ++to a work queue via ``group`` attribute. ++ ++For ``accel-config`` installation, please refer to `accel-config installation ++`_ ++ ++One example of configuring and enabling an ``IAA`` device. ++ ++.. code-block:: shell ++ ++ #accel-config config-engine iax1/engine1.0 -g 0 ++ #accel-config config-engine iax1/engine1.1 -g 0 ++ #accel-config config-engine iax1/engine1.2 -g 0 ++ #accel-config config-engine iax1/engine1.3 -g 0 ++ #accel-config config-engine iax1/engine1.4 -g 0 ++ #accel-config config-engine iax1/engine1.5 -g 0 ++ #accel-config config-engine iax1/engine1.6 -g 0 ++ #accel-config config-engine iax1/engine1.7 -g 0 ++ #accel-config config-wq iax1/wq1.0 -g 0 -s 128 -p 10 -b 1 -t 128 -m shared -y user -n app1 -d user ++ #accel-config enable-device iax1 ++ #accel-config enable-wq iax1/wq1.0 ++ ++.. note:: ++ IAX is an early name for IAA ++ ++- The ``IAA`` device index is 1, use ``ls -lh /sys/bus/dsa/devices/iax*`` ++ command to query the ``IAA`` device index. ++ ++- 8 engines and 1 work queue are configured in group 0, so all compression jobs ++ submitted to this work queue can be processed by all engines at the same time. ++ ++- Set work queue attributes including the work mode, work queue size and so on. ++ ++- Enable the ``IAA1`` device and work queue 1.0 ++ ++.. note:: ++ ++ Set work queue mode to shared mode, since ``QPL`` library only supports ++ shared mode ++ ++For more detailed configuration, please refer to `IAA Configuration Samples ++`_ ++ ++IAA Unit Test ++^^^^^^^^^^^^^ ++ ++- Enabling ``IAA`` devices for Xeon platform, please refer to `IAA User Guide ++ `_ ++ ++- ``IAA`` device driver is Intel Data Accelerator Driver (idxd), it is ++ recommended that the minimum version of Linux kernel is 5.18. ++ ++- Add ``"intel_iommu=on,sm_on"`` parameter to kernel command line ++ for ``SVM`` feature enabling. ++ ++Here is an easy way to verify ``IAA`` device driver and ``SVM`` with `iaa_test ++`_ ++ ++.. code-block:: shell ++ ++ #./test/iaa_test ++ [ info] alloc wq 0 shared size 128 addr 0x7f26cebe5000 batch sz 0xfffffffe xfer sz 0x80000000 ++ [ info] test noop: tflags 0x1 num_desc 1 ++ [ info] preparing descriptor for noop ++ [ info] Submitted all noop jobs ++ [ info] verifying task result for 0x16f7e20 ++ [ info] test with op 0 passed ++ ++ ++IAA Resources Allocation For Migration ++^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ++ ++There is no ``IAA`` resource configuration parameters for migration and ++``accel-config`` tool configuration cannot directly specify the ``IAA`` ++resources used for migration. ++ ++The multifd migration with ``QPL`` compression method will use all work ++queues that are enabled and shared mode. ++ ++.. 
note:: ++ ++ Accessing IAA resources requires ``sudo`` command or ``root`` privileges ++ by default. Administrators can modify the IAA device node ownership ++ so that QEMU can use IAA with specified user permissions. ++ ++ For example ++ ++ #chown -R qemu /dev/iax ++ ++Shared Virtual Memory(SVM) Introduction ++======================================= ++ ++An ability for an accelerator I/O device to operate in the same virtual ++memory space of applications on host processors. It also implies the ++ability to operate from pageable memory, avoiding functional requirements ++to pin memory for DMA operations. ++ ++When using ``SVM`` technology, users do not need to reserve memory for the ++``IAA`` device and perform pin memory operation. The ``IAA`` device can ++directly access data using the virtual address of the process. ++ ++For more ``SVM`` technology, please refer to ++`Shared Virtual Addressing (SVA) with ENQCMD ++`_ ++ ++ ++How To Use QPL Compression In Migration ++======================================= ++ ++1 - Installation of ``QPL`` library and ``accel-config`` library if using IAA ++ ++2 - Configure and enable ``IAA`` devices and work queues via ``accel-config`` ++ ++3 - Build ``QEMU`` with ``--enable-qpl`` parameter ++ ++ E.g. configure --target-list=x86_64-softmmu --enable-kvm ``--enable-qpl`` ++ ++4 - Enable ``QPL`` compression during migration ++ ++ Set ``migrate_set_parameter multifd-compression qpl`` when migrating, the ++ ``QPL`` compression does not support configuring the compression level, it ++ only supports one compression level. ++ ++The Difference Between QPL And ZLIB ++=================================== ++ ++Although both ``QPL`` and ``ZLIB`` are based on the deflate compression ++algorithm, and ``QPL`` can support the header and tail of ``ZLIB``, ``QPL`` ++is still not fully compatible with the ``ZLIB`` compression in the migration. ++ ++``QPL`` only supports 4K history buffer, and ``ZLIB`` is 32K by default. ++``ZLIB`` compresses data that ``QPL`` may not decompress correctly and ++vice versa. ++ ++``QPL`` does not support the ``Z_SYNC_FLUSH`` operation in ``ZLIB`` streaming ++compression, current ``ZLIB`` implementation uses ``Z_SYNC_FLUSH``, so each ++``multifd`` thread has a ``ZLIB`` streaming context, and all page compression ++and decompression are based on this stream. ``QPL`` cannot decompress such data ++and vice versa. ++ ++The introduction for ``Z_SYNC_FLUSH``, please refer to `Zlib Manual ++`_ ++ ++The Best Practices ++================== ++When user enables the IAA device for ``QPL`` compression, it is recommended ++to add ``-mem-prealloc`` parameter to the destination boot parameters. This ++parameter can avoid the occurrence of I/O page fault and reduce the overhead ++of IAA compression and decompression. ++ ++The example of booting with ``-mem-prealloc`` parameter ++ ++.. code-block:: shell ++ ++ $qemu-system-x86_64 --enable-kvm -cpu host --mem-prealloc ... ++ ++ ++An example about I/O page fault measurement of destination without ++``-mem-prealloc``, the ``svm_prq`` indicates the number of I/O page fault ++occurrences and processing time. ++ ++.. 
code-block:: shell ++ ++ #echo 1 > /sys/kernel/debug/iommu/intel/dmar_perf_latency ++ #echo 2 > /sys/kernel/debug/iommu/intel/dmar_perf_latency ++ #echo 3 > /sys/kernel/debug/iommu/intel/dmar_perf_latency ++ #echo 4 > /sys/kernel/debug/iommu/intel/dmar_perf_latency ++ #cat /sys/kernel/debug/iommu/intel/dmar_perf_latency ++ IOMMU: dmar18 Register Base Address: c87fc000 ++ <0.1us 0.1us-1us 1us-10us 10us-100us 100us-1ms 1ms-10ms >=10ms min(us) max(us) average(us) ++ inv_iotlb 0 286 123 0 0 0 0 0 1 0 ++ inv_devtlb 0 276 133 0 0 0 0 0 2 0 ++ inv_iec 0 0 0 0 0 0 0 0 0 0 ++ svm_prq 0 0 25206 364 395 0 0 1 556 9 +-- +2.43.0 + diff --git a/0424-migration-multifd-put-iov-initialization-into-compre.patch b/0424-migration-multifd-put-iov-initialization-into-compre.patch new file mode 100644 index 0000000..d91fcd0 --- /dev/null +++ b/0424-migration-multifd-put-iov-initialization-into-compre.patch @@ -0,0 +1,170 @@ +From 0fea6f5ef03cf015c934d10285b4cb55da8e3545 Mon Sep 17 00:00:00 2001 +From: Yuan Liu +Date: Mon, 10 Jun 2024 18:21:05 +0800 +Subject: [PATCH] migration/multifd: put IOV initialization into compression + method + +commit d9d3e4f243214f742425d9d8360f0794bb05c999 upstream. + +Different compression methods may require different numbers of IOVs. +Based on streaming compression of zlib and zstd, all pages will be +compressed to a data block, so two IOVs are needed for packet header +and compressed data block. + +Intel-SIG: commit d9d3e4f24321 migration/multifd: put IOV initialization into compression method + +Signed-off-by: Yuan Liu +Reviewed-by: Nanhai Zou +Reviewed-by: Fabiano Rosas +Reviewed-by: Peter Xu +Signed-off-by: Fabiano Rosas +Signed-off-by: Jason Zeng +--- + migration/multifd-zlib.c | 7 +++++++ + migration/multifd-zstd.c | 8 +++++++- + migration/multifd.c | 22 ++++++++++++---------- + 3 files changed, 26 insertions(+), 11 deletions(-) + +diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c +index b210725f6e..2df4983780 100644 +--- a/migration/multifd-zlib.c ++++ b/migration/multifd-zlib.c +@@ -70,6 +70,10 @@ static int zlib_send_setup(MultiFDSendParams *p, Error **errp) + goto err_free_zbuff; + } + p->compress_data = z; ++ ++ /* Needs 2 IOVs, one for packet header and one for compressed data */ ++ p->iov = g_new0(struct iovec, 2); ++ + return 0; + + err_free_zbuff: +@@ -101,6 +105,9 @@ static void zlib_send_cleanup(MultiFDSendParams *p, Error **errp) + z->buf = NULL; + g_free(p->compress_data); + p->compress_data = NULL; ++ ++ g_free(p->iov); ++ p->iov = NULL; + } + + /** +diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c +index 256858df0a..ca17b7e310 100644 +--- a/migration/multifd-zstd.c ++++ b/migration/multifd-zstd.c +@@ -52,7 +52,6 @@ static int zstd_send_setup(MultiFDSendParams *p, Error **errp) + struct zstd_data *z = g_new0(struct zstd_data, 1); + int res; + +- p->compress_data = z; + z->zcs = ZSTD_createCStream(); + if (!z->zcs) { + g_free(z); +@@ -77,6 +76,10 @@ static int zstd_send_setup(MultiFDSendParams *p, Error **errp) + error_setg(errp, "multifd %u: out of memory for zbuff", p->id); + return -1; + } ++ p->compress_data = z; ++ ++ /* Needs 2 IOVs, one for packet header and one for compressed data */ ++ p->iov = g_new0(struct iovec, 2); + return 0; + } + +@@ -98,6 +101,9 @@ static void zstd_send_cleanup(MultiFDSendParams *p, Error **errp) + z->zbuff = NULL; + g_free(p->compress_data); + p->compress_data = NULL; ++ ++ g_free(p->iov); ++ p->iov = NULL; + } + + /** +diff --git a/migration/multifd.c b/migration/multifd.c +index 
fbab8c5b72..eae7040039 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -112,6 +112,13 @@ static int nocomp_send_setup(MultiFDSendParams *p, Error **errp) + p->write_flags |= QIO_CHANNEL_WRITE_FLAG_ZERO_COPY; + } + ++ if (multifd_use_packets()) { ++ /* We need one extra place for the packet header */ ++ p->iov = g_new0(struct iovec, p->page_count + 1); ++ } else { ++ p->iov = g_new0(struct iovec, p->page_count); ++ } ++ + return 0; + } + +@@ -125,6 +132,8 @@ static int nocomp_send_setup(MultiFDSendParams *p, Error **errp) + */ + static void nocomp_send_cleanup(MultiFDSendParams *p, Error **errp) + { ++ g_free(p->iov); ++ p->iov = NULL; + return; + } + +@@ -201,6 +210,7 @@ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp) + */ + static int nocomp_recv_setup(MultiFDRecvParams *p, Error **errp) + { ++ p->iov = g_new0(struct iovec, p->page_count); + return 0; + } + +@@ -213,6 +223,8 @@ static int nocomp_recv_setup(MultiFDRecvParams *p, Error **errp) + */ + static void nocomp_recv_cleanup(MultiFDRecvParams *p) + { ++ g_free(p->iov); ++ p->iov = NULL; + } + + /** +@@ -733,8 +745,6 @@ static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) + p->packet_len = 0; + g_free(p->packet); + p->packet = NULL; +- g_free(p->iov); +- p->iov = NULL; + multifd_send_state->ops->send_cleanup(p, errp); + + return *errp == NULL; +@@ -1116,11 +1126,6 @@ bool multifd_send_setup(void) + p->packet = g_malloc0(p->packet_len); + p->packet->magic = cpu_to_be32(MULTIFD_MAGIC); + p->packet->version = cpu_to_be32(MULTIFD_VERSION); +- +- /* We need one extra place for the packet header */ +- p->iov = g_new0(struct iovec, page_count + 1); +- } else { +- p->iov = g_new0(struct iovec, page_count); + } + p->name = g_strdup_printf("multifdsend_%d", i); + p->page_size = qemu_target_page_size(); +@@ -1220,8 +1225,6 @@ static void multifd_recv_cleanup_channel(MultiFDRecvParams *p) + p->packet_len = 0; + g_free(p->packet); + p->packet = NULL; +- g_free(p->iov); +- p->iov = NULL; + g_free(p->normal); + p->normal = NULL; + g_free(p->zero); +@@ -1399,7 +1402,6 @@ int multifd_recv_setup(Error **errp) + p->packet = g_malloc0(p->packet_len); + } + p->name = g_strdup_printf("multifdrecv_%d", i); +- p->iov = g_new0(struct iovec, page_count); + p->normal = g_new0(ram_addr_t, page_count); + p->zero = g_new0(ram_addr_t, page_count); + p->page_count = page_count; +-- +2.43.0 + diff --git a/0425-configure-add-enable-qpl-build-option.patch b/0425-configure-add-enable-qpl-build-option.patch new file mode 100644 index 0000000..be84132 --- /dev/null +++ b/0425-configure-add-enable-qpl-build-option.patch @@ -0,0 +1,102 @@ +From 337dc1286acd6f1ed34051fb33431b732dc28ec0 Mon Sep 17 00:00:00 2001 +From: Yuan Liu +Date: Mon, 10 Jun 2024 18:21:06 +0800 +Subject: [PATCH] configure: add --enable-qpl build option + +commit b844a2c7cc7f7c7756a27d372e64f6688d67c4eb upstream. + +add --enable-qpl and --disable-qpl options to enable and disable +the QPL compression method for multifd migration. + +The Query Processing Library (QPL) is an open-source library +that supports data compression and decompression features. It +is based on the deflate compression algorithm and use Intel +In-Memory Analytics Accelerator(IAA) hardware for compression +and decompression acceleration. 
+ +For more live migration with IAA, please refer to the document +docs/devel/migration/qpl-compression.rst + +Intel-SIG: commit b844a2c7cc7f configure: add --enable-qpl build option + +Signed-off-by: Yuan Liu +Reviewed-by: Nanhai Zou +Reviewed-by: Fabiano Rosas +Signed-off-by: Fabiano Rosas +Signed-off-by: Jason Zeng +--- + meson.build | 8 ++++++++ + meson_options.txt | 2 ++ + scripts/meson-buildoptions.sh | 3 +++ + 3 files changed, 13 insertions(+) + +diff --git a/meson.build b/meson.build +index d92384c23a..c833cd2d47 100644 +--- a/meson.build ++++ b/meson.build +@@ -1051,6 +1051,12 @@ if not get_option('zstd').auto() or have_block + required: get_option('zstd'), + method: 'pkg-config') + endif ++qpl = not_found ++if not get_option('qpl').auto() or have_system ++ qpl = dependency('qpl', version: '>=1.5.0', ++ required: get_option('qpl'), ++ method: 'pkg-config') ++endif + virgl = not_found + + have_vhost_user_gpu = have_tools and targetos == 'linux' and pixman.found() +@@ -2218,6 +2224,7 @@ config_host_data.set('CONFIG_MALLOC_TRIM', has_malloc_trim) + config_host_data.set('CONFIG_STATX', has_statx) + config_host_data.set('CONFIG_STATX_MNT_ID', has_statx_mnt_id) + config_host_data.set('CONFIG_ZSTD', zstd.found()) ++config_host_data.set('CONFIG_QPL', qpl.found()) + config_host_data.set('CONFIG_FUSE', fuse.found()) + config_host_data.set('CONFIG_FUSE_LSEEK', fuse_lseek.found()) + config_host_data.set('CONFIG_SPICE_PROTOCOL', spice_protocol.found()) +@@ -4393,6 +4400,7 @@ summary_info += {'snappy support': snappy} + summary_info += {'bzip2 support': libbzip2} + summary_info += {'lzfse support': liblzfse} + summary_info += {'zstd support': zstd} ++summary_info += {'Query Processing Library support': qpl} + summary_info += {'NUMA host support': numa} + summary_info += {'capstone': capstone} + summary_info += {'libpmem support': libpmem} +diff --git a/meson_options.txt b/meson_options.txt +index c9baeda639..e16aa8c823 100644 +--- a/meson_options.txt ++++ b/meson_options.txt +@@ -259,6 +259,8 @@ option('xkbcommon', type : 'feature', value : 'auto', + description: 'xkbcommon support') + option('zstd', type : 'feature', value : 'auto', + description: 'zstd compression support') ++option('qpl', type : 'feature', value : 'auto', ++ description: 'Query Processing Library support') + option('fuse', type: 'feature', value: 'auto', + description: 'FUSE block device export') + option('fuse_lseek', type : 'feature', value : 'auto', +diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh +index 680fa3f581..784f74fde9 100644 +--- a/scripts/meson-buildoptions.sh ++++ b/scripts/meson-buildoptions.sh +@@ -222,6 +222,7 @@ meson_options_help() { + printf "%s\n" ' Xen PCI passthrough support' + printf "%s\n" ' xkbcommon xkbcommon support' + printf "%s\n" ' zstd zstd compression support' ++ printf "%s\n" ' qpl Query Processing Library support' + } + _meson_option_parse() { + case $1 in +@@ -562,6 +563,8 @@ _meson_option_parse() { + --disable-xkbcommon) printf "%s" -Dxkbcommon=disabled ;; + --enable-zstd) printf "%s" -Dzstd=enabled ;; + --disable-zstd) printf "%s" -Dzstd=disabled ;; ++ --enable-qpl) printf "%s" -Dqpl=enabled ;; ++ --disable-qpl) printf "%s" -Dqpl=disabled ;; + *) return 1 ;; + esac + } +-- +2.43.0 + diff --git a/0426-migration-multifd-add-qpl-compression-method.patch b/0426-migration-multifd-add-qpl-compression-method.patch new file mode 100644 index 0000000..aaeb0d9 --- /dev/null +++ b/0426-migration-multifd-add-qpl-compression-method.patch @@ -0,0 +1,127 @@ +From 
0bd175c2fba5c80ac5b51d5e3f81d2969c89c743 Mon Sep 17 00:00:00 2001 +From: Yuan Liu +Date: Mon, 10 Jun 2024 18:21:07 +0800 +Subject: [PATCH] migration/multifd: add qpl compression method + +commit 354cac2859e48ec5f7ee72a2a071da6c60a462d0 upstream. + +add the Query Processing Library (QPL) compression method + +Introduce the qpl as a new multifd migration compression method, it can +use In-Memory Analytics Accelerator(IAA) to accelerate compression and +decompression, which can not only reduce network bandwidth requirement +but also reduce host compression and decompression CPU overhead. + +How to enable qpl compression during migration: +migrate_set_parameter multifd-compression qpl + +There is no qpl compression level parameter added since it only supports +level one, users do not need to specify the qpl compression level. + +Intel-SIG: commit 354cac2859e4 migration/multifd: add qpl compression method + +Signed-off-by: Yuan Liu +Reviewed-by: Nanhai Zou +Reviewed-by: Peter Xu +Reviewed-by: Fabiano Rosas +[fixed docs spacing in migration.json] +Signed-off-by: Fabiano Rosas +Signed-off-by: Jason Zeng +--- + hw/core/qdev-properties-system.c | 2 +- + migration/meson.build | 1 + + migration/multifd-qpl.c | 20 ++++++++++++++++++++ + migration/multifd.h | 1 + + qapi/migration.json | 8 +++++++- + 5 files changed, 30 insertions(+), 2 deletions(-) + create mode 100644 migration/multifd-qpl.c + +diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-system.c +index 67d40c526c..523b3a9bae 100644 +--- a/hw/core/qdev-properties-system.c ++++ b/hw/core/qdev-properties-system.c +@@ -666,7 +666,7 @@ const PropertyInfo qdev_prop_fdc_drive_type = { + const PropertyInfo qdev_prop_multifd_compression = { + .name = "MultiFDCompression", + .description = "multifd_compression values, " +- "none/zlib/zstd", ++ "none/zlib/zstd/qpl", + .enum_table = &MultiFDCompression_lookup, + .get = qdev_propinfo_get_enum, + .set = qdev_propinfo_set_enum, +diff --git a/migration/meson.build b/migration/meson.build +index 1eeb915ff6..cb177de1d2 100644 +--- a/migration/meson.build ++++ b/migration/meson.build +@@ -41,6 +41,7 @@ if get_option('live_block_migration').allowed() + system_ss.add(files('block.c')) + endif + system_ss.add(when: zstd, if_true: files('multifd-zstd.c')) ++system_ss.add(when: qpl, if_true: files('multifd-qpl.c')) + + specific_ss.add(when: 'CONFIG_SYSTEM_ONLY', + if_true: files('ram.c', +diff --git a/migration/multifd-qpl.c b/migration/multifd-qpl.c +new file mode 100644 +index 0000000000..056a68a060 +--- /dev/null ++++ b/migration/multifd-qpl.c +@@ -0,0 +1,20 @@ ++/* ++ * Multifd qpl compression accelerator implementation ++ * ++ * Copyright (c) 2023 Intel Corporation ++ * ++ * Authors: ++ * Yuan Liu ++ * ++ * This work is licensed under the terms of the GNU GPL, version 2 or later. ++ * See the COPYING file in the top-level directory. 
++ */ ++#include "qemu/osdep.h" ++#include "qemu/module.h" ++ ++static void multifd_qpl_register(void) ++{ ++ /* noop */ ++} ++ ++migration_init(multifd_qpl_register); +diff --git a/migration/multifd.h b/migration/multifd.h +index d99603c6a4..11f05dd6d5 100644 +--- a/migration/multifd.h ++++ b/migration/multifd.h +@@ -33,6 +33,7 @@ bool multifd_queue_page(RAMBlock *block, ram_addr_t offset); + #define MULTIFD_FLAG_NOCOMP (0 << 1) + #define MULTIFD_FLAG_ZLIB (1 << 1) + #define MULTIFD_FLAG_ZSTD (2 << 1) ++#define MULTIFD_FLAG_QPL (4 << 1) + + /* This value needs to be a multiple of qemu_target_page_size() */ + #define MULTIFD_PACKET_SIZE (512 * 1024) +diff --git a/qapi/migration.json b/qapi/migration.json +index e7249f24f8..720988007d 100644 +--- a/qapi/migration.json ++++ b/qapi/migration.json +@@ -625,11 +625,17 @@ + # + # @zstd: use zstd compression method. + # ++# @qpl: use qpl compression method. Query Processing Library(qpl) is ++# based on the deflate compression algorithm and use the Intel ++# In-Memory Analytics Accelerator(IAA) accelerated compression ++# and decompression. (Since 9.1) ++# + # Since: 5.0 + ## + { 'enum': 'MultiFDCompression', + 'data': [ 'none', 'zlib', +- { 'name': 'zstd', 'if': 'CONFIG_ZSTD' } ] } ++ { 'name': 'zstd', 'if': 'CONFIG_ZSTD' }, ++ { 'name': 'qpl', 'if': 'CONFIG_QPL' } ] } + + ## + # @MigMode: +-- +2.43.0 + diff --git a/0427-migration-multifd-include-ram-h-in-multifd-h.patch b/0427-migration-multifd-include-ram-h-in-multifd-h.patch new file mode 100644 index 0000000..4023082 --- /dev/null +++ b/0427-migration-multifd-include-ram-h-in-multifd-h.patch @@ -0,0 +1,33 @@ +From cb50e47a3d8b16d493306615c72bdf3555121007 Mon Sep 17 00:00:00 2001 +From: Jason Zeng +Date: Wed, 2 Apr 2025 18:09:21 +0800 +Subject: [PATCH] migration/multifd: include ram.h in multifd.h + +Header file ram.h was included by multifd.h when mapped-ram was +introduced in upstream code. This inclusion is needed by qpl when +multifd-qpl.c includes multifd.h. Since we don't backport +mapped-ram, add this inclusion separately. + +Intel-SIG: commit - migration/multifd: include ram.h in multifd.h + +Signed-off-by: Jason Zeng +--- + migration/multifd.h | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/migration/multifd.h b/migration/multifd.h +index 11f05dd6d5..41965df7a9 100644 +--- a/migration/multifd.h ++++ b/migration/multifd.h +@@ -13,6 +13,8 @@ + #ifndef QEMU_MIGRATION_MULTIFD_H + #define QEMU_MIGRATION_MULTIFD_H + ++#include "ram.h" ++ + bool multifd_send_setup(void); + void multifd_send_shutdown(void); + int multifd_recv_setup(Error **errp); +-- +2.43.0 + diff --git a/0428-migration-multifd-implement-initialization-of-qpl-co.patch b/0428-migration-multifd-implement-initialization-of-qpl-co.patch new file mode 100644 index 0000000..a67c30a --- /dev/null +++ b/0428-migration-multifd-implement-initialization-of-qpl-co.patch @@ -0,0 +1,371 @@ +From 316bcefcfc17264738d0c19f99eb174c462f1393 Mon Sep 17 00:00:00 2001 +From: Yuan Liu +Date: Mon, 10 Jun 2024 18:21:08 +0800 +Subject: [PATCH] migration/multifd: implement initialization of qpl + compression + +commit 34e104b897da6e144a5f34e7c5eebf8a4c4d9d59 upstream. + +during initialization, a software job is allocated to each channel +for software path fallabck when the IAA hardware is unavailable or +the hardware job submission fails. If the IAA hardware is available, +multiple hardware jobs are allocated for batch processing. 
+ +Intel-SIG: commit 34e104b897da migration/multifd: implement initialization of qpl compression + +Signed-off-by: Yuan Liu +Reviewed-by: Nanhai Zou +Reviewed-by: Fabiano Rosas +Signed-off-by: Fabiano Rosas +Signed-off-by: Jason Zeng +--- + migration/multifd-qpl.c | 328 +++++++++++++++++++++++++++++++++++++++- + 1 file changed, 327 insertions(+), 1 deletion(-) + +diff --git a/migration/multifd-qpl.c b/migration/multifd-qpl.c +index 056a68a060..6791a204d5 100644 +--- a/migration/multifd-qpl.c ++++ b/migration/multifd-qpl.c +@@ -9,12 +9,338 @@ + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ ++ + #include "qemu/osdep.h" + #include "qemu/module.h" ++#include "qapi/error.h" ++#include "multifd.h" ++#include "qpl/qpl.h" ++ ++typedef struct { ++ /* the QPL hardware path job */ ++ qpl_job *job; ++ /* indicates if fallback to software path is required */ ++ bool fallback_sw_path; ++ /* output data from the software path */ ++ uint8_t *sw_output; ++ /* output data length from the software path */ ++ uint32_t sw_output_len; ++} QplHwJob; ++ ++typedef struct { ++ /* array of hardware jobs, the number of jobs equals the number pages */ ++ QplHwJob *hw_jobs; ++ /* the QPL software job for the slow path and software fallback */ ++ qpl_job *sw_job; ++ /* the number of pages that the QPL needs to process at one time */ ++ uint32_t page_num; ++ /* array of compressed page buffers */ ++ uint8_t *zbuf; ++ /* array of compressed page lengths */ ++ uint32_t *zlen; ++ /* the status of the hardware device */ ++ bool hw_avail; ++} QplData; ++ ++/** ++ * check_hw_avail: check if IAA hardware is available ++ * ++ * If the IAA hardware does not exist or is unavailable, ++ * the QPL hardware job initialization will fail. ++ * ++ * Returns true if IAA hardware is available, otherwise false. ++ * ++ * @job_size: indicates the hardware job size if hardware is available ++ */ ++static bool check_hw_avail(uint32_t *job_size) ++{ ++ qpl_path_t path = qpl_path_hardware; ++ uint32_t size = 0; ++ qpl_job *job; ++ ++ if (qpl_get_job_size(path, &size) != QPL_STS_OK) { ++ return false; ++ } ++ assert(size > 0); ++ job = g_malloc0(size); ++ if (qpl_init_job(path, job) != QPL_STS_OK) { ++ g_free(job); ++ return false; ++ } ++ g_free(job); ++ *job_size = size; ++ return true; ++} ++ ++/** ++ * multifd_qpl_free_sw_job: clean up software job ++ * ++ * Free the software job resources. ++ * ++ * @qpl: pointer to the QplData structure ++ */ ++static void multifd_qpl_free_sw_job(QplData *qpl) ++{ ++ assert(qpl); ++ if (qpl->sw_job) { ++ qpl_fini_job(qpl->sw_job); ++ g_free(qpl->sw_job); ++ qpl->sw_job = NULL; ++ } ++} ++ ++/** ++ * multifd_qpl_free_jobs: clean up hardware jobs ++ * ++ * Free all hardware job resources. 
++ * ++ * @qpl: pointer to the QplData structure ++ */ ++static void multifd_qpl_free_hw_job(QplData *qpl) ++{ ++ assert(qpl); ++ if (qpl->hw_jobs) { ++ for (int i = 0; i < qpl->page_num; i++) { ++ qpl_fini_job(qpl->hw_jobs[i].job); ++ g_free(qpl->hw_jobs[i].job); ++ qpl->hw_jobs[i].job = NULL; ++ } ++ g_free(qpl->hw_jobs); ++ qpl->hw_jobs = NULL; ++ } ++} ++ ++/** ++ * multifd_qpl_init_sw_job: initialize a software job ++ * ++ * Use the QPL software path to initialize a job ++ * ++ * @qpl: pointer to the QplData structure ++ * @errp: pointer to an error ++ */ ++static int multifd_qpl_init_sw_job(QplData *qpl, Error **errp) ++{ ++ qpl_path_t path = qpl_path_software; ++ uint32_t size = 0; ++ qpl_job *job = NULL; ++ qpl_status status; ++ ++ status = qpl_get_job_size(path, &size); ++ if (status != QPL_STS_OK) { ++ error_setg(errp, "qpl_get_job_size failed with error %d", status); ++ return -1; ++ } ++ job = g_malloc0(size); ++ status = qpl_init_job(path, job); ++ if (status != QPL_STS_OK) { ++ error_setg(errp, "qpl_init_job failed with error %d", status); ++ g_free(job); ++ return -1; ++ } ++ qpl->sw_job = job; ++ return 0; ++} ++ ++/** ++ * multifd_qpl_init_jobs: initialize hardware jobs ++ * ++ * Use the QPL hardware path to initialize jobs ++ * ++ * @qpl: pointer to the QplData structure ++ * @size: the size of QPL hardware path job ++ * @errp: pointer to an error ++ */ ++static void multifd_qpl_init_hw_job(QplData *qpl, uint32_t size, Error **errp) ++{ ++ qpl_path_t path = qpl_path_hardware; ++ qpl_job *job = NULL; ++ qpl_status status; ++ ++ qpl->hw_jobs = g_new0(QplHwJob, qpl->page_num); ++ for (int i = 0; i < qpl->page_num; i++) { ++ job = g_malloc0(size); ++ status = qpl_init_job(path, job); ++ /* the job initialization should succeed after check_hw_avail */ ++ assert(status == QPL_STS_OK); ++ qpl->hw_jobs[i].job = job; ++ } ++} ++ ++/** ++ * multifd_qpl_init: initialize QplData structure ++ * ++ * Allocate and initialize a QplData structure ++ * ++ * Returns a QplData pointer on success or NULL on error ++ * ++ * @num: the number of pages ++ * @size: the page size ++ * @errp: pointer to an error ++ */ ++static QplData *multifd_qpl_init(uint32_t num, uint32_t size, Error **errp) ++{ ++ uint32_t job_size = 0; ++ QplData *qpl; ++ ++ qpl = g_new0(QplData, 1); ++ qpl->page_num = num; ++ if (multifd_qpl_init_sw_job(qpl, errp) != 0) { ++ g_free(qpl); ++ return NULL; ++ } ++ qpl->hw_avail = check_hw_avail(&job_size); ++ if (qpl->hw_avail) { ++ multifd_qpl_init_hw_job(qpl, job_size, errp); ++ } ++ qpl->zbuf = g_malloc0(size * num); ++ qpl->zlen = g_new0(uint32_t, num); ++ return qpl; ++} ++ ++/** ++ * multifd_qpl_deinit: clean up QplData structure ++ * ++ * Free jobs, buffers and the QplData structure ++ * ++ * @qpl: pointer to the QplData structure ++ */ ++static void multifd_qpl_deinit(QplData *qpl) ++{ ++ if (qpl) { ++ multifd_qpl_free_sw_job(qpl); ++ multifd_qpl_free_hw_job(qpl); ++ g_free(qpl->zbuf); ++ g_free(qpl->zlen); ++ g_free(qpl); ++ } ++} ++ ++/** ++ * multifd_qpl_send_setup: set up send side ++ * ++ * Set up the channel with QPL compression. ++ * ++ * Returns 0 on success or -1 on error ++ * ++ * @p: Params for the channel being used ++ * @errp: pointer to an error ++ */ ++static int multifd_qpl_send_setup(MultiFDSendParams *p, Error **errp) ++{ ++ QplData *qpl; ++ ++ qpl = multifd_qpl_init(p->page_count, p->page_size, errp); ++ if (!qpl) { ++ return -1; ++ } ++ p->compress_data = qpl; ++ ++ /* ++ * the page will be compressed independently and sent using an IOV. 
The ++ * additional two IOVs are used to store packet header and compressed data ++ * length ++ */ ++ p->iov = g_new0(struct iovec, p->page_count + 2); ++ return 0; ++} ++ ++/** ++ * multifd_qpl_send_cleanup: clean up send side ++ * ++ * Close the channel and free memory. ++ * ++ * @p: Params for the channel being used ++ * @errp: pointer to an error ++ */ ++static void multifd_qpl_send_cleanup(MultiFDSendParams *p, Error **errp) ++{ ++ multifd_qpl_deinit(p->compress_data); ++ p->compress_data = NULL; ++ g_free(p->iov); ++ p->iov = NULL; ++} ++ ++/** ++ * multifd_qpl_send_prepare: prepare data to be able to send ++ * ++ * Create a compressed buffer with all the pages that we are going to ++ * send. ++ * ++ * Returns 0 on success or -1 on error ++ * ++ * @p: Params for the channel being used ++ * @errp: pointer to an error ++ */ ++static int multifd_qpl_send_prepare(MultiFDSendParams *p, Error **errp) ++{ ++ /* Implement in next patch */ ++ return -1; ++} ++ ++/** ++ * multifd_qpl_recv_setup: set up receive side ++ * ++ * Create the compressed channel and buffer. ++ * ++ * Returns 0 on success or -1 on error ++ * ++ * @p: Params for the channel being used ++ * @errp: pointer to an error ++ */ ++static int multifd_qpl_recv_setup(MultiFDRecvParams *p, Error **errp) ++{ ++ QplData *qpl; ++ ++ qpl = multifd_qpl_init(p->page_count, p->page_size, errp); ++ if (!qpl) { ++ return -1; ++ } ++ p->compress_data = qpl; ++ return 0; ++} ++ ++/** ++ * multifd_qpl_recv_cleanup: set up receive side ++ * ++ * Close the channel and free memory. ++ * ++ * @p: Params for the channel being used ++ */ ++static void multifd_qpl_recv_cleanup(MultiFDRecvParams *p) ++{ ++ multifd_qpl_deinit(p->compress_data); ++ p->compress_data = NULL; ++} ++ ++/** ++ * multifd_qpl_recv: read the data from the channel into actual pages ++ * ++ * Read the compressed buffer, and uncompress it into the actual ++ * pages. ++ * ++ * Returns 0 on success or -1 on error ++ * ++ * @p: Params for the channel being used ++ * @errp: pointer to an error ++ */ ++static int multifd_qpl_recv(MultiFDRecvParams *p, Error **errp) ++{ ++ /* Implement in next patch */ ++ return -1; ++} ++ ++static MultiFDMethods multifd_qpl_ops = { ++ .send_setup = multifd_qpl_send_setup, ++ .send_cleanup = multifd_qpl_send_cleanup, ++ .send_prepare = multifd_qpl_send_prepare, ++ .recv_setup = multifd_qpl_recv_setup, ++ .recv_cleanup = multifd_qpl_recv_cleanup, ++ .recv = multifd_qpl_recv, ++}; + + static void multifd_qpl_register(void) + { +- /* noop */ ++ multifd_register_ops(MULTIFD_COMPRESSION_QPL, &multifd_qpl_ops); + } + + migration_init(multifd_qpl_register); +-- +2.43.0 + diff --git a/0429-migration-multifd-implement-qpl-compression-and-deco.patch b/0429-migration-multifd-implement-qpl-compression-and-deco.patch new file mode 100644 index 0000000..409a56d --- /dev/null +++ b/0429-migration-multifd-implement-qpl-compression-and-deco.patch @@ -0,0 +1,512 @@ +From 0b6326ea51682ffbb95a49483d47205666cae5e6 Mon Sep 17 00:00:00 2001 +From: Yuan Liu +Date: Mon, 10 Jun 2024 18:21:09 +0800 +Subject: [PATCH] migration/multifd: implement qpl compression and + decompression + +commit f6fe9fea995249ecc2cd72975d803fbf4d512c02 upstream. + +QPL compression and decompression will use IAA hardware path if the IAA +hardware is available. Otherwise the QPL library software path is used. + +The hardware path will automatically fall back to QPL software path if +the IAA queues are busy. 
In some scenarios, this may happen frequently, +such as configuring 4 channels but only one IAA device is available. In +the case of insufficient IAA hardware resources, retry and fallback can +help optimize performance: + + 1. Retry + SW fallback: + total time: 14649 ms + downtime: 25 ms + throughput: 17666.57 mbps + pages-per-second: 1509647 + + 2. No fallback, always wait for work queues to become available + total time: 18381 ms + downtime: 25 ms + throughput: 13698.65 mbps + pages-per-second: 859607 + +If both the hardware and software paths fail, the uncompressed page is +sent directly. + +Intel-SIG: commit f6fe9fea9952 migration/multifd: implement qpl compression and decompression + +Signed-off-by: Yuan Liu +Reviewed-by: Nanhai Zou +Reviewed-by: Fabiano Rosas +Signed-off-by: Fabiano Rosas +Signed-off-by: Jason Zeng +--- + migration/multifd-qpl.c | 424 +++++++++++++++++++++++++++++++++++++++- + 1 file changed, 420 insertions(+), 4 deletions(-) + +diff --git a/migration/multifd-qpl.c b/migration/multifd-qpl.c +index 6791a204d5..9265098ee7 100644 +--- a/migration/multifd-qpl.c ++++ b/migration/multifd-qpl.c +@@ -13,9 +13,14 @@ + #include "qemu/osdep.h" + #include "qemu/module.h" + #include "qapi/error.h" ++#include "qapi/qapi-types-migration.h" ++#include "exec/ramblock.h" + #include "multifd.h" + #include "qpl/qpl.h" + ++/* Maximum number of retries to resubmit a job if IAA work queues are full */ ++#define MAX_SUBMIT_RETRY_NUM (3) ++ + typedef struct { + /* the QPL hardware path job */ + qpl_job *job; +@@ -260,6 +265,225 @@ static void multifd_qpl_send_cleanup(MultiFDSendParams *p, Error **errp) + p->iov = NULL; + } + ++/** ++ * multifd_qpl_prepare_job: prepare the job ++ * ++ * Set the QPL job parameters and properties. ++ * ++ * @job: pointer to the qpl_job structure ++ * @is_compression: indicates compression and decompression ++ * @input: pointer to the input data buffer ++ * @input_len: the length of the input data ++ * @output: pointer to the output data buffer ++ * @output_len: the length of the output data ++ */ ++static void multifd_qpl_prepare_job(qpl_job *job, bool is_compression, ++ uint8_t *input, uint32_t input_len, ++ uint8_t *output, uint32_t output_len) ++{ ++ job->op = is_compression ? qpl_op_compress : qpl_op_decompress; ++ job->next_in_ptr = input; ++ job->next_out_ptr = output; ++ job->available_in = input_len; ++ job->available_out = output_len; ++ job->flags = QPL_FLAG_FIRST | QPL_FLAG_LAST | QPL_FLAG_OMIT_VERIFY; ++ /* only supports compression level 1 */ ++ job->level = 1; ++} ++ ++/** ++ * multifd_qpl_prepare_comp_job: prepare the compression job ++ * ++ * Set the compression job parameters and properties. ++ * ++ * @job: pointer to the qpl_job structure ++ * @input: pointer to the input data buffer ++ * @output: pointer to the output data buffer ++ * @size: the page size ++ */ ++static void multifd_qpl_prepare_comp_job(qpl_job *job, uint8_t *input, ++ uint8_t *output, uint32_t size) ++{ ++ /* ++ * Set output length to less than the page size to force the job to ++ * fail in case it compresses to a larger size. We'll send that page ++ * without compression and skip the decompression operation on the ++ * destination. ++ */ ++ multifd_qpl_prepare_job(job, true, input, size, output, size - 1); ++} ++ ++/** ++ * multifd_qpl_prepare_decomp_job: prepare the decompression job ++ * ++ * Set the decompression job parameters and properties. 
++ * ++ * @job: pointer to the qpl_job structure ++ * @input: pointer to the input data buffer ++ * @len: the length of the input data ++ * @output: pointer to the output data buffer ++ * @size: the page size ++ */ ++static void multifd_qpl_prepare_decomp_job(qpl_job *job, uint8_t *input, ++ uint32_t len, uint8_t *output, ++ uint32_t size) ++{ ++ multifd_qpl_prepare_job(job, false, input, len, output, size); ++} ++ ++/** ++ * multifd_qpl_fill_iov: fill in the IOV ++ * ++ * Fill in the QPL packet IOV ++ * ++ * @p: Params for the channel being used ++ * @data: pointer to the IOV data ++ * @len: The length of the IOV data ++ */ ++static void multifd_qpl_fill_iov(MultiFDSendParams *p, uint8_t *data, ++ uint32_t len) ++{ ++ p->iov[p->iovs_num].iov_base = data; ++ p->iov[p->iovs_num].iov_len = len; ++ p->iovs_num++; ++ p->next_packet_size += len; ++} ++ ++/** ++ * multifd_qpl_fill_packet: fill the compressed page into the QPL packet ++ * ++ * Fill the compressed page length and IOV into the QPL packet ++ * ++ * @idx: The index of the compressed length array ++ * @p: Params for the channel being used ++ * @data: pointer to the compressed page buffer ++ * @len: The length of the compressed page ++ */ ++static void multifd_qpl_fill_packet(uint32_t idx, MultiFDSendParams *p, ++ uint8_t *data, uint32_t len) ++{ ++ QplData *qpl = p->compress_data; ++ ++ qpl->zlen[idx] = cpu_to_be32(len); ++ multifd_qpl_fill_iov(p, data, len); ++} ++ ++/** ++ * multifd_qpl_submit_job: submit a job to the hardware ++ * ++ * Submit a QPL hardware job to the IAA device ++ * ++ * Returns true if the job is submitted successfully, otherwise false. ++ * ++ * @job: pointer to the qpl_job structure ++ */ ++static bool multifd_qpl_submit_job(qpl_job *job) ++{ ++ qpl_status status; ++ uint32_t num = 0; ++ ++retry: ++ status = qpl_submit_job(job); ++ if (status == QPL_STS_QUEUES_ARE_BUSY_ERR) { ++ if (num < MAX_SUBMIT_RETRY_NUM) { ++ num++; ++ goto retry; ++ } ++ } ++ return (status == QPL_STS_OK); ++} ++ ++/** ++ * multifd_qpl_compress_pages_slow_path: compress pages using slow path ++ * ++ * Compress the pages using software. If compression fails, the uncompressed ++ * page will be sent. ++ * ++ * @p: Params for the channel being used ++ */ ++static void multifd_qpl_compress_pages_slow_path(MultiFDSendParams *p) ++{ ++ QplData *qpl = p->compress_data; ++ uint32_t size = p->page_size; ++ qpl_job *job = qpl->sw_job; ++ uint8_t *zbuf = qpl->zbuf; ++ uint8_t *buf; ++ ++ for (int i = 0; i < p->pages->normal_num; i++) { ++ buf = p->pages->block->host + p->pages->offset[i]; ++ multifd_qpl_prepare_comp_job(job, buf, zbuf, size); ++ if (qpl_execute_job(job) == QPL_STS_OK) { ++ multifd_qpl_fill_packet(i, p, zbuf, job->total_out); ++ } else { ++ /* send the uncompressed page */ ++ multifd_qpl_fill_packet(i, p, buf, size); ++ } ++ zbuf += size; ++ } ++} ++ ++/** ++ * multifd_qpl_compress_pages: compress pages ++ * ++ * Submit the pages to the IAA hardware for compression. If hardware ++ * compression fails, it falls back to software compression. If software ++ * compression also fails, the uncompressed page is sent. 
++ * ++ * @p: Params for the channel being used ++ */ ++static void multifd_qpl_compress_pages(MultiFDSendParams *p) ++{ ++ QplData *qpl = p->compress_data; ++ MultiFDPages_t *pages = p->pages; ++ uint32_t size = p->page_size; ++ QplHwJob *hw_job; ++ uint8_t *buf; ++ uint8_t *zbuf; ++ ++ for (int i = 0; i < pages->normal_num; i++) { ++ buf = pages->block->host + pages->offset[i]; ++ zbuf = qpl->zbuf + (size * i); ++ hw_job = &qpl->hw_jobs[i]; ++ multifd_qpl_prepare_comp_job(hw_job->job, buf, zbuf, size); ++ if (multifd_qpl_submit_job(hw_job->job)) { ++ hw_job->fallback_sw_path = false; ++ } else { ++ /* ++ * The IAA work queue is full, any immediate subsequent job ++ * submission is likely to fail, sending the page via the QPL ++ * software path at this point gives us a better chance of ++ * finding the queue open for the next pages. ++ */ ++ hw_job->fallback_sw_path = true; ++ multifd_qpl_prepare_comp_job(qpl->sw_job, buf, zbuf, size); ++ if (qpl_execute_job(qpl->sw_job) == QPL_STS_OK) { ++ hw_job->sw_output = zbuf; ++ hw_job->sw_output_len = qpl->sw_job->total_out; ++ } else { ++ hw_job->sw_output = buf; ++ hw_job->sw_output_len = size; ++ } ++ } ++ } ++ ++ for (int i = 0; i < pages->normal_num; i++) { ++ buf = pages->block->host + pages->offset[i]; ++ zbuf = qpl->zbuf + (size * i); ++ hw_job = &qpl->hw_jobs[i]; ++ if (hw_job->fallback_sw_path) { ++ multifd_qpl_fill_packet(i, p, hw_job->sw_output, ++ hw_job->sw_output_len); ++ continue; ++ } ++ if (qpl_wait_job(hw_job->job) == QPL_STS_OK) { ++ multifd_qpl_fill_packet(i, p, zbuf, hw_job->job->total_out); ++ } else { ++ /* send the uncompressed page */ ++ multifd_qpl_fill_packet(i, p, buf, size); ++ } ++ } ++} ++ + /** + * multifd_qpl_send_prepare: prepare data to be able to send + * +@@ -273,8 +497,26 @@ static void multifd_qpl_send_cleanup(MultiFDSendParams *p, Error **errp) + */ + static int multifd_qpl_send_prepare(MultiFDSendParams *p, Error **errp) + { +- /* Implement in next patch */ +- return -1; ++ QplData *qpl = p->compress_data; ++ uint32_t len = 0; ++ ++ if (!multifd_send_prepare_common(p)) { ++ goto out; ++ } ++ ++ /* The first IOV is used to store the compressed page lengths */ ++ len = p->pages->normal_num * sizeof(uint32_t); ++ multifd_qpl_fill_iov(p, (uint8_t *) qpl->zlen, len); ++ if (qpl->hw_avail) { ++ multifd_qpl_compress_pages(p); ++ } else { ++ multifd_qpl_compress_pages_slow_path(p); ++ } ++ ++out: ++ p->flags |= MULTIFD_FLAG_QPL; ++ multifd_send_fill_packet(p); ++ return 0; + } + + /** +@@ -312,6 +554,140 @@ static void multifd_qpl_recv_cleanup(MultiFDRecvParams *p) + p->compress_data = NULL; + } + ++/** ++ * multifd_qpl_process_and_check_job: process and check a QPL job ++ * ++ * Process the job and check whether the job output length is the ++ * same as the specified length ++ * ++ * Returns true if the job execution succeeded and the output length ++ * is equal to the specified length, otherwise false. ++ * ++ * @job: pointer to the qpl_job structure ++ * @is_hardware: indicates whether the job is a hardware job ++ * @len: Specified output length ++ * @errp: pointer to an error ++ */ ++static bool multifd_qpl_process_and_check_job(qpl_job *job, bool is_hardware, ++ uint32_t len, Error **errp) ++{ ++ qpl_status status; ++ ++ status = (is_hardware ? 
qpl_wait_job(job) : qpl_execute_job(job)); ++ if (status != QPL_STS_OK) { ++ error_setg(errp, "qpl job failed with error %d", status); ++ return false; ++ } ++ if (job->total_out != len) { ++ error_setg(errp, "qpl decompressed len %u, expected len %u", ++ job->total_out, len); ++ return false; ++ } ++ return true; ++} ++ ++/** ++ * multifd_qpl_decompress_pages_slow_path: decompress pages using slow path ++ * ++ * Decompress the pages using software ++ * ++ * Returns 0 on success or -1 on error ++ * ++ * @p: Params for the channel being used ++ * @errp: pointer to an error ++ */ ++static int multifd_qpl_decompress_pages_slow_path(MultiFDRecvParams *p, ++ Error **errp) ++{ ++ QplData *qpl = p->compress_data; ++ uint32_t size = p->page_size; ++ qpl_job *job = qpl->sw_job; ++ uint8_t *zbuf = qpl->zbuf; ++ uint8_t *addr; ++ uint32_t len; ++ ++ for (int i = 0; i < p->normal_num; i++) { ++ len = qpl->zlen[i]; ++ addr = p->host + p->normal[i]; ++ /* the page is uncompressed, load it */ ++ if (len == size) { ++ memcpy(addr, zbuf, size); ++ zbuf += size; ++ continue; ++ } ++ multifd_qpl_prepare_decomp_job(job, zbuf, len, addr, size); ++ if (!multifd_qpl_process_and_check_job(job, false, size, errp)) { ++ return -1; ++ } ++ zbuf += len; ++ } ++ return 0; ++} ++ ++/** ++ * multifd_qpl_decompress_pages: decompress pages ++ * ++ * Decompress the pages using the IAA hardware. If hardware ++ * decompression fails, it falls back to software decompression. ++ * ++ * Returns 0 on success or -1 on error ++ * ++ * @p: Params for the channel being used ++ * @errp: pointer to an error ++ */ ++static int multifd_qpl_decompress_pages(MultiFDRecvParams *p, Error **errp) ++{ ++ QplData *qpl = p->compress_data; ++ uint32_t size = p->page_size; ++ uint8_t *zbuf = qpl->zbuf; ++ uint8_t *addr; ++ uint32_t len; ++ qpl_job *job; ++ ++ for (int i = 0; i < p->normal_num; i++) { ++ addr = p->host + p->normal[i]; ++ len = qpl->zlen[i]; ++ /* the page is uncompressed if received length equals the page size */ ++ if (len == size) { ++ memcpy(addr, zbuf, size); ++ zbuf += size; ++ continue; ++ } ++ ++ job = qpl->hw_jobs[i].job; ++ multifd_qpl_prepare_decomp_job(job, zbuf, len, addr, size); ++ if (multifd_qpl_submit_job(job)) { ++ qpl->hw_jobs[i].fallback_sw_path = false; ++ } else { ++ /* ++ * The IAA work queue is full, any immediate subsequent job ++ * submission is likely to fail, sending the page via the QPL ++ * software path at this point gives us a better chance of ++ * finding the queue open for the next pages. 
++ */ ++ qpl->hw_jobs[i].fallback_sw_path = true; ++ job = qpl->sw_job; ++ multifd_qpl_prepare_decomp_job(job, zbuf, len, addr, size); ++ if (!multifd_qpl_process_and_check_job(job, false, size, errp)) { ++ return -1; ++ } ++ } ++ zbuf += len; ++ } ++ ++ for (int i = 0; i < p->normal_num; i++) { ++ /* ignore pages that have already been processed */ ++ if (qpl->zlen[i] == size || qpl->hw_jobs[i].fallback_sw_path) { ++ continue; ++ } ++ ++ job = qpl->hw_jobs[i].job; ++ if (!multifd_qpl_process_and_check_job(job, true, size, errp)) { ++ return -1; ++ } ++ } ++ return 0; ++} + /** + * multifd_qpl_recv: read the data from the channel into actual pages + * +@@ -325,8 +701,48 @@ static void multifd_qpl_recv_cleanup(MultiFDRecvParams *p) + */ + static int multifd_qpl_recv(MultiFDRecvParams *p, Error **errp) + { +- /* Implement in next patch */ +- return -1; ++ QplData *qpl = p->compress_data; ++ uint32_t in_size = p->next_packet_size; ++ uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; ++ uint32_t len = 0; ++ uint32_t zbuf_len = 0; ++ int ret; ++ ++ if (flags != MULTIFD_FLAG_QPL) { ++ error_setg(errp, "multifd %u: flags received %x flags expected %x", ++ p->id, flags, MULTIFD_FLAG_QPL); ++ return -1; ++ } ++ multifd_recv_zero_page_process(p); ++ if (!p->normal_num) { ++ assert(in_size == 0); ++ return 0; ++ } ++ ++ /* read compressed page lengths */ ++ len = p->normal_num * sizeof(uint32_t); ++ assert(len < in_size); ++ ret = qio_channel_read_all(p->c, (void *) qpl->zlen, len, errp); ++ if (ret != 0) { ++ return ret; ++ } ++ for (int i = 0; i < p->normal_num; i++) { ++ qpl->zlen[i] = be32_to_cpu(qpl->zlen[i]); ++ assert(qpl->zlen[i] <= p->page_size); ++ zbuf_len += qpl->zlen[i]; ++ } ++ ++ /* read compressed pages */ ++ assert(in_size == len + zbuf_len); ++ ret = qio_channel_read_all(p->c, (void *) qpl->zbuf, zbuf_len, errp); ++ if (ret != 0) { ++ return ret; ++ } ++ ++ if (qpl->hw_avail) { ++ return multifd_qpl_decompress_pages(p, errp); ++ } ++ return multifd_qpl_decompress_pages_slow_path(p, errp); + } + + static MultiFDMethods multifd_qpl_ops = { +-- +2.43.0 + diff --git a/0430-tests-migration-test-add-qpl-compression-test.patch b/0430-tests-migration-test-add-qpl-compression-test.patch new file mode 100644 index 0000000..2b8f811 --- /dev/null +++ b/0430-tests-migration-test-add-qpl-compression-test.patch @@ -0,0 +1,82 @@ +From 1770b562273243f72bf85466eece8298b706bb4d Mon Sep 17 00:00:00 2001 +From: Yuan Liu +Date: Mon, 10 Jun 2024 18:21:10 +0800 +Subject: [PATCH] tests/migration-test: add qpl compression test + +commit 08b82d207d138173ddd334c91b387213508a6e13 upstream. + +add qpl to compression method test for multifd migration + +the qpl compression supports software path and hardware +path(IAA device), and the hardware path is used first by +default. If the hardware path is unavailable, it will +automatically fallback to the software path for testing. 
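+
+As a rough, illustrative way to exercise the new case on its own (the
+build-directory layout and the arch prefix in the test path below are
+assumptions about a local x86_64 build, not something this patch adds):
+
+    cd build
+    QTEST_QEMU_BINARY=./qemu-system-x86_64 \
+        ./tests/qtest/migration-test -p /x86_64/migration/multifd/tcp/plain/qpl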
+ +Intel-SIG: commit 08b82d207d13 tests/migration-test: add qpl compression test + +Signed-off-by: Yuan Liu +Reviewed-by: Nanhai Zou +Reviewed-by: Peter Xu +Reviewed-by: Fabiano Rosas +Signed-off-by: Fabiano Rosas + + Conflicts: + tests/qtest/migration-test.c +[jz: resolve simple context conflict] +Signed-off-by: Jason Zeng +--- + tests/qtest/migration-test.c | 24 ++++++++++++++++++++++++ + 1 file changed, 24 insertions(+) + +diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c +index 4bdf397828..3ccf4f7ecf 100644 +--- a/tests/qtest/migration-test.c ++++ b/tests/qtest/migration-test.c +@@ -2572,6 +2572,15 @@ test_migrate_precopy_tcp_multifd_zstd_start(QTestState *from, + } + #endif /* CONFIG_ZSTD */ + ++#ifdef CONFIG_QPL ++static void * ++test_migrate_precopy_tcp_multifd_qpl_start(QTestState *from, ++ QTestState *to) ++{ ++ return test_migrate_precopy_tcp_multifd_start_common(from, to, "qpl"); ++} ++#endif /* CONFIG_QPL */ ++ + static void test_multifd_tcp_none(void) + { + MigrateCommon args = { +@@ -2607,6 +2616,17 @@ static void test_multifd_tcp_zstd(void) + } + #endif + ++#ifdef CONFIG_QPL ++static void test_multifd_tcp_qpl(void) ++{ ++ MigrateCommon args = { ++ .listen_uri = "defer", ++ .start_hook = test_migrate_precopy_tcp_multifd_qpl_start, ++ }; ++ test_precopy_common(&args); ++} ++#endif ++ + #ifdef CONFIG_GNUTLS + static void * + test_migrate_multifd_tcp_tls_psk_start_match(QTestState *from, +@@ -3483,6 +3503,10 @@ int main(int argc, char **argv) + migration_test_add("/migration/multifd/tcp/plain/zstd", + test_multifd_tcp_zstd); + #endif ++#ifdef CONFIG_QPL ++ migration_test_add("/migration/multifd/tcp/plain/qpl", ++ test_multifd_tcp_qpl); ++#endif + #ifdef CONFIG_GNUTLS + migration_test_add("/migration/multifd/tcp/tls/psk/match", + test_multifd_tcp_tls_psk_match); +-- +2.43.0 + diff --git a/0431-migration-properly-apply-migration-compression-level.patch b/0431-migration-properly-apply-migration-compression-level.patch new file mode 100644 index 0000000..0d32145 --- /dev/null +++ b/0431-migration-properly-apply-migration-compression-level.patch @@ -0,0 +1,55 @@ +From 96e1cc62a2f88bf1c07002667d01db47e27d25be Mon Sep 17 00:00:00 2001 +From: Bryan Zhang +Date: Fri, 1 Mar 2024 03:59:00 +0000 +Subject: [PATCH] migration: Properly apply migration compression level + parameters + +commit b4014a2bf57ce08e2f6458cd82e9f968facf25c8 upstream. + +Some glue code was missing, so that using `qmp_migrate_set_parameters` +to set `multifd-zstd-level` or `multifd-zlib-level` did not work. This +commit adds the glue code to fix that. 
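+
+For illustration, with the glue in place a request such as the following
+(HMP form shown; the level values are arbitrary picks from the documented
+ranges) is now actually applied:
+
+    (qemu) migrate_set_parameter multifd-zlib-level 2
+    (qemu) migrate_set_parameter multifd-zstd-level 2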
+ +Intel-SIG: commit b4014a2bf57c migration: Properly apply migration compression level parameters + +Signed-off-by: Bryan Zhang +Link: https://lore.kernel.org/r/20240301035901.4006936-2-bryan.zhang@bytedance.com +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/options.c | 12 ++++++++++++ + 1 file changed, 12 insertions(+) + +diff --git a/migration/options.c b/migration/options.c +index 38403bf745..68ff81b885 100644 +--- a/migration/options.c ++++ b/migration/options.c +@@ -1333,6 +1333,12 @@ static void migrate_params_test_apply(MigrateSetParameters *params, + if (params->has_multifd_compression) { + dest->multifd_compression = params->multifd_compression; + } ++ if (params->has_multifd_zlib_level) { ++ dest->multifd_zlib_level = params->multifd_zlib_level; ++ } ++ if (params->has_multifd_zstd_level) { ++ dest->multifd_zstd_level = params->multifd_zstd_level; ++ } + if (params->has_xbzrle_cache_size) { + dest->xbzrle_cache_size = params->xbzrle_cache_size; + } +@@ -1485,6 +1491,12 @@ static void migrate_params_apply(MigrateSetParameters *params, Error **errp) + if (params->has_multifd_compression) { + s->parameters.multifd_compression = params->multifd_compression; + } ++ if (params->has_multifd_zlib_level) { ++ s->parameters.multifd_zlib_level = params->multifd_zlib_level; ++ } ++ if (params->has_multifd_zstd_level) { ++ s->parameters.multifd_zstd_level = params->multifd_zstd_level; ++ } + if (params->has_xbzrle_cache_size) { + s->parameters.xbzrle_cache_size = params->xbzrle_cache_size; + xbzrle_cache_resize(params->xbzrle_cache_size, errp); +-- +2.43.0 + diff --git a/0432-tests-migration-set-compression-level-in-migration-t.patch b/0432-tests-migration-set-compression-level-in-migration-t.patch new file mode 100644 index 0000000..e88042b --- /dev/null +++ b/0432-tests-migration-set-compression-level-in-migration-t.patch @@ -0,0 +1,51 @@ +From 036187650997385d3efe5694a16437a2123bbfaf Mon Sep 17 00:00:00 2001 +From: Bryan Zhang +Date: Fri, 1 Mar 2024 03:59:01 +0000 +Subject: [PATCH] tests/migration: Set compression level in migration tests + +commit 2b571432314ab42da742fbb578f4174166ecd7f5 upstream. + +Adds calls to set compression level for `zstd` and `zlib` migration +tests, just to make sure that the calls work. + +Intel-SIG: commit 2b571432314a tests/migration: Set compression level in migration tests + +Signed-off-by: Bryan Zhang +Link: https://lore.kernel.org/r/20240301035901.4006936-3-bryan.zhang@bytedance.com +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + tests/qtest/migration-test.c | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c +index 3ccf4f7ecf..10a3f99d6c 100644 +--- a/tests/qtest/migration-test.c ++++ b/tests/qtest/migration-test.c +@@ -2560,6 +2560,13 @@ static void * + test_migrate_precopy_tcp_multifd_zlib_start(QTestState *from, + QTestState *to) + { ++ /* ++ * Overloading this test to also check that set_parameter does not error. ++ * This is also done in the tests for the other compression methods. 
++ */ ++ migrate_set_parameter_int(from, "multifd-zlib-level", 2); ++ migrate_set_parameter_int(to, "multifd-zlib-level", 2); ++ + return test_migrate_precopy_tcp_multifd_start_common(from, to, "zlib"); + } + +@@ -2568,6 +2575,9 @@ static void * + test_migrate_precopy_tcp_multifd_zstd_start(QTestState *from, + QTestState *to) + { ++ migrate_set_parameter_int(from, "multifd-zstd-level", 2); ++ migrate_set_parameter_int(to, "multifd-zstd-level", 2); ++ + return test_migrate_precopy_tcp_multifd_start_common(from, to, "zstd"); + } + #endif /* CONFIG_ZSTD */ +-- +2.43.0 + diff --git a/0433-docs-migration-add-qatzip-compression-feature.patch b/0433-docs-migration-add-qatzip-compression-feature.patch new file mode 100644 index 0000000..9930710 --- /dev/null +++ b/0433-docs-migration-add-qatzip-compression-feature.patch @@ -0,0 +1,212 @@ +From 2b1758770aa46a49baf9338f475c78c295a48bdc Mon Sep 17 00:00:00 2001 +From: Yuan Liu +Date: Fri, 30 Aug 2024 16:27:18 -0700 +Subject: [PATCH] docs/migration: add qatzip compression feature + +commit 85da4cbe6e5eb6ba6f31c8b30ee4582625546da7 upstream. + +add Intel QATzip compression method introduction + +Intel-SIG: commit 85da4cbe6e5e docs/migration: add qatzip compression feature + +Reviewed-by: Nanhai Zou +Reviewed-by: Peter Xu +Reviewed-by: Fabiano Rosas +Signed-off-by: Yuan Liu +Signed-off-by: Yichen Wang +Link: https://lore.kernel.org/r/20240830232722.58272-2-yichen.wang@bytedance.com +Signed-off-by: Peter Xu + + Conflicts: + docs/devel/migration/features.rst +[jz: resolve context conflict due to uadk which is not backported] +Signed-off-by: Jason Zeng +--- + docs/devel/migration/features.rst | 1 + + docs/devel/migration/qatzip-compression.rst | 165 ++++++++++++++++++++ + 2 files changed, 166 insertions(+) + create mode 100644 docs/devel/migration/qatzip-compression.rst + +diff --git a/docs/devel/migration/features.rst b/docs/devel/migration/features.rst +index 9819393c12..9ba25882ac 100644 +--- a/docs/devel/migration/features.rst ++++ b/docs/devel/migration/features.rst +@@ -11,3 +11,4 @@ Migration has plenty of features to support different use cases. + vfio + virtio + qpl-compression ++ qatzip-compression +diff --git a/docs/devel/migration/qatzip-compression.rst b/docs/devel/migration/qatzip-compression.rst +new file mode 100644 +index 0000000000..862b383164 +--- /dev/null ++++ b/docs/devel/migration/qatzip-compression.rst +@@ -0,0 +1,165 @@ ++================== ++QATzip Compression ++================== ++In scenarios with limited network bandwidth, the ``QATzip`` solution can help ++users save a lot of host CPU resources by accelerating compression and ++decompression through the Intel QuickAssist Technology(``QAT``) hardware. ++ ++ ++The following test was conducted using 8 multifd channels and 10Gbps network ++bandwidth. The results show that, compared to zstd, ``QATzip`` significantly ++saves CPU resources on the sender and reduces migration time. Compared to the ++uncompressed solution, ``QATzip`` greatly improves the dirty page processing ++capability, indicated by the Pages per Second metric, and also reduces the ++total migration time. ++ ++:: ++ ++ VM Configuration: 16 vCPU and 64G memory ++ VM Workload: all vCPUs are idle and 54G memory is filled with Silesia data. 
++ QAT Devices: 4 ++ |-----------|--------|---------|----------|----------|------|------| ++ |8 Channels |Total |down |throughput|pages per | send | recv | ++ | |time(ms)|time(ms) |(mbps) |second | cpu %| cpu% | ++ |-----------|--------|---------|----------|----------|------|------| ++ |qatzip | 16630| 28| 10467| 2940235| 160| 360| ++ |-----------|--------|---------|----------|----------|------|------| ++ |zstd | 20165| 24| 8579| 2391465| 810| 340| ++ |-----------|--------|---------|----------|----------|------|------| ++ |none | 46063| 40| 10848| 330240| 45| 85| ++ |-----------|--------|---------|----------|----------|------|------| ++ ++ ++QATzip Compression Framework ++============================ ++ ++``QATzip`` is a user space library which builds on top of the Intel QuickAssist ++Technology to provide extended accelerated compression and decompression ++services. ++ ++For more ``QATzip`` introduction, please refer to `QATzip Introduction ++`_ ++ ++:: ++ ++ +----------------+ ++ | MultiFd Thread | ++ +-------+--------+ ++ | ++ | compress/decompress ++ +-------+--------+ ++ | QATzip library | ++ +-------+--------+ ++ | ++ +-------+--------+ ++ | QAT library | ++ +-------+--------+ ++ | user space ++ --------+--------------------- ++ | kernel space ++ +------+-------+ ++ | QAT Driver | ++ +------+-------+ ++ | ++ +------+-------+ ++ | QAT Devices | ++ +--------------+ ++ ++ ++QATzip Installation ++------------------- ++ ++The ``QATzip`` installation package has been integrated into some Linux ++distributions and can be installed directly. For example, the Ubuntu Server ++24.04 LTS system can be installed using below command ++ ++.. code-block:: shell ++ ++ #apt search qatzip ++ libqatzip-dev/noble 1.2.0-0ubuntu3 amd64 ++ Intel QuickAssist user space library development files ++ ++ libqatzip3/noble 1.2.0-0ubuntu3 amd64 ++ Intel QuickAssist user space library ++ ++ qatzip/noble,now 1.2.0-0ubuntu3 amd64 [installed] ++ Compression user-space tool for Intel QuickAssist Technology ++ ++ #sudo apt install libqatzip-dev libqatzip3 qatzip ++ ++If your system does not support the ``QATzip`` installation package, you can ++use the source code to build and install, please refer to `QATzip source code installation ++`_ ++ ++QAT Hardware Deployment ++----------------------- ++ ++``QAT`` supports physical functions(PFs) and virtual functions(VFs) for ++deployment, and users can configure ``QAT`` resources for migration according ++to actual needs. For more details about ``QAT`` deployment, please refer to ++`Intel QuickAssist Technology Documentation ++`_ ++ ++For more ``QAT`` hardware introduction, please refer to `intel-quick-assist-technology-overview ++`_ ++ ++How To Use QATzip Compression ++============================= ++ ++1 - Install ``QATzip`` library ++ ++2 - Build ``QEMU`` with ``--enable-qatzip`` parameter ++ ++ E.g. configure --target-list=x86_64-softmmu --enable-kvm ``--enable-qatzip`` ++ ++3 - Set ``migrate_set_parameter multifd-compression qatzip`` ++ ++4 - Set ``migrate_set_parameter multifd-qatzip-level comp_level``, the default ++comp_level value is 1, and it supports levels from 1 to 9 ++ ++QAT Memory Requirements ++======================= ++ ++The user needs to reserve system memory for the QAT memory management to ++allocate DMA memory. The size of the reserved system memory depends on the ++number of devices used for migration and the number of multifd channels. 
++ ++Because memory usage depends on QAT configuration, please refer to `QAT Memory ++Driver Queries ++`_ ++for memory usage calculation. ++ ++.. list-table:: An example of a PF used for migration ++ :header-rows: 1 ++ ++ * - Number of channels ++ - Sender memory usage ++ - Receiver memory usage ++ * - 2 ++ - 10M ++ - 10M ++ * - 4 ++ - 12M ++ - 14M ++ * - 8 ++ - 16M ++ - 20M ++ ++How To Choose Between QATzip and QPL ++==================================== ++Starting from 4th Gen Intel Xeon Scalable processors, codenamed Sapphire Rapids ++processor(``SPR``), multiple built-in accelerators are supported including ++``QAT`` and ``IAA``. The former can accelerate ``QATzip`` and the latter is ++used to accelerate ``QPL``. ++ ++Here are some suggestions: ++ ++1 - If the live migration scenario is limited by network bandwidth and ``QAT`` ++hardware resources exceed ``IAA``, use the ``QATzip`` method, which can save a ++lot of host CPU resources for compression. ++ ++2 - If the system cannot support shared virtual memory (SVM) technology, use ++the ``QATzip`` method because ``QPL`` performance is not good without SVM ++support. ++ ++3 - For other scenarios, use the ``QPL`` method first. +-- +2.43.0 + diff --git a/0434-meson-introduce-qatzip-feature-to-the-build-system.patch b/0434-meson-introduce-qatzip-feature-to-the-build-system.patch new file mode 100644 index 0000000..04f0ab6 --- /dev/null +++ b/0434-meson-introduce-qatzip-feature-to-the-build-system.patch @@ -0,0 +1,102 @@ +From 7d0bc6fde3f62d698109017bc3675eeaa18120dd Mon Sep 17 00:00:00 2001 +From: Bryan Zhang +Date: Fri, 30 Aug 2024 16:27:19 -0700 +Subject: [PATCH] meson: Introduce 'qatzip' feature to the build system + +commit e28ed313c268aeb4e0cefb66dcd215c30e4443fe upstream. + +Add a 'qatzip' feature, which is automatically disabled, and which +depends on the QATzip library if enabled. 
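+
+As a usage sketch (the build directory name here is illustrative), the
+switch can be driven either through the configure wrapper or through
+meson directly:
+
+    ./configure --enable-qatzip
+    meson configure build -Dqatzip=enabled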
+ +Intel-SIG: commit e28ed313c268 meson: Introduce 'qatzip' feature to the build system + +Reviewed-by: Fabiano Rosas +Signed-off-by: Bryan Zhang +Signed-off-by: Hao Xiang +Signed-off-by: Yichen Wang +Link: https://lore.kernel.org/r/20240830232722.58272-3-yichen.wang@bytedance.com +Signed-off-by: Peter Xu + + Conflicts: + meson.build + meson_options.txt + scripts/meson-buildoptions.sh +[jz: resolve simple context conflicts] +Signed-off-by: Jason Zeng +--- + meson.build | 9 +++++++++ + meson_options.txt | 2 ++ + scripts/meson-buildoptions.sh | 3 +++ + 3 files changed, 14 insertions(+) + +diff --git a/meson.build b/meson.build +index c833cd2d47..273a894147 100644 +--- a/meson.build ++++ b/meson.build +@@ -1057,6 +1057,13 @@ if not get_option('qpl').auto() or have_system + required: get_option('qpl'), + method: 'pkg-config') + endif ++qatzip = not_found ++if not get_option('qatzip').auto() or have_system ++ qatzip = dependency('qatzip', version: '>=1.1.2', ++ required: get_option('qatzip'), ++ method: 'pkg-config') ++endif ++ + virgl = not_found + + have_vhost_user_gpu = have_tools and targetos == 'linux' and pixman.found() +@@ -2225,6 +2232,7 @@ config_host_data.set('CONFIG_STATX', has_statx) + config_host_data.set('CONFIG_STATX_MNT_ID', has_statx_mnt_id) + config_host_data.set('CONFIG_ZSTD', zstd.found()) + config_host_data.set('CONFIG_QPL', qpl.found()) ++config_host_data.set('CONFIG_QATZIP', qatzip.found()) + config_host_data.set('CONFIG_FUSE', fuse.found()) + config_host_data.set('CONFIG_FUSE_LSEEK', fuse_lseek.found()) + config_host_data.set('CONFIG_SPICE_PROTOCOL', spice_protocol.found()) +@@ -4401,6 +4409,7 @@ summary_info += {'bzip2 support': libbzip2} + summary_info += {'lzfse support': liblzfse} + summary_info += {'zstd support': zstd} + summary_info += {'Query Processing Library support': qpl} ++summary_info += {'qatzip support': qatzip} + summary_info += {'NUMA host support': numa} + summary_info += {'capstone': capstone} + summary_info += {'libpmem support': libpmem} +diff --git a/meson_options.txt b/meson_options.txt +index e16aa8c823..6a2d8351fd 100644 +--- a/meson_options.txt ++++ b/meson_options.txt +@@ -261,6 +261,8 @@ option('zstd', type : 'feature', value : 'auto', + description: 'zstd compression support') + option('qpl', type : 'feature', value : 'auto', + description: 'Query Processing Library support') ++option('qatzip', type: 'feature', value: 'auto', ++ description: 'QATzip compression support') + option('fuse', type: 'feature', value: 'auto', + description: 'FUSE block device export') + option('fuse_lseek', type : 'feature', value : 'auto', +diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh +index 784f74fde9..1f04c31bd0 100644 +--- a/scripts/meson-buildoptions.sh ++++ b/scripts/meson-buildoptions.sh +@@ -164,6 +164,7 @@ meson_options_help() { + printf "%s\n" ' plugins TCG plugins via shared library loading' + printf "%s\n" ' png PNG support with libpng' + printf "%s\n" ' pvrdma Enable PVRDMA support' ++ printf "%s\n" ' qatzip QATzip compression support' + printf "%s\n" ' qcow1 qcow1 image format support' + printf "%s\n" ' qed qed image format support' + printf "%s\n" ' qga-vss build QGA VSS support (broken with MinGW)' +@@ -431,6 +432,8 @@ _meson_option_parse() { + --prefix=*) quote_sh "-Dprefix=$2" ;; + --enable-pvrdma) printf "%s" -Dpvrdma=enabled ;; + --disable-pvrdma) printf "%s" -Dpvrdma=disabled ;; ++ --enable-qatzip) printf "%s" -Dqatzip=enabled ;; ++ --disable-qatzip) printf "%s" -Dqatzip=disabled ;; + --enable-qcow1) printf "%s" 
-Dqcow1=enabled ;; + --disable-qcow1) printf "%s" -Dqcow1=disabled ;; + --enable-qed) printf "%s" -Dqed=enabled ;; +-- +2.43.0 + diff --git a/0435-migration-add-migration-parameters-for-qatzip.patch b/0435-migration-add-migration-parameters-for-qatzip.patch new file mode 100644 index 0000000..ab5b3b6 --- /dev/null +++ b/0435-migration-add-migration-parameters-for-qatzip.patch @@ -0,0 +1,216 @@ +From 20389c52f40c4dab0f59072f7caef05cefe8a9d9 Mon Sep 17 00:00:00 2001 +From: Bryan Zhang +Date: Fri, 30 Aug 2024 16:27:20 -0700 +Subject: [PATCH] migration: Add migration parameters for QATzip + +commit 86c6eb1f39cbb7eb0467c114469e98ef699fb515 upstream. + +Adds support for migration parameters to control QATzip compression +level. + +Intel-SIG: commit 86c6eb1f39cb migration: Add migration parameters for QATzip + +Acked-by: Markus Armbruster +Signed-off-by: Bryan Zhang +Signed-off-by: Hao Xiang +Signed-off-by: Yichen Wang +Reviewed-by: Fabiano Rosas +Reviewed-by: Prasad Pandit +Link: https://lore.kernel.org/r/20240830232722.58272-4-yichen.wang@bytedance.com +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + migration/migration-hmp-cmds.c | 4 ++++ + migration/options.c | 34 ++++++++++++++++++++++++++++++++++ + migration/options.h | 1 + + qapi/migration.json | 18 ++++++++++++++++++ + 4 files changed, 57 insertions(+) + +diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c +index 16070c5eb3..1e14adb9e1 100644 +--- a/migration/migration-hmp-cmds.c ++++ b/migration/migration-hmp-cmds.c +@@ -652,6 +652,10 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) + p->has_multifd_zlib_level = true; + visit_type_uint8(v, param, &p->multifd_zlib_level, &err); + break; ++ case MIGRATION_PARAMETER_MULTIFD_QATZIP_LEVEL: ++ p->has_multifd_qatzip_level = true; ++ visit_type_uint8(v, param, &p->multifd_qatzip_level, &err); ++ break; + case MIGRATION_PARAMETER_MULTIFD_ZSTD_LEVEL: + p->has_multifd_zstd_level = true; + visit_type_uint8(v, param, &p->multifd_zstd_level, &err); +diff --git a/migration/options.c b/migration/options.c +index 68ff81b885..39ba3e0c0f 100644 +--- a/migration/options.c ++++ b/migration/options.c +@@ -62,6 +62,13 @@ + #define DEFAULT_MIGRATE_MULTIFD_COMPRESSION MULTIFD_COMPRESSION_NONE + /* 0: means nocompress, 1: best speed, ... 9: best compress ratio */ + #define DEFAULT_MIGRATE_MULTIFD_ZLIB_LEVEL 1 ++/* ++ * 1: best speed, ... 9: best compress ratio ++ * There is some nuance here. Refer to QATzip documentation to understand ++ * the mapping of QATzip levels to standard deflate levels. ++ */ ++#define DEFAULT_MIGRATE_MULTIFD_QATZIP_LEVEL 1 ++ + /* 0: means nocompress, 1: best speed, ... 
20: best compress ratio */ + #define DEFAULT_MIGRATE_MULTIFD_ZSTD_LEVEL 1 + +@@ -143,6 +150,9 @@ Property migration_properties[] = { + DEFINE_PROP_UINT8("multifd-zlib-level", MigrationState, + parameters.multifd_zlib_level, + DEFAULT_MIGRATE_MULTIFD_ZLIB_LEVEL), ++ DEFINE_PROP_UINT8("multifd-qatzip-level", MigrationState, ++ parameters.multifd_qatzip_level, ++ DEFAULT_MIGRATE_MULTIFD_QATZIP_LEVEL), + DEFINE_PROP_UINT8("multifd-zstd-level", MigrationState, + parameters.multifd_zstd_level, + DEFAULT_MIGRATE_MULTIFD_ZSTD_LEVEL), +@@ -865,6 +875,13 @@ int migrate_multifd_zlib_level(void) + return s->parameters.multifd_zlib_level; + } + ++int migrate_multifd_qatzip_level(void) ++{ ++ MigrationState *s = migrate_get_current(); ++ ++ return s->parameters.multifd_qatzip_level; ++} ++ + int migrate_multifd_zstd_level(void) + { + MigrationState *s = migrate_get_current(); +@@ -994,6 +1011,8 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp) + params->multifd_compression = s->parameters.multifd_compression; + params->has_multifd_zlib_level = true; + params->multifd_zlib_level = s->parameters.multifd_zlib_level; ++ params->has_multifd_qatzip_level = true; ++ params->multifd_qatzip_level = s->parameters.multifd_qatzip_level; + params->has_multifd_zstd_level = true; + params->multifd_zstd_level = s->parameters.multifd_zstd_level; + params->has_xbzrle_cache_size = true; +@@ -1054,6 +1073,7 @@ void migrate_params_init(MigrationParameters *params) + params->has_multifd_channels = true; + params->has_multifd_compression = true; + params->has_multifd_zlib_level = true; ++ params->has_multifd_qatzip_level = true; + params->has_multifd_zstd_level = true; + params->has_xbzrle_cache_size = true; + params->has_max_postcopy_bandwidth = true; +@@ -1168,6 +1188,14 @@ bool migrate_params_check(MigrationParameters *params, Error **errp) + return false; + } + ++ if (params->has_multifd_qatzip_level && ++ ((params->multifd_qatzip_level > 9) || ++ (params->multifd_qatzip_level < 1))) { ++ error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "multifd_qatzip_level", ++ "a value between 1 and 9"); ++ return false; ++ } ++ + if (params->has_multifd_zstd_level && + (params->multifd_zstd_level > 20)) { + error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "multifd_zstd_level", +@@ -1333,6 +1361,9 @@ static void migrate_params_test_apply(MigrateSetParameters *params, + if (params->has_multifd_compression) { + dest->multifd_compression = params->multifd_compression; + } ++ if (params->has_multifd_qatzip_level) { ++ dest->multifd_qatzip_level = params->multifd_qatzip_level; ++ } + if (params->has_multifd_zlib_level) { + dest->multifd_zlib_level = params->multifd_zlib_level; + } +@@ -1491,6 +1522,9 @@ static void migrate_params_apply(MigrateSetParameters *params, Error **errp) + if (params->has_multifd_compression) { + s->parameters.multifd_compression = params->multifd_compression; + } ++ if (params->has_multifd_qatzip_level) { ++ s->parameters.multifd_qatzip_level = params->multifd_qatzip_level; ++ } + if (params->has_multifd_zlib_level) { + s->parameters.multifd_zlib_level = params->multifd_zlib_level; + } +diff --git a/migration/options.h b/migration/options.h +index b7c4fb3861..1f1629a04e 100644 +--- a/migration/options.h ++++ b/migration/options.h +@@ -87,6 +87,7 @@ MigMode migrate_mode(void); + int migrate_multifd_channels(void); + MultiFDCompression migrate_multifd_compression(void); + int migrate_multifd_zlib_level(void); ++int migrate_multifd_qatzip_level(void); + int migrate_multifd_zstd_level(void); + uint8_t 
migrate_throttle_trigger_threshold(void); + const char *migrate_tls_authz(void); +diff --git a/qapi/migration.json b/qapi/migration.json +index 720988007d..6e7c03aa60 100644 +--- a/qapi/migration.json ++++ b/qapi/migration.json +@@ -865,6 +865,11 @@ + # speed, and 9 means best compression ratio which will consume + # more CPU. Defaults to 1. (Since 5.0) + # ++# @multifd-qatzip-level: Set the compression level to be used in live ++# migration. The level is an integer between 1 and 9, where 1 means ++# the best compression speed, and 9 means the best compression ++# ratio which will consume more CPU. Defaults to 1. (Since 9.2) ++# + # @multifd-zstd-level: Set the compression level to be used in live + # migration, the compression level is an integer between 0 and 20, + # where 0 means no compression, 1 means the best compression +@@ -942,6 +947,7 @@ + 'xbzrle-cache-size', 'max-postcopy-bandwidth', + 'max-cpu-throttle', 'multifd-compression', + 'multifd-zlib-level', 'multifd-zstd-level', ++ 'multifd-qatzip-level', + 'block-bitmap-mapping', + { 'name': 'x-vcpu-dirty-limit-period', 'features': ['unstable'] }, + 'vcpu-dirty-limit', +@@ -1070,6 +1076,11 @@ + # speed, and 9 means best compression ratio which will consume + # more CPU. Defaults to 1. (Since 5.0) + # ++# @multifd-qatzip-level: Set the compression level to be used in live ++# migration. The level is an integer between 1 and 9, where 1 means ++# the best compression speed, and 9 means the best compression ++# ratio which will consume more CPU. Defaults to 1. (Since 9.2) ++# + # @multifd-zstd-level: Set the compression level to be used in live + # migration, the compression level is an integer between 0 and 20, + # where 0 means no compression, 1 means the best compression +@@ -1165,6 +1176,7 @@ + '*max-cpu-throttle': 'uint8', + '*multifd-compression': 'MultiFDCompression', + '*multifd-zlib-level': 'uint8', ++ '*multifd-qatzip-level': 'uint8', + '*multifd-zstd-level': 'uint8', + '*block-bitmap-mapping': [ 'BitmapMigrationNodeAlias' ], + '*x-vcpu-dirty-limit-period': { 'type': 'uint64', +@@ -1317,6 +1329,11 @@ + # speed, and 9 means best compression ratio which will consume + # more CPU. Defaults to 1. (Since 5.0) + # ++# @multifd-qatzip-level: Set the compression level to be used in live ++# migration. The level is an integer between 1 and 9, where 1 means ++# the best compression speed, and 9 means the best compression ++# ratio which will consume more CPU. Defaults to 1. (Since 9.2) ++# + # @multifd-zstd-level: Set the compression level to be used in live + # migration, the compression level is an integer between 0 and 20, + # where 0 means no compression, 1 means the best compression +@@ -1409,6 +1426,7 @@ + '*max-cpu-throttle': 'uint8', + '*multifd-compression': 'MultiFDCompression', + '*multifd-zlib-level': 'uint8', ++ '*multifd-qatzip-level': 'uint8', + '*multifd-zstd-level': 'uint8', + '*block-bitmap-mapping': [ 'BitmapMigrationNodeAlias' ], + '*x-vcpu-dirty-limit-period': { 'type': 'uint64', +-- +2.43.0 + diff --git a/0436-migration-introduce-qatzip-compression-method.patch b/0436-migration-introduce-qatzip-compression-method.patch new file mode 100644 index 0000000..811140e --- /dev/null +++ b/0436-migration-introduce-qatzip-compression-method.patch @@ -0,0 +1,508 @@ +From cb317b924b30ac44db572273620ceb9cd451c2fc Mon Sep 17 00:00:00 2001 +From: Bryan Zhang +Date: Fri, 30 Aug 2024 16:27:21 -0700 +Subject: [PATCH] migration: Introduce 'qatzip' compression method + +commit 80484f945989988091c5cd729c3e8bde6c14907a upstream. 
+ +Adds support for 'qatzip' as an option for the multifd compression +method parameter, and implements using QAT for 'qatzip' compression and +decompression. + +Intel-SIG: commit 80484f945989 migration: Introduce 'qatzip' compression method + +Acked-by: Markus Armbruster +Reviewed-by: Fabiano Rosas +Reviewed-by: Prasad Pandit +Signed-off-by: Bryan Zhang +Signed-off-by: Hao Xiang +Signed-off-by: Yichen Wang +Link: https://lore.kernel.org/r/20240830232722.58272-5-yichen.wang@bytedance.com +Signed-off-by: Peter Xu + + Conflicts: + hw/core/qdev-properties-system.c + migration/meson.build + migration/multifd.h + qapi/migration.json +[jz: resolve context conflicts due to uadk which is not backported] +Signed-off-by: Jason Zeng +--- + hw/core/qdev-properties-system.c | 2 +- + migration/meson.build | 1 + + migration/multifd-qatzip.c | 394 +++++++++++++++++++++++++++++++ + migration/multifd.h | 5 +- + qapi/migration.json | 3 + + 5 files changed, 402 insertions(+), 3 deletions(-) + create mode 100644 migration/multifd-qatzip.c + +diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-system.c +index 523b3a9bae..25ee188df7 100644 +--- a/hw/core/qdev-properties-system.c ++++ b/hw/core/qdev-properties-system.c +@@ -666,7 +666,7 @@ const PropertyInfo qdev_prop_fdc_drive_type = { + const PropertyInfo qdev_prop_multifd_compression = { + .name = "MultiFDCompression", + .description = "multifd_compression values, " +- "none/zlib/zstd/qpl", ++ "none/zlib/zstd/qpl/qatzip", + .enum_table = &MultiFDCompression_lookup, + .get = qdev_propinfo_get_enum, + .set = qdev_propinfo_set_enum, +diff --git a/migration/meson.build b/migration/meson.build +index cb177de1d2..e3a0b89651 100644 +--- a/migration/meson.build ++++ b/migration/meson.build +@@ -42,6 +42,7 @@ if get_option('live_block_migration').allowed() + endif + system_ss.add(when: zstd, if_true: files('multifd-zstd.c')) + system_ss.add(when: qpl, if_true: files('multifd-qpl.c')) ++system_ss.add(when: qatzip, if_true: files('multifd-qatzip.c')) + + specific_ss.add(when: 'CONFIG_SYSTEM_ONLY', + if_true: files('ram.c', +diff --git a/migration/multifd-qatzip.c b/migration/multifd-qatzip.c +new file mode 100644 +index 0000000000..3c787ed879 +--- /dev/null ++++ b/migration/multifd-qatzip.c +@@ -0,0 +1,394 @@ ++/* ++ * Multifd QATzip compression implementation ++ * ++ * Copyright (c) Bytedance ++ * ++ * Authors: ++ * Bryan Zhang ++ * Hao Xiang ++ * Yichen Wang ++ * ++ * This work is licensed under the terms of the GNU GPL, version 2 or later. ++ * See the COPYING file in the top-level directory. ++ */ ++ ++#include "qemu/osdep.h" ++#include "exec/ramblock.h" ++#include "qapi/error.h" ++#include "qemu/error-report.h" ++#include "qapi/qapi-types-migration.h" ++#include "options.h" ++#include "multifd.h" ++#include ++ ++typedef struct { ++ /* ++ * Unique session for use with QATzip API ++ */ ++ QzSession_T sess; ++ ++ /* ++ * For compression: Buffer for pages to compress ++ * For decompression: Buffer for data to decompress ++ */ ++ uint8_t *in_buf; ++ uint32_t in_len; ++ ++ /* ++ * For compression: Output buffer of compressed data ++ * For decompression: Output buffer of decompressed data ++ */ ++ uint8_t *out_buf; ++ uint32_t out_len; ++} QatzipData; ++ ++/** ++ * qatzip_send_setup: Set up QATzip session and private buffers. 
++ * ++ * @param p Multifd channel params ++ * @param errp Pointer to error, which will be set in case of error ++ * @return 0 on success, -1 on error (and *errp will be set) ++ */ ++static int qatzip_send_setup(MultiFDSendParams *p, Error **errp) ++{ ++ QatzipData *q; ++ QzSessionParamsDeflate_T params; ++ const char *err_msg; ++ int ret; ++ ++ q = g_new0(QatzipData, 1); ++ p->compress_data = q; ++ /* We need one extra place for the packet header */ ++ p->iov = g_new0(struct iovec, 2); ++ ++ /* ++ * Initialize QAT device with software fallback by default. This allows ++ * QATzip to use CPU path when QAT hardware reaches maximum throughput. ++ */ ++ ret = qzInit(&q->sess, true); ++ if (ret != QZ_OK && ret != QZ_DUPLICATE) { ++ err_msg = "qzInit failed"; ++ goto err; ++ } ++ ++ ret = qzGetDefaultsDeflate(¶ms); ++ if (ret != QZ_OK) { ++ err_msg = "qzGetDefaultsDeflate failed"; ++ goto err; ++ } ++ ++ /* Make sure to use configured QATzip compression level. */ ++ params.common_params.comp_lvl = migrate_multifd_qatzip_level(); ++ ret = qzSetupSessionDeflate(&q->sess, ¶ms); ++ if (ret != QZ_OK && ret != QZ_DUPLICATE) { ++ err_msg = "qzSetupSessionDeflate failed"; ++ goto err; ++ } ++ ++ if (MULTIFD_PACKET_SIZE > UINT32_MAX) { ++ err_msg = "packet size too large for QAT"; ++ goto err; ++ } ++ ++ q->in_len = MULTIFD_PACKET_SIZE; ++ /* ++ * PINNED_MEM is an enum from qatzip headers, which means to use ++ * kzalloc_node() to allocate memory for QAT DMA purposes. When QAT device ++ * is not available or software fallback is used, the malloc flag needs to ++ * be set as COMMON_MEM. ++ */ ++ q->in_buf = qzMalloc(q->in_len, 0, PINNED_MEM); ++ if (!q->in_buf) { ++ q->in_buf = qzMalloc(q->in_len, 0, COMMON_MEM); ++ if (!q->in_buf) { ++ err_msg = "qzMalloc failed"; ++ goto err; ++ } ++ } ++ ++ q->out_len = qzMaxCompressedLength(MULTIFD_PACKET_SIZE, &q->sess); ++ q->out_buf = qzMalloc(q->out_len, 0, PINNED_MEM); ++ if (!q->out_buf) { ++ q->out_buf = qzMalloc(q->out_len, 0, COMMON_MEM); ++ if (!q->out_buf) { ++ err_msg = "qzMalloc failed"; ++ goto err; ++ } ++ } ++ ++ return 0; ++ ++err: ++ error_setg(errp, "multifd %u: [sender] %s", p->id, err_msg); ++ return -1; ++} ++ ++/** ++ * qatzip_send_cleanup: Tear down QATzip session and release private buffers. ++ * ++ * @param p Multifd channel params ++ * @param errp Pointer to error, which will be set in case of error ++ * @return None ++ */ ++static void qatzip_send_cleanup(MultiFDSendParams *p, Error **errp) ++{ ++ QatzipData *q = p->compress_data; ++ ++ if (q) { ++ if (q->in_buf) { ++ qzFree(q->in_buf); ++ } ++ if (q->out_buf) { ++ qzFree(q->out_buf); ++ } ++ (void)qzTeardownSession(&q->sess); ++ (void)qzClose(&q->sess); ++ g_free(q); ++ } ++ ++ g_free(p->iov); ++ p->iov = NULL; ++ p->compress_data = NULL; ++} ++ ++/** ++ * qatzip_send_prepare: Compress pages and update IO channel info. ++ * ++ * @param p Multifd channel params ++ * @param errp Pointer to error, which will be set in case of error ++ * @return 0 on success, -1 on error (and *errp will be set) ++ */ ++static int qatzip_send_prepare(MultiFDSendParams *p, Error **errp) ++{ ++ MultiFDPages_t *pages = p->pages; ++ QatzipData *q = p->compress_data; ++ int ret; ++ unsigned int in_len, out_len; ++ ++ if (!multifd_send_prepare_common(p)) { ++ goto out; ++ } ++ ++ /* ++ * Unlike other multifd compression implementations, we use a non-streaming ++ * API and place all the data into one buffer, rather than sending each ++ * page to the compression API at a time. 
Based on initial benchmarks, the ++ * non-streaming API outperforms the streaming API. Plus, the logic in QEMU ++ * is friendly to using the non-streaming API anyway. If either of these ++ * statements becomes no longer true, we can revisit adding a streaming ++ * implementation. ++ */ ++ for (int i = 0; i < pages->normal_num; i++) { ++ memcpy(q->in_buf + (i * p->page_size), ++ pages->block->host + pages->offset[i], ++ p->page_size); ++ } ++ ++ in_len = pages->normal_num * p->page_size; ++ if (in_len > q->in_len) { ++ error_setg(errp, "multifd %u: unexpectedly large input", p->id); ++ return -1; ++ } ++ out_len = q->out_len; ++ ++ ret = qzCompress(&q->sess, q->in_buf, &in_len, q->out_buf, &out_len, 1); ++ if (ret != QZ_OK) { ++ error_setg(errp, "multifd %u: QATzip returned %d instead of QZ_OK", ++ p->id, ret); ++ return -1; ++ } ++ if (in_len != pages->normal_num * p->page_size) { ++ error_setg(errp, "multifd %u: QATzip failed to compress all input", ++ p->id); ++ return -1; ++ } ++ ++ p->iov[p->iovs_num].iov_base = q->out_buf; ++ p->iov[p->iovs_num].iov_len = out_len; ++ p->iovs_num++; ++ p->next_packet_size = out_len; ++ ++out: ++ p->flags |= MULTIFD_FLAG_QATZIP; ++ multifd_send_fill_packet(p); ++ return 0; ++} ++ ++/** ++ * qatzip_recv_setup: Set up QATzip session and allocate private buffers. ++ * ++ * @param p Multifd channel params ++ * @param errp Pointer to error, which will be set in case of error ++ * @return 0 on success, -1 on error (and *errp will be set) ++ */ ++static int qatzip_recv_setup(MultiFDRecvParams *p, Error **errp) ++{ ++ QatzipData *q; ++ QzSessionParamsDeflate_T params; ++ const char *err_msg; ++ int ret; ++ ++ q = g_new0(QatzipData, 1); ++ p->compress_data = q; ++ ++ /* ++ * Initialize QAT device with software fallback by default. This allows ++ * QATzip to use CPU path when QAT hardware reaches maximum throughput. ++ */ ++ ret = qzInit(&q->sess, true); ++ if (ret != QZ_OK && ret != QZ_DUPLICATE) { ++ err_msg = "qzInit failed"; ++ goto err; ++ } ++ ++ ret = qzGetDefaultsDeflate(¶ms); ++ if (ret != QZ_OK) { ++ err_msg = "qzGetDefaultsDeflate failed"; ++ goto err; ++ } ++ ++ ret = qzSetupSessionDeflate(&q->sess, ¶ms); ++ if (ret != QZ_OK && ret != QZ_DUPLICATE) { ++ err_msg = "qzSetupSessionDeflate failed"; ++ goto err; ++ } ++ ++ /* ++ * Reserve extra spaces for the incoming packets. Current implementation ++ * doesn't send uncompressed pages in case the compression gets too big. ++ */ ++ q->in_len = MULTIFD_PACKET_SIZE * 2; ++ /* ++ * PINNED_MEM is an enum from qatzip headers, which means to use ++ * kzalloc_node() to allocate memory for QAT DMA purposes. When QAT device ++ * is not available or software fallback is used, the malloc flag needs to ++ * be set as COMMON_MEM. ++ */ ++ q->in_buf = qzMalloc(q->in_len, 0, PINNED_MEM); ++ if (!q->in_buf) { ++ q->in_buf = qzMalloc(q->in_len, 0, COMMON_MEM); ++ if (!q->in_buf) { ++ err_msg = "qzMalloc failed"; ++ goto err; ++ } ++ } ++ ++ q->out_len = MULTIFD_PACKET_SIZE; ++ q->out_buf = qzMalloc(q->out_len, 0, PINNED_MEM); ++ if (!q->out_buf) { ++ q->out_buf = qzMalloc(q->out_len, 0, COMMON_MEM); ++ if (!q->out_buf) { ++ err_msg = "qzMalloc failed"; ++ goto err; ++ } ++ } ++ ++ return 0; ++ ++err: ++ error_setg(errp, "multifd %u: [receiver] %s", p->id, err_msg); ++ return -1; ++} ++ ++/** ++ * qatzip_recv_cleanup: Tear down QATzip session and release private buffers. 
++ * ++ * @param p Multifd channel params ++ * @return None ++ */ ++static void qatzip_recv_cleanup(MultiFDRecvParams *p) ++{ ++ QatzipData *q = p->compress_data; ++ ++ if (q) { ++ if (q->in_buf) { ++ qzFree(q->in_buf); ++ } ++ if (q->out_buf) { ++ qzFree(q->out_buf); ++ } ++ (void)qzTeardownSession(&q->sess); ++ (void)qzClose(&q->sess); ++ g_free(q); ++ } ++ p->compress_data = NULL; ++} ++ ++ ++/** ++ * qatzip_recv: Decompress pages and copy them to the appropriate ++ * locations. ++ * ++ * @param p Multifd channel params ++ * @param errp Pointer to error, which will be set in case of error ++ * @return 0 on success, -1 on error (and *errp will be set) ++ */ ++static int qatzip_recv(MultiFDRecvParams *p, Error **errp) ++{ ++ QatzipData *q = p->compress_data; ++ int ret; ++ unsigned int in_len, out_len; ++ uint32_t in_size = p->next_packet_size; ++ uint32_t expected_size = p->normal_num * p->page_size; ++ uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; ++ ++ if (in_size > q->in_len) { ++ error_setg(errp, "multifd %u: received unexpectedly large packet", ++ p->id); ++ return -1; ++ } ++ ++ if (flags != MULTIFD_FLAG_QATZIP) { ++ error_setg(errp, "multifd %u: flags received %x flags expected %x", ++ p->id, flags, MULTIFD_FLAG_QATZIP); ++ return -1; ++ } ++ ++ multifd_recv_zero_page_process(p); ++ if (!p->normal_num) { ++ assert(in_size == 0); ++ return 0; ++ } ++ ++ ret = qio_channel_read_all(p->c, (void *)q->in_buf, in_size, errp); ++ if (ret != 0) { ++ return ret; ++ } ++ ++ in_len = in_size; ++ out_len = q->out_len; ++ ret = qzDecompress(&q->sess, q->in_buf, &in_len, q->out_buf, &out_len); ++ if (ret != QZ_OK) { ++ error_setg(errp, "multifd %u: qzDecompress failed", p->id); ++ return -1; ++ } ++ if (out_len != expected_size) { ++ error_setg(errp, "multifd %u: packet size received %u size expected %u", ++ p->id, out_len, expected_size); ++ return -1; ++ } ++ ++ /* Copy each page to its appropriate location. */ ++ for (int i = 0; i < p->normal_num; i++) { ++ memcpy(p->host + p->normal[i], ++ q->out_buf + p->page_size * i, ++ p->page_size); ++ } ++ return 0; ++} ++ ++static MultiFDMethods multifd_qatzip_ops = { ++ .send_setup = qatzip_send_setup, ++ .send_cleanup = qatzip_send_cleanup, ++ .send_prepare = qatzip_send_prepare, ++ .recv_setup = qatzip_recv_setup, ++ .recv_cleanup = qatzip_recv_cleanup, ++ .recv = qatzip_recv ++}; ++ ++static void multifd_qatzip_register(void) ++{ ++ multifd_register_ops(MULTIFD_COMPRESSION_QATZIP, &multifd_qatzip_ops); ++} ++ ++migration_init(multifd_qatzip_register); +diff --git a/migration/multifd.h b/migration/multifd.h +index 41965df7a9..dedb1f1c14 100644 +--- a/migration/multifd.h ++++ b/migration/multifd.h +@@ -29,13 +29,14 @@ bool multifd_queue_page(RAMBlock *block, ram_addr_t offset); + /* Multifd Compression flags */ + #define MULTIFD_FLAG_SYNC (1 << 0) + +-/* We reserve 3 bits for compression methods */ +-#define MULTIFD_FLAG_COMPRESSION_MASK (7 << 1) ++/* We reserve 5 bits for compression methods */ ++#define MULTIFD_FLAG_COMPRESSION_MASK (0x1f << 1) + /* we need to be compatible. 
Before compression value was 0 */ + #define MULTIFD_FLAG_NOCOMP (0 << 1) + #define MULTIFD_FLAG_ZLIB (1 << 1) + #define MULTIFD_FLAG_ZSTD (2 << 1) + #define MULTIFD_FLAG_QPL (4 << 1) ++#define MULTIFD_FLAG_QATZIP (16 << 1) + + /* This value needs to be a multiple of qemu_target_page_size() */ + #define MULTIFD_PACKET_SIZE (512 * 1024) +diff --git a/qapi/migration.json b/qapi/migration.json +index 6e7c03aa60..5186755d60 100644 +--- a/qapi/migration.json ++++ b/qapi/migration.json +@@ -625,6 +625,8 @@ + # + # @zstd: use zstd compression method. + # ++# @qatzip: use qatzip compression method. (Since 9.2) ++# + # @qpl: use qpl compression method. Query Processing Library(qpl) is + # based on the deflate compression algorithm and use the Intel + # In-Memory Analytics Accelerator(IAA) accelerated compression +@@ -635,6 +637,7 @@ + { 'enum': 'MultiFDCompression', + 'data': [ 'none', 'zlib', + { 'name': 'zstd', 'if': 'CONFIG_ZSTD' }, ++ { 'name': 'qatzip', 'if': 'CONFIG_QATZIP'}, + { 'name': 'qpl', 'if': 'CONFIG_QPL' } ] } + + ## +-- +2.43.0 + diff --git a/0437-tests-migration-add-integration-test-for-qatzip-comp.patch b/0437-tests-migration-add-integration-test-for-qatzip-comp.patch new file mode 100644 index 0000000..8343dcc --- /dev/null +++ b/0437-tests-migration-add-integration-test-for-qatzip-comp.patch @@ -0,0 +1,78 @@ +From 1e2311f0146749b5109023c61a85c8f8c73319a2 Mon Sep 17 00:00:00 2001 +From: Bryan Zhang +Date: Fri, 30 Aug 2024 16:27:22 -0700 +Subject: [PATCH] tests/migration: Add integration test for 'qatzip' + compression method + +commit afe166d4e8bc33bc448cd573b55d0ac094187d48 upstream. + +Adds an integration test for 'qatzip'. + +Intel-SIG: commit afe166d4e8bc tests/migration: Add integration test for 'qatzip' compression method + +Reviewed-by: Fabiano Rosas +Signed-off-by: Bryan Zhang +Signed-off-by: Hao Xiang +Signed-off-by: Yichen Wang +Link: https://lore.kernel.org/r/20240830232722.58272-6-yichen.wang@bytedance.com +Signed-off-by: Peter Xu +Signed-off-by: Jason Zeng +--- + tests/qtest/migration-test.c | 27 +++++++++++++++++++++++++++ + 1 file changed, 27 insertions(+) + +diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c +index 10a3f99d6c..5998e7a50c 100644 +--- a/tests/qtest/migration-test.c ++++ b/tests/qtest/migration-test.c +@@ -2582,6 +2582,18 @@ test_migrate_precopy_tcp_multifd_zstd_start(QTestState *from, + } + #endif /* CONFIG_ZSTD */ + ++#ifdef CONFIG_QATZIP ++static void * ++test_migrate_precopy_tcp_multifd_qatzip_start(QTestState *from, ++ QTestState *to) ++{ ++ migrate_set_parameter_int(from, "multifd-qatzip-level", 2); ++ migrate_set_parameter_int(to, "multifd-qatzip-level", 2); ++ ++ return test_migrate_precopy_tcp_multifd_start_common(from, to, "qatzip"); ++} ++#endif ++ + #ifdef CONFIG_QPL + static void * + test_migrate_precopy_tcp_multifd_qpl_start(QTestState *from, +@@ -2626,6 +2638,17 @@ static void test_multifd_tcp_zstd(void) + } + #endif + ++#ifdef CONFIG_QATZIP ++static void test_multifd_tcp_qatzip(void) ++{ ++ MigrateCommon args = { ++ .listen_uri = "defer", ++ .start_hook = test_migrate_precopy_tcp_multifd_qatzip_start, ++ }; ++ test_precopy_common(&args); ++} ++#endif ++ + #ifdef CONFIG_QPL + static void test_multifd_tcp_qpl(void) + { +@@ -3513,6 +3536,10 @@ int main(int argc, char **argv) + migration_test_add("/migration/multifd/tcp/plain/zstd", + test_multifd_tcp_zstd); + #endif ++#ifdef CONFIG_QATZIP ++ migration_test_add("/migration/multifd/tcp/plain/qatzip", ++ test_multifd_tcp_qatzip); ++#endif + #ifdef CONFIG_QPL + 
migration_test_add("/migration/multifd/tcp/plain/qpl", + test_multifd_tcp_qpl); +-- +2.43.0 + diff --git a/0438-migration-multifd-fix-rb-receivedmap-cleanup-race.patch b/0438-migration-multifd-fix-rb-receivedmap-cleanup-race.patch new file mode 100644 index 0000000..d0f3bf9 --- /dev/null +++ b/0438-migration-multifd-fix-rb-receivedmap-cleanup-race.patch @@ -0,0 +1,97 @@ +From 0707442d8c78b3940d39c703df95fccabe5ea3da Mon Sep 17 00:00:00 2001 +From: Fabiano Rosas +Date: Tue, 17 Sep 2024 15:58:02 -0300 +Subject: [PATCH] migration/multifd: Fix rb->receivedmap cleanup race + +commit 4ce56229087860805877075ddb29dd44578365a9 upstream. + +Fix a segmentation fault in multifd when rb->receivedmap is cleared +too early. + +After commit 5ef7e26bdb ("migration/multifd: solve zero page causing +multiple page faults"), multifd started using the rb->receivedmap +bitmap, which belongs to ram.c and is initialized and *freed* from the +ram SaveVMHandlers. + +Multifd threads are live until migration_incoming_state_destroy(), +which is called after qemu_loadvm_state_cleanup(), leading to a crash +when accessing rb->receivedmap. + +process_incoming_migration_co() ... + qemu_loadvm_state() multifd_nocomp_recv() + qemu_loadvm_state_cleanup() ramblock_recv_bitmap_set_offset() + rb->receivedmap = NULL set_bit_atomic(..., rb->receivedmap) + ... + migration_incoming_state_destroy() + multifd_recv_cleanup() + multifd_recv_terminate_threads(NULL) + +Move the loadvm cleanup into migration_incoming_state_destroy(), after +multifd_recv_cleanup() to ensure multifd threads have already exited +when rb->receivedmap is cleared. + +Adjust the postcopy listen thread comment to indicate that we still +want to skip the cpu synchronization. + +Intel-SIG: commit 4ce562290878 migration/multifd: Fix rb->receivedmap cleanup race + +CC: qemu-stable@nongnu.org +Fixes: 5ef7e26bdb ("migration/multifd: solve zero page causing multiple page faults") +Signed-off-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20240917185802.15619-3-farosas@suse.de +[peterx: added comment in migration_incoming_state_destroy()] +Signed-off-by: Peter Xu + +Conflicts: + migration/migration.c +[jz: resolve context conflict due to non-multifd compression which is + already deleted in upstream while still in anolis] +Signed-off-by: Jason Zeng +--- + migration/migration.c | 5 +++++ + migration/savevm.c | 6 ++++-- + 2 files changed, 9 insertions(+), 2 deletions(-) + +diff --git a/migration/migration.c b/migration/migration.c +index e7a513cb18..d1c8ec3be6 100644 +--- a/migration/migration.c ++++ b/migration/migration.c +@@ -278,6 +278,11 @@ void migration_incoming_state_destroy(void) + + multifd_recv_cleanup(); + compress_threads_load_cleanup(); ++ /* ++ * RAM state cleanup needs to happen after multifd cleanup, because ++ * multifd threads can use some of its states (receivedmap). ++ */ ++ qemu_loadvm_state_cleanup(); + + if (mis->to_src_file) { + /* Tell source that we are done */ +diff --git a/migration/savevm.c b/migration/savevm.c +index cf88057efa..44f8587409 100644 +--- a/migration/savevm.c ++++ b/migration/savevm.c +@@ -2957,7 +2957,10 @@ int qemu_loadvm_state(QEMUFile *f) + trace_qemu_loadvm_state_post_main(ret); + + if (mis->have_listen_thread) { +- /* Listen thread still going, can't clean up yet */ ++ /* ++ * Postcopy listen thread still going, don't synchronize the ++ * cpus yet. 
++ */ + return ret; + } + +@@ -3000,7 +3003,6 @@ int qemu_loadvm_state(QEMUFile *f) + } + } + +- qemu_loadvm_state_cleanup(); + cpu_synchronize_all_post_init(); + + return ret; +-- +2.43.0 + diff --git a/0439-migration-multifd-fix-loop-conditions-in-multifd-zst.patch b/0439-migration-multifd-fix-loop-conditions-in-multifd-zst.patch new file mode 100644 index 0000000..cfabead --- /dev/null +++ b/0439-migration-multifd-fix-loop-conditions-in-multifd-zst.patch @@ -0,0 +1,59 @@ +From 71405e517adb3a9f087466a19a7ae912bf6a6388 Mon Sep 17 00:00:00 2001 +From: Stefan Weil +Date: Tue, 10 Sep 2024 07:41:38 +0200 +Subject: [PATCH] migration/multifd: Fix loop conditions in + multifd_zstd_send_prepare and multifd_zstd_recv + +commit cb0ed522a51a7d4b1fde535972d4aeeb82447928 upstream. + +GitHub's CodeQL reports four critical errors which are fixed by this commit: + + Unsigned difference expression compared to zero + +An expression (u - v > 0) with unsigned values u, v is only false if u == v, +so all changed expressions did not work as expected. + +Intel-SIG: commit cb0ed522a51a migration/multifd: Fix loop conditions in multifd_zstd_send_prepare and multifd_zstd_recv + +Signed-off-by: Stefan Weil +Link: https://lore.kernel.org/r/20240910054138.1458555-1-sw@weilnetz.de +[peterx: Fix mangled email for author] +Signed-off-by: Peter Xu + + Conflicts: + migration/multifd-zstd.c +[jz: resolve context conflict due to p->page which not renamed to page yet] +Signed-off-by: Jason Zeng +--- + migration/multifd-zstd.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c +index ca17b7e310..46ee68b6ce 100644 +--- a/migration/multifd-zstd.c ++++ b/migration/multifd-zstd.c +@@ -152,9 +152,9 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) + */ + do { + ret = ZSTD_compressStream2(z->zcs, &z->out, &z->in, flush); +- } while (ret > 0 && (z->in.size - z->in.pos > 0) +- && (z->out.size - z->out.pos > 0)); +- if (ret > 0 && (z->in.size - z->in.pos > 0)) { ++ } while (ret > 0 && (z->in.size > z->in.pos) ++ && (z->out.size > z->out.pos)); ++ if (ret > 0 && (z->in.size > z->in.pos)) { + error_setg(errp, "multifd %u: compressStream buffer too small", + p->id); + return -1; +@@ -299,7 +299,7 @@ static int zstd_recv(MultiFDRecvParams *p, Error **errp) + */ + do { + ret = ZSTD_decompressStream(z->zds, &z->out, &z->in); +- } while (ret > 0 && (z->in.size - z->in.pos > 0) ++ } while (ret > 0 && (z->in.size > z->in.pos) + && (z->out.pos < p->page_size)); + if (ret > 0 && (z->out.pos < p->page_size)) { + error_setg(errp, "multifd %u: decompressStream buffer too small", +-- +2.43.0 + diff --git a/0440-migration-multifd-ensure-packet-ramblock-is-null-ter.patch b/0440-migration-multifd-ensure-packet-ramblock-is-null-ter.patch new file mode 100644 index 0000000..1b0e56f --- /dev/null +++ b/0440-migration-multifd-ensure-packet-ramblock-is-null-ter.patch @@ -0,0 +1,71 @@ +From c5e6c00aec68ffed134c1f45733388178024f14f Mon Sep 17 00:00:00 2001 +From: Fabiano Rosas +Date: Thu, 19 Sep 2024 12:06:11 -0300 +Subject: [PATCH] migration/multifd: Ensure packet->ramblock is null-terminated + +commit 68e0fca625912c7c63a8bfbc784f53d4fefa1a13 upstream. + +Coverity points out that the current usage of strncpy to write the +ramblock name allows the field to not have an ending '\0' in case +idstr is already not null-terminated (e.g. if it's larger than 256 +bytes). 
+ +This is currently harmless because the packet->ramblock field is never +touched again on the source side. The destination side reads only up +to the field's size from the stream and forces the last byte to be 0. + +We're still open to a programming error in the future in case this +field is ever passed into a function that expects a null-terminated +string. + +Change from strncpy to QEMU's pstrcpy, which puts a '\0' at the end of +the string and doesn't fill the extra space with zeros. + +(there's no spillage between iterations of fill_packet because after +commit 87bb9e953e ("migration/multifd: Isolate ram pages packet data") +the packet is always zeroed before filling) + +Intel-SIG: commit 68e0fca62591 migration/multifd: Ensure packet->ramblock is null-terminated + +Resolves: Coverity CID 1560071 +Reported-by: Peter Maydell +Signed-off-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20240919150611.17074-1-farosas@suse.de +Signed-off-by: Peter Xu + + Conflicts: + migration/multifd-nocomp.c +[jz: upstream has split nocomp code into multifd-nocomp.c, while + we havenot yet. The function that needs to be fixed is + still named multifd_send_fill_packet in multifd.c, so we fix + it in multifd.c] +Signed-off-by: Jason Zeng +--- + migration/multifd.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index eae7040039..e77aaeeb30 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -16,6 +16,7 @@ + #include "exec/target_page.h" + #include "sysemu/sysemu.h" + #include "exec/ramblock.h" ++#include "qemu/cutils.h" + #include "qemu/error-report.h" + #include "qapi/error.h" + #include "ram.h" +@@ -399,7 +400,8 @@ void multifd_send_fill_packet(MultiFDSendParams *p) + packet->packet_num = cpu_to_be64(packet_num); + + if (pages->block) { +- strncpy(packet->ramblock, pages->block->idstr, 256); ++ pstrcpy(packet->ramblock, sizeof(packet->ramblock), ++ pages->block->idstr); + } + + for (i = 0; i < pages->num; i++) { +-- +2.43.0 + diff --git a/0441-migration-multifd-zero-p-flags-before-starting-filli.patch b/0441-migration-multifd-zero-p-flags-before-starting-filli.patch new file mode 100644 index 0000000..9e5d1ec --- /dev/null +++ b/0441-migration-multifd-zero-p-flags-before-starting-filli.patch @@ -0,0 +1,52 @@ +From 951bd029315de3ba0e0ced9d26edc39d87a28a19 Mon Sep 17 00:00:00 2001 +From: "Maciej S. Szmigiero" +Date: Tue, 29 Oct 2024 15:58:15 +0100 +Subject: [PATCH] migration/multifd: Zero p->flags before starting filling a + packet + +commit 00b4b216534d84ace7b0583cec70a3aaf256cb25 upstream. + +This way there aren't stale flags there. + +p->flags can't contain SYNC to be sent at the next RAM packet since syncs +are now handled separately in multifd_send_thread. + +Intel-SIG: commit 00b4b216534d migration/multifd: Zero p->flags before starting filling a packet + +Reviewed-by: Fabiano Rosas +Reviewed-by: Peter Xu +Signed-off-by: Maciej S. 
Szmigiero +Link: https://lore.kernel.org/r/1c96b6cdb797e6f035eb1a4ad9bfc24f4c7f5df8.1730203967.git.maciej.szmigiero@oracle.com +Signed-off-by: Peter Xu + + Conflicts: + migration/multifd.c +[jz: resolve simple context conflict] +Signed-off-by: Jason Zeng +--- + migration/multifd.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index e77aaeeb30..f35f4e1120 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -894,6 +894,7 @@ static void *multifd_send_thread(void *opaque) + if (qatomic_load_acquire(&p->pending_job)) { + MultiFDPages_t *pages = p->pages; + ++ p->flags = 0; + p->iovs_num = 0; + assert(pages->num); + +@@ -940,7 +941,6 @@ static void *multifd_send_thread(void *opaque) + } + /* p->next_packet_size will always be zero for a SYNC packet */ + stat64_add(&mig_stats.multifd_bytes, p->packet_len); +- p->flags = 0; + } + + qatomic_set(&p->pending_sync, false); +-- +2.43.0 + diff --git a/0442-multifd-bugfix-for-migration-using-compression-metho.patch b/0442-multifd-bugfix-for-migration-using-compression-metho.patch new file mode 100644 index 0000000..d0bc6df --- /dev/null +++ b/0442-multifd-bugfix-for-migration-using-compression-metho.patch @@ -0,0 +1,63 @@ +From 253d49f3d1dc402e0d8308f8b2e4a089f84fd4da Mon Sep 17 00:00:00 2001 +From: Yuan Liu +Date: Wed, 18 Dec 2024 17:14:11 +0800 +Subject: [PATCH] multifd: bugfix for migration using compression methods + +commit cdc3970f8597ebdc1a4c2090cfb4d11e297329ed upstream. + +When compression is enabled on the migration channel and +the pages processed are all zero pages, these pages will +not be sent and updated on the target side, resulting in +incorrect memory data on the source and target sides. + +The root cause is that all compression methods call +multifd_send_prepare_common to determine whether to compress +dirty pages, but multifd_send_prepare_common does not update +the IOV of MultiFDPacket_t when all dirty pages are zero pages. + +The solution is to always update the IOV of MultiFDPacket_t +regardless of whether the dirty pages are all zero pages. + +Intel-SIG: commit cdc3970f8597 multifd: bugfix for migration using compression methods + +Fixes: 303e6f54f9 ("migration/multifd: Implement zero page transmission on the multifd thread.") +Cc: qemu-stable@nongnu.org #9.0+ +Signed-off-by: Yuan Liu +Reviewed-by: Jason Zeng +Reviewed-by: Peter Xu +Message-Id: <20241218091413.140396-2-yuan1.liu@intel.com> +Signed-off-by: Fabiano Rosas + + Conflicts: + migration/multifd-nocomp.c +[jz: upstream has split nocomp code into multifd-nocomp.c, while + we havenot yet. 
The function that needs to be fixed is still + in multifd.c, so we fix it in multifd.c] +Signed-off-by: Jason Zeng +--- + migration/multifd.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index f35f4e1120..9c95d36d83 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -1484,6 +1484,7 @@ void multifd_recv_new_channel(QIOChannel *ioc, Error **errp) + + bool multifd_send_prepare_common(MultiFDSendParams *p) + { ++ multifd_send_prepare_header(p); + multifd_send_zero_page_detect(p); + + if (!p->pages->normal_num) { +@@ -1491,7 +1492,5 @@ bool multifd_send_prepare_common(MultiFDSendParams *p) + return false; + } + +- multifd_send_prepare_header(p); +- + return true; + } +-- +2.43.0 + diff --git a/0443-multifd-bugfix-for-incorrect-migration-data-with-qpl.patch b/0443-multifd-bugfix-for-incorrect-migration-data-with-qpl.patch new file mode 100644 index 0000000..86f0bd2 --- /dev/null +++ b/0443-multifd-bugfix-for-incorrect-migration-data-with-qpl.patch @@ -0,0 +1,47 @@ +From f7206a8bdf847415fd2a47f607206c92164d5b8b Mon Sep 17 00:00:00 2001 +From: Yuan Liu +Date: Wed, 18 Dec 2024 17:14:12 +0800 +Subject: [PATCH] multifd: bugfix for incorrect migration data with QPL + compression + +commit 2588a5f99b0c3493b4690e3ff01ed36f80e830cc upstream. + +When QPL compression is enabled on the migration channel and the same +dirty page changes from a normal page to a zero page in the iterative +memory copy, the dirty page will not be updated to a zero page again +on the target side, resulting in incorrect memory data on the source +and target sides. + +The root cause is that the target side does not record the normal pages +to the receivedmap. + +The solution is to add ramblock_recv_bitmap_set_offset in target side +to record the normal pages. + +Intel-SIG: commit 2588a5f99b0c multifd: bugfix for incorrect migration data with QPL compression + +Signed-off-by: Yuan Liu +Reviewed-by: Jason Zeng +Reviewed-by: Peter Xu +Message-Id: <20241218091413.140396-3-yuan1.liu@intel.com> +Signed-off-by: Fabiano Rosas +Signed-off-by: Jason Zeng +--- + migration/multifd-qpl.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/migration/multifd-qpl.c b/migration/multifd-qpl.c +index 9265098ee7..fea60e3937 100644 +--- a/migration/multifd-qpl.c ++++ b/migration/multifd-qpl.c +@@ -730,6 +730,7 @@ static int multifd_qpl_recv(MultiFDRecvParams *p, Error **errp) + qpl->zlen[i] = be32_to_cpu(qpl->zlen[i]); + assert(qpl->zlen[i] <= p->page_size); + zbuf_len += qpl->zlen[i]; ++ ramblock_recv_bitmap_set_offset(p->block, p->normal[i]); + } + + /* read compressed pages */ +-- +2.43.0 + diff --git a/0444-multifd-bugfix-for-incorrect-migration-data-with-qat.patch b/0444-multifd-bugfix-for-incorrect-migration-data-with-qat.patch new file mode 100644 index 0000000..25a066b --- /dev/null +++ b/0444-multifd-bugfix-for-incorrect-migration-data-with-qat.patch @@ -0,0 +1,51 @@ +From 40f47315df6771e41482ebc48abe582e36c80617 Mon Sep 17 00:00:00 2001 +From: Yuan Liu +Date: Wed, 18 Dec 2024 17:14:13 +0800 +Subject: [PATCH] multifd: bugfix for incorrect migration data with qatzip + compression + +commit a523bc52166c80d8a04d46584f9f3868bd53ef69 upstream. + +When QPL compression is enabled on the migration channel and the same +dirty page changes from a normal page to a zero page in the iterative +memory copy, the dirty page will not be updated to a zero page again +on the target side, resulting in incorrect memory data on the source +and target sides. 
+ +The root cause is that the target side does not record the normal pages +to the receivedmap. + +The solution is to add ramblock_recv_bitmap_set_offset in target side +to record the normal pages. + +Intel-SIG: commit a523bc52166c multifd: bugfix for incorrect migration data with qatzip compression + +Signed-off-by: Yuan Liu +Reviewed-by: Jason Zeng +Reviewed-by: Peter Xu +Message-Id: <20241218091413.140396-4-yuan1.liu@intel.com> +Signed-off-by: Fabiano Rosas + + Conflicts: + migration/multifd-qatzip.c +[jz: resolve simple context conflict] +Signed-off-by: Jason Zeng +--- + migration/multifd-qatzip.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/migration/multifd-qatzip.c b/migration/multifd-qatzip.c +index 3c787ed879..88b6fb44ad 100644 +--- a/migration/multifd-qatzip.c ++++ b/migration/multifd-qatzip.c +@@ -373,6 +373,7 @@ static int qatzip_recv(MultiFDRecvParams *p, Error **errp) + memcpy(p->host + p->normal[i], + q->out_buf + p->page_size * i, + p->page_size); ++ ramblock_recv_bitmap_set_offset(p->block, p->normal[i]); + } + return 0; + } +-- +2.43.0 + diff --git a/0445-hw-audio-virtio-snd-fix-invalid-param-check.patch b/0445-hw-audio-virtio-snd-fix-invalid-param-check.patch new file mode 100644 index 0000000..00b5f33 --- /dev/null +++ b/0445-hw-audio-virtio-snd-fix-invalid-param-check.patch @@ -0,0 +1,49 @@ +From 15b4677cf2705a2f150a48c75cadcdf37928afb7 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Volker=20R=C3=BCmelin?= +Date: Fri, 2 Aug 2024 09:18:05 +0200 +Subject: [PATCH] hw/audio/virtio-snd: fix invalid param check +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +commit 7d14471a121878602cb4e748c4707f9ab9a9e3e2 upstream. + +Commit 9b6083465f ("virtio-snd: check for invalid param shift +operands") tries to prevent invalid parameters specified by the +guest. However, the code is not correct. + +Change the code so that the parameters format and rate, which are +a bit numbers, are compared with the bit size of the data type. + +Fixes: 9b6083465f ("virtio-snd: check for invalid param shift operands") +Signed-off-by: Volker Rümelin +Message-Id: <20240802071805.7123-1-vr_qemu@t-online.de> +Reviewed-by: Manos Pitsidianakis +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. 
Tsirkin +--- + hw/audio/virtio-snd.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/hw/audio/virtio-snd.c b/hw/audio/virtio-snd.c +index 3ac5f78714..e2e315360d 100644 +--- a/hw/audio/virtio-snd.c ++++ b/hw/audio/virtio-snd.c +@@ -281,12 +281,12 @@ uint32_t virtio_snd_set_pcm_params(VirtIOSound *s, + error_report("Number of channels is not supported."); + return cpu_to_le32(VIRTIO_SND_S_NOT_SUPP); + } +- if (BIT(params->format) > sizeof(supported_formats) || ++ if (params->format >= sizeof(supported_formats) * BITS_PER_BYTE || + !(supported_formats & BIT(params->format))) { + error_report("Stream format is not supported."); + return cpu_to_le32(VIRTIO_SND_S_NOT_SUPP); + } +- if (BIT(params->rate) > sizeof(supported_rates) || ++ if (params->rate >= sizeof(supported_rates) * BITS_PER_BYTE || + !(supported_rates & BIT(params->rate))) { + error_report("Stream rate is not supported."); + return cpu_to_le32(VIRTIO_SND_S_NOT_SUPP); +-- +2.43.0 + diff --git a/0446-nbd-server-avoid-per-nbdrequest-nbd-client-get-put.patch b/0446-nbd-server-avoid-per-nbdrequest-nbd-client-get-put.patch new file mode 100644 index 0000000..5975b44 --- /dev/null +++ b/0446-nbd-server-avoid-per-nbdrequest-nbd-client-get-put.patch @@ -0,0 +1,46 @@ +From 4ac1f5e62be3f7a93b09f8c765cb73a315e44b7a Mon Sep 17 00:00:00 2001 +From: Stefan Hajnoczi +Date: Thu, 21 Dec 2023 14:24:50 -0500 +Subject: [PATCH] nbd/server: avoid per-NBDRequest nbd_client_get/put() + +nbd_trip() processes a single NBD request from start to finish and holds +an NBDClient reference throughout. NBDRequest does not outlive the scope +of nbd_trip(). Therefore it is unnecessary to ref/unref NBDClient for +each NBDRequest. + +Removing these nbd_client_get()/nbd_client_put() calls will make +thread-safety easier in the commits that follow. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Paolo Bonzini +Message-ID: <20231221192452.1785567-5-stefanha@redhat.com> +Reviewed-by: Kevin Wolf +Signed-off-by: Kevin Wolf +--- + nbd/server.c | 3 --- + 1 file changed, 3 deletions(-) + +diff --git a/nbd/server.c b/nbd/server.c +index 895cf0a752..0b09ccc8dc 100644 +--- a/nbd/server.c ++++ b/nbd/server.c +@@ -1557,7 +1557,6 @@ static NBDRequestData *nbd_request_get(NBDClient *client) + client->nb_requests++; + + req = g_new0(NBDRequestData, 1); +- nbd_client_get(client); + req->client = client; + return req; + } +@@ -1578,8 +1577,6 @@ static void nbd_request_put(NBDRequestData *req) + } + + nbd_client_receive_next_request(client); +- +- nbd_client_put(client); + } + + static void blk_aio_attached(AioContext *ctx, void *opaque) +-- +2.43.0 + diff --git a/0447-nbd-server-only-traverse-nbdexport-clients-from-main.patch b/0447-nbd-server-only-traverse-nbdexport-clients-from-main.patch new file mode 100644 index 0000000..5246fe1 --- /dev/null +++ b/0447-nbd-server-only-traverse-nbdexport-clients-from-main.patch @@ -0,0 +1,169 @@ +From f441892d17e522020e7c2a21b465a7228079c0ad Mon Sep 17 00:00:00 2001 +From: Stefan Hajnoczi +Date: Thu, 21 Dec 2023 14:24:51 -0500 +Subject: [PATCH] nbd/server: only traverse NBDExport->clients from main loop + thread + +The NBD clients list is currently accessed from both the export +AioContext and the main loop thread. When the AioContext lock is removed +there will be nothing protecting the clients list. + +Adding a lock around the clients list is tricky because NBDClient +structs are refcounted and may be freed from the export AioContext or +the main loop thread. 
nbd_export_request_shutdown() -> client_close() -> +nbd_client_put() is also tricky because the list lock would be held +while indirectly dropping references to NDBClients. + +A simpler approach is to only allow nbd_client_put() and client_close() +calls from the main loop thread. Then the NBD clients list is only +accessed from the main loop thread and no fancy locking is needed. + +nbd_trip() just needs to reschedule itself in the main loop AioContext +before calling nbd_client_put() and client_close(). This costs more CPU +cycles per NBD request so add nbd_client_put_nonzero() to optimize the +common case where more references to NBDClient remain. + +Note that nbd_client_get() can still be called from either thread, so +make NBDClient->refcount atomic. + +Signed-off-by: Stefan Hajnoczi +Message-ID: <20231221192452.1785567-6-stefanha@redhat.com> +Reviewed-by: Kevin Wolf +Signed-off-by: Kevin Wolf +--- + nbd/server.c | 61 +++++++++++++++++++++++++++++++++++++++++++--------- + 1 file changed, 51 insertions(+), 10 deletions(-) + +diff --git a/nbd/server.c b/nbd/server.c +index 0b09ccc8dc..e91e2e0903 100644 +--- a/nbd/server.c ++++ b/nbd/server.c +@@ -122,7 +122,7 @@ struct NBDMetaContexts { + }; + + struct NBDClient { +- int refcount; ++ int refcount; /* atomic */ + void (*close_fn)(NBDClient *client, bool negotiated); + + NBDExport *exp; +@@ -1501,14 +1501,17 @@ static int coroutine_fn nbd_receive_request(NBDClient *client, NBDRequest *reque + + #define MAX_NBD_REQUESTS 16 + ++/* Runs in export AioContext and main loop thread */ + void nbd_client_get(NBDClient *client) + { +- client->refcount++; ++ qatomic_inc(&client->refcount); + } + + void nbd_client_put(NBDClient *client) + { +- if (--client->refcount == 0) { ++ assert(qemu_in_main_thread()); ++ ++ if (qatomic_fetch_dec(&client->refcount) == 1) { + /* The last reference should be dropped by client->close, + * which is called by client_close. + */ +@@ -1529,8 +1532,35 @@ void nbd_client_put(NBDClient *client) + } + } + ++/* ++ * Tries to release the reference to @client, but only if other references ++ * remain. This is an optimization for the common case where we want to avoid ++ * the expense of scheduling nbd_client_put() in the main loop thread. ++ * ++ * Returns true upon success or false if the reference was not released because ++ * it is the last reference. ++ */ ++static bool nbd_client_put_nonzero(NBDClient *client) ++{ ++ int old = qatomic_read(&client->refcount); ++ int expected; ++ ++ do { ++ if (old == 1) { ++ return false; ++ } ++ ++ expected = old; ++ old = qatomic_cmpxchg(&client->refcount, expected, expected - 1); ++ } while (old != expected); ++ ++ return true; ++} ++ + static void client_close(NBDClient *client, bool negotiated) + { ++ assert(qemu_in_main_thread()); ++ + if (client->closing) { + return; + } +@@ -2933,15 +2963,20 @@ static coroutine_fn int nbd_handle_request(NBDClient *client, + static coroutine_fn void nbd_trip(void *opaque) + { + NBDClient *client = opaque; +- NBDRequestData *req; ++ NBDRequestData *req = NULL; + NBDRequest request = { 0 }; /* GCC thinks it can be used uninitialized */ + int ret; + Error *local_err = NULL; + ++ /* ++ * Note that nbd_client_put() and client_close() must be called from the ++ * main loop thread. Use aio_co_reschedule_self() to switch AioContext ++ * before calling these functions. 
++ */ ++ + trace_nbd_trip(); + if (client->closing) { +- nbd_client_put(client); +- return; ++ goto done; + } + + if (client->quiescing) { +@@ -2949,10 +2984,9 @@ static coroutine_fn void nbd_trip(void *opaque) + * We're switching between AIO contexts. Don't attempt to receive a new + * request and kick the main context which may be waiting for us. + */ +- nbd_client_put(client); + client->recv_coroutine = NULL; + aio_wait_kick(); +- return; ++ goto done; + } + + req = nbd_request_get(client); +@@ -3012,8 +3046,13 @@ static coroutine_fn void nbd_trip(void *opaque) + + qio_channel_set_cork(client->ioc, false); + done: +- nbd_request_put(req); +- nbd_client_put(client); ++ if (req) { ++ nbd_request_put(req); ++ } ++ if (!nbd_client_put_nonzero(client)) { ++ aio_co_reschedule_self(qemu_get_aio_context()); ++ nbd_client_put(client); ++ } + return; + + disconnect: +@@ -3021,6 +3060,8 @@ disconnect: + error_reportf_err(local_err, "Disconnect client, due to: "); + } + nbd_request_put(req); ++ ++ aio_co_reschedule_self(qemu_get_aio_context()); + client_close(client, true); + nbd_client_put(client); + } +-- +2.43.0 + diff --git a/0448-nbd-server-introduce-nbdclient-lock-to-protect-field.patch b/0448-nbd-server-introduce-nbdclient-lock-to-protect-field.patch new file mode 100644 index 0000000..f00c35c --- /dev/null +++ b/0448-nbd-server-introduce-nbdclient-lock-to-protect-field.patch @@ -0,0 +1,365 @@ +From 4b775789c430f52d27da1156e7e0fd567387266c Mon Sep 17 00:00:00 2001 +From: Stefan Hajnoczi +Date: Thu, 21 Dec 2023 14:24:52 -0500 +Subject: [PATCH] nbd/server: introduce NBDClient->lock to protect fields + +NBDClient has a number of fields that are accessed by both the export +AioContext and the main loop thread. When the AioContext lock is removed +these fields will need another form of protection. + +Add NBDClient->lock and protect fields that are accessed by both +threads. Also add assertions where possible and otherwise add doc +comments stating assumptions about which thread and lock holding. + +Note this patch moves the client->recv_coroutine assertion from +nbd_co_receive_request() to nbd_trip() where client->lock is held. 
+ +Signed-off-by: Stefan Hajnoczi +Message-ID: <20231221192452.1785567-7-stefanha@redhat.com> +Reviewed-by: Kevin Wolf +Signed-off-by: Kevin Wolf +--- + nbd/server.c | 144 +++++++++++++++++++++++++++++++++++++++------------ + 1 file changed, 111 insertions(+), 33 deletions(-) + +diff --git a/nbd/server.c b/nbd/server.c +index e91e2e0903..941832f178 100644 +--- a/nbd/server.c ++++ b/nbd/server.c +@@ -125,23 +125,25 @@ struct NBDClient { + int refcount; /* atomic */ + void (*close_fn)(NBDClient *client, bool negotiated); + ++ QemuMutex lock; ++ + NBDExport *exp; + QCryptoTLSCreds *tlscreds; + char *tlsauthz; + QIOChannelSocket *sioc; /* The underlying data channel */ + QIOChannel *ioc; /* The current I/O channel which may differ (eg TLS) */ + +- Coroutine *recv_coroutine; ++ Coroutine *recv_coroutine; /* protected by lock */ + + CoMutex send_lock; + Coroutine *send_coroutine; + +- bool read_yielding; +- bool quiescing; ++ bool read_yielding; /* protected by lock */ ++ bool quiescing; /* protected by lock */ + + QTAILQ_ENTRY(NBDClient) next; +- int nb_requests; +- bool closing; ++ int nb_requests; /* protected by lock */ ++ bool closing; /* protected by lock */ + + uint32_t check_align; /* If non-zero, check for aligned client requests */ + +@@ -1415,11 +1417,18 @@ nbd_read_eof(NBDClient *client, void *buffer, size_t size, Error **errp) + + len = qio_channel_readv(client->ioc, &iov, 1, errp); + if (len == QIO_CHANNEL_ERR_BLOCK) { +- client->read_yielding = true; ++ WITH_QEMU_LOCK_GUARD(&client->lock) { ++ client->read_yielding = true; ++ ++ /* Prompt main loop thread to re-run nbd_drained_poll() */ ++ aio_wait_kick(); ++ } + qio_channel_yield(client->ioc, G_IO_IN); +- client->read_yielding = false; +- if (client->quiescing) { +- return -EAGAIN; ++ WITH_QEMU_LOCK_GUARD(&client->lock) { ++ client->read_yielding = false; ++ if (client->quiescing) { ++ return -EAGAIN; ++ } + } + continue; + } else if (len < 0) { +@@ -1528,6 +1537,7 @@ void nbd_client_put(NBDClient *client) + blk_exp_unref(&client->exp->common); + } + g_free(client->contexts.bitmaps); ++ qemu_mutex_destroy(&client->lock); + g_free(client); + } + } +@@ -1561,11 +1571,13 @@ static void client_close(NBDClient *client, bool negotiated) + { + assert(qemu_in_main_thread()); + +- if (client->closing) { +- return; +- } ++ WITH_QEMU_LOCK_GUARD(&client->lock) { ++ if (client->closing) { ++ return; ++ } + +- client->closing = true; ++ client->closing = true; ++ } + + /* Force requests to finish. They will drop their own references, + * then we'll close the socket and free the NBDClient. 
+@@ -1579,6 +1591,7 @@ static void client_close(NBDClient *client, bool negotiated) + } + } + ++/* Runs in export AioContext with client->lock held */ + static NBDRequestData *nbd_request_get(NBDClient *client) + { + NBDRequestData *req; +@@ -1591,6 +1604,7 @@ static NBDRequestData *nbd_request_get(NBDClient *client) + return req; + } + ++/* Runs in export AioContext with client->lock held */ + static void nbd_request_put(NBDRequestData *req) + { + NBDClient *client = req->client; +@@ -1614,14 +1628,18 @@ static void blk_aio_attached(AioContext *ctx, void *opaque) + NBDExport *exp = opaque; + NBDClient *client; + ++ assert(qemu_in_main_thread()); ++ + trace_nbd_blk_aio_attached(exp->name, ctx); + + exp->common.ctx = ctx; + + QTAILQ_FOREACH(client, &exp->clients, next) { +- assert(client->nb_requests == 0); +- assert(client->recv_coroutine == NULL); +- assert(client->send_coroutine == NULL); ++ WITH_QEMU_LOCK_GUARD(&client->lock) { ++ assert(client->nb_requests == 0); ++ assert(client->recv_coroutine == NULL); ++ assert(client->send_coroutine == NULL); ++ } + } + } + +@@ -1629,6 +1647,8 @@ static void blk_aio_detach(void *opaque) + { + NBDExport *exp = opaque; + ++ assert(qemu_in_main_thread()); ++ + trace_nbd_blk_aio_detach(exp->name, exp->common.ctx); + + exp->common.ctx = NULL; +@@ -1639,8 +1659,12 @@ static void nbd_drained_begin(void *opaque) + NBDExport *exp = opaque; + NBDClient *client; + ++ assert(qemu_in_main_thread()); ++ + QTAILQ_FOREACH(client, &exp->clients, next) { +- client->quiescing = true; ++ WITH_QEMU_LOCK_GUARD(&client->lock) { ++ client->quiescing = true; ++ } + } + } + +@@ -1649,28 +1673,48 @@ static void nbd_drained_end(void *opaque) + NBDExport *exp = opaque; + NBDClient *client; + ++ assert(qemu_in_main_thread()); ++ + QTAILQ_FOREACH(client, &exp->clients, next) { +- client->quiescing = false; +- nbd_client_receive_next_request(client); ++ WITH_QEMU_LOCK_GUARD(&client->lock) { ++ client->quiescing = false; ++ nbd_client_receive_next_request(client); ++ } + } + } + ++/* Runs in export AioContext */ ++static void nbd_wake_read_bh(void *opaque) ++{ ++ NBDClient *client = opaque; ++ qio_channel_wake_read(client->ioc); ++} ++ + static bool nbd_drained_poll(void *opaque) + { + NBDExport *exp = opaque; + NBDClient *client; + ++ assert(qemu_in_main_thread()); ++ + QTAILQ_FOREACH(client, &exp->clients, next) { +- if (client->nb_requests != 0) { +- /* +- * If there's a coroutine waiting for a request on nbd_read_eof() +- * enter it here so we don't depend on the client to wake it up. +- */ +- if (client->recv_coroutine != NULL && client->read_yielding) { +- qio_channel_wake_read(client->ioc); +- } ++ WITH_QEMU_LOCK_GUARD(&client->lock) { ++ if (client->nb_requests != 0) { ++ /* ++ * If there's a coroutine waiting for a request on nbd_read_eof() ++ * enter it here so we don't depend on the client to wake it up. ++ * ++ * Schedule a BH in the export AioContext to avoid missing the ++ * wake up due to the race between qio_channel_wake_read() and ++ * qio_channel_yield(). 
++ */ ++ if (client->recv_coroutine != NULL && client->read_yielding) { ++ aio_bh_schedule_oneshot(nbd_export_aio_context(client->exp), ++ nbd_wake_read_bh, client); ++ } + +- return true; ++ return true; ++ } + } + } + +@@ -1681,6 +1725,8 @@ static void nbd_eject_notifier(Notifier *n, void *data) + { + NBDExport *exp = container_of(n, NBDExport, eject_notifier); + ++ assert(qemu_in_main_thread()); ++ + blk_exp_request_shutdown(&exp->common); + } + +@@ -2566,7 +2612,6 @@ static int coroutine_fn nbd_co_receive_request(NBDRequestData *req, + int ret; + + g_assert(qemu_in_coroutine()); +- assert(client->recv_coroutine == qemu_coroutine_self()); + ret = nbd_receive_request(client, request, errp); + if (ret < 0) { + return ret; +@@ -2975,6 +3020,9 @@ static coroutine_fn void nbd_trip(void *opaque) + */ + + trace_nbd_trip(); ++ ++ qemu_mutex_lock(&client->lock); ++ + if (client->closing) { + goto done; + } +@@ -2990,7 +3038,21 @@ static coroutine_fn void nbd_trip(void *opaque) + } + + req = nbd_request_get(client); +- ret = nbd_co_receive_request(req, &request, &local_err); ++ ++ /* ++ * nbd_co_receive_request() returns -EAGAIN when nbd_drained_begin() has ++ * set client->quiescing but by the time we get back nbd_drained_end() may ++ * have already cleared client->quiescing. In that case we try again ++ * because nothing else will spawn an nbd_trip() coroutine until we set ++ * client->recv_coroutine = NULL further down. ++ */ ++ do { ++ assert(client->recv_coroutine == qemu_coroutine_self()); ++ qemu_mutex_unlock(&client->lock); ++ ret = nbd_co_receive_request(req, &request, &local_err); ++ qemu_mutex_lock(&client->lock); ++ } while (ret == -EAGAIN && !client->quiescing); ++ + client->recv_coroutine = NULL; + + if (client->closing) { +@@ -3002,15 +3064,16 @@ static coroutine_fn void nbd_trip(void *opaque) + } + + if (ret == -EAGAIN) { +- assert(client->quiescing); + goto done; + } + + nbd_client_receive_next_request(client); ++ + if (ret == -EIO) { + goto disconnect; + } + ++ qemu_mutex_unlock(&client->lock); + qio_channel_set_cork(client->ioc, true); + + if (ret < 0) { +@@ -3030,6 +3093,10 @@ static coroutine_fn void nbd_trip(void *opaque) + g_free(request.contexts->bitmaps); + g_free(request.contexts); + } ++ ++ qio_channel_set_cork(client->ioc, false); ++ qemu_mutex_lock(&client->lock); ++ + if (ret < 0) { + error_prepend(&local_err, "Failed to send reply: "); + goto disconnect; +@@ -3044,11 +3111,13 @@ static coroutine_fn void nbd_trip(void *opaque) + goto disconnect; + } + +- qio_channel_set_cork(client->ioc, false); + done: + if (req) { + nbd_request_put(req); + } ++ ++ qemu_mutex_unlock(&client->lock); ++ + if (!nbd_client_put_nonzero(client)) { + aio_co_reschedule_self(qemu_get_aio_context()); + nbd_client_put(client); +@@ -3059,13 +3128,19 @@ disconnect: + if (local_err) { + error_reportf_err(local_err, "Disconnect client, due to: "); + } ++ + nbd_request_put(req); ++ qemu_mutex_unlock(&client->lock); + + aio_co_reschedule_self(qemu_get_aio_context()); + client_close(client, true); + nbd_client_put(client); + } + ++/* ++ * Runs in export AioContext and main loop thread. Caller must hold ++ * client->lock. 
++ */ + static void nbd_client_receive_next_request(NBDClient *client) + { + if (!client->recv_coroutine && client->nb_requests < MAX_NBD_REQUESTS && +@@ -3091,7 +3166,9 @@ static coroutine_fn void nbd_co_client_start(void *opaque) + return; + } + +- nbd_client_receive_next_request(client); ++ WITH_QEMU_LOCK_GUARD(&client->lock) { ++ nbd_client_receive_next_request(client); ++ } + } + + /* +@@ -3108,6 +3185,7 @@ void nbd_client_new(QIOChannelSocket *sioc, + Coroutine *co; + + client = g_new0(NBDClient, 1); ++ qemu_mutex_init(&client->lock); + client->refcount = 1; + client->tlscreds = tlscreds; + if (tlscreds) { +-- +2.43.0 + diff --git a/0449-nbd-minor-style-and-typo-fixes.patch b/0449-nbd-minor-style-and-typo-fixes.patch new file mode 100644 index 0000000..aa453a8 --- /dev/null +++ b/0449-nbd-minor-style-and-typo-fixes.patch @@ -0,0 +1,49 @@ +From e1fc2383faeae3533e5409b99a7e9fc72ba8d0dd Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Thu, 1 Aug 2024 16:49:20 -0500 +Subject: [PATCH] nbd: Minor style and typo fixes +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Touch up a comment with the wrong type name, and an over-long line, +both noticed while working on upcoming patches. + +Signed-off-by: Eric Blake +Message-ID: <20240807174943.771624-10-eblake@redhat.com> +Reviewed-by: Daniel P. Berrangé +--- + nbd/server.c | 2 +- + qemu-nbd.c | 3 ++- + 2 files changed, 3 insertions(+), 2 deletions(-) + +diff --git a/nbd/server.c b/nbd/server.c +index 941832f178..0382d928e0 100644 +--- a/nbd/server.c ++++ b/nbd/server.c +@@ -1938,7 +1938,7 @@ static void nbd_export_request_shutdown(BlockExport *blk_exp) + + blk_exp_ref(&exp->common); + /* +- * TODO: Should we expand QMP NbdServerRemoveNode enum to allow a ++ * TODO: Should we expand QMP BlockExportRemoveMode enum to allow a + * close mode that stops advertising the export to new clients but + * still permits existing clients to run to completion? Because of + * that possibility, nbd_export_close() can be called more than +diff --git a/qemu-nbd.c b/qemu-nbd.c +index 186e6468b1..2b1817e661 100644 +--- a/qemu-nbd.c ++++ b/qemu-nbd.c +@@ -587,7 +587,8 @@ int main(int argc, char **argv) + pthread_t client_thread; + const char *fmt = NULL; + Error *local_err = NULL; +- BlockdevDetectZeroesOptions detect_zeroes = BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF; ++ BlockdevDetectZeroesOptions detect_zeroes = ++ BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF; + QDict *options = NULL; + const char *export_name = NULL; /* defaults to "" later for server mode */ + const char *export_description = NULL; +-- +2.43.0 + diff --git a/0450-nbd-server-plumb-in-new-args-to-nbd-client-add.patch b/0450-nbd-server-plumb-in-new-args-to-nbd-client-add.patch new file mode 100644 index 0000000..827dee2 --- /dev/null +++ b/0450-nbd-server-plumb-in-new-args-to-nbd-client-add.patch @@ -0,0 +1,165 @@ +From c325fc515bbaebcc4b07cc872f1d112290c8d87c Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Wed, 7 Aug 2024 08:50:01 -0500 +Subject: [PATCH] nbd/server: Plumb in new args to nbd_client_add() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Upcoming patches to fix a CVE need to track an opaque pointer passed +in by the owner of a client object, as well as request for a time +limit on how fast negotiation must complete. 
Prepare for that by +changing the signature of nbd_client_new() and adding an accessor to +get at the opaque pointer, although for now the two servers +(qemu-nbd.c and blockdev-nbd.c) do not change behavior even though +they pass in a new default timeout value. + +Suggested-by: Vladimir Sementsov-Ogievskiy +Signed-off-by: Eric Blake +Message-ID: <20240807174943.771624-11-eblake@redhat.com> +Reviewed-by: Daniel P. Berrangé +[eblake: s/LIMIT/MAX_SECS/ as suggested by Dan] +Signed-off-by: Eric Blake +--- + blockdev-nbd.c | 6 ++++-- + include/block/nbd.h | 11 ++++++++++- + nbd/server.c | 20 +++++++++++++++++--- + qemu-nbd.c | 4 +++- + 4 files changed, 34 insertions(+), 7 deletions(-) + +diff --git a/blockdev-nbd.c b/blockdev-nbd.c +index 213012435f..267a1de903 100644 +--- a/blockdev-nbd.c ++++ b/blockdev-nbd.c +@@ -64,8 +64,10 @@ static void nbd_accept(QIONetListener *listener, QIOChannelSocket *cioc, + nbd_update_server_watch(nbd_server); + + qio_channel_set_name(QIO_CHANNEL(cioc), "nbd-server"); +- nbd_client_new(cioc, nbd_server->tlscreds, nbd_server->tlsauthz, +- nbd_blockdev_client_closed); ++ /* TODO - expose handshake timeout as QMP option */ ++ nbd_client_new(cioc, NBD_DEFAULT_HANDSHAKE_MAX_SECS, ++ nbd_server->tlscreds, nbd_server->tlsauthz, ++ nbd_blockdev_client_closed, NULL); + } + + static void nbd_update_server_watch(NBDServerData *s) +diff --git a/include/block/nbd.h b/include/block/nbd.h +index 4e7bd6342f..1d4d65922d 100644 +--- a/include/block/nbd.h ++++ b/include/block/nbd.h +@@ -33,6 +33,12 @@ typedef struct NBDMetaContexts NBDMetaContexts; + + extern const BlockExportDriver blk_exp_nbd; + ++/* ++ * NBD_DEFAULT_HANDSHAKE_MAX_SECS: Number of seconds in which client must ++ * succeed at NBD_OPT_GO before being forcefully dropped as too slow. ++ */ ++#define NBD_DEFAULT_HANDSHAKE_MAX_SECS 10 ++ + /* Handshake phase structs - this struct is passed on the wire */ + + typedef struct NBDOption { +@@ -403,9 +409,12 @@ AioContext *nbd_export_aio_context(NBDExport *exp); + NBDExport *nbd_export_find(const char *name); + + void nbd_client_new(QIOChannelSocket *sioc, ++ uint32_t handshake_max_secs, + QCryptoTLSCreds *tlscreds, + const char *tlsauthz, +- void (*close_fn)(NBDClient *, bool)); ++ void (*close_fn)(NBDClient *, bool), ++ void *owner); ++void *nbd_client_owner(NBDClient *client); + void nbd_client_get(NBDClient *client); + void nbd_client_put(NBDClient *client); + +diff --git a/nbd/server.c b/nbd/server.c +index 0382d928e0..6d9fa2a2f2 100644 +--- a/nbd/server.c ++++ b/nbd/server.c +@@ -124,12 +124,14 @@ struct NBDMetaContexts { + struct NBDClient { + int refcount; /* atomic */ + void (*close_fn)(NBDClient *client, bool negotiated); ++ void *owner; + + QemuMutex lock; + + NBDExport *exp; + QCryptoTLSCreds *tlscreds; + char *tlsauthz; ++ uint32_t handshake_max_secs; + QIOChannelSocket *sioc; /* The underlying data channel */ + QIOChannel *ioc; /* The current I/O channel which may differ (eg TLS) */ + +@@ -3158,6 +3160,7 @@ static coroutine_fn void nbd_co_client_start(void *opaque) + + qemu_co_mutex_init(&client->send_lock); + ++ /* TODO - utilize client->handshake_max_secs */ + if (nbd_negotiate(client, &local_err)) { + if (local_err) { + error_report_err(local_err); +@@ -3172,14 +3175,17 @@ static coroutine_fn void nbd_co_client_start(void *opaque) + } + + /* +- * Create a new client listener using the given channel @sioc. ++ * Create a new client listener using the given channel @sioc and @owner. + * Begin servicing it in a coroutine. 
When the connection closes, call +- * @close_fn with an indication of whether the client completed negotiation. ++ * @close_fn with an indication of whether the client completed negotiation ++ * within @handshake_max_secs seconds (0 for unbounded). + */ + void nbd_client_new(QIOChannelSocket *sioc, ++ uint32_t handshake_max_secs, + QCryptoTLSCreds *tlscreds, + const char *tlsauthz, +- void (*close_fn)(NBDClient *, bool)) ++ void (*close_fn)(NBDClient *, bool), ++ void *owner) + { + NBDClient *client; + Coroutine *co; +@@ -3192,13 +3198,21 @@ void nbd_client_new(QIOChannelSocket *sioc, + object_ref(OBJECT(client->tlscreds)); + } + client->tlsauthz = g_strdup(tlsauthz); ++ client->handshake_max_secs = handshake_max_secs; + client->sioc = sioc; + qio_channel_set_delay(QIO_CHANNEL(sioc), false); + object_ref(OBJECT(client->sioc)); + client->ioc = QIO_CHANNEL(sioc); + object_ref(OBJECT(client->ioc)); + client->close_fn = close_fn; ++ client->owner = owner; + + co = qemu_coroutine_create(nbd_co_client_start, client); + qemu_coroutine_enter(co); + } ++ ++void * ++nbd_client_owner(NBDClient *client) ++{ ++ return client->owner; ++} +diff --git a/qemu-nbd.c b/qemu-nbd.c +index 2b1817e661..cb241e6567 100644 +--- a/qemu-nbd.c ++++ b/qemu-nbd.c +@@ -389,7 +389,9 @@ static void nbd_accept(QIONetListener *listener, QIOChannelSocket *cioc, + + nb_fds++; + nbd_update_server_watch(); +- nbd_client_new(cioc, tlscreds, tlsauthz, nbd_client_closed); ++ /* TODO - expose handshake timeout as command line option */ ++ nbd_client_new(cioc, NBD_DEFAULT_HANDSHAKE_MAX_SECS, ++ tlscreds, tlsauthz, nbd_client_closed, NULL); + } + + static void nbd_update_server_watch(void) +-- +2.43.0 + diff --git a/0451-nbd-server-cve-2024-7409-cap-default-max-connections.patch b/0451-nbd-server-cve-2024-7409-cap-default-max-connections.patch new file mode 100644 index 0000000..9eb5ed6 --- /dev/null +++ b/0451-nbd-server-cve-2024-7409-cap-default-max-connections.patch @@ -0,0 +1,173 @@ +From 27bdc7029c666bacb93a9c01985823b6291f8b16 Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Tue, 6 Aug 2024 13:53:00 -0500 +Subject: [PATCH] nbd/server: CVE-2024-7409: Cap default max-connections to 100 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Allowing an unlimited number of clients to any web service is a recipe +for a rudimentary denial of service attack: the client merely needs to +open lots of sockets without closing them, until qemu no longer has +any more fds available to allocate. + +For qemu-nbd, we default to allowing only 1 connection unless more are +explicitly asked for (-e or --shared); this was historically picked as +a nice default (without an explicit -t, a non-persistent qemu-nbd goes +away after a client disconnects, without needing any additional +follow-up commands), and we are not going to change that interface now +(besides, someday we want to point people towards qemu-storage-daemon +instead of qemu-nbd). + +But for qemu proper, and the newer qemu-storage-daemon, the QMP +nbd-server-start command has historically had a default of unlimited +number of connections, in part because unlike qemu-nbd it is +inherently persistent until nbd-server-stop. 
Allowing multiple client
+sockets is particularly useful for clients that can take advantage of
+MULTI_CONN (creating parallel sockets to increase throughput),
+although known clients that do so (such as libnbd's nbdcopy) typically
+use only 8 or 16 connections (the benefits of scaling diminish once
+more sockets are competing for kernel attention). Picking a number
+large enough for typical use cases, but not unlimited, makes it
+slightly harder for a malicious client to perform a denial of service
+merely by opening lots of connections without progressing through the
+handshake.
+
+This change does not eliminate CVE-2024-7409 on its own, but reduces
+the chance for fd exhaustion or unlimited memory usage as an attack
+surface. On the other hand, by itself, it makes it more obvious that
+with a finite limit, we have the problem of an unauthenticated client
+holding 100 fds opened as a way to block out a legitimate client from
+being able to connect; thus, later patches will further add timeouts
+to reject clients that are not making progress.
+
+This is an INTENTIONAL change in behavior, and will break any client
+of nbd-server-start that was not passing an explicit max-connections
+parameter, yet expects more than 100 simultaneous connections. We are
+not aware of any such client (as stated above, most clients aware of
+MULTI_CONN get by just fine on 8 or 16 connections, and probably cope
+with later connections failing by relying on the earlier connections;
+libvirt has not yet been passing max-connections, but generally
+creates NBD servers with the intent for a single client for the sake
+of live storage migration; meanwhile, the KubeSAN project anticipates
+a large cluster sharing multiple clients [up to 8 per node, and up to
+100 nodes in a cluster], but it currently uses qemu-nbd with an
+explicit --shared=0 rather than qemu-storage-daemon with
+nbd-server-start).
+
+We considered using a deprecation period (declare that omitting
+max-connections is deprecated, and make it mandatory in 3 releases -
+then we don't need to pick an arbitrary default); that has zero risk
+of breaking any apps that accidentally depended on more than 100
+connections, and where such breakage might not be noticed under unit
+testing but only under the larger loads of production usage. But it
+does not close the denial-of-service hole until far into the future,
+and requires all apps to change to add the parameter even if 100 was
+good enough. It also has a drawback that any app (like libvirt) that
+is accidentally relying on an unlimited default should seriously
+consider their own CVE now, at which point they are going to change to
+pass explicit max-connections sooner than waiting for 3 qemu releases.
+Finally, if our changed default breaks an app, that app can always
+pass in an explicit max-connections with a larger value.
+
+It is also intentional that the HMP interface to nbd-server-start is
+not changed to expose max-connections (any client needing to fine-tune
+things should be using QMP).
+
+Suggested-by: Daniel P. Berrangé 
+Signed-off-by: Eric Blake 
+Message-ID: <20240807174943.771624-12-eblake@redhat.com>
+Reviewed-by: Daniel P. 
Berrangé +[ericb: Expand commit message to summarize Dan's argument for why we +break corner-case back-compat behavior without a deprecation period] +Signed-off-by: Eric Blake +--- + block/monitor/block-hmp-cmds.c | 3 ++- + blockdev-nbd.c | 8 ++++++++ + include/block/nbd.h | 7 +++++++ + qapi/block-export.json | 4 ++-- + 4 files changed, 19 insertions(+), 3 deletions(-) + +diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c +index c729cbf1eb..78a6975852 100644 +--- a/block/monitor/block-hmp-cmds.c ++++ b/block/monitor/block-hmp-cmds.c +@@ -415,7 +415,8 @@ void hmp_nbd_server_start(Monitor *mon, const QDict *qdict) + goto exit; + } + +- nbd_server_start(addr, NULL, NULL, 0, &local_err); ++ nbd_server_start(addr, NULL, NULL, NBD_DEFAULT_MAX_CONNECTIONS, ++ &local_err); + qapi_free_SocketAddress(addr); + if (local_err != NULL) { + goto exit; +diff --git a/blockdev-nbd.c b/blockdev-nbd.c +index 267a1de903..24ba5382db 100644 +--- a/blockdev-nbd.c ++++ b/blockdev-nbd.c +@@ -170,6 +170,10 @@ void nbd_server_start(SocketAddress *addr, const char *tls_creds, + + void nbd_server_start_options(NbdServerOptions *arg, Error **errp) + { ++ if (!arg->has_max_connections) { ++ arg->max_connections = NBD_DEFAULT_MAX_CONNECTIONS; ++ } ++ + nbd_server_start(arg->addr, arg->tls_creds, arg->tls_authz, + arg->max_connections, errp); + } +@@ -182,6 +186,10 @@ void qmp_nbd_server_start(SocketAddressLegacy *addr, + { + SocketAddress *addr_flat = socket_address_flatten(addr); + ++ if (!has_max_connections) { ++ max_connections = NBD_DEFAULT_MAX_CONNECTIONS; ++ } ++ + nbd_server_start(addr_flat, tls_creds, tls_authz, max_connections, errp); + qapi_free_SocketAddress(addr_flat); + } +diff --git a/include/block/nbd.h b/include/block/nbd.h +index 1d4d65922d..d4f8b21aec 100644 +--- a/include/block/nbd.h ++++ b/include/block/nbd.h +@@ -39,6 +39,13 @@ extern const BlockExportDriver blk_exp_nbd; + */ + #define NBD_DEFAULT_HANDSHAKE_MAX_SECS 10 + ++/* ++ * NBD_DEFAULT_MAX_CONNECTIONS: Number of client sockets to allow at ++ * once; must be large enough to allow a MULTI_CONN-aware client like ++ * nbdcopy to create its typical number of 8-16 sockets. ++ */ ++#define NBD_DEFAULT_MAX_CONNECTIONS 100 ++ + /* Handshake phase structs - this struct is passed on the wire */ + + typedef struct NBDOption { +diff --git a/qapi/block-export.json b/qapi/block-export.json +index 7874a49ba7..1d255d77e3 100644 +--- a/qapi/block-export.json ++++ b/qapi/block-export.json +@@ -28,7 +28,7 @@ + # @max-connections: The maximum number of connections to allow at the + # same time, 0 for unlimited. Setting this to 1 also stops the + # server from advertising multiple client support (since 5.2; +-# default: 0) ++# default: 100) + # + # Since: 4.2 + ## +@@ -63,7 +63,7 @@ + # @max-connections: The maximum number of connections to allow at the + # same time, 0 for unlimited. Setting this to 1 also stops the + # server from advertising multiple client support (since 5.2; +-# default: 0). ++# default: 100). + # + # Returns: error if the server is already running. 
+ # +-- +2.43.0 + diff --git a/0452-nbd-server-cve-2024-7409-drop-non-negotiating-client.patch b/0452-nbd-server-cve-2024-7409-drop-non-negotiating-client.patch new file mode 100644 index 0000000..0fc4c27 --- /dev/null +++ b/0452-nbd-server-cve-2024-7409-drop-non-negotiating-client.patch @@ -0,0 +1,124 @@ +From 31f71ad00d4ebaf58daca594af8a5a9b30f64dd9 Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Thu, 8 Aug 2024 16:05:08 -0500 +Subject: [PATCH] nbd/server: CVE-2024-7409: Drop non-negotiating clients +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +A client that opens a socket but does not negotiate is merely hogging +qemu's resources (an open fd and a small amount of memory); and a +malicious client that can access the port where NBD is listening can +attempt a denial of service attack by intentionally opening and +abandoning lots of unfinished connections. The previous patch put a +default bound on the number of such ongoing connections, but once that +limit is hit, no more clients can connect (including legitimate ones). +The solution is to insist that clients complete handshake within a +reasonable time limit, defaulting to 10 seconds. A client that has +not successfully completed NBD_OPT_GO by then (including the case of +where the client didn't know TLS credentials to even reach the point +of NBD_OPT_GO) is wasting our time and does not deserve to stay +connected. Later patches will allow fine-tuning the limit away from +the default value (including disabling it for doing integration +testing of the handshake process itself). + +Note that this patch in isolation actually makes it more likely to see +qemu SEGV after nbd-server-stop, as any client socket still connected +when the server shuts down will now be closed after 10 seconds rather +than at the client's whims. That will be addressed in the next patch. + +For a demo of this patch in action: +$ qemu-nbd -f raw -r -t -e 10 file & +$ nbdsh --opt-mode -c ' +H = list() +for i in range(20): + print(i) + H.insert(i, nbd.NBD()) + H[i].set_opt_mode(True) + H[i].connect_uri("nbd://localhost") +' +$ kill $! + +where later connections get to start progressing once earlier ones are +forcefully dropped for taking too long, rather than hanging. + +Suggested-by: Daniel P. Berrangé +Signed-off-by: Eric Blake +Message-ID: <20240807174943.771624-13-eblake@redhat.com> +Reviewed-by: Daniel P. Berrangé +[eblake: rebase to changes earlier in series, reduce scope of timer] +Signed-off-by: Eric Blake +--- + nbd/server.c | 28 +++++++++++++++++++++++++++- + nbd/trace-events | 1 + + 2 files changed, 28 insertions(+), 1 deletion(-) + +diff --git a/nbd/server.c b/nbd/server.c +index 6d9fa2a2f2..58cf2bef6a 100644 +--- a/nbd/server.c ++++ b/nbd/server.c +@@ -3153,22 +3153,48 @@ static void nbd_client_receive_next_request(NBDClient *client) + } + } + ++static void nbd_handshake_timer_cb(void *opaque) ++{ ++ QIOChannel *ioc = opaque; ++ ++ trace_nbd_handshake_timer_cb(); ++ qio_channel_shutdown(ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL); ++} ++ + static coroutine_fn void nbd_co_client_start(void *opaque) + { + NBDClient *client = opaque; + Error *local_err = NULL; ++ QEMUTimer *handshake_timer = NULL; + + qemu_co_mutex_init(&client->send_lock); + +- /* TODO - utilize client->handshake_max_secs */ ++ /* ++ * Create a timer to bound the time spent in negotiation. If the ++ * timer expires, it is likely nbd_negotiate will fail because the ++ * socket was shutdown. 
++ */ ++ if (client->handshake_max_secs > 0) { ++ handshake_timer = aio_timer_new(qemu_get_aio_context(), ++ QEMU_CLOCK_REALTIME, ++ SCALE_NS, ++ nbd_handshake_timer_cb, ++ client->sioc); ++ timer_mod(handshake_timer, ++ qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + ++ client->handshake_max_secs * NANOSECONDS_PER_SECOND); ++ } ++ + if (nbd_negotiate(client, &local_err)) { + if (local_err) { + error_report_err(local_err); + } ++ timer_free(handshake_timer); + client_close(client, false); + return; + } + ++ timer_free(handshake_timer); + WITH_QEMU_LOCK_GUARD(&client->lock) { + nbd_client_receive_next_request(client); + } +diff --git a/nbd/trace-events b/nbd/trace-events +index 00ae3216a1..cbd0a4ab7e 100644 +--- a/nbd/trace-events ++++ b/nbd/trace-events +@@ -76,6 +76,7 @@ nbd_co_receive_request_payload_received(uint64_t cookie, uint64_t len) "Payload + nbd_co_receive_ext_payload_compliance(uint64_t from, uint64_t len) "client sent non-compliant write without payload flag: from=0x%" PRIx64 ", len=0x%" PRIx64 + nbd_co_receive_align_compliance(const char *op, uint64_t from, uint64_t len, uint32_t align) "client sent non-compliant unaligned %s request: from=0x%" PRIx64 ", len=0x%" PRIx64 ", align=0x%" PRIx32 + nbd_trip(void) "Reading request" ++nbd_handshake_timer_cb(void) "client took too long to negotiate" + + # client-connection.c + nbd_connect_thread_sleep(uint64_t timeout) "timeout %" PRIu64 +-- +2.43.0 + diff --git a/0453-nbd-server-cve-2024-7409-close-stray-clients-at-serv.patch b/0453-nbd-server-cve-2024-7409-close-stray-clients-at-serv.patch new file mode 100644 index 0000000..63e4f81 --- /dev/null +++ b/0453-nbd-server-cve-2024-7409-close-stray-clients-at-serv.patch @@ -0,0 +1,162 @@ +From 16b120b6f2fb05972b004d107df1e1d9bbd0a0cf Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Wed, 7 Aug 2024 12:23:13 -0500 +Subject: [PATCH] nbd/server: CVE-2024-7409: Close stray clients at server-stop +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +A malicious client can attempt to connect to an NBD server, and then +intentionally delay progress in the handshake, including if it does +not know the TLS secrets. Although the previous two patches reduce +this behavior by capping the default max-connections parameter and +killing slow clients, they did not eliminate the possibility of a +client waiting to close the socket until after the QMP nbd-server-stop +command is executed, at which point qemu would SEGV when trying to +dereference the NULL nbd_server global which is no longer present. +This amounts to a denial of service attack. Worse, if another NBD +server is started before the malicious client disconnects, I cannot +rule out additional adverse effects when the old client interferes +with the connection count of the new server (although the most likely +is a crash due to an assertion failure when checking +nbd_server->connections > 0). + +For environments without this patch, the CVE can be mitigated by +ensuring (such as via a firewall) that only trusted clients can +connect to an NBD server. Note that using frameworks like libvirt +that ensure that TLS is used and that nbd-server-stop is not executed +while any trusted clients are still connected will only help if there +is also no possibility for an untrusted client to open a connection +but then stall on the NBD handshake. 
+ +Given the previous patches, it would be possible to guarantee that no +clients remain connected by having nbd-server-stop sleep for longer +than the default handshake deadline before finally freeing the global +nbd_server object, but that could make QMP non-responsive for a long +time. So intead, this patch fixes the problem by tracking all client +sockets opened while the server is running, and forcefully closing any +such sockets remaining without a completed handshake at the time of +nbd-server-stop, then waiting until the coroutines servicing those +sockets notice the state change. nbd-server-stop now has a second +AIO_WAIT_WHILE_UNLOCKED (the first is indirectly through the +blk_exp_close_all_type() that disconnects all clients that completed +handshakes), but forced socket shutdown is enough to progress the +coroutines and quickly tear down all clients before the server is +freed, thus finally fixing the CVE. + +This patch relies heavily on the fact that nbd/server.c guarantees +that it only calls nbd_blockdev_client_closed() from the main loop +(see the assertion in nbd_client_put() and the hoops used in +nbd_client_put_nonzero() to achieve that); if we did not have that +guarantee, we would also need a mutex protecting our accesses of the +list of connections to survive re-entrancy from independent iothreads. + +Although I did not actually try to test old builds, it looks like this +problem has existed since at least commit 862172f45c (v2.12.0, 2017) - +even back when that patch started using a QIONetListener to handle +listening on multiple sockets, nbd_server_free() was already unaware +that the nbd_blockdev_client_closed callback can be reached later by a +client thread that has not completed handshakes (and therefore the +client's socket never got added to the list closed in +nbd_export_close_all), despite that patch intentionally tearing down +the QIONetListener to prevent new clients. + +Reported-by: Alexander Ivanov +Fixes: CVE-2024-7409 +CC: qemu-stable@nongnu.org +Signed-off-by: Eric Blake +Message-ID: <20240807174943.771624-14-eblake@redhat.com> +Reviewed-by: Daniel P. 
Berrangé +--- + blockdev-nbd.c | 35 ++++++++++++++++++++++++++++++++++- + 1 file changed, 34 insertions(+), 1 deletion(-) + +diff --git a/blockdev-nbd.c b/blockdev-nbd.c +index 24ba5382db..f73409ae49 100644 +--- a/blockdev-nbd.c ++++ b/blockdev-nbd.c +@@ -21,12 +21,18 @@ + #include "io/channel-socket.h" + #include "io/net-listener.h" + ++typedef struct NBDConn { ++ QIOChannelSocket *cioc; ++ QLIST_ENTRY(NBDConn) next; ++} NBDConn; ++ + typedef struct NBDServerData { + QIONetListener *listener; + QCryptoTLSCreds *tlscreds; + char *tlsauthz; + uint32_t max_connections; + uint32_t connections; ++ QLIST_HEAD(, NBDConn) conns; + } NBDServerData; + + static NBDServerData *nbd_server; +@@ -51,6 +57,14 @@ int nbd_server_max_connections(void) + + static void nbd_blockdev_client_closed(NBDClient *client, bool ignored) + { ++ NBDConn *conn = nbd_client_owner(client); ++ ++ assert(qemu_in_main_thread() && nbd_server); ++ ++ object_unref(OBJECT(conn->cioc)); ++ QLIST_REMOVE(conn, next); ++ g_free(conn); ++ + nbd_client_put(client); + assert(nbd_server->connections > 0); + nbd_server->connections--; +@@ -60,14 +74,20 @@ static void nbd_blockdev_client_closed(NBDClient *client, bool ignored) + static void nbd_accept(QIONetListener *listener, QIOChannelSocket *cioc, + gpointer opaque) + { ++ NBDConn *conn = g_new0(NBDConn, 1); ++ ++ assert(qemu_in_main_thread() && nbd_server); + nbd_server->connections++; ++ object_ref(OBJECT(cioc)); ++ conn->cioc = cioc; ++ QLIST_INSERT_HEAD(&nbd_server->conns, conn, next); + nbd_update_server_watch(nbd_server); + + qio_channel_set_name(QIO_CHANNEL(cioc), "nbd-server"); + /* TODO - expose handshake timeout as QMP option */ + nbd_client_new(cioc, NBD_DEFAULT_HANDSHAKE_MAX_SECS, + nbd_server->tlscreds, nbd_server->tlsauthz, +- nbd_blockdev_client_closed, NULL); ++ nbd_blockdev_client_closed, conn); + } + + static void nbd_update_server_watch(NBDServerData *s) +@@ -81,12 +101,25 @@ static void nbd_update_server_watch(NBDServerData *s) + + static void nbd_server_free(NBDServerData *server) + { ++ NBDConn *conn, *tmp; ++ + if (!server) { + return; + } + ++ /* ++ * Forcefully close the listener socket, and any clients that have ++ * not yet disconnected on their own. ++ */ + qio_net_listener_disconnect(server->listener); + object_unref(OBJECT(server->listener)); ++ QLIST_FOREACH_SAFE(conn, &server->conns, next, tmp) { ++ qio_channel_shutdown(QIO_CHANNEL(conn->cioc), QIO_CHANNEL_SHUTDOWN_BOTH, ++ NULL); ++ } ++ ++ AIO_WAIT_WHILE_UNLOCKED(NULL, server->connections > 0); ++ + if (server->tlscreds) { + object_unref(OBJECT(server->tlscreds)); + } +-- +2.43.0 + diff --git a/0454-nbd-server-cve-2024-7409-avoid-use-after-free-when-c.patch b/0454-nbd-server-cve-2024-7409-avoid-use-after-free-when-c.patch new file mode 100644 index 0000000..fe233a7 --- /dev/null +++ b/0454-nbd-server-cve-2024-7409-avoid-use-after-free-when-c.patch @@ -0,0 +1,90 @@ +From 0c0d6b214a403b8c178548502112b57b41cf73bd Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Thu, 22 Aug 2024 09:35:29 -0500 +Subject: [PATCH] nbd/server: CVE-2024-7409: Avoid use-after-free when closing + server + +Commit 3e7ef738 plugged the use-after-free of the global nbd_server +object, but overlooked a use-after-free of nbd_server->listener. 
+Although this race is harder to hit, notice that our shutdown path +first drops the reference count of nbd_server->listener, then triggers +actions that can result in a pending client reaching the +nbd_blockdev_client_closed() callback, which in turn calls +qio_net_listener_set_client_func on a potentially stale object. + +If we know we don't want any more clients to connect, and have already +told the listener socket to shut down, then we should not be trying to +update the listener socket's associated function. + +Reproducer: + +> #!/usr/bin/python3 +> +> import os +> from threading import Thread +> +> def start_stop(): +> while 1: +> os.system('virsh qemu-monitor-command VM \'{"execute": "nbd-server-start", ++"arguments":{"addr":{"type":"unix","data":{"path":"/tmp/nbd-sock"}}}}\'') +> os.system('virsh qemu-monitor-command VM \'{"execute": "nbd-server-stop"}\'') +> +> def nbd_list(): +> while 1: +> os.system('/path/to/build/qemu-nbd -L -k /tmp/nbd-sock') +> +> def test(): +> sst = Thread(target=start_stop) +> sst.start() +> nlt = Thread(target=nbd_list) +> nlt.start() +> +> sst.join() +> nlt.join() +> +> test() + +Fixes: CVE-2024-7409 +Fixes: 3e7ef738c8 ("nbd/server: CVE-2024-7409: Close stray clients at server-stop") +CC: qemu-stable@nongnu.org +Reported-by: Andrey Drobyshev +Signed-off-by: Eric Blake +Message-ID: <20240822143617.800419-2-eblake@redhat.com> +Reviewed-by: Stefan Hajnoczi +--- + blockdev-nbd.c | 12 ++++++++---- + 1 file changed, 8 insertions(+), 4 deletions(-) + +diff --git a/blockdev-nbd.c b/blockdev-nbd.c +index f73409ae49..b36f41b7c5 100644 +--- a/blockdev-nbd.c ++++ b/blockdev-nbd.c +@@ -92,10 +92,13 @@ static void nbd_accept(QIONetListener *listener, QIOChannelSocket *cioc, + + static void nbd_update_server_watch(NBDServerData *s) + { +- if (!s->max_connections || s->connections < s->max_connections) { +- qio_net_listener_set_client_func(s->listener, nbd_accept, NULL, NULL); +- } else { +- qio_net_listener_set_client_func(s->listener, NULL, NULL, NULL); ++ if (s->listener) { ++ if (!s->max_connections || s->connections < s->max_connections) { ++ qio_net_listener_set_client_func(s->listener, nbd_accept, NULL, ++ NULL); ++ } else { ++ qio_net_listener_set_client_func(s->listener, NULL, NULL, NULL); ++ } + } + } + +@@ -113,6 +116,7 @@ static void nbd_server_free(NBDServerData *server) + */ + qio_net_listener_disconnect(server->listener); + object_unref(OBJECT(server->listener)); ++ server->listener = NULL; + QLIST_FOREACH_SAFE(conn, &server->conns, next, tmp) { + qio_channel_shutdown(QIO_CHANNEL(conn->cioc), QIO_CHANNEL_SHUTDOWN_BOTH, + NULL); +-- +2.43.0 + diff --git a/0455-add-rtc-acpi-table.patch b/0455-add-rtc-acpi-table.patch new file mode 100644 index 0000000..ea2e635 --- /dev/null +++ b/0455-add-rtc-acpi-table.patch @@ -0,0 +1,57 @@ +From e406951cdf23f12528e24609475d21c7fc25d3b2 Mon Sep 17 00:00:00 2001 +From: Xianglai Li +Date: Fri, 6 Jun 2025 16:34:25 +0800 +Subject: [PATCH] add rtc acpi table + +add rtc acpi table for loongarch + +Signed-off-by: Xianglai Li +--- + hw/loongarch/acpi-build.c | 24 ++++++++++++++++++++++++ + 1 file changed, 24 insertions(+) + +diff --git a/hw/loongarch/acpi-build.c b/hw/loongarch/acpi-build.c +index c291cf663e..e8be52aac6 100644 +--- a/hw/loongarch/acpi-build.c ++++ b/hw/loongarch/acpi-build.c +@@ -433,6 +433,29 @@ static void acpi_dsdt_add_tpm(Aml *scope, LoongArchVirtMachineState *vms) + } + #endif + ++static void acpi_dsdt_add_rtc(Aml *scope) ++{ ++ uint32_t rtc_irq = VIRT_RTC_IRQ; ++ Aml *dev = aml_device("RTC"); ++ ++ 
aml_append(dev, aml_name_decl("_HID", aml_string("LOON0001"))); ++ aml_append(dev, aml_name_decl("_UID", aml_int(0))); ++ ++ Aml *crs = aml_resource_template(); ++ aml_append(crs, ++ aml_qword_memory(AML_POS_DECODE, AML_MIN_FIXED, AML_MAX_FIXED, ++ AML_NON_CACHEABLE, AML_READ_WRITE, ++ 0, VIRT_RTC_REG_BASE, ++ VIRT_RTC_REG_BASE + VIRT_RTC_LEN - 1, ++ 0, VIRT_RTC_LEN)); ++ aml_append(crs, ++ aml_interrupt(AML_CONSUMER, AML_EDGE, AML_ACTIVE_HIGH, ++ AML_EXCLUSIVE, &rtc_irq, 1)); ++ ++ aml_append(dev, aml_name_decl("_CRS", crs)); ++ aml_append(scope, dev); ++} ++ + /* build DSDT */ + static void + build_dsdt(GArray *table_data, BIOSLinker *linker, MachineState *machine) +@@ -447,6 +470,7 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, MachineState *machine) + dsdt = init_aml_allocator(); + for (i = 0; i < VIRT_UART_COUNT; i++) + build_uart_device_aml(dsdt, i); ++ acpi_dsdt_add_rtc(dsdt); + build_pci_device_aml(dsdt, lvms); + build_la_ged_aml(dsdt, machine); + build_flash_aml(dsdt, lvms); +-- +2.43.0 + diff --git a/0456-virtio-net-ensure-queue-index-fits-with-rss.patch b/0456-virtio-net-ensure-queue-index-fits-with-rss.patch new file mode 100644 index 0000000..352807d --- /dev/null +++ b/0456-virtio-net-ensure-queue-index-fits-with-rss.patch @@ -0,0 +1,36 @@ +From a2cdd36f307140ea9bfb16a35adfe83bf087bcab Mon Sep 17 00:00:00 2001 +From: Akihiko Odaki +Date: Mon, 1 Jul 2024 20:58:04 +0900 +Subject: [PATCH] virtio-net: Ensure queue index fits with RSS + +Ensure the queue index points to a valid queue when software RSS +enabled. The new calculation matches with the behavior of Linux's TAP +device with the RSS eBPF program. + +Fixes: 4474e37a5b3a ("virtio-net: implement RX RSS processing") +Reported-by: Zhibin Hu +Cc: qemu-stable@nongnu.org +Signed-off-by: Akihiko Odaki +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Jason Wang +--- + hw/net/virtio-net.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c +index b6574f941a..69092f7c6c 100644 +--- a/hw/net/virtio-net.c ++++ b/hw/net/virtio-net.c +@@ -1913,7 +1913,8 @@ static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf, + if (!no_rss && n->rss_data.enabled && n->rss_data.enabled_software_rss) { + int index = virtio_net_process_rss(nc, buf, size); + if (index >= 0) { +- NetClientState *nc2 = qemu_get_subqueue(n->nic, index); ++ NetClientState *nc2 = ++ qemu_get_subqueue(n->nic, index % n->curr_queue_pairs); + return virtio_net_receive_rcu(nc2, buf, size, true); + } + } +-- +2.43.0 + diff --git a/0457-hw-virtio-introduce-virtio-bh-new-guarded-helper.patch b/0457-hw-virtio-introduce-virtio-bh-new-guarded-helper.patch new file mode 100644 index 0000000..fbb7477 --- /dev/null +++ b/0457-hw-virtio-introduce-virtio-bh-new-guarded-helper.patch @@ -0,0 +1,67 @@ +From 6a01934e61899c9978ea900af4ff1dd071b17ae5 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= +Date: Thu, 4 Apr 2024 20:56:11 +0200 +Subject: [PATCH] hw/virtio: Introduce virtio_bh_new_guarded() helper +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Introduce virtio_bh_new_guarded(), similar to qemu_bh_new_guarded() +but using the transport memory guard, instead of the device one +(there can only be one virtio device per virtio bus). + +Inspired-by: Gerd Hoffmann +Reviewed-by: Gerd Hoffmann +Acked-by: Michael S. Tsirkin +Signed-off-by: Philippe Mathieu-Daudé +Reviewed-by: Michael S. 
Tsirkin +Message-Id: <20240409105537.18308-2-philmd@linaro.org> +--- + hw/virtio/virtio.c | 10 ++++++++++ + include/hw/virtio/virtio.h | 7 +++++++ + 2 files changed, 17 insertions(+) + +diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c +index 09e2de60c1..d64835188b 100644 +--- a/hw/virtio/virtio.c ++++ b/hw/virtio/virtio.c +@@ -4193,3 +4193,13 @@ static void virtio_register_types(void) + } + + type_init(virtio_register_types) ++ ++QEMUBH *virtio_bh_new_guarded_full(DeviceState *dev, ++ QEMUBHFunc *cb, void *opaque, ++ const char *name) ++{ ++ DeviceState *transport = qdev_get_parent_bus(dev)->parent; ++ ++ return qemu_bh_new_full(cb, opaque, name, ++ &transport->mem_reentrancy_guard); ++} +diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h +index d2f4ed160d..7502e309d0 100644 +--- a/include/hw/virtio/virtio.h ++++ b/include/hw/virtio/virtio.h +@@ -22,6 +22,7 @@ + #include "standard-headers/linux/virtio_config.h" + #include "standard-headers/linux/virtio_ring.h" + #include "qom/object.h" ++#include "block/aio.h" + + /* + * A guest should never accept this. It implies negotiation is broken +@@ -520,4 +521,10 @@ static inline bool virtio_device_disabled(VirtIODevice *vdev) + bool virtio_legacy_allowed(VirtIODevice *vdev); + bool virtio_legacy_check_disabled(VirtIODevice *vdev); + ++QEMUBH *virtio_bh_new_guarded_full(DeviceState *dev, ++ QEMUBHFunc *cb, void *opaque, ++ const char *name); ++#define virtio_bh_new_guarded(dev, cb, opaque) \ ++ virtio_bh_new_guarded_full((dev), (cb), (opaque), (stringify(cb))) ++ + #endif +-- +2.43.0 + diff --git a/0458-hw-display-virtio-gpu-protect-from-dma-re-entrancy-b.patch b/0458-hw-display-virtio-gpu-protect-from-dma-re-entrancy-b.patch new file mode 100644 index 0000000..233cf15 --- /dev/null +++ b/0458-hw-display-virtio-gpu-protect-from-dma-re-entrancy-b.patch @@ -0,0 +1,141 @@ +From e702e2ae6e5bc77f3f161cfa94721e95a8a9b7b0 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= +Date: Thu, 4 Apr 2024 20:56:27 +0200 +Subject: [PATCH] hw/display/virtio-gpu: Protect from DMA re-entrancy bugs +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Replace qemu_bh_new_guarded() by virtio_bh_new_guarded() +so the bus and device use the same guard. Otherwise the +DMA-reentrancy protection can be bypassed: + + $ cat << EOF | qemu-system-i386 -display none -nodefaults \ + -machine q35,accel=qtest \ + -m 512M \ + -device virtio-gpu \ + -qtest stdio + outl 0xcf8 0x80000820 + outl 0xcfc 0xe0004000 + outl 0xcf8 0x80000804 + outw 0xcfc 0x06 + write 0xe0004030 0x4 0x024000e0 + write 0xe0004028 0x1 0xff + write 0xe0004020 0x4 0x00009300 + write 0xe000401c 0x1 0x01 + write 0x101 0x1 0x04 + write 0x103 0x1 0x1c + write 0x9301c8 0x1 0x18 + write 0x105 0x1 0x1c + write 0x107 0x1 0x1c + write 0x109 0x1 0x1c + write 0x10b 0x1 0x00 + write 0x10d 0x1 0x00 + write 0x10f 0x1 0x00 + write 0x111 0x1 0x00 + write 0x113 0x1 0x00 + write 0x115 0x1 0x00 + write 0x117 0x1 0x00 + write 0x119 0x1 0x00 + write 0x11b 0x1 0x00 + write 0x11d 0x1 0x00 + write 0x11f 0x1 0x00 + write 0x121 0x1 0x00 + write 0x123 0x1 0x00 + write 0x125 0x1 0x00 + write 0x127 0x1 0x00 + write 0x129 0x1 0x00 + write 0x12b 0x1 0x00 + write 0x12d 0x1 0x00 + write 0x12f 0x1 0x00 + write 0x131 0x1 0x00 + write 0x133 0x1 0x00 + write 0x135 0x1 0x00 + write 0x137 0x1 0x00 + write 0x139 0x1 0x00 + write 0xe0007003 0x1 0x00 + EOF + ... 
+ ================================================================= + ==276099==ERROR: AddressSanitizer: heap-use-after-free on address 0x60d000011178 + at pc 0x562cc3b736c7 bp 0x7ffed49dee60 sp 0x7ffed49dee58 + READ of size 8 at 0x60d000011178 thread T0 + #0 0x562cc3b736c6 in virtio_gpu_ctrl_response hw/display/virtio-gpu.c:180:42 + #1 0x562cc3b7c40b in virtio_gpu_ctrl_response_nodata hw/display/virtio-gpu.c:192:5 + #2 0x562cc3b7c40b in virtio_gpu_simple_process_cmd hw/display/virtio-gpu.c:1015:13 + #3 0x562cc3b82873 in virtio_gpu_process_cmdq hw/display/virtio-gpu.c:1050:9 + #4 0x562cc4a85514 in aio_bh_call util/async.c:169:5 + #5 0x562cc4a85c52 in aio_bh_poll util/async.c:216:13 + #6 0x562cc4a1a79b in aio_dispatch util/aio-posix.c:423:5 + #7 0x562cc4a8a2da in aio_ctx_dispatch util/async.c:358:5 + #8 0x7f36840547a8 in g_main_context_dispatch (/lib/x86_64-linux-gnu/libglib-2.0.so.0+0x547a8) + #9 0x562cc4a8b753 in glib_pollfds_poll util/main-loop.c:290:9 + #10 0x562cc4a8b753 in os_host_main_loop_wait util/main-loop.c:313:5 + #11 0x562cc4a8b753 in main_loop_wait util/main-loop.c:592:11 + #12 0x562cc3938186 in qemu_main_loop system/runstate.c:782:9 + #13 0x562cc43b7af5 in qemu_default_main system/main.c:37:14 + #14 0x7f3683a6c189 in __libc_start_call_main csu/../sysdeps/nptl/libc_start_call_main.h:58:16 + #15 0x7f3683a6c244 in __libc_start_main csu/../csu/libc-start.c:381:3 + #16 0x562cc2a58ac0 in _start (qemu-system-i386+0x231bac0) + + 0x60d000011178 is located 56 bytes inside of 136-byte region [0x60d000011140,0x60d0000111c8) + freed by thread T0 here: + #0 0x562cc2adb662 in __interceptor_free (qemu-system-i386+0x239e662) + #1 0x562cc3b86b21 in virtio_gpu_reset hw/display/virtio-gpu.c:1524:9 + #2 0x562cc416e20e in virtio_reset hw/virtio/virtio.c:2145:9 + #3 0x562cc37c5644 in virtio_pci_reset hw/virtio/virtio-pci.c:2249:5 + #4 0x562cc4233758 in memory_region_write_accessor system/memory.c:497:5 + #5 0x562cc4232eea in access_with_adjusted_size system/memory.c:573:18 + + previously allocated by thread T0 here: + #0 0x562cc2adb90e in malloc (qemu-system-i386+0x239e90e) + #1 0x7f368405a678 in g_malloc (/lib/x86_64-linux-gnu/libglib-2.0.so.0+0x5a678) + #2 0x562cc4163ffc in virtqueue_split_pop hw/virtio/virtio.c:1612:12 + #3 0x562cc4163ffc in virtqueue_pop hw/virtio/virtio.c:1783:16 + #4 0x562cc3b91a95 in virtio_gpu_handle_ctrl hw/display/virtio-gpu.c:1112:15 + #5 0x562cc4a85514 in aio_bh_call util/async.c:169:5 + #6 0x562cc4a85c52 in aio_bh_poll util/async.c:216:13 + #7 0x562cc4a1a79b in aio_dispatch util/aio-posix.c:423:5 + + SUMMARY: AddressSanitizer: heap-use-after-free hw/display/virtio-gpu.c:180:42 in virtio_gpu_ctrl_response + +With this change, the same reproducer triggers: + + qemu-system-i386: warning: Blocked re-entrant IO on MemoryRegion: virtio-pci-common-virtio-gpu at addr: 0x6 + +Fixes: CVE-2024-3446 +Cc: qemu-stable@nongnu.org +Reported-by: Alexander Bulekov +Reported-by: Yongkang Jia +Reported-by: Xiao Lei +Reported-by: Yiming Tao +Buglink: https://bugs.launchpad.net/qemu/+bug/1888606 +Reviewed-by: Gerd Hoffmann +Acked-by: Michael S. Tsirkin +Signed-off-by: Philippe Mathieu-Daudé +Reviewed-by: Michael S. 
Tsirkin +Message-Id: <20240409105537.18308-3-philmd@linaro.org> +--- + hw/display/virtio-gpu.c | 6 ++---- + 1 file changed, 2 insertions(+), 4 deletions(-) + +diff --git a/hw/display/virtio-gpu.c b/hw/display/virtio-gpu.c +index d4f946eb76..746160a175 100644 +--- a/hw/display/virtio-gpu.c ++++ b/hw/display/virtio-gpu.c +@@ -1453,10 +1453,8 @@ void virtio_gpu_device_realize(DeviceState *qdev, Error **errp) + + g->ctrl_vq = virtio_get_queue(vdev, 0); + g->cursor_vq = virtio_get_queue(vdev, 1); +- g->ctrl_bh = qemu_bh_new_guarded(virtio_gpu_ctrl_bh, g, +- &qdev->mem_reentrancy_guard); +- g->cursor_bh = qemu_bh_new_guarded(virtio_gpu_cursor_bh, g, +- &qdev->mem_reentrancy_guard); ++ g->ctrl_bh = virtio_bh_new_guarded(qdev, virtio_gpu_ctrl_bh, g); ++ g->cursor_bh = virtio_bh_new_guarded(qdev, virtio_gpu_cursor_bh, g); + g->reset_bh = qemu_bh_new(virtio_gpu_reset_bh, g); + qemu_cond_init(&g->reset_cond); + QTAILQ_INIT(&g->reslist); +-- +2.43.0 + diff --git a/0459-hw-char-virtio-serial-bus-protect-from-dma-re-entran.patch b/0459-hw-char-virtio-serial-bus-protect-from-dma-re-entran.patch new file mode 100644 index 0000000..b1e2239 --- /dev/null +++ b/0459-hw-char-virtio-serial-bus-protect-from-dma-re-entran.patch @@ -0,0 +1,41 @@ +From 628560a5aae285506a5f588af56fd3599052ebcb Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= +Date: Thu, 4 Apr 2024 20:56:35 +0200 +Subject: [PATCH] hw/char/virtio-serial-bus: Protect from DMA re-entrancy bugs +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Replace qemu_bh_new_guarded() by virtio_bh_new_guarded() +so the bus and device use the same guard. Otherwise the +DMA-reentrancy protection can be bypassed. + +Fixes: CVE-2024-3446 +Cc: qemu-stable@nongnu.org +Suggested-by: Alexander Bulekov +Reviewed-by: Gerd Hoffmann +Acked-by: Michael S. Tsirkin +Signed-off-by: Philippe Mathieu-Daudé +Reviewed-by: Michael S. Tsirkin +Message-Id: <20240409105537.18308-4-philmd@linaro.org> +--- + hw/char/virtio-serial-bus.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/hw/char/virtio-serial-bus.c b/hw/char/virtio-serial-bus.c +index dd619f0731..1221fb7f15 100644 +--- a/hw/char/virtio-serial-bus.c ++++ b/hw/char/virtio-serial-bus.c +@@ -985,8 +985,7 @@ static void virtser_port_device_realize(DeviceState *dev, Error **errp) + return; + } + +- port->bh = qemu_bh_new_guarded(flush_queued_data_bh, port, +- &dev->mem_reentrancy_guard); ++ port->bh = virtio_bh_new_guarded(dev, flush_queued_data_bh, port); + port->elem = NULL; + } + +-- +2.43.0 + diff --git a/0460-hw-virtio-virtio-crypto-protect-from-dma-re-entrancy.patch b/0460-hw-virtio-virtio-crypto-protect-from-dma-re-entrancy.patch new file mode 100644 index 0000000..c60b93f --- /dev/null +++ b/0460-hw-virtio-virtio-crypto-protect-from-dma-re-entrancy.patch @@ -0,0 +1,42 @@ +From 70af19ebffa8be4af3c420031fceb966085d7c9b Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= +Date: Thu, 4 Apr 2024 20:56:41 +0200 +Subject: [PATCH] hw/virtio/virtio-crypto: Protect from DMA re-entrancy bugs +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Replace qemu_bh_new_guarded() by virtio_bh_new_guarded() +so the bus and device use the same guard. Otherwise the +DMA-reentrancy protection can be bypassed. + +Fixes: CVE-2024-3446 +Cc: qemu-stable@nongnu.org +Suggested-by: Alexander Bulekov +Reviewed-by: Gerd Hoffmann +Acked-by: Michael S. 
Tsirkin +Signed-off-by: Philippe Mathieu-Daudé +Reviewed-by: Michael S. Tsirkin +Message-Id: <20240409105537.18308-5-philmd@linaro.org> +--- + hw/virtio/virtio-crypto.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/hw/virtio/virtio-crypto.c b/hw/virtio/virtio-crypto.c +index 0e2cc8d5a8..4aaced74be 100644 +--- a/hw/virtio/virtio-crypto.c ++++ b/hw/virtio/virtio-crypto.c +@@ -1080,8 +1080,8 @@ static void virtio_crypto_device_realize(DeviceState *dev, Error **errp) + vcrypto->vqs[i].dataq = + virtio_add_queue(vdev, 1024, virtio_crypto_handle_dataq_bh); + vcrypto->vqs[i].dataq_bh = +- qemu_bh_new_guarded(virtio_crypto_dataq_bh, &vcrypto->vqs[i], +- &dev->mem_reentrancy_guard); ++ virtio_bh_new_guarded(dev, virtio_crypto_dataq_bh, ++ &vcrypto->vqs[i]); + vcrypto->vqs[i].vcrypto = vcrypto; + } + +-- +2.43.0 + diff --git a/qemu.spec b/qemu.spec index 034d6c7..0b7f475 100644 --- a/qemu.spec +++ b/qemu.spec @@ -1,4 +1,4 @@ -%define anolis_release 30 +%define anolis_release 31 %bcond_with check %global all_system_emu_support 0 @@ -640,6 +640,112 @@ Patch0351: 0351-smbios-function-to-set-default-processor-family.patch Patch0352: 0352-target-riscv-smbios-support-for-risc-v-virt-machine.patch Patch0353: 0353-qemu-options-enable-smbios-option-on-risc-v.patch Patch0354: 0354-kvm-use-configs-definition-to-conditionalize-debug-s.patch +Patch0355: 0355-migration-multifd-fix-error-message-in-multifd-recv-.patch +Patch0356: 0356-migration-multifd-simplify-multifd-channel-connect-i.patch +Patch0357: 0357-migration-multifd-fix-leaking-of-error-in-tls-error-.patch +Patch0358: 0358-migration-multifd-remove-error-setg-in-migration-ioc.patch +Patch0359: 0359-migration-fix-migration-channel-read-peek-error-path.patch +Patch0360: 0360-migration-multifd-remove-unnecessary-usage-of-local-.patch +Patch0361: 0361-migration-multifd-remove-multifdpages-t-packet-num.patch +Patch0362: 0362-migration-multifd-remove-qemufile-from-where-it-is-n.patch +Patch0363: 0363-migration-multifd-change-multifd-pages-init-argument.patch +Patch0364: 0364-migration-report-error-in-incoming-migration.patch +Patch0365: 0365-tests-qtest-migration-print-migration-incoming-error.patch +Patch0366: 0366-tests-qtest-migration-add-a-wrapper-to-print-test-na.patch +Patch0367: 0367-tests-qtest-migration-use-the-new-migration-test-add.patch +Patch0368: 0368-tests-qtest-re-enable-multifd-cancel-test.patch +Patch0369: 0369-docs-migration-create-migration-directory.patch +Patch0370: 0370-docs-migration-create-index-page.patch +Patch0371: 0371-docs-migration-convert-virtio-txt-into-rst.patch +Patch0372: 0372-docs-migration-split-backwards-compatibility-separat.patch +Patch0373: 0373-docs-migration-split-debugging-and-firmware.patch +Patch0374: 0374-docs-migration-split-postcopy.patch +Patch0375: 0375-docs-migration-split-dirty-limit.patch +Patch0376: 0376-docs-migration-organize-postcopy-page.patch +Patch0377: 0377-docs-migration-further-move-vfio-to-be-feature-of-mi.patch +Patch0378: 0378-docs-migration-further-move-virtio-to-be-feature-of-.patch +Patch0379: 0379-migration-multifd-drop-stale-comment-for-multifd-zer.patch +Patch0380: 0380-migration-multifd-multifd-send-kick-main.patch +Patch0381: 0381-migration-multifd-drop-multifdsendparams-quit-cleanu.patch +Patch0382: 0382-migration-multifd-postpone-reset-of-multifdpages-t.patch +Patch0383: 0383-migration-multifd-drop-multifdsendparams-normal-arra.patch +Patch0384: 0384-migration-multifd-separate-sync-request-with-normal-.patch +Patch0385: 
0385-migration-multifd-simplify-locking-in-sender-thread.patch +Patch0386: 0386-migration-multifd-drop-pages-num-check-in-sender-thr.patch +Patch0387: 0387-migration-multifd-rename-p-num-packets-and-clean-it-.patch +Patch0388: 0388-migration-multifd-move-total-normal-pages-accounting.patch +Patch0389: 0389-migration-multifd-move-trace-multifd-send-recv.patch +Patch0390: 0390-migration-multifd-multifd-send-prepare-header.patch +Patch0391: 0391-migration-multifd-move-header-prepare-fill-into-send.patch +Patch0392: 0392-migration-multifd-forbid-spurious-wakeups.patch +Patch0393: 0393-migration-multifd-split-multifd-send-terminate-threa.patch +Patch0394: 0394-migration-multifd-change-retval-of-multifd-queue-pag.patch +Patch0395: 0395-migration-multifd-change-retval-of-multifd-send-page.patch +Patch0396: 0396-migration-multifd-rewrite-multifd-queue-page.patch +Patch0397: 0397-migration-multifd-cleanup-multifd-save-cleanup.patch +Patch0398: 0398-migration-multifd-cleanup-multifd-load-cleanup.patch +Patch0399: 0399-migration-multifd-stick-with-send-recv-on-function-n.patch +Patch0400: 0400-migration-multifd-fix-multifdsendparams-packet-num-r.patch +Patch0401: 0401-migration-multifd-optimize-sender-side-to-be-lockles.patch +Patch0402: 0402-migration-multifd-join-the-tls-thread.patch +Patch0403: 0403-migration-multifd-remove-p-running.patch +Patch0404: 0404-migration-multifd-move-multifd-send-setup-error-hand.patch +Patch0405: 0405-migration-multifd-move-multifd-send-setup-into-migra.patch +Patch0406: 0406-migration-multifd-unify-multifd-and-tls-connection-p.patch +Patch0407: 0407-migration-multifd-add-a-synchronization-point-for-ch.patch +Patch0408: 0408-migration-multifd-remove-p-quit-from-recv-side.patch +Patch0409: 0409-migration-multifd-release-recv-sem-sync-earlier.patch +Patch0410: 0410-migration-multifd-cleanup-tls-iochannel-referencing.patch +Patch0411: 0411-migration-multifd-drop-registered-yank.patch +Patch0412: 0412-migration-multifd-make-multifd-channel-connect-retur.patch +Patch0413: 0413-migration-multifd-cleanup-outgoing-args-in-state-des.patch +Patch0414: 0414-migration-multifd-drop-unnecessary-helper-to-destroy.patch +Patch0415: 0415-migration-multifd-cleanup-multifd-recv-sync-main.patch +Patch0416: 0416-migration-multifd-rename-multifdsend-recvparams-data.patch +Patch0417: 0417-migration-multifd-decouple-recv-method-from-pages.patch +Patch0418: 0418-migration-multifd-allow-multifd-without-packets.patch +Patch0419: 0419-migration-multifd-add-new-migration-option-zero-page.patch +Patch0420: 0420-migration-multifd-implement-zero-page-transmission-o.patch +Patch0421: 0421-migration-multifd-implement-ram-save-target-page-mul.patch +Patch0422: 0422-migration-multifd-solve-zero-page-causing-multiple-p.patch +Patch0423: 0423-docs-migration-add-qpl-compression-feature.patch +Patch0424: 0424-migration-multifd-put-iov-initialization-into-compre.patch +Patch0425: 0425-configure-add-enable-qpl-build-option.patch +Patch0426: 0426-migration-multifd-add-qpl-compression-method.patch +Patch0427: 0427-migration-multifd-include-ram-h-in-multifd-h.patch +Patch0428: 0428-migration-multifd-implement-initialization-of-qpl-co.patch +Patch0429: 0429-migration-multifd-implement-qpl-compression-and-deco.patch +Patch0430: 0430-tests-migration-test-add-qpl-compression-test.patch +Patch0431: 0431-migration-properly-apply-migration-compression-level.patch +Patch0432: 0432-tests-migration-set-compression-level-in-migration-t.patch +Patch0433: 0433-docs-migration-add-qatzip-compression-feature.patch +Patch0434: 
0434-meson-introduce-qatzip-feature-to-the-build-system.patch +Patch0435: 0435-migration-add-migration-parameters-for-qatzip.patch +Patch0436: 0436-migration-introduce-qatzip-compression-method.patch +Patch0437: 0437-tests-migration-add-integration-test-for-qatzip-comp.patch +Patch0438: 0438-migration-multifd-fix-rb-receivedmap-cleanup-race.patch +Patch0439: 0439-migration-multifd-fix-loop-conditions-in-multifd-zst.patch +Patch0440: 0440-migration-multifd-ensure-packet-ramblock-is-null-ter.patch +Patch0441: 0441-migration-multifd-zero-p-flags-before-starting-filli.patch +Patch0442: 0442-multifd-bugfix-for-migration-using-compression-metho.patch +Patch0443: 0443-multifd-bugfix-for-incorrect-migration-data-with-qpl.patch +Patch0444: 0444-multifd-bugfix-for-incorrect-migration-data-with-qat.patch +Patch0445: 0445-hw-audio-virtio-snd-fix-invalid-param-check.patch +Patch0446: 0446-nbd-server-avoid-per-nbdrequest-nbd-client-get-put.patch +Patch0447: 0447-nbd-server-only-traverse-nbdexport-clients-from-main.patch +Patch0448: 0448-nbd-server-introduce-nbdclient-lock-to-protect-field.patch +Patch0449: 0449-nbd-minor-style-and-typo-fixes.patch +Patch0450: 0450-nbd-server-plumb-in-new-args-to-nbd-client-add.patch +Patch0451: 0451-nbd-server-cve-2024-7409-cap-default-max-connections.patch +Patch0452: 0452-nbd-server-cve-2024-7409-drop-non-negotiating-client.patch +Patch0453: 0453-nbd-server-cve-2024-7409-close-stray-clients-at-serv.patch +Patch0454: 0454-nbd-server-cve-2024-7409-avoid-use-after-free-when-c.patch +Patch0455: 0455-add-rtc-acpi-table.patch +Patch0456: 0456-virtio-net-ensure-queue-index-fits-with-rss.patch +Patch0457: 0457-hw-virtio-introduce-virtio-bh-new-guarded-helper.patch +Patch0458: 0458-hw-display-virtio-gpu-protect-from-dma-re-entrancy-b.patch +Patch0459: 0459-hw-char-virtio-serial-bus-protect-from-dma-re-entran.patch +Patch0460: 0460-hw-virtio-virtio-crypto-protect-from-dma-re-entrancy.patch ExclusiveArch: x86_64 aarch64 loongarch64 riscv64 @@ -2204,6 +2310,13 @@ useradd -r -u 107 -g qemu -G kvm -d / -s /sbin/nologin \ %endif %changelog +* Tue Jul 15 2025 wh02252983 - 2:8.2.0-31 +- virtio-net: Ensure queue index fits with RSS +- hw/virtio: Introduce virtio_bh_new_guarded() helper +- hw/display/virtio-gpu: Protect from DMA re-entrancy bugs +- hw/char/virtio-serial-bus: Protect from DMA re-entrancy bugs +- hw/virtio/virtio-crypto: Protect from DMA re-entrancy bugs + * Sat Jun 14 2025 Chang Gao - 2:8.2.0-30 - add patch to fix riscv build failure -- Gitee From 059f6e6b6f729346a3c6794e3d1e574afc9e82c1 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 15 Aug 2025 10:07:00 +0800 Subject: [PATCH 2/3] Sync upstream changes --- 0461-linux-headers-riscv-add-ptrace-h.patch | 159 ++++++++++++ ...ders-fix-forwarding-to-asm-generic-h.patch | 53 ++++ ...ders-move-pvpanic-h-to-correct-direc.patch | 86 +++++++ ...aders-import-linux-kvm-para-h-header.patch | 167 ++++++++++++ ...meson-fix-type-of-relocatable-option.patch | 29 +++ 0466-makefile-clean-qemu-iotests-output.patch | 39 +++ ...onfigure-remove-unnecessary-subshell.patch | 47 ++++ ...again-the-case-arms-in-probe-target-.patch | 165 ++++++++++++ ...dd-more-sections-to-main-meson-build.patch | 75 ++++++ 0470-meson-move-program-checks-together.patch | 158 ++++++++++++ ...eson-move-option-validation-together.patch | 191 ++++++++++++++ ...celerator-dependency-checks-together.patch | 241 ++++++++++++++++++ 0473-meson-keep-subprojects-together.patch | 134 ++++++++++ ...etection-code-with-other-compiler-fl.patch | 117 +++++++++ 
...e-config-host-h-definitions-together.patch | 86 +++++++ ...e-subdirs-to-collect-sources-section.patch | 101 ++++++++ ...be-u2f-and-canokey-if-the-option-is-.patch | 34 +++ ...t-fix-overrun-in-update-sctp-checksu.patch | 71 ++++++ ...l-iov-do-not-assert-offset-is-in-iov.patch | 110 ++++++++ ...t-tx-pkt-fix-overrun-in-update-sctp-.patch | 42 +++ qemu.spec | 29 ++- 21 files changed, 2133 insertions(+), 1 deletion(-) create mode 100644 0461-linux-headers-riscv-add-ptrace-h.patch create mode 100644 0462-update-linux-headers-fix-forwarding-to-asm-generic-h.patch create mode 100644 0463-update-linux-headers-move-pvpanic-h-to-correct-direc.patch create mode 100644 0464-update-linux-headers-import-linux-kvm-para-h-header.patch create mode 100644 0465-meson-fix-type-of-relocatable-option.patch create mode 100644 0466-makefile-clean-qemu-iotests-output.patch create mode 100644 0467-configure-remove-unnecessary-subshell.patch create mode 100644 0468-configure-unify-again-the-case-arms-in-probe-target-.patch create mode 100644 0469-meson-add-more-sections-to-main-meson-build.patch create mode 100644 0470-meson-move-program-checks-together.patch create mode 100644 0471-meson-move-option-validation-together.patch create mode 100644 0472-meson-move-accelerator-dependency-checks-together.patch create mode 100644 0473-meson-keep-subprojects-together.patch create mode 100644 0474-meson-move-cfi-detection-code-with-other-compiler-fl.patch create mode 100644 0475-meson-move-config-host-h-definitions-together.patch create mode 100644 0476-meson-move-subdirs-to-collect-sources-section.patch create mode 100644 0477-meson-always-probe-u2f-and-canokey-if-the-option-is-.patch create mode 100644 0478-hw-net-net-tx-pkt-fix-overrun-in-update-sctp-checksu.patch create mode 100644 0479-util-iov-do-not-assert-offset-is-in-iov.patch create mode 100644 0480-revert-hw-net-net-tx-pkt-fix-overrun-in-update-sctp-.patch diff --git a/0461-linux-headers-riscv-add-ptrace-h.patch b/0461-linux-headers-riscv-add-ptrace-h.patch new file mode 100644 index 0000000..0b8f628 --- /dev/null +++ b/0461-linux-headers-riscv-add-ptrace-h.patch @@ -0,0 +1,159 @@ +From f79fc277fc4c9a3e504c4318c473fee8f90c1494 Mon Sep 17 00:00:00 2001 +From: Daniel Henrique Barboza +Date: Mon, 18 Dec 2023 17:43:19 -0300 +Subject: [PATCH] linux-headers: riscv: add ptrace.h + +commit 1583ca8aa61e1648d1f340c9a6ae3cd7ba3a82ae upstream. + +KVM vector support for RISC-V requires the linux-header ptrace.h. + +Signed-off-by: Daniel Henrique Barboza +Acked-by: Alistair Francis +Message-ID: <20231218204321.75757-3-dbarboza@ventanamicro.com> +Signed-off-by: Alistair Francis +--- + linux-headers/asm-riscv/ptrace.h | 132 +++++++++++++++++++++++++++++++ + 1 file changed, 132 insertions(+) + create mode 100644 linux-headers/asm-riscv/ptrace.h + +diff --git a/linux-headers/asm-riscv/ptrace.h b/linux-headers/asm-riscv/ptrace.h +new file mode 100644 +index 0000000000..1e3166caca +--- /dev/null ++++ b/linux-headers/asm-riscv/ptrace.h +@@ -0,0 +1,132 @@ ++/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */ ++/* ++ * Copyright (C) 2012 Regents of the University of California ++ */ ++ ++#ifndef _ASM_RISCV_PTRACE_H ++#define _ASM_RISCV_PTRACE_H ++ ++#ifndef __ASSEMBLY__ ++ ++#include ++ ++#define PTRACE_GETFDPIC 33 ++ ++#define PTRACE_GETFDPIC_EXEC 0 ++#define PTRACE_GETFDPIC_INTERP 1 ++ ++/* ++ * User-mode register state for core dumps, ptrace, sigcontext ++ * ++ * This decouples struct pt_regs from the userspace ABI. 
++ * struct user_regs_struct must form a prefix of struct pt_regs. ++ */ ++struct user_regs_struct { ++ unsigned long pc; ++ unsigned long ra; ++ unsigned long sp; ++ unsigned long gp; ++ unsigned long tp; ++ unsigned long t0; ++ unsigned long t1; ++ unsigned long t2; ++ unsigned long s0; ++ unsigned long s1; ++ unsigned long a0; ++ unsigned long a1; ++ unsigned long a2; ++ unsigned long a3; ++ unsigned long a4; ++ unsigned long a5; ++ unsigned long a6; ++ unsigned long a7; ++ unsigned long s2; ++ unsigned long s3; ++ unsigned long s4; ++ unsigned long s5; ++ unsigned long s6; ++ unsigned long s7; ++ unsigned long s8; ++ unsigned long s9; ++ unsigned long s10; ++ unsigned long s11; ++ unsigned long t3; ++ unsigned long t4; ++ unsigned long t5; ++ unsigned long t6; ++}; ++ ++struct __riscv_f_ext_state { ++ __u32 f[32]; ++ __u32 fcsr; ++}; ++ ++struct __riscv_d_ext_state { ++ __u64 f[32]; ++ __u32 fcsr; ++}; ++ ++struct __riscv_q_ext_state { ++ __u64 f[64] __attribute__((aligned(16))); ++ __u32 fcsr; ++ /* ++ * Reserved for expansion of sigcontext structure. Currently zeroed ++ * upon signal, and must be zero upon sigreturn. ++ */ ++ __u32 reserved[3]; ++}; ++ ++struct __riscv_ctx_hdr { ++ __u32 magic; ++ __u32 size; ++}; ++ ++struct __riscv_extra_ext_header { ++ __u32 __padding[129] __attribute__((aligned(16))); ++ /* ++ * Reserved for expansion of sigcontext structure. Currently zeroed ++ * upon signal, and must be zero upon sigreturn. ++ */ ++ __u32 reserved; ++ struct __riscv_ctx_hdr hdr; ++}; ++ ++union __riscv_fp_state { ++ struct __riscv_f_ext_state f; ++ struct __riscv_d_ext_state d; ++ struct __riscv_q_ext_state q; ++}; ++ ++struct __riscv_v_ext_state { ++ unsigned long vstart; ++ unsigned long vl; ++ unsigned long vtype; ++ unsigned long vcsr; ++ unsigned long vlenb; ++ void *datap; ++ /* ++ * In signal handler, datap will be set a correct user stack offset ++ * and vector registers will be copied to the address of datap ++ * pointer. ++ */ ++}; ++ ++struct __riscv_v_regset_state { ++ unsigned long vstart; ++ unsigned long vl; ++ unsigned long vtype; ++ unsigned long vcsr; ++ unsigned long vlenb; ++ char vreg[]; ++}; ++ ++/* ++ * According to spec: The number of bits in a single vector register, ++ * VLEN >= ELEN, which must be a power of 2, and must be no greater than ++ * 2^16 = 65536bits = 8192bytes ++ */ ++#define RISCV_MAX_VLENB (8192) ++ ++#endif /* __ASSEMBLY__ */ ++ ++#endif /* _ASM_RISCV_PTRACE_H */ +-- +2.39.3 + diff --git a/0462-update-linux-headers-fix-forwarding-to-asm-generic-h.patch b/0462-update-linux-headers-fix-forwarding-to-asm-generic-h.patch new file mode 100644 index 0000000..672341e --- /dev/null +++ b/0462-update-linux-headers-fix-forwarding-to-asm-generic-h.patch @@ -0,0 +1,53 @@ +From 836c891c3133aa7ca4f881635979e04cc2e4166e Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Mon, 3 Jun 2024 13:49:49 +0200 +Subject: [PATCH] update-linux-headers: fix forwarding to asm-generic headers + +commit ef7c70f020ca1fe9e7c98ea2cd9d6ba3c5714716 upstream. + +Afer commit 3efc75ad9d9 ("scripts/update-linux-headers.sh: Remove +temporary directory inbetween", 2024-05-29), updating linux-headers/ +results in errors such as + + cp: cannot stat '/tmp/tmp.1A1Eejh1UE/headers/include/asm/bitsperlong.h': No such file or directory + +because Loongarch does not have an asm/bitsperlong.h file and uses the +generic version. Before commit 3efc75ad9d9, the missing file would +incorrectly cause stale files to be included in linux-headers/. 
The files +were never committed to qemu.git, but were wrong nevertheless. The build +would just use the system version of the files, which is opposite to +the idea of importing Linux header files into QEMU's tree. + +Create forwarding headers, resembling the ones that are generated during a +kernel build by scripts/Makefile.asm-generic, if a file is only installed +under include/asm-generic/. + +Reviewed-by: Thomas Huth +Signed-off-by: Paolo Bonzini +--- + scripts/update-linux-headers.sh | 9 ++++++++- + 1 file changed, 8 insertions(+), 1 deletion(-) + +diff --git a/scripts/update-linux-headers.sh b/scripts/update-linux-headers.sh +index 895a2c1722..0c6a288f13 100755 +--- a/scripts/update-linux-headers.sh ++++ b/scripts/update-linux-headers.sh +@@ -119,7 +119,14 @@ for arch in $ARCHLIST; do + rm -rf "$output/linux-headers/asm-$arch" + mkdir -p "$output/linux-headers/asm-$arch" + for header in kvm.h unistd.h bitsperlong.h mman.h; do +- cp "$hdrdir/include/asm/$header" "$output/linux-headers/asm-$arch" ++ if test -f "$hdrdir/include/asm/$header"; then ++ cp "$hdrdir/include/asm/$header" "$output/linux-headers/asm-$arch" ++ elif test -f "$hdrdir/include/asm-generic/$header"; then ++ # not installed as , but used as such in kernel sources ++ cat <$output/linux-headers/asm-$arch/$header ++#include ++EOF ++ fi + done + + if [ $arch = mips ]; then +-- +2.39.3 + diff --git a/0463-update-linux-headers-move-pvpanic-h-to-correct-direc.patch b/0463-update-linux-headers-move-pvpanic-h-to-correct-direc.patch new file mode 100644 index 0000000..7976c7e --- /dev/null +++ b/0463-update-linux-headers-move-pvpanic-h-to-correct-direc.patch @@ -0,0 +1,86 @@ +From 53a446fa05303f811ec35be509a0e9be25669117 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Mon, 3 Jun 2024 14:16:55 +0200 +Subject: [PATCH] update-linux-headers: move pvpanic.h to correct directory + +commit b8116f4cbaa0f64bb07564f20b3b5219e23c8bff upstream. + +Linux has , not . Use the same +directory for QEMU's include/standard-headers/ copy. 
+ +Reviewed-by: Thomas Huth +Signed-off-by: Paolo Bonzini +--- + hw/misc/pvpanic-isa.c | 2 +- + hw/misc/pvpanic-pci.c | 2 +- + hw/misc/pvpanic.c | 2 +- + include/standard-headers/{linux => misc}/pvpanic.h | 0 + scripts/update-linux-headers.sh | 6 ++++-- + 5 files changed, 7 insertions(+), 5 deletions(-) + rename include/standard-headers/{linux => misc}/pvpanic.h (100%) + +diff --git a/hw/misc/pvpanic-isa.c b/hw/misc/pvpanic-isa.c +index ccec50f61b..b4f84c4110 100644 +--- a/hw/misc/pvpanic-isa.c ++++ b/hw/misc/pvpanic-isa.c +@@ -21,7 +21,7 @@ + #include "hw/misc/pvpanic.h" + #include "qom/object.h" + #include "hw/isa/isa.h" +-#include "standard-headers/linux/pvpanic.h" ++#include "standard-headers/misc/pvpanic.h" + #include "hw/acpi/acpi_aml_interface.h" + + OBJECT_DECLARE_SIMPLE_TYPE(PVPanicISAState, PVPANIC_ISA_DEVICE) +diff --git a/hw/misc/pvpanic-pci.c b/hw/misc/pvpanic-pci.c +index fbcaa50731..0af7b15aef 100644 +--- a/hw/misc/pvpanic-pci.c ++++ b/hw/misc/pvpanic-pci.c +@@ -21,7 +21,7 @@ + #include "hw/misc/pvpanic.h" + #include "qom/object.h" + #include "hw/pci/pci_device.h" +-#include "standard-headers/linux/pvpanic.h" ++#include "standard-headers/misc/pvpanic.h" + + OBJECT_DECLARE_SIMPLE_TYPE(PVPanicPCIState, PVPANIC_PCI_DEVICE) + +diff --git a/hw/misc/pvpanic.c b/hw/misc/pvpanic.c +index 1540e9091a..80289ecf5f 100644 +--- a/hw/misc/pvpanic.c ++++ b/hw/misc/pvpanic.c +@@ -21,7 +21,7 @@ + #include "hw/qdev-properties.h" + #include "hw/misc/pvpanic.h" + #include "qom/object.h" +-#include "standard-headers/linux/pvpanic.h" ++#include "standard-headers/misc/pvpanic.h" + + static void handle_event(int event) + { +diff --git a/include/standard-headers/linux/pvpanic.h b/include/standard-headers/misc/pvpanic.h +similarity index 100% +rename from include/standard-headers/linux/pvpanic.h +rename to include/standard-headers/misc/pvpanic.h +diff --git a/scripts/update-linux-headers.sh b/scripts/update-linux-headers.sh +index 0c6a288f13..87fabf38df 100755 +--- a/scripts/update-linux-headers.sh ++++ b/scripts/update-linux-headers.sh +@@ -236,10 +236,12 @@ for i in "$hdrdir"/include/linux/*virtio*.h \ + "$hdrdir/include/linux/const.h" \ + "$hdrdir/include/linux/kernel.h" \ + "$hdrdir/include/linux/vhost_types.h" \ +- "$hdrdir/include/linux/sysinfo.h" \ +- "$hdrdir/include/misc/pvpanic.h"; do ++ "$hdrdir/include/linux/sysinfo.h"; do + cp_portable "$i" "$output/include/standard-headers/linux" + done ++mkdir -p "$output/include/standard-headers/misc" ++cp_portable "$hdrdir/include/misc/pvpanic.h" \ ++ "$output/include/standard-headers/misc" + mkdir -p "$output/include/standard-headers/drm" + cp_portable "$hdrdir/include/drm/drm_fourcc.h" \ + "$output/include/standard-headers/drm" +-- +2.39.3 + diff --git a/0464-update-linux-headers-import-linux-kvm-para-h-header.patch b/0464-update-linux-headers-import-linux-kvm-para-h-header.patch new file mode 100644 index 0000000..4f251da --- /dev/null +++ b/0464-update-linux-headers-import-linux-kvm-para-h-header.patch @@ -0,0 +1,167 @@ +From beeac1c83741d9d86055f40312dbd493fe5a3f3f Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Mon, 3 Jun 2024 14:25:06 +0200 +Subject: [PATCH] update-linux-headers: import linux/kvm_para.h header + +commit aa274c33c39e7de981dc195abe60e1a246c9d248 upstream. + +Right now QEMU is importing arch/x86/include/uapi/asm/kvm_para.h +because it includes definitions for kvmclock and for KVM CPUID +bits. 
However, other definitions for KVM hypercall values and return +codes are included in include/uapi/linux/kvm_para.h and they will be +used by SEV-SNP. + +To ensure that it is possible to include both and +"standard-headers/asm-x86/kvm_para.h" without conflicts, provide +linux/kvm_para.h as a portable header too, and forward linux-headers/ +files to those in include/standard-headers. Note that +will include architecture-specific definitions as well, but +"standard-headers/linux/kvm_para.h" will not because it can be used in +architecture-independent files. + +This could easily be extended to other architectures, but right now +they do not need any symbol in their specific kvm_para.h files. + +Reviewed-by: Thomas Huth +Signed-off-by: Paolo Bonzini +--- + include/standard-headers/linux/kvm_para.h | 38 +++++++++++++++++++++++ + linux-headers/asm-x86/kvm_para.h | 1 + + linux-headers/linux/kvm_para.h | 2 ++ + scripts/update-linux-headers.sh | 22 ++++++++++++- + 4 files changed, 62 insertions(+), 1 deletion(-) + create mode 100644 include/standard-headers/linux/kvm_para.h + create mode 100644 linux-headers/asm-x86/kvm_para.h + create mode 100644 linux-headers/linux/kvm_para.h + +diff --git a/include/standard-headers/linux/kvm_para.h b/include/standard-headers/linux/kvm_para.h +new file mode 100644 +index 0000000000..015c166302 +--- /dev/null ++++ b/include/standard-headers/linux/kvm_para.h +@@ -0,0 +1,38 @@ ++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++#ifndef __LINUX_KVM_PARA_H ++#define __LINUX_KVM_PARA_H ++ ++/* ++ * This header file provides a method for making a hypercall to the host ++ * Architectures should define: ++ * - kvm_hypercall0, kvm_hypercall1... ++ * - kvm_arch_para_features ++ * - kvm_para_available ++ */ ++ ++/* Return values for hypercalls */ ++#define KVM_ENOSYS 1000 ++#define KVM_EFAULT EFAULT ++#define KVM_EINVAL EINVAL ++#define KVM_E2BIG E2BIG ++#define KVM_EPERM EPERM ++#define KVM_EOPNOTSUPP 95 ++ ++#define KVM_HC_VAPIC_POLL_IRQ 1 ++#define KVM_HC_MMU_OP 2 ++#define KVM_HC_FEATURES 3 ++#define KVM_HC_PPC_MAP_MAGIC_PAGE 4 ++#define KVM_HC_KICK_CPU 5 ++#define KVM_HC_MIPS_GET_CLOCK_FREQ 6 ++#define KVM_HC_MIPS_EXIT_VM 7 ++#define KVM_HC_MIPS_CONSOLE_OUTPUT 8 ++#define KVM_HC_CLOCK_PAIRING 9 ++#define KVM_HC_SEND_IPI 10 ++#define KVM_HC_SCHED_YIELD 11 ++#define KVM_HC_MAP_GPA_RANGE 12 ++ ++/* ++ * hypercalls use architecture specific ++ */ ++ ++#endif /* __LINUX_KVM_PARA_H */ +diff --git a/linux-headers/asm-x86/kvm_para.h b/linux-headers/asm-x86/kvm_para.h +new file mode 100644 +index 0000000000..1d3e0e0b07 +--- /dev/null ++++ b/linux-headers/asm-x86/kvm_para.h +@@ -0,0 +1 @@ ++#include "standard-headers/asm-x86/kvm_para.h" +diff --git a/linux-headers/linux/kvm_para.h b/linux-headers/linux/kvm_para.h +new file mode 100644 +index 0000000000..6a1e672259 +--- /dev/null ++++ b/linux-headers/linux/kvm_para.h +@@ -0,0 +1,2 @@ ++#include "standard-headers/linux/kvm_para.h" ++#include +diff --git a/scripts/update-linux-headers.sh b/scripts/update-linux-headers.sh +index 87fabf38df..e50d0844bf 100755 +--- a/scripts/update-linux-headers.sh ++++ b/scripts/update-linux-headers.sh +@@ -64,6 +64,7 @@ cp_portable() { + -e 'linux/kernel' \ + -e 'linux/sysinfo' \ + -e 'asm/setup_data.h' \ ++ -e 'asm/kvm_para.h' \ + > /dev/null + then + echo "Unexpected #include in input file $f". 
+@@ -71,6 +72,15 @@ cp_portable() { + fi + + header=$(basename "$f"); ++ ++ if test -z "$arch"; then ++ # Let users of include/standard-headers/linux/ headers pick the ++ # asm-* header that they care about ++ arch_cmd='/]*\)>/d' ++ else ++ arch_cmd='s/]*\)>/"standard-headers\/asm-'$arch'\/\1"/' ++ fi ++ + sed -e 's/__aligned_u64/__u64 __attribute__((aligned(8)))/g' \ + -e 's/__u\([0-9][0-9]*\)/uint\1_t/g' \ + -e 's/u\([0-9][0-9]*\)/uint\1_t/g' \ +@@ -79,7 +89,7 @@ cp_portable() { + -e 's/__be\([0-9][0-9]*\)/uint\1_t/g' \ + -e 's/"\(input-event-codes\.h\)"/"standard-headers\/linux\/\1"/' \ + -e 's/]*\)>/"standard-headers\/linux\/\1"/' \ +- -e 's/]*\)>/"standard-headers\/asm-'$arch'\/\1"/' \ ++ -e "$arch_cmd" \ + -e 's/__bitwise//' \ + -e 's/__attribute__((packed))/QEMU_PACKED/' \ + -e 's/__inline__/inline/' \ +@@ -159,7 +169,12 @@ EOF + cp "$hdrdir/include/asm/unistd_32.h" "$output/linux-headers/asm-x86/" + cp "$hdrdir/include/asm/unistd_x32.h" "$output/linux-headers/asm-x86/" + cp "$hdrdir/include/asm/unistd_64.h" "$output/linux-headers/asm-x86/" ++ + cp_portable "$hdrdir/include/asm/kvm_para.h" "$output/include/standard-headers/asm-$arch" ++ cat <$output/linux-headers/asm-$arch/kvm_para.h ++#include "standard-headers/asm-$arch/kvm_para.h" ++EOF ++ + # Remove everything except the macros from bootparam.h avoiding the + # unnecessary import of several video/ist/etc headers + sed -e '/__ASSEMBLY__/,/__ASSEMBLY__/d' \ +@@ -213,6 +228,10 @@ if [ -d "$linux/LICENSES" ]; then + done + fi + ++cat <$output/linux-headers/linux/kvm_para.h ++#include "standard-headers/linux/kvm_para.h" ++#include ++EOF + cat <$output/linux-headers/linux/virtio_config.h + #include "standard-headers/linux/virtio_config.h" + EOF +@@ -235,6 +254,7 @@ for i in "$hdrdir"/include/linux/*virtio*.h \ + "$hdrdir/include/linux/ethtool.h" \ + "$hdrdir/include/linux/const.h" \ + "$hdrdir/include/linux/kernel.h" \ ++ "$hdrdir/include/linux/kvm_para.h" \ + "$hdrdir/include/linux/vhost_types.h" \ + "$hdrdir/include/linux/sysinfo.h"; do + cp_portable "$i" "$output/include/standard-headers/linux" +-- +2.39.3 + diff --git a/0465-meson-fix-type-of-relocatable-option.patch b/0465-meson-fix-type-of-relocatable-option.patch new file mode 100644 index 0000000..9e8a683 --- /dev/null +++ b/0465-meson-fix-type-of-relocatable-option.patch @@ -0,0 +1,29 @@ +From 2edb62e006ca7d76ca4703f1ede9b50aa3c48a5e Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Wed, 13 Dec 2023 11:30:09 +0100 +Subject: [PATCH] meson: fix type of "relocatable" option + +Since the option is of boolean type, the default value should be a boolean +rather than a string. 
+ +Signed-off-by: Paolo Bonzini +--- + meson_options.txt | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/meson_options.txt b/meson_options.txt +index 6a2d8351fd..df2ec63218 100644 +--- a/meson_options.txt ++++ b/meson_options.txt +@@ -101,7 +101,7 @@ option('cfi_debug', type: 'boolean', value: false, + description: 'Verbose errors in case of CFI violation') + option('multiprocess', type: 'feature', value: 'auto', + description: 'Out of process device emulation support') +-option('relocatable', type : 'boolean', value : 'true', ++option('relocatable', type : 'boolean', value : true, + description: 'toggle relocatable install') + option('vfio_user_server', type: 'feature', value: 'disabled', + description: 'vfio-user server support') +-- +2.39.3 + diff --git a/0466-makefile-clean-qemu-iotests-output.patch b/0466-makefile-clean-qemu-iotests-output.patch new file mode 100644 index 0000000..79a4d34 --- /dev/null +++ b/0466-makefile-clean-qemu-iotests-output.patch @@ -0,0 +1,39 @@ +From 315ccd8afa7c9572ff7f5b3611d1da920fc258f6 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Fri, 3 Nov 2023 10:00:04 +0100 +Subject: [PATCH] Makefile: clean qemu-iotests output + +Signed-off-by: Paolo Bonzini +--- + Makefile | 1 + + configure | 2 +- + 2 files changed, 2 insertions(+), 1 deletion(-) + +diff --git a/Makefile b/Makefile +index 676a4a54f4..8f36990335 100644 +--- a/Makefile ++++ b/Makefile +@@ -202,6 +202,7 @@ clean: recurse-clean + ! -path ./roms/edk2/ArmPkg/Library/GccLto/liblto-arm.a \ + -exec rm {} + + rm -f TAGS cscope.* *~ */*~ ++ @$(MAKE) -Ctests/qemu-iotests clean + + VERSION = $(shell cat $(SRC_PATH)/VERSION) + +diff --git a/configure b/configure +index 10d8824974..aaf68faf6e 100755 +--- a/configure ++++ b/configure +@@ -1574,7 +1574,7 @@ LINKS="$LINKS pc-bios/s390-ccw/Makefile" + LINKS="$LINKS pc-bios/vof/Makefile" + LINKS="$LINKS .gdbinit scripts" # scripts needed by relative path in .gdbinit + LINKS="$LINKS tests/avocado tests/data" +-LINKS="$LINKS tests/qemu-iotests/check" ++LINKS="$LINKS tests/qemu-iotests/check tests/qemu-iotests/Makefile" + LINKS="$LINKS python" + LINKS="$LINKS contrib/plugins/Makefile " + for f in $LINKS ; do +-- +2.39.3 + diff --git a/0467-configure-remove-unnecessary-subshell.patch b/0467-configure-remove-unnecessary-subshell.patch new file mode 100644 index 0000000..1b59575 --- /dev/null +++ b/0467-configure-remove-unnecessary-subshell.patch @@ -0,0 +1,47 @@ +From 8e17dc1767179c65cbe063e959ab65c9ec7629ed Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Fri, 3 Nov 2023 10:06:08 +0100 +Subject: [PATCH] configure: remove unnecessary subshell +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Do not use a subshell to hide the shadowing of $config_host_mak. 
+ +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Paolo Bonzini +--- + configure | 8 +++----- + 1 file changed, 3 insertions(+), 5 deletions(-) + +diff --git a/configure b/configure +index aaf68faf6e..88dd116687 100755 +--- a/configure ++++ b/configure +@@ -1686,10 +1686,9 @@ if test "$targetos" = windows; then + fi + + # tests/tcg configuration +-(config_host_mak=tests/tcg/config-host.mak + mkdir -p tests/tcg +-echo "# Automatically generated by configure - do not modify" > $config_host_mak +-echo "SRC_PATH=$source_path" >> $config_host_mak ++echo "# Automatically generated by configure - do not modify" > tests/tcg/$config_host_mak ++echo "SRC_PATH=$source_path" >> tests/tcg/$config_host_mak + if test "$plugins" = "yes" ; then + echo "CONFIG_PLUGIN=y" >> tests/tcg/$config_host_mak + fi +@@ -1735,9 +1734,8 @@ for target in $target_list; do + done + + if test "$tcg" = "enabled"; then +- echo "TCG_TESTS_TARGETS=$tcg_tests_targets" >> config-host.mak ++ echo "TCG_TESTS_TARGETS=$tcg_tests_targets" >> $config_host_mak + fi +-) + + if test "$skip_meson" = no; then + cross="config-meson.cross.new" +-- +2.39.3 + diff --git a/0468-configure-unify-again-the-case-arms-in-probe-target-.patch b/0468-configure-unify-again-the-case-arms-in-probe-target-.patch new file mode 100644 index 0000000..a64c4d1 --- /dev/null +++ b/0468-configure-unify-again-the-case-arms-in-probe-target-.patch @@ -0,0 +1,165 @@ +From 6f6e767717cb46d2aee9915c0b11ba62539ac198 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Fri, 3 Nov 2023 09:48:21 +0100 +Subject: [PATCH] configure: unify again the case arms in probe_target_compiler + +Remove assignments that match the default, and group the +targets for debian-legacy-test-cross and debian-all-test-cross +into a single arm. + +Signed-off-by: Paolo Bonzini +--- + configure | 87 ++++++++++++++++++------------------------------------- + 1 file changed, 28 insertions(+), 59 deletions(-) + +diff --git a/configure b/configure +index 88dd116687..aa088b65f1 100755 +--- a/configure ++++ b/configure +@@ -1246,6 +1246,7 @@ probe_target_compiler() { + got_cross_cc=no + container_image= + container_hosts= ++ container_cross_prefix= + container_cross_cc= + container_cross_ar= + container_cross_as= +@@ -1287,16 +1288,33 @@ probe_target_compiler() { + test "$container" != no || continue + test "$host" = "$cpu" || continue + case $target_arch in ++ # debian-all-test-cross architectures ++ ++ hppa|m68k|mips|riscv64|sparc64) ++ container_image=debian-all-test-cross ++ ;; ++ mips64) ++ container_image=debian-all-test-cross ++ container_cross_prefix=mips64-linux-gnuabi64- ++ ;; ++ ppc|ppc64|ppc64le) ++ container_image=debian-all-test-cross ++ container_cross_prefix=powerpc${target_arch#ppc}-linux-gnu- ++ ;; ++ ++ # debian-legacy-test-cross architectures (need Debian 11) ++ # - libc6.1-dev-alpha-cross: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1054412 ++ # - sh4-linux-user: binaries don't run with bookworm compiler ++ ++ alpha|sh4) ++ container_image=debian-legacy-test-cross ++ ;; ++ ++ # architectures with individual containers ++ + aarch64) + # We don't have any bigendian build tools so we only use this for AArch64 + container_image=debian-arm64-cross +- container_cross_prefix=aarch64-linux-gnu- +- container_cross_cc=${container_cross_prefix}gcc +- ;; +- alpha) +- container_image=debian-legacy-test-cross +- container_cross_prefix=alpha-linux-gnu- +- container_cross_cc=${container_cross_prefix}gcc + ;; + arm) + # We don't have any bigendian build tools so we only use this for ARM +@@ 
-1305,18 +1323,11 @@ probe_target_compiler() { + ;; + cris) + container_image=fedora-cris-cross +- container_cross_prefix=cris-linux-gnu- + ;; + hexagon) +- container_image=debian-hexagon-cross + container_cross_prefix=hexagon-unknown-linux-musl- + container_cross_cc=${container_cross_prefix}clang + ;; +- hppa) +- container_image=debian-all-test-cross +- container_cross_prefix=hppa-linux-gnu- +- container_cross_cc=${container_cross_prefix}gcc +- ;; + i386) + container_image=debian-i686-cross + container_cross_prefix=i686-linux-gnu- +@@ -1325,59 +1336,19 @@ probe_target_compiler() { + container_image=debian-loongarch-cross + container_cross_prefix=loongarch64-unknown-linux-gnu- + ;; +- m68k) +- container_image=debian-all-test-cross +- container_cross_prefix=m68k-linux-gnu- +- container_cross_cc=${container_cross_prefix}gcc +- ;; + microblaze) +- container_image=debian-microblaze-cross + container_cross_prefix=microblaze-linux-musl- + ;; + mips64el) + container_image=debian-mips64el-cross + container_cross_prefix=mips64el-linux-gnuabi64- + ;; +- mips64) +- container_image=debian-all-test-cross +- container_cross_prefix=mips64-linux-gnuabi64- +- ;; +- mips) +- container_image=debian-all-test-cross +- container_cross_prefix=mips-linux-gnu- +- ;; +- nios2) +- container_image=debian-nios2-cross +- container_cross_prefix=nios2-linux-gnu- +- ;; +- ppc) +- container_image=debian-all-test-cross +- container_cross_prefix=powerpc-linux-gnu- +- container_cross_cc=${container_cross_prefix}gcc +- ;; +- ppc64|ppc64le) +- container_image=debian-all-test-cross +- container_cross_prefix=powerpc${target_arch#ppc}-linux-gnu- +- ;; +- riscv64) +- container_image=debian-all-test-cross +- container_cross_prefix=riscv64-linux-gnu- +- ;; +- sh4) +- container_image=debian-legacy-test-cross +- container_cross_prefix=sh4-linux-gnu- +- ;; +- sparc64) +- container_image=debian-all-test-cross +- container_cross_prefix=sparc64-linux-gnu- +- ;; + tricore) + container_image=debian-tricore-cross + container_cross_prefix=tricore- + ;; + x86_64) + container_image=debian-amd64-cross +- container_cross_prefix=x86_64-linux-gnu- + ;; + xtensa*) + container_image=debian-xtensa-cross +@@ -1385,12 +1356,10 @@ probe_target_compiler() { + # default to the dc232b cpu + container_cross_prefix=/opt/2020.07/xtensa-dc232b-elf/bin/xtensa-dc232b-elf- + ;; +- *) +- # Debian and GNU architecture names usually match +- container_image=debian-$target_arch-cross +- container_cross_prefix=$target_arch-linux-gnu- +- ;; + esac ++ # Debian and GNU architecture names usually match ++ : ${container_image:=debian-$target_arch-cross} ++ : ${container_cross_prefix:=$target_arch-linux-gnu-} + : ${container_cross_cc:=${container_cross_prefix}gcc} + : ${container_cross_ar:=${container_cross_prefix}ar} + : ${container_cross_as:=${container_cross_prefix}as} +-- +2.39.3 + diff --git a/0469-meson-add-more-sections-to-main-meson-build.patch b/0469-meson-add-more-sections-to-main-meson-build.patch new file mode 100644 index 0000000..04edf70 --- /dev/null +++ b/0469-meson-add-more-sections-to-main-meson-build.patch @@ -0,0 +1,75 @@ +From 0f66b1f3439f86d02da46af4da1109c504e2577e Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Fri, 8 Sep 2023 12:06:12 +0200 +Subject: [PATCH] meson: add more sections to main meson.build +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Paolo Bonzini +--- + meson.build | 24 ++++++++++++++++++------ + 1 file changed, 18 insertions(+), 6 
deletions(-) + +diff --git a/meson.build b/meson.build +index 273a894147..6ea5469e89 100644 +--- a/meson.build ++++ b/meson.build +@@ -9,6 +9,10 @@ add_test_setup('thorough', env: ['G_TEST_SLOW=1', 'SPEED=thorough']) + + meson.add_postconf_script(find_program('scripts/symlink-install-tree.py')) + ++#################### ++# Global variables # ++#################### ++ + not_found = dependency('', required: false) + keyval = import('keyval') + ss = import('sourceset') +@@ -90,8 +94,16 @@ enable_modules = get_option('modules') \ + .allowed() + have_block = have_system or have_tools + ++############ ++# Programs # ++############ ++ + python = import('python').find_installation() + ++####################################### ++# Variables for host and accelerators # ++####################################### ++ + if cpu not in supported_cpus + host_arch = 'unknown' + elif cpu == 'x86' +@@ -526,9 +538,9 @@ if sparse.found() + '-Wno-non-pointer-null']) + endif + +-########################################### +-# Target-specific checks and dependencies # +-########################################### ++##################### ++# Option validation # ++##################### + + # Fuzzing + if get_option('fuzzing') and get_option('fuzzing_engine') == '' and \ +@@ -3552,9 +3564,9 @@ specific_ss.add_all(when: 'CONFIG_TCG_BUILTIN', if_true: tcg_module_ss) + target_modules += { 'accel' : { 'qtest': qtest_module_ss, + 'tcg': tcg_real_module_ss }} + +-######################## +-# Library dependencies # +-######################## ++############################################## ++# Internal static_libraries and dependencies # ++############################################## + + modinfo_collect = find_program('scripts/modinfo-collect.py') + modinfo_generate = find_program('scripts/modinfo-generate.py') +-- +2.39.3 + diff --git a/0470-meson-move-program-checks-together.patch b/0470-meson-move-program-checks-together.patch new file mode 100644 index 0000000..00ed184 --- /dev/null +++ b/0470-meson-move-program-checks-together.patch @@ -0,0 +1,158 @@ +From 6915c8f9dc180de3ded34af9a166c7b2d11e9c57 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Fri, 8 Sep 2023 12:06:57 +0200 +Subject: [PATCH] meson: move program checks together +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Paolo Bonzini +--- + meson.build | 107 +++++++++++++++++++++++++++------------------------- + 1 file changed, 56 insertions(+), 51 deletions(-) + +diff --git a/meson.build b/meson.build +index 6ea5469e89..c5e41f4951 100644 +--- a/meson.build ++++ b/meson.build +@@ -19,21 +19,8 @@ ss = import('sourceset') + fs = import('fs') + + targetos = host_machine.system() +-sh = find_program('sh') + config_host = keyval.load(meson.current_build_dir() / 'config-host.mak') + +-cc = meson.get_compiler('c') +-all_languages = ['c'] +-if targetos == 'windows' and add_languages('cpp', required: false, native: false) +- all_languages += ['cpp'] +- cxx = meson.get_compiler('cpp') +-endif +-if targetos == 'darwin' and \ +- add_languages('objc', required: get_option('cocoa'), native: false) +- all_languages += ['objc'] +- objc = meson.get_compiler('objc') +-endif +- + # Temporary directory used for files created while + # configure runs. 
Since it is in the build directory + # we can safely blow away any previous version of it +@@ -69,6 +56,62 @@ if cpu == 'sw_64' + endif + + target_dirs = config_host['TARGET_DIRS'].split() ++ ++############ ++# Programs # ++############ ++ ++sh = find_program('sh') ++python = import('python').find_installation() ++ ++cc = meson.get_compiler('c') ++all_languages = ['c'] ++if targetos == 'windows' and add_languages('cpp', required: false, native: false) ++ all_languages += ['cpp'] ++ cxx = meson.get_compiler('cpp') ++endif ++if targetos == 'darwin' and \ ++ add_languages('objc', required: get_option('cocoa'), native: false) ++ all_languages += ['objc'] ++ objc = meson.get_compiler('objc') ++endif ++ ++dtrace = not_found ++stap = not_found ++if 'dtrace' in get_option('trace_backends') ++ dtrace = find_program('dtrace', required: true) ++ stap = find_program('stap', required: false) ++ if stap.found() ++ # Workaround to avoid dtrace(1) producing a file with 'hidden' symbol ++ # visibility. Define STAP_SDT_V2 to produce 'default' symbol visibility ++ # instead. QEMU --enable-modules depends on this because the SystemTap ++ # semaphores are linked into the main binary and not the module's shared ++ # object. ++ add_global_arguments('-DSTAP_SDT_V2', ++ native: false, language: all_languages) ++ endif ++endif ++ ++if get_option('iasl') == '' ++ iasl = find_program('iasl', required: false) ++else ++ iasl = find_program(get_option('iasl'), required: true) ++endif ++ ++edk2_targets = [ 'arm-softmmu', 'aarch64-softmmu', 'i386-softmmu', 'x86_64-softmmu' ] ++unpack_edk2_blobs = false ++foreach target : edk2_targets ++ if target in target_dirs ++ bzip2 = find_program('bzip2', required: get_option('install_blobs')) ++ unpack_edk2_blobs = bzip2.found() ++ break ++ endif ++endforeach ++ ++##################### ++# Option validation # ++##################### ++ + have_linux_user = false + have_bsd_user = false + have_system = false +@@ -94,12 +137,6 @@ enable_modules = get_option('modules') \ + .allowed() + have_block = have_system or have_tools + +-############ +-# Programs # +-############ +- +-python = import('python').find_installation() +- + ####################################### + # Variables for host and accelerators # + ####################################### +@@ -175,38 +212,6 @@ if targetos != 'darwin' + modular_tcg = ['i386-softmmu', 'x86_64-softmmu'] + endif + +-edk2_targets = [ 'arm-softmmu', 'aarch64-softmmu', 'i386-softmmu', 'x86_64-softmmu' ] +-unpack_edk2_blobs = false +-foreach target : edk2_targets +- if target in target_dirs +- bzip2 = find_program('bzip2', required: get_option('install_blobs')) +- unpack_edk2_blobs = bzip2.found() +- break +- endif +-endforeach +- +-dtrace = not_found +-stap = not_found +-if 'dtrace' in get_option('trace_backends') +- dtrace = find_program('dtrace', required: true) +- stap = find_program('stap', required: false) +- if stap.found() +- # Workaround to avoid dtrace(1) producing a file with 'hidden' symbol +- # visibility. Define STAP_SDT_V2 to produce 'default' symbol visibility +- # instead. QEMU --enable-modules depends on this because the SystemTap +- # semaphores are linked into the main binary and not the module's shared +- # object. 
+- add_global_arguments('-DSTAP_SDT_V2', +- native: false, language: all_languages) +- endif +-endif +- +-if get_option('iasl') == '' +- iasl = find_program('iasl', required: false) +-else +- iasl = find_program(get_option('iasl'), required: true) +-endif +- + ################## + # Compiler flags # + ################## +-- +2.39.3 + diff --git a/0471-meson-move-option-validation-together.patch b/0471-meson-move-option-validation-together.patch new file mode 100644 index 0000000..9289a4f --- /dev/null +++ b/0471-meson-move-option-validation-together.patch @@ -0,0 +1,191 @@ +From a57b6c67eea3d488dddbb06c236d104db3bfd324 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Sat, 30 Dec 2023 18:42:30 +0100 +Subject: [PATCH] meson: move option validation together + +Check options before compiler flags, because some compiler flags are +incompatible with modules. + +Signed-off-by: Paolo Bonzini +--- + meson.build | 137 ++++++++++++++++++++++++++-------------------------- + 1 file changed, 68 insertions(+), 69 deletions(-) + +diff --git a/meson.build b/meson.build +index c5e41f4951..753e5e843b 100644 +--- a/meson.build ++++ b/meson.build +@@ -112,6 +112,71 @@ endforeach + # Option validation # + ##################### + ++# Fuzzing ++if get_option('fuzzing') and get_option('fuzzing_engine') == '' and \ ++ not cc.links(''' ++ #include ++ #include ++ int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size); ++ int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) { return 0; } ++ ''', ++ args: ['-Werror', '-fsanitize=fuzzer']) ++ error('Your compiler does not support -fsanitize=fuzzer') ++endif ++ ++# Tracing backends ++if 'ftrace' in get_option('trace_backends') and targetos != 'linux' ++ error('ftrace is supported only on Linux') ++endif ++if 'syslog' in get_option('trace_backends') and not cc.compiles(''' ++ #include ++ int main(void) { ++ openlog("qemu", LOG_PID, LOG_DAEMON); ++ syslog(LOG_INFO, "configure"); ++ return 0; ++ }''') ++ error('syslog is not supported on this system') ++endif ++ ++# Miscellaneous Linux-only features ++get_option('mpath') \ ++ .require(targetos == 'linux', error_message: 'Multipath is supported only on Linux') ++ ++multiprocess_allowed = get_option('multiprocess') \ ++ .require(targetos == 'linux', error_message: 'Multiprocess QEMU is supported only on Linux') \ ++ .allowed() ++ ++vfio_user_server_allowed = get_option('vfio_user_server') \ ++ .require(targetos == 'linux', error_message: 'vfio-user server is supported only on Linux') \ ++ .allowed() ++ ++have_tpm = get_option('tpm') \ ++ .require(targetos != 'windows', error_message: 'TPM emulation only available on POSIX systems') \ ++ .allowed() ++ ++# vhost ++have_vhost_user = get_option('vhost_user') \ ++ .disable_auto_if(targetos != 'linux') \ ++ .require(targetos != 'windows', ++ error_message: 'vhost-user is not available on Windows').allowed() ++have_vhost_vdpa = get_option('vhost_vdpa') \ ++ .require(targetos == 'linux', ++ error_message: 'vhost-vdpa is only available on Linux').allowed() ++have_vhost_kernel = get_option('vhost_kernel') \ ++ .require(targetos == 'linux', ++ error_message: 'vhost-kernel is only available on Linux').allowed() ++have_vhost_user_crypto = get_option('vhost_crypto') \ ++ .require(have_vhost_user, ++ error_message: 'vhost-crypto requires vhost-user to be enabled').allowed() ++ ++have_vhost = have_vhost_user or have_vhost_vdpa or have_vhost_kernel ++ ++have_vhost_net_user = have_vhost_user and get_option('vhost_net').allowed() ++have_vhost_net_vdpa = have_vhost_vdpa and 
get_option('vhost_net').allowed() ++have_vhost_net_kernel = have_vhost_kernel and get_option('vhost_net').allowed() ++have_vhost_net = have_vhost_net_kernel or have_vhost_net_user or have_vhost_net_vdpa ++ ++# type of binaries to build + have_linux_user = false + have_bsd_user = false + have_system = false +@@ -121,6 +186,7 @@ foreach target : target_dirs + have_system = have_system or target.endswith('-softmmu') + endforeach + have_user = have_linux_user or have_bsd_user ++ + have_tools = get_option('tools') \ + .disable_auto_if(not have_system) \ + .allowed() +@@ -129,13 +195,14 @@ have_ga = get_option('guest_agent') \ + .require(targetos in ['sunos', 'linux', 'windows', 'freebsd', 'netbsd', 'openbsd'], + error_message: 'unsupported OS for QEMU guest agent') \ + .allowed() ++have_block = have_system or have_tools ++ + enable_modules = get_option('modules') \ + .require(targetos != 'windows', + error_message: 'Modules are not available for Windows') \ + .require(not get_option('prefer_static'), + error_message: 'Modules are incompatible with static linking') \ + .allowed() +-have_block = have_system or have_tools + + ####################################### + # Variables for host and accelerators # +@@ -543,74 +610,6 @@ if sparse.found() + '-Wno-non-pointer-null']) + endif + +-##################### +-# Option validation # +-##################### +- +-# Fuzzing +-if get_option('fuzzing') and get_option('fuzzing_engine') == '' and \ +- not cc.links(''' +- #include +- #include +- int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size); +- int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) { return 0; } +- ''', +- args: ['-Werror', '-fsanitize=fuzzer']) +- error('Your compiler does not support -fsanitize=fuzzer') +-endif +- +-# Tracing backends +-if 'ftrace' in get_option('trace_backends') and targetos != 'linux' +- error('ftrace is supported only on Linux') +-endif +-if 'syslog' in get_option('trace_backends') and not cc.compiles(''' +- #include +- int main(void) { +- openlog("qemu", LOG_PID, LOG_DAEMON); +- syslog(LOG_INFO, "configure"); +- return 0; +- }''') +- error('syslog is not supported on this system') +-endif +- +-# Miscellaneous Linux-only features +-get_option('mpath') \ +- .require(targetos == 'linux', error_message: 'Multipath is supported only on Linux') +- +-multiprocess_allowed = get_option('multiprocess') \ +- .require(targetos == 'linux', error_message: 'Multiprocess QEMU is supported only on Linux') \ +- .allowed() +- +-vfio_user_server_allowed = get_option('vfio_user_server') \ +- .require(targetos == 'linux', error_message: 'vfio-user server is supported only on Linux') \ +- .allowed() +- +-have_tpm = get_option('tpm') \ +- .require(targetos != 'windows', error_message: 'TPM emulation only available on POSIX systems') \ +- .allowed() +- +-# vhost +-have_vhost_user = get_option('vhost_user') \ +- .disable_auto_if(targetos != 'linux') \ +- .require(targetos != 'windows', +- error_message: 'vhost-user is not available on Windows').allowed() +-have_vhost_vdpa = get_option('vhost_vdpa') \ +- .require(targetos == 'linux', +- error_message: 'vhost-vdpa is only available on Linux').allowed() +-have_vhost_kernel = get_option('vhost_kernel') \ +- .require(targetos == 'linux', +- error_message: 'vhost-kernel is only available on Linux').allowed() +-have_vhost_user_crypto = get_option('vhost_crypto') \ +- .require(have_vhost_user, +- error_message: 'vhost-crypto requires vhost-user to be enabled').allowed() +- +-have_vhost = have_vhost_user or have_vhost_vdpa or 
have_vhost_kernel +- +-have_vhost_net_user = have_vhost_user and get_option('vhost_net').allowed() +-have_vhost_net_vdpa = have_vhost_vdpa and get_option('vhost_net').allowed() +-have_vhost_net_kernel = have_vhost_kernel and get_option('vhost_net').allowed() +-have_vhost_net = have_vhost_net_kernel or have_vhost_net_user or have_vhost_net_vdpa +- + # Target-specific libraries and flags + libm = cc.find_library('m', required: false) + threads = dependency('threads') +-- +2.39.3 + diff --git a/0472-meson-move-accelerator-dependency-checks-together.patch b/0472-meson-move-accelerator-dependency-checks-together.patch new file mode 100644 index 0000000..593a2d1 --- /dev/null +++ b/0472-meson-move-accelerator-dependency-checks-together.patch @@ -0,0 +1,241 @@ +From 41311a3ca3f9c107a23e5a9effe1617128d49c19 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Fri, 8 Sep 2023 12:08:53 +0200 +Subject: [PATCH] meson: move accelerator dependency checks together + +Signed-off-by: Paolo Bonzini +--- + meson.build | 175 +++++++++++++++++++++++++++------------------------- + 1 file changed, 91 insertions(+), 84 deletions(-) + +diff --git a/meson.build b/meson.build +index 753e5e843b..5b1611dd85 100644 +--- a/meson.build ++++ b/meson.build +@@ -610,7 +610,10 @@ if sparse.found() + '-Wno-non-pointer-null']) + endif + +-# Target-specific libraries and flags ++##################################### ++# Host-specific libraries and flags # ++##################################### ++ + libm = cc.find_library('m', required: false) + threads = dependency('threads') + util = cc.find_library('util', required: false) +@@ -620,8 +623,6 @@ version_res = [] + coref = [] + iokit = [] + emulator_link_args = [] +-nvmm =not_found +-hvf = not_found + midl = not_found + widl = not_found + pathcch = not_found +@@ -657,7 +658,10 @@ elif targetos == 'openbsd' + endif + endif + +-# Target-specific configuration of accelerators ++############################################### ++# Host-specific configuration of accelerators # ++############################################### ++ + accelerators = [] + if get_option('kvm').allowed() and targetos == 'linux' + accelerators += 'CONFIG_KVM' +@@ -670,6 +674,8 @@ if get_option('whpx').allowed() and targetos == 'windows' + accelerators += 'CONFIG_WHPX' + endif + endif ++ ++hvf = not_found + if get_option('hvf').allowed() + hvf = dependency('appleframeworks', modules: 'Hypervisor', + required: get_option('hvf')) +@@ -677,6 +683,8 @@ if get_option('hvf').allowed() + accelerators += 'CONFIG_HVF' + endif + endif ++ ++nvmm = not_found + if targetos == 'netbsd' + nvmm = cc.find_library('nvmm', required: get_option('nvmm')) + if nvmm.found() +@@ -726,6 +734,85 @@ if 'CONFIG_WHPX' not in accelerators and get_option('whpx').enabled() + error('WHPX not available on this platform') + endif + ++xen = not_found ++if get_option('xen').enabled() or (get_option('xen').auto() and have_system) ++ xencontrol = dependency('xencontrol', required: false, ++ method: 'pkg-config') ++ if xencontrol.found() ++ xen_pc = declare_dependency(version: xencontrol.version(), ++ dependencies: [ ++ xencontrol, ++ # disabler: true makes xen_pc.found() return false if any is not found ++ dependency('xenstore', required: false, ++ method: 'pkg-config', ++ disabler: true), ++ dependency('xenforeignmemory', required: false, ++ method: 'pkg-config', ++ disabler: true), ++ dependency('xengnttab', required: false, ++ method: 'pkg-config', ++ disabler: true), ++ dependency('xenevtchn', required: false, ++ method: 'pkg-config', ++ 
disabler: true), ++ dependency('xendevicemodel', required: false, ++ method: 'pkg-config', ++ disabler: true), ++ # optional, no "disabler: true" ++ dependency('xentoolcore', required: false, ++ method: 'pkg-config')]) ++ if xen_pc.found() ++ xen = xen_pc ++ endif ++ endif ++ if not xen.found() ++ xen_tests = [ '4.11.0', '4.10.0', '4.9.0', '4.8.0', '4.7.1' ] ++ xen_libs = { ++ '4.11.0': [ 'xenstore', 'xenctrl', 'xendevicemodel', 'xenforeignmemory', 'xengnttab', 'xenevtchn', 'xentoolcore' ], ++ '4.10.0': [ 'xenstore', 'xenctrl', 'xendevicemodel', 'xenforeignmemory', 'xengnttab', 'xenevtchn', 'xentoolcore' ], ++ '4.9.0': [ 'xenstore', 'xenctrl', 'xendevicemodel', 'xenforeignmemory', 'xengnttab', 'xenevtchn' ], ++ '4.8.0': [ 'xenstore', 'xenctrl', 'xenforeignmemory', 'xengnttab', 'xenevtchn' ], ++ '4.7.1': [ 'xenstore', 'xenctrl', 'xenforeignmemory', 'xengnttab', 'xenevtchn' ], ++ } ++ xen_deps = {} ++ foreach ver: xen_tests ++ # cache the various library tests to avoid polluting the logs ++ xen_test_deps = [] ++ foreach l: xen_libs[ver] ++ if l not in xen_deps ++ xen_deps += { l: cc.find_library(l, required: false) } ++ endif ++ xen_test_deps += xen_deps[l] ++ endforeach ++ ++ # Use -D to pick just one of the test programs in scripts/xen-detect.c ++ xen_version = ver.split('.') ++ xen_ctrl_version = xen_version[0] + \ ++ ('0' + xen_version[1]).substring(-2) + \ ++ ('0' + xen_version[2]).substring(-2) ++ if cc.links(files('scripts/xen-detect.c'), ++ args: '-DCONFIG_XEN_CTRL_INTERFACE_VERSION=' + xen_ctrl_version, ++ dependencies: xen_test_deps) ++ xen = declare_dependency(version: ver, dependencies: xen_test_deps) ++ break ++ endif ++ endforeach ++ endif ++ if xen.found() ++ accelerators += 'CONFIG_XEN' ++ elif get_option('xen').enabled() ++ error('could not compile and link Xen test program') ++ endif ++endif ++have_xen_pci_passthrough = get_option('xen_pci_passthrough') \ ++ .require(xen.found(), ++ error_message: 'Xen PCI passthrough requested but Xen not enabled') \ ++ .require(targetos == 'linux', ++ error_message: 'Xen PCI passthrough not available on this platform') \ ++ .require(cpu == 'x86' or cpu == 'x86_64', ++ error_message: 'Xen PCI passthrough not available on this platform') \ ++ .allowed() ++ + ################ + # Dependencies # + ################ +@@ -1712,86 +1799,6 @@ if not get_option('rdma').auto() or have_system + endforeach + endif + +-xen = not_found +-if get_option('xen').enabled() or (get_option('xen').auto() and have_system) +- xencontrol = dependency('xencontrol', required: false, +- method: 'pkg-config') +- if xencontrol.found() +- xen_pc = declare_dependency(version: xencontrol.version(), +- dependencies: [ +- xencontrol, +- # disabler: true makes xen_pc.found() return false if any is not found +- dependency('xenstore', required: false, +- method: 'pkg-config', +- disabler: true), +- dependency('xenforeignmemory', required: false, +- method: 'pkg-config', +- disabler: true), +- dependency('xengnttab', required: false, +- method: 'pkg-config', +- disabler: true), +- dependency('xenevtchn', required: false, +- method: 'pkg-config', +- disabler: true), +- dependency('xendevicemodel', required: false, +- method: 'pkg-config', +- disabler: true), +- # optional, no "disabler: true" +- dependency('xentoolcore', required: false, +- method: 'pkg-config')]) +- if xen_pc.found() +- xen = xen_pc +- endif +- endif +- if not xen.found() +- xen_tests = [ '4.11.0', '4.10.0', '4.9.0', '4.8.0', '4.7.1' ] +- xen_libs = { +- '4.11.0': [ 'xenstore', 'xenctrl', 'xendevicemodel', 
'xenforeignmemory', 'xengnttab', 'xenevtchn', 'xentoolcore' ], +- '4.10.0': [ 'xenstore', 'xenctrl', 'xendevicemodel', 'xenforeignmemory', 'xengnttab', 'xenevtchn', 'xentoolcore' ], +- '4.9.0': [ 'xenstore', 'xenctrl', 'xendevicemodel', 'xenforeignmemory', 'xengnttab', 'xenevtchn' ], +- '4.8.0': [ 'xenstore', 'xenctrl', 'xenforeignmemory', 'xengnttab', 'xenevtchn' ], +- '4.7.1': [ 'xenstore', 'xenctrl', 'xenforeignmemory', 'xengnttab', 'xenevtchn' ], +- } +- xen_deps = {} +- foreach ver: xen_tests +- # cache the various library tests to avoid polluting the logs +- xen_test_deps = [] +- foreach l: xen_libs[ver] +- if l not in xen_deps +- xen_deps += { l: cc.find_library(l, required: false) } +- endif +- xen_test_deps += xen_deps[l] +- endforeach +- +- # Use -D to pick just one of the test programs in scripts/xen-detect.c +- xen_version = ver.split('.') +- xen_ctrl_version = xen_version[0] + \ +- ('0' + xen_version[1]).substring(-2) + \ +- ('0' + xen_version[2]).substring(-2) +- if cc.links(files('scripts/xen-detect.c'), +- args: '-DCONFIG_XEN_CTRL_INTERFACE_VERSION=' + xen_ctrl_version, +- dependencies: xen_test_deps) +- xen = declare_dependency(version: ver, dependencies: xen_test_deps) +- break +- endif +- endforeach +- endif +- if xen.found() +- accelerators += 'CONFIG_XEN' +- elif get_option('xen').enabled() +- error('could not compile and link Xen test program') +- endif +-endif +-have_xen_pci_passthrough = get_option('xen_pci_passthrough') \ +- .require(xen.found(), +- error_message: 'Xen PCI passthrough requested but Xen not enabled') \ +- .require(targetos == 'linux', +- error_message: 'Xen PCI passthrough not available on this platform') \ +- .require(cpu == 'x86' or cpu == 'x86_64', +- error_message: 'Xen PCI passthrough not available on this platform') \ +- .allowed() +- +- + cacard = not_found + if not get_option('smartcard').auto() or have_system + cacard = dependency('libcacard', required: get_option('smartcard'), +-- +2.39.3 + diff --git a/0473-meson-keep-subprojects-together.patch b/0473-meson-keep-subprojects-together.patch new file mode 100644 index 0000000..f99cbe6 --- /dev/null +++ b/0473-meson-keep-subprojects-together.patch @@ -0,0 +1,134 @@ +From 85dc8c41629f2f972a32ef41cc0101f74bdc7264 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Fri, 8 Sep 2023 12:09:22 +0200 +Subject: [PATCH] meson: keep subprojects together + +And move away dependencies that are not subprojects anymore. + +Signed-off-by: Paolo Bonzini +--- + meson.build | 72 ++++++++++++++++++++++++++--------------------------- + 1 file changed, 36 insertions(+), 36 deletions(-) + +diff --git a/meson.build b/meson.build +index 5b1611dd85..7207348f8d 100644 +--- a/meson.build ++++ b/meson.build +@@ -1645,6 +1645,25 @@ if not gnutls_crypto.found() + endif + endif + ++capstone = not_found ++if not get_option('capstone').auto() or have_system or have_user ++ capstone = dependency('capstone', version: '>=3.0.5', ++ method: 'pkg-config', ++ required: get_option('capstone')) ++ ++ # Some versions of capstone have broken pkg-config file ++ # that reports a wrong -I path, causing the #include to ++ # fail later. If the system has such a broken version ++ # do not use it. 
++ if capstone.found() and not cc.compiles('#include ', ++ dependencies: [capstone]) ++ capstone = not_found ++ if get_option('capstone').enabled() ++ error('capstone requested, but it does not appear to work') ++ endif ++ endif ++endif ++ + gmp = dependency('gmp', required: false, method: 'pkg-config') + if nettle.found() and gmp.found() + hogweed = dependency('hogweed', version: '>=3.4', +@@ -2150,6 +2169,7 @@ config_host_data.set('CONFIG_ATTR', libattr.found()) + config_host_data.set('CONFIG_BDRV_WHITELIST_TOOLS', get_option('block_drv_whitelist_in_tools')) + config_host_data.set('CONFIG_BRLAPI', brlapi.found()) + config_host_data.set('CONFIG_BSD', targetos in bsd_oses) ++config_host_data.set('CONFIG_CAPSTONE', capstone.found()) + config_host_data.set('CONFIG_COCOA', cocoa.found()) + config_host_data.set('CONFIG_DARWIN', targetos == 'darwin') + config_host_data.set('CONFIG_FUZZ', get_option('fuzzing')) +@@ -2213,6 +2233,7 @@ if seccomp.found() + config_host_data.set('CONFIG_SECCOMP_SYSRAWRC', seccomp_has_sysrawrc) + endif + config_host_data.set('CONFIG_PIXMAN', pixman.found()) ++config_host_data.set('CONFIG_SLIRP', slirp.found()) + config_host_data.set('CONFIG_SNAPPY', snappy.found()) + config_host_data.set('CONFIG_SOLARIS', targetos == 'sunos') + if get_option('tcg').allowed() +@@ -3119,28 +3140,9 @@ genh += custom_target('config-poison.h', + command: [find_program('scripts/make-config-poison.sh'), + target_configs_h]) + +-############## +-# Submodules # +-############## +- +-capstone = not_found +-if not get_option('capstone').auto() or have_system or have_user +- capstone = dependency('capstone', version: '>=3.0.5', +- method: 'pkg-config', +- required: get_option('capstone')) +- +- # Some versions of capstone have broken pkg-config file +- # that reports a wrong -I path, causing the #include to +- # fail later. If the system has such a broken version +- # do not use it. +- if capstone.found() and not cc.compiles('#include ', +- dependencies: [capstone]) +- capstone = not_found +- if get_option('capstone').enabled() +- error('capstone requested, but it does not appear to work') +- endif +- endif +-endif ++############### ++# Subprojects # ++############### + + libvfio_user_dep = not_found + if have_system and vfio_user_server_allowed +@@ -3184,9 +3186,19 @@ else + fdt_opt = 'disabled' + endif + +-config_host_data.set('CONFIG_CAPSTONE', capstone.found()) + config_host_data.set('CONFIG_FDT', fdt.found()) +-config_host_data.set('CONFIG_SLIRP', slirp.found()) ++ ++vhost_user = not_found ++if targetos == 'linux' and have_vhost_user ++ libvhost_user = subproject('libvhost-user') ++ vhost_user = libvhost_user.get_variable('vhost_user_dep') ++endif ++ ++libvduse = not_found ++if have_libvduse ++ libvduse_proj = subproject('libvduse') ++ libvduse = libvduse_proj.get_variable('libvduse_dep') ++endif + + ##################### + # Generated sources # +@@ -3419,18 +3431,6 @@ if have_system or have_user + ] + endif + +-vhost_user = not_found +-if targetos == 'linux' and have_vhost_user +- libvhost_user = subproject('libvhost-user') +- vhost_user = libvhost_user.get_variable('vhost_user_dep') +-endif +- +-libvduse = not_found +-if have_libvduse +- libvduse_proj = subproject('libvduse') +- libvduse = libvduse_proj.get_variable('libvduse_dep') +-endif +- + # NOTE: the trace/ subdirectory needs the qapi_trace_events variable + # that is filled in by qapi/. 
+ subdir('qapi') +-- +2.39.3 + diff --git a/0474-meson-move-cfi-detection-code-with-other-compiler-fl.patch b/0474-meson-move-cfi-detection-code-with-other-compiler-fl.patch new file mode 100644 index 0000000..6495038 --- /dev/null +++ b/0474-meson-move-cfi-detection-code-with-other-compiler-fl.patch @@ -0,0 +1,117 @@ +From 8f40c6fa024c46043dcdbf9d66013f39c84d0ac1 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Wed, 10 May 2023 14:54:30 +0200 +Subject: [PATCH] meson: move CFI detection code with other compiler flags +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Keep it together with the other compiler modes, and before dependencies. + +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Paolo Bonzini +--- + meson.build | 80 ++++++++++++++++++++++++++--------------------------- + 1 file changed, 40 insertions(+), 40 deletions(-) + +diff --git a/meson.build b/meson.build +index 7207348f8d..598ec79f8c 100644 +--- a/meson.build ++++ b/meson.build +@@ -525,6 +525,46 @@ if get_option('fuzzing') + endif + endif + ++if get_option('cfi') ++ cfi_flags=[] ++ # Check for dependency on LTO ++ if not get_option('b_lto') ++ error('Selected Control-Flow Integrity but LTO is disabled') ++ endif ++ if enable_modules ++ error('Selected Control-Flow Integrity is not compatible with modules') ++ endif ++ # Check for cfi flags. CFI requires LTO so we can't use ++ # get_supported_arguments, but need a more complex "compiles" which allows ++ # custom arguments ++ if cc.compiles('int main () { return 0; }', name: '-fsanitize=cfi-icall', ++ args: ['-flto', '-fsanitize=cfi-icall'] ) ++ cfi_flags += '-fsanitize=cfi-icall' ++ else ++ error('-fsanitize=cfi-icall is not supported by the compiler') ++ endif ++ if cc.compiles('int main () { return 0; }', ++ name: '-fsanitize-cfi-icall-generalize-pointers', ++ args: ['-flto', '-fsanitize=cfi-icall', ++ '-fsanitize-cfi-icall-generalize-pointers'] ) ++ cfi_flags += '-fsanitize-cfi-icall-generalize-pointers' ++ else ++ error('-fsanitize-cfi-icall-generalize-pointers is not supported by the compiler') ++ endif ++ if get_option('cfi_debug') ++ if cc.compiles('int main () { return 0; }', ++ name: '-fno-sanitize-trap=cfi-icall', ++ args: ['-flto', '-fsanitize=cfi-icall', ++ '-fno-sanitize-trap=cfi-icall'] ) ++ cfi_flags += '-fno-sanitize-trap=cfi-icall' ++ else ++ error('-fno-sanitize-trap=cfi-icall is not supported by the compiler') ++ endif ++ endif ++ add_global_arguments(cfi_flags, native: false, language: all_languages) ++ add_global_link_arguments(cfi_flags, native: false, language: all_languages) ++endif ++ + add_global_arguments(qemu_common_flags, native: false, language: all_languages) + add_global_link_arguments(qemu_ldflags, native: false, language: all_languages) + +@@ -2032,46 +2072,6 @@ endif + config_host_data.set('CONFIG_AUDIO_DRIVERS', + '"' + '", "'.join(audio_drivers_selected) + '", ') + +-if get_option('cfi') +- cfi_flags=[] +- # Check for dependency on LTO +- if not get_option('b_lto') +- error('Selected Control-Flow Integrity but LTO is disabled') +- endif +- if enable_modules +- error('Selected Control-Flow Integrity is not compatible with modules') +- endif +- # Check for cfi flags. 
CFI requires LTO so we can't use +- # get_supported_arguments, but need a more complex "compiles" which allows +- # custom arguments +- if cc.compiles('int main () { return 0; }', name: '-fsanitize=cfi-icall', +- args: ['-flto', '-fsanitize=cfi-icall'] ) +- cfi_flags += '-fsanitize=cfi-icall' +- else +- error('-fsanitize=cfi-icall is not supported by the compiler') +- endif +- if cc.compiles('int main () { return 0; }', +- name: '-fsanitize-cfi-icall-generalize-pointers', +- args: ['-flto', '-fsanitize=cfi-icall', +- '-fsanitize-cfi-icall-generalize-pointers'] ) +- cfi_flags += '-fsanitize-cfi-icall-generalize-pointers' +- else +- error('-fsanitize-cfi-icall-generalize-pointers is not supported by the compiler') +- endif +- if get_option('cfi_debug') +- if cc.compiles('int main () { return 0; }', +- name: '-fno-sanitize-trap=cfi-icall', +- args: ['-flto', '-fsanitize=cfi-icall', +- '-fno-sanitize-trap=cfi-icall'] ) +- cfi_flags += '-fno-sanitize-trap=cfi-icall' +- else +- error('-fno-sanitize-trap=cfi-icall is not supported by the compiler') +- endif +- endif +- add_global_arguments(cfi_flags, native: false, language: all_languages) +- add_global_link_arguments(cfi_flags, native: false, language: all_languages) +-endif +- + have_host_block_device = (targetos != 'darwin' or + cc.has_header('IOKit/storage/IOMedia.h')) + +-- +2.39.3 + diff --git a/0475-meson-move-config-host-h-definitions-together.patch b/0475-meson-move-config-host-h-definitions-together.patch new file mode 100644 index 0000000..33ffe5a --- /dev/null +++ b/0475-meson-move-config-host-h-definitions-together.patch @@ -0,0 +1,86 @@ +From 93765a87fe89fd1e5379659d80b79dc8598a6756 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Fri, 8 Sep 2023 12:10:08 +0200 +Subject: [PATCH] meson: move config-host.h definitions together + +Signed-off-by: Paolo Bonzini +--- + meson.build | 27 ++++++++++++++------------- + 1 file changed, 14 insertions(+), 13 deletions(-) + +diff --git a/meson.build b/meson.build +index 598ec79f8c..315e3466fd 100644 +--- a/meson.build ++++ b/meson.build +@@ -40,7 +40,6 @@ qemu_moddir = get_option('libdir') / get_option('qemu_suffix') + qemu_desktopdir = get_option('datadir') / 'applications' + qemu_icondir = get_option('datadir') / 'icons' + +-config_host_data = configuration_data() + genh = [] + qapi_trace_events = [] + +@@ -241,12 +240,6 @@ elif cpu in ['sw64'] + else + kvm_targets = [] + endif +- +-kvm_targets_c = '""' +-if get_option('kvm').allowed() and targetos == 'linux' +- kvm_targets_c = '"' + '" ,"'.join(kvm_targets) + '"' +-endif +-config_host_data.set('CONFIG_KVM_TARGETS', kvm_targets_c) + accelerator_targets = { 'CONFIG_KVM': kvm_targets } + + if cpu in ['x86', 'x86_64'] +@@ -1214,12 +1207,6 @@ if not get_option('virglrenderer').auto() or have_system or have_vhost_user_gpu + virgl = dependency('virglrenderer', + method: 'pkg-config', + required: get_option('virglrenderer')) +- if virgl.found() +- config_host_data.set('HAVE_VIRGL_D3D_INFO_EXT', +- cc.has_member('struct virgl_renderer_resource_info_ext', 'd3d_tex2d', +- prefix: '#include ', +- dependencies: virgl)) +- endif + endif + rutabaga = not_found + if not get_option('rutabaga_gfx').auto() or have_system or have_vhost_user_gpu +@@ -2031,6 +2018,8 @@ endif + # config-host.h # + ################# + ++config_host_data = configuration_data() ++ + audio_drivers_selected = [] + if have_system + audio_drivers_available = { +@@ -2160,6 +2149,12 @@ endif + + config_host_data.set('HOST_' + host_arch.to_upper(), 1) + ++kvm_targets_c = '""' ++if 
get_option('kvm').allowed() and targetos == 'linux' ++ kvm_targets_c = '"' + '" ,"'.join(kvm_targets) + '"' ++endif ++config_host_data.set('CONFIG_KVM_TARGETS', kvm_targets_c) ++ + if get_option('module_upgrades') and not enable_modules + error('Cannot enable module-upgrades as modules are not enabled') + endif +@@ -2259,6 +2254,12 @@ config_host_data.set('CONFIG_PNG', png.found()) + config_host_data.set('CONFIG_VNC', vnc.found()) + config_host_data.set('CONFIG_VNC_JPEG', jpeg.found()) + config_host_data.set('CONFIG_VNC_SASL', sasl.found()) ++if virgl.found() ++ config_host_data.set('HAVE_VIRGL_D3D_INFO_EXT', ++ cc.has_member('struct virgl_renderer_resource_info_ext', 'd3d_tex2d', ++ prefix: '#include ', ++ dependencies: virgl)) ++endif + config_host_data.set('CONFIG_VIRTFS', have_virtfs) + config_host_data.set('CONFIG_VTE', vte.found()) + config_host_data.set('CONFIG_XKBCOMMON', xkbcommon.found()) +-- +2.39.3 + diff --git a/0476-meson-move-subdirs-to-collect-sources-section.patch b/0476-meson-move-subdirs-to-collect-sources-section.patch new file mode 100644 index 0000000..a74cf0f --- /dev/null +++ b/0476-meson-move-subdirs-to-collect-sources-section.patch @@ -0,0 +1,101 @@ +From bce2095b2c8c9738fc23d085685e401dc9728380 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Fri, 3 Nov 2023 09:33:57 +0100 +Subject: [PATCH] meson: move subdirs to "Collect sources" section +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Paolo Bonzini +--- + meson.build | 66 ++++++++++++++++++++++++++--------------------------- + 1 file changed, 33 insertions(+), 33 deletions(-) + +diff --git a/meson.build b/meson.build +index 315e3466fd..679904a316 100644 +--- a/meson.build ++++ b/meson.build +@@ -3283,39 +3283,6 @@ foreach d : hx_headers + endforeach + genh += hxdep + +-################### +-# Collect sources # +-################### +- +-authz_ss = ss.source_set() +-blockdev_ss = ss.source_set() +-block_ss = ss.source_set() +-chardev_ss = ss.source_set() +-common_ss = ss.source_set() +-crypto_ss = ss.source_set() +-hwcore_ss = ss.source_set() +-io_ss = ss.source_set() +-qmp_ss = ss.source_set() +-qom_ss = ss.source_set() +-system_ss = ss.source_set() +-specific_fuzz_ss = ss.source_set() +-specific_ss = ss.source_set() +-stub_ss = ss.source_set() +-trace_ss = ss.source_set() +-user_ss = ss.source_set() +-util_ss = ss.source_set() +- +-# accel modules +-qtest_module_ss = ss.source_set() +-tcg_module_ss = ss.source_set() +- +-modules = {} +-target_modules = {} +-hw_arch = {} +-target_arch = {} +-target_system_arch = {} +-target_user_arch = {} +- + ############### + # Trace files # + ############### +@@ -3432,6 +3399,39 @@ if have_system or have_user + ] + endif + ++################### ++# Collect sources # ++################### ++ ++authz_ss = ss.source_set() ++blockdev_ss = ss.source_set() ++block_ss = ss.source_set() ++chardev_ss = ss.source_set() ++common_ss = ss.source_set() ++crypto_ss = ss.source_set() ++hwcore_ss = ss.source_set() ++io_ss = ss.source_set() ++qmp_ss = ss.source_set() ++qom_ss = ss.source_set() ++system_ss = ss.source_set() ++specific_fuzz_ss = ss.source_set() ++specific_ss = ss.source_set() ++stub_ss = ss.source_set() ++trace_ss = ss.source_set() ++user_ss = ss.source_set() ++util_ss = ss.source_set() ++ ++# accel modules ++qtest_module_ss = ss.source_set() ++tcg_module_ss = ss.source_set() ++ ++modules = {} ++target_modules = {} ++hw_arch = {} ++target_arch = {} ++target_system_arch = {} 
++target_user_arch = {} ++ + # NOTE: the trace/ subdirectory needs the qapi_trace_events variable + # that is filled in by qapi/. + subdir('qapi') +-- +2.39.3 + diff --git a/0477-meson-always-probe-u2f-and-canokey-if-the-option-is-.patch b/0477-meson-always-probe-u2f-and-canokey-if-the-option-is-.patch new file mode 100644 index 0000000..c743a70 --- /dev/null +++ b/0477-meson-always-probe-u2f-and-canokey-if-the-option-is-.patch @@ -0,0 +1,34 @@ +From d726760b6d73a5566661637ec0842ab56faf1c38 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Fri, 8 Sep 2023 12:10:27 +0200 +Subject: [PATCH] meson: always probe u2f and canokey if the option is enabled + +commit e7c22ff87aa3a71e6cad1e88c2651dde8c7d504b upstream. + +Signed-off-by: Paolo Bonzini +--- + meson.build | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/meson.build b/meson.build +index 679904a316..5318b28e44 100644 +--- a/meson.build ++++ b/meson.build +@@ -1851,12 +1851,12 @@ if not get_option('smartcard').auto() or have_system + version: '>=2.5.1', method: 'pkg-config') + endif + u2f = not_found +-if have_system ++if not get_option('u2f').auto() or have_system + u2f = dependency('u2f-emu', required: get_option('u2f'), + method: 'pkg-config') + endif + canokey = not_found +-if have_system ++if not get_option('canokey').auto() or have_system + canokey = dependency('canokey-qemu', required: get_option('canokey'), + method: 'pkg-config') + endif +-- +2.39.3 + diff --git a/0478-hw-net-net-tx-pkt-fix-overrun-in-update-sctp-checksu.patch b/0478-hw-net-net-tx-pkt-fix-overrun-in-update-sctp-checksu.patch new file mode 100644 index 0000000..109ae0a --- /dev/null +++ b/0478-hw-net-net-tx-pkt-fix-overrun-in-update-sctp-checksu.patch @@ -0,0 +1,71 @@ +From 21822c11a828053829531eb05cc0cdfcd59074a5 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= +Date: Tue, 9 Apr 2024 19:54:05 +0200 +Subject: [PATCH] hw/net/net_tx_pkt: Fix overrun in update_sctp_checksum() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +commit 83ddb3dbba2ee0f1767442ae6ee665058aeb1093 upstream. + +If a fragmented packet size is too short, do not try to +calculate its checksum. + +Reproduced using: + + $ cat << EOF | qemu-system-i386 -display none -nodefaults \ + -machine q35,accel=qtest -m 32M \ + -device igb,netdev=net0 \ + -netdev user,id=net0 \ + -qtest stdio + outl 0xcf8 0x80000810 + outl 0xcfc 0xe0000000 + outl 0xcf8 0x80000804 + outw 0xcfc 0x06 + write 0xe0000403 0x1 0x02 + writel 0xe0003808 0xffffffff + write 0xe000381a 0x1 0x5b + write 0xe000381b 0x1 0x00 + EOF + Assertion failed: (offset == 0), function iov_from_buf_full, file util/iov.c, line 39. 
+ #1 0x5575e81e952a in iov_from_buf_full qemu/util/iov.c:39:5 + #2 0x5575e6500768 in net_tx_pkt_update_sctp_checksum qemu/hw/net/net_tx_pkt.c:144:9 + #3 0x5575e659f3e1 in igb_setup_tx_offloads qemu/hw/net/igb_core.c:478:11 + #4 0x5575e659f3e1 in igb_tx_pkt_send qemu/hw/net/igb_core.c:552:10 + #5 0x5575e659f3e1 in igb_process_tx_desc qemu/hw/net/igb_core.c:671:17 + #6 0x5575e659f3e1 in igb_start_xmit qemu/hw/net/igb_core.c:903:9 + #7 0x5575e659f3e1 in igb_set_tdt qemu/hw/net/igb_core.c:2812:5 + #8 0x5575e657d6a4 in igb_core_write qemu/hw/net/igb_core.c:4248:9 + +Fixes: CVE-2024-3567 +Cc: qemu-stable@nongnu.org +Reported-by: Zheyu Ma +Fixes: f199b13bc1 ("igb: Implement Tx SCTP CSO") +Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2273 +Signed-off-by: Philippe Mathieu-Daudé +Reviewed-by: Akihiko Odaki +Acked-by: Jason Wang +Message-Id: <20240410070459.49112-1-philmd@linaro.org> +Signed-off-by: Bin Guo +--- + hw/net/net_tx_pkt.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/hw/net/net_tx_pkt.c b/hw/net/net_tx_pkt.c +index 2e5f58b3c9..d40d508a11 100644 +--- a/hw/net/net_tx_pkt.c ++++ b/hw/net/net_tx_pkt.c +@@ -141,6 +141,10 @@ bool net_tx_pkt_update_sctp_checksum(struct NetTxPkt *pkt) + uint32_t csum = 0; + struct iovec *pl_start_frag = pkt->vec + NET_TX_PKT_PL_START_FRAG; + ++ if (iov_size(pl_start_frag, pkt->payload_frags) < 8 + sizeof(csum)) { ++ return false; ++ } ++ + if (iov_from_buf(pl_start_frag, pkt->payload_frags, 8, &csum, sizeof(csum)) < sizeof(csum)) { + return false; + } +-- +2.39.3 + diff --git a/0479-util-iov-do-not-assert-offset-is-in-iov.patch b/0479-util-iov-do-not-assert-offset-is-in-iov.patch new file mode 100644 index 0000000..47a7508 --- /dev/null +++ b/0479-util-iov-do-not-assert-offset-is-in-iov.patch @@ -0,0 +1,110 @@ +From dcb03241609646d4397b24e33ba332b1f2299a56 Mon Sep 17 00:00:00 2001 +From: Akihiko Odaki +Date: Sun, 28 Apr 2024 20:11:22 +0900 +Subject: [PATCH] util/iov: Do not assert offset is in iov + +commit 9dc64bd5a4bebdc820e7e8484cb30e02befdc774 upstream. + +iov_from_buf(), iov_to_buf(), iov_memset(), and iov_copy() asserts +that the given offset fits in the iov while tolerating the specified +number of bytes to operate with to be greater than the size of iov. +This is inconsistent so remove the assertions. + +Asserting the offset fits in the iov makes sense if it is expected that +there are other operations that process the content before the offset +and the content is processed in order. Under this expectation, the +offset should point to the end of bytes that are previously processed +and fit in the iov. However, this expectation depends on the details of +the caller, and did not hold true at least one case and required code to +check iov_size(), which is added with commit 83ddb3dbba2e +("hw/net/net_tx_pkt: Fix overrun in update_sctp_checksum()"). + +Adding such a check is inefficient and error-prone. These functions +already tolerate the specified number of bytes to operate with to be +greater than the size of iov to avoid such checks so remove the +assertions to tolerate invalid offset as well. They return the number of +bytes they operated with so their callers can still check the returned +value to ensure there are sufficient space at the given offset. 
+ +Signed-off-by: Akihiko Odaki +Signed-off-by: Jason Wang +Signed-off-by: Bin Guo +--- + include/qemu/iov.h | 5 +++-- + util/iov.c | 5 ----- + 2 files changed, 3 insertions(+), 7 deletions(-) + +diff --git a/include/qemu/iov.h b/include/qemu/iov.h +index 63a1c01965..33548058d2 100644 +--- a/include/qemu/iov.h ++++ b/include/qemu/iov.h +@@ -30,7 +30,7 @@ size_t iov_size(const struct iovec *iov, const unsigned int iov_cnt); + * only part of data will be copied, up to the end of the iovec. + * Number of bytes actually copied will be returned, which is + * min(bytes, iov_size(iov)-offset) +- * `Offset' must point to the inside of iovec. ++ * Returns 0 when `offset' points to the outside of iovec. + */ + size_t iov_from_buf_full(const struct iovec *iov, unsigned int iov_cnt, + size_t offset, const void *buf, size_t bytes); +@@ -66,11 +66,12 @@ iov_to_buf(const struct iovec *iov, const unsigned int iov_cnt, + /** + * Set data bytes pointed out by iovec `iov' of size `iov_cnt' elements, + * starting at byte offset `start', to value `fillc', repeating it +- * `bytes' number of times. `Offset' must point to the inside of iovec. ++ * `bytes' number of times. + * If `bytes' is large enough, only last bytes portion of iovec, + * up to the end of it, will be filled with the specified value. + * Function return actual number of bytes processed, which is + * min(size, iov_size(iov) - offset). ++ * Returns 0 when `offset' points to the outside of iovec. + */ + size_t iov_memset(const struct iovec *iov, const unsigned int iov_cnt, + size_t offset, int fillc, size_t bytes); +diff --git a/util/iov.c b/util/iov.c +index 7e73948f5e..a523b406b7 100644 +--- a/util/iov.c ++++ b/util/iov.c +@@ -36,7 +36,6 @@ size_t iov_from_buf_full(const struct iovec *iov, unsigned int iov_cnt, + offset -= iov[i].iov_len; + } + } +- assert(offset == 0); + return done; + } + +@@ -55,7 +54,6 @@ size_t iov_to_buf_full(const struct iovec *iov, const unsigned int iov_cnt, + offset -= iov[i].iov_len; + } + } +- assert(offset == 0); + return done; + } + +@@ -74,7 +72,6 @@ size_t iov_memset(const struct iovec *iov, const unsigned int iov_cnt, + offset -= iov[i].iov_len; + } + } +- assert(offset == 0); + return done; + } + +@@ -266,7 +263,6 @@ unsigned iov_copy(struct iovec *dst_iov, unsigned int dst_iov_cnt, + bytes -= len; + offset = 0; + } +- assert(offset == 0); + return j; + } + +@@ -337,7 +333,6 @@ size_t qemu_iovec_concat_iov(QEMUIOVector *dst, + soffset -= src_iov[i].iov_len; + } + } +- assert(soffset == 0); /* offset beyond end of src */ + + return done; + } +-- +2.39.3 + diff --git a/0480-revert-hw-net-net-tx-pkt-fix-overrun-in-update-sctp-.patch b/0480-revert-hw-net-net-tx-pkt-fix-overrun-in-update-sctp-.patch new file mode 100644 index 0000000..080f797 --- /dev/null +++ b/0480-revert-hw-net-net-tx-pkt-fix-overrun-in-update-sctp-.patch @@ -0,0 +1,42 @@ +From ea6b34115e0f3448a2ca366917c1830f26e1b7f3 Mon Sep 17 00:00:00 2001 +From: Akihiko Odaki +Date: Sun, 28 Apr 2024 20:11:23 +0900 +Subject: [PATCH] Revert "hw/net/net_tx_pkt: Fix overrun in + update_sctp_checksum()" +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +commit 2938c36937559554a2408b008470c9a76cb9271a upstream. + +This reverts commit 83ddb3dbba2ee0f1767442ae6ee665058aeb1093. + +The added check is no longer necessary due to a change of +iov_from_buf(). 
+ +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Akihiko Odaki +Signed-off-by: Jason Wang +Signed-off-by: Bin Guo +--- + hw/net/net_tx_pkt.c | 4 ---- + 1 file changed, 4 deletions(-) + +diff --git a/hw/net/net_tx_pkt.c b/hw/net/net_tx_pkt.c +index d40d508a11..2e5f58b3c9 100644 +--- a/hw/net/net_tx_pkt.c ++++ b/hw/net/net_tx_pkt.c +@@ -141,10 +141,6 @@ bool net_tx_pkt_update_sctp_checksum(struct NetTxPkt *pkt) + uint32_t csum = 0; + struct iovec *pl_start_frag = pkt->vec + NET_TX_PKT_PL_START_FRAG; + +- if (iov_size(pl_start_frag, pkt->payload_frags) < 8 + sizeof(csum)) { +- return false; +- } +- + if (iov_from_buf(pl_start_frag, pkt->payload_frags, 8, &csum, sizeof(csum)) < sizeof(csum)) { + return false; + } +-- +2.39.3 + diff --git a/qemu.spec b/qemu.spec index 0b7f475..a0f6888 100644 --- a/qemu.spec +++ b/qemu.spec @@ -1,4 +1,4 @@ -%define anolis_release 31 +%define anolis_release 32 %bcond_with check %global all_system_emu_support 0 @@ -746,6 +746,26 @@ Patch0457: 0457-hw-virtio-introduce-virtio-bh-new-guarded-helper.patch Patch0458: 0458-hw-display-virtio-gpu-protect-from-dma-re-entrancy-b.patch Patch0459: 0459-hw-char-virtio-serial-bus-protect-from-dma-re-entran.patch Patch0460: 0460-hw-virtio-virtio-crypto-protect-from-dma-re-entrancy.patch +Patch0461: 0461-linux-headers-riscv-add-ptrace-h.patch +Patch0462: 0462-update-linux-headers-fix-forwarding-to-asm-generic-h.patch +Patch0463: 0463-update-linux-headers-move-pvpanic-h-to-correct-direc.patch +Patch0464: 0464-update-linux-headers-import-linux-kvm-para-h-header.patch +Patch0465: 0465-meson-fix-type-of-relocatable-option.patch +Patch0466: 0466-makefile-clean-qemu-iotests-output.patch +Patch0467: 0467-configure-remove-unnecessary-subshell.patch +Patch0468: 0468-configure-unify-again-the-case-arms-in-probe-target-.patch +Patch0469: 0469-meson-add-more-sections-to-main-meson-build.patch +Patch0470: 0470-meson-move-program-checks-together.patch +Patch0471: 0471-meson-move-option-validation-together.patch +Patch0472: 0472-meson-move-accelerator-dependency-checks-together.patch +Patch0473: 0473-meson-keep-subprojects-together.patch +Patch0474: 0474-meson-move-cfi-detection-code-with-other-compiler-fl.patch +Patch0475: 0475-meson-move-config-host-h-definitions-together.patch +Patch0476: 0476-meson-move-subdirs-to-collect-sources-section.patch +Patch0477: 0477-meson-always-probe-u2f-and-canokey-if-the-option-is-.patch +Patch0478: 0478-hw-net-net-tx-pkt-fix-overrun-in-update-sctp-checksu.patch +Patch0479: 0479-util-iov-do-not-assert-offset-is-in-iov.patch +Patch0480: 0480-revert-hw-net-net-tx-pkt-fix-overrun-in-update-sctp-.patch ExclusiveArch: x86_64 aarch64 loongarch64 riscv64 @@ -2310,6 +2330,13 @@ useradd -r -u 107 -g qemu -G kvm -d / -s /sbin/nologin \ %endif %changelog +* Fri Aug 15 2025 wh02252983 - 2:8.2.0-32 +- meson: move subdirs to "Collect sources" section +- meson: always probe u2f and canokey if the option is enabled +- hw/net/net_tx_pkt: Fix overrun in update_sctp_checksum() +- util/iov: Do not assert offset is in iov +- Revert "hw/net/net_tx_pkt: Fix overrun in update_sctp_checksum()" + * Tue Jul 15 2025 wh02252983 - 2:8.2.0-31 - virtio-net: Ensure queue index fits with RSS - hw/virtio: Introduce virtio_bh_new_guarded() helper -- Gitee From bcfbe9c5ceb457ac70d3c5ec25d140bf7d73e728 Mon Sep 17 00:00:00 2001 From: yangxinyu Date: Tue, 26 Aug 2025 11:07:09 +0800 Subject: [PATCH 3/3] [CVE] fix cve-2024-3567 to #bug21905 fix cve-2024-3567 Project: TC2024080204 Signed-off-by:yangxinyu --- 0481-qemu-cve-2024-3567.patch | 67 
+++++++++++++++++++++++++++++++++++ qemu.spec | 8 ++++- 2 files changed, 74 insertions(+), 1 deletion(-) create mode 100644 0481-qemu-cve-2024-3567.patch diff --git a/0481-qemu-cve-2024-3567.patch b/0481-qemu-cve-2024-3567.patch new file mode 100644 index 0000000..adc7a3a --- /dev/null +++ b/0481-qemu-cve-2024-3567.patch @@ -0,0 +1,67 @@ +From 83ddb3dbba2ee0f1767442ae6ee665058aeb1093 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= +Date: Tue, 9 Apr 2024 19:54:05 +0200 +Subject: [PATCH] hw/net/net_tx_pkt: Fix overrun in update_sctp_checksum() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +If a fragmented packet size is too short, do not try to +calculate its checksum. + +Reproduced using: + + $ cat << EOF | qemu-system-i386 -display none -nodefaults \ + -machine q35,accel=qtest -m 32M \ + -device igb,netdev=net0 \ + -netdev user,id=net0 \ + -qtest stdio + outl 0xcf8 0x80000810 + outl 0xcfc 0xe0000000 + outl 0xcf8 0x80000804 + outw 0xcfc 0x06 + write 0xe0000403 0x1 0x02 + writel 0xe0003808 0xffffffff + write 0xe000381a 0x1 0x5b + write 0xe000381b 0x1 0x00 + EOF + Assertion failed: (offset == 0), function iov_from_buf_full, file util/iov.c, line 39. + #1 0x5575e81e952a in iov_from_buf_full qemu/util/iov.c:39:5 + #2 0x5575e6500768 in net_tx_pkt_update_sctp_checksum qemu/hw/net/net_tx_pkt.c:144:9 + #3 0x5575e659f3e1 in igb_setup_tx_offloads qemu/hw/net/igb_core.c:478:11 + #4 0x5575e659f3e1 in igb_tx_pkt_send qemu/hw/net/igb_core.c:552:10 + #5 0x5575e659f3e1 in igb_process_tx_desc qemu/hw/net/igb_core.c:671:17 + #6 0x5575e659f3e1 in igb_start_xmit qemu/hw/net/igb_core.c:903:9 + #7 0x5575e659f3e1 in igb_set_tdt qemu/hw/net/igb_core.c:2812:5 + #8 0x5575e657d6a4 in igb_core_write qemu/hw/net/igb_core.c:4248:9 + +Fixes: CVE-2024-3567 +Cc: qemu-stable@nongnu.org +Reported-by: Zheyu Ma +Fixes: f199b13bc1 ("igb: Implement Tx SCTP CSO") +Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2273 +Signed-off-by: Philippe Mathieu-Daudé +Reviewed-by: Akihiko Odaki +Acked-by: Jason Wang +Message-Id: <20240410070459.49112-1-philmd@linaro.org> +--- + hw/net/net_tx_pkt.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/hw/net/net_tx_pkt.c b/hw/net/net_tx_pkt.c +index 2134a18c4c9..b7b1de816dc 100644 +--- a/hw/net/net_tx_pkt.c ++++ b/hw/net/net_tx_pkt.c +@@ -141,6 +141,10 @@ bool net_tx_pkt_update_sctp_checksum(struct NetTxPkt *pkt) + uint32_t csum = 0; + struct iovec *pl_start_frag = pkt->vec + NET_TX_PKT_PL_START_FRAG; + ++ if (iov_size(pl_start_frag, pkt->payload_frags) < 8 + sizeof(csum)) { ++ return false; ++ } ++ + if (iov_from_buf(pl_start_frag, pkt->payload_frags, 8, &csum, sizeof(csum)) < sizeof(csum)) { + return false; + } +-- +GitLab diff --git a/qemu.spec b/qemu.spec index a0f6888..f0e71cc 100644 --- a/qemu.spec +++ b/qemu.spec @@ -1,4 +1,4 @@ -%define anolis_release 32 +%define anolis_release 33 %bcond_with check %global all_system_emu_support 0 @@ -767,6 +767,9 @@ Patch0478: 0478-hw-net-net-tx-pkt-fix-overrun-in-update-sctp-checksu.patch Patch0479: 0479-util-iov-do-not-assert-offset-is-in-iov.patch Patch0480: 0480-revert-hw-net-net-tx-pkt-fix-overrun-in-update-sctp-.patch +#https://gitlab.com/qemu-project/qemu/-/commit/83ddb3dbba2ee0f1767442ae6ee665058aeb1093 +Patch0481: 0481-qemu-cve-2024-3567.patch + ExclusiveArch: x86_64 aarch64 loongarch64 riscv64 BuildRequires: meson >= %{meson_version} @@ -2330,6 +2333,9 @@ useradd -r -u 107 -g qemu -G kvm -d / -s /sbin/nologin \ %endif %changelog +* Tue Aug 26 2025 
yangxinyu - 2:8.2.0-33
+- Fix CVE-2024-3567
+
 * Fri Aug 15 2025 wh02252983 - 2:8.2.0-32
 - meson: move subdirs to "Collect sources" section
 - meson: always probe u2f and canokey if the option is enabled
-- 
Gitee