From 991a9cf347fba9fe9465af897fd6e791d62860b6 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Tue, 8 Aug 2023 14:25:29 +0800 Subject: [PATCH 01/56] memory: RAM_NAMED_FILE flag This is from Steve Sistare's qemu live update patch: https://patchew.org/QEMU/1658851843-236870-1-git-send-email-steven.sistare@oracle.com/1658851843-236870-3-git-send-email-steven.sistare@oracle.com -------------------------------------------------------------------------------- A memory-backend-ram or a memory-backend-memfd block with the RAM_SHARED flag set is not migrated when migrate_ignore_shared() is true, but this is wrong, because it has no named backing store, and its contents will be lost. Define a new flag RAM_NAMED_FILE to distinguish this case. Cpr will also test this flag, for similar reasons. Signed-off-by: Steve Sistare Message-Id: <1658851843-236870-3-git-send-email-steven.sistare@oracle.com> Signed-off-by: luofei --- backends/hostmem-file.c | 1 + include/exec/memory.h | 3 +++ include/exec/ram_addr.h | 1 + migration/ram.c | 3 ++- softmmu/physmem.c | 7 ++++++- 5 files changed, 13 insertions(+), 2 deletions(-) diff --git a/backends/hostmem-file.c b/backends/hostmem-file.c index cd038024fa..0ffaead394 100644 --- a/backends/hostmem-file.c +++ b/backends/hostmem-file.c @@ -55,6 +55,7 @@ file_backend_memory_alloc(HostMemoryBackend *backend, Error **errp) ram_flags = backend->share ? RAM_SHARED : 0; ram_flags |= backend->reserve ? 0 : RAM_NORESERVE; ram_flags |= fb->is_pmem ? RAM_PMEM : 0; + ram_flags |= RAM_NAMED_FILE; memory_region_init_ram_from_file(&backend->mr, OBJECT(backend), name, backend->size, fb->align, ram_flags, fb->mem_path, fb->readonly, errp); diff --git a/include/exec/memory.h b/include/exec/memory.h index abb838f194..bb7e5c2c18 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -206,6 +206,9 @@ typedef struct IOMMUTLBEvent { /* RAM that isn't accessible through normal means. 
*/ #define RAM_PROTECTED (1 << 8) +/* RAM is an mmap-ed named file */ +#define RAM_NAMED_FILE (1 << 9) + static inline void iommu_notifier_init(IOMMUNotifier *n, IOMMUNotify fn, IOMMUNotifierFlag flags, hwaddr start, hwaddr end, diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h index 64fb936c7c..1d214b3b42 100644 --- a/include/exec/ram_addr.h +++ b/include/exec/ram_addr.h @@ -94,6 +94,7 @@ static inline unsigned long int ramblock_recv_bitmap_offset(void *host_addr, } bool ramblock_is_pmem(RAMBlock *rb); +bool ramblock_is_named_file(RAMBlock *rb); long qemu_minrampagesize(void); long qemu_maxrampagesize(void); diff --git a/migration/ram.c b/migration/ram.c index 12b8c653d8..bf018f3299 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -201,7 +201,8 @@ out: bool ramblock_is_ignored(RAMBlock *block) { return !qemu_ram_is_migratable(block) || - (migrate_ignore_shared() && qemu_ram_is_shared(block)); + (migrate_ignore_shared() && qemu_ram_is_shared(block) && + ramblock_is_named_file(block)); } #undef RAMBLOCK_FOREACH diff --git a/softmmu/physmem.c b/softmmu/physmem.c index e5c3557d54..819e2c3c17 100644 --- a/softmmu/physmem.c +++ b/softmmu/physmem.c @@ -2077,7 +2077,7 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr, /* Just support these ram flags by now. 
*/ assert((ram_flags & ~(RAM_SHARED | RAM_PMEM | RAM_NORESERVE | - RAM_PROTECTED)) == 0); + RAM_PROTECTED | RAM_NAMED_FILE)) == 0); if (xen_enabled()) { error_setg(errp, "-mem-path not supported with Xen"); @@ -3682,6 +3682,11 @@ bool ramblock_is_pmem(RAMBlock *rb) return rb->flags & RAM_PMEM; } +bool ramblock_is_named_file(RAMBlock *rb) +{ + return rb->flags & RAM_NAMED_FILE; +} + static void mtree_print_phys_entries(int start, int end, int skip, int ptr) { if (start == end - 1) { -- Gitee From f6497afdd6371b51b48413ec4904c63e04040d52 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Tue, 8 Aug 2023 14:59:46 +0800 Subject: [PATCH 02/56] migration: file URI This is from Steve Sistare's qemu live update patch: https://patchew.org/QEMU/1658851843-236870-1-git-send-email-steven.sistare@oracle.com/1658851843-236870-4-git-send-email-steven.sistare@oracle.com/ ----------------------------------------------------------------------------- Extend the migration URI to support file:. This can be used for any migration scenario that does not require a reverse path. It is easier to use than the fd: URI. It can be used in HMP commands, and as a qemu command-line parameter. Signed-off-by: Steve Sistare Message-Id: <1658851843-236870-4-git-send-email-steven.sistare@oracle.com> Signed-off-by: luofei --- migration/file.c | 62 ++++++++++++++++++++++++++++++++++++++++++ migration/file.h | 14 ++++++++++ migration/meson.build | 1 + migration/migration.c | 5 ++++ migration/trace-events | 4 +++ qemu-options.hx | 6 +++- 6 files changed, 91 insertions(+), 1 deletion(-) create mode 100644 migration/file.c create mode 100644 migration/file.h diff --git a/migration/file.c b/migration/file.c new file mode 100644 index 0000000000..233bcda1ff --- /dev/null +++ b/migration/file.c @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2021, 2022 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * See the COPYING file in the top-level directory. 
+ */ + +#include "qemu/osdep.h" +#include "channel.h" +#include "file.h" +#include "migration.h" +#include "io/channel-file.h" +#include "io/channel-util.h" +#include "trace.h" + +void file_start_outgoing_migration(MigrationState *s, const char *filename, + Error **errp) +{ + g_autoptr(QIOChannelFile) fioc = NULL; + QIOChannel *ioc; + + trace_migration_file_outgoing(filename); + + fioc = qio_channel_file_new_path(filename, O_CREAT | O_WRONLY | O_TRUNC, + 0600, errp); + if (!fioc) { + return; + } + + ioc = QIO_CHANNEL(fioc); + qio_channel_set_name(ioc, "migration-file-outgoing"); + migration_channel_connect(s, ioc, NULL, NULL); +} + +static gboolean file_accept_incoming_migration(QIOChannel *ioc, + GIOCondition condition, + gpointer opaque) +{ + migration_channel_process_incoming(ioc); + object_unref(OBJECT(ioc)); + return G_SOURCE_REMOVE; +} + +void file_start_incoming_migration(const char *filename, Error **errp) +{ + QIOChannelFile *fioc = NULL; + QIOChannel *ioc; + + trace_migration_file_incoming(filename); + + fioc = qio_channel_file_new_path(filename, O_RDONLY, 0, errp); + if (!fioc) { + return; + } + + ioc = QIO_CHANNEL(fioc); + qio_channel_set_name(QIO_CHANNEL(ioc), "migration-file-incoming"); + qio_channel_add_watch_full(ioc, G_IO_IN, + file_accept_incoming_migration, + NULL, NULL, + g_main_context_get_thread_default()); +} diff --git a/migration/file.h b/migration/file.h new file mode 100644 index 0000000000..aa697df5d7 --- /dev/null +++ b/migration/file.h @@ -0,0 +1,14 @@ +/* + * Copyright (c) 2021, 2022 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * See the COPYING file in the top-level directory. 
+ */ + +#ifndef QEMU_MIGRATION_FILE_H +#define QEMU_MIGRATION_FILE_H +void file_start_incoming_migration(const char *filename, Error **errp); + +void file_start_outgoing_migration(MigrationState *s, const char *filename, + Error **errp); +#endif diff --git a/migration/meson.build b/migration/meson.build index f8714dcb15..7cd4604322 100644 --- a/migration/meson.build +++ b/migration/meson.build @@ -17,6 +17,7 @@ softmmu_ss.add(files( 'colo.c', 'exec.c', 'fd.c', + 'file.c', 'global_state.c', 'migration.c', 'multifd.c', diff --git a/migration/migration.c b/migration/migration.c index 2ec116f901..59be6a7f00 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -20,6 +20,7 @@ #include "migration/blocker.h" #include "exec.h" #include "fd.h" +#include "file.h" #include "socket.h" #include "sysemu/runstate.h" #include "sysemu/sysemu.h" @@ -472,6 +473,8 @@ static void qemu_start_incoming_migration(const char *uri, Error **errp) exec_start_incoming_migration(p, errp); } else if (strstart(uri, "fd:", &p)) { fd_start_incoming_migration(p, errp); + } else if (strstart(uri, "file:", &p)) { + file_start_incoming_migration(p, errp); } else { error_setg(errp, "unknown migration protocol: %s", uri); } @@ -2353,6 +2356,8 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk, exec_start_outgoing_migration(s, p, &local_err); } else if (strstart(uri, "fd:", &p)) { fd_start_outgoing_migration(s, p, &local_err); + } else if (strstart(uri, "file:", &p)) { + file_start_outgoing_migration(s, p, &local_err); } else { if (!(has_resume && resume)) { yank_unregister_instance(MIGRATION_YANK_INSTANCE); diff --git a/migration/trace-events b/migration/trace-events index b48d873b8a..880d47df60 100644 --- a/migration/trace-events +++ b/migration/trace-events @@ -291,6 +291,10 @@ migration_exec_incoming(const char *cmd) "cmd=%s" migration_fd_outgoing(int fd) "fd=%d" migration_fd_incoming(int fd) "fd=%d" +# file.c +migration_file_outgoing(const char *filename) "filename=%s" 
+migration_file_incoming(const char *filename) "filename=%s" + # socket.c migration_socket_incoming_accepted(void) "" migration_socket_outgoing_connected(const char *hostname) "hostname=%s" diff --git a/qemu-options.hx b/qemu-options.hx index e329ec58ca..d7e94665ed 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -4418,6 +4418,7 @@ DEF("incoming", HAS_ARG, QEMU_OPTION_incoming, \ " prepare for incoming migration, listen on\n" \ " specified protocol and socket address\n" \ "-incoming fd:fd\n" \ + "-incoming file:filename\n" \ "-incoming exec:cmdline\n" \ " accept incoming migration on given file descriptor\n" \ " or from given external command\n" \ @@ -4434,7 +4435,10 @@ SRST Prepare for incoming migration, listen on a given unix socket. ``-incoming fd:fd`` - Accept incoming migration from a given filedescriptor. + Accept incoming migration from a given file descriptor. + +``-incoming file:filename`` + Accept incoming migration from a given file. ``-incoming exec:cmdline`` Accept incoming migration as an output from specified external -- Gitee From 3ae6dcd1df35c5533c988fe1b63aaa9f75ea099c Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Tue, 8 Aug 2023 15:36:12 +0800 Subject: [PATCH 03/56] migration: mode parameter This is from Steve Sistare's qemu live update patch: https://patchew.org/QEMU/1658851843-236870-1-git-send-email-steven.sistare@oracle.com/1658851843-236870-5-git-send-email-steven.sistare@oracle.com/ ---------------------------------------------------------------------------- Create a mode migration parameter that can be used to select alternate migration algorithms. The default mode is normal, representing the current migration algorithm, and does not need to be explicitly set. No functional change until a new mode is added, except that the mode is shown by the 'info migrate' command. 
Signed-off-by: Steve Sistare Message-Id: <1658851843-236870-5-git-send-email-steven.sistare@oracle.com> Signed-off-by: luofei --- hw/core/qdev-properties-system.c | 12 ++++++++++++ include/hw/qdev-properties-system.h | 4 ++++ include/migration/misc.h | 4 ++++ migration/migration.c | 28 ++++++++++++++++++++++++++++ monitor/hmp-cmds.c | 8 ++++++++ qapi/migration.json | 28 +++++++++++++++++++++++++--- 6 files changed, 81 insertions(+), 3 deletions(-) diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-system.c index b93ed9b4dd..1237c13007 100644 --- a/hw/core/qdev-properties-system.c +++ b/hw/core/qdev-properties-system.c @@ -567,6 +567,18 @@ const PropertyInfo qdev_prop_losttickpolicy = { .set_default_value = qdev_propinfo_set_default_value_enum, }; +/* --- MigMode --- */ + +const PropertyInfo qdev_prop_mig_mode = { + .name = "MigMode", + .description = "mig_mode values, " + "normal/exec", + .enum_table = &MigMode_lookup, + .get = qdev_propinfo_get_enum, + .set = qdev_propinfo_set_enum, + .set_default_value = qdev_propinfo_set_default_value_enum, +}; + /* --- blocksize --- */ static void set_blocksize(Object *obj, Visitor *v, const char *name, diff --git a/include/hw/qdev-properties-system.h b/include/hw/qdev-properties-system.h index 906a027676..817dd8377a 100644 --- a/include/hw/qdev-properties-system.h +++ b/include/hw/qdev-properties-system.h @@ -7,6 +7,7 @@ extern const PropertyInfo qdev_prop_chr; extern const PropertyInfo qdev_prop_macaddr; extern const PropertyInfo qdev_prop_reserved_region; extern const PropertyInfo qdev_prop_multifd_compression; +extern const PropertyInfo qdev_prop_mig_mode; extern const PropertyInfo qdev_prop_losttickpolicy; extern const PropertyInfo qdev_prop_blockdev_on_error; extern const PropertyInfo qdev_prop_blockdev_retry_interval; @@ -43,6 +44,9 @@ extern const PropertyInfo qdev_prop_pcie_link_width; #define DEFINE_PROP_MULTIFD_COMPRESSION(_n, _s, _f, _d) \ DEFINE_PROP_SIGNED(_n, _s, _f, _d, 
qdev_prop_multifd_compression, \ MultiFDCompression) +#define DEFINE_PROP_MIG_MODE(_n, _s, _f, _d) \ + DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_mig_mode, \ + MigMode) #define DEFINE_PROP_LOSTTICKPOLICY(_n, _s, _f, _d) \ DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_losttickpolicy, \ LostTickPolicy) diff --git a/include/migration/misc.h b/include/migration/misc.h index 465906710d..1e01134cfb 100644 --- a/include/migration/misc.h +++ b/include/migration/misc.h @@ -15,6 +15,7 @@ #define MIGRATION_MISC_H #include "qemu/notify.h" +#include "qapi/qapi-types-migration.h" #include "qapi/qapi-types-net.h" /* migration/ram.c */ @@ -75,4 +76,7 @@ bool migration_in_bg_snapshot(void); /* migration/block-dirty-bitmap.c */ void dirty_bitmap_mig_init(void); +MigMode migrate_mode(void); +MigMode migrate_mode_of(MigrationState *s); + #endif diff --git a/migration/migration.c b/migration/migration.c index 59be6a7f00..3bb9130fc8 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -851,6 +851,8 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp) /* TODO use QAPI_CLONE() instead of duplicating it inline */ params = g_malloc0(sizeof(*params)); + params->has_mode = true; + params->mode = s->parameters.mode; params->has_compress_level = true; params->compress_level = s->parameters.compress_level; params->has_compress_threads = true; @@ -1507,6 +1509,10 @@ static void migrate_params_test_apply(MigrateSetParameters *params, /* TODO use QAPI_CLONE() instead of duplicating it inline */ + if (params->has_mode) { + dest->mode = params->mode; + } + if (params->has_compress_level) { dest->compress_level = params->compress_level; } @@ -1608,6 +1614,10 @@ static void migrate_params_apply(MigrateSetParameters *params, Error **errp) /* TODO use QAPI_CLONE() instead of duplicating it inline */ + if (params->has_mode) { + s->parameters.mode = params->mode; + } + if (params->has_compress_level) { s->parameters.compress_level = params->compress_level; } @@ -2591,6 +2601,20 
@@ int migrate_multifd_zstd_level(void) return s->parameters.multifd_zstd_level; } +MigMode migrate_mode(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->parameters.mode; +} + +MigMode migrate_mode_of(MigrationState *s) +{ + return s->parameters.mode; +} + int migrate_use_xbzrle(void) { MigrationState *s; @@ -4202,6 +4226,9 @@ static Property migration_properties[] = { clear_bitmap_shift, CLEAR_BITMAP_SHIFT_DEFAULT), /* Migration parameters */ + DEFINE_PROP_MIG_MODE("mode", MigrationState, + parameters.mode, + MIG_MODE_NORMAL), DEFINE_PROP_UINT8("x-compress-level", MigrationState, parameters.compress_level, DEFAULT_MIGRATE_COMPRESS_LEVEL), @@ -4329,6 +4356,7 @@ static void migration_instance_init(Object *obj) params->tls_creds = g_strdup(""); /* Set has_* up only for parameter checks */ + params->has_mode = true; params->has_compress_level = true; params->has_compress_threads = true; params->has_decompress_threads = true; diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c index 9570011232..7146e74a34 100644 --- a/monitor/hmp-cmds.c +++ b/monitor/hmp-cmds.c @@ -413,6 +413,10 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict) monitor_printf(mon, "%s: %" PRIu64 " ms\n", MigrationParameter_str(MIGRATION_PARAMETER_ANNOUNCE_STEP), params->announce_step); + assert(params->has_mode); + monitor_printf(mon, "%s: %s\n", + MigrationParameter_str(MIGRATION_PARAMETER_MODE), + qapi_enum_lookup(&MigMode_lookup, params->mode)); assert(params->has_compress_level); monitor_printf(mon, "%s: %u\n", MigrationParameter_str(MIGRATION_PARAMETER_COMPRESS_LEVEL), @@ -1205,6 +1209,10 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) } switch (val) { + case MIGRATION_PARAMETER_MODE: + p->has_mode = true; + visit_type_MigMode(v, param, &p->mode, &err); + break; case MIGRATION_PARAMETER_COMPRESS_LEVEL: p->has_compress_level = true; visit_type_uint8(v, param, &p->compress_level, &err); diff --git a/qapi/migration.json 
b/qapi/migration.json index e965f4329b..279295b3ed 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -541,6 +541,16 @@ 'data': [ 'none', 'zlib', { 'name': 'zstd', 'if': 'CONFIG_ZSTD' } ] } +## +# @MigMode: +# +# @normal: the original form of migration. +# +# Since: 6.2 +## +{ 'enum': 'MigMode', + 'data': [ 'normal' ] } + ## # @BitmapMigrationBitmapAliasTransform: # @@ -615,6 +625,9 @@ # # Migration parameters enumeration # +# @mode: Migration mode. See description in @MigMode. Default is 'normal'. +# (Since 6.2) +# # @announce-initial: Initial delay (in milliseconds) before sending the first # announce (Since 4.0) # @@ -772,7 +785,8 @@ # Since: 2.4 ## { 'enum': 'MigrationParameter', - 'data': ['announce-initial', 'announce-max', + 'data': ['mode', + 'announce-initial', 'announce-max', 'announce-rounds', 'announce-step', 'compress-level', 'compress-threads', 'decompress-threads', 'compress-wait-thread', 'compress-method', 'throttle-trigger-threshold', @@ -791,6 +805,9 @@ ## # @MigrateSetParameters: # +# @mode: Migration mode. See description in @MigMode. Default is 'normal'. +# (Since 6.2) +# # @announce-initial: Initial delay (in milliseconds) before sending the first # announce (Since 4.0) # @@ -942,7 +959,8 @@ # TODO either fuse back into MigrationParameters, or make # MigrationParameters members mandatory { 'struct': 'MigrateSetParameters', - 'data': { '*announce-initial': 'size', + 'data': { '*mode': 'MigMode', + '*announce-initial': 'size', '*announce-max': 'size', '*announce-rounds': 'size', '*announce-step': 'size', @@ -993,6 +1011,9 @@ # # The optional members aren't actually optional. # +# @mode: Migration mode. See description in @MigMode. Default is 'normal'. 
+# (Since 6.2) +# +# @announce-initial: Initial delay (in milliseconds) before sending the +# first announce (Since 4.0) +# @@ -1144,7 +1165,8 @@ # Since: 2.4 ## { 'struct': 'MigrationParameters', - 'data': { '*announce-initial': 'size', + 'data': { '*mode': 'MigMode', + '*announce-initial': 'size', '*announce-max': 'size', '*announce-rounds': 'size', '*announce-step': 'size', -- Gitee From 88edef537ffef44d4aff39d2976667f332cd1ef0 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Tue, 8 Aug 2023 15:54:22 +0800 Subject: [PATCH 04/56] migration: migrate-enable-mode option This is from Steve Sistare's qemu live update patch: https://patchew.org/QEMU/1658851843-236870-1-git-send-email-steven.sistare@oracle.com/1658851843-236870-6-git-send-email-steven.sistare@oracle.com/ ------------------------------------------------------------------- Add the '-migrate-mode-enable <mode>' command-line option as a pre-requisite for migration using the mode. Multiple -migrate-mode-enable options may be specified, one per mode. Requiring -migrate-mode-enable allows qemu to initialize objects differently, if necessary, so that migration for a mode is not blocked. 
Signed-off-by: Steve Sistare Message-Id: <1658851843-236870-6-git-send-email-steven.sistare@oracle.com> Signed-off-by: luofei --- include/migration/misc.h | 2 ++ migration/migration.c | 31 +++++++++++++++++++++++++++++++ qemu-options.hx | 10 ++++++++++ softmmu/vl.c | 4 ++++ 4 files changed, 47 insertions(+) diff --git a/include/migration/misc.h b/include/migration/misc.h index 1e01134cfb..71b62857aa 100644 --- a/include/migration/misc.h +++ b/include/migration/misc.h @@ -78,5 +78,7 @@ void dirty_bitmap_mig_init(void); MigMode migrate_mode(void); MigMode migrate_mode_of(MigrationState *s); +void migrate_enable_mode(MigMode mode); +bool migrate_mode_enabled(MigMode mode); #endif diff --git a/migration/migration.c b/migration/migration.c index 3bb9130fc8..3efb2df6cb 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -172,6 +172,7 @@ INITIALIZE_MIGRATE_CAPS_SET(check_caps_background_snapshot, static MigrationState *current_migration; static MigrationIncomingState *current_incoming; +static int migrate_enabled_modes = BIT(MIG_MODE_NORMAL); static GSList *migration_blockers; @@ -2070,6 +2071,29 @@ bool migration_is_active(MigrationState *s) s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE); } +void migrate_enable_mode(MigMode mode) +{ + migrate_enabled_modes |= BIT(mode); +} + +bool migrate_mode_enabled(MigMode mode) +{ + return !!(migrate_enabled_modes & BIT(mode)); +} + +static int migrate_check_enabled(Error **errp) +{ + MigMode mode = migrate_mode(); + + if (!migrate_mode_enabled(mode)) { + error_setg(errp, "migrate mode is not enabled. 
" + "Use '-migrate-mode-enable %s'.", + MigMode_str(mode)); + return -1; + } + return 0; +} + void migrate_init(MigrationState *s) { /* @@ -2140,6 +2164,9 @@ void qmp_migrate_incoming(const char *uri, Error **errp) Error *local_err = NULL; static bool once = true; + if (migrate_check_enabled(errp)) { + return; + } if (!once) { error_setg(errp, "The incoming migration has already been started"); return; @@ -2295,6 +2322,10 @@ static bool migrate_prepare(MigrationState *s, bool blk, bool blk_inc, return false; } + if (migrate_check_enabled(errp)) { + return false; + } + if (migration_is_blocked(errp)) { return false; } diff --git a/qemu-options.hx b/qemu-options.hx index d7e94665ed..b09c3818d7 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -4458,6 +4458,16 @@ SRST an unmigratable state. ERST +DEF("migrate-mode-enable", HAS_ARG, QEMU_OPTION_migrate_mode_enable, \ + "-migrate-mode-enable enable the migration mode.\n", + QEMU_ARCH_ALL) +SRST +``-migrate-mode-enable `` + Enable the specified migrate mode. May be supplied + multiple times, once per mode. This is a pre-requisite for performing a + migration using any mode except 'normal'. 
+ERST + DEF("nodefaults", 0, QEMU_OPTION_nodefaults, \ "-nodefaults don't create default devices\n", QEMU_ARCH_ALL) SRST diff --git a/softmmu/vl.c b/softmmu/vl.c index d8996f3d6e..f79157f5bc 100644 --- a/softmmu/vl.c +++ b/softmmu/vl.c @@ -3512,6 +3512,10 @@ void qemu_init(int argc, char **argv, char **envp) case QEMU_OPTION_only_migratable: only_migratable = 1; break; + case QEMU_OPTION_migrate_mode_enable: + migrate_enable_mode(qapi_enum_parse(&MigMode_lookup, optarg, -1, + &error_fatal)); + break; case QEMU_OPTION_nodefaults: has_defaults = 0; break; -- Gitee From 6eefa188175c6c86b9b46ba88fffc0584878dcfb Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Tue, 8 Aug 2023 16:59:24 +0800 Subject: [PATCH 05/56] migration: simplify blockers This is from Steve Sistare's qemu live update patch: https://patchew.org/QEMU/1658851843-236870-1-git-send-email-steven.sistare@oracle.com/1658851843-236870-7-git-send-email-steven.sistare@oracle.com/ --------------------------------------------------------------------------- Modify migrate_add_blocker and migrate_del_blocker to take an Error ** reason. This allows migration to own the Error object, so that if an error occurs, migration code can free the Error and clear the client handle, simplifying client code. This is a pre-requisite for a subsequent patch that will allow one Error blocker to be registered for multiple migration modes. No functional change. 
Signed-off-by: Steve Sistare Message-Id: <1658851843-236870-7-git-send-email-steven.sistare@oracle.com> Signed-off-by: luofei --- backends/tpm/tpm_emulator.c | 9 ++------- block/parallels.c | 6 ++---- block/qcow.c | 6 ++---- block/vdi.c | 6 ++---- block/vhdx.c | 6 ++---- block/vmdk.c | 6 ++---- block/vpc.c | 6 ++---- block/vvfat.c | 6 ++---- dump/dump.c | 4 ++-- hw/9pfs/9p.c | 10 ++-------- hw/display/virtio-gpu-base.c | 8 ++------ hw/intc/arm_gic_kvm.c | 3 +-- hw/intc/arm_gicv3_its_kvm.c | 3 +-- hw/intc/arm_gicv3_kvm.c | 3 +-- hw/misc/ivshmem.c | 8 ++------ hw/ppc/pef.c | 2 +- hw/ppc/spapr.c | 2 +- hw/ppc/spapr_events.c | 2 +- hw/ppc/spapr_rtas.c | 2 +- hw/remote/proxy.c | 7 ++----- hw/s390x/s390-virtio-ccw.c | 9 +++------ hw/scsi/vhost-scsi.c | 8 +++----- hw/vfio/migration.c | 13 ++----------- hw/virtio/vhost.c | 8 ++------ include/migration/blocker.h | 28 ++++++++++++++++++++++------ migration/migration.c | 29 +++++++++++++++++++++-------- stubs/migr-blocker.c | 4 ++-- target/i386/kvm/kvm.c | 8 ++++---- target/i386/nvmm/nvmm-all.c | 3 +-- target/i386/sev.c | 2 +- target/i386/whpx/whpx-all.c | 3 +-- ui/vdagent.c | 5 ++--- 32 files changed, 97 insertions(+), 128 deletions(-) diff --git a/backends/tpm/tpm_emulator.c b/backends/tpm/tpm_emulator.c index 87d061e9bb..3e2d942e95 100644 --- a/backends/tpm/tpm_emulator.c +++ b/backends/tpm/tpm_emulator.c @@ -492,10 +492,8 @@ static int tpm_emulator_block_migration(TPMEmulator *tpm_emu) error_setg(&tpm_emu->migration_blocker, "Migration disabled: TPM emulator does not support " "migration"); - if (migrate_add_blocker(tpm_emu->migration_blocker, &err) < 0) { + if (migrate_add_blocker(&tpm_emu->migration_blocker, &err) < 0) { error_report_err(err); - error_free(tpm_emu->migration_blocker); - tpm_emu->migration_blocker = NULL; return -1; } @@ -950,10 +948,7 @@ static void tpm_emulator_inst_finalize(Object *obj) qapi_free_TPMEmulatorOptions(tpm_emu->options); - if (tpm_emu->migration_blocker) { - 
migrate_del_blocker(tpm_emu->migration_blocker); - error_free(tpm_emu->migration_blocker); - } + migrate_del_blocker(&tpm_emu->migration_blocker); tpm_sized_buffer_reset(&state_blobs->volatil); tpm_sized_buffer_reset(&state_blobs->permanent); diff --git a/block/parallels.c b/block/parallels.c index 6ebad2a2bb..b7853196c8 100644 --- a/block/parallels.c +++ b/block/parallels.c @@ -877,9 +877,8 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags, error_setg(&s->migration_blocker, "The Parallels format used by node '%s' " "does not support live migration", bdrv_get_device_or_node_name(bs)); - ret = migrate_add_blocker(s->migration_blocker, errp); + ret = migrate_add_blocker(&s->migration_blocker, errp); if (ret < 0) { - error_free(s->migration_blocker); goto fail; } qemu_co_mutex_init(&s->lock); @@ -911,8 +910,7 @@ static void parallels_close(BlockDriverState *bs) g_free(s->bat_dirty_bmap); qemu_vfree(s->header); - migrate_del_blocker(s->migration_blocker); - error_free(s->migration_blocker); + migrate_del_blocker(&s->migration_blocker); } static BlockDriver bdrv_parallels = { diff --git a/block/qcow.c b/block/qcow.c index c39940f33e..50b71a7a94 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -304,9 +304,8 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags, error_setg(&s->migration_blocker, "The qcow format used by node '%s' " "does not support live migration", bdrv_get_device_or_node_name(bs)); - ret = migrate_add_blocker(s->migration_blocker, errp); + ret = migrate_add_blocker(&s->migration_blocker, errp); if (ret < 0) { - error_free(s->migration_blocker); goto fail; } @@ -798,8 +797,7 @@ static void qcow_close(BlockDriverState *bs) g_free(s->cluster_cache); g_free(s->cluster_data); - migrate_del_blocker(s->migration_blocker); - error_free(s->migration_blocker); + migrate_del_blocker(&s->migration_blocker); } static int coroutine_fn qcow_co_create(BlockdevCreateOptions *opts, diff --git a/block/vdi.c b/block/vdi.c index 
bdc58d726e..606ea0343e 100644 --- a/block/vdi.c +++ b/block/vdi.c @@ -494,9 +494,8 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags, error_setg(&s->migration_blocker, "The vdi format used by node '%s' " "does not support live migration", bdrv_get_device_or_node_name(bs)); - ret = migrate_add_blocker(s->migration_blocker, errp); + ret = migrate_add_blocker(&s->migration_blocker, errp); if (ret < 0) { - error_free(s->migration_blocker); goto fail_free_bmap; } @@ -984,8 +983,7 @@ static void vdi_close(BlockDriverState *bs) qemu_vfree(s->bmap); - migrate_del_blocker(s->migration_blocker); - error_free(s->migration_blocker); + migrate_del_blocker(&s->migration_blocker); } static int vdi_has_zero_init(BlockDriverState *bs) diff --git a/block/vhdx.c b/block/vhdx.c index 356ec4c455..739756f223 100644 --- a/block/vhdx.c +++ b/block/vhdx.c @@ -980,8 +980,7 @@ static void vhdx_close(BlockDriverState *bs) s->bat = NULL; qemu_vfree(s->parent_entries); s->parent_entries = NULL; - migrate_del_blocker(s->migration_blocker); - error_free(s->migration_blocker); + migrate_del_blocker(&s->migration_blocker); qemu_vfree(s->log.hdr); s->log.hdr = NULL; vhdx_region_unregister_all(s); @@ -1089,9 +1088,8 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags, error_setg(&s->migration_blocker, "The vhdx format used by node '%s' " "does not support live migration", bdrv_get_device_or_node_name(bs)); - ret = migrate_add_blocker(s->migration_blocker, errp); + ret = migrate_add_blocker(&s->migration_blocker, errp); if (ret < 0) { - error_free(s->migration_blocker); goto fail; } diff --git a/block/vmdk.c b/block/vmdk.c index 0dfab6e941..f2da121eb8 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1314,9 +1314,8 @@ static int vmdk_open(BlockDriverState *bs, QDict *options, int flags, error_setg(&s->migration_blocker, "The vmdk format used by node '%s' " "does not support live migration", bdrv_get_device_or_node_name(bs)); - ret = 
migrate_add_blocker(s->migration_blocker, errp); + ret = migrate_add_blocker(&s->migration_blocker, errp); if (ret < 0) { - error_free(s->migration_blocker); goto fail; } @@ -2810,8 +2809,7 @@ static void vmdk_close(BlockDriverState *bs) vmdk_free_extents(bs); g_free(s->create_type); - migrate_del_blocker(s->migration_blocker); - error_free(s->migration_blocker); + migrate_del_blocker(&s->migration_blocker); } static int64_t vmdk_get_allocated_file_size(BlockDriverState *bs) diff --git a/block/vpc.c b/block/vpc.c index 297a26262a..d3776ded60 100644 --- a/block/vpc.c +++ b/block/vpc.c @@ -449,9 +449,8 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags, error_setg(&s->migration_blocker, "The vpc format used by node '%s' " "does not support live migration", bdrv_get_device_or_node_name(bs)); - ret = migrate_add_blocker(s->migration_blocker, errp); + ret = migrate_add_blocker(&s->migration_blocker, errp); if (ret < 0) { - error_free(s->migration_blocker); goto fail; } @@ -1186,8 +1185,7 @@ static void vpc_close(BlockDriverState *bs) g_free(s->pageentry_u8); #endif - migrate_del_blocker(s->migration_blocker); - error_free(s->migration_blocker); + migrate_del_blocker(&s->migration_blocker); } static QemuOptsList vpc_create_opts = { diff --git a/block/vvfat.c b/block/vvfat.c index 5dacc6cfac..f97f756e72 100644 --- a/block/vvfat.c +++ b/block/vvfat.c @@ -1266,9 +1266,8 @@ static int vvfat_open(BlockDriverState *bs, QDict *options, int flags, "The vvfat (rw) format used by node '%s' " "does not support live migration", bdrv_get_device_or_node_name(bs)); - ret = migrate_add_blocker(s->migration_blocker, errp); + ret = migrate_add_blocker(&s->migration_blocker, errp); if (ret < 0) { - error_free(s->migration_blocker); goto fail; } } @@ -3202,8 +3201,7 @@ static void vvfat_close(BlockDriverState *bs) g_free(s->cluster_buffer); if (s->qcow) { - migrate_del_blocker(s->migration_blocker); - error_free(s->migration_blocker); + 
migrate_del_blocker(&s->migration_blocker); } } diff --git a/dump/dump.c b/dump/dump.c index 662d0a62cd..38c6192950 100644 --- a/dump/dump.c +++ b/dump/dump.c @@ -104,7 +104,7 @@ static int dump_cleanup(DumpState *s) qemu_mutex_unlock_iothread(); } } - migrate_del_blocker(dump_migration_blocker); + migrate_del_blocker(&dump_migration_blocker); return 0; } @@ -2018,7 +2018,7 @@ void qmp_dump_guest_memory(bool paging, const char *file, * Allows even for -only-migratable, but forbid migration during the * process of dump guest memory. */ - if (migrate_add_blocker_internal(dump_migration_blocker, errp)) { + if (migrate_add_blocker_internal(&dump_migration_blocker, errp)) { /* Remember to release the fd before passing it over to dump state */ close(fd); return; diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c index 15b3f4d385..fb4979f8aa 100644 --- a/hw/9pfs/9p.c +++ b/hw/9pfs/9p.c @@ -393,11 +393,7 @@ static int coroutine_fn put_fid(V9fsPDU *pdu, V9fsFidState *fidp) * delete the migration blocker. Ideally, this * should be hooked to transport close notification */ - if (pdu->s->migration_blocker) { - migrate_del_blocker(pdu->s->migration_blocker); - error_free(pdu->s->migration_blocker); - pdu->s->migration_blocker = NULL; - } + migrate_del_blocker(&pdu->s->migration_blocker); } return free_fid(pdu, fidp); } @@ -1460,10 +1456,8 @@ static void coroutine_fn v9fs_attach(void *opaque) error_setg(&s->migration_blocker, "Migration is disabled when VirtFS export path '%s' is mounted in the guest using mount_tag '%s'", s->ctx.fs_root ? 
s->ctx.fs_root : "NULL", s->tag); - err = migrate_add_blocker(s->migration_blocker, NULL); + err = migrate_add_blocker(&s->migration_blocker, NULL); if (err < 0) { - error_free(s->migration_blocker); - s->migration_blocker = NULL; clunk_fid(s, fid); goto out; } diff --git a/hw/display/virtio-gpu-base.c b/hw/display/virtio-gpu-base.c index c8da4806e0..9616e85ff2 100644 --- a/hw/display/virtio-gpu-base.c +++ b/hw/display/virtio-gpu-base.c @@ -163,8 +163,7 @@ virtio_gpu_base_device_realize(DeviceState *qdev, if (virtio_gpu_virgl_enabled(g->conf)) { error_setg(&g->migration_blocker, "virgl is not yet migratable"); - if (migrate_add_blocker(g->migration_blocker, errp) < 0) { - error_free(g->migration_blocker); + if (migrate_add_blocker(&g->migration_blocker, errp) < 0) { return false; } } @@ -228,10 +227,7 @@ virtio_gpu_base_device_unrealize(DeviceState *qdev) { VirtIOGPUBase *g = VIRTIO_GPU_BASE(qdev); - if (g->migration_blocker) { - migrate_del_blocker(g->migration_blocker); - error_free(g->migration_blocker); - } + migrate_del_blocker(&g->migration_blocker); } static void diff --git a/hw/intc/arm_gic_kvm.c b/hw/intc/arm_gic_kvm.c index 7d2a13273a..c9fac45748 100644 --- a/hw/intc/arm_gic_kvm.c +++ b/hw/intc/arm_gic_kvm.c @@ -514,8 +514,7 @@ static void kvm_arm_gic_realize(DeviceState *dev, Error **errp) if (!kvm_arm_gic_can_save_restore(s)) { error_setg(&s->migration_blocker, "This operating system kernel does " "not support vGICv2 migration"); - if (migrate_add_blocker(s->migration_blocker, errp) < 0) { - error_free(s->migration_blocker); + if (migrate_add_blocker(&s->migration_blocker, errp) < 0) { return; } } diff --git a/hw/intc/arm_gicv3_its_kvm.c b/hw/intc/arm_gicv3_its_kvm.c index 0b4cbed28b..525f9d4e91 100644 --- a/hw/intc/arm_gicv3_its_kvm.c +++ b/hw/intc/arm_gicv3_its_kvm.c @@ -112,8 +112,7 @@ static void kvm_arm_its_realize(DeviceState *dev, Error **errp) GITS_CTLR)) { error_setg(&s->migration_blocker, "This operating system kernel " "does not support vITS 
migration"); - if (migrate_add_blocker(s->migration_blocker, errp) < 0) { - error_free(s->migration_blocker); + if (migrate_add_blocker(&s->migration_blocker, errp) < 0) { return; } } else { diff --git a/hw/intc/arm_gicv3_kvm.c b/hw/intc/arm_gicv3_kvm.c index 2e2b08e31f..5f72afbcdf 100644 --- a/hw/intc/arm_gicv3_kvm.c +++ b/hw/intc/arm_gicv3_kvm.c @@ -899,8 +899,7 @@ static void kvm_arm_gicv3_realize(DeviceState *dev, Error **errp) GICD_CTLR)) { error_setg(&s->migration_blocker, "This operating system kernel does " "not support vGICv3 migration"); - if (migrate_add_blocker(s->migration_blocker, errp) < 0) { - error_free(s->migration_blocker); + if (migrate_add_blocker(&s->migration_blocker, errp) < 0) { return; } } diff --git a/hw/misc/ivshmem.c b/hw/misc/ivshmem.c index 05f06ed6cf..9017bdbcce 100644 --- a/hw/misc/ivshmem.c +++ b/hw/misc/ivshmem.c @@ -905,8 +905,7 @@ static void ivshmem_common_realize(PCIDevice *dev, Error **errp) if (!ivshmem_is_master(s)) { error_setg(&s->migration_blocker, "Migration is disabled when using feature 'peer mode' in device 'ivshmem'"); - if (migrate_add_blocker(s->migration_blocker, errp) < 0) { - error_free(s->migration_blocker); + if (migrate_add_blocker(&s->migration_blocker, errp) < 0) { return; } } @@ -924,10 +923,7 @@ static void ivshmem_exit(PCIDevice *dev) IVShmemState *s = IVSHMEM_COMMON(dev); int i; - if (s->migration_blocker) { - migrate_del_blocker(s->migration_blocker); - error_free(s->migration_blocker); - } + migrate_del_blocker(&s->migration_blocker); if (memory_region_is_mapped(s->ivshmem_bar2)) { if (!s->hostmem) { diff --git a/hw/ppc/pef.c b/hw/ppc/pef.c index cc44d5e339..d28ed3ba73 100644 --- a/hw/ppc/pef.c +++ b/hw/ppc/pef.c @@ -63,7 +63,7 @@ static int kvmppc_svm_init(ConfidentialGuestSupport *cgs, Error **errp) /* add migration blocker */ error_setg(&pef_mig_blocker, "PEF: Migration is not implemented"); /* NB: This can fail if --only-migratable is used */ - migrate_add_blocker(pef_mig_blocker, &error_fatal); 
+ migrate_add_blocker(&pef_mig_blocker, &error_fatal); cgs->ready = true; diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 3b5fd749be..de9cd9f86d 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -1681,7 +1681,7 @@ static void spapr_machine_reset(MachineState *machine) /* Signal all vCPUs waiting on this condition */ qemu_cond_broadcast(&spapr->fwnmi_machine_check_interlock_cond); - migrate_del_blocker(spapr->fwnmi_migration_blocker); + migrate_del_blocker(&spapr->fwnmi_migration_blocker); } static void spapr_create_nvram(SpaprMachineState *spapr) diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c index 630e86282c..5865c52913 100644 --- a/hw/ppc/spapr_events.c +++ b/hw/ppc/spapr_events.c @@ -920,7 +920,7 @@ void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered) * fails when running with -only-migrate. A proper interface to * delay migration completion for a bit could avoid that. */ - ret = migrate_add_blocker(spapr->fwnmi_migration_blocker, NULL); + ret = migrate_add_blocker(&spapr->fwnmi_migration_blocker, NULL); if (ret == -EBUSY) { warn_report("Received a fwnmi while migration was in progress"); } diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c index b476382ae6..83a1774c71 100644 --- a/hw/ppc/spapr_rtas.c +++ b/hw/ppc/spapr_rtas.c @@ -496,7 +496,7 @@ static void rtas_ibm_nmi_interlock(PowerPCCPU *cpu, spapr->fwnmi_machine_check_interlock = -1; qemu_cond_signal(&spapr->fwnmi_machine_check_interlock_cond); rtas_st(rets, 0, RTAS_OUT_SUCCESS); - migrate_del_blocker(spapr->fwnmi_migration_blocker); + migrate_del_blocker(&spapr->fwnmi_migration_blocker); } static struct rtas_call { diff --git a/hw/remote/proxy.c b/hw/remote/proxy.c index bad164299d..2cd1f4347c 100644 --- a/hw/remote/proxy.c +++ b/hw/remote/proxy.c @@ -109,8 +109,7 @@ static void pci_proxy_dev_realize(PCIDevice *device, Error **errp) error_setg(&dev->migration_blocker, "%s does not support migration", TYPE_PCI_PROXY_DEV); - if (migrate_add_blocker(dev->migration_blocker, errp) < 
0) { - error_free(dev->migration_blocker); + if (migrate_add_blocker(&dev->migration_blocker, errp) < 0) { object_unref(dev->ioc); return; } @@ -136,9 +135,7 @@ static void pci_proxy_dev_exit(PCIDevice *pdev) qio_channel_close(dev->ioc, NULL); } - migrate_del_blocker(dev->migration_blocker); - - error_free(dev->migration_blocker); + migrate_del_blocker(&dev->migration_blocker); proxy_memory_listener_deconfigure(&dev->proxy_listener); diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c index 653587ea62..bd500589e5 100644 --- a/hw/s390x/s390-virtio-ccw.c +++ b/hw/s390x/s390-virtio-ccw.c @@ -322,8 +322,7 @@ static void s390_machine_unprotect(S390CcwMachineState *ms) { s390_pv_vm_disable(); ms->pv = false; - migrate_del_blocker(pv_mig_blocker); - error_free_or_abort(&pv_mig_blocker); + migrate_del_blocker(&pv_mig_blocker); ram_block_discard_disable(false); } @@ -346,11 +345,10 @@ static int s390_machine_protect(S390CcwMachineState *ms) error_setg(&pv_mig_blocker, "protected VMs are currently not migrateable."); - rc = migrate_add_blocker(pv_mig_blocker, &local_err); + rc = migrate_add_blocker(&pv_mig_blocker, &local_err); if (rc) { ram_block_discard_disable(false); error_report_err(local_err); - error_free_or_abort(&pv_mig_blocker); return rc; } @@ -358,8 +356,7 @@ static int s390_machine_protect(S390CcwMachineState *ms) rc = s390_pv_vm_enable(); if (rc) { ram_block_discard_disable(false); - migrate_del_blocker(pv_mig_blocker); - error_free_or_abort(&pv_mig_blocker); + migrate_del_blocker(&pv_mig_blocker); return rc; } diff --git a/hw/scsi/vhost-scsi.c b/hw/scsi/vhost-scsi.c index 039caf2614..8263f77f3b 100644 --- a/hw/scsi/vhost-scsi.c +++ b/hw/scsi/vhost-scsi.c @@ -207,7 +207,7 @@ static void vhost_scsi_realize(DeviceState *dev, Error **errp) "When external environment supports it (Orchestrator migrates " "target SCSI device state or use shared storage over network), " "set 'migratable' property to true to enable migration."); - if 
(migrate_add_blocker(vsc->migration_blocker, errp) < 0) { + if (migrate_add_blocker(&vsc->migration_blocker, errp) < 0) { goto free_virtio; } } @@ -234,10 +234,9 @@ static void vhost_scsi_realize(DeviceState *dev, Error **errp) free_vqs: g_free(vsc->dev.vqs); if (!vsc->migratable) { - migrate_del_blocker(vsc->migration_blocker); + migrate_del_blocker(&vsc->migration_blocker); } free_virtio: - error_free(vsc->migration_blocker); virtio_scsi_common_unrealize(dev); close_fd: close(vhostfd); @@ -251,8 +250,7 @@ static void vhost_scsi_unrealize(DeviceState *dev) struct vhost_virtqueue *vqs = vsc->dev.vqs; if (!vsc->migratable) { - migrate_del_blocker(vsc->migration_blocker); - error_free(vsc->migration_blocker); + migrate_del_blocker(&vsc->migration_blocker); } /* This will stop vhost backend. */ diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index e69b5f2e42..fb491bb37b 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -886,12 +886,7 @@ add_blocker: "VFIO device doesn't support migration"); g_free(info); - ret = migrate_add_blocker(vbasedev->migration_blocker, errp); - if (ret < 0) { - error_free(vbasedev->migration_blocker); - vbasedev->migration_blocker = NULL; - } - return ret; + return migrate_add_blocker(&vbasedev->migration_blocker, errp); } void vfio_migration_finalize(VFIODevice *vbasedev) @@ -905,9 +900,5 @@ void vfio_migration_finalize(VFIODevice *vbasedev) vfio_migration_exit(vbasedev); } - if (vbasedev->migration_blocker) { - migrate_del_blocker(vbasedev->migration_blocker); - error_free(vbasedev->migration_blocker); - vbasedev->migration_blocker = NULL; - } + migrate_del_blocker(&vbasedev->migration_blocker); } diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index 3ac6cfde03..5e95099777 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -1435,9 +1435,8 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque, } if (hdev->migration_blocker != NULL) { - r = migrate_add_blocker(hdev->migration_blocker, errp); + r = 
migrate_add_blocker(&hdev->migration_blocker, errp); if (r < 0) { - error_free(hdev->migration_blocker); goto fail_busyloop; } } @@ -1489,10 +1488,7 @@ void vhost_dev_cleanup(struct vhost_dev *hdev) memory_listener_unregister(&hdev->memory_listener); QLIST_REMOVE(hdev, entry); } - if (hdev->migration_blocker) { - migrate_del_blocker(hdev->migration_blocker); - error_free(hdev->migration_blocker); - } + migrate_del_blocker(&hdev->migration_blocker); g_free(hdev->mem); g_free(hdev->mem_sections); if (hdev->vhost_ops) { diff --git a/include/migration/blocker.h b/include/migration/blocker.h index 9cebe2ba06..1483f5ddd6 100644 --- a/include/migration/blocker.h +++ b/include/migration/blocker.h @@ -17,19 +17,22 @@ /** * @migrate_add_blocker - prevent migration from proceeding * - * @reason - an error to be returned whenever migration is attempted + * @reasonp - address of an error to be returned whenever migration is attempted * * @errp - [out] The reason (if any) we cannot block migration right now. * * @returns - 0 on success, -EBUSY/-EACCES on failure, with errp set. + * + * *@reasonp is freed and set to NULL if failure is returned. + * On success, the caller must not free *@reasonp before the blocker is removed. */ -int migrate_add_blocker(Error *reason, Error **errp); +int migrate_add_blocker(Error **reasonp, Error **errp); /** * @migrate_add_blocker_internal - prevent migration from proceeding without * only-migrate implications * - * @reason - an error to be returned whenever migration is attempted + * @reasonp - address of an error to be returned whenever migration is attempted * * @errp - [out] The reason (if any) we cannot block migration right now. * @@ -38,14 +41,27 @@ int migrate_add_blocker(Error *reason, Error **errp); * Some of the migration blockers can be temporary (e.g., for a few seconds), * so it shouldn't need to conflict with "-only-migratable". For those cases, * we can call this function rather than @migrate_add_blocker(). 
+ * + * *@reasonp is freed and set to NULL if failure is returned. + * On success, the caller must not free *@reasonp before the blocker is removed. + */ +int migrate_add_blocker_internal(Error **reasonp, Error **errp); + +/** + * @migrate_del_blocker - remove a blocking error from migration and free it. + * + * @reasonp - address of the error blocking migration + * + * This function frees *@reasonp and sets it to NULL. */ -int migrate_add_blocker_internal(Error *reason, Error **errp); +void migrate_del_blocker(Error **reasonp); /** - * @migrate_del_blocker - remove a blocking error from migration + * @migrate_remove_blocker - remove a migration blocker. * * @reason - the error blocking migration + * */ -void migrate_del_blocker(Error *reason); +void migrate_remove_blocker(Error *reason); #endif diff --git a/migration/migration.c b/migration/migration.c index 3efb2df6cb..88ed7e8226 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -2128,35 +2128,48 @@ void migrate_init(MigrationState *s) s->threshold_size = 0; } -int migrate_add_blocker_internal(Error *reason, Error **errp) +int migrate_add_blocker_internal(Error **reasonp, Error **errp) { /* Snapshots are similar to migrations, so check RUN_STATE_SAVE_VM too. 
*/ if (runstate_check(RUN_STATE_SAVE_VM) || !migration_is_idle()) { - error_propagate_prepend(errp, error_copy(reason), + error_propagate_prepend(errp, *reasonp, "disallowing migration blocker " "(migration/snapshot in progress) for: "); + *reasonp = NULL; return -EBUSY; } - migration_blockers = g_slist_prepend(migration_blockers, reason); + migration_blockers = g_slist_prepend(migration_blockers, *reasonp); return 0; } -int migrate_add_blocker(Error *reason, Error **errp) +int migrate_add_blocker(Error **reasonp, Error **errp) { if (only_migratable) { - error_propagate_prepend(errp, error_copy(reason), + error_propagate_prepend(errp, *reasonp, "disallowing migration blocker " "(--only-migratable) for: "); + *reasonp = NULL; return -EACCES; } - return migrate_add_blocker_internal(reason, errp); + return migrate_add_blocker_internal(reasonp, errp); } -void migrate_del_blocker(Error *reason) +void migrate_del_blocker(Error **reasonp) { - migration_blockers = g_slist_remove(migration_blockers, reason); + if (*reasonp) { + migrate_remove_blocker(*reasonp); + error_free(*reasonp); + *reasonp = NULL; + } +} + +void migrate_remove_blocker(Error *reason) +{ + if (reason) { + migration_blockers = g_slist_remove(migration_blockers, reason); + } } void qmp_migrate_incoming(const char *uri, Error **errp) diff --git a/stubs/migr-blocker.c b/stubs/migr-blocker.c index 5676a2f93c..17a5dbf87b 100644 --- a/stubs/migr-blocker.c +++ b/stubs/migr-blocker.c @@ -1,11 +1,11 @@ #include "qemu/osdep.h" #include "migration/blocker.h" -int migrate_add_blocker(Error *reason, Error **errp) +int migrate_add_blocker(Error **reasonp, Error **errp) { return 0; } -void migrate_del_blocker(Error *reason) +void migrate_del_blocker(Error **reasonp) { } diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index d323d08dcb..1e1b5e5e38 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -1499,7 +1499,7 @@ static int hyperv_init_vcpu(X86CPU *cpu) error_setg(&hv_passthrough_mig_blocker, 
"'hv-passthrough' CPU flag prevents migration, use explicit" " set of hv-* flags instead"); - ret = migrate_add_blocker(hv_passthrough_mig_blocker, &local_err); + ret = migrate_add_blocker(&hv_passthrough_mig_blocker, &local_err); if (ret < 0) { error_report_err(local_err); return ret; @@ -1513,7 +1513,7 @@ static int hyperv_init_vcpu(X86CPU *cpu) " use explicit 'hv-no-nonarch-coresharing=on' instead (but" " make sure SMT is disabled and/or that vCPUs are properly" " pinned)"); - ret = migrate_add_blocker(hv_no_nonarch_cs_mig_blocker, &local_err); + ret = migrate_add_blocker(&hv_no_nonarch_cs_mig_blocker, &local_err); if (ret < 0) { error_report_err(local_err); return ret; @@ -2019,7 +2019,7 @@ int kvm_arch_init_vcpu(CPUState *cs) error_setg(&invtsc_mig_blocker, "State blocked by non-migratable CPU device" " (invtsc flag)"); - r = migrate_add_blocker(invtsc_mig_blocker, &local_err); + r = migrate_add_blocker(&invtsc_mig_blocker, &local_err); if (r < 0) { error_report_err(local_err); return r; @@ -2086,7 +2086,7 @@ int kvm_arch_init_vcpu(CPUState *cs) return 0; fail: - migrate_del_blocker(invtsc_mig_blocker); + migrate_del_blocker(&invtsc_mig_blocker); return r; } diff --git a/target/i386/nvmm/nvmm-all.c b/target/i386/nvmm/nvmm-all.c index 9af261eea3..ea45b52b30 100644 --- a/target/i386/nvmm/nvmm-all.c +++ b/target/i386/nvmm/nvmm-all.c @@ -936,9 +936,8 @@ nvmm_init_vcpu(CPUState *cpu) error_setg(&nvmm_migration_blocker, "NVMM: Migration not supported"); - if (migrate_add_blocker(nvmm_migration_blocker, &local_error) < 0) { + if (migrate_add_blocker(&nvmm_migration_blocker, &local_error) < 0) { error_report_err(local_error); - error_free(nvmm_migration_blocker); return -EINVAL; } } diff --git a/target/i386/sev.c b/target/i386/sev.c index 025ff7a6f8..81ef6260f0 100644 --- a/target/i386/sev.c +++ b/target/i386/sev.c @@ -851,7 +851,7 @@ sev_launch_finish(SevGuestState *sev) /* add migration blocker */ error_setg(&sev_mig_blocker, "SEV: Migration is not implemented"); - 
migrate_add_blocker(sev_mig_blocker, &error_fatal); + migrate_add_blocker(&sev_mig_blocker, &error_fatal); } static void diff --git a/target/i386/whpx/whpx-all.c b/target/i386/whpx/whpx-all.c index ef896da0a2..ce647a1e09 100644 --- a/target/i386/whpx/whpx-all.c +++ b/target/i386/whpx/whpx-all.c @@ -1346,9 +1346,8 @@ int whpx_init_vcpu(CPUState *cpu) "State blocked due to non-migratable CPUID feature support," "dirty memory tracking support, and XSAVE/XRSTOR support"); - if (migrate_add_blocker(whpx_migration_blocker, &local_error) < 0) { + if (migrate_add_blocker(&whpx_migration_blocker, &local_error) < 0) { error_report_err(local_error); - error_free(whpx_migration_blocker); ret = -EINVAL; goto error; } diff --git a/ui/vdagent.c b/ui/vdagent.c index 19e8fbfc96..d452eddf0b 100644 --- a/ui/vdagent.c +++ b/ui/vdagent.c @@ -603,7 +603,7 @@ static void vdagent_chr_open(Chardev *chr, return; #endif - if (migrate_add_blocker(vd->migration_blocker, errp) != 0) { + if (migrate_add_blocker(&vd->migration_blocker, errp) != 0) { return; } @@ -848,10 +848,9 @@ static void vdagent_chr_fini(Object *obj) { VDAgentChardev *vd = QEMU_VDAGENT_CHARDEV(obj); - migrate_del_blocker(vd->migration_blocker); + migrate_del_blocker(&vd->migration_blocker); vdagent_disconnect(vd); buffer_free(&vd->outbuf); - error_free(vd->migration_blocker); } static const TypeInfo vdagent_chr_type_info = { -- Gitee From 3dad3e5f45198b77d3e9a6dd729fcf6053fd0e7c Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Tue, 8 Aug 2023 17:35:59 +0800 Subject: [PATCH 06/56] migration: per-mode blockers This is from Steve Sistare's qemu live update patch: https://patchew.org/QEMU/1658851843-236870-1-git-send-email-steven.sistare@oracle.com/1658851843-236870-8-git-send-email-steven.sistare@oracle.com/ ------------------------------------------------------------------------- Extend the blocker interface so that a blocker can be registered for one or more migration modes. 
The existing interfaces register a blocker for all modes, and the new interfaces take a varargs list of modes. Internally, maintain a separate blocker list per mode. The same Error object may be added to multiple lists. When a blocker is deleted, it is removed from every list, and the Error is freed. No functional change until a new mode is added. Signed-off-by: Steve Sistare Message-Id: <1658851843-236870-8-git-send-email-steven.sistare@oracle.com> Signed-off-by: luofei --- include/migration/blocker.h | 45 +++++++++++++++++-- migration/migration.c | 87 ++++++++++++++++++++++++++++++++----- stubs/migr-blocker.c | 5 +++ 3 files changed, 123 insertions(+), 14 deletions(-) diff --git a/include/migration/blocker.h b/include/migration/blocker.h index 1483f5ddd6..1a2193787b 100644 --- a/include/migration/blocker.h +++ b/include/migration/blocker.h @@ -14,8 +14,12 @@ #ifndef MIGRATION_BLOCKER_H #define MIGRATION_BLOCKER_H +#include "qapi/qapi-types-migration.h" + +#define MIG_MODE_ALL MIG_MODE__MAX + /** - * @migrate_add_blocker - prevent migration from proceeding + * @migrate_add_blocker - prevent all modes of migration from proceeding * * @reasonp - address of an error to be returned whenever migration is attempted * @@ -28,9 +32,42 @@ */ int migrate_add_blocker(Error **reasonp, Error **errp); +/** + * @migrate_add_blockers - prevent migration for specified modes from proceeding + * + * @reasonp - address of an error to be returned whenever migration is attempted + * + * @errp - [out] The reason (if any) we cannot block migration right now. + * + * @mode - one or more migration modes to be blocked. The list is terminated + * by -1 or MIG_MODE_ALL. For the latter, all modes are blocked. + * + * @returns - 0 on success, -EBUSY/-EACCES on failure, with errp set. + * + * *@reasonp is freed and set to NULL if failure is returned. + * On success, the caller must not free *@reasonp before the blocker is removed. 
+ */ +int migrate_add_blockers(Error **reasonp, Error **errp, MigMode mode, ...); + +/** + * @migrate_add_blocker_always - permanently prevent migration for specified + * modes from proceeding. The blocker cannot be deleted. + * + * @msg - text of error to be returned whenever migration is attempted + * + * @errp - [out] The reason (if any) we cannot block migration right now. + * + * @mode - one or more migration modes to be blocked. The list is terminated + * by -1 or MIG_MODE_ALL. For the latter, all modes are blocked. + * + * @returns - 0 on success, -EBUSY/-EACCES on failure, with errp set. + */ +int +migrate_add_blocker_always(const char *msg, Error **errp, MigMode mode, ...); + /** * @migrate_add_blocker_internal - prevent migration from proceeding without - * only-migrate implications + * only-migrate implications, for all modes * * @reasonp - address of an error to be returned whenever migration is attempted * @@ -48,7 +85,7 @@ int migrate_add_blocker(Error **reasonp, Error **errp); int migrate_add_blocker_internal(Error **reasonp, Error **errp); /** - * @migrate_del_blocker - remove a blocking error from migration and free it. + * @migrate_del_blocker - remove a migration blocker for all modes and free it. * * @reasonp - address of the error blocking migration * @@ -57,7 +94,7 @@ int migrate_add_blocker_internal(Error **reasonp, Error **errp); void migrate_del_blocker(Error **reasonp); /** - * @migrate_remove_blocker - remove a migration blocker. + * @migrate_remove_blocker - remove a migration blocker for all modes. 
* * @reason - the error blocking migration * diff --git a/migration/migration.c b/migration/migration.c index 88ed7e8226..31dcd5abae 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -174,7 +174,7 @@ static MigrationState *current_migration; static MigrationIncomingState *current_incoming; static int migrate_enabled_modes = BIT(MIG_MODE_NORMAL); -static GSList *migration_blockers; +static GSList *migration_blockers[MIG_MODE__MAX]; static bool migration_object_check(MigrationState *ms, Error **errp); static int migration_maybe_pause(MigrationState *s, @@ -1065,7 +1065,7 @@ static void populate_disk_info(MigrationInfo *info) static void fill_source_migration_info(MigrationInfo *info) { MigrationState *s = migrate_get_current(); - GSList *cur_blocker = migration_blockers; + GSList *cur_blocker = migration_blockers[migrate_mode()]; info->blocked_reasons = NULL; @@ -2128,8 +2128,10 @@ void migrate_init(MigrationState *s) s->threshold_size = 0; } -int migrate_add_blocker_internal(Error **reasonp, Error **errp) +static int add_blockers(Error **reasonp, Error **errp, int modes) { + MigMode mode; + /* Snapshots are similar to migrations, so check RUN_STATE_SAVE_VM too. 
*/ if (runstate_check(RUN_STATE_SAVE_VM) || !migration_is_idle()) { error_propagate_prepend(errp, *reasonp, @@ -2139,13 +2141,20 @@ int migrate_add_blocker_internal(Error **reasonp, Error **errp) return -EBUSY; } - migration_blockers = g_slist_prepend(migration_blockers, *reasonp); + for (mode = 0; mode < MIG_MODE__MAX; mode++) { + if (modes & BIT(mode)) { + migration_blockers[mode] = g_slist_prepend(migration_blockers[mode], + *reasonp); + } + } return 0; } -int migrate_add_blocker(Error **reasonp, Error **errp) +static int check_blockers(Error **reasonp, Error **errp, int modes) { - if (only_migratable) { + ERRP_GUARD(); + + if (only_migratable && (modes & BIT(MIG_MODE_NORMAL))) { error_propagate_prepend(errp, *reasonp, "disallowing migration blocker " "(--only-migratable) for: "); @@ -2153,7 +2162,60 @@ int migrate_add_blocker(Error **reasonp, Error **errp) return -EACCES; } - return migrate_add_blocker_internal(reasonp, errp); + return add_blockers(reasonp, errp, modes); +} + +int migrate_add_blocker(Error **reasonp, Error **errp) +{ + return migrate_add_blockers(reasonp, errp, MIG_MODE_ALL); +} + +int migrate_add_blocker_internal(Error **reasonp, Error **errp) +{ + int modes = BIT(MIG_MODE__MAX) - 1; + + return add_blockers(reasonp, errp, modes); +} + +static int get_modes(MigMode mode, va_list ap) +{ + int modes = 0; + + while (mode != -1 && mode != MIG_MODE_ALL) { + assert(mode >= MIG_MODE_NORMAL && mode < MIG_MODE__MAX); + modes |= BIT(mode); + mode = va_arg(ap, MigMode); + } + if (mode == MIG_MODE_ALL) { + modes = BIT(MIG_MODE__MAX) - 1; + } + return modes; +} + +int migrate_add_blockers(Error **reasonp, Error **errp, MigMode mode, ...) +{ + int modes; + va_list ap; + + va_start(ap, mode); + modes = get_modes(mode, ap); + va_end(ap); + + return check_blockers(reasonp, errp, modes); +} + +int migrate_add_blocker_always(const char *msg, Error **errp, MigMode mode, ...) 
+{ + int modes; + va_list ap; + Error *reason = NULL; + + va_start(ap, mode); + modes = get_modes(mode, ap); + va_end(ap); + + error_setg(&reason, "%s", msg); + return check_blockers(&reason, errp, modes); } void migrate_del_blocker(Error **reasonp) @@ -2168,7 +2230,10 @@ void migrate_del_blocker(Error **reasonp) void migrate_remove_blocker(Error *reason) { if (reason) { - migration_blockers = g_slist_remove(migration_blockers, reason); + for (MigMode mode = 0; mode < MIG_MODE__MAX; mode++) { + migration_blockers[mode] = g_slist_remove(migration_blockers[mode], + reason); + } } } @@ -2272,12 +2337,14 @@ void qmp_migrate_pause(Error **errp) bool migration_is_blocked(Error **errp) { + GSList *blockers = migration_blockers[migrate_mode()]; + if (qemu_savevm_state_blocked(errp)) { return true; } - if (migration_blockers) { - error_propagate(errp, error_copy(migration_blockers->data)); + if (blockers) { + error_propagate(errp, error_copy(blockers->data)); return true; } diff --git a/stubs/migr-blocker.c b/stubs/migr-blocker.c index 17a5dbf87b..60769d8b79 100644 --- a/stubs/migr-blocker.c +++ b/stubs/migr-blocker.c @@ -6,6 +6,11 @@ int migrate_add_blocker(Error **reasonp, Error **errp) return 0; } +int migrate_add_blockers(Error **reasonp, Error **errp, MigMode mode, ...) +{ + return 0; +} + void migrate_del_blocker(Error **reasonp) { } -- Gitee From 01e72d024caed7dd2dd3ff717b866214d50d74bf Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Tue, 8 Aug 2023 18:00:31 +0800 Subject: [PATCH 07/56] cpr: relax some blockers This is from Steve Sistare's qemu live update patch: https://patchew.org/QEMU/1658851843-236870-1-git-send-email-steven.sistare@oracle.com/1658851843-236870-9-git-send-email-steven.sistare@oracle.com/ ---------------------------------------------------------------------- Some devices block migration because they rely on local state that is not migrated to the target host, such as for local filesystems. 
These need not block cpr, which will restart qemu on the same host. Narrow the scope of these blockers so they only apply to normal mode. They will not block cpr modes when they are added in subsequent patches. No functional change until a new mode is added. Signed-off-by: Steve Sistare Message-Id: <1658851843-236870-9-git-send-email-steven.sistare@oracle.com> Signed-off-by: luofei --- backends/tpm/tpm_emulator.c | 3 ++- block/parallels.c | 3 ++- block/qcow.c | 3 ++- block/vdi.c | 3 ++- block/vhdx.c | 3 ++- block/vmdk.c | 3 ++- block/vpc.c | 3 ++- block/vvfat.c | 3 ++- hw/9pfs/9p.c | 3 ++- hw/scsi/vhost-scsi.c | 3 ++- hw/virtio/vhost.c | 3 ++- target/i386/nvmm/nvmm-all.c | 3 ++- 12 files changed, 24 insertions(+), 12 deletions(-) diff --git a/backends/tpm/tpm_emulator.c b/backends/tpm/tpm_emulator.c index 3e2d942e95..9721927660 100644 --- a/backends/tpm/tpm_emulator.c +++ b/backends/tpm/tpm_emulator.c @@ -492,7 +492,8 @@ static int tpm_emulator_block_migration(TPMEmulator *tpm_emu) error_setg(&tpm_emu->migration_blocker, "Migration disabled: TPM emulator does not support " "migration"); - if (migrate_add_blocker(&tpm_emu->migration_blocker, &err) < 0) { + if (migrate_add_blockers(&tpm_emu->migration_blocker, &err, + MIG_MODE_NORMAL, -1) < 0) { error_report_err(err); return -1; diff --git a/block/parallels.c b/block/parallels.c index b7853196c8..9a4a9e6df0 100644 --- a/block/parallels.c +++ b/block/parallels.c @@ -877,7 +877,8 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags, error_setg(&s->migration_blocker, "The Parallels format used by node '%s' " "does not support live migration", bdrv_get_device_or_node_name(bs)); - ret = migrate_add_blocker(&s->migration_blocker, errp); + ret = migrate_add_blockers(&s->migration_blocker, errp, MIG_MODE_NORMAL, + -1); if (ret < 0) { goto fail; } diff --git a/block/qcow.c b/block/qcow.c index 50b71a7a94..2c7885d224 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -304,7 +304,8 @@ static int 
qcow_open(BlockDriverState *bs, QDict *options, int flags, error_setg(&s->migration_blocker, "The qcow format used by node '%s' " "does not support live migration", bdrv_get_device_or_node_name(bs)); - ret = migrate_add_blocker(&s->migration_blocker, errp); + ret = migrate_add_blockers(&s->migration_blocker, errp, MIG_MODE_NORMAL, + -1); if (ret < 0) { goto fail; } diff --git a/block/vdi.c b/block/vdi.c index 606ea0343e..0103284394 100644 --- a/block/vdi.c +++ b/block/vdi.c @@ -494,7 +494,8 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags, error_setg(&s->migration_blocker, "The vdi format used by node '%s' " "does not support live migration", bdrv_get_device_or_node_name(bs)); - ret = migrate_add_blocker(&s->migration_blocker, errp); + ret = migrate_add_blockers(&s->migration_blocker, errp, MIG_MODE_NORMAL, + -1); if (ret < 0) { goto fail_free_bmap; } diff --git a/block/vhdx.c b/block/vhdx.c index 739756f223..85557d106c 100644 --- a/block/vhdx.c +++ b/block/vhdx.c @@ -1088,7 +1088,8 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags, error_setg(&s->migration_blocker, "The vhdx format used by node '%s' " "does not support live migration", bdrv_get_device_or_node_name(bs)); - ret = migrate_add_blocker(&s->migration_blocker, errp); + ret = migrate_add_blockers(&s->migration_blocker, errp, MIG_MODE_NORMAL, + -1); if (ret < 0) { goto fail; } diff --git a/block/vmdk.c b/block/vmdk.c index f2da121eb8..10024d872b 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1314,7 +1314,8 @@ static int vmdk_open(BlockDriverState *bs, QDict *options, int flags, error_setg(&s->migration_blocker, "The vmdk format used by node '%s' " "does not support live migration", bdrv_get_device_or_node_name(bs)); - ret = migrate_add_blocker(&s->migration_blocker, errp); + ret = migrate_add_blockers(&s->migration_blocker, errp, MIG_MODE_NORMAL, + -1); if (ret < 0) { goto fail; } diff --git a/block/vpc.c b/block/vpc.c index d3776ded60..09da705cdb 100644 --- 
a/block/vpc.c +++ b/block/vpc.c @@ -449,7 +449,8 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags, error_setg(&s->migration_blocker, "The vpc format used by node '%s' " "does not support live migration", bdrv_get_device_or_node_name(bs)); - ret = migrate_add_blocker(&s->migration_blocker, errp); + ret = migrate_add_blockers(&s->migration_blocker, errp, MIG_MODE_NORMAL, + -1); if (ret < 0) { goto fail; } diff --git a/block/vvfat.c b/block/vvfat.c index f97f756e72..43993b506f 100644 --- a/block/vvfat.c +++ b/block/vvfat.c @@ -1266,7 +1266,8 @@ static int vvfat_open(BlockDriverState *bs, QDict *options, int flags, "The vvfat (rw) format used by node '%s' " "does not support live migration", bdrv_get_device_or_node_name(bs)); - ret = migrate_add_blocker(&s->migration_blocker, errp); + ret = migrate_add_blockers(&s->migration_blocker, errp, MIG_MODE_NORMAL, + -1); if (ret < 0) { goto fail; } diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c index fb4979f8aa..0b5162b58c 100644 --- a/hw/9pfs/9p.c +++ b/hw/9pfs/9p.c @@ -1456,7 +1456,8 @@ static void coroutine_fn v9fs_attach(void *opaque) error_setg(&s->migration_blocker, "Migration is disabled when VirtFS export path '%s' is mounted in the guest using mount_tag '%s'", s->ctx.fs_root ? 
s->ctx.fs_root : "NULL", s->tag); - err = migrate_add_blocker(&s->migration_blocker, NULL); + err = migrate_add_blockers(&s->migration_blocker, NULL, MIG_MODE_NORMAL, + -1); if (err < 0) { clunk_fid(s, fid); goto out; diff --git a/hw/scsi/vhost-scsi.c b/hw/scsi/vhost-scsi.c index 8263f77f3b..d393202ec0 100644 --- a/hw/scsi/vhost-scsi.c +++ b/hw/scsi/vhost-scsi.c @@ -207,7 +207,8 @@ static void vhost_scsi_realize(DeviceState *dev, Error **errp) "When external environment supports it (Orchestrator migrates " "target SCSI device state or use shared storage over network), " "set 'migratable' property to true to enable migration."); - if (migrate_add_blocker(&vsc->migration_blocker, errp) < 0) { + if (migrate_add_blockers(&vsc->migration_blocker, errp, MIG_MODE_NORMAL, + -1) < 0) { goto free_virtio; } } diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index 5e95099777..27330cf260 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -1435,7 +1435,8 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque, } if (hdev->migration_blocker != NULL) { - r = migrate_add_blocker(&hdev->migration_blocker, errp); + r = migrate_add_blockers(&hdev->migration_blocker, errp, + MIG_MODE_NORMAL, -1); if (r < 0) { goto fail_busyloop; } diff --git a/target/i386/nvmm/nvmm-all.c b/target/i386/nvmm/nvmm-all.c index ea45b52b30..03aa19489d 100644 --- a/target/i386/nvmm/nvmm-all.c +++ b/target/i386/nvmm/nvmm-all.c @@ -936,7 +936,8 @@ nvmm_init_vcpu(CPUState *cpu) error_setg(&nvmm_migration_blocker, "NVMM: Migration not supported"); - if (migrate_add_blocker(&nvmm_migration_blocker, &local_error) < 0) { + if (migrate_add_blockers(&nvmm_migration_blocker, &local_error, + MIG_MODE_NORMAL, -1) < 0) { error_report_err(local_error); return -EINVAL; } -- Gitee From a0bddd48ab9fecd60aee9d86b9daf92bc2dc7dbf Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Tue, 8 Aug 2023 19:43:21 +0800 Subject: [PATCH 08/56] qdev-properties: strList This is from Steve Sistare's qemu live update patch: 
https://patchew.org/QEMU/1658851843-236870-1-git-send-email-steven.sistare@oracle.com/1658851843-236870-11-git-send-email-steven.sistare@oracle.com/ --------------------------------------------------------------------- Define a list-of-strings property, to be used for the cpr-exec-args migration property in a subsequent patch. Signed-off-by: Steve Sistare Message-Id: <1658851843-236870-11-git-send-email-steven.sistare@oracle.com> Signed-off-by: luofei --- hw/core/qdev-properties.c | 44 ++++++++++++++++++++++++++++++++++++ include/hw/qdev-properties.h | 3 +++ 2 files changed, 47 insertions(+) diff --git a/hw/core/qdev-properties.c b/hw/core/qdev-properties.c index 2d5f662663..f3c984d569 100644 --- a/hw/core/qdev-properties.c +++ b/hw/core/qdev-properties.c @@ -9,6 +9,7 @@ #include "qemu/units.h" #include "qemu/cutils.h" #include "qdev-prop-internal.h" +#include "qapi/qapi-builtin-visit.h" void qdev_prop_set_after_realize(DeviceState *dev, const char *name, Error **errp) @@ -471,6 +472,49 @@ const PropertyInfo qdev_prop_string = { .set = set_string, }; +/* --- strList --- */ + +static void release_strList(Object *obj, const char *name, void *opaque) +{ + Property *prop = opaque; + qapi_free_strList(*(strList **)object_field_prop_ptr(obj, prop)); +} + +static void get_strList(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + Property *prop = opaque; + strList **ptr = object_field_prop_ptr(obj, prop); + + if (!*ptr) { + strList *str = NULL; + visit_type_strList(v, name, &str, errp); + } else { + visit_type_strList(v, name, ptr, errp); + } +} + +static void set_strList(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + Property *prop = opaque; + strList **ptr = object_field_prop_ptr(obj, prop); + strList *str; + + if (!visit_type_strList(v, name, &str, errp)) { + return; + } + qapi_free_strList(*ptr); /* free every node, not just the head */ + *ptr = str; +} + +const PropertyInfo qdev_prop_strlist = { + .name = "strList", + .release = release_strList, + .get = get_strList, + .set
= set_strList, +}; + /* --- on/off/auto --- */ const PropertyInfo qdev_prop_on_off_auto = { diff --git a/include/hw/qdev-properties.h b/include/hw/qdev-properties.h index ea129d65a6..fa2b6b3b68 100644 --- a/include/hw/qdev-properties.h +++ b/include/hw/qdev-properties.h @@ -57,6 +57,7 @@ extern const PropertyInfo qdev_prop_uint64; extern const PropertyInfo qdev_prop_int64; extern const PropertyInfo qdev_prop_size; extern const PropertyInfo qdev_prop_string; +extern const PropertyInfo qdev_prop_strlist; extern const PropertyInfo qdev_prop_on_off_auto; extern const PropertyInfo qdev_prop_compress_method; extern const PropertyInfo qdev_prop_size32; @@ -160,6 +161,8 @@ extern const PropertyInfo qdev_prop_link; DEFINE_PROP_UNSIGNED(_n, _s, _f, _d, qdev_prop_size, uint64_t) #define DEFINE_PROP_STRING(_n, _s, _f) \ DEFINE_PROP(_n, _s, _f, qdev_prop_string, char*) +#define DEFINE_PROP_STRLIST(_n, _s, _f) \ + DEFINE_PROP(_n, _s, _f, qdev_prop_strlist, strList*) #define DEFINE_PROP_ON_OFF_AUTO(_n, _s, _f, _d) \ DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_on_off_auto, OnOffAuto) #define DEFINE_PROP_COMPRESS_METHOD(_n, _s, _f, _d) \ -- Gitee From 05ff765d142d264a6a24a206b3e24ce9980d2910 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Tue, 8 Aug 2023 19:59:43 +0800 Subject: [PATCH 09/56] qapi: strList_from_string This is from Steve Sistare's qemu live update patch: https://patchew.org/QEMU/1658851843-236870-1-git-send-email-steven.sistare@oracle.com/1658851843-236870-12-git-send-email-steven.sistare@oracle.com/ Generalize strList_from_comma_list() to take any delimiter character, rename as strList_from_string(), and move it to qapi/util.c. No functional change. 
Signed-off-by: Steve Sistare Message-Id: <1658851843-236870-12-git-send-email-steven.sistare@oracle.com> Signed-off-by: luofei --- include/qapi/util.h | 9 +++++++++ monitor/hmp-cmds.c | 29 ++--------------------------- qapi/qapi-util.c | 23 +++++++++++++++++++++++ 3 files changed, 34 insertions(+), 27 deletions(-) diff --git a/include/qapi/util.h b/include/qapi/util.h index 81a2b13a33..7d88b099a5 100644 --- a/include/qapi/util.h +++ b/include/qapi/util.h @@ -22,6 +22,8 @@ typedef struct QEnumLookup { const int size; } QEnumLookup; +struct strList; + const char *qapi_enum_lookup(const QEnumLookup *lookup, int val); int qapi_enum_parse(const QEnumLookup *lookup, const char *buf, int def, Error **errp); @@ -30,6 +32,13 @@ bool qapi_bool_parse(const char *name, const char *value, bool *obj, int parse_qapi_name(const char *name, bool complete); +/* + * Produce a strList from the character delimited string @in. + * All strings are g_strdup'd. + * A NULL or empty input string returns NULL. + */ +struct strList *strList_from_string(const char *in, char delim); + /* * For any GenericList @list, insert @element at the front. * diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c index 7146e74a34..238cc29889 100644 --- a/monitor/hmp-cmds.c +++ b/monitor/hmp-cmds.c @@ -42,6 +42,7 @@ #include "qapi/qapi-commands-run-state.h" #include "qapi/qapi-commands-tpm.h" #include "qapi/qapi-commands-ui.h" +#include "qapi/util.h" #include "qapi/qapi-visit-net.h" #include "qapi/qapi-visit-migration.h" #include "qapi/qmp/qdict.h" @@ -70,32 +71,6 @@ bool hmp_handle_error(Monitor *mon, Error *err) return false; } -/* - * Produce a strList from a comma separated list. - * A NULL or empty input string return NULL. 
- */ -static strList *strList_from_comma_list(const char *in) -{ - strList *res = NULL; - strList **tail = &res; - - while (in && in[0]) { - char *comma = strchr(in, ','); - char *value; - - if (comma) { - value = g_strndup(in, comma - in); - in = comma + 1; /* skip the , */ - } else { - value = g_strdup(in); - in = NULL; - } - QAPI_LIST_APPEND(tail, value); - } - - return res; -} - void hmp_info_name(Monitor *mon, const QDict *qdict) { NameInfo *info; @@ -1110,7 +1085,7 @@ void hmp_announce_self(Monitor *mon, const QDict *qdict) migrate_announce_params()); qapi_free_strList(params->interfaces); - params->interfaces = strList_from_comma_list(interfaces_str); + params->interfaces = strList_from_string(interfaces_str, ','); params->has_interfaces = params->interfaces != NULL; params->id = g_strdup(id); params->has_id = !!params->id; diff --git a/qapi/qapi-util.c b/qapi/qapi-util.c index fda7044539..eb53f45916 100644 --- a/qapi/qapi-util.c +++ b/qapi/qapi-util.c @@ -15,6 +15,7 @@ #include "qapi/error.h" #include "qemu/ctype.h" #include "qapi/qmp/qerror.h" +#include "qapi/qapi-builtin-types.h" CompatPolicy compat_policy; @@ -152,3 +153,25 @@ int parse_qapi_name(const char *str, bool complete) } return p - str; } + +strList *strList_from_string(const char *in, char delim) +{ + strList *res = NULL; + strList **tail = &res; + + while (in && in[0]) { + char *next = strchr(in, delim); + char *value; + + if (next) { + value = g_strndup(in, next - in); + in = next + 1; /* skip the delim */ + } else { + value = g_strdup(in); + in = NULL; + } + QAPI_LIST_APPEND(tail, value); + } + + return res; +} -- Gitee From 3ccad72a747020f3ac42a8a999299fee6502e0ff Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Tue, 8 Aug 2023 20:15:06 +0800 Subject: [PATCH 10/56] qapi: strv_from_strList MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is from Steve Sistare's qemu live update patch: 
https://patchew.org/QEMU/1658851843-236870-1-git-send-email-steven.sistare@oracle.com/1658851843-236870-13-git-send-email-steven.sistare@oracle.com/ https://patchew.org/QEMU/1658851843-236870-1-git-send-email-steven.sistare@oracle.com/1658851843-236870-14-git-send-email-steven.sistare@oracle.com/ Signed-off-by: Steve Sistare Reviewed-by: Marc-André Lureau Message-Id: <1658851843-236870-13-git-send-email-steven.sistare@oracle.com> Message-Id: <1658851843-236870-14-git-send-email-steven.sistare@oracle.com> Signed-off-by: luofei --- include/qapi/util.h | 19 +++++++++++++++++++ qapi/qapi-util.c | 14 ++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/include/qapi/util.h b/include/qapi/util.h index 7d88b099a5..51ff64e757 100644 --- a/include/qapi/util.h +++ b/include/qapi/util.h @@ -32,6 +32,12 @@ bool qapi_bool_parse(const char *name, const char *value, bool *obj, int parse_qapi_name(const char *name, bool complete); +/* + * Produce and return a NULL-terminated array of strings from @args. + * All strings are g_strdup'd. + */ +GStrv strv_from_strList(const struct strList *args); + /* * Produce a strList from the character delimited string @in. * All strings are g_strdup'd. @@ -65,4 +71,17 @@ struct strList *strList_from_string(const char *in, char delim); (tail) = &(*(tail))->next; \ } while (0) +/* + * For any GenericList @list, return its length. 
+ */ +#define QAPI_LIST_LENGTH(list) \ + ({ \ + int _len = 0; \ + typeof(list) _tail; \ + for (_tail = (list); _tail != NULL; _tail = _tail->next) { \ + _len++; \ + } \ + _len; \ + }) + #endif diff --git a/qapi/qapi-util.c b/qapi/qapi-util.c index eb53f45916..b8e19385ff 100644 --- a/qapi/qapi-util.c +++ b/qapi/qapi-util.c @@ -154,6 +154,20 @@ int parse_qapi_name(const char *str, bool complete) return p - str; } +GStrv strv_from_strList(const strList *args) +{ + const strList *arg; + int i = 0; + GStrv argv = g_new(char *, QAPI_LIST_LENGTH(args) + 1); + + for (arg = args; arg != NULL; arg = arg->next) { + argv[i++] = g_strdup(arg->value); + } + argv[i] = NULL; + + return argv; +} + strList *strList_from_string(const char *in, char delim) { strList *res = NULL; -- Gitee From c42caf095421e832a5b84b27b1e4d03370bff577 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Tue, 8 Aug 2023 20:22:38 +0800 Subject: [PATCH 11/56] qapi: strList unit tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is from Steve Sistare's qemu live update patch: https://patchew.org/QEMU/1658851843-236870-1-git-send-email-steven.sistare@oracle.com/1658851843-236870-15-git-send-email-steven.sistare@oracle.com/ Signed-off-by: Steve Sistare Reviewed-by: Marc-André Lureau Message-Id: <1658851843-236870-15-git-send-email-steven.sistare@oracle.com> Signed-off-by: luofei --- MAINTAINERS | 6 +++ tests/unit/meson.build | 1 + tests/unit/test-strlist.c | 81 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 88 insertions(+) create mode 100644 tests/unit/test-strlist.c diff --git a/MAINTAINERS b/MAINTAINERS index fbd6d0b174..0ed0d383d6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2984,6 +2984,12 @@ F: net/colo* F: net/filter-rewriter.c F: net/filter-mirror.c +CPR +M: Steve Sistare +R: Mark Kanda +S: Maintained +F: tests/unit/test-strlist.c + Record/replay M: Pavel Dovgalyuk R: Paolo Bonzini diff --git a/tests/unit/meson.build b/tests/unit/meson.build index
c21d817874..3700dcec65 100644 --- a/tests/unit/meson.build +++ b/tests/unit/meson.build @@ -17,6 +17,7 @@ tests = { 'test-forward-visitor': [testqapi], 'test-string-input-visitor': [testqapi], 'test-string-output-visitor': [testqapi], + 'test-strlist': [testqapi], 'test-opts-visitor': [testqapi], 'test-visitor-serialization': [testqapi], 'test-bitmap': [], diff --git a/tests/unit/test-strlist.c b/tests/unit/test-strlist.c new file mode 100644 index 0000000000..ef740dccc7 --- /dev/null +++ b/tests/unit/test-strlist.c @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2022 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qapi/util.h" +#include "qapi/qapi-builtin-types.h" + +static strList *make_list(int length) +{ + strList *head = 0, *list, **prev = &head; + + while (length--) { + list = *prev = g_new0(strList, 1); + list->value = g_strdup("aaa"); + prev = &list->next; + } + return head; +} + +static void test_length(void) +{ + strList *list; + int i; + + for (i = 0; i < 5; i++) { + list = make_list(i); + g_assert_cmpint(i, ==, QAPI_LIST_LENGTH(list)); + qapi_free_strList(list); + } +} + +struct { + const char *string; + char delim; + const char *args[5]; +} list_data[] = { + { 0, ',', { 0 } }, + { "", ',', { 0 } }, + { "a", ',', { "a", 0 } }, + { "a,b", ',', { "a", "b", 0 } }, + { "a,b,c", ',', { "a", "b", "c", 0 } }, + { "first last", ' ', { "first", "last", 0 } }, + { "a:", ':', { "a", 0 } }, + { "a::b", ':', { "a", "", "b", 0 } }, + { ":", ':', { "", 0 } }, + { ":a", ':', { "", "a", 0 } }, + { "::a", ':', { "", "", "a", 0 } }, +}; + +static void test_strv(void) +{ + int i, j; + const char **expect; + strList *list; + GStrv args; + + for (i = 0; i < ARRAY_SIZE(list_data); i++) { + expect = list_data[i].args; + list = strList_from_string(list_data[i].string, list_data[i].delim); + args = strv_from_strList(list); + 
qapi_free_strList(list); + for (j = 0; expect[j] && args[j]; j++) { + g_assert_cmpstr(expect[j], ==, args[j]); + } + g_assert_null(expect[j]); + g_assert_null(args[j]); + g_strfreev(args); + } +} + +int main(int argc, char **argv) +{ + g_test_init(&argc, &argv, NULL); + g_test_add_func("/test-string/length", test_length); + g_test_add_func("/test-string/strv", test_strv); + return g_test_run(); +} -- Gitee From 3f2237e6fc627bb097fb853826481116fa2350fb Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Tue, 8 Aug 2023 20:41:56 +0800 Subject: [PATCH 12/56] migration: cpr-exec-args parameter This is from Steve Sistare's qemu live update patch: https://patchew.org/QEMU/1658851843-236870-1-git-send-email-steven.sistare@oracle.com/1658851843-236870-16-git-send-email-steven.sistare@oracle.com/ ----------------------------------------------------------------------- Create the cpr-exec-args migration parameter, defined as a list of strings. It will be used for cpr-exec migration mode in a subsequent patch. No functional change, except that cpr-exec-args is shown by the 'info migrate' command. 
Signed-off-by: Steve Sistare Message-Id: <1658851843-236870-16-git-send-email-steven.sistare@oracle.com> Signed-off-by: luofei --- hmp-commands.hx | 2 +- migration/migration.c | 15 +++++++++++++++ monitor/hmp-cmds.c | 20 ++++++++++++++++++++ qapi/migration.json | 9 +++++++++ 4 files changed, 45 insertions(+), 1 deletion(-) diff --git a/hmp-commands.hx b/hmp-commands.hx index 5bedee2d49..bd9e21af36 100644 --- a/hmp-commands.hx +++ b/hmp-commands.hx @@ -995,7 +995,7 @@ ERST { .name = "migrate_set_parameter", - .args_type = "parameter:s,value:s", + .args_type = "parameter:s,value:S", .params = "parameter value", .help = "Set the parameter for migration", .cmd = hmp_migrate_set_parameter, diff --git a/migration/migration.c b/migration/migration.c index 31dcd5abae..b87ebf1374 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -872,6 +872,8 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp) params->cpu_throttle_increment = s->parameters.cpu_throttle_increment; params->has_cpu_throttle_tailslow = true; params->cpu_throttle_tailslow = s->parameters.cpu_throttle_tailslow; + params->has_cpr_exec_args = true; + params->cpr_exec_args = QAPI_CLONE(strList, s->parameters.cpr_exec_args); params->has_tls_creds = true; params->tls_creds = g_strdup(s->parameters.tls_creds); params->has_tls_hostname = true; @@ -1550,6 +1552,10 @@ static void migrate_params_test_apply(MigrateSetParameters *params, dest->cpu_throttle_tailslow = params->cpu_throttle_tailslow; } + if (params->has_cpr_exec_args) { + dest->cpr_exec_args = params->cpr_exec_args; + } + if (params->has_tls_creds) { assert(params->tls_creds->type == QTYPE_QSTRING); dest->tls_creds = params->tls_creds->u.s; @@ -1655,6 +1661,12 @@ static void migrate_params_apply(MigrateSetParameters *params, Error **errp) s->parameters.cpu_throttle_tailslow = params->cpu_throttle_tailslow; } + if (params->has_cpr_exec_args) { + qapi_free_strList(s->parameters.cpr_exec_args); + s->parameters.cpr_exec_args = + 
QAPI_CLONE(strList, params->cpr_exec_args); + } + if (params->has_tls_creds) { g_free(s->parameters.tls_creds); assert(params->tls_creds->type == QTYPE_QSTRING); @@ -4365,6 +4377,8 @@ static Property migration_properties[] = { DEFAULT_MIGRATE_CPU_THROTTLE_INCREMENT), DEFINE_PROP_BOOL("x-cpu-throttle-tailslow", MigrationState, parameters.cpu_throttle_tailslow, false), + DEFINE_PROP_STRLIST("cpr-exec-args", MigrationState, + parameters.cpr_exec_args), DEFINE_PROP_SIZE("x-max-bandwidth", MigrationState, parameters.max_bandwidth, MAX_THROTTLE), DEFINE_PROP_UINT64("x-downtime-limit", MigrationState, @@ -4473,6 +4487,7 @@ static void migration_instance_init(Object *obj) params->has_decompress_threads = true; params->has_compress_method = true; params->has_throttle_trigger_threshold = true; + params->has_cpr_exec_args = true; params->has_cpu_throttle_initial = true; params->has_cpu_throttle_increment = true; params->has_cpu_throttle_tailslow = true; diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c index 238cc29889..f60ce3c99e 100644 --- a/monitor/hmp-cmds.c +++ b/monitor/hmp-cmds.c @@ -369,6 +369,18 @@ void hmp_info_migrate_capabilities(Monitor *mon, const QDict *qdict) qapi_free_MigrationCapabilityStatusList(caps); } +static void monitor_print_cpr_exec_args(Monitor *mon, strList *args) +{ + monitor_printf(mon, "%s:", + MigrationParameter_str(MIGRATION_PARAMETER_CPR_EXEC_ARGS)); + + while (args) { + monitor_printf(mon, " %s", args->value); + args = args->next; + } + monitor_printf(mon, "\n"); +} + void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict) { MigrationParameters *params; @@ -431,6 +443,8 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict) monitor_printf(mon, "%s: %u\n", MigrationParameter_str(MIGRATION_PARAMETER_MAX_CPU_THROTTLE), params->max_cpu_throttle); + assert(params->has_cpr_exec_args); + monitor_print_cpr_exec_args(mon, params->cpr_exec_args); assert(params->has_tls_creds); monitor_printf(mon, "%s: '%s'\n", 
MigrationParameter_str(MIGRATION_PARAMETER_TLS_CREDS), @@ -1176,6 +1190,7 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) uint64_t cache_size; CompressMethod compress_method; Error *err = NULL; + g_autofree char *str = NULL; int val, ret; val = qapi_enum_parse(&MigrationParameter_lookup, param, -1, &err); @@ -1232,6 +1247,11 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) p->has_max_cpu_throttle = true; visit_type_uint8(v, param, &p->max_cpu_throttle, &err); break; + case MIGRATION_PARAMETER_CPR_EXEC_ARGS: + p->has_cpr_exec_args = true; + visit_type_str(v, param, &str, &err); + p->cpr_exec_args = strList_from_string(str, ' '); + break; case MIGRATION_PARAMETER_TLS_CREDS: p->has_tls_creds = true; p->tls_creds = g_new0(StrOrNull, 1); diff --git a/qapi/migration.json b/qapi/migration.json index 279295b3ed..de29cb6a57 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -689,6 +689,8 @@ # at tail stage. # The default value is false. (Since 5.1) # +# @cpr-exec-args: defined in a subsequent patch. +# # @tls-creds: ID of the 'tls-creds' object that provides credentials for # establishing a TLS connection over the migration data channel. # On the outgoing side of the migration, the credentials must @@ -792,6 +794,7 @@ 'compress-wait-thread', 'compress-method', 'throttle-trigger-threshold', 'cpu-throttle-initial', 'cpu-throttle-increment', 'cpu-throttle-tailslow', + 'cpr-exec-args', 'tls-creds', 'tls-hostname', 'tls-authz', 'max-bandwidth', 'downtime-limit', { 'name': 'x-checkpoint-delay', 'features': [ 'unstable' ] }, @@ -861,6 +864,8 @@ # at tail stage. # The default value is false. (Since 5.1) # +# @cpr-exec-args: defined in a subsequent patch. +# # @tls-creds: ID of the 'tls-creds' object that provides credentials # for establishing a TLS connection over the migration data # channel. 
On the outgoing side of the migration, the credentials @@ -973,6 +978,7 @@ '*cpu-throttle-initial': 'uint8', '*cpu-throttle-increment': 'uint8', '*cpu-throttle-tailslow': 'bool', + '*cpr-exec-args': [ 'str' ], '*tls-creds': 'StrOrNull', '*tls-hostname': 'StrOrNull', '*tls-authz': 'StrOrNull', @@ -1067,6 +1073,8 @@ # at tail stage. # The default value is false. (Since 5.1) # +# @cpr-exec-args: defined in a subsequent patch. +# # @tls-creds: ID of the 'tls-creds' object that provides credentials # for establishing a TLS connection over the migration data # channel. On the outgoing side of the migration, the credentials @@ -1179,6 +1187,7 @@ '*cpu-throttle-initial': 'uint8', '*cpu-throttle-increment': 'uint8', '*cpu-throttle-tailslow': 'bool', + '*cpr-exec-args': [ 'str' ], '*tls-creds': 'str', '*tls-hostname': 'str', '*tls-authz': 'str', -- Gitee From a7260c9aa09012365efc70cdffba8c2935a80fbf Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Wed, 9 Aug 2023 09:41:58 +0800 Subject: [PATCH 13/56] migration: simplify notifiers This is from Steve Sistare's qemu live update patch: https://patchew.org/QEMU/1658851843-236870-1-git-send-email-steven.sistare@oracle.com/1658851843-236870-17-git-send-email-steven.sistare@oracle.com/ ------------------------------------------------------------------------ Pass the callback function to add_migration_state_change_notifier so that migration can initialize the notifier on add and clear it on delete, which simplifies the call sites. Also shorten the function names. No functional change. 
Signed-off-by: Steve Sistare Message-Id: <1658851843-236870-17-git-send-email-steven.sistare@oracle.com> Signed-off-by: luofei --- hw/net/virtio-net.c | 6 +++--- hw/vfio/migration.c | 6 +++--- include/migration/misc.h | 6 ++++-- migration/migration.c | 22 ++++++++++++++++------ ui/spice-core.c | 3 +-- 5 files changed, 27 insertions(+), 16 deletions(-) diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index 3bd786cc22..cb627c523f 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -3409,8 +3409,8 @@ static void virtio_net_device_realize(DeviceState *dev, Error **errp) n->primary_listener.hide_device = failover_hide_primary_device; qatomic_set(&n->failover_primary_hidden, true); device_listener_register(&n->primary_listener); - n->migration_state.notify = virtio_net_migration_state_notifier; - add_migration_state_change_notifier(&n->migration_state); + migration_add_notifier(&n->migration_state, + virtio_net_migration_state_notifier); n->host_features |= (1ULL << VIRTIO_NET_F_STANDBY); } @@ -3575,7 +3575,7 @@ static void virtio_net_device_unrealize(DeviceState *dev) if (n->failover) { qobject_unref(n->primary_opts); device_listener_unregister(&n->primary_listener); - remove_migration_state_change_notifier(&n->migration_state); + migration_remove_notifier(&n->migration_state); } else { assert(n->primary_opts == NULL); } diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index fb491bb37b..9247f15557 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -840,8 +840,8 @@ static int vfio_migration_init(VFIODevice *vbasedev, migration->vm_state = qdev_add_vm_change_state_handler(vbasedev->dev, vfio_vmstate_change, vbasedev); - migration->migration_state.notify = vfio_migration_state_notifier; - add_migration_state_change_notifier(&migration->migration_state); + migration_add_notifier(&migration->migration_state, + vfio_migration_state_notifier); return 0; err: @@ -894,7 +894,7 @@ void vfio_migration_finalize(VFIODevice *vbasedev) if 
(vbasedev->migration) { VFIOMigration *migration = vbasedev->migration; - remove_migration_state_change_notifier(&migration->migration_state); + migration_remove_notifier(&migration->migration_state); qemu_del_vm_change_state_handler(migration->vm_state); unregister_savevm(VMSTATE_IF(vbasedev->dev), "vfio", vbasedev); vfio_migration_exit(vbasedev); diff --git a/include/migration/misc.h b/include/migration/misc.h index 71b62857aa..d4c2d7da57 100644 --- a/include/migration/misc.h +++ b/include/migration/misc.h @@ -60,8 +60,10 @@ void migration_object_init(void); void migration_shutdown(void); bool migration_is_idle(void); bool migration_is_active(MigrationState *); -void add_migration_state_change_notifier(Notifier *notify); -void remove_migration_state_change_notifier(Notifier *notify); +void migration_add_notifier(Notifier *notify, + void (*func)(Notifier *notifier, void *data)); +void migration_remove_notifier(Notifier *notify); +void migration_call_notifiers(MigrationState *s); bool migration_in_setup(MigrationState *); bool migration_has_finished(MigrationState *); bool migration_has_failed(MigrationState *); diff --git a/migration/migration.c b/migration/migration.c index b87ebf1374..a3dc5c89da 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -1892,7 +1892,7 @@ static void migrate_fd_cleanup(MigrationState *s) /* It is used on info migrate. 
We can't free it */ error_report_err(error_copy(s->error)); } - notifier_list_notify(&migration_state_notifiers, s); + migration_call_notifiers(s); block_cleanup_parameters(s); yank_unregister_instance(MIGRATION_YANK_INSTANCE); } @@ -1987,14 +1987,24 @@ static void migrate_fd_cancel(MigrationState *s) } } -void add_migration_state_change_notifier(Notifier *notify) +void migration_add_notifier(Notifier *notify, + void (*func)(Notifier *notifier, void *data)) { + notify->notify = func; notifier_list_add(&migration_state_notifiers, notify); } -void remove_migration_state_change_notifier(Notifier *notify) +void migration_remove_notifier(Notifier *notify) +{ + if (notify->notify) { + notifier_remove(notify); + notify->notify = NULL; + } +} + +void migration_call_notifiers(MigrationState *s) { - notifier_remove(notify); + notifier_list_notify(&migration_state_notifiers, s); } bool migration_in_setup(MigrationState *s) @@ -3257,7 +3267,7 @@ static int postcopy_start(MigrationState *ms) * spice needs to trigger a transition now */ ms->postcopy_after_devices = true; - notifier_list_notify(&migration_state_notifiers, ms); + migration_call_notifiers(ms); ms->downtime = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - time_at_stop; @@ -4268,7 +4278,7 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) rate_limit = s->parameters.max_bandwidth / XFER_LIMIT_RATIO; /* Notify before starting migration thread */ - notifier_list_notify(&migration_state_notifiers, s); + migration_call_notifiers(s); } qemu_file_set_rate_limit(s->to_dst_file, rate_limit); diff --git a/ui/spice-core.c b/ui/spice-core.c index 31974b8d6c..6a1c7f790b 100644 --- a/ui/spice-core.c +++ b/ui/spice-core.c @@ -819,8 +819,7 @@ static void qemu_spice_init(void) }; using_spice = 1; - migration_state.notify = migration_state_notifier; - add_migration_state_change_notifier(&migration_state); + migration_add_notifier(&migration_state, migration_state_notifier); spice_migrate.base.sif = &migrate_interface.base; 
qemu_spice.add_interface(&spice_migrate.base); -- Gitee From 6cff7c22979593fdb503c2455c6229e75b168f0e Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Wed, 9 Aug 2023 09:50:38 +0800 Subject: [PATCH 14/56] migration: check mode in notifiers This is from Steve Sistare's qemu live update patch: https://patchew.org/QEMU/1658851843-236870-1-git-send-email-steven.sistare@oracle.com/1658851843-236870-18-git-send-email-steven.sistare@oracle.com/ --------------------------------------------------------------- The existing notifiers should only apply to normal mode. No functional change. Signed-off-by: Steve Sistare Message-Id: <1658851843-236870-18-git-send-email-steven.sistare@oracle.com> Signed-off-by: luofei --- hw/net/virtio-net.c | 4 ++++ hw/vfio/migration.c | 3 +++ ui/spice-core.c | 2 +- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index cb627c523f..42f7a75757 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -3317,6 +3317,10 @@ static void virtio_net_handle_migration_primary(VirtIONet *n, MigrationState *s) static void virtio_net_migration_state_notifier(Notifier *notifier, void *data) { MigrationState *s = data; + + if (migrate_mode_of(s) != MIG_MODE_NORMAL) { + return; + } VirtIONet *n = container_of(notifier, VirtIONet, migration_state); virtio_net_handle_migration_primary(n, s); } diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index 9247f15557..6d09adbfb8 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -759,6 +759,9 @@ static void vfio_migration_state_notifier(Notifier *notifier, void *data) VFIODevice *vbasedev = migration->vbasedev; int ret; + if (migrate_mode_of(s) != MIG_MODE_NORMAL) { + return; + } trace_vfio_migration_state_notifier(vbasedev->name, MigrationStatus_str(s->state)); diff --git a/ui/spice-core.c b/ui/spice-core.c index 6a1c7f790b..de960b4563 100644 --- a/ui/spice-core.c +++ b/ui/spice-core.c @@ -558,7 +558,7 @@ static void 
migration_state_notifier(Notifier *notifier, void *data) { MigrationState *s = data; - if (!spice_have_target_host) { + if (!spice_have_target_host || migrate_mode_of(s) != MIG_MODE_NORMAL) { return; } -- Gitee From 2f519a896c1358ce63194d6cea510739dc8b247a Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Wed, 9 Aug 2023 10:01:00 +0800 Subject: [PATCH 15/56] memory: flat section iterator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is from Steve Sistare's qemu live update patch: https://patchew.org/QEMU/1658851843-236870-1-git-send-email-steven.sistare@oracle.com/1658851843-236870-19-git-send-email-steven.sistare@oracle.com/ Add an iterator over the sections of a flattened address space. Signed-off-by: Steve Sistare Reviewed-by: Marc-André Lureau Message-Id: <1658851843-236870-19-git-send-email-steven.sistare@oracle.com> Signed-off-by: luofei --- include/exec/memory.h | 19 +++++++++++++++++++ softmmu/memory.c | 21 +++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/include/exec/memory.h b/include/exec/memory.h index bb7e5c2c18..bab1322894 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -2343,6 +2343,25 @@ static inline bool memory_region_has_ram_discard_manager(MemoryRegion *mr) void memory_region_set_ram_discard_manager(MemoryRegion *mr, RamDiscardManager *rdm); +typedef int (*memory_region_section_cb)(MemoryRegionSection *mrs, + void *opaque, + Error **errp); + +/** + * address_space_flat_for_each_section: walk the ranges in the address space + * flat view and call @func for each. Return 0 on success, else return non-zero + * with a message in @errp. 
+ * + * @as: target address space + * @func: callback function + * @opaque: passed to @func + * @errp: passed to @func + */ +int address_space_flat_for_each_section(AddressSpace *as, + memory_region_section_cb func, + void *opaque, + Error **errp); + /** * memory_region_find: translate an address/size relative to a * MemoryRegion into a #MemoryRegionSection. diff --git a/softmmu/memory.c b/softmmu/memory.c index 7340e19ff5..520c199dc7 100644 --- a/softmmu/memory.c +++ b/softmmu/memory.c @@ -2663,6 +2663,27 @@ bool memory_region_is_mapped(MemoryRegion *mr) return mr->container ? true : false; } +int address_space_flat_for_each_section(AddressSpace *as, + memory_region_section_cb func, + void *opaque, + Error **errp) +{ + FlatView *view = address_space_get_flatview(as); + FlatRange *fr; + int ret; + + FOR_EACH_FLAT_RANGE(fr, view) { + MemoryRegionSection mrs = section_from_flat_range(fr, view); + ret = func(&mrs, opaque, errp); + if (ret) { + return ret; + } + } + + flatview_unref(view); + return 0; +} + /* Same as memory_region_find, but it does not add a reference to the * returned region. It must be called from an RCU critical section. */ -- Gitee From 83e211f29e0d59e92ab8d31899d2508e6cc3cade Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Wed, 9 Aug 2023 10:07:56 +0800 Subject: [PATCH 16/56] oslib: qemu_clear_cloexec MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is from Steve Sistare's qemu live update patch: https://patchew.org/QEMU/1658851843-236870-1-git-send-email-steven.sistare@oracle.com/1658851843-236870-20-git-send-email-steven.sistare@oracle.com/ ----------------------------------------------------------------- Define qemu_clear_cloexec, analogous to qemu_set_cloexec. Signed-off-by: Steve Sistare Reviewed-by: Dr. 
David Alan Gilbert Reviewed-by: Marc-André Lureau Message-Id: <1658851843-236870-20-git-send-email-steven.sistare@oracle.com> Signed-off-by: luofei --- include/qemu/osdep.h | 9 +++++++++ util/oslib-posix.c | 10 ++++++++++ util/oslib-win32.c | 4 ++++ 3 files changed, 23 insertions(+) diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h index fd9e53f623..c8cba40b96 100644 --- a/include/qemu/osdep.h +++ b/include/qemu/osdep.h @@ -642,6 +642,15 @@ static inline void qemu_timersub(const struct timeval *val1, void qemu_set_cloexec(int fd); +/* + * Clear FD_CLOEXEC for a descriptor. + * + * The caller must guarantee that no other fork+exec's occur before the + * exec that is intended to inherit this descriptor, eg by suspending CPUs + * and blocking monitor commands. + */ +void qemu_clear_cloexec(int fd); + /* Starting on QEMU 2.5, qemu_hw_version() returns "2.5+" by default * instead of QEMU_VERSION, so setting hw_version on MachineClass * is no longer mandatory. diff --git a/util/oslib-posix.c b/util/oslib-posix.c index 18a38b9464..9f35340119 100644 --- a/util/oslib-posix.c +++ b/util/oslib-posix.c @@ -310,6 +310,16 @@ void qemu_set_cloexec(int fd) assert(f != -1); } + +void qemu_clear_cloexec(int fd) +{ + int f; + f = fcntl(fd, F_GETFD); + assert(f != -1); + f = fcntl(fd, F_SETFD, f & ~FD_CLOEXEC); + assert(f != -1); +} + /* * Creates a pipe with FD_CLOEXEC set on both file descriptors */ diff --git a/util/oslib-win32.c b/util/oslib-win32.c index af559ef339..acc3e0661c 100644 --- a/util/oslib-win32.c +++ b/util/oslib-win32.c @@ -265,6 +265,10 @@ void qemu_set_cloexec(int fd) { } +void qemu_clear_cloexec(int fd) +{ +} + /* Offset between 1/1/1601 and 1/1/1970 in 100 nanosec units */ #define _W32_FT_OFFSET (116444736000000000ULL) -- Gitee From e791e7e191f85e17e2b4bcd727dbe1fd3dabb698 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Wed, 9 Aug 2023 10:22:05 +0800 Subject: [PATCH 17/56] vl: helper to request re-exec This is from Steve Sistare's qemu live update 
patch: https://patchew.org/QEMU/1658851843-236870-1-git-send-email-steven.sistare@oracle.com/1658851843-236870-21-git-send-email-steven.sistare@oracle.com/ ------------------------------------------------------------------------------ Add a qemu_system_exec_request() hook that causes the main loop to exit and re-exec qemu using the specified arguments. Signed-off-by: Steve Sistare Message-Id: <1658851843-236870-21-git-send-email-steven.sistare@oracle.com> Signed-off-by: luofei --- include/sysemu/runstate.h | 1 + softmmu/runstate.c | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/include/sysemu/runstate.h b/include/sysemu/runstate.h index a535691573..91c89d8276 100644 --- a/include/sysemu/runstate.h +++ b/include/sysemu/runstate.h @@ -56,6 +56,7 @@ void qemu_system_wakeup_enable(WakeupReason reason, bool enabled); void qemu_register_wakeup_notifier(Notifier *notifier); void qemu_register_wakeup_support(void); void qemu_system_shutdown_request(ShutdownCause reason); +void qemu_system_exec_request(const strList *args); void qemu_system_powerdown_request(void); void qemu_register_powerdown_notifier(Notifier *notifier); void qemu_register_shutdown_notifier(Notifier *notifier); diff --git a/softmmu/runstate.c b/softmmu/runstate.c index 0757d7f26a..c6119e6d67 100644 --- a/softmmu/runstate.c +++ b/softmmu/runstate.c @@ -38,6 +38,7 @@ #include "monitor/monitor.h" #include "net/net.h" #include "net/vhost_net.h" +#include "qapi/util.h" #include "qapi/error.h" #include "qapi/qapi-commands-run-state.h" #include "qapi/qapi-events-run-state.h" @@ -353,6 +354,7 @@ static NotifierList wakeup_notifiers = static NotifierList shutdown_notifiers = NOTIFIER_LIST_INITIALIZER(shutdown_notifiers); static uint32_t wakeup_reason_mask = ~(1 << QEMU_WAKEUP_REASON_NONE); +static GStrv exec_argv; ShutdownCause qemu_shutdown_requested_get(void) { @@ -369,6 +371,11 @@ static int qemu_shutdown_requested(void) return qatomic_xchg(&shutdown_requested, SHUTDOWN_CAUSE_NONE); 
} +static int qemu_exec_requested(void) +{ + return exec_argv != NULL; +} + static void qemu_kill_report(void) { if (!qtest_driver() && shutdown_signal) { @@ -628,6 +635,13 @@ void qemu_system_shutdown_request(ShutdownCause reason) qemu_notify_event(); } +void qemu_system_exec_request(const strList *args) +{ + exec_argv = strv_from_strList(args); + shutdown_requested = 1; + qemu_notify_event(); +} + static void qemu_system_powerdown(void) { qapi_event_send_powerdown(); @@ -676,6 +690,15 @@ static bool main_loop_should_exit(void) } request = qemu_shutdown_requested(); if (request) { + if (qemu_exec_requested()) { + Error *err = NULL; + execvp(exec_argv[0], exec_argv); + error_setg_errno(&err, errno, "execvp %s failed", exec_argv[0]); + error_report_err(err); + g_strfreev(exec_argv); + exec_argv = NULL; + return false; + } qemu_kill_report(); qemu_system_shutdown(request); if (shutdown_action == SHUTDOWN_ACTION_PAUSE) { -- Gitee From 8cf1db17177c34409050f722d7f5d8b5e1bb28a2 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Wed, 9 Aug 2023 14:14:43 +0800 Subject: [PATCH 18/56] cpr: preserve extra state This is mainly from Steve Sistare's qemu live update patch: https://patchew.org/QEMU/1658851843-236870-1-git-send-email-steven.sistare@oracle.com/1658851843-236870-21-git-send-email-steven.sistare@oracle.com/ https://patchew.org/QEMU/1655304746-102776-1-git-send-email-steven.sistare@oracle.com/1655304746-102776-20-git-send-email-steven.sistare@oracle.com/ ------------------------------------------------------------------------------ cpr must save state that is needed after qemu is restarted, when devices are realized. Thus the extra state cannot be saved in the migration stream file, as objects must already exist before that file can be loaded. Instead, define auxiliary state structures and vmstate descriptions, not associated with any registered object, and serialize the aux state to a memfd file. Deserialize after qemu restarts, before devices are realized.
The following state is saved: * cpr mode * file descriptor names and values * memfd values and properties for ram blocks Signed-off-by: Steve Sistare Message-Id: <1658851843-236870-22-git-send-email-steven.sistare@oracle.com> Signed-off-by: luofei --- MAINTAINERS | 3 + include/migration/cpr-state.h | 30 +++ migration/cpr-state.c | 347 ++++++++++++++++++++++++++++++++++ migration/meson.build | 1 + migration/qemu-file-channel.c | 12 ++ migration/qemu-file-channel.h | 3 + migration/trace-events | 8 + stubs/cpr-state.c | 26 +++ stubs/meson.build | 1 + 9 files changed, 431 insertions(+) create mode 100644 include/migration/cpr-state.h create mode 100644 migration/cpr-state.c create mode 100644 stubs/cpr-state.c diff --git a/MAINTAINERS b/MAINTAINERS index 0ed0d383d6..b015363420 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2989,6 +2989,9 @@ M: Steve Sistare R: Mark Kanda S: Maintained F: tests/unit/test-strlist.c +F: include/migration/cpr-state.h +F: migration/cpr-state.c +F: stubs/cpr-state.c Record/replay M: Pavel Dovgalyuk diff --git a/include/migration/cpr-state.h b/include/migration/cpr-state.h new file mode 100644 index 0000000000..a9ae6ae239 --- /dev/null +++ b/include/migration/cpr-state.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2021, 2022 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * See the COPYING file in the top-level directory. 
+ */ + +#ifndef MIGRATION_CPR_STATE_H +#define MIGRATION_CPR_STATE_H + +#include "qapi/qapi-types-migration.h" + +typedef int (*cpr_walk_fd_cb)(const char *name, int id, int fd, void *opaque); + +void cpr_save_fd(const char *name, int id, int fd); +void cpr_delete_fd(const char *name, int id); +int cpr_find_fd(const char *name, int id); +int cpr_walk_fd(cpr_walk_fd_cb cb, void *handle); +void cpr_save_memfd(const char *name, int fd, size_t len, size_t maxlen, + uint64_t align); +int cpr_find_memfd(const char *name, size_t *lenp, size_t *maxlenp, + uint64_t *alignp); +void cpr_delete_memfd(const char *name); +void cpr_resave_fd(const char *name, int id, int fd); +int cpr_state_save(Error **errp); +void cpr_state_unsave(void); +int cpr_state_load(Error **errp); +void cpr_state_print(void); + +#endif diff --git a/migration/cpr-state.c b/migration/cpr-state.c new file mode 100644 index 0000000000..528e4f9200 --- /dev/null +++ b/migration/cpr-state.c @@ -0,0 +1,347 @@ +/* + * Copyright (c) 2022 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qemu/cutils.h" +#include "qemu/queue.h" +#include "qemu/memfd.h" +#include "qapi/error.h" +#include "io/channel-file.h" +#include "migration/vmstate.h" +#include "migration/cpr-state.h" +#include "migration/migration.h" +#include "migration/misc.h" +#include "migration/qemu-file.h" +#include "migration/qemu-file-channel.h" +#include "trace.h" + +/*************************************************************************/ +/* cpr state container for all information to be saved. 
*/ + +typedef QLIST_HEAD(CprNameList, CprName) CprNameList; + +typedef struct CprState { + MigMode mode; + CprNameList fds; /* list of CprFd */ + CprNameList memfd; /* list of CprMemfd */ +} CprState; + +static CprState cpr_state = { + .mode = MIG_MODE_NORMAL, +}; + +/*************************************************************************/ +/* Generic list of names. */ + +typedef struct CprName { + char *name; + unsigned int namelen; + int id; + QLIST_ENTRY(CprName) next; +} CprName; + +static const VMStateDescription vmstate_cpr_name = { + .name = "cpr name", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_UINT32(namelen, CprName), + VMSTATE_VBUFFER_ALLOC_UINT32(name, CprName, 0, NULL, namelen), + VMSTATE_INT32(id, CprName), + VMSTATE_END_OF_LIST() + } +}; + +static void +add_name(CprNameList *head, const char *name, int id, CprName *elem) +{ + elem->name = g_strdup(name); + elem->namelen = strlen(name) + 1; + elem->id = id; + QLIST_INSERT_HEAD(head, elem, next); +} + +static CprName *find_name(CprNameList *head, const char *name, int id) +{ + CprName *elem; + + QLIST_FOREACH(elem, head, next) { + if (!strcmp(elem->name, name) && elem->id == id) { + return elem; + } + } + return NULL; +} + +static void delete_name(CprNameList *head, const char *name, int id) +{ + CprName *elem = find_name(head, name, id); + + if (elem) { + QLIST_REMOVE(elem, next); + g_free(elem->name); + g_free(elem); + } +} + +/****************************************************************************/ +/* Lists of named things. The first field of each entry must be a CprName. 
*/ + +typedef struct CprFd { + CprName name; /* must be first */ + int fd; +} CprFd; + +static const VMStateDescription vmstate_cpr_fd = { + .name = "cpr fd", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_STRUCT(name, CprFd, 1, vmstate_cpr_name, CprName), + VMSTATE_INT32(fd, CprFd), + VMSTATE_END_OF_LIST() + } +}; + +#define CPR_FD(elem) ((CprFd *)(elem)) +#define CPR_FD_FD(elem) (CPR_FD(elem)->fd) + +void cpr_save_fd(const char *name, int id, int fd) +{ + CprFd *elem = g_new0(CprFd, 1); + + trace_cpr_save_fd(name, id, fd); + elem->fd = fd; + add_name(&cpr_state.fds, name, id, &elem->name); +} + +void cpr_delete_fd(const char *name, int id) +{ + trace_cpr_delete_fd(name, id); + delete_name(&cpr_state.fds, name, id); +} + +int cpr_find_fd(const char *name, int id) +{ + CprName *elem = find_name(&cpr_state.fds, name, id); + int fd = elem ? CPR_FD_FD(elem) : -1; + + if (fd >= 0) { + /* Set cloexec to prevent fd leaks from fork until the next cpr-exec */ + qemu_set_cloexec(fd); + } + + trace_cpr_find_fd(name, id, fd); + return fd; +} + +int cpr_walk_fd(cpr_walk_fd_cb cb, void *opaque) +{ + CprName *elem; + + QLIST_FOREACH(elem, &cpr_state.fds, next) { + if (cb(elem->name, elem->id, CPR_FD_FD(elem), opaque)) { + return 1; + } + } + return 0; +} + +void cpr_resave_fd(const char *name, int id, int fd) +{ + CprName *elem = find_name(&cpr_state.fds, name, id); + int old_fd = elem ? CPR_FD_FD(elem) : -1; + + if (old_fd < 0) { + cpr_save_fd(name, id, fd); + } else if (old_fd != fd) { + error_setg(&error_fatal, + "internal error: cpr fd '%s' id %d value %d " + "already saved with a different value %d", + name, id, fd, old_fd); + } +} + +/*************************************************************************/ +/* A memfd ram block. 
*/ + +typedef struct CprMemfd { + CprName name; /* must be first */ + size_t len; + size_t maxlen; + uint64_t align; +} CprMemfd; + +static const VMStateDescription vmstate_cpr_memfd = { + .name = "cpr memfd", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_STRUCT(name, CprMemfd, 1, vmstate_cpr_name, CprName), + VMSTATE_UINT64(len, CprMemfd), + VMSTATE_UINT64(maxlen, CprMemfd), + VMSTATE_UINT64(align, CprMemfd), + VMSTATE_END_OF_LIST() + } +}; + +#define CPR_MEMFD(elem) ((CprMemfd *)(elem)) +#define CPR_MEMFD_LEN(elem) (CPR_MEMFD(elem)->len) +#define CPR_MEMFD_MAXLEN(elem) (CPR_MEMFD(elem)->maxlen) +#define CPR_MEMFD_ALIGN(elem) (CPR_MEMFD(elem)->align) + +void cpr_save_memfd(const char *name, int fd, size_t len, size_t maxlen, + uint64_t align) +{ + CprMemfd *elem = g_new0(CprMemfd, 1); + + trace_cpr_save_memfd(name, len, maxlen, align); + elem->len = len; + elem->maxlen = maxlen; + elem->align = align; + add_name(&cpr_state.memfd, name, 0, &elem->name); + cpr_save_fd(name, 0, fd); +} + +void cpr_delete_memfd(const char *name) +{ + trace_cpr_delete_memfd(name); + delete_name(&cpr_state.memfd, name, 0); + cpr_delete_fd(name, 0); +} + +int cpr_find_memfd(const char *name, size_t *lenp, size_t *maxlenp, + uint64_t *alignp) +{ + int fd = cpr_find_fd(name, 0); + CprName *elem = find_name(&cpr_state.memfd, name, 0); + + if (elem) { + *lenp = CPR_MEMFD_LEN(elem); + *maxlenp = CPR_MEMFD_MAXLEN(elem); + *alignp = CPR_MEMFD_ALIGN(elem); + } else { + *lenp = 0; + *maxlenp = 0; + *alignp = 0; + } + + trace_cpr_find_memfd(name, *lenp, *maxlenp, *alignp); + return fd; +} + +/*************************************************************************/ +/* cpr state container interface and implementation. 
*/ + +#define CPR_STATE_NAME "QEMU_CPR_STATE" + +static const VMStateDescription vmstate_cpr_state = { + .name = CPR_STATE_NAME, + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_UINT32(mode, CprState), + VMSTATE_QLIST_V(fds, CprState, 1, vmstate_cpr_fd, CprFd, name.next), + VMSTATE_QLIST_V(memfd, CprState, 1, vmstate_cpr_memfd, CprMemfd, + name.next), + VMSTATE_END_OF_LIST() + } +}; + +int cpr_state_save(Error **errp) +{ + int ret, mfd; + QEMUFile *f; + char val[16]; + + mfd = memfd_create(CPR_STATE_NAME, 0); + if (mfd < 0) { + error_setg_errno(errp, errno, "memfd_create failed"); + return -1; + } + + cpr_state.mode = migrate_mode(); + qemu_clear_cloexec(mfd); + + f = qemu_fopen_fd(mfd, true, CPR_STATE_NAME); + ret = vmstate_save_state(f, &vmstate_cpr_state, &cpr_state, 0); + if (ret) { + error_setg(errp, "vmstate_save_state error %d", ret); + goto error; + } + + /* Do not close f, as mfd must remain open. */ + qemu_fflush(f); + lseek(mfd, 0, SEEK_SET); + + /* Remember mfd for post-exec cpr_state_load */ + snprintf(val, sizeof(val), "%d", mfd); + g_setenv(CPR_STATE_NAME, val, 1); + + return 0; + +error: + close(mfd); + cpr_state.mode = MIG_MODE_NORMAL; + return ret; +} + +void cpr_state_unsave(void) +{ + int mfd; + const char *val = g_getenv(CPR_STATE_NAME); + + if (val) { + g_unsetenv(CPR_STATE_NAME); + if (!qemu_strtoi(val, NULL, 10, &mfd)) { + close(mfd); + } + } +} + +int cpr_state_load(Error **errp) +{ + int ret, mfd; + QEMUFile *f; + const char *val = g_getenv(CPR_STATE_NAME); + + if (!val) { + return 0; + } + g_unsetenv(CPR_STATE_NAME); + if (qemu_strtoi(val, NULL, 10, &mfd)) { + error_setg(errp, "Bad %s env value %s", CPR_STATE_NAME, val); + return 1; + } + f = qemu_fopen_fd(mfd, false, CPR_STATE_NAME); + ret = vmstate_load_state(f, &vmstate_cpr_state, &cpr_state, 1); + qemu_fclose(f); + + if (!ret) { + migrate_get_current()->parameters.mode = cpr_state.mode; + } else { + error_setg(errp, "vmstate_load_state error %d", 
ret); + } + + return ret; +} + +void cpr_state_print(void) +{ + CprName *elem; + + printf("cpr_state:\n"); + printf("- mode = %d\n", cpr_state.mode); + QLIST_FOREACH(elem, &cpr_state.fds, next) { + printf("- %s %d : fd=%d\n", elem->name, elem->id, CPR_FD_FD(elem)); + } + QLIST_FOREACH(elem, &cpr_state.memfd, next) { + printf("- %s : len=%lu, maxlen=%lu, align=%lu\n", elem->name, + CPR_MEMFD_LEN(elem), CPR_MEMFD_MAXLEN(elem), + CPR_MEMFD_ALIGN(elem)); + } +} diff --git a/migration/meson.build b/migration/meson.build index 7cd4604322..38b1765814 100644 --- a/migration/meson.build +++ b/migration/meson.build @@ -15,6 +15,7 @@ softmmu_ss.add(files( 'channel.c', 'colo-failover.c', 'colo.c', + 'cpr-state.c', 'exec.c', 'fd.c', 'file.c', diff --git a/migration/qemu-file-channel.c b/migration/qemu-file-channel.c index bb5a5752df..290b20f794 100644 --- a/migration/qemu-file-channel.c +++ b/migration/qemu-file-channel.c @@ -27,8 +27,10 @@ #include "qemu-file.h" #include "io/channel-socket.h" #include "io/channel-tls.h" +#include "io/channel-file.h" #include "qemu/iov.h" #include "qemu/yank.h" +#include "qapi/error.h" #include "yank_functions.h" @@ -192,3 +194,13 @@ QEMUFile *qemu_fopen_channel_output(QIOChannel *ioc) object_ref(OBJECT(ioc)); return qemu_fopen_ops(ioc, &channel_output_ops, true); } + +QEMUFile *qemu_fopen_fd(int fd, bool writable, const char *name) +{ + g_autoptr(QIOChannelFile) fioc = qio_channel_file_new_fd(fd); + QIOChannel *ioc = QIO_CHANNEL(fioc); + QEMUFile *f = writable ? 
qemu_fopen_channel_output(ioc) : + qemu_fopen_channel_input(ioc); + qio_channel_set_name(ioc, name); + return f; +} diff --git a/migration/qemu-file-channel.h b/migration/qemu-file-channel.h index 0028a09eb6..1b3f94da31 100644 --- a/migration/qemu-file-channel.h +++ b/migration/qemu-file-channel.h @@ -29,4 +29,7 @@ QEMUFile *qemu_fopen_channel_input(QIOChannel *ioc); QEMUFile *qemu_fopen_channel_output(QIOChannel *ioc); + +QEMUFile *qemu_fopen_fd(int fd, bool writable, const char *name); + #endif diff --git a/migration/trace-events b/migration/trace-events index 880d47df60..47d6bc51e2 100644 --- a/migration/trace-events +++ b/migration/trace-events @@ -316,6 +316,14 @@ colo_receive_message(const char *msg) "Receive '%s' message" # colo-failover.c colo_failover_set_state(const char *new_state) "new state %s" +# cpr-state.c +cpr_save_fd(const char *name, int id, int fd) "%s, id %d, fd %d" +cpr_delete_fd(const char *name, int id) "%s, id %d" +cpr_find_fd(const char *name, int id, int fd) "%s, id %d returns %d" +cpr_save_memfd(const char *name, size_t len, size_t maxlen, uint64_t align) "%s, len %lu, maxlen %lu, align %lu" +cpr_delete_memfd(const char *name) "%s" +cpr_find_memfd(const char *name, size_t len, size_t maxlen, uint64_t align) "%s, len %lu, maxlen %lu, align %lu" + # block-dirty-bitmap.c send_bitmap_header_enter(void) "" send_bitmap_bits(uint32_t flags, uint64_t start_sector, uint32_t nr_sectors, uint64_t data_size) "flags: 0x%x, start_sector: %" PRIu64 ", nr_sectors: %" PRIu32 ", data_size: %" PRIu64 diff --git a/stubs/cpr-state.c b/stubs/cpr-state.c new file mode 100644 index 0000000000..1c9dc78a38 --- /dev/null +++ b/stubs/cpr-state.c @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2022 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * See the COPYING file in the top-level directory. 
+ */ + +#include "qemu/osdep.h" +#include "migration/cpr-state.h" + +void cpr_save_fd(const char *name, int id, int fd) +{ +} + +void cpr_delete_fd(const char *name, int id) +{ +} + +int cpr_find_fd(const char *name, int id) +{ + return -1; +} + +void cpr_resave_fd(const char *name, int id, int fd) +{ +} diff --git a/stubs/meson.build b/stubs/meson.build index 71469c1d50..9565c7dc3e 100644 --- a/stubs/meson.build +++ b/stubs/meson.build @@ -4,6 +4,7 @@ stub_ss.add(files('blk-exp-close-all.c')) stub_ss.add(files('blockdev-close-all-bdrv-states.c')) stub_ss.add(files('change-state-handler.c')) stub_ss.add(files('cmos.c')) +stub_ss.add(files('cpr-state.c')) stub_ss.add(files('cpu-get-clock.c')) stub_ss.add(files('cpus-get-virtual-clock.c')) stub_ss.add(files('qemu-timer-notify-cb.c')) -- Gitee From 738537d2eeccb33406e1110142c576b2649ce23f Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Wed, 9 Aug 2023 14:59:46 +0800 Subject: [PATCH 19/56] cpr: exec mode This is from Steve Sistare's qemu live update patch: https://patchew.org/QEMU/1658851843-236870-1-git-send-email-steven.sistare@oracle.com/1658851843-236870-23-git-send-email-steven.sistare@oracle.com/ -------------------------------------------------------------------------------- Add the cpr-exec migration mode. Usage: qemu-system-$arch -migrate-mode-enable cpr-exec ... migrate_set_parameter mode cpr-exec migrate_set_parameter cpr-exec-args ... -incoming defer migrate -d file: ... poll for runstate inmigrate ... migrate_incoming file: In this mode, the migrate command saves state to a file, directly exec's a new version of qemu on the same host, replacing the original process while retaining its PID, and loads the file via the migrate-incoming command. The caller must specify a migration URI that writes to and reads from a file. Arguments for the new qemu process are taken from the @cpr-exec-args parameter. 
The first argument should be the path of a new qemu binary, or a prefix command that exec's the new qemu binary. The arguments must match those used to initially start qemu, plus the -incoming option. Guest RAM must be backed by a memory backend with share=on, but cannot be memory-backend-ram, and the '-migrate-mode-enable cpr-exec' option is required. This causes secondary guest ram blocks (those not specified on the command line) to be allocated by mmap'ing a memfd. The memfds are kept open across exec, their values are saved in special cpr state which is retrieved after exec, and they are re-mmap'd. Hence guest RAM is preserved in place, albeit with new virtual addresses in the qemu process. Since guest RAM is not copied, and storage blocks are not migrated, the caller must disable all capabilities related to page and block copy, and the implementation ignores all related parameters. Cpr-exec mode supports memory-backend-memfd, memory-backend-epc, and vfio devices in subsequent patches. 
Signed-off-by: Steve Sistare Message-Id: <1658851843-236870-23-git-send-email-steven.sistare@oracle.com> Signed-off-by: luofei --- MAINTAINERS | 2 + include/migration/cpr.h | 18 +++++++++ migration/cpr.c | 85 +++++++++++++++++++++++++++++++++++++++++ migration/meson.build | 1 + migration/migration.c | 9 +++++ migration/ram.c | 4 +- qapi/migration.json | 25 ++++++++++-- softmmu/physmem.c | 49 +++++++++++++++++++++++- softmmu/runstate.c | 4 +- trace-events | 1 + 10 files changed, 191 insertions(+), 7 deletions(-) create mode 100644 include/migration/cpr.h create mode 100644 migration/cpr.c diff --git a/MAINTAINERS b/MAINTAINERS index b015363420..e71c05cd75 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2992,6 +2992,8 @@ F: tests/unit/test-strlist.c F: include/migration/cpr-state.h F: migration/cpr-state.c F: stubs/cpr-state.c +F: include/migration/cpr.h +F: migration/cpr.c Record/replay M: Pavel Dovgalyuk diff --git a/include/migration/cpr.h b/include/migration/cpr.h new file mode 100644 index 0000000000..c48be2d571 --- /dev/null +++ b/include/migration/cpr.h @@ -0,0 +1,18 @@ +/* + * Copyright (c) 2021, 2022 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * See the COPYING file in the top-level directory. + */ + +#ifndef MIGRATION_CPR_H +#define MIGRATION_CPR_H + +extern bool only_cpr_capable; + +void cpr_init(void); +void cpr_exec(void); +void cpr_exec_failed(Error *err); +void cpr_preserve_fds(void); + +#endif diff --git a/migration/cpr.c b/migration/cpr.c new file mode 100644 index 0000000000..698baa455e --- /dev/null +++ b/migration/cpr.c @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2021, 2022 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * See the COPYING file in the top-level directory. 
+ */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "migration/migration.h" +#include "migration/misc.h" +#include "migration/cpr.h" +#include "migration/cpr-state.h" +#include "sysemu/runstate.h" + +bool only_cpr_capable; +static Notifier cpr_fd_notifier; + +static int preserve_fd(const char *name, int id, int fd, void *opaque) +{ + qemu_clear_cloexec(fd); + return 0; +} + +static int unpreserve_fd(const char *name, int id, int fd, void *opaque) +{ + qemu_set_cloexec(fd); + return 0; +} + +static void cpr_fd_notifier_func(Notifier *notifier, void *data) +{ + MigrationState *s = data; + + if (migrate_mode_of(s) == MIG_MODE_CPR_EXEC && migration_has_failed(s)) { + cpr_walk_fd(unpreserve_fd, 0); + } +} + +void cpr_preserve_fds(void) +{ + cpr_walk_fd(preserve_fd, 0); +} + +void cpr_init(void) +{ + cpr_state_load(&error_fatal); + migration_add_notifier(&cpr_fd_notifier, cpr_fd_notifier_func); +} + +void cpr_exec(void) +{ + MigrationState *s = migrate_get_current(); + Error *err = NULL; + + if (migrate_mode_of(s) == MIG_MODE_CPR_EXEC && !migration_has_failed(s)) { + if (!migration_has_finished(s)) { + error_setg(&err, "cannot exec: migration status is '%s', " + "but must be 'completed'", + MigrationStatus_str(s->state)); + goto error; + } + + if (cpr_state_save(&err)) { + goto error; + } + + qemu_system_exec_request(s->parameters.cpr_exec_args); + } + return; + +error: + cpr_exec_failed(err); +} + +void cpr_exec_failed(Error *err) +{ + MigrationState *s = migrate_get_current(); + + migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED); + migrate_set_error(s, err); + error_report_err(err); + migration_call_notifiers(s); + cpr_state_unsave(); +} diff --git a/migration/meson.build b/migration/meson.build index 38b1765814..494774a575 100644 --- a/migration/meson.build +++ b/migration/meson.build @@ -15,6 +15,7 @@ softmmu_ss.add(files( 'channel.c', 'colo-failover.c', 'colo.c', + 'cpr.c', 'cpr-state.c', 'exec.c', 'fd.c', diff --git 
a/migration/migration.c b/migration/migration.c index a3dc5c89da..5d1cac8792 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -33,6 +33,7 @@ #include "savevm.h" #include "qemu-file-channel.h" #include "qemu-file.h" +#include "migration/cpr.h" #include "migration/vmstate.h" #include "block/block.h" #include "qapi/error.h" @@ -216,6 +217,7 @@ void migration_object_init(void) blk_mig_init(); ram_mig_init(); dirty_bitmap_mig_init(); + cpr_init(); } void migration_cancel(const Error *error) @@ -1894,6 +1896,7 @@ static void migrate_fd_cleanup(MigrationState *s) } migration_call_notifiers(s); block_cleanup_parameters(s); + cpr_exec(); yank_unregister_instance(MIGRATION_YANK_INSTANCE); } @@ -2428,6 +2431,12 @@ static bool migrate_prepare(MigrationState *s, bool blk, bool blk_inc, return false; } + if (migrate_mode_of(s) == MIG_MODE_CPR_EXEC && + !s->parameters.has_cpr_exec_args) { + error_setg(errp, "cpr-exec mode requires setting cpr-exec-args"); + return false; + } + if (migration_is_blocked(errp)) { return false; } diff --git a/migration/ram.c b/migration/ram.c index bf018f3299..3d3ea489a5 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -201,6 +201,7 @@ out: bool ramblock_is_ignored(RAMBlock *block) { return !qemu_ram_is_migratable(block) || + migrate_mode() == MIG_MODE_CPR_EXEC || (migrate_ignore_shared() && qemu_ram_is_shared(block) && ramblock_is_named_file(block)); } @@ -3194,7 +3195,8 @@ static void ram_init_bitmaps(RAMState *rs) WITH_RCU_READ_LOCK_GUARD() { ram_list_init_bitmaps(); /* We don't use dirty log with background snapshots */ - if (!migrate_background_snapshot()) { + if (!migrate_background_snapshot() && + migrate_mode() == MIG_MODE_NORMAL) { memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION); migration_bitmap_sync_precopy(rs); } diff --git a/qapi/migration.json b/qapi/migration.json index de29cb6a57..40fdf275d0 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -546,10 +546,21 @@ # # @normal: the original form of 
migration. # +# @cpr-exec: The migrate command saves state to a file, directly exec's a +# new version of qemu on the same host, replacing the original +# process while retaining its PID, and loads the file via the +# migrate-incoming command. The caller must specify a migration URI +# that writes to and reads from a file. Guest RAM must be backed by +# a memory backend with share=on, and cannot be memory-backend-ram. +# Guest RAM is not copied, and storage blocks are not migrated, so +# all capabilities related to page and block copy must be disabled, +# and all related parameters are ignored. Arguments for the new +# qemu process are taken from the @cpr-exec-args parameter. +# # Since: 6.2 ## { 'enum': 'MigMode', - 'data': [ 'normal' ] } + 'data': [ 'normal', 'cpr-exec' ] } ## # @BitmapMigrationBitmapAliasTransform: @@ -689,7 +700,11 @@ # at tail stage. # The default value is false. (Since 5.1) # -# @cpr-exec-args: defined in a subsequent patch. +# @cpr-exec-args: arguments passed to new qemu for cpr-exec mode. The first +# argument should be the path of a new qemu binary, or a prefix +# command that exec's the new qemu binary. The arguments must +# match those used to initially start qemu, plus the -incoming +# option. (Since 6.2) # # @tls-creds: ID of the 'tls-creds' object that provides credentials for # establishing a TLS connection over the migration data channel. @@ -864,7 +879,8 @@ # at tail stage. # The default value is false. (Since 5.1) # -# @cpr-exec-args: defined in a subsequent patch. +# @cpr-exec-args: Arguments passed to new qemu for cpr-exec mode. +# See description in @MigrationParameter. (Since 6.2) # # @tls-creds: ID of the 'tls-creds' object that provides credentials # for establishing a TLS connection over the migration data @@ -1073,7 +1089,8 @@ # at tail stage. # The default value is false. (Since 5.1) # -# @cpr-exec-args: defined in a subsequent patch. +# @cpr-exec-args: Arguments passed to new qemu for cpr-exec mode. 
+# See description in @MigrationParameter. (Since 6.2) # # @tls-creds: ID of the 'tls-creds' object that provides credentials # for establishing a TLS connection over the migration data diff --git a/softmmu/physmem.c b/softmmu/physmem.c index 819e2c3c17..63009738bf 100644 --- a/softmmu/physmem.c +++ b/softmmu/physmem.c @@ -43,6 +43,7 @@ #include "qemu/qemu-print.h" #include "qemu/log.h" #include "exec/memory.h" +#include "qemu/memfd.h" #include "exec/ioport.h" #include "sysemu/dma.h" #include "sysemu/hostmem.h" @@ -65,6 +66,8 @@ #include "qemu/pmem.h" +#include "migration/cpr-state.h" +#include "migration/misc.h" #include "migration/vmstate.h" #include "qemu/range.h" @@ -1978,6 +1981,40 @@ static void dirty_memory_extend(ram_addr_t old_ram_size, } } +static bool memory_region_is_backend(MemoryRegion *mr) +{ + return !!object_dynamic_cast(OBJECT(mr)->parent, TYPE_MEMORY_BACKEND); +} + +static void *qemu_anon_memfd_alloc(RAMBlock *rb, size_t maxlen, Error **errp) +{ + size_t len, align; + void *addr; + struct MemoryRegion *mr = rb->mr; + const char *name = memory_region_name(mr); + int mfd = cpr_find_memfd(name, &len, &maxlen, &align); + + if (mfd >= 0) { + rb->used_length = len; + rb->max_length = maxlen; + mr->align = align; + } else { + len = rb->used_length; + maxlen = rb->max_length; + mr->align = QEMU_VMALLOC_ALIGN; + mfd = qemu_memfd_create(name, maxlen + mr->align, 0, 0, 0, errp); + if (mfd < 0) { + return NULL; + } + cpr_save_memfd(name, mfd, len, maxlen, mr->align); + } + rb->flags |= RAM_SHARED; + qemu_set_cloexec(mfd); + addr = file_ram_alloc(rb, maxlen, mfd, false, false, 0, errp); + trace_anon_memfd_alloc(name, maxlen, addr, mfd); + return addr; +} + static void ram_block_add(RAMBlock *new_block, Error **errp) { const bool noreserve = qemu_ram_is_noreserve(new_block); @@ -2001,6 +2038,14 @@ static void ram_block_add(RAMBlock *new_block, Error **errp) qemu_mutex_unlock_ramlist(); return; } + } else if (migrate_mode_enabled(MIG_MODE_CPR_EXEC) && + 
!memory_region_is_backend(new_block->mr)) { + new_block->host = qemu_anon_memfd_alloc(new_block, + new_block->max_length, + errp); + if (!new_block->host) { + return; + } } else { new_block->host = qemu_anon_ram_alloc(new_block->max_length, &new_block->mr->align, @@ -2012,8 +2057,9 @@ static void ram_block_add(RAMBlock *new_block, Error **errp) qemu_mutex_unlock_ramlist(); return; } - memory_try_enable_merging(new_block->host, new_block->max_length); } + if (!xen_enabled()) + memory_try_enable_merging(new_block->host, new_block->max_length); } new_ram_size = MAX(old_ram_size, @@ -2246,6 +2292,7 @@ void qemu_ram_free(RAMBlock *block) } qemu_mutex_lock_ramlist(); + cpr_delete_memfd(memory_region_name(block->mr)); QLIST_REMOVE_RCU(block, next); ram_list.mru_block = NULL; /* Write list before version */ diff --git a/softmmu/runstate.c b/softmmu/runstate.c index c6119e6d67..b7d9675a89 100644 --- a/softmmu/runstate.c +++ b/softmmu/runstate.c @@ -33,6 +33,7 @@ #include "exec/exec-all.h" #include "exec/gdbstub.h" #include "hw/boards.h" +#include "migration/cpr.h" #include "migration/misc.h" #include "migration/postcopy-ram.h" #include "monitor/monitor.h" @@ -692,9 +693,10 @@ static bool main_loop_should_exit(void) if (request) { if (qemu_exec_requested()) { Error *err = NULL; + cpr_preserve_fds(); execvp(exec_argv[0], exec_argv); error_setg_errno(&err, errno, "execvp %s failed", exec_argv[0]); - error_report_err(err); + cpr_exec_failed(err); g_strfreev(exec_argv); exec_argv = NULL; return false; diff --git a/trace-events b/trace-events index a637a61eba..770a9ac0b7 100644 --- a/trace-events +++ b/trace-events @@ -45,6 +45,7 @@ ram_block_discard_range(const char *rbname, void *hva, size_t length, bool need_ # accel/tcg/cputlb.c memory_notdirty_write_access(uint64_t vaddr, uint64_t ram_addr, unsigned size) "0x%" PRIx64 " ram_addr 0x%" PRIx64 " size %u" memory_notdirty_set_dirty(uint64_t vaddr) "0x%" PRIx64 +anon_memfd_alloc(const char *name, size_t size, void *ptr, int fd) 
"%s size %zu ptr %p fd %d" # gdbstub.c gdbstub_op_start(const char *device) "Starting gdbstub using device %s" -- Gitee From b1ce9d8ac80ddf3c407dd86f93723f6e7edf6efc Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Wed, 9 Aug 2023 15:12:04 +0800 Subject: [PATCH 20/56] cpr: add exec-mode blockers This is from Steve Sistare's qemu live update patch: https://patchew.org/QEMU/1658851843-236870-1-git-send-email-steven.sistare@oracle.com/1658851843-236870-24-git-send-email-steven.sistare@oracle.com/ ---------------------------------------------------------------------------- Add blockers for cpr-exec migration mode for devices and options that do not support it. Signed-off-by: Steve Sistare Message-Id: <1658851843-236870-24-git-send-email-steven.sistare@oracle.com> Signed-off-by: luofei --- accel/xen/xen-all.c | 4 ++++ migration/migration.c | 5 +++++ replay/replay.c | 4 ++++ 3 files changed, 13 insertions(+) diff --git a/accel/xen/xen-all.c b/accel/xen/xen-all.c index 69aa7d018b..bf188ab15a 100644 --- a/accel/xen/xen-all.c +++ b/accel/xen/xen-all.c @@ -181,6 +181,10 @@ static int xen_init(MachineState *ms) * opt out of system RAM being allocated by generic code */ mc->default_ram_id = NULL; + + migrate_add_blocker_always("xen does not support cpr exec", + &error_fatal, MIG_MODE_CPR_EXEC, -1); + return 0; } diff --git a/migration/migration.c b/migration/migration.c index 5d1cac8792..175d98e1b3 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -1263,6 +1263,11 @@ static bool migrate_caps_check(bool *cap_list, return false; } + if (cap_list[MIGRATION_CAPABILITY_X_COLO]) { + return migrate_add_blocker_always("x-colo is not compatible with cpr", + errp, MIG_MODE_CPR_EXEC, -1); + } + return true; } diff --git a/replay/replay.c b/replay/replay.c index 6df2abc18c..901782ac8b 100644 --- a/replay/replay.c +++ b/replay/replay.c @@ -19,6 +19,7 @@ #include "qemu/option.h" #include "sysemu/cpus.h" #include "qemu/error-report.h" +#include "migration/blocker.h" /* 
Current version of the replay mechanism. Increase it when file format changes. */ @@ -245,6 +246,9 @@ static void replay_enable(const char *fname, int mode) const char *fmode = NULL; assert(!replay_file); + migrate_add_blocker_always("replay is not compatible with cpr", + &error_fatal, MIG_MODE_CPR_EXEC, -1); + switch (mode) { case REPLAY_MODE_RECORD: fmode = "wb"; -- Gitee From 49594e60496a9dd4e319b91fa615e68e9832f24a Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Wed, 9 Aug 2023 15:37:44 +0800 Subject: [PATCH 21/56] cpr: ram block blockers This is from Steve Sistare's qemu live update patch: https://patchew.org/QEMU/1658851843-236870-1-git-send-email-steven.sistare@oracle.com/1658851843-236870-25-git-send-email-steven.sistare@oracle.com/ ---------------------------------------------------------------------- cpr-exec mode cannot save volatile ram blocks in the migration stream file and recreate them later, because the physical memory for the blocks is pinned and registered for vfio. Add an exec-mode blocker for volatile ram blocks. 
Signed-off-by: Steve Sistare Message-Id: <1658851843-236870-25-git-send-email-steven.sistare@oracle.com> Signed-off-by: luofei --- include/exec/memory.h | 2 ++ include/exec/ramblock.h | 1 + softmmu/physmem.c | 45 +++++++++++++++++++++++++++++++++++++++++ softmmu/vl.c | 2 ++ 4 files changed, 50 insertions(+) diff --git a/include/exec/memory.h b/include/exec/memory.h index bab1322894..4a5237cb4b 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -3022,6 +3022,8 @@ bool ram_block_discard_is_disabled(void); */ bool ram_block_discard_is_required(void); +void ram_block_add_cpr_blockers(Error **errp); + #endif #endif diff --git a/include/exec/ramblock.h b/include/exec/ramblock.h index 664701b759..83d2923e8b 100644 --- a/include/exec/ramblock.h +++ b/include/exec/ramblock.h @@ -37,6 +37,7 @@ struct RAMBlock { /* RCU-enabled, writes protected by the ramlist lock */ QLIST_ENTRY(RAMBlock) next; QLIST_HEAD(, RAMBlockNotifier) ramblock_notifiers; + Error *cpr_blocker; int fd; size_t page_size; /* dirty bitmap used during migration */ diff --git a/softmmu/physmem.c b/softmmu/physmem.c index 63009738bf..92b84b08cc 100644 --- a/softmmu/physmem.c +++ b/softmmu/physmem.c @@ -66,6 +66,7 @@ #include "qemu/pmem.h" +#include "migration/blocker.h" #include "migration/cpr-state.h" #include "migration/misc.h" #include "migration/vmstate.h" @@ -1986,6 +1987,49 @@ static bool memory_region_is_backend(MemoryRegion *mr) return !!object_dynamic_cast(OBJECT(mr)->parent, TYPE_MEMORY_BACKEND); } +/* + * Return true if ram contents would be lost during cpr for MIG_MODE_CPR_EXEC. + * Return false for ram_device because it is remapped after exec. Do not + * exclude rom, even though it is readonly, because the rom file could change + * in the new qemu. Return false for non-migratable blocks. They are either + * re-created after exec, or are handled specially, or are covered by a + * device-level cpr blocker. 
Return false for an fd, because it is visible and + * can be remapped in the new process. + */ +static bool ram_is_volatile(RAMBlock *rb) +{ + MemoryRegion *mr = rb->mr; + + return mr && + memory_region_is_ram(mr) && + !memory_region_is_ram_device(mr) && + (!qemu_ram_is_shared(rb) || !ramblock_is_named_file(rb)) && + qemu_ram_is_migratable(rb) && + rb->fd < 0; +} + +/* + * Add a MIG_MODE_CPR_EXEC blocker for each volatile ram block. This cannot be + * performed in ram_block_add because the migratable flag has not been set yet. + * No need to examine anonymous (non-backend) blocks, because they are + * created using memfd if cpr-exec mode is enabled. + */ +void ram_block_add_cpr_blockers(Error **errp) +{ + RAMBlock *rb; + + RAMBLOCK_FOREACH(rb) { + if (ram_is_volatile(rb) && memory_region_is_backend(rb->mr)) { + const char *name = memory_region_name(rb->mr); + rb->cpr_blocker = NULL; + error_setg(&rb->cpr_blocker, + "Memory region %s is volatile. A memory-backend-memfd or" + " memory-backend-file with share=on is required.", name); + migrate_add_blockers(&rb->cpr_blocker, errp, MIG_MODE_CPR_EXEC, -1); + } + } +} + static void *qemu_anon_memfd_alloc(RAMBlock *rb, size_t maxlen, Error **errp) { size_t len, align; @@ -2293,6 +2337,7 @@ void qemu_ram_free(RAMBlock *block) qemu_mutex_lock_ramlist(); cpr_delete_memfd(memory_region_name(block->mr)); + migrate_del_blocker(&block->cpr_blocker); QLIST_REMOVE_RCU(block, next); ram_list.mru_block = NULL; /* Write list before version */ diff --git a/softmmu/vl.c b/softmmu/vl.c index f79157f5bc..4660c173a9 100644 --- a/softmmu/vl.c +++ b/softmmu/vl.c @@ -28,6 +28,7 @@ #include "qemu/units.h" #include "qemu/log.h" #include "exec/cpu-common.h" +#include "exec/memory.h" #include "hw/qdev-properties.h" #include "qapi/compat-policy.h" #include "qapi/error.h" @@ -2745,6 +2746,7 @@ void qmp_x_exit_preconfig(Error **errp) qemu_init_board(); qemu_create_cli_devices(); qemu_machine_creation_done(); + 
ram_block_add_cpr_blockers(&error_fatal); if (loadvm) { load_snapshot(loadvm, NULL, false, NULL, &error_fatal); -- Gitee From a169ff9fa0f6f369585cd3c98e4fee2a0fb97110 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Wed, 9 Aug 2023 15:58:24 +0800 Subject: [PATCH 22/56] cpr: only-cpr-capable This is from Steve Sistare's qemu live update patch: https://patchew.org/QEMU/1658851843-236870-1-git-send-email-steven.sistare@oracle.com/1658851843-236870-26-git-send-email-steven.sistare@oracle.com/ --------------------------------------------------------------------------------- Add the only-cpr-capable option, which causes qemu to exit with an error if any devices that are not capable of cpr are added. This guarantees that the migrate command will not fail due to a blocker. Signed-off-by: Steve Sistare Message-Id: <1658851843-236870-26-git-send-email-steven.sistare@oracle.com> Signed-off-by: luofei --- include/migration/cpr.h | 2 ++ migration/migration.c | 13 +++++++++++++ qemu-options.hx | 8 ++++++++ softmmu/vl.c | 4 ++++ 4 files changed, 27 insertions(+) diff --git a/include/migration/cpr.h b/include/migration/cpr.h index c48be2d571..07907975e3 100644 --- a/include/migration/cpr.h +++ b/include/migration/cpr.h @@ -8,6 +8,8 @@ #ifndef MIGRATION_CPR_H #define MIGRATION_CPR_H +#define CPR_MODES (BIT(MIG_MODE_CPR_EXEC)) + extern bool only_cpr_capable; void cpr_init(void); diff --git a/migration/migration.c b/migration/migration.c index 175d98e1b3..992fe9f50f 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -2111,6 +2111,11 @@ bool migrate_mode_enabled(MigMode mode) return !!(migrate_enabled_modes & BIT(mode)); } +static bool migrate_modes_enabled(int modes) +{ + return (modes & migrate_enabled_modes) == modes; +} + static int migrate_check_enabled(Error **errp) { MigMode mode = migrate_mode(); @@ -2192,6 +2197,14 @@ static int check_blockers(Error **reasonp, Error **errp, int modes) return -EACCES; } + if (only_cpr_capable && (modes & CPR_MODES) && + 
migrate_modes_enabled(modes & CPR_MODES)) { + error_propagate_prepend(errp, *reasonp, + "-only-cpr-capable specified, but: "); + *reasonp = NULL; + return -EACCES; + } + return add_blockers(reasonp, errp, modes); } diff --git a/qemu-options.hx b/qemu-options.hx index b09c3818d7..ad948f4b71 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -4468,6 +4468,14 @@ SRST migration using any mode except 'normal'. ERST +DEF("only-cpr-capable", 0, QEMU_OPTION_only_cpr_capable, \ + "-only-cpr-capable allow only cpr capable devices\n", QEMU_ARCH_ALL) +SRST +``-only-cpr-capable`` + Only allow cpr capable devices, which guarantees that cpr will not + fail due to a cpr blocker. +ERST + DEF("nodefaults", 0, QEMU_OPTION_nodefaults, \ "-nodefaults don't create default devices\n", QEMU_ARCH_ALL) SRST diff --git a/softmmu/vl.c b/softmmu/vl.c index 4660c173a9..e18b761f10 100644 --- a/softmmu/vl.c +++ b/softmmu/vl.c @@ -76,6 +76,7 @@ #include "hw/block/block.h" #include "hw/i386/x86.h" #include "hw/i386/pc.h" +#include "migration/cpr.h" #include "migration/misc.h" #include "migration/snapshot.h" #include "sysemu/tpm.h" @@ -3514,6 +3515,9 @@ void qemu_init(int argc, char **argv, char **envp) case QEMU_OPTION_only_migratable: only_migratable = 1; break; + case QEMU_OPTION_only_cpr_capable: + only_cpr_capable = true; + break; case QEMU_OPTION_migrate_mode_enable: migrate_enable_mode(qapi_enum_parse(&MigMode_lookup, optarg, -1, &error_fatal)); -- Gitee From 7f8c461c3f8d0eabe2f207da4aeccc1ebfa493cf Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Wed, 9 Aug 2023 16:12:23 +0800 Subject: [PATCH 23/56] cpr: Mismatched GPAs fix This is from Steve Sistare's qemu live update patch: https://patchew.org/QEMU/1658851843-236870-1-git-send-email-steven.sistare@oracle.com/1658851843-236870-27-git-send-email-steven.sistare@oracle.com/ ----------------------------------------------------------------------------- For cpr-exec mode, ramblock_is_ignored is always true, and the address of each migrated 
memory region must match the address of the statically initialized region on the target. However, for a PCI rom block, the region address is set when the guest writes to a BAR on the source, which does not occur on the target, causing a "Mismatched GPAs" error during cpr-exec migration. To fix, unconditionally set the target's address to the source's address if the region does not have an address yet. Signed-off-by: Steve Sistare Message-Id: <1658851843-236870-27-git-send-email-steven.sistare@oracle.com> Signed-off-by: luofei --- include/exec/memory.h | 12 ++++++++++++ migration/ram.c | 17 ++++++++++------- softmmu/memory.c | 10 ++++++++-- 3 files changed, 30 insertions(+), 9 deletions(-) diff --git a/include/exec/memory.h b/include/exec/memory.h index 4a5237cb4b..90718e911b 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -738,6 +738,7 @@ struct MemoryRegion { bool flush_coalesced_mmio; uint8_t dirty_log_mask; bool is_iommu; + bool has_addr; RAMBlock *ram_block; Object *owner; @@ -2266,6 +2267,17 @@ void memory_region_set_enabled(MemoryRegion *mr, bool enabled); */ void memory_region_set_address(MemoryRegion *mr, hwaddr addr); +/* + * memory_region_set_address_only: set the address of a region. + * + * Same as memory_region_set_address, but without causing transaction side + * effects. + * + * @mr: the region to be updated + * @addr: new address, relative to container region + */ +void memory_region_set_address_only(MemoryRegion *mr, hwaddr addr); + /* * memory_region_set_size: dynamically update the size of a region. 
* diff --git a/migration/ram.c b/migration/ram.c index 3d3ea489a5..32b7507da0 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -4398,13 +4398,16 @@ static int ram_load_precopy(QEMUFile *f) } if (migrate_ignore_shared()) { hwaddr addr = qemu_get_be64(f); - if (ramblock_is_ignored(block) && - block->mr->addr != addr) { - error_report("Mismatched GPAs for block %s " - "%" PRId64 "!= %" PRId64, - id, (uint64_t)addr, - (uint64_t)block->mr->addr); - ret = -EINVAL; + if (ramblock_is_ignored(block)) { + if (!block->mr->has_addr) { + memory_region_set_address_only(block->mr, addr); + } else if (block->mr->addr != addr) { + error_report("Mismatched GPAs for block %s " + "%" PRId64 "!= %" PRId64, + id, (uint64_t)addr, + (uint64_t)block->mr->addr); + ret = -EINVAL; + } } } ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG, diff --git a/softmmu/memory.c b/softmmu/memory.c index 520c199dc7..f430556cf8 100644 --- a/softmmu/memory.c +++ b/softmmu/memory.c @@ -2537,7 +2537,7 @@ static void memory_region_add_subregion_common(MemoryRegion *mr, { assert(!subregion->container); subregion->container = mr; - subregion->addr = offset; + memory_region_set_address_only(subregion, offset); memory_region_update_container_subregions(subregion); } @@ -2612,10 +2612,16 @@ static void memory_region_readd_subregion(MemoryRegion *mr) } } +void memory_region_set_address_only(MemoryRegion *mr, hwaddr addr) +{ + mr->addr = addr; + mr->has_addr = true; +} + void memory_region_set_address(MemoryRegion *mr, hwaddr addr) { if (addr != mr->addr) { - mr->addr = addr; + memory_region_set_address_only(mr, addr); memory_region_readd_subregion(mr); } } -- Gitee From ea6004bf72de4021f924a541441be0f9b28dd2cc Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Wed, 9 Aug 2023 16:19:13 +0800 Subject: [PATCH 24/56] hostmem-memfd: cpr support This is from Steve Sistare's qemu live update patch: 
https://patchew.org/QEMU/1658851843-236870-1-git-send-email-steven.sistare@oracle.com/1658851843-236870-28-git-send-email-steven.sistare@oracle.com/ ---------------------------------------------------------------------------- Preserve memory-backend-memfd memory objects during cpr. Signed-off-by: Steve Sistare Message-Id: <1658851843-236870-28-git-send-email-steven.sistare@oracle.com> Signed-off-by: luofei --- backends/hostmem-memfd.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/backends/hostmem-memfd.c b/backends/hostmem-memfd.c index 3fc85c3db8..d421a4fea5 100644 --- a/backends/hostmem-memfd.c +++ b/backends/hostmem-memfd.c @@ -14,6 +14,7 @@ #include "sysemu/hostmem.h" #include "qom/object_interfaces.h" #include "qemu/memfd.h" +#include "migration/cpr-state.h" #include "qemu/module.h" #include "qapi/error.h" #include "qom/object.h" @@ -36,28 +37,29 @@ memfd_backend_memory_alloc(HostMemoryBackend *backend, Error **errp) { HostMemoryBackendMemfd *m = MEMORY_BACKEND_MEMFD(backend); uint32_t ram_flags; - char *name; - int fd; + g_autofree char *name = host_memory_backend_get_name(backend); + int fd = cpr_find_fd(name, 0); if (!backend->size) { error_setg(errp, "can't create backend with size 0"); return; } - fd = qemu_memfd_create(TYPE_MEMORY_BACKEND_MEMFD, backend->size, - m->hugetlb, m->hugetlbsize, m->seal ? - F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL : 0, - errp); - if (fd == -1) { - return; + if (fd < 0) { + fd = qemu_memfd_create(TYPE_MEMORY_BACKEND_MEMFD, backend->size, + m->hugetlb, m->hugetlbsize, m->seal ? + F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL : 0, + errp); + if (fd == -1) { + return; + } + cpr_save_fd(name, 0, fd); } - name = host_memory_backend_get_name(backend); ram_flags = backend->share ? RAM_SHARED : 0; ram_flags |= backend->reserve ? 
0 : RAM_NORESERVE; memory_region_init_ram_from_fd(&backend->mr, OBJECT(backend), name, backend->size, ram_flags, fd, 0, errp); - g_free(name); } static bool -- Gitee From 225f5092c0d63f277dbfdd21ee5d5c513353b3b4 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Wed, 9 Aug 2023 16:26:43 +0800 Subject: [PATCH 25/56] hostmem-epc: cpr support This is from Steve Sistare's qemu live update patch: https://patchew.org/QEMU/1658851843-236870-1-git-send-email-steven.sistare@oracle.com/1658851843-236870-29-git-send-email-steven.sistare@oracle.com/ ----------------------------------------------------------------------------- Preserve memory-backend-epc memory objects during cpr. Signed-off-by: Steve Sistare Message-Id: <1658851843-236870-29-git-send-email-steven.sistare@oracle.com> Signed-off-by: luofei --- backends/hostmem-epc.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/backends/hostmem-epc.c b/backends/hostmem-epc.c index b47f98b6a3..2a57b92bea 100644 --- a/backends/hostmem-epc.c +++ b/backends/hostmem-epc.c @@ -17,32 +17,34 @@ #include "qapi/error.h" #include "sysemu/hostmem.h" #include "hw/i386/hostmem-epc.h" +#include "migration/cpr-state.h" static void sgx_epc_backend_memory_alloc(HostMemoryBackend *backend, Error **errp) { uint32_t ram_flags; - char *name; - int fd; + g_autofree char *name = object_get_canonical_path(OBJECT(backend)); + int fd = cpr_find_fd(name, 0); if (!backend->size) { error_setg(errp, "can't create backend with size 0"); return; } - fd = qemu_open_old("/dev/sgx_vepc", O_RDWR); if (fd < 0) { - error_setg_errno(errp, errno, - "failed to open /dev/sgx_vepc to alloc SGX EPC"); - return; + fd = qemu_open_old("/dev/sgx_vepc", O_RDWR); + if (fd < 0) { + error_setg_errno(errp, errno, + "failed to open /dev/sgx_vepc to alloc SGX EPC"); + return; + } + cpr_save_fd(name, 0, fd); } - name = object_get_canonical_path(OBJECT(backend)); ram_flags = (backend->share ? 
RAM_SHARED : 0) | RAM_PROTECTED; memory_region_init_ram_from_fd(&backend->mr, OBJECT(backend), name, backend->size, ram_flags, fd, 0, errp); - g_free(name); } static void sgx_epc_backend_instance_init(Object *obj) -- Gitee From c3bde753558abf4e6af022460b9a1a4e67276517 Mon Sep 17 00:00:00 2001 From: luofei Date: Wed, 9 Aug 2023 16:40:35 +0800 Subject: [PATCH 26/56] hostmem-file: cpr support Preserve memory-backend-file memory objects during cpr. Signed-off-by: luofei --- softmmu/memory.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/softmmu/memory.c b/softmmu/memory.c index f430556cf8..c0d7be9e9d 100644 --- a/softmmu/memory.c +++ b/softmmu/memory.c @@ -33,6 +33,7 @@ #include "qemu/accel.h" #include "hw/boards.h" #include "migration/vmstate.h" +#include "migration/cpr-state.h" //#define DEBUG_UNASSIGNED @@ -1593,6 +1594,7 @@ void memory_region_init_ram_from_file(MemoryRegion *mr, bool readonly, Error **errp) { + int fd; Error *err = NULL; memory_region_init(mr, owner, name, size); mr->ram = true; @@ -1600,8 +1602,21 @@ void memory_region_init_ram_from_file(MemoryRegion *mr, mr->terminates = true; mr->destructor = memory_region_destructor_ram; mr->align = align; - mr->ram_block = qemu_ram_alloc_from_file(size, mr, ram_flags, path, - readonly, &err); + + /* make sure mr has a valid name */ + memory_region_name(mr); + fd = cpr_find_fd(mr->name, 0); + if (fd < 0) { + mr->ram_block = qemu_ram_alloc_from_file(size, mr, ram_flags, path, + readonly, &err); + if (mr->ram_block) { + fd = mr->ram_block->fd; + cpr_save_fd(mr->name, 0, fd); + } + } else { + mr->ram_block = qemu_ram_alloc_from_fd(size, mr, ram_flags, fd, 0, + readonly, &err); + } if (err) { mr->size = int128_zero(); object_unparent(OBJECT(mr)); -- Gitee From 318dec2da5a596a86149687b2e27fc324661eae9 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Wed, 9 Aug 2023 16:45:28 +0800 Subject: [PATCH 27/56] pci: export msix_is_pending This is from Steve Sistare's qemu live update 
patch: https://patchew.org/QEMU/1658851843-236870-1-git-send-email-steven.sistare@oracle.com/1658851843-236870-30-git-send-email-steven.sistare@oracle.com/ ------------------------------------------------------------------------------ Export msix_is_pending for use by cpr. No functional change. Signed-off-by: Steve Sistare Acked-by: Michael S. Tsirkin Message-Id: <1658851843-236870-30-git-send-email-steven.sistare@oracle.com> Signed-off-by: luofei --- hw/pci/msix.c | 2 +- include/hw/pci/msix.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/hw/pci/msix.c b/hw/pci/msix.c index ae9331cd0b..e492ce0e0f 100644 --- a/hw/pci/msix.c +++ b/hw/pci/msix.c @@ -64,7 +64,7 @@ static uint8_t *msix_pending_byte(PCIDevice *dev, int vector) return dev->msix_pba + vector / 8; } -static int msix_is_pending(PCIDevice *dev, int vector) +int msix_is_pending(PCIDevice *dev, unsigned int vector) { return *msix_pending_byte(dev, vector) & msix_pending_mask(vector); } diff --git a/include/hw/pci/msix.h b/include/hw/pci/msix.h index 4c4a60c739..00653548b7 100644 --- a/include/hw/pci/msix.h +++ b/include/hw/pci/msix.h @@ -32,6 +32,7 @@ int msix_present(PCIDevice *dev); bool msix_is_masked(PCIDevice *dev, unsigned vector); void msix_set_pending(PCIDevice *dev, unsigned vector); void msix_clr_pending(PCIDevice *dev, int vector); +int msix_is_pending(PCIDevice *dev, unsigned vector); int msix_vector_use(PCIDevice *dev, unsigned vector); void msix_vector_unuse(PCIDevice *dev, unsigned vector); -- Gitee From e395071d9b4ce19b95283e1ed966335a478aeedd Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Wed, 9 Aug 2023 17:45:48 +0800 Subject: [PATCH 28/56] vfio-pci: refactor for cpr This is mainly from Steve Sistare's qemu live update patch: https://patchew.org/QEMU/1640199934-455149-1-git-send-email-steven.sistare@oracle.com/1640199934-455149-19-git-send-email-steven.sistare@oracle.com/ 
https://patchew.org/QEMU/1658851843-236870-1-git-send-email-steven.sistare@oracle.com/1658851843-236870-31-git-send-email-steven.sistare@oracle.com/ -------------------------------------------------------------------------------- Refactor vector use into a helper vfio_vector_init. Add vfio_notifier_init and vfio_notifier_cleanup for named notifiers, and pass additional arguments to vfio_remove_kvm_msi_virq. All for use by cpr in a subsequent patch. No functional change. Signed-off-by: Steve Sistare Message-Id: <1658851843-236870-31-git-send-email-steven.sistare@oracle.com> Signed-off-by: luofei --- hw/vfio/pci.c | 102 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 67 insertions(+), 35 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index b085389ff8..9ed9d661b3 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -48,6 +48,27 @@ static void vfio_disable_interrupts(VFIOPCIDevice *vdev); static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled); +/* Create new or reuse existing eventfd */ +static int vfio_notifier_init(VFIOPCIDevice *vdev, EventNotifier *e, + const char *name, int nr) +{ + int fd = -1; /* placeholder until a subsequent patch */ + int ret = 0; + + if (fd >= 0) { + event_notifier_init_fd(e, fd); + } else { + ret = event_notifier_init(e, 0); + } + return ret; +} + +static void vfio_notifier_cleanup(VFIOPCIDevice *vdev, EventNotifier *e, + const char *name, int nr) +{ + event_notifier_cleanup(e); +} + /* * Disabling BAR mmaping can be slow, but toggling it around INTx can * also be a huge overhead. 
We try to get the best of both worlds by @@ -128,8 +149,8 @@ static void vfio_intx_enable_kvm(VFIOPCIDevice *vdev, Error **errp) pci_irq_deassert(&vdev->pdev); /* Get an eventfd for resample/unmask */ - if (event_notifier_init(&vdev->intx.unmask, 0)) { - error_setg(errp, "event_notifier_init failed eoi"); + if (vfio_notifier_init(vdev, &vdev->intx.unmask, "intx-unmask", 0)) { + error_setg(errp, "vfio_notifier_init intx-unmask failed"); goto fail; } @@ -161,7 +182,7 @@ fail_vfio: kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vdev->intx.interrupt, vdev->intx.route.irq); fail_irqfd: - event_notifier_cleanup(&vdev->intx.unmask); + vfio_notifier_cleanup(vdev, &vdev->intx.unmask, "intx-unmask", 0); fail: qemu_set_fd_handler(irq_fd, vfio_intx_interrupt, NULL, vdev); vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); @@ -190,7 +211,7 @@ static void vfio_intx_disable_kvm(VFIOPCIDevice *vdev) } /* We only need to close the eventfd for VFIO to cleanup the kernel side */ - event_notifier_cleanup(&vdev->intx.unmask); + vfio_notifier_cleanup(vdev, &vdev->intx.unmask, "intx-unmask", 0); /* QEMU starts listening for interrupt events. 
*/ qemu_set_fd_handler(event_notifier_get_fd(&vdev->intx.interrupt), @@ -281,9 +302,10 @@ static int vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp) } #endif - ret = event_notifier_init(&vdev->intx.interrupt, 0); + ret = vfio_notifier_init(vdev, &vdev->intx.interrupt, "intx-interrupt", 0); if (ret) { - error_setg_errno(errp, -ret, "event_notifier_init failed"); + error_setg_errno(errp, -ret, + "vfio_notifier_init intx-interrupt failed"); return ret; } fd = event_notifier_get_fd(&vdev->intx.interrupt); @@ -292,7 +314,7 @@ static int vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp) if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0, VFIO_IRQ_SET_ACTION_TRIGGER, fd, errp)) { qemu_set_fd_handler(fd, NULL, NULL, vdev); - event_notifier_cleanup(&vdev->intx.interrupt); + vfio_notifier_cleanup(vdev, &vdev->intx.interrupt, "intx-interrupt", 0); return -errno; } @@ -320,7 +342,7 @@ static void vfio_intx_disable(VFIOPCIDevice *vdev) fd = event_notifier_get_fd(&vdev->intx.interrupt); qemu_set_fd_handler(fd, NULL, NULL, vdev); - event_notifier_cleanup(&vdev->intx.interrupt); + vfio_notifier_cleanup(vdev, &vdev->intx.interrupt, "intx-interrupt", 0); vdev->interrupt = VFIO_INT_NONE; @@ -410,41 +432,43 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix) } static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector, - int vector_n, bool msix) + int nr, bool msix) { int virq; + const char *name = "kvm_interrupt"; if ((msix && vdev->no_kvm_msix) || (!msix && vdev->no_kvm_msi)) { return; } - if (event_notifier_init(&vector->kvm_interrupt, 0)) { + if (vfio_notifier_init(vdev, &vector->kvm_interrupt, name, nr)) { return; } - virq = kvm_irqchip_add_msi_route(kvm_state, vector_n, &vdev->pdev); + virq = kvm_irqchip_add_msi_route(kvm_state, nr, &vdev->pdev); if (virq < 0) { - event_notifier_cleanup(&vector->kvm_interrupt); + vfio_notifier_cleanup(vdev, &vector->kvm_interrupt, name, nr); return; } if 
(kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt, NULL, virq) < 0) { kvm_irqchip_release_virq(kvm_state, virq); - event_notifier_cleanup(&vector->kvm_interrupt); + vfio_notifier_cleanup(vdev, &vector->kvm_interrupt, name, nr); return; } vector->virq = virq; } -static void vfio_remove_kvm_msi_virq(VFIOMSIVector *vector) +static void vfio_remove_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector, + int nr) { kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt, vector->virq); kvm_irqchip_release_virq(kvm_state, vector->virq); vector->virq = -1; - event_notifier_cleanup(&vector->kvm_interrupt); + vfio_notifier_cleanup(vdev, &vector->kvm_interrupt, "kvm_interrupt", nr); } static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg, @@ -454,6 +478,20 @@ static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg, kvm_irqchip_commit_routes(kvm_state); } +static void vfio_vector_init(VFIOPCIDevice *vdev, int nr) +{ + VFIOMSIVector *vector = &vdev->msi_vectors[nr]; + PCIDevice *pdev = &vdev->pdev; + + vector->vdev = vdev; + vector->virq = -1; + if (vfio_notifier_init(vdev, &vector->interrupt, "interrupt", nr)) { + error_report("vfio: vfio_notifier_init interrupt failed"); + } + vector->use = true; + msix_vector_use(pdev, nr); +} + static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, MSIMessage *msg, IOHandler *handler) { @@ -466,13 +504,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, vector = &vdev->msi_vectors[nr]; if (!vector->use) { - vector->vdev = vdev; - vector->virq = -1; - if (event_notifier_init(&vector->interrupt, 0)) { - error_report("vfio: Error: event_notifier_init failed"); - } - vector->use = true; - msix_vector_use(pdev, nr); + vfio_vector_init(vdev, nr); } qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt), @@ -484,7 +516,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, */ if (vector->virq >= 0) { if 
(!msg) { - vfio_remove_kvm_msi_virq(vector); + vfio_remove_kvm_msi_virq(vdev, vector, nr); } else { vfio_update_kvm_msi_virq(vector, *msg, pdev); } @@ -629,8 +661,8 @@ retry: vector->virq = -1; vector->use = true; - if (event_notifier_init(&vector->interrupt, 0)) { - error_report("vfio: Error: event_notifier_init failed"); + if (vfio_notifier_init(vdev, &vector->interrupt, "interrupt", i)) { + error_report("vfio: Error: vfio_notifier_init failed"); } qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt), @@ -658,11 +690,11 @@ retry: for (i = 0; i < vdev->nr_vectors; i++) { VFIOMSIVector *vector = &vdev->msi_vectors[i]; if (vector->virq >= 0) { - vfio_remove_kvm_msi_virq(vector); + vfio_remove_kvm_msi_virq(vdev, vector, i); } qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt), NULL, NULL, NULL); - event_notifier_cleanup(&vector->interrupt); + vfio_notifier_cleanup(vdev, &vector->interrupt, "interrupt", i); } g_free(vdev->msi_vectors); @@ -697,11 +729,11 @@ static void vfio_msi_disable_common(VFIOPCIDevice *vdev) VFIOMSIVector *vector = &vdev->msi_vectors[i]; if (vdev->msi_vectors[i].use) { if (vector->virq >= 0) { - vfio_remove_kvm_msi_virq(vector); + vfio_remove_kvm_msi_virq(vdev, vector, i); } qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt), NULL, NULL, NULL); - event_notifier_cleanup(&vector->interrupt); + vfio_notifier_cleanup(vdev, &vector->interrupt, "interrupt", i); } } @@ -2694,7 +2726,7 @@ static void vfio_register_err_notifier(VFIOPCIDevice *vdev) return; } - if (event_notifier_init(&vdev->err_notifier, 0)) { + if (vfio_notifier_init(vdev, &vdev->err_notifier, "err", 0)) { error_report("vfio: Unable to init event notifier for error detection"); vdev->pci_aer = false; return; @@ -2707,7 +2739,7 @@ static void vfio_register_err_notifier(VFIOPCIDevice *vdev) VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) { error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); qemu_set_fd_handler(fd, NULL, NULL, vdev); - 
event_notifier_cleanup(&vdev->err_notifier); + vfio_notifier_cleanup(vdev, &vdev->err_notifier, "err", 0); vdev->pci_aer = false; } } @@ -2726,7 +2758,7 @@ static void vfio_unregister_err_notifier(VFIOPCIDevice *vdev) } qemu_set_fd_handler(event_notifier_get_fd(&vdev->err_notifier), NULL, NULL, vdev); - event_notifier_cleanup(&vdev->err_notifier); + vfio_notifier_cleanup(vdev, &vdev->err_notifier, "err", 0); } static void vfio_req_notifier_handler(void *opaque) @@ -2760,7 +2792,7 @@ static void vfio_register_req_notifier(VFIOPCIDevice *vdev) return; } - if (event_notifier_init(&vdev->req_notifier, 0)) { + if (vfio_notifier_init(vdev, &vdev->req_notifier, "req", 0)) { error_report("vfio: Unable to init event notifier for device request"); return; } @@ -2772,7 +2804,7 @@ static void vfio_register_req_notifier(VFIOPCIDevice *vdev) VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) { error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); qemu_set_fd_handler(fd, NULL, NULL, vdev); - event_notifier_cleanup(&vdev->req_notifier); + vfio_notifier_cleanup(vdev, &vdev->req_notifier, "req", 0); } else { vdev->req_enabled = true; } @@ -2792,7 +2824,7 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev) } qemu_set_fd_handler(event_notifier_get_fd(&vdev->req_notifier), NULL, NULL, vdev); - event_notifier_cleanup(&vdev->req_notifier); + vfio_notifier_cleanup(vdev, &vdev->req_notifier, "req", 0); vdev->req_enabled = false; } -- Gitee From 217b779d830634a4336a390bb6e895f1d78f1aba Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Thu, 10 Aug 2023 15:36:17 +0800 Subject: [PATCH 29/56] vfio-pci: cpr part 1 (fd and dma) This is mainly from Steve Sistare's qemu live update patch: https://patchew.org/QEMU/1658851843-236870-1-git-send-email-steven.sistare@oracle.com/1658851843-236870-32-git-send-email-steven.sistare@oracle.com/ ---------------------------------------------------------------------------- Enable vfio-pci devices to be saved and
restored across a cpr-exec of qemu. At vfio creation time, save the value of vfio container, group, and device descriptors in cpr state. In the container pre_save handler, suspend the use of virtual addresses in DMA mappings with VFIO_DMA_UNMAP_FLAG_VADDR, because guest ram will be remapped at a different VA after exec. DMA to already-mapped pages continues. Save the msi message area as part of vfio-pci vmstate, save the interrupt and notifier eventfd's in cpr state, and clear the close-on-exec flag for the vfio descriptors. The flag is not cleared earlier because the descriptors should not persist across miscellaneous fork and exec calls that may be performed during normal operation. On qemu restart, vfio_realize() finds the saved descriptors, uses the descriptors, and notes that the device is being reused. Device and iommu state is already configured, so operations in vfio_realize that would modify the configuration are skipped for a reused device, including vfio ioctl's and writes to PCI configuration space. Vfio PCI device reset is also suppressed. The result is that vfio_realize constructs qemu data structures that reflect the current state of the device. However, the reconstruction is not complete until migrate_incoming is called. migrate_incoming loads the msi data, the vfio post_load handler finds eventfds in cpr state, rebuilds vector data structures, and attaches the interrupts to the new KVM instance. The container post_load handler then invokes the main vfio listener callback, which walks the flattened ranges of the vfio address space and calls VFIO_DMA_MAP_FLAG_VADDR to inform the kernel of the new VA's. Lastly, migration resumes the VM. This functionality is delivered by 3 patches for clarity. Part 1 handles device file descriptors and DMA. Part 2 adds eventfd and MSI/MSI-X vector support. Part 3 adds INTX support. 
Signed-off-by: Steve Sistare Message-Id: <1658851843-236870-32-git-send-email-steven.sistare@oracle.com> Signed-off-by: luofei --- MAINTAINERS | 1 + hw/pci/pci.c | 12 +++ hw/vfio/common.c | 150 ++++++++++++++++++++++++++++------ hw/vfio/cpr.c | 120 +++++++++++++++++++++++++++ hw/vfio/meson.build | 1 + hw/vfio/pci.c | 43 ++++++++++ hw/vfio/trace-events | 1 + include/hw/vfio/vfio-common.h | 11 +++ include/migration/vmstate.h | 2 + 9 files changed, 318 insertions(+), 23 deletions(-) create mode 100644 hw/vfio/cpr.c diff --git a/MAINTAINERS b/MAINTAINERS index e71c05cd75..13ac0854f5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2994,6 +2994,7 @@ F: migration/cpr-state.c F: stubs/cpr-state.c F: include/migration/cpr.h F: migration/cpr.c +F: hw/vfio/cpr.c Record/replay M: Pavel Dovgalyuk diff --git a/hw/pci/pci.c b/hw/pci/pci.c index 3e6805d54a..4c110a9cfd 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -33,6 +33,7 @@ #include "hw/pci/pci_host.h" #include "hw/qdev-properties.h" #include "hw/qdev-properties-system.h" +#include "migration/misc.h" #include "migration/qemu-file-types.h" #include "migration/vmstate.h" #include "monitor/monitor.h" @@ -315,6 +316,17 @@ static void pci_do_device_reset(PCIDevice *dev) { int r; + /* + * A PCI device that is resuming for cpr is already configured, so do + * not reset it here when we are called from qemu_system_reset prior to + * cpr load, else interrupts may be lost for vfio-pci devices. It is + * safe to skip this reset for all PCI devices, because cpr load will set + * all fields that would have been set here. 
+ */ + if (migrate_mode() == MIG_MODE_CPR_EXEC) { + return; + } + pci_device_deassert_intx(dev); assert(dev->irq_state == 0); diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 6cb91e7ffd..4ed06e049f 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -31,6 +31,7 @@ #include "exec/memory.h" #include "exec/ram_addr.h" #include "hw/hw.h" +#include "migration/cpr-state.h" #include "qemu/error-report.h" #include "qemu/main-loop.h" #include "qemu/range.h" @@ -483,6 +484,8 @@ static int vfio_dma_unmap(VFIOContainer *container, }; VFIODMARange *qrange; + assert(!container->reused); + if (iotlb && container->dirty_pages_supported && vfio_devices_all_running_and_saving(container)) { return vfio_dma_unmap_bitmap(container, iova, size, iotlb); @@ -535,7 +538,6 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova, { struct vfio_iommu_type1_dma_map map = { .argsz = sizeof(map), - .flags = VFIO_DMA_MAP_FLAG_READ, .vaddr = (__u64)(uintptr_t)vaddr, .iova = iova, .size = size, @@ -549,6 +551,19 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova, /* XXX allocate the dirty bitmap on demand */ vfio_dma_range_init_dirty_bitmap(qrange); + /* + * Set the new vaddr for any mappings registered during cpr load. + * Reused is cleared thereafter. + */ + if (container->reused) { + map.flags = VFIO_DMA_MAP_FLAG_VADDR; + if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map)) { + goto fail; + } + return 0; + } + + map.flags = VFIO_DMA_MAP_FLAG_READ; if (!readonly) { map.flags |= VFIO_DMA_MAP_FLAG_WRITE; } @@ -564,7 +579,9 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova, return 0; } - error_report("VFIO_MAP_DMA failed: %s", strerror(errno)); +fail: + error_report("vfio_dma_map %s (iova %lu, size %ld, va %p): %s", + (container->reused ? 
"VADDR" : ""), iova, size, vaddr, strerror(errno)); return -errno; } @@ -913,6 +930,12 @@ static void vfio_listener_region_add(MemoryListener *listener, MemoryRegionSection *section) { VFIOContainer *container = container_of(listener, VFIOContainer, listener); + vfio_container_region_add(container, section); +} + +void vfio_container_region_add(VFIOContainer *container, + MemoryRegionSection *section) +{ hwaddr iova, end; Int128 llend, llsize; void *vaddr; @@ -1646,6 +1669,12 @@ static void vfio_listener_release(VFIOContainer *container) } } +void vfio_listener_register(VFIOContainer *container) +{ + container->listener = vfio_memory_listener; + memory_listener_register(&container->listener, container->space->as); +} + static struct vfio_info_cap_header * vfio_get_cap(void *ptr, uint32_t cap_offset, uint16_t id) { @@ -2065,6 +2094,22 @@ static int vfio_init_container(VFIOContainer *container, int group_fd, { int iommu_type, dirty_log_manual_clear, ret; + /* + * If container is reused, just set its type and skip the ioctls, as the + * container and group are already configured in the kernel. + * VFIO_TYPE1v2_IOMMU is the only type that supports reuse/cpr. 
+ */ + if (container->reused) { + if (ioctl(container->fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU)) { + container->iommu_type = VFIO_TYPE1v2_IOMMU; + return 0; + } else { + error_setg(errp, "container was reused but VFIO_TYPE1v2_IOMMU " + "is not supported"); + return -ENOTSUP; /* ioctl returned 0: errno is stale */ + } + } + iommu_type = vfio_get_iommu_type(container, errp); if (iommu_type < 0) { return iommu_type; @@ -2176,9 +2221,12 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, { VFIOContainer *container; int ret, fd; + bool reused; VFIOAddressSpace *space; space = vfio_get_address_space(as); + fd = cpr_find_fd("vfio_container_for_group", group->groupid); + reused = (fd >= 0); /* 0 is a valid fd */ /* * VFIO is currently incompatible with discarding of RAM insofar as the @@ -2211,27 +2259,46 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, * details once we know which type of IOMMU we are using. */ + /* + * If the container is reused, then the group is already attached in the + * kernel. If a container with matching fd is found, then update the + * userland group list and return. If not, then after the loop, create + * the container struct and group list.
+ */ QLIST_FOREACH(container, &space->containers, next) { - if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) { - ret = vfio_ram_block_discard_disable(container, true); - if (ret) { - error_setg_errno(errp, -ret, - "Cannot set discarding of RAM broken"); - if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, - &container->fd)) { - error_report("vfio: error disconnecting group %d from" - " container", group->groupid); - } - return ret; + if (reused) { + if (container->fd != fd) { + continue; + } + } else if (ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) { + continue; + } + + ret = vfio_ram_block_discard_disable(container, true); + if (ret) { + error_setg_errno(errp, -ret, + "Cannot set discarding of RAM broken"); + if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, + &container->fd)) { + error_report("vfio: error disconnecting group %d from" + " container", group->groupid); } - group->container = container; - QLIST_INSERT_HEAD(&container->group_list, group, container_next); + goto delete_fd_exit; + } + group->container = container; + QLIST_INSERT_HEAD(&container->group_list, group, container_next); + if (!reused) { vfio_kvm_device_add_group(group); - return 0; + cpr_save_fd("vfio_container_for_group", group->groupid, + container->fd); } + return 0; + } + + if (!reused) { + fd = qemu_open_old("/dev/vfio/vfio", O_RDWR); } - fd = qemu_open_old("/dev/vfio/vfio", O_RDWR); if (fd < 0) { error_setg_errno(errp, errno, "failed to open /dev/vfio/vfio"); ret = -errno; @@ -2249,6 +2316,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, container = g_malloc0(sizeof(*container)); container->space = space; container->fd = fd; + container->reused = reused; container->error = NULL; container->dirty_pages_supported = false; container->dma_max_mappings = 0; @@ -2262,10 +2330,15 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, goto free_container_exit; } + ret = vfio_cpr_register_container(container, errp); + if 
(ret) { + goto free_container_exit; + } + ret = vfio_ram_block_discard_disable(container, true); if (ret) { error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken"); - goto free_container_exit; + goto unregister_container_exit; } switch (container->iommu_type) { @@ -2376,9 +2449,16 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, group->container = container; QLIST_INSERT_HEAD(&container->group_list, group, container_next); - container->listener = vfio_memory_listener; - - memory_listener_register(&container->listener, container->space->as); + /* + * If reused, register the listener later, after all state that may + * affect regions and mapping boundaries has been cpr load'ed. Later, + * the listener will invoke its callback on each flat section and call + * vfio_dma_map to supply the new vaddr, and the calls will match the + * mappings remembered by the kernel. + */ + if (!reused) { + vfio_listener_register(container); + } if (container->error) { ret = -1; @@ -2388,6 +2468,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, } container->initialized = true; + cpr_resave_fd("vfio_container_for_group", group->groupid, fd); return 0; listener_release_exit: @@ -2399,6 +2480,9 @@ listener_release_exit: enable_discards_exit: vfio_ram_block_discard_disable(container, false); +unregister_container_exit: + vfio_cpr_unregister_container(container); + free_container_exit: g_free(container); @@ -2408,6 +2492,9 @@ close_fd_exit: put_space_exit: vfio_put_address_space(space); +delete_fd_exit: + cpr_delete_fd("vfio_container_for_group", group->groupid); + return ret; } @@ -2417,6 +2504,7 @@ static void vfio_disconnect_container(VFIOGroup *group) QLIST_REMOVE(group, container_next); group->container = NULL; + cpr_delete_fd("vfio_container_for_group", group->groupid); /* * Explicitly release the listener first before unset container, @@ -2453,6 +2541,7 @@ static void vfio_disconnect_container(VFIOGroup *group) } 
trace_vfio_disconnect_container(container->fd); + vfio_cpr_unregister_container(container); close(container->fd); g_free(container); @@ -2482,7 +2571,11 @@ VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp) group = g_malloc0(sizeof(*group)); snprintf(path, sizeof(path), "/dev/vfio/%d", groupid); - group->fd = qemu_open_old(path, O_RDWR); + + group->fd = cpr_find_fd("vfio_group", groupid); + if (group->fd < 0) { + group->fd = qemu_open_old(path, O_RDWR); + } if (group->fd < 0) { error_setg_errno(errp, errno, "failed to open %s", path); goto free_group_exit; @@ -2515,6 +2608,7 @@ VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp) } QLIST_INSERT_HEAD(&vfio_group_list, group, next); + cpr_resave_fd("vfio_group", groupid, group->fd); return group; @@ -2540,6 +2634,7 @@ void vfio_put_group(VFIOGroup *group) vfio_disconnect_container(group); QLIST_REMOVE(group, next); trace_vfio_put_group(group->fd); + cpr_delete_fd("vfio_group", group->groupid); close(group->fd); g_free(group); @@ -2553,8 +2648,14 @@ int vfio_get_device(VFIOGroup *group, const char *name, { struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) }; int ret, fd; + bool reused; + + fd = cpr_find_fd(name, 0); + reused = (fd >= 0); + if (!reused) { + fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name); + } - fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name); if (fd < 0) { error_setg_errno(errp, errno, "error getting device from group %d", group->groupid); @@ -2599,11 +2700,13 @@ int vfio_get_device(VFIOGroup *group, const char *name, vbasedev->num_irqs = dev_info.num_irqs; vbasedev->num_regions = dev_info.num_regions; vbasedev->flags = dev_info.flags; + vbasedev->reused = reused; trace_vfio_get_device(name, dev_info.flags, dev_info.num_regions, dev_info.num_irqs); vbasedev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET); + cpr_resave_fd(name, 0, fd); return 0; } @@ -2615,6 +2718,7 @@ void vfio_put_base_device(VFIODevice *vbasedev) 
QLIST_REMOVE(vbasedev, next); vbasedev->group = NULL; trace_vfio_put_base_device(vbasedev->fd); + cpr_delete_fd(vbasedev->name, 0); close(vbasedev->fd); } diff --git a/hw/vfio/cpr.c b/hw/vfio/cpr.c new file mode 100644 index 0000000000..83f787290c --- /dev/null +++ b/hw/vfio/cpr.c @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2021, 2022 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include +#include +#include "hw/vfio/vfio-common.h" +#include "sysemu/kvm.h" +#include "qapi/error.h" +#include "migration/blocker.h" +#include "migration/migration.h" +#include "migration/misc.h" +#include "migration/vmstate.h" +#include "trace.h" + +static int +vfio_dma_unmap_vaddr_all(VFIOContainer *container, Error **errp) +{ + struct vfio_iommu_type1_dma_unmap unmap = { + .argsz = sizeof(unmap), + .flags = VFIO_DMA_UNMAP_FLAG_VADDR | VFIO_DMA_UNMAP_FLAG_ALL, + .iova = 0, + .size = 0, + }; + if (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) { + error_setg_errno(errp, errno, "vfio_dma_unmap_vaddr_all"); + return -errno; + } + container->vaddr_unmapped = true; + return 0; +} + +static bool vfio_is_cpr_capable(VFIOContainer *container, Error **errp) +{ + if (!ioctl(container->fd, VFIO_CHECK_EXTENSION, VFIO_UPDATE_VADDR) || + !ioctl(container->fd, VFIO_CHECK_EXTENSION, VFIO_UNMAP_ALL)) { + error_setg(errp, "VFIO container does not support VFIO_UPDATE_VADDR " + "or VFIO_UNMAP_ALL"); + return false; + } else { + return true; + } +} + +static bool vfio_vmstate_needed(void *opaque) +{ + return migrate_mode() == MIG_MODE_CPR_EXEC; +} + +static int vfio_container_pre_save(void *opaque) +{ + VFIOContainer *container = (VFIOContainer *)opaque; + Error *err = NULL; + + if (!vfio_is_cpr_capable(container, &err) || + vfio_dma_unmap_vaddr_all(container, &err)) { + error_report_err(err); + return -1; + } + return 0; +} + +static int 
vfio_container_post_load(void *opaque, int version_id) +{ + VFIOContainer *container = (VFIOContainer *)opaque; + VFIOGroup *group; + Error *err = NULL; + VFIODevice *vbasedev; + + if (!vfio_is_cpr_capable(container, &err)) { + error_report_err(err); + return -1; + } + + vfio_listener_register(container); + container->reused = false; + + QLIST_FOREACH(group, &container->group_list, container_next) { + QLIST_FOREACH(vbasedev, &group->device_list, next) { + vbasedev->reused = false; + } + } + return 0; +} + +static const VMStateDescription vfio_container_vmstate = { + .name = "vfio-container", + .version_id = 0, + .minimum_version_id = 0, + .pre_save = vfio_container_pre_save, + .post_load = vfio_container_post_load, + .needed = vfio_vmstate_needed, + .fields = (VMStateField[]) { + VMSTATE_END_OF_LIST() + } +}; + +int vfio_cpr_register_container(VFIOContainer *container, Error **errp) +{ + container->cpr_blocker = NULL; + if (!vfio_is_cpr_capable(container, &container->cpr_blocker)) { + return migrate_add_blockers(&container->cpr_blocker, errp, + MIG_MODE_CPR_EXEC, -1); + } + + vmstate_register(NULL, -1, &vfio_container_vmstate, container); + + return 0; +} + +void vfio_cpr_unregister_container(VFIOContainer *container) +{ + migrate_del_blocker(&container->cpr_blocker); + + vmstate_unregister(NULL, &vfio_container_vmstate, container); +} diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build index da9af297a0..e247b2bc73 100644 --- a/hw/vfio/meson.build +++ b/hw/vfio/meson.build @@ -5,6 +5,7 @@ vfio_ss.add(files( 'migration.c', )) vfio_ss.add(when: 'CONFIG_VFIO_PCI', if_true: files( + 'cpr.c', 'display.c', 'pci-quirks.c', 'pci.c', diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 9ed9d661b3..5a8da454b5 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -28,6 +28,8 @@ #include "hw/pci/pci_bridge.h" #include "hw/qdev-properties.h" #include "hw/qdev-properties-system.h" +#include "migration/misc.h" +#include "migration/cpr-state.h" #include "migration/vmstate.h" #include 
"qapi/qmp/qdict.h" #include "qemu/error-report.h" @@ -3197,6 +3199,11 @@ static void vfio_pci_reset(DeviceState *dev) { VFIOPCIDevice *vdev = VFIO_PCI(dev); + /* Do not reset the device during qemu_system_reset prior to cpr load */ + if (vdev->vbasedev.reused) { + return; + } + trace_vfio_pci_reset(vdev->vbasedev.name); vfio_pci_pre_reset(vdev); @@ -3304,6 +3311,41 @@ static Property vfio_pci_dev_properties[] = { DEFINE_PROP_END_OF_LIST(), }; +/* + * The kernel may change non-emulated config bits. Exclude them from the + * changed-bits check in get_pci_config_device. + */ +static int vfio_pci_pre_load(void *opaque) +{ + VFIOPCIDevice *vdev = opaque; + PCIDevice *pdev = &vdev->pdev; + int size = MIN(pci_config_size(pdev), vdev->config_size); + int i; + + for (i = 0; i < size; i++) { + pdev->cmask[i] &= vdev->emulated_config_bits[i]; + } + + return 0; +} + +static bool vfio_pci_needed(void *opaque) +{ + return migrate_mode() == MIG_MODE_CPR_EXEC; +} + +static const VMStateDescription vfio_pci_vmstate = { + .name = "vfio-pci", + .version_id = 0, + .minimum_version_id = 0, + .priority = MIG_PRI_VFIO_PCI, /* must load before container */ + .pre_load = vfio_pci_pre_load, + .needed = vfio_pci_needed, + .fields = (VMStateField[]) { + VMSTATE_END_OF_LIST() + } +}; + static void vfio_pci_dev_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); @@ -3311,6 +3353,7 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, void *data) dc->reset = vfio_pci_reset; device_class_set_props(dc, vfio_pci_dev_properties); + dc->vmsd = &vfio_pci_vmstate; dc->desc = "VFIO-based PCI device assignment"; set_bit(DEVICE_CATEGORY_MISC, dc->categories); pdc->realize = vfio_realize; diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index 0ef1b5f4a6..63dd0fe910 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -118,6 +118,7 @@ vfio_region_sparse_mmap_header(const char *name, int index, int nr_areas) "Devic vfio_region_sparse_mmap_entry(int i, 
unsigned long start, unsigned long end) "sparse entry %d [0x%lx - 0x%lx]" vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%0x8" vfio_dma_unmap_overflow_workaround(void) "" +vfio_region_remap(const char *name, int fd, uint64_t iova_start, uint64_t iova_end, void *vaddr) "%s fd %d 0x%"PRIx64" - 0x%"PRIx64" [%p]" # platform.c vfio_platform_base_device_init(char *name, int groupid) "%s belongs to group #%d" diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 0234f5e1b1..aad1bc2e57 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -89,11 +89,15 @@ typedef struct VFIOContainer { int fd; /* /dev/vfio/vfio, empowered by the attached groups */ MemoryListener listener; MemoryListener prereg_listener; + Notifier cpr_notifier; + Error *cpr_blocker; unsigned iommu_type; Error *error; bool initialized; bool dirty_pages_supported; bool dirty_log_manual_clear; + bool reused; + bool vaddr_unmapped; uint64_t dirty_pgsizes; uint64_t max_dirty_bitmap_size; unsigned long pgsizes; @@ -146,6 +150,7 @@ typedef struct VFIODevice { bool no_mmap; bool ram_block_discard_allowed; bool enable_migration; + bool reused; VFIODeviceOps *ops; unsigned int num_irqs; unsigned int num_regions; @@ -223,6 +228,9 @@ void vfio_put_group(VFIOGroup *group); int vfio_get_device(VFIOGroup *group, const char *name, VFIODevice *vbasedev, Error **errp); +int vfio_cpr_register_container(VFIOContainer *container, Error **errp); +void vfio_cpr_unregister_container(VFIOContainer *container); + extern const MemoryRegionOps vfio_region_ops; typedef QLIST_HEAD(VFIOGroupList, VFIOGroup) VFIOGroupList; extern VFIOGroupList vfio_group_list; @@ -244,6 +252,9 @@ struct vfio_info_cap_header * vfio_get_device_info_cap(struct vfio_device_info *info, uint16_t id); #endif extern const MemoryListener vfio_prereg_listener; +void vfio_listener_register(VFIOContainer *container); +void 
vfio_container_region_add(VFIOContainer *container, + MemoryRegionSection *section); int vfio_spapr_create_window(VFIOContainer *container, MemoryRegionSection *section, diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h index 017c03675c..7aca6c5c23 100644 --- a/include/migration/vmstate.h +++ b/include/migration/vmstate.h @@ -157,6 +157,8 @@ typedef enum { MIG_PRI_GICV3_ITS, /* Must happen before PCI devices */ MIG_PRI_GICV3, /* Must happen before the ITS */ MIG_PRI_MAX, + MIG_PRI_VFIO_PCI = + MIG_PRI_DEFAULT + 1, /* Must happen before vfio containers */ } MigrationPriority; struct VMStateField { -- Gitee From 37ace778113a970d88a85bcec02822203bba9024 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Thu, 10 Aug 2023 16:16:31 +0800 Subject: [PATCH 30/56] vfio-pci: cpr part 2 (msi) This is mainly from Steve Sistare's qemu live update patch: https://patchew.org/QEMU/1658851843-236870-1-git-send-email-steven.sistare@oracle.com/1658851843-236870-33-git-send-email-steven.sistare@oracle.com/ https://patchew.org/QEMU/1640199934-455149-1-git-send-email-steven.sistare@oracle.com/1640199934-455149-21-git-send-email-steven.sistare@oracle.com/ ---------------------------------------------------------------------------- Finish cpr for vfio-pci MSI/MSI-X devices by preserving eventfd's and vector state. 
Signed-off-by: Steve Sistare Message-Id: <1640199934-455149-21-git-send-email-steven.sistare@oracle.com> Signed-off-by: luofei --- hw/vfio/pci.c | 113 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 112 insertions(+), 1 deletion(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 5a8da454b5..4bcd1474d3 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -50,17 +50,47 @@ static void vfio_disable_interrupts(VFIOPCIDevice *vdev); static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled); +#define EVENT_FD_NAME(vdev, name) \ + g_strdup_printf("%s_%s", (vdev)->vbasedev.name, (name)) + +static void save_event_fd(VFIOPCIDevice *vdev, const char *name, int nr, + EventNotifier *ev) +{ + int fd = event_notifier_get_fd(ev); + + if (fd >= 0) { + g_autofree char *fdname = EVENT_FD_NAME(vdev, name); + + cpr_resave_fd(fdname, nr, fd); + } +} + +static int load_event_fd(VFIOPCIDevice *vdev, const char *name, int nr) +{ + g_autofree char *fdname = EVENT_FD_NAME(vdev, name); + return cpr_find_fd(fdname, nr); +} + +static void delete_event_fd(VFIOPCIDevice *vdev, const char *name, int nr) +{ + g_autofree char *fdname = EVENT_FD_NAME(vdev, name); + cpr_delete_fd(fdname, nr); +} + /* Create new or reuse existing eventfd */ static int vfio_notifier_init(VFIOPCIDevice *vdev, EventNotifier *e, const char *name, int nr) { - int fd = -1; /* placeholder until a subsequent patch */ int ret = 0; + int fd = load_event_fd(vdev, name, nr); if (fd >= 0) { event_notifier_init_fd(e, fd); } else { ret = event_notifier_init(e, 0); + if (!ret) { + save_event_fd(vdev, name, nr, e); + } } return ret; } @@ -68,6 +98,7 @@ static int vfio_notifier_init(VFIOPCIDevice *vdev, EventNotifier *e, static void vfio_notifier_cleanup(VFIOPCIDevice *vdev, EventNotifier *e, const char *name, int nr) { + delete_event_fd(vdev, name, nr); event_notifier_cleanup(e); } @@ -501,6 +532,15 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, VFIOMSIVector *vector; int ret; + /* + 
* Ignore the callback from msix_set_vector_notifiers during resume. + * The necessary subset of these actions is called from vfio_claim_vectors + * during post load. + */ + if (vdev->vbasedev.reused) { + return 0; + } + trace_vfio_msix_vector_do_use(vdev->vbasedev.name, nr); vector = &vdev->msi_vectors[nr]; @@ -2737,6 +2777,11 @@ static void vfio_register_err_notifier(VFIOPCIDevice *vdev) fd = event_notifier_get_fd(&vdev->err_notifier); qemu_set_fd_handler(fd, vfio_err_notifier_handler, NULL, vdev); + /* Do not alter irq_signaling during vfio_realize for cpr */ + if (vdev->vbasedev.reused) { + return; + } + if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_ERR_IRQ_INDEX, 0, VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) { error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); @@ -2802,6 +2847,12 @@ static void vfio_register_req_notifier(VFIOPCIDevice *vdev) fd = event_notifier_get_fd(&vdev->req_notifier); qemu_set_fd_handler(fd, vfio_req_notifier_handler, NULL, vdev); + /* Do not alter irq_signaling during vfio_realize for cpr */ + if (vdev->vbasedev.reused) { + vdev->req_enabled = true; + return; + } + if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, 0, VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) { error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); @@ -3311,6 +3362,42 @@ static Property vfio_pci_dev_properties[] = { DEFINE_PROP_END_OF_LIST(), }; +static void vfio_claim_vectors(VFIOPCIDevice *vdev, int nr_vectors, bool msix) +{ + int i, fd; + bool pending = false; + PCIDevice *pdev = &vdev->pdev; + + vdev->nr_vectors = nr_vectors; + vdev->msi_vectors = g_new0(VFIOMSIVector, nr_vectors); + vdev->interrupt = msix ? 
VFIO_INT_MSIX : VFIO_INT_MSI; + + for (i = 0; i < nr_vectors; i++) { + VFIOMSIVector *vector = &vdev->msi_vectors[i]; + + fd = load_event_fd(vdev, "interrupt", i); + if (fd >= 0) { + vfio_vector_init(vdev, i); + qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL, vector); + } + + if (load_event_fd(vdev, "kvm_interrupt", i) >= 0) { + vfio_add_kvm_msi_virq(vdev, vector, i, msix); + } else { + vdev->msi_vectors[i].virq = -1; + } + + if (msix && msix_is_pending(pdev, i) && msix_is_masked(pdev, i)) { + set_bit(i, vdev->msix->pending); + pending = true; + } + } + + if (msix) { + memory_region_set_enabled(&pdev->msix_pba_mmio, pending); + } +} + /* * The kernel may change non-emulated config bits. Exclude them from the * changed-bits check in get_pci_config_device. @@ -3329,6 +3416,27 @@ static int vfio_pci_pre_load(void *opaque) return 0; } +static int vfio_pci_post_load(void *opaque, int version_id) +{ + VFIOPCIDevice *vdev = opaque; + PCIDevice *pdev = &vdev->pdev; + int nr_vectors; + + if (msix_enabled(pdev)) { + msix_set_vector_notifiers(pdev, vfio_msix_vector_use, + vfio_msix_vector_release, NULL); + nr_vectors = vdev->msix->entries; + vfio_claim_vectors(vdev, nr_vectors, true); + } else if (msi_enabled(pdev)) { + nr_vectors = msi_nr_vectors_allocated(pdev); + vfio_claim_vectors(vdev, nr_vectors, false); + } else if (vfio_pci_read_config(pdev, PCI_INTERRUPT_PIN, 1)) { + assert(0); /* completed in a subsequent patch */ + } + + return 0; +} + static bool vfio_pci_needed(void *opaque) { return migrate_mode() == MIG_MODE_CPR_EXEC; @@ -3340,8 +3448,11 @@ static const VMStateDescription vfio_pci_vmstate = { .minimum_version_id = 0, .priority = MIG_PRI_VFIO_PCI, /* must load before container */ .pre_load = vfio_pci_pre_load, + .post_load = vfio_pci_post_load, .needed = vfio_pci_needed, .fields = (VMStateField[]) { + VMSTATE_PCI_DEVICE(pdev, VFIOPCIDevice), + VMSTATE_MSIX_TEST(pdev, VFIOPCIDevice, vfio_msix_present), VMSTATE_END_OF_LIST() } }; -- Gitee From 
3af02edda5565560879c236db97e92af37fad8d9 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Thu, 10 Aug 2023 16:39:26 +0800 Subject: [PATCH 31/56] vfio-pci: cpr part 3 (intx) This is from Steve Sistare's qemu live update patch: https://patchew.org/QEMU/1658851843-236870-1-git-send-email-steven.sistare@oracle.com/1658851843-236870-34-git-send-email-steven.sistare@oracle.com/ ----------------------------------------------------------------------------- Preserve vfio INTX state across cpr restart. Preserve VFIOINTx fields as follows: pin : Recover this from the vfio config in kernel space interrupt : Preserve its eventfd descriptor across exec. unmask : Ditto route.irq : This could perhaps be recovered in vfio_pci_post_load by calling pci_device_route_intx_to_irq(pin), whose implementation reads config space for a bridge device such as ich9. However, there is no guarantee that the bridge vmstate is read before vfio vmstate. Rather than fiddling with MigrationPriority for vmstate handlers, explicitly save route.irq in vfio vmstate. pending : save in vfio vmstate. mmap_timeout, mmap_timer : Re-initialize bool kvm_accel : Re-initialize In vfio_realize, defer calling vfio_intx_enable until the vmstate is available, in vfio_pci_post_load. Modify vfio_intx_enable and vfio_intx_kvm_enable to skip vfio initialization, but still perform kvm initialization. 
Signed-off-by: Steve Sistare Message-Id: <1658851843-236870-34-git-send-email-steven.sistare@oracle.com> Signed-off-by: luofei --- hw/vfio/pci.c | 81 ++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 61 insertions(+), 20 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 4bcd1474d3..508559303f 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -175,11 +175,13 @@ static void vfio_intx_enable_kvm(VFIOPCIDevice *vdev, Error **errp) return; } - /* Get to a known interrupt state */ - qemu_set_fd_handler(irq_fd, NULL, NULL, vdev); - vfio_mask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); - vdev->intx.pending = false; - pci_irq_deassert(&vdev->pdev); + if (!vdev->vbasedev.reused) { + /* Get to a known interrupt state */ + qemu_set_fd_handler(irq_fd, NULL, NULL, vdev); + vfio_mask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); + vdev->intx.pending = false; + pci_irq_deassert(&vdev->pdev); + } /* Get an eventfd for resample/unmask */ if (vfio_notifier_init(vdev, &vdev->intx.unmask, "intx-unmask", 0)) { @@ -195,15 +197,17 @@ static void vfio_intx_enable_kvm(VFIOPCIDevice *vdev, Error **errp) goto fail_irqfd; } - if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0, - VFIO_IRQ_SET_ACTION_UNMASK, - event_notifier_get_fd(&vdev->intx.unmask), - errp)) { - goto fail_vfio; - } + if (!vdev->vbasedev.reused) { + if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0, + VFIO_IRQ_SET_ACTION_UNMASK, + event_notifier_get_fd(&vdev->intx.unmask), + errp)) { + goto fail_vfio; + } - /* Let'em rip */ - vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); + /* Let'em rip */ + vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); + } vdev->intx.kvm_accel = true; @@ -319,7 +323,13 @@ static int vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp) return 0; } - vfio_disable_interrupts(vdev); + /* + * Do not alter interrupt state during vfio_realize and cpr load. 
The + * reused flag is cleared thereafter. + */ + if (!vdev->vbasedev.reused) { + vfio_disable_interrupts(vdev); + } vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */ pci_config_set_interrupt_pin(vdev->pdev.config, pin); @@ -344,7 +354,8 @@ static int vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp) fd = event_notifier_get_fd(&vdev->intx.interrupt); qemu_set_fd_handler(fd, vfio_intx_interrupt, NULL, vdev); - if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0, + if (!vdev->vbasedev.reused && + vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0, VFIO_IRQ_SET_ACTION_TRIGGER, fd, errp)) { qemu_set_fd_handler(fd, NULL, NULL, vdev); vfio_notifier_cleanup(vdev, &vdev->intx.interrupt, "intx-interrupt", 0); @@ -3141,9 +3152,13 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) vfio_intx_routing_notifier); vdev->irqchip_change_notifier.notify = vfio_irqchip_change; kvm_irqchip_add_change_notifier(&vdev->irqchip_change_notifier); - ret = vfio_intx_enable(vdev, errp); - if (ret) { - goto out_deregister; + + /* Wait until cpr load reads intx routing data to enable */ + if (!vdev->vbasedev.reused) { + ret = vfio_intx_enable(vdev, errp); + if (ret) { + goto out_deregister; + } } } @@ -3421,6 +3436,7 @@ static int vfio_pci_post_load(void *opaque, int version_id) VFIOPCIDevice *vdev = opaque; PCIDevice *pdev = &vdev->pdev; int nr_vectors; + int ret = 0; if (msix_enabled(pdev)) { msix_set_vector_notifiers(pdev, vfio_msix_vector_use, @@ -3431,10 +3447,34 @@ static int vfio_pci_post_load(void *opaque, int version_id) nr_vectors = msi_nr_vectors_allocated(pdev); vfio_claim_vectors(vdev, nr_vectors, false); } else if (vfio_pci_read_config(pdev, PCI_INTERRUPT_PIN, 1)) { - assert(0); /* completed in a subsequent patch */ + Error *err = 0; + ret = vfio_intx_enable(vdev, &err); + if (ret) { + error_report_err(err); + } } - return 0; + return ret; +} + +static const VMStateDescription vfio_intx_vmstate = { + .name = "vfio-intx", + .version_id 
= 0, + .minimum_version_id = 0, + .fields = (VMStateField[]) { + VMSTATE_BOOL(pending, VFIOINTx), + VMSTATE_UINT32(route.mode, VFIOINTx), + VMSTATE_INT32(route.irq, VFIOINTx), + VMSTATE_END_OF_LIST() + } +}; + +#define VMSTATE_VFIO_INTX(_field, _state) { \ + .name = (stringify(_field)), \ + .size = sizeof(VFIOINTx), \ + .vmsd = &vfio_intx_vmstate, \ + .flags = VMS_STRUCT, \ + .offset = vmstate_offset_value(_state, _field, VFIOINTx), \ } static bool vfio_pci_needed(void *opaque) @@ -3453,6 +3493,7 @@ static const VMStateDescription vfio_pci_vmstate = { .fields = (VMStateField[]) { VMSTATE_PCI_DEVICE(pdev, VFIOPCIDevice), VMSTATE_MSIX_TEST(pdev, VFIOPCIDevice, vfio_msix_present), + VMSTATE_VFIO_INTX(intx, VFIOPCIDevice), VMSTATE_END_OF_LIST() } }; -- Gitee From 4d2aea014bf3a54327f9b8831b96ed5305df5a3a Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Thu, 10 Aug 2023 17:13:58 +0800 Subject: [PATCH 32/56] vfio-pci: recover from unmap-all-vaddr failure This is from Steve Sistare's qemu live update patch: https://patchew.org/QEMU/1658851843-236870-1-git-send-email-steven.sistare@oracle.com/1658851843-236870-35-git-send-email-steven.sistare@oracle.com/ ---------------------------------------------------------------------------- If there are multiple containers and unmap-all fails for some container, we need to remap vaddr for the other containers for which unmap-all succeeded. Recover by walking all flat sections of all containers to restore the vaddr for each. Do so by invoking the vfio listener callback, and passing a new "remap" flag that tells it to restore a mapping without re-allocating new userland data structures. 
Signed-off-by: Steve Sistare Message-Id: <1658851843-236870-35-git-send-email-steven.sistare@oracle.com> Signed-off-by: luofei --- hw/vfio/common.c | 79 ++++++++++++++++++++++++++++------- hw/vfio/cpr.c | 36 ++++++++++++++++ include/hw/vfio/vfio-common.h | 2 +- 3 files changed, 100 insertions(+), 17 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 4ed06e049f..3a5db59dc7 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -926,15 +926,35 @@ static void vfio_unregister_ram_discard_listener(VFIOContainer *container, g_free(vrdl); } +static VFIORamDiscardListener *vfio_find_ram_discard_listener( + VFIOContainer *container, MemoryRegionSection *section) +{ + VFIORamDiscardListener *vrdl; + + QLIST_FOREACH(vrdl, &container->vrdl_list, next) { + if (vrdl->mr == section->mr && + vrdl->offset_within_address_space == + section->offset_within_address_space) { + break; + } + } + + if (!vrdl) { + hw_error("vfio: Trying to sync missing RAM discard listener"); + /* does not return */ + } + return vrdl; +} + static void vfio_listener_region_add(MemoryListener *listener, MemoryRegionSection *section) { VFIOContainer *container = container_of(listener, VFIOContainer, listener); - vfio_container_region_add(container, section); + vfio_container_region_add(container, section, false); } void vfio_container_region_add(VFIOContainer *container, - MemoryRegionSection *section) + MemoryRegionSection *section, bool remap) { hwaddr iova, end; Int128 llend, llsize; @@ -1056,6 +1076,30 @@ void vfio_container_region_add(VFIOContainer *container, int iommu_idx; trace_vfio_listener_region_add_iommu(iova, end); + + /* + * If remap, then VFIO_DMA_UNMAP_FLAG_VADDR has been called, and we + * want to remap the vaddr. vfio_container_region_add was already + * called in the past, so the giommu already exists. Find it and + * replay it, which calls vfio_dma_map further down the stack. 
+ */ + + if (remap) { + hwaddr as_offset = section->offset_within_address_space; + hwaddr iommu_offset = as_offset - section->offset_within_region; + + QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) { + if (giommu->iommu == iommu_mr && + giommu->iommu_offset == iommu_offset) { + memory_region_iommu_replay(giommu->iommu, &giommu->n); + return; + } + } + error_report("Container cannot find iommu region %s offset %lx", + memory_region_name(section->mr), iommu_offset); + goto fail; + } + /* * FIXME: For VFIO iommu types which have KVM acceleration to * avoid bouncing all map/unmaps through qemu this way, this @@ -1106,7 +1150,21 @@ void vfio_container_region_add(VFIOContainer *container, * about changes. */ if (memory_region_has_ram_discard_manager(section->mr)) { - vfio_register_ram_discard_listener(container, section); + /* + * If remap, then VFIO_DMA_UNMAP_FLAG_VADDR has been called, and we + * want to remap the vaddr. vfio_container_region_add was already + * called in the past, so the ram discard listener already exists. + * Call its populate function directly, which calls vfio_dma_map. 
+ */ + if (remap) { + VFIORamDiscardListener *vrdl = + vfio_find_ram_discard_listener(container, section); + if (vrdl->listener.notify_populate(&vrdl->listener, section)) { + error_report("listener.notify_populate failed"); + } + } else { + vfio_register_ram_discard_listener(container, section); + } return; } @@ -1435,19 +1493,8 @@ static int vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainer *container, MemoryRegionSection *section) { RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); - VFIORamDiscardListener *vrdl = NULL; - - QLIST_FOREACH(vrdl, &container->vrdl_list, next) { - if (vrdl->mr == section->mr && - vrdl->offset_within_address_space == - section->offset_within_address_space) { - break; - } - } - - if (!vrdl) { - hw_error("vfio: Trying to sync missing RAM discard listener"); - } + VFIORamDiscardListener *vrdl = + vfio_find_ram_discard_listener(container, section); /* * We only want/can synchronize the bitmap for actually mapped parts - diff --git a/hw/vfio/cpr.c b/hw/vfio/cpr.c index 83f787290c..1f682cbba9 100644 --- a/hw/vfio/cpr.c +++ b/hw/vfio/cpr.c @@ -34,6 +34,15 @@ vfio_dma_unmap_vaddr_all(VFIOContainer *container, Error **errp) return 0; } +static int +vfio_region_remap(MemoryRegionSection *section, void *handle, Error **errp) +{ + VFIOContainer *container = handle; + vfio_container_region_add(container, section, true); + container->vaddr_unmapped = false; + return 0; +} + static bool vfio_is_cpr_capable(VFIOContainer *container, Error **errp) { if (!ioctl(container->fd, VFIO_CHECK_EXTENSION, VFIO_UPDATE_VADDR) || @@ -99,6 +108,30 @@ static const VMStateDescription vfio_container_vmstate = { } }; +static void vfio_cpr_fail_notifier(Notifier *notifier, void *data) +{ + MigrationState *s = data; + VFIOContainer *container; + Error *err = NULL; + + if (!migration_has_failed(s) || migrate_mode_of(s) != MIG_MODE_CPR_EXEC) { + return; + } + + container = container_of(notifier, VFIOContainer, cpr_notifier); + if 
(container->vaddr_unmapped) { + + /* Set reused so vfio_dma_map restores vaddr */ + container->reused = true; + if (address_space_flat_for_each_section(container->space->as, + vfio_region_remap, + container, &err)) { + error_report_err(err); + } + container->reused = false; + } +} + int vfio_cpr_register_container(VFIOContainer *container, Error **errp) { container->cpr_blocker = NULL; @@ -109,6 +142,7 @@ int vfio_cpr_register_container(VFIOContainer *container, Error **errp) vmstate_register(NULL, -1, &vfio_container_vmstate, container); + migration_add_notifier(&container->cpr_notifier, vfio_cpr_fail_notifier); return 0; } @@ -117,4 +151,6 @@ void vfio_cpr_unregister_container(VFIOContainer *container) migrate_del_blocker(&container->cpr_blocker); vmstate_unregister(NULL, &vfio_container_vmstate, container); + + migration_remove_notifier(&container->cpr_notifier); } diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index aad1bc2e57..388e13e47c 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -254,7 +254,7 @@ vfio_get_device_info_cap(struct vfio_device_info *info, uint16_t id); extern const MemoryListener vfio_prereg_listener; void vfio_listener_register(VFIOContainer *container); void vfio_container_region_add(VFIOContainer *container, - MemoryRegionSection *section); + MemoryRegionSection *section, bool remap); int vfio_spapr_create_window(VFIOContainer *container, MemoryRegionSection *section, -- Gitee From 12fc4f546dd05d258f765aa0e0249389b1a0f140 Mon Sep 17 00:00:00 2001 From: luofei Date: Thu, 10 Aug 2023 17:29:30 +0800 Subject: [PATCH 33/56] vhost: reset vhost devices for cpr The updated qemu does not inherit the fd of connecting with vhost device, although a vhost device is implicitly preserved across re-exec because the value of the fd is specified on the command line.
So new qemu will reconnect to vhost device, that means new qemu will issue an VHOST_RESET_OWNER ioctl, which will fail because the device already has an owner. Here reset the owner prior to exec. This patch is mainly from Steve Sistare's qemu live update, but block reset owner command when the migration status is in MIGRATION_STATUS_SETUP : https://patchew.org/QEMU/1658851843-236870-1-git-send-email-steven.sistare@oracle.com/1658851843-236870-36-git-send-email-steven.sistare@oracle.com/ Signed-off-by: luofei --- hw/virtio/vhost.c | 35 +++++++++++++++++++++++++++++++++++ include/hw/virtio/vhost.h | 1 + 2 files changed, 36 insertions(+) diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index 27330cf260..a08ce500d2 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -23,6 +23,7 @@ #include "standard-headers/linux/vhost_types.h" #include "hw/virtio/virtio-bus.h" #include "hw/virtio/virtio-access.h" +#include "migration/misc.h" #include "migration/blocker.h" #include "migration/qemu-file-types.h" #include "migration/migration.h" @@ -1350,6 +1351,37 @@ static bool vhost_dev_used_memslots_is_exceeded(struct vhost_dev *hdev) return false; } +static void vhost_cpr_exec_notifier(Notifier *notifier, void *data) +{ + MigrationState *s = data; + struct vhost_dev *dev; + int r = 0; + + if (migrate_mode_of(s) == MIG_MODE_CPR_EXEC) { + dev = container_of(notifier, struct vhost_dev, cpr_notifier); + if (migration_has_failed(s)) { + r = dev->vhost_ops->vhost_set_owner(dev); + } else { + /* + * Do not reset vhost device when status MIGRATION_STATUS_SETUP, + * because slave_read will read vring last_avail_idx etc information, + * if reset here, slave_read will read fail. + * + * Normally reset operation when migration succeed, and the connection + * to vhost backend will be reestablished later.
+ */ + if (s->state == MIGRATION_STATUS_SETUP) { + VHOST_OPS_DEBUG("migration setup phase should not reset device"); + return; + } + r = dev->vhost_ops->vhost_reset_device(dev); + } + if (r < 0) { + VHOST_OPS_DEBUG("vhost_reset_device failed"); + } + } +} + int vhost_dev_init(struct vhost_dev *hdev, void *opaque, VhostBackendType backend_type, uint32_t busyloop_timeout, Error **errp) @@ -1359,6 +1391,7 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque, hdev->vdev = NULL; hdev->migration_blocker = NULL; + hdev->cpr_notifier.notify = NULL; r = vhost_set_backend_type(hdev, backend_type); assert(r >= 0); @@ -1450,6 +1483,7 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque, hdev->log_enabled = false; hdev->started = false; memory_listener_register(&hdev->memory_listener, &address_space_memory); + migration_add_notifier(&hdev->cpr_notifier, vhost_cpr_exec_notifier); QLIST_INSERT_HEAD(&vhost_devices, hdev, entry); /* @@ -1490,6 +1524,7 @@ void vhost_dev_cleanup(struct vhost_dev *hdev) QLIST_REMOVE(hdev, entry); } migrate_del_blocker(&hdev->migration_blocker); + migration_remove_notifier(&hdev->cpr_notifier); g_free(hdev->mem); g_free(hdev->mem_sections); if (hdev->vhost_ops) { diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h index 86f36f0106..c17d55ecfa 100644 --- a/include/hw/virtio/vhost.h +++ b/include/hw/virtio/vhost.h @@ -94,6 +94,7 @@ struct vhost_dev { QLIST_ENTRY(vhost_dev) entry; QLIST_HEAD(, vhost_iommu) iommu_list; IOMMUNotifier n; + Notifier cpr_notifier; const VhostDevConfigOps *config_ops; }; -- Gitee From ebca7df7c6197fa50e5e847134b860bb39467786 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Thu, 10 Aug 2023 19:23:20 +0800 Subject: [PATCH 34/56] chardev: cpr framework This is from Steve Sistare's qemu live update patch: https://patchew.org/QEMU/1658851843-236870-1-git-send-email-steven.sistare@oracle.com/1658851843-236870-37-git-send-email-steven.sistare@oracle.com/ 
------------------------------------------------------------------------------ Add QEMU_CHAR_FEATURE_CPR for devices that support cpr-exec by preserving an open descriptor across exec. Add the chardev reopen-on-cpr option for devices that should be closed on cpr and reopened after exec. Enable cpr for a chardev if it has QEMU_CHAR_FEATURE_CPR and reopen-on-cpr is false. Allow cpr-save if either QEMU_CHAR_FEATURE_CPR or reopen-on-cpr is true for all chardevs in the configuration. Signed-off-by: Steve Sistare Message-Id: <1658851843-236870-37-git-send-email-steven.sistare@oracle.com> Signed-off-by: luofei --- chardev/char.c | 48 ++++++++++++++++++++++++++++++++++++++---- include/chardev/char.h | 5 +++++ qapi/char.json | 7 +++++- qemu-options.hx | 26 +++++++++++++++++++---- 4 files changed, 77 insertions(+), 9 deletions(-) diff --git a/chardev/char.c b/chardev/char.c index 0169d8dde4..43cb431cdd 100644 --- a/chardev/char.c +++ b/chardev/char.c @@ -36,6 +36,8 @@ #include "qemu/help_option.h" #include "qemu/module.h" #include "qemu/option.h" +#include "migration/cpr-state.h" +#include "migration/blocker.h" #include "qemu/id.h" #include "qemu/coroutine.h" #include "qemu/yank.h" @@ -236,26 +238,54 @@ int qemu_chr_add_client(Chardev *s, int fd) static void qemu_char_open(Chardev *chr, ChardevBackend *backend, bool *be_opened, Error **errp) { + ERRP_GUARD(); + g_autofree char *fdname = NULL; + ChardevClass *cc = CHARDEV_GET_CLASS(chr); /* Any ChardevCommon member would work */ ChardevCommon *common = backend ? 
backend->u.null.data : NULL; + bool has_logfile = (common && common->has_logfile); + bool has_feature_cpr; - if (common && common->has_logfile) { + if (has_logfile) { int flags = O_WRONLY; + fdname = g_strdup_printf("%s_log", chr->label); if (common->has_logappend && common->logappend) { flags |= O_APPEND; } else { flags |= O_TRUNC; } - chr->logfd = qemu_create(common->logfile, flags, 0666, errp); + chr->logfd = cpr_find_fd(fdname, 0); + if (chr->logfd < 0) { + chr->logfd = qemu_create(common->logfile, flags, 0666, errp); + } if (chr->logfd < 0) { return; } } + chr->reopen_on_cpr = (common && common->reopen_on_cpr); + if (cc->open) { cc->open(chr, backend, be_opened, errp); + if (*errp) { + return; + } + } + + /* Evaluate this after the open method sets the feature */ + has_feature_cpr = qemu_chr_has_feature(chr, QEMU_CHAR_FEATURE_CPR); + chr->cpr_enabled = !chr->reopen_on_cpr && has_feature_cpr; + + if (!chr->reopen_on_cpr && !has_feature_cpr) { + chr->cpr_blocker = NULL; + error_setg(&chr->cpr_blocker, + "chardev %s -> %s does not allow cpr. 
See reopen-on-cpr.", + chr->label, chr->filename); + migrate_add_blockers(&chr->cpr_blocker, errp, MIG_MODE_CPR_EXEC, -1); + } else if (chr->cpr_enabled && has_logfile) { + cpr_resave_fd(fdname, 0, chr->logfd); } } @@ -297,11 +327,16 @@ static void char_finalize(Object *obj) if (chr->be) { chr->be->chr = NULL; } - g_free(chr->filename); - g_free(chr->label); if (chr->logfd != -1) { + g_autofree char *fdname = g_strdup_printf("%s_log", chr->label); + if (chr->cpr_enabled) { + cpr_delete_fd(fdname, 0); + } close(chr->logfd); } + migrate_del_blocker(&chr->cpr_blocker); + g_free(chr->filename); + g_free(chr->label); qemu_mutex_destroy(&chr->chr_write_lock); } @@ -501,6 +536,8 @@ void qemu_chr_parse_common(QemuOpts *opts, ChardevCommon *backend) backend->has_logappend = true; backend->logappend = qemu_opt_get_bool(opts, "logappend", false); + + backend->reopen_on_cpr = qemu_opt_get_bool(opts, "reopen-on-cpr", false); } static const ChardevClass *char_get_class(const char *driver, Error **errp) @@ -942,6 +979,9 @@ QemuOptsList qemu_chardev_opts = { },{ .name = "abstract", .type = QEMU_OPT_BOOL, + },{ + .name = "reopen-on-cpr", + .type = QEMU_OPT_BOOL, #endif }, { /* end of list */ } diff --git a/include/chardev/char.h b/include/chardev/char.h index f388d4b109..1560a547be 100644 --- a/include/chardev/char.h +++ b/include/chardev/char.h @@ -52,6 +52,8 @@ typedef enum { /* Whether the gcontext can be changed after calling * qemu_chr_be_update_read_handlers() */ QEMU_CHAR_FEATURE_GCONTEXT, + /* Whether the device supports cpr */ + QEMU_CHAR_FEATURE_CPR, QEMU_CHAR_FEATURE_LAST, } ChardevFeature; @@ -69,6 +71,9 @@ struct Chardev { int be_open; /* used to coordinate the chardev-change special-case: */ bool handover_yank_instance; + bool reopen_on_cpr; + bool cpr_enabled; + Error *cpr_blocker; GSource *gsource; GMainContext *gcontext; DECLARE_BITMAP(features, QEMU_CHAR_FEATURE_LAST); diff --git a/qapi/char.json b/qapi/char.json index f5133a5eeb..b50c2172e7 100644 --- 
a/qapi/char.json +++ b/qapi/char.json @@ -204,12 +204,17 @@ # @logfile: The name of a logfile to save output # @logappend: true to append instead of truncate # (default to false to truncate) +# @reopen-on-cpr: if true, close device's fd on cpr-save and reopen it after +# cpr-exec. Set this to allow CPR on a device that does not +# support QEMU_CHAR_FEATURE_CPR. defaults to false. +# since 6.2. # # Since: 2.6 ## { 'struct': 'ChardevCommon', 'data': { '*logfile': 'str', - '*logappend': 'bool' } } + '*logappend': 'bool', + '*reopen-on-cpr': 'bool' } } ## # @ChardevFile: diff --git a/qemu-options.hx b/qemu-options.hx index ad948f4b71..eeaea2347e 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -3265,43 +3265,57 @@ DEFHEADING(Character device options:) DEF("chardev", HAS_ARG, QEMU_OPTION_chardev, "-chardev help\n" - "-chardev null,id=id[,mux=on|off][,logfile=PATH][,logappend=on|off]\n" + "-chardev null,id=id[,mux=on|off][,logfile=PATH][,logappend=on|off][,reopen-on-cpr=on|off]\n" "-chardev socket,id=id[,host=host],port=port[,to=to][,ipv4=on|off][,ipv6=on|off][,nodelay=on|off]\n" " [,server=on|off][,wait=on|off][,telnet=on|off][,websocket=on|off][,reconnect=seconds][,mux=on|off]\n" - " [,logfile=PATH][,logappend=on|off][,tls-creds=ID][,tls-authz=ID] (tcp)\n" + " [,logfile=PATH][,logappend=on|off][,tls-creds=ID][,tls-authz=ID][,reopen-on-cpr=on|off] (tcp)\n" "-chardev socket,id=id,path=path[,server=on|off][,wait=on|off][,telnet=on|off][,websocket=on|off][,reconnect=seconds]\n" - " [,mux=on|off][,logfile=PATH][,logappend=on|off][,abstract=on|off][,tight=on|off] (unix)\n" + " [,mux=on|off][,logfile=PATH][,logappend=on|off][,abstract=on|off][,tight=on|off][,reopen-on-cpr=on|off] (unix)\n" "-chardev udp,id=id[,host=host],port=port[,localaddr=localaddr]\n" " [,localport=localport][,ipv4=on|off][,ipv6=on|off][,mux=on|off]\n" - " [,logfile=PATH][,logappend=on|off]\n" + " [,logfile=PATH][,logappend=on|off][,reopen-on-cpr=on|off]\n" "-chardev 
msmouse,id=id[,mux=on|off][,logfile=PATH][,logappend=on|off]\n" + " [,reopen-on-cpr=on|off]\n" "-chardev vc,id=id[[,width=width][,height=height]][[,cols=cols][,rows=rows]]\n" " [,mux=on|off][,logfile=PATH][,logappend=on|off]\n" + " [,reopen-on-cpr=on|off]\n" "-chardev ringbuf,id=id[,size=size][,logfile=PATH][,logappend=on|off]\n" + " [,reopen-on-cpr=on|off]\n" "-chardev file,id=id,path=path[,mux=on|off][,logfile=PATH][,logappend=on|off]\n" + " [,reopen-on-cpr=on|off]\n" "-chardev pipe,id=id,path=path[,mux=on|off][,logfile=PATH][,logappend=on|off]\n" + " [,reopen-on-cpr=on|off]\n" #ifdef _WIN32 "-chardev console,id=id[,mux=on|off][,logfile=PATH][,logappend=on|off]\n" "-chardev serial,id=id,path=path[,mux=on|off][,logfile=PATH][,logappend=on|off]\n" #else "-chardev pty,id=id[,mux=on|off][,logfile=PATH][,logappend=on|off]\n" + " [,reopen-on-cpr=on|off]\n" "-chardev stdio,id=id[,mux=on|off][,signal=on|off][,logfile=PATH][,logappend=on|off]\n" + " [,reopen-on-cpr=on|off]\n" #endif #ifdef CONFIG_BRLAPI "-chardev braille,id=id[,mux=on|off][,logfile=PATH][,logappend=on|off]\n" + " [,reopen-on-cpr=on|off]\n" #endif #if defined(__linux__) || defined(__sun__) || defined(__FreeBSD__) \ || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) "-chardev serial,id=id,path=path[,mux=on|off][,logfile=PATH][,logappend=on|off]\n" + " [,reopen-on-cpr=on|off]\n" "-chardev tty,id=id,path=path[,mux=on|off][,logfile=PATH][,logappend=on|off]\n" + " [,reopen-on-cpr=on|off]\n" #endif #if defined(__linux__) || defined(__FreeBSD__) || defined(__DragonFly__) "-chardev parallel,id=id,path=path[,mux=on|off][,logfile=PATH][,logappend=on|off]\n" + " [,reopen-on-cpr=on|off]\n" "-chardev parport,id=id,path=path[,mux=on|off][,logfile=PATH][,logappend=on|off]\n" + " [,reopen-on-cpr=on|off]\n" #endif #if defined(CONFIG_SPICE) "-chardev spicevmc,id=id,name=name[,debug=debug][,logfile=PATH][,logappend=on|off]\n" + " [,reopen-on-cpr=on|off]\n" "-chardev 
spiceport,id=id,name=name[,debug=debug][,logfile=PATH][,logappend=on|off]\n" + " [,reopen-on-cpr=on|off]\n" #endif , QEMU_ARCH_ALL ) @@ -3376,6 +3390,10 @@ The general form of a character device option is: ``logappend`` option controls whether the log file will be truncated or appended to when opened. + Every backend supports the ``reopen-on-cpr`` option. If on, the + device's descriptor is closed during cpr save, and reopened after exec. + This is useful for devices that do not support cpr. + The available backends are: ``-chardev null,id=id`` -- Gitee From 0c2deae20432c406323cdb0cec7aa645e4c7d60f Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Fri, 11 Aug 2023 09:44:42 +0800 Subject: [PATCH 35/56] chardev: cpr for simple devices This is from Steve Sistare's qemu live update patch: https://patchew.org/QEMU/1658851843-236870-1-git-send-email-steven.sistare@oracle.com/1658851843-236870-38-git-send-email-steven.sistare@oracle.com/ -------------------------------------------------------------------------- Set QEMU_CHAR_FEATURE_CPR for devices that trivially support cpr-exec. char-stdio is slightly less trivial. Allow the gdb server by closing it on exec.
Signed-off-by: Steve Sistare Message-Id: <1658851843-236870-38-git-send-email-steven.sistare@oracle.com> Signed-off-by: luofei --- MAINTAINERS | 1 + chardev/char-mux.c | 1 + chardev/char-null.c | 1 + chardev/char-stdio.c | 31 +++++++++++++++++++++++++++++++ gdbstub.c | 1 + stubs/meson.build | 1 + stubs/migration.c | 33 +++++++++++++++++++++++++++++++++ 7 files changed, 69 insertions(+) create mode 100644 stubs/migration.c diff --git a/MAINTAINERS b/MAINTAINERS index 13ac0854f5..96c0502f61 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2868,6 +2868,7 @@ F: tests/qtest/migration-test.c F: docs/devel/migration.rst F: qapi/migration.json F: tests/migration/ +F: stubs/migration.c D-Bus M: Marc-André Lureau diff --git a/chardev/char-mux.c b/chardev/char-mux.c index ee2d47b20d..d47fa31b6b 100644 --- a/chardev/char-mux.c +++ b/chardev/char-mux.c @@ -337,6 +337,7 @@ static void qemu_chr_open_mux(Chardev *chr, */ *be_opened = muxes_opened; qemu_chr_fe_init(&d->chr, drv, errp); + qemu_chr_set_feature(chr, QEMU_CHAR_FEATURE_CPR); } static void qemu_chr_parse_mux(QemuOpts *opts, ChardevBackend *backend, diff --git a/chardev/char-null.c b/chardev/char-null.c index 1c6a2900f9..02acaff03d 100644 --- a/chardev/char-null.c +++ b/chardev/char-null.c @@ -32,6 +32,7 @@ static void null_chr_open(Chardev *chr, Error **errp) { *be_opened = false; + qemu_chr_set_feature(chr, QEMU_CHAR_FEATURE_CPR); } static void char_null_class_init(ObjectClass *oc, void *data) diff --git a/chardev/char-stdio.c b/chardev/char-stdio.c index 403da308c9..c18666e45e 100644 --- a/chardev/char-stdio.c +++ b/chardev/char-stdio.c @@ -27,6 +27,7 @@ #include "qemu/option.h" #include "qemu/sockets.h" #include "qapi/error.h" +#include "migration/misc.h" #include "chardev/char.h" #ifdef _WIN32 @@ -40,19 +41,46 @@ #ifndef _WIN32 /* init terminal so that we can grab keys */ static struct termios oldtty; +static struct termios newtty; static int old_fd0_flags; +static int new_fd0_flags; static bool stdio_in_use; static 
bool stdio_allow_signal; static bool stdio_echo_state; +static Notifier cpr_notifier; static void term_exit(void) { if (stdio_in_use) { + tcgetattr(0, &newtty); + new_fd0_flags = fcntl(0, F_GETFL); + tcsetattr(0, TCSANOW, &oldtty); fcntl(0, F_SETFL, old_fd0_flags); } } +static void term_reenter(void) +{ + if (stdio_in_use) { + tcsetattr(0, TCSANOW, &newtty); + fcntl(0, F_SETFL, new_fd0_flags); + } +} + +static void term_cpr_exec_notifier(Notifier *notifier, void *data) +{ + MigrationState *s = data; + + if (migrate_mode_of(s) == MIG_MODE_CPR_EXEC) { + if (migration_has_finished(s)) { + term_exit(); + } else if (migration_has_failed(s)) { + term_reenter(); + } + } +} + static void qemu_chr_set_echo_stdio(Chardev *chr, bool echo) { struct termios tty; @@ -114,6 +142,8 @@ static void qemu_chr_open_stdio(Chardev *chr, stdio_allow_signal = !opts->has_signal || opts->signal; qemu_chr_set_echo_stdio(chr, false); + qemu_chr_set_feature(chr, QEMU_CHAR_FEATURE_CPR); + migration_add_notifier(&cpr_notifier, term_cpr_exec_notifier); } #endif @@ -144,6 +174,7 @@ static void char_stdio_finalize(Object *obj) { #ifndef _WIN32 term_exit(); + migration_remove_notifier(&cpr_notifier); #endif } diff --git a/gdbstub.c b/gdbstub.c index 141d7bc4ec..f03d43bb7c 100644 --- a/gdbstub.c +++ b/gdbstub.c @@ -3538,6 +3538,7 @@ int gdbserver_start(const char *device) mon_chr = gdbserver_state.mon_chr; reset_gdbserver_state(); } + mon_chr->reopen_on_cpr = true; create_processes(&gdbserver_state); diff --git a/stubs/meson.build b/stubs/meson.build index 9565c7dc3e..cca5d208d8 100644 --- a/stubs/meson.build +++ b/stubs/meson.build @@ -25,6 +25,7 @@ if libaio.found() stub_ss.add(files('linux-aio.c')) endif stub_ss.add(files('migr-blocker.c')) +stub_ss.add(files('migration.c')) stub_ss.add(files('module-opts.c')) stub_ss.add(files('monitor.c')) stub_ss.add(files('monitor-core.c')) diff --git a/stubs/migration.c b/stubs/migration.c new file mode 100644 index 0000000000..f2f79bd98c --- /dev/null +++ 
b/stubs/migration.c @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2021, 2022 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "migration/misc.h" + +void migration_add_notifier(Notifier *notify, + void (*cb)(Notifier *notifier, void *data)) +{ +} + +void migration_remove_notifier(Notifier *notify) +{ +} + +bool migration_has_finished(MigrationState *s) +{ + return false; +} + +bool migration_has_failed(MigrationState *s) +{ + return false; +} + +MigMode migrate_mode_of(MigrationState *s) +{ + return 0; +} -- Gitee From da70b5ac43e0f33a66fd617794b7d29e088ce955 Mon Sep 17 00:00:00 2001 From: luofei Date: Fri, 11 Aug 2023 09:54:59 +0800 Subject: [PATCH 36/56] chardev: cpr for pty Save and restore pty descriptors across cpr-exec. This patch mainly refers to Steve Sistare's qemu live update: https://patchew.org/QEMU/1658851843-236870-1-git-send-email-steven.sistare@oracle.com/1658851843-236870-39-git-send-email-steven.sistare@oracle.com/ Signed-off-by: luofei --- chardev/char-pty.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/chardev/char-pty.c b/chardev/char-pty.c index a2d1e7c985..960645588c 100644 --- a/chardev/char-pty.c +++ b/chardev/char-pty.c @@ -30,6 +30,7 @@ #include "qemu/sockets.h" #include "qemu/error-report.h" #include "qemu/module.h" +#include "migration/cpr-state.h" #include "qemu/qemu-print.h" #include "chardev/char-io.h" @@ -191,6 +192,9 @@ static void char_pty_finalize(Object *obj) Chardev *chr = CHARDEV(obj); PtyChardev *s = PTY_CHARDEV(obj); + if (chr->cpr_enabled) { + cpr_delete_fd(chr->label, 0); + } pty_chr_state(chr, 0); object_unref(OBJECT(s->ioc)); pty_chr_timer_cancel(s); @@ -207,12 +211,17 @@ static void char_pty_open(Chardev *chr, char pty_name[PATH_MAX]; char *name; + master_fd = cpr_find_fd(chr->label, 0); + if (master_fd >= 0) { + chr->filename = 
g_strdup_printf("pty:unknown"); + goto have_fd; + } master_fd = qemu_openpty_raw(&slave_fd, pty_name); if (master_fd < 0) { error_setg_errno(errp, errno, "Failed to create PTY"); return; } - + cpr_save_fd(chr->label, 0, master_fd); close(slave_fd); qemu_set_nonblock(master_fd); @@ -220,6 +229,8 @@ static void char_pty_open(Chardev *chr, qemu_printf("char device redirected to %s (label %s)\n", pty_name, chr->label); +have_fd: + qemu_chr_set_feature(chr, QEMU_CHAR_FEATURE_CPR); s = PTY_CHARDEV(chr); s->ioc = QIO_CHANNEL(qio_channel_file_new_fd(master_fd)); name = g_strdup_printf("chardev-pty-%s", chr->label); -- Gitee From 941339ece65fef2e0b22304c4aed57b0427a3fe1 Mon Sep 17 00:00:00 2001 From: luofei Date: Fri, 11 Aug 2023 10:07:54 +0800 Subject: [PATCH 37/56] chardev: cpr for serial Save and restore serial descriptors across cpr-exec. Signed-off-by: luofei --- chardev/char-serial.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/chardev/char-serial.c b/chardev/char-serial.c index 7c3d84ae24..a6bd439983 100644 --- a/chardev/char-serial.c +++ b/chardev/char-serial.c @@ -38,6 +38,7 @@ #endif #include "chardev/char-serial.h" +#include "migration/cpr-state.h" #ifdef _WIN32 @@ -266,14 +267,22 @@ static void qmp_chardev_open_serial(Chardev *chr, ChardevHostdev *serial = backend->u.serial.data; int fd; + fd = cpr_find_fd(chr->label, 0); + if (fd >= 0) { + goto have_fd; + } fd = qmp_chardev_open_file_source(serial->device, O_RDWR | O_NONBLOCK, errp); if (fd < 0) { return; + } else { + cpr_save_fd(chr->label, 0, fd); } qemu_set_nonblock(fd); tty_serial_init(fd, 115200, 'N', 8, 1); +have_fd: + qemu_chr_set_feature(chr, QEMU_CHAR_FEATURE_CPR); qemu_chr_open_fd(chr, fd, fd); } #endif /* __linux__ || __sun__ */ -- Gitee From 88643f846b065453ed19e155ab86953434be3034 Mon Sep 17 00:00:00 2001 From: luofei Date: Fri, 11 Aug 2023 11:09:29 +0800 Subject: [PATCH 38/56] cpr: cpr init before char socket establishing a connection Advance the cpr_init call to char socket 
establishing a connection, because subsequent char socket connections will use migration mode. Signed-off-by: luofei --- include/migration/misc.h | 1 + migration/migration.c | 8 ++++++-- softmmu/vl.c | 2 ++ 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/include/migration/misc.h b/include/migration/misc.h index d4c2d7da57..100d7638c4 100644 --- a/include/migration/misc.h +++ b/include/migration/misc.h @@ -56,6 +56,7 @@ AnnounceParameters *migrate_announce_params(void); void dump_vmstate_json_to_file(FILE *out_fp); /* migration/migration.c */ +void migration_object_early_init(void); void migration_object_init(void); void migration_shutdown(void); bool migration_is_idle(void); diff --git a/migration/migration.c b/migration/migration.c index 992fe9f50f..05d8d7f123 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -190,12 +190,17 @@ static gint page_request_addr_cmp(gconstpointer ap, gconstpointer bp) return (a > b) - (a < b); } -void migration_object_init(void) +void migration_object_early_init(void) { /* This can only be called once. */ assert(!current_migration); current_migration = MIGRATION_OBJ(object_new(TYPE_MIGRATION)); + cpr_init(); +} + +void migration_object_init(void) +{ /* * Init the migrate incoming object as well no matter whether * we'll use it or not. 
@@ -217,7 +222,6 @@ void migration_object_init(void) blk_mig_init(); ram_mig_init(); dirty_bitmap_mig_init(); - cpr_init(); } void migration_cancel(const Error *error) diff --git a/softmmu/vl.c b/softmmu/vl.c index e18b761f10..c1d4bff257 100644 --- a/softmmu/vl.c +++ b/softmmu/vl.c @@ -3725,6 +3725,8 @@ void qemu_init(int argc, char **argv, char **envp) suspend_mux_open(); + migration_object_early_init(); + qemu_disable_default_devices(); qemu_create_default_devices(); qemu_create_early_backends(); -- Gitee From 62f6a3b6651510608cacb3fad79a79b99b2cffc0 Mon Sep 17 00:00:00 2001 From: luofei Date: Fri, 11 Aug 2023 14:39:09 +0800 Subject: [PATCH 39/56] chardev: cpr for sockets Save accepted socket fds before cpr-exec, and look for them after. Block cpr-exec if a socket enables the TLS or websocket option. Allow a monitor socket by closing it on exec. Currently, cpr supports only asynchronous connection establishment. Synchronization will be implemented in later patches. This patch mainly refers to Steve Sistare's qemu live update: https://patchew.org/QEMU/1658851843-236870-1-git-send-email-steven.sistare@oracle.com/1658851843-236870-40-git-send-email-steven.sistare@oracle.com/ Signed-off-by: luofei --- chardev/char-socket.c | 53 +++++++++++++++++++++++++++++++++++++++++++ stubs/migration.c | 5 ++++ 2 files changed, 58 insertions(+) diff --git a/chardev/char-socket.c b/chardev/char-socket.c index 57ae53304a..5636f24948 100644 --- a/chardev/char-socket.c +++ b/chardev/char-socket.c @@ -27,6 +27,9 @@ #include "io/channel-socket.h" #include "io/channel-tls.h" #include "io/channel-websock.h" +#include "migration/blocker.h" +#include "migration/cpr-state.h" +#include "migration/misc.h" #include "io/net-listener.h" #include "qemu/error-report.h" #include "qemu/module.h" @@ -35,6 +38,7 @@ #include "qapi/clone-visitor.h" #include "qapi/qapi-visit-sockets.h" #include "qemu/yank.h" +#include "sysemu/sysemu.h" #include "chardev/char-io.h" #include "qom/object.h" @@ -87,6 +91,7 @@ 
struct SocketChardev { bool connect_err_reported; QIOTask *connect_task; + Error *cpr_blocker; }; typedef struct SocketChardev SocketChardev; @@ -436,6 +441,11 @@ static void tcp_chr_free_connection(Chardev *chr) SocketChardev *s = SOCKET_CHARDEV(chr); int i; + if (chr->cpr_enabled) { + cpr_delete_fd(chr->label, 0); + } + migrate_del_blocker(&s->cpr_blocker); + if (s->read_msgfds_num) { for (i = 0; i < s->read_msgfds_num; i++) { close(s->read_msgfds[i]); @@ -998,6 +1008,10 @@ static void tcp_chr_accept(QIONetListener *listener, QIO_CHANNEL(cioc)); } tcp_chr_new_client(chr, cioc); + + if (s->sioc && chr->cpr_enabled) { + cpr_resave_fd(chr->label, 0, s->sioc->fd); + } } @@ -1253,6 +1267,27 @@ static gboolean socket_reconnect_timeout(gpointer opaque) return false; } +static int load_char_socket_fd(Chardev *chr, Error **errp) +{ + ERRP_GUARD(); + SocketChardev *sockchar = SOCKET_CHARDEV(chr); + QIOChannelSocket *sioc; + const char *label = chr->label; + int fd = cpr_find_fd(label, 0); + + if (fd != -1) { + sockchar = SOCKET_CHARDEV(chr); + sioc = qio_channel_socket_new_fd(fd, errp); + if (sioc) { + tcp_chr_accept(sockchar->listener, sioc, chr); + object_unref(OBJECT(sioc)); + } else { + error_prepend(errp, "could not restore socket for %s", label); + return -1; + } + } + return 0; +} static int qmp_chardev_open_socket_server(Chardev *chr, bool is_telnet, @@ -1457,6 +1492,18 @@ static void qmp_chardev_open_socket(Chardev *chr, } s->registered_yank = true; + if (!s->tls_creds && !s->is_websock) { + qemu_chr_set_feature(chr, QEMU_CHAR_FEATURE_CPR); + } else if (!chr->reopen_on_cpr) { + s->cpr_blocker = NULL; + error_setg(&s->cpr_blocker, + "error: socket %s is not cpr capable due to %s option", + chr->label, (s->tls_creds ? 
"TLS" : "websocket")); + if (migrate_add_blockers(&s->cpr_blocker, errp, MIG_MODE_CPR_EXEC, + -1)) { + return; + } + } /* be isn't opened until we get a connection */ *be_opened = false; @@ -1472,6 +1519,12 @@ static void qmp_chardev_open_socket(Chardev *chr, return; } } + + if (migrate_mode() == MIG_MODE_CPR_EXEC && + qemu_chr_has_feature(chr, QEMU_CHAR_FEATURE_CPR) && + !chr->reopen_on_cpr && !is_waitconnect) { + load_char_socket_fd(chr, errp); + } } static void qemu_chr_parse_socket(QemuOpts *opts, ChardevBackend *backend, diff --git a/stubs/migration.c b/stubs/migration.c index f2f79bd98c..166643cb0f 100644 --- a/stubs/migration.c +++ b/stubs/migration.c @@ -31,3 +31,8 @@ MigMode migrate_mode_of(MigrationState *s) { return 0; } + +MigMode migrate_mode(void) +{ + return 0; +} -- Gitee From f16409918f72ee568bcffbc7961cd4f0b69d7119 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Fri, 11 Aug 2023 14:46:04 +0800 Subject: [PATCH 40/56] python/machine: QEMUMachine full_args This is from Steve Sistare's qemu live update patch: https://patchew.org/QEMU/1658851843-236870-1-git-send-email-steven.sistare@oracle.com/1658851843-236870-41-git-send-email-steven.sistare@oracle.com/ --------------------------------------------------------------------------- Provide full_args() to return all command-line arguments used to start a vm, some of which are not otherwise visible to QEMUMachine clients. This is needed by the cpr test, which must start a vm, then pass all qemu command-line arguments when setting the cpr-exec-args migration parameter. 
Signed-off-by: Steve Sistare Acked-by: John Snow Reviewed-by: John Snow Message-Id: <1658851843-236870-41-git-send-email-steven.sistare@oracle.com> Signed-off-by: luofei --- python/qemu/machine/machine.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/qemu/machine/machine.py b/python/qemu/machine/machine.py index 67ab06ca2b..933f1957e3 100644 --- a/python/qemu/machine/machine.py +++ b/python/qemu/machine/machine.py @@ -308,6 +308,11 @@ def args(self) -> List[str]: """Returns the list of arguments given to the QEMU binary.""" return self._args + @property + def full_args(self) -> List[str]: + """Returns the full list of arguments used to launch QEMU.""" + return list(self._qemu_full_args) + def _pre_launch(self) -> None: if self._console_set: self._remove_files.append(self._console_address) -- Gitee From 44b293e507990b2a59d8951a860e244af9222b81 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Fri, 11 Aug 2023 14:51:30 +0800 Subject: [PATCH 41/56] python/machine: QEMUMachine reopen_qmp_connection This is from Steve Sistare's qemu live update patch: https://patchew.org/QEMU/1658851843-236870-1-git-send-email-steven.sistare@oracle.com/1658851843-236870-42-git-send-email-steven.sistare@oracle.com/ Provide reopen_qmp_connection() to reopen a closed monitor connection. This is needed by cpr, because qemu exec closes the monitor socket. 
Signed-off-by: Steve Sistare Message-Id: <1658851843-236870-42-git-send-email-steven.sistare@oracle.com> Signed-off-by: luofei --- python/qemu/machine/machine.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/python/qemu/machine/machine.py b/python/qemu/machine/machine.py index 933f1957e3..66728cceb4 100644 --- a/python/qemu/machine/machine.py +++ b/python/qemu/machine/machine.py @@ -463,6 +463,15 @@ def _close_qmp_connection(self) -> None: finally: self._qmp_connection = None + def reopen_qmp_connection(self): + self._close_qmp_connection() + self._qmp_connection = QEMUMonitorProtocol( + self._monitor_address, + server=True, + nickname=self._name + ) + self._qmp.accept(self._qmp_timer) + def _early_cleanup(self) -> None: """ Perform any cleanup that needs to happen before the VM exits. -- Gitee From 878724826bf7da9740d808e8e0bd4e5c4071eb54 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Fri, 11 Aug 2023 14:55:43 +0800 Subject: [PATCH 42/56] tests/avocado: add cpr regression test This is from Steve Sistare's qemu live update patch: https://patchew.org/QEMU/1658851843-236870-1-git-send-email-steven.sistare@oracle.com/1658851843-236870-43-git-send-email-steven.sistare@oracle.com/ Signed-off-by: Steve Sistare Message-Id: <1658851843-236870-43-git-send-email-steven.sistare@oracle.com> Signed-off-by: luofei --- MAINTAINERS | 1 + tests/avocado/cpr.py | 176 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 177 insertions(+) create mode 100644 tests/avocado/cpr.py diff --git a/MAINTAINERS b/MAINTAINERS index 96c0502f61..85ca4cd214 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2996,6 +2996,7 @@ F: stubs/cpr-state.c F: include/migration/cpr.h F: migration/cpr.c F: hw/vfio/cpr.c +F: tests/avocado/cpr.py Record/replay M: Pavel Dovgalyuk diff --git a/tests/avocado/cpr.py b/tests/avocado/cpr.py new file mode 100644 index 0000000000..11e1376d13 --- /dev/null +++ b/tests/avocado/cpr.py @@ -0,0 +1,176 @@ +# cpr test + +# Copyright (c) 2021, 2022 Oracle 
and/or its affiliates. +# +# This work is licensed under the terms of the GNU GPL, version 2. +# See the COPYING file in the top-level directory. + +import tempfile +from avocado_qemu import QemuSystemTest +from avocado.utils import wait + +class Cpr(QemuSystemTest): + """ + :avocado: tags=cpr + """ + + timeout = 5 + fast_timeout = 1 + + @staticmethod + def has_status(vm, status, command): + return vm.command(command)['status'] in status + + def wait_for_status(self, vm, status, command): + wait.wait_for(self.has_status, + timeout=self.timeout, + step=0.1, + args=(vm,status,command,)) + + def wait_for_runstate(self, vm, status): + self.wait_for_status(vm, status, 'query-status') + + def wait_for_migration(self, vm, status): + self.wait_for_status(vm, status, 'query-migrate') + + def run_and_fail(self, vm, msg): + # Qemu will fail fast, so disable monitor to avoid timeout in accept + vm.set_qmp_monitor(False) + vm.launch() + vm.wait(self.timeout) + self.assertRegex(vm.get_log(), msg) + + def get_vm_for_restart(self): + return self.get_vm('-nodefaults', + '-migrate-mode-enable', 'cpr-exec', + '-object', 'memory-backend-memfd,id=pc.ram,size=8M', + '-machine', 'memory-backend=pc.ram') + + def do_cpr_exec(self, vmstate_name): + vm = self.get_vm_for_restart() + vm.launch() + + uri = 'file:' + vmstate_name + args = vm.full_args + ['-incoming', 'defer'] + + vm.command('migrate-set-parameters', cpr_exec_args=args) + vm.command('migrate-set-parameters', mode='cpr-exec') + vm.qmp('migrate', uri=uri) + + # Cannot poll for migration status, because qemu may call execv before + # we see it. Wait for STOP instead. + vm.event_wait(name='STOP', timeout=self.fast_timeout) + + # Migrate execs and closes the monitor socket, so reopen it. 
+ vm.reopen_qmp_connection() + + self.assertEqual(vm.command('query-status')['status'], 'inmigrate') + resp = vm.command('migrate-incoming', uri=uri) + self.wait_for_migration(vm, ('completed', 'failed')) + self.assertEqual(vm.command('query-migrate')['status'], 'completed') + + resp = vm.command('cont') + vm.event_wait(name='RESUME', timeout=self.fast_timeout) + self.assertEqual(vm.command('query-status')['status'], 'running') + + def do_cpr_reboot(self, vmstate_name): + args = ['-nodefaults', '-migrate-mode-enable', 'cpr-reboot' ] + old_vm = self.get_vm(*args) + old_vm.launch() + + uri = 'file:' + vmstate_name + + old_vm.command('migrate-set-capabilities', capabilities = [ + { "capability": "x-ignore-shared", "state": True }]) + old_vm.command('migrate-set-parameters', mode='cpr-reboot') + old_vm.qmp('migrate', uri=uri) + self.wait_for_migration(old_vm, ('completed', 'failed')) + self.assertEqual(old_vm.command('query-migrate')['status'], + 'completed') + self.assertEqual(old_vm.command('query-status')['status'], + 'postmigrate') + + args = args + ['-incoming', 'defer'] + new_vm = self.get_vm(*args) + new_vm.launch() + self.assertEqual(new_vm.command('query-status')['status'], 'inmigrate') + + new_vm.command('migrate-set-capabilities', capabilities = [ + { "capability": "x-ignore-shared", "state": True }]) + new_vm.command('migrate-set-parameters', mode='cpr-reboot') + new_vm.command('migrate-incoming', uri=uri) + self.wait_for_migration(new_vm, ('completed', 'failed')) + self.assertEqual(new_vm.command('query-migrate')['status'], 'completed') + + new_vm.command('cont') + new_vm.event_wait(name='RESUME', timeout=self.fast_timeout) + self.assertEqual(new_vm.command('query-status')['status'], 'running') + + def test_cpr_exec(self): + """ + Verify that cpr restart mode works + """ + with tempfile.NamedTemporaryFile() as vmstate_file: + self.do_cpr_exec(vmstate_file.name) + + def test_cpr_reboot(self): + """ + Verify that cpr reboot mode works + """ + with 
tempfile.NamedTemporaryFile() as vmstate_file: + self.do_cpr_reboot(vmstate_file.name) + + def test_cpr_block_cpr_exec(self): + """ + Verify that qemu rejects cpr restart mode for volatile memory + """ + + vm = self.get_vm('-nodefaults', + '-migrate-mode-enable', 'cpr-exec') + vm.launch() + uri='file:/dev/null' + args = vm.full_args + ['-S'] + resp = vm.command('migrate-set-parameters', mode='cpr-exec') + rsp = vm.qmp('migrate', uri=uri) + vm.qmp('quit') + + expect = r'Memory region .* is volatile' + self.assertRegex(rsp['error']['desc'], expect) + + def test_cpr_block_memfd(self): + + """ + Verify that qemu complains for only-cpr-capable and volatile memory + """ + vm = self.get_vm('-nodefaults', + '-migrate-mode-enable', 'cpr-exec', + '-only-cpr-capable') + self.run_and_fail(vm, r'only-cpr-capable specified.* Memory ') + + def test_cpr_block_replay(self): + """ + Verify that qemu complains for only-cpr-capable and replay + """ + vm = self.get_vm_for_restart() + vm.add_args('-only-cpr-capable', + '-icount', 'shift=10,rr=record,rrfile=/dev/null') + self.run_and_fail(vm, r'only-cpr-capable specified.* replay ') + + def test_cpr_block_chardev(self): + """ + Verify that qemu complains for only-cpr-capable and unsupported chardev + """ + vm = self.get_vm_for_restart() + vm.add_args('-only-cpr-capable', + '-chardev', 'vc,id=vc1') + self.run_and_fail(vm, r'only-cpr-capable specified.* vc1 ') + + def test_cpr_allow_chardev(self): + """ + Verify that qemu allows unsupported chardev with reopen-on-cpr + """ + vm = self.get_vm_for_restart() + vm.add_args('-only-cpr-capable', + '-chardev', 'vc,id=vc1,reopen-on-cpr=on') + vm.launch() + self.wait_for_runstate(vm, ('running')) -- Gitee From c3bd2066d49699018596224689984a8bc910f98d Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Fri, 11 Aug 2023 15:53:41 +0800 Subject: [PATCH 43/56] vl: start on wakeup request This is from Steve Sistare's qemu live update patch: 
https://patchew.org/QEMU/1658851843-236870-1-git-send-email-steven.sistare@oracle.com/1658851843-236870-44-git-send-email-steven.sistare@oracle.com/ ------------------------------------------------------------------------------ If qemu starts and loads a VM in the suspended state, then a later wakeup request will set the state to running, which is not sufficient to initialize the vm, as vm_start was never called during this invocation of qemu. See qemu_system_wakeup_request(). Define the start_on_wakeup_requested() hook to cause vm_start() to be called when processing the wakeup request. This will be called in a subsequent migration patch. Signed-off-by: luofei --- include/sysemu/runstate.h | 1 + softmmu/runstate.c | 14 +++++++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/include/sysemu/runstate.h b/include/sysemu/runstate.h index 91c89d8276..198211b526 100644 --- a/include/sysemu/runstate.h +++ b/include/sysemu/runstate.h @@ -51,6 +51,7 @@ void qemu_system_reset_request(ShutdownCause reason); void qemu_system_suspend_request(void); void qemu_register_suspend_notifier(Notifier *notifier); bool qemu_wakeup_suspend_enabled(void); +void qemu_system_start_on_wakeup_request(void); void qemu_system_wakeup_request(WakeupReason reason, Error **errp); void qemu_system_wakeup_enable(WakeupReason reason, bool enabled); void qemu_register_wakeup_notifier(Notifier *notifier); diff --git a/softmmu/runstate.c b/softmmu/runstate.c index b7d9675a89..becd91e824 100644 --- a/softmmu/runstate.c +++ b/softmmu/runstate.c @@ -338,6 +338,7 @@ void vm_state_notify(bool running, RunState state) } } +static bool start_on_wakeup_requested; static ShutdownCause reset_requested; static ShutdownCause shutdown_requested; static int shutdown_signal; @@ -572,6 +573,11 @@ void qemu_register_suspend_notifier(Notifier *notifier) notifier_list_add(&suspend_notifiers, notifier); } +void qemu_system_start_on_wakeup_request(void) +{ + start_on_wakeup_requested = true; +} + void 
qemu_system_wakeup_request(WakeupReason reason, Error **errp) { trace_system_wakeup_request(reason); @@ -584,7 +590,13 @@ void qemu_system_wakeup_request(WakeupReason reason, Error **errp) if (!(wakeup_reason_mask & (1 << reason))) { return; } - runstate_set(RUN_STATE_RUNNING); + if (start_on_wakeup_requested) { + start_on_wakeup_requested = false; + vm_start(); + } else { + runstate_set(RUN_STATE_RUNNING); + } + wakeup_reason = reason; qemu_notify_event(); } -- Gitee From b2c8d7790f304960c29fbd13db53e2134ea1e6fa Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Fri, 11 Aug 2023 16:00:38 +0800 Subject: [PATCH 44/56] migration: fix suspended runstate This is from Steve Sistare's qemu live update patch: https://patchew.org/QEMU/1658851843-236870-1-git-send-email-steven.sistare@oracle.com/1658851843-236870-45-git-send-email-steven.sistare@oracle.com/ -------------------------------------------------------------------------- Migration of a guest in the suspended state is broken. The incoming migration code automatically tries to wake the guest, which IMO is wrong -- the guest should end migration in the same state it started. Further, the wakeup attempt merely sets state to running but does not actually start the guest, as vm_start was never called during this invocation of qemu. To fix, leave the guest in the suspended state, but call qemu_system_start_on_wakeup_request() so the guest is properly resumed when the client sends a system_wakeup command. 
Signed-off-by: Steve Sistare Message-Id: <1658851843-236870-45-git-send-email-steven.sistare@oracle.com> Signed-off-by: luofei --- migration/migration.c | 11 ++++------- softmmu/runstate.c | 1 + 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index 05d8d7f123..a22c7a2b2f 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -539,6 +539,10 @@ static void process_incoming_migration_bh(void *opaque) vm_start(); } else { runstate_set(global_state_get_runstate()); + if (runstate_check(RUN_STATE_SUSPENDED)) { + /* Force vm_start to be called later. */ + qemu_system_start_on_wakeup_request(); + } } /* * This must happen after any state changes since as soon as an external @@ -3190,7 +3194,6 @@ static int postcopy_start(MigrationState *ms) qemu_mutex_lock_iothread(); trace_postcopy_start_set_run(); - qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL); global_state_store(); ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE); if (ret < 0) { @@ -3402,7 +3405,6 @@ static void migration_completion(MigrationState *s) if (s->state == MIGRATION_STATUS_ACTIVE) { qemu_mutex_lock_iothread(); s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); - qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL); s->vm_was_running = runstate_is_running(); ret = global_state_store(); @@ -4179,11 +4181,6 @@ static void *bg_migration_thread(void *opaque) qemu_mutex_lock_iothread(); - /* - * If VM is currently in suspended state, then, to make a valid runstate - * transition in vm_stop_force_state() we need to wakeup it up. 
- */ - qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL); s->vm_was_running = runstate_is_running(); if (global_state_store()) { diff --git a/softmmu/runstate.c b/softmmu/runstate.c index becd91e824..3762f4597a 100644 --- a/softmmu/runstate.c +++ b/softmmu/runstate.c @@ -154,6 +154,7 @@ static const RunStateTransition runstate_transitions_def[] = { { RUN_STATE_RUNNING, RUN_STATE_SUSPENDED }, { RUN_STATE_SUSPENDED, RUN_STATE_RUNNING }, { RUN_STATE_SUSPENDED, RUN_STATE_FINISH_MIGRATE }, + { RUN_STATE_SUSPENDED, RUN_STATE_PAUSED }, { RUN_STATE_SUSPENDED, RUN_STATE_PRELAUNCH }, { RUN_STATE_SUSPENDED, RUN_STATE_COLO}, -- Gitee From b181655168dc40b5a781bd1549abb47659f91be2 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Fri, 11 Aug 2023 16:10:45 +0800 Subject: [PATCH 45/56] migration: notifier error reporting This is from Steve Sistare's qemu live update patch: https://patchew.org/QEMU/1658851843-236870-1-git-send-email-steven.sistare@oracle.com/1658851843-236870-46-git-send-email-steven.sistare@oracle.com/ -------------------------------------------------------------------------- Provide migration_notifier_set_error(), which allows migration notifiers to return an error message indicating they have failed, which halts the migration. The error message is visible in the 'info migrate' command. No functional change until calls to migration_notifier_set_error are added. 
Signed-off-by: Steve Sistare Message-Id: <1658851843-236870-46-git-send-email-steven.sistare@oracle.com> Signed-off-by: luofei --- include/migration/misc.h | 3 ++- migration/migration.c | 31 +++++++++++++++++++++++++++---- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/include/migration/misc.h b/include/migration/misc.h index 100d7638c4..9b7c0062dc 100644 --- a/include/migration/misc.h +++ b/include/migration/misc.h @@ -64,7 +64,8 @@ bool migration_is_active(MigrationState *); void migration_add_notifier(Notifier *notify, void (*func)(Notifier *notifier, void *data)); void migration_remove_notifier(Notifier *notify); -void migration_call_notifiers(MigrationState *s); +int migration_call_notifiers(MigrationState *s); +void migration_notifier_set_error(MigrationState *s, Error *err); bool migration_in_setup(MigrationState *); bool migration_has_finished(MigrationState *); bool migration_has_failed(MigrationState *); diff --git a/migration/migration.c b/migration/migration.c index a22c7a2b2f..f80109a8c8 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -1867,6 +1867,8 @@ static void block_cleanup_parameters(MigrationState *s) static void migrate_fd_cleanup(MigrationState *s) { + bool already_failed; + qemu_bh_delete(s->cleanup_bh); s->cleanup_bh = NULL; @@ -1907,7 +1909,15 @@ static void migrate_fd_cleanup(MigrationState *s) /* It is used on info migrate. We can't free it */ error_report_err(error_copy(s->error)); } - migration_call_notifiers(s); + + already_failed = migration_has_failed(s); + if (migration_call_notifiers(s)) { + if (!already_failed) { + migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED); + /* Notify again to recover from this late failure. 
*/ + migration_call_notifiers(s); + } + } block_cleanup_parameters(s); cpr_exec(); yank_unregister_instance(MIGRATION_YANK_INSTANCE); @@ -2018,9 +2028,16 @@ void migration_remove_notifier(Notifier *notify) } } -void migration_call_notifiers(MigrationState *s) +int migration_call_notifiers(MigrationState *s) { notifier_list_notify(&migration_state_notifiers, s); + return (s->error != NULL); +} + +void migration_notifier_set_error(MigrationState *s, Error *err) +{ + migrate_set_error(s, err); + error_report_err(err); } bool migration_in_setup(MigrationState *s) @@ -3301,7 +3318,9 @@ static int postcopy_start(MigrationState *ms) * spice needs to trigger a transition now */ ms->postcopy_after_devices = true; - migration_call_notifiers(ms); + if (migration_call_notifiers(ms)) { + goto fail; + } ms->downtime = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - time_at_stop; @@ -4306,7 +4325,11 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) rate_limit = s->parameters.max_bandwidth / XFER_LIMIT_RATIO; /* Notify before starting migration thread */ - migration_call_notifiers(s); + if (migration_call_notifiers(s)) { + migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED); + migrate_fd_cleanup(s); + return; + } } qemu_file_set_rate_limit(s->to_dst_file, rate_limit); -- Gitee From bc64b054a575d7fe35d761c794d727562c47b1b0 Mon Sep 17 00:00:00 2001 From: luofei Date: Fri, 11 Aug 2023 16:15:27 +0800 Subject: [PATCH 46/56] vfio: allow cpr-exec migration The cpr-exec mode migration supports vfio devices, so just add blocker to normal mode. 
Signed-off-by: luofei --- hw/vfio/migration.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index 6d09adbfb8..a85dc42f41 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -889,7 +889,8 @@ add_blocker: "VFIO device doesn't support migration"); g_free(info); - return migrate_add_blocker(&vbasedev->migration_blocker, errp); + return migrate_add_blockers(&vbasedev->migration_blocker, errp, + MIG_MODE_NORMAL, -1); } void vfio_migration_finalize(VFIODevice *vbasedev) -- Gitee From 356adef1a27cee2fd1ab3c82824f37e5b39ff71f Mon Sep 17 00:00:00 2001 From: luofei Date: Fri, 11 Aug 2023 16:40:09 +0800 Subject: [PATCH 47/56] seccomp: do not deny spawn when enable cpr_exec mode Cpr exec mode need call fork+exec to start new qemu, so sandbox should not deny spawn. Signed-off-by: luofei --- include/sysemu/seccomp.h | 1 + softmmu/qemu-seccomp.c | 20 ++++++++++++++++++++ softmmu/vl.c | 4 ++++ 3 files changed, 25 insertions(+) diff --git a/include/sysemu/seccomp.h b/include/sysemu/seccomp.h index fe859894f6..a88f95b869 100644 --- a/include/sysemu/seccomp.h +++ b/include/sysemu/seccomp.h @@ -21,6 +21,7 @@ #define QEMU_SECCOMP_SET_SPAWN (1 << 3) #define QEMU_SECCOMP_SET_RESOURCECTL (1 << 4) +int cpr_exec_unset_spawn(void *opaque, QemuOpts *opts, Error **errp); int parse_sandbox(void *opaque, QemuOpts *opts, Error **errp); #endif diff --git a/softmmu/qemu-seccomp.c b/softmmu/qemu-seccomp.c index f50026778c..a94a648727 100644 --- a/softmmu/qemu-seccomp.c +++ b/softmmu/qemu-seccomp.c @@ -196,6 +196,26 @@ static int seccomp_start(uint32_t seccomp_opts, Error **errp) return rc < 0 ? 
-1 : 0; } +int cpr_exec_unset_spawn(void *opaque, QemuOpts *opts, Error **errp) +{ + const char *value = NULL; + char *retstr = NULL; + + if (qemu_opt_get_bool(opts, "enable", false)) { + value = qemu_opt_get(opts, "spawn"); + if (value) { + /* CPR_EXEC mode need call fork+execv, so do not deny spawn */ + if (g_str_equal(value, "deny")) { + retstr = qemu_opt_get_del(opts, "spawn"); + } + if (retstr) + g_free(retstr); + } + } + + return 0; +} + int parse_sandbox(void *opaque, QemuOpts *opts, Error **errp) { if (qemu_opt_get_bool(opts, "enable", false)) { diff --git a/softmmu/vl.c b/softmmu/vl.c index c1d4bff257..b2bc684733 100644 --- a/softmmu/vl.c +++ b/softmmu/vl.c @@ -2540,6 +2540,10 @@ static void qemu_process_early_options(void) #ifdef CONFIG_SECCOMP QemuOptsList *olist = qemu_find_opts_err("sandbox", NULL); if (olist) { + if (migrate_mode_enabled(MIG_MODE_CPR_EXEC)) { + qemu_opts_foreach(olist, cpr_exec_unset_spawn, NULL, &error_fatal); + olist = qemu_find_opts_err("sandbox", NULL); + } qemu_opts_foreach(olist, parse_sandbox, NULL, &error_fatal); } #endif -- Gitee From d11c28724351d3103648924283f00b9516dd1791 Mon Sep 17 00:00:00 2001 From: luofei Date: Fri, 11 Aug 2023 17:33:00 +0800 Subject: [PATCH 48/56] char: export char device cpr capability Export char device cpr capability, This will be called in a subsequent charmonitor inherits connection capabilities patch. 
Signed-off-by: luofei --- chardev/char.c | 11 +++++++++++ include/chardev/char.h | 1 + 2 files changed, 12 insertions(+) diff --git a/chardev/char.c b/chardev/char.c index 43cb431cdd..e606315b3c 100644 --- a/chardev/char.c +++ b/chardev/char.c @@ -1000,6 +1000,17 @@ void qemu_chr_set_feature(Chardev *chr, return set_bit(feature, chr->features); } +bool qemu_chr_cpr_support(Chardev *chr) +{ + bool has_feature_cpr; + + has_feature_cpr = qemu_chr_has_feature(chr, QEMU_CHAR_FEATURE_CPR); + if (chr->cpr_enabled || (has_feature_cpr && !chr->reopen_on_cpr)) + return true; + else + return false; +} + static Chardev *chardev_new(const char *id, const char *typename, ChardevBackend *backend, GMainContext *gcontext, diff --git a/include/chardev/char.h b/include/chardev/char.h index 1560a547be..020fce339b 100644 --- a/include/chardev/char.h +++ b/include/chardev/char.h @@ -229,6 +229,7 @@ bool qemu_chr_has_feature(Chardev *chr, ChardevFeature feature); void qemu_chr_set_feature(Chardev *chr, ChardevFeature feature); +bool qemu_chr_cpr_support(Chardev *chr); QemuOpts *qemu_chr_parse_compat(const char *label, const char *filename, bool permit_mux_mon); int qemu_chr_write(Chardev *s, const uint8_t *buf, int len, bool write_all); -- Gitee From 134475473d99ed16c9cd6dc1b5ae4593a2c15b71 Mon Sep 17 00:00:00 2001 From: luofei Date: Mon, 14 Aug 2023 09:57:52 +0800 Subject: [PATCH 49/56] charmonitor: stay connected to libvirtd Normally, the charmonitor Monitor commands are set in qmp_qmp_capabilities() when interacting with libvirtd, but libvirtd does not support qemu live update in cpr exec mode yet, so we will lose the charmonitor Monitor commands setting, and a permission verification error will occur on the connection between qemu and libvirtd. Therefore, use the cpr state save framework to implement connection permission inheritance between qemu and libvirtd. 
Signed-off-by: luofei --- chardev/char.c | 5 +++++ include/migration/cpr-state.h | 8 ++++++++ migration/cpr.c | 5 ++++- monitor/qmp-cmds-control.c | 17 +++++++++++++++++ monitor/qmp.c | 26 ++++++++++++++++++++++++++ 5 files changed, 60 insertions(+), 1 deletion(-) diff --git a/chardev/char.c b/chardev/char.c index e606315b3c..8805c9df41 100644 --- a/chardev/char.c +++ b/chardev/char.c @@ -323,6 +323,7 @@ static void char_class_init(ObjectClass *oc, void *data) static void char_finalize(Object *obj) { Chardev *chr = CHARDEV(obj); + char *name; if (chr->be) { chr->be->chr = NULL; @@ -330,7 +331,11 @@ static void char_finalize(Object *obj) if (chr->logfd != -1) { g_autofree char *fdname = g_strdup_printf("%s_log", chr->label); if (chr->cpr_enabled) { + name = g_strdup_printf("qmp-%s", chr->label); cpr_delete_fd(fdname, 0); + /* Delete qmp monitor */ + cpr_delete_fd(name, MONITOR_CAPAB); + g_free(name); } close(chr->logfd); } diff --git a/include/migration/cpr-state.h b/include/migration/cpr-state.h index a9ae6ae239..caa559824d 100644 --- a/include/migration/cpr-state.h +++ b/include/migration/cpr-state.h @@ -10,6 +10,14 @@ #include "qapi/qapi-types-migration.h" +/* + * Here, we use the cpr's file descriptor inheritance mechanism to + * pass the monitor capability. We define a special id number 0xFFFF + * to indicate that the fd is not a file descriptor. but a monitor + * capability. 
+ * */ +#define MONITOR_CAPAB 0XFFFF + typedef int (*cpr_walk_fd_cb)(const char *name, int id, int fd, void *opaque); void cpr_save_fd(const char *name, int id, int fd); diff --git a/migration/cpr.c b/migration/cpr.c index 698baa455e..3b82557435 100644 --- a/migration/cpr.c +++ b/migration/cpr.c @@ -18,7 +18,10 @@ static Notifier cpr_fd_notifier; static int preserve_fd(const char *name, int id, int fd, void *opaque) { - qemu_clear_cloexec(fd); + /* MONITOR_CAPAB means fd is not a file descriptor */ + if (id != MONITOR_CAPAB) + qemu_clear_cloexec(fd); + return 0; } diff --git a/monitor/qmp-cmds-control.c b/monitor/qmp-cmds-control.c index 6e581713a3..b858178e95 100644 --- a/monitor/qmp-cmds-control.c +++ b/monitor/qmp-cmds-control.c @@ -24,6 +24,7 @@ #include "qemu/osdep.h" +#include "migration/cpr-state.h" #include "monitor-internal.h" #include "qemu-version.h" #include "qapi/compat-policy.h" @@ -91,6 +92,22 @@ void qmp_qmp_capabilities(bool has_enable, QMPCapabilityList *enable, } mon->commands = &qmp_commands; + + if (qemu_chr_cpr_support(cur_mon->chr.chr)) { + int i, fd; + char *name; + + assert(sizeof(mon->capab) < sizeof(fd)); + name = g_strdup_printf("qmp-%s", cur_mon->chr.chr->label); + memset(&fd, 0, sizeof(fd)); + for (i = 0; i < QMP_CAPABILITY__MAX; i++) { + if (mon->capab[i]) + fd |= 1 << i; + } + /* MONITOR_CAPAB means mon->capab */ + cpr_resave_fd(name, MONITOR_CAPAB, fd); + g_free(name); + } } VersionInfo *qmp_query_version(Error **errp) diff --git a/monitor/qmp.c b/monitor/qmp.c index 4d1ac66785..a7568a4c69 100644 --- a/monitor/qmp.c +++ b/monitor/qmp.c @@ -31,6 +31,8 @@ #include "qapi/qmp/qdict.h" #include "qapi/qmp/qjson.h" #include "qapi/qmp/qlist.h" +#include "migration/misc.h" +#include "migration/cpr-state.h" #include "trace.h" struct QMPRequest { @@ -57,6 +59,30 @@ static void monitor_qmp_caps_reset(MonitorQMP *mon) memset(mon->capab_offered, 0, sizeof(mon->capab_offered)); memset(mon->capab, 0, sizeof(mon->capab)); 
mon->capab_offered[QMP_CAPABILITY_OOB] = mon->common.use_io_thread; + + /* + * The normal process is charmonitor Monitor commands is setted + * at qmp_qmp_capabilities() when interacting with libvirtd, + * but libvirtd do not support qemu live update reconnect yet, + * on cpr exec mode ,we will lose charmonitor Monitor commands + * setting, and a permission verificcation error will occurre on + * the connection between qemu and libvirtd. + * */ + if (migrate_mode() == MIG_MODE_CPR_EXEC) { + char *name; + int i, fd; + + name = g_strdup_printf("qmp-%s", mon->common.chr.chr->label); + fd = cpr_find_fd(name, MONITOR_CAPAB); + if (fd >= 0) { + mon->commands = &qmp_commands; + for (i = 0; i < QMP_CAPABILITY__MAX; i++) { + if (fd & (1 << i)) + mon->capab[i] = true; + } + } + g_free(name); + } } static void qmp_request_free(QMPRequest *req) -- Gitee From fb0c7ba683e5122d7654080ab5ecd1e80ba502cc Mon Sep 17 00:00:00 2001 From: luofei Date: Mon, 14 Aug 2023 10:34:22 +0800 Subject: [PATCH 50/56] vfio-pci: do not hot reset reused multiple in-use device This is a supplement to patch "vfio-pci: cpr part 1(217b779d830)", previous patch just suppressed single in-use device reset, but for multiple in-use device(such as NVIDIA GPU)supporting vfio_hot_reset_multi also needs to be suppressed, otherwise virtual machine will crash due to some memory region not mapped. 
Signed-off-by: luofei --- hw/vfio/pci.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 508559303f..e86242b41c 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -2491,6 +2491,8 @@ static int vfio_pci_hot_reset_one(VFIOPCIDevice *vdev) static int vfio_pci_hot_reset_multi(VFIODevice *vbasedev) { VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); + /* Reused dev should not call reset handler */ + assert(!vdev->vbasedev.reused); return vfio_pci_hot_reset(vdev, false); } @@ -2498,7 +2500,11 @@ static void vfio_pci_compute_needs_reset(VFIODevice *vbasedev) { VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); if (!vbasedev->reset_works || (!vdev->has_flr && vdev->has_pm_reset)) { - vbasedev->needs_reset = true; + if (vdev->vbasedev.reused) { + vbasedev->needs_reset = false; + } else { + vbasedev->needs_reset = true; + } } } -- Gitee From 78398613b19126d2a07c51bb1dc5b66c10cc9c76 Mon Sep 17 00:00:00 2001 From: luofei Date: Wed, 16 Aug 2023 14:41:33 +0800 Subject: [PATCH 51/56] vfio-pci: do not hot reset reused multiple in-use platform device Although the vfio platform device vfio_hot_reset_multi handler is not implemented, still suppress the reset handler, considering the possibility of implementing the handler later. 
Signed-off-by: luofei --- hw/vfio/platform.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c index f8f08a0f36..178ec824f7 100644 --- a/hw/vfio/platform.c +++ b/hw/vfio/platform.c @@ -425,12 +425,21 @@ fail_irqfd: static void vfio_platform_compute_needs_reset(VFIODevice *vbasedev) { - vbasedev->needs_reset = true; + if (vbasedev->reused) { + vbasedev->needs_reset = false; + } else { + vbasedev->needs_reset = true; + } } /* not implemented yet */ static int vfio_platform_hot_reset_multi(VFIODevice *vbasedev) { + /* + * Althrough platform hot reset handler not implemented, + * also put assert here to get attention. + */ + assert(!vbasedev->reused); return -1; } -- Gitee From 91e34507ab0fd58e904bac7943423210760d7f59 Mon Sep 17 00:00:00 2001 From: luofei Date: Mon, 14 Aug 2023 15:55:22 +0800 Subject: [PATCH 52/56] cpr: add cpr_exec_complete list to handle callbacks before vm start Add the cpr_exec_complete list to handle events that need to be processed before starting the virtual machine. It will be used in subsequent patches. 
Signed-off-by: luofei --- include/qemu/typedefs.h | 1 + include/sysemu/runstate.h | 6 +++++ migration/migration.c | 6 +++++ softmmu/runstate.c | 47 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 60 insertions(+) diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h index ee60eb3de4..a55b0ffaa1 100644 --- a/include/qemu/typedefs.h +++ b/include/qemu/typedefs.h @@ -123,6 +123,7 @@ typedef struct SSIBus SSIBus; typedef struct TranslationBlock TranslationBlock; typedef struct VirtIODevice VirtIODevice; typedef struct Visitor Visitor; +typedef struct CprExecCompleteEntry CprExecCompleteEntry; typedef struct VMChangeStateEntry VMChangeStateEntry; typedef struct VMStateDescription VMStateDescription; diff --git a/include/sysemu/runstate.h b/include/sysemu/runstate.h index 198211b526..d0d6b7facf 100644 --- a/include/sysemu/runstate.h +++ b/include/sysemu/runstate.h @@ -10,6 +10,12 @@ bool runstate_is_running(void); bool runstate_needs_reset(void); bool runstate_store(char *str, size_t size); +typedef void CprExecCompleteHandler(void *opaque); +CprExecCompleteEntry *qemu_add_cpr_exec_complete_handler( + CprExecCompleteHandler *cb, void *opaque); +void qemu_del_all_cpr_exec_complete_handler(void); +void cpr_exec_complete_notify(void); + typedef void VMChangeStateHandler(void *opaque, bool running, RunState state); VMChangeStateEntry *qemu_add_vm_change_state_handler(VMChangeStateHandler *cb, diff --git a/migration/migration.c b/migration/migration.c index f80109a8c8..7f7c660cac 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -527,6 +527,12 @@ static void process_incoming_migration_bh(void *opaque) dirty_bitmap_mig_before_vm_start(); + if (migrate_mode() == MIG_MODE_CPR_EXEC) { + cpr_exec_complete_notify(); + /* After qemu live update, no need to call handler anymore */ + qemu_del_all_cpr_exec_complete_handler(); + } + if (!global_state_received() || global_state_get_runstate() == RUN_STATE_RUNNING) { if (autostart) { diff --git 
a/softmmu/runstate.c b/softmmu/runstate.c index 3762f4597a..0a74fbe185 100644 --- a/softmmu/runstate.c +++ b/softmmu/runstate.c @@ -265,6 +265,53 @@ void qemu_system_vmstop_request(RunState state) qemu_mutex_unlock(&vmstop_lock); qemu_notify_event(); } + +struct CprExecCompleteEntry { + CprExecCompleteHandler *cb; + void *opaque; + QTAILQ_ENTRY(CprExecCompleteEntry) entries; +}; + +static QTAILQ_HEAD(, CprExecCompleteEntry) cpr_exec_complete_head = + QTAILQ_HEAD_INITIALIZER(cpr_exec_complete_head); + +/* + * qemu_add_cpr_exec_complete_handler: + * @cb: the callback to invoke + * @opaque: user data passed to the callback + */ +CprExecCompleteEntry *qemu_add_cpr_exec_complete_handler( + CprExecCompleteHandler *cb, void *opaque) +{ + CprExecCompleteEntry *e; + + e = g_malloc0(sizeof(*e)); + e->cb = cb; + e->opaque = opaque; + QTAILQ_INSERT_TAIL(&cpr_exec_complete_head, e, entries); + + return e; +} + +void qemu_del_all_cpr_exec_complete_handler(void) +{ + CprExecCompleteEntry *e = NULL, *next_e = NULL; + + QTAILQ_FOREACH_SAFE(e, &cpr_exec_complete_head, entries, next_e) { + QTAILQ_REMOVE(&cpr_exec_complete_head, e, entries); + g_free(e); + } +} + +void cpr_exec_complete_notify(void) +{ + CprExecCompleteEntry *e; + + QTAILQ_FOREACH(e, &cpr_exec_complete_head, entries) { + e->cb(e->opaque); + } +} + struct VMChangeStateEntry { VMChangeStateHandler *cb; void *opaque; -- Gitee From 81788358967dafd33da266587bae7829da141f97 Mon Sep 17 00:00:00 2001 From: luofei Date: Mon, 14 Aug 2023 16:35:00 +0800 Subject: [PATCH 53/56] vfio/pci: irq compensation after qemu live update After qemu live update, it is found that interrupts may be lost, which will cause an exception in the passthrough device (such as performance degradation of a GPU, etc.), so the interrupt is compensated here. 
Signed-off-by: luofei --- hw/vfio/pci.c | 51 ++++++++++++++++++++++++++++++++++++++++++++ hw/vfio/trace-events | 1 + 2 files changed, 52 insertions(+) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index e86242b41c..1e7222200f 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -431,6 +431,53 @@ static void vfio_msi_interrupt(void *opaque) notify(&vdev->pdev, nr); } +static void vfio_device_interrupt_compensate(void *opaque) +{ + VFIOPCIDevice *vdev = opaque; + VFIOMSIVector *vector; + MSIMessage msg; + int nr_vectors, nr; + + if (vdev->interrupt == VFIO_INT_MSIX) { + nr_vectors = vdev->nr_vectors; + for (nr = 0; nr < nr_vectors; nr++) { + vector = &vdev->msi_vectors[nr]; + event_notifier_test_and_clear(&vector->interrupt); + if (msix_is_masked(&vdev->pdev, nr)) { + set_bit(nr, vdev->msix->pending); + memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, true); + trace_vfio_msix_pba_enable(vdev->vbasedev.name); + } + msg = msix_get_message(&vdev->pdev, nr); + trace_vfio_msi_interrupt_compensate(vdev->vbasedev.name, + nr, msg.address, msg.data); + msix_notify(&vdev->pdev, nr); + } + } else if (vdev->interrupt == VFIO_INT_MSI) { + nr_vectors = vdev->nr_vectors; + for (nr = 0; nr < nr_vectors; nr++) { + vector = &vdev->msi_vectors[nr]; + event_notifier_test_and_clear(&vector->interrupt); + msg = msi_get_message(&vdev->pdev, nr); + trace_vfio_msi_interrupt_compensate(vdev->vbasedev.name, + nr, msg.address, msg.data); + msi_notify(&vdev->pdev, nr); + } + } else { + /* legacy int */ + event_notifier_test_and_clear(&vdev->intx.interrupt); + vdev->intx.pending = true; + pci_irq_assert(&vdev->pdev); + vfio_mmap_set_enabled(vdev, false); + if (vdev->intx.mmap_timeout) { + timer_mod(vdev->intx.mmap_timer, + qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout); + } + } + + return; +} + static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix) { struct vfio_irq_set *irq_set; @@ -3457,9 +3504,13 @@ static int vfio_pci_post_load(void *opaque, int version_id) ret = 
vfio_intx_enable(vdev, &err); if (ret) { error_report_err(err); + return ret; } } + qemu_add_cpr_exec_complete_handler(vfio_device_interrupt_compensate, + (void *)vdev); + return ret; } diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index 63dd0fe910..dd88516e99 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -9,6 +9,7 @@ vfio_intx_update(const char *name, int new_irq, int target_irq) " (%s) IRQ moved vfio_intx_enable(const char *name) " (%s)" vfio_intx_disable(const char *name) " (%s)" vfio_msi_interrupt(const char *name, int index, uint64_t addr, int data) " (%s) vector %d 0x%"PRIx64"/0x%x" +vfio_msi_interrupt_compensate(const char *name, int index, uint64_t addr, int data) " (%s) vector %d 0x%"PRIx64"/0x%x" vfio_msix_vector_do_use(const char *name, int index) " (%s) vector %d used" vfio_msix_vector_release(const char *name, int index) " (%s) vector %d released" vfio_msix_enable(const char *name) " (%s)" -- Gitee From 3e284a9e5776159686ef096d4d5e37f6ecb290ee Mon Sep 17 00:00:00 2001 From: luofei Date: Mon, 14 Aug 2023 16:57:35 +0800 Subject: [PATCH 54/56] migration/file: append pid to file URI Libvirtd passes the standard file URI to qemu; qemu appends its pid to the file URI to identify and distinguish it, and deletes the URI file when qemu live update completes. 
Signed-off-by: luofei --- migration/file.c | 36 ++++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/migration/file.c b/migration/file.c index 233bcda1ff..7708c3c3f7 100644 --- a/migration/file.c +++ b/migration/file.c @@ -12,24 +12,39 @@ #include "io/channel-file.h" #include "io/channel-util.h" #include "trace.h" +#include "sysemu/runstate.h" +#include void file_start_outgoing_migration(MigrationState *s, const char *filename, Error **errp) { g_autoptr(QIOChannelFile) fioc = NULL; QIOChannel *ioc; + pid_t pid; + char *filename_p; - trace_migration_file_outgoing(filename); + pid = getpid(); + filename_p = g_strdup_printf("%s.%d", filename, pid); + trace_migration_file_outgoing(filename_p); - fioc = qio_channel_file_new_path(filename, O_CREAT | O_WRONLY | O_TRUNC, + fioc = qio_channel_file_new_path(filename_p, O_CREAT | O_WRONLY | O_TRUNC, 0600, errp); if (!fioc) { + g_free(filename_p); return; } ioc = QIO_CHANNEL(fioc); qio_channel_set_name(ioc, "migration-file-outgoing"); migration_channel_connect(s, ioc, NULL, NULL); + g_free(filename_p); +} + +static void file_migrate_complete_unlink_file(void *opaque) +{ + char *filename = opaque; + unlink(filename); + g_free(filename); } static gboolean file_accept_incoming_migration(QIOChannel *ioc, @@ -45,11 +60,17 @@ void file_start_incoming_migration(const char *filename, Error **errp) { QIOChannelFile *fioc = NULL; QIOChannel *ioc; + pid_t pid; + char *filename_p; - trace_migration_file_incoming(filename); + pid = getpid(); + filename_p = g_strdup_printf("%s.%d", filename, pid); + trace_migration_file_incoming(filename_p); - fioc = qio_channel_file_new_path(filename, O_RDONLY, 0, errp); + + fioc = qio_channel_file_new_path(filename_p, O_RDONLY, 0, errp); if (!fioc) { + g_free(filename_p); return; } @@ -59,4 +80,11 @@ void file_start_incoming_migration(const char *filename, Error **errp) file_accept_incoming_migration, NULL, NULL, g_main_context_get_thread_default()); + + /* + 
* Register Handler to delete VM state save file when + * qemu live update complete + */ + qemu_add_cpr_exec_complete_handler(file_migrate_complete_unlink_file, + (void *)filename_p); } -- Gitee From d2dad2831e41585f45016c5163d20709e83eab70 Mon Sep 17 00:00:00 2001 From: luofei Date: Mon, 14 Aug 2023 17:11:11 +0800 Subject: [PATCH 55/56] cpr: ignore qemu -S option on cpr exec mode Cpr exec mode ignores the qemu '-S' option to reduce interactions with libvirtd and accelerate qemu live update. This does not prevent other cases from setting autostart to false. We cannot delete the '-S' option directly, because it needs to stay compatible with other cases such as virtual machine live migration. Signed-off-by: luofei --- migration/cpr-state.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/migration/cpr-state.c b/migration/cpr-state.c index 528e4f9200..5a8c2757af 100644 --- a/migration/cpr-state.c +++ b/migration/cpr-state.c @@ -17,6 +17,7 @@ #include "migration/misc.h" #include "migration/qemu-file.h" #include "migration/qemu-file-channel.h" +#include "sysemu/sysemu.h" #include "trace.h" /*************************************************************************/ @@ -323,6 +324,12 @@ int cpr_state_load(Error **errp) if (!ret) { migrate_get_current()->parameters.mode = cpr_state.mode; + assert(migrate_mode() == MIG_MODE_CPR_EXEC); + /* + * cpr exec mode ignore qemu -S option to reduce interactions + * with libvirtd and acelerate qemu live update. + */ + autostart = true; } else { error_setg(errp, "vmstate_load_state error %d", ret); } -- Gitee From e71b531674f0367ec973359b3c30fe8f3f3893a1 Mon Sep 17 00:00:00 2001 From: luofei Date: Wed, 16 Aug 2023 10:43:55 +0800 Subject: [PATCH 56/56] cpr: set migration mode to normal when qemu live update finished After qemu live update, set the migration mode to normal, otherwise qemu will crash when qemu and libvirtd are disconnected and reconnected. 
Signed-off-by: luofei --- migration/migration.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/migration/migration.c b/migration/migration.c index 7f7c660cac..ad4bca606b 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -559,6 +559,12 @@ static void process_incoming_migration_bh(void *opaque) MIGRATION_STATUS_COMPLETED); qemu_bh_delete(mis->bh); migration_incoming_state_destroy(); + + /* After qemu live update, set the migration mode to normal */ + if (migrate_mode() == MIG_MODE_CPR_EXEC) { + MigrationState *s = migrate_get_current(); + s->parameters.mode = MIG_MODE_NORMAL; + } } static void process_incoming_migration_co(void *opaque) -- Gitee