diff --git a/MAINTAINERS b/MAINTAINERS index fbd6d0b174afa18243e912dae6ebf4ee9c3d8103..85ca4cd21434ea6860c5f2913d5636f628dc7213 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2868,6 +2868,7 @@ F: tests/qtest/migration-test.c F: docs/devel/migration.rst F: qapi/migration.json F: tests/migration/ +F: stubs/migration.c D-Bus M: Marc-André Lureau @@ -2984,6 +2985,19 @@ F: net/colo* F: net/filter-rewriter.c F: net/filter-mirror.c +CPR +M: Steve Sistare +R: Mark Kanda +S: Maintained +F: tests/unit/test-strlist.c +F: include/migration/cpr-state.h +F: migration/cpr-state.c +F: stubs/cpr-state.c +F: include/migration/cpr.h +F: migration/cpr.c +F: hw/vfio/cpr.c +F: tests/avocado/cpr.py + Record/replay M: Pavel Dovgalyuk R: Paolo Bonzini diff --git a/accel/xen/xen-all.c b/accel/xen/xen-all.c index 69aa7d018b28bab1cda4dfa5607c61e1b3eb8a63..bf188ab15a511e1d067411a9d3a0fd6887d4bea8 100644 --- a/accel/xen/xen-all.c +++ b/accel/xen/xen-all.c @@ -181,6 +181,10 @@ static int xen_init(MachineState *ms) * opt out of system RAM being allocated by generic code */ mc->default_ram_id = NULL; + + migrate_add_blocker_always("xen does not support cpr exec", + &error_fatal, MIG_MODE_CPR_EXEC, -1); + return 0; } diff --git a/backends/hostmem-epc.c b/backends/hostmem-epc.c index b47f98b6a3aa67ccb87e9a4e1d825ae572e3ec1f..2a57b92bea580e33b9433758f051fb710a522528 100644 --- a/backends/hostmem-epc.c +++ b/backends/hostmem-epc.c @@ -17,32 +17,34 @@ #include "qapi/error.h" #include "sysemu/hostmem.h" #include "hw/i386/hostmem-epc.h" +#include "migration/cpr-state.h" static void sgx_epc_backend_memory_alloc(HostMemoryBackend *backend, Error **errp) { uint32_t ram_flags; - char *name; - int fd; + g_autofree char *name = object_get_canonical_path(OBJECT(backend)); + int fd = cpr_find_fd(name, 0); if (!backend->size) { error_setg(errp, "can't create backend with size 0"); return; } - fd = qemu_open_old("/dev/sgx_vepc", O_RDWR); if (fd < 0) { - error_setg_errno(errp, errno, - "failed to open /dev/sgx_vepc to alloc SGX EPC"); - return; + fd = qemu_open_old("/dev/sgx_vepc", O_RDWR); + if (fd < 0) { + error_setg_errno(errp, errno, + "failed to open /dev/sgx_vepc to alloc SGX EPC"); + return; + } + cpr_save_fd(name, 0, fd); } - name = object_get_canonical_path(OBJECT(backend)); ram_flags = (backend->share ? RAM_SHARED : 0) | RAM_PROTECTED; memory_region_init_ram_from_fd(&backend->mr, OBJECT(backend), name, backend->size, ram_flags, fd, 0, errp); - g_free(name); } static void sgx_epc_backend_instance_init(Object *obj) diff --git a/backends/hostmem-file.c b/backends/hostmem-file.c index cd038024fae3c056cc2f01e4654c2e962bb0336c..0ffaead394463d3837f74e498dd2f29662f05cfe 100644 --- a/backends/hostmem-file.c +++ b/backends/hostmem-file.c @@ -55,6 +55,7 @@ file_backend_memory_alloc(HostMemoryBackend *backend, Error **errp) ram_flags = backend->share ? RAM_SHARED : 0; ram_flags |= backend->reserve ? 0 : RAM_NORESERVE; ram_flags |= fb->is_pmem ? 
RAM_PMEM : 0; + ram_flags |= RAM_NAMED_FILE; memory_region_init_ram_from_file(&backend->mr, OBJECT(backend), name, backend->size, fb->align, ram_flags, fb->mem_path, fb->readonly, errp); diff --git a/backends/hostmem-memfd.c b/backends/hostmem-memfd.c index 3fc85c3db81bb71176cdfe65dbf0643e1504355b..d421a4fea5140b55f0f3fd1047c5c1727b8329fc 100644 --- a/backends/hostmem-memfd.c +++ b/backends/hostmem-memfd.c @@ -14,6 +14,7 @@ #include "sysemu/hostmem.h" #include "qom/object_interfaces.h" #include "qemu/memfd.h" +#include "migration/cpr-state.h" #include "qemu/module.h" #include "qapi/error.h" #include "qom/object.h" @@ -36,28 +37,29 @@ memfd_backend_memory_alloc(HostMemoryBackend *backend, Error **errp) { HostMemoryBackendMemfd *m = MEMORY_BACKEND_MEMFD(backend); uint32_t ram_flags; - char *name; - int fd; + g_autofree char *name = host_memory_backend_get_name(backend); + int fd = cpr_find_fd(name, 0); if (!backend->size) { error_setg(errp, "can't create backend with size 0"); return; } - fd = qemu_memfd_create(TYPE_MEMORY_BACKEND_MEMFD, backend->size, - m->hugetlb, m->hugetlbsize, m->seal ? - F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL : 0, - errp); - if (fd == -1) { - return; + if (fd < 0) { + fd = qemu_memfd_create(TYPE_MEMORY_BACKEND_MEMFD, backend->size, + m->hugetlb, m->hugetlbsize, m->seal ? + F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL : 0, + errp); + if (fd == -1) { + return; + } + cpr_save_fd(name, 0, fd); } - name = host_memory_backend_get_name(backend); ram_flags = backend->share ? RAM_SHARED : 0; ram_flags |= backend->reserve ? 0 : RAM_NORESERVE; memory_region_init_ram_from_fd(&backend->mr, OBJECT(backend), name, backend->size, ram_flags, fd, 0, errp); - g_free(name); } static bool diff --git a/backends/tpm/tpm_emulator.c b/backends/tpm/tpm_emulator.c index 87d061e9bbd61cb572dfb0aa2e1bb68aac04edf4..9721927660e98dc6cc2ccc2d3a22e3ff2f6d403b 100644 --- a/backends/tpm/tpm_emulator.c +++ b/backends/tpm/tpm_emulator.c @@ -492,10 +492,9 @@ static int tpm_emulator_block_migration(TPMEmulator *tpm_emu) error_setg(&tpm_emu->migration_blocker, "Migration disabled: TPM emulator does not support " "migration"); - if (migrate_add_blocker(tpm_emu->migration_blocker, &err) < 0) { + if (migrate_add_blockers(&tpm_emu->migration_blocker, &err, + MIG_MODE_NORMAL, -1) < 0) { error_report_err(err); - error_free(tpm_emu->migration_blocker); - tpm_emu->migration_blocker = NULL; return -1; } @@ -950,10 +949,7 @@ static void tpm_emulator_inst_finalize(Object *obj) qapi_free_TPMEmulatorOptions(tpm_emu->options); - if (tpm_emu->migration_blocker) { - migrate_del_blocker(tpm_emu->migration_blocker); - error_free(tpm_emu->migration_blocker); - } + migrate_del_blocker(&tpm_emu->migration_blocker); tpm_sized_buffer_reset(&state_blobs->volatil); tpm_sized_buffer_reset(&state_blobs->permanent); diff --git a/block/parallels.c b/block/parallels.c index 6ebad2a2bbc9129ee184b7a38ce66ef5346e33de..9a4a9e6df07f35a69fd9f4fb738572422935b8d2 100644 --- a/block/parallels.c +++ b/block/parallels.c @@ -877,9 +877,9 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags, error_setg(&s->migration_blocker, "The Parallels format used by node '%s' " "does not support live migration", bdrv_get_device_or_node_name(bs)); - ret = migrate_add_blocker(s->migration_blocker, errp); + ret = migrate_add_blockers(&s->migration_blocker, errp, MIG_MODE_NORMAL, + -1); if (ret < 0) { - error_free(s->migration_blocker); goto fail; } qemu_co_mutex_init(&s->lock); @@ -911,8 +911,7 @@ static void parallels_close(BlockDriverState 
*bs) g_free(s->bat_dirty_bmap); qemu_vfree(s->header); - migrate_del_blocker(s->migration_blocker); - error_free(s->migration_blocker); + migrate_del_blocker(&s->migration_blocker); } static BlockDriver bdrv_parallels = { diff --git a/block/qcow.c b/block/qcow.c index c39940f33ebea70b48aa4f7cdc8ffe35084fb48f..2c7885d2243a1d827542a7532dc922c3e32fa810 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -304,9 +304,9 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags, error_setg(&s->migration_blocker, "The qcow format used by node '%s' " "does not support live migration", bdrv_get_device_or_node_name(bs)); - ret = migrate_add_blocker(s->migration_blocker, errp); + ret = migrate_add_blockers(&s->migration_blocker, errp, MIG_MODE_NORMAL, + -1); if (ret < 0) { - error_free(s->migration_blocker); goto fail; } @@ -798,8 +798,7 @@ static void qcow_close(BlockDriverState *bs) g_free(s->cluster_cache); g_free(s->cluster_data); - migrate_del_blocker(s->migration_blocker); - error_free(s->migration_blocker); + migrate_del_blocker(&s->migration_blocker); } static int coroutine_fn qcow_co_create(BlockdevCreateOptions *opts, diff --git a/block/vdi.c b/block/vdi.c index bdc58d726ee13a18c7c93e6a1a6bcc35e1f92465..01032843949d8f0be716a24be3b91de531209c69 100644 --- a/block/vdi.c +++ b/block/vdi.c @@ -494,9 +494,9 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags, error_setg(&s->migration_blocker, "The vdi format used by node '%s' " "does not support live migration", bdrv_get_device_or_node_name(bs)); - ret = migrate_add_blocker(s->migration_blocker, errp); + ret = migrate_add_blockers(&s->migration_blocker, errp, MIG_MODE_NORMAL, + -1); if (ret < 0) { - error_free(s->migration_blocker); goto fail_free_bmap; } @@ -984,8 +984,7 @@ static void vdi_close(BlockDriverState *bs) qemu_vfree(s->bmap); - migrate_del_blocker(s->migration_blocker); - error_free(s->migration_blocker); + migrate_del_blocker(&s->migration_blocker); } static int vdi_has_zero_init(BlockDriverState *bs) diff --git a/block/vhdx.c b/block/vhdx.c index 356ec4c455a42be6e5f68b62c712536e9f2f6310..85557d106c285a175bae9123cbf186811f22e0ec 100644 --- a/block/vhdx.c +++ b/block/vhdx.c @@ -980,8 +980,7 @@ static void vhdx_close(BlockDriverState *bs) s->bat = NULL; qemu_vfree(s->parent_entries); s->parent_entries = NULL; - migrate_del_blocker(s->migration_blocker); - error_free(s->migration_blocker); + migrate_del_blocker(&s->migration_blocker); qemu_vfree(s->log.hdr); s->log.hdr = NULL; vhdx_region_unregister_all(s); @@ -1089,9 +1088,9 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags, error_setg(&s->migration_blocker, "The vhdx format used by node '%s' " "does not support live migration", bdrv_get_device_or_node_name(bs)); - ret = migrate_add_blocker(s->migration_blocker, errp); + ret = migrate_add_blockers(&s->migration_blocker, errp, MIG_MODE_NORMAL, + -1); if (ret < 0) { - error_free(s->migration_blocker); goto fail; } diff --git a/block/vmdk.c b/block/vmdk.c index 0dfab6e94130246a2dbc8d0646513f90487fbb6d..10024d872b7cb5587f9b96d5115ccf06cc54ea30 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1314,9 +1314,9 @@ static int vmdk_open(BlockDriverState *bs, QDict *options, int flags, error_setg(&s->migration_blocker, "The vmdk format used by node '%s' " "does not support live migration", bdrv_get_device_or_node_name(bs)); - ret = migrate_add_blocker(s->migration_blocker, errp); + ret = migrate_add_blockers(&s->migration_blocker, errp, MIG_MODE_NORMAL, + -1); if (ret < 0) { - 
error_free(s->migration_blocker); goto fail; } @@ -2810,8 +2810,7 @@ static void vmdk_close(BlockDriverState *bs) vmdk_free_extents(bs); g_free(s->create_type); - migrate_del_blocker(s->migration_blocker); - error_free(s->migration_blocker); + migrate_del_blocker(&s->migration_blocker); } static int64_t vmdk_get_allocated_file_size(BlockDriverState *bs) diff --git a/block/vpc.c b/block/vpc.c index 297a26262ab1e58d6d42549decda77578319a73c..09da705cdb94b84595c8485318771f29b25da856 100644 --- a/block/vpc.c +++ b/block/vpc.c @@ -449,9 +449,9 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags, error_setg(&s->migration_blocker, "The vpc format used by node '%s' " "does not support live migration", bdrv_get_device_or_node_name(bs)); - ret = migrate_add_blocker(s->migration_blocker, errp); + ret = migrate_add_blockers(&s->migration_blocker, errp, MIG_MODE_NORMAL, + -1); if (ret < 0) { - error_free(s->migration_blocker); goto fail; } @@ -1186,8 +1186,7 @@ static void vpc_close(BlockDriverState *bs) g_free(s->pageentry_u8); #endif - migrate_del_blocker(s->migration_blocker); - error_free(s->migration_blocker); + migrate_del_blocker(&s->migration_blocker); } static QemuOptsList vpc_create_opts = { diff --git a/block/vvfat.c b/block/vvfat.c index 5dacc6cfac42decf61c04e15dc07dbc033316130..43993b506f5558517a245cdc4b3cf6dbf9145815 100644 --- a/block/vvfat.c +++ b/block/vvfat.c @@ -1266,9 +1266,9 @@ static int vvfat_open(BlockDriverState *bs, QDict *options, int flags, "The vvfat (rw) format used by node '%s' " "does not support live migration", bdrv_get_device_or_node_name(bs)); - ret = migrate_add_blocker(s->migration_blocker, errp); + ret = migrate_add_blockers(&s->migration_blocker, errp, MIG_MODE_NORMAL, + -1); if (ret < 0) { - error_free(s->migration_blocker); goto fail; } } @@ -3202,8 +3202,7 @@ static void vvfat_close(BlockDriverState *bs) g_free(s->cluster_buffer); if (s->qcow) { - migrate_del_blocker(s->migration_blocker); - error_free(s->migration_blocker); + migrate_del_blocker(&s->migration_blocker); } } diff --git a/chardev/char-mux.c b/chardev/char-mux.c index ee2d47b20d9bd0d2ceb132343bf3a9e7584da259..d47fa31b6b9a73f3ea24bc518ba8bbd480e165de 100644 --- a/chardev/char-mux.c +++ b/chardev/char-mux.c @@ -337,6 +337,7 @@ static void qemu_chr_open_mux(Chardev *chr, */ *be_opened = muxes_opened; qemu_chr_fe_init(&d->chr, drv, errp); + qemu_chr_set_feature(chr, QEMU_CHAR_FEATURE_CPR); } static void qemu_chr_parse_mux(QemuOpts *opts, ChardevBackend *backend, diff --git a/chardev/char-null.c b/chardev/char-null.c index 1c6a2900f9b5d506fdcab6ed2871046178f95dce..02acaff03d118ee912dc5ac4448d28c9e7a6eb5f 100644 --- a/chardev/char-null.c +++ b/chardev/char-null.c @@ -32,6 +32,7 @@ static void null_chr_open(Chardev *chr, Error **errp) { *be_opened = false; + qemu_chr_set_feature(chr, QEMU_CHAR_FEATURE_CPR); } static void char_null_class_init(ObjectClass *oc, void *data) diff --git a/chardev/char-pty.c b/chardev/char-pty.c index a2d1e7c985bc49a76b27abb44998332df4f3da47..960645588c5cf3e59efdeff4fa9847134415fd50 100644 --- a/chardev/char-pty.c +++ b/chardev/char-pty.c @@ -30,6 +30,7 @@ #include "qemu/sockets.h" #include "qemu/error-report.h" #include "qemu/module.h" +#include "migration/cpr-state.h" #include "qemu/qemu-print.h" #include "chardev/char-io.h" @@ -191,6 +192,9 @@ static void char_pty_finalize(Object *obj) Chardev *chr = CHARDEV(obj); PtyChardev *s = PTY_CHARDEV(obj); + if (chr->cpr_enabled) { + cpr_delete_fd(chr->label, 0); + } pty_chr_state(chr, 0); 
object_unref(OBJECT(s->ioc)); pty_chr_timer_cancel(s); @@ -207,12 +211,17 @@ static void char_pty_open(Chardev *chr, char pty_name[PATH_MAX]; char *name; + master_fd = cpr_find_fd(chr->label, 0); + if (master_fd >= 0) { + chr->filename = g_strdup_printf("pty:unknown"); + goto have_fd; + } master_fd = qemu_openpty_raw(&slave_fd, pty_name); if (master_fd < 0) { error_setg_errno(errp, errno, "Failed to create PTY"); return; } - + cpr_save_fd(chr->label, 0, master_fd); close(slave_fd); qemu_set_nonblock(master_fd); @@ -220,6 +229,8 @@ static void char_pty_open(Chardev *chr, qemu_printf("char device redirected to %s (label %s)\n", pty_name, chr->label); +have_fd: + qemu_chr_set_feature(chr, QEMU_CHAR_FEATURE_CPR); s = PTY_CHARDEV(chr); s->ioc = QIO_CHANNEL(qio_channel_file_new_fd(master_fd)); name = g_strdup_printf("chardev-pty-%s", chr->label); diff --git a/chardev/char-serial.c b/chardev/char-serial.c index 7c3d84ae243e9fdea610673fe5acfd8dda10c516..a6bd439983291aaf35172b8ccced5303702f4b9e 100644 --- a/chardev/char-serial.c +++ b/chardev/char-serial.c @@ -38,6 +38,7 @@ #endif #include "chardev/char-serial.h" +#include "migration/cpr-state.h" #ifdef _WIN32 @@ -266,14 +267,22 @@ static void qmp_chardev_open_serial(Chardev *chr, ChardevHostdev *serial = backend->u.serial.data; int fd; + fd = cpr_find_fd(chr->label, 0); + if (fd >= 0) { + goto have_fd; + } fd = qmp_chardev_open_file_source(serial->device, O_RDWR | O_NONBLOCK, errp); if (fd < 0) { return; + } else { + cpr_save_fd(chr->label, 0, fd); } qemu_set_nonblock(fd); tty_serial_init(fd, 115200, 'N', 8, 1); +have_fd: + qemu_chr_set_feature(chr, QEMU_CHAR_FEATURE_CPR); qemu_chr_open_fd(chr, fd, fd); } #endif /* __linux__ || __sun__ */ diff --git a/chardev/char-socket.c b/chardev/char-socket.c index 57ae53304ab58bd7aa7dd842b4dd272831871b1c..5636f249484c233e462c0f4b4b041fd9fc1b2367 100644 --- a/chardev/char-socket.c +++ b/chardev/char-socket.c @@ -27,6 +27,9 @@ #include "io/channel-socket.h" #include "io/channel-tls.h" #include "io/channel-websock.h" +#include "migration/blocker.h" +#include "migration/cpr-state.h" +#include "migration/misc.h" #include "io/net-listener.h" #include "qemu/error-report.h" #include "qemu/module.h" @@ -35,6 +38,7 @@ #include "qapi/clone-visitor.h" #include "qapi/qapi-visit-sockets.h" #include "qemu/yank.h" +#include "sysemu/sysemu.h" #include "chardev/char-io.h" #include "qom/object.h" @@ -87,6 +91,7 @@ struct SocketChardev { bool connect_err_reported; QIOTask *connect_task; + Error *cpr_blocker; }; typedef struct SocketChardev SocketChardev; @@ -436,6 +441,11 @@ static void tcp_chr_free_connection(Chardev *chr) SocketChardev *s = SOCKET_CHARDEV(chr); int i; + if (chr->cpr_enabled) { + cpr_delete_fd(chr->label, 0); + } + migrate_del_blocker(&s->cpr_blocker); + if (s->read_msgfds_num) { for (i = 0; i < s->read_msgfds_num; i++) { close(s->read_msgfds[i]); @@ -998,6 +1008,10 @@ static void tcp_chr_accept(QIONetListener *listener, QIO_CHANNEL(cioc)); } tcp_chr_new_client(chr, cioc); + + if (s->sioc && chr->cpr_enabled) { + cpr_resave_fd(chr->label, 0, s->sioc->fd); + } } @@ -1253,6 +1267,27 @@ static gboolean socket_reconnect_timeout(gpointer opaque) return false; } +static int load_char_socket_fd(Chardev *chr, Error **errp) +{ + ERRP_GUARD(); + SocketChardev *sockchar = SOCKET_CHARDEV(chr); + QIOChannelSocket *sioc; + const char *label = chr->label; + int fd = cpr_find_fd(label, 0); + + if (fd != -1) { + sockchar = SOCKET_CHARDEV(chr); + sioc = qio_channel_socket_new_fd(fd, errp); + if (sioc) { + 
tcp_chr_accept(sockchar->listener, sioc, chr); + object_unref(OBJECT(sioc)); + } else { + error_prepend(errp, "could not restore socket for %s", label); + return -1; + } + } + return 0; +} static int qmp_chardev_open_socket_server(Chardev *chr, bool is_telnet, @@ -1457,6 +1492,18 @@ static void qmp_chardev_open_socket(Chardev *chr, } s->registered_yank = true; + if (!s->tls_creds && !s->is_websock) { + qemu_chr_set_feature(chr, QEMU_CHAR_FEATURE_CPR); + } else if (!chr->reopen_on_cpr) { + s->cpr_blocker = NULL; + error_setg(&s->cpr_blocker, + "error: socket %s is not cpr capable due to %s option", + chr->label, (s->tls_creds ? "TLS" : "websocket")); + if (migrate_add_blockers(&s->cpr_blocker, errp, MIG_MODE_CPR_EXEC, + -1)) { + return; + } + } /* be isn't opened until we get a connection */ *be_opened = false; @@ -1472,6 +1519,12 @@ static void qmp_chardev_open_socket(Chardev *chr, return; } } + + if (migrate_mode() == MIG_MODE_CPR_EXEC && + qemu_chr_has_feature(chr, QEMU_CHAR_FEATURE_CPR) && + !chr->reopen_on_cpr && !is_waitconnect) { + load_char_socket_fd(chr, errp); + } } static void qemu_chr_parse_socket(QemuOpts *opts, ChardevBackend *backend, diff --git a/chardev/char-stdio.c b/chardev/char-stdio.c index 403da308c980ebf24eaacb61cc2e77c6ffb5ad03..c18666e45e9967468143a733d5e5f19f3cee0301 100644 --- a/chardev/char-stdio.c +++ b/chardev/char-stdio.c @@ -27,6 +27,7 @@ #include "qemu/option.h" #include "qemu/sockets.h" #include "qapi/error.h" +#include "migration/misc.h" #include "chardev/char.h" #ifdef _WIN32 @@ -40,19 +41,46 @@ #ifndef _WIN32 /* init terminal so that we can grab keys */ static struct termios oldtty; +static struct termios newtty; static int old_fd0_flags; +static int new_fd0_flags; static bool stdio_in_use; static bool stdio_allow_signal; static bool stdio_echo_state; +static Notifier cpr_notifier; static void term_exit(void) { if (stdio_in_use) { + tcgetattr(0, &newtty); + new_fd0_flags = fcntl(0, F_GETFL); + tcsetattr(0, TCSANOW, &oldtty); fcntl(0, F_SETFL, old_fd0_flags); } } +static void term_reenter(void) +{ + if (stdio_in_use) { + tcsetattr(0, TCSANOW, &newtty); + fcntl(0, F_SETFL, new_fd0_flags); + } +} + +static void term_cpr_exec_notifier(Notifier *notifier, void *data) +{ + MigrationState *s = data; + + if (migrate_mode_of(s) == MIG_MODE_CPR_EXEC) { + if (migration_has_finished(s)) { + term_exit(); + } else if (migration_has_failed(s)) { + term_reenter(); + } + } +} + static void qemu_chr_set_echo_stdio(Chardev *chr, bool echo) { struct termios tty; @@ -114,6 +142,8 @@ static void qemu_chr_open_stdio(Chardev *chr, stdio_allow_signal = !opts->has_signal || opts->signal; qemu_chr_set_echo_stdio(chr, false); + qemu_chr_set_feature(chr, QEMU_CHAR_FEATURE_CPR); + migration_add_notifier(&cpr_notifier, term_cpr_exec_notifier); } #endif @@ -144,6 +174,7 @@ static void char_stdio_finalize(Object *obj) { #ifndef _WIN32 term_exit(); + migration_remove_notifier(&cpr_notifier); #endif } diff --git a/chardev/char.c b/chardev/char.c index 0169d8dde4b533c9cf851831b03c8adcac24cff5..8805c9df4150b7690237e6b04d49e05eac9fa6ea 100644 --- a/chardev/char.c +++ b/chardev/char.c @@ -36,6 +36,8 @@ #include "qemu/help_option.h" #include "qemu/module.h" #include "qemu/option.h" +#include "migration/cpr-state.h" +#include "migration/blocker.h" #include "qemu/id.h" #include "qemu/coroutine.h" #include "qemu/yank.h" @@ -236,26 +238,54 @@ int qemu_chr_add_client(Chardev *s, int fd) static void qemu_char_open(Chardev *chr, ChardevBackend *backend, bool *be_opened, Error **errp) { + 
ERRP_GUARD(); + g_autofree char *fdname = NULL; + ChardevClass *cc = CHARDEV_GET_CLASS(chr); /* Any ChardevCommon member would work */ ChardevCommon *common = backend ? backend->u.null.data : NULL; + bool has_logfile = (common && common->has_logfile); + bool has_feature_cpr; - if (common && common->has_logfile) { + if (has_logfile) { int flags = O_WRONLY; + fdname = g_strdup_printf("%s_log", chr->label); if (common->has_logappend && common->logappend) { flags |= O_APPEND; } else { flags |= O_TRUNC; } - chr->logfd = qemu_create(common->logfile, flags, 0666, errp); + chr->logfd = cpr_find_fd(fdname, 0); + if (chr->logfd < 0) { + chr->logfd = qemu_create(common->logfile, flags, 0666, errp); + } if (chr->logfd < 0) { return; } } + chr->reopen_on_cpr = (common && common->reopen_on_cpr); + if (cc->open) { cc->open(chr, backend, be_opened, errp); + if (*errp) { + return; + } + } + + /* Evaluate this after the open method sets the feature */ + has_feature_cpr = qemu_chr_has_feature(chr, QEMU_CHAR_FEATURE_CPR); + chr->cpr_enabled = !chr->reopen_on_cpr && has_feature_cpr; + + if (!chr->reopen_on_cpr && !has_feature_cpr) { + chr->cpr_blocker = NULL; + error_setg(&chr->cpr_blocker, + "chardev %s -> %s does not allow cpr. See reopen-on-cpr.", + chr->label, chr->filename); + migrate_add_blockers(&chr->cpr_blocker, errp, MIG_MODE_CPR_EXEC, -1); + } else if (chr->cpr_enabled && has_logfile) { + cpr_resave_fd(fdname, 0, chr->logfd); } } @@ -293,15 +323,25 @@ static void char_class_init(ObjectClass *oc, void *data) static void char_finalize(Object *obj) { Chardev *chr = CHARDEV(obj); + char *name; if (chr->be) { chr->be->chr = NULL; } - g_free(chr->filename); - g_free(chr->label); if (chr->logfd != -1) { + g_autofree char *fdname = g_strdup_printf("%s_log", chr->label); + if (chr->cpr_enabled) { + name = g_strdup_printf("qmp-%s", chr->label); + cpr_delete_fd(fdname, 0); + /* Delete qmp monitor */ + cpr_delete_fd(name, MONITOR_CAPAB); + g_free(name); + } close(chr->logfd); } + migrate_del_blocker(&chr->cpr_blocker); + g_free(chr->filename); + g_free(chr->label); qemu_mutex_destroy(&chr->chr_write_lock); } @@ -501,6 +541,8 @@ void qemu_chr_parse_common(QemuOpts *opts, ChardevCommon *backend) backend->has_logappend = true; backend->logappend = qemu_opt_get_bool(opts, "logappend", false); + + backend->reopen_on_cpr = qemu_opt_get_bool(opts, "reopen-on-cpr", false); } static const ChardevClass *char_get_class(const char *driver, Error **errp) @@ -942,6 +984,9 @@ QemuOptsList qemu_chardev_opts = { },{ .name = "abstract", .type = QEMU_OPT_BOOL, + },{ + .name = "reopen-on-cpr", + .type = QEMU_OPT_BOOL, #endif }, { /* end of list */ } @@ -960,6 +1005,17 @@ void qemu_chr_set_feature(Chardev *chr, return set_bit(feature, chr->features); } +bool qemu_chr_cpr_support(Chardev *chr) +{ + bool has_feature_cpr; + + has_feature_cpr = qemu_chr_has_feature(chr, QEMU_CHAR_FEATURE_CPR); + if (chr->cpr_enabled || (has_feature_cpr && !chr->reopen_on_cpr)) + return true; + else + return false; +} + static Chardev *chardev_new(const char *id, const char *typename, ChardevBackend *backend, GMainContext *gcontext, diff --git a/dump/dump.c b/dump/dump.c index 662d0a62cd9c44796d2bc23885659df1bc1c0e91..38c6192950a9ee99f9d494789d6b48d2b7938c66 100644 --- a/dump/dump.c +++ b/dump/dump.c @@ -104,7 +104,7 @@ static int dump_cleanup(DumpState *s) qemu_mutex_unlock_iothread(); } } - migrate_del_blocker(dump_migration_blocker); + migrate_del_blocker(&dump_migration_blocker); return 0; } @@ -2018,7 +2018,7 @@ void qmp_dump_guest_memory(bool 
paging, const char *file, * Allows even for -only-migratable, but forbid migration during the * process of dump guest memory. */ - if (migrate_add_blocker_internal(dump_migration_blocker, errp)) { + if (migrate_add_blocker_internal(&dump_migration_blocker, errp)) { /* Remember to release the fd before passing it over to dump state */ close(fd); return; diff --git a/gdbstub.c b/gdbstub.c index 141d7bc4ec151342a29f0d8baf212dc722f942fa..f03d43bb7c438a0962cb15994eb1e019ad0cc293 100644 --- a/gdbstub.c +++ b/gdbstub.c @@ -3538,6 +3538,7 @@ int gdbserver_start(const char *device) mon_chr = gdbserver_state.mon_chr; reset_gdbserver_state(); } + mon_chr->reopen_on_cpr = true; create_processes(&gdbserver_state); diff --git a/hmp-commands.hx b/hmp-commands.hx index 5bedee2d49547acfcda639f7cb6fe8636d643e76..bd9e21af368c3262ea16ddca269e08980c348395 100644 --- a/hmp-commands.hx +++ b/hmp-commands.hx @@ -995,7 +995,7 @@ ERST { .name = "migrate_set_parameter", - .args_type = "parameter:s,value:s", + .args_type = "parameter:s,value:S", .params = "parameter value", .help = "Set the parameter for migration", .cmd = hmp_migrate_set_parameter, diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c index 15b3f4d3853d8a81da003a5ad04401322cd34970..0b5162b58c198ac29c3b80c394afe937064e8bf8 100644 --- a/hw/9pfs/9p.c +++ b/hw/9pfs/9p.c @@ -393,11 +393,7 @@ static int coroutine_fn put_fid(V9fsPDU *pdu, V9fsFidState *fidp) * delete the migration blocker. Ideally, this * should be hooked to transport close notification */ - if (pdu->s->migration_blocker) { - migrate_del_blocker(pdu->s->migration_blocker); - error_free(pdu->s->migration_blocker); - pdu->s->migration_blocker = NULL; - } + migrate_del_blocker(&pdu->s->migration_blocker); } return free_fid(pdu, fidp); } @@ -1460,10 +1456,9 @@ static void coroutine_fn v9fs_attach(void *opaque) error_setg(&s->migration_blocker, "Migration is disabled when VirtFS export path '%s' is mounted in the guest using mount_tag '%s'", s->ctx.fs_root ? 
s->ctx.fs_root : "NULL", s->tag); - err = migrate_add_blocker(s->migration_blocker, NULL); + err = migrate_add_blockers(&s->migration_blocker, NULL, MIG_MODE_NORMAL, + -1); if (err < 0) { - error_free(s->migration_blocker); - s->migration_blocker = NULL; clunk_fid(s, fid); goto out; } diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-system.c index b93ed9b4dd4a7dffa33f6d0d7752c602085c7719..1237c1300740bf68ff06300cf6951cdb06ffcb1e 100644 --- a/hw/core/qdev-properties-system.c +++ b/hw/core/qdev-properties-system.c @@ -567,6 +567,18 @@ const PropertyInfo qdev_prop_losttickpolicy = { .set_default_value = qdev_propinfo_set_default_value_enum, }; +/* --- MigMode --- */ + +const PropertyInfo qdev_prop_mig_mode = { + .name = "MigMode", + .description = "mig_mode values, " + "normal/exec", + .enum_table = &MigMode_lookup, + .get = qdev_propinfo_get_enum, + .set = qdev_propinfo_set_enum, + .set_default_value = qdev_propinfo_set_default_value_enum, +}; + /* --- blocksize --- */ static void set_blocksize(Object *obj, Visitor *v, const char *name, diff --git a/hw/core/qdev-properties.c b/hw/core/qdev-properties.c index 2d5f662663b6342ce55e31889e86979696fa4f5e..f3c984d569014e4b69082fd6e966b91535692469 100644 --- a/hw/core/qdev-properties.c +++ b/hw/core/qdev-properties.c @@ -9,6 +9,7 @@ #include "qemu/units.h" #include "qemu/cutils.h" #include "qdev-prop-internal.h" +#include "qapi/qapi-builtin-visit.h" void qdev_prop_set_after_realize(DeviceState *dev, const char *name, Error **errp) @@ -471,6 +472,49 @@ const PropertyInfo qdev_prop_string = { .set = set_string, }; +/* --- strList --- */ + +static void release_strList(Object *obj, const char *name, void *opaque) +{ + Property *prop = opaque; + g_free(*(char **)object_field_prop_ptr(obj, prop)); +} + +static void get_strList(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + Property *prop = opaque; + strList **ptr = object_field_prop_ptr(obj, prop); + + if (!*ptr) { + strList *str = NULL; + visit_type_strList(v, name, &str, errp); + } else { + visit_type_strList(v, name, ptr, errp); + } +} + +static void set_strList(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + Property *prop = opaque; + strList **ptr = object_field_prop_ptr(obj, prop); + strList *str; + + if (!visit_type_strList(v, name, &str, errp)) { + return; + } + g_free(*ptr); + *ptr = str; +} + +const PropertyInfo qdev_prop_strlist = { + .name = "strList", + .release = release_strList, + .get = get_strList, + .set = set_strList, +}; + /* --- on/off/auto --- */ const PropertyInfo qdev_prop_on_off_auto = { diff --git a/hw/display/virtio-gpu-base.c b/hw/display/virtio-gpu-base.c index c8da4806e0bb7a94669bfa2c20ae9a04be5e1b27..9616e85ff245dbfa1600f1299b539dfe49a1e51f 100644 --- a/hw/display/virtio-gpu-base.c +++ b/hw/display/virtio-gpu-base.c @@ -163,8 +163,7 @@ virtio_gpu_base_device_realize(DeviceState *qdev, if (virtio_gpu_virgl_enabled(g->conf)) { error_setg(&g->migration_blocker, "virgl is not yet migratable"); - if (migrate_add_blocker(g->migration_blocker, errp) < 0) { - error_free(g->migration_blocker); + if (migrate_add_blocker(&g->migration_blocker, errp) < 0) { return false; } } @@ -228,10 +227,7 @@ virtio_gpu_base_device_unrealize(DeviceState *qdev) { VirtIOGPUBase *g = VIRTIO_GPU_BASE(qdev); - if (g->migration_blocker) { - migrate_del_blocker(g->migration_blocker); - error_free(g->migration_blocker); - } + migrate_del_blocker(&g->migration_blocker); } static void diff --git a/hw/intc/arm_gic_kvm.c 
b/hw/intc/arm_gic_kvm.c index 7d2a13273a47971c159a3797cfa4597391b28fd2..c9fac45748137b3d745589cce118000c2f1d4426 100644 --- a/hw/intc/arm_gic_kvm.c +++ b/hw/intc/arm_gic_kvm.c @@ -514,8 +514,7 @@ static void kvm_arm_gic_realize(DeviceState *dev, Error **errp) if (!kvm_arm_gic_can_save_restore(s)) { error_setg(&s->migration_blocker, "This operating system kernel does " "not support vGICv2 migration"); - if (migrate_add_blocker(s->migration_blocker, errp) < 0) { - error_free(s->migration_blocker); + if (migrate_add_blocker(&s->migration_blocker, errp) < 0) { return; } } diff --git a/hw/intc/arm_gicv3_its_kvm.c b/hw/intc/arm_gicv3_its_kvm.c index 0b4cbed28b3b4128ca9c1cb530ed4128118ed1ff..525f9d4e91a62a4ebbf26925c2f2a4dec0867f96 100644 --- a/hw/intc/arm_gicv3_its_kvm.c +++ b/hw/intc/arm_gicv3_its_kvm.c @@ -112,8 +112,7 @@ static void kvm_arm_its_realize(DeviceState *dev, Error **errp) GITS_CTLR)) { error_setg(&s->migration_blocker, "This operating system kernel " "does not support vITS migration"); - if (migrate_add_blocker(s->migration_blocker, errp) < 0) { - error_free(s->migration_blocker); + if (migrate_add_blocker(&s->migration_blocker, errp) < 0) { return; } } else { diff --git a/hw/intc/arm_gicv3_kvm.c b/hw/intc/arm_gicv3_kvm.c index 2e2b08e31f7ac3dccd95f0009c3472880e901a9f..5f72afbcdf50f45d31c1bc6240273f33eaca15f7 100644 --- a/hw/intc/arm_gicv3_kvm.c +++ b/hw/intc/arm_gicv3_kvm.c @@ -899,8 +899,7 @@ static void kvm_arm_gicv3_realize(DeviceState *dev, Error **errp) GICD_CTLR)) { error_setg(&s->migration_blocker, "This operating system kernel does " "not support vGICv3 migration"); - if (migrate_add_blocker(s->migration_blocker, errp) < 0) { - error_free(s->migration_blocker); + if (migrate_add_blocker(&s->migration_blocker, errp) < 0) { return; } } diff --git a/hw/misc/ivshmem.c b/hw/misc/ivshmem.c index 05f06ed6cf4465f09921aa9f7c62111b42c6fac6..9017bdbcce6a060dc49292b8be7ac3686c9e3253 100644 --- a/hw/misc/ivshmem.c +++ b/hw/misc/ivshmem.c @@ -905,8 +905,7 @@ static void ivshmem_common_realize(PCIDevice *dev, Error **errp) if (!ivshmem_is_master(s)) { error_setg(&s->migration_blocker, "Migration is disabled when using feature 'peer mode' in device 'ivshmem'"); - if (migrate_add_blocker(s->migration_blocker, errp) < 0) { - error_free(s->migration_blocker); + if (migrate_add_blocker(&s->migration_blocker, errp) < 0) { return; } } @@ -924,10 +923,7 @@ static void ivshmem_exit(PCIDevice *dev) IVShmemState *s = IVSHMEM_COMMON(dev); int i; - if (s->migration_blocker) { - migrate_del_blocker(s->migration_blocker); - error_free(s->migration_blocker); - } + migrate_del_blocker(&s->migration_blocker); if (memory_region_is_mapped(s->ivshmem_bar2)) { if (!s->hostmem) { diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index 3bd786cc22b2d279aee5ee1258e3d42a700513f7..42f7a75757c48eb6b4738e0edb36b1194ee9f175 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -3317,6 +3317,10 @@ static void virtio_net_handle_migration_primary(VirtIONet *n, MigrationState *s) static void virtio_net_migration_state_notifier(Notifier *notifier, void *data) { MigrationState *s = data; + + if (migrate_mode_of(s) != MIG_MODE_NORMAL) { + return; + } VirtIONet *n = container_of(notifier, VirtIONet, migration_state); virtio_net_handle_migration_primary(n, s); } @@ -3409,8 +3413,8 @@ static void virtio_net_device_realize(DeviceState *dev, Error **errp) n->primary_listener.hide_device = failover_hide_primary_device; qatomic_set(&n->failover_primary_hidden, true); device_listener_register(&n->primary_listener); - 
n->migration_state.notify = virtio_net_migration_state_notifier; - add_migration_state_change_notifier(&n->migration_state); + migration_add_notifier(&n->migration_state, + virtio_net_migration_state_notifier); n->host_features |= (1ULL << VIRTIO_NET_F_STANDBY); } @@ -3575,7 +3579,7 @@ static void virtio_net_device_unrealize(DeviceState *dev) if (n->failover) { qobject_unref(n->primary_opts); device_listener_unregister(&n->primary_listener); - remove_migration_state_change_notifier(&n->migration_state); + migration_remove_notifier(&n->migration_state); } else { assert(n->primary_opts == NULL); } diff --git a/hw/pci/msix.c b/hw/pci/msix.c index ae9331cd0b4597eab472b0708acced77c1c24080..e492ce0e0f36ae6a5f7dc2c5ae986e4fcd47b0a0 100644 --- a/hw/pci/msix.c +++ b/hw/pci/msix.c @@ -64,7 +64,7 @@ static uint8_t *msix_pending_byte(PCIDevice *dev, int vector) return dev->msix_pba + vector / 8; } -static int msix_is_pending(PCIDevice *dev, int vector) +int msix_is_pending(PCIDevice *dev, unsigned int vector) { return *msix_pending_byte(dev, vector) & msix_pending_mask(vector); } diff --git a/hw/pci/pci.c b/hw/pci/pci.c index 3e6805d54ac414f9e474b6bf32f3379be2b7de67..4c110a9cfd5c2775886be2bb7f88a296e70da2ae 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -33,6 +33,7 @@ #include "hw/pci/pci_host.h" #include "hw/qdev-properties.h" #include "hw/qdev-properties-system.h" +#include "migration/misc.h" #include "migration/qemu-file-types.h" #include "migration/vmstate.h" #include "monitor/monitor.h" @@ -315,6 +316,17 @@ static void pci_do_device_reset(PCIDevice *dev) { int r; + /* + * A PCI device that is resuming for cpr is already configured, so do + * not reset it here when we are called from qemu_system_reset prior to + * cpr load, else interrupts may be lost for vfio-pci devices. It is + * safe to skip this reset for all PCI devices, because cpr load will set + * all fields that would have been set here. + */ + if (migrate_mode() == MIG_MODE_CPR_EXEC) { + return; + } + pci_device_deassert_intx(dev); assert(dev->irq_state == 0); diff --git a/hw/ppc/pef.c b/hw/ppc/pef.c index cc44d5e33968ec0739574ddc3d1de01a3847895d..d28ed3ba7333af8d4628112338ed0544f389903c 100644 --- a/hw/ppc/pef.c +++ b/hw/ppc/pef.c @@ -63,7 +63,7 @@ static int kvmppc_svm_init(ConfidentialGuestSupport *cgs, Error **errp) /* add migration blocker */ error_setg(&pef_mig_blocker, "PEF: Migration is not implemented"); /* NB: This can fail if --only-migratable is used */ - migrate_add_blocker(pef_mig_blocker, &error_fatal); + migrate_add_blocker(&pef_mig_blocker, &error_fatal); cgs->ready = true; diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 3b5fd749be8961c682bee48b2239bcb07be3fc6d..de9cd9f86db891878f4108311293ae5af60db1d8 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -1681,7 +1681,7 @@ static void spapr_machine_reset(MachineState *machine) /* Signal all vCPUs waiting on this condition */ qemu_cond_broadcast(&spapr->fwnmi_machine_check_interlock_cond); - migrate_del_blocker(spapr->fwnmi_migration_blocker); + migrate_del_blocker(&spapr->fwnmi_migration_blocker); } static void spapr_create_nvram(SpaprMachineState *spapr) diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c index 630e86282c9b296887d1201bf3ad3264e4b32b4d..5865c529138ec30423c747de94c3d8d0c4cf1b30 100644 --- a/hw/ppc/spapr_events.c +++ b/hw/ppc/spapr_events.c @@ -920,7 +920,7 @@ void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered) * fails when running with -only-migrate. A proper interface to * delay migration completion for a bit could avoid that. 
*/ - ret = migrate_add_blocker(spapr->fwnmi_migration_blocker, NULL); + ret = migrate_add_blocker(&spapr->fwnmi_migration_blocker, NULL); if (ret == -EBUSY) { warn_report("Received a fwnmi while migration was in progress"); } diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c index b476382ae6f59228df37f2f8c13c3eec7b52490e..83a1774c719370c3ee2596ab33157e53dc6796b3 100644 --- a/hw/ppc/spapr_rtas.c +++ b/hw/ppc/spapr_rtas.c @@ -496,7 +496,7 @@ static void rtas_ibm_nmi_interlock(PowerPCCPU *cpu, spapr->fwnmi_machine_check_interlock = -1; qemu_cond_signal(&spapr->fwnmi_machine_check_interlock_cond); rtas_st(rets, 0, RTAS_OUT_SUCCESS); - migrate_del_blocker(spapr->fwnmi_migration_blocker); + migrate_del_blocker(&spapr->fwnmi_migration_blocker); } static struct rtas_call { diff --git a/hw/remote/proxy.c b/hw/remote/proxy.c index bad164299dd453a3b1e852285e8f0eb0490890a8..2cd1f4347cbdafebd5fbea33d7ee938a880355fc 100644 --- a/hw/remote/proxy.c +++ b/hw/remote/proxy.c @@ -109,8 +109,7 @@ static void pci_proxy_dev_realize(PCIDevice *device, Error **errp) error_setg(&dev->migration_blocker, "%s does not support migration", TYPE_PCI_PROXY_DEV); - if (migrate_add_blocker(dev->migration_blocker, errp) < 0) { - error_free(dev->migration_blocker); + if (migrate_add_blocker(&dev->migration_blocker, errp) < 0) { object_unref(dev->ioc); return; } @@ -136,9 +135,7 @@ static void pci_proxy_dev_exit(PCIDevice *pdev) qio_channel_close(dev->ioc, NULL); } - migrate_del_blocker(dev->migration_blocker); - - error_free(dev->migration_blocker); + migrate_del_blocker(&dev->migration_blocker); proxy_memory_listener_deconfigure(&dev->proxy_listener); diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c index 653587ea62f4c1bf59a51c2a1459b6311829efaf..bd500589e52522b46e2d1f7b0b1d42223fd7a629 100644 --- a/hw/s390x/s390-virtio-ccw.c +++ b/hw/s390x/s390-virtio-ccw.c @@ -322,8 +322,7 @@ static void s390_machine_unprotect(S390CcwMachineState *ms) { s390_pv_vm_disable(); ms->pv = false; - migrate_del_blocker(pv_mig_blocker); - error_free_or_abort(&pv_mig_blocker); + migrate_del_blocker(&pv_mig_blocker); ram_block_discard_disable(false); } @@ -346,11 +345,10 @@ static int s390_machine_protect(S390CcwMachineState *ms) error_setg(&pv_mig_blocker, "protected VMs are currently not migrateable."); - rc = migrate_add_blocker(pv_mig_blocker, &local_err); + rc = migrate_add_blocker(&pv_mig_blocker, &local_err); if (rc) { ram_block_discard_disable(false); error_report_err(local_err); - error_free_or_abort(&pv_mig_blocker); return rc; } @@ -358,8 +356,7 @@ static int s390_machine_protect(S390CcwMachineState *ms) rc = s390_pv_vm_enable(); if (rc) { ram_block_discard_disable(false); - migrate_del_blocker(pv_mig_blocker); - error_free_or_abort(&pv_mig_blocker); + migrate_del_blocker(&pv_mig_blocker); return rc; } diff --git a/hw/scsi/vhost-scsi.c b/hw/scsi/vhost-scsi.c index 039caf2614eb16db1df0ffb9636d5625eaf6bbb2..d393202ec05fb52b679d3833029890d927c274ac 100644 --- a/hw/scsi/vhost-scsi.c +++ b/hw/scsi/vhost-scsi.c @@ -207,7 +207,8 @@ static void vhost_scsi_realize(DeviceState *dev, Error **errp) "When external environment supports it (Orchestrator migrates " "target SCSI device state or use shared storage over network), " "set 'migratable' property to true to enable migration."); - if (migrate_add_blocker(vsc->migration_blocker, errp) < 0) { + if (migrate_add_blockers(&vsc->migration_blocker, errp, MIG_MODE_NORMAL, + -1) < 0) { goto free_virtio; } } @@ -234,10 +235,9 @@ static void vhost_scsi_realize(DeviceState *dev, 
Error **errp) free_vqs: g_free(vsc->dev.vqs); if (!vsc->migratable) { - migrate_del_blocker(vsc->migration_blocker); + migrate_del_blocker(&vsc->migration_blocker); } free_virtio: - error_free(vsc->migration_blocker); virtio_scsi_common_unrealize(dev); close_fd: close(vhostfd); @@ -251,8 +251,7 @@ static void vhost_scsi_unrealize(DeviceState *dev) struct vhost_virtqueue *vqs = vsc->dev.vqs; if (!vsc->migratable) { - migrate_del_blocker(vsc->migration_blocker); - error_free(vsc->migration_blocker); + migrate_del_blocker(&vsc->migration_blocker); } /* This will stop vhost backend. */ diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 6cb91e7ffd264434e7b696ce35558e9d27efcf8c..3a5db59dc7adbc13b409c286498f037960230d2b 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -31,6 +31,7 @@ #include "exec/memory.h" #include "exec/ram_addr.h" #include "hw/hw.h" +#include "migration/cpr-state.h" #include "qemu/error-report.h" #include "qemu/main-loop.h" #include "qemu/range.h" @@ -483,6 +484,8 @@ static int vfio_dma_unmap(VFIOContainer *container, }; VFIODMARange *qrange; + assert(!container->reused); + if (iotlb && container->dirty_pages_supported && vfio_devices_all_running_and_saving(container)) { return vfio_dma_unmap_bitmap(container, iova, size, iotlb); @@ -535,7 +538,6 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova, { struct vfio_iommu_type1_dma_map map = { .argsz = sizeof(map), - .flags = VFIO_DMA_MAP_FLAG_READ, .vaddr = (__u64)(uintptr_t)vaddr, .iova = iova, .size = size, @@ -549,6 +551,19 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova, /* XXX allocate the dirty bitmap on demand */ vfio_dma_range_init_dirty_bitmap(qrange); + /* + * Set the new vaddr for any mappings registered during cpr load. + * Reused is cleared thereafter. + */ + if (container->reused) { + map.flags = VFIO_DMA_MAP_FLAG_VADDR; + if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map)) { + goto fail; + } + return 0; + } + + map.flags = VFIO_DMA_MAP_FLAG_READ; if (!readonly) { map.flags |= VFIO_DMA_MAP_FLAG_WRITE; } @@ -564,7 +579,9 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova, return 0; } - error_report("VFIO_MAP_DMA failed: %s", strerror(errno)); +fail: + error_report("vfio_dma_map %s (iova %lu, size %ld, va %p): %s", + (container->reused ? 
"VADDR" : ""), iova, size, vaddr, strerror(errno)); return -errno; } @@ -909,10 +926,36 @@ static void vfio_unregister_ram_discard_listener(VFIOContainer *container, g_free(vrdl); } +static VFIORamDiscardListener *vfio_find_ram_discard_listener( + VFIOContainer *container, MemoryRegionSection *section) +{ + VFIORamDiscardListener *vrdl; + + QLIST_FOREACH(vrdl, &container->vrdl_list, next) { + if (vrdl->mr == section->mr && + vrdl->offset_within_address_space == + section->offset_within_address_space) { + break; + } + } + + if (!vrdl) { + hw_error("vfio: Trying to sync missing RAM discard listener"); + /* does not return */ + } + return vrdl; +} + static void vfio_listener_region_add(MemoryListener *listener, MemoryRegionSection *section) { VFIOContainer *container = container_of(listener, VFIOContainer, listener); + vfio_container_region_add(container, section, false); +} + +void vfio_container_region_add(VFIOContainer *container, + MemoryRegionSection *section, bool remap) +{ hwaddr iova, end; Int128 llend, llsize; void *vaddr; @@ -1033,6 +1076,30 @@ static void vfio_listener_region_add(MemoryListener *listener, int iommu_idx; trace_vfio_listener_region_add_iommu(iova, end); + + /* + * If remap, then VFIO_DMA_UNMAP_FLAG_VADDR has been called, and we + * want to remap the vaddr. vfio_container_region_add was already + * called in the past, so the giommu already exists. Find it and + * replay it, which calls vfio_dma_map further down the stack. + */ + + if (remap) { + hwaddr as_offset = section->offset_within_address_space; + hwaddr iommu_offset = as_offset - section->offset_within_region; + + QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) { + if (giommu->iommu == iommu_mr && + giommu->iommu_offset == iommu_offset) { + memory_region_iommu_replay(giommu->iommu, &giommu->n); + return; + } + } + error_report("Container cannot find iommu region %s offset %lx", + memory_region_name(section->mr), iommu_offset); + goto fail; + } + /* * FIXME: For VFIO iommu types which have KVM acceleration to * avoid bouncing all map/unmaps through qemu this way, this @@ -1083,7 +1150,21 @@ static void vfio_listener_region_add(MemoryListener *listener, * about changes. */ if (memory_region_has_ram_discard_manager(section->mr)) { - vfio_register_ram_discard_listener(container, section); + /* + * If remap, then VFIO_DMA_UNMAP_FLAG_VADDR has been called, and we + * want to remap the vaddr. vfio_container_region_add was already + * called in the past, so the ram discard listener already exists. + * Call its populate function directly, which calls vfio_dma_map. 
+ */ + if (remap) { + VFIORamDiscardListener *vrdl = + vfio_find_ram_discard_listener(container, section); + if (vrdl->listener.notify_populate(&vrdl->listener, section)) { + error_report("listener.notify_populate failed"); + } + } else { + vfio_register_ram_discard_listener(container, section); + } return; } @@ -1412,19 +1493,8 @@ static int vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainer *container, MemoryRegionSection *section) { RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); - VFIORamDiscardListener *vrdl = NULL; - - QLIST_FOREACH(vrdl, &container->vrdl_list, next) { - if (vrdl->mr == section->mr && - vrdl->offset_within_address_space == - section->offset_within_address_space) { - break; - } - } - - if (!vrdl) { - hw_error("vfio: Trying to sync missing RAM discard listener"); - } + VFIORamDiscardListener *vrdl = + vfio_find_ram_discard_listener(container, section); /* * We only want/can synchronize the bitmap for actually mapped parts - @@ -1646,6 +1716,12 @@ static void vfio_listener_release(VFIOContainer *container) } } +void vfio_listener_register(VFIOContainer *container) +{ + container->listener = vfio_memory_listener; + memory_listener_register(&container->listener, container->space->as); +} + static struct vfio_info_cap_header * vfio_get_cap(void *ptr, uint32_t cap_offset, uint16_t id) { @@ -2065,6 +2141,22 @@ static int vfio_init_container(VFIOContainer *container, int group_fd, { int iommu_type, dirty_log_manual_clear, ret; + /* + * If container is reused, just set its type and skip the ioctls, as the + * container and group are already configured in the kernel. + * VFIO_TYPE1v2_IOMMU is the only type that supports reuse/cpr. + */ + if (container->reused) { + if (ioctl(container->fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU)) { + container->iommu_type = VFIO_TYPE1v2_IOMMU; + return 0; + } else { + error_setg(errp, "container was reused but VFIO_TYPE1v2_IOMMU " + "is not supported"); + return -errno; + } + } + iommu_type = vfio_get_iommu_type(container, errp); if (iommu_type < 0) { return iommu_type; @@ -2176,9 +2268,12 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, { VFIOContainer *container; int ret, fd; + bool reused; VFIOAddressSpace *space; space = vfio_get_address_space(as); + fd = cpr_find_fd("vfio_container_for_group", group->groupid); + reused = (fd > 0); /* * VFIO is currently incompatible with discarding of RAM insofar as the @@ -2211,27 +2306,46 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, * details once we know which type of IOMMU we are using. */ + /* + * If the container is reused, then the group is already attached in the + * kernel. If a container with matching fd is found, then update the + * userland group list and return. If not, then after the loop, create + * the container struct and group list. 
+ */ QLIST_FOREACH(container, &space->containers, next) { - if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) { - ret = vfio_ram_block_discard_disable(container, true); - if (ret) { - error_setg_errno(errp, -ret, - "Cannot set discarding of RAM broken"); - if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, - &container->fd)) { - error_report("vfio: error disconnecting group %d from" - " container", group->groupid); - } - return ret; + if (reused) { + if (container->fd != fd) { + continue; } - group->container = container; - QLIST_INSERT_HEAD(&container->group_list, group, container_next); + } else if (ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) { + continue; + } + + ret = vfio_ram_block_discard_disable(container, true); + if (ret) { + error_setg_errno(errp, -ret, + "Cannot set discarding of RAM broken"); + if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, + &container->fd)) { + error_report("vfio: error disconnecting group %d from" + " container", group->groupid); + } + goto delete_fd_exit; + } + group->container = container; + QLIST_INSERT_HEAD(&container->group_list, group, container_next); + if (!reused) { vfio_kvm_device_add_group(group); - return 0; + cpr_save_fd("vfio_container_for_group", group->groupid, + container->fd); } + return 0; + } + + if (!reused) { + fd = qemu_open_old("/dev/vfio/vfio", O_RDWR); } - fd = qemu_open_old("/dev/vfio/vfio", O_RDWR); if (fd < 0) { error_setg_errno(errp, errno, "failed to open /dev/vfio/vfio"); ret = -errno; @@ -2249,6 +2363,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, container = g_malloc0(sizeof(*container)); container->space = space; container->fd = fd; + container->reused = reused; container->error = NULL; container->dirty_pages_supported = false; container->dma_max_mappings = 0; @@ -2262,10 +2377,15 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, goto free_container_exit; } + ret = vfio_cpr_register_container(container, errp); + if (ret) { + goto free_container_exit; + } + ret = vfio_ram_block_discard_disable(container, true); if (ret) { error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken"); - goto free_container_exit; + goto unregister_container_exit; } switch (container->iommu_type) { @@ -2376,9 +2496,16 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, group->container = container; QLIST_INSERT_HEAD(&container->group_list, group, container_next); - container->listener = vfio_memory_listener; - - memory_listener_register(&container->listener, container->space->as); + /* + * If reused, register the listener later, after all state that may + * affect regions and mapping boundaries has been cpr load'ed. Later, + * the listener will invoke its callback on each flat section and call + * vfio_dma_map to supply the new vaddr, and the calls will match the + * mappings remembered by the kernel. 
+ */ + if (!reused) { + vfio_listener_register(container); + } if (container->error) { ret = -1; @@ -2388,6 +2515,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, } container->initialized = true; + cpr_resave_fd("vfio_container_for_group", group->groupid, fd); return 0; listener_release_exit: @@ -2399,6 +2527,9 @@ listener_release_exit: enable_discards_exit: vfio_ram_block_discard_disable(container, false); +unregister_container_exit: + vfio_cpr_unregister_container(container); + free_container_exit: g_free(container); @@ -2408,6 +2539,9 @@ close_fd_exit: put_space_exit: vfio_put_address_space(space); +delete_fd_exit: + cpr_delete_fd("vfio_container_for_group", group->groupid); + return ret; } @@ -2417,6 +2551,7 @@ static void vfio_disconnect_container(VFIOGroup *group) QLIST_REMOVE(group, container_next); group->container = NULL; + cpr_delete_fd("vfio_container_for_group", group->groupid); /* * Explicitly release the listener first before unset container, @@ -2453,6 +2588,7 @@ static void vfio_disconnect_container(VFIOGroup *group) } trace_vfio_disconnect_container(container->fd); + vfio_cpr_unregister_container(container); close(container->fd); g_free(container); @@ -2482,7 +2618,11 @@ VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp) group = g_malloc0(sizeof(*group)); snprintf(path, sizeof(path), "/dev/vfio/%d", groupid); - group->fd = qemu_open_old(path, O_RDWR); + + group->fd = cpr_find_fd("vfio_group", groupid); + if (group->fd < 0) { + group->fd = qemu_open_old(path, O_RDWR); + } if (group->fd < 0) { error_setg_errno(errp, errno, "failed to open %s", path); goto free_group_exit; @@ -2515,6 +2655,7 @@ VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp) } QLIST_INSERT_HEAD(&vfio_group_list, group, next); + cpr_resave_fd("vfio_group", groupid, group->fd); return group; @@ -2540,6 +2681,7 @@ void vfio_put_group(VFIOGroup *group) vfio_disconnect_container(group); QLIST_REMOVE(group, next); trace_vfio_put_group(group->fd); + cpr_delete_fd("vfio_group", group->groupid); close(group->fd); g_free(group); @@ -2553,8 +2695,14 @@ int vfio_get_device(VFIOGroup *group, const char *name, { struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) }; int ret, fd; + bool reused; + + fd = cpr_find_fd(name, 0); + reused = (fd >= 0); + if (!reused) { + fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name); + } - fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name); if (fd < 0) { error_setg_errno(errp, errno, "error getting device from group %d", group->groupid); @@ -2599,11 +2747,13 @@ int vfio_get_device(VFIOGroup *group, const char *name, vbasedev->num_irqs = dev_info.num_irqs; vbasedev->num_regions = dev_info.num_regions; vbasedev->flags = dev_info.flags; + vbasedev->reused = reused; trace_vfio_get_device(name, dev_info.flags, dev_info.num_regions, dev_info.num_irqs); vbasedev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET); + cpr_resave_fd(name, 0, fd); return 0; } @@ -2615,6 +2765,7 @@ void vfio_put_base_device(VFIODevice *vbasedev) QLIST_REMOVE(vbasedev, next); vbasedev->group = NULL; trace_vfio_put_base_device(vbasedev->fd); + cpr_delete_fd(vbasedev->name, 0); close(vbasedev->fd); } diff --git a/hw/vfio/cpr.c b/hw/vfio/cpr.c new file mode 100644 index 0000000000000000000000000000000000000000..1f682cbba9e661d2151e2ced75a74b216b365111 --- /dev/null +++ b/hw/vfio/cpr.c @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2021, 2022 Oracle and/or its affiliates. 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include <sys/ioctl.h>
+#include <linux/vfio.h>
+#include "hw/vfio/vfio-common.h"
+#include "sysemu/kvm.h"
+#include "qapi/error.h"
+#include "migration/blocker.h"
+#include "migration/migration.h"
+#include "migration/misc.h"
+#include "migration/vmstate.h"
+#include "trace.h"
+
+static int
+vfio_dma_unmap_vaddr_all(VFIOContainer *container, Error **errp)
+{
+    struct vfio_iommu_type1_dma_unmap unmap = {
+        .argsz = sizeof(unmap),
+        .flags = VFIO_DMA_UNMAP_FLAG_VADDR | VFIO_DMA_UNMAP_FLAG_ALL,
+        .iova = 0,
+        .size = 0,
+    };
+    if (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
+        error_setg_errno(errp, errno, "vfio_dma_unmap_vaddr_all");
+        return -errno;
+    }
+    container->vaddr_unmapped = true;
+    return 0;
+}
+
+static int
+vfio_region_remap(MemoryRegionSection *section, void *handle, Error **errp)
+{
+    VFIOContainer *container = handle;
+    vfio_container_region_add(container, section, true);
+    container->vaddr_unmapped = false;
+    return 0;
+}
+
+static bool vfio_is_cpr_capable(VFIOContainer *container, Error **errp)
+{
+    if (!ioctl(container->fd, VFIO_CHECK_EXTENSION, VFIO_UPDATE_VADDR) ||
+        !ioctl(container->fd, VFIO_CHECK_EXTENSION, VFIO_UNMAP_ALL)) {
+        error_setg(errp, "VFIO container does not support VFIO_UPDATE_VADDR "
+                   "or VFIO_UNMAP_ALL");
+        return false;
+    } else {
+        return true;
+    }
+}
+
+static bool vfio_vmstate_needed(void *opaque)
+{
+    return migrate_mode() == MIG_MODE_CPR_EXEC;
+}
+
+static int vfio_container_pre_save(void *opaque)
+{
+    VFIOContainer *container = (VFIOContainer *)opaque;
+    Error *err = NULL;
+
+    if (!vfio_is_cpr_capable(container, &err) ||
+        vfio_dma_unmap_vaddr_all(container, &err)) {
+        error_report_err(err);
+        return -1;
+    }
+    return 0;
+}
+
+static int vfio_container_post_load(void *opaque, int version_id)
+{
+    VFIOContainer *container = (VFIOContainer *)opaque;
+    VFIOGroup *group;
+    Error *err = NULL;
+    VFIODevice *vbasedev;
+
+    if (!vfio_is_cpr_capable(container, &err)) {
+        error_report_err(err);
+        return -1;
+    }
+
+    vfio_listener_register(container);
+    container->reused = false;
+
+    QLIST_FOREACH(group, &container->group_list, container_next) {
+        QLIST_FOREACH(vbasedev, &group->device_list, next) {
+            vbasedev->reused = false;
+        }
+    }
+    return 0;
+}
+
+static const VMStateDescription vfio_container_vmstate = {
+    .name = "vfio-container",
+    .version_id = 0,
+    .minimum_version_id = 0,
+    .pre_save = vfio_container_pre_save,
+    .post_load = vfio_container_post_load,
+    .needed = vfio_vmstate_needed,
+    .fields = (VMStateField[]) {
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+static void vfio_cpr_fail_notifier(Notifier *notifier, void *data)
+{
+    MigrationState *s = data;
+    VFIOContainer *container;
+    Error *err = NULL;
+
+    if (!migration_has_failed(s) || migrate_mode_of(s) != MIG_MODE_CPR_EXEC) {
+        return;
+    }
+
+    container = container_of(notifier, VFIOContainer, cpr_notifier);
+    if (container->vaddr_unmapped) {
+
+        /* Set reused so vfio_dma_map restores vaddr */
+        container->reused = true;
+        if (address_space_flat_for_each_section(container->space->as,
+                                                vfio_region_remap,
+                                                container, &err)) {
+            error_report_err(err);
+        }
+        container->reused = false;
+    }
+}
+
+int vfio_cpr_register_container(VFIOContainer *container, Error **errp)
+{
+    container->cpr_blocker = NULL;
+    if (!vfio_is_cpr_capable(container, &container->cpr_blocker)) {
+        return migrate_add_blockers(&container->cpr_blocker, errp,
+                                    MIG_MODE_CPR_EXEC, -1);
+    }
+
+
vmstate_register(NULL, -1, &vfio_container_vmstate, container); + + migration_add_notifier(&container->cpr_notifier, vfio_cpr_fail_notifier); + return 0; +} + +void vfio_cpr_unregister_container(VFIOContainer *container) +{ + migrate_del_blocker(&container->cpr_blocker); + + vmstate_unregister(NULL, &vfio_container_vmstate, container); + + migration_remove_notifier(&container->cpr_notifier); +} diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build index da9af297a0c5914e39be0a6f515caddd37542471..e247b2bc73b5f092f14285f317cdeca2ee7bd70e 100644 --- a/hw/vfio/meson.build +++ b/hw/vfio/meson.build @@ -5,6 +5,7 @@ vfio_ss.add(files( 'migration.c', )) vfio_ss.add(when: 'CONFIG_VFIO_PCI', if_true: files( + 'cpr.c', 'display.c', 'pci-quirks.c', 'pci.c', diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index e69b5f2e42681b6086e915e03d2e6019db91b77c..a85dc42f41852d89f23a926fc8c326aa4611118b 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -759,6 +759,9 @@ static void vfio_migration_state_notifier(Notifier *notifier, void *data) VFIODevice *vbasedev = migration->vbasedev; int ret; + if (migrate_mode_of(s) != MIG_MODE_NORMAL) { + return; + } trace_vfio_migration_state_notifier(vbasedev->name, MigrationStatus_str(s->state)); @@ -840,8 +843,8 @@ static int vfio_migration_init(VFIODevice *vbasedev, migration->vm_state = qdev_add_vm_change_state_handler(vbasedev->dev, vfio_vmstate_change, vbasedev); - migration->migration_state.notify = vfio_migration_state_notifier; - add_migration_state_change_notifier(&migration->migration_state); + migration_add_notifier(&migration->migration_state, + vfio_migration_state_notifier); return 0; err: @@ -886,12 +889,8 @@ add_blocker: "VFIO device doesn't support migration"); g_free(info); - ret = migrate_add_blocker(vbasedev->migration_blocker, errp); - if (ret < 0) { - error_free(vbasedev->migration_blocker); - vbasedev->migration_blocker = NULL; - } - return ret; + return migrate_add_blockers(&vbasedev->migration_blocker, errp, + MIG_MODE_NORMAL, -1); } void vfio_migration_finalize(VFIODevice *vbasedev) @@ -899,15 +898,11 @@ void vfio_migration_finalize(VFIODevice *vbasedev) if (vbasedev->migration) { VFIOMigration *migration = vbasedev->migration; - remove_migration_state_change_notifier(&migration->migration_state); + migration_remove_notifier(&migration->migration_state); qemu_del_vm_change_state_handler(migration->vm_state); unregister_savevm(VMSTATE_IF(vbasedev->dev), "vfio", vbasedev); vfio_migration_exit(vbasedev); } - if (vbasedev->migration_blocker) { - migrate_del_blocker(vbasedev->migration_blocker); - error_free(vbasedev->migration_blocker); - vbasedev->migration_blocker = NULL; - } + migrate_del_blocker(&vbasedev->migration_blocker); } diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index b085389ff85f923faae7726304c51d9c90bdb44d..1e7222200fbd7b6b03d0bc33a97a46fbd25ed9c8 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -28,6 +28,8 @@ #include "hw/pci/pci_bridge.h" #include "hw/qdev-properties.h" #include "hw/qdev-properties-system.h" +#include "migration/misc.h" +#include "migration/cpr-state.h" #include "migration/vmstate.h" #include "qapi/qmp/qdict.h" #include "qemu/error-report.h" @@ -48,6 +50,58 @@ static void vfio_disable_interrupts(VFIOPCIDevice *vdev); static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled); +#define EVENT_FD_NAME(vdev, name) \ + g_strdup_printf("%s_%s", (vdev)->vbasedev.name, (name)) + +static void save_event_fd(VFIOPCIDevice *vdev, const char *name, int nr, + EventNotifier *ev) +{ + int fd = 
event_notifier_get_fd(ev); + + if (fd >= 0) { + g_autofree char *fdname = EVENT_FD_NAME(vdev, name); + + cpr_resave_fd(fdname, nr, fd); + } +} + +static int load_event_fd(VFIOPCIDevice *vdev, const char *name, int nr) +{ + g_autofree char *fdname = EVENT_FD_NAME(vdev, name); + return cpr_find_fd(fdname, nr); +} + +static void delete_event_fd(VFIOPCIDevice *vdev, const char *name, int nr) +{ + g_autofree char *fdname = EVENT_FD_NAME(vdev, name); + cpr_delete_fd(fdname, nr); +} + +/* Create new or reuse existing eventfd */ +static int vfio_notifier_init(VFIOPCIDevice *vdev, EventNotifier *e, + const char *name, int nr) +{ + int ret = 0; + int fd = load_event_fd(vdev, name, nr); + + if (fd >= 0) { + event_notifier_init_fd(e, fd); + } else { + ret = event_notifier_init(e, 0); + if (!ret) { + save_event_fd(vdev, name, nr, e); + } + } + return ret; +} + +static void vfio_notifier_cleanup(VFIOPCIDevice *vdev, EventNotifier *e, + const char *name, int nr) +{ + delete_event_fd(vdev, name, nr); + event_notifier_cleanup(e); +} + /* * Disabling BAR mmaping can be slow, but toggling it around INTx can * also be a huge overhead. We try to get the best of both worlds by @@ -121,15 +175,17 @@ static void vfio_intx_enable_kvm(VFIOPCIDevice *vdev, Error **errp) return; } - /* Get to a known interrupt state */ - qemu_set_fd_handler(irq_fd, NULL, NULL, vdev); - vfio_mask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); - vdev->intx.pending = false; - pci_irq_deassert(&vdev->pdev); + if (!vdev->vbasedev.reused) { + /* Get to a known interrupt state */ + qemu_set_fd_handler(irq_fd, NULL, NULL, vdev); + vfio_mask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); + vdev->intx.pending = false; + pci_irq_deassert(&vdev->pdev); + } /* Get an eventfd for resample/unmask */ - if (event_notifier_init(&vdev->intx.unmask, 0)) { - error_setg(errp, "event_notifier_init failed eoi"); + if (vfio_notifier_init(vdev, &vdev->intx.unmask, "intx-unmask", 0)) { + error_setg(errp, "vfio_notifier_init intx-unmask failed"); goto fail; } @@ -141,15 +197,17 @@ static void vfio_intx_enable_kvm(VFIOPCIDevice *vdev, Error **errp) goto fail_irqfd; } - if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0, - VFIO_IRQ_SET_ACTION_UNMASK, - event_notifier_get_fd(&vdev->intx.unmask), - errp)) { - goto fail_vfio; - } + if (!vdev->vbasedev.reused) { + if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0, + VFIO_IRQ_SET_ACTION_UNMASK, + event_notifier_get_fd(&vdev->intx.unmask), + errp)) { + goto fail_vfio; + } - /* Let'em rip */ - vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); + /* Let'em rip */ + vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); + } vdev->intx.kvm_accel = true; @@ -161,7 +219,7 @@ fail_vfio: kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vdev->intx.interrupt, vdev->intx.route.irq); fail_irqfd: - event_notifier_cleanup(&vdev->intx.unmask); + vfio_notifier_cleanup(vdev, &vdev->intx.unmask, "intx-unmask", 0); fail: qemu_set_fd_handler(irq_fd, vfio_intx_interrupt, NULL, vdev); vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); @@ -190,7 +248,7 @@ static void vfio_intx_disable_kvm(VFIOPCIDevice *vdev) } /* We only need to close the eventfd for VFIO to cleanup the kernel side */ - event_notifier_cleanup(&vdev->intx.unmask); + vfio_notifier_cleanup(vdev, &vdev->intx.unmask, "intx-unmask", 0); /* QEMU starts listening for interrupt events. 
*/ qemu_set_fd_handler(event_notifier_get_fd(&vdev->intx.interrupt), @@ -265,7 +323,13 @@ static int vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp) return 0; } - vfio_disable_interrupts(vdev); + /* + * Do not alter interrupt state during vfio_realize and cpr load. The + * reused flag is cleared thereafter. + */ + if (!vdev->vbasedev.reused) { + vfio_disable_interrupts(vdev); + } vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */ pci_config_set_interrupt_pin(vdev->pdev.config, pin); @@ -281,18 +345,20 @@ static int vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp) } #endif - ret = event_notifier_init(&vdev->intx.interrupt, 0); + ret = vfio_notifier_init(vdev, &vdev->intx.interrupt, "intx-interrupt", 0); if (ret) { - error_setg_errno(errp, -ret, "event_notifier_init failed"); + error_setg_errno(errp, -ret, + "vfio_notifier_init intx-interrupt failed"); return ret; } fd = event_notifier_get_fd(&vdev->intx.interrupt); qemu_set_fd_handler(fd, vfio_intx_interrupt, NULL, vdev); - if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0, + if (!vdev->vbasedev.reused && + vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0, VFIO_IRQ_SET_ACTION_TRIGGER, fd, errp)) { qemu_set_fd_handler(fd, NULL, NULL, vdev); - event_notifier_cleanup(&vdev->intx.interrupt); + vfio_notifier_cleanup(vdev, &vdev->intx.interrupt, "intx-interrupt", 0); return -errno; } @@ -320,7 +386,7 @@ static void vfio_intx_disable(VFIOPCIDevice *vdev) fd = event_notifier_get_fd(&vdev->intx.interrupt); qemu_set_fd_handler(fd, NULL, NULL, vdev); - event_notifier_cleanup(&vdev->intx.interrupt); + vfio_notifier_cleanup(vdev, &vdev->intx.interrupt, "intx-interrupt", 0); vdev->interrupt = VFIO_INT_NONE; @@ -365,6 +431,53 @@ static void vfio_msi_interrupt(void *opaque) notify(&vdev->pdev, nr); } +static void vfio_device_interrupt_compensate(void *opaque) +{ + VFIOPCIDevice *vdev = opaque; + VFIOMSIVector *vector; + MSIMessage msg; + int nr_vectors, nr; + + if (vdev->interrupt == VFIO_INT_MSIX) { + nr_vectors = vdev->nr_vectors; + for (nr = 0; nr < nr_vectors; nr++) { + vector = &vdev->msi_vectors[nr]; + event_notifier_test_and_clear(&vector->interrupt); + if (msix_is_masked(&vdev->pdev, nr)) { + set_bit(nr, vdev->msix->pending); + memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, true); + trace_vfio_msix_pba_enable(vdev->vbasedev.name); + } + msg = msix_get_message(&vdev->pdev, nr); + trace_vfio_msi_interrupt_compensate(vdev->vbasedev.name, + nr, msg.address, msg.data); + msix_notify(&vdev->pdev, nr); + } + } else if (vdev->interrupt == VFIO_INT_MSI) { + nr_vectors = vdev->nr_vectors; + for (nr = 0; nr < nr_vectors; nr++) { + vector = &vdev->msi_vectors[nr]; + event_notifier_test_and_clear(&vector->interrupt); + msg = msi_get_message(&vdev->pdev, nr); + trace_vfio_msi_interrupt_compensate(vdev->vbasedev.name, + nr, msg.address, msg.data); + msi_notify(&vdev->pdev, nr); + } + } else { + /* legacy int */ + event_notifier_test_and_clear(&vdev->intx.interrupt); + vdev->intx.pending = true; + pci_irq_assert(&vdev->pdev); + vfio_mmap_set_enabled(vdev, false); + if (vdev->intx.mmap_timeout) { + timer_mod(vdev->intx.mmap_timer, + qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout); + } + } + + return; +} + static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix) { struct vfio_irq_set *irq_set; @@ -410,41 +523,43 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix) } static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector, - int vector_n, bool msix) + int nr, 
bool msix) { int virq; + const char *name = "kvm_interrupt"; if ((msix && vdev->no_kvm_msix) || (!msix && vdev->no_kvm_msi)) { return; } - if (event_notifier_init(&vector->kvm_interrupt, 0)) { + if (vfio_notifier_init(vdev, &vector->kvm_interrupt, name, nr)) { return; } - virq = kvm_irqchip_add_msi_route(kvm_state, vector_n, &vdev->pdev); + virq = kvm_irqchip_add_msi_route(kvm_state, nr, &vdev->pdev); if (virq < 0) { - event_notifier_cleanup(&vector->kvm_interrupt); + vfio_notifier_cleanup(vdev, &vector->kvm_interrupt, name, nr); return; } if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt, NULL, virq) < 0) { kvm_irqchip_release_virq(kvm_state, virq); - event_notifier_cleanup(&vector->kvm_interrupt); + vfio_notifier_cleanup(vdev, &vector->kvm_interrupt, name, nr); return; } vector->virq = virq; } -static void vfio_remove_kvm_msi_virq(VFIOMSIVector *vector) +static void vfio_remove_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector, + int nr) { kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt, vector->virq); kvm_irqchip_release_virq(kvm_state, vector->virq); vector->virq = -1; - event_notifier_cleanup(&vector->kvm_interrupt); + vfio_notifier_cleanup(vdev, &vector->kvm_interrupt, "kvm_interrupt", nr); } static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg, @@ -454,6 +569,20 @@ static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg, kvm_irqchip_commit_routes(kvm_state); } +static void vfio_vector_init(VFIOPCIDevice *vdev, int nr) +{ + VFIOMSIVector *vector = &vdev->msi_vectors[nr]; + PCIDevice *pdev = &vdev->pdev; + + vector->vdev = vdev; + vector->virq = -1; + if (vfio_notifier_init(vdev, &vector->interrupt, "interrupt", nr)) { + error_report("vfio: vfio_notifier_init interrupt failed"); + } + vector->use = true; + msix_vector_use(pdev, nr); +} + static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, MSIMessage *msg, IOHandler *handler) { @@ -461,18 +590,21 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, VFIOMSIVector *vector; int ret; + /* + * Ignore the callback from msix_set_vector_notifiers during resume. + * The necessary subset of these actions is called from vfio_claim_vectors + * during post load. 
+ */ + if (vdev->vbasedev.reused) { + return 0; + } + trace_vfio_msix_vector_do_use(vdev->vbasedev.name, nr); vector = &vdev->msi_vectors[nr]; if (!vector->use) { - vector->vdev = vdev; - vector->virq = -1; - if (event_notifier_init(&vector->interrupt, 0)) { - error_report("vfio: Error: event_notifier_init failed"); - } - vector->use = true; - msix_vector_use(pdev, nr); + vfio_vector_init(vdev, nr); } qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt), @@ -484,7 +616,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, */ if (vector->virq >= 0) { if (!msg) { - vfio_remove_kvm_msi_virq(vector); + vfio_remove_kvm_msi_virq(vdev, vector, nr); } else { vfio_update_kvm_msi_virq(vector, *msg, pdev); } @@ -629,8 +761,8 @@ retry: vector->virq = -1; vector->use = true; - if (event_notifier_init(&vector->interrupt, 0)) { - error_report("vfio: Error: event_notifier_init failed"); + if (vfio_notifier_init(vdev, &vector->interrupt, "interrupt", i)) { + error_report("vfio: Error: vfio_notifier_init failed"); } qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt), @@ -658,11 +790,11 @@ retry: for (i = 0; i < vdev->nr_vectors; i++) { VFIOMSIVector *vector = &vdev->msi_vectors[i]; if (vector->virq >= 0) { - vfio_remove_kvm_msi_virq(vector); + vfio_remove_kvm_msi_virq(vdev, vector, i); } qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt), NULL, NULL, NULL); - event_notifier_cleanup(&vector->interrupt); + vfio_notifier_cleanup(vdev, &vector->interrupt, "interrupt", i); } g_free(vdev->msi_vectors); @@ -697,11 +829,11 @@ static void vfio_msi_disable_common(VFIOPCIDevice *vdev) VFIOMSIVector *vector = &vdev->msi_vectors[i]; if (vdev->msi_vectors[i].use) { if (vector->virq >= 0) { - vfio_remove_kvm_msi_virq(vector); + vfio_remove_kvm_msi_virq(vdev, vector, i); } qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt), NULL, NULL, NULL); - event_notifier_cleanup(&vector->interrupt); + vfio_notifier_cleanup(vdev, &vector->interrupt, "interrupt", i); } } @@ -2406,6 +2538,8 @@ static int vfio_pci_hot_reset_one(VFIOPCIDevice *vdev) static int vfio_pci_hot_reset_multi(VFIODevice *vbasedev) { VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); + /* Reused dev should not call reset handler */ + assert(!vdev->vbasedev.reused); return vfio_pci_hot_reset(vdev, false); } @@ -2413,7 +2547,11 @@ static void vfio_pci_compute_needs_reset(VFIODevice *vbasedev) { VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); if (!vbasedev->reset_works || (!vdev->has_flr && vdev->has_pm_reset)) { - vbasedev->needs_reset = true; + if (vdev->vbasedev.reused) { + vbasedev->needs_reset = false; + } else { + vbasedev->needs_reset = true; + } } } @@ -2694,7 +2832,7 @@ static void vfio_register_err_notifier(VFIOPCIDevice *vdev) return; } - if (event_notifier_init(&vdev->err_notifier, 0)) { + if (vfio_notifier_init(vdev, &vdev->err_notifier, "err", 0)) { error_report("vfio: Unable to init event notifier for error detection"); vdev->pci_aer = false; return; @@ -2703,11 +2841,16 @@ static void vfio_register_err_notifier(VFIOPCIDevice *vdev) fd = event_notifier_get_fd(&vdev->err_notifier); qemu_set_fd_handler(fd, vfio_err_notifier_handler, NULL, vdev); + /* Do not alter irq_signaling during vfio_realize for cpr */ + if (vdev->vbasedev.reused) { + return; + } + if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_ERR_IRQ_INDEX, 0, VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) { error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); qemu_set_fd_handler(fd, 
NULL, NULL, vdev); - event_notifier_cleanup(&vdev->err_notifier); + vfio_notifier_cleanup(vdev, &vdev->err_notifier, "err_notifier", 0); vdev->pci_aer = false; } } @@ -2726,7 +2869,7 @@ static void vfio_unregister_err_notifier(VFIOPCIDevice *vdev) } qemu_set_fd_handler(event_notifier_get_fd(&vdev->err_notifier), NULL, NULL, vdev); - event_notifier_cleanup(&vdev->err_notifier); + vfio_notifier_cleanup(vdev, &vdev->err_notifier, "err_notifier", 0); } static void vfio_req_notifier_handler(void *opaque) @@ -2760,7 +2903,7 @@ static void vfio_register_req_notifier(VFIOPCIDevice *vdev) return; } - if (event_notifier_init(&vdev->req_notifier, 0)) { + if (vfio_notifier_init(vdev, &vdev->req_notifier, "req", 0)) { error_report("vfio: Unable to init event notifier for device request"); return; } @@ -2768,11 +2911,17 @@ static void vfio_register_req_notifier(VFIOPCIDevice *vdev) fd = event_notifier_get_fd(&vdev->req_notifier); qemu_set_fd_handler(fd, vfio_req_notifier_handler, NULL, vdev); + /* Do not alter irq_signaling during vfio_realize for cpr */ + if (vdev->vbasedev.reused) { + vdev->req_enabled = true; + return; + } + if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, 0, VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) { error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); qemu_set_fd_handler(fd, NULL, NULL, vdev); - event_notifier_cleanup(&vdev->req_notifier); + vfio_notifier_cleanup(vdev, &vdev->req_notifier, "req_notifier", 0); } else { vdev->req_enabled = true; } @@ -2792,7 +2941,7 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev) } qemu_set_fd_handler(event_notifier_get_fd(&vdev->req_notifier), NULL, NULL, vdev); - event_notifier_cleanup(&vdev->req_notifier); + vfio_notifier_cleanup(vdev, &vdev->req_notifier, "req_notifier", 0); vdev->req_enabled = false; } @@ -3056,9 +3205,13 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) vfio_intx_routing_notifier); vdev->irqchip_change_notifier.notify = vfio_irqchip_change; kvm_irqchip_add_change_notifier(&vdev->irqchip_change_notifier); - ret = vfio_intx_enable(vdev, errp); - if (ret) { - goto out_deregister; + + /* Wait until cpr load reads intx routing data to enable */ + if (!vdev->vbasedev.reused) { + ret = vfio_intx_enable(vdev, errp); + if (ret) { + goto out_deregister; + } } } @@ -3165,6 +3318,11 @@ static void vfio_pci_reset(DeviceState *dev) { VFIOPCIDevice *vdev = VFIO_PCI(dev); + /* Do not reset the device during qemu_system_reset prior to cpr load */ + if (vdev->vbasedev.reused) { + return; + } + trace_vfio_pci_reset(vdev->vbasedev.name); vfio_pci_pre_reset(vdev); @@ -3272,6 +3430,131 @@ static Property vfio_pci_dev_properties[] = { DEFINE_PROP_END_OF_LIST(), }; +static void vfio_claim_vectors(VFIOPCIDevice *vdev, int nr_vectors, bool msix) +{ + int i, fd; + bool pending = false; + PCIDevice *pdev = &vdev->pdev; + + vdev->nr_vectors = nr_vectors; + vdev->msi_vectors = g_new0(VFIOMSIVector, nr_vectors); + vdev->interrupt = msix ? 
VFIO_INT_MSIX : VFIO_INT_MSI; + + for (i = 0; i < nr_vectors; i++) { + VFIOMSIVector *vector = &vdev->msi_vectors[i]; + + fd = load_event_fd(vdev, "interrupt", i); + if (fd >= 0) { + vfio_vector_init(vdev, i); + qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL, vector); + } + + if (load_event_fd(vdev, "kvm_interrupt", i) >= 0) { + vfio_add_kvm_msi_virq(vdev, vector, i, msix); + } else { + vdev->msi_vectors[i].virq = -1; + } + + if (msix && msix_is_pending(pdev, i) && msix_is_masked(pdev, i)) { + set_bit(i, vdev->msix->pending); + pending = true; + } + } + + if (msix) { + memory_region_set_enabled(&pdev->msix_pba_mmio, pending); + } +} + +/* + * The kernel may change non-emulated config bits. Exclude them from the + * changed-bits check in get_pci_config_device. + */ +static int vfio_pci_pre_load(void *opaque) +{ + VFIOPCIDevice *vdev = opaque; + PCIDevice *pdev = &vdev->pdev; + int size = MIN(pci_config_size(pdev), vdev->config_size); + int i; + + for (i = 0; i < size; i++) { + pdev->cmask[i] &= vdev->emulated_config_bits[i]; + } + + return 0; +} + +static int vfio_pci_post_load(void *opaque, int version_id) +{ + VFIOPCIDevice *vdev = opaque; + PCIDevice *pdev = &vdev->pdev; + int nr_vectors; + int ret = 0; + + if (msix_enabled(pdev)) { + msix_set_vector_notifiers(pdev, vfio_msix_vector_use, + vfio_msix_vector_release, NULL); + nr_vectors = vdev->msix->entries; + vfio_claim_vectors(vdev, nr_vectors, true); + } else if (msi_enabled(pdev)) { + nr_vectors = msi_nr_vectors_allocated(pdev); + vfio_claim_vectors(vdev, nr_vectors, false); + } else if (vfio_pci_read_config(pdev, PCI_INTERRUPT_PIN, 1)) { + Error *err = 0; + ret = vfio_intx_enable(vdev, &err); + if (ret) { + error_report_err(err); + return ret; + } + } + + qemu_add_cpr_exec_complete_handler(vfio_device_interrupt_compensate, + (void *)vdev); + + return ret; +} + +static const VMStateDescription vfio_intx_vmstate = { + .name = "vfio-intx", + .version_id = 0, + .minimum_version_id = 0, + .fields = (VMStateField[]) { + VMSTATE_BOOL(pending, VFIOINTx), + VMSTATE_UINT32(route.mode, VFIOINTx), + VMSTATE_INT32(route.irq, VFIOINTx), + VMSTATE_END_OF_LIST() + } +}; + +#define VMSTATE_VFIO_INTX(_field, _state) { \ + .name = (stringify(_field)), \ + .size = sizeof(VFIOINTx), \ + .vmsd = &vfio_intx_vmstate, \ + .flags = VMS_STRUCT, \ + .offset = vmstate_offset_value(_state, _field, VFIOINTx), \ +} + +static bool vfio_pci_needed(void *opaque) +{ + return migrate_mode() == MIG_MODE_CPR_EXEC; +} + +static const VMStateDescription vfio_pci_vmstate = { + .name = "vfio-pci", + .version_id = 0, + .minimum_version_id = 0, + .priority = MIG_PRI_VFIO_PCI, /* must load before container */ + .pre_load = vfio_pci_pre_load, + .post_load = vfio_pci_post_load, + .needed = vfio_pci_needed, + .fields = (VMStateField[]) { + VMSTATE_PCI_DEVICE(pdev, VFIOPCIDevice), + VMSTATE_MSIX_TEST(pdev, VFIOPCIDevice, vfio_msix_present), + VMSTATE_VFIO_INTX(intx, VFIOPCIDevice), + VMSTATE_END_OF_LIST() + } +}; + static void vfio_pci_dev_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); @@ -3279,6 +3562,7 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, void *data) dc->reset = vfio_pci_reset; device_class_set_props(dc, vfio_pci_dev_properties); + dc->vmsd = &vfio_pci_vmstate; dc->desc = "VFIO-based PCI device assignment"; set_bit(DEVICE_CATEGORY_MISC, dc->categories); pdc->realize = vfio_realize; diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c index f8f08a0f362df21a122606921bbe63f4d6675fdc..3e91465db9fd23a73d609887afa73f3c3661a2c8 
100644 --- a/hw/vfio/platform.c +++ b/hw/vfio/platform.c @@ -425,12 +425,21 @@ fail_irqfd: static void vfio_platform_compute_needs_reset(VFIODevice *vbasedev) { - vbasedev->needs_reset = true; + if (vbasedev->reused) { + vbasedev->needs_reset = false; + } else { + vbasedev->needs_reset = true; + } } /* not implemented yet */ static int vfio_platform_hot_reset_multi(VFIODevice *vbasedev) { + /* + * Although the platform hot reset handler is not implemented, + * assert here to catch a reused device. + */ + assert(!vbasedev->reused); return -1; } diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index 0ef1b5f4a65ff38171380c91877002d13035bc34..dd88516e99de774b0bcf053a482454decb37b280 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -9,6 +9,7 @@ vfio_intx_update(const char *name, int new_irq, int target_irq) " (%s) IRQ moved vfio_intx_enable(const char *name) " (%s)" vfio_intx_disable(const char *name) " (%s)" vfio_msi_interrupt(const char *name, int index, uint64_t addr, int data) " (%s) vector %d 0x%"PRIx64"/0x%x" +vfio_msi_interrupt_compensate(const char *name, int index, uint64_t addr, int data) " (%s) vector %d 0x%"PRIx64"/0x%x" vfio_msix_vector_do_use(const char *name, int index) " (%s) vector %d used" vfio_msix_vector_release(const char *name, int index) " (%s) vector %d released" vfio_msix_enable(const char *name) " (%s)" @@ -118,6 +119,7 @@ vfio_region_sparse_mmap_header(const char *name, int index, int nr_areas) "Devic vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned long end) "sparse entry %d [0x%lx - 0x%lx]" vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%0x8" vfio_dma_unmap_overflow_workaround(void) "" +vfio_region_remap(const char *name, int fd, uint64_t iova_start, uint64_t iova_end, void *vaddr) "%s fd %d 0x%"PRIx64" - 0x%"PRIx64" [%p]" # platform.c vfio_platform_base_device_init(char *name, int groupid) "%s belongs to group #%d" diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index 3ac6cfde030c39b5f1d02265d9c3cfcd413184ec..a08ce500d290f0c79691dc68c42cf77467c0acdf 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -23,6 +23,7 @@ #include "standard-headers/linux/vhost_types.h" #include "hw/virtio/virtio-bus.h" #include "hw/virtio/virtio-access.h" +#include "migration/misc.h" #include "migration/blocker.h" #include "migration/qemu-file-types.h" #include "migration/migration.h" @@ -1350,6 +1351,37 @@ static bool vhost_dev_used_memslots_is_exceeded(struct vhost_dev *hdev) return false; } +static void vhost_cpr_exec_notifier(Notifier *notifier, void *data) +{ + MigrationState *s = data; + struct vhost_dev *dev; + int r = 0; + + if (migrate_mode_of(s) == MIG_MODE_CPR_EXEC) { + dev = container_of(notifier, struct vhost_dev, cpr_notifier); + if (migration_has_failed(s)) { + r = dev->vhost_ops->vhost_set_owner(dev); + } else { + /* + * Do not reset the vhost device while the status is MIGRATION_STATUS_SETUP, + * because slave_read still reads vring last_avail_idx and other + * information; a reset here would make slave_read fail. + * + * Normally the device is reset after migration succeeds, and the + * connection to the vhost backend is reestablished later.
+ */ + if (s->state == MIGRATION_STATUS_SETUP) { + VHOST_OPS_DEBUG("migration setup phase should not reset device"); + return; + } + r = dev->vhost_ops->vhost_reset_device(dev); + } + if (r < 0) { + VHOST_OPS_DEBUG("vhost_reset_device failed"); + } + } +} + int vhost_dev_init(struct vhost_dev *hdev, void *opaque, VhostBackendType backend_type, uint32_t busyloop_timeout, Error **errp) @@ -1359,6 +1391,7 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque, hdev->vdev = NULL; hdev->migration_blocker = NULL; + hdev->cpr_notifier.notify = NULL; r = vhost_set_backend_type(hdev, backend_type); assert(r >= 0); @@ -1435,9 +1468,9 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque, } if (hdev->migration_blocker != NULL) { - r = migrate_add_blocker(hdev->migration_blocker, errp); + r = migrate_add_blockers(&hdev->migration_blocker, errp, + MIG_MODE_NORMAL, -1); if (r < 0) { - error_free(hdev->migration_blocker); goto fail_busyloop; } } @@ -1450,6 +1483,7 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque, hdev->log_enabled = false; hdev->started = false; memory_listener_register(&hdev->memory_listener, &address_space_memory); + migration_add_notifier(&hdev->cpr_notifier, vhost_cpr_exec_notifier); QLIST_INSERT_HEAD(&vhost_devices, hdev, entry); /* @@ -1489,10 +1523,8 @@ void vhost_dev_cleanup(struct vhost_dev *hdev) memory_listener_unregister(&hdev->memory_listener); QLIST_REMOVE(hdev, entry); } - if (hdev->migration_blocker) { - migrate_del_blocker(hdev->migration_blocker); - error_free(hdev->migration_blocker); - } + migrate_del_blocker(&hdev->migration_blocker); + migration_remove_notifier(&hdev->cpr_notifier); g_free(hdev->mem); g_free(hdev->mem_sections); if (hdev->vhost_ops) { diff --git a/include/chardev/char.h b/include/chardev/char.h index f388d4b109b4641d235f64979ba8bef8b795bc6a..020fce339b59cb193469fb2d61f43877603bffb6 100644 --- a/include/chardev/char.h +++ b/include/chardev/char.h @@ -52,6 +52,8 @@ typedef enum { /* Whether the gcontext can be changed after calling * qemu_chr_be_update_read_handlers() */ QEMU_CHAR_FEATURE_GCONTEXT, + /* Whether the device supports cpr */ + QEMU_CHAR_FEATURE_CPR, QEMU_CHAR_FEATURE_LAST, } ChardevFeature; @@ -69,6 +71,9 @@ struct Chardev { int be_open; /* used to coordinate the chardev-change special-case: */ bool handover_yank_instance; + bool reopen_on_cpr; + bool cpr_enabled; + Error *cpr_blocker; GSource *gsource; GMainContext *gcontext; DECLARE_BITMAP(features, QEMU_CHAR_FEATURE_LAST); @@ -224,6 +229,7 @@ bool qemu_chr_has_feature(Chardev *chr, ChardevFeature feature); void qemu_chr_set_feature(Chardev *chr, ChardevFeature feature); +bool qemu_chr_cpr_support(Chardev *chr); QemuOpts *qemu_chr_parse_compat(const char *label, const char *filename, bool permit_mux_mon); int qemu_chr_write(Chardev *s, const uint8_t *buf, int len, bool write_all); diff --git a/include/exec/memory.h b/include/exec/memory.h index abb838f19459487959c37458d68f356c94ec93b3..90718e911bf863dfc84898df3d10d6872956656b 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -206,6 +206,9 @@ typedef struct IOMMUTLBEvent { /* RAM that isn't accessible through normal means. 
*/ #define RAM_PROTECTED (1 << 8) +/* RAM is an mmap-ed named file */ +#define RAM_NAMED_FILE (1 << 9) + static inline void iommu_notifier_init(IOMMUNotifier *n, IOMMUNotify fn, IOMMUNotifierFlag flags, hwaddr start, hwaddr end, @@ -735,6 +738,7 @@ struct MemoryRegion { bool flush_coalesced_mmio; uint8_t dirty_log_mask; bool is_iommu; + bool has_addr; RAMBlock *ram_block; Object *owner; @@ -2263,6 +2267,17 @@ void memory_region_set_enabled(MemoryRegion *mr, bool enabled); */ void memory_region_set_address(MemoryRegion *mr, hwaddr addr); +/* + * memory_region_set_address_only: set the address of a region. + * + * Same as memory_region_set_address, but without causing transaction side + * effects. + * + * @mr: the region to be updated + * @addr: new address, relative to container region + */ +void memory_region_set_address_only(MemoryRegion *mr, hwaddr addr); + /* * memory_region_set_size: dynamically update the size of a region. * @@ -2340,6 +2355,25 @@ static inline bool memory_region_has_ram_discard_manager(MemoryRegion *mr) void memory_region_set_ram_discard_manager(MemoryRegion *mr, RamDiscardManager *rdm); +typedef int (*memory_region_section_cb)(MemoryRegionSection *mrs, + void *opaque, + Error **errp); + +/** + * address_space_flat_for_each_section: walk the ranges in the address space + * flat view and call @func for each. Return 0 on success, else return non-zero + * with a message in @errp. + * + * @as: target address space + * @func: callback function + * @opaque: passed to @func + * @errp: passed to @func + */ +int address_space_flat_for_each_section(AddressSpace *as, + memory_region_section_cb func, + void *opaque, + Error **errp); + /** * memory_region_find: translate an address/size relative to a * MemoryRegion into a #MemoryRegionSection. 
@@ -3000,6 +3034,8 @@ bool ram_block_discard_is_disabled(void); */ bool ram_block_discard_is_required(void); +void ram_block_add_cpr_blockers(Error **errp); + #endif #endif diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h index 64fb936c7c748e6276a439bbed5c915c9c07993b..1d214b3b42024b91ec7deb9e4e9e54f907a4084f 100644 --- a/include/exec/ram_addr.h +++ b/include/exec/ram_addr.h @@ -94,6 +94,7 @@ static inline unsigned long int ramblock_recv_bitmap_offset(void *host_addr, } bool ramblock_is_pmem(RAMBlock *rb); +bool ramblock_is_named_file(RAMBlock *rb); long qemu_minrampagesize(void); long qemu_maxrampagesize(void); diff --git a/include/exec/ramblock.h b/include/exec/ramblock.h index 664701b7594982f81b0baf35d6f79d5c7020cd2c..83d2923e8b953bc7a016249346deffdddd5b2d96 100644 --- a/include/exec/ramblock.h +++ b/include/exec/ramblock.h @@ -37,6 +37,7 @@ struct RAMBlock { /* RCU-enabled, writes protected by the ramlist lock */ QLIST_ENTRY(RAMBlock) next; QLIST_HEAD(, RAMBlockNotifier) ramblock_notifiers; + Error *cpr_blocker; int fd; size_t page_size; /* dirty bitmap used during migration */ diff --git a/include/hw/pci/msix.h b/include/hw/pci/msix.h index 4c4a60c7399a0411c3c8c948f3be38e6af1d3913..00653548b70f1434e13b840d489bd9ab925e2d2b 100644 --- a/include/hw/pci/msix.h +++ b/include/hw/pci/msix.h @@ -32,6 +32,7 @@ int msix_present(PCIDevice *dev); bool msix_is_masked(PCIDevice *dev, unsigned vector); void msix_set_pending(PCIDevice *dev, unsigned vector); void msix_clr_pending(PCIDevice *dev, int vector); +int msix_is_pending(PCIDevice *dev, unsigned vector); int msix_vector_use(PCIDevice *dev, unsigned vector); void msix_vector_unuse(PCIDevice *dev, unsigned vector); diff --git a/include/hw/qdev-properties-system.h b/include/hw/qdev-properties-system.h index 906a0276761693411d6a259612d7a6dff6814e89..817dd8377aab8f27e4dfedf0041a6acf023d97a6 100644 --- a/include/hw/qdev-properties-system.h +++ b/include/hw/qdev-properties-system.h @@ -7,6 +7,7 @@ extern const PropertyInfo qdev_prop_chr; extern const PropertyInfo qdev_prop_macaddr; extern const PropertyInfo qdev_prop_reserved_region; extern const PropertyInfo qdev_prop_multifd_compression; +extern const PropertyInfo qdev_prop_mig_mode; extern const PropertyInfo qdev_prop_losttickpolicy; extern const PropertyInfo qdev_prop_blockdev_on_error; extern const PropertyInfo qdev_prop_blockdev_retry_interval; @@ -43,6 +44,9 @@ extern const PropertyInfo qdev_prop_pcie_link_width; #define DEFINE_PROP_MULTIFD_COMPRESSION(_n, _s, _f, _d) \ DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_multifd_compression, \ MultiFDCompression) +#define DEFINE_PROP_MIG_MODE(_n, _s, _f, _d) \ + DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_mig_mode, \ + MigMode) #define DEFINE_PROP_LOSTTICKPOLICY(_n, _s, _f, _d) \ DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_losttickpolicy, \ LostTickPolicy) diff --git a/include/hw/qdev-properties.h b/include/hw/qdev-properties.h index ea129d65a684f368bd207b7583c6d79acce62ddf..fa2b6b3b6827ca7d4a45f5005841e39a835a16f4 100644 --- a/include/hw/qdev-properties.h +++ b/include/hw/qdev-properties.h @@ -57,6 +57,7 @@ extern const PropertyInfo qdev_prop_uint64; extern const PropertyInfo qdev_prop_int64; extern const PropertyInfo qdev_prop_size; extern const PropertyInfo qdev_prop_string; +extern const PropertyInfo qdev_prop_strlist; extern const PropertyInfo qdev_prop_on_off_auto; extern const PropertyInfo qdev_prop_compress_method; extern const PropertyInfo qdev_prop_size32; @@ -160,6 +161,8 @@ extern const PropertyInfo qdev_prop_link; 
DEFINE_PROP_UNSIGNED(_n, _s, _f, _d, qdev_prop_size, uint64_t) #define DEFINE_PROP_STRING(_n, _s, _f) \ DEFINE_PROP(_n, _s, _f, qdev_prop_string, char*) +#define DEFINE_PROP_STRLIST(_n, _s, _f) \ + DEFINE_PROP(_n, _s, _f, qdev_prop_strlist, strList*) #define DEFINE_PROP_ON_OFF_AUTO(_n, _s, _f, _d) \ DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_on_off_auto, OnOffAuto) #define DEFINE_PROP_COMPRESS_METHOD(_n, _s, _f, _d) \ diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 0234f5e1b1627012021b976fddf2e4f8161b24e7..388e13e47ccc02446a188fcdfef14b991203a574 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -89,11 +89,15 @@ typedef struct VFIOContainer { int fd; /* /dev/vfio/vfio, empowered by the attached groups */ MemoryListener listener; MemoryListener prereg_listener; + Notifier cpr_notifier; + Error *cpr_blocker; unsigned iommu_type; Error *error; bool initialized; bool dirty_pages_supported; bool dirty_log_manual_clear; + bool reused; + bool vaddr_unmapped; uint64_t dirty_pgsizes; uint64_t max_dirty_bitmap_size; unsigned long pgsizes; @@ -146,6 +150,7 @@ typedef struct VFIODevice { bool no_mmap; bool ram_block_discard_allowed; bool enable_migration; + bool reused; VFIODeviceOps *ops; unsigned int num_irqs; unsigned int num_regions; @@ -223,6 +228,9 @@ void vfio_put_group(VFIOGroup *group); int vfio_get_device(VFIOGroup *group, const char *name, VFIODevice *vbasedev, Error **errp); +int vfio_cpr_register_container(VFIOContainer *container, Error **errp); +void vfio_cpr_unregister_container(VFIOContainer *container); + extern const MemoryRegionOps vfio_region_ops; typedef QLIST_HEAD(VFIOGroupList, VFIOGroup) VFIOGroupList; extern VFIOGroupList vfio_group_list; @@ -244,6 +252,9 @@ struct vfio_info_cap_header * vfio_get_device_info_cap(struct vfio_device_info *info, uint16_t id); #endif extern const MemoryListener vfio_prereg_listener; +void vfio_listener_register(VFIOContainer *container); +void vfio_container_region_add(VFIOContainer *container, + MemoryRegionSection *section, bool remap); int vfio_spapr_create_window(VFIOContainer *container, MemoryRegionSection *section, diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h index 86f36f010651bcfd8b1dee4e5b819deecd1dbcd8..c17d55ecfa76b4a834bb7a930e84c49dc6b4e185 100644 --- a/include/hw/virtio/vhost.h +++ b/include/hw/virtio/vhost.h @@ -94,6 +94,7 @@ struct vhost_dev { QLIST_ENTRY(vhost_dev) entry; QLIST_HEAD(, vhost_iommu) iommu_list; IOMMUNotifier n; + Notifier cpr_notifier; const VhostDevConfigOps *config_ops; }; diff --git a/include/migration/blocker.h b/include/migration/blocker.h index 9cebe2ba06acd669202721320c1d3ec2a6178879..1a2193787b483ca82e74123c9af164e21b0bc232 100644 --- a/include/migration/blocker.h +++ b/include/migration/blocker.h @@ -14,22 +14,62 @@ #ifndef MIGRATION_BLOCKER_H #define MIGRATION_BLOCKER_H +#include "qapi/qapi-types-migration.h" + +#define MIG_MODE_ALL MIG_MODE__MAX + /** - * @migrate_add_blocker - prevent migration from proceeding + * @migrate_add_blocker - prevent all modes of migration from proceeding * - * @reason - an error to be returned whenever migration is attempted + * @reasonp - address of an error to be returned whenever migration is attempted * * @errp - [out] The reason (if any) we cannot block migration right now. * * @returns - 0 on success, -EBUSY/-EACCES on failure, with errp set. + * + * *@reasonp is freed and set to NULL if failure is returned. 
+ * On success, the caller must not free *@reasonp before the blocker is removed. */ -int migrate_add_blocker(Error *reason, Error **errp); +int migrate_add_blocker(Error **reasonp, Error **errp); + +/** + * @migrate_add_blockers - prevent migration for specified modes from proceeding + * + * @reasonp - address of an error to be returned whenever migration is attempted + * + * @errp - [out] The reason (if any) we cannot block migration right now. + * + * @mode - one or more migration modes to be blocked. The list is terminated + * by -1 or MIG_MODE_ALL. For the latter, all modes are blocked. + * + * @returns - 0 on success, -EBUSY/-EACCES on failure, with errp set. + * + * *@reasonp is freed and set to NULL if failure is returned. + * On success, the caller must not free *@reasonp before the blocker is removed. + */ +int migrate_add_blockers(Error **reasonp, Error **errp, MigMode mode, ...); + +/** + * @migrate_add_blocker_always - permanently prevent migration for specified + * modes from proceeding. The blocker cannot be deleted. + * + * @msg - text of error to be returned whenever migration is attempted + * + * @errp - [out] The reason (if any) we cannot block migration right now. + * + * @mode - one or more migration modes to be blocked. The list is terminated + * by -1 or MIG_MODE_ALL. For the latter, all modes are blocked. + * + * @returns - 0 on success, -EBUSY/-EACCES on failure, with errp set. + */ +int +migrate_add_blocker_always(const char *msg, Error **errp, MigMode mode, ...); /** * @migrate_add_blocker_internal - prevent migration from proceeding without - * only-migrate implications + * only-migrate implications, for all modes * - * @reason - an error to be returned whenever migration is attempted + * @reasonp - address of an error to be returned whenever migration is attempted * * @errp - [out] The reason (if any) we cannot block migration right now. * @@ -38,14 +78,27 @@ int migrate_add_blocker(Error *reason, Error **errp); * Some of the migration blockers can be temporary (e.g., for a few seconds), * so it shouldn't need to conflict with "-only-migratable". For those cases, * we can call this function rather than @migrate_add_blocker(). + * + * *@reasonp is freed and set to NULL if failure is returned. + * On success, the caller must not free *@reasonp before the blocker is removed. */ -int migrate_add_blocker_internal(Error *reason, Error **errp); +int migrate_add_blocker_internal(Error **reasonp, Error **errp); /** - * @migrate_del_blocker - remove a blocking error from migration + * @migrate_del_blocker - remove a migration blocker for all modes and free it. + * + * @reasonp - address of the error blocking migration + * + * This function frees *@reasonp and sets it to NULL. + */ +void migrate_del_blocker(Error **reasonp); + +/** + * @migrate_remove_blocker - remove a migration blocker for all modes. * * @reason - the error blocking migration + * */ -void migrate_del_blocker(Error *reason); +void migrate_remove_blocker(Error *reason); #endif diff --git a/include/migration/cpr-state.h b/include/migration/cpr-state.h new file mode 100644 index 0000000000000000000000000000000000000000..caa559824de55563b717ec561bdbe680bf1e7a94 --- /dev/null +++ b/include/migration/cpr-state.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2021, 2022 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * See the COPYING file in the top-level directory. 
+ */ + +#ifndef MIGRATION_CPR_STATE_H +#define MIGRATION_CPR_STATE_H + +#include "qapi/qapi-types-migration.h" + +/* + * Here, we use the cpr's file descriptor inheritance mechanism to + * pass the monitor capability. We define a special id number 0xFFFF + * to indicate that the fd is not a file descriptor. but a monitor + * capability. + * */ +#define MONITOR_CAPAB 0XFFFF + +typedef int (*cpr_walk_fd_cb)(const char *name, int id, int fd, void *opaque); + +void cpr_save_fd(const char *name, int id, int fd); +void cpr_delete_fd(const char *name, int id); +int cpr_find_fd(const char *name, int id); +int cpr_walk_fd(cpr_walk_fd_cb cb, void *handle); +void cpr_save_memfd(const char *name, int fd, size_t len, size_t maxlen, + uint64_t align); +int cpr_find_memfd(const char *name, size_t *lenp, size_t *maxlenp, + uint64_t *alignp); +void cpr_delete_memfd(const char *name); +void cpr_resave_fd(const char *name, int id, int fd); +int cpr_state_save(Error **errp); +void cpr_state_unsave(void); +int cpr_state_load(Error **errp); +void cpr_state_print(void); + +#endif diff --git a/include/migration/cpr.h b/include/migration/cpr.h new file mode 100644 index 0000000000000000000000000000000000000000..07907975e3a87c593f10af3f4b71d4c93be8cd48 --- /dev/null +++ b/include/migration/cpr.h @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2021, 2022 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * See the COPYING file in the top-level directory. + */ + +#ifndef MIGRATION_CPR_H +#define MIGRATION_CPR_H + +#define CPR_MODES (BIT(MIG_MODE_CPR_EXEC)) + +extern bool only_cpr_capable; + +void cpr_init(void); +void cpr_exec(void); +void cpr_exec_failed(Error *err); +void cpr_preserve_fds(void); + +#endif diff --git a/include/migration/misc.h b/include/migration/misc.h index 465906710de808e9381a3ed1c4142a533a74d940..9b7c0062dc9cd3aa081928919e7ecd116aac0db9 100644 --- a/include/migration/misc.h +++ b/include/migration/misc.h @@ -15,6 +15,7 @@ #define MIGRATION_MISC_H #include "qemu/notify.h" +#include "qapi/qapi-types-migration.h" #include "qapi/qapi-types-net.h" /* migration/ram.c */ @@ -55,12 +56,16 @@ AnnounceParameters *migrate_announce_params(void); void dump_vmstate_json_to_file(FILE *out_fp); /* migration/migration.c */ +void migration_object_early_init(void); void migration_object_init(void); void migration_shutdown(void); bool migration_is_idle(void); bool migration_is_active(MigrationState *); -void add_migration_state_change_notifier(Notifier *notify); -void remove_migration_state_change_notifier(Notifier *notify); +void migration_add_notifier(Notifier *notify, + void (*func)(Notifier *notifier, void *data)); +void migration_remove_notifier(Notifier *notify); +int migration_call_notifiers(MigrationState *s); +void migration_notifier_set_error(MigrationState *s, Error *err); bool migration_in_setup(MigrationState *); bool migration_has_finished(MigrationState *); bool migration_has_failed(MigrationState *); @@ -75,4 +80,9 @@ bool migration_in_bg_snapshot(void); /* migration/block-dirty-bitmap.c */ void dirty_bitmap_mig_init(void); +MigMode migrate_mode(void); +MigMode migrate_mode_of(MigrationState *s); +void migrate_enable_mode(MigMode mode); +bool migrate_mode_enabled(MigMode mode); + #endif diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h index 017c03675ca48af3e61e4fced931d5b5c8e81e3c..7aca6c5c2301d953849433a15d1029d3550edcdd 100644 --- a/include/migration/vmstate.h +++ b/include/migration/vmstate.h @@ -157,6 +157,8 @@ typedef enum { 
MIG_PRI_GICV3_ITS, /* Must happen before PCI devices */ MIG_PRI_GICV3, /* Must happen before the ITS */ MIG_PRI_MAX, + MIG_PRI_VFIO_PCI = + MIG_PRI_DEFAULT + 1, /* Must happen before vfio containers */ } MigrationPriority; struct VMStateField { diff --git a/include/qapi/util.h b/include/qapi/util.h index 81a2b13a3339c7f6341cc43f8679542949684cfe..51ff64e7579267c045f7342f77f68f774208ac01 100644 --- a/include/qapi/util.h +++ b/include/qapi/util.h @@ -22,6 +22,8 @@ typedef struct QEnumLookup { const int size; } QEnumLookup; +struct strList; + const char *qapi_enum_lookup(const QEnumLookup *lookup, int val); int qapi_enum_parse(const QEnumLookup *lookup, const char *buf, int def, Error **errp); @@ -30,6 +32,19 @@ bool qapi_bool_parse(const char *name, const char *value, bool *obj, int parse_qapi_name(const char *name, bool complete); +/* + * Produce and return a NULL-terminated array of strings from @args. + * All strings are g_strdup'd. + */ +GStrv strv_from_strList(const struct strList *args); + +/* + * Produce a strList from the character delimited string @in. + * All strings are g_strdup'd. + * A NULL or empty input string returns NULL. + */ +struct strList *strList_from_string(const char *in, char delim); + /* * For any GenericList @list, insert @element at the front. * @@ -56,4 +71,17 @@ int parse_qapi_name(const char *name, bool complete); (tail) = &(*(tail))->next; \ } while (0) +/* + * For any GenericList @list, return its length. + */ +#define QAPI_LIST_LENGTH(list) \ + ({ \ + int len = 0; \ + typeof(list) elem; \ + for (elem = list; elem != NULL; elem = elem->next) { \ + len++; \ + } \ + len; \ + }) + #endif diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h index fd9e53f6239ab3cd50d95ccfa9507dc392cff202..c8cba40b9683ff989137ff640c20ad93876ec22f 100644 --- a/include/qemu/osdep.h +++ b/include/qemu/osdep.h @@ -642,6 +642,15 @@ static inline void qemu_timersub(const struct timeval *val1, void qemu_set_cloexec(int fd); +/* + * Clear FD_CLOEXEC for a descriptor. + * + * The caller must guarantee that no other fork+exec's occur before the + * exec that is intended to inherit this descriptor, eg by suspending CPUs + * and blocking monitor commands. + */ +void qemu_clear_cloexec(int fd); + /* Starting on QEMU 2.5, qemu_hw_version() returns "2.5+" by default * instead of QEMU_VERSION, so setting hw_version on MachineClass * is no longer mandatory. 
diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h index ee60eb3de47a499e0f103c6524a9bcfeb2adaa73..a55b0ffaa1bb2686175ad1a552e3b106ccdbdc47 100644 --- a/include/qemu/typedefs.h +++ b/include/qemu/typedefs.h @@ -123,6 +123,7 @@ typedef struct SSIBus SSIBus; typedef struct TranslationBlock TranslationBlock; typedef struct VirtIODevice VirtIODevice; typedef struct Visitor Visitor; +typedef struct CprExecCompleteEntry CprExecCompleteEntry; typedef struct VMChangeStateEntry VMChangeStateEntry; typedef struct VMStateDescription VMStateDescription; diff --git a/include/sysemu/runstate.h b/include/sysemu/runstate.h index a53569157343499a997188621b664a3062183020..d0d6b7facf4512081bedd88e6a5459b5ed145770 100644 --- a/include/sysemu/runstate.h +++ b/include/sysemu/runstate.h @@ -10,6 +10,12 @@ bool runstate_is_running(void); bool runstate_needs_reset(void); bool runstate_store(char *str, size_t size); +typedef void CprExecCompleteHandler(void *opaque); +CprExecCompleteEntry *qemu_add_cpr_exec_complete_handler( + CprExecCompleteHandler *cb, void *opaque); +void qemu_del_all_cpr_exec_complete_handler(void); +void cpr_exec_complete_notify(void); + typedef void VMChangeStateHandler(void *opaque, bool running, RunState state); VMChangeStateEntry *qemu_add_vm_change_state_handler(VMChangeStateHandler *cb, @@ -51,11 +57,13 @@ void qemu_system_reset_request(ShutdownCause reason); void qemu_system_suspend_request(void); void qemu_register_suspend_notifier(Notifier *notifier); bool qemu_wakeup_suspend_enabled(void); +void qemu_system_start_on_wakeup_request(void); void qemu_system_wakeup_request(WakeupReason reason, Error **errp); void qemu_system_wakeup_enable(WakeupReason reason, bool enabled); void qemu_register_wakeup_notifier(Notifier *notifier); void qemu_register_wakeup_support(void); void qemu_system_shutdown_request(ShutdownCause reason); +void qemu_system_exec_request(const strList *args); void qemu_system_powerdown_request(void); void qemu_register_powerdown_notifier(Notifier *notifier); void qemu_register_shutdown_notifier(Notifier *notifier); diff --git a/include/sysemu/seccomp.h b/include/sysemu/seccomp.h index fe859894f6b22ca19c7f2983754d525165514bb5..a88f95b86936b4525fb6a4ee85f104b21a98a885 100644 --- a/include/sysemu/seccomp.h +++ b/include/sysemu/seccomp.h @@ -21,6 +21,7 @@ #define QEMU_SECCOMP_SET_SPAWN (1 << 3) #define QEMU_SECCOMP_SET_RESOURCECTL (1 << 4) +int cpr_exec_unset_spawn(void *opaque, QemuOpts *opts, Error **errp); int parse_sandbox(void *opaque, QemuOpts *opts, Error **errp); #endif diff --git a/migration/cpr-state.c b/migration/cpr-state.c new file mode 100644 index 0000000000000000000000000000000000000000..5a8c2757afef35090e45d0336ad0953e10f5cc82 --- /dev/null +++ b/migration/cpr-state.c @@ -0,0 +1,354 @@ +/* + * Copyright (c) 2022 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qemu/cutils.h" +#include "qemu/queue.h" +#include "qemu/memfd.h" +#include "qapi/error.h" +#include "io/channel-file.h" +#include "migration/vmstate.h" +#include "migration/cpr-state.h" +#include "migration/migration.h" +#include "migration/misc.h" +#include "migration/qemu-file.h" +#include "migration/qemu-file-channel.h" +#include "sysemu/sysemu.h" +#include "trace.h" + +/*************************************************************************/ +/* cpr state container for all information to be saved. 
*/ + +typedef QLIST_HEAD(CprNameList, CprName) CprNameList; + +typedef struct CprState { + MigMode mode; + CprNameList fds; /* list of CprFd */ + CprNameList memfd; /* list of CprMemfd */ +} CprState; + +static CprState cpr_state = { + .mode = MIG_MODE_NORMAL, +}; + +/*************************************************************************/ +/* Generic list of names. */ + +typedef struct CprName { + char *name; + unsigned int namelen; + int id; + QLIST_ENTRY(CprName) next; +} CprName; + +static const VMStateDescription vmstate_cpr_name = { + .name = "cpr name", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_UINT32(namelen, CprName), + VMSTATE_VBUFFER_ALLOC_UINT32(name, CprName, 0, NULL, namelen), + VMSTATE_INT32(id, CprName), + VMSTATE_END_OF_LIST() + } +}; + +static void +add_name(CprNameList *head, const char *name, int id, CprName *elem) +{ + elem->name = g_strdup(name); + elem->namelen = strlen(name) + 1; + elem->id = id; + QLIST_INSERT_HEAD(head, elem, next); +} + +static CprName *find_name(CprNameList *head, const char *name, int id) +{ + CprName *elem; + + QLIST_FOREACH(elem, head, next) { + if (!strcmp(elem->name, name) && elem->id == id) { + return elem; + } + } + return NULL; +} + +static void delete_name(CprNameList *head, const char *name, int id) +{ + CprName *elem = find_name(head, name, id); + + if (elem) { + QLIST_REMOVE(elem, next); + g_free(elem->name); + g_free(elem); + } +} + +/****************************************************************************/ +/* Lists of named things. The first field of each entry must be a CprName. */ + +typedef struct CprFd { + CprName name; /* must be first */ + int fd; +} CprFd; + +static const VMStateDescription vmstate_cpr_fd = { + .name = "cpr fd", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_STRUCT(name, CprFd, 1, vmstate_cpr_name, CprName), + VMSTATE_INT32(fd, CprFd), + VMSTATE_END_OF_LIST() + } +}; + +#define CPR_FD(elem) ((CprFd *)(elem)) +#define CPR_FD_FD(elem) (CPR_FD(elem)->fd) + +void cpr_save_fd(const char *name, int id, int fd) +{ + CprFd *elem = g_new0(CprFd, 1); + + trace_cpr_save_fd(name, id, fd); + elem->fd = fd; + add_name(&cpr_state.fds, name, id, &elem->name); +} + +void cpr_delete_fd(const char *name, int id) +{ + trace_cpr_delete_fd(name, id); + delete_name(&cpr_state.fds, name, id); +} + +int cpr_find_fd(const char *name, int id) +{ + CprName *elem = find_name(&cpr_state.fds, name, id); + int fd = elem ? CPR_FD_FD(elem) : -1; + + if (fd >= 0) { + /* Set cloexec to prevent fd leaks from fork until the next cpr-exec */ + qemu_set_cloexec(fd); + } + + trace_cpr_find_fd(name, id, fd); + return fd; +} + +int cpr_walk_fd(cpr_walk_fd_cb cb, void *opaque) +{ + CprName *elem; + + QLIST_FOREACH(elem, &cpr_state.fds, next) { + if (cb(elem->name, elem->id, CPR_FD_FD(elem), opaque)) { + return 1; + } + } + return 0; +} + +void cpr_resave_fd(const char *name, int id, int fd) +{ + CprName *elem = find_name(&cpr_state.fds, name, id); + int old_fd = elem ? CPR_FD_FD(elem) : -1; + + if (old_fd < 0) { + cpr_save_fd(name, id, fd); + } else if (old_fd != fd) { + error_setg(&error_fatal, + "internal error: cpr fd '%s' id %d value %d " + "already saved with a different value %d", + name, id, fd, old_fd); + } +} + +/*************************************************************************/ +/* A memfd ram block. 
*/ + +typedef struct CprMemfd { + CprName name; /* must be first */ + size_t len; + size_t maxlen; + uint64_t align; +} CprMemfd; + +static const VMStateDescription vmstate_cpr_memfd = { + .name = "cpr memfd", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_STRUCT(name, CprMemfd, 1, vmstate_cpr_name, CprName), + VMSTATE_UINT64(len, CprMemfd), + VMSTATE_UINT64(maxlen, CprMemfd), + VMSTATE_UINT64(align, CprMemfd), + VMSTATE_END_OF_LIST() + } +}; + +#define CPR_MEMFD(elem) ((CprMemfd *)(elem)) +#define CPR_MEMFD_LEN(elem) (CPR_MEMFD(elem)->len) +#define CPR_MEMFD_MAXLEN(elem) (CPR_MEMFD(elem)->maxlen) +#define CPR_MEMFD_ALIGN(elem) (CPR_MEMFD(elem)->align) + +void cpr_save_memfd(const char *name, int fd, size_t len, size_t maxlen, + uint64_t align) +{ + CprMemfd *elem = g_new0(CprMemfd, 1); + + trace_cpr_save_memfd(name, len, maxlen, align); + elem->len = len; + elem->maxlen = maxlen; + elem->align = align; + add_name(&cpr_state.memfd, name, 0, &elem->name); + cpr_save_fd(name, 0, fd); +} + +void cpr_delete_memfd(const char *name) +{ + trace_cpr_delete_memfd(name); + delete_name(&cpr_state.memfd, name, 0); + cpr_delete_fd(name, 0); +} + +int cpr_find_memfd(const char *name, size_t *lenp, size_t *maxlenp, + uint64_t *alignp) +{ + int fd = cpr_find_fd(name, 0); + CprName *elem = find_name(&cpr_state.memfd, name, 0); + + if (elem) { + *lenp = CPR_MEMFD_LEN(elem); + *maxlenp = CPR_MEMFD_MAXLEN(elem); + *alignp = CPR_MEMFD_ALIGN(elem); + } else { + *lenp = 0; + *maxlenp = 0; + *alignp = 0; + } + + trace_cpr_find_memfd(name, *lenp, *maxlenp, *alignp); + return fd; +} + +/*************************************************************************/ +/* cpr state container interface and implementation. */ + +#define CPR_STATE_NAME "QEMU_CPR_STATE" + +static const VMStateDescription vmstate_cpr_state = { + .name = CPR_STATE_NAME, + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_UINT32(mode, CprState), + VMSTATE_QLIST_V(fds, CprState, 1, vmstate_cpr_fd, CprFd, name.next), + VMSTATE_QLIST_V(memfd, CprState, 1, vmstate_cpr_memfd, CprMemfd, + name.next), + VMSTATE_END_OF_LIST() + } +}; + +int cpr_state_save(Error **errp) +{ + int ret, mfd; + QEMUFile *f; + char val[16]; + + mfd = memfd_create(CPR_STATE_NAME, 0); + if (mfd < 0) { + error_setg_errno(errp, errno, "memfd_create failed"); + return -1; + } + + cpr_state.mode = migrate_mode(); + qemu_clear_cloexec(mfd); + + f = qemu_fopen_fd(mfd, true, CPR_STATE_NAME); + ret = vmstate_save_state(f, &vmstate_cpr_state, &cpr_state, 0); + if (ret) { + error_setg(errp, "vmstate_save_state error %d", ret); + goto error; + } + + /* Do not close f, as mfd must remain open. 
*/ + qemu_fflush(f); + lseek(mfd, 0, SEEK_SET); + + /* Remember mfd for post-exec cpr_state_load */ + snprintf(val, sizeof(val), "%d", mfd); + g_setenv(CPR_STATE_NAME, val, 1); + + return 0; + +error: + close(mfd); + cpr_state.mode = MIG_MODE_NORMAL; + return ret; +} + +void cpr_state_unsave(void) +{ + int mfd; + const char *val = g_getenv(CPR_STATE_NAME); + + if (val) { + g_unsetenv(CPR_STATE_NAME); + if (!qemu_strtoi(val, NULL, 10, &mfd)) { + close(mfd); + } + } +} + +int cpr_state_load(Error **errp) +{ + int ret, mfd; + QEMUFile *f; + const char *val = g_getenv(CPR_STATE_NAME); + + if (!val) { + return 0; + } + g_unsetenv(CPR_STATE_NAME); + if (qemu_strtoi(val, NULL, 10, &mfd)) { + error_setg(errp, "Bad %s env value %s", CPR_STATE_NAME, val); + return 1; + } + f = qemu_fopen_fd(mfd, false, CPR_STATE_NAME); + ret = vmstate_load_state(f, &vmstate_cpr_state, &cpr_state, 1); + qemu_fclose(f); + + if (!ret) { + migrate_get_current()->parameters.mode = cpr_state.mode; + assert(migrate_mode() == MIG_MODE_CPR_EXEC); + /* + * cpr-exec mode ignores the qemu -S option, to reduce interactions + * with libvirtd and accelerate qemu live update. + */ + autostart = true; + } else { + error_setg(errp, "vmstate_load_state error %d", ret); + } + + return ret; +} + +void cpr_state_print(void) +{ + CprName *elem; + + printf("cpr_state:\n"); + printf("- mode = %d\n", cpr_state.mode); + QLIST_FOREACH(elem, &cpr_state.fds, next) { + printf("- %s %d : fd=%d\n", elem->name, elem->id, CPR_FD_FD(elem)); + } + QLIST_FOREACH(elem, &cpr_state.memfd, next) { + printf("- %s : len=%lu, maxlen=%lu, align=%lu\n", elem->name, + CPR_MEMFD_LEN(elem), CPR_MEMFD_MAXLEN(elem), + CPR_MEMFD_ALIGN(elem)); + } +} diff --git a/migration/cpr.c b/migration/cpr.c new file mode 100644 index 0000000000000000000000000000000000000000..3b82557435b0c47156a06a2b1401c60f6dd08c7a --- /dev/null +++ b/migration/cpr.c @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2021, 2022 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * See the COPYING file in the top-level directory. 
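+ *
+ * cpr.c drives the exec step of checkpoint/restart: it preserves registered
+ * file descriptors across exec, saves cpr state, and exec's the new binary
+ * from cpr_exec() once a cpr-exec mode migration has completed.
+ *
+ * Illustrative HMP flow (paths and extra arguments are placeholders; the
+ * mode must have been enabled with -migrate-mode-enable cpr-exec):
+ *   migrate_set_parameter mode cpr-exec
+ *   migrate_set_parameter cpr-exec-args /path/to/new-qemu ... -incoming file:/tmp/vm.state
+ *   migrate -d file:/tmp/vm.state
+ * When the migration reaches 'completed', migrate_fd_cleanup() calls
+ * cpr_exec(), which saves cpr state and requests the exec.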
+ */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "migration/migration.h" +#include "migration/misc.h" +#include "migration/cpr.h" +#include "migration/cpr-state.h" +#include "sysemu/runstate.h" + +bool only_cpr_capable; +static Notifier cpr_fd_notifier; + +static int preserve_fd(const char *name, int id, int fd, void *opaque) +{ + /* MONITOR_CAPAB means fd is not a file descriptor */ + if (id != MONITOR_CAPAB) + qemu_clear_cloexec(fd); + + return 0; +} + +static int unpreserve_fd(const char *name, int id, int fd, void *opaque) +{ + qemu_set_cloexec(fd); + return 0; +} + +static void cpr_fd_notifier_func(Notifier *notifier, void *data) +{ + MigrationState *s = data; + + if (migrate_mode_of(s) == MIG_MODE_CPR_EXEC && migration_has_failed(s)) { + cpr_walk_fd(unpreserve_fd, 0); + } +} + +void cpr_preserve_fds(void) +{ + cpr_walk_fd(preserve_fd, 0); +} + +void cpr_init(void) +{ + cpr_state_load(&error_fatal); + migration_add_notifier(&cpr_fd_notifier, cpr_fd_notifier_func); +} + +void cpr_exec(void) +{ + MigrationState *s = migrate_get_current(); + Error *err = NULL; + + if (migrate_mode_of(s) == MIG_MODE_CPR_EXEC && !migration_has_failed(s)) { + if (!migration_has_finished(s)) { + error_setg(&err, "cannot exec: migration status is '%s', " + "but must be 'completed'", + MigrationStatus_str(s->state)); + goto error; + } + + if (cpr_state_save(&err)) { + goto error; + } + + qemu_system_exec_request(s->parameters.cpr_exec_args); + } + return; + +error: + cpr_exec_failed(err); +} + +void cpr_exec_failed(Error *err) +{ + MigrationState *s = migrate_get_current(); + + migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED); + migrate_set_error(s, err); + error_report_err(err); + migration_call_notifiers(s); + cpr_state_unsave(); +} diff --git a/migration/file.c b/migration/file.c new file mode 100644 index 0000000000000000000000000000000000000000..7708c3c3f7e53d1d9926f24aa93ae20503d9582f --- /dev/null +++ b/migration/file.c @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2021, 2022 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * See the COPYING file in the top-level directory. 
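+ *
+ * The file: transport writes the migration stream to <filename>.<pid> and
+ * reads it back from the same name; with cpr-exec the pid is retained
+ * across exec, so the outgoing and incoming sides agree on the file name.
+ * The incoming side registers a handler that unlinks the state file once
+ * the live update completes.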
+ */ + +#include "qemu/osdep.h" +#include "channel.h" +#include "file.h" +#include "migration.h" +#include "io/channel-file.h" +#include "io/channel-util.h" +#include "trace.h" +#include "sysemu/runstate.h" +#include + +void file_start_outgoing_migration(MigrationState *s, const char *filename, + Error **errp) +{ + g_autoptr(QIOChannelFile) fioc = NULL; + QIOChannel *ioc; + pid_t pid; + char *filename_p; + + pid = getpid(); + filename_p = g_strdup_printf("%s.%d", filename, pid); + trace_migration_file_outgoing(filename_p); + + fioc = qio_channel_file_new_path(filename_p, O_CREAT | O_WRONLY | O_TRUNC, + 0600, errp); + if (!fioc) { + g_free(filename_p); + return; + } + + ioc = QIO_CHANNEL(fioc); + qio_channel_set_name(ioc, "migration-file-outgoing"); + migration_channel_connect(s, ioc, NULL, NULL); + g_free(filename_p); +} + +static void file_migrate_complete_unlink_file(void *opaque) +{ + char *filename = opaque; + unlink(filename); + g_free(filename); +} + +static gboolean file_accept_incoming_migration(QIOChannel *ioc, + GIOCondition condition, + gpointer opaque) +{ + migration_channel_process_incoming(ioc); + object_unref(OBJECT(ioc)); + return G_SOURCE_REMOVE; +} + +void file_start_incoming_migration(const char *filename, Error **errp) +{ + QIOChannelFile *fioc = NULL; + QIOChannel *ioc; + pid_t pid; + char *filename_p; + + pid = getpid(); + filename_p = g_strdup_printf("%s.%d", filename, pid); + trace_migration_file_incoming(filename_p); + + + fioc = qio_channel_file_new_path(filename_p, O_RDONLY, 0, errp); + if (!fioc) { + g_free(filename_p); + return; + } + + ioc = QIO_CHANNEL(fioc); + qio_channel_set_name(QIO_CHANNEL(ioc), "migration-file-incoming"); + qio_channel_add_watch_full(ioc, G_IO_IN, + file_accept_incoming_migration, + NULL, NULL, + g_main_context_get_thread_default()); + + /* + * Register Handler to delete VM state save file when + * qemu live update complete + */ + qemu_add_cpr_exec_complete_handler(file_migrate_complete_unlink_file, + (void *)filename_p); +} diff --git a/migration/file.h b/migration/file.h new file mode 100644 index 0000000000000000000000000000000000000000..aa697df5d75533e96e41066f53ef483e6a8e7c27 --- /dev/null +++ b/migration/file.h @@ -0,0 +1,14 @@ +/* + * Copyright (c) 2021, 2022 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * See the COPYING file in the top-level directory. 
+ */ + +#ifndef QEMU_MIGRATION_FILE_H +#define QEMU_MIGRATION_FILE_H +void file_start_incoming_migration(const char *filename, Error **errp); + +void file_start_outgoing_migration(MigrationState *s, const char *filename, + Error **errp); +#endif diff --git a/migration/meson.build b/migration/meson.build index f8714dcb154f86975483d8e1d7657e4d43e18afc..494774a575559e7dccc3a38eb7e641fcdef6b121 100644 --- a/migration/meson.build +++ b/migration/meson.build @@ -15,8 +15,11 @@ softmmu_ss.add(files( 'channel.c', 'colo-failover.c', 'colo.c', + 'cpr.c', + 'cpr-state.c', 'exec.c', 'fd.c', + 'file.c', 'global_state.c', 'migration.c', 'multifd.c', diff --git a/migration/migration.c b/migration/migration.c index 2ec116f9010f0ca8865855868113cea242db01ce..ad4bca606bb722ae1b3f7bebedd4decf33c700c1 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -20,6 +20,7 @@ #include "migration/blocker.h" #include "exec.h" #include "fd.h" +#include "file.h" #include "socket.h" #include "sysemu/runstate.h" #include "sysemu/sysemu.h" @@ -32,6 +33,7 @@ #include "savevm.h" #include "qemu-file-channel.h" #include "qemu-file.h" +#include "migration/cpr.h" #include "migration/vmstate.h" #include "block/block.h" #include "qapi/error.h" @@ -171,8 +173,9 @@ INITIALIZE_MIGRATE_CAPS_SET(check_caps_background_snapshot, static MigrationState *current_migration; static MigrationIncomingState *current_incoming; +static int migrate_enabled_modes = BIT(MIG_MODE_NORMAL); -static GSList *migration_blockers; +static GSList *migration_blockers[MIG_MODE__MAX]; static bool migration_object_check(MigrationState *ms, Error **errp); static int migration_maybe_pause(MigrationState *s, @@ -187,12 +190,17 @@ static gint page_request_addr_cmp(gconstpointer ap, gconstpointer bp) return (a > b) - (a < b); } -void migration_object_init(void) +void migration_object_early_init(void) { /* This can only be called once. */ assert(!current_migration); current_migration = MIGRATION_OBJ(object_new(TYPE_MIGRATION)); + cpr_init(); +} + +void migration_object_init(void) +{ /* * Init the migrate incoming object as well no matter whether * we'll use it or not. @@ -472,6 +480,8 @@ static void qemu_start_incoming_migration(const char *uri, Error **errp) exec_start_incoming_migration(p, errp); } else if (strstart(uri, "fd:", &p)) { fd_start_incoming_migration(p, errp); + } else if (strstart(uri, "file:", &p)) { + file_start_incoming_migration(p, errp); } else { error_setg(errp, "unknown migration protocol: %s", uri); } @@ -517,6 +527,12 @@ static void process_incoming_migration_bh(void *opaque) dirty_bitmap_mig_before_vm_start(); + if (migrate_mode() == MIG_MODE_CPR_EXEC) { + cpr_exec_complete_notify(); + /* After qemu live update, no need to call handler anymore */ + qemu_del_all_cpr_exec_complete_handler(); + } + if (!global_state_received() || global_state_get_runstate() == RUN_STATE_RUNNING) { if (autostart) { @@ -529,6 +545,10 @@ static void process_incoming_migration_bh(void *opaque) vm_start(); } else { runstate_set(global_state_get_runstate()); + if (runstate_check(RUN_STATE_SUSPENDED)) { + /* Force vm_start to be called later. 
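+ * The guest was suspended when its state was saved, so on the next wakeup
+ * request vm_start() must run (rather than only setting RUN_STATE_RUNNING)
+ * so that the vm change state handlers see the transition; see
+ * qemu_system_start_on_wakeup_request().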
*/ + qemu_system_start_on_wakeup_request(); + } } /* * This must happen after any state changes since as soon as an external @@ -539,6 +559,12 @@ static void process_incoming_migration_bh(void *opaque) MIGRATION_STATUS_COMPLETED); qemu_bh_delete(mis->bh); migration_incoming_state_destroy(); + + /* After qemu live update, set the migration mode to normal */ + if (migrate_mode() == MIG_MODE_CPR_EXEC) { + MigrationState *s = migrate_get_current(); + s->parameters.mode = MIG_MODE_NORMAL; + } } static void process_incoming_migration_co(void *opaque) @@ -848,6 +874,8 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp) /* TODO use QAPI_CLONE() instead of duplicating it inline */ params = g_malloc0(sizeof(*params)); + params->has_mode = true; + params->mode = s->parameters.mode; params->has_compress_level = true; params->compress_level = s->parameters.compress_level; params->has_compress_threads = true; @@ -866,6 +894,8 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp) params->cpu_throttle_increment = s->parameters.cpu_throttle_increment; params->has_cpu_throttle_tailslow = true; params->cpu_throttle_tailslow = s->parameters.cpu_throttle_tailslow; + params->has_cpr_exec_args = true; + params->cpr_exec_args = QAPI_CLONE(strList, s->parameters.cpr_exec_args); params->has_tls_creds = true; params->tls_creds = g_strdup(s->parameters.tls_creds); params->has_tls_hostname = true; @@ -1059,7 +1089,7 @@ static void populate_disk_info(MigrationInfo *info) static void fill_source_migration_info(MigrationInfo *info) { MigrationState *s = migrate_get_current(); - GSList *cur_blocker = migration_blockers; + GSList *cur_blocker = migration_blockers[migrate_mode()]; info->blocked_reasons = NULL; @@ -1253,6 +1283,11 @@ static bool migrate_caps_check(bool *cap_list, return false; } + if (cap_list[MIGRATION_CAPABILITY_X_COLO]) { + return migrate_add_blocker_always("x-colo is not compatible with cpr", + errp, MIG_MODE_CPR_EXEC, -1); + } + return true; } @@ -1504,6 +1539,10 @@ static void migrate_params_test_apply(MigrateSetParameters *params, /* TODO use QAPI_CLONE() instead of duplicating it inline */ + if (params->has_mode) { + dest->mode = params->mode; + } + if (params->has_compress_level) { dest->compress_level = params->compress_level; } @@ -1540,6 +1579,10 @@ static void migrate_params_test_apply(MigrateSetParameters *params, dest->cpu_throttle_tailslow = params->cpu_throttle_tailslow; } + if (params->has_cpr_exec_args) { + dest->cpr_exec_args = params->cpr_exec_args; + } + if (params->has_tls_creds) { assert(params->tls_creds->type == QTYPE_QSTRING); dest->tls_creds = params->tls_creds->u.s; @@ -1605,6 +1648,10 @@ static void migrate_params_apply(MigrateSetParameters *params, Error **errp) /* TODO use QAPI_CLONE() instead of duplicating it inline */ + if (params->has_mode) { + s->parameters.mode = params->mode; + } + if (params->has_compress_level) { s->parameters.compress_level = params->compress_level; } @@ -1641,6 +1688,12 @@ static void migrate_params_apply(MigrateSetParameters *params, Error **errp) s->parameters.cpu_throttle_tailslow = params->cpu_throttle_tailslow; } + if (params->has_cpr_exec_args) { + qapi_free_strList(s->parameters.cpr_exec_args); + s->parameters.cpr_exec_args = + QAPI_CLONE(strList, params->cpr_exec_args); + } + if (params->has_tls_creds) { g_free(s->parameters.tls_creds); assert(params->tls_creds->type == QTYPE_QSTRING); @@ -1826,6 +1879,8 @@ static void block_cleanup_parameters(MigrationState *s) static void migrate_fd_cleanup(MigrationState *s) { 
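+    /*
+     * Record whether the migration had already failed before running the
+     * cleanup notifiers, so that a notifier failure during cleanup marks
+     * the migration as FAILED and re-notifies exactly once.
+     */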
+ bool already_failed; + qemu_bh_delete(s->cleanup_bh); s->cleanup_bh = NULL; @@ -1866,8 +1921,17 @@ static void migrate_fd_cleanup(MigrationState *s) /* It is used on info migrate. We can't free it */ error_report_err(error_copy(s->error)); } - notifier_list_notify(&migration_state_notifiers, s); + + already_failed = migration_has_failed(s); + if (migration_call_notifiers(s)) { + if (!already_failed) { + migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED); + /* Notify again to recover from this late failure. */ + migration_call_notifiers(s); + } + } block_cleanup_parameters(s); + cpr_exec(); yank_unregister_instance(MIGRATION_YANK_INSTANCE); } @@ -1961,14 +2025,31 @@ static void migrate_fd_cancel(MigrationState *s) } } -void add_migration_state_change_notifier(Notifier *notify) +void migration_add_notifier(Notifier *notify, + void (*func)(Notifier *notifier, void *data)) { + notify->notify = func; notifier_list_add(&migration_state_notifiers, notify); } -void remove_migration_state_change_notifier(Notifier *notify) +void migration_remove_notifier(Notifier *notify) { - notifier_remove(notify); + if (notify->notify) { + notifier_remove(notify); + notify->notify = NULL; + } +} + +int migration_call_notifiers(MigrationState *s) +{ + notifier_list_notify(&migration_state_notifiers, s); + return (s->error != NULL); +} + +void migration_notifier_set_error(MigrationState *s, Error *err) +{ + migrate_set_error(s, err); + error_report_err(err); } bool migration_in_setup(MigrationState *s) @@ -2057,6 +2138,34 @@ bool migration_is_active(MigrationState *s) s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE); } +void migrate_enable_mode(MigMode mode) +{ + migrate_enabled_modes |= BIT(mode); +} + +bool migrate_mode_enabled(MigMode mode) +{ + return !!(migrate_enabled_modes & BIT(mode)); +} + +static bool migrate_modes_enabled(int modes) +{ + return (modes & migrate_enabled_modes) == modes; +} + +static int migrate_check_enabled(Error **errp) +{ + MigMode mode = migrate_mode(); + + if (!migrate_mode_enabled(mode)) { + error_setg(errp, "migrate mode is not enabled. " + "Use '-migrate-mode-enable %s'.", + MigMode_str(mode)); + return -1; + } + return 0; +} + void migrate_init(MigrationState *s) { /* @@ -2091,35 +2200,121 @@ void migrate_init(MigrationState *s) s->threshold_size = 0; } -int migrate_add_blocker_internal(Error *reason, Error **errp) +static int add_blockers(Error **reasonp, Error **errp, int modes) { + MigMode mode; + /* Snapshots are similar to migrations, so check RUN_STATE_SAVE_VM too. 
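+ * If the blocker is rejected here, the Error is consumed and *reasonp is
+ * cleared; if it is accepted, *reasonp stays valid until the caller removes
+ * it with migrate_del_blocker().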
*/ if (runstate_check(RUN_STATE_SAVE_VM) || !migration_is_idle()) { - error_propagate_prepend(errp, error_copy(reason), + error_propagate_prepend(errp, *reasonp, "disallowing migration blocker " "(migration/snapshot in progress) for: "); + *reasonp = NULL; return -EBUSY; } - migration_blockers = g_slist_prepend(migration_blockers, reason); + for (mode = 0; mode < MIG_MODE__MAX; mode++) { + if (modes & BIT(mode)) { + migration_blockers[mode] = g_slist_prepend(migration_blockers[mode], + *reasonp); + } + } return 0; } -int migrate_add_blocker(Error *reason, Error **errp) +static int check_blockers(Error **reasonp, Error **errp, int modes) { - if (only_migratable) { - error_propagate_prepend(errp, error_copy(reason), + ERRP_GUARD(); + + if (only_migratable && (modes & BIT(MIG_MODE_NORMAL))) { + error_propagate_prepend(errp, *reasonp, "disallowing migration blocker " "(--only-migratable) for: "); + *reasonp = NULL; + return -EACCES; + } + + if (only_cpr_capable && (modes & CPR_MODES) && + migrate_modes_enabled(modes & CPR_MODES)) { + error_propagate_prepend(errp, *reasonp, + "-only-cpr-capable specified, but: "); + *reasonp = NULL; return -EACCES; } - return migrate_add_blocker_internal(reason, errp); + return add_blockers(reasonp, errp, modes); } -void migrate_del_blocker(Error *reason) +int migrate_add_blocker(Error **reasonp, Error **errp) { - migration_blockers = g_slist_remove(migration_blockers, reason); + return migrate_add_blockers(reasonp, errp, MIG_MODE_ALL); +} + +int migrate_add_blocker_internal(Error **reasonp, Error **errp) +{ + int modes = BIT(MIG_MODE__MAX) - 1; + + return add_blockers(reasonp, errp, modes); +} + +static int get_modes(MigMode mode, va_list ap) +{ + int modes = 0; + + while (mode != -1 && mode != MIG_MODE_ALL) { + assert(mode >= MIG_MODE_NORMAL && mode < MIG_MODE__MAX); + modes |= BIT(mode); + mode = va_arg(ap, MigMode); + } + if (mode == MIG_MODE_ALL) { + modes = BIT(MIG_MODE__MAX) - 1; + } + return modes; +} + +int migrate_add_blockers(Error **reasonp, Error **errp, MigMode mode, ...) +{ + int modes; + va_list ap; + + va_start(ap, mode); + modes = get_modes(mode, ap); + va_end(ap); + + return check_blockers(reasonp, errp, modes); +} + +int migrate_add_blocker_always(const char *msg, Error **errp, MigMode mode, ...) 
+{ + int modes; + va_list ap; + Error *reason = NULL; + + va_start(ap, mode); + modes = get_modes(mode, ap); + va_end(ap); + + error_setg(&reason, "%s", msg); + return check_blockers(&reason, errp, modes); +} + +void migrate_del_blocker(Error **reasonp) +{ + if (*reasonp) { + migrate_remove_blocker(*reasonp); + error_free(*reasonp); + *reasonp = NULL; + } +} + +void migrate_remove_blocker(Error *reason) +{ + if (reason) { + for (MigMode mode = 0; mode < MIG_MODE__MAX; mode++) { + migration_blockers[mode] = g_slist_remove(migration_blockers[mode], + reason); + } + } } void qmp_migrate_incoming(const char *uri, Error **errp) @@ -2127,6 +2322,9 @@ void qmp_migrate_incoming(const char *uri, Error **errp) Error *local_err = NULL; static bool once = true; + if (migrate_check_enabled(errp)) { + return; + } if (!once) { error_setg(errp, "The incoming migration has already been started"); return; @@ -2219,12 +2417,14 @@ void qmp_migrate_pause(Error **errp) bool migration_is_blocked(Error **errp) { + GSList *blockers = migration_blockers[migrate_mode()]; + if (qemu_savevm_state_blocked(errp)) { return true; } - if (migration_blockers) { - error_propagate(errp, error_copy(migration_blockers->data)); + if (blockers) { + error_propagate(errp, error_copy(blockers->data)); return true; } @@ -2282,6 +2482,16 @@ static bool migrate_prepare(MigrationState *s, bool blk, bool blk_inc, return false; } + if (migrate_check_enabled(errp)) { + return false; + } + + if (migrate_mode_of(s) == MIG_MODE_CPR_EXEC && + !s->parameters.has_cpr_exec_args) { + error_setg(errp, "cpr-exec mode requires setting cpr-exec-args"); + return false; + } + if (migration_is_blocked(errp)) { return false; } @@ -2353,6 +2563,8 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk, exec_start_outgoing_migration(s, p, &local_err); } else if (strstart(uri, "fd:", &p)) { fd_start_outgoing_migration(s, p, &local_err); + } else if (strstart(uri, "file:", &p)) { + file_start_outgoing_migration(s, p, &local_err); } else { if (!(has_resume && resume)) { yank_unregister_instance(MIGRATION_YANK_INSTANCE); @@ -2586,6 +2798,20 @@ int migrate_multifd_zstd_level(void) return s->parameters.multifd_zstd_level; } +MigMode migrate_mode(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->parameters.mode; +} + +MigMode migrate_mode_of(MigrationState *s) +{ + return s->parameters.mode; +} + int migrate_use_xbzrle(void) { MigrationState *s; @@ -2997,7 +3223,6 @@ static int postcopy_start(MigrationState *ms) qemu_mutex_lock_iothread(); trace_postcopy_start_set_run(); - qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL); global_state_store(); ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE); if (ret < 0) { @@ -3105,7 +3330,9 @@ static int postcopy_start(MigrationState *ms) * spice needs to trigger a transition now */ ms->postcopy_after_devices = true; - notifier_list_notify(&migration_state_notifiers, ms); + if (migration_call_notifiers(ms)) { + goto fail; + } ms->downtime = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - time_at_stop; @@ -3209,7 +3436,6 @@ static void migration_completion(MigrationState *s) if (s->state == MIGRATION_STATUS_ACTIVE) { qemu_mutex_lock_iothread(); s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); - qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL); s->vm_was_running = runstate_is_running(); ret = global_state_store(); @@ -3986,11 +4212,6 @@ static void *bg_migration_thread(void *opaque) qemu_mutex_lock_iothread(); - /* - * If VM is currently in suspended state, then, to make a 
valid runstate - * transition in vm_stop_force_state() we need to wakeup it up. - */ - qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL); s->vm_was_running = runstate_is_running(); if (global_state_store()) { @@ -4116,7 +4337,11 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) rate_limit = s->parameters.max_bandwidth / XFER_LIMIT_RATIO; /* Notify before starting migration thread */ - notifier_list_notify(&migration_state_notifiers, s); + if (migration_call_notifiers(s)) { + migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED); + migrate_fd_cleanup(s); + return; + } } qemu_file_set_rate_limit(s->to_dst_file, rate_limit); @@ -4197,6 +4422,9 @@ static Property migration_properties[] = { clear_bitmap_shift, CLEAR_BITMAP_SHIFT_DEFAULT), /* Migration parameters */ + DEFINE_PROP_MIG_MODE("mode", MigrationState, + parameters.mode, + MIG_MODE_NORMAL), DEFINE_PROP_UINT8("x-compress-level", MigrationState, parameters.compress_level, DEFAULT_MIGRATE_COMPRESS_LEVEL), @@ -4222,6 +4450,8 @@ static Property migration_properties[] = { DEFAULT_MIGRATE_CPU_THROTTLE_INCREMENT), DEFINE_PROP_BOOL("x-cpu-throttle-tailslow", MigrationState, parameters.cpu_throttle_tailslow, false), + DEFINE_PROP_STRLIST("cpr-exec-args", MigrationState, + parameters.cpr_exec_args), DEFINE_PROP_SIZE("x-max-bandwidth", MigrationState, parameters.max_bandwidth, MAX_THROTTLE), DEFINE_PROP_UINT64("x-downtime-limit", MigrationState, @@ -4324,11 +4554,13 @@ static void migration_instance_init(Object *obj) params->tls_creds = g_strdup(""); /* Set has_* up only for parameter checks */ + params->has_mode = true; params->has_compress_level = true; params->has_compress_threads = true; params->has_decompress_threads = true; params->has_compress_method = true; params->has_throttle_trigger_threshold = true; + params->has_cpr_exec_args = true; params->has_cpu_throttle_initial = true; params->has_cpu_throttle_increment = true; params->has_cpu_throttle_tailslow = true; diff --git a/migration/qemu-file-channel.c b/migration/qemu-file-channel.c index bb5a5752df2e5d5ac24bebd12617f0a56d1c4d8b..290b20f79404ecc460718f2f3a067275f56ddc38 100644 --- a/migration/qemu-file-channel.c +++ b/migration/qemu-file-channel.c @@ -27,8 +27,10 @@ #include "qemu-file.h" #include "io/channel-socket.h" #include "io/channel-tls.h" +#include "io/channel-file.h" #include "qemu/iov.h" #include "qemu/yank.h" +#include "qapi/error.h" #include "yank_functions.h" @@ -192,3 +194,13 @@ QEMUFile *qemu_fopen_channel_output(QIOChannel *ioc) object_ref(OBJECT(ioc)); return qemu_fopen_ops(ioc, &channel_output_ops, true); } + +QEMUFile *qemu_fopen_fd(int fd, bool writable, const char *name) +{ + g_autoptr(QIOChannelFile) fioc = qio_channel_file_new_fd(fd); + QIOChannel *ioc = QIO_CHANNEL(fioc); + QEMUFile *f = writable ? 
qemu_fopen_channel_output(ioc) : + qemu_fopen_channel_input(ioc); + qio_channel_set_name(ioc, name); + return f; +} diff --git a/migration/qemu-file-channel.h b/migration/qemu-file-channel.h index 0028a09eb61f896a0067fbb5474c66443685a472..1b3f94da316d53c1674ce3bfdb0744677d09dfc1 100644 --- a/migration/qemu-file-channel.h +++ b/migration/qemu-file-channel.h @@ -29,4 +29,7 @@ QEMUFile *qemu_fopen_channel_input(QIOChannel *ioc); QEMUFile *qemu_fopen_channel_output(QIOChannel *ioc); + +QEMUFile *qemu_fopen_fd(int fd, bool writable, const char *name); + #endif diff --git a/migration/ram.c b/migration/ram.c index 12b8c653d87f6855a383e762fafeea600e150809..32b7507da0669efc1a008c82681d73bf4dde17a5 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -201,7 +201,9 @@ out: bool ramblock_is_ignored(RAMBlock *block) { return !qemu_ram_is_migratable(block) || - (migrate_ignore_shared() && qemu_ram_is_shared(block)); + migrate_mode() == MIG_MODE_CPR_EXEC || + (migrate_ignore_shared() && qemu_ram_is_shared(block) && + ramblock_is_named_file(block)); } #undef RAMBLOCK_FOREACH @@ -3193,7 +3195,8 @@ static void ram_init_bitmaps(RAMState *rs) WITH_RCU_READ_LOCK_GUARD() { ram_list_init_bitmaps(); /* We don't use dirty log with background snapshots */ - if (!migrate_background_snapshot()) { + if (!migrate_background_snapshot() && + migrate_mode() == MIG_MODE_NORMAL) { memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION); migration_bitmap_sync_precopy(rs); } @@ -4395,13 +4398,16 @@ static int ram_load_precopy(QEMUFile *f) } if (migrate_ignore_shared()) { hwaddr addr = qemu_get_be64(f); - if (ramblock_is_ignored(block) && - block->mr->addr != addr) { - error_report("Mismatched GPAs for block %s " - "%" PRId64 "!= %" PRId64, - id, (uint64_t)addr, - (uint64_t)block->mr->addr); - ret = -EINVAL; + if (ramblock_is_ignored(block)) { + if (!block->mr->has_addr) { + memory_region_set_address_only(block->mr, addr); + } else if (block->mr->addr != addr) { + error_report("Mismatched GPAs for block %s " + "%" PRId64 "!= %" PRId64, + id, (uint64_t)addr, + (uint64_t)block->mr->addr); + ret = -EINVAL; + } } } ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG, diff --git a/migration/trace-events b/migration/trace-events index b48d873b8a0313a855a43337a3d3aa2513f47fca..47d6bc51e286cdd5fa47403f23cef259abea93a7 100644 --- a/migration/trace-events +++ b/migration/trace-events @@ -291,6 +291,10 @@ migration_exec_incoming(const char *cmd) "cmd=%s" migration_fd_outgoing(int fd) "fd=%d" migration_fd_incoming(int fd) "fd=%d" +# file.c +migration_file_outgoing(const char *filename) "filename=%s" +migration_file_incoming(const char *filename) "filename=%s" + # socket.c migration_socket_incoming_accepted(void) "" migration_socket_outgoing_connected(const char *hostname) "hostname=%s" @@ -312,6 +316,14 @@ colo_receive_message(const char *msg) "Receive '%s' message" # colo-failover.c colo_failover_set_state(const char *new_state) "new state %s" +# cpr-state.c +cpr_save_fd(const char *name, int id, int fd) "%s, id %d, fd %d" +cpr_delete_fd(const char *name, int id) "%s, id %d" +cpr_find_fd(const char *name, int id, int fd) "%s, id %d returns %d" +cpr_save_memfd(const char *name, size_t len, size_t maxlen, uint64_t align) "%s, len %lu, maxlen %lu, align %lu" +cpr_delete_memfd(const char *name) "%s" +cpr_find_memfd(const char *name, size_t len, size_t maxlen, uint64_t align) "%s, len %lu, maxlen %lu, align %lu" + # block-dirty-bitmap.c send_bitmap_header_enter(void) "" send_bitmap_bits(uint32_t flags, uint64_t start_sector, uint32_t nr_sectors, 
uint64_t data_size) "flags: 0x%x, start_sector: %" PRIu64 ", nr_sectors: %" PRIu32 ", data_size: %" PRIu64 diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c index 9570011232e318e64d08dc50c79f888b23ab40d9..f60ce3c99ed9c3f1ab62b3785c5ee502a3ec20dc 100644 --- a/monitor/hmp-cmds.c +++ b/monitor/hmp-cmds.c @@ -42,6 +42,7 @@ #include "qapi/qapi-commands-run-state.h" #include "qapi/qapi-commands-tpm.h" #include "qapi/qapi-commands-ui.h" +#include "qapi/util.h" #include "qapi/qapi-visit-net.h" #include "qapi/qapi-visit-migration.h" #include "qapi/qmp/qdict.h" @@ -70,32 +71,6 @@ bool hmp_handle_error(Monitor *mon, Error *err) return false; } -/* - * Produce a strList from a comma separated list. - * A NULL or empty input string return NULL. - */ -static strList *strList_from_comma_list(const char *in) -{ - strList *res = NULL; - strList **tail = &res; - - while (in && in[0]) { - char *comma = strchr(in, ','); - char *value; - - if (comma) { - value = g_strndup(in, comma - in); - in = comma + 1; /* skip the , */ - } else { - value = g_strdup(in); - in = NULL; - } - QAPI_LIST_APPEND(tail, value); - } - - return res; -} - void hmp_info_name(Monitor *mon, const QDict *qdict) { NameInfo *info; @@ -394,6 +369,18 @@ void hmp_info_migrate_capabilities(Monitor *mon, const QDict *qdict) qapi_free_MigrationCapabilityStatusList(caps); } +static void monitor_print_cpr_exec_args(Monitor *mon, strList *args) +{ + monitor_printf(mon, "%s:", + MigrationParameter_str(MIGRATION_PARAMETER_CPR_EXEC_ARGS)); + + while (args) { + monitor_printf(mon, " %s", args->value); + args = args->next; + } + monitor_printf(mon, "\n"); +} + void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict) { MigrationParameters *params; @@ -413,6 +400,10 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict) monitor_printf(mon, "%s: %" PRIu64 " ms\n", MigrationParameter_str(MIGRATION_PARAMETER_ANNOUNCE_STEP), params->announce_step); + assert(params->has_mode); + monitor_printf(mon, "%s: %s\n", + MigrationParameter_str(MIGRATION_PARAMETER_MODE), + qapi_enum_lookup(&MigMode_lookup, params->mode)); assert(params->has_compress_level); monitor_printf(mon, "%s: %u\n", MigrationParameter_str(MIGRATION_PARAMETER_COMPRESS_LEVEL), @@ -452,6 +443,8 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict) monitor_printf(mon, "%s: %u\n", MigrationParameter_str(MIGRATION_PARAMETER_MAX_CPU_THROTTLE), params->max_cpu_throttle); + assert(params->has_cpr_exec_args); + monitor_print_cpr_exec_args(mon, params->cpr_exec_args); assert(params->has_tls_creds); monitor_printf(mon, "%s: '%s'\n", MigrationParameter_str(MIGRATION_PARAMETER_TLS_CREDS), @@ -1106,7 +1099,7 @@ void hmp_announce_self(Monitor *mon, const QDict *qdict) migrate_announce_params()); qapi_free_strList(params->interfaces); - params->interfaces = strList_from_comma_list(interfaces_str); + params->interfaces = strList_from_string(interfaces_str, ','); params->has_interfaces = params->interfaces != NULL; params->id = g_strdup(id); params->has_id = !!params->id; @@ -1197,6 +1190,7 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) uint64_t cache_size; CompressMethod compress_method; Error *err = NULL; + g_autofree char *str = NULL; int val, ret; val = qapi_enum_parse(&MigrationParameter_lookup, param, -1, &err); @@ -1205,6 +1199,10 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) } switch (val) { + case MIGRATION_PARAMETER_MODE: + p->has_mode = true; + visit_type_MigMode(v, param, &p->mode, &err); + break; case 
MIGRATION_PARAMETER_COMPRESS_LEVEL: p->has_compress_level = true; visit_type_uint8(v, param, &p->compress_level, &err); @@ -1249,6 +1247,11 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) p->has_max_cpu_throttle = true; visit_type_uint8(v, param, &p->max_cpu_throttle, &err); break; + case MIGRATION_PARAMETER_CPR_EXEC_ARGS: + p->has_cpr_exec_args = true; + visit_type_str(v, param, &str, &err); + p->cpr_exec_args = strList_from_string(str, ' '); + break; case MIGRATION_PARAMETER_TLS_CREDS: p->has_tls_creds = true; p->tls_creds = g_new0(StrOrNull, 1); diff --git a/monitor/qmp-cmds-control.c b/monitor/qmp-cmds-control.c index 6e581713a3694dbaddcb042772026e4dc7022cce..b858178e95f8b14c3c766f5875f4e790081e2ca3 100644 --- a/monitor/qmp-cmds-control.c +++ b/monitor/qmp-cmds-control.c @@ -24,6 +24,7 @@ #include "qemu/osdep.h" +#include "migration/cpr-state.h" #include "monitor-internal.h" #include "qemu-version.h" #include "qapi/compat-policy.h" @@ -91,6 +92,22 @@ void qmp_qmp_capabilities(bool has_enable, QMPCapabilityList *enable, } mon->commands = &qmp_commands; + + if (qemu_chr_cpr_support(cur_mon->chr.chr)) { + int i, fd; + char *name; + + assert(sizeof(mon->capab) < sizeof(fd)); + name = g_strdup_printf("qmp-%s", cur_mon->chr.chr->label); + memset(&fd, 0, sizeof(fd)); + for (i = 0; i < QMP_CAPABILITY__MAX; i++) { + if (mon->capab[i]) + fd |= 1 << i; + } + /* MONITOR_CAPAB means mon->capab */ + cpr_resave_fd(name, MONITOR_CAPAB, fd); + g_free(name); + } } VersionInfo *qmp_query_version(Error **errp) diff --git a/monitor/qmp.c b/monitor/qmp.c index 4d1ac66785d673a28521e3e30bfb118dff6191db..a7568a4c69df62f8643b7e58c994a5be490824a3 100644 --- a/monitor/qmp.c +++ b/monitor/qmp.c @@ -31,6 +31,8 @@ #include "qapi/qmp/qdict.h" #include "qapi/qmp/qjson.h" #include "qapi/qmp/qlist.h" +#include "migration/misc.h" +#include "migration/cpr-state.h" #include "trace.h" struct QMPRequest { @@ -57,6 +59,30 @@ static void monitor_qmp_caps_reset(MonitorQMP *mon) memset(mon->capab_offered, 0, sizeof(mon->capab_offered)); memset(mon->capab, 0, sizeof(mon->capab)); mon->capab_offered[QMP_CAPABILITY_OOB] = mon->common.use_io_thread; + + /* + * The normal process is charmonitor Monitor commands is setted + * at qmp_qmp_capabilities() when interacting with libvirtd, + * but libvirtd do not support qemu live update reconnect yet, + * on cpr exec mode ,we will lose charmonitor Monitor commands + * setting, and a permission verificcation error will occurre on + * the connection between qemu and libvirtd. 
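+ * In other words: the capabilities negotiated by qmp_qmp_capabilities()
+ * in the old process are saved in cpr state (see qmp-cmds-control.c) and
+ * restored here, so the monitor connection keeps working after exec even
+ * though the management layer does not renegotiate them.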
+ * */ + if (migrate_mode() == MIG_MODE_CPR_EXEC) { + char *name; + int i, fd; + + name = g_strdup_printf("qmp-%s", mon->common.chr.chr->label); + fd = cpr_find_fd(name, MONITOR_CAPAB); + if (fd >= 0) { + mon->commands = &qmp_commands; + for (i = 0; i < QMP_CAPABILITY__MAX; i++) { + if (fd & (1 << i)) + mon->capab[i] = true; + } + } + g_free(name); + } } static void qmp_request_free(QMPRequest *req) diff --git a/python/qemu/machine/machine.py b/python/qemu/machine/machine.py index 67ab06ca2b6daa531b7c0ad9f7c241c1ca5f6787..66728cceb4b295c5713de03ed6909e85104b824d 100644 --- a/python/qemu/machine/machine.py +++ b/python/qemu/machine/machine.py @@ -308,6 +308,11 @@ def args(self) -> List[str]: """Returns the list of arguments given to the QEMU binary.""" return self._args + @property + def full_args(self) -> List[str]: + """Returns the full list of arguments used to launch QEMU.""" + return list(self._qemu_full_args) + def _pre_launch(self) -> None: if self._console_set: self._remove_files.append(self._console_address) @@ -458,6 +463,15 @@ def _close_qmp_connection(self) -> None: finally: self._qmp_connection = None + def reopen_qmp_connection(self): + self._close_qmp_connection() + self._qmp_connection = QEMUMonitorProtocol( + self._monitor_address, + server=True, + nickname=self._name + ) + self._qmp.accept(self._qmp_timer) + def _early_cleanup(self) -> None: """ Perform any cleanup that needs to happen before the VM exits. diff --git a/qapi/char.json b/qapi/char.json index f5133a5eeb372de126f12e267ec78cfdd1b82b96..b50c2172e7f6050a7aad803d5d615d16dc4db788 100644 --- a/qapi/char.json +++ b/qapi/char.json @@ -204,12 +204,17 @@ # @logfile: The name of a logfile to save output # @logappend: true to append instead of truncate # (default to false to truncate) +# @reopen-on-cpr: if true, close device's fd on cpr-save and reopen it after +# cpr-exec. Set this to allow CPR on a device that does not +# support QEMU_CHAR_FEATURE_CPR. defaults to false. +# since 6.2. # # Since: 2.6 ## { 'struct': 'ChardevCommon', 'data': { '*logfile': 'str', - '*logappend': 'bool' } } + '*logappend': 'bool', + '*reopen-on-cpr': 'bool' } } ## # @ChardevFile: diff --git a/qapi/migration.json b/qapi/migration.json index e965f4329b4f34dd755aaa4ebff93694ad6b5414..40fdf275d0b878477ed0b8ec23ff1f99fdb86dac 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -541,6 +541,27 @@ 'data': [ 'none', 'zlib', { 'name': 'zstd', 'if': 'CONFIG_ZSTD' } ] } +## +# @MigMode: +# +# @normal: the original form of migration. +# +# @cpr-exec: The migrate command saves state to a file, directly exec's a +# new version of qemu on the same host, replacing the original +# process while retaining its PID, and loads the file via the +# migrate-incoming command. The caller must specify a migration URI +# that writes to and reads from a file. Guest RAM must be backed by +# a memory backend with share=on, and cannot be memory-backend-ram. +# Guest RAM is not copied, and storage blocks are not migrated, so +# all capabilities related to page and block copy must be disabled, +# and all related parameters are ignored. Arguments for the new +# qemu process are taken from the @cpr-exec-args parameter. +# +# Since: 6.2 +## +{ 'enum': 'MigMode', + 'data': [ 'normal', 'cpr-exec' ] } + ## # @BitmapMigrationBitmapAliasTransform: # @@ -615,6 +636,9 @@ # # Migration parameters enumeration # +# @mode: Migration mode. See description in @MigMode. Default is 'normal'. 
+# (Since 6.2) +# # @announce-initial: Initial delay (in milliseconds) before sending the first # announce (Since 4.0) # @@ -676,6 +700,12 @@ # at tail stage. # The default value is false. (Since 5.1) # +# @cpr-exec-args: arguments passed to new qemu for cpr-exec mode. The first +# argument should be the path of a new qemu binary, or a prefix +# command that exec's the new qemu binary. The arguments must +# match those used to initially start qemu, plus the -incoming +# option. (Since 6.2) +# # @tls-creds: ID of the 'tls-creds' object that provides credentials for # establishing a TLS connection over the migration data channel. # On the outgoing side of the migration, the credentials must @@ -772,12 +802,14 @@ # Since: 2.4 ## { 'enum': 'MigrationParameter', - 'data': ['announce-initial', 'announce-max', + 'data': ['mode', + 'announce-initial', 'announce-max', 'announce-rounds', 'announce-step', 'compress-level', 'compress-threads', 'decompress-threads', 'compress-wait-thread', 'compress-method', 'throttle-trigger-threshold', 'cpu-throttle-initial', 'cpu-throttle-increment', 'cpu-throttle-tailslow', + 'cpr-exec-args', 'tls-creds', 'tls-hostname', 'tls-authz', 'max-bandwidth', 'downtime-limit', { 'name': 'x-checkpoint-delay', 'features': [ 'unstable' ] }, @@ -791,6 +823,9 @@ ## # @MigrateSetParameters: # +# @mode: Migration mode. See description in @MigMode. Default is 'normal'. +# (Since 6.2) +# # @announce-initial: Initial delay (in milliseconds) before sending the first # announce (Since 4.0) # @@ -844,6 +879,9 @@ # at tail stage. # The default value is false. (Since 5.1) # +# @cpr-exec-args: Arguments passed to new qemu for cpr-exec mode. +# See description in @MigrationParameter. (Since 6.2) +# # @tls-creds: ID of the 'tls-creds' object that provides credentials # for establishing a TLS connection over the migration data # channel. On the outgoing side of the migration, the credentials @@ -942,7 +980,8 @@ # TODO either fuse back into MigrationParameters, or make # MigrationParameters members mandatory { 'struct': 'MigrateSetParameters', - 'data': { '*announce-initial': 'size', + 'data': { '*mode': 'MigMode', + '*announce-initial': 'size', '*announce-max': 'size', '*announce-rounds': 'size', '*announce-step': 'size', @@ -955,6 +994,7 @@ '*cpu-throttle-initial': 'uint8', '*cpu-throttle-increment': 'uint8', '*cpu-throttle-tailslow': 'bool', + '*cpr-exec-args': [ 'str' ], '*tls-creds': 'StrOrNull', '*tls-hostname': 'StrOrNull', '*tls-authz': 'StrOrNull', @@ -993,6 +1033,9 @@ # # The optional members aren't actually optional. # +# @mode: Migration mode. See description in @MigMode. Default is 'normal'. +# (Since 6.2) +# # @announce-initial: Initial delay (in milliseconds) before sending the # first announce (Since 4.0) # @@ -1046,6 +1089,9 @@ # at tail stage. # The default value is false. (Since 5.1) # +# @cpr-exec-args: Arguments passed to new qemu for cpr-exec mode. +# See description in @MigrationParameter. (Since 6.2) +# # @tls-creds: ID of the 'tls-creds' object that provides credentials # for establishing a TLS connection over the migration data # channel. 
On the outgoing side of the migration, the credentials @@ -1144,7 +1190,8 @@ # Since: 2.4 ## { 'struct': 'MigrationParameters', - 'data': { '*announce-initial': 'size', + 'data': { '*mode': 'MigMode', + '*announce-initial': 'size', '*announce-max': 'size', '*announce-rounds': 'size', '*announce-step': 'size', @@ -1157,6 +1204,7 @@ '*cpu-throttle-initial': 'uint8', '*cpu-throttle-increment': 'uint8', '*cpu-throttle-tailslow': 'bool', + '*cpr-exec-args': [ 'str' ], '*tls-creds': 'str', '*tls-hostname': 'str', '*tls-authz': 'str', diff --git a/qapi/qapi-util.c b/qapi/qapi-util.c index fda7044539040e7683b4735a302f51b1cbad8531..b8e19385ffc251c92503e0abf7aaf4ed7d17e6c7 100644 --- a/qapi/qapi-util.c +++ b/qapi/qapi-util.c @@ -15,6 +15,7 @@ #include "qapi/error.h" #include "qemu/ctype.h" #include "qapi/qmp/qerror.h" +#include "qapi/qapi-builtin-types.h" CompatPolicy compat_policy; @@ -152,3 +153,39 @@ int parse_qapi_name(const char *str, bool complete) } return p - str; } + +GStrv strv_from_strList(const strList *args) +{ + const strList *arg; + int i = 0; + GStrv argv = g_new(char *, QAPI_LIST_LENGTH(args) + 1); + + for (arg = args; arg != NULL; arg = arg->next) { + argv[i++] = g_strdup(arg->value); + } + argv[i] = NULL; + + return argv; +} + +strList *strList_from_string(const char *in, char delim) +{ + strList *res = NULL; + strList **tail = &res; + + while (in && in[0]) { + char *next = strchr(in, delim); + char *value; + + if (next) { + value = g_strndup(in, next - in); + in = next + 1; /* skip the delim */ + } else { + value = g_strdup(in); + in = NULL; + } + QAPI_LIST_APPEND(tail, value); + } + + return res; +} diff --git a/qemu-options.hx b/qemu-options.hx index e329ec58ca0c80669facf1559feb490fc17a4423..eeaea2347ee84fa9f75056d0d761bd9c20fce2b3 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -3265,43 +3265,57 @@ DEFHEADING(Character device options:) DEF("chardev", HAS_ARG, QEMU_OPTION_chardev, "-chardev help\n" - "-chardev null,id=id[,mux=on|off][,logfile=PATH][,logappend=on|off]\n" + "-chardev null,id=id[,mux=on|off][,logfile=PATH][,logappend=on|off][,reopen-on-cpr=on|off]\n" "-chardev socket,id=id[,host=host],port=port[,to=to][,ipv4=on|off][,ipv6=on|off][,nodelay=on|off]\n" " [,server=on|off][,wait=on|off][,telnet=on|off][,websocket=on|off][,reconnect=seconds][,mux=on|off]\n" - " [,logfile=PATH][,logappend=on|off][,tls-creds=ID][,tls-authz=ID] (tcp)\n" + " [,logfile=PATH][,logappend=on|off][,tls-creds=ID][,tls-authz=ID][,reopen-on-cpr=on|off] (tcp)\n" "-chardev socket,id=id,path=path[,server=on|off][,wait=on|off][,telnet=on|off][,websocket=on|off][,reconnect=seconds]\n" - " [,mux=on|off][,logfile=PATH][,logappend=on|off][,abstract=on|off][,tight=on|off] (unix)\n" + " [,mux=on|off][,logfile=PATH][,logappend=on|off][,abstract=on|off][,tight=on|off][,reopen-on-cpr=on|off] (unix)\n" "-chardev udp,id=id[,host=host],port=port[,localaddr=localaddr]\n" " [,localport=localport][,ipv4=on|off][,ipv6=on|off][,mux=on|off]\n" - " [,logfile=PATH][,logappend=on|off]\n" + " [,logfile=PATH][,logappend=on|off][,reopen-on-cpr=on|off]\n" "-chardev msmouse,id=id[,mux=on|off][,logfile=PATH][,logappend=on|off]\n" + " [,reopen-on-cpr=on|off]\n" "-chardev vc,id=id[[,width=width][,height=height]][[,cols=cols][,rows=rows]]\n" " [,mux=on|off][,logfile=PATH][,logappend=on|off]\n" + " [,reopen-on-cpr=on|off]\n" "-chardev ringbuf,id=id[,size=size][,logfile=PATH][,logappend=on|off]\n" + " [,reopen-on-cpr=on|off]\n" "-chardev file,id=id,path=path[,mux=on|off][,logfile=PATH][,logappend=on|off]\n" + " 
[,reopen-on-cpr=on|off]\n" "-chardev pipe,id=id,path=path[,mux=on|off][,logfile=PATH][,logappend=on|off]\n" + " [,reopen-on-cpr=on|off]\n" #ifdef _WIN32 "-chardev console,id=id[,mux=on|off][,logfile=PATH][,logappend=on|off]\n" "-chardev serial,id=id,path=path[,mux=on|off][,logfile=PATH][,logappend=on|off]\n" #else "-chardev pty,id=id[,mux=on|off][,logfile=PATH][,logappend=on|off]\n" + " [,reopen-on-cpr=on|off]\n" "-chardev stdio,id=id[,mux=on|off][,signal=on|off][,logfile=PATH][,logappend=on|off]\n" + " [,reopen-on-cpr=on|off]\n" #endif #ifdef CONFIG_BRLAPI "-chardev braille,id=id[,mux=on|off][,logfile=PATH][,logappend=on|off]\n" + " [,reopen-on-cpr=on|off]\n" #endif #if defined(__linux__) || defined(__sun__) || defined(__FreeBSD__) \ || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) "-chardev serial,id=id,path=path[,mux=on|off][,logfile=PATH][,logappend=on|off]\n" + " [,reopen-on-cpr=on|off]\n" "-chardev tty,id=id,path=path[,mux=on|off][,logfile=PATH][,logappend=on|off]\n" + " [,reopen-on-cpr=on|off]\n" #endif #if defined(__linux__) || defined(__FreeBSD__) || defined(__DragonFly__) "-chardev parallel,id=id,path=path[,mux=on|off][,logfile=PATH][,logappend=on|off]\n" + " [,reopen-on-cpr=on|off]\n" "-chardev parport,id=id,path=path[,mux=on|off][,logfile=PATH][,logappend=on|off]\n" + " [,reopen-on-cpr=on|off]\n" #endif #if defined(CONFIG_SPICE) "-chardev spicevmc,id=id,name=name[,debug=debug][,logfile=PATH][,logappend=on|off]\n" + " [,reopen-on-cpr=on|off]\n" "-chardev spiceport,id=id,name=name[,debug=debug][,logfile=PATH][,logappend=on|off]\n" + " [,reopen-on-cpr=on|off]\n" #endif , QEMU_ARCH_ALL ) @@ -3376,6 +3390,10 @@ The general form of a character device option is: ``logappend`` option controls whether the log file will be truncated or appended to when opened. + Every backend supports the ``reopen-on-cpr`` option. If on, the + devices's descriptor is closed during cpr save, and reopened after exec. + This is useful for devices that do not support cpr. + The available backends are: ``-chardev null,id=id`` @@ -4418,6 +4436,7 @@ DEF("incoming", HAS_ARG, QEMU_OPTION_incoming, \ " prepare for incoming migration, listen on\n" \ " specified protocol and socket address\n" \ "-incoming fd:fd\n" \ + "-incoming file:filename\n" \ "-incoming exec:cmdline\n" \ " accept incoming migration on given file descriptor\n" \ " or from given external command\n" \ @@ -4434,7 +4453,10 @@ SRST Prepare for incoming migration, listen on a given unix socket. ``-incoming fd:fd`` - Accept incoming migration from a given filedescriptor. + Accept incoming migration from a given file descriptor. + +``-incoming file:filename`` + Accept incoming migration from a given file. ``-incoming exec:cmdline`` Accept incoming migration as an output from specified external @@ -4454,6 +4476,24 @@ SRST an unmigratable state. ERST +DEF("migrate-mode-enable", HAS_ARG, QEMU_OPTION_migrate_mode_enable, \ + "-migrate-mode-enable enable the migration mode.\n", + QEMU_ARCH_ALL) +SRST +``-migrate-mode-enable `` + Enable the specified migrate mode. May be supplied + multiple times, once per mode. This is a pre-requisite for performing a + migration using any mode except 'normal'. +ERST + +DEF("only-cpr-capable", 0, QEMU_OPTION_only_cpr_capable, \ + "-only-cpr-capable allow only cpr capable devices\n", QEMU_ARCH_ALL) +SRST +``-only-cpr-capable`` + Only allow cpr capable devices, which guarantees that cpr will not + fail due to a cpr blocker. 
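+
+    For example, a guest intended for live update via cpr-exec might be
+    started with both options (illustrative command line; the machine type,
+    memory backend and sizes are placeholders)::
+
+      qemu-system-x86_64 -machine q35,memory-backend=ram0 -m 4G \
+          -object memory-backend-memfd,id=ram0,size=4G,share=on \
+          -migrate-mode-enable cpr-exec -only-cpr-capable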
+ERST + DEF("nodefaults", 0, QEMU_OPTION_nodefaults, \ "-nodefaults don't create default devices\n", QEMU_ARCH_ALL) SRST diff --git a/replay/replay.c b/replay/replay.c index 6df2abc18c7ff3e0a757ac0e5c01a6bb3ec32f6a..901782ac8b67f42b323935c26667796e70e7ecd3 100644 --- a/replay/replay.c +++ b/replay/replay.c @@ -19,6 +19,7 @@ #include "qemu/option.h" #include "sysemu/cpus.h" #include "qemu/error-report.h" +#include "migration/blocker.h" /* Current version of the replay mechanism. Increase it when file format changes. */ @@ -245,6 +246,9 @@ static void replay_enable(const char *fname, int mode) const char *fmode = NULL; assert(!replay_file); + migrate_add_blocker_always("replay is not compatible with cpr", + &error_fatal, MIG_MODE_CPR_EXEC, -1); + switch (mode) { case REPLAY_MODE_RECORD: fmode = "wb"; diff --git a/softmmu/memory.c b/softmmu/memory.c index 7340e19ff5e24b6776ddb5737443de18019a8414..c0d7be9e9d82a54e492cc62f0c1d5069e15385f3 100644 --- a/softmmu/memory.c +++ b/softmmu/memory.c @@ -33,6 +33,7 @@ #include "qemu/accel.h" #include "hw/boards.h" #include "migration/vmstate.h" +#include "migration/cpr-state.h" //#define DEBUG_UNASSIGNED @@ -1593,6 +1594,7 @@ void memory_region_init_ram_from_file(MemoryRegion *mr, bool readonly, Error **errp) { + int fd; Error *err = NULL; memory_region_init(mr, owner, name, size); mr->ram = true; @@ -1600,8 +1602,21 @@ void memory_region_init_ram_from_file(MemoryRegion *mr, mr->terminates = true; mr->destructor = memory_region_destructor_ram; mr->align = align; - mr->ram_block = qemu_ram_alloc_from_file(size, mr, ram_flags, path, - readonly, &err); + + /* make sure mr has a valid name */ + memory_region_name(mr); + fd = cpr_find_fd(mr->name, 0); + if (fd < 0) { + mr->ram_block = qemu_ram_alloc_from_file(size, mr, ram_flags, path, + readonly, &err); + if (mr->ram_block) { + fd = mr->ram_block->fd; + cpr_save_fd(mr->name, 0, fd); + } + } else { + mr->ram_block = qemu_ram_alloc_from_fd(size, mr, ram_flags, fd, 0, + readonly, &err); + } if (err) { mr->size = int128_zero(); object_unparent(OBJECT(mr)); @@ -2537,7 +2552,7 @@ static void memory_region_add_subregion_common(MemoryRegion *mr, { assert(!subregion->container); subregion->container = mr; - subregion->addr = offset; + memory_region_set_address_only(subregion, offset); memory_region_update_container_subregions(subregion); } @@ -2612,10 +2627,16 @@ static void memory_region_readd_subregion(MemoryRegion *mr) } } +void memory_region_set_address_only(MemoryRegion *mr, hwaddr addr) +{ + mr->addr = addr; + mr->has_addr = true; +} + void memory_region_set_address(MemoryRegion *mr, hwaddr addr) { if (addr != mr->addr) { - mr->addr = addr; + memory_region_set_address_only(mr, addr); memory_region_readd_subregion(mr); } } @@ -2663,6 +2684,27 @@ bool memory_region_is_mapped(MemoryRegion *mr) return mr->container ? true : false; } +int address_space_flat_for_each_section(AddressSpace *as, + memory_region_section_cb func, + void *opaque, + Error **errp) +{ + FlatView *view = address_space_get_flatview(as); + FlatRange *fr; + int ret; + + FOR_EACH_FLAT_RANGE(fr, view) { + MemoryRegionSection mrs = section_from_flat_range(fr, view); + ret = func(&mrs, opaque, errp); + if (ret) { + return ret; + } + } + + flatview_unref(view); + return 0; +} + /* Same as memory_region_find, but it does not add a reference to the * returned region. It must be called from an RCU critical section. 
*/ diff --git a/softmmu/physmem.c b/softmmu/physmem.c index e5c3557d54b3d77af5c4c35f644eeeafaf840f20..92b84b08cc72bfc6efea556cbd0e36a2944cc8e9 100644 --- a/softmmu/physmem.c +++ b/softmmu/physmem.c @@ -43,6 +43,7 @@ #include "qemu/qemu-print.h" #include "qemu/log.h" #include "exec/memory.h" +#include "qemu/memfd.h" #include "exec/ioport.h" #include "sysemu/dma.h" #include "sysemu/hostmem.h" @@ -65,6 +66,9 @@ #include "qemu/pmem.h" +#include "migration/blocker.h" +#include "migration/cpr-state.h" +#include "migration/misc.h" #include "migration/vmstate.h" #include "qemu/range.h" @@ -1978,6 +1982,83 @@ static void dirty_memory_extend(ram_addr_t old_ram_size, } } +static bool memory_region_is_backend(MemoryRegion *mr) +{ + return !!object_dynamic_cast(OBJECT(mr)->parent, TYPE_MEMORY_BACKEND); +} + +/* + * Return true if ram contents would be lost during cpr for MIG_MODE_CPR_EXEC. + * Return false for ram_device because it is remapped after exec. Do not + * exclude rom, even though it is readonly, because the rom file could change + * in the new qemu. Return false for non-migratable blocks. They are either + * re-created after exec, or are handled specially, or are covered by a + * device-level cpr blocker. Return false for an fd, because it is visible and + * can be remapped in the new process. + */ +static bool ram_is_volatile(RAMBlock *rb) +{ + MemoryRegion *mr = rb->mr; + + return mr && + memory_region_is_ram(mr) && + !memory_region_is_ram_device(mr) && + (!qemu_ram_is_shared(rb) || !ramblock_is_named_file(rb)) && + qemu_ram_is_migratable(rb) && + rb->fd < 0; +} + +/* + * Add a MIG_MODE_CPR_EXEC blocker for each volatile ram block. This cannot be + * performed in ram_block_add because the migratable flag has not been set yet. + * No need to examine anonymous (non-backend) blocks, because they are + * created using memfd if cpr-exec mode is enabled. + */ +void ram_block_add_cpr_blockers(Error **errp) +{ + RAMBlock *rb; + + RAMBLOCK_FOREACH(rb) { + if (ram_is_volatile(rb) && memory_region_is_backend(rb->mr)) { + const char *name = memory_region_name(rb->mr); + rb->cpr_blocker = NULL; + error_setg(&rb->cpr_blocker, + "Memory region %s is volatile. 
A memory-backend-memfd or" + " memory-backend-file with share=on is required.", name); + migrate_add_blockers(&rb->cpr_blocker, errp, MIG_MODE_CPR_EXEC, -1); + } + } +} + +static void *qemu_anon_memfd_alloc(RAMBlock *rb, size_t maxlen, Error **errp) +{ + size_t len, align; + void *addr; + struct MemoryRegion *mr = rb->mr; + const char *name = memory_region_name(mr); + int mfd = cpr_find_memfd(name, &len, &maxlen, &align); + + if (mfd >= 0) { + rb->used_length = len; + rb->max_length = maxlen; + mr->align = align; + } else { + len = rb->used_length; + maxlen = rb->max_length; + mr->align = QEMU_VMALLOC_ALIGN; + mfd = qemu_memfd_create(name, maxlen + mr->align, 0, 0, 0, errp); + if (mfd < 0) { + return NULL; + } + cpr_save_memfd(name, mfd, len, maxlen, mr->align); + } + rb->flags |= RAM_SHARED; + qemu_set_cloexec(mfd); + addr = file_ram_alloc(rb, maxlen, mfd, false, false, 0, errp); + trace_anon_memfd_alloc(name, maxlen, addr, mfd); + return addr; +} + static void ram_block_add(RAMBlock *new_block, Error **errp) { const bool noreserve = qemu_ram_is_noreserve(new_block); @@ -2001,6 +2082,14 @@ static void ram_block_add(RAMBlock *new_block, Error **errp) qemu_mutex_unlock_ramlist(); return; } + } else if (migrate_mode_enabled(MIG_MODE_CPR_EXEC) && + !memory_region_is_backend(new_block->mr)) { + new_block->host = qemu_anon_memfd_alloc(new_block, + new_block->max_length, + errp); + if (!new_block->host) { + return; + } } else { new_block->host = qemu_anon_ram_alloc(new_block->max_length, &new_block->mr->align, @@ -2012,8 +2101,9 @@ static void ram_block_add(RAMBlock *new_block, Error **errp) qemu_mutex_unlock_ramlist(); return; } - memory_try_enable_merging(new_block->host, new_block->max_length); } + if (!xen_enabled()) + memory_try_enable_merging(new_block->host, new_block->max_length); } new_ram_size = MAX(old_ram_size, @@ -2077,7 +2167,7 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr, /* Just support these ram flags by now. */ assert((ram_flags & ~(RAM_SHARED | RAM_PMEM | RAM_NORESERVE | - RAM_PROTECTED)) == 0); + RAM_PROTECTED | RAM_NAMED_FILE)) == 0); if (xen_enabled()) { error_setg(errp, "-mem-path not supported with Xen"); @@ -2246,6 +2336,8 @@ void qemu_ram_free(RAMBlock *block) } qemu_mutex_lock_ramlist(); + cpr_delete_memfd(memory_region_name(block->mr)); + migrate_del_blocker(&block->cpr_blocker); QLIST_REMOVE_RCU(block, next); ram_list.mru_block = NULL; /* Write list before version */ @@ -3682,6 +3774,11 @@ bool ramblock_is_pmem(RAMBlock *rb) return rb->flags & RAM_PMEM; } +bool ramblock_is_named_file(RAMBlock *rb) +{ + return rb->flags & RAM_NAMED_FILE; +} + static void mtree_print_phys_entries(int start, int end, int skip, int ptr) { if (start == end - 1) { diff --git a/softmmu/qemu-seccomp.c b/softmmu/qemu-seccomp.c index f50026778cf27c53b7e74fe668f891cd80e1b783..a94a6487279b8788f68a2fda41f4e55f271da437 100644 --- a/softmmu/qemu-seccomp.c +++ b/softmmu/qemu-seccomp.c @@ -196,6 +196,26 @@ static int seccomp_start(uint32_t seccomp_opts, Error **errp) return rc < 0 ? 
-1 : 0; } +int cpr_exec_unset_spawn(void *opaque, QemuOpts *opts, Error **errp) +{ + const char *value = NULL; + char *retstr = NULL; + + if (qemu_opt_get_bool(opts, "enable", false)) { + value = qemu_opt_get(opts, "spawn"); + if (value) { + /* CPR_EXEC mode need call fork+execv, so do not deny spawn */ + if (g_str_equal(value, "deny")) { + retstr = qemu_opt_get_del(opts, "spawn"); + } + if (retstr) + g_free(retstr); + } + } + + return 0; +} + int parse_sandbox(void *opaque, QemuOpts *opts, Error **errp) { if (qemu_opt_get_bool(opts, "enable", false)) { diff --git a/softmmu/runstate.c b/softmmu/runstate.c index 0757d7f26a79835c771c658fcbb509f3762f4000..0a74fbe1855f03fd45ecec47a5472f31471b177b 100644 --- a/softmmu/runstate.c +++ b/softmmu/runstate.c @@ -33,11 +33,13 @@ #include "exec/exec-all.h" #include "exec/gdbstub.h" #include "hw/boards.h" +#include "migration/cpr.h" #include "migration/misc.h" #include "migration/postcopy-ram.h" #include "monitor/monitor.h" #include "net/net.h" #include "net/vhost_net.h" +#include "qapi/util.h" #include "qapi/error.h" #include "qapi/qapi-commands-run-state.h" #include "qapi/qapi-events-run-state.h" @@ -152,6 +154,7 @@ static const RunStateTransition runstate_transitions_def[] = { { RUN_STATE_RUNNING, RUN_STATE_SUSPENDED }, { RUN_STATE_SUSPENDED, RUN_STATE_RUNNING }, { RUN_STATE_SUSPENDED, RUN_STATE_FINISH_MIGRATE }, + { RUN_STATE_SUSPENDED, RUN_STATE_PAUSED }, { RUN_STATE_SUSPENDED, RUN_STATE_PRELAUNCH }, { RUN_STATE_SUSPENDED, RUN_STATE_COLO}, @@ -262,6 +265,53 @@ void qemu_system_vmstop_request(RunState state) qemu_mutex_unlock(&vmstop_lock); qemu_notify_event(); } + +struct CprExecCompleteEntry { + CprExecCompleteHandler *cb; + void *opaque; + QTAILQ_ENTRY(CprExecCompleteEntry) entries; +}; + +static QTAILQ_HEAD(, CprExecCompleteEntry) cpr_exec_complete_head = + QTAILQ_HEAD_INITIALIZER(cpr_exec_complete_head); + +/* + * qemu_add_cpr_exec_complete_handler: + * @cb: the callback to invoke + * @opaque: user data passed to the callback + */ +CprExecCompleteEntry *qemu_add_cpr_exec_complete_handler( + CprExecCompleteHandler *cb, void *opaque) +{ + CprExecCompleteEntry *e; + + e = g_malloc0(sizeof(*e)); + e->cb = cb; + e->opaque = opaque; + QTAILQ_INSERT_TAIL(&cpr_exec_complete_head, e, entries); + + return e; +} + +void qemu_del_all_cpr_exec_complete_handler(void) +{ + CprExecCompleteEntry *e = NULL, *next_e = NULL; + + QTAILQ_FOREACH_SAFE(e, &cpr_exec_complete_head, entries, next_e) { + QTAILQ_REMOVE(&cpr_exec_complete_head, e, entries); + g_free(e); + } +} + +void cpr_exec_complete_notify(void) +{ + CprExecCompleteEntry *e; + + QTAILQ_FOREACH(e, &cpr_exec_complete_head, entries) { + e->cb(e->opaque); + } +} + struct VMChangeStateEntry { VMChangeStateHandler *cb; void *opaque; @@ -336,6 +386,7 @@ void vm_state_notify(bool running, RunState state) } } +static bool start_on_wakeup_requested; static ShutdownCause reset_requested; static ShutdownCause shutdown_requested; static int shutdown_signal; @@ -353,6 +404,7 @@ static NotifierList wakeup_notifiers = static NotifierList shutdown_notifiers = NOTIFIER_LIST_INITIALIZER(shutdown_notifiers); static uint32_t wakeup_reason_mask = ~(1 << QEMU_WAKEUP_REASON_NONE); +static GStrv exec_argv; ShutdownCause qemu_shutdown_requested_get(void) { @@ -369,6 +421,11 @@ static int qemu_shutdown_requested(void) return qatomic_xchg(&shutdown_requested, SHUTDOWN_CAUSE_NONE); } +static int qemu_exec_requested(void) +{ + return exec_argv != NULL; +} + static void qemu_kill_report(void) { if (!qtest_driver() && shutdown_signal) 
{ @@ -564,6 +621,11 @@ void qemu_register_suspend_notifier(Notifier *notifier) notifier_list_add(&suspend_notifiers, notifier); } +void qemu_system_start_on_wakeup_request(void) +{ + start_on_wakeup_requested = true; +} + void qemu_system_wakeup_request(WakeupReason reason, Error **errp) { trace_system_wakeup_request(reason); @@ -576,7 +638,13 @@ void qemu_system_wakeup_request(WakeupReason reason, Error **errp) if (!(wakeup_reason_mask & (1 << reason))) { return; } - runstate_set(RUN_STATE_RUNNING); + if (start_on_wakeup_requested) { + start_on_wakeup_requested = false; + vm_start(); + } else { + runstate_set(RUN_STATE_RUNNING); + } + wakeup_reason = reason; qemu_notify_event(); } @@ -628,6 +696,13 @@ void qemu_system_shutdown_request(ShutdownCause reason) qemu_notify_event(); } +void qemu_system_exec_request(const strList *args) +{ + exec_argv = strv_from_strList(args); + shutdown_requested = 1; + qemu_notify_event(); +} + static void qemu_system_powerdown(void) { qapi_event_send_powerdown(); @@ -676,6 +751,16 @@ static bool main_loop_should_exit(void) } request = qemu_shutdown_requested(); if (request) { + if (qemu_exec_requested()) { + Error *err = NULL; + cpr_preserve_fds(); + execvp(exec_argv[0], exec_argv); + error_setg_errno(&err, errno, "execvp %s failed", exec_argv[0]); + cpr_exec_failed(err); + g_strfreev(exec_argv); + exec_argv = NULL; + return false; + } qemu_kill_report(); qemu_system_shutdown(request); if (shutdown_action == SHUTDOWN_ACTION_PAUSE) { diff --git a/softmmu/vl.c b/softmmu/vl.c index d8996f3d6eb647575803a59c076b38bdadfda90c..b2bc684733f0ad5bbca471fd73eedfc772dd7c8c 100644 --- a/softmmu/vl.c +++ b/softmmu/vl.c @@ -28,6 +28,7 @@ #include "qemu/units.h" #include "qemu/log.h" #include "exec/cpu-common.h" +#include "exec/memory.h" #include "hw/qdev-properties.h" #include "qapi/compat-policy.h" #include "qapi/error.h" @@ -75,6 +76,7 @@ #include "hw/block/block.h" #include "hw/i386/x86.h" #include "hw/i386/pc.h" +#include "migration/cpr.h" #include "migration/misc.h" #include "migration/snapshot.h" #include "sysemu/tpm.h" @@ -2538,6 +2540,10 @@ static void qemu_process_early_options(void) #ifdef CONFIG_SECCOMP QemuOptsList *olist = qemu_find_opts_err("sandbox", NULL); if (olist) { + if (migrate_mode_enabled(MIG_MODE_CPR_EXEC)) { + qemu_opts_foreach(olist, cpr_exec_unset_spawn, NULL, &error_fatal); + olist = qemu_find_opts_err("sandbox", NULL); + } qemu_opts_foreach(olist, parse_sandbox, NULL, &error_fatal); } #endif @@ -2745,6 +2751,7 @@ void qmp_x_exit_preconfig(Error **errp) qemu_init_board(); qemu_create_cli_devices(); qemu_machine_creation_done(); + ram_block_add_cpr_blockers(&error_fatal); if (loadvm) { load_snapshot(loadvm, NULL, false, NULL, &error_fatal); @@ -3512,6 +3519,13 @@ void qemu_init(int argc, char **argv, char **envp) case QEMU_OPTION_only_migratable: only_migratable = 1; break; + case QEMU_OPTION_only_cpr_capable: + only_cpr_capable = true; + break; + case QEMU_OPTION_migrate_mode_enable: + migrate_enable_mode(qapi_enum_parse(&MigMode_lookup, optarg, -1, + &error_fatal)); + break; case QEMU_OPTION_nodefaults: has_defaults = 0; break; @@ -3715,6 +3729,8 @@ void qemu_init(int argc, char **argv, char **envp) suspend_mux_open(); + migration_object_early_init(); + qemu_disable_default_devices(); qemu_create_default_devices(); qemu_create_early_backends(); diff --git a/stubs/cpr-state.c b/stubs/cpr-state.c new file mode 100644 index 0000000000000000000000000000000000000000..1c9dc78a38b3d006faa2d08fde2e09f49c98bfd4 --- /dev/null +++ b/stubs/cpr-state.c @@ 
-0,0 +1,26 @@ +/* + * Copyright (c) 2022 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "migration/cpr-state.h" + +void cpr_save_fd(const char *name, int id, int fd) +{ +} + +void cpr_delete_fd(const char *name, int id) +{ +} + +int cpr_find_fd(const char *name, int id) +{ + return -1; +} + +void cpr_resave_fd(const char *name, int id, int fd) +{ +} diff --git a/stubs/meson.build b/stubs/meson.build index 71469c1d50a18967812c7f46078715782a135f5d..cca5d208d82ec07333fc6b600fbcf4ffceb4b391 100644 --- a/stubs/meson.build +++ b/stubs/meson.build @@ -4,6 +4,7 @@ stub_ss.add(files('blk-exp-close-all.c')) stub_ss.add(files('blockdev-close-all-bdrv-states.c')) stub_ss.add(files('change-state-handler.c')) stub_ss.add(files('cmos.c')) +stub_ss.add(files('cpr-state.c')) stub_ss.add(files('cpu-get-clock.c')) stub_ss.add(files('cpus-get-virtual-clock.c')) stub_ss.add(files('qemu-timer-notify-cb.c')) @@ -24,6 +25,7 @@ if libaio.found() stub_ss.add(files('linux-aio.c')) endif stub_ss.add(files('migr-blocker.c')) +stub_ss.add(files('migration.c')) stub_ss.add(files('module-opts.c')) stub_ss.add(files('monitor.c')) stub_ss.add(files('monitor-core.c')) diff --git a/stubs/migr-blocker.c b/stubs/migr-blocker.c index 5676a2f93c80668bb5a77e77c3da9042c88495ae..60769d8b790edd82296db56fa7434b4fa3657e6f 100644 --- a/stubs/migr-blocker.c +++ b/stubs/migr-blocker.c @@ -1,11 +1,16 @@ #include "qemu/osdep.h" #include "migration/blocker.h" -int migrate_add_blocker(Error *reason, Error **errp) +int migrate_add_blocker(Error **reasonp, Error **errp) { return 0; } -void migrate_del_blocker(Error *reason) +int migrate_add_blockers(Error **reasonp, Error **errp, MigMode mode, ...) +{ + return 0; +} + +void migrate_del_blocker(Error **reasonp) { } diff --git a/stubs/migration.c b/stubs/migration.c new file mode 100644 index 0000000000000000000000000000000000000000..166643cb0fa4852d121ec6abeddf9a90a43fb130 --- /dev/null +++ b/stubs/migration.c @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2021, 2022 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * See the COPYING file in the top-level directory. 
+ */ + +#include "qemu/osdep.h" +#include "migration/misc.h" + +void migration_add_notifier(Notifier *notify, + void (*cb)(Notifier *notifier, void *data)) +{ +} + +void migration_remove_notifier(Notifier *notify) +{ +} + +bool migration_has_finished(MigrationState *s) +{ + return false; +} + +bool migration_has_failed(MigrationState *s) +{ + return false; +} + +MigMode migrate_mode_of(MigrationState *s) +{ + return 0; +} + +MigMode migrate_mode(void) +{ + return 0; +} diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index d323d08dcb088c96829bf1a1a054f62ce6d7d8ae..1e1b5e5e386e0df4c33bd0fde351f04a8846d4e2 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -1499,7 +1499,7 @@ static int hyperv_init_vcpu(X86CPU *cpu) error_setg(&hv_passthrough_mig_blocker, "'hv-passthrough' CPU flag prevents migration, use explicit" " set of hv-* flags instead"); - ret = migrate_add_blocker(hv_passthrough_mig_blocker, &local_err); + ret = migrate_add_blocker(&hv_passthrough_mig_blocker, &local_err); if (ret < 0) { error_report_err(local_err); return ret; @@ -1513,7 +1513,7 @@ static int hyperv_init_vcpu(X86CPU *cpu) " use explicit 'hv-no-nonarch-coresharing=on' instead (but" " make sure SMT is disabled and/or that vCPUs are properly" " pinned)"); - ret = migrate_add_blocker(hv_no_nonarch_cs_mig_blocker, &local_err); + ret = migrate_add_blocker(&hv_no_nonarch_cs_mig_blocker, &local_err); if (ret < 0) { error_report_err(local_err); return ret; @@ -2019,7 +2019,7 @@ int kvm_arch_init_vcpu(CPUState *cs) error_setg(&invtsc_mig_blocker, "State blocked by non-migratable CPU device" " (invtsc flag)"); - r = migrate_add_blocker(invtsc_mig_blocker, &local_err); + r = migrate_add_blocker(&invtsc_mig_blocker, &local_err); if (r < 0) { error_report_err(local_err); return r; @@ -2086,7 +2086,7 @@ int kvm_arch_init_vcpu(CPUState *cs) return 0; fail: - migrate_del_blocker(invtsc_mig_blocker); + migrate_del_blocker(&invtsc_mig_blocker); return r; } diff --git a/target/i386/nvmm/nvmm-all.c b/target/i386/nvmm/nvmm-all.c index 9af261eea32df74931bd4212c76ca3c66c32a596..03aa19489dc468d7d49d1356beb600e1faf5e3e8 100644 --- a/target/i386/nvmm/nvmm-all.c +++ b/target/i386/nvmm/nvmm-all.c @@ -936,9 +936,9 @@ nvmm_init_vcpu(CPUState *cpu) error_setg(&nvmm_migration_blocker, "NVMM: Migration not supported"); - if (migrate_add_blocker(nvmm_migration_blocker, &local_error) < 0) { + if (migrate_add_blockers(&nvmm_migration_blocker, &local_error, + MIG_MODE_NORMAL, -1) < 0) { error_report_err(local_error); - error_free(nvmm_migration_blocker); return -EINVAL; } } diff --git a/target/i386/sev.c b/target/i386/sev.c index 025ff7a6f8451c18158b5afcd6a898849d034ac1..81ef6260f0b78870d4d1fadf657fae85d978aaf9 100644 --- a/target/i386/sev.c +++ b/target/i386/sev.c @@ -851,7 +851,7 @@ sev_launch_finish(SevGuestState *sev) /* add migration blocker */ error_setg(&sev_mig_blocker, "SEV: Migration is not implemented"); - migrate_add_blocker(sev_mig_blocker, &error_fatal); + migrate_add_blocker(&sev_mig_blocker, &error_fatal); } static void diff --git a/target/i386/whpx/whpx-all.c b/target/i386/whpx/whpx-all.c index ef896da0a2115c50a0afc6c64910d6def52c51b9..ce647a1e092c149e451b5bd40ca842d533f09bd0 100644 --- a/target/i386/whpx/whpx-all.c +++ b/target/i386/whpx/whpx-all.c @@ -1346,9 +1346,8 @@ int whpx_init_vcpu(CPUState *cpu) "State blocked due to non-migratable CPUID feature support," "dirty memory tracking support, and XSAVE/XRSTOR support"); - if (migrate_add_blocker(whpx_migration_blocker, &local_error) < 0) { + if 
(migrate_add_blocker(&whpx_migration_blocker, &local_error) < 0) { error_report_err(local_error); - error_free(whpx_migration_blocker); ret = -EINVAL; goto error; } diff --git a/tests/avocado/cpr.py b/tests/avocado/cpr.py new file mode 100644 index 0000000000000000000000000000000000000000..11e1376d13a95514dcab4d30cf393d900a4e6aff --- /dev/null +++ b/tests/avocado/cpr.py @@ -0,0 +1,176 @@ +# cpr test + +# Copyright (c) 2021, 2022 Oracle and/or its affiliates. +# +# This work is licensed under the terms of the GNU GPL, version 2. +# See the COPYING file in the top-level directory. + +import tempfile +from avocado_qemu import QemuSystemTest +from avocado.utils import wait + +class Cpr(QemuSystemTest): + """ + :avocado: tags=cpr + """ + + timeout = 5 + fast_timeout = 1 + + @staticmethod + def has_status(vm, status, command): + return vm.command(command)['status'] in status + + def wait_for_status(self, vm, status, command): + wait.wait_for(self.has_status, + timeout=self.timeout, + step=0.1, + args=(vm,status,command,)) + + def wait_for_runstate(self, vm, status): + self.wait_for_status(vm, status, 'query-status') + + def wait_for_migration(self, vm, status): + self.wait_for_status(vm, status, 'query-migrate') + + def run_and_fail(self, vm, msg): + # Qemu will fail fast, so disable monitor to avoid timeout in accept + vm.set_qmp_monitor(False) + vm.launch() + vm.wait(self.timeout) + self.assertRegex(vm.get_log(), msg) + + def get_vm_for_restart(self): + return self.get_vm('-nodefaults', + '-migrate-mode-enable', 'cpr-exec', + '-object', 'memory-backend-memfd,id=pc.ram,size=8M', + '-machine', 'memory-backend=pc.ram') + + def do_cpr_exec(self, vmstate_name): + vm = self.get_vm_for_restart() + vm.launch() + + uri = 'file:' + vmstate_name + args = vm.full_args + ['-incoming', 'defer'] + + vm.command('migrate-set-parameters', cpr_exec_args=args) + vm.command('migrate-set-parameters', mode='cpr-exec') + vm.qmp('migrate', uri=uri) + + # Cannot poll for migration status, because qemu may call execv before + # we see it. Wait for STOP instead. + vm.event_wait(name='STOP', timeout=self.fast_timeout) + + # Migrate execs and closes the monitor socket, so reopen it. 
+ vm.reopen_qmp_connection() + + self.assertEqual(vm.command('query-status')['status'], 'inmigrate') + resp = vm.command('migrate-incoming', uri=uri) + self.wait_for_migration(vm, ('completed', 'failed')) + self.assertEqual(vm.command('query-migrate')['status'], 'completed') + + resp = vm.command('cont') + vm.event_wait(name='RESUME', timeout=self.fast_timeout) + self.assertEqual(vm.command('query-status')['status'], 'running') + + def do_cpr_reboot(self, vmstate_name): + args = ['-nodefaults', '-migrate-mode-enable', 'cpr-reboot' ] + old_vm = self.get_vm(*args) + old_vm.launch() + + uri = 'file:' + vmstate_name + + old_vm.command('migrate-set-capabilities', capabilities = [ + { "capability": "x-ignore-shared", "state": True }]) + old_vm.command('migrate-set-parameters', mode='cpr-reboot') + old_vm.qmp('migrate', uri=uri) + self.wait_for_migration(old_vm, ('completed', 'failed')) + self.assertEqual(old_vm.command('query-migrate')['status'], + 'completed') + self.assertEqual(old_vm.command('query-status')['status'], + 'postmigrate') + + args = args + ['-incoming', 'defer'] + new_vm = self.get_vm(*args) + new_vm.launch() + self.assertEqual(new_vm.command('query-status')['status'], 'inmigrate') + + new_vm.command('migrate-set-capabilities', capabilities = [ + { "capability": "x-ignore-shared", "state": True }]) + new_vm.command('migrate-set-parameters', mode='cpr-reboot') + new_vm.command('migrate-incoming', uri=uri) + self.wait_for_migration(new_vm, ('completed', 'failed')) + self.assertEqual(new_vm.command('query-migrate')['status'], 'completed') + + new_vm.command('cont') + new_vm.event_wait(name='RESUME', timeout=self.fast_timeout) + self.assertEqual(new_vm.command('query-status')['status'], 'running') + + def test_cpr_exec(self): + """ + Verify that cpr restart mode works + """ + with tempfile.NamedTemporaryFile() as vmstate_file: + self.do_cpr_exec(vmstate_file.name) + + def test_cpr_reboot(self): + """ + Verify that cpr reboot mode works + """ + with tempfile.NamedTemporaryFile() as vmstate_file: + self.do_cpr_reboot(vmstate_file.name) + + def test_cpr_block_cpr_exec(self): + """ + Verify that qemu rejects cpr restart mode for volatile memory + """ + + vm = self.get_vm('-nodefaults', + '-migrate-mode-enable', 'cpr-exec') + vm.launch() + uri='file:/dev/null' + args = vm.full_args + ['-S'] + resp = vm.command('migrate-set-parameters', mode='cpr-exec') + rsp = vm.qmp('migrate', uri=uri) + vm.qmp('quit') + + expect = r'Memory region .* is volatile' + self.assertRegex(rsp['error']['desc'], expect) + + def test_cpr_block_memfd(self): + + """ + Verify that qemu complains for only-cpr-capable and volatile memory + """ + vm = self.get_vm('-nodefaults', + '-migrate-mode-enable', 'cpr-exec', + '-only-cpr-capable') + self.run_and_fail(vm, r'only-cpr-capable specified.* Memory ') + + def test_cpr_block_replay(self): + """ + Verify that qemu complains for only-cpr-capable and replay + """ + vm = self.get_vm_for_restart() + vm.add_args('-only-cpr-capable', + '-icount', 'shift=10,rr=record,rrfile=/dev/null') + self.run_and_fail(vm, r'only-cpr-capable specified.* replay ') + + def test_cpr_block_chardev(self): + """ + Verify that qemu complains for only-cpr-capable and unsupported chardev + """ + vm = self.get_vm_for_restart() + vm.add_args('-only-cpr-capable', + '-chardev', 'vc,id=vc1') + self.run_and_fail(vm, r'only-cpr-capable specified.* vc1 ') + + def test_cpr_allow_chardev(self): + """ + Verify that qemu allows unsupported chardev with reopen-on-cpr + """ + vm = self.get_vm_for_restart() + 
vm.add_args('-only-cpr-capable', + '-chardev', 'vc,id=vc1,reopen-on-cpr=on') + vm.launch() + self.wait_for_runstate(vm, ('running')) diff --git a/tests/unit/meson.build b/tests/unit/meson.build index c21d81787454854610738e274deb00d03e275990..3700dcec651826017871ae23cf751462caa93eb3 100644 --- a/tests/unit/meson.build +++ b/tests/unit/meson.build @@ -17,6 +17,7 @@ tests = { 'test-forward-visitor': [testqapi], 'test-string-input-visitor': [testqapi], 'test-string-output-visitor': [testqapi], + 'test-strlist': [testqapi], 'test-opts-visitor': [testqapi], 'test-visitor-serialization': [testqapi], 'test-bitmap': [], diff --git a/tests/unit/test-strlist.c b/tests/unit/test-strlist.c new file mode 100644 index 0000000000000000000000000000000000000000..ef740dccc7fbb9e9048e9931cfd1311b2e3fc8bd --- /dev/null +++ b/tests/unit/test-strlist.c @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2022 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qapi/util.h" +#include "qapi/qapi-builtin-types.h" + +static strList *make_list(int length) +{ + strList *head = 0, *list, **prev = &head; + + while (length--) { + list = *prev = g_new0(strList, 1); + list->value = g_strdup("aaa"); + prev = &list->next; + } + return head; +} + +static void test_length(void) +{ + strList *list; + int i; + + for (i = 0; i < 5; i++) { + list = make_list(i); + g_assert_cmpint(i, ==, QAPI_LIST_LENGTH(list)); + qapi_free_strList(list); + } +} + +struct { + const char *string; + char delim; + const char *args[5]; +} list_data[] = { + { 0, ',', { 0 } }, + { "", ',', { 0 } }, + { "a", ',', { "a", 0 } }, + { "a,b", ',', { "a", "b", 0 } }, + { "a,b,c", ',', { "a", "b", "c", 0 } }, + { "first last", ' ', { "first", "last", 0 } }, + { "a:", ':', { "a", 0 } }, + { "a::b", ':', { "a", "", "b", 0 } }, + { ":", ':', { "", 0 } }, + { ":a", ':', { "", "a", 0 } }, + { "::a", ':', { "", "", "a", 0 } }, +}; + +static void test_strv(void) +{ + int i, j; + const char **expect; + strList *list; + GStrv args; + + for (i = 0; i < ARRAY_SIZE(list_data); i++) { + expect = list_data[i].args; + list = strList_from_string(list_data[i].string, list_data[i].delim); + args = strv_from_strList(list); + qapi_free_strList(list); + for (j = 0; expect[j] && args[j]; j++) { + g_assert_cmpstr(expect[j], ==, args[j]); + } + g_assert_null(expect[j]); + g_assert_null(args[j]); + g_strfreev(args); + } +} + +int main(int argc, char **argv) +{ + g_test_init(&argc, &argv, NULL); + g_test_add_func("/test-string/length", test_length); + g_test_add_func("/test-string/strv", test_strv); + return g_test_run(); +} diff --git a/trace-events b/trace-events index a637a61ebab4e2acba188ad029fc523c9f69f1ae..770a9ac0b7945562e052648402ffc1fd0b39fd72 100644 --- a/trace-events +++ b/trace-events @@ -45,6 +45,7 @@ ram_block_discard_range(const char *rbname, void *hva, size_t length, bool need_ # accel/tcg/cputlb.c memory_notdirty_write_access(uint64_t vaddr, uint64_t ram_addr, unsigned size) "0x%" PRIx64 " ram_addr 0x%" PRIx64 " size %u" memory_notdirty_set_dirty(uint64_t vaddr) "0x%" PRIx64 +anon_memfd_alloc(const char *name, size_t size, void *ptr, int fd) "%s size %zu ptr %p fd %d" # gdbstub.c gdbstub_op_start(const char *device) "Starting gdbstub using device %s" diff --git a/ui/spice-core.c b/ui/spice-core.c index 31974b8d6c445ad85e335915e7898e6cb0c72e82..de960b456340fe86d3652b594f7eb0467fd33517 100644 --- a/ui/spice-core.c +++ b/ui/spice-core.c @@ 
-558,7 +558,7 @@ static void migration_state_notifier(Notifier *notifier, void *data) { MigrationState *s = data; - if (!spice_have_target_host) { + if (!spice_have_target_host || migrate_mode_of(s) != MIG_MODE_NORMAL) { return; } @@ -819,8 +819,7 @@ static void qemu_spice_init(void) }; using_spice = 1; - migration_state.notify = migration_state_notifier; - add_migration_state_change_notifier(&migration_state); + migration_add_notifier(&migration_state, migration_state_notifier); spice_migrate.base.sif = &migrate_interface.base; qemu_spice.add_interface(&spice_migrate.base); diff --git a/ui/vdagent.c b/ui/vdagent.c index 19e8fbfc96f156308e94df7379d3467f2026b046..d452eddf0bf834179cdbb6fd0d52531788302d0f 100644 --- a/ui/vdagent.c +++ b/ui/vdagent.c @@ -603,7 +603,7 @@ static void vdagent_chr_open(Chardev *chr, return; #endif - if (migrate_add_blocker(vd->migration_blocker, errp) != 0) { + if (migrate_add_blocker(&vd->migration_blocker, errp) != 0) { return; } @@ -848,10 +848,9 @@ static void vdagent_chr_fini(Object *obj) { VDAgentChardev *vd = QEMU_VDAGENT_CHARDEV(obj); - migrate_del_blocker(vd->migration_blocker); + migrate_del_blocker(&vd->migration_blocker); vdagent_disconnect(vd); buffer_free(&vd->outbuf); - error_free(vd->migration_blocker); } static const TypeInfo vdagent_chr_type_info = { diff --git a/util/oslib-posix.c b/util/oslib-posix.c index 18a38b94643080895dff89f1f22faed6ca9e11b8..9f353401193e6805d538b36dac582476959b5695 100644 --- a/util/oslib-posix.c +++ b/util/oslib-posix.c @@ -310,6 +310,16 @@ void qemu_set_cloexec(int fd) assert(f != -1); } + +void qemu_clear_cloexec(int fd) +{ + int f; + f = fcntl(fd, F_GETFD); + assert(f != -1); + f = fcntl(fd, F_SETFD, f & ~FD_CLOEXEC); + assert(f != -1); +} + /* * Creates a pipe with FD_CLOEXEC set on both file descriptors */ diff --git a/util/oslib-win32.c b/util/oslib-win32.c index af559ef3398dc3473da4cdc22f7a1028bc5912bc..acc3e0661c70bedf1a15c0ba3c23fbab395f8b81 100644 --- a/util/oslib-win32.c +++ b/util/oslib-win32.c @@ -265,6 +265,10 @@ void qemu_set_cloexec(int fd) { } +void qemu_clear_cloexec(int fd) +{ +} + /* Offset between 1/1/1601 and 1/1/1970 in 100 nanosec units */ #define _W32_FT_OFFSET (116444736000000000ULL)
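
Usage sketch (not code from this series): the hunks above convert callers to a pointer-based blocker API and add a mode-aware migrate_add_blockers() variant. The sketch below shows how a device model might use it; MyDevState and the function names are hypothetical, and the behavioral comments are inferred from the converted call sites (they no longer free the Error themselves), not from the implementation, which is outside this excerpt.

    /*
     * Minimal sketch, assuming the migrate_add_blockers()/migrate_del_blocker()
     * signatures shown in stubs/migr-blocker.c above.
     */
    #include "qemu/osdep.h"
    #include "qapi/error.h"
    #include "migration/blocker.h"

    typedef struct MyDevState {
        Error *migration_blocker;    /* owned by the blocker core once added */
    } MyDevState;

    static int mydev_block_migration(MyDevState *s, Error **errp)
    {
        error_setg(&s->migration_blocker,
                   "mydev does not support live migration");

        /* Block normal migration only; the mode list is terminated by -1. */
        return migrate_add_blockers(&s->migration_blocker, errp,
                                    MIG_MODE_NORMAL, -1);
    }

    static void mydev_cleanup(MyDevState *s)
    {
        /* Removes the blocker and is expected to clear the pointer. */
        migrate_del_blocker(&s->migration_blocker);
    }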
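
A second sketch, also editorial, of how a hypothetical backend could keep a file descriptor across a cpr-exec restart, based only on the cpr-state entry points stubbed above (cpr_find_fd() returns -1 when nothing was saved, cpr_save_fd() records a descriptor) and the new qemu_clear_cloexec() helper. mydev_get_fd() and the /dev/null stand-in are placeholders; in the series itself the main loop also calls cpr_preserve_fds() before execvp(), so explicit FD_CLOEXEC handling may not be needed by every caller.

    /*
     * Sketch under stated assumptions: look the descriptor up in the
     * cpr-state table first, create and register it only on the initial
     * launch, and keep it open across exec by clearing FD_CLOEXEC.
     */
    #include "qemu/osdep.h"
    #include "qapi/error.h"
    #include "migration/cpr-state.h"

    static int mydev_get_fd(const char *name, Error **errp)
    {
        int fd = cpr_find_fd(name, 0);         /* -1 if not preserved */

        if (fd < 0) {
            fd = open("/dev/null", O_RDWR);    /* stand-in for the real resource */
            if (fd < 0) {
                error_setg_errno(errp, errno, "failed to open backing resource");
                return -1;
            }
            cpr_save_fd(name, 0, fd);
        }

        /* May be redundant with cpr_preserve_fds(); shown for the new helper. */
        qemu_clear_cloexec(fd);
        return fd;
    }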
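
Finally, a small sketch of the string-list helpers exercised by tests/unit/test-strlist.c above: strList_from_string() splits a string on a delimiter into a QAPI strList, and strv_from_strList() converts that list into a NULL-terminated GStrv, the same conversion qemu_system_exec_request() performs on its argument before the main loop calls execvp(). The wrapper command_line_to_argv() is made up for illustration and is not an API added by this series.

    /*
     * Sketch only: split a space-separated command line into an argv-style
     * GStrv using the helpers tested above.
     */
    #include "qemu/osdep.h"
    #include "qapi/util.h"
    #include "qapi/qapi-builtin-types.h"

    static GStrv command_line_to_argv(const char *cmdline)
    {
        strList *list = strList_from_string(cmdline, ' ');
        GStrv argv = strv_from_strList(list);   /* NULL-terminated copy */

        qapi_free_strList(list);
        return argv;                            /* free with g_strfreev() */
    }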