diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index 84edbe8bb17c52d53e2a1c41756890dc7f266f17..4daff563a09a2f642746b2c4a393abb6631b71f0 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -138,6 +138,7 @@ bool kvm_direct_msi_allowed; bool kvm_ioeventfd_any_length_allowed; bool kvm_msi_use_devid; static bool kvm_immediate_exit; +static hwaddr kvm_max_slot_size = ~0; static const KVMCapabilityInfo kvm_required_capabilites[] = { KVM_CAP_INFO(USER_MEMORY), @@ -458,7 +459,7 @@ static int kvm_slot_update_flags(KVMMemoryListener *kml, KVMSlot *mem, static int kvm_section_update_flags(KVMMemoryListener *kml, MemoryRegionSection *section) { - hwaddr start_addr, size; + hwaddr start_addr, size, slot_size; KVMSlot *mem; int ret = 0; @@ -469,13 +470,18 @@ static int kvm_section_update_flags(KVMMemoryListener *kml, kvm_slots_lock(kml); - mem = kvm_lookup_matching_slot(kml, start_addr, size); - if (!mem) { - /* We don't have a slot if we want to trap every access. */ - goto out; - } + while (size && !ret) { + slot_size = MIN(kvm_max_slot_size, size); + mem = kvm_lookup_matching_slot(kml, start_addr, slot_size); + if (!mem) { + /* We don't have a slot if we want to trap every access. */ + goto out; + } - ret = kvm_slot_update_flags(kml, mem, section->mr); + ret = kvm_slot_update_flags(kml, mem, section->mr); + start_addr += slot_size; + size -= slot_size; + } out: kvm_slots_unlock(kml); @@ -530,6 +536,31 @@ static int kvm_get_dirty_pages_log_range(MemoryRegionSection *section, #define ALIGN(x, y) (((x)+(y)-1) & ~((y)-1)) +/* Allocate the dirty bitmap for a slot */ +static void kvm_memslot_init_dirty_bitmap(KVMSlot *mem) +{ + /* + * XXX bad kernel interface alert + * For dirty bitmap, kernel allocates array of size aligned to + * bits-per-long. But for case when the kernel is 64bits and + * the userspace is 32bits, userspace can't align to the same + * bits-per-long, since sizeof(long) is different between kernel + * and user space. This way, userspace will provide buffer which + * may be 4 bytes less than the kernel will use, resulting in + * userspace memory corruption (which is not detectable by valgrind + * too, in most cases). + * So for now, let's align to 64 instead of HOST_LONG_BITS here, in + * a hope that sizeof(long) won't become >8 any time soon. + * + * Note: the granule of kvm dirty log is qemu_real_host_page_size. + * And mem->memory_size is aligned to it (otherwise this mem can't + * be registered to KVM). + */ + hwaddr bitmap_size = ALIGN(mem->memory_size / qemu_real_host_page_size, + /*HOST_LONG_BITS*/ 64) / 8; + mem->dirty_bmap = g_malloc0(bitmap_size); +} + /** * kvm_physical_sync_dirty_bitmap - Sync dirty bitmap from kernel space * @@ -548,33 +579,23 @@ static int kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml, struct kvm_dirty_log d = {}; KVMSlot *mem; hwaddr start_addr, size; + hwaddr slot_size, slot_offset = 0; int ret = 0; size = kvm_align_section(section, &start_addr); - if (size) { - mem = kvm_lookup_matching_slot(kml, start_addr, size); + while (size) { + MemoryRegionSection subsection = *section; + + slot_size = MIN(kvm_max_slot_size, size); + mem = kvm_lookup_matching_slot(kml, start_addr, slot_size); if (!mem) { /* We don't have a slot if we want to trap every access. */ goto out; } - /* XXX bad kernel interface alert - * For dirty bitmap, kernel allocates array of size aligned to - * bits-per-long. But for case when the kernel is 64bits and - * the userspace is 32bits, userspace can't align to the same - * bits-per-long, since sizeof(long) is different between kernel - * and user space. This way, userspace will provide buffer which - * may be 4 bytes less than the kernel will use, resulting in - * userspace memory corruption (which is not detectable by valgrind - * too, in most cases). - * So for now, let's align to 64 instead of HOST_LONG_BITS here, in - * a hope that sizeof(long) won't become >8 any time soon. - */ - size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS), - /*HOST_LONG_BITS*/ 64) / 8; if (!mem->dirty_bmap) { /* Allocate on the first log_sync, once and for all */ - mem->dirty_bmap = g_malloc0(size); + kvm_memslot_init_dirty_bitmap(mem); } d.dirty_bitmap = mem->dirty_bmap; @@ -585,7 +606,13 @@ static int kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml, goto out; } - kvm_get_dirty_pages_log_range(section, d.dirty_bitmap); + subsection.offset_within_region += slot_offset; + subsection.size = int128_make64(slot_size); + kvm_get_dirty_pages_log_range(&subsection, d.dirty_bitmap); + + slot_offset += slot_size; + start_addr += slot_size; + size -= slot_size; } out: return ret; @@ -974,6 +1001,14 @@ kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list) return NULL; } +void kvm_set_max_memslot_size(hwaddr max_slot_size) +{ + g_assert( + ROUND_UP(max_slot_size, qemu_real_host_page_size) == max_slot_size + ); + kvm_max_slot_size = max_slot_size; +} + static void kvm_set_phys_mem(KVMMemoryListener *kml, MemoryRegionSection *section, bool add) { @@ -981,7 +1016,7 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml, int err; MemoryRegion *mr = section->mr; bool writeable = !mr->readonly && !mr->rom_device; - hwaddr start_addr, size; + hwaddr start_addr, size, slot_size; void *ram; if (!memory_region_is_ram(mr)) { @@ -1006,41 +1041,59 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml, kvm_slots_lock(kml); if (!add) { - mem = kvm_lookup_matching_slot(kml, start_addr, size); - if (!mem) { - goto out; - } - if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) { - kvm_physical_sync_dirty_bitmap(kml, section); - } + do { + slot_size = MIN(kvm_max_slot_size, size); + mem = kvm_lookup_matching_slot(kml, start_addr, slot_size); + if (!mem) { + goto out; + } + if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) { + kvm_physical_sync_dirty_bitmap(kml, section); + } - /* unregister the slot */ - g_free(mem->dirty_bmap); - mem->dirty_bmap = NULL; - mem->memory_size = 0; - mem->flags = 0; - err = kvm_set_user_memory_region(kml, mem, false); - if (err) { - fprintf(stderr, "%s: error unregistering slot: %s\n", - __func__, strerror(-err)); - abort(); - } + /* unregister the slot */ + g_free(mem->dirty_bmap); + mem->dirty_bmap = NULL; + mem->memory_size = 0; + mem->flags = 0; + err = kvm_set_user_memory_region(kml, mem, false); + if (err) { + fprintf(stderr, "%s: error unregistering slot: %s\n", + __func__, strerror(-err)); + abort(); + } + start_addr += slot_size; + size -= slot_size; + } while (size); goto out; } /* register the new slot */ - mem = kvm_alloc_slot(kml); - mem->memory_size = size; - mem->start_addr = start_addr; - mem->ram = ram; - mem->flags = kvm_mem_flags(mr); + do { + slot_size = MIN(kvm_max_slot_size, size); + mem = kvm_alloc_slot(kml); + mem->memory_size = slot_size; + mem->start_addr = start_addr; + mem->ram = ram; + mem->flags = kvm_mem_flags(mr); - err = kvm_set_user_memory_region(kml, mem, true); - if (err) { - fprintf(stderr, "%s: error registering slot: %s\n", __func__, - strerror(-err)); - abort(); - } + if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) { + /* + * Reallocate the bmap; it means it doesn't disappear in + * middle of a migrate. + */ + kvm_memslot_init_dirty_bitmap(mem); + } + err = kvm_set_user_memory_region(kml, mem, true); + if (err) { + fprintf(stderr, "%s: error registering slot: %s\n", __func__, + strerror(-err)); + abort(); + } + start_addr += slot_size; + ram += slot_size; + size -= slot_size; + } while (size); out: kvm_slots_unlock(kml); @@ -2880,6 +2933,7 @@ static bool kvm_accel_has_memory(MachineState *ms, AddressSpace *as, for (i = 0; i < kvm->nr_as; ++i) { if (kvm->as[i].as == as && kvm->as[i].ml) { + size = MIN(kvm_max_slot_size, size); return NULL != kvm_lookup_matching_slot(kvm->as[i].ml, start_addr, size); } diff --git a/hw/vfio/common.c b/hw/vfio/common.c index a86a4c4506558a41a36d6f4a3be0dbec73ad588c..d9cc3509ef2db54061bc81f930b047a2f6a86726 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -310,7 +310,7 @@ bool vfio_mig_active(void) return true; } -static bool vfio_devices_all_stopped_and_saving(VFIOContainer *container) +static bool vfio_devices_all_saving(VFIOContainer *container) { VFIOGroup *group; VFIODevice *vbasedev; @@ -328,8 +328,11 @@ static bool vfio_devices_all_stopped_and_saving(VFIOContainer *container) return false; } - if ((migration->device_state & VFIO_DEVICE_STATE_SAVING) && - !(migration->device_state & VFIO_DEVICE_STATE_RUNNING)) { + if (migration->device_state & VFIO_DEVICE_STATE_SAVING) { + if ((vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF) + && (migration->device_state & VFIO_DEVICE_STATE_RUNNING)) { + return false; + } continue; } else { return false; @@ -1088,7 +1091,7 @@ static void vfio_listerner_log_sync(MemoryListener *listener, return; } - if (vfio_devices_all_stopped_and_saving(container)) { + if (vfio_devices_all_saving(container)) { vfio_sync_dirty_bitmap(container, section); } } diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index 1a9778448605bfbcbdf7d89c6596de014623cc94..033cb2b0c9f53fd45d2642149f213d096fcd953b 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -888,7 +888,7 @@ int vfio_migration_probe(VFIODevice *vbasedev, Error **errp) Error *local_err = NULL; int ret = -ENOTSUP; - if (!container->dirty_pages_supported) { + if (!vbasedev->enable_migration || !container->dirty_pages_supported) { goto add_blocker; } @@ -903,8 +903,8 @@ int vfio_migration_probe(VFIODevice *vbasedev, Error **errp) goto add_blocker; } - g_free(info); trace_vfio_migration_probe(vbasedev->name, info->index); + g_free(info); return 0; add_blocker: diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index da7c740bce590223cc125972908fee0288d5cccf..3641ad0c5c216a70ae1562f260deecc5286a2476 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -3180,6 +3180,9 @@ static void vfio_instance_init(Object *obj) static Property vfio_pci_dev_properties[] = { DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIOPCIDevice, host), DEFINE_PROP_STRING("sysfsdev", VFIOPCIDevice, vbasedev.sysfsdev), + DEFINE_PROP_ON_OFF_AUTO("x-pre-copy-dirty-page-tracking", VFIOPCIDevice, + vbasedev.pre_copy_dirty_page_tracking, + ON_OFF_AUTO_ON), DEFINE_PROP_ON_OFF_AUTO("display", VFIOPCIDevice, display, ON_OFF_AUTO_OFF), DEFINE_PROP_UINT32("xres", VFIOPCIDevice, display_xres, 0), @@ -3192,6 +3195,8 @@ static Property vfio_pci_dev_properties[] = { VFIO_FEATURE_ENABLE_REQ_BIT, true), DEFINE_PROP_BIT("x-igd-opregion", VFIOPCIDevice, features, VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, false), + DEFINE_PROP_BOOL("x-enable-migration", VFIOPCIDevice, + vbasedev.enable_migration, false), DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false), DEFINE_PROP_BOOL("x-balloon-allowed", VFIOPCIDevice, vbasedev.balloon_allowed, false), diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 048731e81f70bf6e836a81a68729eff9968e577c..475aa9fb40bc58936a7f1fd74cc2faafc36117ee 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -123,12 +123,14 @@ typedef struct VFIODevice { bool needs_reset; bool no_mmap; bool balloon_allowed; + bool enable_migration; VFIODeviceOps *ops; unsigned int num_irqs; unsigned int num_regions; unsigned int flags; VFIOMigration *migration; Error *migration_blocker; + OnOffAuto pre_copy_dirty_page_tracking; } VFIODevice; struct VFIODeviceOps { diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h index 787dbc7770e6726723648d5d535c4b2c296bc588..f8e884f1461d88ba249fca981a3dc956f90290d5 100644 --- a/include/sysemu/kvm_int.h +++ b/include/sysemu/kvm_int.h @@ -43,4 +43,5 @@ typedef struct KVMMemoryListener { void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml, AddressSpace *as, int as_id); +void kvm_set_max_memslot_size(hwaddr max_slot_size); #endif diff --git a/memory.c b/memory.c index 44713efc66b2b8ed5ec198841f6daa8ae9777cf2..708b3dff3d8e765055febed969f3f01c64e7dae0 100644 --- a/memory.c +++ b/memory.c @@ -1825,7 +1825,10 @@ bool memory_region_is_ram_device(MemoryRegion *mr) uint8_t memory_region_get_dirty_log_mask(MemoryRegion *mr) { uint8_t mask = mr->dirty_log_mask; - if (global_dirty_log && (mr->ram_block || memory_region_is_iommu(mr))) { + RAMBlock *rb = mr->ram_block; + + if (global_dirty_log && ((rb && qemu_ram_is_migratable(rb)) || + memory_region_is_iommu(mr))) { mask |= (1 << DIRTY_MEMORY_MIGRATION); } return mask; diff --git a/migration/migration.c b/migration/migration.c index 9faf5f63a627a7155f0d2fa1a13c53fa866af4e4..d34e57fc051f2919f5d7278d5ba57b61b9e1698d 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -2541,7 +2541,7 @@ retry: out: res = qemu_file_get_error(rp); if (res) { - if (res == -EIO) { + if (res == -EIO && migration_in_postcopy()) { /* * Maybe there is something we can do: it looks like a * network down issue, and we pause for a recovery.