diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index 84edbe8bb17c52d53e2a1c41756890dc7f266f17..4daff563a09a2f642746b2c4a393abb6631b71f0 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -138,6 +138,7 @@ bool kvm_direct_msi_allowed; bool kvm_ioeventfd_any_length_allowed; bool kvm_msi_use_devid; static bool kvm_immediate_exit; +static hwaddr kvm_max_slot_size = ~0; static const KVMCapabilityInfo kvm_required_capabilites[] = { KVM_CAP_INFO(USER_MEMORY), @@ -458,7 +459,7 @@ static int kvm_slot_update_flags(KVMMemoryListener *kml, KVMSlot *mem, static int kvm_section_update_flags(KVMMemoryListener *kml, MemoryRegionSection *section) { - hwaddr start_addr, size; + hwaddr start_addr, size, slot_size; KVMSlot *mem; int ret = 0; @@ -469,13 +470,18 @@ static int kvm_section_update_flags(KVMMemoryListener *kml, kvm_slots_lock(kml); - mem = kvm_lookup_matching_slot(kml, start_addr, size); - if (!mem) { - /* We don't have a slot if we want to trap every access. */ - goto out; - } + while (size && !ret) { + slot_size = MIN(kvm_max_slot_size, size); + mem = kvm_lookup_matching_slot(kml, start_addr, slot_size); + if (!mem) { + /* We don't have a slot if we want to trap every access. */ + goto out; + } - ret = kvm_slot_update_flags(kml, mem, section->mr); + ret = kvm_slot_update_flags(kml, mem, section->mr); + start_addr += slot_size; + size -= slot_size; + } out: kvm_slots_unlock(kml); @@ -530,6 +536,31 @@ static int kvm_get_dirty_pages_log_range(MemoryRegionSection *section, #define ALIGN(x, y) (((x)+(y)-1) & ~((y)-1)) +/* Allocate the dirty bitmap for a slot */ +static void kvm_memslot_init_dirty_bitmap(KVMSlot *mem) +{ + /* + * XXX bad kernel interface alert + * For dirty bitmap, kernel allocates array of size aligned to + * bits-per-long. But for case when the kernel is 64bits and + * the userspace is 32bits, userspace can't align to the same + * bits-per-long, since sizeof(long) is different between kernel + * and user space. This way, userspace will provide buffer which + * may be 4 bytes less than the kernel will use, resulting in + * userspace memory corruption (which is not detectable by valgrind + * too, in most cases). + * So for now, let's align to 64 instead of HOST_LONG_BITS here, in + * a hope that sizeof(long) won't become >8 any time soon. + * + * Note: the granule of kvm dirty log is qemu_real_host_page_size. + * And mem->memory_size is aligned to it (otherwise this mem can't + * be registered to KVM). + */ + hwaddr bitmap_size = ALIGN(mem->memory_size / qemu_real_host_page_size, + /*HOST_LONG_BITS*/ 64) / 8; + mem->dirty_bmap = g_malloc0(bitmap_size); +} + /** * kvm_physical_sync_dirty_bitmap - Sync dirty bitmap from kernel space * @@ -548,33 +579,23 @@ static int kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml, struct kvm_dirty_log d = {}; KVMSlot *mem; hwaddr start_addr, size; + hwaddr slot_size, slot_offset = 0; int ret = 0; size = kvm_align_section(section, &start_addr); - if (size) { - mem = kvm_lookup_matching_slot(kml, start_addr, size); + while (size) { + MemoryRegionSection subsection = *section; + + slot_size = MIN(kvm_max_slot_size, size); + mem = kvm_lookup_matching_slot(kml, start_addr, slot_size); if (!mem) { /* We don't have a slot if we want to trap every access. */ goto out; } - /* XXX bad kernel interface alert - * For dirty bitmap, kernel allocates array of size aligned to - * bits-per-long. But for case when the kernel is 64bits and - * the userspace is 32bits, userspace can't align to the same - * bits-per-long, since sizeof(long) is different between kernel - * and user space. This way, userspace will provide buffer which - * may be 4 bytes less than the kernel will use, resulting in - * userspace memory corruption (which is not detectable by valgrind - * too, in most cases). - * So for now, let's align to 64 instead of HOST_LONG_BITS here, in - * a hope that sizeof(long) won't become >8 any time soon. - */ - size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS), - /*HOST_LONG_BITS*/ 64) / 8; if (!mem->dirty_bmap) { /* Allocate on the first log_sync, once and for all */ - mem->dirty_bmap = g_malloc0(size); + kvm_memslot_init_dirty_bitmap(mem); } d.dirty_bitmap = mem->dirty_bmap; @@ -585,7 +606,13 @@ static int kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml, goto out; } - kvm_get_dirty_pages_log_range(section, d.dirty_bitmap); + subsection.offset_within_region += slot_offset; + subsection.size = int128_make64(slot_size); + kvm_get_dirty_pages_log_range(&subsection, d.dirty_bitmap); + + slot_offset += slot_size; + start_addr += slot_size; + size -= slot_size; } out: return ret; @@ -974,6 +1001,14 @@ kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list) return NULL; } +void kvm_set_max_memslot_size(hwaddr max_slot_size) +{ + g_assert( + ROUND_UP(max_slot_size, qemu_real_host_page_size) == max_slot_size + ); + kvm_max_slot_size = max_slot_size; +} + static void kvm_set_phys_mem(KVMMemoryListener *kml, MemoryRegionSection *section, bool add) { @@ -981,7 +1016,7 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml, int err; MemoryRegion *mr = section->mr; bool writeable = !mr->readonly && !mr->rom_device; - hwaddr start_addr, size; + hwaddr start_addr, size, slot_size; void *ram; if (!memory_region_is_ram(mr)) { @@ -1006,41 +1041,59 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml, kvm_slots_lock(kml); if (!add) { - mem = kvm_lookup_matching_slot(kml, start_addr, size); - if (!mem) { - goto out; - } - if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) { - kvm_physical_sync_dirty_bitmap(kml, section); - } + do { + slot_size = MIN(kvm_max_slot_size, size); + mem = kvm_lookup_matching_slot(kml, start_addr, slot_size); + if (!mem) { + goto out; + } + if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) { + kvm_physical_sync_dirty_bitmap(kml, section); + } - /* unregister the slot */ - g_free(mem->dirty_bmap); - mem->dirty_bmap = NULL; - mem->memory_size = 0; - mem->flags = 0; - err = kvm_set_user_memory_region(kml, mem, false); - if (err) { - fprintf(stderr, "%s: error unregistering slot: %s\n", - __func__, strerror(-err)); - abort(); - } + /* unregister the slot */ + g_free(mem->dirty_bmap); + mem->dirty_bmap = NULL; + mem->memory_size = 0; + mem->flags = 0; + err = kvm_set_user_memory_region(kml, mem, false); + if (err) { + fprintf(stderr, "%s: error unregistering slot: %s\n", + __func__, strerror(-err)); + abort(); + } + start_addr += slot_size; + size -= slot_size; + } while (size); goto out; } /* register the new slot */ - mem = kvm_alloc_slot(kml); - mem->memory_size = size; - mem->start_addr = start_addr; - mem->ram = ram; - mem->flags = kvm_mem_flags(mr); + do { + slot_size = MIN(kvm_max_slot_size, size); + mem = kvm_alloc_slot(kml); + mem->memory_size = slot_size; + mem->start_addr = start_addr; + mem->ram = ram; + mem->flags = kvm_mem_flags(mr); - err = kvm_set_user_memory_region(kml, mem, true); - if (err) { - fprintf(stderr, "%s: error registering slot: %s\n", __func__, - strerror(-err)); - abort(); - } + if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) { + /* + * Reallocate the bmap; it means it doesn't disappear in + * middle of a migrate. + */ + kvm_memslot_init_dirty_bitmap(mem); + } + err = kvm_set_user_memory_region(kml, mem, true); + if (err) { + fprintf(stderr, "%s: error registering slot: %s\n", __func__, + strerror(-err)); + abort(); + } + start_addr += slot_size; + ram += slot_size; + size -= slot_size; + } while (size); out: kvm_slots_unlock(kml); @@ -2880,6 +2933,7 @@ static bool kvm_accel_has_memory(MachineState *ms, AddressSpace *as, for (i = 0; i < kvm->nr_as; ++i) { if (kvm->as[i].as == as && kvm->as[i].ml) { + size = MIN(kvm_max_slot_size, size); return NULL != kvm_lookup_matching_slot(kvm->as[i].ml, start_addr, size); } diff --git a/hw/vfio/common.c b/hw/vfio/common.c index a86a4c4506558a41a36d6f4a3be0dbec73ad588c..206fb83e28184724bedccf493a6122d59bee8cec 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -310,7 +310,7 @@ bool vfio_mig_active(void) return true; } -static bool vfio_devices_all_stopped_and_saving(VFIOContainer *container) +static bool vfio_devices_all_dirty_tracking(VFIOContainer *container) { VFIOGroup *group; VFIODevice *vbasedev; @@ -328,10 +328,8 @@ static bool vfio_devices_all_stopped_and_saving(VFIOContainer *container) return false; } - if ((migration->device_state & VFIO_DEVICE_STATE_SAVING) && - !(migration->device_state & VFIO_DEVICE_STATE_RUNNING)) { - continue; - } else { + if ((vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF) + && (migration->device_state & VFIO_DEVICE_STATE_RUNNING)) { return false; } } @@ -374,7 +372,7 @@ static int vfio_dma_unmap_bitmap(VFIOContainer *container, { struct vfio_iommu_type1_dma_unmap *unmap; struct vfio_bitmap *bitmap; - uint64_t pages = TARGET_PAGE_ALIGN(size) >> TARGET_PAGE_BITS; + uint64_t pages = REAL_HOST_PAGE_ALIGN(size) / qemu_real_host_page_size; int ret; unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap)); @@ -386,12 +384,12 @@ static int vfio_dma_unmap_bitmap(VFIOContainer *container, bitmap = (struct vfio_bitmap *)&unmap->data; /* - * cpu_physical_memory_set_dirty_lebitmap() expects pages in bitmap of - * TARGET_PAGE_SIZE to mark those dirty. Hence set bitmap_pgsize to - * TARGET_PAGE_SIZE. + * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of + * qemu_real_host_page_size to mark those dirty. Hence set bitmap_pgsize + * to qemu_real_host_page_size. */ - bitmap->pgsize = TARGET_PAGE_SIZE; + bitmap->pgsize = qemu_real_host_page_size; bitmap->size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) / BITS_PER_BYTE; @@ -422,6 +420,29 @@ unmap_exit: return ret; } +static VFIODMARange *vfio_lookup_match_range(VFIOContainer *container, + hwaddr start_addr, hwaddr size) +{ + VFIODMARange *qrange; + + QLIST_FOREACH(qrange, &container->dma_list, next) { + if (qrange->iova == start_addr && qrange->size == size) { + return qrange; + } + } + return NULL; +} + +static void vfio_dma_range_init_dirty_bitmap(VFIODMARange *qrange) +{ + uint64_t pages, size; + + pages = REAL_HOST_PAGE_ALIGN(qrange->size) / qemu_real_host_page_size; + size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) / BITS_PER_BYTE; + + qrange->bitmap = g_malloc0(size); +} + /* * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86 */ @@ -435,12 +456,29 @@ static int vfio_dma_unmap(VFIOContainer *container, .iova = iova, .size = size, }; + VFIODMARange *qrange; if (iotlb && container->dirty_pages_supported && vfio_devices_all_running_and_saving(container)) { return vfio_dma_unmap_bitmap(container, iova, size, iotlb); } + /* + * unregister the DMA range + * + * It seems that the memory layer will give us the same section as the one + * used in region_add(). Otherwise it'll be complicated to manipulate the + * bitmap across region_{add,del}. Is there any guarantee? + * + * But there is really not such a restriction on the kernel interface + * (VFIO_IOMMU_DIRTY_PAGES_FLAG_{UN}MAP_DMA, etc). + */ + qrange = vfio_lookup_match_range(container, iova, size); + assert(qrange); + g_free(qrange->bitmap); + QLIST_REMOVE(qrange, next); + g_free(qrange); + while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) { /* * The type1 backend has an off-by-one bug in the kernel (71a7d3d78e3c @@ -477,6 +515,14 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova, .iova = iova, .size = size, }; + VFIODMARange *qrange; + + qrange = g_malloc0(sizeof(*qrange)); + qrange->iova = iova; + qrange->size = size; + QLIST_INSERT_HEAD(&container->dma_list, qrange, next); + /* XXX allocate the dirty bitmap on demand */ + vfio_dma_range_init_dirty_bitmap(qrange); if (!readonly) { map.flags |= VFIO_DMA_MAP_FLAG_WRITE; @@ -669,16 +715,17 @@ static void vfio_listener_region_add(MemoryListener *listener, return; } - if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) != - (section->offset_within_region & ~TARGET_PAGE_MASK))) { + if (unlikely((section->offset_within_address_space & + ~qemu_real_host_page_mask) != + (section->offset_within_region & ~qemu_real_host_page_mask))) { error_report("%s received unaligned region", __func__); return; } - iova = TARGET_PAGE_ALIGN(section->offset_within_address_space); + iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space); llend = int128_make64(section->offset_within_address_space); llend = int128_add(llend, section->size); - llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK)); + llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask)); if (int128_ge(int128_make64(iova), llend)) { return; @@ -863,8 +910,9 @@ static void vfio_listener_region_del(MemoryListener *listener, return; } - if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) != - (section->offset_within_region & ~TARGET_PAGE_MASK))) { + if (unlikely((section->offset_within_address_space & + ~qemu_real_host_page_mask) != + (section->offset_within_region & ~qemu_real_host_page_mask))) { error_report("%s received unaligned region", __func__); return; } @@ -892,10 +940,10 @@ static void vfio_listener_region_del(MemoryListener *listener, */ } - iova = TARGET_PAGE_ALIGN(section->offset_within_address_space); + iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space); llend = int128_make64(section->offset_within_address_space); llend = int128_add(llend, section->size); - llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK)); + llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask)); if (int128_ge(int128_make64(iova), llend)) { return; @@ -947,37 +995,75 @@ static void vfio_listener_region_del(MemoryListener *listener, } } +static void vfio_set_dirty_page_tracking(VFIOContainer *container, bool start) +{ + int ret; + struct vfio_iommu_type1_dirty_bitmap dirty = { + .argsz = sizeof(dirty), + }; + + if (start) { + dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START; + } else { + dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP; + } + + ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, &dirty); + if (ret) { + error_report("Failed to set dirty tracking flag 0x%x errno: %d", + dirty.flags, errno); + } +} + +static void vfio_listener_log_global_start(MemoryListener *listener) +{ + VFIOContainer *container = container_of(listener, VFIOContainer, listener); + + vfio_set_dirty_page_tracking(container, true); +} + +static void vfio_listener_log_global_stop(MemoryListener *listener) +{ + VFIOContainer *container = container_of(listener, VFIOContainer, listener); + + vfio_set_dirty_page_tracking(container, false); +} + static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, uint64_t size, ram_addr_t ram_addr) { struct vfio_iommu_type1_dirty_bitmap *dbitmap; struct vfio_iommu_type1_dirty_bitmap_get *range; + VFIODMARange *qrange; uint64_t pages; int ret; + qrange = vfio_lookup_match_range(container, iova, size); + /* the same as vfio_dma_unmap() */ + assert(qrange); + dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range)); dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range); - dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP; + dbitmap->flags = container->dirty_log_manual_clear ? + VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR : + VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP; range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data; range->iova = iova; range->size = size; /* - * cpu_physical_memory_set_dirty_lebitmap() expects pages in bitmap of - * TARGET_PAGE_SIZE to mark those dirty. Hence set bitmap's pgsize to - * TARGET_PAGE_SIZE. + * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of + * qemu_real_host_page_size to mark those dirty. Hence set bitmap's pgsize + * to qemu_real_host_page_size. */ - range->bitmap.pgsize = TARGET_PAGE_SIZE; + range->bitmap.pgsize = qemu_real_host_page_size; - pages = TARGET_PAGE_ALIGN(range->size) >> TARGET_PAGE_BITS; + pages = REAL_HOST_PAGE_ALIGN(range->size) / qemu_real_host_page_size; range->bitmap.size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) / BITS_PER_BYTE; - range->bitmap.data = g_try_malloc0(range->bitmap.size); - if (!range->bitmap.data) { - ret = -ENOMEM; - goto err_out; - } + + range->bitmap.data = (__u64 *)qrange->bitmap; ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap); if (ret) { @@ -993,7 +1079,6 @@ static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, trace_vfio_get_dirty_bitmap(container->fd, range->iova, range->size, range->bitmap.size, ram_addr); err_out: - g_free(range->bitmap.data); g_free(dbitmap); return ret; @@ -1074,11 +1159,11 @@ static int vfio_sync_dirty_bitmap(VFIOContainer *container, section->offset_within_region; return vfio_get_dirty_bitmap(container, - TARGET_PAGE_ALIGN(section->offset_within_address_space), - int128_get64(section->size), ram_addr); + REAL_HOST_PAGE_ALIGN(section->offset_within_address_space), + int128_get64(section->size), ram_addr); } -static void vfio_listerner_log_sync(MemoryListener *listener, +static void vfio_listener_log_sync(MemoryListener *listener, MemoryRegionSection *section) { VFIOContainer *container = container_of(listener, VFIOContainer, listener); @@ -1088,15 +1173,153 @@ static void vfio_listerner_log_sync(MemoryListener *listener, return; } - if (vfio_devices_all_stopped_and_saving(container)) { + if (vfio_devices_all_dirty_tracking(container)) { vfio_sync_dirty_bitmap(container, section); } } +/* + * I'm not sure if there's any alignment requirement for the CLEAR_BITMAP + * ioctl. But copy from kvm side and align {start, size} with 64 pages. + * + * I think the code can be simplified a lot if no alignment requirement. + */ +#define VFIO_CLEAR_LOG_SHIFT 6 +#define VFIO_CLEAR_LOG_ALIGN (qemu_real_host_page_size << VFIO_CLEAR_LOG_SHIFT) +#define VFIO_CLEAR_LOG_MASK (-VFIO_CLEAR_LOG_ALIGN) + +static int vfio_log_clear_one_range(VFIOContainer *container, + VFIODMARange *qrange, uint64_t start, uint64_t size) +{ + struct vfio_iommu_type1_dirty_bitmap *dbitmap; + struct vfio_iommu_type1_dirty_bitmap_get *range; + + dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range)); + + dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range); + dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP; + range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data; + + /* + * Now let's deal with the actual bitmap, which is almost the same + * as the kvm side. + */ + uint64_t end, bmap_start, start_delta, bmap_npages; + unsigned long *bmap_clear = NULL, psize = qemu_real_host_page_size; + int ret; + + bmap_start = start & VFIO_CLEAR_LOG_MASK; + start_delta = start - bmap_start; + bmap_start /= psize; + + bmap_npages = DIV_ROUND_UP(size + start_delta, VFIO_CLEAR_LOG_ALIGN) + << VFIO_CLEAR_LOG_SHIFT; + end = qrange->size / psize; + if (bmap_npages > end - bmap_start) { + bmap_npages = end - bmap_start; + } + start_delta /= psize; + + if (start_delta) { + bmap_clear = bitmap_new(bmap_npages); + bitmap_copy_with_src_offset(bmap_clear, qrange->bitmap, + bmap_start, start_delta + size / psize); + bitmap_clear(bmap_clear, 0, start_delta); + range->bitmap.data = (__u64 *)bmap_clear; + } else { + range->bitmap.data = (__u64 *)(qrange->bitmap + BIT_WORD(bmap_start)); + } + + range->iova = qrange->iova + bmap_start * psize; + range->size = bmap_npages * psize; + range->bitmap.size = ROUND_UP(bmap_npages, sizeof(__u64) * BITS_PER_BYTE) / + BITS_PER_BYTE; + range->bitmap.pgsize = qemu_real_host_page_size; + + ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap); + if (ret) { + error_report("Failed to clear dirty log for iova: 0x%"PRIx64 + " size: 0x%"PRIx64" err: %d", (uint64_t)range->iova, + (uint64_t)range->size, errno); + goto err_out; + } + + bitmap_clear(qrange->bitmap, bmap_start + start_delta, size / psize); +err_out: + g_free(bmap_clear); + g_free(dbitmap); + return 0; +} + +static int vfio_physical_log_clear(VFIOContainer *container, + MemoryRegionSection *section) +{ + uint64_t start, size, offset, count; + VFIODMARange *qrange; + int ret = 0; + + if (!container->dirty_log_manual_clear) { + /* No need to do explicit clear */ + return ret; + } + + start = section->offset_within_address_space; + size = int128_get64(section->size); + + if (!size) { + return ret; + } + + QLIST_FOREACH(qrange, &container->dma_list, next) { + /* + * Discard ranges that do not overlap the section (e.g., the + * Memory BAR regions of the device) + */ + if (qrange->iova > start + size - 1 || + start > qrange->iova + qrange->size - 1) { + continue; + } + + if (start >= qrange->iova) { + /* The range starts before section or is aligned to it. */ + offset = start - qrange->iova; + count = MIN(qrange->size - offset, size); + } else { + /* The range starts after section. */ + offset = 0; + count = MIN(qrange->size, size - (qrange->iova - start)); + } + ret = vfio_log_clear_one_range(container, qrange, offset, count); + if (ret < 0) { + break; + } + } + + return ret; +} + +static void vfio_listener_log_clear(MemoryListener *listener, + MemoryRegionSection *section) +{ + VFIOContainer *container = container_of(listener, VFIOContainer, listener); + + if (vfio_listener_skipped_section(section) || + !container->dirty_pages_supported) { + return; + } + + if (vfio_devices_all_dirty_tracking(container)) { + vfio_physical_log_clear(container, section); + } +} + static const MemoryListener vfio_memory_listener = { .region_add = vfio_listener_region_add, .region_del = vfio_listener_region_del, - .log_sync = vfio_listerner_log_sync, + .log_global_start = vfio_listener_log_global_start, + .log_global_stop = vfio_listener_log_global_stop, + .log_sync = vfio_listener_log_sync, + .log_clear = vfio_listener_log_clear, }; static void vfio_listener_release(VFIOContainer *container) @@ -1478,7 +1701,7 @@ static int vfio_get_iommu_type(VFIOContainer *container, static int vfio_init_container(VFIOContainer *container, int group_fd, Error **errp) { - int iommu_type, ret; + int iommu_type, dirty_log_manual_clear, ret; iommu_type = vfio_get_iommu_type(container, errp); if (iommu_type < 0) { @@ -1507,6 +1730,13 @@ static int vfio_init_container(VFIOContainer *container, int group_fd, } container->iommu_type = iommu_type; + + dirty_log_manual_clear = ioctl(container->fd, VFIO_CHECK_EXTENSION, + VFIO_DIRTY_LOG_MANUAL_CLEAR); + if (dirty_log_manual_clear) { + container->dirty_log_manual_clear = dirty_log_manual_clear; + } + return 0; } @@ -1569,10 +1799,10 @@ static void vfio_get_iommu_info_migration(VFIOContainer *container, header); /* - * cpu_physical_memory_set_dirty_lebitmap() expects pages in bitmap of - * TARGET_PAGE_SIZE to mark those dirty. + * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of + * qemu_real_host_page_size to mark those dirty. */ - if (cap_mig->pgsize_bitmap & TARGET_PAGE_SIZE) { + if (cap_mig->pgsize_bitmap & qemu_real_host_page_size) { container->dirty_pages_supported = true; container->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size; container->dirty_pgsizes = cap_mig->pgsize_bitmap; @@ -1645,6 +1875,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, container->dirty_pages_supported = false; QLIST_INIT(&container->giommu_list); QLIST_INIT(&container->hostwin_list); + QLIST_INIT(&container->dma_list); ret = vfio_init_container(container, group->fd, errp); if (ret) { diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index 1a9778448605bfbcbdf7d89c6596de014623cc94..d9e0e12824a92d792e606feb4b7c01bd2490f053 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -395,40 +395,10 @@ static int vfio_load_device_config_state(QEMUFile *f, void *opaque) return qemu_file_get_error(f); } -static int vfio_set_dirty_page_tracking(VFIODevice *vbasedev, bool start) -{ - int ret; - VFIOMigration *migration = vbasedev->migration; - VFIOContainer *container = vbasedev->group->container; - struct vfio_iommu_type1_dirty_bitmap dirty = { - .argsz = sizeof(dirty), - }; - - if (start) { - if (migration->device_state & VFIO_DEVICE_STATE_SAVING) { - dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START; - } else { - return -EINVAL; - } - } else { - dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP; - } - - ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, &dirty); - if (ret) { - error_report("Failed to set dirty tracking flag 0x%x errno: %d", - dirty.flags, errno); - return -errno; - } - return ret; -} - static void vfio_migration_cleanup(VFIODevice *vbasedev) { VFIOMigration *migration = vbasedev->migration; - vfio_set_dirty_page_tracking(vbasedev, false); - if (migration->region.mmaps) { vfio_region_unmap(&migration->region); } @@ -469,11 +439,6 @@ static int vfio_save_setup(QEMUFile *f, void *opaque) return ret; } - ret = vfio_set_dirty_page_tracking(vbasedev, true); - if (ret) { - return ret; - } - qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); ret = qemu_file_get_error(f); @@ -888,7 +853,7 @@ int vfio_migration_probe(VFIODevice *vbasedev, Error **errp) Error *local_err = NULL; int ret = -ENOTSUP; - if (!container->dirty_pages_supported) { + if (!vbasedev->enable_migration || !container->dirty_pages_supported) { goto add_blocker; } @@ -903,8 +868,8 @@ int vfio_migration_probe(VFIODevice *vbasedev, Error **errp) goto add_blocker; } - g_free(info); trace_vfio_migration_probe(vbasedev->name, info->index); + g_free(info); return 0; add_blocker: @@ -928,6 +893,7 @@ void vfio_migration_finalize(VFIODevice *vbasedev) remove_migration_state_change_notifier(&migration->migration_state); qemu_del_vm_change_state_handler(migration->vm_state); + unregister_savevm(vbasedev->dev, "vfio", vbasedev); vfio_migration_exit(vbasedev); } diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index da7c740bce590223cc125972908fee0288d5cccf..3641ad0c5c216a70ae1562f260deecc5286a2476 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -3180,6 +3180,9 @@ static void vfio_instance_init(Object *obj) static Property vfio_pci_dev_properties[] = { DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIOPCIDevice, host), DEFINE_PROP_STRING("sysfsdev", VFIOPCIDevice, vbasedev.sysfsdev), + DEFINE_PROP_ON_OFF_AUTO("x-pre-copy-dirty-page-tracking", VFIOPCIDevice, + vbasedev.pre_copy_dirty_page_tracking, + ON_OFF_AUTO_ON), DEFINE_PROP_ON_OFF_AUTO("display", VFIOPCIDevice, display, ON_OFF_AUTO_OFF), DEFINE_PROP_UINT32("xres", VFIOPCIDevice, display_xres, 0), @@ -3192,6 +3195,8 @@ static Property vfio_pci_dev_properties[] = { VFIO_FEATURE_ENABLE_REQ_BIT, true), DEFINE_PROP_BIT("x-igd-opregion", VFIOPCIDevice, features, VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, false), + DEFINE_PROP_BOOL("x-enable-migration", VFIOPCIDevice, + vbasedev.enable_migration, false), DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false), DEFINE_PROP_BOOL("x-balloon-allowed", VFIOPCIDevice, vbasedev.balloon_allowed, false), diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 048731e81f70bf6e836a81a68729eff9968e577c..1277914ca8fcd89b23505aa801212aad994f7df8 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -76,6 +76,14 @@ typedef struct VFIOAddressSpace { struct VFIOGroup; +typedef struct VFIODMARange { + QLIST_ENTRY(VFIODMARange) next; + hwaddr iova; + size_t size; + void *vaddr; /* unused */ + unsigned long *bitmap; /* dirty bitmap cache for this range */ +} VFIODMARange; + typedef struct VFIOContainer { VFIOAddressSpace *space; int fd; /* /dev/vfio/vfio, empowered by the attached groups */ @@ -85,12 +93,14 @@ typedef struct VFIOContainer { int error; bool initialized; bool dirty_pages_supported; + bool dirty_log_manual_clear; uint64_t dirty_pgsizes; uint64_t max_dirty_bitmap_size; unsigned long pgsizes; QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; QLIST_HEAD(, VFIOGroup) group_list; + QLIST_HEAD(, VFIODMARange) dma_list; QLIST_ENTRY(VFIOContainer) next; } VFIOContainer; @@ -123,12 +133,14 @@ typedef struct VFIODevice { bool needs_reset; bool no_mmap; bool balloon_allowed; + bool enable_migration; VFIODeviceOps *ops; unsigned int num_irqs; unsigned int num_regions; unsigned int flags; VFIOMigration *migration; Error *migration_blocker; + OnOffAuto pre_copy_dirty_page_tracking; } VFIODevice; struct VFIODeviceOps { diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h index 787dbc7770e6726723648d5d535c4b2c296bc588..f8e884f1461d88ba249fca981a3dc956f90290d5 100644 --- a/include/sysemu/kvm_int.h +++ b/include/sysemu/kvm_int.h @@ -43,4 +43,5 @@ typedef struct KVMMemoryListener { void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml, AddressSpace *as, int as_id); +void kvm_set_max_memslot_size(hwaddr max_slot_size); #endif diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h index a90672494dc584fff35d3141e248393d0697fdb7..120387ba5882ba53ace8c1d2423c84cb6ddcde31 100644 --- a/linux-headers/linux/vfio.h +++ b/linux-headers/linux/vfio.h @@ -46,6 +46,16 @@ */ #define VFIO_NOIOMMU_IOMMU 8 +/* + * The vfio_iommu driver may support user clears dirty log manually, which means + * dirty log can be requested to not cleared automatically after dirty log is + * copied to userspace, it's user's duty to clear dirty log. + * + * Note: please refer to VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR and + * VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP. + */ +#define VFIO_DIRTY_LOG_MANUAL_CLEAR 11 + /* * The IOCTL interface is designed for extensibility by embedding the * structure length (argsz) and flags into structures passed between @@ -1074,6 +1084,7 @@ struct vfio_bitmap { * field. No guarantee is made to the user that arbitrary unmaps of iova * or size different from those used in the original mapping call will * succeed. + * * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP should be set to get the dirty bitmap * before unmapping IO virtual addresses. When this flag is set, the user must * provide a struct vfio_bitmap in data[]. User must provide zero-allocated @@ -1133,8 +1144,30 @@ struct vfio_iommu_type1_dma_unmap { * actual bitmap. If dirty pages logging is not enabled, an error will be * returned. * - * Only one of the flags _START, _STOP and _GET may be specified at a time. + * The VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR flag is almost same as + * VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP, except that it requires underlying + * dirty bitmap is not cleared automatically. The user can clear it manually by + * calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP flag set. * + * Calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP flag set, + * instructs the IOMMU driver to clear the dirty status of pages in a bitmap + * for IOMMU container for a given IOVA range. The user must specify the IOVA + * range, the bitmap and the pgsize through the structure + * vfio_iommu_type1_dirty_bitmap_get in the data[] portion. This interface + * supports clearing a bitmap of the smallest supported pgsize only and can be + * modified in future to clear a bitmap of any specified supported pgsize. The + * user must provide a memory area for the bitmap memory and specify its size + * in bitmap.size. One bit is used to represent one page consecutively starting + * from iova offset. The user should provide page size in bitmap.pgsize field. + * A bit set in the bitmap indicates that the page at that offset from iova is + * cleared the dirty status, and dirty tracking is re-enabled for that page. The + * caller must set argsz to a value including the size of structure + * vfio_iommu_dirty_bitmap_get, but excluing the size of the actual bitmap. If + * dirty pages logging is not enabled, an error will be returned. Note: user + * should clear dirty log before handle corresponding dirty pages. + * + * Only one of the flags _START, _STOP, _GET, _GET_NOCLEAR_, and _CLEAR may be + * specified at a time. */ struct vfio_iommu_type1_dirty_bitmap { __u32 argsz; @@ -1142,6 +1175,8 @@ struct vfio_iommu_type1_dirty_bitmap { #define VFIO_IOMMU_DIRTY_PAGES_FLAG_START (1 << 0) #define VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP (1 << 1) #define VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP (1 << 2) +#define VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR (1 << 3) +#define VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP (1 << 4) __u8 data[]; }; diff --git a/memory.c b/memory.c index 44713efc66b2b8ed5ec198841f6daa8ae9777cf2..708b3dff3d8e765055febed969f3f01c64e7dae0 100644 --- a/memory.c +++ b/memory.c @@ -1825,7 +1825,10 @@ bool memory_region_is_ram_device(MemoryRegion *mr) uint8_t memory_region_get_dirty_log_mask(MemoryRegion *mr) { uint8_t mask = mr->dirty_log_mask; - if (global_dirty_log && (mr->ram_block || memory_region_is_iommu(mr))) { + RAMBlock *rb = mr->ram_block; + + if (global_dirty_log && ((rb && qemu_ram_is_migratable(rb)) || + memory_region_is_iommu(mr))) { mask |= (1 << DIRTY_MEMORY_MIGRATION); } return mask; diff --git a/migration/migration.c b/migration/migration.c index 9faf5f63a627a7155f0d2fa1a13c53fa866af4e4..d34e57fc051f2919f5d7278d5ba57b61b9e1698d 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -2541,7 +2541,7 @@ retry: out: res = qemu_file_get_error(rp); if (res) { - if (res == -EIO) { + if (res == -EIO && migration_in_postcopy()) { /* * Maybe there is something we can do: it looks like a * network down issue, and we pause for a recovery. diff --git a/migration/ram.c b/migration/ram.c index 2077ba5be4b917aa2c7f139e69a9502bc2e3c50d..1bd99ff9e5b30bfefe2a17b74442a9c478e8bba0 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -3052,6 +3052,8 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss, int tmppages, pages = 0; size_t pagesize_bits = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; + unsigned long hostpage_boundary = + QEMU_ALIGN_UP(pss->page + 1, pagesize_bits); if (ramblock_is_ignored(pss->block)) { error_report("block %s should not be migrated !", pss->block->idstr); @@ -3060,29 +3062,31 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss, do { /* Check the pages is dirty and if it is send it */ - if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) { - pss->page++; - continue; - } + if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) { + tmppages = ram_save_target_page(rs, pss, last_stage); + if (tmppages < 0) { + return tmppages; + } - tmppages = ram_save_target_page(rs, pss, last_stage); - if (tmppages < 0) { - return tmppages; - } + pages += tmppages; + if (pss->block->unsentmap) { + clear_bit(pss->page, pss->block->unsentmap); + } - pages += tmppages; - if (pss->block->unsentmap) { - clear_bit(pss->page, pss->block->unsentmap); + /* + * Allow rate limiting to happen in the middle of huge pages if + * something is sent in the current iteration. + */ + if (pagesize_bits > 1 && tmppages > 0) { + migration_rate_limit(); + } } - - pss->page++; - /* Allow rate limiting to happen in the middle of huge pages */ - migration_rate_limit(); + pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page); } while ((pss->page & (pagesize_bits - 1)) && offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS)); - /* The offset we leave with is the last one we looked at */ - pss->page--; + /* The offset we leave with is the min boundary of host page and block */ + pss->page = MIN(pss->page, hostpage_boundary) - 1; return pages; } diff --git a/qdev-monitor.c b/qdev-monitor.c index c6c1d3f06a9f2205639d220545e63c28318709b5..ab2bdef105f2227f1c0c02dcf5ab9cec131210f2 100644 --- a/qdev-monitor.c +++ b/qdev-monitor.c @@ -587,7 +587,6 @@ DeviceState *qdev_device_add(QemuOpts *opts, Error **errp) if (path != NULL) { bus = qbus_find(path, errp); if (!bus) { - error_setg(errp, "can not find bus for %s", driver); return NULL; } if (!object_dynamic_cast(OBJECT(bus), dc->bus_type)) {