diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index 84edbe8bb17c52d53e2a1c41756890dc7f266f17..4daff563a09a2f642746b2c4a393abb6631b71f0 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -138,6 +138,7 @@ bool kvm_direct_msi_allowed;
 bool kvm_ioeventfd_any_length_allowed;
 bool kvm_msi_use_devid;
 static bool kvm_immediate_exit;
+static hwaddr kvm_max_slot_size = ~0;
 
 static const KVMCapabilityInfo kvm_required_capabilites[] = {
     KVM_CAP_INFO(USER_MEMORY),
@@ -458,7 +459,7 @@ static int kvm_slot_update_flags(KVMMemoryListener *kml, KVMSlot *mem,
 static int kvm_section_update_flags(KVMMemoryListener *kml,
                                     MemoryRegionSection *section)
 {
-    hwaddr start_addr, size;
+    hwaddr start_addr, size, slot_size;
     KVMSlot *mem;
     int ret = 0;
 
@@ -469,13 +470,18 @@
 
     kvm_slots_lock(kml);
 
-    mem = kvm_lookup_matching_slot(kml, start_addr, size);
-    if (!mem) {
-        /* We don't have a slot if we want to trap every access. */
-        goto out;
-    }
+    while (size && !ret) {
+        slot_size = MIN(kvm_max_slot_size, size);
+        mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
+        if (!mem) {
+            /* We don't have a slot if we want to trap every access. */
+            goto out;
+        }
 
-    ret = kvm_slot_update_flags(kml, mem, section->mr);
+        ret = kvm_slot_update_flags(kml, mem, section->mr);
+        start_addr += slot_size;
+        size -= slot_size;
+    }
 
 out:
     kvm_slots_unlock(kml);
@@ -530,6 +536,31 @@ static int kvm_get_dirty_pages_log_range(MemoryRegionSection *section,
 
 #define ALIGN(x, y) (((x)+(y)-1) & ~((y)-1))
 
+/* Allocate the dirty bitmap for a slot */
+static void kvm_memslot_init_dirty_bitmap(KVMSlot *mem)
+{
+    /*
+     * XXX bad kernel interface alert
+     * For dirty bitmap, kernel allocates array of size aligned to
+     * bits-per-long.  But for case when the kernel is 64bits and
+     * the userspace is 32bits, userspace can't align to the same
+     * bits-per-long, since sizeof(long) is different between kernel
+     * and user space.  This way, userspace will provide buffer which
+     * may be 4 bytes less than the kernel will use, resulting in
+     * userspace memory corruption (which is not detectable by valgrind
+     * too, in most cases).
+     * So for now, let's align to 64 instead of HOST_LONG_BITS here, in
+     * a hope that sizeof(long) won't become >8 any time soon.
+     *
+     * Note: the granule of kvm dirty log is qemu_real_host_page_size.
+     * And mem->memory_size is aligned to it (otherwise this mem can't
+     * be registered to KVM).
+     */
+    hwaddr bitmap_size = ALIGN(mem->memory_size / qemu_real_host_page_size,
+                               /*HOST_LONG_BITS*/ 64) / 8;
+
+    mem->dirty_bmap = g_malloc0(bitmap_size);
+}
+
 /**
  * kvm_physical_sync_dirty_bitmap - Sync dirty bitmap from kernel space
  *
@@ -548,33 +579,23 @@ static int kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml,
     struct kvm_dirty_log d = {};
     KVMSlot *mem;
     hwaddr start_addr, size;
+    hwaddr slot_size, slot_offset = 0;
     int ret = 0;
 
     size = kvm_align_section(section, &start_addr);
-    if (size) {
-        mem = kvm_lookup_matching_slot(kml, start_addr, size);
+    while (size) {
+        MemoryRegionSection subsection = *section;
+
+        slot_size = MIN(kvm_max_slot_size, size);
+        mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
         if (!mem) {
             /* We don't have a slot if we want to trap every access. */
             goto out;
         }
 
-        /* XXX bad kernel interface alert
-         * For dirty bitmap, kernel allocates array of size aligned to
-         * bits-per-long.  But for case when the kernel is 64bits and
-         * the userspace is 32bits, userspace can't align to the same
-         * bits-per-long, since sizeof(long) is different between kernel
-         * and user space.  This way, userspace will provide buffer which
-         * may be 4 bytes less than the kernel will use, resulting in
-         * userspace memory corruption (which is not detectable by valgrind
-         * too, in most cases).
-         * So for now, let's align to 64 instead of HOST_LONG_BITS here, in
-         * a hope that sizeof(long) won't become >8 any time soon.
-         */
-        size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS),
-                     /*HOST_LONG_BITS*/ 64) / 8;
         if (!mem->dirty_bmap) {
             /* Allocate on the first log_sync, once and for all */
-            mem->dirty_bmap = g_malloc0(size);
+            kvm_memslot_init_dirty_bitmap(mem);
         }
 
         d.dirty_bitmap = mem->dirty_bmap;
@@ -585,7 +606,13 @@ static int kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml,
             goto out;
         }
 
-        kvm_get_dirty_pages_log_range(section, d.dirty_bitmap);
+        subsection.offset_within_region += slot_offset;
+        subsection.size = int128_make64(slot_size);
+        kvm_get_dirty_pages_log_range(&subsection, d.dirty_bitmap);
+
+        slot_offset += slot_size;
+        start_addr += slot_size;
+        size -= slot_size;
     }
 out:
     return ret;
@@ -974,6 +1001,14 @@ kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list)
     return NULL;
 }
 
+void kvm_set_max_memslot_size(hwaddr max_slot_size)
+{
+    g_assert(
+        ROUND_UP(max_slot_size, qemu_real_host_page_size) == max_slot_size
+    );
+    kvm_max_slot_size = max_slot_size;
+}
+
 static void kvm_set_phys_mem(KVMMemoryListener *kml,
                              MemoryRegionSection *section, bool add)
 {
@@ -981,7 +1016,7 @@
     int err;
     MemoryRegion *mr = section->mr;
     bool writeable = !mr->readonly && !mr->rom_device;
-    hwaddr start_addr, size;
+    hwaddr start_addr, size, slot_size;
     void *ram;
 
     if (!memory_region_is_ram(mr)) {
@@ -1006,41 +1041,59 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
     kvm_slots_lock(kml);
 
     if (!add) {
-        mem = kvm_lookup_matching_slot(kml, start_addr, size);
-        if (!mem) {
-            goto out;
-        }
-        if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
-            kvm_physical_sync_dirty_bitmap(kml, section);
-        }
+        do {
+            slot_size = MIN(kvm_max_slot_size, size);
+            mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
+            if (!mem) {
+                goto out;
+            }
+            if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
+                kvm_physical_sync_dirty_bitmap(kml, section);
+            }
 
-        /* unregister the slot */
-        g_free(mem->dirty_bmap);
-        mem->dirty_bmap = NULL;
-        mem->memory_size = 0;
-        mem->flags = 0;
-        err = kvm_set_user_memory_region(kml, mem, false);
-        if (err) {
-            fprintf(stderr, "%s: error unregistering slot: %s\n",
-                    __func__, strerror(-err));
-            abort();
-        }
+            /* unregister the slot */
+            g_free(mem->dirty_bmap);
+            mem->dirty_bmap = NULL;
+            mem->memory_size = 0;
+            mem->flags = 0;
+            err = kvm_set_user_memory_region(kml, mem, false);
+            if (err) {
+                fprintf(stderr, "%s: error unregistering slot: %s\n",
+                        __func__, strerror(-err));
+                abort();
+            }
+            start_addr += slot_size;
+            size -= slot_size;
+        } while (size);
         goto out;
     }
 
     /* register the new slot */
-    mem = kvm_alloc_slot(kml);
-    mem->memory_size = size;
-    mem->start_addr = start_addr;
-    mem->ram = ram;
-    mem->flags = kvm_mem_flags(mr);
+    do {
+        slot_size = MIN(kvm_max_slot_size, size);
+        mem = kvm_alloc_slot(kml);
+        mem->memory_size = slot_size;
+        mem->start_addr = start_addr;
+        mem->ram = ram;
+        mem->flags = kvm_mem_flags(mr);
 
-    err = kvm_set_user_memory_region(kml, mem, true);
-    if (err) {
-        fprintf(stderr, "%s: error registering slot: %s\n", __func__,
-                strerror(-err));
-        abort();
-    }
+        if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
+            /*
+             * Reallocate the bmap; it means it doesn't disappear in
+             * middle of a migrate.
+             */
+            kvm_memslot_init_dirty_bitmap(mem);
+        }
+        err = kvm_set_user_memory_region(kml, mem, true);
+        if (err) {
+            fprintf(stderr, "%s: error registering slot: %s\n", __func__,
+                    strerror(-err));
+            abort();
+        }
+        start_addr += slot_size;
+        ram += slot_size;
+        size -= slot_size;
+    } while (size);
 
 out:
     kvm_slots_unlock(kml);
@@ -2880,6 +2933,7 @@ static bool kvm_accel_has_memory(MachineState *ms, AddressSpace *as,
 
     for (i = 0; i < kvm->nr_as; ++i) {
         if (kvm->as[i].as == as && kvm->as[i].ml) {
+            size = MIN(kvm_max_slot_size, size);
            return NULL != kvm_lookup_matching_slot(kvm->as[i].ml,
                                                    start_addr, size);
        }
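
All of the kvm-all.c hunks above apply the same pattern: any operation on a
memory section is repeated per sub-slot of at most kvm_max_slot_size bytes.
Distilled to a minimal standalone sketch (the helper name is hypothetical,
not part of the patch):

    static void for_each_subslot(hwaddr start_addr, hwaddr size,
                                 void (*fn)(hwaddr start_addr, hwaddr size))
    {
        /* Visit [start_addr, start_addr + size) in kvm_max_slot_size chunks */
        while (size) {
            hwaddr slot_size = MIN(kvm_max_slot_size, size);

            fn(start_addr, slot_size);
            start_addr += slot_size;
            size -= slot_size;
        }
    }

kvm_max_slot_size defaults to ~0, so nothing changes unless a target calls
kvm_set_max_memslot_size(); the expected caller is a target that caps memslot
sizes (upstream, s390x with KVM_SLOT_MAX_BYTES).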
diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c
index 717d22bcbe78c62b339fe0091876aed229e79d75..de9468d33fb9567403ec4885a65a8e1cb6c3380a 100644
--- a/hw/arm/smmu-common.c
+++ b/hw/arm/smmu-common.c
@@ -395,6 +395,7 @@ static void smmu_unmap_notifier_range(IOMMUNotifier *n)
     entry.iova = n->start;
     entry.perm = IOMMU_NONE;
     entry.addr_mask = n->end - n->start;
+    entry.granularity = IOMMU_INV_GRAN_DOMAIN;
 
     memory_region_notify_one(n, &entry);
 }
diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
index e96d5beb9a8322cfd4bc3cb46d9bae54d12ffd37..f383143db135f0fd5812856136e70f8856bf8dbb 100644
--- a/hw/arm/smmuv3.c
+++ b/hw/arm/smmuv3.c
@@ -16,6 +16,10 @@
  * with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
+#ifdef __linux__
+#include "linux/iommu.h"
+#endif
+
 #include "qemu/osdep.h"
 #include "hw/boards.h"
 #include "sysemu/sysemu.h"
@@ -254,8 +258,9 @@ static void smmuv3_init_regs(SMMUv3State *s)
     s->idr[1] = FIELD_DP32(s->idr[1], IDR1, EVENTQS, SMMU_EVENTQS);
     s->idr[1] = FIELD_DP32(s->idr[1], IDR1, CMDQS, SMMU_CMDQS);
 
-    /* 4K and 64K granule support */
+    /* 4K, 16K and 64K granule support */
     s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN4K, 1);
+    s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN16K, 1);
     s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN64K, 1);
     s->idr[5] = FIELD_DP32(s->idr[5], IDR5, OAS, SMMU_IDR5_OAS); /* 44 bits */
@@ -351,6 +356,7 @@ static int decode_ste(SMMUv3State *s, SMMUTransCfg *cfg,
                       "SMMUv3 S1 stalling fault model not allowed yet\n");
         goto bad_ste;
     }
+    cfg->s1ctxptr = STE_CTXPTR(ste);
     return 0;
 
 bad_ste:
@@ -480,7 +486,8 @@ static int decode_cd(SMMUTransCfg *cfg, CD *cd, SMMUEventInfo *event)
 
         tg = CD_TG(cd, i);
         tt->granule_sz = tg2granule(tg, i);
-        if ((tt->granule_sz != 12 && tt->granule_sz != 16) || CD_ENDI(cd)) {
+        if ((tt->granule_sz != 12 && tt->granule_sz != 14 &&
+             tt->granule_sz != 16) || CD_ENDI(cd)) {
             goto bad_cd;
         }
@@ -792,7 +799,7 @@ epilogue:
 static void smmuv3_notify_iova(IOMMUMemoryRegion *mr,
                                IOMMUNotifier *n,
                                int asid,
-                               dma_addr_t iova)
+                               dma_addr_t iova, bool leaf)
 {
     SMMUDevice *sdev = container_of(mr, SMMUDevice, iommu);
     SMMUEventInfo event = {};
@@ -821,12 +828,39 @@ static void smmuv3_notify_iova(IOMMUMemoryRegion *mr,
     entry.iova = iova;
     entry.addr_mask = (1 << tt->granule_sz) - 1;
     entry.perm = IOMMU_NONE;
+    entry.flags = IOMMU_INV_FLAGS_ARCHID;
+    entry.arch_id = asid;
+    entry.leaf = leaf;
+
+    memory_region_notify_one(n, &entry);
+}
+
+/**
+ * smmuv3_notify_asid - call the notifier @n for a given asid
+ *
+ * @mr: IOMMU mr region handle
+ * @n: notifier to be called
+ * @asid: address space ID or negative value if we don't care
+ */
+static void smmuv3_notify_asid(IOMMUMemoryRegion *mr,
+                               IOMMUNotifier *n, int asid)
+{
+    IOMMUTLBEntry entry;
+
+    entry.target_as = &address_space_memory;
+    entry.perm = IOMMU_NONE;
+    entry.granularity = IOMMU_INV_GRAN_PASID;
+    entry.flags = IOMMU_INV_FLAGS_ARCHID;
+    entry.arch_id = asid;
+    entry.iova = n->start;
+    entry.addr_mask = n->end - n->start;
 
     memory_region_notify_one(n, &entry);
 }
 
 /* invalidate an asid/iova tuple in all mr's */
-static void smmuv3_inv_notifiers_iova(SMMUState *s, int asid, dma_addr_t iova)
+static void smmuv3_inv_notifiers_iova(SMMUState *s, int asid, dma_addr_t iova,
+                                      bool leaf)
 {
     SMMUDevice *sdev;
 
@@ -837,11 +871,85 @@ static void smmuv3_inv_notifiers_iova(SMMUState *s, int asid, dma_addr_t iova)
         trace_smmuv3_inv_notifiers_iova(mr->parent_obj.name, asid, iova);
 
         IOMMU_NOTIFIER_FOREACH(n, mr) {
-            smmuv3_notify_iova(mr, n, asid, iova);
+            smmuv3_notify_iova(mr, n, asid, iova, leaf);
         }
     }
 }
 
+static int smmuv3_notify_config_change(SMMUState *bs, uint32_t sid)
+{
+#ifdef __linux__
+    IOMMUMemoryRegion *mr = smmu_iommu_mr(bs, sid);
+    SMMUEventInfo event = {.type = SMMU_EVT_NONE, .sid = sid};
+    IOMMUConfig iommu_config = {};
+    SMMUTransCfg *cfg;
+    SMMUDevice *sdev;
+    int ret;
+
+    if (!mr) {
+        return 0;
+    }
+
+    sdev = container_of(mr, SMMUDevice, iommu);
+
+    /* flush QEMU config cache */
+    smmuv3_flush_config(sdev);
+
+    if (!pci_device_is_pasid_ops_set(sdev->bus, sdev->devfn)) {
+        return 0;
+    }
+
+    cfg = smmuv3_get_config(sdev, &event);
+
+    if (!cfg) {
+        return 0;
+    }
+
+    iommu_config.pasid_cfg.argsz = sizeof(struct iommu_pasid_table_config);
+    iommu_config.pasid_cfg.version = PASID_TABLE_CFG_VERSION_1;
+    iommu_config.pasid_cfg.format = IOMMU_PASID_FORMAT_SMMUV3;
+    iommu_config.pasid_cfg.base_ptr = cfg->s1ctxptr;
+    iommu_config.pasid_cfg.pasid_bits = 0;
+    iommu_config.pasid_cfg.vendor_data.smmuv3.version =
+        PASID_TABLE_SMMUV3_CFG_VERSION_1;
+
+    if (cfg->disabled || cfg->bypassed) {
+        iommu_config.pasid_cfg.config = IOMMU_PASID_CONFIG_BYPASS;
+    } else if (cfg->aborted) {
+        iommu_config.pasid_cfg.config = IOMMU_PASID_CONFIG_ABORT;
+    } else {
+        iommu_config.pasid_cfg.config = IOMMU_PASID_CONFIG_TRANSLATE;
+    }
+
+    trace_smmuv3_notify_config_change(mr->parent_obj.name,
+                                      iommu_config.pasid_cfg.config,
+                                      iommu_config.pasid_cfg.base_ptr);
+
+    ret = pci_device_set_pasid_table(sdev->bus, sdev->devfn, &iommu_config);
+    if (ret) {
+        error_report("Failed to pass PASID table to host for iommu mr %s (%m)",
+                     mr->parent_obj.name);
+    }
+
+    return ret;
+#else
+    return 0;
+#endif
+}
+
+static void smmuv3_s1_asid_inval(SMMUState *s, uint16_t asid)
+{
+    SMMUDevice *sdev;
+
+    trace_smmuv3_s1_asid_inval(asid);
+    QLIST_FOREACH(sdev, &s->devices_with_notifiers, next) {
+        IOMMUMemoryRegion *mr = &sdev->iommu;
+        IOMMUNotifier *n;
+
+        IOMMU_NOTIFIER_FOREACH(n, mr) {
+            smmuv3_notify_asid(mr, n, asid);
+        }
+    }
+    smmu_iotlb_inv_asid(s, asid);
+}
+
 static int smmuv3_cmdq_consume(SMMUv3State *s)
 {
     SMMUState *bs = ARM_SMMU(s);
@@ -892,22 +1000,14 @@ static int smmuv3_cmdq_consume(SMMUv3State *s)
         case SMMU_CMD_CFGI_STE:
         {
             uint32_t sid = CMD_SID(&cmd);
-            IOMMUMemoryRegion *mr = smmu_iommu_mr(bs, sid);
-            SMMUDevice *sdev;
 
             if (CMD_SSEC(&cmd)) {
                 cmd_error = SMMU_CERROR_ILL;
                 break;
             }
 
-            if (!mr) {
-                break;
-            }
-
             trace_smmuv3_cmdq_cfgi_ste(sid);
-            sdev = container_of(mr, SMMUDevice, iommu);
-            smmuv3_flush_config(sdev);
-
+            smmuv3_notify_config_change(bs, sid);
             break;
         }
         case SMMU_CMD_CFGI_STE_RANGE: /* same as SMMU_CMD_CFGI_ALL */
@@ -924,14 +1024,7 @@ static int smmuv3_cmdq_consume(SMMUv3State *s)
 
             trace_smmuv3_cmdq_cfgi_ste_range(start, end);
             for (i = start; i <= end; i++) {
-                IOMMUMemoryRegion *mr = smmu_iommu_mr(bs, i);
-                SMMUDevice *sdev;
-
-                if (!mr) {
-                    continue;
-                }
-                sdev = container_of(mr, SMMUDevice, iommu);
-                smmuv3_flush_config(sdev);
+                smmuv3_notify_config_change(bs, i);
             }
             break;
         }
@@ -961,8 +1054,7 @@ static int smmuv3_cmdq_consume(SMMUv3State *s)
             uint16_t asid = CMD_ASID(&cmd);
 
             trace_smmuv3_cmdq_tlbi_nh_asid(asid);
-            smmu_inv_notifiers_all(&s->smmu_state);
-            smmu_iotlb_inv_asid(bs, asid);
+            smmuv3_s1_asid_inval(bs, asid);
             break;
         }
         case SMMU_CMD_TLBI_NH_ALL:
@@ -975,9 +1067,10 @@ static int smmuv3_cmdq_consume(SMMUv3State *s)
         {
             dma_addr_t addr = CMD_ADDR(&cmd);
             uint16_t vmid = CMD_VMID(&cmd);
+            bool leaf = CMD_LEAF(&cmd);
 
             trace_smmuv3_cmdq_tlbi_nh_vaa(vmid, addr);
-            smmuv3_inv_notifiers_iova(bs, -1, addr);
+            smmuv3_inv_notifiers_iova(bs, -1, addr, leaf);
             smmu_iotlb_inv_all(bs);
             break;
         }
@@ -989,7 +1082,7 @@ static int smmuv3_cmdq_consume(SMMUv3State *s)
             bool leaf = CMD_LEAF(&cmd);
 
             trace_smmuv3_cmdq_tlbi_nh_va(vmid, asid, addr, leaf);
-            smmuv3_inv_notifiers_iova(bs, asid, addr);
+            smmuv3_inv_notifiers_iova(bs, asid, addr, leaf);
             smmu_iotlb_inv_iova(bs, asid, addr);
             break;
         }
@@ -1405,6 +1498,24 @@ static void smmu_realize(DeviceState *d, Error **errp)
     smmu_init_irq(s, dev);
 }
 
+static int smmuv3_post_load(void *opaque, int version_id)
+{
+    SMMUv3State *s3 = opaque;
+    SMMUState *s = &(s3->smmu_state);
+    SMMUDevice *sdev;
+    int ret = 0;
+
+    QLIST_FOREACH(sdev, &s->devices_with_notifiers, next) {
+        uint32_t sid = smmu_get_sid(sdev);
+        ret = smmuv3_notify_config_change(s, sid);
+        if (ret) {
+            break;
+        }
+    }
+
+    return ret;
+}
+
 static const VMStateDescription vmstate_smmuv3_queue = {
     .name = "smmuv3_queue",
     .version_id = 1,
@@ -1422,6 +1533,8 @@ static const VMStateDescription vmstate_smmuv3 = {
     .name = "smmuv3",
     .version_id = 1,
     .minimum_version_id = 1,
+    .priority = MIG_PRI_IOMMU,
+    .post_load = smmuv3_post_load,
     .fields = (VMStateField[]) {
         VMSTATE_UINT32(features, SMMUv3State),
         VMSTATE_UINT8(sid_size, SMMUv3State),
@@ -1473,14 +1586,6 @@ static void smmuv3_notify_flag_changed(IOMMUMemoryRegion *iommu,
     SMMUv3State *s3 = sdev->smmu;
     SMMUState *s = &(s3->smmu_state);
 
-    if (new & IOMMU_NOTIFIER_MAP) {
-        int bus_num = pci_bus_num(sdev->bus);
-        PCIDevice *pcidev = pci_find_device(sdev->bus, bus_num, sdev->devfn);
-
-        warn_report("SMMUv3 does not support notification on MAP: "
-                    "device %s will not function properly", pcidev->name);
-    }
-
     if (old == IOMMU_NOTIFIER_NONE) {
         trace_smmuv3_notify_flag_add(iommu->parent_obj.name);
         QLIST_INSERT_HEAD(&s->devices_with_notifiers, sdev, next);
@@ -1490,6 +1595,90 @@ static void smmuv3_notify_flag_changed(IOMMUMemoryRegion *iommu,
     }
 }
 
+static int smmuv3_get_attr(IOMMUMemoryRegion *iommu,
+                           enum IOMMUMemoryRegionAttr attr,
+                           void *data)
+{
+    if (attr == IOMMU_ATTR_VFIO_NESTED) {
+        *(bool *) data = true;
+        return 0;
+    } else if (attr == IOMMU_ATTR_MSI_TRANSLATE) {
+        *(bool *) data = true;
+        return 0;
+    }
+    return -EINVAL;
+}
+
+struct iommu_fault;
+
+static inline int
+smmuv3_inject_faults(IOMMUMemoryRegion *iommu_mr, int count,
+                     struct iommu_fault *buf)
+{
+#ifdef __linux__
+    SMMUDevice *sdev = container_of(iommu_mr, SMMUDevice, iommu);
+    SMMUv3State *s3 = sdev->smmu;
+    uint32_t sid = smmu_get_sid(sdev);
+    int i;
+
+    for (i = 0; i < count; i++) {
+        SMMUEventInfo info = {};
+        struct iommu_fault_unrecoverable *record;
+
+        if (buf[i].type != IOMMU_FAULT_DMA_UNRECOV) {
+            continue;
+        }
+
+        info.sid = sid;
+        record = &buf[i].event;
+
+        switch (record->reason) {
+        case IOMMU_FAULT_REASON_PASID_INVALID:
+            info.type = SMMU_EVT_C_BAD_SUBSTREAMID;
+            /* TODO further fill info.u.c_bad_substream */
+            break;
+        case IOMMU_FAULT_REASON_PASID_FETCH:
+            info.type = SMMU_EVT_F_CD_FETCH;
+            break;
+        case IOMMU_FAULT_REASON_BAD_PASID_ENTRY:
+            info.type = SMMU_EVT_C_BAD_CD;
+            /* TODO further fill info.u.c_bad_cd */
+            break;
+        case IOMMU_FAULT_REASON_WALK_EABT:
+            info.type = SMMU_EVT_F_WALK_EABT;
+            info.u.f_walk_eabt.addr = record->addr;
+            info.u.f_walk_eabt.addr2 = record->fetch_addr;
+            break;
+        case IOMMU_FAULT_REASON_PTE_FETCH:
+            info.type = SMMU_EVT_F_TRANSLATION;
+            info.u.f_translation.addr = record->addr;
+            break;
+        case IOMMU_FAULT_REASON_OOR_ADDRESS:
+            info.type = SMMU_EVT_F_ADDR_SIZE;
+            info.u.f_addr_size.addr = record->addr;
+            break;
+        case IOMMU_FAULT_REASON_ACCESS:
+            info.type = SMMU_EVT_F_ACCESS;
+            info.u.f_access.addr = record->addr;
+            break;
+        case IOMMU_FAULT_REASON_PERMISSION:
+            info.type = SMMU_EVT_F_PERMISSION;
+            info.u.f_permission.addr = record->addr;
+            break;
+        default:
+            warn_report("%s Unexpected fault reason received from host: %d",
+                        __func__, record->reason);
+            continue;
+        }
+
+        smmuv3_record_event(s3, &info);
+    }
+    return 0;
+#else
+    return -1;
+#endif
+}
+
 static void smmuv3_iommu_memory_region_class_init(ObjectClass *klass,
                                                   void *data)
 {
@@ -1497,6 +1686,8 @@ static void smmuv3_iommu_memory_region_class_init(ObjectClass *klass,
 
     imrc->translate = smmuv3_translate;
     imrc->notify_flag_changed = smmuv3_notify_flag_changed;
+    imrc->get_attr = smmuv3_get_attr;
+    imrc->inject_faults = smmuv3_inject_faults;
 }
 
 static const TypeInfo smmuv3_type_info = {
diff --git a/hw/arm/trace-events b/hw/arm/trace-events
index 0acedcedc6f3e6f573dea16ee4414e27afb1cee9..cbbe2ccafd5c9eccfcf66a201d3213cb021c5f89 100644
--- a/hw/arm/trace-events
+++ b/hw/arm/trace-events
@@ -44,6 +44,7 @@ smmuv3_config_cache_hit(uint32_t sid, uint32_t hits, uint32_t misses, uint32_t p
 smmuv3_config_cache_miss(uint32_t sid, uint32_t hits, uint32_t misses, uint32_t perc) "Config cache MISS for sid %d (hits=%d, misses=%d, hit rate=%d)"
 smmuv3_cmdq_tlbi_nh_va(int vmid, int asid, uint64_t addr, bool leaf) "vmid =%d asid =%d addr=0x%"PRIx64" leaf=%d"
 smmuv3_cmdq_tlbi_nh_vaa(int vmid, uint64_t addr) "vmid =%d addr=0x%"PRIx64
+smmuv3_s1_asid_inval(int asid) "asid=%d"
 smmuv3_cmdq_tlbi_nh(void) ""
 smmuv3_cmdq_tlbi_nh_asid(uint16_t asid) "asid=%d"
 smmu_iotlb_cache_hit(uint16_t asid, uint64_t addr, uint32_t hit, uint32_t miss, uint32_t p) "IOTLB cache HIT asid=%d addr=0x%"PRIx64" hit=%d miss=%d hit rate=%d"
@@ -52,4 +53,4 @@ smmuv3_config_cache_inv(uint32_t sid) "Config cache INV for sid %d"
 smmuv3_notify_flag_add(const char *iommu) "ADD SMMUNotifier node for iommu mr=%s"
 smmuv3_notify_flag_del(const char *iommu) "DEL SMMUNotifier node for iommu mr=%s"
 smmuv3_inv_notifiers_iova(const char *name, uint16_t asid, uint64_t iova) "iommu mr=%s asid=%d iova=0x%"PRIx64
-
+smmuv3_notify_config_change(const char *name, uint8_t config, uint64_t s1ctxptr) "iommu mr=%s config=%d s1ctxptr=0x%"PRIx64
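
The SMMU invalidation path above rides entirely on the extended IOMMUTLBEntry.
For orientation, this is roughly the payload smmuv3_notify_iova() emits for a
TLBI_NH_VA command on a 4K-granule context; a sketch with made-up values, not
code from the patch:

    IOMMUTLBEntry entry = {
        .target_as = &address_space_memory,
        .iova      = 0x8000f000,             /* IOVA being invalidated */
        .addr_mask = (1 << 12) - 1,          /* one 4K granule */
        .perm      = IOMMU_NONE,             /* UNMAP-style notification */
        .flags     = IOMMU_INV_FLAGS_ARCHID, /* arch_id below is valid */
        .arch_id   = 5,                      /* guest ASID */
        .leaf      = true,                   /* CMD_LEAF: last-level entries only */
    };
    memory_region_notify_one(n, &entry);

Consumers that only understand plain unmaps can ignore the extra fields; the
nested-mode VFIO notifier added later in this series switches on
entry.granularity to build the matching iommu_cache_invalidate_info.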
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index e74143ccc36c1b669a08468ea10c54a562146de6..a8b3d1c071eb9e9e03340a6972728f292fe6a35a 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -2626,6 +2626,56 @@ void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, void *opaque)
     bus->iommu_opaque = opaque;
 }
 
+void pci_setup_pasid_ops(PCIDevice *dev, PCIPASIDOps *ops)
+{
+    assert(ops && !dev->pasid_ops);
+    dev->pasid_ops = ops;
+}
+
+bool pci_device_is_pasid_ops_set(PCIBus *bus, int32_t devfn)
+{
+    PCIDevice *dev;
+
+    if (!bus) {
+        return false;
+    }
+
+    dev = bus->devices[devfn];
+    return !!(dev && dev->pasid_ops);
+}
+
+int pci_device_set_pasid_table(PCIBus *bus, int32_t devfn,
+                               IOMMUConfig *config)
+{
+    PCIDevice *dev;
+
+    if (!bus) {
+        return -EINVAL;
+    }
+
+    dev = bus->devices[devfn];
+    if (dev && dev->pasid_ops && dev->pasid_ops->set_pasid_table) {
+        return dev->pasid_ops->set_pasid_table(bus, devfn, config);
+    }
+    return -ENOENT;
+}
+
+int pci_device_return_page_response(PCIBus *bus, int32_t devfn,
+                                    IOMMUPageResponse *resp)
+{
+    PCIDevice *dev;
+
+    if (!bus) {
+        return -EINVAL;
+    }
+
+    dev = bus->devices[devfn];
+    if (dev && dev->pasid_ops && dev->pasid_ops->return_page_response) {
+        return dev->pasid_ops->return_page_response(bus, devfn, resp);
+    }
+    return -ENOENT;
+}
+
 static void pci_dev_get_w64(PCIBus *b, PCIDevice *dev, void *opaque)
 {
     Range *range = opaque;
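
The PASID plumbing above is deliberately generic: whoever owns the device
registers a PCIPASIDOps and receives the table updates that
smmuv3_notify_config_change() pushes. A minimal sketch of the consumer side
(the "mydev" names are hypothetical; the real consumer, vfio-pci, is outside
this excerpt):

    static int mydev_set_pasid_table(PCIBus *bus, int32_t devfn,
                                     IOMMUConfig *config)
    {
        /* forward config->pasid_cfg to the host IOMMU backend here */
        return 0;
    }

    static PCIPASIDOps mydev_pasid_ops = {
        .set_pasid_table = mydev_set_pasid_table,
    };

    static void mydev_realize(PCIDevice *pdev, Error **errp)
    {
        /* ... usual realize work ... */
        pci_setup_pasid_ops(pdev, &mydev_pasid_ops);
    }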
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index a86a4c4506558a41a36d6f4a3be0dbec73ad588c..fb7ca6374858418bd792bf03c26bb7813b58b41e 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -310,7 +310,7 @@ bool vfio_mig_active(void)
     return true;
 }
 
-static bool vfio_devices_all_stopped_and_saving(VFIOContainer *container)
+static bool vfio_devices_all_dirty_tracking(VFIOContainer *container)
 {
     VFIOGroup *group;
     VFIODevice *vbasedev;
@@ -328,10 +328,8 @@ static bool vfio_devices_all_stopped_and_saving(VFIOContainer *container)
             return false;
         }
 
-        if ((migration->device_state & VFIO_DEVICE_STATE_SAVING) &&
-            !(migration->device_state & VFIO_DEVICE_STATE_RUNNING)) {
-            continue;
-        } else {
+        if ((vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF)
+            && (migration->device_state & VFIO_DEVICE_STATE_RUNNING)) {
             return false;
         }
     }
@@ -374,7 +372,7 @@ static int vfio_dma_unmap_bitmap(VFIOContainer *container,
 {
     struct vfio_iommu_type1_dma_unmap *unmap;
     struct vfio_bitmap *bitmap;
-    uint64_t pages = TARGET_PAGE_ALIGN(size) >> TARGET_PAGE_BITS;
+    uint64_t pages = REAL_HOST_PAGE_ALIGN(size) / qemu_real_host_page_size;
     int ret;
 
     unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap));
@@ -386,12 +384,12 @@ static int vfio_dma_unmap_bitmap(VFIOContainer *container,
     bitmap = (struct vfio_bitmap *)&unmap->data;
 
     /*
-     * cpu_physical_memory_set_dirty_lebitmap() expects pages in bitmap of
-     * TARGET_PAGE_SIZE to mark those dirty. Hence set bitmap_pgsize to
-     * TARGET_PAGE_SIZE.
+     * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
+     * qemu_real_host_page_size to mark those dirty. Hence set bitmap_pgsize
+     * to qemu_real_host_page_size.
      */
-    bitmap->pgsize = TARGET_PAGE_SIZE;
+    bitmap->pgsize = qemu_real_host_page_size;
     bitmap->size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) /
                    BITS_PER_BYTE;
 
@@ -422,6 +420,29 @@ unmap_exit:
     return ret;
 }
 
+static VFIODMARange *vfio_lookup_match_range(VFIOContainer *container,
+                                             hwaddr start_addr, hwaddr size)
+{
+    VFIODMARange *qrange;
+
+    QLIST_FOREACH(qrange, &container->dma_list, next) {
+        if (qrange->iova == start_addr && qrange->size == size) {
+            return qrange;
+        }
+    }
+    return NULL;
+}
+
+static void vfio_dma_range_init_dirty_bitmap(VFIODMARange *qrange)
+{
+    uint64_t pages, size;
+
+    pages = REAL_HOST_PAGE_ALIGN(qrange->size) / qemu_real_host_page_size;
+    size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) / BITS_PER_BYTE;
+
+    qrange->bitmap = g_malloc0(size);
+}
+
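vfio_dma_range_init_dirty_bitmap() sizes the bitmap the way the kernel
expects: one bit per host page, rounded up to a whole number of u64 words.
A worked example, assuming 4 KiB host pages and a hypothetical 1 GiB range:

    pages = REAL_HOST_PAGE_ALIGN(1 GiB) / 4 KiB = 262144 bits needed
    size  = ROUND_UP(262144, 64) / 8            = 32768 bytes (4096 u64 words)

so a 1 GiB mapping costs 32 KiB of dirty-bitmap memory for the lifetime of
the mapping, in exchange for not allocating on every log_sync.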
 /*
  * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
  */
@@ -435,12 +456,29 @@ static int vfio_dma_unmap(VFIOContainer *container,
         .iova = iova,
         .size = size,
     };
+    VFIODMARange *qrange;
 
     if (iotlb && container->dirty_pages_supported &&
         vfio_devices_all_running_and_saving(container)) {
         return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
     }
 
+    /*
+     * unregister the DMA range
+     *
+     * It seems that the memory layer will give us the same section as the one
+     * used in region_add(). Otherwise it'll be complicated to manipulate the
+     * bitmap across region_{add,del}. Is there any guarantee?
+     *
+     * But there is really not such a restriction on the kernel interface
+     * (VFIO_IOMMU_DIRTY_PAGES_FLAG_{UN}MAP_DMA, etc).
+     */
+    qrange = vfio_lookup_match_range(container, iova, size);
+    assert(qrange);
+    g_free(qrange->bitmap);
+    QLIST_REMOVE(qrange, next);
+    g_free(qrange);
+
     while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
         /*
          * The type1 backend has an off-by-one bug in the kernel (71a7d3d78e3c
@@ -477,6 +515,14 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
         .iova = iova,
         .size = size,
     };
+    VFIODMARange *qrange;
+
+    qrange = g_malloc0(sizeof(*qrange));
+    qrange->iova = iova;
+    qrange->size = size;
+    QLIST_INSERT_HEAD(&container->dma_list, qrange, next);
+    /* XXX allocate the dirty bitmap on demand */
+    vfio_dma_range_init_dirty_bitmap(qrange);
 
     if (!readonly) {
         map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
@@ -596,6 +642,132 @@ static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
     return true;
 }
 
+/* Propagate a guest IOTLB invalidation to the host (nested mode) */
+static void vfio_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
+{
+    VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
+    struct vfio_iommu_type1_cache_invalidate ustruct = {};
+    VFIOContainer *container = giommu->container;
+    int ret;
+
+    assert(iotlb->perm == IOMMU_NONE);
+
+    ustruct.argsz = sizeof(ustruct);
+    ustruct.flags = 0;
+    ustruct.info.argsz = sizeof(struct iommu_cache_invalidate_info);
+    ustruct.info.version = IOMMU_CACHE_INVALIDATE_INFO_VERSION_1;
+    ustruct.info.cache = IOMMU_CACHE_INV_TYPE_IOTLB;
+
+    switch (iotlb->granularity) {
+    case IOMMU_INV_GRAN_DOMAIN:
+        ustruct.info.granularity = IOMMU_INV_GRANU_DOMAIN;
+        break;
+    case IOMMU_INV_GRAN_PASID:
+    {
+        struct iommu_inv_pasid_info *pasid_info;
+        int archid = -1;
+
+        pasid_info = &ustruct.info.granu.pasid_info;
+        ustruct.info.granularity = IOMMU_INV_GRANU_PASID;
+        if (iotlb->flags & IOMMU_INV_FLAGS_ARCHID) {
+            pasid_info->flags |= IOMMU_INV_ADDR_FLAGS_ARCHID;
+            archid = iotlb->arch_id;
+        }
+        pasid_info->archid = archid;
+        trace_vfio_iommu_asid_inv_iotlb(archid);
+        break;
+    }
+    case IOMMU_INV_GRAN_ADDR:
+    {
+        hwaddr start = iotlb->iova + giommu->iommu_offset;
+        struct iommu_inv_addr_info *addr_info;
+        size_t size = iotlb->addr_mask + 1;
+        int archid = -1;
+
+        addr_info = &ustruct.info.granu.addr_info;
+        ustruct.info.granularity = IOMMU_INV_GRANU_ADDR;
+        if (iotlb->leaf) {
+            addr_info->flags |= IOMMU_INV_ADDR_FLAGS_LEAF;
+        }
+        if (iotlb->flags & IOMMU_INV_FLAGS_ARCHID) {
+            addr_info->flags |= IOMMU_INV_ADDR_FLAGS_ARCHID;
+            archid = iotlb->arch_id;
+        }
+        addr_info->archid = archid;
+        addr_info->addr = start;
+        addr_info->granule_size = size;
+        addr_info->nb_granules = 1;
+        trace_vfio_iommu_addr_inv_iotlb(archid, start, size,
+                                        1, iotlb->leaf);
+        break;
+    }
+    }
+
+    ret = ioctl(container->fd, VFIO_IOMMU_CACHE_INVALIDATE, &ustruct);
+    if (ret) {
+        error_report("%p: failed to invalidate CACHE (%d)", container, ret);
+    }
+}
+
+int vfio_iommu_set_msi_binding(VFIOContainer *container, int n,
+                               IOMMUTLBEntry *iotlb)
+{
+    struct vfio_iommu_type1_set_msi_binding ustruct;
+    VFIOMSIBinding *binding;
+    int ret;
+
+    QLIST_FOREACH(binding, &container->msibinding_list, next) {
+        if (binding->index == n) {
+            return 0;
+        }
+    }
+
+    ustruct.argsz = sizeof(struct vfio_iommu_type1_set_msi_binding);
+    ustruct.iova = iotlb->iova;
+    ustruct.flags = VFIO_IOMMU_BIND_MSI;
+    ustruct.gpa = iotlb->translated_addr;
+    ustruct.size = iotlb->addr_mask + 1;
+    ret = ioctl(container->fd, VFIO_IOMMU_SET_MSI_BINDING, &ustruct);
+    if (ret) {
+        error_report("%s: failed to register the stage1 MSI binding (%m)",
+                     __func__);
+        return ret;
+    }
+    binding = g_new0(VFIOMSIBinding, 1);
+    binding->iova = ustruct.iova;
+    binding->gpa = ustruct.gpa;
+    binding->size = ustruct.size;
+    binding->index = n;
+
+    QLIST_INSERT_HEAD(&container->msibinding_list, binding, next);
+    return 0;
+}
+
+int vfio_iommu_unset_msi_binding(VFIOContainer *container, int n)
+{
+    struct vfio_iommu_type1_set_msi_binding ustruct;
+    VFIOMSIBinding *binding, *tmp;
+    int ret;
+
+    ustruct.argsz = sizeof(struct vfio_iommu_type1_set_msi_binding);
+    QLIST_FOREACH_SAFE(binding, &container->msibinding_list, next, tmp) {
+        if (binding->index != n) {
+            continue;
+        }
+        ustruct.flags = VFIO_IOMMU_UNBIND_MSI;
+        ustruct.iova = binding->iova;
+        ret = ioctl(container->fd, VFIO_IOMMU_SET_MSI_BINDING, &ustruct);
+        if (ret) {
+            error_report("Failed to unregister the stage1 MSI binding "
+                         "for iova=0x%"PRIx64" (%m)", binding->iova);
+        }
+        QLIST_REMOVE(binding, next);
+        g_free(binding);
+        return ret;
+    }
+    return 0;
+}
+
 static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
 {
     VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
@@ -650,16 +822,167 @@ out:
     rcu_read_unlock();
 }
 
+static VFIOHostDMAWindow *
+hostwin_from_range(VFIOContainer *container, hwaddr iova, hwaddr end)
+{
+    VFIOHostDMAWindow *hostwin;
+
+    QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
+        if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
+            return hostwin;
+        }
+    }
+    return NULL;
+}
+
+static int vfio_dma_map_ram_section(VFIOContainer *container,
+                                    MemoryRegionSection *section)
+{
+    VFIOHostDMAWindow *hostwin;
+    Int128 llend, llsize;
+    hwaddr iova, end;
+    void *vaddr;
+    int ret;
+
+    assert(memory_region_is_ram(section->mr));
+
+    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
+    llend = int128_make64(section->offset_within_address_space);
+    llend = int128_add(llend, section->size);
+    llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));
+    end = int128_get64(int128_sub(llend, int128_one()));
+
+    vaddr = memory_region_get_ram_ptr(section->mr) +
+            section->offset_within_region +
+            (iova - section->offset_within_address_space);
+
+    hostwin = hostwin_from_range(container, iova, end);
+    if (!hostwin) {
+        error_report("vfio: IOMMU Container %p can't map guest IOVA region"
+                     " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx,
+                     container, iova, end);
+        return -EFAULT;
+    }
+
+    trace_vfio_dma_map_ram(iova, end, vaddr);
+
+    llsize = int128_sub(llend, int128_make64(iova));
+
+    if (memory_region_is_ram_device(section->mr)) {
+        hwaddr pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
+
+        if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) {
+            trace_vfio_listener_region_add_no_dma_map(
+                memory_region_name(section->mr),
+                section->offset_within_address_space,
+                int128_getlo(section->size),
+                pgmask + 1);
+            return 0;
+        }
+    }
+
+    ret = vfio_dma_map(container, iova, int128_get64(llsize),
+                       vaddr, section->readonly);
+    if (ret) {
+        error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
+                     "0x%"HWADDR_PRIx", %p) = %d (%m)",
+                     container, iova, int128_get64(llsize), vaddr, ret);
+        if (memory_region_is_ram_device(section->mr)) {
+            /* Allow unexpected mappings not to be fatal for RAM devices */
+            return 0;
+        }
+        return ret;
+    }
+    return 0;
+}
+
+static void vfio_dma_unmap_ram_section(VFIOContainer *container,
+                                       MemoryRegionSection *section)
+{
+    Int128 llend, llsize;
+    hwaddr iova, end;
+    bool try_unmap = true;
+    int ret;
+
+    iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
+    llend = int128_make64(section->offset_within_address_space);
+    llend = int128_add(llend, section->size);
+    llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask));
+
+    if (int128_ge(int128_make64(iova), llend)) {
+        return;
+    }
+    end = int128_get64(int128_sub(llend, int128_one()));
+
+    llsize = int128_sub(llend, int128_make64(iova));
+
+    trace_vfio_dma_unmap_ram(iova, end);
+
+    if (memory_region_is_ram_device(section->mr)) {
+        hwaddr pgmask;
+        VFIOHostDMAWindow *hostwin = hostwin_from_range(container, iova, end);
+
+        assert(hostwin); /* or region_add() would have failed */
+
+        pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
+        try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask));
+    }
+
+    if (try_unmap) {
+        if (int128_eq(llsize, int128_2_64())) {
+            /* The unmap ioctl doesn't accept a full 64-bit span. */
+            llsize = int128_rshift(llsize, 1);
+            ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL);
+            if (ret) {
+                error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
+                             "0x%"HWADDR_PRIx") = %d (%m)",
+                             container, iova, int128_get64(llsize), ret);
+            }
+            iova += int128_get64(llsize);
+        }
+        ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL);
+        if (ret) {
+            error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
+                         "0x%"HWADDR_PRIx") = %d (%m)",
+                         container, iova, int128_get64(llsize), ret);
+        }
+    }
+}
+
+static void vfio_prereg_listener_region_add(MemoryListener *listener,
+                                            MemoryRegionSection *section)
+{
+    VFIOContainer *container =
+        container_of(listener, VFIOContainer, prereg_listener);
+
+    if (!memory_region_is_ram(section->mr)) {
+        return;
+    }
+
+    vfio_dma_map_ram_section(container, section);
+}
+
+static void vfio_prereg_listener_region_del(MemoryListener *listener,
+                                            MemoryRegionSection *section)
+{
+    VFIOContainer *container =
+        container_of(listener, VFIOContainer, prereg_listener);
+
+    if (!memory_region_is_ram(section->mr)) {
+        return;
+    }
+
+    vfio_dma_unmap_ram_section(container, section);
+}
+
 static void vfio_listener_region_add(MemoryListener *listener,
                                      MemoryRegionSection *section)
 {
     VFIOContainer *container = container_of(listener, VFIOContainer, listener);
     hwaddr iova, end;
-    Int128 llend, llsize;
-    void *vaddr;
+    Int128 llend;
     int ret;
     VFIOHostDMAWindow *hostwin;
-    bool hostwin_found;
 
     if (vfio_listener_skipped_section(section)) {
         trace_vfio_listener_region_add_skip(
@@ -669,16 +992,17 @@ static void vfio_listener_region_add(MemoryListener *listener,
         return;
     }
 
-    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
-                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
+    if (unlikely((section->offset_within_address_space &
+                  ~qemu_real_host_page_mask) !=
+                 (section->offset_within_region & ~qemu_real_host_page_mask))) {
         error_report("%s received unaligned region", __func__);
         return;
     }
 
-    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
+    iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
     llend = int128_make64(section->offset_within_address_space);
     llend = int128_add(llend, section->size);
-    llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));
+    llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask));
 
     if (int128_ge(int128_make64(iova), llend)) {
         return;
@@ -736,15 +1060,8 @@ static void vfio_listener_region_add(MemoryListener *listener,
 #endif
     }
 
-    hostwin_found = false;
-    QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
-        if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
-            hostwin_found = true;
-            break;
-        }
-    }
-
-    if (!hostwin_found) {
+    hostwin = hostwin_from_range(container, iova, end);
+    if (!hostwin) {
         error_report("vfio: IOMMU container %p can't map guest IOVA region"
                      " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx,
                      container, iova, end);
@@ -755,9 +1072,10 @@ static void vfio_listener_region_add(MemoryListener *listener,
     memory_region_ref(section->mr);
 
     if (memory_region_is_iommu(section->mr)) {
+        IOMMUNotify notify;
         VFIOGuestIOMMU *giommu;
         IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
-        int iommu_idx;
+        int iommu_idx, flags;
 
         trace_vfio_listener_region_add_iommu(iova, end);
         /*
@@ -776,52 +1094,33 @@ static void vfio_listener_region_add(MemoryListener *listener,
         llend = int128_sub(llend, int128_one());
         iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
                                                        MEMTXATTRS_UNSPECIFIED);
-        iommu_notifier_init(&giommu->n, vfio_iommu_map_notify,
-                            IOMMU_NOTIFIER_ALL,
+
+        if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
+            /* IOTLB unmap notifier to propagate guest IOTLB invalidations */
+            flags = IOMMU_NOTIFIER_UNMAP;
+            notify = vfio_iommu_unmap_notify;
+        } else {
+            /* MAP/UNMAP IOTLB notifier */
+            flags = IOMMU_NOTIFIER_ALL;
+            notify = vfio_iommu_map_notify;
+        }
+
+        iommu_notifier_init(&giommu->n, notify, flags,
                             section->offset_within_region,
                             int128_get64(llend),
                             iommu_idx);
         QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
         memory_region_register_iommu_notifier(section->mr, &giommu->n);
-        memory_region_iommu_replay(giommu->iommu, &giommu->n);
+        if (flags & IOMMU_NOTIFIER_MAP) {
+            memory_region_iommu_replay(giommu->iommu, &giommu->n);
+        }
 
         return;
     }
 
     /* Here we assume that memory_region_is_ram(section->mr)==true */
-
-    vaddr = memory_region_get_ram_ptr(section->mr) +
-            section->offset_within_region +
-            (iova - section->offset_within_address_space);
-
-    trace_vfio_listener_region_add_ram(iova, end, vaddr);
-
-    llsize = int128_sub(llend, int128_make64(iova));
-
-    if (memory_region_is_ram_device(section->mr)) {
-        hwaddr pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
-
-        if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) {
-            trace_vfio_listener_region_add_no_dma_map(
-                memory_region_name(section->mr),
-                section->offset_within_address_space,
-                int128_getlo(section->size),
-                pgmask + 1);
-            return;
-        }
-    }
-
-    ret = vfio_dma_map(container, iova, int128_get64(llsize),
-                       vaddr, section->readonly);
-    if (ret) {
-        error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
-                     "0x%"HWADDR_PRIx", %p) = %d (%m)",
-                     container, iova, int128_get64(llsize), vaddr, ret);
-        if (memory_region_is_ram_device(section->mr)) {
-            /* Allow unexpected mappings not to be fatal for RAM devices */
-            return;
-        }
+    if (vfio_dma_map_ram_section(container, section)) {
         goto fail;
     }
 
@@ -850,10 +1149,6 @@ static void vfio_listener_region_del(MemoryListener *listener,
                                      MemoryRegionSection *section)
 {
     VFIOContainer *container = container_of(listener, VFIOContainer, listener);
-    hwaddr iova, end;
-    Int128 llend, llsize;
-    int ret;
-    bool try_unmap = true;
 
     if (vfio_listener_skipped_section(section)) {
         trace_vfio_listener_region_del_skip(
@@ -863,8 +1158,9 @@ static void vfio_listener_region_del(MemoryListener *listener,
         return;
     }
 
-    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
-                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
+    if (unlikely((section->offset_within_address_space &
+                  ~qemu_real_host_page_mask) !=
+                 (section->offset_within_region & ~qemu_real_host_page_mask))) {
         error_report("%s received unaligned region", __func__);
         return;
     }
@@ -883,6 +1179,16 @@ static void vfio_listener_region_del(MemoryListener *listener,
             }
         }
 
+        /*
+         * In nested mode, stage 2 (gpa->hpa) and the stage 1
+         * (giova->gpa) are set separately. The ram section
+         * will be unmapped in vfio_prereg_listener_region_del().
+         * Hence it doesn't need to unmap ram section here.
+         */
+        if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
+            return;
+        }
+
         /*
          * FIXME: We assume the one big unmap below is adequate to
         * remove any individual page mappings in the IOMMU which
@@ -892,92 +1198,114 @@ static void vfio_listener_region_del(MemoryListener *listener,
          */
     }
 
-    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
-    llend = int128_make64(section->offset_within_address_space);
-    llend = int128_add(llend, section->size);
-    llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));
-
-    if (int128_ge(int128_make64(iova), llend)) {
-        return;
-    }
-    end = int128_get64(int128_sub(llend, int128_one()));
-
-    llsize = int128_sub(llend, int128_make64(iova));
-
-    trace_vfio_listener_region_del(iova, end);
-
-    if (memory_region_is_ram_device(section->mr)) {
-        hwaddr pgmask;
-        VFIOHostDMAWindow *hostwin;
-        bool hostwin_found = false;
-
-        QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
-            if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
-                hostwin_found = true;
-                break;
-            }
-        }
-        assert(hostwin_found); /* or region_add() would have failed */
-
-        pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
-        try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask));
-    }
-
-    if (try_unmap) {
-        ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL);
-        if (ret) {
-            error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
-                         "0x%"HWADDR_PRIx") = %d (%m)",
-                         container, iova, int128_get64(llsize), ret);
-        }
-    }
+    vfio_dma_unmap_ram_section(container, section);
 
     memory_region_unref(section->mr);
 
     if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
         vfio_spapr_remove_window(container,
                                  section->offset_within_address_space);
         if (vfio_host_win_del(container,
                               section->offset_within_address_space,
                               section->offset_within_address_space +
                               int128_get64(section->size) - 1) < 0) {
             hw_error("%s: Cannot delete missing window at %"HWADDR_PRIx,
                      __func__, section->offset_within_address_space);
         }
     }
 }
 
+static void vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
+{
+    int ret;
+    struct vfio_iommu_type1_dirty_bitmap dirty = {
+        .argsz = sizeof(dirty),
+    };
+
+    if (start) {
+        dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START;
+    } else {
+        dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP;
+    }
+
+    ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, &dirty);
+    if (ret) {
+        error_report("Failed to set dirty tracking flag 0x%x errno: %d",
+                     dirty.flags, errno);
+    }
+}
+
+static void vfio_listener_log_global_start(MemoryListener *listener)
+{
+    VFIOContainer *container = container_of(listener, VFIOContainer, listener);
+
+    /* For nested mode, vfio_prereg_listener is used to start dirty tracking */
+    if (container->iommu_type != VFIO_TYPE1_NESTING_IOMMU) {
+        vfio_set_dirty_page_tracking(container, true);
+    }
+}
+
+static void vfio_prereg_listener_log_global_start(MemoryListener *listener)
+{
+    VFIOContainer *container =
+        container_of(listener, VFIOContainer, prereg_listener);
+
+    vfio_set_dirty_page_tracking(container, true);
+}
+
+static void vfio_listener_log_global_stop(MemoryListener *listener)
+{
+    VFIOContainer *container = container_of(listener, VFIOContainer, listener);
+
+    /* For nested mode, vfio_prereg_listener is used to stop dirty tracking */
+    if (container->iommu_type != VFIO_TYPE1_NESTING_IOMMU) {
+        vfio_set_dirty_page_tracking(container, false);
+    }
+}
+
+static void vfio_prereg_listener_log_global_stop(MemoryListener *listener)
+{
+    VFIOContainer *container =
+        container_of(listener, VFIOContainer, prereg_listener);
+
+    vfio_set_dirty_page_tracking(container, false);
+}
+
 static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
                                  uint64_t size, ram_addr_t ram_addr)
 {
     struct vfio_iommu_type1_dirty_bitmap *dbitmap;
     struct vfio_iommu_type1_dirty_bitmap_get *range;
+    VFIODMARange *qrange;
     uint64_t pages;
     int ret;
 
+    qrange = vfio_lookup_match_range(container, iova, size);
+    /* the same as vfio_dma_unmap() */
+    assert(qrange);
+
     dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range));
 
     dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range);
-    dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
+    dbitmap->flags = container->dirty_log_manual_clear ?
+                     VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR :
+                     VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
     range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data;
     range->iova = iova;
     range->size = size;
 
     /*
-     * cpu_physical_memory_set_dirty_lebitmap() expects pages in bitmap of
-     * TARGET_PAGE_SIZE to mark those dirty. Hence set bitmap's pgsize to
-     * TARGET_PAGE_SIZE.
+     * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
+     * qemu_real_host_page_size to mark those dirty. Hence set bitmap's pgsize
+     * to qemu_real_host_page_size.
      */
-    range->bitmap.pgsize = TARGET_PAGE_SIZE;
+    range->bitmap.pgsize = qemu_real_host_page_size;
 
-    pages = TARGET_PAGE_ALIGN(range->size) >> TARGET_PAGE_BITS;
+    pages = REAL_HOST_PAGE_ALIGN(range->size) / qemu_real_host_page_size;
     range->bitmap.size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) /
                          BITS_PER_BYTE;
-    range->bitmap.data = g_try_malloc0(range->bitmap.size);
-    if (!range->bitmap.data) {
-        ret = -ENOMEM;
-        goto err_out;
-    }
+
+    range->bitmap.data = (__u64 *)qrange->bitmap;
 
     ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap);
     if (ret) {
@@ -993,12 +1321,40 @@ static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
     trace_vfio_get_dirty_bitmap(container->fd, range->iova, range->size,
                                 range->bitmap.size, ram_addr);
 err_out:
-    g_free(range->bitmap.data);
     g_free(dbitmap);
 
     return ret;
 }
 
+static int vfio_dma_sync_ram_section_dirty_bitmap(VFIOContainer *container,
+                                                  MemoryRegionSection *section)
+{
+    ram_addr_t ram_addr;
+
+    ram_addr = memory_region_get_ram_addr(section->mr) +
+               section->offset_within_region;
+
+    return vfio_get_dirty_bitmap(container,
+                   REAL_HOST_PAGE_ALIGN(section->offset_within_address_space),
+                   int128_get64(section->size), ram_addr);
+}
+
+static void vfio_prereg_listener_log_sync(MemoryListener *listener,
+                                          MemoryRegionSection *section)
+{
+    VFIOContainer *container =
+        container_of(listener, VFIOContainer, prereg_listener);
+
+    if (!memory_region_is_ram(section->mr) ||
+        !container->dirty_pages_supported) {
+        return;
+    }
+
+    if (vfio_devices_all_dirty_tracking(container)) {
+        vfio_dma_sync_ram_section_dirty_bitmap(container, section);
+    }
+}
+
 typedef struct {
     IOMMUNotifier n;
     VFIOGuestIOMMU *giommu;
@@ -1040,11 +1396,19 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
 static int vfio_sync_dirty_bitmap(VFIOContainer *container,
                                   MemoryRegionSection *section)
 {
-    ram_addr_t ram_addr;
-
     if (memory_region_is_iommu(section->mr)) {
         VFIOGuestIOMMU *giommu;
 
+        /*
+         * In nested mode, stage 2 (gpa->hpa) and stage 1 (giova->gpa) are
+         * set up separately. It is inappropriate to pass 'giova' to kernel
+         * to get dirty pages. We only need to focus on stage 2 mapping when
+         * marking dirty pages.
+         */
+        if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
+            return 0;
+        }
+
         QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
             if (MEMORY_REGION(giommu->iommu) == section->mr &&
                 giommu->n.start == section->offset_within_region) {
@@ -1070,15 +1434,10 @@ static int vfio_sync_dirty_bitmap(VFIOContainer *container,
         return 0;
     }
 
-    ram_addr = memory_region_get_ram_addr(section->mr) +
-               section->offset_within_region;
-
-    return vfio_get_dirty_bitmap(container,
-                   TARGET_PAGE_ALIGN(section->offset_within_address_space),
-                   int128_get64(section->size), ram_addr);
+    return vfio_dma_sync_ram_section_dirty_bitmap(container, section);
 }
 
-static void vfio_listerner_log_sync(MemoryListener *listener,
+static void vfio_listener_log_sync(MemoryListener *listener,
         MemoryRegionSection *section)
 {
     VFIOContainer *container = container_of(listener, VFIOContainer, listener);
@@ -1088,21 +1447,206 @@ static void vfio_listerner_log_sync(MemoryListener *listener,
         return;
     }
 
-    if (vfio_devices_all_stopped_and_saving(container)) {
+    if (vfio_devices_all_dirty_tracking(container)) {
         vfio_sync_dirty_bitmap(container, section);
     }
 }
 
+/*
+ * I'm not sure if there's any alignment requirement for the CLEAR_BITMAP
+ * ioctl. But we copy from the KVM side and align {start, size} to 64 pages.
+ *
+ * The code could be simplified a lot if there were no alignment requirement.
+ */
+#define VFIO_CLEAR_LOG_SHIFT  6
+#define VFIO_CLEAR_LOG_ALIGN  (qemu_real_host_page_size << VFIO_CLEAR_LOG_SHIFT)
+#define VFIO_CLEAR_LOG_MASK   (-VFIO_CLEAR_LOG_ALIGN)
+
+static int vfio_log_clear_one_range(VFIOContainer *container,
+                                    VFIODMARange *qrange,
+                                    uint64_t start, uint64_t size)
+{
+    struct vfio_iommu_type1_dirty_bitmap *dbitmap;
+    struct vfio_iommu_type1_dirty_bitmap_get *range;
+
+    dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range));
+
+    dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range);
+    dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP;
+    range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data;
+
+    /*
+     * Now let's deal with the actual bitmap, which is almost the same
+     * as the kvm side.
+     */
+    uint64_t end, bmap_start, start_delta, bmap_npages;
+    unsigned long *bmap_clear = NULL, psize = qemu_real_host_page_size;
+    int ret;
+
+    bmap_start = start & VFIO_CLEAR_LOG_MASK;
+    start_delta = start - bmap_start;
+    bmap_start /= psize;
+
+    bmap_npages = DIV_ROUND_UP(size + start_delta, VFIO_CLEAR_LOG_ALIGN)
+        << VFIO_CLEAR_LOG_SHIFT;
+    end = qrange->size / psize;
+    if (bmap_npages > end - bmap_start) {
+        bmap_npages = end - bmap_start;
+    }
+    start_delta /= psize;
+
+    if (start_delta) {
+        bmap_clear = bitmap_new(bmap_npages);
+        bitmap_copy_with_src_offset(bmap_clear, qrange->bitmap,
+                                    bmap_start, start_delta + size / psize);
+        bitmap_clear(bmap_clear, 0, start_delta);
+        range->bitmap.data = (__u64 *)bmap_clear;
+    } else {
+        range->bitmap.data = (__u64 *)(qrange->bitmap + BIT_WORD(bmap_start));
+    }
+
+    range->iova = qrange->iova + bmap_start * psize;
+    range->size = bmap_npages * psize;
+    range->bitmap.size = ROUND_UP(bmap_npages, sizeof(__u64) * BITS_PER_BYTE) /
+                         BITS_PER_BYTE;
+    range->bitmap.pgsize = qemu_real_host_page_size;
+
+    ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap);
+    if (ret) {
+        error_report("Failed to clear dirty log for iova: 0x%"PRIx64
+                     " size: 0x%"PRIx64" err: %d", (uint64_t)range->iova,
+                     (uint64_t)range->size, errno);
+        goto err_out;
+    }
+
+    bitmap_clear(qrange->bitmap, bmap_start + start_delta, size / psize);
+err_out:
+    g_free(bmap_clear);
+    g_free(dbitmap);
+    return ret;
+}
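
A worked example of the alignment arithmetic above, with hypothetical values:
assume 4 KiB host pages, so VFIO_CLEAR_LOG_ALIGN = 4 KiB << 6 = 256 KiB
(0x40000). Clearing size = 0x2000 at start = 0x41000 within a 1 MiB qrange:

    bmap_start  = 0x41000 & -0x40000 = 0x40000  ->  page index 64
    start_delta = 0x41000 - 0x40000  = 0x1000   ->  1 page
    bmap_npages = DIV_ROUND_UP(0x2000 + 0x1000, 0x40000) << 6 = 64 pages

Because start_delta is non-zero, a temporary bitmap is copied out and its
leading start_delta bit is cleared, so pages outside [start, start + size)
are never reported to the kernel as eligible for clearing; finally only bits
65..66 of the cached qrange->bitmap are cleared.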
+ */ + } + + /* Here we assume that memory_region_is_ram(section->mr) == true */ + return vfio_physical_log_clear(container, section); +} + +static void vfio_listener_log_clear(MemoryListener *listener, + MemoryRegionSection *section) +{ + VFIOContainer *container = container_of(listener, VFIOContainer, listener); + + if (vfio_listener_skipped_section(section) || + !container->dirty_pages_supported) { + return; + } + + if (vfio_devices_all_dirty_tracking(container)) { + vfio_clear_dirty_bitmap(container, section); + } +} + static const MemoryListener vfio_memory_listener = { .region_add = vfio_listener_region_add, .region_del = vfio_listener_region_del, - .log_sync = vfio_listerner_log_sync, + .log_global_start = vfio_listener_log_global_start, + .log_global_stop = vfio_listener_log_global_stop, + .log_sync = vfio_listener_log_sync, + .log_clear = vfio_listener_log_clear, +}; + +static MemoryListener vfio_memory_prereg_listener = { + .region_add = vfio_prereg_listener_region_add, + .region_del = vfio_prereg_listener_region_del, + .log_global_start = vfio_prereg_listener_log_global_start, + .log_global_stop = vfio_prereg_listener_log_global_stop, + .log_sync = vfio_prereg_listener_log_sync, + .log_clear = vfio_prereg_listener_log_clear, }; static void vfio_listener_release(VFIOContainer *container) { memory_listener_unregister(&container->listener); - if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { + if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU || + container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) { memory_listener_unregister(&container->prereg_listener); } } @@ -1126,6 +1670,25 @@ vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id) return NULL; } +struct vfio_info_cap_header * +vfio_get_irq_info_cap(struct vfio_irq_info *info, uint16_t id) +{ + struct vfio_info_cap_header *hdr; + void *ptr = info; + + if (!(info->flags & VFIO_IRQ_INFO_FLAG_CAPS)) { + return NULL; + } + + for (hdr = ptr + info->cap_offset; hdr != ptr; hdr = ptr + hdr->next) { + if (hdr->id == id) { + return hdr; + } + } + + return NULL; +} + static int vfio_setup_region_sparse_mmaps(VFIORegion *region, struct vfio_region_info *info) { @@ -1460,27 +2023,38 @@ static void vfio_put_address_space(VFIOAddressSpace *space) * vfio_get_iommu_type - selects the richest iommu_type (v2 first) */ static int vfio_get_iommu_type(VFIOContainer *container, + bool want_nested, Error **errp) { - int iommu_types[] = { VFIO_TYPE1v2_IOMMU, VFIO_TYPE1_IOMMU, + int iommu_types[] = { VFIO_TYPE1_NESTING_IOMMU, + VFIO_TYPE1v2_IOMMU, VFIO_TYPE1_IOMMU, VFIO_SPAPR_TCE_v2_IOMMU, VFIO_SPAPR_TCE_IOMMU }; - int i; + int i, ret = -EINVAL; for (i = 0; i < ARRAY_SIZE(iommu_types); i++) { if (ioctl(container->fd, VFIO_CHECK_EXTENSION, iommu_types[i])) { - return iommu_types[i]; + if (iommu_types[i] == VFIO_TYPE1_NESTING_IOMMU && !want_nested) { + continue; + } + ret = iommu_types[i]; + break; } } - error_setg(errp, "No available IOMMU models"); - return -EINVAL; + if (ret < 0) { + error_setg(errp, "No available IOMMU models"); + } else if (want_nested && ret != VFIO_TYPE1_NESTING_IOMMU) { + error_setg(errp, "Nested mode requested but not supported"); + ret = -EINVAL; + } + return ret; } static int vfio_init_container(VFIOContainer *container, int group_fd, - Error **errp) + bool want_nested, Error **errp) { - int iommu_type, ret; + int iommu_type, dirty_log_manual_clear, ret; - iommu_type = vfio_get_iommu_type(container, errp); + iommu_type = vfio_get_iommu_type(container, want_nested, errp); if (iommu_type < 0) { return 
iommu_type; } @@ -1507,6 +2081,13 @@ static int vfio_init_container(VFIOContainer *container, int group_fd, } container->iommu_type = iommu_type; + + dirty_log_manual_clear = ioctl(container->fd, VFIO_CHECK_EXTENSION, + VFIO_DIRTY_LOG_MANUAL_CLEAR); + if (dirty_log_manual_clear) { + container->dirty_log_manual_clear = dirty_log_manual_clear; + } + return 0; } @@ -1569,10 +2150,10 @@ static void vfio_get_iommu_info_migration(VFIOContainer *container, header); /* - * cpu_physical_memory_set_dirty_lebitmap() expects pages in bitmap of - * TARGET_PAGE_SIZE to mark those dirty. + * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of + * qemu_real_host_page_size to mark those dirty. */ - if (cap_mig->pgsize_bitmap & TARGET_PAGE_SIZE) { + if (cap_mig->pgsize_bitmap & qemu_real_host_page_size) { container->dirty_pages_supported = true; container->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size; container->dirty_pgsizes = cap_mig->pgsize_bitmap; @@ -1585,6 +2166,14 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, VFIOContainer *container; int ret, fd; VFIOAddressSpace *space; + IOMMUMemoryRegion *iommu_mr; + bool nested = false; + + if (memory_region_is_iommu(as->root)) { + iommu_mr = IOMMU_MEMORY_REGION(as->root); + memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_VFIO_NESTED, + (void *)&nested); + } space = vfio_get_address_space(as); @@ -1618,6 +2207,8 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, QLIST_FOREACH(container, &space->containers, next) { if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) { group->container = container; + trace_vfio_connect_existing_container(group->groupid, + container->fd); QLIST_INSERT_HEAD(&container->group_list, group, container_next); vfio_kvm_device_add_group(group); return 0; @@ -1645,13 +2236,16 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, container->dirty_pages_supported = false; QLIST_INIT(&container->giommu_list); QLIST_INIT(&container->hostwin_list); + QLIST_INIT(&container->dma_list); - ret = vfio_init_container(container, group->fd, errp); + ret = vfio_init_container(container, group->fd, nested, errp); if (ret) { goto free_container_exit; } + trace_vfio_connect_new_container(group->groupid, container->fd); switch (container->iommu_type) { + case VFIO_TYPE1_NESTING_IOMMU: case VFIO_TYPE1v2_IOMMU: case VFIO_TYPE1_IOMMU: { @@ -1677,6 +2271,20 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, vfio_get_iommu_info_migration(container, info); } g_free(info); + + if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) { + container->prereg_listener = vfio_memory_prereg_listener; + memory_listener_register(&container->prereg_listener, + &address_space_memory); + if (container->error) { + memory_listener_unregister(&container->prereg_listener); + ret = container->error; + error_setg(errp, + "RAM memory listener initialization failed " + "for container"); + goto free_container_exit; + } + } break; } case VFIO_SPAPR_TCE_v2_IOMMU: @@ -2015,6 +2623,33 @@ retry: return 0; } +int vfio_get_irq_info(VFIODevice *vbasedev, int index, + struct vfio_irq_info **info) +{ + size_t argsz = sizeof(struct vfio_irq_info); + + *info = g_malloc0(argsz); + + (*info)->index = index; +retry: + (*info)->argsz = argsz; + + if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_IRQ_INFO, *info)) { + g_free(*info); + *info = NULL; + return -errno; + } + + if ((*info)->argsz > argsz) { + argsz = (*info)->argsz; + *info = g_realloc(*info, argsz); + + goto retry; + } + 
+    return 0;
+}
+
 int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type,
                              uint32_t subtype, struct vfio_region_info **info)
 {
@@ -2050,6 +2685,42 @@ int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type,
     return -ENODEV;
 }
 
+int vfio_get_dev_irq_info(VFIODevice *vbasedev, uint32_t type,
+                          uint32_t subtype, struct vfio_irq_info **info)
+{
+    int i;
+
+    for (i = 0; i < vbasedev->num_irqs; i++) {
+        struct vfio_info_cap_header *hdr;
+        struct vfio_irq_info_cap_type *cap_type;
+
+        if (vfio_get_irq_info(vbasedev, i, info)) {
+            continue;
+        }
+
+        hdr = vfio_get_irq_info_cap(*info, VFIO_IRQ_INFO_CAP_TYPE);
+        if (!hdr) {
+            g_free(*info);
+            continue;
+        }
+
+        cap_type = container_of(hdr, struct vfio_irq_info_cap_type, header);
+
+        trace_vfio_get_dev_irq(vbasedev->name, i,
+                               cap_type->type, cap_type->subtype);
+
+        if (cap_type->type == type && cap_type->subtype == subtype) {
+            return 0;
+        }
+
+        g_free(*info);
+    }
+
+    *info = NULL;
+    return -ENODEV;
+}
+
 bool vfio_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type)
 {
     struct vfio_region_info *info = NULL;
@@ -2065,6 +2736,21 @@ bool vfio_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type)
     return ret;
 }
 
+bool vfio_has_irq_cap(VFIODevice *vbasedev, int irq, uint16_t cap_type)
+{
+    struct vfio_irq_info *info = NULL;
+    bool ret = false;
+
+    if (!vfio_get_irq_info(vbasedev, irq, &info)) {
+        if (vfio_get_irq_info_cap(info, cap_type)) {
+            ret = true;
+        }
+        g_free(info);
+    }
+
+    return ret;
+}
+
 /*
  * Interfaces for IBM EEH (Enhanced Error Handling)
  */
diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index 1a9778448605bfbcbdf7d89c6596de014623cc94..d9e0e12824a92d792e606feb4b7c01bd2490f053 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -395,40 +395,10 @@ static int vfio_load_device_config_state(QEMUFile *f, void *opaque)
     return qemu_file_get_error(f);
 }
 
-static int vfio_set_dirty_page_tracking(VFIODevice *vbasedev, bool start)
-{
-    int ret;
-    VFIOMigration *migration = vbasedev->migration;
-    VFIOContainer *container = vbasedev->group->container;
-    struct vfio_iommu_type1_dirty_bitmap dirty = {
-        .argsz = sizeof(dirty),
-    };
-
-    if (start) {
-        if (migration->device_state & VFIO_DEVICE_STATE_SAVING) {
-            dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START;
-        } else {
-            return -EINVAL;
-        }
-    } else {
-        dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP;
-    }
-
-    ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, &dirty);
-    if (ret) {
-        error_report("Failed to set dirty tracking flag 0x%x errno: %d",
-                     dirty.flags, errno);
-        return -errno;
-    }
-    return ret;
-}
-
 static void vfio_migration_cleanup(VFIODevice *vbasedev)
 {
     VFIOMigration *migration = vbasedev->migration;
 
-    vfio_set_dirty_page_tracking(vbasedev, false);
-
     if (migration->region.mmaps) {
         vfio_region_unmap(&migration->region);
     }
@@ -469,11 +439,6 @@ static int vfio_save_setup(QEMUFile *f, void *opaque)
         return ret;
     }
 
-    ret = vfio_set_dirty_page_tracking(vbasedev, true);
-    if (ret) {
-        return ret;
-    }
-
     qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
 
     ret = qemu_file_get_error(f);
@@ -888,7 +853,7 @@ int vfio_migration_probe(VFIODevice *vbasedev, Error **errp)
     Error *local_err = NULL;
     int ret = -ENOTSUP;
 
-    if (!container->dirty_pages_supported) {
+    if (!vbasedev->enable_migration || !container->dirty_pages_supported) {
         goto add_blocker;
     }
 
@@ -903,8 +868,8 @@ int vfio_migration_probe(VFIODevice *vbasedev, Error **errp)
         goto add_blocker;
     }
 
-    g_free(info);
     trace_vfio_migration_probe(vbasedev->name, info->index);
+    g_free(info);
 
     return 0;
add_blocker: @@ -928,6 +893,7 @@ void vfio_migration_finalize(VFIODevice *vbasedev) remove_migration_state_change_notifier(&migration->migration_state); qemu_del_vm_change_state_handler(migration->vm_state); + unregister_savevm(vbasedev->dev, "vfio", vbasedev); vfio_migration_exit(vbasedev); } diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index da7c740bce590223cc125972908fee0288d5cccf..6f4083aec85d44ef4cd54dc64064c8fc54de1aa4 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -360,6 +360,65 @@ static void vfio_msi_interrupt(void *opaque) notify(&vdev->pdev, nr); } +static bool vfio_iommu_require_msi_binding(IOMMUMemoryRegion *iommu_mr) +{ + bool msi_translate = false, nested = false; + + memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_MSI_TRANSLATE, + (void *)&msi_translate); + memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_VFIO_NESTED, + (void *)&nested); + if (!nested || !msi_translate) { + return false; + } + return true; +} + +static int vfio_register_msi_binding(VFIOPCIDevice *vdev, + int vector_n, bool set) +{ + VFIOContainer *container = vdev->vbasedev.group->container; + PCIDevice *dev = &vdev->pdev; + AddressSpace *as = pci_device_iommu_address_space(dev); + IOMMUMemoryRegionClass *imrc; + IOMMUMemoryRegion *iommu_mr; + IOMMUTLBEntry entry; + MSIMessage msg; + + if (as == &address_space_memory) { + return 0; + } + + iommu_mr = IOMMU_MEMORY_REGION(as->root); + if (!vfio_iommu_require_msi_binding(iommu_mr)) { + return 0; + } + + /* MSI doorbell address is translated by an IOMMU */ + + if (!set) { /* unregister */ + trace_vfio_unregister_msi_binding(vdev->vbasedev.name, vector_n); + + return vfio_iommu_unset_msi_binding(container, vector_n); + } + + msg = pci_get_msi_message(dev, vector_n); + imrc = memory_region_get_iommu_class_nocheck(iommu_mr); + + rcu_read_lock(); + entry = imrc->translate(iommu_mr, msg.address, IOMMU_WO, 0); + rcu_read_unlock(); + + if (entry.perm == IOMMU_NONE) { + return -ENOENT; + } + + trace_vfio_register_msi_binding(vdev->vbasedev.name, vector_n, + msg.address, entry.translated_addr); + + return vfio_iommu_set_msi_binding(container, vector_n, &entry); +} + static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix) { struct vfio_irq_set *irq_set; @@ -377,7 +436,7 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix) fds = (int32_t *)&irq_set->data; for (i = 0; i < vdev->nr_vectors; i++) { - int fd = -1; + int ret, fd = -1; /* * MSI vs MSI-X - The guest has direct access to MSI mask and pending @@ -386,6 +445,12 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix) * KVM signaling path only when configured and unmasked. 
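+         * With a nested SMMU the MSI doorbell written by the guest is an
+         * IOVA, so the stage 1 doorbell binding is registered below before
+         * the vectors are enabled, letting the host install the matching
+         * stage 2 mapping.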
*/ if (vdev->msi_vectors[i].use) { + ret = vfio_register_msi_binding(vdev, i, true); + if (ret) { + error_report("%s failed to register S1 MSI binding " + "for vector %d(%d)", vdev->vbasedev.name, i, ret); + goto out; + } if (vdev->msi_vectors[i].virq < 0 || (msix && msix_is_masked(&vdev->pdev, i))) { fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt); @@ -399,6 +464,7 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix) ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set); +out: g_free(irq_set); return ret; @@ -712,7 +778,8 @@ static void vfio_msi_disable_common(VFIOPCIDevice *vdev) static void vfio_msix_disable(VFIOPCIDevice *vdev) { - int i; + int ret, i; + msix_unset_vector_notifiers(&vdev->pdev); @@ -724,6 +791,11 @@ static void vfio_msix_disable(VFIOPCIDevice *vdev) if (vdev->msi_vectors[i].use) { vfio_msix_vector_release(&vdev->pdev, i); msix_vector_unuse(&vdev->pdev, i); + ret = vfio_register_msi_binding(vdev, i, false); + if (ret) { + error_report("%s: failed to unregister S1 MSI binding " + "for vector %d(%d)", vdev->vbasedev.name, i, ret); + } } } @@ -2535,11 +2607,122 @@ int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp) return 0; } +static void vfio_init_fault_regions(VFIOPCIDevice *vdev, Error **errp) +{ + struct vfio_region_info *fault_region_info = NULL; + struct vfio_region_info_cap_fault *cap_fault; + VFIODevice *vbasedev = &vdev->vbasedev; + struct vfio_info_cap_header *hdr; + char *fault_region_name; + int ret; + + ret = vfio_get_dev_region_info(&vdev->vbasedev, + VFIO_REGION_TYPE_NESTED, + VFIO_REGION_SUBTYPE_NESTED_DMA_FAULT, + &fault_region_info); + if (ret) { + goto out; + } + + hdr = vfio_get_region_info_cap(fault_region_info, + VFIO_REGION_INFO_CAP_DMA_FAULT); + if (!hdr) { + error_setg(errp, "failed to retrieve DMA FAULT capability"); + goto out; + } + cap_fault = container_of(hdr, struct vfio_region_info_cap_fault, + header); + if (cap_fault->version != 1) { + error_setg(errp, "Unsupported DMA FAULT API version %d", + cap_fault->version); + goto out; + } + + fault_region_name = g_strdup_printf("%s DMA FAULT %d", + vbasedev->name, + fault_region_info->index); + + ret = vfio_region_setup(OBJECT(vdev), vbasedev, + &vdev->dma_fault_region, + fault_region_info->index, + fault_region_name); + g_free(fault_region_name); + if (ret) { + error_setg_errno(errp, -ret, + "failed to set up the DMA FAULT region %d", + fault_region_info->index); + goto out; + } + + ret = vfio_region_mmap(&vdev->dma_fault_region); + if (ret) { + error_setg_errno(errp, -ret, "Failed to mmap the DMA FAULT queue"); + } +out: + g_free(fault_region_info); +} + +static void vfio_init_fault_response_regions(VFIOPCIDevice *vdev, Error **errp) +{ + struct vfio_region_info *fault_region_info = NULL; + struct vfio_region_info_cap_fault *cap_fault; + VFIODevice *vbasedev = &vdev->vbasedev; + struct vfio_info_cap_header *hdr; + char *fault_region_name; + int ret; + + ret = vfio_get_dev_region_info(&vdev->vbasedev, + VFIO_REGION_TYPE_NESTED, + VFIO_REGION_SUBTYPE_NESTED_DMA_FAULT_RESPONSE, + &fault_region_info); + if (ret) { + goto out; + } + + hdr = vfio_get_region_info_cap(fault_region_info, + VFIO_REGION_INFO_CAP_DMA_FAULT_RESPONSE); + if (!hdr) { + error_setg(errp, "failed to retrieve DMA FAULT RESPONSE capability"); + goto out; + } + cap_fault = container_of(hdr, struct vfio_region_info_cap_fault, + header); + if (cap_fault->version != 1) { + error_setg(errp, "Unsupported DMA FAULT RESPONSE API version %d", + cap_fault->version); + goto out; + } + + fault_region_name 
= g_strdup_printf("%s DMA FAULT RESPONSE %d", + vbasedev->name, + fault_region_info->index); + + ret = vfio_region_setup(OBJECT(vdev), vbasedev, + &vdev->dma_fault_response_region, + fault_region_info->index, + fault_region_name); + g_free(fault_region_name); + if (ret) { + error_setg_errno(errp, -ret, + "failed to set up the DMA FAULT RESPONSE region %d", + fault_region_info->index); + goto out; + } + + ret = vfio_region_mmap(&vdev->dma_fault_response_region); + if (ret) { + error_setg_errno(errp, -ret, "Failed to mmap the DMA FAULT RESPONSE queue"); + } +out: + g_free(fault_region_info); +} + static void vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) { VFIODevice *vbasedev = &vdev->vbasedev; struct vfio_region_info *reg_info; struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) }; + Error *err = NULL; int i, ret = -1; /* Sanity check device */ @@ -2603,6 +2786,18 @@ static void vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) } } + vfio_init_fault_regions(vdev, &err); + if (err) { + error_propagate(errp, err); + return; + } + + vfio_init_fault_response_regions(vdev, &err); + if (err) { + error_propagate(errp, err); + return; + } + irq_info.index = VFIO_PCI_ERR_IRQ_INDEX; ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info); @@ -2766,6 +2961,205 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev) vdev->req_enabled = false; } +static int vfio_iommu_set_pasid_table(PCIBus *bus, int32_t devfn, + IOMMUConfig *config) +{ + PCIDevice *pdev = bus->devices[devfn]; + VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev); + VFIOContainer *container = vdev->vbasedev.group->container; + struct vfio_iommu_type1_set_pasid_table info; + + info.argsz = sizeof(info); + info.flags = VFIO_PASID_TABLE_FLAG_SET; + memcpy(&info.config, &config->pasid_cfg, sizeof(config->pasid_cfg)); + + return ioctl(container->fd, VFIO_IOMMU_SET_PASID_TABLE, &info); +} + +static int vfio_iommu_return_page_response(PCIBus *bus, int32_t devfn, + IOMMUPageResponse *resp) +{ + PCIDevice *pdev = bus->devices[devfn]; + VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev); + struct iommu_page_response *response = &resp->resp; + struct vfio_region_dma_fault_response header; + struct iommu_page_response *queue; + char *queue_buffer = NULL; + ssize_t bytes; + + if (!vdev->dma_fault_response_region.mem) { + return -EINVAL; + } + + /* read the header */ + bytes = pread(vdev->vbasedev.fd, &header, sizeof(header), + vdev->dma_fault_response_region.fd_offset); + if (bytes != sizeof(header)) { + error_report("%s unable to read the fault region header (0x%lx)", + __func__, bytes); + return -1; + } + + /* Normally the fault queue is mmapped */ + queue = (struct iommu_page_response *)vdev->dma_fault_response_region.mmaps[0].mmap; + if (!queue) { + size_t queue_size = header.nb_entries * header.entry_size; + + error_report("%s: fault queue not mmapped: slower fault handling", + vdev->vbasedev.name); + + queue_buffer = g_malloc(queue_size); + bytes = pread(vdev->vbasedev.fd, queue_buffer, queue_size, + vdev->dma_fault_response_region.fd_offset + header.offset); + if (bytes != queue_size) { + error_report("%s unable to read the fault queue (0x%lx)", + __func__, bytes); + return -1; + } + + queue = (struct iommu_page_response *)queue_buffer; + } + /* deposit the new response in the queue and increment the head */ + memcpy(queue + header.head, response, header.entry_size); + + vdev->fault_response_head_index = + (vdev->fault_response_head_index + 1) % header.nb_entries; + bytes = 
pwrite(vdev->vbasedev.fd, &vdev->fault_response_head_index, 4, + vdev->dma_fault_response_region.fd_offset); + if (bytes != 4) { + error_report("%s unable to write the fault response region head index (0x%lx)", + __func__, bytes); + } + g_free(queue_buffer); + + return 0; +} + +static PCIPASIDOps vfio_pci_pasid_ops = { + .set_pasid_table = vfio_iommu_set_pasid_table, + .return_page_response = vfio_iommu_return_page_response, +}; + +static void vfio_dma_fault_notifier_handler(void *opaque) +{ + VFIOPCIExtIRQ *ext_irq = opaque; + VFIOPCIDevice *vdev = ext_irq->vdev; + PCIDevice *pdev = &vdev->pdev; + AddressSpace *as = pci_device_iommu_address_space(pdev); + IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(as->root); + struct vfio_region_dma_fault header; + struct iommu_fault *queue; + char *queue_buffer = NULL; + ssize_t bytes; + + if (!event_notifier_test_and_clear(&ext_irq->notifier)) { + return; + } + + bytes = pread(vdev->vbasedev.fd, &header, sizeof(header), + vdev->dma_fault_region.fd_offset); + if (bytes != sizeof(header)) { + error_report("%s unable to read the fault region header (0x%lx)", + __func__, bytes); + return; + } + + /* Normally the fault queue is mmapped */ + queue = (struct iommu_fault *)vdev->dma_fault_region.mmaps[0].mmap; + if (!queue) { + size_t queue_size = header.nb_entries * header.entry_size; + + error_report("%s: fault queue not mmapped: slower fault handling", + vdev->vbasedev.name); + + queue_buffer = g_malloc(queue_size); + bytes = pread(vdev->vbasedev.fd, queue_buffer, queue_size, + vdev->dma_fault_region.fd_offset + header.offset); + if (bytes != queue_size) { + error_report("%s unable to read the fault queue (0x%lx)", + __func__, bytes); + return; + } + + queue = (struct iommu_fault *)queue_buffer; + } + + while (vdev->fault_tail_index != header.head) { + memory_region_inject_faults(iommu_mr, 1, + &queue[vdev->fault_tail_index]); + vdev->fault_tail_index = + (vdev->fault_tail_index + 1) % header.nb_entries; + } + bytes = pwrite(vdev->vbasedev.fd, &vdev->fault_tail_index, 4, + vdev->dma_fault_region.fd_offset); + if (bytes != 4) { + error_report("%s unable to write the fault region tail index (0x%lx)", + __func__, bytes); + } + g_free(queue_buffer); +} + +static int vfio_register_ext_irq_handler(VFIOPCIDevice *vdev, + uint32_t type, uint32_t subtype, + IOHandler *handler) +{ + int32_t fd, ext_irq_index, index; + struct vfio_irq_info *irq_info; + Error *err = NULL; + EventNotifier *n; + int ret; + + ret = vfio_get_dev_irq_info(&vdev->vbasedev, type, subtype, &irq_info); + if (ret) { + return ret; + } + index = irq_info->index; + ext_irq_index = irq_info->index - VFIO_PCI_NUM_IRQS; + g_free(irq_info); + + vdev->ext_irqs[ext_irq_index].vdev = vdev; + vdev->ext_irqs[ext_irq_index].index = index; + n = &vdev->ext_irqs[ext_irq_index].notifier; + + ret = event_notifier_init(n, 0); + if (ret) { + error_report("vfio: Unable to init event notifier for ext irq %d(%d)", + ext_irq_index, ret); + return ret; + } + + fd = event_notifier_get_fd(n); + qemu_set_fd_handler(fd, vfio_dma_fault_notifier_handler, NULL, + &vdev->ext_irqs[ext_irq_index]); + + ret = vfio_set_irq_signaling(&vdev->vbasedev, index, 0, + VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err); + if (ret) { + error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); + qemu_set_fd_handler(fd, NULL, NULL, vdev); + event_notifier_cleanup(n); + } + return ret; +} + +static void vfio_unregister_ext_irq_notifiers(VFIOPCIDevice *vdev) +{ + VFIODevice *vbasedev = &vdev->vbasedev; + Error *err = NULL; + int i; + + for (i = 
0; i < vbasedev->num_irqs - VFIO_PCI_NUM_IRQS; i++) { + if (vfio_set_irq_signaling(vbasedev, i + VFIO_PCI_NUM_IRQS , 0, + VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) { + error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); + } + qemu_set_fd_handler(event_notifier_get_fd(&vdev->ext_irqs[i].notifier), + NULL, NULL, vdev); + event_notifier_cleanup(&vdev->ext_irqs[i].notifier); + } + g_free(vdev->ext_irqs); +} + static void vfio_realize(PCIDevice *pdev, Error **errp) { VFIOPCIDevice *vdev = PCI_VFIO(pdev); @@ -2776,7 +3170,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) ssize_t len; struct stat st; int groupid; - int i, ret; + int i, ret, nb_ext_irqs; bool is_mdev; if (!vdev->vbasedev.sysfsdev) { @@ -2864,6 +3258,11 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) goto error; } + nb_ext_irqs = vdev->vbasedev.num_irqs - VFIO_PCI_NUM_IRQS; + if (nb_ext_irqs > 0) { + vdev->ext_irqs = g_new0(VFIOPCIExtIRQ, nb_ext_irqs); + } + vfio_populate_device(vdev, &err); if (err) { error_propagate(errp, err); @@ -3070,8 +3469,13 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) vfio_register_err_notifier(vdev); vfio_register_req_notifier(vdev); + vfio_register_ext_irq_handler(vdev, VFIO_IRQ_TYPE_NESTED, + VFIO_IRQ_SUBTYPE_DMA_FAULT, + vfio_dma_fault_notifier_handler); vfio_setup_resetfn_quirk(vdev); + pci_setup_pasid_ops(pdev, &vfio_pci_pasid_ops); + return; out_teardown: @@ -3089,6 +3493,8 @@ static void vfio_instance_finalize(Object *obj) vfio_display_finalize(vdev); vfio_bars_finalize(vdev); + vfio_region_finalize(&vdev->dma_fault_region); + vfio_region_finalize(&vdev->dma_fault_response_region); g_free(vdev->emulated_config_bits); g_free(vdev->rom); /* @@ -3108,6 +3514,9 @@ static void vfio_exitfn(PCIDevice *pdev) vfio_unregister_req_notifier(vdev); vfio_unregister_err_notifier(vdev); + vfio_unregister_ext_irq_notifiers(vdev); + vfio_region_exit(&vdev->dma_fault_region); + vfio_region_exit(&vdev->dma_fault_response_region); pci_device_set_intx_routing_notifier(&vdev->pdev, NULL); vfio_disable_interrupts(vdev); if (vdev->intx.mmap_timer) { @@ -3180,6 +3589,9 @@ static void vfio_instance_init(Object *obj) static Property vfio_pci_dev_properties[] = { DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIOPCIDevice, host), DEFINE_PROP_STRING("sysfsdev", VFIOPCIDevice, vbasedev.sysfsdev), + DEFINE_PROP_ON_OFF_AUTO("x-pre-copy-dirty-page-tracking", VFIOPCIDevice, + vbasedev.pre_copy_dirty_page_tracking, + ON_OFF_AUTO_ON), DEFINE_PROP_ON_OFF_AUTO("display", VFIOPCIDevice, display, ON_OFF_AUTO_OFF), DEFINE_PROP_UINT32("xres", VFIOPCIDevice, display_xres, 0), @@ -3192,6 +3604,8 @@ static Property vfio_pci_dev_properties[] = { VFIO_FEATURE_ENABLE_REQ_BIT, true), DEFINE_PROP_BIT("x-igd-opregion", VFIOPCIDevice, features, VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, false), + DEFINE_PROP_BOOL("x-enable-migration", VFIOPCIDevice, + vbasedev.enable_migration, false), DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false), DEFINE_PROP_BOOL("x-balloon-allowed", VFIOPCIDevice, vbasedev.balloon_allowed, false), diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index 834a90d64686705ad9f69538dfa1054c6b9378cc..7fdcfa0dc8b38e8772c7f79bcdda2dde9de84d3f 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -113,6 +113,12 @@ typedef struct VFIOMSIXInfo { unsigned long *pending; } VFIOMSIXInfo; +typedef struct VFIOPCIExtIRQ { + struct VFIOPCIDevice *vdev; + EventNotifier notifier; + uint32_t index; +} VFIOPCIExtIRQ; + typedef struct VFIOPCIDevice { PCIDevice pdev; VFIODevice vbasedev; @@ -134,6 +140,11 @@ 
typedef struct VFIOPCIDevice { PCIHostDeviceAddress host; EventNotifier err_notifier; EventNotifier req_notifier; + VFIOPCIExtIRQ *ext_irqs; + VFIORegion dma_fault_region; + uint32_t fault_tail_index; + VFIORegion dma_fault_response_region; + uint32_t fault_response_head_index; int (*resetfn)(struct VFIOPCIDevice *); uint32_t vendor_id; uint32_t device_id; diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index 575ebde6e0f2dbd16fbee90f17b80fc1d8af9c3c..54e10046f505509b22b0988f6cf1449be535a71a 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -97,11 +97,13 @@ vfio_iommu_map_notify(const char *op, uint64_t iova_start, uint64_t iova_end) "i vfio_listener_region_add_skip(uint64_t start, uint64_t end) "SKIPPING region_add 0x%"PRIx64" - 0x%"PRIx64 vfio_spapr_group_attach(int groupfd, int tablefd) "Attached groupfd %d to liobn fd %d" vfio_listener_region_add_iommu(uint64_t start, uint64_t end) "region_add [iommu] 0x%"PRIx64" - 0x%"PRIx64 -vfio_listener_region_add_ram(uint64_t iova_start, uint64_t iova_end, void *vaddr) "region_add [ram] 0x%"PRIx64" - 0x%"PRIx64" [%p]" +vfio_dma_map_ram(uint64_t iova_start, uint64_t iova_end, void *vaddr) "region_add [ram] 0x%"PRIx64" - 0x%"PRIx64" [%p]" vfio_listener_region_add_no_dma_map(const char *name, uint64_t iova, uint64_t size, uint64_t page_size) "Region \"%s\" 0x%"PRIx64" size=0x%"PRIx64" is not aligned to 0x%"PRIx64" and cannot be mapped for DMA" vfio_listener_region_del_skip(uint64_t start, uint64_t end) "SKIPPING region_del 0x%"PRIx64" - 0x%"PRIx64 -vfio_listener_region_del(uint64_t start, uint64_t end) "region_del 0x%"PRIx64" - 0x%"PRIx64 +vfio_dma_unmap_ram(uint64_t start, uint64_t end) "region_del 0x%"PRIx64" - 0x%"PRIx64 vfio_disconnect_container(int fd) "close container->fd=%d" +vfio_connect_existing_container(int groupid, int container_fd) "group=%d existing container fd=%d" +vfio_connect_new_container(int groupid, int container_fd) "group=%d new container fd=%d" vfio_put_group(int fd) "close group->fd=%d" vfio_get_device(const char * name, unsigned int flags, unsigned int num_regions, unsigned int num_irqs) "Device %s flags: %u, regions: %u, irqs: %u" vfio_put_base_device(int fd) "close vdev->fd=%d" @@ -115,7 +117,12 @@ vfio_region_unmap(const char *name, unsigned long offset, unsigned long end) "Re vfio_region_sparse_mmap_header(const char *name, int index, int nr_areas) "Device %s region %d: %d sparse mmap entries" vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned long end) "sparse entry %d [0x%lx - 0x%lx]" vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%0x8" +vfio_get_dev_irq(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%0x8" vfio_dma_unmap_overflow_workaround(void) "" +vfio_iommu_addr_inv_iotlb(int asid, uint64_t addr, uint64_t size, uint64_t nb_granules, bool leaf) "nested IOTLB invalidate asid=%d, addr=0x%"PRIx64" granule_size=0x%"PRIx64" nb_granules=0x%"PRIx64" leaf=%d" +vfio_iommu_asid_inv_iotlb(int asid) "nested IOTLB invalidate asid=%d" +vfio_register_msi_binding(const char *name, int vector, uint64_t giova, uint64_t gdb) "%s: register vector %d gIOVA=0x%"PRIx64 "-> gDB=0x%"PRIx64" stage 1 mapping" +vfio_unregister_msi_binding(const char *name, int vector) "%s: unregister vector %d stage 1 mapping" # platform.c vfio_platform_base_device_init(char *name, int groupid) "%s belongs to group #%d" diff --git a/include/exec/memory.h b/include/exec/memory.h index 
dca8184277d354c855c12a31d41742a578b9474a..ffd4282f147dc7d30b902d05c8dca2ba567e67e2 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -56,6 +56,8 @@ struct MemoryRegionMmio { CPUWriteMemoryFunc *write[3]; }; +struct iommu_fault; + typedef struct IOMMUTLBEntry IOMMUTLBEntry; /* See address_space_translate: bit 0 is read, bit 1 is write. */ @@ -66,14 +68,48 @@ typedef enum { IOMMU_RW = 3, } IOMMUAccessFlags; +/* Granularity of the cache invalidation */ +typedef enum { + IOMMU_INV_GRAN_ADDR = 0, + IOMMU_INV_GRAN_PASID, + IOMMU_INV_GRAN_DOMAIN, +} IOMMUInvGranularity; + #define IOMMU_ACCESS_FLAG(r, w) (((r) ? IOMMU_RO : 0) | ((w) ? IOMMU_WO : 0)) +/** + * IOMMUTLBEntry - IOMMU TLB entry + * + * Structure used when performing a translation or when notifying MAP or + * UNMAP (invalidation) events + * + * @target_as: target address space + * @iova: IO virtual address (input) + * @translated_addr: translated address (output) + * @addr_mask: address mask (0xfff means 4K binding), must be multiple of 2 + * @perm: permission flag of the mapping (NONE encodes no mapping or + * invalidation notification) + * @granularity: granularity of the invalidation + * @flags: informs whether the following fields are set + * @arch_id: architecture specific ID tagging the TLB + * @pasid: PASID tagging the TLB + * @leaf: when @perm is NONE, indicates whether only caches for the last + * level of translation need to be invalidated. + */ struct IOMMUTLBEntry { AddressSpace *target_as; hwaddr iova; hwaddr translated_addr; - hwaddr addr_mask; /* 0xfff = 4k translation */ + hwaddr addr_mask; IOMMUAccessFlags perm; + IOMMUInvGranularity granularity; +#define IOMMU_INV_FLAGS_PASID (1 << 0) +#define IOMMU_INV_FLAGS_ARCHID (1 << 1) +#define IOMMU_INV_FLAGS_LEAF (1 << 2) + uint32_t flags; + uint32_t arch_id; + uint32_t pasid; + bool leaf; }; /* @@ -206,7 +242,9 @@ struct MemoryRegionOps { }; enum IOMMUMemoryRegionAttr { - IOMMU_ATTR_SPAPR_TCE_FD + IOMMU_ATTR_SPAPR_TCE_FD, + IOMMU_ATTR_VFIO_NESTED, + IOMMU_ATTR_MSI_TRANSLATE, }; /** @@ -342,6 +380,19 @@ typedef struct IOMMUMemoryRegionClass { * @iommu: the IOMMUMemoryRegion */ int (*num_indexes)(IOMMUMemoryRegion *iommu); + + /* + * Inject @count faults into the IOMMU memory region + * + * Optional method: if this method is not provided, then + * memory_region_injection_faults() will return -ENOENT + * + * @iommu: the IOMMU memory region to inject the faults in + * @count: number of faults to inject + * @buf: fault buffer + */ + int (*inject_faults)(IOMMUMemoryRegion *iommu, int count, + struct iommu_fault *buf); } IOMMUMemoryRegionClass; typedef struct CoalescedMemoryRange CoalescedMemoryRange; @@ -1146,6 +1197,16 @@ int memory_region_iommu_attrs_to_index(IOMMUMemoryRegion *iommu_mr, */ int memory_region_iommu_num_indexes(IOMMUMemoryRegion *iommu_mr); +/** + * memory_region_inject_faults : inject @count faults stored in @buf + * + * @iommu_mr: the IOMMU memory region + * @count: number of faults to be injected + * @buf: buffer containing the faults + */ +int memory_region_inject_faults(IOMMUMemoryRegion *iommu_mr, int count, + struct iommu_fault *buf); + /** * memory_region_name: get a memory region's name * diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h index 1f37844e5c977c61806172c4ae951809edb2cfd0..353668f4eabf9d7f88de8a9fa2fc854a75831be0 100644 --- a/include/hw/arm/smmu-common.h +++ b/include/hw/arm/smmu-common.h @@ -68,6 +68,7 @@ typedef struct SMMUTransCfg { uint8_t tbi; /* Top Byte Ignore */ uint16_t asid; SMMUTransTableInfo 
tt[2]; + dma_addr_t s1ctxptr; uint32_t iotlb_hits; /* counts IOTLB hits for this asid */ uint32_t iotlb_misses; /* counts IOTLB misses for this asid */ } SMMUTransCfg; diff --git a/include/hw/iommu/iommu.h b/include/hw/iommu/iommu.h new file mode 100644 index 0000000000000000000000000000000000000000..5890f095b14b413a829079c4950f85dd2ea47cc3 --- /dev/null +++ b/include/hw/iommu/iommu.h @@ -0,0 +1,36 @@ +/* + * common header for iommu devices + * + * Copyright Red Hat, Inc. 2019 + * + * Authors: + * Eric Auger + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#ifndef QEMU_HW_IOMMU_IOMMU_H +#define QEMU_HW_IOMMU_IOMMU_H +#ifdef __linux__ +#include +#endif + +typedef struct IOMMUConfig { + union { +#ifdef __linux__ + struct iommu_pasid_table_config pasid_cfg; +#endif + }; +} IOMMUConfig; + +typedef struct IOMMUPageResponse { + union { +#ifdef __linux__ + struct iommu_page_response resp; +#endif + }; +} IOMMUPageResponse; + + +#endif /* QEMU_HW_IOMMU_IOMMU_H */ diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h index aaf1b9f70d78251d3c16ea44343c114b3a314a82..5e7e0e4e6fe4aa3b84e1d7a056a3b09af14c027b 100644 --- a/include/hw/pci/pci.h +++ b/include/hw/pci/pci.h @@ -9,6 +9,7 @@ #include "hw/isa/isa.h" #include "hw/pci/pcie.h" +#include "hw/iommu/iommu.h" extern bool pci_available; @@ -263,6 +264,13 @@ struct PCIReqIDCache { }; typedef struct PCIReqIDCache PCIReqIDCache; +struct PCIPASIDOps { + int (*set_pasid_table)(PCIBus *bus, int32_t devfn, IOMMUConfig *config); + int (*return_page_response)(PCIBus *bus, int32_t devfn, + IOMMUPageResponse *resp); +}; +typedef struct PCIPASIDOps PCIPASIDOps; + struct PCIDevice { DeviceState qdev; @@ -352,6 +360,7 @@ struct PCIDevice { MSIVectorUseNotifier msix_vector_use_notifier; MSIVectorReleaseNotifier msix_vector_release_notifier; MSIVectorPollNotifier msix_vector_poll_notifier; + PCIPASIDOps *pasid_ops; }; void pci_register_bar(PCIDevice *pci_dev, int region_num, @@ -485,6 +494,12 @@ typedef AddressSpace *(*PCIIOMMUFunc)(PCIBus *, void *, int); AddressSpace *pci_device_iommu_address_space(PCIDevice *dev); void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, void *opaque); +void pci_setup_pasid_ops(PCIDevice *dev, PCIPASIDOps *ops); +bool pci_device_is_pasid_ops_set(PCIBus *bus, int32_t devfn); +int pci_device_set_pasid_table(PCIBus *bus, int32_t devfn, IOMMUConfig *config); +int pci_device_return_page_response(PCIBus *bus, int32_t devfn, + IOMMUPageResponse *resp); + static inline void pci_set_byte(uint8_t *config, uint8_t val) { diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 048731e81f70bf6e836a81a68729eff9968e577c..a82962ab1604764dbe091ba34e83cdea6d0208d8 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -74,8 +74,24 @@ typedef struct VFIOAddressSpace { QLIST_ENTRY(VFIOAddressSpace) list; } VFIOAddressSpace; +typedef struct VFIOMSIBinding { + int index; + hwaddr iova; + hwaddr gpa; + hwaddr size; + QLIST_ENTRY(VFIOMSIBinding) next; +} VFIOMSIBinding; + struct VFIOGroup; +typedef struct VFIODMARange { + QLIST_ENTRY(VFIODMARange) next; + hwaddr iova; + size_t size; + void *vaddr; /* unused */ + unsigned long *bitmap; /* dirty bitmap cache for this range */ +} VFIODMARange; + typedef struct VFIOContainer { VFIOAddressSpace *space; int fd; /* /dev/vfio/vfio, empowered by the attached groups */ @@ -85,12 +101,15 @@ typedef struct VFIOContainer { int error; bool initialized; bool dirty_pages_supported; + bool 
dirty_log_manual_clear; uint64_t dirty_pgsizes; uint64_t max_dirty_bitmap_size; unsigned long pgsizes; QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; QLIST_HEAD(, VFIOGroup) group_list; + QLIST_HEAD(, VFIODMARange) dma_list; + QLIST_HEAD(, VFIOMSIBinding) msibinding_list; QLIST_ENTRY(VFIOContainer) next; } VFIOContainer; @@ -123,12 +142,14 @@ typedef struct VFIODevice { bool needs_reset; bool no_mmap; bool balloon_allowed; + bool enable_migration; VFIODeviceOps *ops; unsigned int num_irqs; unsigned int num_regions; unsigned int flags; VFIOMigration *migration; Error *migration_blocker; + OnOffAuto pre_copy_dirty_page_tracking; } VFIODevice; struct VFIODeviceOps { @@ -198,6 +219,9 @@ VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp); void vfio_put_group(VFIOGroup *group); int vfio_get_device(VFIOGroup *group, const char *name, VFIODevice *vbasedev, Error **errp); +int vfio_iommu_set_msi_binding(VFIOContainer *container, int n, + IOMMUTLBEntry *entry); +int vfio_iommu_unset_msi_binding(VFIOContainer *container, int n); extern const MemoryRegionOps vfio_region_ops; typedef QLIST_HEAD(VFIOGroupList, VFIOGroup) VFIOGroupList; @@ -214,6 +238,13 @@ int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type, bool vfio_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type); struct vfio_info_cap_header * vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id); +int vfio_get_irq_info(VFIODevice *vbasedev, int index, + struct vfio_irq_info **info); +int vfio_get_dev_irq_info(VFIODevice *vbasedev, uint32_t type, + uint32_t subtype, struct vfio_irq_info **info); +bool vfio_has_irq_cap(VFIODevice *vbasedev, int irq, uint16_t cap_type); +struct vfio_info_cap_header * +vfio_get_irq_info_cap(struct vfio_irq_info *info, uint16_t id); #endif extern const MemoryListener vfio_prereg_listener; diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h index 787dbc7770e6726723648d5d535c4b2c296bc588..f8e884f1461d88ba249fca981a3dc956f90290d5 100644 --- a/include/sysemu/kvm_int.h +++ b/include/sysemu/kvm_int.h @@ -43,4 +43,5 @@ typedef struct KVMMemoryListener { void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml, AddressSpace *as, int as_id); +void kvm_set_max_memslot_size(hwaddr max_slot_size); #endif diff --git a/linux-headers/linux/iommu.h b/linux-headers/linux/iommu.h new file mode 100644 index 0000000000000000000000000000000000000000..773b7dc2d695b49f2423059907b4e93afb30a73d --- /dev/null +++ b/linux-headers/linux/iommu.h @@ -0,0 +1,395 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * IOMMU user API definitions + */ + +#ifndef IOMMU_H +#define IOMMU_H + +#include + +#define IOMMU_FAULT_PERM_READ (1 << 0) /* read */ +#define IOMMU_FAULT_PERM_WRITE (1 << 1) /* write */ +#define IOMMU_FAULT_PERM_EXEC (1 << 2) /* exec */ +#define IOMMU_FAULT_PERM_PRIV (1 << 3) /* privileged */ + +/* Generic fault types, can be expanded IRQ remapping fault */ +enum iommu_fault_type { + IOMMU_FAULT_DMA_UNRECOV = 1, /* unrecoverable fault */ + IOMMU_FAULT_PAGE_REQ, /* page request fault */ +}; + +enum iommu_fault_reason { + IOMMU_FAULT_REASON_UNKNOWN = 0, + + /* Could not access the PASID table (fetch caused external abort) */ + IOMMU_FAULT_REASON_PASID_FETCH, + + /* PASID entry is invalid or has configuration errors */ + IOMMU_FAULT_REASON_BAD_PASID_ENTRY, + + /* + * PASID is out of range (e.g. exceeds the maximum PASID + * supported by the IOMMU) or disabled. 
+ */ + IOMMU_FAULT_REASON_PASID_INVALID, + + /* + * An external abort occurred fetching (or updating) a translation + * table descriptor + */ + IOMMU_FAULT_REASON_WALK_EABT, + + /* + * Could not access the page table entry (Bad address), + * actual translation fault + */ + IOMMU_FAULT_REASON_PTE_FETCH, + + /* Protection flag check failed */ + IOMMU_FAULT_REASON_PERMISSION, + + /* access flag check failed */ + IOMMU_FAULT_REASON_ACCESS, + + /* Output address of a translation stage caused Address Size fault */ + IOMMU_FAULT_REASON_OOR_ADDRESS, +}; + +/** + * struct iommu_fault_unrecoverable - Unrecoverable fault data + * @reason: reason of the fault, from &enum iommu_fault_reason + * @flags: parameters of this fault (IOMMU_FAULT_UNRECOV_* values) + * @pasid: Process Address Space ID + * @perm: requested permission access using by the incoming transaction + * (IOMMU_FAULT_PERM_* values) + * @addr: offending page address + * @fetch_addr: address that caused a fetch abort, if any + */ +struct iommu_fault_unrecoverable { + __u32 reason; +#define IOMMU_FAULT_UNRECOV_PASID_VALID (1 << 0) +#define IOMMU_FAULT_UNRECOV_ADDR_VALID (1 << 1) +#define IOMMU_FAULT_UNRECOV_FETCH_ADDR_VALID (1 << 2) + __u32 flags; + __u32 pasid; + __u32 perm; + __u64 addr; + __u64 fetch_addr; +}; + +/** + * struct iommu_fault_page_request - Page Request data + * @flags: encodes whether the corresponding fields are valid and whether this + * is the last page in group (IOMMU_FAULT_PAGE_REQUEST_* values). + * When IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID is set, the page response + * must have the same PASID value as the page request. When it is clear, + * the page response should not have a PASID. + * @pasid: Process Address Space ID + * @grpid: Page Request Group Index + * @perm: requested page permissions (IOMMU_FAULT_PERM_* values) + * @addr: page address + * @private_data: device-specific private information + */ +struct iommu_fault_page_request { +#define IOMMU_FAULT_PAGE_REQUEST_PASID_VALID (1 << 0) +#define IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE (1 << 1) +#define IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA (1 << 2) +#define IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID (1 << 3) + __u32 flags; + __u32 pasid; + __u32 grpid; + __u32 perm; + __u64 addr; + __u64 private_data[2]; +}; + +/** + * struct iommu_fault - Generic fault data + * @type: fault type from &enum iommu_fault_type + * @padding: reserved for future use (should be zero) + * @event: fault event, when @type is %IOMMU_FAULT_DMA_UNRECOV + * @prm: Page Request message, when @type is %IOMMU_FAULT_PAGE_REQ + * @padding2: sets the fault size to allow for future extensions + */ +struct iommu_fault { + __u32 type; + __u32 padding; + union { + struct iommu_fault_unrecoverable event; + struct iommu_fault_page_request prm; + __u8 padding2[56]; + }; +}; + +/** + * enum iommu_page_response_code - Return status of fault handlers + * @IOMMU_PAGE_RESP_SUCCESS: Fault has been handled and the page tables + * populated, retry the access. This is "Success" in PCI PRI. + * @IOMMU_PAGE_RESP_FAILURE: General error. Drop all subsequent faults from + * this device if possible. This is "Response Failure" in PCI PRI. + * @IOMMU_PAGE_RESP_INVALID: Could not handle this fault, don't retry the + * access. This is "Invalid Request" in PCI PRI. 
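+ *
+ * User space hands these codes back to the host through the device's DMA
+ * FAULT RESPONSE region (see vfio_iommu_return_page_response() in
+ * hw/vfio/pci.c).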
+ */ +enum iommu_page_response_code { + IOMMU_PAGE_RESP_SUCCESS = 0, + IOMMU_PAGE_RESP_INVALID, + IOMMU_PAGE_RESP_FAILURE, +}; + +/** + * struct iommu_page_response - Generic page response information + * @argsz: User filled size of this data + * @version: API version of this structure + * @flags: encodes whether the corresponding fields are valid + * (IOMMU_FAULT_PAGE_RESPONSE_* values) + * @pasid: Process Address Space ID + * @grpid: Page Request Group Index + * @code: response code from &enum iommu_page_response_code + */ +struct iommu_page_response { + __u32 argsz; +#define IOMMU_PAGE_RESP_VERSION_1 1 + __u32 version; +#define IOMMU_PAGE_RESP_PASID_VALID (1 << 0) + __u32 flags; + __u32 pasid; + __u32 grpid; + __u32 code; +}; + +/* defines the granularity of the invalidation */ +enum iommu_inv_granularity { + IOMMU_INV_GRANU_DOMAIN, /* domain-selective invalidation */ + IOMMU_INV_GRANU_PASID, /* PASID-selective invalidation */ + IOMMU_INV_GRANU_ADDR, /* page-selective invalidation */ + IOMMU_INV_GRANU_NR, /* number of invalidation granularities */ +}; + +/** + * struct iommu_inv_addr_info - Address Selective Invalidation Structure + * + * @flags: indicates the granularity of the address-selective invalidation + * - If the PASID bit is set, the @pasid field is populated and the invalidation + * relates to cache entries tagged with this PASID and matching the address + * range. + * - If ARCHID bit is set, @archid is populated and the invalidation relates + * to cache entries tagged with this architecture specific ID and matching + * the address range. + * - Both PASID and ARCHID can be set as they may tag different caches. + * - If neither PASID or ARCHID is set, global addr invalidation applies. + * - The LEAF flag indicates whether only the leaf PTE caching needs to be + * invalidated and other paging structure caches can be preserved. + * @pasid: process address space ID + * @archid: architecture-specific ID + * @addr: first stage/level input address + * @granule_size: page/block size of the mapping in bytes + * @nb_granules: number of contiguous granules to be invalidated + */ +struct iommu_inv_addr_info { +#define IOMMU_INV_ADDR_FLAGS_PASID (1 << 0) +#define IOMMU_INV_ADDR_FLAGS_ARCHID (1 << 1) +#define IOMMU_INV_ADDR_FLAGS_LEAF (1 << 2) + __u32 flags; + __u32 archid; + __u64 pasid; + __u64 addr; + __u64 granule_size; + __u64 nb_granules; +}; + +/** + * struct iommu_inv_pasid_info - PASID Selective Invalidation Structure + * + * @flags: indicates the granularity of the PASID-selective invalidation + * - If the PASID bit is set, the @pasid field is populated and the invalidation + * relates to cache entries tagged with this PASID and matching the address + * range. + * - If the ARCHID bit is set, the @archid is populated and the invalidation + * relates to cache entries tagged with this architecture specific ID and + * matching the address range. + * - Both PASID and ARCHID can be set as they may tag different caches. + * - At least one of PASID or ARCHID must be set. 
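+ * - As an example, an ARM SMMUv3 invalidation by ASID would be expressed as
+ *   an ARCHID-only invalidation with @archid carrying the guest ASID.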
+ * @pasid: process address space ID + * @archid: architecture-specific ID + */ +struct iommu_inv_pasid_info { +#define IOMMU_INV_PASID_FLAGS_PASID (1 << 0) +#define IOMMU_INV_PASID_FLAGS_ARCHID (1 << 1) + __u32 flags; + __u32 archid; + __u64 pasid; +}; + +/** + * struct iommu_cache_invalidate_info - First level/stage invalidation + * information + * @argsz: User filled size of this data + * @version: API version of this structure + * @cache: bitfield that allows to select which caches to invalidate + * @granularity: defines the lowest granularity used for the invalidation: + * domain > PASID > addr + * @padding: reserved for future use (should be zero) + * @pasid_info: invalidation data when @granularity is %IOMMU_INV_GRANU_PASID + * @addr_info: invalidation data when @granularity is %IOMMU_INV_GRANU_ADDR + * + * Not all the combinations of cache/granularity are valid: + * + * +--------------+---------------+---------------+---------------+ + * | type / | DEV_IOTLB | IOTLB | PASID | + * | granularity | | | cache | + * +==============+===============+===============+===============+ + * | DOMAIN | N/A | Y | Y | + * +--------------+---------------+---------------+---------------+ + * | PASID | Y | Y | Y | + * +--------------+---------------+---------------+---------------+ + * | ADDR | Y | Y | N/A | + * +--------------+---------------+---------------+---------------+ + * + * Invalidations by %IOMMU_INV_GRANU_DOMAIN don't take any argument other than + * @version and @cache. + * + * If multiple cache types are invalidated simultaneously, they all + * must support the used granularity. + */ +struct iommu_cache_invalidate_info { + __u32 argsz; +#define IOMMU_CACHE_INVALIDATE_INFO_VERSION_1 1 + __u32 version; +/* IOMMU paging structure cache */ +#define IOMMU_CACHE_INV_TYPE_IOTLB (1 << 0) /* IOMMU IOTLB */ +#define IOMMU_CACHE_INV_TYPE_DEV_IOTLB (1 << 1) /* Device IOTLB */ +#define IOMMU_CACHE_INV_TYPE_PASID (1 << 2) /* PASID cache */ +#define IOMMU_CACHE_INV_TYPE_NR (3) + __u8 cache; + __u8 granularity; + __u8 padding[6]; + union { + struct iommu_inv_pasid_info pasid_info; + struct iommu_inv_addr_info addr_info; + } granu; +}; + +/** + * struct iommu_gpasid_bind_data_vtd - Intel VT-d specific data on device and guest + * SVA binding. + * + * @flags: VT-d PASID table entry attributes + * @pat: Page attribute table data to compute effective memory type + * @emt: Extended memory type + * + * Only guest vIOMMU selectable and effective options are passed down to + * the host IOMMU. 
+ */ +struct iommu_gpasid_bind_data_vtd { +#define IOMMU_SVA_VTD_GPASID_SRE (1 << 0) /* supervisor request */ +#define IOMMU_SVA_VTD_GPASID_EAFE (1 << 1) /* extended access enable */ +#define IOMMU_SVA_VTD_GPASID_PCD (1 << 2) /* page-level cache disable */ +#define IOMMU_SVA_VTD_GPASID_PWT (1 << 3) /* page-level write through */ +#define IOMMU_SVA_VTD_GPASID_EMTE (1 << 4) /* extended mem type enable */ +#define IOMMU_SVA_VTD_GPASID_CD (1 << 5) /* PASID-level cache disable */ +#define IOMMU_SVA_VTD_GPASID_LAST (1 << 6) + __u64 flags; + __u32 pat; + __u32 emt; +}; + +#define IOMMU_SVA_VTD_GPASID_MTS_MASK (IOMMU_SVA_VTD_GPASID_CD | \ + IOMMU_SVA_VTD_GPASID_EMTE | \ + IOMMU_SVA_VTD_GPASID_PCD | \ + IOMMU_SVA_VTD_GPASID_PWT) + +/** + * struct iommu_gpasid_bind_data - Information about device and guest PASID binding + * @argsz: User filled size of this data + * @version: Version of this data structure + * @format: PASID table entry format + * @flags: Additional information on guest bind request + * @gpgd: Guest page directory base of the guest mm to bind + * @hpasid: Process address space ID used for the guest mm in host IOMMU + * @gpasid: Process address space ID used for the guest mm in guest IOMMU + * @addr_width: Guest virtual address width + * @padding: Reserved for future use (should be zero) + * @vtd: Intel VT-d specific data + * + * Guest to host PASID mapping can be an identity or non-identity, where guest + * has its own PASID space. For non-identify mapping, guest to host PASID lookup + * is needed when VM programs guest PASID into an assigned device. VMM may + * trap such PASID programming then request host IOMMU driver to convert guest + * PASID to host PASID based on this bind data. + */ +struct iommu_gpasid_bind_data { + __u32 argsz; +#define IOMMU_GPASID_BIND_VERSION_1 1 + __u32 version; +#define IOMMU_PASID_FORMAT_INTEL_VTD 1 +#define IOMMU_PASID_FORMAT_LAST 2 + __u32 format; + __u32 addr_width; +#define IOMMU_SVA_GPASID_VAL (1 << 0) /* guest PASID valid */ + __u64 flags; + __u64 gpgd; + __u64 hpasid; + __u64 gpasid; + __u8 padding[8]; + /* Vendor specific data */ + union { + struct iommu_gpasid_bind_data_vtd vtd; + } vendor; +}; + +/** + * struct iommu_pasid_smmuv3 - ARM SMMUv3 Stream Table Entry stage 1 related + * information + * @version: API version of this structure + * @s1fmt: STE s1fmt (format of the CD table: single CD, linear table + * or 2-level table) + * @s1dss: STE s1dss (specifies the behavior when @pasid_bits != 0 + * and no PASID is passed along with the incoming transaction) + * @padding: reserved for future use (should be zero) + * + * The PASID table is referred to as the Context Descriptor (CD) table on ARM + * SMMUv3. Please refer to the ARM SMMU 3.x spec (ARM IHI 0070A) for full + * details. + */ +struct iommu_pasid_smmuv3 { +#define PASID_TABLE_SMMUV3_CFG_VERSION_1 1 + __u32 version; + __u8 s1fmt; + __u8 s1dss; + __u8 padding[2]; +}; + +/** + * struct iommu_pasid_table_config - PASID table data used to bind guest PASID + * table to the host IOMMU + * @argsz: User filled size of this data + * @version: API version to prepare for future extensions + * @base_ptr: guest physical address of the PASID table + * @format: format of the PASID table + * @pasid_bits: number of PASID bits used in the PASID table + * @config: indicates whether the guest translation stage must + * be translated, bypassed or aborted. 
+ * @padding: reserved for future use (should be zero)
+ * @vendor_data.smmuv3: table information when @format is
+ * %IOMMU_PASID_FORMAT_SMMUV3
+ */
+struct iommu_pasid_table_config {
+	__u32 argsz;
+#define PASID_TABLE_CFG_VERSION_1 1
+	__u32 version;
+	__u64 base_ptr;
+#define IOMMU_PASID_FORMAT_SMMUV3 1
+	__u32 format;
+	__u8 pasid_bits;
+#define IOMMU_PASID_CONFIG_TRANSLATE 1
+#define IOMMU_PASID_CONFIG_BYPASS 2
+#define IOMMU_PASID_CONFIG_ABORT 3
+	__u8 config;
+	__u8 padding[2];
+	union {
+		struct iommu_pasid_smmuv3 smmuv3;
+	} vendor_data;
+};
+
+#endif /* IOMMU_H */
diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
index a90672494dc584fff35d3141e248393d0697fdb7..d6edfbd2f51d1241c85ebe6b8a7bb3c2349b34e7 100644
--- a/linux-headers/linux/vfio.h
+++ b/linux-headers/linux/vfio.h
@@ -14,6 +14,7 @@
 #include <linux/types.h>
 #include <linux/ioctl.h>
+#include <linux/iommu.h>
 
 #define VFIO_API_VERSION 0
 
@@ -46,6 +47,16 @@
  */
 #define VFIO_NOIOMMU_IOMMU 8
 
+/*
+ * The vfio_iommu driver may support manual clearing of the dirty log: the
+ * dirty log can be requested not to be cleared automatically after it is
+ * copied to userspace, and it is then the user's duty to clear it.
+ *
+ * Note: please refer to VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR and
+ * VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP.
+ */
+#define VFIO_DIRTY_LOG_MANUAL_CLEAR 11
+
 /*
  * The IOCTL interface is designed for extensibility by embedding the
  * structure length (argsz) and flags into structures passed between
@@ -201,8 +212,11 @@ struct vfio_device_info {
 #define VFIO_DEVICE_FLAGS_AMBA (1 << 3)	/* vfio-amba device */
 #define VFIO_DEVICE_FLAGS_CCW (1 << 4)	/* vfio-ccw device */
 #define VFIO_DEVICE_FLAGS_AP (1 << 5)	/* vfio-ap device */
+#define VFIO_DEVICE_FLAGS_FSL_MC (1 << 6)	/* vfio-fsl-mc device */
+#define VFIO_DEVICE_FLAGS_CAPS (1 << 7)	/* Info supports caps */
 	__u32	num_regions;	/* Max region index + 1 */
 	__u32	num_irqs;	/* Max IRQ index + 1 */
+	__u32	cap_offset;	/* Offset within info struct of first cap */
 };
 #define VFIO_DEVICE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 7)
 
@@ -218,6 +232,15 @@ struct vfio_device_info {
 #define VFIO_DEVICE_API_CCW_STRING "vfio-ccw"
 #define VFIO_DEVICE_API_AP_STRING "vfio-ap"
 
+/*
+ * The following capabilities are unique to s390 zPCI devices. Their contents
+ * are further-defined in vfio_zdev.h
+ */
+#define VFIO_DEVICE_INFO_CAP_ZPCI_BASE 1
+#define VFIO_DEVICE_INFO_CAP_ZPCI_GROUP 2
+#define VFIO_DEVICE_INFO_CAP_ZPCI_UTIL 3
+#define VFIO_DEVICE_INFO_CAP_ZPCI_PFIP 4
+
 /**
  * VFIO_DEVICE_GET_REGION_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 8,
  *				       struct vfio_region_info)
@@ -306,6 +329,7 @@ struct vfio_region_info_cap_type {
 #define VFIO_REGION_TYPE_GFX                    (1)
 #define VFIO_REGION_TYPE_CCW			(2)
 #define VFIO_REGION_TYPE_MIGRATION              (3)
+#define VFIO_REGION_TYPE_NESTED			(4)
 
 /* sub-types for VFIO_REGION_TYPE_PCI_* */
 
@@ -330,6 +354,10 @@ struct vfio_region_info_cap_type {
 /* sub-types for VFIO_REGION_TYPE_GFX */
 #define VFIO_REGION_SUBTYPE_GFX_EDID            (1)
 
+/* sub-types for VFIO_REGION_TYPE_NESTED */
+#define VFIO_REGION_SUBTYPE_NESTED_DMA_FAULT		(1)
+#define VFIO_REGION_SUBTYPE_NESTED_DMA_FAULT_RESPONSE	(2)
+
 /**
  * struct vfio_region_gfx_edid - EDID region layout.
 *
@@ -462,7 +490,7 @@ struct vfio_region_gfx_edid {
  * 5. Resumed
  *		  |--------->|
  *
- * 0. Default state of VFIO device is _RUNNNG when the user application starts.
+ * 0. Default state of VFIO device is _RUNNING when the user application starts.
 * 1.
During normal shutdown of the user application, the user application may * optionally change the VFIO device state from _RUNNING to _STOP. This * transition is optional. The vendor driver must support this transition but @@ -685,11 +713,30 @@ struct vfio_irq_info { #define VFIO_IRQ_INFO_MASKABLE (1 << 1) #define VFIO_IRQ_INFO_AUTOMASKED (1 << 2) #define VFIO_IRQ_INFO_NORESIZE (1 << 3) +#define VFIO_IRQ_INFO_FLAG_CAPS (1 << 4) /* Info supports caps */ __u32 index; /* IRQ index */ __u32 count; /* Number of IRQs within this index */ + __u32 cap_offset; /* Offset within info struct of first cap */ }; #define VFIO_DEVICE_GET_IRQ_INFO _IO(VFIO_TYPE, VFIO_BASE + 9) +/* + * The irq type capability allows IRQs unique to a specific device or + * class of devices to be exposed. + * + * The structures below define version 1 of this capability. + */ +#define VFIO_IRQ_INFO_CAP_TYPE 3 + +struct vfio_irq_info_cap_type { + struct vfio_info_cap_header header; + __u32 type; /* global per bus driver */ + __u32 subtype; /* type specific */ +}; + +#define VFIO_IRQ_TYPE_NESTED (1) +#define VFIO_IRQ_SUBTYPE_DMA_FAULT (1) + /** * VFIO_DEVICE_SET_IRQS - _IOW(VFIO_TYPE, VFIO_BASE + 10, struct vfio_irq_set) * @@ -791,7 +838,8 @@ enum { VFIO_PCI_MSIX_IRQ_INDEX, VFIO_PCI_ERR_IRQ_INDEX, VFIO_PCI_REQ_IRQ_INDEX, - VFIO_PCI_NUM_IRQS + VFIO_PCI_NUM_IRQS = 5 /* Fixed user ABI, IRQ indexes >=5 use */ + /* device specific cap to define content */ }; /* @@ -975,6 +1023,68 @@ struct vfio_device_feature { */ #define VFIO_DEVICE_FEATURE_PCI_VF_TOKEN (0) +/* + * Capability exposed by the DMA fault region + * @version: ABI version + */ +#define VFIO_REGION_INFO_CAP_DMA_FAULT 6 + +struct vfio_region_info_cap_fault { + struct vfio_info_cap_header header; + __u32 version; +}; + +/* + * Capability exposed by the DMA fault response region + * @version: ABI version + */ +#define VFIO_REGION_INFO_CAP_DMA_FAULT_RESPONSE 7 + +struct vfio_region_info_cap_fault_response { + struct vfio_info_cap_header header; + __u32 version; +}; + +/* + * DMA Fault Region Layout + * @tail: index relative to the start of the ring buffer at which the + * consumer finds the next item in the buffer + * @entry_size: fault ring buffer entry size in bytes + * @nb_entries: max capacity of the fault ring buffer + * @offset: ring buffer offset relative to the start of the region + * @head: index relative to the start of the ring buffer at which the + * producer (kernel) inserts items into the buffers + */ +struct vfio_region_dma_fault { + /* Write-Only */ + __u32 tail; + /* Read-Only */ + __u32 entry_size; + __u32 nb_entries; + __u32 offset; + __u32 head; +}; + +/* + * DMA Fault Response Region Layout + * @head: index relative to the start of the ring buffer at which the + * producer (userspace) insert responses into the buffer + * @entry_size: fault ring buffer entry size in bytes + * @nb_entries: max capacity of the fault ring buffer + * @offset: ring buffer offset relative to the start of the region + * @tail: index relative to the start of the ring buffer at which the + * consumer (kernel) finds the next item in the buffer + */ +struct vfio_region_dma_fault_response { + /* Write-Only */ + __u32 head; + /* Read-Only */ + __u32 entry_size; + __u32 nb_entries; + __u32 offset; + __u32 tail; +}; + /* -------- API for Type1 VFIO IOMMU -------- */ /** @@ -1039,6 +1149,21 @@ struct vfio_iommu_type1_info_cap_migration { __u64 max_dirty_bitmap_size; /* in bytes */ }; +/* + * The DMA available capability allows to report the current number of + * simultaneously outstanding 
DMA mappings that are allowed.
+ *
+ * The structure below defines version 1 of this capability.
+ *
+ * avail: specifies the current number of outstanding DMA mappings allowed.
+ */
+#define VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL 3
+
+struct vfio_iommu_type1_info_dma_avail {
+	struct vfio_info_cap_header header;
+	__u32 avail;
+};
+
 #define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
 
 /**
@@ -1062,7 +1187,7 @@ struct vfio_iommu_type1_dma_map {
 struct vfio_bitmap {
 	__u64 pgsize;	/* page size for bitmap in bytes */
 	__u64 size;	/* in bytes */
-	__u64 *data;	/* one bit per page */
+	__u64 *data;	/* one bit per page */
 };
 
 /**
@@ -1074,6 +1199,7 @@ struct vfio_bitmap {
  * field. No guarantee is made to the user that arbitrary unmaps of iova
  * or size different from those used in the original mapping call will
  * succeed.
+ *
 * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP should be set to get the dirty bitmap
 * before unmapping IO virtual addresses. When this flag is set, the user must
 * provide a struct vfio_bitmap in data[]. User must provide zero-allocated
@@ -1133,8 +1259,30 @@ struct vfio_iommu_type1_dma_unmap {
 * actual bitmap. If dirty pages logging is not enabled, an error will be
 * returned.
- * Only one of the flags _START, _STOP and _GET may be specified at a time.
+ * The VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR flag is almost the same
+ * as VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP, except that the underlying dirty
+ * bitmap is not cleared automatically. The user can clear it manually by
+ * calling the IOCTL with the VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP flag set.
 *
+ * Calling the IOCTL with the VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP flag set
+ * instructs the IOMMU driver to clear the dirty status of pages in a bitmap
+ * for the IOMMU container for a given IOVA range. The user must specify the
+ * IOVA range, the bitmap and the pgsize through the structure
+ * vfio_iommu_type1_dirty_bitmap_get in the data[] portion. This interface
+ * supports clearing a bitmap of the smallest supported pgsize only and can be
+ * modified in future to clear a bitmap of any specified supported pgsize. The
+ * user must provide a memory area for the bitmap memory and specify its size
+ * in bitmap.size. One bit is used to represent one page consecutively
+ * starting from the iova offset. The user should provide the page size in the
+ * bitmap.pgsize field. A bit set in the bitmap indicates that the page at
+ * that offset from iova has its dirty status cleared, and dirty tracking is
+ * re-enabled for that page. The caller must set argsz to a value including
+ * the size of structure vfio_iommu_type1_dirty_bitmap_get, but excluding the
+ * size of the actual bitmap. If dirty pages logging is not enabled, an error
+ * will be returned. Note: the user should clear the dirty log before handling
+ * the corresponding dirty pages.
+ *
+ * Only one of the flags _START, _STOP, _GET, _GET_NOCLEAR and _CLEAR may be
+ * specified at a time.
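+ *
+ * A typical manual-clear cycle is thus: fetch the bitmap with
+ * _GET_BITMAP_NOCLEAR, re-arm tracking with _CLEAR_BITMAP on the same
+ * range, and only then process the pages reported dirty.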
*/ struct vfio_iommu_type1_dirty_bitmap { __u32 argsz; @@ -1142,6 +1290,8 @@ struct vfio_iommu_type1_dirty_bitmap { #define VFIO_IOMMU_DIRTY_PAGES_FLAG_START (1 << 0) #define VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP (1 << 1) #define VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP (1 << 2) +#define VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR (1 << 3) +#define VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP (1 << 4) __u8 data[]; }; @@ -1153,6 +1303,134 @@ struct vfio_iommu_type1_dirty_bitmap_get { #define VFIO_IOMMU_DIRTY_PAGES _IO(VFIO_TYPE, VFIO_BASE + 17) +/* + * VFIO_IOMMU_BIND_PROCESS + * + * Allocate a PASID for a process address space, and use it to attach this + * process to all devices in the container. Devices can then tag their DMA + * traffic with the returned @pasid to perform transactions on the associated + * virtual address space. Mapping and unmapping buffers is performed by standard + * functions such as mmap and malloc. + * + * If flag is VFIO_IOMMU_BIND_PID, @pid contains the pid of a foreign process to + * bind. Otherwise the current task is bound. Given that the caller owns the + * device, setting this flag grants the caller read and write permissions on the + * entire address space of foreign process described by @pid. Therefore, + * permission to perform the bind operation on a foreign process is governed by + * the ptrace access mode PTRACE_MODE_ATTACH_REALCREDS check. See man ptrace(2) + * for more information. + * + * On success, VFIO writes a Process Address Space ID (PASID) into @pasid. This + * ID is unique to a process and can be used on all devices in the container. + * + * On fork, the child inherits the device fd and can use the bonds setup by its + * parent. Consequently, the child has R/W access on the address spaces bound by + * its parent. After an execv, the device fd is closed and the child doesn't + * have access to the address space anymore. + * + * To remove a bond between process and container, VFIO_IOMMU_UNBIND ioctl is + * issued with the same parameters. If a pid was specified in VFIO_IOMMU_BIND, + * it should also be present for VFIO_IOMMU_UNBIND. Otherwise unbind the current + * task from the container. + */ +struct vfio_iommu_type1_bind_process { + __u32 flags; +#define VFIO_IOMMU_BIND_PID (1 << 0) + __u32 pasid; + __s32 pid; +}; + +/* + * Only mode supported at the moment is VFIO_IOMMU_BIND_PROCESS, which takes + * vfio_iommu_type1_bind_process in data. + */ +struct vfio_iommu_type1_bind { + __u32 argsz; + __u32 flags; +#define VFIO_IOMMU_BIND_PROCESS (1 << 0) + __u8 data[]; +}; + +/* + * VFIO_IOMMU_BIND - _IOWR(VFIO_TYPE, VFIO_BASE + 22, struct vfio_iommu_bind) + * + * Manage address spaces of devices in this container. Initially a TYPE1 + * container can only have one address space, managed with + * VFIO_IOMMU_MAP/UNMAP_DMA. + * + * An IOMMU of type VFIO_TYPE1_NESTING_IOMMU can be managed by both MAP/UNMAP + * and BIND ioctls at the same time. MAP/UNMAP acts on the stage-2 (host) page + * tables, and BIND manages the stage-1 (guest) page tables. Other types of + * IOMMU may allow MAP/UNMAP and BIND to coexist, where MAP/UNMAP controls + * non-PASID traffic and BIND controls PASID traffic. But this depends on the + * underlying IOMMU architecture and isn't guaranteed. + * + * Availability of this feature depends on the device, its bus, the underlying + * IOMMU and the CPU architecture. + * + * returns: 0 on success, -errno on failure. 
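+ *
+ * Note: the QEMU code in this series configures the guest stage 1 through
+ * VFIO_IOMMU_SET_PASID_TABLE below; VFIO_IOMMU_BIND/UNBIND are carried
+ * over from the kernel uapi and are not exercised here.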
+
+/*
+ * VFIO_IOMMU_UNBIND - _IOWR(VFIO_TYPE, VFIO_BASE + 23, struct vfio_iommu_bind)
+ *
+ * Undo what was done by the corresponding VFIO_IOMMU_BIND ioctl.
+ */
+#define VFIO_IOMMU_UNBIND	_IO(VFIO_TYPE, VFIO_BASE + 23)
+
+/*
+ * VFIO_IOMMU_SET_PASID_TABLE - _IOWR(VFIO_TYPE, VFIO_BASE + 18,
+ *			struct vfio_iommu_type1_set_pasid_table)
+ *
+ * The SET operation passes a PASID table to the host while the
+ * UNSET operation detaches the one currently programmed. It is
+ * allowed to "SET" the table several times without unsetting as
+ * long as the table config does not stay IOMMU_PASID_CONFIG_TRANSLATE.
+ */
+struct vfio_iommu_type1_set_pasid_table {
+	__u32 argsz;
+	__u32 flags;
+#define VFIO_PASID_TABLE_FLAG_SET	(1 << 0)
+#define VFIO_PASID_TABLE_FLAG_UNSET	(1 << 1)
+	struct iommu_pasid_table_config config; /* used on SET */
+};
+
+#define VFIO_IOMMU_SET_PASID_TABLE	_IO(VFIO_TYPE, VFIO_BASE + 18)
+
+/**
+ * VFIO_IOMMU_CACHE_INVALIDATE - _IOWR(VFIO_TYPE, VFIO_BASE + 19,
+ *			struct vfio_iommu_type1_cache_invalidate)
+ *
+ * Propagate guest IOMMU cache invalidation to the host.
+ */
+struct vfio_iommu_type1_cache_invalidate {
+	__u32 argsz;
+	__u32 flags;
+	struct iommu_cache_invalidate_info info;
+};
+#define VFIO_IOMMU_CACHE_INVALIDATE	_IO(VFIO_TYPE, VFIO_BASE + 19)
+
+/**
+ * VFIO_IOMMU_SET_MSI_BINDING - _IOWR(VFIO_TYPE, VFIO_BASE + 20,
+ *			struct vfio_iommu_type1_set_msi_binding)
+ *
+ * Pass a stage 1 MSI doorbell mapping to the host so that the latter can
+ * build a nested stage 2 mapping, or, conversely, tear down a previously
+ * bound stage 1 MSI binding.
+ */
+struct vfio_iommu_type1_set_msi_binding {
+	__u32 argsz;
+	__u32 flags;
+#define VFIO_IOMMU_BIND_MSI	(1 << 0)
+#define VFIO_IOMMU_UNBIND_MSI	(1 << 1)
+	__u64 iova;	/* MSI guest IOVA */
+	/* Fields below are used on BIND */
+	__u64 gpa;	/* MSI guest physical address */
+	__u64 size;	/* size of stage1 mapping (bytes) */
+};
+#define VFIO_IOMMU_SET_MSI_BINDING	_IO(VFIO_TYPE, VFIO_BASE + 20)
+
 /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
 
 /*
diff --git a/memory.c b/memory.c
index 44713efc66b2b8ed5ec198841f6daa8ae9777cf2..623f89baa41d55cd23f0788edc55a9cc4634272b 100644
--- a/memory.c
+++ b/memory.c
@@ -1825,7 +1825,10 @@ bool memory_region_is_ram_device(MemoryRegion *mr)
 uint8_t memory_region_get_dirty_log_mask(MemoryRegion *mr)
 {
     uint8_t mask = mr->dirty_log_mask;
-    if (global_dirty_log && (mr->ram_block || memory_region_is_iommu(mr))) {
+    RAMBlock *rb = mr->ram_block;
+
+    if (global_dirty_log && ((rb && qemu_ram_is_migratable(rb)) ||
+                             memory_region_is_iommu(mr))) {
         mask |= (1 << DIRTY_MEMORY_MIGRATION);
     }
     return mask;
@@ -2014,6 +2017,16 @@ int memory_region_iommu_num_indexes(IOMMUMemoryRegion *iommu_mr)
     return imrc->num_indexes(iommu_mr);
 }
 
+int memory_region_inject_faults(IOMMUMemoryRegion *iommu_mr, int count,
+                                struct iommu_fault *buf)
+{
+    IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_GET_CLASS(iommu_mr);
+    if (!imrc->inject_faults) {
+        return -ENOENT;
+    }
+    return imrc->inject_faults(iommu_mr, count, buf);
+}
+
 void memory_region_set_log(MemoryRegion *mr, bool log, unsigned client)
 {
     uint8_t mask = 1 << client;
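The memory_region_inject_faults() helper added above simply dispatches to the
new IOMMUMemoryRegionClass hook. A vIOMMU model would wire it up along the
following lines; the type name and the my_iommu_report_fault() helper are
hypothetical, invented purely for illustration:

    #include "qemu/osdep.h"
    #include "exec/memory.h"
    #include <linux/iommu.h>

    /* Forward host-reported faults into the emulated IOMMU so the guest
     * driver can observe them (e.g. on its event queue). */
    static int my_iommu_inject_faults(IOMMUMemoryRegion *iommu_mr, int count,
                                      struct iommu_fault *buf)
    {
        int i;

        for (i = 0; i < count; i++) {
            my_iommu_report_fault(iommu_mr, &buf[i]);   /* hypothetical */
        }
        return 0;
    }

    static void my_iommu_memory_region_class_init(ObjectClass *klass,
                                                  void *data)
    {
        IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);

        imrc->inject_faults = my_iommu_inject_faults;
    }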
diff --git a/migration/migration.c b/migration/migration.c
index 9faf5f63a627a7155f0d2fa1a13c53fa866af4e4..d34e57fc051f2919f5d7278d5ba57b61b9e1698d 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -2541,7 +2541,7 @@ retry:
 out:
     res = qemu_file_get_error(rp);
     if (res) {
-        if (res == -EIO) {
+        if (res == -EIO && migration_in_postcopy()) {
             /*
              * Maybe there is something we can do: it looks like a
              * network down issue, and we pause for a recovery.
diff --git a/migration/ram.c b/migration/ram.c
index 2077ba5be4b917aa2c7f139e69a9502bc2e3c50d..1bd99ff9e5b30bfefe2a17b74442a9c478e8bba0 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -3052,6 +3052,8 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
     int tmppages, pages = 0;
     size_t pagesize_bits =
         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
+    unsigned long hostpage_boundary =
+        QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
 
     if (ramblock_is_ignored(pss->block)) {
         error_report("block %s should not be migrated !", pss->block->idstr);
@@ -3060,29 +3062,31 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
 
     do {
         /* Check the pages is dirty and if it is send it */
-        if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
-            pss->page++;
-            continue;
-        }
+        if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
+            tmppages = ram_save_target_page(rs, pss, last_stage);
+            if (tmppages < 0) {
+                return tmppages;
+            }
 
-        tmppages = ram_save_target_page(rs, pss, last_stage);
-        if (tmppages < 0) {
-            return tmppages;
-        }
+            pages += tmppages;
+            if (pss->block->unsentmap) {
+                clear_bit(pss->page, pss->block->unsentmap);
+            }
 
-        pages += tmppages;
-        if (pss->block->unsentmap) {
-            clear_bit(pss->page, pss->block->unsentmap);
+            /*
+             * Allow rate limiting to happen in the middle of huge pages if
+             * something is sent in the current iteration.
+             */
+            if (pagesize_bits > 1 && tmppages > 0) {
+                migration_rate_limit();
+            }
         }
-
-        pss->page++;
-        /* Allow rate limiting to happen in the middle of huge pages */
-        migration_rate_limit();
+        pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
     } while ((pss->page & (pagesize_bits - 1)) &&
              offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
 
-    /* The offset we leave with is the last one we looked at */
-    pss->page--;
+    /* The offset we leave with is the min boundary of host page and block */
+    pss->page = MIN(pss->page, hostpage_boundary) - 1;
     return pages;
 }
diff --git a/qdev-monitor.c b/qdev-monitor.c
index c6c1d3f06a9f2205639d220545e63c28318709b5..ab2bdef105f2227f1c0c02dcf5ab9cec131210f2 100644
--- a/qdev-monitor.c
+++ b/qdev-monitor.c
@@ -587,7 +587,6 @@ DeviceState *qdev_device_add(QemuOpts *opts, Error **errp)
     if (path != NULL) {
         bus = qbus_find(path, errp);
         if (!bus) {
-            error_setg(errp, "can not find bus for %s", driver);
             return NULL;
         }
         if (!object_dynamic_cast(OBJECT(bus), dc->bus_type)) {
diff --git a/scripts/update-linux-headers.sh b/scripts/update-linux-headers.sh
index f76d77363bbabb955289d06bf478915813928cc2..dfdfdfddcf38c8490278459136e33691eea2bd56 100755
--- a/scripts/update-linux-headers.sh
+++ b/scripts/update-linux-headers.sh
@@ -141,7 +141,7 @@ done
 
 rm -rf "$output/linux-headers/linux"
 mkdir -p "$output/linux-headers/linux"
-for header in kvm.h vfio.h vfio_ccw.h vhost.h \
+for header in kvm.h vfio.h vfio_ccw.h vhost.h iommu.h \
     psci.h psp-sev.h userfaultfd.h mman.h; do
     cp "$tmpdir/include/linux/$header" "$output/linux-headers/linux"
 done
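As a sanity check of the host-page boundary arithmetic introduced in
ram_save_host_page() above, the following standalone snippet walks through the
computation with made-up page numbers; ALIGN_UP stands in for QEMU's
QEMU_ALIGN_UP:

    #include <assert.h>

    #define ALIGN_UP(n, m) (((n) + (m) - 1) / (m) * (m))

    int main(void)
    {
        unsigned long page = 700;           /* current target-page index */
        unsigned long pagesize_bits = 512;  /* 2M host page / 4K target page */
        unsigned long boundary = ALIGN_UP(page + 1, pagesize_bits);
        unsigned long next_dirty = 1500;    /* pretend find_dirty returned this */
        unsigned long resume;

        /* The boundary is the end of the host page containing "page"... */
        assert(boundary == 1024);

        /* ...so the scan never resumes past the current host page: */
        resume = (next_dirty < boundary ? next_dirty : boundary) - 1;
        assert(resume == 1023);
        return 0;
    }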