diff --git a/BUGFIX-Enforce-isolation-for-virtcca_shared_hugepage.patch b/BUGFIX-Enforce-isolation-for-virtcca_shared_hugepage.patch new file mode 100644 index 0000000000000000000000000000000000000000..148b47869f436ccf3ade5d19c8a65086d9780215 --- /dev/null +++ b/BUGFIX-Enforce-isolation-for-virtcca_shared_hugepage.patch @@ -0,0 +1,43 @@ +From 458d90e226d5833661f9257f6af57c14f9b9bdfe Mon Sep 17 00:00:00 2001 +From: gongchangsui +Date: Mon, 17 Mar 2025 02:52:21 -0400 +Subject: [PATCH] BUGFIX: Enforce isolation for virtcca_shared_hugepage + +Add memory isolation enforcement when virtcca hugepage is disabled. + +Signed-off-by: gongchangsui +--- + hw/core/numa.c | 3 ++- + hw/virtio/vhost.c | 2 +- + 2 files changed, 3 insertions(+), 2 deletions(-) + +diff --git a/hw/core/numa.c b/hw/core/numa.c +index e7c48dab61..c691578ef5 100644 +--- a/hw/core/numa.c ++++ b/hw/core/numa.c +@@ -728,7 +728,8 @@ void numa_complete_configuration(MachineState *ms) + memory_region_init(ms->ram, OBJECT(ms), mc->default_ram_id, + ms->ram_size); + numa_init_memdev_container(ms, ms->ram); +- if (virtcca_cvm_enabled() && virtcca_shared_hugepage->ram_block) { ++ if (virtcca_cvm_enabled() && virtcca_shared_hugepage && ++ virtcca_shared_hugepage->ram_block) { + virtcca_shared_memory_configuration(ms); + } + } +diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c +index 8b95558013..4bf0b03977 100644 +--- a/hw/virtio/vhost.c ++++ b/hw/virtio/vhost.c +@@ -1617,7 +1617,7 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque, + hdev->log_size = 0; + hdev->log_enabled = false; + hdev->started = false; +- if (virtcca_cvm_enabled()) { ++ if (virtcca_cvm_enabled() && virtcca_shared_hugepage && virtcca_shared_hugepage->ram_block) { + memory_listener_register(&hdev->memory_listener, + &address_space_virtcca_shared_memory); + } else { +-- +2.41.0.windows.1 + diff --git a/BinDir.tar.gz b/BinDir.tar.gz index 326fcb51c3998db1344613bba4f6cdbf0f858e3d..ee03657d69f4a2b78d8ab5e6e0ba8998bef9e6a4 100644 Binary files 
a/BinDir.tar.gz and b/BinDir.tar.gz differ diff --git a/HostIOMMUDevice-Introduce-realize_late-callback.patch b/HostIOMMUDevice-Introduce-realize_late-callback.patch new file mode 100644 index 0000000000000000000000000000000000000000..f60aa776270db27cf3efc78b48f0aecaba0ff30b --- /dev/null +++ b/HostIOMMUDevice-Introduce-realize_late-callback.patch @@ -0,0 +1,93 @@ +From 53a82c6a5a22bb41e9bd3f754479baf4ce0845bf Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Mon, 5 Aug 2024 09:29:00 +0800 +Subject: [PATCH] HostIOMMUDevice: Introduce realize_late callback + +Previously we have a realize() callback which is called before attachment. +But there are still some elements e.g., ioas not ready before attachment. +So we need a realize_late() callback to further initialize them. + +Currently, this callback is only useful for iommufd backend. For legacy +backend nothing needs to be initialized after attachment. + +Signed-off-by: Zhenzhong Duan +--- + hw/vfio/common.c | 18 +++++++++++++++--- + include/sysemu/host_iommu_device.h | 17 +++++++++++++++++ + 2 files changed, 32 insertions(+), 3 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index a8bc1c6055..0be63c5fbc 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -1654,6 +1654,7 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev, + const VFIOIOMMUClass *ops = + VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY)); + HostIOMMUDevice *hiod = NULL; ++ HostIOMMUDeviceClass *hiod_ops = NULL; + int ret; + + if (vbasedev->iommufd) { +@@ -1664,17 +1665,28 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev, + + if (!vbasedev->mdev) { + hiod = HOST_IOMMU_DEVICE(object_new(ops->hiod_typename)); ++ hiod_ops = HOST_IOMMU_DEVICE_GET_CLASS(hiod); + vbasedev->hiod = hiod; + } + + ret = ops->attach_device(name, vbasedev, as, errp); + if (ret) { +- object_unref(hiod); +- vbasedev->hiod = NULL; +- return ret; ++ goto err_attach; ++ } ++ ++ if (hiod_ops && hiod_ops->realize_late && ++ 
!hiod_ops->realize_late(hiod, vbasedev, errp)) { ++ ops->detach_device(vbasedev); ++ ret = -EINVAL; ++ goto err_attach; + } + + return 0; ++ ++err_attach: ++ object_unref(hiod); ++ vbasedev->hiod = NULL; ++ return ret; + } + + void vfio_detach_device(VFIODevice *vbasedev) +diff --git a/include/sysemu/host_iommu_device.h b/include/sysemu/host_iommu_device.h +index e4d8300350..84131f5495 100644 +--- a/include/sysemu/host_iommu_device.h ++++ b/include/sysemu/host_iommu_device.h +@@ -64,6 +64,23 @@ struct HostIOMMUDeviceClass { + * Returns: true on success, false on failure. + */ + bool (*realize)(HostIOMMUDevice *hiod, void *opaque, Error **errp); ++ /** ++ * @realize_late: initialize host IOMMU device instance after attachment, ++ * some elements e.g., ioas are ready only after attachment. ++ * This callback initialize them. ++ * ++ * Optional callback. ++ * ++ * @hiod: pointer to a host IOMMU device instance. ++ * ++ * @opaque: pointer to agent device of this host IOMMU device, ++ * e.g., VFIO base device or VDPA device. ++ * ++ * @errp: pass an Error out when realize fails. ++ * ++ * Returns: true on success, false on failure. ++ */ ++ bool (*realize_late)(HostIOMMUDevice *hiod, void *opaque, Error **errp); + /** + * @get_cap: check if a host IOMMU device capability is supported. + * +-- +2.41.0.windows.1 + diff --git a/HostIOMMUDevice-Store-the-VFIO-VDPA-agent.patch b/HostIOMMUDevice-Store-the-VFIO-VDPA-agent.patch new file mode 100644 index 0000000000000000000000000000000000000000..34ee17e76c07991a3833957481776df0719a9ecb --- /dev/null +++ b/HostIOMMUDevice-Store-the-VFIO-VDPA-agent.patch @@ -0,0 +1,57 @@ +From 35f33bf18826286c9e9fc739a893b9915c71f43c Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Fri, 14 Jun 2024 11:52:51 +0200 +Subject: [PATCH] HostIOMMUDevice: Store the VFIO/VDPA agent + +Store the agent device (VFIO or VDPA) in the host IOMMU device. +This will allow easy access to some of its resources. 
+ +Signed-off-by: Eric Auger +Reviewed-by: Zhenzhong Duan +Reviewed-by: Michael S. Tsirkin +--- + hw/vfio/container.c | 1 + + hw/vfio/iommufd.c | 2 ++ + include/sysemu/host_iommu_device.h | 1 + + 3 files changed, 4 insertions(+) + +diff --git a/hw/vfio/container.c b/hw/vfio/container.c +index 10f7635425..8a5a112b6b 100644 +--- a/hw/vfio/container.c ++++ b/hw/vfio/container.c +@@ -1259,6 +1259,7 @@ static bool hiod_legacy_vfio_realize(HostIOMMUDevice *hiod, void *opaque, + + hiod->name = g_strdup(vdev->name); + hiod->caps.aw_bits = vfio_device_get_aw_bits(vdev); ++ hiod->agent = opaque; + + return true; + } +diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c +index 3b75cba26c..7a069ca576 100644 +--- a/hw/vfio/iommufd.c ++++ b/hw/vfio/iommufd.c +@@ -735,6 +735,8 @@ static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, + } data; + uint64_t hw_caps; + ++ hiod->agent = opaque; ++ + if (!iommufd_backend_get_device_info(vdev->iommufd, vdev->devid, + &type, &data, sizeof(data), + &hw_caps, errp)) { +diff --git a/include/sysemu/host_iommu_device.h b/include/sysemu/host_iommu_device.h +index a57873958b..3e5f058e7b 100644 +--- a/include/sysemu/host_iommu_device.h ++++ b/include/sysemu/host_iommu_device.h +@@ -34,6 +34,7 @@ struct HostIOMMUDevice { + Object parent_obj; + + char *name; ++ void *agent; /* pointer to agent device, ie. 
VFIO or VDPA device */ + HostIOMMUDeviceCaps caps; + }; + +-- +2.41.0.windows.1 + diff --git a/Revert-linux-user-Print-tid-not-pid-with-strace.patch b/Revert-linux-user-Print-tid-not-pid-with-strace.patch new file mode 100644 index 0000000000000000000000000000000000000000..ec949f7a8d54daf0d40dd772f312e5a42cb22e64 --- /dev/null +++ b/Revert-linux-user-Print-tid-not-pid-with-strace.patch @@ -0,0 +1,32 @@ +From c0717e82e34f96af456309b3786a6808e8e324e4 Mon Sep 17 00:00:00 2001 +From: huangyan +Date: Wed, 16 Apr 2025 00:43:27 +0800 +Subject: [PATCH] Revert "linux-user: Print tid not pid with strace" +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This reverts commit 2f37362de1d971cc90c35405705bfa22a33f6cd8. + +* This change is incomplete: "get_task_state" lacks an implementation. +* Moreover, it requires all calls to the "getpid" function to be changed to use "get_task_state", which would cause too much disruption, and it has not been applied in upstream 8.2.0.
+--- + linux-user/strace.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/linux-user/strace.c b/linux-user/strace.c +index ac9177ebe4..cf26e55264 100644 +--- a/linux-user/strace.c ++++ b/linux-user/strace.c +@@ -4176,7 +4176,7 @@ print_syscall(CPUArchState *cpu_env, int num, + if (!f) { + return; + } +- fprintf(f, "%d ", get_task_state(env_cpu(cpu_env))->ts_tid); ++ fprintf(f, "%d ", getpid()); + + for (i = 0; i < nsyscalls; i++) { + if (scnames[i].nr == num) { +-- +2.41.0.windows.1 + diff --git a/Update-iommufd.h-header-for-vSVA.patch b/Update-iommufd.h-header-for-vSVA.patch new file mode 100644 index 0000000000000000000000000000000000000000..6c75416a8f5d9cf35fe83818a48e171aca881a34 --- /dev/null +++ b/Update-iommufd.h-header-for-vSVA.patch @@ -0,0 +1,514 @@ +From ac715e361fdb6d92169b3b3f5964405c816a13ac Mon Sep 17 00:00:00 2001 +From: Shameer Kolothum +Date: Tue, 14 Jan 2025 10:29:24 +0000 +Subject: [PATCH] Update iommufd.h header for vSVA + +This is based on Linaro UADK branch: +https://github.com/Linaro/linux-kernel-uadk/tree/6.12-wip-10.26 + +Signed-off-by: Shameer Kolothum +--- + linux-headers/linux/iommufd.h | 394 ++++++++++++++++++++++++++++++++-- + 1 file changed, 371 insertions(+), 23 deletions(-) + +diff --git a/linux-headers/linux/iommufd.h b/linux-headers/linux/iommufd.h +index 806d98d09c..41559c6064 100644 +--- a/linux-headers/linux/iommufd.h ++++ b/linux-headers/linux/iommufd.h +@@ -37,18 +37,22 @@ + enum { + IOMMUFD_CMD_BASE = 0x80, + IOMMUFD_CMD_DESTROY = IOMMUFD_CMD_BASE, +- IOMMUFD_CMD_IOAS_ALLOC, +- IOMMUFD_CMD_IOAS_ALLOW_IOVAS, +- IOMMUFD_CMD_IOAS_COPY, +- IOMMUFD_CMD_IOAS_IOVA_RANGES, +- IOMMUFD_CMD_IOAS_MAP, +- IOMMUFD_CMD_IOAS_UNMAP, +- IOMMUFD_CMD_OPTION, +- IOMMUFD_CMD_VFIO_IOAS, +- IOMMUFD_CMD_HWPT_ALLOC, +- IOMMUFD_CMD_GET_HW_INFO, +- IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING, +- IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP, ++ IOMMUFD_CMD_IOAS_ALLOC = 0x81, ++ IOMMUFD_CMD_IOAS_ALLOW_IOVAS = 0x82, ++ IOMMUFD_CMD_IOAS_COPY = 0x83, 
++ IOMMUFD_CMD_IOAS_IOVA_RANGES = 0x84, ++ IOMMUFD_CMD_IOAS_MAP = 0x85, ++ IOMMUFD_CMD_IOAS_UNMAP = 0x86, ++ IOMMUFD_CMD_OPTION = 0x87, ++ IOMMUFD_CMD_VFIO_IOAS = 0x88, ++ IOMMUFD_CMD_HWPT_ALLOC = 0x89, ++ IOMMUFD_CMD_GET_HW_INFO = 0x8a, ++ IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING = 0x8b, ++ IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP = 0x8c, ++ IOMMUFD_CMD_HWPT_INVALIDATE = 0x8d, ++ IOMMUFD_CMD_FAULT_QUEUE_ALLOC = 0x8e, ++ IOMMUFD_CMD_VIOMMU_ALLOC = 0x8f, ++ IOMMUFD_CMD_VDEVICE_ALLOC = 0x90, + }; + + /** +@@ -355,10 +359,13 @@ struct iommu_vfio_ioas { + * the parent HWPT in a nesting configuration. + * @IOMMU_HWPT_ALLOC_DIRTY_TRACKING: Dirty tracking support for device IOMMU is + * enforced on device attachment ++ * @IOMMU_HWPT_FAULT_ID_VALID: The fault_id field of hwpt allocation data is ++ * valid. + */ + enum iommufd_hwpt_alloc_flags { + IOMMU_HWPT_ALLOC_NEST_PARENT = 1 << 0, + IOMMU_HWPT_ALLOC_DIRTY_TRACKING = 1 << 1, ++ IOMMU_HWPT_FAULT_ID_VALID = 1 << 2, + }; + + /** +@@ -389,14 +396,34 @@ struct iommu_hwpt_vtd_s1 { + __u32 __reserved; + }; + ++/** ++ * struct iommu_hwpt_arm_smmuv3 - ARM SMMUv3 Context Descriptor Table info ++ * (IOMMU_HWPT_DATA_ARM_SMMUV3) ++ * ++ * @ste: The first two double words of the user space Stream Table Entry for ++ * a user stage-1 Context Descriptor Table. Must be little-endian. ++ * Allowed fields: (Refer to "5.2 Stream Table Entry" in SMMUv3 HW Spec) ++ * - word-0: V, Cfg, S1Fmt, S1ContextPtr, S1CDMax ++ * - word-1: EATS, S1DSS, S1CIR, S1COR, S1CSH, S1STALLD ++ * ++ * -EIO will be returned if @ste is not legal or contains any non-allowed field. ++ * Cfg can be used to select a S1, Bypass or Abort configuration. A Bypass ++ * nested domain will translate the same as the nesting parent. 
++ */ ++struct iommu_hwpt_arm_smmuv3 { ++ __aligned_le64 ste[2]; ++}; ++ + /** + * enum iommu_hwpt_data_type - IOMMU HWPT Data Type + * @IOMMU_HWPT_DATA_NONE: no data + * @IOMMU_HWPT_DATA_VTD_S1: Intel VT-d stage-1 page table ++ * @IOMMU_HWPT_DATA_ARM_SMMUV3: ARM SMMUv3 Context Descriptor Table + */ + enum iommu_hwpt_data_type { +- IOMMU_HWPT_DATA_NONE, +- IOMMU_HWPT_DATA_VTD_S1, ++ IOMMU_HWPT_DATA_NONE = 0, ++ IOMMU_HWPT_DATA_VTD_S1 = 1, ++ IOMMU_HWPT_DATA_ARM_SMMUV3 = 2, + }; + + /** +@@ -404,12 +431,15 @@ enum iommu_hwpt_data_type { + * @size: sizeof(struct iommu_hwpt_alloc) + * @flags: Combination of enum iommufd_hwpt_alloc_flags + * @dev_id: The device to allocate this HWPT for +- * @pt_id: The IOAS or HWPT to connect this HWPT to ++ * @pt_id: The IOAS or HWPT or vIOMMU to connect this HWPT to + * @out_hwpt_id: The ID of the new HWPT + * @__reserved: Must be 0 + * @data_type: One of enum iommu_hwpt_data_type + * @data_len: Length of the type specific data + * @data_uptr: User pointer to the type specific data ++ * @fault_id: The ID of IOMMUFD_FAULT object. Valid only if flags field of ++ * IOMMU_HWPT_FAULT_ID_VALID is set. ++ * @__reserved2: Padding to 64-bit alignment. Must be 0. + * + * Explicitly allocate a hardware page table object. This is the same object + * type that is returned by iommufd_device_attach() and represents the +@@ -420,11 +450,13 @@ enum iommu_hwpt_data_type { + * IOMMU_HWPT_DATA_NONE. The HWPT can be allocated as a parent HWPT for a + * nesting configuration by passing IOMMU_HWPT_ALLOC_NEST_PARENT via @flags. + * +- * A user-managed nested HWPT will be created from a given parent HWPT via +- * @pt_id, in which the parent HWPT must be allocated previously via the +- * same ioctl from a given IOAS (@pt_id). In this case, the @data_type +- * must be set to a pre-defined type corresponding to an I/O page table +- * type supported by the underlying IOMMU hardware. 
++ * A user-managed nested HWPT will be created from a given vIOMMU (wrapping a ++ * parent HWPT) or a parent HWPT via @pt_id, in which the parent HWPT must be ++ * allocated previously via the same ioctl from a given IOAS (@pt_id). In this ++ * case, the @data_type must be set to a pre-defined type corresponding to an ++ * I/O page table type supported by the underlying IOMMU hardware. The device ++ * via @dev_id and the vIOMMU via @pt_id must be associated to the same IOMMU ++ * instance. + * + * If the @data_type is set to IOMMU_HWPT_DATA_NONE, @data_len and + * @data_uptr should be zero. Otherwise, both @data_len and @data_uptr +@@ -440,6 +472,8 @@ struct iommu_hwpt_alloc { + __u32 data_type; + __u32 data_len; + __aligned_u64 data_uptr; ++ __u32 fault_id; ++ __u32 __reserved2; + }; + #define IOMMU_HWPT_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_ALLOC) + +@@ -474,15 +508,50 @@ struct iommu_hw_info_vtd { + __aligned_u64 ecap_reg; + }; + ++/** ++ * struct iommu_hw_info_arm_smmuv3 - ARM SMMUv3 hardware information ++ * (IOMMU_HW_INFO_TYPE_ARM_SMMUV3) ++ * ++ * @flags: Must be set to 0 ++ * @__reserved: Must be 0 ++ * @idr: Implemented features for ARM SMMU Non-secure programming interface ++ * @iidr: Information about the implementation and implementer of ARM SMMU, ++ * and architecture version supported ++ * @aidr: ARM SMMU architecture version ++ * ++ * For the details of @idr, @iidr and @aidr, please refer to the chapters ++ * from 6.3.1 to 6.3.6 in the SMMUv3 Spec. ++ * ++ * User space should read the underlying ARM SMMUv3 hardware information for ++ * the list of supported features. ++ * ++ * Note that these values reflect the raw HW capability, without any insight if ++ * any required kernel driver support is present. Bits may be set indicating the ++ * HW has functionality that is lacking kernel software support, such as BTM. 
If ++ * a VMM is using this information to construct emulated copies of these ++ * registers it should only forward bits that it knows it can support. ++ * ++ * In future, presence of required kernel support will be indicated in flags. ++ */ ++struct iommu_hw_info_arm_smmuv3 { ++ __u32 flags; ++ __u32 __reserved; ++ __u32 idr[6]; ++ __u32 iidr; ++ __u32 aidr; ++}; ++ + /** + * enum iommu_hw_info_type - IOMMU Hardware Info Types + * @IOMMU_HW_INFO_TYPE_NONE: Used by the drivers that do not report hardware + * info + * @IOMMU_HW_INFO_TYPE_INTEL_VTD: Intel VT-d iommu info type ++ * @IOMMU_HW_INFO_TYPE_ARM_SMMUV3: ARM SMMUv3 iommu info type + */ + enum iommu_hw_info_type { +- IOMMU_HW_INFO_TYPE_NONE, +- IOMMU_HW_INFO_TYPE_INTEL_VTD, ++ IOMMU_HW_INFO_TYPE_NONE = 0, ++ IOMMU_HW_INFO_TYPE_INTEL_VTD = 1, ++ IOMMU_HW_INFO_TYPE_ARM_SMMUV3 = 2, + }; + + /** +@@ -494,9 +563,17 @@ enum iommu_hw_info_type { + * IOMMU_HWPT_GET_DIRTY_BITMAP + * IOMMU_HWPT_SET_DIRTY_TRACKING + * ++ * @IOMMU_HW_CAP_PASID_EXEC: Execute Permission Supported, user ignores it ++ * when the struct iommu_hw_info::out_max_pasid_log2 ++ * is zero. ++ * @IOMMU_HW_CAP_PASID_PRIV: Privileged Mode Supported, user ignores it ++ * when the struct iommu_hw_info::out_max_pasid_log2 ++ * is zero. + */ + enum iommufd_hw_capabilities { + IOMMU_HW_CAP_DIRTY_TRACKING = 1 << 0, ++ IOMMU_HW_CAP_PCI_PASID_EXEC = 1 << 1, ++ IOMMU_HW_CAP_PCI_PASID_PRIV = 1 << 2, + }; + + /** +@@ -512,6 +589,9 @@ enum iommufd_hw_capabilities { + * iommu_hw_info_type. + * @out_capabilities: Output the generic iommu capability info type as defined + * in the enum iommu_hw_capabilities. ++ * @out_max_pasid_log2: Output the width of PASIDs. 0 means no PASID support. ++ * PCI devices turn to out_capabilities to check if the ++ * specific capabilities is supported or not. 
+ * @__reserved: Must be 0 + * + * Query an iommu type specific hardware information data from an iommu behind +@@ -535,7 +615,8 @@ struct iommu_hw_info { + __u32 data_len; + __aligned_u64 data_uptr; + __u32 out_data_type; +- __u32 __reserved; ++ __u8 out_max_pasid_log2; ++ __u8 __reserved[3]; + __aligned_u64 out_capabilities; + }; + #define IOMMU_GET_HW_INFO _IO(IOMMUFD_TYPE, IOMMUFD_CMD_GET_HW_INFO) +@@ -613,4 +694,271 @@ struct iommu_hwpt_get_dirty_bitmap { + #define IOMMU_HWPT_GET_DIRTY_BITMAP _IO(IOMMUFD_TYPE, \ + IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP) + ++/** ++ * enum iommu_hwpt_invalidate_data_type - IOMMU HWPT Cache Invalidation ++ * Data Type ++ * @IOMMU_HWPT_INVALIDATE_DATA_VTD_S1: Invalidation data for VTD_S1 ++ * @IOMMU_VIOMMU_INVALIDATE_DATA_ARM_SMMUV3: Invalidation data for ARM SMMUv3 ++ */ ++enum iommu_hwpt_invalidate_data_type { ++ IOMMU_HWPT_INVALIDATE_DATA_VTD_S1 = 0, ++ IOMMU_VIOMMU_INVALIDATE_DATA_ARM_SMMUV3 = 1, ++}; ++ ++/** ++ * enum iommu_hwpt_vtd_s1_invalidate_flags - Flags for Intel VT-d ++ * stage-1 cache invalidation ++ * @IOMMU_VTD_INV_FLAGS_LEAF: Indicates whether the invalidation applies ++ * to all-levels page structure cache or just ++ * the leaf PTE cache. ++ */ ++enum iommu_hwpt_vtd_s1_invalidate_flags { ++ IOMMU_VTD_INV_FLAGS_LEAF = 1 << 0, ++}; ++ ++/** ++ * struct iommu_hwpt_vtd_s1_invalidate - Intel VT-d cache invalidation ++ * (IOMMU_HWPT_INVALIDATE_DATA_VTD_S1) ++ * @addr: The start address of the range to be invalidated. It needs to ++ * be 4KB aligned. ++ * @npages: Number of contiguous 4K pages to be invalidated. ++ * @flags: Combination of enum iommu_hwpt_vtd_s1_invalidate_flags ++ * @__reserved: Must be 0 ++ * ++ * The Intel VT-d specific invalidation data for user-managed stage-1 cache ++ * invalidation in nested translation. Userspace uses this structure to ++ * tell the impacted cache scope after modifying the stage-1 page table. 
++ * ++ * Invalidating all the caches related to the page table by setting @addr ++ * to be 0 and @npages to be U64_MAX. ++ * ++ * The device TLB will be invalidated automatically if ATS is enabled. ++ */ ++struct iommu_hwpt_vtd_s1_invalidate { ++ __aligned_u64 addr; ++ __aligned_u64 npages; ++ __u32 flags; ++ __u32 __reserved; ++}; ++ ++/** ++ * struct iommu_viommu_arm_smmuv3_invalidate - ARM SMMUv3 cahce invalidation ++ * (IOMMU_VIOMMU_INVALIDATE_DATA_ARM_SMMUV3) ++ * @cmd: 128-bit cache invalidation command that runs in SMMU CMDQ. ++ * Must be little-endian. ++ * ++ * Supported command list only when passing in a vIOMMU via @hwpt_id: ++ * CMDQ_OP_TLBI_NSNH_ALL ++ * CMDQ_OP_TLBI_NH_VA ++ * CMDQ_OP_TLBI_NH_VAA ++ * CMDQ_OP_TLBI_NH_ALL ++ * CMDQ_OP_TLBI_NH_ASID ++ * CMDQ_OP_ATC_INV ++ * CMDQ_OP_CFGI_CD ++ * CMDQ_OP_CFGI_CD_ALL ++ * ++ * -EIO will be returned if the command is not supported. ++ */ ++struct iommu_viommu_arm_smmuv3_invalidate { ++ __aligned_le64 cmd[2]; ++}; ++ ++/** ++ * struct iommu_hwpt_invalidate - ioctl(IOMMU_HWPT_INVALIDATE) ++ * @size: sizeof(struct iommu_hwpt_invalidate) ++ * @hwpt_id: ID of a nested HWPT or a vIOMMU, for cache invalidation ++ * @data_uptr: User pointer to an array of driver-specific cache invalidation ++ * data. ++ * @data_type: One of enum iommu_hwpt_invalidate_data_type, defining the data ++ * type of all the entries in the invalidation request array. It ++ * should be a type supported by the hwpt pointed by @hwpt_id. ++ * @entry_len: Length (in bytes) of a request entry in the request array ++ * @entry_num: Input the number of cache invalidation requests in the array. ++ * Output the number of requests successfully handled by kernel. ++ * @__reserved: Must be 0. ++ * ++ * Invalidate iommu cache for user-managed page table or vIOMMU. Modifications ++ * on a user-managed page table should be followed by this operation, if a HWPT ++ * is passed in via @hwpt_id. 
Other caches, such as device cache or descriptor ++ * cache can be flushed if a vIOMMU is passed in via the @hwpt_id field. ++ * ++ * Each ioctl can support one or more cache invalidation requests in the array ++ * that has a total size of @entry_len * @entry_num. ++ * ++ * An empty invalidation request array by setting @entry_num==0 is allowed, and ++ * @entry_len and @data_uptr would be ignored in this case. This can be used to ++ * check if the given @data_type is supported or not by kernel. ++ */ ++struct iommu_hwpt_invalidate { ++ __u32 size; ++ __u32 hwpt_id; ++ __aligned_u64 data_uptr; ++ __u32 data_type; ++ __u32 entry_len; ++ __u32 entry_num; ++ __u32 __reserved; ++}; ++#define IOMMU_HWPT_INVALIDATE _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_INVALIDATE) ++ ++/** ++ * enum iommu_hwpt_pgfault_flags - flags for struct iommu_hwpt_pgfault ++ * @IOMMU_PGFAULT_FLAGS_PASID_VALID: The pasid field of the fault data is ++ * valid. ++ * @IOMMU_PGFAULT_FLAGS_LAST_PAGE: It's the last fault of a fault group. ++ */ ++enum iommu_hwpt_pgfault_flags { ++ IOMMU_PGFAULT_FLAGS_PASID_VALID = (1 << 0), ++ IOMMU_PGFAULT_FLAGS_LAST_PAGE = (1 << 1), ++}; ++ ++/** ++ * enum iommu_hwpt_pgfault_perm - perm bits for struct iommu_hwpt_pgfault ++ * @IOMMU_PGFAULT_PERM_READ: request for read permission ++ * @IOMMU_PGFAULT_PERM_WRITE: request for write permission ++ * @IOMMU_PGFAULT_PERM_EXEC: (PCIE 10.4.1) request with a PASID that has the ++ * Execute Requested bit set in PASID TLP Prefix. ++ * @IOMMU_PGFAULT_PERM_PRIV: (PCIE 10.4.1) request with a PASID that has the ++ * Privileged Mode Requested bit set in PASID TLP ++ * Prefix. 
++ */ ++enum iommu_hwpt_pgfault_perm { ++ IOMMU_PGFAULT_PERM_READ = (1 << 0), ++ IOMMU_PGFAULT_PERM_WRITE = (1 << 1), ++ IOMMU_PGFAULT_PERM_EXEC = (1 << 2), ++ IOMMU_PGFAULT_PERM_PRIV = (1 << 3), ++}; ++ ++/** ++ * struct iommu_hwpt_pgfault - iommu page fault data ++ * @flags: Combination of enum iommu_hwpt_pgfault_flags ++ * @dev_id: id of the originated device ++ * @pasid: Process Address Space ID ++ * @grpid: Page Request Group Index ++ * @perm: Combination of enum iommu_hwpt_pgfault_perm ++ * @addr: Fault address ++ * @length: a hint of how much data the requestor is expecting to fetch. For ++ * example, if the PRI initiator knows it is going to do a 10MB ++ * transfer, it could fill in 10MB and the OS could pre-fault in ++ * 10MB of IOVA. It's default to 0 if there's no such hint. ++ * @cookie: kernel-managed cookie identifying a group of fault messages. The ++ * cookie number encoded in the last page fault of the group should ++ * be echoed back in the response message. ++ */ ++struct iommu_hwpt_pgfault { ++ __u32 flags; ++ __u32 dev_id; ++ __u32 pasid; ++ __u32 grpid; ++ __u32 perm; ++ __u64 addr; ++ __u32 length; ++ __u32 cookie; ++}; ++ ++/** ++ * enum iommufd_page_response_code - Return status of fault handlers ++ * @IOMMUFD_PAGE_RESP_SUCCESS: Fault has been handled and the page tables ++ * populated, retry the access. This is the ++ * "Success" defined in PCI 10.4.2.1. ++ * @IOMMUFD_PAGE_RESP_INVALID: Could not handle this fault, don't retry the ++ * access. This is the "Invalid Request" in PCI ++ * 10.4.2.1. ++ */ ++enum iommufd_page_response_code { ++ IOMMUFD_PAGE_RESP_SUCCESS = 0, ++ IOMMUFD_PAGE_RESP_INVALID = 1, ++}; ++ ++/** ++ * struct iommu_hwpt_page_response - IOMMU page fault response ++ * @cookie: The kernel-managed cookie reported in the fault message. ++ * @code: One of response code in enum iommufd_page_response_code. 
++ */ ++struct iommu_hwpt_page_response { ++ __u32 cookie; ++ __u32 code; ++}; ++ ++/** ++ * struct iommu_fault_alloc - ioctl(IOMMU_FAULT_QUEUE_ALLOC) ++ * @size: sizeof(struct iommu_fault_alloc) ++ * @flags: Must be 0 ++ * @out_fault_id: The ID of the new FAULT ++ * @out_fault_fd: The fd of the new FAULT ++ * ++ * Explicitly allocate a fault handling object. ++ */ ++struct iommu_fault_alloc { ++ __u32 size; ++ __u32 flags; ++ __u32 out_fault_id; ++ __u32 out_fault_fd; ++}; ++#define IOMMU_FAULT_QUEUE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_FAULT_QUEUE_ALLOC) ++ ++/** ++ * enum iommu_viommu_type - Virtual IOMMU Type ++ * @IOMMU_VIOMMU_TYPE_DEFAULT: Reserved for future use ++ * @IOMMU_VIOMMU_TYPE_ARM_SMMUV3: ARM SMMUv3 driver specific type ++ */ ++enum iommu_viommu_type { ++ IOMMU_VIOMMU_TYPE_DEFAULT = 0, ++ IOMMU_VIOMMU_TYPE_ARM_SMMUV3 = 1, ++}; ++ ++/** ++ * struct iommu_viommu_alloc - ioctl(IOMMU_VIOMMU_ALLOC) ++ * @size: sizeof(struct iommu_viommu_alloc) ++ * @flags: Must be 0 ++ * @type: Type of the virtual IOMMU. Must be defined in enum iommu_viommu_type ++ * @dev_id: The device's physical IOMMU will be used to back the virtual IOMMU ++ * @hwpt_id: ID of a nesting parent HWPT to associate to ++ * @out_viommu_id: Output virtual IOMMU ID for the allocated object ++ * ++ * Allocate a virtual IOMMU object, representing the underlying physical IOMMU's ++ * virtualization support that is a security-isolated slice of the real IOMMU HW ++ * that is unique to a specific VM. Operations global to the IOMMU are connected ++ * to the vIOMMU, such as: ++ * - Security namespace for guest owned ID, e.g. guest-controlled cache tags ++ * - Access to a sharable nesting parent pagetable across physical IOMMUs ++ * - Non-affiliated event reporting (e.g. an invalidation queue error) ++ * - Virtualization of various platforms IDs, e.g. 
RIDs and others ++ * - Delivery of paravirtualized invalidation ++ * - Direct assigned invalidation queues ++ * - Direct assigned interrupts ++ */ ++struct iommu_viommu_alloc { ++ __u32 size; ++ __u32 flags; ++ __u32 type; ++ __u32 dev_id; ++ __u32 hwpt_id; ++ __u32 out_viommu_id; ++}; ++#define IOMMU_VIOMMU_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VIOMMU_ALLOC) ++ ++/** ++ * struct iommu_vdevice_alloc - ioctl(IOMMU_VDEVICE_ALLOC) ++ * @size: sizeof(struct iommu_vdevice_alloc) ++ * @viommu_id: vIOMMU ID to associate with the virtual device ++ * @dev_id: The pyhsical device to allocate a virtual instance on the vIOMMU ++ * @__reserved: Must be 0 ++ * @virt_id: Virtual device ID per vIOMMU, e.g. vSID of ARM SMMUv3, vDeviceID ++ * of AMD IOMMU, and vID of a nested Intel VT-d to a Context Table. ++ * @out_vdevice_id: Output virtual instance ID for the allocated object ++ * @__reserved2: Must be 0 ++ * ++ * Allocate a virtual device instance (for a physical device) against a vIOMMU. ++ * This instance holds the device's information (related to its vIOMMU) in a VM. ++ */ ++struct iommu_vdevice_alloc { ++ __u32 size; ++ __u32 viommu_id; ++ __u32 dev_id; ++ __u32 __reserved; ++ __aligned_u64 virt_id; ++ __u32 out_vdevice_id; ++ __u32 __reserved2; ++}; ++#define IOMMU_VDEVICE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VDEVICE_ALLOC) + #endif +-- +2.41.0.windows.1 + diff --git a/acpi-gpex-Fix-PCI-Express-Slot-Information-function-.patch b/acpi-gpex-Fix-PCI-Express-Slot-Information-function-.patch new file mode 100644 index 0000000000000000000000000000000000000000..9238a792eb63cffc05255d675e2a6a84b7dbc3d7 --- /dev/null +++ b/acpi-gpex-Fix-PCI-Express-Slot-Information-function-.patch @@ -0,0 +1,32 @@ +From 237fdc8ddb0598234aace9c88ac4c8387119a12a Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Thu, 7 Jul 2022 11:55:25 -0400 +Subject: [PATCH] acpi/gpex: Fix PCI Express Slot Information function 0 + returned value + +At the moment we do not support other function than function 0. 
+So according to ACPI spec "_DSM (Device Specific Method)" +description, bit 0 should rather be 0, meaning no other function is +supported than function 0. + +Signed-off-by: Eric Auger +--- + hw/pci-host/gpex-acpi.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/pci-host/gpex-acpi.c b/hw/pci-host/gpex-acpi.c +index 1092dc3b70..ac5d229757 100644 +--- a/hw/pci-host/gpex-acpi.c ++++ b/hw/pci-host/gpex-acpi.c +@@ -113,7 +113,7 @@ static void acpi_dsdt_add_pci_osc(Aml *dev) + UUID = aml_touuid("E5C937D0-3553-4D7A-9117-EA4D19C3434D"); + ifctx = aml_if(aml_equal(aml_arg(0), UUID)); + ifctx1 = aml_if(aml_equal(aml_arg(2), aml_int(0))); +- uint8_t byte_list[1] = {1}; ++ uint8_t byte_list[1] = {0}; + buf = aml_buffer(1, byte_list); + aml_append(ifctx1, aml_return(buf)); + aml_append(ifctx, ifctx1); +-- +2.41.0.windows.1 + diff --git a/arm-VirtCCA-CVM-support-UEFI-boot.patch b/arm-VirtCCA-CVM-support-UEFI-boot.patch new file mode 100644 index 0000000000000000000000000000000000000000..37af6303f4801f9f1fb9735e4fc6d6772db66459 --- /dev/null +++ b/arm-VirtCCA-CVM-support-UEFI-boot.patch @@ -0,0 +1,189 @@ +From 9eacd1a6df6861b76663e98133adb15059bf65cc Mon Sep 17 00:00:00 2001 +From: gongchangsui +Date: Mon, 17 Mar 2025 02:40:50 -0400 +Subject: [PATCH] arm: VirtCCA: CVM support UEFI boot + +1. Add UEFI boot support for Confidential VMs. +2. Modify the base memory address of Confidential VMs from 3GB to 1GB. +3. Disable pflash boot support for Confidential VMs; use the `-bios` option to specify `QEMU_EFI.fd` during launch.
+ +Signed-off-by: gongchangsui +--- + hw/arm/boot.c | 38 ++++++++++++++++++++++++++++++++++++-- + hw/arm/virt.c | 33 ++++++++++++++++++++++++++++++++- + include/hw/arm/boot.h | 3 +++ + 3 files changed, 71 insertions(+), 3 deletions(-) + +diff --git a/hw/arm/boot.c b/hw/arm/boot.c +index 42110b0f18..6b2f46af4d 100644 +--- a/hw/arm/boot.c ++++ b/hw/arm/boot.c +@@ -43,6 +43,9 @@ + + #define BOOTLOADER_MAX_SIZE (4 * KiB) + ++#define UEFI_MAX_SIZE 0x8000000 ++#define UEFI_LOADER_START 0x0 ++#define DTB_MAX 0x200000 + AddressSpace *arm_boot_address_space(ARMCPU *cpu, + const struct arm_boot_info *info) + { +@@ -1155,7 +1158,31 @@ static void arm_setup_direct_kernel_boot(ARMCPU *cpu, + } + } + +-static void arm_setup_firmware_boot(ARMCPU *cpu, struct arm_boot_info *info) ++static void arm_setup_confidential_firmware_boot(ARMCPU *cpu, ++ struct arm_boot_info *info, ++ const char *firmware_filename) ++{ ++ ssize_t fw_size; ++ const char *fname; ++ AddressSpace *as = arm_boot_address_space(cpu, info); ++ ++ fname = qemu_find_file(QEMU_FILE_TYPE_BIOS, firmware_filename); ++ if (!fname) { ++ error_report("Could not find firmware image '%s'", firmware_filename); ++ exit(EXIT_FAILURE); ++ } ++ ++ fw_size = load_image_targphys_as(fname, /* load the path resolved by qemu_find_file(), not the raw -bios name */ ++ info->firmware_base, ++ info->firmware_max_size, as); ++ ++ if (fw_size <= 0) { ++ error_report("could not load firmware '%s'", firmware_filename); ++ exit(EXIT_FAILURE); ++ } ++} ++ ++static void arm_setup_firmware_boot(ARMCPU *cpu, struct arm_boot_info *info, const char *firmware_filename) + { + /* Set up for booting firmware (which might load a kernel via fw_cfg) */ + +@@ -1166,6 +1193,8 @@ static void arm_setup_firmware_boot(ARMCPU *cpu, struct arm_boot_info *info) + * DTB to the base of RAM for the bootloader to pick up.
+ */ + info->dtb_start = info->loader_start; ++ if (info->confidential) ++ tmm_add_ram_region(UEFI_LOADER_START, UEFI_MAX_SIZE, info->dtb_start, DTB_MAX , true); + } + + if (info->kernel_filename) { +@@ -1206,6 +1235,11 @@ static void arm_setup_firmware_boot(ARMCPU *cpu, struct arm_boot_info *info) + } + } + ++ if (info->confidential) { ++ arm_setup_confidential_firmware_boot(cpu, info, firmware_filename); ++ kvm_load_user_data(UEFI_LOADER_START, UEFI_MAX_SIZE, info->loader_start, info->loader_start + DTB_MAX, info->ram_size, ++ (struct kvm_numa_info *)info->numa_info); ++ } + /* + * We will start from address 0 (typically a boot ROM image) in the + * same way as hardware. Leave env->boot_info NULL, so that +@@ -1282,7 +1316,7 @@ void arm_load_kernel(ARMCPU *cpu, MachineState *ms, struct arm_boot_info *info) + + /* Load the kernel. */ + if (!info->kernel_filename || info->firmware_loaded) { +- arm_setup_firmware_boot(cpu, info); ++ arm_setup_firmware_boot(cpu, info, ms->firmware); + } else { + arm_setup_direct_kernel_boot(cpu, info); + } +diff --git a/hw/arm/virt.c b/hw/arm/virt.c +index 8823f2ed1c..6ffb26e7e6 100644 +--- a/hw/arm/virt.c ++++ b/hw/arm/virt.c +@@ -1398,6 +1398,9 @@ static void virt_flash_map1(PFlashCFI01 *flash, + qdev_prop_set_uint32(dev, "num-blocks", size / VIRT_FLASH_SECTOR_SIZE); + sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal); + ++ if (virtcca_cvm_enabled()) { ++ return; ++ } + memory_region_add_subregion(sysmem, base, + sysbus_mmio_get_region(SYS_BUS_DEVICE(dev), + 0)); +@@ -1433,6 +1436,10 @@ static void virt_flash_fdt(VirtMachineState *vms, + MachineState *ms = MACHINE(vms); + char *nodename; + ++ if (virtcca_cvm_enabled()) { ++ return; ++ } ++ + if (sysmem == secure_sysmem) { + /* Report both flash devices as a single node in the DT */ + nodename = g_strdup_printf("/flash@%" PRIx64, flashbase); +@@ -1468,6 +1475,23 @@ static void virt_flash_fdt(VirtMachineState *vms, + } + } + ++static bool 
virt_confidential_firmware_init(VirtMachineState *vms, ++ MemoryRegion *sysmem) ++{ ++ MemoryRegion *fw_ram; ++ hwaddr fw_base = vms->memmap[VIRT_FLASH].base; ++ hwaddr fw_size = vms->memmap[VIRT_FLASH].size; ++ ++ if (!MACHINE(vms)->firmware) { ++ return false; ++ } ++ ++ fw_ram = g_new(MemoryRegion, 1); ++ memory_region_init_ram(fw_ram, NULL, "fw_ram", fw_size, NULL); ++ memory_region_add_subregion(sysmem, fw_base, fw_ram); ++ return true; ++} ++ + static bool virt_firmware_init(VirtMachineState *vms, + MemoryRegion *sysmem, + MemoryRegion *secure_sysmem) +@@ -1486,6 +1510,10 @@ static bool virt_firmware_init(VirtMachineState *vms, + + pflash_blk0 = pflash_cfi01_get_blk(vms->flash[0]); + ++ if (virtcca_cvm_enabled()) { ++ return virt_confidential_firmware_init(vms, sysmem); ++ } ++ + bios_name = MACHINE(vms)->firmware; + if (bios_name) { + char *fname; +@@ -2023,7 +2051,7 @@ static void virt_set_memmap(VirtMachineState *vms, int pa_bits) + vms->memmap[VIRT_PCIE_MMIO] = (MemMapEntry) { 0x10000000, 0x2edf0000 }; + vms->memmap[VIRT_KAE_DEVICE] = (MemMapEntry) { 0x3edf0000, 0x00200000 }; + +- vms->memmap[VIRT_MEM].base = 3 * GiB; ++ vms->memmap[VIRT_MEM].base = 1 * GiB; + vms->memmap[VIRT_MEM].size = ms->ram_size; + info_report("[qemu] fix VIRT_MEM range 0x%llx - 0x%llx\n", (unsigned long long)(vms->memmap[VIRT_MEM].base), + (unsigned long long)(vms->memmap[VIRT_MEM].base + ms->ram_size)); +@@ -2822,6 +2850,9 @@ static void machvirt_init(MachineState *machine) + vms->bootinfo.get_dtb = machvirt_dtb; + vms->bootinfo.skip_dtb_autoload = true; + vms->bootinfo.firmware_loaded = firmware_loaded; ++ vms->bootinfo.firmware_base = vms->memmap[VIRT_FLASH].base; ++ vms->bootinfo.firmware_max_size = vms->memmap[VIRT_FLASH].size; ++ vms->bootinfo.confidential = virtcca_cvm_enabled(); + vms->bootinfo.psci_conduit = vms->psci_conduit; + arm_load_kernel(ARM_CPU(first_cpu), machine, &vms->bootinfo); + +diff --git a/include/hw/arm/boot.h b/include/hw/arm/boot.h +index 
4491b1f85b..06ca1d90b2 100644 +--- a/include/hw/arm/boot.h ++++ b/include/hw/arm/boot.h +@@ -133,6 +133,9 @@ struct arm_boot_info { + bool secure_board_setup; + + arm_endianness endianness; ++ hwaddr firmware_base; ++ hwaddr firmware_max_size; ++ bool confidential; + }; + + /** +-- +2.41.0.windows.1 + diff --git a/arm-VirtCCA-Compatibility-with-older-versions-of-TMM.patch b/arm-VirtCCA-Compatibility-with-older-versions-of-TMM.patch new file mode 100644 index 0000000000000000000000000000000000000000..6141b8399a91cd3233d3ecc7bea3316756060268 --- /dev/null +++ b/arm-VirtCCA-Compatibility-with-older-versions-of-TMM.patch @@ -0,0 +1,117 @@ +From 5ed17a43a4cc7fc76397d6d8cad8246063b5b2f3 Mon Sep 17 00:00:00 2001 +From: gongchangsui +Date: Mon, 17 Mar 2025 02:43:55 -0400 +Subject: [PATCH] arm: VirtCCA: Compatibility with older versions of TMM and + the kernel + +Since the base memory address of Confidential VMs in QEMU was changed +from 3GB to 1GB, corresponding adjustments are required in both the TMM +and kernel components. To maintain backward compatibility, the following +modifications were implemented: + 1. **TMM Versioning**: The TMM version number was incremented to +reflect the update. + 2. **Kernel Interface**: A new interface was exposed in the kernel +to retrieve the TMM version number. + 3. **QEMU Compatibility Logic**: During initialization, QEMU checks +the TMM version via the kernel interface. If the TMM version is **<2.1** (legacy), +QEMU sets the Confidential VM's base memory address to **3GB**. For TMM versions +**2.1** and above (updated), the address is configured to **1GB** to align with the new memory layout. +This approach ensures seamless backward compatibility while transitioning +to the revised memory addressing scheme. 
+ +Signed-off-by: gongchangsui +--- + accel/kvm/kvm-all.c | 3 +-- + hw/arm/boot.c | 9 +++++++++ + hw/arm/virt.c | 9 +++++++-- + linux-headers/asm-arm64/kvm.h | 2 ++ + linux-headers/linux/kvm.h | 3 +++ + 5 files changed, 22 insertions(+), 4 deletions(-) + +diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c +index a8e29f148e..38a48cc031 100644 +--- a/accel/kvm/kvm-all.c ++++ b/accel/kvm/kvm-all.c +@@ -2390,6 +2390,7 @@ static int kvm_init(MachineState *ms) + qemu_mutex_init(&kml_slots_lock); + + s = KVM_STATE(ms->accelerator); ++ kvm_state = s; + + /* + * On systems where the kernel can support different base page +@@ -2609,8 +2610,6 @@ static int kvm_init(MachineState *ms) + #endif + } + +- kvm_state = s; +- + ret = kvm_arch_init(ms, s); + if (ret < 0) { + goto err; +diff --git a/hw/arm/boot.c b/hw/arm/boot.c +index 6b2f46af4d..ca9f69fd3d 100644 +--- a/hw/arm/boot.c ++++ b/hw/arm/boot.c +@@ -1162,6 +1162,15 @@ static void arm_setup_confidential_firmware_boot(ARMCPU *cpu, + struct arm_boot_info *info, + const char *firmware_filename) + { ++ uint64_t tmi_version = 0; ++ if (kvm_ioctl(kvm_state, KVM_GET_TMI_VERSION, &tmi_version) < 0) { ++ error_report("please check the kernel version!"); ++ exit(EXIT_FAILURE); ++ } ++ if (tmi_version < MIN_TMI_VERSION_FOR_UEFI_BOOTED_CVM) { ++ error_report("please check the tmi version!"); ++ exit(EXIT_FAILURE); ++ } + ssize_t fw_size; + const char *fname; + AddressSpace *as = arm_boot_address_space(cpu, info); +diff --git a/hw/arm/virt.c b/hw/arm/virt.c +index 6ffb26e7e6..39dfec0877 100644 +--- a/hw/arm/virt.c ++++ b/hw/arm/virt.c +@@ -2050,8 +2050,13 @@ static void virt_set_memmap(VirtMachineState *vms, int pa_bits) + /* support kae vf device tree nodes */ + vms->memmap[VIRT_PCIE_MMIO] = (MemMapEntry) { 0x10000000, 0x2edf0000 }; + vms->memmap[VIRT_KAE_DEVICE] = (MemMapEntry) { 0x3edf0000, 0x00200000 }; +- +- vms->memmap[VIRT_MEM].base = 1 * GiB; ++ uint64_t tmi_version = 0; ++ if (kvm_ioctl(kvm_state, KVM_GET_TMI_VERSION, 
&tmi_version) < 0) { ++ warn_report("can not get tmi version"); ++ } ++ if (tmi_version < MIN_TMI_VERSION_FOR_UEFI_BOOTED_CVM) { ++ vms->memmap[VIRT_MEM].base = 3 * GiB; ++ } + vms->memmap[VIRT_MEM].size = ms->ram_size; + info_report("[qemu] fix VIRT_MEM range 0x%llx - 0x%llx\n", (unsigned long long)(vms->memmap[VIRT_MEM].base), + (unsigned long long)(vms->memmap[VIRT_MEM].base + ms->ram_size)); +diff --git a/linux-headers/asm-arm64/kvm.h b/linux-headers/asm-arm64/kvm.h +index 552fdcb18f..d69a71cbec 100644 +--- a/linux-headers/asm-arm64/kvm.h ++++ b/linux-headers/asm-arm64/kvm.h +@@ -597,4 +597,6 @@ struct kvm_cap_arm_tmm_populate_region_args { + + #endif + ++#define MIN_TMI_VERSION_FOR_UEFI_BOOTED_CVM 0x20001 ++ + #endif /* __ARM_KVM_H__ */ +diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h +index 84cec64b88..7a08f9b1e9 100644 +--- a/linux-headers/linux/kvm.h ++++ b/linux-headers/linux/kvm.h +@@ -2422,4 +2422,7 @@ struct kvm_s390_zpci_op { + /* flags for kvm_s390_zpci_op->u.reg_aen.flags */ + #define KVM_S390_ZPCIOP_REGAEN_HOST (1 << 0) + ++/* get tmi version */ ++#define KVM_GET_TMI_VERSION _IOR(KVMIO, 0xd2, uint64_t) ++ + #endif /* __LINUX_KVM_H */ +-- +2.41.0.windows.1 + diff --git a/arm-VirtCCA-qemu-CoDA-support-UEFI-boot.patch b/arm-VirtCCA-qemu-CoDA-support-UEFI-boot.patch new file mode 100644 index 0000000000000000000000000000000000000000..9d412930f90820a89c7a2b0a73744aa5882b94b2 --- /dev/null +++ b/arm-VirtCCA-qemu-CoDA-support-UEFI-boot.patch @@ -0,0 +1,137 @@ +From 0119389040e4d78c6238875b812827d4f07b5f0f Mon Sep 17 00:00:00 2001 +From: gongchangsui +Date: Mon, 17 Mar 2025 02:51:16 -0400 +Subject: [PATCH] arm: VirtCCA: qemu CoDA support UEFI boot + +1. Expose PCIe MMIO region from QEMU memory map. +2. Refactor struct kvm_user_data: data_start and data_size represent +the address base and size of the MMIO in UEFI boot mode; data_start +and data_size represent the address base and size of the DTB in direct boot mode. 
+ +Signed-off-by: gongchangsui +--- + accel/kvm/kvm-all.c | 8 ++++---- + hw/arm/boot.c | 10 ++++++---- + hw/arm/virt.c | 6 ++++++ + linux-headers/linux/kvm.h | 12 +++++++++--- + target/arm/kvm_arm.h | 2 ++ + 5 files changed, 27 insertions(+), 11 deletions(-) + +diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c +index 38a48cc031..57c6718b77 100644 +--- a/accel/kvm/kvm-all.c ++++ b/accel/kvm/kvm-all.c +@@ -3527,7 +3527,7 @@ int kvm_get_one_reg(CPUState *cs, uint64_t id, void *target) + return r; + } + +-int kvm_load_user_data(hwaddr loader_start, hwaddr image_end, hwaddr initrd_start, hwaddr dtb_end, hwaddr ram_size, ++int kvm_load_user_data(hwaddr loader_start, hwaddr dtb_info, hwaddr data_start, hwaddr data_size, hwaddr ram_size, + struct kvm_numa_info *numa_info) + { + KVMState *state = kvm_state; +@@ -3535,9 +3535,9 @@ int kvm_load_user_data(hwaddr loader_start, hwaddr image_end, hwaddr initrd_star + int ret; + + data.loader_start = loader_start; +- data.image_end = image_end; +- data.initrd_start = initrd_start; +- data.dtb_end = dtb_end; ++ data.dtb_info = dtb_info; ++ data.data_start = data_start; ++ data.data_size = data_size; + data.ram_size = ram_size; + memcpy(&data.numa_info, numa_info, sizeof(struct kvm_numa_info)); + +diff --git a/hw/arm/boot.c b/hw/arm/boot.c +index ca9f69fd3d..a3e0dbb68c 100644 +--- a/hw/arm/boot.c ++++ b/hw/arm/boot.c +@@ -1149,10 +1149,10 @@ static void arm_setup_direct_kernel_boot(ARMCPU *cpu, + + if (kvm_enabled() && virtcca_cvm_enabled()) { + if (info->dtb_limit == 0) { +- info->dtb_limit = info->dtb_start + 0x200000; ++ info->dtb_limit = info->dtb_start + DTB_MAX; + } +- kvm_load_user_data(info->loader_start, image_high_addr, info->initrd_start, +- info->dtb_limit, info->ram_size, (struct kvm_numa_info *)info->numa_info); ++ kvm_load_user_data(info->loader_start, 0x1, info->dtb_start, ++ info->dtb_limit - info->dtb_start, info->ram_size, (struct kvm_numa_info *)info->numa_info); + tmm_add_ram_region(info->loader_start, 
image_high_addr - info->loader_start, + info->initrd_start, info->dtb_limit - info->initrd_start, true); + } +@@ -1193,6 +1193,7 @@ static void arm_setup_confidential_firmware_boot(ARMCPU *cpu, + + static void arm_setup_firmware_boot(ARMCPU *cpu, struct arm_boot_info *info, const char *firmware_filename) + { ++ hwaddr mmio_start, mmio_size; + /* Set up for booting firmware (which might load a kernel via fw_cfg) */ + + if (have_dtb(info)) { +@@ -1246,7 +1247,8 @@ static void arm_setup_firmware_boot(ARMCPU *cpu, struct arm_boot_info *info, con + + if (info->confidential) { + arm_setup_confidential_firmware_boot(cpu, info, firmware_filename); +- kvm_load_user_data(UEFI_LOADER_START, UEFI_MAX_SIZE, info->loader_start, info->loader_start + DTB_MAX, info->ram_size, ++ virtcca_kvm_get_mmio_addr(&mmio_start, &mmio_size); ++ kvm_load_user_data(info->loader_start, DTB_MAX, mmio_start, mmio_size, info->ram_size, + (struct kvm_numa_info *)info->numa_info); + } + /* +diff --git a/hw/arm/virt.c b/hw/arm/virt.c +index 39dfec0877..6c5611826c 100644 +--- a/hw/arm/virt.c ++++ b/hw/arm/virt.c +@@ -176,6 +176,12 @@ static const MemMapEntry base_memmap[] = { + [VIRT_MEM] = { GiB, LEGACY_RAMLIMIT_BYTES }, + }; + ++void virtcca_kvm_get_mmio_addr(hwaddr *mmio_start, hwaddr *mmio_size) ++{ ++ *mmio_start = base_memmap[VIRT_PCIE_MMIO].base; ++ *mmio_size = base_memmap[VIRT_PCIE_MMIO].size; ++} ++ + /* + * Highmem IO Regions: This memory map is floating, located after the RAM. 
+ * Each MemMapEntry base (GPA) will be dynamically computed, depending on the +diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h +index 7a08f9b1e9..c9ec7f862a 100644 +--- a/linux-headers/linux/kvm.h ++++ b/linux-headers/linux/kvm.h +@@ -1510,9 +1510,15 @@ struct kvm_numa_info { + + struct kvm_user_data { + __u64 loader_start; +- __u64 image_end; +- __u64 initrd_start; +- __u64 dtb_end; ++ /* ++ * When the lowest bit of dtb_info is 0, the value of dtb_info represents the size of the DTB, ++ * and data_start and data_size represent the address base and size of the MMIO. ++ * When the lowest bit of dtb_info is 1, data_start and data_size represent the address base ++ * and size of the DTB. ++ */ ++ __u64 dtb_info; ++ __u64 data_start; ++ __u64 data_size; + __u64 ram_size; + struct kvm_numa_info numa_info; + }; +diff --git a/target/arm/kvm_arm.h b/target/arm/kvm_arm.h +index 31457a57f7..62fbb713f4 100644 +--- a/target/arm/kvm_arm.h ++++ b/target/arm/kvm_arm.h +@@ -73,6 +73,8 @@ int kvm_arm_vcpu_finalize(CPUState *cs, int feature); + void kvm_arm_register_device(MemoryRegion *mr, uint64_t devid, uint64_t group, + uint64_t attr, int dev_fd, uint64_t addr_ormask); + ++void virtcca_kvm_get_mmio_addr(hwaddr *mmio_start, hwaddr *mmio_size); ++ + /** + * kvm_arm_init_cpreg_list: + * @cpu: ARMCPU +-- +2.41.0.windows.1 + diff --git a/arm-VirtCCA-qemu-uefi-boot-support-kae.patch b/arm-VirtCCA-qemu-uefi-boot-support-kae.patch new file mode 100644 index 0000000000000000000000000000000000000000..399b2dfe7003c3ef179eb4f1e345a0103482cd76 --- /dev/null +++ b/arm-VirtCCA-qemu-uefi-boot-support-kae.patch @@ -0,0 +1,100 @@ +From 5bffeb311c969a0e05106e4bf54282431c5ba907 Mon Sep 17 00:00:00 2001 +From: gongchangsui +Date: Mon, 17 Mar 2025 02:42:43 -0400 +Subject: [PATCH] arm: VirtCCA: qemu uefi boot support kae + +This commit introduces modifications to enable KAE functionality +during UEFI boot in cVMs. Additionally,the ACPI feature must be +configured in cVM. 
+ +Signed-off-by: gongchangsui +--- + hw/arm/virt-acpi-build.c | 58 ++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 58 insertions(+) + +diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c +index 076781423b..f78331d69f 100644 +--- a/hw/arm/virt-acpi-build.c ++++ b/hw/arm/virt-acpi-build.c +@@ -58,6 +58,7 @@ + #include "migration/vmstate.h" + #include "hw/acpi/ghes.h" + #include "hw/acpi/viot.h" ++#include "kvm_arm.h" + + #define ARM_SPI_BASE 32 + +@@ -405,6 +406,54 @@ static void acpi_dsdt_add_virtio(Aml *scope, + } + } + ++static void acpi_dsdt_add_hisi_sec(Aml *scope, ++ const MemMapEntry *virtio_mmio_memmap, ++ int dev_id) ++{ ++ hwaddr size = 0x10000; ++ ++ /* ++ * Calculate the base address for the sec device node. ++ * Each device group contains one sec device and one hpre device,spaced by 2 * size. ++ */ ++ hwaddr base = virtio_mmio_memmap->base + dev_id * 2 * size; ++ ++ Aml *dev = aml_device("SE%02u", dev_id); ++ aml_append(dev, aml_name_decl("_HID", aml_string("SEC07"))); ++ aml_append(dev, aml_name_decl("_UID", aml_int(dev_id))); ++ aml_append(dev, aml_name_decl("_CCA", aml_int(1))); ++ ++ Aml *crs = aml_resource_template(); ++ ++ aml_append(crs, aml_memory32_fixed(base, size, AML_READ_WRITE)); ++ aml_append(dev, aml_name_decl("_CRS", crs)); ++ aml_append(scope, dev); ++} ++ ++static void acpi_dsdt_add_hisi_hpre(Aml *scope, ++ const MemMapEntry *virtio_mmio_memmap, ++ int dev_id) ++{ ++ hwaddr size = 0x10000; ++ ++ /* ++ * Calculate the base address for the hpre device node. ++ * Each hpre device follows the corresponding sec device by an additional offset of size. 
++ */ ++ hwaddr base = virtio_mmio_memmap->base + dev_id * 2 * size + size; ++ ++ Aml *dev = aml_device("HP%02u", dev_id); ++ aml_append(dev, aml_name_decl("_HID", aml_string("HPRE07"))); ++ aml_append(dev, aml_name_decl("_UID", aml_int(dev_id))); ++ aml_append(dev, aml_name_decl("_CCA", aml_int(1))); ++ ++ Aml *crs = aml_resource_template(); ++ ++ aml_append(crs, aml_memory32_fixed(base, size, AML_READ_WRITE)); ++ aml_append(dev, aml_name_decl("_CRS", crs)); ++ aml_append(scope, dev); ++} ++ + static void acpi_dsdt_add_pci(Aml *scope, const MemMapEntry *memmap, + uint32_t irq, VirtMachineState *vms) + { +@@ -1201,6 +1250,15 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) + acpi_dsdt_add_virtio(scope, &memmap[VIRT_MMIO], + (irqmap[VIRT_MMIO] + ARM_SPI_BASE), NUM_VIRTIO_TRANSPORTS); + acpi_dsdt_add_pci(scope, memmap, irqmap[VIRT_PCIE] + ARM_SPI_BASE, vms); ++ ++ if (virtcca_cvm_enabled()) { ++ int kae_num = tmm_get_kae_num(); ++ for (int i = 0; i < kae_num; i++) { ++ acpi_dsdt_add_hisi_sec(scope, &memmap[VIRT_KAE_DEVICE], i); ++ acpi_dsdt_add_hisi_hpre(scope, &memmap[VIRT_KAE_DEVICE], i); ++ } ++ } ++ + if (vms->acpi_dev) { + build_ged_aml(scope, "\\_SB."GED_DEVICE, + HOTPLUG_HANDLER(vms->acpi_dev), +-- +2.41.0.windows.1 + diff --git a/backend-iommufd-Report-PASID-capability.patch b/backend-iommufd-Report-PASID-capability.patch new file mode 100644 index 0000000000000000000000000000000000000000..6c38cf0602338aa3411e2423168552dbb43d494d --- /dev/null +++ b/backend-iommufd-Report-PASID-capability.patch @@ -0,0 +1,150 @@ +From 0978556247d968ffc83beff3b2611c93fd9b6b13 Mon Sep 17 00:00:00 2001 +From: Yi Liu +Date: Thu, 12 Sep 2024 00:17:31 -0700 +Subject: [PATCH] backend/iommufd: Report PASID capability + +Signed-off-by: Yi Liu +--- + backends/iommufd.c | 4 +++- + hw/arm/smmu-common.c | 4 ++-- + hw/arm/smmuv3.c | 4 +++- + hw/vfio/iommufd.c | 4 +++- + include/hw/arm/smmu-common.h | 2 +- + include/sysemu/host_iommu_device.h | 1 + + 
include/sysemu/iommufd.h | 3 ++- + 7 files changed, 15 insertions(+), 7 deletions(-) + +diff --git a/backends/iommufd.c b/backends/iommufd.c +index e9ce82297b..4f5df63331 100644 +--- a/backends/iommufd.c ++++ b/backends/iommufd.c +@@ -326,7 +326,8 @@ bool iommufd_backend_get_dirty_bitmap(IOMMUFDBackend *be, + + bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, + uint32_t *type, void *data, uint32_t len, +- uint64_t *caps, Error **errp) ++ uint64_t *caps, uint8_t *max_pasid_log2, ++ Error **errp) + { + struct iommu_hw_info info = { + .size = sizeof(info), +@@ -344,6 +345,7 @@ bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, + *type = info.out_data_type; + g_assert(caps); + *caps = info.out_capabilities; ++ *max_pasid_log2 = info.out_max_pasid_log2; + + return true; + } +diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c +index c382fa16e5..e7028bd4ec 100644 +--- a/hw/arm/smmu-common.c ++++ b/hw/arm/smmu-common.c +@@ -853,7 +853,7 @@ SMMUDevice *smmu_find_sdev(SMMUState *s, uint32_t sid) + + /* IOMMUFD helpers */ + int smmu_dev_get_info(SMMUDevice *sdev, uint32_t *data_type, +- uint32_t data_len, void *data) ++ uint32_t data_len, uint8_t *pasid, void *data) + { + uint64_t caps; + +@@ -863,7 +863,7 @@ int smmu_dev_get_info(SMMUDevice *sdev, uint32_t *data_type, + + return !iommufd_backend_get_device_info(sdev->idev->iommufd, + sdev->idev->devid, data_type, data, +- data_len, &caps, NULL); ++ data_len, &caps, pasid, NULL); + } + + void smmu_dev_uninstall_nested_ste(SMMUDevice *sdev, bool abort) +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index 30c0ae4c3b..0ca0e96fcc 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -264,6 +264,7 @@ static void smmuv3_nested_init_regs(SMMUv3State *s) + SMMUDevice *sdev; + uint32_t data_type; + uint32_t val; ++ uint8_t pasid; + int ret; + + if (!bs->nested || !bs->viommu) { +@@ -280,7 +281,8 @@ static void smmuv3_nested_init_regs(SMMUv3State *s) + goto out; + } + +- ret = 
smmu_dev_get_info(sdev, &data_type, sizeof(sdev->info), &sdev->info); ++ ret = smmu_dev_get_info(sdev, &data_type, sizeof(sdev->info), &pasid, ++ &sdev->info); + if (ret) { + error_report("failed to get SMMU device info"); + return; +diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c +index c0eb87c78c..a108beda29 100644 +--- a/hw/vfio/iommufd.c ++++ b/hw/vfio/iommufd.c +@@ -871,18 +871,20 @@ static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, + struct iommu_hw_info_vtd vtd; + } data; + uint64_t hw_caps; ++ uint8_t pasids; + + hiod->agent = opaque; + + if (!iommufd_backend_get_device_info(vdev->iommufd, vdev->devid, + &type, &data, sizeof(data), +- &hw_caps, errp)) { ++ &hw_caps, &pasids, errp)) { + return false; + } + + hiod->name = g_strdup(vdev->name); + caps->type = type; + caps->hw_caps = hw_caps; ++ caps->max_pasid_log2 = pasids; + + return true; + } +diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h +index 087a11efc7..8ae33c3753 100644 +--- a/include/hw/arm/smmu-common.h ++++ b/include/hw/arm/smmu-common.h +@@ -276,7 +276,7 @@ void smmu_inv_notifiers_all(SMMUState *s); + + /* IOMMUFD helpers */ + int smmu_dev_get_info(SMMUDevice *sdev, uint32_t *data_type, +- uint32_t data_len, void *data); ++ uint32_t data_len, uint8_t *pasid, void *data); + void smmu_dev_uninstall_nested_ste(SMMUDevice *sdev, bool abort); + int smmu_dev_install_nested_ste(SMMUDevice *sdev, uint32_t data_type, + uint32_t data_len, void *data, +diff --git a/include/sysemu/host_iommu_device.h b/include/sysemu/host_iommu_device.h +index 84131f5495..22c76a37a7 100644 +--- a/include/sysemu/host_iommu_device.h ++++ b/include/sysemu/host_iommu_device.h +@@ -26,6 +26,7 @@ + typedef struct HostIOMMUDeviceCaps { + uint32_t type; + uint64_t hw_caps; ++ uint8_t max_pasid_log2; + } HostIOMMUDeviceCaps; + + #define TYPE_HOST_IOMMU_DEVICE "host-iommu-device" +diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h +index b279184974..29afaa429d 100644 
+--- a/include/sysemu/iommufd.h ++++ b/include/sysemu/iommufd.h +@@ -57,7 +57,8 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, + hwaddr iova, ram_addr_t size); + bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, + uint32_t *type, void *data, uint32_t len, +- uint64_t *caps, Error **errp); ++ uint64_t *caps, uint8_t *max_pasid_log2, ++ Error **errp); + bool iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id, + uint32_t pt_id, uint32_t flags, + uint32_t data_type, uint32_t data_len, +-- +2.41.0.windows.1 + diff --git a/backends-Introduce-HostIOMMUDevice-abstract.patch b/backends-Introduce-HostIOMMUDevice-abstract.patch new file mode 100644 index 0000000000000000000000000000000000000000..42a92c8d751528b5e117eac781e7ca17f58e2280 --- /dev/null +++ b/backends-Introduce-HostIOMMUDevice-abstract.patch @@ -0,0 +1,162 @@ +From 626698a1e9edff6a1032f496858555e1a4614fbe Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Wed, 5 Jun 2024 16:30:27 +0800 +Subject: [PATCH] backends: Introduce HostIOMMUDevice abstract +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +A HostIOMMUDevice is an abstraction for an assigned device that is protected +by a physical IOMMU (aka host IOMMU). The userspace interaction with this +physical IOMMU can be done either through the VFIO IOMMU type 1 legacy +backend or the new iommufd backend. The assigned device can be a VFIO device +or a VDPA device. The HostIOMMUDevice is needed to interact with the host +IOMMU that protects the assigned device. It is especially useful when the +device is also protected by a virtual IOMMU as this latter use the translation +services of the physical IOMMU and is constrained by it. In that context the +HostIOMMUDevice can be passed to the virtual IOMMU to collect physical IOMMU +capabilities such as the supported address width. 
In the future, the virtual +IOMMU will use the HostIOMMUDevice to program the guest page tables in the +first translation stage of the physical IOMMU. + +Introduce .realize() to initialize HostIOMMUDevice further after instance init. + +Suggested-by: Cédric Le Goater +Signed-off-by: Zhenzhong Duan +Reviewed-by: Eric Auger +Reviewed-by: Michael S. Tsirkin +--- + MAINTAINERS | 2 ++ + backends/host_iommu_device.c | 33 +++++++++++++++++++ + backends/meson.build | 1 + + include/sysemu/host_iommu_device.h | 53 ++++++++++++++++++++++++++++++ + 4 files changed, 89 insertions(+) + create mode 100644 backends/host_iommu_device.c + create mode 100644 include/sysemu/host_iommu_device.h + +diff --git a/MAINTAINERS b/MAINTAINERS +index 0ddb20a35f..ada87bfa9e 100644 +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -2174,6 +2174,8 @@ M: Zhenzhong Duan + S: Supported + F: backends/iommufd.c + F: include/sysemu/iommufd.h ++F: backends/host_iommu_device.c ++F: include/sysemu/host_iommu_device.h + F: include/qemu/chardev_open.h + F: util/chardev_open.c + F: docs/devel/vfio-iommufd.rst +diff --git a/backends/host_iommu_device.c b/backends/host_iommu_device.c +new file mode 100644 +index 0000000000..8f2dda1beb +--- /dev/null ++++ b/backends/host_iommu_device.c +@@ -0,0 +1,33 @@ ++/* ++ * Host IOMMU device abstract ++ * ++ * Copyright (C) 2024 Intel Corporation. ++ * ++ * Authors: Zhenzhong Duan ++ * ++ * This work is licensed under the terms of the GNU GPL, version 2. See ++ * the COPYING file in the top-level directory. 
++ */ ++ ++#include "qemu/osdep.h" ++#include "sysemu/host_iommu_device.h" ++ ++OBJECT_DEFINE_ABSTRACT_TYPE(HostIOMMUDevice, ++ host_iommu_device, ++ HOST_IOMMU_DEVICE, ++ OBJECT) ++ ++static void host_iommu_device_class_init(ObjectClass *oc, void *data) ++{ ++} ++ ++static void host_iommu_device_init(Object *obj) ++{ ++} ++ ++static void host_iommu_device_finalize(Object *obj) ++{ ++ HostIOMMUDevice *hiod = HOST_IOMMU_DEVICE(obj); ++ ++ g_free(hiod->name); ++} +diff --git a/backends/meson.build b/backends/meson.build +index 9a5cea480d..68b5e34e04 100644 +--- a/backends/meson.build ++++ b/backends/meson.build +@@ -13,6 +13,7 @@ system_ss.add([files( + system_ss.add(when: 'CONFIG_POSIX', if_true: files('rng-random.c')) + system_ss.add(when: 'CONFIG_POSIX', if_true: files('hostmem-file.c')) + system_ss.add(when: 'CONFIG_LINUX', if_true: files('hostmem-memfd.c')) ++system_ss.add(when: 'CONFIG_LINUX', if_true: files('host_iommu_device.c')) + if keyutils.found() + system_ss.add(keyutils, files('cryptodev-lkcf.c')) + endif +diff --git a/include/sysemu/host_iommu_device.h b/include/sysemu/host_iommu_device.h +new file mode 100644 +index 0000000000..db47a16189 +--- /dev/null ++++ b/include/sysemu/host_iommu_device.h +@@ -0,0 +1,53 @@ ++/* ++ * Host IOMMU device abstract declaration ++ * ++ * Copyright (C) 2024 Intel Corporation. ++ * ++ * Authors: Zhenzhong Duan ++ * ++ * This work is licensed under the terms of the GNU GPL, version 2. See ++ * the COPYING file in the top-level directory. ++ */ ++ ++#ifndef HOST_IOMMU_DEVICE_H ++#define HOST_IOMMU_DEVICE_H ++ ++#include "qom/object.h" ++#include "qapi/error.h" ++ ++#define TYPE_HOST_IOMMU_DEVICE "host-iommu-device" ++OBJECT_DECLARE_TYPE(HostIOMMUDevice, HostIOMMUDeviceClass, HOST_IOMMU_DEVICE) ++ ++struct HostIOMMUDevice { ++ Object parent_obj; ++ ++ char *name; ++}; ++ ++/** ++ * struct HostIOMMUDeviceClass - The base class for all host IOMMU devices. 
++ * ++ * Different types of host devices (e.g., VFIO or VDPA device) or devices ++ * with different backend (e.g., VFIO legacy container or IOMMUFD backend) ++ * will have different implementations of the HostIOMMUDeviceClass. ++ */ ++struct HostIOMMUDeviceClass { ++ ObjectClass parent_class; ++ ++ /** ++ * @realize: initialize host IOMMU device instance further. ++ * ++ * Mandatory callback. ++ * ++ * @hiod: pointer to a host IOMMU device instance. ++ * ++ * @opaque: pointer to agent device of this host IOMMU device, ++ * e.g., VFIO base device or VDPA device. ++ * ++ * @errp: pass an Error out when realize fails. ++ * ++ * Returns: true on success, false on failure. ++ */ ++ bool (*realize)(HostIOMMUDevice *hiod, void *opaque, Error **errp); ++}; ++#endif +-- +2.41.0.windows.1 + diff --git a/backends-VirtCCA-cvm_gpa_start-supports-both-1GB-and.patch b/backends-VirtCCA-cvm_gpa_start-supports-both-1GB-and.patch new file mode 100644 index 0000000000000000000000000000000000000000..e7066d9acd899b053190e4e5447e7d8dd3e09e09 --- /dev/null +++ b/backends-VirtCCA-cvm_gpa_start-supports-both-1GB-and.patch @@ -0,0 +1,113 @@ +From bc08940ad3c75da49e05c596f79e9e0164573709 Mon Sep 17 00:00:00 2001 +From: gongchangsui +Date: Mon, 17 Mar 2025 02:56:40 -0400 +Subject: [PATCH] backends: VirtCCA: cvm_gpa_start supports both 1GB and 3GB + +For TMM versions 2.1 and above, `cvm_gpa_start` is 1GB, while for +versions prior to 2.1, `cvm_gpa_start` is 3GB. Shared huge page memory +supports both `cvm_gpa_start` values. 
+ +Signed-off-by: gongchangsui +--- + backends/hostmem-file.c | 17 ++++++++++++++--- + hw/arm/virt.c | 1 + + hw/core/numa.c | 2 +- + include/exec/memory.h | 11 +++++++---- + 4 files changed, 23 insertions(+), 8 deletions(-) + +diff --git a/backends/hostmem-file.c b/backends/hostmem-file.c +index 891fe4ac4a..ce63a372a3 100644 +--- a/backends/hostmem-file.c ++++ b/backends/hostmem-file.c +@@ -27,6 +27,7 @@ OBJECT_DECLARE_SIMPLE_TYPE(HostMemoryBackendFile, MEMORY_BACKEND_FILE) + + bool virtcca_shared_hugepage_mapped = false; + uint64_t virtcca_cvm_ram_size = 0; ++uint64_t virtcca_cvm_gpa_start = 0; + + struct HostMemoryBackendFile { + HostMemoryBackend parent_obj; +@@ -101,8 +102,16 @@ virtcca_shared_backend_memory_alloc(char *mem_path, uint32_t ram_flags, Error ** + error_report("parse virtcca share memory path failed"); + exit(1); + } +- if (virtcca_cvm_ram_size >= VIRTCCA_SHARED_HUGEPAGE_MAX_SIZE) { +- size = VIRTCCA_SHARED_HUGEPAGE_MAX_SIZE; ++ ++ /* ++ * 1) CVM_GPA_START = 3GB --> fix size = 1GB ++ * 2) CVM_GPA_START = 1GB && ram_size >= 3GB --> size = 3GB ++ * 3) CVM_GPA_START = 1GB && ram_size < 3GB --> size = ram_size ++ */ ++ if (virtcca_cvm_gpa_start != DEFAULT_VM_GPA_START) { ++ size = VIRTCCA_SHARED_HUGEPAGE_ADDR_LIMIT - virtcca_cvm_gpa_start; ++ } else if (virtcca_cvm_ram_size >= VIRTCCA_SHARED_HUGEPAGE_ADDR_LIMIT - DEFAULT_VM_GPA_START) { ++ size = VIRTCCA_SHARED_HUGEPAGE_ADDR_LIMIT - DEFAULT_VM_GPA_START; + } + + virtcca_shared_hugepage = g_new(MemoryRegion, 1); +@@ -172,7 +181,9 @@ file_backend_memory_alloc(HostMemoryBackend *backend, Error **errp) + fb->mem_path, fb->offset, errp); + g_free(name); + +- if (virtcca_cvm_enabled() && backend->share && !virtcca_shared_hugepage_mapped) { ++ if (virtcca_cvm_enabled() && backend->share && ++ (strcmp(fb->mem_path, "/dev/shm") != 0) && ++ !virtcca_shared_hugepage_mapped) { + virtcca_shared_backend_memory_alloc(fb->mem_path, ram_flags, errp); + } + #endif +diff --git a/hw/arm/virt.c b/hw/arm/virt.c +index 
6c5611826c..3c31d3667e 100644 +--- a/hw/arm/virt.c ++++ b/hw/arm/virt.c +@@ -2063,6 +2063,7 @@ static void virt_set_memmap(VirtMachineState *vms, int pa_bits) + if (tmi_version < MIN_TMI_VERSION_FOR_UEFI_BOOTED_CVM) { + vms->memmap[VIRT_MEM].base = 3 * GiB; + } ++ virtcca_cvm_gpa_start = vms->memmap[VIRT_MEM].base; + vms->memmap[VIRT_MEM].size = ms->ram_size; + info_report("[qemu] fix VIRT_MEM range 0x%llx - 0x%llx\n", (unsigned long long)(vms->memmap[VIRT_MEM].base), + (unsigned long long)(vms->memmap[VIRT_MEM].base + ms->ram_size)); +diff --git a/hw/core/numa.c b/hw/core/numa.c +index c691578ef5..98d896e687 100644 +--- a/hw/core/numa.c ++++ b/hw/core/numa.c +@@ -655,7 +655,7 @@ static void virtcca_shared_memory_configuration(MachineState *ms) + memory_region_init_alias(alias_mr, NULL, "alias-mr", virtcca_shared_hugepage, + 0, int128_get64(virtcca_shared_hugepage->size)); + memory_region_add_subregion(address_space_virtcca_shared_memory.root, +- VIRTCCA_GPA_START, alias_mr); ++ virtcca_cvm_gpa_start, alias_mr); + } + + void numa_complete_configuration(MachineState *ms) +diff --git a/include/exec/memory.h b/include/exec/memory.h +index 33778f5c64..c14dc69d27 100644 +--- a/include/exec/memory.h ++++ b/include/exec/memory.h +@@ -243,14 +243,17 @@ typedef struct IOMMUTLBEvent { + /* RAM FD is opened read-only */ + #define RAM_READONLY_FD (1 << 11) + +-/* The GPA range of the VirtCCA bounce buffer is from 1GB to 4GB. */ +-#define VIRTCCA_SHARED_HUGEPAGE_MAX_SIZE 0xc0000000ULL ++/* The address limit of the VirtCCA bounce buffer is 4GB. 
*/ ++#define VIRTCCA_SHARED_HUGEPAGE_ADDR_LIMIT 0x100000000ULL + + /* The VirtCCA shared hugepage memory granularity is 1GB */ + #define VIRTCCA_SHARED_HUGEPAGE_ALIGN 0x40000000ULL + +-/* The GPA starting address of the VirtCCA CVM is 1GB */ +-#define VIRTCCA_GPA_START 0x40000000ULL ++/* The default GPA starting address of VM is 1GB */ ++#define DEFAULT_VM_GPA_START 0x40000000ULL ++ ++/* The GPA starting address of the VirtCCA CVM is 1GB or 3GB */ ++extern uint64_t virtcca_cvm_gpa_start; + + extern uint64_t virtcca_cvm_ram_size; + +-- +2.41.0.windows.1 + diff --git a/backends-cryptodev-Do-not-abort-for-invalid-session-.patch b/backends-cryptodev-Do-not-abort-for-invalid-session-.patch new file mode 100644 index 0000000000000000000000000000000000000000..8d81e171ccdaeb81c5e7be84e8106b4bbb2d8d21 --- /dev/null +++ b/backends-cryptodev-Do-not-abort-for-invalid-session-.patch @@ -0,0 +1,71 @@ +From 29080940b37ce7486a46ab5534383321319fe2c5 Mon Sep 17 00:00:00 2001 +From: gubin +Date: Sat, 22 Mar 2025 15:10:32 +0800 +Subject: [PATCH] backends/cryptodev: Do not abort for invalid session ID +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +cherry-pick from eaf2bd29538d039df80bb4b1584de33a61312bc6 + +Instead of aborting when a session ID is invalid, +return VIRTIO_CRYPTO_INVSESS ("Invalid session id"). 
+ +Reproduced using: + + $ cat << EOF | qemu-system-i386 -display none \ + -machine q35,accel=qtest -m 512M -nodefaults \ + -object cryptodev-backend-builtin,id=cryptodev0 \ + -device virtio-crypto-pci,id=crypto0,cryptodev=cryptodev0 \ + -qtest stdio + outl 0xcf8 0x80000804 + outw 0xcfc 0x06 + outl 0xcf8 0x80000820 + outl 0xcfc 0xe0008000 + write 0x10800e 0x1 0x01 + write 0xe0008016 0x1 0x01 + write 0xe0008020 0x4 0x00801000 + write 0xe0008028 0x4 0x00c01000 + write 0xe000801c 0x1 0x01 + write 0x110000 0x1 0x05 + write 0x110001 0x1 0x04 + write 0x108002 0x1 0x11 + write 0x108008 0x1 0x48 + write 0x10800c 0x1 0x01 + write 0x108018 0x1 0x10 + write 0x10801c 0x1 0x02 + write 0x10c002 0x1 0x01 + write 0xe000b005 0x1 0x00 + EOF + Assertion failed: (session_id < MAX_NUM_SESSIONS && builtin->sessions[session_id]), + function cryptodev_builtin_close_session, file cryptodev-builtin.c, line 430. + +Cc: qemu-stable@nongnu.org +Reported-by: Zheyu Ma +Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2274 +Signed-off-by: Philippe Mathieu-Daudé +Reviewed-by: zhenwei pi +Message-Id: <20240409094757.9127-1-philmd@linaro.org> +Signed-off-by: gubin +--- + backends/cryptodev-builtin.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/backends/cryptodev-builtin.c b/backends/cryptodev-builtin.c +index 0822f198d9..940104ee55 100644 +--- a/backends/cryptodev-builtin.c ++++ b/backends/cryptodev-builtin.c +@@ -428,7 +428,9 @@ static int cryptodev_builtin_close_session( + CRYPTODEV_BACKEND_BUILTIN(backend); + CryptoDevBackendBuiltinSession *session; + +- assert(session_id < MAX_NUM_SESSIONS && builtin->sessions[session_id]); ++ if (session_id >= MAX_NUM_SESSIONS || !builtin->sessions[session_id]) { ++ return -VIRTIO_CRYPTO_INVSESS; ++ } + + session = builtin->sessions[session_id]; + if (session->cipher) { +-- +2.41.0.windows.1 + diff --git a/backends-cryptodev-Do-not-ignore-throttle-backends-E.patch b/backends-cryptodev-Do-not-ignore-throttle-backends-E.patch 
new file mode 100644 index 0000000000000000000000000000000000000000..2d7a4d16eb1910a8fe9dd01359168db8275c7771 --- /dev/null +++ b/backends-cryptodev-Do-not-ignore-throttle-backends-E.patch @@ -0,0 +1,65 @@ +From 690812903469db798ebae012248b9231d5ce9f11 Mon Sep 17 00:00:00 2001 +From: gubin +Date: Sat, 22 Mar 2025 15:15:08 +0800 +Subject: [PATCH] backends/cryptodev: Do not ignore throttle/backends Errors +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +cherry-pick from 484aecf2d3a75251b63481be2a0c3aef635002af + +Both cryptodev_backend_set_throttle() and CryptoDevBackendClass::init() +can set their Error** argument. Do not ignore them, return early +on failure. Without that, running into another failure trips +error_setv()'s assertion. Use the ERRP_GUARD() macro as suggested +in commit ae7c80a7bd ("error: New macro ERRP_GUARD()"). + +Cc: qemu-stable@nongnu.org +Fixes: e7a775fd9f ("cryptodev: Account statistics") +Fixes: 2580b452ff ("cryptodev: support QoS") +Reviewed-by: zhenwei pi +Reviewed-by: Gonglei +Reviewed-by: Markus Armbruster +Signed-off-by: Philippe Mathieu-Daudé +Message-Id: <20231120150418.93443-1-philmd@linaro.org> +Signed-off-by: gubin +--- + backends/cryptodev.c | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +diff --git a/backends/cryptodev.c b/backends/cryptodev.c +index e5006bd215..fff89fd62a 100644 +--- a/backends/cryptodev.c ++++ b/backends/cryptodev.c +@@ -398,6 +398,7 @@ static void cryptodev_backend_set_ops(Object *obj, Visitor *v, + static void + cryptodev_backend_complete(UserCreatable *uc, Error **errp) + { ++ ERRP_GUARD(); + CryptoDevBackend *backend = CRYPTODEV_BACKEND(uc); + CryptoDevBackendClass *bc = CRYPTODEV_BACKEND_GET_CLASS(uc); + uint32_t services; +@@ -406,11 +407,20 @@ cryptodev_backend_complete(UserCreatable *uc, Error **errp) + QTAILQ_INIT(&backend->opinfos); + value = backend->tc.buckets[THROTTLE_OPS_TOTAL].avg; + cryptodev_backend_set_throttle(backend, THROTTLE_OPS_TOTAL, 
value, errp); ++ if (*errp) { ++ return; ++ } + value = backend->tc.buckets[THROTTLE_BPS_TOTAL].avg; + cryptodev_backend_set_throttle(backend, THROTTLE_BPS_TOTAL, value, errp); ++ if (*errp) { ++ return; ++ } + + if (bc->init) { + bc->init(backend, errp); ++ if (*errp) { ++ return; ++ } + } + + services = backend->conf.crypto_services; +-- +2.41.0.windows.1 + diff --git a/backends-host_iommu_device-Introduce-HostIOMMUDevice.patch b/backends-host_iommu_device-Introduce-HostIOMMUDevice.patch new file mode 100644 index 0000000000000000000000000000000000000000..8bdfb79d0590b798699f8f2df7298b449e019c11 --- /dev/null +++ b/backends-host_iommu_device-Introduce-HostIOMMUDevice.patch @@ -0,0 +1,91 @@ +From ca210a4a8fe97dd56baa184671bb48bff9a54ecb Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Wed, 5 Jun 2024 16:30:28 +0800 +Subject: [PATCH] backends/host_iommu_device: Introduce HostIOMMUDeviceCaps +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +HostIOMMUDeviceCaps's elements map to the host IOMMU's capabilities. +Different platform IOMMU can support different elements. + +Currently only two elements, type and aw_bits, type hints the host +platform IOMMU type, i.e., INTEL vtd, ARM smmu, etc; aw_bits hints +host IOMMU address width. + +Introduce .get_cap() handler to check if HOST_IOMMU_DEVICE_CAP_XXX +is supported. + +Suggested-by: Cédric Le Goater +Signed-off-by: Zhenzhong Duan +Reviewed-by: Eric Auger +Reviewed-by: Michael S. Tsirkin +--- + include/sysemu/host_iommu_device.h | 38 ++++++++++++++++++++++++++++++ + 1 file changed, 38 insertions(+) + +diff --git a/include/sysemu/host_iommu_device.h b/include/sysemu/host_iommu_device.h +index db47a16189..a57873958b 100644 +--- a/include/sysemu/host_iommu_device.h ++++ b/include/sysemu/host_iommu_device.h +@@ -15,6 +15,18 @@ + #include "qom/object.h" + #include "qapi/error.h" + ++/** ++ * struct HostIOMMUDeviceCaps - Define host IOMMU device capabilities. 
++ * ++ * @type: host platform IOMMU type. ++ * ++ * @aw_bits: host IOMMU address width. 0xff if no limitation. ++ */ ++typedef struct HostIOMMUDeviceCaps { ++ uint32_t type; ++ uint8_t aw_bits; ++} HostIOMMUDeviceCaps; ++ + #define TYPE_HOST_IOMMU_DEVICE "host-iommu-device" + OBJECT_DECLARE_TYPE(HostIOMMUDevice, HostIOMMUDeviceClass, HOST_IOMMU_DEVICE) + +@@ -22,6 +34,7 @@ struct HostIOMMUDevice { + Object parent_obj; + + char *name; ++ HostIOMMUDeviceCaps caps; + }; + + /** +@@ -49,5 +62,30 @@ struct HostIOMMUDeviceClass { + * Returns: true on success, false on failure. + */ + bool (*realize)(HostIOMMUDevice *hiod, void *opaque, Error **errp); ++ /** ++ * @get_cap: check if a host IOMMU device capability is supported. ++ * ++ * Optional callback, if not implemented, hint not supporting query ++ * of @cap. ++ * ++ * @hiod: pointer to a host IOMMU device instance. ++ * ++ * @cap: capability to check. ++ * ++ * @errp: pass an Error out when fails to query capability. ++ * ++ * Returns: <0 on failure, 0 if a @cap is unsupported, or else ++ * 1 or some positive value for some special @cap, ++ * i.e., HOST_IOMMU_DEVICE_CAP_AW_BITS. ++ */ ++ int (*get_cap)(HostIOMMUDevice *hiod, int cap, Error **errp); + }; ++ ++/* ++ * Host IOMMU device capability list. 
++ */ ++#define HOST_IOMMU_DEVICE_CAP_IOMMU_TYPE 0 ++#define HOST_IOMMU_DEVICE_CAP_AW_BITS 1 ++ ++#define HOST_IOMMU_DEVICE_CAP_AW_BITS_MAX 64 + #endif +-- +2.41.0.windows.1 + diff --git a/backends-iommufd-Add-helpers-for-invalidating-user-m.patch b/backends-iommufd-Add-helpers-for-invalidating-user-m.patch new file mode 100644 index 0000000000000000000000000000000000000000..56eeca4cd3429c597ab835c2adca49f218ee0a4b --- /dev/null +++ b/backends-iommufd-Add-helpers-for-invalidating-user-m.patch @@ -0,0 +1,81 @@ +From cedca4d3635cde049151b5818df2cb66c2b1531f Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Fri, 3 Nov 2023 16:54:01 +0800 +Subject: [PATCH] backends/iommufd: Add helpers for invalidating user-managed + HWPT + +Signed-off-by: Nicolin Chen +Signed-off-by: Zhenzhong Duan +--- + backends/iommufd.c | 30 ++++++++++++++++++++++++++++++ + backends/trace-events | 1 + + include/sysemu/iommufd.h | 3 +++ + 3 files changed, 34 insertions(+) + +diff --git a/backends/iommufd.c b/backends/iommufd.c +index c1260766f0..cf24370385 100644 +--- a/backends/iommufd.c ++++ b/backends/iommufd.c +@@ -330,6 +330,36 @@ bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, + return true; + } + ++int iommufd_backend_invalidate_cache(IOMMUFDBackend *be, uint32_t hwpt_id, ++ uint32_t data_type, uint32_t entry_len, ++ uint32_t *entry_num, void *data_ptr) ++{ ++ int ret, fd = be->fd; ++ struct iommu_hwpt_invalidate cache = { ++ .size = sizeof(cache), ++ .hwpt_id = hwpt_id, ++ .data_type = data_type, ++ .entry_len = entry_len, ++ .entry_num = *entry_num, ++ .data_uptr = (uintptr_t)data_ptr, ++ }; ++ ++ ret = ioctl(fd, IOMMU_HWPT_INVALIDATE, &cache); ++ ++ trace_iommufd_backend_invalidate_cache(fd, hwpt_id, data_type, entry_len, ++ *entry_num, cache.entry_num, ++ (uintptr_t)data_ptr, ret); ++ if (ret) { ++ *entry_num = cache.entry_num; ++ error_report("IOMMU_HWPT_INVALIDATE failed: %s", strerror(errno)); ++ ret = -errno; ++ } else { ++ g_assert(*entry_num == 
cache.entry_num); ++ } ++ ++ return ret; ++} ++ + static int hiod_iommufd_get_cap(HostIOMMUDevice *hiod, int cap, Error **errp) + { + HostIOMMUDeviceCaps *caps = &hiod->caps; +diff --git a/backends/trace-events b/backends/trace-events +index b02433710a..ef0ff98921 100644 +--- a/backends/trace-events ++++ b/backends/trace-events +@@ -18,3 +18,4 @@ iommufd_backend_alloc_hwpt(int iommufd, uint32_t dev_id, uint32_t pt_id, uint32_ + iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d id=%d (%d)" + iommufd_backend_set_dirty(int iommufd, uint32_t hwpt_id, bool start, int ret) " iommufd=%d hwpt=%u enable=%d (%d)" + iommufd_backend_get_dirty_bitmap(int iommufd, uint32_t hwpt_id, uint64_t iova, uint64_t size, uint64_t page_size, int ret) " iommufd=%d hwpt=%u iova=0x%"PRIx64" size=0x%"PRIx64" page_size=0x%"PRIx64" (%d)" ++iommufd_backend_invalidate_cache(int iommufd, uint32_t hwpt_id, uint32_t data_type, uint32_t entry_len, uint32_t entry_num, uint32_t done_num, uint64_t data_ptr, int ret) " iommufd=%d hwpt_id=%u data_type=%u entry_len=%u entry_num=%u done_num=%u data_ptr=0x%"PRIx64" (%d)" +diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h +index 3b28c8a81c..f6596f6338 100644 +--- a/include/sysemu/iommufd.h ++++ b/include/sysemu/iommufd.h +@@ -63,6 +63,9 @@ bool iommufd_backend_get_dirty_bitmap(IOMMUFDBackend *be, uint32_t hwpt_id, + uint64_t iova, ram_addr_t size, + uint64_t page_size, uint64_t *data, + Error **errp); ++int iommufd_backend_invalidate_cache(IOMMUFDBackend *be, uint32_t hwpt_id, ++ uint32_t data_type, uint32_t entry_len, ++ uint32_t *entry_num, void *data_ptr); + + #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd" + #endif +-- +2.41.0.windows.1 + diff --git a/backends-iommufd-Extend-iommufd_backend_get_device_i.patch b/backends-iommufd-Extend-iommufd_backend_get_device_i.patch new file mode 100644 index 0000000000000000000000000000000000000000..37e9041fdefa4d29f9d862bdbd293000ffa740e0 --- /dev/null +++ 
b/backends-iommufd-Extend-iommufd_backend_get_device_i.patch @@ -0,0 +1,78 @@ +From 7d53d0938921d0faa32e1fef4c7bcc45d21f9bfb Mon Sep 17 00:00:00 2001 +From: Joao Martins +Date: Fri, 19 Jul 2024 13:04:51 +0100 +Subject: [PATCH] backends/iommufd: Extend iommufd_backend_get_device_info() to + fetch HW capabilities +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The helper will be able to fetch vendor agnostic IOMMU capabilities +supported both by hardware and software. Right now it is only iommu dirty +tracking. + +Signed-off-by: Joao Martins +Reviewed-by: Zhenzhong Duan +Reviewed-by: Cédric Le Goater +Reviewed-by: Eric Auger +--- + backends/iommufd.c | 4 +++- + hw/vfio/iommufd.c | 4 +++- + include/sysemu/iommufd.h | 2 +- + 3 files changed, 7 insertions(+), 3 deletions(-) + +diff --git a/backends/iommufd.c b/backends/iommufd.c +index 7e805bd664..1ce2a24226 100644 +--- a/backends/iommufd.c ++++ b/backends/iommufd.c +@@ -225,7 +225,7 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, + + bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, + uint32_t *type, void *data, uint32_t len, +- Error **errp) ++ uint64_t *caps, Error **errp) + { + struct iommu_hw_info info = { + .size = sizeof(info), +@@ -241,6 +241,8 @@ bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, + + g_assert(type); + *type = info.out_data_type; ++ g_assert(caps); ++ *caps = info.out_capabilities; + + return true; + } +diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c +index 7cbf0e44f1..d5b923ca83 100644 +--- a/hw/vfio/iommufd.c ++++ b/hw/vfio/iommufd.c +@@ -647,9 +647,11 @@ static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, + union { + struct iommu_hw_info_vtd vtd; + } data; ++ uint64_t hw_caps; + + if (!iommufd_backend_get_device_info(vdev->iommufd, vdev->devid, +- &type, &data, sizeof(data), errp)) { ++ &type, &data, sizeof(data), ++ &hw_caps, errp)) { + return false; + 
} + +diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h +index dfade18e6d..a0a0143856 100644 +--- a/include/sysemu/iommufd.h ++++ b/include/sysemu/iommufd.h +@@ -51,7 +51,7 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, + hwaddr iova, ram_addr_t size); + bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, + uint32_t *type, void *data, uint32_t len, +- Error **errp); ++ uint64_t *caps, Error **errp); + + #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd" + #endif +-- +2.41.0.windows.1 + diff --git a/backends-iommufd-Implement-HostIOMMUDeviceClass-get_.patch b/backends-iommufd-Implement-HostIOMMUDeviceClass-get_.patch new file mode 100644 index 0000000000000000000000000000000000000000..e92fe42b532e8a5ced8892799b70ab64a34b4619 --- /dev/null +++ b/backends-iommufd-Implement-HostIOMMUDeviceClass-get_.patch @@ -0,0 +1,61 @@ +From 2f1a2f4b320e70a85cef8392cd5f4b1e54afb9c9 Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Wed, 5 Jun 2024 16:30:36 +0800 +Subject: [PATCH] backends/iommufd: Implement HostIOMMUDeviceClass::get_cap() + handler +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Suggested-by: Cédric Le Goater +Signed-off-by: Zhenzhong Duan +Reviewed-by: Eric Auger +Reviewed-by: Michael S. 
Tsirkin +--- + backends/iommufd.c | 23 +++++++++++++++++++++++ + 1 file changed, 23 insertions(+) + +diff --git a/backends/iommufd.c b/backends/iommufd.c +index 604a8f4e7d..7e805bd664 100644 +--- a/backends/iommufd.c ++++ b/backends/iommufd.c +@@ -245,6 +245,28 @@ bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, + return true; + } + ++static int hiod_iommufd_get_cap(HostIOMMUDevice *hiod, int cap, Error **errp) ++{ ++ HostIOMMUDeviceCaps *caps = &hiod->caps; ++ ++ switch (cap) { ++ case HOST_IOMMU_DEVICE_CAP_IOMMU_TYPE: ++ return caps->type; ++ case HOST_IOMMU_DEVICE_CAP_AW_BITS: ++ return caps->aw_bits; ++ default: ++ error_setg(errp, "%s: unsupported capability %x", hiod->name, cap); ++ return -EINVAL; ++ } ++} ++ ++static void hiod_iommufd_class_init(ObjectClass *oc, void *data) ++{ ++ HostIOMMUDeviceClass *hioc = HOST_IOMMU_DEVICE_CLASS(oc); ++ ++ hioc->get_cap = hiod_iommufd_get_cap; ++}; ++ + static const TypeInfo types[] = { + { + .name = TYPE_IOMMUFD_BACKEND, +@@ -261,6 +283,7 @@ static const TypeInfo types[] = { + }, { + .name = TYPE_HOST_IOMMU_DEVICE_IOMMUFD, + .parent = TYPE_HOST_IOMMU_DEVICE, ++ .class_init = hiod_iommufd_class_init, + .abstract = true, + } + }; +-- +2.41.0.windows.1 + diff --git a/backends-iommufd-Introduce-TYPE_HOST_IOMMU_DEVICE_IO.patch b/backends-iommufd-Introduce-TYPE_HOST_IOMMU_DEVICE_IO.patch new file mode 100644 index 0000000000000000000000000000000000000000..13e26c512f929e1b6e285bb2799e742c811548cc --- /dev/null +++ b/backends-iommufd-Introduce-TYPE_HOST_IOMMU_DEVICE_IO.patch @@ -0,0 +1,158 @@ +From 50142057ec070a70f3f38ec272ec61cc3ae6e071 Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Wed, 5 Jun 2024 16:30:30 +0800 +Subject: [PATCH] backends/iommufd: Introduce + TYPE_HOST_IOMMU_DEVICE_IOMMUFD[_VFIO] devices +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +TYPE_HOST_IOMMU_DEVICE_IOMMUFD represents a host IOMMU device under +iommufd backend. 
It is abstract, because it is going to be derived +into VFIO or VDPA type'd device. + +It will have its own .get_cap() implementation. + +TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO is a sub-class of +TYPE_HOST_IOMMU_DEVICE_IOMMUFD, represents a VFIO type'd host IOMMU +device under iommufd backend. It will be created during VFIO device +attaching and passed to vIOMMU. + +It will have its own .realize() implementation. + +Opportunistically, add missed header to include/sysemu/iommufd.h. + +Suggested-by: Cédric Le Goater +Signed-off-by: Yi Liu +Signed-off-by: Zhenzhong Duan +Reviewed-by: Eric Auger +Reviewed-by: Michael S. Tsirkin +--- + backends/iommufd.c | 36 +++++++++++++++++------------------ + hw/vfio/iommufd.c | 5 ++++- + include/hw/vfio/vfio-common.h | 3 +++ + include/sysemu/iommufd.h | 16 ++++++++++++++++ + 4 files changed, 41 insertions(+), 19 deletions(-) + +diff --git a/backends/iommufd.c b/backends/iommufd.c +index ba58a0eb0d..a2b7f5c3c4 100644 +--- a/backends/iommufd.c ++++ b/backends/iommufd.c +@@ -223,23 +223,23 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, + return ret; + } + +-static const TypeInfo iommufd_backend_info = { +- .name = TYPE_IOMMUFD_BACKEND, +- .parent = TYPE_OBJECT, +- .instance_size = sizeof(IOMMUFDBackend), +- .instance_init = iommufd_backend_init, +- .instance_finalize = iommufd_backend_finalize, +- .class_size = sizeof(IOMMUFDBackendClass), +- .class_init = iommufd_backend_class_init, +- .interfaces = (InterfaceInfo[]) { +- { TYPE_USER_CREATABLE }, +- { } ++static const TypeInfo types[] = { ++ { ++ .name = TYPE_IOMMUFD_BACKEND, ++ .parent = TYPE_OBJECT, ++ .instance_size = sizeof(IOMMUFDBackend), ++ .instance_init = iommufd_backend_init, ++ .instance_finalize = iommufd_backend_finalize, ++ .class_size = sizeof(IOMMUFDBackendClass), ++ .class_init = iommufd_backend_class_init, ++ .interfaces = (InterfaceInfo[]) { ++ { TYPE_USER_CREATABLE }, ++ { } ++ } ++ }, { ++ .name = TYPE_HOST_IOMMU_DEVICE_IOMMUFD, ++ .parent = 
TYPE_HOST_IOMMU_DEVICE, ++ .abstract = true, + } + }; +- +-static void register_types(void) +-{ +- type_register_static(&iommufd_backend_info); +-} +- +-type_init(register_types); ++DEFINE_TYPES(types) +diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c +index d4c586e842..7a4b818830 100644 +--- a/hw/vfio/iommufd.c ++++ b/hw/vfio/iommufd.c +@@ -641,7 +641,10 @@ static const TypeInfo types[] = { + .name = TYPE_VFIO_IOMMU_IOMMUFD, + .parent = TYPE_VFIO_IOMMU, + .class_init = vfio_iommu_iommufd_class_init, +- }, ++ }, { ++ .name = TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO, ++ .parent = TYPE_HOST_IOMMU_DEVICE_IOMMUFD, ++ } + }; + + DEFINE_TYPES(types) +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index 0c807c2806..2cfc8521cd 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -32,6 +32,7 @@ + #include "sysemu/sysemu.h" + #include "hw/vfio/vfio-container-base.h" + #include "sysemu/host_iommu_device.h" ++#include "sysemu/iommufd.h" + + #define VFIO_MSG_PREFIX "vfio %s: " + +@@ -77,6 +78,8 @@ typedef struct VFIOMigration { + struct VFIOGroup; + + #define TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO TYPE_HOST_IOMMU_DEVICE "-legacy-vfio" ++#define TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO \ ++ TYPE_HOST_IOMMU_DEVICE_IOMMUFD "-vfio" + + typedef struct VFIODMARange { + QLIST_ENTRY(VFIODMARange) next; +diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h +index 9c5524b0ed..1a75e82f42 100644 +--- a/include/sysemu/iommufd.h ++++ b/include/sysemu/iommufd.h +@@ -1,3 +1,16 @@ ++/* ++ * iommufd container backend declaration ++ * ++ * Copyright (C) 2024 Intel Corporation. ++ * Copyright Red Hat, Inc. 
2024 ++ * ++ * Authors: Yi Liu ++ * Eric Auger ++ * Zhenzhong Duan ++ * ++ * SPDX-License-Identifier: GPL-2.0-or-later ++ */ ++ + #ifndef SYSEMU_IOMMUFD_H + #define SYSEMU_IOMMUFD_H + +@@ -5,6 +18,7 @@ + #include "qemu/thread.h" + #include "exec/hwaddr.h" + #include "exec/cpu-common.h" ++#include "sysemu/host_iommu_device.h" + + #define TYPE_IOMMUFD_BACKEND "iommufd" + OBJECT_DECLARE_TYPE(IOMMUFDBackend, IOMMUFDBackendClass, IOMMUFD_BACKEND) +@@ -35,4 +49,6 @@ int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova, + ram_addr_t size, void *vaddr, bool readonly); + int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, + hwaddr iova, ram_addr_t size); ++ ++#define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd" + #endif +-- +2.41.0.windows.1 + diff --git a/backends-iommufd-Introduce-helper-function-iommufd_b.patch b/backends-iommufd-Introduce-helper-function-iommufd_b.patch new file mode 100644 index 0000000000000000000000000000000000000000..b45a86888b7bc857ea056547446deedf4949ca3a --- /dev/null +++ b/backends-iommufd-Introduce-helper-function-iommufd_b.patch @@ -0,0 +1,69 @@ +From ccd8baf4648e6fd6b69e65ee249609904edc92e1 Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Wed, 5 Jun 2024 16:30:33 +0800 +Subject: [PATCH] backends/iommufd: Introduce helper function + iommufd_backend_get_device_info() + +Introduce a helper function iommufd_backend_get_device_info() to get +host IOMMU related information through iommufd uAPI. + +Signed-off-by: Yi Liu +Signed-off-by: Yi Sun +Signed-off-by: Zhenzhong Duan +Reviewed-by: Eric Auger +Reviewed-by: Michael S. 
Tsirkin +--- + backends/iommufd.c | 22 ++++++++++++++++++++++ + include/sysemu/iommufd.h | 3 +++ + 2 files changed, 25 insertions(+) + +diff --git a/backends/iommufd.c b/backends/iommufd.c +index a2b7f5c3c4..604a8f4e7d 100644 +--- a/backends/iommufd.c ++++ b/backends/iommufd.c +@@ -223,6 +223,28 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, + return ret; + } + ++bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, ++ uint32_t *type, void *data, uint32_t len, ++ Error **errp) ++{ ++ struct iommu_hw_info info = { ++ .size = sizeof(info), ++ .dev_id = devid, ++ .data_len = len, ++ .data_uptr = (uintptr_t)data, ++ }; ++ ++ if (ioctl(be->fd, IOMMU_GET_HW_INFO, &info)) { ++ error_setg_errno(errp, errno, "Failed to get hardware info"); ++ return false; ++ } ++ ++ g_assert(type); ++ *type = info.out_data_type; ++ ++ return true; ++} ++ + static const TypeInfo types[] = { + { + .name = TYPE_IOMMUFD_BACKEND, +diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h +index 1a75e82f42..dfade18e6d 100644 +--- a/include/sysemu/iommufd.h ++++ b/include/sysemu/iommufd.h +@@ -49,6 +49,9 @@ int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova, + ram_addr_t size, void *vaddr, bool readonly); + int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, + hwaddr iova, ram_addr_t size); ++bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, ++ uint32_t *type, void *data, uint32_t len, ++ Error **errp); + + #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd" + #endif +-- +2.41.0.windows.1 + diff --git a/backends-iommufd-Introduce-iommufd_backend_alloc_vio.patch b/backends-iommufd-Introduce-iommufd_backend_alloc_vio.patch new file mode 100644 index 0000000000000000000000000000000000000000..be79447e026308cef5a70b976d4c2a41b6f9c18f --- /dev/null +++ b/backends-iommufd-Introduce-iommufd_backend_alloc_vio.patch @@ -0,0 +1,100 @@ +From 
207259b8f08e87b4a741a8b7884e699c95641a2e Mon Sep 17 00:00:00 2001 +From: Nicolin Chen +Date: Sat, 13 Apr 2024 00:15:17 +0000 +Subject: [PATCH] backends/iommufd: Introduce iommufd_backend_alloc_viommu + +Add a helper to allocate a viommu object. + +Signed-off-by: Nicolin Chen +--- + backends/iommufd.c | 35 +++++++++++++++++++++++++++++++++++ + backends/trace-events | 1 + + include/sysemu/iommufd.h | 10 ++++++++++ + 3 files changed, 46 insertions(+) + +diff --git a/backends/iommufd.c b/backends/iommufd.c +index c10aa9b011..82368a3918 100644 +--- a/backends/iommufd.c ++++ b/backends/iommufd.c +@@ -360,6 +360,41 @@ int iommufd_backend_invalidate_cache(IOMMUFDBackend *be, uint32_t hwpt_id, + return ret; + } + ++struct IOMMUFDViommu *iommufd_backend_alloc_viommu(IOMMUFDBackend *be, ++ uint32_t dev_id, ++ uint32_t viommu_type, ++ uint32_t hwpt_id) ++{ ++ int ret, fd = be->fd; ++ struct IOMMUFDViommu *viommu = g_malloc(sizeof(*viommu)); ++ struct iommu_viommu_alloc alloc_viommu = { ++ .size = sizeof(alloc_viommu), ++ .type = viommu_type, ++ .dev_id = dev_id, ++ .hwpt_id = hwpt_id, ++ }; ++ ++ if (!viommu) { ++ error_report("failed to allocate viommu object"); ++ return NULL; ++ } ++ ++ ret = ioctl(fd, IOMMU_VIOMMU_ALLOC, &alloc_viommu); ++ ++ trace_iommufd_backend_alloc_viommu(fd, viommu_type, dev_id, hwpt_id, ++ alloc_viommu.out_viommu_id, ret); ++ if (ret) { ++ error_report("IOMMU_VIOMMU_ALLOC failed: %s", strerror(errno)); ++ g_free(viommu); ++ return NULL; ++ } ++ ++ viommu->viommu_id = alloc_viommu.out_viommu_id; ++ viommu->s2_hwpt_id = hwpt_id; ++ viommu->iommufd = be; ++ return viommu; ++} ++ + bool host_iommu_device_iommufd_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + uint32_t hwpt_id, Error **errp) + { +diff --git a/backends/trace-events b/backends/trace-events +index ef0ff98921..c24cd378df 100644 +--- a/backends/trace-events ++++ b/backends/trace-events +@@ -19,3 +19,4 @@ iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d id=%d (% + 
iommufd_backend_set_dirty(int iommufd, uint32_t hwpt_id, bool start, int ret) " iommufd=%d hwpt=%u enable=%d (%d)" + iommufd_backend_get_dirty_bitmap(int iommufd, uint32_t hwpt_id, uint64_t iova, uint64_t size, uint64_t page_size, int ret) " iommufd=%d hwpt=%u iova=0x%"PRIx64" size=0x%"PRIx64" page_size=0x%"PRIx64" (%d)" + iommufd_backend_invalidate_cache(int iommufd, uint32_t hwpt_id, uint32_t data_type, uint32_t entry_len, uint32_t entry_num, uint32_t done_num, uint64_t data_ptr, int ret) " iommufd=%d hwpt_id=%u data_type=%u entry_len=%u entry_num=%u done_num=%u data_ptr=0x%"PRIx64" (%d)" ++iommufd_backend_alloc_viommu(int iommufd, uint32_t type, uint32_t dev_id, uint32_t hwpt_id, uint32_t viommu_id, int ret) " iommufd=%d type=%u dev_id=%u hwpt_id=%u viommu_id=%u (%d)" +diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h +index 3dc6934144..05a08c49c2 100644 +--- a/include/sysemu/iommufd.h ++++ b/include/sysemu/iommufd.h +@@ -39,6 +39,12 @@ struct IOMMUFDBackend { + /*< public >*/ + }; + ++typedef struct IOMMUFDViommu { ++ IOMMUFDBackend *iommufd; ++ uint32_t s2_hwpt_id; ++ uint32_t viommu_id; ++} IOMMUFDViommu; ++ + int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp); + void iommufd_backend_disconnect(IOMMUFDBackend *be); + +@@ -66,6 +72,10 @@ bool iommufd_backend_get_dirty_bitmap(IOMMUFDBackend *be, uint32_t hwpt_id, + int iommufd_backend_invalidate_cache(IOMMUFDBackend *be, uint32_t hwpt_id, + uint32_t data_type, uint32_t entry_len, + uint32_t *entry_num, void *data_ptr); ++struct IOMMUFDViommu *iommufd_backend_alloc_viommu(IOMMUFDBackend *be, ++ uint32_t dev_id, ++ uint32_t viommu_type, ++ uint32_t hwpt_id); + + #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd" + OBJECT_DECLARE_TYPE(HostIOMMUDeviceIOMMUFD, HostIOMMUDeviceIOMMUFDClass, +-- +2.41.0.windows.1 + diff --git a/backends-iommufd-Introduce-iommufd_vdev_alloc.patch b/backends-iommufd-Introduce-iommufd_vdev_alloc.patch new file mode 100644 index 
0000000000000000000000000000000000000000..91f8ea357fc14ec71698eeab793267366c3ac15a --- /dev/null +++ b/backends-iommufd-Introduce-iommufd_vdev_alloc.patch @@ -0,0 +1,89 @@ +From 005b8f4b6cef11982abcc2c071cbe40b69fb22e7 Mon Sep 17 00:00:00 2001 +From: Nicolin Chen +Date: Sat, 13 Apr 2024 00:21:22 +0000 +Subject: [PATCH] backends/iommufd: Introduce iommufd_vdev_alloc + +Add a helper to allocate an iommufd device's virtual device (in the user +space) per a viommu instance. + +Signed-off-by: Nicolin Chen +--- + backends/iommufd.c | 31 +++++++++++++++++++++++++++++++ + backends/trace-events | 1 + + include/sysemu/iommufd.h | 11 +++++++++++ + 3 files changed, 43 insertions(+) + +diff --git a/backends/iommufd.c b/backends/iommufd.c +index 82368a3918..af3376d0bf 100644 +--- a/backends/iommufd.c ++++ b/backends/iommufd.c +@@ -395,6 +395,37 @@ struct IOMMUFDViommu *iommufd_backend_alloc_viommu(IOMMUFDBackend *be, + return viommu; + } + ++struct IOMMUFDVdev *iommufd_backend_alloc_vdev(HostIOMMUDeviceIOMMUFD *idev, ++ IOMMUFDViommu *viommu, ++ uint64_t virt_id) ++{ ++ int ret, fd = viommu->iommufd->fd; ++ struct IOMMUFDVdev *vdev = g_malloc(sizeof(*vdev)); ++ struct iommu_vdevice_alloc alloc_vdev = { ++ .size = sizeof(alloc_vdev), ++ .viommu_id = viommu->viommu_id, ++ .dev_id = idev->devid, ++ .virt_id = virt_id, ++ }; ++ ++ ret = ioctl(fd, IOMMU_VDEVICE_ALLOC, &alloc_vdev); ++ ++ trace_iommufd_backend_alloc_vdev(fd, idev->devid, viommu->viommu_id, virt_id, ++ alloc_vdev.out_vdevice_id, ret); ++ ++ if (ret) { ++ error_report("IOMMU_VDEVICE_ALLOC failed: %s", strerror(errno)); ++ g_free(vdev); ++ return NULL; ++ } ++ ++ vdev->idev = idev; ++ vdev->viommu = viommu; ++ vdev->virt_id = virt_id; ++ vdev->vdev_id = alloc_vdev.out_vdevice_id; ++ return vdev; ++} ++ + bool host_iommu_device_iommufd_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + uint32_t hwpt_id, Error **errp) + { +diff --git a/backends/trace-events b/backends/trace-events +index c24cd378df..e150a37e9a 100644 +--- 
a/backends/trace-events ++++ b/backends/trace-events +@@ -20,3 +20,4 @@ iommufd_backend_set_dirty(int iommufd, uint32_t hwpt_id, bool start, int ret) " + iommufd_backend_get_dirty_bitmap(int iommufd, uint32_t hwpt_id, uint64_t iova, uint64_t size, uint64_t page_size, int ret) " iommufd=%d hwpt=%u iova=0x%"PRIx64" size=0x%"PRIx64" page_size=0x%"PRIx64" (%d)" + iommufd_backend_invalidate_cache(int iommufd, uint32_t hwpt_id, uint32_t data_type, uint32_t entry_len, uint32_t entry_num, uint32_t done_num, uint64_t data_ptr, int ret) " iommufd=%d hwpt_id=%u data_type=%u entry_len=%u entry_num=%u done_num=%u data_ptr=0x%"PRIx64" (%d)" + iommufd_backend_alloc_viommu(int iommufd, uint32_t type, uint32_t dev_id, uint32_t hwpt_id, uint32_t viommu_id, int ret) " iommufd=%d type=%u dev_id=%u hwpt_id=%u viommu_id=%u (%d)" ++iommufd_backend_alloc_vdev(int iommufd, uint32_t dev_id, uint32_t viommu_id, uint64_t virt_id, uint32_t vdev_id, int ret) " iommufd=%d dev_id=%u viommu_id=%u virt_id=0x%"PRIx64" vdev_id=%u (%d)" +diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h +index 05a08c49c2..0284e95460 100644 +--- a/include/sysemu/iommufd.h ++++ b/include/sysemu/iommufd.h +@@ -128,4 +128,15 @@ bool host_iommu_device_iommufd_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + uint32_t hwpt_id, Error **errp); + bool host_iommu_device_iommufd_detach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + Error **errp); ++ ++typedef struct IOMMUFDVdev { ++ HostIOMMUDeviceIOMMUFD *idev; ++ IOMMUFDViommu *viommu; ++ uint32_t vdev_id; ++ uint64_t virt_id; ++} IOMMUFDVdev; ++ ++struct IOMMUFDVdev *iommufd_backend_alloc_vdev(HostIOMMUDeviceIOMMUFD *idev, ++ IOMMUFDViommu *viommu, ++ uint64_t virt_id); + #endif +-- +2.41.0.windows.1 + diff --git a/backends-iommufd-Introduce-iommufd_viommu_invalidate.patch b/backends-iommufd-Introduce-iommufd_viommu_invalidate.patch new file mode 100644 index 0000000000000000000000000000000000000000..a835fb06dc5c9aa31876bb399112bb9b676eb583 --- /dev/null +++ 
b/backends-iommufd-Introduce-iommufd_viommu_invalidate.patch @@ -0,0 +1,84 @@ +From 2be28f75e4ed2a0a35549dd1a545e0655e63973d Mon Sep 17 00:00:00 2001 +From: Nicolin Chen +Date: Fri, 12 Apr 2024 23:27:54 +0000 +Subject: [PATCH] backends/iommufd: Introduce iommufd_viommu_invalidate_cache + +Similar to iommufd_backend_invalidate_cache for iotlb invalidation via +IOMMU_HWPT_INVALIDATE ioctl, add a new helper for viommu specific cache +invalidation via IOMMU_VIOMMU_INVALIDATE ioctl. + +Signed-off-by: Nicolin Chen +--- + backends/iommufd.c | 31 +++++++++++++++++++++++++++++++ + backends/trace-events | 1 + + include/sysemu/iommufd.h | 3 +++ + 3 files changed, 35 insertions(+) + +diff --git a/backends/iommufd.c b/backends/iommufd.c +index af3376d0bf..ee6f5bcf65 100644 +--- a/backends/iommufd.c ++++ b/backends/iommufd.c +@@ -426,6 +426,37 @@ struct IOMMUFDVdev *iommufd_backend_alloc_vdev(HostIOMMUDeviceIOMMUFD *idev, + return vdev; + } + ++int iommufd_viommu_invalidate_cache(IOMMUFDBackend *be, uint32_t viommu_id, ++ uint32_t data_type, uint32_t entry_len, ++ uint32_t *entry_num, void *data_ptr) ++{ ++ int ret, fd = be->fd; ++ struct iommu_hwpt_invalidate cache = { ++ .size = sizeof(cache), ++ .hwpt_id = viommu_id, ++ .data_type = data_type, ++ .entry_len = entry_len, ++ .entry_num = *entry_num, ++ .data_uptr = (uint64_t)data_ptr, ++ }; ++ ++ ret = ioctl(fd, IOMMU_HWPT_INVALIDATE, &cache); ++ ++ trace_iommufd_viommu_invalidate_cache(fd, viommu_id, data_type, ++ entry_len, *entry_num, ++ cache.entry_num, ++ (uint64_t)data_ptr, ret); ++ if (ret) { ++ *entry_num = cache.entry_num; ++ error_report("IOMMU_VIOMMU_INVALIDATE failed: %s", strerror(errno)); ++ ret = -errno; ++ } else { ++ g_assert(*entry_num == cache.entry_num); ++ } ++ ++ return ret; ++} ++ + bool host_iommu_device_iommufd_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + uint32_t hwpt_id, Error **errp) + { +diff --git a/backends/trace-events b/backends/trace-events +index e150a37e9a..f8592a2711 100644 +--- 
a/backends/trace-events ++++ b/backends/trace-events +@@ -21,3 +21,4 @@ iommufd_backend_get_dirty_bitmap(int iommufd, uint32_t hwpt_id, uint64_t iova, u + iommufd_backend_invalidate_cache(int iommufd, uint32_t hwpt_id, uint32_t data_type, uint32_t entry_len, uint32_t entry_num, uint32_t done_num, uint64_t data_ptr, int ret) " iommufd=%d hwpt_id=%u data_type=%u entry_len=%u entry_num=%u done_num=%u data_ptr=0x%"PRIx64" (%d)" + iommufd_backend_alloc_viommu(int iommufd, uint32_t type, uint32_t dev_id, uint32_t hwpt_id, uint32_t viommu_id, int ret) " iommufd=%d type=%u dev_id=%u hwpt_id=%u viommu_id=%u (%d)" + iommufd_backend_alloc_vdev(int iommufd, uint32_t dev_id, uint32_t viommu_id, uint64_t virt_id, uint32_t vdev_id, int ret) " iommufd=%d dev_id=%u viommu_id=%u virt_id=0x%"PRIx64" vdev_id=%u (%d)" ++iommufd_viommu_invalidate_cache(int iommufd, uint32_t viommu_id, uint32_t data_type, uint32_t entry_len, uint32_t entry_num, uint32_t done_num, uint64_t data_ptr, int ret) " iommufd=%d viommu_id=%u data_type=%u entry_len=%u entry_num=%u done_num=%u data_ptr=0x%"PRIx64" (%d)" +diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h +index 0284e95460..0f2c826036 100644 +--- a/include/sysemu/iommufd.h ++++ b/include/sysemu/iommufd.h +@@ -76,6 +76,9 @@ struct IOMMUFDViommu *iommufd_backend_alloc_viommu(IOMMUFDBackend *be, + uint32_t dev_id, + uint32_t viommu_type, + uint32_t hwpt_id); ++int iommufd_viommu_invalidate_cache(IOMMUFDBackend *be, uint32_t viommu_id, ++ uint32_t data_type, uint32_t entry_len, ++ uint32_t *entry_num, void *data_ptr); + + #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd" + OBJECT_DECLARE_TYPE(HostIOMMUDeviceIOMMUFD, HostIOMMUDeviceIOMMUFDClass, +-- +2.41.0.windows.1 + diff --git a/backends-iommufd-Introduce-the-iommufd-object.patch b/backends-iommufd-Introduce-the-iommufd-object.patch new file mode 100644 index 0000000000000000000000000000000000000000..a6a2a10f5d3ddd84e6f767eb4763983015e7dbae --- /dev/null +++ 
b/backends-iommufd-Introduce-the-iommufd-object.patch @@ -0,0 +1,468 @@ +From 6cb41a55992571dd215fee86ed910bb4d6688bf8 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Sat, 11 Jan 2025 10:52:37 +0800 +Subject: [PATCH] backends/iommufd: Introduce the iommufd object +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Introduce an iommufd object which allows the interaction +with the host /dev/iommu device. + +The /dev/iommu can have been already pre-opened outside of qemu, +in which case the fd can be passed directly along with the +iommufd object: + +This allows the iommufd object to be shared across several +subsystems (VFIO, VDPA, ...). For example, libvirt would open +the /dev/iommu once. + +If no fd is passed along with the iommufd object, the /dev/iommu +is opened by the qemu code. + +Suggested-by: Alex Williamson +Signed-off-by: Eric Auger +Signed-off-by: Yi Liu +Signed-off-by: Zhenzhong Duan +Reviewed-by: Cédric Le Goater +Tested-by: Eric Auger +Tested-by: Nicolin Chen +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhou Wang +--- + MAINTAINERS | 8 ++ + backends/Kconfig | 4 + + backends/iommufd.c | 245 +++++++++++++++++++++++++++++++++++++++ + backends/meson.build | 1 + + backends/trace-events | 10 ++ + include/sysemu/iommufd.h | 38 ++++++ + qapi/qom.json | 19 +++ + qemu-options.hx | 12 ++ + 8 files changed, 337 insertions(+) + create mode 100644 backends/iommufd.c + create mode 100644 include/sysemu/iommufd.h + +diff --git a/MAINTAINERS b/MAINTAINERS +index 695e0bd34f..a5a446914a 100644 +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -2167,6 +2167,14 @@ F: hw/vfio/ap.c + F: docs/system/s390x/vfio-ap.rst + L: qemu-s390x@nongnu.org + ++iommufd ++M: Yi Liu ++M: Eric Auger ++M: Zhenzhong Duan ++S: Supported ++F: backends/iommufd.c ++F: include/sysemu/iommufd.h ++ + vhost + M: Michael S. 
Tsirkin + S: Supported +diff --git a/backends/Kconfig b/backends/Kconfig +index f35abc1609..2cb23f62fa 100644 +--- a/backends/Kconfig ++++ b/backends/Kconfig +@@ -1 +1,5 @@ + source tpm/Kconfig ++ ++config IOMMUFD ++ bool ++ depends on VFIO +diff --git a/backends/iommufd.c b/backends/iommufd.c +new file mode 100644 +index 0000000000..ba58a0eb0d +--- /dev/null ++++ b/backends/iommufd.c +@@ -0,0 +1,245 @@ ++/* ++ * iommufd container backend ++ * ++ * Copyright (C) 2023 Intel Corporation. ++ * Copyright Red Hat, Inc. 2023 ++ * ++ * Authors: Yi Liu ++ * Eric Auger ++ * ++ * SPDX-License-Identifier: GPL-2.0-or-later ++ */ ++ ++#include "qemu/osdep.h" ++#include "sysemu/iommufd.h" ++#include "qapi/error.h" ++#include "qapi/qmp/qerror.h" ++#include "qemu/module.h" ++#include "qom/object_interfaces.h" ++#include "qemu/error-report.h" ++#include "monitor/monitor.h" ++#include "trace.h" ++#include ++#include ++ ++static void iommufd_backend_init(Object *obj) ++{ ++ IOMMUFDBackend *be = IOMMUFD_BACKEND(obj); ++ ++ be->fd = -1; ++ be->users = 0; ++ be->owned = true; ++ qemu_mutex_init(&be->lock); ++} ++ ++static void iommufd_backend_finalize(Object *obj) ++{ ++ IOMMUFDBackend *be = IOMMUFD_BACKEND(obj); ++ ++ if (be->owned) { ++ close(be->fd); ++ be->fd = -1; ++ } ++} ++ ++static void iommufd_backend_set_fd(Object *obj, const char *str, Error **errp) ++{ ++ IOMMUFDBackend *be = IOMMUFD_BACKEND(obj); ++ int fd = -1; ++ ++ fd = monitor_fd_param(monitor_cur(), str, errp); ++ if (fd == -1) { ++ error_prepend(errp, "Could not parse remote object fd %s:", str); ++ return; ++ } ++ qemu_mutex_lock(&be->lock); ++ be->fd = fd; ++ be->owned = false; ++ qemu_mutex_unlock(&be->lock); ++ trace_iommu_backend_set_fd(be->fd); ++} ++ ++static bool iommufd_backend_can_be_deleted(UserCreatable *uc) ++{ ++ IOMMUFDBackend *be = IOMMUFD_BACKEND(uc); ++ ++ return !be->users; ++} ++ ++static void iommufd_backend_class_init(ObjectClass *oc, void *data) ++{ ++ UserCreatableClass *ucc = 
USER_CREATABLE_CLASS(oc); ++ ++ ucc->can_be_deleted = iommufd_backend_can_be_deleted; ++ ++ object_class_property_add_str(oc, "fd", NULL, iommufd_backend_set_fd); ++} ++ ++int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp) ++{ ++ int fd, ret = 0; ++ ++ qemu_mutex_lock(&be->lock); ++ if (be->users == UINT32_MAX) { ++ error_setg(errp, "too many connections"); ++ ret = -E2BIG; ++ goto out; ++ } ++ if (be->owned && !be->users) { ++ fd = qemu_open_old("/dev/iommu", O_RDWR); ++ if (fd < 0) { ++ error_setg_errno(errp, errno, "/dev/iommu opening failed"); ++ ret = fd; ++ goto out; ++ } ++ be->fd = fd; ++ } ++ be->users++; ++out: ++ trace_iommufd_backend_connect(be->fd, be->owned, ++ be->users, ret); ++ qemu_mutex_unlock(&be->lock); ++ return ret; ++} ++ ++void iommufd_backend_disconnect(IOMMUFDBackend *be) ++{ ++ qemu_mutex_lock(&be->lock); ++ if (!be->users) { ++ goto out; ++ } ++ be->users--; ++ if (!be->users && be->owned) { ++ close(be->fd); ++ be->fd = -1; ++ } ++out: ++ trace_iommufd_backend_disconnect(be->fd, be->users); ++ qemu_mutex_unlock(&be->lock); ++} ++ ++int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id, ++ Error **errp) ++{ ++ int ret, fd = be->fd; ++ struct iommu_ioas_alloc alloc_data = { ++ .size = sizeof(alloc_data), ++ .flags = 0, ++ }; ++ ++ ret = ioctl(fd, IOMMU_IOAS_ALLOC, &alloc_data); ++ if (ret) { ++ error_setg_errno(errp, errno, "Failed to allocate ioas"); ++ return ret; ++ } ++ ++ *ioas_id = alloc_data.out_ioas_id; ++ trace_iommufd_backend_alloc_ioas(fd, *ioas_id, ret); ++ ++ return ret; ++} ++ ++void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id) ++{ ++ int ret, fd = be->fd; ++ struct iommu_destroy des = { ++ .size = sizeof(des), ++ .id = id, ++ }; ++ ++ ret = ioctl(fd, IOMMU_DESTROY, &des); ++ trace_iommufd_backend_free_id(fd, id, ret); ++ if (ret) { ++ error_report("Failed to free id: %u %m", id); ++ } ++} ++ ++int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova, ++ ram_addr_t 
size, void *vaddr, bool readonly) ++{ ++ int ret, fd = be->fd; ++ struct iommu_ioas_map map = { ++ .size = sizeof(map), ++ .flags = IOMMU_IOAS_MAP_READABLE | ++ IOMMU_IOAS_MAP_FIXED_IOVA, ++ .ioas_id = ioas_id, ++ .__reserved = 0, ++ .user_va = (uintptr_t)vaddr, ++ .iova = iova, ++ .length = size, ++ }; ++ ++ if (!readonly) { ++ map.flags |= IOMMU_IOAS_MAP_WRITEABLE; ++ } ++ ++ ret = ioctl(fd, IOMMU_IOAS_MAP, &map); ++ trace_iommufd_backend_map_dma(fd, ioas_id, iova, size, ++ vaddr, readonly, ret); ++ if (ret) { ++ ret = -errno; ++ ++ /* TODO: Not support mapping hardware PCI BAR region for now. */ ++ if (errno == EFAULT) { ++ warn_report("IOMMU_IOAS_MAP failed: %m, PCI BAR?"); ++ } else { ++ error_report("IOMMU_IOAS_MAP failed: %m"); ++ } ++ } ++ return ret; ++} ++ ++int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, ++ hwaddr iova, ram_addr_t size) ++{ ++ int ret, fd = be->fd; ++ struct iommu_ioas_unmap unmap = { ++ .size = sizeof(unmap), ++ .ioas_id = ioas_id, ++ .iova = iova, ++ .length = size, ++ }; ++ ++ ret = ioctl(fd, IOMMU_IOAS_UNMAP, &unmap); ++ /* ++ * IOMMUFD takes mapping as some kind of object, unmapping ++ * nonexistent mapping is treated as deleting a nonexistent ++ * object and return ENOENT. This is different from legacy ++ * backend which allows it. vIOMMU may trigger a lot of ++ * redundant unmapping, to avoid flush the log, treat them ++ * as succeess for IOMMUFD just like legacy backend. 
++ */ ++ if (ret && errno == ENOENT) { ++ trace_iommufd_backend_unmap_dma_non_exist(fd, ioas_id, iova, size, ret); ++ ret = 0; ++ } else { ++ trace_iommufd_backend_unmap_dma(fd, ioas_id, iova, size, ret); ++ } ++ ++ if (ret) { ++ ret = -errno; ++ error_report("IOMMU_IOAS_UNMAP failed: %m"); ++ } ++ return ret; ++} ++ ++static const TypeInfo iommufd_backend_info = { ++ .name = TYPE_IOMMUFD_BACKEND, ++ .parent = TYPE_OBJECT, ++ .instance_size = sizeof(IOMMUFDBackend), ++ .instance_init = iommufd_backend_init, ++ .instance_finalize = iommufd_backend_finalize, ++ .class_size = sizeof(IOMMUFDBackendClass), ++ .class_init = iommufd_backend_class_init, ++ .interfaces = (InterfaceInfo[]) { ++ { TYPE_USER_CREATABLE }, ++ { } ++ } ++}; ++ ++static void register_types(void) ++{ ++ type_register_static(&iommufd_backend_info); ++} ++ ++type_init(register_types); +diff --git a/backends/meson.build b/backends/meson.build +index 914c7c4afb..9a5cea480d 100644 +--- a/backends/meson.build ++++ b/backends/meson.build +@@ -20,6 +20,7 @@ if have_vhost_user + system_ss.add(when: 'CONFIG_VIRTIO', if_true: files('vhost-user.c')) + endif + system_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('cryptodev-vhost.c')) ++system_ss.add(when: 'CONFIG_IOMMUFD', if_true: files('iommufd.c')) + if have_vhost_user_crypto + system_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('cryptodev-vhost-user.c')) + endif +diff --git a/backends/trace-events b/backends/trace-events +index 652eb76a57..d45c6e31a6 100644 +--- a/backends/trace-events ++++ b/backends/trace-events +@@ -5,3 +5,13 @@ dbus_vmstate_pre_save(void) + dbus_vmstate_post_load(int version_id) "version_id: %d" + dbus_vmstate_loading(const char *id) "id: %s" + dbus_vmstate_saving(const char *id) "id: %s" ++ ++# iommufd.c ++iommufd_backend_connect(int fd, bool owned, uint32_t users, int ret) "fd=%d owned=%d users=%d (%d)" ++iommufd_backend_disconnect(int fd, uint32_t users) "fd=%d users=%d" ++iommu_backend_set_fd(int fd) "pre-opened 
/dev/iommu fd=%d" ++iommufd_backend_map_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, void *vaddr, bool readonly, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" addr=%p readonly=%d (%d)" ++iommufd_backend_unmap_dma_non_exist(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " Unmap nonexistent mapping: iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)" ++iommufd_backend_unmap_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)" ++iommufd_backend_alloc_ioas(int iommufd, uint32_t ioas, int ret) " iommufd=%d ioas=%d (%d)" ++iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d id=%d (%d)" +diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h +new file mode 100644 +index 0000000000..9c5524b0ed +--- /dev/null ++++ b/include/sysemu/iommufd.h +@@ -0,0 +1,38 @@ ++#ifndef SYSEMU_IOMMUFD_H ++#define SYSEMU_IOMMUFD_H ++ ++#include "qom/object.h" ++#include "qemu/thread.h" ++#include "exec/hwaddr.h" ++#include "exec/cpu-common.h" ++ ++#define TYPE_IOMMUFD_BACKEND "iommufd" ++OBJECT_DECLARE_TYPE(IOMMUFDBackend, IOMMUFDBackendClass, IOMMUFD_BACKEND) ++ ++struct IOMMUFDBackendClass { ++ ObjectClass parent_class; ++}; ++ ++struct IOMMUFDBackend { ++ Object parent; ++ ++ /*< protected >*/ ++ int fd; /* /dev/iommu file descriptor */ ++ bool owned; /* is the /dev/iommu opened internally */ ++ QemuMutex lock; ++ uint32_t users; ++ ++ /*< public >*/ ++}; ++ ++int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp); ++void iommufd_backend_disconnect(IOMMUFDBackend *be); ++ ++int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id, ++ Error **errp); ++void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id); ++int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova, ++ ram_addr_t size, void *vaddr, bool readonly); ++int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t 
ioas_id, ++ hwaddr iova, ram_addr_t size); ++#endif +diff --git a/qapi/qom.json b/qapi/qom.json +index a74c7a91f9..a5336e6b11 100644 +--- a/qapi/qom.json ++++ b/qapi/qom.json +@@ -794,6 +794,23 @@ + { 'struct': 'VfioUserServerProperties', + 'data': { 'socket': 'SocketAddress', 'device': 'str' } } + ++## ++# @IOMMUFDProperties: ++# ++# Properties for iommufd objects. ++# ++# @fd: file descriptor name previously passed via 'getfd' command, ++# which represents a pre-opened /dev/iommu. This allows the ++# iommufd object to be shared accross several subsystems ++# (VFIO, VDPA, ...), and the file descriptor to be shared ++# with other process, e.g. DPDK. (default: QEMU opens ++# /dev/iommu by itself) ++# ++# Since: 9.0 ++## ++{ 'struct': 'IOMMUFDProperties', ++ 'data': { '*fd': 'str' } } ++ + ## + # @RngProperties: + # +@@ -969,6 +986,7 @@ + 'input-barrier', + { 'name': 'input-linux', + 'if': 'CONFIG_LINUX' }, ++ 'iommufd', + 'iothread', + 'main-loop', + { 'name': 'memory-backend-epc', +@@ -1039,6 +1057,7 @@ + 'input-barrier': 'InputBarrierProperties', + 'input-linux': { 'type': 'InputLinuxProperties', + 'if': 'CONFIG_LINUX' }, ++ 'iommufd': 'IOMMUFDProperties', + 'iothread': 'IothreadProperties', + 'main-loop': 'MainLoopProperties', + 'memory-backend-epc': { 'type': 'MemoryBackendEpcProperties', +diff --git a/qemu-options.hx b/qemu-options.hx +index 8516b73206..7fe76c4b1d 100644 +--- a/qemu-options.hx ++++ b/qemu-options.hx +@@ -5224,6 +5224,18 @@ SRST + + The ``share`` boolean option is on by default with memfd. + ++ ``-object iommufd,id=id[,fd=fd]`` ++ Creates an iommufd backend which allows control of DMA mapping ++ through the ``/dev/iommu`` device. ++ ++ The ``id`` parameter is a unique ID which frontends (such as ++ vfio-pci of vdpa) will use to connect with the iommufd backend. ++ ++ The ``fd`` parameter is an optional pre-opened file descriptor ++ resulting from ``/dev/iommu`` opening. 
Usually the iommufd is shared ++ across all subsystems, bringing the benefit of centralized ++ reference counting. ++ + ``-object rng-builtin,id=id`` + Creates a random number generator backend which obtains entropy + from QEMU builtin functions. The ``id`` parameter is a unique ID +-- +2.41.0.windows.1 + diff --git a/cryptodev-Fix-error-handling-in-cryptodev_lkcf_execu.patch b/cryptodev-Fix-error-handling-in-cryptodev_lkcf_execu.patch new file mode 100644 index 0000000000000000000000000000000000000000..a942a46d9aac6849b779db6b46d6b7ae7f8adf3f --- /dev/null +++ b/cryptodev-Fix-error-handling-in-cryptodev_lkcf_execu.patch @@ -0,0 +1,52 @@ +From ca3f4fd234ea4b8f02a415b99b449e71d028c076 Mon Sep 17 00:00:00 2001 +From: qihao_yewu +Date: Tue, 8 Apr 2025 07:27:47 -0400 +Subject: [PATCH] cryptodev: Fix error handling in + cryptodev_lkcf_execute_task() + +cherry-pick from 1c89dfefc4c33295126208225f202f39b5a234c3 + +When cryptodev_lkcf_set_op_desc() fails, we report an error, but +continue anyway. This is wrong. We then pass a non-null @local_error +to various functions, which could easily fail error_setv()'s assertion +on failure. + +Fail the function instead. + +When qcrypto_akcipher_new() fails, we fail the function without +reporting the error. This leaks the Error object. + +Add the missing error reporting. This also frees the Error object. 
+ +Signed-off-by: Markus Armbruster +Message-ID: <20250312101131.1615777-1-armbru@redhat.com> +Reviewed-by: zhenwei pi +Signed-off-by: qihao_yewu +--- + backends/cryptodev-lkcf.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/backends/cryptodev-lkcf.c b/backends/cryptodev-lkcf.c +index 45aba1ff67..45b287a953 100644 +--- a/backends/cryptodev-lkcf.c ++++ b/backends/cryptodev-lkcf.c +@@ -330,6 +330,8 @@ static void cryptodev_lkcf_execute_task(CryptoDevLKCFTask *task) + cryptodev_lkcf_set_op_desc(&session->akcipher_opts, op_desc, + sizeof(op_desc), &local_error) != 0) { + error_report_err(local_error); ++ status = -VIRTIO_CRYPTO_ERR; ++ goto out; + } else { + key_id = add_key(KCTL_KEY_TYPE_PKEY, "lkcf-backend-priv-key", + p8info, p8info_len, KCTL_KEY_RING); +@@ -346,6 +348,7 @@ static void cryptodev_lkcf_execute_task(CryptoDevLKCFTask *task) + session->key, session->keylen, + &local_error); + if (!akcipher) { ++ error_report_err(local_error); + status = -VIRTIO_CRYPTO_ERR; + goto out; + } +-- +2.41.0.windows.1 + diff --git a/docs-devel-Add-VFIO-iommufd-backend-documentation.patch b/docs-devel-Add-VFIO-iommufd-backend-documentation.patch new file mode 100644 index 0000000000000000000000000000000000000000..f15a1f8709b2e478e69212219d494196d9f2df66 --- /dev/null +++ b/docs-devel-Add-VFIO-iommufd-backend-documentation.patch @@ -0,0 +1,220 @@ +From fd1d6d64803a052adcab8c7993ca40cabc9c926d Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Sat, 11 Jan 2025 10:53:03 +0800 +Subject: [PATCH] docs/devel: Add VFIO iommufd backend documentation +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Suggested-by: Cédric Le Goater +Signed-off-by: Eric Auger +Signed-off-by: Yi Liu +Signed-off-by: Zhenzhong Duan +Tested-by: Nicolin Chen +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhou Wang +--- + MAINTAINERS | 1 + + docs/devel/index-internals.rst | 1 + + docs/devel/vfio-iommufd.rst | 166 +++++++++++++++++++++++++++++++++ + 3 
files changed, 168 insertions(+) + create mode 100644 docs/devel/vfio-iommufd.rst + +diff --git a/MAINTAINERS b/MAINTAINERS +index ca70bb4e64..0ddb20a35f 100644 +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -2176,6 +2176,7 @@ F: backends/iommufd.c + F: include/sysemu/iommufd.h + F: include/qemu/chardev_open.h + F: util/chardev_open.c ++F: docs/devel/vfio-iommufd.rst + + vhost + M: Michael S. Tsirkin +diff --git a/docs/devel/index-internals.rst b/docs/devel/index-internals.rst +index 6f81df92bc..3def4a138b 100644 +--- a/docs/devel/index-internals.rst ++++ b/docs/devel/index-internals.rst +@@ -18,5 +18,6 @@ Details about QEMU's various subsystems including how to add features to them. + s390-dasd-ipl + tracing + vfio-migration ++ vfio-iommufd + writing-monitor-commands + virtio-backends +diff --git a/docs/devel/vfio-iommufd.rst b/docs/devel/vfio-iommufd.rst +new file mode 100644 +index 0000000000..3d1c11f175 +--- /dev/null ++++ b/docs/devel/vfio-iommufd.rst +@@ -0,0 +1,166 @@ ++=============================== ++IOMMUFD BACKEND usage with VFIO ++=============================== ++ ++(Same meaning for backend/container/BE) ++ ++With the introduction of iommufd, the Linux kernel provides a generic ++interface for user space drivers to propagate their DMA mappings to kernel ++for assigned devices. While the legacy kernel interface is group-centric, ++the new iommufd interface is device-centric, relying on device fd and iommufd. ++ ++To support both interfaces in the QEMU VFIO device, introduce a base container ++to abstract the common part of VFIO legacy and iommufd container. So that the ++generic VFIO code can use either container. ++ ++The base container implements generic functions such as memory_listener and ++address space management whereas the derived container implements callbacks ++specific to either legacy or iommufd. Each container has its own way to setup ++secure context and dma management interface. 
The below diagram shows how it ++looks like with both containers. ++ ++:: ++ ++ VFIO AddressSpace/Memory ++ +-------+ +----------+ +-----+ +-----+ ++ | pci | | platform | | ap | | ccw | ++ +---+---+ +----+-----+ +--+--+ +--+--+ +----------------------+ ++ | | | | | AddressSpace | ++ | | | | +------------+---------+ ++ +---V-----------V-----------V--------V----+ / ++ | VFIOAddressSpace | <------------+ ++ | | | MemoryListener ++ | VFIOContainerBase list | ++ +-------+----------------------------+----+ ++ | | ++ | | ++ +-------V------+ +--------V----------+ ++ | iommufd | | vfio legacy | ++ | container | | container | ++ +-------+------+ +--------+----------+ ++ | | ++ | /dev/iommu | /dev/vfio/vfio ++ | /dev/vfio/devices/vfioX | /dev/vfio/$group_id ++ Userspace | | ++ ============+============================+=========================== ++ Kernel | device fd | ++ +---------------+ | group/container fd ++ | (BIND_IOMMUFD | | (SET_CONTAINER/SET_IOMMU) ++ | ATTACH_IOAS) | | device fd ++ | | | ++ | +-------V------------V-----------------+ ++ iommufd | | vfio | ++ (map/unmap | +---------+--------------------+-------+ ++ ioas_copy) | | | map/unmap ++ | | | ++ +------V------+ +-----V------+ +------V--------+ ++ | iommfd core | | device | | vfio iommu | ++ +-------------+ +------------+ +---------------+ ++ ++* Secure Context setup ++ ++ - iommufd BE: uses device fd and iommufd to setup secure context ++ (bind_iommufd, attach_ioas) ++ - vfio legacy BE: uses group fd and container fd to setup secure context ++ (set_container, set_iommu) ++ ++* Device access ++ ++ - iommufd BE: device fd is opened through ``/dev/vfio/devices/vfioX`` ++ - vfio legacy BE: device fd is retrieved from group fd ioctl ++ ++* DMA Mapping flow ++ ++ 1. VFIOAddressSpace receives MemoryRegion add/del via MemoryListener ++ 2. 
VFIO populates DMA map/unmap via the container BEs ++ * iommufd BE: uses iommufd ++ * vfio legacy BE: uses container fd ++ ++Example configuration ++===================== ++ ++Step 1: configure the host device ++--------------------------------- ++ ++It's exactly same as the VFIO device with legacy VFIO container. ++ ++Step 2: configure QEMU ++---------------------- ++ ++Interactions with the ``/dev/iommu`` are abstracted by a new iommufd ++object (compiled in with the ``CONFIG_IOMMUFD`` option). ++ ++Any QEMU device (e.g. VFIO device) wishing to use ``/dev/iommu`` must ++be linked with an iommufd object. It gets a new optional property ++named iommufd which allows to pass an iommufd object. Take ``vfio-pci`` ++device for example: ++ ++.. code-block:: bash ++ ++ -object iommufd,id=iommufd0 ++ -device vfio-pci,host=0000:02:00.0,iommufd=iommufd0 ++ ++Note the ``/dev/iommu`` and VFIO cdev can be externally opened by a ++management layer. In such a case the fd is passed, the fd supports a ++string naming the fd or a number, for example: ++ ++.. code-block:: bash ++ ++ -object iommufd,id=iommufd0,fd=22 ++ -device vfio-pci,iommufd=iommufd0,fd=23 ++ ++If the ``fd`` property is not passed, the fd is opened by QEMU. ++ ++If no ``iommufd`` object is passed to the ``vfio-pci`` device, iommufd ++is not used and the user gets the behavior based on the legacy VFIO ++container: ++ ++.. code-block:: bash ++ ++ -device vfio-pci,host=0000:02:00.0 ++ ++Supported platform ++================== ++ ++Supports x86, ARM and s390x currently. ++ ++Caveats ++======= ++ ++Dirty page sync ++--------------- ++ ++Dirty page sync with iommufd backend is unsupported yet, live migration is ++disabled by default. But it can be force enabled like below, low efficient ++though. ++ ++.. 
code-block:: bash ++ ++ -object iommufd,id=iommufd0 ++ -device vfio-pci,host=0000:02:00.0,iommufd=iommufd0,enable-migration=on ++ ++P2P DMA ++------- ++ ++PCI p2p DMA is unsupported as IOMMUFD doesn't support mapping hardware PCI ++BAR region yet. Below warning shows for assigned PCI device, it's not a bug. ++ ++.. code-block:: none ++ ++ qemu-system-x86_64: warning: IOMMU_IOAS_MAP failed: Bad address, PCI BAR? ++ qemu-system-x86_64: vfio_container_dma_map(0x560cb6cb1620, 0xe000000021000, 0x3000, 0x7f32ed55c000) = -14 (Bad address) ++ ++FD passing with mdev ++-------------------- ++ ++``vfio-pci`` device checks sysfsdev property to decide if backend is a mdev. ++If FD passing is used, there is no way to know that and the mdev is treated ++like a real PCI device. There is an error as below if user wants to enable ++RAM discarding for mdev. ++ ++.. code-block:: none ++ ++ qemu-system-x86_64: -device vfio-pci,iommufd=iommufd0,x-balloon-allowed=on,fd=9: vfio VFIO_FD9: x-balloon-allowed only potentially compatible with mdev devices ++ ++``vfio-ap`` and ``vfio-ccw`` devices don't have same issue as their backend ++devices are always mdev and RAM discarding is force enabled. +-- +2.41.0.windows.1 + diff --git a/gpex-acpi-Remove-duplicate-DSM-5.patch b/gpex-acpi-Remove-duplicate-DSM-5.patch new file mode 100644 index 0000000000000000000000000000000000000000..8f54bec1e390fb898578588696a2e4a1bf810965 --- /dev/null +++ b/gpex-acpi-Remove-duplicate-DSM-5.patch @@ -0,0 +1,57 @@ +From b1087bb8a4edbacc7240c0fcab63bc1cf2624627 Mon Sep 17 00:00:00 2001 +From: Shameer Kolothum +Date: Tue, 21 Jan 2025 14:42:45 +0000 +Subject: [PATCH] gpex-acpi: Remove duplicate DSM #5 + +It looks like acpi_dsdt_add_pci_osc() already builds the _DSM +for virt/gpex case, and we don't need to add duplicate DSM methods +for _DSM #5 case. + +And the acpi_dsdt_add_pci_osc() already adds _DSM #5 when +preserve_config is true. 
+ +This is to get rid of the ACPI related error messages during boot: + +ACPI BIOS Error (bug): Failure creating named object [\_SB.PC08._DSM], AE_ALREADY_EXISTS +ACPI BIOS Error (bug): \_SB.PC08.PCI0._DSM: Excess arguments - ASL declared 5, ACPI requires 4 + +ToDo: Only sanity tested. + +Signed-off-by: Shameer Kolothum +--- + hw/pci-host/gpex-acpi.c | 12 ------------ + 1 file changed, 12 deletions(-) + +diff --git a/hw/pci-host/gpex-acpi.c b/hw/pci-host/gpex-acpi.c +index ce424fc9da..162f6221ab 100644 +--- a/hw/pci-host/gpex-acpi.c ++++ b/hw/pci-host/gpex-acpi.c +@@ -189,12 +189,6 @@ void acpi_dsdt_add_gpex(Aml *scope, struct GPEXConfig *cfg) + aml_append(dev, aml_name_decl("_PXM", aml_int(numa_node))); + } + +- if (cfg->preserve_config) { +- method = aml_method("_DSM", 5, AML_SERIALIZED); +- aml_append(method, aml_return(aml_int(0))); +- aml_append(dev, method); +- } +- + acpi_dsdt_add_pci_route_table(dev, cfg->irq); + + /* +@@ -226,12 +220,6 @@ void acpi_dsdt_add_gpex(Aml *scope, struct GPEXConfig *cfg) + aml_append(dev, aml_name_decl("_STR", aml_unicode("PCIe 0 Device"))); + aml_append(dev, aml_name_decl("_CCA", aml_int(1))); + +- if (cfg->preserve_config) { +- method = aml_method("_DSM", 5, AML_SERIALIZED); +- aml_append(method, aml_return(aml_int(0))); +- aml_append(dev, method); +- } +- + acpi_dsdt_add_pci_route_table(dev, cfg->irq); + + method = aml_method("_CBA", 0, AML_NOTSERIALIZED); +-- +2.41.0.windows.1 + diff --git a/hw-arm-Activate-IOMMUFD-for-virt-machines.patch b/hw-arm-Activate-IOMMUFD-for-virt-machines.patch new file mode 100644 index 0000000000000000000000000000000000000000..b8c27675786fa59ffafde7505b45cfc9f62ad8bc --- /dev/null +++ b/hw-arm-Activate-IOMMUFD-for-virt-machines.patch @@ -0,0 +1,34 @@ +From bcb031b40fe40d5b6347b2134fb039945b87e8a3 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= +Date: Sat, 11 Jan 2025 10:52:55 +0800 +Subject: [PATCH] hw/arm: Activate IOMMUFD for virt machines +MIME-Version: 1.0 +Content-Type: 
text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhenzhong Duan +Reviewed-by: Eric Auger +Tested-by: Eric Auger +Tested-by: Nicolin Chen +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhou Wang +--- + hw/arm/Kconfig | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/hw/arm/Kconfig b/hw/arm/Kconfig +index c0a7d0bd58..4a0ea0628f 100644 +--- a/hw/arm/Kconfig ++++ b/hw/arm/Kconfig +@@ -8,6 +8,7 @@ config ARM_VIRT + imply TPM_TIS_SYSBUS + imply TPM_TIS_I2C + imply NVDIMM ++ imply IOMMUFD + select ARM_GIC + select ACPI + select ARM_SMMUV3 +-- +2.41.0.windows.1 + diff --git a/hw-arm-smmu-common-Add-a-nested-flag-to-SMMUState.patch b/hw-arm-smmu-common-Add-a-nested-flag-to-SMMUState.patch new file mode 100644 index 0000000000000000000000000000000000000000..d917e2649a6bbc8872b69e475e2450591edb153d --- /dev/null +++ b/hw-arm-smmu-common-Add-a-nested-flag-to-SMMUState.patch @@ -0,0 +1,67 @@ +From d589010512005bfc698f30417911e4b14478c81b Mon Sep 17 00:00:00 2001 +From: Nicolin Chen +Date: Wed, 22 Jun 2022 01:30:39 -0700 +Subject: [PATCH] hw/arm/smmu-common: Add a nested flag to SMMUState + +Add a nested flag in the SMMUState, passed in from device property. 
+ +Signed-off-by: Nicolin Chen +--- + hw/arm/smmu-common.c | 1 + + hw/arm/smmuv3.c | 5 +++++ + include/hw/arm/smmu-common.h | 4 ++++ + 3 files changed, 10 insertions(+) + +diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c +index 9a8ac45431..c5f3e02065 100644 +--- a/hw/arm/smmu-common.c ++++ b/hw/arm/smmu-common.c +@@ -683,6 +683,7 @@ static Property smmu_dev_properties[] = { + DEFINE_PROP_UINT8("bus_num", SMMUState, bus_num, 0), + DEFINE_PROP_LINK("primary-bus", SMMUState, primary_bus, + TYPE_PCI_BUS, PCIBus *), ++ DEFINE_PROP_BOOL("nested", SMMUState, nested, false), + DEFINE_PROP_END_OF_LIST(), + }; + +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index c3871ae067..64ca4c5542 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -1746,6 +1746,11 @@ static void smmu_realize(DeviceState *d, Error **errp) + SysBusDevice *dev = SYS_BUS_DEVICE(d); + Error *local_err = NULL; + ++ if (s->stage && strcmp("1", s->stage)) { ++ /* Only support nested with an stage1 only vSMMU */ ++ sys->nested = false; ++ } ++ + c->parent_realize(d, &local_err); + if (local_err) { + error_propagate(errp, local_err); +diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h +index fd8d772da1..eae5d4d05b 100644 +--- a/include/hw/arm/smmu-common.h ++++ b/include/hw/arm/smmu-common.h +@@ -22,6 +22,7 @@ + #include "hw/sysbus.h" + #include "hw/pci/pci.h" + #include "qom/object.h" ++#include "sysemu/iommufd.h" + + #define SMMU_PCI_BUS_MAX 256 + #define SMMU_PCI_DEVFN_MAX 256 +@@ -136,6 +137,9 @@ struct SMMUState { + const char *mrtypename; + MemoryRegion iomem; + ++ /* Nested SMMU */ ++ bool nested; ++ + GHashTable *smmu_pcibus_by_busptr; + GHashTable *configs; /* cache for configuration data */ + GHashTable *iotlb; +-- +2.41.0.windows.1 + diff --git a/hw-arm-smmu-common-Add-iommufd-helpers.patch b/hw-arm-smmu-common-Add-iommufd-helpers.patch new file mode 100644 index 0000000000000000000000000000000000000000..a95b394b7e9ef7e33fd10502eacf988e8d79bc6e --- /dev/null +++ 
b/hw-arm-smmu-common-Add-iommufd-helpers.patch @@ -0,0 +1,179 @@ +From a2735cd15160a62065a0a0b39af405c7b0f3fae8 Mon Sep 17 00:00:00 2001 +From: Nicolin Chen +Date: Wed, 22 Jun 2022 14:41:27 -0700 +Subject: [PATCH] hw/arm/smmu-common: Add iommufd helpers + +Add a set of helper functions for IOMMUFD and new "struct SMMUS1Hwpt" +to store the nested hwpt information. + +Signed-off-by: Nicolin Chen +--- + hw/arm/smmu-common.c | 108 +++++++++++++++++++++++++++++++++++ + include/hw/arm/smmu-common.h | 20 +++++++ + 2 files changed, 128 insertions(+) + +diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c +index 038ae857d8..a79eb34277 100644 +--- a/hw/arm/smmu-common.c ++++ b/hw/arm/smmu-common.c +@@ -838,6 +838,114 @@ IOMMUMemoryRegion *smmu_iommu_mr(SMMUState *s, uint32_t sid) + return NULL; + } + ++/* IOMMUFD helpers */ ++int smmu_dev_get_info(SMMUDevice *sdev, uint32_t *data_type, ++ uint32_t data_len, void *data) ++{ ++ uint64_t caps; ++ ++ if (!sdev || !sdev->idev) { ++ return -ENOENT; ++ } ++ ++ return !iommufd_backend_get_device_info(sdev->idev->iommufd, ++ sdev->idev->devid, data_type, data, ++ data_len, &caps, NULL); ++} ++ ++void smmu_dev_uninstall_nested_ste(SMMUDevice *sdev, bool abort) ++{ ++ HostIOMMUDeviceIOMMUFD *idev = sdev->idev; ++ SMMUS1Hwpt *s1_hwpt = sdev->s1_hwpt; ++ uint32_t hwpt_id; ++ ++ if (!s1_hwpt || !sdev->viommu) { ++ return; ++ } ++ ++ if (abort) { ++ hwpt_id = sdev->viommu->abort_hwpt_id; ++ } else { ++ hwpt_id = sdev->viommu->bypass_hwpt_id; ++ } ++ ++ if (!host_iommu_device_iommufd_attach_hwpt(idev, hwpt_id, NULL)) { ++ return; ++ } ++ ++ iommufd_backend_free_id(idev->iommufd, s1_hwpt->hwpt_id); ++ sdev->s1_hwpt = NULL; ++ g_free(s1_hwpt); ++} ++ ++int smmu_dev_install_nested_ste(SMMUDevice *sdev, uint32_t data_type, ++ uint32_t data_len, void *data) ++{ ++ SMMUViommu *viommu = sdev->viommu; ++ SMMUS1Hwpt *s1_hwpt = sdev->s1_hwpt; ++ HostIOMMUDeviceIOMMUFD *idev = sdev->idev; ++ ++ if (!idev || !viommu) { ++ return -ENOENT; ++ } ++ ++ 
if (s1_hwpt) { ++ smmu_dev_uninstall_nested_ste(sdev, false); ++ } ++ ++ s1_hwpt = g_new0(SMMUS1Hwpt, 1); ++ if (!s1_hwpt) { ++ return -ENOMEM; ++ } ++ ++ s1_hwpt->smmu = sdev->smmu; ++ s1_hwpt->viommu = viommu; ++ s1_hwpt->iommufd = idev->iommufd; ++ ++ if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid, ++ viommu->core->viommu_id, 0, data_type, ++ data_len, data, &s1_hwpt->hwpt_id, NULL)) { ++ goto free; ++ } ++ ++ if (!host_iommu_device_iommufd_attach_hwpt(idev, s1_hwpt->hwpt_id, NULL)) { ++ goto free_hwpt; ++ } ++ ++ sdev->s1_hwpt = s1_hwpt; ++ ++ return 0; ++free_hwpt: ++ iommufd_backend_free_id(idev->iommufd, s1_hwpt->hwpt_id); ++free: ++ sdev->s1_hwpt = NULL; ++ g_free(s1_hwpt); ++ ++ return -EINVAL; ++} ++ ++int smmu_hwpt_invalidate_cache(SMMUS1Hwpt *s1_hwpt, uint32_t type, uint32_t len, ++ uint32_t *num, void *reqs) ++{ ++ if (!s1_hwpt) { ++ return -ENOENT; ++ } ++ ++ return iommufd_backend_invalidate_cache(s1_hwpt->iommufd, s1_hwpt->hwpt_id, ++ type, len, num, reqs); ++} ++ ++int smmu_viommu_invalidate_cache(IOMMUFDViommu *viommu, uint32_t type, ++ uint32_t len, uint32_t *num, void *reqs) ++{ ++ if (!viommu) { ++ return -ENOENT; ++ } ++ ++ return iommufd_viommu_invalidate_cache(viommu->iommufd, viommu->viommu_id, ++ type, len, num, reqs); ++} ++ + /* Unmap all notifiers attached to @mr */ + static void smmu_inv_notifiers_mr(IOMMUMemoryRegion *mr) + { +diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h +index 3bfb68cef6..66dc7206ea 100644 +--- a/include/hw/arm/smmu-common.h ++++ b/include/hw/arm/smmu-common.h +@@ -125,6 +125,15 @@ typedef struct SMMUViommu { + QLIST_ENTRY(SMMUViommu) next; + } SMMUViommu; + ++typedef struct SMMUS1Hwpt { ++ void *smmu; ++ IOMMUFDBackend *iommufd; ++ SMMUViommu *viommu; ++ uint32_t hwpt_id; ++ QLIST_HEAD(, SMMUDevice) device_list; ++ QLIST_ENTRY(SMMUViommu) next; ++} SMMUS1Hwpt; ++ + typedef struct SMMUDevice { + void *smmu; + PCIBus *bus; +@@ -132,6 +141,7 @@ typedef struct SMMUDevice { + 
IOMMUMemoryRegion iommu; + HostIOMMUDeviceIOMMUFD *idev; + SMMUViommu *viommu; ++ SMMUS1Hwpt *s1_hwpt; + AddressSpace as; + uint32_t cfg_cache_hits; + uint32_t cfg_cache_misses; +@@ -225,4 +235,14 @@ void smmu_iotlb_inv_iova(SMMUState *s, int asid, int vmid, dma_addr_t iova, + /* Unmap the range of all the notifiers registered to any IOMMU mr */ + void smmu_inv_notifiers_all(SMMUState *s); + ++/* IOMMUFD helpers */ ++int smmu_dev_get_info(SMMUDevice *sdev, uint32_t *data_type, ++ uint32_t data_len, void *data); ++void smmu_dev_uninstall_nested_ste(SMMUDevice *sdev, bool abort); ++int smmu_dev_install_nested_ste(SMMUDevice *sdev, uint32_t data_type, ++ uint32_t data_len, void *data); ++int smmu_hwpt_invalidate_cache(SMMUS1Hwpt *s1_hwpt, uint32_t type, uint32_t len, ++ uint32_t *num, void *reqs); ++int smmu_viommu_invalidate_cache(IOMMUFDViommu *viommu, uint32_t type, ++ uint32_t len, uint32_t *num, void *reqs); + #endif /* HW_ARM_SMMU_COMMON_H */ +-- +2.41.0.windows.1 + diff --git a/hw-arm-smmu-common-Add-set-unset_iommu_device-callba.patch b/hw-arm-smmu-common-Add-set-unset_iommu_device-callba.patch new file mode 100644 index 0000000000000000000000000000000000000000..271b408451120c9c11c5322eb162c3c7433b6777 --- /dev/null +++ b/hw-arm-smmu-common-Add-set-unset_iommu_device-callba.patch @@ -0,0 +1,283 @@ +From 539e12641dc2db30a6fea7a0f061e163bc245d79 Mon Sep 17 00:00:00 2001 +From: Nicolin Chen +Date: Wed, 22 Jun 2022 02:16:52 -0700 +Subject: [PATCH] hw/arm/smmu-common: Add set/unset_iommu_device callback + +Implement a set_iommu_device callback: + - Find an existing S2 hwpt to test attach() or allocate a new one + (Devices behind the same physical SMMU should share an S2 HWPT.) + - Attach the device to the S2 hwpt and add it to its device list + +And add an unset_iommu_device doing the opposite cleanup routine. 
+ +Signed-off-by: Nicolin Chen +--- + hw/arm/smmu-common.c | 177 +++++++++++++++++++++++++++++++++++ + hw/arm/trace-events | 2 + + include/hw/arm/smmu-common.h | 21 +++++ + 3 files changed, 200 insertions(+) + +diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c +index 03d9ff58d4..038ae857d8 100644 +--- a/hw/arm/smmu-common.c ++++ b/hw/arm/smmu-common.c +@@ -20,6 +20,7 @@ + #include "trace.h" + #include "exec/target_page.h" + #include "hw/core/cpu.h" ++#include "hw/pci/pci_device.h" + #include "hw/qdev-properties.h" + #include "qapi/error.h" + #include "qemu/jhash.h" +@@ -639,8 +640,184 @@ static AddressSpace *smmu_find_add_as(PCIBus *bus, void *opaque, int devfn) + return &sdev->as; + } + ++static bool smmu_dev_attach_viommu(SMMUDevice *sdev, ++ HostIOMMUDeviceIOMMUFD *idev, Error **errp) ++{ ++ struct iommu_hwpt_arm_smmuv3 bypass_data = { ++ .ste = { 0x9ULL, 0x0ULL }, //0x1ULL << (108 - 64) }, ++ }; ++ struct iommu_hwpt_arm_smmuv3 abort_data = { ++ .ste = { 0x1ULL, 0x0ULL }, ++ }; ++ SMMUState *s = sdev->smmu; ++ SMMUS2Hwpt *s2_hwpt; ++ SMMUViommu *viommu; ++ uint32_t s2_hwpt_id; ++ ++ if (s->viommu) { ++ return host_iommu_device_iommufd_attach_hwpt( ++ idev, s->viommu->s2_hwpt->hwpt_id, errp); ++ } ++ ++ if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid, idev->ioas_id, ++ IOMMU_HWPT_ALLOC_NEST_PARENT, ++ IOMMU_HWPT_DATA_NONE, 0, NULL, ++ &s2_hwpt_id, errp)) { ++ error_setg(errp, "failed to allocate an S2 hwpt"); ++ return false; ++ } ++ ++ /* Attach to S2 for MSI cookie */ ++ if (!host_iommu_device_iommufd_attach_hwpt(idev, s2_hwpt_id, errp)) { ++ error_setg(errp, "failed to attach stage-2 HW pagetable"); ++ goto free_s2_hwpt; ++ } ++ ++ viommu = g_new0(SMMUViommu, 1); ++ ++ viommu->core = iommufd_backend_alloc_viommu(idev->iommufd, idev->devid, ++ IOMMU_VIOMMU_TYPE_ARM_SMMUV3, ++ s2_hwpt_id); ++ if (!viommu->core) { ++ error_setg(errp, "failed to allocate a viommu"); ++ goto free_viommu; ++ } ++ ++ if (!iommufd_backend_alloc_hwpt(idev->iommufd, 
idev->devid, ++ viommu->core->viommu_id, 0, ++ IOMMU_HWPT_DATA_ARM_SMMUV3, ++ sizeof(abort_data), &abort_data, ++ &viommu->abort_hwpt_id, errp)) { ++ error_setg(errp, "failed to allocate an abort pagetable"); ++ goto free_viommu_core; ++ } ++ ++ if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid, ++ viommu->core->viommu_id, 0, ++ IOMMU_HWPT_DATA_ARM_SMMUV3, ++ sizeof(bypass_data), &bypass_data, ++ &viommu->bypass_hwpt_id, errp)) { ++ error_setg(errp, "failed to allocate a bypass pagetable"); ++ goto free_abort_hwpt; ++ } ++ ++ if (!host_iommu_device_iommufd_attach_hwpt( ++ idev, viommu->bypass_hwpt_id, errp)) { ++ error_setg(errp, "failed to attach the bypass pagetable"); ++ goto free_bypass_hwpt; ++ } ++ ++ s2_hwpt = g_new0(SMMUS2Hwpt, 1); ++ s2_hwpt->iommufd = idev->iommufd; ++ s2_hwpt->hwpt_id = s2_hwpt_id; ++ s2_hwpt->ioas_id = idev->ioas_id; ++ ++ viommu->iommufd = idev->iommufd; ++ viommu->s2_hwpt = s2_hwpt; ++ ++ s->viommu = viommu; ++ return true; ++ ++free_bypass_hwpt: ++ iommufd_backend_free_id(idev->iommufd, viommu->bypass_hwpt_id); ++free_abort_hwpt: ++ iommufd_backend_free_id(idev->iommufd, viommu->abort_hwpt_id); ++free_viommu_core: ++ iommufd_backend_free_id(idev->iommufd, viommu->core->viommu_id); ++ g_free(viommu->core); ++free_viommu: ++ g_free(viommu); ++ host_iommu_device_iommufd_attach_hwpt(idev, sdev->idev->ioas_id, errp); ++free_s2_hwpt: ++ iommufd_backend_free_id(idev->iommufd, s2_hwpt_id); ++ return false; ++} ++ ++static bool smmu_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn, ++ HostIOMMUDevice *hiod, Error **errp) ++{ ++ HostIOMMUDeviceIOMMUFD *idev = HOST_IOMMU_DEVICE_IOMMUFD(hiod); ++ SMMUState *s = opaque; ++ SMMUPciBus *sbus = smmu_get_sbus(s, bus); ++ SMMUDevice *sdev = smmu_get_sdev(s, sbus, bus, devfn); ++ ++ if (!s->nested) { ++ return true; ++ } ++ ++ if (sdev->idev) { ++ if (sdev->idev != idev) { ++ return false;//-EEXIST; ++ } else { ++ return true; ++ } ++ } ++ ++ if (!idev) { ++ return true; ++ } ++ ++ if 
(!smmu_dev_attach_viommu(sdev, idev, errp)) { ++ error_report("Unable to attach viommu"); ++ return false; ++ } ++ ++ sdev->idev = idev; ++ sdev->viommu = s->viommu; ++ QLIST_INSERT_HEAD(&s->viommu->device_list, sdev, next); ++ trace_smmu_set_iommu_device(devfn, smmu_get_sid(sdev)); ++ ++ return true; ++} ++ ++static void smmu_dev_unset_iommu_device(PCIBus *bus, void *opaque, int devfn) ++{ ++ SMMUDevice *sdev; ++ SMMUViommu *viommu; ++ SMMUState *s = opaque; ++ SMMUPciBus *sbus = g_hash_table_lookup(s->smmu_pcibus_by_busptr, bus); ++ ++ if (!s->nested) { ++ return; ++ } ++ ++ if (!sbus) { ++ return; ++ } ++ ++ sdev = sbus->pbdev[devfn]; ++ if (!sdev) { ++ return; ++ } ++ ++ if (!host_iommu_device_iommufd_attach_hwpt(sdev->idev, ++ sdev->idev->ioas_id, NULL)) { ++ error_report("Unable to attach dev to the default HW pagetable"); ++ } ++ ++ viommu = sdev->viommu; ++ ++ sdev->idev = NULL; ++ sdev->viommu = NULL; ++ QLIST_REMOVE(sdev, next); ++ trace_smmu_unset_iommu_device(devfn, smmu_get_sid(sdev)); ++ ++ if (QLIST_EMPTY(&viommu->device_list)) { ++ iommufd_backend_free_id(viommu->iommufd, viommu->bypass_hwpt_id); ++ iommufd_backend_free_id(viommu->iommufd, viommu->abort_hwpt_id); ++ iommufd_backend_free_id(viommu->iommufd, viommu->core->viommu_id); ++ g_free(viommu->core); ++ iommufd_backend_free_id(viommu->iommufd, viommu->s2_hwpt->hwpt_id); ++ g_free(viommu->s2_hwpt); ++ g_free(viommu); ++ s->viommu = NULL; ++ } ++} ++ + static const PCIIOMMUOps smmu_ops = { + .get_address_space = smmu_find_add_as, ++ .set_iommu_device = smmu_dev_set_iommu_device, ++ .unset_iommu_device = smmu_dev_unset_iommu_device, + }; + + IOMMUMemoryRegion *smmu_iommu_mr(SMMUState *s, uint32_t sid) +diff --git a/hw/arm/trace-events b/hw/arm/trace-events +index cdc1ea06a8..58e0636e95 100644 +--- a/hw/arm/trace-events ++++ b/hw/arm/trace-events +@@ -5,6 +5,8 @@ virt_acpi_setup(void) "No fw cfg or ACPI disabled. Bailing out." 
+ + # smmu-common.c + smmu_add_mr(const char *name) "%s" ++smmu_set_iommu_device(int devfn, uint32_t sid) "devfn=%d (sid=%d)" ++smmu_unset_iommu_device(int devfn, uint32_t sid) "devfn=%d (sid=%d)" + smmu_ptw_level(int stage, int level, uint64_t iova, size_t subpage_size, uint64_t baseaddr, uint32_t offset, uint64_t pte) "stage=%d level=%d iova=0x%"PRIx64" subpage_sz=0x%zx baseaddr=0x%"PRIx64" offset=%d => pte=0x%"PRIx64 + smmu_ptw_invalid_pte(int stage, int level, uint64_t baseaddr, uint64_t pteaddr, uint32_t offset, uint64_t pte) "stage=%d level=%d base@=0x%"PRIx64" pte@=0x%"PRIx64" offset=%d pte=0x%"PRIx64 + smmu_ptw_page_pte(int stage, int level, uint64_t iova, uint64_t baseaddr, uint64_t pteaddr, uint64_t pte, uint64_t address) "stage=%d level=%d iova=0x%"PRIx64" base@=0x%"PRIx64" pte@=0x%"PRIx64" pte=0x%"PRIx64" page address = 0x%"PRIx64 +diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h +index eae5d4d05b..3bfb68cef6 100644 +--- a/include/hw/arm/smmu-common.h ++++ b/include/hw/arm/smmu-common.h +@@ -23,6 +23,7 @@ + #include "hw/pci/pci.h" + #include "qom/object.h" + #include "sysemu/iommufd.h" ++#include + + #define SMMU_PCI_BUS_MAX 256 + #define SMMU_PCI_DEVFN_MAX 256 +@@ -107,11 +108,30 @@ typedef struct SMMUTransCfg { + struct SMMUS2Cfg s2cfg; + } SMMUTransCfg; + ++typedef struct SMMUS2Hwpt { ++ IOMMUFDBackend *iommufd; ++ uint32_t hwpt_id; ++ uint32_t ioas_id; ++} SMMUS2Hwpt; ++ ++typedef struct SMMUViommu { ++ void *smmu; ++ IOMMUFDBackend *iommufd; ++ IOMMUFDViommu *core; ++ SMMUS2Hwpt *s2_hwpt; ++ uint32_t bypass_hwpt_id; ++ uint32_t abort_hwpt_id; ++ QLIST_HEAD(, SMMUDevice) device_list; ++ QLIST_ENTRY(SMMUViommu) next; ++} SMMUViommu; ++ + typedef struct SMMUDevice { + void *smmu; + PCIBus *bus; + int devfn; + IOMMUMemoryRegion iommu; ++ HostIOMMUDeviceIOMMUFD *idev; ++ SMMUViommu *viommu; + AddressSpace as; + uint32_t cfg_cache_hits; + uint32_t cfg_cache_misses; +@@ -139,6 +159,7 @@ struct SMMUState { + + /* Nested SMMU */ + 
bool nested; ++ SMMUViommu *viommu; + + GHashTable *smmu_pcibus_by_busptr; + GHashTable *configs; /* cache for configuration data */ +-- +2.41.0.windows.1 + diff --git a/hw-arm-smmu-common-Bypass-emulated-IOTLB-for-a-neste.patch b/hw-arm-smmu-common-Bypass-emulated-IOTLB-for-a-neste.patch new file mode 100644 index 0000000000000000000000000000000000000000..8998bcb06848c163c8c6939e8889f0b241cfd882 --- /dev/null +++ b/hw-arm-smmu-common-Bypass-emulated-IOTLB-for-a-neste.patch @@ -0,0 +1,75 @@ +From 6c330f39cc08e4c641a3567e2b6ad0ebcadf5165 Mon Sep 17 00:00:00 2001 +From: Nicolin Chen +Date: Fri, 21 Jun 2024 21:22:04 +0000 +Subject: [PATCH] hw/arm/smmu-common: Bypass emulated IOTLB for a nested SMMU + +If a vSMMU is configured as a nested one, HW IOTLB will be used and all +cache invalidation should be done to the HW IOTLB too, v.s. the emulated +iotlb. In this case, an iommu notifier isn't registered, as the devices +behind a nested SMMU would stay in the system address space for stage-2 +mappings. + +However, the KVM code still requests an iommu address space to translate +an MSI doorbell gIOVA via get_msi_address_space() and translate(). + +Since a nested SMMU doesn't register an iommu notifier to flush emulated +iotlb, bypass the emulated IOTLB and always walk through the guest-level +IO page table. + +Note that regular nested SMMU could still register an iommu notifier for +IOTLB invalidation, since QEMU traps the invalidation commands. But this +would result in invalidation inefficiency since each invalidation would +be doubled for both HW IOTLB and the emulated IOTLB. Also, with NVIDIA's +CMDQV feature on its Grace SoC, invalidation commands are issued to the +CMDQ HW directly, without any trapping. So, there is no way to maintain +the emulated IOTLB. Meanwhile, the stage-1 translation request from KVM +is only activated in case of an MSI table update, which does not happen +that often to impact performance if walking through the guest RAM every +time.
+ +Signed-off-by: Nicolin Chen +--- + hw/arm/smmu-common.c | 20 ++++++++++++++++++++ + 1 file changed, 20 insertions(+) + +diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c +index c5f3e02065..016418a48c 100644 +--- a/hw/arm/smmu-common.c ++++ b/hw/arm/smmu-common.c +@@ -75,6 +75,16 @@ SMMUTLBEntry *smmu_iotlb_lookup(SMMUState *bs, SMMUTransCfg *cfg, + uint8_t level = 4 - (inputsize - 4) / stride; + SMMUTLBEntry *entry = NULL; + ++ /* ++ * Stage-1 translation with a nested SMMU in general uses HW IOTLB. However, ++ * KVM still requests for an iommu address space for an MSI fixup by looking ++ * up stage-1 page table. Make sure we don't go through the emulated pathway ++ * so that the emulated iotlb will not need any invalidation. ++ */ ++ if (bs->nested) { ++ return NULL; ++ } ++ + while (level <= 3) { + uint64_t subpage_size = 1ULL << level_shift(level, tt->granule_sz); + uint64_t mask = subpage_size - 1; +@@ -110,6 +120,16 @@ void smmu_iotlb_insert(SMMUState *bs, SMMUTransCfg *cfg, SMMUTLBEntry *new) + SMMUIOTLBKey *key = g_new0(SMMUIOTLBKey, 1); + uint8_t tg = (new->granule - 10) / 2; + ++ /* ++ * Stage-1 translation with a nested SMMU in general uses HW IOTLB. However, ++ * KVM still requests for an iommu address space for an MSI fixup by looking ++ * up stage-1 page table. Make sure we don't go through the emulated pathway ++ * so that the emulated iotlb will not need any invalidation. 
++ */ ++ if (bs->nested) { ++ return; ++ } ++ + if (g_hash_table_size(bs->iotlb) >= SMMU_IOTLB_MAX_SIZE) { + smmu_iotlb_inv_all(bs); + } +-- +2.41.0.windows.1 + diff --git a/hw-arm-smmu-common-Extract-smmu_get_sbus-and-smmu_ge.patch b/hw-arm-smmu-common-Extract-smmu_get_sbus-and-smmu_ge.patch new file mode 100644 index 0000000000000000000000000000000000000000..16fe217d2a96fef8be9a0c47c36be504b9247640 --- /dev/null +++ b/hw-arm-smmu-common-Extract-smmu_get_sbus-and-smmu_ge.patch @@ -0,0 +1,68 @@ +From 2fea4f93632679afcb15f0c35b3d9abeede37778 Mon Sep 17 00:00:00 2001 +From: Nicolin Chen +Date: Wed, 10 Apr 2024 16:37:25 +0000 +Subject: [PATCH] hw/arm/smmu-common: Extract smmu_get_sbus and smmu_get_sdev + helpers + +Add two helpers to get sbus and sdev respectively. These will be used +by the following patch adding set/unset_iommu_device ops. + +Signed-off-by: Nicolin Chen +--- + hw/arm/smmu-common.c | 24 +++++++++++++++++++----- + 1 file changed, 19 insertions(+), 5 deletions(-) + +diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c +index 016418a48c..03d9ff58d4 100644 +--- a/hw/arm/smmu-common.c ++++ b/hw/arm/smmu-common.c +@@ -589,12 +589,9 @@ SMMUPciBus *smmu_find_smmu_pcibus(SMMUState *s, uint8_t bus_num) + return NULL; + } + +-static AddressSpace *smmu_find_add_as(PCIBus *bus, void *opaque, int devfn) ++static SMMUPciBus *smmu_get_sbus(SMMUState *s, PCIBus *bus) + { +- SMMUState *s = opaque; + SMMUPciBus *sbus = g_hash_table_lookup(s->smmu_pcibus_by_busptr, bus); +- SMMUDevice *sdev; +- static unsigned int index; + + if (!sbus) { + sbus = g_malloc0(sizeof(SMMUPciBus) + +@@ -603,7 +600,15 @@ static AddressSpace *smmu_find_add_as(PCIBus *bus, void *opaque, int devfn) + g_hash_table_insert(s->smmu_pcibus_by_busptr, bus, sbus); + } + +- sdev = sbus->pbdev[devfn]; ++ return sbus; ++} ++ ++static SMMUDevice *smmu_get_sdev(SMMUState *s, SMMUPciBus *sbus, ++ PCIBus *bus, int devfn) ++{ ++ SMMUDevice *sdev = sbus->pbdev[devfn]; ++ static unsigned int index; ++ + if 
(!sdev) { + char *name = g_strdup_printf("%s-%d-%d", s->mrtypename, devfn, index++); + +@@ -622,6 +627,15 @@ static AddressSpace *smmu_find_add_as(PCIBus *bus, void *opaque, int devfn) + g_free(name); + } + ++ return sdev; ++} ++ ++static AddressSpace *smmu_find_add_as(PCIBus *bus, void *opaque, int devfn) ++{ ++ SMMUState *s = opaque; ++ SMMUPciBus *sbus = smmu_get_sbus(s, bus); ++ SMMUDevice *sdev = smmu_get_sdev(s, sbus, bus, devfn); ++ + return &sdev->as; + } + +-- +2.41.0.windows.1 + diff --git a/hw-arm-smmu-common-Replace-smmu_iommu_mr-with-smmu_f.patch b/hw-arm-smmu-common-Replace-smmu_iommu_mr-with-smmu_f.patch new file mode 100644 index 0000000000000000000000000000000000000000..9cc887dc954e5e8a097cea442b0861bfde90e445 --- /dev/null +++ b/hw-arm-smmu-common-Replace-smmu_iommu_mr-with-smmu_f.patch @@ -0,0 +1,114 @@ +From d8d7f775b602a84c37b8aced11e00cb5b0521c4e Mon Sep 17 00:00:00 2001 +From: Nicolin Chen +Date: Tue, 18 Jun 2024 17:22:18 -0700 +Subject: [PATCH] hw/arm/smmu-common: Replace smmu_iommu_mr with smmu_find_sdev + +The caller of smmu_iommu_mr wants to get sdev for smmuv3_flush_config(). + +Do it directly instead of bridging with an iommu mr pointer. 
+ +Signed-off-by: Nicolin Chen +Message-id: 20240619002218.926674-1-nicolinc@nvidia.com +Reviewed-by: Peter Maydell +Signed-off-by: Peter Maydell +--- + hw/arm/smmu-common.c | 8 ++------ + hw/arm/smmuv3.c | 12 ++++-------- + include/hw/arm/smmu-common.h | 4 ++-- + 3 files changed, 8 insertions(+), 16 deletions(-) + +diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c +index 9e9af8f5c7..d0bc620606 100644 +--- a/hw/arm/smmu-common.c ++++ b/hw/arm/smmu-common.c +@@ -837,20 +837,16 @@ static const PCIIOMMUOps smmu_ops = { + .unset_iommu_device = smmu_dev_unset_iommu_device, + }; + +-IOMMUMemoryRegion *smmu_iommu_mr(SMMUState *s, uint32_t sid) ++SMMUDevice *smmu_find_sdev(SMMUState *s, uint32_t sid) + { + uint8_t bus_n, devfn; + SMMUPciBus *smmu_bus; +- SMMUDevice *smmu; + + bus_n = PCI_BUS_NUM(sid); + smmu_bus = smmu_find_smmu_pcibus(s, bus_n); + if (smmu_bus) { + devfn = SMMU_PCI_DEVFN(sid); +- smmu = smmu_bus->pbdev[devfn]; +- if (smmu) { +- return &smmu->iommu; +- } ++ return smmu_bus->pbdev[devfn]; + } + return NULL; + } +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index 9d44bb19bc..b2ffe2d40b 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -1407,20 +1407,18 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) + case SMMU_CMD_CFGI_STE: + { + uint32_t sid = CMD_SID(&cmd); +- IOMMUMemoryRegion *mr = smmu_iommu_mr(bs, sid); +- SMMUDevice *sdev; ++ SMMUDevice *sdev = smmu_find_sdev(bs, sid); + + if (CMD_SSEC(&cmd)) { + cmd_error = SMMU_CERROR_ILL; + break; + } + +- if (!mr) { ++ if (!sdev) { + break; + } + + trace_smmuv3_cmdq_cfgi_ste(sid); +- sdev = container_of(mr, SMMUDevice, iommu); + smmuv3_flush_config(sdev); + smmuv3_install_nested_ste(sdev, sid); + +@@ -1452,20 +1450,18 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) + case SMMU_CMD_CFGI_CD_ALL: + { + uint32_t sid = CMD_SID(&cmd); +- IOMMUMemoryRegion *mr = smmu_iommu_mr(bs, sid); +- SMMUDevice *sdev; ++ SMMUDevice *sdev = smmu_find_sdev(bs, sid); + + if (CMD_SSEC(&cmd)) { + cmd_error = 
SMMU_CERROR_ILL; + break; + } + +- if (!mr) { ++ if (!sdev) { + break; + } + + trace_smmuv3_cmdq_cfgi_cd(sid); +- sdev = container_of(mr, SMMUDevice, iommu); + smmuv3_flush_config(sdev); + break; + } +diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h +index 955ca716a5..e30539a8d4 100644 +--- a/include/hw/arm/smmu-common.h ++++ b/include/hw/arm/smmu-common.h +@@ -234,8 +234,8 @@ int smmu_ptw(SMMUTransCfg *cfg, dma_addr_t iova, IOMMUAccessFlags perm, + */ + SMMUTransTableInfo *select_tt(SMMUTransCfg *cfg, dma_addr_t iova); + +-/* Return the iommu mr associated to @sid, or NULL if none */ +-IOMMUMemoryRegion *smmu_iommu_mr(SMMUState *s, uint32_t sid); ++/* Return the SMMUDevice associated to @sid, or NULL if none */ ++SMMUDevice *smmu_find_sdev(SMMUState *s, uint32_t sid); + + #define SMMU_IOTLB_MAX_SIZE 256 + +-- +2.41.0.windows.1 + diff --git a/hw-arm-smmu-common-Return-sysmem-if-stage-1-is-bypas.patch b/hw-arm-smmu-common-Return-sysmem-if-stage-1-is-bypas.patch new file mode 100644 index 0000000000000000000000000000000000000000..406280cbc2f54b20a763711eb70c2215748777be --- /dev/null +++ b/hw-arm-smmu-common-Return-sysmem-if-stage-1-is-bypas.patch @@ -0,0 +1,87 @@ +From 3c6c29612d5ca0ff07bcb8a45735a3877c8fadd4 Mon Sep 17 00:00:00 2001 +From: Nicolin Chen +Date: Thu, 7 Dec 2023 20:04:47 +0000 +Subject: [PATCH] hw/arm/smmu-common: Return sysmem if stage-1 is bypassed + +When nested translation is enabled, there are 2-stage translation occurring +to two different address spaces: stage-1 in the iommu as, while stage-2 in +the system as. + +If a device attached to the vSMMU doesn't enable stage-1 translation, e.g. +vSTE sets to Config=Bypass, the system as should be returned, so QEMU can +set up system memory mappings onto the stage-2 page table.
+ +Signed-off-by: Nicolin Chen +--- + hw/arm/smmu-common.c | 18 +++++++++++++++++- + include/hw/arm/smmu-common.h | 3 +++ + 2 files changed, 20 insertions(+), 1 deletion(-) + +diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c +index a79eb34277..cc41bf3de8 100644 +--- a/hw/arm/smmu-common.c ++++ b/hw/arm/smmu-common.c +@@ -622,6 +622,9 @@ static SMMUDevice *smmu_get_sdev(SMMUState *s, SMMUPciBus *sbus, + memory_region_init_iommu(&sdev->iommu, sizeof(sdev->iommu), + s->mrtypename, + OBJECT(s), name, UINT64_MAX); ++ if (s->nested) { ++ address_space_init(&sdev->as_sysmem, &s->root, name); ++ } + address_space_init(&sdev->as, + MEMORY_REGION(&sdev->iommu), name); + trace_smmu_add_mr(name); +@@ -637,7 +640,12 @@ static AddressSpace *smmu_find_add_as(PCIBus *bus, void *opaque, int devfn) + SMMUPciBus *sbus = smmu_get_sbus(s, bus); + SMMUDevice *sdev = smmu_get_sdev(s, sbus, bus, devfn); + +- return &sdev->as; ++ /* Return the system as if the device uses stage-2 only */ ++ if (s->nested && !sdev->s1_hwpt) { ++ return &sdev->as_sysmem; ++ } else { ++ return &sdev->as; ++ } + } + + static bool smmu_dev_attach_viommu(SMMUDevice *sdev, +@@ -983,6 +991,14 @@ static void smmu_base_realize(DeviceState *dev, Error **errp) + g_free, g_free); + s->smmu_pcibus_by_busptr = g_hash_table_new(NULL, NULL); + ++ if (s->nested) { ++ memory_region_init(&s->root, OBJECT(s), "root", UINT64_MAX); ++ memory_region_init_alias(&s->sysmem, OBJECT(s), ++ "smmu-sysmem", get_system_memory(), 0, ++ memory_region_size(get_system_memory())); ++ memory_region_add_subregion(&s->root, 0, &s->sysmem); ++ } ++ + if (s->primary_bus) { + pci_setup_iommu(s->primary_bus, &smmu_ops, s); + } else { +diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h +index 66dc7206ea..37dfeed026 100644 +--- a/include/hw/arm/smmu-common.h ++++ b/include/hw/arm/smmu-common.h +@@ -143,6 +143,7 @@ typedef struct SMMUDevice { + SMMUViommu *viommu; + SMMUS1Hwpt *s1_hwpt; + AddressSpace as; ++ AddressSpace 
as_sysmem; + uint32_t cfg_cache_hits; + uint32_t cfg_cache_misses; + QLIST_ENTRY(SMMUDevice) next; +@@ -165,7 +166,9 @@ struct SMMUState { + /* */ + SysBusDevice dev; + const char *mrtypename; ++ MemoryRegion root; + MemoryRegion iomem; ++ MemoryRegion sysmem; + + /* Nested SMMU */ + bool nested; +-- +2.41.0.windows.1 + diff --git a/hw-arm-smmuv3-Add-initial-support-for-SMMUv3-Nested-.patch b/hw-arm-smmuv3-Add-initial-support-for-SMMUv3-Nested-.patch new file mode 100644 index 0000000000000000000000000000000000000000..0a09fabaa9f8e0f8c003ebfa9e44dafe27550d29 --- /dev/null +++ b/hw-arm-smmuv3-Add-initial-support-for-SMMUv3-Nested-.patch @@ -0,0 +1,233 @@ +From 9895192512af4b52aff88432618a474e69b44bdd Mon Sep 17 00:00:00 2001 +From: Shameer Kolothum +Date: Wed, 6 Nov 2024 14:47:27 +0000 +Subject: [PATCH] hw/arm/smmuv3: Add initial support for SMMUv3 Nested device + +Based on SMMUv3 as a parent device, add a user-creatable +smmuv3-nested device. Subsequent patches will add support to +specify a PCI bus for this device. + +Currently only supported for "virt", so hook up the sysbus mem & irq +for that as well. + +No FDT support is added for now.
+ +Signed-off-by: Shameer Kolothum +--- + hw/arm/smmuv3.c | 34 ++++++++++++++++++++++++++++++++++ + hw/arm/virt.c | 31 +++++++++++++++++++++++++++++-- + hw/core/sysbus-fdt.c | 1 + + include/hw/arm/smmuv3.h | 15 +++++++++++++++ + include/hw/arm/virt.h | 6 ++++++ + 5 files changed, 85 insertions(+), 2 deletions(-) + +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index b860c8385f..3010471cdc 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -2069,6 +2069,19 @@ static void smmu_realize(DeviceState *d, Error **errp) + smmu_init_irq(s, dev); + } + ++static void smmu_nested_realize(DeviceState *d, Error **errp) ++{ ++ SMMUv3NestedState *s_nested = ARM_SMMUV3_NESTED(d); ++ SMMUv3NestedClass *c = ARM_SMMUV3_NESTED_GET_CLASS(s_nested); ++ Error *local_err = NULL; ++ ++ c->parent_realize(d, &local_err); ++ if (local_err) { ++ error_propagate(errp, local_err); ++ return; ++ } ++} ++ + static const VMStateDescription vmstate_smmuv3_queue = { + .name = "smmuv3_queue", + .version_id = 1, +@@ -2167,6 +2180,18 @@ static void smmuv3_class_init(ObjectClass *klass, void *data) + device_class_set_props(dc, smmuv3_properties); + } + ++static void smmuv3_nested_class_init(ObjectClass *klass, void *data) ++{ ++ DeviceClass *dc = DEVICE_CLASS(klass); ++ SMMUv3NestedClass *c = ARM_SMMUV3_NESTED_CLASS(klass); ++ ++ dc->vmsd = &vmstate_smmuv3; ++ device_class_set_parent_realize(dc, smmu_nested_realize, ++ &c->parent_realize); ++ dc->user_creatable = true; ++ dc->hotpluggable = false; ++} ++ + static int smmuv3_notify_flag_changed(IOMMUMemoryRegion *iommu, + IOMMUNotifierFlag old, + IOMMUNotifierFlag new, +@@ -2205,6 +2230,14 @@ static void smmuv3_iommu_memory_region_class_init(ObjectClass *klass, + imrc->notify_flag_changed = smmuv3_notify_flag_changed; + } + ++static const TypeInfo smmuv3_nested_type_info = { ++ .name = TYPE_ARM_SMMUV3_NESTED, ++ .parent = TYPE_ARM_SMMUV3, ++ .instance_size = sizeof(SMMUv3NestedState), ++ .class_size = sizeof(SMMUv3NestedClass), ++ .class_init = 
smmuv3_nested_class_init, ++}; ++ + static const TypeInfo smmuv3_type_info = { + .name = TYPE_ARM_SMMUV3, + .parent = TYPE_ARM_SMMU, +@@ -2223,6 +2256,7 @@ static const TypeInfo smmuv3_iommu_memory_region_info = { + static void smmuv3_register_types(void) + { + type_register(&smmuv3_type_info); ++ type_register(&smmuv3_nested_type_info); + type_register(&smmuv3_iommu_memory_region_info); + } + +diff --git a/hw/arm/virt.c b/hw/arm/virt.c +index 08c40c314b..a55f297af2 100644 +--- a/hw/arm/virt.c ++++ b/hw/arm/virt.c +@@ -166,6 +166,7 @@ static const MemMapEntry base_memmap[] = { + /* In the virtCCA scenario, this space is used for MSI interrupt mapping */ + [VIRT_CVM_MSI] = { 0x0a001000, 0x00fff000 }, + [VIRT_CPUFREQ] = { 0x0b000000, 0x00010000 }, ++ [VIRT_SMMU_NESTED] = { 0x0b010000, 0x00ff0000}, + /* ...repeating for a total of NUM_VIRTIO_TRANSPORTS, each of that size */ + [VIRT_PLATFORM_BUS] = { 0x0c000000, 0x02000000 }, + [VIRT_SECURE_MEM] = { 0x0e000000, 0x01000000 }, +@@ -211,6 +212,7 @@ static const int a15irqmap[] = { + [VIRT_GIC_V2M] = 48, /* ...to 48 + NUM_GICV2M_SPIS - 1 */ + [VIRT_SMMU] = 74, /* ...to 74 + NUM_SMMU_IRQS - 1 */ + [VIRT_PLATFORM_BUS] = 112, /* ...to 112 + PLATFORM_BUS_NUM_IRQS -1 */ ++ [VIRT_SMMU_NESTED] = 200, + }; + + static const char *valid_cpus[] = { +@@ -3613,10 +3615,34 @@ static void virt_machine_device_plug_cb(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) + { + VirtMachineState *vms = VIRT_MACHINE(hotplug_dev); ++ MachineClass *mc = MACHINE_GET_CLASS(vms); + +- if (vms->platform_bus_dev) { +- MachineClass *mc = MACHINE_GET_CLASS(vms); ++ /* For smmuv3-nested devices we need to set the mem & irq */ ++ if (device_is_dynamic_sysbus(mc, dev) && ++ object_dynamic_cast(OBJECT(dev), TYPE_ARM_SMMUV3_NESTED)) { ++ hwaddr base = vms->memmap[VIRT_SMMU_NESTED].base; ++ int irq = vms->irqmap[VIRT_SMMU_NESTED]; ++ ++ if (vms->smmu_nested_count >= MAX_SMMU_NESTED) { ++ error_setg(errp, "smmuv3-nested max count reached!"); ++ 
return; ++ } ++ ++ base += (vms->smmu_nested_count * SMMU_IO_LEN); ++ irq += (vms->smmu_nested_count * NUM_SMMU_IRQS); + ++ sysbus_mmio_map(SYS_BUS_DEVICE(dev), 0, base); ++ for (int i = 0; i < 4; i++) { ++ sysbus_connect_irq(SYS_BUS_DEVICE(dev), i, ++ qdev_get_gpio_in(vms->gic, irq + i)); ++ } ++ if (vms->iommu != VIRT_IOMMU_SMMUV3_NESTED) { ++ vms->iommu = VIRT_IOMMU_SMMUV3_NESTED; ++ } ++ vms->smmu_nested_count++; ++ } ++ ++ if (vms->platform_bus_dev) { + if (device_is_dynamic_sysbus(mc, dev)) { + platform_bus_link_device(PLATFORM_BUS_DEVICE(vms->platform_bus_dev), + SYS_BUS_DEVICE(dev)); +@@ -3789,6 +3815,7 @@ static void virt_machine_class_init(ObjectClass *oc, void *data) + machine_class_allow_dynamic_sysbus_dev(mc, TYPE_VFIO_AMD_XGBE); + machine_class_allow_dynamic_sysbus_dev(mc, TYPE_RAMFB_DEVICE); + machine_class_allow_dynamic_sysbus_dev(mc, TYPE_VFIO_PLATFORM); ++ machine_class_allow_dynamic_sysbus_dev(mc, TYPE_ARM_SMMUV3_NESTED); + #ifdef CONFIG_TPM + machine_class_allow_dynamic_sysbus_dev(mc, TYPE_TPM_TIS_SYSBUS); + #endif +diff --git a/hw/core/sysbus-fdt.c b/hw/core/sysbus-fdt.c +index eebcd28f9a..0f0d0b3e58 100644 +--- a/hw/core/sysbus-fdt.c ++++ b/hw/core/sysbus-fdt.c +@@ -489,6 +489,7 @@ static const BindingEntry bindings[] = { + #ifdef CONFIG_LINUX + TYPE_BINDING(TYPE_VFIO_CALXEDA_XGMAC, add_calxeda_midway_xgmac_fdt_node), + TYPE_BINDING(TYPE_VFIO_AMD_XGBE, add_amd_xgbe_fdt_node), ++ TYPE_BINDING("arm-smmuv3-nested", no_fdt_node), + VFIO_PLATFORM_BINDING("amd,xgbe-seattle-v1a", add_amd_xgbe_fdt_node), + #endif + #ifdef CONFIG_TPM +diff --git a/include/hw/arm/smmuv3.h b/include/hw/arm/smmuv3.h +index d183a62766..87e628be7a 100644 +--- a/include/hw/arm/smmuv3.h ++++ b/include/hw/arm/smmuv3.h +@@ -84,6 +84,21 @@ struct SMMUv3Class { + #define TYPE_ARM_SMMUV3 "arm-smmuv3" + OBJECT_DECLARE_TYPE(SMMUv3State, SMMUv3Class, ARM_SMMUV3) + ++#define TYPE_ARM_SMMUV3_NESTED "arm-smmuv3-nested" ++OBJECT_DECLARE_TYPE(SMMUv3NestedState, SMMUv3NestedClass, 
ARM_SMMUV3_NESTED) ++ ++struct SMMUv3NestedState { ++ SMMUv3State smmuv3_state; ++}; ++ ++struct SMMUv3NestedClass { ++ /*< private >*/ ++ SMMUv3Class smmuv3_class; ++ /*< public >*/ ++ ++ DeviceRealize parent_realize; ++}; ++ + #define STAGE1_SUPPORTED(s) FIELD_EX32(s->idr[0], IDR0, S1P) + #define STAGE2_SUPPORTED(s) FIELD_EX32(s->idr[0], IDR0, S2P) + +diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h +index e6a449becd..cd41e28202 100644 +--- a/include/hw/arm/virt.h ++++ b/include/hw/arm/virt.h +@@ -109,6 +109,9 @@ typedef enum { + /* MMIO region size for SMMUv3 */ + #define SMMU_IO_LEN 0x20000 + ++/* Max supported nested SMMUv3 */ ++#define MAX_SMMU_NESTED 64 ++ + enum { + VIRT_FLASH, + VIRT_MEM, +@@ -121,6 +124,7 @@ enum { + VIRT_GIC_ITS, + VIRT_GIC_REDIST, + VIRT_SMMU, ++ VIRT_SMMU_NESTED, + VIRT_UART, + VIRT_CPUFREQ, + VIRT_MMIO, +@@ -155,6 +159,7 @@ enum { + typedef enum VirtIOMMUType { + VIRT_IOMMU_NONE, + VIRT_IOMMU_SMMUV3, ++ VIRT_IOMMU_SMMUV3_NESTED, + VIRT_IOMMU_VIRTIO, + } VirtIOMMUType; + +@@ -222,6 +227,7 @@ struct VirtMachineState { + bool mte; + bool dtb_randomness; + bool pmu; ++ int smmu_nested_count; + OnOffAuto acpi; + VirtGICType gic_version; + VirtIOMMUType iommu; +-- +2.41.0.windows.1 + diff --git a/hw-arm-smmuv3-Add-missing-STE-invalidation.patch b/hw-arm-smmuv3-Add-missing-STE-invalidation.patch new file mode 100644 index 0000000000000000000000000000000000000000..f96f14407a0b15a8e219c6b5e4f0cacf0204fba3 --- /dev/null +++ b/hw-arm-smmuv3-Add-missing-STE-invalidation.patch @@ -0,0 +1,92 @@ +From 707bd8198642549595f11ef34c80094fbf7d2de1 Mon Sep 17 00:00:00 2001 +From: Nicolin Chen +Date: Mon, 29 Apr 2024 21:26:41 +0000 +Subject: [PATCH] hw/arm/smmuv3: Add missing STE invalidation + +Multiple STEs can be invalidated in a range via SMMU_CMD_CFGI_STE_RANGE +or SMMU_CMD_CFGI_ALL command. + +Add the missing STE invalidation in this pathway.
+ +Signed-off-by: Nicolin Chen +--- + hw/arm/smmu-internal.h | 1 + + hw/arm/smmuv3.c | 28 +++++++++++++++++++++++++--- + 2 files changed, 26 insertions(+), 3 deletions(-) + +diff --git a/hw/arm/smmu-internal.h b/hw/arm/smmu-internal.h +index 843bebb185..5a81dd1b82 100644 +--- a/hw/arm/smmu-internal.h ++++ b/hw/arm/smmu-internal.h +@@ -142,6 +142,7 @@ typedef struct SMMUIOTLBPageInvInfo { + } SMMUIOTLBPageInvInfo; + + typedef struct SMMUSIDRange { ++ SMMUState *state; + uint32_t start; + uint32_t end; + } SMMUSIDRange; +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index 540831ab8e..9d44bb19bc 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -1322,11 +1322,9 @@ static void smmuv3_install_nested_ste(SMMUDevice *sdev, int sid) + } + + static gboolean +-smmuv3_invalidate_ste(gpointer key, gpointer value, gpointer user_data) ++_smmuv3_invalidate_ste(SMMUDevice *sdev, SMMUSIDRange *sid_range) + { +- SMMUDevice *sdev = (SMMUDevice *)key; + uint32_t sid = smmu_get_sid(sdev); +- SMMUSIDRange *sid_range = (SMMUSIDRange *)user_data; + + if (sid < sid_range->start || sid > sid_range->end) { + return false; +@@ -1337,6 +1335,28 @@ smmuv3_invalidate_ste(gpointer key, gpointer value, gpointer user_data) + return true; + } + ++static gboolean ++smmuv3_invalidate_ste(gpointer key, gpointer value, gpointer user_data) ++{ ++ return _smmuv3_invalidate_ste((SMMUDevice *)key, (SMMUSIDRange *)user_data); ++} ++ ++static void smmuv3_invalidate_nested_ste(SMMUSIDRange *sid_range) ++{ ++ SMMUState *bs = sid_range->state; ++ SMMUDevice *sdev; ++ ++ if (!bs->viommu) { ++ return; ++ } ++ ++ QLIST_FOREACH(sdev, &bs->viommu->device_list, next) { ++ if (smmu_get_sid(sdev)) { ++ _smmuv3_invalidate_ste(sdev, sid_range); ++ } ++ } ++} ++ + static int smmuv3_cmdq_consume(SMMUv3State *s) + { + SMMUState *bs = ARM_SMMU(s); +@@ -1418,12 +1438,14 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) + } + + mask = (1ULL << (range + 1)) - 1; ++ sid_range.state = bs; + sid_range.start = sid & ~mask; 
+ sid_range.end = sid_range.start + mask; + + trace_smmuv3_cmdq_cfgi_ste_range(sid_range.start, sid_range.end); + g_hash_table_foreach_remove(bs->configs, smmuv3_invalidate_ste, + &sid_range); ++ smmuv3_invalidate_nested_ste(&sid_range); + break; + } + case SMMU_CMD_CFGI_CD: +-- +2.41.0.windows.1 + diff --git a/hw-arm-smmuv3-Add-smmu_dev_install_nested_ste-for-CF.patch b/hw-arm-smmuv3-Add-smmu_dev_install_nested_ste-for-CF.patch new file mode 100644 index 0000000000000000000000000000000000000000..02defaef0fead966718af49b83ad800864bbec0c --- /dev/null +++ b/hw-arm-smmuv3-Add-smmu_dev_install_nested_ste-for-CF.patch @@ -0,0 +1,255 @@ +From 13b84313c9f7ca4823abdbad92baf091c337861e Mon Sep 17 00:00:00 2001 +From: Nicolin Chen +Date: Fri, 21 Apr 2023 15:13:53 -0700 +Subject: [PATCH] hw/arm/smmuv3: Add smmu_dev_install_nested_ste() for CFGI_STE + +Call smmu_dev_install_nested_ste and eventually down to IOMMU_HWPT_ALLOC +ioctl for a nested HWPT allocation. + +Signed-off-by: Nicolin Chen +--- + hw/arm/smmu-common.c | 9 ++++ + hw/arm/smmuv3-internal.h | 1 + + hw/arm/smmuv3.c | 97 +++++++++++++++++++++++++++++++++++- + hw/arm/trace-events | 1 + + include/hw/arm/smmu-common.h | 14 ++++++ + 5 files changed, 120 insertions(+), 2 deletions(-) + +diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c +index cc41bf3de8..9e9af8f5c7 100644 +--- a/hw/arm/smmu-common.c ++++ b/hw/arm/smmu-common.c +@@ -780,6 +780,7 @@ static bool smmu_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn, + + static void smmu_dev_unset_iommu_device(PCIBus *bus, void *opaque, int devfn) + { ++ SMMUVdev *vdev; + SMMUDevice *sdev; + SMMUViommu *viommu; + SMMUState *s = opaque; +@@ -803,13 +804,21 @@ static void smmu_dev_unset_iommu_device(PCIBus *bus, void *opaque, int devfn) + error_report("Unable to attach dev to the default HW pagetable"); + } + ++ vdev = sdev->vdev; + viommu = sdev->viommu; + + sdev->idev = NULL; + sdev->viommu = NULL; ++ sdev->vdev = NULL; + QLIST_REMOVE(sdev, next); + 
trace_smmu_unset_iommu_device(devfn, smmu_get_sid(sdev)); + ++ if (vdev) { ++ iommufd_backend_free_id(viommu->iommufd, vdev->core->vdev_id); ++ g_free(vdev->core); ++ g_free(vdev); ++ } ++ + if (QLIST_EMPTY(&viommu->device_list)) { + iommufd_backend_free_id(viommu->iommufd, viommu->bypass_hwpt_id); + iommufd_backend_free_id(viommu->iommufd, viommu->abort_hwpt_id); +diff --git a/hw/arm/smmuv3-internal.h b/hw/arm/smmuv3-internal.h +index 6076025ad6..163459d450 100644 +--- a/hw/arm/smmuv3-internal.h ++++ b/hw/arm/smmuv3-internal.h +@@ -552,6 +552,7 @@ typedef struct CD { + + #define STE_S1FMT(x) extract32((x)->word[0], 4 , 2) + #define STE_S1CDMAX(x) extract32((x)->word[1], 27, 5) ++#define STE_S1DSS(x) extract32((x)->word[2], 0, 2) + #define STE_S1STALLD(x) extract32((x)->word[2], 27, 1) + #define STE_EATS(x) extract32((x)->word[2], 28, 2) + #define STE_STRW(x) extract32((x)->word[2], 30, 2) +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index 253d297eec..540831ab8e 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -563,6 +563,27 @@ bad_ste: + return -EINVAL; + } + ++static void decode_ste_config(SMMUTransCfg *cfg, uint32_t config) ++{ ++ ++ if (STE_CFG_ABORT(config)) { ++ cfg->aborted = true; ++ return; ++ } ++ if (STE_CFG_BYPASS(config)) { ++ cfg->bypassed = true; ++ return; ++ } ++ ++ if (STE_CFG_S1_ENABLED(config)) { ++ cfg->stage = SMMU_STAGE_1; ++ } ++ ++ if (STE_CFG_S2_ENABLED(config)) { ++ cfg->stage |= SMMU_STAGE_2; ++ } ++} ++ + /* Returns < 0 in case of invalid STE, 0 otherwise */ + static int decode_ste(SMMUv3State *s, SMMUTransCfg *cfg, + STE *ste, SMMUEventInfo *event) +@@ -579,12 +600,19 @@ static int decode_ste(SMMUv3State *s, SMMUTransCfg *cfg, + + config = STE_CONFIG(ste); + +- if (STE_CFG_ABORT(config)) { ++ decode_ste_config(cfg, config); ++ ++ /* S1DSS.Terminate is same as Config.abort for default stream */ ++ if (STE_CFG_S1_ENABLED(config) && STE_S1DSS(ste) == 0) { + cfg->aborted = true; ++ } ++ ++ if (cfg->aborted || cfg->bypassed) { + 
return 0; + } + +- if (STE_CFG_BYPASS(config)) { ++ /* S1DSS.Bypass is same as Config.bypass for default stream */ ++ if (STE_CFG_S1_ENABLED(config) && STE_S1DSS(ste) == 0x1) { + cfg->bypassed = true; + return 0; + } +@@ -1231,6 +1259,68 @@ static void smmuv3_range_inval(SMMUState *s, Cmd *cmd) + } + } + ++static void smmuv3_install_nested_ste(SMMUDevice *sdev, int sid) ++{ ++#ifdef __linux__ ++ SMMUEventInfo event = {.type = SMMU_EVT_NONE, .sid = sid, ++ .inval_ste_allowed = true}; ++ struct iommu_hwpt_arm_smmuv3 nested_data = {}; ++ SMMUv3State *s = sdev->smmu; ++ SMMUState *bs = &s->smmu_state; ++ uint32_t config; ++ STE ste; ++ int ret; ++ ++ if (!sdev->viommu || !bs->nested) { ++ return; ++ } ++ ++ if (!sdev->vdev && sdev->idev && sdev->viommu) { ++ SMMUVdev *vdev = g_new0(SMMUVdev, 1); ++ vdev->core = iommufd_backend_alloc_vdev(sdev->idev, sdev->viommu->core, ++ sid); ++ if (!vdev->core) { ++ error_report("failed to allocate a vDEVICE"); ++ g_free(vdev); ++ return; ++ } ++ sdev->vdev = vdev; ++ } ++ ++ ret = smmu_find_ste(sdev->smmu, sid, &ste, &event); ++ if (ret) { ++ /* ++ * For a 2-level Stream Table, the level-2 table might not be ready ++ * until the device gets inserted to the stream table. Ignore this. 
++ */ ++ return; ++ } ++ ++ config = STE_CONFIG(&ste); ++ if (!STE_VALID(&ste) || !STE_CFG_S1_ENABLED(config)) { ++ smmu_dev_uninstall_nested_ste(sdev, STE_CFG_ABORT(config)); ++ smmuv3_flush_config(sdev); ++ return; ++ } ++ ++ nested_data.ste[0] = (uint64_t)ste.word[0] | (uint64_t)ste.word[1] << 32; ++ nested_data.ste[1] = (uint64_t)ste.word[2] | (uint64_t)ste.word[3] << 32; ++ /* V | CONFIG | S1FMT | S1CTXPTR | S1CDMAX */ ++ nested_data.ste[0] &= 0xf80fffffffffffffULL; ++ /* S1DSS | S1CIR | S1COR | S1CSH | S1STALLD | EATS */ ++ nested_data.ste[1] &= 0x380000ffULL; ++ ++ ret = smmu_dev_install_nested_ste(sdev, IOMMU_HWPT_DATA_ARM_SMMUV3, ++ sizeof(nested_data), &nested_data); ++ if (ret) { ++ error_report("Unable to install nested STE=%16LX:%16LX, ret=%d", ++ nested_data.ste[1], nested_data.ste[0], ret); ++ } ++ ++ trace_smmuv3_install_nested_ste(sid, nested_data.ste[1], nested_data.ste[0]); ++#endif ++} ++ + static gboolean + smmuv3_invalidate_ste(gpointer key, gpointer value, gpointer user_data) + { +@@ -1241,6 +1331,8 @@ smmuv3_invalidate_ste(gpointer key, gpointer value, gpointer user_data) + if (sid < sid_range->start || sid > sid_range->end) { + return false; + } ++ smmuv3_flush_config(sdev); ++ smmuv3_install_nested_ste(sdev, sid); + trace_smmuv3_config_cache_inv(sid); + return true; + } +@@ -1310,6 +1402,7 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) + trace_smmuv3_cmdq_cfgi_ste(sid); + sdev = container_of(mr, SMMUDevice, iommu); + smmuv3_flush_config(sdev); ++ smmuv3_install_nested_ste(sdev, sid); + + break; + } +diff --git a/hw/arm/trace-events b/hw/arm/trace-events +index 1e3d86382d..490da6349c 100644 +--- a/hw/arm/trace-events ++++ b/hw/arm/trace-events +@@ -57,4 +57,5 @@ smmuv3_notify_flag_add(const char *iommu) "ADD SMMUNotifier node for iommu mr=%s + smmuv3_notify_flag_del(const char *iommu) "DEL SMMUNotifier node for iommu mr=%s" + smmuv3_get_device_info(uint32_t idr0, uint32_t idr1, uint32_t idr3, uint32_t idr5) "idr0=0x%x idr1=0x%x idr3=0x%x 
idr5=0x%x" + smmuv3_inv_notifiers_iova(const char *name, uint16_t asid, uint16_t vmid, uint64_t iova, uint8_t tg, uint64_t num_pages) "iommu mr=%s asid=%d vmid=%d iova=0x%"PRIx64" tg=%d num_pages=0x%"PRIx64 ++smmuv3_install_nested_ste(uint32_t sid, uint64_t ste_1, uint64_t ste_0) "sid=%d ste=%"PRIx64":%"PRIx64 + +diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h +index d120c352cf..955ca716a5 100644 +--- a/include/hw/arm/smmu-common.h ++++ b/include/hw/arm/smmu-common.h +@@ -51,6 +51,13 @@ typedef enum { + SMMU_PTW_ERR_PERMISSION, /* Permission fault */ + } SMMUPTWEventType; + ++/* SMMU Stage */ ++typedef enum { ++ SMMU_STAGE_1 = 1, ++ SMMU_STAGE_2, ++ SMMU_NESTED, ++} SMMUStage; ++ + typedef struct SMMUPTWEventInfo { + int stage; + SMMUPTWEventType type; +@@ -125,6 +132,12 @@ typedef struct SMMUViommu { + QLIST_ENTRY(SMMUViommu) next; + } SMMUViommu; + ++typedef struct SMMUVdev { ++ SMMUViommu *vsmmu; ++ IOMMUFDVdev *core; ++ uint32_t sid; ++}SMMUVdev; ++ + typedef struct SMMUS1Hwpt { + void *smmu; + IOMMUFDBackend *iommufd; +@@ -141,6 +154,7 @@ typedef struct SMMUDevice { + IOMMUMemoryRegion iommu; + HostIOMMUDeviceIOMMUFD *idev; + SMMUViommu *viommu; ++ SMMUVdev *vdev; + SMMUS1Hwpt *s1_hwpt; + AddressSpace as; + AddressSpace as_sysmem; +-- +2.41.0.windows.1 + diff --git a/hw-arm-smmuv3-Associate-a-pci-bus-with-a-SMMUv3-Nest.patch b/hw-arm-smmuv3-Associate-a-pci-bus-with-a-SMMUv3-Nest.patch new file mode 100644 index 0000000000000000000000000000000000000000..abf1ab5691f43a249e5cbc6b9e1ae12b530ddbc6 --- /dev/null +++ b/hw-arm-smmuv3-Associate-a-pci-bus-with-a-SMMUv3-Nest.patch @@ -0,0 +1,95 @@ +From afca50145f52601d912a805b65bd4530e9278388 Mon Sep 17 00:00:00 2001 +From: Shameer Kolothum +Date: Wed, 6 Nov 2024 15:53:45 +0000 +Subject: [PATCH] hw/arm/smmuv3: Associate a pci bus with a SMMUv3 Nested + device + +Subsequent patches will add IORT modifications to get this working. 
+ +Signed-off-by: Shameer Kolothum +--- + hw/arm/smmuv3.c | 27 +++++++++++++++++++++++++++ + include/hw/arm/smmuv3.h | 2 ++ + 2 files changed, 29 insertions(+) + +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index 3010471cdc..66e4e1b57d 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -24,6 +24,7 @@ + #include "hw/qdev-properties.h" + #include "hw/qdev-core.h" + #include "hw/pci/pci.h" ++#include "hw/pci/pci_bridge.h" + #include "cpu.h" + #include "trace.h" + #include "qemu/log.h" +@@ -2069,12 +2070,32 @@ static void smmu_realize(DeviceState *d, Error **errp) + smmu_init_irq(s, dev); + } + ++static int smmuv3_nested_pci_host_bridge(Object *obj, void *opaque) ++{ ++ DeviceState *d = opaque; ++ SMMUv3NestedState *s_nested = ARM_SMMUV3_NESTED(d); ++ ++ if (object_dynamic_cast(obj, TYPE_PCI_HOST_BRIDGE)) { ++ PCIBus *bus = PCI_HOST_BRIDGE(obj)->bus; ++ if (s_nested->pci_bus && !strcmp(bus->qbus.name, s_nested->pci_bus)) { ++ object_property_set_link(OBJECT(d), "primary-bus", OBJECT(bus), ++ &error_abort); ++ } ++ } ++ return 0; ++} ++ + static void smmu_nested_realize(DeviceState *d, Error **errp) + { + SMMUv3NestedState *s_nested = ARM_SMMUV3_NESTED(d); + SMMUv3NestedClass *c = ARM_SMMUV3_NESTED_GET_CLASS(s_nested); ++ SysBusDevice *dev = SYS_BUS_DEVICE(d); + Error *local_err = NULL; + ++ object_child_foreach_recursive(object_get_root(), ++ smmuv3_nested_pci_host_bridge, d); ++ object_property_set_bool(OBJECT(dev), "nested", true, &error_abort); ++ + c->parent_realize(d, &local_err); + if (local_err) { + error_propagate(errp, local_err); +@@ -2161,6 +2182,11 @@ static Property smmuv3_properties[] = { + DEFINE_PROP_END_OF_LIST() + }; + ++static Property smmuv3_nested_properties[] = { ++ DEFINE_PROP_STRING("pci-bus", SMMUv3NestedState, pci_bus), ++ DEFINE_PROP_END_OF_LIST() ++}; ++ + static void smmuv3_instance_init(Object *obj) + { + /* Nothing much to do here as of now */ +@@ -2188,6 +2214,7 @@ static void smmuv3_nested_class_init(ObjectClass *klass, void 
*data) + dc->vmsd = &vmstate_smmuv3; + device_class_set_parent_realize(dc, smmu_nested_realize, + &c->parent_realize); ++ device_class_set_props(dc, smmuv3_nested_properties); + dc->user_creatable = true; + dc->hotpluggable = false; + } +diff --git a/include/hw/arm/smmuv3.h b/include/hw/arm/smmuv3.h +index 87e628be7a..96513fce56 100644 +--- a/include/hw/arm/smmuv3.h ++++ b/include/hw/arm/smmuv3.h +@@ -89,6 +89,8 @@ OBJECT_DECLARE_TYPE(SMMUv3NestedState, SMMUv3NestedClass, ARM_SMMUV3_NESTED) + + struct SMMUv3NestedState { + SMMUv3State smmuv3_state; ++ ++ char *pci_bus; + }; + + struct SMMUv3NestedClass { +-- +2.41.0.windows.1 + diff --git a/hw-arm-smmuv3-Check-idr-registers-for-STE_S1CDMAX-an.patch b/hw-arm-smmuv3-Check-idr-registers-for-STE_S1CDMAX-an.patch new file mode 100644 index 0000000000000000000000000000000000000000..3bf8416baff11d101569b76b408a77c5de6e1050 --- /dev/null +++ b/hw-arm-smmuv3-Check-idr-registers-for-STE_S1CDMAX-an.patch @@ -0,0 +1,38 @@ +From fac9784bbedb50dc964feb9cf70b6f37472fcf60 Mon Sep 17 00:00:00 2001 +From: Nicolin Chen +Date: Fri, 21 Apr 2023 22:10:44 -0700 +Subject: [PATCH] hw/arm/smmuv3: Check idr registers for STE_S1CDMAX and + STE_S1STALLD + +With nested translation, the underlying HW could support those two fields. +Allow them according to the updated idr registers after the hw_info ioctl. 
+ +Signed-off-by: Nicolin Chen +--- + hw/arm/smmuv3.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index 4208325ab3..253d297eec 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -622,13 +622,14 @@ static int decode_ste(SMMUv3State *s, SMMUTransCfg *cfg, + } + } + +- if (STE_S1CDMAX(ste) != 0) { ++ if (!FIELD_EX32(s->idr[1], IDR1, SSIDSIZE) && STE_S1CDMAX(ste) != 0) { + qemu_log_mask(LOG_UNIMP, + "SMMUv3 does not support multiple context descriptors yet\n"); + goto bad_ste; + } + +- if (STE_S1STALLD(ste)) { ++ /* STALL_MODEL being 0b01 means "stall is not supported" */ ++ if ((FIELD_EX32(s->idr[0], IDR0, STALL_MODEL) & 0x1) && STE_S1STALLD(ste)) { + qemu_log_mask(LOG_UNIMP, + "SMMUv3 S1 stalling fault model not allowed yet\n"); + goto bad_ste; +-- +2.41.0.windows.1 + diff --git a/hw-arm-smmuv3-Enable-sva-stall-IDR-features.patch b/hw-arm-smmuv3-Enable-sva-stall-IDR-features.patch new file mode 100644 index 0000000000000000000000000000000000000000..fde17487db46275897be8e5b5c2f939d64f9c428 --- /dev/null +++ b/hw-arm-smmuv3-Enable-sva-stall-IDR-features.patch @@ -0,0 +1,76 @@ +From c8267f88b2af37779a597aac00aeaf06adc80ccc Mon Sep 17 00:00:00 2001 +From: Shameer Kolothum +Date: Mon, 11 Dec 2023 14:42:01 +0000 +Subject: [PATCH] hw/arm/smmuv3: Enable sva/stall IDR features + +Emulate features that will enable the stall and sva feature in Guest. 
+ +Signed-off-by: Shameer Kolothum +--- + hw/arm/smmuv3-internal.h | 3 ++- + hw/arm/smmuv3.c | 8 +++----- + 2 files changed, 5 insertions(+), 6 deletions(-) + +diff --git a/hw/arm/smmuv3-internal.h b/hw/arm/smmuv3-internal.h +index a411fd4048..cfc04c563e 100644 +--- a/hw/arm/smmuv3-internal.h ++++ b/hw/arm/smmuv3-internal.h +@@ -74,6 +74,7 @@ REG32(IDR1, 0x4) + FIELD(IDR1, ECMDQ, 31, 1) + + #define SMMU_IDR1_SIDSIZE 16 ++#define SMMU_IDR1_SSIDSIZE 16 + #define SMMU_CMDQS 19 + #define SMMU_EVENTQS 19 + +@@ -104,7 +105,7 @@ REG32(IDR5, 0x14) + FIELD(IDR5, VAX, 10, 2); + FIELD(IDR5, STALL_MAX, 16, 16); + +-#define SMMU_IDR5_OAS 4 ++#define SMMU_IDR5_OAS 5 + + REG32(IIDR, 0x18) + REG32(AIDR, 0x1c) +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index 66e4e1b57d..8d8dcccd48 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -343,13 +343,14 @@ static void smmuv3_init_regs(SMMUv3State *s) + s->idr[0] = FIELD_DP32(s->idr[0], IDR0, ASID16, 1); /* 16-bit ASID */ + s->idr[0] = FIELD_DP32(s->idr[0], IDR0, VMID16, 1); /* 16-bit VMID */ + s->idr[0] = FIELD_DP32(s->idr[0], IDR0, TTENDIAN, 2); /* little endian */ +- s->idr[0] = FIELD_DP32(s->idr[0], IDR0, STALL_MODEL, 1); /* No stall */ ++ s->idr[0] = FIELD_DP32(s->idr[0], IDR0, STALL_MODEL, 0); /* stall */ + /* terminated transaction will always be aborted/error returned */ + s->idr[0] = FIELD_DP32(s->idr[0], IDR0, TERM_MODEL, 1); + /* 2-level stream table supported */ + s->idr[0] = FIELD_DP32(s->idr[0], IDR0, STLEVEL, 1); + + s->idr[1] = FIELD_DP32(s->idr[1], IDR1, SIDSIZE, SMMU_IDR1_SIDSIZE); ++ s->idr[1] = FIELD_DP32(s->idr[1], IDR1, SSIDSIZE, SMMU_IDR1_SSIDSIZE); + s->idr[1] = FIELD_DP32(s->idr[1], IDR1, EVENTQS, SMMU_EVENTQS); + s->idr[1] = FIELD_DP32(s->idr[1], IDR1, CMDQS, SMMU_CMDQS); + +@@ -361,7 +362,7 @@ static void smmuv3_init_regs(SMMUv3State *s) + s->idr[3] = FIELD_DP32(s->idr[3], IDR3, RIL, 1); + s->idr[3] = FIELD_DP32(s->idr[3], IDR3, BBML, 2); + +- s->idr[5] = FIELD_DP32(s->idr[5], IDR5, OAS, 
SMMU_IDR5_OAS); /* 44 bits */ ++ s->idr[5] = FIELD_DP32(s->idr[5], IDR5, OAS, SMMU_IDR5_OAS); /* 48 bits */ + /* 4K, 16K and 64K granule support */ + s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN4K, 1); + s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN16K, 1); +@@ -776,9 +777,6 @@ static int decode_cd(SMMUTransCfg *cfg, CD *cd, SMMUEventInfo *event) + if (!CD_A(cd)) { + goto bad_cd; /* SMMU_IDR0.TERM_MODEL == 1 */ + } +- if (CD_S(cd)) { +- goto bad_cd; /* !STE_SECURE && SMMU_IDR0.STALL_MODEL == 1 */ +- } + if (CD_HA(cd) || CD_HD(cd)) { + goto bad_cd; /* HTTU = 0 */ + } +-- +2.41.0.windows.1 + diff --git a/hw-arm-smmuv3-Forward-cache-invalidate-commands-via-.patch b/hw-arm-smmuv3-Forward-cache-invalidate-commands-via-.patch new file mode 100644 index 0000000000000000000000000000000000000000..9568a8e52a27ac08e4595594792f3a4cae00986f --- /dev/null +++ b/hw-arm-smmuv3-Forward-cache-invalidate-commands-via-.patch @@ -0,0 +1,229 @@ +From b331acc42fa54ca93496c32d92cdf5397927bff1 Mon Sep 17 00:00:00 2001 +From: Nicolin Chen +Date: Fri, 21 Apr 2023 15:18:56 -0700 +Subject: [PATCH] hw/arm/smmuv3: Forward cache invalidate commands via iommufd + +Introduce an SMMUCommandBatch and some helpers to batch the commands. + +Rewind the q->cons accordingly when it fails to execute a batch/command. + +Currently separate TLBI commands and device cache commands to avoid some +errata on certain versions of SMMUs. Later it should check IIDR register +to detect if underlying SMMU hw has such an erratum.
+ +Signed-off-by: Nicolin Chen +--- + hw/arm/smmuv3-internal.h | 13 +++++ + hw/arm/smmuv3.c | 113 ++++++++++++++++++++++++++++++++++++++- + 2 files changed, 125 insertions(+), 1 deletion(-) + +diff --git a/hw/arm/smmuv3-internal.h b/hw/arm/smmuv3-internal.h +index 163459d450..a411fd4048 100644 +--- a/hw/arm/smmuv3-internal.h ++++ b/hw/arm/smmuv3-internal.h +@@ -226,6 +226,19 @@ static inline bool smmuv3_gerror_irq_enabled(SMMUv3State *s) + #define Q_CONS_WRAP(q) (((q)->cons & WRAP_MASK(q)) >> (q)->log2size) + #define Q_PROD_WRAP(q) (((q)->prod & WRAP_MASK(q)) >> (q)->log2size) + ++#define Q_IDX(llq, p) ((p) & ((1 << (llq)->max_n_shift) - 1)) ++ ++static inline int smmuv3_q_ncmds(SMMUQueue *q) ++{ ++ uint32_t prod = Q_PROD(q); ++ uint32_t cons = Q_CONS(q); ++ ++ if (Q_PROD_WRAP(q) == Q_CONS_WRAP(q)) ++ return prod - cons; ++ else ++ return WRAP_MASK(q) - cons + prod; ++} ++ + static inline bool smmuv3_q_full(SMMUQueue *q) + { + return ((q->cons ^ q->prod) & WRAP_INDEX_MASK(q)) == WRAP_MASK(q); +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index b2ffe2d40b..b860c8385f 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -1357,16 +1357,85 @@ static void smmuv3_invalidate_nested_ste(SMMUSIDRange *sid_range) + } + } + ++/** ++ * SMMUCommandBatch - batch of commands to issue for nested SMMU invalidation ++ * @cmds: Pointer to list of commands ++ * @cons: Pointer to list of CONS corresponding to the commands ++ * @ncmds: Total ncmds in the batch ++ * @dev_cache: Issue to a device cache ++ */ ++typedef struct SMMUCommandBatch { ++ Cmd *cmds; ++ uint32_t *cons; ++ uint32_t ncmds; ++ bool dev_cache; ++} SMMUCommandBatch; ++ ++/* Update batch->ncmds to the number of execute cmds */ ++static int smmuv3_issue_cmd_batch(SMMUState *bs, SMMUCommandBatch *batch) ++{ ++ uint32_t total = batch->ncmds; ++ int ret; ++ ++ ret = smmu_viommu_invalidate_cache(bs->viommu->core, ++ IOMMU_VIOMMU_INVALIDATE_DATA_ARM_SMMUV3, ++ sizeof(Cmd), &batch->ncmds, batch->cmds); ++ if (total != 
batch->ncmds) { ++ error_report("%s failed: ret=%d, total=%d, done=%d", ++ __func__, ret, total, batch->ncmds); ++ return ret; ++ } ++ ++ batch->ncmds = 0; ++ batch->dev_cache = false; ++ return ret; ++} ++ ++static int smmuv3_batch_cmds(SMMUState *bs, SMMUCommandBatch *batch, ++ Cmd *cmd, uint32_t *cons, bool dev_cache) ++{ ++ int ret; ++ ++ if (!bs->nested || !bs->viommu) { ++ return 0; ++ } ++ ++ /* ++ * Currently separate dev_cache and hwpt for safety, which might not be ++ * necessary if underlying HW SMMU does not have the errata. ++ * ++ * TODO check IIDR register values read from hw_info. ++ */ ++ if (batch->ncmds && (dev_cache != batch->dev_cache)) { ++ ret = smmuv3_issue_cmd_batch(bs, batch); ++ if (ret) { ++ *cons = batch->cons[batch->ncmds]; ++ return ret; ++ } ++ } ++ batch->dev_cache = dev_cache; ++ batch->cmds[batch->ncmds] = *cmd; ++ batch->cons[batch->ncmds++] = *cons; ++ return 0; ++} ++ + static int smmuv3_cmdq_consume(SMMUv3State *s) + { + SMMUState *bs = ARM_SMMU(s); + SMMUCmdError cmd_error = SMMU_CERROR_NONE; + SMMUQueue *q = &s->cmdq; + SMMUCommandType type = 0; ++ SMMUCommandBatch batch = {}; ++ uint32_t ncmds = 0; + + if (!smmuv3_cmdq_enabled(s)) { + return 0; + } ++ ++ ncmds = smmuv3_q_ncmds(q); ++ batch.cmds = g_new0(Cmd, ncmds); ++ batch.cons = g_new0(uint32_t, ncmds); ++ + /* + * some commands depend on register values, typically CR0. 
In case those + * register values change while handling the command, spec says it +@@ -1463,6 +1532,13 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) + + trace_smmuv3_cmdq_cfgi_cd(sid); + smmuv3_flush_config(sdev); ++ ++ if (sdev->s1_hwpt) { ++ if (smmuv3_batch_cmds(sdev->smmu, &batch, &cmd, &q->cons, true)) { ++ cmd_error = SMMU_CERROR_ILL; ++ break; ++ } ++ } + break; + } + case SMMU_CMD_TLBI_NH_ASID: +@@ -1477,6 +1553,10 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) + trace_smmuv3_cmdq_tlbi_nh_asid(asid); + smmu_inv_notifiers_all(&s->smmu_state); + smmu_iotlb_inv_asid(bs, asid); ++ if (smmuv3_batch_cmds(bs, &batch, &cmd, &q->cons, false)) { ++ cmd_error = SMMU_CERROR_ILL; ++ break; ++ } + break; + } + case SMMU_CMD_TLBI_NH_ALL: +@@ -1489,6 +1569,11 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) + trace_smmuv3_cmdq_tlbi_nh(); + smmu_inv_notifiers_all(&s->smmu_state); + smmu_iotlb_inv_all(bs); ++ ++ if (smmuv3_batch_cmds(bs, &batch, &cmd, &q->cons, false)) { ++ cmd_error = SMMU_CERROR_ILL; ++ break; ++ } + break; + case SMMU_CMD_TLBI_NH_VAA: + case SMMU_CMD_TLBI_NH_VA: +@@ -1497,7 +1582,24 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) + break; + } + smmuv3_range_inval(bs, &cmd); ++ ++ if (smmuv3_batch_cmds(bs, &batch, &cmd, &q->cons, false)) { ++ cmd_error = SMMU_CERROR_ILL; ++ break; ++ } + break; ++ case SMMU_CMD_ATC_INV: ++ { ++ SMMUDevice *sdev = smmu_find_sdev(bs, CMD_SID(&cmd)); ++ ++ if (sdev->s1_hwpt) { ++ if (smmuv3_batch_cmds(sdev->smmu, &batch, &cmd, &q->cons, true)) { ++ cmd_error = SMMU_CERROR_ILL; ++ break; ++ } ++ } ++ break; ++ } + case SMMU_CMD_TLBI_S12_VMALL: + { + uint16_t vmid = CMD_VMID(&cmd); +@@ -1529,7 +1631,6 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) + case SMMU_CMD_TLBI_EL2_ASID: + case SMMU_CMD_TLBI_EL2_VA: + case SMMU_CMD_TLBI_EL2_VAA: +- case SMMU_CMD_ATC_INV: + case SMMU_CMD_PRI_RESP: + case SMMU_CMD_RESUME: + case SMMU_CMD_STALL_TERM: +@@ -1554,12 +1655,22 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) + */ + 
queue_cons_incr(q); + } ++ qemu_mutex_lock(&s->mutex); ++ if (!cmd_error && batch.ncmds && bs->viommu) { ++ if (smmuv3_issue_cmd_batch(bs, &batch)) { ++ q->cons = batch.cons[batch.ncmds]; ++ cmd_error = SMMU_CERROR_ILL; ++ } ++ } ++ qemu_mutex_unlock(&s->mutex); + + if (cmd_error) { + trace_smmuv3_cmdq_consume_error(smmu_cmd_string(type), cmd_error); + smmu_write_cmdq_err(s, cmd_error); + smmuv3_trigger_irq(s, SMMU_IRQ_GERROR, R_GERROR_CMDQ_ERR_MASK); + } ++ g_free(batch.cmds); ++ g_free(batch.cons); + + trace_smmuv3_cmdq_consume_out(Q_PROD(q), Q_CONS(q), + Q_PROD_WRAP(q), Q_CONS_WRAP(q)); +-- +2.41.0.windows.1 + diff --git a/hw-arm-smmuv3-Ignore-IOMMU_NOTIFIER_MAP-for-nested-s.patch b/hw-arm-smmuv3-Ignore-IOMMU_NOTIFIER_MAP-for-nested-s.patch new file mode 100644 index 0000000000000000000000000000000000000000..5705fffb83c377b9c07fd5af4d5ad2fcf45230ea --- /dev/null +++ b/hw-arm-smmuv3-Ignore-IOMMU_NOTIFIER_MAP-for-nested-s.patch @@ -0,0 +1,43 @@ +From 9f3b8c283d4c1014ff292faddb78bbbfd7ec22d3 Mon Sep 17 00:00:00 2001 +From: Nicolin Chen +Date: Tue, 9 Apr 2024 01:49:26 +0000 +Subject: [PATCH] hw/arm/smmuv3: Ignore IOMMU_NOTIFIER_MAP for nested-smmuv3 + +If a device's MemoryRegion type is iommu, vfio core registers a listener, +passing the IOMMU_NOTIFIER_IOTLB_EVENTS flag (bundle of IOMMU_NOTIFIER_MAP +and IOMMU_NOTIFIER_UNMAP). + +On the other hand, nested SMMUv3 does not use a map notifier. And it would +only insert an IOTLB entry for MSI doorbell page mapping, which can simply +be done by the mr->translate call. + +Ignore the IOMMU_NOTIFIER_MAP flag and drop the error out.
+ +Signed-off-by: Nicolin Chen +--- + hw/arm/smmuv3.c | 9 +++------ + 1 file changed, 3 insertions(+), 6 deletions(-) + +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index 64ca4c5542..db111220c7 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -1881,12 +1881,9 @@ static int smmuv3_notify_flag_changed(IOMMUMemoryRegion *iommu, + return -EINVAL; + } + +- if (new & IOMMU_NOTIFIER_MAP) { +- error_setg(errp, +- "device %02x.%02x.%x requires iommu MAP notifier which is " +- "not currently supported", pci_bus_num(sdev->bus), +- PCI_SLOT(sdev->devfn), PCI_FUNC(sdev->devfn)); +- return -EINVAL; ++ /* nested-smmuv3 does not need IOMMU_NOTIFIER_MAP. Ignore it. */ ++ if (s->nested) { ++ new &= ~IOMMU_NOTIFIER_MAP; + } + + if (old == IOMMU_NOTIFIER_NONE) { +-- +2.41.0.windows.1 + diff --git a/hw-arm-smmuv3-Read-host-SMMU-device-info.patch b/hw-arm-smmuv3-Read-host-SMMU-device-info.patch new file mode 100644 index 0000000000000000000000000000000000000000..6363902bd11f9886b63d3d32381d0c3b066c4633 --- /dev/null +++ b/hw-arm-smmuv3-Read-host-SMMU-device-info.patch @@ -0,0 +1,135 @@ +From 03964c037862a594b4eb7d2e3754acd32c01c80b Mon Sep 17 00:00:00 2001 +From: Nicolin Chen +Date: Thu, 22 Sep 2022 14:06:07 -0700 +Subject: [PATCH] hw/arm/smmuv3: Read host SMMU device info + +Read the underlying SMMU device info and set corresponding IDR bits. 
+ +Signed-off-by: Nicolin Chen +--- + hw/arm/smmuv3.c | 77 ++++++++++++++++++++++++++++++++++++ + hw/arm/trace-events | 1 + + include/hw/arm/smmu-common.h | 1 + + 3 files changed, 79 insertions(+) + +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index db111220c7..4208325ab3 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -254,6 +254,80 @@ void smmuv3_record_event(SMMUv3State *s, SMMUEventInfo *info) + info->recorded = true; + } + ++static void smmuv3_nested_init_regs(SMMUv3State *s) ++{ ++ SMMUState *bs = ARM_SMMU(s); ++ SMMUDevice *sdev; ++ uint32_t data_type; ++ uint32_t val; ++ int ret; ++ ++ if (!bs->nested || !bs->viommu) { ++ return; ++ } ++ ++ sdev = QLIST_FIRST(&bs->viommu->device_list); ++ if (!sdev) { ++ return; ++ } ++ ++ if (sdev->info.idr[0]) { ++ error_report("reusing the previous hw_info"); ++ goto out; ++ } ++ ++ ret = smmu_dev_get_info(sdev, &data_type, sizeof(sdev->info), &sdev->info); ++ if (ret) { ++ error_report("failed to get SMMU device info"); ++ return; ++ } ++ ++ if (data_type != IOMMU_HW_INFO_TYPE_ARM_SMMUV3) { ++ error_report( "Wrong data type (%d)!", data_type); ++ return; ++ } ++ ++out: ++ trace_smmuv3_get_device_info(sdev->info.idr[0], sdev->info.idr[1], ++ sdev->info.idr[3], sdev->info.idr[5]); ++ ++ val = FIELD_EX32(sdev->info.idr[0], IDR0, BTM); ++ s->idr[0] = FIELD_DP32(s->idr[0], IDR0, BTM, val); ++ val = FIELD_EX32(sdev->info.idr[0], IDR0, ATS); ++ s->idr[0] = FIELD_DP32(s->idr[0], IDR0, ATS, val); ++ val = FIELD_EX32(sdev->info.idr[0], IDR0, ASID16); ++ s->idr[0] = FIELD_DP32(s->idr[0], IDR0, ASID16, val); ++ val = FIELD_EX32(sdev->info.idr[0], IDR0, TERM_MODEL); ++ s->idr[0] = FIELD_DP32(s->idr[0], IDR0, TERM_MODEL, val); ++ val = FIELD_EX32(sdev->info.idr[0], IDR0, STALL_MODEL); ++ s->idr[0] = FIELD_DP32(s->idr[0], IDR0, STALL_MODEL, val); ++ val = FIELD_EX32(sdev->info.idr[0], IDR0, STLEVEL); ++ s->idr[0] = FIELD_DP32(s->idr[0], IDR0, STLEVEL, val); ++ ++ val = FIELD_EX32(sdev->info.idr[1], IDR1, SIDSIZE); ++ 
s->idr[1] = FIELD_DP32(s->idr[1], IDR1, SIDSIZE, val); ++ val = FIELD_EX32(sdev->info.idr[1], IDR1, SSIDSIZE); ++ s->idr[1] = FIELD_DP32(s->idr[1], IDR1, SSIDSIZE, val); ++ ++ val = FIELD_EX32(sdev->info.idr[3], IDR3, HAD); ++ s->idr[3] = FIELD_DP32(s->idr[3], IDR3, HAD, val); ++ val = FIELD_EX32(sdev->info.idr[3], IDR3, RIL); ++ s->idr[3] = FIELD_DP32(s->idr[3], IDR3, RIL, val); ++ val = FIELD_EX32(sdev->info.idr[3], IDR3, BBML); ++ s->idr[3] = FIELD_DP32(s->idr[3], IDR3, BBML, val); ++ ++ val = FIELD_EX32(sdev->info.idr[5], IDR5, GRAN4K); ++ s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN4K, val); ++ val = FIELD_EX32(sdev->info.idr[5], IDR5, GRAN16K); ++ s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN16K, val); ++ val = FIELD_EX32(sdev->info.idr[5], IDR5, GRAN64K); ++ s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN64K, val); ++ val = FIELD_EX32(sdev->info.idr[5], IDR5, OAS); ++ s->idr[5] = FIELD_DP32(s->idr[5], IDR5, OAS, val); ++ ++ /* FIXME check iidr and aidr registrs too */ ++} ++ + static void smmuv3_init_regs(SMMUv3State *s) + { + /* Based on sys property, the stages supported in smmu will be advertised.*/ +@@ -292,6 +366,9 @@ static void smmuv3_init_regs(SMMUv3State *s) + s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN16K, 1); + s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN64K, 1); + ++ /* Override IDR fields with HW caps */ ++ smmuv3_nested_init_regs(s); ++ + s->cmdq.base = deposit64(s->cmdq.base, 0, 5, SMMU_CMDQS); + s->cmdq.prod = 0; + s->cmdq.cons = 0; +diff --git a/hw/arm/trace-events b/hw/arm/trace-events +index 58e0636e95..1e3d86382d 100644 +--- a/hw/arm/trace-events ++++ b/hw/arm/trace-events +@@ -55,5 +55,6 @@ smmuv3_cmdq_tlbi_s12_vmid(uint16_t vmid) "vmid=%d" + smmuv3_config_cache_inv(uint32_t sid) "Config cache INV for sid=0x%x" + smmuv3_notify_flag_add(const char *iommu) "ADD SMMUNotifier node for iommu mr=%s" + smmuv3_notify_flag_del(const char *iommu) "DEL SMMUNotifier node for iommu mr=%s" ++smmuv3_get_device_info(uint32_t idr0, uint32_t idr1, uint32_t 
idr3, uint32_t idr5) "idr0=0x%x idr1=0x%x idr3=0x%x idr5=0x%x" + smmuv3_inv_notifiers_iova(const char *name, uint16_t asid, uint16_t vmid, uint64_t iova, uint8_t tg, uint64_t num_pages) "iommu mr=%s asid=%d vmid=%d iova=0x%"PRIx64" tg=%d num_pages=0x%"PRIx64 + +diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h +index 37dfeed026..d120c352cf 100644 +--- a/include/hw/arm/smmu-common.h ++++ b/include/hw/arm/smmu-common.h +@@ -146,6 +146,7 @@ typedef struct SMMUDevice { + AddressSpace as_sysmem; + uint32_t cfg_cache_hits; + uint32_t cfg_cache_misses; ++ struct iommu_hw_info_arm_smmuv3 info; + QLIST_ENTRY(SMMUDevice) next; + } SMMUDevice; + +-- +2.41.0.windows.1 + diff --git a/hw-arm-virt-Add-an-SMMU_IO_LEN-macro.patch b/hw-arm-virt-Add-an-SMMU_IO_LEN-macro.patch new file mode 100644 index 0000000000000000000000000000000000000000..38db82dce6b6b66beb7e76eb6f22d49c0c98c213 --- /dev/null +++ b/hw-arm-virt-Add-an-SMMU_IO_LEN-macro.patch @@ -0,0 +1,47 @@ +From a6c7b16107b506f85e6643604c923291e41f70d1 Mon Sep 17 00:00:00 2001 +From: Nicolin Chen +Date: Wed, 19 Jun 2024 04:42:33 +0000 +Subject: [PATCH] hw/arm/virt: Add an SMMU_IO_LEN macro + +A following patch will add a new MMIO region for nested SMMU instances. + +This macro will be repeatedly used to set offsets and MMIO sizes in both +virt and virt-acpi-build. 
+ +Signed-off-by: Nicolin Chen +Signed-off-by: Shameer Kolothum +--- + hw/arm/virt.c | 2 +- + include/hw/arm/virt.h | 3 +++ + 2 files changed, 4 insertions(+), 1 deletion(-) + +diff --git a/hw/arm/virt.c b/hw/arm/virt.c +index 8823f2ed1c..08c40c314b 100644 +--- a/hw/arm/virt.c ++++ b/hw/arm/virt.c +@@ -155,7 +155,7 @@ static const MemMapEntry base_memmap[] = { + [VIRT_FW_CFG] = { 0x09020000, 0x00000018 }, + [VIRT_GPIO] = { 0x09030000, 0x00001000 }, + [VIRT_SECURE_UART] = { 0x09040000, 0x00001000 }, +- [VIRT_SMMU] = { 0x09050000, 0x00020000 }, ++ [VIRT_SMMU] = { 0x09050000, SMMU_IO_LEN }, + [VIRT_PCDIMM_ACPI] = { 0x09070000, MEMORY_HOTPLUG_IO_LEN }, + [VIRT_ACPI_GED] = { 0x09080000, ACPI_GED_EVT_SEL_LEN }, + [VIRT_NVDIMM_ACPI] = { 0x09090000, NVDIMM_ACPI_IO_LEN}, +diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h +index 345b2d5594..e6a449becd 100644 +--- a/include/hw/arm/virt.h ++++ b/include/hw/arm/virt.h +@@ -106,6 +106,9 @@ typedef enum { + ARM_L3_CACHE + } ArmCacheType; + ++/* MMIO region size for SMMUv3 */ ++#define SMMU_IO_LEN 0x20000 ++ + enum { + VIRT_FLASH, + VIRT_MEM, +-- +2.41.0.windows.1 + diff --git a/hw-arm-virt-acpi-build-Add-IORT-RMR-regions-to-handl.patch b/hw-arm-virt-acpi-build-Add-IORT-RMR-regions-to-handl.patch new file mode 100644 index 0000000000000000000000000000000000000000..6c0d8de2c6075c2941bb86ed8da47a0ebfc6874e --- /dev/null +++ b/hw-arm-virt-acpi-build-Add-IORT-RMR-regions-to-handl.patch @@ -0,0 +1,187 @@ +From 1746ba1aee671b9552540e36a629988b00846a82 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Tue, 5 Oct 2021 10:53:13 +0200 +Subject: [PATCH] hw/arm/virt-acpi-build: Add IORT RMR regions to handle MSI + nested binding + +To handle SMMUv3 nested stage support it is practical to +expose the guest with reserved memory regions (RMRs) +covering the IOVAs used by the host kernel to map +physical MSI doorbells. 
+ +Those IOVAs belong to [0x8000000, 0x8100000] matching +MSI_IOVA_BASE and MSI_IOVA_LENGTH definitions in kernel +arm-smmu-v3 driver. This is the window used to allocate +IOVAs matching physical MSI doorbells. + +With those RMRs, the guest is forced to use a flat mapping +for this range. Hence the assigned device is programmed +with one IOVA from this range. Stage 1, owned by the guest +has a flat mapping for this IOVA. Stage 2, owned by the VMM +then enforces a mapping from this IOVA to the physical +MSI doorbell. + +The creation of those RMR nodes is only relevant if nested +stage SMMU is in use, along with VFIO. As VFIO devices can be +hotplugged, all RMRs need to be created in advance. Hence +the patch introduces a new arm virt "nested-smmuv3" iommu type. + +ARM DEN 0049E.b IORT specification also mandates that when +RMRs are present, the OS must preserve PCIe configuration +performed by the boot FW. So along with the RMR IORT nodes, +a _DSM function #5, as defined by PCI FIRMWARE SPECIFICATION +REVISION 3.3, chapter 4.6.5 is added to PCIe host bridge +and PCIe expander bridge objects. 
+ +Signed-off-by: Eric Auger +Suggested-by: Jean-Philippe Brucker +Signed-off-by: Nicolin Chen +Signed-off-by: Shameer Kolothum +--- + hw/arm/virt-acpi-build.c | 71 +++++++++++++++++++++++++++++++++++----- + 1 file changed, 63 insertions(+), 8 deletions(-) + +diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c +index 1d7839e4a0..ad0f79e03d 100644 +--- a/hw/arm/virt-acpi-build.c ++++ b/hw/arm/virt-acpi-build.c +@@ -417,6 +417,14 @@ static void acpi_dsdt_add_pci(Aml *scope, const MemMapEntry *memmap, + .bus = vms->bus, + }; + ++ /* ++ * Nested SMMU requires RMRs for MSI 1-1 mapping, which ++ * require _DSM for PreservingPCI Boot Configurations ++ */ ++ if (vms->iommu == VIRT_IOMMU_SMMUV3_NESTED) { ++ cfg.preserve_config = true; ++ } ++ + if (vms->highmem_mmio) { + cfg.mmio64 = memmap[VIRT_HIGH_PCIE_MMIO]; + } +@@ -495,7 +503,7 @@ static void acpi_dsdt_add_tpm(Aml *scope, VirtMachineState *vms) + #define IORT_NODE_OFFSET 48 + + static void build_iort_id_mapping(GArray *table_data, uint32_t input_base, +- uint32_t id_count, uint32_t out_ref) ++ uint32_t id_count, uint32_t out_ref, uint32_t flags) + { + /* Table 4 ID mapping format */ + build_append_int_noprefix(table_data, input_base, 4); /* Input base */ +@@ -503,7 +511,7 @@ static void build_iort_id_mapping(GArray *table_data, uint32_t input_base, + build_append_int_noprefix(table_data, input_base, 4); /* Output base */ + build_append_int_noprefix(table_data, out_ref, 4); /* Output Reference */ + /* Flags */ +- build_append_int_noprefix(table_data, 0 /* Single mapping (disabled) */, 4); ++ build_append_int_noprefix(table_data, flags, 4); /* Flags */ + } + + struct AcpiIortIdMapping { +@@ -545,6 +553,50 @@ static int iort_idmap_compare(gconstpointer a, gconstpointer b) + return idmap_a->input_base - idmap_b->input_base; + } + ++static void ++build_iort_rmr_nodes(GArray *table_data, GArray *smmu_idmaps, ++ size_t *smmu_offset, uint32_t *id) ++{ ++ AcpiIortIdMapping *range; ++ int i; ++ ++ for (i = 0; i < 
smmu_idmaps->len; i++) { ++ range = &g_array_index(smmu_idmaps, AcpiIortIdMapping, i); ++ int bdf = range->input_base; ++ ++ /* Table 18 Reserved Memory Range Node */ ++ ++ build_append_int_noprefix(table_data, 6 /* RMR */, 1); /* Type */ ++ /* Length */ ++ build_append_int_noprefix(table_data, 28 + ID_MAPPING_ENTRY_SIZE + 20, 2); ++ build_append_int_noprefix(table_data, 3, 1); /* Revision */ ++ build_append_int_noprefix(table_data, *id, 4); /* Identifier */ ++ /* Number of ID mappings */ ++ build_append_int_noprefix(table_data, 1, 4); ++ /* Reference to ID Array */ ++ build_append_int_noprefix(table_data, 28, 4); ++ ++ /* RMR specific data */ ++ ++ /* Flags */ ++ build_append_int_noprefix(table_data, 0 /* Disallow remapping */, 4); ++ /* Number of Memory Range Descriptors */ ++ build_append_int_noprefix(table_data, 1 , 4); ++ /* Reference to Memory Range Descriptors */ ++ build_append_int_noprefix(table_data, 28 + ID_MAPPING_ENTRY_SIZE, 4); ++ build_iort_id_mapping(table_data, bdf, range->id_count, smmu_offset[i], 1); ++ ++ /* Table 19 Memory Range Descriptor */ ++ ++ /* Physical Range offset */ ++ build_append_int_noprefix(table_data, 0x8000000, 8); ++ /* Physical Range length */ ++ build_append_int_noprefix(table_data, 0x100000, 8); ++ build_append_int_noprefix(table_data, 0, 4); /* Reserved */ ++ *id += 1; ++ } ++} ++ + /* + * Input Output Remapping Table (IORT) + * Conforms to "IO Remapping Table System Software on ARM Platforms", +@@ -554,7 +606,6 @@ static void + build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) + { + int i, nb_nodes, rc_mapping_count; +- const uint32_t iort_node_offset = IORT_NODE_OFFSET; + size_t node_size, *smmu_offset; + AcpiIortIdMapping *idmap; + hwaddr base; +@@ -563,7 +614,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) + GArray *smmu_idmaps = g_array_new(false, true, sizeof(AcpiIortIdMapping)); + GArray *its_idmaps = g_array_new(false, true, sizeof(AcpiIortIdMapping)); + +- 
AcpiTable table = { .sig = "IORT", .rev = 3, .oem_id = vms->oem_id, ++ AcpiTable table = { .sig = "IORT", .rev = 5, .oem_id = vms->oem_id, + .oem_table_id = vms->oem_table_id }; + /* Table 2 The IORT */ + acpi_table_begin(&table, table_data); +@@ -668,7 +719,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) + build_append_int_noprefix(table_data, 0, 4); + + /* output IORT node is the ITS group node (the first node) */ +- build_iort_id_mapping(table_data, 0, 0x10000, IORT_NODE_OFFSET); ++ build_iort_id_mapping(table_data, 0, 0x10000, IORT_NODE_OFFSET, 0); + } + + /* Table 17 Root Complex Node */ +@@ -709,7 +760,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) + range = &g_array_index(smmu_idmaps, AcpiIortIdMapping, i); + /* output IORT node is the smmuv3 node */ + build_iort_id_mapping(table_data, range->input_base, +- range->id_count, smmu_offset[i]); ++ range->id_count, smmu_offset[i], 0); + } + + /* bypassed RIDs connect to ITS group node directly: RC -> ITS */ +@@ -717,11 +768,15 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) + range = &g_array_index(its_idmaps, AcpiIortIdMapping, i); + /* output IORT node is the ITS group node (the first node) */ + build_iort_id_mapping(table_data, range->input_base, +- range->id_count, iort_node_offset); ++ range->id_count, IORT_NODE_OFFSET, 0); + } + } else { + /* output IORT node is the ITS group node (the first node) */ +- build_iort_id_mapping(table_data, 0, 0xFFFF, IORT_NODE_OFFSET); ++ build_iort_id_mapping(table_data, 0, 0x10000, IORT_NODE_OFFSET, 0); ++ } ++ ++ if (vms->iommu == VIRT_IOMMU_SMMUV3_NESTED) { ++ build_iort_rmr_nodes(table_data, smmu_idmaps, smmu_offset, &id); + } + + acpi_table_end(linker, &table); +-- +2.41.0.windows.1 + diff --git a/hw-arm-virt-acpi-build-Build-IORT-with-multiple-SMMU.patch b/hw-arm-virt-acpi-build-Build-IORT-with-multiple-SMMU.patch new file mode 100644 index 
0000000000000000000000000000000000000000..3451d6d5a700e010f714da317e431347599141c5 --- /dev/null +++ b/hw-arm-virt-acpi-build-Build-IORT-with-multiple-SMMU.patch @@ -0,0 +1,155 @@ +From a7ffb5856940a1515ef84a4d4644b7c7c07afb8f Mon Sep 17 00:00:00 2001 +From: Nicolin Chen +Date: Wed, 6 Nov 2024 19:22:13 +0000 +Subject: [PATCH] hw/arm/virt-acpi-build: Build IORT with multiple SMMU nodes +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Now that we can have multiple user-creatable smmuv3-nested +devices, each associated with different pci buses, update +IORT ID mappings accordingly. + +Signed-off-by: Nicolin Chen +Signed-off-by: Shameer Kolothum +--- + hw/arm/virt-acpi-build.c | 43 ++++++++++++++++++++++++++++------------ + include/hw/arm/virt.h | 6 ++++++ + 2 files changed, 36 insertions(+), 13 deletions(-) + +diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c +index 076781423b..1d7839e4a0 100644 +--- a/hw/arm/virt-acpi-build.c ++++ b/hw/arm/virt-acpi-build.c +@@ -555,8 +555,10 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) + { + int i, nb_nodes, rc_mapping_count; + const uint32_t iort_node_offset = IORT_NODE_OFFSET; +- size_t node_size, smmu_offset = 0; ++ size_t node_size, *smmu_offset; + AcpiIortIdMapping *idmap; ++ hwaddr base; ++ int irq, num_smmus = 0; + uint32_t id = 0; + GArray *smmu_idmaps = g_array_new(false, true, sizeof(AcpiIortIdMapping)); + GArray *its_idmaps = g_array_new(false, true, sizeof(AcpiIortIdMapping)); +@@ -566,7 +568,21 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) + /* Table 2 The IORT */ + acpi_table_begin(&table, table_data); + +- if (vms->iommu == VIRT_IOMMU_SMMUV3) { ++ if (vms->smmu_nested_count) { ++ irq = vms->irqmap[VIRT_SMMU_NESTED] + ARM_SPI_BASE; ++ base = vms->memmap[VIRT_SMMU_NESTED].base; ++ num_smmus = vms->smmu_nested_count; ++ } else if (virt_has_smmuv3(vms)) { ++ irq = vms->irqmap[VIRT_SMMU] + 
ARM_SPI_BASE; ++ base = vms->memmap[VIRT_SMMU].base; ++ num_smmus = 1; ++ } ++ ++ smmu_offset = g_new0(size_t, num_smmus); ++ nb_nodes = 2; /* RC, ITS */ ++ nb_nodes += num_smmus; /* SMMU nodes */ ++ ++ if (virt_has_smmuv3(vms)) { + AcpiIortIdMapping next_range = {0}; + + object_child_foreach_recursive(object_get_root(), +@@ -588,18 +604,19 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) + } + + next_range.input_base = idmap->input_base + idmap->id_count; ++ if (vms->iommu == VIRT_IOMMU_SMMUV3_NESTED) { ++ nb_nodes++; /* RMR node per SMMU */ ++ } + } + + /* Append the last RC -> ITS ID mapping */ +- if (next_range.input_base < 0xFFFF) { +- next_range.id_count = 0xFFFF - next_range.input_base; ++ if (next_range.input_base < 0x10000) { ++ next_range.id_count = 0x10000 - next_range.input_base; + g_array_append_val(its_idmaps, next_range); + } + +- nb_nodes = 3; /* RC, ITS, SMMUv3 */ + rc_mapping_count = smmu_idmaps->len + its_idmaps->len; + } else { +- nb_nodes = 2; /* RC, ITS */ + rc_mapping_count = 1; + } + /* Number of IORT Nodes */ +@@ -621,10 +638,9 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) + /* GIC ITS Identifier Array */ + build_append_int_noprefix(table_data, 0 /* MADT translation_id */, 4); + +- if (vms->iommu == VIRT_IOMMU_SMMUV3) { +- int irq = vms->irqmap[VIRT_SMMU] + ARM_SPI_BASE; ++ for (i = 0; i < num_smmus; i++) { ++ smmu_offset[i] = table_data->len - table.table_offset; + +- smmu_offset = table_data->len - table.table_offset; + /* Table 9 SMMUv3 Format */ + build_append_int_noprefix(table_data, 4 /* SMMUv3 */, 1); /* Type */ + node_size = SMMU_V3_ENTRY_SIZE + ID_MAPPING_ENTRY_SIZE; +@@ -635,7 +651,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) + /* Reference to ID Array */ + build_append_int_noprefix(table_data, SMMU_V3_ENTRY_SIZE, 4); + /* Base address */ +- build_append_int_noprefix(table_data, vms->memmap[VIRT_SMMU].base, 8); ++ 
build_append_int_noprefix(table_data, base + (i * SMMU_IO_LEN), 8); + /* Flags */ + build_append_int_noprefix(table_data, 1 /* COHACC Override */, 4); + build_append_int_noprefix(table_data, 0, 4); /* Reserved */ +@@ -646,12 +662,13 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) + build_append_int_noprefix(table_data, irq + 1, 4); /* PRI */ + build_append_int_noprefix(table_data, irq + 3, 4); /* GERR */ + build_append_int_noprefix(table_data, irq + 2, 4); /* Sync */ ++ irq += NUM_SMMU_IRQS; + build_append_int_noprefix(table_data, 0, 4); /* Proximity domain */ + /* DeviceID mapping index (ignored since interrupts are GSIV based) */ + build_append_int_noprefix(table_data, 0, 4); + + /* output IORT node is the ITS group node (the first node) */ +- build_iort_id_mapping(table_data, 0, 0xFFFF, IORT_NODE_OFFSET); ++ build_iort_id_mapping(table_data, 0, 0x10000, IORT_NODE_OFFSET); + } + + /* Table 17 Root Complex Node */ +@@ -684,7 +701,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) + build_append_int_noprefix(table_data, 0, 3); /* Reserved */ + + /* Output Reference */ +- if (vms->iommu == VIRT_IOMMU_SMMUV3) { ++ if (virt_has_smmuv3(vms)) { + AcpiIortIdMapping *range; + + /* translated RIDs connect to SMMUv3 node: RC -> SMMUv3 -> ITS */ +@@ -692,7 +709,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) + range = &g_array_index(smmu_idmaps, AcpiIortIdMapping, i); + /* output IORT node is the smmuv3 node */ + build_iort_id_mapping(table_data, range->input_base, +- range->id_count, smmu_offset); ++ range->id_count, smmu_offset[i]); + } + + /* bypassed RIDs connect to ITS group node directly: RC -> ITS */ +diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h +index cd41e28202..bc3c8b70da 100644 +--- a/include/hw/arm/virt.h ++++ b/include/hw/arm/virt.h +@@ -295,4 +295,10 @@ static inline int virt_gicv3_redist_region_count(VirtMachineState *vms) + vms->highmem_redists) ? 
2 : 1; + } + ++static inline bool virt_has_smmuv3(const VirtMachineState *vms) ++{ ++ return vms->iommu == VIRT_IOMMU_SMMUV3 || ++ vms->iommu == VIRT_IOMMU_SMMUV3_NESTED; ++} ++ + #endif /* QEMU_ARM_VIRT_H */ +-- +2.41.0.windows.1 + diff --git a/hw-i386-Activate-IOMMUFD-for-q35-machines.patch b/hw-i386-Activate-IOMMUFD-for-q35-machines.patch new file mode 100644 index 0000000000000000000000000000000000000000..23ca9aae29d69f0cb1d7d4e04a23c3f819710538 --- /dev/null +++ b/hw-i386-Activate-IOMMUFD-for-q35-machines.patch @@ -0,0 +1,33 @@ +From 5405fa36c5f2784a9a6b19ee60d44b6cffb9f769 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= +Date: Sat, 11 Jan 2025 10:52:57 +0800 +Subject: [PATCH] hw/i386: Activate IOMMUFD for q35 machines +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhenzhong Duan +Reviewed-by: Eric Auger +Tested-by: Nicolin Chen +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhou Wang +--- + hw/i386/Kconfig | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/hw/i386/Kconfig b/hw/i386/Kconfig +index 682e324f1c..908f29e02b 100644 +--- a/hw/i386/Kconfig ++++ b/hw/i386/Kconfig +@@ -105,6 +105,7 @@ config Q35 + imply E1000E_PCI_EXPRESS + imply VMPORT + imply VMMOUSE ++ imply IOMMUFD + select PC_PCI + select PC_ACPI + select PCI_EXPRESS_Q35 +-- +2.41.0.windows.1 + diff --git a/hw-misc-aspeed_hace-Fix-buffer-overflow-in-has_paddi.patch b/hw-misc-aspeed_hace-Fix-buffer-overflow-in-has_paddi.patch new file mode 100644 index 0000000000000000000000000000000000000000..379be17fe7897ccc8d737622e486d4a466349d8a --- /dev/null +++ b/hw-misc-aspeed_hace-Fix-buffer-overflow-in-has_paddi.patch @@ -0,0 +1,50 @@ +From 7e1bd6e7e109c6228bc4c40ea6f2af2d7f281fca Mon Sep 17 00:00:00 2001 +From: qihao_yewu +Date: Tue, 8 Apr 2025 05:59:29 -0400 +Subject: [PATCH] hw/misc/aspeed_hace: Fix buffer overflow in has_padding + function +MIME-Version: 1.0 +Content-Type: 
text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +cherry-pick from 78877b2e06464f49f777e086845e094ea7bc82ef + +The maximum padding size is either 64 or 128 bytes and should always be smaller +than "req_len". If "padding_size" exceeds "req_len", then +"req_len - padding_size" underflows due to "uint32_t" data type, leading to a +large incorrect value (e.g., `0xFFXXXXXX`). This causes an out-of-bounds memory +access, potentially leading to a buffer overflow. + +Added a check to ensure "padding_size" does not exceed "req_len" before +computing "pad_offset". This prevents "req_len - padding_size" from underflowing +and avoids accessing invalid memory. + +Signed-off-by: Jamin Lin +Reviewed-by: Cédric Le Goater +Fixes: 5cd7d8564a8b563da724b9e6264c967f0a091afa ("aspeed/hace: Support AST2600 HACE ") +Link: https://lore.kernel.org/qemu-devel/20250321092623.2097234-3-jamin_lin@aspeedtech.com +Signed-off-by: Cédric Le Goater +Signed-off-by: qihao_yewu +--- + hw/misc/aspeed_hace.c | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/hw/misc/aspeed_hace.c b/hw/misc/aspeed_hace.c +index b07506ec04..8706e3d376 100644 +--- a/hw/misc/aspeed_hace.c ++++ b/hw/misc/aspeed_hace.c +@@ -123,6 +123,11 @@ static bool has_padding(AspeedHACEState *s, struct iovec *iov, + if (*total_msg_len <= s->total_req_len) { + uint32_t padding_size = s->total_req_len - *total_msg_len; + uint8_t *padding = iov->iov_base; ++ ++ if (padding_size > req_len) { ++ return false; ++ } ++ + *pad_offset = req_len - padding_size; + if (padding[*pad_offset] == 0x80) { + return true; +-- +2.41.0.windows.1 + diff --git a/hw-nvme-fix-invalid-check-on-mcl.patch b/hw-nvme-fix-invalid-check-on-mcl.patch new file mode 100644 index 0000000000000000000000000000000000000000..cefd6c18b37ddeff1679c0b311760d36ad9a8200 --- /dev/null +++ b/hw-nvme-fix-invalid-check-on-mcl.patch @@ -0,0 +1,36 @@ +From 43fdaaa492ea10ab0e90ec4cc68ec45aed1d415c Mon Sep 17 00:00:00 2001 +From: gubin +Date: Sat, 22 Mar 2025 15:20:27
+0800 +Subject: [PATCH] hw/nvme: fix invalid check on mcl + +cherry-pick from 8c78015a55d84c016da6d5e41b6b5f618ecb25ab + +The number of logical blocks within a source range is converted into a +1s based number at the time of parsing. However, when verifying the copy +length we add one again, causing the check against MCL to fail in error. + +Cc: qemu-stable@nongnu.org +Fixes: 381ab99d8587 ("hw/nvme: check maximum copy length (MCL) for COPY") +Reviewed-by: Minwoo Im +Signed-off-by: Klaus Jensen +Signed-off-by: gubin +--- + hw/nvme/ctrl.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c +index 29445938d5..407004b2f7 100644 +--- a/hw/nvme/ctrl.c ++++ b/hw/nvme/ctrl.c +@@ -2863,7 +2863,7 @@ static inline uint16_t nvme_check_copy_mcl(NvmeNamespace *ns, + uint32_t nlb; + nvme_copy_source_range_parse(iocb->ranges, idx, iocb->format, NULL, + &nlb, NULL, NULL, NULL); +- copy_len += nlb + 1; ++ copy_len += nlb; + } + + if (copy_len > ns->id_ns.mcl) { +-- +2.41.0.windows.1 + diff --git a/hw-nvme-fix-invalid-endian-conversion.patch b/hw-nvme-fix-invalid-endian-conversion.patch new file mode 100644 index 0000000000000000000000000000000000000000..8f236bd9c022664dc96efb5e6221b7d6135957ab --- /dev/null +++ b/hw-nvme-fix-invalid-endian-conversion.patch @@ -0,0 +1,42 @@ +From 6de964bac51139ef24f43bde56933cd8eafaf317 Mon Sep 17 00:00:00 2001 +From: gubin +Date: Sat, 22 Mar 2025 15:25:39 +0800 +Subject: [PATCH] hw/nvme: fix invalid endian conversion +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +cherry-pick from d2b5bb860e6c17442ad95cc275feb07c1665be5c + +numcntl is one byte and so is max_vfs. Using cpu_to_le16 on big endian +hosts results in numcntl being set to 0. + +Fix by dropping the endian conversion. 
+ +Fixes: 99f48ae7ae ("hw/nvme: Add support for Secondary Controller List") +Reported-by: Kevin Wolf +Signed-off-by: Klaus Jensen +Reviewed-by: Minwoo Im +Message-ID: <20240222-fix-sriov-numcntl-v1-1-d60bea5e72d0@samsung.com> +Signed-off-by: Philippe Mathieu-Daudé +Signed-off-by: gubin +--- + hw/nvme/ctrl.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c +index 29445938d5..9410344844 100644 +--- a/hw/nvme/ctrl.c ++++ b/hw/nvme/ctrl.c +@@ -7928,7 +7928,7 @@ static void nvme_init_state(NvmeCtrl *n) + n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1); + QTAILQ_INIT(&n->aer_queue); + +- list->numcntl = cpu_to_le16(max_vfs); ++ list->numcntl = max_vfs; + for (i = 0; i < max_vfs; i++) { + sctrl = &list->sec[i]; + sctrl->pcid = cpu_to_le16(n->cntlid); +-- +2.41.0.windows.1 + diff --git a/hw-pci-Introduce-helper-function-pci_device_get_iomm.patch b/hw-pci-Introduce-helper-function-pci_device_get_iomm.patch new file mode 100644 index 0000000000000000000000000000000000000000..ba4323cb5fb23594e06c75a59db09b46b8c10e94 --- /dev/null +++ b/hw-pci-Introduce-helper-function-pci_device_get_iomm.patch @@ -0,0 +1,95 @@ +From 03f9b12e33238587da36be24523911fd1b003324 Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Wed, 5 Jun 2024 16:30:38 +0800 +Subject: [PATCH] hw/pci: Introduce helper function + pci_device_get_iommu_bus_devfn() + +Extract out pci_device_get_iommu_bus_devfn() from +pci_device_iommu_address_space() to facilitate +implementation of pci_device_[set|unset]_iommu_device() +in following patch. + +No functional change intended. + +Signed-off-by: Yi Liu +Signed-off-by: Yi Sun +Signed-off-by: Nicolin Chen +Signed-off-by: Zhenzhong Duan +Reviewed-by: Eric Auger +Reviewed-by: Michael S. 
Tsirkin +--- + hw/pci/pci.c | 48 +++++++++++++++++++++++++++++++++++++++++++++--- + 1 file changed, 45 insertions(+), 3 deletions(-) + +diff --git a/hw/pci/pci.c b/hw/pci/pci.c +index 7467a2a9de..0884fbb760 100644 +--- a/hw/pci/pci.c ++++ b/hw/pci/pci.c +@@ -2681,11 +2681,27 @@ static void pci_device_class_base_init(ObjectClass *klass, void *data) + } + } + +-AddressSpace *pci_device_iommu_address_space(PCIDevice *dev) ++/* ++ * Get IOMMU root bus, aliased bus and devfn of a PCI device ++ * ++ * IOMMU root bus is needed by all call sites to call into iommu_ops. ++ * For call sites which don't need aliased BDF, passing NULL to ++ * aliased_[bus|devfn] is allowed. ++ * ++ * @piommu_bus: return root #PCIBus backed by an IOMMU for the PCI device. ++ * ++ * @aliased_bus: return aliased #PCIBus of the PCI device, optional. ++ * ++ * @aliased_devfn: return aliased devfn of the PCI device, optional. ++ */ ++static void pci_device_get_iommu_bus_devfn(PCIDevice *dev, ++ PCIBus **piommu_bus, ++ PCIBus **aliased_bus, ++ int *aliased_devfn) + { + PCIBus *bus = pci_get_bus(dev); + PCIBus *iommu_bus = bus; +- uint8_t devfn = dev->devfn; ++ int devfn = dev->devfn; + + while (iommu_bus && !iommu_bus->iommu_ops && iommu_bus->parent_dev) { + PCIBus *parent_bus = pci_get_bus(iommu_bus->parent_dev); +@@ -2726,7 +2742,33 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice *dev) + + iommu_bus = parent_bus; + } +- if (!pci_bus_bypass_iommu(bus) && iommu_bus->iommu_ops) { ++ ++ assert(0 <= devfn && devfn < PCI_DEVFN_MAX); ++ assert(iommu_bus); ++ ++ if (pci_bus_bypass_iommu(bus) || !iommu_bus->iommu_ops) { ++ iommu_bus = NULL; ++ } ++ ++ *piommu_bus = iommu_bus; ++ ++ if (aliased_bus) { ++ *aliased_bus = bus; ++ } ++ ++ if (aliased_devfn) { ++ *aliased_devfn = devfn; ++ } ++} ++ ++AddressSpace *pci_device_iommu_address_space(PCIDevice *dev) ++{ ++ PCIBus *bus; ++ PCIBus *iommu_bus; ++ int devfn; ++ ++ pci_device_get_iommu_bus_devfn(dev, &iommu_bus, &bus, &devfn); ++ if (iommu_bus) { 
+ return iommu_bus->iommu_ops->get_address_space(bus, + iommu_bus->iommu_opaque, devfn); + } +-- +2.41.0.windows.1 + diff --git a/hw-pci-Introduce-pci_device_-set-unset-_iommu_device.patch b/hw-pci-Introduce-pci_device_-set-unset-_iommu_device.patch new file mode 100644 index 0000000000000000000000000000000000000000..48f864d026beb6f1b60c12ac4f40186a0c4b4370 --- /dev/null +++ b/hw-pci-Introduce-pci_device_-set-unset-_iommu_device.patch @@ -0,0 +1,120 @@ +From 7bc73d38984460315df315d007789f87f4d11994 Mon Sep 17 00:00:00 2001 +From: Yi Liu +Date: Wed, 5 Jun 2024 16:30:39 +0800 +Subject: [PATCH] hw/pci: Introduce pci_device_[set|unset]_iommu_device() + +pci_device_[set|unset]_iommu_device() call pci_device_get_iommu_bus_devfn() +to get iommu_bus->iommu_ops and call [set|unset]_iommu_device callback to +set/unset HostIOMMUDevice for a given PCI device. + +Signed-off-by: Yi Liu +Signed-off-by: Yi Sun +Signed-off-by: Nicolin Chen +Signed-off-by: Zhenzhong Duan +Reviewed-by: Eric Auger +Reviewed-by: Michael S. 
Tsirkin +--- + hw/pci/pci.c | 27 +++++++++++++++++++++++++++ + include/hw/pci/pci.h | 38 +++++++++++++++++++++++++++++++++++++- + 2 files changed, 64 insertions(+), 1 deletion(-) + +diff --git a/hw/pci/pci.c b/hw/pci/pci.c +index 0884fbb760..d6f627aa51 100644 +--- a/hw/pci/pci.c ++++ b/hw/pci/pci.c +@@ -2775,6 +2775,33 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice *dev) + return &address_space_memory; + } + ++bool pci_device_set_iommu_device(PCIDevice *dev, HostIOMMUDevice *hiod, ++ Error **errp) ++{ ++ PCIBus *iommu_bus; ++ ++ /* set_iommu_device requires device's direct BDF instead of aliased BDF */ ++ pci_device_get_iommu_bus_devfn(dev, &iommu_bus, NULL, NULL); ++ if (iommu_bus && iommu_bus->iommu_ops->set_iommu_device) { ++ return iommu_bus->iommu_ops->set_iommu_device(pci_get_bus(dev), ++ iommu_bus->iommu_opaque, ++ dev->devfn, hiod, errp); ++ } ++ return true; ++} ++ ++void pci_device_unset_iommu_device(PCIDevice *dev) ++{ ++ PCIBus *iommu_bus; ++ ++ pci_device_get_iommu_bus_devfn(dev, &iommu_bus, NULL, NULL); ++ if (iommu_bus && iommu_bus->iommu_ops->unset_iommu_device) { ++ return iommu_bus->iommu_ops->unset_iommu_device(pci_get_bus(dev), ++ iommu_bus->iommu_opaque, ++ dev->devfn); ++ } ++} ++ + void pci_setup_iommu(PCIBus *bus, const PCIIOMMUOps *ops, void *opaque) + { + /* +diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h +index cee0cf7460..8d1af44249 100644 +--- a/include/hw/pci/pci.h ++++ b/include/hw/pci/pci.h +@@ -3,6 +3,7 @@ + + #include "exec/memory.h" + #include "sysemu/dma.h" ++#include "sysemu/host_iommu_device.h" + + /* PCI includes legacy ISA access. 
*/ + #include "hw/isa/isa.h" +@@ -384,10 +385,45 @@ typedef struct PCIIOMMUOps { + * + * @devfn: device and function number + */ +- AddressSpace * (*get_address_space)(PCIBus *bus, void *opaque, int devfn); ++ AddressSpace * (*get_address_space)(PCIBus *bus, void *opaque, int devfn); ++ /** ++ * @set_iommu_device: attach a HostIOMMUDevice to a vIOMMU ++ * ++ * Optional callback, if not implemented in vIOMMU, then vIOMMU can't ++ * retrieve host information from the associated HostIOMMUDevice. ++ * ++ * @bus: the #PCIBus of the PCI device. ++ * ++ * @opaque: the data passed to pci_setup_iommu(). ++ * ++ * @devfn: device and function number of the PCI device. ++ * ++ * @dev: the #HostIOMMUDevice to attach. ++ * ++ * @errp: pass an Error out only when return false ++ * ++ * Returns: true if HostIOMMUDevice is attached or else false with errp set. ++ */ ++ bool (*set_iommu_device)(PCIBus *bus, void *opaque, int devfn, ++ HostIOMMUDevice *dev, Error **errp); ++ /** ++ * @unset_iommu_device: detach a HostIOMMUDevice from a vIOMMU ++ * ++ * Optional callback. ++ * ++ * @bus: the #PCIBus of the PCI device. ++ * ++ * @opaque: the data passed to pci_setup_iommu(). ++ * ++ * @devfn: device and function number of the PCI device. 
++ */ ++ void (*unset_iommu_device)(PCIBus *bus, void *opaque, int devfn); + } PCIIOMMUOps; + + AddressSpace *pci_device_iommu_address_space(PCIDevice *dev); ++bool pci_device_set_iommu_device(PCIDevice *dev, HostIOMMUDevice *hiod, ++ Error **errp); ++void pci_device_unset_iommu_device(PCIDevice *dev); + + /** + * pci_setup_iommu: Initialize specific IOMMU handlers for a PCIBus +-- +2.41.0.windows.1 + diff --git a/hw-pci-host-designware-Fix-ATU_UPPER_TARGET-register.patch b/hw-pci-host-designware-Fix-ATU_UPPER_TARGET-register.patch new file mode 100644 index 0000000000000000000000000000000000000000..5660bdedb327111214a7242dd66a68749da84725 --- /dev/null +++ b/hw-pci-host-designware-Fix-ATU_UPPER_TARGET-register.patch @@ -0,0 +1,41 @@ +From c1f1346eea8da6552e085aa13630bbf5227db00f Mon Sep 17 00:00:00 2001 +From: qihao_yewu +Date: Mon, 7 Apr 2025 12:54:10 -0400 +Subject: [PATCH] hw/pci-host/designware: Fix ATU_UPPER_TARGET register access +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +cherry-pick from 04e99f9eb7920b0f0fcce65686c3bedf5e32a1f9 + +Fix copy/paste error writing to the ATU_UPPER_TARGET +register, we want to update the upper 32 bits. 
+ +Cc: qemu-stable@nongnu.org +Reported-by: Joey +Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2861 +Fixes: d64e5eabc4c ("pci: Add support for Designware IP block") +Signed-off-by: Philippe Mathieu-Daudé +Reviewed-by: Gustavo Romero +Message-Id: <20250331152041.74533-2-philmd@linaro.org> +Signed-off-by: qihao_yewu +--- + hw/pci-host/designware.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/pci-host/designware.c b/hw/pci-host/designware.c +index f477f97847..004142709c 100644 +--- a/hw/pci-host/designware.c ++++ b/hw/pci-host/designware.c +@@ -360,7 +360,7 @@ static void designware_pcie_root_config_write(PCIDevice *d, uint32_t address, + + case DESIGNWARE_PCIE_ATU_UPPER_TARGET: + viewport->target &= 0x00000000FFFFFFFFULL; +- viewport->target |= val; ++ viewport->target |= (uint64_t)val << 32; + break; + + case DESIGNWARE_PCIE_ATU_LIMIT: +-- +2.41.0.windows.1 + diff --git a/hw-pci-host-gpex-needs-kernel-fix-Allow-to-generate-.patch b/hw-pci-host-gpex-needs-kernel-fix-Allow-to-generate-.patch new file mode 100644 index 0000000000000000000000000000000000000000..cbe78ad8f1f01a3b0ce13bf1a02d5338035b323f --- /dev/null +++ b/hw-pci-host-gpex-needs-kernel-fix-Allow-to-generate-.patch @@ -0,0 +1,119 @@ +From 37308e60d43323c0ea65d734487ce6542f8a9d3b Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Tue, 5 Oct 2021 10:53:12 +0200 +Subject: [PATCH] hw/pci-host/gpex: [needs kernel fix] Allow to generate + preserve boot config DSM #5 + +Add a 'preserve_config' field in struct GPEXConfig and +if set, generate the DSM #5 for preserving PCI boot configurations. +The DSM presence is needed to expose RMRs. + +At the moment the DSM generation is not yet enabled. 
+ +Signed-off-by: Eric Auger +--- + hw/pci-host/gpex-acpi.c | 35 +++++++++++++++++++++++++++++++---- + include/hw/pci-host/gpex.h | 1 + + 2 files changed, 32 insertions(+), 4 deletions(-) + +diff --git a/hw/pci-host/gpex-acpi.c b/hw/pci-host/gpex-acpi.c +index ac5d229757..ce424fc9da 100644 +--- a/hw/pci-host/gpex-acpi.c ++++ b/hw/pci-host/gpex-acpi.c +@@ -49,9 +49,10 @@ static void acpi_dsdt_add_pci_route_table(Aml *dev, uint32_t irq) + } + } + +-static void acpi_dsdt_add_pci_osc(Aml *dev) ++static void acpi_dsdt_add_pci_osc(Aml *dev, bool preserve_config) + { + Aml *method, *UUID, *ifctx, *ifctx1, *elsectx, *buf; ++ uint8_t byte_list[1] = {0}; + + /* Declare an _OSC (OS Control Handoff) method */ + aml_append(dev, aml_name_decl("SUPP", aml_int(0))); +@@ -113,10 +114,24 @@ static void acpi_dsdt_add_pci_osc(Aml *dev) + UUID = aml_touuid("E5C937D0-3553-4D7A-9117-EA4D19C3434D"); + ifctx = aml_if(aml_equal(aml_arg(0), UUID)); + ifctx1 = aml_if(aml_equal(aml_arg(2), aml_int(0))); +- uint8_t byte_list[1] = {0}; ++ if (preserve_config) { ++ /* support for functions other than function 0 and function 5 */ ++ byte_list[0] = 0x21; ++ } + buf = aml_buffer(1, byte_list); + aml_append(ifctx1, aml_return(buf)); + aml_append(ifctx, ifctx1); ++ ++ if (preserve_config) { ++ Aml *ifctx2 = aml_if(aml_equal(aml_arg(2), aml_int(5))); ++ /* ++ * 0 - The operating system must not ignore the PCI configuration that ++ * firmware has done at boot time. 
++ */ ++ aml_append(ifctx2, aml_return(aml_int(0))); ++ aml_append(ifctx, ifctx2); ++ } ++ + aml_append(method, ifctx); + + byte_list[0] = 0; +@@ -174,6 +189,12 @@ void acpi_dsdt_add_gpex(Aml *scope, struct GPEXConfig *cfg) + aml_append(dev, aml_name_decl("_PXM", aml_int(numa_node))); + } + ++ if (cfg->preserve_config) { ++ method = aml_method("_DSM", 5, AML_SERIALIZED); ++ aml_append(method, aml_return(aml_int(0))); ++ aml_append(dev, method); ++ } ++ + acpi_dsdt_add_pci_route_table(dev, cfg->irq); + + /* +@@ -188,7 +209,7 @@ void acpi_dsdt_add_gpex(Aml *scope, struct GPEXConfig *cfg) + if (is_cxl) { + build_cxl_osc_method(dev); + } else { +- acpi_dsdt_add_pci_osc(dev); ++ acpi_dsdt_add_pci_osc(dev, cfg->preserve_config); + } + + aml_append(scope, dev); +@@ -205,6 +226,12 @@ void acpi_dsdt_add_gpex(Aml *scope, struct GPEXConfig *cfg) + aml_append(dev, aml_name_decl("_STR", aml_unicode("PCIe 0 Device"))); + aml_append(dev, aml_name_decl("_CCA", aml_int(1))); + ++ if (cfg->preserve_config) { ++ method = aml_method("_DSM", 5, AML_SERIALIZED); ++ aml_append(method, aml_return(aml_int(0))); ++ aml_append(dev, method); ++ } ++ + acpi_dsdt_add_pci_route_table(dev, cfg->irq); + + method = aml_method("_CBA", 0, AML_NOTSERIALIZED); +@@ -263,7 +290,7 @@ void acpi_dsdt_add_gpex(Aml *scope, struct GPEXConfig *cfg) + } + aml_append(dev, aml_name_decl("_CRS", rbuf)); + +- acpi_dsdt_add_pci_osc(dev); ++ acpi_dsdt_add_pci_osc(dev, cfg->preserve_config); + + Aml *dev_res0 = aml_device("%s", "RES0"); + aml_append(dev_res0, aml_name_decl("_HID", aml_string("PNP0C02"))); +diff --git a/include/hw/pci-host/gpex.h b/include/hw/pci-host/gpex.h +index b0240bd768..65475f7f9d 100644 +--- a/include/hw/pci-host/gpex.h ++++ b/include/hw/pci-host/gpex.h +@@ -64,6 +64,7 @@ struct GPEXConfig { + MemMapEntry pio; + int irq; + PCIBus *bus; ++ bool preserve_config; + }; + + int gpex_set_irq_num(GPEXHost *s, int index, int gsi); +-- +2.41.0.windows.1 + diff --git a/hw-sd-sdhci-free-irq-on-exit.patch 
b/hw-sd-sdhci-free-irq-on-exit.patch new file mode 100644 index 0000000000000000000000000000000000000000..2f8a611833f310757dd587d23e79c56652a35d87 --- /dev/null +++ b/hw-sd-sdhci-free-irq-on-exit.patch @@ -0,0 +1,46 @@ +From 3746a434596b9bc20994c869c79fb9db24227418 Mon Sep 17 00:00:00 2001 +From: qihao_yewu +Date: Mon, 7 Apr 2025 13:56:18 -0400 +Subject: [PATCH] hw/sd/sdhci: free irq on exit +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +cherry-pick from 1c2d03bb0889b7a9a677d53126fb035190683af4 + +Fix a memory leak bug in sdhci_pci_realize() due to s->irq +not being freed in sdhci_pci_exit(). + +Signed-off-by: Zheng Huang +Reviewed-by: Philippe Mathieu-Daudé +Message-ID: <09ddf42b-a6db-42d5-954b-148d09d8d6cc@gmail.com> +[PMD: Moved qemu_free_irq() call before sdhci_common_unrealize()] +Signed-off-by: Philippe Mathieu-Daudé +Signed-off-by: qihao_yewu +--- + hw/sd/sdhci-pci.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/hw/sd/sdhci-pci.c b/hw/sd/sdhci-pci.c +index 9b7bee8b3f..c1eb67cf29 100644 +--- a/hw/sd/sdhci-pci.c ++++ b/hw/sd/sdhci-pci.c +@@ -18,6 +18,7 @@ + #include "qemu/osdep.h" + #include "qapi/error.h" + #include "qemu/module.h" ++#include "hw/irq.h" + #include "hw/qdev-properties.h" + #include "hw/sd/sdhci.h" + #include "sdhci-internal.h" +@@ -49,6 +50,7 @@ static void sdhci_pci_exit(PCIDevice *dev) + { + SDHCIState *s = PCI_SDHCI(dev); + ++ qemu_free_irq(s->irq); + sdhci_common_unrealize(s); + sdhci_uninitfn(s); + } +-- +2.41.0.windows.1 + diff --git a/hw-ufs-free-irq-on-exit.patch b/hw-ufs-free-irq-on-exit.patch new file mode 100644 index 0000000000000000000000000000000000000000..8e6a11c978546fe086542fdabe009ae8440cfb1c --- /dev/null +++ b/hw-ufs-free-irq-on-exit.patch @@ -0,0 +1,46 @@ +From 068fef175047c18f60900dacd54c7a436114c164 Mon Sep 17 00:00:00 2001 +From: qihao_yewu +Date: Mon, 7 Apr 2025 13:18:47 -0400 +Subject: [PATCH] hw/ufs: free irq on exit +MIME-Version: 1.0 +Content-Type: 
text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +cherry-pick from c458f9474d6574505ce9144ab1a90b951e69c1bd + +Fix a memory leak bug in ufs_init_pci() due to u->irq +not being freed in ufs_exit(). + +Signed-off-by: Zheng Huang +Reviewed-by: Philippe Mathieu-Daudé +Message-ID: <43ceb427-87aa-44ee-9007-dbaecc499bba@gmail.com> +Signed-off-by: Philippe Mathieu-Daudé +Signed-off-by: qihao_yewu +--- + hw/ufs/ufs.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/hw/ufs/ufs.c b/hw/ufs/ufs.c +index 068895b27b..f57d33e771 100644 +--- a/hw/ufs/ufs.c ++++ b/hw/ufs/ufs.c +@@ -25,6 +25,7 @@ + #include "qapi/error.h" + #include "migration/vmstate.h" + #include "scsi/constants.h" ++#include "hw/irq.h" + #include "trace.h" + #include "ufs.h" + +@@ -1286,6 +1287,8 @@ static void ufs_exit(PCIDevice *pci_dev) + { + UfsHc *u = UFS(pci_dev); + ++ qemu_free_irq(u->irq); ++ + qemu_bh_delete(u->doorbell_bh); + qemu_bh_delete(u->complete_bh); + +-- +2.41.0.windows.1 + diff --git a/hw-xen-Fix-xen_bus_realize-error-handling.patch b/hw-xen-Fix-xen_bus_realize-error-handling.patch new file mode 100644 index 0000000000000000000000000000000000000000..52a2023e0240545f3f95ebf0a9ce05570ebb017e --- /dev/null +++ b/hw-xen-Fix-xen_bus_realize-error-handling.patch @@ -0,0 +1,43 @@ +From 5eb0bb1f8ce9835b368e78d414ff6136c77ef94b Mon Sep 17 00:00:00 2001 +From: qihao_yewu +Date: Tue, 8 Apr 2025 06:51:26 -0400 +Subject: [PATCH] hw/xen: Fix xen_bus_realize() error handling + +cherry-pick from de7b18083bfed4e1a01bb40b4ad050c47d2011fa + +The Error ** argument must be NULL, &error_abort, &error_fatal, or a +pointer to a variable containing NULL. Passing an argument of the +latter kind twice without clearing it in between is wrong: if the +first call sets an error, it no longer points to NULL for the second +call. + +xen_bus_realize() is wrong that way: it passes &local_err to +xs_node_watch() in a loop. If this fails in more than one iteration, +it can trip error_setv()'s assertion. 
+ +Fix by clearing @local_err. + +Fixes: c4583c8c394e (xen-bus: reduce scope of backend watch) +Signed-off-by: Markus Armbruster +Message-ID: <20250314143500.2449658-2-armbru@redhat.com> +Reviewed-by: Stefano Stabellini +Signed-off-by: qihao_yewu +--- + hw/xen/xen-bus.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/hw/xen/xen-bus.c b/hw/xen/xen-bus.c +index 4973e7d9c9..c10b089914 100644 +--- a/hw/xen/xen-bus.c ++++ b/hw/xen/xen-bus.c +@@ -352,6 +352,7 @@ static void xen_bus_realize(BusState *bus, Error **errp) + error_reportf_err(local_err, + "failed to set up '%s' enumeration watch: ", + type[i]); ++ local_err = NULL; + } + + g_free(node); +-- +2.41.0.windows.1 + diff --git a/i386-cpuid-Remove-subleaf-constraint-on-CPUID-leaf-1.patch b/i386-cpuid-Remove-subleaf-constraint-on-CPUID-leaf-1.patch new file mode 100644 index 0000000000000000000000000000000000000000..2443f3bcf5e64e15ba083f5ad005de61cfae58a9 --- /dev/null +++ b/i386-cpuid-Remove-subleaf-constraint-on-CPUID-leaf-1.patch @@ -0,0 +1,38 @@ +From 0d5ac4f36208eadbb922f552ba1b762f5bd0c3a6 Mon Sep 17 00:00:00 2001 +From: Xiaoyao Li +Date: Wed, 24 Jan 2024 21:40:15 -0500 +Subject: [PATCH] i386/cpuid: Remove subleaf constraint on CPUID leaf 1F + +commit a3b5376521a0de898440e8d0942b54e628f0949f upstream. + +No such constraint that subleaf index needs to be less than 64. 
+ +Intel-SIG: commit a3b5376521a0 i386/cpuid: Remove subleaf constraint on CPUID leaf 1F + +Signed-off-by: Xiaoyao Li +Reviewed-by:Yang Weijiang +Message-ID: <20240125024016.2521244-3-xiaoyao.li@intel.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Jason Zeng +--- + target/i386/kvm/kvm.c | 4 ---- + 1 file changed, 4 deletions(-) + +diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c +index ce96ed9158..850104f6b5 100644 +--- a/target/i386/kvm/kvm.c ++++ b/target/i386/kvm/kvm.c +@@ -1928,10 +1928,6 @@ int kvm_arch_init_vcpu(CPUState *cs) + break; + } + +- if (i == 0x1f && j == 64) { +- break; +- } +- + c->function = i; + c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + c->index = j; +-- +2.41.0.windows.1 + diff --git a/intel_iommu-Check-compatibility-with-host-IOMMU-capa.patch b/intel_iommu-Check-compatibility-with-host-IOMMU-capa.patch new file mode 100644 index 0000000000000000000000000000000000000000..140639c98083bee64c248ad3272488ce180ab50a --- /dev/null +++ b/intel_iommu-Check-compatibility-with-host-IOMMU-capa.patch @@ -0,0 +1,70 @@ +From 4ef1b086272552378c09356b0e9fd2548a27a621 Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Wed, 5 Jun 2024 16:30:43 +0800 +Subject: [PATCH] intel_iommu: Check compatibility with host IOMMU capabilities + +If check fails, host device (either VFIO or VDPA device) is not +compatible with current vIOMMU config and should not be passed to +guest. + +Only aw_bits is checked for now, we don't care about other caps +before scalable modern mode is introduced. + +Signed-off-by: Yi Liu +Signed-off-by: Zhenzhong Duan +Reviewed-by: Eric Auger +Reviewed-by: Michael S. 
Tsirkin +--- + hw/i386/intel_iommu.c | 29 +++++++++++++++++++++++++++++ + 1 file changed, 29 insertions(+) + +diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c +index bdc14f8438..60d86e0cb6 100644 +--- a/hw/i386/intel_iommu.c ++++ b/hw/i386/intel_iommu.c +@@ -3838,6 +3838,30 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, + return vtd_dev_as; + } + ++static bool vtd_check_hiod(IntelIOMMUState *s, HostIOMMUDevice *hiod, ++ Error **errp) ++{ ++ HostIOMMUDeviceClass *hiodc = HOST_IOMMU_DEVICE_GET_CLASS(hiod); ++ int ret; ++ ++ if (!hiodc->get_cap) { ++ error_setg(errp, ".get_cap() not implemented"); ++ return false; ++ } ++ ++ /* Common checks */ ++ ret = hiodc->get_cap(hiod, HOST_IOMMU_DEVICE_CAP_AW_BITS, errp); ++ if (ret < 0) { ++ return false; ++ } ++ if (s->aw_bits > ret) { ++ error_setg(errp, "aw-bits %d > host aw-bits %d", s->aw_bits, ret); ++ return false; ++ } ++ ++ return true; ++} ++ + static bool vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn, + HostIOMMUDevice *hiod, Error **errp) + { +@@ -3858,6 +3882,11 @@ static bool vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn, + return false; + } + ++ if (!vtd_check_hiod(s, hiod, errp)) { ++ vtd_iommu_unlock(s); ++ return false; ++ } ++ + new_key = g_malloc(sizeof(*new_key)); + new_key->bus = bus; + new_key->devfn = devfn; +-- +2.41.0.windows.1 + diff --git a/intel_iommu-Extract-out-vtd_cap_init-to-initialize-c.patch b/intel_iommu-Extract-out-vtd_cap_init-to-initialize-c.patch new file mode 100644 index 0000000000000000000000000000000000000000..e1c4d3ec067976b7e3ca49ef60d3b1a1b2568463 --- /dev/null +++ b/intel_iommu-Extract-out-vtd_cap_init-to-initialize-c.patch @@ -0,0 +1,142 @@ +From a051e4349316d7065c9418de691787edae8e7f4e Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Wed, 5 Jun 2024 16:30:41 +0800 +Subject: [PATCH] intel_iommu: Extract out vtd_cap_init() to initialize + cap/ecap + +Extract cap/ecap initialization in vtd_cap_init() to make code 
+cleaner. + +No functional change intended. + +Reviewed-by: Eric Auger +Signed-off-by: Zhenzhong Duan +Reviewed-by: Michael S. Tsirkin +--- + hw/i386/intel_iommu.c | 93 ++++++++++++++++++++++++------------------- + 1 file changed, 51 insertions(+), 42 deletions(-) + +diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c +index 3da56e439e..6716407b7a 100644 +--- a/hw/i386/intel_iommu.c ++++ b/hw/i386/intel_iommu.c +@@ -3935,30 +3935,10 @@ static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n) + return; + } + +-/* Do the initialization. It will also be called when reset, so pay +- * attention when adding new initialization stuff. +- */ +-static void vtd_init(IntelIOMMUState *s) ++static void vtd_cap_init(IntelIOMMUState *s) + { + X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); + +- memset(s->csr, 0, DMAR_REG_SIZE); +- memset(s->wmask, 0, DMAR_REG_SIZE); +- memset(s->w1cmask, 0, DMAR_REG_SIZE); +- memset(s->womask, 0, DMAR_REG_SIZE); +- +- s->root = 0; +- s->root_scalable = false; +- s->dmar_enabled = false; +- s->intr_enabled = false; +- s->iq_head = 0; +- s->iq_tail = 0; +- s->iq = 0; +- s->iq_size = 0; +- s->qi_enabled = false; +- s->iq_last_desc_type = VTD_INV_DESC_NONE; +- s->iq_dw = false; +- s->next_frcd_reg = 0; + s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND | + VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS | + VTD_CAP_MGAW(s->aw_bits); +@@ -3975,27 +3955,6 @@ static void vtd_init(IntelIOMMUState *s) + } + s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO; + +- /* +- * Rsvd field masks for spte +- */ +- vtd_spte_rsvd[0] = ~0ULL; +- vtd_spte_rsvd[1] = VTD_SPTE_PAGE_L1_RSVD_MASK(s->aw_bits, +- x86_iommu->dt_supported); +- vtd_spte_rsvd[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(s->aw_bits); +- vtd_spte_rsvd[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(s->aw_bits); +- vtd_spte_rsvd[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(s->aw_bits); +- +- vtd_spte_rsvd_large[2] = VTD_SPTE_LPAGE_L2_RSVD_MASK(s->aw_bits, +- x86_iommu->dt_supported); +- vtd_spte_rsvd_large[3] = 
VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits, +- x86_iommu->dt_supported); +- +- if (s->scalable_mode || s->snoop_control) { +- vtd_spte_rsvd[1] &= ~VTD_SPTE_SNP; +- vtd_spte_rsvd_large[2] &= ~VTD_SPTE_SNP; +- vtd_spte_rsvd_large[3] &= ~VTD_SPTE_SNP; +- } +- + if (x86_iommu_ir_supported(x86_iommu)) { + s->ecap |= VTD_ECAP_IR | VTD_ECAP_MHMV; + if (s->intr_eim == ON_OFF_AUTO_ON) { +@@ -4028,6 +3987,56 @@ static void vtd_init(IntelIOMMUState *s) + if (s->pasid) { + s->ecap |= VTD_ECAP_PASID; + } ++} ++ ++/* ++ * Do the initialization. It will also be called when reset, so pay ++ * attention when adding new initialization stuff. ++ */ ++static void vtd_init(IntelIOMMUState *s) ++{ ++ X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); ++ ++ memset(s->csr, 0, DMAR_REG_SIZE); ++ memset(s->wmask, 0, DMAR_REG_SIZE); ++ memset(s->w1cmask, 0, DMAR_REG_SIZE); ++ memset(s->womask, 0, DMAR_REG_SIZE); ++ ++ s->root = 0; ++ s->root_scalable = false; ++ s->dmar_enabled = false; ++ s->intr_enabled = false; ++ s->iq_head = 0; ++ s->iq_tail = 0; ++ s->iq = 0; ++ s->iq_size = 0; ++ s->qi_enabled = false; ++ s->iq_last_desc_type = VTD_INV_DESC_NONE; ++ s->iq_dw = false; ++ s->next_frcd_reg = 0; ++ ++ vtd_cap_init(s); ++ ++ /* ++ * Rsvd field masks for spte ++ */ ++ vtd_spte_rsvd[0] = ~0ULL; ++ vtd_spte_rsvd[1] = VTD_SPTE_PAGE_L1_RSVD_MASK(s->aw_bits, ++ x86_iommu->dt_supported); ++ vtd_spte_rsvd[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(s->aw_bits); ++ vtd_spte_rsvd[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(s->aw_bits); ++ vtd_spte_rsvd[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(s->aw_bits); ++ ++ vtd_spte_rsvd_large[2] = VTD_SPTE_LPAGE_L2_RSVD_MASK(s->aw_bits, ++ x86_iommu->dt_supported); ++ vtd_spte_rsvd_large[3] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits, ++ x86_iommu->dt_supported); ++ ++ if (s->scalable_mode || s->snoop_control) { ++ vtd_spte_rsvd[1] &= ~VTD_SPTE_SNP; ++ vtd_spte_rsvd_large[2] &= ~VTD_SPTE_SNP; ++ vtd_spte_rsvd_large[3] &= ~VTD_SPTE_SNP; ++ } + + vtd_reset_caches(s); + +-- +2.41.0.windows.1 + diff --git 
a/intel_iommu-Implement-set-unset-_iommu_device-callba.patch b/intel_iommu-Implement-set-unset-_iommu_device-callba.patch new file mode 100644 index 0000000000000000000000000000000000000000..572540296b4a82c78f11f80f880d0ef2ff60e28b --- /dev/null +++ b/intel_iommu-Implement-set-unset-_iommu_device-callba.patch @@ -0,0 +1,160 @@ +From 5834bb1ccce592380a91a5cf127f90a031cd7cf2 Mon Sep 17 00:00:00 2001 +From: Yi Liu +Date: Wed, 5 Jun 2024 16:30:42 +0800 +Subject: [PATCH] intel_iommu: Implement [set|unset]_iommu_device() callbacks + +Implement [set|unset]_iommu_device() callbacks in Intel vIOMMU. +In set call, we take a reference of HostIOMMUDevice and store it +in hash table indexed by PCI BDF. + +Note this BDF index is device's real BDF not the aliased one which +is different from the index of VTDAddressSpace. There can be multiple +assigned devices under same virtual iommu group and share same +VTDAddressSpace, but each has its own HostIOMMUDevice. + +Signed-off-by: Yi Liu +Signed-off-by: Yi Sun +Signed-off-by: Zhenzhong Duan +Reviewed-by: Eric Auger +Reviewed-by: Michael S. 
Tsirkin +--- + hw/i386/intel_iommu.c | 81 +++++++++++++++++++++++++++++++++++ + include/hw/i386/intel_iommu.h | 2 + + 2 files changed, 83 insertions(+) + +diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c +index 6716407b7a..bdc14f8438 100644 +--- a/hw/i386/intel_iommu.c ++++ b/hw/i386/intel_iommu.c +@@ -61,6 +61,12 @@ struct vtd_as_key { + uint32_t pasid; + }; + ++/* bus/devfn is PCI device's real BDF not the aliased one */ ++struct vtd_hiod_key { ++ PCIBus *bus; ++ uint8_t devfn; ++}; ++ + struct vtd_iotlb_key { + uint64_t gfn; + uint32_t pasid; +@@ -250,6 +256,25 @@ static guint vtd_as_hash(gconstpointer v) + return (guint)(value << 8 | key->devfn); + } + ++/* Same implementation as vtd_as_hash() */ ++static guint vtd_hiod_hash(gconstpointer v) ++{ ++ return vtd_as_hash(v); ++} ++ ++static gboolean vtd_hiod_equal(gconstpointer v1, gconstpointer v2) ++{ ++ const struct vtd_hiod_key *key1 = v1; ++ const struct vtd_hiod_key *key2 = v2; ++ ++ return (key1->bus == key2->bus) && (key1->devfn == key2->devfn); ++} ++ ++static void vtd_hiod_destroy(gpointer v) ++{ ++ object_unref(v); ++} ++ + static gboolean vtd_hash_remove_by_domain(gpointer key, gpointer value, + gpointer user_data) + { +@@ -3813,6 +3838,58 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, + return vtd_dev_as; + } + ++static bool vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn, ++ HostIOMMUDevice *hiod, Error **errp) ++{ ++ IntelIOMMUState *s = opaque; ++ struct vtd_as_key key = { ++ .bus = bus, ++ .devfn = devfn, ++ }; ++ struct vtd_as_key *new_key; ++ ++ assert(hiod); ++ ++ vtd_iommu_lock(s); ++ ++ if (g_hash_table_lookup(s->vtd_host_iommu_dev, &key)) { ++ error_setg(errp, "Host IOMMU device already exist"); ++ vtd_iommu_unlock(s); ++ return false; ++ } ++ ++ new_key = g_malloc(sizeof(*new_key)); ++ new_key->bus = bus; ++ new_key->devfn = devfn; ++ ++ object_ref(hiod); ++ g_hash_table_insert(s->vtd_host_iommu_dev, new_key, hiod); ++ ++ vtd_iommu_unlock(s); ++ 
++ return true; ++} ++ ++static void vtd_dev_unset_iommu_device(PCIBus *bus, void *opaque, int devfn) ++{ ++ IntelIOMMUState *s = opaque; ++ struct vtd_as_key key = { ++ .bus = bus, ++ .devfn = devfn, ++ }; ++ ++ vtd_iommu_lock(s); ++ ++ if (!g_hash_table_lookup(s->vtd_host_iommu_dev, &key)) { ++ vtd_iommu_unlock(s); ++ return; ++ } ++ ++ g_hash_table_remove(s->vtd_host_iommu_dev, &key); ++ ++ vtd_iommu_unlock(s); ++} ++ + /* Unmap the whole range in the notifier's scope. */ + static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n) + { +@@ -4117,6 +4194,8 @@ static AddressSpace *vtd_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) + + static PCIIOMMUOps vtd_iommu_ops = { + .get_address_space = vtd_host_dma_iommu, ++ .set_iommu_device = vtd_dev_set_iommu_device, ++ .unset_iommu_device = vtd_dev_unset_iommu_device, + }; + + static bool vtd_decide_config(IntelIOMMUState *s, Error **errp) +@@ -4240,6 +4319,8 @@ static void vtd_realize(DeviceState *dev, Error **errp) + g_free, g_free); + s->vtd_address_spaces = g_hash_table_new_full(vtd_as_hash, vtd_as_equal, + g_free, g_free); ++ s->vtd_host_iommu_dev = g_hash_table_new_full(vtd_hiod_hash, vtd_hiod_equal, ++ g_free, vtd_hiod_destroy); + vtd_init(s); + pci_setup_iommu(bus, &vtd_iommu_ops, dev); + /* Pseudo address space under root PCI bus. 
*/ +diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h +index 7fa0a695c8..1eb05c29fc 100644 +--- a/include/hw/i386/intel_iommu.h ++++ b/include/hw/i386/intel_iommu.h +@@ -292,6 +292,8 @@ struct IntelIOMMUState { + /* list of registered notifiers */ + QLIST_HEAD(, VTDAddressSpace) vtd_as_with_notifiers; + ++ GHashTable *vtd_host_iommu_dev; /* HostIOMMUDevice */ ++ + /* interrupt remapping */ + bool intr_enabled; /* Whether guest enabled IR */ + dma_addr_t intr_root; /* Interrupt remapping table pointer */ +-- +2.41.0.windows.1 + diff --git a/iommufd.h-Updated-to-openeuler-olk-6.6-kernel.patch b/iommufd.h-Updated-to-openeuler-olk-6.6-kernel.patch new file mode 100644 index 0000000000000000000000000000000000000000..fab0bfa658275ca79de924aba7326d90d19bce71 --- /dev/null +++ b/iommufd.h-Updated-to-openeuler-olk-6.6-kernel.patch @@ -0,0 +1,90 @@ +From 8414bc02f988ecca7dda5325227ff5ffbe45150c Mon Sep 17 00:00:00 2001 +From: Shameer Kolothum +Date: Wed, 15 Jan 2025 10:02:58 +0000 +Subject: [PATCH] iommufd.h: Updated to openeuler olk-6.6 kernel + +Signed-off-by: Shameer Kolothum +--- + linux-headers/linux/iommufd.h | 26 ++++++++++++-------------- + 1 file changed, 12 insertions(+), 14 deletions(-) + +diff --git a/linux-headers/linux/iommufd.h b/linux-headers/linux/iommufd.h +index 41559c6064..3e57fee01c 100644 +--- a/linux-headers/linux/iommufd.h ++++ b/linux-headers/linux/iommufd.h +@@ -51,8 +51,8 @@ enum { + IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP = 0x8c, + IOMMUFD_CMD_HWPT_INVALIDATE = 0x8d, + IOMMUFD_CMD_FAULT_QUEUE_ALLOC = 0x8e, +- IOMMUFD_CMD_VIOMMU_ALLOC = 0x8f, +- IOMMUFD_CMD_VDEVICE_ALLOC = 0x90, ++ IOMMUFD_CMD_VIOMMU_ALLOC = 0x90, ++ IOMMUFD_CMD_VDEVICE_ALLOC = 0x91, + }; + + /** +@@ -397,18 +397,20 @@ struct iommu_hwpt_vtd_s1 { + }; + + /** +- * struct iommu_hwpt_arm_smmuv3 - ARM SMMUv3 Context Descriptor Table info ++ * struct iommu_hwpt_arm_smmuv3 - ARM SMMUv3 nested STE + * (IOMMU_HWPT_DATA_ARM_SMMUV3) + * + * @ste: The first two double words 
of the user space Stream Table Entry for +- * a user stage-1 Context Descriptor Table. Must be little-endian. ++ * the translation. Must be little-endian. + * Allowed fields: (Refer to "5.2 Stream Table Entry" in SMMUv3 HW Spec) + * - word-0: V, Cfg, S1Fmt, S1ContextPtr, S1CDMax + * - word-1: EATS, S1DSS, S1CIR, S1COR, S1CSH, S1STALLD + * + * -EIO will be returned if @ste is not legal or contains any non-allowed field. + * Cfg can be used to select a S1, Bypass or Abort configuration. A Bypass +- * nested domain will translate the same as the nesting parent. ++ * nested domain will translate the same as the nesting parent. The S1 will ++ * install a Context Descriptor Table pointing at userspace memory translated ++ * by the nesting parent. + */ + struct iommu_hwpt_arm_smmuv3 { + __aligned_le64 ste[2]; +@@ -920,8 +922,8 @@ enum iommu_viommu_type { + * that is unique to a specific VM. Operations global to the IOMMU are connected + * to the vIOMMU, such as: + * - Security namespace for guest owned ID, e.g. guest-controlled cache tags ++ * - Non-device-affiliated event reporting, e.g. invalidation queue errors + * - Access to a sharable nesting parent pagetable across physical IOMMUs +- * - Non-affiliated event reporting (e.g. an invalidation queue error) + * - Virtualization of various platforms IDs, e.g. RIDs and others + * - Delivery of paravirtualized invalidation + * - Direct assigned invalidation queues +@@ -941,12 +943,10 @@ struct iommu_viommu_alloc { + * struct iommu_vdevice_alloc - ioctl(IOMMU_VDEVICE_ALLOC) + * @size: sizeof(struct iommu_vdevice_alloc) + * @viommu_id: vIOMMU ID to associate with the virtual device +- * @dev_id: The pyhsical device to allocate a virtual instance on the vIOMMU +- * @__reserved: Must be 0 ++ * @dev_id: The physical device to allocate a virtual instance on the vIOMMU ++ * @out_vdevice_id: Object handle for the vDevice. Pass to IOMMU_DESTORY + * @virt_id: Virtual device ID per vIOMMU, e.g. 
vSID of ARM SMMUv3, vDeviceID +- * of AMD IOMMU, and vID of a nested Intel VT-d to a Context Table. +- * @out_vdevice_id: Output virtual instance ID for the allocated object +- * @__reserved2: Must be 0 ++ * of AMD IOMMU, and vRID of a nested Intel VT-d to a Context Table + * + * Allocate a virtual device instance (for a physical device) against a vIOMMU. + * This instance holds the device's information (related to its vIOMMU) in a VM. +@@ -955,10 +955,8 @@ struct iommu_vdevice_alloc { + __u32 size; + __u32 viommu_id; + __u32 dev_id; +- __u32 __reserved; +- __aligned_u64 virt_id; + __u32 out_vdevice_id; +- __u32 __reserved2; ++ __aligned_u64 virt_id; + }; + #define IOMMU_VDEVICE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VDEVICE_ALLOC) + #endif +-- +2.41.0.windows.1 + diff --git a/kconfig-Activate-IOMMUFD-for-s390x-machines.patch b/kconfig-Activate-IOMMUFD-for-s390x-machines.patch new file mode 100644 index 0000000000000000000000000000000000000000..565563a12b35a42ce0eb186c06efc08a2930cefe --- /dev/null +++ b/kconfig-Activate-IOMMUFD-for-s390x-machines.patch @@ -0,0 +1,34 @@ +From 3dfc0dd0b59925d1b73ca1a0db6d307ae597f76e Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= +Date: Sat, 11 Jan 2025 10:52:56 +0800 +Subject: [PATCH] kconfig: Activate IOMMUFD for s390x machines +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhenzhong Duan +Reviewed-by: Matthew Rosato +Reviewed-by: Eric Farman +Tested-by: Nicolin Chen +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhou Wang +--- + hw/s390x/Kconfig | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/hw/s390x/Kconfig b/hw/s390x/Kconfig +index 4c068d7960..26ad104485 100644 +--- a/hw/s390x/Kconfig ++++ b/hw/s390x/Kconfig +@@ -6,6 +6,7 @@ config S390_CCW_VIRTIO + imply VFIO_CCW + imply WDT_DIAG288 + imply PCIE_DEVICES ++ imply IOMMUFD + select PCI_EXPRESS + select S390_FLIC + select S390_FLIC_KVM if KVM +-- 
+2.41.0.windows.1 + diff --git a/kvm-Translate-MSI-doorbell-address-only-if-it-is-val.patch b/kvm-Translate-MSI-doorbell-address-only-if-it-is-val.patch new file mode 100644 index 0000000000000000000000000000000000000000..8c0f313c4326e328c29fa7477fa25db1482b0e85 --- /dev/null +++ b/kvm-Translate-MSI-doorbell-address-only-if-it-is-val.patch @@ -0,0 +1,54 @@ +From cdd5c088ff46ebf423c926fe4c0b12e345ae0db0 Mon Sep 17 00:00:00 2001 +From: Shameer Kolothum +Date: Thu, 23 Feb 2023 12:12:48 +0000 +Subject: [PATCH] =?UTF-8?q?kvm:=20Translate=20MSI=20doorbell=20address?= + =?UTF-8?q?=C2=A0only=20if=20it=20is=20valid?= +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Guest might have already set the MSI doorbell address to invalid +and if we try to translate the address again, Guest reports, + +[ 26.784082] arm-smmu-v3 arm-smmu-v3.0.auto: event 0x10 received: +[ 26.784088] arm-smmu-v3 arm-smmu-v3.0.auto: 0x0000001000000010 +[ 26.784090] arm-smmu-v3 arm-smmu-v3.0.auto: 0x0000000000000000 +[ 26.784092] arm-smmu-v3 arm-smmu-v3.0.auto: 0x0000000000000000 +[ 26.784094] arm-smmu-v3 arm-smmu-v3.0.auto: 0x0000000000000000 +[ 26.788082] arm-smmu-v3 arm-smmu-v3.0.auto: event 0x10 received: +[ 26.788085] arm-smmu-v3 arm-smmu-v3.0.auto: 0x0000001000000010 +[ 26.788087] arm-smmu-v3 arm-smmu-v3.0.auto: 0x0000000000000000 +.... + +eg: rmmod hisi_zip.ko. The sequence seems to be, + + - Write 0 to MSI Message Address register + - Disable MSI + +Hence check for address validity before we try to do the translation. + +Note: The fix is placed in generic code and hopefully is not a problem +for other architectures. 
+ +Signed-off-by: Shameer Kolothum +--- + accel/kvm/kvm-all.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c +index a8e29f148e..6fa97d2cbf 100644 +--- a/accel/kvm/kvm-all.c ++++ b/accel/kvm/kvm-all.c +@@ -2074,7 +2074,8 @@ int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg, + kroute.flags = KVM_MSI_VALID_DEVID; + kroute.u.msi.devid = pci_requester_id(dev); + } +- if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) { ++ if (msg.address && ++ kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) { + return -EINVAL; + } + +-- +2.41.0.windows.1 + diff --git a/net-fix-build-when-libbpf-is-disabled-but-libxdp-is-.patch b/net-fix-build-when-libbpf-is-disabled-but-libxdp-is-.patch new file mode 100644 index 0000000000000000000000000000000000000000..ecd0c23e0e6678503aa9e4a9ae47bf6a81c89653 --- /dev/null +++ b/net-fix-build-when-libbpf-is-disabled-but-libxdp-is-.patch @@ -0,0 +1,76 @@ +From 17835e803d0cfa308cd00f070c7e21b27f3d036e Mon Sep 17 00:00:00 2001 +From: gubin +Date: Sat, 22 Mar 2025 15:38:09 +0800 +Subject: [PATCH] net: fix build when libbpf is disabled, but libxdp is enabled +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +cherry-pick from 1f37280b37dbf85f36748f359a9f8802c8fe7ccd + +The net/af-xdp.c code is enabled when the libxdp library is present, +however, it also has direct API calls to bpf_xdp_query_id & +bpf_xdp_detach which are provided by the libbpf library. + +As a result if building with --disable-libbpf, but libxdp gets +auto-detected, we'll fail to link QEMU + + /usr/bin/ld: libcommon.a.p/net_af-xdp.c.o: undefined reference to symbol 'bpf_xdp_query_id@@LIBBPF_0.7.0' + +There are two bugs here + + * Since we have direct libbpf API calls, when building + net/af-xdp.c, we must tell meson that libbpf is a + dependency, so that we directly link to it, rather + than relying on indirect linkage. 
+ + * We must skip probing for libxdp at all, when libbpf + is not found, raising an error if --enable-libxdp was + given explicitly. + +Fixes: cb039ef3d9e3112da01e1ecd9b136ac9809ef733 +Signed-off-by: Daniel P. Berrangé +Signed-off-by: Jason Wang +Signed-off-by: gubin +--- + meson.build | 10 ++++++++-- + net/meson.build | 2 +- + 2 files changed, 9 insertions(+), 3 deletions(-) + +diff --git a/meson.build b/meson.build +index 4078f2aced..aea6a33ca3 100644 +--- a/meson.build ++++ b/meson.build +@@ -1972,8 +1972,14 @@ endif + # libxdp + libxdp = not_found + if not get_option('af_xdp').auto() or have_system +- libxdp = dependency('libxdp', required: get_option('af_xdp'), +- version: '>=1.4.0', method: 'pkg-config') ++ if libbpf.found() ++ libxdp = dependency('libxdp', required: get_option('af_xdp'), ++ version: '>=1.4.0', method: 'pkg-config') ++ else ++ if get_option('af_xdp').enabled() ++ error('libxdp requested, but libbpf is not available') ++ endif ++ endif + endif + + # libdw +diff --git a/net/meson.build b/net/meson.build +index ce99bd4447..7264479242 100644 +--- a/net/meson.build ++++ b/net/meson.build +@@ -37,7 +37,7 @@ if have_netmap + system_ss.add(files('netmap.c')) + endif + +-system_ss.add(when: libxdp, if_true: files('af-xdp.c')) ++system_ss.add(when: [libxdp, libbpf], if_true: files('af-xdp.c')) + + if have_vhost_net_user + system_ss.add(when: 'CONFIG_VIRTIO_NET', if_true: files('vhost-user.c'), if_false: files('vhost-user-stub.c')) +-- +2.41.0.windows.1 + diff --git a/pci-Get-pasid-capability-from-vIOMMU.patch b/pci-Get-pasid-capability-from-vIOMMU.patch new file mode 100644 index 0000000000000000000000000000000000000000..7ba0cdef63449a0c3754011f2b7ebd20caf429b6 --- /dev/null +++ b/pci-Get-pasid-capability-from-vIOMMU.patch @@ -0,0 +1,68 @@ +From 494e0ace6c120af00b27a0cc1d4a478073654e35 Mon Sep 17 00:00:00 2001 +From: Yi Liu +Date: Thu, 12 Sep 2024 00:33:13 -0700 +Subject: [PATCH] pci: Get pasid capability from vIOMMU + +Signed-off-by: Yi Liu +--- + 
hw/pci/pci.c | 13 +++++++++++++ + include/hw/pci/pci.h | 13 +++++++++++++ + 2 files changed, 26 insertions(+) + +diff --git a/hw/pci/pci.c b/hw/pci/pci.c +index d6f627aa51..447ef2b163 100644 +--- a/hw/pci/pci.c ++++ b/hw/pci/pci.c +@@ -2802,6 +2802,19 @@ void pci_device_unset_iommu_device(PCIDevice *dev) + } + } + ++bool pci_device_get_pasid_cap(PCIDevice *dev) ++{ ++ PCIBus *iommu_bus; ++ ++ pci_device_get_iommu_bus_devfn(dev, &iommu_bus, NULL, NULL); ++ if (iommu_bus && iommu_bus->iommu_ops->get_pasid_cap) { ++ return iommu_bus->iommu_ops->get_pasid_cap(pci_get_bus(dev), ++ iommu_bus->iommu_opaque, ++ dev->devfn); ++ } ++ return false; ++} ++ + void pci_setup_iommu(PCIBus *bus, const PCIIOMMUOps *ops, void *opaque) + { + /* +diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h +index 8d1af44249..0dfe274c33 100644 +--- a/include/hw/pci/pci.h ++++ b/include/hw/pci/pci.h +@@ -418,12 +418,25 @@ typedef struct PCIIOMMUOps { + * @devfn: device and function number of the PCI device. + */ + void (*unset_iommu_device)(PCIBus *bus, void *opaque, int devfn); ++ /** ++ * @get_pasid_cap: get pasid capability from vIOMMU ++ * ++ * Optional callback. ++ * ++ * @bus: the #PCIBus of the PCI device. ++ * ++ * @opaque: the data passed to pci_setup_iommu(). ++ * ++ * @devfn: device and function number of the PCI device. 
++ */ ++ bool (*get_pasid_cap)(PCIBus *bus, void *opaque, int devfn); + } PCIIOMMUOps; + + AddressSpace *pci_device_iommu_address_space(PCIDevice *dev); + bool pci_device_set_iommu_device(PCIDevice *dev, HostIOMMUDevice *hiod, + Error **errp); + void pci_device_unset_iommu_device(PCIDevice *dev); ++bool pci_device_get_pasid_cap(PCIDevice *dev); + + /** + * pci_setup_iommu: Initialize specific IOMMU handlers for a PCIBus +-- +2.41.0.windows.1 + diff --git a/qemu.spec b/qemu.spec index c129e8ee5775517857d4a70b82aa78d5b29158f8..93534bb8a987cf02b9e1152491190dd549868fe6 100644 --- a/qemu.spec +++ b/qemu.spec @@ -3,7 +3,7 @@ Name: qemu Version: 8.2.0 -Release: 29 +Release: 30 Epoch: 11 Summary: QEMU is a generic and open source machine emulator and virtualizer License: GPLv2 and BSD and MIT and CC-BY-SA-4.0 @@ -622,6 +622,178 @@ Patch0605: target-i386-kvm-Support-to-get-and-enable-extensions.patch Patch0606: target-i386-csv-Request-to-set-private-memory-of-CSV.patch Patch0607: target-i386-csv-Support-load-kernel-hashes-for-CSV3-.patch Patch0608: target-i386-csv-Support-inject-secret-for-CSV3-guest.patch +Patch0609: arm-VirtCCA-CVM-support-UEFI-boot.patch +Patch0610: arm-VirtCCA-qemu-uefi-boot-support-kae.patch +Patch0611: arm-VirtCCA-Compatibility-with-older-versions-of-TMM.patch +Patch0612: arm-VirtCCA-qemu-CoDA-support-UEFI-boot.patch +Patch0613: BUGFIX-Enforce-isolation-for-virtcca_shared_hugepage.patch +Patch0614: backends-VirtCCA-cvm_gpa_start-supports-both-1GB-and.patch +Patch0615: qga-Add-log-to-guest-fsfreeze-thaw-command.patch +Patch0616: qga-Don-t-daemonize-before-channel-is-initialized.patch +Patch0617: virtcca-add-kvm-isolation-when-get-tmi-version.patch +Patch0618: backends-cryptodev-Do-not-abort-for-invalid-session-.patch +Patch0619: backends-cryptodev-Do-not-ignore-throttle-backends-E.patch +Patch0620: hw-nvme-fix-invalid-check-on-mcl.patch +Patch0621: hw-nvme-fix-invalid-endian-conversion.patch +Patch0622: 
net-fix-build-when-libbpf-is-disabled-but-libxdp-is-.patch +Patch0623: target-i386-Add-more-features-enumerated-by-CPUID.7..patch +Patch0624: target-i386-fix-feature-dependency-for-WAITPKG.patch +Patch0625: target-i386-add-support-for-FRED-in-CPUID-enumeratio.patch +Patch0626: target-i386-mark-CR4.FRED-not-reserved.patch +Patch0627: vmxcap-add-support-for-VMX-FRED-controls.patch +Patch0628: target-i386-enumerate-VMX-nested-exception-support.patch +Patch0629: target-i386-Add-get-set-migrate-support-for-FRED-MSR.patch +Patch0630: target-i386-Delete-duplicated-macro-definition-CR4_F.patch +Patch0631: target-i386-Add-VMX-control-bits-for-nested-FRED-sup.patch +Patch0632: target-i386-Raise-the-highest-index-value-used-for-a.patch +Patch0633: target-i386-pass-X86CPU-to-x86_cpu_get_supported_fea.patch +Patch0634: i386-cpuid-Remove-subleaf-constraint-on-CPUID-leaf-1.patch +Patch0635: target-i386-Don-t-construct-a-all-zero-entry-for-CPU.patch +Patch0636: target-i386-Enable-fdp-excptn-only-and-zero-fcs-fds.patch +Patch0637: target-i386-Construct-CPUID-2-as-stateful-iff-times-.patch +Patch0638: target-i386-Make-invtsc-migratable-when-user-sets-ts.patch +Patch0639: hw-pci-host-designware-Fix-ATU_UPPER_TARGET-register.patch +Patch0640: hw-ufs-free-irq-on-exit.patch +Patch0641: hw-sd-sdhci-free-irq-on-exit.patch +Patch0642: target-s390x-Fix-a-typo-in-s390_cpu_class_init.patch +Patch0643: hw-misc-aspeed_hace-Fix-buffer-overflow-in-has_paddi.patch +Patch0644: hw-xen-Fix-xen_bus_realize-error-handling.patch +Patch0645: cryptodev-Fix-error-handling-in-cryptodev_lkcf_execu.patch +Patch0646: vfio-Introduce-base-object-for-VFIOContainer-and-tar.patch +Patch0647: vfio-container-Introduce-a-empty-VFIOIOMMUOps.patch +Patch0648: vfio-container-Switch-to-dma_map-unmap-API.patch +Patch0649: vfio-common-Introduce-vfio_container_init-destroy-he.patch +Patch0650: vfio-common-Move-giommu_list-in-base-container.patch +Patch0651: vfio-container-Move-space-field-to-base-container.patch +Patch0652: 
vfio-container-Switch-to-IOMMU-BE-set_dirty_page_tra.patch +Patch0653: vfio-container-Move-per-container-device-list-in-bas.patch +Patch0654: vfio-container-Convert-functions-to-base-container.patch +Patch0655: vfio-container-Move-pgsizes-and-dma_max_mappings-to-.patch +Patch0656: vfio-container-Move-vrdl_list-to-base-container.patch +Patch0657: vfio-container-Move-listener-to-base-container.patch +Patch0658: vfio-container-Move-dirty_pgsizes-and-max_dirty_bitm.patch +Patch0659: vfio-container-Move-iova_ranges-to-base-container.patch +Patch0660: vfio-container-Implement-attach-detach_device.patch +Patch0661: vfio-spapr-Introduce-spapr-backend-and-target-interf.patch +Patch0662: vfio-spapr-switch-to-spapr-IOMMU-BE-add-del_section_.patch +Patch0663: vfio-spapr-Move-prereg_listener-into-spapr-container.patch +Patch0664: vfio-spapr-Move-hostwin_list-into-spapr-container.patch +Patch0665: backends-iommufd-Introduce-the-iommufd-object.patch +Patch0666: util-char_dev-Add-open_cdev.patch +Patch0667: vfio-common-return-early-if-space-isn-t-empty.patch +Patch0668: vfio-iommufd-Implement-the-iommufd-backend.patch +Patch0669: vfio-iommufd-Relax-assert-check-for-iommufd-backend.patch +Patch0670: vfio-iommufd-Add-support-for-iova_ranges-and-pgsizes.patch +Patch0671: vfio-pci-Extract-out-a-helper-vfio_pci_get_pci_hot_r.patch +Patch0672: vfio-pci-Introduce-a-vfio-pci-hot-reset-interface.patch +Patch0673: vfio-iommufd-Enable-pci-hot-reset-through-iommufd-cd.patch +Patch0674: vfio-pci-Allow-the-selection-of-a-given-iommu-backen.patch +Patch0675: vfio-pci-Make-vfio-cdev-pre-openable-by-passing-a-fi.patch +Patch0676: vfio-platform-Allow-the-selection-of-a-given-iommu-b.patch +Patch0677: vfio-platform-Make-vfio-cdev-pre-openable-by-passing.patch +Patch0678: vfio-ap-Allow-the-selection-of-a-given-iommu-backend.patch +Patch0679: vfio-ap-Make-vfio-cdev-pre-openable-by-passing-a-fil.patch +Patch0680: vfio-ccw-Allow-the-selection-of-a-given-iommu-backen.patch +Patch0681: 
vfio-ccw-Make-vfio-cdev-pre-openable-by-passing-a-fi.patch +Patch0682: vfio-Make-VFIOContainerBase-poiner-parameter-const-i.patch +Patch0683: hw-arm-Activate-IOMMUFD-for-virt-machines.patch +Patch0684: kconfig-Activate-IOMMUFD-for-s390x-machines.patch +Patch0685: hw-i386-Activate-IOMMUFD-for-q35-machines.patch +Patch0686: vfio-pci-Move-VFIODevice-initializations-in-vfio_ins.patch +Patch0687: vfio-platform-Move-VFIODevice-initializations-in-vfi.patch +Patch0688: vfio-ap-Move-VFIODevice-initializations-in-vfio_ap_i.patch +Patch0689: vfio-ccw-Move-VFIODevice-initializations-in-vfio_ccw.patch +Patch0690: vfio-Introduce-a-helper-function-to-initialize-VFIOD.patch +Patch0691: docs-devel-Add-VFIO-iommufd-backend-documentation.patch +Patch0692: vfio-container-Introduce-vfio_legacy_setup-for-furth.patch +Patch0693: vfio-container-Initialize-VFIOIOMMUOps-under-vfio_in.patch +Patch0694: vfio-container-Introduce-a-VFIOIOMMU-QOM-interface.patch +Patch0695: vfio-container-Introduce-a-VFIOIOMMU-legacy-QOM-inte.patch +Patch0696: vfio-container-Intoduce-a-new-VFIOIOMMUClass-setup-h.patch +Patch0697: vfio-spapr-Introduce-a-sPAPR-VFIOIOMMU-QOM-interface.patch +Patch0698: vfio-iommufd-Introduce-a-VFIOIOMMU-iommufd-QOM-inter.patch +Patch0699: vfio-spapr-Only-compile-sPAPR-IOMMU-support-when-nee.patch +Patch0700: vfio-spapr-Extend-VFIOIOMMUOps-with-a-release-handle.patch +Patch0701: vfio-iommufd-Remove-CONFIG_IOMMUFD-usage.patch +Patch0702: backends-Introduce-HostIOMMUDevice-abstract.patch +Patch0703: backends-host_iommu_device-Introduce-HostIOMMUDevice.patch +Patch0704: vfio-container-Introduce-TYPE_HOST_IOMMU_DEVICE_LEGA.patch +Patch0705: backends-iommufd-Introduce-TYPE_HOST_IOMMU_DEVICE_IO.patch +Patch0706: range-Introduce-range_get_last_bit.patch +Patch0707: vfio-container-Implement-HostIOMMUDeviceClass-realiz.patch +Patch0708: backends-iommufd-Introduce-helper-function-iommufd_b.patch +Patch0709: vfio-iommufd-Implement-HostIOMMUDeviceClass-realize-.patch +Patch0710: 
vfio-container-Implement-HostIOMMUDeviceClass-get_ca.patch +Patch0711: backends-iommufd-Implement-HostIOMMUDeviceClass-get_.patch +Patch0712: vfio-Create-host-IOMMU-device-instance.patch +Patch0713: hw-pci-Introduce-helper-function-pci_device_get_iomm.patch +Patch0714: hw-pci-Introduce-pci_device_-set-unset-_iommu_device.patch +Patch0715: vfio-pci-Pass-HostIOMMUDevice-to-vIOMMU.patch +Patch0716: intel_iommu-Extract-out-vtd_cap_init-to-initialize-c.patch +Patch0717: intel_iommu-Implement-set-unset-_iommu_device-callba.patch +Patch0718: intel_iommu-Check-compatibility-with-host-IOMMU-capa.patch +Patch0719: vfio-pci-Extract-mdev-check-into-an-helper.patch +Patch0720: vfio-iommufd-Don-t-initialize-nor-set-a-HOST_IOMMU_D.patch +Patch0721: backends-iommufd-Extend-iommufd_backend_get_device_i.patch +Patch0722: vfio-iommufd-Return-errno-in-iommufd_cdev_attach_ioa.patch +Patch0723: vfio-ap-Don-t-initialize-HOST_IOMMU_DEVICE-with-mdev.patch +Patch0724: vfio-ccw-Don-t-initialize-HOST_IOMMU_DEVICE-with-mde.patch +Patch0725: vfio-iommufd-Introduce-auto-domain-creation.patch +Patch0726: HostIOMMUDevice-Store-the-VFIO-VDPA-agent.patch +Patch0727: vfio-iommufd-container-Remove-caps-aw_bits.patch +Patch0728: vfio-iommufd-Add-hw_caps-field-to-HostIOMMUDeviceCap.patch +Patch0729: vfio-iommufd-container-Invoke-HostIOMMUDevice-realiz.patch +Patch0730: vfio-iommufd-Probe-and-request-hwpt-dirty-tracking-c.patch +Patch0731: vfio-iommufd-Implement-VFIOIOMMUClass-set_dirty_trac.patch +Patch0732: vfio-iommufd-Implement-VFIOIOMMUClass-query_dirty_bi.patch +Patch0733: vfio-migration-Don-t-block-migration-device-dirty-tr.patch +Patch0734: vfio-common-Allow-disabling-device-dirty-page-tracki.patch +Patch0735: Update-iommufd.h-header-for-vSVA.patch +Patch0736: backends-iommufd-Add-helpers-for-invalidating-user-m.patch +Patch0737: vfio-iommufd-Add-properties-and-handlers-to-TYPE_HOS.patch +Patch0738: HostIOMMUDevice-Introduce-realize_late-callback.patch +Patch0739: 
vfio-iommufd-Implement-HostIOMMUDeviceClass-realize_.patch +Patch0740: vfio-iommufd-Implement-at-de-tach_hwpt-handlers.patch +Patch0741: backends-iommufd-Introduce-iommufd_backend_alloc_vio.patch +Patch0742: backends-iommufd-Introduce-iommufd_vdev_alloc.patch +Patch0743: backends-iommufd-Introduce-iommufd_viommu_invalidate.patch +Patch0744: hw-arm-smmu-common-Add-a-nested-flag-to-SMMUState.patch +Patch0745: hw-arm-smmu-common-Bypass-emulated-IOTLB-for-a-neste.patch +Patch0746: hw-arm-smmu-common-Extract-smmu_get_sbus-and-smmu_ge.patch +Patch0747: hw-arm-smmu-common-Add-set-unset_iommu_device-callba.patch +Patch0748: hw-arm-smmu-common-Add-iommufd-helpers.patch +Patch0749: hw-arm-smmu-common-Return-sysmem-if-stage-1-is-bypas.patch +Patch0750: hw-arm-smmuv3-Ignore-IOMMU_NOTIFIER_MAP-for-nested-s.patch +Patch0751: hw-arm-smmuv3-Read-host-SMMU-device-info.patch +Patch0752: hw-arm-smmuv3-Check-idr-registers-for-STE_S1CDMAX-an.patch +Patch0753: hw-arm-smmuv3-Add-smmu_dev_install_nested_ste-for-CF.patch +Patch0754: hw-arm-smmuv3-Add-missing-STE-invalidation.patch +Patch0755: hw-arm-smmu-common-Replace-smmu_iommu_mr-with-smmu_f.patch +Patch0756: hw-arm-smmuv3-Forward-cache-invalidate-commands-via-.patch +Patch0757: tests-qtest-Allow-DSDT-acpi-tables-to-change.patch +Patch0758: acpi-gpex-Fix-PCI-Express-Slot-Information-function-.patch +Patch0759: tests-data-acpi-Update-DSDT-acpi-tables.patch +Patch0760: hw-pci-host-gpex-needs-kernel-fix-Allow-to-generate-.patch +Patch0761: hw-arm-virt-Add-an-SMMU_IO_LEN-macro.patch +Patch0762: hw-arm-smmuv3-Add-initial-support-for-SMMUv3-Nested-.patch +Patch0763: hw-arm-smmuv3-Associate-a-pci-bus-with-a-SMMUv3-Nest.patch +Patch0764: hw-arm-virt-acpi-build-Build-IORT-with-multiple-SMMU.patch +Patch0765: tests-qtest-Allow-IORT-acpi-table-to-change.patch +Patch0766: hw-arm-virt-acpi-build-Add-IORT-RMR-regions-to-handl.patch +Patch0767: tests-data-acpi-virt-Update-IORT-acpi-table.patch +Patch0768: 
iommufd.h-Updated-to-openeuler-olk-6.6-kernel.patch +Patch0769: hw-arm-smmuv3-Enable-sva-stall-IDR-features.patch +Patch0770: kvm-Translate-MSI-doorbell-address-only-if-it-is-val.patch +Patch0771: smmuv3-Add-support-for-page-fault-handling.patch +Patch0772: pci-Get-pasid-capability-from-vIOMMU.patch +Patch0773: backend-iommufd-Report-PASID-capability.patch +Patch0774: vfio-Synthesize-vPASID-capability-to-VM.patch +Patch0775: smmuv3-realize-get_pasid_cap-and-set-ssidsize-with-p.patch +Patch0776: smmu-common-Return-sysmem-address-space-only-for-vfi.patch +Patch0777: smmuv3-Change-arm-smmuv3-nested-name-to-arm-smmuv3-a.patch +Patch0778: smmuv3-Use-default-bus-for-arm-smmuv3-accel.patch +Patch0779: gpex-acpi-Remove-duplicate-DSM-5.patch +Patch0780: Revert-linux-user-Print-tid-not-pid-with-strace.patch BuildRequires: flex BuildRequires: gcc @@ -1220,6 +1392,180 @@ getent passwd qemu >/dev/null || \ %endif %changelog +* Tue Apr 22 2025 Jiabo Feng - 11:8.2.0-30 +- Revert "linux-user: Print tid not pid with strace" +- gpex-acpi: Remove duplicate DSM #5 +- smmuv3: Use default bus for arm-smmuv3-accel +- smmuv3: Change arm-smmuv3-nested name to arm-smmuv3-accel +- smmu-common: Return sysmem address space only for vfio-pci +- smmuv3: realize get_pasid_cap and set ssidsize with pasid +- vfio: Synthesize vPASID capability to VM +- backend/iommufd: Report PASID capability +- pci: Get pasid capability from vIOMMU +- smmuv3: Add support for page fault handling +- kvm: Translate MSI doorbell address only if it is valid +- hw/arm/smmuv3: Enable sva/stall IDR features +- iommufd.h: Updated to openeuler olk-6.6 kernel +- tests/data/acpi/virt: Update IORT acpi table +- hw/arm/virt-acpi-build: Add IORT RMR regions to handle MSI nested binding +- tests/qtest: Allow IORT acpi table to change +- hw/arm/virt-acpi-build: Build IORT with multiple SMMU nodes +- hw/arm/smmuv3: Associate a pci bus with a SMMUv3 Nested device +- hw/arm/smmuv3: Add initial support for SMMUv3 Nested device +- 
hw/arm/virt: Add an SMMU_IO_LEN macro +- hw/pci-host/gpex: [needs kernel fix] Allow to generate preserve boot config DSM #5 +- tests/data/acpi: Update DSDT acpi tables +- acpi/gpex: Fix PCI Express Slot Information function 0 returned value +- tests/qtest: Allow DSDT acpi tables to change +- hw/arm/smmuv3: Forward cache invalidate commands via iommufd +- hw/arm/smmu-common: Replace smmu_iommu_mr with smmu_find_sdev +- hw/arm/smmuv3: Add missing STE invalidation +- hw/arm/smmuv3: Add smmu_dev_install_nested_ste() for CFGI_STE +- hw/arm/smmuv3: Check idr registers for STE_S1CDMAX and STE_S1STALLD +- hw/arm/smmuv3: Read host SMMU device info +- hw/arm/smmuv3: Ignore IOMMU_NOTIFIER_MAP for nested-smmuv3 +- hw/arm/smmu-common: Return sysmem if stage-1 is bypassed +- hw/arm/smmu-common: Add iommufd helpers +- hw/arm/smmu-common: Add set/unset_iommu_device callback +- hw/arm/smmu-common: Extract smmu_get_sbus and smmu_get_sdev helpers +- hw/arm/smmu-common: Bypass emulated IOTLB for a nested SMMU +- hw/arm/smmu-common: Add a nested flag to SMMUState +- backends/iommufd: Introduce iommufd_viommu_invalidate_cache +- backends/iommufd: Introduce iommufd_vdev_alloc +- backends/iommufd: Introduce iommufd_backend_alloc_viommu +- vfio/iommufd: Implement [at|de]tach_hwpt handlers +- vfio/iommufd: Implement HostIOMMUDeviceClass::realize_late() handler +- HostIOMMUDevice: Introduce realize_late callback +- vfio/iommufd: Add properties and handlers to TYPE_HOST_IOMMU_DEVICE_IOMMUFD +- backends/iommufd: Add helpers for invalidating user-managed HWPT +- Update iommufd.h header for vSVA +- vfio/common: Allow disabling device dirty page tracking +- vfio/migration: Don't block migration device dirty tracking is unsupported +- vfio/iommufd: Implement VFIOIOMMUClass::query_dirty_bitmap support +- vfio/iommufd: Implement VFIOIOMMUClass::set_dirty_tracking support +- vfio/iommufd: Probe and request hwpt dirty tracking capability +- vfio/{iommufd, container}: Invoke HostIOMMUDevice::realize() 
during attach_device() +- vfio/iommufd: Add hw_caps field to HostIOMMUDeviceCaps +- vfio/{iommufd,container}: Remove caps::aw_bits +- HostIOMMUDevice: Store the VFIO/VDPA agent +- vfio/iommufd: Introduce auto domain creation +- vfio/ccw: Don't initialize HOST_IOMMU_DEVICE with mdev +- vfio/ap: Don't initialize HOST_IOMMU_DEVICE with mdev +- vfio/iommufd: Return errno in iommufd_cdev_attach_ioas_hwpt() +- backends/iommufd: Extend iommufd_backend_get_device_info() to fetch HW capabilities +- vfio/iommufd: Don't initialize nor set a HOST_IOMMU_DEVICE with mdev +- vfio/pci: Extract mdev check into an helper +- intel_iommu: Check compatibility with host IOMMU capabilities +- intel_iommu: Implement [set|unset]_iommu_device() callbacks +- intel_iommu: Extract out vtd_cap_init() to initialize cap/ecap +- vfio/pci: Pass HostIOMMUDevice to vIOMMU +- hw/pci: Introduce pci_device_[set|unset]_iommu_device() +- hw/pci: Introduce helper function pci_device_get_iommu_bus_devfn() +- vfio: Create host IOMMU device instance +- backends/iommufd: Implement HostIOMMUDeviceClass::get_cap() handler +- vfio/container: Implement HostIOMMUDeviceClass::get_cap() handler +- vfio/iommufd: Implement HostIOMMUDeviceClass::realize() handler +- backends/iommufd: Introduce helper function iommufd_backend_get_device_info() +- vfio/container: Implement HostIOMMUDeviceClass::realize() handler +- range: Introduce range_get_last_bit() +- backends/iommufd: Introduce TYPE_HOST_IOMMU_DEVICE_IOMMUFD[_VFIO] devices +- vfio/container: Introduce TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO device +- backends/host_iommu_device: Introduce HostIOMMUDeviceCaps +- backends: Introduce HostIOMMUDevice abstract +- vfio/iommufd: Remove CONFIG_IOMMUFD usage +- vfio/spapr: Extend VFIOIOMMUOps with a release handler +- vfio/spapr: Only compile sPAPR IOMMU support when needed +- vfio/iommufd: Introduce a VFIOIOMMU iommufd QOM interface +- vfio/spapr: Introduce a sPAPR VFIOIOMMU QOM interface +- vfio/container: Intoduce a new 
VFIOIOMMUClass::setup handler +- vfio/container: Introduce a VFIOIOMMU legacy QOM interface +- vfio/container: Introduce a VFIOIOMMU QOM interface +- vfio/container: Initialize VFIOIOMMUOps under vfio_init_container() +- vfio/container: Introduce vfio_legacy_setup() for further cleanups +- docs/devel: Add VFIO iommufd backend documentation +- vfio: Introduce a helper function to initialize VFIODevice +- vfio/ccw: Move VFIODevice initializations in vfio_ccw_instance_init +- vfio/ap: Move VFIODevice initializations in vfio_ap_instance_init +- vfio/platform: Move VFIODevice initializations in vfio_platform_instance_init +- vfio/pci: Move VFIODevice initializations in vfio_instance_init +- hw/i386: Activate IOMMUFD for q35 machines +- kconfig: Activate IOMMUFD for s390x machines +- hw/arm: Activate IOMMUFD for virt machines +- vfio: Make VFIOContainerBase poiner parameter const in VFIOIOMMUOps callbacks +- vfio/ccw: Make vfio cdev pre-openable by passing a file handle +- vfio/ccw: Allow the selection of a given iommu backend +- vfio/ap: Make vfio cdev pre-openable by passing a file handle +- vfio/ap: Allow the selection of a given iommu backend +- vfio/platform: Make vfio cdev pre-openable by passing a file handle +- vfio/platform: Allow the selection of a given iommu backend +- vfio/pci: Make vfio cdev pre-openable by passing a file handle +- vfio/pci: Allow the selection of a given iommu backend +- vfio/iommufd: Enable pci hot reset through iommufd cdev interface +- vfio/pci: Introduce a vfio pci hot reset interface +- vfio/pci: Extract out a helper vfio_pci_get_pci_hot_reset_info +- vfio/iommufd: Add support for iova_ranges and pgsizes +- vfio/iommufd: Relax assert check for iommufd backend +- vfio/iommufd: Implement the iommufd backend +- vfio/common: return early if space isn't empty +- util/char_dev: Add open_cdev() +- backends/iommufd: Introduce the iommufd object +- vfio/spapr: Move hostwin_list into spapr container +- vfio/spapr: Move prereg_listener into 
spapr container +- vfio/spapr: switch to spapr IOMMU BE add/del_section_window +- vfio/spapr: Introduce spapr backend and target interface +- vfio/container: Implement attach/detach_device +- vfio/container: Move iova_ranges to base container +- vfio/container: Move dirty_pgsizes and max_dirty_bitmap_size to base container +- vfio/container: Move listener to base container +- vfio/container: Move vrdl_list to base container +- vfio/container: Move pgsizes and dma_max_mappings to base container +- vfio/container: Convert functions to base container +- vfio/container: Move per container device list in base container +- vfio/container: Switch to IOMMU BE set_dirty_page_tracking/query_dirty_bitmap API +- vfio/container: Move space field to base container +- vfio/common: Move giommu_list in base container +- vfio/common: Introduce vfio_container_init/destroy helper +- vfio/container: Switch to dma_map|unmap API +- vfio/container: Introduce a empty VFIOIOMMUOps +- vfio: Introduce base object for VFIOContainer and targeted interface +- cryptodev: Fix error handling in cryptodev_lkcf_execute_task() +- hw/xen: Fix xen_bus_realize() error handling +- hw/misc/aspeed_hace: Fix buffer overflow in has_padding function +- target/s390x: Fix a typo in s390_cpu_class_init() +- hw/sd/sdhci: free irq on exit +- hw/ufs: free irq on exit +- hw/pci-host/designware: Fix ATU_UPPER_TARGET register access +- target/i386: Make invtsc migratable when user sets tsc-khz explicitly +- target/i386: Construct CPUID 2 as stateful iff times > 1 +- target/i386: Enable fdp-excptn-only and zero-fcs-fds +- target/i386: Don't construct a all-zero entry for CPUID[0xD 0x3f] +- i386/cpuid: Remove subleaf constraint on CPUID leaf 1F +- target/i386: pass X86CPU to x86_cpu_get_supported_feature_word +- target/i386: Raise the highest index value used for any VMCS encoding +- target/i386: Add VMX control bits for nested FRED support +- target/i386: Delete duplicated macro definition CR4_FRED_MASK +- target/i386: 
Add get/set/migrate support for FRED MSRs +- target/i386: enumerate VMX nested-exception support +- vmxcap: add support for VMX FRED controls +- target/i386: mark CR4.FRED not reserved +- target/i386: add support for FRED in CPUID enumeration +- target/i386: fix feature dependency for WAITPKG +- target/i386: Add more features enumerated by CPUID.7.2.EDX +- net: fix build when libbpf is disabled, but libxdp is enabled +- hw/nvme: fix invalid endian conversion +- hw/nvme: fix invalid check on mcl +- backends/cryptodev: Do not ignore throttle/backends Errors +- backends/cryptodev: Do not abort for invalid session ID +- virtcca: add kvm isolation when get tmi version. +- qga: Don't daemonize before channel is initialized +- qga: Add log to guest-fsfreeze-thaw command +- backends: VirtCCA: cvm_gpa_start supports both 1GB and 3GB +- BUGFIX: Enforce isolation for virtcca_shared_hugepage +- arm: VirtCCA: qemu CoDA support UEFI boot +- arm: VirtCCA: Compatibility with older versions of TMM and the kernel +- arm: VirtCCA: qemu uefi boot support kae +- arm: VirtCCA: CVM support UEFI boot + * Fri Feb 21 2025 Jiabo Feng - 11:8.2.0-29 - target/i386: csv: Support inject secret for CSV3 guest only if the extension is enabled - target/i386: csv: Support load kernel hashes for CSV3 guest only if the extension is enabled diff --git a/qga-Add-log-to-guest-fsfreeze-thaw-command.patch b/qga-Add-log-to-guest-fsfreeze-thaw-command.patch new file mode 100644 index 0000000000000000000000000000000000000000..693f089ff513589ddffe4bf2e0f267f51e658558 --- /dev/null +++ b/qga-Add-log-to-guest-fsfreeze-thaw-command.patch @@ -0,0 +1,48 @@ +From 3a14516128cf936906e5f519bf7808b9a977a757 Mon Sep 17 00:00:00 2001 +From: qihao_yewu +Date: Fri, 7 Mar 2025 21:57:29 -0500 +Subject: [PATCH] qga: Add log to guest-fsfreeze-thaw command +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +cherry-pick from ad1e6843632555c771dda6a9425930fa25b71fb3 + +Reviewed-by: Daniel 
P. Berrangé +Message-ID: <20241216154552.213961-2-kkostiuk@redhat.com> +Signed-off-by: Konstantin Kostiuk +Signed-off-by: qihao_yewu +--- + qga/commands-posix.c | 1 + + qga/commands-win32.c | 3 +++ + 2 files changed, 4 insertions(+) + +diff --git a/qga/commands-posix.c b/qga/commands-posix.c +index 6169bbf7a0..f0d8e9e9c5 100644 +--- a/qga/commands-posix.c ++++ b/qga/commands-posix.c +@@ -759,6 +759,7 @@ int64_t qmp_guest_fsfreeze_thaw(Error **errp) + ret = qmp_guest_fsfreeze_do_thaw(errp); + if (ret >= 0) { + ga_unset_frozen(ga_state); ++ slog("guest-fsthaw called"); + execute_fsfreeze_hook(FSFREEZE_HOOK_THAW, errp); + } else { + ret = 0; +diff --git a/qga/commands-win32.c b/qga/commands-win32.c +index 697c65507c..656d1459f1 100644 +--- a/qga/commands-win32.c ++++ b/qga/commands-win32.c +@@ -1275,6 +1275,9 @@ int64_t qmp_guest_fsfreeze_thaw(Error **errp) + qga_vss_fsfreeze(&i, false, NULL, errp); + + ga_unset_frozen(ga_state); ++ ++ slog("guest-fsthaw called"); ++ + return i; + } + +-- +2.41.0.windows.1 + diff --git a/qga-Don-t-daemonize-before-channel-is-initialized.patch b/qga-Don-t-daemonize-before-channel-is-initialized.patch new file mode 100644 index 0000000000000000000000000000000000000000..0f76c48486338ccfea82d0c65f8392d7d5e8b9af --- /dev/null +++ b/qga-Don-t-daemonize-before-channel-is-initialized.patch @@ -0,0 +1,106 @@ +From 752d98d93459c87817be5e02c39257e0fa5934f8 Mon Sep 17 00:00:00 2001 +From: qihao_yewu +Date: Fri, 7 Mar 2025 21:07:11 -0500 +Subject: [PATCH] qga: Don't daemonize before channel is initialized +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +cherry-pick from c6f5dd7ac8ef62dcdec4cdeda1467c658161afff + +If the agent is set to daemonize but for whatever reason fails to +init the channel, the error message is lost. Worse, the agent +daemonizes needlessly and returns success. 
For instance: + + # qemu-ga -m virtio-serial \ + -p /dev/nonexistent_device \ + -f /run/qemu-ga.pid \ + -t /run \ + -d + # echo $? + 0 + +This makes it needlessly hard for init scripts to detect a +failure in qemu-ga startup. Though, they shouldn't pass '-d' in +the first place. + +Let's open the channel first and only after that become a daemon. + +Related bug: https://bugs.gentoo.org/810628 + +Signed-off-by: Michal Privoznik +Reviewed-by: Ján Tomko +Reviewed-by: Konstantin Kostiuk +Message-ID: <7a42b0cbda5c7e01cf76bc1b29a1210cd018fa78.1736261360.git.mprivozn@redhat.com> +Signed-off-by: Konstantin Kostiuk +Signed-off-by: qihao_yewu +--- + qga/main.c | 24 ++++++++++++++++++------ + 1 file changed, 18 insertions(+), 6 deletions(-) + +diff --git a/qga/main.c b/qga/main.c +index c4dcbb86be..8d341ffdf1 100644 +--- a/qga/main.c ++++ b/qga/main.c +@@ -1407,7 +1407,6 @@ static GAState *initialize_agent(GAConfig *config, int socket_activation) + if (config->daemonize) { + /* delay opening/locking of pidfile till filesystems are unfrozen */ + s->deferred_options.pid_filepath = config->pid_filepath; +- become_daemon(NULL); + } + if (config->log_filepath) { + /* delay opening the log file till filesystems are unfrozen */ +@@ -1416,9 +1415,6 @@ static GAState *initialize_agent(GAConfig *config, int socket_activation) + ga_disable_logging(s); + qmp_for_each_command(&ga_commands, ga_disable_not_allowed_freeze, NULL); + } else { +- if (config->daemonize) { +- become_daemon(config->pid_filepath); +- } + if (config->log_filepath) { + FILE *log_file = ga_open_logfile(config->log_filepath); + if (!log_file) { +@@ -1482,6 +1478,20 @@ static GAState *initialize_agent(GAConfig *config, int socket_activation) + } + #endif + ++ if (!channel_init(s, s->config->method, s->config->channel_path, ++ s->socket_activation ? 
FIRST_SOCKET_ACTIVATION_FD : -1)) { ++ g_critical("failed to initialize guest agent channel"); ++ return NULL; ++ } ++ ++ if (config->daemonize) { ++ if (ga_is_frozen(s)) { ++ become_daemon(NULL); ++ } else { ++ become_daemon(config->pid_filepath); ++ } ++ } ++ + ga_state = s; + return s; + failed: +@@ -1516,8 +1526,9 @@ static void cleanup_agent(GAState *s) + + static int run_agent_once(GAState *s) + { +- if (!channel_init(s, s->config->method, s->config->channel_path, +- s->socket_activation ? FIRST_SOCKET_ACTIVATION_FD : -1)) { ++ if (!s->channel && ++ channel_init(s, s->config->method, s->config->channel_path, ++ s->socket_activation ? FIRST_SOCKET_ACTIVATION_FD : -1)) { + g_critical("failed to initialize guest agent channel"); + return EXIT_FAILURE; + } +@@ -1526,6 +1537,7 @@ static int run_agent_once(GAState *s) + + if (s->channel) { + ga_channel_free(s->channel); ++ s->channel = NULL; + } + + return EXIT_SUCCESS; +-- +2.41.0.windows.1 + diff --git a/range-Introduce-range_get_last_bit.patch b/range-Introduce-range_get_last_bit.patch new file mode 100644 index 0000000000000000000000000000000000000000..427a14d8d1c8660c84a075f3436d5166a512c4fb --- /dev/null +++ b/range-Introduce-range_get_last_bit.patch @@ -0,0 +1,52 @@ +From 30150b8727e9ec41f83c4dfcd93f04b766357469 Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Wed, 5 Jun 2024 16:30:31 +0800 +Subject: [PATCH] range: Introduce range_get_last_bit() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This helper get the highest 1 bit position of the upper bound. + +If the range is empty or upper bound is zero, -1 is returned. + +Suggested-by: Cédric Le Goater +Signed-off-by: Zhenzhong Duan +Reviewed-by: Eric Auger +Reviewed-by: Michael S. 
Tsirkin +--- + include/qemu/range.h | 11 +++++++++++ + 1 file changed, 11 insertions(+) + +diff --git a/include/qemu/range.h b/include/qemu/range.h +index 205e1da76d..4ce694a398 100644 +--- a/include/qemu/range.h ++++ b/include/qemu/range.h +@@ -20,6 +20,8 @@ + #ifndef QEMU_RANGE_H + #define QEMU_RANGE_H + ++#include "qemu/bitops.h" ++ + /* + * Operations on 64 bit address ranges. + * Notes: +@@ -217,6 +219,15 @@ static inline int ranges_overlap(uint64_t first1, uint64_t len1, + return !(last2 < first1 || last1 < first2); + } + ++/* Get highest non-zero bit position of a range */ ++static inline int range_get_last_bit(Range *range) ++{ ++ if (range_is_empty(range)) { ++ return -1; ++ } ++ return 63 - clz64(range->upb); ++} ++ + /* + * Return -1 if @a < @b, 1 @a > @b, and 0 if they touch or overlap. + * Both @a and @b must not be empty. +-- +2.41.0.windows.1 + diff --git a/smmu-common-Return-sysmem-address-space-only-for-vfi.patch b/smmu-common-Return-sysmem-address-space-only-for-vfi.patch new file mode 100644 index 0000000000000000000000000000000000000000..d68ab699d84cb1b67896ba70d1a7c1c7888cb464 --- /dev/null +++ b/smmu-common-Return-sysmem-address-space-only-for-vfi.patch @@ -0,0 +1,39 @@ +From 58f66c2581b3c4a45a02717330f1b2188424889b Mon Sep 17 00:00:00 2001 +From: Shameer Kolothum +Date: Wed, 15 Jan 2025 16:11:21 +0000 +Subject: [PATCH] smmu-common: Return sysmem address space only for vfio-pci + +This will enable pcie-root-port hotplug event irq to work. 
+ +Discussion Link: https://lore.kernel.org/qemu-devel/74114c0db34b420a90e9fe5bd991767e@huawei.com/ + +Signed-off-by: Shameer Kolothum +--- + hw/arm/smmu-common.c | 9 ++++++++- + 1 file changed, 8 insertions(+), 1 deletion(-) + +diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c +index 3a257a5b0e..6c4b82757f 100644 +--- a/hw/arm/smmu-common.c ++++ b/hw/arm/smmu-common.c +@@ -639,9 +639,16 @@ static AddressSpace *smmu_find_add_as(PCIBus *bus, void *opaque, int devfn) + SMMUState *s = opaque; + SMMUPciBus *sbus = smmu_get_sbus(s, bus); + SMMUDevice *sdev = smmu_get_sdev(s, sbus, bus, devfn); ++ bool is_vfio = false; ++ PCIDevice *pdev; ++ ++ pdev = pci_find_device(bus, pci_bus_num(bus), devfn); ++ if (object_dynamic_cast(OBJECT(pdev), "vfio-pci")) { ++ is_vfio = true; ++ } + + /* Return the system as if the device uses stage-2 only */ +- if (s->nested && !sdev->s1_hwpt) { ++ if (s->nested && !sdev->s1_hwpt && is_vfio) { + return &sdev->as_sysmem; + } else { + return &sdev->as; +-- +2.41.0.windows.1 + diff --git a/smmuv3-Add-support-for-page-fault-handling.patch b/smmuv3-Add-support-for-page-fault-handling.patch new file mode 100644 index 0000000000000000000000000000000000000000..6e4ce9dc21df5d7ec6925c995bf6fe7e26d630e3 --- /dev/null +++ b/smmuv3-Add-support-for-page-fault-handling.patch @@ -0,0 +1,462 @@ +From ebfa7213e32faafd5532d6f5b3cb873018b671ae Mon Sep 17 00:00:00 2001 +From: Shameer Kolothum +Date: Thu, 10 Oct 2024 06:19:31 +0000 +Subject: [PATCH] smmuv3: Add support for page fault handling + +Handle page fault from host and send response back. 
+ +Signed-off-by: Shameer Kolothum +--- + backends/iommufd.c | 20 +++- + hw/arm/smmu-common.c | 39 ++++++-- + hw/arm/smmuv3.c | 188 ++++++++++++++++++++++++++++++++++- + hw/vfio/iommufd.c | 2 +- + include/hw/arm/smmu-common.h | 24 ++++- + include/sysemu/iommufd.h | 2 +- + 6 files changed, 263 insertions(+), 12 deletions(-) + +diff --git a/backends/iommufd.c b/backends/iommufd.c +index ee6f5bcf65..e9ce82297b 100644 +--- a/backends/iommufd.c ++++ b/backends/iommufd.c +@@ -228,7 +228,7 @@ bool iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id, + uint32_t pt_id, uint32_t flags, + uint32_t data_type, uint32_t data_len, + void *data_ptr, uint32_t *out_hwpt, +- Error **errp) ++ uint32_t *out_fault_fd, Error **errp) + { + int ret, fd = be->fd; + struct iommu_hwpt_alloc alloc_hwpt = { +@@ -241,6 +241,24 @@ bool iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id, + .data_uptr = (uintptr_t)data_ptr, + }; + ++ if (flags & IOMMU_HWPT_FAULT_ID_VALID) { ++ ++ struct iommu_fault_alloc cmd = { ++ .size = sizeof(cmd), ++ }; ++ ++ ret = ioctl(fd, IOMMU_FAULT_QUEUE_ALLOC, &cmd); ++ if (ret) { ++ ret = -errno; ++ error_report("IOMMU_FAULT_ALLOC failed: %m"); ++ } else { ++ alloc_hwpt.fault_id = cmd.out_fault_id; ++ if (out_fault_fd) { ++ *out_fault_fd = cmd.out_fault_fd; ++ } ++ } ++ } ++ + ret = ioctl(fd, IOMMU_HWPT_ALLOC, &alloc_hwpt); + trace_iommufd_backend_alloc_hwpt(fd, dev_id, pt_id, flags, data_type, + data_len, (uintptr_t)data_ptr, +diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c +index d0bc620606..c382fa16e5 100644 +--- a/hw/arm/smmu-common.c ++++ b/hw/arm/smmu-common.c +@@ -670,7 +670,7 @@ static bool smmu_dev_attach_viommu(SMMUDevice *sdev, + if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid, idev->ioas_id, + IOMMU_HWPT_ALLOC_NEST_PARENT, + IOMMU_HWPT_DATA_NONE, 0, NULL, +- &s2_hwpt_id, errp)) { ++ &s2_hwpt_id, NULL, errp)) { + error_setg(errp, "failed to allocate an S2 hwpt"); + return false; + } +@@ -695,7 +695,7 @@ static bool 
smmu_dev_attach_viommu(SMMUDevice *sdev, + viommu->core->viommu_id, 0, + IOMMU_HWPT_DATA_ARM_SMMUV3, + sizeof(abort_data), &abort_data, +- &viommu->abort_hwpt_id, errp)) { ++ &viommu->abort_hwpt_id, NULL, errp)) { + error_setg(errp, "failed to allocate an abort pagetable"); + goto free_viommu_core; + } +@@ -704,7 +704,7 @@ static bool smmu_dev_attach_viommu(SMMUDevice *sdev, + viommu->core->viommu_id, 0, + IOMMU_HWPT_DATA_ARM_SMMUV3, + sizeof(bypass_data), &bypass_data, +- &viommu->bypass_hwpt_id, errp)) { ++ &viommu->bypass_hwpt_id, NULL, errp)) { + error_setg(errp, "failed to allocate a bypass pagetable"); + goto free_abort_hwpt; + } +@@ -882,6 +882,25 @@ void smmu_dev_uninstall_nested_ste(SMMUDevice *sdev, bool abort) + hwpt_id = sdev->viommu->bypass_hwpt_id; + } + ++ /* ToDo: May be better to move the below to smmuv3. */ ++ if (s1_hwpt->out_fault_fd) { ++ struct io_uring *ring = &s1_hwpt->fault_ring; ++ struct io_uring_sqe *sqe; ++ struct __kernel_timespec ts = {.tv_sec = 0, .tv_nsec = 1}; ++ ++ s1_hwpt->exiting = true; ++ /* Send out a timeout sqe for the read handler to exit */ ++ sqe = io_uring_get_sqe(ring); ++ io_uring_prep_timeout(sqe, &ts, 0, 0); ++ io_uring_submit(ring); ++ ++ qemu_cond_signal(&s1_hwpt->fault_cond); ++ qemu_thread_join(&s1_hwpt->read_fault_thread); ++ qemu_thread_join(&s1_hwpt->write_fault_thread); ++ qemu_mutex_destroy(&s1_hwpt->fault_mutex); ++ io_uring_queue_exit(&s1_hwpt->fault_ring); ++ } ++ + if (!host_iommu_device_iommufd_attach_hwpt(idev, hwpt_id, NULL)) { + return; + } +@@ -892,11 +911,13 @@ void smmu_dev_uninstall_nested_ste(SMMUDevice *sdev, bool abort) + } + + int smmu_dev_install_nested_ste(SMMUDevice *sdev, uint32_t data_type, +- uint32_t data_len, void *data) ++ uint32_t data_len, void *data, ++ bool req_fault_fd) + { + SMMUViommu *viommu = sdev->viommu; + SMMUS1Hwpt *s1_hwpt = sdev->s1_hwpt; + HostIOMMUDeviceIOMMUFD *idev = sdev->idev; ++ uint32_t flags = 0; + + if (!idev || !viommu) { + return -ENOENT; +@@ -912,12 
+933,18 @@ int smmu_dev_install_nested_ste(SMMUDevice *sdev, uint32_t data_type, + } + + s1_hwpt->smmu = sdev->smmu; ++ s1_hwpt->sdev = sdev; + s1_hwpt->viommu = viommu; + s1_hwpt->iommufd = idev->iommufd; + ++ if (req_fault_fd) { ++ flags |= IOMMU_HWPT_FAULT_ID_VALID; ++ } ++ + if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid, +- viommu->core->viommu_id, 0, data_type, +- data_len, data, &s1_hwpt->hwpt_id, NULL)) { ++ viommu->core->viommu_id, flags, data_type, ++ data_len, data, &s1_hwpt->hwpt_id, ++ &s1_hwpt->out_fault_fd, NULL)) { + goto free; + } + +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index 8d8dcccd48..30c0ae4c3b 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -34,6 +34,9 @@ + #include "hw/arm/smmuv3.h" + #include "smmuv3-internal.h" + #include "smmu-internal.h" ++#ifdef CONFIG_LINUX_IO_URING ++#include ++#endif + + #define PTW_RECORD_FAULT(cfg) (((cfg)->stage == 1) ? (cfg)->record_faults : \ + (cfg)->s2cfg.record_faults) +@@ -1258,6 +1261,165 @@ static void smmuv3_range_inval(SMMUState *s, Cmd *cmd) + } + } + ++static void smmuv3_report_iommu_fault(SMMUS1Hwpt *hwpt, ++ struct iommu_hwpt_pgfault *fault) ++{ ++ PendFaultEntry *pend; ++ SMMUDevice *sdev = hwpt->sdev; ++ SMMUv3State *s3 = sdev->smmu; ++ uint32_t sid = smmu_get_sid(sdev); ++ SMMUEventInfo info = {0}; ++ ++ info.sid = sid; ++ info.type = SMMU_EVT_F_TRANSLATION; ++ info.u.f_translation.addr = fault->addr; ++ info.u.f_translation.stall = true; ++ info.u.f_translation.ssid = fault->pasid; ++ info.u.f_translation.stag = fault->grpid; ++ ++ if (fault->flags | IOMMU_PGFAULT_FLAGS_PASID_VALID) { ++ info.u.f_translation.ssv = true; ++ } ++ if (fault->perm & IOMMU_PGFAULT_PERM_READ) { ++ info.u.f_translation.rnw = true; ++ } ++ if (fault->perm & IOMMU_PGFAULT_PERM_PRIV) { ++ info.u.f_translation.pnu = true; ++ } ++ if (fault->perm & IOMMU_PGFAULT_PERM_EXEC) { ++ info.u.f_translation.ind = true; ++ } ++ ++ pend = g_new0(PendFaultEntry, 1); ++ memcpy(&pend->fault, fault, 
sizeof(*fault)); ++ qemu_mutex_lock(&hwpt->fault_mutex); ++ QTAILQ_INSERT_TAIL(&hwpt->pendfault, pend, entry); ++ qemu_mutex_unlock(&hwpt->fault_mutex); ++ smmuv3_record_event(s3, &info); ++ return; ++} ++ ++static void smmuv3_notify_stall_resume(SMMUState *bs, uint32_t sid, ++ uint32_t stag, uint32_t code) ++{ ++ SMMUDevice *sdev = smmu_find_sdev(bs, sid); ++ PageRespEntry *msg; ++ PendFaultEntry *pend, *tmp; ++ SMMUS1Hwpt *hwpt; ++ bool found = false; ++ ++ if (!sdev) { ++ return; ++ } ++ ++ hwpt = sdev->s1_hwpt; ++ msg = g_new0(PageRespEntry, 1); ++ ++ /* Kernel expects addr and pasid info for page response */ ++ qemu_mutex_lock(&hwpt->fault_mutex); ++ QTAILQ_FOREACH_SAFE(pend, &hwpt->pendfault, entry, tmp) { ++ if (pend->fault.grpid == stag) { ++ QTAILQ_REMOVE(&hwpt->pendfault, pend, entry); ++ msg->resp.cookie = pend->fault.cookie; ++ msg->resp.code = code; ++ QTAILQ_INSERT_TAIL(&hwpt->pageresp, msg, entry); ++ qemu_cond_signal(&hwpt->fault_cond); ++ ++ g_free(pend); ++ found = true; ++ break; ++ } ++ } ++ ++ qemu_mutex_unlock(&hwpt->fault_mutex); ++ if (!found) { ++ warn_report("No matching fault for resume(stag 0x%x), drop!", stag); ++ return; ++ } ++} ++ ++static void *write_fault_handler(void *opaque) ++{ ++ SMMUS1Hwpt *hwpt = opaque; ++ PageRespEntry *msg, *tmp; ++ struct iommu_hwpt_page_response *resp; ++ int ret; ++ ++ resp = g_new0(struct iommu_hwpt_page_response, 1); ++ while (!hwpt->exiting) { ++ /* Check we have any pending responses */ ++ qemu_mutex_lock(&hwpt->fault_mutex); ++ qemu_cond_wait(&hwpt->fault_cond, &hwpt->fault_mutex); ++ QTAILQ_FOREACH_SAFE(msg, &hwpt->pageresp, entry, tmp) { ++ QTAILQ_REMOVE(&hwpt->pageresp, msg, entry); ++ memcpy(resp, &msg->resp, sizeof(*resp)); ++ g_free(msg); ++ ++ ret = write(hwpt->out_fault_fd, resp, sizeof(*resp)); ++ if (ret != sizeof(*resp)) { ++ warn_report("Write resp[cookie 0x%x] fail %d", ++ resp->cookie, ret); ++ } ++ } ++ qemu_mutex_unlock(&hwpt->fault_mutex); ++ } ++ g_free(resp); ++ return NULL; ++} 
++ ++static void *read_fault_handler(void *opaque) ++{ ++ SMMUS1Hwpt *hwpt = opaque; ++ struct io_uring_sqe *sqe; ++ struct io_uring_cqe *cqe; ++ struct iommu_hwpt_pgfault *fault; ++ struct io_uring *ring = &hwpt->fault_ring; ++ void *data; ++ int ret; ++ ++ fault = g_new0(struct iommu_hwpt_pgfault, 1); ++ while (!hwpt->exiting) { ++ sqe = io_uring_get_sqe(ring); ++ io_uring_prep_read(sqe, hwpt->out_fault_fd, fault, ++ sizeof(*fault), 0); ++ io_uring_sqe_set_data(sqe, fault); ++ io_uring_submit(ring); ++ ++ ret = io_uring_wait_cqe(ring, &cqe); ++ if (ret == 0) { ++ if (cqe->res == sizeof(*fault)) { ++ data = io_uring_cqe_get_data(cqe); ++ smmuv3_report_iommu_fault(hwpt, data); ++ } ++ } else { ++ warn_report("Read fault[hwpt_id 0x%x] failed %d", ++ hwpt->hwpt_id, ret); ++ } ++ io_uring_cqe_seen(ring, cqe); ++ } ++ g_free(fault); ++ return NULL; ++} ++ ++static void create_fault_handlers(SMMUS1Hwpt *hwpt) ++{ ++ if (!hwpt->out_fault_fd) { ++ warn_report("No fault fd for hwpt id: %d", hwpt->hwpt_id); ++ return; ++ } ++ ++ io_uring_queue_init(1024, &hwpt->fault_ring, 0); ++ qemu_mutex_init(&hwpt->fault_mutex); ++ qemu_cond_init(&hwpt->fault_cond); ++ QTAILQ_INIT(&hwpt->pageresp); ++ QTAILQ_INIT(&hwpt->pendfault); ++ qemu_thread_create(&hwpt->read_fault_thread, "io fault read", ++ read_fault_handler, ++ hwpt, QEMU_THREAD_JOINABLE); ++ qemu_thread_create(&hwpt->write_fault_thread, "io fault write", ++ write_fault_handler, ++ hwpt, QEMU_THREAD_JOINABLE); ++} + static void smmuv3_install_nested_ste(SMMUDevice *sdev, int sid) + { + #ifdef __linux__ +@@ -1266,6 +1428,7 @@ static void smmuv3_install_nested_ste(SMMUDevice *sdev, int sid) + struct iommu_hwpt_arm_smmuv3 nested_data = {}; + SMMUv3State *s = sdev->smmu; + SMMUState *bs = &s->smmu_state; ++ bool req_fault_fd = false; + uint32_t config; + STE ste; + int ret; +@@ -1309,13 +1472,22 @@ static void smmuv3_install_nested_ste(SMMUDevice *sdev, int sid) + /* S1DSS | S1CIR | S1COR | S1CSH | S1STALLD | EATS */ + 
nested_data.ste[1] &= 0x380000ffULL; + ++ if (STE_S1CDMAX(&ste)) { ++ req_fault_fd = true; ++ } ++ + ret = smmu_dev_install_nested_ste(sdev, IOMMU_HWPT_DATA_ARM_SMMUV3, +- sizeof(nested_data), &nested_data); ++ sizeof(nested_data), &nested_data, ++ req_fault_fd); + if (ret) { + error_report("Unable to install nested STE=%16LX:%16LX, ret=%d", + nested_data.ste[1], nested_data.ste[0], ret); + } + ++ if (req_fault_fd) { ++ create_fault_handlers(sdev->s1_hwpt); ++ } ++ + trace_smmuv3_install_nested_ste(sid, nested_data.ste[1], nested_data.ste[0]); + #endif + } +@@ -1631,10 +1803,22 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) + case SMMU_CMD_TLBI_EL2_VA: + case SMMU_CMD_TLBI_EL2_VAA: + case SMMU_CMD_PRI_RESP: +- case SMMU_CMD_RESUME: + case SMMU_CMD_STALL_TERM: + trace_smmuv3_unhandled_cmd(type); + break; ++ case SMMU_CMD_RESUME: ++ { ++ uint32_t sid = CMD_SID(&cmd); ++ uint16_t stag = CMD_RESUME_STAG(&cmd); ++ uint8_t action = CMD_RESUME_AC(&cmd); ++ uint32_t code = IOMMUFD_PAGE_RESP_INVALID; ++ ++ if (action) { ++ code = IOMMUFD_PAGE_RESP_SUCCESS; ++ } ++ smmuv3_notify_stall_resume(bs, sid, stag, code); ++ break; ++ } + default: + cmd_error = SMMU_CERROR_ILL; + break; +diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c +index 528023b95b..c0eb87c78c 100644 +--- a/hw/vfio/iommufd.c ++++ b/hw/vfio/iommufd.c +@@ -344,7 +344,7 @@ static int iommufd_cdev_autodomains_get(VFIODevice *vbasedev, + if (!iommufd_backend_alloc_hwpt(iommufd, vbasedev->devid, + container->ioas_id, flags, + IOMMU_HWPT_DATA_NONE, 0, NULL, +- &hwpt_id, errp)) { ++ &hwpt_id, NULL, errp)) { + return -EINVAL; + } + +diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h +index e30539a8d4..087a11efc7 100644 +--- a/include/hw/arm/smmu-common.h ++++ b/include/hw/arm/smmu-common.h +@@ -138,13 +138,34 @@ typedef struct SMMUVdev { + uint32_t sid; + }SMMUVdev; + ++typedef struct PendFaultEntry { ++ struct iommu_hwpt_pgfault fault; ++ QTAILQ_ENTRY(PendFaultEntry) entry; ++} PendFaultEntry; 
++ ++typedef struct PageRespEntry { ++ struct iommu_hwpt_page_response resp; ++ QTAILQ_ENTRY(PageRespEntry) entry; ++} PageRespEntry; ++ + typedef struct SMMUS1Hwpt { ++ void *sdev; + void *smmu; + IOMMUFDBackend *iommufd; + SMMUViommu *viommu; + uint32_t hwpt_id; ++ uint32_t out_fault_fd; + QLIST_HEAD(, SMMUDevice) device_list; + QLIST_ENTRY(SMMUViommu) next; ++ /* fault handling */ ++ struct io_uring fault_ring; ++ QemuThread read_fault_thread; ++ QemuThread write_fault_thread; ++ QemuMutex fault_mutex; ++ QemuCond fault_cond; ++ QTAILQ_HEAD(, PageRespEntry) pageresp; ++ QTAILQ_HEAD(, PendFaultEntry) pendfault; ++ bool exiting; + } SMMUS1Hwpt; + + typedef struct SMMUDevice { +@@ -258,7 +279,8 @@ int smmu_dev_get_info(SMMUDevice *sdev, uint32_t *data_type, + uint32_t data_len, void *data); + void smmu_dev_uninstall_nested_ste(SMMUDevice *sdev, bool abort); + int smmu_dev_install_nested_ste(SMMUDevice *sdev, uint32_t data_type, +- uint32_t data_len, void *data); ++ uint32_t data_len, void *data, ++ bool req_fault_fd); + int smmu_hwpt_invalidate_cache(SMMUS1Hwpt *s1_hwpt, uint32_t type, uint32_t len, + uint32_t *num, void *reqs); + int smmu_viommu_invalidate_cache(IOMMUFDViommu *viommu, uint32_t type, +diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h +index 0f2c826036..b279184974 100644 +--- a/include/sysemu/iommufd.h ++++ b/include/sysemu/iommufd.h +@@ -62,7 +62,7 @@ bool iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id, + uint32_t pt_id, uint32_t flags, + uint32_t data_type, uint32_t data_len, + void *data_ptr, uint32_t *out_hwpt, +- Error **errp); ++ uint32_t *out_fault_fd, Error **errp); + bool iommufd_backend_set_dirty_tracking(IOMMUFDBackend *be, uint32_t hwpt_id, + bool start, Error **errp); + bool iommufd_backend_get_dirty_bitmap(IOMMUFDBackend *be, uint32_t hwpt_id, +-- +2.41.0.windows.1 + diff --git a/smmuv3-Change-arm-smmuv3-nested-name-to-arm-smmuv3-a.patch b/smmuv3-Change-arm-smmuv3-nested-name-to-arm-smmuv3-a.patch new 
file mode 100644 index 0000000000000000000000000000000000000000..6f09425545265544e561601f7a0d16f50c737348 --- /dev/null +++ b/smmuv3-Change-arm-smmuv3-nested-name-to-arm-smmuv3-a.patch @@ -0,0 +1,325 @@ +From 2697e7418c1e0d87c82feca33800e3a093546a90 Mon Sep 17 00:00:00 2001 +From: Shameer Kolothum +Date: Thu, 16 Jan 2025 15:20:18 +0000 +Subject: [PATCH] smmuv3: Change arm-smmuv3-nested name to arm-smmuv3-accel + +This is based on feedback received for RFC v1. + +Signed-off-by: Shameer Kolothum +--- + hw/arm/smmuv3.c | 38 +++++++++++++++++++------------------- + hw/arm/virt-acpi-build.c | 16 ++++++++-------- + hw/arm/virt.c | 24 ++++++++++++------------ + hw/core/sysbus-fdt.c | 2 +- + include/hw/arm/smmuv3.h | 8 ++++---- + include/hw/arm/virt.h | 10 +++++----- + 6 files changed, 49 insertions(+), 49 deletions(-) + +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index 6964ab000d..ecdad6bda4 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -2253,14 +2253,14 @@ static void smmu_realize(DeviceState *d, Error **errp) + smmu_init_irq(s, dev); + } + +-static int smmuv3_nested_pci_host_bridge(Object *obj, void *opaque) ++static int smmuv3_accel_pci_host_bridge(Object *obj, void *opaque) + { + DeviceState *d = opaque; +- SMMUv3NestedState *s_nested = ARM_SMMUV3_NESTED(d); ++ SMMUv3AccelState *s_accel = ARM_SMMUV3_ACCEL(d); + + if (object_dynamic_cast(obj, TYPE_PCI_HOST_BRIDGE)) { + PCIBus *bus = PCI_HOST_BRIDGE(obj)->bus; +- if (s_nested->pci_bus && !strcmp(bus->qbus.name, s_nested->pci_bus)) { ++ if (s_accel->pci_bus && !strcmp(bus->qbus.name, s_accel->pci_bus)) { + object_property_set_link(OBJECT(d), "primary-bus", OBJECT(bus), + &error_abort); + } +@@ -2268,15 +2268,15 @@ static int smmuv3_nested_pci_host_bridge(Object *obj, void *opaque) + return 0; + } + +-static void smmu_nested_realize(DeviceState *d, Error **errp) ++static void smmu_accel_realize(DeviceState *d, Error **errp) + { +- SMMUv3NestedState *s_nested = ARM_SMMUV3_NESTED(d); +- SMMUv3NestedClass *c 
= ARM_SMMUV3_NESTED_GET_CLASS(s_nested); ++ SMMUv3AccelState *s_nested = ARM_SMMUV3_ACCEL(d); ++ SMMUv3AccelClass *c = ARM_SMMUV3_ACCEL_GET_CLASS(s_nested); + SysBusDevice *dev = SYS_BUS_DEVICE(d); + Error *local_err = NULL; + + object_child_foreach_recursive(object_get_root(), +- smmuv3_nested_pci_host_bridge, d); ++ smmuv3_accel_pci_host_bridge, d); + object_property_set_bool(OBJECT(dev), "nested", true, &error_abort); + + c->parent_realize(d, &local_err); +@@ -2365,8 +2365,8 @@ static Property smmuv3_properties[] = { + DEFINE_PROP_END_OF_LIST() + }; + +-static Property smmuv3_nested_properties[] = { +- DEFINE_PROP_STRING("pci-bus", SMMUv3NestedState, pci_bus), ++static Property smmuv3_accel_properties[] = { ++ DEFINE_PROP_STRING("pci-bus", SMMUv3AccelState, pci_bus), + DEFINE_PROP_END_OF_LIST() + }; + +@@ -2389,15 +2389,15 @@ static void smmuv3_class_init(ObjectClass *klass, void *data) + device_class_set_props(dc, smmuv3_properties); + } + +-static void smmuv3_nested_class_init(ObjectClass *klass, void *data) ++static void smmuv3_accel_class_init(ObjectClass *klass, void *data) + { + DeviceClass *dc = DEVICE_CLASS(klass); +- SMMUv3NestedClass *c = ARM_SMMUV3_NESTED_CLASS(klass); ++ SMMUv3AccelClass *c = ARM_SMMUV3_ACCEL_CLASS(klass); + + dc->vmsd = &vmstate_smmuv3; +- device_class_set_parent_realize(dc, smmu_nested_realize, ++ device_class_set_parent_realize(dc, smmu_accel_realize, + &c->parent_realize); +- device_class_set_props(dc, smmuv3_nested_properties); ++ device_class_set_props(dc, smmuv3_accel_properties); + dc->user_creatable = true; + dc->hotpluggable = false; + } +@@ -2440,12 +2440,12 @@ static void smmuv3_iommu_memory_region_class_init(ObjectClass *klass, + imrc->notify_flag_changed = smmuv3_notify_flag_changed; + } + +-static const TypeInfo smmuv3_nested_type_info = { +- .name = TYPE_ARM_SMMUV3_NESTED, ++static const TypeInfo smmuv3_accel_type_info = { ++ .name = TYPE_ARM_SMMUV3_ACCEL, + .parent = TYPE_ARM_SMMUV3, +- .instance_size = 
sizeof(SMMUv3NestedState), +- .class_size = sizeof(SMMUv3NestedClass), +- .class_init = smmuv3_nested_class_init, ++ .instance_size = sizeof(SMMUv3AccelState), ++ .class_size = sizeof(SMMUv3AccelClass), ++ .class_init = smmuv3_accel_class_init, + }; + + static const TypeInfo smmuv3_type_info = { +@@ -2466,7 +2466,7 @@ static const TypeInfo smmuv3_iommu_memory_region_info = { + static void smmuv3_register_types(void) + { + type_register(&smmuv3_type_info); +- type_register(&smmuv3_nested_type_info); ++ type_register(&smmuv3_accel_type_info); + type_register(&smmuv3_iommu_memory_region_info); + } + +diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c +index ad0f79e03d..db635120f9 100644 +--- a/hw/arm/virt-acpi-build.c ++++ b/hw/arm/virt-acpi-build.c +@@ -418,10 +418,10 @@ static void acpi_dsdt_add_pci(Aml *scope, const MemMapEntry *memmap, + }; + + /* +- * Nested SMMU requires RMRs for MSI 1-1 mapping, which ++ * Accel SMMU requires RMRs for MSI 1-1 mapping, which + * require _DSM for PreservingPCI Boot Configurations + */ +- if (vms->iommu == VIRT_IOMMU_SMMUV3_NESTED) { ++ if (vms->iommu == VIRT_IOMMU_SMMUV3_ACCEL) { + cfg.preserve_config = true; + } + +@@ -619,10 +619,10 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) + /* Table 2 The IORT */ + acpi_table_begin(&table, table_data); + +- if (vms->smmu_nested_count) { +- irq = vms->irqmap[VIRT_SMMU_NESTED] + ARM_SPI_BASE; +- base = vms->memmap[VIRT_SMMU_NESTED].base; +- num_smmus = vms->smmu_nested_count; ++ if (vms->smmu_accel_count) { ++ irq = vms->irqmap[VIRT_SMMU_ACCEL] + ARM_SPI_BASE; ++ base = vms->memmap[VIRT_SMMU_ACCEL].base; ++ num_smmus = vms->smmu_accel_count; + } else if (virt_has_smmuv3(vms)) { + irq = vms->irqmap[VIRT_SMMU] + ARM_SPI_BASE; + base = vms->memmap[VIRT_SMMU].base; +@@ -655,7 +655,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) + } + + next_range.input_base = idmap->input_base + idmap->id_count; +- if (vms->iommu == 
VIRT_IOMMU_SMMUV3_NESTED) { ++ if (vms->iommu == VIRT_IOMMU_SMMUV3_ACCEL) { + nb_nodes++; /* RMR node per SMMU */ + } + } +@@ -775,7 +775,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) + build_iort_id_mapping(table_data, 0, 0x10000, IORT_NODE_OFFSET, 0); + } + +- if (vms->iommu == VIRT_IOMMU_SMMUV3_NESTED) { ++ if (vms->iommu == VIRT_IOMMU_SMMUV3_ACCEL) { + build_iort_rmr_nodes(table_data, smmu_idmaps, smmu_offset, &id); + } + +diff --git a/hw/arm/virt.c b/hw/arm/virt.c +index a55f297af2..57d00acd48 100644 +--- a/hw/arm/virt.c ++++ b/hw/arm/virt.c +@@ -166,7 +166,7 @@ static const MemMapEntry base_memmap[] = { + /* In the virtCCA scenario, this space is used for MSI interrupt mapping */ + [VIRT_CVM_MSI] = { 0x0a001000, 0x00fff000 }, + [VIRT_CPUFREQ] = { 0x0b000000, 0x00010000 }, +- [VIRT_SMMU_NESTED] = { 0x0b010000, 0x00ff0000}, ++ [VIRT_SMMU_ACCEL] = { 0x0b010000, 0x00ff0000}, + /* ...repeating for a total of NUM_VIRTIO_TRANSPORTS, each of that size */ + [VIRT_PLATFORM_BUS] = { 0x0c000000, 0x02000000 }, + [VIRT_SECURE_MEM] = { 0x0e000000, 0x01000000 }, +@@ -212,7 +212,7 @@ static const int a15irqmap[] = { + [VIRT_GIC_V2M] = 48, /* ...to 48 + NUM_GICV2M_SPIS - 1 */ + [VIRT_SMMU] = 74, /* ...to 74 + NUM_SMMU_IRQS - 1 */ + [VIRT_PLATFORM_BUS] = 112, /* ...to 112 + PLATFORM_BUS_NUM_IRQS -1 */ +- [VIRT_SMMU_NESTED] = 200, ++ [VIRT_SMMU_ACCEL] = 200, + }; + + static const char *valid_cpus[] = { +@@ -3619,27 +3619,27 @@ static void virt_machine_device_plug_cb(HotplugHandler *hotplug_dev, + + /* For smmuv3-nested devices we need to set the mem & irq */ + if (device_is_dynamic_sysbus(mc, dev) && +- object_dynamic_cast(OBJECT(dev), TYPE_ARM_SMMUV3_NESTED)) { +- hwaddr base = vms->memmap[VIRT_SMMU_NESTED].base; +- int irq = vms->irqmap[VIRT_SMMU_NESTED]; ++ object_dynamic_cast(OBJECT(dev), TYPE_ARM_SMMUV3_ACCEL)) { ++ hwaddr base = vms->memmap[VIRT_SMMU_ACCEL].base; ++ int irq = vms->irqmap[VIRT_SMMU_ACCEL]; + +- if (vms->smmu_nested_count >= 
MAX_SMMU_NESTED) { ++ if (vms->smmu_accel_count >= MAX_SMMU_ACCEL) { + error_setg(errp, "smmuv3-nested max count reached!"); + return; + } + +- base += (vms->smmu_nested_count * SMMU_IO_LEN); +- irq += (vms->smmu_nested_count * NUM_SMMU_IRQS); ++ base += (vms->smmu_accel_count * SMMU_IO_LEN); ++ irq += (vms->smmu_accel_count * NUM_SMMU_IRQS); + + sysbus_mmio_map(SYS_BUS_DEVICE(dev), 0, base); + for (int i = 0; i < 4; i++) { + sysbus_connect_irq(SYS_BUS_DEVICE(dev), i, + qdev_get_gpio_in(vms->gic, irq + i)); + } +- if (vms->iommu != VIRT_IOMMU_SMMUV3_NESTED) { +- vms->iommu = VIRT_IOMMU_SMMUV3_NESTED; ++ if (vms->iommu != VIRT_IOMMU_SMMUV3_ACCEL) { ++ vms->iommu = VIRT_IOMMU_SMMUV3_ACCEL; + } +- vms->smmu_nested_count++; ++ vms->smmu_accel_count++; + } + + if (vms->platform_bus_dev) { +@@ -3815,7 +3815,7 @@ static void virt_machine_class_init(ObjectClass *oc, void *data) + machine_class_allow_dynamic_sysbus_dev(mc, TYPE_VFIO_AMD_XGBE); + machine_class_allow_dynamic_sysbus_dev(mc, TYPE_RAMFB_DEVICE); + machine_class_allow_dynamic_sysbus_dev(mc, TYPE_VFIO_PLATFORM); +- machine_class_allow_dynamic_sysbus_dev(mc, TYPE_ARM_SMMUV3_NESTED); ++ machine_class_allow_dynamic_sysbus_dev(mc, TYPE_ARM_SMMUV3_ACCEL); + #ifdef CONFIG_TPM + machine_class_allow_dynamic_sysbus_dev(mc, TYPE_TPM_TIS_SYSBUS); + #endif +diff --git a/hw/core/sysbus-fdt.c b/hw/core/sysbus-fdt.c +index 0f0d0b3e58..58f4dc614c 100644 +--- a/hw/core/sysbus-fdt.c ++++ b/hw/core/sysbus-fdt.c +@@ -489,7 +489,7 @@ static const BindingEntry bindings[] = { + #ifdef CONFIG_LINUX + TYPE_BINDING(TYPE_VFIO_CALXEDA_XGMAC, add_calxeda_midway_xgmac_fdt_node), + TYPE_BINDING(TYPE_VFIO_AMD_XGBE, add_amd_xgbe_fdt_node), +- TYPE_BINDING("arm-smmuv3-nested", no_fdt_node), ++ TYPE_BINDING("arm-smmuv3-accel", no_fdt_node), + VFIO_PLATFORM_BINDING("amd,xgbe-seattle-v1a", add_amd_xgbe_fdt_node), + #endif + #ifdef CONFIG_TPM +diff --git a/include/hw/arm/smmuv3.h b/include/hw/arm/smmuv3.h +index 96513fce56..79b6fcd8e7 100644 +--- 
a/include/hw/arm/smmuv3.h ++++ b/include/hw/arm/smmuv3.h +@@ -84,16 +84,16 @@ struct SMMUv3Class { + #define TYPE_ARM_SMMUV3 "arm-smmuv3" + OBJECT_DECLARE_TYPE(SMMUv3State, SMMUv3Class, ARM_SMMUV3) + +-#define TYPE_ARM_SMMUV3_NESTED "arm-smmuv3-nested" +-OBJECT_DECLARE_TYPE(SMMUv3NestedState, SMMUv3NestedClass, ARM_SMMUV3_NESTED) ++#define TYPE_ARM_SMMUV3_ACCEL "arm-smmuv3-accel" ++OBJECT_DECLARE_TYPE(SMMUv3AccelState, SMMUv3AccelClass, ARM_SMMUV3_ACCEL) + +-struct SMMUv3NestedState { ++struct SMMUv3AccelState { + SMMUv3State smmuv3_state; + + char *pci_bus; + }; + +-struct SMMUv3NestedClass { ++struct SMMUv3AccelClass { + /*< private >*/ + SMMUv3Class smmuv3_class; + /*< public >*/ +diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h +index bc3c8b70da..3e2759d225 100644 +--- a/include/hw/arm/virt.h ++++ b/include/hw/arm/virt.h +@@ -110,7 +110,7 @@ typedef enum { + #define SMMU_IO_LEN 0x20000 + + /* Max supported nested SMMUv3 */ +-#define MAX_SMMU_NESTED 64 ++#define MAX_SMMU_ACCEL 64 + + enum { + VIRT_FLASH, +@@ -124,7 +124,7 @@ enum { + VIRT_GIC_ITS, + VIRT_GIC_REDIST, + VIRT_SMMU, +- VIRT_SMMU_NESTED, ++ VIRT_SMMU_ACCEL, + VIRT_UART, + VIRT_CPUFREQ, + VIRT_MMIO, +@@ -159,7 +159,7 @@ enum { + typedef enum VirtIOMMUType { + VIRT_IOMMU_NONE, + VIRT_IOMMU_SMMUV3, +- VIRT_IOMMU_SMMUV3_NESTED, ++ VIRT_IOMMU_SMMUV3_ACCEL, + VIRT_IOMMU_VIRTIO, + } VirtIOMMUType; + +@@ -227,7 +227,7 @@ struct VirtMachineState { + bool mte; + bool dtb_randomness; + bool pmu; +- int smmu_nested_count; ++ int smmu_accel_count; + OnOffAuto acpi; + VirtGICType gic_version; + VirtIOMMUType iommu; +@@ -298,7 +298,7 @@ static inline int virt_gicv3_redist_region_count(VirtMachineState *vms) + static inline bool virt_has_smmuv3(const VirtMachineState *vms) + { + return vms->iommu == VIRT_IOMMU_SMMUV3 || +- vms->iommu == VIRT_IOMMU_SMMUV3_NESTED; ++ vms->iommu == VIRT_IOMMU_SMMUV3_ACCEL; + } + + #endif /* QEMU_ARM_VIRT_H */ +-- +2.41.0.windows.1 + diff --git 
a/smmuv3-Use-default-bus-for-arm-smmuv3-accel.patch b/smmuv3-Use-default-bus-for-arm-smmuv3-accel.patch new file mode 100644 index 0000000000000000000000000000000000000000..46c7c77ee9e00a9c54fbcd840cfac5dac37162a7 --- /dev/null +++ b/smmuv3-Use-default-bus-for-arm-smmuv3-accel.patch @@ -0,0 +1,55 @@ +From 5e83bdd94533c91d69c7154d967f3bdd2fa86054 Mon Sep 17 00:00:00 2001 +From: Shameer Kolothum +Date: Thu, 16 Jan 2025 15:29:49 +0000 +Subject: [PATCH] smmuv3: Use default bus for arm-smmuv3-accel + +This is based on feedback on RFC v1. + +Signed-off-by: Shameer Kolothum +--- + hw/arm/smmuv3.c | 10 ++-------- + 1 file changed, 2 insertions(+), 8 deletions(-) + +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index ecdad6bda4..c0fcdd7574 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -2256,11 +2256,10 @@ static void smmu_realize(DeviceState *d, Error **errp) + static int smmuv3_accel_pci_host_bridge(Object *obj, void *opaque) + { + DeviceState *d = opaque; +- SMMUv3AccelState *s_accel = ARM_SMMUV3_ACCEL(d); + + if (object_dynamic_cast(obj, TYPE_PCI_HOST_BRIDGE)) { + PCIBus *bus = PCI_HOST_BRIDGE(obj)->bus; +- if (s_accel->pci_bus && !strcmp(bus->qbus.name, s_accel->pci_bus)) { ++ if (d->parent_bus && !strcmp(bus->qbus.name, d->parent_bus->name)) { + object_property_set_link(OBJECT(d), "primary-bus", OBJECT(bus), + &error_abort); + } +@@ -2365,11 +2364,6 @@ static Property smmuv3_properties[] = { + DEFINE_PROP_END_OF_LIST() + }; + +-static Property smmuv3_accel_properties[] = { +- DEFINE_PROP_STRING("pci-bus", SMMUv3AccelState, pci_bus), +- DEFINE_PROP_END_OF_LIST() +-}; +- + static void smmuv3_instance_init(Object *obj) + { + /* Nothing much to do here as of now */ +@@ -2397,9 +2391,9 @@ static void smmuv3_accel_class_init(ObjectClass *klass, void *data) + dc->vmsd = &vmstate_smmuv3; + device_class_set_parent_realize(dc, smmu_accel_realize, + &c->parent_realize); +- device_class_set_props(dc, smmuv3_accel_properties); + dc->user_creatable = true; + 
dc->hotpluggable = false; ++ dc->bus_type = TYPE_PCIE_BUS; + } + + static int smmuv3_notify_flag_changed(IOMMUMemoryRegion *iommu, +-- +2.41.0.windows.1 + diff --git a/smmuv3-realize-get_pasid_cap-and-set-ssidsize-with-p.patch b/smmuv3-realize-get_pasid_cap-and-set-ssidsize-with-p.patch new file mode 100644 index 0000000000000000000000000000000000000000..cbfa611012cfa74daaf8b9e584aeb62c9a97091a --- /dev/null +++ b/smmuv3-realize-get_pasid_cap-and-set-ssidsize-with-p.patch @@ -0,0 +1,52 @@ +From d4d0d15716a3f4c89ca9532e6b598b14db76ae0c Mon Sep 17 00:00:00 2001 +From: Zhangfei Gao +Date: Sat, 26 Oct 2024 08:40:11 +0000 +Subject: [PATCH] smmuv3: realize get_pasid_cap and set ssidsize with pasid + +Signed-off-by: Zhangfei Gao +--- + hw/arm/smmu-common.c | 9 +++++++++ + hw/arm/smmuv3.c | 3 +-- + 2 files changed, 10 insertions(+), 2 deletions(-) + +diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c +index e7028bd4ec..3a257a5b0e 100644 +--- a/hw/arm/smmu-common.c ++++ b/hw/arm/smmu-common.c +@@ -831,10 +831,19 @@ static void smmu_dev_unset_iommu_device(PCIBus *bus, void *opaque, int devfn) + } + } + ++static bool smmu_dev_get_pasid_cap(PCIBus *bus, ++ void *opaque, int devfn) ++{ ++ assert(0 <= devfn && devfn < PCI_DEVFN_MAX); ++ ++ return true; ++} ++ + static const PCIIOMMUOps smmu_ops = { + .get_address_space = smmu_find_add_as, + .set_iommu_device = smmu_dev_set_iommu_device, + .unset_iommu_device = smmu_dev_unset_iommu_device, ++ .get_pasid_cap = smmu_dev_get_pasid_cap, + }; + + SMMUDevice *smmu_find_sdev(SMMUState *s, uint32_t sid) +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index 0ca0e96fcc..6964ab000d 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -312,8 +312,7 @@ out: + + val = FIELD_EX32(sdev->info.idr[1], IDR1, SIDSIZE); + s->idr[1] = FIELD_DP32(s->idr[1], IDR1, SIDSIZE, val); +- val = FIELD_EX32(sdev->info.idr[1], IDR1, SSIDSIZE); +- s->idr[1] = FIELD_DP32(s->idr[1], IDR1, SSIDSIZE, val); ++ s->idr[1] = FIELD_DP32(s->idr[1], IDR1, 
SSIDSIZE, pasid); + + val = FIELD_EX32(sdev->info.idr[3], IDR3, HAD); + s->idr[3] = FIELD_DP32(s->idr[3], IDR3, HAD, val); +-- +2.41.0.windows.1 + diff --git a/target-i386-Add-VMX-control-bits-for-nested-FRED-sup.patch b/target-i386-Add-VMX-control-bits-for-nested-FRED-sup.patch new file mode 100644 index 0000000000000000000000000000000000000000..4459e52cbc2ebe742ae6e4a80ecb4e46ea0d0dcf --- /dev/null +++ b/target-i386-Add-VMX-control-bits-for-nested-FRED-sup.patch @@ -0,0 +1,48 @@ +From 4dea92e8570650776ed8caa0fedf0a90920f5e97 Mon Sep 17 00:00:00 2001 +From: "Xin Li (Intel)" +Date: Wed, 7 Aug 2024 01:18:11 -0700 +Subject: [PATCH] target/i386: Add VMX control bits for nested FRED support + +commit 7c6ec5bc5fea92a4ddea3f0189e3a7e7588e1d19 upstream. + +Add definitions of + 1) VM-exit activate secondary controls bit + 2) VM-entry load FRED bit +which are required to enable nested FRED. + +Intel-SIG: commit 7c6ec5bc5fea target/i386: Add VMX control bits for nested FRED support + +Reviewed-by: Zhao Liu +Signed-off-by: Xin Li (Intel) +Link: https://lore.kernel.org/r/20240807081813.735158-3-xin@zytor.com +Signed-off-by: Paolo Bonzini +Signed-off-by: Jason Zeng +--- + target/i386/cpu.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index 00e636e61c..f80570f4da 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -1271,7 +1271,7 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { + "vmx-exit-save-efer", "vmx-exit-load-efer", + "vmx-exit-save-preemption-timer", "vmx-exit-clear-bndcfgs", + NULL, "vmx-exit-clear-rtit-ctl", NULL, NULL, +- NULL, "vmx-exit-load-pkrs", NULL, NULL, ++ NULL, "vmx-exit-load-pkrs", NULL, "vmx-exit-secondary-ctls", + }, + .msr = { + .index = MSR_IA32_VMX_TRUE_EXIT_CTLS, +@@ -1286,7 +1286,7 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { + NULL, "vmx-entry-ia32e-mode", NULL, NULL, + NULL, "vmx-entry-load-perf-global-ctrl", "vmx-entry-load-pat", "vmx-entry-load-efer", + 
"vmx-entry-load-bndcfgs", NULL, "vmx-entry-load-rtit-ctl", NULL, +- NULL, NULL, "vmx-entry-load-pkrs", NULL, ++ NULL, NULL, "vmx-entry-load-pkrs", "vmx-entry-load-fred", + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + }, +-- +2.41.0.windows.1 + diff --git a/target-i386-Add-get-set-migrate-support-for-FRED-MSR.patch b/target-i386-Add-get-set-migrate-support-for-FRED-MSR.patch new file mode 100644 index 0000000000000000000000000000000000000000..0fec3249ff729d81259ddf33e01e1d338feb5ac9 --- /dev/null +++ b/target-i386-Add-get-set-migrate-support-for-FRED-MSR.patch @@ -0,0 +1,188 @@ +From c3e47749fba4418d80bf4314335118452912b29c Mon Sep 17 00:00:00 2001 +From: Xin Li +Date: Wed, 8 Nov 2023 23:20:12 -0800 +Subject: [PATCH] target/i386: Add get/set/migrate support for FRED MSRs + +commit 4ebd98eb3ade5957a842da1420bda012eeeaab9c upstream. + +FRED CPU states are managed in 9 new FRED MSRs, in addtion to a few +existing CPU registers and MSRs, e.g., CR4.FRED and MSR_IA32_PL0_SSP. + +Save/restore/migrate FRED MSRs if FRED is exposed to the guest. 
+ +Intel-SIG: commit 4ebd98eb3ade target/i386: Add get/set/migrate support for FRED MSRs + +Tested-by: Shan Kang +Signed-off-by: Xin Li +Message-ID: <20231109072012.8078-7-xin3.li@intel.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Jason Zeng +--- + target/i386/cpu.h | 22 +++++++++++++++++++ + target/i386/kvm/kvm.c | 49 +++++++++++++++++++++++++++++++++++++++++++ + target/i386/machine.c | 28 +++++++++++++++++++++++++ + 3 files changed, 99 insertions(+) + +diff --git a/target/i386/cpu.h b/target/i386/cpu.h +index b03237c305..1b9d922651 100644 +--- a/target/i386/cpu.h ++++ b/target/i386/cpu.h +@@ -539,6 +539,17 @@ typedef enum X86Seg { + #define MSR_IA32_XFD 0x000001c4 + #define MSR_IA32_XFD_ERR 0x000001c5 + ++/* FRED MSRs */ ++#define MSR_IA32_FRED_RSP0 0x000001cc /* Stack level 0 regular stack pointer */ ++#define MSR_IA32_FRED_RSP1 0x000001cd /* Stack level 1 regular stack pointer */ ++#define MSR_IA32_FRED_RSP2 0x000001ce /* Stack level 2 regular stack pointer */ ++#define MSR_IA32_FRED_RSP3 0x000001cf /* Stack level 3 regular stack pointer */ ++#define MSR_IA32_FRED_STKLVLS 0x000001d0 /* FRED exception stack levels */ ++#define MSR_IA32_FRED_SSP1 0x000001d1 /* Stack level 1 shadow stack pointer in ring 0 */ ++#define MSR_IA32_FRED_SSP2 0x000001d2 /* Stack level 2 shadow stack pointer in ring 0 */ ++#define MSR_IA32_FRED_SSP3 0x000001d3 /* Stack level 3 shadow stack pointer in ring 0 */ ++#define MSR_IA32_FRED_CONFIG 0x000001d4 /* FRED Entrypoint and interrupt stack level */ ++ + #define MSR_IA32_BNDCFGS 0x00000d90 + #define MSR_IA32_XSS 0x00000da0 + #define MSR_IA32_UMWAIT_CONTROL 0xe1 +@@ -1698,6 +1709,17 @@ typedef struct CPUArchState { + target_ulong cstar; + target_ulong fmask; + target_ulong kernelgsbase; ++ ++ /* FRED MSRs */ ++ uint64_t fred_rsp0; ++ uint64_t fred_rsp1; ++ uint64_t fred_rsp2; ++ uint64_t fred_rsp3; ++ uint64_t fred_stklvls; ++ uint64_t fred_ssp1; ++ uint64_t fred_ssp2; ++ uint64_t fred_ssp3; ++ uint64_t fred_config; + #endif + + 
uint64_t tsc_adjust; +diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c +index 12e920bbb4..5f3497e122 100644 +--- a/target/i386/kvm/kvm.c ++++ b/target/i386/kvm/kvm.c +@@ -3391,6 +3391,17 @@ static int kvm_put_msrs(X86CPU *cpu, int level) + kvm_msr_entry_add(cpu, MSR_KERNELGSBASE, env->kernelgsbase); + kvm_msr_entry_add(cpu, MSR_FMASK, env->fmask); + kvm_msr_entry_add(cpu, MSR_LSTAR, env->lstar); ++ if (env->features[FEAT_7_1_EAX] & CPUID_7_1_EAX_FRED) { ++ kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP0, env->fred_rsp0); ++ kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP1, env->fred_rsp1); ++ kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP2, env->fred_rsp2); ++ kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP3, env->fred_rsp3); ++ kvm_msr_entry_add(cpu, MSR_IA32_FRED_STKLVLS, env->fred_stklvls); ++ kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP1, env->fred_ssp1); ++ kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP2, env->fred_ssp2); ++ kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP3, env->fred_ssp3); ++ kvm_msr_entry_add(cpu, MSR_IA32_FRED_CONFIG, env->fred_config); ++ } + } + #endif + +@@ -3867,6 +3878,17 @@ static int kvm_get_msrs(X86CPU *cpu) + kvm_msr_entry_add(cpu, MSR_KERNELGSBASE, 0); + kvm_msr_entry_add(cpu, MSR_FMASK, 0); + kvm_msr_entry_add(cpu, MSR_LSTAR, 0); ++ if (env->features[FEAT_7_1_EAX] & CPUID_7_1_EAX_FRED) { ++ kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP0, 0); ++ kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP1, 0); ++ kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP2, 0); ++ kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP3, 0); ++ kvm_msr_entry_add(cpu, MSR_IA32_FRED_STKLVLS, 0); ++ kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP1, 0); ++ kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP2, 0); ++ kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP3, 0); ++ kvm_msr_entry_add(cpu, MSR_IA32_FRED_CONFIG, 0); ++ } + } + #endif + kvm_msr_entry_add(cpu, MSR_KVM_SYSTEM_TIME, 0); +@@ -4092,6 +4114,33 @@ static int kvm_get_msrs(X86CPU *cpu) + case MSR_LSTAR: + env->lstar = msrs[i].data; + break; ++ case MSR_IA32_FRED_RSP0: ++ env->fred_rsp0 = 
msrs[i].data; ++ break; ++ case MSR_IA32_FRED_RSP1: ++ env->fred_rsp1 = msrs[i].data; ++ break; ++ case MSR_IA32_FRED_RSP2: ++ env->fred_rsp2 = msrs[i].data; ++ break; ++ case MSR_IA32_FRED_RSP3: ++ env->fred_rsp3 = msrs[i].data; ++ break; ++ case MSR_IA32_FRED_STKLVLS: ++ env->fred_stklvls = msrs[i].data; ++ break; ++ case MSR_IA32_FRED_SSP1: ++ env->fred_ssp1 = msrs[i].data; ++ break; ++ case MSR_IA32_FRED_SSP2: ++ env->fred_ssp2 = msrs[i].data; ++ break; ++ case MSR_IA32_FRED_SSP3: ++ env->fred_ssp3 = msrs[i].data; ++ break; ++ case MSR_IA32_FRED_CONFIG: ++ env->fred_config = msrs[i].data; ++ break; + #endif + case MSR_IA32_TSC: + env->tsc = msrs[i].data; +diff --git a/target/i386/machine.c b/target/i386/machine.c +index 9a1cb8f3b8..7cbfbc0efb 100644 +--- a/target/i386/machine.c ++++ b/target/i386/machine.c +@@ -1544,6 +1544,33 @@ static const VMStateDescription vmstate_msr_xfd = { + }; + + #ifdef TARGET_X86_64 ++static bool intel_fred_msrs_needed(void *opaque) ++{ ++ X86CPU *cpu = opaque; ++ CPUX86State *env = &cpu->env; ++ ++ return !!(env->features[FEAT_7_1_EAX] & CPUID_7_1_EAX_FRED); ++} ++ ++static const VMStateDescription vmstate_msr_fred = { ++ .name = "cpu/fred", ++ .version_id = 1, ++ .minimum_version_id = 1, ++ .needed = intel_fred_msrs_needed, ++ .fields = (VMStateField[]) { ++ VMSTATE_UINT64(env.fred_rsp0, X86CPU), ++ VMSTATE_UINT64(env.fred_rsp1, X86CPU), ++ VMSTATE_UINT64(env.fred_rsp2, X86CPU), ++ VMSTATE_UINT64(env.fred_rsp3, X86CPU), ++ VMSTATE_UINT64(env.fred_stklvls, X86CPU), ++ VMSTATE_UINT64(env.fred_ssp1, X86CPU), ++ VMSTATE_UINT64(env.fred_ssp2, X86CPU), ++ VMSTATE_UINT64(env.fred_ssp3, X86CPU), ++ VMSTATE_UINT64(env.fred_config, X86CPU), ++ VMSTATE_END_OF_LIST() ++ } ++ }; ++ + static bool amx_xtile_needed(void *opaque) + { + X86CPU *cpu = opaque; +@@ -1768,6 +1795,7 @@ const VMStateDescription vmstate_x86_cpu = { + &vmstate_pdptrs, + &vmstate_msr_xfd, + #ifdef TARGET_X86_64 ++ &vmstate_msr_fred, + &vmstate_amx_xtile, + #endif + 
&vmstate_arch_lbr, +-- +2.41.0.windows.1 + diff --git a/target-i386-Add-more-features-enumerated-by-CPUID.7..patch b/target-i386-Add-more-features-enumerated-by-CPUID.7..patch new file mode 100644 index 0000000000000000000000000000000000000000..7713fa5b2bb02fcd1cc88cf4fbb2350fd453ef7f --- /dev/null +++ b/target-i386-Add-more-features-enumerated-by-CPUID.7..patch @@ -0,0 +1,63 @@ +From cfb01b2fe4a99ed030dacdc49064a152a472dc2d Mon Sep 17 00:00:00 2001 +From: Chao Gao +Date: Thu, 19 Sep 2024 13:10:11 +0800 +Subject: [PATCH] target/i386: Add more features enumerated by CPUID.7.2.EDX + +commit 10eaf9c0fb7060f45807becbb2742a9de9bc3632 upstream + +Following 5 bits in CPUID.7.2.EDX are supported by KVM. Add their +supports in QEMU. Each of them indicates certain bits of IA32_SPEC_CTRL +are supported. Those bits can control CPU speculation behavior which can +be used to defend against side-channel attacks. + +bit0: intel-psfd + if 1, indicates bit 7 of the IA32_SPEC_CTRL MSR is supported. Bit 7 of + this MSR disables Fast Store Forwarding Predictor without disabling + Speculative Store Bypass + +bit1: ipred-ctrl + If 1, indicates bits 3 and 4 of the IA32_SPEC_CTRL MSR are supported. + Bit 3 of this MSR enables IPRED_DIS control for CPL3. Bit 4 of this + MSR enables IPRED_DIS control for CPL0/1/2 + +bit2: rrsba-ctrl + If 1, indicates bits 5 and 6 of the IA32_SPEC_CTRL MSR are supported. + Bit 5 of this MSR disables RRSBA behavior for CPL3. Bit 6 of this MSR + disables RRSBA behavior for CPL0/1/2 + +bit3: ddpd-u + If 1, indicates bit 8 of the IA32_SPEC_CTRL MSR is supported. Bit 8 of + this MSR disables Data Dependent Prefetcher. + +bit4: bhi-ctrl + if 1, indicates bit 10 of the IA32_SPEC_CTRL MSR is supported. Bit 10 + of this MSR enables BHI_DIS_S behavior. 
+ +Intel-SIG: 10eaf9c0fb70 target/i386: Add more features enumerated by CPUID.7.2.EDX + +Signed-off-by: Chao Gao +Link: https://lore.kernel.org/r/20240919051011.118309-1-chao.gao@intel.com +Signed-off-by: Paolo Bonzini +Signed-off-by: Jason Zeng +--- + target/i386/cpu.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index 1fa08265bc..f3df62127c 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -1000,8 +1000,8 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { + [FEAT_7_2_EDX] = { + .type = CPUID_FEATURE_WORD, + .feat_names = { +- NULL, NULL, NULL, NULL, +- NULL, "mcdt-no", NULL, NULL, ++ "intel-psfd", "ipred-ctrl", "rrsba-ctrl", "ddpd-u", ++ "bhi-ctrl", "mcdt-no", NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, +-- +2.41.0.windows.1 + diff --git a/target-i386-Construct-CPUID-2-as-stateful-iff-times-.patch b/target-i386-Construct-CPUID-2-as-stateful-iff-times-.patch new file mode 100644 index 0000000000000000000000000000000000000000..38a2ac997e1c7e690f3b3eaa7411f24e4337f36f --- /dev/null +++ b/target-i386-Construct-CPUID-2-as-stateful-iff-times-.patch @@ -0,0 +1,41 @@ +From afcdb893e4c702f4e009a98da71408cf54a53cc4 Mon Sep 17 00:00:00 2001 +From: Xiaoyao Li +Date: Wed, 14 Aug 2024 03:54:27 -0400 +Subject: [PATCH] target/i386: Construct CPUID 2 as stateful iff times > 1 + +commit 5ab639141b6d916a6f4041d4ec46f2f1a1e4a365 upstream. + +When times == 1, the CPUID leaf 2 is not stateful. 
+ +Intel-SIG: commit 5ab639141b6d target/i386: Construct CPUID 2 as stateful iff times > 1 + +Signed-off-by: Xiaoyao Li +Link: https://lore.kernel.org/r/20240814075431.339209-6-xiaoyao.li@intel.com +Signed-off-by: Paolo Bonzini +Signed-off-by: Jason Zeng +--- + target/i386/kvm/kvm.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c +index 5057dfbd75..a867512822 100644 +--- a/target/i386/kvm/kvm.c ++++ b/target/i386/kvm/kvm.c +@@ -1896,10 +1896,12 @@ int kvm_arch_init_vcpu(CPUState *cs) + int times; + + c->function = i; +- c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC | +- KVM_CPUID_FLAG_STATE_READ_NEXT; + cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx); + times = c->eax & 0xff; ++ if (times > 1) { ++ c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC | ++ KVM_CPUID_FLAG_STATE_READ_NEXT; ++ } + + for (j = 1; j < times; ++j) { + if (cpuid_i == KVM_MAX_CPUID_ENTRIES) { +-- +2.41.0.windows.1 + diff --git a/target-i386-Delete-duplicated-macro-definition-CR4_F.patch b/target-i386-Delete-duplicated-macro-definition-CR4_F.patch new file mode 100644 index 0000000000000000000000000000000000000000..ca33d59036b68a92006ef88ae140918d551e6243 --- /dev/null +++ b/target-i386-Delete-duplicated-macro-definition-CR4_F.patch @@ -0,0 +1,39 @@ +From 1eacc509e9158b9e87f05fc9844142c0022b2d64 Mon Sep 17 00:00:00 2001 +From: "Xin Li (Intel)" +Date: Wed, 7 Aug 2024 01:18:10 -0700 +Subject: [PATCH] target/i386: Delete duplicated macro definition CR4_FRED_MASK + +commit a23bc6539890d8b27458cf56bc4ed0e0d3c2de3e upstream. + +Macro CR4_FRED_MASK is defined twice, delete one. 
+ +Intel-SIG: commit a23bc6539890 target/i386: Delete duplicated macro definition CR4_FRED_MASK + +Signed-off-by: Xin Li (Intel) +Link: https://lore.kernel.org/r/20240807081813.735158-2-xin@zytor.com +Signed-off-by: Paolo Bonzini +Signed-off-by: Jason Zeng +--- + target/i386/cpu.h | 6 ------ + 1 file changed, 6 deletions(-) + +diff --git a/target/i386/cpu.h b/target/i386/cpu.h +index 1b9d922651..f022749c86 100644 +--- a/target/i386/cpu.h ++++ b/target/i386/cpu.h +@@ -270,12 +270,6 @@ typedef enum X86Seg { + #define CR4_FRED_MASK 0 + #endif + +-#ifdef TARGET_X86_64 +-#define CR4_FRED_MASK (1ULL << 32) +-#else +-#define CR4_FRED_MASK 0 +-#endif +- + #define CR4_RESERVED_MASK \ + (~(target_ulong)(CR4_VME_MASK | CR4_PVI_MASK | CR4_TSD_MASK \ + | CR4_DE_MASK | CR4_PSE_MASK | CR4_PAE_MASK \ +-- +2.41.0.windows.1 + diff --git a/target-i386-Don-t-construct-a-all-zero-entry-for-CPU.patch b/target-i386-Don-t-construct-a-all-zero-entry-for-CPU.patch new file mode 100644 index 0000000000000000000000000000000000000000..4f00f7337ff1906ff87355e8a69d1d1f2c099231 --- /dev/null +++ b/target-i386-Don-t-construct-a-all-zero-entry-for-CPU.patch @@ -0,0 +1,57 @@ +From e0b51ea0f229ea9c6788fa0da252e8100e30241e Mon Sep 17 00:00:00 2001 +From: Xiaoyao Li +Date: Wed, 14 Aug 2024 03:54:23 -0400 +Subject: [PATCH] target/i386: Don't construct a all-zero entry for CPUID[0xD + 0x3f] + +commit 00c8a933d95add3ce4afebbe491ca0fa398a9007 upstream. + +Currently, QEMU always constructs a all-zero CPUID entry for +CPUID[0xD 0x3f]. + +It's meaningless to construct such a leaf as the end of leaf 0xD. Rework +the logic of how subleaves of 0xD are constructed to get rid of such +all-zero value of subleaf 0x3f. 
+ +Intel-SIG: commit 00c8a933d95a target/i386: Don't construct a all-zero entry for CPUID[0xD 0x3f] + +Signed-off-by: Xiaoyao Li +Link: https://lore.kernel.org/r/20240814075431.339209-2-xiaoyao.li@intel.com +Signed-off-by: Paolo Bonzini +Signed-off-by: Jason Zeng +--- + target/i386/kvm/kvm.c | 11 ++++++----- + 1 file changed, 6 insertions(+), 5 deletions(-) + +diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c +index 850104f6b5..5057dfbd75 100644 +--- a/target/i386/kvm/kvm.c ++++ b/target/i386/kvm/kvm.c +@@ -1924,10 +1924,6 @@ int kvm_arch_init_vcpu(CPUState *cs) + case 0xb: + case 0xd: + for (j = 0; ; j++) { +- if (i == 0xd && j == 64) { +- break; +- } +- + c->function = i; + c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + c->index = j; +@@ -1943,7 +1939,12 @@ int kvm_arch_init_vcpu(CPUState *cs) + break; + } + if (i == 0xd && c->eax == 0) { +- continue; ++ if (j < 63) { ++ continue; ++ } else { ++ cpuid_i--; ++ break; ++ } + } + if (cpuid_i == KVM_MAX_CPUID_ENTRIES) { + fprintf(stderr, "cpuid_data is full, no space for " +-- +2.41.0.windows.1 + diff --git a/target-i386-Enable-fdp-excptn-only-and-zero-fcs-fds.patch b/target-i386-Enable-fdp-excptn-only-and-zero-fcs-fds.patch new file mode 100644 index 0000000000000000000000000000000000000000..b293a5563c7b2291ea52a215989ce7779df83aeb --- /dev/null +++ b/target-i386-Enable-fdp-excptn-only-and-zero-fcs-fds.patch @@ -0,0 +1,70 @@ +From 8c61e09f435ff3a965867b0496f01682d679182f Mon Sep 17 00:00:00 2001 +From: Xiaoyao Li +Date: Wed, 14 Aug 2024 03:54:24 -0400 +Subject: [PATCH] target/i386: Enable fdp-excptn-only and zero-fcs-fds + +commit 7dddc3bb875e7141ab25931d0f30a1c319bc8457 upstream. + +- CPUID.(EAX=07H,ECX=0H):EBX[bit 6]: x87 FPU Data Pointer updated only + on x87 exceptions if 1. + +- CPUID.(EAX=07H,ECX=0H):EBX[bit 13]: Deprecates FPU CS and FPU DS + values if 1. i.e., X87 FCS and FDS are always zero. + +Define names for them so that they can be exposed to guest with -cpu host. 
+ +Also define the bit field MACROs so that named cpu models can add it as +well in the future. + +Intel-SIG: commit 7dddc3bb875e target/i386: Enable fdp-excptn-only and zero-fcs-fds + +Signed-off-by: Xiaoyao Li +Link: https://lore.kernel.org/r/20240814075431.339209-3-xiaoyao.li@intel.com +Signed-off-by: Paolo Bonzini +Signed-off-by: Jason Zeng +--- + target/i386/cpu.c | 4 ++-- + target/i386/cpu.h | 4 ++++ + 2 files changed, 6 insertions(+), 2 deletions(-) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index dfc0f7fd2d..d0aa2fb5ff 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -906,9 +906,9 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { + .type = CPUID_FEATURE_WORD, + .feat_names = { + "fsgsbase", "tsc-adjust", "sgx", "bmi1", +- "hle", "avx2", NULL, "smep", ++ "hle", "avx2", "fdp-excptn-only", "smep", + "bmi2", "erms", "invpcid", "rtm", +- NULL, NULL, "mpx", NULL, ++ NULL, "zero-fcs-fds", "mpx", NULL, + "avx512f", "avx512dq", "rdseed", "adx", + "smap", "avx512ifma", "pcommit", "clflushopt", + "clwb", "intel-pt", "avx512pf", "avx512er", +diff --git a/target/i386/cpu.h b/target/i386/cpu.h +index b90182582f..b883e5e1d6 100644 +--- a/target/i386/cpu.h ++++ b/target/i386/cpu.h +@@ -809,6 +809,8 @@ uint64_t x86_cpu_get_supported_feature_word(X86CPU *cpu, FeatureWord w); + #define CPUID_7_0_EBX_HLE (1U << 4) + /* Intel Advanced Vector Extensions 2 */ + #define CPUID_7_0_EBX_AVX2 (1U << 5) ++/* FPU data pointer updated only on x87 exceptions */ ++#define CPUID_7_0_EBX_FDP_EXCPTN_ONLY (1u << 6) + /* Supervisor-mode Execution Prevention */ + #define CPUID_7_0_EBX_SMEP (1U << 7) + /* 2nd Group of Advanced Bit Manipulation Extensions */ +@@ -819,6 +821,8 @@ uint64_t x86_cpu_get_supported_feature_word(X86CPU *cpu, FeatureWord w); + #define CPUID_7_0_EBX_INVPCID (1U << 10) + /* Restricted Transactional Memory */ + #define CPUID_7_0_EBX_RTM (1U << 11) ++/* Zero out FPU CS and FPU DS */ ++#define CPUID_7_0_EBX_ZERO_FCS_FDS (1U << 13) + /* Memory 
Protection Extension */ + #define CPUID_7_0_EBX_MPX (1U << 14) + /* AVX-512 Foundation */ +-- +2.41.0.windows.1 + diff --git a/target-i386-Make-invtsc-migratable-when-user-sets-ts.patch b/target-i386-Make-invtsc-migratable-when-user-sets-ts.patch new file mode 100644 index 0000000000000000000000000000000000000000..df9b7441716a6a9e0ddf2a96a902d593cc84055b --- /dev/null +++ b/target-i386-Make-invtsc-migratable-when-user-sets-ts.patch @@ -0,0 +1,66 @@ +From 07a671dc3e3baedb650b307c36d69bef869c2480 Mon Sep 17 00:00:00 2001 +From: Xiaoyao Li +Date: Wed, 14 Aug 2024 03:54:31 -0400 +Subject: [PATCH] target/i386: Make invtsc migratable when user sets tsc-khz + explicitly + +commit 87c88db3143e91076d167a62dd7febf49afca8a2 upstream. + +When user sets tsc-frequency explicitly, the invtsc feature is actually +migratable because the tsc-frequency is supposed to be fixed during the +migration. + +See commit d99569d9d856 ("kvm: Allow invtsc migration if tsc-khz +is set explicitly") for reference. + +Intel-SIG: commit 87c88db3143e target/i386: Make invtsc migratable when user sets tsc-khz explicitly + +Signed-off-by: Xiaoyao Li +Link: https://lore.kernel.org/r/20240814075431.339209-10-xiaoyao.li@intel.com +Signed-off-by: Paolo Bonzini +Signed-off-by: Jason Zeng +--- + target/i386/cpu.c | 11 +++++++++-- + 1 file changed, 9 insertions(+), 2 deletions(-) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index d0aa2fb5ff..20358ffa91 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -1685,9 +1685,10 @@ static inline uint64_t x86_cpu_xsave_xss_components(X86CPU *cpu) + * Returns the set of feature flags that are supported and migratable by + * QEMU, for a given FeatureWord. 
+ */ +-static uint64_t x86_cpu_get_migratable_flags(FeatureWord w) ++static uint64_t x86_cpu_get_migratable_flags(X86CPU *cpu, FeatureWord w) + { + FeatureWordInfo *wi = &feature_word_info[w]; ++ CPUX86State *env = &cpu->env; + uint64_t r = 0; + int i; + +@@ -1701,6 +1702,12 @@ static uint64_t x86_cpu_get_migratable_flags(FeatureWord w) + r |= f; + } + } ++ ++ /* when tsc-khz is set explicitly, invtsc is migratable */ ++ if ((w == FEAT_8000_0007_EDX) && env->user_tsc_khz) { ++ r |= CPUID_APM_INVTSC; ++ } ++ + return r; + } + +@@ -6002,7 +6009,7 @@ uint64_t x86_cpu_get_supported_feature_word(X86CPU *cpu, FeatureWord w) + } + #endif + if (cpu && cpu->migratable) { +- r &= x86_cpu_get_migratable_flags(w); ++ r &= x86_cpu_get_migratable_flags(cpu, w); + } + return r; + } +-- +2.41.0.windows.1 + diff --git a/target-i386-Raise-the-highest-index-value-used-for-a.patch b/target-i386-Raise-the-highest-index-value-used-for-a.patch new file mode 100644 index 0000000000000000000000000000000000000000..57db409451982642754fc92a09b7231e5d41d57f --- /dev/null +++ b/target-i386-Raise-the-highest-index-value-used-for-a.patch @@ -0,0 +1,66 @@ +From 513d33050869a337262fdba0a2d064e7ce9fdb22 Mon Sep 17 00:00:00 2001 +From: Lei Wang +Date: Wed, 7 Aug 2024 01:18:12 -0700 +Subject: [PATCH] target/i386: Raise the highest index value used for any VMCS + encoding + +commit ab891454ebe82f7e359be721007652556f9f8356 upstream. + +Because the index value of the VMCS field encoding of FRED injected-event +data (one of the newly added VMCS fields for FRED transitions), 0x52, is +larger than any existing index value, raise the highest index value used +for any VMCS encoding to 0x52. + +Because the index value of the VMCS field encoding of Secondary VM-exit +controls, 0x44, is larger than any existing index value, raise the highest +index value used for any VMCS encoding to 0x44. 
+ +Intel-SIG: commit ab891454ebe8 target/i386: Raise the highest index value used for any VMCS encoding + +Co-developed-by: Xin Li +Signed-off-by: Xin Li +Signed-off-by: Lei Wang +Signed-off-by: Xin Li (Intel) +Link: https://lore.kernel.org/r/20240807081813.735158-4-xin@zytor.com +Signed-off-by: Paolo Bonzini +Signed-off-by: Jason Zeng +--- + target/i386/cpu.h | 1 + + target/i386/kvm/kvm.c | 9 ++++++++- + 2 files changed, 9 insertions(+), 1 deletion(-) + +diff --git a/target/i386/cpu.h b/target/i386/cpu.h +index f022749c86..fb6721f182 100644 +--- a/target/i386/cpu.h ++++ b/target/i386/cpu.h +@@ -1166,6 +1166,7 @@ uint64_t x86_cpu_get_supported_feature_word(FeatureWord w, + #define VMX_VM_EXIT_PT_CONCEAL_PIP 0x01000000 + #define VMX_VM_EXIT_CLEAR_IA32_RTIT_CTL 0x02000000 + #define VMX_VM_EXIT_LOAD_IA32_PKRS 0x20000000 ++#define VMX_VM_EXIT_ACTIVATE_SECONDARY_CONTROLS 0x80000000 + + #define VMX_VM_ENTRY_LOAD_DEBUG_CONTROLS 0x00000004 + #define VMX_VM_ENTRY_IA32E_MODE 0x00000200 +diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c +index 5f3497e122..ce96ed9158 100644 +--- a/target/i386/kvm/kvm.c ++++ b/target/i386/kvm/kvm.c +@@ -3254,7 +3254,14 @@ static void kvm_msr_entry_add_vmx(X86CPU *cpu, FeatureWordArray f) + kvm_msr_entry_add(cpu, MSR_IA32_VMX_CR4_FIXED0, + CR4_VMXE_MASK); + +- if (f[FEAT_VMX_SECONDARY_CTLS] & VMX_SECONDARY_EXEC_TSC_SCALING) { ++ if (f[FEAT_7_1_EAX] & CPUID_7_1_EAX_FRED) { ++ /* FRED injected-event data (0x2052). */ ++ kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMCS_ENUM, 0x52); ++ } else if (f[FEAT_VMX_EXIT_CTLS] & ++ VMX_VM_EXIT_ACTIVATE_SECONDARY_CONTROLS) { ++ /* Secondary VM-exit controls (0x2044). */ ++ kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMCS_ENUM, 0x44); ++ } else if (f[FEAT_VMX_SECONDARY_CTLS] & VMX_SECONDARY_EXEC_TSC_SCALING) { + /* TSC multiplier (0x2032). 
*/ + kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMCS_ENUM, 0x32); + } else { +-- +2.41.0.windows.1 + diff --git a/target-i386-add-support-for-FRED-in-CPUID-enumeratio.patch b/target-i386-add-support-for-FRED-in-CPUID-enumeratio.patch new file mode 100644 index 0000000000000000000000000000000000000000..8afdb72a0104bd4e0408956ca036af9b28e3af5a --- /dev/null +++ b/target-i386-add-support-for-FRED-in-CPUID-enumeratio.patch @@ -0,0 +1,108 @@ +From 110184b14d17c13e046e9c4ebed6c3cec29b31d0 Mon Sep 17 00:00:00 2001 +From: Xin Li +Date: Wed, 8 Nov 2023 23:20:07 -0800 +Subject: [PATCH] target/i386: add support for FRED in CPUID enumeration + +commit c1acad9f72d14daf918563eb77d2b31c39fbd06a upstream. + +FRED, i.e., the Intel flexible return and event delivery architecture, +defines simple new transitions that change privilege level (ring +transitions). + +The new transitions defined by the FRED architecture are FRED event +delivery and, for returning from events, two FRED return instructions. +FRED event delivery can effect a transition from ring 3 to ring 0, but +it is used also to deliver events incident to ring 0. One FRED +instruction (ERETU) effects a return from ring 0 to ring 3, while the +other (ERETS) returns while remaining in ring 0. Collectively, FRED +event delivery and the FRED return instructions are FRED transitions. + +In addition to these transitions, the FRED architecture defines a new +instruction (LKGS) for managing the state of the GS segment register. +The LKGS instruction can be used by 64-bit operating systems that do +not use the new FRED transitions. + +WRMSRNS is an instruction that behaves exactly like WRMSR, with the +only difference being that it is not a serializing instruction by +default. Under certain conditions, WRMSRNS may replace WRMSR to improve +performance. FRED uses it to switch RSP0 in a faster manner. 
+ +Search for the latest FRED spec in most search engines with this search +pattern: + + site:intel.com FRED (flexible return and event delivery) specification + +The CPUID feature flag CPUID.(EAX=7,ECX=1):EAX[17] enumerates FRED, and +the CPUID feature flag CPUID.(EAX=7,ECX=1):EAX[18] enumerates LKGS, and +the CPUID feature flag CPUID.(EAX=7,ECX=1):EAX[19] enumerates WRMSRNS. + +Add CPUID definitions for FRED/LKGS/WRMSRNS, and expose them to KVM guests. + +Because FRED relies on LKGS and WRMSRNS, add that to feature dependency +map. + +Intel-SIG: commit c1acad9f72d1 target/i386: add support for FRED in CPUID enumeration + +Tested-by: Shan Kang +Signed-off-by: Xin Li +Message-ID: <20231109072012.8078-2-xin3.li@intel.com> +[Fix order of dependencies, add dependencies from LM to FRED. - Paolo] +Signed-off-by: Paolo Bonzini +Signed-off-by: Jason Zeng +--- + target/i386/cpu.c | 14 +++++++++++++- + target/i386/cpu.h | 6 ++++++ + 2 files changed, 19 insertions(+), 1 deletion(-) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index 860934b39f..47f00392be 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -966,7 +966,7 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { + "avx-vnni", "avx512-bf16", NULL, "cmpccxadd", + NULL, NULL, "fzrm", "fsrs", + "fsrc", NULL, NULL, NULL, +- NULL, NULL, NULL, NULL, ++ NULL, "fred", "lkgs", "wrmsrns", + NULL, "amx-fp16", NULL, "avx-ifma", + NULL, NULL, "lam", NULL, + NULL, NULL, NULL, NULL, +@@ -1553,6 +1553,18 @@ static FeatureDep feature_dependencies[] = { + .from = { FEAT_7_0_ECX, CPUID_7_0_ECX_WAITPKG }, + .to = { FEAT_VMX_SECONDARY_CTLS, VMX_SECONDARY_EXEC_ENABLE_USER_WAIT_PAUSE }, + }, ++ { ++ .from = { FEAT_8000_0001_EDX, CPUID_EXT2_LM }, ++ .to = { FEAT_7_1_EAX, CPUID_7_1_EAX_FRED }, ++ }, ++ { ++ .from = { FEAT_7_1_EAX, CPUID_7_1_EAX_LKGS }, ++ .to = { FEAT_7_1_EAX, CPUID_7_1_EAX_FRED }, ++ }, ++ { ++ .from = { FEAT_7_1_EAX, CPUID_7_1_EAX_WRMSRNS }, ++ .to = { FEAT_7_1_EAX, CPUID_7_1_EAX_FRED }, ++ }, + }; + 
+ typedef struct X86RegisterInfo32 { +diff --git a/target/i386/cpu.h b/target/i386/cpu.h +index 21fb769cce..f392626f98 100644 +--- a/target/i386/cpu.h ++++ b/target/i386/cpu.h +@@ -941,6 +941,12 @@ uint64_t x86_cpu_get_supported_feature_word(FeatureWord w, + #define CPUID_7_1_EDX_AMX_COMPLEX (1U << 8) + /* PREFETCHIT0/1 Instructions */ + #define CPUID_7_1_EDX_PREFETCHITI (1U << 14) ++/* Flexible return and event delivery (FRED) */ ++#define CPUID_7_1_EAX_FRED (1U << 17) ++/* Load into IA32_KERNEL_GS_BASE (LKGS) */ ++#define CPUID_7_1_EAX_LKGS (1U << 18) ++/* Non-Serializing Write to Model Specific Register (WRMSRNS) */ ++#define CPUID_7_1_EAX_WRMSRNS (1U << 19) + + /* Do not exhibit MXCSR Configuration Dependent Timing (MCDT) behavior */ + #define CPUID_7_2_EDX_MCDT_NO (1U << 5) +-- +2.41.0.windows.1 + diff --git a/target-i386-enumerate-VMX-nested-exception-support.patch b/target-i386-enumerate-VMX-nested-exception-support.patch new file mode 100644 index 0000000000000000000000000000000000000000..92cb7464ebe90b4f683202a2045094237b1c7c0a --- /dev/null +++ b/target-i386-enumerate-VMX-nested-exception-support.patch @@ -0,0 +1,62 @@ +From 5f828613ba69ce640512a900f630515d980208dd Mon Sep 17 00:00:00 2001 +From: Xin Li +Date: Wed, 8 Nov 2023 23:20:11 -0800 +Subject: [PATCH] target/i386: enumerate VMX nested-exception support + +commit ef202d64c3020f3df03c39d3ad688732d81aaae8 upstream. + +Allow VMX nested-exception support to be exposed in KVM guests, thus +nested KVM guests can enumerate it. 
+ +Intel-SIG: commit ef202d64c302 target/i386: enumerate VMX nested-exception support + +Tested-by: Shan Kang +Signed-off-by: Xin Li +Message-ID: <20231109072012.8078-6-xin3.li@intel.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Jason Zeng +--- + scripts/kvm/vmxcap | 1 + + target/i386/cpu.c | 1 + + target/i386/cpu.h | 1 + + 3 files changed, 3 insertions(+) + +diff --git a/scripts/kvm/vmxcap b/scripts/kvm/vmxcap +index 44898d73c2..508be19c75 100755 +--- a/scripts/kvm/vmxcap ++++ b/scripts/kvm/vmxcap +@@ -117,6 +117,7 @@ controls = [ + 54: 'INS/OUTS instruction information', + 55: 'IA32_VMX_TRUE_*_CTLS support', + 56: 'Skip checks on event error code', ++ 58: 'VMX nested exception support', + }, + msr = MSR_IA32_VMX_BASIC, + ), +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index 47f00392be..00e636e61c 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -1344,6 +1344,7 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { + [54] = "vmx-ins-outs", + [55] = "vmx-true-ctls", + [56] = "vmx-any-errcode", ++ [58] = "vmx-nested-exception", + }, + .msr = { + .index = MSR_IA32_VMX_BASIC, +diff --git a/target/i386/cpu.h b/target/i386/cpu.h +index 418daeab04..b03237c305 100644 +--- a/target/i386/cpu.h ++++ b/target/i386/cpu.h +@@ -1065,6 +1065,7 @@ uint64_t x86_cpu_get_supported_feature_word(FeatureWord w, + #define MSR_VMX_BASIC_INS_OUTS (1ULL << 54) + #define MSR_VMX_BASIC_TRUE_CTLS (1ULL << 55) + #define MSR_VMX_BASIC_ANY_ERRCODE (1ULL << 56) ++#define MSR_VMX_BASIC_NESTED_EXCEPTION (1ULL << 58) + + #define MSR_VMX_MISC_PREEMPTION_TIMER_SHIFT_MASK 0x1Full + #define MSR_VMX_MISC_STORE_LMA (1ULL << 5) +-- +2.41.0.windows.1 + diff --git a/target-i386-fix-feature-dependency-for-WAITPKG.patch b/target-i386-fix-feature-dependency-for-WAITPKG.patch new file mode 100644 index 0000000000000000000000000000000000000000..0cea61cff54b0ecfa751efeadc88ca04853c2ad2 --- /dev/null +++ b/target-i386-fix-feature-dependency-for-WAITPKG.patch @@ -0,0 +1,39 @@ +From 
bce44f92530fed18cac1e51f81217a6addf992bd Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Wed, 8 May 2024 11:10:54 +0200 +Subject: [PATCH] target/i386: fix feature dependency for WAITPKG + +commit fe01af5d47d4cf7fdf90c54d43f784e5068c8d72 upstream. + +The VMX feature bit depends on general availability of WAITPKG, +not the other way round. + +Intel-SIG: commit fe01af5d47d4 target/i386: fix feature dependency for WAITPKG + +Fixes: 33cc88261c3 ("target/i386: add support for VMX_SECONDARY_EXEC_ENABLE_USER_WAIT_PAUSE", 2023-08-28) +Cc: qemu-stable@nongnu.org +Reviewed-by: Zhao Liu +Signed-off-by: Paolo Bonzini +Signed-off-by: Jason Zeng +--- + target/i386/cpu.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index f3df62127c..860934b39f 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -1550,8 +1550,8 @@ static FeatureDep feature_dependencies[] = { + .to = { FEAT_SVM, ~0ull }, + }, + { +- .from = { FEAT_VMX_SECONDARY_CTLS, VMX_SECONDARY_EXEC_ENABLE_USER_WAIT_PAUSE }, +- .to = { FEAT_7_0_ECX, CPUID_7_0_ECX_WAITPKG }, ++ .from = { FEAT_7_0_ECX, CPUID_7_0_ECX_WAITPKG }, ++ .to = { FEAT_VMX_SECONDARY_CTLS, VMX_SECONDARY_EXEC_ENABLE_USER_WAIT_PAUSE }, + }, + }; + +-- +2.41.0.windows.1 + diff --git a/target-i386-mark-CR4.FRED-not-reserved.patch b/target-i386-mark-CR4.FRED-not-reserved.patch new file mode 100644 index 0000000000000000000000000000000000000000..055447413f2192f51d7c7ba51fbfed648ea4263b --- /dev/null +++ b/target-i386-mark-CR4.FRED-not-reserved.patch @@ -0,0 +1,67 @@ +From 1a2ee56c173984212ba7b9970aa36e307094d460 Mon Sep 17 00:00:00 2001 +From: Xin Li +Date: Wed, 8 Nov 2023 23:20:08 -0800 +Subject: [PATCH] target/i386: mark CR4.FRED not reserved + +commit f88ddc40c6d8b591a357108feec52cea13796d2d upstream. + +The CR4.FRED bit, i.e., CR4[32], is no longer a reserved bit when FRED +is exposed to guests, otherwise it is still a reserved bit. 
+ +Intel-SIG: commit f88ddc40c6d8 target/i386: mark CR4.FRED not reserved + +Tested-by: Shan Kang +Signed-off-by: Xin Li +Reviewed-by: Zhao Liu +Message-ID: <20231109072012.8078-3-xin3.li@intel.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Jason Zeng +--- + target/i386/cpu.h | 17 ++++++++++++++++- + 1 file changed, 16 insertions(+), 1 deletion(-) + +diff --git a/target/i386/cpu.h b/target/i386/cpu.h +index f392626f98..418daeab04 100644 +--- a/target/i386/cpu.h ++++ b/target/i386/cpu.h +@@ -264,6 +264,18 @@ typedef enum X86Seg { + #define CR4_PKS_MASK (1U << 24) + #define CR4_LAM_SUP_MASK (1U << 28) + ++#ifdef TARGET_X86_64 ++#define CR4_FRED_MASK (1ULL << 32) ++#else ++#define CR4_FRED_MASK 0 ++#endif ++ ++#ifdef TARGET_X86_64 ++#define CR4_FRED_MASK (1ULL << 32) ++#else ++#define CR4_FRED_MASK 0 ++#endif ++ + #define CR4_RESERVED_MASK \ + (~(target_ulong)(CR4_VME_MASK | CR4_PVI_MASK | CR4_TSD_MASK \ + | CR4_DE_MASK | CR4_PSE_MASK | CR4_PAE_MASK \ +@@ -272,7 +284,7 @@ typedef enum X86Seg { + | CR4_LA57_MASK \ + | CR4_FSGSBASE_MASK | CR4_PCIDE_MASK | CR4_OSXSAVE_MASK \ + | CR4_SMEP_MASK | CR4_SMAP_MASK | CR4_PKE_MASK | CR4_PKS_MASK \ +- | CR4_LAM_SUP_MASK)) ++ | CR4_LAM_SUP_MASK | CR4_FRED_MASK)) + + #define DR6_BD (1 << 13) + #define DR6_BS (1 << 14) +@@ -2551,6 +2563,9 @@ static inline uint64_t cr4_reserved_bits(CPUX86State *env) + if (!(env->features[FEAT_7_1_EAX] & CPUID_7_1_EAX_LAM)) { + reserved_bits |= CR4_LAM_SUP_MASK; + } ++ if (!(env->features[FEAT_7_1_EAX] & CPUID_7_1_EAX_FRED)) { ++ reserved_bits |= CR4_FRED_MASK; ++ } + return reserved_bits; + } + +-- +2.41.0.windows.1 + diff --git a/target-i386-pass-X86CPU-to-x86_cpu_get_supported_fea.patch b/target-i386-pass-X86CPU-to-x86_cpu_get_supported_fea.patch new file mode 100644 index 0000000000000000000000000000000000000000..a69bdefb882ebceb2d8a4199570611b2763846fb --- /dev/null +++ b/target-i386-pass-X86CPU-to-x86_cpu_get_supported_fea.patch @@ -0,0 +1,108 @@ +From 
bd6fec2cb2bb811aa73a2a6e6da45c76ecded49c Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Thu, 27 Jun 2024 01:12:42 +0200 +Subject: [PATCH] target/i386: pass X86CPU to + x86_cpu_get_supported_feature_word + +commit 8dee38483274bd0fcf3f74dea024d719b958200d upstream. + +This allows modifying the bits in "-cpu max"/"-cpu host" depending on +the guest CPU vendor (which, at least by default, is the host vendor in +the case of KVM). + +For example, machine check architecture differs between Intel and AMD, +and bits from AMD should be dropped when configuring the guest for +an Intel model. + +Intel-SIG: commit 8dee38483274 target/i386: pass X86CPU to x86_cpu_get_supported_feature_word + +Cc: Xiaoyao Li +Cc: John Allen +Signed-off-by: Paolo Bonzini +Signed-off-by: Jason Zeng +--- + target/i386/cpu.c | 11 +++++------ + target/i386/cpu.h | 3 +-- + target/i386/kvm/kvm-cpu.c | 2 +- + 3 files changed, 7 insertions(+), 9 deletions(-) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index f80570f4da..dfc0f7fd2d 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -5959,8 +5959,7 @@ CpuDefinitionInfoList *qmp_query_cpu_definitions(Error **errp) + + #endif /* !CONFIG_USER_ONLY */ + +-uint64_t x86_cpu_get_supported_feature_word(FeatureWord w, +- bool migratable_only) ++uint64_t x86_cpu_get_supported_feature_word(X86CPU *cpu, FeatureWord w) + { + FeatureWordInfo *wi = &feature_word_info[w]; + uint64_t r = 0; +@@ -6002,7 +6001,7 @@ uint64_t x86_cpu_get_supported_feature_word(FeatureWord w, + r &= ~unavail; + } + #endif +- if (migratable_only) { ++ if (cpu && cpu->migratable) { + r &= x86_cpu_get_migratable_flags(w); + } + return r; +@@ -7324,7 +7323,7 @@ void x86_cpu_expand_features(X86CPU *cpu, Error **errp) + * by the user. 
+ */ + env->features[w] |= +- x86_cpu_get_supported_feature_word(w, cpu->migratable) & ++ x86_cpu_get_supported_feature_word(cpu, w) & + ~env->user_features[w] & + ~feature_word_info[w].no_autoenable_flags; + } +@@ -7450,7 +7449,7 @@ static void x86_cpu_filter_features(X86CPU *cpu, bool verbose) + + for (w = 0; w < FEATURE_WORDS; w++) { + uint64_t host_feat = +- x86_cpu_get_supported_feature_word(w, false); ++ x86_cpu_get_supported_feature_word(NULL, w); + uint64_t requested_features = env->features[w]; + uint64_t unavailable_features = requested_features & ~host_feat; + mark_unavailable_features(cpu, w, unavailable_features, prefix); +@@ -7566,7 +7565,7 @@ static void x86_cpu_realizefn(DeviceState *dev, Error **errp) + env->features[FEAT_PERF_CAPABILITIES] & PERF_CAP_LBR_FMT; + if (requested_lbr_fmt && kvm_enabled()) { + uint64_t host_perf_cap = +- x86_cpu_get_supported_feature_word(FEAT_PERF_CAPABILITIES, false); ++ x86_cpu_get_supported_feature_word(NULL, FEAT_PERF_CAPABILITIES); + unsigned host_lbr_fmt = host_perf_cap & PERF_CAP_LBR_FMT; + + if (!cpu->enable_pmu) { +diff --git a/target/i386/cpu.h b/target/i386/cpu.h +index fb6721f182..b90182582f 100644 +--- a/target/i386/cpu.h ++++ b/target/i386/cpu.h +@@ -655,8 +655,7 @@ typedef enum FeatureWord { + } FeatureWord; + + typedef uint64_t FeatureWordArray[FEATURE_WORDS]; +-uint64_t x86_cpu_get_supported_feature_word(FeatureWord w, +- bool migratable_only); ++uint64_t x86_cpu_get_supported_feature_word(X86CPU *cpu, FeatureWord w); + + /* cpuid_features bits */ + #define CPUID_FP87 (1U << 0) +diff --git a/target/i386/kvm/kvm-cpu.c b/target/i386/kvm/kvm-cpu.c +index f76972e47e..a3bc8d8f83 100644 +--- a/target/i386/kvm/kvm-cpu.c ++++ b/target/i386/kvm/kvm-cpu.c +@@ -137,7 +137,7 @@ static void kvm_cpu_xsave_init(void) + if (!esa->size) { + continue; + } +- if ((x86_cpu_get_supported_feature_word(esa->feature, false) & esa->bits) ++ if ((x86_cpu_get_supported_feature_word(NULL, esa->feature) & esa->bits) + != 
esa->bits) { + continue; + } +-- +2.41.0.windows.1 + diff --git a/target-s390x-Fix-a-typo-in-s390_cpu_class_init.patch b/target-s390x-Fix-a-typo-in-s390_cpu_class_init.patch new file mode 100644 index 0000000000000000000000000000000000000000..07f7fc11dbf65f7f599334f95bc58d2ba5a4d9f0 --- /dev/null +++ b/target-s390x-Fix-a-typo-in-s390_cpu_class_init.patch @@ -0,0 +1,39 @@ +From 52cc8f5a9ba854268a58402d351d2fd43dddb1b4 Mon Sep 17 00:00:00 2001 +From: qihao_yewu +Date: Mon, 7 Apr 2025 17:54:20 -0400 +Subject: [PATCH] target/s390x: Fix a typo in s390_cpu_class_init() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +cherry-pick from 6a93b1c7b4cfa4f5e3c0b8a17177ce14aaa2346c + +Replace the comma at the end of the line by a semicolon. + +Fixes: 41868f846d2 ("s390x/cpumodel: "host" and "qemu" as CPU subclasses") +Reviewed-by: Richard Henderson +Reviewed-by: Thomas Huth +Signed-off-by: Philippe Mathieu-Daudé +Message-ID: <20250324165356.39540-1-philmd@linaro.org> +Signed-off-by: Thomas Huth +Signed-off-by: qihao_yewu +--- + target/s390x/cpu.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c +index 6acfa1c91b..5e64f24cc2 100644 +--- a/target/s390x/cpu.c ++++ b/target/s390x/cpu.c +@@ -350,7 +350,7 @@ static void s390_cpu_class_init(ObjectClass *oc, void *data) + device_class_set_parent_reset(dc, s390_cpu_reset_full, &scc->parent_reset); + + scc->reset = s390_cpu_reset; +- cc->class_by_name = s390_cpu_class_by_name, ++ cc->class_by_name = s390_cpu_class_by_name; + cc->has_work = s390_cpu_has_work; + cc->dump_state = s390_cpu_dump_state; + cc->query_cpu_fast = s390_query_cpu_fast; +-- +2.41.0.windows.1 + diff --git a/tests-data-acpi-Update-DSDT-acpi-tables.patch b/tests-data-acpi-Update-DSDT-acpi-tables.patch new file mode 100644 index 0000000000000000000000000000000000000000..bc52330a4c343dcb9f5cbfd768667ad6cceb6ccc --- /dev/null +++
b/tests-data-acpi-Update-DSDT-acpi-tables.patch @@ -0,0 +1,70 @@ +From 4a065d0fbbe159dfbc073e4480434d6889b7c5a4 Mon Sep 17 00:00:00 2001 +From: caijian +Date: Mon, 31 Mar 2025 15:03:02 +0800 +Subject: [PATCH] tests/data/acpi: Update DSDT acpi tables + +- * Disassembly of tests/data/acpi/virt/DSDT, Fri Mar 28 16:43:04 2025 ++ * Disassembly of /tmp/aml-1KF432, Fri Mar 28 16:43:04 2025 + * + * Original Table Header: + * Signature "DSDT" + * Length 0x000016B6 (5814) + * Revision 0x02 +- * Checksum 0x46 ++ * Checksum 0x47 + * OEM ID "BOCHS " + * OEM Table ID "BXPC " + * OEM Revision 0x00000001 (1) + * Compiler ID "BXPC" + * Compiler Version 0x00000001 (1) + */ + DefinitionBlock ("", "DSDT", 2, "BOCHS ", "BXPC ", 0x00000001) +@@ -2090,33 +2090,33 @@ + } + Else + { + CDW1 |= 0x04 + Return (Arg3) + } + } + + Method (_DSM, 4, NotSerialized) // _DSM: Device-Specific Method + { + If ((Arg0 == ToUUID ("e5c937d0-3553-4d7a-9117-ea4d19c3434d") /* Device Labeling Interface */)) + { + If ((Arg2 == Zero)) + { + Return (Buffer (One) + { +- 0x01 // . ++ 0x00 // . 
+ }) + } + } + + Return (Buffer (One) + { + 0x00 + }) + } + +Signed-off-by: caijian +--- + tests/qtest/bios-tables-test-allowed-diff.h | 6 ------ + 1 files changed, 6 deletions(-) + +diff --git a/tests/qtest/bios-tables-test-allowed-diff.h b/tests/qtest/bios-tables-test-allowed-diff.h +index e4a94bb8bd..dfb8523c8b 100644 +--- a/tests/qtest/bios-tables-test-allowed-diff.h ++++ b/tests/qtest/bios-tables-test-allowed-diff.h +@@ -1,7 +1 @@ + /* List of comma-separated changed AML files to ignore */ +-"tests/data/acpi/microvm/DSDT.pcie", +-"tests/data/acpi/virt/DSDT", +-"tests/data/acpi/virt/DSDT.acpihmatvirt", +-"tests/data/acpi/virt/DSDT.memhp", +-"tests/data/acpi/virt/DSDT.pxb", +-"tests/data/acpi/virt/DSDT.topology", +-- +2.41.0.windows.1 + diff --git a/tests-data-acpi-virt-Update-IORT-acpi-table.patch b/tests-data-acpi-virt-Update-IORT-acpi-table.patch new file mode 100644 index 0000000000000000000000000000000000000000..d509ccc4570e9fc300f6d99427f342aae199983a --- /dev/null +++ b/tests-data-acpi-virt-Update-IORT-acpi-table.patch @@ -0,0 +1,76 @@ +From bf12438e93f2d55aac6245f6a9f77f51b6fd2d8a Mon Sep 17 00:00:00 2001 +From: caijian +Date: Mon, 31 Mar 2025 15:06:24 +0800 +Subject: [PATCH] tests/data/acpi/virt: Update IORT acpi table + +- * Disassembly of tests/data/acpi/virt/IORT, Fri Mar 28 18:05:37 2025 ++ * Disassembly of /tmp/aml-9R3932, Fri Mar 28 18:05:37 2025 + * + * ACPI Data Table [IORT] + * + * Format: [HexOffset DecimalOffset ByteLength] FieldName : FieldValue + */ + + [000h 0000 4] Signature : "IORT" [IO Remapping Table] + [004h 0004 4] Table Length : 00000080 +-[008h 0008 1] Revision : 03 +-[009h 0009 1] Checksum : B3 ++[008h 0008 1] Revision : 05 ++[009h 0009 1] Checksum : AE + [00Ah 0010 6] Oem ID : "BOCHS " + [010h 0016 8] Oem Table ID : "BXPC " + [018h 0024 4] Oem Revision : 00000001 + [01Ch 0028 4] Asl Compiler ID : "BXPC" + [020h 0032 4] Asl Compiler Revision : 00000001 +@@ -45,32 +45,32 @@ + [058h 0088 4] Cache Coherency : 00000001 + [05Ch 0092 1] 
Hints (decoded below) : 00 + Transient : 0 + Write Allocate : 0 + Read Allocate : 0 + Override : 0 + [05Dh 0093 2] Reserved : 0000 + [05Fh 0095 1] Memory Flags (decoded below) : 03 + Coherency : 1 + Device Attribute : 1 + [060h 0096 4] ATS Attribute : 00000000 + [064h 0100 4] PCI Segment Number : 00000000 + [068h 0104 1] Memory Size Limit : 40 + [069h 0105 3] Reserved : 000000 + + [06Ch 0108 4] Input base : 00000000 +-[070h 0112 4] ID Count : 0000FFFF ++[070h 0112 4] ID Count : 00010000 + [074h 0116 4] Output Base : 00000000 + [078h 0120 4] Output Reference : 00000030 + [07Ch 0124 4] Flags (decoded below) : 00000000 + Single Mapping : 0 + + Raw Table Data: Length 128 (0x80) + +- 0000: 49 4F 52 54 80 00 00 00 03 B3 42 4F 43 48 53 20 // IORT......BOCHS ++ 0000: 49 4F 52 54 80 00 00 00 05 AE 42 4F 43 48 53 20 // IORT......BOCHS + 0010: 42 58 50 43 20 20 20 20 01 00 00 00 42 58 50 43 // BXPC ....BXPC + 0020: 01 00 00 00 02 00 00 00 30 00 00 00 00 00 00 00 // ........0....... + 0030: 00 18 00 01 00 00 00 00 00 00 00 00 00 00 00 00 // ................ + 0040: 01 00 00 00 00 00 00 00 02 38 00 03 01 00 00 00 // .........8...... + 0050: 01 00 00 00 24 00 00 00 01 00 00 00 00 00 00 03 // ....$........... + 0060: 00 00 00 00 00 00 00 00 40 00 00 00 00 00 00 00 // ........@....... +- 0070: FF FF 00 00 00 00 00 00 30 00 00 00 00 00 00 00 // ........0....... ++ 0070: 00 00 01 00 00 00 00 00 30 00 00 00 00 00 00 00 // ........0....... 
+ +Signed-off-by: caijian +--- + tests/qtest/bios-tables-test-allowed-diff.h | 1 - + 1 file changed, 1 deletion(-) + +diff --git a/tests/qtest/bios-tables-test-allowed-diff.h b/tests/qtest/bios-tables-test-allowed-diff.h +index 9a5a923d6b..dfb8523c8b 100644 +--- a/tests/qtest/bios-tables-test-allowed-diff.h ++++ b/tests/qtest/bios-tables-test-allowed-diff.h +@@ -1,2 +1 @@ + /* List of comma-separated changed AML files to ignore */ +-"tests/data/acpi/virt/IORT", +-- +2.41.0.windows.1 + diff --git a/tests-qtest-Allow-DSDT-acpi-tables-to-change.patch b/tests-qtest-Allow-DSDT-acpi-tables-to-change.patch new file mode 100644 index 0000000000000000000000000000000000000000..0e3356c5387828ae5fa0232d594e341461d6f465 --- /dev/null +++ b/tests-qtest-Allow-DSDT-acpi-tables-to-change.patch @@ -0,0 +1,27 @@ +From ea23e4215b332446d4964769d004f7a11caba00b Mon Sep 17 00:00:00 2001 +From: caijian +Date: Mon, 31 Mar 2025 15:02:37 +0800 +Subject: [PATCH] tests/qtest: Allow DSDT acpi tables to change + +List all DSDT files and allow them to change.
+ +Signed-off-by: caijian +--- + tests/qtest/bios-tables-test-allowed-diff.h | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/tests/qtest/bios-tables-test-allowed-diff.h b/tests/qtest/bios-tables-test-allowed-diff.h +index dfb8523c8b..e4a94bb8bd 100644 +--- a/tests/qtest/bios-tables-test-allowed-diff.h ++++ b/tests/qtest/bios-tables-test-allowed-diff.h +@@ -1 +1,7 @@ + /* List of comma-separated changed AML files to ignore */ ++"tests/data/acpi/microvm/DSDT.pcie", ++"tests/data/acpi/virt/DSDT", ++"tests/data/acpi/virt/DSDT.acpihmatvirt", ++"tests/data/acpi/virt/DSDT.memhp", ++"tests/data/acpi/virt/DSDT.pxb", ++"tests/data/acpi/virt/DSDT.topology", +-- +2.41.0.windows.1 + diff --git a/tests-qtest-Allow-IORT-acpi-table-to-change.patch b/tests-qtest-Allow-IORT-acpi-table-to-change.patch new file mode 100644 index 0000000000000000000000000000000000000000..22bbe28f530de1bfb837468eba5ea89f168bf69a --- /dev/null +++ b/tests-qtest-Allow-IORT-acpi-table-to-change.patch @@ -0,0 +1,22 @@ +From ca17fd9b9e608e0a6e8a948ccf46fa020c12f510 Mon Sep 17 00:00:00 2001 +From: caijian +Date: Mon, 31 Mar 2025 15:06:13 +0800 +Subject: [PATCH] tests/qtest: Allow IORT acpi table to change + +List changed IORT file and allow it to change.
+ +Signed-off-by: caijian +--- + tests/qtest/bios-tables-test-allowed-diff.h | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/tests/qtest/bios-tables-test-allowed-diff.h b/tests/qtest/bios-tables-test-allowed-diff.h +index dfb8523c8b..9a5a923d6b 100644 +--- a/tests/qtest/bios-tables-test-allowed-diff.h ++++ b/tests/qtest/bios-tables-test-allowed-diff.h +@@ -1 +1,2 @@ + /* List of comma-separated changed AML files to ignore */ ++"tests/data/acpi/virt/IORT", +-- +2.41.0.windows.1 + diff --git a/util-char_dev-Add-open_cdev.patch b/util-char_dev-Add-open_cdev.patch new file mode 100644 index 0000000000000000000000000000000000000000..b6af1918d6b5181c0290f5dada9a4ab3c1962f50 --- /dev/null +++ b/util-char_dev-Add-open_cdev.patch @@ -0,0 +1,167 @@ +From 90688ff9c5802965f24460ac79fe52b93d2adb1f Mon Sep 17 00:00:00 2001 +From: Yi Liu +Date: Sat, 11 Jan 2025 10:52:38 +0800 +Subject: [PATCH] util/char_dev: Add open_cdev() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +/dev/vfio/devices/vfioX may not exist. In that case it is still possible +to open /dev/char/$major:$minor instead. Add helper function to abstract +the cdev open. 
+ +Suggested-by: Jason Gunthorpe +Signed-off-by: Yi Liu +Signed-off-by: Zhenzhong Duan +Reviewed-by: Cédric Le Goater +Reviewed-by: Eric Auger +Tested-by: Eric Auger +Tested-by: Nicolin Chen +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhou Wang +--- + MAINTAINERS | 2 + + include/qemu/chardev_open.h | 16 ++++++++ + util/chardev_open.c | 81 +++++++++++++++++++++++++++++++++++++ + util/meson.build | 1 + + 4 files changed, 100 insertions(+) + create mode 100644 include/qemu/chardev_open.h + create mode 100644 util/chardev_open.c + +diff --git a/MAINTAINERS b/MAINTAINERS +index a5a446914a..ca70bb4e64 100644 +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -2174,6 +2174,8 @@ M: Zhenzhong Duan + S: Supported + F: backends/iommufd.c + F: include/sysemu/iommufd.h ++F: include/qemu/chardev_open.h ++F: util/chardev_open.c + + vhost + M: Michael S. Tsirkin +diff --git a/include/qemu/chardev_open.h b/include/qemu/chardev_open.h +new file mode 100644 +index 0000000000..64e8fcfdcb +--- /dev/null ++++ b/include/qemu/chardev_open.h +@@ -0,0 +1,16 @@ ++/* ++ * QEMU Chardev Helper ++ * ++ * Copyright (C) 2023 Intel Corporation. ++ * ++ * Authors: Yi Liu ++ * ++ * This work is licensed under the terms of the GNU GPL, version 2. See ++ * the COPYING file in the top-level directory. ++ */ ++ ++#ifndef QEMU_CHARDEV_OPEN_H ++#define QEMU_CHARDEV_OPEN_H ++ ++int open_cdev(const char *devpath, dev_t cdev); ++#endif +diff --git a/util/chardev_open.c b/util/chardev_open.c +new file mode 100644 +index 0000000000..f776429788 +--- /dev/null ++++ b/util/chardev_open.c +@@ -0,0 +1,81 @@ ++/* ++ * Copyright (c) 2019, Mellanox Technologies. All rights reserved. ++ * Copyright (C) 2023 Intel Corporation. ++ * ++ * This software is available to you under a choice of one of two ++ * licenses. 
You may choose to be licensed under the terms of the GNU ++ * General Public License (GPL) Version 2, available from the file ++ * COPYING in the main directory of this source tree, or the ++ * OpenIB.org BSD license below: ++ * ++ * Redistribution and use in source and binary forms, with or ++ * without modification, are permitted provided that the following ++ * conditions are met: ++ * ++ * - Redistributions of source code must retain the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials ++ * provided with the distribution. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND ++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS ++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN ++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. ++ * ++ * Authors: Yi Liu ++ * ++ * Copied from ++ * https://github.com/linux-rdma/rdma-core/blob/master/util/open_cdev.c ++ * ++ */ ++ ++#include "qemu/osdep.h" ++#include "qemu/chardev_open.h" ++ ++static int open_cdev_internal(const char *path, dev_t cdev) ++{ ++ struct stat st; ++ int fd; ++ ++ fd = qemu_open_old(path, O_RDWR); ++ if (fd == -1) { ++ return -1; ++ } ++ if (fstat(fd, &st) || !S_ISCHR(st.st_mode) || ++ (cdev != 0 && st.st_rdev != cdev)) { ++ close(fd); ++ return -1; ++ } ++ return fd; ++} ++ ++static int open_cdev_robust(dev_t cdev) ++{ ++ g_autofree char *devpath = NULL; ++ ++ /* ++ * This assumes that udev is being used and is creating the /dev/char/ ++ * symlinks. 
++ */ ++ devpath = g_strdup_printf("/dev/char/%u:%u", major(cdev), minor(cdev)); ++ return open_cdev_internal(devpath, cdev); ++} ++ ++int open_cdev(const char *devpath, dev_t cdev) ++{ ++ int fd; ++ ++ fd = open_cdev_internal(devpath, cdev); ++ if (fd == -1 && cdev != 0) { ++ return open_cdev_robust(cdev); ++ } ++ return fd; ++} +diff --git a/util/meson.build b/util/meson.build +index c2322ef6e7..174c133368 100644 +--- a/util/meson.build ++++ b/util/meson.build +@@ -108,6 +108,7 @@ if have_block + util_ss.add(files('filemonitor-stub.c')) + endif + util_ss.add(when: 'CONFIG_LINUX', if_true: files('vfio-helpers.c')) ++ util_ss.add(when: 'CONFIG_LINUX', if_true: files('chardev_open.c')) + endif + + if cpu == 'aarch64' +-- +2.41.0.windows.1 + diff --git a/vfio-Create-host-IOMMU-device-instance.patch b/vfio-Create-host-IOMMU-device-instance.patch new file mode 100644 index 0000000000000000000000000000000000000000..03b681ffd6c68dc6ddfae96fb29dd4045596ffc6 --- /dev/null +++ b/vfio-Create-host-IOMMU-device-instance.patch @@ -0,0 +1,124 @@ +From a152921f6d534f2a515b4e88304ad115fae8fa8f Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Wed, 5 Jun 2024 16:30:37 +0800 +Subject: [PATCH] vfio: Create host IOMMU device instance +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Create host IOMMU device instance in vfio_attach_device() and call +.realize() to initialize it further. + +Introduce attribute VFIOIOMMUClass::hiod_typename and initialize +it based on VFIO backend type. It will facilitate HostIOMMUDevice +creation in vfio_attach_device(). + +Suggested-by: Cédric Le Goater +Signed-off-by: Zhenzhong Duan +Reviewed-by: Eric Auger +Reviewed-by: Michael S.
Tsirkin +--- + hw/vfio/common.c | 18 +++++++++++++++++- + hw/vfio/container.c | 2 ++ + hw/vfio/iommufd.c | 2 ++ + include/hw/vfio/vfio-common.h | 1 + + include/hw/vfio/vfio-container-base.h | 3 +++ + 5 files changed, 25 insertions(+), 1 deletion(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index b5d02df0c2..d5ff65f90a 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -1650,6 +1650,8 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev, + { + const VFIOIOMMUClass *ops = + VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY)); ++ HostIOMMUDevice *hiod = NULL; ++ int ret; + + if (vbasedev->iommufd) { + ops = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD)); +@@ -1657,7 +1659,20 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev, + + assert(ops); + +- return ops->attach_device(name, vbasedev, as, errp); ++ ret = ops->attach_device(name, vbasedev, as, errp); ++ if (ret) { ++ return ret; ++ } ++ ++ hiod = HOST_IOMMU_DEVICE(object_new(ops->hiod_typename)); ++ if (!HOST_IOMMU_DEVICE_GET_CLASS(hiod)->realize(hiod, vbasedev, errp)) { ++ object_unref(hiod); ++ ops->detach_device(vbasedev); ++ return -1; ++ } ++ vbasedev->hiod = hiod; ++ ++ return 0; + } + + void vfio_detach_device(VFIODevice *vbasedev) +@@ -1665,5 +1680,6 @@ void vfio_detach_device(VFIODevice *vbasedev) + if (!vbasedev->bcontainer) { + return; + } ++ object_unref(vbasedev->hiod); + vbasedev->bcontainer->ops->detach_device(vbasedev); + } +diff --git a/hw/vfio/container.c b/hw/vfio/container.c +index ed54ce6d0c..10f7635425 100644 +--- a/hw/vfio/container.c ++++ b/hw/vfio/container.c +@@ -1240,6 +1240,8 @@ static void vfio_iommu_legacy_class_init(ObjectClass *klass, void *data) + { + VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass); + ++ vioc->hiod_typename = TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO; ++ + vioc->setup = vfio_legacy_setup; + vioc->dma_map = vfio_legacy_dma_map; + vioc->dma_unmap = vfio_legacy_dma_unmap; +diff --git a/hw/vfio/iommufd.c 
b/hw/vfio/iommufd.c +index 2efdba5565..7cbf0e44f1 100644 +--- a/hw/vfio/iommufd.c ++++ b/hw/vfio/iommufd.c +@@ -629,6 +629,8 @@ static void vfio_iommu_iommufd_class_init(ObjectClass *klass, void *data) + { + VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass); + ++ vioc->hiod_typename = TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO; ++ + vioc->dma_map = iommufd_cdev_map; + vioc->dma_unmap = iommufd_cdev_unmap; + vioc->attach_device = iommufd_cdev_attach; +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index 376b8350b9..d45d40c329 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -140,6 +140,7 @@ typedef struct VFIODevice { + OnOffAuto pre_copy_dirty_page_tracking; + bool dirty_pages_supported; + bool dirty_tracking; ++ HostIOMMUDevice *hiod; + int devid; + IOMMUFDBackend *iommufd; + } VFIODevice; +diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h +index b2813b0c11..7a4c575115 100644 +--- a/include/hw/vfio/vfio-container-base.h ++++ b/include/hw/vfio/vfio-container-base.h +@@ -109,6 +109,9 @@ DECLARE_CLASS_CHECKERS(VFIOIOMMUClass, VFIO_IOMMU, TYPE_VFIO_IOMMU) + struct VFIOIOMMUClass { + InterfaceClass parent_class; + ++ /* Properties */ ++ const char *hiod_typename; ++ + /* basic feature */ + int (*setup)(VFIOContainerBase *bcontainer, Error **errp); + int (*dma_map)(const VFIOContainerBase *bcontainer, +-- +2.41.0.windows.1 + diff --git a/vfio-Introduce-a-helper-function-to-initialize-VFIOD.patch b/vfio-Introduce-a-helper-function-to-initialize-VFIOD.patch new file mode 100644 index 0000000000000000000000000000000000000000..f6c11b9690f0a6e0b8e15e52d6448952055c72b9 --- /dev/null +++ b/vfio-Introduce-a-helper-function-to-initialize-VFIOD.patch @@ -0,0 +1,145 @@ +From 65c5381ba3ce5f062f0be9aa796e68b8a9d6bb3c Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Sat, 11 Jan 2025 10:53:02 +0800 +Subject: [PATCH] vfio: Introduce a helper function to initialize VFIODevice 
+MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Introduce a helper function to replace the common code to initialize +VFIODevice in pci, platform, ap and ccw VFIO device. + +No functional change intended. + +Suggested-by: Cédric Le Goater +Signed-off-by: Zhenzhong Duan +Tested-by: Nicolin Chen +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhou Wang +--- + hw/vfio/ap.c | 8 ++------ + hw/vfio/ccw.c | 8 ++------ + hw/vfio/helpers.c | 11 +++++++++++ + hw/vfio/pci.c | 6 ++---- + hw/vfio/platform.c | 6 ++---- + include/hw/vfio/vfio-common.h | 2 ++ + 6 files changed, 21 insertions(+), 20 deletions(-) + +diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c +index 95fe7cd98b..e157aa1ff7 100644 +--- a/hw/vfio/ap.c ++++ b/hw/vfio/ap.c +@@ -226,18 +226,14 @@ static void vfio_ap_instance_init(Object *obj) + VFIOAPDevice *vapdev = VFIO_AP_DEVICE(obj); + VFIODevice *vbasedev = &vapdev->vdev; + +- vbasedev->type = VFIO_DEVICE_TYPE_AP; +- vbasedev->ops = &vfio_ap_ops; +- vbasedev->dev = DEVICE(vapdev); +- vbasedev->fd = -1; +- + /* + * vfio-ap devices operate in a way compatible with discarding of + * memory in RAM blocks, as no pages are pinned in the host. + * This needs to be set before vfio_get_device() for vfio common to + * handle ram_block_discard_disable(). 
+ */ +- vbasedev->ram_block_discard_allowed = true; ++ vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_AP, &vfio_ap_ops, ++ DEVICE(vapdev), true); + } + + #ifdef CONFIG_IOMMUFD +diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c +index 6305a4c1b8..90e4a53437 100644 +--- a/hw/vfio/ccw.c ++++ b/hw/vfio/ccw.c +@@ -683,11 +683,6 @@ static void vfio_ccw_instance_init(Object *obj) + VFIOCCWDevice *vcdev = VFIO_CCW(obj); + VFIODevice *vbasedev = &vcdev->vdev; + +- vbasedev->type = VFIO_DEVICE_TYPE_CCW; +- vbasedev->ops = &vfio_ccw_ops; +- vbasedev->dev = DEVICE(vcdev); +- vbasedev->fd = -1; +- + /* + * All vfio-ccw devices are believed to operate in a way compatible with + * discarding of memory in RAM blocks, ie. pages pinned in the host are +@@ -696,7 +691,8 @@ static void vfio_ccw_instance_init(Object *obj) + * needs to be set before vfio_get_device() for vfio common to handle + * ram_block_discard_disable(). + */ +- vbasedev->ram_block_discard_allowed = true; ++ vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_CCW, &vfio_ccw_ops, ++ DEVICE(vcdev), true); + } + + #ifdef CONFIG_IOMMUFD +diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c +index 3592c3d54e..6789870802 100644 +--- a/hw/vfio/helpers.c ++++ b/hw/vfio/helpers.c +@@ -652,3 +652,14 @@ void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp) + } + vbasedev->fd = fd; + } ++ ++void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops, ++ DeviceState *dev, bool ram_discard) ++{ ++ vbasedev->type = type; ++ vbasedev->ops = ops; ++ vbasedev->dev = dev; ++ vbasedev->fd = -1; ++ ++ vbasedev->ram_block_discard_allowed = ram_discard; ++} +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index 87405584d7..1874ec1aba 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -3327,10 +3327,8 @@ static void vfio_instance_init(Object *obj) + vdev->host.slot = ~0U; + vdev->host.function = ~0U; + +- vbasedev->type = VFIO_DEVICE_TYPE_PCI; +- vbasedev->ops = &vfio_pci_ops; +- vbasedev->dev = DEVICE(vdev); +- vbasedev->fd 
= -1; ++ vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_PCI, &vfio_pci_ops, ++ DEVICE(vdev), false); + + vdev->nv_gpudirect_clique = 0xFF; + +diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c +index 506eb8193f..a8d9b7da63 100644 +--- a/hw/vfio/platform.c ++++ b/hw/vfio/platform.c +@@ -657,10 +657,8 @@ static void vfio_platform_instance_init(Object *obj) + VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(obj); + VFIODevice *vbasedev = &vdev->vbasedev; + +- vbasedev->type = VFIO_DEVICE_TYPE_PLATFORM; +- vbasedev->ops = &vfio_platform_ops; +- vbasedev->dev = DEVICE(vdev); +- vbasedev->fd = -1; ++ vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_PLATFORM, &vfio_platform_ops, ++ DEVICE(vdev), false); + } + + #ifdef CONFIG_IOMMUFD +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index 37f01410d5..151b2ab65f 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -271,4 +271,6 @@ int vfio_get_dirty_bitmap(const VFIOContainerBase *bcontainer, uint64_t iova, + /* Returns 0 on success, or a negative errno. 
*/ + int vfio_device_get_name(VFIODevice *vbasedev, Error **errp); + void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp); ++void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops, ++ DeviceState *dev, bool ram_discard); + #endif /* HW_VFIO_VFIO_COMMON_H */ +-- +2.41.0.windows.1 + diff --git a/vfio-Introduce-base-object-for-VFIOContainer-and-tar.patch b/vfio-Introduce-base-object-for-VFIOContainer-and-tar.patch new file mode 100644 index 0000000000000000000000000000000000000000..6b32a5de7f63c5d499b9051e0df7414272ce30d2 --- /dev/null +++ b/vfio-Introduce-base-object-for-VFIOContainer-and-tar.patch @@ -0,0 +1,121 @@ +From 166ecdd78a0f5cf359c0cbb4f7a5c32beee12fd7 Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Sat, 11 Jan 2025 10:52:18 +0800 +Subject: [PATCH] vfio: Introduce base object for VFIOContainer and targeted + interface +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Introduce a dumb VFIOContainerBase object and its targeted interface. +This is willingly not a QOM object because we don't want it to be +visible from the user interface. The VFIOContainerBase will be +smoothly populated in subsequent patches as well as interfaces. + +No functional change intended. 
+ +Signed-off-by: Eric Auger +Signed-off-by: Yi Liu +Signed-off-by: Yi Sun +Signed-off-by: Zhenzhong Duan +Reviewed-by: Cédric Le Goater +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhou Wang +--- + include/hw/vfio/vfio-common.h | 8 ++--- + include/hw/vfio/vfio-container-base.h | 50 +++++++++++++++++++++++++++ + 2 files changed, 52 insertions(+), 6 deletions(-) + create mode 100644 include/hw/vfio/vfio-container-base.h + +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index fd9828d50b..c89b5886f2 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -30,6 +30,7 @@ + #include + #endif + #include "sysemu/sysemu.h" ++#include "hw/vfio/vfio-container-base.h" + + #define VFIO_MSG_PREFIX "vfio %s: " + +@@ -89,6 +90,7 @@ typedef struct VFIODMARange { + } VFIODMARange; + + typedef struct VFIOContainer { ++ VFIOContainerBase bcontainer; + VFIOAddressSpace *space; + int fd; /* /dev/vfio/vfio, empowered by the attached groups */ + MemoryListener listener; +@@ -211,12 +213,6 @@ typedef struct VFIODisplay { + } dmabuf; + } VFIODisplay; + +-typedef struct { +- unsigned long *bitmap; +- hwaddr size; +- hwaddr pages; +-} VFIOBitmap; +- + VFIOAddressSpace *vfio_get_address_space(AddressSpace *as); + void vfio_put_address_space(VFIOAddressSpace *space); + bool vfio_devices_all_running_and_saving(VFIOContainer *container); +diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h +new file mode 100644 +index 0000000000..1d6daaea5d +--- /dev/null ++++ b/include/hw/vfio/vfio-container-base.h +@@ -0,0 +1,50 @@ ++/* ++ * VFIO BASE CONTAINER ++ * ++ * Copyright (C) 2023 Intel Corporation. ++ * Copyright Red Hat, Inc. 
2023 ++ * ++ * Authors: Yi Liu ++ * Eric Auger ++ * ++ * SPDX-License-Identifier: GPL-2.0-or-later ++ */ ++ ++#ifndef HW_VFIO_VFIO_CONTAINER_BASE_H ++#define HW_VFIO_VFIO_CONTAINER_BASE_H ++ ++#include "exec/memory.h" ++ ++typedef struct VFIODevice VFIODevice; ++typedef struct VFIOIOMMUOps VFIOIOMMUOps; ++ ++typedef struct { ++ unsigned long *bitmap; ++ hwaddr size; ++ hwaddr pages; ++} VFIOBitmap; ++ ++/* ++ * This is the base object for vfio container backends ++ */ ++typedef struct VFIOContainerBase { ++ const VFIOIOMMUOps *ops; ++} VFIOContainerBase; ++ ++struct VFIOIOMMUOps { ++ /* basic feature */ ++ int (*dma_map)(VFIOContainerBase *bcontainer, ++ hwaddr iova, ram_addr_t size, ++ void *vaddr, bool readonly); ++ int (*dma_unmap)(VFIOContainerBase *bcontainer, ++ hwaddr iova, ram_addr_t size, ++ IOMMUTLBEntry *iotlb); ++ int (*attach_device)(const char *name, VFIODevice *vbasedev, ++ AddressSpace *as, Error **errp); ++ void (*detach_device)(VFIODevice *vbasedev); ++ /* migration feature */ ++ int (*set_dirty_page_tracking)(VFIOContainerBase *bcontainer, bool start); ++ int (*query_dirty_bitmap)(VFIOContainerBase *bcontainer, VFIOBitmap *vbmap, ++ hwaddr iova, hwaddr size); ++}; ++#endif /* HW_VFIO_VFIO_CONTAINER_BASE_H */ +-- +2.41.0.windows.1 + diff --git a/vfio-Make-VFIOContainerBase-poiner-parameter-const-i.patch b/vfio-Make-VFIOContainerBase-poiner-parameter-const-i.patch new file mode 100644 index 0000000000000000000000000000000000000000..6afb13850cebd59cb2c2c839ad76eaab17b587b0 --- /dev/null +++ b/vfio-Make-VFIOContainerBase-poiner-parameter-const-i.patch @@ -0,0 +1,287 @@ +From f702d050b4309bb7e7ffc159a3c41c82fe34ba07 Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Sat, 11 Jan 2025 10:52:54 +0800 +Subject: [PATCH] vfio: Make VFIOContainerBase poiner parameter const in + VFIOIOMMUOps callbacks +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Some of the callbacks in VFIOIOMMUOps pass VFIOContainerBase 
poiner, +those callbacks only need read access to the sub object of VFIOContainerBase. +So make VFIOContainerBase, VFIOContainer and VFIOIOMMUFDContainer as const +in these callbacks. + +Local functions called by those callbacks also need same changes to avoid +build error. + +Modify vfio_lookup_match_range/vfio_legacy_dma_map during backporting. + +Suggested-by: Cédric Le Goater +Signed-off-by: Zhenzhong Duan +Reviewed-by: Cédric Le Goater +Reviewed-by: Eric Auger +Tested-by: Eric Auger +Tested-by: Nicolin Chen +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhou Wang +--- + hw/vfio/common.c | 9 +++---- + hw/vfio/container-base.c | 2 +- + hw/vfio/container.c | 34 ++++++++++++++------------- + hw/vfio/iommufd.c | 8 +++---- + include/hw/vfio/vfio-common.h | 14 ++++++----- + include/hw/vfio/vfio-container-base.h | 12 ++++++---- + 6 files changed, 43 insertions(+), 36 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index 0e900c6746..d572ec5880 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -204,7 +204,7 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainerBase *bcontainer) + return true; + } + +-bool vfio_devices_all_device_dirty_tracking(VFIOContainerBase *bcontainer) ++bool vfio_devices_all_device_dirty_tracking(const VFIOContainerBase *bcontainer) + { + VFIODevice *vbasedev; + +@@ -221,7 +221,8 @@ bool vfio_devices_all_device_dirty_tracking(VFIOContainerBase *bcontainer) + * Check if all VFIO devices are running and migration is active, which is + * essentially equivalent to the migration being in pre-copy phase. 
+ */ +-bool vfio_devices_all_running_and_mig_active(VFIOContainerBase *bcontainer) ++bool ++vfio_devices_all_running_and_mig_active(const VFIOContainerBase *bcontainer) + { + VFIODevice *vbasedev; + +@@ -1139,7 +1140,7 @@ static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova, + return 0; + } + +-int vfio_devices_query_dirty_bitmap(VFIOContainerBase *bcontainer, ++int vfio_devices_query_dirty_bitmap(const VFIOContainerBase *bcontainer, + VFIOBitmap *vbmap, hwaddr iova, + hwaddr size) + { +@@ -1162,7 +1163,7 @@ int vfio_devices_query_dirty_bitmap(VFIOContainerBase *bcontainer, + return 0; + } + +-int vfio_get_dirty_bitmap(VFIOContainerBase *bcontainer, uint64_t iova, ++int vfio_get_dirty_bitmap(const VFIOContainerBase *bcontainer, uint64_t iova, + uint64_t size, ram_addr_t ram_addr) + { + bool all_device_dirty_tracking = +diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c +index eee2dcfe76..1ffd25bbfa 100644 +--- a/hw/vfio/container-base.c ++++ b/hw/vfio/container-base.c +@@ -63,7 +63,7 @@ int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer, + return bcontainer->ops->set_dirty_page_tracking(bcontainer, start); + } + +-int vfio_container_query_dirty_bitmap(VFIOContainerBase *bcontainer, ++int vfio_container_query_dirty_bitmap(const VFIOContainerBase *bcontainer, + VFIOBitmap *vbmap, + hwaddr iova, hwaddr size) + { +diff --git a/hw/vfio/container.c b/hw/vfio/container.c +index e32e1b51e0..67aeaa825b 100644 +--- a/hw/vfio/container.c ++++ b/hw/vfio/container.c +@@ -63,11 +63,11 @@ static int vfio_ram_block_discard_disable(VFIOContainer *container, bool state) + } + } + +-static int vfio_dma_unmap_bitmap(VFIOContainer *container, ++static int vfio_dma_unmap_bitmap(const VFIOContainer *container, + hwaddr iova, ram_addr_t size, + IOMMUTLBEntry *iotlb) + { +- VFIOContainerBase *bcontainer = &container->bcontainer; ++ const VFIOContainerBase *bcontainer = &container->bcontainer; + struct vfio_iommu_type1_dma_unmap 
*unmap; + struct vfio_bitmap *bitmap; + VFIOBitmap vbmap; +@@ -116,7 +116,7 @@ unmap_exit: + return ret; + } + +-VFIODMARange *vfio_lookup_match_range(VFIOContainer *container, ++VFIODMARange *vfio_lookup_match_range(const VFIOContainer *container, + hwaddr start_addr, hwaddr size) + { + VFIODMARange *qrange; +@@ -142,11 +142,12 @@ void vfio_dma_range_init_dirty_bitmap(VFIODMARange *qrange) + /* + * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86 + */ +-static int vfio_legacy_dma_unmap(VFIOContainerBase *bcontainer, hwaddr iova, +- ram_addr_t size, IOMMUTLBEntry *iotlb) ++static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer, ++ hwaddr iova, ram_addr_t size, ++ IOMMUTLBEntry *iotlb) + { +- VFIOContainer *container = container_of(bcontainer, VFIOContainer, +- bcontainer); ++ const VFIOContainer *container = container_of(bcontainer, VFIOContainer, ++ bcontainer); + struct vfio_iommu_type1_dma_unmap unmap = { + .argsz = sizeof(unmap), + .flags = 0, +@@ -216,11 +217,11 @@ static int vfio_legacy_dma_unmap(VFIOContainerBase *bcontainer, hwaddr iova, + return 0; + } + +-static int vfio_legacy_dma_map(VFIOContainerBase *bcontainer, hwaddr iova, ++static int vfio_legacy_dma_map(const VFIOContainerBase *bcontainer, hwaddr iova, + ram_addr_t size, void *vaddr, bool readonly) + { + VFIOContainer *container = container_of(bcontainer, VFIOContainer, +- bcontainer); ++ bcontainer); + struct vfio_iommu_type1_dma_map map = { + .argsz = sizeof(map), + .flags = VFIO_DMA_MAP_FLAG_READ, +@@ -257,11 +258,12 @@ static int vfio_legacy_dma_map(VFIOContainerBase *bcontainer, hwaddr iova, + return -errno; + } + +-static int vfio_legacy_set_dirty_page_tracking(VFIOContainerBase *bcontainer, +- bool start) ++static int ++vfio_legacy_set_dirty_page_tracking(const VFIOContainerBase *bcontainer, ++ bool start) + { +- VFIOContainer *container = container_of(bcontainer, VFIOContainer, +- bcontainer); ++ const VFIOContainer *container = container_of(bcontainer, 
VFIOContainer, ++ bcontainer); + int ret; + struct vfio_iommu_type1_dirty_bitmap dirty = { + .argsz = sizeof(dirty), +@@ -283,12 +285,12 @@ static int vfio_legacy_set_dirty_page_tracking(VFIOContainerBase *bcontainer, + return ret; + } + +-static int vfio_legacy_query_dirty_bitmap(VFIOContainerBase *bcontainer, ++static int vfio_legacy_query_dirty_bitmap(const VFIOContainerBase *bcontainer, + VFIOBitmap *vbmap, + hwaddr iova, hwaddr size) + { +- VFIOContainer *container = container_of(bcontainer, VFIOContainer, +- bcontainer); ++ const VFIOContainer *container = container_of(bcontainer, VFIOContainer, ++ bcontainer); + struct vfio_iommu_type1_dirty_bitmap *dbitmap; + struct vfio_iommu_type1_dirty_bitmap_get *range; + int ret; +diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c +index 5accd26484..87a561c545 100644 +--- a/hw/vfio/iommufd.c ++++ b/hw/vfio/iommufd.c +@@ -26,10 +26,10 @@ + #include "qemu/chardev_open.h" + #include "pci.h" + +-static int iommufd_cdev_map(VFIOContainerBase *bcontainer, hwaddr iova, ++static int iommufd_cdev_map(const VFIOContainerBase *bcontainer, hwaddr iova, + ram_addr_t size, void *vaddr, bool readonly) + { +- VFIOIOMMUFDContainer *container = ++ const VFIOIOMMUFDContainer *container = + container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); + + return iommufd_backend_map_dma(container->be, +@@ -37,11 +37,11 @@ static int iommufd_cdev_map(VFIOContainerBase *bcontainer, hwaddr iova, + iova, size, vaddr, readonly); + } + +-static int iommufd_cdev_unmap(VFIOContainerBase *bcontainer, ++static int iommufd_cdev_unmap(const VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + IOMMUTLBEntry *iotlb) + { +- VFIOIOMMUFDContainer *container = ++ const VFIOIOMMUFDContainer *container = + container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); + + /* TODO: Handle dma_unmap_bitmap with iotlb args (migration) */ +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index 5f35f2900b..37f01410d5 100644 +--- 
a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -186,7 +186,7 @@ typedef struct VFIODisplay { + VFIOAddressSpace *vfio_get_address_space(AddressSpace *as); + void vfio_put_address_space(VFIOAddressSpace *space); + +-VFIODMARange *vfio_lookup_match_range(VFIOContainer *container, ++VFIODMARange *vfio_lookup_match_range(const VFIOContainer *container, + hwaddr start_addr, hwaddr size); + void vfio_dma_range_init_dirty_bitmap(VFIODMARange *qrange); + +@@ -258,13 +258,15 @@ bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp); + void vfio_migration_exit(VFIODevice *vbasedev); + + int vfio_bitmap_alloc(VFIOBitmap *vbmap, hwaddr size); +-bool vfio_devices_all_running_and_mig_active(VFIOContainerBase *bcontainer); +-bool vfio_devices_all_device_dirty_tracking(VFIOContainerBase *bcontainer); +-int vfio_devices_query_dirty_bitmap(VFIOContainerBase *bcontainer, ++bool ++vfio_devices_all_running_and_mig_active(const VFIOContainerBase *bcontainer); ++bool ++vfio_devices_all_device_dirty_tracking(const VFIOContainerBase *bcontainer); ++int vfio_devices_query_dirty_bitmap(const VFIOContainerBase *bcontainer, + VFIOBitmap *vbmap, hwaddr iova, + hwaddr size); +-int vfio_get_dirty_bitmap(VFIOContainerBase *bcontainer, uint64_t iova, +- uint64_t size, ram_addr_t ram_addr); ++int vfio_get_dirty_bitmap(const VFIOContainerBase *bcontainer, uint64_t iova, ++ uint64_t size, ram_addr_t ram_addr); + + /* Returns 0 on success, or a negative errno. 
*/ + int vfio_device_get_name(VFIODevice *vbasedev, Error **errp); +diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h +index 45bb19c767..2ae297ccda 100644 +--- a/include/hw/vfio/vfio-container-base.h ++++ b/include/hw/vfio/vfio-container-base.h +@@ -82,7 +82,7 @@ void vfio_container_del_section_window(VFIOContainerBase *bcontainer, + MemoryRegionSection *section); + int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer, + bool start); +-int vfio_container_query_dirty_bitmap(VFIOContainerBase *bcontainer, ++int vfio_container_query_dirty_bitmap(const VFIOContainerBase *bcontainer, + VFIOBitmap *vbmap, + hwaddr iova, hwaddr size); + +@@ -93,18 +93,20 @@ void vfio_container_destroy(VFIOContainerBase *bcontainer); + + struct VFIOIOMMUOps { + /* basic feature */ +- int (*dma_map)(VFIOContainerBase *bcontainer, ++ int (*dma_map)(const VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + void *vaddr, bool readonly); +- int (*dma_unmap)(VFIOContainerBase *bcontainer, ++ int (*dma_unmap)(const VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + IOMMUTLBEntry *iotlb); + int (*attach_device)(const char *name, VFIODevice *vbasedev, + AddressSpace *as, Error **errp); + void (*detach_device)(VFIODevice *vbasedev); + /* migration feature */ +- int (*set_dirty_page_tracking)(VFIOContainerBase *bcontainer, bool start); +- int (*query_dirty_bitmap)(VFIOContainerBase *bcontainer, VFIOBitmap *vbmap, ++ int (*set_dirty_page_tracking)(const VFIOContainerBase *bcontainer, ++ bool start); ++ int (*query_dirty_bitmap)(const VFIOContainerBase *bcontainer, ++ VFIOBitmap *vbmap, + hwaddr iova, hwaddr size); + /* PCI specific */ + int (*pci_hot_reset)(VFIODevice *vbasedev, bool single); +-- +2.41.0.windows.1 + diff --git a/vfio-Synthesize-vPASID-capability-to-VM.patch b/vfio-Synthesize-vPASID-capability-to-VM.patch new file mode 100644 index 
0000000000000000000000000000000000000000..48637f50c8520b4aea8899100413e6f32ed5bb9f --- /dev/null +++ b/vfio-Synthesize-vPASID-capability-to-VM.patch @@ -0,0 +1,114 @@ +From da7cdc41aa3813f6bb1c87ced178f60185dac692 Mon Sep 17 00:00:00 2001 +From: Yi Liu +Date: Thu, 12 Sep 2024 01:38:46 -0700 +Subject: [PATCH] vfio: Synthesize vPASID capability to VM + +If user wants to expose PASID capability in vIOMMU, then VFIO would also +report the PASID cap for this device if the underlying hardware supports +it as well. + +As a start, this chooses to put the vPASID cap in the last 8 bytes of the +vconfig space. This is a choice in the good hope of no conflict with any +existing cap or hidden registers. For the devices that has hidden registers, +user should figure out a proper offset for the vPASID cap. This may require +an option for user to config it. Here we leave it as a future extension. +There are more discussions on the mechanism of finding the proper offset. + +https://lore.kernel.org/kvm/BN9PR11MB5276318969A212AD0649C7BE8CBE2@BN9PR11MB5276.namprd11.prod.outlook.com/ + +Signed-off-by: Yi Liu +--- + hw/pci/pcie.c | 12 ++++++++++++ + hw/vfio/pci.c | 28 ++++++++++++++++++++++++++++ + include/hw/pci/pcie.h | 4 ++++ + 3 files changed, 44 insertions(+) + +diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c +index 04fbd794a8..a5b4e54bd7 100644 +--- a/hw/pci/pcie.c ++++ b/hw/pci/pcie.c +@@ -1123,3 +1123,15 @@ void pcie_acs_reset(PCIDevice *dev) + pci_set_word(dev->config + dev->exp.acs_cap + PCI_ACS_CTRL, 0); + } + } ++ ++void pcie_pasid_init(PCIDevice *dev, uint16_t offset, uint16_t caps) ++{ ++ pcie_add_capability(dev, PCI_EXT_CAP_ID_PASID, 1, ++ offset, PCI_EXT_CAP_PASID_SIZEOF); ++ ++ dev->exp.pasid_cap = offset; ++ ++ pci_set_word(dev->config + offset + PCI_PASID_CAP, caps); ++ ++ pci_set_word(dev->wmask + dev->exp.pasid_cap + PCI_PASID_CTRL, 0x7); ++} +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index f585f285f4..293deb8737 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ 
-21,6 +21,7 @@ + #include "qemu/osdep.h" + #include CONFIG_DEVICES /* CONFIG_IOMMUFD */ + #include ++#include + #include + + #include "hw/hw.h" +@@ -2348,6 +2349,33 @@ static void vfio_add_ext_cap(VFIOPCIDevice *vdev) + + } + ++ { ++ HostIOMMUDeviceCaps *caps = &vdev->vbasedev.hiod->caps; ++ ++ /* ++ * TODO: Add option for enabling pasid at a safe offset, this adds the ++ * pasid capability in the end of the PCIE config space. ++ */ ++ if (caps->max_pasid_log2 && pci_device_get_pasid_cap(&vdev->pdev)) { ++ uint16_t pasid_caps = (caps->max_pasid_log2 << 8) & PCI_PASID_CAP_WIDTH; ++ ++ if (caps->hw_caps & IOMMU_HW_CAP_PCI_PASID_EXEC) { ++ pasid_caps |= PCI_PASID_CAP_EXEC; ++ } ++ ++ if (caps->hw_caps & IOMMU_HW_CAP_PCI_PASID_PRIV) { ++ pasid_caps |= PCI_PASID_CAP_PRIV; ++ } ++ ++ pcie_pasid_init(pdev, ++ PCIE_CONFIG_SPACE_SIZE - PCI_EXT_CAP_PASID_SIZEOF, ++ pasid_caps); ++ ++ /* PASID capability is fully emulated by QEMU */ ++ memset(vdev->emulated_config_bits + pdev->exp.pasid_cap, 0xff, 8); ++ } ++ } ++ + /* Cleanup chain head ID if necessary */ + if (pci_get_word(pdev->config + PCI_CONFIG_SPACE_SIZE) == 0xFFFF) { + pci_set_word(pdev->config + PCI_CONFIG_SPACE_SIZE, 0); +diff --git a/include/hw/pci/pcie.h b/include/hw/pci/pcie.h +index 11f5a91bbb..41ee27f023 100644 +--- a/include/hw/pci/pcie.h ++++ b/include/hw/pci/pcie.h +@@ -79,6 +79,9 @@ struct PCIExpressDevice { + uint16_t sriov_cap; + PCIESriovPF sriov_pf; + PCIESriovVF sriov_vf; ++ ++ /* Offset of PASID capability in config space */ ++ uint16_t pasid_cap; + }; + + #define COMPAT_PROP_PCP "power_controller_present" +@@ -147,4 +150,5 @@ void pcie_cap_slot_unplug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, + Error **errp); + void pcie_cap_slot_unplug_request_cb(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp); ++void pcie_pasid_init(PCIDevice *dev, uint16_t offset, uint16_t caps); + #endif /* QEMU_PCIE_H */ +-- +2.41.0.windows.1 + diff --git 
a/vfio-ap-Allow-the-selection-of-a-given-iommu-backend.patch b/vfio-ap-Allow-the-selection-of-a-given-iommu-backend.patch new file mode 100644 index 0000000000000000000000000000000000000000..140933a793509a0211e3594f284d33af3673b13e --- /dev/null +++ b/vfio-ap-Allow-the-selection-of-a-given-iommu-backend.patch @@ -0,0 +1,67 @@ +From 6b9f02dbde780118d33abb998bc72ed246f50b6a Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Sat, 11 Jan 2025 10:52:50 +0800 +Subject: [PATCH] vfio/ap: Allow the selection of a given iommu backend +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Now we support two types of iommu backends, let's add the capability +to select one of them. This depends on whether an iommufd object has +been linked with the vfio-ap device: + +If the user wants to use the legacy backend, it shall not +link the vfio-ap device with any iommufd object: + + -device vfio-ap,sysfsdev=/sys/bus/mdev/devices/XXX + +This is called the legacy mode/backend. 
+ +If the user wants to use the iommufd backend (/dev/iommu) it +shall pass an iommufd object id in the vfio-ap device options: + + -object iommufd,id=iommufd0 + -device vfio-ap,sysfsdev=/sys/bus/mdev/devices/XXX,iommufd=iommufd0 + +Suggested-by: Alex Williamson +Signed-off-by: Zhenzhong Duan +Reviewed-by: Matthew Rosato +Reviewed-by: Cédric Le Goater +Tested-by: Nicolin Chen +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhou Wang +--- + hw/vfio/ap.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c +index bbf69ff55a..80629609ae 100644 +--- a/hw/vfio/ap.c ++++ b/hw/vfio/ap.c +@@ -11,10 +11,12 @@ + */ + + #include "qemu/osdep.h" ++#include CONFIG_DEVICES /* CONFIG_IOMMUFD */ + #include + #include + #include "qapi/error.h" + #include "hw/vfio/vfio-common.h" ++#include "sysemu/iommufd.h" + #include "hw/s390x/ap-device.h" + #include "qemu/error-report.h" + #include "qemu/event_notifier.h" +@@ -204,6 +206,10 @@ static void vfio_ap_unrealize(DeviceState *dev) + + static Property vfio_ap_properties[] = { + DEFINE_PROP_STRING("sysfsdev", VFIOAPDevice, vdev.sysfsdev), ++#ifdef CONFIG_IOMMUFD ++ DEFINE_PROP_LINK("iommufd", VFIOAPDevice, vdev.iommufd, ++ TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *), ++#endif + DEFINE_PROP_END_OF_LIST(), + }; + +-- +2.41.0.windows.1 + diff --git a/vfio-ap-Don-t-initialize-HOST_IOMMU_DEVICE-with-mdev.patch b/vfio-ap-Don-t-initialize-HOST_IOMMU_DEVICE-with-mdev.patch new file mode 100644 index 0000000000000000000000000000000000000000..f4dcfd1b75526ab733750a4c6acbaab00171bb70 --- /dev/null +++ b/vfio-ap-Don-t-initialize-HOST_IOMMU_DEVICE-with-mdev.patch @@ -0,0 +1,35 @@ +From 44d573b10c45746e81d0d1786fe61d45160f2181 Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Mon, 22 Jul 2024 15:07:12 +0800 +Subject: [PATCH] vfio/ap: Don't initialize HOST_IOMMU_DEVICE with mdev + +mdevs aren't "physical" devices and when asking for backing IOMMU info, +it fails the entire provisioning of the guest. 
Fix that by setting +vbasedev->mdev true so skipping HostIOMMUDevice initialization in the +presence of mdevs. + +Fixes: 930589520128 ("vfio/iommufd: Implement HostIOMMUDeviceClass::realize() handler") +Signed-off-by: Zhenzhong Duan +Reviewed-by: Joao Martins +Reviewed-by: Eric Auger +--- + hw/vfio/ap.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c +index e157aa1ff7..6b2bc32549 100644 +--- a/hw/vfio/ap.c ++++ b/hw/vfio/ap.c +@@ -234,6 +234,9 @@ static void vfio_ap_instance_init(Object *obj) + */ + vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_AP, &vfio_ap_ops, + DEVICE(vapdev), true); ++ ++ /* AP device is mdev type device */ ++ vbasedev->mdev = true; + } + + #ifdef CONFIG_IOMMUFD +-- +2.41.0.windows.1 + diff --git a/vfio-ap-Make-vfio-cdev-pre-openable-by-passing-a-fil.patch b/vfio-ap-Make-vfio-cdev-pre-openable-by-passing-a-fil.patch new file mode 100644 index 0000000000000000000000000000000000000000..f799c6f9ae969a364e40a3c18c97bad9ce10e707 --- /dev/null +++ b/vfio-ap-Make-vfio-cdev-pre-openable-by-passing-a-fil.patch @@ -0,0 +1,78 @@ +From e4e2a6414eabe80d0d9f57446626c91c55b40afa Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Sat, 11 Jan 2025 10:52:51 +0800 +Subject: [PATCH] vfio/ap: Make vfio cdev pre-openable by passing a file handle +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This gives management tools like libvirt a chance to open the vfio +cdev with privilege and pass FD to qemu. This way qemu never needs +to have privilege to open a VFIO or iommu cdev node. 
+ +Signed-off-by: Zhenzhong Duan +Reviewed-by: Matthew Rosato +Reviewed-by: Cédric Le Goater +Tested-by: Nicolin Chen +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhou Wang +--- + hw/vfio/ap.c | 23 ++++++++++++++++++++++- + 1 file changed, 22 insertions(+), 1 deletion(-) + +diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c +index 80629609ae..f180e4a32a 100644 +--- a/hw/vfio/ap.c ++++ b/hw/vfio/ap.c +@@ -160,7 +160,10 @@ static void vfio_ap_realize(DeviceState *dev, Error **errp) + VFIOAPDevice *vapdev = VFIO_AP_DEVICE(dev); + VFIODevice *vbasedev = &vapdev->vdev; + +- vbasedev->name = g_path_get_basename(vbasedev->sysfsdev); ++ if (vfio_device_get_name(vbasedev, errp) < 0) { ++ return; ++ } ++ + vbasedev->ops = &vfio_ap_ops; + vbasedev->type = VFIO_DEVICE_TYPE_AP; + vbasedev->dev = dev; +@@ -230,11 +233,28 @@ static const VMStateDescription vfio_ap_vmstate = { + .unmigratable = 1, + }; + ++static void vfio_ap_instance_init(Object *obj) ++{ ++ VFIOAPDevice *vapdev = VFIO_AP_DEVICE(obj); ++ ++ vapdev->vdev.fd = -1; ++} ++ ++#ifdef CONFIG_IOMMUFD ++static void vfio_ap_set_fd(Object *obj, const char *str, Error **errp) ++{ ++ vfio_device_set_fd(&VFIO_AP_DEVICE(obj)->vdev, str, errp); ++} ++#endif ++ + static void vfio_ap_class_init(ObjectClass *klass, void *data) + { + DeviceClass *dc = DEVICE_CLASS(klass); + + device_class_set_props(dc, vfio_ap_properties); ++#ifdef CONFIG_IOMMUFD ++ object_class_property_add_str(klass, "fd", NULL, vfio_ap_set_fd); ++#endif + dc->vmsd = &vfio_ap_vmstate; + dc->desc = "VFIO-based AP device assignment"; + set_bit(DEVICE_CATEGORY_MISC, dc->categories); +@@ -249,6 +269,7 @@ static const TypeInfo vfio_ap_info = { + .name = TYPE_VFIO_AP_DEVICE, + .parent = TYPE_AP_DEVICE, + .instance_size = sizeof(VFIOAPDevice), ++ .instance_init = vfio_ap_instance_init, + .class_init = vfio_ap_class_init, + }; + +-- +2.41.0.windows.1 + diff --git a/vfio-ap-Move-VFIODevice-initializations-in-vfio_ap_i.patch 
b/vfio-ap-Move-VFIODevice-initializations-in-vfio_ap_i.patch new file mode 100644 index 0000000000000000000000000000000000000000..0ae9791865ad777e509f51bdb390699adc1b04e5 --- /dev/null +++ b/vfio-ap-Move-VFIODevice-initializations-in-vfio_ap_i.patch @@ -0,0 +1,73 @@ +From 69da3907dc07bdb3cab4519922842820388bac4c Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Sat, 11 Jan 2025 10:53:00 +0800 +Subject: [PATCH] vfio/ap: Move VFIODevice initializations in + vfio_ap_instance_init +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Some of the VFIODevice initializations are in vfio_ap_realize, +move all of them into vfio_ap_instance_init. + +No functional change intended. + +Suggested-by: Cédric Le Goater +Signed-off-by: Zhenzhong Duan +Reviewed-by: Philippe Mathieu-Daudé +Reviewed-by: Eric Farman +Tested-by: Nicolin Chen +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhou Wang +--- + hw/vfio/ap.c | 26 +++++++++++++------------- + 1 file changed, 13 insertions(+), 13 deletions(-) + +diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c +index f180e4a32a..95fe7cd98b 100644 +--- a/hw/vfio/ap.c ++++ b/hw/vfio/ap.c +@@ -164,18 +164,6 @@ static void vfio_ap_realize(DeviceState *dev, Error **errp) + return; + } + +- vbasedev->ops = &vfio_ap_ops; +- vbasedev->type = VFIO_DEVICE_TYPE_AP; +- vbasedev->dev = dev; +- +- /* +- * vfio-ap devices operate in a way compatible with discarding of +- * memory in RAM blocks, as no pages are pinned in the host. +- * This needs to be set before vfio_get_device() for vfio common to +- * handle ram_block_discard_disable(). 
+- */ +- vapdev->vdev.ram_block_discard_allowed = true; +- + ret = vfio_attach_device(vbasedev->name, vbasedev, + &address_space_memory, errp); + if (ret) { +@@ -236,8 +224,20 @@ static const VMStateDescription vfio_ap_vmstate = { + static void vfio_ap_instance_init(Object *obj) + { + VFIOAPDevice *vapdev = VFIO_AP_DEVICE(obj); ++ VFIODevice *vbasedev = &vapdev->vdev; + +- vapdev->vdev.fd = -1; ++ vbasedev->type = VFIO_DEVICE_TYPE_AP; ++ vbasedev->ops = &vfio_ap_ops; ++ vbasedev->dev = DEVICE(vapdev); ++ vbasedev->fd = -1; ++ ++ /* ++ * vfio-ap devices operate in a way compatible with discarding of ++ * memory in RAM blocks, as no pages are pinned in the host. ++ * This needs to be set before vfio_get_device() for vfio common to ++ * handle ram_block_discard_disable(). ++ */ ++ vbasedev->ram_block_discard_allowed = true; + } + + #ifdef CONFIG_IOMMUFD +-- +2.41.0.windows.1 + diff --git a/vfio-ccw-Allow-the-selection-of-a-given-iommu-backen.patch b/vfio-ccw-Allow-the-selection-of-a-given-iommu-backen.patch new file mode 100644 index 0000000000000000000000000000000000000000..beb4a137d1581c197ba2aa98960824c897248dc1 --- /dev/null +++ b/vfio-ccw-Allow-the-selection-of-a-given-iommu-backen.patch @@ -0,0 +1,70 @@ +From 5e743a2f7791f4fb3eea40806ca69f6cce1258c2 Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Sat, 11 Jan 2025 10:52:52 +0800 +Subject: [PATCH] vfio/ccw: Allow the selection of a given iommu backend +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Now we support two types of iommu backends, let's add the capability +to select one of them. This depends on whether an iommufd object has +been linked with the vfio-ccw device: + +If the user wants to use the legacy backend, it shall not +link the vfio-ccw device with any iommufd object: + + -device vfio-ccw,sysfsdev=/sys/bus/mdev/devices/XXX + +This is called the legacy mode/backend. 
+ +If the user wants to use the iommufd backend (/dev/iommu) it +shall pass an iommufd object id in the vfio-ccw device options: + + -object iommufd,id=iommufd0 + -device vfio-ccw,sysfsdev=/sys/bus/mdev/devices/XXX,iommufd=iommufd0 + +Suggested-by: Alex Williamson +Signed-off-by: Zhenzhong Duan +Reviewed-by: Matthew Rosato +Reviewed-by: Cédric Le Goater +Reviewed-by: Eric Farman +Tested-by: Nicolin Chen +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhou Wang +--- + hw/vfio/ccw.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c +index d857bb8d0f..d2d58bb677 100644 +--- a/hw/vfio/ccw.c ++++ b/hw/vfio/ccw.c +@@ -15,12 +15,14 @@ + */ + + #include "qemu/osdep.h" ++#include CONFIG_DEVICES /* CONFIG_IOMMUFD */ + #include + #include + #include + + #include "qapi/error.h" + #include "hw/vfio/vfio-common.h" ++#include "sysemu/iommufd.h" + #include "hw/s390x/s390-ccw.h" + #include "hw/s390x/vfio-ccw.h" + #include "hw/qdev-properties.h" +@@ -677,6 +679,10 @@ static void vfio_ccw_unrealize(DeviceState *dev) + static Property vfio_ccw_properties[] = { + DEFINE_PROP_STRING("sysfsdev", VFIOCCWDevice, vdev.sysfsdev), + DEFINE_PROP_BOOL("force-orb-pfch", VFIOCCWDevice, force_orb_pfch, false), ++#ifdef CONFIG_IOMMUFD ++ DEFINE_PROP_LINK("iommufd", VFIOCCWDevice, vdev.iommufd, ++ TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *), ++#endif + DEFINE_PROP_END_OF_LIST(), + }; + +-- +2.41.0.windows.1 + diff --git a/vfio-ccw-Don-t-initialize-HOST_IOMMU_DEVICE-with-mde.patch b/vfio-ccw-Don-t-initialize-HOST_IOMMU_DEVICE-with-mde.patch new file mode 100644 index 0000000000000000000000000000000000000000..ab58961a272e330760c83b6f9a8e231ab361db63 --- /dev/null +++ b/vfio-ccw-Don-t-initialize-HOST_IOMMU_DEVICE-with-mde.patch @@ -0,0 +1,36 @@ +From ffcda8cc141e14528fd73aea750be822575eedcc Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Mon, 22 Jul 2024 15:07:13 +0800 +Subject: [PATCH] vfio/ccw: Don't initialize HOST_IOMMU_DEVICE with mdev + +mdevs aren't 
"physical" devices and when asking for backing IOMMU info, +it fails the entire provisioning of the guest. Fix that by setting +vbasedev->mdev true so skipping HostIOMMUDevice initialization in the +presence of mdevs. + +Fixes: 930589520128 ("vfio/iommufd: Implement HostIOMMUDeviceClass::realize() handler") +Signed-off-by: Zhenzhong Duan +Reviewed-by: Joao Martins +Acked-by: Eric Farman +Reviewed-by: Eric Auger +--- + hw/vfio/ccw.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c +index 90e4a53437..257e9723cf 100644 +--- a/hw/vfio/ccw.c ++++ b/hw/vfio/ccw.c +@@ -683,6 +683,9 @@ static void vfio_ccw_instance_init(Object *obj) + VFIOCCWDevice *vcdev = VFIO_CCW(obj); + VFIODevice *vbasedev = &vcdev->vdev; + ++ /* CCW device is mdev type device */ ++ vbasedev->mdev = true; ++ + /* + * All vfio-ccw devices are believed to operate in a way compatible with + * discarding of memory in RAM blocks, ie. pages pinned in the host are +-- +2.41.0.windows.1 + diff --git a/vfio-ccw-Make-vfio-cdev-pre-openable-by-passing-a-fi.patch b/vfio-ccw-Make-vfio-cdev-pre-openable-by-passing-a-fi.patch new file mode 100644 index 0000000000000000000000000000000000000000..23fd6aab97666125369f42e6b7ef578efd8b20aa --- /dev/null +++ b/vfio-ccw-Make-vfio-cdev-pre-openable-by-passing-a-fi.patch @@ -0,0 +1,85 @@ +From 0f9545907220680ee7e85a823a0e19b216a8b7d9 Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Sat, 11 Jan 2025 10:52:53 +0800 +Subject: [PATCH] vfio/ccw: Make vfio cdev pre-openable by passing a file + handle +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This gives management tools like libvirt a chance to open the vfio +cdev with privilege and pass FD to qemu. This way qemu never needs +to have privilege to open a VFIO or iommu cdev node. 
+ +Signed-off-by: Zhenzhong Duan +Reviewed-by: Matthew Rosato +Reviewed-by: Cédric Le Goater +Reviewed-by: Eric Farman +Tested-by: Nicolin Chen +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhou Wang +--- + hw/vfio/ccw.c | 25 ++++++++++++++++++++++--- + 1 file changed, 22 insertions(+), 3 deletions(-) + +diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c +index d2d58bb677..2afdf17dbe 100644 +--- a/hw/vfio/ccw.c ++++ b/hw/vfio/ccw.c +@@ -590,11 +590,12 @@ static void vfio_ccw_realize(DeviceState *dev, Error **errp) + } + } + ++ if (vfio_device_get_name(vbasedev, errp) < 0) { ++ return; ++ } ++ + vbasedev->ops = &vfio_ccw_ops; + vbasedev->type = VFIO_DEVICE_TYPE_CCW; +- vbasedev->name = g_strdup_printf("%x.%x.%04x", vcdev->cdev.hostid.cssid, +- vcdev->cdev.hostid.ssid, +- vcdev->cdev.hostid.devid); + vbasedev->dev = dev; + + /* +@@ -691,12 +692,29 @@ static const VMStateDescription vfio_ccw_vmstate = { + .unmigratable = 1, + }; + ++static void vfio_ccw_instance_init(Object *obj) ++{ ++ VFIOCCWDevice *vcdev = VFIO_CCW(obj); ++ ++ vcdev->vdev.fd = -1; ++} ++ ++#ifdef CONFIG_IOMMUFD ++static void vfio_ccw_set_fd(Object *obj, const char *str, Error **errp) ++{ ++ vfio_device_set_fd(&VFIO_CCW(obj)->vdev, str, errp); ++} ++#endif ++ + static void vfio_ccw_class_init(ObjectClass *klass, void *data) + { + DeviceClass *dc = DEVICE_CLASS(klass); + S390CCWDeviceClass *cdc = S390_CCW_DEVICE_CLASS(klass); + + device_class_set_props(dc, vfio_ccw_properties); ++#ifdef CONFIG_IOMMUFD ++ object_class_property_add_str(klass, "fd", NULL, vfio_ccw_set_fd); ++#endif + dc->vmsd = &vfio_ccw_vmstate; + dc->desc = "VFIO-based subchannel assignment"; + set_bit(DEVICE_CATEGORY_MISC, dc->categories); +@@ -714,6 +732,7 @@ static const TypeInfo vfio_ccw_info = { + .name = TYPE_VFIO_CCW, + .parent = TYPE_S390_CCW, + .instance_size = sizeof(VFIOCCWDevice), ++ .instance_init = vfio_ccw_instance_init, + .class_init = vfio_ccw_class_init, + }; + +-- +2.41.0.windows.1 + diff --git 
a/vfio-ccw-Move-VFIODevice-initializations-in-vfio_ccw.patch b/vfio-ccw-Move-VFIODevice-initializations-in-vfio_ccw.patch new file mode 100644 index 0000000000000000000000000000000000000000..2630a2fcaafb0f5cf9dc81dde900e054c9a6d405 --- /dev/null +++ b/vfio-ccw-Move-VFIODevice-initializations-in-vfio_ccw.patch @@ -0,0 +1,77 @@ +From 4d12d39e824a35014f753a25e5aa8ec0e275a38c Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Sat, 11 Jan 2025 10:53:01 +0800 +Subject: [PATCH] vfio/ccw: Move VFIODevice initializations in + vfio_ccw_instance_init +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Some of the VFIODevice initializations is in vfio_ccw_realize, +move all of them in vfio_ccw_instance_init. + +No functional change intended. + +Suggested-by: Cédric Le Goater +Signed-off-by: Zhenzhong Duan +Reviewed-by: Philippe Mathieu-Daudé +Reviewed-by: Eric Farman +Tested-by: Nicolin Chen +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhou Wang +--- + hw/vfio/ccw.c | 30 +++++++++++++++--------------- + 1 file changed, 15 insertions(+), 15 deletions(-) + +diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c +index 2afdf17dbe..6305a4c1b8 100644 +--- a/hw/vfio/ccw.c ++++ b/hw/vfio/ccw.c +@@ -594,20 +594,6 @@ static void vfio_ccw_realize(DeviceState *dev, Error **errp) + return; + } + +- vbasedev->ops = &vfio_ccw_ops; +- vbasedev->type = VFIO_DEVICE_TYPE_CCW; +- vbasedev->dev = dev; +- +- /* +- * All vfio-ccw devices are believed to operate in a way compatible with +- * discarding of memory in RAM blocks, ie. pages pinned in the host are +- * in the current working set of the guest driver and therefore never +- * overlap e.g., with pages available to the guest balloon driver. This +- * needs to be set before vfio_get_device() for vfio common to handle +- * ram_block_discard_disable(). 
+- */ +- vbasedev->ram_block_discard_allowed = true; +- + ret = vfio_attach_device(cdev->mdevid, vbasedev, + &address_space_memory, errp); + if (ret) { +@@ -695,8 +681,22 @@ static const VMStateDescription vfio_ccw_vmstate = { + static void vfio_ccw_instance_init(Object *obj) + { + VFIOCCWDevice *vcdev = VFIO_CCW(obj); ++ VFIODevice *vbasedev = &vcdev->vdev; ++ ++ vbasedev->type = VFIO_DEVICE_TYPE_CCW; ++ vbasedev->ops = &vfio_ccw_ops; ++ vbasedev->dev = DEVICE(vcdev); ++ vbasedev->fd = -1; + +- vcdev->vdev.fd = -1; ++ /* ++ * All vfio-ccw devices are believed to operate in a way compatible with ++ * discarding of memory in RAM blocks, ie. pages pinned in the host are ++ * in the current working set of the guest driver and therefore never ++ * overlap e.g., with pages available to the guest balloon driver. This ++ * needs to be set before vfio_get_device() for vfio common to handle ++ * ram_block_discard_disable(). ++ */ ++ vbasedev->ram_block_discard_allowed = true; + } + + #ifdef CONFIG_IOMMUFD +-- +2.41.0.windows.1 + diff --git a/vfio-common-Allow-disabling-device-dirty-page-tracki.patch b/vfio-common-Allow-disabling-device-dirty-page-tracki.patch new file mode 100644 index 0000000000000000000000000000000000000000..f019d6e34973bf183ecbc160455d77899a8443d6 --- /dev/null +++ b/vfio-common-Allow-disabling-device-dirty-page-tracki.patch @@ -0,0 +1,81 @@ +From b0fe5a6794c5403f4ab9859ec2ced338246690bd Mon Sep 17 00:00:00 2001 +From: Joao Martins +Date: Mon, 22 Jul 2024 22:13:26 +0100 +Subject: [PATCH] vfio/common: Allow disabling device dirty page tracking + +The property 'x-pre-copy-dirty-page-tracking' allows disabling the whole +tracking of VF pre-copy phase of dirty page tracking, though it means +that it will only be used at the start of the switchover phase. + +Add an option that disables the VF dirty page tracking, and fall +back into container-based dirty page tracking. 
This also allows to +use IOMMU dirty tracking even on VFs with their own dirty +tracker scheme. + +Signed-off-by: Joao Martins +Reviewed-by: Zhenzhong Duan +--- + hw/vfio/common.c | 3 +++ + hw/vfio/migration.c | 4 +++- + hw/vfio/pci.c | 3 +++ + include/hw/vfio/vfio-common.h | 1 + + 4 files changed, 10 insertions(+), 1 deletion(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index 65e1c9f810..a8bc1c6055 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -208,6 +208,9 @@ bool vfio_devices_all_device_dirty_tracking(const VFIOContainerBase *bcontainer) + VFIODevice *vbasedev; + + QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { ++ if (vbasedev->device_dirty_page_tracking == ON_OFF_AUTO_OFF) { ++ return false; ++ } + if (!vbasedev->dirty_pages_supported) { + return false; + } +diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c +index db128204af..3924beb289 100644 +--- a/hw/vfio/migration.c ++++ b/hw/vfio/migration.c +@@ -945,7 +945,9 @@ bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp) + return !vfio_block_migration(vbasedev, err, errp); + } + +- if (!vbasedev->dirty_pages_supported && !vbasedev->iommu_dirty_tracking) { ++ if ((!vbasedev->dirty_pages_supported || ++ vbasedev->device_dirty_page_tracking == ON_OFF_AUTO_OFF) && ++ !vbasedev->iommu_dirty_tracking) { + if (vbasedev->enable_migration == ON_OFF_AUTO_AUTO) { + error_setg(&err, + "%s: VFIO device doesn't support device and " +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index 19211f4368..f585f285f4 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -3350,6 +3350,9 @@ static Property vfio_pci_dev_properties[] = { + DEFINE_PROP_ON_OFF_AUTO("x-pre-copy-dirty-page-tracking", VFIOPCIDevice, + vbasedev.pre_copy_dirty_page_tracking, + ON_OFF_AUTO_ON), ++ DEFINE_PROP_ON_OFF_AUTO("x-device-dirty-page-tracking", VFIOPCIDevice, ++ vbasedev.device_dirty_page_tracking, ++ ON_OFF_AUTO_ON), + DEFINE_PROP_ON_OFF_AUTO("display", VFIOPCIDevice, + display, ON_OFF_AUTO_OFF), + 
DEFINE_PROP_UINT32("xres", VFIOPCIDevice, display_xres, 0), +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index 22a7386591..abae8655c4 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -147,6 +147,7 @@ typedef struct VFIODevice { + VFIOMigration *migration; + Error *migration_blocker; + OnOffAuto pre_copy_dirty_page_tracking; ++ OnOffAuto device_dirty_page_tracking; + bool dirty_pages_supported; + bool dirty_tracking; + bool iommu_dirty_tracking; +-- +2.41.0.windows.1 + diff --git a/vfio-common-Introduce-vfio_container_init-destroy-he.patch b/vfio-common-Introduce-vfio_container_init-destroy-he.patch new file mode 100644 index 0000000000000000000000000000000000000000..6ae516a32a649db7647d3feadc2833dfd3a1fd1b --- /dev/null +++ b/vfio-common-Introduce-vfio_container_init-destroy-he.patch @@ -0,0 +1,89 @@ +From ff4e67fa5ceb31f1dc686a661cbf37c1a81cd644 Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Sat, 11 Jan 2025 10:52:21 +0800 +Subject: [PATCH] vfio/common: Introduce vfio_container_init/destroy helper +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This adds two helper functions vfio_container_init/destroy which will be +used by both legacy and iommufd containers to do base container specific +initialization and release. + +No functional change intended. 
+ +Suggested-by: Cédric Le Goater +Signed-off-by: Zhenzhong Duan +Reviewed-by: Cédric Le Goater +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhou Wang +--- + hw/vfio/container-base.c | 9 +++++++++ + hw/vfio/container.c | 4 +++- + include/hw/vfio/vfio-container-base.h | 4 ++++ + 3 files changed, 16 insertions(+), 1 deletion(-) + +diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c +index 55d3a35fa4..e929435751 100644 +--- a/hw/vfio/container-base.c ++++ b/hw/vfio/container-base.c +@@ -30,3 +30,12 @@ int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, + g_assert(bcontainer->ops->dma_unmap); + return bcontainer->ops->dma_unmap(bcontainer, iova, size, iotlb); + } ++ ++void vfio_container_init(VFIOContainerBase *bcontainer, const VFIOIOMMUOps *ops) ++{ ++ bcontainer->ops = ops; ++} ++ ++void vfio_container_destroy(VFIOContainerBase *bcontainer) ++{ ++} +diff --git a/hw/vfio/container.c b/hw/vfio/container.c +index 40e378e888..5a8c55056b 100644 +--- a/hw/vfio/container.c ++++ b/hw/vfio/container.c +@@ -653,7 +653,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, + QLIST_INIT(&container->vrdl_list); + QLIST_INIT(&container->dma_list); + bcontainer = &container->bcontainer; +- bcontainer->ops = &vfio_legacy_ops; ++ vfio_container_init(bcontainer, &vfio_legacy_ops); + + ret = vfio_init_container(container, group->fd, errp); + if (ret) { +@@ -765,6 +765,7 @@ put_space_exit: + static void vfio_disconnect_container(VFIOGroup *group) + { + VFIOContainer *container = group->container; ++ VFIOContainerBase *bcontainer = &container->bcontainer; + + QLIST_REMOVE(group, container_next); + group->container = NULL; +@@ -803,6 +804,7 @@ static void vfio_disconnect_container(VFIOGroup *group) + QLIST_REMOVE(giommu, giommu_next); + g_free(giommu); + } ++ vfio_container_destroy(bcontainer); + + trace_vfio_disconnect_container(container->fd); + close(container->fd); +diff --git a/include/hw/vfio/vfio-container-base.h 
b/include/hw/vfio/vfio-container-base.h +index 56b033f59f..577f52ccbc 100644 +--- a/include/hw/vfio/vfio-container-base.h ++++ b/include/hw/vfio/vfio-container-base.h +@@ -38,6 +38,10 @@ int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + IOMMUTLBEntry *iotlb); + ++void vfio_container_init(VFIOContainerBase *bcontainer, ++ const VFIOIOMMUOps *ops); ++void vfio_container_destroy(VFIOContainerBase *bcontainer); ++ + struct VFIOIOMMUOps { + /* basic feature */ + int (*dma_map)(VFIOContainerBase *bcontainer, +-- +2.41.0.windows.1 + diff --git a/vfio-common-Move-giommu_list-in-base-container.patch b/vfio-common-Move-giommu_list-in-base-container.patch new file mode 100644 index 0000000000000000000000000000000000000000..a517cab68a98f0b5e4a1714ee2997a9115c494e6 --- /dev/null +++ b/vfio-common-Move-giommu_list-in-base-container.patch @@ -0,0 +1,213 @@ +From 350f1a4d221849cc26a6d3950c128f951648c391 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Sat, 11 Jan 2025 10:52:22 +0800 +Subject: [PATCH] vfio/common: Move giommu_list in base container +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Move the giommu_list field in the base container and store +the base container in the VFIOGuestIOMMU. + +No functional change intended. 
+ +Signed-off-by: Eric Auger +Signed-off-by: Yi Liu +Signed-off-by: Yi Sun +Signed-off-by: Zhenzhong Duan +Reviewed-by: Cédric Le Goater +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhou Wang +--- + hw/vfio/common.c | 17 +++++++++++------ + hw/vfio/container-base.c | 9 +++++++++ + hw/vfio/container.c | 8 -------- + include/hw/vfio/vfio-common.h | 9 --------- + include/hw/vfio/vfio-container-base.h | 9 +++++++++ + 5 files changed, 29 insertions(+), 23 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index ea63271167..b8007b22c3 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -292,7 +292,7 @@ static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, + static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) + { + VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); +- VFIOContainerBase *bcontainer = &giommu->container->bcontainer; ++ VFIOContainerBase *bcontainer = giommu->bcontainer; + hwaddr iova = iotlb->iova + giommu->iommu_offset; + void *vaddr; + int ret; +@@ -569,6 +569,7 @@ static void vfio_listener_region_add(MemoryListener *listener, + MemoryRegionSection *section) + { + VFIOContainer *container = container_of(listener, VFIOContainer, listener); ++ VFIOContainerBase *bcontainer = &container->bcontainer; + hwaddr iova, end; + Int128 llend, llsize; + void *vaddr; +@@ -612,7 +613,7 @@ static void vfio_listener_region_add(MemoryListener *listener, + giommu->iommu_mr = iommu_mr; + giommu->iommu_offset = section->offset_within_address_space - + section->offset_within_region; +- giommu->container = container; ++ giommu->bcontainer = bcontainer; + llend = int128_add(int128_make64(section->offset_within_region), + section->size); + llend = int128_sub(llend, int128_one()); +@@ -647,7 +648,7 @@ static void vfio_listener_region_add(MemoryListener *listener, + g_free(giommu); + goto fail; + } +- QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next); ++ QLIST_INSERT_HEAD(&bcontainer->giommu_list, 
giommu, giommu_next); + memory_region_iommu_replay(giommu->iommu_mr, &giommu->n); + + return; +@@ -732,6 +733,7 @@ static void vfio_listener_region_del(MemoryListener *listener, + MemoryRegionSection *section) + { + VFIOContainer *container = container_of(listener, VFIOContainer, listener); ++ VFIOContainerBase *bcontainer = &container->bcontainer; + hwaddr iova, end; + Int128 llend, llsize; + int ret; +@@ -744,7 +746,7 @@ static void vfio_listener_region_del(MemoryListener *listener, + if (memory_region_is_iommu(section->mr)) { + VFIOGuestIOMMU *giommu; + +- QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) { ++ QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) { + if (MEMORY_REGION(giommu->iommu_mr) == section->mr && + giommu->n.start == section->offset_within_region) { + memory_region_unregister_iommu_notifier(section->mr, +@@ -1211,7 +1213,9 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) + vfio_giommu_dirty_notifier *gdn = container_of(n, + vfio_giommu_dirty_notifier, n); + VFIOGuestIOMMU *giommu = gdn->giommu; +- VFIOContainer *container = giommu->container; ++ VFIOContainerBase *bcontainer = giommu->bcontainer; ++ VFIOContainer *container = container_of(bcontainer, VFIOContainer, ++ bcontainer); + hwaddr iova = iotlb->iova + giommu->iommu_offset; + ram_addr_t translated_addr; + int ret = -EINVAL; +@@ -1289,12 +1293,13 @@ static int vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainer *container, + static int vfio_sync_dirty_bitmap(VFIOContainer *container, + MemoryRegionSection *section) + { ++ VFIOContainerBase *bcontainer = &container->bcontainer; + ram_addr_t ram_addr; + + if (memory_region_is_iommu(section->mr)) { + VFIOGuestIOMMU *giommu; + +- QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) { ++ QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) { + if (MEMORY_REGION(giommu->iommu_mr) == section->mr && + giommu->n.start == section->offset_within_region) { + Int128 llend; 
+diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c +index e929435751..20bcb9669a 100644 +--- a/hw/vfio/container-base.c ++++ b/hw/vfio/container-base.c +@@ -34,8 +34,17 @@ int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, + void vfio_container_init(VFIOContainerBase *bcontainer, const VFIOIOMMUOps *ops) + { + bcontainer->ops = ops; ++ QLIST_INIT(&bcontainer->giommu_list); + } + + void vfio_container_destroy(VFIOContainerBase *bcontainer) + { ++ VFIOGuestIOMMU *giommu, *tmp; ++ ++ QLIST_FOREACH_SAFE(giommu, &bcontainer->giommu_list, giommu_next, tmp) { ++ memory_region_unregister_iommu_notifier( ++ MEMORY_REGION(giommu->iommu_mr), &giommu->n); ++ QLIST_REMOVE(giommu, giommu_next); ++ g_free(giommu); ++ } + } +diff --git a/hw/vfio/container.c b/hw/vfio/container.c +index 5a8c55056b..03791601d0 100644 +--- a/hw/vfio/container.c ++++ b/hw/vfio/container.c +@@ -649,7 +649,6 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, + container->dirty_pages_supported = false; + container->dma_max_mappings = 0; + container->iova_ranges = NULL; +- QLIST_INIT(&container->giommu_list); + QLIST_INIT(&container->vrdl_list); + QLIST_INIT(&container->dma_list); + bcontainer = &container->bcontainer; +@@ -794,16 +793,9 @@ static void vfio_disconnect_container(VFIOGroup *group) + + if (QLIST_EMPTY(&container->group_list)) { + VFIOAddressSpace *space = container->space; +- VFIOGuestIOMMU *giommu, *tmp; + + QLIST_REMOVE(container, next); + +- QLIST_FOREACH_SAFE(giommu, &container->giommu_list, giommu_next, tmp) { +- memory_region_unregister_iommu_notifier( +- MEMORY_REGION(giommu->iommu_mr), &giommu->n); +- QLIST_REMOVE(giommu, giommu_next); +- g_free(giommu); +- } + vfio_container_destroy(bcontainer); + + trace_vfio_disconnect_container(container->fd); +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index f94baf72db..6f02952ff6 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ 
-104,7 +104,6 @@ typedef struct VFIOContainer { + uint64_t max_dirty_bitmap_size; + unsigned long pgsizes; + unsigned int dma_max_mappings; +- QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; + QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; + QLIST_HEAD(, VFIOGroup) group_list; + QLIST_HEAD(, VFIORamDiscardListener) vrdl_list; +@@ -114,14 +113,6 @@ typedef struct VFIOContainer { + GList *iova_ranges; + } VFIOContainer; + +-typedef struct VFIOGuestIOMMU { +- VFIOContainer *container; +- IOMMUMemoryRegion *iommu_mr; +- hwaddr iommu_offset; +- IOMMUNotifier n; +- QLIST_ENTRY(VFIOGuestIOMMU) giommu_next; +-} VFIOGuestIOMMU; +- + typedef struct VFIORamDiscardListener { + VFIOContainer *container; + MemoryRegion *mr; +diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h +index 577f52ccbc..a11aec5755 100644 +--- a/include/hw/vfio/vfio-container-base.h ++++ b/include/hw/vfio/vfio-container-base.h +@@ -29,8 +29,17 @@ typedef struct { + */ + typedef struct VFIOContainerBase { + const VFIOIOMMUOps *ops; ++ QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; + } VFIOContainerBase; + ++typedef struct VFIOGuestIOMMU { ++ VFIOContainerBase *bcontainer; ++ IOMMUMemoryRegion *iommu_mr; ++ hwaddr iommu_offset; ++ IOMMUNotifier n; ++ QLIST_ENTRY(VFIOGuestIOMMU) giommu_next; ++} VFIOGuestIOMMU; ++ + int vfio_container_dma_map(VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + void *vaddr, bool readonly); +-- +2.41.0.windows.1 + diff --git a/vfio-common-return-early-if-space-isn-t-empty.patch b/vfio-common-return-early-if-space-isn-t-empty.patch new file mode 100644 index 0000000000000000000000000000000000000000..e3b3009352f8230930a4ece1088b83332c356571 --- /dev/null +++ b/vfio-common-return-early-if-space-isn-t-empty.patch @@ -0,0 +1,47 @@ +From bf4c408cd5d3daadbfd11136655e5bcb40dcbba0 Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Sat, 11 Jan 2025 10:52:39 +0800 +Subject: [PATCH] vfio/common: return early if space isn't empty +MIME-Version: 
1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This is a trivial optimization. If there is active container in space, +vfio_reset_handler will never be unregistered. So revert the check of +space->containers and return early. + +Signed-off-by: Zhenzhong Duan +Reviewed-by: Cédric Le Goater +Reviewed-by: Eric Auger +Tested-by: Eric Auger +Tested-by: Nicolin Chen +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhou Wang +--- + hw/vfio/common.c | 9 ++++++--- + 1 file changed, 6 insertions(+), 3 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index 679fee4321..f6c2029aec 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -1608,10 +1608,13 @@ VFIOAddressSpace *vfio_get_address_space(AddressSpace *as) + + void vfio_put_address_space(VFIOAddressSpace *space) + { +- if (QLIST_EMPTY(&space->containers)) { +- QLIST_REMOVE(space, list); +- g_free(space); ++ if (!QLIST_EMPTY(&space->containers)) { ++ return; + } ++ ++ QLIST_REMOVE(space, list); ++ g_free(space); ++ + if (QLIST_EMPTY(&vfio_address_spaces)) { + qemu_unregister_reset(vfio_reset_handler, NULL); + } +-- +2.41.0.windows.1 + diff --git a/vfio-container-Convert-functions-to-base-container.patch b/vfio-container-Convert-functions-to-base-container.patch new file mode 100644 index 0000000000000000000000000000000000000000..1aeb934d7ee566085d45f819d7ca13d1d9656cbe --- /dev/null +++ b/vfio-container-Convert-functions-to-base-container.patch @@ -0,0 +1,263 @@ +From 718cfbf181541fa4142aba10d5aee839e06b4d66 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Sat, 11 Jan 2025 10:52:26 +0800 +Subject: [PATCH] vfio/container: Convert functions to base container +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +In the prospect to get rid of VFIOContainer refs +in common.c lets convert misc functions to use the base +container object instead: + +vfio_devices_all_dirty_tracking +vfio_devices_all_device_dirty_tracking 
+vfio_devices_all_running_and_mig_active +vfio_devices_query_dirty_bitmap +vfio_get_dirty_bitmap + +Modify vfio_get_dirty_bitmap/vfio_listener_log_clear during backporting. + +Signed-off-by: Eric Auger +Signed-off-by: Zhenzhong Duan +Reviewed-by: Cédric Le Goater +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhou Wang +--- + hw/vfio/common.c | 46 ++++++++++++++++------------------- + hw/vfio/container.c | 6 ++--- + hw/vfio/trace-events | 2 +- + include/hw/vfio/vfio-common.h | 9 +++---- + 4 files changed, 29 insertions(+), 34 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index b952d1c811..b663d0bcc0 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -177,9 +177,8 @@ bool vfio_device_state_is_precopy(VFIODevice *vbasedev) + migration->device_state == VFIO_DEVICE_STATE_PRE_COPY_P2P; + } + +-static bool vfio_devices_all_dirty_tracking(VFIOContainer *container) ++static bool vfio_devices_all_dirty_tracking(VFIOContainerBase *bcontainer) + { +- VFIOContainerBase *bcontainer = &container->bcontainer; + VFIODevice *vbasedev; + MigrationState *ms = migrate_get_current(); + +@@ -204,9 +203,8 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainer *container) + return true; + } + +-bool vfio_devices_all_device_dirty_tracking(VFIOContainer *container) ++bool vfio_devices_all_device_dirty_tracking(VFIOContainerBase *bcontainer) + { +- VFIOContainerBase *bcontainer = &container->bcontainer; + VFIODevice *vbasedev; + + QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { +@@ -222,9 +220,8 @@ bool vfio_devices_all_device_dirty_tracking(VFIOContainer *container) + * Check if all VFIO devices are running and migration is active, which is + * essentially equivalent to the migration being in pre-copy phase. 
+ */ +-bool vfio_devices_all_running_and_mig_active(VFIOContainer *container) ++bool vfio_devices_all_running_and_mig_active(VFIOContainerBase *bcontainer) + { +- VFIOContainerBase *bcontainer = &container->bcontainer; + VFIODevice *vbasedev; + + if (!migration_is_active(migrate_get_current())) { +@@ -1082,7 +1079,7 @@ static void vfio_listener_log_global_start(MemoryListener *listener) + VFIOContainer *container = container_of(listener, VFIOContainer, listener); + int ret; + +- if (vfio_devices_all_device_dirty_tracking(container)) { ++ if (vfio_devices_all_device_dirty_tracking(&container->bcontainer)) { + ret = vfio_devices_dma_logging_start(container); + } else { + ret = vfio_container_set_dirty_page_tracking(&container->bcontainer, +@@ -1101,7 +1098,7 @@ static void vfio_listener_log_global_stop(MemoryListener *listener) + VFIOContainer *container = container_of(listener, VFIOContainer, listener); + int ret = 0; + +- if (vfio_devices_all_device_dirty_tracking(container)) { ++ if (vfio_devices_all_device_dirty_tracking(&container->bcontainer)) { + vfio_devices_dma_logging_stop(container); + } else { + ret = vfio_container_set_dirty_page_tracking(&container->bcontainer, +@@ -1141,11 +1138,10 @@ static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova, + return 0; + } + +-int vfio_devices_query_dirty_bitmap(VFIOContainer *container, ++int vfio_devices_query_dirty_bitmap(VFIOContainerBase *bcontainer, + VFIOBitmap *vbmap, hwaddr iova, + hwaddr size) + { +- VFIOContainerBase *bcontainer = &container->bcontainer; + VFIODevice *vbasedev; + int ret; + +@@ -1165,18 +1161,19 @@ int vfio_devices_query_dirty_bitmap(VFIOContainer *container, + return 0; + } + +-int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, ++int vfio_get_dirty_bitmap(VFIOContainerBase *bcontainer, uint64_t iova, + uint64_t size, ram_addr_t ram_addr) + { + bool all_device_dirty_tracking = +- vfio_devices_all_device_dirty_tracking(container); ++ 
vfio_devices_all_device_dirty_tracking(bcontainer); ++ VFIOContainer *container = container_of(bcontainer, VFIOContainer, ++ bcontainer); + uint64_t dirty_pages; + VFIOBitmap vbmap; + VFIODMARange *qrange; + int ret; + +- if (!container->bcontainer.dirty_pages_supported && +- !all_device_dirty_tracking) { ++ if (!bcontainer->dirty_pages_supported && !all_device_dirty_tracking) { + cpu_physical_memory_set_dirty_range(ram_addr, size, + tcg_enabled() ? DIRTY_CLIENTS_ALL : + DIRTY_CLIENTS_NOCODE); +@@ -1195,10 +1192,9 @@ int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, + vbmap.bitmap = qrange->bitmap; + + if (all_device_dirty_tracking) { +- ret = vfio_devices_query_dirty_bitmap(container, &vbmap, iova, size); ++ ret = vfio_devices_query_dirty_bitmap(bcontainer, &vbmap, iova, size); + } else { +- ret = vfio_container_query_dirty_bitmap(&container->bcontainer, &vbmap, +- iova, size); ++ ret = vfio_container_query_dirty_bitmap(bcontainer, &vbmap, iova, size); + } + + if (ret) { +@@ -1208,8 +1204,7 @@ int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, + dirty_pages = cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap, ram_addr, + vbmap.pages); + +- trace_vfio_get_dirty_bitmap(container->fd, iova, size, vbmap.size, +- ram_addr, dirty_pages); ++ trace_vfio_get_dirty_bitmap(iova, size, vbmap.size, ram_addr, dirty_pages); + out: + return ret; + } +@@ -1241,8 +1236,8 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) + + rcu_read_lock(); + if (vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL)) { +- ret = vfio_get_dirty_bitmap(container, iova, iotlb->addr_mask + 1, +- translated_addr); ++ ret = vfio_get_dirty_bitmap(&container->bcontainer, iova, ++ iotlb->addr_mask + 1, translated_addr); + if (ret) { + error_report("vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", " + "0x%"HWADDR_PRIx") = %d (%s)", +@@ -1271,7 +1266,8 @@ static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section, + * Sync 
the whole mapped region (spanning multiple individual mappings) + * in one go. + */ +- return vfio_get_dirty_bitmap(vrdl->container, iova, size, ram_addr); ++ return vfio_get_dirty_bitmap(&vrdl->container->bcontainer, iova, size, ++ ram_addr); + } + + static int vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainer *container, +@@ -1340,7 +1336,7 @@ static int vfio_sync_dirty_bitmap(VFIOContainer *container, + ram_addr = memory_region_get_ram_addr(section->mr) + + section->offset_within_region; + +- return vfio_get_dirty_bitmap(container, ++ return vfio_get_dirty_bitmap(&container->bcontainer, + REAL_HOST_PAGE_ALIGN(section->offset_within_address_space), + int128_get64(section->size), ram_addr); + } +@@ -1355,7 +1351,7 @@ static void vfio_listener_log_sync(MemoryListener *listener, + return; + } + +- if (vfio_devices_all_dirty_tracking(container)) { ++ if (vfio_devices_all_dirty_tracking(&container->bcontainer)) { + ret = vfio_sync_dirty_bitmap(container, section); + if (ret) { + error_report("vfio: Failed to sync dirty bitmap, err: %d (%s)", ret, +@@ -1495,7 +1491,7 @@ static void vfio_listener_log_clear(MemoryListener *listener, + return; + } + +- if (vfio_devices_all_dirty_tracking(container)) { ++ if (vfio_devices_all_dirty_tracking(&container->bcontainer)) { + vfio_physical_log_clear(container, section); + } + } +diff --git a/hw/vfio/container.c b/hw/vfio/container.c +index 74d236ddee..9a542368ab 100644 +--- a/hw/vfio/container.c ++++ b/hw/vfio/container.c +@@ -155,8 +155,8 @@ static int vfio_legacy_dma_unmap(VFIOContainerBase *bcontainer, hwaddr iova, + bool need_dirty_sync = false; + int ret; + +- if (iotlb && vfio_devices_all_running_and_mig_active(container)) { +- if (!vfio_devices_all_device_dirty_tracking(container) && ++ if (iotlb && vfio_devices_all_running_and_mig_active(bcontainer)) { ++ if (!vfio_devices_all_device_dirty_tracking(bcontainer) && + container->bcontainer.dirty_pages_supported) { + return vfio_dma_unmap_bitmap(container, iova, size, 
iotlb); + } +@@ -204,7 +204,7 @@ static int vfio_legacy_dma_unmap(VFIOContainerBase *bcontainer, hwaddr iova, + } + + if (need_dirty_sync) { +- ret = vfio_get_dirty_bitmap(container, iova, size, ++ ret = vfio_get_dirty_bitmap(bcontainer, iova, size, + iotlb->translated_addr); + if (ret) { + return ret; +diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events +index 9f7fedee98..08a1f9dfa4 100644 +--- a/hw/vfio/trace-events ++++ b/hw/vfio/trace-events +@@ -117,7 +117,7 @@ vfio_region_sparse_mmap_header(const char *name, int index, int nr_areas) "Devic + vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned long end) "sparse entry %d [0x%lx - 0x%lx]" + vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%08x" + vfio_legacy_dma_unmap_overflow_workaround(void) "" +-vfio_get_dirty_bitmap(int fd, uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start, uint64_t dirty_pages) "container fd=%d, iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64" dirty_pages=%"PRIu64 ++vfio_get_dirty_bitmap(uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start, uint64_t dirty_pages) "iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64" dirty_pages=%"PRIu64 + vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64 + + # platform.c +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index e27854228c..0295ede7ba 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -196,7 +196,6 @@ typedef struct VFIODisplay { + + VFIOAddressSpace *vfio_get_address_space(AddressSpace *as); + void vfio_put_address_space(VFIOAddressSpace *space); +-bool vfio_devices_all_running_and_saving(VFIOContainer *container); + + VFIODMARange *vfio_lookup_match_range(VFIOContainer *container, + hwaddr start_addr, hwaddr size); +@@ -274,11 +273,11 @@ bool 
vfio_migration_realize(VFIODevice *vbasedev, Error **errp); + void vfio_migration_exit(VFIODevice *vbasedev); + + int vfio_bitmap_alloc(VFIOBitmap *vbmap, hwaddr size); +-bool vfio_devices_all_running_and_mig_active(VFIOContainer *container); +-bool vfio_devices_all_device_dirty_tracking(VFIOContainer *container); +-int vfio_devices_query_dirty_bitmap(VFIOContainer *container, ++bool vfio_devices_all_running_and_mig_active(VFIOContainerBase *bcontainer); ++bool vfio_devices_all_device_dirty_tracking(VFIOContainerBase *bcontainer); ++int vfio_devices_query_dirty_bitmap(VFIOContainerBase *bcontainer, + VFIOBitmap *vbmap, hwaddr iova, + hwaddr size); +-int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, ++int vfio_get_dirty_bitmap(VFIOContainerBase *bcontainer, uint64_t iova, + uint64_t size, ram_addr_t ram_addr); + #endif /* HW_VFIO_VFIO_COMMON_H */ +-- +2.41.0.windows.1 + diff --git a/vfio-container-Implement-HostIOMMUDeviceClass-get_ca.patch b/vfio-container-Implement-HostIOMMUDeviceClass-get_ca.patch new file mode 100644 index 0000000000000000000000000000000000000000..a6ec9243db7ea417cbea8a444de6c655698d63a1 --- /dev/null +++ b/vfio-container-Implement-HostIOMMUDeviceClass-get_ca.patch @@ -0,0 +1,51 @@ +From b6830d3caff821b2472e369042c169935c906ef2 Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Wed, 5 Jun 2024 16:30:35 +0800 +Subject: [PATCH] vfio/container: Implement HostIOMMUDeviceClass::get_cap() + handler +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Suggested-by: Cédric Le Goater +Signed-off-by: Zhenzhong Duan +Reviewed-by: Eric Auger +Reviewed-by: Michael S. 
Tsirkin +--- + hw/vfio/container.c | 15 +++++++++++++++ + 1 file changed, 15 insertions(+) + +diff --git a/hw/vfio/container.c b/hw/vfio/container.c +index fbe2bc50d4..ed54ce6d0c 100644 +--- a/hw/vfio/container.c ++++ b/hw/vfio/container.c +@@ -1261,11 +1261,26 @@ static bool hiod_legacy_vfio_realize(HostIOMMUDevice *hiod, void *opaque, + return true; + } + ++static int hiod_legacy_vfio_get_cap(HostIOMMUDevice *hiod, int cap, ++ Error **errp) ++{ ++ HostIOMMUDeviceCaps *caps = &hiod->caps; ++ ++ switch (cap) { ++ case HOST_IOMMU_DEVICE_CAP_AW_BITS: ++ return caps->aw_bits; ++ default: ++ error_setg(errp, "%s: unsupported capability %x", hiod->name, cap); ++ return -EINVAL; ++ } ++} ++ + static void hiod_legacy_vfio_class_init(ObjectClass *oc, void *data) + { + HostIOMMUDeviceClass *hioc = HOST_IOMMU_DEVICE_CLASS(oc); + + hioc->realize = hiod_legacy_vfio_realize; ++ hioc->get_cap = hiod_legacy_vfio_get_cap; + }; + + static const TypeInfo types[] = { +-- +2.41.0.windows.1 + diff --git a/vfio-container-Implement-HostIOMMUDeviceClass-realiz.patch b/vfio-container-Implement-HostIOMMUDeviceClass-realiz.patch new file mode 100644 index 0000000000000000000000000000000000000000..802f46c515958a2a7c8d3a13b555be04e5494c1b --- /dev/null +++ b/vfio-container-Implement-HostIOMMUDeviceClass-realiz.patch @@ -0,0 +1,100 @@ +From c66d22fa4ee9f6f38193256d7ce1494c32e10581 Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Wed, 5 Jun 2024 16:30:32 +0800 +Subject: [PATCH] vfio/container: Implement HostIOMMUDeviceClass::realize() + handler + +The realize function populates the capabilities. For now only the +aw_bits caps is computed for legacy backend. + +Introduce a helper function vfio_device_get_aw_bits() which calls +range_get_last_bit() to get host aw_bits and package it in +HostIOMMUDeviceCaps for query with .get_cap(). This helper will +also be used by iommufd backend. + +Signed-off-by: Zhenzhong Duan +Reviewed-by: Eric Auger +Reviewed-by: Michael S. 
Tsirkin +--- + hw/vfio/container.c | 20 +++++++++++++++++++- + hw/vfio/helpers.c | 17 +++++++++++++++++ + include/hw/vfio/vfio-common.h | 1 + + 3 files changed, 37 insertions(+), 1 deletion(-) + +diff --git a/hw/vfio/container.c b/hw/vfio/container.c +index dcf49af2d0..fbe2bc50d4 100644 +--- a/hw/vfio/container.c ++++ b/hw/vfio/container.c +@@ -1250,6 +1250,24 @@ static void vfio_iommu_legacy_class_init(ObjectClass *klass, void *data) + vioc->pci_hot_reset = vfio_legacy_pci_hot_reset; + }; + ++static bool hiod_legacy_vfio_realize(HostIOMMUDevice *hiod, void *opaque, ++ Error **errp) ++{ ++ VFIODevice *vdev = opaque; ++ ++ hiod->name = g_strdup(vdev->name); ++ hiod->caps.aw_bits = vfio_device_get_aw_bits(vdev); ++ ++ return true; ++} ++ ++static void hiod_legacy_vfio_class_init(ObjectClass *oc, void *data) ++{ ++ HostIOMMUDeviceClass *hioc = HOST_IOMMU_DEVICE_CLASS(oc); ++ ++ hioc->realize = hiod_legacy_vfio_realize; ++}; ++ + static const TypeInfo types[] = { + { + .name = TYPE_VFIO_IOMMU_LEGACY, +@@ -1258,8 +1276,8 @@ static const TypeInfo types[] = { + }, { + .name = TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO, + .parent = TYPE_HOST_IOMMU_DEVICE, ++ .class_init = hiod_legacy_vfio_class_init, + } +- + }; + + DEFINE_TYPES(types) +diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c +index 6789870802..35b8e42304 100644 +--- a/hw/vfio/helpers.c ++++ b/hw/vfio/helpers.c +@@ -663,3 +663,20 @@ void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops, + + vbasedev->ram_block_discard_allowed = ram_discard; + } ++ ++int vfio_device_get_aw_bits(VFIODevice *vdev) ++{ ++ /* ++ * iova_ranges is a sorted list. For old kernels that support ++ * VFIO but not support query of iova ranges, iova_ranges is NULL, ++ * in this case HOST_IOMMU_DEVICE_CAP_AW_BITS_MAX(64) is returned. 
++ */ ++ GList *l = g_list_last(vdev->bcontainer->iova_ranges); ++ ++ if (l) { ++ Range *range = l->data; ++ return range_get_last_bit(range) + 1; ++ } ++ ++ return HOST_IOMMU_DEVICE_CAP_AW_BITS_MAX; ++} +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index 2cfc8521cd..376b8350b9 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -277,4 +277,5 @@ int vfio_device_get_name(VFIODevice *vbasedev, Error **errp); + void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp); + void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops, + DeviceState *dev, bool ram_discard); ++int vfio_device_get_aw_bits(VFIODevice *vdev); + #endif /* HW_VFIO_VFIO_COMMON_H */ +-- +2.41.0.windows.1 + diff --git a/vfio-container-Implement-attach-detach_device.patch b/vfio-container-Implement-attach-detach_device.patch new file mode 100644 index 0000000000000000000000000000000000000000..0ce52489a87db68bec4ecdc29d3e643c717aa9d7 --- /dev/null +++ b/vfio-container-Implement-attach-detach_device.patch @@ -0,0 +1,89 @@ +From 1ba796aff9476e5850df910304eb3720a09feef2 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Sat, 11 Jan 2025 10:52:32 +0800 +Subject: [PATCH] vfio/container: Implement attach/detach_device +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +No functional change intended. 
+ +Signed-off-by: Eric Auger +Signed-off-by: Yi Liu +Signed-off-by: Yi Sun +Signed-off-by: Zhenzhong Duan +Reviewed-by: Cédric Le Goater +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhou Wang +--- + hw/vfio/common.c | 16 ++++++++++++++++ + hw/vfio/container.c | 12 +++++------- + 2 files changed, 21 insertions(+), 7 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index 9926454527..488aa43c9b 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -1644,3 +1644,19 @@ retry: + + return info; + } ++ ++int vfio_attach_device(char *name, VFIODevice *vbasedev, ++ AddressSpace *as, Error **errp) ++{ ++ const VFIOIOMMUOps *ops = &vfio_legacy_ops; ++ ++ return ops->attach_device(name, vbasedev, as, errp); ++} ++ ++void vfio_detach_device(VFIODevice *vbasedev) ++{ ++ if (!vbasedev->bcontainer) { ++ return; ++ } ++ vbasedev->bcontainer->ops->detach_device(vbasedev); ++} +diff --git a/hw/vfio/container.c b/hw/vfio/container.c +index 13d42aad0d..62af0f2bdd 100644 +--- a/hw/vfio/container.c ++++ b/hw/vfio/container.c +@@ -986,8 +986,8 @@ static int vfio_device_groupid(VFIODevice *vbasedev, Error **errp) + * @name and @vbasedev->name are likely to be different depending + * on the type of the device, hence the need for passing @name + */ +-int vfio_attach_device(char *name, VFIODevice *vbasedev, +- AddressSpace *as, Error **errp) ++static int vfio_legacy_attach_device(const char *name, VFIODevice *vbasedev, ++ AddressSpace *as, Error **errp) + { + int groupid = vfio_device_groupid(vbasedev, errp); + VFIODevice *vbasedev_iter; +@@ -1027,14 +1027,10 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev, + return ret; + } + +-void vfio_detach_device(VFIODevice *vbasedev) ++static void vfio_legacy_detach_device(VFIODevice *vbasedev) + { + VFIOGroup *group = vbasedev->group; + +- if (!vbasedev->bcontainer) { +- return; +- } +- + QLIST_REMOVE(vbasedev, global_next); + QLIST_REMOVE(vbasedev, container_next); + vbasedev->bcontainer = NULL; +@@ -1046,6 +1042,8 
@@ void vfio_detach_device(VFIODevice *vbasedev) + const VFIOIOMMUOps vfio_legacy_ops = { + .dma_map = vfio_legacy_dma_map, + .dma_unmap = vfio_legacy_dma_unmap, ++ .attach_device = vfio_legacy_attach_device, ++ .detach_device = vfio_legacy_detach_device, + .set_dirty_page_tracking = vfio_legacy_set_dirty_page_tracking, + .query_dirty_bitmap = vfio_legacy_query_dirty_bitmap, + }; +-- +2.41.0.windows.1 + diff --git a/vfio-container-Initialize-VFIOIOMMUOps-under-vfio_in.patch b/vfio-container-Initialize-VFIOIOMMUOps-under-vfio_in.patch new file mode 100644 index 0000000000000000000000000000000000000000..c7eb4bceb3f045431333b1bdba78d2018eeda8c9 --- /dev/null +++ b/vfio-container-Initialize-VFIOIOMMUOps-under-vfio_in.patch @@ -0,0 +1,55 @@ +From 7a81c3919dda48b4e12b83ceb661896523cce6ab Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= +Date: Tue, 19 Dec 2023 07:58:18 +0100 +Subject: [PATCH] vfio/container: Initialize VFIOIOMMUOps under + vfio_init_container() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +vfio_init_container() already defines the IOMMU type of the container. +Do the same for the VFIOIOMMUOps struct. This prepares ground for the +following patches that will deduce the associated VFIOIOMMUOps struct +from the IOMMU type. 
+ +Reviewed-by: Zhenzhong Duan +Tested-by: Eric Farman +Signed-off-by: Cédric Le Goater +--- + hw/vfio/container.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/hw/vfio/container.c b/hw/vfio/container.c +index 27ce31c883..dc805ceb12 100644 +--- a/hw/vfio/container.c ++++ b/hw/vfio/container.c +@@ -430,7 +430,7 @@ static int vfio_get_iommu_type(VFIOContainer *container, + } + + static int vfio_init_container(VFIOContainer *container, int group_fd, +- Error **errp) ++ VFIOAddressSpace *space, Error **errp) + { + int iommu_type, dirty_log_manual_clear, ret; + +@@ -467,7 +467,7 @@ static int vfio_init_container(VFIOContainer *container, int group_fd, + if (dirty_log_manual_clear) { + container->dirty_log_manual_clear = dirty_log_manual_clear; + } +- ++ vfio_container_init(&container->bcontainer, space, &vfio_legacy_ops); + return 0; + } + +@@ -679,7 +679,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, + bcontainer = &container->bcontainer; + vfio_container_init(bcontainer, space, &vfio_legacy_ops); + +- ret = vfio_init_container(container, group->fd, errp); ++ ret = vfio_init_container(container, group->fd, space, errp); + if (ret) { + goto free_container_exit; + } +-- +2.41.0.windows.1 + diff --git a/vfio-container-Intoduce-a-new-VFIOIOMMUClass-setup-h.patch b/vfio-container-Intoduce-a-new-VFIOIOMMUClass-setup-h.patch new file mode 100644 index 0000000000000000000000000000000000000000..7e9f9387f02530134e1f776b39e67fcadabd51fd --- /dev/null +++ b/vfio-container-Intoduce-a-new-VFIOIOMMUClass-setup-h.patch @@ -0,0 +1,45 @@ +From b8e67d06ec3036cd3fd6d625c550e0c542e49d60 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= +Date: Tue, 19 Dec 2023 07:58:21 +0100 +Subject: [PATCH] vfio/container: Intoduce a new VFIOIOMMUClass::setup handler +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This will help in converting the sPAPR IOMMU backend to a QOM 
interface. + +Reviewed-by: Zhenzhong Duan +Tested-by: Eric Farman +Signed-off-by: Cédric Le Goater +--- + hw/vfio/container.c | 1 + + include/hw/vfio/vfio-container-base.h | 1 + + 2 files changed, 2 insertions(+) + +diff --git a/hw/vfio/container.c b/hw/vfio/container.c +index 6b8de8f471..845239eff4 100644 +--- a/hw/vfio/container.c ++++ b/hw/vfio/container.c +@@ -1248,6 +1248,7 @@ static void vfio_iommu_legacy_class_init(ObjectClass *klass, void *data) + { + VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass); + ++ vioc->setup = vfio_legacy_setup; + vioc->dma_map = vfio_legacy_dma_map; + vioc->dma_unmap = vfio_legacy_dma_unmap; + vioc->attach_device = vfio_legacy_attach_device; +diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h +index dce801378b..614de90767 100644 +--- a/include/hw/vfio/vfio-container-base.h ++++ b/include/hw/vfio/vfio-container-base.h +@@ -109,6 +109,7 @@ struct VFIOIOMMUClass { + InterfaceClass parent_class; + + /* basic feature */ ++ int (*setup)(VFIOContainerBase *bcontainer, Error **errp); + int (*dma_map)(const VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + void *vaddr, bool readonly); +-- +2.41.0.windows.1 + diff --git a/vfio-container-Introduce-TYPE_HOST_IOMMU_DEVICE_LEGA.patch b/vfio-container-Introduce-TYPE_HOST_IOMMU_DEVICE_LEGA.patch new file mode 100644 index 0000000000000000000000000000000000000000..c5be84d61d0ad2bc3ec6f09a5cb16b35529e1dbe --- /dev/null +++ b/vfio-container-Introduce-TYPE_HOST_IOMMU_DEVICE_LEGA.patch @@ -0,0 +1,65 @@ +From c253a07d9fe1598c4dbbb1cefee457806c417885 Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Wed, 5 Jun 2024 16:30:29 +0800 +Subject: [PATCH] vfio/container: Introduce TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO + device +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO represents a host IOMMU device under +VFIO legacy container backend. 
+ +It will have its own realize implementation. + +Suggested-by: Eric Auger +Suggested-by: Cédric Le Goater +Signed-off-by: Zhenzhong Duan +Reviewed-by: Eric Auger +Reviewed-by: Michael S. Tsirkin +--- + hw/vfio/container.c | 6 +++++- + include/hw/vfio/vfio-common.h | 3 +++ + 2 files changed, 8 insertions(+), 1 deletion(-) + +diff --git a/hw/vfio/container.c b/hw/vfio/container.c +index 4c62f088b1..dcf49af2d0 100644 +--- a/hw/vfio/container.c ++++ b/hw/vfio/container.c +@@ -1255,7 +1255,11 @@ static const TypeInfo types[] = { + .name = TYPE_VFIO_IOMMU_LEGACY, + .parent = TYPE_VFIO_IOMMU, + .class_init = vfio_iommu_legacy_class_init, +- }, ++ }, { ++ .name = TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO, ++ .parent = TYPE_HOST_IOMMU_DEVICE, ++ } ++ + }; + + DEFINE_TYPES(types) +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index f3966410c1..0c807c2806 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -31,6 +31,7 @@ + #endif + #include "sysemu/sysemu.h" + #include "hw/vfio/vfio-container-base.h" ++#include "sysemu/host_iommu_device.h" + + #define VFIO_MSG_PREFIX "vfio %s: " + +@@ -75,6 +76,8 @@ typedef struct VFIOMigration { + + struct VFIOGroup; + ++#define TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO TYPE_HOST_IOMMU_DEVICE "-legacy-vfio" ++ + typedef struct VFIODMARange { + QLIST_ENTRY(VFIODMARange) next; + hwaddr iova; +-- +2.41.0.windows.1 + diff --git a/vfio-container-Introduce-a-VFIOIOMMU-QOM-interface.patch b/vfio-container-Introduce-a-VFIOIOMMU-QOM-interface.patch new file mode 100644 index 0000000000000000000000000000000000000000..0dd9efbca45c79d6d82c374115739086cf2659fd --- /dev/null +++ b/vfio-container-Introduce-a-VFIOIOMMU-QOM-interface.patch @@ -0,0 +1,134 @@ +From 5f62836c64d5abdbdb0d8fb9f0d2fd0d87f47b0a Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= +Date: Tue, 19 Dec 2023 07:58:19 +0100 +Subject: [PATCH] vfio/container: Introduce a VFIOIOMMU QOM interface +MIME-Version: 1.0 
+Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +VFIOContainerBase was not introduced as an abstract QOM object because +it felt unnecessary to expose all the IOMMU backends to the QEMU +machine and human interface. However, we can still abstract the IOMMU +backend handlers using a QOM interface class. This provides more +flexibility when referencing the various implementations. + +Simply transform the VFIOIOMMUOps struct in an InterfaceClass and do +some initial name replacements. Next changes will start converting +VFIOIOMMUOps. + +Reviewed-by: Zhenzhong Duan +Tested-by: Eric Farman +Signed-off-by: Cédric Le Goater +--- + hw/vfio/common.c | 2 +- + hw/vfio/container-base.c | 12 +++++++++++- + hw/vfio/pci.c | 2 +- + include/hw/vfio/vfio-container-base.h | 23 +++++++++++++++++++---- + 4 files changed, 32 insertions(+), 7 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index d572ec5880..abca6aa01a 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -1649,7 +1649,7 @@ retry: + int vfio_attach_device(char *name, VFIODevice *vbasedev, + AddressSpace *as, Error **errp) + { +- const VFIOIOMMUOps *ops = &vfio_legacy_ops; ++ const VFIOIOMMUClass *ops = &vfio_legacy_ops; + + #ifdef CONFIG_IOMMUFD + if (vbasedev->iommufd) { +diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c +index 1ffd25bbfa..913ae49077 100644 +--- a/hw/vfio/container-base.c ++++ b/hw/vfio/container-base.c +@@ -72,7 +72,7 @@ int vfio_container_query_dirty_bitmap(const VFIOContainerBase *bcontainer, + } + + void vfio_container_init(VFIOContainerBase *bcontainer, VFIOAddressSpace *space, +- const VFIOIOMMUOps *ops) ++ const VFIOIOMMUClass *ops) + { + bcontainer->ops = ops; + bcontainer->space = space; +@@ -99,3 +99,13 @@ void vfio_container_destroy(VFIOContainerBase *bcontainer) + + g_list_free_full(bcontainer->iova_ranges, g_free); + } ++ ++static const TypeInfo types[] = { ++ { ++ .name = TYPE_VFIO_IOMMU, ++ .parent = TYPE_INTERFACE, ++ 
.class_size = sizeof(VFIOIOMMUClass), ++ }, ++}; ++ ++DEFINE_TYPES(types) +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index 1874ec1aba..d84a9e73a6 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -2488,7 +2488,7 @@ int vfio_pci_get_pci_hot_reset_info(VFIOPCIDevice *vdev, + static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) + { + VFIODevice *vbasedev = &vdev->vbasedev; +- const VFIOIOMMUOps *ops = vbasedev->bcontainer->ops; ++ const VFIOIOMMUClass *ops = vbasedev->bcontainer->ops; + + return ops->pci_hot_reset(vbasedev, single); + } +diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h +index 2ae297ccda..ce8bf9e2e6 100644 +--- a/include/hw/vfio/vfio-container-base.h ++++ b/include/hw/vfio/vfio-container-base.h +@@ -16,7 +16,8 @@ + #include "exec/memory.h" + + typedef struct VFIODevice VFIODevice; +-typedef struct VFIOIOMMUOps VFIOIOMMUOps; ++typedef struct VFIOIOMMUClass VFIOIOMMUClass; ++#define VFIOIOMMUOps VFIOIOMMUClass /* To remove */ + + typedef struct { + unsigned long *bitmap; +@@ -34,7 +35,7 @@ typedef struct VFIOAddressSpace { + * This is the base object for vfio container backends + */ + typedef struct VFIOContainerBase { +- const VFIOIOMMUOps *ops; ++ const VFIOIOMMUClass *ops; + VFIOAddressSpace *space; + MemoryListener listener; + Error *error; +@@ -88,10 +89,24 @@ int vfio_container_query_dirty_bitmap(const VFIOContainerBase *bcontainer, + + void vfio_container_init(VFIOContainerBase *bcontainer, + VFIOAddressSpace *space, +- const VFIOIOMMUOps *ops); ++ const VFIOIOMMUClass *ops); + void vfio_container_destroy(VFIOContainerBase *bcontainer); + +-struct VFIOIOMMUOps { ++ ++#define TYPE_VFIO_IOMMU "vfio-iommu" ++ ++/* ++ * VFIOContainerBase is not an abstract QOM object because it felt ++ * unnecessary to expose all the IOMMU backends to the QEMU machine ++ * and human interface. However, we can still abstract the IOMMU ++ * backend handlers using a QOM interface class. 
This provides more ++ * flexibility when referencing the various implementations. ++ */ ++DECLARE_CLASS_CHECKERS(VFIOIOMMUClass, VFIO_IOMMU, TYPE_VFIO_IOMMU) ++ ++struct VFIOIOMMUClass { ++ InterfaceClass parent_class; ++ + /* basic feature */ + int (*dma_map)(const VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, +-- +2.41.0.windows.1 + diff --git a/vfio-container-Introduce-a-VFIOIOMMU-legacy-QOM-inte.patch b/vfio-container-Introduce-a-VFIOIOMMU-legacy-QOM-inte.patch new file mode 100644 index 0000000000000000000000000000000000000000..1b2a1bb2a66afd16a80bafd08fa410fff270b22f --- /dev/null +++ b/vfio-container-Introduce-a-VFIOIOMMU-legacy-QOM-inte.patch @@ -0,0 +1,166 @@ +From 9f04d045ef1b2d206b002d20b792111b3ce86909 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= +Date: Tue, 19 Dec 2023 07:58:20 +0100 +Subject: [PATCH] vfio/container: Introduce a VFIOIOMMU legacy QOM interface +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Convert the legacy VFIOIOMMUOps struct to the new VFIOIOMMU QOM +interface. The set of operations for this backend can be referenced +with a literal typename instead of a C struct. This will simplify +support of multiple backends.
+ +Reviewed-by: Zhenzhong Duan +Tested-by: Eric Farman +Signed-off-by: Cédric Le Goater +--- + hw/vfio/common.c | 6 ++- + hw/vfio/container.c | 59 ++++++++++++++++++++++----- + include/hw/vfio/vfio-common.h | 1 - + include/hw/vfio/vfio-container-base.h | 1 + + 4 files changed, 55 insertions(+), 12 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index abca6aa01a..d98c3b7422 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -1649,13 +1649,17 @@ retry: + int vfio_attach_device(char *name, VFIODevice *vbasedev, + AddressSpace *as, Error **errp) + { +- const VFIOIOMMUClass *ops = &vfio_legacy_ops; ++ const VFIOIOMMUClass *ops = ++ VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY)); + + #ifdef CONFIG_IOMMUFD + if (vbasedev->iommufd) { + ops = &vfio_iommufd_ops; + } + #endif ++ ++ assert(ops); ++ + return ops->attach_device(name, vbasedev, as, errp); + } + +diff --git a/hw/vfio/container.c b/hw/vfio/container.c +index dc805ceb12..6b8de8f471 100644 +--- a/hw/vfio/container.c ++++ b/hw/vfio/container.c +@@ -429,10 +429,30 @@ static int vfio_get_iommu_type(VFIOContainer *container, + return -EINVAL; + } + ++/* ++ * vfio_get_iommu_ops - get a VFIOIOMMUClass associated with a type ++ */ ++static const VFIOIOMMUClass *vfio_get_iommu_class(int iommu_type, Error **errp) ++{ ++ ObjectClass *klass = NULL; ++ ++ switch (iommu_type) { ++ case VFIO_TYPE1v2_IOMMU: ++ case VFIO_TYPE1_IOMMU: ++ klass = object_class_by_name(TYPE_VFIO_IOMMU_LEGACY); ++ break; ++ default: ++ g_assert_not_reached(); ++ }; ++ ++ return VFIO_IOMMU_CLASS(klass); ++} ++ + static int vfio_init_container(VFIOContainer *container, int group_fd, + VFIOAddressSpace *space, Error **errp) + { + int iommu_type, dirty_log_manual_clear, ret; ++ const VFIOIOMMUClass *vioc; + + iommu_type = vfio_get_iommu_type(container, errp); + if (iommu_type < 0) { +@@ -467,7 +487,14 @@ static int vfio_init_container(VFIOContainer *container, int group_fd, + if (dirty_log_manual_clear) { + 
container->dirty_log_manual_clear = dirty_log_manual_clear; + } +- vfio_container_init(&container->bcontainer, space, &vfio_legacy_ops); ++ ++ vioc = vfio_get_iommu_class(iommu_type, errp); ++ if (!vioc) { ++ error_setg(errp, "No available IOMMU models"); ++ return -EINVAL; ++ } ++ ++ vfio_container_init(&container->bcontainer, space, vioc); + return 0; + } + +@@ -677,7 +704,6 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, + container->fd = fd; + QLIST_INIT(&container->dma_list); + bcontainer = &container->bcontainer; +- vfio_container_init(bcontainer, space, &vfio_legacy_ops); + + ret = vfio_init_container(container, group->fd, space, errp); + if (ret) { +@@ -1218,12 +1244,25 @@ out_single: + return ret; + } + +-const VFIOIOMMUOps vfio_legacy_ops = { +- .dma_map = vfio_legacy_dma_map, +- .dma_unmap = vfio_legacy_dma_unmap, +- .attach_device = vfio_legacy_attach_device, +- .detach_device = vfio_legacy_detach_device, +- .set_dirty_page_tracking = vfio_legacy_set_dirty_page_tracking, +- .query_dirty_bitmap = vfio_legacy_query_dirty_bitmap, +- .pci_hot_reset = vfio_legacy_pci_hot_reset, ++static void vfio_iommu_legacy_class_init(ObjectClass *klass, void *data) ++{ ++ VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass); ++ ++ vioc->dma_map = vfio_legacy_dma_map; ++ vioc->dma_unmap = vfio_legacy_dma_unmap; ++ vioc->attach_device = vfio_legacy_attach_device; ++ vioc->detach_device = vfio_legacy_detach_device; ++ vioc->set_dirty_page_tracking = vfio_legacy_set_dirty_page_tracking; ++ vioc->query_dirty_bitmap = vfio_legacy_query_dirty_bitmap; ++ vioc->pci_hot_reset = vfio_legacy_pci_hot_reset; + }; ++ ++static const TypeInfo types[] = { ++ { ++ .name = TYPE_VFIO_IOMMU_LEGACY, ++ .parent = TYPE_VFIO_IOMMU, ++ .class_init = vfio_iommu_legacy_class_init, ++ }, ++}; ++ ++DEFINE_TYPES(types) +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index 151b2ab65f..f78a97006c 100644 +--- a/include/hw/vfio/vfio-common.h ++++ 
b/include/hw/vfio/vfio-common.h +@@ -224,7 +224,6 @@ typedef QLIST_HEAD(VFIOGroupList, VFIOGroup) VFIOGroupList; + typedef QLIST_HEAD(VFIODeviceList, VFIODevice) VFIODeviceList; + extern VFIOGroupList vfio_group_list; + extern VFIODeviceList vfio_device_list; +-extern const VFIOIOMMUOps vfio_legacy_ops; + extern const VFIOIOMMUOps vfio_iommufd_ops; + extern const MemoryListener vfio_memory_listener; + extern int vfio_kvm_device_fd; +diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h +index ce8bf9e2e6..dce801378b 100644 +--- a/include/hw/vfio/vfio-container-base.h ++++ b/include/hw/vfio/vfio-container-base.h +@@ -94,6 +94,7 @@ void vfio_container_destroy(VFIOContainerBase *bcontainer); + + + #define TYPE_VFIO_IOMMU "vfio-iommu" ++#define TYPE_VFIO_IOMMU_LEGACY TYPE_VFIO_IOMMU "-legacy" + + /* + * VFIOContainerBase is not an abstract QOM object because it felt +-- +2.41.0.windows.1 + diff --git a/vfio-container-Introduce-a-empty-VFIOIOMMUOps.patch b/vfio-container-Introduce-a-empty-VFIOIOMMUOps.patch new file mode 100644 index 0000000000000000000000000000000000000000..b6d40421e33683b740c99658146ea4c5dbc0aa4a --- /dev/null +++ b/vfio-container-Introduce-a-empty-VFIOIOMMUOps.patch @@ -0,0 +1,63 @@ +From bda13dc55ae5e16174a4a611353f4bb8a590d510 Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Sat, 11 Jan 2025 10:52:19 +0800 +Subject: [PATCH] vfio/container: Introduce a empty VFIOIOMMUOps +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This empty VFIOIOMMUOps named vfio_legacy_ops will hold all general +IOMMU ops of legacy container. 
+ +Signed-off-by: Zhenzhong Duan +Reviewed-by: Cédric Le Goater +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhou Wang +--- + hw/vfio/container.c | 5 +++++ + include/hw/vfio/vfio-common.h | 2 +- + 2 files changed, 6 insertions(+), 1 deletion(-) + +diff --git a/hw/vfio/container.c b/hw/vfio/container.c +index 77e61cfedd..8d8ed13e93 100644 +--- a/hw/vfio/container.c ++++ b/hw/vfio/container.c +@@ -565,6 +565,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, + Error **errp) + { + VFIOContainer *container; ++ VFIOContainerBase *bcontainer; + int ret, fd; + VFIOAddressSpace *space; + +@@ -646,6 +647,8 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, + QLIST_INIT(&container->giommu_list); + QLIST_INIT(&container->vrdl_list); + QLIST_INIT(&container->dma_list); ++ bcontainer = &container->bcontainer; ++ bcontainer->ops = &vfio_legacy_ops; + + ret = vfio_init_container(container, group->fd, errp); + if (ret) { +@@ -1046,3 +1049,5 @@ void vfio_detach_device(VFIODevice *vbasedev) + vfio_put_base_device(vbasedev); + vfio_put_group(group); + } ++ ++const VFIOIOMMUOps vfio_legacy_ops; +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index c89b5886f2..3a0a6ab6ee 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -268,7 +268,7 @@ typedef QLIST_HEAD(VFIOGroupList, VFIOGroup) VFIOGroupList; + typedef QLIST_HEAD(VFIODeviceList, VFIODevice) VFIODeviceList; + extern VFIOGroupList vfio_group_list; + extern VFIODeviceList vfio_device_list; +- ++extern const VFIOIOMMUOps vfio_legacy_ops; + extern const MemoryListener vfio_memory_listener; + extern int vfio_kvm_device_fd; + +-- +2.41.0.windows.1 + diff --git a/vfio-container-Introduce-vfio_legacy_setup-for-furth.patch b/vfio-container-Introduce-vfio_legacy_setup-for-furth.patch new file mode 100644 index 0000000000000000000000000000000000000000..630d44324ae41876c0a64e89f08730d22cd172c7 --- /dev/null +++ 
b/vfio-container-Introduce-vfio_legacy_setup-for-furth.patch @@ -0,0 +1,108 @@ +From 1bb64d6e69c385af5817dc6f0c3bbd204783c237 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= +Date: Tue, 19 Dec 2023 07:58:17 +0100 +Subject: [PATCH] vfio/container: Introduce vfio_legacy_setup() for further + cleanups +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This will help subsequent patches to unify the initialization of type1 +and sPAPR IOMMU backends. + +Reviewed-by: Zhenzhong Duan +Tested-by: Eric Farman +Signed-off-by: Cédric Le Goater +--- + hw/vfio/container.c | 60 +++++++++++++++++++++++++++------------------ + 1 file changed, 36 insertions(+), 24 deletions(-) + +diff --git a/hw/vfio/container.c b/hw/vfio/container.c +index 67aeaa825b..27ce31c883 100644 +--- a/hw/vfio/container.c ++++ b/hw/vfio/container.c +@@ -567,6 +567,35 @@ static void shared_memory_listener_unregister(void) + g_shl = NULL; + } + ++static int vfio_legacy_setup(VFIOContainerBase *bcontainer, Error **errp) ++{ ++ VFIOContainer *container = container_of(bcontainer, VFIOContainer, ++ bcontainer); ++ g_autofree struct vfio_iommu_type1_info *info = NULL; ++ int ret; ++ ++ ret = vfio_get_iommu_info(container, &info); ++ if (ret) { ++ error_setg_errno(errp, -ret, "Failed to get VFIO IOMMU info"); ++ return ret; ++ } ++ ++ if (info->flags & VFIO_IOMMU_INFO_PGSIZES) { ++ bcontainer->pgsizes = info->iova_pgsizes; ++ } else { ++ bcontainer->pgsizes = qemu_real_host_page_size(); ++ } ++ ++ if (!vfio_get_info_dma_avail(info, &bcontainer->dma_max_mappings)) { ++ bcontainer->dma_max_mappings = 65535; ++ } ++ ++ vfio_get_info_iova_range(info, bcontainer); ++ ++ vfio_get_iommu_info_migration(container, info); ++ return 0; ++} ++ + static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, + Error **errp) + { +@@ -665,31 +694,8 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, + case VFIO_TYPE1v2_IOMMU: + case 
VFIO_TYPE1_IOMMU: + case VFIO_TYPE1v2_S_IOMMU: +- { +- struct vfio_iommu_type1_info *info; +- +- ret = vfio_get_iommu_info(container, &info); +- if (ret) { +- error_setg_errno(errp, -ret, "Failed to get VFIO IOMMU info"); +- goto enable_discards_exit; +- } +- +- if (info->flags & VFIO_IOMMU_INFO_PGSIZES) { +- bcontainer->pgsizes = info->iova_pgsizes; +- } else { +- bcontainer->pgsizes = qemu_real_host_page_size(); +- } +- +- if (!vfio_get_info_dma_avail(info, &bcontainer->dma_max_mappings)) { +- bcontainer->dma_max_mappings = 65535; +- } +- +- vfio_get_info_iova_range(info, bcontainer); +- +- vfio_get_iommu_info_migration(container, info); +- g_free(info); ++ ret = vfio_legacy_setup(bcontainer, errp); + break; +- } + case VFIO_SPAPR_TCE_v2_IOMMU: + case VFIO_SPAPR_TCE_IOMMU: + { +@@ -699,6 +705,12 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, + } + break; + } ++ default: ++ g_assert_not_reached(); ++ } ++ ++ if (ret) { ++ goto enable_discards_exit; + } + + vfio_kvm_device_add_group(group); +-- +2.41.0.windows.1 + diff --git a/vfio-container-Move-dirty_pgsizes-and-max_dirty_bitm.patch b/vfio-container-Move-dirty_pgsizes-and-max_dirty_bitm.patch new file mode 100644 index 0000000000000000000000000000000000000000..b0b9ee7fae24c2526a3e99fbb8419b4d8e07150e --- /dev/null +++ b/vfio-container-Move-dirty_pgsizes-and-max_dirty_bitm.patch @@ -0,0 +1,94 @@ +From a59131a461adf9b626735886a53825e2a03f3272 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Sat, 11 Jan 2025 10:52:30 +0800 +Subject: [PATCH] vfio/container: Move dirty_pgsizes and max_dirty_bitmap_size + to base container +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +No functional change intended. 
+ +Signed-off-by: Eric Auger +Signed-off-by: Yi Liu +Signed-off-by: Yi Sun +Signed-off-by: Zhenzhong Duan +Reviewed-by: Cédric Le Goater +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhou Wang +--- + hw/vfio/container.c | 9 +++++---- + include/hw/vfio/vfio-common.h | 2 -- + include/hw/vfio/vfio-container-base.h | 2 ++ + 3 files changed, 7 insertions(+), 6 deletions(-) + +diff --git a/hw/vfio/container.c b/hw/vfio/container.c +index 50da1300dd..191597167a 100644 +--- a/hw/vfio/container.c ++++ b/hw/vfio/container.c +@@ -66,6 +66,7 @@ static int vfio_dma_unmap_bitmap(VFIOContainer *container, + hwaddr iova, ram_addr_t size, + IOMMUTLBEntry *iotlb) + { ++ VFIOContainerBase *bcontainer = &container->bcontainer; + struct vfio_iommu_type1_dma_unmap *unmap; + struct vfio_bitmap *bitmap; + VFIOBitmap vbmap; +@@ -93,7 +94,7 @@ static int vfio_dma_unmap_bitmap(VFIOContainer *container, + bitmap->size = vbmap.size; + bitmap->data = (__u64 *)vbmap.bitmap; + +- if (vbmap.size > container->max_dirty_bitmap_size) { ++ if (vbmap.size > bcontainer->max_dirty_bitmap_size) { + error_report("UNMAP: Size of bitmap too big 0x%"PRIx64, vbmap.size); + ret = -E2BIG; + goto unmap_exit; +@@ -157,7 +158,7 @@ static int vfio_legacy_dma_unmap(VFIOContainerBase *bcontainer, hwaddr iova, + + if (iotlb && vfio_devices_all_running_and_mig_active(bcontainer)) { + if (!vfio_devices_all_device_dirty_tracking(bcontainer) && +- container->bcontainer.dirty_pages_supported) { ++ bcontainer->dirty_pages_supported) { + return vfio_dma_unmap_bitmap(container, iova, size, iotlb); + } + +@@ -536,8 +537,8 @@ static void vfio_get_iommu_info_migration(VFIOContainer *container, + */ + if (cap_mig->pgsize_bitmap & qemu_real_host_page_size()) { + bcontainer->dirty_pages_supported = true; +- container->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size; +- container->dirty_pgsizes = cap_mig->pgsize_bitmap; ++ bcontainer->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size; ++ bcontainer->dirty_pgsizes = 
cap_mig->pgsize_bitmap; + } + } + +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index c23e7fb8ee..a8da41d27e 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -89,8 +89,6 @@ typedef struct VFIOContainer { + MemoryListener prereg_listener; + unsigned iommu_type; + bool dirty_log_manual_clear; +- uint64_t dirty_pgsizes; +- uint64_t max_dirty_bitmap_size; + QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; + QLIST_HEAD(, VFIOGroup) group_list; + QLIST_HEAD(, VFIODMARange) dma_list; +diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h +index 95f8d319e0..80e4a993c5 100644 +--- a/include/hw/vfio/vfio-container-base.h ++++ b/include/hw/vfio/vfio-container-base.h +@@ -39,6 +39,8 @@ typedef struct VFIOContainerBase { + MemoryListener listener; + Error *error; + bool initialized; ++ uint64_t dirty_pgsizes; ++ uint64_t max_dirty_bitmap_size; + unsigned long pgsizes; + unsigned int dma_max_mappings; + bool dirty_pages_supported; +-- +2.41.0.windows.1 + diff --git a/vfio-container-Move-iova_ranges-to-base-container.patch b/vfio-container-Move-iova_ranges-to-base-container.patch new file mode 100644 index 0000000000000000000000000000000000000000..3580802248dbb230193ad48ce60ea584b9383ac1 --- /dev/null +++ b/vfio-container-Move-iova_ranges-to-base-container.patch @@ -0,0 +1,160 @@ +From 4aac9c99e4f90d400d511bb46809714eab1fbf5f Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Sat, 11 Jan 2025 10:52:31 +0800 +Subject: [PATCH] vfio/container: Move iova_ranges to base container +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Meanwhile remove the helper function vfio_free_container as it +only calls g_free now. + +No functional change intended. 
+ +Signed-off-by: Zhenzhong Duan +Reviewed-by: Cédric Le Goater +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhou Wang +--- + hw/vfio/common.c | 5 +++-- + hw/vfio/container-base.c | 3 +++ + hw/vfio/container.c | 19 ++++++------------- + include/hw/vfio/vfio-common.h | 1 - + include/hw/vfio/vfio-container-base.h | 1 + + 5 files changed, 13 insertions(+), 16 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index 4647f4447d..9926454527 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -637,9 +637,10 @@ static void vfio_listener_region_add(MemoryListener *listener, + goto fail; + } + +- if (container->iova_ranges) { ++ if (bcontainer->iova_ranges) { + ret = memory_region_iommu_set_iova_ranges(giommu->iommu_mr, +- container->iova_ranges, &err); ++ bcontainer->iova_ranges, ++ &err); + if (ret) { + g_free(giommu); + goto fail; +diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c +index 7f508669f5..0177f43741 100644 +--- a/hw/vfio/container-base.c ++++ b/hw/vfio/container-base.c +@@ -54,6 +54,7 @@ void vfio_container_init(VFIOContainerBase *bcontainer, VFIOAddressSpace *space, + bcontainer->error = NULL; + bcontainer->dirty_pages_supported = false; + bcontainer->dma_max_mappings = 0; ++ bcontainer->iova_ranges = NULL; + QLIST_INIT(&bcontainer->giommu_list); + QLIST_INIT(&bcontainer->vrdl_list); + } +@@ -70,4 +71,6 @@ void vfio_container_destroy(VFIOContainerBase *bcontainer) + QLIST_REMOVE(giommu, giommu_next); + g_free(giommu); + } ++ ++ g_list_free_full(bcontainer->iova_ranges, g_free); + } +diff --git a/hw/vfio/container.c b/hw/vfio/container.c +index 191597167a..13d42aad0d 100644 +--- a/hw/vfio/container.c ++++ b/hw/vfio/container.c +@@ -360,7 +360,7 @@ bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info, + } + + static bool vfio_get_info_iova_range(struct vfio_iommu_type1_info *info, +- VFIOContainer *container) ++ VFIOContainerBase *bcontainer) + { + struct vfio_info_cap_header *hdr; + struct 
vfio_iommu_type1_info_cap_iova_range *cap; +@@ -378,8 +378,8 @@ static bool vfio_get_info_iova_range(struct vfio_iommu_type1_info *info, + + range_set_bounds(range, cap->iova_ranges[i].start, + cap->iova_ranges[i].end); +- container->iova_ranges = +- range_list_insert(container->iova_ranges, range); ++ bcontainer->iova_ranges = ++ range_list_insert(bcontainer->iova_ranges, range); + } + + return true; +@@ -542,12 +542,6 @@ static void vfio_get_iommu_info_migration(VFIOContainer *container, + } + } + +-static void vfio_free_container(VFIOContainer *container) +-{ +- g_list_free_full(container->iova_ranges, g_free); +- g_free(container); +-} +- + static SharedRegionListener *g_shl; + + static void shared_memory_listener_register(MemoryListener *listener, +@@ -653,7 +647,6 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, + + container = g_malloc0(sizeof(*container)); + container->fd = fd; +- container->iova_ranges = NULL; + QLIST_INIT(&container->dma_list); + bcontainer = &container->bcontainer; + vfio_container_init(bcontainer, space, &vfio_legacy_ops); +@@ -692,7 +685,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, + bcontainer->dma_max_mappings = 65535; + } + +- vfio_get_info_iova_range(info, container); ++ vfio_get_info_iova_range(info, bcontainer); + + vfio_get_iommu_info_migration(container, info); + g_free(info); +@@ -753,7 +746,7 @@ enable_discards_exit: + vfio_ram_block_discard_disable(container, false); + + free_container_exit: +- vfio_free_container(container); ++ g_free(container); + + close_fd_exit: + close(fd); +@@ -801,7 +794,7 @@ static void vfio_disconnect_container(VFIOGroup *group) + + trace_vfio_disconnect_container(container->fd); + close(container->fd); +- vfio_free_container(container); ++ g_free(container); + + vfio_put_address_space(space); + } +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index a8da41d27e..9a2e0ace72 100644 +--- a/include/hw/vfio/vfio-common.h 
++++ b/include/hw/vfio/vfio-common.h +@@ -92,7 +92,6 @@ typedef struct VFIOContainer { + QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; + QLIST_HEAD(, VFIOGroup) group_list; + QLIST_HEAD(, VFIODMARange) dma_list; +- GList *iova_ranges; + } VFIOContainer; + + typedef struct VFIOHostDMAWindow { +diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h +index 80e4a993c5..9658ffb526 100644 +--- a/include/hw/vfio/vfio-container-base.h ++++ b/include/hw/vfio/vfio-container-base.h +@@ -48,6 +48,7 @@ typedef struct VFIOContainerBase { + QLIST_HEAD(, VFIORamDiscardListener) vrdl_list; + QLIST_ENTRY(VFIOContainerBase) next; + QLIST_HEAD(, VFIODevice) device_list; ++ GList *iova_ranges; + } VFIOContainerBase; + + typedef struct VFIOGuestIOMMU { +-- +2.41.0.windows.1 + diff --git a/vfio-container-Move-listener-to-base-container.patch b/vfio-container-Move-listener-to-base-container.patch new file mode 100644 index 0000000000000000000000000000000000000000..bdba89ac111f4b8037975c8e5069619e9a326074 --- /dev/null +++ b/vfio-container-Move-listener-to-base-container.patch @@ -0,0 +1,546 @@ +From 4515b719fb7a335ce76dd9168a9e4db24fca28df Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Sat, 11 Jan 2025 10:52:29 +0800 +Subject: [PATCH] vfio/container: Move listener to base container +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Move listener to base container. Also error and initialized fields +are moved at the same time. + +No functional change intended. + +Modify vfio_physical_log_clear/vfio_connect_container during +backporting. 
+ +Signed-off-by: Eric Auger +Signed-off-by: Yi Liu +Signed-off-by: Yi Sun +Signed-off-by: Zhenzhong Duan +Reviewed-by: Cédric Le Goater +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhou Wang +--- + hw/vfio/common.c | 119 +++++++++++++------------- + hw/vfio/container-base.c | 1 + + hw/vfio/container.c | 23 +++-- + hw/vfio/spapr.c | 11 +-- + include/hw/vfio/vfio-common.h | 3 - + include/hw/vfio/vfio-container-base.h | 3 + + 6 files changed, 82 insertions(+), 78 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index e9a19209ab..4647f4447d 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -541,7 +541,7 @@ static bool vfio_listener_valid_section(MemoryRegionSection *section, + return true; + } + +-static bool vfio_get_section_iova_range(VFIOContainer *container, ++static bool vfio_get_section_iova_range(VFIOContainerBase *bcontainer, + MemoryRegionSection *section, + hwaddr *out_iova, hwaddr *out_end, + Int128 *out_llend) +@@ -569,8 +569,10 @@ static bool vfio_get_section_iova_range(VFIOContainer *container, + static void vfio_listener_region_add(MemoryListener *listener, + MemoryRegionSection *section) + { +- VFIOContainer *container = container_of(listener, VFIOContainer, listener); +- VFIOContainerBase *bcontainer = &container->bcontainer; ++ VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, ++ listener); ++ VFIOContainer *container = container_of(bcontainer, VFIOContainer, ++ bcontainer); + hwaddr iova, end; + Int128 llend, llsize; + void *vaddr; +@@ -581,7 +583,8 @@ static void vfio_listener_region_add(MemoryListener *listener, + return; + } + +- if (!vfio_get_section_iova_range(container, section, &iova, &end, &llend)) { ++ if (!vfio_get_section_iova_range(bcontainer, section, &iova, &end, ++ &llend)) { + if (memory_region_is_ram_device(section->mr)) { + trace_vfio_listener_region_add_no_dma_map( + memory_region_name(section->mr), +@@ -688,13 +691,12 @@ static void vfio_listener_region_add(MemoryListener 
*listener, + } + } + +- ret = vfio_container_dma_map(&container->bcontainer, +- iova, int128_get64(llsize), vaddr, +- section->readonly); ++ ret = vfio_container_dma_map(bcontainer, iova, int128_get64(llsize), ++ vaddr, section->readonly); + if (ret) { + error_setg(&err, "vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", " + "0x%"HWADDR_PRIx", %p) = %d (%s)", +- container, iova, int128_get64(llsize), vaddr, ret, ++ bcontainer, iova, int128_get64(llsize), vaddr, ret, + strerror(-ret)); + if (memory_region_is_ram_device(section->mr)) { + /* Allow unexpected mappings not to be fatal for RAM devices */ +@@ -716,9 +718,9 @@ fail: + * can gracefully fail. Runtime, there's not much we can do other + * than throw a hardware error. + */ +- if (!container->initialized) { +- if (!container->error) { +- error_propagate_prepend(&container->error, err, ++ if (!bcontainer->initialized) { ++ if (!bcontainer->error) { ++ error_propagate_prepend(&bcontainer->error, err, + "Region %s: ", + memory_region_name(section->mr)); + } else { +@@ -733,8 +735,10 @@ fail: + static void vfio_listener_region_del(MemoryListener *listener, + MemoryRegionSection *section) + { +- VFIOContainer *container = container_of(listener, VFIOContainer, listener); +- VFIOContainerBase *bcontainer = &container->bcontainer; ++ VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, ++ listener); ++ VFIOContainer *container = container_of(bcontainer, VFIOContainer, ++ bcontainer); + hwaddr iova, end; + Int128 llend, llsize; + int ret; +@@ -767,7 +771,8 @@ static void vfio_listener_region_del(MemoryListener *listener, + */ + } + +- if (!vfio_get_section_iova_range(container, section, &iova, &end, &llend)) { ++ if (!vfio_get_section_iova_range(bcontainer, section, &iova, &end, ++ &llend)) { + return; + } + +@@ -790,22 +795,22 @@ static void vfio_listener_region_del(MemoryListener *listener, + if (int128_eq(llsize, int128_2_64())) { + /* The unmap ioctl doesn't accept a full 64-bit span. 
*/ + llsize = int128_rshift(llsize, 1); +- ret = vfio_container_dma_unmap(&container->bcontainer, iova, ++ ret = vfio_container_dma_unmap(bcontainer, iova, + int128_get64(llsize), NULL); + if (ret) { + error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " + "0x%"HWADDR_PRIx") = %d (%s)", +- container, iova, int128_get64(llsize), ret, ++ bcontainer, iova, int128_get64(llsize), ret, + strerror(-ret)); + } + iova += int128_get64(llsize); + } +- ret = vfio_container_dma_unmap(&container->bcontainer, iova, ++ ret = vfio_container_dma_unmap(bcontainer, iova, + int128_get64(llsize), NULL); + if (ret) { + error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " + "0x%"HWADDR_PRIx") = %d (%s)", +- container, iova, int128_get64(llsize), ret, ++ bcontainer, iova, int128_get64(llsize), ret, + strerror(-ret)); + } + } +@@ -825,16 +830,15 @@ typedef struct VFIODirtyRanges { + } VFIODirtyRanges; + + typedef struct VFIODirtyRangesListener { +- VFIOContainer *container; ++ VFIOContainerBase *bcontainer; + VFIODirtyRanges ranges; + MemoryListener listener; + } VFIODirtyRangesListener; + + static bool vfio_section_is_vfio_pci(MemoryRegionSection *section, +- VFIOContainer *container) ++ VFIOContainerBase *bcontainer) + { + VFIOPCIDevice *pcidev; +- VFIOContainerBase *bcontainer = &container->bcontainer; + VFIODevice *vbasedev; + Object *owner; + +@@ -863,7 +867,7 @@ static void vfio_dirty_tracking_update(MemoryListener *listener, + hwaddr iova, end, *min, *max; + + if (!vfio_listener_valid_section(section, "tracking_update") || +- !vfio_get_section_iova_range(dirty->container, section, ++ !vfio_get_section_iova_range(dirty->bcontainer, section, + &iova, &end, NULL)) { + return; + } +@@ -887,7 +891,7 @@ static void vfio_dirty_tracking_update(MemoryListener *listener, + * The alternative would be an IOVATree but that has a much bigger runtime + * overhead and unnecessary complexity. 
+ */ +- if (vfio_section_is_vfio_pci(section, dirty->container) && ++ if (vfio_section_is_vfio_pci(section, dirty->bcontainer) && + iova >= UINT32_MAX) { + min = &range->minpci64; + max = &range->maxpci64; +@@ -911,7 +915,7 @@ static const MemoryListener vfio_dirty_tracking_listener = { + .region_add = vfio_dirty_tracking_update, + }; + +-static void vfio_dirty_tracking_init(VFIOContainer *container, ++static void vfio_dirty_tracking_init(VFIOContainerBase *bcontainer, + VFIODirtyRanges *ranges) + { + VFIODirtyRangesListener dirty; +@@ -921,10 +925,10 @@ static void vfio_dirty_tracking_init(VFIOContainer *container, + dirty.ranges.min64 = UINT64_MAX; + dirty.ranges.minpci64 = UINT64_MAX; + dirty.listener = vfio_dirty_tracking_listener; +- dirty.container = container; ++ dirty.bcontainer = bcontainer; + + memory_listener_register(&dirty.listener, +- container->bcontainer.space->as); ++ bcontainer->space->as); + + *ranges = dirty.ranges; + +@@ -936,12 +940,11 @@ static void vfio_dirty_tracking_init(VFIOContainer *container, + memory_listener_unregister(&dirty.listener); + } + +-static void vfio_devices_dma_logging_stop(VFIOContainer *container) ++static void vfio_devices_dma_logging_stop(VFIOContainerBase *bcontainer) + { + uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature), + sizeof(uint64_t))] = {}; + struct vfio_device_feature *feature = (struct vfio_device_feature *)buf; +- VFIOContainerBase *bcontainer = &container->bcontainer; + VFIODevice *vbasedev; + + feature->argsz = sizeof(buf); +@@ -962,7 +965,7 @@ static void vfio_devices_dma_logging_stop(VFIOContainer *container) + } + + static struct vfio_device_feature * +-vfio_device_feature_dma_logging_start_create(VFIOContainer *container, ++vfio_device_feature_dma_logging_start_create(VFIOContainerBase *bcontainer, + VFIODirtyRanges *tracking) + { + struct vfio_device_feature *feature; +@@ -1035,16 +1038,15 @@ static void vfio_device_feature_dma_logging_start_destroy( + g_free(feature); + } + +-static 
int vfio_devices_dma_logging_start(VFIOContainer *container) ++static int vfio_devices_dma_logging_start(VFIOContainerBase *bcontainer) + { + struct vfio_device_feature *feature; + VFIODirtyRanges ranges; +- VFIOContainerBase *bcontainer = &container->bcontainer; + VFIODevice *vbasedev; + int ret = 0; + +- vfio_dirty_tracking_init(container, &ranges); +- feature = vfio_device_feature_dma_logging_start_create(container, ++ vfio_dirty_tracking_init(bcontainer, &ranges); ++ feature = vfio_device_feature_dma_logging_start_create(bcontainer, + &ranges); + if (!feature) { + return -errno; +@@ -1067,7 +1069,7 @@ static int vfio_devices_dma_logging_start(VFIOContainer *container) + + out: + if (ret) { +- vfio_devices_dma_logging_stop(container); ++ vfio_devices_dma_logging_stop(bcontainer); + } + + vfio_device_feature_dma_logging_start_destroy(feature); +@@ -1077,14 +1079,14 @@ out: + + static void vfio_listener_log_global_start(MemoryListener *listener) + { +- VFIOContainer *container = container_of(listener, VFIOContainer, listener); ++ VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, ++ listener); + int ret; + +- if (vfio_devices_all_device_dirty_tracking(&container->bcontainer)) { +- ret = vfio_devices_dma_logging_start(container); ++ if (vfio_devices_all_device_dirty_tracking(bcontainer)) { ++ ret = vfio_devices_dma_logging_start(bcontainer); + } else { +- ret = vfio_container_set_dirty_page_tracking(&container->bcontainer, +- true); ++ ret = vfio_container_set_dirty_page_tracking(bcontainer, true); + } + + if (ret) { +@@ -1096,14 +1098,14 @@ static void vfio_listener_log_global_start(MemoryListener *listener) + + static void vfio_listener_log_global_stop(MemoryListener *listener) + { +- VFIOContainer *container = container_of(listener, VFIOContainer, listener); ++ VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, ++ listener); + int ret = 0; + +- if (vfio_devices_all_device_dirty_tracking(&container->bcontainer)) { +- 
vfio_devices_dma_logging_stop(container); ++ if (vfio_devices_all_device_dirty_tracking(bcontainer)) { ++ vfio_devices_dma_logging_stop(bcontainer); + } else { +- ret = vfio_container_set_dirty_page_tracking(&container->bcontainer, +- false); ++ ret = vfio_container_set_dirty_page_tracking(bcontainer, false); + } + + if (ret) { +@@ -1221,8 +1223,6 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) + vfio_giommu_dirty_notifier, n); + VFIOGuestIOMMU *giommu = gdn->giommu; + VFIOContainerBase *bcontainer = giommu->bcontainer; +- VFIOContainer *container = container_of(bcontainer, VFIOContainer, +- bcontainer); + hwaddr iova = iotlb->iova + giommu->iommu_offset; + ram_addr_t translated_addr; + int ret = -EINVAL; +@@ -1237,12 +1237,12 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) + + rcu_read_lock(); + if (vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL)) { +- ret = vfio_get_dirty_bitmap(&container->bcontainer, iova, +- iotlb->addr_mask + 1, translated_addr); ++ ret = vfio_get_dirty_bitmap(bcontainer, iova, iotlb->addr_mask + 1, ++ translated_addr); + if (ret) { + error_report("vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", " + "0x%"HWADDR_PRIx") = %d (%s)", +- container, iova, iotlb->addr_mask + 1, ret, ++ bcontainer, iova, iotlb->addr_mask + 1, ret, + strerror(-ret)); + } + } +@@ -1298,10 +1298,9 @@ vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainerBase *bcontainer, + &vrdl); + } + +-static int vfio_sync_dirty_bitmap(VFIOContainer *container, ++static int vfio_sync_dirty_bitmap(VFIOContainerBase *bcontainer, + MemoryRegionSection *section) + { +- VFIOContainerBase *bcontainer = &container->bcontainer; + ram_addr_t ram_addr; + + if (memory_region_is_iommu(section->mr)) { +@@ -1337,7 +1336,7 @@ static int vfio_sync_dirty_bitmap(VFIOContainer *container, + ram_addr = memory_region_get_ram_addr(section->mr) + + section->offset_within_region; + +- return 
vfio_get_dirty_bitmap(&container->bcontainer, ++ return vfio_get_dirty_bitmap(bcontainer, + REAL_HOST_PAGE_ALIGN(section->offset_within_address_space), + int128_get64(section->size), ram_addr); + } +@@ -1345,15 +1344,16 @@ static int vfio_sync_dirty_bitmap(VFIOContainer *container, + static void vfio_listener_log_sync(MemoryListener *listener, + MemoryRegionSection *section) + { +- VFIOContainer *container = container_of(listener, VFIOContainer, listener); ++ VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, ++ listener); + int ret; + + if (vfio_listener_skipped_section(section)) { + return; + } + +- if (vfio_devices_all_dirty_tracking(&container->bcontainer)) { +- ret = vfio_sync_dirty_bitmap(container, section); ++ if (vfio_devices_all_dirty_tracking(bcontainer)) { ++ ret = vfio_sync_dirty_bitmap(bcontainer, section); + if (ret) { + error_report("vfio: Failed to sync dirty bitmap, err: %d (%s)", ret, + strerror(-ret)); +@@ -1485,14 +1485,17 @@ static int vfio_physical_log_clear(VFIOContainer *container, + static void vfio_listener_log_clear(MemoryListener *listener, + MemoryRegionSection *section) + { +- VFIOContainer *container = container_of(listener, VFIOContainer, listener); ++ VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, ++ listener); ++ VFIOContainer *container = container_of(bcontainer, VFIOContainer, ++ bcontainer); + + if (vfio_listener_skipped_section(section) || +- !container->bcontainer.dirty_pages_supported) { ++ !bcontainer->dirty_pages_supported) { + return; + } + +- if (vfio_devices_all_dirty_tracking(&container->bcontainer)) { ++ if (vfio_devices_all_dirty_tracking(bcontainer)) { + vfio_physical_log_clear(container, section); + } + } +diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c +index 584eee4ba1..7f508669f5 100644 +--- a/hw/vfio/container-base.c ++++ b/hw/vfio/container-base.c +@@ -51,6 +51,7 @@ void vfio_container_init(VFIOContainerBase *bcontainer, VFIOAddressSpace 
*space, + { + bcontainer->ops = ops; + bcontainer->space = space; ++ bcontainer->error = NULL; + bcontainer->dirty_pages_supported = false; + bcontainer->dma_max_mappings = 0; + QLIST_INIT(&bcontainer->giommu_list); +diff --git a/hw/vfio/container.c b/hw/vfio/container.c +index 023f220c93..50da1300dd 100644 +--- a/hw/vfio/container.c ++++ b/hw/vfio/container.c +@@ -520,6 +520,7 @@ static void vfio_get_iommu_info_migration(VFIOContainer *container, + { + struct vfio_info_cap_header *hdr; + struct vfio_iommu_type1_info_cap_migration *cap_mig; ++ VFIOContainerBase *bcontainer = &container->bcontainer; + + hdr = vfio_get_iommu_info_cap(info, VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION); + if (!hdr) { +@@ -534,7 +535,7 @@ static void vfio_get_iommu_info_migration(VFIOContainer *container, + * qemu_real_host_page_size to mark those dirty. + */ + if (cap_mig->pgsize_bitmap & qemu_real_host_page_size()) { +- container->bcontainer.dirty_pages_supported = true; ++ bcontainer->dirty_pages_supported = true; + container->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size; + container->dirty_pgsizes = cap_mig->pgsize_bitmap; + } +@@ -651,7 +652,6 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, + + container = g_malloc0(sizeof(*container)); + container->fd = fd; +- container->error = NULL; + container->iova_ranges = NULL; + QLIST_INIT(&container->dma_list); + bcontainer = &container->bcontainer; +@@ -716,23 +716,22 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, + group->container = container; + QLIST_INSERT_HEAD(&container->group_list, group, container_next); + +- container->listener = vfio_memory_listener; +- + if (kvm_csv3_enabled()) { +- shared_memory_listener_register(&container->listener, ++ shared_memory_listener_register(&bcontainer->listener, + bcontainer->space->as); +- } else { +- memory_listener_register(&container->listener, bcontainer->space->as); + } + +- if (container->error) { ++ bcontainer->listener = 
vfio_memory_listener; ++ memory_listener_register(&bcontainer->listener, bcontainer->space->as); ++ ++ if (bcontainer->error) { + ret = -1; +- error_propagate_prepend(errp, container->error, ++ error_propagate_prepend(errp, bcontainer->error, + "memory listener initialization failed: "); + goto listener_release_exit; + } + +- container->initialized = true; ++ bcontainer->initialized = true; + + return 0; + listener_release_exit: +@@ -742,7 +741,7 @@ listener_release_exit: + if (kvm_csv3_enabled()) { + shared_memory_listener_unregister(); + } else { +- memory_listener_unregister(&container->listener); ++ memory_listener_unregister(&bcontainer->listener); + } + if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU || + container->iommu_type == VFIO_SPAPR_TCE_IOMMU) { +@@ -781,7 +780,7 @@ static void vfio_disconnect_container(VFIOGroup *group) + if (kvm_csv3_enabled()) { + shared_memory_listener_unregister(); + } else { +- memory_listener_unregister(&container->listener); ++ memory_listener_unregister(&bcontainer->listener); + } + if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU || + container->iommu_type == VFIO_SPAPR_TCE_IOMMU) { +diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c +index 4f76bdd3ca..7a50975f25 100644 +--- a/hw/vfio/spapr.c ++++ b/hw/vfio/spapr.c +@@ -46,6 +46,7 @@ static void vfio_prereg_listener_region_add(MemoryListener *listener, + { + VFIOContainer *container = container_of(listener, VFIOContainer, + prereg_listener); ++ VFIOContainerBase *bcontainer = &container->bcontainer; + const hwaddr gpa = section->offset_within_address_space; + hwaddr end; + int ret; +@@ -88,9 +89,9 @@ static void vfio_prereg_listener_region_add(MemoryListener *listener, + * can gracefully fail. Runtime, there's not much we can do other + * than throw a hardware error. 
+ */ +- if (!container->initialized) { +- if (!container->error) { +- error_setg_errno(&container->error, -ret, ++ if (!bcontainer->initialized) { ++ if (!bcontainer->error) { ++ error_setg_errno(&bcontainer->error, -ret, + "Memory registering failed"); + } + } else { +@@ -445,9 +446,9 @@ int vfio_spapr_container_init(VFIOContainer *container, Error **errp) + + memory_listener_register(&container->prereg_listener, + &address_space_memory); +- if (container->error) { ++ if (bcontainer->error) { + ret = -1; +- error_propagate_prepend(errp, container->error, ++ error_propagate_prepend(errp, bcontainer->error, + "RAM memory listener initialization failed: "); + goto listener_unregister_exit; + } +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index 0174b767ca..c23e7fb8ee 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -86,11 +86,8 @@ typedef struct VFIODMARange { + typedef struct VFIOContainer { + VFIOContainerBase bcontainer; + int fd; /* /dev/vfio/vfio, empowered by the attached groups */ +- MemoryListener listener; + MemoryListener prereg_listener; + unsigned iommu_type; +- Error *error; +- bool initialized; + bool dirty_log_manual_clear; + uint64_t dirty_pgsizes; + uint64_t max_dirty_bitmap_size; +diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h +index 8e05b5ac5a..95f8d319e0 100644 +--- a/include/hw/vfio/vfio-container-base.h ++++ b/include/hw/vfio/vfio-container-base.h +@@ -36,6 +36,9 @@ typedef struct VFIOAddressSpace { + typedef struct VFIOContainerBase { + const VFIOIOMMUOps *ops; + VFIOAddressSpace *space; ++ MemoryListener listener; ++ Error *error; ++ bool initialized; + unsigned long pgsizes; + unsigned int dma_max_mappings; + bool dirty_pages_supported; +-- +2.41.0.windows.1 + diff --git a/vfio-container-Move-per-container-device-list-in-bas.patch b/vfio-container-Move-per-container-device-list-in-bas.patch new file mode 100644 index 
0000000000000000000000000000000000000000..bcbaff73749fa53839b3ca4e5ccf40791d2ec403 --- /dev/null +++ b/vfio-container-Move-per-container-device-list-in-bas.patch @@ -0,0 +1,222 @@ +From 22244582a5ff77c0d93008e603a343c1e47ca85d Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Sat, 11 Jan 2025 10:52:25 +0800 +Subject: [PATCH] vfio/container: Move per container device list in base + container +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +VFIO Device is also changed to point to base container instead of +legacy container. + +No functional change intended. + +Signed-off-by: Zhenzhong Duan +Reviewed-by: Cédric Le Goater +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhou Wang +--- + hw/vfio/common.c | 23 +++++++++++++++-------- + hw/vfio/container.c | 12 ++++++------ + include/hw/vfio/vfio-common.h | 3 +-- + include/hw/vfio/vfio-container-base.h | 1 + + 4 files changed, 23 insertions(+), 16 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index 3be6cecc63..b952d1c811 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -145,7 +145,7 @@ void vfio_unblock_multiple_devices_migration(void) + + bool vfio_viommu_preset(VFIODevice *vbasedev) + { +- return vbasedev->container->bcontainer.space->as != &address_space_memory; ++ return vbasedev->bcontainer->space->as != &address_space_memory; + } + + static void vfio_set_migration_error(int err) +@@ -179,6 +179,7 @@ bool vfio_device_state_is_precopy(VFIODevice *vbasedev) + + static bool vfio_devices_all_dirty_tracking(VFIOContainer *container) + { ++ VFIOContainerBase *bcontainer = &container->bcontainer; + VFIODevice *vbasedev; + MigrationState *ms = migrate_get_current(); + +@@ -187,7 +188,7 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainer *container) + return false; + } + +- QLIST_FOREACH(vbasedev, &container->device_list, container_next) { ++ QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { + VFIOMigration *migration = 
vbasedev->migration; + + if (!migration) { +@@ -205,9 +206,10 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainer *container) + + bool vfio_devices_all_device_dirty_tracking(VFIOContainer *container) + { ++ VFIOContainerBase *bcontainer = &container->bcontainer; + VFIODevice *vbasedev; + +- QLIST_FOREACH(vbasedev, &container->device_list, container_next) { ++ QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { + if (!vbasedev->dirty_pages_supported) { + return false; + } +@@ -222,13 +224,14 @@ bool vfio_devices_all_device_dirty_tracking(VFIOContainer *container) + */ + bool vfio_devices_all_running_and_mig_active(VFIOContainer *container) + { ++ VFIOContainerBase *bcontainer = &container->bcontainer; + VFIODevice *vbasedev; + + if (!migration_is_active(migrate_get_current())) { + return false; + } + +- QLIST_FOREACH(vbasedev, &container->device_list, container_next) { ++ QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { + VFIOMigration *migration = vbasedev->migration; + + if (!migration) { +@@ -833,12 +836,13 @@ static bool vfio_section_is_vfio_pci(MemoryRegionSection *section, + VFIOContainer *container) + { + VFIOPCIDevice *pcidev; ++ VFIOContainerBase *bcontainer = &container->bcontainer; + VFIODevice *vbasedev; + Object *owner; + + owner = memory_region_owner(section->mr); + +- QLIST_FOREACH(vbasedev, &container->device_list, container_next) { ++ QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { + if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) { + continue; + } +@@ -939,13 +943,14 @@ static void vfio_devices_dma_logging_stop(VFIOContainer *container) + uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature), + sizeof(uint64_t))] = {}; + struct vfio_device_feature *feature = (struct vfio_device_feature *)buf; ++ VFIOContainerBase *bcontainer = &container->bcontainer; + VFIODevice *vbasedev; + + feature->argsz = sizeof(buf); + feature->flags = VFIO_DEVICE_FEATURE_SET | + 
VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP; + +- QLIST_FOREACH(vbasedev, &container->device_list, container_next) { ++ QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { + if (!vbasedev->dirty_tracking) { + continue; + } +@@ -1036,6 +1041,7 @@ static int vfio_devices_dma_logging_start(VFIOContainer *container) + { + struct vfio_device_feature *feature; + VFIODirtyRanges ranges; ++ VFIOContainerBase *bcontainer = &container->bcontainer; + VFIODevice *vbasedev; + int ret = 0; + +@@ -1046,7 +1052,7 @@ static int vfio_devices_dma_logging_start(VFIOContainer *container) + return -errno; + } + +- QLIST_FOREACH(vbasedev, &container->device_list, container_next) { ++ QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { + if (vbasedev->dirty_tracking) { + continue; + } +@@ -1139,10 +1145,11 @@ int vfio_devices_query_dirty_bitmap(VFIOContainer *container, + VFIOBitmap *vbmap, hwaddr iova, + hwaddr size) + { ++ VFIOContainerBase *bcontainer = &container->bcontainer; + VFIODevice *vbasedev; + int ret; + +- QLIST_FOREACH(vbasedev, &container->device_list, container_next) { ++ QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { + ret = vfio_device_dma_logging_report(vbasedev, iova, size, + vbmap->bitmap); + if (ret) { +diff --git a/hw/vfio/container.c b/hw/vfio/container.c +index cf373e42ef..74d236ddee 100644 +--- a/hw/vfio/container.c ++++ b/hw/vfio/container.c +@@ -1001,7 +1001,7 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev, + int groupid = vfio_device_groupid(vbasedev, errp); + VFIODevice *vbasedev_iter; + VFIOGroup *group; +- VFIOContainer *container; ++ VFIOContainerBase *bcontainer; + int ret; + + if (groupid < 0) { +@@ -1028,9 +1028,9 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev, + return ret; + } + +- container = group->container; +- vbasedev->container = container; +- QLIST_INSERT_HEAD(&container->device_list, vbasedev, container_next); ++ bcontainer = &group->container->bcontainer; ++ 
vbasedev->bcontainer = bcontainer; ++ QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next); + QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next); + + return ret; +@@ -1040,13 +1040,13 @@ void vfio_detach_device(VFIODevice *vbasedev) + { + VFIOGroup *group = vbasedev->group; + +- if (!vbasedev->container) { ++ if (!vbasedev->bcontainer) { + return; + } + + QLIST_REMOVE(vbasedev, global_next); + QLIST_REMOVE(vbasedev, container_next); +- vbasedev->container = NULL; ++ vbasedev->bcontainer = NULL; + trace_vfio_detach_device(vbasedev->name, group->groupid); + vfio_put_base_device(vbasedev); + vfio_put_group(group); +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index af0ef9042d..e27854228c 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -100,7 +100,6 @@ typedef struct VFIOContainer { + QLIST_HEAD(, VFIOGroup) group_list; + QLIST_HEAD(, VFIORamDiscardListener) vrdl_list; + QLIST_HEAD(, VFIODMARange) dma_list; +- QLIST_HEAD(, VFIODevice) device_list; + GList *iova_ranges; + } VFIOContainer; + +@@ -128,7 +127,7 @@ typedef struct VFIODevice { + QLIST_ENTRY(VFIODevice) container_next; + QLIST_ENTRY(VFIODevice) global_next; + struct VFIOGroup *group; +- VFIOContainer *container; ++ VFIOContainerBase *bcontainer; + char *sysfsdev; + char *name; + DeviceState *dev; +diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h +index f244f003d0..7090962496 100644 +--- a/include/hw/vfio/vfio-container-base.h ++++ b/include/hw/vfio/vfio-container-base.h +@@ -39,6 +39,7 @@ typedef struct VFIOContainerBase { + bool dirty_pages_supported; + QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; + QLIST_ENTRY(VFIOContainerBase) next; ++ QLIST_HEAD(, VFIODevice) device_list; + } VFIOContainerBase; + + typedef struct VFIOGuestIOMMU { +-- +2.41.0.windows.1 + diff --git a/vfio-container-Move-pgsizes-and-dma_max_mappings-to-.patch 
b/vfio-container-Move-pgsizes-and-dma_max_mappings-to-.patch new file mode 100644 index 0000000000000000000000000000000000000000..86b8b1f36355f82f11dc7ac4df5ca77153adbb07 --- /dev/null +++ b/vfio-container-Move-pgsizes-and-dma_max_mappings-to-.patch @@ -0,0 +1,234 @@ +From 961614f6c997caf632ce37ead96b301ec47b1847 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Sat, 11 Jan 2025 10:52:27 +0800 +Subject: [PATCH] vfio/container: Move pgsizes and dma_max_mappings to base + container +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +No functional change intended. + +Signed-off-by: Eric Auger +Signed-off-by: Yi Liu +Signed-off-by: Yi Sun +Signed-off-by: Zhenzhong Duan +Reviewed-by: Cédric Le Goater +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhou Wang +--- + hw/vfio/common.c | 17 +++++++++-------- + hw/vfio/container-base.c | 1 + + hw/vfio/container.c | 11 +++++------ + hw/vfio/spapr.c | 10 ++++++---- + include/hw/vfio/vfio-common.h | 2 -- + include/hw/vfio/vfio-container-base.h | 2 ++ + 6 files changed, 23 insertions(+), 20 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index b663d0bcc0..fd6249c290 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -401,6 +401,7 @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl, + static void vfio_register_ram_discard_listener(VFIOContainer *container, + MemoryRegionSection *section) + { ++ VFIOContainerBase *bcontainer = &container->bcontainer; + RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); + VFIORamDiscardListener *vrdl; + +@@ -419,8 +420,8 @@ static void vfio_register_ram_discard_listener(VFIOContainer *container, + section->mr); + + g_assert(vrdl->granularity && is_power_of_2(vrdl->granularity)); +- g_assert(container->pgsizes && +- vrdl->granularity >= 1ULL << ctz64(container->pgsizes)); ++ g_assert(bcontainer->pgsizes && ++ vrdl->granularity >= 1ULL << ctz64(bcontainer->pgsizes)); + + 
ram_discard_listener_init(&vrdl->listener, + vfio_ram_discard_notify_populate, +@@ -441,7 +442,7 @@ static void vfio_register_ram_discard_listener(VFIOContainer *container, + * number of sections in the address space we could have over time, + * also consuming DMA mappings. + */ +- if (container->dma_max_mappings) { ++ if (bcontainer->dma_max_mappings) { + unsigned int vrdl_count = 0, vrdl_mappings = 0, max_memslots = 512; + + #ifdef CONFIG_KVM +@@ -462,11 +463,11 @@ static void vfio_register_ram_discard_listener(VFIOContainer *container, + } + + if (vrdl_mappings + max_memslots - vrdl_count > +- container->dma_max_mappings) { ++ bcontainer->dma_max_mappings) { + warn_report("%s: possibly running out of DMA mappings. E.g., try" + " increasing the 'block-size' of virtio-mem devies." + " Maximum possible DMA mappings: %d, Maximum possible" +- " memslots: %d", __func__, container->dma_max_mappings, ++ " memslots: %d", __func__, bcontainer->dma_max_mappings, + max_memslots); + } + } +@@ -626,7 +627,7 @@ static void vfio_listener_region_add(MemoryListener *listener, + iommu_idx); + + ret = memory_region_iommu_set_page_size_mask(giommu->iommu_mr, +- container->pgsizes, ++ bcontainer->pgsizes, + &err); + if (ret) { + g_free(giommu); +@@ -675,7 +676,7 @@ static void vfio_listener_region_add(MemoryListener *listener, + llsize = int128_sub(llend, int128_make64(iova)); + + if (memory_region_is_ram_device(section->mr)) { +- hwaddr pgmask = (1ULL << ctz64(container->pgsizes)) - 1; ++ hwaddr pgmask = (1ULL << ctz64(bcontainer->pgsizes)) - 1; + + if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) { + trace_vfio_listener_region_add_no_dma_map( +@@ -777,7 +778,7 @@ static void vfio_listener_region_del(MemoryListener *listener, + if (memory_region_is_ram_device(section->mr)) { + hwaddr pgmask; + +- pgmask = (1ULL << ctz64(container->pgsizes)) - 1; ++ pgmask = (1ULL << ctz64(bcontainer->pgsizes)) - 1; + try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask)); + } else 
if (memory_region_has_ram_discard_manager(section->mr)) { + vfio_unregister_ram_discard_listener(container, section); +diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c +index 5d654ae172..dcce111349 100644 +--- a/hw/vfio/container-base.c ++++ b/hw/vfio/container-base.c +@@ -52,6 +52,7 @@ void vfio_container_init(VFIOContainerBase *bcontainer, VFIOAddressSpace *space, + bcontainer->ops = ops; + bcontainer->space = space; + bcontainer->dirty_pages_supported = false; ++ bcontainer->dma_max_mappings = 0; + QLIST_INIT(&bcontainer->giommu_list); + } + +diff --git a/hw/vfio/container.c b/hw/vfio/container.c +index 9a542368ab..116a9e1e73 100644 +--- a/hw/vfio/container.c ++++ b/hw/vfio/container.c +@@ -196,7 +196,7 @@ static int vfio_legacy_dma_unmap(VFIOContainerBase *bcontainer, hwaddr iova, + if (errno == EINVAL && unmap.size && !(unmap.iova + unmap.size) && + container->iommu_type == VFIO_TYPE1v2_IOMMU) { + trace_vfio_legacy_dma_unmap_overflow_workaround(); +- unmap.size -= 1ULL << ctz64(container->pgsizes); ++ unmap.size -= 1ULL << ctz64(bcontainer->pgsizes); + continue; + } + error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno)); +@@ -652,7 +652,6 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, + container = g_malloc0(sizeof(*container)); + container->fd = fd; + container->error = NULL; +- container->dma_max_mappings = 0; + container->iova_ranges = NULL; + QLIST_INIT(&container->vrdl_list); + QLIST_INIT(&container->dma_list); +@@ -684,13 +683,13 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, + } + + if (info->flags & VFIO_IOMMU_INFO_PGSIZES) { +- container->pgsizes = info->iova_pgsizes; ++ bcontainer->pgsizes = info->iova_pgsizes; + } else { +- container->pgsizes = qemu_real_host_page_size(); ++ bcontainer->pgsizes = qemu_real_host_page_size(); + } + +- if (!vfio_get_info_dma_avail(info, &container->dma_max_mappings)) { +- container->dma_max_mappings = 65535; ++ if (!vfio_get_info_dma_avail(info, 
&bcontainer->dma_max_mappings)) { ++ bcontainer->dma_max_mappings = 65535; + } + + vfio_get_info_iova_range(info, container); +diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c +index 83da2f7ec2..4f76bdd3ca 100644 +--- a/hw/vfio/spapr.c ++++ b/hw/vfio/spapr.c +@@ -226,6 +226,7 @@ static int vfio_spapr_create_window(VFIOContainer *container, + hwaddr *pgsize) + { + int ret = 0; ++ VFIOContainerBase *bcontainer = &container->bcontainer; + IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr); + uint64_t pagesize = memory_region_iommu_get_min_page_size(iommu_mr), pgmask; + unsigned entries, bits_total, bits_per_level, max_levels; +@@ -239,13 +240,13 @@ static int vfio_spapr_create_window(VFIOContainer *container, + if (pagesize > rampagesize) { + pagesize = rampagesize; + } +- pgmask = container->pgsizes & (pagesize | (pagesize - 1)); ++ pgmask = bcontainer->pgsizes & (pagesize | (pagesize - 1)); + pagesize = pgmask ? (1ULL << (63 - clz64(pgmask))) : 0; + if (!pagesize) { + error_report("Host doesn't support page size 0x%"PRIx64 + ", the supported mask is 0x%lx", + memory_region_iommu_get_min_page_size(iommu_mr), +- container->pgsizes); ++ bcontainer->pgsizes); + return -EINVAL; + } + +@@ -421,6 +422,7 @@ void vfio_container_del_section_window(VFIOContainer *container, + + int vfio_spapr_container_init(VFIOContainer *container, Error **errp) + { ++ VFIOContainerBase *bcontainer = &container->bcontainer; + struct vfio_iommu_spapr_tce_info info; + bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU; + int ret, fd = container->fd; +@@ -461,7 +463,7 @@ int vfio_spapr_container_init(VFIOContainer *container, Error **errp) + } + + if (v2) { +- container->pgsizes = info.ddw.pgsizes; ++ bcontainer->pgsizes = info.ddw.pgsizes; + /* + * There is a default window in just created container. 
+ * To make region_add/del simpler, we better remove this +@@ -476,7 +478,7 @@ int vfio_spapr_container_init(VFIOContainer *container, Error **errp) + } + } else { + /* The default table uses 4K pages */ +- container->pgsizes = 0x1000; ++ bcontainer->pgsizes = 0x1000; + vfio_host_win_add(container, info.dma32_window_start, + info.dma32_window_start + + info.dma32_window_size - 1, +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index 0295ede7ba..3046287070 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -94,8 +94,6 @@ typedef struct VFIOContainer { + bool dirty_log_manual_clear; + uint64_t dirty_pgsizes; + uint64_t max_dirty_bitmap_size; +- unsigned long pgsizes; +- unsigned int dma_max_mappings; + QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; + QLIST_HEAD(, VFIOGroup) group_list; + QLIST_HEAD(, VFIORamDiscardListener) vrdl_list; +diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h +index 7090962496..85ec7e1a56 100644 +--- a/include/hw/vfio/vfio-container-base.h ++++ b/include/hw/vfio/vfio-container-base.h +@@ -36,6 +36,8 @@ typedef struct VFIOAddressSpace { + typedef struct VFIOContainerBase { + const VFIOIOMMUOps *ops; + VFIOAddressSpace *space; ++ unsigned long pgsizes; ++ unsigned int dma_max_mappings; + bool dirty_pages_supported; + QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; + QLIST_ENTRY(VFIOContainerBase) next; +-- +2.41.0.windows.1 + diff --git a/vfio-container-Move-space-field-to-base-container.patch b/vfio-container-Move-space-field-to-base-container.patch new file mode 100644 index 0000000000000000000000000000000000000000..94bd76425db5c667d60330cc6170de1ff899f5c7 --- /dev/null +++ b/vfio-container-Move-space-field-to-base-container.patch @@ -0,0 +1,268 @@ +From 97979ab4d92d0006ffefb586675b6110e5b7a746 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Sat, 11 Jan 2025 10:52:23 +0800 +Subject: [PATCH] vfio/container: Move space field to base container 
+MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Move the space field to the base object. Also the VFIOAddressSpace +now contains a list of base containers. + +No functional change intended. + +Modify hw/vfio/container.c: +vfio_connect_container->shared_memory_listener_register in kvm_csv3_enabled +during backporting. + +Signed-off-by: Eric Auger +Signed-off-by: Yi Liu +Signed-off-by: Yi Sun +Signed-off-by: Zhenzhong Duan +Reviewed-by: Cédric Le Goater +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhou Wang +--- + hw/ppc/spapr_pci_vfio.c | 10 +++++----- + hw/vfio/common.c | 4 ++-- + hw/vfio/container-base.c | 6 +++++- + hw/vfio/container.c | 20 +++++++++----------- + include/hw/vfio/vfio-common.h | 8 -------- + include/hw/vfio/vfio-container-base.h | 9 +++++++++ + 6 files changed, 30 insertions(+), 27 deletions(-) + +diff --git a/hw/ppc/spapr_pci_vfio.c b/hw/ppc/spapr_pci_vfio.c +index f283f7e38d..d1d07bec46 100644 +--- a/hw/ppc/spapr_pci_vfio.c ++++ b/hw/ppc/spapr_pci_vfio.c +@@ -84,27 +84,27 @@ static int vfio_eeh_container_op(VFIOContainer *container, uint32_t op) + static VFIOContainer *vfio_eeh_as_container(AddressSpace *as) + { + VFIOAddressSpace *space = vfio_get_address_space(as); +- VFIOContainer *container = NULL; ++ VFIOContainerBase *bcontainer = NULL; + + if (QLIST_EMPTY(&space->containers)) { + /* No containers to act on */ + goto out; + } + +- container = QLIST_FIRST(&space->containers); ++ bcontainer = QLIST_FIRST(&space->containers); + +- if (QLIST_NEXT(container, next)) { ++ if (QLIST_NEXT(bcontainer, next)) { + /* + * We don't yet have logic to synchronize EEH state across + * multiple containers + */ +- container = NULL; ++ bcontainer = NULL; + goto out; + } + + out: + vfio_put_address_space(space); +- return container; ++ return container_of(bcontainer, VFIOContainer, bcontainer); + } + + static bool vfio_eeh_as_ok(AddressSpace *as) +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index 
b8007b22c3..2f3f66991a 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -145,7 +145,7 @@ void vfio_unblock_multiple_devices_migration(void) + + bool vfio_viommu_preset(VFIODevice *vbasedev) + { +- return vbasedev->container->space->as != &address_space_memory; ++ return vbasedev->container->bcontainer.space->as != &address_space_memory; + } + + static void vfio_set_migration_error(int err) +@@ -922,7 +922,7 @@ static void vfio_dirty_tracking_init(VFIOContainer *container, + dirty.container = container; + + memory_listener_register(&dirty.listener, +- container->space->as); ++ container->bcontainer.space->as); + + *ranges = dirty.ranges; + +diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c +index 20bcb9669a..3933391e0d 100644 +--- a/hw/vfio/container-base.c ++++ b/hw/vfio/container-base.c +@@ -31,9 +31,11 @@ int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, + return bcontainer->ops->dma_unmap(bcontainer, iova, size, iotlb); + } + +-void vfio_container_init(VFIOContainerBase *bcontainer, const VFIOIOMMUOps *ops) ++void vfio_container_init(VFIOContainerBase *bcontainer, VFIOAddressSpace *space, ++ const VFIOIOMMUOps *ops) + { + bcontainer->ops = ops; ++ bcontainer->space = space; + QLIST_INIT(&bcontainer->giommu_list); + } + +@@ -41,6 +43,8 @@ void vfio_container_destroy(VFIOContainerBase *bcontainer) + { + VFIOGuestIOMMU *giommu, *tmp; + ++ QLIST_REMOVE(bcontainer, next); ++ + QLIST_FOREACH_SAFE(giommu, &bcontainer->giommu_list, giommu_next, tmp) { + memory_region_unregister_iommu_notifier( + MEMORY_REGION(giommu->iommu_mr), &giommu->n); +diff --git a/hw/vfio/container.c b/hw/vfio/container.c +index 03791601d0..b7ab0d7323 100644 +--- a/hw/vfio/container.c ++++ b/hw/vfio/container.c +@@ -607,7 +607,8 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, + * details once we know which type of IOMMU we are using. 
+ */ + +- QLIST_FOREACH(container, &space->containers, next) { ++ QLIST_FOREACH(bcontainer, &space->containers, next) { ++ container = container_of(bcontainer, VFIOContainer, bcontainer); + if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) { + ret = vfio_ram_block_discard_disable(container, true); + if (ret) { +@@ -643,7 +644,6 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, + } + + container = g_malloc0(sizeof(*container)); +- container->space = space; + container->fd = fd; + container->error = NULL; + container->dirty_pages_supported = false; +@@ -652,7 +652,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, + QLIST_INIT(&container->vrdl_list); + QLIST_INIT(&container->dma_list); + bcontainer = &container->bcontainer; +- vfio_container_init(bcontainer, &vfio_legacy_ops); ++ vfio_container_init(bcontainer, space, &vfio_legacy_ops); + + ret = vfio_init_container(container, group->fd, errp); + if (ret) { +@@ -708,7 +708,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, + vfio_kvm_device_add_group(group); + + QLIST_INIT(&container->group_list); +- QLIST_INSERT_HEAD(&space->containers, container, next); ++ QLIST_INSERT_HEAD(&space->containers, bcontainer, next); + + group->container = container; + QLIST_INSERT_HEAD(&container->group_list, group, container_next); +@@ -717,9 +717,9 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, + + if (kvm_csv3_enabled()) { + shared_memory_listener_register(&container->listener, +- container->space->as); ++ bcontainer->space->as); + } else { +- memory_listener_register(&container->listener, container->space->as); ++ memory_listener_register(&container->listener, bcontainer->space->as); + } + + if (container->error) { +@@ -734,7 +734,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, + return 0; + listener_release_exit: + QLIST_REMOVE(group, container_next); +- QLIST_REMOVE(container, next); ++ 
QLIST_REMOVE(bcontainer, next); + vfio_kvm_device_del_group(group); + if (kvm_csv3_enabled()) { + shared_memory_listener_unregister(); +@@ -792,9 +792,7 @@ static void vfio_disconnect_container(VFIOGroup *group) + } + + if (QLIST_EMPTY(&container->group_list)) { +- VFIOAddressSpace *space = container->space; +- +- QLIST_REMOVE(container, next); ++ VFIOAddressSpace *space = bcontainer->space; + + vfio_container_destroy(bcontainer); + +@@ -815,7 +813,7 @@ static VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp) + QLIST_FOREACH(group, &vfio_group_list, next) { + if (group->groupid == groupid) { + /* Found it. Now is it already in the right context? */ +- if (group->container->space->as == as) { ++ if (group->container->bcontainer.space->as == as) { + return group; + } else { + error_setg(errp, "group %d used in multiple address spaces", +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index 6f02952ff6..31c9df4b03 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -73,12 +73,6 @@ typedef struct VFIOMigration { + bool initial_data_sent; + } VFIOMigration; + +-typedef struct VFIOAddressSpace { +- AddressSpace *as; +- QLIST_HEAD(, VFIOContainer) containers; +- QLIST_ENTRY(VFIOAddressSpace) list; +-} VFIOAddressSpace; +- + struct VFIOGroup; + + typedef struct VFIODMARange { +@@ -91,7 +85,6 @@ typedef struct VFIODMARange { + + typedef struct VFIOContainer { + VFIOContainerBase bcontainer; +- VFIOAddressSpace *space; + int fd; /* /dev/vfio/vfio, empowered by the attached groups */ + MemoryListener listener; + MemoryListener prereg_listener; +@@ -108,7 +101,6 @@ typedef struct VFIOContainer { + QLIST_HEAD(, VFIOGroup) group_list; + QLIST_HEAD(, VFIORamDiscardListener) vrdl_list; + QLIST_HEAD(, VFIODMARange) dma_list; +- QLIST_ENTRY(VFIOContainer) next; + QLIST_HEAD(, VFIODevice) device_list; + GList *iova_ranges; + } VFIOContainer; +diff --git a/include/hw/vfio/vfio-container-base.h 
b/include/hw/vfio/vfio-container-base.h +index a11aec5755..c7cc6ec9c5 100644 +--- a/include/hw/vfio/vfio-container-base.h ++++ b/include/hw/vfio/vfio-container-base.h +@@ -24,12 +24,20 @@ typedef struct { + hwaddr pages; + } VFIOBitmap; + ++typedef struct VFIOAddressSpace { ++ AddressSpace *as; ++ QLIST_HEAD(, VFIOContainerBase) containers; ++ QLIST_ENTRY(VFIOAddressSpace) list; ++} VFIOAddressSpace; ++ + /* + * This is the base object for vfio container backends + */ + typedef struct VFIOContainerBase { + const VFIOIOMMUOps *ops; ++ VFIOAddressSpace *space; + QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; ++ QLIST_ENTRY(VFIOContainerBase) next; + } VFIOContainerBase; + + typedef struct VFIOGuestIOMMU { +@@ -48,6 +56,7 @@ int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, + IOMMUTLBEntry *iotlb); + + void vfio_container_init(VFIOContainerBase *bcontainer, ++ VFIOAddressSpace *space, + const VFIOIOMMUOps *ops); + void vfio_container_destroy(VFIOContainerBase *bcontainer); + +-- +2.41.0.windows.1 + diff --git a/vfio-container-Move-vrdl_list-to-base-container.patch b/vfio-container-Move-vrdl_list-to-base-container.patch new file mode 100644 index 0000000000000000000000000000000000000000..8b265742929605476edb2038f9bc7bac0b49de22 --- /dev/null +++ b/vfio-container-Move-vrdl_list-to-base-container.patch @@ -0,0 +1,248 @@ +From d0234f18616cfe9a43287ba75e4788a10166a526 Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Sat, 11 Jan 2025 10:52:28 +0800 +Subject: [PATCH] vfio/container: Move vrdl_list to base container +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +No functional change intended. 
+ +Signed-off-by: Zhenzhong Duan +Reviewed-by: Cédric Le Goater +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhou Wang +--- + hw/vfio/common.c | 38 +++++++++++++-------------- + hw/vfio/container-base.c | 1 + + hw/vfio/container.c | 1 - + include/hw/vfio/vfio-common.h | 11 -------- + include/hw/vfio/vfio-container-base.h | 11 ++++++++ + 5 files changed, 31 insertions(+), 31 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index fd6249c290..e9a19209ab 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -351,13 +351,13 @@ static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl, + { + VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener, + listener); ++ VFIOContainerBase *bcontainer = vrdl->bcontainer; + const hwaddr size = int128_get64(section->size); + const hwaddr iova = section->offset_within_address_space; + int ret; + + /* Unmap with a single call. */ +- ret = vfio_container_dma_unmap(&vrdl->container->bcontainer, +- iova, size , NULL); ++ ret = vfio_container_dma_unmap(bcontainer, iova, size , NULL); + if (ret) { + error_report("%s: vfio_container_dma_unmap() failed: %s", __func__, + strerror(-ret)); +@@ -369,6 +369,7 @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl, + { + VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener, + listener); ++ VFIOContainerBase *bcontainer = vrdl->bcontainer; + const hwaddr end = section->offset_within_region + + int128_get64(section->size); + hwaddr start, next, iova; +@@ -387,8 +388,8 @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl, + section->offset_within_address_space; + vaddr = memory_region_get_ram_ptr(section->mr) + start; + +- ret = vfio_container_dma_map(&vrdl->container->bcontainer, iova, +- next - start, vaddr, section->readonly); ++ ret = vfio_container_dma_map(bcontainer, iova, next - start, ++ vaddr, section->readonly); + if (ret) { + /* Rollback */ + vfio_ram_discard_notify_discard(rdl, 
section); +@@ -398,10 +399,9 @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl, + return 0; + } + +-static void vfio_register_ram_discard_listener(VFIOContainer *container, ++static void vfio_register_ram_discard_listener(VFIOContainerBase *bcontainer, + MemoryRegionSection *section) + { +- VFIOContainerBase *bcontainer = &container->bcontainer; + RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); + VFIORamDiscardListener *vrdl; + +@@ -412,7 +412,7 @@ static void vfio_register_ram_discard_listener(VFIOContainer *container, + g_assert(QEMU_IS_ALIGNED(int128_get64(section->size), TARGET_PAGE_SIZE)); + + vrdl = g_new0(VFIORamDiscardListener, 1); +- vrdl->container = container; ++ vrdl->bcontainer = bcontainer; + vrdl->mr = section->mr; + vrdl->offset_within_address_space = section->offset_within_address_space; + vrdl->size = int128_get64(section->size); +@@ -427,7 +427,7 @@ static void vfio_register_ram_discard_listener(VFIOContainer *container, + vfio_ram_discard_notify_populate, + vfio_ram_discard_notify_discard, true); + ram_discard_manager_register_listener(rdm, &vrdl->listener, section); +- QLIST_INSERT_HEAD(&container->vrdl_list, vrdl, next); ++ QLIST_INSERT_HEAD(&bcontainer->vrdl_list, vrdl, next); + + /* + * Sanity-check if we have a theoretically problematic setup where we could +@@ -451,7 +451,7 @@ static void vfio_register_ram_discard_listener(VFIOContainer *container, + } + #endif + +- QLIST_FOREACH(vrdl, &container->vrdl_list, next) { ++ QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) { + hwaddr start, end; + + start = QEMU_ALIGN_DOWN(vrdl->offset_within_address_space, +@@ -473,13 +473,13 @@ static void vfio_register_ram_discard_listener(VFIOContainer *container, + } + } + +-static void vfio_unregister_ram_discard_listener(VFIOContainer *container, ++static void vfio_unregister_ram_discard_listener(VFIOContainerBase *bcontainer, + MemoryRegionSection *section) + { + RamDiscardManager *rdm = 
memory_region_get_ram_discard_manager(section->mr); + VFIORamDiscardListener *vrdl = NULL; + +- QLIST_FOREACH(vrdl, &container->vrdl_list, next) { ++ QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) { + if (vrdl->mr == section->mr && + vrdl->offset_within_address_space == + section->offset_within_address_space) { +@@ -663,7 +663,7 @@ static void vfio_listener_region_add(MemoryListener *listener, + * about changes. + */ + if (memory_region_has_ram_discard_manager(section->mr)) { +- vfio_register_ram_discard_listener(container, section); ++ vfio_register_ram_discard_listener(bcontainer, section); + return; + } + +@@ -781,7 +781,7 @@ static void vfio_listener_region_del(MemoryListener *listener, + pgmask = (1ULL << ctz64(bcontainer->pgsizes)) - 1; + try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask)); + } else if (memory_region_has_ram_discard_manager(section->mr)) { +- vfio_unregister_ram_discard_listener(container, section); ++ vfio_unregister_ram_discard_listener(bcontainer, section); + /* Unregistering will trigger an unmap. */ + try_unmap = false; + } +@@ -1267,17 +1267,17 @@ static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section, + * Sync the whole mapped region (spanning multiple individual mappings) + * in one go. 
+ */ +- return vfio_get_dirty_bitmap(&vrdl->container->bcontainer, iova, size, +- ram_addr); ++ return vfio_get_dirty_bitmap(vrdl->bcontainer, iova, size, ram_addr); + } + +-static int vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainer *container, +- MemoryRegionSection *section) ++static int ++vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainerBase *bcontainer, ++ MemoryRegionSection *section) + { + RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); + VFIORamDiscardListener *vrdl = NULL; + +- QLIST_FOREACH(vrdl, &container->vrdl_list, next) { ++ QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) { + if (vrdl->mr == section->mr && + vrdl->offset_within_address_space == + section->offset_within_address_space) { +@@ -1331,7 +1331,7 @@ static int vfio_sync_dirty_bitmap(VFIOContainer *container, + } + return 0; + } else if (memory_region_has_ram_discard_manager(section->mr)) { +- return vfio_sync_ram_discard_listener_dirty_bitmap(container, section); ++ return vfio_sync_ram_discard_listener_dirty_bitmap(bcontainer, section); + } + + ram_addr = memory_region_get_ram_addr(section->mr) + +diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c +index dcce111349..584eee4ba1 100644 +--- a/hw/vfio/container-base.c ++++ b/hw/vfio/container-base.c +@@ -54,6 +54,7 @@ void vfio_container_init(VFIOContainerBase *bcontainer, VFIOAddressSpace *space, + bcontainer->dirty_pages_supported = false; + bcontainer->dma_max_mappings = 0; + QLIST_INIT(&bcontainer->giommu_list); ++ QLIST_INIT(&bcontainer->vrdl_list); + } + + void vfio_container_destroy(VFIOContainerBase *bcontainer) +diff --git a/hw/vfio/container.c b/hw/vfio/container.c +index 116a9e1e73..023f220c93 100644 +--- a/hw/vfio/container.c ++++ b/hw/vfio/container.c +@@ -653,7 +653,6 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, + container->fd = fd; + container->error = NULL; + container->iova_ranges = NULL; +- QLIST_INIT(&container->vrdl_list); + 
QLIST_INIT(&container->dma_list); + bcontainer = &container->bcontainer; + vfio_container_init(bcontainer, space, &vfio_legacy_ops); +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index 3046287070..0174b767ca 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -96,21 +96,10 @@ typedef struct VFIOContainer { + uint64_t max_dirty_bitmap_size; + QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; + QLIST_HEAD(, VFIOGroup) group_list; +- QLIST_HEAD(, VFIORamDiscardListener) vrdl_list; + QLIST_HEAD(, VFIODMARange) dma_list; + GList *iova_ranges; + } VFIOContainer; + +-typedef struct VFIORamDiscardListener { +- VFIOContainer *container; +- MemoryRegion *mr; +- hwaddr offset_within_address_space; +- hwaddr size; +- uint64_t granularity; +- RamDiscardListener listener; +- QLIST_ENTRY(VFIORamDiscardListener) next; +-} VFIORamDiscardListener; +- + typedef struct VFIOHostDMAWindow { + hwaddr min_iova; + hwaddr max_iova; +diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h +index 85ec7e1a56..8e05b5ac5a 100644 +--- a/include/hw/vfio/vfio-container-base.h ++++ b/include/hw/vfio/vfio-container-base.h +@@ -40,6 +40,7 @@ typedef struct VFIOContainerBase { + unsigned int dma_max_mappings; + bool dirty_pages_supported; + QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; ++ QLIST_HEAD(, VFIORamDiscardListener) vrdl_list; + QLIST_ENTRY(VFIOContainerBase) next; + QLIST_HEAD(, VFIODevice) device_list; + } VFIOContainerBase; +@@ -52,6 +53,16 @@ typedef struct VFIOGuestIOMMU { + QLIST_ENTRY(VFIOGuestIOMMU) giommu_next; + } VFIOGuestIOMMU; + ++typedef struct VFIORamDiscardListener { ++ VFIOContainerBase *bcontainer; ++ MemoryRegion *mr; ++ hwaddr offset_within_address_space; ++ hwaddr size; ++ uint64_t granularity; ++ RamDiscardListener listener; ++ QLIST_ENTRY(VFIORamDiscardListener) next; ++} VFIORamDiscardListener; ++ + int vfio_container_dma_map(VFIOContainerBase *bcontainer, + hwaddr iova, 
ram_addr_t size, + void *vaddr, bool readonly); +-- +2.41.0.windows.1 + diff --git a/vfio-container-Switch-to-IOMMU-BE-set_dirty_page_tra.patch b/vfio-container-Switch-to-IOMMU-BE-set_dirty_page_tra.patch new file mode 100644 index 0000000000000000000000000000000000000000..db1c212bfcdd66e6c589a1402b13d3a5605adaba --- /dev/null +++ b/vfio-container-Switch-to-IOMMU-BE-set_dirty_page_tra.patch @@ -0,0 +1,240 @@ +From c8c17aaddeee1e5002fc4bde7245719db75d4021 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Sat, 11 Jan 2025 10:52:24 +0800 +Subject: [PATCH] vfio/container: Switch to IOMMU BE + set_dirty_page_tracking/query_dirty_bitmap API +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +dirty_pages_supported field is also moved to the base container + +No functional change intended. + +Modify vfio_listener_log_clear during backporting. + +Signed-off-by: Eric Auger +Signed-off-by: Yi Liu +Signed-off-by: Yi Sun +Signed-off-by: Zhenzhong Duan +Reviewed-by: Cédric Le Goater +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhou Wang +--- + hw/vfio/common.c | 14 +++++++++----- + hw/vfio/container-base.c | 16 ++++++++++++++++ + hw/vfio/container.c | 21 ++++++++++++++------- + include/hw/vfio/vfio-common.h | 5 ----- + include/hw/vfio/vfio-container-base.h | 6 ++++++ + 5 files changed, 45 insertions(+), 17 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index 2f3f66991a..3be6cecc63 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -1079,7 +1079,8 @@ static void vfio_listener_log_global_start(MemoryListener *listener) + if (vfio_devices_all_device_dirty_tracking(container)) { + ret = vfio_devices_dma_logging_start(container); + } else { +- ret = vfio_set_dirty_page_tracking(container, true); ++ ret = vfio_container_set_dirty_page_tracking(&container->bcontainer, ++ true); + } + + if (ret) { +@@ -1097,7 +1098,8 @@ static void vfio_listener_log_global_stop(MemoryListener *listener) + if 
(vfio_devices_all_device_dirty_tracking(container)) { + vfio_devices_dma_logging_stop(container); + } else { +- ret = vfio_set_dirty_page_tracking(container, false); ++ ret = vfio_container_set_dirty_page_tracking(&container->bcontainer, ++ false); + } + + if (ret) { +@@ -1166,7 +1168,8 @@ int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, + VFIODMARange *qrange; + int ret; + +- if (!container->dirty_pages_supported && !all_device_dirty_tracking) { ++ if (!container->bcontainer.dirty_pages_supported && ++ !all_device_dirty_tracking) { + cpu_physical_memory_set_dirty_range(ram_addr, size, + tcg_enabled() ? DIRTY_CLIENTS_ALL : + DIRTY_CLIENTS_NOCODE); +@@ -1187,7 +1190,8 @@ int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, + if (all_device_dirty_tracking) { + ret = vfio_devices_query_dirty_bitmap(container, &vbmap, iova, size); + } else { +- ret = vfio_query_dirty_bitmap(container, &vbmap, iova, size); ++ ret = vfio_container_query_dirty_bitmap(&container->bcontainer, &vbmap, ++ iova, size); + } + + if (ret) { +@@ -1480,7 +1484,7 @@ static void vfio_listener_log_clear(MemoryListener *listener, + VFIOContainer *container = container_of(listener, VFIOContainer, listener); + + if (vfio_listener_skipped_section(section) || +- !container->dirty_pages_supported) { ++ !container->bcontainer.dirty_pages_supported) { + return; + } + +diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c +index 3933391e0d..5d654ae172 100644 +--- a/hw/vfio/container-base.c ++++ b/hw/vfio/container-base.c +@@ -31,11 +31,27 @@ int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, + return bcontainer->ops->dma_unmap(bcontainer, iova, size, iotlb); + } + ++int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer, ++ bool start) ++{ ++ g_assert(bcontainer->ops->set_dirty_page_tracking); ++ return bcontainer->ops->set_dirty_page_tracking(bcontainer, start); ++} ++ ++int vfio_container_query_dirty_bitmap(VFIOContainerBase *bcontainer, 
++ VFIOBitmap *vbmap, ++ hwaddr iova, hwaddr size) ++{ ++ g_assert(bcontainer->ops->query_dirty_bitmap); ++ return bcontainer->ops->query_dirty_bitmap(bcontainer, vbmap, iova, size); ++} ++ + void vfio_container_init(VFIOContainerBase *bcontainer, VFIOAddressSpace *space, + const VFIOIOMMUOps *ops) + { + bcontainer->ops = ops; + bcontainer->space = space; ++ bcontainer->dirty_pages_supported = false; + QLIST_INIT(&bcontainer->giommu_list); + } + +diff --git a/hw/vfio/container.c b/hw/vfio/container.c +index b7ab0d7323..cf373e42ef 100644 +--- a/hw/vfio/container.c ++++ b/hw/vfio/container.c +@@ -157,7 +157,7 @@ static int vfio_legacy_dma_unmap(VFIOContainerBase *bcontainer, hwaddr iova, + + if (iotlb && vfio_devices_all_running_and_mig_active(container)) { + if (!vfio_devices_all_device_dirty_tracking(container) && +- container->dirty_pages_supported) { ++ container->bcontainer.dirty_pages_supported) { + return vfio_dma_unmap_bitmap(container, iova, size, iotlb); + } + +@@ -255,14 +255,17 @@ static int vfio_legacy_dma_map(VFIOContainerBase *bcontainer, hwaddr iova, + return -errno; + } + +-int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start) ++static int vfio_legacy_set_dirty_page_tracking(VFIOContainerBase *bcontainer, ++ bool start) + { ++ VFIOContainer *container = container_of(bcontainer, VFIOContainer, ++ bcontainer); + int ret; + struct vfio_iommu_type1_dirty_bitmap dirty = { + .argsz = sizeof(dirty), + }; + +- if (!container->dirty_pages_supported) { ++ if (!bcontainer->dirty_pages_supported) { + return 0; + } + +@@ -282,9 +285,12 @@ int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start) + return ret; + } + +-int vfio_query_dirty_bitmap(VFIOContainer *container, VFIOBitmap *vbmap, +- hwaddr iova, hwaddr size) ++static int vfio_legacy_query_dirty_bitmap(VFIOContainerBase *bcontainer, ++ VFIOBitmap *vbmap, ++ hwaddr iova, hwaddr size) + { ++ VFIOContainer *container = container_of(bcontainer, VFIOContainer, ++ bcontainer); + 
struct vfio_iommu_type1_dirty_bitmap *dbitmap; + struct vfio_iommu_type1_dirty_bitmap_get *range; + int ret; +@@ -528,7 +534,7 @@ static void vfio_get_iommu_info_migration(VFIOContainer *container, + * qemu_real_host_page_size to mark those dirty. + */ + if (cap_mig->pgsize_bitmap & qemu_real_host_page_size()) { +- container->dirty_pages_supported = true; ++ container->bcontainer.dirty_pages_supported = true; + container->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size; + container->dirty_pgsizes = cap_mig->pgsize_bitmap; + } +@@ -646,7 +652,6 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, + container = g_malloc0(sizeof(*container)); + container->fd = fd; + container->error = NULL; +- container->dirty_pages_supported = false; + container->dma_max_mappings = 0; + container->iova_ranges = NULL; + QLIST_INIT(&container->vrdl_list); +@@ -1050,4 +1055,6 @@ void vfio_detach_device(VFIODevice *vbasedev) + const VFIOIOMMUOps vfio_legacy_ops = { + .dma_map = vfio_legacy_dma_map, + .dma_unmap = vfio_legacy_dma_unmap, ++ .set_dirty_page_tracking = vfio_legacy_set_dirty_page_tracking, ++ .query_dirty_bitmap = vfio_legacy_query_dirty_bitmap, + }; +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index 31c9df4b03..af0ef9042d 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -91,7 +91,6 @@ typedef struct VFIOContainer { + unsigned iommu_type; + Error *error; + bool initialized; +- bool dirty_pages_supported; + bool dirty_log_manual_clear; + uint64_t dirty_pgsizes; + uint64_t max_dirty_bitmap_size; +@@ -200,13 +199,9 @@ VFIOAddressSpace *vfio_get_address_space(AddressSpace *as); + void vfio_put_address_space(VFIOAddressSpace *space); + bool vfio_devices_all_running_and_saving(VFIOContainer *container); + +-/* container->fd */ + VFIODMARange *vfio_lookup_match_range(VFIOContainer *container, + hwaddr start_addr, hwaddr size); + void vfio_dma_range_init_dirty_bitmap(VFIODMARange *qrange); 
+-int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start); +-int vfio_query_dirty_bitmap(VFIOContainer *container, VFIOBitmap *vbmap, +- hwaddr iova, hwaddr size); + + /* SPAPR specific */ + int vfio_container_add_section_window(VFIOContainer *container, +diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h +index c7cc6ec9c5..f244f003d0 100644 +--- a/include/hw/vfio/vfio-container-base.h ++++ b/include/hw/vfio/vfio-container-base.h +@@ -36,6 +36,7 @@ typedef struct VFIOAddressSpace { + typedef struct VFIOContainerBase { + const VFIOIOMMUOps *ops; + VFIOAddressSpace *space; ++ bool dirty_pages_supported; + QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; + QLIST_ENTRY(VFIOContainerBase) next; + } VFIOContainerBase; +@@ -54,6 +55,11 @@ int vfio_container_dma_map(VFIOContainerBase *bcontainer, + int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + IOMMUTLBEntry *iotlb); ++int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer, ++ bool start); ++int vfio_container_query_dirty_bitmap(VFIOContainerBase *bcontainer, ++ VFIOBitmap *vbmap, ++ hwaddr iova, hwaddr size); + + void vfio_container_init(VFIOContainerBase *bcontainer, + VFIOAddressSpace *space, +-- +2.41.0.windows.1 + diff --git a/vfio-container-Switch-to-dma_map-unmap-API.patch b/vfio-container-Switch-to-dma_map-unmap-API.patch new file mode 100644 index 0000000000000000000000000000000000000000..5c51add5d94a31cc2a5e42483f7f55818090ce97 --- /dev/null +++ b/vfio-container-Switch-to-dma_map-unmap-API.patch @@ -0,0 +1,295 @@ +From 775cf7c2a0dc34d7163eeea1aab6bfc6cb28be9b Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Sat, 11 Jan 2025 10:52:20 +0800 +Subject: [PATCH] vfio/container: Switch to dma_map|unmap API +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +No functional change intended. 
+ +Signed-off-by: Eric Auger +Signed-off-by: Yi Liu +Signed-off-by: Yi Sun +Signed-off-by: Zhenzhong Duan +Reviewed-by: Cédric Le Goater +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhou Wang +--- + hw/vfio/common.c | 45 +++++++++++++++------------ + hw/vfio/container-base.c | 32 +++++++++++++++++++ + hw/vfio/container.c | 22 ++++++++----- + hw/vfio/meson.build | 1 + + hw/vfio/trace-events | 2 +- + include/hw/vfio/vfio-common.h | 4 --- + include/hw/vfio/vfio-container-base.h | 7 +++++ + 7 files changed, 81 insertions(+), 32 deletions(-) + create mode 100644 hw/vfio/container-base.c + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index e08b147b3d..ea63271167 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -292,7 +292,7 @@ static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, + static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) + { + VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); +- VFIOContainer *container = giommu->container; ++ VFIOContainerBase *bcontainer = &giommu->container->bcontainer; + hwaddr iova = iotlb->iova + giommu->iommu_offset; + void *vaddr; + int ret; +@@ -322,21 +322,22 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) + * of vaddr will always be there, even if the memory object is + * destroyed and its backing memory munmap-ed. 
+ */ +- ret = vfio_dma_map(container, iova, +- iotlb->addr_mask + 1, vaddr, +- read_only); ++ ret = vfio_container_dma_map(bcontainer, iova, ++ iotlb->addr_mask + 1, vaddr, ++ read_only); + if (ret) { +- error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", " ++ error_report("vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", " + "0x%"HWADDR_PRIx", %p) = %d (%s)", +- container, iova, ++ bcontainer, iova, + iotlb->addr_mask + 1, vaddr, ret, strerror(-ret)); + } + } else { +- ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1, iotlb); ++ ret = vfio_container_dma_unmap(bcontainer, iova, ++ iotlb->addr_mask + 1, iotlb); + if (ret) { +- error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " ++ error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " + "0x%"HWADDR_PRIx") = %d (%s)", +- container, iova, ++ bcontainer, iova, + iotlb->addr_mask + 1, ret, strerror(-ret)); + vfio_set_migration_error(ret); + } +@@ -355,9 +356,10 @@ static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl, + int ret; + + /* Unmap with a single call. 
*/ +- ret = vfio_dma_unmap(vrdl->container, iova, size , NULL); ++ ret = vfio_container_dma_unmap(&vrdl->container->bcontainer, ++ iova, size , NULL); + if (ret) { +- error_report("%s: vfio_dma_unmap() failed: %s", __func__, ++ error_report("%s: vfio_container_dma_unmap() failed: %s", __func__, + strerror(-ret)); + } + } +@@ -385,8 +387,8 @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl, + section->offset_within_address_space; + vaddr = memory_region_get_ram_ptr(section->mr) + start; + +- ret = vfio_dma_map(vrdl->container, iova, next - start, +- vaddr, section->readonly); ++ ret = vfio_container_dma_map(&vrdl->container->bcontainer, iova, ++ next - start, vaddr, section->readonly); + if (ret) { + /* Rollback */ + vfio_ram_discard_notify_discard(rdl, section); +@@ -684,10 +686,11 @@ static void vfio_listener_region_add(MemoryListener *listener, + } + } + +- ret = vfio_dma_map(container, iova, int128_get64(llsize), +- vaddr, section->readonly); ++ ret = vfio_container_dma_map(&container->bcontainer, ++ iova, int128_get64(llsize), vaddr, ++ section->readonly); + if (ret) { +- error_setg(&err, "vfio_dma_map(%p, 0x%"HWADDR_PRIx", " ++ error_setg(&err, "vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", " + "0x%"HWADDR_PRIx", %p) = %d (%s)", + container, iova, int128_get64(llsize), vaddr, ret, + strerror(-ret)); +@@ -784,18 +787,20 @@ static void vfio_listener_region_del(MemoryListener *listener, + if (int128_eq(llsize, int128_2_64())) { + /* The unmap ioctl doesn't accept a full 64-bit span. 
*/ + llsize = int128_rshift(llsize, 1); +- ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL); ++ ret = vfio_container_dma_unmap(&container->bcontainer, iova, ++ int128_get64(llsize), NULL); + if (ret) { +- error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " ++ error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " + "0x%"HWADDR_PRIx") = %d (%s)", + container, iova, int128_get64(llsize), ret, + strerror(-ret)); + } + iova += int128_get64(llsize); + } +- ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL); ++ ret = vfio_container_dma_unmap(&container->bcontainer, iova, ++ int128_get64(llsize), NULL); + if (ret) { +- error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " ++ error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " + "0x%"HWADDR_PRIx") = %d (%s)", + container, iova, int128_get64(llsize), ret, + strerror(-ret)); +diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c +new file mode 100644 +index 0000000000..55d3a35fa4 +--- /dev/null ++++ b/hw/vfio/container-base.c +@@ -0,0 +1,32 @@ ++/* ++ * VFIO BASE CONTAINER ++ * ++ * Copyright (C) 2023 Intel Corporation. ++ * Copyright Red Hat, Inc. 
2023 ++ * ++ * Authors: Yi Liu ++ * Eric Auger ++ * ++ * SPDX-License-Identifier: GPL-2.0-or-later ++ */ ++ ++#include "qemu/osdep.h" ++#include "qapi/error.h" ++#include "qemu/error-report.h" ++#include "hw/vfio/vfio-container-base.h" ++ ++int vfio_container_dma_map(VFIOContainerBase *bcontainer, ++ hwaddr iova, ram_addr_t size, ++ void *vaddr, bool readonly) ++{ ++ g_assert(bcontainer->ops->dma_map); ++ return bcontainer->ops->dma_map(bcontainer, iova, size, vaddr, readonly); ++} ++ ++int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, ++ hwaddr iova, ram_addr_t size, ++ IOMMUTLBEntry *iotlb) ++{ ++ g_assert(bcontainer->ops->dma_unmap); ++ return bcontainer->ops->dma_unmap(bcontainer, iova, size, iotlb); ++} +diff --git a/hw/vfio/container.c b/hw/vfio/container.c +index 8d8ed13e93..40e378e888 100644 +--- a/hw/vfio/container.c ++++ b/hw/vfio/container.c +@@ -140,9 +140,11 @@ void vfio_dma_range_init_dirty_bitmap(VFIODMARange *qrange) + /* + * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86 + */ +-int vfio_dma_unmap(VFIOContainer *container, hwaddr iova, +- ram_addr_t size, IOMMUTLBEntry *iotlb) ++static int vfio_legacy_dma_unmap(VFIOContainerBase *bcontainer, hwaddr iova, ++ ram_addr_t size, IOMMUTLBEntry *iotlb) + { ++ VFIOContainer *container = container_of(bcontainer, VFIOContainer, ++ bcontainer); + struct vfio_iommu_type1_dma_unmap unmap = { + .argsz = sizeof(unmap), + .flags = 0, +@@ -193,7 +195,7 @@ int vfio_dma_unmap(VFIOContainer *container, hwaddr iova, + */ + if (errno == EINVAL && unmap.size && !(unmap.iova + unmap.size) && + container->iommu_type == VFIO_TYPE1v2_IOMMU) { +- trace_vfio_dma_unmap_overflow_workaround(); ++ trace_vfio_legacy_dma_unmap_overflow_workaround(); + unmap.size -= 1ULL << ctz64(container->pgsizes); + continue; + } +@@ -212,9 +214,11 @@ int vfio_dma_unmap(VFIOContainer *container, hwaddr iova, + return 0; + } + +-int vfio_dma_map(VFIOContainer *container, hwaddr iova, +- ram_addr_t size, void 
*vaddr, bool readonly) ++static int vfio_legacy_dma_map(VFIOContainerBase *bcontainer, hwaddr iova, ++ ram_addr_t size, void *vaddr, bool readonly) + { ++ VFIOContainer *container = container_of(bcontainer, VFIOContainer, ++ bcontainer); + struct vfio_iommu_type1_dma_map map = { + .argsz = sizeof(map), + .flags = VFIO_DMA_MAP_FLAG_READ, +@@ -241,7 +245,8 @@ int vfio_dma_map(VFIOContainer *container, hwaddr iova, + * the VGA ROM space. + */ + if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 || +- (errno == EBUSY && vfio_dma_unmap(container, iova, size, NULL) == 0 && ++ (errno == EBUSY && ++ vfio_legacy_dma_unmap(bcontainer, iova, size, NULL) == 0 && + ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) { + return 0; + } +@@ -1050,4 +1055,7 @@ void vfio_detach_device(VFIODevice *vbasedev) + vfio_put_group(group); + } + +-const VFIOIOMMUOps vfio_legacy_ops; ++const VFIOIOMMUOps vfio_legacy_ops = { ++ .dma_map = vfio_legacy_dma_map, ++ .dma_unmap = vfio_legacy_dma_unmap, ++}; +diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build +index b1db4c8605..32a6933280 100644 +--- a/hw/vfio/meson.build ++++ b/hw/vfio/meson.build +@@ -2,6 +2,7 @@ vfio_ss = ss.source_set() + vfio_ss.add(files( + 'helpers.c', + 'common.c', ++ 'container-base.c', + 'container.c', + 'spapr.c', + 'migration.c', +diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events +index 0eb2387cf2..9f7fedee98 100644 +--- a/hw/vfio/trace-events ++++ b/hw/vfio/trace-events +@@ -116,7 +116,7 @@ vfio_region_unmap(const char *name, unsigned long offset, unsigned long end) "Re + vfio_region_sparse_mmap_header(const char *name, int index, int nr_areas) "Device %s region %d: %d sparse mmap entries" + vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned long end) "sparse entry %d [0x%lx - 0x%lx]" + vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%08x" +-vfio_dma_unmap_overflow_workaround(void) "" ++vfio_legacy_dma_unmap_overflow_workaround(void) 
"" + vfio_get_dirty_bitmap(int fd, uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start, uint64_t dirty_pages) "container fd=%d, iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64" dirty_pages=%"PRIu64 + vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64 + +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index 3a0a6ab6ee..f94baf72db 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -221,10 +221,6 @@ bool vfio_devices_all_running_and_saving(VFIOContainer *container); + VFIODMARange *vfio_lookup_match_range(VFIOContainer *container, + hwaddr start_addr, hwaddr size); + void vfio_dma_range_init_dirty_bitmap(VFIODMARange *qrange); +-int vfio_dma_unmap(VFIOContainer *container, hwaddr iova, +- ram_addr_t size, IOMMUTLBEntry *iotlb); +-int vfio_dma_map(VFIOContainer *container, hwaddr iova, +- ram_addr_t size, void *vaddr, bool readonly); + int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start); + int vfio_query_dirty_bitmap(VFIOContainer *container, VFIOBitmap *vbmap, + hwaddr iova, hwaddr size); +diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h +index 1d6daaea5d..56b033f59f 100644 +--- a/include/hw/vfio/vfio-container-base.h ++++ b/include/hw/vfio/vfio-container-base.h +@@ -31,6 +31,13 @@ typedef struct VFIOContainerBase { + const VFIOIOMMUOps *ops; + } VFIOContainerBase; + ++int vfio_container_dma_map(VFIOContainerBase *bcontainer, ++ hwaddr iova, ram_addr_t size, ++ void *vaddr, bool readonly); ++int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, ++ hwaddr iova, ram_addr_t size, ++ IOMMUTLBEntry *iotlb); ++ + struct VFIOIOMMUOps { + /* basic feature */ + int (*dma_map)(VFIOContainerBase *bcontainer, +-- +2.41.0.windows.1 + diff --git a/vfio-iommufd-Add-hw_caps-field-to-HostIOMMUDeviceCap.patch 
b/vfio-iommufd-Add-hw_caps-field-to-HostIOMMUDeviceCap.patch new file mode 100644 index 0000000000000000000000000000000000000000..b94066382488cd009977619210247c540444f4db --- /dev/null +++ b/vfio-iommufd-Add-hw_caps-field-to-HostIOMMUDeviceCap.patch @@ -0,0 +1,57 @@ +From 72660b98e799248338588fe97f191c544c073806 Mon Sep 17 00:00:00 2001 +From: Joao Martins +Date: Mon, 22 Jul 2024 22:13:20 +0100 +Subject: [PATCH] vfio/iommufd: Add hw_caps field to HostIOMMUDeviceCaps +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Store the value of @caps returned by iommufd_backend_get_device_info() +in a new field HostIOMMUDeviceCaps::hw_caps. Right now the only value is +whether device IOMMU supports dirty tracking (IOMMU_HW_CAP_DIRTY_TRACKING). + +This is in preparation for HostIOMMUDevice::realize() being called early +during attach_device(). + +Signed-off-by: Joao Martins +Reviewed-by: Cédric Le Goater +Reviewed-by: Zhenzhong Duan +Reviewed-by: Eric Auger +--- + hw/vfio/iommufd.c | 1 + + include/sysemu/host_iommu_device.h | 4 ++++ + 2 files changed, 5 insertions(+) + +diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c +index 06e6a400be..d9088705de 100644 +--- a/hw/vfio/iommufd.c ++++ b/hw/vfio/iommufd.c +@@ -745,6 +745,7 @@ static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, + + hiod->name = g_strdup(vdev->name); + caps->type = type; ++ caps->hw_caps = hw_caps; + + return true; + } +diff --git a/include/sysemu/host_iommu_device.h b/include/sysemu/host_iommu_device.h +index f586908945..e4d8300350 100644 +--- a/include/sysemu/host_iommu_device.h ++++ b/include/sysemu/host_iommu_device.h +@@ -19,9 +19,13 @@ + * struct HostIOMMUDeviceCaps - Define host IOMMU device capabilities. + * + * @type: host platform IOMMU type. ++ * ++ * @hw_caps: host platform IOMMU capabilities (e.g. 
on IOMMUFD this represents ++ * the @out_capabilities value returned from IOMMU_GET_HW_INFO ioctl) + */ + typedef struct HostIOMMUDeviceCaps { + uint32_t type; ++ uint64_t hw_caps; + } HostIOMMUDeviceCaps; + + #define TYPE_HOST_IOMMU_DEVICE "host-iommu-device" +-- +2.41.0.windows.1 + diff --git a/vfio-iommufd-Add-properties-and-handlers-to-TYPE_HOS.patch b/vfio-iommufd-Add-properties-and-handlers-to-TYPE_HOS.patch new file mode 100644 index 0000000000000000000000000000000000000000..c4fc3050db6f72316874639ac5a99b2d7d3e2322 --- /dev/null +++ b/vfio-iommufd-Add-properties-and-handlers-to-TYPE_HOS.patch @@ -0,0 +1,130 @@ +From 0e0956cb785f868dfe48201fcdead71dbdd234b0 Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Mon, 15 Jan 2024 15:05:19 +0800 +Subject: [PATCH] vfio/iommufd: Add properties and handlers to + TYPE_HOST_IOMMU_DEVICE_IOMMUFD + +New added properties include IOMMUFD handle and devid, ioas. +IOMMUFD handle and devid are used to allocate/free ioas, hwpt. +ioas is used to re-attach IOMMUFD backed device to its +default ioas id, i.e., when vIOMMU is disabled by guest. +These properties are initialized in .realize() handler. + +New added handlers include [at|de]tach_hwpt. They are used to +attaching/detaching hwpt. VFIO and VDPA have different way to +attach and detach, so implementation will be in sub-class +instead of HostIOMMUDeviceIOMMUFD. + +Add two wrappers host_iommu_device_iommufd_[at|de]tach_hwpt to +wrap the two handlers. + +This is a prerequisite patch for following ones. 
+ +Signed-off-by: Zhenzhong Duan +--- + backends/iommufd.c | 22 ++++++++++++++++++ + include/sysemu/iommufd.h | 50 ++++++++++++++++++++++++++++++++++++++++ + 2 files changed, 72 insertions(+) + +diff --git a/backends/iommufd.c b/backends/iommufd.c +index cf24370385..c10aa9b011 100644 +--- a/backends/iommufd.c ++++ b/backends/iommufd.c +@@ -360,6 +360,26 @@ int iommufd_backend_invalidate_cache(IOMMUFDBackend *be, uint32_t hwpt_id, + return ret; + } + ++bool host_iommu_device_iommufd_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev, ++ uint32_t hwpt_id, Error **errp) ++{ ++ HostIOMMUDeviceIOMMUFDClass *idevc = ++ HOST_IOMMU_DEVICE_IOMMUFD_GET_CLASS(idev); ++ ++ g_assert(idevc->attach_hwpt); ++ return idevc->attach_hwpt(idev, hwpt_id, errp); ++} ++ ++bool host_iommu_device_iommufd_detach_hwpt(HostIOMMUDeviceIOMMUFD *idev, ++ Error **errp) ++{ ++ HostIOMMUDeviceIOMMUFDClass *idevc = ++ HOST_IOMMU_DEVICE_IOMMUFD_GET_CLASS(idev); ++ ++ g_assert(idevc->detach_hwpt); ++ return idevc->detach_hwpt(idev, errp); ++} ++ + static int hiod_iommufd_get_cap(HostIOMMUDevice *hiod, int cap, Error **errp) + { + HostIOMMUDeviceCaps *caps = &hiod->caps; +@@ -398,6 +418,8 @@ static const TypeInfo types[] = { + }, { + .name = TYPE_HOST_IOMMU_DEVICE_IOMMUFD, + .parent = TYPE_HOST_IOMMU_DEVICE, ++ .instance_size = sizeof(HostIOMMUDeviceIOMMUFD), ++ .class_size = sizeof(HostIOMMUDeviceIOMMUFDClass), + .class_init = hiod_iommufd_class_init, + .abstract = true, + } +diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h +index f6596f6338..3dc6934144 100644 +--- a/include/sysemu/iommufd.h ++++ b/include/sysemu/iommufd.h +@@ -68,4 +68,54 @@ int iommufd_backend_invalidate_cache(IOMMUFDBackend *be, uint32_t hwpt_id, + uint32_t *entry_num, void *data_ptr); + + #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd" ++OBJECT_DECLARE_TYPE(HostIOMMUDeviceIOMMUFD, HostIOMMUDeviceIOMMUFDClass, ++ HOST_IOMMU_DEVICE_IOMMUFD) ++ ++/* Abstract of host IOMMU device with iommufd backend */ 
++struct HostIOMMUDeviceIOMMUFD { ++ HostIOMMUDevice parent_obj; ++ ++ IOMMUFDBackend *iommufd; ++ uint32_t devid; ++ uint32_t ioas_id; ++}; ++ ++struct HostIOMMUDeviceIOMMUFDClass { ++ HostIOMMUDeviceClass parent_class; ++ ++ /** ++ * @attach_hwpt: attach host IOMMU device to IOMMUFD hardware page table. ++ * VFIO and VDPA device can have different implementation. ++ * ++ * Mandatory callback. ++ * ++ * @idev: host IOMMU device backed by IOMMUFD backend. ++ * ++ * @hwpt_id: ID of IOMMUFD hardware page table. ++ * ++ * @errp: pass an Error out when attachment fails. ++ * ++ * Returns: true on success, false on failure. ++ */ ++ bool (*attach_hwpt)(HostIOMMUDeviceIOMMUFD *idev, uint32_t hwpt_id, ++ Error **errp); ++ /** ++ * @detach_hwpt: detach host IOMMU device from IOMMUFD hardware page table. ++ * VFIO and VDPA device can have different implementation. ++ * ++ * Mandatory callback. ++ * ++ * @idev: host IOMMU device backed by IOMMUFD backend. ++ * ++ * @errp: pass an Error out when detachment fails. ++ * ++ * Returns: true on success, false on failure. 
++ */ ++ bool (*detach_hwpt)(HostIOMMUDeviceIOMMUFD *idev, Error **errp); ++}; ++ ++bool host_iommu_device_iommufd_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev, ++ uint32_t hwpt_id, Error **errp); ++bool host_iommu_device_iommufd_detach_hwpt(HostIOMMUDeviceIOMMUFD *idev, ++ Error **errp); + #endif +-- +2.41.0.windows.1 + diff --git a/vfio-iommufd-Add-support-for-iova_ranges-and-pgsizes.patch b/vfio-iommufd-Add-support-for-iova_ranges-and-pgsizes.patch new file mode 100644 index 0000000000000000000000000000000000000000..e37a26eb7ea511cc54e73eb4b3bddce536d3ec8b --- /dev/null +++ b/vfio-iommufd-Add-support-for-iova_ranges-and-pgsizes.patch @@ -0,0 +1,107 @@ +From d6f0612a8760959f25c148ab50a1e7c394d4279a Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Sat, 11 Jan 2025 10:52:42 +0800 +Subject: [PATCH] vfio/iommufd: Add support for iova_ranges and pgsizes +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Some vIOMMU such as virtio-iommu use IOVA ranges from host side to +setup reserved ranges for passthrough device, so that guest will not +use an IOVA range beyond host support. + +Use an uAPI of IOMMUFD to get IOVA ranges of host side and pass to +vIOMMU just like the legacy backend, if this fails, fallback to +64bit IOVA range. + +Also use out_iova_alignment returned from uAPI as pgsizes instead of +qemu_real_host_page_size() as a fallback. 
+ +Signed-off-by: Zhenzhong Duan +Reviewed-by: Cédric Le Goater +Reviewed-by: Eric Auger +Tested-by: Eric Auger +Tested-by: Nicolin Chen +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhou Wang +--- + hw/vfio/iommufd.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 55 insertions(+), 1 deletion(-) + +diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c +index 6d31aeac7b..01b448e840 100644 +--- a/hw/vfio/iommufd.c ++++ b/hw/vfio/iommufd.c +@@ -261,6 +261,53 @@ static int iommufd_cdev_ram_block_discard_disable(bool state) + return ram_block_uncoordinated_discard_disable(state); + } + ++static int iommufd_cdev_get_info_iova_range(VFIOIOMMUFDContainer *container, ++ uint32_t ioas_id, Error **errp) ++{ ++ VFIOContainerBase *bcontainer = &container->bcontainer; ++ struct iommu_ioas_iova_ranges *info; ++ struct iommu_iova_range *iova_ranges; ++ int ret, sz, fd = container->be->fd; ++ ++ info = g_malloc0(sizeof(*info)); ++ info->size = sizeof(*info); ++ info->ioas_id = ioas_id; ++ ++ ret = ioctl(fd, IOMMU_IOAS_IOVA_RANGES, info); ++ if (ret && errno != EMSGSIZE) { ++ goto error; ++ } ++ ++ sz = info->num_iovas * sizeof(struct iommu_iova_range); ++ info = g_realloc(info, sizeof(*info) + sz); ++ info->allowed_iovas = (uintptr_t)(info + 1); ++ ++ ret = ioctl(fd, IOMMU_IOAS_IOVA_RANGES, info); ++ if (ret) { ++ goto error; ++ } ++ ++ iova_ranges = (struct iommu_iova_range *)(uintptr_t)info->allowed_iovas; ++ ++ for (int i = 0; i < info->num_iovas; i++) { ++ Range *range = g_new(Range, 1); ++ ++ range_set_bounds(range, iova_ranges[i].start, iova_ranges[i].last); ++ bcontainer->iova_ranges = ++ range_list_insert(bcontainer->iova_ranges, range); ++ } ++ bcontainer->pgsizes = info->out_iova_alignment; ++ ++ g_free(info); ++ return 0; ++ ++error: ++ ret = -errno; ++ g_free(info); ++ error_setg_errno(errp, -ret, "Cannot get IOVA ranges"); ++ return ret; ++} ++ + static int iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, + AddressSpace *as, Error 
**errp) + { +@@ -335,7 +382,14 @@ static int iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, + goto err_discard_disable; + } + +- bcontainer->pgsizes = qemu_real_host_page_size(); ++ ret = iommufd_cdev_get_info_iova_range(container, ioas_id, &err); ++ if (ret) { ++ error_append_hint(&err, ++ "Fallback to default 64bit IOVA range and 4K page size\n"); ++ warn_report_err(err); ++ err = NULL; ++ bcontainer->pgsizes = qemu_real_host_page_size(); ++ } + + bcontainer->listener = vfio_memory_listener; + memory_listener_register(&bcontainer->listener, bcontainer->space->as); +-- +2.41.0.windows.1 + diff --git a/vfio-iommufd-Don-t-initialize-nor-set-a-HOST_IOMMU_D.patch b/vfio-iommufd-Don-t-initialize-nor-set-a-HOST_IOMMU_D.patch new file mode 100644 index 0000000000000000000000000000000000000000..af353c02508b393376a12f659b3a494de2e9f16b --- /dev/null +++ b/vfio-iommufd-Don-t-initialize-nor-set-a-HOST_IOMMU_D.patch @@ -0,0 +1,64 @@ +From b2d58d5b474633514c3195d6948e1cd2a9c78d67 Mon Sep 17 00:00:00 2001 +From: Joao Martins +Date: Fri, 19 Jul 2024 13:04:50 +0100 +Subject: [PATCH] vfio/iommufd: Don't initialize nor set a HOST_IOMMU_DEVICE + with mdev + +mdevs aren't "physical" devices and when asking for backing IOMMU info, it +fails the entire provisioning of the guest. Fix that by skipping +HostIOMMUDevice initialization in the presence of mdevs, and skip setting +an iommu device when it is known to be an mdev. 
+ +Cc: Zhenzhong Duan +Fixes: 930589520128 ("vfio/iommufd: Implement HostIOMMUDeviceClass::realize() handler") +Signed-off-by: Joao Martins +Reviewed-by: Eric Auger +Reviewed-by: Zhenzhong Duan +--- + hw/vfio/common.c | 4 ++++ + hw/vfio/pci.c | 7 +++++-- + 2 files changed, 9 insertions(+), 2 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index d5ff65f90a..ceb1da0b94 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -1664,6 +1664,10 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev, + return ret; + } + ++ if (vbasedev->mdev) { ++ return 0; /* int return: 0 is success; mdev needs no hiod */ ++ } ++ + hiod = HOST_IOMMU_DEVICE(object_new(ops->hiod_typename)); + if (!HOST_IOMMU_DEVICE_GET_CLASS(hiod)->realize(hiod, vbasedev, errp)) { + object_unref(hiod); +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index de040e73ca..19211f4368 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -3101,7 +3101,8 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) + + vfio_bars_register(vdev); + +- if (!pci_device_set_iommu_device(pdev, vbasedev->hiod, errp)) { ++ if (!vbasedev->mdev && ++ !pci_device_set_iommu_device(pdev, vbasedev->hiod, errp)) { + error_prepend(errp, "Failed to set iommu_device: "); + goto out_teardown; + } +@@ -3229,7 +3230,9 @@ out_deregister: + timer_free(vdev->intx.mmap_timer); + } + out_unset_idev: +- pci_device_unset_iommu_device(pdev); ++ if (!vbasedev->mdev) { ++ pci_device_unset_iommu_device(pdev); ++ } + out_teardown: + vfio_teardown_msi(vdev); + vfio_bars_exit(vdev); +-- +2.41.0.windows.1 + diff --git a/vfio-iommufd-Enable-pci-hot-reset-through-iommufd-cd.patch b/vfio-iommufd-Enable-pci-hot-reset-through-iommufd-cd.patch new file mode 100644 index 0000000000000000000000000000000000000000..3829379111bb459f6ac1adda4df927f7072bb061 --- /dev/null +++ b/vfio-iommufd-Enable-pci-hot-reset-through-iommufd-cd.patch @@ -0,0 +1,207 @@ +From de17750e24d4e583e9f392bbe47e4bd1aa81d6bc Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Sat, 11 Jan 2025 10:52:45 +0800 
+Subject: [PATCH] vfio/iommufd: Enable pci hot reset through iommufd cdev + interface +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Implement the newly introduced pci_hot_reset callback named +iommufd_cdev_pci_hot_reset to do iommufd specific check and +reset operation. + +Signed-off-by: Zhenzhong Duan +Reviewed-by: Eric Auger +Tested-by: Eric Auger +Tested-by: Nicolin Chen +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhou Wang +--- + hw/vfio/iommufd.c | 150 +++++++++++++++++++++++++++++++++++++++++++ + hw/vfio/trace-events | 1 + + 2 files changed, 151 insertions(+) + +diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c +index 01b448e840..6e53e013ef 100644 +--- a/hw/vfio/iommufd.c ++++ b/hw/vfio/iommufd.c +@@ -24,6 +24,7 @@ + #include "sysemu/reset.h" + #include "qemu/cutils.h" + #include "qemu/chardev_open.h" ++#include "pci.h" + + static int iommufd_cdev_map(VFIOContainerBase *bcontainer, hwaddr iova, + ram_addr_t size, void *vaddr, bool readonly) +@@ -468,9 +469,158 @@ static void iommufd_cdev_detach(VFIODevice *vbasedev) + close(vbasedev->fd); + } + ++static VFIODevice *iommufd_cdev_pci_find_by_devid(__u32 devid) ++{ ++ VFIODevice *vbasedev_iter; ++ ++ QLIST_FOREACH(vbasedev_iter, &vfio_device_list, global_next) { ++ if (vbasedev_iter->bcontainer->ops != &vfio_iommufd_ops) { ++ continue; ++ } ++ if (devid == vbasedev_iter->devid) { ++ return vbasedev_iter; ++ } ++ } ++ return NULL; ++} ++ ++static VFIOPCIDevice * ++iommufd_cdev_dep_get_realized_vpdev(struct vfio_pci_dependent_device *dep_dev, ++ VFIODevice *reset_dev) ++{ ++ VFIODevice *vbasedev_tmp; ++ ++ if (dep_dev->devid == reset_dev->devid || ++ dep_dev->devid == VFIO_PCI_DEVID_OWNED) { ++ return NULL; ++ } ++ ++ vbasedev_tmp = iommufd_cdev_pci_find_by_devid(dep_dev->devid); ++ if (!vbasedev_tmp || !vbasedev_tmp->dev->realized || ++ vbasedev_tmp->type != VFIO_DEVICE_TYPE_PCI) { ++ return NULL; ++ } ++ ++ return container_of(vbasedev_tmp, VFIOPCIDevice, 
vbasedev); ++} ++ ++static int iommufd_cdev_pci_hot_reset(VFIODevice *vbasedev, bool single) ++{ ++ VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); ++ struct vfio_pci_hot_reset_info *info = NULL; ++ struct vfio_pci_dependent_device *devices; ++ struct vfio_pci_hot_reset *reset; ++ int ret, i; ++ bool multi = false; ++ ++ trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi"); ++ ++ if (!single) { ++ vfio_pci_pre_reset(vdev); ++ } ++ vdev->vbasedev.needs_reset = false; ++ ++ ret = vfio_pci_get_pci_hot_reset_info(vdev, &info); ++ ++ if (ret) { ++ goto out_single; ++ } ++ ++ assert(info->flags & VFIO_PCI_HOT_RESET_FLAG_DEV_ID); ++ ++ devices = &info->devices[0]; ++ ++ if (!(info->flags & VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED)) { ++ if (!vdev->has_pm_reset) { ++ for (i = 0; i < info->count; i++) { ++ if (devices[i].devid == VFIO_PCI_DEVID_NOT_OWNED) { ++ error_report("vfio: Cannot reset device %s, " ++ "depends on device %04x:%02x:%02x.%x " ++ "which is not owned.", ++ vdev->vbasedev.name, devices[i].segment, ++ devices[i].bus, PCI_SLOT(devices[i].devfn), ++ PCI_FUNC(devices[i].devfn)); ++ } ++ } ++ } ++ ret = -EPERM; ++ goto out_single; ++ } ++ ++ trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name); ++ ++ for (i = 0; i < info->count; i++) { ++ VFIOPCIDevice *tmp; ++ ++ trace_iommufd_cdev_pci_hot_reset_dep_devices(devices[i].segment, ++ devices[i].bus, ++ PCI_SLOT(devices[i].devfn), ++ PCI_FUNC(devices[i].devfn), ++ devices[i].devid); ++ ++ /* ++ * If a VFIO cdev device is resettable, all the dependent devices ++ * are either bound to same iommufd or within same iommu_groups as ++ * one of the iommufd bound devices. 
++ */ ++ assert(devices[i].devid != VFIO_PCI_DEVID_NOT_OWNED); ++ ++ tmp = iommufd_cdev_dep_get_realized_vpdev(&devices[i], &vdev->vbasedev); ++ if (!tmp) { ++ continue; ++ } ++ ++ if (single) { ++ ret = -EINVAL; ++ goto out_single; ++ } ++ vfio_pci_pre_reset(tmp); ++ tmp->vbasedev.needs_reset = false; ++ multi = true; ++ } ++ ++ if (!single && !multi) { ++ ret = -EINVAL; ++ goto out_single; ++ } ++ ++ /* Use zero length array for hot reset with iommufd backend */ ++ reset = g_malloc0(sizeof(*reset)); ++ reset->argsz = sizeof(*reset); ++ ++ /* Bus reset! */ ++ ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset); ++ g_free(reset); ++ if (ret) { ++ ret = -errno; ++ } ++ ++ trace_vfio_pci_hot_reset_result(vdev->vbasedev.name, ++ ret ? strerror(errno) : "Success"); ++ ++ /* Re-enable INTx on affected devices */ ++ for (i = 0; i < info->count; i++) { ++ VFIOPCIDevice *tmp; ++ ++ tmp = iommufd_cdev_dep_get_realized_vpdev(&devices[i], &vdev->vbasedev); ++ if (!tmp) { ++ continue; ++ } ++ vfio_pci_post_reset(tmp); ++ } ++out_single: ++ if (!single) { ++ vfio_pci_post_reset(vdev); ++ } ++ g_free(info); ++ ++ return ret; ++} ++ + const VFIOIOMMUOps vfio_iommufd_ops = { + .dma_map = iommufd_cdev_map, + .dma_unmap = iommufd_cdev_unmap, + .attach_device = iommufd_cdev_attach, + .detach_device = iommufd_cdev_detach, ++ .pci_hot_reset = iommufd_cdev_pci_hot_reset, + }; +diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events +index 3340c93af0..8fdde54456 100644 +--- a/hw/vfio/trace-events ++++ b/hw/vfio/trace-events +@@ -174,3 +174,4 @@ iommufd_cdev_detach_ioas_hwpt(int iommufd, const char *name) " [iommufd=%d] Succ + iommufd_cdev_fail_attach_existing_container(const char *msg) " %s" + iommufd_cdev_alloc_ioas(int iommufd, int ioas_id) " [iommufd=%d] new IOMMUFD container with ioasid=%d" + iommufd_cdev_device_info(char *name, int devfd, int num_irqs, int num_regions, int flags) " %s (%d) num_irqs=%d num_regions=%d flags=%d" 
++iommufd_cdev_pci_hot_reset_dep_devices(int domain, int bus, int slot, int function, int dev_id) "\t%04x:%02x:%02x.%x devid %d" +-- +2.41.0.windows.1 + diff --git a/vfio-iommufd-Implement-HostIOMMUDeviceClass-realize-.patch b/vfio-iommufd-Implement-HostIOMMUDeviceClass-realize-.patch new file mode 100644 index 0000000000000000000000000000000000000000..02adf86047dc51d0c9e0954eb4ce77faec14abe1 --- /dev/null +++ b/vfio-iommufd-Implement-HostIOMMUDeviceClass-realize-.patch @@ -0,0 +1,72 @@ +From c9f1b73eb36a84347c3720ce2a93f72ea47f5daa Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Wed, 5 Jun 2024 16:30:34 +0800 +Subject: [PATCH] vfio/iommufd: Implement HostIOMMUDeviceClass::realize() + handler + +It calls iommufd_backend_get_device_info() to get host IOMMU +related information and translate it into HostIOMMUDeviceCaps +for query with .get_cap(). + +For aw_bits, use the same way as legacy backend by calling +vfio_device_get_aw_bits() which is common for different vendor +IOMMU. + +Signed-off-by: Zhenzhong Duan +Reviewed-by: Eric Auger +Reviewed-by: Michael S. 
Tsirkin +--- + hw/vfio/iommufd.c | 30 ++++++++++++++++++++++++++++++ + 1 file changed, 30 insertions(+) + +diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c +index 7a4b818830..2efdba5565 100644 +--- a/hw/vfio/iommufd.c ++++ b/hw/vfio/iommufd.c +@@ -636,6 +636,35 @@ static void vfio_iommu_iommufd_class_init(ObjectClass *klass, void *data) + vioc->pci_hot_reset = iommufd_cdev_pci_hot_reset; + }; + ++static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, ++ Error **errp) ++{ ++ VFIODevice *vdev = opaque; ++ HostIOMMUDeviceCaps *caps = &hiod->caps; ++ enum iommu_hw_info_type type; ++ union { ++ struct iommu_hw_info_vtd vtd; ++ } data; ++ ++ if (!iommufd_backend_get_device_info(vdev->iommufd, vdev->devid, ++ &type, &data, sizeof(data), errp)) { ++ return false; ++ } ++ ++ hiod->name = g_strdup(vdev->name); ++ caps->type = type; ++ caps->aw_bits = vfio_device_get_aw_bits(vdev); ++ ++ return true; ++} ++ ++static void hiod_iommufd_vfio_class_init(ObjectClass *oc, void *data) ++{ ++ HostIOMMUDeviceClass *hiodc = HOST_IOMMU_DEVICE_CLASS(oc); ++ ++ hiodc->realize = hiod_iommufd_vfio_realize; ++}; ++ + static const TypeInfo types[] = { + { + .name = TYPE_VFIO_IOMMU_IOMMUFD, +@@ -644,6 +673,7 @@ static const TypeInfo types[] = { + }, { + .name = TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO, + .parent = TYPE_HOST_IOMMU_DEVICE_IOMMUFD, ++ .class_init = hiod_iommufd_vfio_class_init, + } + }; + +-- +2.41.0.windows.1 + diff --git a/vfio-iommufd-Implement-HostIOMMUDeviceClass-realize_.patch b/vfio-iommufd-Implement-HostIOMMUDeviceClass-realize_.patch new file mode 100644 index 0000000000000000000000000000000000000000..524834d3f990889c6a695038a1b0c7945a12e7d3 --- /dev/null +++ b/vfio-iommufd-Implement-HostIOMMUDeviceClass-realize_.patch @@ -0,0 +1,54 @@ +From b727a28ce2cf062473ca011dd69697e0b7826a25 Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Mon, 5 Aug 2024 09:29:00 +0800 +Subject: [PATCH] vfio/iommufd: Implement HostIOMMUDeviceClass::realize_late() + handler + 
+There are three iommufd related elements iommufd handle, devid and +ioas_id. ioas_id is ready only after VFIO device attachment. Device +id and iommufd handle are ready before attachment, but they are all +iommufd related elements, initialize them together with ioas_id. + +Signed-off-by: Zhenzhong Duan +--- + hw/vfio/iommufd.c | 17 +++++++++++++++++ + 1 file changed, 17 insertions(+) + +diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c +index 3d4f902ae5..47a8823146 100644 +--- a/hw/vfio/iommufd.c ++++ b/hw/vfio/iommufd.c +@@ -827,6 +827,22 @@ static void vfio_iommu_iommufd_class_init(ObjectClass *klass, void *data) + vioc->query_dirty_bitmap = iommufd_query_dirty_bitmap; + }; + ++static bool hiod_iommufd_vfio_realize_late(HostIOMMUDevice *hiod, void *opaque, ++ Error **errp) ++{ ++ VFIODevice *vdev = opaque; ++ VFIOIOMMUFDContainer *container = container_of(vdev->bcontainer, ++ VFIOIOMMUFDContainer, ++ bcontainer); ++ HostIOMMUDeviceIOMMUFD *idev = HOST_IOMMU_DEVICE_IOMMUFD(hiod); ++ ++ idev->iommufd = vdev->iommufd; ++ idev->devid = vdev->devid; ++ idev->ioas_id = container->ioas_id; ++ ++ return true; ++} ++ + static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, + Error **errp) + { +@@ -858,6 +874,7 @@ static void hiod_iommufd_vfio_class_init(ObjectClass *oc, void *data) + HostIOMMUDeviceClass *hiodc = HOST_IOMMU_DEVICE_CLASS(oc); + + hiodc->realize = hiod_iommufd_vfio_realize; ++ hiodc->realize_late = hiod_iommufd_vfio_realize_late; + }; + + static const TypeInfo types[] = { +-- +2.41.0.windows.1 + diff --git a/vfio-iommufd-Implement-VFIOIOMMUClass-query_dirty_bi.patch b/vfio-iommufd-Implement-VFIOIOMMUClass-query_dirty_bi.patch new file mode 100644 index 0000000000000000000000000000000000000000..e6927585c8bf495d7eded15f145a6dfcb3c53ee9 --- /dev/null +++ b/vfio-iommufd-Implement-VFIOIOMMUClass-query_dirty_bi.patch @@ -0,0 +1,154 @@ +From d09cb3d1907e3afbae9b3ea345c9973e207614bf Mon Sep 17 00:00:00 2001 +From: Joao Martins +Date: Mon, 22 
Jul 2024 22:13:24 +0100 +Subject: [PATCH] vfio/iommufd: Implement VFIOIOMMUClass::query_dirty_bitmap + support +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +ioctl(iommufd, IOMMU_HWPT_GET_DIRTY_BITMAP, arg) is the UAPI +that fetches the bitmap that tells what was dirty in an IOVA +range. + +A single bitmap is allocated and used across all the hwpts +sharing an IOAS which is then used in log_sync() to set Qemu +global bitmaps. + +Signed-off-by: Joao Martins +Reviewed-by: Cédric Le Goater +Reviewed-by: Eric Auger +Reviewed-by: Zhenzhong Duan +[Shameer: changed iommufd_query_dirty_bitmap() declaration] +Signed-off-by: Shameer Kolothum +--- + backends/iommufd.c | 29 +++++++++++++++++++++++++++++ + backends/trace-events | 1 + + hw/vfio/iommufd.c | 32 ++++++++++++++++++++++++++++++++ + include/sysemu/iommufd.h | 4 ++++ + 4 files changed, 66 insertions(+) + +diff --git a/backends/iommufd.c b/backends/iommufd.c +index 785d3fbbad..c1260766f0 100644 +--- a/backends/iommufd.c ++++ b/backends/iommufd.c +@@ -277,6 +277,35 @@ bool iommufd_backend_set_dirty_tracking(IOMMUFDBackend *be, + return true; + } + ++bool iommufd_backend_get_dirty_bitmap(IOMMUFDBackend *be, ++ uint32_t hwpt_id, ++ uint64_t iova, ram_addr_t size, ++ uint64_t page_size, uint64_t *data, ++ Error **errp) ++{ ++ int ret; ++ struct iommu_hwpt_get_dirty_bitmap get_dirty_bitmap = { ++ .size = sizeof(get_dirty_bitmap), ++ .hwpt_id = hwpt_id, ++ .iova = iova, ++ .length = size, ++ .page_size = page_size, ++ .data = (uintptr_t)data, ++ }; ++ ++ ret = ioctl(be->fd, IOMMU_HWPT_GET_DIRTY_BITMAP, &get_dirty_bitmap); ++ trace_iommufd_backend_get_dirty_bitmap(be->fd, hwpt_id, iova, size, ++ page_size, ret ? 
errno : 0); ++ if (ret) { ++ error_setg_errno(errp, errno, ++ "IOMMU_HWPT_GET_DIRTY_BITMAP (iova: 0x%"HWADDR_PRIx ++ " size: 0x"RAM_ADDR_FMT") failed", iova, size); ++ return false; ++ } ++ ++ return true; ++} ++ + bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, + uint32_t *type, void *data, uint32_t len, + uint64_t *caps, Error **errp) +diff --git a/backends/trace-events b/backends/trace-events +index fe3297ca15..b02433710a 100644 +--- a/backends/trace-events ++++ b/backends/trace-events +@@ -17,3 +17,4 @@ iommufd_backend_alloc_ioas(int iommufd, uint32_t ioas, int ret) " iommufd=%d ioa + iommufd_backend_alloc_hwpt(int iommufd, uint32_t dev_id, uint32_t pt_id, uint32_t flags, uint32_t hwpt_type, uint32_t len, uint64_t data_ptr, uint32_t out_hwpt_id, int ret) " iommufd=%d dev_id=%u pt_id=%u flags=0x%x hwpt_type=%u len=%u data_ptr=0x%"PRIx64" out_hwpt=%u (%d)" + iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d id=%d (%d)" + iommufd_backend_set_dirty(int iommufd, uint32_t hwpt_id, bool start, int ret) " iommufd=%d hwpt=%u enable=%d (%d)" ++iommufd_backend_get_dirty_bitmap(int iommufd, uint32_t hwpt_id, uint64_t iova, uint64_t size, uint64_t page_size, int ret) " iommufd=%d hwpt=%u iova=0x%"PRIx64" size=0x%"PRIx64" page_size=0x%"PRIx64" (%d)" +diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c +index 11e1392527..3d4f902ae5 100644 +--- a/hw/vfio/iommufd.c ++++ b/hw/vfio/iommufd.c +@@ -25,6 +25,7 @@ + #include "qemu/cutils.h" + #include "qemu/chardev_open.h" + #include "pci.h" ++#include "exec/ram_addr.h" + + static int iommufd_cdev_map(const VFIOContainerBase *bcontainer, hwaddr iova, + ram_addr_t size, void *vaddr, bool readonly) +@@ -152,6 +153,36 @@ err: + return -EINVAL; + } + ++static int iommufd_query_dirty_bitmap(const VFIOContainerBase *bcontainer, ++ VFIOBitmap *vbmap, hwaddr iova, ++ hwaddr size) ++{ ++ VFIOIOMMUFDContainer *container = container_of(bcontainer, ++ VFIOIOMMUFDContainer, ++ bcontainer); ++ unsigned 
long page_size = qemu_real_host_page_size(); ++ VFIOIOASHwpt *hwpt; ++ ++ QLIST_FOREACH(hwpt, &container->hwpt_list, next) { ++ if (!iommufd_hwpt_dirty_tracking(hwpt)) { ++ continue; ++ } ++ ++ if (!iommufd_backend_get_dirty_bitmap(container->be, hwpt->hwpt_id, ++ iova, size, page_size, ++ (uint64_t *)vbmap->bitmap, ++ NULL)) { ++ error_report("Failed to get dirty bitmap for iova: 0x%"PRIx64 ++ " size: 0x%"PRIx64" err: %d", (uint64_t)iova, ++ (uint64_t)size, errno); ++ ++ return -EINVAL; ++ } ++ } ++ ++ return 0; ++} ++ + static int iommufd_cdev_getfd(const char *sysfs_path, Error **errp) + { + long int ret = -ENOTTY; +@@ -793,6 +824,7 @@ static void vfio_iommu_iommufd_class_init(ObjectClass *klass, void *data) + vioc->detach_device = iommufd_cdev_detach; + vioc->pci_hot_reset = iommufd_cdev_pci_hot_reset; + vioc->set_dirty_page_tracking = iommufd_set_dirty_page_tracking; ++ vioc->query_dirty_bitmap = iommufd_query_dirty_bitmap; + }; + + static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, +diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h +index 4f1dbe827c..3b28c8a81c 100644 +--- a/include/sysemu/iommufd.h ++++ b/include/sysemu/iommufd.h +@@ -59,6 +59,10 @@ bool iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id, + Error **errp); + bool iommufd_backend_set_dirty_tracking(IOMMUFDBackend *be, uint32_t hwpt_id, + bool start, Error **errp); ++bool iommufd_backend_get_dirty_bitmap(IOMMUFDBackend *be, uint32_t hwpt_id, ++ uint64_t iova, ram_addr_t size, ++ uint64_t page_size, uint64_t *data, ++ Error **errp); + + #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd" + #endif +-- +2.41.0.windows.1 + diff --git a/vfio-iommufd-Implement-VFIOIOMMUClass-set_dirty_trac.patch b/vfio-iommufd-Implement-VFIOIOMMUClass-set_dirty_trac.patch new file mode 100644 index 0000000000000000000000000000000000000000..536a35d93bc706fbbfffa47e2472cdd31556148f --- /dev/null +++ 
b/vfio-iommufd-Implement-VFIOIOMMUClass-set_dirty_trac.patch @@ -0,0 +1,134 @@ +From 73b24be504fcd9b453a51e1f2fc8af64b092c586 Mon Sep 17 00:00:00 2001 +From: Joao Martins +Date: Mon, 22 Jul 2024 22:13:23 +0100 +Subject: [PATCH] vfio/iommufd: Implement VFIOIOMMUClass::set_dirty_tracking + support + +ioctl(iommufd, IOMMU_HWPT_SET_DIRTY_TRACKING, arg) is the UAPI that +enables or disables dirty page tracking. The ioctl is used if the hwpt +has been created with dirty tracking supported domain (stored in +hwpt::flags) and it is called on the whole list of iommu domains. + +Signed-off-by: Joao Martins +Reviewed-by: Zhenzhong Duan +Reviewed-by: Eric Auger +[Shameer: changed iommufd_set_dirty_page_tracking() declaration] +Signed-off-by: Shameer Kolothum +--- + backends/iommufd.c | 23 +++++++++++++++++++++++ + backends/trace-events | 1 + + hw/vfio/iommufd.c | 34 ++++++++++++++++++++++++++++++++++ + include/sysemu/iommufd.h | 2 ++ + 4 files changed, 60 insertions(+) + +diff --git a/backends/iommufd.c b/backends/iommufd.c +index 4aebf54765..785d3fbbad 100644 +--- a/backends/iommufd.c ++++ b/backends/iommufd.c +@@ -254,6 +254,29 @@ bool iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id, + return true; + } + ++bool iommufd_backend_set_dirty_tracking(IOMMUFDBackend *be, ++ uint32_t hwpt_id, bool start, ++ Error **errp) ++{ ++ int ret; ++ struct iommu_hwpt_set_dirty_tracking set_dirty = { ++ .size = sizeof(set_dirty), ++ .hwpt_id = hwpt_id, ++ .flags = start ? IOMMU_HWPT_DIRTY_TRACKING_ENABLE : 0, ++ }; ++ ++ ret = ioctl(be->fd, IOMMU_HWPT_SET_DIRTY_TRACKING, &set_dirty); ++ trace_iommufd_backend_set_dirty(be->fd, hwpt_id, start, ret ? 
errno : 0); ++ if (ret) { ++ error_setg_errno(errp, errno, ++ "IOMMU_HWPT_SET_DIRTY_TRACKING(hwpt_id %u) failed", ++ hwpt_id); ++ return false; ++ } ++ ++ return true; ++} ++ + bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, + uint32_t *type, void *data, uint32_t len, + uint64_t *caps, Error **errp) +diff --git a/backends/trace-events b/backends/trace-events +index e248bf039e..fe3297ca15 100644 +--- a/backends/trace-events ++++ b/backends/trace-events +@@ -16,3 +16,4 @@ iommufd_backend_unmap_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t si + iommufd_backend_alloc_ioas(int iommufd, uint32_t ioas, int ret) " iommufd=%d ioas=%d (%d)" + iommufd_backend_alloc_hwpt(int iommufd, uint32_t dev_id, uint32_t pt_id, uint32_t flags, uint32_t hwpt_type, uint32_t len, uint64_t data_ptr, uint32_t out_hwpt_id, int ret) " iommufd=%d dev_id=%u pt_id=%u flags=0x%x hwpt_type=%u len=%u data_ptr=0x%"PRIx64" out_hwpt=%u (%d)" + iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d id=%d (%d)" ++iommufd_backend_set_dirty(int iommufd, uint32_t hwpt_id, bool start, int ret) " iommufd=%d hwpt=%u enable=%d (%d)" +diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c +index a9400d8107..11e1392527 100644 +--- a/hw/vfio/iommufd.c ++++ b/hw/vfio/iommufd.c +@@ -119,6 +119,39 @@ static bool iommufd_hwpt_dirty_tracking(VFIOIOASHwpt *hwpt) + return hwpt && hwpt->hwpt_flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; + } + ++static int iommufd_set_dirty_page_tracking(const VFIOContainerBase *bcontainer, ++ bool start) ++{ ++ const VFIOIOMMUFDContainer *container = ++ container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); ++ VFIOIOASHwpt *hwpt; ++ ++ QLIST_FOREACH(hwpt, &container->hwpt_list, next) { ++ if (!iommufd_hwpt_dirty_tracking(hwpt)) { ++ continue; ++ } ++ ++ if (!iommufd_backend_set_dirty_tracking(container->be, ++ hwpt->hwpt_id, start, NULL)) { ++ error_report("Failed to set dirty tracking hwpt_id %u errno: %d", ++ hwpt->hwpt_id, errno); ++ goto 
err; ++ } ++ } ++ ++ return 0; ++ ++err: ++ QLIST_FOREACH(hwpt, &container->hwpt_list, next) { ++ if (!iommufd_hwpt_dirty_tracking(hwpt)) { ++ continue; ++ } ++ iommufd_backend_set_dirty_tracking(container->be, ++ hwpt->hwpt_id, !start, NULL); ++ } ++ return -EINVAL; ++} ++ + static int iommufd_cdev_getfd(const char *sysfs_path, Error **errp) + { + long int ret = -ENOTTY; +@@ -759,6 +792,7 @@ static void vfio_iommu_iommufd_class_init(ObjectClass *klass, void *data) + vioc->attach_device = iommufd_cdev_attach; + vioc->detach_device = iommufd_cdev_detach; + vioc->pci_hot_reset = iommufd_cdev_pci_hot_reset; ++ vioc->set_dirty_page_tracking = iommufd_set_dirty_page_tracking; + }; + + static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, +diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h +index f6f01e4be8..4f1dbe827c 100644 +--- a/include/sysemu/iommufd.h ++++ b/include/sysemu/iommufd.h +@@ -57,6 +57,8 @@ bool iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id, + uint32_t data_type, uint32_t data_len, + void *data_ptr, uint32_t *out_hwpt, + Error **errp); ++bool iommufd_backend_set_dirty_tracking(IOMMUFDBackend *be, uint32_t hwpt_id, ++ bool start, Error **errp); + + #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd" + #endif +-- +2.41.0.windows.1 + diff --git a/vfio-iommufd-Implement-at-de-tach_hwpt-handlers.patch b/vfio-iommufd-Implement-at-de-tach_hwpt-handlers.patch new file mode 100644 index 0000000000000000000000000000000000000000..1a7314b5c8bda7f17f15e9f4996fb3c93b9f1943 --- /dev/null +++ b/vfio-iommufd-Implement-at-de-tach_hwpt-handlers.patch @@ -0,0 +1,66 @@ +From aea706f6a71ddbcc9bd342ece14991f8f8261224 Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Thu, 11 Jan 2024 17:26:50 +0800 +Subject: [PATCH] vfio/iommufd: Implement [at|de]tach_hwpt handlers + +Implement [at|de]tach_hwpt handlers in VFIO subsystem. vIOMMU +utilizes them to attach to or detach from hwpt on host side. 
+ +To achieve that, a new property vdev is added to +TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO which is initialized in +.realize() handler. + +Signed-off-by: Yi Liu +Signed-off-by: Zhenzhong Duan +[Shameer: Changed ret for host_iommu_device_iommufd_vfio_detach_hwpt()] +Signed-off-by: Shameer Kolothum +--- + hw/vfio/iommufd.c | 22 ++++++++++++++++++++++ + 1 file changed, 22 insertions(+) + +diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c +index 47a8823146..528023b95b 100644 +--- a/hw/vfio/iommufd.c ++++ b/hw/vfio/iommufd.c +@@ -827,6 +827,24 @@ static void vfio_iommu_iommufd_class_init(ObjectClass *klass, void *data) + vioc->query_dirty_bitmap = iommufd_query_dirty_bitmap; + }; + ++static bool ++host_iommu_device_iommufd_vfio_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev, ++ uint32_t hwpt_id, Error **errp) ++{ ++ VFIODevice *vbasedev = HOST_IOMMU_DEVICE(idev)->agent; ++ ++ return !iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt_id, errp); ++} ++ ++static bool ++host_iommu_device_iommufd_vfio_detach_hwpt(HostIOMMUDeviceIOMMUFD *idev, ++ Error **errp) ++{ ++ VFIODevice *vbasedev = HOST_IOMMU_DEVICE(idev)->agent; ++ ++ return !iommufd_cdev_detach_ioas_hwpt(vbasedev, errp); ++} ++ + static bool hiod_iommufd_vfio_realize_late(HostIOMMUDevice *hiod, void *opaque, + Error **errp) + { +@@ -872,9 +890,13 @@ static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, + static void hiod_iommufd_vfio_class_init(ObjectClass *oc, void *data) + { + HostIOMMUDeviceClass *hiodc = HOST_IOMMU_DEVICE_CLASS(oc); ++ HostIOMMUDeviceIOMMUFDClass *idevc = HOST_IOMMU_DEVICE_IOMMUFD_CLASS(oc); + + hiodc->realize = hiod_iommufd_vfio_realize; + hiodc->realize_late = hiod_iommufd_vfio_realize_late; ++ ++ idevc->attach_hwpt = host_iommu_device_iommufd_vfio_attach_hwpt; ++ idevc->detach_hwpt = host_iommu_device_iommufd_vfio_detach_hwpt; + }; + + static const TypeInfo types[] = { +-- +2.41.0.windows.1 + diff --git a/vfio-iommufd-Implement-the-iommufd-backend.patch
b/vfio-iommufd-Implement-the-iommufd-backend.patch new file mode 100644 index 0000000000000000000000000000000000000000..878f74cf21dbdefcebbb48edc9d05c26bab7841e --- /dev/null +++ b/vfio-iommufd-Implement-the-iommufd-backend.patch @@ -0,0 +1,553 @@ +From 5c034b7ec5ca255551956744a386288a74ab172e Mon Sep 17 00:00:00 2001 +From: Yi Liu +Date: Sat, 11 Jan 2025 10:52:40 +0800 +Subject: [PATCH] vfio/iommufd: Implement the iommufd backend +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The iommufd backend is implemented based on the new /dev/iommu user API. +This backend obviously depends on CONFIG_IOMMUFD. + +So far, the iommufd backend doesn't support dirty page sync yet. + +Co-authored-by: Eric Auger +Signed-off-by: Yi Liu +Signed-off-by: Zhenzhong Duan +Reviewed-by: Cédric Le Goater +Tested-by: Eric Auger +Tested-by: Nicolin Chen +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhou Wang +--- + hw/vfio/common.c | 6 + + hw/vfio/iommufd.c | 422 ++++++++++++++++++++++++++++++++++ + hw/vfio/meson.build | 3 + + hw/vfio/trace-events | 10 + + include/hw/vfio/vfio-common.h | 11 + + 5 files changed, 452 insertions(+) + create mode 100644 hw/vfio/iommufd.c + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index f6c2029aec..0e900c6746 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -19,6 +19,7 @@ + */ + + #include "qemu/osdep.h" ++#include CONFIG_DEVICES /* CONFIG_IOMMUFD */ + #include + #ifdef CONFIG_KVM + #include +@@ -1649,6 +1650,11 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev, + { + const VFIOIOMMUOps *ops = &vfio_legacy_ops; + ++#ifdef CONFIG_IOMMUFD ++ if (vbasedev->iommufd) { ++ ops = &vfio_iommufd_ops; ++ } ++#endif + return ops->attach_device(name, vbasedev, as, errp); + } + +diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c +new file mode 100644 +index 0000000000..6d31aeac7b +--- /dev/null ++++ b/hw/vfio/iommufd.c +@@ -0,0 +1,422 @@ ++/* ++ * iommufd container backend ++ * ++ * Copyright (C) 
2023 Intel Corporation. ++ * Copyright Red Hat, Inc. 2023 ++ * ++ * Authors: Yi Liu ++ * Eric Auger ++ * ++ * SPDX-License-Identifier: GPL-2.0-or-later ++ */ ++ ++#include "qemu/osdep.h" ++#include ++#include ++#include ++ ++#include "hw/vfio/vfio-common.h" ++#include "qemu/error-report.h" ++#include "trace.h" ++#include "qapi/error.h" ++#include "sysemu/iommufd.h" ++#include "hw/qdev-core.h" ++#include "sysemu/reset.h" ++#include "qemu/cutils.h" ++#include "qemu/chardev_open.h" ++ ++static int iommufd_cdev_map(VFIOContainerBase *bcontainer, hwaddr iova, ++ ram_addr_t size, void *vaddr, bool readonly) ++{ ++ VFIOIOMMUFDContainer *container = ++ container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); ++ ++ return iommufd_backend_map_dma(container->be, ++ container->ioas_id, ++ iova, size, vaddr, readonly); ++} ++ ++static int iommufd_cdev_unmap(VFIOContainerBase *bcontainer, ++ hwaddr iova, ram_addr_t size, ++ IOMMUTLBEntry *iotlb) ++{ ++ VFIOIOMMUFDContainer *container = ++ container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); ++ ++ /* TODO: Handle dma_unmap_bitmap with iotlb args (migration) */ ++ return iommufd_backend_unmap_dma(container->be, ++ container->ioas_id, iova, size); ++} ++ ++static int iommufd_cdev_kvm_device_add(VFIODevice *vbasedev, Error **errp) ++{ ++ return vfio_kvm_device_add_fd(vbasedev->fd, errp); ++} ++ ++static void iommufd_cdev_kvm_device_del(VFIODevice *vbasedev) ++{ ++ Error *err = NULL; ++ ++ if (vfio_kvm_device_del_fd(vbasedev->fd, &err)) { ++ error_report_err(err); ++ } ++} ++ ++static int iommufd_cdev_connect_and_bind(VFIODevice *vbasedev, Error **errp) ++{ ++ IOMMUFDBackend *iommufd = vbasedev->iommufd; ++ struct vfio_device_bind_iommufd bind = { ++ .argsz = sizeof(bind), ++ .flags = 0, ++ }; ++ int ret; ++ ++ ret = iommufd_backend_connect(iommufd, errp); ++ if (ret) { ++ return ret; ++ } ++ ++ /* ++ * Add device to kvm-vfio to be prepared for the tracking ++ * in KVM. 
Especially for some emulated devices, it requires ++ * to have kvm information in the device open. ++ */ ++ ret = iommufd_cdev_kvm_device_add(vbasedev, errp); ++ if (ret) { ++ goto err_kvm_device_add; ++ } ++ ++ /* Bind device to iommufd */ ++ bind.iommufd = iommufd->fd; ++ ret = ioctl(vbasedev->fd, VFIO_DEVICE_BIND_IOMMUFD, &bind); ++ if (ret) { ++ error_setg_errno(errp, errno, "error bind device fd=%d to iommufd=%d", ++ vbasedev->fd, bind.iommufd); ++ goto err_bind; ++ } ++ ++ vbasedev->devid = bind.out_devid; ++ trace_iommufd_cdev_connect_and_bind(bind.iommufd, vbasedev->name, ++ vbasedev->fd, vbasedev->devid); ++ return ret; ++err_bind: ++ iommufd_cdev_kvm_device_del(vbasedev); ++err_kvm_device_add: ++ iommufd_backend_disconnect(iommufd); ++ return ret; ++} ++ ++static void iommufd_cdev_unbind_and_disconnect(VFIODevice *vbasedev) ++{ ++ /* Unbind is automatically conducted when device fd is closed */ ++ iommufd_cdev_kvm_device_del(vbasedev); ++ iommufd_backend_disconnect(vbasedev->iommufd); ++} ++ ++static int iommufd_cdev_getfd(const char *sysfs_path, Error **errp) ++{ ++ long int ret = -ENOTTY; ++ char *path, *vfio_dev_path = NULL, *vfio_path = NULL; ++ DIR *dir = NULL; ++ struct dirent *dent; ++ gchar *contents; ++ struct stat st; ++ gsize length; ++ int major, minor; ++ dev_t vfio_devt; ++ ++ path = g_strdup_printf("%s/vfio-dev", sysfs_path); ++ if (stat(path, &st) < 0) { ++ error_setg_errno(errp, errno, "no such host device"); ++ goto out_free_path; ++ } ++ ++ dir = opendir(path); ++ if (!dir) { ++ error_setg_errno(errp, errno, "couldn't open directory %s", path); ++ goto out_free_path; ++ } ++ ++ while ((dent = readdir(dir))) { ++ if (!strncmp(dent->d_name, "vfio", 4)) { ++ vfio_dev_path = g_strdup_printf("%s/%s/dev", path, dent->d_name); ++ break; ++ } ++ } ++ ++ if (!vfio_dev_path) { ++ error_setg(errp, "failed to find vfio-dev/vfioX/dev"); ++ goto out_close_dir; ++ } ++ ++ if (!g_file_get_contents(vfio_dev_path, &contents, &length, NULL)) { ++ 
error_setg(errp, "failed to load \"%s\"", vfio_dev_path); ++ goto out_free_dev_path; ++ } ++ ++ if (sscanf(contents, "%d:%d", &major, &minor) != 2) { ++ error_setg(errp, "failed to get major:minor for \"%s\"", vfio_dev_path); ++ goto out_free_dev_path; ++ } ++ g_free(contents); ++ vfio_devt = makedev(major, minor); ++ ++ vfio_path = g_strdup_printf("/dev/vfio/devices/%s", dent->d_name); ++ ret = open_cdev(vfio_path, vfio_devt); ++ if (ret < 0) { ++ error_setg(errp, "Failed to open %s", vfio_path); ++ } ++ ++ trace_iommufd_cdev_getfd(vfio_path, ret); ++ g_free(vfio_path); ++ ++out_free_dev_path: ++ g_free(vfio_dev_path); ++out_close_dir: ++ closedir(dir); ++out_free_path: ++ if (*errp) { ++ error_prepend(errp, VFIO_MSG_PREFIX, path); ++ } ++ g_free(path); ++ ++ return ret; ++} ++ ++static int iommufd_cdev_attach_ioas_hwpt(VFIODevice *vbasedev, uint32_t id, ++ Error **errp) ++{ ++ int ret, iommufd = vbasedev->iommufd->fd; ++ struct vfio_device_attach_iommufd_pt attach_data = { ++ .argsz = sizeof(attach_data), ++ .flags = 0, ++ .pt_id = id, ++ }; ++ ++ /* Attach device to an IOAS or hwpt within iommufd */ ++ ret = ioctl(vbasedev->fd, VFIO_DEVICE_ATTACH_IOMMUFD_PT, &attach_data); ++ if (ret) { ++ error_setg_errno(errp, errno, ++ "[iommufd=%d] error attach %s (%d) to id=%d", ++ iommufd, vbasedev->name, vbasedev->fd, id); ++ } else { ++ trace_iommufd_cdev_attach_ioas_hwpt(iommufd, vbasedev->name, ++ vbasedev->fd, id); ++ } ++ return ret; ++} ++ ++static int iommufd_cdev_detach_ioas_hwpt(VFIODevice *vbasedev, Error **errp) ++{ ++ int ret, iommufd = vbasedev->iommufd->fd; ++ struct vfio_device_detach_iommufd_pt detach_data = { ++ .argsz = sizeof(detach_data), ++ .flags = 0, ++ }; ++ ++ ret = ioctl(vbasedev->fd, VFIO_DEVICE_DETACH_IOMMUFD_PT, &detach_data); ++ if (ret) { ++ error_setg_errno(errp, errno, "detach %s failed", vbasedev->name); ++ } else { ++ trace_iommufd_cdev_detach_ioas_hwpt(iommufd, vbasedev->name); ++ } ++ return ret; ++} ++ ++static int 
iommufd_cdev_attach_container(VFIODevice *vbasedev, ++ VFIOIOMMUFDContainer *container, ++ Error **errp) ++{ ++ return iommufd_cdev_attach_ioas_hwpt(vbasedev, container->ioas_id, errp); ++} ++ ++static void iommufd_cdev_detach_container(VFIODevice *vbasedev, ++ VFIOIOMMUFDContainer *container) ++{ ++ Error *err = NULL; ++ ++ if (iommufd_cdev_detach_ioas_hwpt(vbasedev, &err)) { ++ error_report_err(err); ++ } ++} ++ ++static void iommufd_cdev_container_destroy(VFIOIOMMUFDContainer *container) ++{ ++ VFIOContainerBase *bcontainer = &container->bcontainer; ++ ++ if (!QLIST_EMPTY(&bcontainer->device_list)) { ++ return; ++ } ++ memory_listener_unregister(&bcontainer->listener); ++ vfio_container_destroy(bcontainer); ++ iommufd_backend_free_id(container->be, container->ioas_id); ++ g_free(container); ++} ++ ++static int iommufd_cdev_ram_block_discard_disable(bool state) ++{ ++ /* ++ * We support coordinated discarding of RAM via the RamDiscardManager. ++ */ ++ return ram_block_uncoordinated_discard_disable(state); ++} ++ ++static int iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, ++ AddressSpace *as, Error **errp) ++{ ++ VFIOContainerBase *bcontainer; ++ VFIOIOMMUFDContainer *container; ++ VFIOAddressSpace *space; ++ struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) }; ++ int ret, devfd; ++ uint32_t ioas_id; ++ Error *err = NULL; ++ ++ devfd = iommufd_cdev_getfd(vbasedev->sysfsdev, errp); ++ if (devfd < 0) { ++ return devfd; ++ } ++ vbasedev->fd = devfd; ++ ++ ret = iommufd_cdev_connect_and_bind(vbasedev, errp); ++ if (ret) { ++ goto err_connect_bind; ++ } ++ ++ space = vfio_get_address_space(as); ++ ++ /* try to attach to an existing container in this space */ ++ QLIST_FOREACH(bcontainer, &space->containers, next) { ++ container = container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); ++ if (bcontainer->ops != &vfio_iommufd_ops || ++ vbasedev->iommufd != container->be) { ++ continue; ++ } ++ if (iommufd_cdev_attach_container(vbasedev, 
container, &err)) { ++ const char *msg = error_get_pretty(err); ++ ++ trace_iommufd_cdev_fail_attach_existing_container(msg); ++ error_free(err); ++ err = NULL; ++ } else { ++ ret = iommufd_cdev_ram_block_discard_disable(true); ++ if (ret) { ++ error_setg(errp, ++ "Cannot set discarding of RAM broken (%d)", ret); ++ goto err_discard_disable; ++ } ++ goto found_container; ++ } ++ } ++ ++ /* Need to allocate a new dedicated container */ ++ ret = iommufd_backend_alloc_ioas(vbasedev->iommufd, &ioas_id, errp); ++ if (ret < 0) { ++ goto err_alloc_ioas; ++ } ++ ++ trace_iommufd_cdev_alloc_ioas(vbasedev->iommufd->fd, ioas_id); ++ ++ container = g_malloc0(sizeof(*container)); ++ container->be = vbasedev->iommufd; ++ container->ioas_id = ioas_id; ++ ++ bcontainer = &container->bcontainer; ++ vfio_container_init(bcontainer, space, &vfio_iommufd_ops); ++ QLIST_INSERT_HEAD(&space->containers, bcontainer, next); ++ ++ ret = iommufd_cdev_attach_container(vbasedev, container, errp); ++ if (ret) { ++ goto err_attach_container; ++ } ++ ++ ret = iommufd_cdev_ram_block_discard_disable(true); ++ if (ret) { ++ goto err_discard_disable; ++ } ++ ++ bcontainer->pgsizes = qemu_real_host_page_size(); ++ ++ bcontainer->listener = vfio_memory_listener; ++ memory_listener_register(&bcontainer->listener, bcontainer->space->as); ++ ++ if (bcontainer->error) { ++ ret = -1; ++ error_propagate_prepend(errp, bcontainer->error, ++ "memory listener initialization failed: "); ++ goto err_listener_register; ++ } ++ ++ bcontainer->initialized = true; ++ ++found_container: ++ ret = ioctl(devfd, VFIO_DEVICE_GET_INFO, &dev_info); ++ if (ret) { ++ error_setg_errno(errp, errno, "error getting device info"); ++ goto err_listener_register; ++ } ++ ++ /* ++ * TODO: examine RAM_BLOCK_DISCARD stuff, should we do group level ++ * for discarding incompatibility check as well? 
++ */ ++ if (vbasedev->ram_block_discard_allowed) { ++ iommufd_cdev_ram_block_discard_disable(false); ++ } ++ ++ vbasedev->group = 0; ++ vbasedev->num_irqs = dev_info.num_irqs; ++ vbasedev->num_regions = dev_info.num_regions; ++ vbasedev->flags = dev_info.flags; ++ vbasedev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET); ++ vbasedev->bcontainer = bcontainer; ++ QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next); ++ QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next); ++ ++ trace_iommufd_cdev_device_info(vbasedev->name, devfd, vbasedev->num_irqs, ++ vbasedev->num_regions, vbasedev->flags); ++ return 0; ++ ++err_listener_register: ++ iommufd_cdev_ram_block_discard_disable(false); ++err_discard_disable: ++ iommufd_cdev_detach_container(vbasedev, container); ++err_attach_container: ++ iommufd_cdev_container_destroy(container); ++err_alloc_ioas: ++ vfio_put_address_space(space); ++ iommufd_cdev_unbind_and_disconnect(vbasedev); ++err_connect_bind: ++ close(vbasedev->fd); ++ return ret; ++} ++ ++static void iommufd_cdev_detach(VFIODevice *vbasedev) ++{ ++ VFIOContainerBase *bcontainer = vbasedev->bcontainer; ++ VFIOAddressSpace *space = bcontainer->space; ++ VFIOIOMMUFDContainer *container = container_of(bcontainer, ++ VFIOIOMMUFDContainer, ++ bcontainer); ++ QLIST_REMOVE(vbasedev, global_next); ++ QLIST_REMOVE(vbasedev, container_next); ++ vbasedev->bcontainer = NULL; ++ ++ if (!vbasedev->ram_block_discard_allowed) { ++ iommufd_cdev_ram_block_discard_disable(false); ++ } ++ ++ iommufd_cdev_detach_container(vbasedev, container); ++ iommufd_cdev_container_destroy(container); ++ vfio_put_address_space(space); ++ ++ iommufd_cdev_unbind_and_disconnect(vbasedev); ++ close(vbasedev->fd); ++} ++ ++const VFIOIOMMUOps vfio_iommufd_ops = { ++ .dma_map = iommufd_cdev_map, ++ .dma_unmap = iommufd_cdev_unmap, ++ .attach_device = iommufd_cdev_attach, ++ .detach_device = iommufd_cdev_detach, ++}; +diff --git a/hw/vfio/meson.build 
b/hw/vfio/meson.build +index 32a6933280..bd5cc4ca79 100644 +--- a/hw/vfio/meson.build ++++ b/hw/vfio/meson.build +@@ -7,6 +7,9 @@ vfio_ss.add(files( + 'spapr.c', + 'migration.c', + )) ++vfio_ss.add(when: 'CONFIG_IOMMUFD', if_true: files( ++ 'iommufd.c', ++)) + vfio_ss.add(when: 'CONFIG_VFIO_PCI', if_true: files( + 'display.c', + 'pci-quirks.c', +diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events +index 08a1f9dfa4..3340c93af0 100644 +--- a/hw/vfio/trace-events ++++ b/hw/vfio/trace-events +@@ -164,3 +164,13 @@ vfio_state_pending_estimate(const char *name, uint64_t precopy, uint64_t postcop + vfio_state_pending_exact(const char *name, uint64_t precopy, uint64_t postcopy, uint64_t stopcopy_size, uint64_t precopy_init_size, uint64_t precopy_dirty_size) " (%s) precopy 0x%"PRIx64" postcopy 0x%"PRIx64" stopcopy size 0x%"PRIx64" precopy initial size 0x%"PRIx64" precopy dirty size 0x%"PRIx64 + vfio_vmstate_change(const char *name, int running, const char *reason, const char *dev_state) " (%s) running %d reason %s device state %s" + vfio_vmstate_change_prepare(const char *name, int running, const char *reason, const char *dev_state) " (%s) running %d reason %s device state %s" ++ ++#iommufd.c ++ ++iommufd_cdev_connect_and_bind(int iommufd, const char *name, int devfd, int devid) " [iommufd=%d] Successfully bound device %s (fd=%d): output devid=%d" ++iommufd_cdev_getfd(const char *dev, int devfd) " %s (fd=%d)" ++iommufd_cdev_attach_ioas_hwpt(int iommufd, const char *name, int devfd, int id) " [iommufd=%d] Successfully attached device %s (%d) to id=%d" ++iommufd_cdev_detach_ioas_hwpt(int iommufd, const char *name) " [iommufd=%d] Successfully detached %s" ++iommufd_cdev_fail_attach_existing_container(const char *msg) " %s" ++iommufd_cdev_alloc_ioas(int iommufd, int ioas_id) " [iommufd=%d] new IOMMUFD container with ioasid=%d" ++iommufd_cdev_device_info(char *name, int devfd, int num_irqs, int num_regions, int flags) " %s (%d) num_irqs=%d num_regions=%d flags=%d" +diff 
--git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index 9e22acbfb6..9b9fd7b461 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -99,6 +99,14 @@ typedef struct VFIOHostDMAWindow { + QLIST_ENTRY(VFIOHostDMAWindow) hostwin_next; + } VFIOHostDMAWindow; + ++typedef struct IOMMUFDBackend IOMMUFDBackend; ++ ++typedef struct VFIOIOMMUFDContainer { ++ VFIOContainerBase bcontainer; ++ IOMMUFDBackend *be; ++ uint32_t ioas_id; ++} VFIOIOMMUFDContainer; ++ + typedef struct VFIODeviceOps VFIODeviceOps; + + typedef struct VFIODevice { +@@ -126,6 +134,8 @@ typedef struct VFIODevice { + OnOffAuto pre_copy_dirty_page_tracking; + bool dirty_pages_supported; + bool dirty_tracking; ++ int devid; ++ IOMMUFDBackend *iommufd; + } VFIODevice; + + struct VFIODeviceOps { +@@ -215,6 +225,7 @@ typedef QLIST_HEAD(VFIODeviceList, VFIODevice) VFIODeviceList; + extern VFIOGroupList vfio_group_list; + extern VFIODeviceList vfio_device_list; + extern const VFIOIOMMUOps vfio_legacy_ops; ++extern const VFIOIOMMUOps vfio_iommufd_ops; + extern const MemoryListener vfio_memory_listener; + extern int vfio_kvm_device_fd; + +-- +2.41.0.windows.1 + diff --git a/vfio-iommufd-Introduce-a-VFIOIOMMU-iommufd-QOM-inter.patch b/vfio-iommufd-Introduce-a-VFIOIOMMU-iommufd-QOM-inter.patch new file mode 100644 index 0000000000000000000000000000000000000000..dddcb04f8e9f22bdf3a756b4ad6cbd3f6b53fa3d --- /dev/null +++ b/vfio-iommufd-Introduce-a-VFIOIOMMU-iommufd-QOM-inter.patch @@ -0,0 +1,145 @@ +From 66f71e9acdaa0c1c31770f00a21ea32644ebaac9 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= +Date: Tue, 19 Dec 2023 07:58:23 +0100 +Subject: [PATCH] vfio/iommufd: Introduce a VFIOIOMMU iommufd QOM interface +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +As previously done for the sPAPR and legacy IOMMU backends, convert +the VFIOIOMMUOps struct to a QOM interface. 
The set of operations +for this backend can be referenced with a literal typename instead of +a C struct. + +Reviewed-by: Zhenzhong Duan +Tested-by: Eric Farman +Signed-off-by: Cédric Le Goater +--- + hw/vfio/common.c | 2 +- + hw/vfio/iommufd.c | 35 ++++++++++++++++++++------- + include/hw/vfio/vfio-common.h | 1 - + include/hw/vfio/vfio-container-base.h | 2 +- + 4 files changed, 28 insertions(+), 12 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index d98c3b7422..a8b7129fa5 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -1654,7 +1654,7 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev, + + #ifdef CONFIG_IOMMUFD + if (vbasedev->iommufd) { +- ops = &vfio_iommufd_ops; ++ ops = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD)); + } + #endif + +diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c +index 87a561c545..d4c586e842 100644 +--- a/hw/vfio/iommufd.c ++++ b/hw/vfio/iommufd.c +@@ -319,6 +319,8 @@ static int iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, + int ret, devfd; + uint32_t ioas_id; + Error *err = NULL; ++ const VFIOIOMMUClass *iommufd_vioc = ++ VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD)); + + if (vbasedev->fd < 0) { + devfd = iommufd_cdev_getfd(vbasedev->sysfsdev, errp); +@@ -340,7 +342,7 @@ static int iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, + /* try to attach to an existing container in this space */ + QLIST_FOREACH(bcontainer, &space->containers, next) { + container = container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); +- if (bcontainer->ops != &vfio_iommufd_ops || ++ if (bcontainer->ops != iommufd_vioc || + vbasedev->iommufd != container->be) { + continue; + } +@@ -374,7 +376,7 @@ static int iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, + container->ioas_id = ioas_id; + + bcontainer = &container->bcontainer; +- vfio_container_init(bcontainer, space, &vfio_iommufd_ops); ++ vfio_container_init(bcontainer, space, iommufd_vioc); 
+ QLIST_INSERT_HEAD(&space->containers, bcontainer, next); + + ret = iommufd_cdev_attach_container(vbasedev, container, errp); +@@ -476,9 +478,11 @@ static void iommufd_cdev_detach(VFIODevice *vbasedev) + static VFIODevice *iommufd_cdev_pci_find_by_devid(__u32 devid) + { + VFIODevice *vbasedev_iter; ++ const VFIOIOMMUClass *iommufd_vioc = ++ VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD)); + + QLIST_FOREACH(vbasedev_iter, &vfio_device_list, global_next) { +- if (vbasedev_iter->bcontainer->ops != &vfio_iommufd_ops) { ++ if (vbasedev_iter->bcontainer->ops != iommufd_vioc) { + continue; + } + if (devid == vbasedev_iter->devid) { +@@ -621,10 +625,23 @@ out_single: + return ret; + } + +-const VFIOIOMMUOps vfio_iommufd_ops = { +- .dma_map = iommufd_cdev_map, +- .dma_unmap = iommufd_cdev_unmap, +- .attach_device = iommufd_cdev_attach, +- .detach_device = iommufd_cdev_detach, +- .pci_hot_reset = iommufd_cdev_pci_hot_reset, ++static void vfio_iommu_iommufd_class_init(ObjectClass *klass, void *data) ++{ ++ VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass); ++ ++ vioc->dma_map = iommufd_cdev_map; ++ vioc->dma_unmap = iommufd_cdev_unmap; ++ vioc->attach_device = iommufd_cdev_attach; ++ vioc->detach_device = iommufd_cdev_detach; ++ vioc->pci_hot_reset = iommufd_cdev_pci_hot_reset; + }; ++ ++static const TypeInfo types[] = { ++ { ++ .name = TYPE_VFIO_IOMMU_IOMMUFD, ++ .parent = TYPE_VFIO_IOMMU, ++ .class_init = vfio_iommu_iommufd_class_init, ++ }, ++}; ++ ++DEFINE_TYPES(types) +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index f78a97006c..f3966410c1 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -224,7 +224,6 @@ typedef QLIST_HEAD(VFIOGroupList, VFIOGroup) VFIOGroupList; + typedef QLIST_HEAD(VFIODeviceList, VFIODevice) VFIODeviceList; + extern VFIOGroupList vfio_group_list; + extern VFIODeviceList vfio_device_list; +-extern const VFIOIOMMUOps vfio_iommufd_ops; + extern const MemoryListener 
vfio_memory_listener; + extern int vfio_kvm_device_fd; + +diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h +index 1085109d0c..c12ce4dfcb 100644 +--- a/include/hw/vfio/vfio-container-base.h ++++ b/include/hw/vfio/vfio-container-base.h +@@ -17,7 +17,6 @@ + + typedef struct VFIODevice VFIODevice; + typedef struct VFIOIOMMUClass VFIOIOMMUClass; +-#define VFIOIOMMUOps VFIOIOMMUClass /* To remove */ + + typedef struct { + unsigned long *bitmap; +@@ -96,6 +95,7 @@ void vfio_container_destroy(VFIOContainerBase *bcontainer); + #define TYPE_VFIO_IOMMU "vfio-iommu" + #define TYPE_VFIO_IOMMU_LEGACY TYPE_VFIO_IOMMU "-legacy" + #define TYPE_VFIO_IOMMU_SPAPR TYPE_VFIO_IOMMU "-spapr" ++#define TYPE_VFIO_IOMMU_IOMMUFD TYPE_VFIO_IOMMU "-iommufd" + + /* + * VFIOContainerBase is not an abstract QOM object because it felt +-- +2.41.0.windows.1 + diff --git a/vfio-iommufd-Introduce-auto-domain-creation.patch b/vfio-iommufd-Introduce-auto-domain-creation.patch new file mode 100644 index 0000000000000000000000000000000000000000..b4cd4c2baa62ffef4b4748c9722b2002732e5750 --- /dev/null +++ b/vfio-iommufd-Introduce-auto-domain-creation.patch @@ -0,0 +1,275 @@ +From 630efd6ca2f0c9383223f0ea092abda1c7528f21 Mon Sep 17 00:00:00 2001 +From: Joao Martins +Date: Mon, 22 Jul 2024 22:13:18 +0100 +Subject: [PATCH] vfio/iommufd: Introduce auto domain creation +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +There's generally two modes of operation for IOMMUFD: + +1) The simple user API which intends to perform relatively simple things +with IOMMUs e.g. DPDK. The process generally creates an IOAS and attaches +to VFIO and mainly performs IOAS_MAP and UNMAP. + +2) The native IOMMUFD API where you have fine grained control of the +IOMMU domain and model it accordingly. This is where most new feature +are being steered to. 
+ +For dirty tracking 2) is required, as it needs to ensure that +the stage-2/parent IOMMU domain will only attach devices +that support dirty tracking (so far it is all homogeneous in x86, likely +not the case for smmuv3). Such invariant on dirty tracking provides a +useful guarantee to VMMs that will refuse incompatible device +attachments for IOMMU domains. + +Dirty tracking insurance is enforced via HWPT_ALLOC, which is +responsible for creating an IOMMU domain. This is in contrast to the +'simple API' where the IOMMU domain is created by IOMMUFD automatically +when it attaches to VFIO (usually referred to as autodomains) but it has +the needed handling for mdevs. + +To support dirty tracking with the advanced IOMMUFD API, it needs +similar logic, where IOMMU domains are created and devices attached to +compatible domains. Essentially mimicking kernel +iommufd_device_auto_get_domain(). With mdevs given there's no IOMMU domain +it falls back to IOAS attach. + +The auto domain logic allows different IOMMU domains to be created when +DMA dirty tracking is not desired (and VF can provide it), and others where +it is. Here it is not used in this way given how VFIODevice migration +state is initialized after the device attachment. But such mixed mode of +IOMMU dirty tracking + device dirty tracking is an improvement that can +be added on. Keep the 'all or nothing' of type1 approach that we have +been using so far between container vs device dirty tracking. 
+ +Signed-off-by: Joao Martins +Reviewed-by: Zhenzhong Duan +[ clg: Added ERRP_GUARD() in iommufd_cdev_autodomains_get() ] +Signed-off-by: Cédric Le Goater +Reviewed-by: Eric Auger +[Shameer: Changed ret for iommufd_cdev_autodomains_get() ] +Signed-off-by: Shameer Kolothum +--- + backends/iommufd.c | 30 +++++++++++++ + backends/trace-events | 1 + + hw/vfio/iommufd.c | 85 +++++++++++++++++++++++++++++++++++ + include/hw/vfio/vfio-common.h | 9 ++++ + include/sysemu/iommufd.h | 5 +++ + 5 files changed, 130 insertions(+) + +diff --git a/backends/iommufd.c b/backends/iommufd.c +index 1ce2a24226..0d995d7563 100644 +--- a/backends/iommufd.c ++++ b/backends/iommufd.c +@@ -223,6 +223,36 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, + return ret; + } + ++bool iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id, ++ uint32_t pt_id, uint32_t flags, ++ uint32_t data_type, uint32_t data_len, ++ void *data_ptr, uint32_t *out_hwpt, ++ Error **errp) ++{ ++ int ret, fd = be->fd; ++ struct iommu_hwpt_alloc alloc_hwpt = { ++ .size = sizeof(struct iommu_hwpt_alloc), ++ .flags = flags, ++ .dev_id = dev_id, ++ .pt_id = pt_id, ++ .data_type = data_type, ++ .data_len = data_len, ++ .data_uptr = (uintptr_t)data_ptr, ++ }; ++ ++ ret = ioctl(fd, IOMMU_HWPT_ALLOC, &alloc_hwpt); ++ trace_iommufd_backend_alloc_hwpt(fd, dev_id, pt_id, flags, data_type, ++ data_len, (uintptr_t)data_ptr, ++ alloc_hwpt.out_hwpt_id, ret); ++ if (ret) { ++ error_setg_errno(errp, errno, "Failed to allocate hwpt"); ++ return false; ++ } ++ ++ *out_hwpt = alloc_hwpt.out_hwpt_id; ++ return true; ++} ++ + bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, + uint32_t *type, void *data, uint32_t len, + uint64_t *caps, Error **errp) +diff --git a/backends/trace-events b/backends/trace-events +index d45c6e31a6..e248bf039e 100644 +--- a/backends/trace-events ++++ b/backends/trace-events +@@ -14,4 +14,5 @@ iommufd_backend_map_dma(int iommufd, uint32_t ioas, uint64_t iova, 
uint64_t size + iommufd_backend_unmap_dma_non_exist(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " Unmap nonexistent mapping: iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)" + iommufd_backend_unmap_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)" + iommufd_backend_alloc_ioas(int iommufd, uint32_t ioas, int ret) " iommufd=%d ioas=%d (%d)" ++iommufd_backend_alloc_hwpt(int iommufd, uint32_t dev_id, uint32_t pt_id, uint32_t flags, uint32_t hwpt_type, uint32_t len, uint64_t data_ptr, uint32_t out_hwpt_id, int ret) " iommufd=%d dev_id=%u pt_id=%u flags=0x%x hwpt_type=%u len=%u data_ptr=0x%"PRIx64" out_hwpt=%u (%d)" + iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d id=%d (%d)" +diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c +index 5e7788ed59..3b75cba26c 100644 +--- a/hw/vfio/iommufd.c ++++ b/hw/vfio/iommufd.c +@@ -225,10 +225,89 @@ static int iommufd_cdev_detach_ioas_hwpt(VFIODevice *vbasedev, Error **errp) + return ret; + } + ++static int iommufd_cdev_autodomains_get(VFIODevice *vbasedev, ++ VFIOIOMMUFDContainer *container, ++ Error **errp) ++{ ++ ERRP_GUARD(); ++ IOMMUFDBackend *iommufd = vbasedev->iommufd; ++ uint32_t flags = 0; ++ VFIOIOASHwpt *hwpt; ++ uint32_t hwpt_id; ++ int ret; ++ ++ /* Try to find a domain */ ++ QLIST_FOREACH(hwpt, &container->hwpt_list, next) { ++ ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, errp); ++ if (ret) { ++ /* -EINVAL means the domain is incompatible with the device. */ ++ if (ret == -EINVAL) { ++ /* ++ * It is an expected failure and it just means we will try ++ * another domain, or create one if no existing compatible ++ * domain is found. Hence why the error is discarded below. 
++ */ ++ error_free(*errp); ++ *errp = NULL; ++ continue; ++ } ++ ++ return ret; ++ } else { ++ vbasedev->hwpt = hwpt; ++ QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next); ++ return 0; ++ } ++ } ++ ++ if (!iommufd_backend_alloc_hwpt(iommufd, vbasedev->devid, ++ container->ioas_id, flags, ++ IOMMU_HWPT_DATA_NONE, 0, NULL, ++ &hwpt_id, errp)) { ++ return -EINVAL; ++ } ++ ++ hwpt = g_malloc0(sizeof(*hwpt)); ++ hwpt->hwpt_id = hwpt_id; ++ QLIST_INIT(&hwpt->device_list); ++ ++ ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, errp); ++ if (ret) { ++ iommufd_backend_free_id(container->be, hwpt->hwpt_id); ++ g_free(hwpt); ++ return ret; ++ } ++ ++ vbasedev->hwpt = hwpt; ++ QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next); ++ QLIST_INSERT_HEAD(&container->hwpt_list, hwpt, next); ++ return 0; ++} ++ ++static void iommufd_cdev_autodomains_put(VFIODevice *vbasedev, ++ VFIOIOMMUFDContainer *container) ++{ ++ VFIOIOASHwpt *hwpt = vbasedev->hwpt; ++ ++ QLIST_REMOVE(vbasedev, hwpt_next); ++ vbasedev->hwpt = NULL; ++ ++ if (QLIST_EMPTY(&hwpt->device_list)) { ++ QLIST_REMOVE(hwpt, next); ++ iommufd_backend_free_id(container->be, hwpt->hwpt_id); ++ g_free(hwpt); ++ } ++} ++ + static int iommufd_cdev_attach_container(VFIODevice *vbasedev, + VFIOIOMMUFDContainer *container, + Error **errp) + { ++ /* mdevs aren't physical devices and will fail with auto domains */ ++ if (!vbasedev->mdev) { ++ return iommufd_cdev_autodomains_get(vbasedev, container, errp); ++ } ++ + return iommufd_cdev_attach_ioas_hwpt(vbasedev, container->ioas_id, errp); + } + +@@ -240,6 +319,11 @@ static void iommufd_cdev_detach_container(VFIODevice *vbasedev, + if (iommufd_cdev_detach_ioas_hwpt(vbasedev, &err)) { + error_report_err(err); + } ++ ++ if (vbasedev->hwpt) { ++ iommufd_cdev_autodomains_put(vbasedev, container); ++ } ++ + } + + static void iommufd_cdev_container_destroy(VFIOIOMMUFDContainer *container) +@@ -375,6 +459,7 @@ static int iommufd_cdev_attach(const char *name, 
VFIODevice *vbasedev, + container = g_malloc0(sizeof(*container)); + container->be = vbasedev->iommufd; + container->ioas_id = ioas_id; ++ QLIST_INIT(&container->hwpt_list); + + bcontainer = &container->bcontainer; + vfio_container_init(bcontainer, space, iommufd_vioc); +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index e49e5fabba..2093ed2e91 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -107,10 +107,17 @@ typedef struct VFIOHostDMAWindow { + + typedef struct IOMMUFDBackend IOMMUFDBackend; + ++typedef struct VFIOIOASHwpt { ++ uint32_t hwpt_id; ++ QLIST_HEAD(, VFIODevice) device_list; ++ QLIST_ENTRY(VFIOIOASHwpt) next; ++} VFIOIOASHwpt; ++ + typedef struct VFIOIOMMUFDContainer { + VFIOContainerBase bcontainer; + IOMMUFDBackend *be; + uint32_t ioas_id; ++ QLIST_HEAD(, VFIOIOASHwpt) hwpt_list; + } VFIOIOMMUFDContainer; + + typedef struct VFIODeviceOps VFIODeviceOps; +@@ -144,6 +151,8 @@ typedef struct VFIODevice { + HostIOMMUDevice *hiod; + int devid; + IOMMUFDBackend *iommufd; ++ VFIOIOASHwpt *hwpt; ++ QLIST_ENTRY(VFIODevice) hwpt_next; + } VFIODevice; + + struct VFIODeviceOps { +diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h +index a0a0143856..f6f01e4be8 100644 +--- a/include/sysemu/iommufd.h ++++ b/include/sysemu/iommufd.h +@@ -52,6 +52,11 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, + bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, + uint32_t *type, void *data, uint32_t len, + uint64_t *caps, Error **errp); ++bool iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id, ++ uint32_t pt_id, uint32_t flags, ++ uint32_t data_type, uint32_t data_len, ++ void *data_ptr, uint32_t *out_hwpt, ++ Error **errp); + + #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd" + #endif +-- +2.41.0.windows.1 + diff --git a/vfio-iommufd-Probe-and-request-hwpt-dirty-tracking-c.patch 
b/vfio-iommufd-Probe-and-request-hwpt-dirty-tracking-c.patch new file mode 100644 index 0000000000000000000000000000000000000000..425bf09c37e02dbc1e4e763a3b750175af02d7f0 --- /dev/null +++ b/vfio-iommufd-Probe-and-request-hwpt-dirty-tracking-c.patch @@ -0,0 +1,119 @@ +From db8ef4524568c2379c25986db6e30cb0f6c0ec05 Mon Sep 17 00:00:00 2001 +From: Joao Martins +Date: Mon, 22 Jul 2024 22:13:22 +0100 +Subject: [PATCH] vfio/iommufd: Probe and request hwpt dirty tracking + capability +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +In preparation to using the dirty tracking UAPI, probe whether the IOMMU +supports dirty tracking. This is done via the data stored in +hiod::caps::hw_caps initialized from GET_HW_INFO. + +Qemu doesn't know if VF dirty tracking is supported when allocating +hardware pagetable in iommufd_cdev_autodomains_get(). This is because +VFIODevice migration state hasn't been initialized *yet* hence it can't pick +between VF dirty tracking vs IOMMU dirty tracking. So, if IOMMU supports +dirty tracking it always creates HWPTs with IOMMU_HWPT_ALLOC_DIRTY_TRACKING +even if later on VFIOMigration decides to use VF dirty tracking instead. 
+ +Signed-off-by: Joao Martins +[ clg: - Fixed vbasedev->iommu_dirty_tracking assignment in + iommufd_cdev_autodomains_get() + - Added warning for heterogeneous dirty page tracking support + in iommufd_cdev_autodomains_get() ] +Signed-off-by: Cédric Le Goater +Reviewed-by: Zhenzhong Duan +--- + hw/vfio/iommufd.c | 26 ++++++++++++++++++++++++++ + include/hw/vfio/vfio-common.h | 2 ++ + 2 files changed, 28 insertions(+) + +diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c +index 8fd6826826..a9400d8107 100644 +--- a/hw/vfio/iommufd.c ++++ b/hw/vfio/iommufd.c +@@ -114,6 +114,11 @@ static void iommufd_cdev_unbind_and_disconnect(VFIODevice *vbasedev) + iommufd_backend_disconnect(vbasedev->iommufd); + } + ++static bool iommufd_hwpt_dirty_tracking(VFIOIOASHwpt *hwpt) ++{ ++ return hwpt && hwpt->hwpt_flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; ++} ++ + static int iommufd_cdev_getfd(const char *sysfs_path, Error **errp) + { + long int ret = -ENOTTY; +@@ -256,10 +261,22 @@ static int iommufd_cdev_autodomains_get(VFIODevice *vbasedev, + } else { + vbasedev->hwpt = hwpt; + QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next); ++ vbasedev->iommu_dirty_tracking = iommufd_hwpt_dirty_tracking(hwpt); + return 0; + } + } + ++ /* ++ * This is quite early and VFIO Migration state isn't yet fully ++ * initialized, thus rely only on IOMMU hardware capabilities as to ++ * whether IOMMU dirty tracking is going to be requested. Later ++ * vfio_migration_realize() may decide to use VF dirty tracking ++ * instead. 
++ */ ++ if (vbasedev->hiod->caps.hw_caps & IOMMU_HW_CAP_DIRTY_TRACKING) { ++ flags = IOMMU_HWPT_ALLOC_DIRTY_TRACKING; ++ } ++ + if (!iommufd_backend_alloc_hwpt(iommufd, vbasedev->devid, + container->ioas_id, flags, + IOMMU_HWPT_DATA_NONE, 0, NULL, +@@ -269,6 +286,7 @@ static int iommufd_cdev_autodomains_get(VFIODevice *vbasedev, + + hwpt = g_malloc0(sizeof(*hwpt)); + hwpt->hwpt_id = hwpt_id; ++ hwpt->hwpt_flags = flags; + QLIST_INIT(&hwpt->device_list); + + ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, errp); +@@ -279,8 +297,16 @@ static int iommufd_cdev_autodomains_get(VFIODevice *vbasedev, + } + + vbasedev->hwpt = hwpt; ++ vbasedev->iommu_dirty_tracking = iommufd_hwpt_dirty_tracking(hwpt); + QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next); + QLIST_INSERT_HEAD(&container->hwpt_list, hwpt, next); ++ container->bcontainer.dirty_pages_supported |= ++ vbasedev->iommu_dirty_tracking; ++ if (container->bcontainer.dirty_pages_supported && ++ !vbasedev->iommu_dirty_tracking) { ++ warn_report("IOMMU instance for device %s doesn't support dirty tracking", ++ vbasedev->name); ++ } + return 0; + } + +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index 63da291456..22a7386591 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -109,6 +109,7 @@ typedef struct IOMMUFDBackend IOMMUFDBackend; + + typedef struct VFIOIOASHwpt { + uint32_t hwpt_id; ++ uint32_t hwpt_flags; + QLIST_HEAD(, VFIODevice) device_list; + QLIST_ENTRY(VFIOIOASHwpt) next; + } VFIOIOASHwpt; +@@ -148,6 +149,7 @@ typedef struct VFIODevice { + OnOffAuto pre_copy_dirty_page_tracking; + bool dirty_pages_supported; + bool dirty_tracking; ++ bool iommu_dirty_tracking; + HostIOMMUDevice *hiod; + int devid; + IOMMUFDBackend *iommufd; +-- +2.41.0.windows.1 + diff --git a/vfio-iommufd-Relax-assert-check-for-iommufd-backend.patch b/vfio-iommufd-Relax-assert-check-for-iommufd-backend.patch new file mode 100644 index 
0000000000000000000000000000000000000000..a73228e5421aa128de80102c7564b5d058b3c3ca --- /dev/null +++ b/vfio-iommufd-Relax-assert-check-for-iommufd-backend.patch @@ -0,0 +1,63 @@ +From cb2bd16a67cd45a0ad3318098120aee10a298f3b Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Sat, 11 Jan 2025 10:52:41 +0800 +Subject: [PATCH] vfio/iommufd: Relax assert check for iommufd backend +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Currently iommufd doesn't support dirty page sync yet, +but it will not block us doing live migration if VFIO +migration is force enabled. + +So in this case we allow set_dirty_page_tracking to be NULL. +Note we don't need same change for query_dirty_bitmap because +when dirty page sync isn't supported, query_dirty_bitmap will +never be called. + +Suggested-by: Cédric Le Goater +Signed-off-by: Zhenzhong Duan +Reviewed-by: Cédric Le Goater +Reviewed-by: Eric Auger +Tested-by: Eric Auger +Tested-by: Nicolin Chen +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhou Wang +--- + hw/vfio/container-base.c | 4 ++++ + hw/vfio/container.c | 4 ---- + 2 files changed, 4 insertions(+), 4 deletions(-) + +diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c +index 71f7274973..eee2dcfe76 100644 +--- a/hw/vfio/container-base.c ++++ b/hw/vfio/container-base.c +@@ -55,6 +55,10 @@ void vfio_container_del_section_window(VFIOContainerBase *bcontainer, + int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer, + bool start) + { ++ if (!bcontainer->dirty_pages_supported) { ++ return 0; ++ } ++ + g_assert(bcontainer->ops->set_dirty_page_tracking); + return bcontainer->ops->set_dirty_page_tracking(bcontainer, start); + } +diff --git a/hw/vfio/container.c b/hw/vfio/container.c +index 62af0f2bdd..4936b8f27f 100644 +--- a/hw/vfio/container.c ++++ b/hw/vfio/container.c +@@ -266,10 +266,6 @@ static int vfio_legacy_set_dirty_page_tracking(VFIOContainerBase *bcontainer, + .argsz = sizeof(dirty), + 
}; + +- if (!bcontainer->dirty_pages_supported) { +- return 0; +- } +- + if (start) { + dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START; + } else { +-- +2.41.0.windows.1 + diff --git a/vfio-iommufd-Remove-CONFIG_IOMMUFD-usage.patch b/vfio-iommufd-Remove-CONFIG_IOMMUFD-usage.patch new file mode 100644 index 0000000000000000000000000000000000000000..9e0aea77a1a307cdea2d1dfc87c6bf86f4fe3640 --- /dev/null +++ b/vfio-iommufd-Remove-CONFIG_IOMMUFD-usage.patch @@ -0,0 +1,46 @@ +From 188948043652fbcdd4505fd9672e57bc61647159 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= +Date: Tue, 19 Dec 2023 07:58:25 +0100 +Subject: [PATCH] vfio/iommufd: Remove CONFIG_IOMMUFD usage +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Availability of the IOMMUFD backend can now be fully determined at +runtime and the ifdef check was a build time protection (for PPC not +supporting it mostly). + +Reviewed-by: Zhenzhong Duan +Tested-by: Eric Farman +Signed-off-by: Cédric Le Goater +--- + hw/vfio/common.c | 3 --- + 1 file changed, 3 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index a8b7129fa5..b5d02df0c2 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -19,7 +19,6 @@ + */ + + #include "qemu/osdep.h" +-#include CONFIG_DEVICES /* CONFIG_IOMMUFD */ + #include + #ifdef CONFIG_KVM + #include +@@ -1652,11 +1651,9 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev, + const VFIOIOMMUClass *ops = + VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY)); + +-#ifdef CONFIG_IOMMUFD + if (vbasedev->iommufd) { + ops = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD)); + } +-#endif + + assert(ops); + +-- +2.41.0.windows.1 + diff --git a/vfio-iommufd-Return-errno-in-iommufd_cdev_attach_ioa.patch b/vfio-iommufd-Return-errno-in-iommufd_cdev_attach_ioa.patch new file mode 100644 index 0000000000000000000000000000000000000000..04cea10882383896e4ead1f6d29329ba377e1d33 --- /dev/null +++ 
b/vfio-iommufd-Return-errno-in-iommufd_cdev_attach_ioa.patch @@ -0,0 +1,46 @@ +From 56e5b9cf8e4041a023daca1ce439ca14619afa97 Mon Sep 17 00:00:00 2001 +From: Joao Martins +Date: Fri, 19 Jul 2024 13:04:52 +0100 +Subject: [PATCH] vfio/iommufd: Return errno in iommufd_cdev_attach_ioas_hwpt() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +In preparation to implement auto domains have the attach function +return the errno it got during domain attach instead of a bool. + +-EINVAL is tracked to track domain incompatibilities, and decide whether +to create a new IOMMU domain. + +Signed-off-by: Joao Martins +Reviewed-by: Cédric Le Goater +Reviewed-by: Eric Auger +Reviewed-by: Zhenzhong Duan +--- + hw/vfio/iommufd.c | 9 +++++---- + 1 file changed, 5 insertions(+), 4 deletions(-) + +diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c +index d5b923ca83..5e7788ed59 100644 +--- a/hw/vfio/iommufd.c ++++ b/hw/vfio/iommufd.c +@@ -200,11 +200,12 @@ static int iommufd_cdev_attach_ioas_hwpt(VFIODevice *vbasedev, uint32_t id, + error_setg_errno(errp, errno, + "[iommufd=%d] error attach %s (%d) to id=%d", + iommufd, vbasedev->name, vbasedev->fd, id); +- } else { +- trace_iommufd_cdev_attach_ioas_hwpt(iommufd, vbasedev->name, +- vbasedev->fd, id); ++ return -errno; + } +- return ret; ++ ++ trace_iommufd_cdev_attach_ioas_hwpt(iommufd, vbasedev->name, ++ vbasedev->fd, id); ++ return 0; + } + + static int iommufd_cdev_detach_ioas_hwpt(VFIODevice *vbasedev, Error **errp) +-- +2.41.0.windows.1 + diff --git a/vfio-iommufd-container-Invoke-HostIOMMUDevice-realiz.patch b/vfio-iommufd-container-Invoke-HostIOMMUDevice-realiz.patch new file mode 100644 index 0000000000000000000000000000000000000000..25878555b0dccbf717ae3b24339f72beed3b474c --- /dev/null +++ b/vfio-iommufd-container-Invoke-HostIOMMUDevice-realiz.patch @@ -0,0 +1,141 @@ +From 2276a3a175576a63da6abd5ccb309dd1cdbc4021 Mon Sep 17 00:00:00 2001 +From: Joao Martins +Date: Mon, 22 Jul 2024 
22:13:21 +0100 +Subject: [PATCH] vfio/{iommufd, container}: Invoke HostIOMMUDevice::realize() + during attach_device() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Move the HostIOMMUDevice::realize() to be invoked during the attach of the device +before we allocate IOMMUFD hardware pagetable objects (HWPT). This allows the use +of the hw_caps obtained by IOMMU_GET_HW_INFO that essentially tell if the IOMMU +behind the device supports dirty tracking. + +Note: The HostIOMMUDevice data from legacy backend is static and doesn't +need any information from the (type1-iommu) backend to be initialized. +In contrast however, the IOMMUFD HostIOMMUDevice data requires the +iommufd FD to be connected and having a devid to be able to successfully +GET_HW_INFO. This means vfio_device_hiod_realize() is called in +different places within the backend .attach_device() implementation. + +Suggested-by: Cédric Le Goater +Signed-off-by: Joao Martins +Reviewed-by: Zhenzhong Duan +Reviewed-by: Cédric Le Goater +[ clg: Fixed error handling in iommufd_cdev_attach() ] +Signed-off-by: Cédric Le Goater +Reviewed-by: Eric Auger +--- + hw/vfio/common.c | 19 +++++++------------ + hw/vfio/container.c | 4 ++++ + hw/vfio/helpers.c | 11 +++++++++++ + hw/vfio/iommufd.c | 11 +++++++++++ + include/hw/vfio/vfio-common.h | 1 + + 5 files changed, 34 insertions(+), 12 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index ceb1da0b94..65e1c9f810 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -1659,22 +1659,17 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev, + + assert(ops); + +- ret = ops->attach_device(name, vbasedev, as, errp); +- if (ret) { +- return ret; +- } +- +- if (vbasedev->mdev) { +- return true; ++ if (!vbasedev->mdev) { ++ hiod = HOST_IOMMU_DEVICE(object_new(ops->hiod_typename)); ++ vbasedev->hiod = hiod; + } + +- hiod = HOST_IOMMU_DEVICE(object_new(ops->hiod_typename)); +- if 
(!HOST_IOMMU_DEVICE_GET_CLASS(hiod)->realize(hiod, vbasedev, errp)) { ++ ret = ops->attach_device(name, vbasedev, as, errp); ++ if (ret) { + object_unref(hiod); +- ops->detach_device(vbasedev); +- return -1; ++ vbasedev->hiod = NULL; ++ return ret; + } +- vbasedev->hiod = hiod; + + return 0; + } +diff --git a/hw/vfio/container.c b/hw/vfio/container.c +index 30a62348d3..64eacfd912 100644 +--- a/hw/vfio/container.c ++++ b/hw/vfio/container.c +@@ -1030,6 +1030,10 @@ static int vfio_legacy_attach_device(const char *name, VFIODevice *vbasedev, + + trace_vfio_attach_device(vbasedev->name, groupid); + ++ if (!vfio_device_hiod_realize(vbasedev, errp)) { ++ return -EINVAL; ++ } ++ + group = vfio_get_group(groupid, as, errp); + if (!group) { + return -ENOENT; +diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c +index 37bc383c69..1f3bfed917 100644 +--- a/hw/vfio/helpers.c ++++ b/hw/vfio/helpers.c +@@ -694,3 +694,14 @@ bool vfio_device_is_mdev(VFIODevice *vbasedev) + subsys = realpath(tmp, NULL); + return subsys && (strcmp(subsys, "/sys/bus/mdev") == 0); + } ++ ++bool vfio_device_hiod_realize(VFIODevice *vbasedev, Error **errp) ++{ ++ HostIOMMUDevice *hiod = vbasedev->hiod; ++ ++ if (!hiod) { ++ return true; ++ } ++ ++ return HOST_IOMMU_DEVICE_GET_CLASS(hiod)->realize(hiod, vbasedev, errp); ++} +diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c +index d9088705de..8fd6826826 100644 +--- a/hw/vfio/iommufd.c ++++ b/hw/vfio/iommufd.c +@@ -424,6 +424,17 @@ static int iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, + + space = vfio_get_address_space(as); + ++ /* ++ * The HostIOMMUDevice data from legacy backend is static and doesn't need ++ * any information from the (type1-iommu) backend to be initialized. In ++ * contrast however, the IOMMUFD HostIOMMUDevice data requires the iommufd ++ * FD to be connected and having a devid to be able to successfully call ++ * iommufd_backend_get_device_info(). 
++ */ ++ if (!vfio_device_hiod_realize(vbasedev, errp)) { ++ goto err_alloc_ioas; ++ } ++ + /* try to attach to an existing container in this space */ + QLIST_FOREACH(bcontainer, &space->containers, next) { + container = container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index 2093ed2e91..63da291456 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -230,6 +230,7 @@ void vfio_region_finalize(VFIORegion *region); + void vfio_reset_handler(void *opaque); + struct vfio_device_info *vfio_get_device_info(int fd); + bool vfio_device_is_mdev(VFIODevice *vbasedev); ++bool vfio_device_hiod_realize(VFIODevice *vbasedev, Error **errp); + int vfio_attach_device(char *name, VFIODevice *vbasedev, + AddressSpace *as, Error **errp); + void vfio_detach_device(VFIODevice *vbasedev); +-- +2.41.0.windows.1 + diff --git a/vfio-iommufd-container-Remove-caps-aw_bits.patch b/vfio-iommufd-container-Remove-caps-aw_bits.patch new file mode 100644 index 0000000000000000000000000000000000000000..0bdf25db7def106538f85ddc740465b90093ed1d --- /dev/null +++ b/vfio-iommufd-container-Remove-caps-aw_bits.patch @@ -0,0 +1,104 @@ +From 7d3634d73af1f53549eba4b3d50bb8f9f49a5243 Mon Sep 17 00:00:00 2001 +From: Joao Martins +Date: Mon, 22 Jul 2024 22:13:19 +0100 +Subject: [PATCH] vfio/{iommufd,container}: Remove caps::aw_bits +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Remove caps::aw_bits which requires the bcontainer::iova_ranges being +initialized after device is actually attached. Instead defer that to +.get_cap() and call vfio_device_get_aw_bits() directly. + +This is in preparation for HostIOMMUDevice::realize() being called early +during attach_device(). 
+ +Suggested-by: Zhenzhong Duan +Signed-off-by: Joao Martins +Reviewed-by: Cédric Le Goater +Reviewed-by: Eric Auger +--- + backends/iommufd.c | 3 ++- + hw/vfio/container.c | 5 +---- + hw/vfio/iommufd.c | 1 - + include/sysemu/host_iommu_device.h | 3 --- + 4 files changed, 3 insertions(+), 9 deletions(-) + +diff --git a/backends/iommufd.c b/backends/iommufd.c +index 0d995d7563..4aebf54765 100644 +--- a/backends/iommufd.c ++++ b/backends/iommufd.c +@@ -19,6 +19,7 @@ + #include "qemu/error-report.h" + #include "monitor/monitor.h" + #include "trace.h" ++#include "hw/vfio/vfio-common.h" + #include + #include + +@@ -285,7 +286,7 @@ static int hiod_iommufd_get_cap(HostIOMMUDevice *hiod, int cap, Error **errp) + case HOST_IOMMU_DEVICE_CAP_IOMMU_TYPE: + return caps->type; + case HOST_IOMMU_DEVICE_CAP_AW_BITS: +- return caps->aw_bits; ++ return vfio_device_get_aw_bits(hiod->agent); + default: + error_setg(errp, "%s: unsupported capability %x", hiod->name, cap); + return -EINVAL; +diff --git a/hw/vfio/container.c b/hw/vfio/container.c +index 8a5a112b6b..30a62348d3 100644 +--- a/hw/vfio/container.c ++++ b/hw/vfio/container.c +@@ -1258,7 +1258,6 @@ static bool hiod_legacy_vfio_realize(HostIOMMUDevice *hiod, void *opaque, + VFIODevice *vdev = opaque; + + hiod->name = g_strdup(vdev->name); +- hiod->caps.aw_bits = vfio_device_get_aw_bits(vdev); + hiod->agent = opaque; + + return true; +@@ -1267,11 +1266,9 @@ static bool hiod_legacy_vfio_realize(HostIOMMUDevice *hiod, void *opaque, + static int hiod_legacy_vfio_get_cap(HostIOMMUDevice *hiod, int cap, + Error **errp) + { +- HostIOMMUDeviceCaps *caps = &hiod->caps; +- + switch (cap) { + case HOST_IOMMU_DEVICE_CAP_AW_BITS: +- return caps->aw_bits; ++ return vfio_device_get_aw_bits(hiod->agent); + default: + error_setg(errp, "%s: unsupported capability %x", hiod->name, cap); + return -EINVAL; +diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c +index 7a069ca576..06e6a400be 100644 +--- a/hw/vfio/iommufd.c ++++ b/hw/vfio/iommufd.c +@@ 
-745,7 +745,6 @@ static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, + + hiod->name = g_strdup(vdev->name); + caps->type = type; +- caps->aw_bits = vfio_device_get_aw_bits(vdev); + + return true; + } +diff --git a/include/sysemu/host_iommu_device.h b/include/sysemu/host_iommu_device.h +index 3e5f058e7b..f586908945 100644 +--- a/include/sysemu/host_iommu_device.h ++++ b/include/sysemu/host_iommu_device.h +@@ -19,12 +19,9 @@ + * struct HostIOMMUDeviceCaps - Define host IOMMU device capabilities. + * + * @type: host platform IOMMU type. +- * +- * @aw_bits: host IOMMU address width. 0xff if no limitation. + */ + typedef struct HostIOMMUDeviceCaps { + uint32_t type; +- uint8_t aw_bits; + } HostIOMMUDeviceCaps; + + #define TYPE_HOST_IOMMU_DEVICE "host-iommu-device" +-- +2.41.0.windows.1 + diff --git a/vfio-migration-Don-t-block-migration-device-dirty-tr.patch b/vfio-migration-Don-t-block-migration-device-dirty-tr.patch new file mode 100644 index 0000000000000000000000000000000000000000..7010f82afd0652521c7d16681d1e8887bd999f1f --- /dev/null +++ b/vfio-migration-Don-t-block-migration-device-dirty-tr.patch @@ -0,0 +1,62 @@ +From 6eab0b4a0c79d53250da601da25e2813177d44fe Mon Sep 17 00:00:00 2001 +From: Joao Martins +Date: Mon, 22 Jul 2024 22:13:25 +0100 +Subject: [PATCH] vfio/migration: Don't block migration device dirty tracking + is unsupported +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +By default VFIO migration is set to auto, which will support live +migration if the migration capability is set *and* also dirty page +tracking is supported. + +For testing purposes one can force enable without dirty page tracking +via enable-migration=on, but that option is generally left for testing +purposes. 
+ +So starting with IOMMU dirty tracking it can use to accommodate the lack of +VF dirty page tracking allowing us to minimize the VF requirements for +migration and thus enabling migration by default for those too. + +While at it change the error messages to mention IOMMU dirty tracking as +well. + +Signed-off-by: Joao Martins +Reviewed-by: Zhenzhong Duan +Reviewed-by: Eric Auger +[ clg: - spelling in commit log ] +Signed-off-by: Cédric Le Goater +--- + hw/vfio/migration.c | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c +index 28d422b39f..db128204af 100644 +--- a/hw/vfio/migration.c ++++ b/hw/vfio/migration.c +@@ -945,16 +945,16 @@ bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp) + return !vfio_block_migration(vbasedev, err, errp); + } + +- if (!vbasedev->dirty_pages_supported) { ++ if (!vbasedev->dirty_pages_supported && !vbasedev->iommu_dirty_tracking) { + if (vbasedev->enable_migration == ON_OFF_AUTO_AUTO) { + error_setg(&err, +- "%s: VFIO device doesn't support device dirty tracking", +- vbasedev->name); ++ "%s: VFIO device doesn't support device and " ++ "IOMMU dirty tracking", vbasedev->name); + goto add_blocker; + } + +- warn_report("%s: VFIO device doesn't support device dirty tracking", +- vbasedev->name); ++ warn_report("%s: VFIO device doesn't support device and " ++ "IOMMU dirty tracking", vbasedev->name); + } + + ret = vfio_block_multiple_devices_migration(vbasedev, errp); +-- +2.41.0.windows.1 + diff --git a/vfio-pci-Allow-the-selection-of-a-given-iommu-backen.patch b/vfio-pci-Allow-the-selection-of-a-given-iommu-backen.patch new file mode 100644 index 0000000000000000000000000000000000000000..c81907b10fa105e153040c1e6aaf08ae540479e9 --- /dev/null +++ b/vfio-pci-Allow-the-selection-of-a-given-iommu-backen.patch @@ -0,0 +1,72 @@ +From 6576af91f2621c24de4a8bbfa2c6681a16a5d043 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Sat, 11 Jan 2025 10:52:46 +0800 
+Subject: [PATCH] vfio/pci: Allow the selection of a given iommu backend +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Now we support two types of iommu backends, let's add the capability +to select one of them. This depends on whether an iommufd object has +been linked with the vfio-pci device: + +If the user wants to use the legacy backend, it shall not +link the vfio-pci device with any iommufd object: + + -device vfio-pci,host=0000:02:00.0 + +This is called the legacy mode/backend. + +If the user wants to use the iommufd backend (/dev/iommu) it +shall pass an iommufd object id in the vfio-pci device options: + + -object iommufd,id=iommufd0 + -device vfio-pci,host=0000:02:00.0,iommufd=iommufd0 + +Suggested-by: Alex Williamson +Signed-off-by: Eric Auger +Signed-off-by: Yi Liu +Signed-off-by: Zhenzhong Duan +Reviewed-by: Cédric Le Goater +Tested-by: Eric Auger +Tested-by: Nicolin Chen +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhou Wang +--- + hw/vfio/pci.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index d00c3472c7..c5984b0598 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -19,6 +19,7 @@ + */ + + #include "qemu/osdep.h" ++#include CONFIG_DEVICES /* CONFIG_IOMMUFD */ + #include + #include + +@@ -42,6 +43,7 @@ + #include "qapi/error.h" + #include "migration/blocker.h" + #include "migration/qemu-file.h" ++#include "sysemu/iommufd.h" + + #define TYPE_VFIO_PCI_NOHOTPLUG "vfio-pci-nohotplug" + +@@ -3386,6 +3388,10 @@ static Property vfio_pci_dev_properties[] = { + * DEFINE_PROP_STRING("vfiofd", VFIOPCIDevice, vfiofd_name), + * DEFINE_PROP_STRING("vfiogroupfd, VFIOPCIDevice, vfiogroupfd_name), + */ ++#ifdef CONFIG_IOMMUFD ++ DEFINE_PROP_LINK("iommufd", VFIOPCIDevice, vbasedev.iommufd, ++ TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *), ++#endif + DEFINE_PROP_END_OF_LIST(), + }; + +-- +2.41.0.windows.1 + diff --git a/vfio-pci-Extract-mdev-check-into-an-helper.patch 
b/vfio-pci-Extract-mdev-check-into-an-helper.patch new file mode 100644 index 0000000000000000000000000000000000000000..4c3c1fa684947ff5b77e235b33e7067c0a457dee --- /dev/null +++ b/vfio-pci-Extract-mdev-check-into-an-helper.patch @@ -0,0 +1,103 @@ +From 92da638c3a97679ab4d9f497ae5c7bf652e7bf99 Mon Sep 17 00:00:00 2001 +From: Joao Martins +Date: Fri, 19 Jul 2024 13:04:49 +0100 +Subject: [PATCH] vfio/pci: Extract mdev check into an helper +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +In preparation to skip initialization of the HostIOMMUDevice for mdev, +extract the checks that validate if a device is an mdev into helpers. + +A vfio_device_is_mdev() is created, and subsystems consult VFIODevice::mdev +to check if it's mdev or not. + +Signed-off-by: Joao Martins +Reviewed-by: Cédric Le Goater +Reviewed-by: Zhenzhong Duan +Reviewed-by: Eric Auger +--- + hw/vfio/helpers.c | 14 ++++++++++++++ + hw/vfio/pci.c | 12 +++--------- + include/hw/vfio/vfio-common.h | 2 ++ + 3 files changed, 19 insertions(+), 9 deletions(-) + +diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c +index 35b8e42304..37bc383c69 100644 +--- a/hw/vfio/helpers.c ++++ b/hw/vfio/helpers.c +@@ -680,3 +680,17 @@ int vfio_device_get_aw_bits(VFIODevice *vdev) + + return HOST_IOMMU_DEVICE_CAP_AW_BITS_MAX; + } ++ ++bool vfio_device_is_mdev(VFIODevice *vbasedev) ++{ ++ g_autofree char *subsys = NULL; ++ g_autofree char *tmp = NULL; ++ ++ if (!vbasedev->sysfsdev) { ++ return false; ++ } ++ ++ tmp = g_strdup_printf("%s/subsystem", vbasedev->sysfsdev); ++ subsys = realpath(tmp, NULL); ++ return subsys && (strcmp(subsys, "/sys/bus/mdev") == 0); ++} +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index 675a608b9c..de040e73ca 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -2942,10 +2942,8 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) + { + VFIOPCIDevice *vdev = VFIO_PCI(pdev); + VFIODevice *vbasedev = &vdev->vbasedev; +- char *tmp, *subsys; + Error *err 
= NULL; + int i, ret; +- bool is_mdev; + char uuid[UUID_STR_LEN]; + char *name; + +@@ -2976,15 +2974,11 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) + * stays in sync with the active working set of the guest driver. Prevent + * the x-balloon-allowed option unless this is minimally an mdev device. + */ +- tmp = g_strdup_printf("%s/subsystem", vbasedev->sysfsdev); +- subsys = realpath(tmp, NULL); +- g_free(tmp); +- is_mdev = subsys && (strcmp(subsys, "/sys/bus/mdev") == 0); +- free(subsys); ++ vbasedev->mdev = vfio_device_is_mdev(vbasedev); + +- trace_vfio_mdev(vbasedev->name, is_mdev); ++ trace_vfio_mdev(vbasedev->name, vbasedev->mdev); + +- if (vbasedev->ram_block_discard_allowed && !is_mdev) { ++ if (vbasedev->ram_block_discard_allowed && !vbasedev->mdev) { + error_setg(errp, "x-balloon-allowed only potentially compatible " + "with mdev devices"); + goto error; +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index d45d40c329..e49e5fabba 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -126,6 +126,7 @@ typedef struct VFIODevice { + DeviceState *dev; + int fd; + int type; ++ bool mdev; + bool reset_works; + bool needs_reset; + bool no_mmap; +@@ -219,6 +220,7 @@ void vfio_region_exit(VFIORegion *region); + void vfio_region_finalize(VFIORegion *region); + void vfio_reset_handler(void *opaque); + struct vfio_device_info *vfio_get_device_info(int fd); ++bool vfio_device_is_mdev(VFIODevice *vbasedev); + int vfio_attach_device(char *name, VFIODevice *vbasedev, + AddressSpace *as, Error **errp); + void vfio_detach_device(VFIODevice *vbasedev); +-- +2.41.0.windows.1 + diff --git a/vfio-pci-Extract-out-a-helper-vfio_pci_get_pci_hot_r.patch b/vfio-pci-Extract-out-a-helper-vfio_pci_get_pci_hot_r.patch new file mode 100644 index 0000000000000000000000000000000000000000..0cf494377109a14b21c9aa60165208062d4d6768 --- /dev/null +++ b/vfio-pci-Extract-out-a-helper-vfio_pci_get_pci_hot_r.patch @@ -0,0 
+1,131 @@ +From 0b0701478649baccf3945051822f993619bce01e Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Sat, 11 Jan 2025 10:52:43 +0800 +Subject: [PATCH] vfio/pci: Extract out a helper + vfio_pci_get_pci_hot_reset_info +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This helper will be used by both legacy and iommufd backends. + +No functional changes intended. + +Signed-off-by: Zhenzhong Duan +Reviewed-by: Cédric Le Goater +Reviewed-by: Eric Auger +Tested-by: Eric Auger +Tested-by: Nicolin Chen +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhou Wang +--- + hw/vfio/pci.c | 54 +++++++++++++++++++++++++++++++++++---------------- + hw/vfio/pci.h | 3 +++ + 2 files changed, 40 insertions(+), 17 deletions(-) + +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index c62c02f7b6..eb55e8ae88 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -2445,22 +2445,13 @@ static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name) + return (strcmp(tmp, name) == 0); + } + +-static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) ++int vfio_pci_get_pci_hot_reset_info(VFIOPCIDevice *vdev, ++ struct vfio_pci_hot_reset_info **info_p) + { +- VFIOGroup *group; + struct vfio_pci_hot_reset_info *info; +- struct vfio_pci_dependent_device *devices; +- struct vfio_pci_hot_reset *reset; +- int32_t *fds; +- int ret, i, count; +- bool multi = false; ++ int ret, count; + +- trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? 
"one" : "multi"); +- +- if (!single) { +- vfio_pci_pre_reset(vdev); +- } +- vdev->vbasedev.needs_reset = false; ++ assert(info_p && !*info_p); + + info = g_malloc0(sizeof(*info)); + info->argsz = sizeof(*info); +@@ -2468,24 +2459,53 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) + ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info); + if (ret && errno != ENOSPC) { + ret = -errno; ++ g_free(info); + if (!vdev->has_pm_reset) { + error_report("vfio: Cannot reset device %s, " + "no available reset mechanism.", vdev->vbasedev.name); + } +- goto out_single; ++ return ret; + } + + count = info->count; +- info = g_realloc(info, sizeof(*info) + (count * sizeof(*devices))); +- info->argsz = sizeof(*info) + (count * sizeof(*devices)); +- devices = &info->devices[0]; ++ info = g_realloc(info, sizeof(*info) + (count * sizeof(info->devices[0]))); ++ info->argsz = sizeof(*info) + (count * sizeof(info->devices[0])); + + ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info); + if (ret) { + ret = -errno; ++ g_free(info); + error_report("vfio: hot reset info failed: %m"); ++ return ret; ++ } ++ ++ *info_p = info; ++ return 0; ++} ++ ++static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) ++{ ++ VFIOGroup *group; ++ struct vfio_pci_hot_reset_info *info = NULL; ++ struct vfio_pci_dependent_device *devices; ++ struct vfio_pci_hot_reset *reset; ++ int32_t *fds; ++ int ret, i, count; ++ bool multi = false; ++ ++ trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? 
"one" : "multi"); ++ ++ if (!single) { ++ vfio_pci_pre_reset(vdev); ++ } ++ vdev->vbasedev.needs_reset = false; ++ ++ ret = vfio_pci_get_pci_hot_reset_info(vdev, &info); ++ ++ if (ret) { + goto out_single; + } ++ devices = &info->devices[0]; + + trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name); + +diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h +index fba8737ab2..1006061afb 100644 +--- a/hw/vfio/pci.h ++++ b/hw/vfio/pci.h +@@ -218,6 +218,9 @@ void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr); + + extern const PropertyInfo qdev_prop_nv_gpudirect_clique; + ++int vfio_pci_get_pci_hot_reset_info(VFIOPCIDevice *vdev, ++ struct vfio_pci_hot_reset_info **info_p); ++ + int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp); + + int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev, +-- +2.41.0.windows.1 + diff --git a/vfio-pci-Introduce-a-vfio-pci-hot-reset-interface.patch b/vfio-pci-Introduce-a-vfio-pci-hot-reset-interface.patch new file mode 100644 index 0000000000000000000000000000000000000000..846896b5dedaed344880e8a1058d9a2a4e8e64ae --- /dev/null +++ b/vfio-pci-Introduce-a-vfio-pci-hot-reset-interface.patch @@ -0,0 +1,458 @@ +From 32beb7b360416a5f04cebac227ffdf102448d518 Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Sat, 11 Jan 2025 10:52:44 +0800 +Subject: [PATCH] vfio/pci: Introduce a vfio pci hot reset interface +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Legacy vfio pci and iommufd cdev have different process to hot reset +vfio device, expand current code to abstract out pci_hot_reset callback +for legacy vfio, this same interface will also be used by iommufd +cdev vfio device. + +Rename vfio_pci_hot_reset to vfio_legacy_pci_hot_reset and move it +into container.c. + +vfio_pci_[pre/post]_reset and vfio_pci_host_match are exported so +they could be called in legacy and iommufd pci_hot_reset callback. 
+ +Suggested-by: Cédric Le Goater +Signed-off-by: Zhenzhong Duan +Reviewed-by: Eric Auger +Tested-by: Eric Auger +Tested-by: Nicolin Chen +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhou Wang +--- + hw/vfio/container.c | 170 ++++++++++++++++++++++++++ + hw/vfio/pci.c | 168 +------------------------ + hw/vfio/pci.h | 3 + + include/hw/vfio/vfio-container-base.h | 3 + + 4 files changed, 182 insertions(+), 162 deletions(-) + +diff --git a/hw/vfio/container.c b/hw/vfio/container.c +index 4936b8f27f..e32e1b51e0 100644 +--- a/hw/vfio/container.c ++++ b/hw/vfio/container.c +@@ -35,6 +35,7 @@ + #include "qapi/error.h" + #include "migration/migration.h" + #include "sysemu/kvm.h" ++#include "pci.h" + + VFIOGroupList vfio_group_list = + QLIST_HEAD_INITIALIZER(vfio_group_list); +@@ -1035,6 +1036,174 @@ static void vfio_legacy_detach_device(VFIODevice *vbasedev) + vfio_put_group(group); + } + ++static int vfio_legacy_pci_hot_reset(VFIODevice *vbasedev, bool single) ++{ ++ VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); ++ VFIOGroup *group; ++ struct vfio_pci_hot_reset_info *info = NULL; ++ struct vfio_pci_dependent_device *devices; ++ struct vfio_pci_hot_reset *reset; ++ int32_t *fds; ++ int ret, i, count; ++ bool multi = false; ++ ++ trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? 
"one" : "multi"); ++ ++ if (!single) { ++ vfio_pci_pre_reset(vdev); ++ } ++ vdev->vbasedev.needs_reset = false; ++ ++ ret = vfio_pci_get_pci_hot_reset_info(vdev, &info); ++ ++ if (ret) { ++ goto out_single; ++ } ++ devices = &info->devices[0]; ++ ++ trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name); ++ ++ /* Verify that we have all the groups required */ ++ for (i = 0; i < info->count; i++) { ++ PCIHostDeviceAddress host; ++ VFIOPCIDevice *tmp; ++ VFIODevice *vbasedev_iter; ++ ++ host.domain = devices[i].segment; ++ host.bus = devices[i].bus; ++ host.slot = PCI_SLOT(devices[i].devfn); ++ host.function = PCI_FUNC(devices[i].devfn); ++ ++ trace_vfio_pci_hot_reset_dep_devices(host.domain, ++ host.bus, host.slot, host.function, devices[i].group_id); ++ ++ if (vfio_pci_host_match(&host, vdev->vbasedev.name)) { ++ continue; ++ } ++ ++ QLIST_FOREACH(group, &vfio_group_list, next) { ++ if (group->groupid == devices[i].group_id) { ++ break; ++ } ++ } ++ ++ if (!group) { ++ if (!vdev->has_pm_reset) { ++ error_report("vfio: Cannot reset device %s, " ++ "depends on group %d which is not owned.", ++ vdev->vbasedev.name, devices[i].group_id); ++ } ++ ret = -EPERM; ++ goto out; ++ } ++ ++ /* Prep dependent devices for reset and clear our marker. 
*/ ++ QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { ++ if (!vbasedev_iter->dev->realized || ++ vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) { ++ continue; ++ } ++ tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev); ++ if (vfio_pci_host_match(&host, tmp->vbasedev.name)) { ++ if (single) { ++ ret = -EINVAL; ++ goto out_single; ++ } ++ vfio_pci_pre_reset(tmp); ++ tmp->vbasedev.needs_reset = false; ++ multi = true; ++ break; ++ } ++ } ++ } ++ ++ if (!single && !multi) { ++ ret = -EINVAL; ++ goto out_single; ++ } ++ ++ /* Determine how many group fds need to be passed */ ++ count = 0; ++ QLIST_FOREACH(group, &vfio_group_list, next) { ++ for (i = 0; i < info->count; i++) { ++ if (group->groupid == devices[i].group_id) { ++ count++; ++ break; ++ } ++ } ++ } ++ ++ reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds))); ++ reset->argsz = sizeof(*reset) + (count * sizeof(*fds)); ++ fds = &reset->group_fds[0]; ++ ++ /* Fill in group fds */ ++ QLIST_FOREACH(group, &vfio_group_list, next) { ++ for (i = 0; i < info->count; i++) { ++ if (group->groupid == devices[i].group_id) { ++ fds[reset->count++] = group->fd; ++ break; ++ } ++ } ++ } ++ ++ /* Bus reset! */ ++ ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset); ++ g_free(reset); ++ if (ret) { ++ ret = -errno; ++ } ++ ++ trace_vfio_pci_hot_reset_result(vdev->vbasedev.name, ++ ret ? 
strerror(errno) : "Success"); ++ ++out: ++ /* Re-enable INTx on affected devices */ ++ for (i = 0; i < info->count; i++) { ++ PCIHostDeviceAddress host; ++ VFIOPCIDevice *tmp; ++ VFIODevice *vbasedev_iter; ++ ++ host.domain = devices[i].segment; ++ host.bus = devices[i].bus; ++ host.slot = PCI_SLOT(devices[i].devfn); ++ host.function = PCI_FUNC(devices[i].devfn); ++ ++ if (vfio_pci_host_match(&host, vdev->vbasedev.name)) { ++ continue; ++ } ++ ++ QLIST_FOREACH(group, &vfio_group_list, next) { ++ if (group->groupid == devices[i].group_id) { ++ break; ++ } ++ } ++ ++ if (!group) { ++ break; ++ } ++ ++ QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { ++ if (!vbasedev_iter->dev->realized || ++ vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) { ++ continue; ++ } ++ tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev); ++ if (vfio_pci_host_match(&host, tmp->vbasedev.name)) { ++ vfio_pci_post_reset(tmp); ++ break; ++ } ++ } ++ } ++out_single: ++ if (!single) { ++ vfio_pci_post_reset(vdev); ++ } ++ g_free(info); ++ ++ return ret; ++} ++ + const VFIOIOMMUOps vfio_legacy_ops = { + .dma_map = vfio_legacy_dma_map, + .dma_unmap = vfio_legacy_dma_unmap, +@@ -1042,4 +1211,5 @@ const VFIOIOMMUOps vfio_legacy_ops = { + .detach_device = vfio_legacy_detach_device, + .set_dirty_page_tracking = vfio_legacy_set_dirty_page_tracking, + .query_dirty_bitmap = vfio_legacy_query_dirty_bitmap, ++ .pci_hot_reset = vfio_legacy_pci_hot_reset, + }; +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index eb55e8ae88..d00c3472c7 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -2374,7 +2374,7 @@ static int vfio_add_capabilities(VFIOPCIDevice *vdev, Error **errp) + return 0; + } + +-static void vfio_pci_pre_reset(VFIOPCIDevice *vdev) ++void vfio_pci_pre_reset(VFIOPCIDevice *vdev) + { + PCIDevice *pdev = &vdev->pdev; + uint16_t cmd; +@@ -2411,7 +2411,7 @@ static void vfio_pci_pre_reset(VFIOPCIDevice *vdev) + vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2); + } + +-static void 
vfio_pci_post_reset(VFIOPCIDevice *vdev) ++void vfio_pci_post_reset(VFIOPCIDevice *vdev) + { + Error *err = NULL; + int nr; +@@ -2435,7 +2435,7 @@ static void vfio_pci_post_reset(VFIOPCIDevice *vdev) + vfio_quirk_reset(vdev); + } + +-static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name) ++bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name) + { + char tmp[13]; + +@@ -2485,166 +2485,10 @@ int vfio_pci_get_pci_hot_reset_info(VFIOPCIDevice *vdev, + + static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) + { +- VFIOGroup *group; +- struct vfio_pci_hot_reset_info *info = NULL; +- struct vfio_pci_dependent_device *devices; +- struct vfio_pci_hot_reset *reset; +- int32_t *fds; +- int ret, i, count; +- bool multi = false; +- +- trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi"); +- +- if (!single) { +- vfio_pci_pre_reset(vdev); +- } +- vdev->vbasedev.needs_reset = false; +- +- ret = vfio_pci_get_pci_hot_reset_info(vdev, &info); +- +- if (ret) { +- goto out_single; +- } +- devices = &info->devices[0]; +- +- trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name); +- +- /* Verify that we have all the groups required */ +- for (i = 0; i < info->count; i++) { +- PCIHostDeviceAddress host; +- VFIOPCIDevice *tmp; +- VFIODevice *vbasedev_iter; +- +- host.domain = devices[i].segment; +- host.bus = devices[i].bus; +- host.slot = PCI_SLOT(devices[i].devfn); +- host.function = PCI_FUNC(devices[i].devfn); +- +- trace_vfio_pci_hot_reset_dep_devices(host.domain, +- host.bus, host.slot, host.function, devices[i].group_id); +- +- if (vfio_pci_host_match(&host, vdev->vbasedev.name)) { +- continue; +- } +- +- QLIST_FOREACH(group, &vfio_group_list, next) { +- if (group->groupid == devices[i].group_id) { +- break; +- } +- } +- +- if (!group) { +- if (!vdev->has_pm_reset) { +- error_report("vfio: Cannot reset device %s, " +- "depends on group %d which is not owned.", +- vdev->vbasedev.name, devices[i].group_id); +- 
} +- ret = -EPERM; +- goto out; +- } +- +- /* Prep dependent devices for reset and clear our marker. */ +- QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { +- if (!vbasedev_iter->dev->realized || +- vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) { +- continue; +- } +- tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev); +- if (vfio_pci_host_match(&host, tmp->vbasedev.name)) { +- if (single) { +- ret = -EINVAL; +- goto out_single; +- } +- vfio_pci_pre_reset(tmp); +- tmp->vbasedev.needs_reset = false; +- multi = true; +- break; +- } +- } +- } +- +- if (!single && !multi) { +- ret = -EINVAL; +- goto out_single; +- } +- +- /* Determine how many group fds need to be passed */ +- count = 0; +- QLIST_FOREACH(group, &vfio_group_list, next) { +- for (i = 0; i < info->count; i++) { +- if (group->groupid == devices[i].group_id) { +- count++; +- break; +- } +- } +- } +- +- reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds))); +- reset->argsz = sizeof(*reset) + (count * sizeof(*fds)); +- fds = &reset->group_fds[0]; +- +- /* Fill in group fds */ +- QLIST_FOREACH(group, &vfio_group_list, next) { +- for (i = 0; i < info->count; i++) { +- if (group->groupid == devices[i].group_id) { +- fds[reset->count++] = group->fd; +- break; +- } +- } +- } +- +- /* Bus reset! */ +- ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset); +- g_free(reset); +- +- trace_vfio_pci_hot_reset_result(vdev->vbasedev.name, +- ret ? 
strerror(errno) : "Success"); +- +-out: +- /* Re-enable INTx on affected devices */ +- for (i = 0; i < info->count; i++) { +- PCIHostDeviceAddress host; +- VFIOPCIDevice *tmp; +- VFIODevice *vbasedev_iter; +- +- host.domain = devices[i].segment; +- host.bus = devices[i].bus; +- host.slot = PCI_SLOT(devices[i].devfn); +- host.function = PCI_FUNC(devices[i].devfn); +- +- if (vfio_pci_host_match(&host, vdev->vbasedev.name)) { +- continue; +- } +- +- QLIST_FOREACH(group, &vfio_group_list, next) { +- if (group->groupid == devices[i].group_id) { +- break; +- } +- } +- +- if (!group) { +- break; +- } +- +- QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { +- if (!vbasedev_iter->dev->realized || +- vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) { +- continue; +- } +- tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev); +- if (vfio_pci_host_match(&host, tmp->vbasedev.name)) { +- vfio_pci_post_reset(tmp); +- break; +- } +- } +- } +-out_single: +- if (!single) { +- vfio_pci_post_reset(vdev); +- } +- g_free(info); ++ VFIODevice *vbasedev = &vdev->vbasedev; ++ const VFIOIOMMUOps *ops = vbasedev->bcontainer->ops; + +- return ret; ++ return ops->pci_hot_reset(vbasedev, single); + } + + /* +diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h +index 1006061afb..6e64a2654e 100644 +--- a/hw/vfio/pci.h ++++ b/hw/vfio/pci.h +@@ -218,6 +218,9 @@ void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr); + + extern const PropertyInfo qdev_prop_nv_gpudirect_clique; + ++void vfio_pci_pre_reset(VFIOPCIDevice *vdev); ++void vfio_pci_post_reset(VFIOPCIDevice *vdev); ++bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name); + int vfio_pci_get_pci_hot_reset_info(VFIOPCIDevice *vdev, + struct vfio_pci_hot_reset_info **info_p); + +diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h +index 4b6f017c6f..45bb19c767 100644 +--- a/include/hw/vfio/vfio-container-base.h ++++ b/include/hw/vfio/vfio-container-base.h +@@ -106,6 +106,9 @@ struct 
VFIOIOMMUOps { + int (*set_dirty_page_tracking)(VFIOContainerBase *bcontainer, bool start); + int (*query_dirty_bitmap)(VFIOContainerBase *bcontainer, VFIOBitmap *vbmap, + hwaddr iova, hwaddr size); ++ /* PCI specific */ ++ int (*pci_hot_reset)(VFIODevice *vbasedev, bool single); ++ + /* SPAPR specific */ + int (*add_window)(VFIOContainerBase *bcontainer, + MemoryRegionSection *section, +-- +2.41.0.windows.1 + diff --git a/vfio-pci-Make-vfio-cdev-pre-openable-by-passing-a-fi.patch b/vfio-pci-Make-vfio-cdev-pre-openable-by-passing-a-fi.patch new file mode 100644 index 0000000000000000000000000000000000000000..09f0408a9d7de8de9f4f423dff66d2c1c7b9f61e --- /dev/null +++ b/vfio-pci-Make-vfio-cdev-pre-openable-by-passing-a-fi.patch @@ -0,0 +1,224 @@ +From 008d4e37fe67c7f81920efe862352c4b1f3cd1b0 Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Sat, 11 Jan 2025 10:52:47 +0800 +Subject: [PATCH] vfio/pci: Make vfio cdev pre-openable by passing a file + handle +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This gives management tools like libvirt a chance to open the vfio +cdev with privilege and pass FD to qemu. This way qemu never needs +to have privilege to open a VFIO or iommu cdev node. + +Together with the earlier support of pre-opening /dev/iommu device, +now we have full support of passing a vfio device to unprivileged +qemu by management tool. This mode is no more considered for the +legacy backend. So let's remove the "TODO" comment. + +Add helper functions vfio_device_set_fd() and vfio_device_get_name() +to set fd and get device name, they will also be used by other vfio +devices. + +There is no easy way to check if a device is mdev with FD passing, +so fail the x-balloon-allowed check unconditionally in this case. + +There is also no easy way to get BDF as name with FD passing, so +we fake a name by VFIO_FD[fd]. 
+ +Signed-off-by: Zhenzhong Duan +Reviewed-by: Cédric Le Goater +Tested-by: Eric Auger +Tested-by: Nicolin Chen +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhou Wang +--- + hw/vfio/helpers.c | 43 +++++++++++++++++++++++++++++++++++ + hw/vfio/iommufd.c | 12 ++++++---- + hw/vfio/pci.c | 28 +++++++++++++---------- + include/hw/vfio/vfio-common.h | 4 ++++ + 4 files changed, 71 insertions(+), 16 deletions(-) + +diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c +index 168847e7c5..3592c3d54e 100644 +--- a/hw/vfio/helpers.c ++++ b/hw/vfio/helpers.c +@@ -27,6 +27,7 @@ + #include "trace.h" + #include "qapi/error.h" + #include "qemu/error-report.h" ++#include "monitor/monitor.h" + + /* + * Common VFIO interrupt disable +@@ -609,3 +610,45 @@ bool vfio_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type) + + return ret; + } ++ ++int vfio_device_get_name(VFIODevice *vbasedev, Error **errp) ++{ ++ struct stat st; ++ ++ if (vbasedev->fd < 0) { ++ if (stat(vbasedev->sysfsdev, &st) < 0) { ++ error_setg_errno(errp, errno, "no such host device"); ++ error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->sysfsdev); ++ return -errno; ++ } ++ /* User may specify a name, e.g: VFIO platform device */ ++ if (!vbasedev->name) { ++ vbasedev->name = g_path_get_basename(vbasedev->sysfsdev); ++ } ++ } else { ++ if (!vbasedev->iommufd) { ++ error_setg(errp, "Use FD passing only with iommufd backend"); ++ return -EINVAL; ++ } ++ /* ++ * Give a name with fd so any function printing out vbasedev->name ++ * will not break. 
++ */ ++ if (!vbasedev->name) { ++ vbasedev->name = g_strdup_printf("VFIO_FD%d", vbasedev->fd); ++ } ++ } ++ ++ return 0; ++} ++ ++void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp) ++{ ++ int fd = monitor_fd_param(monitor_cur(), str, errp); ++ ++ if (fd < 0) { ++ error_prepend(errp, "Could not parse remote object fd %s:", str); ++ return; ++ } ++ vbasedev->fd = fd; ++} +diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c +index 6e53e013ef..5accd26484 100644 +--- a/hw/vfio/iommufd.c ++++ b/hw/vfio/iommufd.c +@@ -320,11 +320,15 @@ static int iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, + uint32_t ioas_id; + Error *err = NULL; + +- devfd = iommufd_cdev_getfd(vbasedev->sysfsdev, errp); +- if (devfd < 0) { +- return devfd; ++ if (vbasedev->fd < 0) { ++ devfd = iommufd_cdev_getfd(vbasedev->sysfsdev, errp); ++ if (devfd < 0) { ++ return devfd; ++ } ++ vbasedev->fd = devfd; ++ } else { ++ devfd = vbasedev->fd; + } +- vbasedev->fd = devfd; + + ret = iommufd_cdev_connect_and_bind(vbasedev, errp); + if (ret) { +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index c5984b0598..445d58c8e5 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -2944,17 +2944,19 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) + VFIODevice *vbasedev = &vdev->vbasedev; + char *tmp, *subsys; + Error *err = NULL; +- struct stat st; + int i, ret; + bool is_mdev; + char uuid[UUID_STR_LEN]; + char *name; + +- if (!vbasedev->sysfsdev) { ++ if (vbasedev->fd < 0 && !vbasedev->sysfsdev) { + if (!(~vdev->host.domain || ~vdev->host.bus || + ~vdev->host.slot || ~vdev->host.function)) { + error_setg(errp, "No provided host device"); + error_append_hint(errp, "Use -device vfio-pci,host=DDDD:BB:DD.F " ++#ifdef CONFIG_IOMMUFD ++ "or -device vfio-pci,fd=DEVICE_FD " ++#endif + "or -device vfio-pci,sysfsdev=PATH_TO_DEVICE\n"); + return; + } +@@ -2964,13 +2966,9 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) + vdev->host.slot, vdev->host.function); + } + +- if 
(stat(vbasedev->sysfsdev, &st) < 0) { +- error_setg_errno(errp, errno, "no such host device"); +- error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->sysfsdev); ++ if (vfio_device_get_name(vbasedev, errp) < 0) { + return; + } +- +- vbasedev->name = g_path_get_basename(vbasedev->sysfsdev); + vbasedev->ops = &vfio_pci_ops; + vbasedev->type = VFIO_DEVICE_TYPE_PCI; + vbasedev->dev = DEVICE(vdev); +@@ -3330,6 +3328,7 @@ static void vfio_instance_init(Object *obj) + vdev->host.bus = ~0U; + vdev->host.slot = ~0U; + vdev->host.function = ~0U; ++ vdev->vbasedev.fd = -1; + + vdev->nv_gpudirect_clique = 0xFF; + +@@ -3383,11 +3382,6 @@ static Property vfio_pci_dev_properties[] = { + qdev_prop_nv_gpudirect_clique, uint8_t), + DEFINE_PROP_OFF_AUTO_PCIBAR("x-msix-relocation", VFIOPCIDevice, msix_relo, + OFF_AUTOPCIBAR_OFF), +- /* +- * TODO - support passed fds... is this necessary? +- * DEFINE_PROP_STRING("vfiofd", VFIOPCIDevice, vfiofd_name), +- * DEFINE_PROP_STRING("vfiogroupfd, VFIOPCIDevice, vfiogroupfd_name), +- */ + #ifdef CONFIG_IOMMUFD + DEFINE_PROP_LINK("iommufd", VFIOPCIDevice, vbasedev.iommufd, + TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *), +@@ -3395,6 +3389,13 @@ static Property vfio_pci_dev_properties[] = { + DEFINE_PROP_END_OF_LIST(), + }; + ++#ifdef CONFIG_IOMMUFD ++static void vfio_pci_set_fd(Object *obj, const char *str, Error **errp) ++{ ++ vfio_device_set_fd(&VFIO_PCI(obj)->vbasedev, str, errp); ++} ++#endif ++ + static void vfio_pci_dev_class_init(ObjectClass *klass, void *data) + { + DeviceClass *dc = DEVICE_CLASS(klass); +@@ -3402,6 +3403,9 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, void *data) + + dc->reset = vfio_pci_reset; + device_class_set_props(dc, vfio_pci_dev_properties); ++#ifdef CONFIG_IOMMUFD ++ object_class_property_add_str(klass, "fd", NULL, vfio_pci_set_fd); ++#endif + dc->desc = "VFIO-based PCI device assignment"; + set_bit(DEVICE_CATEGORY_MISC, dc->categories); + pdc->realize = vfio_realize; +diff --git a/include/hw/vfio/vfio-common.h 
b/include/hw/vfio/vfio-common.h +index 9b9fd7b461..5f35f2900b 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -265,4 +265,8 @@ int vfio_devices_query_dirty_bitmap(VFIOContainerBase *bcontainer, + hwaddr size); + int vfio_get_dirty_bitmap(VFIOContainerBase *bcontainer, uint64_t iova, + uint64_t size, ram_addr_t ram_addr); ++ ++/* Returns 0 on success, or a negative errno. */ ++int vfio_device_get_name(VFIODevice *vbasedev, Error **errp); ++void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp); + #endif /* HW_VFIO_VFIO_COMMON_H */ +-- +2.41.0.windows.1 + diff --git a/vfio-pci-Move-VFIODevice-initializations-in-vfio_ins.patch b/vfio-pci-Move-VFIODevice-initializations-in-vfio_ins.patch new file mode 100644 index 0000000000000000000000000000000000000000..669232d083695bbfc2f59b2be8c7bce3b6e53151 --- /dev/null +++ b/vfio-pci-Move-VFIODevice-initializations-in-vfio_ins.patch @@ -0,0 +1,62 @@ +From 0781636a0c5652c25f81c06ba5fc289966021a33 Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Sat, 11 Jan 2025 10:52:58 +0800 +Subject: [PATCH] vfio/pci: Move VFIODevice initializations in + vfio_instance_init +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Some of the VFIODevice initializations is in vfio_realize, +move all of them in vfio_instance_init. + +No functional change intended. 
+ +Suggested-by: Cédric Le Goater +Signed-off-by: Zhenzhong Duan +Reviewed-by: Philippe Mathieu-Daudé +Tested-by: Nicolin Chen +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhou Wang +--- + hw/vfio/pci.c | 10 ++++++---- + 1 file changed, 6 insertions(+), 4 deletions(-) + +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index 445d58c8e5..87405584d7 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -2969,9 +2969,6 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) + if (vfio_device_get_name(vbasedev, errp) < 0) { + return; + } +- vbasedev->ops = &vfio_pci_ops; +- vbasedev->type = VFIO_DEVICE_TYPE_PCI; +- vbasedev->dev = DEVICE(vdev); + + /* + * Mediated devices *might* operate compatibly with discarding of RAM, but +@@ -3320,6 +3317,7 @@ static void vfio_instance_init(Object *obj) + { + PCIDevice *pci_dev = PCI_DEVICE(obj); + VFIOPCIDevice *vdev = VFIO_PCI(obj); ++ VFIODevice *vbasedev = &vdev->vbasedev; + + device_add_bootindex_property(obj, &vdev->bootindex, + "bootindex", NULL, +@@ -3328,7 +3326,11 @@ static void vfio_instance_init(Object *obj) + vdev->host.bus = ~0U; + vdev->host.slot = ~0U; + vdev->host.function = ~0U; +- vdev->vbasedev.fd = -1; ++ ++ vbasedev->type = VFIO_DEVICE_TYPE_PCI; ++ vbasedev->ops = &vfio_pci_ops; ++ vbasedev->dev = DEVICE(vdev); ++ vbasedev->fd = -1; + + vdev->nv_gpudirect_clique = 0xFF; + +-- +2.41.0.windows.1 + diff --git a/vfio-pci-Pass-HostIOMMUDevice-to-vIOMMU.patch b/vfio-pci-Pass-HostIOMMUDevice-to-vIOMMU.patch new file mode 100644 index 0000000000000000000000000000000000000000..4c91266eb915e5b7a5ef5333855844ef0aff8e18 --- /dev/null +++ b/vfio-pci-Pass-HostIOMMUDevice-to-vIOMMU.patch @@ -0,0 +1,89 @@ +From dbbf6b33d9ce5f2785972f81919be143e81f866b Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Wed, 5 Jun 2024 16:30:40 +0800 +Subject: [PATCH] vfio/pci: Pass HostIOMMUDevice to vIOMMU + +With HostIOMMUDevice passed, vIOMMU can check compatibility with host +IOMMU, call into IOMMUFD specific methods, etc. 
+ +Originally-by: Yi Liu +Signed-off-by: Nicolin Chen +Signed-off-by: Yi Sun +Signed-off-by: Zhenzhong Duan +Reviewed-by: Eric Auger +Reviewed-by: Michael S. Tsirkin +--- + hw/vfio/pci.c | 17 +++++++++++++---- + 1 file changed, 13 insertions(+), 4 deletions(-) + +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index d84a9e73a6..675a608b9c 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -3107,6 +3107,11 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) + + vfio_bars_register(vdev); + ++ if (!pci_device_set_iommu_device(pdev, vbasedev->hiod, errp)) { ++ error_prepend(errp, "Failed to set iommu_device: "); ++ goto out_teardown; ++ } ++ + ret = vfio_add_capabilities(vdev, errp); + if (ret) { + goto out_teardown; +@@ -3128,7 +3133,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) + error_setg(errp, + "cannot support IGD OpRegion feature on hotplugged " + "device"); +- goto out_teardown; ++ goto out_unset_idev; + } + + ret = vfio_get_dev_region_info(vbasedev, +@@ -3137,13 +3142,13 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) + if (ret) { + error_setg_errno(errp, -ret, + "does not support requested IGD OpRegion feature"); +- goto out_teardown; ++ goto out_unset_idev; + } + + ret = vfio_pci_igd_opregion_init(vdev, opregion, errp); + g_free(opregion); + if (ret) { +- goto out_teardown; ++ goto out_unset_idev; + } + } + +@@ -3229,6 +3234,8 @@ out_deregister: + if (vdev->intx.mmap_timer) { + timer_free(vdev->intx.mmap_timer); + } ++out_unset_idev: ++ pci_device_unset_iommu_device(pdev); + out_teardown: + vfio_teardown_msi(vdev); + vfio_bars_exit(vdev); +@@ -3257,6 +3264,7 @@ static void vfio_instance_finalize(Object *obj) + static void vfio_exitfn(PCIDevice *pdev) + { + VFIOPCIDevice *vdev = VFIO_PCI(pdev); ++ VFIODevice *vbasedev = &vdev->vbasedev; + + vfio_unregister_req_notifier(vdev); + vfio_unregister_err_notifier(vdev); +@@ -3271,7 +3279,8 @@ static void vfio_exitfn(PCIDevice *pdev) + vfio_teardown_msi(vdev); + 
vfio_pci_disable_rp_atomics(vdev); + vfio_bars_exit(vdev); +- vfio_migration_exit(&vdev->vbasedev); ++ vfio_migration_exit(vbasedev); ++ pci_device_unset_iommu_device(pdev); + } + + static void vfio_pci_reset(DeviceState *dev) +-- +2.41.0.windows.1 + diff --git a/vfio-platform-Allow-the-selection-of-a-given-iommu-b.patch b/vfio-platform-Allow-the-selection-of-a-given-iommu-b.patch new file mode 100644 index 0000000000000000000000000000000000000000..d4c036bc026d3fcaef5b3f8b5ea7ccef2c6899da --- /dev/null +++ b/vfio-platform-Allow-the-selection-of-a-given-iommu-b.patch @@ -0,0 +1,68 @@ +From 1bbc795190c3ad7c838dc57a6f7a38a779dfdd65 Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Sat, 11 Jan 2025 10:52:48 +0800 +Subject: [PATCH] vfio/platform: Allow the selection of a given iommu backend +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Now we support two types of iommu backends, let's add the capability +to select one of them. This depends on whether an iommufd object has +been linked with the vfio-platform device: + +If the user wants to use the legacy backend, it shall not +link the vfio-platform device with any iommufd object: + + -device vfio-platform,host=XXX + +This is called the legacy mode/backend. 
+ +If the user wants to use the iommufd backend (/dev/iommu) it +shall pass an iommufd object id in the vfio-platform device options: + + -object iommufd,id=iommufd0 + -device vfio-platform,host=XXX,iommufd=iommufd0 + +Suggested-by: Alex Williamson +Signed-off-by: Zhenzhong Duan +Reviewed-by: Cédric Le Goater +Reviewed-by: Eric Auger +Tested-by: Nicolin Chen +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhou Wang +--- + hw/vfio/platform.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c +index 8e3d4ac458..98ae4bc655 100644 +--- a/hw/vfio/platform.c ++++ b/hw/vfio/platform.c +@@ -15,11 +15,13 @@ + */ + + #include "qemu/osdep.h" ++#include CONFIG_DEVICES /* CONFIG_IOMMUFD */ + #include "qapi/error.h" + #include + #include + + #include "hw/vfio/vfio-platform.h" ++#include "sysemu/iommufd.h" + #include "migration/vmstate.h" + #include "qemu/error-report.h" + #include "qemu/lockable.h" +@@ -649,6 +651,10 @@ static Property vfio_platform_dev_properties[] = { + DEFINE_PROP_UINT32("mmap-timeout-ms", VFIOPlatformDevice, + mmap_timeout, 1100), + DEFINE_PROP_BOOL("x-irqfd", VFIOPlatformDevice, irqfd_allowed, true), ++#ifdef CONFIG_IOMMUFD ++ DEFINE_PROP_LINK("iommufd", VFIOPlatformDevice, vbasedev.iommufd, ++ TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *), ++#endif + DEFINE_PROP_END_OF_LIST(), + }; + +-- +2.41.0.windows.1 + diff --git a/vfio-platform-Make-vfio-cdev-pre-openable-by-passing.patch b/vfio-platform-Make-vfio-cdev-pre-openable-by-passing.patch new file mode 100644 index 0000000000000000000000000000000000000000..8de50ae3ef9687769bdf5e4ff21a2932de3f0da7 --- /dev/null +++ b/vfio-platform-Make-vfio-cdev-pre-openable-by-passing.patch @@ -0,0 +1,100 @@ +From 9a12f3f754fcebe86fe2346e62cd25d8a2d06a89 Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Sat, 11 Jan 2025 10:52:49 +0800 +Subject: [PATCH] vfio/platform: Make vfio cdev pre-openable by passing a file + handle +MIME-Version: 1.0 +Content-Type: text/plain; 
charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This gives management tools like libvirt a chance to open the vfio +cdev with privilege and pass FD to qemu. This way qemu never needs +to have privilege to open a VFIO or iommu cdev node. + +Signed-off-by: Zhenzhong Duan +Reviewed-by: Cédric Le Goater +Tested-by: Nicolin Chen +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhou Wang +--- + hw/vfio/platform.c | 32 ++++++++++++++++++++++++-------- + 1 file changed, 24 insertions(+), 8 deletions(-) + +diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c +index 98ae4bc655..a97d9c6234 100644 +--- a/hw/vfio/platform.c ++++ b/hw/vfio/platform.c +@@ -531,14 +531,13 @@ static VFIODeviceOps vfio_platform_ops = { + */ + static int vfio_base_device_init(VFIODevice *vbasedev, Error **errp) + { +- struct stat st; + int ret; + +- /* @sysfsdev takes precedence over @host */ +- if (vbasedev->sysfsdev) { ++ /* @fd takes precedence over @sysfsdev which takes precedence over @host */ ++ if (vbasedev->fd < 0 && vbasedev->sysfsdev) { + g_free(vbasedev->name); + vbasedev->name = g_path_get_basename(vbasedev->sysfsdev); +- } else { ++ } else if (vbasedev->fd < 0) { + if (!vbasedev->name || strchr(vbasedev->name, '/')) { + error_setg(errp, "wrong host device name"); + return -EINVAL; +@@ -548,10 +547,9 @@ static int vfio_base_device_init(VFIODevice *vbasedev, Error **errp) + vbasedev->name); + } + +- if (stat(vbasedev->sysfsdev, &st) < 0) { +- error_setg_errno(errp, errno, +- "failed to get the sysfs host device file status"); +- return -errno; ++ ret = vfio_device_get_name(vbasedev, errp); ++ if (ret) { ++ return ret; + } + + ret = vfio_attach_device(vbasedev->name, vbasedev, +@@ -658,6 +656,20 @@ static Property vfio_platform_dev_properties[] = { + DEFINE_PROP_END_OF_LIST(), + }; + ++static void vfio_platform_instance_init(Object *obj) ++{ ++ VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(obj); ++ ++ vdev->vbasedev.fd = -1; ++} ++ ++#ifdef CONFIG_IOMMUFD ++static void 
vfio_platform_set_fd(Object *obj, const char *str, Error **errp) ++{ ++ vfio_device_set_fd(&VFIO_PLATFORM_DEVICE(obj)->vbasedev, str, errp); ++} ++#endif ++ + static void vfio_platform_class_init(ObjectClass *klass, void *data) + { + DeviceClass *dc = DEVICE_CLASS(klass); +@@ -665,6 +677,9 @@ static void vfio_platform_class_init(ObjectClass *klass, void *data) + + dc->realize = vfio_platform_realize; + device_class_set_props(dc, vfio_platform_dev_properties); ++#ifdef CONFIG_IOMMUFD ++ object_class_property_add_str(klass, "fd", NULL, vfio_platform_set_fd); ++#endif + dc->vmsd = &vfio_platform_vmstate; + dc->desc = "VFIO-based platform device assignment"; + sbc->connect_irq_notifier = vfio_start_irqfd_injection; +@@ -677,6 +692,7 @@ static const TypeInfo vfio_platform_dev_info = { + .name = TYPE_VFIO_PLATFORM, + .parent = TYPE_SYS_BUS_DEVICE, + .instance_size = sizeof(VFIOPlatformDevice), ++ .instance_init = vfio_platform_instance_init, + .class_init = vfio_platform_class_init, + .class_size = sizeof(VFIOPlatformDeviceClass), + }; +-- +2.41.0.windows.1 + diff --git a/vfio-platform-Move-VFIODevice-initializations-in-vfi.patch b/vfio-platform-Move-VFIODevice-initializations-in-vfi.patch new file mode 100644 index 0000000000000000000000000000000000000000..ecfaef2b81ac29b92b5f3a7be725c30e52c8ef50 --- /dev/null +++ b/vfio-platform-Move-VFIODevice-initializations-in-vfi.patch @@ -0,0 +1,56 @@ +From 594a30d0a9d0d569cf264ffd7b042aa39a404383 Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Sat, 11 Jan 2025 10:52:59 +0800 +Subject: [PATCH] vfio/platform: Move VFIODevice initializations in + vfio_platform_instance_init +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Some of the VFIODevice initializations is in vfio_platform_realize, +move all of them in vfio_platform_instance_init. + +No functional change intended. 
+ +Suggested-by: Cédric Le Goater +Signed-off-by: Zhenzhong Duan +Reviewed-by: Philippe Mathieu-Daudé +Tested-by: Nicolin Chen +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhou Wang +--- + hw/vfio/platform.c | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c +index a97d9c6234..506eb8193f 100644 +--- a/hw/vfio/platform.c ++++ b/hw/vfio/platform.c +@@ -581,10 +581,6 @@ static void vfio_platform_realize(DeviceState *dev, Error **errp) + VFIODevice *vbasedev = &vdev->vbasedev; + int i, ret; + +- vbasedev->type = VFIO_DEVICE_TYPE_PLATFORM; +- vbasedev->dev = dev; +- vbasedev->ops = &vfio_platform_ops; +- + qemu_mutex_init(&vdev->intp_mutex); + + trace_vfio_platform_realize(vbasedev->sysfsdev ? +@@ -659,8 +655,12 @@ static Property vfio_platform_dev_properties[] = { + static void vfio_platform_instance_init(Object *obj) + { + VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(obj); ++ VFIODevice *vbasedev = &vdev->vbasedev; + +- vdev->vbasedev.fd = -1; ++ vbasedev->type = VFIO_DEVICE_TYPE_PLATFORM; ++ vbasedev->ops = &vfio_platform_ops; ++ vbasedev->dev = DEVICE(vdev); ++ vbasedev->fd = -1; + } + + #ifdef CONFIG_IOMMUFD +-- +2.41.0.windows.1 + diff --git a/vfio-spapr-Extend-VFIOIOMMUOps-with-a-release-handle.patch b/vfio-spapr-Extend-VFIOIOMMUOps-with-a-release-handle.patch new file mode 100644 index 0000000000000000000000000000000000000000..a622f5f3a067856210d11598a8079710f3ed6b2e --- /dev/null +++ b/vfio-spapr-Extend-VFIOIOMMUOps-with-a-release-handle.patch @@ -0,0 +1,97 @@ +From feed555b60bc36d3e704431148e302dae48b77a1 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= +Date: Tue, 19 Dec 2023 07:58:16 +0100 +Subject: [PATCH] vfio/spapr: Extend VFIOIOMMUOps with a release handler +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This allows to abstract a bit more the sPAPR IOMMU support in the +legacy IOMMU backend. 
+ +Reviewed-by: Zhenzhong Duan +Tested-by: Eric Farman +Signed-off-by: Cédric Le Goater +--- + hw/vfio/container.c | 8 ++++++-- + hw/vfio/spapr.c | 19 +++++++++++++++++++ + include/hw/vfio/vfio-container-base.h | 1 + + 3 files changed, 26 insertions(+), 2 deletions(-) + +diff --git a/hw/vfio/container.c b/hw/vfio/container.c +index e245d5a082..4c62f088b1 100644 +--- a/hw/vfio/container.c ++++ b/hw/vfio/container.c +@@ -764,7 +764,9 @@ listener_release_exit: + } + if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU || + container->iommu_type == VFIO_SPAPR_TCE_IOMMU) { +- vfio_spapr_container_deinit(container); ++ if (bcontainer->ops->release) { ++ bcontainer->ops->release(bcontainer); ++ } + } + + enable_discards_exit: +@@ -803,7 +805,9 @@ static void vfio_disconnect_container(VFIOGroup *group) + } + if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU || + container->iommu_type == VFIO_SPAPR_TCE_IOMMU) { +- vfio_spapr_container_deinit(container); ++ if (bcontainer->ops->release) { ++ bcontainer->ops->release(bcontainer); ++ } + } + } + +diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c +index 3694dfb874..697f80d11d 100644 +--- a/hw/vfio/spapr.c ++++ b/hw/vfio/spapr.c +@@ -440,6 +440,24 @@ vfio_spapr_container_del_section_window(VFIOContainerBase *bcontainer, + } + } + ++static void vfio_spapr_container_release(VFIOContainerBase *bcontainer) ++{ ++ VFIOContainer *container = container_of(bcontainer, VFIOContainer, ++ bcontainer); ++ VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer, ++ container); ++ VFIOHostDMAWindow *hostwin, *next; ++ ++ if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { ++ memory_listener_unregister(&scontainer->prereg_listener); ++ } ++ QLIST_FOREACH_SAFE(hostwin, &scontainer->hostwin_list, hostwin_next, ++ next) { ++ QLIST_REMOVE(hostwin, hostwin_next); ++ g_free(hostwin); ++ } ++} ++ + static VFIOIOMMUOps vfio_iommu_spapr_ops; + + static void setup_spapr_ops(VFIOContainerBase *bcontainer) +@@ -447,6 +465,7 @@ 
static void setup_spapr_ops(VFIOContainerBase *bcontainer) + vfio_iommu_spapr_ops = *bcontainer->ops; + vfio_iommu_spapr_ops.add_window = vfio_spapr_container_add_section_window; + vfio_iommu_spapr_ops.del_window = vfio_spapr_container_del_section_window; ++ vfio_iommu_spapr_ops.release = vfio_spapr_container_release; + bcontainer->ops = &vfio_iommu_spapr_ops; + } + +diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h +index c12ce4dfcb..b2813b0c11 100644 +--- a/include/hw/vfio/vfio-container-base.h ++++ b/include/hw/vfio/vfio-container-base.h +@@ -135,5 +135,6 @@ struct VFIOIOMMUClass { + Error **errp); + void (*del_window)(VFIOContainerBase *bcontainer, + MemoryRegionSection *section); ++ void (*release)(VFIOContainerBase *bcontainer); + }; + #endif /* HW_VFIO_VFIO_CONTAINER_BASE_H */ +-- +2.41.0.windows.1 + diff --git a/vfio-spapr-Introduce-a-sPAPR-VFIOIOMMU-QOM-interface.patch b/vfio-spapr-Introduce-a-sPAPR-VFIOIOMMU-QOM-interface.patch new file mode 100644 index 0000000000000000000000000000000000000000..443cccb28eaccc2c5e346078df2dbc53aaa75730 --- /dev/null +++ b/vfio-spapr-Introduce-a-sPAPR-VFIOIOMMU-QOM-interface.patch @@ -0,0 +1,113 @@ +From 2692ea754863364731e5712ebf83208690179089 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= +Date: Tue, 19 Dec 2023 07:58:22 +0100 +Subject: [PATCH] vfio/spapr: Introduce a sPAPR VFIOIOMMU QOM interface +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Move vfio_spapr_container_setup() to a VFIOIOMMUClass::setup handler +and convert the sPAPR VFIOIOMMUOps struct to a QOM interface. The +sPAPR QOM interface inherits from the legacy QOM interface because +because both have the same basic needs. The sPAPR interface is then +extended with the handlers specific to the sPAPR IOMMU. + +This allows reuse and provides better abstraction of the backends. 
It +will be useful to avoid compiling the sPAPR IOMMU backend on targets +not supporting it. + +Reviewed-by: Zhenzhong Duan +Tested-by: Eric Farman +Signed-off-by: Cédric Le Goater +--- + hw/vfio/container.c | 24 ++++++------------------ + hw/vfio/spapr.c | 20 ++++++++++++++++++++ + include/hw/vfio/vfio-container-base.h | 1 + + 3 files changed, 27 insertions(+), 18 deletions(-) + +diff --git a/hw/vfio/container.c b/hw/vfio/container.c +index 845239eff4..e245d5a082 100644 +--- a/hw/vfio/container.c ++++ b/hw/vfio/container.c +@@ -441,6 +441,10 @@ static const VFIOIOMMUClass *vfio_get_iommu_class(int iommu_type, Error **errp) + case VFIO_TYPE1_IOMMU: + klass = object_class_by_name(TYPE_VFIO_IOMMU_LEGACY); + break; ++ case VFIO_SPAPR_TCE_v2_IOMMU: ++ case VFIO_SPAPR_TCE_IOMMU: ++ klass = object_class_by_name(TYPE_VFIO_IOMMU_SPAPR); ++ break; + default: + g_assert_not_reached(); + }; +@@ -716,25 +720,9 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, + goto free_container_exit; + } + +- switch (container->iommu_type) { +- case VFIO_TYPE1v2_IOMMU: +- case VFIO_TYPE1_IOMMU: +- case VFIO_TYPE1v2_S_IOMMU: +- ret = vfio_legacy_setup(bcontainer, errp); +- break; +- case VFIO_SPAPR_TCE_v2_IOMMU: +- case VFIO_SPAPR_TCE_IOMMU: +- { +- ret = vfio_spapr_container_init(container, errp); +- if (ret) { +- goto enable_discards_exit; +- } +- break; +- } +- default: +- g_assert_not_reached(); +- } ++ assert(bcontainer->ops->setup); + ++ ret = bcontainer->ops->setup(bcontainer, errp); + if (ret) { + goto enable_discards_exit; + } +diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c +index 5c6426e697..3694dfb874 100644 +--- a/hw/vfio/spapr.c ++++ b/hw/vfio/spapr.c +@@ -543,3 +543,23 @@ void vfio_spapr_container_deinit(VFIOContainer *container) + g_free(hostwin); + } + } ++ ++static void vfio_iommu_spapr_class_init(ObjectClass *klass, void *data) ++{ ++ VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass); ++ ++ vioc->add_window = vfio_spapr_container_add_section_window; ++ 
vioc->del_window = vfio_spapr_container_del_section_window; ++ //vioc->release = vfio_spapr_container_release; ++ //vioc->setup = vfio_spapr_container_setup; ++}; ++ ++static const TypeInfo types[] = { ++ { ++ .name = TYPE_VFIO_IOMMU_SPAPR, ++ .parent = TYPE_VFIO_IOMMU_LEGACY, ++ .class_init = vfio_iommu_spapr_class_init, ++ }, ++}; ++ ++DEFINE_TYPES(types) +diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h +index 614de90767..1085109d0c 100644 +--- a/include/hw/vfio/vfio-container-base.h ++++ b/include/hw/vfio/vfio-container-base.h +@@ -95,6 +95,7 @@ void vfio_container_destroy(VFIOContainerBase *bcontainer); + + #define TYPE_VFIO_IOMMU "vfio-iommu" + #define TYPE_VFIO_IOMMU_LEGACY TYPE_VFIO_IOMMU "-legacy" ++#define TYPE_VFIO_IOMMU_SPAPR TYPE_VFIO_IOMMU "-spapr" + + /* + * VFIOContainerBase is not an abstract QOM object because it felt +-- +2.41.0.windows.1 + diff --git a/vfio-spapr-Introduce-spapr-backend-and-target-interf.patch b/vfio-spapr-Introduce-spapr-backend-and-target-interf.patch new file mode 100644 index 0000000000000000000000000000000000000000..4b57f50c7bcc5105ca967a3bc2bc7e8926c8f96b --- /dev/null +++ b/vfio-spapr-Introduce-spapr-backend-and-target-interf.patch @@ -0,0 +1,82 @@ +From 4b0bff002d93d8785ccec8020667dc559bda4e9c Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Sat, 11 Jan 2025 10:52:33 +0800 +Subject: [PATCH] vfio/spapr: Introduce spapr backend and target interface +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Introduce an empty spapr backend which will hold spapr specific +content, currently only prereg_listener and hostwin_list. + +Also introduce two spapr specific callbacks add/del_window into +VFIOIOMMUOps. Instantiate a spapr ops with a helper setup_spapr_ops +and assign it to bcontainer->ops. 
+ +Signed-off-by: Zhenzhong Duan +Reviewed-by: Cédric Le Goater +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhou Wang +--- + hw/vfio/spapr.c | 14 ++++++++++++++ + include/hw/vfio/vfio-container-base.h | 6 ++++++ + 2 files changed, 20 insertions(+) + +diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c +index 7a50975f25..e1a6b35563 100644 +--- a/hw/vfio/spapr.c ++++ b/hw/vfio/spapr.c +@@ -24,6 +24,10 @@ + #include "qapi/error.h" + #include "trace.h" + ++typedef struct VFIOSpaprContainer { ++ VFIOContainer container; ++} VFIOSpaprContainer; ++ + static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section) + { + if (memory_region_is_iommu(section->mr)) { +@@ -421,6 +425,14 @@ void vfio_container_del_section_window(VFIOContainer *container, + } + } + ++static VFIOIOMMUOps vfio_iommu_spapr_ops; ++ ++static void setup_spapr_ops(VFIOContainerBase *bcontainer) ++{ ++ vfio_iommu_spapr_ops = *bcontainer->ops; ++ bcontainer->ops = &vfio_iommu_spapr_ops; ++} ++ + int vfio_spapr_container_init(VFIOContainer *container, Error **errp) + { + VFIOContainerBase *bcontainer = &container->bcontainer; +@@ -486,6 +498,8 @@ int vfio_spapr_container_init(VFIOContainer *container, Error **errp) + 0x1000); + } + ++ setup_spapr_ops(bcontainer); ++ + return 0; + + listener_unregister_exit: +diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h +index 9658ffb526..f62a14ac73 100644 +--- a/include/hw/vfio/vfio-container-base.h ++++ b/include/hw/vfio/vfio-container-base.h +@@ -101,5 +101,11 @@ struct VFIOIOMMUOps { + int (*set_dirty_page_tracking)(VFIOContainerBase *bcontainer, bool start); + int (*query_dirty_bitmap)(VFIOContainerBase *bcontainer, VFIOBitmap *vbmap, + hwaddr iova, hwaddr size); ++ /* SPAPR specific */ ++ int (*add_window)(VFIOContainerBase *bcontainer, ++ MemoryRegionSection *section, ++ Error **errp); ++ void (*del_window)(VFIOContainerBase *bcontainer, ++ MemoryRegionSection *section); + }; + #endif /* 
HW_VFIO_VFIO_CONTAINER_BASE_H */ +-- +2.41.0.windows.1 + diff --git a/vfio-spapr-Move-hostwin_list-into-spapr-container.patch b/vfio-spapr-Move-hostwin_list-into-spapr-container.patch new file mode 100644 index 0000000000000000000000000000000000000000..075c7321ca70465235b22295b9ca4471b6579cf6 --- /dev/null +++ b/vfio-spapr-Move-hostwin_list-into-spapr-container.patch @@ -0,0 +1,180 @@ +From 13c57d5e888fe9d6bdf68469c8e76991a789c1e6 Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Sat, 11 Jan 2025 10:52:36 +0800 +Subject: [PATCH] vfio/spapr: Move hostwin_list into spapr container +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +No functional changes intended. + +Signed-off-by: Zhenzhong Duan +Reviewed-by: Cédric Le Goater +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhou Wang +--- + hw/vfio/spapr.c | 36 +++++++++++++++++++---------------- + include/hw/vfio/vfio-common.h | 1 - + 2 files changed, 20 insertions(+), 17 deletions(-) + +diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c +index 68c3dd6c75..5c6426e697 100644 +--- a/hw/vfio/spapr.c ++++ b/hw/vfio/spapr.c +@@ -27,6 +27,7 @@ + typedef struct VFIOSpaprContainer { + VFIOContainer container; + MemoryListener prereg_listener; ++ QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; + } VFIOSpaprContainer; + + static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section) +@@ -154,12 +155,12 @@ static const MemoryListener vfio_prereg_listener = { + .region_del = vfio_prereg_listener_region_del, + }; + +-static void vfio_host_win_add(VFIOContainer *container, hwaddr min_iova, ++static void vfio_host_win_add(VFIOSpaprContainer *scontainer, hwaddr min_iova, + hwaddr max_iova, uint64_t iova_pgsizes) + { + VFIOHostDMAWindow *hostwin; + +- QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { ++ QLIST_FOREACH(hostwin, &scontainer->hostwin_list, hostwin_next) { + if (ranges_overlap(hostwin->min_iova, + hostwin->max_iova - hostwin->min_iova + 1, + 
min_iova, +@@ -173,15 +174,15 @@ static void vfio_host_win_add(VFIOContainer *container, hwaddr min_iova, + hostwin->min_iova = min_iova; + hostwin->max_iova = max_iova; + hostwin->iova_pgsizes = iova_pgsizes; +- QLIST_INSERT_HEAD(&container->hostwin_list, hostwin, hostwin_next); ++ QLIST_INSERT_HEAD(&scontainer->hostwin_list, hostwin, hostwin_next); + } + +-static int vfio_host_win_del(VFIOContainer *container, ++static int vfio_host_win_del(VFIOSpaprContainer *scontainer, + hwaddr min_iova, hwaddr max_iova) + { + VFIOHostDMAWindow *hostwin; + +- QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { ++ QLIST_FOREACH(hostwin, &scontainer->hostwin_list, hostwin_next) { + if (hostwin->min_iova == min_iova && hostwin->max_iova == max_iova) { + QLIST_REMOVE(hostwin, hostwin_next); + g_free(hostwin); +@@ -192,7 +193,7 @@ static int vfio_host_win_del(VFIOContainer *container, + return -1; + } + +-static VFIOHostDMAWindow *vfio_find_hostwin(VFIOContainer *container, ++static VFIOHostDMAWindow *vfio_find_hostwin(VFIOSpaprContainer *container, + hwaddr iova, hwaddr end) + { + VFIOHostDMAWindow *hostwin; +@@ -329,6 +330,8 @@ vfio_spapr_container_add_section_window(VFIOContainerBase *bcontainer, + { + VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); ++ VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer, ++ container); + VFIOHostDMAWindow *hostwin; + hwaddr pgsize = 0; + int ret; +@@ -344,7 +347,7 @@ vfio_spapr_container_add_section_window(VFIOContainerBase *bcontainer, + iova = section->offset_within_address_space; + end = iova + int128_get64(section->size) - 1; + +- if (!vfio_find_hostwin(container, iova, end)) { ++ if (!vfio_find_hostwin(scontainer, iova, end)) { + error_setg(errp, "Container %p can't map guest IOVA region" + " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container, + iova, end); +@@ -358,7 +361,7 @@ vfio_spapr_container_add_section_window(VFIOContainerBase *bcontainer, + } + + /* For now 
intersections are not allowed, we may relax this later */ +- QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { ++ QLIST_FOREACH(hostwin, &scontainer->hostwin_list, hostwin_next) { + if (ranges_overlap(hostwin->min_iova, + hostwin->max_iova - hostwin->min_iova + 1, + section->offset_within_address_space, +@@ -380,7 +383,7 @@ vfio_spapr_container_add_section_window(VFIOContainerBase *bcontainer, + return ret; + } + +- vfio_host_win_add(container, section->offset_within_address_space, ++ vfio_host_win_add(scontainer, section->offset_within_address_space, + section->offset_within_address_space + + int128_get64(section->size) - 1, pgsize); + #ifdef CONFIG_KVM +@@ -419,6 +422,8 @@ vfio_spapr_container_del_section_window(VFIOContainerBase *bcontainer, + { + VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); ++ VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer, ++ container); + + if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) { + return; +@@ -426,7 +431,7 @@ vfio_spapr_container_del_section_window(VFIOContainerBase *bcontainer, + + vfio_spapr_remove_window(container, + section->offset_within_address_space); +- if (vfio_host_win_del(container, ++ if (vfio_host_win_del(scontainer, + section->offset_within_address_space, + section->offset_within_address_space + + int128_get64(section->size) - 1) < 0) { +@@ -454,7 +459,7 @@ int vfio_spapr_container_init(VFIOContainer *container, Error **errp) + bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU; + int ret, fd = container->fd; + +- QLIST_INIT(&container->hostwin_list); ++ QLIST_INIT(&scontainer->hostwin_list); + + /* + * The host kernel code implementing VFIO_IOMMU_DISABLE is called +@@ -506,7 +511,7 @@ int vfio_spapr_container_init(VFIOContainer *container, Error **errp) + } else { + /* The default table uses 4K pages */ + bcontainer->pgsizes = 0x1000; +- vfio_host_win_add(container, info.dma32_window_start, ++ vfio_host_win_add(scontainer, 
info.dma32_window_start, + info.dma32_window_start + + info.dma32_window_size - 1, + 0x1000); +@@ -525,15 +530,14 @@ listener_unregister_exit: + + void vfio_spapr_container_deinit(VFIOContainer *container) + { ++ VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer, ++ container); + VFIOHostDMAWindow *hostwin, *next; + + if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { +- VFIOSpaprContainer *scontainer = container_of(container, +- VFIOSpaprContainer, +- container); + memory_listener_unregister(&scontainer->prereg_listener); + } +- QLIST_FOREACH_SAFE(hostwin, &container->hostwin_list, hostwin_next, ++ QLIST_FOREACH_SAFE(hostwin, &scontainer->hostwin_list, hostwin_next, + next) { + QLIST_REMOVE(hostwin, hostwin_next); + g_free(hostwin); +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index ba8abed75a..9e22acbfb6 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -88,7 +88,6 @@ typedef struct VFIOContainer { + int fd; /* /dev/vfio/vfio, empowered by the attached groups */ + unsigned iommu_type; + bool dirty_log_manual_clear; +- QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; + QLIST_HEAD(, VFIOGroup) group_list; + QLIST_HEAD(, VFIODMARange) dma_list; + } VFIOContainer; +-- +2.41.0.windows.1 + diff --git a/vfio-spapr-Move-prereg_listener-into-spapr-container.patch b/vfio-spapr-Move-prereg_listener-into-spapr-container.patch new file mode 100644 index 0000000000000000000000000000000000000000..70cce72253cf8ad074db53fa9866a4a6999d71ee --- /dev/null +++ b/vfio-spapr-Move-prereg_listener-into-spapr-container.patch @@ -0,0 +1,112 @@ +From 8f27e17107a923a0739c17efe5dcd11f818364af Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Sat, 11 Jan 2025 10:52:35 +0800 +Subject: [PATCH] vfio/spapr: Move prereg_listener into spapr container +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +No functional changes intended. 
+ +Signed-off-by: Zhenzhong Duan +Reviewed-by: Cédric Le Goater +Signed-off-by: Cédric Le Goater +Signed-off-by: Zhou Wang +--- + hw/vfio/spapr.c | 24 ++++++++++++++++-------- + include/hw/vfio/vfio-common.h | 1 - + 2 files changed, 16 insertions(+), 9 deletions(-) + +diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c +index 5be1911aad..68c3dd6c75 100644 +--- a/hw/vfio/spapr.c ++++ b/hw/vfio/spapr.c +@@ -26,6 +26,7 @@ + + typedef struct VFIOSpaprContainer { + VFIOContainer container; ++ MemoryListener prereg_listener; + } VFIOSpaprContainer; + + static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section) +@@ -48,8 +49,9 @@ static void *vfio_prereg_gpa_to_vaddr(MemoryRegionSection *section, hwaddr gpa) + static void vfio_prereg_listener_region_add(MemoryListener *listener, + MemoryRegionSection *section) + { +- VFIOContainer *container = container_of(listener, VFIOContainer, +- prereg_listener); ++ VFIOSpaprContainer *scontainer = container_of(listener, VFIOSpaprContainer, ++ prereg_listener); ++ VFIOContainer *container = &scontainer->container; + VFIOContainerBase *bcontainer = &container->bcontainer; + const hwaddr gpa = section->offset_within_address_space; + hwaddr end; +@@ -107,8 +109,9 @@ static void vfio_prereg_listener_region_add(MemoryListener *listener, + static void vfio_prereg_listener_region_del(MemoryListener *listener, + MemoryRegionSection *section) + { +- VFIOContainer *container = container_of(listener, VFIOContainer, +- prereg_listener); ++ VFIOSpaprContainer *scontainer = container_of(listener, VFIOSpaprContainer, ++ prereg_listener); ++ VFIOContainer *container = &scontainer->container; + const hwaddr gpa = section->offset_within_address_space; + hwaddr end; + int ret; +@@ -445,6 +448,8 @@ static void setup_spapr_ops(VFIOContainerBase *bcontainer) + int vfio_spapr_container_init(VFIOContainer *container, Error **errp) + { + VFIOContainerBase *bcontainer = &container->bcontainer; ++ VFIOSpaprContainer *scontainer = 
container_of(container, VFIOSpaprContainer, ++ container); + struct vfio_iommu_spapr_tce_info info; + bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU; + int ret, fd = container->fd; +@@ -463,9 +468,9 @@ int vfio_spapr_container_init(VFIOContainer *container, Error **errp) + return -errno; + } + } else { +- container->prereg_listener = vfio_prereg_listener; ++ scontainer->prereg_listener = vfio_prereg_listener; + +- memory_listener_register(&container->prereg_listener, ++ memory_listener_register(&scontainer->prereg_listener, + &address_space_memory); + if (bcontainer->error) { + ret = -1; +@@ -513,7 +518,7 @@ int vfio_spapr_container_init(VFIOContainer *container, Error **errp) + + listener_unregister_exit: + if (v2) { +- memory_listener_unregister(&container->prereg_listener); ++ memory_listener_unregister(&scontainer->prereg_listener); + } + return ret; + } +@@ -523,7 +528,10 @@ void vfio_spapr_container_deinit(VFIOContainer *container) + VFIOHostDMAWindow *hostwin, *next; + + if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { +- memory_listener_unregister(&container->prereg_listener); ++ VFIOSpaprContainer *scontainer = container_of(container, ++ VFIOSpaprContainer, ++ container); ++ memory_listener_unregister(&scontainer->prereg_listener); + } + QLIST_FOREACH_SAFE(hostwin, &container->hostwin_list, hostwin_next, + next) { +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index c6b1260911..ba8abed75a 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -86,7 +86,6 @@ typedef struct VFIODMARange { + typedef struct VFIOContainer { + VFIOContainerBase bcontainer; + int fd; /* /dev/vfio/vfio, empowered by the attached groups */ +- MemoryListener prereg_listener; + unsigned iommu_type; + bool dirty_log_manual_clear; + QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; +-- +2.41.0.windows.1 + diff --git a/vfio-spapr-Only-compile-sPAPR-IOMMU-support-when-nee.patch 
b/vfio-spapr-Only-compile-sPAPR-IOMMU-support-when-nee.patch new file mode 100644 index 0000000000000000000000000000000000000000..06e68137f9cbf576d449b85b0af2ec118d481fff --- /dev/null +++ b/vfio-spapr-Only-compile-sPAPR-IOMMU-support-when-nee.patch @@ -0,0 +1,36 @@ +From 017272249cc362055dc5b31cdc16b2265df39e5c Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= +Date: Tue, 19 Dec 2023 07:58:24 +0100 +Subject: [PATCH] vfio/spapr: Only compile sPAPR IOMMU support when needed +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +sPAPR IOMMU support is only needed for pseries machines. Compile out +support when CONFIG_PSERIES is not set. This saves ~7K of text. + +Reviewed-by: Zhenzhong Duan +Tested-by: Eric Farman +Signed-off-by: Cédric Le Goater +--- + hw/vfio/meson.build | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build +index bd5cc4ca79..bda2688983 100644 +--- a/hw/vfio/meson.build ++++ b/hw/vfio/meson.build +@@ -4,9 +4,9 @@ vfio_ss.add(files( + 'common.c', + 'container-base.c', + 'container.c', +- 'spapr.c', + 'migration.c', + )) ++vfio_ss.add(when: 'CONFIG_PSERIES', if_true: files('spapr.c')) + vfio_ss.add(when: 'CONFIG_IOMMUFD', if_true: files( + 'iommufd.c', + )) +-- +2.41.0.windows.1 + diff --git a/vfio-spapr-switch-to-spapr-IOMMU-BE-add-del_section_.patch b/vfio-spapr-switch-to-spapr-IOMMU-BE-add-del_section_.patch new file mode 100644 index 0000000000000000000000000000000000000000..28a9e4e22258edd951b5bf41e906fd8625ff927a --- /dev/null +++ b/vfio-spapr-switch-to-spapr-IOMMU-BE-add-del_section_.patch @@ -0,0 +1,175 @@ +From 42d02193bbe543173aa16e463015c76fa2d38ec0 Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Sat, 11 Jan 2025 10:52:34 +0800 +Subject: [PATCH] vfio/spapr: switch to spapr IOMMU BE add/del_section_window +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +No functional 
change intended. + +Signed-off-by: Zhenzhong Duan +Reviewed-by: Cédric Le Goater +Signed-off-by: Zhou Wang +--- + hw/vfio/common.c | 8 ++------ + hw/vfio/container-base.c | 21 +++++++++++++++++++++ + hw/vfio/spapr.c | 19 ++++++++++++++----- + include/hw/vfio/vfio-common.h | 5 ----- + include/hw/vfio/vfio-container-base.h | 5 +++++ + 5 files changed, 42 insertions(+), 16 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index 488aa43c9b..679fee4321 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -571,8 +571,6 @@ static void vfio_listener_region_add(MemoryListener *listener, + { + VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, + listener); +- VFIOContainer *container = container_of(bcontainer, VFIOContainer, +- bcontainer); + hwaddr iova, end; + Int128 llend, llsize; + void *vaddr; +@@ -595,7 +593,7 @@ static void vfio_listener_region_add(MemoryListener *listener, + return; + } + +- if (vfio_container_add_section_window(container, section, &err)) { ++ if (vfio_container_add_section_window(bcontainer, section, &err)) { + goto fail; + } + +@@ -738,8 +736,6 @@ static void vfio_listener_region_del(MemoryListener *listener, + { + VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, + listener); +- VFIOContainer *container = container_of(bcontainer, VFIOContainer, +- bcontainer); + hwaddr iova, end; + Int128 llend, llsize; + int ret; +@@ -818,7 +814,7 @@ static void vfio_listener_region_del(MemoryListener *listener, + + memory_region_unref(section->mr); + +- vfio_container_del_section_window(container, section); ++ vfio_container_del_section_window(bcontainer, section); + } + + typedef struct VFIODirtyRanges { +diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c +index 0177f43741..71f7274973 100644 +--- a/hw/vfio/container-base.c ++++ b/hw/vfio/container-base.c +@@ -31,6 +31,27 @@ int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, + return bcontainer->ops->dma_unmap(bcontainer, 
iova, size, iotlb); + } + ++int vfio_container_add_section_window(VFIOContainerBase *bcontainer, ++ MemoryRegionSection *section, ++ Error **errp) ++{ ++ if (!bcontainer->ops->add_window) { ++ return 0; ++ } ++ ++ return bcontainer->ops->add_window(bcontainer, section, errp); ++} ++ ++void vfio_container_del_section_window(VFIOContainerBase *bcontainer, ++ MemoryRegionSection *section) ++{ ++ if (!bcontainer->ops->del_window) { ++ return; ++ } ++ ++ return bcontainer->ops->del_window(bcontainer, section); ++} ++ + int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer, + bool start) + { +diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c +index e1a6b35563..5be1911aad 100644 +--- a/hw/vfio/spapr.c ++++ b/hw/vfio/spapr.c +@@ -319,10 +319,13 @@ static int vfio_spapr_create_window(VFIOContainer *container, + return 0; + } + +-int vfio_container_add_section_window(VFIOContainer *container, +- MemoryRegionSection *section, +- Error **errp) ++static int ++vfio_spapr_container_add_section_window(VFIOContainerBase *bcontainer, ++ MemoryRegionSection *section, ++ Error **errp) + { ++ VFIOContainer *container = container_of(bcontainer, VFIOContainer, ++ bcontainer); + VFIOHostDMAWindow *hostwin; + hwaddr pgsize = 0; + int ret; +@@ -407,9 +410,13 @@ int vfio_container_add_section_window(VFIOContainer *container, + return 0; + } + +-void vfio_container_del_section_window(VFIOContainer *container, +- MemoryRegionSection *section) ++static void ++vfio_spapr_container_del_section_window(VFIOContainerBase *bcontainer, ++ MemoryRegionSection *section) + { ++ VFIOContainer *container = container_of(bcontainer, VFIOContainer, ++ bcontainer); ++ + if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) { + return; + } +@@ -430,6 +437,8 @@ static VFIOIOMMUOps vfio_iommu_spapr_ops; + static void setup_spapr_ops(VFIOContainerBase *bcontainer) + { + vfio_iommu_spapr_ops = *bcontainer->ops; ++ vfio_iommu_spapr_ops.add_window = vfio_spapr_container_add_section_window; ++ 
vfio_iommu_spapr_ops.del_window = vfio_spapr_container_del_section_window; + bcontainer->ops = &vfio_iommu_spapr_ops; + } + +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index 9a2e0ace72..c6b1260911 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -183,11 +183,6 @@ VFIODMARange *vfio_lookup_match_range(VFIOContainer *container, + void vfio_dma_range_init_dirty_bitmap(VFIODMARange *qrange); + + /* SPAPR specific */ +-int vfio_container_add_section_window(VFIOContainer *container, +- MemoryRegionSection *section, +- Error **errp); +-void vfio_container_del_section_window(VFIOContainer *container, +- MemoryRegionSection *section); + int vfio_spapr_container_init(VFIOContainer *container, Error **errp); + void vfio_spapr_container_deinit(VFIOContainer *container); + +diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h +index f62a14ac73..4b6f017c6f 100644 +--- a/include/hw/vfio/vfio-container-base.h ++++ b/include/hw/vfio/vfio-container-base.h +@@ -75,6 +75,11 @@ int vfio_container_dma_map(VFIOContainerBase *bcontainer, + int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + IOMMUTLBEntry *iotlb); ++int vfio_container_add_section_window(VFIOContainerBase *bcontainer, ++ MemoryRegionSection *section, ++ Error **errp); ++void vfio_container_del_section_window(VFIOContainerBase *bcontainer, ++ MemoryRegionSection *section); + int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer, + bool start); + int vfio_container_query_dirty_bitmap(VFIOContainerBase *bcontainer, +-- +2.41.0.windows.1 + diff --git a/virtcca-add-kvm-isolation-when-get-tmi-version.patch b/virtcca-add-kvm-isolation-when-get-tmi-version.patch new file mode 100644 index 0000000000000000000000000000000000000000..797768e12b8f1505980d2577abe4ec62b3141203 --- /dev/null +++ b/virtcca-add-kvm-isolation-when-get-tmi-version.patch @@ -0,0 +1,72 @@ +From 
c8eba92f97b68fad3f84dde2fb6fd4409738e626 Mon Sep 17 00:00:00 2001 +From: lihuhua +Date: Sat, 22 Mar 2025 12:01:26 +0800 +Subject: [PATCH] virtcca: add kvm isolation when get tmi version. + +--- + hw/arm/boot.c | 7 ++++++- + hw/arm/virt.c | 6 +++++- + linux-headers/asm-arm64/kvm.h | 2 -- + linux-headers/linux/kvm.h | 1 + + 4 files changed, 12 insertions(+), 4 deletions(-) + +diff --git a/hw/arm/boot.c b/hw/arm/boot.c +index a3e0dbb68c..9a33601d35 100644 +--- a/hw/arm/boot.c ++++ b/hw/arm/boot.c +@@ -1163,7 +1163,12 @@ static void arm_setup_confidential_firmware_boot(ARMCPU *cpu, + const char *firmware_filename) + { + uint64_t tmi_version = 0; +- if (kvm_ioctl(kvm_state, KVM_GET_TMI_VERSION, &tmi_version) < 0) { ++ int ret = -1; ++ ++ if (kvm_enabled()) { ++ ret = kvm_ioctl(kvm_state, KVM_GET_TMI_VERSION, &tmi_version); ++ } ++ if (ret < 0) { + error_report("please check the kernel version!"); + exit(EXIT_FAILURE); + } +diff --git a/hw/arm/virt.c b/hw/arm/virt.c +index 3c31d3667e..fed2f8c4d7 100644 +--- a/hw/arm/virt.c ++++ b/hw/arm/virt.c +@@ -2057,7 +2057,11 @@ static void virt_set_memmap(VirtMachineState *vms, int pa_bits) + vms->memmap[VIRT_PCIE_MMIO] = (MemMapEntry) { 0x10000000, 0x2edf0000 }; + vms->memmap[VIRT_KAE_DEVICE] = (MemMapEntry) { 0x3edf0000, 0x00200000 }; + uint64_t tmi_version = 0; +- if (kvm_ioctl(kvm_state, KVM_GET_TMI_VERSION, &tmi_version) < 0) { ++ int ret = -1; ++ if (kvm_enabled()) { ++ ret = kvm_ioctl(kvm_state, KVM_GET_TMI_VERSION, &tmi_version); ++ } ++ if (ret < 0) { + warn_report("can not get tmi version"); + } + if (tmi_version < MIN_TMI_VERSION_FOR_UEFI_BOOTED_CVM) { +diff --git a/linux-headers/asm-arm64/kvm.h b/linux-headers/asm-arm64/kvm.h +index d69a71cbec..552fdcb18f 100644 +--- a/linux-headers/asm-arm64/kvm.h ++++ b/linux-headers/asm-arm64/kvm.h +@@ -597,6 +597,4 @@ struct kvm_cap_arm_tmm_populate_region_args { + + #endif + +-#define MIN_TMI_VERSION_FOR_UEFI_BOOTED_CVM 0x20001 +- + #endif /* __ARM_KVM_H__ */ +diff --git 
a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h +index c9ec7f862a..b94c5fd90f 100644 +--- a/linux-headers/linux/kvm.h ++++ b/linux-headers/linux/kvm.h +@@ -2430,5 +2430,6 @@ struct kvm_s390_zpci_op { + + /* get tmi version */ + #define KVM_GET_TMI_VERSION _IOR(KVMIO, 0xd2, uint64_t) ++#define MIN_TMI_VERSION_FOR_UEFI_BOOTED_CVM 0x20001 + + #endif /* __LINUX_KVM_H */ +-- +2.41.0.windows.1 + diff --git a/vmxcap-add-support-for-VMX-FRED-controls.patch b/vmxcap-add-support-for-VMX-FRED-controls.patch new file mode 100644 index 0000000000000000000000000000000000000000..347356b3b5f3b9cfb86007044c5f17be1a98ca19 --- /dev/null +++ b/vmxcap-add-support-for-VMX-FRED-controls.patch @@ -0,0 +1,66 @@ +From 3aa85bc2d9265305dde99cde12d716ffa9bcef4b Mon Sep 17 00:00:00 2001 +From: Xin Li +Date: Wed, 8 Nov 2023 23:20:10 -0800 +Subject: [PATCH] vmxcap: add support for VMX FRED controls + +commit 2e641870170e28df28c5d9914e76ea7cab141516 upstream. + +Report secondary vm-exit controls and the VMX controls used to +save/load FRED MSRs. 
+ +Intel-SIG: commit 2e641870170e vmxcap: add support for VMX FRED controls + +Tested-by: Shan Kang +Signed-off-by: Xin Li +Message-ID: <20231109072012.8078-5-xin3.li@intel.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Jason Zeng +--- + scripts/kvm/vmxcap | 12 ++++++++++++ + 1 file changed, 12 insertions(+) + +diff --git a/scripts/kvm/vmxcap b/scripts/kvm/vmxcap +index 3fb4d5b342..44898d73c2 100755 +--- a/scripts/kvm/vmxcap ++++ b/scripts/kvm/vmxcap +@@ -24,6 +24,7 @@ MSR_IA32_VMX_TRUE_EXIT_CTLS = 0x48F + MSR_IA32_VMX_TRUE_ENTRY_CTLS = 0x490 + MSR_IA32_VMX_VMFUNC = 0x491 + MSR_IA32_VMX_PROCBASED_CTLS3 = 0x492 ++MSR_IA32_VMX_EXIT_CTLS2 = 0x493 + + class msr(object): + def __init__(self): +@@ -219,11 +220,21 @@ controls = [ + 23: 'Clear IA32_BNDCFGS', + 24: 'Conceal VM exits from PT', + 25: 'Clear IA32_RTIT_CTL', ++ 31: 'Activate secondary VM-exit controls', + }, + cap_msr = MSR_IA32_VMX_EXIT_CTLS, + true_cap_msr = MSR_IA32_VMX_TRUE_EXIT_CTLS, + ), + ++ Allowed1Control( ++ name = 'secondary VM-Exit controls', ++ bits = { ++ 0: 'Save IA32 FRED MSRs', ++ 1: 'Load IA32 FRED MSRs', ++ }, ++ cap_msr = MSR_IA32_VMX_EXIT_CTLS2, ++ ), ++ + Control( + name = 'VM-Entry controls', + bits = { +@@ -237,6 +248,7 @@ controls = [ + 16: 'Load IA32_BNDCFGS', + 17: 'Conceal VM entries from PT', + 18: 'Load IA32_RTIT_CTL', ++ 23: 'Load IA32 FRED MSRs', + }, + cap_msr = MSR_IA32_VMX_ENTRY_CTLS, + true_cap_msr = MSR_IA32_VMX_TRUE_ENTRY_CTLS, +-- +2.41.0.windows.1 +