From 166ecdd78a0f5cf359c0cbb4f7a5c32beee12fd7 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:18 +0800 Subject: [PATCH 001/134] vfio: Introduce base object for VFIOContainer and targeted interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce a dumb VFIOContainerBase object and its targeted interface. This is willingly not a QOM object because we don't want it to be visible from the user interface. The VFIOContainerBase will be smoothly populated in subsequent patches as well as interfaces. No functional change intended. Signed-off-by: Eric Auger Signed-off-by: Yi Liu Signed-off-by: Yi Sun Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- include/hw/vfio/vfio-common.h | 8 ++--- include/hw/vfio/vfio-container-base.h | 50 +++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 6 deletions(-) create mode 100644 include/hw/vfio/vfio-container-base.h diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index fd9828d50b..c89b5886f2 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -30,6 +30,7 @@ #include #endif #include "sysemu/sysemu.h" +#include "hw/vfio/vfio-container-base.h" #define VFIO_MSG_PREFIX "vfio %s: " @@ -89,6 +90,7 @@ typedef struct VFIODMARange { } VFIODMARange; typedef struct VFIOContainer { + VFIOContainerBase bcontainer; VFIOAddressSpace *space; int fd; /* /dev/vfio/vfio, empowered by the attached groups */ MemoryListener listener; @@ -211,12 +213,6 @@ typedef struct VFIODisplay { } dmabuf; } VFIODisplay; -typedef struct { - unsigned long *bitmap; - hwaddr size; - hwaddr pages; -} VFIOBitmap; - VFIOAddressSpace *vfio_get_address_space(AddressSpace *as); void vfio_put_address_space(VFIOAddressSpace *space); bool vfio_devices_all_running_and_saving(VFIOContainer *container); diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h new file mode 100644 index 0000000000..1d6daaea5d --- /dev/null +++ b/include/hw/vfio/vfio-container-base.h @@ -0,0 +1,50 @@ +/* + * VFIO BASE CONTAINER + * + * Copyright (C) 2023 Intel Corporation. + * Copyright Red Hat, Inc. 2023 + * + * Authors: Yi Liu + * Eric Auger + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef HW_VFIO_VFIO_CONTAINER_BASE_H +#define HW_VFIO_VFIO_CONTAINER_BASE_H + +#include "exec/memory.h" + +typedef struct VFIODevice VFIODevice; +typedef struct VFIOIOMMUOps VFIOIOMMUOps; + +typedef struct { + unsigned long *bitmap; + hwaddr size; + hwaddr pages; +} VFIOBitmap; + +/* + * This is the base object for vfio container backends + */ +typedef struct VFIOContainerBase { + const VFIOIOMMUOps *ops; +} VFIOContainerBase; + +struct VFIOIOMMUOps { + /* basic feature */ + int (*dma_map)(VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + void *vaddr, bool readonly); + int (*dma_unmap)(VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + IOMMUTLBEntry *iotlb); + int (*attach_device)(const char *name, VFIODevice *vbasedev, + AddressSpace *as, Error **errp); + void (*detach_device)(VFIODevice *vbasedev); + /* migration feature */ + int (*set_dirty_page_tracking)(VFIOContainerBase *bcontainer, bool start); + int (*query_dirty_bitmap)(VFIOContainerBase *bcontainer, VFIOBitmap *vbmap, + hwaddr iova, hwaddr size); +}; +#endif /* HW_VFIO_VFIO_CONTAINER_BASE_H */ -- Gitee From bda13dc55ae5e16174a4a611353f4bb8a590d510 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:19 +0800 Subject: [PATCH 002/134] vfio/container: Introduce a empty VFIOIOMMUOps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This empty VFIOIOMMUOps named vfio_legacy_ops will hold all general IOMMU ops of legacy container. Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/container.c | 5 +++++ include/hw/vfio/vfio-common.h | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 77e61cfedd..8d8ed13e93 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -565,6 +565,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, Error **errp) { VFIOContainer *container; + VFIOContainerBase *bcontainer; int ret, fd; VFIOAddressSpace *space; @@ -646,6 +647,8 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, QLIST_INIT(&container->giommu_list); QLIST_INIT(&container->vrdl_list); QLIST_INIT(&container->dma_list); + bcontainer = &container->bcontainer; + bcontainer->ops = &vfio_legacy_ops; ret = vfio_init_container(container, group->fd, errp); if (ret) { @@ -1046,3 +1049,5 @@ void vfio_detach_device(VFIODevice *vbasedev) vfio_put_base_device(vbasedev); vfio_put_group(group); } + +const VFIOIOMMUOps vfio_legacy_ops; diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index c89b5886f2..3a0a6ab6ee 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -268,7 +268,7 @@ typedef QLIST_HEAD(VFIOGroupList, VFIOGroup) VFIOGroupList; typedef QLIST_HEAD(VFIODeviceList, VFIODevice) VFIODeviceList; extern VFIOGroupList vfio_group_list; extern VFIODeviceList vfio_device_list; - +extern const VFIOIOMMUOps vfio_legacy_ops; extern const MemoryListener vfio_memory_listener; extern int vfio_kvm_device_fd; -- Gitee From 775cf7c2a0dc34d7163eeea1aab6bfc6cb28be9b Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Sat, 11 Jan 2025 10:52:20 +0800 Subject: [PATCH 003/134] vfio/container: Switch to dma_map|unmap API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No functional change intended. Signed-off-by: Eric Auger Signed-off-by: Yi Liu Signed-off-by: Yi Sun Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/common.c | 45 +++++++++++++++------------ hw/vfio/container-base.c | 32 +++++++++++++++++++ hw/vfio/container.c | 22 ++++++++----- hw/vfio/meson.build | 1 + hw/vfio/trace-events | 2 +- include/hw/vfio/vfio-common.h | 4 --- include/hw/vfio/vfio-container-base.h | 7 +++++ 7 files changed, 81 insertions(+), 32 deletions(-) create mode 100644 hw/vfio/container-base.c diff --git a/hw/vfio/common.c b/hw/vfio/common.c index e08b147b3d..ea63271167 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -292,7 +292,7 @@ static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) { VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); - VFIOContainer *container = giommu->container; + VFIOContainerBase *bcontainer = &giommu->container->bcontainer; hwaddr iova = iotlb->iova + giommu->iommu_offset; void *vaddr; int ret; @@ -322,21 +322,22 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) * of vaddr will always be there, even if the memory object is * destroyed and its backing memory munmap-ed. */ - ret = vfio_dma_map(container, iova, - iotlb->addr_mask + 1, vaddr, - read_only); + ret = vfio_container_dma_map(bcontainer, iova, + iotlb->addr_mask + 1, vaddr, + read_only); if (ret) { - error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", " + error_report("vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx", %p) = %d (%s)", - container, iova, + bcontainer, iova, iotlb->addr_mask + 1, vaddr, ret, strerror(-ret)); } } else { - ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1, iotlb); + ret = vfio_container_dma_unmap(bcontainer, iova, + iotlb->addr_mask + 1, iotlb); if (ret) { - error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " + error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx") = %d (%s)", - container, iova, + bcontainer, iova, iotlb->addr_mask + 1, ret, strerror(-ret)); vfio_set_migration_error(ret); } @@ -355,9 +356,10 @@ static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl, int ret; /* Unmap with a single call. */ - ret = vfio_dma_unmap(vrdl->container, iova, size , NULL); + ret = vfio_container_dma_unmap(&vrdl->container->bcontainer, + iova, size , NULL); if (ret) { - error_report("%s: vfio_dma_unmap() failed: %s", __func__, + error_report("%s: vfio_container_dma_unmap() failed: %s", __func__, strerror(-ret)); } } @@ -385,8 +387,8 @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl, section->offset_within_address_space; vaddr = memory_region_get_ram_ptr(section->mr) + start; - ret = vfio_dma_map(vrdl->container, iova, next - start, - vaddr, section->readonly); + ret = vfio_container_dma_map(&vrdl->container->bcontainer, iova, + next - start, vaddr, section->readonly); if (ret) { /* Rollback */ vfio_ram_discard_notify_discard(rdl, section); @@ -684,10 +686,11 @@ static void vfio_listener_region_add(MemoryListener *listener, } } - ret = vfio_dma_map(container, iova, int128_get64(llsize), - vaddr, section->readonly); + ret = vfio_container_dma_map(&container->bcontainer, + iova, int128_get64(llsize), vaddr, + section->readonly); if (ret) { - error_setg(&err, "vfio_dma_map(%p, 0x%"HWADDR_PRIx", " + error_setg(&err, "vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx", %p) = %d (%s)", container, iova, int128_get64(llsize), vaddr, ret, strerror(-ret)); @@ -784,18 +787,20 @@ static void vfio_listener_region_del(MemoryListener *listener, if (int128_eq(llsize, int128_2_64())) { /* The unmap ioctl doesn't accept a full 64-bit span. */ llsize = int128_rshift(llsize, 1); - ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL); + ret = vfio_container_dma_unmap(&container->bcontainer, iova, + int128_get64(llsize), NULL); if (ret) { - error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " + error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx") = %d (%s)", container, iova, int128_get64(llsize), ret, strerror(-ret)); } iova += int128_get64(llsize); } - ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL); + ret = vfio_container_dma_unmap(&container->bcontainer, iova, + int128_get64(llsize), NULL); if (ret) { - error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " + error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx") = %d (%s)", container, iova, int128_get64(llsize), ret, strerror(-ret)); diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c new file mode 100644 index 0000000000..55d3a35fa4 --- /dev/null +++ b/hw/vfio/container-base.c @@ -0,0 +1,32 @@ +/* + * VFIO BASE CONTAINER + * + * Copyright (C) 2023 Intel Corporation. + * Copyright Red Hat, Inc. 2023 + * + * Authors: Yi Liu + * Eric Auger + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "hw/vfio/vfio-container-base.h" + +int vfio_container_dma_map(VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + void *vaddr, bool readonly) +{ + g_assert(bcontainer->ops->dma_map); + return bcontainer->ops->dma_map(bcontainer, iova, size, vaddr, readonly); +} + +int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + IOMMUTLBEntry *iotlb) +{ + g_assert(bcontainer->ops->dma_unmap); + return bcontainer->ops->dma_unmap(bcontainer, iova, size, iotlb); +} diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 8d8ed13e93..40e378e888 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -140,9 +140,11 @@ void vfio_dma_range_init_dirty_bitmap(VFIODMARange *qrange) /* * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86 */ -int vfio_dma_unmap(VFIOContainer *container, hwaddr iova, - ram_addr_t size, IOMMUTLBEntry *iotlb) +static int vfio_legacy_dma_unmap(VFIOContainerBase *bcontainer, hwaddr iova, + ram_addr_t size, IOMMUTLBEntry *iotlb) { + VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); struct vfio_iommu_type1_dma_unmap unmap = { .argsz = sizeof(unmap), .flags = 0, @@ -193,7 +195,7 @@ int vfio_dma_unmap(VFIOContainer *container, hwaddr iova, */ if (errno == EINVAL && unmap.size && !(unmap.iova + unmap.size) && container->iommu_type == VFIO_TYPE1v2_IOMMU) { - trace_vfio_dma_unmap_overflow_workaround(); + trace_vfio_legacy_dma_unmap_overflow_workaround(); unmap.size -= 1ULL << ctz64(container->pgsizes); continue; } @@ -212,9 +214,11 @@ int vfio_dma_unmap(VFIOContainer *container, hwaddr iova, return 0; } -int vfio_dma_map(VFIOContainer *container, hwaddr iova, - ram_addr_t size, void *vaddr, bool readonly) +static int vfio_legacy_dma_map(VFIOContainerBase *bcontainer, hwaddr iova, + ram_addr_t size, void *vaddr, bool readonly) { + VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); struct vfio_iommu_type1_dma_map map = { .argsz = sizeof(map), .flags = VFIO_DMA_MAP_FLAG_READ, @@ -241,7 +245,8 @@ int vfio_dma_map(VFIOContainer *container, hwaddr iova, * the VGA ROM space. */ if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 || - (errno == EBUSY && vfio_dma_unmap(container, iova, size, NULL) == 0 && + (errno == EBUSY && + vfio_legacy_dma_unmap(bcontainer, iova, size, NULL) == 0 && ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) { return 0; } @@ -1050,4 +1055,7 @@ void vfio_detach_device(VFIODevice *vbasedev) vfio_put_group(group); } -const VFIOIOMMUOps vfio_legacy_ops; +const VFIOIOMMUOps vfio_legacy_ops = { + .dma_map = vfio_legacy_dma_map, + .dma_unmap = vfio_legacy_dma_unmap, +}; diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build index b1db4c8605..32a6933280 100644 --- a/hw/vfio/meson.build +++ b/hw/vfio/meson.build @@ -2,6 +2,7 @@ vfio_ss = ss.source_set() vfio_ss.add(files( 'helpers.c', 'common.c', + 'container-base.c', 'container.c', 'spapr.c', 'migration.c', diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index 0eb2387cf2..9f7fedee98 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -116,7 +116,7 @@ vfio_region_unmap(const char *name, unsigned long offset, unsigned long end) "Re vfio_region_sparse_mmap_header(const char *name, int index, int nr_areas) "Device %s region %d: %d sparse mmap entries" vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned long end) "sparse entry %d [0x%lx - 0x%lx]" vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%08x" -vfio_dma_unmap_overflow_workaround(void) "" +vfio_legacy_dma_unmap_overflow_workaround(void) "" vfio_get_dirty_bitmap(int fd, uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start, uint64_t dirty_pages) "container fd=%d, iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64" dirty_pages=%"PRIu64 vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64 diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 3a0a6ab6ee..f94baf72db 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -221,10 +221,6 @@ bool vfio_devices_all_running_and_saving(VFIOContainer *container); VFIODMARange *vfio_lookup_match_range(VFIOContainer *container, hwaddr start_addr, hwaddr size); void vfio_dma_range_init_dirty_bitmap(VFIODMARange *qrange); -int vfio_dma_unmap(VFIOContainer *container, hwaddr iova, - ram_addr_t size, IOMMUTLBEntry *iotlb); -int vfio_dma_map(VFIOContainer *container, hwaddr iova, - ram_addr_t size, void *vaddr, bool readonly); int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start); int vfio_query_dirty_bitmap(VFIOContainer *container, VFIOBitmap *vbmap, hwaddr iova, hwaddr size); diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index 1d6daaea5d..56b033f59f 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -31,6 +31,13 @@ typedef struct VFIOContainerBase { const VFIOIOMMUOps *ops; } VFIOContainerBase; +int vfio_container_dma_map(VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + void *vaddr, bool readonly); +int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + IOMMUTLBEntry *iotlb); + struct VFIOIOMMUOps { /* basic feature */ int (*dma_map)(VFIOContainerBase *bcontainer, -- Gitee From ff4e67fa5ceb31f1dc686a661cbf37c1a81cd644 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:21 +0800 Subject: [PATCH 004/134] vfio/common: Introduce vfio_container_init/destroy helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds two helper functions vfio_container_init/destroy which will be used by both legacy and iommufd containers to do base container specific initialization and release. No functional change intended. Suggested-by: Cédric Le Goater Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/container-base.c | 9 +++++++++ hw/vfio/container.c | 4 +++- include/hw/vfio/vfio-container-base.h | 4 ++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c index 55d3a35fa4..e929435751 100644 --- a/hw/vfio/container-base.c +++ b/hw/vfio/container-base.c @@ -30,3 +30,12 @@ int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, g_assert(bcontainer->ops->dma_unmap); return bcontainer->ops->dma_unmap(bcontainer, iova, size, iotlb); } + +void vfio_container_init(VFIOContainerBase *bcontainer, const VFIOIOMMUOps *ops) +{ + bcontainer->ops = ops; +} + +void vfio_container_destroy(VFIOContainerBase *bcontainer) +{ +} diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 40e378e888..5a8c55056b 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -653,7 +653,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, QLIST_INIT(&container->vrdl_list); QLIST_INIT(&container->dma_list); bcontainer = &container->bcontainer; - bcontainer->ops = &vfio_legacy_ops; + vfio_container_init(bcontainer, &vfio_legacy_ops); ret = vfio_init_container(container, group->fd, errp); if (ret) { @@ -765,6 +765,7 @@ put_space_exit: static void vfio_disconnect_container(VFIOGroup *group) { VFIOContainer *container = group->container; + VFIOContainerBase *bcontainer = &container->bcontainer; QLIST_REMOVE(group, container_next); group->container = NULL; @@ -803,6 +804,7 @@ static void vfio_disconnect_container(VFIOGroup *group) QLIST_REMOVE(giommu, giommu_next); g_free(giommu); } + vfio_container_destroy(bcontainer); trace_vfio_disconnect_container(container->fd); close(container->fd); diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index 56b033f59f..577f52ccbc 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -38,6 +38,10 @@ int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, IOMMUTLBEntry *iotlb); +void vfio_container_init(VFIOContainerBase *bcontainer, + const VFIOIOMMUOps *ops); +void vfio_container_destroy(VFIOContainerBase *bcontainer); + struct VFIOIOMMUOps { /* basic feature */ int (*dma_map)(VFIOContainerBase *bcontainer, -- Gitee From 350f1a4d221849cc26a6d3950c128f951648c391 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Sat, 11 Jan 2025 10:52:22 +0800 Subject: [PATCH 005/134] vfio/common: Move giommu_list in base container MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the giommu_list field in the base container and store the base container in the VFIOGuestIOMMU. No functional change intended. Signed-off-by: Eric Auger Signed-off-by: Yi Liu Signed-off-by: Yi Sun Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/common.c | 17 +++++++++++------ hw/vfio/container-base.c | 9 +++++++++ hw/vfio/container.c | 8 -------- include/hw/vfio/vfio-common.h | 9 --------- include/hw/vfio/vfio-container-base.h | 9 +++++++++ 5 files changed, 29 insertions(+), 23 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index ea63271167..b8007b22c3 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -292,7 +292,7 @@ static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) { VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); - VFIOContainerBase *bcontainer = &giommu->container->bcontainer; + VFIOContainerBase *bcontainer = giommu->bcontainer; hwaddr iova = iotlb->iova + giommu->iommu_offset; void *vaddr; int ret; @@ -569,6 +569,7 @@ static void vfio_listener_region_add(MemoryListener *listener, MemoryRegionSection *section) { VFIOContainer *container = container_of(listener, VFIOContainer, listener); + VFIOContainerBase *bcontainer = &container->bcontainer; hwaddr iova, end; Int128 llend, llsize; void *vaddr; @@ -612,7 +613,7 @@ static void vfio_listener_region_add(MemoryListener *listener, giommu->iommu_mr = iommu_mr; giommu->iommu_offset = section->offset_within_address_space - section->offset_within_region; - giommu->container = container; + giommu->bcontainer = bcontainer; llend = int128_add(int128_make64(section->offset_within_region), section->size); llend = int128_sub(llend, int128_one()); @@ -647,7 +648,7 @@ static void vfio_listener_region_add(MemoryListener *listener, g_free(giommu); goto fail; } - QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next); + QLIST_INSERT_HEAD(&bcontainer->giommu_list, giommu, giommu_next); memory_region_iommu_replay(giommu->iommu_mr, &giommu->n); return; @@ -732,6 +733,7 @@ static void vfio_listener_region_del(MemoryListener *listener, MemoryRegionSection *section) { VFIOContainer *container = container_of(listener, VFIOContainer, listener); + VFIOContainerBase *bcontainer = &container->bcontainer; hwaddr iova, end; Int128 llend, llsize; int ret; @@ -744,7 +746,7 @@ static void vfio_listener_region_del(MemoryListener *listener, if (memory_region_is_iommu(section->mr)) { VFIOGuestIOMMU *giommu; - QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) { + QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) { if (MEMORY_REGION(giommu->iommu_mr) == section->mr && giommu->n.start == section->offset_within_region) { memory_region_unregister_iommu_notifier(section->mr, @@ -1211,7 +1213,9 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) vfio_giommu_dirty_notifier *gdn = container_of(n, vfio_giommu_dirty_notifier, n); VFIOGuestIOMMU *giommu = gdn->giommu; - VFIOContainer *container = giommu->container; + VFIOContainerBase *bcontainer = giommu->bcontainer; + VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); hwaddr iova = iotlb->iova + giommu->iommu_offset; ram_addr_t translated_addr; int ret = -EINVAL; @@ -1289,12 +1293,13 @@ static int vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainer *container, static int vfio_sync_dirty_bitmap(VFIOContainer *container, MemoryRegionSection *section) { + VFIOContainerBase *bcontainer = &container->bcontainer; ram_addr_t ram_addr; if (memory_region_is_iommu(section->mr)) { VFIOGuestIOMMU *giommu; - QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) { + QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) { if (MEMORY_REGION(giommu->iommu_mr) == section->mr && giommu->n.start == section->offset_within_region) { Int128 llend; diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c index e929435751..20bcb9669a 100644 --- a/hw/vfio/container-base.c +++ b/hw/vfio/container-base.c @@ -34,8 +34,17 @@ int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, void vfio_container_init(VFIOContainerBase *bcontainer, const VFIOIOMMUOps *ops) { bcontainer->ops = ops; + QLIST_INIT(&bcontainer->giommu_list); } void vfio_container_destroy(VFIOContainerBase *bcontainer) { + VFIOGuestIOMMU *giommu, *tmp; + + QLIST_FOREACH_SAFE(giommu, &bcontainer->giommu_list, giommu_next, tmp) { + memory_region_unregister_iommu_notifier( + MEMORY_REGION(giommu->iommu_mr), &giommu->n); + QLIST_REMOVE(giommu, giommu_next); + g_free(giommu); + } } diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 5a8c55056b..03791601d0 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -649,7 +649,6 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, container->dirty_pages_supported = false; container->dma_max_mappings = 0; container->iova_ranges = NULL; - QLIST_INIT(&container->giommu_list); QLIST_INIT(&container->vrdl_list); QLIST_INIT(&container->dma_list); bcontainer = &container->bcontainer; @@ -794,16 +793,9 @@ static void vfio_disconnect_container(VFIOGroup *group) if (QLIST_EMPTY(&container->group_list)) { VFIOAddressSpace *space = container->space; - VFIOGuestIOMMU *giommu, *tmp; QLIST_REMOVE(container, next); - QLIST_FOREACH_SAFE(giommu, &container->giommu_list, giommu_next, tmp) { - memory_region_unregister_iommu_notifier( - MEMORY_REGION(giommu->iommu_mr), &giommu->n); - QLIST_REMOVE(giommu, giommu_next); - g_free(giommu); - } vfio_container_destroy(bcontainer); trace_vfio_disconnect_container(container->fd); diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index f94baf72db..6f02952ff6 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -104,7 +104,6 @@ typedef struct VFIOContainer { uint64_t max_dirty_bitmap_size; unsigned long pgsizes; unsigned int dma_max_mappings; - QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; QLIST_HEAD(, VFIOGroup) group_list; QLIST_HEAD(, VFIORamDiscardListener) vrdl_list; @@ -114,14 +113,6 @@ typedef struct VFIOContainer { GList *iova_ranges; } VFIOContainer; -typedef struct VFIOGuestIOMMU { - VFIOContainer *container; - IOMMUMemoryRegion *iommu_mr; - hwaddr iommu_offset; - IOMMUNotifier n; - QLIST_ENTRY(VFIOGuestIOMMU) giommu_next; -} VFIOGuestIOMMU; - typedef struct VFIORamDiscardListener { VFIOContainer *container; MemoryRegion *mr; diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index 577f52ccbc..a11aec5755 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -29,8 +29,17 @@ typedef struct { */ typedef struct VFIOContainerBase { const VFIOIOMMUOps *ops; + QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; } VFIOContainerBase; +typedef struct VFIOGuestIOMMU { + VFIOContainerBase *bcontainer; + IOMMUMemoryRegion *iommu_mr; + hwaddr iommu_offset; + IOMMUNotifier n; + QLIST_ENTRY(VFIOGuestIOMMU) giommu_next; +} VFIOGuestIOMMU; + int vfio_container_dma_map(VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, void *vaddr, bool readonly); -- Gitee From 97979ab4d92d0006ffefb586675b6110e5b7a746 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Sat, 11 Jan 2025 10:52:23 +0800 Subject: [PATCH 006/134] vfio/container: Move space field to base container MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the space field to the base object. Also the VFIOAddressSpace now contains a list of base containers. No functional change intended. Modify hw/vfio/container.c: vfio_connect_container->shared_memory_listener_register in kvm_csv3_enabled during backporting. Signed-off-by: Eric Auger Signed-off-by: Yi Liu Signed-off-by: Yi Sun Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/ppc/spapr_pci_vfio.c | 10 +++++----- hw/vfio/common.c | 4 ++-- hw/vfio/container-base.c | 6 +++++- hw/vfio/container.c | 20 +++++++++----------- include/hw/vfio/vfio-common.h | 8 -------- include/hw/vfio/vfio-container-base.h | 9 +++++++++ 6 files changed, 30 insertions(+), 27 deletions(-) diff --git a/hw/ppc/spapr_pci_vfio.c b/hw/ppc/spapr_pci_vfio.c index f283f7e38d..d1d07bec46 100644 --- a/hw/ppc/spapr_pci_vfio.c +++ b/hw/ppc/spapr_pci_vfio.c @@ -84,27 +84,27 @@ static int vfio_eeh_container_op(VFIOContainer *container, uint32_t op) static VFIOContainer *vfio_eeh_as_container(AddressSpace *as) { VFIOAddressSpace *space = vfio_get_address_space(as); - VFIOContainer *container = NULL; + VFIOContainerBase *bcontainer = NULL; if (QLIST_EMPTY(&space->containers)) { /* No containers to act on */ goto out; } - container = QLIST_FIRST(&space->containers); + bcontainer = QLIST_FIRST(&space->containers); - if (QLIST_NEXT(container, next)) { + if (QLIST_NEXT(bcontainer, next)) { /* * We don't yet have logic to synchronize EEH state across * multiple containers */ - container = NULL; + bcontainer = NULL; goto out; } out: vfio_put_address_space(space); - return container; + return container_of(bcontainer, VFIOContainer, bcontainer); } static bool vfio_eeh_as_ok(AddressSpace *as) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index b8007b22c3..2f3f66991a 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -145,7 +145,7 @@ void vfio_unblock_multiple_devices_migration(void) bool vfio_viommu_preset(VFIODevice *vbasedev) { - return vbasedev->container->space->as != &address_space_memory; + return vbasedev->container->bcontainer.space->as != &address_space_memory; } static void vfio_set_migration_error(int err) @@ -922,7 +922,7 @@ static void vfio_dirty_tracking_init(VFIOContainer *container, dirty.container = container; memory_listener_register(&dirty.listener, - container->space->as); + container->bcontainer.space->as); *ranges = dirty.ranges; diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c index 20bcb9669a..3933391e0d 100644 --- a/hw/vfio/container-base.c +++ b/hw/vfio/container-base.c @@ -31,9 +31,11 @@ int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, return bcontainer->ops->dma_unmap(bcontainer, iova, size, iotlb); } -void vfio_container_init(VFIOContainerBase *bcontainer, const VFIOIOMMUOps *ops) +void vfio_container_init(VFIOContainerBase *bcontainer, VFIOAddressSpace *space, + const VFIOIOMMUOps *ops) { bcontainer->ops = ops; + bcontainer->space = space; QLIST_INIT(&bcontainer->giommu_list); } @@ -41,6 +43,8 @@ void vfio_container_destroy(VFIOContainerBase *bcontainer) { VFIOGuestIOMMU *giommu, *tmp; + QLIST_REMOVE(bcontainer, next); + QLIST_FOREACH_SAFE(giommu, &bcontainer->giommu_list, giommu_next, tmp) { memory_region_unregister_iommu_notifier( MEMORY_REGION(giommu->iommu_mr), &giommu->n); diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 03791601d0..b7ab0d7323 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -607,7 +607,8 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, * details once we know which type of IOMMU we are using. */ - QLIST_FOREACH(container, &space->containers, next) { + QLIST_FOREACH(bcontainer, &space->containers, next) { + container = container_of(bcontainer, VFIOContainer, bcontainer); if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) { ret = vfio_ram_block_discard_disable(container, true); if (ret) { @@ -643,7 +644,6 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, } container = g_malloc0(sizeof(*container)); - container->space = space; container->fd = fd; container->error = NULL; container->dirty_pages_supported = false; @@ -652,7 +652,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, QLIST_INIT(&container->vrdl_list); QLIST_INIT(&container->dma_list); bcontainer = &container->bcontainer; - vfio_container_init(bcontainer, &vfio_legacy_ops); + vfio_container_init(bcontainer, space, &vfio_legacy_ops); ret = vfio_init_container(container, group->fd, errp); if (ret) { @@ -708,7 +708,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, vfio_kvm_device_add_group(group); QLIST_INIT(&container->group_list); - QLIST_INSERT_HEAD(&space->containers, container, next); + QLIST_INSERT_HEAD(&space->containers, bcontainer, next); group->container = container; QLIST_INSERT_HEAD(&container->group_list, group, container_next); @@ -717,9 +717,9 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, if (kvm_csv3_enabled()) { shared_memory_listener_register(&container->listener, - container->space->as); + bcontainer->space->as); } else { - memory_listener_register(&container->listener, container->space->as); + memory_listener_register(&container->listener, bcontainer->space->as); } if (container->error) { @@ -734,7 +734,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, return 0; listener_release_exit: QLIST_REMOVE(group, container_next); - QLIST_REMOVE(container, next); + QLIST_REMOVE(bcontainer, next); vfio_kvm_device_del_group(group); if (kvm_csv3_enabled()) { shared_memory_listener_unregister(); @@ -792,9 +792,7 @@ static void vfio_disconnect_container(VFIOGroup *group) } if (QLIST_EMPTY(&container->group_list)) { - VFIOAddressSpace *space = container->space; - - QLIST_REMOVE(container, next); + VFIOAddressSpace *space = bcontainer->space; vfio_container_destroy(bcontainer); @@ -815,7 +813,7 @@ static VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp) QLIST_FOREACH(group, &vfio_group_list, next) { if (group->groupid == groupid) { /* Found it. Now is it already in the right context? */ - if (group->container->space->as == as) { + if (group->container->bcontainer.space->as == as) { return group; } else { error_setg(errp, "group %d used in multiple address spaces", diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 6f02952ff6..31c9df4b03 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -73,12 +73,6 @@ typedef struct VFIOMigration { bool initial_data_sent; } VFIOMigration; -typedef struct VFIOAddressSpace { - AddressSpace *as; - QLIST_HEAD(, VFIOContainer) containers; - QLIST_ENTRY(VFIOAddressSpace) list; -} VFIOAddressSpace; - struct VFIOGroup; typedef struct VFIODMARange { @@ -91,7 +85,6 @@ typedef struct VFIODMARange { typedef struct VFIOContainer { VFIOContainerBase bcontainer; - VFIOAddressSpace *space; int fd; /* /dev/vfio/vfio, empowered by the attached groups */ MemoryListener listener; MemoryListener prereg_listener; @@ -108,7 +101,6 @@ typedef struct VFIOContainer { QLIST_HEAD(, VFIOGroup) group_list; QLIST_HEAD(, VFIORamDiscardListener) vrdl_list; QLIST_HEAD(, VFIODMARange) dma_list; - QLIST_ENTRY(VFIOContainer) next; QLIST_HEAD(, VFIODevice) device_list; GList *iova_ranges; } VFIOContainer; diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index a11aec5755..c7cc6ec9c5 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -24,12 +24,20 @@ typedef struct { hwaddr pages; } VFIOBitmap; +typedef struct VFIOAddressSpace { + AddressSpace *as; + QLIST_HEAD(, VFIOContainerBase) containers; + QLIST_ENTRY(VFIOAddressSpace) list; +} VFIOAddressSpace; + /* * This is the base object for vfio container backends */ typedef struct VFIOContainerBase { const VFIOIOMMUOps *ops; + VFIOAddressSpace *space; QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; + QLIST_ENTRY(VFIOContainerBase) next; } VFIOContainerBase; typedef struct VFIOGuestIOMMU { @@ -48,6 +56,7 @@ int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, IOMMUTLBEntry *iotlb); void vfio_container_init(VFIOContainerBase *bcontainer, + VFIOAddressSpace *space, const VFIOIOMMUOps *ops); void vfio_container_destroy(VFIOContainerBase *bcontainer); -- Gitee From c8c17aaddeee1e5002fc4bde7245719db75d4021 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Sat, 11 Jan 2025 10:52:24 +0800 Subject: [PATCH 007/134] vfio/container: Switch to IOMMU BE set_dirty_page_tracking/query_dirty_bitmap API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit dirty_pages_supported field is also moved to the base container No functional change intended. Modify vfio_listener_log_clear during backporting. Signed-off-by: Eric Auger Signed-off-by: Yi Liu Signed-off-by: Yi Sun Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/common.c | 14 +++++++++----- hw/vfio/container-base.c | 16 ++++++++++++++++ hw/vfio/container.c | 21 ++++++++++++++------- include/hw/vfio/vfio-common.h | 5 ----- include/hw/vfio/vfio-container-base.h | 6 ++++++ 5 files changed, 45 insertions(+), 17 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 2f3f66991a..3be6cecc63 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -1079,7 +1079,8 @@ static void vfio_listener_log_global_start(MemoryListener *listener) if (vfio_devices_all_device_dirty_tracking(container)) { ret = vfio_devices_dma_logging_start(container); } else { - ret = vfio_set_dirty_page_tracking(container, true); + ret = vfio_container_set_dirty_page_tracking(&container->bcontainer, + true); } if (ret) { @@ -1097,7 +1098,8 @@ static void vfio_listener_log_global_stop(MemoryListener *listener) if (vfio_devices_all_device_dirty_tracking(container)) { vfio_devices_dma_logging_stop(container); } else { - ret = vfio_set_dirty_page_tracking(container, false); + ret = vfio_container_set_dirty_page_tracking(&container->bcontainer, + false); } if (ret) { @@ -1166,7 +1168,8 @@ int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, VFIODMARange *qrange; int ret; - if (!container->dirty_pages_supported && !all_device_dirty_tracking) { + if (!container->bcontainer.dirty_pages_supported && + !all_device_dirty_tracking) { cpu_physical_memory_set_dirty_range(ram_addr, size, tcg_enabled() ? DIRTY_CLIENTS_ALL : DIRTY_CLIENTS_NOCODE); @@ -1187,7 +1190,8 @@ int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, if (all_device_dirty_tracking) { ret = vfio_devices_query_dirty_bitmap(container, &vbmap, iova, size); } else { - ret = vfio_query_dirty_bitmap(container, &vbmap, iova, size); + ret = vfio_container_query_dirty_bitmap(&container->bcontainer, &vbmap, + iova, size); } if (ret) { @@ -1480,7 +1484,7 @@ static void vfio_listener_log_clear(MemoryListener *listener, VFIOContainer *container = container_of(listener, VFIOContainer, listener); if (vfio_listener_skipped_section(section) || - !container->dirty_pages_supported) { + !container->bcontainer.dirty_pages_supported) { return; } diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c index 3933391e0d..5d654ae172 100644 --- a/hw/vfio/container-base.c +++ b/hw/vfio/container-base.c @@ -31,11 +31,27 @@ int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, return bcontainer->ops->dma_unmap(bcontainer, iova, size, iotlb); } +int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer, + bool start) +{ + g_assert(bcontainer->ops->set_dirty_page_tracking); + return bcontainer->ops->set_dirty_page_tracking(bcontainer, start); +} + +int vfio_container_query_dirty_bitmap(VFIOContainerBase *bcontainer, + VFIOBitmap *vbmap, + hwaddr iova, hwaddr size) +{ + g_assert(bcontainer->ops->query_dirty_bitmap); + return bcontainer->ops->query_dirty_bitmap(bcontainer, vbmap, iova, size); +} + void vfio_container_init(VFIOContainerBase *bcontainer, VFIOAddressSpace *space, const VFIOIOMMUOps *ops) { bcontainer->ops = ops; bcontainer->space = space; + bcontainer->dirty_pages_supported = false; QLIST_INIT(&bcontainer->giommu_list); } diff --git a/hw/vfio/container.c b/hw/vfio/container.c index b7ab0d7323..cf373e42ef 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -157,7 +157,7 @@ static int vfio_legacy_dma_unmap(VFIOContainerBase *bcontainer, hwaddr iova, if (iotlb && vfio_devices_all_running_and_mig_active(container)) { if (!vfio_devices_all_device_dirty_tracking(container) && - container->dirty_pages_supported) { + container->bcontainer.dirty_pages_supported) { return vfio_dma_unmap_bitmap(container, iova, size, iotlb); } @@ -255,14 +255,17 @@ static int vfio_legacy_dma_map(VFIOContainerBase *bcontainer, hwaddr iova, return -errno; } -int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start) +static int vfio_legacy_set_dirty_page_tracking(VFIOContainerBase *bcontainer, + bool start) { + VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); int ret; struct vfio_iommu_type1_dirty_bitmap dirty = { .argsz = sizeof(dirty), }; - if (!container->dirty_pages_supported) { + if (!bcontainer->dirty_pages_supported) { return 0; } @@ -282,9 +285,12 @@ int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start) return ret; } -int vfio_query_dirty_bitmap(VFIOContainer *container, VFIOBitmap *vbmap, - hwaddr iova, hwaddr size) +static int vfio_legacy_query_dirty_bitmap(VFIOContainerBase *bcontainer, + VFIOBitmap *vbmap, + hwaddr iova, hwaddr size) { + VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); struct vfio_iommu_type1_dirty_bitmap *dbitmap; struct vfio_iommu_type1_dirty_bitmap_get *range; int ret; @@ -528,7 +534,7 @@ static void vfio_get_iommu_info_migration(VFIOContainer *container, * qemu_real_host_page_size to mark those dirty. */ if (cap_mig->pgsize_bitmap & qemu_real_host_page_size()) { - container->dirty_pages_supported = true; + container->bcontainer.dirty_pages_supported = true; container->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size; container->dirty_pgsizes = cap_mig->pgsize_bitmap; } @@ -646,7 +652,6 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, container = g_malloc0(sizeof(*container)); container->fd = fd; container->error = NULL; - container->dirty_pages_supported = false; container->dma_max_mappings = 0; container->iova_ranges = NULL; QLIST_INIT(&container->vrdl_list); @@ -1050,4 +1055,6 @@ void vfio_detach_device(VFIODevice *vbasedev) const VFIOIOMMUOps vfio_legacy_ops = { .dma_map = vfio_legacy_dma_map, .dma_unmap = vfio_legacy_dma_unmap, + .set_dirty_page_tracking = vfio_legacy_set_dirty_page_tracking, + .query_dirty_bitmap = vfio_legacy_query_dirty_bitmap, }; diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 31c9df4b03..af0ef9042d 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -91,7 +91,6 @@ typedef struct VFIOContainer { unsigned iommu_type; Error *error; bool initialized; - bool dirty_pages_supported; bool dirty_log_manual_clear; uint64_t dirty_pgsizes; uint64_t max_dirty_bitmap_size; @@ -200,13 +199,9 @@ VFIOAddressSpace *vfio_get_address_space(AddressSpace *as); void vfio_put_address_space(VFIOAddressSpace *space); bool vfio_devices_all_running_and_saving(VFIOContainer *container); -/* container->fd */ VFIODMARange *vfio_lookup_match_range(VFIOContainer *container, hwaddr start_addr, hwaddr size); void vfio_dma_range_init_dirty_bitmap(VFIODMARange *qrange); -int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start); -int vfio_query_dirty_bitmap(VFIOContainer *container, VFIOBitmap *vbmap, - hwaddr iova, hwaddr size); /* SPAPR specific */ int vfio_container_add_section_window(VFIOContainer *container, diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index c7cc6ec9c5..f244f003d0 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -36,6 +36,7 @@ typedef struct VFIOAddressSpace { typedef struct VFIOContainerBase { const VFIOIOMMUOps *ops; VFIOAddressSpace *space; + bool dirty_pages_supported; QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; QLIST_ENTRY(VFIOContainerBase) next; } VFIOContainerBase; @@ -54,6 +55,11 @@ int vfio_container_dma_map(VFIOContainerBase *bcontainer, int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, IOMMUTLBEntry *iotlb); +int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer, + bool start); +int vfio_container_query_dirty_bitmap(VFIOContainerBase *bcontainer, + VFIOBitmap *vbmap, + hwaddr iova, hwaddr size); void vfio_container_init(VFIOContainerBase *bcontainer, VFIOAddressSpace *space, -- Gitee From 22244582a5ff77c0d93008e603a343c1e47ca85d Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:25 +0800 Subject: [PATCH 008/134] vfio/container: Move per container device list in base container MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit VFIO Device is also changed to point to base container instead of legacy container. No functional change intended. Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/common.c | 23 +++++++++++++++-------- hw/vfio/container.c | 12 ++++++------ include/hw/vfio/vfio-common.h | 3 +-- include/hw/vfio/vfio-container-base.h | 1 + 4 files changed, 23 insertions(+), 16 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 3be6cecc63..b952d1c811 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -145,7 +145,7 @@ void vfio_unblock_multiple_devices_migration(void) bool vfio_viommu_preset(VFIODevice *vbasedev) { - return vbasedev->container->bcontainer.space->as != &address_space_memory; + return vbasedev->bcontainer->space->as != &address_space_memory; } static void vfio_set_migration_error(int err) @@ -179,6 +179,7 @@ bool vfio_device_state_is_precopy(VFIODevice *vbasedev) static bool vfio_devices_all_dirty_tracking(VFIOContainer *container) { + VFIOContainerBase *bcontainer = &container->bcontainer; VFIODevice *vbasedev; MigrationState *ms = migrate_get_current(); @@ -187,7 +188,7 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainer *container) return false; } - QLIST_FOREACH(vbasedev, &container->device_list, container_next) { + QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { VFIOMigration *migration = vbasedev->migration; if (!migration) { @@ -205,9 +206,10 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainer *container) bool vfio_devices_all_device_dirty_tracking(VFIOContainer *container) { + VFIOContainerBase *bcontainer = &container->bcontainer; VFIODevice *vbasedev; - QLIST_FOREACH(vbasedev, &container->device_list, container_next) { + QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { if (!vbasedev->dirty_pages_supported) { return false; } @@ -222,13 +224,14 @@ bool vfio_devices_all_device_dirty_tracking(VFIOContainer *container) */ bool vfio_devices_all_running_and_mig_active(VFIOContainer *container) { + VFIOContainerBase *bcontainer = &container->bcontainer; VFIODevice *vbasedev; if (!migration_is_active(migrate_get_current())) { return false; } - QLIST_FOREACH(vbasedev, &container->device_list, container_next) { + QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { VFIOMigration *migration = vbasedev->migration; if (!migration) { @@ -833,12 +836,13 @@ static bool vfio_section_is_vfio_pci(MemoryRegionSection *section, VFIOContainer *container) { VFIOPCIDevice *pcidev; + VFIOContainerBase *bcontainer = &container->bcontainer; VFIODevice *vbasedev; Object *owner; owner = memory_region_owner(section->mr); - QLIST_FOREACH(vbasedev, &container->device_list, container_next) { + QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) { continue; } @@ -939,13 +943,14 @@ static void vfio_devices_dma_logging_stop(VFIOContainer *container) uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature), sizeof(uint64_t))] = {}; struct vfio_device_feature *feature = (struct vfio_device_feature *)buf; + VFIOContainerBase *bcontainer = &container->bcontainer; VFIODevice *vbasedev; feature->argsz = sizeof(buf); feature->flags = VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP; - QLIST_FOREACH(vbasedev, &container->device_list, container_next) { + QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { if (!vbasedev->dirty_tracking) { continue; } @@ -1036,6 +1041,7 @@ static int vfio_devices_dma_logging_start(VFIOContainer *container) { struct vfio_device_feature *feature; VFIODirtyRanges ranges; + VFIOContainerBase *bcontainer = &container->bcontainer; VFIODevice *vbasedev; int ret = 0; @@ -1046,7 +1052,7 @@ static int vfio_devices_dma_logging_start(VFIOContainer *container) return -errno; } - QLIST_FOREACH(vbasedev, &container->device_list, container_next) { + QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { if (vbasedev->dirty_tracking) { continue; } @@ -1139,10 +1145,11 @@ int vfio_devices_query_dirty_bitmap(VFIOContainer *container, VFIOBitmap *vbmap, hwaddr iova, hwaddr size) { + VFIOContainerBase *bcontainer = &container->bcontainer; VFIODevice *vbasedev; int ret; - QLIST_FOREACH(vbasedev, &container->device_list, container_next) { + QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { ret = vfio_device_dma_logging_report(vbasedev, iova, size, vbmap->bitmap); if (ret) { diff --git a/hw/vfio/container.c b/hw/vfio/container.c index cf373e42ef..74d236ddee 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -1001,7 +1001,7 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev, int groupid = vfio_device_groupid(vbasedev, errp); VFIODevice *vbasedev_iter; VFIOGroup *group; - VFIOContainer *container; + VFIOContainerBase *bcontainer; int ret; if (groupid < 0) { @@ -1028,9 +1028,9 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev, return ret; } - container = group->container; - vbasedev->container = container; - QLIST_INSERT_HEAD(&container->device_list, vbasedev, container_next); + bcontainer = &group->container->bcontainer; + vbasedev->bcontainer = bcontainer; + QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next); QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next); return ret; @@ -1040,13 +1040,13 @@ void vfio_detach_device(VFIODevice *vbasedev) { VFIOGroup *group = vbasedev->group; - if (!vbasedev->container) { + if (!vbasedev->bcontainer) { return; } QLIST_REMOVE(vbasedev, global_next); QLIST_REMOVE(vbasedev, container_next); - vbasedev->container = NULL; + vbasedev->bcontainer = NULL; trace_vfio_detach_device(vbasedev->name, group->groupid); vfio_put_base_device(vbasedev); vfio_put_group(group); diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index af0ef9042d..e27854228c 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -100,7 +100,6 @@ typedef struct VFIOContainer { QLIST_HEAD(, VFIOGroup) group_list; QLIST_HEAD(, VFIORamDiscardListener) vrdl_list; QLIST_HEAD(, VFIODMARange) dma_list; - QLIST_HEAD(, VFIODevice) device_list; GList *iova_ranges; } VFIOContainer; @@ -128,7 +127,7 @@ typedef struct VFIODevice { QLIST_ENTRY(VFIODevice) container_next; QLIST_ENTRY(VFIODevice) global_next; struct VFIOGroup *group; - VFIOContainer *container; + VFIOContainerBase *bcontainer; char *sysfsdev; char *name; DeviceState *dev; diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index f244f003d0..7090962496 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -39,6 +39,7 @@ typedef struct VFIOContainerBase { bool dirty_pages_supported; QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; QLIST_ENTRY(VFIOContainerBase) next; + QLIST_HEAD(, VFIODevice) device_list; } VFIOContainerBase; typedef struct VFIOGuestIOMMU { -- Gitee From 718cfbf181541fa4142aba10d5aee839e06b4d66 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Sat, 11 Jan 2025 10:52:26 +0800 Subject: [PATCH 009/134] vfio/container: Convert functions to base container MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the prospect to get rid of VFIOContainer refs in common.c lets convert misc functions to use the base container object instead: vfio_devices_all_dirty_tracking vfio_devices_all_device_dirty_tracking vfio_devices_all_running_and_mig_active vfio_devices_query_dirty_bitmap vfio_get_dirty_bitmap Modify vfio_get_dirty_bitmap/vfio_listener_log_clear during backporting. Signed-off-by: Eric Auger Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/common.c | 46 ++++++++++++++++------------------- hw/vfio/container.c | 6 ++--- hw/vfio/trace-events | 2 +- include/hw/vfio/vfio-common.h | 9 +++---- 4 files changed, 29 insertions(+), 34 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index b952d1c811..b663d0bcc0 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -177,9 +177,8 @@ bool vfio_device_state_is_precopy(VFIODevice *vbasedev) migration->device_state == VFIO_DEVICE_STATE_PRE_COPY_P2P; } -static bool vfio_devices_all_dirty_tracking(VFIOContainer *container) +static bool vfio_devices_all_dirty_tracking(VFIOContainerBase *bcontainer) { - VFIOContainerBase *bcontainer = &container->bcontainer; VFIODevice *vbasedev; MigrationState *ms = migrate_get_current(); @@ -204,9 +203,8 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainer *container) return true; } -bool vfio_devices_all_device_dirty_tracking(VFIOContainer *container) +bool vfio_devices_all_device_dirty_tracking(VFIOContainerBase *bcontainer) { - VFIOContainerBase *bcontainer = &container->bcontainer; VFIODevice *vbasedev; QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { @@ -222,9 +220,8 @@ bool vfio_devices_all_device_dirty_tracking(VFIOContainer *container) * Check if all VFIO devices are running and migration is active, which is * essentially equivalent to the migration being in pre-copy phase. */ -bool vfio_devices_all_running_and_mig_active(VFIOContainer *container) +bool vfio_devices_all_running_and_mig_active(VFIOContainerBase *bcontainer) { - VFIOContainerBase *bcontainer = &container->bcontainer; VFIODevice *vbasedev; if (!migration_is_active(migrate_get_current())) { @@ -1082,7 +1079,7 @@ static void vfio_listener_log_global_start(MemoryListener *listener) VFIOContainer *container = container_of(listener, VFIOContainer, listener); int ret; - if (vfio_devices_all_device_dirty_tracking(container)) { + if (vfio_devices_all_device_dirty_tracking(&container->bcontainer)) { ret = vfio_devices_dma_logging_start(container); } else { ret = vfio_container_set_dirty_page_tracking(&container->bcontainer, @@ -1101,7 +1098,7 @@ static void vfio_listener_log_global_stop(MemoryListener *listener) VFIOContainer *container = container_of(listener, VFIOContainer, listener); int ret = 0; - if (vfio_devices_all_device_dirty_tracking(container)) { + if (vfio_devices_all_device_dirty_tracking(&container->bcontainer)) { vfio_devices_dma_logging_stop(container); } else { ret = vfio_container_set_dirty_page_tracking(&container->bcontainer, @@ -1141,11 +1138,10 @@ static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova, return 0; } -int vfio_devices_query_dirty_bitmap(VFIOContainer *container, +int vfio_devices_query_dirty_bitmap(VFIOContainerBase *bcontainer, VFIOBitmap *vbmap, hwaddr iova, hwaddr size) { - VFIOContainerBase *bcontainer = &container->bcontainer; VFIODevice *vbasedev; int ret; @@ -1165,18 +1161,19 @@ int vfio_devices_query_dirty_bitmap(VFIOContainer *container, return 0; } -int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, +int vfio_get_dirty_bitmap(VFIOContainerBase *bcontainer, uint64_t iova, uint64_t size, ram_addr_t ram_addr) { bool all_device_dirty_tracking = - vfio_devices_all_device_dirty_tracking(container); + vfio_devices_all_device_dirty_tracking(bcontainer); + VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); uint64_t dirty_pages; VFIOBitmap vbmap; VFIODMARange *qrange; int ret; - if (!container->bcontainer.dirty_pages_supported && - !all_device_dirty_tracking) { + if (!bcontainer->dirty_pages_supported && !all_device_dirty_tracking) { cpu_physical_memory_set_dirty_range(ram_addr, size, tcg_enabled() ? DIRTY_CLIENTS_ALL : DIRTY_CLIENTS_NOCODE); @@ -1195,10 +1192,9 @@ int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, vbmap.bitmap = qrange->bitmap; if (all_device_dirty_tracking) { - ret = vfio_devices_query_dirty_bitmap(container, &vbmap, iova, size); + ret = vfio_devices_query_dirty_bitmap(bcontainer, &vbmap, iova, size); } else { - ret = vfio_container_query_dirty_bitmap(&container->bcontainer, &vbmap, - iova, size); + ret = vfio_container_query_dirty_bitmap(bcontainer, &vbmap, iova, size); } if (ret) { @@ -1208,8 +1204,7 @@ int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, dirty_pages = cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap, ram_addr, vbmap.pages); - trace_vfio_get_dirty_bitmap(container->fd, iova, size, vbmap.size, - ram_addr, dirty_pages); + trace_vfio_get_dirty_bitmap(iova, size, vbmap.size, ram_addr, dirty_pages); out: return ret; } @@ -1241,8 +1236,8 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) rcu_read_lock(); if (vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL)) { - ret = vfio_get_dirty_bitmap(container, iova, iotlb->addr_mask + 1, - translated_addr); + ret = vfio_get_dirty_bitmap(&container->bcontainer, iova, + iotlb->addr_mask + 1, translated_addr); if (ret) { error_report("vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx") = %d (%s)", @@ -1271,7 +1266,8 @@ static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section, * Sync the whole mapped region (spanning multiple individual mappings) * in one go. */ - return vfio_get_dirty_bitmap(vrdl->container, iova, size, ram_addr); + return vfio_get_dirty_bitmap(&vrdl->container->bcontainer, iova, size, + ram_addr); } static int vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainer *container, @@ -1340,7 +1336,7 @@ static int vfio_sync_dirty_bitmap(VFIOContainer *container, ram_addr = memory_region_get_ram_addr(section->mr) + section->offset_within_region; - return vfio_get_dirty_bitmap(container, + return vfio_get_dirty_bitmap(&container->bcontainer, REAL_HOST_PAGE_ALIGN(section->offset_within_address_space), int128_get64(section->size), ram_addr); } @@ -1355,7 +1351,7 @@ static void vfio_listener_log_sync(MemoryListener *listener, return; } - if (vfio_devices_all_dirty_tracking(container)) { + if (vfio_devices_all_dirty_tracking(&container->bcontainer)) { ret = vfio_sync_dirty_bitmap(container, section); if (ret) { error_report("vfio: Failed to sync dirty bitmap, err: %d (%s)", ret, @@ -1495,7 +1491,7 @@ static void vfio_listener_log_clear(MemoryListener *listener, return; } - if (vfio_devices_all_dirty_tracking(container)) { + if (vfio_devices_all_dirty_tracking(&container->bcontainer)) { vfio_physical_log_clear(container, section); } } diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 74d236ddee..9a542368ab 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -155,8 +155,8 @@ static int vfio_legacy_dma_unmap(VFIOContainerBase *bcontainer, hwaddr iova, bool need_dirty_sync = false; int ret; - if (iotlb && vfio_devices_all_running_and_mig_active(container)) { - if (!vfio_devices_all_device_dirty_tracking(container) && + if (iotlb && vfio_devices_all_running_and_mig_active(bcontainer)) { + if (!vfio_devices_all_device_dirty_tracking(bcontainer) && container->bcontainer.dirty_pages_supported) { return vfio_dma_unmap_bitmap(container, iova, size, iotlb); } @@ -204,7 +204,7 @@ static int vfio_legacy_dma_unmap(VFIOContainerBase *bcontainer, hwaddr iova, } if (need_dirty_sync) { - ret = vfio_get_dirty_bitmap(container, iova, size, + ret = vfio_get_dirty_bitmap(bcontainer, iova, size, iotlb->translated_addr); if (ret) { return ret; diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index 9f7fedee98..08a1f9dfa4 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -117,7 +117,7 @@ vfio_region_sparse_mmap_header(const char *name, int index, int nr_areas) "Devic vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned long end) "sparse entry %d [0x%lx - 0x%lx]" vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%08x" vfio_legacy_dma_unmap_overflow_workaround(void) "" -vfio_get_dirty_bitmap(int fd, uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start, uint64_t dirty_pages) "container fd=%d, iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64" dirty_pages=%"PRIu64 +vfio_get_dirty_bitmap(uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start, uint64_t dirty_pages) "iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64" dirty_pages=%"PRIu64 vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64 # platform.c diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index e27854228c..0295ede7ba 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -196,7 +196,6 @@ typedef struct VFIODisplay { VFIOAddressSpace *vfio_get_address_space(AddressSpace *as); void vfio_put_address_space(VFIOAddressSpace *space); -bool vfio_devices_all_running_and_saving(VFIOContainer *container); VFIODMARange *vfio_lookup_match_range(VFIOContainer *container, hwaddr start_addr, hwaddr size); @@ -274,11 +273,11 @@ bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp); void vfio_migration_exit(VFIODevice *vbasedev); int vfio_bitmap_alloc(VFIOBitmap *vbmap, hwaddr size); -bool vfio_devices_all_running_and_mig_active(VFIOContainer *container); -bool vfio_devices_all_device_dirty_tracking(VFIOContainer *container); -int vfio_devices_query_dirty_bitmap(VFIOContainer *container, +bool vfio_devices_all_running_and_mig_active(VFIOContainerBase *bcontainer); +bool vfio_devices_all_device_dirty_tracking(VFIOContainerBase *bcontainer); +int vfio_devices_query_dirty_bitmap(VFIOContainerBase *bcontainer, VFIOBitmap *vbmap, hwaddr iova, hwaddr size); -int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, +int vfio_get_dirty_bitmap(VFIOContainerBase *bcontainer, uint64_t iova, uint64_t size, ram_addr_t ram_addr); #endif /* HW_VFIO_VFIO_COMMON_H */ -- Gitee From 961614f6c997caf632ce37ead96b301ec47b1847 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Sat, 11 Jan 2025 10:52:27 +0800 Subject: [PATCH 010/134] vfio/container: Move pgsizes and dma_max_mappings to base container MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No functional change intended. Signed-off-by: Eric Auger Signed-off-by: Yi Liu Signed-off-by: Yi Sun Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/common.c | 17 +++++++++-------- hw/vfio/container-base.c | 1 + hw/vfio/container.c | 11 +++++------ hw/vfio/spapr.c | 10 ++++++---- include/hw/vfio/vfio-common.h | 2 -- include/hw/vfio/vfio-container-base.h | 2 ++ 6 files changed, 23 insertions(+), 20 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index b663d0bcc0..fd6249c290 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -401,6 +401,7 @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl, static void vfio_register_ram_discard_listener(VFIOContainer *container, MemoryRegionSection *section) { + VFIOContainerBase *bcontainer = &container->bcontainer; RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); VFIORamDiscardListener *vrdl; @@ -419,8 +420,8 @@ static void vfio_register_ram_discard_listener(VFIOContainer *container, section->mr); g_assert(vrdl->granularity && is_power_of_2(vrdl->granularity)); - g_assert(container->pgsizes && - vrdl->granularity >= 1ULL << ctz64(container->pgsizes)); + g_assert(bcontainer->pgsizes && + vrdl->granularity >= 1ULL << ctz64(bcontainer->pgsizes)); ram_discard_listener_init(&vrdl->listener, vfio_ram_discard_notify_populate, @@ -441,7 +442,7 @@ static void vfio_register_ram_discard_listener(VFIOContainer *container, * number of sections in the address space we could have over time, * also consuming DMA mappings. */ - if (container->dma_max_mappings) { + if (bcontainer->dma_max_mappings) { unsigned int vrdl_count = 0, vrdl_mappings = 0, max_memslots = 512; #ifdef CONFIG_KVM @@ -462,11 +463,11 @@ static void vfio_register_ram_discard_listener(VFIOContainer *container, } if (vrdl_mappings + max_memslots - vrdl_count > - container->dma_max_mappings) { + bcontainer->dma_max_mappings) { warn_report("%s: possibly running out of DMA mappings. E.g., try" " increasing the 'block-size' of virtio-mem devies." " Maximum possible DMA mappings: %d, Maximum possible" - " memslots: %d", __func__, container->dma_max_mappings, + " memslots: %d", __func__, bcontainer->dma_max_mappings, max_memslots); } } @@ -626,7 +627,7 @@ static void vfio_listener_region_add(MemoryListener *listener, iommu_idx); ret = memory_region_iommu_set_page_size_mask(giommu->iommu_mr, - container->pgsizes, + bcontainer->pgsizes, &err); if (ret) { g_free(giommu); @@ -675,7 +676,7 @@ static void vfio_listener_region_add(MemoryListener *listener, llsize = int128_sub(llend, int128_make64(iova)); if (memory_region_is_ram_device(section->mr)) { - hwaddr pgmask = (1ULL << ctz64(container->pgsizes)) - 1; + hwaddr pgmask = (1ULL << ctz64(bcontainer->pgsizes)) - 1; if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) { trace_vfio_listener_region_add_no_dma_map( @@ -777,7 +778,7 @@ static void vfio_listener_region_del(MemoryListener *listener, if (memory_region_is_ram_device(section->mr)) { hwaddr pgmask; - pgmask = (1ULL << ctz64(container->pgsizes)) - 1; + pgmask = (1ULL << ctz64(bcontainer->pgsizes)) - 1; try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask)); } else if (memory_region_has_ram_discard_manager(section->mr)) { vfio_unregister_ram_discard_listener(container, section); diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c index 5d654ae172..dcce111349 100644 --- a/hw/vfio/container-base.c +++ b/hw/vfio/container-base.c @@ -52,6 +52,7 @@ void vfio_container_init(VFIOContainerBase *bcontainer, VFIOAddressSpace *space, bcontainer->ops = ops; bcontainer->space = space; bcontainer->dirty_pages_supported = false; + bcontainer->dma_max_mappings = 0; QLIST_INIT(&bcontainer->giommu_list); } diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 9a542368ab..116a9e1e73 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -196,7 +196,7 @@ static int vfio_legacy_dma_unmap(VFIOContainerBase *bcontainer, hwaddr iova, if (errno == EINVAL && unmap.size && !(unmap.iova + unmap.size) && container->iommu_type == VFIO_TYPE1v2_IOMMU) { trace_vfio_legacy_dma_unmap_overflow_workaround(); - unmap.size -= 1ULL << ctz64(container->pgsizes); + unmap.size -= 1ULL << ctz64(bcontainer->pgsizes); continue; } error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno)); @@ -652,7 +652,6 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, container = g_malloc0(sizeof(*container)); container->fd = fd; container->error = NULL; - container->dma_max_mappings = 0; container->iova_ranges = NULL; QLIST_INIT(&container->vrdl_list); QLIST_INIT(&container->dma_list); @@ -684,13 +683,13 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, } if (info->flags & VFIO_IOMMU_INFO_PGSIZES) { - container->pgsizes = info->iova_pgsizes; + bcontainer->pgsizes = info->iova_pgsizes; } else { - container->pgsizes = qemu_real_host_page_size(); + bcontainer->pgsizes = qemu_real_host_page_size(); } - if (!vfio_get_info_dma_avail(info, &container->dma_max_mappings)) { - container->dma_max_mappings = 65535; + if (!vfio_get_info_dma_avail(info, &bcontainer->dma_max_mappings)) { + bcontainer->dma_max_mappings = 65535; } vfio_get_info_iova_range(info, container); diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c index 83da2f7ec2..4f76bdd3ca 100644 --- a/hw/vfio/spapr.c +++ b/hw/vfio/spapr.c @@ -226,6 +226,7 @@ static int vfio_spapr_create_window(VFIOContainer *container, hwaddr *pgsize) { int ret = 0; + VFIOContainerBase *bcontainer = &container->bcontainer; IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr); uint64_t pagesize = memory_region_iommu_get_min_page_size(iommu_mr), pgmask; unsigned entries, bits_total, bits_per_level, max_levels; @@ -239,13 +240,13 @@ static int vfio_spapr_create_window(VFIOContainer *container, if (pagesize > rampagesize) { pagesize = rampagesize; } - pgmask = container->pgsizes & (pagesize | (pagesize - 1)); + pgmask = bcontainer->pgsizes & (pagesize | (pagesize - 1)); pagesize = pgmask ? (1ULL << (63 - clz64(pgmask))) : 0; if (!pagesize) { error_report("Host doesn't support page size 0x%"PRIx64 ", the supported mask is 0x%lx", memory_region_iommu_get_min_page_size(iommu_mr), - container->pgsizes); + bcontainer->pgsizes); return -EINVAL; } @@ -421,6 +422,7 @@ void vfio_container_del_section_window(VFIOContainer *container, int vfio_spapr_container_init(VFIOContainer *container, Error **errp) { + VFIOContainerBase *bcontainer = &container->bcontainer; struct vfio_iommu_spapr_tce_info info; bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU; int ret, fd = container->fd; @@ -461,7 +463,7 @@ int vfio_spapr_container_init(VFIOContainer *container, Error **errp) } if (v2) { - container->pgsizes = info.ddw.pgsizes; + bcontainer->pgsizes = info.ddw.pgsizes; /* * There is a default window in just created container. * To make region_add/del simpler, we better remove this @@ -476,7 +478,7 @@ int vfio_spapr_container_init(VFIOContainer *container, Error **errp) } } else { /* The default table uses 4K pages */ - container->pgsizes = 0x1000; + bcontainer->pgsizes = 0x1000; vfio_host_win_add(container, info.dma32_window_start, info.dma32_window_start + info.dma32_window_size - 1, diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 0295ede7ba..3046287070 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -94,8 +94,6 @@ typedef struct VFIOContainer { bool dirty_log_manual_clear; uint64_t dirty_pgsizes; uint64_t max_dirty_bitmap_size; - unsigned long pgsizes; - unsigned int dma_max_mappings; QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; QLIST_HEAD(, VFIOGroup) group_list; QLIST_HEAD(, VFIORamDiscardListener) vrdl_list; diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index 7090962496..85ec7e1a56 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -36,6 +36,8 @@ typedef struct VFIOAddressSpace { typedef struct VFIOContainerBase { const VFIOIOMMUOps *ops; VFIOAddressSpace *space; + unsigned long pgsizes; + unsigned int dma_max_mappings; bool dirty_pages_supported; QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; QLIST_ENTRY(VFIOContainerBase) next; -- Gitee From d0234f18616cfe9a43287ba75e4788a10166a526 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:28 +0800 Subject: [PATCH 011/134] vfio/container: Move vrdl_list to base container MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No functional change intended. Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/common.c | 38 +++++++++++++-------------- hw/vfio/container-base.c | 1 + hw/vfio/container.c | 1 - include/hw/vfio/vfio-common.h | 11 -------- include/hw/vfio/vfio-container-base.h | 11 ++++++++ 5 files changed, 31 insertions(+), 31 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index fd6249c290..e9a19209ab 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -351,13 +351,13 @@ static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl, { VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener, listener); + VFIOContainerBase *bcontainer = vrdl->bcontainer; const hwaddr size = int128_get64(section->size); const hwaddr iova = section->offset_within_address_space; int ret; /* Unmap with a single call. */ - ret = vfio_container_dma_unmap(&vrdl->container->bcontainer, - iova, size , NULL); + ret = vfio_container_dma_unmap(bcontainer, iova, size , NULL); if (ret) { error_report("%s: vfio_container_dma_unmap() failed: %s", __func__, strerror(-ret)); @@ -369,6 +369,7 @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl, { VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener, listener); + VFIOContainerBase *bcontainer = vrdl->bcontainer; const hwaddr end = section->offset_within_region + int128_get64(section->size); hwaddr start, next, iova; @@ -387,8 +388,8 @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl, section->offset_within_address_space; vaddr = memory_region_get_ram_ptr(section->mr) + start; - ret = vfio_container_dma_map(&vrdl->container->bcontainer, iova, - next - start, vaddr, section->readonly); + ret = vfio_container_dma_map(bcontainer, iova, next - start, + vaddr, section->readonly); if (ret) { /* Rollback */ vfio_ram_discard_notify_discard(rdl, section); @@ -398,10 +399,9 @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl, return 0; } -static void vfio_register_ram_discard_listener(VFIOContainer *container, +static void vfio_register_ram_discard_listener(VFIOContainerBase *bcontainer, MemoryRegionSection *section) { - VFIOContainerBase *bcontainer = &container->bcontainer; RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); VFIORamDiscardListener *vrdl; @@ -412,7 +412,7 @@ static void vfio_register_ram_discard_listener(VFIOContainer *container, g_assert(QEMU_IS_ALIGNED(int128_get64(section->size), TARGET_PAGE_SIZE)); vrdl = g_new0(VFIORamDiscardListener, 1); - vrdl->container = container; + vrdl->bcontainer = bcontainer; vrdl->mr = section->mr; vrdl->offset_within_address_space = section->offset_within_address_space; vrdl->size = int128_get64(section->size); @@ -427,7 +427,7 @@ static void vfio_register_ram_discard_listener(VFIOContainer *container, vfio_ram_discard_notify_populate, vfio_ram_discard_notify_discard, true); ram_discard_manager_register_listener(rdm, &vrdl->listener, section); - QLIST_INSERT_HEAD(&container->vrdl_list, vrdl, next); + QLIST_INSERT_HEAD(&bcontainer->vrdl_list, vrdl, next); /* * Sanity-check if we have a theoretically problematic setup where we could @@ -451,7 +451,7 @@ static void vfio_register_ram_discard_listener(VFIOContainer *container, } #endif - QLIST_FOREACH(vrdl, &container->vrdl_list, next) { + QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) { hwaddr start, end; start = QEMU_ALIGN_DOWN(vrdl->offset_within_address_space, @@ -473,13 +473,13 @@ static void vfio_register_ram_discard_listener(VFIOContainer *container, } } -static void vfio_unregister_ram_discard_listener(VFIOContainer *container, +static void vfio_unregister_ram_discard_listener(VFIOContainerBase *bcontainer, MemoryRegionSection *section) { RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); VFIORamDiscardListener *vrdl = NULL; - QLIST_FOREACH(vrdl, &container->vrdl_list, next) { + QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) { if (vrdl->mr == section->mr && vrdl->offset_within_address_space == section->offset_within_address_space) { @@ -663,7 +663,7 @@ static void vfio_listener_region_add(MemoryListener *listener, * about changes. */ if (memory_region_has_ram_discard_manager(section->mr)) { - vfio_register_ram_discard_listener(container, section); + vfio_register_ram_discard_listener(bcontainer, section); return; } @@ -781,7 +781,7 @@ static void vfio_listener_region_del(MemoryListener *listener, pgmask = (1ULL << ctz64(bcontainer->pgsizes)) - 1; try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask)); } else if (memory_region_has_ram_discard_manager(section->mr)) { - vfio_unregister_ram_discard_listener(container, section); + vfio_unregister_ram_discard_listener(bcontainer, section); /* Unregistering will trigger an unmap. */ try_unmap = false; } @@ -1267,17 +1267,17 @@ static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section, * Sync the whole mapped region (spanning multiple individual mappings) * in one go. */ - return vfio_get_dirty_bitmap(&vrdl->container->bcontainer, iova, size, - ram_addr); + return vfio_get_dirty_bitmap(vrdl->bcontainer, iova, size, ram_addr); } -static int vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainer *container, - MemoryRegionSection *section) +static int +vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainerBase *bcontainer, + MemoryRegionSection *section) { RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); VFIORamDiscardListener *vrdl = NULL; - QLIST_FOREACH(vrdl, &container->vrdl_list, next) { + QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) { if (vrdl->mr == section->mr && vrdl->offset_within_address_space == section->offset_within_address_space) { @@ -1331,7 +1331,7 @@ static int vfio_sync_dirty_bitmap(VFIOContainer *container, } return 0; } else if (memory_region_has_ram_discard_manager(section->mr)) { - return vfio_sync_ram_discard_listener_dirty_bitmap(container, section); + return vfio_sync_ram_discard_listener_dirty_bitmap(bcontainer, section); } ram_addr = memory_region_get_ram_addr(section->mr) + diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c index dcce111349..584eee4ba1 100644 --- a/hw/vfio/container-base.c +++ b/hw/vfio/container-base.c @@ -54,6 +54,7 @@ void vfio_container_init(VFIOContainerBase *bcontainer, VFIOAddressSpace *space, bcontainer->dirty_pages_supported = false; bcontainer->dma_max_mappings = 0; QLIST_INIT(&bcontainer->giommu_list); + QLIST_INIT(&bcontainer->vrdl_list); } void vfio_container_destroy(VFIOContainerBase *bcontainer) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 116a9e1e73..023f220c93 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -653,7 +653,6 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, container->fd = fd; container->error = NULL; container->iova_ranges = NULL; - QLIST_INIT(&container->vrdl_list); QLIST_INIT(&container->dma_list); bcontainer = &container->bcontainer; vfio_container_init(bcontainer, space, &vfio_legacy_ops); diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 3046287070..0174b767ca 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -96,21 +96,10 @@ typedef struct VFIOContainer { uint64_t max_dirty_bitmap_size; QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; QLIST_HEAD(, VFIOGroup) group_list; - QLIST_HEAD(, VFIORamDiscardListener) vrdl_list; QLIST_HEAD(, VFIODMARange) dma_list; GList *iova_ranges; } VFIOContainer; -typedef struct VFIORamDiscardListener { - VFIOContainer *container; - MemoryRegion *mr; - hwaddr offset_within_address_space; - hwaddr size; - uint64_t granularity; - RamDiscardListener listener; - QLIST_ENTRY(VFIORamDiscardListener) next; -} VFIORamDiscardListener; - typedef struct VFIOHostDMAWindow { hwaddr min_iova; hwaddr max_iova; diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index 85ec7e1a56..8e05b5ac5a 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -40,6 +40,7 @@ typedef struct VFIOContainerBase { unsigned int dma_max_mappings; bool dirty_pages_supported; QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; + QLIST_HEAD(, VFIORamDiscardListener) vrdl_list; QLIST_ENTRY(VFIOContainerBase) next; QLIST_HEAD(, VFIODevice) device_list; } VFIOContainerBase; @@ -52,6 +53,16 @@ typedef struct VFIOGuestIOMMU { QLIST_ENTRY(VFIOGuestIOMMU) giommu_next; } VFIOGuestIOMMU; +typedef struct VFIORamDiscardListener { + VFIOContainerBase *bcontainer; + MemoryRegion *mr; + hwaddr offset_within_address_space; + hwaddr size; + uint64_t granularity; + RamDiscardListener listener; + QLIST_ENTRY(VFIORamDiscardListener) next; +} VFIORamDiscardListener; + int vfio_container_dma_map(VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, void *vaddr, bool readonly); -- Gitee From 4515b719fb7a335ce76dd9168a9e4db24fca28df Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Sat, 11 Jan 2025 10:52:29 +0800 Subject: [PATCH 012/134] vfio/container: Move listener to base container MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move listener to base container. Also error and initialized fields are moved at the same time. No functional change intended. Modify vfio_physical_log_clear/vfio_connect_container during backporting. Signed-off-by: Eric Auger Signed-off-by: Yi Liu Signed-off-by: Yi Sun Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/common.c | 119 +++++++++++++------------- hw/vfio/container-base.c | 1 + hw/vfio/container.c | 23 +++-- hw/vfio/spapr.c | 11 +-- include/hw/vfio/vfio-common.h | 3 - include/hw/vfio/vfio-container-base.h | 3 + 6 files changed, 82 insertions(+), 78 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index e9a19209ab..4647f4447d 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -541,7 +541,7 @@ static bool vfio_listener_valid_section(MemoryRegionSection *section, return true; } -static bool vfio_get_section_iova_range(VFIOContainer *container, +static bool vfio_get_section_iova_range(VFIOContainerBase *bcontainer, MemoryRegionSection *section, hwaddr *out_iova, hwaddr *out_end, Int128 *out_llend) @@ -569,8 +569,10 @@ static bool vfio_get_section_iova_range(VFIOContainer *container, static void vfio_listener_region_add(MemoryListener *listener, MemoryRegionSection *section) { - VFIOContainer *container = container_of(listener, VFIOContainer, listener); - VFIOContainerBase *bcontainer = &container->bcontainer; + VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, + listener); + VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); hwaddr iova, end; Int128 llend, llsize; void *vaddr; @@ -581,7 +583,8 @@ static void vfio_listener_region_add(MemoryListener *listener, return; } - if (!vfio_get_section_iova_range(container, section, &iova, &end, &llend)) { + if (!vfio_get_section_iova_range(bcontainer, section, &iova, &end, + &llend)) { if (memory_region_is_ram_device(section->mr)) { trace_vfio_listener_region_add_no_dma_map( memory_region_name(section->mr), @@ -688,13 +691,12 @@ static void vfio_listener_region_add(MemoryListener *listener, } } - ret = vfio_container_dma_map(&container->bcontainer, - iova, int128_get64(llsize), vaddr, - section->readonly); + ret = vfio_container_dma_map(bcontainer, iova, int128_get64(llsize), + vaddr, section->readonly); if (ret) { error_setg(&err, "vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx", %p) = %d (%s)", - container, iova, int128_get64(llsize), vaddr, ret, + bcontainer, iova, int128_get64(llsize), vaddr, ret, strerror(-ret)); if (memory_region_is_ram_device(section->mr)) { /* Allow unexpected mappings not to be fatal for RAM devices */ @@ -716,9 +718,9 @@ fail: * can gracefully fail. Runtime, there's not much we can do other * than throw a hardware error. */ - if (!container->initialized) { - if (!container->error) { - error_propagate_prepend(&container->error, err, + if (!bcontainer->initialized) { + if (!bcontainer->error) { + error_propagate_prepend(&bcontainer->error, err, "Region %s: ", memory_region_name(section->mr)); } else { @@ -733,8 +735,10 @@ fail: static void vfio_listener_region_del(MemoryListener *listener, MemoryRegionSection *section) { - VFIOContainer *container = container_of(listener, VFIOContainer, listener); - VFIOContainerBase *bcontainer = &container->bcontainer; + VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, + listener); + VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); hwaddr iova, end; Int128 llend, llsize; int ret; @@ -767,7 +771,8 @@ static void vfio_listener_region_del(MemoryListener *listener, */ } - if (!vfio_get_section_iova_range(container, section, &iova, &end, &llend)) { + if (!vfio_get_section_iova_range(bcontainer, section, &iova, &end, + &llend)) { return; } @@ -790,22 +795,22 @@ static void vfio_listener_region_del(MemoryListener *listener, if (int128_eq(llsize, int128_2_64())) { /* The unmap ioctl doesn't accept a full 64-bit span. */ llsize = int128_rshift(llsize, 1); - ret = vfio_container_dma_unmap(&container->bcontainer, iova, + ret = vfio_container_dma_unmap(bcontainer, iova, int128_get64(llsize), NULL); if (ret) { error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx") = %d (%s)", - container, iova, int128_get64(llsize), ret, + bcontainer, iova, int128_get64(llsize), ret, strerror(-ret)); } iova += int128_get64(llsize); } - ret = vfio_container_dma_unmap(&container->bcontainer, iova, + ret = vfio_container_dma_unmap(bcontainer, iova, int128_get64(llsize), NULL); if (ret) { error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx") = %d (%s)", - container, iova, int128_get64(llsize), ret, + bcontainer, iova, int128_get64(llsize), ret, strerror(-ret)); } } @@ -825,16 +830,15 @@ typedef struct VFIODirtyRanges { } VFIODirtyRanges; typedef struct VFIODirtyRangesListener { - VFIOContainer *container; + VFIOContainerBase *bcontainer; VFIODirtyRanges ranges; MemoryListener listener; } VFIODirtyRangesListener; static bool vfio_section_is_vfio_pci(MemoryRegionSection *section, - VFIOContainer *container) + VFIOContainerBase *bcontainer) { VFIOPCIDevice *pcidev; - VFIOContainerBase *bcontainer = &container->bcontainer; VFIODevice *vbasedev; Object *owner; @@ -863,7 +867,7 @@ static void vfio_dirty_tracking_update(MemoryListener *listener, hwaddr iova, end, *min, *max; if (!vfio_listener_valid_section(section, "tracking_update") || - !vfio_get_section_iova_range(dirty->container, section, + !vfio_get_section_iova_range(dirty->bcontainer, section, &iova, &end, NULL)) { return; } @@ -887,7 +891,7 @@ static void vfio_dirty_tracking_update(MemoryListener *listener, * The alternative would be an IOVATree but that has a much bigger runtime * overhead and unnecessary complexity. */ - if (vfio_section_is_vfio_pci(section, dirty->container) && + if (vfio_section_is_vfio_pci(section, dirty->bcontainer) && iova >= UINT32_MAX) { min = &range->minpci64; max = &range->maxpci64; @@ -911,7 +915,7 @@ static const MemoryListener vfio_dirty_tracking_listener = { .region_add = vfio_dirty_tracking_update, }; -static void vfio_dirty_tracking_init(VFIOContainer *container, +static void vfio_dirty_tracking_init(VFIOContainerBase *bcontainer, VFIODirtyRanges *ranges) { VFIODirtyRangesListener dirty; @@ -921,10 +925,10 @@ static void vfio_dirty_tracking_init(VFIOContainer *container, dirty.ranges.min64 = UINT64_MAX; dirty.ranges.minpci64 = UINT64_MAX; dirty.listener = vfio_dirty_tracking_listener; - dirty.container = container; + dirty.bcontainer = bcontainer; memory_listener_register(&dirty.listener, - container->bcontainer.space->as); + bcontainer->space->as); *ranges = dirty.ranges; @@ -936,12 +940,11 @@ static void vfio_dirty_tracking_init(VFIOContainer *container, memory_listener_unregister(&dirty.listener); } -static void vfio_devices_dma_logging_stop(VFIOContainer *container) +static void vfio_devices_dma_logging_stop(VFIOContainerBase *bcontainer) { uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature), sizeof(uint64_t))] = {}; struct vfio_device_feature *feature = (struct vfio_device_feature *)buf; - VFIOContainerBase *bcontainer = &container->bcontainer; VFIODevice *vbasedev; feature->argsz = sizeof(buf); @@ -962,7 +965,7 @@ static void vfio_devices_dma_logging_stop(VFIOContainer *container) } static struct vfio_device_feature * -vfio_device_feature_dma_logging_start_create(VFIOContainer *container, +vfio_device_feature_dma_logging_start_create(VFIOContainerBase *bcontainer, VFIODirtyRanges *tracking) { struct vfio_device_feature *feature; @@ -1035,16 +1038,15 @@ static void vfio_device_feature_dma_logging_start_destroy( g_free(feature); } -static int vfio_devices_dma_logging_start(VFIOContainer *container) +static int vfio_devices_dma_logging_start(VFIOContainerBase *bcontainer) { struct vfio_device_feature *feature; VFIODirtyRanges ranges; - VFIOContainerBase *bcontainer = &container->bcontainer; VFIODevice *vbasedev; int ret = 0; - vfio_dirty_tracking_init(container, &ranges); - feature = vfio_device_feature_dma_logging_start_create(container, + vfio_dirty_tracking_init(bcontainer, &ranges); + feature = vfio_device_feature_dma_logging_start_create(bcontainer, &ranges); if (!feature) { return -errno; @@ -1067,7 +1069,7 @@ static int vfio_devices_dma_logging_start(VFIOContainer *container) out: if (ret) { - vfio_devices_dma_logging_stop(container); + vfio_devices_dma_logging_stop(bcontainer); } vfio_device_feature_dma_logging_start_destroy(feature); @@ -1077,14 +1079,14 @@ out: static void vfio_listener_log_global_start(MemoryListener *listener) { - VFIOContainer *container = container_of(listener, VFIOContainer, listener); + VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, + listener); int ret; - if (vfio_devices_all_device_dirty_tracking(&container->bcontainer)) { - ret = vfio_devices_dma_logging_start(container); + if (vfio_devices_all_device_dirty_tracking(bcontainer)) { + ret = vfio_devices_dma_logging_start(bcontainer); } else { - ret = vfio_container_set_dirty_page_tracking(&container->bcontainer, - true); + ret = vfio_container_set_dirty_page_tracking(bcontainer, true); } if (ret) { @@ -1096,14 +1098,14 @@ static void vfio_listener_log_global_start(MemoryListener *listener) static void vfio_listener_log_global_stop(MemoryListener *listener) { - VFIOContainer *container = container_of(listener, VFIOContainer, listener); + VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, + listener); int ret = 0; - if (vfio_devices_all_device_dirty_tracking(&container->bcontainer)) { - vfio_devices_dma_logging_stop(container); + if (vfio_devices_all_device_dirty_tracking(bcontainer)) { + vfio_devices_dma_logging_stop(bcontainer); } else { - ret = vfio_container_set_dirty_page_tracking(&container->bcontainer, - false); + ret = vfio_container_set_dirty_page_tracking(bcontainer, false); } if (ret) { @@ -1221,8 +1223,6 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) vfio_giommu_dirty_notifier, n); VFIOGuestIOMMU *giommu = gdn->giommu; VFIOContainerBase *bcontainer = giommu->bcontainer; - VFIOContainer *container = container_of(bcontainer, VFIOContainer, - bcontainer); hwaddr iova = iotlb->iova + giommu->iommu_offset; ram_addr_t translated_addr; int ret = -EINVAL; @@ -1237,12 +1237,12 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) rcu_read_lock(); if (vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL)) { - ret = vfio_get_dirty_bitmap(&container->bcontainer, iova, - iotlb->addr_mask + 1, translated_addr); + ret = vfio_get_dirty_bitmap(bcontainer, iova, iotlb->addr_mask + 1, + translated_addr); if (ret) { error_report("vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx") = %d (%s)", - container, iova, iotlb->addr_mask + 1, ret, + bcontainer, iova, iotlb->addr_mask + 1, ret, strerror(-ret)); } } @@ -1298,10 +1298,9 @@ vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainerBase *bcontainer, &vrdl); } -static int vfio_sync_dirty_bitmap(VFIOContainer *container, +static int vfio_sync_dirty_bitmap(VFIOContainerBase *bcontainer, MemoryRegionSection *section) { - VFIOContainerBase *bcontainer = &container->bcontainer; ram_addr_t ram_addr; if (memory_region_is_iommu(section->mr)) { @@ -1337,7 +1336,7 @@ static int vfio_sync_dirty_bitmap(VFIOContainer *container, ram_addr = memory_region_get_ram_addr(section->mr) + section->offset_within_region; - return vfio_get_dirty_bitmap(&container->bcontainer, + return vfio_get_dirty_bitmap(bcontainer, REAL_HOST_PAGE_ALIGN(section->offset_within_address_space), int128_get64(section->size), ram_addr); } @@ -1345,15 +1344,16 @@ static int vfio_sync_dirty_bitmap(VFIOContainer *container, static void vfio_listener_log_sync(MemoryListener *listener, MemoryRegionSection *section) { - VFIOContainer *container = container_of(listener, VFIOContainer, listener); + VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, + listener); int ret; if (vfio_listener_skipped_section(section)) { return; } - if (vfio_devices_all_dirty_tracking(&container->bcontainer)) { - ret = vfio_sync_dirty_bitmap(container, section); + if (vfio_devices_all_dirty_tracking(bcontainer)) { + ret = vfio_sync_dirty_bitmap(bcontainer, section); if (ret) { error_report("vfio: Failed to sync dirty bitmap, err: %d (%s)", ret, strerror(-ret)); @@ -1485,14 +1485,17 @@ static int vfio_physical_log_clear(VFIOContainer *container, static void vfio_listener_log_clear(MemoryListener *listener, MemoryRegionSection *section) { - VFIOContainer *container = container_of(listener, VFIOContainer, listener); + VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, + listener); + VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); if (vfio_listener_skipped_section(section) || - !container->bcontainer.dirty_pages_supported) { + !bcontainer->dirty_pages_supported) { return; } - if (vfio_devices_all_dirty_tracking(&container->bcontainer)) { + if (vfio_devices_all_dirty_tracking(bcontainer)) { vfio_physical_log_clear(container, section); } } diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c index 584eee4ba1..7f508669f5 100644 --- a/hw/vfio/container-base.c +++ b/hw/vfio/container-base.c @@ -51,6 +51,7 @@ void vfio_container_init(VFIOContainerBase *bcontainer, VFIOAddressSpace *space, { bcontainer->ops = ops; bcontainer->space = space; + bcontainer->error = NULL; bcontainer->dirty_pages_supported = false; bcontainer->dma_max_mappings = 0; QLIST_INIT(&bcontainer->giommu_list); diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 023f220c93..50da1300dd 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -520,6 +520,7 @@ static void vfio_get_iommu_info_migration(VFIOContainer *container, { struct vfio_info_cap_header *hdr; struct vfio_iommu_type1_info_cap_migration *cap_mig; + VFIOContainerBase *bcontainer = &container->bcontainer; hdr = vfio_get_iommu_info_cap(info, VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION); if (!hdr) { @@ -534,7 +535,7 @@ static void vfio_get_iommu_info_migration(VFIOContainer *container, * qemu_real_host_page_size to mark those dirty. */ if (cap_mig->pgsize_bitmap & qemu_real_host_page_size()) { - container->bcontainer.dirty_pages_supported = true; + bcontainer->dirty_pages_supported = true; container->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size; container->dirty_pgsizes = cap_mig->pgsize_bitmap; } @@ -651,7 +652,6 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, container = g_malloc0(sizeof(*container)); container->fd = fd; - container->error = NULL; container->iova_ranges = NULL; QLIST_INIT(&container->dma_list); bcontainer = &container->bcontainer; @@ -716,23 +716,22 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, group->container = container; QLIST_INSERT_HEAD(&container->group_list, group, container_next); - container->listener = vfio_memory_listener; - if (kvm_csv3_enabled()) { - shared_memory_listener_register(&container->listener, + shared_memory_listener_register(&bcontainer->listener, bcontainer->space->as); - } else { - memory_listener_register(&container->listener, bcontainer->space->as); } - if (container->error) { + bcontainer->listener = vfio_memory_listener; + memory_listener_register(&bcontainer->listener, bcontainer->space->as); + + if (bcontainer->error) { ret = -1; - error_propagate_prepend(errp, container->error, + error_propagate_prepend(errp, bcontainer->error, "memory listener initialization failed: "); goto listener_release_exit; } - container->initialized = true; + bcontainer->initialized = true; return 0; listener_release_exit: @@ -742,7 +741,7 @@ listener_release_exit: if (kvm_csv3_enabled()) { shared_memory_listener_unregister(); } else { - memory_listener_unregister(&container->listener); + memory_listener_unregister(&bcontainer->listener); } if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU || container->iommu_type == VFIO_SPAPR_TCE_IOMMU) { @@ -781,7 +780,7 @@ static void vfio_disconnect_container(VFIOGroup *group) if (kvm_csv3_enabled()) { shared_memory_listener_unregister(); } else { - memory_listener_unregister(&container->listener); + memory_listener_unregister(&bcontainer->listener); } if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU || container->iommu_type == VFIO_SPAPR_TCE_IOMMU) { diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c index 4f76bdd3ca..7a50975f25 100644 --- a/hw/vfio/spapr.c +++ b/hw/vfio/spapr.c @@ -46,6 +46,7 @@ static void vfio_prereg_listener_region_add(MemoryListener *listener, { VFIOContainer *container = container_of(listener, VFIOContainer, prereg_listener); + VFIOContainerBase *bcontainer = &container->bcontainer; const hwaddr gpa = section->offset_within_address_space; hwaddr end; int ret; @@ -88,9 +89,9 @@ static void vfio_prereg_listener_region_add(MemoryListener *listener, * can gracefully fail. Runtime, there's not much we can do other * than throw a hardware error. */ - if (!container->initialized) { - if (!container->error) { - error_setg_errno(&container->error, -ret, + if (!bcontainer->initialized) { + if (!bcontainer->error) { + error_setg_errno(&bcontainer->error, -ret, "Memory registering failed"); } } else { @@ -445,9 +446,9 @@ int vfio_spapr_container_init(VFIOContainer *container, Error **errp) memory_listener_register(&container->prereg_listener, &address_space_memory); - if (container->error) { + if (bcontainer->error) { ret = -1; - error_propagate_prepend(errp, container->error, + error_propagate_prepend(errp, bcontainer->error, "RAM memory listener initialization failed: "); goto listener_unregister_exit; } diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 0174b767ca..c23e7fb8ee 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -86,11 +86,8 @@ typedef struct VFIODMARange { typedef struct VFIOContainer { VFIOContainerBase bcontainer; int fd; /* /dev/vfio/vfio, empowered by the attached groups */ - MemoryListener listener; MemoryListener prereg_listener; unsigned iommu_type; - Error *error; - bool initialized; bool dirty_log_manual_clear; uint64_t dirty_pgsizes; uint64_t max_dirty_bitmap_size; diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index 8e05b5ac5a..95f8d319e0 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -36,6 +36,9 @@ typedef struct VFIOAddressSpace { typedef struct VFIOContainerBase { const VFIOIOMMUOps *ops; VFIOAddressSpace *space; + MemoryListener listener; + Error *error; + bool initialized; unsigned long pgsizes; unsigned int dma_max_mappings; bool dirty_pages_supported; -- Gitee From a59131a461adf9b626735886a53825e2a03f3272 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Sat, 11 Jan 2025 10:52:30 +0800 Subject: [PATCH 013/134] vfio/container: Move dirty_pgsizes and max_dirty_bitmap_size to base container MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No functional change intended. Signed-off-by: Eric Auger Signed-off-by: Yi Liu Signed-off-by: Yi Sun Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/container.c | 9 +++++---- include/hw/vfio/vfio-common.h | 2 -- include/hw/vfio/vfio-container-base.h | 2 ++ 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 50da1300dd..191597167a 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -66,6 +66,7 @@ static int vfio_dma_unmap_bitmap(VFIOContainer *container, hwaddr iova, ram_addr_t size, IOMMUTLBEntry *iotlb) { + VFIOContainerBase *bcontainer = &container->bcontainer; struct vfio_iommu_type1_dma_unmap *unmap; struct vfio_bitmap *bitmap; VFIOBitmap vbmap; @@ -93,7 +94,7 @@ static int vfio_dma_unmap_bitmap(VFIOContainer *container, bitmap->size = vbmap.size; bitmap->data = (__u64 *)vbmap.bitmap; - if (vbmap.size > container->max_dirty_bitmap_size) { + if (vbmap.size > bcontainer->max_dirty_bitmap_size) { error_report("UNMAP: Size of bitmap too big 0x%"PRIx64, vbmap.size); ret = -E2BIG; goto unmap_exit; @@ -157,7 +158,7 @@ static int vfio_legacy_dma_unmap(VFIOContainerBase *bcontainer, hwaddr iova, if (iotlb && vfio_devices_all_running_and_mig_active(bcontainer)) { if (!vfio_devices_all_device_dirty_tracking(bcontainer) && - container->bcontainer.dirty_pages_supported) { + bcontainer->dirty_pages_supported) { return vfio_dma_unmap_bitmap(container, iova, size, iotlb); } @@ -536,8 +537,8 @@ static void vfio_get_iommu_info_migration(VFIOContainer *container, */ if (cap_mig->pgsize_bitmap & qemu_real_host_page_size()) { bcontainer->dirty_pages_supported = true; - container->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size; - container->dirty_pgsizes = cap_mig->pgsize_bitmap; + bcontainer->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size; + bcontainer->dirty_pgsizes = cap_mig->pgsize_bitmap; } } diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index c23e7fb8ee..a8da41d27e 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -89,8 +89,6 @@ typedef struct VFIOContainer { MemoryListener prereg_listener; unsigned iommu_type; bool dirty_log_manual_clear; - uint64_t dirty_pgsizes; - uint64_t max_dirty_bitmap_size; QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; QLIST_HEAD(, VFIOGroup) group_list; QLIST_HEAD(, VFIODMARange) dma_list; diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index 95f8d319e0..80e4a993c5 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -39,6 +39,8 @@ typedef struct VFIOContainerBase { MemoryListener listener; Error *error; bool initialized; + uint64_t dirty_pgsizes; + uint64_t max_dirty_bitmap_size; unsigned long pgsizes; unsigned int dma_max_mappings; bool dirty_pages_supported; -- Gitee From 4aac9c99e4f90d400d511bb46809714eab1fbf5f Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:31 +0800 Subject: [PATCH 014/134] vfio/container: Move iova_ranges to base container MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Meanwhile remove the helper function vfio_free_container as it only calls g_free now. No functional change intended. Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/common.c | 5 +++-- hw/vfio/container-base.c | 3 +++ hw/vfio/container.c | 19 ++++++------------- include/hw/vfio/vfio-common.h | 1 - include/hw/vfio/vfio-container-base.h | 1 + 5 files changed, 13 insertions(+), 16 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 4647f4447d..9926454527 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -637,9 +637,10 @@ static void vfio_listener_region_add(MemoryListener *listener, goto fail; } - if (container->iova_ranges) { + if (bcontainer->iova_ranges) { ret = memory_region_iommu_set_iova_ranges(giommu->iommu_mr, - container->iova_ranges, &err); + bcontainer->iova_ranges, + &err); if (ret) { g_free(giommu); goto fail; diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c index 7f508669f5..0177f43741 100644 --- a/hw/vfio/container-base.c +++ b/hw/vfio/container-base.c @@ -54,6 +54,7 @@ void vfio_container_init(VFIOContainerBase *bcontainer, VFIOAddressSpace *space, bcontainer->error = NULL; bcontainer->dirty_pages_supported = false; bcontainer->dma_max_mappings = 0; + bcontainer->iova_ranges = NULL; QLIST_INIT(&bcontainer->giommu_list); QLIST_INIT(&bcontainer->vrdl_list); } @@ -70,4 +71,6 @@ void vfio_container_destroy(VFIOContainerBase *bcontainer) QLIST_REMOVE(giommu, giommu_next); g_free(giommu); } + + g_list_free_full(bcontainer->iova_ranges, g_free); } diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 191597167a..13d42aad0d 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -360,7 +360,7 @@ bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info, } static bool vfio_get_info_iova_range(struct vfio_iommu_type1_info *info, - VFIOContainer *container) + VFIOContainerBase *bcontainer) { struct vfio_info_cap_header *hdr; struct vfio_iommu_type1_info_cap_iova_range *cap; @@ -378,8 +378,8 @@ static bool vfio_get_info_iova_range(struct vfio_iommu_type1_info *info, range_set_bounds(range, cap->iova_ranges[i].start, cap->iova_ranges[i].end); - container->iova_ranges = - range_list_insert(container->iova_ranges, range); + bcontainer->iova_ranges = + range_list_insert(bcontainer->iova_ranges, range); } return true; @@ -542,12 +542,6 @@ static void vfio_get_iommu_info_migration(VFIOContainer *container, } } -static void vfio_free_container(VFIOContainer *container) -{ - g_list_free_full(container->iova_ranges, g_free); - g_free(container); -} - static SharedRegionListener *g_shl; static void shared_memory_listener_register(MemoryListener *listener, @@ -653,7 +647,6 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, container = g_malloc0(sizeof(*container)); container->fd = fd; - container->iova_ranges = NULL; QLIST_INIT(&container->dma_list); bcontainer = &container->bcontainer; vfio_container_init(bcontainer, space, &vfio_legacy_ops); @@ -692,7 +685,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, bcontainer->dma_max_mappings = 65535; } - vfio_get_info_iova_range(info, container); + vfio_get_info_iova_range(info, bcontainer); vfio_get_iommu_info_migration(container, info); g_free(info); @@ -753,7 +746,7 @@ enable_discards_exit: vfio_ram_block_discard_disable(container, false); free_container_exit: - vfio_free_container(container); + g_free(container); close_fd_exit: close(fd); @@ -801,7 +794,7 @@ static void vfio_disconnect_container(VFIOGroup *group) trace_vfio_disconnect_container(container->fd); close(container->fd); - vfio_free_container(container); + g_free(container); vfio_put_address_space(space); } diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index a8da41d27e..9a2e0ace72 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -92,7 +92,6 @@ typedef struct VFIOContainer { QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; QLIST_HEAD(, VFIOGroup) group_list; QLIST_HEAD(, VFIODMARange) dma_list; - GList *iova_ranges; } VFIOContainer; typedef struct VFIOHostDMAWindow { diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index 80e4a993c5..9658ffb526 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -48,6 +48,7 @@ typedef struct VFIOContainerBase { QLIST_HEAD(, VFIORamDiscardListener) vrdl_list; QLIST_ENTRY(VFIOContainerBase) next; QLIST_HEAD(, VFIODevice) device_list; + GList *iova_ranges; } VFIOContainerBase; typedef struct VFIOGuestIOMMU { -- Gitee From 1ba796aff9476e5850df910304eb3720a09feef2 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Sat, 11 Jan 2025 10:52:32 +0800 Subject: [PATCH 015/134] vfio/container: Implement attach/detach_device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No functional change intended. Signed-off-by: Eric Auger Signed-off-by: Yi Liu Signed-off-by: Yi Sun Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/common.c | 16 ++++++++++++++++ hw/vfio/container.c | 12 +++++------- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 9926454527..488aa43c9b 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -1644,3 +1644,19 @@ retry: return info; } + +int vfio_attach_device(char *name, VFIODevice *vbasedev, + AddressSpace *as, Error **errp) +{ + const VFIOIOMMUOps *ops = &vfio_legacy_ops; + + return ops->attach_device(name, vbasedev, as, errp); +} + +void vfio_detach_device(VFIODevice *vbasedev) +{ + if (!vbasedev->bcontainer) { + return; + } + vbasedev->bcontainer->ops->detach_device(vbasedev); +} diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 13d42aad0d..62af0f2bdd 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -986,8 +986,8 @@ static int vfio_device_groupid(VFIODevice *vbasedev, Error **errp) * @name and @vbasedev->name are likely to be different depending * on the type of the device, hence the need for passing @name */ -int vfio_attach_device(char *name, VFIODevice *vbasedev, - AddressSpace *as, Error **errp) +static int vfio_legacy_attach_device(const char *name, VFIODevice *vbasedev, + AddressSpace *as, Error **errp) { int groupid = vfio_device_groupid(vbasedev, errp); VFIODevice *vbasedev_iter; @@ -1027,14 +1027,10 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev, return ret; } -void vfio_detach_device(VFIODevice *vbasedev) +static void vfio_legacy_detach_device(VFIODevice *vbasedev) { VFIOGroup *group = vbasedev->group; - if (!vbasedev->bcontainer) { - return; - } - QLIST_REMOVE(vbasedev, global_next); QLIST_REMOVE(vbasedev, container_next); vbasedev->bcontainer = NULL; @@ -1046,6 +1042,8 @@ void vfio_detach_device(VFIODevice *vbasedev) const VFIOIOMMUOps vfio_legacy_ops = { .dma_map = vfio_legacy_dma_map, .dma_unmap = vfio_legacy_dma_unmap, + .attach_device = vfio_legacy_attach_device, + .detach_device = vfio_legacy_detach_device, .set_dirty_page_tracking = vfio_legacy_set_dirty_page_tracking, .query_dirty_bitmap = vfio_legacy_query_dirty_bitmap, }; -- Gitee From 4b0bff002d93d8785ccec8020667dc559bda4e9c Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:33 +0800 Subject: [PATCH 016/134] vfio/spapr: Introduce spapr backend and target interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce an empty spapr backend which will hold spapr specific content, currently only prereg_listener and hostwin_list. Also introduce two spapr specific callbacks add/del_window into VFIOIOMMUOps. Instantiate a spapr ops with a helper setup_spapr_ops and assign it to bcontainer->ops. Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/spapr.c | 14 ++++++++++++++ include/hw/vfio/vfio-container-base.h | 6 ++++++ 2 files changed, 20 insertions(+) diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c index 7a50975f25..e1a6b35563 100644 --- a/hw/vfio/spapr.c +++ b/hw/vfio/spapr.c @@ -24,6 +24,10 @@ #include "qapi/error.h" #include "trace.h" +typedef struct VFIOSpaprContainer { + VFIOContainer container; +} VFIOSpaprContainer; + static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section) { if (memory_region_is_iommu(section->mr)) { @@ -421,6 +425,14 @@ void vfio_container_del_section_window(VFIOContainer *container, } } +static VFIOIOMMUOps vfio_iommu_spapr_ops; + +static void setup_spapr_ops(VFIOContainerBase *bcontainer) +{ + vfio_iommu_spapr_ops = *bcontainer->ops; + bcontainer->ops = &vfio_iommu_spapr_ops; +} + int vfio_spapr_container_init(VFIOContainer *container, Error **errp) { VFIOContainerBase *bcontainer = &container->bcontainer; @@ -486,6 +498,8 @@ int vfio_spapr_container_init(VFIOContainer *container, Error **errp) 0x1000); } + setup_spapr_ops(bcontainer); + return 0; listener_unregister_exit: diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index 9658ffb526..f62a14ac73 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -101,5 +101,11 @@ struct VFIOIOMMUOps { int (*set_dirty_page_tracking)(VFIOContainerBase *bcontainer, bool start); int (*query_dirty_bitmap)(VFIOContainerBase *bcontainer, VFIOBitmap *vbmap, hwaddr iova, hwaddr size); + /* SPAPR specific */ + int (*add_window)(VFIOContainerBase *bcontainer, + MemoryRegionSection *section, + Error **errp); + void (*del_window)(VFIOContainerBase *bcontainer, + MemoryRegionSection *section); }; #endif /* HW_VFIO_VFIO_CONTAINER_BASE_H */ -- Gitee From 42d02193bbe543173aa16e463015c76fa2d38ec0 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:34 +0800 Subject: [PATCH 017/134] vfio/spapr: switch to spapr IOMMU BE add/del_section_window MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No functional change intended. Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/common.c | 8 ++------ hw/vfio/container-base.c | 21 +++++++++++++++++++++ hw/vfio/spapr.c | 19 ++++++++++++++----- include/hw/vfio/vfio-common.h | 5 ----- include/hw/vfio/vfio-container-base.h | 5 +++++ 5 files changed, 42 insertions(+), 16 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 488aa43c9b..679fee4321 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -571,8 +571,6 @@ static void vfio_listener_region_add(MemoryListener *listener, { VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, listener); - VFIOContainer *container = container_of(bcontainer, VFIOContainer, - bcontainer); hwaddr iova, end; Int128 llend, llsize; void *vaddr; @@ -595,7 +593,7 @@ static void vfio_listener_region_add(MemoryListener *listener, return; } - if (vfio_container_add_section_window(container, section, &err)) { + if (vfio_container_add_section_window(bcontainer, section, &err)) { goto fail; } @@ -738,8 +736,6 @@ static void vfio_listener_region_del(MemoryListener *listener, { VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, listener); - VFIOContainer *container = container_of(bcontainer, VFIOContainer, - bcontainer); hwaddr iova, end; Int128 llend, llsize; int ret; @@ -818,7 +814,7 @@ static void vfio_listener_region_del(MemoryListener *listener, memory_region_unref(section->mr); - vfio_container_del_section_window(container, section); + vfio_container_del_section_window(bcontainer, section); } typedef struct VFIODirtyRanges { diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c index 0177f43741..71f7274973 100644 --- a/hw/vfio/container-base.c +++ b/hw/vfio/container-base.c @@ -31,6 +31,27 @@ int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, return bcontainer->ops->dma_unmap(bcontainer, iova, size, iotlb); } +int vfio_container_add_section_window(VFIOContainerBase *bcontainer, + MemoryRegionSection *section, + Error **errp) +{ + if (!bcontainer->ops->add_window) { + return 0; + } + + return bcontainer->ops->add_window(bcontainer, section, errp); +} + +void vfio_container_del_section_window(VFIOContainerBase *bcontainer, + MemoryRegionSection *section) +{ + if (!bcontainer->ops->del_window) { + return; + } + + return bcontainer->ops->del_window(bcontainer, section); +} + int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer, bool start) { diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c index e1a6b35563..5be1911aad 100644 --- a/hw/vfio/spapr.c +++ b/hw/vfio/spapr.c @@ -319,10 +319,13 @@ static int vfio_spapr_create_window(VFIOContainer *container, return 0; } -int vfio_container_add_section_window(VFIOContainer *container, - MemoryRegionSection *section, - Error **errp) +static int +vfio_spapr_container_add_section_window(VFIOContainerBase *bcontainer, + MemoryRegionSection *section, + Error **errp) { + VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); VFIOHostDMAWindow *hostwin; hwaddr pgsize = 0; int ret; @@ -407,9 +410,13 @@ int vfio_container_add_section_window(VFIOContainer *container, return 0; } -void vfio_container_del_section_window(VFIOContainer *container, - MemoryRegionSection *section) +static void +vfio_spapr_container_del_section_window(VFIOContainerBase *bcontainer, + MemoryRegionSection *section) { + VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); + if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) { return; } @@ -430,6 +437,8 @@ static VFIOIOMMUOps vfio_iommu_spapr_ops; static void setup_spapr_ops(VFIOContainerBase *bcontainer) { vfio_iommu_spapr_ops = *bcontainer->ops; + vfio_iommu_spapr_ops.add_window = vfio_spapr_container_add_section_window; + vfio_iommu_spapr_ops.del_window = vfio_spapr_container_del_section_window; bcontainer->ops = &vfio_iommu_spapr_ops; } diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 9a2e0ace72..c6b1260911 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -183,11 +183,6 @@ VFIODMARange *vfio_lookup_match_range(VFIOContainer *container, void vfio_dma_range_init_dirty_bitmap(VFIODMARange *qrange); /* SPAPR specific */ -int vfio_container_add_section_window(VFIOContainer *container, - MemoryRegionSection *section, - Error **errp); -void vfio_container_del_section_window(VFIOContainer *container, - MemoryRegionSection *section); int vfio_spapr_container_init(VFIOContainer *container, Error **errp); void vfio_spapr_container_deinit(VFIOContainer *container); diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index f62a14ac73..4b6f017c6f 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -75,6 +75,11 @@ int vfio_container_dma_map(VFIOContainerBase *bcontainer, int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, IOMMUTLBEntry *iotlb); +int vfio_container_add_section_window(VFIOContainerBase *bcontainer, + MemoryRegionSection *section, + Error **errp); +void vfio_container_del_section_window(VFIOContainerBase *bcontainer, + MemoryRegionSection *section); int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer, bool start); int vfio_container_query_dirty_bitmap(VFIOContainerBase *bcontainer, -- Gitee From 8f27e17107a923a0739c17efe5dcd11f818364af Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:35 +0800 Subject: [PATCH 018/134] vfio/spapr: Move prereg_listener into spapr container MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No functional changes intended. Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/spapr.c | 24 ++++++++++++++++-------- include/hw/vfio/vfio-common.h | 1 - 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c index 5be1911aad..68c3dd6c75 100644 --- a/hw/vfio/spapr.c +++ b/hw/vfio/spapr.c @@ -26,6 +26,7 @@ typedef struct VFIOSpaprContainer { VFIOContainer container; + MemoryListener prereg_listener; } VFIOSpaprContainer; static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section) @@ -48,8 +49,9 @@ static void *vfio_prereg_gpa_to_vaddr(MemoryRegionSection *section, hwaddr gpa) static void vfio_prereg_listener_region_add(MemoryListener *listener, MemoryRegionSection *section) { - VFIOContainer *container = container_of(listener, VFIOContainer, - prereg_listener); + VFIOSpaprContainer *scontainer = container_of(listener, VFIOSpaprContainer, + prereg_listener); + VFIOContainer *container = &scontainer->container; VFIOContainerBase *bcontainer = &container->bcontainer; const hwaddr gpa = section->offset_within_address_space; hwaddr end; @@ -107,8 +109,9 @@ static void vfio_prereg_listener_region_add(MemoryListener *listener, static void vfio_prereg_listener_region_del(MemoryListener *listener, MemoryRegionSection *section) { - VFIOContainer *container = container_of(listener, VFIOContainer, - prereg_listener); + VFIOSpaprContainer *scontainer = container_of(listener, VFIOSpaprContainer, + prereg_listener); + VFIOContainer *container = &scontainer->container; const hwaddr gpa = section->offset_within_address_space; hwaddr end; int ret; @@ -445,6 +448,8 @@ static void setup_spapr_ops(VFIOContainerBase *bcontainer) int vfio_spapr_container_init(VFIOContainer *container, Error **errp) { VFIOContainerBase *bcontainer = &container->bcontainer; + VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer, + container); struct vfio_iommu_spapr_tce_info info; bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU; int ret, fd = container->fd; @@ -463,9 +468,9 @@ int vfio_spapr_container_init(VFIOContainer *container, Error **errp) return -errno; } } else { - container->prereg_listener = vfio_prereg_listener; + scontainer->prereg_listener = vfio_prereg_listener; - memory_listener_register(&container->prereg_listener, + memory_listener_register(&scontainer->prereg_listener, &address_space_memory); if (bcontainer->error) { ret = -1; @@ -513,7 +518,7 @@ int vfio_spapr_container_init(VFIOContainer *container, Error **errp) listener_unregister_exit: if (v2) { - memory_listener_unregister(&container->prereg_listener); + memory_listener_unregister(&scontainer->prereg_listener); } return ret; } @@ -523,7 +528,10 @@ void vfio_spapr_container_deinit(VFIOContainer *container) VFIOHostDMAWindow *hostwin, *next; if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { - memory_listener_unregister(&container->prereg_listener); + VFIOSpaprContainer *scontainer = container_of(container, + VFIOSpaprContainer, + container); + memory_listener_unregister(&scontainer->prereg_listener); } QLIST_FOREACH_SAFE(hostwin, &container->hostwin_list, hostwin_next, next) { diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index c6b1260911..ba8abed75a 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -86,7 +86,6 @@ typedef struct VFIODMARange { typedef struct VFIOContainer { VFIOContainerBase bcontainer; int fd; /* /dev/vfio/vfio, empowered by the attached groups */ - MemoryListener prereg_listener; unsigned iommu_type; bool dirty_log_manual_clear; QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; -- Gitee From 13c57d5e888fe9d6bdf68469c8e76991a789c1e6 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:36 +0800 Subject: [PATCH 019/134] vfio/spapr: Move hostwin_list into spapr container MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No functional changes intended. Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/spapr.c | 36 +++++++++++++++++++---------------- include/hw/vfio/vfio-common.h | 1 - 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c index 68c3dd6c75..5c6426e697 100644 --- a/hw/vfio/spapr.c +++ b/hw/vfio/spapr.c @@ -27,6 +27,7 @@ typedef struct VFIOSpaprContainer { VFIOContainer container; MemoryListener prereg_listener; + QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; } VFIOSpaprContainer; static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section) @@ -154,12 +155,12 @@ static const MemoryListener vfio_prereg_listener = { .region_del = vfio_prereg_listener_region_del, }; -static void vfio_host_win_add(VFIOContainer *container, hwaddr min_iova, +static void vfio_host_win_add(VFIOSpaprContainer *scontainer, hwaddr min_iova, hwaddr max_iova, uint64_t iova_pgsizes) { VFIOHostDMAWindow *hostwin; - QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { + QLIST_FOREACH(hostwin, &scontainer->hostwin_list, hostwin_next) { if (ranges_overlap(hostwin->min_iova, hostwin->max_iova - hostwin->min_iova + 1, min_iova, @@ -173,15 +174,15 @@ static void vfio_host_win_add(VFIOContainer *container, hwaddr min_iova, hostwin->min_iova = min_iova; hostwin->max_iova = max_iova; hostwin->iova_pgsizes = iova_pgsizes; - QLIST_INSERT_HEAD(&container->hostwin_list, hostwin, hostwin_next); + QLIST_INSERT_HEAD(&scontainer->hostwin_list, hostwin, hostwin_next); } -static int vfio_host_win_del(VFIOContainer *container, +static int vfio_host_win_del(VFIOSpaprContainer *scontainer, hwaddr min_iova, hwaddr max_iova) { VFIOHostDMAWindow *hostwin; - QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { + QLIST_FOREACH(hostwin, &scontainer->hostwin_list, hostwin_next) { if (hostwin->min_iova == min_iova && hostwin->max_iova == max_iova) { QLIST_REMOVE(hostwin, hostwin_next); g_free(hostwin); @@ -192,7 +193,7 @@ static int vfio_host_win_del(VFIOContainer *container, return -1; } -static VFIOHostDMAWindow *vfio_find_hostwin(VFIOContainer *container, +static VFIOHostDMAWindow *vfio_find_hostwin(VFIOSpaprContainer *container, hwaddr iova, hwaddr end) { VFIOHostDMAWindow *hostwin; @@ -329,6 +330,8 @@ vfio_spapr_container_add_section_window(VFIOContainerBase *bcontainer, { VFIOContainer *container = container_of(bcontainer, VFIOContainer, bcontainer); + VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer, + container); VFIOHostDMAWindow *hostwin; hwaddr pgsize = 0; int ret; @@ -344,7 +347,7 @@ vfio_spapr_container_add_section_window(VFIOContainerBase *bcontainer, iova = section->offset_within_address_space; end = iova + int128_get64(section->size) - 1; - if (!vfio_find_hostwin(container, iova, end)) { + if (!vfio_find_hostwin(scontainer, iova, end)) { error_setg(errp, "Container %p can't map guest IOVA region" " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container, iova, end); @@ -358,7 +361,7 @@ vfio_spapr_container_add_section_window(VFIOContainerBase *bcontainer, } /* For now intersections are not allowed, we may relax this later */ - QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { + QLIST_FOREACH(hostwin, &scontainer->hostwin_list, hostwin_next) { if (ranges_overlap(hostwin->min_iova, hostwin->max_iova - hostwin->min_iova + 1, section->offset_within_address_space, @@ -380,7 +383,7 @@ vfio_spapr_container_add_section_window(VFIOContainerBase *bcontainer, return ret; } - vfio_host_win_add(container, section->offset_within_address_space, + vfio_host_win_add(scontainer, section->offset_within_address_space, section->offset_within_address_space + int128_get64(section->size) - 1, pgsize); #ifdef CONFIG_KVM @@ -419,6 +422,8 @@ vfio_spapr_container_del_section_window(VFIOContainerBase *bcontainer, { VFIOContainer *container = container_of(bcontainer, VFIOContainer, bcontainer); + VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer, + container); if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) { return; @@ -426,7 +431,7 @@ vfio_spapr_container_del_section_window(VFIOContainerBase *bcontainer, vfio_spapr_remove_window(container, section->offset_within_address_space); - if (vfio_host_win_del(container, + if (vfio_host_win_del(scontainer, section->offset_within_address_space, section->offset_within_address_space + int128_get64(section->size) - 1) < 0) { @@ -454,7 +459,7 @@ int vfio_spapr_container_init(VFIOContainer *container, Error **errp) bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU; int ret, fd = container->fd; - QLIST_INIT(&container->hostwin_list); + QLIST_INIT(&scontainer->hostwin_list); /* * The host kernel code implementing VFIO_IOMMU_DISABLE is called @@ -506,7 +511,7 @@ int vfio_spapr_container_init(VFIOContainer *container, Error **errp) } else { /* The default table uses 4K pages */ bcontainer->pgsizes = 0x1000; - vfio_host_win_add(container, info.dma32_window_start, + vfio_host_win_add(scontainer, info.dma32_window_start, info.dma32_window_start + info.dma32_window_size - 1, 0x1000); @@ -525,15 +530,14 @@ listener_unregister_exit: void vfio_spapr_container_deinit(VFIOContainer *container) { + VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer, + container); VFIOHostDMAWindow *hostwin, *next; if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { - VFIOSpaprContainer *scontainer = container_of(container, - VFIOSpaprContainer, - container); memory_listener_unregister(&scontainer->prereg_listener); } - QLIST_FOREACH_SAFE(hostwin, &container->hostwin_list, hostwin_next, + QLIST_FOREACH_SAFE(hostwin, &scontainer->hostwin_list, hostwin_next, next) { QLIST_REMOVE(hostwin, hostwin_next); g_free(hostwin); diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index ba8abed75a..9e22acbfb6 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -88,7 +88,6 @@ typedef struct VFIOContainer { int fd; /* /dev/vfio/vfio, empowered by the attached groups */ unsigned iommu_type; bool dirty_log_manual_clear; - QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; QLIST_HEAD(, VFIOGroup) group_list; QLIST_HEAD(, VFIODMARange) dma_list; } VFIOContainer; -- Gitee From 6cb41a55992571dd215fee86ed910bb4d6688bf8 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Sat, 11 Jan 2025 10:52:37 +0800 Subject: [PATCH 020/134] backends/iommufd: Introduce the iommufd object MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce an iommufd object which allows the interaction with the host /dev/iommu device. The /dev/iommu can have been already pre-opened outside of qemu, in which case the fd can be passed directly along with the iommufd object: This allows the iommufd object to be shared accross several subsystems (VFIO, VDPA, ...). For example, libvirt would open the /dev/iommu once. If no fd is passed along with the iommufd object, the /dev/iommu is opened by the qemu code. Suggested-by: Alex Williamson Signed-off-by: Eric Auger Signed-off-by: Yi Liu Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Tested-by: Eric Auger Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- MAINTAINERS | 8 ++ backends/Kconfig | 4 + backends/iommufd.c | 245 +++++++++++++++++++++++++++++++++++++++ backends/meson.build | 1 + backends/trace-events | 10 ++ include/sysemu/iommufd.h | 38 ++++++ qapi/qom.json | 19 +++ qemu-options.hx | 12 ++ 8 files changed, 337 insertions(+) create mode 100644 backends/iommufd.c create mode 100644 include/sysemu/iommufd.h diff --git a/MAINTAINERS b/MAINTAINERS index 695e0bd34f..a5a446914a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2167,6 +2167,14 @@ F: hw/vfio/ap.c F: docs/system/s390x/vfio-ap.rst L: qemu-s390x@nongnu.org +iommufd +M: Yi Liu +M: Eric Auger +M: Zhenzhong Duan +S: Supported +F: backends/iommufd.c +F: include/sysemu/iommufd.h + vhost M: Michael S. Tsirkin S: Supported diff --git a/backends/Kconfig b/backends/Kconfig index f35abc1609..2cb23f62fa 100644 --- a/backends/Kconfig +++ b/backends/Kconfig @@ -1 +1,5 @@ source tpm/Kconfig + +config IOMMUFD + bool + depends on VFIO diff --git a/backends/iommufd.c b/backends/iommufd.c new file mode 100644 index 0000000000..ba58a0eb0d --- /dev/null +++ b/backends/iommufd.c @@ -0,0 +1,245 @@ +/* + * iommufd container backend + * + * Copyright (C) 2023 Intel Corporation. + * Copyright Red Hat, Inc. 2023 + * + * Authors: Yi Liu + * Eric Auger + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "sysemu/iommufd.h" +#include "qapi/error.h" +#include "qapi/qmp/qerror.h" +#include "qemu/module.h" +#include "qom/object_interfaces.h" +#include "qemu/error-report.h" +#include "monitor/monitor.h" +#include "trace.h" +#include +#include + +static void iommufd_backend_init(Object *obj) +{ + IOMMUFDBackend *be = IOMMUFD_BACKEND(obj); + + be->fd = -1; + be->users = 0; + be->owned = true; + qemu_mutex_init(&be->lock); +} + +static void iommufd_backend_finalize(Object *obj) +{ + IOMMUFDBackend *be = IOMMUFD_BACKEND(obj); + + if (be->owned) { + close(be->fd); + be->fd = -1; + } +} + +static void iommufd_backend_set_fd(Object *obj, const char *str, Error **errp) +{ + IOMMUFDBackend *be = IOMMUFD_BACKEND(obj); + int fd = -1; + + fd = monitor_fd_param(monitor_cur(), str, errp); + if (fd == -1) { + error_prepend(errp, "Could not parse remote object fd %s:", str); + return; + } + qemu_mutex_lock(&be->lock); + be->fd = fd; + be->owned = false; + qemu_mutex_unlock(&be->lock); + trace_iommu_backend_set_fd(be->fd); +} + +static bool iommufd_backend_can_be_deleted(UserCreatable *uc) +{ + IOMMUFDBackend *be = IOMMUFD_BACKEND(uc); + + return !be->users; +} + +static void iommufd_backend_class_init(ObjectClass *oc, void *data) +{ + UserCreatableClass *ucc = USER_CREATABLE_CLASS(oc); + + ucc->can_be_deleted = iommufd_backend_can_be_deleted; + + object_class_property_add_str(oc, "fd", NULL, iommufd_backend_set_fd); +} + +int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp) +{ + int fd, ret = 0; + + qemu_mutex_lock(&be->lock); + if (be->users == UINT32_MAX) { + error_setg(errp, "too many connections"); + ret = -E2BIG; + goto out; + } + if (be->owned && !be->users) { + fd = qemu_open_old("/dev/iommu", O_RDWR); + if (fd < 0) { + error_setg_errno(errp, errno, "/dev/iommu opening failed"); + ret = fd; + goto out; + } + be->fd = fd; + } + be->users++; +out: + trace_iommufd_backend_connect(be->fd, be->owned, + be->users, ret); + qemu_mutex_unlock(&be->lock); + return ret; +} + +void iommufd_backend_disconnect(IOMMUFDBackend *be) +{ + qemu_mutex_lock(&be->lock); + if (!be->users) { + goto out; + } + be->users--; + if (!be->users && be->owned) { + close(be->fd); + be->fd = -1; + } +out: + trace_iommufd_backend_disconnect(be->fd, be->users); + qemu_mutex_unlock(&be->lock); +} + +int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id, + Error **errp) +{ + int ret, fd = be->fd; + struct iommu_ioas_alloc alloc_data = { + .size = sizeof(alloc_data), + .flags = 0, + }; + + ret = ioctl(fd, IOMMU_IOAS_ALLOC, &alloc_data); + if (ret) { + error_setg_errno(errp, errno, "Failed to allocate ioas"); + return ret; + } + + *ioas_id = alloc_data.out_ioas_id; + trace_iommufd_backend_alloc_ioas(fd, *ioas_id, ret); + + return ret; +} + +void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id) +{ + int ret, fd = be->fd; + struct iommu_destroy des = { + .size = sizeof(des), + .id = id, + }; + + ret = ioctl(fd, IOMMU_DESTROY, &des); + trace_iommufd_backend_free_id(fd, id, ret); + if (ret) { + error_report("Failed to free id: %u %m", id); + } +} + +int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova, + ram_addr_t size, void *vaddr, bool readonly) +{ + int ret, fd = be->fd; + struct iommu_ioas_map map = { + .size = sizeof(map), + .flags = IOMMU_IOAS_MAP_READABLE | + IOMMU_IOAS_MAP_FIXED_IOVA, + .ioas_id = ioas_id, + .__reserved = 0, + .user_va = (uintptr_t)vaddr, + .iova = iova, + .length = size, + }; + + if (!readonly) { + map.flags |= IOMMU_IOAS_MAP_WRITEABLE; + } + + ret = ioctl(fd, IOMMU_IOAS_MAP, &map); + trace_iommufd_backend_map_dma(fd, ioas_id, iova, size, + vaddr, readonly, ret); + if (ret) { + ret = -errno; + + /* TODO: Not support mapping hardware PCI BAR region for now. */ + if (errno == EFAULT) { + warn_report("IOMMU_IOAS_MAP failed: %m, PCI BAR?"); + } else { + error_report("IOMMU_IOAS_MAP failed: %m"); + } + } + return ret; +} + +int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, + hwaddr iova, ram_addr_t size) +{ + int ret, fd = be->fd; + struct iommu_ioas_unmap unmap = { + .size = sizeof(unmap), + .ioas_id = ioas_id, + .iova = iova, + .length = size, + }; + + ret = ioctl(fd, IOMMU_IOAS_UNMAP, &unmap); + /* + * IOMMUFD takes mapping as some kind of object, unmapping + * nonexistent mapping is treated as deleting a nonexistent + * object and return ENOENT. This is different from legacy + * backend which allows it. vIOMMU may trigger a lot of + * redundant unmapping, to avoid flush the log, treat them + * as succeess for IOMMUFD just like legacy backend. + */ + if (ret && errno == ENOENT) { + trace_iommufd_backend_unmap_dma_non_exist(fd, ioas_id, iova, size, ret); + ret = 0; + } else { + trace_iommufd_backend_unmap_dma(fd, ioas_id, iova, size, ret); + } + + if (ret) { + ret = -errno; + error_report("IOMMU_IOAS_UNMAP failed: %m"); + } + return ret; +} + +static const TypeInfo iommufd_backend_info = { + .name = TYPE_IOMMUFD_BACKEND, + .parent = TYPE_OBJECT, + .instance_size = sizeof(IOMMUFDBackend), + .instance_init = iommufd_backend_init, + .instance_finalize = iommufd_backend_finalize, + .class_size = sizeof(IOMMUFDBackendClass), + .class_init = iommufd_backend_class_init, + .interfaces = (InterfaceInfo[]) { + { TYPE_USER_CREATABLE }, + { } + } +}; + +static void register_types(void) +{ + type_register_static(&iommufd_backend_info); +} + +type_init(register_types); diff --git a/backends/meson.build b/backends/meson.build index 914c7c4afb..9a5cea480d 100644 --- a/backends/meson.build +++ b/backends/meson.build @@ -20,6 +20,7 @@ if have_vhost_user system_ss.add(when: 'CONFIG_VIRTIO', if_true: files('vhost-user.c')) endif system_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('cryptodev-vhost.c')) +system_ss.add(when: 'CONFIG_IOMMUFD', if_true: files('iommufd.c')) if have_vhost_user_crypto system_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('cryptodev-vhost-user.c')) endif diff --git a/backends/trace-events b/backends/trace-events index 652eb76a57..d45c6e31a6 100644 --- a/backends/trace-events +++ b/backends/trace-events @@ -5,3 +5,13 @@ dbus_vmstate_pre_save(void) dbus_vmstate_post_load(int version_id) "version_id: %d" dbus_vmstate_loading(const char *id) "id: %s" dbus_vmstate_saving(const char *id) "id: %s" + +# iommufd.c +iommufd_backend_connect(int fd, bool owned, uint32_t users, int ret) "fd=%d owned=%d users=%d (%d)" +iommufd_backend_disconnect(int fd, uint32_t users) "fd=%d users=%d" +iommu_backend_set_fd(int fd) "pre-opened /dev/iommu fd=%d" +iommufd_backend_map_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, void *vaddr, bool readonly, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" addr=%p readonly=%d (%d)" +iommufd_backend_unmap_dma_non_exist(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " Unmap nonexistent mapping: iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)" +iommufd_backend_unmap_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)" +iommufd_backend_alloc_ioas(int iommufd, uint32_t ioas, int ret) " iommufd=%d ioas=%d (%d)" +iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d id=%d (%d)" diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h new file mode 100644 index 0000000000..9c5524b0ed --- /dev/null +++ b/include/sysemu/iommufd.h @@ -0,0 +1,38 @@ +#ifndef SYSEMU_IOMMUFD_H +#define SYSEMU_IOMMUFD_H + +#include "qom/object.h" +#include "qemu/thread.h" +#include "exec/hwaddr.h" +#include "exec/cpu-common.h" + +#define TYPE_IOMMUFD_BACKEND "iommufd" +OBJECT_DECLARE_TYPE(IOMMUFDBackend, IOMMUFDBackendClass, IOMMUFD_BACKEND) + +struct IOMMUFDBackendClass { + ObjectClass parent_class; +}; + +struct IOMMUFDBackend { + Object parent; + + /*< protected >*/ + int fd; /* /dev/iommu file descriptor */ + bool owned; /* is the /dev/iommu opened internally */ + QemuMutex lock; + uint32_t users; + + /*< public >*/ +}; + +int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp); +void iommufd_backend_disconnect(IOMMUFDBackend *be); + +int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id, + Error **errp); +void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id); +int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova, + ram_addr_t size, void *vaddr, bool readonly); +int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, + hwaddr iova, ram_addr_t size); +#endif diff --git a/qapi/qom.json b/qapi/qom.json index a74c7a91f9..a5336e6b11 100644 --- a/qapi/qom.json +++ b/qapi/qom.json @@ -794,6 +794,23 @@ { 'struct': 'VfioUserServerProperties', 'data': { 'socket': 'SocketAddress', 'device': 'str' } } +## +# @IOMMUFDProperties: +# +# Properties for iommufd objects. +# +# @fd: file descriptor name previously passed via 'getfd' command, +# which represents a pre-opened /dev/iommu. This allows the +# iommufd object to be shared accross several subsystems +# (VFIO, VDPA, ...), and the file descriptor to be shared +# with other process, e.g. DPDK. (default: QEMU opens +# /dev/iommu by itself) +# +# Since: 9.0 +## +{ 'struct': 'IOMMUFDProperties', + 'data': { '*fd': 'str' } } + ## # @RngProperties: # @@ -969,6 +986,7 @@ 'input-barrier', { 'name': 'input-linux', 'if': 'CONFIG_LINUX' }, + 'iommufd', 'iothread', 'main-loop', { 'name': 'memory-backend-epc', @@ -1039,6 +1057,7 @@ 'input-barrier': 'InputBarrierProperties', 'input-linux': { 'type': 'InputLinuxProperties', 'if': 'CONFIG_LINUX' }, + 'iommufd': 'IOMMUFDProperties', 'iothread': 'IothreadProperties', 'main-loop': 'MainLoopProperties', 'memory-backend-epc': { 'type': 'MemoryBackendEpcProperties', diff --git a/qemu-options.hx b/qemu-options.hx index 8516b73206..7fe76c4b1d 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -5224,6 +5224,18 @@ SRST The ``share`` boolean option is on by default with memfd. + ``-object iommufd,id=id[,fd=fd]`` + Creates an iommufd backend which allows control of DMA mapping + through the ``/dev/iommu`` device. + + The ``id`` parameter is a unique ID which frontends (such as + vfio-pci of vdpa) will use to connect with the iommufd backend. + + The ``fd`` parameter is an optional pre-opened file descriptor + resulting from ``/dev/iommu`` opening. Usually the iommufd is shared + across all subsystems, bringing the benefit of centralized + reference counting. + ``-object rng-builtin,id=id`` Creates a random number generator backend which obtains entropy from QEMU builtin functions. The ``id`` parameter is a unique ID -- Gitee From 90688ff9c5802965f24460ac79fe52b93d2adb1f Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Sat, 11 Jan 2025 10:52:38 +0800 Subject: [PATCH 021/134] util/char_dev: Add open_cdev() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit /dev/vfio/devices/vfioX may not exist. In that case it is still possible to open /dev/char/$major:$minor instead. Add helper function to abstract the cdev open. Suggested-by: Jason Gunthorpe Signed-off-by: Yi Liu Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Reviewed-by: Eric Auger Tested-by: Eric Auger Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- MAINTAINERS | 2 + include/qemu/chardev_open.h | 16 ++++++++ util/chardev_open.c | 81 +++++++++++++++++++++++++++++++++++++ util/meson.build | 1 + 4 files changed, 100 insertions(+) create mode 100644 include/qemu/chardev_open.h create mode 100644 util/chardev_open.c diff --git a/MAINTAINERS b/MAINTAINERS index a5a446914a..ca70bb4e64 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2174,6 +2174,8 @@ M: Zhenzhong Duan S: Supported F: backends/iommufd.c F: include/sysemu/iommufd.h +F: include/qemu/chardev_open.h +F: util/chardev_open.c vhost M: Michael S. Tsirkin diff --git a/include/qemu/chardev_open.h b/include/qemu/chardev_open.h new file mode 100644 index 0000000000..64e8fcfdcb --- /dev/null +++ b/include/qemu/chardev_open.h @@ -0,0 +1,16 @@ +/* + * QEMU Chardev Helper + * + * Copyright (C) 2023 Intel Corporation. + * + * Authors: Yi Liu + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#ifndef QEMU_CHARDEV_OPEN_H +#define QEMU_CHARDEV_OPEN_H + +int open_cdev(const char *devpath, dev_t cdev); +#endif diff --git a/util/chardev_open.c b/util/chardev_open.c new file mode 100644 index 0000000000..f776429788 --- /dev/null +++ b/util/chardev_open.c @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2019, Mellanox Technologies. All rights reserved. + * Copyright (C) 2023 Intel Corporation. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: Yi Liu + * + * Copied from + * https://github.com/linux-rdma/rdma-core/blob/master/util/open_cdev.c + * + */ + +#include "qemu/osdep.h" +#include "qemu/chardev_open.h" + +static int open_cdev_internal(const char *path, dev_t cdev) +{ + struct stat st; + int fd; + + fd = qemu_open_old(path, O_RDWR); + if (fd == -1) { + return -1; + } + if (fstat(fd, &st) || !S_ISCHR(st.st_mode) || + (cdev != 0 && st.st_rdev != cdev)) { + close(fd); + return -1; + } + return fd; +} + +static int open_cdev_robust(dev_t cdev) +{ + g_autofree char *devpath = NULL; + + /* + * This assumes that udev is being used and is creating the /dev/char/ + * symlinks. + */ + devpath = g_strdup_printf("/dev/char/%u:%u", major(cdev), minor(cdev)); + return open_cdev_internal(devpath, cdev); +} + +int open_cdev(const char *devpath, dev_t cdev) +{ + int fd; + + fd = open_cdev_internal(devpath, cdev); + if (fd == -1 && cdev != 0) { + return open_cdev_robust(cdev); + } + return fd; +} diff --git a/util/meson.build b/util/meson.build index c2322ef6e7..174c133368 100644 --- a/util/meson.build +++ b/util/meson.build @@ -108,6 +108,7 @@ if have_block util_ss.add(files('filemonitor-stub.c')) endif util_ss.add(when: 'CONFIG_LINUX', if_true: files('vfio-helpers.c')) + util_ss.add(when: 'CONFIG_LINUX', if_true: files('chardev_open.c')) endif if cpu == 'aarch64' -- Gitee From bf4c408cd5d3daadbfd11136655e5bcb40dcbba0 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:39 +0800 Subject: [PATCH 022/134] vfio/common: return early if space isn't empty MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a trivial optimization. If there is active container in space, vfio_reset_handler will never be unregistered. So revert the check of space->containers and return early. Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Reviewed-by: Eric Auger Tested-by: Eric Auger Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/common.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 679fee4321..f6c2029aec 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -1608,10 +1608,13 @@ VFIOAddressSpace *vfio_get_address_space(AddressSpace *as) void vfio_put_address_space(VFIOAddressSpace *space) { - if (QLIST_EMPTY(&space->containers)) { - QLIST_REMOVE(space, list); - g_free(space); + if (!QLIST_EMPTY(&space->containers)) { + return; } + + QLIST_REMOVE(space, list); + g_free(space); + if (QLIST_EMPTY(&vfio_address_spaces)) { qemu_unregister_reset(vfio_reset_handler, NULL); } -- Gitee From 5c034b7ec5ca255551956744a386288a74ab172e Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Sat, 11 Jan 2025 10:52:40 +0800 Subject: [PATCH 023/134] vfio/iommufd: Implement the iommufd backend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The iommufd backend is implemented based on the new /dev/iommu user API. This backend obviously depends on CONFIG_IOMMUFD. So far, the iommufd backend doesn't support dirty page sync yet. Co-authored-by: Eric Auger Signed-off-by: Yi Liu Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Tested-by: Eric Auger Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/common.c | 6 + hw/vfio/iommufd.c | 422 ++++++++++++++++++++++++++++++++++ hw/vfio/meson.build | 3 + hw/vfio/trace-events | 10 + include/hw/vfio/vfio-common.h | 11 + 5 files changed, 452 insertions(+) create mode 100644 hw/vfio/iommufd.c diff --git a/hw/vfio/common.c b/hw/vfio/common.c index f6c2029aec..0e900c6746 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -19,6 +19,7 @@ */ #include "qemu/osdep.h" +#include CONFIG_DEVICES /* CONFIG_IOMMUFD */ #include #ifdef CONFIG_KVM #include @@ -1649,6 +1650,11 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev, { const VFIOIOMMUOps *ops = &vfio_legacy_ops; +#ifdef CONFIG_IOMMUFD + if (vbasedev->iommufd) { + ops = &vfio_iommufd_ops; + } +#endif return ops->attach_device(name, vbasedev, as, errp); } diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c new file mode 100644 index 0000000000..6d31aeac7b --- /dev/null +++ b/hw/vfio/iommufd.c @@ -0,0 +1,422 @@ +/* + * iommufd container backend + * + * Copyright (C) 2023 Intel Corporation. + * Copyright Red Hat, Inc. 2023 + * + * Authors: Yi Liu + * Eric Auger + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include +#include +#include + +#include "hw/vfio/vfio-common.h" +#include "qemu/error-report.h" +#include "trace.h" +#include "qapi/error.h" +#include "sysemu/iommufd.h" +#include "hw/qdev-core.h" +#include "sysemu/reset.h" +#include "qemu/cutils.h" +#include "qemu/chardev_open.h" + +static int iommufd_cdev_map(VFIOContainerBase *bcontainer, hwaddr iova, + ram_addr_t size, void *vaddr, bool readonly) +{ + VFIOIOMMUFDContainer *container = + container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); + + return iommufd_backend_map_dma(container->be, + container->ioas_id, + iova, size, vaddr, readonly); +} + +static int iommufd_cdev_unmap(VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + IOMMUTLBEntry *iotlb) +{ + VFIOIOMMUFDContainer *container = + container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); + + /* TODO: Handle dma_unmap_bitmap with iotlb args (migration) */ + return iommufd_backend_unmap_dma(container->be, + container->ioas_id, iova, size); +} + +static int iommufd_cdev_kvm_device_add(VFIODevice *vbasedev, Error **errp) +{ + return vfio_kvm_device_add_fd(vbasedev->fd, errp); +} + +static void iommufd_cdev_kvm_device_del(VFIODevice *vbasedev) +{ + Error *err = NULL; + + if (vfio_kvm_device_del_fd(vbasedev->fd, &err)) { + error_report_err(err); + } +} + +static int iommufd_cdev_connect_and_bind(VFIODevice *vbasedev, Error **errp) +{ + IOMMUFDBackend *iommufd = vbasedev->iommufd; + struct vfio_device_bind_iommufd bind = { + .argsz = sizeof(bind), + .flags = 0, + }; + int ret; + + ret = iommufd_backend_connect(iommufd, errp); + if (ret) { + return ret; + } + + /* + * Add device to kvm-vfio to be prepared for the tracking + * in KVM. Especially for some emulated devices, it requires + * to have kvm information in the device open. + */ + ret = iommufd_cdev_kvm_device_add(vbasedev, errp); + if (ret) { + goto err_kvm_device_add; + } + + /* Bind device to iommufd */ + bind.iommufd = iommufd->fd; + ret = ioctl(vbasedev->fd, VFIO_DEVICE_BIND_IOMMUFD, &bind); + if (ret) { + error_setg_errno(errp, errno, "error bind device fd=%d to iommufd=%d", + vbasedev->fd, bind.iommufd); + goto err_bind; + } + + vbasedev->devid = bind.out_devid; + trace_iommufd_cdev_connect_and_bind(bind.iommufd, vbasedev->name, + vbasedev->fd, vbasedev->devid); + return ret; +err_bind: + iommufd_cdev_kvm_device_del(vbasedev); +err_kvm_device_add: + iommufd_backend_disconnect(iommufd); + return ret; +} + +static void iommufd_cdev_unbind_and_disconnect(VFIODevice *vbasedev) +{ + /* Unbind is automatically conducted when device fd is closed */ + iommufd_cdev_kvm_device_del(vbasedev); + iommufd_backend_disconnect(vbasedev->iommufd); +} + +static int iommufd_cdev_getfd(const char *sysfs_path, Error **errp) +{ + long int ret = -ENOTTY; + char *path, *vfio_dev_path = NULL, *vfio_path = NULL; + DIR *dir = NULL; + struct dirent *dent; + gchar *contents; + struct stat st; + gsize length; + int major, minor; + dev_t vfio_devt; + + path = g_strdup_printf("%s/vfio-dev", sysfs_path); + if (stat(path, &st) < 0) { + error_setg_errno(errp, errno, "no such host device"); + goto out_free_path; + } + + dir = opendir(path); + if (!dir) { + error_setg_errno(errp, errno, "couldn't open directory %s", path); + goto out_free_path; + } + + while ((dent = readdir(dir))) { + if (!strncmp(dent->d_name, "vfio", 4)) { + vfio_dev_path = g_strdup_printf("%s/%s/dev", path, dent->d_name); + break; + } + } + + if (!vfio_dev_path) { + error_setg(errp, "failed to find vfio-dev/vfioX/dev"); + goto out_close_dir; + } + + if (!g_file_get_contents(vfio_dev_path, &contents, &length, NULL)) { + error_setg(errp, "failed to load \"%s\"", vfio_dev_path); + goto out_free_dev_path; + } + + if (sscanf(contents, "%d:%d", &major, &minor) != 2) { + error_setg(errp, "failed to get major:minor for \"%s\"", vfio_dev_path); + goto out_free_dev_path; + } + g_free(contents); + vfio_devt = makedev(major, minor); + + vfio_path = g_strdup_printf("/dev/vfio/devices/%s", dent->d_name); + ret = open_cdev(vfio_path, vfio_devt); + if (ret < 0) { + error_setg(errp, "Failed to open %s", vfio_path); + } + + trace_iommufd_cdev_getfd(vfio_path, ret); + g_free(vfio_path); + +out_free_dev_path: + g_free(vfio_dev_path); +out_close_dir: + closedir(dir); +out_free_path: + if (*errp) { + error_prepend(errp, VFIO_MSG_PREFIX, path); + } + g_free(path); + + return ret; +} + +static int iommufd_cdev_attach_ioas_hwpt(VFIODevice *vbasedev, uint32_t id, + Error **errp) +{ + int ret, iommufd = vbasedev->iommufd->fd; + struct vfio_device_attach_iommufd_pt attach_data = { + .argsz = sizeof(attach_data), + .flags = 0, + .pt_id = id, + }; + + /* Attach device to an IOAS or hwpt within iommufd */ + ret = ioctl(vbasedev->fd, VFIO_DEVICE_ATTACH_IOMMUFD_PT, &attach_data); + if (ret) { + error_setg_errno(errp, errno, + "[iommufd=%d] error attach %s (%d) to id=%d", + iommufd, vbasedev->name, vbasedev->fd, id); + } else { + trace_iommufd_cdev_attach_ioas_hwpt(iommufd, vbasedev->name, + vbasedev->fd, id); + } + return ret; +} + +static int iommufd_cdev_detach_ioas_hwpt(VFIODevice *vbasedev, Error **errp) +{ + int ret, iommufd = vbasedev->iommufd->fd; + struct vfio_device_detach_iommufd_pt detach_data = { + .argsz = sizeof(detach_data), + .flags = 0, + }; + + ret = ioctl(vbasedev->fd, VFIO_DEVICE_DETACH_IOMMUFD_PT, &detach_data); + if (ret) { + error_setg_errno(errp, errno, "detach %s failed", vbasedev->name); + } else { + trace_iommufd_cdev_detach_ioas_hwpt(iommufd, vbasedev->name); + } + return ret; +} + +static int iommufd_cdev_attach_container(VFIODevice *vbasedev, + VFIOIOMMUFDContainer *container, + Error **errp) +{ + return iommufd_cdev_attach_ioas_hwpt(vbasedev, container->ioas_id, errp); +} + +static void iommufd_cdev_detach_container(VFIODevice *vbasedev, + VFIOIOMMUFDContainer *container) +{ + Error *err = NULL; + + if (iommufd_cdev_detach_ioas_hwpt(vbasedev, &err)) { + error_report_err(err); + } +} + +static void iommufd_cdev_container_destroy(VFIOIOMMUFDContainer *container) +{ + VFIOContainerBase *bcontainer = &container->bcontainer; + + if (!QLIST_EMPTY(&bcontainer->device_list)) { + return; + } + memory_listener_unregister(&bcontainer->listener); + vfio_container_destroy(bcontainer); + iommufd_backend_free_id(container->be, container->ioas_id); + g_free(container); +} + +static int iommufd_cdev_ram_block_discard_disable(bool state) +{ + /* + * We support coordinated discarding of RAM via the RamDiscardManager. + */ + return ram_block_uncoordinated_discard_disable(state); +} + +static int iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, + AddressSpace *as, Error **errp) +{ + VFIOContainerBase *bcontainer; + VFIOIOMMUFDContainer *container; + VFIOAddressSpace *space; + struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) }; + int ret, devfd; + uint32_t ioas_id; + Error *err = NULL; + + devfd = iommufd_cdev_getfd(vbasedev->sysfsdev, errp); + if (devfd < 0) { + return devfd; + } + vbasedev->fd = devfd; + + ret = iommufd_cdev_connect_and_bind(vbasedev, errp); + if (ret) { + goto err_connect_bind; + } + + space = vfio_get_address_space(as); + + /* try to attach to an existing container in this space */ + QLIST_FOREACH(bcontainer, &space->containers, next) { + container = container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); + if (bcontainer->ops != &vfio_iommufd_ops || + vbasedev->iommufd != container->be) { + continue; + } + if (iommufd_cdev_attach_container(vbasedev, container, &err)) { + const char *msg = error_get_pretty(err); + + trace_iommufd_cdev_fail_attach_existing_container(msg); + error_free(err); + err = NULL; + } else { + ret = iommufd_cdev_ram_block_discard_disable(true); + if (ret) { + error_setg(errp, + "Cannot set discarding of RAM broken (%d)", ret); + goto err_discard_disable; + } + goto found_container; + } + } + + /* Need to allocate a new dedicated container */ + ret = iommufd_backend_alloc_ioas(vbasedev->iommufd, &ioas_id, errp); + if (ret < 0) { + goto err_alloc_ioas; + } + + trace_iommufd_cdev_alloc_ioas(vbasedev->iommufd->fd, ioas_id); + + container = g_malloc0(sizeof(*container)); + container->be = vbasedev->iommufd; + container->ioas_id = ioas_id; + + bcontainer = &container->bcontainer; + vfio_container_init(bcontainer, space, &vfio_iommufd_ops); + QLIST_INSERT_HEAD(&space->containers, bcontainer, next); + + ret = iommufd_cdev_attach_container(vbasedev, container, errp); + if (ret) { + goto err_attach_container; + } + + ret = iommufd_cdev_ram_block_discard_disable(true); + if (ret) { + goto err_discard_disable; + } + + bcontainer->pgsizes = qemu_real_host_page_size(); + + bcontainer->listener = vfio_memory_listener; + memory_listener_register(&bcontainer->listener, bcontainer->space->as); + + if (bcontainer->error) { + ret = -1; + error_propagate_prepend(errp, bcontainer->error, + "memory listener initialization failed: "); + goto err_listener_register; + } + + bcontainer->initialized = true; + +found_container: + ret = ioctl(devfd, VFIO_DEVICE_GET_INFO, &dev_info); + if (ret) { + error_setg_errno(errp, errno, "error getting device info"); + goto err_listener_register; + } + + /* + * TODO: examine RAM_BLOCK_DISCARD stuff, should we do group level + * for discarding incompatibility check as well? + */ + if (vbasedev->ram_block_discard_allowed) { + iommufd_cdev_ram_block_discard_disable(false); + } + + vbasedev->group = 0; + vbasedev->num_irqs = dev_info.num_irqs; + vbasedev->num_regions = dev_info.num_regions; + vbasedev->flags = dev_info.flags; + vbasedev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET); + vbasedev->bcontainer = bcontainer; + QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next); + QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next); + + trace_iommufd_cdev_device_info(vbasedev->name, devfd, vbasedev->num_irqs, + vbasedev->num_regions, vbasedev->flags); + return 0; + +err_listener_register: + iommufd_cdev_ram_block_discard_disable(false); +err_discard_disable: + iommufd_cdev_detach_container(vbasedev, container); +err_attach_container: + iommufd_cdev_container_destroy(container); +err_alloc_ioas: + vfio_put_address_space(space); + iommufd_cdev_unbind_and_disconnect(vbasedev); +err_connect_bind: + close(vbasedev->fd); + return ret; +} + +static void iommufd_cdev_detach(VFIODevice *vbasedev) +{ + VFIOContainerBase *bcontainer = vbasedev->bcontainer; + VFIOAddressSpace *space = bcontainer->space; + VFIOIOMMUFDContainer *container = container_of(bcontainer, + VFIOIOMMUFDContainer, + bcontainer); + QLIST_REMOVE(vbasedev, global_next); + QLIST_REMOVE(vbasedev, container_next); + vbasedev->bcontainer = NULL; + + if (!vbasedev->ram_block_discard_allowed) { + iommufd_cdev_ram_block_discard_disable(false); + } + + iommufd_cdev_detach_container(vbasedev, container); + iommufd_cdev_container_destroy(container); + vfio_put_address_space(space); + + iommufd_cdev_unbind_and_disconnect(vbasedev); + close(vbasedev->fd); +} + +const VFIOIOMMUOps vfio_iommufd_ops = { + .dma_map = iommufd_cdev_map, + .dma_unmap = iommufd_cdev_unmap, + .attach_device = iommufd_cdev_attach, + .detach_device = iommufd_cdev_detach, +}; diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build index 32a6933280..bd5cc4ca79 100644 --- a/hw/vfio/meson.build +++ b/hw/vfio/meson.build @@ -7,6 +7,9 @@ vfio_ss.add(files( 'spapr.c', 'migration.c', )) +vfio_ss.add(when: 'CONFIG_IOMMUFD', if_true: files( + 'iommufd.c', +)) vfio_ss.add(when: 'CONFIG_VFIO_PCI', if_true: files( 'display.c', 'pci-quirks.c', diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index 08a1f9dfa4..3340c93af0 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -164,3 +164,13 @@ vfio_state_pending_estimate(const char *name, uint64_t precopy, uint64_t postcop vfio_state_pending_exact(const char *name, uint64_t precopy, uint64_t postcopy, uint64_t stopcopy_size, uint64_t precopy_init_size, uint64_t precopy_dirty_size) " (%s) precopy 0x%"PRIx64" postcopy 0x%"PRIx64" stopcopy size 0x%"PRIx64" precopy initial size 0x%"PRIx64" precopy dirty size 0x%"PRIx64 vfio_vmstate_change(const char *name, int running, const char *reason, const char *dev_state) " (%s) running %d reason %s device state %s" vfio_vmstate_change_prepare(const char *name, int running, const char *reason, const char *dev_state) " (%s) running %d reason %s device state %s" + +#iommufd.c + +iommufd_cdev_connect_and_bind(int iommufd, const char *name, int devfd, int devid) " [iommufd=%d] Successfully bound device %s (fd=%d): output devid=%d" +iommufd_cdev_getfd(const char *dev, int devfd) " %s (fd=%d)" +iommufd_cdev_attach_ioas_hwpt(int iommufd, const char *name, int devfd, int id) " [iommufd=%d] Successfully attached device %s (%d) to id=%d" +iommufd_cdev_detach_ioas_hwpt(int iommufd, const char *name) " [iommufd=%d] Successfully detached %s" +iommufd_cdev_fail_attach_existing_container(const char *msg) " %s" +iommufd_cdev_alloc_ioas(int iommufd, int ioas_id) " [iommufd=%d] new IOMMUFD container with ioasid=%d" +iommufd_cdev_device_info(char *name, int devfd, int num_irqs, int num_regions, int flags) " %s (%d) num_irqs=%d num_regions=%d flags=%d" diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 9e22acbfb6..9b9fd7b461 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -99,6 +99,14 @@ typedef struct VFIOHostDMAWindow { QLIST_ENTRY(VFIOHostDMAWindow) hostwin_next; } VFIOHostDMAWindow; +typedef struct IOMMUFDBackend IOMMUFDBackend; + +typedef struct VFIOIOMMUFDContainer { + VFIOContainerBase bcontainer; + IOMMUFDBackend *be; + uint32_t ioas_id; +} VFIOIOMMUFDContainer; + typedef struct VFIODeviceOps VFIODeviceOps; typedef struct VFIODevice { @@ -126,6 +134,8 @@ typedef struct VFIODevice { OnOffAuto pre_copy_dirty_page_tracking; bool dirty_pages_supported; bool dirty_tracking; + int devid; + IOMMUFDBackend *iommufd; } VFIODevice; struct VFIODeviceOps { @@ -215,6 +225,7 @@ typedef QLIST_HEAD(VFIODeviceList, VFIODevice) VFIODeviceList; extern VFIOGroupList vfio_group_list; extern VFIODeviceList vfio_device_list; extern const VFIOIOMMUOps vfio_legacy_ops; +extern const VFIOIOMMUOps vfio_iommufd_ops; extern const MemoryListener vfio_memory_listener; extern int vfio_kvm_device_fd; -- Gitee From cb2bd16a67cd45a0ad3318098120aee10a298f3b Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:41 +0800 Subject: [PATCH 024/134] vfio/iommufd: Relax assert check for iommufd backend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently iommufd doesn't support dirty page sync yet, but it will not block us doing live migration if VFIO migration is force enabled. So in this case we allow set_dirty_page_tracking to be NULL. Note we don't need same change for query_dirty_bitmap because when dirty page sync isn't supported, query_dirty_bitmap will never be called. Suggested-by: Cédric Le Goater Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Reviewed-by: Eric Auger Tested-by: Eric Auger Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/container-base.c | 4 ++++ hw/vfio/container.c | 4 ---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c index 71f7274973..eee2dcfe76 100644 --- a/hw/vfio/container-base.c +++ b/hw/vfio/container-base.c @@ -55,6 +55,10 @@ void vfio_container_del_section_window(VFIOContainerBase *bcontainer, int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer, bool start) { + if (!bcontainer->dirty_pages_supported) { + return 0; + } + g_assert(bcontainer->ops->set_dirty_page_tracking); return bcontainer->ops->set_dirty_page_tracking(bcontainer, start); } diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 62af0f2bdd..4936b8f27f 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -266,10 +266,6 @@ static int vfio_legacy_set_dirty_page_tracking(VFIOContainerBase *bcontainer, .argsz = sizeof(dirty), }; - if (!bcontainer->dirty_pages_supported) { - return 0; - } - if (start) { dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START; } else { -- Gitee From d6f0612a8760959f25c148ab50a1e7c394d4279a Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:42 +0800 Subject: [PATCH 025/134] vfio/iommufd: Add support for iova_ranges and pgsizes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some vIOMMU such as virtio-iommu use IOVA ranges from host side to setup reserved ranges for passthrough device, so that guest will not use an IOVA range beyond host support. Use an uAPI of IOMMUFD to get IOVA ranges of host side and pass to vIOMMU just like the legacy backend, if this fails, fallback to 64bit IOVA range. Also use out_iova_alignment returned from uAPI as pgsizes instead of qemu_real_host_page_size() as a fallback. Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Reviewed-by: Eric Auger Tested-by: Eric Auger Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/iommufd.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 6d31aeac7b..01b448e840 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -261,6 +261,53 @@ static int iommufd_cdev_ram_block_discard_disable(bool state) return ram_block_uncoordinated_discard_disable(state); } +static int iommufd_cdev_get_info_iova_range(VFIOIOMMUFDContainer *container, + uint32_t ioas_id, Error **errp) +{ + VFIOContainerBase *bcontainer = &container->bcontainer; + struct iommu_ioas_iova_ranges *info; + struct iommu_iova_range *iova_ranges; + int ret, sz, fd = container->be->fd; + + info = g_malloc0(sizeof(*info)); + info->size = sizeof(*info); + info->ioas_id = ioas_id; + + ret = ioctl(fd, IOMMU_IOAS_IOVA_RANGES, info); + if (ret && errno != EMSGSIZE) { + goto error; + } + + sz = info->num_iovas * sizeof(struct iommu_iova_range); + info = g_realloc(info, sizeof(*info) + sz); + info->allowed_iovas = (uintptr_t)(info + 1); + + ret = ioctl(fd, IOMMU_IOAS_IOVA_RANGES, info); + if (ret) { + goto error; + } + + iova_ranges = (struct iommu_iova_range *)(uintptr_t)info->allowed_iovas; + + for (int i = 0; i < info->num_iovas; i++) { + Range *range = g_new(Range, 1); + + range_set_bounds(range, iova_ranges[i].start, iova_ranges[i].last); + bcontainer->iova_ranges = + range_list_insert(bcontainer->iova_ranges, range); + } + bcontainer->pgsizes = info->out_iova_alignment; + + g_free(info); + return 0; + +error: + ret = -errno; + g_free(info); + error_setg_errno(errp, errno, "Cannot get IOVA ranges"); + return ret; +} + static int iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, AddressSpace *as, Error **errp) { @@ -335,7 +382,14 @@ static int iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, goto err_discard_disable; } - bcontainer->pgsizes = qemu_real_host_page_size(); + ret = iommufd_cdev_get_info_iova_range(container, ioas_id, &err); + if (ret) { + error_append_hint(&err, + "Fallback to default 64bit IOVA range and 4K page size\n"); + warn_report_err(err); + err = NULL; + bcontainer->pgsizes = qemu_real_host_page_size(); + } bcontainer->listener = vfio_memory_listener; memory_listener_register(&bcontainer->listener, bcontainer->space->as); -- Gitee From 0b0701478649baccf3945051822f993619bce01e Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:43 +0800 Subject: [PATCH 026/134] vfio/pci: Extract out a helper vfio_pci_get_pci_hot_reset_info MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This helper will be used by both legacy and iommufd backends. No functional changes intended. Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Reviewed-by: Eric Auger Tested-by: Eric Auger Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/pci.c | 54 +++++++++++++++++++++++++++++++++++---------------- hw/vfio/pci.h | 3 +++ 2 files changed, 40 insertions(+), 17 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index c62c02f7b6..eb55e8ae88 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -2445,22 +2445,13 @@ static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name) return (strcmp(tmp, name) == 0); } -static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) +int vfio_pci_get_pci_hot_reset_info(VFIOPCIDevice *vdev, + struct vfio_pci_hot_reset_info **info_p) { - VFIOGroup *group; struct vfio_pci_hot_reset_info *info; - struct vfio_pci_dependent_device *devices; - struct vfio_pci_hot_reset *reset; - int32_t *fds; - int ret, i, count; - bool multi = false; + int ret, count; - trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi"); - - if (!single) { - vfio_pci_pre_reset(vdev); - } - vdev->vbasedev.needs_reset = false; + assert(info_p && !*info_p); info = g_malloc0(sizeof(*info)); info->argsz = sizeof(*info); @@ -2468,24 +2459,53 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info); if (ret && errno != ENOSPC) { ret = -errno; + g_free(info); if (!vdev->has_pm_reset) { error_report("vfio: Cannot reset device %s, " "no available reset mechanism.", vdev->vbasedev.name); } - goto out_single; + return ret; } count = info->count; - info = g_realloc(info, sizeof(*info) + (count * sizeof(*devices))); - info->argsz = sizeof(*info) + (count * sizeof(*devices)); - devices = &info->devices[0]; + info = g_realloc(info, sizeof(*info) + (count * sizeof(info->devices[0]))); + info->argsz = sizeof(*info) + (count * sizeof(info->devices[0])); ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info); if (ret) { ret = -errno; + g_free(info); error_report("vfio: hot reset info failed: %m"); + return ret; + } + + *info_p = info; + return 0; +} + +static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) +{ + VFIOGroup *group; + struct vfio_pci_hot_reset_info *info = NULL; + struct vfio_pci_dependent_device *devices; + struct vfio_pci_hot_reset *reset; + int32_t *fds; + int ret, i, count; + bool multi = false; + + trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi"); + + if (!single) { + vfio_pci_pre_reset(vdev); + } + vdev->vbasedev.needs_reset = false; + + ret = vfio_pci_get_pci_hot_reset_info(vdev, &info); + + if (ret) { goto out_single; } + devices = &info->devices[0]; trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name); diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index fba8737ab2..1006061afb 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -218,6 +218,9 @@ void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr); extern const PropertyInfo qdev_prop_nv_gpudirect_clique; +int vfio_pci_get_pci_hot_reset_info(VFIOPCIDevice *vdev, + struct vfio_pci_hot_reset_info **info_p); + int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp); int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev, -- Gitee From 32beb7b360416a5f04cebac227ffdf102448d518 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:44 +0800 Subject: [PATCH 027/134] vfio/pci: Introduce a vfio pci hot reset interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Legacy vfio pci and iommufd cdev have different process to hot reset vfio device, expand current code to abstract out pci_hot_reset callback for legacy vfio, this same interface will also be used by iommufd cdev vfio device. Rename vfio_pci_hot_reset to vfio_legacy_pci_hot_reset and move it into container.c. vfio_pci_[pre/post]_reset and vfio_pci_host_match are exported so they could be called in legacy and iommufd pci_hot_reset callback. Suggested-by: Cédric Le Goater Signed-off-by: Zhenzhong Duan Reviewed-by: Eric Auger Tested-by: Eric Auger Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/container.c | 170 ++++++++++++++++++++++++++ hw/vfio/pci.c | 168 +------------------------ hw/vfio/pci.h | 3 + include/hw/vfio/vfio-container-base.h | 3 + 4 files changed, 182 insertions(+), 162 deletions(-) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 4936b8f27f..e32e1b51e0 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -35,6 +35,7 @@ #include "qapi/error.h" #include "migration/migration.h" #include "sysemu/kvm.h" +#include "pci.h" VFIOGroupList vfio_group_list = QLIST_HEAD_INITIALIZER(vfio_group_list); @@ -1035,6 +1036,174 @@ static void vfio_legacy_detach_device(VFIODevice *vbasedev) vfio_put_group(group); } +static int vfio_legacy_pci_hot_reset(VFIODevice *vbasedev, bool single) +{ + VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); + VFIOGroup *group; + struct vfio_pci_hot_reset_info *info = NULL; + struct vfio_pci_dependent_device *devices; + struct vfio_pci_hot_reset *reset; + int32_t *fds; + int ret, i, count; + bool multi = false; + + trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi"); + + if (!single) { + vfio_pci_pre_reset(vdev); + } + vdev->vbasedev.needs_reset = false; + + ret = vfio_pci_get_pci_hot_reset_info(vdev, &info); + + if (ret) { + goto out_single; + } + devices = &info->devices[0]; + + trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name); + + /* Verify that we have all the groups required */ + for (i = 0; i < info->count; i++) { + PCIHostDeviceAddress host; + VFIOPCIDevice *tmp; + VFIODevice *vbasedev_iter; + + host.domain = devices[i].segment; + host.bus = devices[i].bus; + host.slot = PCI_SLOT(devices[i].devfn); + host.function = PCI_FUNC(devices[i].devfn); + + trace_vfio_pci_hot_reset_dep_devices(host.domain, + host.bus, host.slot, host.function, devices[i].group_id); + + if (vfio_pci_host_match(&host, vdev->vbasedev.name)) { + continue; + } + + QLIST_FOREACH(group, &vfio_group_list, next) { + if (group->groupid == devices[i].group_id) { + break; + } + } + + if (!group) { + if (!vdev->has_pm_reset) { + error_report("vfio: Cannot reset device %s, " + "depends on group %d which is not owned.", + vdev->vbasedev.name, devices[i].group_id); + } + ret = -EPERM; + goto out; + } + + /* Prep dependent devices for reset and clear our marker. */ + QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { + if (!vbasedev_iter->dev->realized || + vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) { + continue; + } + tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev); + if (vfio_pci_host_match(&host, tmp->vbasedev.name)) { + if (single) { + ret = -EINVAL; + goto out_single; + } + vfio_pci_pre_reset(tmp); + tmp->vbasedev.needs_reset = false; + multi = true; + break; + } + } + } + + if (!single && !multi) { + ret = -EINVAL; + goto out_single; + } + + /* Determine how many group fds need to be passed */ + count = 0; + QLIST_FOREACH(group, &vfio_group_list, next) { + for (i = 0; i < info->count; i++) { + if (group->groupid == devices[i].group_id) { + count++; + break; + } + } + } + + reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds))); + reset->argsz = sizeof(*reset) + (count * sizeof(*fds)); + fds = &reset->group_fds[0]; + + /* Fill in group fds */ + QLIST_FOREACH(group, &vfio_group_list, next) { + for (i = 0; i < info->count; i++) { + if (group->groupid == devices[i].group_id) { + fds[reset->count++] = group->fd; + break; + } + } + } + + /* Bus reset! */ + ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset); + g_free(reset); + if (ret) { + ret = -errno; + } + + trace_vfio_pci_hot_reset_result(vdev->vbasedev.name, + ret ? strerror(errno) : "Success"); + +out: + /* Re-enable INTx on affected devices */ + for (i = 0; i < info->count; i++) { + PCIHostDeviceAddress host; + VFIOPCIDevice *tmp; + VFIODevice *vbasedev_iter; + + host.domain = devices[i].segment; + host.bus = devices[i].bus; + host.slot = PCI_SLOT(devices[i].devfn); + host.function = PCI_FUNC(devices[i].devfn); + + if (vfio_pci_host_match(&host, vdev->vbasedev.name)) { + continue; + } + + QLIST_FOREACH(group, &vfio_group_list, next) { + if (group->groupid == devices[i].group_id) { + break; + } + } + + if (!group) { + break; + } + + QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { + if (!vbasedev_iter->dev->realized || + vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) { + continue; + } + tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev); + if (vfio_pci_host_match(&host, tmp->vbasedev.name)) { + vfio_pci_post_reset(tmp); + break; + } + } + } +out_single: + if (!single) { + vfio_pci_post_reset(vdev); + } + g_free(info); + + return ret; +} + const VFIOIOMMUOps vfio_legacy_ops = { .dma_map = vfio_legacy_dma_map, .dma_unmap = vfio_legacy_dma_unmap, @@ -1042,4 +1211,5 @@ const VFIOIOMMUOps vfio_legacy_ops = { .detach_device = vfio_legacy_detach_device, .set_dirty_page_tracking = vfio_legacy_set_dirty_page_tracking, .query_dirty_bitmap = vfio_legacy_query_dirty_bitmap, + .pci_hot_reset = vfio_legacy_pci_hot_reset, }; diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index eb55e8ae88..d00c3472c7 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -2374,7 +2374,7 @@ static int vfio_add_capabilities(VFIOPCIDevice *vdev, Error **errp) return 0; } -static void vfio_pci_pre_reset(VFIOPCIDevice *vdev) +void vfio_pci_pre_reset(VFIOPCIDevice *vdev) { PCIDevice *pdev = &vdev->pdev; uint16_t cmd; @@ -2411,7 +2411,7 @@ static void vfio_pci_pre_reset(VFIOPCIDevice *vdev) vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2); } -static void vfio_pci_post_reset(VFIOPCIDevice *vdev) +void vfio_pci_post_reset(VFIOPCIDevice *vdev) { Error *err = NULL; int nr; @@ -2435,7 +2435,7 @@ static void vfio_pci_post_reset(VFIOPCIDevice *vdev) vfio_quirk_reset(vdev); } -static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name) +bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name) { char tmp[13]; @@ -2485,166 +2485,10 @@ int vfio_pci_get_pci_hot_reset_info(VFIOPCIDevice *vdev, static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) { - VFIOGroup *group; - struct vfio_pci_hot_reset_info *info = NULL; - struct vfio_pci_dependent_device *devices; - struct vfio_pci_hot_reset *reset; - int32_t *fds; - int ret, i, count; - bool multi = false; - - trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi"); - - if (!single) { - vfio_pci_pre_reset(vdev); - } - vdev->vbasedev.needs_reset = false; - - ret = vfio_pci_get_pci_hot_reset_info(vdev, &info); - - if (ret) { - goto out_single; - } - devices = &info->devices[0]; - - trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name); - - /* Verify that we have all the groups required */ - for (i = 0; i < info->count; i++) { - PCIHostDeviceAddress host; - VFIOPCIDevice *tmp; - VFIODevice *vbasedev_iter; - - host.domain = devices[i].segment; - host.bus = devices[i].bus; - host.slot = PCI_SLOT(devices[i].devfn); - host.function = PCI_FUNC(devices[i].devfn); - - trace_vfio_pci_hot_reset_dep_devices(host.domain, - host.bus, host.slot, host.function, devices[i].group_id); - - if (vfio_pci_host_match(&host, vdev->vbasedev.name)) { - continue; - } - - QLIST_FOREACH(group, &vfio_group_list, next) { - if (group->groupid == devices[i].group_id) { - break; - } - } - - if (!group) { - if (!vdev->has_pm_reset) { - error_report("vfio: Cannot reset device %s, " - "depends on group %d which is not owned.", - vdev->vbasedev.name, devices[i].group_id); - } - ret = -EPERM; - goto out; - } - - /* Prep dependent devices for reset and clear our marker. */ - QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { - if (!vbasedev_iter->dev->realized || - vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) { - continue; - } - tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev); - if (vfio_pci_host_match(&host, tmp->vbasedev.name)) { - if (single) { - ret = -EINVAL; - goto out_single; - } - vfio_pci_pre_reset(tmp); - tmp->vbasedev.needs_reset = false; - multi = true; - break; - } - } - } - - if (!single && !multi) { - ret = -EINVAL; - goto out_single; - } - - /* Determine how many group fds need to be passed */ - count = 0; - QLIST_FOREACH(group, &vfio_group_list, next) { - for (i = 0; i < info->count; i++) { - if (group->groupid == devices[i].group_id) { - count++; - break; - } - } - } - - reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds))); - reset->argsz = sizeof(*reset) + (count * sizeof(*fds)); - fds = &reset->group_fds[0]; - - /* Fill in group fds */ - QLIST_FOREACH(group, &vfio_group_list, next) { - for (i = 0; i < info->count; i++) { - if (group->groupid == devices[i].group_id) { - fds[reset->count++] = group->fd; - break; - } - } - } - - /* Bus reset! */ - ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset); - g_free(reset); - - trace_vfio_pci_hot_reset_result(vdev->vbasedev.name, - ret ? strerror(errno) : "Success"); - -out: - /* Re-enable INTx on affected devices */ - for (i = 0; i < info->count; i++) { - PCIHostDeviceAddress host; - VFIOPCIDevice *tmp; - VFIODevice *vbasedev_iter; - - host.domain = devices[i].segment; - host.bus = devices[i].bus; - host.slot = PCI_SLOT(devices[i].devfn); - host.function = PCI_FUNC(devices[i].devfn); - - if (vfio_pci_host_match(&host, vdev->vbasedev.name)) { - continue; - } - - QLIST_FOREACH(group, &vfio_group_list, next) { - if (group->groupid == devices[i].group_id) { - break; - } - } - - if (!group) { - break; - } - - QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { - if (!vbasedev_iter->dev->realized || - vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) { - continue; - } - tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev); - if (vfio_pci_host_match(&host, tmp->vbasedev.name)) { - vfio_pci_post_reset(tmp); - break; - } - } - } -out_single: - if (!single) { - vfio_pci_post_reset(vdev); - } - g_free(info); + VFIODevice *vbasedev = &vdev->vbasedev; + const VFIOIOMMUOps *ops = vbasedev->bcontainer->ops; - return ret; + return ops->pci_hot_reset(vbasedev, single); } /* diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index 1006061afb..6e64a2654e 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -218,6 +218,9 @@ void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr); extern const PropertyInfo qdev_prop_nv_gpudirect_clique; +void vfio_pci_pre_reset(VFIOPCIDevice *vdev); +void vfio_pci_post_reset(VFIOPCIDevice *vdev); +bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name); int vfio_pci_get_pci_hot_reset_info(VFIOPCIDevice *vdev, struct vfio_pci_hot_reset_info **info_p); diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index 4b6f017c6f..45bb19c767 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -106,6 +106,9 @@ struct VFIOIOMMUOps { int (*set_dirty_page_tracking)(VFIOContainerBase *bcontainer, bool start); int (*query_dirty_bitmap)(VFIOContainerBase *bcontainer, VFIOBitmap *vbmap, hwaddr iova, hwaddr size); + /* PCI specific */ + int (*pci_hot_reset)(VFIODevice *vbasedev, bool single); + /* SPAPR specific */ int (*add_window)(VFIOContainerBase *bcontainer, MemoryRegionSection *section, -- Gitee From de17750e24d4e583e9f392bbe47e4bd1aa81d6bc Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:45 +0800 Subject: [PATCH 028/134] vfio/iommufd: Enable pci hot reset through iommufd cdev interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement the newly introduced pci_hot_reset callback named iommufd_cdev_pci_hot_reset to do iommufd specific check and reset operation. Signed-off-by: Zhenzhong Duan Reviewed-by: Eric Auger Tested-by: Eric Auger Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/iommufd.c | 150 +++++++++++++++++++++++++++++++++++++++++++ hw/vfio/trace-events | 1 + 2 files changed, 151 insertions(+) diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 01b448e840..6e53e013ef 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -24,6 +24,7 @@ #include "sysemu/reset.h" #include "qemu/cutils.h" #include "qemu/chardev_open.h" +#include "pci.h" static int iommufd_cdev_map(VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, void *vaddr, bool readonly) @@ -468,9 +469,158 @@ static void iommufd_cdev_detach(VFIODevice *vbasedev) close(vbasedev->fd); } +static VFIODevice *iommufd_cdev_pci_find_by_devid(__u32 devid) +{ + VFIODevice *vbasedev_iter; + + QLIST_FOREACH(vbasedev_iter, &vfio_device_list, global_next) { + if (vbasedev_iter->bcontainer->ops != &vfio_iommufd_ops) { + continue; + } + if (devid == vbasedev_iter->devid) { + return vbasedev_iter; + } + } + return NULL; +} + +static VFIOPCIDevice * +iommufd_cdev_dep_get_realized_vpdev(struct vfio_pci_dependent_device *dep_dev, + VFIODevice *reset_dev) +{ + VFIODevice *vbasedev_tmp; + + if (dep_dev->devid == reset_dev->devid || + dep_dev->devid == VFIO_PCI_DEVID_OWNED) { + return NULL; + } + + vbasedev_tmp = iommufd_cdev_pci_find_by_devid(dep_dev->devid); + if (!vbasedev_tmp || !vbasedev_tmp->dev->realized || + vbasedev_tmp->type != VFIO_DEVICE_TYPE_PCI) { + return NULL; + } + + return container_of(vbasedev_tmp, VFIOPCIDevice, vbasedev); +} + +static int iommufd_cdev_pci_hot_reset(VFIODevice *vbasedev, bool single) +{ + VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); + struct vfio_pci_hot_reset_info *info = NULL; + struct vfio_pci_dependent_device *devices; + struct vfio_pci_hot_reset *reset; + int ret, i; + bool multi = false; + + trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi"); + + if (!single) { + vfio_pci_pre_reset(vdev); + } + vdev->vbasedev.needs_reset = false; + + ret = vfio_pci_get_pci_hot_reset_info(vdev, &info); + + if (ret) { + goto out_single; + } + + assert(info->flags & VFIO_PCI_HOT_RESET_FLAG_DEV_ID); + + devices = &info->devices[0]; + + if (!(info->flags & VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED)) { + if (!vdev->has_pm_reset) { + for (i = 0; i < info->count; i++) { + if (devices[i].devid == VFIO_PCI_DEVID_NOT_OWNED) { + error_report("vfio: Cannot reset device %s, " + "depends on device %04x:%02x:%02x.%x " + "which is not owned.", + vdev->vbasedev.name, devices[i].segment, + devices[i].bus, PCI_SLOT(devices[i].devfn), + PCI_FUNC(devices[i].devfn)); + } + } + } + ret = -EPERM; + goto out_single; + } + + trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name); + + for (i = 0; i < info->count; i++) { + VFIOPCIDevice *tmp; + + trace_iommufd_cdev_pci_hot_reset_dep_devices(devices[i].segment, + devices[i].bus, + PCI_SLOT(devices[i].devfn), + PCI_FUNC(devices[i].devfn), + devices[i].devid); + + /* + * If a VFIO cdev device is resettable, all the dependent devices + * are either bound to same iommufd or within same iommu_groups as + * one of the iommufd bound devices. + */ + assert(devices[i].devid != VFIO_PCI_DEVID_NOT_OWNED); + + tmp = iommufd_cdev_dep_get_realized_vpdev(&devices[i], &vdev->vbasedev); + if (!tmp) { + continue; + } + + if (single) { + ret = -EINVAL; + goto out_single; + } + vfio_pci_pre_reset(tmp); + tmp->vbasedev.needs_reset = false; + multi = true; + } + + if (!single && !multi) { + ret = -EINVAL; + goto out_single; + } + + /* Use zero length array for hot reset with iommufd backend */ + reset = g_malloc0(sizeof(*reset)); + reset->argsz = sizeof(*reset); + + /* Bus reset! */ + ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset); + g_free(reset); + if (ret) { + ret = -errno; + } + + trace_vfio_pci_hot_reset_result(vdev->vbasedev.name, + ret ? strerror(errno) : "Success"); + + /* Re-enable INTx on affected devices */ + for (i = 0; i < info->count; i++) { + VFIOPCIDevice *tmp; + + tmp = iommufd_cdev_dep_get_realized_vpdev(&devices[i], &vdev->vbasedev); + if (!tmp) { + continue; + } + vfio_pci_post_reset(tmp); + } +out_single: + if (!single) { + vfio_pci_post_reset(vdev); + } + g_free(info); + + return ret; +} + const VFIOIOMMUOps vfio_iommufd_ops = { .dma_map = iommufd_cdev_map, .dma_unmap = iommufd_cdev_unmap, .attach_device = iommufd_cdev_attach, .detach_device = iommufd_cdev_detach, + .pci_hot_reset = iommufd_cdev_pci_hot_reset, }; diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index 3340c93af0..8fdde54456 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -174,3 +174,4 @@ iommufd_cdev_detach_ioas_hwpt(int iommufd, const char *name) " [iommufd=%d] Succ iommufd_cdev_fail_attach_existing_container(const char *msg) " %s" iommufd_cdev_alloc_ioas(int iommufd, int ioas_id) " [iommufd=%d] new IOMMUFD container with ioasid=%d" iommufd_cdev_device_info(char *name, int devfd, int num_irqs, int num_regions, int flags) " %s (%d) num_irqs=%d num_regions=%d flags=%d" +iommufd_cdev_pci_hot_reset_dep_devices(int domain, int bus, int slot, int function, int dev_id) "\t%04x:%02x:%02x.%x devid %d" -- Gitee From 6576af91f2621c24de4a8bbfa2c6681a16a5d043 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Sat, 11 Jan 2025 10:52:46 +0800 Subject: [PATCH 029/134] vfio/pci: Allow the selection of a given iommu backend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now we support two types of iommu backends, let's add the capability to select one of them. This depends on whether an iommufd object has been linked with the vfio-pci device: If the user wants to use the legacy backend, it shall not link the vfio-pci device with any iommufd object: -device vfio-pci,host=0000:02:00.0 This is called the legacy mode/backend. If the user wants to use the iommufd backend (/dev/iommu) it shall pass an iommufd object id in the vfio-pci device options: -object iommufd,id=iommufd0 -device vfio-pci,host=0000:02:00.0,iommufd=iommufd0 Suggested-by: Alex Williamson Signed-off-by: Eric Auger Signed-off-by: Yi Liu Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Tested-by: Eric Auger Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/pci.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index d00c3472c7..c5984b0598 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -19,6 +19,7 @@ */ #include "qemu/osdep.h" +#include CONFIG_DEVICES /* CONFIG_IOMMUFD */ #include #include @@ -42,6 +43,7 @@ #include "qapi/error.h" #include "migration/blocker.h" #include "migration/qemu-file.h" +#include "sysemu/iommufd.h" #define TYPE_VFIO_PCI_NOHOTPLUG "vfio-pci-nohotplug" @@ -3386,6 +3388,10 @@ static Property vfio_pci_dev_properties[] = { * DEFINE_PROP_STRING("vfiofd", VFIOPCIDevice, vfiofd_name), * DEFINE_PROP_STRING("vfiogroupfd, VFIOPCIDevice, vfiogroupfd_name), */ +#ifdef CONFIG_IOMMUFD + DEFINE_PROP_LINK("iommufd", VFIOPCIDevice, vbasedev.iommufd, + TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *), +#endif DEFINE_PROP_END_OF_LIST(), }; -- Gitee From 008d4e37fe67c7f81920efe862352c4b1f3cd1b0 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:47 +0800 Subject: [PATCH 030/134] vfio/pci: Make vfio cdev pre-openable by passing a file handle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This gives management tools like libvirt a chance to open the vfio cdev with privilege and pass FD to qemu. This way qemu never needs to have privilege to open a VFIO or iommu cdev node. Together with the earlier support of pre-opening /dev/iommu device, now we have full support of passing a vfio device to unprivileged qemu by management tool. This mode is no more considered for the legacy backend. So let's remove the "TODO" comment. Add helper functions vfio_device_set_fd() and vfio_device_get_name() to set fd and get device name, they will also be used by other vfio devices. There is no easy way to check if a device is mdev with FD passing, so fail the x-balloon-allowed check unconditionally in this case. There is also no easy way to get BDF as name with FD passing, so we fake a name by VFIO_FD[fd]. Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Tested-by: Eric Auger Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/helpers.c | 43 +++++++++++++++++++++++++++++++++++ hw/vfio/iommufd.c | 12 ++++++---- hw/vfio/pci.c | 28 +++++++++++++---------- include/hw/vfio/vfio-common.h | 4 ++++ 4 files changed, 71 insertions(+), 16 deletions(-) diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c index 168847e7c5..3592c3d54e 100644 --- a/hw/vfio/helpers.c +++ b/hw/vfio/helpers.c @@ -27,6 +27,7 @@ #include "trace.h" #include "qapi/error.h" #include "qemu/error-report.h" +#include "monitor/monitor.h" /* * Common VFIO interrupt disable @@ -609,3 +610,45 @@ bool vfio_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type) return ret; } + +int vfio_device_get_name(VFIODevice *vbasedev, Error **errp) +{ + struct stat st; + + if (vbasedev->fd < 0) { + if (stat(vbasedev->sysfsdev, &st) < 0) { + error_setg_errno(errp, errno, "no such host device"); + error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->sysfsdev); + return -errno; + } + /* User may specify a name, e.g: VFIO platform device */ + if (!vbasedev->name) { + vbasedev->name = g_path_get_basename(vbasedev->sysfsdev); + } + } else { + if (!vbasedev->iommufd) { + error_setg(errp, "Use FD passing only with iommufd backend"); + return -EINVAL; + } + /* + * Give a name with fd so any function printing out vbasedev->name + * will not break. + */ + if (!vbasedev->name) { + vbasedev->name = g_strdup_printf("VFIO_FD%d", vbasedev->fd); + } + } + + return 0; +} + +void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp) +{ + int fd = monitor_fd_param(monitor_cur(), str, errp); + + if (fd < 0) { + error_prepend(errp, "Could not parse remote object fd %s:", str); + return; + } + vbasedev->fd = fd; +} diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 6e53e013ef..5accd26484 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -320,11 +320,15 @@ static int iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, uint32_t ioas_id; Error *err = NULL; - devfd = iommufd_cdev_getfd(vbasedev->sysfsdev, errp); - if (devfd < 0) { - return devfd; + if (vbasedev->fd < 0) { + devfd = iommufd_cdev_getfd(vbasedev->sysfsdev, errp); + if (devfd < 0) { + return devfd; + } + vbasedev->fd = devfd; + } else { + devfd = vbasedev->fd; } - vbasedev->fd = devfd; ret = iommufd_cdev_connect_and_bind(vbasedev, errp); if (ret) { diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index c5984b0598..445d58c8e5 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -2944,17 +2944,19 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) VFIODevice *vbasedev = &vdev->vbasedev; char *tmp, *subsys; Error *err = NULL; - struct stat st; int i, ret; bool is_mdev; char uuid[UUID_STR_LEN]; char *name; - if (!vbasedev->sysfsdev) { + if (vbasedev->fd < 0 && !vbasedev->sysfsdev) { if (!(~vdev->host.domain || ~vdev->host.bus || ~vdev->host.slot || ~vdev->host.function)) { error_setg(errp, "No provided host device"); error_append_hint(errp, "Use -device vfio-pci,host=DDDD:BB:DD.F " +#ifdef CONFIG_IOMMUFD + "or -device vfio-pci,fd=DEVICE_FD " +#endif "or -device vfio-pci,sysfsdev=PATH_TO_DEVICE\n"); return; } @@ -2964,13 +2966,9 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) vdev->host.slot, vdev->host.function); } - if (stat(vbasedev->sysfsdev, &st) < 0) { - error_setg_errno(errp, errno, "no such host device"); - error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->sysfsdev); + if (vfio_device_get_name(vbasedev, errp) < 0) { return; } - - vbasedev->name = g_path_get_basename(vbasedev->sysfsdev); vbasedev->ops = &vfio_pci_ops; vbasedev->type = VFIO_DEVICE_TYPE_PCI; vbasedev->dev = DEVICE(vdev); @@ -3330,6 +3328,7 @@ static void vfio_instance_init(Object *obj) vdev->host.bus = ~0U; vdev->host.slot = ~0U; vdev->host.function = ~0U; + vdev->vbasedev.fd = -1; vdev->nv_gpudirect_clique = 0xFF; @@ -3383,11 +3382,6 @@ static Property vfio_pci_dev_properties[] = { qdev_prop_nv_gpudirect_clique, uint8_t), DEFINE_PROP_OFF_AUTO_PCIBAR("x-msix-relocation", VFIOPCIDevice, msix_relo, OFF_AUTOPCIBAR_OFF), - /* - * TODO - support passed fds... is this necessary? - * DEFINE_PROP_STRING("vfiofd", VFIOPCIDevice, vfiofd_name), - * DEFINE_PROP_STRING("vfiogroupfd, VFIOPCIDevice, vfiogroupfd_name), - */ #ifdef CONFIG_IOMMUFD DEFINE_PROP_LINK("iommufd", VFIOPCIDevice, vbasedev.iommufd, TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *), @@ -3395,6 +3389,13 @@ static Property vfio_pci_dev_properties[] = { DEFINE_PROP_END_OF_LIST(), }; +#ifdef CONFIG_IOMMUFD +static void vfio_pci_set_fd(Object *obj, const char *str, Error **errp) +{ + vfio_device_set_fd(&VFIO_PCI(obj)->vbasedev, str, errp); +} +#endif + static void vfio_pci_dev_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); @@ -3402,6 +3403,9 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, void *data) dc->reset = vfio_pci_reset; device_class_set_props(dc, vfio_pci_dev_properties); +#ifdef CONFIG_IOMMUFD + object_class_property_add_str(klass, "fd", NULL, vfio_pci_set_fd); +#endif dc->desc = "VFIO-based PCI device assignment"; set_bit(DEVICE_CATEGORY_MISC, dc->categories); pdc->realize = vfio_realize; diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 9b9fd7b461..5f35f2900b 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -265,4 +265,8 @@ int vfio_devices_query_dirty_bitmap(VFIOContainerBase *bcontainer, hwaddr size); int vfio_get_dirty_bitmap(VFIOContainerBase *bcontainer, uint64_t iova, uint64_t size, ram_addr_t ram_addr); + +/* Returns 0 on success, or a negative errno. */ +int vfio_device_get_name(VFIODevice *vbasedev, Error **errp); +void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp); #endif /* HW_VFIO_VFIO_COMMON_H */ -- Gitee From 1bbc795190c3ad7c838dc57a6f7a38a779dfdd65 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:48 +0800 Subject: [PATCH 031/134] vfio/platform: Allow the selection of a given iommu backend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now we support two types of iommu backends, let's add the capability to select one of them. This depends on whether an iommufd object has been linked with the vfio-platform device: If the user wants to use the legacy backend, it shall not link the vfio-platform device with any iommufd object: -device vfio-platform,host=XXX This is called the legacy mode/backend. If the user wants to use the iommufd backend (/dev/iommu) it shall pass an iommufd object id in the vfio-platform device options: -object iommufd,id=iommufd0 -device vfio-platform,host=XXX,iommufd=iommufd0 Suggested-by: Alex Williamson Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Reviewed-by: Eric Auger Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/platform.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c index 8e3d4ac458..98ae4bc655 100644 --- a/hw/vfio/platform.c +++ b/hw/vfio/platform.c @@ -15,11 +15,13 @@ */ #include "qemu/osdep.h" +#include CONFIG_DEVICES /* CONFIG_IOMMUFD */ #include "qapi/error.h" #include #include #include "hw/vfio/vfio-platform.h" +#include "sysemu/iommufd.h" #include "migration/vmstate.h" #include "qemu/error-report.h" #include "qemu/lockable.h" @@ -649,6 +651,10 @@ static Property vfio_platform_dev_properties[] = { DEFINE_PROP_UINT32("mmap-timeout-ms", VFIOPlatformDevice, mmap_timeout, 1100), DEFINE_PROP_BOOL("x-irqfd", VFIOPlatformDevice, irqfd_allowed, true), +#ifdef CONFIG_IOMMUFD + DEFINE_PROP_LINK("iommufd", VFIOPlatformDevice, vbasedev.iommufd, + TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *), +#endif DEFINE_PROP_END_OF_LIST(), }; -- Gitee From 9a12f3f754fcebe86fe2346e62cd25d8a2d06a89 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:49 +0800 Subject: [PATCH 032/134] vfio/platform: Make vfio cdev pre-openable by passing a file handle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This gives management tools like libvirt a chance to open the vfio cdev with privilege and pass FD to qemu. This way qemu never needs to have privilege to open a VFIO or iommu cdev node. Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/platform.c | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c index 98ae4bc655..a97d9c6234 100644 --- a/hw/vfio/platform.c +++ b/hw/vfio/platform.c @@ -531,14 +531,13 @@ static VFIODeviceOps vfio_platform_ops = { */ static int vfio_base_device_init(VFIODevice *vbasedev, Error **errp) { - struct stat st; int ret; - /* @sysfsdev takes precedence over @host */ - if (vbasedev->sysfsdev) { + /* @fd takes precedence over @sysfsdev which takes precedence over @host */ + if (vbasedev->fd < 0 && vbasedev->sysfsdev) { g_free(vbasedev->name); vbasedev->name = g_path_get_basename(vbasedev->sysfsdev); - } else { + } else if (vbasedev->fd < 0) { if (!vbasedev->name || strchr(vbasedev->name, '/')) { error_setg(errp, "wrong host device name"); return -EINVAL; @@ -548,10 +547,9 @@ static int vfio_base_device_init(VFIODevice *vbasedev, Error **errp) vbasedev->name); } - if (stat(vbasedev->sysfsdev, &st) < 0) { - error_setg_errno(errp, errno, - "failed to get the sysfs host device file status"); - return -errno; + ret = vfio_device_get_name(vbasedev, errp); + if (ret) { + return ret; } ret = vfio_attach_device(vbasedev->name, vbasedev, @@ -658,6 +656,20 @@ static Property vfio_platform_dev_properties[] = { DEFINE_PROP_END_OF_LIST(), }; +static void vfio_platform_instance_init(Object *obj) +{ + VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(obj); + + vdev->vbasedev.fd = -1; +} + +#ifdef CONFIG_IOMMUFD +static void vfio_platform_set_fd(Object *obj, const char *str, Error **errp) +{ + vfio_device_set_fd(&VFIO_PLATFORM_DEVICE(obj)->vbasedev, str, errp); +} +#endif + static void vfio_platform_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); @@ -665,6 +677,9 @@ static void vfio_platform_class_init(ObjectClass *klass, void *data) dc->realize = vfio_platform_realize; device_class_set_props(dc, vfio_platform_dev_properties); +#ifdef CONFIG_IOMMUFD + object_class_property_add_str(klass, "fd", NULL, vfio_platform_set_fd); +#endif dc->vmsd = &vfio_platform_vmstate; dc->desc = "VFIO-based platform device assignment"; sbc->connect_irq_notifier = vfio_start_irqfd_injection; @@ -677,6 +692,7 @@ static const TypeInfo vfio_platform_dev_info = { .name = TYPE_VFIO_PLATFORM, .parent = TYPE_SYS_BUS_DEVICE, .instance_size = sizeof(VFIOPlatformDevice), + .instance_init = vfio_platform_instance_init, .class_init = vfio_platform_class_init, .class_size = sizeof(VFIOPlatformDeviceClass), }; -- Gitee From 6b9f02dbde780118d33abb998bc72ed246f50b6a Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:50 +0800 Subject: [PATCH 033/134] vfio/ap: Allow the selection of a given iommu backend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now we support two types of iommu backends, let's add the capability to select one of them. This depends on whether an iommufd object has been linked with the vfio-ap device: if the user wants to use the legacy backend, it shall not link the vfio-ap device with any iommufd object: -device vfio-ap,sysfsdev=/sys/bus/mdev/devices/XXX This is called the legacy mode/backend. If the user wants to use the iommufd backend (/dev/iommu) it shall pass an iommufd object id in the vfio-ap device options: -object iommufd,id=iommufd0 -device vfio-ap,sysfsdev=/sys/bus/mdev/devices/XXX,iommufd=iommufd0 Suggested-by: Alex Williamson Signed-off-by: Zhenzhong Duan Reviewed-by: Matthew Rosato Reviewed-by: Cédric Le Goater Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/ap.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c index bbf69ff55a..80629609ae 100644 --- a/hw/vfio/ap.c +++ b/hw/vfio/ap.c @@ -11,10 +11,12 @@ */ #include "qemu/osdep.h" +#include CONFIG_DEVICES /* CONFIG_IOMMUFD */ #include #include #include "qapi/error.h" #include "hw/vfio/vfio-common.h" +#include "sysemu/iommufd.h" #include "hw/s390x/ap-device.h" #include "qemu/error-report.h" #include "qemu/event_notifier.h" @@ -204,6 +206,10 @@ static void vfio_ap_unrealize(DeviceState *dev) static Property vfio_ap_properties[] = { DEFINE_PROP_STRING("sysfsdev", VFIOAPDevice, vdev.sysfsdev), +#ifdef CONFIG_IOMMUFD + DEFINE_PROP_LINK("iommufd", VFIOAPDevice, vdev.iommufd, + TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *), +#endif DEFINE_PROP_END_OF_LIST(), }; -- Gitee From e4e2a6414eabe80d0d9f57446626c91c55b40afa Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:51 +0800 Subject: [PATCH 034/134] vfio/ap: Make vfio cdev pre-openable by passing a file handle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This gives management tools like libvirt a chance to open the vfio cdev with privilege and pass FD to qemu. This way qemu never needs to have privilege to open a VFIO or iommu cdev node. Signed-off-by: Zhenzhong Duan Reviewed-by: Matthew Rosato Reviewed-by: Cédric Le Goater Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/ap.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c index 80629609ae..f180e4a32a 100644 --- a/hw/vfio/ap.c +++ b/hw/vfio/ap.c @@ -160,7 +160,10 @@ static void vfio_ap_realize(DeviceState *dev, Error **errp) VFIOAPDevice *vapdev = VFIO_AP_DEVICE(dev); VFIODevice *vbasedev = &vapdev->vdev; - vbasedev->name = g_path_get_basename(vbasedev->sysfsdev); + if (vfio_device_get_name(vbasedev, errp) < 0) { + return; + } + vbasedev->ops = &vfio_ap_ops; vbasedev->type = VFIO_DEVICE_TYPE_AP; vbasedev->dev = dev; @@ -230,11 +233,28 @@ static const VMStateDescription vfio_ap_vmstate = { .unmigratable = 1, }; +static void vfio_ap_instance_init(Object *obj) +{ + VFIOAPDevice *vapdev = VFIO_AP_DEVICE(obj); + + vapdev->vdev.fd = -1; +} + +#ifdef CONFIG_IOMMUFD +static void vfio_ap_set_fd(Object *obj, const char *str, Error **errp) +{ + vfio_device_set_fd(&VFIO_AP_DEVICE(obj)->vdev, str, errp); +} +#endif + static void vfio_ap_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); device_class_set_props(dc, vfio_ap_properties); +#ifdef CONFIG_IOMMUFD + object_class_property_add_str(klass, "fd", NULL, vfio_ap_set_fd); +#endif dc->vmsd = &vfio_ap_vmstate; dc->desc = "VFIO-based AP device assignment"; set_bit(DEVICE_CATEGORY_MISC, dc->categories); @@ -249,6 +269,7 @@ static const TypeInfo vfio_ap_info = { .name = TYPE_VFIO_AP_DEVICE, .parent = TYPE_AP_DEVICE, .instance_size = sizeof(VFIOAPDevice), + .instance_init = vfio_ap_instance_init, .class_init = vfio_ap_class_init, }; -- Gitee From 5e743a2f7791f4fb3eea40806ca69f6cce1258c2 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:52 +0800 Subject: [PATCH 035/134] vfio/ccw: Allow the selection of a given iommu backend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now we support two types of iommu backends, let's add the capability to select one of them. This depends on whether an iommufd object has been linked with the vfio-ccw device: If the user wants to use the legacy backend, it shall not link the vfio-ccw device with any iommufd object: -device vfio-ccw,sysfsdev=/sys/bus/mdev/devices/XXX This is called the legacy mode/backend. If the user wants to use the iommufd backend (/dev/iommu) it shall pass an iommufd object id in the vfio-ccw device options: -object iommufd,id=iommufd0 -device vfio-ccw,sysfsdev=/sys/bus/mdev/devices/XXX,iommufd=iommufd0 Suggested-by: Alex Williamson Signed-off-by: Zhenzhong Duan Reviewed-by: Matthew Rosato Reviewed-by: Cédric Le Goater Reviewed-by: Eric Farman Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/ccw.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c index d857bb8d0f..d2d58bb677 100644 --- a/hw/vfio/ccw.c +++ b/hw/vfio/ccw.c @@ -15,12 +15,14 @@ */ #include "qemu/osdep.h" +#include CONFIG_DEVICES /* CONFIG_IOMMUFD */ #include #include #include #include "qapi/error.h" #include "hw/vfio/vfio-common.h" +#include "sysemu/iommufd.h" #include "hw/s390x/s390-ccw.h" #include "hw/s390x/vfio-ccw.h" #include "hw/qdev-properties.h" @@ -677,6 +679,10 @@ static void vfio_ccw_unrealize(DeviceState *dev) static Property vfio_ccw_properties[] = { DEFINE_PROP_STRING("sysfsdev", VFIOCCWDevice, vdev.sysfsdev), DEFINE_PROP_BOOL("force-orb-pfch", VFIOCCWDevice, force_orb_pfch, false), +#ifdef CONFIG_IOMMUFD + DEFINE_PROP_LINK("iommufd", VFIOCCWDevice, vdev.iommufd, + TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *), +#endif DEFINE_PROP_END_OF_LIST(), }; -- Gitee From 0f9545907220680ee7e85a823a0e19b216a8b7d9 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:53 +0800 Subject: [PATCH 036/134] vfio/ccw: Make vfio cdev pre-openable by passing a file handle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This gives management tools like libvirt a chance to open the vfio cdev with privilege and pass FD to qemu. This way qemu never needs to have privilege to open a VFIO or iommu cdev node. Signed-off-by: Zhenzhong Duan Reviewed-by: Matthew Rosato Reviewed-by: Cédric Le Goater Reviewed-by: Eric Farman Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/ccw.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c index d2d58bb677..2afdf17dbe 100644 --- a/hw/vfio/ccw.c +++ b/hw/vfio/ccw.c @@ -590,11 +590,12 @@ static void vfio_ccw_realize(DeviceState *dev, Error **errp) } } + if (vfio_device_get_name(vbasedev, errp) < 0) { + return; + } + vbasedev->ops = &vfio_ccw_ops; vbasedev->type = VFIO_DEVICE_TYPE_CCW; - vbasedev->name = g_strdup_printf("%x.%x.%04x", vcdev->cdev.hostid.cssid, - vcdev->cdev.hostid.ssid, - vcdev->cdev.hostid.devid); vbasedev->dev = dev; /* @@ -691,12 +692,29 @@ static const VMStateDescription vfio_ccw_vmstate = { .unmigratable = 1, }; +static void vfio_ccw_instance_init(Object *obj) +{ + VFIOCCWDevice *vcdev = VFIO_CCW(obj); + + vcdev->vdev.fd = -1; +} + +#ifdef CONFIG_IOMMUFD +static void vfio_ccw_set_fd(Object *obj, const char *str, Error **errp) +{ + vfio_device_set_fd(&VFIO_CCW(obj)->vdev, str, errp); +} +#endif + static void vfio_ccw_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); S390CCWDeviceClass *cdc = S390_CCW_DEVICE_CLASS(klass); device_class_set_props(dc, vfio_ccw_properties); +#ifdef CONFIG_IOMMUFD + object_class_property_add_str(klass, "fd", NULL, vfio_ccw_set_fd); +#endif dc->vmsd = &vfio_ccw_vmstate; dc->desc = "VFIO-based subchannel assignment"; set_bit(DEVICE_CATEGORY_MISC, dc->categories); @@ -714,6 +732,7 @@ static const TypeInfo vfio_ccw_info = { .name = TYPE_VFIO_CCW, .parent = TYPE_S390_CCW, .instance_size = sizeof(VFIOCCWDevice), + .instance_init = vfio_ccw_instance_init, .class_init = vfio_ccw_class_init, }; -- Gitee From f702d050b4309bb7e7ffc159a3c41c82fe34ba07 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:54 +0800 Subject: [PATCH 037/134] vfio: Make VFIOContainerBase poiner parameter const in VFIOIOMMUOps callbacks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some of the callbacks in VFIOIOMMUOps pass VFIOContainerBase poiner, those callbacks only need read access to the sub object of VFIOContainerBase. So make VFIOContainerBase, VFIOContainer and VFIOIOMMUFDContainer as const in these callbacks. Local functions called by those callbacks also need same changes to avoid build error. Modify vfio_lookup_match_range/vfio_legacy_dma_map during backporting. Suggested-by: Cédric Le Goater Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Reviewed-by: Eric Auger Tested-by: Eric Auger Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/common.c | 9 +++---- hw/vfio/container-base.c | 2 +- hw/vfio/container.c | 34 ++++++++++++++------------- hw/vfio/iommufd.c | 8 +++---- include/hw/vfio/vfio-common.h | 14 ++++++----- include/hw/vfio/vfio-container-base.h | 12 ++++++---- 6 files changed, 43 insertions(+), 36 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 0e900c6746..d572ec5880 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -204,7 +204,7 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainerBase *bcontainer) return true; } -bool vfio_devices_all_device_dirty_tracking(VFIOContainerBase *bcontainer) +bool vfio_devices_all_device_dirty_tracking(const VFIOContainerBase *bcontainer) { VFIODevice *vbasedev; @@ -221,7 +221,8 @@ bool vfio_devices_all_device_dirty_tracking(VFIOContainerBase *bcontainer) * Check if all VFIO devices are running and migration is active, which is * essentially equivalent to the migration being in pre-copy phase. */ -bool vfio_devices_all_running_and_mig_active(VFIOContainerBase *bcontainer) +bool +vfio_devices_all_running_and_mig_active(const VFIOContainerBase *bcontainer) { VFIODevice *vbasedev; @@ -1139,7 +1140,7 @@ static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova, return 0; } -int vfio_devices_query_dirty_bitmap(VFIOContainerBase *bcontainer, +int vfio_devices_query_dirty_bitmap(const VFIOContainerBase *bcontainer, VFIOBitmap *vbmap, hwaddr iova, hwaddr size) { @@ -1162,7 +1163,7 @@ int vfio_devices_query_dirty_bitmap(VFIOContainerBase *bcontainer, return 0; } -int vfio_get_dirty_bitmap(VFIOContainerBase *bcontainer, uint64_t iova, +int vfio_get_dirty_bitmap(const VFIOContainerBase *bcontainer, uint64_t iova, uint64_t size, ram_addr_t ram_addr) { bool all_device_dirty_tracking = diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c index eee2dcfe76..1ffd25bbfa 100644 --- a/hw/vfio/container-base.c +++ b/hw/vfio/container-base.c @@ -63,7 +63,7 @@ int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer, return bcontainer->ops->set_dirty_page_tracking(bcontainer, start); } -int vfio_container_query_dirty_bitmap(VFIOContainerBase *bcontainer, +int vfio_container_query_dirty_bitmap(const VFIOContainerBase *bcontainer, VFIOBitmap *vbmap, hwaddr iova, hwaddr size) { diff --git a/hw/vfio/container.c b/hw/vfio/container.c index e32e1b51e0..67aeaa825b 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -63,11 +63,11 @@ static int vfio_ram_block_discard_disable(VFIOContainer *container, bool state) } } -static int vfio_dma_unmap_bitmap(VFIOContainer *container, +static int vfio_dma_unmap_bitmap(const VFIOContainer *container, hwaddr iova, ram_addr_t size, IOMMUTLBEntry *iotlb) { - VFIOContainerBase *bcontainer = &container->bcontainer; + const VFIOContainerBase *bcontainer = &container->bcontainer; struct vfio_iommu_type1_dma_unmap *unmap; struct vfio_bitmap *bitmap; VFIOBitmap vbmap; @@ -116,7 +116,7 @@ unmap_exit: return ret; } -VFIODMARange *vfio_lookup_match_range(VFIOContainer *container, +VFIODMARange *vfio_lookup_match_range(const VFIOContainer *container, hwaddr start_addr, hwaddr size) { VFIODMARange *qrange; @@ -142,11 +142,12 @@ void vfio_dma_range_init_dirty_bitmap(VFIODMARange *qrange) /* * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86 */ -static int vfio_legacy_dma_unmap(VFIOContainerBase *bcontainer, hwaddr iova, - ram_addr_t size, IOMMUTLBEntry *iotlb) +static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + IOMMUTLBEntry *iotlb) { - VFIOContainer *container = container_of(bcontainer, VFIOContainer, - bcontainer); + const VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); struct vfio_iommu_type1_dma_unmap unmap = { .argsz = sizeof(unmap), .flags = 0, @@ -216,11 +217,11 @@ static int vfio_legacy_dma_unmap(VFIOContainerBase *bcontainer, hwaddr iova, return 0; } -static int vfio_legacy_dma_map(VFIOContainerBase *bcontainer, hwaddr iova, +static int vfio_legacy_dma_map(const VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, void *vaddr, bool readonly) { VFIOContainer *container = container_of(bcontainer, VFIOContainer, - bcontainer); + bcontainer); struct vfio_iommu_type1_dma_map map = { .argsz = sizeof(map), .flags = VFIO_DMA_MAP_FLAG_READ, @@ -257,11 +258,12 @@ static int vfio_legacy_dma_map(VFIOContainerBase *bcontainer, hwaddr iova, return -errno; } -static int vfio_legacy_set_dirty_page_tracking(VFIOContainerBase *bcontainer, - bool start) +static int +vfio_legacy_set_dirty_page_tracking(const VFIOContainerBase *bcontainer, + bool start) { - VFIOContainer *container = container_of(bcontainer, VFIOContainer, - bcontainer); + const VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); int ret; struct vfio_iommu_type1_dirty_bitmap dirty = { .argsz = sizeof(dirty), @@ -283,12 +285,12 @@ static int vfio_legacy_set_dirty_page_tracking(VFIOContainerBase *bcontainer, return ret; } -static int vfio_legacy_query_dirty_bitmap(VFIOContainerBase *bcontainer, +static int vfio_legacy_query_dirty_bitmap(const VFIOContainerBase *bcontainer, VFIOBitmap *vbmap, hwaddr iova, hwaddr size) { - VFIOContainer *container = container_of(bcontainer, VFIOContainer, - bcontainer); + const VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); struct vfio_iommu_type1_dirty_bitmap *dbitmap; struct vfio_iommu_type1_dirty_bitmap_get *range; int ret; diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 5accd26484..87a561c545 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -26,10 +26,10 @@ #include "qemu/chardev_open.h" #include "pci.h" -static int iommufd_cdev_map(VFIOContainerBase *bcontainer, hwaddr iova, +static int iommufd_cdev_map(const VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, void *vaddr, bool readonly) { - VFIOIOMMUFDContainer *container = + const VFIOIOMMUFDContainer *container = container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); return iommufd_backend_map_dma(container->be, @@ -37,11 +37,11 @@ static int iommufd_cdev_map(VFIOContainerBase *bcontainer, hwaddr iova, iova, size, vaddr, readonly); } -static int iommufd_cdev_unmap(VFIOContainerBase *bcontainer, +static int iommufd_cdev_unmap(const VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, IOMMUTLBEntry *iotlb) { - VFIOIOMMUFDContainer *container = + const VFIOIOMMUFDContainer *container = container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); /* TODO: Handle dma_unmap_bitmap with iotlb args (migration) */ diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 5f35f2900b..37f01410d5 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -186,7 +186,7 @@ typedef struct VFIODisplay { VFIOAddressSpace *vfio_get_address_space(AddressSpace *as); void vfio_put_address_space(VFIOAddressSpace *space); -VFIODMARange *vfio_lookup_match_range(VFIOContainer *container, +VFIODMARange *vfio_lookup_match_range(const VFIOContainer *container, hwaddr start_addr, hwaddr size); void vfio_dma_range_init_dirty_bitmap(VFIODMARange *qrange); @@ -258,13 +258,15 @@ bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp); void vfio_migration_exit(VFIODevice *vbasedev); int vfio_bitmap_alloc(VFIOBitmap *vbmap, hwaddr size); -bool vfio_devices_all_running_and_mig_active(VFIOContainerBase *bcontainer); -bool vfio_devices_all_device_dirty_tracking(VFIOContainerBase *bcontainer); -int vfio_devices_query_dirty_bitmap(VFIOContainerBase *bcontainer, +bool +vfio_devices_all_running_and_mig_active(const VFIOContainerBase *bcontainer); +bool +vfio_devices_all_device_dirty_tracking(const VFIOContainerBase *bcontainer); +int vfio_devices_query_dirty_bitmap(const VFIOContainerBase *bcontainer, VFIOBitmap *vbmap, hwaddr iova, hwaddr size); -int vfio_get_dirty_bitmap(VFIOContainerBase *bcontainer, uint64_t iova, - uint64_t size, ram_addr_t ram_addr); +int vfio_get_dirty_bitmap(const VFIOContainerBase *bcontainer, uint64_t iova, + uint64_t size, ram_addr_t ram_addr); /* Returns 0 on success, or a negative errno. */ int vfio_device_get_name(VFIODevice *vbasedev, Error **errp); diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index 45bb19c767..2ae297ccda 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -82,7 +82,7 @@ void vfio_container_del_section_window(VFIOContainerBase *bcontainer, MemoryRegionSection *section); int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer, bool start); -int vfio_container_query_dirty_bitmap(VFIOContainerBase *bcontainer, +int vfio_container_query_dirty_bitmap(const VFIOContainerBase *bcontainer, VFIOBitmap *vbmap, hwaddr iova, hwaddr size); @@ -93,18 +93,20 @@ void vfio_container_destroy(VFIOContainerBase *bcontainer); struct VFIOIOMMUOps { /* basic feature */ - int (*dma_map)(VFIOContainerBase *bcontainer, + int (*dma_map)(const VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, void *vaddr, bool readonly); - int (*dma_unmap)(VFIOContainerBase *bcontainer, + int (*dma_unmap)(const VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, IOMMUTLBEntry *iotlb); int (*attach_device)(const char *name, VFIODevice *vbasedev, AddressSpace *as, Error **errp); void (*detach_device)(VFIODevice *vbasedev); /* migration feature */ - int (*set_dirty_page_tracking)(VFIOContainerBase *bcontainer, bool start); - int (*query_dirty_bitmap)(VFIOContainerBase *bcontainer, VFIOBitmap *vbmap, + int (*set_dirty_page_tracking)(const VFIOContainerBase *bcontainer, + bool start); + int (*query_dirty_bitmap)(const VFIOContainerBase *bcontainer, + VFIOBitmap *vbmap, hwaddr iova, hwaddr size); /* PCI specific */ int (*pci_hot_reset)(VFIODevice *vbasedev, bool single); -- Gitee From bcb031b40fe40d5b6347b2134fb039945b87e8a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Sat, 11 Jan 2025 10:52:55 +0800 Subject: [PATCH 038/134] hw/arm: Activate IOMMUFD for virt machines MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Cédric Le Goater Signed-off-by: Zhenzhong Duan Reviewed-by: Eric Auger Tested-by: Eric Auger Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/arm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/arm/Kconfig b/hw/arm/Kconfig index c0a7d0bd58..4a0ea0628f 100644 --- a/hw/arm/Kconfig +++ b/hw/arm/Kconfig @@ -8,6 +8,7 @@ config ARM_VIRT imply TPM_TIS_SYSBUS imply TPM_TIS_I2C imply NVDIMM + imply IOMMUFD select ARM_GIC select ACPI select ARM_SMMUV3 -- Gitee From 3dfc0dd0b59925d1b73ca1a0db6d307ae597f76e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Sat, 11 Jan 2025 10:52:56 +0800 Subject: [PATCH 039/134] kconfig: Activate IOMMUFD for s390x machines MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Cédric Le Goater Signed-off-by: Zhenzhong Duan Reviewed-by: Matthew Rosato Reviewed-by: Eric Farman Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/s390x/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/s390x/Kconfig b/hw/s390x/Kconfig index 4c068d7960..26ad104485 100644 --- a/hw/s390x/Kconfig +++ b/hw/s390x/Kconfig @@ -6,6 +6,7 @@ config S390_CCW_VIRTIO imply VFIO_CCW imply WDT_DIAG288 imply PCIE_DEVICES + imply IOMMUFD select PCI_EXPRESS select S390_FLIC select S390_FLIC_KVM if KVM -- Gitee From 5405fa36c5f2784a9a6b19ee60d44b6cffb9f769 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Sat, 11 Jan 2025 10:52:57 +0800 Subject: [PATCH 040/134] hw/i386: Activate IOMMUFD for q35 machines MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Cédric Le Goater Signed-off-by: Zhenzhong Duan Reviewed-by: Eric Auger Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/i386/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/i386/Kconfig b/hw/i386/Kconfig index 682e324f1c..908f29e02b 100644 --- a/hw/i386/Kconfig +++ b/hw/i386/Kconfig @@ -105,6 +105,7 @@ config Q35 imply E1000E_PCI_EXPRESS imply VMPORT imply VMMOUSE + imply IOMMUFD select PC_PCI select PC_ACPI select PCI_EXPRESS_Q35 -- Gitee From 0781636a0c5652c25f81c06ba5fc289966021a33 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:58 +0800 Subject: [PATCH 041/134] vfio/pci: Move VFIODevice initializations in vfio_instance_init MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some of the VFIODevice initializations is in vfio_realize, move all of them in vfio_instance_init. No functional change intended. Suggested-by: Cédric Le Goater Signed-off-by: Zhenzhong Duan Reviewed-by: Philippe Mathieu-Daudé Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/pci.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 445d58c8e5..87405584d7 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -2969,9 +2969,6 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) if (vfio_device_get_name(vbasedev, errp) < 0) { return; } - vbasedev->ops = &vfio_pci_ops; - vbasedev->type = VFIO_DEVICE_TYPE_PCI; - vbasedev->dev = DEVICE(vdev); /* * Mediated devices *might* operate compatibly with discarding of RAM, but @@ -3320,6 +3317,7 @@ static void vfio_instance_init(Object *obj) { PCIDevice *pci_dev = PCI_DEVICE(obj); VFIOPCIDevice *vdev = VFIO_PCI(obj); + VFIODevice *vbasedev = &vdev->vbasedev; device_add_bootindex_property(obj, &vdev->bootindex, "bootindex", NULL, @@ -3328,7 +3326,11 @@ static void vfio_instance_init(Object *obj) vdev->host.bus = ~0U; vdev->host.slot = ~0U; vdev->host.function = ~0U; - vdev->vbasedev.fd = -1; + + vbasedev->type = VFIO_DEVICE_TYPE_PCI; + vbasedev->ops = &vfio_pci_ops; + vbasedev->dev = DEVICE(vdev); + vbasedev->fd = -1; vdev->nv_gpudirect_clique = 0xFF; -- Gitee From 594a30d0a9d0d569cf264ffd7b042aa39a404383 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:59 +0800 Subject: [PATCH 042/134] vfio/platform: Move VFIODevice initializations in vfio_platform_instance_init MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some of the VFIODevice initializations is in vfio_platform_realize, move all of them in vfio_platform_instance_init. No functional change intended. Suggested-by: Cédric Le Goater Signed-off-by: Zhenzhong Duan Reviewed-by: Philippe Mathieu-Daudé Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/platform.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c index a97d9c6234..506eb8193f 100644 --- a/hw/vfio/platform.c +++ b/hw/vfio/platform.c @@ -581,10 +581,6 @@ static void vfio_platform_realize(DeviceState *dev, Error **errp) VFIODevice *vbasedev = &vdev->vbasedev; int i, ret; - vbasedev->type = VFIO_DEVICE_TYPE_PLATFORM; - vbasedev->dev = dev; - vbasedev->ops = &vfio_platform_ops; - qemu_mutex_init(&vdev->intp_mutex); trace_vfio_platform_realize(vbasedev->sysfsdev ? @@ -659,8 +655,12 @@ static Property vfio_platform_dev_properties[] = { static void vfio_platform_instance_init(Object *obj) { VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(obj); + VFIODevice *vbasedev = &vdev->vbasedev; - vdev->vbasedev.fd = -1; + vbasedev->type = VFIO_DEVICE_TYPE_PLATFORM; + vbasedev->ops = &vfio_platform_ops; + vbasedev->dev = DEVICE(vdev); + vbasedev->fd = -1; } #ifdef CONFIG_IOMMUFD -- Gitee From 69da3907dc07bdb3cab4519922842820388bac4c Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:53:00 +0800 Subject: [PATCH 043/134] vfio/ap: Move VFIODevice initializations in vfio_ap_instance_init MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some of the VFIODevice initializations is in vfio_ap_realize, move all of them in vfio_ap_instance_init. No functional change intended. Suggested-by: Cédric Le Goater Signed-off-by: Zhenzhong Duan Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Eric Farman Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/ap.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c index f180e4a32a..95fe7cd98b 100644 --- a/hw/vfio/ap.c +++ b/hw/vfio/ap.c @@ -164,18 +164,6 @@ static void vfio_ap_realize(DeviceState *dev, Error **errp) return; } - vbasedev->ops = &vfio_ap_ops; - vbasedev->type = VFIO_DEVICE_TYPE_AP; - vbasedev->dev = dev; - - /* - * vfio-ap devices operate in a way compatible with discarding of - * memory in RAM blocks, as no pages are pinned in the host. - * This needs to be set before vfio_get_device() for vfio common to - * handle ram_block_discard_disable(). - */ - vapdev->vdev.ram_block_discard_allowed = true; - ret = vfio_attach_device(vbasedev->name, vbasedev, &address_space_memory, errp); if (ret) { @@ -236,8 +224,20 @@ static const VMStateDescription vfio_ap_vmstate = { static void vfio_ap_instance_init(Object *obj) { VFIOAPDevice *vapdev = VFIO_AP_DEVICE(obj); + VFIODevice *vbasedev = &vapdev->vdev; - vapdev->vdev.fd = -1; + vbasedev->type = VFIO_DEVICE_TYPE_AP; + vbasedev->ops = &vfio_ap_ops; + vbasedev->dev = DEVICE(vapdev); + vbasedev->fd = -1; + + /* + * vfio-ap devices operate in a way compatible with discarding of + * memory in RAM blocks, as no pages are pinned in the host. + * This needs to be set before vfio_get_device() for vfio common to + * handle ram_block_discard_disable(). + */ + vbasedev->ram_block_discard_allowed = true; } #ifdef CONFIG_IOMMUFD -- Gitee From 4d12d39e824a35014f753a25e5aa8ec0e275a38c Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:53:01 +0800 Subject: [PATCH 044/134] vfio/ccw: Move VFIODevice initializations in vfio_ccw_instance_init MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some of the VFIODevice initializations is in vfio_ccw_realize, move all of them in vfio_ccw_instance_init. No functional change intended. Suggested-by: Cédric Le Goater Signed-off-by: Zhenzhong Duan Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Eric Farman Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/ccw.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c index 2afdf17dbe..6305a4c1b8 100644 --- a/hw/vfio/ccw.c +++ b/hw/vfio/ccw.c @@ -594,20 +594,6 @@ static void vfio_ccw_realize(DeviceState *dev, Error **errp) return; } - vbasedev->ops = &vfio_ccw_ops; - vbasedev->type = VFIO_DEVICE_TYPE_CCW; - vbasedev->dev = dev; - - /* - * All vfio-ccw devices are believed to operate in a way compatible with - * discarding of memory in RAM blocks, ie. pages pinned in the host are - * in the current working set of the guest driver and therefore never - * overlap e.g., with pages available to the guest balloon driver. This - * needs to be set before vfio_get_device() for vfio common to handle - * ram_block_discard_disable(). - */ - vbasedev->ram_block_discard_allowed = true; - ret = vfio_attach_device(cdev->mdevid, vbasedev, &address_space_memory, errp); if (ret) { @@ -695,8 +681,22 @@ static const VMStateDescription vfio_ccw_vmstate = { static void vfio_ccw_instance_init(Object *obj) { VFIOCCWDevice *vcdev = VFIO_CCW(obj); + VFIODevice *vbasedev = &vcdev->vdev; + + vbasedev->type = VFIO_DEVICE_TYPE_CCW; + vbasedev->ops = &vfio_ccw_ops; + vbasedev->dev = DEVICE(vcdev); + vbasedev->fd = -1; - vcdev->vdev.fd = -1; + /* + * All vfio-ccw devices are believed to operate in a way compatible with + * discarding of memory in RAM blocks, ie. pages pinned in the host are + * in the current working set of the guest driver and therefore never + * overlap e.g., with pages available to the guest balloon driver. This + * needs to be set before vfio_get_device() for vfio common to handle + * ram_block_discard_disable(). + */ + vbasedev->ram_block_discard_allowed = true; } #ifdef CONFIG_IOMMUFD -- Gitee From 65c5381ba3ce5f062f0be9aa796e68b8a9d6bb3c Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:53:02 +0800 Subject: [PATCH 045/134] vfio: Introduce a helper function to initialize VFIODevice MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce a helper function to replace the common code to initialize VFIODevice in pci, platform, ap and ccw VFIO device. No functional change intended. Suggested-by: Cédric Le Goater Signed-off-by: Zhenzhong Duan Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/ap.c | 8 ++------ hw/vfio/ccw.c | 8 ++------ hw/vfio/helpers.c | 11 +++++++++++ hw/vfio/pci.c | 6 ++---- hw/vfio/platform.c | 6 ++---- include/hw/vfio/vfio-common.h | 2 ++ 6 files changed, 21 insertions(+), 20 deletions(-) diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c index 95fe7cd98b..e157aa1ff7 100644 --- a/hw/vfio/ap.c +++ b/hw/vfio/ap.c @@ -226,18 +226,14 @@ static void vfio_ap_instance_init(Object *obj) VFIOAPDevice *vapdev = VFIO_AP_DEVICE(obj); VFIODevice *vbasedev = &vapdev->vdev; - vbasedev->type = VFIO_DEVICE_TYPE_AP; - vbasedev->ops = &vfio_ap_ops; - vbasedev->dev = DEVICE(vapdev); - vbasedev->fd = -1; - /* * vfio-ap devices operate in a way compatible with discarding of * memory in RAM blocks, as no pages are pinned in the host. * This needs to be set before vfio_get_device() for vfio common to * handle ram_block_discard_disable(). */ - vbasedev->ram_block_discard_allowed = true; + vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_AP, &vfio_ap_ops, + DEVICE(vapdev), true); } #ifdef CONFIG_IOMMUFD diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c index 6305a4c1b8..90e4a53437 100644 --- a/hw/vfio/ccw.c +++ b/hw/vfio/ccw.c @@ -683,11 +683,6 @@ static void vfio_ccw_instance_init(Object *obj) VFIOCCWDevice *vcdev = VFIO_CCW(obj); VFIODevice *vbasedev = &vcdev->vdev; - vbasedev->type = VFIO_DEVICE_TYPE_CCW; - vbasedev->ops = &vfio_ccw_ops; - vbasedev->dev = DEVICE(vcdev); - vbasedev->fd = -1; - /* * All vfio-ccw devices are believed to operate in a way compatible with * discarding of memory in RAM blocks, ie. pages pinned in the host are @@ -696,7 +691,8 @@ static void vfio_ccw_instance_init(Object *obj) * needs to be set before vfio_get_device() for vfio common to handle * ram_block_discard_disable(). */ - vbasedev->ram_block_discard_allowed = true; + vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_CCW, &vfio_ccw_ops, + DEVICE(vcdev), true); } #ifdef CONFIG_IOMMUFD diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c index 3592c3d54e..6789870802 100644 --- a/hw/vfio/helpers.c +++ b/hw/vfio/helpers.c @@ -652,3 +652,14 @@ void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp) } vbasedev->fd = fd; } + +void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops, + DeviceState *dev, bool ram_discard) +{ + vbasedev->type = type; + vbasedev->ops = ops; + vbasedev->dev = dev; + vbasedev->fd = -1; + + vbasedev->ram_block_discard_allowed = ram_discard; +} diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 87405584d7..1874ec1aba 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -3327,10 +3327,8 @@ static void vfio_instance_init(Object *obj) vdev->host.slot = ~0U; vdev->host.function = ~0U; - vbasedev->type = VFIO_DEVICE_TYPE_PCI; - vbasedev->ops = &vfio_pci_ops; - vbasedev->dev = DEVICE(vdev); - vbasedev->fd = -1; + vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_PCI, &vfio_pci_ops, + DEVICE(vdev), false); vdev->nv_gpudirect_clique = 0xFF; diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c index 506eb8193f..a8d9b7da63 100644 --- a/hw/vfio/platform.c +++ b/hw/vfio/platform.c @@ -657,10 +657,8 @@ static void vfio_platform_instance_init(Object *obj) VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(obj); VFIODevice *vbasedev = &vdev->vbasedev; - vbasedev->type = VFIO_DEVICE_TYPE_PLATFORM; - vbasedev->ops = &vfio_platform_ops; - vbasedev->dev = DEVICE(vdev); - vbasedev->fd = -1; + vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_PLATFORM, &vfio_platform_ops, + DEVICE(vdev), false); } #ifdef CONFIG_IOMMUFD diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 37f01410d5..151b2ab65f 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -271,4 +271,6 @@ int vfio_get_dirty_bitmap(const VFIOContainerBase *bcontainer, uint64_t iova, /* Returns 0 on success, or a negative errno. */ int vfio_device_get_name(VFIODevice *vbasedev, Error **errp); void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp); +void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops, + DeviceState *dev, bool ram_discard); #endif /* HW_VFIO_VFIO_COMMON_H */ -- Gitee From fd1d6d64803a052adcab8c7993ca40cabc9c926d Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:53:03 +0800 Subject: [PATCH 046/134] docs/devel: Add VFIO iommufd backend documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Suggested-by: Cédric Le Goater Signed-off-by: Eric Auger Signed-off-by: Yi Liu Signed-off-by: Zhenzhong Duan Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- MAINTAINERS | 1 + docs/devel/index-internals.rst | 1 + docs/devel/vfio-iommufd.rst | 166 +++++++++++++++++++++++++++++++++ 3 files changed, 168 insertions(+) create mode 100644 docs/devel/vfio-iommufd.rst diff --git a/MAINTAINERS b/MAINTAINERS index ca70bb4e64..0ddb20a35f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2176,6 +2176,7 @@ F: backends/iommufd.c F: include/sysemu/iommufd.h F: include/qemu/chardev_open.h F: util/chardev_open.c +F: docs/devel/vfio-iommufd.rst vhost M: Michael S. Tsirkin diff --git a/docs/devel/index-internals.rst b/docs/devel/index-internals.rst index 6f81df92bc..3def4a138b 100644 --- a/docs/devel/index-internals.rst +++ b/docs/devel/index-internals.rst @@ -18,5 +18,6 @@ Details about QEMU's various subsystems including how to add features to them. s390-dasd-ipl tracing vfio-migration + vfio-iommufd writing-monitor-commands virtio-backends diff --git a/docs/devel/vfio-iommufd.rst b/docs/devel/vfio-iommufd.rst new file mode 100644 index 0000000000..3d1c11f175 --- /dev/null +++ b/docs/devel/vfio-iommufd.rst @@ -0,0 +1,166 @@ +=============================== +IOMMUFD BACKEND usage with VFIO +=============================== + +(Same meaning for backend/container/BE) + +With the introduction of iommufd, the Linux kernel provides a generic +interface for user space drivers to propagate their DMA mappings to kernel +for assigned devices. While the legacy kernel interface is group-centric, +the new iommufd interface is device-centric, relying on device fd and iommufd. + +To support both interfaces in the QEMU VFIO device, introduce a base container +to abstract the common part of VFIO legacy and iommufd container. So that the +generic VFIO code can use either container. + +The base container implements generic functions such as memory_listener and +address space management whereas the derived container implements callbacks +specific to either legacy or iommufd. Each container has its own way to setup +secure context and dma management interface. The below diagram shows how it +looks like with both containers. + +:: + + VFIO AddressSpace/Memory + +-------+ +----------+ +-----+ +-----+ + | pci | | platform | | ap | | ccw | + +---+---+ +----+-----+ +--+--+ +--+--+ +----------------------+ + | | | | | AddressSpace | + | | | | +------------+---------+ + +---V-----------V-----------V--------V----+ / + | VFIOAddressSpace | <------------+ + | | | MemoryListener + | VFIOContainerBase list | + +-------+----------------------------+----+ + | | + | | + +-------V------+ +--------V----------+ + | iommufd | | vfio legacy | + | container | | container | + +-------+------+ +--------+----------+ + | | + | /dev/iommu | /dev/vfio/vfio + | /dev/vfio/devices/vfioX | /dev/vfio/$group_id + Userspace | | + ============+============================+=========================== + Kernel | device fd | + +---------------+ | group/container fd + | (BIND_IOMMUFD | | (SET_CONTAINER/SET_IOMMU) + | ATTACH_IOAS) | | device fd + | | | + | +-------V------------V-----------------+ + iommufd | | vfio | + (map/unmap | +---------+--------------------+-------+ + ioas_copy) | | | map/unmap + | | | + +------V------+ +-----V------+ +------V--------+ + | iommfd core | | device | | vfio iommu | + +-------------+ +------------+ +---------------+ + +* Secure Context setup + + - iommufd BE: uses device fd and iommufd to setup secure context + (bind_iommufd, attach_ioas) + - vfio legacy BE: uses group fd and container fd to setup secure context + (set_container, set_iommu) + +* Device access + + - iommufd BE: device fd is opened through ``/dev/vfio/devices/vfioX`` + - vfio legacy BE: device fd is retrieved from group fd ioctl + +* DMA Mapping flow + + 1. VFIOAddressSpace receives MemoryRegion add/del via MemoryListener + 2. VFIO populates DMA map/unmap via the container BEs + * iommufd BE: uses iommufd + * vfio legacy BE: uses container fd + +Example configuration +===================== + +Step 1: configure the host device +--------------------------------- + +It's exactly same as the VFIO device with legacy VFIO container. + +Step 2: configure QEMU +---------------------- + +Interactions with the ``/dev/iommu`` are abstracted by a new iommufd +object (compiled in with the ``CONFIG_IOMMUFD`` option). + +Any QEMU device (e.g. VFIO device) wishing to use ``/dev/iommu`` must +be linked with an iommufd object. It gets a new optional property +named iommufd which allows to pass an iommufd object. Take ``vfio-pci`` +device for example: + +.. code-block:: bash + + -object iommufd,id=iommufd0 + -device vfio-pci,host=0000:02:00.0,iommufd=iommufd0 + +Note the ``/dev/iommu`` and VFIO cdev can be externally opened by a +management layer. In such a case the fd is passed, the fd supports a +string naming the fd or a number, for example: + +.. code-block:: bash + + -object iommufd,id=iommufd0,fd=22 + -device vfio-pci,iommufd=iommufd0,fd=23 + +If the ``fd`` property is not passed, the fd is opened by QEMU. + +If no ``iommufd`` object is passed to the ``vfio-pci`` device, iommufd +is not used and the user gets the behavior based on the legacy VFIO +container: + +.. code-block:: bash + + -device vfio-pci,host=0000:02:00.0 + +Supported platform +================== + +Supports x86, ARM and s390x currently. + +Caveats +======= + +Dirty page sync +--------------- + +Dirty page sync with iommufd backend is unsupported yet, live migration is +disabled by default. But it can be force enabled like below, low efficient +though. + +.. code-block:: bash + + -object iommufd,id=iommufd0 + -device vfio-pci,host=0000:02:00.0,iommufd=iommufd0,enable-migration=on + +P2P DMA +------- + +PCI p2p DMA is unsupported as IOMMUFD doesn't support mapping hardware PCI +BAR region yet. Below warning shows for assigned PCI device, it's not a bug. + +.. code-block:: none + + qemu-system-x86_64: warning: IOMMU_IOAS_MAP failed: Bad address, PCI BAR? + qemu-system-x86_64: vfio_container_dma_map(0x560cb6cb1620, 0xe000000021000, 0x3000, 0x7f32ed55c000) = -14 (Bad address) + +FD passing with mdev +-------------------- + +``vfio-pci`` device checks sysfsdev property to decide if backend is a mdev. +If FD passing is used, there is no way to know that and the mdev is treated +like a real PCI device. There is an error as below if user wants to enable +RAM discarding for mdev. + +.. code-block:: none + + qemu-system-x86_64: -device vfio-pci,iommufd=iommufd0,x-balloon-allowed=on,fd=9: vfio VFIO_FD9: x-balloon-allowed only potentially compatible with mdev devices + +``vfio-ap`` and ``vfio-ccw`` devices don't have same issue as their backend +devices are always mdev and RAM discarding is force enabled. -- Gitee From 1bb64d6e69c385af5817dc6f0c3bbd204783c237 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Tue, 19 Dec 2023 07:58:17 +0100 Subject: [PATCH 047/134] vfio/container: Introduce vfio_legacy_setup() for further cleanups MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This will help subsequent patches to unify the initialization of type1 and sPAPR IOMMU backends. Reviewed-by: Zhenzhong Duan Tested-by: Eric Farman Signed-off-by: Cédric Le Goater --- hw/vfio/container.c | 60 +++++++++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 24 deletions(-) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 67aeaa825b..27ce31c883 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -567,6 +567,35 @@ static void shared_memory_listener_unregister(void) g_shl = NULL; } +static int vfio_legacy_setup(VFIOContainerBase *bcontainer, Error **errp) +{ + VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); + g_autofree struct vfio_iommu_type1_info *info = NULL; + int ret; + + ret = vfio_get_iommu_info(container, &info); + if (ret) { + error_setg_errno(errp, -ret, "Failed to get VFIO IOMMU info"); + return ret; + } + + if (info->flags & VFIO_IOMMU_INFO_PGSIZES) { + bcontainer->pgsizes = info->iova_pgsizes; + } else { + bcontainer->pgsizes = qemu_real_host_page_size(); + } + + if (!vfio_get_info_dma_avail(info, &bcontainer->dma_max_mappings)) { + bcontainer->dma_max_mappings = 65535; + } + + vfio_get_info_iova_range(info, bcontainer); + + vfio_get_iommu_info_migration(container, info); + return 0; +} + static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, Error **errp) { @@ -665,31 +694,8 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, case VFIO_TYPE1v2_IOMMU: case VFIO_TYPE1_IOMMU: case VFIO_TYPE1v2_S_IOMMU: - { - struct vfio_iommu_type1_info *info; - - ret = vfio_get_iommu_info(container, &info); - if (ret) { - error_setg_errno(errp, -ret, "Failed to get VFIO IOMMU info"); - goto enable_discards_exit; - } - - if (info->flags & VFIO_IOMMU_INFO_PGSIZES) { - bcontainer->pgsizes = info->iova_pgsizes; - } else { - bcontainer->pgsizes = qemu_real_host_page_size(); - } - - if (!vfio_get_info_dma_avail(info, &bcontainer->dma_max_mappings)) { - bcontainer->dma_max_mappings = 65535; - } - - vfio_get_info_iova_range(info, bcontainer); - - vfio_get_iommu_info_migration(container, info); - g_free(info); + ret = vfio_legacy_setup(bcontainer, errp); break; - } case VFIO_SPAPR_TCE_v2_IOMMU: case VFIO_SPAPR_TCE_IOMMU: { @@ -699,6 +705,12 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, } break; } + default: + g_assert_not_reached(); + } + + if (ret) { + goto enable_discards_exit; } vfio_kvm_device_add_group(group); -- Gitee From 7a81c3919dda48b4e12b83ceb661896523cce6ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Tue, 19 Dec 2023 07:58:18 +0100 Subject: [PATCH 048/134] vfio/container: Initialize VFIOIOMMUOps under vfio_init_container() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vfio_init_container() already defines the IOMMU type of the container. Do the same for the VFIOIOMMUOps struct. This prepares ground for the following patches that will deduce the associated VFIOIOMMUOps struct from the IOMMU type. Reviewed-by: Zhenzhong Duan Tested-by: Eric Farman Signed-off-by: Cédric Le Goater --- hw/vfio/container.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 27ce31c883..dc805ceb12 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -430,7 +430,7 @@ static int vfio_get_iommu_type(VFIOContainer *container, } static int vfio_init_container(VFIOContainer *container, int group_fd, - Error **errp) + VFIOAddressSpace *space, Error **errp) { int iommu_type, dirty_log_manual_clear, ret; @@ -467,7 +467,7 @@ static int vfio_init_container(VFIOContainer *container, int group_fd, if (dirty_log_manual_clear) { container->dirty_log_manual_clear = dirty_log_manual_clear; } - + vfio_container_init(&container->bcontainer, space, &vfio_legacy_ops); return 0; } @@ -679,7 +679,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, bcontainer = &container->bcontainer; vfio_container_init(bcontainer, space, &vfio_legacy_ops); - ret = vfio_init_container(container, group->fd, errp); + ret = vfio_init_container(container, group->fd, space, errp); if (ret) { goto free_container_exit; } -- Gitee From 5f62836c64d5abdbdb0d8fb9f0d2fd0d87f47b0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Tue, 19 Dec 2023 07:58:19 +0100 Subject: [PATCH 049/134] vfio/container: Introduce a VFIOIOMMU QOM interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit VFIOContainerBase was not introduced as an abstract QOM object because it felt unnecessary to expose all the IOMMU backends to the QEMU machine and human interface. However, we can still abstract the IOMMU backend handlers using a QOM interface class. This provides more flexibility when referencing the various implementations. Simply transform the VFIOIOMMUOps struct in an InterfaceClass and do some initial name replacements. Next changes will start converting VFIOIOMMUOps. Reviewed-by: Zhenzhong Duan Tested-by: Eric Farman Signed-off-by: Cédric Le Goater --- hw/vfio/common.c | 2 +- hw/vfio/container-base.c | 12 +++++++++++- hw/vfio/pci.c | 2 +- include/hw/vfio/vfio-container-base.h | 23 +++++++++++++++++++---- 4 files changed, 32 insertions(+), 7 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index d572ec5880..abca6aa01a 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -1649,7 +1649,7 @@ retry: int vfio_attach_device(char *name, VFIODevice *vbasedev, AddressSpace *as, Error **errp) { - const VFIOIOMMUOps *ops = &vfio_legacy_ops; + const VFIOIOMMUClass *ops = &vfio_legacy_ops; #ifdef CONFIG_IOMMUFD if (vbasedev->iommufd) { diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c index 1ffd25bbfa..913ae49077 100644 --- a/hw/vfio/container-base.c +++ b/hw/vfio/container-base.c @@ -72,7 +72,7 @@ int vfio_container_query_dirty_bitmap(const VFIOContainerBase *bcontainer, } void vfio_container_init(VFIOContainerBase *bcontainer, VFIOAddressSpace *space, - const VFIOIOMMUOps *ops) + const VFIOIOMMUClass *ops) { bcontainer->ops = ops; bcontainer->space = space; @@ -99,3 +99,13 @@ void vfio_container_destroy(VFIOContainerBase *bcontainer) g_list_free_full(bcontainer->iova_ranges, g_free); } + +static const TypeInfo types[] = { + { + .name = TYPE_VFIO_IOMMU, + .parent = TYPE_INTERFACE, + .class_size = sizeof(VFIOIOMMUClass), + }, +}; + +DEFINE_TYPES(types) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 1874ec1aba..d84a9e73a6 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -2488,7 +2488,7 @@ int vfio_pci_get_pci_hot_reset_info(VFIOPCIDevice *vdev, static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) { VFIODevice *vbasedev = &vdev->vbasedev; - const VFIOIOMMUOps *ops = vbasedev->bcontainer->ops; + const VFIOIOMMUClass *ops = vbasedev->bcontainer->ops; return ops->pci_hot_reset(vbasedev, single); } diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index 2ae297ccda..ce8bf9e2e6 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -16,7 +16,8 @@ #include "exec/memory.h" typedef struct VFIODevice VFIODevice; -typedef struct VFIOIOMMUOps VFIOIOMMUOps; +typedef struct VFIOIOMMUClass VFIOIOMMUClass; +#define VFIOIOMMUOps VFIOIOMMUClass /* To remove */ typedef struct { unsigned long *bitmap; @@ -34,7 +35,7 @@ typedef struct VFIOAddressSpace { * This is the base object for vfio container backends */ typedef struct VFIOContainerBase { - const VFIOIOMMUOps *ops; + const VFIOIOMMUClass *ops; VFIOAddressSpace *space; MemoryListener listener; Error *error; @@ -88,10 +89,24 @@ int vfio_container_query_dirty_bitmap(const VFIOContainerBase *bcontainer, void vfio_container_init(VFIOContainerBase *bcontainer, VFIOAddressSpace *space, - const VFIOIOMMUOps *ops); + const VFIOIOMMUClass *ops); void vfio_container_destroy(VFIOContainerBase *bcontainer); -struct VFIOIOMMUOps { + +#define TYPE_VFIO_IOMMU "vfio-iommu" + +/* + * VFIOContainerBase is not an abstract QOM object because it felt + * unnecessary to expose all the IOMMU backends to the QEMU machine + * and human interface. However, we can still abstract the IOMMU + * backend handlers using a QOM interface class. This provides more + * flexibility when referencing the various implementations. + */ +DECLARE_CLASS_CHECKERS(VFIOIOMMUClass, VFIO_IOMMU, TYPE_VFIO_IOMMU) + +struct VFIOIOMMUClass { + InterfaceClass parent_class; + /* basic feature */ int (*dma_map)(const VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, -- Gitee From 9f04d045ef1b2d206b002d20b792111b3ce86909 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Tue, 19 Dec 2023 07:58:20 +0100 Subject: [PATCH 050/134] vfio/container: Introduce a VFIOIOMMU legacy QOM interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert the legacy VFIOIOMMUOps struct to the new VFIOIOMMU QOM interface. The set of of operations for this backend can be referenced with a literal typename instead of a C struct. This will simplify support of multiple backends. Reviewed-by: Zhenzhong Duan Tested-by: Eric Farman Signed-off-by: Cédric Le Goater --- hw/vfio/common.c | 6 ++- hw/vfio/container.c | 59 ++++++++++++++++++++++----- include/hw/vfio/vfio-common.h | 1 - include/hw/vfio/vfio-container-base.h | 1 + 4 files changed, 55 insertions(+), 12 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index abca6aa01a..d98c3b7422 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -1649,13 +1649,17 @@ retry: int vfio_attach_device(char *name, VFIODevice *vbasedev, AddressSpace *as, Error **errp) { - const VFIOIOMMUClass *ops = &vfio_legacy_ops; + const VFIOIOMMUClass *ops = + VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY)); #ifdef CONFIG_IOMMUFD if (vbasedev->iommufd) { ops = &vfio_iommufd_ops; } #endif + + assert(ops); + return ops->attach_device(name, vbasedev, as, errp); } diff --git a/hw/vfio/container.c b/hw/vfio/container.c index dc805ceb12..6b8de8f471 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -429,10 +429,30 @@ static int vfio_get_iommu_type(VFIOContainer *container, return -EINVAL; } +/* + * vfio_get_iommu_ops - get a VFIOIOMMUClass associated with a type + */ +static const VFIOIOMMUClass *vfio_get_iommu_class(int iommu_type, Error **errp) +{ + ObjectClass *klass = NULL; + + switch (iommu_type) { + case VFIO_TYPE1v2_IOMMU: + case VFIO_TYPE1_IOMMU: + klass = object_class_by_name(TYPE_VFIO_IOMMU_LEGACY); + break; + default: + g_assert_not_reached(); + }; + + return VFIO_IOMMU_CLASS(klass); +} + static int vfio_init_container(VFIOContainer *container, int group_fd, VFIOAddressSpace *space, Error **errp) { int iommu_type, dirty_log_manual_clear, ret; + const VFIOIOMMUClass *vioc; iommu_type = vfio_get_iommu_type(container, errp); if (iommu_type < 0) { @@ -467,7 +487,14 @@ static int vfio_init_container(VFIOContainer *container, int group_fd, if (dirty_log_manual_clear) { container->dirty_log_manual_clear = dirty_log_manual_clear; } - vfio_container_init(&container->bcontainer, space, &vfio_legacy_ops); + + vioc = vfio_get_iommu_class(iommu_type, errp); + if (!vioc) { + error_setg(errp, "No available IOMMU models"); + return -EINVAL; + } + + vfio_container_init(&container->bcontainer, space, vioc); return 0; } @@ -677,7 +704,6 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, container->fd = fd; QLIST_INIT(&container->dma_list); bcontainer = &container->bcontainer; - vfio_container_init(bcontainer, space, &vfio_legacy_ops); ret = vfio_init_container(container, group->fd, space, errp); if (ret) { @@ -1218,12 +1244,25 @@ out_single: return ret; } -const VFIOIOMMUOps vfio_legacy_ops = { - .dma_map = vfio_legacy_dma_map, - .dma_unmap = vfio_legacy_dma_unmap, - .attach_device = vfio_legacy_attach_device, - .detach_device = vfio_legacy_detach_device, - .set_dirty_page_tracking = vfio_legacy_set_dirty_page_tracking, - .query_dirty_bitmap = vfio_legacy_query_dirty_bitmap, - .pci_hot_reset = vfio_legacy_pci_hot_reset, +static void vfio_iommu_legacy_class_init(ObjectClass *klass, void *data) +{ + VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass); + + vioc->dma_map = vfio_legacy_dma_map; + vioc->dma_unmap = vfio_legacy_dma_unmap; + vioc->attach_device = vfio_legacy_attach_device; + vioc->detach_device = vfio_legacy_detach_device; + vioc->set_dirty_page_tracking = vfio_legacy_set_dirty_page_tracking; + vioc->query_dirty_bitmap = vfio_legacy_query_dirty_bitmap; + vioc->pci_hot_reset = vfio_legacy_pci_hot_reset; }; + +static const TypeInfo types[] = { + { + .name = TYPE_VFIO_IOMMU_LEGACY, + .parent = TYPE_VFIO_IOMMU, + .class_init = vfio_iommu_legacy_class_init, + }, +}; + +DEFINE_TYPES(types) diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 151b2ab65f..f78a97006c 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -224,7 +224,6 @@ typedef QLIST_HEAD(VFIOGroupList, VFIOGroup) VFIOGroupList; typedef QLIST_HEAD(VFIODeviceList, VFIODevice) VFIODeviceList; extern VFIOGroupList vfio_group_list; extern VFIODeviceList vfio_device_list; -extern const VFIOIOMMUOps vfio_legacy_ops; extern const VFIOIOMMUOps vfio_iommufd_ops; extern const MemoryListener vfio_memory_listener; extern int vfio_kvm_device_fd; diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index ce8bf9e2e6..dce801378b 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -94,6 +94,7 @@ void vfio_container_destroy(VFIOContainerBase *bcontainer); #define TYPE_VFIO_IOMMU "vfio-iommu" +#define TYPE_VFIO_IOMMU_LEGACY TYPE_VFIO_IOMMU "-legacy" /* * VFIOContainerBase is not an abstract QOM object because it felt -- Gitee From b8e67d06ec3036cd3fd6d625c550e0c542e49d60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Tue, 19 Dec 2023 07:58:21 +0100 Subject: [PATCH 051/134] vfio/container: Intoduce a new VFIOIOMMUClass::setup handler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This will help in converting the sPAPR IOMMU backend to a QOM interface. Reviewed-by: Zhenzhong Duan Tested-by: Eric Farman Signed-off-by: Cédric Le Goater --- hw/vfio/container.c | 1 + include/hw/vfio/vfio-container-base.h | 1 + 2 files changed, 2 insertions(+) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 6b8de8f471..845239eff4 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -1248,6 +1248,7 @@ static void vfio_iommu_legacy_class_init(ObjectClass *klass, void *data) { VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass); + vioc->setup = vfio_legacy_setup; vioc->dma_map = vfio_legacy_dma_map; vioc->dma_unmap = vfio_legacy_dma_unmap; vioc->attach_device = vfio_legacy_attach_device; diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index dce801378b..614de90767 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -109,6 +109,7 @@ struct VFIOIOMMUClass { InterfaceClass parent_class; /* basic feature */ + int (*setup)(VFIOContainerBase *bcontainer, Error **errp); int (*dma_map)(const VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, void *vaddr, bool readonly); -- Gitee From 2692ea754863364731e5712ebf83208690179089 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Tue, 19 Dec 2023 07:58:22 +0100 Subject: [PATCH 052/134] vfio/spapr: Introduce a sPAPR VFIOIOMMU QOM interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move vfio_spapr_container_setup() to a VFIOIOMMUClass::setup handler and convert the sPAPR VFIOIOMMUOps struct to a QOM interface. The sPAPR QOM interface inherits from the legacy QOM interface because because both have the same basic needs. The sPAPR interface is then extended with the handlers specific to the sPAPR IOMMU. This allows reuse and provides better abstraction of the backends. It will be useful to avoid compiling the sPAPR IOMMU backend on targets not supporting it. Reviewed-by: Zhenzhong Duan Tested-by: Eric Farman Signed-off-by: Cédric Le Goater --- hw/vfio/container.c | 24 ++++++------------------ hw/vfio/spapr.c | 20 ++++++++++++++++++++ include/hw/vfio/vfio-container-base.h | 1 + 3 files changed, 27 insertions(+), 18 deletions(-) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 845239eff4..e245d5a082 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -441,6 +441,10 @@ static const VFIOIOMMUClass *vfio_get_iommu_class(int iommu_type, Error **errp) case VFIO_TYPE1_IOMMU: klass = object_class_by_name(TYPE_VFIO_IOMMU_LEGACY); break; + case VFIO_SPAPR_TCE_v2_IOMMU: + case VFIO_SPAPR_TCE_IOMMU: + klass = object_class_by_name(TYPE_VFIO_IOMMU_SPAPR); + break; default: g_assert_not_reached(); }; @@ -716,25 +720,9 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, goto free_container_exit; } - switch (container->iommu_type) { - case VFIO_TYPE1v2_IOMMU: - case VFIO_TYPE1_IOMMU: - case VFIO_TYPE1v2_S_IOMMU: - ret = vfio_legacy_setup(bcontainer, errp); - break; - case VFIO_SPAPR_TCE_v2_IOMMU: - case VFIO_SPAPR_TCE_IOMMU: - { - ret = vfio_spapr_container_init(container, errp); - if (ret) { - goto enable_discards_exit; - } - break; - } - default: - g_assert_not_reached(); - } + assert(bcontainer->ops->setup); + ret = bcontainer->ops->setup(bcontainer, errp); if (ret) { goto enable_discards_exit; } diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c index 5c6426e697..3694dfb874 100644 --- a/hw/vfio/spapr.c +++ b/hw/vfio/spapr.c @@ -543,3 +543,23 @@ void vfio_spapr_container_deinit(VFIOContainer *container) g_free(hostwin); } } + +static void vfio_iommu_spapr_class_init(ObjectClass *klass, void *data) +{ + VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass); + + vioc->add_window = vfio_spapr_container_add_section_window; + vioc->del_window = vfio_spapr_container_del_section_window; + //vioc->release = vfio_spapr_container_release; + //vioc->setup = vfio_spapr_container_setup; +}; + +static const TypeInfo types[] = { + { + .name = TYPE_VFIO_IOMMU_SPAPR, + .parent = TYPE_VFIO_IOMMU_LEGACY, + .class_init = vfio_iommu_spapr_class_init, + }, +}; + +DEFINE_TYPES(types) diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index 614de90767..1085109d0c 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -95,6 +95,7 @@ void vfio_container_destroy(VFIOContainerBase *bcontainer); #define TYPE_VFIO_IOMMU "vfio-iommu" #define TYPE_VFIO_IOMMU_LEGACY TYPE_VFIO_IOMMU "-legacy" +#define TYPE_VFIO_IOMMU_SPAPR TYPE_VFIO_IOMMU "-spapr" /* * VFIOContainerBase is not an abstract QOM object because it felt -- Gitee From 66f71e9acdaa0c1c31770f00a21ea32644ebaac9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Tue, 19 Dec 2023 07:58:23 +0100 Subject: [PATCH 053/134] vfio/iommufd: Introduce a VFIOIOMMU iommufd QOM interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As previously done for the sPAPR and legacy IOMMU backends, convert the VFIOIOMMUOps struct to a QOM interface. The set of of operations for this backend can be referenced with a literal typename instead of a C struct. Reviewed-by: Zhenzhong Duan Tested-by: Eric Farman Signed-off-by: Cédric Le Goater --- hw/vfio/common.c | 2 +- hw/vfio/iommufd.c | 35 ++++++++++++++++++++------- include/hw/vfio/vfio-common.h | 1 - include/hw/vfio/vfio-container-base.h | 2 +- 4 files changed, 28 insertions(+), 12 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index d98c3b7422..a8b7129fa5 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -1654,7 +1654,7 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev, #ifdef CONFIG_IOMMUFD if (vbasedev->iommufd) { - ops = &vfio_iommufd_ops; + ops = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD)); } #endif diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 87a561c545..d4c586e842 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -319,6 +319,8 @@ static int iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, int ret, devfd; uint32_t ioas_id; Error *err = NULL; + const VFIOIOMMUClass *iommufd_vioc = + VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD)); if (vbasedev->fd < 0) { devfd = iommufd_cdev_getfd(vbasedev->sysfsdev, errp); @@ -340,7 +342,7 @@ static int iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, /* try to attach to an existing container in this space */ QLIST_FOREACH(bcontainer, &space->containers, next) { container = container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); - if (bcontainer->ops != &vfio_iommufd_ops || + if (bcontainer->ops != iommufd_vioc || vbasedev->iommufd != container->be) { continue; } @@ -374,7 +376,7 @@ static int iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, container->ioas_id = ioas_id; bcontainer = &container->bcontainer; - vfio_container_init(bcontainer, space, &vfio_iommufd_ops); + vfio_container_init(bcontainer, space, iommufd_vioc); QLIST_INSERT_HEAD(&space->containers, bcontainer, next); ret = iommufd_cdev_attach_container(vbasedev, container, errp); @@ -476,9 +478,11 @@ static void iommufd_cdev_detach(VFIODevice *vbasedev) static VFIODevice *iommufd_cdev_pci_find_by_devid(__u32 devid) { VFIODevice *vbasedev_iter; + const VFIOIOMMUClass *iommufd_vioc = + VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD)); QLIST_FOREACH(vbasedev_iter, &vfio_device_list, global_next) { - if (vbasedev_iter->bcontainer->ops != &vfio_iommufd_ops) { + if (vbasedev_iter->bcontainer->ops != iommufd_vioc) { continue; } if (devid == vbasedev_iter->devid) { @@ -621,10 +625,23 @@ out_single: return ret; } -const VFIOIOMMUOps vfio_iommufd_ops = { - .dma_map = iommufd_cdev_map, - .dma_unmap = iommufd_cdev_unmap, - .attach_device = iommufd_cdev_attach, - .detach_device = iommufd_cdev_detach, - .pci_hot_reset = iommufd_cdev_pci_hot_reset, +static void vfio_iommu_iommufd_class_init(ObjectClass *klass, void *data) +{ + VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass); + + vioc->dma_map = iommufd_cdev_map; + vioc->dma_unmap = iommufd_cdev_unmap; + vioc->attach_device = iommufd_cdev_attach; + vioc->detach_device = iommufd_cdev_detach; + vioc->pci_hot_reset = iommufd_cdev_pci_hot_reset; }; + +static const TypeInfo types[] = { + { + .name = TYPE_VFIO_IOMMU_IOMMUFD, + .parent = TYPE_VFIO_IOMMU, + .class_init = vfio_iommu_iommufd_class_init, + }, +}; + +DEFINE_TYPES(types) diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index f78a97006c..f3966410c1 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -224,7 +224,6 @@ typedef QLIST_HEAD(VFIOGroupList, VFIOGroup) VFIOGroupList; typedef QLIST_HEAD(VFIODeviceList, VFIODevice) VFIODeviceList; extern VFIOGroupList vfio_group_list; extern VFIODeviceList vfio_device_list; -extern const VFIOIOMMUOps vfio_iommufd_ops; extern const MemoryListener vfio_memory_listener; extern int vfio_kvm_device_fd; diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index 1085109d0c..c12ce4dfcb 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -17,7 +17,6 @@ typedef struct VFIODevice VFIODevice; typedef struct VFIOIOMMUClass VFIOIOMMUClass; -#define VFIOIOMMUOps VFIOIOMMUClass /* To remove */ typedef struct { unsigned long *bitmap; @@ -96,6 +95,7 @@ void vfio_container_destroy(VFIOContainerBase *bcontainer); #define TYPE_VFIO_IOMMU "vfio-iommu" #define TYPE_VFIO_IOMMU_LEGACY TYPE_VFIO_IOMMU "-legacy" #define TYPE_VFIO_IOMMU_SPAPR TYPE_VFIO_IOMMU "-spapr" +#define TYPE_VFIO_IOMMU_IOMMUFD TYPE_VFIO_IOMMU "-iommufd" /* * VFIOContainerBase is not an abstract QOM object because it felt -- Gitee From 017272249cc362055dc5b31cdc16b2265df39e5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Tue, 19 Dec 2023 07:58:24 +0100 Subject: [PATCH 054/134] vfio/spapr: Only compile sPAPR IOMMU support when needed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit sPAPR IOMMU support is only needed for pseries machines. Compile out support when CONFIG_PSERIES is not set. This saves ~7K of text. Reviewed-by: Zhenzhong Duan Tested-by: Eric Farman Signed-off-by: Cédric Le Goater --- hw/vfio/meson.build | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build index bd5cc4ca79..bda2688983 100644 --- a/hw/vfio/meson.build +++ b/hw/vfio/meson.build @@ -4,9 +4,9 @@ vfio_ss.add(files( 'common.c', 'container-base.c', 'container.c', - 'spapr.c', 'migration.c', )) +vfio_ss.add(when: 'CONFIG_PSERIES', if_true: files('spapr.c')) vfio_ss.add(when: 'CONFIG_IOMMUFD', if_true: files( 'iommufd.c', )) -- Gitee From feed555b60bc36d3e704431148e302dae48b77a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Tue, 19 Dec 2023 07:58:16 +0100 Subject: [PATCH 055/134] vfio/spapr: Extend VFIOIOMMUOps with a release handler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows to abstract a bit more the sPAPR IOMMU support in the legacy IOMMU backend. Reviewed-by: Zhenzhong Duan Tested-by: Eric Farman Signed-off-by: Cédric Le Goater --- hw/vfio/container.c | 8 ++++++-- hw/vfio/spapr.c | 19 +++++++++++++++++++ include/hw/vfio/vfio-container-base.h | 1 + 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index e245d5a082..4c62f088b1 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -764,7 +764,9 @@ listener_release_exit: } if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU || container->iommu_type == VFIO_SPAPR_TCE_IOMMU) { - vfio_spapr_container_deinit(container); + if (bcontainer->ops->release) { + bcontainer->ops->release(bcontainer); + } } enable_discards_exit: @@ -803,7 +805,9 @@ static void vfio_disconnect_container(VFIOGroup *group) } if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU || container->iommu_type == VFIO_SPAPR_TCE_IOMMU) { - vfio_spapr_container_deinit(container); + if (bcontainer->ops->release) { + bcontainer->ops->release(bcontainer); + } } } diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c index 3694dfb874..697f80d11d 100644 --- a/hw/vfio/spapr.c +++ b/hw/vfio/spapr.c @@ -440,6 +440,24 @@ vfio_spapr_container_del_section_window(VFIOContainerBase *bcontainer, } } +static void vfio_spapr_container_release(VFIOContainerBase *bcontainer) +{ + VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); + VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer, + container); + VFIOHostDMAWindow *hostwin, *next; + + if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { + memory_listener_unregister(&scontainer->prereg_listener); + } + QLIST_FOREACH_SAFE(hostwin, &scontainer->hostwin_list, hostwin_next, + next) { + QLIST_REMOVE(hostwin, hostwin_next); + g_free(hostwin); + } +} + static VFIOIOMMUOps vfio_iommu_spapr_ops; static void setup_spapr_ops(VFIOContainerBase *bcontainer) @@ -447,6 +465,7 @@ static void setup_spapr_ops(VFIOContainerBase *bcontainer) vfio_iommu_spapr_ops = *bcontainer->ops; vfio_iommu_spapr_ops.add_window = vfio_spapr_container_add_section_window; vfio_iommu_spapr_ops.del_window = vfio_spapr_container_del_section_window; + vfio_iommu_spapr_ops.release = vfio_spapr_container_release; bcontainer->ops = &vfio_iommu_spapr_ops; } diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index c12ce4dfcb..b2813b0c11 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -135,5 +135,6 @@ struct VFIOIOMMUClass { Error **errp); void (*del_window)(VFIOContainerBase *bcontainer, MemoryRegionSection *section); + void (*release)(VFIOContainerBase *bcontainer); }; #endif /* HW_VFIO_VFIO_CONTAINER_BASE_H */ -- Gitee From 188948043652fbcdd4505fd9672e57bc61647159 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Tue, 19 Dec 2023 07:58:25 +0100 Subject: [PATCH 056/134] vfio/iommufd: Remove CONFIG_IOMMUFD usage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Availability of the IOMMUFD backend can now be fully determined at runtime and the ifdef check was a build time protection (for PPC not supporting it mostly). Reviewed-by: Zhenzhong Duan Tested-by: Eric Farman Signed-off-by: Cédric Le Goater --- hw/vfio/common.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index a8b7129fa5..b5d02df0c2 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -19,7 +19,6 @@ */ #include "qemu/osdep.h" -#include CONFIG_DEVICES /* CONFIG_IOMMUFD */ #include #ifdef CONFIG_KVM #include @@ -1652,11 +1651,9 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev, const VFIOIOMMUClass *ops = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY)); -#ifdef CONFIG_IOMMUFD if (vbasedev->iommufd) { ops = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD)); } -#endif assert(ops); -- Gitee From 626698a1e9edff6a1032f496858555e1a4614fbe Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 5 Jun 2024 16:30:27 +0800 Subject: [PATCH 057/134] backends: Introduce HostIOMMUDevice abstract MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A HostIOMMUDevice is an abstraction for an assigned device that is protected by a physical IOMMU (aka host IOMMU). The userspace interaction with this physical IOMMU can be done either through the VFIO IOMMU type 1 legacy backend or the new iommufd backend. The assigned device can be a VFIO device or a VDPA device. The HostIOMMUDevice is needed to interact with the host IOMMU that protects the assigned device. It is especially useful when the device is also protected by a virtual IOMMU as this latter use the translation services of the physical IOMMU and is constrained by it. In that context the HostIOMMUDevice can be passed to the virtual IOMMU to collect physical IOMMU capabilities such as the supported address width. In the future, the virtual IOMMU will use the HostIOMMUDevice to program the guest page tables in the first translation stage of the physical IOMMU. Introduce .realize() to initialize HostIOMMUDevice further after instance init. Suggested-by: Cédric Le Goater Signed-off-by: Zhenzhong Duan Reviewed-by: Eric Auger Reviewed-by: Michael S. Tsirkin --- MAINTAINERS | 2 ++ backends/host_iommu_device.c | 33 +++++++++++++++++++ backends/meson.build | 1 + include/sysemu/host_iommu_device.h | 53 ++++++++++++++++++++++++++++++ 4 files changed, 89 insertions(+) create mode 100644 backends/host_iommu_device.c create mode 100644 include/sysemu/host_iommu_device.h diff --git a/MAINTAINERS b/MAINTAINERS index 0ddb20a35f..ada87bfa9e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2174,6 +2174,8 @@ M: Zhenzhong Duan S: Supported F: backends/iommufd.c F: include/sysemu/iommufd.h +F: backends/host_iommu_device.c +F: include/sysemu/host_iommu_device.h F: include/qemu/chardev_open.h F: util/chardev_open.c F: docs/devel/vfio-iommufd.rst diff --git a/backends/host_iommu_device.c b/backends/host_iommu_device.c new file mode 100644 index 0000000000..8f2dda1beb --- /dev/null +++ b/backends/host_iommu_device.c @@ -0,0 +1,33 @@ +/* + * Host IOMMU device abstract + * + * Copyright (C) 2024 Intel Corporation. + * + * Authors: Zhenzhong Duan + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "sysemu/host_iommu_device.h" + +OBJECT_DEFINE_ABSTRACT_TYPE(HostIOMMUDevice, + host_iommu_device, + HOST_IOMMU_DEVICE, + OBJECT) + +static void host_iommu_device_class_init(ObjectClass *oc, void *data) +{ +} + +static void host_iommu_device_init(Object *obj) +{ +} + +static void host_iommu_device_finalize(Object *obj) +{ + HostIOMMUDevice *hiod = HOST_IOMMU_DEVICE(obj); + + g_free(hiod->name); +} diff --git a/backends/meson.build b/backends/meson.build index 9a5cea480d..68b5e34e04 100644 --- a/backends/meson.build +++ b/backends/meson.build @@ -13,6 +13,7 @@ system_ss.add([files( system_ss.add(when: 'CONFIG_POSIX', if_true: files('rng-random.c')) system_ss.add(when: 'CONFIG_POSIX', if_true: files('hostmem-file.c')) system_ss.add(when: 'CONFIG_LINUX', if_true: files('hostmem-memfd.c')) +system_ss.add(when: 'CONFIG_LINUX', if_true: files('host_iommu_device.c')) if keyutils.found() system_ss.add(keyutils, files('cryptodev-lkcf.c')) endif diff --git a/include/sysemu/host_iommu_device.h b/include/sysemu/host_iommu_device.h new file mode 100644 index 0000000000..db47a16189 --- /dev/null +++ b/include/sysemu/host_iommu_device.h @@ -0,0 +1,53 @@ +/* + * Host IOMMU device abstract declaration + * + * Copyright (C) 2024 Intel Corporation. + * + * Authors: Zhenzhong Duan + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#ifndef HOST_IOMMU_DEVICE_H +#define HOST_IOMMU_DEVICE_H + +#include "qom/object.h" +#include "qapi/error.h" + +#define TYPE_HOST_IOMMU_DEVICE "host-iommu-device" +OBJECT_DECLARE_TYPE(HostIOMMUDevice, HostIOMMUDeviceClass, HOST_IOMMU_DEVICE) + +struct HostIOMMUDevice { + Object parent_obj; + + char *name; +}; + +/** + * struct HostIOMMUDeviceClass - The base class for all host IOMMU devices. + * + * Different types of host devices (e.g., VFIO or VDPA device) or devices + * with different backend (e.g., VFIO legacy container or IOMMUFD backend) + * will have different implementations of the HostIOMMUDeviceClass. + */ +struct HostIOMMUDeviceClass { + ObjectClass parent_class; + + /** + * @realize: initialize host IOMMU device instance further. + * + * Mandatory callback. + * + * @hiod: pointer to a host IOMMU device instance. + * + * @opaque: pointer to agent device of this host IOMMU device, + * e.g., VFIO base device or VDPA device. + * + * @errp: pass an Error out when realize fails. + * + * Returns: true on success, false on failure. + */ + bool (*realize)(HostIOMMUDevice *hiod, void *opaque, Error **errp); +}; +#endif -- Gitee From ca210a4a8fe97dd56baa184671bb48bff9a54ecb Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 5 Jun 2024 16:30:28 +0800 Subject: [PATCH 058/134] backends/host_iommu_device: Introduce HostIOMMUDeviceCaps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit HostIOMMUDeviceCaps's elements map to the host IOMMU's capabilities. Different platform IOMMU can support different elements. Currently only two elements, type and aw_bits, type hints the host platform IOMMU type, i.e., INTEL vtd, ARM smmu, etc; aw_bits hints host IOMMU address width. Introduce .get_cap() handler to check if HOST_IOMMU_DEVICE_CAP_XXX is supported. Suggested-by: Cédric Le Goater Signed-off-by: Zhenzhong Duan Reviewed-by: Eric Auger Reviewed-by: Michael S. Tsirkin --- include/sysemu/host_iommu_device.h | 38 ++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/include/sysemu/host_iommu_device.h b/include/sysemu/host_iommu_device.h index db47a16189..a57873958b 100644 --- a/include/sysemu/host_iommu_device.h +++ b/include/sysemu/host_iommu_device.h @@ -15,6 +15,18 @@ #include "qom/object.h" #include "qapi/error.h" +/** + * struct HostIOMMUDeviceCaps - Define host IOMMU device capabilities. + * + * @type: host platform IOMMU type. + * + * @aw_bits: host IOMMU address width. 0xff if no limitation. + */ +typedef struct HostIOMMUDeviceCaps { + uint32_t type; + uint8_t aw_bits; +} HostIOMMUDeviceCaps; + #define TYPE_HOST_IOMMU_DEVICE "host-iommu-device" OBJECT_DECLARE_TYPE(HostIOMMUDevice, HostIOMMUDeviceClass, HOST_IOMMU_DEVICE) @@ -22,6 +34,7 @@ struct HostIOMMUDevice { Object parent_obj; char *name; + HostIOMMUDeviceCaps caps; }; /** @@ -49,5 +62,30 @@ struct HostIOMMUDeviceClass { * Returns: true on success, false on failure. */ bool (*realize)(HostIOMMUDevice *hiod, void *opaque, Error **errp); + /** + * @get_cap: check if a host IOMMU device capability is supported. + * + * Optional callback, if not implemented, hint not supporting query + * of @cap. + * + * @hiod: pointer to a host IOMMU device instance. + * + * @cap: capability to check. + * + * @errp: pass an Error out when fails to query capability. + * + * Returns: <0 on failure, 0 if a @cap is unsupported, or else + * 1 or some positive value for some special @cap, + * i.e., HOST_IOMMU_DEVICE_CAP_AW_BITS. + */ + int (*get_cap)(HostIOMMUDevice *hiod, int cap, Error **errp); }; + +/* + * Host IOMMU device capability list. + */ +#define HOST_IOMMU_DEVICE_CAP_IOMMU_TYPE 0 +#define HOST_IOMMU_DEVICE_CAP_AW_BITS 1 + +#define HOST_IOMMU_DEVICE_CAP_AW_BITS_MAX 64 #endif -- Gitee From c253a07d9fe1598c4dbbb1cefee457806c417885 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 5 Jun 2024 16:30:29 +0800 Subject: [PATCH 059/134] vfio/container: Introduce TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO represents a host IOMMU device under VFIO legacy container backend. It will have its own realize implementation. Suggested-by: Eric Auger Suggested-by: Cédric Le Goater Signed-off-by: Zhenzhong Duan Reviewed-by: Eric Auger Reviewed-by: Michael S. Tsirkin --- hw/vfio/container.c | 6 +++++- include/hw/vfio/vfio-common.h | 3 +++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 4c62f088b1..dcf49af2d0 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -1255,7 +1255,11 @@ static const TypeInfo types[] = { .name = TYPE_VFIO_IOMMU_LEGACY, .parent = TYPE_VFIO_IOMMU, .class_init = vfio_iommu_legacy_class_init, - }, + }, { + .name = TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO, + .parent = TYPE_HOST_IOMMU_DEVICE, + } + }; DEFINE_TYPES(types) diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index f3966410c1..0c807c2806 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -31,6 +31,7 @@ #endif #include "sysemu/sysemu.h" #include "hw/vfio/vfio-container-base.h" +#include "sysemu/host_iommu_device.h" #define VFIO_MSG_PREFIX "vfio %s: " @@ -75,6 +76,8 @@ typedef struct VFIOMigration { struct VFIOGroup; +#define TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO TYPE_HOST_IOMMU_DEVICE "-legacy-vfio" + typedef struct VFIODMARange { QLIST_ENTRY(VFIODMARange) next; hwaddr iova; -- Gitee From 50142057ec070a70f3f38ec272ec61cc3ae6e071 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 5 Jun 2024 16:30:30 +0800 Subject: [PATCH 060/134] backends/iommufd: Introduce TYPE_HOST_IOMMU_DEVICE_IOMMUFD[_VFIO] devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TYPE_HOST_IOMMU_DEVICE_IOMMUFD represents a host IOMMU device under iommufd backend. It is abstract, because it is going to be derived into VFIO or VDPA type'd device. It will have its own .get_cap() implementation. TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO is a sub-class of TYPE_HOST_IOMMU_DEVICE_IOMMUFD, represents a VFIO type'd host IOMMU device under iommufd backend. It will be created during VFIO device attaching and passed to vIOMMU. It will have its own .realize() implementation. Opportunistically, add missed header to include/sysemu/iommufd.h. Suggested-by: Cédric Le Goater Signed-off-by: Yi Liu Signed-off-by: Zhenzhong Duan Reviewed-by: Eric Auger Reviewed-by: Michael S. Tsirkin --- backends/iommufd.c | 36 +++++++++++++++++------------------ hw/vfio/iommufd.c | 5 ++++- include/hw/vfio/vfio-common.h | 3 +++ include/sysemu/iommufd.h | 16 ++++++++++++++++ 4 files changed, 41 insertions(+), 19 deletions(-) diff --git a/backends/iommufd.c b/backends/iommufd.c index ba58a0eb0d..a2b7f5c3c4 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -223,23 +223,23 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, return ret; } -static const TypeInfo iommufd_backend_info = { - .name = TYPE_IOMMUFD_BACKEND, - .parent = TYPE_OBJECT, - .instance_size = sizeof(IOMMUFDBackend), - .instance_init = iommufd_backend_init, - .instance_finalize = iommufd_backend_finalize, - .class_size = sizeof(IOMMUFDBackendClass), - .class_init = iommufd_backend_class_init, - .interfaces = (InterfaceInfo[]) { - { TYPE_USER_CREATABLE }, - { } +static const TypeInfo types[] = { + { + .name = TYPE_IOMMUFD_BACKEND, + .parent = TYPE_OBJECT, + .instance_size = sizeof(IOMMUFDBackend), + .instance_init = iommufd_backend_init, + .instance_finalize = iommufd_backend_finalize, + .class_size = sizeof(IOMMUFDBackendClass), + .class_init = iommufd_backend_class_init, + .interfaces = (InterfaceInfo[]) { + { TYPE_USER_CREATABLE }, + { } + } + }, { + .name = TYPE_HOST_IOMMU_DEVICE_IOMMUFD, + .parent = TYPE_HOST_IOMMU_DEVICE, + .abstract = true, } }; - -static void register_types(void) -{ - type_register_static(&iommufd_backend_info); -} - -type_init(register_types); +DEFINE_TYPES(types) diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index d4c586e842..7a4b818830 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -641,7 +641,10 @@ static const TypeInfo types[] = { .name = TYPE_VFIO_IOMMU_IOMMUFD, .parent = TYPE_VFIO_IOMMU, .class_init = vfio_iommu_iommufd_class_init, - }, + }, { + .name = TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO, + .parent = TYPE_HOST_IOMMU_DEVICE_IOMMUFD, + } }; DEFINE_TYPES(types) diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 0c807c2806..2cfc8521cd 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -32,6 +32,7 @@ #include "sysemu/sysemu.h" #include "hw/vfio/vfio-container-base.h" #include "sysemu/host_iommu_device.h" +#include "sysemu/iommufd.h" #define VFIO_MSG_PREFIX "vfio %s: " @@ -77,6 +78,8 @@ typedef struct VFIOMigration { struct VFIOGroup; #define TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO TYPE_HOST_IOMMU_DEVICE "-legacy-vfio" +#define TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO \ + TYPE_HOST_IOMMU_DEVICE_IOMMUFD "-vfio" typedef struct VFIODMARange { QLIST_ENTRY(VFIODMARange) next; diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h index 9c5524b0ed..1a75e82f42 100644 --- a/include/sysemu/iommufd.h +++ b/include/sysemu/iommufd.h @@ -1,3 +1,16 @@ +/* + * iommufd container backend declaration + * + * Copyright (C) 2024 Intel Corporation. + * Copyright Red Hat, Inc. 2024 + * + * Authors: Yi Liu + * Eric Auger + * Zhenzhong Duan + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + #ifndef SYSEMU_IOMMUFD_H #define SYSEMU_IOMMUFD_H @@ -5,6 +18,7 @@ #include "qemu/thread.h" #include "exec/hwaddr.h" #include "exec/cpu-common.h" +#include "sysemu/host_iommu_device.h" #define TYPE_IOMMUFD_BACKEND "iommufd" OBJECT_DECLARE_TYPE(IOMMUFDBackend, IOMMUFDBackendClass, IOMMUFD_BACKEND) @@ -35,4 +49,6 @@ int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova, ram_addr_t size, void *vaddr, bool readonly); int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova, ram_addr_t size); + +#define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd" #endif -- Gitee From 30150b8727e9ec41f83c4dfcd93f04b766357469 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 5 Jun 2024 16:30:31 +0800 Subject: [PATCH 061/134] range: Introduce range_get_last_bit() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This helper get the highest 1 bit position of the upper bound. If the range is empty or upper bound is zero, -1 is returned. Suggested-by: Cédric Le Goater Signed-off-by: Zhenzhong Duan Reviewed-by: Eric Auger Reviewed-by: Michael S. Tsirkin --- include/qemu/range.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/include/qemu/range.h b/include/qemu/range.h index 205e1da76d..4ce694a398 100644 --- a/include/qemu/range.h +++ b/include/qemu/range.h @@ -20,6 +20,8 @@ #ifndef QEMU_RANGE_H #define QEMU_RANGE_H +#include "qemu/bitops.h" + /* * Operations on 64 bit address ranges. * Notes: @@ -217,6 +219,15 @@ static inline int ranges_overlap(uint64_t first1, uint64_t len1, return !(last2 < first1 || last1 < first2); } +/* Get highest non-zero bit position of a range */ +static inline int range_get_last_bit(Range *range) +{ + if (range_is_empty(range)) { + return -1; + } + return 63 - clz64(range->upb); +} + /* * Return -1 if @a < @b, 1 @a > @b, and 0 if they touch or overlap. * Both @a and @b must not be empty. -- Gitee From c66d22fa4ee9f6f38193256d7ce1494c32e10581 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 5 Jun 2024 16:30:32 +0800 Subject: [PATCH 062/134] vfio/container: Implement HostIOMMUDeviceClass::realize() handler The realize function populates the capabilities. For now only the aw_bits caps is computed for legacy backend. Introduce a helper function vfio_device_get_aw_bits() which calls range_get_last_bit() to get host aw_bits and package it in HostIOMMUDeviceCaps for query with .get_cap(). This helper will also be used by iommufd backend. Signed-off-by: Zhenzhong Duan Reviewed-by: Eric Auger Reviewed-by: Michael S. Tsirkin --- hw/vfio/container.c | 20 +++++++++++++++++++- hw/vfio/helpers.c | 17 +++++++++++++++++ include/hw/vfio/vfio-common.h | 1 + 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index dcf49af2d0..fbe2bc50d4 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -1250,6 +1250,24 @@ static void vfio_iommu_legacy_class_init(ObjectClass *klass, void *data) vioc->pci_hot_reset = vfio_legacy_pci_hot_reset; }; +static bool hiod_legacy_vfio_realize(HostIOMMUDevice *hiod, void *opaque, + Error **errp) +{ + VFIODevice *vdev = opaque; + + hiod->name = g_strdup(vdev->name); + hiod->caps.aw_bits = vfio_device_get_aw_bits(vdev); + + return true; +} + +static void hiod_legacy_vfio_class_init(ObjectClass *oc, void *data) +{ + HostIOMMUDeviceClass *hioc = HOST_IOMMU_DEVICE_CLASS(oc); + + hioc->realize = hiod_legacy_vfio_realize; +}; + static const TypeInfo types[] = { { .name = TYPE_VFIO_IOMMU_LEGACY, @@ -1258,8 +1276,8 @@ static const TypeInfo types[] = { }, { .name = TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO, .parent = TYPE_HOST_IOMMU_DEVICE, + .class_init = hiod_legacy_vfio_class_init, } - }; DEFINE_TYPES(types) diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c index 6789870802..35b8e42304 100644 --- a/hw/vfio/helpers.c +++ b/hw/vfio/helpers.c @@ -663,3 +663,20 @@ void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops, vbasedev->ram_block_discard_allowed = ram_discard; } + +int vfio_device_get_aw_bits(VFIODevice *vdev) +{ + /* + * iova_ranges is a sorted list. For old kernels that support + * VFIO but not support query of iova ranges, iova_ranges is NULL, + * in this case HOST_IOMMU_DEVICE_CAP_AW_BITS_MAX(64) is returned. + */ + GList *l = g_list_last(vdev->bcontainer->iova_ranges); + + if (l) { + Range *range = l->data; + return range_get_last_bit(range) + 1; + } + + return HOST_IOMMU_DEVICE_CAP_AW_BITS_MAX; +} diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 2cfc8521cd..376b8350b9 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -277,4 +277,5 @@ int vfio_device_get_name(VFIODevice *vbasedev, Error **errp); void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp); void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops, DeviceState *dev, bool ram_discard); +int vfio_device_get_aw_bits(VFIODevice *vdev); #endif /* HW_VFIO_VFIO_COMMON_H */ -- Gitee From ccd8baf4648e6fd6b69e65ee249609904edc92e1 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 5 Jun 2024 16:30:33 +0800 Subject: [PATCH 063/134] backends/iommufd: Introduce helper function iommufd_backend_get_device_info() Introduce a helper function iommufd_backend_get_device_info() to get host IOMMU related information through iommufd uAPI. Signed-off-by: Yi Liu Signed-off-by: Yi Sun Signed-off-by: Zhenzhong Duan Reviewed-by: Eric Auger Reviewed-by: Michael S. Tsirkin --- backends/iommufd.c | 22 ++++++++++++++++++++++ include/sysemu/iommufd.h | 3 +++ 2 files changed, 25 insertions(+) diff --git a/backends/iommufd.c b/backends/iommufd.c index a2b7f5c3c4..604a8f4e7d 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -223,6 +223,28 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, return ret; } +bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, + uint32_t *type, void *data, uint32_t len, + Error **errp) +{ + struct iommu_hw_info info = { + .size = sizeof(info), + .dev_id = devid, + .data_len = len, + .data_uptr = (uintptr_t)data, + }; + + if (ioctl(be->fd, IOMMU_GET_HW_INFO, &info)) { + error_setg_errno(errp, errno, "Failed to get hardware info"); + return false; + } + + g_assert(type); + *type = info.out_data_type; + + return true; +} + static const TypeInfo types[] = { { .name = TYPE_IOMMUFD_BACKEND, diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h index 1a75e82f42..dfade18e6d 100644 --- a/include/sysemu/iommufd.h +++ b/include/sysemu/iommufd.h @@ -49,6 +49,9 @@ int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova, ram_addr_t size, void *vaddr, bool readonly); int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova, ram_addr_t size); +bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, + uint32_t *type, void *data, uint32_t len, + Error **errp); #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd" #endif -- Gitee From c9f1b73eb36a84347c3720ce2a93f72ea47f5daa Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 5 Jun 2024 16:30:34 +0800 Subject: [PATCH 064/134] vfio/iommufd: Implement HostIOMMUDeviceClass::realize() handler It calls iommufd_backend_get_device_info() to get host IOMMU related information and translate it into HostIOMMUDeviceCaps for query with .get_cap(). For aw_bits, use the same way as legacy backend by calling vfio_device_get_aw_bits() which is common for different vendor IOMMU. Signed-off-by: Zhenzhong Duan Reviewed-by: Eric Auger Reviewed-by: Michael S. Tsirkin --- hw/vfio/iommufd.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 7a4b818830..2efdba5565 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -636,6 +636,35 @@ static void vfio_iommu_iommufd_class_init(ObjectClass *klass, void *data) vioc->pci_hot_reset = iommufd_cdev_pci_hot_reset; }; +static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, + Error **errp) +{ + VFIODevice *vdev = opaque; + HostIOMMUDeviceCaps *caps = &hiod->caps; + enum iommu_hw_info_type type; + union { + struct iommu_hw_info_vtd vtd; + } data; + + if (!iommufd_backend_get_device_info(vdev->iommufd, vdev->devid, + &type, &data, sizeof(data), errp)) { + return false; + } + + hiod->name = g_strdup(vdev->name); + caps->type = type; + caps->aw_bits = vfio_device_get_aw_bits(vdev); + + return true; +} + +static void hiod_iommufd_vfio_class_init(ObjectClass *oc, void *data) +{ + HostIOMMUDeviceClass *hiodc = HOST_IOMMU_DEVICE_CLASS(oc); + + hiodc->realize = hiod_iommufd_vfio_realize; +}; + static const TypeInfo types[] = { { .name = TYPE_VFIO_IOMMU_IOMMUFD, @@ -644,6 +673,7 @@ static const TypeInfo types[] = { }, { .name = TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO, .parent = TYPE_HOST_IOMMU_DEVICE_IOMMUFD, + .class_init = hiod_iommufd_vfio_class_init, } }; -- Gitee From b6830d3caff821b2472e369042c169935c906ef2 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 5 Jun 2024 16:30:35 +0800 Subject: [PATCH 065/134] vfio/container: Implement HostIOMMUDeviceClass::get_cap() handler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Suggested-by: Cédric Le Goater Signed-off-by: Zhenzhong Duan Reviewed-by: Eric Auger Reviewed-by: Michael S. Tsirkin --- hw/vfio/container.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index fbe2bc50d4..ed54ce6d0c 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -1261,11 +1261,26 @@ static bool hiod_legacy_vfio_realize(HostIOMMUDevice *hiod, void *opaque, return true; } +static int hiod_legacy_vfio_get_cap(HostIOMMUDevice *hiod, int cap, + Error **errp) +{ + HostIOMMUDeviceCaps *caps = &hiod->caps; + + switch (cap) { + case HOST_IOMMU_DEVICE_CAP_AW_BITS: + return caps->aw_bits; + default: + error_setg(errp, "%s: unsupported capability %x", hiod->name, cap); + return -EINVAL; + } +} + static void hiod_legacy_vfio_class_init(ObjectClass *oc, void *data) { HostIOMMUDeviceClass *hioc = HOST_IOMMU_DEVICE_CLASS(oc); hioc->realize = hiod_legacy_vfio_realize; + hioc->get_cap = hiod_legacy_vfio_get_cap; }; static const TypeInfo types[] = { -- Gitee From 2f1a2f4b320e70a85cef8392cd5f4b1e54afb9c9 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 5 Jun 2024 16:30:36 +0800 Subject: [PATCH 066/134] backends/iommufd: Implement HostIOMMUDeviceClass::get_cap() handler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Suggested-by: Cédric Le Goater Signed-off-by: Zhenzhong Duan Reviewed-by: Eric Auger Reviewed-by: Michael S. Tsirkin --- backends/iommufd.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/backends/iommufd.c b/backends/iommufd.c index 604a8f4e7d..7e805bd664 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -245,6 +245,28 @@ bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, return true; } +static int hiod_iommufd_get_cap(HostIOMMUDevice *hiod, int cap, Error **errp) +{ + HostIOMMUDeviceCaps *caps = &hiod->caps; + + switch (cap) { + case HOST_IOMMU_DEVICE_CAP_IOMMU_TYPE: + return caps->type; + case HOST_IOMMU_DEVICE_CAP_AW_BITS: + return caps->aw_bits; + default: + error_setg(errp, "%s: unsupported capability %x", hiod->name, cap); + return -EINVAL; + } +} + +static void hiod_iommufd_class_init(ObjectClass *oc, void *data) +{ + HostIOMMUDeviceClass *hioc = HOST_IOMMU_DEVICE_CLASS(oc); + + hioc->get_cap = hiod_iommufd_get_cap; +}; + static const TypeInfo types[] = { { .name = TYPE_IOMMUFD_BACKEND, @@ -261,6 +283,7 @@ static const TypeInfo types[] = { }, { .name = TYPE_HOST_IOMMU_DEVICE_IOMMUFD, .parent = TYPE_HOST_IOMMU_DEVICE, + .class_init = hiod_iommufd_class_init, .abstract = true, } }; -- Gitee From a152921f6d534f2a515b4e88304ad115fae8fa8f Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 5 Jun 2024 16:30:37 +0800 Subject: [PATCH 067/134] vfio: Create host IOMMU device instance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Create host IOMMU device instance in vfio_attach_device() and call .realize() to initialize it further. Introuduce attribute VFIOIOMMUClass::hiod_typename and initialize it based on VFIO backend type. It will facilitate HostIOMMUDevice creation in vfio_attach_device(). Suggested-by: Cédric Le Goater Signed-off-by: Zhenzhong Duan Reviewed-by: Eric Auger Reviewed-by: Michael S. Tsirkin --- hw/vfio/common.c | 18 +++++++++++++++++- hw/vfio/container.c | 2 ++ hw/vfio/iommufd.c | 2 ++ include/hw/vfio/vfio-common.h | 1 + include/hw/vfio/vfio-container-base.h | 3 +++ 5 files changed, 25 insertions(+), 1 deletion(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index b5d02df0c2..d5ff65f90a 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -1650,6 +1650,8 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev, { const VFIOIOMMUClass *ops = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY)); + HostIOMMUDevice *hiod = NULL; + int ret; if (vbasedev->iommufd) { ops = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD)); @@ -1657,7 +1659,20 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev, assert(ops); - return ops->attach_device(name, vbasedev, as, errp); + ret = ops->attach_device(name, vbasedev, as, errp); + if (ret) { + return ret; + } + + hiod = HOST_IOMMU_DEVICE(object_new(ops->hiod_typename)); + if (!HOST_IOMMU_DEVICE_GET_CLASS(hiod)->realize(hiod, vbasedev, errp)) { + object_unref(hiod); + ops->detach_device(vbasedev); + return -1; + } + vbasedev->hiod = hiod; + + return 0; } void vfio_detach_device(VFIODevice *vbasedev) @@ -1665,5 +1680,6 @@ void vfio_detach_device(VFIODevice *vbasedev) if (!vbasedev->bcontainer) { return; } + object_unref(vbasedev->hiod); vbasedev->bcontainer->ops->detach_device(vbasedev); } diff --git a/hw/vfio/container.c b/hw/vfio/container.c index ed54ce6d0c..10f7635425 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -1240,6 +1240,8 @@ static void vfio_iommu_legacy_class_init(ObjectClass *klass, void *data) { VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass); + vioc->hiod_typename = TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO; + vioc->setup = vfio_legacy_setup; vioc->dma_map = vfio_legacy_dma_map; vioc->dma_unmap = vfio_legacy_dma_unmap; diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 2efdba5565..7cbf0e44f1 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -629,6 +629,8 @@ static void vfio_iommu_iommufd_class_init(ObjectClass *klass, void *data) { VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass); + vioc->hiod_typename = TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO; + vioc->dma_map = iommufd_cdev_map; vioc->dma_unmap = iommufd_cdev_unmap; vioc->attach_device = iommufd_cdev_attach; diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 376b8350b9..d45d40c329 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -140,6 +140,7 @@ typedef struct VFIODevice { OnOffAuto pre_copy_dirty_page_tracking; bool dirty_pages_supported; bool dirty_tracking; + HostIOMMUDevice *hiod; int devid; IOMMUFDBackend *iommufd; } VFIODevice; diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index b2813b0c11..7a4c575115 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -109,6 +109,9 @@ DECLARE_CLASS_CHECKERS(VFIOIOMMUClass, VFIO_IOMMU, TYPE_VFIO_IOMMU) struct VFIOIOMMUClass { InterfaceClass parent_class; + /* Properties */ + const char *hiod_typename; + /* basic feature */ int (*setup)(VFIOContainerBase *bcontainer, Error **errp); int (*dma_map)(const VFIOContainerBase *bcontainer, -- Gitee From 03f9b12e33238587da36be24523911fd1b003324 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 5 Jun 2024 16:30:38 +0800 Subject: [PATCH 068/134] hw/pci: Introduce helper function pci_device_get_iommu_bus_devfn() Extract out pci_device_get_iommu_bus_devfn() from pci_device_iommu_address_space() to facilitate implementation of pci_device_[set|unset]_iommu_device() in following patch. No functional change intended. Signed-off-by: Yi Liu Signed-off-by: Yi Sun Signed-off-by: Nicolin Chen Signed-off-by: Zhenzhong Duan Reviewed-by: Eric Auger Reviewed-by: Michael S. Tsirkin --- hw/pci/pci.c | 48 +++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/hw/pci/pci.c b/hw/pci/pci.c index 7467a2a9de..0884fbb760 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -2681,11 +2681,27 @@ static void pci_device_class_base_init(ObjectClass *klass, void *data) } } -AddressSpace *pci_device_iommu_address_space(PCIDevice *dev) +/* + * Get IOMMU root bus, aliased bus and devfn of a PCI device + * + * IOMMU root bus is needed by all call sites to call into iommu_ops. + * For call sites which don't need aliased BDF, passing NULL to + * aliased_[bus|devfn] is allowed. + * + * @piommu_bus: return root #PCIBus backed by an IOMMU for the PCI device. + * + * @aliased_bus: return aliased #PCIBus of the PCI device, optional. + * + * @aliased_devfn: return aliased devfn of the PCI device, optional. + */ +static void pci_device_get_iommu_bus_devfn(PCIDevice *dev, + PCIBus **piommu_bus, + PCIBus **aliased_bus, + int *aliased_devfn) { PCIBus *bus = pci_get_bus(dev); PCIBus *iommu_bus = bus; - uint8_t devfn = dev->devfn; + int devfn = dev->devfn; while (iommu_bus && !iommu_bus->iommu_ops && iommu_bus->parent_dev) { PCIBus *parent_bus = pci_get_bus(iommu_bus->parent_dev); @@ -2726,7 +2742,33 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice *dev) iommu_bus = parent_bus; } - if (!pci_bus_bypass_iommu(bus) && iommu_bus->iommu_ops) { + + assert(0 <= devfn && devfn < PCI_DEVFN_MAX); + assert(iommu_bus); + + if (pci_bus_bypass_iommu(bus) || !iommu_bus->iommu_ops) { + iommu_bus = NULL; + } + + *piommu_bus = iommu_bus; + + if (aliased_bus) { + *aliased_bus = bus; + } + + if (aliased_devfn) { + *aliased_devfn = devfn; + } +} + +AddressSpace *pci_device_iommu_address_space(PCIDevice *dev) +{ + PCIBus *bus; + PCIBus *iommu_bus; + int devfn; + + pci_device_get_iommu_bus_devfn(dev, &iommu_bus, &bus, &devfn); + if (iommu_bus) { return iommu_bus->iommu_ops->get_address_space(bus, iommu_bus->iommu_opaque, devfn); } -- Gitee From 7bc73d38984460315df315d007789f87f4d11994 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 5 Jun 2024 16:30:39 +0800 Subject: [PATCH 069/134] hw/pci: Introduce pci_device_[set|unset]_iommu_device() pci_device_[set|unset]_iommu_device() call pci_device_get_iommu_bus_devfn() to get iommu_bus->iommu_ops and call [set|unset]_iommu_device callback to set/unset HostIOMMUDevice for a given PCI device. Signed-off-by: Yi Liu Signed-off-by: Yi Sun Signed-off-by: Nicolin Chen Signed-off-by: Zhenzhong Duan Reviewed-by: Eric Auger Reviewed-by: Michael S. Tsirkin --- hw/pci/pci.c | 27 +++++++++++++++++++++++++++ include/hw/pci/pci.h | 38 +++++++++++++++++++++++++++++++++++++- 2 files changed, 64 insertions(+), 1 deletion(-) diff --git a/hw/pci/pci.c b/hw/pci/pci.c index 0884fbb760..d6f627aa51 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -2775,6 +2775,33 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice *dev) return &address_space_memory; } +bool pci_device_set_iommu_device(PCIDevice *dev, HostIOMMUDevice *hiod, + Error **errp) +{ + PCIBus *iommu_bus; + + /* set_iommu_device requires device's direct BDF instead of aliased BDF */ + pci_device_get_iommu_bus_devfn(dev, &iommu_bus, NULL, NULL); + if (iommu_bus && iommu_bus->iommu_ops->set_iommu_device) { + return iommu_bus->iommu_ops->set_iommu_device(pci_get_bus(dev), + iommu_bus->iommu_opaque, + dev->devfn, hiod, errp); + } + return true; +} + +void pci_device_unset_iommu_device(PCIDevice *dev) +{ + PCIBus *iommu_bus; + + pci_device_get_iommu_bus_devfn(dev, &iommu_bus, NULL, NULL); + if (iommu_bus && iommu_bus->iommu_ops->unset_iommu_device) { + return iommu_bus->iommu_ops->unset_iommu_device(pci_get_bus(dev), + iommu_bus->iommu_opaque, + dev->devfn); + } +} + void pci_setup_iommu(PCIBus *bus, const PCIIOMMUOps *ops, void *opaque) { /* diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h index cee0cf7460..8d1af44249 100644 --- a/include/hw/pci/pci.h +++ b/include/hw/pci/pci.h @@ -3,6 +3,7 @@ #include "exec/memory.h" #include "sysemu/dma.h" +#include "sysemu/host_iommu_device.h" /* PCI includes legacy ISA access. */ #include "hw/isa/isa.h" @@ -384,10 +385,45 @@ typedef struct PCIIOMMUOps { * * @devfn: device and function number */ - AddressSpace * (*get_address_space)(PCIBus *bus, void *opaque, int devfn); + AddressSpace * (*get_address_space)(PCIBus *bus, void *opaque, int devfn); + /** + * @set_iommu_device: attach a HostIOMMUDevice to a vIOMMU + * + * Optional callback, if not implemented in vIOMMU, then vIOMMU can't + * retrieve host information from the associated HostIOMMUDevice. + * + * @bus: the #PCIBus of the PCI device. + * + * @opaque: the data passed to pci_setup_iommu(). + * + * @devfn: device and function number of the PCI device. + * + * @dev: the #HostIOMMUDevice to attach. + * + * @errp: pass an Error out only when return false + * + * Returns: true if HostIOMMUDevice is attached or else false with errp set. + */ + bool (*set_iommu_device)(PCIBus *bus, void *opaque, int devfn, + HostIOMMUDevice *dev, Error **errp); + /** + * @unset_iommu_device: detach a HostIOMMUDevice from a vIOMMU + * + * Optional callback. + * + * @bus: the #PCIBus of the PCI device. + * + * @opaque: the data passed to pci_setup_iommu(). + * + * @devfn: device and function number of the PCI device. + */ + void (*unset_iommu_device)(PCIBus *bus, void *opaque, int devfn); } PCIIOMMUOps; AddressSpace *pci_device_iommu_address_space(PCIDevice *dev); +bool pci_device_set_iommu_device(PCIDevice *dev, HostIOMMUDevice *hiod, + Error **errp); +void pci_device_unset_iommu_device(PCIDevice *dev); /** * pci_setup_iommu: Initialize specific IOMMU handlers for a PCIBus -- Gitee From dbbf6b33d9ce5f2785972f81919be143e81f866b Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 5 Jun 2024 16:30:40 +0800 Subject: [PATCH 070/134] vfio/pci: Pass HostIOMMUDevice to vIOMMU With HostIOMMUDevice passed, vIOMMU can check compatibility with host IOMMU, call into IOMMUFD specific methods, etc. Originally-by: Yi Liu Signed-off-by: Nicolin Chen Signed-off-by: Yi Sun Signed-off-by: Zhenzhong Duan Reviewed-by: Eric Auger Reviewed-by: Michael S. Tsirkin --- hw/vfio/pci.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index d84a9e73a6..675a608b9c 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -3107,6 +3107,11 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) vfio_bars_register(vdev); + if (!pci_device_set_iommu_device(pdev, vbasedev->hiod, errp)) { + error_prepend(errp, "Failed to set iommu_device: "); + goto out_teardown; + } + ret = vfio_add_capabilities(vdev, errp); if (ret) { goto out_teardown; @@ -3128,7 +3133,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) error_setg(errp, "cannot support IGD OpRegion feature on hotplugged " "device"); - goto out_teardown; + goto out_unset_idev; } ret = vfio_get_dev_region_info(vbasedev, @@ -3137,13 +3142,13 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) if (ret) { error_setg_errno(errp, -ret, "does not support requested IGD OpRegion feature"); - goto out_teardown; + goto out_unset_idev; } ret = vfio_pci_igd_opregion_init(vdev, opregion, errp); g_free(opregion); if (ret) { - goto out_teardown; + goto out_unset_idev; } } @@ -3229,6 +3234,8 @@ out_deregister: if (vdev->intx.mmap_timer) { timer_free(vdev->intx.mmap_timer); } +out_unset_idev: + pci_device_unset_iommu_device(pdev); out_teardown: vfio_teardown_msi(vdev); vfio_bars_exit(vdev); @@ -3257,6 +3264,7 @@ static void vfio_instance_finalize(Object *obj) static void vfio_exitfn(PCIDevice *pdev) { VFIOPCIDevice *vdev = VFIO_PCI(pdev); + VFIODevice *vbasedev = &vdev->vbasedev; vfio_unregister_req_notifier(vdev); vfio_unregister_err_notifier(vdev); @@ -3271,7 +3279,8 @@ static void vfio_exitfn(PCIDevice *pdev) vfio_teardown_msi(vdev); vfio_pci_disable_rp_atomics(vdev); vfio_bars_exit(vdev); - vfio_migration_exit(&vdev->vbasedev); + vfio_migration_exit(vbasedev); + pci_device_unset_iommu_device(pdev); } static void vfio_pci_reset(DeviceState *dev) -- Gitee From a051e4349316d7065c9418de691787edae8e7f4e Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 5 Jun 2024 16:30:41 +0800 Subject: [PATCH 071/134] intel_iommu: Extract out vtd_cap_init() to initialize cap/ecap Extract cap/ecap initialization in vtd_cap_init() to make code cleaner. No functional change intended. Reviewed-by: Eric Auger Signed-off-by: Zhenzhong Duan Reviewed-by: Michael S. Tsirkin --- hw/i386/intel_iommu.c | 93 ++++++++++++++++++++++++------------------- 1 file changed, 51 insertions(+), 42 deletions(-) diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 3da56e439e..6716407b7a 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -3935,30 +3935,10 @@ static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n) return; } -/* Do the initialization. It will also be called when reset, so pay - * attention when adding new initialization stuff. - */ -static void vtd_init(IntelIOMMUState *s) +static void vtd_cap_init(IntelIOMMUState *s) { X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); - memset(s->csr, 0, DMAR_REG_SIZE); - memset(s->wmask, 0, DMAR_REG_SIZE); - memset(s->w1cmask, 0, DMAR_REG_SIZE); - memset(s->womask, 0, DMAR_REG_SIZE); - - s->root = 0; - s->root_scalable = false; - s->dmar_enabled = false; - s->intr_enabled = false; - s->iq_head = 0; - s->iq_tail = 0; - s->iq = 0; - s->iq_size = 0; - s->qi_enabled = false; - s->iq_last_desc_type = VTD_INV_DESC_NONE; - s->iq_dw = false; - s->next_frcd_reg = 0; s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS | VTD_CAP_MGAW(s->aw_bits); @@ -3975,27 +3955,6 @@ static void vtd_init(IntelIOMMUState *s) } s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO; - /* - * Rsvd field masks for spte - */ - vtd_spte_rsvd[0] = ~0ULL; - vtd_spte_rsvd[1] = VTD_SPTE_PAGE_L1_RSVD_MASK(s->aw_bits, - x86_iommu->dt_supported); - vtd_spte_rsvd[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(s->aw_bits); - vtd_spte_rsvd[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(s->aw_bits); - vtd_spte_rsvd[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(s->aw_bits); - - vtd_spte_rsvd_large[2] = VTD_SPTE_LPAGE_L2_RSVD_MASK(s->aw_bits, - x86_iommu->dt_supported); - vtd_spte_rsvd_large[3] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits, - x86_iommu->dt_supported); - - if (s->scalable_mode || s->snoop_control) { - vtd_spte_rsvd[1] &= ~VTD_SPTE_SNP; - vtd_spte_rsvd_large[2] &= ~VTD_SPTE_SNP; - vtd_spte_rsvd_large[3] &= ~VTD_SPTE_SNP; - } - if (x86_iommu_ir_supported(x86_iommu)) { s->ecap |= VTD_ECAP_IR | VTD_ECAP_MHMV; if (s->intr_eim == ON_OFF_AUTO_ON) { @@ -4028,6 +3987,56 @@ static void vtd_init(IntelIOMMUState *s) if (s->pasid) { s->ecap |= VTD_ECAP_PASID; } +} + +/* + * Do the initialization. It will also be called when reset, so pay + * attention when adding new initialization stuff. + */ +static void vtd_init(IntelIOMMUState *s) +{ + X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); + + memset(s->csr, 0, DMAR_REG_SIZE); + memset(s->wmask, 0, DMAR_REG_SIZE); + memset(s->w1cmask, 0, DMAR_REG_SIZE); + memset(s->womask, 0, DMAR_REG_SIZE); + + s->root = 0; + s->root_scalable = false; + s->dmar_enabled = false; + s->intr_enabled = false; + s->iq_head = 0; + s->iq_tail = 0; + s->iq = 0; + s->iq_size = 0; + s->qi_enabled = false; + s->iq_last_desc_type = VTD_INV_DESC_NONE; + s->iq_dw = false; + s->next_frcd_reg = 0; + + vtd_cap_init(s); + + /* + * Rsvd field masks for spte + */ + vtd_spte_rsvd[0] = ~0ULL; + vtd_spte_rsvd[1] = VTD_SPTE_PAGE_L1_RSVD_MASK(s->aw_bits, + x86_iommu->dt_supported); + vtd_spte_rsvd[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(s->aw_bits); + vtd_spte_rsvd[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(s->aw_bits); + vtd_spte_rsvd[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(s->aw_bits); + + vtd_spte_rsvd_large[2] = VTD_SPTE_LPAGE_L2_RSVD_MASK(s->aw_bits, + x86_iommu->dt_supported); + vtd_spte_rsvd_large[3] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits, + x86_iommu->dt_supported); + + if (s->scalable_mode || s->snoop_control) { + vtd_spte_rsvd[1] &= ~VTD_SPTE_SNP; + vtd_spte_rsvd_large[2] &= ~VTD_SPTE_SNP; + vtd_spte_rsvd_large[3] &= ~VTD_SPTE_SNP; + } vtd_reset_caches(s); -- Gitee From 5834bb1ccce592380a91a5cf127f90a031cd7cf2 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 5 Jun 2024 16:30:42 +0800 Subject: [PATCH 072/134] intel_iommu: Implement [set|unset]_iommu_device() callbacks Implement [set|unset]_iommu_device() callbacks in Intel vIOMMU. In set call, we take a reference of HostIOMMUDevice and store it in hash table indexed by PCI BDF. Note this BDF index is device's real BDF not the aliased one which is different from the index of VTDAddressSpace. There can be multiple assigned devices under same virtual iommu group and share same VTDAddressSpace, but each has its own HostIOMMUDevice. Signed-off-by: Yi Liu Signed-off-by: Yi Sun Signed-off-by: Zhenzhong Duan Reviewed-by: Eric Auger Reviewed-by: Michael S. Tsirkin --- hw/i386/intel_iommu.c | 81 +++++++++++++++++++++++++++++++++++ include/hw/i386/intel_iommu.h | 2 + 2 files changed, 83 insertions(+) diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 6716407b7a..bdc14f8438 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -61,6 +61,12 @@ struct vtd_as_key { uint32_t pasid; }; +/* bus/devfn is PCI device's real BDF not the aliased one */ +struct vtd_hiod_key { + PCIBus *bus; + uint8_t devfn; +}; + struct vtd_iotlb_key { uint64_t gfn; uint32_t pasid; @@ -250,6 +256,25 @@ static guint vtd_as_hash(gconstpointer v) return (guint)(value << 8 | key->devfn); } +/* Same implementation as vtd_as_hash() */ +static guint vtd_hiod_hash(gconstpointer v) +{ + return vtd_as_hash(v); +} + +static gboolean vtd_hiod_equal(gconstpointer v1, gconstpointer v2) +{ + const struct vtd_hiod_key *key1 = v1; + const struct vtd_hiod_key *key2 = v2; + + return (key1->bus == key2->bus) && (key1->devfn == key2->devfn); +} + +static void vtd_hiod_destroy(gpointer v) +{ + object_unref(v); +} + static gboolean vtd_hash_remove_by_domain(gpointer key, gpointer value, gpointer user_data) { @@ -3813,6 +3838,58 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, return vtd_dev_as; } +static bool vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn, + HostIOMMUDevice *hiod, Error **errp) +{ + IntelIOMMUState *s = opaque; + struct vtd_as_key key = { + .bus = bus, + .devfn = devfn, + }; + struct vtd_as_key *new_key; + + assert(hiod); + + vtd_iommu_lock(s); + + if (g_hash_table_lookup(s->vtd_host_iommu_dev, &key)) { + error_setg(errp, "Host IOMMU device already exist"); + vtd_iommu_unlock(s); + return false; + } + + new_key = g_malloc(sizeof(*new_key)); + new_key->bus = bus; + new_key->devfn = devfn; + + object_ref(hiod); + g_hash_table_insert(s->vtd_host_iommu_dev, new_key, hiod); + + vtd_iommu_unlock(s); + + return true; +} + +static void vtd_dev_unset_iommu_device(PCIBus *bus, void *opaque, int devfn) +{ + IntelIOMMUState *s = opaque; + struct vtd_as_key key = { + .bus = bus, + .devfn = devfn, + }; + + vtd_iommu_lock(s); + + if (!g_hash_table_lookup(s->vtd_host_iommu_dev, &key)) { + vtd_iommu_unlock(s); + return; + } + + g_hash_table_remove(s->vtd_host_iommu_dev, &key); + + vtd_iommu_unlock(s); +} + /* Unmap the whole range in the notifier's scope. */ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n) { @@ -4117,6 +4194,8 @@ static AddressSpace *vtd_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) static PCIIOMMUOps vtd_iommu_ops = { .get_address_space = vtd_host_dma_iommu, + .set_iommu_device = vtd_dev_set_iommu_device, + .unset_iommu_device = vtd_dev_unset_iommu_device, }; static bool vtd_decide_config(IntelIOMMUState *s, Error **errp) @@ -4240,6 +4319,8 @@ static void vtd_realize(DeviceState *dev, Error **errp) g_free, g_free); s->vtd_address_spaces = g_hash_table_new_full(vtd_as_hash, vtd_as_equal, g_free, g_free); + s->vtd_host_iommu_dev = g_hash_table_new_full(vtd_hiod_hash, vtd_hiod_equal, + g_free, vtd_hiod_destroy); vtd_init(s); pci_setup_iommu(bus, &vtd_iommu_ops, dev); /* Pseudo address space under root PCI bus. */ diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h index 7fa0a695c8..1eb05c29fc 100644 --- a/include/hw/i386/intel_iommu.h +++ b/include/hw/i386/intel_iommu.h @@ -292,6 +292,8 @@ struct IntelIOMMUState { /* list of registered notifiers */ QLIST_HEAD(, VTDAddressSpace) vtd_as_with_notifiers; + GHashTable *vtd_host_iommu_dev; /* HostIOMMUDevice */ + /* interrupt remapping */ bool intr_enabled; /* Whether guest enabled IR */ dma_addr_t intr_root; /* Interrupt remapping table pointer */ -- Gitee From 4ef1b086272552378c09356b0e9fd2548a27a621 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 5 Jun 2024 16:30:43 +0800 Subject: [PATCH 073/134] intel_iommu: Check compatibility with host IOMMU capabilities If check fails, host device (either VFIO or VDPA device) is not compatible with current vIOMMU config and should not be passed to guest. Only aw_bits is checked for now, we don't care about other caps before scalable modern mode is introduced. Signed-off-by: Yi Liu Signed-off-by: Zhenzhong Duan Reviewed-by: Eric Auger Reviewed-by: Michael S. Tsirkin --- hw/i386/intel_iommu.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index bdc14f8438..60d86e0cb6 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -3838,6 +3838,30 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, return vtd_dev_as; } +static bool vtd_check_hiod(IntelIOMMUState *s, HostIOMMUDevice *hiod, + Error **errp) +{ + HostIOMMUDeviceClass *hiodc = HOST_IOMMU_DEVICE_GET_CLASS(hiod); + int ret; + + if (!hiodc->get_cap) { + error_setg(errp, ".get_cap() not implemented"); + return false; + } + + /* Common checks */ + ret = hiodc->get_cap(hiod, HOST_IOMMU_DEVICE_CAP_AW_BITS, errp); + if (ret < 0) { + return false; + } + if (s->aw_bits > ret) { + error_setg(errp, "aw-bits %d > host aw-bits %d", s->aw_bits, ret); + return false; + } + + return true; +} + static bool vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn, HostIOMMUDevice *hiod, Error **errp) { @@ -3858,6 +3882,11 @@ static bool vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn, return false; } + if (!vtd_check_hiod(s, hiod, errp)) { + vtd_iommu_unlock(s); + return false; + } + new_key = g_malloc(sizeof(*new_key)); new_key->bus = bus; new_key->devfn = devfn; -- Gitee From 92da638c3a97679ab4d9f497ae5c7bf652e7bf99 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 19 Jul 2024 13:04:49 +0100 Subject: [PATCH 074/134] vfio/pci: Extract mdev check into an helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation to skip initialization of the HostIOMMUDevice for mdev, extract the checks that validate if a device is an mdev into helpers. A vfio_device_is_mdev() is created, and subsystems consult VFIODevice::mdev to check if it's mdev or not. Signed-off-by: Joao Martins Reviewed-by: Cédric Le Goater Reviewed-by: Zhenzhong Duan Reviewed-by: Eric Auger --- hw/vfio/helpers.c | 14 ++++++++++++++ hw/vfio/pci.c | 12 +++--------- include/hw/vfio/vfio-common.h | 2 ++ 3 files changed, 19 insertions(+), 9 deletions(-) diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c index 35b8e42304..37bc383c69 100644 --- a/hw/vfio/helpers.c +++ b/hw/vfio/helpers.c @@ -680,3 +680,17 @@ int vfio_device_get_aw_bits(VFIODevice *vdev) return HOST_IOMMU_DEVICE_CAP_AW_BITS_MAX; } + +bool vfio_device_is_mdev(VFIODevice *vbasedev) +{ + g_autofree char *subsys = NULL; + g_autofree char *tmp = NULL; + + if (!vbasedev->sysfsdev) { + return false; + } + + tmp = g_strdup_printf("%s/subsystem", vbasedev->sysfsdev); + subsys = realpath(tmp, NULL); + return subsys && (strcmp(subsys, "/sys/bus/mdev") == 0); +} diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 675a608b9c..de040e73ca 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -2942,10 +2942,8 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) { VFIOPCIDevice *vdev = VFIO_PCI(pdev); VFIODevice *vbasedev = &vdev->vbasedev; - char *tmp, *subsys; Error *err = NULL; int i, ret; - bool is_mdev; char uuid[UUID_STR_LEN]; char *name; @@ -2976,15 +2974,11 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) * stays in sync with the active working set of the guest driver. Prevent * the x-balloon-allowed option unless this is minimally an mdev device. */ - tmp = g_strdup_printf("%s/subsystem", vbasedev->sysfsdev); - subsys = realpath(tmp, NULL); - g_free(tmp); - is_mdev = subsys && (strcmp(subsys, "/sys/bus/mdev") == 0); - free(subsys); + vbasedev->mdev = vfio_device_is_mdev(vbasedev); - trace_vfio_mdev(vbasedev->name, is_mdev); + trace_vfio_mdev(vbasedev->name, vbasedev->mdev); - if (vbasedev->ram_block_discard_allowed && !is_mdev) { + if (vbasedev->ram_block_discard_allowed && !vbasedev->mdev) { error_setg(errp, "x-balloon-allowed only potentially compatible " "with mdev devices"); goto error; diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index d45d40c329..e49e5fabba 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -126,6 +126,7 @@ typedef struct VFIODevice { DeviceState *dev; int fd; int type; + bool mdev; bool reset_works; bool needs_reset; bool no_mmap; @@ -219,6 +220,7 @@ void vfio_region_exit(VFIORegion *region); void vfio_region_finalize(VFIORegion *region); void vfio_reset_handler(void *opaque); struct vfio_device_info *vfio_get_device_info(int fd); +bool vfio_device_is_mdev(VFIODevice *vbasedev); int vfio_attach_device(char *name, VFIODevice *vbasedev, AddressSpace *as, Error **errp); void vfio_detach_device(VFIODevice *vbasedev); -- Gitee From b2d58d5b474633514c3195d6948e1cd2a9c78d67 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 19 Jul 2024 13:04:50 +0100 Subject: [PATCH 075/134] vfio/iommufd: Don't initialize nor set a HOST_IOMMU_DEVICE with mdev mdevs aren't "physical" devices and when asking for backing IOMMU info, it fails the entire provisioning of the guest. Fix that by skipping HostIOMMUDevice initialization in the presence of mdevs, and skip setting an iommu device when it is known to be an mdev. Cc: Zhenzhong Duan Fixes: 930589520128 ("vfio/iommufd: Implement HostIOMMUDeviceClass::realize() handler") Signed-off-by: Joao Martins Reviewed-by: Eric Auger Reviewed-by: Zhenzhong Duan --- hw/vfio/common.c | 4 ++++ hw/vfio/pci.c | 7 +++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index d5ff65f90a..ceb1da0b94 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -1664,6 +1664,10 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev, return ret; } + if (vbasedev->mdev) { + return true; + } + hiod = HOST_IOMMU_DEVICE(object_new(ops->hiod_typename)); if (!HOST_IOMMU_DEVICE_GET_CLASS(hiod)->realize(hiod, vbasedev, errp)) { object_unref(hiod); diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index de040e73ca..19211f4368 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -3101,7 +3101,8 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) vfio_bars_register(vdev); - if (!pci_device_set_iommu_device(pdev, vbasedev->hiod, errp)) { + if (!vbasedev->mdev && + !pci_device_set_iommu_device(pdev, vbasedev->hiod, errp)) { error_prepend(errp, "Failed to set iommu_device: "); goto out_teardown; } @@ -3229,7 +3230,9 @@ out_deregister: timer_free(vdev->intx.mmap_timer); } out_unset_idev: - pci_device_unset_iommu_device(pdev); + if (!vbasedev->mdev) { + pci_device_unset_iommu_device(pdev); + } out_teardown: vfio_teardown_msi(vdev); vfio_bars_exit(vdev); -- Gitee From 7d53d0938921d0faa32e1fef4c7bcc45d21f9bfb Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 19 Jul 2024 13:04:51 +0100 Subject: [PATCH 076/134] backends/iommufd: Extend iommufd_backend_get_device_info() to fetch HW capabilities MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The helper will be able to fetch vendor agnostic IOMMU capabilities supported both by hardware and software. Right now it is only iommu dirty tracking. Signed-off-by: Joao Martins Reviewed-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Reviewed-by: Eric Auger --- backends/iommufd.c | 4 +++- hw/vfio/iommufd.c | 4 +++- include/sysemu/iommufd.h | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/backends/iommufd.c b/backends/iommufd.c index 7e805bd664..1ce2a24226 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -225,7 +225,7 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, uint32_t *type, void *data, uint32_t len, - Error **errp) + uint64_t *caps, Error **errp) { struct iommu_hw_info info = { .size = sizeof(info), @@ -241,6 +241,8 @@ bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, g_assert(type); *type = info.out_data_type; + g_assert(caps); + *caps = info.out_capabilities; return true; } diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 7cbf0e44f1..d5b923ca83 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -647,9 +647,11 @@ static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, union { struct iommu_hw_info_vtd vtd; } data; + uint64_t hw_caps; if (!iommufd_backend_get_device_info(vdev->iommufd, vdev->devid, - &type, &data, sizeof(data), errp)) { + &type, &data, sizeof(data), + &hw_caps, errp)) { return false; } diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h index dfade18e6d..a0a0143856 100644 --- a/include/sysemu/iommufd.h +++ b/include/sysemu/iommufd.h @@ -51,7 +51,7 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova, ram_addr_t size); bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, uint32_t *type, void *data, uint32_t len, - Error **errp); + uint64_t *caps, Error **errp); #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd" #endif -- Gitee From 56e5b9cf8e4041a023daca1ce439ca14619afa97 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 19 Jul 2024 13:04:52 +0100 Subject: [PATCH 077/134] vfio/iommufd: Return errno in iommufd_cdev_attach_ioas_hwpt() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation to implement auto domains have the attach function return the errno it got during domain attach instead of a bool. -EINVAL is tracked to track domain incompatibilities, and decide whether to create a new IOMMU domain. Signed-off-by: Joao Martins Reviewed-by: Cédric Le Goater Reviewed-by: Eric Auger Reviewed-by: Zhenzhong Duan --- hw/vfio/iommufd.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index d5b923ca83..5e7788ed59 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -200,11 +200,12 @@ static int iommufd_cdev_attach_ioas_hwpt(VFIODevice *vbasedev, uint32_t id, error_setg_errno(errp, errno, "[iommufd=%d] error attach %s (%d) to id=%d", iommufd, vbasedev->name, vbasedev->fd, id); - } else { - trace_iommufd_cdev_attach_ioas_hwpt(iommufd, vbasedev->name, - vbasedev->fd, id); + return -errno; } - return ret; + + trace_iommufd_cdev_attach_ioas_hwpt(iommufd, vbasedev->name, + vbasedev->fd, id); + return 0; } static int iommufd_cdev_detach_ioas_hwpt(VFIODevice *vbasedev, Error **errp) -- Gitee From 44d573b10c45746e81d0d1786fe61d45160f2181 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Mon, 22 Jul 2024 15:07:12 +0800 Subject: [PATCH 078/134] vfio/ap: Don't initialize HOST_IOMMU_DEVICE with mdev mdevs aren't "physical" devices and when asking for backing IOMMU info, it fails the entire provisioning of the guest. Fix that by setting vbasedev->mdev true so skipping HostIOMMUDevice initialization in the presence of mdevs. Fixes: 930589520128 ("vfio/iommufd: Implement HostIOMMUDeviceClass::realize() handler") Signed-off-by: Zhenzhong Duan Reviewed-by: Joao Martins Reviewed-by: Eric Auger --- hw/vfio/ap.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c index e157aa1ff7..6b2bc32549 100644 --- a/hw/vfio/ap.c +++ b/hw/vfio/ap.c @@ -234,6 +234,9 @@ static void vfio_ap_instance_init(Object *obj) */ vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_AP, &vfio_ap_ops, DEVICE(vapdev), true); + + /* AP device is mdev type device */ + vbasedev->mdev = true; } #ifdef CONFIG_IOMMUFD -- Gitee From ffcda8cc141e14528fd73aea750be822575eedcc Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Mon, 22 Jul 2024 15:07:13 +0800 Subject: [PATCH 079/134] vfio/ccw: Don't initialize HOST_IOMMU_DEVICE with mdev mdevs aren't "physical" devices and when asking for backing IOMMU info, it fails the entire provisioning of the guest. Fix that by setting vbasedev->mdev true so skipping HostIOMMUDevice initialization in the presence of mdevs. Fixes: 930589520128 ("vfio/iommufd: Implement HostIOMMUDeviceClass::realize() handler") Signed-off-by: Zhenzhong Duan Reviewed-by: Joao Martins Acked-by: Eric Farman Reviewed-by: Eric Auger --- hw/vfio/ccw.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c index 90e4a53437..257e9723cf 100644 --- a/hw/vfio/ccw.c +++ b/hw/vfio/ccw.c @@ -683,6 +683,9 @@ static void vfio_ccw_instance_init(Object *obj) VFIOCCWDevice *vcdev = VFIO_CCW(obj); VFIODevice *vbasedev = &vcdev->vdev; + /* CCW device is mdev type device */ + vbasedev->mdev = true; + /* * All vfio-ccw devices are believed to operate in a way compatible with * discarding of memory in RAM blocks, ie. pages pinned in the host are -- Gitee From 630efd6ca2f0c9383223f0ea092abda1c7528f21 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Mon, 22 Jul 2024 22:13:18 +0100 Subject: [PATCH 080/134] vfio/iommufd: Introduce auto domain creation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There's generally two modes of operation for IOMMUFD: 1) The simple user API which intends to perform relatively simple things with IOMMUs e.g. DPDK. The process generally creates an IOAS and attaches to VFIO and mainly performs IOAS_MAP and UNMAP. 2) The native IOMMUFD API where you have fine grained control of the IOMMU domain and model it accordingly. This is where most new feature are being steered to. For dirty tracking 2) is required, as it needs to ensure that the stage-2/parent IOMMU domain will only attach devices that support dirty tracking (so far it is all homogeneous in x86, likely not the case for smmuv3). Such invariant on dirty tracking provides a useful guarantee to VMMs that will refuse incompatible device attachments for IOMMU domains. Dirty tracking insurance is enforced via HWPT_ALLOC, which is responsible for creating an IOMMU domain. This is contrast to the 'simple API' where the IOMMU domain is created by IOMMUFD automatically when it attaches to VFIO (usually referred as autodomains) but it has the needed handling for mdevs. To support dirty tracking with the advanced IOMMUFD API, it needs similar logic, where IOMMU domains are created and devices attached to compatible domains. Essentially mimicking kernel iommufd_device_auto_get_domain(). With mdevs given there's no IOMMU domain it falls back to IOAS attach. The auto domain logic allows different IOMMU domains to be created when DMA dirty tracking is not desired (and VF can provide it), and others where it is. Here it is not used in this way given how VFIODevice migration state is initialized after the device attachment. But such mixed mode of IOMMU dirty tracking + device dirty tracking is an improvement that can be added on. Keep the 'all of nothing' of type1 approach that we have been using so far between container vs device dirty tracking. Signed-off-by: Joao Martins Reviewed-by: Zhenzhong Duan [ clg: Added ERRP_GUARD() in iommufd_cdev_autodomains_get() ] Signed-off-by: Cédric Le Goater Reviewed-by: Eric Auger [Shameer: Changed ret for iommufd_cdev_autodomains_get() ] Signed-off-by: Shameer Kolothum --- backends/iommufd.c | 30 +++++++++++++ backends/trace-events | 1 + hw/vfio/iommufd.c | 85 +++++++++++++++++++++++++++++++++++ include/hw/vfio/vfio-common.h | 9 ++++ include/sysemu/iommufd.h | 5 +++ 5 files changed, 130 insertions(+) diff --git a/backends/iommufd.c b/backends/iommufd.c index 1ce2a24226..0d995d7563 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -223,6 +223,36 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, return ret; } +bool iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id, + uint32_t pt_id, uint32_t flags, + uint32_t data_type, uint32_t data_len, + void *data_ptr, uint32_t *out_hwpt, + Error **errp) +{ + int ret, fd = be->fd; + struct iommu_hwpt_alloc alloc_hwpt = { + .size = sizeof(struct iommu_hwpt_alloc), + .flags = flags, + .dev_id = dev_id, + .pt_id = pt_id, + .data_type = data_type, + .data_len = data_len, + .data_uptr = (uintptr_t)data_ptr, + }; + + ret = ioctl(fd, IOMMU_HWPT_ALLOC, &alloc_hwpt); + trace_iommufd_backend_alloc_hwpt(fd, dev_id, pt_id, flags, data_type, + data_len, (uintptr_t)data_ptr, + alloc_hwpt.out_hwpt_id, ret); + if (ret) { + error_setg_errno(errp, errno, "Failed to allocate hwpt"); + return false; + } + + *out_hwpt = alloc_hwpt.out_hwpt_id; + return true; +} + bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, uint32_t *type, void *data, uint32_t len, uint64_t *caps, Error **errp) diff --git a/backends/trace-events b/backends/trace-events index d45c6e31a6..e248bf039e 100644 --- a/backends/trace-events +++ b/backends/trace-events @@ -14,4 +14,5 @@ iommufd_backend_map_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size iommufd_backend_unmap_dma_non_exist(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " Unmap nonexistent mapping: iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)" iommufd_backend_unmap_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)" iommufd_backend_alloc_ioas(int iommufd, uint32_t ioas, int ret) " iommufd=%d ioas=%d (%d)" +iommufd_backend_alloc_hwpt(int iommufd, uint32_t dev_id, uint32_t pt_id, uint32_t flags, uint32_t hwpt_type, uint32_t len, uint64_t data_ptr, uint32_t out_hwpt_id, int ret) " iommufd=%d dev_id=%u pt_id=%u flags=0x%x hwpt_type=%u len=%u data_ptr=0x%"PRIx64" out_hwpt=%u (%d)" iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d id=%d (%d)" diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 5e7788ed59..3b75cba26c 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -225,10 +225,89 @@ static int iommufd_cdev_detach_ioas_hwpt(VFIODevice *vbasedev, Error **errp) return ret; } +static int iommufd_cdev_autodomains_get(VFIODevice *vbasedev, + VFIOIOMMUFDContainer *container, + Error **errp) +{ + ERRP_GUARD(); + IOMMUFDBackend *iommufd = vbasedev->iommufd; + uint32_t flags = 0; + VFIOIOASHwpt *hwpt; + uint32_t hwpt_id; + int ret; + + /* Try to find a domain */ + QLIST_FOREACH(hwpt, &container->hwpt_list, next) { + ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, errp); + if (ret) { + /* -EINVAL means the domain is incompatible with the device. */ + if (ret == -EINVAL) { + /* + * It is an expected failure and it just means we will try + * another domain, or create one if no existing compatible + * domain is found. Hence why the error is discarded below. + */ + error_free(*errp); + *errp = NULL; + continue; + } + + return ret; + } else { + vbasedev->hwpt = hwpt; + QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next); + return 0; + } + } + + if (!iommufd_backend_alloc_hwpt(iommufd, vbasedev->devid, + container->ioas_id, flags, + IOMMU_HWPT_DATA_NONE, 0, NULL, + &hwpt_id, errp)) { + return -EINVAL; + } + + hwpt = g_malloc0(sizeof(*hwpt)); + hwpt->hwpt_id = hwpt_id; + QLIST_INIT(&hwpt->device_list); + + ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, errp); + if (ret) { + iommufd_backend_free_id(container->be, hwpt->hwpt_id); + g_free(hwpt); + return ret; + } + + vbasedev->hwpt = hwpt; + QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next); + QLIST_INSERT_HEAD(&container->hwpt_list, hwpt, next); + return 0; +} + +static void iommufd_cdev_autodomains_put(VFIODevice *vbasedev, + VFIOIOMMUFDContainer *container) +{ + VFIOIOASHwpt *hwpt = vbasedev->hwpt; + + QLIST_REMOVE(vbasedev, hwpt_next); + vbasedev->hwpt = NULL; + + if (QLIST_EMPTY(&hwpt->device_list)) { + QLIST_REMOVE(hwpt, next); + iommufd_backend_free_id(container->be, hwpt->hwpt_id); + g_free(hwpt); + } +} + static int iommufd_cdev_attach_container(VFIODevice *vbasedev, VFIOIOMMUFDContainer *container, Error **errp) { + /* mdevs aren't physical devices and will fail with auto domains */ + if (!vbasedev->mdev) { + return iommufd_cdev_autodomains_get(vbasedev, container, errp); + } + return iommufd_cdev_attach_ioas_hwpt(vbasedev, container->ioas_id, errp); } @@ -240,6 +319,11 @@ static void iommufd_cdev_detach_container(VFIODevice *vbasedev, if (iommufd_cdev_detach_ioas_hwpt(vbasedev, &err)) { error_report_err(err); } + + if (vbasedev->hwpt) { + iommufd_cdev_autodomains_put(vbasedev, container); + } + } static void iommufd_cdev_container_destroy(VFIOIOMMUFDContainer *container) @@ -375,6 +459,7 @@ static int iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, container = g_malloc0(sizeof(*container)); container->be = vbasedev->iommufd; container->ioas_id = ioas_id; + QLIST_INIT(&container->hwpt_list); bcontainer = &container->bcontainer; vfio_container_init(bcontainer, space, iommufd_vioc); diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index e49e5fabba..2093ed2e91 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -107,10 +107,17 @@ typedef struct VFIOHostDMAWindow { typedef struct IOMMUFDBackend IOMMUFDBackend; +typedef struct VFIOIOASHwpt { + uint32_t hwpt_id; + QLIST_HEAD(, VFIODevice) device_list; + QLIST_ENTRY(VFIOIOASHwpt) next; +} VFIOIOASHwpt; + typedef struct VFIOIOMMUFDContainer { VFIOContainerBase bcontainer; IOMMUFDBackend *be; uint32_t ioas_id; + QLIST_HEAD(, VFIOIOASHwpt) hwpt_list; } VFIOIOMMUFDContainer; typedef struct VFIODeviceOps VFIODeviceOps; @@ -144,6 +151,8 @@ typedef struct VFIODevice { HostIOMMUDevice *hiod; int devid; IOMMUFDBackend *iommufd; + VFIOIOASHwpt *hwpt; + QLIST_ENTRY(VFIODevice) hwpt_next; } VFIODevice; struct VFIODeviceOps { diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h index a0a0143856..f6f01e4be8 100644 --- a/include/sysemu/iommufd.h +++ b/include/sysemu/iommufd.h @@ -52,6 +52,11 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, uint32_t *type, void *data, uint32_t len, uint64_t *caps, Error **errp); +bool iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id, + uint32_t pt_id, uint32_t flags, + uint32_t data_type, uint32_t data_len, + void *data_ptr, uint32_t *out_hwpt, + Error **errp); #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd" #endif -- Gitee From 35f33bf18826286c9e9fc739a893b9915c71f43c Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Fri, 14 Jun 2024 11:52:51 +0200 Subject: [PATCH 081/134] HostIOMMUDevice: Store the VFIO/VDPA agent Store the agent device (VFIO or VDPA) in the host IOMMU device. This will allow easy access to some of its resources. Signed-off-by: Eric Auger Reviewed-by: Zhenzhong Duan Reviewed-by: Michael S. Tsirkin --- hw/vfio/container.c | 1 + hw/vfio/iommufd.c | 2 ++ include/sysemu/host_iommu_device.h | 1 + 3 files changed, 4 insertions(+) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 10f7635425..8a5a112b6b 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -1259,6 +1259,7 @@ static bool hiod_legacy_vfio_realize(HostIOMMUDevice *hiod, void *opaque, hiod->name = g_strdup(vdev->name); hiod->caps.aw_bits = vfio_device_get_aw_bits(vdev); + hiod->agent = opaque; return true; } diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 3b75cba26c..7a069ca576 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -735,6 +735,8 @@ static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, } data; uint64_t hw_caps; + hiod->agent = opaque; + if (!iommufd_backend_get_device_info(vdev->iommufd, vdev->devid, &type, &data, sizeof(data), &hw_caps, errp)) { diff --git a/include/sysemu/host_iommu_device.h b/include/sysemu/host_iommu_device.h index a57873958b..3e5f058e7b 100644 --- a/include/sysemu/host_iommu_device.h +++ b/include/sysemu/host_iommu_device.h @@ -34,6 +34,7 @@ struct HostIOMMUDevice { Object parent_obj; char *name; + void *agent; /* pointer to agent device, ie. VFIO or VDPA device */ HostIOMMUDeviceCaps caps; }; -- Gitee From 7d3634d73af1f53549eba4b3d50bb8f9f49a5243 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Mon, 22 Jul 2024 22:13:19 +0100 Subject: [PATCH 082/134] vfio/{iommufd,container}: Remove caps::aw_bits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove caps::aw_bits which requires the bcontainer::iova_ranges being initialized after device is actually attached. Instead defer that to .get_cap() and call vfio_device_get_aw_bits() directly. This is in preparation for HostIOMMUDevice::realize() being called early during attach_device(). Suggested-by: Zhenzhong Duan Signed-off-by: Joao Martins Reviewed-by: Cédric Le Goater Reviewed-by: Eric Auger --- backends/iommufd.c | 3 ++- hw/vfio/container.c | 5 +---- hw/vfio/iommufd.c | 1 - include/sysemu/host_iommu_device.h | 3 --- 4 files changed, 3 insertions(+), 9 deletions(-) diff --git a/backends/iommufd.c b/backends/iommufd.c index 0d995d7563..4aebf54765 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -19,6 +19,7 @@ #include "qemu/error-report.h" #include "monitor/monitor.h" #include "trace.h" +#include "hw/vfio/vfio-common.h" #include #include @@ -285,7 +286,7 @@ static int hiod_iommufd_get_cap(HostIOMMUDevice *hiod, int cap, Error **errp) case HOST_IOMMU_DEVICE_CAP_IOMMU_TYPE: return caps->type; case HOST_IOMMU_DEVICE_CAP_AW_BITS: - return caps->aw_bits; + return vfio_device_get_aw_bits(hiod->agent); default: error_setg(errp, "%s: unsupported capability %x", hiod->name, cap); return -EINVAL; diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 8a5a112b6b..30a62348d3 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -1258,7 +1258,6 @@ static bool hiod_legacy_vfio_realize(HostIOMMUDevice *hiod, void *opaque, VFIODevice *vdev = opaque; hiod->name = g_strdup(vdev->name); - hiod->caps.aw_bits = vfio_device_get_aw_bits(vdev); hiod->agent = opaque; return true; @@ -1267,11 +1266,9 @@ static bool hiod_legacy_vfio_realize(HostIOMMUDevice *hiod, void *opaque, static int hiod_legacy_vfio_get_cap(HostIOMMUDevice *hiod, int cap, Error **errp) { - HostIOMMUDeviceCaps *caps = &hiod->caps; - switch (cap) { case HOST_IOMMU_DEVICE_CAP_AW_BITS: - return caps->aw_bits; + return vfio_device_get_aw_bits(hiod->agent); default: error_setg(errp, "%s: unsupported capability %x", hiod->name, cap); return -EINVAL; diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 7a069ca576..06e6a400be 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -745,7 +745,6 @@ static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, hiod->name = g_strdup(vdev->name); caps->type = type; - caps->aw_bits = vfio_device_get_aw_bits(vdev); return true; } diff --git a/include/sysemu/host_iommu_device.h b/include/sysemu/host_iommu_device.h index 3e5f058e7b..f586908945 100644 --- a/include/sysemu/host_iommu_device.h +++ b/include/sysemu/host_iommu_device.h @@ -19,12 +19,9 @@ * struct HostIOMMUDeviceCaps - Define host IOMMU device capabilities. * * @type: host platform IOMMU type. - * - * @aw_bits: host IOMMU address width. 0xff if no limitation. */ typedef struct HostIOMMUDeviceCaps { uint32_t type; - uint8_t aw_bits; } HostIOMMUDeviceCaps; #define TYPE_HOST_IOMMU_DEVICE "host-iommu-device" -- Gitee From 72660b98e799248338588fe97f191c544c073806 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Mon, 22 Jul 2024 22:13:20 +0100 Subject: [PATCH 083/134] vfio/iommufd: Add hw_caps field to HostIOMMUDeviceCaps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Store the value of @caps returned by iommufd_backend_get_device_info() in a new field HostIOMMUDeviceCaps::hw_caps. Right now the only value is whether device IOMMU supports dirty tracking (IOMMU_HW_CAP_DIRTY_TRACKING). This is in preparation for HostIOMMUDevice::realize() being called early during attach_device(). Signed-off-by: Joao Martins Reviewed-by: Cédric Le Goater Reviewed-by: Zhenzhong Duan Reviewed-by: Eric Auger --- hw/vfio/iommufd.c | 1 + include/sysemu/host_iommu_device.h | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 06e6a400be..d9088705de 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -745,6 +745,7 @@ static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, hiod->name = g_strdup(vdev->name); caps->type = type; + caps->hw_caps = hw_caps; return true; } diff --git a/include/sysemu/host_iommu_device.h b/include/sysemu/host_iommu_device.h index f586908945..e4d8300350 100644 --- a/include/sysemu/host_iommu_device.h +++ b/include/sysemu/host_iommu_device.h @@ -19,9 +19,13 @@ * struct HostIOMMUDeviceCaps - Define host IOMMU device capabilities. * * @type: host platform IOMMU type. + * + * @hw_caps: host platform IOMMU capabilities (e.g. on IOMMUFD this represents + * the @out_capabilities value returned from IOMMU_GET_HW_INFO ioctl) */ typedef struct HostIOMMUDeviceCaps { uint32_t type; + uint64_t hw_caps; } HostIOMMUDeviceCaps; #define TYPE_HOST_IOMMU_DEVICE "host-iommu-device" -- Gitee From 2276a3a175576a63da6abd5ccb309dd1cdbc4021 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Mon, 22 Jul 2024 22:13:21 +0100 Subject: [PATCH 084/134] vfio/{iommufd, container}: Invoke HostIOMMUDevice::realize() during attach_device() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the HostIOMMUDevice::realize() to be invoked during the attach of the device before we allocate IOMMUFD hardware pagetable objects (HWPT). This allows the use of the hw_caps obtained by IOMMU_GET_HW_INFO that essentially tell if the IOMMU behind the device supports dirty tracking. Note: The HostIOMMUDevice data from legacy backend is static and doesn't need any information from the (type1-iommu) backend to be initialized. In contrast however, the IOMMUFD HostIOMMUDevice data requires the iommufd FD to be connected and having a devid to be able to successfully GET_HW_INFO. This means vfio_device_hiod_realize() is called in different places within the backend .attach_device() implementation. Suggested-by: Cédric Le Goater Signed-off-by: Joao Martins Reviewed-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater [ clg: Fixed error handling in iommufd_cdev_attach() ] Signed-off-by: Cédric Le Goater Reviewed-by: Eric Auger --- hw/vfio/common.c | 19 +++++++------------ hw/vfio/container.c | 4 ++++ hw/vfio/helpers.c | 11 +++++++++++ hw/vfio/iommufd.c | 11 +++++++++++ include/hw/vfio/vfio-common.h | 1 + 5 files changed, 34 insertions(+), 12 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index ceb1da0b94..65e1c9f810 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -1659,22 +1659,17 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev, assert(ops); - ret = ops->attach_device(name, vbasedev, as, errp); - if (ret) { - return ret; - } - - if (vbasedev->mdev) { - return true; + if (!vbasedev->mdev) { + hiod = HOST_IOMMU_DEVICE(object_new(ops->hiod_typename)); + vbasedev->hiod = hiod; } - hiod = HOST_IOMMU_DEVICE(object_new(ops->hiod_typename)); - if (!HOST_IOMMU_DEVICE_GET_CLASS(hiod)->realize(hiod, vbasedev, errp)) { + ret = ops->attach_device(name, vbasedev, as, errp); + if (ret) { object_unref(hiod); - ops->detach_device(vbasedev); - return -1; + vbasedev->hiod = NULL; + return ret; } - vbasedev->hiod = hiod; return 0; } diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 30a62348d3..64eacfd912 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -1030,6 +1030,10 @@ static int vfio_legacy_attach_device(const char *name, VFIODevice *vbasedev, trace_vfio_attach_device(vbasedev->name, groupid); + if (!vfio_device_hiod_realize(vbasedev, errp)) { + return false; + } + group = vfio_get_group(groupid, as, errp); if (!group) { return -ENOENT; diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c index 37bc383c69..1f3bfed917 100644 --- a/hw/vfio/helpers.c +++ b/hw/vfio/helpers.c @@ -694,3 +694,14 @@ bool vfio_device_is_mdev(VFIODevice *vbasedev) subsys = realpath(tmp, NULL); return subsys && (strcmp(subsys, "/sys/bus/mdev") == 0); } + +bool vfio_device_hiod_realize(VFIODevice *vbasedev, Error **errp) +{ + HostIOMMUDevice *hiod = vbasedev->hiod; + + if (!hiod) { + return true; + } + + return HOST_IOMMU_DEVICE_GET_CLASS(hiod)->realize(hiod, vbasedev, errp); +} diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index d9088705de..8fd6826826 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -424,6 +424,17 @@ static int iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, space = vfio_get_address_space(as); + /* + * The HostIOMMUDevice data from legacy backend is static and doesn't need + * any information from the (type1-iommu) backend to be initialized. In + * contrast however, the IOMMUFD HostIOMMUDevice data requires the iommufd + * FD to be connected and having a devid to be able to successfully call + * iommufd_backend_get_device_info(). + */ + if (!vfio_device_hiod_realize(vbasedev, errp)) { + goto err_alloc_ioas; + } + /* try to attach to an existing container in this space */ QLIST_FOREACH(bcontainer, &space->containers, next) { container = container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 2093ed2e91..63da291456 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -230,6 +230,7 @@ void vfio_region_finalize(VFIORegion *region); void vfio_reset_handler(void *opaque); struct vfio_device_info *vfio_get_device_info(int fd); bool vfio_device_is_mdev(VFIODevice *vbasedev); +bool vfio_device_hiod_realize(VFIODevice *vbasedev, Error **errp); int vfio_attach_device(char *name, VFIODevice *vbasedev, AddressSpace *as, Error **errp); void vfio_detach_device(VFIODevice *vbasedev); -- Gitee From db8ef4524568c2379c25986db6e30cb0f6c0ec05 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Mon, 22 Jul 2024 22:13:22 +0100 Subject: [PATCH 085/134] vfio/iommufd: Probe and request hwpt dirty tracking capability MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation to using the dirty tracking UAPI, probe whether the IOMMU supports dirty tracking. This is done via the data stored in hiod::caps::hw_caps initialized from GET_HW_INFO. Qemu doesn't know if VF dirty tracking is supported when allocating hardware pagetable in iommufd_cdev_autodomains_get(). This is because VFIODevice migration state hasn't been initialized *yet* hence it can't pick between VF dirty tracking vs IOMMU dirty tracking. So, if IOMMU supports dirty tracking it always creates HWPTs with IOMMU_HWPT_ALLOC_DIRTY_TRACKING even if later on VFIOMigration decides to use VF dirty tracking instead. Signed-off-by: Joao Martins [ clg: - Fixed vbasedev->iommu_dirty_tracking assignment in iommufd_cdev_autodomains_get() - Added warning for heterogeneous dirty page tracking support in iommufd_cdev_autodomains_get() ] Signed-off-by: Cédric Le Goater Reviewed-by: Zhenzhong Duan --- hw/vfio/iommufd.c | 26 ++++++++++++++++++++++++++ include/hw/vfio/vfio-common.h | 2 ++ 2 files changed, 28 insertions(+) diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 8fd6826826..a9400d8107 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -114,6 +114,11 @@ static void iommufd_cdev_unbind_and_disconnect(VFIODevice *vbasedev) iommufd_backend_disconnect(vbasedev->iommufd); } +static bool iommufd_hwpt_dirty_tracking(VFIOIOASHwpt *hwpt) +{ + return hwpt && hwpt->hwpt_flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; +} + static int iommufd_cdev_getfd(const char *sysfs_path, Error **errp) { long int ret = -ENOTTY; @@ -256,10 +261,22 @@ static int iommufd_cdev_autodomains_get(VFIODevice *vbasedev, } else { vbasedev->hwpt = hwpt; QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next); + vbasedev->iommu_dirty_tracking = iommufd_hwpt_dirty_tracking(hwpt); return 0; } } + /* + * This is quite early and VFIO Migration state isn't yet fully + * initialized, thus rely only on IOMMU hardware capabilities as to + * whether IOMMU dirty tracking is going to be requested. Later + * vfio_migration_realize() may decide to use VF dirty tracking + * instead. + */ + if (vbasedev->hiod->caps.hw_caps & IOMMU_HW_CAP_DIRTY_TRACKING) { + flags = IOMMU_HWPT_ALLOC_DIRTY_TRACKING; + } + if (!iommufd_backend_alloc_hwpt(iommufd, vbasedev->devid, container->ioas_id, flags, IOMMU_HWPT_DATA_NONE, 0, NULL, @@ -269,6 +286,7 @@ static int iommufd_cdev_autodomains_get(VFIODevice *vbasedev, hwpt = g_malloc0(sizeof(*hwpt)); hwpt->hwpt_id = hwpt_id; + hwpt->hwpt_flags = flags; QLIST_INIT(&hwpt->device_list); ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, errp); @@ -279,8 +297,16 @@ static int iommufd_cdev_autodomains_get(VFIODevice *vbasedev, } vbasedev->hwpt = hwpt; + vbasedev->iommu_dirty_tracking = iommufd_hwpt_dirty_tracking(hwpt); QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next); QLIST_INSERT_HEAD(&container->hwpt_list, hwpt, next); + container->bcontainer.dirty_pages_supported |= + vbasedev->iommu_dirty_tracking; + if (container->bcontainer.dirty_pages_supported && + !vbasedev->iommu_dirty_tracking) { + warn_report("IOMMU instance for device %s doesn't support dirty tracking", + vbasedev->name); + } return 0; } diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 63da291456..22a7386591 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -109,6 +109,7 @@ typedef struct IOMMUFDBackend IOMMUFDBackend; typedef struct VFIOIOASHwpt { uint32_t hwpt_id; + uint32_t hwpt_flags; QLIST_HEAD(, VFIODevice) device_list; QLIST_ENTRY(VFIOIOASHwpt) next; } VFIOIOASHwpt; @@ -148,6 +149,7 @@ typedef struct VFIODevice { OnOffAuto pre_copy_dirty_page_tracking; bool dirty_pages_supported; bool dirty_tracking; + bool iommu_dirty_tracking; HostIOMMUDevice *hiod; int devid; IOMMUFDBackend *iommufd; -- Gitee From 73b24be504fcd9b453a51e1f2fc8af64b092c586 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Mon, 22 Jul 2024 22:13:23 +0100 Subject: [PATCH 086/134] vfio/iommufd: Implement VFIOIOMMUClass::set_dirty_tracking support ioctl(iommufd, IOMMU_HWPT_SET_DIRTY_TRACKING, arg) is the UAPI that enables or disables dirty page tracking. The ioctl is used if the hwpt has been created with dirty tracking supported domain (stored in hwpt::flags) and it is called on the whole list of iommu domains. Signed-off-by: Joao Martins Reviewed-by: Zhenzhong Duan Reviewed-by: Eric Auger [Shameer: changed iommufd_set_dirty_page_tracking() declaration] Signed-off-by: Shameer Kolothum --- backends/iommufd.c | 23 +++++++++++++++++++++++ backends/trace-events | 1 + hw/vfio/iommufd.c | 34 ++++++++++++++++++++++++++++++++++ include/sysemu/iommufd.h | 2 ++ 4 files changed, 60 insertions(+) diff --git a/backends/iommufd.c b/backends/iommufd.c index 4aebf54765..785d3fbbad 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -254,6 +254,29 @@ bool iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id, return true; } +bool iommufd_backend_set_dirty_tracking(IOMMUFDBackend *be, + uint32_t hwpt_id, bool start, + Error **errp) +{ + int ret; + struct iommu_hwpt_set_dirty_tracking set_dirty = { + .size = sizeof(set_dirty), + .hwpt_id = hwpt_id, + .flags = start ? IOMMU_HWPT_DIRTY_TRACKING_ENABLE : 0, + }; + + ret = ioctl(be->fd, IOMMU_HWPT_SET_DIRTY_TRACKING, &set_dirty); + trace_iommufd_backend_set_dirty(be->fd, hwpt_id, start, ret ? errno : 0); + if (ret) { + error_setg_errno(errp, errno, + "IOMMU_HWPT_SET_DIRTY_TRACKING(hwpt_id %u) failed", + hwpt_id); + return false; + } + + return true; +} + bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, uint32_t *type, void *data, uint32_t len, uint64_t *caps, Error **errp) diff --git a/backends/trace-events b/backends/trace-events index e248bf039e..fe3297ca15 100644 --- a/backends/trace-events +++ b/backends/trace-events @@ -16,3 +16,4 @@ iommufd_backend_unmap_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t si iommufd_backend_alloc_ioas(int iommufd, uint32_t ioas, int ret) " iommufd=%d ioas=%d (%d)" iommufd_backend_alloc_hwpt(int iommufd, uint32_t dev_id, uint32_t pt_id, uint32_t flags, uint32_t hwpt_type, uint32_t len, uint64_t data_ptr, uint32_t out_hwpt_id, int ret) " iommufd=%d dev_id=%u pt_id=%u flags=0x%x hwpt_type=%u len=%u data_ptr=0x%"PRIx64" out_hwpt=%u (%d)" iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d id=%d (%d)" +iommufd_backend_set_dirty(int iommufd, uint32_t hwpt_id, bool start, int ret) " iommufd=%d hwpt=%u enable=%d (%d)" diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index a9400d8107..11e1392527 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -119,6 +119,39 @@ static bool iommufd_hwpt_dirty_tracking(VFIOIOASHwpt *hwpt) return hwpt && hwpt->hwpt_flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; } +static int iommufd_set_dirty_page_tracking(const VFIOContainerBase *bcontainer, + bool start) +{ + const VFIOIOMMUFDContainer *container = + container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); + VFIOIOASHwpt *hwpt; + + QLIST_FOREACH(hwpt, &container->hwpt_list, next) { + if (!iommufd_hwpt_dirty_tracking(hwpt)) { + continue; + } + + if (!iommufd_backend_set_dirty_tracking(container->be, + hwpt->hwpt_id, start, NULL)) { + error_report("Failed to set dirty tracking hwpt_id %u errno: %d", + hwpt->hwpt_id, errno); + goto err; + } + } + + return 0; + +err: + QLIST_FOREACH(hwpt, &container->hwpt_list, next) { + if (!iommufd_hwpt_dirty_tracking(hwpt)) { + continue; + } + iommufd_backend_set_dirty_tracking(container->be, + hwpt->hwpt_id, !start, NULL); + } + return -EINVAL; +} + static int iommufd_cdev_getfd(const char *sysfs_path, Error **errp) { long int ret = -ENOTTY; @@ -759,6 +792,7 @@ static void vfio_iommu_iommufd_class_init(ObjectClass *klass, void *data) vioc->attach_device = iommufd_cdev_attach; vioc->detach_device = iommufd_cdev_detach; vioc->pci_hot_reset = iommufd_cdev_pci_hot_reset; + vioc->set_dirty_page_tracking = iommufd_set_dirty_page_tracking; }; static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h index f6f01e4be8..4f1dbe827c 100644 --- a/include/sysemu/iommufd.h +++ b/include/sysemu/iommufd.h @@ -57,6 +57,8 @@ bool iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id, uint32_t data_type, uint32_t data_len, void *data_ptr, uint32_t *out_hwpt, Error **errp); +bool iommufd_backend_set_dirty_tracking(IOMMUFDBackend *be, uint32_t hwpt_id, + bool start, Error **errp); #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd" #endif -- Gitee From d09cb3d1907e3afbae9b3ea345c9973e207614bf Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Mon, 22 Jul 2024 22:13:24 +0100 Subject: [PATCH 087/134] vfio/iommufd: Implement VFIOIOMMUClass::query_dirty_bitmap support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ioctl(iommufd, IOMMU_HWPT_GET_DIRTY_BITMAP, arg) is the UAPI that fetches the bitmap that tells what was dirty in an IOVA range. A single bitmap is allocated and used across all the hwpts sharing an IOAS which is then used in log_sync() to set Qemu global bitmaps. Signed-off-by: Joao Martins Reviewed-by: Cédric Le Goater Reviewed-by: Eric Auger Reviewed-by: Zhenzhong Duan [Shameer: changed iommufd_query_dirty_bitmap() declaration] Signed-off-by: Shameer Kolothum --- backends/iommufd.c | 29 +++++++++++++++++++++++++++++ backends/trace-events | 1 + hw/vfio/iommufd.c | 32 ++++++++++++++++++++++++++++++++ include/sysemu/iommufd.h | 4 ++++ 4 files changed, 66 insertions(+) diff --git a/backends/iommufd.c b/backends/iommufd.c index 785d3fbbad..c1260766f0 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -277,6 +277,35 @@ bool iommufd_backend_set_dirty_tracking(IOMMUFDBackend *be, return true; } +bool iommufd_backend_get_dirty_bitmap(IOMMUFDBackend *be, + uint32_t hwpt_id, + uint64_t iova, ram_addr_t size, + uint64_t page_size, uint64_t *data, + Error **errp) +{ + int ret; + struct iommu_hwpt_get_dirty_bitmap get_dirty_bitmap = { + .size = sizeof(get_dirty_bitmap), + .hwpt_id = hwpt_id, + .iova = iova, + .length = size, + .page_size = page_size, + .data = (uintptr_t)data, + }; + + ret = ioctl(be->fd, IOMMU_HWPT_GET_DIRTY_BITMAP, &get_dirty_bitmap); + trace_iommufd_backend_get_dirty_bitmap(be->fd, hwpt_id, iova, size, + page_size, ret ? errno : 0); + if (ret) { + error_setg_errno(errp, errno, + "IOMMU_HWPT_GET_DIRTY_BITMAP (iova: 0x%"HWADDR_PRIx + " size: 0x"RAM_ADDR_FMT") failed", iova, size); + return false; + } + + return true; +} + bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, uint32_t *type, void *data, uint32_t len, uint64_t *caps, Error **errp) diff --git a/backends/trace-events b/backends/trace-events index fe3297ca15..b02433710a 100644 --- a/backends/trace-events +++ b/backends/trace-events @@ -17,3 +17,4 @@ iommufd_backend_alloc_ioas(int iommufd, uint32_t ioas, int ret) " iommufd=%d ioa iommufd_backend_alloc_hwpt(int iommufd, uint32_t dev_id, uint32_t pt_id, uint32_t flags, uint32_t hwpt_type, uint32_t len, uint64_t data_ptr, uint32_t out_hwpt_id, int ret) " iommufd=%d dev_id=%u pt_id=%u flags=0x%x hwpt_type=%u len=%u data_ptr=0x%"PRIx64" out_hwpt=%u (%d)" iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d id=%d (%d)" iommufd_backend_set_dirty(int iommufd, uint32_t hwpt_id, bool start, int ret) " iommufd=%d hwpt=%u enable=%d (%d)" +iommufd_backend_get_dirty_bitmap(int iommufd, uint32_t hwpt_id, uint64_t iova, uint64_t size, uint64_t page_size, int ret) " iommufd=%d hwpt=%u iova=0x%"PRIx64" size=0x%"PRIx64" page_size=0x%"PRIx64" (%d)" diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 11e1392527..3d4f902ae5 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -25,6 +25,7 @@ #include "qemu/cutils.h" #include "qemu/chardev_open.h" #include "pci.h" +#include "exec/ram_addr.h" static int iommufd_cdev_map(const VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, void *vaddr, bool readonly) @@ -152,6 +153,36 @@ err: return -EINVAL; } +static int iommufd_query_dirty_bitmap(const VFIOContainerBase *bcontainer, + VFIOBitmap *vbmap, hwaddr iova, + hwaddr size) +{ + VFIOIOMMUFDContainer *container = container_of(bcontainer, + VFIOIOMMUFDContainer, + bcontainer); + unsigned long page_size = qemu_real_host_page_size(); + VFIOIOASHwpt *hwpt; + + QLIST_FOREACH(hwpt, &container->hwpt_list, next) { + if (!iommufd_hwpt_dirty_tracking(hwpt)) { + continue; + } + + if (!iommufd_backend_get_dirty_bitmap(container->be, hwpt->hwpt_id, + iova, size, page_size, + (uint64_t *)vbmap->bitmap, + NULL)) { + error_report("Failed to get dirty bitmap for iova: 0x%"PRIx64 + " size: 0x%"PRIx64" err: %d", (uint64_t)iova, + (uint64_t)size, errno); + + return -EINVAL; + } + } + + return 0; +} + static int iommufd_cdev_getfd(const char *sysfs_path, Error **errp) { long int ret = -ENOTTY; @@ -793,6 +824,7 @@ static void vfio_iommu_iommufd_class_init(ObjectClass *klass, void *data) vioc->detach_device = iommufd_cdev_detach; vioc->pci_hot_reset = iommufd_cdev_pci_hot_reset; vioc->set_dirty_page_tracking = iommufd_set_dirty_page_tracking; + vioc->query_dirty_bitmap = iommufd_query_dirty_bitmap; }; static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h index 4f1dbe827c..3b28c8a81c 100644 --- a/include/sysemu/iommufd.h +++ b/include/sysemu/iommufd.h @@ -59,6 +59,10 @@ bool iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id, Error **errp); bool iommufd_backend_set_dirty_tracking(IOMMUFDBackend *be, uint32_t hwpt_id, bool start, Error **errp); +bool iommufd_backend_get_dirty_bitmap(IOMMUFDBackend *be, uint32_t hwpt_id, + uint64_t iova, ram_addr_t size, + uint64_t page_size, uint64_t *data, + Error **errp); #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd" #endif -- Gitee From 6eab0b4a0c79d53250da601da25e2813177d44fe Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Mon, 22 Jul 2024 22:13:25 +0100 Subject: [PATCH 088/134] vfio/migration: Don't block migration device dirty tracking is unsupported MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit By default VFIO migration is set to auto, which will support live migration if the migration capability is set *and* also dirty page tracking is supported. For testing purposes one can force enable without dirty page tracking via enable-migration=on, but that option is generally left for testing purposes. So starting with IOMMU dirty tracking it can use to accommodate the lack of VF dirty page tracking allowing us to minimize the VF requirements for migration and thus enabling migration by default for those too. While at it change the error messages to mention IOMMU dirty tracking as well. Signed-off-by: Joao Martins Reviewed-by: Zhenzhong Duan Reviewed-by: Eric Auger [ clg: - spelling in commit log ] Signed-off-by: Cédric Le Goater --- hw/vfio/migration.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index 28d422b39f..db128204af 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -945,16 +945,16 @@ bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp) return !vfio_block_migration(vbasedev, err, errp); } - if (!vbasedev->dirty_pages_supported) { + if (!vbasedev->dirty_pages_supported && !vbasedev->iommu_dirty_tracking) { if (vbasedev->enable_migration == ON_OFF_AUTO_AUTO) { error_setg(&err, - "%s: VFIO device doesn't support device dirty tracking", - vbasedev->name); + "%s: VFIO device doesn't support device and " + "IOMMU dirty tracking", vbasedev->name); goto add_blocker; } - warn_report("%s: VFIO device doesn't support device dirty tracking", - vbasedev->name); + warn_report("%s: VFIO device doesn't support device and " + "IOMMU dirty tracking", vbasedev->name); } ret = vfio_block_multiple_devices_migration(vbasedev, errp); -- Gitee From b0fe5a6794c5403f4ab9859ec2ced338246690bd Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Mon, 22 Jul 2024 22:13:26 +0100 Subject: [PATCH 089/134] vfio/common: Allow disabling device dirty page tracking The property 'x-pre-copy-dirty-page-tracking' allows disabling the whole tracking of VF pre-copy phase of dirty page tracking, though it means that it will only be used at the start of the switchover phase. Add an option that disables the VF dirty page tracking, and fall back into container-based dirty page tracking. This also allows to use IOMMU dirty tracking even on VFs with their own dirty tracker scheme. Signed-off-by: Joao Martins Reviewed-by: Zhenzhong Duan --- hw/vfio/common.c | 3 +++ hw/vfio/migration.c | 4 +++- hw/vfio/pci.c | 3 +++ include/hw/vfio/vfio-common.h | 1 + 4 files changed, 10 insertions(+), 1 deletion(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 65e1c9f810..a8bc1c6055 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -208,6 +208,9 @@ bool vfio_devices_all_device_dirty_tracking(const VFIOContainerBase *bcontainer) VFIODevice *vbasedev; QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { + if (vbasedev->device_dirty_page_tracking == ON_OFF_AUTO_OFF) { + return false; + } if (!vbasedev->dirty_pages_supported) { return false; } diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index db128204af..3924beb289 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -945,7 +945,9 @@ bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp) return !vfio_block_migration(vbasedev, err, errp); } - if (!vbasedev->dirty_pages_supported && !vbasedev->iommu_dirty_tracking) { + if ((!vbasedev->dirty_pages_supported || + vbasedev->device_dirty_page_tracking == ON_OFF_AUTO_OFF) && + !vbasedev->iommu_dirty_tracking) { if (vbasedev->enable_migration == ON_OFF_AUTO_AUTO) { error_setg(&err, "%s: VFIO device doesn't support device and " diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 19211f4368..f585f285f4 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -3350,6 +3350,9 @@ static Property vfio_pci_dev_properties[] = { DEFINE_PROP_ON_OFF_AUTO("x-pre-copy-dirty-page-tracking", VFIOPCIDevice, vbasedev.pre_copy_dirty_page_tracking, ON_OFF_AUTO_ON), + DEFINE_PROP_ON_OFF_AUTO("x-device-dirty-page-tracking", VFIOPCIDevice, + vbasedev.device_dirty_page_tracking, + ON_OFF_AUTO_ON), DEFINE_PROP_ON_OFF_AUTO("display", VFIOPCIDevice, display, ON_OFF_AUTO_OFF), DEFINE_PROP_UINT32("xres", VFIOPCIDevice, display_xres, 0), diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 22a7386591..abae8655c4 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -147,6 +147,7 @@ typedef struct VFIODevice { VFIOMigration *migration; Error *migration_blocker; OnOffAuto pre_copy_dirty_page_tracking; + OnOffAuto device_dirty_page_tracking; bool dirty_pages_supported; bool dirty_tracking; bool iommu_dirty_tracking; -- Gitee From ac715e361fdb6d92169b3b3f5964405c816a13ac Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Tue, 14 Jan 2025 10:29:24 +0000 Subject: [PATCH 090/134] Update iommufd.h header for vSVA This is based on Linaro UADK branch: https://github.com/Linaro/linux-kernel-uadk/tree/6.12-wip-10.26 Signed-off-by: Shameer Kolothum --- linux-headers/linux/iommufd.h | 394 ++++++++++++++++++++++++++++++++-- 1 file changed, 371 insertions(+), 23 deletions(-) diff --git a/linux-headers/linux/iommufd.h b/linux-headers/linux/iommufd.h index 806d98d09c..41559c6064 100644 --- a/linux-headers/linux/iommufd.h +++ b/linux-headers/linux/iommufd.h @@ -37,18 +37,22 @@ enum { IOMMUFD_CMD_BASE = 0x80, IOMMUFD_CMD_DESTROY = IOMMUFD_CMD_BASE, - IOMMUFD_CMD_IOAS_ALLOC, - IOMMUFD_CMD_IOAS_ALLOW_IOVAS, - IOMMUFD_CMD_IOAS_COPY, - IOMMUFD_CMD_IOAS_IOVA_RANGES, - IOMMUFD_CMD_IOAS_MAP, - IOMMUFD_CMD_IOAS_UNMAP, - IOMMUFD_CMD_OPTION, - IOMMUFD_CMD_VFIO_IOAS, - IOMMUFD_CMD_HWPT_ALLOC, - IOMMUFD_CMD_GET_HW_INFO, - IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING, - IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP, + IOMMUFD_CMD_IOAS_ALLOC = 0x81, + IOMMUFD_CMD_IOAS_ALLOW_IOVAS = 0x82, + IOMMUFD_CMD_IOAS_COPY = 0x83, + IOMMUFD_CMD_IOAS_IOVA_RANGES = 0x84, + IOMMUFD_CMD_IOAS_MAP = 0x85, + IOMMUFD_CMD_IOAS_UNMAP = 0x86, + IOMMUFD_CMD_OPTION = 0x87, + IOMMUFD_CMD_VFIO_IOAS = 0x88, + IOMMUFD_CMD_HWPT_ALLOC = 0x89, + IOMMUFD_CMD_GET_HW_INFO = 0x8a, + IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING = 0x8b, + IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP = 0x8c, + IOMMUFD_CMD_HWPT_INVALIDATE = 0x8d, + IOMMUFD_CMD_FAULT_QUEUE_ALLOC = 0x8e, + IOMMUFD_CMD_VIOMMU_ALLOC = 0x8f, + IOMMUFD_CMD_VDEVICE_ALLOC = 0x90, }; /** @@ -355,10 +359,13 @@ struct iommu_vfio_ioas { * the parent HWPT in a nesting configuration. * @IOMMU_HWPT_ALLOC_DIRTY_TRACKING: Dirty tracking support for device IOMMU is * enforced on device attachment + * @IOMMU_HWPT_FAULT_ID_VALID: The fault_id field of hwpt allocation data is + * valid. */ enum iommufd_hwpt_alloc_flags { IOMMU_HWPT_ALLOC_NEST_PARENT = 1 << 0, IOMMU_HWPT_ALLOC_DIRTY_TRACKING = 1 << 1, + IOMMU_HWPT_FAULT_ID_VALID = 1 << 2, }; /** @@ -389,14 +396,34 @@ struct iommu_hwpt_vtd_s1 { __u32 __reserved; }; +/** + * struct iommu_hwpt_arm_smmuv3 - ARM SMMUv3 Context Descriptor Table info + * (IOMMU_HWPT_DATA_ARM_SMMUV3) + * + * @ste: The first two double words of the user space Stream Table Entry for + * a user stage-1 Context Descriptor Table. Must be little-endian. + * Allowed fields: (Refer to "5.2 Stream Table Entry" in SMMUv3 HW Spec) + * - word-0: V, Cfg, S1Fmt, S1ContextPtr, S1CDMax + * - word-1: EATS, S1DSS, S1CIR, S1COR, S1CSH, S1STALLD + * + * -EIO will be returned if @ste is not legal or contains any non-allowed field. + * Cfg can be used to select a S1, Bypass or Abort configuration. A Bypass + * nested domain will translate the same as the nesting parent. + */ +struct iommu_hwpt_arm_smmuv3 { + __aligned_le64 ste[2]; +}; + /** * enum iommu_hwpt_data_type - IOMMU HWPT Data Type * @IOMMU_HWPT_DATA_NONE: no data * @IOMMU_HWPT_DATA_VTD_S1: Intel VT-d stage-1 page table + * @IOMMU_HWPT_DATA_ARM_SMMUV3: ARM SMMUv3 Context Descriptor Table */ enum iommu_hwpt_data_type { - IOMMU_HWPT_DATA_NONE, - IOMMU_HWPT_DATA_VTD_S1, + IOMMU_HWPT_DATA_NONE = 0, + IOMMU_HWPT_DATA_VTD_S1 = 1, + IOMMU_HWPT_DATA_ARM_SMMUV3 = 2, }; /** @@ -404,12 +431,15 @@ enum iommu_hwpt_data_type { * @size: sizeof(struct iommu_hwpt_alloc) * @flags: Combination of enum iommufd_hwpt_alloc_flags * @dev_id: The device to allocate this HWPT for - * @pt_id: The IOAS or HWPT to connect this HWPT to + * @pt_id: The IOAS or HWPT or vIOMMU to connect this HWPT to * @out_hwpt_id: The ID of the new HWPT * @__reserved: Must be 0 * @data_type: One of enum iommu_hwpt_data_type * @data_len: Length of the type specific data * @data_uptr: User pointer to the type specific data + * @fault_id: The ID of IOMMUFD_FAULT object. Valid only if flags field of + * IOMMU_HWPT_FAULT_ID_VALID is set. + * @__reserved2: Padding to 64-bit alignment. Must be 0. * * Explicitly allocate a hardware page table object. This is the same object * type that is returned by iommufd_device_attach() and represents the @@ -420,11 +450,13 @@ enum iommu_hwpt_data_type { * IOMMU_HWPT_DATA_NONE. The HWPT can be allocated as a parent HWPT for a * nesting configuration by passing IOMMU_HWPT_ALLOC_NEST_PARENT via @flags. * - * A user-managed nested HWPT will be created from a given parent HWPT via - * @pt_id, in which the parent HWPT must be allocated previously via the - * same ioctl from a given IOAS (@pt_id). In this case, the @data_type - * must be set to a pre-defined type corresponding to an I/O page table - * type supported by the underlying IOMMU hardware. + * A user-managed nested HWPT will be created from a given vIOMMU (wrapping a + * parent HWPT) or a parent HWPT via @pt_id, in which the parent HWPT must be + * allocated previously via the same ioctl from a given IOAS (@pt_id). In this + * case, the @data_type must be set to a pre-defined type corresponding to an + * I/O page table type supported by the underlying IOMMU hardware. The device + * via @dev_id and the vIOMMU via @pt_id must be associated to the same IOMMU + * instance. * * If the @data_type is set to IOMMU_HWPT_DATA_NONE, @data_len and * @data_uptr should be zero. Otherwise, both @data_len and @data_uptr @@ -440,6 +472,8 @@ struct iommu_hwpt_alloc { __u32 data_type; __u32 data_len; __aligned_u64 data_uptr; + __u32 fault_id; + __u32 __reserved2; }; #define IOMMU_HWPT_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_ALLOC) @@ -474,15 +508,50 @@ struct iommu_hw_info_vtd { __aligned_u64 ecap_reg; }; +/** + * struct iommu_hw_info_arm_smmuv3 - ARM SMMUv3 hardware information + * (IOMMU_HW_INFO_TYPE_ARM_SMMUV3) + * + * @flags: Must be set to 0 + * @__reserved: Must be 0 + * @idr: Implemented features for ARM SMMU Non-secure programming interface + * @iidr: Information about the implementation and implementer of ARM SMMU, + * and architecture version supported + * @aidr: ARM SMMU architecture version + * + * For the details of @idr, @iidr and @aidr, please refer to the chapters + * from 6.3.1 to 6.3.6 in the SMMUv3 Spec. + * + * User space should read the underlying ARM SMMUv3 hardware information for + * the list of supported features. + * + * Note that these values reflect the raw HW capability, without any insight if + * any required kernel driver support is present. Bits may be set indicating the + * HW has functionality that is lacking kernel software support, such as BTM. If + * a VMM is using this information to construct emulated copies of these + * registers it should only forward bits that it knows it can support. + * + * In future, presence of required kernel support will be indicated in flags. + */ +struct iommu_hw_info_arm_smmuv3 { + __u32 flags; + __u32 __reserved; + __u32 idr[6]; + __u32 iidr; + __u32 aidr; +}; + /** * enum iommu_hw_info_type - IOMMU Hardware Info Types * @IOMMU_HW_INFO_TYPE_NONE: Used by the drivers that do not report hardware * info * @IOMMU_HW_INFO_TYPE_INTEL_VTD: Intel VT-d iommu info type + * @IOMMU_HW_INFO_TYPE_ARM_SMMUV3: ARM SMMUv3 iommu info type */ enum iommu_hw_info_type { - IOMMU_HW_INFO_TYPE_NONE, - IOMMU_HW_INFO_TYPE_INTEL_VTD, + IOMMU_HW_INFO_TYPE_NONE = 0, + IOMMU_HW_INFO_TYPE_INTEL_VTD = 1, + IOMMU_HW_INFO_TYPE_ARM_SMMUV3 = 2, }; /** @@ -494,9 +563,17 @@ enum iommu_hw_info_type { * IOMMU_HWPT_GET_DIRTY_BITMAP * IOMMU_HWPT_SET_DIRTY_TRACKING * + * @IOMMU_HW_CAP_PASID_EXEC: Execute Permission Supported, user ignores it + * when the struct iommu_hw_info::out_max_pasid_log2 + * is zero. + * @IOMMU_HW_CAP_PASID_PRIV: Privileged Mode Supported, user ignores it + * when the struct iommu_hw_info::out_max_pasid_log2 + * is zero. */ enum iommufd_hw_capabilities { IOMMU_HW_CAP_DIRTY_TRACKING = 1 << 0, + IOMMU_HW_CAP_PCI_PASID_EXEC = 1 << 1, + IOMMU_HW_CAP_PCI_PASID_PRIV = 1 << 2, }; /** @@ -512,6 +589,9 @@ enum iommufd_hw_capabilities { * iommu_hw_info_type. * @out_capabilities: Output the generic iommu capability info type as defined * in the enum iommu_hw_capabilities. + * @out_max_pasid_log2: Output the width of PASIDs. 0 means no PASID support. + * PCI devices turn to out_capabilities to check if the + * specific capabilities is supported or not. * @__reserved: Must be 0 * * Query an iommu type specific hardware information data from an iommu behind @@ -535,7 +615,8 @@ struct iommu_hw_info { __u32 data_len; __aligned_u64 data_uptr; __u32 out_data_type; - __u32 __reserved; + __u8 out_max_pasid_log2; + __u8 __reserved[3]; __aligned_u64 out_capabilities; }; #define IOMMU_GET_HW_INFO _IO(IOMMUFD_TYPE, IOMMUFD_CMD_GET_HW_INFO) @@ -613,4 +694,271 @@ struct iommu_hwpt_get_dirty_bitmap { #define IOMMU_HWPT_GET_DIRTY_BITMAP _IO(IOMMUFD_TYPE, \ IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP) +/** + * enum iommu_hwpt_invalidate_data_type - IOMMU HWPT Cache Invalidation + * Data Type + * @IOMMU_HWPT_INVALIDATE_DATA_VTD_S1: Invalidation data for VTD_S1 + * @IOMMU_VIOMMU_INVALIDATE_DATA_ARM_SMMUV3: Invalidation data for ARM SMMUv3 + */ +enum iommu_hwpt_invalidate_data_type { + IOMMU_HWPT_INVALIDATE_DATA_VTD_S1 = 0, + IOMMU_VIOMMU_INVALIDATE_DATA_ARM_SMMUV3 = 1, +}; + +/** + * enum iommu_hwpt_vtd_s1_invalidate_flags - Flags for Intel VT-d + * stage-1 cache invalidation + * @IOMMU_VTD_INV_FLAGS_LEAF: Indicates whether the invalidation applies + * to all-levels page structure cache or just + * the leaf PTE cache. + */ +enum iommu_hwpt_vtd_s1_invalidate_flags { + IOMMU_VTD_INV_FLAGS_LEAF = 1 << 0, +}; + +/** + * struct iommu_hwpt_vtd_s1_invalidate - Intel VT-d cache invalidation + * (IOMMU_HWPT_INVALIDATE_DATA_VTD_S1) + * @addr: The start address of the range to be invalidated. It needs to + * be 4KB aligned. + * @npages: Number of contiguous 4K pages to be invalidated. + * @flags: Combination of enum iommu_hwpt_vtd_s1_invalidate_flags + * @__reserved: Must be 0 + * + * The Intel VT-d specific invalidation data for user-managed stage-1 cache + * invalidation in nested translation. Userspace uses this structure to + * tell the impacted cache scope after modifying the stage-1 page table. + * + * Invalidating all the caches related to the page table by setting @addr + * to be 0 and @npages to be U64_MAX. + * + * The device TLB will be invalidated automatically if ATS is enabled. + */ +struct iommu_hwpt_vtd_s1_invalidate { + __aligned_u64 addr; + __aligned_u64 npages; + __u32 flags; + __u32 __reserved; +}; + +/** + * struct iommu_viommu_arm_smmuv3_invalidate - ARM SMMUv3 cahce invalidation + * (IOMMU_VIOMMU_INVALIDATE_DATA_ARM_SMMUV3) + * @cmd: 128-bit cache invalidation command that runs in SMMU CMDQ. + * Must be little-endian. + * + * Supported command list only when passing in a vIOMMU via @hwpt_id: + * CMDQ_OP_TLBI_NSNH_ALL + * CMDQ_OP_TLBI_NH_VA + * CMDQ_OP_TLBI_NH_VAA + * CMDQ_OP_TLBI_NH_ALL + * CMDQ_OP_TLBI_NH_ASID + * CMDQ_OP_ATC_INV + * CMDQ_OP_CFGI_CD + * CMDQ_OP_CFGI_CD_ALL + * + * -EIO will be returned if the command is not supported. + */ +struct iommu_viommu_arm_smmuv3_invalidate { + __aligned_le64 cmd[2]; +}; + +/** + * struct iommu_hwpt_invalidate - ioctl(IOMMU_HWPT_INVALIDATE) + * @size: sizeof(struct iommu_hwpt_invalidate) + * @hwpt_id: ID of a nested HWPT or a vIOMMU, for cache invalidation + * @data_uptr: User pointer to an array of driver-specific cache invalidation + * data. + * @data_type: One of enum iommu_hwpt_invalidate_data_type, defining the data + * type of all the entries in the invalidation request array. It + * should be a type supported by the hwpt pointed by @hwpt_id. + * @entry_len: Length (in bytes) of a request entry in the request array + * @entry_num: Input the number of cache invalidation requests in the array. + * Output the number of requests successfully handled by kernel. + * @__reserved: Must be 0. + * + * Invalidate iommu cache for user-managed page table or vIOMMU. Modifications + * on a user-managed page table should be followed by this operation, if a HWPT + * is passed in via @hwpt_id. Other caches, such as device cache or descriptor + * cache can be flushed if a vIOMMU is passed in via the @hwpt_id field. + * + * Each ioctl can support one or more cache invalidation requests in the array + * that has a total size of @entry_len * @entry_num. + * + * An empty invalidation request array by setting @entry_num==0 is allowed, and + * @entry_len and @data_uptr would be ignored in this case. This can be used to + * check if the given @data_type is supported or not by kernel. + */ +struct iommu_hwpt_invalidate { + __u32 size; + __u32 hwpt_id; + __aligned_u64 data_uptr; + __u32 data_type; + __u32 entry_len; + __u32 entry_num; + __u32 __reserved; +}; +#define IOMMU_HWPT_INVALIDATE _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_INVALIDATE) + +/** + * enum iommu_hwpt_pgfault_flags - flags for struct iommu_hwpt_pgfault + * @IOMMU_PGFAULT_FLAGS_PASID_VALID: The pasid field of the fault data is + * valid. + * @IOMMU_PGFAULT_FLAGS_LAST_PAGE: It's the last fault of a fault group. + */ +enum iommu_hwpt_pgfault_flags { + IOMMU_PGFAULT_FLAGS_PASID_VALID = (1 << 0), + IOMMU_PGFAULT_FLAGS_LAST_PAGE = (1 << 1), +}; + +/** + * enum iommu_hwpt_pgfault_perm - perm bits for struct iommu_hwpt_pgfault + * @IOMMU_PGFAULT_PERM_READ: request for read permission + * @IOMMU_PGFAULT_PERM_WRITE: request for write permission + * @IOMMU_PGFAULT_PERM_EXEC: (PCIE 10.4.1) request with a PASID that has the + * Execute Requested bit set in PASID TLP Prefix. + * @IOMMU_PGFAULT_PERM_PRIV: (PCIE 10.4.1) request with a PASID that has the + * Privileged Mode Requested bit set in PASID TLP + * Prefix. + */ +enum iommu_hwpt_pgfault_perm { + IOMMU_PGFAULT_PERM_READ = (1 << 0), + IOMMU_PGFAULT_PERM_WRITE = (1 << 1), + IOMMU_PGFAULT_PERM_EXEC = (1 << 2), + IOMMU_PGFAULT_PERM_PRIV = (1 << 3), +}; + +/** + * struct iommu_hwpt_pgfault - iommu page fault data + * @flags: Combination of enum iommu_hwpt_pgfault_flags + * @dev_id: id of the originated device + * @pasid: Process Address Space ID + * @grpid: Page Request Group Index + * @perm: Combination of enum iommu_hwpt_pgfault_perm + * @addr: Fault address + * @length: a hint of how much data the requestor is expecting to fetch. For + * example, if the PRI initiator knows it is going to do a 10MB + * transfer, it could fill in 10MB and the OS could pre-fault in + * 10MB of IOVA. It's default to 0 if there's no such hint. + * @cookie: kernel-managed cookie identifying a group of fault messages. The + * cookie number encoded in the last page fault of the group should + * be echoed back in the response message. + */ +struct iommu_hwpt_pgfault { + __u32 flags; + __u32 dev_id; + __u32 pasid; + __u32 grpid; + __u32 perm; + __u64 addr; + __u32 length; + __u32 cookie; +}; + +/** + * enum iommufd_page_response_code - Return status of fault handlers + * @IOMMUFD_PAGE_RESP_SUCCESS: Fault has been handled and the page tables + * populated, retry the access. This is the + * "Success" defined in PCI 10.4.2.1. + * @IOMMUFD_PAGE_RESP_INVALID: Could not handle this fault, don't retry the + * access. This is the "Invalid Request" in PCI + * 10.4.2.1. + */ +enum iommufd_page_response_code { + IOMMUFD_PAGE_RESP_SUCCESS = 0, + IOMMUFD_PAGE_RESP_INVALID = 1, +}; + +/** + * struct iommu_hwpt_page_response - IOMMU page fault response + * @cookie: The kernel-managed cookie reported in the fault message. + * @code: One of response code in enum iommufd_page_response_code. + */ +struct iommu_hwpt_page_response { + __u32 cookie; + __u32 code; +}; + +/** + * struct iommu_fault_alloc - ioctl(IOMMU_FAULT_QUEUE_ALLOC) + * @size: sizeof(struct iommu_fault_alloc) + * @flags: Must be 0 + * @out_fault_id: The ID of the new FAULT + * @out_fault_fd: The fd of the new FAULT + * + * Explicitly allocate a fault handling object. + */ +struct iommu_fault_alloc { + __u32 size; + __u32 flags; + __u32 out_fault_id; + __u32 out_fault_fd; +}; +#define IOMMU_FAULT_QUEUE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_FAULT_QUEUE_ALLOC) + +/** + * enum iommu_viommu_type - Virtual IOMMU Type + * @IOMMU_VIOMMU_TYPE_DEFAULT: Reserved for future use + * @IOMMU_VIOMMU_TYPE_ARM_SMMUV3: ARM SMMUv3 driver specific type + */ +enum iommu_viommu_type { + IOMMU_VIOMMU_TYPE_DEFAULT = 0, + IOMMU_VIOMMU_TYPE_ARM_SMMUV3 = 1, +}; + +/** + * struct iommu_viommu_alloc - ioctl(IOMMU_VIOMMU_ALLOC) + * @size: sizeof(struct iommu_viommu_alloc) + * @flags: Must be 0 + * @type: Type of the virtual IOMMU. Must be defined in enum iommu_viommu_type + * @dev_id: The device's physical IOMMU will be used to back the virtual IOMMU + * @hwpt_id: ID of a nesting parent HWPT to associate to + * @out_viommu_id: Output virtual IOMMU ID for the allocated object + * + * Allocate a virtual IOMMU object, representing the underlying physical IOMMU's + * virtualization support that is a security-isolated slice of the real IOMMU HW + * that is unique to a specific VM. Operations global to the IOMMU are connected + * to the vIOMMU, such as: + * - Security namespace for guest owned ID, e.g. guest-controlled cache tags + * - Access to a sharable nesting parent pagetable across physical IOMMUs + * - Non-affiliated event reporting (e.g. an invalidation queue error) + * - Virtualization of various platforms IDs, e.g. RIDs and others + * - Delivery of paravirtualized invalidation + * - Direct assigned invalidation queues + * - Direct assigned interrupts + */ +struct iommu_viommu_alloc { + __u32 size; + __u32 flags; + __u32 type; + __u32 dev_id; + __u32 hwpt_id; + __u32 out_viommu_id; +}; +#define IOMMU_VIOMMU_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VIOMMU_ALLOC) + +/** + * struct iommu_vdevice_alloc - ioctl(IOMMU_VDEVICE_ALLOC) + * @size: sizeof(struct iommu_vdevice_alloc) + * @viommu_id: vIOMMU ID to associate with the virtual device + * @dev_id: The pyhsical device to allocate a virtual instance on the vIOMMU + * @__reserved: Must be 0 + * @virt_id: Virtual device ID per vIOMMU, e.g. vSID of ARM SMMUv3, vDeviceID + * of AMD IOMMU, and vID of a nested Intel VT-d to a Context Table. + * @out_vdevice_id: Output virtual instance ID for the allocated object + * @__reserved2: Must be 0 + * + * Allocate a virtual device instance (for a physical device) against a vIOMMU. + * This instance holds the device's information (related to its vIOMMU) in a VM. + */ +struct iommu_vdevice_alloc { + __u32 size; + __u32 viommu_id; + __u32 dev_id; + __u32 __reserved; + __aligned_u64 virt_id; + __u32 out_vdevice_id; + __u32 __reserved2; +}; +#define IOMMU_VDEVICE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VDEVICE_ALLOC) #endif -- Gitee From cedca4d3635cde049151b5818df2cb66c2b1531f Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Fri, 3 Nov 2023 16:54:01 +0800 Subject: [PATCH 091/134] backends/iommufd: Add helpers for invalidating user-managed HWPT Signed-off-by: Nicolin Chen Signed-off-by: Zhenzhong Duan --- backends/iommufd.c | 30 ++++++++++++++++++++++++++++++ backends/trace-events | 1 + include/sysemu/iommufd.h | 3 +++ 3 files changed, 34 insertions(+) diff --git a/backends/iommufd.c b/backends/iommufd.c index c1260766f0..cf24370385 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -330,6 +330,36 @@ bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, return true; } +int iommufd_backend_invalidate_cache(IOMMUFDBackend *be, uint32_t hwpt_id, + uint32_t data_type, uint32_t entry_len, + uint32_t *entry_num, void *data_ptr) +{ + int ret, fd = be->fd; + struct iommu_hwpt_invalidate cache = { + .size = sizeof(cache), + .hwpt_id = hwpt_id, + .data_type = data_type, + .entry_len = entry_len, + .entry_num = *entry_num, + .data_uptr = (uintptr_t)data_ptr, + }; + + ret = ioctl(fd, IOMMU_HWPT_INVALIDATE, &cache); + + trace_iommufd_backend_invalidate_cache(fd, hwpt_id, data_type, entry_len, + *entry_num, cache.entry_num, + (uintptr_t)data_ptr, ret); + if (ret) { + *entry_num = cache.entry_num; + error_report("IOMMU_HWPT_INVALIDATE failed: %s", strerror(errno)); + ret = -errno; + } else { + g_assert(*entry_num == cache.entry_num); + } + + return ret; +} + static int hiod_iommufd_get_cap(HostIOMMUDevice *hiod, int cap, Error **errp) { HostIOMMUDeviceCaps *caps = &hiod->caps; diff --git a/backends/trace-events b/backends/trace-events index b02433710a..ef0ff98921 100644 --- a/backends/trace-events +++ b/backends/trace-events @@ -18,3 +18,4 @@ iommufd_backend_alloc_hwpt(int iommufd, uint32_t dev_id, uint32_t pt_id, uint32_ iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d id=%d (%d)" iommufd_backend_set_dirty(int iommufd, uint32_t hwpt_id, bool start, int ret) " iommufd=%d hwpt=%u enable=%d (%d)" iommufd_backend_get_dirty_bitmap(int iommufd, uint32_t hwpt_id, uint64_t iova, uint64_t size, uint64_t page_size, int ret) " iommufd=%d hwpt=%u iova=0x%"PRIx64" size=0x%"PRIx64" page_size=0x%"PRIx64" (%d)" +iommufd_backend_invalidate_cache(int iommufd, uint32_t hwpt_id, uint32_t data_type, uint32_t entry_len, uint32_t entry_num, uint32_t done_num, uint64_t data_ptr, int ret) " iommufd=%d hwpt_id=%u data_type=%u entry_len=%u entry_num=%u done_num=%u data_ptr=0x%"PRIx64" (%d)" diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h index 3b28c8a81c..f6596f6338 100644 --- a/include/sysemu/iommufd.h +++ b/include/sysemu/iommufd.h @@ -63,6 +63,9 @@ bool iommufd_backend_get_dirty_bitmap(IOMMUFDBackend *be, uint32_t hwpt_id, uint64_t iova, ram_addr_t size, uint64_t page_size, uint64_t *data, Error **errp); +int iommufd_backend_invalidate_cache(IOMMUFDBackend *be, uint32_t hwpt_id, + uint32_t data_type, uint32_t entry_len, + uint32_t *entry_num, void *data_ptr); #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd" #endif -- Gitee From 0e0956cb785f868dfe48201fcdead71dbdd234b0 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Mon, 15 Jan 2024 15:05:19 +0800 Subject: [PATCH 092/134] vfio/iommufd: Add properties and handlers to TYPE_HOST_IOMMU_DEVICE_IOMMUFD New added properties include IOMMUFD handle and devid, ioas. IOMMUFD handle and devid are used to allocate/free ioas, hwpt. ioas is used to re-attach IOMMUFD backed device to its default ioas id, i.e., when vIOMMU is disabled by guest. These properties are initialized in .realize() handler. New added handlers include [at|de]tach_hwpt. They are used to attaching/detaching hwpt. VFIO and VDPA have different way to attach and detach, so implementation will be in sub-class instead of HostIOMMUDeviceIOMMUFD. Add two wrappers host_iommu_device_iommufd_[at|de]tach_hwpt to wrap the two handlers. This is a prerequisite patch for following ones. Signed-off-by: Zhenzhong Duan --- backends/iommufd.c | 22 ++++++++++++++++++ include/sysemu/iommufd.h | 50 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+) diff --git a/backends/iommufd.c b/backends/iommufd.c index cf24370385..c10aa9b011 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -360,6 +360,26 @@ int iommufd_backend_invalidate_cache(IOMMUFDBackend *be, uint32_t hwpt_id, return ret; } +bool host_iommu_device_iommufd_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + uint32_t hwpt_id, Error **errp) +{ + HostIOMMUDeviceIOMMUFDClass *idevc = + HOST_IOMMU_DEVICE_IOMMUFD_GET_CLASS(idev); + + g_assert(idevc->attach_hwpt); + return idevc->attach_hwpt(idev, hwpt_id, errp); +} + +bool host_iommu_device_iommufd_detach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + Error **errp) +{ + HostIOMMUDeviceIOMMUFDClass *idevc = + HOST_IOMMU_DEVICE_IOMMUFD_GET_CLASS(idev); + + g_assert(idevc->detach_hwpt); + return idevc->detach_hwpt(idev, errp); +} + static int hiod_iommufd_get_cap(HostIOMMUDevice *hiod, int cap, Error **errp) { HostIOMMUDeviceCaps *caps = &hiod->caps; @@ -398,6 +418,8 @@ static const TypeInfo types[] = { }, { .name = TYPE_HOST_IOMMU_DEVICE_IOMMUFD, .parent = TYPE_HOST_IOMMU_DEVICE, + .instance_size = sizeof(HostIOMMUDeviceIOMMUFD), + .class_size = sizeof(HostIOMMUDeviceIOMMUFDClass), .class_init = hiod_iommufd_class_init, .abstract = true, } diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h index f6596f6338..3dc6934144 100644 --- a/include/sysemu/iommufd.h +++ b/include/sysemu/iommufd.h @@ -68,4 +68,54 @@ int iommufd_backend_invalidate_cache(IOMMUFDBackend *be, uint32_t hwpt_id, uint32_t *entry_num, void *data_ptr); #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd" +OBJECT_DECLARE_TYPE(HostIOMMUDeviceIOMMUFD, HostIOMMUDeviceIOMMUFDClass, + HOST_IOMMU_DEVICE_IOMMUFD) + +/* Abstract of host IOMMU device with iommufd backend */ +struct HostIOMMUDeviceIOMMUFD { + HostIOMMUDevice parent_obj; + + IOMMUFDBackend *iommufd; + uint32_t devid; + uint32_t ioas_id; +}; + +struct HostIOMMUDeviceIOMMUFDClass { + HostIOMMUDeviceClass parent_class; + + /** + * @attach_hwpt: attach host IOMMU device to IOMMUFD hardware page table. + * VFIO and VDPA device can have different implementation. + * + * Mandatory callback. + * + * @idev: host IOMMU device backed by IOMMUFD backend. + * + * @hwpt_id: ID of IOMMUFD hardware page table. + * + * @errp: pass an Error out when attachment fails. + * + * Returns: true on success, false on failure. + */ + bool (*attach_hwpt)(HostIOMMUDeviceIOMMUFD *idev, uint32_t hwpt_id, + Error **errp); + /** + * @detach_hwpt: detach host IOMMU device from IOMMUFD hardware page table. + * VFIO and VDPA device can have different implementation. + * + * Mandatory callback. + * + * @idev: host IOMMU device backed by IOMMUFD backend. + * + * @errp: pass an Error out when attachment fails. + * + * Returns: true on success, false on failure. + */ + bool (*detach_hwpt)(HostIOMMUDeviceIOMMUFD *idev, Error **errp); +}; + +bool host_iommu_device_iommufd_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + uint32_t hwpt_id, Error **errp); +bool host_iommu_device_iommufd_detach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + Error **errp); #endif -- Gitee From 53a82c6a5a22bb41e9bd3f754479baf4ce0845bf Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Mon, 5 Aug 2024 09:29:00 +0800 Subject: [PATCH 093/134] HostIOMMUDevice: Introduce realize_late callback Previously we have a realize() callback which is called before attachment. But there are still some elements e.g., ioas not ready before attachment. So we need a realize_late() callback to further initialize them. Currently, this callback is only useful for iommufd backend. For legacy backend nothing needs to be initialized after attachment. Signed-off-by: Zhenzhong Duan --- hw/vfio/common.c | 18 +++++++++++++++--- include/sysemu/host_iommu_device.h | 17 +++++++++++++++++ 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index a8bc1c6055..0be63c5fbc 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -1654,6 +1654,7 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev, const VFIOIOMMUClass *ops = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY)); HostIOMMUDevice *hiod = NULL; + HostIOMMUDeviceClass *hiod_ops = NULL; int ret; if (vbasedev->iommufd) { @@ -1664,17 +1665,28 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev, if (!vbasedev->mdev) { hiod = HOST_IOMMU_DEVICE(object_new(ops->hiod_typename)); + hiod_ops = HOST_IOMMU_DEVICE_GET_CLASS(hiod); vbasedev->hiod = hiod; } ret = ops->attach_device(name, vbasedev, as, errp); if (ret) { - object_unref(hiod); - vbasedev->hiod = NULL; - return ret; + goto err_attach; + } + + if (hiod_ops && hiod_ops->realize_late && + !hiod_ops->realize_late(hiod, vbasedev, errp)) { + ops->detach_device(vbasedev); + ret = -EINVAL; + goto err_attach; } return 0; + +err_attach: + object_unref(hiod); + vbasedev->hiod = NULL; + return ret; } void vfio_detach_device(VFIODevice *vbasedev) diff --git a/include/sysemu/host_iommu_device.h b/include/sysemu/host_iommu_device.h index e4d8300350..84131f5495 100644 --- a/include/sysemu/host_iommu_device.h +++ b/include/sysemu/host_iommu_device.h @@ -64,6 +64,23 @@ struct HostIOMMUDeviceClass { * Returns: true on success, false on failure. */ bool (*realize)(HostIOMMUDevice *hiod, void *opaque, Error **errp); + /** + * @realize_late: initialize host IOMMU device instance after attachment, + * some elements e.g., ioas are ready only after attachment. + * This callback initialize them. + * + * Optional callback. + * + * @hiod: pointer to a host IOMMU device instance. + * + * @opaque: pointer to agent device of this host IOMMU device, + * e.g., VFIO base device or VDPA device. + * + * @errp: pass an Error out when realize fails. + * + * Returns: true on success, false on failure. + */ + bool (*realize_late)(HostIOMMUDevice *hiod, void *opaque, Error **errp); /** * @get_cap: check if a host IOMMU device capability is supported. * -- Gitee From b727a28ce2cf062473ca011dd69697e0b7826a25 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Mon, 5 Aug 2024 09:29:00 +0800 Subject: [PATCH 094/134] vfio/iommufd: Implement HostIOMMUDeviceClass::realize_late() handler There are three iommufd related elements iommufd handle, devid and ioas_id. ioas_id is ready only after VFIO device attachment. Device id and iommufd handle are ready before attachment, but they are all iommufd related elements, initialize them together with ioas_id. Signed-off-by: Zhenzhong Duan --- hw/vfio/iommufd.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 3d4f902ae5..47a8823146 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -827,6 +827,22 @@ static void vfio_iommu_iommufd_class_init(ObjectClass *klass, void *data) vioc->query_dirty_bitmap = iommufd_query_dirty_bitmap; }; +static bool hiod_iommufd_vfio_realize_late(HostIOMMUDevice *hiod, void *opaque, + Error **errp) +{ + VFIODevice *vdev = opaque; + VFIOIOMMUFDContainer *container = container_of(vdev->bcontainer, + VFIOIOMMUFDContainer, + bcontainer); + HostIOMMUDeviceIOMMUFD *idev = HOST_IOMMU_DEVICE_IOMMUFD(hiod); + + idev->iommufd = vdev->iommufd; + idev->devid = vdev->devid; + idev->ioas_id = container->ioas_id; + + return true; +} + static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, Error **errp) { @@ -858,6 +874,7 @@ static void hiod_iommufd_vfio_class_init(ObjectClass *oc, void *data) HostIOMMUDeviceClass *hiodc = HOST_IOMMU_DEVICE_CLASS(oc); hiodc->realize = hiod_iommufd_vfio_realize; + hiodc->realize_late = hiod_iommufd_vfio_realize_late; }; static const TypeInfo types[] = { -- Gitee From aea706f6a71ddbcc9bd342ece14991f8f8261224 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Thu, 11 Jan 2024 17:26:50 +0800 Subject: [PATCH 095/134] vfio/iommufd: Implement [at|de]tach_hwpt handlers Implement [at|de]tach_hwpt handlers in VFIO subsystem. vIOMMU utilizes them to attach to or detach from hwpt on host side. To achieve that, a new property vdev is add to TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO which is initialized in .realize() handler. Signed-off-by: Yi Liu Signed-off-by: Zhenzhong Duan [Shameer: Changed ret for host_iommu_device_iommufd_vfio_detach_hwpt()] Signed-off-by: Shameer Kolothum --- hw/vfio/iommufd.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 47a8823146..528023b95b 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -827,6 +827,24 @@ static void vfio_iommu_iommufd_class_init(ObjectClass *klass, void *data) vioc->query_dirty_bitmap = iommufd_query_dirty_bitmap; }; +static bool +host_iommu_device_iommufd_vfio_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + uint32_t hwpt_id, Error **errp) +{ + VFIODevice *vbasedev = HOST_IOMMU_DEVICE(idev)->agent; + + return !iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt_id, errp); +} + +static bool +host_iommu_device_iommufd_vfio_detach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + Error **errp) +{ + VFIODevice *vbasedev = HOST_IOMMU_DEVICE(idev)->agent; + + return !iommufd_cdev_detach_ioas_hwpt(vbasedev, errp); +} + static bool hiod_iommufd_vfio_realize_late(HostIOMMUDevice *hiod, void *opaque, Error **errp) { @@ -872,9 +890,13 @@ static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, static void hiod_iommufd_vfio_class_init(ObjectClass *oc, void *data) { HostIOMMUDeviceClass *hiodc = HOST_IOMMU_DEVICE_CLASS(oc); + HostIOMMUDeviceIOMMUFDClass *idevc = HOST_IOMMU_DEVICE_IOMMUFD_CLASS(oc); hiodc->realize = hiod_iommufd_vfio_realize; hiodc->realize_late = hiod_iommufd_vfio_realize_late; + + idevc->attach_hwpt = host_iommu_device_iommufd_vfio_attach_hwpt; + idevc->detach_hwpt = host_iommu_device_iommufd_vfio_detach_hwpt; }; static const TypeInfo types[] = { -- Gitee From 207259b8f08e87b4a741a8b7884e699c95641a2e Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Sat, 13 Apr 2024 00:15:17 +0000 Subject: [PATCH 096/134] backends/iommufd: Introduce iommufd_backend_alloc_viommu Add a helper to allocate a viommu object. Signed-off-by: Nicolin Chen --- backends/iommufd.c | 35 +++++++++++++++++++++++++++++++++++ backends/trace-events | 1 + include/sysemu/iommufd.h | 10 ++++++++++ 3 files changed, 46 insertions(+) diff --git a/backends/iommufd.c b/backends/iommufd.c index c10aa9b011..82368a3918 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -360,6 +360,41 @@ int iommufd_backend_invalidate_cache(IOMMUFDBackend *be, uint32_t hwpt_id, return ret; } +struct IOMMUFDViommu *iommufd_backend_alloc_viommu(IOMMUFDBackend *be, + uint32_t dev_id, + uint32_t viommu_type, + uint32_t hwpt_id) +{ + int ret, fd = be->fd; + struct IOMMUFDViommu *viommu = g_malloc(sizeof(*viommu)); + struct iommu_viommu_alloc alloc_viommu = { + .size = sizeof(alloc_viommu), + .type = viommu_type, + .dev_id = dev_id, + .hwpt_id = hwpt_id, + }; + + if (!viommu) { + error_report("failed to allocate viommu object"); + return NULL; + } + + ret = ioctl(fd, IOMMU_VIOMMU_ALLOC, &alloc_viommu); + + trace_iommufd_backend_alloc_viommu(fd, viommu_type, dev_id, hwpt_id, + alloc_viommu.out_viommu_id, ret); + if (ret) { + error_report("IOMMU_VIOMMU_ALLOC failed: %s", strerror(errno)); + g_free(viommu); + return NULL; + } + + viommu->viommu_id = alloc_viommu.out_viommu_id; + viommu->s2_hwpt_id = hwpt_id; + viommu->iommufd = be; + return viommu; +} + bool host_iommu_device_iommufd_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev, uint32_t hwpt_id, Error **errp) { diff --git a/backends/trace-events b/backends/trace-events index ef0ff98921..c24cd378df 100644 --- a/backends/trace-events +++ b/backends/trace-events @@ -19,3 +19,4 @@ iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d id=%d (% iommufd_backend_set_dirty(int iommufd, uint32_t hwpt_id, bool start, int ret) " iommufd=%d hwpt=%u enable=%d (%d)" iommufd_backend_get_dirty_bitmap(int iommufd, uint32_t hwpt_id, uint64_t iova, uint64_t size, uint64_t page_size, int ret) " iommufd=%d hwpt=%u iova=0x%"PRIx64" size=0x%"PRIx64" page_size=0x%"PRIx64" (%d)" iommufd_backend_invalidate_cache(int iommufd, uint32_t hwpt_id, uint32_t data_type, uint32_t entry_len, uint32_t entry_num, uint32_t done_num, uint64_t data_ptr, int ret) " iommufd=%d hwpt_id=%u data_type=%u entry_len=%u entry_num=%u done_num=%u data_ptr=0x%"PRIx64" (%d)" +iommufd_backend_alloc_viommu(int iommufd, uint32_t type, uint32_t dev_id, uint32_t hwpt_id, uint32_t viommu_id, int ret) " iommufd=%d type=%u dev_id=%u hwpt_id=%u viommu_id=%u (%d)" diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h index 3dc6934144..05a08c49c2 100644 --- a/include/sysemu/iommufd.h +++ b/include/sysemu/iommufd.h @@ -39,6 +39,12 @@ struct IOMMUFDBackend { /*< public >*/ }; +typedef struct IOMMUFDViommu { + IOMMUFDBackend *iommufd; + uint32_t s2_hwpt_id; + uint32_t viommu_id; +} IOMMUFDViommu; + int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp); void iommufd_backend_disconnect(IOMMUFDBackend *be); @@ -66,6 +72,10 @@ bool iommufd_backend_get_dirty_bitmap(IOMMUFDBackend *be, uint32_t hwpt_id, int iommufd_backend_invalidate_cache(IOMMUFDBackend *be, uint32_t hwpt_id, uint32_t data_type, uint32_t entry_len, uint32_t *entry_num, void *data_ptr); +struct IOMMUFDViommu *iommufd_backend_alloc_viommu(IOMMUFDBackend *be, + uint32_t dev_id, + uint32_t viommu_type, + uint32_t hwpt_id); #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd" OBJECT_DECLARE_TYPE(HostIOMMUDeviceIOMMUFD, HostIOMMUDeviceIOMMUFDClass, -- Gitee From 005b8f4b6cef11982abcc2c071cbe40b69fb22e7 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Sat, 13 Apr 2024 00:21:22 +0000 Subject: [PATCH 097/134] backends/iommufd: Introduce iommufd_vdev_alloc Add a helper to allocate an iommufd device's virtual device (in the user space) per a viommu instance. Signed-off-by: Nicolin Chen --- backends/iommufd.c | 31 +++++++++++++++++++++++++++++++ backends/trace-events | 1 + include/sysemu/iommufd.h | 11 +++++++++++ 3 files changed, 43 insertions(+) diff --git a/backends/iommufd.c b/backends/iommufd.c index 82368a3918..af3376d0bf 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -395,6 +395,37 @@ struct IOMMUFDViommu *iommufd_backend_alloc_viommu(IOMMUFDBackend *be, return viommu; } +struct IOMMUFDVdev *iommufd_backend_alloc_vdev(HostIOMMUDeviceIOMMUFD *idev, + IOMMUFDViommu *viommu, + uint64_t virt_id) +{ + int ret, fd = viommu->iommufd->fd; + struct IOMMUFDVdev *vdev = g_malloc(sizeof(*vdev)); + struct iommu_vdevice_alloc alloc_vdev = { + .size = sizeof(alloc_vdev), + .viommu_id = viommu->viommu_id, + .dev_id = idev->devid, + .virt_id = virt_id, + }; + + ret = ioctl(fd, IOMMU_VDEVICE_ALLOC, &alloc_vdev); + + trace_iommufd_backend_alloc_vdev(fd, idev->devid, viommu->viommu_id, virt_id, + alloc_vdev.out_vdevice_id, ret); + + if (ret) { + error_report("IOMMU_VDEVICE_ALLOC failed: %s", strerror(errno)); + g_free(vdev); + return NULL; + } + + vdev->idev = idev; + vdev->viommu = viommu; + vdev->virt_id = virt_id; + vdev->vdev_id = alloc_vdev.out_vdevice_id; + return vdev; +} + bool host_iommu_device_iommufd_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev, uint32_t hwpt_id, Error **errp) { diff --git a/backends/trace-events b/backends/trace-events index c24cd378df..e150a37e9a 100644 --- a/backends/trace-events +++ b/backends/trace-events @@ -20,3 +20,4 @@ iommufd_backend_set_dirty(int iommufd, uint32_t hwpt_id, bool start, int ret) " iommufd_backend_get_dirty_bitmap(int iommufd, uint32_t hwpt_id, uint64_t iova, uint64_t size, uint64_t page_size, int ret) " iommufd=%d hwpt=%u iova=0x%"PRIx64" size=0x%"PRIx64" page_size=0x%"PRIx64" (%d)" iommufd_backend_invalidate_cache(int iommufd, uint32_t hwpt_id, uint32_t data_type, uint32_t entry_len, uint32_t entry_num, uint32_t done_num, uint64_t data_ptr, int ret) " iommufd=%d hwpt_id=%u data_type=%u entry_len=%u entry_num=%u done_num=%u data_ptr=0x%"PRIx64" (%d)" iommufd_backend_alloc_viommu(int iommufd, uint32_t type, uint32_t dev_id, uint32_t hwpt_id, uint32_t viommu_id, int ret) " iommufd=%d type=%u dev_id=%u hwpt_id=%u viommu_id=%u (%d)" +iommufd_backend_alloc_vdev(int iommufd, uint32_t dev_id, uint32_t viommu_id, uint64_t virt_id, uint32_t vdev_id, int ret) " iommufd=%d dev_id=%u viommu_id=%u virt_id=0x%"PRIx64" vdev_id=%u (%d)" diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h index 05a08c49c2..0284e95460 100644 --- a/include/sysemu/iommufd.h +++ b/include/sysemu/iommufd.h @@ -128,4 +128,15 @@ bool host_iommu_device_iommufd_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev, uint32_t hwpt_id, Error **errp); bool host_iommu_device_iommufd_detach_hwpt(HostIOMMUDeviceIOMMUFD *idev, Error **errp); + +typedef struct IOMMUFDVdev { + HostIOMMUDeviceIOMMUFD *idev; + IOMMUFDViommu *viommu; + uint32_t vdev_id; + uint64_t virt_id; +} IOMMUFDVdev; + +struct IOMMUFDVdev *iommufd_backend_alloc_vdev(HostIOMMUDeviceIOMMUFD *idev, + IOMMUFDViommu *viommu, + uint64_t virt_id); #endif -- Gitee From 2be28f75e4ed2a0a35549dd1a545e0655e63973d Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 12 Apr 2024 23:27:54 +0000 Subject: [PATCH 098/134] backends/iommufd: Introduce iommufd_viommu_invalidate_cache Similar to iommufd_backend_invalidate_cache for iotlb invalidation via IOMMU_HWPT_INVALIDATE ioctl, add a new helper for viommu specific cache invalidation via IOMMU_VIOMMU_INVALIDATE ioctl. Signed-off-by: Nicolin Chen --- backends/iommufd.c | 31 +++++++++++++++++++++++++++++++ backends/trace-events | 1 + include/sysemu/iommufd.h | 3 +++ 3 files changed, 35 insertions(+) diff --git a/backends/iommufd.c b/backends/iommufd.c index af3376d0bf..ee6f5bcf65 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -426,6 +426,37 @@ struct IOMMUFDVdev *iommufd_backend_alloc_vdev(HostIOMMUDeviceIOMMUFD *idev, return vdev; } +int iommufd_viommu_invalidate_cache(IOMMUFDBackend *be, uint32_t viommu_id, + uint32_t data_type, uint32_t entry_len, + uint32_t *entry_num, void *data_ptr) +{ + int ret, fd = be->fd; + struct iommu_hwpt_invalidate cache = { + .size = sizeof(cache), + .hwpt_id = viommu_id, + .data_type = data_type, + .entry_len = entry_len, + .entry_num = *entry_num, + .data_uptr = (uint64_t)data_ptr, + }; + + ret = ioctl(fd, IOMMU_HWPT_INVALIDATE, &cache); + + trace_iommufd_viommu_invalidate_cache(fd, viommu_id, data_type, + entry_len, *entry_num, + cache.entry_num, + (uint64_t)data_ptr, ret); + if (ret) { + *entry_num = cache.entry_num; + error_report("IOMMU_VIOMMU_INVALIDATE failed: %s", strerror(errno)); + ret = -errno; + } else { + g_assert(*entry_num == cache.entry_num); + } + + return ret; +} + bool host_iommu_device_iommufd_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev, uint32_t hwpt_id, Error **errp) { diff --git a/backends/trace-events b/backends/trace-events index e150a37e9a..f8592a2711 100644 --- a/backends/trace-events +++ b/backends/trace-events @@ -21,3 +21,4 @@ iommufd_backend_get_dirty_bitmap(int iommufd, uint32_t hwpt_id, uint64_t iova, u iommufd_backend_invalidate_cache(int iommufd, uint32_t hwpt_id, uint32_t data_type, uint32_t entry_len, uint32_t entry_num, uint32_t done_num, uint64_t data_ptr, int ret) " iommufd=%d hwpt_id=%u data_type=%u entry_len=%u entry_num=%u done_num=%u data_ptr=0x%"PRIx64" (%d)" iommufd_backend_alloc_viommu(int iommufd, uint32_t type, uint32_t dev_id, uint32_t hwpt_id, uint32_t viommu_id, int ret) " iommufd=%d type=%u dev_id=%u hwpt_id=%u viommu_id=%u (%d)" iommufd_backend_alloc_vdev(int iommufd, uint32_t dev_id, uint32_t viommu_id, uint64_t virt_id, uint32_t vdev_id, int ret) " iommufd=%d dev_id=%u viommu_id=%u virt_id=0x%"PRIx64" vdev_id=%u (%d)" +iommufd_viommu_invalidate_cache(int iommufd, uint32_t viommu_id, uint32_t data_type, uint32_t entry_len, uint32_t entry_num, uint32_t done_num, uint64_t data_ptr, int ret) " iommufd=%d viommu_id=%u data_type=%u entry_len=%u entry_num=%u done_num=%u data_ptr=0x%"PRIx64" (%d)" diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h index 0284e95460..0f2c826036 100644 --- a/include/sysemu/iommufd.h +++ b/include/sysemu/iommufd.h @@ -76,6 +76,9 @@ struct IOMMUFDViommu *iommufd_backend_alloc_viommu(IOMMUFDBackend *be, uint32_t dev_id, uint32_t viommu_type, uint32_t hwpt_id); +int iommufd_viommu_invalidate_cache(IOMMUFDBackend *be, uint32_t viommu_id, + uint32_t data_type, uint32_t entry_len, + uint32_t *entry_num, void *data_ptr); #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd" OBJECT_DECLARE_TYPE(HostIOMMUDeviceIOMMUFD, HostIOMMUDeviceIOMMUFDClass, -- Gitee From d589010512005bfc698f30417911e4b14478c81b Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 22 Jun 2022 01:30:39 -0700 Subject: [PATCH 099/134] hw/arm/smmu-common: Add a nested flag to SMMUState Add a nested flag in the SMMUState, passed in from device property. Signed-off-by: Nicolin Chen --- hw/arm/smmu-common.c | 1 + hw/arm/smmuv3.c | 5 +++++ include/hw/arm/smmu-common.h | 4 ++++ 3 files changed, 10 insertions(+) diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c index 9a8ac45431..c5f3e02065 100644 --- a/hw/arm/smmu-common.c +++ b/hw/arm/smmu-common.c @@ -683,6 +683,7 @@ static Property smmu_dev_properties[] = { DEFINE_PROP_UINT8("bus_num", SMMUState, bus_num, 0), DEFINE_PROP_LINK("primary-bus", SMMUState, primary_bus, TYPE_PCI_BUS, PCIBus *), + DEFINE_PROP_BOOL("nested", SMMUState, nested, false), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c index c3871ae067..64ca4c5542 100644 --- a/hw/arm/smmuv3.c +++ b/hw/arm/smmuv3.c @@ -1746,6 +1746,11 @@ static void smmu_realize(DeviceState *d, Error **errp) SysBusDevice *dev = SYS_BUS_DEVICE(d); Error *local_err = NULL; + if (s->stage && strcmp("1", s->stage)) { + /* Only support nested with an stage1 only vSMMU */ + sys->nested = false; + } + c->parent_realize(d, &local_err); if (local_err) { error_propagate(errp, local_err); diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h index fd8d772da1..eae5d4d05b 100644 --- a/include/hw/arm/smmu-common.h +++ b/include/hw/arm/smmu-common.h @@ -22,6 +22,7 @@ #include "hw/sysbus.h" #include "hw/pci/pci.h" #include "qom/object.h" +#include "sysemu/iommufd.h" #define SMMU_PCI_BUS_MAX 256 #define SMMU_PCI_DEVFN_MAX 256 @@ -136,6 +137,9 @@ struct SMMUState { const char *mrtypename; MemoryRegion iomem; + /* Nested SMMU */ + bool nested; + GHashTable *smmu_pcibus_by_busptr; GHashTable *configs; /* cache for configuration data */ GHashTable *iotlb; -- Gitee From 6c330f39cc08e4c641a3567e2b6ad0ebcadf5165 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 21 Jun 2024 21:22:04 +0000 Subject: [PATCH 100/134] hw/arm/smmu-common: Bypass emulated IOTLB for a nested SMMU If a vSMMU is configured as a nested one, HW IOTLB will be used and all cache invalidation should be done to the HW IOTLB too, v.s. the emulated iotlb. In this case, an iommu notifier isn't registered, as the devices behind a nested SMMU would stay in the system address space for stage-2 mappings. However, the KVM code still requests an iommu address space to translate an MSI doorbell gIOVA via get_msi_address_space() and translate(). Since a nested SMMU doesn't register an iommu notifier to flush emulated iotlb, bypass the emulated IOTLB and always walk through the guest-level IO page table. Note that regular nested SMMU could still register an iommu notifier for IOTLB invalidation, since QEMU traps the invalidation commands. But this would result in invalidation inefficiency since each invlaidation would be doubled for both HW IOTLB and the emulated IOTLB. Also, with NVIDIA's CMDQV feature on its Grace SoC, invalidation commands are issued to the CMDQ HW direclty, without any trapping. So, there is no way to maintain the emulated IOTLB. Meanwhile, the stage-1 translation request from KVM is only activated in case of an MSI table update, which does not happen that often to impact performance if walking through the guest RAM every time. Signed-off-by: Nicolin Chen --- hw/arm/smmu-common.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c index c5f3e02065..016418a48c 100644 --- a/hw/arm/smmu-common.c +++ b/hw/arm/smmu-common.c @@ -75,6 +75,16 @@ SMMUTLBEntry *smmu_iotlb_lookup(SMMUState *bs, SMMUTransCfg *cfg, uint8_t level = 4 - (inputsize - 4) / stride; SMMUTLBEntry *entry = NULL; + /* + * Stage-1 translation with a nested SMMU in general uses HW IOTLB. However, + * KVM still requests for an iommu address space for an MSI fixup by looking + * up stage-1 page table. Make sure we don't go through the emulated pathway + * so that the emulated iotlb will not need any invalidation. + */ + if (bs->nested) { + return NULL; + } + while (level <= 3) { uint64_t subpage_size = 1ULL << level_shift(level, tt->granule_sz); uint64_t mask = subpage_size - 1; @@ -110,6 +120,16 @@ void smmu_iotlb_insert(SMMUState *bs, SMMUTransCfg *cfg, SMMUTLBEntry *new) SMMUIOTLBKey *key = g_new0(SMMUIOTLBKey, 1); uint8_t tg = (new->granule - 10) / 2; + /* + * Stage-1 translation with a nested SMMU in general uses HW IOTLB. However, + * KVM still requests for an iommu address space for an MSI fixup by looking + * up stage-1 page table. Make sure we don't go through the emulated pathway + * so that the emulated iotlb will not need any invalidation. + */ + if (bs->nested) { + return; + } + if (g_hash_table_size(bs->iotlb) >= SMMU_IOTLB_MAX_SIZE) { smmu_iotlb_inv_all(bs); } -- Gitee From 2fea4f93632679afcb15f0c35b3d9abeede37778 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 10 Apr 2024 16:37:25 +0000 Subject: [PATCH 101/134] hw/arm/smmu-common: Extract smmu_get_sbus and smmu_get_sdev helpers Add two helpers to get sbus and sdev respectively. These will be used by the following patch adding set/unset_iommu_device ops. Signed-off-by: Nicolin Chen --- hw/arm/smmu-common.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c index 016418a48c..03d9ff58d4 100644 --- a/hw/arm/smmu-common.c +++ b/hw/arm/smmu-common.c @@ -589,12 +589,9 @@ SMMUPciBus *smmu_find_smmu_pcibus(SMMUState *s, uint8_t bus_num) return NULL; } -static AddressSpace *smmu_find_add_as(PCIBus *bus, void *opaque, int devfn) +static SMMUPciBus *smmu_get_sbus(SMMUState *s, PCIBus *bus) { - SMMUState *s = opaque; SMMUPciBus *sbus = g_hash_table_lookup(s->smmu_pcibus_by_busptr, bus); - SMMUDevice *sdev; - static unsigned int index; if (!sbus) { sbus = g_malloc0(sizeof(SMMUPciBus) + @@ -603,7 +600,15 @@ static AddressSpace *smmu_find_add_as(PCIBus *bus, void *opaque, int devfn) g_hash_table_insert(s->smmu_pcibus_by_busptr, bus, sbus); } - sdev = sbus->pbdev[devfn]; + return sbus; +} + +static SMMUDevice *smmu_get_sdev(SMMUState *s, SMMUPciBus *sbus, + PCIBus *bus, int devfn) +{ + SMMUDevice *sdev = sbus->pbdev[devfn]; + static unsigned int index; + if (!sdev) { char *name = g_strdup_printf("%s-%d-%d", s->mrtypename, devfn, index++); @@ -622,6 +627,15 @@ static AddressSpace *smmu_find_add_as(PCIBus *bus, void *opaque, int devfn) g_free(name); } + return sdev; +} + +static AddressSpace *smmu_find_add_as(PCIBus *bus, void *opaque, int devfn) +{ + SMMUState *s = opaque; + SMMUPciBus *sbus = smmu_get_sbus(s, bus); + SMMUDevice *sdev = smmu_get_sdev(s, sbus, bus, devfn); + return &sdev->as; } -- Gitee From 539e12641dc2db30a6fea7a0f061e163bc245d79 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 22 Jun 2022 02:16:52 -0700 Subject: [PATCH 102/134] hw/arm/smmu-common: Add set/unset_iommu_device callback Implement a set_iommu_device callback: - Find an existing S2 hwpt to test attach() or allocate a new one (Devices behind the same physical SMMU should share an S2 HWPT.) - Attach the device to the S2 hwpt and add it to its device list And add an unset_iommu_device doing the opposite cleanup routine. Signed-off-by: Nicolin Chen --- hw/arm/smmu-common.c | 177 +++++++++++++++++++++++++++++++++++ hw/arm/trace-events | 2 + include/hw/arm/smmu-common.h | 21 +++++ 3 files changed, 200 insertions(+) diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c index 03d9ff58d4..038ae857d8 100644 --- a/hw/arm/smmu-common.c +++ b/hw/arm/smmu-common.c @@ -20,6 +20,7 @@ #include "trace.h" #include "exec/target_page.h" #include "hw/core/cpu.h" +#include "hw/pci/pci_device.h" #include "hw/qdev-properties.h" #include "qapi/error.h" #include "qemu/jhash.h" @@ -639,8 +640,184 @@ static AddressSpace *smmu_find_add_as(PCIBus *bus, void *opaque, int devfn) return &sdev->as; } +static bool smmu_dev_attach_viommu(SMMUDevice *sdev, + HostIOMMUDeviceIOMMUFD *idev, Error **errp) +{ + struct iommu_hwpt_arm_smmuv3 bypass_data = { + .ste = { 0x9ULL, 0x0ULL }, //0x1ULL << (108 - 64) }, + }; + struct iommu_hwpt_arm_smmuv3 abort_data = { + .ste = { 0x1ULL, 0x0ULL }, + }; + SMMUState *s = sdev->smmu; + SMMUS2Hwpt *s2_hwpt; + SMMUViommu *viommu; + uint32_t s2_hwpt_id; + + if (s->viommu) { + return host_iommu_device_iommufd_attach_hwpt( + idev, s->viommu->s2_hwpt->hwpt_id, errp); + } + + if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid, idev->ioas_id, + IOMMU_HWPT_ALLOC_NEST_PARENT, + IOMMU_HWPT_DATA_NONE, 0, NULL, + &s2_hwpt_id, errp)) { + error_setg(errp, "failed to allocate an S2 hwpt"); + return false; + } + + /* Attach to S2 for MSI cookie */ + if (!host_iommu_device_iommufd_attach_hwpt(idev, s2_hwpt_id, errp)) { + error_setg(errp, "failed to attach stage-2 HW pagetable"); + goto free_s2_hwpt; + } + + viommu = g_new0(SMMUViommu, 1); + + viommu->core = iommufd_backend_alloc_viommu(idev->iommufd, idev->devid, + IOMMU_VIOMMU_TYPE_ARM_SMMUV3, + s2_hwpt_id); + if (!viommu->core) { + error_setg(errp, "failed to allocate a viommu"); + goto free_viommu; + } + + if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid, + viommu->core->viommu_id, 0, + IOMMU_HWPT_DATA_ARM_SMMUV3, + sizeof(abort_data), &abort_data, + &viommu->abort_hwpt_id, errp)) { + error_setg(errp, "failed to allocate an abort pagetable"); + goto free_viommu_core; + } + + if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid, + viommu->core->viommu_id, 0, + IOMMU_HWPT_DATA_ARM_SMMUV3, + sizeof(bypass_data), &bypass_data, + &viommu->bypass_hwpt_id, errp)) { + error_setg(errp, "failed to allocate a bypass pagetable"); + goto free_abort_hwpt; + } + + if (!host_iommu_device_iommufd_attach_hwpt( + idev, viommu->bypass_hwpt_id, errp)) { + error_setg(errp, "failed to attach the bypass pagetable"); + goto free_bypass_hwpt; + } + + s2_hwpt = g_new0(SMMUS2Hwpt, 1); + s2_hwpt->iommufd = idev->iommufd; + s2_hwpt->hwpt_id = s2_hwpt_id; + s2_hwpt->ioas_id = idev->ioas_id; + + viommu->iommufd = idev->iommufd; + viommu->s2_hwpt = s2_hwpt; + + s->viommu = viommu; + return true; + +free_bypass_hwpt: + iommufd_backend_free_id(idev->iommufd, viommu->bypass_hwpt_id); +free_abort_hwpt: + iommufd_backend_free_id(idev->iommufd, viommu->abort_hwpt_id); +free_viommu_core: + iommufd_backend_free_id(idev->iommufd, viommu->core->viommu_id); + g_free(viommu->core); +free_viommu: + g_free(viommu); + host_iommu_device_iommufd_attach_hwpt(idev, sdev->idev->ioas_id, errp); +free_s2_hwpt: + iommufd_backend_free_id(idev->iommufd, s2_hwpt_id); + return false; +} + +static bool smmu_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn, + HostIOMMUDevice *hiod, Error **errp) +{ + HostIOMMUDeviceIOMMUFD *idev = HOST_IOMMU_DEVICE_IOMMUFD(hiod); + SMMUState *s = opaque; + SMMUPciBus *sbus = smmu_get_sbus(s, bus); + SMMUDevice *sdev = smmu_get_sdev(s, sbus, bus, devfn); + + if (!s->nested) { + return true; + } + + if (sdev->idev) { + if (sdev->idev != idev) { + return false;//-EEXIST; + } else { + return true; + } + } + + if (!idev) { + return true; + } + + if (!smmu_dev_attach_viommu(sdev, idev, errp)) { + error_report("Unable to attach viommu"); + return false; + } + + sdev->idev = idev; + sdev->viommu = s->viommu; + QLIST_INSERT_HEAD(&s->viommu->device_list, sdev, next); + trace_smmu_set_iommu_device(devfn, smmu_get_sid(sdev)); + + return true; +} + +static void smmu_dev_unset_iommu_device(PCIBus *bus, void *opaque, int devfn) +{ + SMMUDevice *sdev; + SMMUViommu *viommu; + SMMUState *s = opaque; + SMMUPciBus *sbus = g_hash_table_lookup(s->smmu_pcibus_by_busptr, bus); + + if (!s->nested) { + return; + } + + if (!sbus) { + return; + } + + sdev = sbus->pbdev[devfn]; + if (!sdev) { + return; + } + + if (!host_iommu_device_iommufd_attach_hwpt(sdev->idev, + sdev->idev->ioas_id, NULL)) { + error_report("Unable to attach dev to the default HW pagetable"); + } + + viommu = sdev->viommu; + + sdev->idev = NULL; + sdev->viommu = NULL; + QLIST_REMOVE(sdev, next); + trace_smmu_unset_iommu_device(devfn, smmu_get_sid(sdev)); + + if (QLIST_EMPTY(&viommu->device_list)) { + iommufd_backend_free_id(viommu->iommufd, viommu->bypass_hwpt_id); + iommufd_backend_free_id(viommu->iommufd, viommu->abort_hwpt_id); + iommufd_backend_free_id(viommu->iommufd, viommu->core->viommu_id); + g_free(viommu->core); + iommufd_backend_free_id(viommu->iommufd, viommu->s2_hwpt->hwpt_id); + g_free(viommu->s2_hwpt); + g_free(viommu); + s->viommu = NULL; + } +} + static const PCIIOMMUOps smmu_ops = { .get_address_space = smmu_find_add_as, + .set_iommu_device = smmu_dev_set_iommu_device, + .unset_iommu_device = smmu_dev_unset_iommu_device, }; IOMMUMemoryRegion *smmu_iommu_mr(SMMUState *s, uint32_t sid) diff --git a/hw/arm/trace-events b/hw/arm/trace-events index cdc1ea06a8..58e0636e95 100644 --- a/hw/arm/trace-events +++ b/hw/arm/trace-events @@ -5,6 +5,8 @@ virt_acpi_setup(void) "No fw cfg or ACPI disabled. Bailing out." # smmu-common.c smmu_add_mr(const char *name) "%s" +smmu_set_iommu_device(int devfn, uint32_t sid) "devfn=%d (sid=%d)" +smmu_unset_iommu_device(int devfn, uint32_t sid) "devfn=%d (sid=%d)" smmu_ptw_level(int stage, int level, uint64_t iova, size_t subpage_size, uint64_t baseaddr, uint32_t offset, uint64_t pte) "stage=%d level=%d iova=0x%"PRIx64" subpage_sz=0x%zx baseaddr=0x%"PRIx64" offset=%d => pte=0x%"PRIx64 smmu_ptw_invalid_pte(int stage, int level, uint64_t baseaddr, uint64_t pteaddr, uint32_t offset, uint64_t pte) "stage=%d level=%d base@=0x%"PRIx64" pte@=0x%"PRIx64" offset=%d pte=0x%"PRIx64 smmu_ptw_page_pte(int stage, int level, uint64_t iova, uint64_t baseaddr, uint64_t pteaddr, uint64_t pte, uint64_t address) "stage=%d level=%d iova=0x%"PRIx64" base@=0x%"PRIx64" pte@=0x%"PRIx64" pte=0x%"PRIx64" page address = 0x%"PRIx64 diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h index eae5d4d05b..3bfb68cef6 100644 --- a/include/hw/arm/smmu-common.h +++ b/include/hw/arm/smmu-common.h @@ -23,6 +23,7 @@ #include "hw/pci/pci.h" #include "qom/object.h" #include "sysemu/iommufd.h" +#include #define SMMU_PCI_BUS_MAX 256 #define SMMU_PCI_DEVFN_MAX 256 @@ -107,11 +108,30 @@ typedef struct SMMUTransCfg { struct SMMUS2Cfg s2cfg; } SMMUTransCfg; +typedef struct SMMUS2Hwpt { + IOMMUFDBackend *iommufd; + uint32_t hwpt_id; + uint32_t ioas_id; +} SMMUS2Hwpt; + +typedef struct SMMUViommu { + void *smmu; + IOMMUFDBackend *iommufd; + IOMMUFDViommu *core; + SMMUS2Hwpt *s2_hwpt; + uint32_t bypass_hwpt_id; + uint32_t abort_hwpt_id; + QLIST_HEAD(, SMMUDevice) device_list; + QLIST_ENTRY(SMMUViommu) next; +} SMMUViommu; + typedef struct SMMUDevice { void *smmu; PCIBus *bus; int devfn; IOMMUMemoryRegion iommu; + HostIOMMUDeviceIOMMUFD *idev; + SMMUViommu *viommu; AddressSpace as; uint32_t cfg_cache_hits; uint32_t cfg_cache_misses; @@ -139,6 +159,7 @@ struct SMMUState { /* Nested SMMU */ bool nested; + SMMUViommu *viommu; GHashTable *smmu_pcibus_by_busptr; GHashTable *configs; /* cache for configuration data */ -- Gitee From a2735cd15160a62065a0a0b39af405c7b0f3fae8 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 22 Jun 2022 14:41:27 -0700 Subject: [PATCH 103/134] hw/arm/smmu-common: Add iommufd helpers Add a set of helper functions for IOMMUFD and new "struct SMMUS1Hwpt" to store the nested hwpt information. Signed-off-by: Nicolin Chen --- hw/arm/smmu-common.c | 108 +++++++++++++++++++++++++++++++++++ include/hw/arm/smmu-common.h | 20 +++++++ 2 files changed, 128 insertions(+) diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c index 038ae857d8..a79eb34277 100644 --- a/hw/arm/smmu-common.c +++ b/hw/arm/smmu-common.c @@ -838,6 +838,114 @@ IOMMUMemoryRegion *smmu_iommu_mr(SMMUState *s, uint32_t sid) return NULL; } +/* IOMMUFD helpers */ +int smmu_dev_get_info(SMMUDevice *sdev, uint32_t *data_type, + uint32_t data_len, void *data) +{ + uint64_t caps; + + if (!sdev || !sdev->idev) { + return -ENOENT; + } + + return !iommufd_backend_get_device_info(sdev->idev->iommufd, + sdev->idev->devid, data_type, data, + data_len, &caps, NULL); +} + +void smmu_dev_uninstall_nested_ste(SMMUDevice *sdev, bool abort) +{ + HostIOMMUDeviceIOMMUFD *idev = sdev->idev; + SMMUS1Hwpt *s1_hwpt = sdev->s1_hwpt; + uint32_t hwpt_id; + + if (!s1_hwpt || !sdev->viommu) { + return; + } + + if (abort) { + hwpt_id = sdev->viommu->abort_hwpt_id; + } else { + hwpt_id = sdev->viommu->bypass_hwpt_id; + } + + if (!host_iommu_device_iommufd_attach_hwpt(idev, hwpt_id, NULL)) { + return; + } + + iommufd_backend_free_id(idev->iommufd, s1_hwpt->hwpt_id); + sdev->s1_hwpt = NULL; + g_free(s1_hwpt); +} + +int smmu_dev_install_nested_ste(SMMUDevice *sdev, uint32_t data_type, + uint32_t data_len, void *data) +{ + SMMUViommu *viommu = sdev->viommu; + SMMUS1Hwpt *s1_hwpt = sdev->s1_hwpt; + HostIOMMUDeviceIOMMUFD *idev = sdev->idev; + + if (!idev || !viommu) { + return -ENOENT; + } + + if (s1_hwpt) { + smmu_dev_uninstall_nested_ste(sdev, false); + } + + s1_hwpt = g_new0(SMMUS1Hwpt, 1); + if (!s1_hwpt) { + return -ENOMEM; + } + + s1_hwpt->smmu = sdev->smmu; + s1_hwpt->viommu = viommu; + s1_hwpt->iommufd = idev->iommufd; + + if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid, + viommu->core->viommu_id, 0, data_type, + data_len, data, &s1_hwpt->hwpt_id, NULL)) { + goto free; + } + + if (!host_iommu_device_iommufd_attach_hwpt(idev, s1_hwpt->hwpt_id, NULL)) { + goto free_hwpt; + } + + sdev->s1_hwpt = s1_hwpt; + + return 0; +free_hwpt: + iommufd_backend_free_id(idev->iommufd, s1_hwpt->hwpt_id); +free: + sdev->s1_hwpt = NULL; + g_free(s1_hwpt); + + return -EINVAL; +} + +int smmu_hwpt_invalidate_cache(SMMUS1Hwpt *s1_hwpt, uint32_t type, uint32_t len, + uint32_t *num, void *reqs) +{ + if (!s1_hwpt) { + return -ENOENT; + } + + return iommufd_backend_invalidate_cache(s1_hwpt->iommufd, s1_hwpt->hwpt_id, + type, len, num, reqs); +} + +int smmu_viommu_invalidate_cache(IOMMUFDViommu *viommu, uint32_t type, + uint32_t len, uint32_t *num, void *reqs) +{ + if (!viommu) { + return -ENOENT; + } + + return iommufd_viommu_invalidate_cache(viommu->iommufd, viommu->viommu_id, + type, len, num, reqs); +} + /* Unmap all notifiers attached to @mr */ static void smmu_inv_notifiers_mr(IOMMUMemoryRegion *mr) { diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h index 3bfb68cef6..66dc7206ea 100644 --- a/include/hw/arm/smmu-common.h +++ b/include/hw/arm/smmu-common.h @@ -125,6 +125,15 @@ typedef struct SMMUViommu { QLIST_ENTRY(SMMUViommu) next; } SMMUViommu; +typedef struct SMMUS1Hwpt { + void *smmu; + IOMMUFDBackend *iommufd; + SMMUViommu *viommu; + uint32_t hwpt_id; + QLIST_HEAD(, SMMUDevice) device_list; + QLIST_ENTRY(SMMUViommu) next; +} SMMUS1Hwpt; + typedef struct SMMUDevice { void *smmu; PCIBus *bus; @@ -132,6 +141,7 @@ typedef struct SMMUDevice { IOMMUMemoryRegion iommu; HostIOMMUDeviceIOMMUFD *idev; SMMUViommu *viommu; + SMMUS1Hwpt *s1_hwpt; AddressSpace as; uint32_t cfg_cache_hits; uint32_t cfg_cache_misses; @@ -225,4 +235,14 @@ void smmu_iotlb_inv_iova(SMMUState *s, int asid, int vmid, dma_addr_t iova, /* Unmap the range of all the notifiers registered to any IOMMU mr */ void smmu_inv_notifiers_all(SMMUState *s); +/* IOMMUFD helpers */ +int smmu_dev_get_info(SMMUDevice *sdev, uint32_t *data_type, + uint32_t data_len, void *data); +void smmu_dev_uninstall_nested_ste(SMMUDevice *sdev, bool abort); +int smmu_dev_install_nested_ste(SMMUDevice *sdev, uint32_t data_type, + uint32_t data_len, void *data); +int smmu_hwpt_invalidate_cache(SMMUS1Hwpt *s1_hwpt, uint32_t type, uint32_t len, + uint32_t *num, void *reqs); +int smmu_viommu_invalidate_cache(IOMMUFDViommu *viommu, uint32_t type, + uint32_t len, uint32_t *num, void *reqs); #endif /* HW_ARM_SMMU_COMMON_H */ -- Gitee From 3c6c29612d5ca0ff07bcb8a45735a3877c8fadd4 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Thu, 7 Dec 2023 20:04:47 +0000 Subject: [PATCH 104/134] hw/arm/smmu-common: Return sysmem if stage-1 is bypassed When nested translation is enabled, there are 2-stage translation occuring to two different address spaces: stage-1 in the iommu as, while stage-2 in the system as. If a device attached to the vSMMU doesn't enable stage-1 translation, e.g. vSTE sets to Config=Bypass, the system as should be returned, so QEMU can set up system memory mappings onto the stage-2 page table. Signed-off-by: Nicolin Chen --- hw/arm/smmu-common.c | 18 +++++++++++++++++- include/hw/arm/smmu-common.h | 3 +++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c index a79eb34277..cc41bf3de8 100644 --- a/hw/arm/smmu-common.c +++ b/hw/arm/smmu-common.c @@ -622,6 +622,9 @@ static SMMUDevice *smmu_get_sdev(SMMUState *s, SMMUPciBus *sbus, memory_region_init_iommu(&sdev->iommu, sizeof(sdev->iommu), s->mrtypename, OBJECT(s), name, UINT64_MAX); + if (s->nested) { + address_space_init(&sdev->as_sysmem, &s->root, name); + } address_space_init(&sdev->as, MEMORY_REGION(&sdev->iommu), name); trace_smmu_add_mr(name); @@ -637,7 +640,12 @@ static AddressSpace *smmu_find_add_as(PCIBus *bus, void *opaque, int devfn) SMMUPciBus *sbus = smmu_get_sbus(s, bus); SMMUDevice *sdev = smmu_get_sdev(s, sbus, bus, devfn); - return &sdev->as; + /* Return the system as if the device uses stage-2 only */ + if (s->nested && !sdev->s1_hwpt) { + return &sdev->as_sysmem; + } else { + return &sdev->as; + } } static bool smmu_dev_attach_viommu(SMMUDevice *sdev, @@ -983,6 +991,14 @@ static void smmu_base_realize(DeviceState *dev, Error **errp) g_free, g_free); s->smmu_pcibus_by_busptr = g_hash_table_new(NULL, NULL); + if (s->nested) { + memory_region_init(&s->root, OBJECT(s), "root", UINT64_MAX); + memory_region_init_alias(&s->sysmem, OBJECT(s), + "smmu-sysmem", get_system_memory(), 0, + memory_region_size(get_system_memory())); + memory_region_add_subregion(&s->root, 0, &s->sysmem); + } + if (s->primary_bus) { pci_setup_iommu(s->primary_bus, &smmu_ops, s); } else { diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h index 66dc7206ea..37dfeed026 100644 --- a/include/hw/arm/smmu-common.h +++ b/include/hw/arm/smmu-common.h @@ -143,6 +143,7 @@ typedef struct SMMUDevice { SMMUViommu *viommu; SMMUS1Hwpt *s1_hwpt; AddressSpace as; + AddressSpace as_sysmem; uint32_t cfg_cache_hits; uint32_t cfg_cache_misses; QLIST_ENTRY(SMMUDevice) next; @@ -165,7 +166,9 @@ struct SMMUState { /* */ SysBusDevice dev; const char *mrtypename; + MemoryRegion root; MemoryRegion iomem; + MemoryRegion sysmem; /* Nested SMMU */ bool nested; -- Gitee From 9f3b8c283d4c1014ff292faddb78bbbfd7ec22d3 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 9 Apr 2024 01:49:26 +0000 Subject: [PATCH 105/134] hw/arm/smmuv3: Ignore IOMMU_NOTIFIER_MAP for nested-smmuv3 If a device's MemmoryRegion type is iommu, vfio core registers a listener, passing the IOMMU_NOTIFIER_IOTLB_EVENTS flag (bundle of IOMMU_NOTIFIER_MAP and IOMMU_NOTIFIER_UNMAP). On the other hand, nested SMMUv3 does not use a map notifier. And it would only insert an IOTLB entry for MSI doorbell page mapping, which can simply be done by the mr->translate call. Ignore the IOMMU_NOTIFIER_MAP flag and drop the error out. Signed-off-by: Nicolin Chen --- hw/arm/smmuv3.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c index 64ca4c5542..db111220c7 100644 --- a/hw/arm/smmuv3.c +++ b/hw/arm/smmuv3.c @@ -1881,12 +1881,9 @@ static int smmuv3_notify_flag_changed(IOMMUMemoryRegion *iommu, return -EINVAL; } - if (new & IOMMU_NOTIFIER_MAP) { - error_setg(errp, - "device %02x.%02x.%x requires iommu MAP notifier which is " - "not currently supported", pci_bus_num(sdev->bus), - PCI_SLOT(sdev->devfn), PCI_FUNC(sdev->devfn)); - return -EINVAL; + /* nested-smmuv3 does not need IOMMU_NOTIFIER_MAP. Ignore it. */ + if (s->nested) { + new &= ~IOMMU_NOTIFIER_MAP; } if (old == IOMMU_NOTIFIER_NONE) { -- Gitee From 03964c037862a594b4eb7d2e3754acd32c01c80b Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Thu, 22 Sep 2022 14:06:07 -0700 Subject: [PATCH 106/134] hw/arm/smmuv3: Read host SMMU device info Read the underlying SMMU device info and set corresponding IDR bits. Signed-off-by: Nicolin Chen --- hw/arm/smmuv3.c | 77 ++++++++++++++++++++++++++++++++++++ hw/arm/trace-events | 1 + include/hw/arm/smmu-common.h | 1 + 3 files changed, 79 insertions(+) diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c index db111220c7..4208325ab3 100644 --- a/hw/arm/smmuv3.c +++ b/hw/arm/smmuv3.c @@ -254,6 +254,80 @@ void smmuv3_record_event(SMMUv3State *s, SMMUEventInfo *info) info->recorded = true; } +static void smmuv3_nested_init_regs(SMMUv3State *s) +{ + SMMUState *bs = ARM_SMMU(s); + SMMUDevice *sdev; + uint32_t data_type; + uint32_t val; + int ret; + + if (!bs->nested || !bs->viommu) { + return; + } + + sdev = QLIST_FIRST(&bs->viommu->device_list); + if (!sdev) { + return; + } + + if (sdev->info.idr[0]) { + error_report("reusing the previous hw_info"); + goto out; + } + + ret = smmu_dev_get_info(sdev, &data_type, sizeof(sdev->info), &sdev->info); + if (ret) { + error_report("failed to get SMMU device info"); + return; + } + + if (data_type != IOMMU_HW_INFO_TYPE_ARM_SMMUV3) { + error_report( "Wrong data type (%d)!", data_type); + return; + } + +out: + trace_smmuv3_get_device_info(sdev->info.idr[0], sdev->info.idr[1], + sdev->info.idr[3], sdev->info.idr[5]); + + val = FIELD_EX32(sdev->info.idr[0], IDR0, BTM); + s->idr[0] = FIELD_DP32(s->idr[0], IDR0, BTM, val); + val = FIELD_EX32(sdev->info.idr[0], IDR0, ATS); + s->idr[0] = FIELD_DP32(s->idr[0], IDR0, ATS, val); + val = FIELD_EX32(sdev->info.idr[0], IDR0, ASID16); + s->idr[0] = FIELD_DP32(s->idr[0], IDR0, ASID16, val); + val = FIELD_EX32(sdev->info.idr[0], IDR0, TERM_MODEL); + s->idr[0] = FIELD_DP32(s->idr[0], IDR0, TERM_MODEL, val); + val = FIELD_EX32(sdev->info.idr[0], IDR0, STALL_MODEL); + s->idr[0] = FIELD_DP32(s->idr[0], IDR0, STALL_MODEL, val); + val = FIELD_EX32(sdev->info.idr[0], IDR0, STLEVEL); + s->idr[0] = FIELD_DP32(s->idr[0], IDR0, STLEVEL, val); + + val = FIELD_EX32(sdev->info.idr[1], IDR1, SIDSIZE); + s->idr[1] = FIELD_DP32(s->idr[1], IDR1, SIDSIZE, val); + val = FIELD_EX32(sdev->info.idr[1], IDR1, SSIDSIZE); + s->idr[1] = FIELD_DP32(s->idr[1], IDR1, SSIDSIZE, val); + + val = FIELD_EX32(sdev->info.idr[3], IDR3, HAD); + s->idr[3] = FIELD_DP32(s->idr[3], IDR3, HAD, val); + val = FIELD_EX32(sdev->info.idr[3], IDR3, RIL); + s->idr[3] = FIELD_DP32(s->idr[3], IDR3, RIL, val); + val = FIELD_EX32(sdev->info.idr[3], IDR3, BBML); + s->idr[3] = FIELD_DP32(s->idr[3], IDR3, BBML, val); + + val = FIELD_EX32(sdev->info.idr[5], IDR5, GRAN4K); + s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN4K, val); + val = FIELD_EX32(sdev->info.idr[5], IDR5, GRAN16K); + s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN16K, val); + val = FIELD_EX32(sdev->info.idr[5], IDR5, GRAN64K); + s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN64K, val); + val = FIELD_EX32(sdev->info.idr[5], IDR5, OAS); + s->idr[5] = FIELD_DP32(s->idr[5], IDR5, OAS, val); + + /* FIXME check iidr and aidr registrs too */ +} + static void smmuv3_init_regs(SMMUv3State *s) { /* Based on sys property, the stages supported in smmu will be advertised.*/ @@ -292,6 +366,9 @@ static void smmuv3_init_regs(SMMUv3State *s) s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN16K, 1); s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN64K, 1); + /* Override IDR fields with HW caps */ + smmuv3_nested_init_regs(s); + s->cmdq.base = deposit64(s->cmdq.base, 0, 5, SMMU_CMDQS); s->cmdq.prod = 0; s->cmdq.cons = 0; diff --git a/hw/arm/trace-events b/hw/arm/trace-events index 58e0636e95..1e3d86382d 100644 --- a/hw/arm/trace-events +++ b/hw/arm/trace-events @@ -55,5 +55,6 @@ smmuv3_cmdq_tlbi_s12_vmid(uint16_t vmid) "vmid=%d" smmuv3_config_cache_inv(uint32_t sid) "Config cache INV for sid=0x%x" smmuv3_notify_flag_add(const char *iommu) "ADD SMMUNotifier node for iommu mr=%s" smmuv3_notify_flag_del(const char *iommu) "DEL SMMUNotifier node for iommu mr=%s" +smmuv3_get_device_info(uint32_t idr0, uint32_t idr1, uint32_t idr3, uint32_t idr5) "idr0=0x%x idr1=0x%x idr3=0x%x idr5=0x%x" smmuv3_inv_notifiers_iova(const char *name, uint16_t asid, uint16_t vmid, uint64_t iova, uint8_t tg, uint64_t num_pages) "iommu mr=%s asid=%d vmid=%d iova=0x%"PRIx64" tg=%d num_pages=0x%"PRIx64 diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h index 37dfeed026..d120c352cf 100644 --- a/include/hw/arm/smmu-common.h +++ b/include/hw/arm/smmu-common.h @@ -146,6 +146,7 @@ typedef struct SMMUDevice { AddressSpace as_sysmem; uint32_t cfg_cache_hits; uint32_t cfg_cache_misses; + struct iommu_hw_info_arm_smmuv3 info; QLIST_ENTRY(SMMUDevice) next; } SMMUDevice; -- Gitee From fac9784bbedb50dc964feb9cf70b6f37472fcf60 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 21 Apr 2023 22:10:44 -0700 Subject: [PATCH 107/134] hw/arm/smmuv3: Check idr registers for STE_S1CDMAX and STE_S1STALLD With nested translation, the underlying HW could support those two fields. Allow them according to the updated idr registers after the hw_info ioctl. Signed-off-by: Nicolin Chen --- hw/arm/smmuv3.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c index 4208325ab3..253d297eec 100644 --- a/hw/arm/smmuv3.c +++ b/hw/arm/smmuv3.c @@ -622,13 +622,14 @@ static int decode_ste(SMMUv3State *s, SMMUTransCfg *cfg, } } - if (STE_S1CDMAX(ste) != 0) { + if (!FIELD_EX32(s->idr[1], IDR1, SSIDSIZE) && STE_S1CDMAX(ste) != 0) { qemu_log_mask(LOG_UNIMP, "SMMUv3 does not support multiple context descriptors yet\n"); goto bad_ste; } - if (STE_S1STALLD(ste)) { + /* STALL_MODEL being 0b01 means "stall is not supported" */ + if ((FIELD_EX32(s->idr[0], IDR0, STALL_MODEL) & 0x1) && STE_S1STALLD(ste)) { qemu_log_mask(LOG_UNIMP, "SMMUv3 S1 stalling fault model not allowed yet\n"); goto bad_ste; -- Gitee From 13b84313c9f7ca4823abdbad92baf091c337861e Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 21 Apr 2023 15:13:53 -0700 Subject: [PATCH 108/134] hw/arm/smmuv3: Add smmu_dev_install_nested_ste() for CFGI_STE Call smmu_dev_install_nested_ste and eventually down to IOMMU_HWPT_ALLOC ioctl for a nested HWPT allocation. Signed-off-by: Nicolin Chen --- hw/arm/smmu-common.c | 9 ++++ hw/arm/smmuv3-internal.h | 1 + hw/arm/smmuv3.c | 97 +++++++++++++++++++++++++++++++++++- hw/arm/trace-events | 1 + include/hw/arm/smmu-common.h | 14 ++++++ 5 files changed, 120 insertions(+), 2 deletions(-) diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c index cc41bf3de8..9e9af8f5c7 100644 --- a/hw/arm/smmu-common.c +++ b/hw/arm/smmu-common.c @@ -780,6 +780,7 @@ static bool smmu_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn, static void smmu_dev_unset_iommu_device(PCIBus *bus, void *opaque, int devfn) { + SMMUVdev *vdev; SMMUDevice *sdev; SMMUViommu *viommu; SMMUState *s = opaque; @@ -803,13 +804,21 @@ static void smmu_dev_unset_iommu_device(PCIBus *bus, void *opaque, int devfn) error_report("Unable to attach dev to the default HW pagetable"); } + vdev = sdev->vdev; viommu = sdev->viommu; sdev->idev = NULL; sdev->viommu = NULL; + sdev->vdev = NULL; QLIST_REMOVE(sdev, next); trace_smmu_unset_iommu_device(devfn, smmu_get_sid(sdev)); + if (vdev) { + iommufd_backend_free_id(viommu->iommufd, vdev->core->vdev_id); + g_free(vdev->core); + g_free(vdev); + } + if (QLIST_EMPTY(&viommu->device_list)) { iommufd_backend_free_id(viommu->iommufd, viommu->bypass_hwpt_id); iommufd_backend_free_id(viommu->iommufd, viommu->abort_hwpt_id); diff --git a/hw/arm/smmuv3-internal.h b/hw/arm/smmuv3-internal.h index 6076025ad6..163459d450 100644 --- a/hw/arm/smmuv3-internal.h +++ b/hw/arm/smmuv3-internal.h @@ -552,6 +552,7 @@ typedef struct CD { #define STE_S1FMT(x) extract32((x)->word[0], 4 , 2) #define STE_S1CDMAX(x) extract32((x)->word[1], 27, 5) +#define STE_S1DSS(x) extract32((x)->word[2], 0, 2) #define STE_S1STALLD(x) extract32((x)->word[2], 27, 1) #define STE_EATS(x) extract32((x)->word[2], 28, 2) #define STE_STRW(x) extract32((x)->word[2], 30, 2) diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c index 253d297eec..540831ab8e 100644 --- a/hw/arm/smmuv3.c +++ b/hw/arm/smmuv3.c @@ -563,6 +563,27 @@ bad_ste: return -EINVAL; } +static void decode_ste_config(SMMUTransCfg *cfg, uint32_t config) +{ + + if (STE_CFG_ABORT(config)) { + cfg->aborted = true; + return; + } + if (STE_CFG_BYPASS(config)) { + cfg->bypassed = true; + return; + } + + if (STE_CFG_S1_ENABLED(config)) { + cfg->stage = SMMU_STAGE_1; + } + + if (STE_CFG_S2_ENABLED(config)) { + cfg->stage |= SMMU_STAGE_2; + } +} + /* Returns < 0 in case of invalid STE, 0 otherwise */ static int decode_ste(SMMUv3State *s, SMMUTransCfg *cfg, STE *ste, SMMUEventInfo *event) @@ -579,12 +600,19 @@ static int decode_ste(SMMUv3State *s, SMMUTransCfg *cfg, config = STE_CONFIG(ste); - if (STE_CFG_ABORT(config)) { + decode_ste_config(cfg, config); + + /* S1DSS.Terminate is same as Config.abort for default stream */ + if (STE_CFG_S1_ENABLED(config) && STE_S1DSS(ste) == 0) { cfg->aborted = true; + } + + if (cfg->aborted || cfg->bypassed) { return 0; } - if (STE_CFG_BYPASS(config)) { + /* S1DSS.Bypass is same as Config.bypass for default stream */ + if (STE_CFG_S1_ENABLED(config) && STE_S1DSS(ste) == 0x1) { cfg->bypassed = true; return 0; } @@ -1231,6 +1259,68 @@ static void smmuv3_range_inval(SMMUState *s, Cmd *cmd) } } +static void smmuv3_install_nested_ste(SMMUDevice *sdev, int sid) +{ +#ifdef __linux__ + SMMUEventInfo event = {.type = SMMU_EVT_NONE, .sid = sid, + .inval_ste_allowed = true}; + struct iommu_hwpt_arm_smmuv3 nested_data = {}; + SMMUv3State *s = sdev->smmu; + SMMUState *bs = &s->smmu_state; + uint32_t config; + STE ste; + int ret; + + if (!sdev->viommu || !bs->nested) { + return; + } + + if (!sdev->vdev && sdev->idev && sdev->viommu) { + SMMUVdev *vdev = g_new0(SMMUVdev, 1); + vdev->core = iommufd_backend_alloc_vdev(sdev->idev, sdev->viommu->core, + sid); + if (!vdev->core) { + error_report("failed to allocate a vDEVICE"); + g_free(vdev); + return; + } + sdev->vdev = vdev; + } + + ret = smmu_find_ste(sdev->smmu, sid, &ste, &event); + if (ret) { + /* + * For a 2-level Stream Table, the level-2 table might not be ready + * until the device gets inserted to the stream table. Ignore this. + */ + return; + } + + config = STE_CONFIG(&ste); + if (!STE_VALID(&ste) || !STE_CFG_S1_ENABLED(config)) { + smmu_dev_uninstall_nested_ste(sdev, STE_CFG_ABORT(config)); + smmuv3_flush_config(sdev); + return; + } + + nested_data.ste[0] = (uint64_t)ste.word[0] | (uint64_t)ste.word[1] << 32; + nested_data.ste[1] = (uint64_t)ste.word[2] | (uint64_t)ste.word[3] << 32; + /* V | CONFIG | S1FMT | S1CTXPTR | S1CDMAX */ + nested_data.ste[0] &= 0xf80fffffffffffffULL; + /* S1DSS | S1CIR | S1COR | S1CSH | S1STALLD | EATS */ + nested_data.ste[1] &= 0x380000ffULL; + + ret = smmu_dev_install_nested_ste(sdev, IOMMU_HWPT_DATA_ARM_SMMUV3, + sizeof(nested_data), &nested_data); + if (ret) { + error_report("Unable to install nested STE=%16LX:%16LX, ret=%d", + nested_data.ste[1], nested_data.ste[0], ret); + } + + trace_smmuv3_install_nested_ste(sid, nested_data.ste[1], nested_data.ste[0]); +#endif +} + static gboolean smmuv3_invalidate_ste(gpointer key, gpointer value, gpointer user_data) { @@ -1241,6 +1331,8 @@ smmuv3_invalidate_ste(gpointer key, gpointer value, gpointer user_data) if (sid < sid_range->start || sid > sid_range->end) { return false; } + smmuv3_flush_config(sdev); + smmuv3_install_nested_ste(sdev, sid); trace_smmuv3_config_cache_inv(sid); return true; } @@ -1310,6 +1402,7 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) trace_smmuv3_cmdq_cfgi_ste(sid); sdev = container_of(mr, SMMUDevice, iommu); smmuv3_flush_config(sdev); + smmuv3_install_nested_ste(sdev, sid); break; } diff --git a/hw/arm/trace-events b/hw/arm/trace-events index 1e3d86382d..490da6349c 100644 --- a/hw/arm/trace-events +++ b/hw/arm/trace-events @@ -57,4 +57,5 @@ smmuv3_notify_flag_add(const char *iommu) "ADD SMMUNotifier node for iommu mr=%s smmuv3_notify_flag_del(const char *iommu) "DEL SMMUNotifier node for iommu mr=%s" smmuv3_get_device_info(uint32_t idr0, uint32_t idr1, uint32_t idr3, uint32_t idr5) "idr0=0x%x idr1=0x%x idr3=0x%x idr5=0x%x" smmuv3_inv_notifiers_iova(const char *name, uint16_t asid, uint16_t vmid, uint64_t iova, uint8_t tg, uint64_t num_pages) "iommu mr=%s asid=%d vmid=%d iova=0x%"PRIx64" tg=%d num_pages=0x%"PRIx64 +smmuv3_install_nested_ste(uint32_t sid, uint64_t ste_1, uint64_t ste_0) "sid=%d ste=%"PRIx64":%"PRIx64 diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h index d120c352cf..955ca716a5 100644 --- a/include/hw/arm/smmu-common.h +++ b/include/hw/arm/smmu-common.h @@ -51,6 +51,13 @@ typedef enum { SMMU_PTW_ERR_PERMISSION, /* Permission fault */ } SMMUPTWEventType; +/* SMMU Stage */ +typedef enum { + SMMU_STAGE_1 = 1, + SMMU_STAGE_2, + SMMU_NESTED, +} SMMUStage; + typedef struct SMMUPTWEventInfo { int stage; SMMUPTWEventType type; @@ -125,6 +132,12 @@ typedef struct SMMUViommu { QLIST_ENTRY(SMMUViommu) next; } SMMUViommu; +typedef struct SMMUVdev { + SMMUViommu *vsmmu; + IOMMUFDVdev *core; + uint32_t sid; +}SMMUVdev; + typedef struct SMMUS1Hwpt { void *smmu; IOMMUFDBackend *iommufd; @@ -141,6 +154,7 @@ typedef struct SMMUDevice { IOMMUMemoryRegion iommu; HostIOMMUDeviceIOMMUFD *idev; SMMUViommu *viommu; + SMMUVdev *vdev; SMMUS1Hwpt *s1_hwpt; AddressSpace as; AddressSpace as_sysmem; -- Gitee From 707bd8198642549595f11ef34c80094fbf7d2de1 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Mon, 29 Apr 2024 21:26:41 +0000 Subject: [PATCH 109/134] hw/arm/smmuv3: Add missing STE invalidation Multitple STEs can be invalidated in a range via SMMU_CMD_CFGI_STE_RANGE or SMMU_CMD_CFGI_ALL command. Add the missing STE invalidation in this pathway. Signed-off-by: Nicolin Chen --- hw/arm/smmu-internal.h | 1 + hw/arm/smmuv3.c | 28 +++++++++++++++++++++++++--- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/hw/arm/smmu-internal.h b/hw/arm/smmu-internal.h index 843bebb185..5a81dd1b82 100644 --- a/hw/arm/smmu-internal.h +++ b/hw/arm/smmu-internal.h @@ -142,6 +142,7 @@ typedef struct SMMUIOTLBPageInvInfo { } SMMUIOTLBPageInvInfo; typedef struct SMMUSIDRange { + SMMUState *state; uint32_t start; uint32_t end; } SMMUSIDRange; diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c index 540831ab8e..9d44bb19bc 100644 --- a/hw/arm/smmuv3.c +++ b/hw/arm/smmuv3.c @@ -1322,11 +1322,9 @@ static void smmuv3_install_nested_ste(SMMUDevice *sdev, int sid) } static gboolean -smmuv3_invalidate_ste(gpointer key, gpointer value, gpointer user_data) +_smmuv3_invalidate_ste(SMMUDevice *sdev, SMMUSIDRange *sid_range) { - SMMUDevice *sdev = (SMMUDevice *)key; uint32_t sid = smmu_get_sid(sdev); - SMMUSIDRange *sid_range = (SMMUSIDRange *)user_data; if (sid < sid_range->start || sid > sid_range->end) { return false; @@ -1337,6 +1335,28 @@ smmuv3_invalidate_ste(gpointer key, gpointer value, gpointer user_data) return true; } +static gboolean +smmuv3_invalidate_ste(gpointer key, gpointer value, gpointer user_data) +{ + return _smmuv3_invalidate_ste((SMMUDevice *)key, (SMMUSIDRange *)user_data); +} + +static void smmuv3_invalidate_nested_ste(SMMUSIDRange *sid_range) +{ + SMMUState *bs = sid_range->state; + SMMUDevice *sdev; + + if (!bs->viommu) { + return; + } + + QLIST_FOREACH(sdev, &bs->viommu->device_list, next) { + if (smmu_get_sid(sdev)) { + _smmuv3_invalidate_ste(sdev, sid_range); + } + } +} + static int smmuv3_cmdq_consume(SMMUv3State *s) { SMMUState *bs = ARM_SMMU(s); @@ -1418,12 +1438,14 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) } mask = (1ULL << (range + 1)) - 1; + sid_range.state = bs; sid_range.start = sid & ~mask; sid_range.end = sid_range.start + mask; trace_smmuv3_cmdq_cfgi_ste_range(sid_range.start, sid_range.end); g_hash_table_foreach_remove(bs->configs, smmuv3_invalidate_ste, &sid_range); + smmuv3_invalidate_nested_ste(&sid_range); break; } case SMMU_CMD_CFGI_CD: -- Gitee From d8d7f775b602a84c37b8aced11e00cb5b0521c4e Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 18 Jun 2024 17:22:18 -0700 Subject: [PATCH 110/134] hw/arm/smmu-common: Replace smmu_iommu_mr with smmu_find_sdev The caller of smmu_iommu_mr wants to get sdev for smmuv3_flush_config(). Do it directly instead of bridging with an iommu mr pointer. Signed-off-by: Nicolin Chen Message-id: 20240619002218.926674-1-nicolinc@nvidia.com Reviewed-by: Peter Maydell Signed-off-by: Peter Maydell --- hw/arm/smmu-common.c | 8 ++------ hw/arm/smmuv3.c | 12 ++++-------- include/hw/arm/smmu-common.h | 4 ++-- 3 files changed, 8 insertions(+), 16 deletions(-) diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c index 9e9af8f5c7..d0bc620606 100644 --- a/hw/arm/smmu-common.c +++ b/hw/arm/smmu-common.c @@ -837,20 +837,16 @@ static const PCIIOMMUOps smmu_ops = { .unset_iommu_device = smmu_dev_unset_iommu_device, }; -IOMMUMemoryRegion *smmu_iommu_mr(SMMUState *s, uint32_t sid) +SMMUDevice *smmu_find_sdev(SMMUState *s, uint32_t sid) { uint8_t bus_n, devfn; SMMUPciBus *smmu_bus; - SMMUDevice *smmu; bus_n = PCI_BUS_NUM(sid); smmu_bus = smmu_find_smmu_pcibus(s, bus_n); if (smmu_bus) { devfn = SMMU_PCI_DEVFN(sid); - smmu = smmu_bus->pbdev[devfn]; - if (smmu) { - return &smmu->iommu; - } + return smmu_bus->pbdev[devfn]; } return NULL; } diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c index 9d44bb19bc..b2ffe2d40b 100644 --- a/hw/arm/smmuv3.c +++ b/hw/arm/smmuv3.c @@ -1407,20 +1407,18 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) case SMMU_CMD_CFGI_STE: { uint32_t sid = CMD_SID(&cmd); - IOMMUMemoryRegion *mr = smmu_iommu_mr(bs, sid); - SMMUDevice *sdev; + SMMUDevice *sdev = smmu_find_sdev(bs, sid); if (CMD_SSEC(&cmd)) { cmd_error = SMMU_CERROR_ILL; break; } - if (!mr) { + if (!sdev) { break; } trace_smmuv3_cmdq_cfgi_ste(sid); - sdev = container_of(mr, SMMUDevice, iommu); smmuv3_flush_config(sdev); smmuv3_install_nested_ste(sdev, sid); @@ -1452,20 +1450,18 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) case SMMU_CMD_CFGI_CD_ALL: { uint32_t sid = CMD_SID(&cmd); - IOMMUMemoryRegion *mr = smmu_iommu_mr(bs, sid); - SMMUDevice *sdev; + SMMUDevice *sdev = smmu_find_sdev(bs, sid); if (CMD_SSEC(&cmd)) { cmd_error = SMMU_CERROR_ILL; break; } - if (!mr) { + if (!sdev) { break; } trace_smmuv3_cmdq_cfgi_cd(sid); - sdev = container_of(mr, SMMUDevice, iommu); smmuv3_flush_config(sdev); break; } diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h index 955ca716a5..e30539a8d4 100644 --- a/include/hw/arm/smmu-common.h +++ b/include/hw/arm/smmu-common.h @@ -234,8 +234,8 @@ int smmu_ptw(SMMUTransCfg *cfg, dma_addr_t iova, IOMMUAccessFlags perm, */ SMMUTransTableInfo *select_tt(SMMUTransCfg *cfg, dma_addr_t iova); -/* Return the iommu mr associated to @sid, or NULL if none */ -IOMMUMemoryRegion *smmu_iommu_mr(SMMUState *s, uint32_t sid); +/* Return the SMMUDevice associated to @sid, or NULL if none */ +SMMUDevice *smmu_find_sdev(SMMUState *s, uint32_t sid); #define SMMU_IOTLB_MAX_SIZE 256 -- Gitee From b331acc42fa54ca93496c32d92cdf5397927bff1 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 21 Apr 2023 15:18:56 -0700 Subject: [PATCH 111/134] hw/arm/smmuv3: Forward cache invalidate commands via iommufd Inroduce an SMMUCommandBatch and some helpers to batch the commands. Rewind the q->cons accordingly when it fails to execute a batch/command. Currently separate TLBI commands and device cache commands to avoid some errata on certain version of SMMUs. Later it should check IIDR register to detect if underlying SMMU hw has such an erratum. Signed-off-by: Nicolin Chen --- hw/arm/smmuv3-internal.h | 13 +++++ hw/arm/smmuv3.c | 113 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 125 insertions(+), 1 deletion(-) diff --git a/hw/arm/smmuv3-internal.h b/hw/arm/smmuv3-internal.h index 163459d450..a411fd4048 100644 --- a/hw/arm/smmuv3-internal.h +++ b/hw/arm/smmuv3-internal.h @@ -226,6 +226,19 @@ static inline bool smmuv3_gerror_irq_enabled(SMMUv3State *s) #define Q_CONS_WRAP(q) (((q)->cons & WRAP_MASK(q)) >> (q)->log2size) #define Q_PROD_WRAP(q) (((q)->prod & WRAP_MASK(q)) >> (q)->log2size) +#define Q_IDX(llq, p) ((p) & ((1 << (llq)->max_n_shift) - 1)) + +static inline int smmuv3_q_ncmds(SMMUQueue *q) +{ + uint32_t prod = Q_PROD(q); + uint32_t cons = Q_CONS(q); + + if (Q_PROD_WRAP(q) == Q_CONS_WRAP(q)) + return prod - cons; + else + return WRAP_MASK(q) - cons + prod; +} + static inline bool smmuv3_q_full(SMMUQueue *q) { return ((q->cons ^ q->prod) & WRAP_INDEX_MASK(q)) == WRAP_MASK(q); diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c index b2ffe2d40b..b860c8385f 100644 --- a/hw/arm/smmuv3.c +++ b/hw/arm/smmuv3.c @@ -1357,16 +1357,85 @@ static void smmuv3_invalidate_nested_ste(SMMUSIDRange *sid_range) } } +/** + * SMMUCommandBatch - batch of commands to issue for nested SMMU invalidation + * @cmds: Pointer to list of commands + * @cons: Pointer to list of CONS corresponding to the commands + * @ncmds: Total ncmds in the batch + * @dev_cache: Issue to a device cache + */ +typedef struct SMMUCommandBatch { + Cmd *cmds; + uint32_t *cons; + uint32_t ncmds; + bool dev_cache; +} SMMUCommandBatch; + +/* Update batch->ncmds to the number of execute cmds */ +static int smmuv3_issue_cmd_batch(SMMUState *bs, SMMUCommandBatch *batch) +{ + uint32_t total = batch->ncmds; + int ret; + + ret = smmu_viommu_invalidate_cache(bs->viommu->core, + IOMMU_VIOMMU_INVALIDATE_DATA_ARM_SMMUV3, + sizeof(Cmd), &batch->ncmds, batch->cmds); + if (total != batch->ncmds) { + error_report("%s failed: ret=%d, total=%d, done=%d", + __func__, ret, total, batch->ncmds); + return ret; + } + + batch->ncmds = 0; + batch->dev_cache = false; + return ret; +} + +static int smmuv3_batch_cmds(SMMUState *bs, SMMUCommandBatch *batch, + Cmd *cmd, uint32_t *cons, bool dev_cache) +{ + int ret; + + if (!bs->nested || !bs->viommu) { + return 0; + } + + /* + * Currently separate dev_cache and hwpt for safety, which might not be + * necessary if underlying HW SMMU does not have the errata. + * + * TODO check IIDR register values read from hw_info. + */ + if (batch->ncmds && (dev_cache != batch->dev_cache)) { + ret = smmuv3_issue_cmd_batch(bs, batch); + if (ret) { + *cons = batch->cons[batch->ncmds]; + return ret; + } + } + batch->dev_cache = dev_cache; + batch->cmds[batch->ncmds] = *cmd; + batch->cons[batch->ncmds++] = *cons; + return 0; +} + static int smmuv3_cmdq_consume(SMMUv3State *s) { SMMUState *bs = ARM_SMMU(s); SMMUCmdError cmd_error = SMMU_CERROR_NONE; SMMUQueue *q = &s->cmdq; SMMUCommandType type = 0; + SMMUCommandBatch batch = {}; + uint32_t ncmds = 0; if (!smmuv3_cmdq_enabled(s)) { return 0; } + + ncmds = smmuv3_q_ncmds(q); + batch.cmds = g_new0(Cmd, ncmds); + batch.cons = g_new0(uint32_t, ncmds); + /* * some commands depend on register values, typically CR0. In case those * register values change while handling the command, spec says it @@ -1463,6 +1532,13 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) trace_smmuv3_cmdq_cfgi_cd(sid); smmuv3_flush_config(sdev); + + if (sdev->s1_hwpt) { + if (smmuv3_batch_cmds(sdev->smmu, &batch, &cmd, &q->cons, true)) { + cmd_error = SMMU_CERROR_ILL; + break; + } + } break; } case SMMU_CMD_TLBI_NH_ASID: @@ -1477,6 +1553,10 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) trace_smmuv3_cmdq_tlbi_nh_asid(asid); smmu_inv_notifiers_all(&s->smmu_state); smmu_iotlb_inv_asid(bs, asid); + if (smmuv3_batch_cmds(bs, &batch, &cmd, &q->cons, false)) { + cmd_error = SMMU_CERROR_ILL; + break; + } break; } case SMMU_CMD_TLBI_NH_ALL: @@ -1489,6 +1569,11 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) trace_smmuv3_cmdq_tlbi_nh(); smmu_inv_notifiers_all(&s->smmu_state); smmu_iotlb_inv_all(bs); + + if (smmuv3_batch_cmds(bs, &batch, &cmd, &q->cons, false)) { + cmd_error = SMMU_CERROR_ILL; + break; + } break; case SMMU_CMD_TLBI_NH_VAA: case SMMU_CMD_TLBI_NH_VA: @@ -1497,7 +1582,24 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) break; } smmuv3_range_inval(bs, &cmd); + + if (smmuv3_batch_cmds(bs, &batch, &cmd, &q->cons, false)) { + cmd_error = SMMU_CERROR_ILL; + break; + } break; + case SMMU_CMD_ATC_INV: + { + SMMUDevice *sdev = smmu_find_sdev(bs, CMD_SID(&cmd)); + + if (sdev->s1_hwpt) { + if (smmuv3_batch_cmds(sdev->smmu, &batch, &cmd, &q->cons, true)) { + cmd_error = SMMU_CERROR_ILL; + break; + } + } + break; + } case SMMU_CMD_TLBI_S12_VMALL: { uint16_t vmid = CMD_VMID(&cmd); @@ -1529,7 +1631,6 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) case SMMU_CMD_TLBI_EL2_ASID: case SMMU_CMD_TLBI_EL2_VA: case SMMU_CMD_TLBI_EL2_VAA: - case SMMU_CMD_ATC_INV: case SMMU_CMD_PRI_RESP: case SMMU_CMD_RESUME: case SMMU_CMD_STALL_TERM: @@ -1554,12 +1655,22 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) */ queue_cons_incr(q); } + qemu_mutex_lock(&s->mutex); + if (!cmd_error && batch.ncmds && bs->viommu) { + if (smmuv3_issue_cmd_batch(bs, &batch)) { + q->cons = batch.cons[batch.ncmds]; + cmd_error = SMMU_CERROR_ILL; + } + } + qemu_mutex_unlock(&s->mutex); if (cmd_error) { trace_smmuv3_cmdq_consume_error(smmu_cmd_string(type), cmd_error); smmu_write_cmdq_err(s, cmd_error); smmuv3_trigger_irq(s, SMMU_IRQ_GERROR, R_GERROR_CMDQ_ERR_MASK); } + g_free(batch.cmds); + g_free(batch.cons); trace_smmuv3_cmdq_consume_out(Q_PROD(q), Q_CONS(q), Q_PROD_WRAP(q), Q_CONS_WRAP(q)); -- Gitee From ea23e4215b332446d4964769d004f7a11caba00b Mon Sep 17 00:00:00 2001 From: caijian Date: Mon, 31 Mar 2025 15:02:37 +0800 Subject: [PATCH 112/134] tests/qtest: Allow DSDT acpi tables to change List all DSDT files and allow them to change. Signed-of-by: caijian --- tests/qtest/bios-tables-test-allowed-diff.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/qtest/bios-tables-test-allowed-diff.h b/tests/qtest/bios-tables-test-allowed-diff.h index dfb8523c8b..e4a94bb8bd 100644 --- a/tests/qtest/bios-tables-test-allowed-diff.h +++ b/tests/qtest/bios-tables-test-allowed-diff.h @@ -1 +1,7 @@ /* List of comma-separated changed AML files to ignore */ +"tests/data/acpi/microvm/DSDT.pcie", +"tests/data/acpi/virt/DSDT", +"tests/data/acpi/virt/DSDT.acpihmatvirt", +"tests/data/acpi/virt/DSDT.memhp", +"tests/data/acpi/virt/DSDT.pxb", +"tests/data/acpi/virt/DSDT.topology", -- Gitee From 237fdc8ddb0598234aace9c88ac4c8387119a12a Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Thu, 7 Jul 2022 11:55:25 -0400 Subject: [PATCH 113/134] acpi/gpex: Fix PCI Express Slot Information function 0 returned value At the moment we do not support other function than function 0. So according to ACPI spec "_DSM (Device Specific Method)" description, bit 0 should rather be 0, meaning no other function is supported than function 0. Signed-off-by: Eric Auger --- hw/pci-host/gpex-acpi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/pci-host/gpex-acpi.c b/hw/pci-host/gpex-acpi.c index 1092dc3b70..ac5d229757 100644 --- a/hw/pci-host/gpex-acpi.c +++ b/hw/pci-host/gpex-acpi.c @@ -113,7 +113,7 @@ static void acpi_dsdt_add_pci_osc(Aml *dev) UUID = aml_touuid("E5C937D0-3553-4D7A-9117-EA4D19C3434D"); ifctx = aml_if(aml_equal(aml_arg(0), UUID)); ifctx1 = aml_if(aml_equal(aml_arg(2), aml_int(0))); - uint8_t byte_list[1] = {1}; + uint8_t byte_list[1] = {0}; buf = aml_buffer(1, byte_list); aml_append(ifctx1, aml_return(buf)); aml_append(ifctx, ifctx1); -- Gitee From 4a065d0fbbe159dfbc073e4480434d6889b7c5a4 Mon Sep 17 00:00:00 2001 From: caijian Date: Mon, 31 Mar 2025 15:03:02 +0800 Subject: [PATCH 114/134] tests/data/acpi: Update DSDT acpi tables - * Disassembly of tests/data/acpi/virt/DSDT, Fri Mar 28 16:43:04 2025 + * Disassembly of /tmp/aml-1KF432, Fri Mar 28 16:43:04 2025 * * Original Table Header: * Signature "DSDT" * Length 0x000016B6 (5814) * Revision 0x02 - * Checksum 0x46 + * Checksum 0x47 * OEM ID "BOCHS " * OEM Table ID "BXPC " * OEM Revision 0x00000001 (1) * Compiler ID "BXPC" * Compiler Version 0x00000001 (1) */ DefinitionBlock ("", "DSDT", 2, "BOCHS ", "BXPC ", 0x00000001) @@ -2090,33 +2090,33 @@ } Else { CDW1 |= 0x04 Return (Arg3) } } Method (_DSM, 4, NotSerialized) // _DSM: Device-Specific Method { If ((Arg0 == ToUUID ("e5c937d0-3553-4d7a-9117-ea4d19c3434d") /* Device Labeling Interface */)) { If ((Arg2 == Zero)) { Return (Buffer (One) { - 0x01 // . + 0x00 // . }) } } Return (Buffer (One) { 0x00 }) } Signed-off-by: caijian --- tests/data/acpi/microvm/DSDT.pcie | Bin 3023 -> 3023 bytes tests/data/acpi/virt/DSDT | Bin 5814 -> 5814 bytes tests/data/acpi/virt/DSDT.acpihmatvirt | Bin 7323 -> 7323 bytes tests/data/acpi/virt/DSDT.memhp | Bin 7175 -> 7175 bytes tests/data/acpi/virt/DSDT.pxb | Bin 8297 -> 8297 bytes tests/data/acpi/virt/DSDT.topology | Bin 9335 -> 9335 bytes tests/qtest/bios-tables-test-allowed-diff.h | 6 ------ 7 files changed, 6 deletions(-) diff --git a/tests/data/acpi/microvm/DSDT.pcie b/tests/data/acpi/microvm/DSDT.pcie index 765f14ef3d1e54d3cadccbf0a880f8adb73b3f1f..af4c3eb38866d5a32928a73b01ea49a946073aa6 100644 GIT binary patch delta 25 gcmX>veqNl*CDveqNl*CD@!X7zle4%}0cSu5P5=M^ diff --git a/tests/data/acpi/virt/DSDT b/tests/data/acpi/virt/DSDT index d49ead54fa2d8fcd6c0f25ba74e748d90fec3551..404bc5ac2188d885e28b6c80d499193865cf1c9c 100644 GIT binary patch delta 25 hcmdm{yG@tNCD!0RUsV2lD^` delta 25 hcmdm{yG@tNCD!0RUsP2lD^` diff --git a/tests/data/acpi/virt/DSDT.acpihmatvirt b/tests/data/acpi/virt/DSDT.acpihmatvirt index c753f34bb050d146f4dc3195ec850ea26b3141ba..5f9c0b2d3cdc55949c32d564c92309aa54529d8d 100644 GIT binary patch delta 25 hcmbPjIop!UCD^Da?eHUL|C2DbnJ diff --git a/tests/data/acpi/virt/DSDT.pxb b/tests/data/acpi/virt/DSDT.pxb index 1e91767c3045bb8569fd7d5dfa991348ed625944..ccb43ab242521cdfc80f6d6b170d2e0818186632 100644 GIT binary patch delta 32 ncmaFq@X~?HCD Date: Tue, 5 Oct 2021 10:53:12 +0200 Subject: [PATCH 115/134] hw/pci-host/gpex: [needs kernel fix] Allow to generate preserve boot config DSM #5 Add a 'preserve_config' field in struct GPEXConfig and if set, generate the DSM #5 for preserving PCI boot configurations. The DSM presence is needed to expose RMRs. At the moment the DSM generation is not yet enabled. Signed-off-by: Eric Auger --- hw/pci-host/gpex-acpi.c | 35 +++++++++++++++++++++++++++++++---- include/hw/pci-host/gpex.h | 1 + 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/hw/pci-host/gpex-acpi.c b/hw/pci-host/gpex-acpi.c index ac5d229757..ce424fc9da 100644 --- a/hw/pci-host/gpex-acpi.c +++ b/hw/pci-host/gpex-acpi.c @@ -49,9 +49,10 @@ static void acpi_dsdt_add_pci_route_table(Aml *dev, uint32_t irq) } } -static void acpi_dsdt_add_pci_osc(Aml *dev) +static void acpi_dsdt_add_pci_osc(Aml *dev, bool preserve_config) { Aml *method, *UUID, *ifctx, *ifctx1, *elsectx, *buf; + uint8_t byte_list[1] = {0}; /* Declare an _OSC (OS Control Handoff) method */ aml_append(dev, aml_name_decl("SUPP", aml_int(0))); @@ -113,10 +114,24 @@ static void acpi_dsdt_add_pci_osc(Aml *dev) UUID = aml_touuid("E5C937D0-3553-4D7A-9117-EA4D19C3434D"); ifctx = aml_if(aml_equal(aml_arg(0), UUID)); ifctx1 = aml_if(aml_equal(aml_arg(2), aml_int(0))); - uint8_t byte_list[1] = {0}; + if (preserve_config) { + /* support for functions other than function 0 and function 5 */ + byte_list[0] = 0x21; + } buf = aml_buffer(1, byte_list); aml_append(ifctx1, aml_return(buf)); aml_append(ifctx, ifctx1); + + if (preserve_config) { + Aml *ifctx2 = aml_if(aml_equal(aml_arg(2), aml_int(5))); + /* + * 0 - The operating system must not ignore the PCI configuration that + * firmware has done at boot time. + */ + aml_append(ifctx2, aml_return(aml_int(0))); + aml_append(ifctx, ifctx2); + } + aml_append(method, ifctx); byte_list[0] = 0; @@ -174,6 +189,12 @@ void acpi_dsdt_add_gpex(Aml *scope, struct GPEXConfig *cfg) aml_append(dev, aml_name_decl("_PXM", aml_int(numa_node))); } + if (cfg->preserve_config) { + method = aml_method("_DSM", 5, AML_SERIALIZED); + aml_append(method, aml_return(aml_int(0))); + aml_append(dev, method); + } + acpi_dsdt_add_pci_route_table(dev, cfg->irq); /* @@ -188,7 +209,7 @@ void acpi_dsdt_add_gpex(Aml *scope, struct GPEXConfig *cfg) if (is_cxl) { build_cxl_osc_method(dev); } else { - acpi_dsdt_add_pci_osc(dev); + acpi_dsdt_add_pci_osc(dev, cfg->preserve_config); } aml_append(scope, dev); @@ -205,6 +226,12 @@ void acpi_dsdt_add_gpex(Aml *scope, struct GPEXConfig *cfg) aml_append(dev, aml_name_decl("_STR", aml_unicode("PCIe 0 Device"))); aml_append(dev, aml_name_decl("_CCA", aml_int(1))); + if (cfg->preserve_config) { + method = aml_method("_DSM", 5, AML_SERIALIZED); + aml_append(method, aml_return(aml_int(0))); + aml_append(dev, method); + } + acpi_dsdt_add_pci_route_table(dev, cfg->irq); method = aml_method("_CBA", 0, AML_NOTSERIALIZED); @@ -263,7 +290,7 @@ void acpi_dsdt_add_gpex(Aml *scope, struct GPEXConfig *cfg) } aml_append(dev, aml_name_decl("_CRS", rbuf)); - acpi_dsdt_add_pci_osc(dev); + acpi_dsdt_add_pci_osc(dev, cfg->preserve_config); Aml *dev_res0 = aml_device("%s", "RES0"); aml_append(dev_res0, aml_name_decl("_HID", aml_string("PNP0C02"))); diff --git a/include/hw/pci-host/gpex.h b/include/hw/pci-host/gpex.h index b0240bd768..65475f7f9d 100644 --- a/include/hw/pci-host/gpex.h +++ b/include/hw/pci-host/gpex.h @@ -64,6 +64,7 @@ struct GPEXConfig { MemMapEntry pio; int irq; PCIBus *bus; + bool preserve_config; }; int gpex_set_irq_num(GPEXHost *s, int index, int gsi); -- Gitee From a6c7b16107b506f85e6643604c923291e41f70d1 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 19 Jun 2024 04:42:33 +0000 Subject: [PATCH 116/134] hw/arm/virt: Add an SMMU_IO_LEN macro A following patch will add a new MMIO region for nested SMMU instances. This macro will be repeatedly used to set offsets and MMIO sizes in both virt and virt-acpi-build. Signed-off-by: Nicolin Chen Signed-off-by: Shameer Kolothum --- hw/arm/virt.c | 2 +- include/hw/arm/virt.h | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 8823f2ed1c..08c40c314b 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -155,7 +155,7 @@ static const MemMapEntry base_memmap[] = { [VIRT_FW_CFG] = { 0x09020000, 0x00000018 }, [VIRT_GPIO] = { 0x09030000, 0x00001000 }, [VIRT_SECURE_UART] = { 0x09040000, 0x00001000 }, - [VIRT_SMMU] = { 0x09050000, 0x00020000 }, + [VIRT_SMMU] = { 0x09050000, SMMU_IO_LEN }, [VIRT_PCDIMM_ACPI] = { 0x09070000, MEMORY_HOTPLUG_IO_LEN }, [VIRT_ACPI_GED] = { 0x09080000, ACPI_GED_EVT_SEL_LEN }, [VIRT_NVDIMM_ACPI] = { 0x09090000, NVDIMM_ACPI_IO_LEN}, diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h index 345b2d5594..e6a449becd 100644 --- a/include/hw/arm/virt.h +++ b/include/hw/arm/virt.h @@ -106,6 +106,9 @@ typedef enum { ARM_L3_CACHE } ArmCacheType; +/* MMIO region size for SMMUv3 */ +#define SMMU_IO_LEN 0x20000 + enum { VIRT_FLASH, VIRT_MEM, -- Gitee From 9895192512af4b52aff88432618a474e69b44bdd Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Wed, 6 Nov 2024 14:47:27 +0000 Subject: [PATCH 117/134] hw/arm/smmuv3: Add initial support for SMMUv3 Nested device Based on SMMUv3 as a parent device, add a user-creatable smmuv3-nested device. Subsequent patches will add support to specify a PCI bus for this device. Currently only supported for "virt", so hook up the sybus mem & irq for that as well. No FDT support is added for now. Signed-off-by: Shameer Kolothum --- hw/arm/smmuv3.c | 34 ++++++++++++++++++++++++++++++++++ hw/arm/virt.c | 31 +++++++++++++++++++++++++++++-- hw/core/sysbus-fdt.c | 1 + include/hw/arm/smmuv3.h | 15 +++++++++++++++ include/hw/arm/virt.h | 6 ++++++ 5 files changed, 85 insertions(+), 2 deletions(-) diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c index b860c8385f..3010471cdc 100644 --- a/hw/arm/smmuv3.c +++ b/hw/arm/smmuv3.c @@ -2069,6 +2069,19 @@ static void smmu_realize(DeviceState *d, Error **errp) smmu_init_irq(s, dev); } +static void smmu_nested_realize(DeviceState *d, Error **errp) +{ + SMMUv3NestedState *s_nested = ARM_SMMUV3_NESTED(d); + SMMUv3NestedClass *c = ARM_SMMUV3_NESTED_GET_CLASS(s_nested); + Error *local_err = NULL; + + c->parent_realize(d, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } +} + static const VMStateDescription vmstate_smmuv3_queue = { .name = "smmuv3_queue", .version_id = 1, @@ -2167,6 +2180,18 @@ static void smmuv3_class_init(ObjectClass *klass, void *data) device_class_set_props(dc, smmuv3_properties); } +static void smmuv3_nested_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + SMMUv3NestedClass *c = ARM_SMMUV3_NESTED_CLASS(klass); + + dc->vmsd = &vmstate_smmuv3; + device_class_set_parent_realize(dc, smmu_nested_realize, + &c->parent_realize); + dc->user_creatable = true; + dc->hotpluggable = false; +} + static int smmuv3_notify_flag_changed(IOMMUMemoryRegion *iommu, IOMMUNotifierFlag old, IOMMUNotifierFlag new, @@ -2205,6 +2230,14 @@ static void smmuv3_iommu_memory_region_class_init(ObjectClass *klass, imrc->notify_flag_changed = smmuv3_notify_flag_changed; } +static const TypeInfo smmuv3_nested_type_info = { + .name = TYPE_ARM_SMMUV3_NESTED, + .parent = TYPE_ARM_SMMUV3, + .instance_size = sizeof(SMMUv3NestedState), + .class_size = sizeof(SMMUv3NestedClass), + .class_init = smmuv3_nested_class_init, +}; + static const TypeInfo smmuv3_type_info = { .name = TYPE_ARM_SMMUV3, .parent = TYPE_ARM_SMMU, @@ -2223,6 +2256,7 @@ static const TypeInfo smmuv3_iommu_memory_region_info = { static void smmuv3_register_types(void) { type_register(&smmuv3_type_info); + type_register(&smmuv3_nested_type_info); type_register(&smmuv3_iommu_memory_region_info); } diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 08c40c314b..a55f297af2 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -166,6 +166,7 @@ static const MemMapEntry base_memmap[] = { /* In the virtCCA scenario, this space is used for MSI interrupt mapping */ [VIRT_CVM_MSI] = { 0x0a001000, 0x00fff000 }, [VIRT_CPUFREQ] = { 0x0b000000, 0x00010000 }, + [VIRT_SMMU_NESTED] = { 0x0b010000, 0x00ff0000}, /* ...repeating for a total of NUM_VIRTIO_TRANSPORTS, each of that size */ [VIRT_PLATFORM_BUS] = { 0x0c000000, 0x02000000 }, [VIRT_SECURE_MEM] = { 0x0e000000, 0x01000000 }, @@ -211,6 +212,7 @@ static const int a15irqmap[] = { [VIRT_GIC_V2M] = 48, /* ...to 48 + NUM_GICV2M_SPIS - 1 */ [VIRT_SMMU] = 74, /* ...to 74 + NUM_SMMU_IRQS - 1 */ [VIRT_PLATFORM_BUS] = 112, /* ...to 112 + PLATFORM_BUS_NUM_IRQS -1 */ + [VIRT_SMMU_NESTED] = 200, }; static const char *valid_cpus[] = { @@ -3613,10 +3615,34 @@ static void virt_machine_device_plug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { VirtMachineState *vms = VIRT_MACHINE(hotplug_dev); + MachineClass *mc = MACHINE_GET_CLASS(vms); - if (vms->platform_bus_dev) { - MachineClass *mc = MACHINE_GET_CLASS(vms); + /* For smmuv3-nested devices we need to set the mem & irq */ + if (device_is_dynamic_sysbus(mc, dev) && + object_dynamic_cast(OBJECT(dev), TYPE_ARM_SMMUV3_NESTED)) { + hwaddr base = vms->memmap[VIRT_SMMU_NESTED].base; + int irq = vms->irqmap[VIRT_SMMU_NESTED]; + + if (vms->smmu_nested_count >= MAX_SMMU_NESTED) { + error_setg(errp, "smmuv3-nested max count reached!"); + return; + } + + base += (vms->smmu_nested_count * SMMU_IO_LEN); + irq += (vms->smmu_nested_count * NUM_SMMU_IRQS); + sysbus_mmio_map(SYS_BUS_DEVICE(dev), 0, base); + for (int i = 0; i < 4; i++) { + sysbus_connect_irq(SYS_BUS_DEVICE(dev), i, + qdev_get_gpio_in(vms->gic, irq + i)); + } + if (vms->iommu != VIRT_IOMMU_SMMUV3_NESTED) { + vms->iommu = VIRT_IOMMU_SMMUV3_NESTED; + } + vms->smmu_nested_count++; + } + + if (vms->platform_bus_dev) { if (device_is_dynamic_sysbus(mc, dev)) { platform_bus_link_device(PLATFORM_BUS_DEVICE(vms->platform_bus_dev), SYS_BUS_DEVICE(dev)); @@ -3789,6 +3815,7 @@ static void virt_machine_class_init(ObjectClass *oc, void *data) machine_class_allow_dynamic_sysbus_dev(mc, TYPE_VFIO_AMD_XGBE); machine_class_allow_dynamic_sysbus_dev(mc, TYPE_RAMFB_DEVICE); machine_class_allow_dynamic_sysbus_dev(mc, TYPE_VFIO_PLATFORM); + machine_class_allow_dynamic_sysbus_dev(mc, TYPE_ARM_SMMUV3_NESTED); #ifdef CONFIG_TPM machine_class_allow_dynamic_sysbus_dev(mc, TYPE_TPM_TIS_SYSBUS); #endif diff --git a/hw/core/sysbus-fdt.c b/hw/core/sysbus-fdt.c index eebcd28f9a..0f0d0b3e58 100644 --- a/hw/core/sysbus-fdt.c +++ b/hw/core/sysbus-fdt.c @@ -489,6 +489,7 @@ static const BindingEntry bindings[] = { #ifdef CONFIG_LINUX TYPE_BINDING(TYPE_VFIO_CALXEDA_XGMAC, add_calxeda_midway_xgmac_fdt_node), TYPE_BINDING(TYPE_VFIO_AMD_XGBE, add_amd_xgbe_fdt_node), + TYPE_BINDING("arm-smmuv3-nested", no_fdt_node), VFIO_PLATFORM_BINDING("amd,xgbe-seattle-v1a", add_amd_xgbe_fdt_node), #endif #ifdef CONFIG_TPM diff --git a/include/hw/arm/smmuv3.h b/include/hw/arm/smmuv3.h index d183a62766..87e628be7a 100644 --- a/include/hw/arm/smmuv3.h +++ b/include/hw/arm/smmuv3.h @@ -84,6 +84,21 @@ struct SMMUv3Class { #define TYPE_ARM_SMMUV3 "arm-smmuv3" OBJECT_DECLARE_TYPE(SMMUv3State, SMMUv3Class, ARM_SMMUV3) +#define TYPE_ARM_SMMUV3_NESTED "arm-smmuv3-nested" +OBJECT_DECLARE_TYPE(SMMUv3NestedState, SMMUv3NestedClass, ARM_SMMUV3_NESTED) + +struct SMMUv3NestedState { + SMMUv3State smmuv3_state; +}; + +struct SMMUv3NestedClass { + /*< private >*/ + SMMUv3Class smmuv3_class; + /*< public >*/ + + DeviceRealize parent_realize; +}; + #define STAGE1_SUPPORTED(s) FIELD_EX32(s->idr[0], IDR0, S1P) #define STAGE2_SUPPORTED(s) FIELD_EX32(s->idr[0], IDR0, S2P) diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h index e6a449becd..cd41e28202 100644 --- a/include/hw/arm/virt.h +++ b/include/hw/arm/virt.h @@ -109,6 +109,9 @@ typedef enum { /* MMIO region size for SMMUv3 */ #define SMMU_IO_LEN 0x20000 +/* Max supported nested SMMUv3 */ +#define MAX_SMMU_NESTED 64 + enum { VIRT_FLASH, VIRT_MEM, @@ -121,6 +124,7 @@ enum { VIRT_GIC_ITS, VIRT_GIC_REDIST, VIRT_SMMU, + VIRT_SMMU_NESTED, VIRT_UART, VIRT_CPUFREQ, VIRT_MMIO, @@ -155,6 +159,7 @@ enum { typedef enum VirtIOMMUType { VIRT_IOMMU_NONE, VIRT_IOMMU_SMMUV3, + VIRT_IOMMU_SMMUV3_NESTED, VIRT_IOMMU_VIRTIO, } VirtIOMMUType; @@ -222,6 +227,7 @@ struct VirtMachineState { bool mte; bool dtb_randomness; bool pmu; + int smmu_nested_count; OnOffAuto acpi; VirtGICType gic_version; VirtIOMMUType iommu; -- Gitee From afca50145f52601d912a805b65bd4530e9278388 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Wed, 6 Nov 2024 15:53:45 +0000 Subject: [PATCH 118/134] hw/arm/smmuv3: Associate a pci bus with a SMMUv3 Nested device Subsequent patches will add IORT modifications to get this working. Signed-off-by: Shameer Kolothum --- hw/arm/smmuv3.c | 27 +++++++++++++++++++++++++++ include/hw/arm/smmuv3.h | 2 ++ 2 files changed, 29 insertions(+) diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c index 3010471cdc..66e4e1b57d 100644 --- a/hw/arm/smmuv3.c +++ b/hw/arm/smmuv3.c @@ -24,6 +24,7 @@ #include "hw/qdev-properties.h" #include "hw/qdev-core.h" #include "hw/pci/pci.h" +#include "hw/pci/pci_bridge.h" #include "cpu.h" #include "trace.h" #include "qemu/log.h" @@ -2069,12 +2070,32 @@ static void smmu_realize(DeviceState *d, Error **errp) smmu_init_irq(s, dev); } +static int smmuv3_nested_pci_host_bridge(Object *obj, void *opaque) +{ + DeviceState *d = opaque; + SMMUv3NestedState *s_nested = ARM_SMMUV3_NESTED(d); + + if (object_dynamic_cast(obj, TYPE_PCI_HOST_BRIDGE)) { + PCIBus *bus = PCI_HOST_BRIDGE(obj)->bus; + if (s_nested->pci_bus && !strcmp(bus->qbus.name, s_nested->pci_bus)) { + object_property_set_link(OBJECT(d), "primary-bus", OBJECT(bus), + &error_abort); + } + } + return 0; +} + static void smmu_nested_realize(DeviceState *d, Error **errp) { SMMUv3NestedState *s_nested = ARM_SMMUV3_NESTED(d); SMMUv3NestedClass *c = ARM_SMMUV3_NESTED_GET_CLASS(s_nested); + SysBusDevice *dev = SYS_BUS_DEVICE(d); Error *local_err = NULL; + object_child_foreach_recursive(object_get_root(), + smmuv3_nested_pci_host_bridge, d); + object_property_set_bool(OBJECT(dev), "nested", true, &error_abort); + c->parent_realize(d, &local_err); if (local_err) { error_propagate(errp, local_err); @@ -2161,6 +2182,11 @@ static Property smmuv3_properties[] = { DEFINE_PROP_END_OF_LIST() }; +static Property smmuv3_nested_properties[] = { + DEFINE_PROP_STRING("pci-bus", SMMUv3NestedState, pci_bus), + DEFINE_PROP_END_OF_LIST() +}; + static void smmuv3_instance_init(Object *obj) { /* Nothing much to do here as of now */ @@ -2188,6 +2214,7 @@ static void smmuv3_nested_class_init(ObjectClass *klass, void *data) dc->vmsd = &vmstate_smmuv3; device_class_set_parent_realize(dc, smmu_nested_realize, &c->parent_realize); + device_class_set_props(dc, smmuv3_nested_properties); dc->user_creatable = true; dc->hotpluggable = false; } diff --git a/include/hw/arm/smmuv3.h b/include/hw/arm/smmuv3.h index 87e628be7a..96513fce56 100644 --- a/include/hw/arm/smmuv3.h +++ b/include/hw/arm/smmuv3.h @@ -89,6 +89,8 @@ OBJECT_DECLARE_TYPE(SMMUv3NestedState, SMMUv3NestedClass, ARM_SMMUV3_NESTED) struct SMMUv3NestedState { SMMUv3State smmuv3_state; + + char *pci_bus; }; struct SMMUv3NestedClass { -- Gitee From a7ffb5856940a1515ef84a4d4644b7c7c07afb8f Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 6 Nov 2024 19:22:13 +0000 Subject: [PATCH 119/134] hw/arm/virt-acpi-build: Build IORT with multiple SMMU nodes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that we can have multiple user-creatable smmuv3-nested devices, each associated with different pci buses, update IORT ID mappings accordingly. Signed-off-by: Nicolin Chen Signed-off-by: Shameer Kolothum --- hw/arm/virt-acpi-build.c | 43 ++++++++++++++++++++++++++++------------ include/hw/arm/virt.h | 6 ++++++ 2 files changed, 36 insertions(+), 13 deletions(-) diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c index 076781423b..1d7839e4a0 100644 --- a/hw/arm/virt-acpi-build.c +++ b/hw/arm/virt-acpi-build.c @@ -555,8 +555,10 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) { int i, nb_nodes, rc_mapping_count; const uint32_t iort_node_offset = IORT_NODE_OFFSET; - size_t node_size, smmu_offset = 0; + size_t node_size, *smmu_offset; AcpiIortIdMapping *idmap; + hwaddr base; + int irq, num_smmus = 0; uint32_t id = 0; GArray *smmu_idmaps = g_array_new(false, true, sizeof(AcpiIortIdMapping)); GArray *its_idmaps = g_array_new(false, true, sizeof(AcpiIortIdMapping)); @@ -566,7 +568,21 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) /* Table 2 The IORT */ acpi_table_begin(&table, table_data); - if (vms->iommu == VIRT_IOMMU_SMMUV3) { + if (vms->smmu_nested_count) { + irq = vms->irqmap[VIRT_SMMU_NESTED] + ARM_SPI_BASE; + base = vms->memmap[VIRT_SMMU_NESTED].base; + num_smmus = vms->smmu_nested_count; + } else if (virt_has_smmuv3(vms)) { + irq = vms->irqmap[VIRT_SMMU] + ARM_SPI_BASE; + base = vms->memmap[VIRT_SMMU].base; + num_smmus = 1; + } + + smmu_offset = g_new0(size_t, num_smmus); + nb_nodes = 2; /* RC, ITS */ + nb_nodes += num_smmus; /* SMMU nodes */ + + if (virt_has_smmuv3(vms)) { AcpiIortIdMapping next_range = {0}; object_child_foreach_recursive(object_get_root(), @@ -588,18 +604,19 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) } next_range.input_base = idmap->input_base + idmap->id_count; + if (vms->iommu == VIRT_IOMMU_SMMUV3_NESTED) { + nb_nodes++; /* RMR node per SMMU */ + } } /* Append the last RC -> ITS ID mapping */ - if (next_range.input_base < 0xFFFF) { - next_range.id_count = 0xFFFF - next_range.input_base; + if (next_range.input_base < 0x10000) { + next_range.id_count = 0x10000 - next_range.input_base; g_array_append_val(its_idmaps, next_range); } - nb_nodes = 3; /* RC, ITS, SMMUv3 */ rc_mapping_count = smmu_idmaps->len + its_idmaps->len; } else { - nb_nodes = 2; /* RC, ITS */ rc_mapping_count = 1; } /* Number of IORT Nodes */ @@ -621,10 +638,9 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) /* GIC ITS Identifier Array */ build_append_int_noprefix(table_data, 0 /* MADT translation_id */, 4); - if (vms->iommu == VIRT_IOMMU_SMMUV3) { - int irq = vms->irqmap[VIRT_SMMU] + ARM_SPI_BASE; + for (i = 0; i < num_smmus; i++) { + smmu_offset[i] = table_data->len - table.table_offset; - smmu_offset = table_data->len - table.table_offset; /* Table 9 SMMUv3 Format */ build_append_int_noprefix(table_data, 4 /* SMMUv3 */, 1); /* Type */ node_size = SMMU_V3_ENTRY_SIZE + ID_MAPPING_ENTRY_SIZE; @@ -635,7 +651,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) /* Reference to ID Array */ build_append_int_noprefix(table_data, SMMU_V3_ENTRY_SIZE, 4); /* Base address */ - build_append_int_noprefix(table_data, vms->memmap[VIRT_SMMU].base, 8); + build_append_int_noprefix(table_data, base + (i * SMMU_IO_LEN), 8); /* Flags */ build_append_int_noprefix(table_data, 1 /* COHACC Override */, 4); build_append_int_noprefix(table_data, 0, 4); /* Reserved */ @@ -646,12 +662,13 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) build_append_int_noprefix(table_data, irq + 1, 4); /* PRI */ build_append_int_noprefix(table_data, irq + 3, 4); /* GERR */ build_append_int_noprefix(table_data, irq + 2, 4); /* Sync */ + irq += NUM_SMMU_IRQS; build_append_int_noprefix(table_data, 0, 4); /* Proximity domain */ /* DeviceID mapping index (ignored since interrupts are GSIV based) */ build_append_int_noprefix(table_data, 0, 4); /* output IORT node is the ITS group node (the first node) */ - build_iort_id_mapping(table_data, 0, 0xFFFF, IORT_NODE_OFFSET); + build_iort_id_mapping(table_data, 0, 0x10000, IORT_NODE_OFFSET); } /* Table 17 Root Complex Node */ @@ -684,7 +701,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) build_append_int_noprefix(table_data, 0, 3); /* Reserved */ /* Output Reference */ - if (vms->iommu == VIRT_IOMMU_SMMUV3) { + if (virt_has_smmuv3(vms)) { AcpiIortIdMapping *range; /* translated RIDs connect to SMMUv3 node: RC -> SMMUv3 -> ITS */ @@ -692,7 +709,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) range = &g_array_index(smmu_idmaps, AcpiIortIdMapping, i); /* output IORT node is the smmuv3 node */ build_iort_id_mapping(table_data, range->input_base, - range->id_count, smmu_offset); + range->id_count, smmu_offset[i]); } /* bypassed RIDs connect to ITS group node directly: RC -> ITS */ diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h index cd41e28202..bc3c8b70da 100644 --- a/include/hw/arm/virt.h +++ b/include/hw/arm/virt.h @@ -295,4 +295,10 @@ static inline int virt_gicv3_redist_region_count(VirtMachineState *vms) vms->highmem_redists) ? 2 : 1; } +static inline bool virt_has_smmuv3(const VirtMachineState *vms) +{ + return vms->iommu == VIRT_IOMMU_SMMUV3 || + vms->iommu == VIRT_IOMMU_SMMUV3_NESTED; +} + #endif /* QEMU_ARM_VIRT_H */ -- Gitee From ca17fd9b9e608e0a6e8a948ccf46fa020c12f510 Mon Sep 17 00:00:00 2001 From: caijian Date: Mon, 31 Mar 2025 15:06:13 +0800 Subject: [PATCH 120/134] tests/qtest: Allow IORT acpi table to change List changed IORT file and allow it to change. Signed-off-by: caijian --- tests/qtest/bios-tables-test-allowed-diff.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/qtest/bios-tables-test-allowed-diff.h b/tests/qtest/bios-tables-test-allowed-diff.h index dfb8523c8b..9a5a923d6b 100644 --- a/tests/qtest/bios-tables-test-allowed-diff.h +++ b/tests/qtest/bios-tables-test-allowed-diff.h @@ -1 +1,2 @@ /* List of comma-separated changed AML files to ignore */ +"tests/data/acpi/virt/IORT", -- Gitee From 1746ba1aee671b9552540e36a629988b00846a82 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Tue, 5 Oct 2021 10:53:13 +0200 Subject: [PATCH 121/134] hw/arm/virt-acpi-build: Add IORT RMR regions to handle MSI nested binding To handle SMMUv3 nested stage support it is practical to expose the guest with reserved memory regions (RMRs) covering the IOVAs used by the host kernel to map physical MSI doorbells. Those IOVAs belong to [0x8000000, 0x8100000] matching MSI_IOVA_BASE and MSI_IOVA_LENGTH definitions in kernel arm-smmu-v3 driver. This is the window used to allocate IOVAs matching physical MSI doorbells. With those RMRs, the guest is forced to use a flat mapping for this range. Hence the assigned device is programmed with one IOVA from this range. Stage 1, owned by the guest has a flat mapping for this IOVA. Stage2, owned by the VMM then enforces a mapping from this IOVA to the physical MSI doorbell. The creation of those RMR nodes only is relevant if nested stage SMMU is in use, along with VFIO. As VFIO devices can be hotplugged, all RMRs need to be created in advance. Hence the patch introduces a new arm virt "nested-smmuv3" iommu type. ARM DEN 0049E.b IORT specification also mandates that when RMRs are present, the OS must preserve PCIe configuration performed by the boot FW. So along with the RMR IORT nodes, a _DSM function #5, as defined by PCI FIRMWARE SPECIFICATION EVISION 3.3, chapter 4.6.5 is added to PCIe host bridge and PCIe expander bridge objects. Signed-off-by: Eric Auger Suggested-by: Jean-Philippe Brucker Signed-off-by: Nicolin Chen Signed-off-by: Shameer Kolothum --- hw/arm/virt-acpi-build.c | 71 +++++++++++++++++++++++++++++++++++----- 1 file changed, 63 insertions(+), 8 deletions(-) diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c index 1d7839e4a0..ad0f79e03d 100644 --- a/hw/arm/virt-acpi-build.c +++ b/hw/arm/virt-acpi-build.c @@ -417,6 +417,14 @@ static void acpi_dsdt_add_pci(Aml *scope, const MemMapEntry *memmap, .bus = vms->bus, }; + /* + * Nested SMMU requires RMRs for MSI 1-1 mapping, which + * require _DSM for PreservingPCI Boot Configurations + */ + if (vms->iommu == VIRT_IOMMU_SMMUV3_NESTED) { + cfg.preserve_config = true; + } + if (vms->highmem_mmio) { cfg.mmio64 = memmap[VIRT_HIGH_PCIE_MMIO]; } @@ -495,7 +503,7 @@ static void acpi_dsdt_add_tpm(Aml *scope, VirtMachineState *vms) #define IORT_NODE_OFFSET 48 static void build_iort_id_mapping(GArray *table_data, uint32_t input_base, - uint32_t id_count, uint32_t out_ref) + uint32_t id_count, uint32_t out_ref, uint32_t flags) { /* Table 4 ID mapping format */ build_append_int_noprefix(table_data, input_base, 4); /* Input base */ @@ -503,7 +511,7 @@ static void build_iort_id_mapping(GArray *table_data, uint32_t input_base, build_append_int_noprefix(table_data, input_base, 4); /* Output base */ build_append_int_noprefix(table_data, out_ref, 4); /* Output Reference */ /* Flags */ - build_append_int_noprefix(table_data, 0 /* Single mapping (disabled) */, 4); + build_append_int_noprefix(table_data, flags, 4); /* Flags */ } struct AcpiIortIdMapping { @@ -545,6 +553,50 @@ static int iort_idmap_compare(gconstpointer a, gconstpointer b) return idmap_a->input_base - idmap_b->input_base; } +static void +build_iort_rmr_nodes(GArray *table_data, GArray *smmu_idmaps, + size_t *smmu_offset, uint32_t *id) +{ + AcpiIortIdMapping *range; + int i; + + for (i = 0; i < smmu_idmaps->len; i++) { + range = &g_array_index(smmu_idmaps, AcpiIortIdMapping, i); + int bdf = range->input_base; + + /* Table 18 Reserved Memory Range Node */ + + build_append_int_noprefix(table_data, 6 /* RMR */, 1); /* Type */ + /* Length */ + build_append_int_noprefix(table_data, 28 + ID_MAPPING_ENTRY_SIZE + 20, 2); + build_append_int_noprefix(table_data, 3, 1); /* Revision */ + build_append_int_noprefix(table_data, *id, 4); /* Identifier */ + /* Number of ID mappings */ + build_append_int_noprefix(table_data, 1, 4); + /* Reference to ID Array */ + build_append_int_noprefix(table_data, 28, 4); + + /* RMR specific data */ + + /* Flags */ + build_append_int_noprefix(table_data, 0 /* Disallow remapping */, 4); + /* Number of Memory Range Descriptors */ + build_append_int_noprefix(table_data, 1 , 4); + /* Reference to Memory Range Descriptors */ + build_append_int_noprefix(table_data, 28 + ID_MAPPING_ENTRY_SIZE, 4); + build_iort_id_mapping(table_data, bdf, range->id_count, smmu_offset[i], 1); + + /* Table 19 Memory Range Descriptor */ + + /* Physical Range offset */ + build_append_int_noprefix(table_data, 0x8000000, 8); + /* Physical Range length */ + build_append_int_noprefix(table_data, 0x100000, 8); + build_append_int_noprefix(table_data, 0, 4); /* Reserved */ + *id += 1; + } +} + /* * Input Output Remapping Table (IORT) * Conforms to "IO Remapping Table System Software on ARM Platforms", @@ -554,7 +606,6 @@ static void build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) { int i, nb_nodes, rc_mapping_count; - const uint32_t iort_node_offset = IORT_NODE_OFFSET; size_t node_size, *smmu_offset; AcpiIortIdMapping *idmap; hwaddr base; @@ -563,7 +614,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) GArray *smmu_idmaps = g_array_new(false, true, sizeof(AcpiIortIdMapping)); GArray *its_idmaps = g_array_new(false, true, sizeof(AcpiIortIdMapping)); - AcpiTable table = { .sig = "IORT", .rev = 3, .oem_id = vms->oem_id, + AcpiTable table = { .sig = "IORT", .rev = 5, .oem_id = vms->oem_id, .oem_table_id = vms->oem_table_id }; /* Table 2 The IORT */ acpi_table_begin(&table, table_data); @@ -668,7 +719,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) build_append_int_noprefix(table_data, 0, 4); /* output IORT node is the ITS group node (the first node) */ - build_iort_id_mapping(table_data, 0, 0x10000, IORT_NODE_OFFSET); + build_iort_id_mapping(table_data, 0, 0x10000, IORT_NODE_OFFSET, 0); } /* Table 17 Root Complex Node */ @@ -709,7 +760,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) range = &g_array_index(smmu_idmaps, AcpiIortIdMapping, i); /* output IORT node is the smmuv3 node */ build_iort_id_mapping(table_data, range->input_base, - range->id_count, smmu_offset[i]); + range->id_count, smmu_offset[i], 0); } /* bypassed RIDs connect to ITS group node directly: RC -> ITS */ @@ -717,11 +768,15 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) range = &g_array_index(its_idmaps, AcpiIortIdMapping, i); /* output IORT node is the ITS group node (the first node) */ build_iort_id_mapping(table_data, range->input_base, - range->id_count, iort_node_offset); + range->id_count, IORT_NODE_OFFSET, 0); } } else { /* output IORT node is the ITS group node (the first node) */ - build_iort_id_mapping(table_data, 0, 0xFFFF, IORT_NODE_OFFSET); + build_iort_id_mapping(table_data, 0, 0x10000, IORT_NODE_OFFSET, 0); + } + + if (vms->iommu == VIRT_IOMMU_SMMUV3_NESTED) { + build_iort_rmr_nodes(table_data, smmu_idmaps, smmu_offset, &id); } acpi_table_end(linker, &table); -- Gitee From bf12438e93f2d55aac6245f6a9f77f51b6fd2d8a Mon Sep 17 00:00:00 2001 From: caijian Date: Mon, 31 Mar 2025 15:06:24 +0800 Subject: [PATCH 122/134] tests/data/acpi/virt: Update IORT acpi table - * Disassembly of tests/data/acpi/virt/IORT, Fri Mar 28 18:05:37 2025 + * Disassembly of /tmp/aml-9R3932, Fri Mar 28 18:05:37 2025 * * ACPI Data Table [IORT] * * Format: [HexOffset DecimalOffset ByteLength] FieldName : FieldValue */ [000h 0000 4] Signature : "IORT" [IO Remapping Table] [004h 0004 4] Table Length : 00000080 -[008h 0008 1] Revision : 03 -[009h 0009 1] Checksum : B3 +[008h 0008 1] Revision : 05 +[009h 0009 1] Checksum : AE [00Ah 0010 6] Oem ID : "BOCHS " [010h 0016 8] Oem Table ID : "BXPC " [018h 0024 4] Oem Revision : 00000001 [01Ch 0028 4] Asl Compiler ID : "BXPC" [020h 0032 4] Asl Compiler Revision : 00000001 @@ -45,32 +45,32 @@ [058h 0088 4] Cache Coherency : 00000001 [05Ch 0092 1] Hints (decoded below) : 00 Transient : 0 Write Allocate : 0 Read Allocate : 0 Override : 0 [05Dh 0093 2] Reserved : 0000 [05Fh 0095 1] Memory Flags (decoded below) : 03 Coherency : 1 Device Attribute : 1 [060h 0096 4] ATS Attribute : 00000000 [064h 0100 4] PCI Segment Number : 00000000 [068h 0104 1] Memory Size Limit : 40 [069h 0105 3] Reserved : 000000 [06Ch 0108 4] Input base : 00000000 -[070h 0112 4] ID Count : 0000FFFF +[070h 0112 4] ID Count : 00010000 [074h 0116 4] Output Base : 00000000 [078h 0120 4] Output Reference : 00000030 [07Ch 0124 4] Flags (decoded below) : 00000000 Single Mapping : 0 Raw Table Data: Length 128 (0x80) - 0000: 49 4F 52 54 80 00 00 00 03 B3 42 4F 43 48 53 20 // IORT......BOCHS + 0000: 49 4F 52 54 80 00 00 00 05 AE 42 4F 43 48 53 20 // IORT......BOCHS 0010: 42 58 50 43 20 20 20 20 01 00 00 00 42 58 50 43 // BXPC ....BXPC 0020: 01 00 00 00 02 00 00 00 30 00 00 00 00 00 00 00 // ........0....... 0030: 00 18 00 01 00 00 00 00 00 00 00 00 00 00 00 00 // ................ 0040: 01 00 00 00 00 00 00 00 02 38 00 03 01 00 00 00 // .........8...... 0050: 01 00 00 00 24 00 00 00 01 00 00 00 00 00 00 03 // ....$........... 0060: 00 00 00 00 00 00 00 00 40 00 00 00 00 00 00 00 // ........@....... - 0070: FF FF 00 00 00 00 00 00 30 00 00 00 00 00 00 00 // ........0....... + 0070: 00 00 01 00 00 00 00 00 30 00 00 00 00 00 00 00 // ........0....... Signed-off-by: caijian --- tests/data/acpi/virt/IORT | Bin 128 -> 128 bytes tests/qtest/bios-tables-test-allowed-diff.h | 1 - 2 files changed, 1 deletion(-) diff --git a/tests/data/acpi/virt/IORT b/tests/data/acpi/virt/IORT index 7efd0ce8a6b3928efa7e1373f688ab4c5f50543b..9f0958b3df5aa6b9c3092885f79a20d82da8f011 100644 GIT binary patch delta 35 lcmZo*Y+&T_^bZPYU|?WiT{n>{O@M)c5y)m>FaVPb3;=k51i%0Q delta 35 jcmZo*Y+&T_^bZPYU|?Wi-aL^jP2m53AQK1-AQS@tmyia) diff --git a/tests/qtest/bios-tables-test-allowed-diff.h b/tests/qtest/bios-tables-test-allowed-diff.h index 9a5a923d6b..dfb8523c8b 100644 --- a/tests/qtest/bios-tables-test-allowed-diff.h +++ b/tests/qtest/bios-tables-test-allowed-diff.h @@ -1,2 +1 @@ /* List of comma-separated changed AML files to ignore */ -"tests/data/acpi/virt/IORT", -- Gitee From 8414bc02f988ecca7dda5325227ff5ffbe45150c Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Wed, 15 Jan 2025 10:02:58 +0000 Subject: [PATCH 123/134] iommufd.h: Updated to openeuler olk-6.6 kernel Signed-off-by: Shameer Kolothum --- linux-headers/linux/iommufd.h | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/linux-headers/linux/iommufd.h b/linux-headers/linux/iommufd.h index 41559c6064..3e57fee01c 100644 --- a/linux-headers/linux/iommufd.h +++ b/linux-headers/linux/iommufd.h @@ -51,8 +51,8 @@ enum { IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP = 0x8c, IOMMUFD_CMD_HWPT_INVALIDATE = 0x8d, IOMMUFD_CMD_FAULT_QUEUE_ALLOC = 0x8e, - IOMMUFD_CMD_VIOMMU_ALLOC = 0x8f, - IOMMUFD_CMD_VDEVICE_ALLOC = 0x90, + IOMMUFD_CMD_VIOMMU_ALLOC = 0x90, + IOMMUFD_CMD_VDEVICE_ALLOC = 0x91, }; /** @@ -397,18 +397,20 @@ struct iommu_hwpt_vtd_s1 { }; /** - * struct iommu_hwpt_arm_smmuv3 - ARM SMMUv3 Context Descriptor Table info + * struct iommu_hwpt_arm_smmuv3 - ARM SMMUv3 nested STE * (IOMMU_HWPT_DATA_ARM_SMMUV3) * * @ste: The first two double words of the user space Stream Table Entry for - * a user stage-1 Context Descriptor Table. Must be little-endian. + * the translation. Must be little-endian. * Allowed fields: (Refer to "5.2 Stream Table Entry" in SMMUv3 HW Spec) * - word-0: V, Cfg, S1Fmt, S1ContextPtr, S1CDMax * - word-1: EATS, S1DSS, S1CIR, S1COR, S1CSH, S1STALLD * * -EIO will be returned if @ste is not legal or contains any non-allowed field. * Cfg can be used to select a S1, Bypass or Abort configuration. A Bypass - * nested domain will translate the same as the nesting parent. + * nested domain will translate the same as the nesting parent. The S1 will + * install a Context Descriptor Table pointing at userspace memory translated + * by the nesting parent. */ struct iommu_hwpt_arm_smmuv3 { __aligned_le64 ste[2]; @@ -920,8 +922,8 @@ enum iommu_viommu_type { * that is unique to a specific VM. Operations global to the IOMMU are connected * to the vIOMMU, such as: * - Security namespace for guest owned ID, e.g. guest-controlled cache tags + * - Non-device-affiliated event reporting, e.g. invalidation queue errors * - Access to a sharable nesting parent pagetable across physical IOMMUs - * - Non-affiliated event reporting (e.g. an invalidation queue error) * - Virtualization of various platforms IDs, e.g. RIDs and others * - Delivery of paravirtualized invalidation * - Direct assigned invalidation queues @@ -941,12 +943,10 @@ struct iommu_viommu_alloc { * struct iommu_vdevice_alloc - ioctl(IOMMU_VDEVICE_ALLOC) * @size: sizeof(struct iommu_vdevice_alloc) * @viommu_id: vIOMMU ID to associate with the virtual device - * @dev_id: The pyhsical device to allocate a virtual instance on the vIOMMU - * @__reserved: Must be 0 + * @dev_id: The physical device to allocate a virtual instance on the vIOMMU + * @out_vdevice_id: Object handle for the vDevice. Pass to IOMMU_DESTORY * @virt_id: Virtual device ID per vIOMMU, e.g. vSID of ARM SMMUv3, vDeviceID - * of AMD IOMMU, and vID of a nested Intel VT-d to a Context Table. - * @out_vdevice_id: Output virtual instance ID for the allocated object - * @__reserved2: Must be 0 + * of AMD IOMMU, and vRID of a nested Intel VT-d to a Context Table * * Allocate a virtual device instance (for a physical device) against a vIOMMU. * This instance holds the device's information (related to its vIOMMU) in a VM. @@ -955,10 +955,8 @@ struct iommu_vdevice_alloc { __u32 size; __u32 viommu_id; __u32 dev_id; - __u32 __reserved; - __aligned_u64 virt_id; __u32 out_vdevice_id; - __u32 __reserved2; + __aligned_u64 virt_id; }; #define IOMMU_VDEVICE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VDEVICE_ALLOC) #endif -- Gitee From c8267f88b2af37779a597aac00aeaf06adc80ccc Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Mon, 11 Dec 2023 14:42:01 +0000 Subject: [PATCH 124/134] hw/arm/smmuv3: Enable sva/stall IDR features Emulate features that will enable the stall and sva feature in Guest. Signed-off-by: Shameer Kolothum --- hw/arm/smmuv3-internal.h | 3 ++- hw/arm/smmuv3.c | 8 +++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/hw/arm/smmuv3-internal.h b/hw/arm/smmuv3-internal.h index a411fd4048..cfc04c563e 100644 --- a/hw/arm/smmuv3-internal.h +++ b/hw/arm/smmuv3-internal.h @@ -74,6 +74,7 @@ REG32(IDR1, 0x4) FIELD(IDR1, ECMDQ, 31, 1) #define SMMU_IDR1_SIDSIZE 16 +#define SMMU_IDR1_SSIDSIZE 16 #define SMMU_CMDQS 19 #define SMMU_EVENTQS 19 @@ -104,7 +105,7 @@ REG32(IDR5, 0x14) FIELD(IDR5, VAX, 10, 2); FIELD(IDR5, STALL_MAX, 16, 16); -#define SMMU_IDR5_OAS 4 +#define SMMU_IDR5_OAS 5 REG32(IIDR, 0x18) REG32(AIDR, 0x1c) diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c index 66e4e1b57d..8d8dcccd48 100644 --- a/hw/arm/smmuv3.c +++ b/hw/arm/smmuv3.c @@ -343,13 +343,14 @@ static void smmuv3_init_regs(SMMUv3State *s) s->idr[0] = FIELD_DP32(s->idr[0], IDR0, ASID16, 1); /* 16-bit ASID */ s->idr[0] = FIELD_DP32(s->idr[0], IDR0, VMID16, 1); /* 16-bit VMID */ s->idr[0] = FIELD_DP32(s->idr[0], IDR0, TTENDIAN, 2); /* little endian */ - s->idr[0] = FIELD_DP32(s->idr[0], IDR0, STALL_MODEL, 1); /* No stall */ + s->idr[0] = FIELD_DP32(s->idr[0], IDR0, STALL_MODEL, 0); /* stall */ /* terminated transaction will always be aborted/error returned */ s->idr[0] = FIELD_DP32(s->idr[0], IDR0, TERM_MODEL, 1); /* 2-level stream table supported */ s->idr[0] = FIELD_DP32(s->idr[0], IDR0, STLEVEL, 1); s->idr[1] = FIELD_DP32(s->idr[1], IDR1, SIDSIZE, SMMU_IDR1_SIDSIZE); + s->idr[1] = FIELD_DP32(s->idr[1], IDR1, SSIDSIZE, SMMU_IDR1_SSIDSIZE); s->idr[1] = FIELD_DP32(s->idr[1], IDR1, EVENTQS, SMMU_EVENTQS); s->idr[1] = FIELD_DP32(s->idr[1], IDR1, CMDQS, SMMU_CMDQS); @@ -361,7 +362,7 @@ static void smmuv3_init_regs(SMMUv3State *s) s->idr[3] = FIELD_DP32(s->idr[3], IDR3, RIL, 1); s->idr[3] = FIELD_DP32(s->idr[3], IDR3, BBML, 2); - s->idr[5] = FIELD_DP32(s->idr[5], IDR5, OAS, SMMU_IDR5_OAS); /* 44 bits */ + s->idr[5] = FIELD_DP32(s->idr[5], IDR5, OAS, SMMU_IDR5_OAS); /* 48 bits */ /* 4K, 16K and 64K granule support */ s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN4K, 1); s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN16K, 1); @@ -776,9 +777,6 @@ static int decode_cd(SMMUTransCfg *cfg, CD *cd, SMMUEventInfo *event) if (!CD_A(cd)) { goto bad_cd; /* SMMU_IDR0.TERM_MODEL == 1 */ } - if (CD_S(cd)) { - goto bad_cd; /* !STE_SECURE && SMMU_IDR0.STALL_MODEL == 1 */ - } if (CD_HA(cd) || CD_HD(cd)) { goto bad_cd; /* HTTU = 0 */ } -- Gitee From cdd5c088ff46ebf423c926fe4c0b12e345ae0db0 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Thu, 23 Feb 2023 12:12:48 +0000 Subject: [PATCH 125/134] =?UTF-8?q?kvm:=20Translate=20MSI=20doorbell=20add?= =?UTF-8?q?ress=C2=A0only=20if=20it=20is=20valid?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Guest might have already set the MSI doorbell address to invalid and if we try to translate the address again, Guest reports, [ 26.784082] arm-smmu-v3 arm-smmu-v3.0.auto: event 0x10 received: [ 26.784088] arm-smmu-v3 arm-smmu-v3.0.auto: 0x0000001000000010 [ 26.784090] arm-smmu-v3 arm-smmu-v3.0.auto: 0x0000000000000000 [ 26.784092] arm-smmu-v3 arm-smmu-v3.0.auto: 0x0000000000000000 [ 26.784094] arm-smmu-v3 arm-smmu-v3.0.auto: 0x0000000000000000 [ 26.788082] arm-smmu-v3 arm-smmu-v3.0.auto: event 0x10 received: [ 26.788085] arm-smmu-v3 arm-smmu-v3.0.auto: 0x0000001000000010 [ 26.788087] arm-smmu-v3 arm-smmu-v3.0.auto: 0x0000000000000000 .... eg: rmmod hisi_zip.ko. The sequence seems to be, - Write 0 to MSI Message Address register - Disable MSI Hence check for address validity before we try to do the translation. Note: The fix is placed in generic code and hopefully is not a problem for other architectures. Signed-off-by: Shameer Kolothum --- accel/kvm/kvm-all.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index a8e29f148e..6fa97d2cbf 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -2074,7 +2074,8 @@ int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg, kroute.flags = KVM_MSI_VALID_DEVID; kroute.u.msi.devid = pci_requester_id(dev); } - if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) { + if (msg.address && + kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) { return -EINVAL; } -- Gitee From ebfa7213e32faafd5532d6f5b3cb873018b671ae Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Thu, 10 Oct 2024 06:19:31 +0000 Subject: [PATCH 126/134] smmuv3: Add support for page fault handling Handle page fault from host and send response back. Signed-off-by: Shameer Kolothum --- backends/iommufd.c | 20 +++- hw/arm/smmu-common.c | 39 ++++++-- hw/arm/smmuv3.c | 188 ++++++++++++++++++++++++++++++++++- hw/vfio/iommufd.c | 2 +- include/hw/arm/smmu-common.h | 24 ++++- include/sysemu/iommufd.h | 2 +- 6 files changed, 263 insertions(+), 12 deletions(-) diff --git a/backends/iommufd.c b/backends/iommufd.c index ee6f5bcf65..e9ce82297b 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -228,7 +228,7 @@ bool iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id, uint32_t pt_id, uint32_t flags, uint32_t data_type, uint32_t data_len, void *data_ptr, uint32_t *out_hwpt, - Error **errp) + uint32_t *out_fault_fd, Error **errp) { int ret, fd = be->fd; struct iommu_hwpt_alloc alloc_hwpt = { @@ -241,6 +241,24 @@ bool iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id, .data_uptr = (uintptr_t)data_ptr, }; + if (flags & IOMMU_HWPT_FAULT_ID_VALID) { + + struct iommu_fault_alloc cmd = { + .size = sizeof(cmd), + }; + + ret = ioctl(fd, IOMMU_FAULT_QUEUE_ALLOC, &cmd); + if (ret) { + ret = -errno; + error_report("IOMMU_FAULT_ALLOC failed: %m"); + } else { + alloc_hwpt.fault_id = cmd.out_fault_id; + if (out_fault_fd) { + *out_fault_fd = cmd.out_fault_fd; + } + } + } + ret = ioctl(fd, IOMMU_HWPT_ALLOC, &alloc_hwpt); trace_iommufd_backend_alloc_hwpt(fd, dev_id, pt_id, flags, data_type, data_len, (uintptr_t)data_ptr, diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c index d0bc620606..c382fa16e5 100644 --- a/hw/arm/smmu-common.c +++ b/hw/arm/smmu-common.c @@ -670,7 +670,7 @@ static bool smmu_dev_attach_viommu(SMMUDevice *sdev, if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid, idev->ioas_id, IOMMU_HWPT_ALLOC_NEST_PARENT, IOMMU_HWPT_DATA_NONE, 0, NULL, - &s2_hwpt_id, errp)) { + &s2_hwpt_id, NULL, errp)) { error_setg(errp, "failed to allocate an S2 hwpt"); return false; } @@ -695,7 +695,7 @@ static bool smmu_dev_attach_viommu(SMMUDevice *sdev, viommu->core->viommu_id, 0, IOMMU_HWPT_DATA_ARM_SMMUV3, sizeof(abort_data), &abort_data, - &viommu->abort_hwpt_id, errp)) { + &viommu->abort_hwpt_id, NULL, errp)) { error_setg(errp, "failed to allocate an abort pagetable"); goto free_viommu_core; } @@ -704,7 +704,7 @@ static bool smmu_dev_attach_viommu(SMMUDevice *sdev, viommu->core->viommu_id, 0, IOMMU_HWPT_DATA_ARM_SMMUV3, sizeof(bypass_data), &bypass_data, - &viommu->bypass_hwpt_id, errp)) { + &viommu->bypass_hwpt_id, NULL, errp)) { error_setg(errp, "failed to allocate a bypass pagetable"); goto free_abort_hwpt; } @@ -882,6 +882,25 @@ void smmu_dev_uninstall_nested_ste(SMMUDevice *sdev, bool abort) hwpt_id = sdev->viommu->bypass_hwpt_id; } + /* ToDo: May be better to move the below to smmuv3. */ + if (s1_hwpt->out_fault_fd) { + struct io_uring *ring = &s1_hwpt->fault_ring; + struct io_uring_sqe *sqe; + struct __kernel_timespec ts = {.tv_sec = 0, .tv_nsec = 1}; + + s1_hwpt->exiting = true; + /* Send out a timeout sqe for the read handler to exit */ + sqe = io_uring_get_sqe(ring); + io_uring_prep_timeout(sqe, &ts, 0, 0); + io_uring_submit(ring); + + qemu_cond_signal(&s1_hwpt->fault_cond); + qemu_thread_join(&s1_hwpt->read_fault_thread); + qemu_thread_join(&s1_hwpt->write_fault_thread); + qemu_mutex_destroy(&s1_hwpt->fault_mutex); + io_uring_queue_exit(&s1_hwpt->fault_ring); + } + if (!host_iommu_device_iommufd_attach_hwpt(idev, hwpt_id, NULL)) { return; } @@ -892,11 +911,13 @@ void smmu_dev_uninstall_nested_ste(SMMUDevice *sdev, bool abort) } int smmu_dev_install_nested_ste(SMMUDevice *sdev, uint32_t data_type, - uint32_t data_len, void *data) + uint32_t data_len, void *data, + bool req_fault_fd) { SMMUViommu *viommu = sdev->viommu; SMMUS1Hwpt *s1_hwpt = sdev->s1_hwpt; HostIOMMUDeviceIOMMUFD *idev = sdev->idev; + uint32_t flags = 0; if (!idev || !viommu) { return -ENOENT; @@ -912,12 +933,18 @@ int smmu_dev_install_nested_ste(SMMUDevice *sdev, uint32_t data_type, } s1_hwpt->smmu = sdev->smmu; + s1_hwpt->sdev = sdev; s1_hwpt->viommu = viommu; s1_hwpt->iommufd = idev->iommufd; + if (req_fault_fd) { + flags |= IOMMU_HWPT_FAULT_ID_VALID; + } + if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid, - viommu->core->viommu_id, 0, data_type, - data_len, data, &s1_hwpt->hwpt_id, NULL)) { + viommu->core->viommu_id, flags, data_type, + data_len, data, &s1_hwpt->hwpt_id, + &s1_hwpt->out_fault_fd, NULL)) { goto free; } diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c index 8d8dcccd48..30c0ae4c3b 100644 --- a/hw/arm/smmuv3.c +++ b/hw/arm/smmuv3.c @@ -34,6 +34,9 @@ #include "hw/arm/smmuv3.h" #include "smmuv3-internal.h" #include "smmu-internal.h" +#ifdef CONFIG_LINUX_IO_URING +#include +#endif #define PTW_RECORD_FAULT(cfg) (((cfg)->stage == 1) ? (cfg)->record_faults : \ (cfg)->s2cfg.record_faults) @@ -1258,6 +1261,165 @@ static void smmuv3_range_inval(SMMUState *s, Cmd *cmd) } } +static void smmuv3_report_iommu_fault(SMMUS1Hwpt *hwpt, + struct iommu_hwpt_pgfault *fault) +{ + PendFaultEntry *pend; + SMMUDevice *sdev = hwpt->sdev; + SMMUv3State *s3 = sdev->smmu; + uint32_t sid = smmu_get_sid(sdev); + SMMUEventInfo info = {0}; + + info.sid = sid; + info.type = SMMU_EVT_F_TRANSLATION; + info.u.f_translation.addr = fault->addr; + info.u.f_translation.stall = true; + info.u.f_translation.ssid = fault->pasid; + info.u.f_translation.stag = fault->grpid; + + if (fault->flags | IOMMU_PGFAULT_FLAGS_PASID_VALID) { + info.u.f_translation.ssv = true; + } + if (fault->perm & IOMMU_PGFAULT_PERM_READ) { + info.u.f_translation.rnw = true; + } + if (fault->perm & IOMMU_PGFAULT_PERM_PRIV) { + info.u.f_translation.pnu = true; + } + if (fault->perm & IOMMU_PGFAULT_PERM_EXEC) { + info.u.f_translation.ind = true; + } + + pend = g_new0(PendFaultEntry, 1); + memcpy(&pend->fault, fault, sizeof(*fault)); + qemu_mutex_lock(&hwpt->fault_mutex); + QTAILQ_INSERT_TAIL(&hwpt->pendfault, pend, entry); + qemu_mutex_unlock(&hwpt->fault_mutex); + smmuv3_record_event(s3, &info); + return; +} + +static void smmuv3_notify_stall_resume(SMMUState *bs, uint32_t sid, + uint32_t stag, uint32_t code) +{ + SMMUDevice *sdev = smmu_find_sdev(bs, sid); + PageRespEntry *msg; + PendFaultEntry *pend, *tmp; + SMMUS1Hwpt *hwpt; + bool found = false; + + if (!sdev) { + return; + } + + hwpt = sdev->s1_hwpt; + msg = g_new0(PageRespEntry, 1); + + /* Kernel expects addr and pasid info for page response */ + qemu_mutex_lock(&hwpt->fault_mutex); + QTAILQ_FOREACH_SAFE(pend, &hwpt->pendfault, entry, tmp) { + if (pend->fault.grpid == stag) { + QTAILQ_REMOVE(&hwpt->pendfault, pend, entry); + msg->resp.cookie = pend->fault.cookie; + msg->resp.code = code; + QTAILQ_INSERT_TAIL(&hwpt->pageresp, msg, entry); + qemu_cond_signal(&hwpt->fault_cond); + + g_free(pend); + found = true; + break; + } + } + + qemu_mutex_unlock(&hwpt->fault_mutex); + if (!found) { + warn_report("No matching fault for resume(stag 0x%x), drop!", stag); + return; + } +} + +static void *write_fault_handler(void *opaque) +{ + SMMUS1Hwpt *hwpt = opaque; + PageRespEntry *msg, *tmp; + struct iommu_hwpt_page_response *resp; + int ret; + + resp = g_new0(struct iommu_hwpt_page_response, 1); + while (!hwpt->exiting) { + /* Check we have any pending responses */ + qemu_mutex_lock(&hwpt->fault_mutex); + qemu_cond_wait(&hwpt->fault_cond, &hwpt->fault_mutex); + QTAILQ_FOREACH_SAFE(msg, &hwpt->pageresp, entry, tmp) { + QTAILQ_REMOVE(&hwpt->pageresp, msg, entry); + memcpy(resp, &msg->resp, sizeof(*resp)); + g_free(msg); + + ret = write(hwpt->out_fault_fd, resp, sizeof(*resp)); + if (ret != sizeof(*resp)) { + warn_report("Write resp[cookie 0x%x] fail %d", + resp->cookie, ret); + } + } + qemu_mutex_unlock(&hwpt->fault_mutex); + } + g_free(resp); + return NULL; +} + +static void *read_fault_handler(void *opaque) +{ + SMMUS1Hwpt *hwpt = opaque; + struct io_uring_sqe *sqe; + struct io_uring_cqe *cqe; + struct iommu_hwpt_pgfault *fault; + struct io_uring *ring = &hwpt->fault_ring; + void *data; + int ret; + + fault = g_new0(struct iommu_hwpt_pgfault, 1); + while (!hwpt->exiting) { + sqe = io_uring_get_sqe(ring); + io_uring_prep_read(sqe, hwpt->out_fault_fd, fault, + sizeof(*fault), 0); + io_uring_sqe_set_data(sqe, fault); + io_uring_submit(ring); + + ret = io_uring_wait_cqe(ring, &cqe); + if (ret == 0) { + if (cqe->res == sizeof(*fault)) { + data = io_uring_cqe_get_data(cqe); + smmuv3_report_iommu_fault(hwpt, data); + } + } else { + warn_report("Read fault[hwpt_id 0x%x] failed %d", + hwpt->hwpt_id, ret); + } + io_uring_cqe_seen(ring, cqe); + } + g_free(fault); + return NULL; +} + +static void create_fault_handlers(SMMUS1Hwpt *hwpt) +{ + if (!hwpt->out_fault_fd) { + warn_report("No fault fd for hwpt id: %d", hwpt->hwpt_id); + return; + } + + io_uring_queue_init(1024, &hwpt->fault_ring, 0); + qemu_mutex_init(&hwpt->fault_mutex); + qemu_cond_init(&hwpt->fault_cond); + QTAILQ_INIT(&hwpt->pageresp); + QTAILQ_INIT(&hwpt->pendfault); + qemu_thread_create(&hwpt->read_fault_thread, "io fault read", + read_fault_handler, + hwpt, QEMU_THREAD_JOINABLE); + qemu_thread_create(&hwpt->write_fault_thread, "io fault write", + write_fault_handler, + hwpt, QEMU_THREAD_JOINABLE); +} static void smmuv3_install_nested_ste(SMMUDevice *sdev, int sid) { #ifdef __linux__ @@ -1266,6 +1428,7 @@ static void smmuv3_install_nested_ste(SMMUDevice *sdev, int sid) struct iommu_hwpt_arm_smmuv3 nested_data = {}; SMMUv3State *s = sdev->smmu; SMMUState *bs = &s->smmu_state; + bool req_fault_fd = false; uint32_t config; STE ste; int ret; @@ -1309,13 +1472,22 @@ static void smmuv3_install_nested_ste(SMMUDevice *sdev, int sid) /* S1DSS | S1CIR | S1COR | S1CSH | S1STALLD | EATS */ nested_data.ste[1] &= 0x380000ffULL; + if (STE_S1CDMAX(&ste)) { + req_fault_fd = true; + } + ret = smmu_dev_install_nested_ste(sdev, IOMMU_HWPT_DATA_ARM_SMMUV3, - sizeof(nested_data), &nested_data); + sizeof(nested_data), &nested_data, + req_fault_fd); if (ret) { error_report("Unable to install nested STE=%16LX:%16LX, ret=%d", nested_data.ste[1], nested_data.ste[0], ret); } + if (req_fault_fd) { + create_fault_handlers(sdev->s1_hwpt); + } + trace_smmuv3_install_nested_ste(sid, nested_data.ste[1], nested_data.ste[0]); #endif } @@ -1631,10 +1803,22 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) case SMMU_CMD_TLBI_EL2_VA: case SMMU_CMD_TLBI_EL2_VAA: case SMMU_CMD_PRI_RESP: - case SMMU_CMD_RESUME: case SMMU_CMD_STALL_TERM: trace_smmuv3_unhandled_cmd(type); break; + case SMMU_CMD_RESUME: + { + uint32_t sid = CMD_SID(&cmd); + uint16_t stag = CMD_RESUME_STAG(&cmd); + uint8_t action = CMD_RESUME_AC(&cmd); + uint32_t code = IOMMUFD_PAGE_RESP_INVALID; + + if (action) { + code = IOMMUFD_PAGE_RESP_SUCCESS; + } + smmuv3_notify_stall_resume(bs, sid, stag, code); + break; + } default: cmd_error = SMMU_CERROR_ILL; break; diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 528023b95b..c0eb87c78c 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -344,7 +344,7 @@ static int iommufd_cdev_autodomains_get(VFIODevice *vbasedev, if (!iommufd_backend_alloc_hwpt(iommufd, vbasedev->devid, container->ioas_id, flags, IOMMU_HWPT_DATA_NONE, 0, NULL, - &hwpt_id, errp)) { + &hwpt_id, NULL, errp)) { return -EINVAL; } diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h index e30539a8d4..087a11efc7 100644 --- a/include/hw/arm/smmu-common.h +++ b/include/hw/arm/smmu-common.h @@ -138,13 +138,34 @@ typedef struct SMMUVdev { uint32_t sid; }SMMUVdev; +typedef struct PendFaultEntry { + struct iommu_hwpt_pgfault fault; + QTAILQ_ENTRY(PendFaultEntry) entry; +} PendFaultEntry; + +typedef struct PageRespEntry { + struct iommu_hwpt_page_response resp; + QTAILQ_ENTRY(PageRespEntry) entry; +} PageRespEntry; + typedef struct SMMUS1Hwpt { + void *sdev; void *smmu; IOMMUFDBackend *iommufd; SMMUViommu *viommu; uint32_t hwpt_id; + uint32_t out_fault_fd; QLIST_HEAD(, SMMUDevice) device_list; QLIST_ENTRY(SMMUViommu) next; + /* fault handling */ + struct io_uring fault_ring; + QemuThread read_fault_thread; + QemuThread write_fault_thread; + QemuMutex fault_mutex; + QemuCond fault_cond; + QTAILQ_HEAD(, PageRespEntry) pageresp; + QTAILQ_HEAD(, PendFaultEntry) pendfault; + bool exiting; } SMMUS1Hwpt; typedef struct SMMUDevice { @@ -258,7 +279,8 @@ int smmu_dev_get_info(SMMUDevice *sdev, uint32_t *data_type, uint32_t data_len, void *data); void smmu_dev_uninstall_nested_ste(SMMUDevice *sdev, bool abort); int smmu_dev_install_nested_ste(SMMUDevice *sdev, uint32_t data_type, - uint32_t data_len, void *data); + uint32_t data_len, void *data, + bool req_fault_fd); int smmu_hwpt_invalidate_cache(SMMUS1Hwpt *s1_hwpt, uint32_t type, uint32_t len, uint32_t *num, void *reqs); int smmu_viommu_invalidate_cache(IOMMUFDViommu *viommu, uint32_t type, diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h index 0f2c826036..b279184974 100644 --- a/include/sysemu/iommufd.h +++ b/include/sysemu/iommufd.h @@ -62,7 +62,7 @@ bool iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id, uint32_t pt_id, uint32_t flags, uint32_t data_type, uint32_t data_len, void *data_ptr, uint32_t *out_hwpt, - Error **errp); + uint32_t *out_fault_fd, Error **errp); bool iommufd_backend_set_dirty_tracking(IOMMUFDBackend *be, uint32_t hwpt_id, bool start, Error **errp); bool iommufd_backend_get_dirty_bitmap(IOMMUFDBackend *be, uint32_t hwpt_id, -- Gitee From 494e0ace6c120af00b27a0cc1d4a478073654e35 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Thu, 12 Sep 2024 00:33:13 -0700 Subject: [PATCH 127/134] pci: Get pasid capability from vIOMMU Signed-off-by: Yi Liu --- hw/pci/pci.c | 13 +++++++++++++ include/hw/pci/pci.h | 13 +++++++++++++ 2 files changed, 26 insertions(+) diff --git a/hw/pci/pci.c b/hw/pci/pci.c index d6f627aa51..447ef2b163 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -2802,6 +2802,19 @@ void pci_device_unset_iommu_device(PCIDevice *dev) } } +bool pci_device_get_pasid_cap(PCIDevice *dev) +{ + PCIBus *iommu_bus; + + pci_device_get_iommu_bus_devfn(dev, &iommu_bus, NULL, NULL); + if (iommu_bus && iommu_bus->iommu_ops->get_pasid_cap) { + return iommu_bus->iommu_ops->get_pasid_cap(pci_get_bus(dev), + iommu_bus->iommu_opaque, + dev->devfn); + } + return false; +} + void pci_setup_iommu(PCIBus *bus, const PCIIOMMUOps *ops, void *opaque) { /* diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h index 8d1af44249..0dfe274c33 100644 --- a/include/hw/pci/pci.h +++ b/include/hw/pci/pci.h @@ -418,12 +418,25 @@ typedef struct PCIIOMMUOps { * @devfn: device and function number of the PCI device. */ void (*unset_iommu_device)(PCIBus *bus, void *opaque, int devfn); + /** + * @get_pasid_cap: get pasid capability from vIOMMU + * + * Optional callback. + * + * @bus: the #PCIBus of the PCI device. + * + * @opaque: the data passed to pci_setup_iommu(). + * + * @devfn: device and function number of the PCI device. + */ + bool (*get_pasid_cap)(PCIBus *bus, void *opaque, int devfn); } PCIIOMMUOps; AddressSpace *pci_device_iommu_address_space(PCIDevice *dev); bool pci_device_set_iommu_device(PCIDevice *dev, HostIOMMUDevice *hiod, Error **errp); void pci_device_unset_iommu_device(PCIDevice *dev); +bool pci_device_get_pasid_cap(PCIDevice *dev); /** * pci_setup_iommu: Initialize specific IOMMU handlers for a PCIBus -- Gitee From 0978556247d968ffc83beff3b2611c93fd9b6b13 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Thu, 12 Sep 2024 00:17:31 -0700 Subject: [PATCH 128/134] backend/iommufd: Report PASID capability Signed-off-by: Yi Liu --- backends/iommufd.c | 4 +++- hw/arm/smmu-common.c | 4 ++-- hw/arm/smmuv3.c | 4 +++- hw/vfio/iommufd.c | 4 +++- include/hw/arm/smmu-common.h | 2 +- include/sysemu/host_iommu_device.h | 1 + include/sysemu/iommufd.h | 3 ++- 7 files changed, 15 insertions(+), 7 deletions(-) diff --git a/backends/iommufd.c b/backends/iommufd.c index e9ce82297b..4f5df63331 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -326,7 +326,8 @@ bool iommufd_backend_get_dirty_bitmap(IOMMUFDBackend *be, bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, uint32_t *type, void *data, uint32_t len, - uint64_t *caps, Error **errp) + uint64_t *caps, uint8_t *max_pasid_log2, + Error **errp) { struct iommu_hw_info info = { .size = sizeof(info), @@ -344,6 +345,7 @@ bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, *type = info.out_data_type; g_assert(caps); *caps = info.out_capabilities; + *max_pasid_log2 = info.out_max_pasid_log2; return true; } diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c index c382fa16e5..e7028bd4ec 100644 --- a/hw/arm/smmu-common.c +++ b/hw/arm/smmu-common.c @@ -853,7 +853,7 @@ SMMUDevice *smmu_find_sdev(SMMUState *s, uint32_t sid) /* IOMMUFD helpers */ int smmu_dev_get_info(SMMUDevice *sdev, uint32_t *data_type, - uint32_t data_len, void *data) + uint32_t data_len, uint8_t *pasid, void *data) { uint64_t caps; @@ -863,7 +863,7 @@ int smmu_dev_get_info(SMMUDevice *sdev, uint32_t *data_type, return !iommufd_backend_get_device_info(sdev->idev->iommufd, sdev->idev->devid, data_type, data, - data_len, &caps, NULL); + data_len, &caps, pasid, NULL); } void smmu_dev_uninstall_nested_ste(SMMUDevice *sdev, bool abort) diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c index 30c0ae4c3b..0ca0e96fcc 100644 --- a/hw/arm/smmuv3.c +++ b/hw/arm/smmuv3.c @@ -264,6 +264,7 @@ static void smmuv3_nested_init_regs(SMMUv3State *s) SMMUDevice *sdev; uint32_t data_type; uint32_t val; + uint8_t pasid; int ret; if (!bs->nested || !bs->viommu) { @@ -280,7 +281,8 @@ static void smmuv3_nested_init_regs(SMMUv3State *s) goto out; } - ret = smmu_dev_get_info(sdev, &data_type, sizeof(sdev->info), &sdev->info); + ret = smmu_dev_get_info(sdev, &data_type, sizeof(sdev->info), &pasid, + &sdev->info); if (ret) { error_report("failed to get SMMU device info"); return; diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index c0eb87c78c..a108beda29 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -871,18 +871,20 @@ static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, struct iommu_hw_info_vtd vtd; } data; uint64_t hw_caps; + uint8_t pasids; hiod->agent = opaque; if (!iommufd_backend_get_device_info(vdev->iommufd, vdev->devid, &type, &data, sizeof(data), - &hw_caps, errp)) { + &hw_caps, &pasids, errp)) { return false; } hiod->name = g_strdup(vdev->name); caps->type = type; caps->hw_caps = hw_caps; + caps->max_pasid_log2 = pasids; return true; } diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h index 087a11efc7..8ae33c3753 100644 --- a/include/hw/arm/smmu-common.h +++ b/include/hw/arm/smmu-common.h @@ -276,7 +276,7 @@ void smmu_inv_notifiers_all(SMMUState *s); /* IOMMUFD helpers */ int smmu_dev_get_info(SMMUDevice *sdev, uint32_t *data_type, - uint32_t data_len, void *data); + uint32_t data_len, uint8_t *pasid, void *data); void smmu_dev_uninstall_nested_ste(SMMUDevice *sdev, bool abort); int smmu_dev_install_nested_ste(SMMUDevice *sdev, uint32_t data_type, uint32_t data_len, void *data, diff --git a/include/sysemu/host_iommu_device.h b/include/sysemu/host_iommu_device.h index 84131f5495..22c76a37a7 100644 --- a/include/sysemu/host_iommu_device.h +++ b/include/sysemu/host_iommu_device.h @@ -26,6 +26,7 @@ typedef struct HostIOMMUDeviceCaps { uint32_t type; uint64_t hw_caps; + uint8_t max_pasid_log2; } HostIOMMUDeviceCaps; #define TYPE_HOST_IOMMU_DEVICE "host-iommu-device" diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h index b279184974..29afaa429d 100644 --- a/include/sysemu/iommufd.h +++ b/include/sysemu/iommufd.h @@ -57,7 +57,8 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova, ram_addr_t size); bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, uint32_t *type, void *data, uint32_t len, - uint64_t *caps, Error **errp); + uint64_t *caps, uint8_t *max_pasid_log2, + Error **errp); bool iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id, uint32_t pt_id, uint32_t flags, uint32_t data_type, uint32_t data_len, -- Gitee From da7cdc41aa3813f6bb1c87ced178f60185dac692 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Thu, 12 Sep 2024 01:38:46 -0700 Subject: [PATCH 129/134] vfio: Synthesize vPASID capability to VM If user wants to expose PASID capability in vIOMMU, then VFIO would also report the PASID cap for this device if the underlying hardware supports it as well. As a start, this chooses to put the vPASID cap in the last 8 bytes of the vconfig space. This is a choice in the good hope of no conflict with any existing cap or hidden registers. For the devices that has hidden registers, user should figure out a proper offset for the vPASID cap. This may require an option for user to config it. Here we leave it as a future extension. There are more discussions on the mechanism of finding the proper offset. https://lore.kernel.org/kvm/BN9PR11MB5276318969A212AD0649C7BE8CBE2@BN9PR11MB5276.namprd11.prod.outlook.com/ Signed-off-by: Yi Liu --- hw/pci/pcie.c | 12 ++++++++++++ hw/vfio/pci.c | 28 ++++++++++++++++++++++++++++ include/hw/pci/pcie.h | 4 ++++ 3 files changed, 44 insertions(+) diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c index 04fbd794a8..a5b4e54bd7 100644 --- a/hw/pci/pcie.c +++ b/hw/pci/pcie.c @@ -1123,3 +1123,15 @@ void pcie_acs_reset(PCIDevice *dev) pci_set_word(dev->config + dev->exp.acs_cap + PCI_ACS_CTRL, 0); } } + +void pcie_pasid_init(PCIDevice *dev, uint16_t offset, uint16_t caps) +{ + pcie_add_capability(dev, PCI_EXT_CAP_ID_PASID, 1, + offset, PCI_EXT_CAP_PASID_SIZEOF); + + dev->exp.pasid_cap = offset; + + pci_set_word(dev->config + offset + PCI_PASID_CAP, caps); + + pci_set_word(dev->wmask + dev->exp.pasid_cap + PCI_PASID_CTRL, 0x7); +} diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index f585f285f4..293deb8737 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -21,6 +21,7 @@ #include "qemu/osdep.h" #include CONFIG_DEVICES /* CONFIG_IOMMUFD */ #include +#include #include #include "hw/hw.h" @@ -2348,6 +2349,33 @@ static void vfio_add_ext_cap(VFIOPCIDevice *vdev) } + { + HostIOMMUDeviceCaps *caps = &vdev->vbasedev.hiod->caps; + + /* + * TODO: Add option for enabling pasid at a safe offset, this adds the + * pasid capability in the end of the PCIE config space. + */ + if (caps->max_pasid_log2 && pci_device_get_pasid_cap(&vdev->pdev)) { + uint16_t pasid_caps = (caps->max_pasid_log2 << 8) & PCI_PASID_CAP_WIDTH; + + if (caps->hw_caps & IOMMU_HW_CAP_PCI_PASID_EXEC) { + pasid_caps |= PCI_PASID_CAP_EXEC; + } + + if (caps->hw_caps & IOMMU_HW_CAP_PCI_PASID_PRIV) { + pasid_caps |= PCI_PASID_CAP_PRIV; + } + + pcie_pasid_init(pdev, + PCIE_CONFIG_SPACE_SIZE - PCI_EXT_CAP_PASID_SIZEOF, + pasid_caps); + + /* PASID capability is fully emulated by QEMU */ + memset(vdev->emulated_config_bits + pdev->exp.pasid_cap, 0xff, 8); + } + } + /* Cleanup chain head ID if necessary */ if (pci_get_word(pdev->config + PCI_CONFIG_SPACE_SIZE) == 0xFFFF) { pci_set_word(pdev->config + PCI_CONFIG_SPACE_SIZE, 0); diff --git a/include/hw/pci/pcie.h b/include/hw/pci/pcie.h index 11f5a91bbb..41ee27f023 100644 --- a/include/hw/pci/pcie.h +++ b/include/hw/pci/pcie.h @@ -79,6 +79,9 @@ struct PCIExpressDevice { uint16_t sriov_cap; PCIESriovPF sriov_pf; PCIESriovVF sriov_vf; + + /* Offset of PASID capability in config space */ + uint16_t pasid_cap; }; #define COMPAT_PROP_PCP "power_controller_present" @@ -147,4 +150,5 @@ void pcie_cap_slot_unplug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp); void pcie_cap_slot_unplug_request_cb(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp); +void pcie_pasid_init(PCIDevice *dev, uint16_t offset, uint16_t caps); #endif /* QEMU_PCIE_H */ -- Gitee From d4d0d15716a3f4c89ca9532e6b598b14db76ae0c Mon Sep 17 00:00:00 2001 From: Zhangfei Gao Date: Sat, 26 Oct 2024 08:40:11 +0000 Subject: [PATCH 130/134] smmuv3: realize get_pasid_cap and set ssidsize with pasid Signed-off-by: Zhangfei Gao --- hw/arm/smmu-common.c | 9 +++++++++ hw/arm/smmuv3.c | 3 +-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c index e7028bd4ec..3a257a5b0e 100644 --- a/hw/arm/smmu-common.c +++ b/hw/arm/smmu-common.c @@ -831,10 +831,19 @@ static void smmu_dev_unset_iommu_device(PCIBus *bus, void *opaque, int devfn) } } +static bool smmu_dev_get_pasid_cap(PCIBus *bus, + void *opaque, int devfn) +{ + assert(0 <= devfn && devfn < PCI_DEVFN_MAX); + + return true; +} + static const PCIIOMMUOps smmu_ops = { .get_address_space = smmu_find_add_as, .set_iommu_device = smmu_dev_set_iommu_device, .unset_iommu_device = smmu_dev_unset_iommu_device, + .get_pasid_cap = smmu_dev_get_pasid_cap, }; SMMUDevice *smmu_find_sdev(SMMUState *s, uint32_t sid) diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c index 0ca0e96fcc..6964ab000d 100644 --- a/hw/arm/smmuv3.c +++ b/hw/arm/smmuv3.c @@ -312,8 +312,7 @@ out: val = FIELD_EX32(sdev->info.idr[1], IDR1, SIDSIZE); s->idr[1] = FIELD_DP32(s->idr[1], IDR1, SIDSIZE, val); - val = FIELD_EX32(sdev->info.idr[1], IDR1, SSIDSIZE); - s->idr[1] = FIELD_DP32(s->idr[1], IDR1, SSIDSIZE, val); + s->idr[1] = FIELD_DP32(s->idr[1], IDR1, SSIDSIZE, pasid); val = FIELD_EX32(sdev->info.idr[3], IDR3, HAD); s->idr[3] = FIELD_DP32(s->idr[3], IDR3, HAD, val); -- Gitee From 58f66c2581b3c4a45a02717330f1b2188424889b Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Wed, 15 Jan 2025 16:11:21 +0000 Subject: [PATCH 131/134] smmu-common: Return sysmem address space only for vfio-pci This will enable pcie-root-port hotplug event irq to work. Discussion Link: https://lore.kernel.org/qemu-devel/74114c0db34b420a90e9fe5bd991767e@huawei.com/ Signed-off-by: Shameer Kolothum --- hw/arm/smmu-common.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c index 3a257a5b0e..6c4b82757f 100644 --- a/hw/arm/smmu-common.c +++ b/hw/arm/smmu-common.c @@ -639,9 +639,16 @@ static AddressSpace *smmu_find_add_as(PCIBus *bus, void *opaque, int devfn) SMMUState *s = opaque; SMMUPciBus *sbus = smmu_get_sbus(s, bus); SMMUDevice *sdev = smmu_get_sdev(s, sbus, bus, devfn); + bool is_vfio = false; + PCIDevice *pdev; + + pdev = pci_find_device(bus, pci_bus_num(bus), devfn); + if (object_dynamic_cast(OBJECT(pdev), "vfio-pci")) { + is_vfio = true; + } /* Return the system as if the device uses stage-2 only */ - if (s->nested && !sdev->s1_hwpt) { + if (s->nested && !sdev->s1_hwpt && is_vfio) { return &sdev->as_sysmem; } else { return &sdev->as; -- Gitee From 2697e7418c1e0d87c82feca33800e3a093546a90 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Thu, 16 Jan 2025 15:20:18 +0000 Subject: [PATCH 132/134] smmuv3: Change arm-smmuv3-nested name to arm-smmuv3-accel This is based on feedback received for RFC v1. Signed-off-by: Shameer Kolothum --- hw/arm/smmuv3.c | 38 +++++++++++++++++++------------------- hw/arm/virt-acpi-build.c | 16 ++++++++-------- hw/arm/virt.c | 24 ++++++++++++------------ hw/core/sysbus-fdt.c | 2 +- include/hw/arm/smmuv3.h | 8 ++++---- include/hw/arm/virt.h | 10 +++++----- 6 files changed, 49 insertions(+), 49 deletions(-) diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c index 6964ab000d..ecdad6bda4 100644 --- a/hw/arm/smmuv3.c +++ b/hw/arm/smmuv3.c @@ -2253,14 +2253,14 @@ static void smmu_realize(DeviceState *d, Error **errp) smmu_init_irq(s, dev); } -static int smmuv3_nested_pci_host_bridge(Object *obj, void *opaque) +static int smmuv3_accel_pci_host_bridge(Object *obj, void *opaque) { DeviceState *d = opaque; - SMMUv3NestedState *s_nested = ARM_SMMUV3_NESTED(d); + SMMUv3AccelState *s_accel = ARM_SMMUV3_ACCEL(d); if (object_dynamic_cast(obj, TYPE_PCI_HOST_BRIDGE)) { PCIBus *bus = PCI_HOST_BRIDGE(obj)->bus; - if (s_nested->pci_bus && !strcmp(bus->qbus.name, s_nested->pci_bus)) { + if (s_accel->pci_bus && !strcmp(bus->qbus.name, s_accel->pci_bus)) { object_property_set_link(OBJECT(d), "primary-bus", OBJECT(bus), &error_abort); } @@ -2268,15 +2268,15 @@ static int smmuv3_nested_pci_host_bridge(Object *obj, void *opaque) return 0; } -static void smmu_nested_realize(DeviceState *d, Error **errp) +static void smmu_accel_realize(DeviceState *d, Error **errp) { - SMMUv3NestedState *s_nested = ARM_SMMUV3_NESTED(d); - SMMUv3NestedClass *c = ARM_SMMUV3_NESTED_GET_CLASS(s_nested); + SMMUv3AccelState *s_nested = ARM_SMMUV3_ACCEL(d); + SMMUv3AccelClass *c = ARM_SMMUV3_ACCEL_GET_CLASS(s_nested); SysBusDevice *dev = SYS_BUS_DEVICE(d); Error *local_err = NULL; object_child_foreach_recursive(object_get_root(), - smmuv3_nested_pci_host_bridge, d); + smmuv3_accel_pci_host_bridge, d); object_property_set_bool(OBJECT(dev), "nested", true, &error_abort); c->parent_realize(d, &local_err); @@ -2365,8 +2365,8 @@ static Property smmuv3_properties[] = { DEFINE_PROP_END_OF_LIST() }; -static Property smmuv3_nested_properties[] = { - DEFINE_PROP_STRING("pci-bus", SMMUv3NestedState, pci_bus), +static Property smmuv3_accel_properties[] = { + DEFINE_PROP_STRING("pci-bus", SMMUv3AccelState, pci_bus), DEFINE_PROP_END_OF_LIST() }; @@ -2389,15 +2389,15 @@ static void smmuv3_class_init(ObjectClass *klass, void *data) device_class_set_props(dc, smmuv3_properties); } -static void smmuv3_nested_class_init(ObjectClass *klass, void *data) +static void smmuv3_accel_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); - SMMUv3NestedClass *c = ARM_SMMUV3_NESTED_CLASS(klass); + SMMUv3AccelClass *c = ARM_SMMUV3_ACCEL_CLASS(klass); dc->vmsd = &vmstate_smmuv3; - device_class_set_parent_realize(dc, smmu_nested_realize, + device_class_set_parent_realize(dc, smmu_accel_realize, &c->parent_realize); - device_class_set_props(dc, smmuv3_nested_properties); + device_class_set_props(dc, smmuv3_accel_properties); dc->user_creatable = true; dc->hotpluggable = false; } @@ -2440,12 +2440,12 @@ static void smmuv3_iommu_memory_region_class_init(ObjectClass *klass, imrc->notify_flag_changed = smmuv3_notify_flag_changed; } -static const TypeInfo smmuv3_nested_type_info = { - .name = TYPE_ARM_SMMUV3_NESTED, +static const TypeInfo smmuv3_accel_type_info = { + .name = TYPE_ARM_SMMUV3_ACCEL, .parent = TYPE_ARM_SMMUV3, - .instance_size = sizeof(SMMUv3NestedState), - .class_size = sizeof(SMMUv3NestedClass), - .class_init = smmuv3_nested_class_init, + .instance_size = sizeof(SMMUv3AccelState), + .class_size = sizeof(SMMUv3AccelClass), + .class_init = smmuv3_accel_class_init, }; static const TypeInfo smmuv3_type_info = { @@ -2466,7 +2466,7 @@ static const TypeInfo smmuv3_iommu_memory_region_info = { static void smmuv3_register_types(void) { type_register(&smmuv3_type_info); - type_register(&smmuv3_nested_type_info); + type_register(&smmuv3_accel_type_info); type_register(&smmuv3_iommu_memory_region_info); } diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c index ad0f79e03d..db635120f9 100644 --- a/hw/arm/virt-acpi-build.c +++ b/hw/arm/virt-acpi-build.c @@ -418,10 +418,10 @@ static void acpi_dsdt_add_pci(Aml *scope, const MemMapEntry *memmap, }; /* - * Nested SMMU requires RMRs for MSI 1-1 mapping, which + * Accel SMMU requires RMRs for MSI 1-1 mapping, which * require _DSM for PreservingPCI Boot Configurations */ - if (vms->iommu == VIRT_IOMMU_SMMUV3_NESTED) { + if (vms->iommu == VIRT_IOMMU_SMMUV3_ACCEL) { cfg.preserve_config = true; } @@ -619,10 +619,10 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) /* Table 2 The IORT */ acpi_table_begin(&table, table_data); - if (vms->smmu_nested_count) { - irq = vms->irqmap[VIRT_SMMU_NESTED] + ARM_SPI_BASE; - base = vms->memmap[VIRT_SMMU_NESTED].base; - num_smmus = vms->smmu_nested_count; + if (vms->smmu_accel_count) { + irq = vms->irqmap[VIRT_SMMU_ACCEL] + ARM_SPI_BASE; + base = vms->memmap[VIRT_SMMU_ACCEL].base; + num_smmus = vms->smmu_accel_count; } else if (virt_has_smmuv3(vms)) { irq = vms->irqmap[VIRT_SMMU] + ARM_SPI_BASE; base = vms->memmap[VIRT_SMMU].base; @@ -655,7 +655,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) } next_range.input_base = idmap->input_base + idmap->id_count; - if (vms->iommu == VIRT_IOMMU_SMMUV3_NESTED) { + if (vms->iommu == VIRT_IOMMU_SMMUV3_ACCEL) { nb_nodes++; /* RMR node per SMMU */ } } @@ -775,7 +775,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) build_iort_id_mapping(table_data, 0, 0x10000, IORT_NODE_OFFSET, 0); } - if (vms->iommu == VIRT_IOMMU_SMMUV3_NESTED) { + if (vms->iommu == VIRT_IOMMU_SMMUV3_ACCEL) { build_iort_rmr_nodes(table_data, smmu_idmaps, smmu_offset, &id); } diff --git a/hw/arm/virt.c b/hw/arm/virt.c index a55f297af2..57d00acd48 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -166,7 +166,7 @@ static const MemMapEntry base_memmap[] = { /* In the virtCCA scenario, this space is used for MSI interrupt mapping */ [VIRT_CVM_MSI] = { 0x0a001000, 0x00fff000 }, [VIRT_CPUFREQ] = { 0x0b000000, 0x00010000 }, - [VIRT_SMMU_NESTED] = { 0x0b010000, 0x00ff0000}, + [VIRT_SMMU_ACCEL] = { 0x0b010000, 0x00ff0000}, /* ...repeating for a total of NUM_VIRTIO_TRANSPORTS, each of that size */ [VIRT_PLATFORM_BUS] = { 0x0c000000, 0x02000000 }, [VIRT_SECURE_MEM] = { 0x0e000000, 0x01000000 }, @@ -212,7 +212,7 @@ static const int a15irqmap[] = { [VIRT_GIC_V2M] = 48, /* ...to 48 + NUM_GICV2M_SPIS - 1 */ [VIRT_SMMU] = 74, /* ...to 74 + NUM_SMMU_IRQS - 1 */ [VIRT_PLATFORM_BUS] = 112, /* ...to 112 + PLATFORM_BUS_NUM_IRQS -1 */ - [VIRT_SMMU_NESTED] = 200, + [VIRT_SMMU_ACCEL] = 200, }; static const char *valid_cpus[] = { @@ -3619,27 +3619,27 @@ static void virt_machine_device_plug_cb(HotplugHandler *hotplug_dev, /* For smmuv3-nested devices we need to set the mem & irq */ if (device_is_dynamic_sysbus(mc, dev) && - object_dynamic_cast(OBJECT(dev), TYPE_ARM_SMMUV3_NESTED)) { - hwaddr base = vms->memmap[VIRT_SMMU_NESTED].base; - int irq = vms->irqmap[VIRT_SMMU_NESTED]; + object_dynamic_cast(OBJECT(dev), TYPE_ARM_SMMUV3_ACCEL)) { + hwaddr base = vms->memmap[VIRT_SMMU_ACCEL].base; + int irq = vms->irqmap[VIRT_SMMU_ACCEL]; - if (vms->smmu_nested_count >= MAX_SMMU_NESTED) { + if (vms->smmu_accel_count >= MAX_SMMU_ACCEL) { error_setg(errp, "smmuv3-nested max count reached!"); return; } - base += (vms->smmu_nested_count * SMMU_IO_LEN); - irq += (vms->smmu_nested_count * NUM_SMMU_IRQS); + base += (vms->smmu_accel_count * SMMU_IO_LEN); + irq += (vms->smmu_accel_count * NUM_SMMU_IRQS); sysbus_mmio_map(SYS_BUS_DEVICE(dev), 0, base); for (int i = 0; i < 4; i++) { sysbus_connect_irq(SYS_BUS_DEVICE(dev), i, qdev_get_gpio_in(vms->gic, irq + i)); } - if (vms->iommu != VIRT_IOMMU_SMMUV3_NESTED) { - vms->iommu = VIRT_IOMMU_SMMUV3_NESTED; + if (vms->iommu != VIRT_IOMMU_SMMUV3_ACCEL) { + vms->iommu = VIRT_IOMMU_SMMUV3_ACCEL; } - vms->smmu_nested_count++; + vms->smmu_accel_count++; } if (vms->platform_bus_dev) { @@ -3815,7 +3815,7 @@ static void virt_machine_class_init(ObjectClass *oc, void *data) machine_class_allow_dynamic_sysbus_dev(mc, TYPE_VFIO_AMD_XGBE); machine_class_allow_dynamic_sysbus_dev(mc, TYPE_RAMFB_DEVICE); machine_class_allow_dynamic_sysbus_dev(mc, TYPE_VFIO_PLATFORM); - machine_class_allow_dynamic_sysbus_dev(mc, TYPE_ARM_SMMUV3_NESTED); + machine_class_allow_dynamic_sysbus_dev(mc, TYPE_ARM_SMMUV3_ACCEL); #ifdef CONFIG_TPM machine_class_allow_dynamic_sysbus_dev(mc, TYPE_TPM_TIS_SYSBUS); #endif diff --git a/hw/core/sysbus-fdt.c b/hw/core/sysbus-fdt.c index 0f0d0b3e58..58f4dc614c 100644 --- a/hw/core/sysbus-fdt.c +++ b/hw/core/sysbus-fdt.c @@ -489,7 +489,7 @@ static const BindingEntry bindings[] = { #ifdef CONFIG_LINUX TYPE_BINDING(TYPE_VFIO_CALXEDA_XGMAC, add_calxeda_midway_xgmac_fdt_node), TYPE_BINDING(TYPE_VFIO_AMD_XGBE, add_amd_xgbe_fdt_node), - TYPE_BINDING("arm-smmuv3-nested", no_fdt_node), + TYPE_BINDING("arm-smmuv3-accel", no_fdt_node), VFIO_PLATFORM_BINDING("amd,xgbe-seattle-v1a", add_amd_xgbe_fdt_node), #endif #ifdef CONFIG_TPM diff --git a/include/hw/arm/smmuv3.h b/include/hw/arm/smmuv3.h index 96513fce56..79b6fcd8e7 100644 --- a/include/hw/arm/smmuv3.h +++ b/include/hw/arm/smmuv3.h @@ -84,16 +84,16 @@ struct SMMUv3Class { #define TYPE_ARM_SMMUV3 "arm-smmuv3" OBJECT_DECLARE_TYPE(SMMUv3State, SMMUv3Class, ARM_SMMUV3) -#define TYPE_ARM_SMMUV3_NESTED "arm-smmuv3-nested" -OBJECT_DECLARE_TYPE(SMMUv3NestedState, SMMUv3NestedClass, ARM_SMMUV3_NESTED) +#define TYPE_ARM_SMMUV3_ACCEL "arm-smmuv3-accel" +OBJECT_DECLARE_TYPE(SMMUv3AccelState, SMMUv3AccelClass, ARM_SMMUV3_ACCEL) -struct SMMUv3NestedState { +struct SMMUv3AccelState { SMMUv3State smmuv3_state; char *pci_bus; }; -struct SMMUv3NestedClass { +struct SMMUv3AccelClass { /*< private >*/ SMMUv3Class smmuv3_class; /*< public >*/ diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h index bc3c8b70da..3e2759d225 100644 --- a/include/hw/arm/virt.h +++ b/include/hw/arm/virt.h @@ -110,7 +110,7 @@ typedef enum { #define SMMU_IO_LEN 0x20000 /* Max supported nested SMMUv3 */ -#define MAX_SMMU_NESTED 64 +#define MAX_SMMU_ACCEL 64 enum { VIRT_FLASH, @@ -124,7 +124,7 @@ enum { VIRT_GIC_ITS, VIRT_GIC_REDIST, VIRT_SMMU, - VIRT_SMMU_NESTED, + VIRT_SMMU_ACCEL, VIRT_UART, VIRT_CPUFREQ, VIRT_MMIO, @@ -159,7 +159,7 @@ enum { typedef enum VirtIOMMUType { VIRT_IOMMU_NONE, VIRT_IOMMU_SMMUV3, - VIRT_IOMMU_SMMUV3_NESTED, + VIRT_IOMMU_SMMUV3_ACCEL, VIRT_IOMMU_VIRTIO, } VirtIOMMUType; @@ -227,7 +227,7 @@ struct VirtMachineState { bool mte; bool dtb_randomness; bool pmu; - int smmu_nested_count; + int smmu_accel_count; OnOffAuto acpi; VirtGICType gic_version; VirtIOMMUType iommu; @@ -298,7 +298,7 @@ static inline int virt_gicv3_redist_region_count(VirtMachineState *vms) static inline bool virt_has_smmuv3(const VirtMachineState *vms) { return vms->iommu == VIRT_IOMMU_SMMUV3 || - vms->iommu == VIRT_IOMMU_SMMUV3_NESTED; + vms->iommu == VIRT_IOMMU_SMMUV3_ACCEL; } #endif /* QEMU_ARM_VIRT_H */ -- Gitee From 5e83bdd94533c91d69c7154d967f3bdd2fa86054 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Thu, 16 Jan 2025 15:29:49 +0000 Subject: [PATCH 133/134] smmuv3: Use default bus for arm-smmuv3-accel This is based on feedback on RFC v1. Signed-off-by: Shameer Kolothum --- hw/arm/smmuv3.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c index ecdad6bda4..c0fcdd7574 100644 --- a/hw/arm/smmuv3.c +++ b/hw/arm/smmuv3.c @@ -2256,11 +2256,10 @@ static void smmu_realize(DeviceState *d, Error **errp) static int smmuv3_accel_pci_host_bridge(Object *obj, void *opaque) { DeviceState *d = opaque; - SMMUv3AccelState *s_accel = ARM_SMMUV3_ACCEL(d); if (object_dynamic_cast(obj, TYPE_PCI_HOST_BRIDGE)) { PCIBus *bus = PCI_HOST_BRIDGE(obj)->bus; - if (s_accel->pci_bus && !strcmp(bus->qbus.name, s_accel->pci_bus)) { + if (d->parent_bus && !strcmp(bus->qbus.name, d->parent_bus->name)) { object_property_set_link(OBJECT(d), "primary-bus", OBJECT(bus), &error_abort); } @@ -2365,11 +2364,6 @@ static Property smmuv3_properties[] = { DEFINE_PROP_END_OF_LIST() }; -static Property smmuv3_accel_properties[] = { - DEFINE_PROP_STRING("pci-bus", SMMUv3AccelState, pci_bus), - DEFINE_PROP_END_OF_LIST() -}; - static void smmuv3_instance_init(Object *obj) { /* Nothing much to do here as of now */ @@ -2397,9 +2391,9 @@ static void smmuv3_accel_class_init(ObjectClass *klass, void *data) dc->vmsd = &vmstate_smmuv3; device_class_set_parent_realize(dc, smmu_accel_realize, &c->parent_realize); - device_class_set_props(dc, smmuv3_accel_properties); dc->user_creatable = true; dc->hotpluggable = false; + dc->bus_type = TYPE_PCIE_BUS; } static int smmuv3_notify_flag_changed(IOMMUMemoryRegion *iommu, -- Gitee From b1087bb8a4edbacc7240c0fcab63bc1cf2624627 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Tue, 21 Jan 2025 14:42:45 +0000 Subject: [PATCH 134/134] gpex-acpi: Remove duplicate DSM #5 It looks like acpi_dsdt_add_pci_osc() already builds the _DSM for virt/gpex case, and we don't need to add duplicate DSM methods for _DSM #5 case. And the acpi_dsdt_add_pci_osc() already adds _DSM #5 when preserve_config is true. This is to get rid of the ACPI related error messages during boot: ACPI BIOS Error (bug): Failure creating named object [\_SB.PC08._DSM], AE_ALREADY_EXISTS ACPI BIOS Error (bug): \_SB.PC08.PCI0._DSM: Excess arguments - ASL declared 5, ACPI requires 4 ToDo: Only sanity tested. Signed-off-by: Shameer Kolothum --- hw/pci-host/gpex-acpi.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/hw/pci-host/gpex-acpi.c b/hw/pci-host/gpex-acpi.c index ce424fc9da..162f6221ab 100644 --- a/hw/pci-host/gpex-acpi.c +++ b/hw/pci-host/gpex-acpi.c @@ -189,12 +189,6 @@ void acpi_dsdt_add_gpex(Aml *scope, struct GPEXConfig *cfg) aml_append(dev, aml_name_decl("_PXM", aml_int(numa_node))); } - if (cfg->preserve_config) { - method = aml_method("_DSM", 5, AML_SERIALIZED); - aml_append(method, aml_return(aml_int(0))); - aml_append(dev, method); - } - acpi_dsdt_add_pci_route_table(dev, cfg->irq); /* @@ -226,12 +220,6 @@ void acpi_dsdt_add_gpex(Aml *scope, struct GPEXConfig *cfg) aml_append(dev, aml_name_decl("_STR", aml_unicode("PCIe 0 Device"))); aml_append(dev, aml_name_decl("_CCA", aml_int(1))); - if (cfg->preserve_config) { - method = aml_method("_DSM", 5, AML_SERIALIZED); - aml_append(method, aml_return(aml_int(0))); - aml_append(dev, method); - } - acpi_dsdt_add_pci_route_table(dev, cfg->irq); method = aml_method("_CBA", 0, AML_NOTSERIALIZED); -- Gitee