From dabfe4576297b05dec07feb70c3cad5be034c2f2 Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Thu, 4 Mar 2021 21:34:46 +0800
Subject: [PATCH 01/48] vfio: Support host translation granule size

cpu_physical_memory_set_dirty_lebitmap() can quickly process the
dirty pages of memory by traversing the bitmap, regardless of whether
the bitmap is aligned correctly or not.

cpu_physical_memory_set_dirty_lebitmap() supports bitmaps whose pages are
of host page size, so it is better to set bitmap_pgsize to the host page
size in order to support more translation granule sizes.

[aw: The Fixes commit below introduced code to restrict migration
support to configurations where the target page size intersects the
host dirty page support.  For example, a 4K guest on a 4K host.
Due to the above flexibility in bitmap handling, this restriction
unnecessarily prevents mixed target/host pages size that could
otherwise be supported.  Use host page size for dirty bitmap.]

Fixes: fc49c9cbf2 ("vfio: Get migration capability flags for container")
Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
Message-Id: <20210304133446.1521-1-jiangkunkun@huawei.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 ...upport-host-translation-granule-size.patch | 152 ++++++++++++++++++
 1 file changed, 152 insertions(+)
 create mode 100644 vfio-Support-host-translation-granule-size.patch

diff --git a/vfio-Support-host-translation-granule-size.patch b/vfio-Support-host-translation-granule-size.patch
new file mode 100644
index 0000000..d5eab65
--- /dev/null
+++ b/vfio-Support-host-translation-granule-size.patch
@@ -0,0 +1,152 @@
+From 594cba5943b3e8bf1bd5720b1fa20d4662920ae0 Mon Sep 17 00:00:00 2001
+From: Kunkun Jiang <jiangkunkun@huawei.com>
+Date: Thu, 4 Mar 2021 21:34:46 +0800
+Subject: [PATCH] vfio: Support host translation granule size
+
+The cpu_physical_memory_set_dirty_lebitmap() can quickly deal with
+the dirty pages of memory by bitmap-traveling, regardless of whether
+the bitmap is aligned correctly or not.
+
+cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
+host page size. So it'd better to set bitmap_pgsize to host page size
+to support more translation granule sizes.
+
+[aw: The Fixes commit below introduced code to restrict migration
+support to configurations where the target page size intersects the
+host dirty page support.  For example, a 4K guest on a 4K host.
+Due to the above flexibility in bitmap handling, this restriction
+unnecessarily prevents mixed target/host pages size that could
+otherwise be supported.  Use host page size for dirty bitmap.]
+
+Fixes: fc49c9cbf2 ("vfio: Get migration capability flags for container")
+Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
+Message-Id: <20210304133446.1521-1-jiangkunkun@huawei.com>
+Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
+---
+ hw/vfio/common.c | 48 +++++++++++++++++++++++++-----------------------
+ 1 file changed, 25 insertions(+), 23 deletions(-)
+
+diff --git a/hw/vfio/common.c b/hw/vfio/common.c
+index ebd701faa0..a7817c90cc 100644
+--- a/hw/vfio/common.c
++++ b/hw/vfio/common.c
+@@ -377,7 +377,7 @@ static int vfio_dma_unmap_bitmap(VFIOContainer *container,
+ {
+     struct vfio_iommu_type1_dma_unmap *unmap;
+     struct vfio_bitmap *bitmap;
+-    uint64_t pages = TARGET_PAGE_ALIGN(size) >> TARGET_PAGE_BITS;
++    uint64_t pages = REAL_HOST_PAGE_ALIGN(size) / qemu_real_host_page_size;
+     int ret;
+ 
+     unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap));
+@@ -389,12 +389,12 @@ static int vfio_dma_unmap_bitmap(VFIOContainer *container,
+     bitmap = (struct vfio_bitmap *)&unmap->data;
+ 
+     /*
+-     * cpu_physical_memory_set_dirty_lebitmap() expects pages in bitmap of
+-     * TARGET_PAGE_SIZE to mark those dirty. Hence set bitmap_pgsize to
+-     * TARGET_PAGE_SIZE.
++     * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
++     * qemu_real_host_page_size to mark those dirty. Hence set bitmap_pgsize
++     * to qemu_real_host_page_size.
+      */
+ 
+-    bitmap->pgsize = TARGET_PAGE_SIZE;
++    bitmap->pgsize = qemu_real_host_page_size;
+     bitmap->size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) /
+                    BITS_PER_BYTE;
+ 
+@@ -672,16 +672,17 @@ static void vfio_listener_region_add(MemoryListener *listener,
+         return;
+     }
+ 
+-    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
+-                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
++    if (unlikely((section->offset_within_address_space &
++                  ~qemu_real_host_page_mask) !=
++                 (section->offset_within_region & ~qemu_real_host_page_mask))) {
+         error_report("%s received unaligned region", __func__);
+         return;
+     }
+ 
+-    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
++    iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
+     llend = int128_make64(section->offset_within_address_space);
+     llend = int128_add(llend, section->size);
+-    llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));
++    llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask));
+ 
+     if (int128_ge(int128_make64(iova), llend)) {
+         return;
+@@ -866,8 +867,9 @@ static void vfio_listener_region_del(MemoryListener *listener,
+         return;
+     }
+ 
+-    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
+-                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
++    if (unlikely((section->offset_within_address_space &
++                  ~qemu_real_host_page_mask) !=
++                 (section->offset_within_region & ~qemu_real_host_page_mask))) {
+         error_report("%s received unaligned region", __func__);
+         return;
+     }
+@@ -895,10 +897,10 @@ static void vfio_listener_region_del(MemoryListener *listener,
+          */
+     }
+ 
+-    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
++    iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
+     llend = int128_make64(section->offset_within_address_space);
+     llend = int128_add(llend, section->size);
+-    llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));
++    llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask));
+ 
+     if (int128_ge(int128_make64(iova), llend)) {
+         return;
+@@ -967,13 +969,13 @@ static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
+     range->size = size;
+ 
+     /*
+-     * cpu_physical_memory_set_dirty_lebitmap() expects pages in bitmap of
+-     * TARGET_PAGE_SIZE to mark those dirty. Hence set bitmap's pgsize to
+-     * TARGET_PAGE_SIZE.
++     * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
++     * qemu_real_host_page_size to mark those dirty. Hence set bitmap's pgsize
++     * to qemu_real_host_page_size.
+      */
+-    range->bitmap.pgsize = TARGET_PAGE_SIZE;
++    range->bitmap.pgsize = qemu_real_host_page_size;
+ 
+-    pages = TARGET_PAGE_ALIGN(range->size) >> TARGET_PAGE_BITS;
++    pages = REAL_HOST_PAGE_ALIGN(range->size) / qemu_real_host_page_size;
+     range->bitmap.size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) /
+                                          BITS_PER_BYTE;
+     range->bitmap.data = g_try_malloc0(range->bitmap.size);
+@@ -1077,8 +1079,8 @@ static int vfio_sync_dirty_bitmap(VFIOContainer *container,
+                section->offset_within_region;
+ 
+     return vfio_get_dirty_bitmap(container,
+-                       TARGET_PAGE_ALIGN(section->offset_within_address_space),
+-                       int128_get64(section->size), ram_addr);
++                   REAL_HOST_PAGE_ALIGN(section->offset_within_address_space),
++                   int128_get64(section->size), ram_addr);
+ }
+ 
+ static void vfio_listener_log_sync(MemoryListener *listener,
+@@ -1572,10 +1574,10 @@ static void vfio_get_iommu_info_migration(VFIOContainer *container,
+                             header);
+ 
+     /*
+-     * cpu_physical_memory_set_dirty_lebitmap() expects pages in bitmap of
+-     * TARGET_PAGE_SIZE to mark those dirty.
++     * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
++     * qemu_real_host_page_size to mark those dirty.
+      */
+-    if (cap_mig->pgsize_bitmap & TARGET_PAGE_SIZE) {
++    if (cap_mig->pgsize_bitmap & qemu_real_host_page_size) {
+         container->dirty_pages_supported = true;
+         container->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size;
+         container->dirty_pgsizes = cap_mig->pgsize_bitmap;
+-- 
+2.27.0
+
-- 
Gitee


From 6d1ad1532952c7d9f9a9e0973d4b159473c63f22 Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Tue, 9 Mar 2021 11:19:13 +0800
Subject: [PATCH 02/48] vfio/migrate: Move switch of dirty tracking into
 vfio_memory_listener

For now the switch of vfio dirty page tracking is integrated into
@vfio_save_handler. The reason is that some PCI vendor drivers may
start tracking dirty pages based on the _SAVING state of the device, so
if dirty tracking is started before the device state is set, vfio will
report full-dirty to QEMU.

However, the dirty bitmaps of all ramblocks are fully set when RAM
saving is set up, so it does not matter whether the device is in the
_SAVING state when vfio dirty tracking starts.

Moreover, this logic causes some problems [1]. The object of dirty
tracking is guest memory, but the object of @vfio_save_handler is
device state, which produces unnecessary coupling and conflicts:

1. Coupling: Their saving granule is different (perVM vs perDevice).
   vfio will enable dirty_page_tracking for each device, although
   enabling it once is enough.

2. Conflicts: ram_save_setup() traverses all memory_listeners
   and executes their log_start() and log_sync() hooks to get the
   first-round dirty bitmap, which is used by the bulk stage of
   RAM saving. However, as vfio dirty tracking has not yet been
   started, it cannot get the dirty bitmap from vfio, so we lose
   the chance to handle vfio dirty pages in the bulk stage.

Moving the switch of vfio dirty_page_tracking into vfio_memory_listener
solves the above problems. In addition, do not require devices to be in
the SAVING state for vfio_sync_dirty_bitmap().

[1] https://www.spinics.net/lists/kvm/msg229967.html

Reported-by: Zenghui Yu <yuzenghui@huawei.com>
Signed-off-by: Keqian Zhu <zhukeqian1@huawei.com>
Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Message-Id: <20210309031913.11508-1-zhukeqian1@huawei.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
---
 ...e-switch-of-dirty-tracking-into-vfio.patch | 196 ++++++++++++++++++
 1 file changed, 196 insertions(+)
 create mode 100644 vfio-migrate-Move-switch-of-dirty-tracking-into-vfio.patch

diff --git a/vfio-migrate-Move-switch-of-dirty-tracking-into-vfio.patch b/vfio-migrate-Move-switch-of-dirty-tracking-into-vfio.patch
new file mode 100644
index 0000000..5f543b4
--- /dev/null
+++ b/vfio-migrate-Move-switch-of-dirty-tracking-into-vfio.patch
@@ -0,0 +1,196 @@
+From 74b651428e6ed65177354d80bd888e842a4a5077 Mon Sep 17 00:00:00 2001
+From: Keqian Zhu <zhukeqian1@huawei.com>
+Date: Tue, 9 Mar 2021 11:19:13 +0800
+Subject: [PATCH] vfio/migrate: Move switch of dirty tracking into
+ vfio_memory_listener
+
+For now the switch of vfio dirty page tracking is integrated into
+@vfio_save_handler. The reason is that some PCI vendor driver may
+start to track dirty base on _SAVING state of device, so if dirty
+tracking is started before setting device state, vfio will report
+full-dirty to QEMU.
+
+However, the dirty bmap of all ramblocks are fully set when setup
+ram saving, so it's not matter whether the device is in _SAVING
+state when start vfio dirty tracking.
+
+Moreover, this logic causes some problems [1]. The object of dirty
+tracking is guest memory, but the object of @vfio_save_handler is
+device state, which produces unnecessary coupling and conflicts:
+
+1. Coupling: Their saving granule is different (perVM vs perDevice).
+   vfio will enable dirty_page_tracking for each devices, actually
+   once is enough.
+
+2. Conflicts: The ram_save_setup() traverses all memory_listeners
+   to execute their log_start() and log_sync() hooks to get the
+   first round dirty bitmap, which is used by the bulk stage of
+   ram saving. However, as vfio dirty tracking is not yet started,
+   it can't get dirty bitmap from vfio. Then we give up the chance
+   to handle vfio dirty page at bulk stage.
+
+Move the switch of vfio dirty_page_tracking into vfio_memory_listener
+can solve above problems. Besides, Do not require devices in SAVING
+state for vfio_sync_dirty_bitmap().
+
+[1] https://www.spinics.net/lists/kvm/msg229967.html
+
+Reported-by: Zenghui Yu <yuzenghui@huawei.com>
+Signed-off-by: Keqian Zhu <zhukeqian1@huawei.com>
+Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
+Message-Id: <20210309031913.11508-1-zhukeqian1@huawei.com>
+Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
+Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
+---
+ hw/vfio/common.c    | 49 ++++++++++++++++++++++++++++++++++++---------
+ hw/vfio/migration.c | 35 --------------------------------
+ 2 files changed, 40 insertions(+), 44 deletions(-)
+
+diff --git a/hw/vfio/common.c b/hw/vfio/common.c
+index a7817c90cc..245e32df5b 100644
+--- a/hw/vfio/common.c
++++ b/hw/vfio/common.c
+@@ -310,7 +310,7 @@ bool vfio_mig_active(void)
+     return true;
+ }
+ 
+-static bool vfio_devices_all_saving(VFIOContainer *container)
++static bool vfio_devices_all_dirty_tracking(VFIOContainer *container)
+ {
+     VFIOGroup *group;
+     VFIODevice *vbasedev;
+@@ -328,13 +328,8 @@ static bool vfio_devices_all_saving(VFIOContainer *container)
+                 return false;
+             }
+ 
+-            if (migration->device_state & VFIO_DEVICE_STATE_SAVING) {
+-                if ((vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF)
+-                    && (migration->device_state & VFIO_DEVICE_STATE_RUNNING)) {
+-                        return false;
+-                }
+-                continue;
+-            } else {
++            if ((vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF)
++                && (migration->device_state & VFIO_DEVICE_STATE_RUNNING)) {
+                 return false;
+             }
+         }
+@@ -952,6 +947,40 @@ static void vfio_listener_region_del(MemoryListener *listener,
+     }
+ }
+ 
++static void vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
++{
++    int ret;
++    struct vfio_iommu_type1_dirty_bitmap dirty = {
++        .argsz = sizeof(dirty),
++    };
++
++    if (start) {
++        dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START;
++    } else {
++        dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP;
++    }
++
++    ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, &dirty);
++    if (ret) {
++        error_report("Failed to set dirty tracking flag 0x%x errno: %d",
++                     dirty.flags, errno);
++    }
++}
++
++static void vfio_listener_log_global_start(MemoryListener *listener)
++{
++    VFIOContainer *container = container_of(listener, VFIOContainer, listener);
++
++    vfio_set_dirty_page_tracking(container, true);
++}
++
++static void vfio_listener_log_global_stop(MemoryListener *listener)
++{
++    VFIOContainer *container = container_of(listener, VFIOContainer, listener);
++
++    vfio_set_dirty_page_tracking(container, false);
++}
++
+ static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
+                                  uint64_t size, ram_addr_t ram_addr)
+ {
+@@ -1093,7 +1122,7 @@ static void vfio_listener_log_sync(MemoryListener *listener,
+         return;
+     }
+ 
+-    if (vfio_devices_all_saving(container)) {
++    if (vfio_devices_all_dirty_tracking(container)) {
+         vfio_sync_dirty_bitmap(container, section);
+     }
+ }
+@@ -1101,6 +1130,8 @@ static void vfio_listener_log_sync(MemoryListener *listener,
+ static const MemoryListener vfio_memory_listener = {
+     .region_add = vfio_listener_region_add,
+     .region_del = vfio_listener_region_del,
++    .log_global_start = vfio_listener_log_global_start,
++    .log_global_stop = vfio_listener_log_global_stop,
+     .log_sync = vfio_listener_log_sync,
+ };
+ 
+diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
+index 033cb2b0c9..f1f006d584 100644
+--- a/hw/vfio/migration.c
++++ b/hw/vfio/migration.c
+@@ -395,40 +395,10 @@ static int vfio_load_device_config_state(QEMUFile *f, void *opaque)
+     return qemu_file_get_error(f);
+ }
+ 
+-static int vfio_set_dirty_page_tracking(VFIODevice *vbasedev, bool start)
+-{
+-    int ret;
+-    VFIOMigration *migration = vbasedev->migration;
+-    VFIOContainer *container = vbasedev->group->container;
+-    struct vfio_iommu_type1_dirty_bitmap dirty = {
+-        .argsz = sizeof(dirty),
+-    };
+-
+-    if (start) {
+-        if (migration->device_state & VFIO_DEVICE_STATE_SAVING) {
+-            dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START;
+-        } else {
+-            return -EINVAL;
+-        }
+-    } else {
+-            dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP;
+-    }
+-
+-    ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, &dirty);
+-    if (ret) {
+-        error_report("Failed to set dirty tracking flag 0x%x errno: %d",
+-                     dirty.flags, errno);
+-        return -errno;
+-    }
+-    return ret;
+-}
+-
+ static void vfio_migration_cleanup(VFIODevice *vbasedev)
+ {
+     VFIOMigration *migration = vbasedev->migration;
+ 
+-    vfio_set_dirty_page_tracking(vbasedev, false);
+-
+     if (migration->region.mmaps) {
+         vfio_region_unmap(&migration->region);
+     }
+@@ -469,11 +439,6 @@ static int vfio_save_setup(QEMUFile *f, void *opaque)
+         return ret;
+     }
+ 
+-    ret = vfio_set_dirty_page_tracking(vbasedev, true);
+-    if (ret) {
+-        return ret;
+-    }
+-
+     qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
+ 
+     ret = qemu_file_get_error(f);
+-- 
+2.27.0
+
-- 
Gitee


From d5a71c239a363ae9c9542e36be2b114b2791c2fe Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Thu, 27 May 2021 20:31:01 +0800
Subject: [PATCH 03/48] vfio: Fix unregister SaveVMHandler in
 vfio_migration_finalize

In vfio_migration_init(), the SaveVMHandler is registered for the
VFIO device, but the matching 'unregister' operation is missing. This
leads to 'Segmentation fault (core dumped)' in
qemu_savevm_state_setup() if live migration is performed after a
VFIO device has been hot deleted.

Fixes: cd5b58f2ba (vfio: Register SaveVMHandlers for VFIO device)
Reported-by: Qixin Gan <ganqixin@huawei.com>
Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
Message-Id: <20210527123101.289-1-jiangkunkun@huawei.com>
Reviewed-by: Kirti Wankhede <kwankhede@nvidia.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 ...ter-SaveVMHandler-in-vfio_migration_.patch | 36 +++++++++++++++++++
 1 file changed, 36 insertions(+)
 create mode 100644 vfio-Fix-unregister-SaveVMHandler-in-vfio_migration_.patch

diff --git a/vfio-Fix-unregister-SaveVMHandler-in-vfio_migration_.patch b/vfio-Fix-unregister-SaveVMHandler-in-vfio_migration_.patch
new file mode 100644
index 0000000..47d5992
--- /dev/null
+++ b/vfio-Fix-unregister-SaveVMHandler-in-vfio_migration_.patch
@@ -0,0 +1,36 @@
+From 8dc6e7ccc5712aee457ffb1f6cf1bf3f80e778d5 Mon Sep 17 00:00:00 2001
+From: Kunkun Jiang <jiangkunkun@huawei.com>
+Date: Thu, 27 May 2021 20:31:01 +0800
+Subject: [PATCH] vfio: Fix unregister SaveVMHandler in vfio_migration_finalize
+
+In the vfio_migration_init(), the SaveVMHandler is registered for
+VFIO device. But it lacks the operation of 'unregister'. It will
+lead to 'Segmentation fault (core dumped)' in
+qemu_savevm_state_setup(), if performing live migration after a
+VFIO device is hot deleted.
+
+Fixes: cd5b58f2ba (vfio: Register SaveVMHandlers for VFIO device)
+Reported-by: Qixin Gan <ganqixin@huawei.com>
+Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
+Message-Id: <20210527123101.289-1-jiangkunkun@huawei.com>
+Reviewed by: Kirti Wankhede <kwankhede@nvidia.com>
+Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
+---
+ hw/vfio/migration.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
+index f1f006d584..d9e0e12824 100644
+--- a/hw/vfio/migration.c
++++ b/hw/vfio/migration.c
+@@ -893,6 +893,7 @@ void vfio_migration_finalize(VFIODevice *vbasedev)
+ 
+         remove_migration_state_change_notifier(&migration->migration_state);
+         qemu_del_vm_change_state_handler(migration->vm_state);
++        unregister_savevm(vbasedev->dev, "vfio", vbasedev);
+         vfio_migration_exit(vbasedev);
+     }
+ 
+-- 
+2.27.0
+
-- 
Gitee


From c9d492032d96334c776fc84232e1e2f2c550ba50 Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Tue, 16 Mar 2021 20:57:15 +0800
Subject: [PATCH 04/48] migration/ram: Reduce unnecessary rate limiting

When the host page is a huge page and something is sent in the
current iteration, migration_rate_limit() should be executed.
If not, it can be omitted.

Signed-off-by: Keqian Zhu <zhukeqian1@huawei.com>
Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
Reviewed-by: David Edmondson <david.edmondson@oracle.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Message-Id: <20210316125716.1243-2-jiangkunkun@huawei.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
 ...ram-Reduce-unnecessary-rate-limiting.patch | 42 +++++++++++++++++++
 1 file changed, 42 insertions(+)
 create mode 100644 migration-ram-Reduce-unnecessary-rate-limiting.patch

diff --git a/migration-ram-Reduce-unnecessary-rate-limiting.patch b/migration-ram-Reduce-unnecessary-rate-limiting.patch
new file mode 100644
index 0000000..64374dd
--- /dev/null
+++ b/migration-ram-Reduce-unnecessary-rate-limiting.patch
@@ -0,0 +1,42 @@
+From 338d691c985ad5b3624ef36e4beaac82982c8f0a Mon Sep 17 00:00:00 2001
+From: Kunkun Jiang <jiangkunkun@huawei.com>
+Date: Tue, 16 Mar 2021 20:57:15 +0800
+Subject: [PATCH] migration/ram: Reduce unnecessary rate limiting
+
+When the host page is a huge page and something is sent in the
+current iteration, migration_rate_limit() should be executed.
+If not, it can be omitted.
+
+Signed-off-by: Keqian Zhu <zhukeqian1@huawei.com>
+Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
+Reviewed-by: David Edmondson <david.edmondson@oracle.com>
+Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
+Message-Id: <20210316125716.1243-2-jiangkunkun@huawei.com>
+Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
+---
+ migration/ram.c | 9 +++++++--
+ 1 file changed, 7 insertions(+), 2 deletions(-)
+
+diff --git a/migration/ram.c b/migration/ram.c
+index 2077ba5be4..22063e00b4 100644
+--- a/migration/ram.c
++++ b/migration/ram.c
+@@ -3076,8 +3076,13 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
+         }
+ 
+         pss->page++;
+-        /* Allow rate limiting to happen in the middle of huge pages */
+-        migration_rate_limit();
++        /*
++         * Allow rate limiting to happen in the middle of huge pages if
++         * something is sent in the current iteration.
++         */
++        if (pagesize_bits > 1 && tmppages > 0) {
++            migration_rate_limit();
++        }
+     } while ((pss->page & (pagesize_bits - 1)) &&
+              offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
+ 
+-- 
+2.27.0
+
-- 
Gitee


From 8c2655d6250bdebde8cf800d5f6fec7291f6cd11 Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Tue, 16 Mar 2021 20:57:16 +0800
Subject: [PATCH 05/48] migration/ram: Optimize ram_save_host_page()

Starting from pss->page, ram_save_host_page() checks every page
and sends the dirty pages up to the end of the current host page or
the boundary of used_length of the block. If the host page is a huge
page, the "check" step takes a lot of time.

Using migration_bitmap_find_dirty() instead improves performance.

Tested on Kunpeng 920; VM parameters: 1U 4G (page size 1G)
The time of ram_save_host_page() in the last round of ram saving:
before optimization: 9250us, after optimization: 34us

Signed-off-by: Keqian Zhu <zhukeqian1@huawei.com>
Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Message-Id: <20210316125716.1243-3-jiangkunkun@huawei.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
 ...tion-ram-Optimize-ram_save_host_page.patch | 95 +++++++++++++++++++
 1 file changed, 95 insertions(+)
 create mode 100644 migration-ram-Optimize-ram_save_host_page.patch

diff --git a/migration-ram-Optimize-ram_save_host_page.patch b/migration-ram-Optimize-ram_save_host_page.patch
new file mode 100644
index 0000000..c58a6dc
--- /dev/null
+++ b/migration-ram-Optimize-ram_save_host_page.patch
@@ -0,0 +1,95 @@
+From ae1a8506aa45266f2bf77a8d428f5ccd970a9b13 Mon Sep 17 00:00:00 2001
+From: Kunkun Jiang <jiangkunkun@huawei.com>
+Date: Tue, 16 Mar 2021 20:57:16 +0800
+Subject: [PATCH] migration/ram: Optimize ram_save_host_page()
+
+Starting from pss->page, ram_save_host_page() will check every page
+and send the dirty pages up to the end of the current host page or
+the boundary of used_length of the block. If the host page size is
+a huge page, the step "check" will take a lot of time.
+
+It will improve performance to use migration_bitmap_find_dirty().
+
+Tested on Kunpeng 920; VM parameters: 1U 4G (page size 1G)
+The time of ram_save_host_page() in the last round of ram saving:
+before optimize: 9250us		after optimize: 34us
+
+Signed-off-by: Keqian Zhu <zhukeqian1@huawei.com>
+Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
+Reviewed-by: Peter Xu <peterx@redhat.com>
+Message-Id: <20210316125716.1243-3-jiangkunkun@huawei.com>
+Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
+---
+ migration/ram.c | 43 +++++++++++++++++++++----------------------
+ 1 file changed, 21 insertions(+), 22 deletions(-)
+
+diff --git a/migration/ram.c b/migration/ram.c
+index 22063e00b4..1bd99ff9e5 100644
+--- a/migration/ram.c
++++ b/migration/ram.c
+@@ -3052,6 +3052,8 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
+     int tmppages, pages = 0;
+     size_t pagesize_bits =
+         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
++    unsigned long hostpage_boundary =
++        QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
+ 
+     if (ramblock_is_ignored(pss->block)) {
+         error_report("block %s should not be migrated !", pss->block->idstr);
+@@ -3060,34 +3062,31 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
+ 
+     do {
+         /* Check the pages is dirty and if it is send it */
+-        if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
+-            pss->page++;
+-            continue;
+-        }
+-
+-        tmppages = ram_save_target_page(rs, pss, last_stage);
+-        if (tmppages < 0) {
+-            return tmppages;
+-        }
++        if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
++            tmppages = ram_save_target_page(rs, pss, last_stage);
++            if (tmppages < 0) {
++                return tmppages;
++            }
+ 
+-        pages += tmppages;
+-        if (pss->block->unsentmap) {
+-            clear_bit(pss->page, pss->block->unsentmap);
+-        }
++            pages += tmppages;
++            if (pss->block->unsentmap) {
++                clear_bit(pss->page, pss->block->unsentmap);
++            }
+ 
+-        pss->page++;
+-        /*
+-         * Allow rate limiting to happen in the middle of huge pages if
+-         * something is sent in the current iteration.
+-         */
+-        if (pagesize_bits > 1 && tmppages > 0) {
+-            migration_rate_limit();
++            /*
++             * Allow rate limiting to happen in the middle of huge pages if
++             * something is sent in the current iteration.
++             */
++            if (pagesize_bits > 1 && tmppages > 0) {
++                migration_rate_limit();
++            }
+         }
++        pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
+     } while ((pss->page & (pagesize_bits - 1)) &&
+              offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
+ 
+-    /* The offset we leave with is the last one we looked at */
+-    pss->page--;
++    /* The offset we leave with is the min boundary of host page and block */
++    pss->page = MIN(pss->page, hostpage_boundary) - 1;
+     return pages;
+ }
+ 
+-- 
+2.27.0
+
-- 
Gitee


From d6e16086dbcb5e55a793d6e05f660524c7845a8e Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Thu, 29 Jul 2021 15:24:48 +0800
Subject: [PATCH 06/48] qdev/monitors: Fix redundant error_setg of
 qdev_device_add

There is an extra "error_setg" call in qdev_device_add(). When
hot-plugging a device, if the corresponding bus doesn't exist, it
will trigger the assertion "assert(*errp == NULL)".

Fixes: 515a7970490 (log: Add some logs on VM runtime path)
Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
---
 ...x-reundant-error_setg-of-qdev_add_de.patch | 30 +++++++++++++++++++
 1 file changed, 30 insertions(+)
 create mode 100644 qdev-monitors-Fix-reundant-error_setg-of-qdev_add_de.patch

diff --git a/qdev-monitors-Fix-reundant-error_setg-of-qdev_add_de.patch b/qdev-monitors-Fix-reundant-error_setg-of-qdev_add_de.patch
new file mode 100644
index 0000000..85467e8
--- /dev/null
+++ b/qdev-monitors-Fix-reundant-error_setg-of-qdev_add_de.patch
@@ -0,0 +1,30 @@
+From 4f1396f9e173a24f78204b8849c209100499d639 Mon Sep 17 00:00:00 2001
+From: Kunkun Jiang <jiangkunkun@huawei.com>
+Date: Thu, 29 Jul 2021 15:24:48 +0800
+Subject: [PATCH] qdev/monitors: Fix reundant error_setg of qdev_add_device
+
+There is an extra log "error_setg" in qdev_add_device(). When
+hot-plug a device, if the corresponding bus doesn't exist, it
+will trigger an asseration "assert(*errp == NULL)".
+
+Fixes: 515a7970490 (log: Add some logs on VM runtime path)
+Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
+---
+ qdev-monitor.c | 1 -
+ 1 file changed, 1 deletion(-)
+
+diff --git a/qdev-monitor.c b/qdev-monitor.c
+index c6c1d3f06a..ab2bdef105 100644
+--- a/qdev-monitor.c
++++ b/qdev-monitor.c
+@@ -587,7 +587,6 @@ DeviceState *qdev_device_add(QemuOpts *opts, Error **errp)
+     if (path != NULL) {
+         bus = qbus_find(path, errp);
+         if (!bus) {
+-            error_setg(errp, "can not find bus for %s", driver);
+             return NULL;
+         }
+         if (!object_dynamic_cast(OBJECT(bus), dc->bus_type)) {
+-- 
+2.27.0
+
-- 
Gitee


From 18c2252e32359be69a29c282c22aa2ea078a9d86 Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Sat, 8 May 2021 17:31:03 +0800
Subject: [PATCH 07/48] linux-headers: update against 5.10 and manual clear
 vfio dirty log series

The new capability VFIO_DIRTY_LOG_MANUAL_CLEAR and the new ioctl
flags VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR and
VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP have been introduced in
the kernel; update the header to add them.

Signed-off-by: Zenghui Yu <yuzenghui@huawei.com>
Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
---
 ...date-against-5.10-and-manual-clear-v.patch | 90 +++++++++++++++++++
 1 file changed, 90 insertions(+)
 create mode 100644 linux-headers-update-against-5.10-and-manual-clear-v.patch

diff --git a/linux-headers-update-against-5.10-and-manual-clear-v.patch b/linux-headers-update-against-5.10-and-manual-clear-v.patch
new file mode 100644
index 0000000..0315fc2
--- /dev/null
+++ b/linux-headers-update-against-5.10-and-manual-clear-v.patch
@@ -0,0 +1,90 @@
+From 79efeccd41d761b68946df68e5431eff399ccbd5 Mon Sep 17 00:00:00 2001
+From: Zenghui Yu <yuzenghui@huawei.com>
+Date: Sat, 8 May 2021 17:31:03 +0800
+Subject: [PATCH] linux-headers: update against 5.10 and manual clear vfio
+ dirty log series
+
+The new capability VFIO_DIRTY_LOG_MANUAL_CLEAR and the new ioctl
+VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR and
+VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP have been introduced in
+the kernel, update the header to add them.
+
+Signed-off-by: Zenghui Yu <yuzenghui@huawei.com>
+Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
+---
+ linux-headers/linux/vfio.h | 37 ++++++++++++++++++++++++++++++++++++-
+ 1 file changed, 36 insertions(+), 1 deletion(-)
+
+diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
+index a90672494d..120387ba58 100644
+--- a/linux-headers/linux/vfio.h
++++ b/linux-headers/linux/vfio.h
+@@ -46,6 +46,16 @@
+  */
+ #define VFIO_NOIOMMU_IOMMU		8
+ 
++/*
++ * The vfio_iommu driver may support user clears dirty log manually, which means
++ * dirty log can be requested to not cleared automatically after dirty log is
++ * copied to userspace, it's user's duty to clear dirty log.
++ *
++ * Note: please refer to VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR and
++ * VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP.
++ */
++#define VFIO_DIRTY_LOG_MANUAL_CLEAR	11
++
+ /*
+  * The IOCTL interface is designed for extensibility by embedding the
+  * structure length (argsz) and flags into structures passed between
+@@ -1074,6 +1084,7 @@ struct vfio_bitmap {
+  * field.  No guarantee is made to the user that arbitrary unmaps of iova
+  * or size different from those used in the original mapping call will
+  * succeed.
++ *
+  * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP should be set to get the dirty bitmap
+  * before unmapping IO virtual addresses. When this flag is set, the user must
+  * provide a struct vfio_bitmap in data[]. User must provide zero-allocated
+@@ -1133,8 +1144,30 @@ struct vfio_iommu_type1_dma_unmap {
+  * actual bitmap. If dirty pages logging is not enabled, an error will be
+  * returned.
+  *
+- * Only one of the flags _START, _STOP and _GET may be specified at a time.
++ * The VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR flag is almost same as
++ * VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP, except that it requires underlying
++ * dirty bitmap is not cleared automatically. The user can clear it manually by
++ * calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP flag set.
+  *
++ * Calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP flag set,
++ * instructs the IOMMU driver to clear the dirty status of pages in a bitmap
++ * for IOMMU container for a given IOVA range. The user must specify the IOVA
++ * range, the bitmap and the pgsize through the structure
++ * vfio_iommu_type1_dirty_bitmap_get in the data[] portion. This interface
++ * supports clearing a bitmap of the smallest supported pgsize only and can be
++ * modified in future to clear a bitmap of any specified supported pgsize. The
++ * user must provide a memory area for the bitmap memory and specify its size
++ * in bitmap.size. One bit is used to represent one page consecutively starting
++ * from iova offset. The user should provide page size in bitmap.pgsize field.
++ * A bit set in the bitmap indicates that the page at that offset from iova is
++ * cleared the dirty status, and dirty tracking is re-enabled for that page. The
++ * caller must set argsz to a value including the size of structure
++ * vfio_iommu_dirty_bitmap_get, but excluding the size of the actual bitmap. If
++ * dirty pages logging is not enabled, an error will be returned. Note: user
++ * should clear dirty log before handling corresponding dirty pages.
++ *
++ * Only one of the flags _START, _STOP, _GET, _GET_NOCLEAR, and _CLEAR may be
++ * specified at a time.
+  */
+ struct vfio_iommu_type1_dirty_bitmap {
+ 	__u32        argsz;
+@@ -1142,6 +1175,8 @@ struct vfio_iommu_type1_dirty_bitmap {
+ #define VFIO_IOMMU_DIRTY_PAGES_FLAG_START	(1 << 0)
+ #define VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP	(1 << 1)
+ #define VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP	(1 << 2)
++#define VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR	(1 << 3)
++#define VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP	(1 << 4)
+ 	__u8         data[];
+ };
+ 
+-- 
+2.27.0
+
-- 
Gitee


From be1ff354a7ec82d114912f301bd5b9e079065a41 Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Sat, 8 May 2021 17:31:04 +0800
Subject: [PATCH 08/48] vfio: Maintain DMA mapping range for the container

When synchronizing dirty bitmap from kernel VFIO we do it in a
per-iova-range fashion and we allocate the userspace bitmap for each of the
ioctl. This patch introduces `struct VFIODMARange` to describe a range of
the given DMA mapping with respect to a VFIO_IOMMU_MAP_DMA operation, and
make the bitmap cache of this range be persistent so that we don't need to
g_try_malloc0() every time. Note that the new structure is almost a copy of
`struct vfio_iommu_type1_dma_map` but only internally used by QEMU.

More importantly, the cached per-iova-range dirty bitmap will be further
used when we want to add support for the CLEAR_BITMAP and this cached
bitmap will be used to guarantee we don't clear any unknown dirty bits
otherwise that can be a severe data loss issue for migration code.

It's pretty intuitive to maintain a bitmap per container since we perform
log_sync at this granule. But I don't know how to deal with things like
memory hot-{un}plug, sparse DMA mappings, etc. Suggestions welcome.

* yet something to-do:
  - can't work with guest viommu
  - no locks
  - etc

[ The idea and even the commit message are largely inherited from kvm side.
  See commit 9f4bf4baa8b820c7930e23c9566c9493db7e1d25. ]

Signed-off-by: Zenghui Yu <yuzenghui@huawei.com>
Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
---
 ...-DMA-mapping-range-for-the-container.patch | 191 ++++++++++++++++++
 1 file changed, 191 insertions(+)
 create mode 100644 vfio-Maintain-DMA-mapping-range-for-the-container.patch

diff --git a/vfio-Maintain-DMA-mapping-range-for-the-container.patch b/vfio-Maintain-DMA-mapping-range-for-the-container.patch
new file mode 100644
index 0000000..901a5e3
--- /dev/null
+++ b/vfio-Maintain-DMA-mapping-range-for-the-container.patch
@@ -0,0 +1,191 @@
+From 90a6a1ec65d55d27faf79341b2dd9418d99da187 Mon Sep 17 00:00:00 2001
+From: Zenghui Yu <yuzenghui@huawei.com>
+Date: Sat, 8 May 2021 17:31:04 +0800
+Subject: [PATCH] vfio: Maintain DMA mapping range for the container
+
+When synchronizing dirty bitmap from kernel VFIO we do it in a
+per-iova-range fashion and we allocate the userspace bitmap for each of the
+ioctl. This patch introduces `struct VFIODMARange` to describe a range of
+the given DMA mapping with respect to a VFIO_IOMMU_MAP_DMA operation, and
+make the bitmap cache of this range be persistent so that we don't need to
+g_try_malloc0() every time. Note that the new structure is almost a copy of
+`struct vfio_iommu_type1_dma_map` but only internally used by QEMU.
+
+More importantly, the cached per-iova-range dirty bitmap will be further
+used when we want to add support for the CLEAR_BITMAP and this cached
+bitmap will be used to guarantee we don't clear any unknown dirty bits
+otherwise that can be a severe data loss issue for migration code.
+
+It's pretty intuitive to maintain a bitmap per container since we perform
+log_sync at this granule. But I don't know how to deal with things like
+memory hot-{un}plug, sparse DMA mappings, etc. Suggestions welcome.
+
+* yet something to-do:
+  - can't work with guest viommu
+  - no locks
+  - etc
+
+[ The idea and even the commit message are largely inherited from kvm side.
+  See commit 9f4bf4baa8b820c7930e23c9566c9493db7e1d25. ]
+
+Signed-off-by: Zenghui Yu <yuzenghui@huawei.com>
+Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
+---
+ hw/vfio/common.c              | 62 +++++++++++++++++++++++++++++++----
+ include/hw/vfio/vfio-common.h |  9 +++++
+ 2 files changed, 65 insertions(+), 6 deletions(-)
+
+diff --git a/hw/vfio/common.c b/hw/vfio/common.c
+index 245e32df5b..c33c4c539d 100644
+--- a/hw/vfio/common.c
++++ b/hw/vfio/common.c
+@@ -420,6 +420,29 @@ unmap_exit:
+     return ret;
+ }
+ 
++static VFIODMARange *vfio_lookup_match_range(VFIOContainer *container,
++        hwaddr start_addr, hwaddr size)
++{
++    VFIODMARange *qrange;
++
++    QLIST_FOREACH(qrange, &container->dma_list, next) {
++        if (qrange->iova == start_addr && qrange->size == size) {
++            return qrange;
++        }
++    }
++    return NULL;
++}
++
++static void vfio_dma_range_init_dirty_bitmap(VFIODMARange *qrange)
++{
++    uint64_t pages, size;
++
++    pages = REAL_HOST_PAGE_ALIGN(qrange->size) / qemu_real_host_page_size;
++    size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) / BITS_PER_BYTE;
++
++    qrange->bitmap = g_malloc0(size);
++}
++
+ /*
+  * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
+  */
+@@ -433,12 +456,29 @@ static int vfio_dma_unmap(VFIOContainer *container,
+         .iova = iova,
+         .size = size,
+     };
++    VFIODMARange *qrange;
+ 
+     if (iotlb && container->dirty_pages_supported &&
+         vfio_devices_all_running_and_saving(container)) {
+         return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
+     }
+ 
++    /*
++     * unregister the DMA range
++     *
++     * It seems that the memory layer will give us the same section as the one
++     * used in region_add(). Otherwise it'll be complicated to manipulate the
++     * bitmap across region_{add,del}. Is there any guarantee?
++     *
++     * But there is really not such a restriction on the kernel interface
++     * (VFIO_IOMMU_DIRTY_PAGES_FLAG_{UN}MAP_DMA, etc).
++     */
++    qrange = vfio_lookup_match_range(container, iova, size);
++    assert(qrange);
++    g_free(qrange->bitmap);
++    QLIST_REMOVE(qrange, next);
++    g_free(qrange);
++
+     while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
+         /*
+          * The type1 backend has an off-by-one bug in the kernel (71a7d3d78e3c
+@@ -475,6 +515,14 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
+         .iova = iova,
+         .size = size,
+     };
++    VFIODMARange *qrange;
++
++    qrange = g_malloc0(sizeof(*qrange));
++    qrange->iova = iova;
++    qrange->size = size;
++    QLIST_INSERT_HEAD(&container->dma_list, qrange, next);
++    /* XXX allocate the dirty bitmap on demand */
++    vfio_dma_range_init_dirty_bitmap(qrange);
+ 
+     if (!readonly) {
+         map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
+@@ -986,9 +1034,14 @@ static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
+ {
+     struct vfio_iommu_type1_dirty_bitmap *dbitmap;
+     struct vfio_iommu_type1_dirty_bitmap_get *range;
++    VFIODMARange *qrange;
+     uint64_t pages;
+     int ret;
+ 
++    qrange = vfio_lookup_match_range(container, iova, size);
++    /* the same as vfio_dma_unmap() */
++    assert(qrange);
++
+     dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range));
+ 
+     dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range);
+@@ -1007,11 +1060,8 @@ static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
+     pages = REAL_HOST_PAGE_ALIGN(range->size) / qemu_real_host_page_size;
+     range->bitmap.size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) /
+                                          BITS_PER_BYTE;
+-    range->bitmap.data = g_try_malloc0(range->bitmap.size);
+-    if (!range->bitmap.data) {
+-        ret = -ENOMEM;
+-        goto err_out;
+-    }
++
++    range->bitmap.data = (__u64 *)qrange->bitmap;
+ 
+     ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap);
+     if (ret) {
+@@ -1027,7 +1077,6 @@ static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
+     trace_vfio_get_dirty_bitmap(container->fd, range->iova, range->size,
+                                 range->bitmap.size, ram_addr);
+ err_out:
+-    g_free(range->bitmap.data);
+     g_free(dbitmap);
+ 
+     return ret;
+@@ -1681,6 +1730,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
+     container->dirty_pages_supported = false;
+     QLIST_INIT(&container->giommu_list);
+     QLIST_INIT(&container->hostwin_list);
++    QLIST_INIT(&container->dma_list);
+ 
+     ret = vfio_init_container(container, group->fd, errp);
+     if (ret) {
+diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
+index 475aa9fb40..2853dc861e 100644
+--- a/include/hw/vfio/vfio-common.h
++++ b/include/hw/vfio/vfio-common.h
+@@ -76,6 +76,14 @@ typedef struct VFIOAddressSpace {
+ 
+ struct VFIOGroup;
+ 
++typedef struct VFIODMARange {
++    QLIST_ENTRY(VFIODMARange) next;
++    hwaddr iova;
++    size_t size;
++    void *vaddr; /* unused */
++    unsigned long *bitmap; /* dirty bitmap cache for this range */
++} VFIODMARange;
++
+ typedef struct VFIOContainer {
+     VFIOAddressSpace *space;
+     int fd; /* /dev/vfio/vfio, empowered by the attached groups */
+@@ -91,6 +99,7 @@ typedef struct VFIOContainer {
+     QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
+     QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
+     QLIST_HEAD(, VFIOGroup) group_list;
++    QLIST_HEAD(, VFIODMARange) dma_list;
+     QLIST_ENTRY(VFIOContainer) next;
+ } VFIOContainer;
+ 
+-- 
+2.27.0
+
-- 
Gitee


From c127034a6dd48156c5e0a228e19f2aff83ad7e33 Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Sat, 8 May 2021 17:31:05 +0800
Subject: [PATCH 09/48] vfio/migration: Add support for manual clear vfio dirty
 log

The new capability VFIO_DIRTY_LOG_MANUAL_CLEAR and the new ioctl
VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR and
VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP have been introduced in
the kernel, tweak the userspace side to use them.

Check if the kernel supports VFIO_DIRTY_LOG_MANUAL_CLEAR and
provide the log_clear() hook for vfio_memory_listener. If the
kernel supports it, deliver the clear message to the kernel.

Signed-off-by: Zenghui Yu <yuzenghui@huawei.com>
Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
---
 ...dd-support-for-manual-clear-vfio-dir.patch | 223 ++++++++++++++++++
 1 file changed, 223 insertions(+)
 create mode 100644 vfio-migration-Add-support-for-manual-clear-vfio-dir.patch

diff --git a/vfio-migration-Add-support-for-manual-clear-vfio-dir.patch b/vfio-migration-Add-support-for-manual-clear-vfio-dir.patch
new file mode 100644
index 0000000..c59bc4e
--- /dev/null
+++ b/vfio-migration-Add-support-for-manual-clear-vfio-dir.patch
@@ -0,0 +1,223 @@
+From f9574b63bf5e940d794db2c3aaf928bde36d9521 Mon Sep 17 00:00:00 2001
+From: Zenghui Yu <yuzenghui@huawei.com>
+Date: Sat, 8 May 2021 17:31:05 +0800
+Subject: [PATCH] vfio/migration: Add support for manual clear vfio dirty log
+
+The new capability VFIO_DIRTY_LOG_MANUAL_CLEAR and the new ioctl
+VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR and
+VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP have been introduced in
+the kernel, tweak the userspace side to use them.
+
+Check if the kernel supports VFIO_DIRTY_LOG_MANUAL_CLEAR and
+provide the log_clear() hook for vfio_memory_listener. If the
+kernel supports it, deliver the clear message to the kernel.
+
+Signed-off-by: Zenghui Yu <yuzenghui@huawei.com>
+Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
+---
+ hw/vfio/common.c              | 149 +++++++++++++++++++++++++++++++++-
+ include/hw/vfio/vfio-common.h |   1 +
+ 2 files changed, 148 insertions(+), 2 deletions(-)
+
+diff --git a/hw/vfio/common.c b/hw/vfio/common.c
+index c33c4c539d..206fb83e28 100644
+--- a/hw/vfio/common.c
++++ b/hw/vfio/common.c
+@@ -1045,7 +1045,9 @@ static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
+     dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range));
+ 
+     dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range);
+-    dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
++    dbitmap->flags = container->dirty_log_manual_clear ?
++                     VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR :
++                     VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
+     range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data;
+     range->iova = iova;
+     range->size = size;
+@@ -1176,12 +1178,148 @@ static void vfio_listener_log_sync(MemoryListener *listener,
+     }
+ }
+ 
++/*
++ * I'm not sure if there's any alignment requirement for the CLEAR_BITMAP
++ * ioctl. But copy from kvm side and align {start, size} with 64 pages.
++ *
++ * I think the code can be simplified a lot if no alignment requirement.
++ */
++#define VFIO_CLEAR_LOG_SHIFT  6
++#define VFIO_CLEAR_LOG_ALIGN  (qemu_real_host_page_size << VFIO_CLEAR_LOG_SHIFT)
++#define VFIO_CLEAR_LOG_MASK   (-VFIO_CLEAR_LOG_ALIGN)
++
++static int vfio_log_clear_one_range(VFIOContainer *container,
++        VFIODMARange *qrange, uint64_t start, uint64_t size)
++{
++    struct vfio_iommu_type1_dirty_bitmap *dbitmap;
++    struct vfio_iommu_type1_dirty_bitmap_get *range;
++
++    dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range));
++
++    dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range);
++    dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP;
++    range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data;
++
++    /*
++     * Now let's deal with the actual bitmap, which is almost the same
++     * as the kvm side.
++     */
++    uint64_t end, bmap_start, start_delta, bmap_npages;
++    unsigned long *bmap_clear = NULL, psize = qemu_real_host_page_size;
++    int ret;
++
++    bmap_start = start & VFIO_CLEAR_LOG_MASK;
++    start_delta = start - bmap_start;
++    bmap_start /= psize;
++
++    bmap_npages = DIV_ROUND_UP(size + start_delta, VFIO_CLEAR_LOG_ALIGN)
++        << VFIO_CLEAR_LOG_SHIFT;
++    end = qrange->size / psize;
++    if (bmap_npages > end - bmap_start) {
++        bmap_npages = end - bmap_start;
++    }
++    start_delta /= psize;
++
++    if (start_delta) {
++        bmap_clear = bitmap_new(bmap_npages);
++        bitmap_copy_with_src_offset(bmap_clear, qrange->bitmap,
++                                    bmap_start, start_delta + size / psize);
++        bitmap_clear(bmap_clear, 0, start_delta);
++        range->bitmap.data = (__u64 *)bmap_clear;
++    } else {
++        range->bitmap.data = (__u64 *)(qrange->bitmap + BIT_WORD(bmap_start));
++    }
++
++    range->iova = qrange->iova + bmap_start * psize;
++    range->size = bmap_npages * psize;
++    range->bitmap.size = ROUND_UP(bmap_npages, sizeof(__u64) * BITS_PER_BYTE) /
++                                               BITS_PER_BYTE;
++    range->bitmap.pgsize = qemu_real_host_page_size;
++
++    ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap);
++    if (ret) {
++        error_report("Failed to clear dirty log for iova: 0x%"PRIx64
++                " size: 0x%"PRIx64" err: %d", (uint64_t)range->iova,
++                (uint64_t)range->size, errno);
++        goto err_out;
++    }
++
++    bitmap_clear(qrange->bitmap, bmap_start + start_delta, size / psize);
++err_out:
++    g_free(bmap_clear);
++    g_free(dbitmap);
++    return 0;
++}
++
++static int vfio_physical_log_clear(VFIOContainer *container,
++                                   MemoryRegionSection *section)
++{
++    uint64_t start, size, offset, count;
++    VFIODMARange *qrange;
++    int ret = 0;
++
++    if (!container->dirty_log_manual_clear) {
++        /* No need to do explicit clear */
++        return ret;
++    }
++
++    start = section->offset_within_address_space;
++    size = int128_get64(section->size);
++
++    if (!size) {
++        return ret;
++    }
++
++    QLIST_FOREACH(qrange, &container->dma_list, next) {
++        /*
++         * Discard ranges that do not overlap the section (e.g., the
++         * Memory BAR regions of the device)
++         */
++        if (qrange->iova > start + size - 1 ||
++            start > qrange->iova + qrange->size - 1) {
++            continue;
++        }
++
++        if (start >= qrange->iova) {
++            /* The range starts before section or is aligned to it. */
++            offset = start - qrange->iova;
++            count = MIN(qrange->size - offset, size);
++        } else {
++            /* The range starts after section. */
++            offset = 0;
++            count = MIN(qrange->size, size - (qrange->iova - start));
++        }
++        ret = vfio_log_clear_one_range(container, qrange, offset, count);
++        if (ret < 0) {
++            break;
++        }
++    }
++
++    return ret;
++}
++
++static void vfio_listener_log_clear(MemoryListener *listener,
++                                    MemoryRegionSection *section)
++{
++    VFIOContainer *container = container_of(listener, VFIOContainer, listener);
++
++    if (vfio_listener_skipped_section(section) ||
++        !container->dirty_pages_supported) {
++        return;
++    }
++
++    if (vfio_devices_all_dirty_tracking(container)) {
++        vfio_physical_log_clear(container, section);
++    }
++}
++
+ static const MemoryListener vfio_memory_listener = {
+     .region_add = vfio_listener_region_add,
+     .region_del = vfio_listener_region_del,
+     .log_global_start = vfio_listener_log_global_start,
+     .log_global_stop = vfio_listener_log_global_stop,
+     .log_sync = vfio_listener_log_sync,
++    .log_clear = vfio_listener_log_clear,
+ };
+ 
+ static void vfio_listener_release(VFIOContainer *container)
+@@ -1563,7 +1701,7 @@ static int vfio_get_iommu_type(VFIOContainer *container,
+ static int vfio_init_container(VFIOContainer *container, int group_fd,
+                                Error **errp)
+ {
+-    int iommu_type, ret;
++    int iommu_type, dirty_log_manual_clear, ret;
+ 
+     iommu_type = vfio_get_iommu_type(container, errp);
+     if (iommu_type < 0) {
+@@ -1592,6 +1730,13 @@ static int vfio_init_container(VFIOContainer *container, int group_fd,
+     }
+ 
+     container->iommu_type = iommu_type;
++
++    dirty_log_manual_clear = ioctl(container->fd, VFIO_CHECK_EXTENSION,
++                                   VFIO_DIRTY_LOG_MANUAL_CLEAR);
++    if (dirty_log_manual_clear) {
++        container->dirty_log_manual_clear = dirty_log_manual_clear;
++    }
++
+     return 0;
+ }
+ 
+diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
+index 2853dc861e..1277914ca8 100644
+--- a/include/hw/vfio/vfio-common.h
++++ b/include/hw/vfio/vfio-common.h
+@@ -93,6 +93,7 @@ typedef struct VFIOContainer {
+     int error;
+     bool initialized;
+     bool dirty_pages_supported;
++    bool dirty_log_manual_clear;
+     uint64_t dirty_pgsizes;
+     uint64_t max_dirty_bitmap_size;
+     unsigned long pgsizes;
+-- 
+2.27.0
+
-- 
Gitee


From 3c9cc492f258ec2a5893d148e256c23e09b61579 Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Wed, 31 Mar 2021 14:47:13 +0800
Subject: [PATCH 10/48] hw/arm/smmuv3: Support 16K translation granule

The driver can query some bits in SMMUv3 IDR5 to learn which
translation granules are supported. Arm recommends that SMMUv3
implementations support at least 4K and 64K granules. But in
the vSMMUv3, there seems to be no reason not to support 16K
translation granule. In addition, if 16K is not supported,
vSVA will fail to be enabled in the future for 16K guest
kernels. So it is better to support it.

Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
 ...muv3-Support-16K-translation-granule.patch | 49 +++++++++++++++++++
 1 file changed, 49 insertions(+)
 create mode 100644 hw-arm-smmuv3-Support-16K-translation-granule.patch

diff --git a/hw-arm-smmuv3-Support-16K-translation-granule.patch b/hw-arm-smmuv3-Support-16K-translation-granule.patch
new file mode 100644
index 0000000..08c4bc5
--- /dev/null
+++ b/hw-arm-smmuv3-Support-16K-translation-granule.patch
@@ -0,0 +1,49 @@
+From 008dec30dea19950ff48a34c54441d065c1f228b Mon Sep 17 00:00:00 2001
+From: Kunkun Jiang <jiangkunkun@huawei.com>
+Date: Wed, 31 Mar 2021 14:47:13 +0800
+Subject: [PATCH] hw/arm/smmuv3: Support 16K translation granule
+
+The driver can query some bits in SMMUv3 IDR5 to learn which
+translation granules are supported. Arm recommends that SMMUv3
+implementations support at least 4K and 64K granules. But in
+the vSMMUv3, there seems to be no reason not to support 16K
+translation granule. In addition, if 16K is not supported,
+vSVA will fail to be enabled in the future for 16K guest
+kernels. So it is better to support it.
+
+Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
+Reviewed-by: Eric Auger <eric.auger@redhat.com>
+Tested-by: Eric Auger <eric.auger@redhat.com>
+Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
+---
+ hw/arm/smmuv3.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
+index e96d5beb9a..7911944c59 100644
+--- a/hw/arm/smmuv3.c
++++ b/hw/arm/smmuv3.c
+@@ -254,8 +254,9 @@ static void smmuv3_init_regs(SMMUv3State *s)
+     s->idr[1] = FIELD_DP32(s->idr[1], IDR1, EVENTQS, SMMU_EVENTQS);
+     s->idr[1] = FIELD_DP32(s->idr[1], IDR1, CMDQS,   SMMU_CMDQS);
+ 
+-   /* 4K and 64K granule support */
++    /* 4K, 16K and 64K granule support */
+     s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN4K, 1);
++    s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN16K, 1);
+     s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN64K, 1);
+     s->idr[5] = FIELD_DP32(s->idr[5], IDR5, OAS, SMMU_IDR5_OAS); /* 44 bits */
+ 
+@@ -480,7 +481,8 @@ static int decode_cd(SMMUTransCfg *cfg, CD *cd, SMMUEventInfo *event)
+ 
+         tg = CD_TG(cd, i);
+         tt->granule_sz = tg2granule(tg, i);
+-        if ((tt->granule_sz != 12 && tt->granule_sz != 16) || CD_ENDI(cd)) {
++        if ((tt->granule_sz != 12 && tt->granule_sz != 14 &&
++             tt->granule_sz != 16) || CD_ENDI(cd)) {
+             goto bad_cd;
+         }
+ 
+-- 
+2.27.0
+
-- 
Gitee


From f5fd61400bc7308362ffeb0a81ee86d4406c0f5f Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Mon, 19 Oct 2020 17:15:08 +0800
Subject: [PATCH 11/48] hw/arm/smmuv3: Set the restoration priority of the
 vSMMUv3 explicitly

Ensure the vSMMUv3 will be restored before all PCIe devices so that DMA
translation can work properly during migration.

Signed-off-by: Zenghui Yu <yuzenghui@huawei.com>
Message-id: 20201019091508.197-1-yuzenghui@huawei.com
Acked-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
---
 ...t-the-restoration-priority-of-the-vS.patch | 33 +++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 hw-arm-smmuv3-Set-the-restoration-priority-of-the-vS.patch

diff --git a/hw-arm-smmuv3-Set-the-restoration-priority-of-the-vS.patch b/hw-arm-smmuv3-Set-the-restoration-priority-of-the-vS.patch
new file mode 100644
index 0000000..1139fea
--- /dev/null
+++ b/hw-arm-smmuv3-Set-the-restoration-priority-of-the-vS.patch
@@ -0,0 +1,33 @@
+From eceb9213e23d15d5b4342b6a6a8368f4fec60c2f Mon Sep 17 00:00:00 2001
+From: Zenghui Yu <yuzenghui@huawei.com>
+Date: Mon, 19 Oct 2020 17:15:08 +0800
+Subject: [PATCH] hw/arm/smmuv3: Set the restoration priority of the vSMMUv3
+ explicitly
+
+Ensure the vSMMUv3 will be restored before all PCIe devices so that DMA
+translation can work properly during migration.
+
+Signed-off-by: Zenghui Yu <yuzenghui@huawei.com>
+Message-id: 20201019091508.197-1-yuzenghui@huawei.com
+Acked-by: Eric Auger <eric.auger@redhat.com>
+Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
+Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
+---
+ hw/arm/smmuv3.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
+index 7911944c59..3b5723e1e1 100644
+--- a/hw/arm/smmuv3.c
++++ b/hw/arm/smmuv3.c
+@@ -1424,6 +1424,7 @@ static const VMStateDescription vmstate_smmuv3 = {
+     .name = "smmuv3",
+     .version_id = 1,
+     .minimum_version_id = 1,
++    .priority = MIG_PRI_IOMMU,
+     .fields = (VMStateField[]) {
+         VMSTATE_UINT32(features, SMMUv3State),
+         VMSTATE_UINT8(sid_size, SMMUv3State),
+-- 
+2.27.0
+
-- 
Gitee


From 483badb73dd785a676d998b02fc6f0f4b2d9d034 Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Mon, 22 Feb 2021 10:13:55 -0500
Subject: [PATCH 12/48] hw/vfio/common: trace vfio_connect_container operations

We currently trace vfio_disconnect_container() but we do not trace
the container <-> group creation, which can be useful to understand
the VFIO topology.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
---
 ...race-vfio_connect_container-operatio.patch | 53 +++++++++++++++++++
 1 file changed, 53 insertions(+)
 create mode 100644 hw-vfio-common-trace-vfio_connect_container-operatio.patch

diff --git a/hw-vfio-common-trace-vfio_connect_container-operatio.patch b/hw-vfio-common-trace-vfio_connect_container-operatio.patch
new file mode 100644
index 0000000..bd95235
--- /dev/null
+++ b/hw-vfio-common-trace-vfio_connect_container-operatio.patch
@@ -0,0 +1,53 @@
+From b107e6ec2a5a34e0ba95345a89dcf5f505ad9da4 Mon Sep 17 00:00:00 2001
+From: Eric Auger <eric.auger@redhat.com>
+Date: Mon, 22 Feb 2021 10:13:55 -0500
+Subject: [PATCH] hw/vfio/common: trace vfio_connect_container operations
+
+We currently trace vfio_disconnect_container() but we do not trace
+the container <-> group creation, which can be useful to understand
+the VFIO topology.
+
+Signed-off-by: Eric Auger <eric.auger@redhat.com>
+Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
+---
+ hw/vfio/common.c     | 3 +++
+ hw/vfio/trace-events | 2 ++
+ 2 files changed, 5 insertions(+)
+
+diff --git a/hw/vfio/common.c b/hw/vfio/common.c
+index 206fb83e28..fefa2ccfdf 100644
+--- a/hw/vfio/common.c
++++ b/hw/vfio/common.c
+@@ -1848,6 +1848,8 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
+     QLIST_FOREACH(container, &space->containers, next) {
+         if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
+             group->container = container;
++            trace_vfio_connect_existing_container(group->groupid,
++                                                  container->fd);
+             QLIST_INSERT_HEAD(&container->group_list, group, container_next);
+             vfio_kvm_device_add_group(group);
+             return 0;
+@@ -1881,6 +1883,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
+     if (ret) {
+         goto free_container_exit;
+     }
++    trace_vfio_connect_new_container(group->groupid, container->fd);
+ 
+     switch (container->iommu_type) {
+     case VFIO_TYPE1v2_IOMMU:
+diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
+index 575ebde6e0..561dc6e758 100644
+--- a/hw/vfio/trace-events
++++ b/hw/vfio/trace-events
+@@ -102,6 +102,8 @@ vfio_listener_region_add_no_dma_map(const char *name, uint64_t iova, uint64_t si
+ vfio_listener_region_del_skip(uint64_t start, uint64_t end) "SKIPPING region_del 0x%"PRIx64" - 0x%"PRIx64
+ vfio_listener_region_del(uint64_t start, uint64_t end) "region_del 0x%"PRIx64" - 0x%"PRIx64
+ vfio_disconnect_container(int fd) "close container->fd=%d"
++vfio_connect_existing_container(int groupid, int container_fd) "group=%d existing container fd=%d"
++vfio_connect_new_container(int groupid, int container_fd) "group=%d new container fd=%d"
+ vfio_put_group(int fd) "close group->fd=%d"
+ vfio_get_device(const char * name, unsigned int flags, unsigned int num_regions, unsigned int num_irqs) "Device %s flags: %u, regions: %u, irqs: %u"
+ vfio_put_base_device(int fd) "close vdev->fd=%d"
+-- 
+2.27.0
+
-- 
Gitee


From 12e533eab483a765f49215858f7bb045006bc3f7 Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Thu, 9 May 2019 10:23:42 -0400
Subject: [PATCH 13/48] update-linux-headers: Import iommu.h

Update the script to import the new iommu.h uapi header.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
---
 update-linux-headers-Import-iommu.h.patch | 29 +++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 update-linux-headers-Import-iommu.h.patch

diff --git a/update-linux-headers-Import-iommu.h.patch b/update-linux-headers-Import-iommu.h.patch
new file mode 100644
index 0000000..eea744e
--- /dev/null
+++ b/update-linux-headers-Import-iommu.h.patch
@@ -0,0 +1,29 @@
+From 78c269f4ed09a3272d99a65d9c86977a01ef99c8 Mon Sep 17 00:00:00 2001
+From: Eric Auger <eric.auger@redhat.com>
+Date: Thu, 9 May 2019 10:23:42 -0400
+Subject: [PATCH] update-linux-headers: Import iommu.h
+
+Update the script to import the new iommu.h uapi header.
+
+Signed-off-by: Eric Auger <eric.auger@redhat.com>
+Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
+---
+ scripts/update-linux-headers.sh | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/scripts/update-linux-headers.sh b/scripts/update-linux-headers.sh
+index f76d77363b..dfdfdfddcf 100755
+--- a/scripts/update-linux-headers.sh
++++ b/scripts/update-linux-headers.sh
+@@ -141,7 +141,7 @@ done
+ 
+ rm -rf "$output/linux-headers/linux"
+ mkdir -p "$output/linux-headers/linux"
+-for header in kvm.h vfio.h vfio_ccw.h vhost.h \
++for header in kvm.h vfio.h vfio_ccw.h vhost.h iommu.h \
+               psci.h psp-sev.h userfaultfd.h mman.h; do
+     cp "$tmpdir/include/linux/$header" "$output/linux-headers/linux"
+ done
+-- 
+2.27.0
+
-- 
Gitee


From 3e52dd62030ecba9779d477b3a99a11c30e6e987 Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Fri, 30 Jul 2021 09:15:31 +0800
Subject: [PATCH 14/48] vfio.h and iommu.h header update against 5.10

Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
---
 ...d-iommu.h-header-update-against-5.10.patch | 760 ++++++++++++++++++
 1 file changed, 760 insertions(+)
 create mode 100644 vfio.h-and-iommu.h-header-update-against-5.10.patch

diff --git a/vfio.h-and-iommu.h-header-update-against-5.10.patch b/vfio.h-and-iommu.h-header-update-against-5.10.patch
new file mode 100644
index 0000000..721f2b6
--- /dev/null
+++ b/vfio.h-and-iommu.h-header-update-against-5.10.patch
@@ -0,0 +1,760 @@
+From 95435c6778f38dee9ed6f3ee6fd9e022107315d7 Mon Sep 17 00:00:00 2001
+From: Kunkun Jiang <jiangkunkun@huawei.com>
+Date: Fri, 30 Jul 2021 09:15:31 +0800
+Subject: [PATCH] vfio.h and iommu.h header update against 5.10
+
+Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
+---
+ linux-headers/linux/iommu.h | 395 ++++++++++++++++++++++++++++++++++++
+ linux-headers/linux/vfio.h  | 249 ++++++++++++++++++++++-
+ 2 files changed, 641 insertions(+), 3 deletions(-)
+ create mode 100644 linux-headers/linux/iommu.h
+
+diff --git a/linux-headers/linux/iommu.h b/linux-headers/linux/iommu.h
+new file mode 100644
+index 0000000000..773b7dc2d6
+--- /dev/null
++++ b/linux-headers/linux/iommu.h
+@@ -0,0 +1,395 @@
++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
++/*
++ * IOMMU user API definitions
++ */
++
++#ifndef IOMMU_H
++#define IOMMU_H
++
++#include <linux/types.h>
++
++#define IOMMU_FAULT_PERM_READ	(1 << 0) /* read */
++#define IOMMU_FAULT_PERM_WRITE	(1 << 1) /* write */
++#define IOMMU_FAULT_PERM_EXEC	(1 << 2) /* exec */
++#define IOMMU_FAULT_PERM_PRIV	(1 << 3) /* privileged */
++
++/* Generic fault types; can be expanded to cover IRQ remapping faults */
++enum iommu_fault_type {
++	IOMMU_FAULT_DMA_UNRECOV = 1,	/* unrecoverable fault */
++	IOMMU_FAULT_PAGE_REQ,		/* page request fault */
++};
++
++enum iommu_fault_reason {
++	IOMMU_FAULT_REASON_UNKNOWN = 0,
++
++	/* Could not access the PASID table (fetch caused external abort) */
++	IOMMU_FAULT_REASON_PASID_FETCH,
++
++	/* PASID entry is invalid or has configuration errors */
++	IOMMU_FAULT_REASON_BAD_PASID_ENTRY,
++
++	/*
++	 * PASID is out of range (e.g. exceeds the maximum PASID
++	 * supported by the IOMMU) or disabled.
++	 */
++	IOMMU_FAULT_REASON_PASID_INVALID,
++
++	/*
++	 * An external abort occurred fetching (or updating) a translation
++	 * table descriptor
++	 */
++	IOMMU_FAULT_REASON_WALK_EABT,
++
++	/*
++	 * Could not access the page table entry (Bad address),
++	 * actual translation fault
++	 */
++	IOMMU_FAULT_REASON_PTE_FETCH,
++
++	/* Protection flag check failed */
++	IOMMU_FAULT_REASON_PERMISSION,
++
++	/* access flag check failed */
++	IOMMU_FAULT_REASON_ACCESS,
++
++	/* Output address of a translation stage caused Address Size fault */
++	IOMMU_FAULT_REASON_OOR_ADDRESS,
++};
++
++/**
++ * struct iommu_fault_unrecoverable - Unrecoverable fault data
++ * @reason: reason of the fault, from &enum iommu_fault_reason
++ * @flags: parameters of this fault (IOMMU_FAULT_UNRECOV_* values)
++ * @pasid: Process Address Space ID
++ * @perm: access permissions requested by the incoming transaction
++ *        (IOMMU_FAULT_PERM_* values)
++ * @addr: offending page address
++ * @fetch_addr: address that caused a fetch abort, if any
++ */
++struct iommu_fault_unrecoverable {
++	__u32	reason;
++#define IOMMU_FAULT_UNRECOV_PASID_VALID		(1 << 0)
++#define IOMMU_FAULT_UNRECOV_ADDR_VALID		(1 << 1)
++#define IOMMU_FAULT_UNRECOV_FETCH_ADDR_VALID	(1 << 2)
++	__u32	flags;
++	__u32	pasid;
++	__u32	perm;
++	__u64	addr;
++	__u64	fetch_addr;
++};
++
++/**
++ * struct iommu_fault_page_request - Page Request data
++ * @flags: encodes whether the corresponding fields are valid and whether this
++ *         is the last page in group (IOMMU_FAULT_PAGE_REQUEST_* values).
++ *         When IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID is set, the page response
++ *         must have the same PASID value as the page request. When it is clear,
++ *         the page response should not have a PASID.
++ * @pasid: Process Address Space ID
++ * @grpid: Page Request Group Index
++ * @perm: requested page permissions (IOMMU_FAULT_PERM_* values)
++ * @addr: page address
++ * @private_data: device-specific private information
++ */
++struct iommu_fault_page_request {
++#define IOMMU_FAULT_PAGE_REQUEST_PASID_VALID	(1 << 0)
++#define IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE	(1 << 1)
++#define IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA	(1 << 2)
++#define IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID	(1 << 3)
++	__u32	flags;
++	__u32	pasid;
++	__u32	grpid;
++	__u32	perm;
++	__u64	addr;
++	__u64	private_data[2];
++};
++
++/**
++ * struct iommu_fault - Generic fault data
++ * @type: fault type from &enum iommu_fault_type
++ * @padding: reserved for future use (should be zero)
++ * @event: fault event, when @type is %IOMMU_FAULT_DMA_UNRECOV
++ * @prm: Page Request message, when @type is %IOMMU_FAULT_PAGE_REQ
++ * @padding2: sets the fault size to allow for future extensions
++ */
++struct iommu_fault {
++	__u32	type;
++	__u32	padding;
++	union {
++		struct iommu_fault_unrecoverable event;
++		struct iommu_fault_page_request prm;
++		__u8 padding2[56];
++	};
++};
++
++/**
++ * enum iommu_page_response_code - Return status of fault handlers
++ * @IOMMU_PAGE_RESP_SUCCESS: Fault has been handled and the page tables
++ *	populated, retry the access. This is "Success" in PCI PRI.
++ * @IOMMU_PAGE_RESP_FAILURE: General error. Drop all subsequent faults from
++ *	this device if possible. This is "Response Failure" in PCI PRI.
++ * @IOMMU_PAGE_RESP_INVALID: Could not handle this fault, don't retry the
++ *	access. This is "Invalid Request" in PCI PRI.
++ */
++enum iommu_page_response_code {
++	IOMMU_PAGE_RESP_SUCCESS = 0,
++	IOMMU_PAGE_RESP_INVALID,
++	IOMMU_PAGE_RESP_FAILURE,
++};
++
++/**
++ * struct iommu_page_response - Generic page response information
++ * @argsz: User filled size of this data
++ * @version: API version of this structure
++ * @flags: encodes whether the corresponding fields are valid
++ *         (IOMMU_FAULT_PAGE_RESPONSE_* values)
++ * @pasid: Process Address Space ID
++ * @grpid: Page Request Group Index
++ * @code: response code from &enum iommu_page_response_code
++ */
++struct iommu_page_response {
++	__u32	argsz;
++#define IOMMU_PAGE_RESP_VERSION_1	1
++	__u32	version;
++#define IOMMU_PAGE_RESP_PASID_VALID	(1 << 0)
++	__u32	flags;
++	__u32	pasid;
++	__u32	grpid;
++	__u32	code;
++};
++
++/* defines the granularity of the invalidation */
++enum iommu_inv_granularity {
++	IOMMU_INV_GRANU_DOMAIN,	/* domain-selective invalidation */
++	IOMMU_INV_GRANU_PASID,	/* PASID-selective invalidation */
++	IOMMU_INV_GRANU_ADDR,	/* page-selective invalidation */
++	IOMMU_INV_GRANU_NR,	/* number of invalidation granularities */
++};
++
++/**
++ * struct iommu_inv_addr_info - Address Selective Invalidation Structure
++ *
++ * @flags: indicates the granularity of the address-selective invalidation
++ * - If the PASID bit is set, the @pasid field is populated and the invalidation
++ *   relates to cache entries tagged with this PASID and matching the address
++ *   range.
++ * - If ARCHID bit is set, @archid is populated and the invalidation relates
++ *   to cache entries tagged with this architecture specific ID and matching
++ *   the address range.
++ * - Both PASID and ARCHID can be set as they may tag different caches.
++ * - If neither PASID nor ARCHID is set, global addr invalidation applies.
++ * - The LEAF flag indicates whether only the leaf PTE caching needs to be
++ *   invalidated and other paging structure caches can be preserved.
++ * @pasid: process address space ID
++ * @archid: architecture-specific ID
++ * @addr: first stage/level input address
++ * @granule_size: page/block size of the mapping in bytes
++ * @nb_granules: number of contiguous granules to be invalidated
++ */
++struct iommu_inv_addr_info {
++#define IOMMU_INV_ADDR_FLAGS_PASID	(1 << 0)
++#define IOMMU_INV_ADDR_FLAGS_ARCHID	(1 << 1)
++#define IOMMU_INV_ADDR_FLAGS_LEAF	(1 << 2)
++	__u32	flags;
++	__u32	archid;
++	__u64	pasid;
++	__u64	addr;
++	__u64	granule_size;
++	__u64	nb_granules;
++};
++
++/**
++ * struct iommu_inv_pasid_info - PASID Selective Invalidation Structure
++ *
++ * @flags: indicates the granularity of the PASID-selective invalidation
++ * - If the PASID bit is set, the @pasid field is populated and the invalidation
++ *   relates to cache entries tagged with this PASID and matching the address
++ *   range.
++ * - If the ARCHID bit is set, the @archid is populated and the invalidation
++ *   relates to cache entries tagged with this architecture specific ID and
++ *   matching the address range.
++ * - Both PASID and ARCHID can be set as they may tag different caches.
++ * - At least one of PASID or ARCHID must be set.
++ * @pasid: process address space ID
++ * @archid: architecture-specific ID
++ */
++struct iommu_inv_pasid_info {
++#define IOMMU_INV_PASID_FLAGS_PASID	(1 << 0)
++#define IOMMU_INV_PASID_FLAGS_ARCHID	(1 << 1)
++	__u32	flags;
++	__u32	archid;
++	__u64	pasid;
++};
++
++/**
++ * struct iommu_cache_invalidate_info - First level/stage invalidation
++ *     information
++ * @argsz: User filled size of this data
++ * @version: API version of this structure
++ * @cache: bitfield that allows to select which caches to invalidate
++ * @granularity: defines the lowest granularity used for the invalidation:
++ *     domain > PASID > addr
++ * @padding: reserved for future use (should be zero)
++ * @pasid_info: invalidation data when @granularity is %IOMMU_INV_GRANU_PASID
++ * @addr_info: invalidation data when @granularity is %IOMMU_INV_GRANU_ADDR
++ *
++ * Not all the combinations of cache/granularity are valid:
++ *
++ * +--------------+---------------+---------------+---------------+
++ * | type /       |   DEV_IOTLB   |     IOTLB     |      PASID    |
++ * | granularity  |               |               |      cache    |
++ * +==============+===============+===============+===============+
++ * | DOMAIN       |       N/A     |       Y       |       Y       |
++ * +--------------+---------------+---------------+---------------+
++ * | PASID        |       Y       |       Y       |       Y       |
++ * +--------------+---------------+---------------+---------------+
++ * | ADDR         |       Y       |       Y       |       N/A     |
++ * +--------------+---------------+---------------+---------------+
++ *
++ * Invalidations by %IOMMU_INV_GRANU_DOMAIN don't take any argument other than
++ * @version and @cache.
++ *
++ * If multiple cache types are invalidated simultaneously, they all
++ * must support the used granularity.
++ */
++struct iommu_cache_invalidate_info {
++	__u32	argsz;
++#define IOMMU_CACHE_INVALIDATE_INFO_VERSION_1 1
++	__u32	version;
++/* IOMMU paging structure cache */
++#define IOMMU_CACHE_INV_TYPE_IOTLB	(1 << 0) /* IOMMU IOTLB */
++#define IOMMU_CACHE_INV_TYPE_DEV_IOTLB	(1 << 1) /* Device IOTLB */
++#define IOMMU_CACHE_INV_TYPE_PASID	(1 << 2) /* PASID cache */
++#define IOMMU_CACHE_INV_TYPE_NR		(3)
++	__u8	cache;
++	__u8	granularity;
++	__u8	padding[6];
++	union {
++		struct iommu_inv_pasid_info pasid_info;
++		struct iommu_inv_addr_info addr_info;
++	} granu;
++};
++
++/**
++ * struct iommu_gpasid_bind_data_vtd - Intel VT-d specific data on device and guest
++ * SVA binding.
++ *
++ * @flags:	VT-d PASID table entry attributes
++ * @pat:	Page attribute table data to compute effective memory type
++ * @emt:	Extended memory type
++ *
++ * Only guest vIOMMU selectable and effective options are passed down to
++ * the host IOMMU.
++ */
++struct iommu_gpasid_bind_data_vtd {
++#define IOMMU_SVA_VTD_GPASID_SRE	(1 << 0) /* supervisor request */
++#define IOMMU_SVA_VTD_GPASID_EAFE	(1 << 1) /* extended access enable */
++#define IOMMU_SVA_VTD_GPASID_PCD	(1 << 2) /* page-level cache disable */
++#define IOMMU_SVA_VTD_GPASID_PWT	(1 << 3) /* page-level write through */
++#define IOMMU_SVA_VTD_GPASID_EMTE	(1 << 4) /* extended mem type enable */
++#define IOMMU_SVA_VTD_GPASID_CD		(1 << 5) /* PASID-level cache disable */
++#define IOMMU_SVA_VTD_GPASID_LAST	(1 << 6)
++	__u64 flags;
++	__u32 pat;
++	__u32 emt;
++};
++
++#define IOMMU_SVA_VTD_GPASID_MTS_MASK	(IOMMU_SVA_VTD_GPASID_CD | \
++					 IOMMU_SVA_VTD_GPASID_EMTE | \
++					 IOMMU_SVA_VTD_GPASID_PCD |  \
++					 IOMMU_SVA_VTD_GPASID_PWT)
++
++/**
++ * struct iommu_gpasid_bind_data - Information about device and guest PASID binding
++ * @argsz:	User filled size of this data
++ * @version:	Version of this data structure
++ * @format:	PASID table entry format
++ * @flags:	Additional information on guest bind request
++ * @gpgd:	Guest page directory base of the guest mm to bind
++ * @hpasid:	Process address space ID used for the guest mm in host IOMMU
++ * @gpasid:	Process address space ID used for the guest mm in guest IOMMU
++ * @addr_width:	Guest virtual address width
++ * @padding:	Reserved for future use (should be zero)
++ * @vtd:	Intel VT-d specific data
++ *
++ * Guest to host PASID mapping can be an identity or non-identity, where guest
++ * has its own PASID space. For non-identity mapping, guest to host PASID lookup
++ * is needed when VM programs guest PASID into an assigned device. VMM may
++ * trap such PASID programming then request host IOMMU driver to convert guest
++ * PASID to host PASID based on this bind data.
++ */
++struct iommu_gpasid_bind_data {
++	__u32 argsz;
++#define IOMMU_GPASID_BIND_VERSION_1	1
++	__u32 version;
++#define IOMMU_PASID_FORMAT_INTEL_VTD	1
++#define IOMMU_PASID_FORMAT_LAST		2
++	__u32 format;
++	__u32 addr_width;
++#define IOMMU_SVA_GPASID_VAL	(1 << 0) /* guest PASID valid */
++	__u64 flags;
++	__u64 gpgd;
++	__u64 hpasid;
++	__u64 gpasid;
++	__u8  padding[8];
++	/* Vendor specific data */
++	union {
++		struct iommu_gpasid_bind_data_vtd vtd;
++	} vendor;
++};
++
++/**
++ * struct iommu_pasid_smmuv3 - ARM SMMUv3 Stream Table Entry stage 1 related
++ *     information
++ * @version: API version of this structure
++ * @s1fmt: STE s1fmt (format of the CD table: single CD, linear table
++ *         or 2-level table)
++ * @s1dss: STE s1dss (specifies the behavior when @pasid_bits != 0
++ *         and no PASID is passed along with the incoming transaction)
++ * @padding: reserved for future use (should be zero)
++ *
++ * The PASID table is referred to as the Context Descriptor (CD) table on ARM
++ * SMMUv3. Please refer to the ARM SMMU 3.x spec (ARM IHI 0070A) for full
++ * details.
++ */
++struct iommu_pasid_smmuv3 {
++#define PASID_TABLE_SMMUV3_CFG_VERSION_1 1
++	__u32	version;
++	__u8	s1fmt;
++	__u8	s1dss;
++	__u8	padding[2];
++};
++
++/**
++ * struct iommu_pasid_table_config - PASID table data used to bind guest PASID
++ *     table to the host IOMMU
++ * @argsz: User filled size of this data
++ * @version: API version to prepare for future extensions
++ * @base_ptr: guest physical address of the PASID table
++ * @format: format of the PASID table
++ * @pasid_bits: number of PASID bits used in the PASID table
++ * @config: indicates whether the guest translation stage must
++ *          be translated, bypassed or aborted.
++ * @padding: reserved for future use (should be zero)
++ * @vendor_data.smmuv3: table information when @format is
++ * %IOMMU_PASID_FORMAT_SMMUV3
++ */
++struct iommu_pasid_table_config {
++	__u32	argsz;
++#define PASID_TABLE_CFG_VERSION_1 1
++	__u32	version;
++	__u64	base_ptr;
++#define IOMMU_PASID_FORMAT_SMMUV3	1
++	__u32	format;
++	__u8	pasid_bits;
++#define IOMMU_PASID_CONFIG_TRANSLATE	1
++#define IOMMU_PASID_CONFIG_BYPASS	2
++#define IOMMU_PASID_CONFIG_ABORT	3
++	__u8	config;
++	__u8    padding[2];
++	union {
++		struct iommu_pasid_smmuv3 smmuv3;
++	} vendor_data;
++};
++
++#endif /* IOMMU_H */
+diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
+index 120387ba58..d6edfbd2f5 100644
+--- a/linux-headers/linux/vfio.h
++++ b/linux-headers/linux/vfio.h
+@@ -14,6 +14,7 @@
+ 
+ #include <linux/types.h>
+ #include <linux/ioctl.h>
++#include <linux/iommu.h>
+ 
+ #define VFIO_API_VERSION	0
+ 
+@@ -211,8 +212,11 @@ struct vfio_device_info {
+ #define VFIO_DEVICE_FLAGS_AMBA  (1 << 3)	/* vfio-amba device */
+ #define VFIO_DEVICE_FLAGS_CCW	(1 << 4)	/* vfio-ccw device */
+ #define VFIO_DEVICE_FLAGS_AP	(1 << 5)	/* vfio-ap device */
++#define VFIO_DEVICE_FLAGS_FSL_MC (1 << 6)	/* vfio-fsl-mc device */
++#define VFIO_DEVICE_FLAGS_CAPS	(1 << 7)	/* Info supports caps */
+ 	__u32	num_regions;	/* Max region index + 1 */
+ 	__u32	num_irqs;	/* Max IRQ index + 1 */
++	__u32   cap_offset;	/* Offset within info struct of first cap */
+ };
+ #define VFIO_DEVICE_GET_INFO		_IO(VFIO_TYPE, VFIO_BASE + 7)
+ 
+@@ -228,6 +232,15 @@ struct vfio_device_info {
+ #define VFIO_DEVICE_API_CCW_STRING		"vfio-ccw"
+ #define VFIO_DEVICE_API_AP_STRING		"vfio-ap"
+ 
++/*
++ * The following capabilities are unique to s390 zPCI devices.  Their contents
++ * are further-defined in vfio_zdev.h
++ */
++#define VFIO_DEVICE_INFO_CAP_ZPCI_BASE		1
++#define VFIO_DEVICE_INFO_CAP_ZPCI_GROUP		2
++#define VFIO_DEVICE_INFO_CAP_ZPCI_UTIL		3
++#define VFIO_DEVICE_INFO_CAP_ZPCI_PFIP		4
++
+ /**
+  * VFIO_DEVICE_GET_REGION_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 8,
+  *				       struct vfio_region_info)
+@@ -316,6 +329,7 @@ struct vfio_region_info_cap_type {
+ #define VFIO_REGION_TYPE_GFX                    (1)
+ #define VFIO_REGION_TYPE_CCW			(2)
+ #define VFIO_REGION_TYPE_MIGRATION              (3)
++#define VFIO_REGION_TYPE_NESTED			(4)
+ 
+ /* sub-types for VFIO_REGION_TYPE_PCI_* */
+ 
+@@ -340,6 +354,10 @@ struct vfio_region_info_cap_type {
+ /* sub-types for VFIO_REGION_TYPE_GFX */
+ #define VFIO_REGION_SUBTYPE_GFX_EDID            (1)
+ 
++/* sub-types for VFIO_REGION_TYPE_NESTED */
++#define VFIO_REGION_SUBTYPE_NESTED_DMA_FAULT	(1)
++#define VFIO_REGION_SUBTYPE_NESTED_DMA_FAULT_RESPONSE	(2)
++
+ /**
+  * struct vfio_region_gfx_edid - EDID region layout.
+  *
+@@ -472,7 +490,7 @@ struct vfio_region_gfx_edid {
+  * 5. Resumed
+  *                  |--------->|
+  *
+- * 0. Default state of VFIO device is _RUNNNG when the user application starts.
++ * 0. Default state of VFIO device is _RUNNING when the user application starts.
+  * 1. During normal shutdown of the user application, the user application may
+  *    optionally change the VFIO device state from _RUNNING to _STOP. This
+  *    transition is optional. The vendor driver must support this transition but
+@@ -695,11 +713,30 @@ struct vfio_irq_info {
+ #define VFIO_IRQ_INFO_MASKABLE		(1 << 1)
+ #define VFIO_IRQ_INFO_AUTOMASKED	(1 << 2)
+ #define VFIO_IRQ_INFO_NORESIZE		(1 << 3)
++#define VFIO_IRQ_INFO_FLAG_CAPS		(1 << 4) /* Info supports caps */
+ 	__u32	index;		/* IRQ index */
+ 	__u32	count;		/* Number of IRQs within this index */
++	__u32	cap_offset;	/* Offset within info struct of first cap */
+ };
+ #define VFIO_DEVICE_GET_IRQ_INFO	_IO(VFIO_TYPE, VFIO_BASE + 9)
+ 
++/*
++ * The irq type capability allows IRQs unique to a specific device or
++ * class of devices to be exposed.
++ *
++ * The structures below define version 1 of this capability.
++ */
++#define VFIO_IRQ_INFO_CAP_TYPE      3
++
++struct vfio_irq_info_cap_type {
++	struct vfio_info_cap_header header;
++	__u32 type;     /* global per bus driver */
++	__u32 subtype;  /* type specific */
++};
++
++#define VFIO_IRQ_TYPE_NESTED				(1)
++#define VFIO_IRQ_SUBTYPE_DMA_FAULT			(1)
++
+ /**
+  * VFIO_DEVICE_SET_IRQS - _IOW(VFIO_TYPE, VFIO_BASE + 10, struct vfio_irq_set)
+  *
+@@ -801,7 +838,8 @@ enum {
+ 	VFIO_PCI_MSIX_IRQ_INDEX,
+ 	VFIO_PCI_ERR_IRQ_INDEX,
+ 	VFIO_PCI_REQ_IRQ_INDEX,
+-	VFIO_PCI_NUM_IRQS
++	VFIO_PCI_NUM_IRQS = 5	/* Fixed user ABI, IRQ indexes >=5 use   */
++				/* device specific cap to define content */
+ };
+ 
+ /*
+@@ -985,6 +1023,68 @@ struct vfio_device_feature {
+  */
+ #define VFIO_DEVICE_FEATURE_PCI_VF_TOKEN	(0)
+ 
++/*
++ * Capability exposed by the DMA fault region
++ * @version: ABI version
++ */
++#define VFIO_REGION_INFO_CAP_DMA_FAULT	6
++
++struct vfio_region_info_cap_fault {
++	struct vfio_info_cap_header header;
++	__u32 version;
++};
++
++/*
++ * Capability exposed by the DMA fault response region
++ * @version: ABI version
++ */
++#define VFIO_REGION_INFO_CAP_DMA_FAULT_RESPONSE	7
++
++struct vfio_region_info_cap_fault_response {
++	struct vfio_info_cap_header header;
++	__u32 version;
++};
++
++/*
++ * DMA Fault Region Layout
++ * @tail: index relative to the start of the ring buffer at which the
++ *        consumer finds the next item in the buffer
++ * @entry_size: fault ring buffer entry size in bytes
++ * @nb_entries: max capacity of the fault ring buffer
++ * @offset: ring buffer offset relative to the start of the region
++ * @head: index relative to the start of the ring buffer at which the
++ *        producer (kernel) inserts items into the buffer
++ */
++struct vfio_region_dma_fault {
++	/* Write-Only */
++	__u32   tail;
++	/* Read-Only */
++	__u32   entry_size;
++	__u32	nb_entries;
++	__u32	offset;
++	__u32   head;
++};
++
++/*
++ * DMA Fault Response Region Layout
++ * @head: index relative to the start of the ring buffer at which the
++ *        producer (userspace) inserts responses into the buffer
++ * @entry_size: fault ring buffer entry size in bytes
++ * @nb_entries: max capacity of the fault ring buffer
++ * @offset: ring buffer offset relative to the start of the region
++ * @tail: index relative to the start of the ring buffer at which the
++ *        consumer (kernel) finds the next item in the buffer
++ */
++struct vfio_region_dma_fault_response {
++	/* Write-Only */
++	__u32   head;
++	/* Read-Only */
++	__u32   entry_size;
++	__u32	nb_entries;
++	__u32	offset;
++	__u32   tail;
++};
++
+ /* -------- API for Type1 VFIO IOMMU -------- */
+ 
+ /**
+@@ -1049,6 +1149,21 @@ struct vfio_iommu_type1_info_cap_migration {
+ 	__u64	max_dirty_bitmap_size;		/* in bytes */
+ };
+ 
++/*
++ * The DMA available capability allows to report the current number of
++ * simultaneously outstanding DMA mappings that are allowed.
++ *
++ * The structure below defines version 1 of this capability.
++ *
++ * avail: specifies the current number of outstanding DMA mappings allowed.
++ */
++#define VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL 3
++
++struct vfio_iommu_type1_info_dma_avail {
++	struct	vfio_info_cap_header header;
++	__u32	avail;
++};
++
+ #define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
+ 
+ /**
+@@ -1072,7 +1187,7 @@ struct vfio_iommu_type1_dma_map {
+ struct vfio_bitmap {
+ 	__u64        pgsize;	/* page size for bitmap in bytes */
+ 	__u64        size;	/* in bytes */
+-	__u64 *data;	/* one bit per page */
++	__u64        *data;	/* one bit per page */
+ };
+ 
+ /**
+@@ -1188,6 +1303,134 @@ struct vfio_iommu_type1_dirty_bitmap_get {
+ 
+ #define VFIO_IOMMU_DIRTY_PAGES             _IO(VFIO_TYPE, VFIO_BASE + 17)
+ 
++/*
++ * VFIO_IOMMU_BIND_PROCESS
++ *
++ * Allocate a PASID for a process address space, and use it to attach this
++ * process to all devices in the container. Devices can then tag their DMA
++ * traffic with the returned @pasid to perform transactions on the associated
++ * virtual address space. Mapping and unmapping buffers is performed by standard
++ * functions such as mmap and malloc.
++ *
++ * If flag is VFIO_IOMMU_BIND_PID, @pid contains the pid of a foreign process to
++ * bind. Otherwise the current task is bound. Given that the caller owns the
++ * device, setting this flag grants the caller read and write permissions on the
++ * entire address space of foreign process described by @pid. Therefore,
++ * permission to perform the bind operation on a foreign process is governed by
++ * the ptrace access mode PTRACE_MODE_ATTACH_REALCREDS check. See man ptrace(2)
++ * for more information.
++ *
++ * On success, VFIO writes a Process Address Space ID (PASID) into @pasid. This
++ * ID is unique to a process and can be used on all devices in the container.
++ *
++ * On fork, the child inherits the device fd and can use the bonds setup by its
++ * parent. Consequently, the child has R/W access on the address spaces bound by
++ * its parent. After an execv, the device fd is closed and the child doesn't
++ * have access to the address space anymore.
++ *
++ * To remove a bond between process and container, VFIO_IOMMU_UNBIND ioctl is
++ * issued with the same parameters. If a pid was specified in VFIO_IOMMU_BIND,
++ * it should also be present for VFIO_IOMMU_UNBIND. Otherwise unbind the current
++ * task from the container.
++ */
++struct vfio_iommu_type1_bind_process {
++	__u32	flags;
++#define VFIO_IOMMU_BIND_PID		(1 << 0)
++	__u32	pasid;
++	__s32	pid;
++};
++
++/*
++ * Only mode supported at the moment is VFIO_IOMMU_BIND_PROCESS, which takes
++ * vfio_iommu_type1_bind_process in data.
++ */
++struct vfio_iommu_type1_bind {
++	__u32	argsz;
++	__u32	flags;
++#define VFIO_IOMMU_BIND_PROCESS		(1 << 0)
++	__u8	data[];
++};
++
++/*
++ * VFIO_IOMMU_BIND - _IOWR(VFIO_TYPE, VFIO_BASE + 22, struct vfio_iommu_type1_bind)
++ *
++ * Manage address spaces of devices in this container. Initially a TYPE1
++ * container can only have one address space, managed with
++ * VFIO_IOMMU_MAP/UNMAP_DMA.
++ *
++ * An IOMMU of type VFIO_TYPE1_NESTING_IOMMU can be managed by both MAP/UNMAP
++ * and BIND ioctls at the same time. MAP/UNMAP acts on the stage-2 (host) page
++ * tables, and BIND manages the stage-1 (guest) page tables. Other types of
++ * IOMMU may allow MAP/UNMAP and BIND to coexist, where MAP/UNMAP controls
++ * non-PASID traffic and BIND controls PASID traffic. But this depends on the
++ * underlying IOMMU architecture and isn't guaranteed.
++ *
++ * Availability of this feature depends on the device, its bus, the underlying
++ * IOMMU and the CPU architecture.
++ *
++ * returns: 0 on success, -errno on failure.
++ */
++#define VFIO_IOMMU_BIND		_IO(VFIO_TYPE, VFIO_BASE + 22)
++
++/*
++ * VFIO_IOMMU_UNBIND - _IOWR(VFIO_TYPE, VFIO_BASE + 23, struct vfio_iommu_type1_bind)
++ *
++ * Undo what was done by the corresponding VFIO_IOMMU_BIND ioctl.
++ */
++#define VFIO_IOMMU_UNBIND	_IO(VFIO_TYPE, VFIO_BASE + 23)
++
++/*
++ * VFIO_IOMMU_SET_PASID_TABLE - _IOWR(VFIO_TYPE, VFIO_BASE + 18,
++ *			struct vfio_iommu_type1_set_pasid_table)
++ *
++ * The SET operation passes a PASID table to the host while the
++ * UNSET operation detaches the one currently programmed. It is
++ * allowed to "SET" the table several times without unsetting as
++ * long as the table config does not stay IOMMU_PASID_CONFIG_TRANSLATE.
++ */
++struct vfio_iommu_type1_set_pasid_table {
++	__u32	argsz;
++	__u32	flags;
++#define VFIO_PASID_TABLE_FLAG_SET	(1 << 0)
++#define VFIO_PASID_TABLE_FLAG_UNSET	(1 << 1)
++	struct iommu_pasid_table_config config; /* used on SET */
++};
++
++#define VFIO_IOMMU_SET_PASID_TABLE	_IO(VFIO_TYPE, VFIO_BASE + 18)
++
++/**
++ * VFIO_IOMMU_CACHE_INVALIDATE - _IOWR(VFIO_TYPE, VFIO_BASE + 19,
++ *			struct vfio_iommu_type1_cache_invalidate)
++ *
++ * Propagate guest IOMMU cache invalidation to the host.
++ */
++struct vfio_iommu_type1_cache_invalidate {
++	__u32   argsz;
++	__u32   flags;
++	struct iommu_cache_invalidate_info info;
++};
++#define VFIO_IOMMU_CACHE_INVALIDATE      _IO(VFIO_TYPE, VFIO_BASE + 19)
++
++/**
++ * VFIO_IOMMU_SET_MSI_BINDING - _IOWR(VFIO_TYPE, VFIO_BASE + 20,
++ *			struct vfio_iommu_type1_set_msi_binding)
++ *
++ * Pass a stage 1 MSI doorbell mapping to the host so that the
++ * host can build a nested stage 2 mapping. Or conversely tear
++ * down a previously bound stage 1 MSI binding.
++ */
++struct vfio_iommu_type1_set_msi_binding {
++	__u32   argsz;
++	__u32   flags;
++#define VFIO_IOMMU_BIND_MSI	(1 << 0)
++#define VFIO_IOMMU_UNBIND_MSI	(1 << 1)
++	__u64	iova;	/* MSI guest IOVA */
++	/* Fields below are used on BIND */
++	__u64	gpa;	/* MSI guest physical address */
++	__u64	size;	/* size of stage1 mapping (bytes) */
++};
++#define VFIO_IOMMU_SET_MSI_BINDING      _IO(VFIO_TYPE, VFIO_BASE + 20)
++
+ /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
+ 
+ /*
+-- 
+2.27.0
+
-- 
Gitee


From 24678ea00b6e15fe3eabad4a2d05784e4e795cf0 Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Tue, 4 Sep 2018 08:43:05 -0400
Subject: [PATCH 15/48] memory: Add new fields in IOTLBEntry

The current IOTLBEntry becomes too simple to interact with
some physical IOMMUs. IOTLBs can be invalidated with different
granularities: domain, pasid, addr. Current IOTLB entry only offers
page selective invalidation. Let's add a granularity field
that conveys this information.

TLB entries are usually tagged with some ids such as the asid
or pasid. When propagating an invalidation command from the
guest to the host, we need to pass those IDs.

Also we add a leaf field which indicates, in case of invalidation
notification, whether only cache entries for the last level of
translation are required to be invalidated.

A flag field is introduced to inform whether those fields are set.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
---
 memory-Add-new-fields-in-IOTLBEntry.patch | 84 +++++++++++++++++++++++
 1 file changed, 84 insertions(+)
 create mode 100644 memory-Add-new-fields-in-IOTLBEntry.patch

diff --git a/memory-Add-new-fields-in-IOTLBEntry.patch b/memory-Add-new-fields-in-IOTLBEntry.patch
new file mode 100644
index 0000000..d76ff3b
--- /dev/null
+++ b/memory-Add-new-fields-in-IOTLBEntry.patch
@@ -0,0 +1,84 @@
+From 5a77056573d946eb9220b90dd1edce1f6f925c42 Mon Sep 17 00:00:00 2001
+From: Eric Auger <eric.auger@redhat.com>
+Date: Tue, 4 Sep 2018 08:43:05 -0400
+Subject: [PATCH] memory: Add new fields in IOTLBEntry
+
+The current IOTLBEntry becomes too simple to interact with
+some physical IOMMUs. IOTLBs can be invalidated with different
+granularities: domain, pasid, addr. Current IOTLB entry only offers
+page selective invalidation. Let's add a granularity field
+that conveys this information.
+
+TLB entries are usually tagged with some ids such as the asid
+or pasid. When propagating an invalidation command from the
+guest to the host, we need to pass those IDs.
+
+Also we add a leaf field which indicates, in case of invalidation
+notification, whether only cache entries for the last level of
+translation are required to be invalidated.
+
+A flag field is introduced to inform whether those fields are set.
+
+Signed-off-by: Eric Auger <eric.auger@redhat.com>
+Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
+---
+ include/exec/memory.h | 36 +++++++++++++++++++++++++++++++++++-
+ 1 file changed, 35 insertions(+), 1 deletion(-)
+
+diff --git a/include/exec/memory.h b/include/exec/memory.h
+index dca8184277..3c5206dce6 100644
+--- a/include/exec/memory.h
++++ b/include/exec/memory.h
+@@ -66,14 +66,48 @@ typedef enum {
+     IOMMU_RW   = 3,
+ } IOMMUAccessFlags;
+ 
++/* Granularity of the cache invalidation */
++typedef enum {
++    IOMMU_INV_GRAN_ADDR = 0,
++    IOMMU_INV_GRAN_PASID,
++    IOMMU_INV_GRAN_DOMAIN,
++} IOMMUInvGranularity;
++
+ #define IOMMU_ACCESS_FLAG(r, w) (((r) ? IOMMU_RO : 0) | ((w) ? IOMMU_WO : 0))
+ 
++/**
++ * IOMMUTLBEntry - IOMMU TLB entry
++ *
++ * Structure used when performing a translation or when notifying MAP or
++ * UNMAP (invalidation) events
++ *
++ * @target_as: target address space
++ * @iova: IO virtual address (input)
++ * @translated_addr: translated address (output)
++ * @addr_mask: address mask (0xfff means 4K binding), must be multiple of 2
++ * @perm: permission flag of the mapping (NONE encodes no mapping or
++ * invalidation notification)
++ * @granularity: granularity of the invalidation
++ * @flags: informs whether the following fields are set
++ * @arch_id: architecture specific ID tagging the TLB
++ * @pasid: PASID tagging the TLB
++ * @leaf: when @perm is NONE, indicates whether only caches for the last
++ * level of translation need to be invalidated.
++ */
+ struct IOMMUTLBEntry {
+     AddressSpace    *target_as;
+     hwaddr           iova;
+     hwaddr           translated_addr;
+-    hwaddr           addr_mask;  /* 0xfff = 4k translation */
++    hwaddr           addr_mask;
+     IOMMUAccessFlags perm;
++    IOMMUInvGranularity granularity;
++#define IOMMU_INV_FLAGS_PASID  (1 << 0)
++#define IOMMU_INV_FLAGS_ARCHID (1 << 1)
++#define IOMMU_INV_FLAGS_LEAF   (1 << 2)
++    uint32_t         flags;
++    uint32_t         arch_id;
++    uint32_t         pasid;
++    bool             leaf;
+ };
+ 
+ /*
+-- 
+2.27.0
+
-- 
Gitee


From be7575f6adf8751c99921929144979e4931b1809 Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Sun, 14 Feb 2021 12:30:57 -0500
Subject: [PATCH 16/48] hw/arm/smmuv3: Improve stage1 ASID invalidation

At the moment the ASID invalidation command (CMD_TLBI_NH_ASID) is
propagated as a domain invalidation (the whole notifier range
is invalidated regardless of any ASID information).

The new granularity field now allows us to be more precise and
restrict the invalidation to a particular ASID. Set the corresponding
fields and flag.

We still keep the iova and addr_mask settings for consumers that
do not support the new fields, like VHOST.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
---
 ...uv3-Improve-stage1-ASID-invalidation.patch | 105 ++++++++++++++++++
 1 file changed, 105 insertions(+)
 create mode 100644 hw-arm-smmuv3-Improve-stage1-ASID-invalidation.patch

diff --git a/hw-arm-smmuv3-Improve-stage1-ASID-invalidation.patch b/hw-arm-smmuv3-Improve-stage1-ASID-invalidation.patch
new file mode 100644
index 0000000..10639e8
--- /dev/null
+++ b/hw-arm-smmuv3-Improve-stage1-ASID-invalidation.patch
@@ -0,0 +1,105 @@
+From c0027c2e744c8ed99e937d3cbc88f400ab63a316 Mon Sep 17 00:00:00 2001
+From: Eric Auger <eric.auger@redhat.com>
+Date: Sun, 14 Feb 2021 12:30:57 -0500
+Subject: [PATCH] hw/arm/smmuv3: Improve stage1 ASID invalidation
+
+At the moment the ASID invalidation command (CMD_TLBI_NH_ASID) is
+propagated as a domain invalidation (the whole notifier range
+is invalidated regardless of any ASID information).
+
+The new granularity field now allows us to be more precise and
+restrict the invalidation to a particular ASID. Set the corresponding
+fields and flag.
+
+We still keep the iova and addr_mask settings for consumers that
+do not support the new fields, like VHOST.
+
+Signed-off-by: Eric Auger <eric.auger@redhat.com>
+Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
+---
+ hw/arm/smmuv3.c     | 42 ++++++++++++++++++++++++++++++++++++++++--
+ hw/arm/trace-events |  1 +
+ 2 files changed, 41 insertions(+), 2 deletions(-)
+
+diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
+index 3b5723e1e1..0ef1ca376c 100644
+--- a/hw/arm/smmuv3.c
++++ b/hw/arm/smmuv3.c
+@@ -827,6 +827,29 @@ static void smmuv3_notify_iova(IOMMUMemoryRegion *mr,
+     memory_region_notify_one(n, &entry);
+ }
+ 
++/**
++ * smmuv3_notify_asid - call the notifier @n for a given asid
++ *
++ * @mr: IOMMU mr region handle
++ * @n: notifier to be called
++ * @asid: address space ID or negative value if we don't care
++ */
++static void smmuv3_notify_asid(IOMMUMemoryRegion *mr,
++                               IOMMUNotifier *n, int asid)
++{
++    IOMMUTLBEntry entry;
++
++    entry.target_as = &address_space_memory;
++    entry.perm = IOMMU_NONE;
++    entry.granularity = IOMMU_INV_GRAN_PASID;
++    entry.flags = IOMMU_INV_FLAGS_ARCHID;
++    entry.arch_id = asid;
++    entry.iova = n->start;
++    entry.addr_mask = n->end - n->start;
++
++    memory_region_notify_one(n, &entry);
++}
++
+ /* invalidate an asid/iova tuple in all mr's */
+ static void smmuv3_inv_notifiers_iova(SMMUState *s, int asid, dma_addr_t iova)
+ {
+@@ -844,6 +867,22 @@ static void smmuv3_inv_notifiers_iova(SMMUState *s, int asid, dma_addr_t iova)
+     }
+ }
+ 
++static void smmuv3_s1_asid_inval(SMMUState *s, uint16_t asid)
++{
++    SMMUDevice *sdev;
++
++    trace_smmuv3_s1_asid_inval(asid);
++    QLIST_FOREACH(sdev, &s->devices_with_notifiers, next) {
++        IOMMUMemoryRegion *mr = &sdev->iommu;
++        IOMMUNotifier *n;
++
++        IOMMU_NOTIFIER_FOREACH(n, mr) {
++            smmuv3_notify_asid(mr, n, asid);
++        }
++    }
++    smmu_iotlb_inv_asid(s, asid);
++}
++
+ static int smmuv3_cmdq_consume(SMMUv3State *s)
+ {
+     SMMUState *bs = ARM_SMMU(s);
+@@ -963,8 +1002,7 @@ static int smmuv3_cmdq_consume(SMMUv3State *s)
+             uint16_t asid = CMD_ASID(&cmd);
+ 
+             trace_smmuv3_cmdq_tlbi_nh_asid(asid);
+-            smmu_inv_notifiers_all(&s->smmu_state);
+-            smmu_iotlb_inv_asid(bs, asid);
++            smmuv3_s1_asid_inval(bs, asid);
+             break;
+         }
+         case SMMU_CMD_TLBI_NH_ALL:
+diff --git a/hw/arm/trace-events b/hw/arm/trace-events
+index 0acedcedc6..4512d20115 100644
+--- a/hw/arm/trace-events
++++ b/hw/arm/trace-events
+@@ -44,6 +44,7 @@ smmuv3_config_cache_hit(uint32_t sid, uint32_t hits, uint32_t misses, uint32_t p
+ smmuv3_config_cache_miss(uint32_t sid, uint32_t hits, uint32_t misses, uint32_t perc) "Config cache MISS for sid %d (hits=%d, misses=%d, hit rate=%d)"
+ smmuv3_cmdq_tlbi_nh_va(int vmid, int asid, uint64_t addr, bool leaf) "vmid =%d asid =%d addr=0x%"PRIx64" leaf=%d"
+ smmuv3_cmdq_tlbi_nh_vaa(int vmid, uint64_t addr) "vmid =%d addr=0x%"PRIx64
++smmuv3_s1_asid_inval(int asid) "asid=%d"
+ smmuv3_cmdq_tlbi_nh(void) ""
+ smmuv3_cmdq_tlbi_nh_asid(uint16_t asid) "asid=%d"
+ smmu_iotlb_cache_hit(uint16_t asid, uint64_t addr, uint32_t hit, uint32_t miss, uint32_t p) "IOTLB cache HIT asid=%d addr=0x%"PRIx64" hit=%d miss=%d hit rate=%d"
+-- 
+2.27.0
+
-- 
Gitee


From 30cc257bc51c87be0f2354db50fff6960ac93f89 Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Fri, 19 Mar 2021 12:22:48 -0400
Subject: [PATCH 17/48] hw/arm/smmu-common: Allow domain invalidation for
 NH_ALL/NSNH_ALL

NH_ALL/NSNH_ALL corresponds to a domain granularity invalidation,
i.e. the whole notifier range gets invalidated, whatever the ASID.
So let's set the granularity to IOMMU_INV_GRAN_DOMAIN to allow
the consumer to benefit from the info if it can.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Suggested-by: chenxiang (M) <chenxiang66@hisilicon.com>
Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
---
 ...on-Allow-domain-invalidation-for-NH_.patch | 33 +++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 hw-arm-smmu-common-Allow-domain-invalidation-for-NH_.patch

diff --git a/hw-arm-smmu-common-Allow-domain-invalidation-for-NH_.patch b/hw-arm-smmu-common-Allow-domain-invalidation-for-NH_.patch
new file mode 100644
index 0000000..1824b8e
--- /dev/null
+++ b/hw-arm-smmu-common-Allow-domain-invalidation-for-NH_.patch
@@ -0,0 +1,33 @@
+From 8bf9d1dc67335c1fb921a56825f6bf198a568091 Mon Sep 17 00:00:00 2001
+From: Eric Auger <eric.auger@redhat.com>
+Date: Fri, 19 Mar 2021 12:22:48 -0400
+Subject: [PATCH] hw/arm/smmu-common: Allow domain invalidation for
+ NH_ALL/NSNH_ALL
+
+NH_ALL/NSNH_ALL corresponds to a domain granularity invalidation,
+i.e. the whole notifier range gets invalidated, whatever the ASID.
+So let's set the granularity to IOMMU_INV_GRAN_DOMAIN to allow
+the consumer to benefit from the info if it can.
+
+Signed-off-by: Eric Auger <eric.auger@redhat.com>
+Suggested-by: chenxiang (M) <chenxiang66@hisilicon.com>
+Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
+---
+ hw/arm/smmu-common.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c
+index 717d22bcbe..de9468d33f 100644
+--- a/hw/arm/smmu-common.c
++++ b/hw/arm/smmu-common.c
+@@ -395,6 +395,7 @@ static void smmu_unmap_notifier_range(IOMMUNotifier *n)
+     entry.iova = n->start;
+     entry.perm = IOMMU_NONE;
+     entry.addr_mask = n->end - n->start;
++    entry.granularity = IOMMU_INV_GRAN_DOMAIN;
+ 
+     memory_region_notify_one(n, &entry);
+ }
+-- 
+2.27.0
+
-- 
Gitee


From 6ddda50d8af78363dccf90d668bf63af6dcc01f9 Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Mon, 1 Jul 2019 11:30:30 +0200
Subject: [PATCH 18/48] memory: Add IOMMU_ATTR_VFIO_NESTED IOMMU memory region
 attribute

We introduce a new IOMMU Memory Region attribute,
IOMMU_ATTR_VFIO_NESTED that tells whether the virtual IOMMU
requires HW nested paging for VFIO integration.

Current Intel virtual IOMMU device supports "Caching
Mode" and does not require 2 stages at physical level to be
integrated with VFIO. However SMMUv3 does not implement such
"caching mode" and requires to use HW nested paging.

As such SMMUv3 is the first IOMMU device to advertise this
attribute.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
---
 ..._ATTR_VFIO_NESTED-IOMMU-memory-regio.patch | 72 +++++++++++++++++++
 1 file changed, 72 insertions(+)
 create mode 100644 memory-Add-IOMMU_ATTR_VFIO_NESTED-IOMMU-memory-regio.patch

diff --git a/memory-Add-IOMMU_ATTR_VFIO_NESTED-IOMMU-memory-regio.patch b/memory-Add-IOMMU_ATTR_VFIO_NESTED-IOMMU-memory-regio.patch
new file mode 100644
index 0000000..3932161
--- /dev/null
+++ b/memory-Add-IOMMU_ATTR_VFIO_NESTED-IOMMU-memory-regio.patch
@@ -0,0 +1,72 @@
+From 5f4291f431add76b8754a5fb2d62ab4108ece73f Mon Sep 17 00:00:00 2001
+From: Eric Auger <eric.auger@redhat.com>
+Date: Mon, 1 Jul 2019 11:30:30 +0200
+Subject: [PATCH] memory: Add IOMMU_ATTR_VFIO_NESTED IOMMU memory region
+ attribute
+
+We introduce a new IOMMU Memory Region attribute,
+IOMMU_ATTR_VFIO_NESTED that tells whether the virtual IOMMU
+requires HW nested paging for VFIO integration.
+
+Current Intel virtual IOMMU device supports "Caching
+Mode" and does not require 2 stages at physical level to be
+integrated with VFIO. However SMMUv3 does not implement such
+"caching mode" and requires to use HW nested paging.
+
+As such SMMUv3 is the first IOMMU device to advertise this
+attribute.
+
+Signed-off-by: Eric Auger <eric.auger@redhat.com>
+Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
+---
+ hw/arm/smmuv3.c       | 12 ++++++++++++
+ include/exec/memory.h |  3 ++-
+ 2 files changed, 14 insertions(+), 1 deletion(-)
+
+diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
+index 0ef1ca376c..55eed5189e 100644
+--- a/hw/arm/smmuv3.c
++++ b/hw/arm/smmuv3.c
+@@ -1531,6 +1531,17 @@ static void smmuv3_notify_flag_changed(IOMMUMemoryRegion *iommu,
+     }
+ }
+ 
++static int smmuv3_get_attr(IOMMUMemoryRegion *iommu,
++                           enum IOMMUMemoryRegionAttr attr,
++                           void *data)
++{
++    if (attr == IOMMU_ATTR_VFIO_NESTED) {
++        *(bool *) data = true;
++        return 0;
++    }
++    return -EINVAL;
++}
++
+ static void smmuv3_iommu_memory_region_class_init(ObjectClass *klass,
+                                                   void *data)
+ {
+@@ -1538,6 +1549,7 @@ static void smmuv3_iommu_memory_region_class_init(ObjectClass *klass,
+ 
+     imrc->translate = smmuv3_translate;
+     imrc->notify_flag_changed = smmuv3_notify_flag_changed;
++    imrc->get_attr = smmuv3_get_attr;
+ }
+ 
+ static const TypeInfo smmuv3_type_info = {
+diff --git a/include/exec/memory.h b/include/exec/memory.h
+index 3c5206dce6..74606e14aa 100644
+--- a/include/exec/memory.h
++++ b/include/exec/memory.h
+@@ -240,7 +240,8 @@ struct MemoryRegionOps {
+ };
+ 
+ enum IOMMUMemoryRegionAttr {
+-    IOMMU_ATTR_SPAPR_TCE_FD
++    IOMMU_ATTR_SPAPR_TCE_FD,
++    IOMMU_ATTR_VFIO_NESTED,
+ };
+ 
+ /**
+-- 
+2.27.0
+
-- 
Gitee


From f026ae6d35ea3d3bd684763b5abab2afe30a275e Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Mon, 25 Mar 2019 16:35:05 +0100
Subject: [PATCH 19/48] memory: Add IOMMU_ATTR_MSI_TRANSLATE IOMMU memory
 region attribute

We introduce a new IOMMU Memory Region attribute, IOMMU_ATTR_MSI_TRANSLATE
which tells whether the virtual IOMMU translates MSIs. ARM SMMU
will expose this attribute since, as opposed to Intel DMAR, MSIs
are translated as any other DMA requests.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
---
 ..._ATTR_MSI_TRANSLATE-IOMMU-memory-reg.patch | 32 +++++++++++++++++++
 1 file changed, 32 insertions(+)
 create mode 100644 memory-Add-IOMMU_ATTR_MSI_TRANSLATE-IOMMU-memory-reg.patch

diff --git a/memory-Add-IOMMU_ATTR_MSI_TRANSLATE-IOMMU-memory-reg.patch b/memory-Add-IOMMU_ATTR_MSI_TRANSLATE-IOMMU-memory-reg.patch
new file mode 100644
index 0000000..c67de46
--- /dev/null
+++ b/memory-Add-IOMMU_ATTR_MSI_TRANSLATE-IOMMU-memory-reg.patch
@@ -0,0 +1,32 @@
+From b7f4f3b71a179a21a90ca32ef7d6ea000fb0e3bd Mon Sep 17 00:00:00 2001
+From: Eric Auger <eric.auger@redhat.com>
+Date: Mon, 25 Mar 2019 16:35:05 +0100
+Subject: [PATCH] memory: Add IOMMU_ATTR_MSI_TRANSLATE IOMMU memory region
+ attribute
+
+We introduce a new IOMMU Memory Region attribute, IOMMU_ATTR_MSI_TRANSLATE
+which tells whether the virtual IOMMU translates MSIs. ARM SMMU
+will expose this attribute since, as opposed to Intel DMAR, MSIs
+are translated as any other DMA requests.
+
+Signed-off-by: Eric Auger <eric.auger@redhat.com>
+Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
+---
+ include/exec/memory.h | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/include/exec/memory.h b/include/exec/memory.h
+index 74606e14aa..716b07e115 100644
+--- a/include/exec/memory.h
++++ b/include/exec/memory.h
+@@ -242,6 +242,7 @@ struct MemoryRegionOps {
+ enum IOMMUMemoryRegionAttr {
+     IOMMU_ATTR_SPAPR_TCE_FD,
+     IOMMU_ATTR_VFIO_NESTED,
++    IOMMU_ATTR_MSI_TRANSLATE,
+ };
+ 
+ /**
+-- 
+2.27.0
+
-- 
Gitee


From 42c9d0a3d0cc3bd05f4f07ed4a61ea3242dd67fd Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Thu, 13 Sep 2018 14:13:04 +0200
Subject: [PATCH 20/48] memory: Introduce IOMMU Memory Region inject_faults API

This new API allows injecting @count iommu_faults into
the IOMMU memory region.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
---
 ...-IOMMU-Memory-Region-inject_faults-A.patch | 89 +++++++++++++++++++
 1 file changed, 89 insertions(+)
 create mode 100644 memory-Introduce-IOMMU-Memory-Region-inject_faults-A.patch

diff --git a/memory-Introduce-IOMMU-Memory-Region-inject_faults-A.patch b/memory-Introduce-IOMMU-Memory-Region-inject_faults-A.patch
new file mode 100644
index 0000000..7cecd31
--- /dev/null
+++ b/memory-Introduce-IOMMU-Memory-Region-inject_faults-A.patch
@@ -0,0 +1,89 @@
+From 497e055ed89e3cb5286dde2b05b7d7fd67e69331 Mon Sep 17 00:00:00 2001
+From: Eric Auger <eric.auger@redhat.com>
+Date: Thu, 13 Sep 2018 14:13:04 +0200
+Subject: [PATCH] memory: Introduce IOMMU Memory Region inject_faults API
+
+This new API allows injecting @count iommu_faults into
+the IOMMU memory region.
+
+Signed-off-by: Eric Auger <eric.auger@redhat.com>
+Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
+---
+ include/exec/memory.h | 25 +++++++++++++++++++++++++
+ memory.c              | 10 ++++++++++
+ 2 files changed, 35 insertions(+)
+
+diff --git a/include/exec/memory.h b/include/exec/memory.h
+index 716b07e115..ffd4282f14 100644
+--- a/include/exec/memory.h
++++ b/include/exec/memory.h
+@@ -56,6 +56,8 @@ struct MemoryRegionMmio {
+     CPUWriteMemoryFunc *write[3];
+ };
+ 
++struct iommu_fault;
++
+ typedef struct IOMMUTLBEntry IOMMUTLBEntry;
+ 
+ /* See address_space_translate: bit 0 is read, bit 1 is write.  */
+@@ -378,6 +380,19 @@ typedef struct IOMMUMemoryRegionClass {
+      * @iommu: the IOMMUMemoryRegion
+      */
+     int (*num_indexes)(IOMMUMemoryRegion *iommu);
++
++    /*
++     * Inject @count faults into the IOMMU memory region
++     *
++     * Optional method: if this method is not provided, then
++     * memory_region_inject_faults() will return -ENOENT
++     *
++     * @iommu: the IOMMU memory region to inject the faults in
++     * @count: number of faults to inject
++     * @buf: fault buffer
++     */
++    int (*inject_faults)(IOMMUMemoryRegion *iommu, int count,
++                         struct iommu_fault *buf);
+ } IOMMUMemoryRegionClass;
+ 
+ typedef struct CoalescedMemoryRange CoalescedMemoryRange;
+@@ -1182,6 +1197,16 @@ int memory_region_iommu_attrs_to_index(IOMMUMemoryRegion *iommu_mr,
+  */
+ int memory_region_iommu_num_indexes(IOMMUMemoryRegion *iommu_mr);
+ 
++/**
++ * memory_region_inject_faults : inject @count faults stored in @buf
++ *
++ * @iommu_mr: the IOMMU memory region
++ * @count: number of faults to be injected
++ * @buf: buffer containing the faults
++ */
++int memory_region_inject_faults(IOMMUMemoryRegion *iommu_mr, int count,
++                                struct iommu_fault *buf);
++
+ /**
+  * memory_region_name: get a memory region's name
+  *
+diff --git a/memory.c b/memory.c
+index 708b3dff3d..623f89baa4 100644
+--- a/memory.c
++++ b/memory.c
+@@ -2017,6 +2017,16 @@ int memory_region_iommu_num_indexes(IOMMUMemoryRegion *iommu_mr)
+     return imrc->num_indexes(iommu_mr);
+ }
+ 
++int memory_region_inject_faults(IOMMUMemoryRegion *iommu_mr, int count,
++                                struct iommu_fault *buf)
++{
++    IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_GET_CLASS(iommu_mr);
++    if (!imrc->inject_faults) {
++        return -ENOENT;
++    }
++    return imrc->inject_faults(iommu_mr, count, buf);
++}
++
+ void memory_region_set_log(MemoryRegion *mr, bool log, unsigned client)
+ {
+     uint8_t mask = 1 << client;
+-- 
+2.27.0
+
-- 
Gitee


From 3ff7455793d96c6034aa506e1578b064df234bce Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Tue, 9 Jul 2019 12:20:12 +0200
Subject: [PATCH 21/48] iommu: Introduce generic header

This header is meant to expose data types used by
several IOMMU devices, such as structs for SVA and
nested stage configuration.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
---
 iommu-Introduce-generic-header.patch | 53 ++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)
 create mode 100644 iommu-Introduce-generic-header.patch

diff --git a/iommu-Introduce-generic-header.patch b/iommu-Introduce-generic-header.patch
new file mode 100644
index 0000000..76e0c0c
--- /dev/null
+++ b/iommu-Introduce-generic-header.patch
@@ -0,0 +1,53 @@
+From e8055075dbbc932afccc1f18f4acc093fe9e4dc3 Mon Sep 17 00:00:00 2001
+From: Eric Auger <eric.auger@redhat.com>
+Date: Tue, 9 Jul 2019 12:20:12 +0200
+Subject: [PATCH] iommu: Introduce generic header
+
+This header is meant to expose data types used by
+several IOMMU devices, such as structs for SVA and
+nested stage configuration.
+
+Signed-off-by: Eric Auger <eric.auger@redhat.com>
+Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
+---
+ include/hw/iommu/iommu.h | 28 ++++++++++++++++++++++++++++
+ 1 file changed, 28 insertions(+)
+ create mode 100644 include/hw/iommu/iommu.h
+
+diff --git a/include/hw/iommu/iommu.h b/include/hw/iommu/iommu.h
+new file mode 100644
+index 0000000000..12092bda7b
+--- /dev/null
++++ b/include/hw/iommu/iommu.h
+@@ -0,0 +1,28 @@
++/*
++ * common header for iommu devices
++ *
++ * Copyright Red Hat, Inc. 2019
++ *
++ * Authors:
++ *  Eric Auger <eric.auger@redhat.com>
++ *
++ * This work is licensed under the terms of the GNU GPL, version 2.  See
++ * the COPYING file in the top-level directory.
++ */
++
++#ifndef QEMU_HW_IOMMU_IOMMU_H
++#define QEMU_HW_IOMMU_IOMMU_H
++#ifdef __linux__
++#include <linux/iommu.h>
++#endif
++
++typedef struct IOMMUConfig {
++    union {
++#ifdef __linux__
++        struct iommu_pasid_table_config pasid_cfg;
++#endif
++          };
++} IOMMUConfig;
++
++
++#endif /* QEMU_HW_IOMMU_IOMMU_H */
+-- 
+2.27.0
+
-- 
Gitee


From a0fc43becbf9148097f587722d6448bbd7eba1eb Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Fri, 5 Jul 2019 19:01:36 +0800
Subject: [PATCH 22/48] pci: introduce PCIPASIDOps to PCIDevice

This patch introduces PCIPASIDOps for IOMMU related operations.

https://lists.gnu.org/archive/html/qemu-devel/2018-03/msg00078.html
https://lists.gnu.org/archive/html/qemu-devel/2018-03/msg00940.html

So far, to setup virt-SVA for assigned SVA capable device, needs to
configure host translation structures for specific pasid. (e.g. bind
guest page table to host and enable nested translation in host).
Besides, vIOMMU emulator needs to forward guest's cache invalidation
to host since host nested translation is enabled. e.g. on VT-d, guest
owns 1st level translation table, thus cache invalidation for 1st
level should be propagated to host.

This patch adds two functions: alloc_pasid and free_pasid to support
guest pasid allocation and free. The implementations of the callbacks
would be device passthru modules. Like vfio.

Cc: Kevin Tian <kevin.tian@intel.com>
Cc: Jacob Pan <jacob.jun.pan@linux.intel.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Eric Auger <eric.auger@redhat.com>
Cc: Yi Sun <yi.y.sun@linux.intel.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
---
 pci-introduce-PCIPASIDOps-to-PCIDevice.patch | 127 +++++++++++++++++++
 1 file changed, 127 insertions(+)
 create mode 100644 pci-introduce-PCIPASIDOps-to-PCIDevice.patch

diff --git a/pci-introduce-PCIPASIDOps-to-PCIDevice.patch b/pci-introduce-PCIPASIDOps-to-PCIDevice.patch
new file mode 100644
index 0000000..e89cdc8
--- /dev/null
+++ b/pci-introduce-PCIPASIDOps-to-PCIDevice.patch
@@ -0,0 +1,127 @@
+From 26adddfe4645b69c16ed8d6601f373d40bddd0e3 Mon Sep 17 00:00:00 2001
+From: Liu Yi L <yi.l.liu@intel.com>
+Date: Fri, 5 Jul 2019 19:01:36 +0800
+Subject: [PATCH] pci: introduce PCIPASIDOps to PCIDevice
+
+This patch introduces PCIPASIDOps for IOMMU related operations.
+
+https://lists.gnu.org/archive/html/qemu-devel/2018-03/msg00078.html
+https://lists.gnu.org/archive/html/qemu-devel/2018-03/msg00940.html
+
+So far, to setup virt-SVA for assigned SVA capable device, needs to
+configure host translation structures for specific pasid. (e.g. bind
+guest page table to host and enable nested translation in host).
+Besides, vIOMMU emulator needs to forward guest's cache invalidation
+to host since host nested translation is enabled. e.g. on VT-d, guest
+owns 1st level translation table, thus cache invalidation for 1st
+level should be propagated to host.
+
+This patch adds two functions: alloc_pasid and free_pasid to support
+guest pasid allocation and free. The implementations of the callbacks
+would be device passthru modules. Like vfio.
+
+Cc: Kevin Tian <kevin.tian@intel.com>
+Cc: Jacob Pan <jacob.jun.pan@linux.intel.com>
+Cc: Peter Xu <peterx@redhat.com>
+Cc: Eric Auger <eric.auger@redhat.com>
+Cc: Yi Sun <yi.y.sun@linux.intel.com>
+Cc: David Gibson <david@gibson.dropbear.id.au>
+Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
+Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
+Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
+---
+ hw/pci/pci.c         | 34 ++++++++++++++++++++++++++++++++++
+ include/hw/pci/pci.h | 11 +++++++++++
+ 2 files changed, 45 insertions(+)
+
+diff --git a/hw/pci/pci.c b/hw/pci/pci.c
+index e74143ccc3..f11ca7964e 100644
+--- a/hw/pci/pci.c
++++ b/hw/pci/pci.c
+@@ -2626,6 +2626,40 @@ void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, void *opaque)
+     bus->iommu_opaque = opaque;
+ }
+ 
++void pci_setup_pasid_ops(PCIDevice *dev, PCIPASIDOps *ops)
++{
++    assert(ops && !dev->pasid_ops);
++    dev->pasid_ops = ops;
++}
++
++bool pci_device_is_pasid_ops_set(PCIBus *bus, int32_t devfn)
++{
++    PCIDevice *dev;
++
++    if (!bus) {
++        return false;
++    }
++
++    dev = bus->devices[devfn];
++    return !!(dev && dev->pasid_ops);
++}
++
++int pci_device_set_pasid_table(PCIBus *bus, int32_t devfn,
++                               IOMMUConfig *config)
++{
++    PCIDevice *dev;
++
++    if (!bus) {
++        return -EINVAL;
++    }
++
++    dev = bus->devices[devfn];
++    if (dev && dev->pasid_ops && dev->pasid_ops->set_pasid_table) {
++        return dev->pasid_ops->set_pasid_table(bus, devfn, config);
++    }
++    return -ENOENT;
++}
++
+ static void pci_dev_get_w64(PCIBus *b, PCIDevice *dev, void *opaque)
+ {
+     Range *range = opaque;
+diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
+index aaf1b9f70d..bb14ed61b0 100644
+--- a/include/hw/pci/pci.h
++++ b/include/hw/pci/pci.h
+@@ -9,6 +9,7 @@
+ #include "hw/isa/isa.h"
+ 
+ #include "hw/pci/pcie.h"
++#include "hw/iommu/iommu.h"
+ 
+ extern bool pci_available;
+ 
+@@ -263,6 +264,11 @@ struct PCIReqIDCache {
+ };
+ typedef struct PCIReqIDCache PCIReqIDCache;
+ 
++struct PCIPASIDOps {
++    int (*set_pasid_table)(PCIBus *bus, int32_t devfn, IOMMUConfig *config);
++};
++typedef struct PCIPASIDOps PCIPASIDOps;
++
+ struct PCIDevice {
+     DeviceState qdev;
+ 
+@@ -352,6 +358,7 @@ struct PCIDevice {
+     MSIVectorUseNotifier msix_vector_use_notifier;
+     MSIVectorReleaseNotifier msix_vector_release_notifier;
+     MSIVectorPollNotifier msix_vector_poll_notifier;
++    PCIPASIDOps *pasid_ops;
+ };
+ 
+ void pci_register_bar(PCIDevice *pci_dev, int region_num,
+@@ -485,6 +492,10 @@ typedef AddressSpace *(*PCIIOMMUFunc)(PCIBus *, void *, int);
+ AddressSpace *pci_device_iommu_address_space(PCIDevice *dev);
+ void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, void *opaque);
+ 
++void pci_setup_pasid_ops(PCIDevice *dev, PCIPASIDOps *ops);
++bool pci_device_is_pasid_ops_set(PCIBus *bus, int32_t devfn);
++int pci_device_set_pasid_table(PCIBus *bus, int32_t devfn, IOMMUConfig *config);
++
+ static inline void
+ pci_set_byte(uint8_t *config, uint8_t val)
+ {
+-- 
+2.27.0
+
-- 
Gitee


From cfcd9dd6b100bd0f204d6f94faededdc8b91962f Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Tue, 28 Aug 2018 16:16:20 +0200
Subject: [PATCH 23/48] vfio: Force nested if iommu requires it

In case we detect the address space is translated by
a virtual IOMMU which requires HW nested paging to
integrate with VFIO, let's set up the container with
the VFIO_TYPE1_NESTING_IOMMU iommu_type.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
---
 vfio-Force-nested-if-iommu-requires-it.patch | 100 +++++++++++++++++++
 1 file changed, 100 insertions(+)
 create mode 100644 vfio-Force-nested-if-iommu-requires-it.patch

diff --git a/vfio-Force-nested-if-iommu-requires-it.patch b/vfio-Force-nested-if-iommu-requires-it.patch
new file mode 100644
index 0000000..6a6b9da
--- /dev/null
+++ b/vfio-Force-nested-if-iommu-requires-it.patch
@@ -0,0 +1,100 @@
+From e4122a95a30cd58e1cd6e1742928e68aa94fd7ee Mon Sep 17 00:00:00 2001
+From: Eric Auger <eric.auger@redhat.com>
+Date: Tue, 28 Aug 2018 16:16:20 +0200
+Subject: [PATCH] vfio: Force nested if iommu requires it
+
+In case we detect the address space is translated by
+a virtual IOMMU which requires HW nested paging to
+integrate with VFIO, let's set up the container with
+the VFIO_TYPE1_NESTING_IOMMU iommu_type.
+
+Signed-off-by: Eric Auger <eric.auger@redhat.com>
+Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
+---
+ hw/vfio/common.c | 36 ++++++++++++++++++++++++++++--------
+ 1 file changed, 28 insertions(+), 8 deletions(-)
+
+diff --git a/hw/vfio/common.c b/hw/vfio/common.c
+index fefa2ccfdf..c78b58d365 100644
+--- a/hw/vfio/common.c
++++ b/hw/vfio/common.c
+@@ -1683,27 +1683,38 @@ static void vfio_put_address_space(VFIOAddressSpace *space)
+  * vfio_get_iommu_type - selects the richest iommu_type (v2 first)
+  */
+ static int vfio_get_iommu_type(VFIOContainer *container,
++                               bool want_nested,
+                                Error **errp)
+ {
+-    int iommu_types[] = { VFIO_TYPE1v2_IOMMU, VFIO_TYPE1_IOMMU,
++    int iommu_types[] = { VFIO_TYPE1_NESTING_IOMMU,
++                          VFIO_TYPE1v2_IOMMU, VFIO_TYPE1_IOMMU,
+                           VFIO_SPAPR_TCE_v2_IOMMU, VFIO_SPAPR_TCE_IOMMU };
+-    int i;
++    int i, ret = -EINVAL;
+ 
+     for (i = 0; i < ARRAY_SIZE(iommu_types); i++) {
+         if (ioctl(container->fd, VFIO_CHECK_EXTENSION, iommu_types[i])) {
+-            return iommu_types[i];
++            if (iommu_types[i] == VFIO_TYPE1_NESTING_IOMMU && !want_nested) {
++                continue;
++            }
++            ret = iommu_types[i];
++            break;
+         }
+     }
+-    error_setg(errp, "No available IOMMU models");
+-    return -EINVAL;
++    if (ret < 0) {
++        error_setg(errp, "No available IOMMU models");
++    } else if (want_nested && ret != VFIO_TYPE1_NESTING_IOMMU) {
++        error_setg(errp, "Nested mode requested but not supported");
++        ret = -EINVAL;
++    }
++    return ret;
+ }
+ 
+ static int vfio_init_container(VFIOContainer *container, int group_fd,
+-                               Error **errp)
++                               bool want_nested, Error **errp)
+ {
+     int iommu_type, dirty_log_manual_clear, ret;
+ 
+-    iommu_type = vfio_get_iommu_type(container, errp);
++    iommu_type = vfio_get_iommu_type(container, want_nested, errp);
+     if (iommu_type < 0) {
+         return iommu_type;
+     }
+@@ -1815,6 +1826,14 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
+     VFIOContainer *container;
+     int ret, fd;
+     VFIOAddressSpace *space;
++    IOMMUMemoryRegion *iommu_mr;
++    bool nested = false;
++
++    if (memory_region_is_iommu(as->root)) {
++        iommu_mr = IOMMU_MEMORY_REGION(as->root);
++        memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_VFIO_NESTED,
++                                     (void *)&nested);
++    }
+ 
+     space = vfio_get_address_space(as);
+ 
+@@ -1879,13 +1898,14 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
+     QLIST_INIT(&container->hostwin_list);
+     QLIST_INIT(&container->dma_list);
+ 
+-    ret = vfio_init_container(container, group->fd, errp);
++    ret = vfio_init_container(container, group->fd, nested, errp);
+     if (ret) {
+         goto free_container_exit;
+     }
+     trace_vfio_connect_new_container(group->groupid, container->fd);
+ 
+     switch (container->iommu_type) {
++    case VFIO_TYPE1_NESTING_IOMMU:
+     case VFIO_TYPE1v2_IOMMU:
+     case VFIO_TYPE1_IOMMU:
+     {
+-- 
+2.27.0
+
-- 
Gitee


From 6c756b3010347fc756da9aefda1812d5809b3830 Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Fri, 22 Mar 2019 18:05:23 +0100
Subject: [PATCH 24/48] vfio: Introduce hostwin_from_range helper

Let's introduce a hostwin_from_range() helper that returns the
hostwin encapsulating an IOVA range or NULL if none is found.

This improves the readability of callers and removes the usage
of hostwin_found.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
---
 ...-Introduce-hostwin_from_range-helper.patch | 89 +++++++++++++++++++
 1 file changed, 89 insertions(+)
 create mode 100644 vfio-Introduce-hostwin_from_range-helper.patch

diff --git a/vfio-Introduce-hostwin_from_range-helper.patch b/vfio-Introduce-hostwin_from_range-helper.patch
new file mode 100644
index 0000000..b9a7099
--- /dev/null
+++ b/vfio-Introduce-hostwin_from_range-helper.patch
@@ -0,0 +1,89 @@
+From 25336cd596ff551293f1be6e108ad9277d80be0f Mon Sep 17 00:00:00 2001
+From: Eric Auger <eric.auger@redhat.com>
+Date: Fri, 22 Mar 2019 18:05:23 +0100
+Subject: [PATCH] vfio: Introduce hostwin_from_range helper
+
+Let's introduce a hostwin_from_range() helper that returns the
+hostwin encapsulating an IOVA range or NULL if none is found.
+
+This improves the readability of callers and removes the usage
+of hostwin_found.
+
+Signed-off-by: Eric Auger <eric.auger@redhat.com>
+Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
+---
+ hw/vfio/common.c | 36 +++++++++++++++++-------------------
+ 1 file changed, 17 insertions(+), 19 deletions(-)
+
+diff --git a/hw/vfio/common.c b/hw/vfio/common.c
+index c78b58d365..a8db784ac5 100644
+--- a/hw/vfio/common.c
++++ b/hw/vfio/common.c
+@@ -696,6 +696,19 @@ out:
+     rcu_read_unlock();
+ }
+ 
++static VFIOHostDMAWindow *
++hostwin_from_range(VFIOContainer *container, hwaddr iova, hwaddr end)
++{
++    VFIOHostDMAWindow *hostwin;
++
++    QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
++        if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
++            return hostwin;
++        }
++    }
++    return NULL;
++}
++
+ static void vfio_listener_region_add(MemoryListener *listener,
+                                      MemoryRegionSection *section)
+ {
+@@ -705,7 +718,6 @@ static void vfio_listener_region_add(MemoryListener *listener,
+     void *vaddr;
+     int ret;
+     VFIOHostDMAWindow *hostwin;
+-    bool hostwin_found;
+ 
+     if (vfio_listener_skipped_section(section)) {
+         trace_vfio_listener_region_add_skip(
+@@ -783,15 +795,8 @@ static void vfio_listener_region_add(MemoryListener *listener,
+ #endif
+     }
+ 
+-    hostwin_found = false;
+-    QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
+-        if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
+-            hostwin_found = true;
+-            break;
+-        }
+-    }
+-
+-    if (!hostwin_found) {
++    hostwin = hostwin_from_range(container, iova, end);
++    if (!hostwin) {
+         error_report("vfio: IOMMU container %p can't map guest IOVA region"
+                      " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx,
+                      container, iova, end);
+@@ -956,16 +961,9 @@ static void vfio_listener_region_del(MemoryListener *listener,
+ 
+     if (memory_region_is_ram_device(section->mr)) {
+         hwaddr pgmask;
+-        VFIOHostDMAWindow *hostwin;
+-        bool hostwin_found = false;
++        VFIOHostDMAWindow *hostwin = hostwin_from_range(container, iova, end);
+ 
+-        QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
+-            if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
+-                hostwin_found = true;
+-                break;
+-            }
+-        }
+-        assert(hostwin_found); /* or region_add() would have failed */
++        assert(hostwin); /* or region_add() would have failed */
+ 
+         pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
+         try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask));
+-- 
+2.27.0
+
-- 
Gitee


From 89a04f5725157458c83ea81135ab9dd201ecd350 Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Thu, 30 Aug 2018 15:04:25 +0200
Subject: [PATCH 25/48] vfio: Introduce helpers to DMA map/unmap a RAM section

Let's introduce two helpers that DMA map/unmap a RAM
section. Those helpers will be called for nested stage setup in
another call site. Also the vfio_listener_region_add/del()
structure may be clearer.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
---
 ...elpers-to-DMA-map-unmap-a-RAM-sectio.patch | 261 ++++++++++++++++++
 1 file changed, 261 insertions(+)
 create mode 100644 vfio-Introduce-helpers-to-DMA-map-unmap-a-RAM-sectio.patch

diff --git a/vfio-Introduce-helpers-to-DMA-map-unmap-a-RAM-sectio.patch b/vfio-Introduce-helpers-to-DMA-map-unmap-a-RAM-sectio.patch
new file mode 100644
index 0000000..124587d
--- /dev/null
+++ b/vfio-Introduce-helpers-to-DMA-map-unmap-a-RAM-sectio.patch
@@ -0,0 +1,261 @@
+From eb3bfdb61025efe2891ce6732b8829a48dd75e2d Mon Sep 17 00:00:00 2001
+From: Eric Auger <eric.auger@redhat.com>
+Date: Thu, 30 Aug 2018 15:04:25 +0200
+Subject: [PATCH] vfio: Introduce helpers to DMA map/unmap a RAM section
+
+Let's introduce two helpers that DMA map/unmap a RAM
+section. Those helpers will be called for nested stage setup in
+another call site. Also the vfio_listener_region_add/del()
+structure may be clearer.
+
+Signed-off-by: Eric Auger <eric.auger@redhat.com>
+Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
+---
+ hw/vfio/common.c     | 187 +++++++++++++++++++++++++++----------------
+ hw/vfio/trace-events |   4 +-
+ 2 files changed, 119 insertions(+), 72 deletions(-)
+
+diff --git a/hw/vfio/common.c b/hw/vfio/common.c
+index a8db784ac5..8837d33c57 100644
+--- a/hw/vfio/common.c
++++ b/hw/vfio/common.c
+@@ -709,13 +709,126 @@ hostwin_from_range(VFIOContainer *container, hwaddr iova, hwaddr end)
+     return NULL;
+ }
+ 
++static int vfio_dma_map_ram_section(VFIOContainer *container,
++                                    MemoryRegionSection *section)
++{
++    VFIOHostDMAWindow *hostwin;
++    Int128 llend, llsize;
++    hwaddr iova, end;
++    void *vaddr;
++    int ret;
++
++    assert(memory_region_is_ram(section->mr));
++
++    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
++    llend = int128_make64(section->offset_within_address_space);
++    llend = int128_add(llend, section->size);
++    llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));
++    end = int128_get64(int128_sub(llend, int128_one()));
++
++    vaddr = memory_region_get_ram_ptr(section->mr) +
++            section->offset_within_region +
++            (iova - section->offset_within_address_space);
++
++    hostwin = hostwin_from_range(container, iova, end);
++    if (!hostwin) {
++        error_report("vfio: IOMMU Container %p can't map guest IOVA region"
++                     " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx,
++                     container, iova, end);
++        return -EFAULT;
++    }
++
++    trace_vfio_dma_map_ram(iova, end, vaddr);
++
++    llsize = int128_sub(llend, int128_make64(iova));
++
++    if (memory_region_is_ram_device(section->mr)) {
++        hwaddr pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
++
++        if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) {
++            trace_vfio_listener_region_add_no_dma_map(
++                memory_region_name(section->mr),
++                section->offset_within_address_space,
++                int128_getlo(section->size),
++                pgmask + 1);
++            return 0;
++        }
++    }
++
++    ret = vfio_dma_map(container, iova, int128_get64(llsize),
++                       vaddr, section->readonly);
++    if (ret) {
++        error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
++                     "0x%"HWADDR_PRIx", %p) = %d (%m)",
++                     container, iova, int128_get64(llsize), vaddr, ret);
++        if (memory_region_is_ram_device(section->mr)) {
++            /* Allow unexpected mappings not to be fatal for RAM devices */
++            return 0;
++        }
++        return ret;
++    }
++    return 0;
++}
++
++static void vfio_dma_unmap_ram_section(VFIOContainer *container,
++                                       MemoryRegionSection *section)
++{
++    Int128 llend, llsize;
++    hwaddr iova, end;
++    bool try_unmap = true;
++    int ret;
++
++    iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
++    llend = int128_make64(section->offset_within_address_space);
++    llend = int128_add(llend, section->size);
++    llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask));
++
++    if (int128_ge(int128_make64(iova), llend)) {
++        return;
++    }
++    end = int128_get64(int128_sub(llend, int128_one()));
++
++    llsize = int128_sub(llend, int128_make64(iova));
++
++    trace_vfio_dma_unmap_ram(iova, end);
++
++    if (memory_region_is_ram_device(section->mr)) {
++        hwaddr pgmask;
++        VFIOHostDMAWindow *hostwin = hostwin_from_range(container, iova, end);
++
++        assert(hostwin); /* or region_add() would have failed */
++
++        pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
++        try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask));
++    }
++
++    if (try_unmap) {
++        if (int128_eq(llsize, int128_2_64())) {
++            /* The unmap ioctl doesn't accept a full 64-bit span. */
++            llsize = int128_rshift(llsize, 1);
++            ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL);
++            if (ret) {
++                error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
++                             "0x%"HWADDR_PRIx") = %d (%m)",
++                             container, iova, int128_get64(llsize), ret);
++            }
++            iova += int128_get64(llsize);
++        }
++        ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL);
++        if (ret) {
++            error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
++                         "0x%"HWADDR_PRIx") = %d (%m)",
++                         container, iova, int128_get64(llsize), ret);
++        }
++    }
++}
++
+ static void vfio_listener_region_add(MemoryListener *listener,
+                                      MemoryRegionSection *section)
+ {
+     VFIOContainer *container = container_of(listener, VFIOContainer, listener);
+     hwaddr iova, end;
+-    Int128 llend, llsize;
+-    void *vaddr;
++    Int128 llend;
+     int ret;
+     VFIOHostDMAWindow *hostwin;
+ 
+@@ -842,38 +955,7 @@ static void vfio_listener_region_add(MemoryListener *listener,
+     }
+ 
+     /* Here we assume that memory_region_is_ram(section->mr)==true */
+-
+-    vaddr = memory_region_get_ram_ptr(section->mr) +
+-            section->offset_within_region +
+-            (iova - section->offset_within_address_space);
+-
+-    trace_vfio_listener_region_add_ram(iova, end, vaddr);
+-
+-    llsize = int128_sub(llend, int128_make64(iova));
+-
+-    if (memory_region_is_ram_device(section->mr)) {
+-        hwaddr pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
+-
+-        if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) {
+-            trace_vfio_listener_region_add_no_dma_map(
+-                memory_region_name(section->mr),
+-                section->offset_within_address_space,
+-                int128_getlo(section->size),
+-                pgmask + 1);
+-            return;
+-        }
+-    }
+-
+-    ret = vfio_dma_map(container, iova, int128_get64(llsize),
+-                       vaddr, section->readonly);
+-    if (ret) {
+-        error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
+-                     "0x%"HWADDR_PRIx", %p) = %d (%m)",
+-                     container, iova, int128_get64(llsize), vaddr, ret);
+-        if (memory_region_is_ram_device(section->mr)) {
+-            /* Allow unexpected mappings not to be fatal for RAM devices */
+-            return;
+-        }
++    if (vfio_dma_map_ram_section(container, section)) {
+         goto fail;
+     }
+ 
+@@ -902,10 +984,6 @@ static void vfio_listener_region_del(MemoryListener *listener,
+                                      MemoryRegionSection *section)
+ {
+     VFIOContainer *container = container_of(listener, VFIOContainer, listener);
+-    hwaddr iova, end;
+-    Int128 llend, llsize;
+-    int ret;
+-    bool try_unmap = true;
+ 
+     if (vfio_listener_skipped_section(section)) {
+         trace_vfio_listener_region_del_skip(
+@@ -945,38 +1023,7 @@ static void vfio_listener_region_del(MemoryListener *listener,
+          */
+     }
+ 
+-    iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
+-    llend = int128_make64(section->offset_within_address_space);
+-    llend = int128_add(llend, section->size);
+-    llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask));
+-
+-    if (int128_ge(int128_make64(iova), llend)) {
+-        return;
+-    }
+-    end = int128_get64(int128_sub(llend, int128_one()));
+-
+-    llsize = int128_sub(llend, int128_make64(iova));
+-
+-    trace_vfio_listener_region_del(iova, end);
+-
+-    if (memory_region_is_ram_device(section->mr)) {
+-        hwaddr pgmask;
+-        VFIOHostDMAWindow *hostwin = hostwin_from_range(container, iova, end);
+-
+-        assert(hostwin); /* or region_add() would have failed */
+-
+-        pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
+-        try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask));
+-    }
+-
+-    if (try_unmap) {
+-        ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL);
+-        if (ret) {
+-            error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
+-                         "0x%"HWADDR_PRIx") = %d (%m)",
+-                         container, iova, int128_get64(llsize), ret);
+-        }
+-    }
++    vfio_dma_unmap_ram_section(container, section);
+ 
+     memory_region_unref(section->mr);
+ 
+diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
+index 561dc6e758..9b6c7ca61b 100644
+--- a/hw/vfio/trace-events
++++ b/hw/vfio/trace-events
+@@ -97,10 +97,10 @@ vfio_iommu_map_notify(const char *op, uint64_t iova_start, uint64_t iova_end) "i
+ vfio_listener_region_add_skip(uint64_t start, uint64_t end) "SKIPPING region_add 0x%"PRIx64" - 0x%"PRIx64
+ vfio_spapr_group_attach(int groupfd, int tablefd) "Attached groupfd %d to liobn fd %d"
+ vfio_listener_region_add_iommu(uint64_t start, uint64_t end) "region_add [iommu] 0x%"PRIx64" - 0x%"PRIx64
+-vfio_listener_region_add_ram(uint64_t iova_start, uint64_t iova_end, void *vaddr) "region_add [ram] 0x%"PRIx64" - 0x%"PRIx64" [%p]"
++vfio_dma_map_ram(uint64_t iova_start, uint64_t iova_end, void *vaddr) "region_add [ram] 0x%"PRIx64" - 0x%"PRIx64" [%p]"
+ vfio_listener_region_add_no_dma_map(const char *name, uint64_t iova, uint64_t size, uint64_t page_size) "Region \"%s\" 0x%"PRIx64" size=0x%"PRIx64" is not aligned to 0x%"PRIx64" and cannot be mapped for DMA"
+ vfio_listener_region_del_skip(uint64_t start, uint64_t end) "SKIPPING region_del 0x%"PRIx64" - 0x%"PRIx64
+-vfio_listener_region_del(uint64_t start, uint64_t end) "region_del 0x%"PRIx64" - 0x%"PRIx64
++vfio_dma_unmap_ram(uint64_t start, uint64_t end) "region_del 0x%"PRIx64" - 0x%"PRIx64
+ vfio_disconnect_container(int fd) "close container->fd=%d"
+ vfio_connect_existing_container(int groupid, int container_fd) "group=%d existing container fd=%d"
+ vfio_connect_new_container(int groupid, int container_fd) "group=%d new container fd=%d"
+-- 
+2.27.0
+
-- 
Gitee


From 39db503d4d2b594f12ae17fc950c7b953b3a2ee8 Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Wed, 29 Aug 2018 18:10:12 +0200
Subject: [PATCH 26/48] vfio: Set up nested stage mappings

In nested mode, legacy vfio_iommu_map_notify cannot be used as
there is no "caching" mode and we do not trap on map.

On Intel, vfio_iommu_map_notify was used to DMA map the RAM
through the host single stage.

With nested mode, we need to set up the stage 2 and the stage 1
separately. This patch introduces a prereg_listener to set up
the stage 2 mapping.

The stage 1 mapping, owned by the guest, is passed to the host
when the guest invalidates the stage 1 configuration, through
a dedicated PCIPASIDOps callback. Guest IOTLB invalidations
are cascaded down to the host through another IOMMU MR UNMAP
notifier.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
---
 vfio-Set-up-nested-stage-mappings.patch | 277 ++++++++++++++++++++++++
 1 file changed, 277 insertions(+)
 create mode 100644 vfio-Set-up-nested-stage-mappings.patch

diff --git a/vfio-Set-up-nested-stage-mappings.patch b/vfio-Set-up-nested-stage-mappings.patch
new file mode 100644
index 0000000..66659e8
--- /dev/null
+++ b/vfio-Set-up-nested-stage-mappings.patch
@@ -0,0 +1,277 @@
+From a65c40f9d1025a9843dec38070d9f26792b00892 Mon Sep 17 00:00:00 2001
+From: Eric Auger <eric.auger@redhat.com>
+Date: Wed, 29 Aug 2018 18:10:12 +0200
+Subject: [PATCH] vfio: Set up nested stage mappings
+
+In nested mode, legacy vfio_iommu_map_notify cannot be used as
+there is no "caching" mode and we do not trap on map.
+
+On Intel, vfio_iommu_map_notify was used to DMA map the RAM
+through the host single stage.
+
+With nested mode, we need to set up the stage 2 and the stage 1
+separately. This patch introduces a prereg_listener to set up
+the stage 2 mapping.
+
+The stage 1 mapping, owned by the guest, is passed to the host
+when the guest invalidates the stage 1 configuration, through
+a dedicated PCIPASIDOps callback. Guest IOTLB invalidations
+are cascaded down to the host through another IOMMU MR UNMAP
+notifier.
+
+Signed-off-by: Eric Auger <eric.auger@redhat.com>
+Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
+---
+ hw/vfio/common.c     | 136 +++++++++++++++++++++++++++++++++++++++++--
+ hw/vfio/pci.c        |  21 +++++++
+ hw/vfio/trace-events |   2 +
+ 3 files changed, 154 insertions(+), 5 deletions(-)
+
+diff --git a/hw/vfio/common.c b/hw/vfio/common.c
+index 8837d33c57..cc50efdbc1 100644
+--- a/hw/vfio/common.c
++++ b/hw/vfio/common.c
+@@ -642,6 +642,73 @@ static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
+     return true;
+ }
+ 
++/* Propagate a guest IOTLB invalidation to the host (nested mode) */
++static void vfio_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
++{
++    VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
++    struct vfio_iommu_type1_cache_invalidate ustruct = {};
++    VFIOContainer *container = giommu->container;
++    int ret;
++
++    assert(iotlb->perm == IOMMU_NONE);
++
++    ustruct.argsz = sizeof(ustruct);
++    ustruct.flags = 0;
++    ustruct.info.argsz = sizeof(struct iommu_cache_invalidate_info);
++    ustruct.info.version = IOMMU_CACHE_INVALIDATE_INFO_VERSION_1;
++    ustruct.info.cache = IOMMU_CACHE_INV_TYPE_IOTLB;
++
++    switch (iotlb->granularity) {
++    case IOMMU_INV_GRAN_DOMAIN:
++        ustruct.info.granularity = IOMMU_INV_GRANU_DOMAIN;
++        break;
++    case IOMMU_INV_GRAN_PASID:
++    {
++        struct iommu_inv_pasid_info *pasid_info;
++        int archid = -1;
++
++        pasid_info = &ustruct.info.granu.pasid_info;
++        ustruct.info.granularity = IOMMU_INV_GRANU_PASID;
++        if (iotlb->flags & IOMMU_INV_FLAGS_ARCHID) {
++            pasid_info->flags |= IOMMU_INV_ADDR_FLAGS_ARCHID;
++            archid = iotlb->arch_id;
++        }
++        pasid_info->archid = archid;
++        trace_vfio_iommu_asid_inv_iotlb(archid);
++        break;
++    }
++    case IOMMU_INV_GRAN_ADDR:
++    {
++        hwaddr start = iotlb->iova + giommu->iommu_offset;
++        struct iommu_inv_addr_info *addr_info;
++        size_t size = iotlb->addr_mask + 1;
++        int archid = -1;
++
++        addr_info = &ustruct.info.granu.addr_info;
++        ustruct.info.granularity = IOMMU_INV_GRANU_ADDR;
++        if (iotlb->leaf) {
++            addr_info->flags |= IOMMU_INV_ADDR_FLAGS_LEAF;
++        }
++        if (iotlb->flags & IOMMU_INV_FLAGS_ARCHID) {
++            addr_info->flags |= IOMMU_INV_ADDR_FLAGS_ARCHID;
++            archid = iotlb->arch_id;
++        }
++        addr_info->archid = archid;
++        addr_info->addr = start;
++        addr_info->granule_size = size;
++        addr_info->nb_granules = 1;
++        trace_vfio_iommu_addr_inv_iotlb(archid, start, size,
++                                        1, iotlb->leaf);
++        break;
++    }
++    }
++
++    ret = ioctl(container->fd, VFIO_IOMMU_CACHE_INVALIDATE, &ustruct);
++    if (ret) {
++        error_report("%p: failed to invalidate CACHE (%d)", container, ret);
++    }
++}
++
+ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
+ {
+     VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
+@@ -823,6 +890,32 @@ static void vfio_dma_unmap_ram_section(VFIOContainer *container,
+     }
+ }
+ 
++static void vfio_prereg_listener_region_add(MemoryListener *listener,
++                                            MemoryRegionSection *section)
++{
++    VFIOContainer *container =
++        container_of(listener, VFIOContainer, prereg_listener);
++
++    if (!memory_region_is_ram(section->mr)) {
++        return;
++    }
++
++    vfio_dma_map_ram_section(container, section);
++}
++
++static void vfio_prereg_listener_region_del(MemoryListener *listener,
++                                     MemoryRegionSection *section)
++{
++    VFIOContainer *container =
++        container_of(listener, VFIOContainer, prereg_listener);
++
++    if (!memory_region_is_ram(section->mr)) {
++        return;
++    }
++
++    vfio_dma_unmap_ram_section(container, section);
++}
++
+ static void vfio_listener_region_add(MemoryListener *listener,
+                                      MemoryRegionSection *section)
+ {
+@@ -920,9 +1013,10 @@ static void vfio_listener_region_add(MemoryListener *listener,
+     memory_region_ref(section->mr);
+ 
+     if (memory_region_is_iommu(section->mr)) {
++        IOMMUNotify notify;
+         VFIOGuestIOMMU *giommu;
+         IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
+-        int iommu_idx;
++        int iommu_idx, flags;
+ 
+         trace_vfio_listener_region_add_iommu(iova, end);
+         /*
+@@ -941,15 +1035,27 @@ static void vfio_listener_region_add(MemoryListener *listener,
+         llend = int128_sub(llend, int128_one());
+         iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
+                                                        MEMTXATTRS_UNSPECIFIED);
+-        iommu_notifier_init(&giommu->n, vfio_iommu_map_notify,
+-                            IOMMU_NOTIFIER_ALL,
++
++        if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
++            /* IOTLB unmap notifier to propagate guest IOTLB invalidations */
++            flags = IOMMU_NOTIFIER_UNMAP;
++            notify = vfio_iommu_unmap_notify;
++        } else {
++            /* MAP/UNMAP IOTLB notifier */
++            flags = IOMMU_NOTIFIER_ALL;
++            notify = vfio_iommu_map_notify;
++        }
++
++        iommu_notifier_init(&giommu->n, notify, flags,
+                             section->offset_within_region,
+                             int128_get64(llend),
+                             iommu_idx);
+         QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
+ 
+         memory_region_register_iommu_notifier(section->mr, &giommu->n);
+-        memory_region_iommu_replay(giommu->iommu, &giommu->n);
++        if (flags & IOMMU_NOTIFIER_MAP) {
++            memory_region_iommu_replay(giommu->iommu, &giommu->n);
++        }
+ 
+         return;
+     }
+@@ -1367,10 +1473,16 @@ static const MemoryListener vfio_memory_listener = {
+     .log_clear = vfio_listener_log_clear,
+ };
+ 
++static MemoryListener vfio_memory_prereg_listener = {
++    .region_add = vfio_prereg_listener_region_add,
++    .region_del = vfio_prereg_listener_region_del,
++};
++
+ static void vfio_listener_release(VFIOContainer *container)
+ {
+     memory_listener_unregister(&container->listener);
+-    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
++    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU ||
++        container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
+         memory_listener_unregister(&container->prereg_listener);
+     }
+ }
+@@ -1976,6 +2088,20 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
+             vfio_get_iommu_info_migration(container, info);
+         }
+         g_free(info);
++
++        if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
++            container->prereg_listener = vfio_memory_prereg_listener;
++            memory_listener_register(&container->prereg_listener,
++                                     &address_space_memory);
++            if (container->error) {
++                memory_listener_unregister(&container->prereg_listener);
++                ret = container->error;
++                error_setg(errp,
++                          "RAM memory listener initialization failed "
++                          "for container");
++                goto free_container_exit;
++            }
++        }
+         break;
+     }
+     case VFIO_SPAPR_TCE_v2_IOMMU:
+diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
+index 3641ad0c5c..6c90ec9278 100644
+--- a/hw/vfio/pci.c
++++ b/hw/vfio/pci.c
+@@ -2766,6 +2766,25 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
+     vdev->req_enabled = false;
+ }
+ 
++static int vfio_iommu_set_pasid_table(PCIBus *bus, int32_t devfn,
++                                      IOMMUConfig *config)
++{
++    PCIDevice *pdev = bus->devices[devfn];
++    VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
++    VFIOContainer *container = vdev->vbasedev.group->container;
++    struct vfio_iommu_type1_set_pasid_table info;
++
++    info.argsz = sizeof(info);
++    info.flags = VFIO_PASID_TABLE_FLAG_SET;
++    memcpy(&info.config, &config->pasid_cfg, sizeof(config->pasid_cfg));
++
++    return ioctl(container->fd, VFIO_IOMMU_SET_PASID_TABLE, &info);
++}
++
++static PCIPASIDOps vfio_pci_pasid_ops = {
++    .set_pasid_table = vfio_iommu_set_pasid_table,
++};
++
+ static void vfio_realize(PCIDevice *pdev, Error **errp)
+ {
+     VFIOPCIDevice *vdev = PCI_VFIO(pdev);
+@@ -3072,6 +3091,8 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
+     vfio_register_req_notifier(vdev);
+     vfio_setup_resetfn_quirk(vdev);
+ 
++    pci_setup_pasid_ops(pdev, &vfio_pci_pasid_ops);
++
+     return;
+ 
+ out_teardown:
+diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
+index 9b6c7ca61b..ee9a67d3ef 100644
+--- a/hw/vfio/trace-events
++++ b/hw/vfio/trace-events
+@@ -118,6 +118,8 @@ vfio_region_sparse_mmap_header(const char *name, int index, int nr_areas) "Devic
+ vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned long end) "sparse entry %d [0x%lx - 0x%lx]"
+ vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%0x8"
+ vfio_dma_unmap_overflow_workaround(void) ""
++vfio_iommu_addr_inv_iotlb(int asid, uint64_t addr, uint64_t size, uint64_t nb_granules, bool leaf) "nested IOTLB invalidate asid=%d, addr=0x%"PRIx64" granule_size=0x%"PRIx64" nb_granules=0x%"PRIx64" leaf=%d"
++vfio_iommu_asid_inv_iotlb(int asid) "nested IOTLB invalidate asid=%d"
+ 
+ # platform.c
+ vfio_platform_base_device_init(char *name, int groupid) "%s belongs to group #%d"
+-- 
+2.27.0
+
-- 
Gitee


From d159b4edba4961571b0a24b4a1c4bc9ed5d5156b Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Tue, 14 Aug 2018 08:08:11 -0400
Subject: [PATCH 27/48] vfio: Pass stage 1 MSI bindings to the host

We register the stage1 MSI bindings when enabling the vectors
and we unregister them on msi disable.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
---
 ...ass-stage-1-MSI-bindings-to-the-host.patch | 262 ++++++++++++++++++
 1 file changed, 262 insertions(+)
 create mode 100644 vfio-Pass-stage-1-MSI-bindings-to-the-host.patch

diff --git a/vfio-Pass-stage-1-MSI-bindings-to-the-host.patch b/vfio-Pass-stage-1-MSI-bindings-to-the-host.patch
new file mode 100644
index 0000000..1ad94b0
--- /dev/null
+++ b/vfio-Pass-stage-1-MSI-bindings-to-the-host.patch
@@ -0,0 +1,262 @@
+From 1729ae16dc557c0ad54cab3096b5cb6649d181ae Mon Sep 17 00:00:00 2001
+From: Eric Auger <eric.auger@redhat.com>
+Date: Tue, 14 Aug 2018 08:08:11 -0400
+Subject: [PATCH] vfio: Pass stage 1 MSI bindings to the host
+
+We register the stage1 MSI bindings when enabling the vectors
+and we unregister them on msi disable.
+
+Signed-off-by: Eric Auger <eric.auger@redhat.com>
+Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
+---
+ hw/vfio/common.c              | 59 +++++++++++++++++++++++++++
+ hw/vfio/pci.c                 | 76 ++++++++++++++++++++++++++++++++++-
+ hw/vfio/trace-events          |  2 +
+ include/hw/vfio/vfio-common.h | 12 ++++++
+ 4 files changed, 147 insertions(+), 2 deletions(-)
+
+diff --git a/hw/vfio/common.c b/hw/vfio/common.c
+index cc50efdbc1..db9af3b0e5 100644
+--- a/hw/vfio/common.c
++++ b/hw/vfio/common.c
+@@ -709,6 +709,65 @@ static void vfio_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
+     }
+ }
+ 
++int vfio_iommu_set_msi_binding(VFIOContainer *container, int n,
++                               IOMMUTLBEntry *iotlb)
++{
++    struct vfio_iommu_type1_set_msi_binding ustruct;
++    VFIOMSIBinding *binding;
++    int ret;
++
++    QLIST_FOREACH(binding, &container->msibinding_list, next) {
++        if (binding->index == n) {
++            return 0;
++        }
++    }
++
++    ustruct.argsz = sizeof(struct vfio_iommu_type1_set_msi_binding);
++    ustruct.iova = iotlb->iova;
++    ustruct.flags = VFIO_IOMMU_BIND_MSI;
++    ustruct.gpa = iotlb->translated_addr;
++    ustruct.size = iotlb->addr_mask + 1;
++    ret = ioctl(container->fd, VFIO_IOMMU_SET_MSI_BINDING , &ustruct);
++    if (ret) {
++        error_report("%s: failed to register the stage1 MSI binding (%m)",
++                     __func__);
++        return ret;
++    }
++    binding =  g_new0(VFIOMSIBinding, 1);
++    binding->iova = ustruct.iova;
++    binding->gpa = ustruct.gpa;
++    binding->size = ustruct.size;
++    binding->index = n;
++
++    QLIST_INSERT_HEAD(&container->msibinding_list, binding, next);
++    return 0;
++}
++
++int vfio_iommu_unset_msi_binding(VFIOContainer *container, int n)
++{
++    /* Zero the fields an unbind leaves unset (gpa, size): no stack garbage */
++    struct vfio_iommu_type1_set_msi_binding ustruct = {
++        .argsz = sizeof(ustruct), .flags = VFIO_IOMMU_UNBIND_MSI };
++    VFIOMSIBinding *binding, *tmp;
++    int ret;
++
++    QLIST_FOREACH_SAFE(binding, &container->msibinding_list, next, tmp) {
++        if (binding->index != n) {
++            continue;
++        }
++        ustruct.iova = binding->iova;
++        ret = ioctl(container->fd, VFIO_IOMMU_SET_MSI_BINDING, &ustruct);
++        if (ret) {
++            error_report("Failed to unregister the stage1 MSI binding "
++                         "for iova=0x%"PRIx64" (%m)", binding->iova);
++        }
++        QLIST_REMOVE(binding, next); /* dropped even if the ioctl failed */
++        g_free(binding);
++        return ret;
++    }
++    return 0; /* no binding recorded for vector n: nothing to undo */
++}
++
+ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
+ {
+     VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
+diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
+index 6c90ec9278..bbcba3fd16 100644
+--- a/hw/vfio/pci.c
++++ b/hw/vfio/pci.c
+@@ -360,6 +360,65 @@ static void vfio_msi_interrupt(void *opaque)
+     notify(&vdev->pdev, nr);
+ }
+ 
++static bool vfio_iommu_require_msi_binding(IOMMUMemoryRegion *iommu_mr)
++{
++    bool msi_translate = false, nested = false;
++
++    memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_MSI_TRANSLATE,
++                                 (void *)&msi_translate);
++    memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_VFIO_NESTED,
++                                 (void *)&nested);
++    if (!nested || !msi_translate) {
++        return false;
++    }
++   return true;
++}
++
++static int vfio_register_msi_binding(VFIOPCIDevice *vdev,
++                                     int vector_n, bool set)
++{
++    VFIOContainer *container = vdev->vbasedev.group->container;
++    PCIDevice *dev = &vdev->pdev;
++    AddressSpace *as = pci_device_iommu_address_space(dev);
++    IOMMUMemoryRegionClass *imrc;
++    IOMMUMemoryRegion *iommu_mr;
++    IOMMUTLBEntry entry;
++    MSIMessage msg;
++
++    if (as == &address_space_memory) {
++        return 0;
++    }
++
++    iommu_mr = IOMMU_MEMORY_REGION(as->root);
++    if (!vfio_iommu_require_msi_binding(iommu_mr)) {
++        return 0;
++    }
++
++    /* MSI doorbell address is translated by an IOMMU */
++
++    if (!set) { /* unregister */
++        trace_vfio_unregister_msi_binding(vdev->vbasedev.name, vector_n);
++
++        return vfio_iommu_unset_msi_binding(container, vector_n);
++    }
++
++    msg = pci_get_msi_message(dev, vector_n);
++    imrc = memory_region_get_iommu_class_nocheck(iommu_mr);
++
++    rcu_read_lock();
++    entry = imrc->translate(iommu_mr, msg.address, IOMMU_WO, 0);
++    rcu_read_unlock();
++
++    if (entry.perm == IOMMU_NONE) {
++        return -ENOENT;
++    }
++
++    trace_vfio_register_msi_binding(vdev->vbasedev.name, vector_n,
++                                    msg.address, entry.translated_addr);
++
++    return vfio_iommu_set_msi_binding(container, vector_n, &entry);
++}
++
+ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
+ {
+     struct vfio_irq_set *irq_set;
+@@ -377,7 +436,7 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
+     fds = (int32_t *)&irq_set->data;
+ 
+     for (i = 0; i < vdev->nr_vectors; i++) {
+-        int fd = -1;
++        int fd = -1; /* no local ret: "goto out" must return the outer one */
+ 
+         /*
+          * MSI vs MSI-X - The guest has direct access to MSI mask and pending
+@@ -386,6 +445,12 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
+          * KVM signaling path only when configured and unmasked.
+          */
+         if (vdev->msi_vectors[i].use) {
++            ret = vfio_register_msi_binding(vdev, i, true);
++            if (ret) {
++                error_report("%s failed to register S1 MSI binding "
++                             "for vector %d(%d)", vdev->vbasedev.name, i, ret);
++                goto out;
++            }
+             if (vdev->msi_vectors[i].virq < 0 ||
+                 (msix && msix_is_masked(&vdev->pdev, i))) {
+                 fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
+@@ -399,6 +464,7 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
+ 
+     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
+ 
++out:
+     g_free(irq_set);
+ 
+     return ret;
+@@ -712,7 +778,8 @@ static void vfio_msi_disable_common(VFIOPCIDevice *vdev)
+ 
+ static void vfio_msix_disable(VFIOPCIDevice *vdev)
+ {
+-    int i;
++    int ret, i;
++
+ 
+     msix_unset_vector_notifiers(&vdev->pdev);
+ 
+@@ -724,6 +791,11 @@ static void vfio_msix_disable(VFIOPCIDevice *vdev)
+         if (vdev->msi_vectors[i].use) {
+             vfio_msix_vector_release(&vdev->pdev, i);
+             msix_vector_unuse(&vdev->pdev, i);
++            ret = vfio_register_msi_binding(vdev, i, false);
++            if (ret) {
++                error_report("%s: failed to unregister S1 MSI binding "
++                             "for vector %d(%d)", vdev->vbasedev.name, i, ret);
++            }
+         }
+     }
+ 
+diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
+index ee9a67d3ef..247b72c1eb 100644
+--- a/hw/vfio/trace-events
++++ b/hw/vfio/trace-events
+@@ -120,6 +120,8 @@ vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype
+ vfio_dma_unmap_overflow_workaround(void) ""
+ vfio_iommu_addr_inv_iotlb(int asid, uint64_t addr, uint64_t size, uint64_t nb_granules, bool leaf) "nested IOTLB invalidate asid=%d, addr=0x%"PRIx64" granule_size=0x%"PRIx64" nb_granules=0x%"PRIx64" leaf=%d"
+ vfio_iommu_asid_inv_iotlb(int asid) "nested IOTLB invalidate asid=%d"
++vfio_register_msi_binding(const char *name, int vector, uint64_t giova, uint64_t gdb) "%s: register vector %d gIOVA=0x%"PRIx64 "-> gDB=0x%"PRIx64" stage 1 mapping"
++vfio_unregister_msi_binding(const char *name, int vector) "%s: unregister vector %d stage 1 mapping"
+ 
+ # platform.c
+ vfio_platform_base_device_init(char *name, int groupid) "%s belongs to group #%d"
+diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
+index 1277914ca8..b175158138 100644
+--- a/include/hw/vfio/vfio-common.h
++++ b/include/hw/vfio/vfio-common.h
+@@ -74,6 +74,14 @@ typedef struct VFIOAddressSpace {
+     QLIST_ENTRY(VFIOAddressSpace) list;
+ } VFIOAddressSpace;
+ 
++typedef struct VFIOMSIBinding {
++    int index;
++    hwaddr iova;
++    hwaddr gpa;
++    hwaddr size;
++    QLIST_ENTRY(VFIOMSIBinding) next;
++} VFIOMSIBinding;
++
+ struct VFIOGroup;
+ 
+ typedef struct VFIODMARange {
+@@ -101,6 +109,7 @@ typedef struct VFIOContainer {
+     QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
+     QLIST_HEAD(, VFIOGroup) group_list;
+     QLIST_HEAD(, VFIODMARange) dma_list;
++    QLIST_HEAD(, VFIOMSIBinding) msibinding_list;
+     QLIST_ENTRY(VFIOContainer) next;
+ } VFIOContainer;
+ 
+@@ -210,6 +219,9 @@ VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp);
+ void vfio_put_group(VFIOGroup *group);
+ int vfio_get_device(VFIOGroup *group, const char *name,
+                     VFIODevice *vbasedev, Error **errp);
++int vfio_iommu_set_msi_binding(VFIOContainer *container, int n,
++                               IOMMUTLBEntry *entry);
++int vfio_iommu_unset_msi_binding(VFIOContainer *container, int n);
+ 
+ extern const MemoryRegionOps vfio_region_ops;
+ typedef QLIST_HEAD(VFIOGroupList, VFIOGroup) VFIOGroupList;
+-- 
+2.27.0
+
-- 
Gitee


From f4fce522f08daabb78d0901723a565aa715b3c07 Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Thu, 20 Jun 2019 16:39:57 +0200
Subject: [PATCH 28/48] vfio: Helper to get IRQ info including capabilities

As done for vfio regions, add helpers to retrieve irq info
including their optional capabilities.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
---
 ...-get-IRQ-info-including-capabilities.patch | 178 ++++++++++++++++++
 1 file changed, 178 insertions(+)
 create mode 100644 vfio-Helper-to-get-IRQ-info-including-capabilities.patch

diff --git a/vfio-Helper-to-get-IRQ-info-including-capabilities.patch b/vfio-Helper-to-get-IRQ-info-including-capabilities.patch
new file mode 100644
index 0000000..16f16d3
--- /dev/null
+++ b/vfio-Helper-to-get-IRQ-info-including-capabilities.patch
@@ -0,0 +1,178 @@
+From 43fd039dcfee221eb3f86a2cf7deb287cc04e5ad Mon Sep 17 00:00:00 2001
+From: Eric Auger <eric.auger@redhat.com>
+Date: Thu, 20 Jun 2019 16:39:57 +0200
+Subject: [PATCH] vfio: Helper to get IRQ info including capabilities
+
+As done for vfio regions, add helpers to retrieve irq info
+including their optional capabilities.
+
+Signed-off-by: Eric Auger <eric.auger@redhat.com>
+Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
+---
+ hw/vfio/common.c              | 97 +++++++++++++++++++++++++++++++++++
+ hw/vfio/trace-events          |  1 +
+ include/hw/vfio/vfio-common.h |  7 +++
+ 3 files changed, 105 insertions(+)
+
+diff --git a/hw/vfio/common.c b/hw/vfio/common.c
+index db9af3b0e5..98dc9e6f84 100644
+--- a/hw/vfio/common.c
++++ b/hw/vfio/common.c
+@@ -1565,6 +1565,25 @@ vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id)
+     return NULL;
+ }
+ 
++struct vfio_info_cap_header *
++vfio_get_irq_info_cap(struct vfio_irq_info *info, uint16_t id)
++{
++    struct vfio_info_cap_header *hdr;
++    void *ptr = info;
++
++    if (!(info->flags & VFIO_IRQ_INFO_FLAG_CAPS)) {
++        return NULL;
++    }
++
++    for (hdr = ptr + info->cap_offset; hdr != ptr; hdr = ptr + hdr->next) {
++        if (hdr->id == id) {
++            return hdr;
++        }
++    }
++
++    return NULL;
++}
++
+ static int vfio_setup_region_sparse_mmaps(VFIORegion *region,
+                                           struct vfio_region_info *info)
+ {
+@@ -2499,6 +2518,33 @@ retry:
+     return 0;
+ }
+ 
++int vfio_get_irq_info(VFIODevice *vbasedev, int index,
++                      struct vfio_irq_info **info)
++{
++    size_t argsz = sizeof(struct vfio_irq_info);
++
++    *info = g_malloc0(argsz);
++
++    (*info)->index = index;
++retry:
++    (*info)->argsz = argsz;
++
++    if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_IRQ_INFO, *info)) {
++        g_free(*info);
++        *info = NULL;
++        return -errno;
++    }
++
++    if ((*info)->argsz > argsz) {
++        argsz = (*info)->argsz;
++        *info = g_realloc(*info, argsz);
++
++        goto retry;
++    }
++
++    return 0;
++}
++
+ int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type,
+                              uint32_t subtype, struct vfio_region_info **info)
+ {
+@@ -2534,6 +2580,42 @@ int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type,
+     return -ENODEV;
+ }
+ 
++int vfio_get_dev_irq_info(VFIODevice *vbasedev, uint32_t type,
++                          uint32_t subtype, struct vfio_irq_info **info)
++{
++    int i;
++
++    for (i = 0; i < vbasedev->num_irqs; i++) {
++        struct vfio_info_cap_header *hdr;
++        struct vfio_irq_info_cap_type *cap_type;
++
++        if (vfio_get_irq_info(vbasedev, i, info)) {
++            continue;
++        }
++
++        hdr = vfio_get_irq_info_cap(*info, VFIO_IRQ_INFO_CAP_TYPE);
++        if (!hdr) {
++            g_free(*info);
++            continue;
++        }
++
++        cap_type = container_of(hdr, struct vfio_irq_info_cap_type, header);
++
++        trace_vfio_get_dev_irq(vbasedev->name, i,
++                               cap_type->type, cap_type->subtype);
++
++        if (cap_type->type == type && cap_type->subtype == subtype) {
++            return 0;
++        }
++
++        g_free(*info);
++    }
++
++    *info = NULL;
++    return -ENODEV;
++}
++
++
+ bool vfio_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type)
+ {
+     struct vfio_region_info *info = NULL;
+@@ -2549,6 +2631,21 @@ bool vfio_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type)
+     return ret;
+ }
+ 
++/* Return true if IRQ index @irq of @vbasedev exposes capability @cap_type. */
++bool vfio_has_irq_cap(VFIODevice *vbasedev, int irq, uint16_t cap_type)
++{
++    struct vfio_irq_info *info = NULL;
++    bool ret = false;
++
++    /* Query the IRQ (not region) info; it is only allocated on success */
++    if (!vfio_get_irq_info(vbasedev, irq, &info)) {
++        ret = vfio_get_irq_info_cap(info, cap_type) != NULL;
++        g_free(info);
++    }
++
++    return ret;
++}
++
+ /*
+  * Interfaces for IBM EEH (Enhanced Error Handling)
+  */
+diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
+index 247b72c1eb..54e10046f5 100644
+--- a/hw/vfio/trace-events
++++ b/hw/vfio/trace-events
+@@ -117,6 +117,7 @@ vfio_region_unmap(const char *name, unsigned long offset, unsigned long end) "Re
+ vfio_region_sparse_mmap_header(const char *name, int index, int nr_areas) "Device %s region %d: %d sparse mmap entries"
+ vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned long end) "sparse entry %d [0x%lx - 0x%lx]"
+ vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%0x8"
++vfio_get_dev_irq(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%08x"
+ vfio_dma_unmap_overflow_workaround(void) ""
+ vfio_iommu_addr_inv_iotlb(int asid, uint64_t addr, uint64_t size, uint64_t nb_granules, bool leaf) "nested IOTLB invalidate asid=%d, addr=0x%"PRIx64" granule_size=0x%"PRIx64" nb_granules=0x%"PRIx64" leaf=%d"
+ vfio_iommu_asid_inv_iotlb(int asid) "nested IOTLB invalidate asid=%d"
+diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
+index b175158138..a82962ab16 100644
+--- a/include/hw/vfio/vfio-common.h
++++ b/include/hw/vfio/vfio-common.h
+@@ -238,6 +238,13 @@ int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type,
+ bool vfio_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type);
+ struct vfio_info_cap_header *
+ vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id);
++int vfio_get_irq_info(VFIODevice *vbasedev, int index,
++                      struct vfio_irq_info **info);
++int vfio_get_dev_irq_info(VFIODevice *vbasedev, uint32_t type,
++                          uint32_t subtype, struct vfio_irq_info **info);
++bool vfio_has_irq_cap(VFIODevice *vbasedev, int irq, uint16_t cap_type);
++struct vfio_info_cap_header *
++vfio_get_irq_info_cap(struct vfio_irq_info *info, uint16_t id);
+ #endif
+ extern const MemoryListener vfio_prereg_listener;
+ 
+-- 
+2.27.0
+
-- 
Gitee


From 98dfb30ca5c40eab69394c458dde3edd0cd2e3cf Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Thu, 13 Dec 2018 04:39:30 -0500
Subject: [PATCH 29/48] vfio/pci: Register handler for iommu fault

We use the new extended IRQ VFIO_IRQ_TYPE_NESTED type and
VFIO_IRQ_SUBTYPE_DMA_FAULT subtype to set/unset
a notifier for physical DMA faults. The associated eventfd is
triggered, in nested mode, whenever a fault is detected at IOMMU
physical level.

The actual handler will be implemented in subsequent patches.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
---
 ...pci-Register-handler-for-iommu-fault.patch | 168 ++++++++++++++++++
 1 file changed, 168 insertions(+)
 create mode 100644 vfio-pci-Register-handler-for-iommu-fault.patch

diff --git a/vfio-pci-Register-handler-for-iommu-fault.patch b/vfio-pci-Register-handler-for-iommu-fault.patch
new file mode 100644
index 0000000..feea0a3
--- /dev/null
+++ b/vfio-pci-Register-handler-for-iommu-fault.patch
@@ -0,0 +1,168 @@
+From 65b96da46d2c5dfdcf3a4618cf75ca94345164d7 Mon Sep 17 00:00:00 2001
+From: Eric Auger <eric.auger@redhat.com>
+Date: Thu, 13 Dec 2018 04:39:30 -0500
+Subject: [PATCH] vfio/pci: Register handler for iommu fault
+
+We use the new extended IRQ VFIO_IRQ_TYPE_NESTED type and
+VFIO_IRQ_SUBTYPE_DMA_FAULT subtype to set/unset
+a notifier for physical DMA faults. The associated eventfd is
+triggered, in nested mode, whenever a fault is detected at IOMMU
+physical level.
+
+The actual handler will be implemented in subsequent patches.
+
+Signed-off-by: Eric Auger <eric.auger@redhat.com>
+Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
+---
+ hw/vfio/pci.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++-
+ hw/vfio/pci.h |  7 +++++
+ 2 files changed, 87 insertions(+), 1 deletion(-)
+
+diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
+index bbcba3fd16..f5c05d508d 100644
+--- a/hw/vfio/pci.c
++++ b/hw/vfio/pci.c
+@@ -2857,6 +2857,76 @@ static PCIPASIDOps vfio_pci_pasid_ops = {
+     .set_pasid_table = vfio_iommu_set_pasid_table,
+ };
+ 
++static void vfio_dma_fault_notifier_handler(void *opaque)
++{
++    VFIOPCIExtIRQ *ext_irq = opaque;
++
++    if (!event_notifier_test_and_clear(&ext_irq->notifier)) {
++        return;
++    }
++}
++
++/* Wire the eventfd of extended IRQ @type/@subtype to @handler; 0 on success */
++static int vfio_register_ext_irq_handler(VFIOPCIDevice *vdev, uint32_t type,
++                                         uint32_t subtype, IOHandler *handler)
++{
++    int32_t fd, ext_irq_index, index;
++    struct vfio_irq_info *irq_info;
++    Error *err = NULL;
++    EventNotifier *n;
++    int ret;
++
++    ret = vfio_get_dev_irq_info(&vdev->vbasedev, type, subtype, &irq_info);
++    if (ret) {
++        return ret;
++    }
++    index = irq_info->index;
++    ext_irq_index = irq_info->index - VFIO_PCI_NUM_IRQS;
++    g_free(irq_info);
++
++    vdev->ext_irqs[ext_irq_index].vdev = vdev;
++    vdev->ext_irqs[ext_irq_index].index = index;
++    n = &vdev->ext_irqs[ext_irq_index].notifier;
++
++    ret = event_notifier_init(n, 0);
++    if (ret) {
++        error_report("vfio: Unable to init event notifier for ext irq %d(%d)",
++                     ext_irq_index, ret);
++        return ret;
++    }
++
++    /* Dispatch to the caller's @handler, with the ext IRQ slot as opaque */
++    fd = event_notifier_get_fd(n);
++    qemu_set_fd_handler(fd, handler, NULL, &vdev->ext_irqs[ext_irq_index]);
++
++    ret = vfio_set_irq_signaling(&vdev->vbasedev, index, 0,
++                                 VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err);
++    if (ret) {
++        error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
++        qemu_set_fd_handler(fd, NULL, NULL, vdev);
++        event_notifier_cleanup(n);
++    }
++    return ret;
++}
++
++static void vfio_unregister_ext_irq_notifiers(VFIOPCIDevice *vdev)
++{
++    int i;
++
++    /* Stop signaling of each extended IRQ, then release its notifier */
++    for (i = 0; i < vdev->vbasedev.num_irqs - VFIO_PCI_NUM_IRQS; i++) {
++        Error *err = NULL; /* reset per pass: error_reportf_err() frees it */
++        if (vfio_set_irq_signaling(&vdev->vbasedev, i + VFIO_PCI_NUM_IRQS, 0,
++                                   VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) {
++            error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
++        }
++        qemu_set_fd_handler(event_notifier_get_fd(&vdev->ext_irqs[i].notifier),
++                            NULL, NULL, vdev);
++        event_notifier_cleanup(&vdev->ext_irqs[i].notifier);
++    }
++    g_free(vdev->ext_irqs);
++}
++
+ static void vfio_realize(PCIDevice *pdev, Error **errp)
+ {
+     VFIOPCIDevice *vdev = PCI_VFIO(pdev);
+@@ -2867,7 +2937,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
+     ssize_t len;
+     struct stat st;
+     int groupid;
+-    int i, ret;
++    int i, ret, nb_ext_irqs;
+     bool is_mdev;
+ 
+     if (!vdev->vbasedev.sysfsdev) {
+@@ -2955,6 +3025,11 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
+         goto error;
+     }
+ 
++    nb_ext_irqs = vdev->vbasedev.num_irqs - VFIO_PCI_NUM_IRQS;
++    if (nb_ext_irqs > 0) {
++        vdev->ext_irqs = g_new0(VFIOPCIExtIRQ, nb_ext_irqs);
++    }
++
+     vfio_populate_device(vdev, &err);
+     if (err) {
+         error_propagate(errp, err);
+@@ -3161,6 +3236,9 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
+ 
+     vfio_register_err_notifier(vdev);
+     vfio_register_req_notifier(vdev);
++    vfio_register_ext_irq_handler(vdev, VFIO_IRQ_TYPE_NESTED,
++                                  VFIO_IRQ_SUBTYPE_DMA_FAULT,
++                                  vfio_dma_fault_notifier_handler);
+     vfio_setup_resetfn_quirk(vdev);
+ 
+     pci_setup_pasid_ops(pdev, &vfio_pci_pasid_ops);
+@@ -3201,6 +3279,7 @@ static void vfio_exitfn(PCIDevice *pdev)
+ 
+     vfio_unregister_req_notifier(vdev);
+     vfio_unregister_err_notifier(vdev);
++    vfio_unregister_ext_irq_notifiers(vdev);
+     pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
+     vfio_disable_interrupts(vdev);
+     if (vdev->intx.mmap_timer) {
+diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
+index 834a90d646..893d074375 100644
+--- a/hw/vfio/pci.h
++++ b/hw/vfio/pci.h
+@@ -113,6 +113,12 @@ typedef struct VFIOMSIXInfo {
+     unsigned long *pending;
+ } VFIOMSIXInfo;
+ 
++typedef struct VFIOPCIExtIRQ {
++    struct VFIOPCIDevice *vdev;
++    EventNotifier notifier;
++    uint32_t index;
++} VFIOPCIExtIRQ;
++
+ typedef struct VFIOPCIDevice {
+     PCIDevice pdev;
+     VFIODevice vbasedev;
+@@ -134,6 +140,7 @@ typedef struct VFIOPCIDevice {
+     PCIHostDeviceAddress host;
+     EventNotifier err_notifier;
+     EventNotifier req_notifier;
++    VFIOPCIExtIRQ *ext_irqs;
+     int (*resetfn)(struct VFIOPCIDevice *);
+     uint32_t vendor_id;
+     uint32_t device_id;
+-- 
+2.27.0
+
-- 
Gitee


From 58476d1b47961f0ec6abe15f16ec8c160d402522 Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Thu, 13 Dec 2018 10:57:53 -0500
Subject: [PATCH 30/48] vfio/pci: Set up the DMA FAULT region

Set up the fault region which is composed of the actual fault
queue (mmappable) and a header used to handle it. The fault
queue is mmapped.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
---
 vfio-pci-Set-up-the-DMA-FAULT-region.patch | 132 +++++++++++++++++++++
 1 file changed, 132 insertions(+)
 create mode 100644 vfio-pci-Set-up-the-DMA-FAULT-region.patch

diff --git a/vfio-pci-Set-up-the-DMA-FAULT-region.patch b/vfio-pci-Set-up-the-DMA-FAULT-region.patch
new file mode 100644
index 0000000..ae70a06
--- /dev/null
+++ b/vfio-pci-Set-up-the-DMA-FAULT-region.patch
@@ -0,0 +1,132 @@
+From e44d9cc377848f0a560b6d114561852e95fab557 Mon Sep 17 00:00:00 2001
+From: Eric Auger <eric.auger@redhat.com>
+Date: Thu, 13 Dec 2018 10:57:53 -0500
+Subject: [PATCH] vfio/pci: Set up the DMA FAULT region
+
+Set up the fault region which is composed of the actual fault
+queue (mmappable) and a header used to handle it. The fault
+queue is mmapped.
+
+Signed-off-by: Eric Auger <eric.auger@redhat.com>
+Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
+---
+ hw/vfio/pci.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++
+ hw/vfio/pci.h |  1 +
+ 2 files changed, 65 insertions(+)
+
+diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
+index f5c05d508d..0db7d68258 100644
+--- a/hw/vfio/pci.c
++++ b/hw/vfio/pci.c
+@@ -2607,11 +2607,67 @@ int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp)
+     return 0;
+ }
+ 
++static void vfio_init_fault_regions(VFIOPCIDevice *vdev, Error **errp)
++{
++    struct vfio_region_info *fault_region_info = NULL;
++    struct vfio_region_info_cap_fault *cap_fault;
++    VFIODevice *vbasedev = &vdev->vbasedev;
++    struct vfio_info_cap_header *hdr;
++    char *fault_region_name;
++    int ret;
++
++    ret = vfio_get_dev_region_info(&vdev->vbasedev,
++                                   VFIO_REGION_TYPE_NESTED,
++                                   VFIO_REGION_SUBTYPE_NESTED_DMA_FAULT,
++                                   &fault_region_info);
++    if (ret) {
++        goto out;
++    }
++
++    hdr = vfio_get_region_info_cap(fault_region_info,
++                                   VFIO_REGION_INFO_CAP_DMA_FAULT);
++    if (!hdr) {
++        error_setg(errp, "failed to retrieve DMA FAULT capability");
++        goto out;
++    }
++    cap_fault = container_of(hdr, struct vfio_region_info_cap_fault,
++                             header);
++    if (cap_fault->version != 1) {
++        error_setg(errp, "Unsupported DMA FAULT API version %d",
++                   cap_fault->version);
++        goto out;
++    }
++
++    fault_region_name = g_strdup_printf("%s DMA FAULT %d",
++                                        vbasedev->name,
++                                        fault_region_info->index);
++
++    ret = vfio_region_setup(OBJECT(vdev), vbasedev,
++                            &vdev->dma_fault_region,
++                            fault_region_info->index,
++                            fault_region_name);
++    g_free(fault_region_name);
++    if (ret) {
++        error_setg_errno(errp, -ret,
++                         "failed to set up the DMA FAULT region %d",
++                         fault_region_info->index);
++        goto out;
++    }
++
++    ret = vfio_region_mmap(&vdev->dma_fault_region);
++    if (ret) {
++        error_setg_errno(errp, -ret, "Failed to mmap the DMA FAULT queue");
++    }
++out:
++    g_free(fault_region_info);
++}
++
+ static void vfio_populate_device(VFIOPCIDevice *vdev, Error **errp)
+ {
+     VFIODevice *vbasedev = &vdev->vbasedev;
+     struct vfio_region_info *reg_info;
+     struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
++    Error *err = NULL;
+     int i, ret = -1;
+ 
+     /* Sanity check device */
+@@ -2675,6 +2731,12 @@ static void vfio_populate_device(VFIOPCIDevice *vdev, Error **errp)
+         }
+     }
+ 
++    vfio_init_fault_regions(vdev, &err);
++    if (err) {
++        error_propagate(errp, err);
++        return;
++    }
++
+     irq_info.index = VFIO_PCI_ERR_IRQ_INDEX;
+ 
+     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
+@@ -3260,6 +3322,7 @@ static void vfio_instance_finalize(Object *obj)
+ 
+     vfio_display_finalize(vdev);
+     vfio_bars_finalize(vdev);
++    vfio_region_finalize(&vdev->dma_fault_region);
+     g_free(vdev->emulated_config_bits);
+     g_free(vdev->rom);
+     /*
+@@ -3280,6 +3343,7 @@ static void vfio_exitfn(PCIDevice *pdev)
+     vfio_unregister_req_notifier(vdev);
+     vfio_unregister_err_notifier(vdev);
+     vfio_unregister_ext_irq_notifiers(vdev);
++    vfio_region_exit(&vdev->dma_fault_region);
+     pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
+     vfio_disable_interrupts(vdev);
+     if (vdev->intx.mmap_timer) {
+diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
+index 893d074375..815154656c 100644
+--- a/hw/vfio/pci.h
++++ b/hw/vfio/pci.h
+@@ -141,6 +141,7 @@ typedef struct VFIOPCIDevice {
+     EventNotifier err_notifier;
+     EventNotifier req_notifier;
+     VFIOPCIExtIRQ *ext_irqs;
++    VFIORegion dma_fault_region;
+     int (*resetfn)(struct VFIOPCIDevice *);
+     uint32_t vendor_id;
+     uint32_t device_id;
+-- 
+2.27.0
+
-- 
Gitee


From 93c6df7e5a9c5dcfe55cfbc4a4c9cc3d428ed901 Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Tue, 5 Mar 2019 16:35:32 +0100
Subject: [PATCH 31/48] vfio/pci: Implement the DMA fault handler

Whenever the eventfd is triggered, we retrieve the DMA fault(s)
from the mmapped fault region and inject them in the iommu
memory region.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
---
 ...-pci-Implement-the-DMA-fault-handler.patch | 96 +++++++++++++++++++
 1 file changed, 96 insertions(+)
 create mode 100644 vfio-pci-Implement-the-DMA-fault-handler.patch

diff --git a/vfio-pci-Implement-the-DMA-fault-handler.patch b/vfio-pci-Implement-the-DMA-fault-handler.patch
new file mode 100644
index 0000000..ca61b01
--- /dev/null
+++ b/vfio-pci-Implement-the-DMA-fault-handler.patch
@@ -0,0 +1,96 @@
+From 139d0b3474c29427fea4a0ed47f51c01a76a8636 Mon Sep 17 00:00:00 2001
+From: Eric Auger <eric.auger@redhat.com>
+Date: Tue, 5 Mar 2019 16:35:32 +0100
+Subject: [PATCH] vfio/pci: Implement the DMA fault handler
+
+Whenever the eventfd is triggered, we retrieve the DMA fault(s)
+from the mmapped fault region and inject them in the iommu
+memory region.
+
+Signed-off-by: Eric Auger <eric.auger@redhat.com>
+Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
+---
+ hw/vfio/pci.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
+ hw/vfio/pci.h |  1 +
+ 2 files changed, 51 insertions(+)
+
+diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
+index 0db7d68258..d1198c8a23 100644
+--- a/hw/vfio/pci.c
++++ b/hw/vfio/pci.c
+@@ -2922,10 +2922,60 @@ static PCIPASIDOps vfio_pci_pasid_ops = {
+ static void vfio_dma_fault_notifier_handler(void *opaque)
+ {
+     VFIOPCIExtIRQ *ext_irq = opaque;
++    VFIOPCIDevice *vdev = ext_irq->vdev;
++    PCIDevice *pdev = &vdev->pdev;
++    AddressSpace *as = pci_device_iommu_address_space(pdev);
++    IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(as->root);
++    struct vfio_region_dma_fault header;
++    struct iommu_fault *queue;
++    char *queue_buffer = NULL;
++    ssize_t bytes;
+ 
+     if (!event_notifier_test_and_clear(&ext_irq->notifier)) {
+         return;
+     }
++
++    bytes = pread(vdev->vbasedev.fd, &header, sizeof(header),
++                  vdev->dma_fault_region.fd_offset);
++    if (bytes != sizeof(header)) {
++        error_report("%s unable to read the fault region header (0x%lx)",
++                     __func__, bytes);
++        return;
++    }
++
++    /* Normally the fault queue is mmapped */
++    queue = (struct iommu_fault *)vdev->dma_fault_region.mmaps[0].mmap;
++    if (!queue) {
++        size_t queue_size = header.nb_entries * header.entry_size;
++
++        error_report("%s: fault queue not mmapped: slower fault handling",
++                     vdev->vbasedev.name);
++
++        queue_buffer = g_malloc(queue_size);
++        bytes =  pread(vdev->vbasedev.fd, queue_buffer, queue_size,
++                       vdev->dma_fault_region.fd_offset + header.offset);
++        if (bytes != queue_size) {
++            error_report("%s unable to read the fault queue (0x%lx)",
++                         __func__, bytes);
++            return;
++        }
++
++        queue = (struct iommu_fault *)queue_buffer;
++    }
++
++    while (vdev->fault_tail_index != header.head) {
++        memory_region_inject_faults(iommu_mr, 1,
++                                    &queue[vdev->fault_tail_index]);
++        vdev->fault_tail_index =
++            (vdev->fault_tail_index + 1) % header.nb_entries;
++    }
++    bytes = pwrite(vdev->vbasedev.fd, &vdev->fault_tail_index, 4,
++                   vdev->dma_fault_region.fd_offset);
++    if (bytes != 4) {
++        error_report("%s unable to write the fault region tail index (0x%lx)",
++                     __func__, bytes);
++    }
++    g_free(queue_buffer);
+ }
+ 
+ static int vfio_register_ext_irq_handler(VFIOPCIDevice *vdev,
+diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
+index 815154656c..e31bc0173a 100644
+--- a/hw/vfio/pci.h
++++ b/hw/vfio/pci.h
+@@ -142,6 +142,7 @@ typedef struct VFIOPCIDevice {
+     EventNotifier req_notifier;
+     VFIOPCIExtIRQ *ext_irqs;
+     VFIORegion dma_fault_region;
++    uint32_t fault_tail_index;
+     int (*resetfn)(struct VFIOPCIDevice *);
+     uint32_t vendor_id;
+     uint32_t device_id;
+-- 
+2.27.0
+
-- 
Gitee


From a782347b0388b61f59f33323838115bf857caf69 Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Tue, 28 Aug 2018 09:21:53 -0400
Subject: [PATCH 32/48] hw/arm/smmuv3: Advertise MSI_TRANSLATE attribute

The SMMUv3 has the peculiarity of translating MSI
transactions. Let's advertise the corresponding
attribute.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
---
 ...v3-Advertise-MSI_TRANSLATE-attribute.patch | 32 +++++++++++++++++++
 1 file changed, 32 insertions(+)
 create mode 100644 hw-arm-smmuv3-Advertise-MSI_TRANSLATE-attribute.patch

diff --git a/hw-arm-smmuv3-Advertise-MSI_TRANSLATE-attribute.patch b/hw-arm-smmuv3-Advertise-MSI_TRANSLATE-attribute.patch
new file mode 100644
index 0000000..89f9292
--- /dev/null
+++ b/hw-arm-smmuv3-Advertise-MSI_TRANSLATE-attribute.patch
@@ -0,0 +1,32 @@
+From bc602a4d1355774a0a44e8fbf6dd842049dd63f3 Mon Sep 17 00:00:00 2001
+From: Eric Auger <eric.auger@redhat.com>
+Date: Tue, 28 Aug 2018 09:21:53 -0400
+Subject: [PATCH] hw/arm/smmuv3: Advertise MSI_TRANSLATE attribute
+
+The SMMUv3 has the peculiarity of translating MSI
+transactions. Let's advertise the corresponding
+attribute.
+
+Signed-off-by: Eric Auger <eric.auger@redhat.com>
+Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
+---
+ hw/arm/smmuv3.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
+index 55eed5189e..83d59b6d28 100644
+--- a/hw/arm/smmuv3.c
++++ b/hw/arm/smmuv3.c
+@@ -1538,6 +1538,9 @@ static int smmuv3_get_attr(IOMMUMemoryRegion *iommu,
+     if (attr == IOMMU_ATTR_VFIO_NESTED) {
+         *(bool *) data = true;
+         return 0;
++    } else if (attr == IOMMU_ATTR_MSI_TRANSLATE) {
++        *(bool *) data = true;
++        return 0;
+     }
+     return -EINVAL;
+ }
+-- 
+2.27.0
+
-- 
Gitee


From cf3014ddc1032fdb8eca091ded0ff9e42523a9fb Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Thu, 9 Aug 2018 20:56:44 +0200
Subject: [PATCH 33/48] hw/arm/smmuv3: Store the PASID table GPA in the
 translation config

For VFIO integration we will need to pass the Context Descriptor (CD)
table GPA to the host. The CD table is also referred to as the PASID
table. Its GPA corresponds to the s1ctrptr field of the Stream Table
Entry. So let's decode and store it in the configuration structure.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
---
 ...ore-the-PASID-table-GPA-in-the-trans.patch | 45 +++++++++++++++++++
 1 file changed, 45 insertions(+)
 create mode 100644 hw-arm-smmuv3-Store-the-PASID-table-GPA-in-the-trans.patch

diff --git a/hw-arm-smmuv3-Store-the-PASID-table-GPA-in-the-trans.patch b/hw-arm-smmuv3-Store-the-PASID-table-GPA-in-the-trans.patch
new file mode 100644
index 0000000..8ed3590
--- /dev/null
+++ b/hw-arm-smmuv3-Store-the-PASID-table-GPA-in-the-trans.patch
@@ -0,0 +1,45 @@
+From 6fc85d8a6022d94ffec4cc118472cde583706bfb Mon Sep 17 00:00:00 2001
+From: Eric Auger <eric.auger@redhat.com>
+Date: Thu, 9 Aug 2018 20:56:44 +0200
+Subject: [PATCH] hw/arm/smmuv3: Store the PASID table GPA in the translation
+ config
+
+For VFIO integration we will need to pass the Context Descriptor (CD)
+table GPA to the host. The CD table is also referred to as the PASID
+table. Its GPA corresponds to the s1ctrptr field of the Stream Table
+Entry. So let's decode and store it in the configuration structure.
+
+Signed-off-by: Eric Auger <eric.auger@redhat.com>
+Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
+---
+ hw/arm/smmuv3.c              | 1 +
+ include/hw/arm/smmu-common.h | 1 +
+ 2 files changed, 2 insertions(+)
+
+diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
+index 83d59b6d28..f8e721f949 100644
+--- a/hw/arm/smmuv3.c
++++ b/hw/arm/smmuv3.c
+@@ -352,6 +352,7 @@ static int decode_ste(SMMUv3State *s, SMMUTransCfg *cfg,
+                       "SMMUv3 S1 stalling fault model not allowed yet\n");
+         goto bad_ste;
+     }
++    cfg->s1ctxptr = STE_CTXPTR(ste);
+     return 0;
+ 
+ bad_ste:
+diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h
+index 1f37844e5c..353668f4ea 100644
+--- a/include/hw/arm/smmu-common.h
++++ b/include/hw/arm/smmu-common.h
+@@ -68,6 +68,7 @@ typedef struct SMMUTransCfg {
+     uint8_t tbi;               /* Top Byte Ignore */
+     uint16_t asid;
+     SMMUTransTableInfo tt[2];
++    dma_addr_t s1ctxptr;
+     uint32_t iotlb_hits;       /* counts IOTLB hits for this asid */
+     uint32_t iotlb_misses;     /* counts IOTLB misses for this asid */
+ } SMMUTransCfg;
+-- 
+2.27.0
+
-- 
Gitee


From 4b4532b880cd29a4b404c375d77f436b1a513821 Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Tue, 4 Sep 2018 08:48:33 -0400
Subject: [PATCH 34/48] hw/arm/smmuv3: Fill the IOTLBEntry arch_id on NH_VA
 invalidation

When the guest invalidates one S1 entry, it passes the asid.
When propagating this invalidation down to the host, the asid
information also must be passed. So let's fill the arch_id field
introduced for that purpose and accordingly set the flags to
indicate its presence.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
---
 ...ll-the-IOTLBEntry-arch_id-on-NH_VA-i.patch | 34 +++++++++++++++++++
 1 file changed, 34 insertions(+)
 create mode 100644 hw-arm-smmuv3-Fill-the-IOTLBEntry-arch_id-on-NH_VA-i.patch

diff --git a/hw-arm-smmuv3-Fill-the-IOTLBEntry-arch_id-on-NH_VA-i.patch b/hw-arm-smmuv3-Fill-the-IOTLBEntry-arch_id-on-NH_VA-i.patch
new file mode 100644
index 0000000..1f3425e
--- /dev/null
+++ b/hw-arm-smmuv3-Fill-the-IOTLBEntry-arch_id-on-NH_VA-i.patch
@@ -0,0 +1,34 @@
+From 8108317641b3cb378bf1862dc3c0a73d1e0976ce Mon Sep 17 00:00:00 2001
+From: Eric Auger <eric.auger@redhat.com>
+Date: Tue, 4 Sep 2018 08:48:33 -0400
+Subject: [PATCH] hw/arm/smmuv3: Fill the IOTLBEntry arch_id on NH_VA
+ invalidation
+
+When the guest invalidates one S1 entry, it passes the asid.
+When propagating this invalidation down to the host, the asid
+information also must be passed. So let's fill the arch_id field
+introduced for that purpose and accordingly set the flags to
+indicate its presence.
+
+Signed-off-by: Eric Auger <eric.auger@redhat.com>
+Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
+---
+ hw/arm/smmuv3.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
+index f8e721f949..c6b950af35 100644
+--- a/hw/arm/smmuv3.c
++++ b/hw/arm/smmuv3.c
+@@ -824,6 +824,8 @@ static void smmuv3_notify_iova(IOMMUMemoryRegion *mr,
+     entry.iova = iova;
+     entry.addr_mask = (1 << tt->granule_sz) - 1;
+     entry.perm = IOMMU_NONE;
++    entry.flags = IOMMU_INV_FLAGS_ARCHID;
++    entry.arch_id = asid;
+ 
+     memory_region_notify_one(n, &entry);
+ }
+-- 
+2.27.0
+
-- 
Gitee


From bcfe3f1d19cad963b9933c10861a9db4f08386fb Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Thu, 14 Mar 2019 09:55:13 -0400
Subject: [PATCH 35/48] hw/arm/smmuv3: Fill the IOTLBEntry leaf field on NH_VA
 invalidation

Let's propagate the leaf attribute throughout the invalidation path.
This hint is used to reduce the scope of the invalidations to the
last level of translation. Not enforcing it induces large performance
penalties in nested mode.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
---
 ...ll-the-IOTLBEntry-leaf-field-on-NH_V.patch | 81 +++++++++++++++++++
 1 file changed, 81 insertions(+)
 create mode 100644 hw-arm-smmuv3-Fill-the-IOTLBEntry-leaf-field-on-NH_V.patch

diff --git a/hw-arm-smmuv3-Fill-the-IOTLBEntry-leaf-field-on-NH_V.patch b/hw-arm-smmuv3-Fill-the-IOTLBEntry-leaf-field-on-NH_V.patch
new file mode 100644
index 0000000..febaffa
--- /dev/null
+++ b/hw-arm-smmuv3-Fill-the-IOTLBEntry-leaf-field-on-NH_V.patch
@@ -0,0 +1,81 @@
+From 6393ad5c1ba6a04b038d80ecc1e663ad91ed0d21 Mon Sep 17 00:00:00 2001
+From: Eric Auger <eric.auger@redhat.com>
+Date: Thu, 14 Mar 2019 09:55:13 -0400
+Subject: [PATCH] hw/arm/smmuv3: Fill the IOTLBEntry leaf field on NH_VA
+ invalidation
+
+Let's propagate the leaf attribute throughout the invalidation path.
+This hint is used to reduce the scope of the invalidations to the
+last level of translation. Not enforcing it induces large performance
+penalties in nested mode.
+
+Signed-off-by: Eric Auger <eric.auger@redhat.com>
+Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
+---
+ hw/arm/smmuv3.c | 13 ++++++++-----
+ 1 file changed, 8 insertions(+), 5 deletions(-)
+
+diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
+index c6b950af35..c1caa6bc3a 100644
+--- a/hw/arm/smmuv3.c
++++ b/hw/arm/smmuv3.c
+@@ -795,7 +795,7 @@ epilogue:
+ static void smmuv3_notify_iova(IOMMUMemoryRegion *mr,
+                                IOMMUNotifier *n,
+                                int asid,
+-                               dma_addr_t iova)
++                               dma_addr_t iova, bool leaf)
+ {
+     SMMUDevice *sdev = container_of(mr, SMMUDevice, iommu);
+     SMMUEventInfo event = {};
+@@ -826,6 +826,7 @@ static void smmuv3_notify_iova(IOMMUMemoryRegion *mr,
+     entry.perm = IOMMU_NONE;
+     entry.flags = IOMMU_INV_FLAGS_ARCHID;
+     entry.arch_id = asid;
++    entry.leaf = leaf;
+ 
+     memory_region_notify_one(n, &entry);
+ }
+@@ -854,7 +855,8 @@ static void smmuv3_notify_asid(IOMMUMemoryRegion *mr,
+ }
+ 
+ /* invalidate an asid/iova tuple in all mr's */
+-static void smmuv3_inv_notifiers_iova(SMMUState *s, int asid, dma_addr_t iova)
++static void smmuv3_inv_notifiers_iova(SMMUState *s, int asid, dma_addr_t iova,
++                                      bool leaf)
+ {
+     SMMUDevice *sdev;
+ 
+@@ -865,7 +867,7 @@ static void smmuv3_inv_notifiers_iova(SMMUState *s, int asid, dma_addr_t iova)
+         trace_smmuv3_inv_notifiers_iova(mr->parent_obj.name, asid, iova);
+ 
+         IOMMU_NOTIFIER_FOREACH(n, mr) {
+-            smmuv3_notify_iova(mr, n, asid, iova);
++            smmuv3_notify_iova(mr, n, asid, iova, leaf);
+         }
+     }
+ }
+@@ -1018,9 +1020,10 @@ static int smmuv3_cmdq_consume(SMMUv3State *s)
+         {
+             dma_addr_t addr = CMD_ADDR(&cmd);
+             uint16_t vmid = CMD_VMID(&cmd);
++            bool leaf = CMD_LEAF(&cmd);
+ 
+             trace_smmuv3_cmdq_tlbi_nh_vaa(vmid, addr);
+-            smmuv3_inv_notifiers_iova(bs, -1, addr);
++            smmuv3_inv_notifiers_iova(bs, -1, addr, leaf);
+             smmu_iotlb_inv_all(bs);
+             break;
+         }
+@@ -1032,7 +1035,7 @@ static int smmuv3_cmdq_consume(SMMUv3State *s)
+             bool leaf = CMD_LEAF(&cmd);
+ 
+             trace_smmuv3_cmdq_tlbi_nh_va(vmid, asid, addr, leaf);
+-            smmuv3_inv_notifiers_iova(bs, asid, addr);
++            smmuv3_inv_notifiers_iova(bs, asid, addr, leaf);
+             smmu_iotlb_inv_iova(bs, asid, addr);
+             break;
+         }
+-- 
+2.27.0
+
-- 
Gitee


From 1e1a34cb306d009b8eadbdec6878dfca4ad95c88 Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Thu, 9 Aug 2018 21:04:19 +0200
Subject: [PATCH 36/48] hw/arm/smmuv3: Pass stage 1 configurations to the host

In case PASID PciOps are set for the device we call
the set_pasid_table() callback on each STE update.

This allows to pass the guest stage 1 configuration
to the host and apply it at physical level.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
---
 ...ss-stage-1-configurations-to-the-hos.patch | 147 ++++++++++++++++++
 1 file changed, 147 insertions(+)
 create mode 100644 hw-arm-smmuv3-Pass-stage-1-configurations-to-the-hos.patch

diff --git a/hw-arm-smmuv3-Pass-stage-1-configurations-to-the-hos.patch b/hw-arm-smmuv3-Pass-stage-1-configurations-to-the-hos.patch
new file mode 100644
index 0000000..a615b86
--- /dev/null
+++ b/hw-arm-smmuv3-Pass-stage-1-configurations-to-the-hos.patch
@@ -0,0 +1,147 @@
+From d0a1ce3c46246b6ef5510ac1d5c18308417ed525 Mon Sep 17 00:00:00 2001
+From: Eric Auger <eric.auger@redhat.com>
+Date: Thu, 9 Aug 2018 21:04:19 +0200
+Subject: [PATCH] hw/arm/smmuv3: Pass stage 1 configurations to the host
+
+In case PASID PciOps are set for the device we call
+the set_pasid_table() callback on each STE update.
+
+This allows to pass the guest stage 1 configuration
+to the host and apply it at physical level.
+
+Signed-off-by: Eric Auger <eric.auger@redhat.com>
+Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
+---
+ hw/arm/smmuv3.c     | 77 +++++++++++++++++++++++++++++++++++----------
+ hw/arm/trace-events |  2 +-
+ 2 files changed, 61 insertions(+), 18 deletions(-)
+
+diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
+index c1caa6bc3a..3d2151857d 100644
+--- a/hw/arm/smmuv3.c
++++ b/hw/arm/smmuv3.c
+@@ -16,6 +16,10 @@
+  * with this program; if not, see <http://www.gnu.org/licenses/>.
+  */
+ 
++#ifdef __linux__
++#include "linux/iommu.h"
++#endif
++
+ #include "qemu/osdep.h"
+ #include "hw/boards.h"
+ #include "sysemu/sysemu.h"
+@@ -872,6 +876,60 @@ static void smmuv3_inv_notifiers_iova(SMMUState *s, int asid, dma_addr_t iova,
+     }
+ }
+ 
++static void smmuv3_notify_config_change(SMMUState *bs, uint32_t sid)
++{
++#ifdef __linux__
++    IOMMUMemoryRegion *mr = smmu_iommu_mr(bs, sid);
++    SMMUEventInfo event = {.type = SMMU_EVT_NONE, .sid = sid};
++    IOMMUConfig iommu_config = {};
++    SMMUTransCfg *cfg;
++    SMMUDevice *sdev;
++
++    if (!mr) {
++        return;
++    }
++
++    sdev = container_of(mr, SMMUDevice, iommu);
++
++    /* flush QEMU config cache */
++    smmuv3_flush_config(sdev);
++
++    if (!pci_device_is_pasid_ops_set(sdev->bus, sdev->devfn)) {
++        return;
++    }
++
++    cfg = smmuv3_get_config(sdev, &event);
++
++    if (!cfg) {
++        return;
++    }
++
++    iommu_config.pasid_cfg.argsz = sizeof(struct iommu_pasid_table_config);
++    iommu_config.pasid_cfg.version = PASID_TABLE_CFG_VERSION_1;
++    iommu_config.pasid_cfg.format = IOMMU_PASID_FORMAT_SMMUV3;
++    iommu_config.pasid_cfg.base_ptr = cfg->s1ctxptr;
++    iommu_config.pasid_cfg.pasid_bits = 0;
++    iommu_config.pasid_cfg.vendor_data.smmuv3.version = PASID_TABLE_SMMUV3_CFG_VERSION_1;
++
++    if (cfg->disabled || cfg->bypassed) {
++        iommu_config.pasid_cfg.config = IOMMU_PASID_CONFIG_BYPASS;
++    } else if (cfg->aborted) {
++        iommu_config.pasid_cfg.config = IOMMU_PASID_CONFIG_ABORT;
++    } else {
++        iommu_config.pasid_cfg.config = IOMMU_PASID_CONFIG_TRANSLATE;
++    }
++
++    trace_smmuv3_notify_config_change(mr->parent_obj.name,
++                                      iommu_config.pasid_cfg.config,
++                                      iommu_config.pasid_cfg.base_ptr);
++
++    if (pci_device_set_pasid_table(sdev->bus, sdev->devfn, &iommu_config)) {
++        error_report("Failed to pass PASID table to host for iommu mr %s (%m)",
++                     mr->parent_obj.name);
++    }
++#endif
++}
++
+ static void smmuv3_s1_asid_inval(SMMUState *s, uint16_t asid)
+ {
+     SMMUDevice *sdev;
+@@ -938,22 +996,14 @@ static int smmuv3_cmdq_consume(SMMUv3State *s)
+         case SMMU_CMD_CFGI_STE:
+         {
+             uint32_t sid = CMD_SID(&cmd);
+-            IOMMUMemoryRegion *mr = smmu_iommu_mr(bs, sid);
+-            SMMUDevice *sdev;
+ 
+             if (CMD_SSEC(&cmd)) {
+                 cmd_error = SMMU_CERROR_ILL;
+                 break;
+             }
+ 
+-            if (!mr) {
+-                break;
+-            }
+-
+             trace_smmuv3_cmdq_cfgi_ste(sid);
+-            sdev = container_of(mr, SMMUDevice, iommu);
+-            smmuv3_flush_config(sdev);
+-
++            smmuv3_notify_config_change(bs, sid);
+             break;
+         }
+         case SMMU_CMD_CFGI_STE_RANGE: /* same as SMMU_CMD_CFGI_ALL */
+@@ -970,14 +1020,7 @@ static int smmuv3_cmdq_consume(SMMUv3State *s)
+             trace_smmuv3_cmdq_cfgi_ste_range(start, end);
+ 
+             for (i = start; i <= end; i++) {
+-                IOMMUMemoryRegion *mr = smmu_iommu_mr(bs, i);
+-                SMMUDevice *sdev;
+-
+-                if (!mr) {
+-                    continue;
+-                }
+-                sdev = container_of(mr, SMMUDevice, iommu);
+-                smmuv3_flush_config(sdev);
++                 smmuv3_notify_config_change(bs, i);
+             }
+             break;
+         }
+diff --git a/hw/arm/trace-events b/hw/arm/trace-events
+index 4512d20115..cbbe2ccafd 100644
+--- a/hw/arm/trace-events
++++ b/hw/arm/trace-events
+@@ -53,4 +53,4 @@ smmuv3_config_cache_inv(uint32_t sid) "Config cache INV for sid %d"
+ smmuv3_notify_flag_add(const char *iommu) "ADD SMMUNotifier node for iommu mr=%s"
+ smmuv3_notify_flag_del(const char *iommu) "DEL SMMUNotifier node for iommu mr=%s"
+ smmuv3_inv_notifiers_iova(const char *name, uint16_t asid, uint64_t iova) "iommu mr=%s asid=%d iova=0x%"PRIx64
+-
++smmuv3_notify_config_change(const char *name, uint8_t config, uint64_t s1ctxptr) "iommu mr=%s config=%d s1ctxptr=0x%"PRIx64
+-- 
+2.27.0
+
-- 
Gitee


From b81289b550f1e460bc1678bfcd6ee05fee4d2efb Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Thu, 13 Sep 2018 14:24:45 +0200
Subject: [PATCH 37/48] hw/arm/smmuv3: Implement fault injection

We convert iommu_fault structs received from the kernel
into the data struct used by the emulation code and record
the events into the virtual event queue.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
---
 hw-arm-smmuv3-Implement-fault-injection.patch | 107 ++++++++++++++++++
 1 file changed, 107 insertions(+)
 create mode 100644 hw-arm-smmuv3-Implement-fault-injection.patch

diff --git a/hw-arm-smmuv3-Implement-fault-injection.patch b/hw-arm-smmuv3-Implement-fault-injection.patch
new file mode 100644
index 0000000..0260e28
--- /dev/null
+++ b/hw-arm-smmuv3-Implement-fault-injection.patch
@@ -0,0 +1,107 @@
+From 55bfd18b7671c82705d83d543281add0afcda31f Mon Sep 17 00:00:00 2001
+From: Eric Auger <eric.auger@redhat.com>
+Date: Thu, 13 Sep 2018 14:24:45 +0200
+Subject: [PATCH] hw/arm/smmuv3: Implement fault injection
+
+We convert iommu_fault structs received from the kernel
+into the data struct used by the emulation code and record
+the events into the virtual event queue.
+
+Signed-off-by: Eric Auger <eric.auger@redhat.com>
+Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
+---
+ hw/arm/smmuv3.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 71 insertions(+)
+
+diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
+index 3d2151857d..931d6eae57 100644
+--- a/hw/arm/smmuv3.c
++++ b/hw/arm/smmuv3.c
+@@ -1594,6 +1594,76 @@ static int smmuv3_get_attr(IOMMUMemoryRegion *iommu,
+     return -EINVAL;
+ }
+ 
++struct iommu_fault;
++
++static inline int
++smmuv3_inject_faults(IOMMUMemoryRegion *iommu_mr, int count,
++                     struct iommu_fault *buf)
++{
++#ifdef __linux__
++    SMMUDevice *sdev = container_of(iommu_mr, SMMUDevice, iommu);
++    SMMUv3State *s3 = sdev->smmu;
++    uint32_t sid = smmu_get_sid(sdev);
++    int i;
++
++    for (i = 0; i < count; i++) {
++        SMMUEventInfo info = {};
++        struct iommu_fault_unrecoverable *record;
++
++        if (buf[i].type != IOMMU_FAULT_DMA_UNRECOV) {
++            continue;
++        }
++
++        info.sid = sid;
++        record = &buf[i].event;
++
++        switch (record->reason) {
++        case IOMMU_FAULT_REASON_PASID_INVALID:
++            info.type = SMMU_EVT_C_BAD_SUBSTREAMID;
++            /* TODO further fill info.u.c_bad_substream */
++            break;
++        case IOMMU_FAULT_REASON_PASID_FETCH:
++            info.type = SMMU_EVT_F_CD_FETCH;
++            break;
++        case IOMMU_FAULT_REASON_BAD_PASID_ENTRY:
++            info.type = SMMU_EVT_C_BAD_CD;
++            /* TODO further fill info.u.c_bad_cd */
++            break;
++        case IOMMU_FAULT_REASON_WALK_EABT:
++            info.type = SMMU_EVT_F_WALK_EABT;
++            info.u.f_walk_eabt.addr = record->addr;
++            info.u.f_walk_eabt.addr2 = record->fetch_addr;
++            break;
++        case IOMMU_FAULT_REASON_PTE_FETCH:
++            info.type = SMMU_EVT_F_TRANSLATION;
++            info.u.f_translation.addr = record->addr;
++            break;
++        case IOMMU_FAULT_REASON_OOR_ADDRESS:
++            info.type = SMMU_EVT_F_ADDR_SIZE;
++            info.u.f_addr_size.addr = record->addr;
++            break;
++        case IOMMU_FAULT_REASON_ACCESS:
++            info.type = SMMU_EVT_F_ACCESS;
++            info.u.f_access.addr = record->addr;
++            break;
++        case IOMMU_FAULT_REASON_PERMISSION:
++            info.type = SMMU_EVT_F_PERMISSION;
++            info.u.f_permission.addr = record->addr;
++            break;
++        default:
++            warn_report("%s Unexpected fault reason received from host: %d",
++                        __func__, record->reason);
++            continue;
++        }
++
++        smmuv3_record_event(s3, &info);
++    }
++    return 0;
++#else
++    return -1;
++#endif
++}
++
+ static void smmuv3_iommu_memory_region_class_init(ObjectClass *klass,
+                                                   void *data)
+ {
+@@ -1602,6 +1672,7 @@ static void smmuv3_iommu_memory_region_class_init(ObjectClass *klass,
+     imrc->translate = smmuv3_translate;
+     imrc->notify_flag_changed = smmuv3_notify_flag_changed;
+     imrc->get_attr = smmuv3_get_attr;
++    imrc->inject_faults = smmuv3_inject_faults;
+ }
+ 
+ static const TypeInfo smmuv3_type_info = {
+-- 
+2.27.0
+
-- 
Gitee


From c2ecdaca135466c27021abe71f2b5d888022b19c Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Wed, 18 Mar 2020 11:17:36 +0100
Subject: [PATCH 38/48] hw/arm/smmuv3: Allow MAP notifiers

We now have all bricks to support nested paging. This
uses MAP notifiers to map the MSIs. So let's allow MAP
notifiers to be registered.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
---
 hw-arm-smmuv3-Allow-MAP-notifiers.patch | 37 +++++++++++++++++++++++++
 1 file changed, 37 insertions(+)
 create mode 100644 hw-arm-smmuv3-Allow-MAP-notifiers.patch

diff --git a/hw-arm-smmuv3-Allow-MAP-notifiers.patch b/hw-arm-smmuv3-Allow-MAP-notifiers.patch
new file mode 100644
index 0000000..ec05012
--- /dev/null
+++ b/hw-arm-smmuv3-Allow-MAP-notifiers.patch
@@ -0,0 +1,37 @@
+From 965729b4875f637dacdbf82960347beb65512d12 Mon Sep 17 00:00:00 2001
+From: Eric Auger <eric.auger@redhat.com>
+Date: Wed, 18 Mar 2020 11:17:36 +0100
+Subject: [PATCH] hw/arm/smmuv3: Allow MAP notifiers
+
+We now have all bricks to support nested paging. This
+uses MAP notifiers to map the MSIs. So let's allow MAP
+notifiers to be registered.
+
+Signed-off-by: Eric Auger <eric.auger@redhat.com>
+Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
+---
+ hw/arm/smmuv3.c | 8 --------
+ 1 file changed, 8 deletions(-)
+
+diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
+index 931d6eae57..c26fba118c 100644
+--- a/hw/arm/smmuv3.c
++++ b/hw/arm/smmuv3.c
+@@ -1563,14 +1563,6 @@ static void smmuv3_notify_flag_changed(IOMMUMemoryRegion *iommu,
+     SMMUv3State *s3 = sdev->smmu;
+     SMMUState *s = &(s3->smmu_state);
+ 
+-    if (new & IOMMU_NOTIFIER_MAP) {
+-        int bus_num = pci_bus_num(sdev->bus);
+-        PCIDevice *pcidev = pci_find_device(sdev->bus, bus_num, sdev->devfn);
+-
+-        warn_report("SMMUv3 does not support notification on MAP: "
+-                     "device %s will not function properly", pcidev->name);
+-    }
+-
+     if (old == IOMMU_NOTIFIER_NONE) {
+         trace_smmuv3_notify_flag_add(iommu->parent_obj.name);
+         QLIST_INSERT_HEAD(&s->devices_with_notifiers, sdev, next);
+-- 
+2.27.0
+
-- 
Gitee


From 2117b42cb169ca596bdb42bf862c107b950ea3d4 Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Fri, 6 Nov 2020 14:34:35 +0100
Subject: [PATCH 39/48] pci: Add return_page_response pci ops

Add a new PCI operation that allows to return page responses
to registered VFIO devices

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
---
 pci-Add-return_page_response-pci-ops.patch | 86 ++++++++++++++++++++++
 1 file changed, 86 insertions(+)
 create mode 100644 pci-Add-return_page_response-pci-ops.patch

diff --git a/pci-Add-return_page_response-pci-ops.patch b/pci-Add-return_page_response-pci-ops.patch
new file mode 100644
index 0000000..1337620
--- /dev/null
+++ b/pci-Add-return_page_response-pci-ops.patch
@@ -0,0 +1,86 @@
+From e3b498a1afec138693251bf1bd1fa9b322a880fb Mon Sep 17 00:00:00 2001
+From: Eric Auger <eric.auger@redhat.com>
+Date: Fri, 6 Nov 2020 14:34:35 +0100
+Subject: [PATCH] pci: Add return_page_response pci ops
+
+Add a new PCI operation that allows to return page responses
+to registered VFIO devices
+
+Signed-off-by: Eric Auger <eric.auger@redhat.com>
+Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
+---
+ hw/pci/pci.c             | 16 ++++++++++++++++
+ include/hw/iommu/iommu.h |  8 ++++++++
+ include/hw/pci/pci.h     |  4 ++++
+ 3 files changed, 28 insertions(+)
+
+diff --git a/hw/pci/pci.c b/hw/pci/pci.c
+index f11ca7964e..a8b3d1c071 100644
+--- a/hw/pci/pci.c
++++ b/hw/pci/pci.c
+@@ -2660,6 +2660,22 @@ int pci_device_set_pasid_table(PCIBus *bus, int32_t devfn,
+     return -ENOENT;
+ }
+ 
++int pci_device_return_page_response(PCIBus *bus, int32_t devfn,
++                                    IOMMUPageResponse *resp)
++{
++    PCIDevice *dev;
++
++    if (!bus) {
++        return -EINVAL;
++    }
++
++    dev = bus->devices[devfn];
++    if (dev && dev->pasid_ops && dev->pasid_ops->return_page_response) {
++        return dev->pasid_ops->return_page_response(bus, devfn, resp);
++    }
++    return -ENOENT;
++}
++
+ static void pci_dev_get_w64(PCIBus *b, PCIDevice *dev, void *opaque)
+ {
+     Range *range = opaque;
+diff --git a/include/hw/iommu/iommu.h b/include/hw/iommu/iommu.h
+index 12092bda7b..5890f095b1 100644
+--- a/include/hw/iommu/iommu.h
++++ b/include/hw/iommu/iommu.h
+@@ -24,5 +24,13 @@ typedef struct IOMMUConfig {
+           };
+ } IOMMUConfig;
+ 
++typedef struct IOMMUPageResponse {
++    union {
++#ifdef __linux__
++        struct iommu_page_response resp;
++#endif
++          };
++} IOMMUPageResponse;
++
+ 
+ #endif /* QEMU_HW_IOMMU_IOMMU_H */
+diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
+index bb14ed61b0..5e7e0e4e6f 100644
+--- a/include/hw/pci/pci.h
++++ b/include/hw/pci/pci.h
+@@ -266,6 +266,8 @@ typedef struct PCIReqIDCache PCIReqIDCache;
+ 
+ struct PCIPASIDOps {
+     int (*set_pasid_table)(PCIBus *bus, int32_t devfn, IOMMUConfig *config);
++    int (*return_page_response)(PCIBus *bus, int32_t devfn,
++                                IOMMUPageResponse *resp);
+ };
+ typedef struct PCIPASIDOps PCIPASIDOps;
+ 
+@@ -495,6 +497,8 @@ void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, void *opaque);
+ void pci_setup_pasid_ops(PCIDevice *dev, PCIPASIDOps *ops);
+ bool pci_device_is_pasid_ops_set(PCIBus *bus, int32_t devfn);
+ int pci_device_set_pasid_table(PCIBus *bus, int32_t devfn, IOMMUConfig *config);
++int pci_device_return_page_response(PCIBus *bus, int32_t devfn,
++                                    IOMMUPageResponse *resp);
+ 
+ static inline void
+ pci_set_byte(uint8_t *config, uint8_t val)
+-- 
+2.27.0
+
-- 
Gitee


From 518ab37de314406b2c9de58bfb56405e97d387c2 Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Fri, 6 Nov 2020 12:03:29 -0500
Subject: [PATCH 40/48] vfio/pci: Implement return_page_response page response
 callback

This patch implements the page response path. The
response is written into the page response ring buffer and then
the header's head index is updated. This path is not used
by this series. It is introduced here as a POC for vSVA/ARM
integration.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
---
 ...nt-return_page_response-page-respons.patch | 199 ++++++++++++++++++
 1 file changed, 199 insertions(+)
 create mode 100644 vfio-pci-Implement-return_page_response-page-respons.patch

diff --git a/vfio-pci-Implement-return_page_response-page-respons.patch b/vfio-pci-Implement-return_page_response-page-respons.patch
new file mode 100644
index 0000000..721512e
--- /dev/null
+++ b/vfio-pci-Implement-return_page_response-page-respons.patch
@@ -0,0 +1,199 @@
+From dab7c3ad6d51e9f0c65d864d6128f62697db4604 Mon Sep 17 00:00:00 2001
+From: Eric Auger <eric.auger@redhat.com>
+Date: Fri, 6 Nov 2020 12:03:29 -0500
+Subject: [PATCH] vfio/pci: Implement return_page_response page response
+ callback
+
+This patch implements the page response path. The
+response is written into the page response ring buffer and then
+the header's head index is updated. This path is not used
+by this series. It is introduced here as a POC for vSVA/ARM
+integration.
+
+Signed-off-by: Eric Auger <eric.auger@redhat.com>
+Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
+---
+ hw/vfio/pci.c | 123 ++++++++++++++++++++++++++++++++++++++++++++++++++
+ hw/vfio/pci.h |   2 +
+ 2 files changed, 125 insertions(+)
+
+diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
+index d1198c8a23..6f4083aec8 100644
+--- a/hw/vfio/pci.c
++++ b/hw/vfio/pci.c
+@@ -2662,6 +2662,61 @@ out:
+     g_free(fault_region_info);
+ }
+ 
++static void vfio_init_fault_response_regions(VFIOPCIDevice *vdev, Error **errp)
++{
++    struct vfio_region_info *fault_region_info = NULL;
++    struct vfio_region_info_cap_fault *cap_fault;
++    VFIODevice *vbasedev = &vdev->vbasedev;
++    struct vfio_info_cap_header *hdr;
++    char *fault_region_name;
++    int ret;
++
++    ret = vfio_get_dev_region_info(&vdev->vbasedev,
++                                   VFIO_REGION_TYPE_NESTED,
++                                   VFIO_REGION_SUBTYPE_NESTED_DMA_FAULT_RESPONSE,
++                                   &fault_region_info);
++    if (ret) {
++        goto out;
++    }
++
++    hdr = vfio_get_region_info_cap(fault_region_info,
++                                   VFIO_REGION_INFO_CAP_DMA_FAULT_RESPONSE);
++    if (!hdr) {
++        error_setg(errp, "failed to retrieve DMA FAULT RESPONSE capability");
++        goto out;
++    }
++    cap_fault = container_of(hdr, struct vfio_region_info_cap_fault,
++                             header);
++    if (cap_fault->version != 1) {
++        error_setg(errp, "Unsupported DMA FAULT RESPONSE API version %d",
++                   cap_fault->version);
++        goto out;
++    }
++
++    fault_region_name = g_strdup_printf("%s DMA FAULT RESPONSE %d",
++                                        vbasedev->name,
++                                        fault_region_info->index);
++
++    ret = vfio_region_setup(OBJECT(vdev), vbasedev,
++                            &vdev->dma_fault_response_region,
++                            fault_region_info->index,
++                            fault_region_name);
++    g_free(fault_region_name);
++    if (ret) {
++        error_setg_errno(errp, -ret,
++                         "failed to set up the DMA FAULT RESPONSE region %d",
++                         fault_region_info->index);
++        goto out;
++    }
++
++    ret = vfio_region_mmap(&vdev->dma_fault_response_region);
++    if (ret) {
++        error_setg_errno(errp, -ret, "Failed to mmap the DMA FAULT RESPONSE queue");
++    }
++out:
++    g_free(fault_region_info);
++}
++
+ static void vfio_populate_device(VFIOPCIDevice *vdev, Error **errp)
+ {
+     VFIODevice *vbasedev = &vdev->vbasedev;
+@@ -2737,6 +2792,12 @@ static void vfio_populate_device(VFIOPCIDevice *vdev, Error **errp)
+         return;
+     }
+ 
++    vfio_init_fault_response_regions(vdev, &err);
++    if (err) {
++        error_propagate(errp, err);
++        return;
++    }
++
+     irq_info.index = VFIO_PCI_ERR_IRQ_INDEX;
+ 
+     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
+@@ -2915,8 +2976,68 @@ static int vfio_iommu_set_pasid_table(PCIBus *bus, int32_t devfn,
+     return ioctl(container->fd, VFIO_IOMMU_SET_PASID_TABLE, &info);
+ }
+ 
++static int vfio_iommu_return_page_response(PCIBus *bus, int32_t devfn,
++                                           IOMMUPageResponse *resp)
++{
++    PCIDevice *pdev = bus->devices[devfn];
++    VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
++    struct iommu_page_response *response = &resp->resp;
++    struct vfio_region_dma_fault_response header;
++    struct iommu_page_response *queue;
++    char *queue_buffer = NULL;
++    ssize_t bytes;
++
++    if (!vdev->dma_fault_response_region.mem) {
++        return -EINVAL;
++    }
++
++    /* read the header */
++    bytes = pread(vdev->vbasedev.fd, &header, sizeof(header),
++                  vdev->dma_fault_response_region.fd_offset);
++    if (bytes != sizeof(header)) {
++        error_report("%s unable to read the fault region header (0x%lx)",
++                     __func__, bytes);
++        return -1;
++    }
++
++    /* Normally the fault queue is mmapped */
++    queue = (struct iommu_page_response *)vdev->dma_fault_response_region.mmaps[0].mmap;
++    if (!queue) {
++        size_t queue_size = header.nb_entries * header.entry_size;
++
++        error_report("%s: fault queue not mmapped: slower fault handling",
++                     vdev->vbasedev.name);
++
++        queue_buffer = g_malloc(queue_size);
++        bytes = pread(vdev->vbasedev.fd, queue_buffer, queue_size,
++                      vdev->dma_fault_response_region.fd_offset + header.offset);
++        if (bytes != queue_size) {
++            error_report("%s unable to read the fault queue (0x%lx)",
++                         __func__, bytes);
++            return -1;
++        }
++
++        queue = (struct iommu_page_response *)queue_buffer;
++    }
++    /* deposit the new response in the queue and increment the head */
++    memcpy(queue + header.head, response, header.entry_size);
++
++    vdev->fault_response_head_index =
++        (vdev->fault_response_head_index + 1) % header.nb_entries;
++    bytes = pwrite(vdev->vbasedev.fd, &vdev->fault_response_head_index, 4,
++                   vdev->dma_fault_response_region.fd_offset);
++    if (bytes != 4) {
++        error_report("%s unable to write the fault response region head index (0x%lx)",
++                     __func__, bytes);
++    }
++    g_free(queue_buffer);
++
++    return 0;
++}
++
+ static PCIPASIDOps vfio_pci_pasid_ops = {
+     .set_pasid_table = vfio_iommu_set_pasid_table,
++    .return_page_response = vfio_iommu_return_page_response,
+ };
+ 
+ static void vfio_dma_fault_notifier_handler(void *opaque)
+@@ -3373,6 +3494,7 @@ static void vfio_instance_finalize(Object *obj)
+     vfio_display_finalize(vdev);
+     vfio_bars_finalize(vdev);
+     vfio_region_finalize(&vdev->dma_fault_region);
++    vfio_region_finalize(&vdev->dma_fault_response_region);
+     g_free(vdev->emulated_config_bits);
+     g_free(vdev->rom);
+     /*
+@@ -3394,6 +3516,7 @@ static void vfio_exitfn(PCIDevice *pdev)
+     vfio_unregister_err_notifier(vdev);
+     vfio_unregister_ext_irq_notifiers(vdev);
+     vfio_region_exit(&vdev->dma_fault_region);
++    vfio_region_exit(&vdev->dma_fault_response_region);
+     pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
+     vfio_disable_interrupts(vdev);
+     if (vdev->intx.mmap_timer) {
+diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
+index e31bc0173a..7fdcfa0dc8 100644
+--- a/hw/vfio/pci.h
++++ b/hw/vfio/pci.h
+@@ -143,6 +143,8 @@ typedef struct VFIOPCIDevice {
+     VFIOPCIExtIRQ *ext_irqs;
+     VFIORegion dma_fault_region;
+     uint32_t fault_tail_index;
++    VFIORegion dma_fault_response_region;
++    uint32_t fault_response_head_index;
+     int (*resetfn)(struct VFIOPCIDevice *);
+     uint32_t vendor_id;
+     uint32_t device_id;
+-- 
+2.27.0
+
-- 
Gitee


From 48e4f1552b34336a377baf168337fbe448dfe5ec Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Sat, 31 Jul 2021 10:02:18 +0800
Subject: [PATCH 41/48] vfio/common: Avoid unmap ram section at
 vfio_listener_region_del() in nested mode

The ram section will be unmapped at vfio_prereg_listener_region_del()
in nested mode. So let's avoid unmapping the ram section at
vfio_listener_region_del().

Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
---
 ...d-unmap-ram-section-at-vfio_listener.patch | 39 +++++++++++++++++++
 1 file changed, 39 insertions(+)
 create mode 100644 vfio-common-Avoid-unmap-ram-section-at-vfio_listener.patch

diff --git a/vfio-common-Avoid-unmap-ram-section-at-vfio_listener.patch b/vfio-common-Avoid-unmap-ram-section-at-vfio_listener.patch
new file mode 100644
index 0000000..efcbd1f
--- /dev/null
+++ b/vfio-common-Avoid-unmap-ram-section-at-vfio_listener.patch
@@ -0,0 +1,39 @@
+From 55f3bdd0866be2b1a6223bacf9e00a032daf957c Mon Sep 17 00:00:00 2001
+From: Kunkun Jiang <jiangkunkun@huawei.com>
+Date: Sat, 31 Jul 2021 10:02:18 +0800
+Subject: [PATCH] vfio/common: Avoid unmap ram section at
+ vfio_listener_region_del() in nested mode
+
+The ram section will be unmapped at vfio_prereg_listener_region_del()
+in nested mode. So let's avoid unmapping the ram section at
+vfio_listener_region_del().
+
+Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
+---
+ hw/vfio/common.c | 10 ++++++++++
+ 1 file changed, 10 insertions(+)
+
+diff --git a/hw/vfio/common.c b/hw/vfio/common.c
+index 98dc9e6f84..21a866e545 100644
+--- a/hw/vfio/common.c
++++ b/hw/vfio/common.c
+@@ -1179,6 +1179,16 @@ static void vfio_listener_region_del(MemoryListener *listener,
+             }
+         }
+ 
++        /*
++         * In nested mode, stage 2 (gpa->hpa) and the stage 1
++         * (giova->gpa) are set separately. The ram section
++         * will be unmapped in vfio_prereg_listener_region_del().
++         * Hence it doesn't need to unmap ram section here.
++         */
++        if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
++            return;
++        }
++
+         /*
+          * FIXME: We assume the one big unmap below is adequate to
+          * remove any individual page mappings in the IOMMU which
+-- 
+2.27.0
+
-- 
Gitee


From e4d427543342a2ff0ed269dd0a7dd436fe6e8dba Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Tue, 11 May 2021 10:08:13 +0800
Subject: [PATCH 42/48] vfio: Introduce helpers to mark dirty pages of a RAM
 section

Extract part of the code from vfio_sync_dirty_bitmap to form a
new helper, which allows marking dirty pages of a RAM section.
This helper will be called for nested stage.

Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
---
 ...elpers-to-mark-dirty-pages-of-a-RAM-.patch | 64 +++++++++++++++++++
 1 file changed, 64 insertions(+)
 create mode 100644 vfio-Introduce-helpers-to-mark-dirty-pages-of-a-RAM-.patch

diff --git a/vfio-Introduce-helpers-to-mark-dirty-pages-of-a-RAM-.patch b/vfio-Introduce-helpers-to-mark-dirty-pages-of-a-RAM-.patch
new file mode 100644
index 0000000..274a0c0
--- /dev/null
+++ b/vfio-Introduce-helpers-to-mark-dirty-pages-of-a-RAM-.patch
@@ -0,0 +1,64 @@
+From ff9c1f7e3e17cc2afe1b2dfa545065e91941db8b Mon Sep 17 00:00:00 2001
+From: Kunkun Jiang <jiangkunkun@huawei.com>
+Date: Tue, 11 May 2021 10:08:13 +0800
+Subject: [PATCH] vfio: Introduce helpers to mark dirty pages of a RAM section
+
+Extract part of the code from vfio_sync_dirty_bitmap to form a
+new helper, which allows marking dirty pages of a RAM section.
+This helper will be called for nested stage.
+
+Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
+---
+ hw/vfio/common.c | 22 ++++++++++++++--------
+ 1 file changed, 14 insertions(+), 8 deletions(-)
+
+diff --git a/hw/vfio/common.c b/hw/vfio/common.c
+index 21a866e545..5176fd3a3d 100644
+--- a/hw/vfio/common.c
++++ b/hw/vfio/common.c
+@@ -1304,6 +1304,19 @@ err_out:
+     return ret;
+ }
+ 
++static int vfio_dma_sync_ram_section_dirty_bitmap(VFIOContainer *container,
++                                                  MemoryRegionSection *section)
++{
++    ram_addr_t ram_addr;
++
++    ram_addr = memory_region_get_ram_addr(section->mr) +
++               section->offset_within_region;
++
++    return vfio_get_dirty_bitmap(container,
++                    REAL_HOST_PAGE_ALIGN(section->offset_within_address_space),
++                    int128_get64(section->size), ram_addr);
++}
++
+ typedef struct {
+     IOMMUNotifier n;
+     VFIOGuestIOMMU *giommu;
+@@ -1345,8 +1358,6 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
+ static int vfio_sync_dirty_bitmap(VFIOContainer *container,
+                                   MemoryRegionSection *section)
+ {
+-    ram_addr_t ram_addr;
+-
+     if (memory_region_is_iommu(section->mr)) {
+         VFIOGuestIOMMU *giommu;
+ 
+@@ -1375,12 +1386,7 @@ static int vfio_sync_dirty_bitmap(VFIOContainer *container,
+         return 0;
+     }
+ 
+-    ram_addr = memory_region_get_ram_addr(section->mr) +
+-               section->offset_within_region;
+-
+-    return vfio_get_dirty_bitmap(container,
+-                   REAL_HOST_PAGE_ALIGN(section->offset_within_address_space),
+-                   int128_get64(section->size), ram_addr);
++    return vfio_dma_sync_ram_section_dirty_bitmap(container, section);
+ }
+ 
+ static void vfio_listener_log_sync(MemoryListener *listener,
+-- 
+2.27.0
+
-- 
Gitee


From eae456de7c9b8a9b9d8f9b0c5418876a3279cc98 Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Tue, 11 May 2021 10:08:14 +0800
Subject: [PATCH 43/48] vfio: Add vfio_prereg_listener_log_sync in nested stage

In nested mode, we set up the stage 2 (gpa->hpa) and stage 1
(giova->gpa) separately by vfio_prereg_listener_region_add()
and vfio_listener_region_add(). So when marking dirty pages
we just need to pay attention to stage 2 mappings.

Legacy vfio_listener_log_sync cannot be used in nested stage.
This patch adds vfio_prereg_listener_log_sync to mark dirty
pages in nested mode.

Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
---
 ...ereg_listener_log_sync-in-nested-sta.patch | 74 +++++++++++++++++++
 1 file changed, 74 insertions(+)
 create mode 100644 vfio-Add-vfio_prereg_listener_log_sync-in-nested-sta.patch

diff --git a/vfio-Add-vfio_prereg_listener_log_sync-in-nested-sta.patch b/vfio-Add-vfio_prereg_listener_log_sync-in-nested-sta.patch
new file mode 100644
index 0000000..77a0c8a
--- /dev/null
+++ b/vfio-Add-vfio_prereg_listener_log_sync-in-nested-sta.patch
@@ -0,0 +1,74 @@
+From 4c5350044ac2f61ab8088278b59eb6388ca49ff1 Mon Sep 17 00:00:00 2001
+From: Kunkun Jiang <jiangkunkun@huawei.com>
+Date: Tue, 11 May 2021 10:08:14 +0800
+Subject: [PATCH] vfio: Add vfio_prereg_listener_log_sync in nested stage
+
+In nested mode, we set up the stage 2 (gpa->hpa) and stage 1
+(giova->gpa) separately by vfio_prereg_listener_region_add()
+and vfio_listener_region_add(). So when marking dirty pages
+we just need to pay attention to stage 2 mappings.
+
+Legacy vfio_listener_log_sync cannot be used in nested stage.
+This patch adds vfio_prereg_listener_log_sync to mark dirty
+pages in nested mode.
+
+Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
+---
+ hw/vfio/common.c | 27 +++++++++++++++++++++++++++
+ 1 file changed, 27 insertions(+)
+
+diff --git a/hw/vfio/common.c b/hw/vfio/common.c
+index 5176fd3a3d..6b00bd4c2f 100644
+--- a/hw/vfio/common.c
++++ b/hw/vfio/common.c
+@@ -1317,6 +1317,22 @@ static int vfio_dma_sync_ram_section_dirty_bitmap(VFIOContainer *container,
+                     int128_get64(section->size), ram_addr);
+ }
+ 
++static void vfio_prereg_listener_log_sync(MemoryListener *listener,
++                                          MemoryRegionSection *section)
++{
++    VFIOContainer *container =
++        container_of(listener, VFIOContainer, prereg_listener);
++
++    if (!memory_region_is_ram(section->mr) ||
++        !container->dirty_pages_supported) {
++        return;
++    }
++
++    if (vfio_devices_all_dirty_tracking(container)) {
++        vfio_dma_sync_ram_section_dirty_bitmap(container, section);
++    }
++}
++
+ typedef struct {
+     IOMMUNotifier n;
+     VFIOGuestIOMMU *giommu;
+@@ -1361,6 +1377,16 @@ static int vfio_sync_dirty_bitmap(VFIOContainer *container,
+     if (memory_region_is_iommu(section->mr)) {
+         VFIOGuestIOMMU *giommu;
+ 
++        /*
++         * In nested mode, stage 2 (gpa->hpa) and stage 1 (giova->gpa) are
++         * set up separately. It is inappropriate to pass 'giova' to kernel
++         * to get dirty pages. We only need to focus on stage 2 mapping when
++         * marking dirty pages.
++         */
++        if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
++            return 0;
++        }
++
+         QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
+             if (MEMORY_REGION(giommu->iommu) == section->mr &&
+                 giommu->n.start == section->offset_within_region) {
+@@ -1551,6 +1577,7 @@ static const MemoryListener vfio_memory_listener = {
+ static MemoryListener vfio_memory_prereg_listener = {
+     .region_add = vfio_prereg_listener_region_add,
+     .region_del = vfio_prereg_listener_region_del,
++    .log_sync = vfio_prereg_listener_log_sync,
+ };
+ 
+ static void vfio_listener_release(VFIOContainer *container)
+-- 
+2.27.0
+
-- 
Gitee


From 38c3954435ac76ddeef189548c7787d2c1199442 Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Sat, 31 Jul 2021 09:40:24 +0800
Subject: [PATCH 44/48] vfio: Add vfio_prereg_listener_log_clear to re-enable
 mark dirty pages

When tracking dirty pages, we just need to pay attention to stage 2
mappings. Legacy vfio_listener_log_clear cannot be used in nested
stage. This patch adds vfio_prereg_listener_log_clear to re-enable
dirty page marking in nested mode.

Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
---
 ...ereg_listener_log_clear-to-re-enable.patch | 84 +++++++++++++++++++
 1 file changed, 84 insertions(+)
 create mode 100644 vfio-Add-vfio_prereg_listener_log_clear-to-re-enable.patch

diff --git a/vfio-Add-vfio_prereg_listener_log_clear-to-re-enable.patch b/vfio-Add-vfio_prereg_listener_log_clear-to-re-enable.patch
new file mode 100644
index 0000000..e4da89b
--- /dev/null
+++ b/vfio-Add-vfio_prereg_listener_log_clear-to-re-enable.patch
@@ -0,0 +1,84 @@
+From f959faa36fc100894a44f2e6cd7e02a183ba142a Mon Sep 17 00:00:00 2001
+From: Kunkun Jiang <jiangkunkun@huawei.com>
+Date: Sat, 31 Jul 2021 09:40:24 +0800
+Subject: [PATCH] vfio: Add vfio_prereg_listener_log_clear to re-enable mark
+ dirty pages
+
+When tracking dirty pages, we just need to pay attention to stage 2
+mappings. Legacy vfio_listener_log_clear cannot be used in nested
+stage. This patch adds vfio_prereg_listener_log_clear to re-enable
+dirty page marking in nested mode.
+
+Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
+---
+ hw/vfio/common.c | 40 +++++++++++++++++++++++++++++++++++++++-
+ 1 file changed, 39 insertions(+), 1 deletion(-)
+
+diff --git a/hw/vfio/common.c b/hw/vfio/common.c
+index 6b00bd4c2f..b5f9ba816e 100644
+--- a/hw/vfio/common.c
++++ b/hw/vfio/common.c
+@@ -1550,6 +1550,43 @@ static int vfio_physical_log_clear(VFIOContainer *container,
+     return ret;
+ }
+ 
++static void vfio_prereg_listener_log_clear(MemoryListener *listener,
++                                           MemoryRegionSection *section)
++{
++    VFIOContainer *container =
++        container_of(listener, VFIOContainer, prereg_listener);
++
++    if (!memory_region_is_ram(section->mr)) {
++        return;
++    }
++
++    vfio_physical_log_clear(container, section);
++}
++
++static int vfio_clear_dirty_bitmap(VFIOContainer *container,
++                                   MemoryRegionSection *section)
++{
++    if (memory_region_is_iommu(section->mr)) {
++        /*
++         * In nested mode, stage 2 (gpa->hpa) and stage 1 (giova->gpa) are
++         * set up separately. It is inappropriate to pass 'giova' to kernel
++         * to get dirty pages. We only need to focus on stage 2 mapping when
++         * marking dirty pages.
++         */
++        if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
++            return 0;
++        }
++
++        /*
++         * TODO: x86. With the log_clear() interface added, x86 may implement
++         * its own method.
++         */
++    }
++
++    /* Here we assume that memory_region_is_ram(section->mr) == true */
++    return vfio_physical_log_clear(container, section);
++}
++
+ static void vfio_listener_log_clear(MemoryListener *listener,
+                                     MemoryRegionSection *section)
+ {
+@@ -1561,7 +1598,7 @@ static void vfio_listener_log_clear(MemoryListener *listener,
+     }
+ 
+     if (vfio_devices_all_dirty_tracking(container)) {
+-        vfio_physical_log_clear(container, section);
++        vfio_clear_dirty_bitmap(container, section);
+     }
+ }
+ 
+@@ -1578,6 +1615,7 @@ static MemoryListener vfio_memory_prereg_listener = {
+     .region_add = vfio_prereg_listener_region_add,
+     .region_del = vfio_prereg_listener_region_del,
+     .log_sync = vfio_prereg_listener_log_sync,
++    .log_clear = vfio_prereg_listener_log_clear,
+ };
+ 
+ static void vfio_listener_release(VFIOContainer *container)
+-- 
+2.27.0
+
-- 
Gitee


From 7644dd1549241eaebc151ea8241c1fbb033cd996 Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Tue, 11 May 2021 10:08:15 +0800
Subject: [PATCH 45/48] vfio: Add vfio_prereg_listener_global_log_start/stop in
 nested stage

In nested mode, we set up the stage 2 and stage 1 separately. In my
opinion, vfio_memory_prereg_listener is used for stage 2 and
vfio_memory_listener is used for stage 1. So it feels weird to call
the global_log_start/stop interface in vfio_memory_listener to switch
dirty tracking, although this won't cause any errors. Adding the
global_log_start/stop interface to vfio_memory_prereg_listener
separates stage 2 from stage 1.

Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
---
 ...ereg_listener_global_log_start-stop-.patch | 71 +++++++++++++++++++
 1 file changed, 71 insertions(+)
 create mode 100644 vfio-Add-vfio_prereg_listener_global_log_start-stop-.patch

diff --git a/vfio-Add-vfio_prereg_listener_global_log_start-stop-.patch b/vfio-Add-vfio_prereg_listener_global_log_start-stop-.patch
new file mode 100644
index 0000000..289638a
--- /dev/null
+++ b/vfio-Add-vfio_prereg_listener_global_log_start-stop-.patch
@@ -0,0 +1,71 @@
+From 6aa770f4b83ca068d0c8f3102edda32666a8404d Mon Sep 17 00:00:00 2001
+From: Kunkun Jiang <jiangkunkun@huawei.com>
+Date: Tue, 11 May 2021 10:08:15 +0800
+Subject: [PATCH] vfio: Add vfio_prereg_listener_global_log_start/stop in
+ nested stage
+
+In nested mode, we set up the stage 2 and stage 1 separately. In my
+opinion, vfio_memory_prereg_listener is used for stage 2 and
+vfio_memory_listener is used for stage 1. So it feels weird to call
+the global_log_start/stop interface in vfio_memory_listener to switch
+dirty tracking, although this won't cause any errors. Adding the
+global_log_start/stop interface to vfio_memory_prereg_listener
+separates stage 2 from stage 1.
+
+Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
+---
+ hw/vfio/common.c | 24 ++++++++++++++++++++++++
+ 1 file changed, 24 insertions(+)
+
+diff --git a/hw/vfio/common.c b/hw/vfio/common.c
+index b5f9ba816e..fb7ca63748 100644
+--- a/hw/vfio/common.c
++++ b/hw/vfio/common.c
+@@ -1239,6 +1239,17 @@ static void vfio_listener_log_global_start(MemoryListener *listener)
+ {
+     VFIOContainer *container = container_of(listener, VFIOContainer, listener);
+ 
++    /* For nested mode, vfio_prereg_listener is used to start dirty tracking */
++    if (container->iommu_type != VFIO_TYPE1_NESTING_IOMMU) {
++        vfio_set_dirty_page_tracking(container, true);
++    }
++}
++
++static void vfio_prereg_listener_log_global_start(MemoryListener *listener)
++{
++    VFIOContainer *container =
++        container_of(listener, VFIOContainer, prereg_listener);
++
+     vfio_set_dirty_page_tracking(container, true);
+ }
+ 
+@@ -1246,6 +1257,17 @@ static void vfio_listener_log_global_stop(MemoryListener *listener)
+ {
+     VFIOContainer *container = container_of(listener, VFIOContainer, listener);
+ 
++    /* For nested mode, vfio_prereg_listener is used to stop dirty tracking */
++    if (container->iommu_type != VFIO_TYPE1_NESTING_IOMMU) {
++        vfio_set_dirty_page_tracking(container, false);
++    }
++}
++
++static void vfio_prereg_listener_log_global_stop(MemoryListener *listener)
++{
++    VFIOContainer *container =
++        container_of(listener, VFIOContainer, prereg_listener);
++
+     vfio_set_dirty_page_tracking(container, false);
+ }
+ 
+@@ -1614,6 +1636,8 @@ static const MemoryListener vfio_memory_listener = {
+ static MemoryListener vfio_memory_prereg_listener = {
+     .region_add = vfio_prereg_listener_region_add,
+     .region_del = vfio_prereg_listener_region_del,
++    .log_global_start = vfio_prereg_listener_log_global_start,
++    .log_global_stop = vfio_prereg_listener_log_global_stop,
+     .log_sync = vfio_prereg_listener_log_sync,
+     .log_clear = vfio_prereg_listener_log_clear,
+ };
+-- 
+2.27.0
+
-- 
Gitee


From b06e5516761cb928a29c986df97aee138bb707a5 Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Tue, 11 May 2021 10:08:16 +0800
Subject: [PATCH 46/48] hw/arm/smmuv3: Post-load stage 1 configurations to the
 host

In nested mode, we call the set_pasid_table() callback on each
STE update to pass the guest stage 1 configuration to the host
and apply it at physical level.

In the case of live migration, we need to manually call the
set_pasid_table() to load the guest stage 1 configurations to
the host. If this operation fails, the migration fails.

Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
---
 ...st-load-stage-1-configurations-to-th.patch | 110 ++++++++++++++++++
 1 file changed, 110 insertions(+)
 create mode 100644 hw-arm-smmuv3-Post-load-stage-1-configurations-to-th.patch

diff --git a/hw-arm-smmuv3-Post-load-stage-1-configurations-to-th.patch b/hw-arm-smmuv3-Post-load-stage-1-configurations-to-th.patch
new file mode 100644
index 0000000..c363acb
--- /dev/null
+++ b/hw-arm-smmuv3-Post-load-stage-1-configurations-to-th.patch
@@ -0,0 +1,110 @@
+From 06e43bc658aa80bb5f4da3e43c1c13d4cab6ebdd Mon Sep 17 00:00:00 2001
+From: Kunkun Jiang <jiangkunkun@huawei.com>
+Date: Tue, 11 May 2021 10:08:16 +0800
+Subject: [PATCH] hw/arm/smmuv3: Post-load stage 1 configurations to the host
+
+In nested mode, we call the set_pasid_table() callback on each
+STE update to pass the guest stage 1 configuration to the host
+and apply it at physical level.
+
+In the case of live migration, we need to manually call the
+set_pasid_table() to load the guest stage 1 configurations to
+the host. If this operation fails, the migration fails.
+
+Signed-off-by: Kunkun Jiang <jiangkunkun@huawei.com>
+---
+ hw/arm/smmuv3.c | 33 ++++++++++++++++++++++++++++-----
+ 1 file changed, 28 insertions(+), 5 deletions(-)
+
+diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
+index c26fba118c..f383143db1 100644
+--- a/hw/arm/smmuv3.c
++++ b/hw/arm/smmuv3.c
+@@ -876,7 +876,7 @@ static void smmuv3_inv_notifiers_iova(SMMUState *s, int asid, dma_addr_t iova,
+     }
+ }
+ 
+-static void smmuv3_notify_config_change(SMMUState *bs, uint32_t sid)
++static int smmuv3_notify_config_change(SMMUState *bs, uint32_t sid)
+ {
+ #ifdef __linux__
+     IOMMUMemoryRegion *mr = smmu_iommu_mr(bs, sid);
+@@ -884,9 +884,10 @@ static void smmuv3_notify_config_change(SMMUState *bs, uint32_t sid)
+     IOMMUConfig iommu_config = {};
+     SMMUTransCfg *cfg;
+     SMMUDevice *sdev;
++    int ret;
+ 
+     if (!mr) {
+-        return;
++        return 0;
+     }
+ 
+     sdev = container_of(mr, SMMUDevice, iommu);
+@@ -895,13 +896,13 @@ static void smmuv3_notify_config_change(SMMUState *bs, uint32_t sid)
+     smmuv3_flush_config(sdev);
+ 
+     if (!pci_device_is_pasid_ops_set(sdev->bus, sdev->devfn)) {
+-        return;
++        return 0;
+     }
+ 
+     cfg = smmuv3_get_config(sdev, &event);
+ 
+     if (!cfg) {
+-        return;
++        return 0;
+     }
+ 
+     iommu_config.pasid_cfg.argsz = sizeof(struct iommu_pasid_table_config);
+@@ -923,10 +924,13 @@ static void smmuv3_notify_config_change(SMMUState *bs, uint32_t sid)
+                                       iommu_config.pasid_cfg.config,
+                                       iommu_config.pasid_cfg.base_ptr);
+ 
+-    if (pci_device_set_pasid_table(sdev->bus, sdev->devfn, &iommu_config)) {
++    ret = pci_device_set_pasid_table(sdev->bus, sdev->devfn, &iommu_config);
++    if (ret) {
+         error_report("Failed to pass PASID table to host for iommu mr %s (%m)",
+                      mr->parent_obj.name);
+     }
++
++    return ret;
+ #endif
+ }
+ 
+@@ -1494,6 +1498,24 @@ static void smmu_realize(DeviceState *d, Error **errp)
+     smmu_init_irq(s, dev);
+ }
+ 
++static int smmuv3_post_load(void *opaque, int version_id)
++{
++    SMMUv3State *s3 = opaque;
++    SMMUState *s = &(s3->smmu_state);
++    SMMUDevice *sdev;
++    int ret = 0;
++
++    QLIST_FOREACH(sdev, &s->devices_with_notifiers, next) {
++        uint32_t sid = smmu_get_sid(sdev);
++        ret = smmuv3_notify_config_change(s, sid);
++        if (ret) {
++            break;
++        }
++    }
++
++    return ret;
++}
++
+ static const VMStateDescription vmstate_smmuv3_queue = {
+     .name = "smmuv3_queue",
+     .version_id = 1,
+@@ -1512,6 +1534,7 @@ static const VMStateDescription vmstate_smmuv3 = {
+     .version_id = 1,
+     .minimum_version_id = 1,
+     .priority = MIG_PRI_IOMMU,
++    .post_load = smmuv3_post_load,
+     .fields = (VMStateField[]) {
+         VMSTATE_UINT32(features, SMMUv3State),
+         VMSTATE_UINT8(sid_size, SMMUv3State),
+-- 
+2.27.0
+
-- 
Gitee


From b94d8926eef4b6ef85800371212a85d87e0ff68c Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Wed, 4 Aug 2021 11:28:28 +0800
Subject: [PATCH 47/48] spec: Update patch and changelog with !183 Support VFIO
 migration manual clear interface  & vSMMUv3/pSMMUv3 2 stage VFIO integration
 & Support migration in SMMUv3 nested mode  !183

vfio: Support host translation granule size
vfio/migrate: Move switch of dirty tracking into vfio_memory_listener
vfio: Fix unregister SaveVMHandler in vfio_migration_finalize
migration/ram: Reduce unnecessary rate limiting
migration/ram: Optimize ram_save_host_page()
qdev/monitors: Fix reundant error_setg of qdev_add_device
linux-headers: update against 5.10 and manual clear vfio dirty log series
vfio: Maintain DMA mapping range for the container
vfio/migration: Add support for manual clear vfio dirty log
hw/arm/smmuv3: Support 16K translation granule
hw/arm/smmuv3: Set the restoration priority of the vSMMUv3 explicitly
hw/vfio/common: trace vfio_connect_container operations
update-linux-headers: Import iommu.h
vfio.h and iommu.h header update against 5.10
memory: Add new fields in IOTLBEntry
hw/arm/smmuv3: Improve stage1 ASID invalidation
hw/arm/smmu-common: Allow domain invalidation for NH_ALL/NSNH_ALL
memory: Add IOMMU_ATTR_VFIO_NESTED IOMMU memory region attribute
memory: Add IOMMU_ATTR_MSI_TRANSLATE IOMMU memory region attribute
memory: Introduce IOMMU Memory Region inject_faults API
iommu: Introduce generic header
pci: introduce PCIPASIDOps to PCIDevice
vfio: Force nested if iommu requires it
vfio: Introduce hostwin_from_range helper
vfio: Introduce helpers to DMA map/unmap a RAM section
vfio: Set up nested stage mappings
vfio: Pass stage 1 MSI bindings to the host
vfio: Helper to get IRQ info including capabilities
vfio/pci: Register handler for iommu fault
vfio/pci: Set up the DMA FAULT region
vfio/pci: Implement the DMA fault handler
hw/arm/smmuv3: Advertise MSI_TRANSLATE attribute
hw/arm/smmuv3: Store the PASID table GPA in the translation config
hw/arm/smmuv3: Fill the IOTLBEntry arch_id on NH_VA invalidation
hw/arm/smmuv3: Fill the IOTLBEntry leaf field on NH_VA invalidation
hw/arm/smmuv3: Pass stage 1 configurations to the host
hw/arm/smmuv3: Implement fault injection
hw/arm/smmuv3: Allow MAP notifiers
pci: Add return_page_response pci ops
vfio/pci: Implement return_page_response page response callback
vfio/common: Avoid unmap ram section at vfio_listener_region_del() in nested mode
vfio: Introduce helpers to mark dirty pages of a RAM section
vfio: Add vfio_prereg_listener_log_sync in nested stage
vfio: Add vfio_prereg_listener_log_clear to re-enable mark dirty pages
vfio: Add vfio_prereg_listener_global_log_start/stop in nested stage
hw/arm/smmuv3: Post-load stage 1 configurations to the host

Signed-off-by: Chen Qun <kuhn.chenqun@huawei.com>
---
 qemu.spec | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 94 insertions(+)

diff --git a/qemu.spec b/qemu.spec
index 62c6f6c..4b23d32 100644
--- a/qemu.spec
+++ b/qemu.spec
@@ -509,6 +509,52 @@ Patch0496: Fix-use-after-free-in-vfio_migration_probe.patch
 Patch0497: vfio-Make-migration-support-experimental.patch
 Patch0498: vfio-Change-default-dirty-pages-tracking-behavior-du.patch
 Patch0499: vfio-Fix-vfio_listener_log_sync-function-name-typo.patch
+Patch0500: vfio-Support-host-translation-granule-size.patch
+Patch0501: vfio-migrate-Move-switch-of-dirty-tracking-into-vfio.patch
+Patch0502: vfio-Fix-unregister-SaveVMHandler-in-vfio_migration_.patch
+Patch0503: migration-ram-Reduce-unnecessary-rate-limiting.patch
+Patch0504: migration-ram-Optimize-ram_save_host_page.patch
+Patch0505: qdev-monitors-Fix-reundant-error_setg-of-qdev_add_de.patch
+Patch0506: linux-headers-update-against-5.10-and-manual-clear-v.patch
+Patch0507: vfio-Maintain-DMA-mapping-range-for-the-container.patch
+Patch0508: vfio-migration-Add-support-for-manual-clear-vfio-dir.patch
+Patch0509: hw-arm-smmuv3-Support-16K-translation-granule.patch
+Patch0510: hw-arm-smmuv3-Set-the-restoration-priority-of-the-vS.patch
+Patch0511: hw-vfio-common-trace-vfio_connect_container-operatio.patch
+Patch0512: update-linux-headers-Import-iommu.h.patch
+Patch0513: vfio.h-and-iommu.h-header-update-against-5.10.patch
+Patch0514: memory-Add-new-fields-in-IOTLBEntry.patch
+Patch0515: hw-arm-smmuv3-Improve-stage1-ASID-invalidation.patch
+Patch0516: hw-arm-smmu-common-Allow-domain-invalidation-for-NH_.patch
+Patch0517: memory-Add-IOMMU_ATTR_VFIO_NESTED-IOMMU-memory-regio.patch
+Patch0518: memory-Add-IOMMU_ATTR_MSI_TRANSLATE-IOMMU-memory-reg.patch
+Patch0519: memory-Introduce-IOMMU-Memory-Region-inject_faults-A.patch
+Patch0520: iommu-Introduce-generic-header.patch
+Patch0521: pci-introduce-PCIPASIDOps-to-PCIDevice.patch
+Patch0522: vfio-Force-nested-if-iommu-requires-it.patch
+Patch0523: vfio-Introduce-hostwin_from_range-helper.patch
+Patch0524: vfio-Introduce-helpers-to-DMA-map-unmap-a-RAM-sectio.patch
+Patch0525: vfio-Set-up-nested-stage-mappings.patch
+Patch0526: vfio-Pass-stage-1-MSI-bindings-to-the-host.patch
+Patch0527: vfio-Helper-to-get-IRQ-info-including-capabilities.patch
+Patch0528: vfio-pci-Register-handler-for-iommu-fault.patch
+Patch0529: vfio-pci-Set-up-the-DMA-FAULT-region.patch
+Patch0530: vfio-pci-Implement-the-DMA-fault-handler.patch
+Patch0531: hw-arm-smmuv3-Advertise-MSI_TRANSLATE-attribute.patch
+Patch0532: hw-arm-smmuv3-Store-the-PASID-table-GPA-in-the-trans.patch
+Patch0533: hw-arm-smmuv3-Fill-the-IOTLBEntry-arch_id-on-NH_VA-i.patch
+Patch0534: hw-arm-smmuv3-Fill-the-IOTLBEntry-leaf-field-on-NH_V.patch
+Patch0535: hw-arm-smmuv3-Pass-stage-1-configurations-to-the-hos.patch
+Patch0536: hw-arm-smmuv3-Implement-fault-injection.patch
+Patch0537: hw-arm-smmuv3-Allow-MAP-notifiers.patch
+Patch0538: pci-Add-return_page_response-pci-ops.patch
+Patch0539: vfio-pci-Implement-return_page_response-page-respons.patch
+Patch0540: vfio-common-Avoid-unmap-ram-section-at-vfio_listener.patch
+Patch0541: vfio-Introduce-helpers-to-mark-dirty-pages-of-a-RAM-.patch
+Patch0542: vfio-Add-vfio_prereg_listener_log_sync-in-nested-sta.patch
+Patch0543: vfio-Add-vfio_prereg_listener_log_clear-to-re-enable.patch
+Patch0544: vfio-Add-vfio_prereg_listener_global_log_start-stop-.patch
+Patch0545: hw-arm-smmuv3-Post-load-stage-1-configurations-to-th.patch
 
 BuildRequires: flex
 BuildRequires: gcc
@@ -903,6 +949,54 @@ getent passwd qemu >/dev/null || \
 %endif
 
 %changelog
+* Wed Aug 04 2021 Chen Qun <kuhn.chenqun@huawei.com>
+- vfio: Support host translation granule size
+- vfio/migrate: Move switch of dirty tracking into vfio_memory_listener
+- vfio: Fix unregister SaveVMHandler in vfio_migration_finalize
+- migration/ram: Reduce unnecessary rate limiting
+- migration/ram: Optimize ram_save_host_page()
+- qdev/monitors: Fix reundant error_setg of qdev_add_device
+- linux-headers: update against 5.10 and manual clear vfio dirty log series
+- vfio: Maintain DMA mapping range for the container
+- vfio/migration: Add support for manual clear vfio dirty log
+- hw/arm/smmuv3: Support 16K translation granule
+- hw/arm/smmuv3: Set the restoration priority of the vSMMUv3 explicitly
+- hw/vfio/common: trace vfio_connect_container operations
+- update-linux-headers: Import iommu.h
+- vfio.h and iommu.h header update against 5.10
+- memory: Add new fields in IOTLBEntry
+- hw/arm/smmuv3: Improve stage1 ASID invalidation
+- hw/arm/smmu-common: Allow domain invalidation for NH_ALL/NSNH_ALL
+- memory: Add IOMMU_ATTR_VFIO_NESTED IOMMU memory region attribute
+- memory: Add IOMMU_ATTR_MSI_TRANSLATE IOMMU memory region attribute
+- memory: Introduce IOMMU Memory Region inject_faults API
+- iommu: Introduce generic header
+- pci: introduce PCIPASIDOps to PCIDevice
+- vfio: Force nested if iommu requires it
+- vfio: Introduce hostwin_from_range helper
+- vfio: Introduce helpers to DMA map/unmap a RAM section
+- vfio: Set up nested stage mappings
+- vfio: Pass stage 1 MSI bindings to the host
+- vfio: Helper to get IRQ info including capabilities
+- vfio/pci: Register handler for iommu fault
+- vfio/pci: Set up the DMA FAULT region
+- vfio/pci: Implement the DMA fault handler
+- hw/arm/smmuv3: Advertise MSI_TRANSLATE attribute
+- hw/arm/smmuv3: Store the PASID table GPA in the translation config
+- hw/arm/smmuv3: Fill the IOTLBEntry arch_id on NH_VA invalidation
+- hw/arm/smmuv3: Fill the IOTLBEntry leaf field on NH_VA invalidation
+- hw/arm/smmuv3: Pass stage 1 configurations to the host
+- hw/arm/smmuv3: Implement fault injection
+- hw/arm/smmuv3: Allow MAP notifiers
+- pci: Add return_page_response pci ops
+- vfio/pci: Implement return_page_response page response callback
+- vfio/common: Avoid unmap ram section at vfio_listener_region_del() in nested mode
+- vfio: Introduce helpers to mark dirty pages of a RAM section
+- vfio: Add vfio_prereg_listener_log_sync in nested stage
+- vfio: Add vfio_prereg_listener_log_clear to re-enable mark dirty pages
+- vfio: Add vfio_prereg_listener_global_log_start/stop in nested stage
+- hw/arm/smmuv3: Post-load stage 1 configurations to the host
+
 * Tue Aug 03 2021 Chen Qun <kuhn.chenqun@huawei.com>
 - kvm: split too big memory section on several memslots
 - kvm: Reallocate dirty_bmap when we change a slot
-- 
Gitee


From 216918bb04b7c1f0bdb79ab67201d1fc94bad883 Mon Sep 17 00:00:00 2001
From: Chen Qun <kuhn.chenqun@huawei.com>
Date: Wed, 4 Aug 2021 11:28:28 +0800
Subject: [PATCH 48/48] spec: Update release version with !183

increase release version by one

Signed-off-by: Chen Qun <kuhn.chenqun@huawei.com>
---
 qemu.spec | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/qemu.spec b/qemu.spec
index 4b23d32..981d9e1 100644
--- a/qemu.spec
+++ b/qemu.spec
@@ -1,6 +1,6 @@
 Name: qemu
 Version: 4.1.0
-Release: 75
+Release: 76
 Epoch: 2
 Summary: QEMU is a generic and open source machine emulator and virtualizer
 License: GPLv2 and BSD and MIT and CC-BY-SA-4.0
-- 
Gitee