diff --git a/hw-arm-smmu-common-Allow-domain-invalidation-for-NH_.patch b/hw-arm-smmu-common-Allow-domain-invalidation-for-NH_.patch
new file mode 100644
index 0000000000000000000000000000000000000000..1824b8e4fd0fa31603df903bb640065c70858907
--- /dev/null
+++ b/hw-arm-smmu-common-Allow-domain-invalidation-for-NH_.patch
@@ -0,0 +1,33 @@
+From 8bf9d1dc67335c1fb921a56825f6bf198a568091 Mon Sep 17 00:00:00 2001
+From: Eric Auger
+Date: Fri, 19 Mar 2021 12:22:48 -0400
+Subject: [PATCH] hw/arm/smmu-common: Allow domain invalidation for
+ NH_ALL/NSNH_ALL
+
+NH_ALL/NSNH_ALL corresponds to a domain granularity invalidation,
+i.e. the whole notifier range gets invalidated, whatever the ASID.
+So let's set the granularity to IOMMU_INV_GRAN_DOMAIN to allow
+the consumer to benefit from the info if it can.
+
+Signed-off-by: Eric Auger
+Suggested-by: chenxiang (M)
+Signed-off-by: Kunkun Jiang
+---
+ hw/arm/smmu-common.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c
+index 717d22bcbe..de9468d33f 100644
+--- a/hw/arm/smmu-common.c
++++ b/hw/arm/smmu-common.c
+@@ -395,6 +395,7 @@ static void smmu_unmap_notifier_range(IOMMUNotifier *n)
+     entry.iova = n->start;
+     entry.perm = IOMMU_NONE;
+     entry.addr_mask = n->end - n->start;
++    entry.granularity = IOMMU_INV_GRAN_DOMAIN;
+
+     memory_region_notify_one(n, &entry);
+ }
+--
+2.27.0
+
diff --git a/hw-arm-smmuv3-Advertise-MSI_TRANSLATE-attribute.patch b/hw-arm-smmuv3-Advertise-MSI_TRANSLATE-attribute.patch
new file mode 100644
index 0000000000000000000000000000000000000000..89f9292287246e65a25587df2da43f2765457312
--- /dev/null
+++ b/hw-arm-smmuv3-Advertise-MSI_TRANSLATE-attribute.patch
@@ -0,0 +1,32 @@
+From bc602a4d1355774a0a44e8fbf6dd842049dd63f3 Mon Sep 17 00:00:00 2001
+From: Eric Auger
+Date: Tue, 28 Aug 2018 09:21:53 -0400
+Subject: [PATCH] hw/arm/smmuv3: Advertise MSI_TRANSLATE attribute
+
+The SMMUv3 has the peculiarity of translating MSI
+transactions. Let's advertise the corresponding
+attribute.
+
+Signed-off-by: Eric Auger
+Signed-off-by: Kunkun Jiang
+---
+ hw/arm/smmuv3.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
+index 55eed5189e..83d59b6d28 100644
+--- a/hw/arm/smmuv3.c
++++ b/hw/arm/smmuv3.c
+@@ -1538,6 +1538,9 @@ static int smmuv3_get_attr(IOMMUMemoryRegion *iommu,
+     if (attr == IOMMU_ATTR_VFIO_NESTED) {
+         *(bool *) data = true;
+         return 0;
++    } else if (attr == IOMMU_ATTR_MSI_TRANSLATE) {
++        *(bool *) data = true;
++        return 0;
+     }
+     return -EINVAL;
+ }
+--
+2.27.0
+
diff --git a/hw-arm-smmuv3-Allow-MAP-notifiers.patch b/hw-arm-smmuv3-Allow-MAP-notifiers.patch
new file mode 100644
index 0000000000000000000000000000000000000000..ec050121fcd57a2e942774ce76fceb8ed5039cf2
--- /dev/null
+++ b/hw-arm-smmuv3-Allow-MAP-notifiers.patch
@@ -0,0 +1,37 @@
+From 965729b4875f637dacdbf82960347beb65512d12 Mon Sep 17 00:00:00 2001
+From: Eric Auger
+Date: Wed, 18 Mar 2020 11:17:36 +0100
+Subject: [PATCH] hw/arm/smmuv3: Allow MAP notifiers
+
+We now have all the bricks to support nested paging. This
+uses MAP notifiers to map the MSIs. So let's allow MAP
+notifiers to be registered.
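+
+As an illustration for reviewers (a minimal sketch, not part of the
+change; the handler name, range variables and MemoryRegion are made
+up, and the QEMU 4.1 notifier API is assumed), a consumer such as
+VFIO can now register a MAP notifier against the vSMMUv3 IOMMU
+memory region:
+
+    static void nested_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
+    {
+        /* receives MAP events, e.g. for guest MSI bindings */
+    }
+
+    IOMMUNotifier notifier;
+
+    iommu_notifier_init(&notifier, nested_map_notify, IOMMU_NOTIFIER_MAP,
+                        start_iova, end_iova, iommu_idx);
+    memory_region_register_iommu_notifier(mr, &notifier);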
+
+Signed-off-by: Eric Auger
+Signed-off-by: Kunkun Jiang
+---
+ hw/arm/smmuv3.c | 8 --------
+ 1 file changed, 8 deletions(-)
+
+diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
+index 931d6eae57..c26fba118c 100644
+--- a/hw/arm/smmuv3.c
++++ b/hw/arm/smmuv3.c
+@@ -1563,14 +1563,6 @@ static void smmuv3_notify_flag_changed(IOMMUMemoryRegion *iommu,
+     SMMUv3State *s3 = sdev->smmu;
+     SMMUState *s = &(s3->smmu_state);
+
+-    if (new & IOMMU_NOTIFIER_MAP) {
+-        int bus_num = pci_bus_num(sdev->bus);
+-        PCIDevice *pcidev = pci_find_device(sdev->bus, bus_num, sdev->devfn);
+-
+-        warn_report("SMMUv3 does not support notification on MAP: "
+-                    "device %s will not function properly", pcidev->name);
+-    }
+-
+     if (old == IOMMU_NOTIFIER_NONE) {
+         trace_smmuv3_notify_flag_add(iommu->parent_obj.name);
+         QLIST_INSERT_HEAD(&s->devices_with_notifiers, sdev, next);
+--
+2.27.0
+
diff --git a/hw-arm-smmuv3-Fill-the-IOTLBEntry-arch_id-on-NH_VA-i.patch b/hw-arm-smmuv3-Fill-the-IOTLBEntry-arch_id-on-NH_VA-i.patch
new file mode 100644
index 0000000000000000000000000000000000000000..1f3425e7eddae1fee87d0cb8d86587f4e6011ee5
--- /dev/null
+++ b/hw-arm-smmuv3-Fill-the-IOTLBEntry-arch_id-on-NH_VA-i.patch
@@ -0,0 +1,34 @@
+From 8108317641b3cb378bf1862dc3c0a73d1e0976ce Mon Sep 17 00:00:00 2001
+From: Eric Auger
+Date: Tue, 4 Sep 2018 08:48:33 -0400
+Subject: [PATCH] hw/arm/smmuv3: Fill the IOTLBEntry arch_id on NH_VA
+ invalidation
+
+When the guest invalidates one S1 entry, it passes the ASID.
+When propagating this invalidation down to the host, the ASID
+information must also be passed. So let's fill the arch_id field
+introduced for that purpose and accordingly set the flags to
+indicate its presence.
+
+Signed-off-by: Eric Auger
+Signed-off-by: Kunkun Jiang
+---
+ hw/arm/smmuv3.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
+index f8e721f949..c6b950af35 100644
+--- a/hw/arm/smmuv3.c
++++ b/hw/arm/smmuv3.c
+@@ -824,6 +824,8 @@ static void smmuv3_notify_iova(IOMMUMemoryRegion *mr,
+     entry.iova = iova;
+     entry.addr_mask = (1 << tt->granule_sz) - 1;
+     entry.perm = IOMMU_NONE;
++    entry.flags = IOMMU_INV_FLAGS_ARCHID;
++    entry.arch_id = asid;
+
+     memory_region_notify_one(n, &entry);
+ }
+--
+2.27.0
+
diff --git a/hw-arm-smmuv3-Fill-the-IOTLBEntry-leaf-field-on-NH_V.patch b/hw-arm-smmuv3-Fill-the-IOTLBEntry-leaf-field-on-NH_V.patch
new file mode 100644
index 0000000000000000000000000000000000000000..febaffaa655ecbe70419d692e586e56b1561f330
--- /dev/null
+++ b/hw-arm-smmuv3-Fill-the-IOTLBEntry-leaf-field-on-NH_V.patch
@@ -0,0 +1,81 @@
+From 6393ad5c1ba6a04b038d80ecc1e663ad91ed0d21 Mon Sep 17 00:00:00 2001
+From: Eric Auger
+Date: Thu, 14 Mar 2019 09:55:13 -0400
+Subject: [PATCH] hw/arm/smmuv3: Fill the IOTLBEntry leaf field on NH_VA
+ invalidation
+
+Let's propagate the leaf attribute throughout the invalidation path.
+This hint is used to reduce the scope of the invalidations to the
+last level of translation. Not enforcing it induces large performance
+penalties in nested mode.
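+
+For reference, a minimal consumer-side sketch (not part of this
+patch; host_invalidate() is a made-up helper) of how the hint could
+be honoured, using the flags introduced by "memory: Add new fields
+in IOTLBEntry":
+
+    if (entry->perm == IOMMU_NONE &&
+        (entry->flags & IOMMU_INV_FLAGS_ARCHID)) {
+        /* entry->leaf == true: only last-level (leaf) TLB entries
+         * need to be invalidated for this IOVA */
+        host_invalidate(entry->arch_id, entry->iova, entry->leaf);
+    }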
+ +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/arm/smmuv3.c | 13 ++++++++----- + 1 file changed, 8 insertions(+), 5 deletions(-) + +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index c6b950af35..c1caa6bc3a 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -795,7 +795,7 @@ epilogue: + static void smmuv3_notify_iova(IOMMUMemoryRegion *mr, + IOMMUNotifier *n, + int asid, +- dma_addr_t iova) ++ dma_addr_t iova, bool leaf) + { + SMMUDevice *sdev = container_of(mr, SMMUDevice, iommu); + SMMUEventInfo event = {}; +@@ -826,6 +826,7 @@ static void smmuv3_notify_iova(IOMMUMemoryRegion *mr, + entry.perm = IOMMU_NONE; + entry.flags = IOMMU_INV_FLAGS_ARCHID; + entry.arch_id = asid; ++ entry.leaf = leaf; + + memory_region_notify_one(n, &entry); + } +@@ -854,7 +855,8 @@ static void smmuv3_notify_asid(IOMMUMemoryRegion *mr, + } + + /* invalidate an asid/iova tuple in all mr's */ +-static void smmuv3_inv_notifiers_iova(SMMUState *s, int asid, dma_addr_t iova) ++static void smmuv3_inv_notifiers_iova(SMMUState *s, int asid, dma_addr_t iova, ++ bool leaf) + { + SMMUDevice *sdev; + +@@ -865,7 +867,7 @@ static void smmuv3_inv_notifiers_iova(SMMUState *s, int asid, dma_addr_t iova) + trace_smmuv3_inv_notifiers_iova(mr->parent_obj.name, asid, iova); + + IOMMU_NOTIFIER_FOREACH(n, mr) { +- smmuv3_notify_iova(mr, n, asid, iova); ++ smmuv3_notify_iova(mr, n, asid, iova, leaf); + } + } + } +@@ -1018,9 +1020,10 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) + { + dma_addr_t addr = CMD_ADDR(&cmd); + uint16_t vmid = CMD_VMID(&cmd); ++ bool leaf = CMD_LEAF(&cmd); + + trace_smmuv3_cmdq_tlbi_nh_vaa(vmid, addr); +- smmuv3_inv_notifiers_iova(bs, -1, addr); ++ smmuv3_inv_notifiers_iova(bs, -1, addr, leaf); + smmu_iotlb_inv_all(bs); + break; + } +@@ -1032,7 +1035,7 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) + bool leaf = CMD_LEAF(&cmd); + + trace_smmuv3_cmdq_tlbi_nh_va(vmid, asid, addr, leaf); +- smmuv3_inv_notifiers_iova(bs, asid, addr); ++ smmuv3_inv_notifiers_iova(bs, asid, addr, leaf); + smmu_iotlb_inv_iova(bs, asid, addr); + break; + } +-- +2.27.0 + diff --git a/hw-arm-smmuv3-Implement-fault-injection.patch b/hw-arm-smmuv3-Implement-fault-injection.patch new file mode 100644 index 0000000000000000000000000000000000000000..0260e28a05e7d30ec2b637eadb2251890c7e3701 --- /dev/null +++ b/hw-arm-smmuv3-Implement-fault-injection.patch @@ -0,0 +1,107 @@ +From 55bfd18b7671c82705d83d543281add0afcda31f Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Thu, 13 Sep 2018 14:24:45 +0200 +Subject: [PATCH] hw/arm/smmuv3: Implement fault injection + +We convert iommu_fault structs received from the kernel +into the data struct used by the emulation code and record +the evnts into the virtual event queue. 
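+
+As an illustration (sketch only; vfio_dma_fault_read() is a made-up
+helper standing in for reading the device's fault region), the VFIO
+side could feed host faults into this path through the
+memory_region_inject_faults() API introduced in this series:
+
+    struct iommu_fault buf[64];
+    int count = vfio_dma_fault_read(vdev, buf, ARRAY_SIZE(buf));
+
+    if (count > 0) {
+        memory_region_inject_faults(iommu_mr, count, buf);
+    }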
+ +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/arm/smmuv3.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 71 insertions(+) + +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index 3d2151857d..931d6eae57 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -1594,6 +1594,76 @@ static int smmuv3_get_attr(IOMMUMemoryRegion *iommu, + return -EINVAL; + } + ++struct iommu_fault; ++ ++static inline int ++smmuv3_inject_faults(IOMMUMemoryRegion *iommu_mr, int count, ++ struct iommu_fault *buf) ++{ ++#ifdef __linux__ ++ SMMUDevice *sdev = container_of(iommu_mr, SMMUDevice, iommu); ++ SMMUv3State *s3 = sdev->smmu; ++ uint32_t sid = smmu_get_sid(sdev); ++ int i; ++ ++ for (i = 0; i < count; i++) { ++ SMMUEventInfo info = {}; ++ struct iommu_fault_unrecoverable *record; ++ ++ if (buf[i].type != IOMMU_FAULT_DMA_UNRECOV) { ++ continue; ++ } ++ ++ info.sid = sid; ++ record = &buf[i].event; ++ ++ switch (record->reason) { ++ case IOMMU_FAULT_REASON_PASID_INVALID: ++ info.type = SMMU_EVT_C_BAD_SUBSTREAMID; ++ /* TODO further fill info.u.c_bad_substream */ ++ break; ++ case IOMMU_FAULT_REASON_PASID_FETCH: ++ info.type = SMMU_EVT_F_CD_FETCH; ++ break; ++ case IOMMU_FAULT_REASON_BAD_PASID_ENTRY: ++ info.type = SMMU_EVT_C_BAD_CD; ++ /* TODO further fill info.u.c_bad_cd */ ++ break; ++ case IOMMU_FAULT_REASON_WALK_EABT: ++ info.type = SMMU_EVT_F_WALK_EABT; ++ info.u.f_walk_eabt.addr = record->addr; ++ info.u.f_walk_eabt.addr2 = record->fetch_addr; ++ break; ++ case IOMMU_FAULT_REASON_PTE_FETCH: ++ info.type = SMMU_EVT_F_TRANSLATION; ++ info.u.f_translation.addr = record->addr; ++ break; ++ case IOMMU_FAULT_REASON_OOR_ADDRESS: ++ info.type = SMMU_EVT_F_ADDR_SIZE; ++ info.u.f_addr_size.addr = record->addr; ++ break; ++ case IOMMU_FAULT_REASON_ACCESS: ++ info.type = SMMU_EVT_F_ACCESS; ++ info.u.f_access.addr = record->addr; ++ break; ++ case IOMMU_FAULT_REASON_PERMISSION: ++ info.type = SMMU_EVT_F_PERMISSION; ++ info.u.f_permission.addr = record->addr; ++ break; ++ default: ++ warn_report("%s Unexpected fault reason received from host: %d", ++ __func__, record->reason); ++ continue; ++ } ++ ++ smmuv3_record_event(s3, &info); ++ } ++ return 0; ++#else ++ return -1; ++#endif ++} ++ + static void smmuv3_iommu_memory_region_class_init(ObjectClass *klass, + void *data) + { +@@ -1602,6 +1672,7 @@ static void smmuv3_iommu_memory_region_class_init(ObjectClass *klass, + imrc->translate = smmuv3_translate; + imrc->notify_flag_changed = smmuv3_notify_flag_changed; + imrc->get_attr = smmuv3_get_attr; ++ imrc->inject_faults = smmuv3_inject_faults; + } + + static const TypeInfo smmuv3_type_info = { +-- +2.27.0 + diff --git a/hw-arm-smmuv3-Improve-stage1-ASID-invalidation.patch b/hw-arm-smmuv3-Improve-stage1-ASID-invalidation.patch new file mode 100644 index 0000000000000000000000000000000000000000..10639e89f957b970b78f2c0de930ad8b92032d0f --- /dev/null +++ b/hw-arm-smmuv3-Improve-stage1-ASID-invalidation.patch @@ -0,0 +1,105 @@ +From c0027c2e744c8ed99e937d3cbc88f400ab63a316 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Sun, 14 Feb 2021 12:30:57 -0500 +Subject: [PATCH] hw/arm/smmuv3: Improve stage1 ASID invalidation + +At the moment ASID invalidation command (CMD_TLBI_NH_ASID) is +propagated as a domain invalidation (the whole notifier range +is invalidated independently on any ASID information). + +The new granularity field now allows to be more precise and +restrict the invalidation to a peculiar ASID. Set the corresponding +fields and flag. 
+ +We still keep the iova and addr_mask settings for consumers that +do not support the new fields, like VHOST. + +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/arm/smmuv3.c | 42 ++++++++++++++++++++++++++++++++++++++++-- + hw/arm/trace-events | 1 + + 2 files changed, 41 insertions(+), 2 deletions(-) + +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index 3b5723e1e1..0ef1ca376c 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -827,6 +827,29 @@ static void smmuv3_notify_iova(IOMMUMemoryRegion *mr, + memory_region_notify_one(n, &entry); + } + ++/** ++ * smmuv3_notify_asid - call the notifier @n for a given asid ++ * ++ * @mr: IOMMU mr region handle ++ * @n: notifier to be called ++ * @asid: address space ID or negative value if we don't care ++ */ ++static void smmuv3_notify_asid(IOMMUMemoryRegion *mr, ++ IOMMUNotifier *n, int asid) ++{ ++ IOMMUTLBEntry entry; ++ ++ entry.target_as = &address_space_memory; ++ entry.perm = IOMMU_NONE; ++ entry.granularity = IOMMU_INV_GRAN_PASID; ++ entry.flags = IOMMU_INV_FLAGS_ARCHID; ++ entry.arch_id = asid; ++ entry.iova = n->start; ++ entry.addr_mask = n->end - n->start; ++ ++ memory_region_notify_one(n, &entry); ++} ++ + /* invalidate an asid/iova tuple in all mr's */ + static void smmuv3_inv_notifiers_iova(SMMUState *s, int asid, dma_addr_t iova) + { +@@ -844,6 +867,22 @@ static void smmuv3_inv_notifiers_iova(SMMUState *s, int asid, dma_addr_t iova) + } + } + ++static void smmuv3_s1_asid_inval(SMMUState *s, uint16_t asid) ++{ ++ SMMUDevice *sdev; ++ ++ trace_smmuv3_s1_asid_inval(asid); ++ QLIST_FOREACH(sdev, &s->devices_with_notifiers, next) { ++ IOMMUMemoryRegion *mr = &sdev->iommu; ++ IOMMUNotifier *n; ++ ++ IOMMU_NOTIFIER_FOREACH(n, mr) { ++ smmuv3_notify_asid(mr, n, asid); ++ } ++ } ++ smmu_iotlb_inv_asid(s, asid); ++} ++ + static int smmuv3_cmdq_consume(SMMUv3State *s) + { + SMMUState *bs = ARM_SMMU(s); +@@ -963,8 +1002,7 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) + uint16_t asid = CMD_ASID(&cmd); + + trace_smmuv3_cmdq_tlbi_nh_asid(asid); +- smmu_inv_notifiers_all(&s->smmu_state); +- smmu_iotlb_inv_asid(bs, asid); ++ smmuv3_s1_asid_inval(bs, asid); + break; + } + case SMMU_CMD_TLBI_NH_ALL: +diff --git a/hw/arm/trace-events b/hw/arm/trace-events +index 0acedcedc6..4512d20115 100644 +--- a/hw/arm/trace-events ++++ b/hw/arm/trace-events +@@ -44,6 +44,7 @@ smmuv3_config_cache_hit(uint32_t sid, uint32_t hits, uint32_t misses, uint32_t p + smmuv3_config_cache_miss(uint32_t sid, uint32_t hits, uint32_t misses, uint32_t perc) "Config cache MISS for sid %d (hits=%d, misses=%d, hit rate=%d)" + smmuv3_cmdq_tlbi_nh_va(int vmid, int asid, uint64_t addr, bool leaf) "vmid =%d asid =%d addr=0x%"PRIx64" leaf=%d" + smmuv3_cmdq_tlbi_nh_vaa(int vmid, uint64_t addr) "vmid =%d addr=0x%"PRIx64 ++smmuv3_s1_asid_inval(int asid) "asid=%d" + smmuv3_cmdq_tlbi_nh(void) "" + smmuv3_cmdq_tlbi_nh_asid(uint16_t asid) "asid=%d" + smmu_iotlb_cache_hit(uint16_t asid, uint64_t addr, uint32_t hit, uint32_t miss, uint32_t p) "IOTLB cache HIT asid=%d addr=0x%"PRIx64" hit=%d miss=%d hit rate=%d" +-- +2.27.0 + diff --git a/hw-arm-smmuv3-Pass-stage-1-configurations-to-the-hos.patch b/hw-arm-smmuv3-Pass-stage-1-configurations-to-the-hos.patch new file mode 100644 index 0000000000000000000000000000000000000000..a615b8664bd6b9c3603073bfd7ec0bb505e70ef8 --- /dev/null +++ b/hw-arm-smmuv3-Pass-stage-1-configurations-to-the-hos.patch @@ -0,0 +1,147 @@ +From d0a1ce3c46246b6ef5510ac1d5c18308417ed525 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Thu, 9 Aug 2018 
21:04:19 +0200 +Subject: [PATCH] hw/arm/smmuv3: Pass stage 1 configurations to the host + +In case PASID PciOps are set for the device we call +the set_pasid_table() callback on each STE update. + +This allows to pass the guest stage 1 configuration +to the host and apply it at physical level. + +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/arm/smmuv3.c | 77 +++++++++++++++++++++++++++++++++++---------- + hw/arm/trace-events | 2 +- + 2 files changed, 61 insertions(+), 18 deletions(-) + +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index c1caa6bc3a..3d2151857d 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -16,6 +16,10 @@ + * with this program; if not, see . + */ + ++#ifdef __linux__ ++#include "linux/iommu.h" ++#endif ++ + #include "qemu/osdep.h" + #include "hw/boards.h" + #include "sysemu/sysemu.h" +@@ -872,6 +876,60 @@ static void smmuv3_inv_notifiers_iova(SMMUState *s, int asid, dma_addr_t iova, + } + } + ++static void smmuv3_notify_config_change(SMMUState *bs, uint32_t sid) ++{ ++#ifdef __linux__ ++ IOMMUMemoryRegion *mr = smmu_iommu_mr(bs, sid); ++ SMMUEventInfo event = {.type = SMMU_EVT_NONE, .sid = sid}; ++ IOMMUConfig iommu_config = {}; ++ SMMUTransCfg *cfg; ++ SMMUDevice *sdev; ++ ++ if (!mr) { ++ return; ++ } ++ ++ sdev = container_of(mr, SMMUDevice, iommu); ++ ++ /* flush QEMU config cache */ ++ smmuv3_flush_config(sdev); ++ ++ if (!pci_device_is_pasid_ops_set(sdev->bus, sdev->devfn)) { ++ return; ++ } ++ ++ cfg = smmuv3_get_config(sdev, &event); ++ ++ if (!cfg) { ++ return; ++ } ++ ++ iommu_config.pasid_cfg.argsz = sizeof(struct iommu_pasid_table_config); ++ iommu_config.pasid_cfg.version = PASID_TABLE_CFG_VERSION_1; ++ iommu_config.pasid_cfg.format = IOMMU_PASID_FORMAT_SMMUV3; ++ iommu_config.pasid_cfg.base_ptr = cfg->s1ctxptr; ++ iommu_config.pasid_cfg.pasid_bits = 0; ++ iommu_config.pasid_cfg.vendor_data.smmuv3.version = PASID_TABLE_SMMUV3_CFG_VERSION_1; ++ ++ if (cfg->disabled || cfg->bypassed) { ++ iommu_config.pasid_cfg.config = IOMMU_PASID_CONFIG_BYPASS; ++ } else if (cfg->aborted) { ++ iommu_config.pasid_cfg.config = IOMMU_PASID_CONFIG_ABORT; ++ } else { ++ iommu_config.pasid_cfg.config = IOMMU_PASID_CONFIG_TRANSLATE; ++ } ++ ++ trace_smmuv3_notify_config_change(mr->parent_obj.name, ++ iommu_config.pasid_cfg.config, ++ iommu_config.pasid_cfg.base_ptr); ++ ++ if (pci_device_set_pasid_table(sdev->bus, sdev->devfn, &iommu_config)) { ++ error_report("Failed to pass PASID table to host for iommu mr %s (%m)", ++ mr->parent_obj.name); ++ } ++#endif ++} ++ + static void smmuv3_s1_asid_inval(SMMUState *s, uint16_t asid) + { + SMMUDevice *sdev; +@@ -938,22 +996,14 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) + case SMMU_CMD_CFGI_STE: + { + uint32_t sid = CMD_SID(&cmd); +- IOMMUMemoryRegion *mr = smmu_iommu_mr(bs, sid); +- SMMUDevice *sdev; + + if (CMD_SSEC(&cmd)) { + cmd_error = SMMU_CERROR_ILL; + break; + } + +- if (!mr) { +- break; +- } +- + trace_smmuv3_cmdq_cfgi_ste(sid); +- sdev = container_of(mr, SMMUDevice, iommu); +- smmuv3_flush_config(sdev); +- ++ smmuv3_notify_config_change(bs, sid); + break; + } + case SMMU_CMD_CFGI_STE_RANGE: /* same as SMMU_CMD_CFGI_ALL */ +@@ -970,14 +1020,7 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) + trace_smmuv3_cmdq_cfgi_ste_range(start, end); + + for (i = start; i <= end; i++) { +- IOMMUMemoryRegion *mr = smmu_iommu_mr(bs, i); +- SMMUDevice *sdev; +- +- if (!mr) { +- continue; +- } +- sdev = container_of(mr, SMMUDevice, iommu); +- smmuv3_flush_config(sdev); ++ smmuv3_notify_config_change(bs, i); + } 
+ break; + } +diff --git a/hw/arm/trace-events b/hw/arm/trace-events +index 4512d20115..cbbe2ccafd 100644 +--- a/hw/arm/trace-events ++++ b/hw/arm/trace-events +@@ -53,4 +53,4 @@ smmuv3_config_cache_inv(uint32_t sid) "Config cache INV for sid %d" + smmuv3_notify_flag_add(const char *iommu) "ADD SMMUNotifier node for iommu mr=%s" + smmuv3_notify_flag_del(const char *iommu) "DEL SMMUNotifier node for iommu mr=%s" + smmuv3_inv_notifiers_iova(const char *name, uint16_t asid, uint64_t iova) "iommu mr=%s asid=%d iova=0x%"PRIx64 +- ++smmuv3_notify_config_change(const char *name, uint8_t config, uint64_t s1ctxptr) "iommu mr=%s config=%d s1ctxptr=0x%"PRIx64 +-- +2.27.0 + diff --git a/hw-arm-smmuv3-Post-load-stage-1-configurations-to-th.patch b/hw-arm-smmuv3-Post-load-stage-1-configurations-to-th.patch new file mode 100644 index 0000000000000000000000000000000000000000..c363acb60c0fce72a986b6056aa74bb578b7a992 --- /dev/null +++ b/hw-arm-smmuv3-Post-load-stage-1-configurations-to-th.patch @@ -0,0 +1,110 @@ +From 06e43bc658aa80bb5f4da3e43c1c13d4cab6ebdd Mon Sep 17 00:00:00 2001 +From: Kunkun Jiang +Date: Tue, 11 May 2021 10:08:16 +0800 +Subject: [PATCH] hw/arm/smmuv3: Post-load stage 1 configurations to the host + +In nested mode, we call the set_pasid_table() callback on each +STE update to pass the guest stage 1 configuration to the host +and apply it at physical level. + +In the case of live migration, we need to manually call the +set_pasid_table() to load the guest stage 1 configurations to +the host. If this operation fails, the migration fails. + +Signed-off-by: Kunkun Jiang +--- + hw/arm/smmuv3.c | 33 ++++++++++++++++++++++++++++----- + 1 file changed, 28 insertions(+), 5 deletions(-) + +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index c26fba118c..f383143db1 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -876,7 +876,7 @@ static void smmuv3_inv_notifiers_iova(SMMUState *s, int asid, dma_addr_t iova, + } + } + +-static void smmuv3_notify_config_change(SMMUState *bs, uint32_t sid) ++static int smmuv3_notify_config_change(SMMUState *bs, uint32_t sid) + { + #ifdef __linux__ + IOMMUMemoryRegion *mr = smmu_iommu_mr(bs, sid); +@@ -884,9 +884,10 @@ static void smmuv3_notify_config_change(SMMUState *bs, uint32_t sid) + IOMMUConfig iommu_config = {}; + SMMUTransCfg *cfg; + SMMUDevice *sdev; ++ int ret; + + if (!mr) { +- return; ++ return 0; + } + + sdev = container_of(mr, SMMUDevice, iommu); +@@ -895,13 +896,13 @@ static void smmuv3_notify_config_change(SMMUState *bs, uint32_t sid) + smmuv3_flush_config(sdev); + + if (!pci_device_is_pasid_ops_set(sdev->bus, sdev->devfn)) { +- return; ++ return 0; + } + + cfg = smmuv3_get_config(sdev, &event); + + if (!cfg) { +- return; ++ return 0; + } + + iommu_config.pasid_cfg.argsz = sizeof(struct iommu_pasid_table_config); +@@ -923,10 +924,13 @@ static void smmuv3_notify_config_change(SMMUState *bs, uint32_t sid) + iommu_config.pasid_cfg.config, + iommu_config.pasid_cfg.base_ptr); + +- if (pci_device_set_pasid_table(sdev->bus, sdev->devfn, &iommu_config)) { ++ ret = pci_device_set_pasid_table(sdev->bus, sdev->devfn, &iommu_config); ++ if (ret) { + error_report("Failed to pass PASID table to host for iommu mr %s (%m)", + mr->parent_obj.name); + } ++ ++ return ret; + #endif + } + +@@ -1494,6 +1498,24 @@ static void smmu_realize(DeviceState *d, Error **errp) + smmu_init_irq(s, dev); + } + ++static int smmuv3_post_load(void *opaque, int version_id) ++{ ++ SMMUv3State *s3 = opaque; ++ SMMUState *s = &(s3->smmu_state); ++ SMMUDevice *sdev; ++ int ret = 0; ++ 
++    QLIST_FOREACH(sdev, &s->devices_with_notifiers, next) {
++        uint32_t sid = smmu_get_sid(sdev);
++        ret = smmuv3_notify_config_change(s, sid);
++        if (ret) {
++            break;
++        }
++    }
++
++    return ret;
++}
++
+ static const VMStateDescription vmstate_smmuv3_queue = {
+     .name = "smmuv3_queue",
+     .version_id = 1,
+@@ -1512,6 +1534,7 @@ static const VMStateDescription vmstate_smmuv3 = {
+     .version_id = 1,
+     .minimum_version_id = 1,
+     .priority = MIG_PRI_IOMMU,
++    .post_load = smmuv3_post_load,
+     .fields = (VMStateField[]) {
+         VMSTATE_UINT32(features, SMMUv3State),
+         VMSTATE_UINT8(sid_size, SMMUv3State),
+--
+2.27.0
+
diff --git a/hw-arm-smmuv3-Set-the-restoration-priority-of-the-vS.patch b/hw-arm-smmuv3-Set-the-restoration-priority-of-the-vS.patch
new file mode 100644
index 0000000000000000000000000000000000000000..1139feaed62705a6baebbecba25ad0355b761daf
--- /dev/null
+++ b/hw-arm-smmuv3-Set-the-restoration-priority-of-the-vS.patch
@@ -0,0 +1,33 @@
+From eceb9213e23d15d5b4342b6a6a8368f4fec60c2f Mon Sep 17 00:00:00 2001
+From: Zenghui Yu
+Date: Mon, 19 Oct 2020 17:15:08 +0800
+Subject: [PATCH] hw/arm/smmuv3: Set the restoration priority of the vSMMUv3
+ explicitly
+
+Ensure the vSMMUv3 will be restored before all PCIe devices so that DMA
+translation can work properly during migration.
+
+Signed-off-by: Zenghui Yu
+Message-id: 20201019091508.197-1-yuzenghui@huawei.com
+Acked-by: Eric Auger
+Signed-off-by: Peter Maydell
+Signed-off-by: Kunkun Jiang
+---
+ hw/arm/smmuv3.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
+index 7911944c59..3b5723e1e1 100644
+--- a/hw/arm/smmuv3.c
++++ b/hw/arm/smmuv3.c
+@@ -1424,6 +1424,7 @@ static const VMStateDescription vmstate_smmuv3 = {
+     .name = "smmuv3",
+     .version_id = 1,
+     .minimum_version_id = 1,
++    .priority = MIG_PRI_IOMMU,
+     .fields = (VMStateField[]) {
+         VMSTATE_UINT32(features, SMMUv3State),
+         VMSTATE_UINT8(sid_size, SMMUv3State),
+--
+2.27.0
+
diff --git a/hw-arm-smmuv3-Store-the-PASID-table-GPA-in-the-trans.patch b/hw-arm-smmuv3-Store-the-PASID-table-GPA-in-the-trans.patch
new file mode 100644
index 0000000000000000000000000000000000000000..8ed3590b6e3c3863486db0082be983a7b7d4968c
--- /dev/null
+++ b/hw-arm-smmuv3-Store-the-PASID-table-GPA-in-the-trans.patch
@@ -0,0 +1,45 @@
+From 6fc85d8a6022d94ffec4cc118472cde583706bfb Mon Sep 17 00:00:00 2001
+From: Eric Auger
+Date: Thu, 9 Aug 2018 20:56:44 +0200
+Subject: [PATCH] hw/arm/smmuv3: Store the PASID table GPA in the translation
+ config
+
+For VFIO integration we will need to pass the Context Descriptor (CD)
+table GPA to the host. The CD table is also referred to as the PASID
+table. Its GPA corresponds to the s1ctxptr field of the Stream Table
+Entry. So let's decode and store it in the configuration structure.
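+
+For context, the stored field is what gets handed to the host later
+in this series (excerpted from "hw/arm/smmuv3: Pass stage 1
+configurations to the host", not part of this patch):
+
+    cfg->s1ctxptr = STE_CTXPTR(ste);                  /* this patch */
+    ...
+    iommu_config.pasid_cfg.base_ptr = cfg->s1ctxptr;  /* later patch */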
+
+Signed-off-by: Eric Auger
+Signed-off-by: Kunkun Jiang
+---
+ hw/arm/smmuv3.c | 1 +
+ include/hw/arm/smmu-common.h | 1 +
+ 2 files changed, 2 insertions(+)
+
+diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
+index 83d59b6d28..f8e721f949 100644
+--- a/hw/arm/smmuv3.c
++++ b/hw/arm/smmuv3.c
+@@ -352,6 +352,7 @@ static int decode_ste(SMMUv3State *s, SMMUTransCfg *cfg,
+                       "SMMUv3 S1 stalling fault model not allowed yet\n");
+         goto bad_ste;
+     }
++    cfg->s1ctxptr = STE_CTXPTR(ste);
+     return 0;
+
+ bad_ste:
+diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h
+index 1f37844e5c..353668f4ea 100644
+--- a/include/hw/arm/smmu-common.h
++++ b/include/hw/arm/smmu-common.h
+@@ -68,6 +68,7 @@ typedef struct SMMUTransCfg {
+     uint8_t tbi;    /* Top Byte Ignore */
+     uint16_t asid;
+     SMMUTransTableInfo tt[2];
++    dma_addr_t s1ctxptr;
+     uint32_t iotlb_hits;    /* counts IOTLB hits for this asid */
+     uint32_t iotlb_misses;    /* counts IOTLB misses for this asid */
+ } SMMUTransCfg;
+--
+2.27.0
+
diff --git a/hw-arm-smmuv3-Support-16K-translation-granule.patch b/hw-arm-smmuv3-Support-16K-translation-granule.patch
new file mode 100644
index 0000000000000000000000000000000000000000..08c4bc5603401f6e5735daa6767dfa2aa2785255
--- /dev/null
+++ b/hw-arm-smmuv3-Support-16K-translation-granule.patch
@@ -0,0 +1,49 @@
+From 008dec30dea19950ff48a34c54441d065c1f228b Mon Sep 17 00:00:00 2001
+From: Kunkun Jiang
+Date: Wed, 31 Mar 2021 14:47:13 +0800
+Subject: [PATCH] hw/arm/smmuv3: Support 16K translation granule
+
+The driver can query some bits in SMMUv3 IDR5 to learn which
+translation granules are supported. Arm recommends that SMMUv3
+implementations support at least 4K and 64K granules. But in
+the vSMMUv3, there seems to be no reason not to support the 16K
+translation granule. In addition, if 16K is not supported,
+vSVA will fail to be enabled in the future for a 16K guest
+kernel. So it'd be better to support it.
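+
+For reviewers, the granule_sz values checked in decode_cd() are log2
+page sizes, so the extended test accepts exactly the three granules
+now advertised in IDR5:
+
+    granule_sz == 12  ->  1 << 12 ==  4K
+    granule_sz == 14  ->  1 << 14 == 16K
+    granule_sz == 16  ->  1 << 16 == 64K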
+ +Signed-off-by: Kunkun Jiang +Reviewed-by: Eric Auger +Tested-by: Eric Auger +Signed-off-by: Peter Maydell +--- + hw/arm/smmuv3.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index e96d5beb9a..7911944c59 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -254,8 +254,9 @@ static void smmuv3_init_regs(SMMUv3State *s) + s->idr[1] = FIELD_DP32(s->idr[1], IDR1, EVENTQS, SMMU_EVENTQS); + s->idr[1] = FIELD_DP32(s->idr[1], IDR1, CMDQS, SMMU_CMDQS); + +- /* 4K and 64K granule support */ ++ /* 4K, 16K and 64K granule support */ + s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN4K, 1); ++ s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN16K, 1); + s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN64K, 1); + s->idr[5] = FIELD_DP32(s->idr[5], IDR5, OAS, SMMU_IDR5_OAS); /* 44 bits */ + +@@ -480,7 +481,8 @@ static int decode_cd(SMMUTransCfg *cfg, CD *cd, SMMUEventInfo *event) + + tg = CD_TG(cd, i); + tt->granule_sz = tg2granule(tg, i); +- if ((tt->granule_sz != 12 && tt->granule_sz != 16) || CD_ENDI(cd)) { ++ if ((tt->granule_sz != 12 && tt->granule_sz != 14 && ++ tt->granule_sz != 16) || CD_ENDI(cd)) { + goto bad_cd; + } + +-- +2.27.0 + diff --git a/hw-vfio-common-trace-vfio_connect_container-operatio.patch b/hw-vfio-common-trace-vfio_connect_container-operatio.patch new file mode 100644 index 0000000000000000000000000000000000000000..bd952359250359770ea8d51711e88be943ee2c72 --- /dev/null +++ b/hw-vfio-common-trace-vfio_connect_container-operatio.patch @@ -0,0 +1,53 @@ +From b107e6ec2a5a34e0ba95345a89dcf5f505ad9da4 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Mon, 22 Feb 2021 10:13:55 -0500 +Subject: [PATCH] hw/vfio/common: trace vfio_connect_container operations + +We currently trace vfio_disconnect_container() but we do not trace +the container <-> group creation, which can be useful to understand +the VFIO topology. 
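+
+For example (illustrative values only; actual group and fd numbers
+depend on the host), enabling the new trace points yields lines such
+as:
+
+    vfio_connect_new_container group=4 new container fd=21
+    vfio_connect_existing_container group=5 existing container fd=21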
+ +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/vfio/common.c | 3 +++ + hw/vfio/trace-events | 2 ++ + 2 files changed, 5 insertions(+) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index 206fb83e28..fefa2ccfdf 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -1848,6 +1848,8 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, + QLIST_FOREACH(container, &space->containers, next) { + if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) { + group->container = container; ++ trace_vfio_connect_existing_container(group->groupid, ++ container->fd); + QLIST_INSERT_HEAD(&container->group_list, group, container_next); + vfio_kvm_device_add_group(group); + return 0; +@@ -1881,6 +1883,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, + if (ret) { + goto free_container_exit; + } ++ trace_vfio_connect_new_container(group->groupid, container->fd); + + switch (container->iommu_type) { + case VFIO_TYPE1v2_IOMMU: +diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events +index 575ebde6e0..561dc6e758 100644 +--- a/hw/vfio/trace-events ++++ b/hw/vfio/trace-events +@@ -102,6 +102,8 @@ vfio_listener_region_add_no_dma_map(const char *name, uint64_t iova, uint64_t si + vfio_listener_region_del_skip(uint64_t start, uint64_t end) "SKIPPING region_del 0x%"PRIx64" - 0x%"PRIx64 + vfio_listener_region_del(uint64_t start, uint64_t end) "region_del 0x%"PRIx64" - 0x%"PRIx64 + vfio_disconnect_container(int fd) "close container->fd=%d" ++vfio_connect_existing_container(int groupid, int container_fd) "group=%d existing container fd=%d" ++vfio_connect_new_container(int groupid, int container_fd) "group=%d new container fd=%d" + vfio_put_group(int fd) "close group->fd=%d" + vfio_get_device(const char * name, unsigned int flags, unsigned int num_regions, unsigned int num_irqs) "Device %s flags: %u, regions: %u, irqs: %u" + vfio_put_base_device(int fd) "close vdev->fd=%d" +-- +2.27.0 + diff --git a/iommu-Introduce-generic-header.patch b/iommu-Introduce-generic-header.patch new file mode 100644 index 0000000000000000000000000000000000000000..76e0c0c80ff83bfd8a5f0130ca73c0623e0efc35 --- /dev/null +++ b/iommu-Introduce-generic-header.patch @@ -0,0 +1,53 @@ +From e8055075dbbc932afccc1f18f4acc093fe9e4dc3 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Tue, 9 Jul 2019 12:20:12 +0200 +Subject: [PATCH] iommu: Introduce generic header + +This header is meant to exposes data types used by +several IOMMU devices such as struct for SVA and +nested stage configuration. + +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + include/hw/iommu/iommu.h | 28 ++++++++++++++++++++++++++++ + 1 file changed, 28 insertions(+) + create mode 100644 include/hw/iommu/iommu.h + +diff --git a/include/hw/iommu/iommu.h b/include/hw/iommu/iommu.h +new file mode 100644 +index 0000000000..12092bda7b +--- /dev/null ++++ b/include/hw/iommu/iommu.h +@@ -0,0 +1,28 @@ ++/* ++ * common header for iommu devices ++ * ++ * Copyright Red Hat, Inc. 2019 ++ * ++ * Authors: ++ * Eric Auger ++ * ++ * This work is licensed under the terms of the GNU GPL, version 2. See ++ * the COPYING file in the top-level directory. 
++ */ ++ ++#ifndef QEMU_HW_IOMMU_IOMMU_H ++#define QEMU_HW_IOMMU_IOMMU_H ++#ifdef __linux__ ++#include ++#endif ++ ++typedef struct IOMMUConfig { ++ union { ++#ifdef __linux__ ++ struct iommu_pasid_table_config pasid_cfg; ++#endif ++ }; ++} IOMMUConfig; ++ ++ ++#endif /* QEMU_HW_IOMMU_IOMMU_H */ +-- +2.27.0 + diff --git a/linux-headers-update-against-5.10-and-manual-clear-v.patch b/linux-headers-update-against-5.10-and-manual-clear-v.patch new file mode 100644 index 0000000000000000000000000000000000000000..0315fc2c1a30be23b4643c30d783e5259ef11931 --- /dev/null +++ b/linux-headers-update-against-5.10-and-manual-clear-v.patch @@ -0,0 +1,90 @@ +From 79efeccd41d761b68946df68e5431eff399ccbd5 Mon Sep 17 00:00:00 2001 +From: Zenghui Yu +Date: Sat, 8 May 2021 17:31:03 +0800 +Subject: [PATCH] linux-headers: update against 5.10 and manual clear vfio + dirty log series + +The new capability VFIO_DIRTY_LOG_MANUAL_CLEAR and the new ioctl +VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR and +VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP have been introduced in +the kernel, update the header to add them. + +Signed-off-by: Zenghui Yu +Signed-off-by: Kunkun Jiang +--- + linux-headers/linux/vfio.h | 37 ++++++++++++++++++++++++++++++++++++- + 1 file changed, 36 insertions(+), 1 deletion(-) + +diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h +index a90672494d..120387ba58 100644 +--- a/linux-headers/linux/vfio.h ++++ b/linux-headers/linux/vfio.h +@@ -46,6 +46,16 @@ + */ + #define VFIO_NOIOMMU_IOMMU 8 + ++/* ++ * The vfio_iommu driver may support user clears dirty log manually, which means ++ * dirty log can be requested to not cleared automatically after dirty log is ++ * copied to userspace, it's user's duty to clear dirty log. ++ * ++ * Note: please refer to VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR and ++ * VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP. ++ */ ++#define VFIO_DIRTY_LOG_MANUAL_CLEAR 11 ++ + /* + * The IOCTL interface is designed for extensibility by embedding the + * structure length (argsz) and flags into structures passed between +@@ -1074,6 +1084,7 @@ struct vfio_bitmap { + * field. No guarantee is made to the user that arbitrary unmaps of iova + * or size different from those used in the original mapping call will + * succeed. ++ * + * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP should be set to get the dirty bitmap + * before unmapping IO virtual addresses. When this flag is set, the user must + * provide a struct vfio_bitmap in data[]. User must provide zero-allocated +@@ -1133,8 +1144,30 @@ struct vfio_iommu_type1_dma_unmap { + * actual bitmap. If dirty pages logging is not enabled, an error will be + * returned. + * +- * Only one of the flags _START, _STOP and _GET may be specified at a time. ++ * The VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR flag is almost same as ++ * VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP, except that it requires underlying ++ * dirty bitmap is not cleared automatically. The user can clear it manually by ++ * calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP flag set. + * ++ * Calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP flag set, ++ * instructs the IOMMU driver to clear the dirty status of pages in a bitmap ++ * for IOMMU container for a given IOVA range. The user must specify the IOVA ++ * range, the bitmap and the pgsize through the structure ++ * vfio_iommu_type1_dirty_bitmap_get in the data[] portion. 
This interface ++ * supports clearing a bitmap of the smallest supported pgsize only and can be ++ * modified in future to clear a bitmap of any specified supported pgsize. The ++ * user must provide a memory area for the bitmap memory and specify its size ++ * in bitmap.size. One bit is used to represent one page consecutively starting ++ * from iova offset. The user should provide page size in bitmap.pgsize field. ++ * A bit set in the bitmap indicates that the page at that offset from iova is ++ * cleared the dirty status, and dirty tracking is re-enabled for that page. The ++ * caller must set argsz to a value including the size of structure ++ * vfio_iommu_dirty_bitmap_get, but excluing the size of the actual bitmap. If ++ * dirty pages logging is not enabled, an error will be returned. Note: user ++ * should clear dirty log before handle corresponding dirty pages. ++ * ++ * Only one of the flags _START, _STOP, _GET, _GET_NOCLEAR_, and _CLEAR may be ++ * specified at a time. + */ + struct vfio_iommu_type1_dirty_bitmap { + __u32 argsz; +@@ -1142,6 +1175,8 @@ struct vfio_iommu_type1_dirty_bitmap { + #define VFIO_IOMMU_DIRTY_PAGES_FLAG_START (1 << 0) + #define VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP (1 << 1) + #define VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP (1 << 2) ++#define VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR (1 << 3) ++#define VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP (1 << 4) + __u8 data[]; + }; + +-- +2.27.0 + diff --git a/memory-Add-IOMMU_ATTR_MSI_TRANSLATE-IOMMU-memory-reg.patch b/memory-Add-IOMMU_ATTR_MSI_TRANSLATE-IOMMU-memory-reg.patch new file mode 100644 index 0000000000000000000000000000000000000000..c67de46045dcbdca04a8a78d8ca0d44b27a794c2 --- /dev/null +++ b/memory-Add-IOMMU_ATTR_MSI_TRANSLATE-IOMMU-memory-reg.patch @@ -0,0 +1,32 @@ +From b7f4f3b71a179a21a90ca32ef7d6ea000fb0e3bd Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Mon, 25 Mar 2019 16:35:05 +0100 +Subject: [PATCH] memory: Add IOMMU_ATTR_MSI_TRANSLATE IOMMU memory region + attribute + +We introduce a new IOMMU Memory Region attribute, IOMMU_ATTR_MSI_TRANSLATE +which tells whether the virtual IOMMU translates MSIs. ARM SMMU +will expose this attribute since, as opposed to Intel DMAR, MSIs +are translated as any other DMA requests. + +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + include/exec/memory.h | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/include/exec/memory.h b/include/exec/memory.h +index 74606e14aa..716b07e115 100644 +--- a/include/exec/memory.h ++++ b/include/exec/memory.h +@@ -242,6 +242,7 @@ struct MemoryRegionOps { + enum IOMMUMemoryRegionAttr { + IOMMU_ATTR_SPAPR_TCE_FD, + IOMMU_ATTR_VFIO_NESTED, ++ IOMMU_ATTR_MSI_TRANSLATE, + }; + + /** +-- +2.27.0 + diff --git a/memory-Add-IOMMU_ATTR_VFIO_NESTED-IOMMU-memory-regio.patch b/memory-Add-IOMMU_ATTR_VFIO_NESTED-IOMMU-memory-regio.patch new file mode 100644 index 0000000000000000000000000000000000000000..3932161dc8aeb2377a64f77c1ccc2e8a5c0d9a6a --- /dev/null +++ b/memory-Add-IOMMU_ATTR_VFIO_NESTED-IOMMU-memory-regio.patch @@ -0,0 +1,72 @@ +From 5f4291f431add76b8754a5fb2d62ab4108ece73f Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Mon, 1 Jul 2019 11:30:30 +0200 +Subject: [PATCH] memory: Add IOMMU_ATTR_VFIO_NESTED IOMMU memory region + attribute + +We introduce a new IOMMU Memory Region attribute, +IOMMU_ATTR_VFIO_NESTED that tells whether the virtual IOMMU +requires HW nested paging for VFIO integration. 
+ +Current Intel virtual IOMMU device supports "Caching +Mode" and does not require 2 stages at physical level to be +integrated with VFIO. However SMMUv3 does not implement such +"caching mode" and requires to use HW nested paging. + +As such SMMUv3 is the first IOMMU device to advertise this +attribute. + +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/arm/smmuv3.c | 12 ++++++++++++ + include/exec/memory.h | 3 ++- + 2 files changed, 14 insertions(+), 1 deletion(-) + +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index 0ef1ca376c..55eed5189e 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -1531,6 +1531,17 @@ static void smmuv3_notify_flag_changed(IOMMUMemoryRegion *iommu, + } + } + ++static int smmuv3_get_attr(IOMMUMemoryRegion *iommu, ++ enum IOMMUMemoryRegionAttr attr, ++ void *data) ++{ ++ if (attr == IOMMU_ATTR_VFIO_NESTED) { ++ *(bool *) data = true; ++ return 0; ++ } ++ return -EINVAL; ++} ++ + static void smmuv3_iommu_memory_region_class_init(ObjectClass *klass, + void *data) + { +@@ -1538,6 +1549,7 @@ static void smmuv3_iommu_memory_region_class_init(ObjectClass *klass, + + imrc->translate = smmuv3_translate; + imrc->notify_flag_changed = smmuv3_notify_flag_changed; ++ imrc->get_attr = smmuv3_get_attr; + } + + static const TypeInfo smmuv3_type_info = { +diff --git a/include/exec/memory.h b/include/exec/memory.h +index 3c5206dce6..74606e14aa 100644 +--- a/include/exec/memory.h ++++ b/include/exec/memory.h +@@ -240,7 +240,8 @@ struct MemoryRegionOps { + }; + + enum IOMMUMemoryRegionAttr { +- IOMMU_ATTR_SPAPR_TCE_FD ++ IOMMU_ATTR_SPAPR_TCE_FD, ++ IOMMU_ATTR_VFIO_NESTED, + }; + + /** +-- +2.27.0 + diff --git a/memory-Add-new-fields-in-IOTLBEntry.patch b/memory-Add-new-fields-in-IOTLBEntry.patch new file mode 100644 index 0000000000000000000000000000000000000000..d76ff3bcd7321b32c9a57b6862f68b19f1216daa --- /dev/null +++ b/memory-Add-new-fields-in-IOTLBEntry.patch @@ -0,0 +1,84 @@ +From 5a77056573d946eb9220b90dd1edce1f6f925c42 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Tue, 4 Sep 2018 08:43:05 -0400 +Subject: [PATCH] memory: Add new fields in IOTLBEntry + +The current IOTLBEntry becomes too simple to interact with +some physical IOMMUs. IOTLBs can be invalidated with different +granularities: domain, pasid, addr. Current IOTLB entry only offers +page selective invalidation. Let's add a granularity field +that conveys this information. + +TLB entries are usually tagged with some ids such as the asid +or pasid. When propagating an invalidation command from the +guest to the host, we need to pass those IDs. + +Also we add a leaf field which indicates, in case of invalidation +notification, whether only cache entries for the last level of +translation are required to be invalidated. + +A flag field is introduced to inform whether those fields are set. + +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + include/exec/memory.h | 36 +++++++++++++++++++++++++++++++++++- + 1 file changed, 35 insertions(+), 1 deletion(-) + +diff --git a/include/exec/memory.h b/include/exec/memory.h +index dca8184277..3c5206dce6 100644 +--- a/include/exec/memory.h ++++ b/include/exec/memory.h +@@ -66,14 +66,48 @@ typedef enum { + IOMMU_RW = 3, + } IOMMUAccessFlags; + ++/* Granularity of the cache invalidation */ ++typedef enum { ++ IOMMU_INV_GRAN_ADDR = 0, ++ IOMMU_INV_GRAN_PASID, ++ IOMMU_INV_GRAN_DOMAIN, ++} IOMMUInvGranularity; ++ + #define IOMMU_ACCESS_FLAG(r, w) (((r) ? IOMMU_RO : 0) | ((w) ? 
IOMMU_WO : 0)) + ++/** ++ * IOMMUTLBEntry - IOMMU TLB entry ++ * ++ * Structure used when performing a translation or when notifying MAP or ++ * UNMAP (invalidation) events ++ * ++ * @target_as: target address space ++ * @iova: IO virtual address (input) ++ * @translated_addr: translated address (output) ++ * @addr_mask: address mask (0xfff means 4K binding), must be multiple of 2 ++ * @perm: permission flag of the mapping (NONE encodes no mapping or ++ * invalidation notification) ++ * @granularity: granularity of the invalidation ++ * @flags: informs whether the following fields are set ++ * @arch_id: architecture specific ID tagging the TLB ++ * @pasid: PASID tagging the TLB ++ * @leaf: when @perm is NONE, indicates whether only caches for the last ++ * level of translation need to be invalidated. ++ */ + struct IOMMUTLBEntry { + AddressSpace *target_as; + hwaddr iova; + hwaddr translated_addr; +- hwaddr addr_mask; /* 0xfff = 4k translation */ ++ hwaddr addr_mask; + IOMMUAccessFlags perm; ++ IOMMUInvGranularity granularity; ++#define IOMMU_INV_FLAGS_PASID (1 << 0) ++#define IOMMU_INV_FLAGS_ARCHID (1 << 1) ++#define IOMMU_INV_FLAGS_LEAF (1 << 2) ++ uint32_t flags; ++ uint32_t arch_id; ++ uint32_t pasid; ++ bool leaf; + }; + + /* +-- +2.27.0 + diff --git a/memory-Introduce-IOMMU-Memory-Region-inject_faults-A.patch b/memory-Introduce-IOMMU-Memory-Region-inject_faults-A.patch new file mode 100644 index 0000000000000000000000000000000000000000..7cecd31a9765fb0926a4de993b38e0d5e68dfd6b --- /dev/null +++ b/memory-Introduce-IOMMU-Memory-Region-inject_faults-A.patch @@ -0,0 +1,89 @@ +From 497e055ed89e3cb5286dde2b05b7d7fd67e69331 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Thu, 13 Sep 2018 14:13:04 +0200 +Subject: [PATCH] memory: Introduce IOMMU Memory Region inject_faults API + +This new API allows to inject @count iommu_faults into +the IOMMU memory region. + +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + include/exec/memory.h | 25 +++++++++++++++++++++++++ + memory.c | 10 ++++++++++ + 2 files changed, 35 insertions(+) + +diff --git a/include/exec/memory.h b/include/exec/memory.h +index 716b07e115..ffd4282f14 100644 +--- a/include/exec/memory.h ++++ b/include/exec/memory.h +@@ -56,6 +56,8 @@ struct MemoryRegionMmio { + CPUWriteMemoryFunc *write[3]; + }; + ++struct iommu_fault; ++ + typedef struct IOMMUTLBEntry IOMMUTLBEntry; + + /* See address_space_translate: bit 0 is read, bit 1 is write. 
*/ +@@ -378,6 +380,19 @@ typedef struct IOMMUMemoryRegionClass { + * @iommu: the IOMMUMemoryRegion + */ + int (*num_indexes)(IOMMUMemoryRegion *iommu); ++ ++ /* ++ * Inject @count faults into the IOMMU memory region ++ * ++ * Optional method: if this method is not provided, then ++ * memory_region_injection_faults() will return -ENOENT ++ * ++ * @iommu: the IOMMU memory region to inject the faults in ++ * @count: number of faults to inject ++ * @buf: fault buffer ++ */ ++ int (*inject_faults)(IOMMUMemoryRegion *iommu, int count, ++ struct iommu_fault *buf); + } IOMMUMemoryRegionClass; + + typedef struct CoalescedMemoryRange CoalescedMemoryRange; +@@ -1182,6 +1197,16 @@ int memory_region_iommu_attrs_to_index(IOMMUMemoryRegion *iommu_mr, + */ + int memory_region_iommu_num_indexes(IOMMUMemoryRegion *iommu_mr); + ++/** ++ * memory_region_inject_faults : inject @count faults stored in @buf ++ * ++ * @iommu_mr: the IOMMU memory region ++ * @count: number of faults to be injected ++ * @buf: buffer containing the faults ++ */ ++int memory_region_inject_faults(IOMMUMemoryRegion *iommu_mr, int count, ++ struct iommu_fault *buf); ++ + /** + * memory_region_name: get a memory region's name + * +diff --git a/memory.c b/memory.c +index 708b3dff3d..623f89baa4 100644 +--- a/memory.c ++++ b/memory.c +@@ -2017,6 +2017,16 @@ int memory_region_iommu_num_indexes(IOMMUMemoryRegion *iommu_mr) + return imrc->num_indexes(iommu_mr); + } + ++int memory_region_inject_faults(IOMMUMemoryRegion *iommu_mr, int count, ++ struct iommu_fault *buf) ++{ ++ IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_GET_CLASS(iommu_mr); ++ if (!imrc->inject_faults) { ++ return -ENOENT; ++ } ++ return imrc->inject_faults(iommu_mr, count, buf); ++} ++ + void memory_region_set_log(MemoryRegion *mr, bool log, unsigned client) + { + uint8_t mask = 1 << client; +-- +2.27.0 + diff --git a/migration-ram-Optimize-ram_save_host_page.patch b/migration-ram-Optimize-ram_save_host_page.patch new file mode 100644 index 0000000000000000000000000000000000000000..c58a6dcb6a5f3dc85be056f1c6ffd3a0bf3ba972 --- /dev/null +++ b/migration-ram-Optimize-ram_save_host_page.patch @@ -0,0 +1,95 @@ +From ae1a8506aa45266f2bf77a8d428f5ccd970a9b13 Mon Sep 17 00:00:00 2001 +From: Kunkun Jiang +Date: Tue, 16 Mar 2021 20:57:16 +0800 +Subject: [PATCH] migration/ram: Optimize ram_save_host_page() + +Starting from pss->page, ram_save_host_page() will check every page +and send the dirty pages up to the end of the current host page or +the boundary of used_length of the block. If the host page size is +a huge page, the step "check" will take a lot of time. + +It will improve performance to use migration_bitmap_find_dirty(). + +Tested on Kunpeng 920; VM parameters: 1U 4G (page size 1G) +The time of ram_save_host_page() in the last round of ram saving: +before optimize: 9250us after optimize: 34us + +Signed-off-by: Keqian Zhu +Signed-off-by: Kunkun Jiang +Reviewed-by: Peter Xu +Message-Id: <20210316125716.1243-3-jiangkunkun@huawei.com> +Signed-off-by: Dr. 
David Alan Gilbert +--- + migration/ram.c | 43 +++++++++++++++++++++---------------------- + 1 file changed, 21 insertions(+), 22 deletions(-) + +diff --git a/migration/ram.c b/migration/ram.c +index 22063e00b4..1bd99ff9e5 100644 +--- a/migration/ram.c ++++ b/migration/ram.c +@@ -3052,6 +3052,8 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss, + int tmppages, pages = 0; + size_t pagesize_bits = + qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; ++ unsigned long hostpage_boundary = ++ QEMU_ALIGN_UP(pss->page + 1, pagesize_bits); + + if (ramblock_is_ignored(pss->block)) { + error_report("block %s should not be migrated !", pss->block->idstr); +@@ -3060,34 +3062,31 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss, + + do { + /* Check the pages is dirty and if it is send it */ +- if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) { +- pss->page++; +- continue; +- } +- +- tmppages = ram_save_target_page(rs, pss, last_stage); +- if (tmppages < 0) { +- return tmppages; +- } ++ if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) { ++ tmppages = ram_save_target_page(rs, pss, last_stage); ++ if (tmppages < 0) { ++ return tmppages; ++ } + +- pages += tmppages; +- if (pss->block->unsentmap) { +- clear_bit(pss->page, pss->block->unsentmap); +- } ++ pages += tmppages; ++ if (pss->block->unsentmap) { ++ clear_bit(pss->page, pss->block->unsentmap); ++ } + +- pss->page++; +- /* +- * Allow rate limiting to happen in the middle of huge pages if +- * something is sent in the current iteration. +- */ +- if (pagesize_bits > 1 && tmppages > 0) { +- migration_rate_limit(); ++ /* ++ * Allow rate limiting to happen in the middle of huge pages if ++ * something is sent in the current iteration. ++ */ ++ if (pagesize_bits > 1 && tmppages > 0) { ++ migration_rate_limit(); ++ } + } ++ pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page); + } while ((pss->page & (pagesize_bits - 1)) && + offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS)); + +- /* The offset we leave with is the last one we looked at */ +- pss->page--; ++ /* The offset we leave with is the min boundary of host page and block */ ++ pss->page = MIN(pss->page, hostpage_boundary) - 1; + return pages; + } + +-- +2.27.0 + diff --git a/migration-ram-Reduce-unnecessary-rate-limiting.patch b/migration-ram-Reduce-unnecessary-rate-limiting.patch new file mode 100644 index 0000000000000000000000000000000000000000..64374dd3e255224e650c8de3e93669db04a6c413 --- /dev/null +++ b/migration-ram-Reduce-unnecessary-rate-limiting.patch @@ -0,0 +1,42 @@ +From 338d691c985ad5b3624ef36e4beaac82982c8f0a Mon Sep 17 00:00:00 2001 +From: Kunkun Jiang +Date: Tue, 16 Mar 2021 20:57:15 +0800 +Subject: [PATCH] migration/ram: Reduce unnecessary rate limiting + +When the host page is a huge page and something is sent in the +current iteration, migration_rate_limit() should be executed. +If not, it can be omitted. + +Signed-off-by: Keqian Zhu +Signed-off-by: Kunkun Jiang +Reviewed-by: David Edmondson +Reviewed-by: Dr. David Alan Gilbert +Message-Id: <20210316125716.1243-2-jiangkunkun@huawei.com> +Signed-off-by: Dr. 
David Alan Gilbert +--- + migration/ram.c | 9 +++++++-- + 1 file changed, 7 insertions(+), 2 deletions(-) + +diff --git a/migration/ram.c b/migration/ram.c +index 2077ba5be4..22063e00b4 100644 +--- a/migration/ram.c ++++ b/migration/ram.c +@@ -3076,8 +3076,13 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss, + } + + pss->page++; +- /* Allow rate limiting to happen in the middle of huge pages */ +- migration_rate_limit(); ++ /* ++ * Allow rate limiting to happen in the middle of huge pages if ++ * something is sent in the current iteration. ++ */ ++ if (pagesize_bits > 1 && tmppages > 0) { ++ migration_rate_limit(); ++ } + } while ((pss->page & (pagesize_bits - 1)) && + offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS)); + +-- +2.27.0 + diff --git a/pci-Add-return_page_response-pci-ops.patch b/pci-Add-return_page_response-pci-ops.patch new file mode 100644 index 0000000000000000000000000000000000000000..133762085ba7a4a01fd5e909a3c17cd45a975978 --- /dev/null +++ b/pci-Add-return_page_response-pci-ops.patch @@ -0,0 +1,86 @@ +From e3b498a1afec138693251bf1bd1fa9b322a880fb Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Fri, 6 Nov 2020 14:34:35 +0100 +Subject: [PATCH] pci: Add return_page_response pci ops + +Add a new PCI operation that allows to return page responses +to registered VFIO devices + +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/pci/pci.c | 16 ++++++++++++++++ + include/hw/iommu/iommu.h | 8 ++++++++ + include/hw/pci/pci.h | 4 ++++ + 3 files changed, 28 insertions(+) + +diff --git a/hw/pci/pci.c b/hw/pci/pci.c +index f11ca7964e..a8b3d1c071 100644 +--- a/hw/pci/pci.c ++++ b/hw/pci/pci.c +@@ -2660,6 +2660,22 @@ int pci_device_set_pasid_table(PCIBus *bus, int32_t devfn, + return -ENOENT; + } + ++int pci_device_return_page_response(PCIBus *bus, int32_t devfn, ++ IOMMUPageResponse *resp) ++{ ++ PCIDevice *dev; ++ ++ if (!bus) { ++ return -EINVAL; ++ } ++ ++ dev = bus->devices[devfn]; ++ if (dev && dev->pasid_ops && dev->pasid_ops->return_page_response) { ++ return dev->pasid_ops->return_page_response(bus, devfn, resp); ++ } ++ return -ENOENT; ++} ++ + static void pci_dev_get_w64(PCIBus *b, PCIDevice *dev, void *opaque) + { + Range *range = opaque; +diff --git a/include/hw/iommu/iommu.h b/include/hw/iommu/iommu.h +index 12092bda7b..5890f095b1 100644 +--- a/include/hw/iommu/iommu.h ++++ b/include/hw/iommu/iommu.h +@@ -24,5 +24,13 @@ typedef struct IOMMUConfig { + }; + } IOMMUConfig; + ++typedef struct IOMMUPageResponse { ++ union { ++#ifdef __linux__ ++ struct iommu_page_response resp; ++#endif ++ }; ++} IOMMUPageResponse; ++ + + #endif /* QEMU_HW_IOMMU_IOMMU_H */ +diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h +index bb14ed61b0..5e7e0e4e6f 100644 +--- a/include/hw/pci/pci.h ++++ b/include/hw/pci/pci.h +@@ -266,6 +266,8 @@ typedef struct PCIReqIDCache PCIReqIDCache; + + struct PCIPASIDOps { + int (*set_pasid_table)(PCIBus *bus, int32_t devfn, IOMMUConfig *config); ++ int (*return_page_response)(PCIBus *bus, int32_t devfn, ++ IOMMUPageResponse *resp); + }; + typedef struct PCIPASIDOps PCIPASIDOps; + +@@ -495,6 +497,8 @@ void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, void *opaque); + void pci_setup_pasid_ops(PCIDevice *dev, PCIPASIDOps *ops); + bool pci_device_is_pasid_ops_set(PCIBus *bus, int32_t devfn); + int pci_device_set_pasid_table(PCIBus *bus, int32_t devfn, IOMMUConfig *config); ++int pci_device_return_page_response(PCIBus *bus, int32_t devfn, ++ IOMMUPageResponse *resp); + + static inline void + pci_set_byte(uint8_t 
*config, uint8_t val) +-- +2.27.0 + diff --git a/pci-introduce-PCIPASIDOps-to-PCIDevice.patch b/pci-introduce-PCIPASIDOps-to-PCIDevice.patch new file mode 100644 index 0000000000000000000000000000000000000000..e89cdc8df7130e348c916988d8b85f39ec4b6d19 --- /dev/null +++ b/pci-introduce-PCIPASIDOps-to-PCIDevice.patch @@ -0,0 +1,127 @@ +From 26adddfe4645b69c16ed8d6601f373d40bddd0e3 Mon Sep 17 00:00:00 2001 +From: Liu Yi L +Date: Fri, 5 Jul 2019 19:01:36 +0800 +Subject: [PATCH] pci: introduce PCIPASIDOps to PCIDevice + +This patch introduces PCIPASIDOps for IOMMU related operations. + +https://lists.gnu.org/archive/html/qemu-devel/2018-03/msg00078.html +https://lists.gnu.org/archive/html/qemu-devel/2018-03/msg00940.html + +So far, to setup virt-SVA for assigned SVA capable device, needs to +configure host translation structures for specific pasid. (e.g. bind +guest page table to host and enable nested translation in host). +Besides, vIOMMU emulator needs to forward guest's cache invalidation +to host since host nested translation is enabled. e.g. on VT-d, guest +owns 1st level translation table, thus cache invalidation for 1st +level should be propagated to host. + +This patch adds two functions: alloc_pasid and free_pasid to support +guest pasid allocation and free. The implementations of the callbacks +would be device passthru modules. Like vfio. + +Cc: Kevin Tian +Cc: Jacob Pan +Cc: Peter Xu +Cc: Eric Auger +Cc: Yi Sun +Cc: David Gibson +Signed-off-by: Liu Yi L +Signed-off-by: Yi Sun +Signed-off-by: Kunkun Jiang +--- + hw/pci/pci.c | 34 ++++++++++++++++++++++++++++++++++ + include/hw/pci/pci.h | 11 +++++++++++ + 2 files changed, 45 insertions(+) + +diff --git a/hw/pci/pci.c b/hw/pci/pci.c +index e74143ccc3..f11ca7964e 100644 +--- a/hw/pci/pci.c ++++ b/hw/pci/pci.c +@@ -2626,6 +2626,40 @@ void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, void *opaque) + bus->iommu_opaque = opaque; + } + ++void pci_setup_pasid_ops(PCIDevice *dev, PCIPASIDOps *ops) ++{ ++ assert(ops && !dev->pasid_ops); ++ dev->pasid_ops = ops; ++} ++ ++bool pci_device_is_pasid_ops_set(PCIBus *bus, int32_t devfn) ++{ ++ PCIDevice *dev; ++ ++ if (!bus) { ++ return false; ++ } ++ ++ dev = bus->devices[devfn]; ++ return !!(dev && dev->pasid_ops); ++} ++ ++int pci_device_set_pasid_table(PCIBus *bus, int32_t devfn, ++ IOMMUConfig *config) ++{ ++ PCIDevice *dev; ++ ++ if (!bus) { ++ return -EINVAL; ++ } ++ ++ dev = bus->devices[devfn]; ++ if (dev && dev->pasid_ops && dev->pasid_ops->set_pasid_table) { ++ return dev->pasid_ops->set_pasid_table(bus, devfn, config); ++ } ++ return -ENOENT; ++} ++ + static void pci_dev_get_w64(PCIBus *b, PCIDevice *dev, void *opaque) + { + Range *range = opaque; +diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h +index aaf1b9f70d..bb14ed61b0 100644 +--- a/include/hw/pci/pci.h ++++ b/include/hw/pci/pci.h +@@ -9,6 +9,7 @@ + #include "hw/isa/isa.h" + + #include "hw/pci/pcie.h" ++#include "hw/iommu/iommu.h" + + extern bool pci_available; + +@@ -263,6 +264,11 @@ struct PCIReqIDCache { + }; + typedef struct PCIReqIDCache PCIReqIDCache; + ++struct PCIPASIDOps { ++ int (*set_pasid_table)(PCIBus *bus, int32_t devfn, IOMMUConfig *config); ++}; ++typedef struct PCIPASIDOps PCIPASIDOps; ++ + struct PCIDevice { + DeviceState qdev; + +@@ -352,6 +358,7 @@ struct PCIDevice { + MSIVectorUseNotifier msix_vector_use_notifier; + MSIVectorReleaseNotifier msix_vector_release_notifier; + MSIVectorPollNotifier msix_vector_poll_notifier; ++ PCIPASIDOps *pasid_ops; + }; + + void pci_register_bar(PCIDevice *pci_dev, int 
region_num, +@@ -485,6 +492,10 @@ typedef AddressSpace *(*PCIIOMMUFunc)(PCIBus *, void *, int); + AddressSpace *pci_device_iommu_address_space(PCIDevice *dev); + void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, void *opaque); + ++void pci_setup_pasid_ops(PCIDevice *dev, PCIPASIDOps *ops); ++bool pci_device_is_pasid_ops_set(PCIBus *bus, int32_t devfn); ++int pci_device_set_pasid_table(PCIBus *bus, int32_t devfn, IOMMUConfig *config); ++ + static inline void + pci_set_byte(uint8_t *config, uint8_t val) + { +-- +2.27.0 + diff --git a/qdev-monitors-Fix-reundant-error_setg-of-qdev_add_de.patch b/qdev-monitors-Fix-reundant-error_setg-of-qdev_add_de.patch new file mode 100644 index 0000000000000000000000000000000000000000..85467e8412ec264d6034f59ae3704a3042d1e5e0 --- /dev/null +++ b/qdev-monitors-Fix-reundant-error_setg-of-qdev_add_de.patch @@ -0,0 +1,30 @@ +From 4f1396f9e173a24f78204b8849c209100499d639 Mon Sep 17 00:00:00 2001 +From: Kunkun Jiang +Date: Thu, 29 Jul 2021 15:24:48 +0800 +Subject: [PATCH] qdev/monitors: Fix reundant error_setg of qdev_add_device + +There is an extra log "error_setg" in qdev_add_device(). When +hot-plug a device, if the corresponding bus doesn't exist, it +will trigger an asseration "assert(*errp == NULL)". + +Fixes: 515a7970490 (log: Add some logs on VM runtime path) +Signed-off-by: Kunkun Jiang +--- + qdev-monitor.c | 1 - + 1 file changed, 1 deletion(-) + +diff --git a/qdev-monitor.c b/qdev-monitor.c +index c6c1d3f06a..ab2bdef105 100644 +--- a/qdev-monitor.c ++++ b/qdev-monitor.c +@@ -587,7 +587,6 @@ DeviceState *qdev_device_add(QemuOpts *opts, Error **errp) + if (path != NULL) { + bus = qbus_find(path, errp); + if (!bus) { +- error_setg(errp, "can not find bus for %s", driver); + return NULL; + } + if (!object_dynamic_cast(OBJECT(bus), dc->bus_type)) { +-- +2.27.0 + diff --git a/qemu.spec b/qemu.spec index 62c6f6cc22e5def7e674f1fa31baa900f0827eca..981d9e122bb15216d1d717f42c62cd6ace4810fd 100644 --- a/qemu.spec +++ b/qemu.spec @@ -1,6 +1,6 @@ Name: qemu Version: 4.1.0 -Release: 75 +Release: 76 Epoch: 2 Summary: QEMU is a generic and open source machine emulator and virtualizer License: GPLv2 and BSD and MIT and CC-BY-SA-4.0 @@ -509,6 +509,52 @@ Patch0496: Fix-use-after-free-in-vfio_migration_probe.patch Patch0497: vfio-Make-migration-support-experimental.patch Patch0498: vfio-Change-default-dirty-pages-tracking-behavior-du.patch Patch0499: vfio-Fix-vfio_listener_log_sync-function-name-typo.patch +Patch0500: vfio-Support-host-translation-granule-size.patch +Patch0501: vfio-migrate-Move-switch-of-dirty-tracking-into-vfio.patch +Patch0502: vfio-Fix-unregister-SaveVMHandler-in-vfio_migration_.patch +Patch0503: migration-ram-Reduce-unnecessary-rate-limiting.patch +Patch0504: migration-ram-Optimize-ram_save_host_page.patch +Patch0505: qdev-monitors-Fix-reundant-error_setg-of-qdev_add_de.patch +Patch0506: linux-headers-update-against-5.10-and-manual-clear-v.patch +Patch0507: vfio-Maintain-DMA-mapping-range-for-the-container.patch +Patch0508: vfio-migration-Add-support-for-manual-clear-vfio-dir.patch +Patch0509: hw-arm-smmuv3-Support-16K-translation-granule.patch +Patch0510: hw-arm-smmuv3-Set-the-restoration-priority-of-the-vS.patch +Patch0511: hw-vfio-common-trace-vfio_connect_container-operatio.patch +Patch0512: update-linux-headers-Import-iommu.h.patch +Patch0513: vfio.h-and-iommu.h-header-update-against-5.10.patch +Patch0514: memory-Add-new-fields-in-IOTLBEntry.patch +Patch0515: hw-arm-smmuv3-Improve-stage1-ASID-invalidation.patch +Patch0516: 
hw-arm-smmu-common-Allow-domain-invalidation-for-NH_.patch +Patch0517: memory-Add-IOMMU_ATTR_VFIO_NESTED-IOMMU-memory-regio.patch +Patch0518: memory-Add-IOMMU_ATTR_MSI_TRANSLATE-IOMMU-memory-reg.patch +Patch0519: memory-Introduce-IOMMU-Memory-Region-inject_faults-A.patch +Patch0520: iommu-Introduce-generic-header.patch +Patch0521: pci-introduce-PCIPASIDOps-to-PCIDevice.patch +Patch0522: vfio-Force-nested-if-iommu-requires-it.patch +Patch0523: vfio-Introduce-hostwin_from_range-helper.patch +Patch0524: vfio-Introduce-helpers-to-DMA-map-unmap-a-RAM-sectio.patch +Patch0525: vfio-Set-up-nested-stage-mappings.patch +Patch0526: vfio-Pass-stage-1-MSI-bindings-to-the-host.patch +Patch0527: vfio-Helper-to-get-IRQ-info-including-capabilities.patch +Patch0528: vfio-pci-Register-handler-for-iommu-fault.patch +Patch0529: vfio-pci-Set-up-the-DMA-FAULT-region.patch +Patch0530: vfio-pci-Implement-the-DMA-fault-handler.patch +Patch0531: hw-arm-smmuv3-Advertise-MSI_TRANSLATE-attribute.patch +Patch0532: hw-arm-smmuv3-Store-the-PASID-table-GPA-in-the-trans.patch +Patch0533: hw-arm-smmuv3-Fill-the-IOTLBEntry-arch_id-on-NH_VA-i.patch +Patch0534: hw-arm-smmuv3-Fill-the-IOTLBEntry-leaf-field-on-NH_V.patch +Patch0535: hw-arm-smmuv3-Pass-stage-1-configurations-to-the-hos.patch +Patch0536: hw-arm-smmuv3-Implement-fault-injection.patch +Patch0537: hw-arm-smmuv3-Allow-MAP-notifiers.patch +Patch0538: pci-Add-return_page_response-pci-ops.patch +Patch0539: vfio-pci-Implement-return_page_response-page-respons.patch +Patch0540: vfio-common-Avoid-unmap-ram-section-at-vfio_listener.patch +Patch0541: vfio-Introduce-helpers-to-mark-dirty-pages-of-a-RAM-.patch +Patch0542: vfio-Add-vfio_prereg_listener_log_sync-in-nested-sta.patch +Patch0543: vfio-Add-vfio_prereg_listener_log_clear-to-re-enable.patch +Patch0544: vfio-Add-vfio_prereg_listener_global_log_start-stop-.patch +Patch0545: hw-arm-smmuv3-Post-load-stage-1-configurations-to-th.patch BuildRequires: flex BuildRequires: gcc @@ -903,6 +949,54 @@ getent passwd qemu >/dev/null || \ %endif %changelog +* Wed Aug 04 2021 Chen Qun +- vfio: Support host translation granule size +- vfio/migrate: Move switch of dirty tracking into vfio_memory_listener +- vfio: Fix unregister SaveVMHandler in vfio_migration_finalize +- migration/ram: Reduce unnecessary rate limiting +- migration/ram: Optimize ram_save_host_page() +- qdev/monitors: Fix reundant error_setg of qdev_add_device +- linux-headers: update against 5.10 and manual clear vfio dirty log series +- vfio: Maintain DMA mapping range for the container +- vfio/migration: Add support for manual clear vfio dirty log +- hw/arm/smmuv3: Support 16K translation granule +- hw/arm/smmuv3: Set the restoration priority of the vSMMUv3 explicitly +- hw/vfio/common: trace vfio_connect_container operations +- update-linux-headers: Import iommu.h +- vfio.h and iommu.h header update against 5.10 +- memory: Add new fields in IOTLBEntry +- hw/arm/smmuv3: Improve stage1 ASID invalidation +- hw/arm/smmu-common: Allow domain invalidation for NH_ALL/NSNH_ALL +- memory: Add IOMMU_ATTR_VFIO_NESTED IOMMU memory region attribute +- memory: Add IOMMU_ATTR_MSI_TRANSLATE IOMMU memory region attribute +- memory: Introduce IOMMU Memory Region inject_faults API +- iommu: Introduce generic header +- pci: introduce PCIPASIDOps to PCIDevice +- vfio: Force nested if iommu requires it +- vfio: Introduce hostwin_from_range helper +- vfio: Introduce helpers to DMA map/unmap a RAM section +- vfio: Set up nested stage mappings +- vfio: Pass stage 1 MSI bindings to the host +- 
vfio: Helper to get IRQ info including capabilities +- vfio/pci: Register handler for iommu fault +- vfio/pci: Set up the DMA FAULT region +- vfio/pci: Implement the DMA fault handler +- hw/arm/smmuv3: Advertise MSI_TRANSLATE attribute +- hw/arm/smmuv3: Store the PASID table GPA in the translation config +- hw/arm/smmuv3: Fill the IOTLBEntry arch_id on NH_VA invalidation +- hw/arm/smmuv3: Fill the IOTLBEntry leaf field on NH_VA invalidation +- hw/arm/smmuv3: Pass stage 1 configurations to the host +- hw/arm/smmuv3: Implement fault injection +- hw/arm/smmuv3: Allow MAP notifiers +- pci: Add return_page_response pci ops +- vfio/pci: Implement return_page_response page response callback +- vfio/common: Avoid unmap ram section at vfio_listener_region_del() in nested mode +- vfio: Introduce helpers to mark dirty pages of a RAM section +- vfio: Add vfio_prereg_listener_log_sync in nested stage +- vfio: Add vfio_prereg_listener_log_clear to re-enable mark dirty pages +- vfio: Add vfio_prereg_listener_global_log_start/stop in nested stage +- hw/arm/smmuv3: Post-load stage 1 configurations to the host + * Tue Aug 03 2021 Chen Qun - kvm: split too big memory section on several memslots - kvm: Reallocate dirty_bmap when we change a slot diff --git a/update-linux-headers-Import-iommu.h.patch b/update-linux-headers-Import-iommu.h.patch new file mode 100644 index 0000000000000000000000000000000000000000..eea744e5063aeba90d9e16967f88b9d902de93f1 --- /dev/null +++ b/update-linux-headers-Import-iommu.h.patch @@ -0,0 +1,29 @@ +From 78c269f4ed09a3272d99a65d9c86977a01ef99c8 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Thu, 9 May 2019 10:23:42 -0400 +Subject: [PATCH] update-linux-headers: Import iommu.h + +Update the script to import the new iommu.h uapi header. + +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + scripts/update-linux-headers.sh | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/scripts/update-linux-headers.sh b/scripts/update-linux-headers.sh +index f76d77363b..dfdfdfddcf 100755 +--- a/scripts/update-linux-headers.sh ++++ b/scripts/update-linux-headers.sh +@@ -141,7 +141,7 @@ done + + rm -rf "$output/linux-headers/linux" + mkdir -p "$output/linux-headers/linux" +-for header in kvm.h vfio.h vfio_ccw.h vhost.h \ ++for header in kvm.h vfio.h vfio_ccw.h vhost.h iommu.h \ + psci.h psp-sev.h userfaultfd.h mman.h; do + cp "$tmpdir/include/linux/$header" "$output/linux-headers/linux" + done +-- +2.27.0 + diff --git a/vfio-Add-vfio_prereg_listener_global_log_start-stop-.patch b/vfio-Add-vfio_prereg_listener_global_log_start-stop-.patch new file mode 100644 index 0000000000000000000000000000000000000000..289638a9e3c453dfe9fa9e863209ddbd5ea0489f --- /dev/null +++ b/vfio-Add-vfio_prereg_listener_global_log_start-stop-.patch @@ -0,0 +1,71 @@ +From 6aa770f4b83ca068d0c8f3102edda32666a8404d Mon Sep 17 00:00:00 2001 +From: Kunkun Jiang +Date: Tue, 11 May 2021 10:08:15 +0800 +Subject: [PATCH] vfio: Add vfio_prereg_listener_global_log_start/stop in + nested stage + +In nested mode, we set up the stage 2 and stage 1 separately. In my +opinion, vfio_memory_prereg_listener is used for stage 2 and +vfio_memory_listener is used for stage 1. So it feels weird to call +the global_log_start/stop interface in vfio_memory_listener to switch +dirty tracking, although this won't cause any errors. Add +global_log_start/stop interface in vfio_memory_prereg_listener +can separate stage 2 from stage 1. 
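+
+For illustration only, a minimal sketch (hypothetical consumer, not
+part of this patch) of the MemoryListener contract relied on here:
+core memory code invokes every registered listener's
+log_global_start/stop when global dirty logging toggles, so hooks on
+the prereg (stage 2) listener are enough to switch tracking:
+
+    static void example_log_global_start(MemoryListener *listener)
+    {
+        /* runs once when dirty logging is enabled globally */
+    }
+
+    static MemoryListener example_listener = {
+        .log_global_start = example_log_global_start,
+    };
+
+    static void example_setup(void)
+    {
+        /* headers and error handling omitted in this sketch */
+        memory_listener_register(&example_listener, &address_space_memory);
+    }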
+ +Signed-off-by: Kunkun Jiang +--- + hw/vfio/common.c | 24 ++++++++++++++++++++++++ + 1 file changed, 24 insertions(+) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index b5f9ba816e..fb7ca63748 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -1239,6 +1239,17 @@ static void vfio_listener_log_global_start(MemoryListener *listener) + { + VFIOContainer *container = container_of(listener, VFIOContainer, listener); + ++ /* For nested mode, vfio_prereg_listener is used to start dirty tracking */ ++ if (container->iommu_type != VFIO_TYPE1_NESTING_IOMMU) { ++ vfio_set_dirty_page_tracking(container, true); ++ } ++} ++ ++static void vfio_prereg_listener_log_global_start(MemoryListener *listener) ++{ ++ VFIOContainer *container = ++ container_of(listener, VFIOContainer, prereg_listener); ++ + vfio_set_dirty_page_tracking(container, true); + } + +@@ -1246,6 +1257,17 @@ static void vfio_listener_log_global_stop(MemoryListener *listener) + { + VFIOContainer *container = container_of(listener, VFIOContainer, listener); + ++ /* For nested mode, vfio_prereg_listener is used to stop dirty tracking */ ++ if (container->iommu_type != VFIO_TYPE1_NESTING_IOMMU) { ++ vfio_set_dirty_page_tracking(container, false); ++ } ++} ++ ++static void vfio_prereg_listener_log_global_stop(MemoryListener *listener) ++{ ++ VFIOContainer *container = ++ container_of(listener, VFIOContainer, prereg_listener); ++ + vfio_set_dirty_page_tracking(container, false); + } + +@@ -1614,6 +1636,8 @@ static const MemoryListener vfio_memory_listener = { + static MemoryListener vfio_memory_prereg_listener = { + .region_add = vfio_prereg_listener_region_add, + .region_del = vfio_prereg_listener_region_del, ++ .log_global_start = vfio_prereg_listener_log_global_start, ++ .log_global_stop = vfio_prereg_listener_log_global_stop, + .log_sync = vfio_prereg_listener_log_sync, + .log_clear = vfio_prereg_listener_log_clear, + }; +-- +2.27.0 + diff --git a/vfio-Add-vfio_prereg_listener_log_clear-to-re-enable.patch b/vfio-Add-vfio_prereg_listener_log_clear-to-re-enable.patch new file mode 100644 index 0000000000000000000000000000000000000000..e4da89bd477558ea9e58538c75b0c198d27e3d21 --- /dev/null +++ b/vfio-Add-vfio_prereg_listener_log_clear-to-re-enable.patch @@ -0,0 +1,84 @@ +From f959faa36fc100894a44f2e6cd7e02a183ba142a Mon Sep 17 00:00:00 2001 +From: Kunkun Jiang +Date: Sat, 31 Jul 2021 09:40:24 +0800 +Subject: [PATCH] vfio: Add vfio_prereg_listener_log_clear to re-enable mark + dirty pages + +When tracking dirty pages, we just need to pay attention to stage 2 +mappings. Legacy vfio_listener_log_clear cannot be used in nested +stage. This patch adds vfio_prereg_listener_log_clear to re-enable +dirty pages in nested mode. 
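+
+The filtering rule, in sketch form (hypothetical helper name; the
+real checks live in vfio_clear_dirty_bitmap below):
+
+    static bool example_section_has_dirty_info(VFIOContainer *container,
+                                               MemoryRegionSection *section)
+    {
+        /* nested mode: the giova (stage 1) space has no dirty bits */
+        if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU &&
+            memory_region_is_iommu(section->mr)) {
+            return false;
+        }
+        return memory_region_is_ram(section->mr);
+    }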
+ +Signed-off-by: Kunkun Jiang +--- + hw/vfio/common.c | 40 +++++++++++++++++++++++++++++++++++++++- + 1 file changed, 39 insertions(+), 1 deletion(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index 6b00bd4c2f..b5f9ba816e 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -1550,6 +1550,43 @@ static int vfio_physical_log_clear(VFIOContainer *container, + return ret; + } + ++static void vfio_prereg_listener_log_clear(MemoryListener *listener, ++ MemoryRegionSection *section) ++{ ++ VFIOContainer *container = ++ container_of(listener, VFIOContainer, prereg_listener); ++ ++ if (!memory_region_is_ram(section->mr)) { ++ return; ++ } ++ ++ vfio_physical_log_clear(container, section); ++} ++ ++static int vfio_clear_dirty_bitmap(VFIOContainer *container, ++ MemoryRegionSection *section) ++{ ++ if (memory_region_is_iommu(section->mr)) { ++ /* ++ * In nested mode, stage 2 (gpa->hpa) and stage 1 (giova->gpa) are ++ * set up separately. It is inappropriate to pass 'giova' to kernel ++ * to get dirty pages. We only need to focus on stage 2 mapping when ++ * marking dirty pages. ++ */ ++ if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) { ++ return 0; ++ } ++ ++ /* ++ * TODO: x86. With the log_clear() interface added, x86 may inplement ++ * its own method. ++ */ ++ } ++ ++ /* Here we assume that memory_region_is_ram(section->mr) == true */ ++ return vfio_physical_log_clear(container, section); ++} ++ + static void vfio_listener_log_clear(MemoryListener *listener, + MemoryRegionSection *section) + { +@@ -1561,7 +1598,7 @@ static void vfio_listener_log_clear(MemoryListener *listener, + } + + if (vfio_devices_all_dirty_tracking(container)) { +- vfio_physical_log_clear(container, section); ++ vfio_clear_dirty_bitmap(container, section); + } + } + +@@ -1578,6 +1615,7 @@ static MemoryListener vfio_memory_prereg_listener = { + .region_add = vfio_prereg_listener_region_add, + .region_del = vfio_prereg_listener_region_del, + .log_sync = vfio_prereg_listener_log_sync, ++ .log_clear = vfio_prereg_listener_log_clear, + }; + + static void vfio_listener_release(VFIOContainer *container) +-- +2.27.0 + diff --git a/vfio-Add-vfio_prereg_listener_log_sync-in-nested-sta.patch b/vfio-Add-vfio_prereg_listener_log_sync-in-nested-sta.patch new file mode 100644 index 0000000000000000000000000000000000000000..77a0c8a14d29280b369466b1fa9b55dc62c26228 --- /dev/null +++ b/vfio-Add-vfio_prereg_listener_log_sync-in-nested-sta.patch @@ -0,0 +1,74 @@ +From 4c5350044ac2f61ab8088278b59eb6388ca49ff1 Mon Sep 17 00:00:00 2001 +From: Kunkun Jiang +Date: Tue, 11 May 2021 10:08:14 +0800 +Subject: [PATCH] vfio: Add vfio_prereg_listener_log_sync in nested stage + +In nested mode, we set up the stage 2 (gpa->hpa)and stage 1 +(giova->gpa) separately by vfio_prereg_listener_region_add() +and vfio_listener_region_add(). So when marking dirty pages +we just need to pay attention to stage 2 mappings. + +Legacy vfio_listener_log_sync cannot be used in nested stage. +This patch adds vfio_prereg_listener_log_sync to mark dirty +pages in nested mode. 
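+
+Written out for reviewers (informal notation, summarizing the commit
+message above):
+
+    stage 1 (guest-owned, vfio_listener):        giova -> gpa
+    stage 2 (VFIO-owned,  vfio_prereg_listener): gpa   -> hpa
+
+Only the stage 2 (gpa -> hpa) mappings carry dirty page information,
+hence the new log_sync hook on the prereg listener.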
+ +Signed-off-by: Kunkun Jiang +--- + hw/vfio/common.c | 27 +++++++++++++++++++++++++++ + 1 file changed, 27 insertions(+) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index 5176fd3a3d..6b00bd4c2f 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -1317,6 +1317,22 @@ static int vfio_dma_sync_ram_section_dirty_bitmap(VFIOContainer *container, + int128_get64(section->size), ram_addr); + } + ++static void vfio_prereg_listener_log_sync(MemoryListener *listener, ++ MemoryRegionSection *section) ++{ ++ VFIOContainer *container = ++ container_of(listener, VFIOContainer, prereg_listener); ++ ++ if (!memory_region_is_ram(section->mr) || ++ !container->dirty_pages_supported) { ++ return; ++ } ++ ++ if (vfio_devices_all_dirty_tracking(container)) { ++ vfio_dma_sync_ram_section_dirty_bitmap(container, section); ++ } ++} ++ + typedef struct { + IOMMUNotifier n; + VFIOGuestIOMMU *giommu; +@@ -1361,6 +1377,16 @@ static int vfio_sync_dirty_bitmap(VFIOContainer *container, + if (memory_region_is_iommu(section->mr)) { + VFIOGuestIOMMU *giommu; + ++ /* ++ * In nested mode, stage 2 (gpa->hpa) and stage 1 (giova->gpa) are ++ * set up separately. It is inappropriate to pass 'giova' to kernel ++ * to get dirty pages. We only need to focus on stage 2 mapping when ++ * marking dirty pages. ++ */ ++ if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) { ++ return 0; ++ } ++ + QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) { + if (MEMORY_REGION(giommu->iommu) == section->mr && + giommu->n.start == section->offset_within_region) { +@@ -1551,6 +1577,7 @@ static const MemoryListener vfio_memory_listener = { + static MemoryListener vfio_memory_prereg_listener = { + .region_add = vfio_prereg_listener_region_add, + .region_del = vfio_prereg_listener_region_del, ++ .log_sync = vfio_prereg_listener_log_sync, + }; + + static void vfio_listener_release(VFIOContainer *container) +-- +2.27.0 + diff --git a/vfio-Fix-unregister-SaveVMHandler-in-vfio_migration_.patch b/vfio-Fix-unregister-SaveVMHandler-in-vfio_migration_.patch new file mode 100644 index 0000000000000000000000000000000000000000..47d59923070d7827152f59a60304ef708bcc1c62 --- /dev/null +++ b/vfio-Fix-unregister-SaveVMHandler-in-vfio_migration_.patch @@ -0,0 +1,36 @@ +From 8dc6e7ccc5712aee457ffb1f6cf1bf3f80e778d5 Mon Sep 17 00:00:00 2001 +From: Kunkun Jiang +Date: Thu, 27 May 2021 20:31:01 +0800 +Subject: [PATCH] vfio: Fix unregister SaveVMHandler in vfio_migration_finalize + +In the vfio_migration_init(), the SaveVMHandler is registered for +VFIO device. But it lacks the operation of 'unregister'. It will +lead to 'Segmentation fault (core dumped)' in +qemu_savevm_state_setup(), if performing live migration after a +VFIO device is hot deleted. 
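+
+The intended pairing, sketched with elided arguments (the
+registration side already exists in vfio_migration_init; this patch
+only adds the second call):
+
+    /* device added   */ register_savevm_live(..., "vfio", ..., vbasedev);
+    /* device removed */ unregister_savevm(vbasedev->dev, "vfio", vbasedev);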
+ +Fixes: cd5b58f2ba (vfio: Register SaveVMHandlers for VFIO device) +Reported-by: Qixin Gan +Signed-off-by: Kunkun Jiang +Message-Id: <20210527123101.289-1-jiangkunkun@huawei.com> +Reviewed by: Kirti Wankhede +Signed-off-by: Alex Williamson +--- + hw/vfio/migration.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c +index f1f006d584..d9e0e12824 100644 +--- a/hw/vfio/migration.c ++++ b/hw/vfio/migration.c +@@ -893,6 +893,7 @@ void vfio_migration_finalize(VFIODevice *vbasedev) + + remove_migration_state_change_notifier(&migration->migration_state); + qemu_del_vm_change_state_handler(migration->vm_state); ++ unregister_savevm(vbasedev->dev, "vfio", vbasedev); + vfio_migration_exit(vbasedev); + } + +-- +2.27.0 + diff --git a/vfio-Force-nested-if-iommu-requires-it.patch b/vfio-Force-nested-if-iommu-requires-it.patch new file mode 100644 index 0000000000000000000000000000000000000000..6a6b9da3f1ebd6c44f6a298a9c456351a8a93fcd --- /dev/null +++ b/vfio-Force-nested-if-iommu-requires-it.patch @@ -0,0 +1,100 @@ +From e4122a95a30cd58e1cd6e1742928e68aa94fd7ee Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Tue, 28 Aug 2018 16:16:20 +0200 +Subject: [PATCH] vfio: Force nested if iommu requires it + +In case we detect the address space is translated by +a virtual IOMMU which requires HW nested paging to +integrate with VFIO, let's set up the container with +the VFIO_TYPE1_NESTING_IOMMU iommu_type. + +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/vfio/common.c | 36 ++++++++++++++++++++++++++++-------- + 1 file changed, 28 insertions(+), 8 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index fefa2ccfdf..c78b58d365 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -1683,27 +1683,38 @@ static void vfio_put_address_space(VFIOAddressSpace *space) + * vfio_get_iommu_type - selects the richest iommu_type (v2 first) + */ + static int vfio_get_iommu_type(VFIOContainer *container, ++ bool want_nested, + Error **errp) + { +- int iommu_types[] = { VFIO_TYPE1v2_IOMMU, VFIO_TYPE1_IOMMU, ++ int iommu_types[] = { VFIO_TYPE1_NESTING_IOMMU, ++ VFIO_TYPE1v2_IOMMU, VFIO_TYPE1_IOMMU, + VFIO_SPAPR_TCE_v2_IOMMU, VFIO_SPAPR_TCE_IOMMU }; +- int i; ++ int i, ret = -EINVAL; + + for (i = 0; i < ARRAY_SIZE(iommu_types); i++) { + if (ioctl(container->fd, VFIO_CHECK_EXTENSION, iommu_types[i])) { +- return iommu_types[i]; ++ if (iommu_types[i] == VFIO_TYPE1_NESTING_IOMMU && !want_nested) { ++ continue; ++ } ++ ret = iommu_types[i]; ++ break; + } + } +- error_setg(errp, "No available IOMMU models"); +- return -EINVAL; ++ if (ret < 0) { ++ error_setg(errp, "No available IOMMU models"); ++ } else if (want_nested && ret != VFIO_TYPE1_NESTING_IOMMU) { ++ error_setg(errp, "Nested mode requested but not supported"); ++ ret = -EINVAL; ++ } ++ return ret; + } + + static int vfio_init_container(VFIOContainer *container, int group_fd, +- Error **errp) ++ bool want_nested, Error **errp) + { + int iommu_type, dirty_log_manual_clear, ret; + +- iommu_type = vfio_get_iommu_type(container, errp); ++ iommu_type = vfio_get_iommu_type(container, want_nested, errp); + if (iommu_type < 0) { + return iommu_type; + } +@@ -1815,6 +1826,14 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, + VFIOContainer *container; + int ret, fd; + VFIOAddressSpace *space; ++ IOMMUMemoryRegion *iommu_mr; ++ bool nested = false; ++ ++ if (memory_region_is_iommu(as->root)) { ++ iommu_mr = IOMMU_MEMORY_REGION(as->root); ++ memory_region_iommu_get_attr(iommu_mr, 
IOMMU_ATTR_VFIO_NESTED, ++ (void *)&nested); ++ } + + space = vfio_get_address_space(as); + +@@ -1879,13 +1898,14 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, + QLIST_INIT(&container->hostwin_list); + QLIST_INIT(&container->dma_list); + +- ret = vfio_init_container(container, group->fd, errp); ++ ret = vfio_init_container(container, group->fd, nested, errp); + if (ret) { + goto free_container_exit; + } + trace_vfio_connect_new_container(group->groupid, container->fd); + + switch (container->iommu_type) { ++ case VFIO_TYPE1_NESTING_IOMMU: + case VFIO_TYPE1v2_IOMMU: + case VFIO_TYPE1_IOMMU: + { +-- +2.27.0 + diff --git a/vfio-Helper-to-get-IRQ-info-including-capabilities.patch b/vfio-Helper-to-get-IRQ-info-including-capabilities.patch new file mode 100644 index 0000000000000000000000000000000000000000..16f16d32faa4d793056700a52ef33b23716801c1 --- /dev/null +++ b/vfio-Helper-to-get-IRQ-info-including-capabilities.patch @@ -0,0 +1,178 @@ +From 43fd039dcfee221eb3f86a2cf7deb287cc04e5ad Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Thu, 20 Jun 2019 16:39:57 +0200 +Subject: [PATCH] vfio: Helper to get IRQ info including capabilities + +As done for vfio regions, add helpers to retrieve irq info +including their optional capabilities. + +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/vfio/common.c | 97 +++++++++++++++++++++++++++++++++++ + hw/vfio/trace-events | 1 + + include/hw/vfio/vfio-common.h | 7 +++ + 3 files changed, 105 insertions(+) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index db9af3b0e5..98dc9e6f84 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -1565,6 +1565,25 @@ vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id) + return NULL; + } + ++struct vfio_info_cap_header * ++vfio_get_irq_info_cap(struct vfio_irq_info *info, uint16_t id) ++{ ++ struct vfio_info_cap_header *hdr; ++ void *ptr = info; ++ ++ if (!(info->flags & VFIO_IRQ_INFO_FLAG_CAPS)) { ++ return NULL; ++ } ++ ++ for (hdr = ptr + info->cap_offset; hdr != ptr; hdr = ptr + hdr->next) { ++ if (hdr->id == id) { ++ return hdr; ++ } ++ } ++ ++ return NULL; ++} ++ + static int vfio_setup_region_sparse_mmaps(VFIORegion *region, + struct vfio_region_info *info) + { +@@ -2499,6 +2518,33 @@ retry: + return 0; + } + ++int vfio_get_irq_info(VFIODevice *vbasedev, int index, ++ struct vfio_irq_info **info) ++{ ++ size_t argsz = sizeof(struct vfio_irq_info); ++ ++ *info = g_malloc0(argsz); ++ ++ (*info)->index = index; ++retry: ++ (*info)->argsz = argsz; ++ ++ if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_IRQ_INFO, *info)) { ++ g_free(*info); ++ *info = NULL; ++ return -errno; ++ } ++ ++ if ((*info)->argsz > argsz) { ++ argsz = (*info)->argsz; ++ *info = g_realloc(*info, argsz); ++ ++ goto retry; ++ } ++ ++ return 0; ++} ++ + int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type, + uint32_t subtype, struct vfio_region_info **info) + { +@@ -2534,6 +2580,42 @@ int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type, + return -ENODEV; + } + ++int vfio_get_dev_irq_info(VFIODevice *vbasedev, uint32_t type, ++ uint32_t subtype, struct vfio_irq_info **info) ++{ ++ int i; ++ ++ for (i = 0; i < vbasedev->num_irqs; i++) { ++ struct vfio_info_cap_header *hdr; ++ struct vfio_irq_info_cap_type *cap_type; ++ ++ if (vfio_get_irq_info(vbasedev, i, info)) { ++ continue; ++ } ++ ++ hdr = vfio_get_irq_info_cap(*info, VFIO_IRQ_INFO_CAP_TYPE); ++ if (!hdr) { ++ g_free(*info); ++ continue; ++ } ++ ++ cap_type = container_of(hdr, struct vfio_irq_info_cap_type, 
header); ++ ++ trace_vfio_get_dev_irq(vbasedev->name, i, ++ cap_type->type, cap_type->subtype); ++ ++ if (cap_type->type == type && cap_type->subtype == subtype) { ++ return 0; ++ } ++ ++ g_free(*info); ++ } ++ ++ *info = NULL; ++ return -ENODEV; ++} ++ ++ + bool vfio_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type) + { + struct vfio_region_info *info = NULL; +@@ -2549,6 +2631,21 @@ bool vfio_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type) + return ret; + } + ++bool vfio_has_irq_cap(VFIODevice *vbasedev, int region, uint16_t cap_type) ++{ ++ struct vfio_region_info *info = NULL; ++ bool ret = false; ++ ++ if (!vfio_get_region_info(vbasedev, region, &info)) { ++ if (vfio_get_region_info_cap(info, cap_type)) { ++ ret = true; ++ } ++ g_free(info); ++ } ++ ++ return ret; ++} ++ + /* + * Interfaces for IBM EEH (Enhanced Error Handling) + */ +diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events +index 247b72c1eb..54e10046f5 100644 +--- a/hw/vfio/trace-events ++++ b/hw/vfio/trace-events +@@ -117,6 +117,7 @@ vfio_region_unmap(const char *name, unsigned long offset, unsigned long end) "Re + vfio_region_sparse_mmap_header(const char *name, int index, int nr_areas) "Device %s region %d: %d sparse mmap entries" + vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned long end) "sparse entry %d [0x%lx - 0x%lx]" + vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%0x8" ++vfio_get_dev_irq(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%0x8" + vfio_dma_unmap_overflow_workaround(void) "" + vfio_iommu_addr_inv_iotlb(int asid, uint64_t addr, uint64_t size, uint64_t nb_granules, bool leaf) "nested IOTLB invalidate asid=%d, addr=0x%"PRIx64" granule_size=0x%"PRIx64" nb_granules=0x%"PRIx64" leaf=%d" + vfio_iommu_asid_inv_iotlb(int asid) "nested IOTLB invalidate asid=%d" +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index b175158138..a82962ab16 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -238,6 +238,13 @@ int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type, + bool vfio_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type); + struct vfio_info_cap_header * + vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id); ++int vfio_get_irq_info(VFIODevice *vbasedev, int index, ++ struct vfio_irq_info **info); ++int vfio_get_dev_irq_info(VFIODevice *vbasedev, uint32_t type, ++ uint32_t subtype, struct vfio_irq_info **info); ++bool vfio_has_irq_cap(VFIODevice *vbasedev, int irq, uint16_t cap_type); ++struct vfio_info_cap_header * ++vfio_get_irq_info_cap(struct vfio_irq_info *info, uint16_t id); + #endif + extern const MemoryListener vfio_prereg_listener; + +-- +2.27.0 + diff --git a/vfio-Introduce-helpers-to-DMA-map-unmap-a-RAM-sectio.patch b/vfio-Introduce-helpers-to-DMA-map-unmap-a-RAM-sectio.patch new file mode 100644 index 0000000000000000000000000000000000000000..124587d1e081a65740786aefbd1033d895678245 --- /dev/null +++ b/vfio-Introduce-helpers-to-DMA-map-unmap-a-RAM-sectio.patch @@ -0,0 +1,261 @@ +From eb3bfdb61025efe2891ce6732b8829a48dd75e2d Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Thu, 30 Aug 2018 15:04:25 +0200 +Subject: [PATCH] vfio: Introduce helpers to DMA map/unmap a RAM section + +Let's introduce two helpers that allow to DMA map/unmap a RAM +section. Those helpers will be called for nested stage setup in +another call site. 
Also the vfio_listener_region_add/del() +structure may be clearer. + +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/vfio/common.c | 187 +++++++++++++++++++++++++++---------------- + hw/vfio/trace-events | 4 +- + 2 files changed, 119 insertions(+), 72 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index a8db784ac5..8837d33c57 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -709,13 +709,126 @@ hostwin_from_range(VFIOContainer *container, hwaddr iova, hwaddr end) + return NULL; + } + ++static int vfio_dma_map_ram_section(VFIOContainer *container, ++ MemoryRegionSection *section) ++{ ++ VFIOHostDMAWindow *hostwin; ++ Int128 llend, llsize; ++ hwaddr iova, end; ++ void *vaddr; ++ int ret; ++ ++ assert(memory_region_is_ram(section->mr)); ++ ++ iova = TARGET_PAGE_ALIGN(section->offset_within_address_space); ++ llend = int128_make64(section->offset_within_address_space); ++ llend = int128_add(llend, section->size); ++ llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK)); ++ end = int128_get64(int128_sub(llend, int128_one())); ++ ++ vaddr = memory_region_get_ram_ptr(section->mr) + ++ section->offset_within_region + ++ (iova - section->offset_within_address_space); ++ ++ hostwin = hostwin_from_range(container, iova, end); ++ if (!hostwin) { ++ error_report("vfio: IOMMU Container %p can't map guest IOVA region" ++ " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, ++ container, iova, end); ++ return -EFAULT; ++ } ++ ++ trace_vfio_dma_map_ram(iova, end, vaddr); ++ ++ llsize = int128_sub(llend, int128_make64(iova)); ++ ++ if (memory_region_is_ram_device(section->mr)) { ++ hwaddr pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1; ++ ++ if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) { ++ trace_vfio_listener_region_add_no_dma_map( ++ memory_region_name(section->mr), ++ section->offset_within_address_space, ++ int128_getlo(section->size), ++ pgmask + 1); ++ return 0; ++ } ++ } ++ ++ ret = vfio_dma_map(container, iova, int128_get64(llsize), ++ vaddr, section->readonly); ++ if (ret) { ++ error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", " ++ "0x%"HWADDR_PRIx", %p) = %d (%m)", ++ container, iova, int128_get64(llsize), vaddr, ret); ++ if (memory_region_is_ram_device(section->mr)) { ++ /* Allow unexpected mappings not to be fatal for RAM devices */ ++ return 0; ++ } ++ return ret; ++ } ++ return 0; ++} ++ ++static void vfio_dma_unmap_ram_section(VFIOContainer *container, ++ MemoryRegionSection *section) ++{ ++ Int128 llend, llsize; ++ hwaddr iova, end; ++ bool try_unmap = true; ++ int ret; ++ ++ iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space); ++ llend = int128_make64(section->offset_within_address_space); ++ llend = int128_add(llend, section->size); ++ llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask)); ++ ++ if (int128_ge(int128_make64(iova), llend)) { ++ return; ++ } ++ end = int128_get64(int128_sub(llend, int128_one())); ++ ++ llsize = int128_sub(llend, int128_make64(iova)); ++ ++ trace_vfio_dma_unmap_ram(iova, end); ++ ++ if (memory_region_is_ram_device(section->mr)) { ++ hwaddr pgmask; ++ VFIOHostDMAWindow *hostwin = hostwin_from_range(container, iova, end); ++ ++ assert(hostwin); /* or region_add() would have failed */ ++ ++ pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1; ++ try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask)); ++ } ++ ++ if (try_unmap) { ++ if (int128_eq(llsize, int128_2_64())) { ++ /* The unmap ioctl doesn't accept a full 64-bit span. 
*/ ++ llsize = int128_rshift(llsize, 1); ++ ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL); ++ if (ret) { ++ error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " ++ "0x%"HWADDR_PRIx") = %d (%m)", ++ container, iova, int128_get64(llsize), ret); ++ } ++ iova += int128_get64(llsize); ++ } ++ ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL); ++ if (ret) { ++ error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " ++ "0x%"HWADDR_PRIx") = %d (%m)", ++ container, iova, int128_get64(llsize), ret); ++ } ++ } ++} ++ + static void vfio_listener_region_add(MemoryListener *listener, + MemoryRegionSection *section) + { + VFIOContainer *container = container_of(listener, VFIOContainer, listener); + hwaddr iova, end; +- Int128 llend, llsize; +- void *vaddr; ++ Int128 llend; + int ret; + VFIOHostDMAWindow *hostwin; + +@@ -842,38 +955,7 @@ static void vfio_listener_region_add(MemoryListener *listener, + } + + /* Here we assume that memory_region_is_ram(section->mr)==true */ +- +- vaddr = memory_region_get_ram_ptr(section->mr) + +- section->offset_within_region + +- (iova - section->offset_within_address_space); +- +- trace_vfio_listener_region_add_ram(iova, end, vaddr); +- +- llsize = int128_sub(llend, int128_make64(iova)); +- +- if (memory_region_is_ram_device(section->mr)) { +- hwaddr pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1; +- +- if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) { +- trace_vfio_listener_region_add_no_dma_map( +- memory_region_name(section->mr), +- section->offset_within_address_space, +- int128_getlo(section->size), +- pgmask + 1); +- return; +- } +- } +- +- ret = vfio_dma_map(container, iova, int128_get64(llsize), +- vaddr, section->readonly); +- if (ret) { +- error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", " +- "0x%"HWADDR_PRIx", %p) = %d (%m)", +- container, iova, int128_get64(llsize), vaddr, ret); +- if (memory_region_is_ram_device(section->mr)) { +- /* Allow unexpected mappings not to be fatal for RAM devices */ +- return; +- } ++ if (vfio_dma_map_ram_section(container, section)) { + goto fail; + } + +@@ -902,10 +984,6 @@ static void vfio_listener_region_del(MemoryListener *listener, + MemoryRegionSection *section) + { + VFIOContainer *container = container_of(listener, VFIOContainer, listener); +- hwaddr iova, end; +- Int128 llend, llsize; +- int ret; +- bool try_unmap = true; + + if (vfio_listener_skipped_section(section)) { + trace_vfio_listener_region_del_skip( +@@ -945,38 +1023,7 @@ static void vfio_listener_region_del(MemoryListener *listener, + */ + } + +- iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space); +- llend = int128_make64(section->offset_within_address_space); +- llend = int128_add(llend, section->size); +- llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask)); +- +- if (int128_ge(int128_make64(iova), llend)) { +- return; +- } +- end = int128_get64(int128_sub(llend, int128_one())); +- +- llsize = int128_sub(llend, int128_make64(iova)); +- +- trace_vfio_listener_region_del(iova, end); +- +- if (memory_region_is_ram_device(section->mr)) { +- hwaddr pgmask; +- VFIOHostDMAWindow *hostwin = hostwin_from_range(container, iova, end); +- +- assert(hostwin); /* or region_add() would have failed */ +- +- pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1; +- try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask)); +- } +- +- if (try_unmap) { +- ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL); +- if (ret) { +- error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " +- 
"0x%"HWADDR_PRIx") = %d (%m)", +- container, iova, int128_get64(llsize), ret); +- } +- } ++ vfio_dma_unmap_ram_section(container, section); + + memory_region_unref(section->mr); + +diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events +index 561dc6e758..9b6c7ca61b 100644 +--- a/hw/vfio/trace-events ++++ b/hw/vfio/trace-events +@@ -97,10 +97,10 @@ vfio_iommu_map_notify(const char *op, uint64_t iova_start, uint64_t iova_end) "i + vfio_listener_region_add_skip(uint64_t start, uint64_t end) "SKIPPING region_add 0x%"PRIx64" - 0x%"PRIx64 + vfio_spapr_group_attach(int groupfd, int tablefd) "Attached groupfd %d to liobn fd %d" + vfio_listener_region_add_iommu(uint64_t start, uint64_t end) "region_add [iommu] 0x%"PRIx64" - 0x%"PRIx64 +-vfio_listener_region_add_ram(uint64_t iova_start, uint64_t iova_end, void *vaddr) "region_add [ram] 0x%"PRIx64" - 0x%"PRIx64" [%p]" ++vfio_dma_map_ram(uint64_t iova_start, uint64_t iova_end, void *vaddr) "region_add [ram] 0x%"PRIx64" - 0x%"PRIx64" [%p]" + vfio_listener_region_add_no_dma_map(const char *name, uint64_t iova, uint64_t size, uint64_t page_size) "Region \"%s\" 0x%"PRIx64" size=0x%"PRIx64" is not aligned to 0x%"PRIx64" and cannot be mapped for DMA" + vfio_listener_region_del_skip(uint64_t start, uint64_t end) "SKIPPING region_del 0x%"PRIx64" - 0x%"PRIx64 +-vfio_listener_region_del(uint64_t start, uint64_t end) "region_del 0x%"PRIx64" - 0x%"PRIx64 ++vfio_dma_unmap_ram(uint64_t start, uint64_t end) "region_del 0x%"PRIx64" - 0x%"PRIx64 + vfio_disconnect_container(int fd) "close container->fd=%d" + vfio_connect_existing_container(int groupid, int container_fd) "group=%d existing container fd=%d" + vfio_connect_new_container(int groupid, int container_fd) "group=%d new container fd=%d" +-- +2.27.0 + diff --git a/vfio-Introduce-helpers-to-mark-dirty-pages-of-a-RAM-.patch b/vfio-Introduce-helpers-to-mark-dirty-pages-of-a-RAM-.patch new file mode 100644 index 0000000000000000000000000000000000000000..274a0c08a92bfa268f3fab8a5d7842cdbab9d273 --- /dev/null +++ b/vfio-Introduce-helpers-to-mark-dirty-pages-of-a-RAM-.patch @@ -0,0 +1,64 @@ +From ff9c1f7e3e17cc2afe1b2dfa545065e91941db8b Mon Sep 17 00:00:00 2001 +From: Kunkun Jiang +Date: Tue, 11 May 2021 10:08:13 +0800 +Subject: [PATCH] vfio: Introduce helpers to mark dirty pages of a RAM section + +Extract part of the code from vfio_sync_dirty_bitmap to form a +new helper, which allows to mark dirty pages of a RAM section. +This helper will be called for nested stage. 
+ +Signed-off-by: Kunkun Jiang +--- + hw/vfio/common.c | 22 ++++++++++++++-------- + 1 file changed, 14 insertions(+), 8 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index 21a866e545..5176fd3a3d 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -1304,6 +1304,19 @@ err_out: + return ret; + } + ++static int vfio_dma_sync_ram_section_dirty_bitmap(VFIOContainer *container, ++ MemoryRegionSection *section) ++{ ++ ram_addr_t ram_addr; ++ ++ ram_addr = memory_region_get_ram_addr(section->mr) + ++ section->offset_within_region; ++ ++ return vfio_get_dirty_bitmap(container, ++ REAL_HOST_PAGE_ALIGN(section->offset_within_address_space), ++ int128_get64(section->size), ram_addr); ++} ++ + typedef struct { + IOMMUNotifier n; + VFIOGuestIOMMU *giommu; +@@ -1345,8 +1358,6 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) + static int vfio_sync_dirty_bitmap(VFIOContainer *container, + MemoryRegionSection *section) + { +- ram_addr_t ram_addr; +- + if (memory_region_is_iommu(section->mr)) { + VFIOGuestIOMMU *giommu; + +@@ -1375,12 +1386,7 @@ static int vfio_sync_dirty_bitmap(VFIOContainer *container, + return 0; + } + +- ram_addr = memory_region_get_ram_addr(section->mr) + +- section->offset_within_region; +- +- return vfio_get_dirty_bitmap(container, +- REAL_HOST_PAGE_ALIGN(section->offset_within_address_space), +- int128_get64(section->size), ram_addr); ++ return vfio_dma_sync_ram_section_dirty_bitmap(container, section); + } + + static void vfio_listener_log_sync(MemoryListener *listener, +-- +2.27.0 + diff --git a/vfio-Introduce-hostwin_from_range-helper.patch b/vfio-Introduce-hostwin_from_range-helper.patch new file mode 100644 index 0000000000000000000000000000000000000000..b9a7099de2a6eeaa9265be66c8818ffc852e6583 --- /dev/null +++ b/vfio-Introduce-hostwin_from_range-helper.patch @@ -0,0 +1,89 @@ +From 25336cd596ff551293f1be6e108ad9277d80be0f Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Fri, 22 Mar 2019 18:05:23 +0100 +Subject: [PATCH] vfio: Introduce hostwin_from_range helper + +Let's introduce a hostwin_from_range() helper that returns the +hostwin encapsulating an IOVA range or NULL if none is found. + +This improves the readibility of callers and removes the usage +of hostwin_found. 
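+
+Typical caller shape after this change (condensed from the hunks
+below); returning NULL lets each call site pick its own failure path:
+
+    hostwin = hostwin_from_range(container, iova, end);
+    if (!hostwin) {
+        error_report("vfio: no host window for IOVA range");
+        return; /* or goto fail, depending on the call site */
+    }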
+ +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/vfio/common.c | 36 +++++++++++++++++------------------- + 1 file changed, 17 insertions(+), 19 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index c78b58d365..a8db784ac5 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -696,6 +696,19 @@ out: + rcu_read_unlock(); + } + ++static VFIOHostDMAWindow * ++hostwin_from_range(VFIOContainer *container, hwaddr iova, hwaddr end) ++{ ++ VFIOHostDMAWindow *hostwin; ++ ++ QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { ++ if (hostwin->min_iova <= iova && end <= hostwin->max_iova) { ++ return hostwin; ++ } ++ } ++ return NULL; ++} ++ + static void vfio_listener_region_add(MemoryListener *listener, + MemoryRegionSection *section) + { +@@ -705,7 +718,6 @@ static void vfio_listener_region_add(MemoryListener *listener, + void *vaddr; + int ret; + VFIOHostDMAWindow *hostwin; +- bool hostwin_found; + + if (vfio_listener_skipped_section(section)) { + trace_vfio_listener_region_add_skip( +@@ -783,15 +795,8 @@ static void vfio_listener_region_add(MemoryListener *listener, + #endif + } + +- hostwin_found = false; +- QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { +- if (hostwin->min_iova <= iova && end <= hostwin->max_iova) { +- hostwin_found = true; +- break; +- } +- } +- +- if (!hostwin_found) { ++ hostwin = hostwin_from_range(container, iova, end); ++ if (!hostwin) { + error_report("vfio: IOMMU container %p can't map guest IOVA region" + " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, + container, iova, end); +@@ -956,16 +961,9 @@ static void vfio_listener_region_del(MemoryListener *listener, + + if (memory_region_is_ram_device(section->mr)) { + hwaddr pgmask; +- VFIOHostDMAWindow *hostwin; +- bool hostwin_found = false; ++ VFIOHostDMAWindow *hostwin = hostwin_from_range(container, iova, end); + +- QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { +- if (hostwin->min_iova <= iova && end <= hostwin->max_iova) { +- hostwin_found = true; +- break; +- } +- } +- assert(hostwin_found); /* or region_add() would have failed */ ++ assert(hostwin); /* or region_add() would have failed */ + + pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1; + try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask)); +-- +2.27.0 + diff --git a/vfio-Maintain-DMA-mapping-range-for-the-container.patch b/vfio-Maintain-DMA-mapping-range-for-the-container.patch new file mode 100644 index 0000000000000000000000000000000000000000..901a5e38ea78a2c490875611f12658151da661b9 --- /dev/null +++ b/vfio-Maintain-DMA-mapping-range-for-the-container.patch @@ -0,0 +1,191 @@ +From 90a6a1ec65d55d27faf79341b2dd9418d99da187 Mon Sep 17 00:00:00 2001 +From: Zenghui Yu +Date: Sat, 8 May 2021 17:31:04 +0800 +Subject: [PATCH] vfio: Maintain DMA mapping range for the container + +When synchronizing dirty bitmap from kernel VFIO we do it in a +per-iova-range fashion and we allocate the userspace bitmap for each of the +ioctl. This patch introduces `struct VFIODMARange` to describe a range of +the given DMA mapping with respect to a VFIO_IOMMU_MAP_DMA operation, and +make the bitmap cache of this range be persistent so that we don't need to +g_try_malloc0() every time. Note that the new structure is almost a copy of +`struct vfio_iommu_type1_dma_map` but only internally used by QEMU. 
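+
+As a worked example of the sizing used below (illustrative numbers
+only): a 1 GiB mapping with 4 KiB host pages spans 262144 pages;
+rounding up to a multiple of 64 bits and dividing by BITS_PER_BYTE
+yields a 32 KiB bitmap, allocated once at map time instead of on
+every dirty-bitmap ioctl.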
+ +More importantly, the cached per-iova-range dirty bitmap will be further +used when we want to add support for the CLEAR_BITMAP and this cached +bitmap will be used to guarantee we don't clear any unknown dirty bits +otherwise that can be a severe data loss issue for migration code. + +It's pretty intuitive to maintain a bitmap per container since we perform +log_sync at this granule. But I don't know how to deal with things like +memory hot-{un}plug, sparse DMA mappings, etc. Suggestions welcome. + +* yet something to-do: + - can't work with guest viommu + - no locks + - etc + +[ The idea and even the commit message are largely inherited from kvm side. + See commit 9f4bf4baa8b820c7930e23c9566c9493db7e1d25. ] + +Signed-off-by: Zenghui Yu +Signed-off-by: Kunkun Jiang +--- + hw/vfio/common.c | 62 +++++++++++++++++++++++++++++++---- + include/hw/vfio/vfio-common.h | 9 +++++ + 2 files changed, 65 insertions(+), 6 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index 245e32df5b..c33c4c539d 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -420,6 +420,29 @@ unmap_exit: + return ret; + } + ++static VFIODMARange *vfio_lookup_match_range(VFIOContainer *container, ++ hwaddr start_addr, hwaddr size) ++{ ++ VFIODMARange *qrange; ++ ++ QLIST_FOREACH(qrange, &container->dma_list, next) { ++ if (qrange->iova == start_addr && qrange->size == size) { ++ return qrange; ++ } ++ } ++ return NULL; ++} ++ ++static void vfio_dma_range_init_dirty_bitmap(VFIODMARange *qrange) ++{ ++ uint64_t pages, size; ++ ++ pages = REAL_HOST_PAGE_ALIGN(qrange->size) / qemu_real_host_page_size; ++ size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) / BITS_PER_BYTE; ++ ++ qrange->bitmap = g_malloc0(size); ++} ++ + /* + * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86 + */ +@@ -433,12 +456,29 @@ static int vfio_dma_unmap(VFIOContainer *container, + .iova = iova, + .size = size, + }; ++ VFIODMARange *qrange; + + if (iotlb && container->dirty_pages_supported && + vfio_devices_all_running_and_saving(container)) { + return vfio_dma_unmap_bitmap(container, iova, size, iotlb); + } + ++ /* ++ * unregister the DMA range ++ * ++ * It seems that the memory layer will give us the same section as the one ++ * used in region_add(). Otherwise it'll be complicated to manipulate the ++ * bitmap across region_{add,del}. Is there any guarantee? ++ * ++ * But there is really not such a restriction on the kernel interface ++ * (VFIO_IOMMU_DIRTY_PAGES_FLAG_{UN}MAP_DMA, etc). 
++ */ ++ qrange = vfio_lookup_match_range(container, iova, size); ++ assert(qrange); ++ g_free(qrange->bitmap); ++ QLIST_REMOVE(qrange, next); ++ g_free(qrange); ++ + while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) { + /* + * The type1 backend has an off-by-one bug in the kernel (71a7d3d78e3c +@@ -475,6 +515,14 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova, + .iova = iova, + .size = size, + }; ++ VFIODMARange *qrange; ++ ++ qrange = g_malloc0(sizeof(*qrange)); ++ qrange->iova = iova; ++ qrange->size = size; ++ QLIST_INSERT_HEAD(&container->dma_list, qrange, next); ++ /* XXX allocate the dirty bitmap on demand */ ++ vfio_dma_range_init_dirty_bitmap(qrange); + + if (!readonly) { + map.flags |= VFIO_DMA_MAP_FLAG_WRITE; +@@ -986,9 +1034,14 @@ static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, + { + struct vfio_iommu_type1_dirty_bitmap *dbitmap; + struct vfio_iommu_type1_dirty_bitmap_get *range; ++ VFIODMARange *qrange; + uint64_t pages; + int ret; + ++ qrange = vfio_lookup_match_range(container, iova, size); ++ /* the same as vfio_dma_unmap() */ ++ assert(qrange); ++ + dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range)); + + dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range); +@@ -1007,11 +1060,8 @@ static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, + pages = REAL_HOST_PAGE_ALIGN(range->size) / qemu_real_host_page_size; + range->bitmap.size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) / + BITS_PER_BYTE; +- range->bitmap.data = g_try_malloc0(range->bitmap.size); +- if (!range->bitmap.data) { +- ret = -ENOMEM; +- goto err_out; +- } ++ ++ range->bitmap.data = (__u64 *)qrange->bitmap; + + ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap); + if (ret) { +@@ -1027,7 +1077,6 @@ static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, + trace_vfio_get_dirty_bitmap(container->fd, range->iova, range->size, + range->bitmap.size, ram_addr); + err_out: +- g_free(range->bitmap.data); + g_free(dbitmap); + + return ret; +@@ -1681,6 +1730,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, + container->dirty_pages_supported = false; + QLIST_INIT(&container->giommu_list); + QLIST_INIT(&container->hostwin_list); ++ QLIST_INIT(&container->dma_list); + + ret = vfio_init_container(container, group->fd, errp); + if (ret) { +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index 475aa9fb40..2853dc861e 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -76,6 +76,14 @@ typedef struct VFIOAddressSpace { + + struct VFIOGroup; + ++typedef struct VFIODMARange { ++ QLIST_ENTRY(VFIODMARange) next; ++ hwaddr iova; ++ size_t size; ++ void *vaddr; /* unused */ ++ unsigned long *bitmap; /* dirty bitmap cache for this range */ ++} VFIODMARange; ++ + typedef struct VFIOContainer { + VFIOAddressSpace *space; + int fd; /* /dev/vfio/vfio, empowered by the attached groups */ +@@ -91,6 +99,7 @@ typedef struct VFIOContainer { + QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; + QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; + QLIST_HEAD(, VFIOGroup) group_list; ++ QLIST_HEAD(, VFIODMARange) dma_list; + QLIST_ENTRY(VFIOContainer) next; + } VFIOContainer; + +-- +2.27.0 + diff --git a/vfio-Pass-stage-1-MSI-bindings-to-the-host.patch b/vfio-Pass-stage-1-MSI-bindings-to-the-host.patch new file mode 100644 index 0000000000000000000000000000000000000000..1ad94b06ad73ccf44d049daa2b8ff35b3624d539 --- /dev/null +++ b/vfio-Pass-stage-1-MSI-bindings-to-the-host.patch 
@@ -0,0 +1,262 @@ +From 1729ae16dc557c0ad54cab3096b5cb6649d181ae Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Tue, 14 Aug 2018 08:08:11 -0400 +Subject: [PATCH] vfio: Pass stage 1 MSI bindings to the host + +We register the stage1 MSI bindings when enabling the vectors +and we unregister them on msi disable. + +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/vfio/common.c | 59 +++++++++++++++++++++++++++ + hw/vfio/pci.c | 76 ++++++++++++++++++++++++++++++++++- + hw/vfio/trace-events | 2 + + include/hw/vfio/vfio-common.h | 12 ++++++ + 4 files changed, 147 insertions(+), 2 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index cc50efdbc1..db9af3b0e5 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -709,6 +709,65 @@ static void vfio_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) + } + } + ++int vfio_iommu_set_msi_binding(VFIOContainer *container, int n, ++ IOMMUTLBEntry *iotlb) ++{ ++ struct vfio_iommu_type1_set_msi_binding ustruct; ++ VFIOMSIBinding *binding; ++ int ret; ++ ++ QLIST_FOREACH(binding, &container->msibinding_list, next) { ++ if (binding->index == n) { ++ return 0; ++ } ++ } ++ ++ ustruct.argsz = sizeof(struct vfio_iommu_type1_set_msi_binding); ++ ustruct.iova = iotlb->iova; ++ ustruct.flags = VFIO_IOMMU_BIND_MSI; ++ ustruct.gpa = iotlb->translated_addr; ++ ustruct.size = iotlb->addr_mask + 1; ++ ret = ioctl(container->fd, VFIO_IOMMU_SET_MSI_BINDING , &ustruct); ++ if (ret) { ++ error_report("%s: failed to register the stage1 MSI binding (%m)", ++ __func__); ++ return ret; ++ } ++ binding = g_new0(VFIOMSIBinding, 1); ++ binding->iova = ustruct.iova; ++ binding->gpa = ustruct.gpa; ++ binding->size = ustruct.size; ++ binding->index = n; ++ ++ QLIST_INSERT_HEAD(&container->msibinding_list, binding, next); ++ return 0; ++} ++ ++int vfio_iommu_unset_msi_binding(VFIOContainer *container, int n) ++{ ++ struct vfio_iommu_type1_set_msi_binding ustruct; ++ VFIOMSIBinding *binding, *tmp; ++ int ret; ++ ++ ustruct.argsz = sizeof(struct vfio_iommu_type1_set_msi_binding); ++ QLIST_FOREACH_SAFE(binding, &container->msibinding_list, next, tmp) { ++ if (binding->index != n) { ++ continue; ++ } ++ ustruct.flags = VFIO_IOMMU_UNBIND_MSI; ++ ustruct.iova = binding->iova; ++ ret = ioctl(container->fd, VFIO_IOMMU_SET_MSI_BINDING , &ustruct); ++ if (ret) { ++ error_report("Failed to unregister the stage1 MSI binding " ++ "for iova=0x%"PRIx64" (%m)", binding->iova); ++ } ++ QLIST_REMOVE(binding, next); ++ g_free(binding); ++ return ret; ++ } ++ return 0; ++} ++ + static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) + { + VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index 6c90ec9278..bbcba3fd16 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -360,6 +360,65 @@ static void vfio_msi_interrupt(void *opaque) + notify(&vdev->pdev, nr); + } + ++static bool vfio_iommu_require_msi_binding(IOMMUMemoryRegion *iommu_mr) ++{ ++ bool msi_translate = false, nested = false; ++ ++ memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_MSI_TRANSLATE, ++ (void *)&msi_translate); ++ memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_VFIO_NESTED, ++ (void *)&nested); ++ if (!nested || !msi_translate) { ++ return false; ++ } ++ return true; ++} ++ ++static int vfio_register_msi_binding(VFIOPCIDevice *vdev, ++ int vector_n, bool set) ++{ ++ VFIOContainer *container = vdev->vbasedev.group->container; ++ PCIDevice *dev = &vdev->pdev; ++ AddressSpace *as = 
pci_device_iommu_address_space(dev); ++ IOMMUMemoryRegionClass *imrc; ++ IOMMUMemoryRegion *iommu_mr; ++ IOMMUTLBEntry entry; ++ MSIMessage msg; ++ ++ if (as == &address_space_memory) { ++ return 0; ++ } ++ ++ iommu_mr = IOMMU_MEMORY_REGION(as->root); ++ if (!vfio_iommu_require_msi_binding(iommu_mr)) { ++ return 0; ++ } ++ ++ /* MSI doorbell address is translated by an IOMMU */ ++ ++ if (!set) { /* unregister */ ++ trace_vfio_unregister_msi_binding(vdev->vbasedev.name, vector_n); ++ ++ return vfio_iommu_unset_msi_binding(container, vector_n); ++ } ++ ++ msg = pci_get_msi_message(dev, vector_n); ++ imrc = memory_region_get_iommu_class_nocheck(iommu_mr); ++ ++ rcu_read_lock(); ++ entry = imrc->translate(iommu_mr, msg.address, IOMMU_WO, 0); ++ rcu_read_unlock(); ++ ++ if (entry.perm == IOMMU_NONE) { ++ return -ENOENT; ++ } ++ ++ trace_vfio_register_msi_binding(vdev->vbasedev.name, vector_n, ++ msg.address, entry.translated_addr); ++ ++ return vfio_iommu_set_msi_binding(container, vector_n, &entry); ++} ++ + static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix) + { + struct vfio_irq_set *irq_set; +@@ -377,7 +436,7 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix) + fds = (int32_t *)&irq_set->data; + + for (i = 0; i < vdev->nr_vectors; i++) { +- int fd = -1; ++ int ret, fd = -1; + + /* + * MSI vs MSI-X - The guest has direct access to MSI mask and pending +@@ -386,6 +445,12 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix) + * KVM signaling path only when configured and unmasked. + */ + if (vdev->msi_vectors[i].use) { ++ ret = vfio_register_msi_binding(vdev, i, true); ++ if (ret) { ++ error_report("%s failed to register S1 MSI binding " ++ "for vector %d(%d)", vdev->vbasedev.name, i, ret); ++ goto out; ++ } + if (vdev->msi_vectors[i].virq < 0 || + (msix && msix_is_masked(&vdev->pdev, i))) { + fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt); +@@ -399,6 +464,7 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix) + + ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set); + ++out: + g_free(irq_set); + + return ret; +@@ -712,7 +778,8 @@ static void vfio_msi_disable_common(VFIOPCIDevice *vdev) + + static void vfio_msix_disable(VFIOPCIDevice *vdev) + { +- int i; ++ int ret, i; ++ + + msix_unset_vector_notifiers(&vdev->pdev); + +@@ -724,6 +791,11 @@ static void vfio_msix_disable(VFIOPCIDevice *vdev) + if (vdev->msi_vectors[i].use) { + vfio_msix_vector_release(&vdev->pdev, i); + msix_vector_unuse(&vdev->pdev, i); ++ ret = vfio_register_msi_binding(vdev, i, false); ++ if (ret) { ++ error_report("%s: failed to unregister S1 MSI binding " ++ "for vector %d(%d)", vdev->vbasedev.name, i, ret); ++ } + } + } + +diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events +index ee9a67d3ef..247b72c1eb 100644 +--- a/hw/vfio/trace-events ++++ b/hw/vfio/trace-events +@@ -120,6 +120,8 @@ vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype + vfio_dma_unmap_overflow_workaround(void) "" + vfio_iommu_addr_inv_iotlb(int asid, uint64_t addr, uint64_t size, uint64_t nb_granules, bool leaf) "nested IOTLB invalidate asid=%d, addr=0x%"PRIx64" granule_size=0x%"PRIx64" nb_granules=0x%"PRIx64" leaf=%d" + vfio_iommu_asid_inv_iotlb(int asid) "nested IOTLB invalidate asid=%d" ++vfio_register_msi_binding(const char *name, int vector, uint64_t giova, uint64_t gdb) "%s: register vector %d gIOVA=0x%"PRIx64 "-> gDB=0x%"PRIx64" stage 1 mapping" ++vfio_unregister_msi_binding(const char *name, int vector) "%s: unregister vector %d stage 1 
mapping"
+ 
+ # platform.c
+ vfio_platform_base_device_init(char *name, int groupid) "%s belongs to group #%d"
+diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
+index 1277914ca8..b175158138 100644
+--- a/include/hw/vfio/vfio-common.h
++++ b/include/hw/vfio/vfio-common.h
+@@ -74,6 +74,14 @@ typedef struct VFIOAddressSpace {
+     QLIST_ENTRY(VFIOAddressSpace) list;
+ } VFIOAddressSpace;
+ 
++typedef struct VFIOMSIBinding {
++    int index;
++    hwaddr iova;
++    hwaddr gpa;
++    hwaddr size;
++    QLIST_ENTRY(VFIOMSIBinding) next;
++} VFIOMSIBinding;
++
+ struct VFIOGroup;
+ 
+ typedef struct VFIODMARange {
+@@ -101,6 +109,7 @@ typedef struct VFIOContainer {
+     QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
+     QLIST_HEAD(, VFIOGroup) group_list;
+     QLIST_HEAD(, VFIODMARange) dma_list;
++    QLIST_HEAD(, VFIOMSIBinding) msibinding_list;
+     QLIST_ENTRY(VFIOContainer) next;
+ } VFIOContainer;
+ 
+@@ -210,6 +219,9 @@ VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp);
+ void vfio_put_group(VFIOGroup *group);
+ int vfio_get_device(VFIOGroup *group, const char *name,
+                     VFIODevice *vbasedev, Error **errp);
++int vfio_iommu_set_msi_binding(VFIOContainer *container, int n,
++                               IOMMUTLBEntry *entry);
++int vfio_iommu_unset_msi_binding(VFIOContainer *container, int n);
+ 
+ extern const MemoryRegionOps vfio_region_ops;
+ typedef QLIST_HEAD(VFIOGroupList, VFIOGroup) VFIOGroupList;
+-- 
+2.27.0
+
diff --git a/vfio-Set-up-nested-stage-mappings.patch b/vfio-Set-up-nested-stage-mappings.patch
new file mode 100644
index 0000000000000000000000000000000000000000..66659e81a8dacc392f4f6a56ab988c36a52af8a2
--- /dev/null
+++ b/vfio-Set-up-nested-stage-mappings.patch
@@ -0,0 +1,277 @@
+From a65c40f9d1025a9843dec38070d9f26792b00892 Mon Sep 17 00:00:00 2001
+From: Eric Auger
+Date: Wed, 29 Aug 2018 18:10:12 +0200
+Subject: [PATCH] vfio: Set up nested stage mappings
+
+In nested mode, the legacy vfio_iommu_map_notify cannot be used as
+there is no "caching" mode and we do not trap on map.
+
+On Intel, vfio_iommu_map_notify was used to DMA map the RAM
+through the host single stage.
+
+With nested mode, we need to set up the stage 2 and the stage 1
+separately. This patch introduces a prereg_listener to set up
+the stage 2 mapping.
+
+The stage 1 mapping, owned by the guest, is passed to the host
+when the guest invalidates the stage 1 configuration, through
+a dedicated PCIPASIDOps callback. Guest IOTLB invalidations
+are cascaded down to the host through another IOMMU MR UNMAP
+notifier.
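+
+As a rough stand-alone sketch (all names and offsets below are invented
+for illustration, none of this is QEMU code), the nested walk simply
+composes the two stages:
+
+  #include <inttypes.h>
+  #include <stdint.h>
+  #include <stdio.h>
+
+  /* Invented linear offsets standing in for the two page-table stages. */
+  #define S1_OFFSET 0x10000ULL  /* stage 1: giova -> gpa, guest-owned   */
+  #define S2_OFFSET 0x80000ULL  /* stage 2: gpa -> hpa, host-programmed */
+
+  static uint64_t stage1(uint64_t giova) { return giova + S1_OFFSET; }
+  static uint64_t stage2(uint64_t gpa)   { return gpa + S2_OFFSET; }
+
+  int main(void)
+  {
+      uint64_t giova = 0x2000;
+      uint64_t gpa = stage1(giova);   /* owned by the guest            */
+      uint64_t hpa = stage2(gpa);     /* set up by the prereg listener */
+
+      printf("giova=0x%" PRIx64 " -> gpa=0x%" PRIx64 " -> hpa=0x%" PRIx64 "\n",
+             giova, gpa, hpa);
+      return 0;
+  }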
+ +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/vfio/common.c | 136 +++++++++++++++++++++++++++++++++++++++++-- + hw/vfio/pci.c | 21 +++++++ + hw/vfio/trace-events | 2 + + 3 files changed, 154 insertions(+), 5 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index 8837d33c57..cc50efdbc1 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -642,6 +642,73 @@ static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, + return true; + } + ++/* Propagate a guest IOTLB invalidation to the host (nested mode) */ ++static void vfio_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) ++{ ++ VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); ++ struct vfio_iommu_type1_cache_invalidate ustruct = {}; ++ VFIOContainer *container = giommu->container; ++ int ret; ++ ++ assert(iotlb->perm == IOMMU_NONE); ++ ++ ustruct.argsz = sizeof(ustruct); ++ ustruct.flags = 0; ++ ustruct.info.argsz = sizeof(struct iommu_cache_invalidate_info); ++ ustruct.info.version = IOMMU_CACHE_INVALIDATE_INFO_VERSION_1; ++ ustruct.info.cache = IOMMU_CACHE_INV_TYPE_IOTLB; ++ ++ switch (iotlb->granularity) { ++ case IOMMU_INV_GRAN_DOMAIN: ++ ustruct.info.granularity = IOMMU_INV_GRANU_DOMAIN; ++ break; ++ case IOMMU_INV_GRAN_PASID: ++ { ++ struct iommu_inv_pasid_info *pasid_info; ++ int archid = -1; ++ ++ pasid_info = &ustruct.info.granu.pasid_info; ++ ustruct.info.granularity = IOMMU_INV_GRANU_PASID; ++ if (iotlb->flags & IOMMU_INV_FLAGS_ARCHID) { ++ pasid_info->flags |= IOMMU_INV_ADDR_FLAGS_ARCHID; ++ archid = iotlb->arch_id; ++ } ++ pasid_info->archid = archid; ++ trace_vfio_iommu_asid_inv_iotlb(archid); ++ break; ++ } ++ case IOMMU_INV_GRAN_ADDR: ++ { ++ hwaddr start = iotlb->iova + giommu->iommu_offset; ++ struct iommu_inv_addr_info *addr_info; ++ size_t size = iotlb->addr_mask + 1; ++ int archid = -1; ++ ++ addr_info = &ustruct.info.granu.addr_info; ++ ustruct.info.granularity = IOMMU_INV_GRANU_ADDR; ++ if (iotlb->leaf) { ++ addr_info->flags |= IOMMU_INV_ADDR_FLAGS_LEAF; ++ } ++ if (iotlb->flags & IOMMU_INV_FLAGS_ARCHID) { ++ addr_info->flags |= IOMMU_INV_ADDR_FLAGS_ARCHID; ++ archid = iotlb->arch_id; ++ } ++ addr_info->archid = archid; ++ addr_info->addr = start; ++ addr_info->granule_size = size; ++ addr_info->nb_granules = 1; ++ trace_vfio_iommu_addr_inv_iotlb(archid, start, size, ++ 1, iotlb->leaf); ++ break; ++ } ++ } ++ ++ ret = ioctl(container->fd, VFIO_IOMMU_CACHE_INVALIDATE, &ustruct); ++ if (ret) { ++ error_report("%p: failed to invalidate CACHE (%d)", container, ret); ++ } ++} ++ + static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) + { + VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); +@@ -823,6 +890,32 @@ static void vfio_dma_unmap_ram_section(VFIOContainer *container, + } + } + ++static void vfio_prereg_listener_region_add(MemoryListener *listener, ++ MemoryRegionSection *section) ++{ ++ VFIOContainer *container = ++ container_of(listener, VFIOContainer, prereg_listener); ++ ++ if (!memory_region_is_ram(section->mr)) { ++ return; ++ } ++ ++ vfio_dma_map_ram_section(container, section); ++} ++ ++static void vfio_prereg_listener_region_del(MemoryListener *listener, ++ MemoryRegionSection *section) ++{ ++ VFIOContainer *container = ++ container_of(listener, VFIOContainer, prereg_listener); ++ ++ if (!memory_region_is_ram(section->mr)) { ++ return; ++ } ++ ++ vfio_dma_unmap_ram_section(container, section); ++} ++ + static void vfio_listener_region_add(MemoryListener *listener, + MemoryRegionSection *section) + { +@@ 
-920,9 +1013,10 @@ static void vfio_listener_region_add(MemoryListener *listener, + memory_region_ref(section->mr); + + if (memory_region_is_iommu(section->mr)) { ++ IOMMUNotify notify; + VFIOGuestIOMMU *giommu; + IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr); +- int iommu_idx; ++ int iommu_idx, flags; + + trace_vfio_listener_region_add_iommu(iova, end); + /* +@@ -941,15 +1035,27 @@ static void vfio_listener_region_add(MemoryListener *listener, + llend = int128_sub(llend, int128_one()); + iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr, + MEMTXATTRS_UNSPECIFIED); +- iommu_notifier_init(&giommu->n, vfio_iommu_map_notify, +- IOMMU_NOTIFIER_ALL, ++ ++ if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) { ++ /* IOTLB unmap notifier to propagate guest IOTLB invalidations */ ++ flags = IOMMU_NOTIFIER_UNMAP; ++ notify = vfio_iommu_unmap_notify; ++ } else { ++ /* MAP/UNMAP IOTLB notifier */ ++ flags = IOMMU_NOTIFIER_ALL; ++ notify = vfio_iommu_map_notify; ++ } ++ ++ iommu_notifier_init(&giommu->n, notify, flags, + section->offset_within_region, + int128_get64(llend), + iommu_idx); + QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next); + + memory_region_register_iommu_notifier(section->mr, &giommu->n); +- memory_region_iommu_replay(giommu->iommu, &giommu->n); ++ if (flags & IOMMU_NOTIFIER_MAP) { ++ memory_region_iommu_replay(giommu->iommu, &giommu->n); ++ } + + return; + } +@@ -1367,10 +1473,16 @@ static const MemoryListener vfio_memory_listener = { + .log_clear = vfio_listener_log_clear, + }; + ++static MemoryListener vfio_memory_prereg_listener = { ++ .region_add = vfio_prereg_listener_region_add, ++ .region_del = vfio_prereg_listener_region_del, ++}; ++ + static void vfio_listener_release(VFIOContainer *container) + { + memory_listener_unregister(&container->listener); +- if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { ++ if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU || ++ container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) { + memory_listener_unregister(&container->prereg_listener); + } + } +@@ -1976,6 +2088,20 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, + vfio_get_iommu_info_migration(container, info); + } + g_free(info); ++ ++ if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) { ++ container->prereg_listener = vfio_memory_prereg_listener; ++ memory_listener_register(&container->prereg_listener, ++ &address_space_memory); ++ if (container->error) { ++ memory_listener_unregister(&container->prereg_listener); ++ ret = container->error; ++ error_setg(errp, ++ "RAM memory listener initialization failed " ++ "for container"); ++ goto free_container_exit; ++ } ++ } + break; + } + case VFIO_SPAPR_TCE_v2_IOMMU: +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index 3641ad0c5c..6c90ec9278 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -2766,6 +2766,25 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev) + vdev->req_enabled = false; + } + ++static int vfio_iommu_set_pasid_table(PCIBus *bus, int32_t devfn, ++ IOMMUConfig *config) ++{ ++ PCIDevice *pdev = bus->devices[devfn]; ++ VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev); ++ VFIOContainer *container = vdev->vbasedev.group->container; ++ struct vfio_iommu_type1_set_pasid_table info; ++ ++ info.argsz = sizeof(info); ++ info.flags = VFIO_PASID_TABLE_FLAG_SET; ++ memcpy(&info.config, &config->pasid_cfg, sizeof(config->pasid_cfg)); ++ ++ return ioctl(container->fd, VFIO_IOMMU_SET_PASID_TABLE, &info); ++} ++ ++static PCIPASIDOps 
vfio_pci_pasid_ops = {
++    .set_pasid_table = vfio_iommu_set_pasid_table,
++};
++
+ static void vfio_realize(PCIDevice *pdev, Error **errp)
+ {
+     VFIOPCIDevice *vdev = PCI_VFIO(pdev);
+@@ -3072,6 +3091,8 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
+     vfio_register_req_notifier(vdev);
+     vfio_setup_resetfn_quirk(vdev);
+ 
++    pci_setup_pasid_ops(pdev, &vfio_pci_pasid_ops);
++
+     return;
+ 
+ out_teardown:
+diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
+index 9b6c7ca61b..ee9a67d3ef 100644
+--- a/hw/vfio/trace-events
++++ b/hw/vfio/trace-events
+@@ -118,6 +118,8 @@ vfio_region_sparse_mmap_header(const char *name, int index, int nr_areas) "Devic
+ vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned long end) "sparse entry %d [0x%lx - 0x%lx]"
+ vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%0x8"
+ vfio_dma_unmap_overflow_workaround(void) ""
++vfio_iommu_addr_inv_iotlb(int asid, uint64_t addr, uint64_t size, uint64_t nb_granules, bool leaf) "nested IOTLB invalidate asid=%d, addr=0x%"PRIx64" granule_size=0x%"PRIx64" nb_granules=0x%"PRIx64" leaf=%d"
++vfio_iommu_asid_inv_iotlb(int asid) "nested IOTLB invalidate asid=%d"
+ 
+ # platform.c
+ vfio_platform_base_device_init(char *name, int groupid) "%s belongs to group #%d"
+-- 
+2.27.0
+
diff --git a/vfio-Support-host-translation-granule-size.patch b/vfio-Support-host-translation-granule-size.patch
new file mode 100644
index 0000000000000000000000000000000000000000..d5eab65155770160c38615d038ea66264e284acb
--- /dev/null
+++ b/vfio-Support-host-translation-granule-size.patch
@@ -0,0 +1,152 @@
+From 594cba5943b3e8bf1bd5720b1fa20d4662920ae0 Mon Sep 17 00:00:00 2001
+From: Kunkun Jiang
+Date: Thu, 4 Mar 2021 21:34:46 +0800
+Subject: [PATCH] vfio: Support host translation granule size
+
+cpu_physical_memory_set_dirty_lebitmap() can quickly deal with
+the dirty pages of memory by bitmap-traveling, regardless of whether
+the bitmap is aligned correctly or not.
+
+cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
+host page size. So it'd be better to set bitmap_pgsize to the host
+page size to support more translation granule sizes.
+
+[aw: The Fixes commit below introduced code to restrict migration
+support to configurations where the target page size intersects the
+host dirty page support. For example, a 4K guest on a 4K host.
+Due to the above flexibility in bitmap handling, this restriction
+unnecessarily prevents mixed target/host page sizes that could
+otherwise be supported. Use host page size for dirty bitmap.]
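+
+As a stand-alone illustration of why this matters (example page sizes,
+not QEMU code; only the rounding rule mirrors the patch), the bitmap
+size shrinks with the page size:
+
+  #include <inttypes.h>
+  #include <stdint.h>
+  #include <stdio.h>
+
+  #define BITS_PER_BYTE 8
+
+  /* One bit per page, rounded up to a multiple of __u64 as the VFIO
+   * ABI expects, then converted to bytes. */
+  static uint64_t bitmap_bytes(uint64_t size, uint64_t pgsize)
+  {
+      uint64_t pages = (size + pgsize - 1) / pgsize;
+
+      return (pages + 63) / 64 * 64 / BITS_PER_BYTE;
+  }
+
+  int main(void)
+  {
+      /* For a 1 GiB range, a 64K-page host needs a 16x smaller bitmap
+       * than the fixed 4K TARGET_PAGE_SIZE math would allocate. */
+      printf("4K:  %" PRIu64 " bytes\n", bitmap_bytes(1ULL << 30, 4096));
+      printf("64K: %" PRIu64 " bytes\n", bitmap_bytes(1ULL << 30, 65536));
+      return 0;
+  }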
+ +Fixes: fc49c9cbf2 ("vfio: Get migration capability flags for container") +Signed-off-by: Kunkun Jiang +Message-Id: <20210304133446.1521-1-jiangkunkun@huawei.com> +Signed-off-by: Alex Williamson +--- + hw/vfio/common.c | 48 +++++++++++++++++++++++++----------------------- + 1 file changed, 25 insertions(+), 23 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index ebd701faa0..a7817c90cc 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -377,7 +377,7 @@ static int vfio_dma_unmap_bitmap(VFIOContainer *container, + { + struct vfio_iommu_type1_dma_unmap *unmap; + struct vfio_bitmap *bitmap; +- uint64_t pages = TARGET_PAGE_ALIGN(size) >> TARGET_PAGE_BITS; ++ uint64_t pages = REAL_HOST_PAGE_ALIGN(size) / qemu_real_host_page_size; + int ret; + + unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap)); +@@ -389,12 +389,12 @@ static int vfio_dma_unmap_bitmap(VFIOContainer *container, + bitmap = (struct vfio_bitmap *)&unmap->data; + + /* +- * cpu_physical_memory_set_dirty_lebitmap() expects pages in bitmap of +- * TARGET_PAGE_SIZE to mark those dirty. Hence set bitmap_pgsize to +- * TARGET_PAGE_SIZE. ++ * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of ++ * qemu_real_host_page_size to mark those dirty. Hence set bitmap_pgsize ++ * to qemu_real_host_page_size. + */ + +- bitmap->pgsize = TARGET_PAGE_SIZE; ++ bitmap->pgsize = qemu_real_host_page_size; + bitmap->size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) / + BITS_PER_BYTE; + +@@ -672,16 +672,17 @@ static void vfio_listener_region_add(MemoryListener *listener, + return; + } + +- if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) != +- (section->offset_within_region & ~TARGET_PAGE_MASK))) { ++ if (unlikely((section->offset_within_address_space & ++ ~qemu_real_host_page_mask) != ++ (section->offset_within_region & ~qemu_real_host_page_mask))) { + error_report("%s received unaligned region", __func__); + return; + } + +- iova = TARGET_PAGE_ALIGN(section->offset_within_address_space); ++ iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space); + llend = int128_make64(section->offset_within_address_space); + llend = int128_add(llend, section->size); +- llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK)); ++ llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask)); + + if (int128_ge(int128_make64(iova), llend)) { + return; +@@ -866,8 +867,9 @@ static void vfio_listener_region_del(MemoryListener *listener, + return; + } + +- if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) != +- (section->offset_within_region & ~TARGET_PAGE_MASK))) { ++ if (unlikely((section->offset_within_address_space & ++ ~qemu_real_host_page_mask) != ++ (section->offset_within_region & ~qemu_real_host_page_mask))) { + error_report("%s received unaligned region", __func__); + return; + } +@@ -895,10 +897,10 @@ static void vfio_listener_region_del(MemoryListener *listener, + */ + } + +- iova = TARGET_PAGE_ALIGN(section->offset_within_address_space); ++ iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space); + llend = int128_make64(section->offset_within_address_space); + llend = int128_add(llend, section->size); +- llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK)); ++ llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask)); + + if (int128_ge(int128_make64(iova), llend)) { + return; +@@ -967,13 +969,13 @@ static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, + range->size = size; + + /* +- * 
cpu_physical_memory_set_dirty_lebitmap() expects pages in bitmap of
+-     * TARGET_PAGE_SIZE to mark those dirty. Hence set bitmap's pgsize to
+-     * TARGET_PAGE_SIZE.
++     * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
++     * qemu_real_host_page_size to mark those dirty. Hence set bitmap's pgsize
++     * to qemu_real_host_page_size.
+      */
+-    range->bitmap.pgsize = TARGET_PAGE_SIZE;
++    range->bitmap.pgsize = qemu_real_host_page_size;
+ 
+-    pages = TARGET_PAGE_ALIGN(range->size) >> TARGET_PAGE_BITS;
++    pages = REAL_HOST_PAGE_ALIGN(range->size) / qemu_real_host_page_size;
+     range->bitmap.size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) /
+                                          BITS_PER_BYTE;
+     range->bitmap.data = g_try_malloc0(range->bitmap.size);
+@@ -1077,8 +1079,8 @@ static int vfio_sync_dirty_bitmap(VFIOContainer *container,
+                         section->offset_within_region;
+ 
+     return vfio_get_dirty_bitmap(container,
+-                   TARGET_PAGE_ALIGN(section->offset_within_address_space),
+-                   int128_get64(section->size), ram_addr);
++                   REAL_HOST_PAGE_ALIGN(section->offset_within_address_space),
++                   int128_get64(section->size), ram_addr);
+ }
+ 
+ static void vfio_listener_log_sync(MemoryListener *listener,
+@@ -1572,10 +1574,10 @@ static void vfio_get_iommu_info_migration(VFIOContainer *container,
+                                     header);
+ 
+     /*
+-     * cpu_physical_memory_set_dirty_lebitmap() expects pages in bitmap of
+-     * TARGET_PAGE_SIZE to mark those dirty.
++     * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
++     * qemu_real_host_page_size to mark those dirty.
+      */
+-    if (cap_mig->pgsize_bitmap & TARGET_PAGE_SIZE) {
++    if (cap_mig->pgsize_bitmap & qemu_real_host_page_size) {
+         container->dirty_pages_supported = true;
+         container->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size;
+         container->dirty_pgsizes = cap_mig->pgsize_bitmap;
+-- 
+2.27.0
+
diff --git a/vfio-common-Avoid-unmap-ram-section-at-vfio_listener.patch b/vfio-common-Avoid-unmap-ram-section-at-vfio_listener.patch
new file mode 100644
index 0000000000000000000000000000000000000000..efcbd1fd03162efd34a1c11bc169e39da757da6b
--- /dev/null
+++ b/vfio-common-Avoid-unmap-ram-section-at-vfio_listener.patch
@@ -0,0 +1,39 @@
+From 55f3bdd0866be2b1a6223bacf9e00a032daf957c Mon Sep 17 00:00:00 2001
+From: Kunkun Jiang
+Date: Sat, 31 Jul 2021 10:02:18 +0800
+Subject: [PATCH] vfio/common: Avoid unmap ram section at
+ vfio_listener_region_del() in nested mode
+
+The ram section will be unmapped at vfio_prereg_listener_region_del()
+in nested mode. So let's avoid unmapping the ram section at
+vfio_listener_region_del().
+
+Signed-off-by: Kunkun Jiang
+---
+ hw/vfio/common.c | 10 ++++++++++
+ 1 file changed, 10 insertions(+)
+
+diff --git a/hw/vfio/common.c b/hw/vfio/common.c
+index 98dc9e6f84..21a866e545 100644
+--- a/hw/vfio/common.c
++++ b/hw/vfio/common.c
+@@ -1179,6 +1179,16 @@ static void vfio_listener_region_del(MemoryListener *listener,
+         }
+     }
+ 
++    /*
++     * In nested mode, stage 2 (gpa->hpa) and the stage 1
++     * (giova->gpa) are set separately. The ram section
++     * will be unmapped in vfio_prereg_listener_region_del().
++     * Hence it doesn't need to unmap the ram section here.
++     */
++    if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
++        return;
++    }
++
+     /*
+      * FIXME: We assume the one big unmap below is adequate to
+      * remove any individual page mappings in the IOMMU which
+-- 
+2.27.0
+
diff --git a/vfio-migrate-Move-switch-of-dirty-tracking-into-vfio.patch b/vfio-migrate-Move-switch-of-dirty-tracking-into-vfio.patch
new file mode 100644
index 0000000000000000000000000000000000000000..5f543b40bdb7e93d671edbd834b4279dec69c8c9
--- /dev/null
+++ b/vfio-migrate-Move-switch-of-dirty-tracking-into-vfio.patch
@@ -0,0 +1,196 @@
+From 74b651428e6ed65177354d80bd888e842a4a5077 Mon Sep 17 00:00:00 2001
+From: Keqian Zhu
+Date: Tue, 9 Mar 2021 11:19:13 +0800
+Subject: [PATCH] vfio/migrate: Move switch of dirty tracking into
+ vfio_memory_listener
+
+For now the switch of vfio dirty page tracking is integrated into
+@vfio_save_handler. The reason is that some PCI vendor drivers may
+start to track dirty pages based on the _SAVING state of the device,
+so if dirty tracking is started before setting the device state,
+vfio will report full-dirty to QEMU.
+
+However, the dirty bitmaps of all RAMBlocks are fully set when setting
+up ram saving, so it doesn't matter whether the device is in the
+_SAVING state when vfio dirty tracking starts.
+
+Moreover, this logic causes some problems [1]. The object of dirty
+tracking is guest memory, but the object of @vfio_save_handler is
+device state, which produces unnecessary coupling and conflicts:
+
+1. Coupling: Their saving granules are different (per-VM vs per-device).
+   vfio will enable dirty_page_tracking for each device, while doing it
+   once is actually enough.
+
+2. Conflicts: The ram_save_setup() traverses all memory_listeners
+   to execute their log_start() and log_sync() hooks to get the
+   first round dirty bitmap, which is used by the bulk stage of
+   ram saving. However, as vfio dirty tracking is not yet started,
+   it can't get the dirty bitmap from vfio. Then we give up the chance
+   to handle vfio dirty pages at the bulk stage.
+
+Moving the switch of vfio dirty_page_tracking into vfio_memory_listener
+solves the above problems. Besides, do not require devices to be in the
+SAVING state for vfio_sync_dirty_bitmap().
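+
+The per-container nature of the switch can be sketched stand-alone
+(invented names, not QEMU code):
+
+  #include <stdbool.h>
+  #include <stdio.h>
+
+  /* Toy container: dirty tracking is one flag per container, so a
+   * single enable covers every device attached to it. */
+  struct toy_container {
+      bool dirty_tracking;
+      int num_devices;
+  };
+
+  static void log_global_start(struct toy_container *c)
+  {
+      if (!c->dirty_tracking) {
+          c->dirty_tracking = true; /* one ioctl, however many devices */
+          printf("tracking enabled once for %d devices\n", c->num_devices);
+      }
+  }
+
+  int main(void)
+  {
+      struct toy_container c = { .dirty_tracking = false, .num_devices = 3 };
+
+      log_global_start(&c);
+      log_global_start(&c); /* idempotent: nothing happens twice */
+      return 0;
+  }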
+ +[1] https://www.spinics.net/lists/kvm/msg229967.html + +Reported-by: Zenghui Yu +Signed-off-by: Keqian Zhu +Suggested-by: Paolo Bonzini +Message-Id: <20210309031913.11508-1-zhukeqian1@huawei.com> +Signed-off-by: Alex Williamson +Signed-off-by: Kunkun Jiang +--- + hw/vfio/common.c | 49 ++++++++++++++++++++++++++++++++++++--------- + hw/vfio/migration.c | 35 -------------------------------- + 2 files changed, 40 insertions(+), 44 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index a7817c90cc..245e32df5b 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -310,7 +310,7 @@ bool vfio_mig_active(void) + return true; + } + +-static bool vfio_devices_all_saving(VFIOContainer *container) ++static bool vfio_devices_all_dirty_tracking(VFIOContainer *container) + { + VFIOGroup *group; + VFIODevice *vbasedev; +@@ -328,13 +328,8 @@ static bool vfio_devices_all_saving(VFIOContainer *container) + return false; + } + +- if (migration->device_state & VFIO_DEVICE_STATE_SAVING) { +- if ((vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF) +- && (migration->device_state & VFIO_DEVICE_STATE_RUNNING)) { +- return false; +- } +- continue; +- } else { ++ if ((vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF) ++ && (migration->device_state & VFIO_DEVICE_STATE_RUNNING)) { + return false; + } + } +@@ -952,6 +947,40 @@ static void vfio_listener_region_del(MemoryListener *listener, + } + } + ++static void vfio_set_dirty_page_tracking(VFIOContainer *container, bool start) ++{ ++ int ret; ++ struct vfio_iommu_type1_dirty_bitmap dirty = { ++ .argsz = sizeof(dirty), ++ }; ++ ++ if (start) { ++ dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START; ++ } else { ++ dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP; ++ } ++ ++ ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, &dirty); ++ if (ret) { ++ error_report("Failed to set dirty tracking flag 0x%x errno: %d", ++ dirty.flags, errno); ++ } ++} ++ ++static void vfio_listener_log_global_start(MemoryListener *listener) ++{ ++ VFIOContainer *container = container_of(listener, VFIOContainer, listener); ++ ++ vfio_set_dirty_page_tracking(container, true); ++} ++ ++static void vfio_listener_log_global_stop(MemoryListener *listener) ++{ ++ VFIOContainer *container = container_of(listener, VFIOContainer, listener); ++ ++ vfio_set_dirty_page_tracking(container, false); ++} ++ + static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, + uint64_t size, ram_addr_t ram_addr) + { +@@ -1093,7 +1122,7 @@ static void vfio_listener_log_sync(MemoryListener *listener, + return; + } + +- if (vfio_devices_all_saving(container)) { ++ if (vfio_devices_all_dirty_tracking(container)) { + vfio_sync_dirty_bitmap(container, section); + } + } +@@ -1101,6 +1130,8 @@ static void vfio_listener_log_sync(MemoryListener *listener, + static const MemoryListener vfio_memory_listener = { + .region_add = vfio_listener_region_add, + .region_del = vfio_listener_region_del, ++ .log_global_start = vfio_listener_log_global_start, ++ .log_global_stop = vfio_listener_log_global_stop, + .log_sync = vfio_listener_log_sync, + }; + +diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c +index 033cb2b0c9..f1f006d584 100644 +--- a/hw/vfio/migration.c ++++ b/hw/vfio/migration.c +@@ -395,40 +395,10 @@ static int vfio_load_device_config_state(QEMUFile *f, void *opaque) + return qemu_file_get_error(f); + } + +-static int vfio_set_dirty_page_tracking(VFIODevice *vbasedev, bool start) +-{ +- int ret; +- VFIOMigration *migration = vbasedev->migration; +- VFIOContainer 
*container = vbasedev->group->container; +- struct vfio_iommu_type1_dirty_bitmap dirty = { +- .argsz = sizeof(dirty), +- }; +- +- if (start) { +- if (migration->device_state & VFIO_DEVICE_STATE_SAVING) { +- dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START; +- } else { +- return -EINVAL; +- } +- } else { +- dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP; +- } +- +- ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, &dirty); +- if (ret) { +- error_report("Failed to set dirty tracking flag 0x%x errno: %d", +- dirty.flags, errno); +- return -errno; +- } +- return ret; +-} +- + static void vfio_migration_cleanup(VFIODevice *vbasedev) + { + VFIOMigration *migration = vbasedev->migration; + +- vfio_set_dirty_page_tracking(vbasedev, false); +- + if (migration->region.mmaps) { + vfio_region_unmap(&migration->region); + } +@@ -469,11 +439,6 @@ static int vfio_save_setup(QEMUFile *f, void *opaque) + return ret; + } + +- ret = vfio_set_dirty_page_tracking(vbasedev, true); +- if (ret) { +- return ret; +- } +- + qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); + + ret = qemu_file_get_error(f); +-- +2.27.0 + diff --git a/vfio-migration-Add-support-for-manual-clear-vfio-dir.patch b/vfio-migration-Add-support-for-manual-clear-vfio-dir.patch new file mode 100644 index 0000000000000000000000000000000000000000..c59bc4e1ff70f6993557329480505c4300ff6aa0 --- /dev/null +++ b/vfio-migration-Add-support-for-manual-clear-vfio-dir.patch @@ -0,0 +1,223 @@ +From f9574b63bf5e940d794db2c3aaf928bde36d9521 Mon Sep 17 00:00:00 2001 +From: Zenghui Yu +Date: Sat, 8 May 2021 17:31:05 +0800 +Subject: [PATCH] vfio/migration: Add support for manual clear vfio dirty log + +The new capability VFIO_DIRTY_LOG_MANUAL_CLEAR and the new ioctl +VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR and +VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP have been introduced in +the kernel, tweak the userspace side to use them. + +Check if the kernel supports VFIO_DIRTY_LOG_MANUAL_CLEAR and +provide the log_clear() hook for vfio_memory_listener. If the +kernel supports it, deliever the clear message to kernel. + +Signed-off-by: Zenghui Yu +Signed-off-by: Kunkun Jiang +--- + hw/vfio/common.c | 149 +++++++++++++++++++++++++++++++++- + include/hw/vfio/vfio-common.h | 1 + + 2 files changed, 148 insertions(+), 2 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index c33c4c539d..206fb83e28 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -1045,7 +1045,9 @@ static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, + dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range)); + + dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range); +- dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP; ++ dbitmap->flags = container->dirty_log_manual_clear ? ++ VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR : ++ VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP; + range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data; + range->iova = iova; + range->size = size; +@@ -1176,12 +1178,148 @@ static void vfio_listener_log_sync(MemoryListener *listener, + } + } + ++/* ++ * I'm not sure if there's any alignment requirement for the CLEAR_BITMAP ++ * ioctl. But copy from kvm side and align {start, size} with 64 pages. ++ * ++ * I think the code can be simplified a lot if no alignment requirement. 
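++ *
++ * (Illustrative arithmetic with assumed values, not from the kernel ABI:
++ * with a 4K qemu_real_host_page_size, VFIO_CLEAR_LOG_ALIGN below is
++ * 4K << 6 = 256K, i.e. 64 pages. Clearing [0x1000, 0x5000) then rounds
++ * bmap_start down to page 0 and bmap_npages up to 64.)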
++ */ ++#define VFIO_CLEAR_LOG_SHIFT 6 ++#define VFIO_CLEAR_LOG_ALIGN (qemu_real_host_page_size << VFIO_CLEAR_LOG_SHIFT) ++#define VFIO_CLEAR_LOG_MASK (-VFIO_CLEAR_LOG_ALIGN) ++ ++static int vfio_log_clear_one_range(VFIOContainer *container, ++ VFIODMARange *qrange, uint64_t start, uint64_t size) ++{ ++ struct vfio_iommu_type1_dirty_bitmap *dbitmap; ++ struct vfio_iommu_type1_dirty_bitmap_get *range; ++ ++ dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range)); ++ ++ dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range); ++ dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP; ++ range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data; ++ ++ /* ++ * Now let's deal with the actual bitmap, which is almost the same ++ * as the kvm side. ++ */ ++ uint64_t end, bmap_start, start_delta, bmap_npages; ++ unsigned long *bmap_clear = NULL, psize = qemu_real_host_page_size; ++ int ret; ++ ++ bmap_start = start & VFIO_CLEAR_LOG_MASK; ++ start_delta = start - bmap_start; ++ bmap_start /= psize; ++ ++ bmap_npages = DIV_ROUND_UP(size + start_delta, VFIO_CLEAR_LOG_ALIGN) ++ << VFIO_CLEAR_LOG_SHIFT; ++ end = qrange->size / psize; ++ if (bmap_npages > end - bmap_start) { ++ bmap_npages = end - bmap_start; ++ } ++ start_delta /= psize; ++ ++ if (start_delta) { ++ bmap_clear = bitmap_new(bmap_npages); ++ bitmap_copy_with_src_offset(bmap_clear, qrange->bitmap, ++ bmap_start, start_delta + size / psize); ++ bitmap_clear(bmap_clear, 0, start_delta); ++ range->bitmap.data = (__u64 *)bmap_clear; ++ } else { ++ range->bitmap.data = (__u64 *)(qrange->bitmap + BIT_WORD(bmap_start)); ++ } ++ ++ range->iova = qrange->iova + bmap_start * psize; ++ range->size = bmap_npages * psize; ++ range->bitmap.size = ROUND_UP(bmap_npages, sizeof(__u64) * BITS_PER_BYTE) / ++ BITS_PER_BYTE; ++ range->bitmap.pgsize = qemu_real_host_page_size; ++ ++ ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap); ++ if (ret) { ++ error_report("Failed to clear dirty log for iova: 0x%"PRIx64 ++ " size: 0x%"PRIx64" err: %d", (uint64_t)range->iova, ++ (uint64_t)range->size, errno); ++ goto err_out; ++ } ++ ++ bitmap_clear(qrange->bitmap, bmap_start + start_delta, size / psize); ++err_out: ++ g_free(bmap_clear); ++ g_free(dbitmap); ++ return 0; ++} ++ ++static int vfio_physical_log_clear(VFIOContainer *container, ++ MemoryRegionSection *section) ++{ ++ uint64_t start, size, offset, count; ++ VFIODMARange *qrange; ++ int ret = 0; ++ ++ if (!container->dirty_log_manual_clear) { ++ /* No need to do explicit clear */ ++ return ret; ++ } ++ ++ start = section->offset_within_address_space; ++ size = int128_get64(section->size); ++ ++ if (!size) { ++ return ret; ++ } ++ ++ QLIST_FOREACH(qrange, &container->dma_list, next) { ++ /* ++ * Discard ranges that do not overlap the section (e.g., the ++ * Memory BAR regions of the device) ++ */ ++ if (qrange->iova > start + size - 1 || ++ start > qrange->iova + qrange->size - 1) { ++ continue; ++ } ++ ++ if (start >= qrange->iova) { ++ /* The range starts before section or is aligned to it. */ ++ offset = start - qrange->iova; ++ count = MIN(qrange->size - offset, size); ++ } else { ++ /* The range starts after section. 
*/ ++ offset = 0; ++ count = MIN(qrange->size, size - (qrange->iova - start)); ++ } ++ ret = vfio_log_clear_one_range(container, qrange, offset, count); ++ if (ret < 0) { ++ break; ++ } ++ } ++ ++ return ret; ++} ++ ++static void vfio_listener_log_clear(MemoryListener *listener, ++ MemoryRegionSection *section) ++{ ++ VFIOContainer *container = container_of(listener, VFIOContainer, listener); ++ ++ if (vfio_listener_skipped_section(section) || ++ !container->dirty_pages_supported) { ++ return; ++ } ++ ++ if (vfio_devices_all_dirty_tracking(container)) { ++ vfio_physical_log_clear(container, section); ++ } ++} ++ + static const MemoryListener vfio_memory_listener = { + .region_add = vfio_listener_region_add, + .region_del = vfio_listener_region_del, + .log_global_start = vfio_listener_log_global_start, + .log_global_stop = vfio_listener_log_global_stop, + .log_sync = vfio_listener_log_sync, ++ .log_clear = vfio_listener_log_clear, + }; + + static void vfio_listener_release(VFIOContainer *container) +@@ -1563,7 +1701,7 @@ static int vfio_get_iommu_type(VFIOContainer *container, + static int vfio_init_container(VFIOContainer *container, int group_fd, + Error **errp) + { +- int iommu_type, ret; ++ int iommu_type, dirty_log_manual_clear, ret; + + iommu_type = vfio_get_iommu_type(container, errp); + if (iommu_type < 0) { +@@ -1592,6 +1730,13 @@ static int vfio_init_container(VFIOContainer *container, int group_fd, + } + + container->iommu_type = iommu_type; ++ ++ dirty_log_manual_clear = ioctl(container->fd, VFIO_CHECK_EXTENSION, ++ VFIO_DIRTY_LOG_MANUAL_CLEAR); ++ if (dirty_log_manual_clear) { ++ container->dirty_log_manual_clear = dirty_log_manual_clear; ++ } ++ + return 0; + } + +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index 2853dc861e..1277914ca8 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -93,6 +93,7 @@ typedef struct VFIOContainer { + int error; + bool initialized; + bool dirty_pages_supported; ++ bool dirty_log_manual_clear; + uint64_t dirty_pgsizes; + uint64_t max_dirty_bitmap_size; + unsigned long pgsizes; +-- +2.27.0 + diff --git a/vfio-pci-Implement-return_page_response-page-respons.patch b/vfio-pci-Implement-return_page_response-page-respons.patch new file mode 100644 index 0000000000000000000000000000000000000000..721512e4095c6385efe44279e7e44744ea781899 --- /dev/null +++ b/vfio-pci-Implement-return_page_response-page-respons.patch @@ -0,0 +1,199 @@ +From dab7c3ad6d51e9f0c65d864d6128f62697db4604 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Fri, 6 Nov 2020 12:03:29 -0500 +Subject: [PATCH] vfio/pci: Implement return_page_response page response + callback + +This patch implements the page response path. The +response is written into the page response ring buffer and then +update header's head index is updated. This path is not used +by this series. It is introduced here as a POC for vSVA/ARM +integration. 
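+
+The producer side of the ring can be sketched stand-alone (the layout
+and names below are invented for illustration, not the kernel ABI):
+
+  #include <stdint.h>
+  #include <stdio.h>
+  #include <string.h>
+
+  #define NB_ENTRIES 4
+
+  /* Toy response entry; the real one is struct iommu_page_response. */
+  struct toy_response { uint32_t pasid; uint32_t code; };
+
+  static struct toy_response ring[NB_ENTRIES];
+  static uint32_t head;
+
+  /* Deposit an entry at head, then advance head modulo the ring size,
+   * the same shape as the response path described above. */
+  static void push_response(const struct toy_response *r)
+  {
+      memcpy(&ring[head], r, sizeof(*r));
+      head = (head + 1) % NB_ENTRIES;
+  }
+
+  int main(void)
+  {
+      struct toy_response r = { .pasid = 1, .code = 0 };
+
+      push_response(&r);
+      printf("head is now %u\n", head);
+      return 0;
+  }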
+ +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/vfio/pci.c | 123 ++++++++++++++++++++++++++++++++++++++++++++++++++ + hw/vfio/pci.h | 2 + + 2 files changed, 125 insertions(+) + +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index d1198c8a23..6f4083aec8 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -2662,6 +2662,61 @@ out: + g_free(fault_region_info); + } + ++static void vfio_init_fault_response_regions(VFIOPCIDevice *vdev, Error **errp) ++{ ++ struct vfio_region_info *fault_region_info = NULL; ++ struct vfio_region_info_cap_fault *cap_fault; ++ VFIODevice *vbasedev = &vdev->vbasedev; ++ struct vfio_info_cap_header *hdr; ++ char *fault_region_name; ++ int ret; ++ ++ ret = vfio_get_dev_region_info(&vdev->vbasedev, ++ VFIO_REGION_TYPE_NESTED, ++ VFIO_REGION_SUBTYPE_NESTED_DMA_FAULT_RESPONSE, ++ &fault_region_info); ++ if (ret) { ++ goto out; ++ } ++ ++ hdr = vfio_get_region_info_cap(fault_region_info, ++ VFIO_REGION_INFO_CAP_DMA_FAULT_RESPONSE); ++ if (!hdr) { ++ error_setg(errp, "failed to retrieve DMA FAULT RESPONSE capability"); ++ goto out; ++ } ++ cap_fault = container_of(hdr, struct vfio_region_info_cap_fault, ++ header); ++ if (cap_fault->version != 1) { ++ error_setg(errp, "Unsupported DMA FAULT RESPONSE API version %d", ++ cap_fault->version); ++ goto out; ++ } ++ ++ fault_region_name = g_strdup_printf("%s DMA FAULT RESPONSE %d", ++ vbasedev->name, ++ fault_region_info->index); ++ ++ ret = vfio_region_setup(OBJECT(vdev), vbasedev, ++ &vdev->dma_fault_response_region, ++ fault_region_info->index, ++ fault_region_name); ++ g_free(fault_region_name); ++ if (ret) { ++ error_setg_errno(errp, -ret, ++ "failed to set up the DMA FAULT RESPONSE region %d", ++ fault_region_info->index); ++ goto out; ++ } ++ ++ ret = vfio_region_mmap(&vdev->dma_fault_response_region); ++ if (ret) { ++ error_setg_errno(errp, -ret, "Failed to mmap the DMA FAULT RESPONSE queue"); ++ } ++out: ++ g_free(fault_region_info); ++} ++ + static void vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) + { + VFIODevice *vbasedev = &vdev->vbasedev; +@@ -2737,6 +2792,12 @@ static void vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) + return; + } + ++ vfio_init_fault_response_regions(vdev, &err); ++ if (err) { ++ error_propagate(errp, err); ++ return; ++ } ++ + irq_info.index = VFIO_PCI_ERR_IRQ_INDEX; + + ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info); +@@ -2915,8 +2976,68 @@ static int vfio_iommu_set_pasid_table(PCIBus *bus, int32_t devfn, + return ioctl(container->fd, VFIO_IOMMU_SET_PASID_TABLE, &info); + } + ++static int vfio_iommu_return_page_response(PCIBus *bus, int32_t devfn, ++ IOMMUPageResponse *resp) ++{ ++ PCIDevice *pdev = bus->devices[devfn]; ++ VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev); ++ struct iommu_page_response *response = &resp->resp; ++ struct vfio_region_dma_fault_response header; ++ struct iommu_page_response *queue; ++ char *queue_buffer = NULL; ++ ssize_t bytes; ++ ++ if (!vdev->dma_fault_response_region.mem) { ++ return -EINVAL; ++ } ++ ++ /* read the header */ ++ bytes = pread(vdev->vbasedev.fd, &header, sizeof(header), ++ vdev->dma_fault_response_region.fd_offset); ++ if (bytes != sizeof(header)) { ++ error_report("%s unable to read the fault region header (0x%lx)", ++ __func__, bytes); ++ return -1; ++ } ++ ++ /* Normally the fault queue is mmapped */ ++ queue = (struct iommu_page_response *)vdev->dma_fault_response_region.mmaps[0].mmap; ++ if (!queue) { ++ size_t queue_size = header.nb_entries * header.entry_size; ++ ++ 
error_report("%s: fault queue not mmapped: slower fault handling", ++ vdev->vbasedev.name); ++ ++ queue_buffer = g_malloc(queue_size); ++ bytes = pread(vdev->vbasedev.fd, queue_buffer, queue_size, ++ vdev->dma_fault_response_region.fd_offset + header.offset); ++ if (bytes != queue_size) { ++ error_report("%s unable to read the fault queue (0x%lx)", ++ __func__, bytes); ++ return -1; ++ } ++ ++ queue = (struct iommu_page_response *)queue_buffer; ++ } ++ /* deposit the new response in the queue and increment the head */ ++ memcpy(queue + header.head, response, header.entry_size); ++ ++ vdev->fault_response_head_index = ++ (vdev->fault_response_head_index + 1) % header.nb_entries; ++ bytes = pwrite(vdev->vbasedev.fd, &vdev->fault_response_head_index, 4, ++ vdev->dma_fault_response_region.fd_offset); ++ if (bytes != 4) { ++ error_report("%s unable to write the fault response region head index (0x%lx)", ++ __func__, bytes); ++ } ++ g_free(queue_buffer); ++ ++ return 0; ++} ++ + static PCIPASIDOps vfio_pci_pasid_ops = { + .set_pasid_table = vfio_iommu_set_pasid_table, ++ .return_page_response = vfio_iommu_return_page_response, + }; + + static void vfio_dma_fault_notifier_handler(void *opaque) +@@ -3373,6 +3494,7 @@ static void vfio_instance_finalize(Object *obj) + vfio_display_finalize(vdev); + vfio_bars_finalize(vdev); + vfio_region_finalize(&vdev->dma_fault_region); ++ vfio_region_finalize(&vdev->dma_fault_response_region); + g_free(vdev->emulated_config_bits); + g_free(vdev->rom); + /* +@@ -3394,6 +3516,7 @@ static void vfio_exitfn(PCIDevice *pdev) + vfio_unregister_err_notifier(vdev); + vfio_unregister_ext_irq_notifiers(vdev); + vfio_region_exit(&vdev->dma_fault_region); ++ vfio_region_exit(&vdev->dma_fault_response_region); + pci_device_set_intx_routing_notifier(&vdev->pdev, NULL); + vfio_disable_interrupts(vdev); + if (vdev->intx.mmap_timer) { +diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h +index e31bc0173a..7fdcfa0dc8 100644 +--- a/hw/vfio/pci.h ++++ b/hw/vfio/pci.h +@@ -143,6 +143,8 @@ typedef struct VFIOPCIDevice { + VFIOPCIExtIRQ *ext_irqs; + VFIORegion dma_fault_region; + uint32_t fault_tail_index; ++ VFIORegion dma_fault_response_region; ++ uint32_t fault_response_head_index; + int (*resetfn)(struct VFIOPCIDevice *); + uint32_t vendor_id; + uint32_t device_id; +-- +2.27.0 + diff --git a/vfio-pci-Implement-the-DMA-fault-handler.patch b/vfio-pci-Implement-the-DMA-fault-handler.patch new file mode 100644 index 0000000000000000000000000000000000000000..ca61b01c4469cd30c3b4b781c2cc527b48c45e80 --- /dev/null +++ b/vfio-pci-Implement-the-DMA-fault-handler.patch @@ -0,0 +1,96 @@ +From 139d0b3474c29427fea4a0ed47f51c01a76a8636 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Tue, 5 Mar 2019 16:35:32 +0100 +Subject: [PATCH] vfio/pci: Implement the DMA fault handler + +Whenever the eventfd is triggered, we retrieve the DMA fault(s) +from the mmapped fault region and inject them in the iommu +memory region. 
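+
+The drain loop has the classic single-consumer ring shape; a stand-alone
+sketch (invented layout, not the kernel ABI):
+
+  #include <stdint.h>
+  #include <stdio.h>
+
+  #define NB_ENTRIES 8
+
+  static uint32_t faults[NB_ENTRIES];
+
+  /* Consume entries between our tail and the producer's head, then
+   * return the new tail so it can be published back to the driver. */
+  static uint32_t drain(uint32_t tail, uint32_t head)
+  {
+      while (tail != head) {
+          printf("handle fault %u\n", faults[tail]);
+          tail = (tail + 1) % NB_ENTRIES;
+      }
+      return tail;
+  }
+
+  int main(void)
+  {
+      for (uint32_t i = 0; i < NB_ENTRIES; i++) {
+          faults[i] = 100 + i;
+      }
+      printf("new tail = %u\n", drain(6, 2)); /* wraps around the ring */
+      return 0;
+  }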
+ +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/vfio/pci.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ + hw/vfio/pci.h | 1 + + 2 files changed, 51 insertions(+) + +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index 0db7d68258..d1198c8a23 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -2922,10 +2922,60 @@ static PCIPASIDOps vfio_pci_pasid_ops = { + static void vfio_dma_fault_notifier_handler(void *opaque) + { + VFIOPCIExtIRQ *ext_irq = opaque; ++ VFIOPCIDevice *vdev = ext_irq->vdev; ++ PCIDevice *pdev = &vdev->pdev; ++ AddressSpace *as = pci_device_iommu_address_space(pdev); ++ IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(as->root); ++ struct vfio_region_dma_fault header; ++ struct iommu_fault *queue; ++ char *queue_buffer = NULL; ++ ssize_t bytes; + + if (!event_notifier_test_and_clear(&ext_irq->notifier)) { + return; + } ++ ++ bytes = pread(vdev->vbasedev.fd, &header, sizeof(header), ++ vdev->dma_fault_region.fd_offset); ++ if (bytes != sizeof(header)) { ++ error_report("%s unable to read the fault region header (0x%lx)", ++ __func__, bytes); ++ return; ++ } ++ ++ /* Normally the fault queue is mmapped */ ++ queue = (struct iommu_fault *)vdev->dma_fault_region.mmaps[0].mmap; ++ if (!queue) { ++ size_t queue_size = header.nb_entries * header.entry_size; ++ ++ error_report("%s: fault queue not mmapped: slower fault handling", ++ vdev->vbasedev.name); ++ ++ queue_buffer = g_malloc(queue_size); ++ bytes = pread(vdev->vbasedev.fd, queue_buffer, queue_size, ++ vdev->dma_fault_region.fd_offset + header.offset); ++ if (bytes != queue_size) { ++ error_report("%s unable to read the fault queue (0x%lx)", ++ __func__, bytes); ++ return; ++ } ++ ++ queue = (struct iommu_fault *)queue_buffer; ++ } ++ ++ while (vdev->fault_tail_index != header.head) { ++ memory_region_inject_faults(iommu_mr, 1, ++ &queue[vdev->fault_tail_index]); ++ vdev->fault_tail_index = ++ (vdev->fault_tail_index + 1) % header.nb_entries; ++ } ++ bytes = pwrite(vdev->vbasedev.fd, &vdev->fault_tail_index, 4, ++ vdev->dma_fault_region.fd_offset); ++ if (bytes != 4) { ++ error_report("%s unable to write the fault region tail index (0x%lx)", ++ __func__, bytes); ++ } ++ g_free(queue_buffer); + } + + static int vfio_register_ext_irq_handler(VFIOPCIDevice *vdev, +diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h +index 815154656c..e31bc0173a 100644 +--- a/hw/vfio/pci.h ++++ b/hw/vfio/pci.h +@@ -142,6 +142,7 @@ typedef struct VFIOPCIDevice { + EventNotifier req_notifier; + VFIOPCIExtIRQ *ext_irqs; + VFIORegion dma_fault_region; ++ uint32_t fault_tail_index; + int (*resetfn)(struct VFIOPCIDevice *); + uint32_t vendor_id; + uint32_t device_id; +-- +2.27.0 + diff --git a/vfio-pci-Register-handler-for-iommu-fault.patch b/vfio-pci-Register-handler-for-iommu-fault.patch new file mode 100644 index 0000000000000000000000000000000000000000..feea0a347baad96a592cefba3dd6957947d1505d --- /dev/null +++ b/vfio-pci-Register-handler-for-iommu-fault.patch @@ -0,0 +1,168 @@ +From 65b96da46d2c5dfdcf3a4618cf75ca94345164d7 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Thu, 13 Dec 2018 04:39:30 -0500 +Subject: [PATCH] vfio/pci: Register handler for iommu fault + +We use the new extended IRQ VFIO_IRQ_TYPE_NESTED type and +VFIO_IRQ_SUBTYPE_DMA_FAULT subtype to set/unset +a notifier for physical DMA faults. The associated eventfd is +triggered, in nested mode, whenever a fault is detected at IOMMU +physical level. + +The actual handler will be implemented in subsequent patches. 
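+
+The signaling mechanism itself is a plain eventfd round trip; a minimal
+stand-alone illustration (Linux-only, not VFIO code):
+
+  #include <inttypes.h>
+  #include <stdint.h>
+  #include <stdio.h>
+  #include <sys/eventfd.h>
+  #include <unistd.h>
+
+  int main(void)
+  {
+      uint64_t v = 1, out = 0;
+      int fd = eventfd(0, 0);
+
+      if (fd < 0) {
+          perror("eventfd");
+          return 1;
+      }
+      /* Producer side: the host IOMMU layer signals a fault this way. */
+      if (write(fd, &v, sizeof(v)) != sizeof(v)) {
+          return 1;
+      }
+      /* Handler side: the read is the test-and-clear step. */
+      if (read(fd, &out, sizeof(out)) != sizeof(out)) {
+          return 1;
+      }
+      printf("events drained: %" PRIu64 "\n", out);
+      close(fd);
+      return 0;
+  }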
+ +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/vfio/pci.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++- + hw/vfio/pci.h | 7 +++++ + 2 files changed, 87 insertions(+), 1 deletion(-) + +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index bbcba3fd16..f5c05d508d 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -2857,6 +2857,76 @@ static PCIPASIDOps vfio_pci_pasid_ops = { + .set_pasid_table = vfio_iommu_set_pasid_table, + }; + ++static void vfio_dma_fault_notifier_handler(void *opaque) ++{ ++ VFIOPCIExtIRQ *ext_irq = opaque; ++ ++ if (!event_notifier_test_and_clear(&ext_irq->notifier)) { ++ return; ++ } ++} ++ ++static int vfio_register_ext_irq_handler(VFIOPCIDevice *vdev, ++ uint32_t type, uint32_t subtype, ++ IOHandler *handler) ++{ ++ int32_t fd, ext_irq_index, index; ++ struct vfio_irq_info *irq_info; ++ Error *err = NULL; ++ EventNotifier *n; ++ int ret; ++ ++ ret = vfio_get_dev_irq_info(&vdev->vbasedev, type, subtype, &irq_info); ++ if (ret) { ++ return ret; ++ } ++ index = irq_info->index; ++ ext_irq_index = irq_info->index - VFIO_PCI_NUM_IRQS; ++ g_free(irq_info); ++ ++ vdev->ext_irqs[ext_irq_index].vdev = vdev; ++ vdev->ext_irqs[ext_irq_index].index = index; ++ n = &vdev->ext_irqs[ext_irq_index].notifier; ++ ++ ret = event_notifier_init(n, 0); ++ if (ret) { ++ error_report("vfio: Unable to init event notifier for ext irq %d(%d)", ++ ext_irq_index, ret); ++ return ret; ++ } ++ ++ fd = event_notifier_get_fd(n); ++ qemu_set_fd_handler(fd, vfio_dma_fault_notifier_handler, NULL, ++ &vdev->ext_irqs[ext_irq_index]); ++ ++ ret = vfio_set_irq_signaling(&vdev->vbasedev, index, 0, ++ VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err); ++ if (ret) { ++ error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); ++ qemu_set_fd_handler(fd, NULL, NULL, vdev); ++ event_notifier_cleanup(n); ++ } ++ return ret; ++} ++ ++static void vfio_unregister_ext_irq_notifiers(VFIOPCIDevice *vdev) ++{ ++ VFIODevice *vbasedev = &vdev->vbasedev; ++ Error *err = NULL; ++ int i; ++ ++ for (i = 0; i < vbasedev->num_irqs - VFIO_PCI_NUM_IRQS; i++) { ++ if (vfio_set_irq_signaling(vbasedev, i + VFIO_PCI_NUM_IRQS , 0, ++ VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) { ++ error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); ++ } ++ qemu_set_fd_handler(event_notifier_get_fd(&vdev->ext_irqs[i].notifier), ++ NULL, NULL, vdev); ++ event_notifier_cleanup(&vdev->ext_irqs[i].notifier); ++ } ++ g_free(vdev->ext_irqs); ++} ++ + static void vfio_realize(PCIDevice *pdev, Error **errp) + { + VFIOPCIDevice *vdev = PCI_VFIO(pdev); +@@ -2867,7 +2937,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) + ssize_t len; + struct stat st; + int groupid; +- int i, ret; ++ int i, ret, nb_ext_irqs; + bool is_mdev; + + if (!vdev->vbasedev.sysfsdev) { +@@ -2955,6 +3025,11 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) + goto error; + } + ++ nb_ext_irqs = vdev->vbasedev.num_irqs - VFIO_PCI_NUM_IRQS; ++ if (nb_ext_irqs > 0) { ++ vdev->ext_irqs = g_new0(VFIOPCIExtIRQ, nb_ext_irqs); ++ } ++ + vfio_populate_device(vdev, &err); + if (err) { + error_propagate(errp, err); +@@ -3161,6 +3236,9 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) + + vfio_register_err_notifier(vdev); + vfio_register_req_notifier(vdev); ++ vfio_register_ext_irq_handler(vdev, VFIO_IRQ_TYPE_NESTED, ++ VFIO_IRQ_SUBTYPE_DMA_FAULT, ++ vfio_dma_fault_notifier_handler); + vfio_setup_resetfn_quirk(vdev); + + pci_setup_pasid_ops(pdev, &vfio_pci_pasid_ops); +@@ -3201,6 +3279,7 @@ static void vfio_exitfn(PCIDevice *pdev) + + 
vfio_unregister_req_notifier(vdev); + vfio_unregister_err_notifier(vdev); ++ vfio_unregister_ext_irq_notifiers(vdev); + pci_device_set_intx_routing_notifier(&vdev->pdev, NULL); + vfio_disable_interrupts(vdev); + if (vdev->intx.mmap_timer) { +diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h +index 834a90d646..893d074375 100644 +--- a/hw/vfio/pci.h ++++ b/hw/vfio/pci.h +@@ -113,6 +113,12 @@ typedef struct VFIOMSIXInfo { + unsigned long *pending; + } VFIOMSIXInfo; + ++typedef struct VFIOPCIExtIRQ { ++ struct VFIOPCIDevice *vdev; ++ EventNotifier notifier; ++ uint32_t index; ++} VFIOPCIExtIRQ; ++ + typedef struct VFIOPCIDevice { + PCIDevice pdev; + VFIODevice vbasedev; +@@ -134,6 +140,7 @@ typedef struct VFIOPCIDevice { + PCIHostDeviceAddress host; + EventNotifier err_notifier; + EventNotifier req_notifier; ++ VFIOPCIExtIRQ *ext_irqs; + int (*resetfn)(struct VFIOPCIDevice *); + uint32_t vendor_id; + uint32_t device_id; +-- +2.27.0 + diff --git a/vfio-pci-Set-up-the-DMA-FAULT-region.patch b/vfio-pci-Set-up-the-DMA-FAULT-region.patch new file mode 100644 index 0000000000000000000000000000000000000000..ae70a0696cb8310e2669b7e75d2e12bf8e9911f8 --- /dev/null +++ b/vfio-pci-Set-up-the-DMA-FAULT-region.patch @@ -0,0 +1,132 @@ +From e44d9cc377848f0a560b6d114561852e95fab557 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Thu, 13 Dec 2018 10:57:53 -0500 +Subject: [PATCH] vfio/pci: Set up the DMA FAULT region + +Set up the fault region which is composed of the actual fault +queue (mmappable) and a header used to handle it. The fault +queue is mmapped. + +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/vfio/pci.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++ + hw/vfio/pci.h | 1 + + 2 files changed, 65 insertions(+) + +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index f5c05d508d..0db7d68258 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -2607,11 +2607,67 @@ int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp) + return 0; + } + ++static void vfio_init_fault_regions(VFIOPCIDevice *vdev, Error **errp) ++{ ++ struct vfio_region_info *fault_region_info = NULL; ++ struct vfio_region_info_cap_fault *cap_fault; ++ VFIODevice *vbasedev = &vdev->vbasedev; ++ struct vfio_info_cap_header *hdr; ++ char *fault_region_name; ++ int ret; ++ ++ ret = vfio_get_dev_region_info(&vdev->vbasedev, ++ VFIO_REGION_TYPE_NESTED, ++ VFIO_REGION_SUBTYPE_NESTED_DMA_FAULT, ++ &fault_region_info); ++ if (ret) { ++ goto out; ++ } ++ ++ hdr = vfio_get_region_info_cap(fault_region_info, ++ VFIO_REGION_INFO_CAP_DMA_FAULT); ++ if (!hdr) { ++ error_setg(errp, "failed to retrieve DMA FAULT capability"); ++ goto out; ++ } ++ cap_fault = container_of(hdr, struct vfio_region_info_cap_fault, ++ header); ++ if (cap_fault->version != 1) { ++ error_setg(errp, "Unsupported DMA FAULT API version %d", ++ cap_fault->version); ++ goto out; ++ } ++ ++ fault_region_name = g_strdup_printf("%s DMA FAULT %d", ++ vbasedev->name, ++ fault_region_info->index); ++ ++ ret = vfio_region_setup(OBJECT(vdev), vbasedev, ++ &vdev->dma_fault_region, ++ fault_region_info->index, ++ fault_region_name); ++ g_free(fault_region_name); ++ if (ret) { ++ error_setg_errno(errp, -ret, ++ "failed to set up the DMA FAULT region %d", ++ fault_region_info->index); ++ goto out; ++ } ++ ++ ret = vfio_region_mmap(&vdev->dma_fault_region); ++ if (ret) { ++ error_setg_errno(errp, -ret, "Failed to mmap the DMA FAULT queue"); ++ } ++out: ++ g_free(fault_region_info); ++} ++ + static void vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) + { + VFIODevice 
*vbasedev = &vdev->vbasedev; + struct vfio_region_info *reg_info; + struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) }; ++ Error *err = NULL; + int i, ret = -1; + + /* Sanity check device */ +@@ -2675,6 +2731,12 @@ static void vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) + } + } + ++ vfio_init_fault_regions(vdev, &err); ++ if (err) { ++ error_propagate(errp, err); ++ return; ++ } ++ + irq_info.index = VFIO_PCI_ERR_IRQ_INDEX; + + ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info); +@@ -3260,6 +3322,7 @@ static void vfio_instance_finalize(Object *obj) + + vfio_display_finalize(vdev); + vfio_bars_finalize(vdev); ++ vfio_region_finalize(&vdev->dma_fault_region); + g_free(vdev->emulated_config_bits); + g_free(vdev->rom); + /* +@@ -3280,6 +3343,7 @@ static void vfio_exitfn(PCIDevice *pdev) + vfio_unregister_req_notifier(vdev); + vfio_unregister_err_notifier(vdev); + vfio_unregister_ext_irq_notifiers(vdev); ++ vfio_region_exit(&vdev->dma_fault_region); + pci_device_set_intx_routing_notifier(&vdev->pdev, NULL); + vfio_disable_interrupts(vdev); + if (vdev->intx.mmap_timer) { +diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h +index 893d074375..815154656c 100644 +--- a/hw/vfio/pci.h ++++ b/hw/vfio/pci.h +@@ -141,6 +141,7 @@ typedef struct VFIOPCIDevice { + EventNotifier err_notifier; + EventNotifier req_notifier; + VFIOPCIExtIRQ *ext_irqs; ++ VFIORegion dma_fault_region; + int (*resetfn)(struct VFIOPCIDevice *); + uint32_t vendor_id; + uint32_t device_id; +-- +2.27.0 + diff --git a/vfio.h-and-iommu.h-header-update-against-5.10.patch b/vfio.h-and-iommu.h-header-update-against-5.10.patch new file mode 100644 index 0000000000000000000000000000000000000000..721f2b6fcbc9de84c77b59ddf68da60d3d1fd255 --- /dev/null +++ b/vfio.h-and-iommu.h-header-update-against-5.10.patch @@ -0,0 +1,760 @@ +From 95435c6778f38dee9ed6f3ee6fd9e022107315d7 Mon Sep 17 00:00:00 2001 +From: Kunkun Jiang +Date: Fri, 30 Jul 2021 09:15:31 +0800 +Subject: [PATCH] vfio.h and iommu.h header update against 5.10 + +Signed-off-by: Kunkun Jiang +--- + linux-headers/linux/iommu.h | 395 ++++++++++++++++++++++++++++++++++++ + linux-headers/linux/vfio.h | 249 ++++++++++++++++++++++- + 2 files changed, 641 insertions(+), 3 deletions(-) + create mode 100644 linux-headers/linux/iommu.h + +diff --git a/linux-headers/linux/iommu.h b/linux-headers/linux/iommu.h +new file mode 100644 +index 0000000000..773b7dc2d6 +--- /dev/null ++++ b/linux-headers/linux/iommu.h +@@ -0,0 +1,395 @@ ++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++/* ++ * IOMMU user API definitions ++ */ ++ ++#ifndef IOMMU_H ++#define IOMMU_H ++ ++#include ++ ++#define IOMMU_FAULT_PERM_READ (1 << 0) /* read */ ++#define IOMMU_FAULT_PERM_WRITE (1 << 1) /* write */ ++#define IOMMU_FAULT_PERM_EXEC (1 << 2) /* exec */ ++#define IOMMU_FAULT_PERM_PRIV (1 << 3) /* privileged */ ++ ++/* Generic fault types, can be expanded IRQ remapping fault */ ++enum iommu_fault_type { ++ IOMMU_FAULT_DMA_UNRECOV = 1, /* unrecoverable fault */ ++ IOMMU_FAULT_PAGE_REQ, /* page request fault */ ++}; ++ ++enum iommu_fault_reason { ++ IOMMU_FAULT_REASON_UNKNOWN = 0, ++ ++ /* Could not access the PASID table (fetch caused external abort) */ ++ IOMMU_FAULT_REASON_PASID_FETCH, ++ ++ /* PASID entry is invalid or has configuration errors */ ++ IOMMU_FAULT_REASON_BAD_PASID_ENTRY, ++ ++ /* ++ * PASID is out of range (e.g. exceeds the maximum PASID ++ * supported by the IOMMU) or disabled. 
++ */ ++ IOMMU_FAULT_REASON_PASID_INVALID, ++ ++ /* ++ * An external abort occurred fetching (or updating) a translation ++ * table descriptor ++ */ ++ IOMMU_FAULT_REASON_WALK_EABT, ++ ++ /* ++ * Could not access the page table entry (Bad address), ++ * actual translation fault ++ */ ++ IOMMU_FAULT_REASON_PTE_FETCH, ++ ++ /* Protection flag check failed */ ++ IOMMU_FAULT_REASON_PERMISSION, ++ ++ /* access flag check failed */ ++ IOMMU_FAULT_REASON_ACCESS, ++ ++ /* Output address of a translation stage caused Address Size fault */ ++ IOMMU_FAULT_REASON_OOR_ADDRESS, ++}; ++ ++/** ++ * struct iommu_fault_unrecoverable - Unrecoverable fault data ++ * @reason: reason of the fault, from &enum iommu_fault_reason ++ * @flags: parameters of this fault (IOMMU_FAULT_UNRECOV_* values) ++ * @pasid: Process Address Space ID ++ * @perm: requested permission access using by the incoming transaction ++ * (IOMMU_FAULT_PERM_* values) ++ * @addr: offending page address ++ * @fetch_addr: address that caused a fetch abort, if any ++ */ ++struct iommu_fault_unrecoverable { ++ __u32 reason; ++#define IOMMU_FAULT_UNRECOV_PASID_VALID (1 << 0) ++#define IOMMU_FAULT_UNRECOV_ADDR_VALID (1 << 1) ++#define IOMMU_FAULT_UNRECOV_FETCH_ADDR_VALID (1 << 2) ++ __u32 flags; ++ __u32 pasid; ++ __u32 perm; ++ __u64 addr; ++ __u64 fetch_addr; ++}; ++ ++/** ++ * struct iommu_fault_page_request - Page Request data ++ * @flags: encodes whether the corresponding fields are valid and whether this ++ * is the last page in group (IOMMU_FAULT_PAGE_REQUEST_* values). ++ * When IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID is set, the page response ++ * must have the same PASID value as the page request. When it is clear, ++ * the page response should not have a PASID. ++ * @pasid: Process Address Space ID ++ * @grpid: Page Request Group Index ++ * @perm: requested page permissions (IOMMU_FAULT_PERM_* values) ++ * @addr: page address ++ * @private_data: device-specific private information ++ */ ++struct iommu_fault_page_request { ++#define IOMMU_FAULT_PAGE_REQUEST_PASID_VALID (1 << 0) ++#define IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE (1 << 1) ++#define IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA (1 << 2) ++#define IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID (1 << 3) ++ __u32 flags; ++ __u32 pasid; ++ __u32 grpid; ++ __u32 perm; ++ __u64 addr; ++ __u64 private_data[2]; ++}; ++ ++/** ++ * struct iommu_fault - Generic fault data ++ * @type: fault type from &enum iommu_fault_type ++ * @padding: reserved for future use (should be zero) ++ * @event: fault event, when @type is %IOMMU_FAULT_DMA_UNRECOV ++ * @prm: Page Request message, when @type is %IOMMU_FAULT_PAGE_REQ ++ * @padding2: sets the fault size to allow for future extensions ++ */ ++struct iommu_fault { ++ __u32 type; ++ __u32 padding; ++ union { ++ struct iommu_fault_unrecoverable event; ++ struct iommu_fault_page_request prm; ++ __u8 padding2[56]; ++ }; ++}; ++ ++/** ++ * enum iommu_page_response_code - Return status of fault handlers ++ * @IOMMU_PAGE_RESP_SUCCESS: Fault has been handled and the page tables ++ * populated, retry the access. This is "Success" in PCI PRI. ++ * @IOMMU_PAGE_RESP_FAILURE: General error. Drop all subsequent faults from ++ * this device if possible. This is "Response Failure" in PCI PRI. ++ * @IOMMU_PAGE_RESP_INVALID: Could not handle this fault, don't retry the ++ * access. This is "Invalid Request" in PCI PRI. 
++ */
++enum iommu_page_response_code {
++ IOMMU_PAGE_RESP_SUCCESS = 0,
++ IOMMU_PAGE_RESP_INVALID,
++ IOMMU_PAGE_RESP_FAILURE,
++};
++
++/**
++ * struct iommu_page_response - Generic page response information
++ * @argsz: User filled size of this data
++ * @version: API version of this structure
++ * @flags: encodes whether the corresponding fields are valid
++ * (IOMMU_FAULT_PAGE_RESPONSE_* values)
++ * @pasid: Process Address Space ID
++ * @grpid: Page Request Group Index
++ * @code: response code from &enum iommu_page_response_code
++ */
++struct iommu_page_response {
++ __u32 argsz;
++#define IOMMU_PAGE_RESP_VERSION_1 1
++ __u32 version;
++#define IOMMU_PAGE_RESP_PASID_VALID (1 << 0)
++ __u32 flags;
++ __u32 pasid;
++ __u32 grpid;
++ __u32 code;
++};
++
++/* defines the granularity of the invalidation */
++enum iommu_inv_granularity {
++ IOMMU_INV_GRANU_DOMAIN, /* domain-selective invalidation */
++ IOMMU_INV_GRANU_PASID, /* PASID-selective invalidation */
++ IOMMU_INV_GRANU_ADDR, /* page-selective invalidation */
++ IOMMU_INV_GRANU_NR, /* number of invalidation granularities */
++};
++
++/**
++ * struct iommu_inv_addr_info - Address Selective Invalidation Structure
++ *
++ * @flags: indicates the granularity of the address-selective invalidation
++ * - If the PASID bit is set, the @pasid field is populated and the invalidation
++ * relates to cache entries tagged with this PASID and matching the address
++ * range.
++ * - If ARCHID bit is set, @archid is populated and the invalidation relates
++ * to cache entries tagged with this architecture specific ID and matching
++ * the address range.
++ * - Both PASID and ARCHID can be set as they may tag different caches.
++ * - If neither PASID or ARCHID is set, global addr invalidation applies.
++ * - The LEAF flag indicates whether only the leaf PTE caching needs to be
++ * invalidated and other paging structure caches can be preserved.
++ * @pasid: process address space ID
++ * @archid: architecture-specific ID
++ * @addr: first stage/level input address
++ * @granule_size: page/block size of the mapping in bytes
++ * @nb_granules: number of contiguous granules to be invalidated
++ */
++struct iommu_inv_addr_info {
++#define IOMMU_INV_ADDR_FLAGS_PASID (1 << 0)
++#define IOMMU_INV_ADDR_FLAGS_ARCHID (1 << 1)
++#define IOMMU_INV_ADDR_FLAGS_LEAF (1 << 2)
++ __u32 flags;
++ __u32 archid;
++ __u64 pasid;
++ __u64 addr;
++ __u64 granule_size;
++ __u64 nb_granules;
++};
++
++/**
++ * struct iommu_inv_pasid_info - PASID Selective Invalidation Structure
++ *
++ * @flags: indicates the granularity of the PASID-selective invalidation
++ * - If the PASID bit is set, the @pasid field is populated and the invalidation
++ * relates to cache entries tagged with this PASID and matching the address
++ * range.
++ * - If the ARCHID bit is set, the @archid is populated and the invalidation
++ * relates to cache entries tagged with this architecture specific ID and
++ * matching the address range.
++ * - Both PASID and ARCHID can be set as they may tag different caches.
++ * - At least one of PASID or ARCHID must be set.
++ * @pasid: process address space ID
++ * @archid: architecture-specific ID
++ */
++struct iommu_inv_pasid_info {
++#define IOMMU_INV_PASID_FLAGS_PASID (1 << 0)
++#define IOMMU_INV_PASID_FLAGS_ARCHID (1 << 1)
++ __u32 flags;
++ __u32 archid;
++ __u64 pasid;
++};
++
++/**
++ * struct iommu_cache_invalidate_info - First level/stage invalidation
++ * information
++ * @argsz: User filled size of this data
++ * @version: API version of this structure
++ * @cache: bitfield that allows to select which caches to invalidate
++ * @granularity: defines the lowest granularity used for the invalidation:
++ * domain > PASID > addr
++ * @padding: reserved for future use (should be zero)
++ * @pasid_info: invalidation data when @granularity is %IOMMU_INV_GRANU_PASID
++ * @addr_info: invalidation data when @granularity is %IOMMU_INV_GRANU_ADDR
++ *
++ * Not all the combinations of cache/granularity are valid:
++ *
++ * +--------------+---------------+---------------+---------------+
++ * | type / | DEV_IOTLB | IOTLB | PASID |
++ * | granularity | | | cache |
++ * +==============+===============+===============+===============+
++ * | DOMAIN | N/A | Y | Y |
++ * +--------------+---------------+---------------+---------------+
++ * | PASID | Y | Y | Y |
++ * +--------------+---------------+---------------+---------------+
++ * | ADDR | Y | Y | N/A |
++ * +--------------+---------------+---------------+---------------+
++ *
++ * Invalidations by %IOMMU_INV_GRANU_DOMAIN don't take any argument other than
++ * @version and @cache.
++ *
++ * If multiple cache types are invalidated simultaneously, they all
++ * must support the used granularity.
++ */
++struct iommu_cache_invalidate_info {
++ __u32 argsz;
++#define IOMMU_CACHE_INVALIDATE_INFO_VERSION_1 1
++ __u32 version;
++/* IOMMU paging structure cache */
++#define IOMMU_CACHE_INV_TYPE_IOTLB (1 << 0) /* IOMMU IOTLB */
++#define IOMMU_CACHE_INV_TYPE_DEV_IOTLB (1 << 1) /* Device IOTLB */
++#define IOMMU_CACHE_INV_TYPE_PASID (1 << 2) /* PASID cache */
++#define IOMMU_CACHE_INV_TYPE_NR (3)
++ __u8 cache;
++ __u8 granularity;
++ __u8 padding[6];
++ union {
++ struct iommu_inv_pasid_info pasid_info;
++ struct iommu_inv_addr_info addr_info;
++ } granu;
++};
++
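The invalidation structures above compose as follows. The fragment below is a minimal, hypothetical sketch (it is not part of the patches) of how a VMM could fill an ASID-tagged, leaf, page-selective invalidation of the kind the SMMUv3 NH_VA path described earlier in the series produces. `container_fd` is assumed to be an open VFIO container fd, and the VFIO_IOMMU_CACHE_INVALIDATE ioctl it targets is defined further down in this header update.

#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>   /* pulls in linux/iommu.h with this patch applied */

/* Hypothetical helper: propagate one ASID-tagged, leaf, range invalidation. */
static int invalidate_asid_range(int container_fd, __u32 asid,
                                 __u64 iova, __u64 granule_size, __u64 nb)
{
    struct vfio_iommu_type1_cache_invalidate cinv;

    memset(&cinv, 0, sizeof(cinv));
    cinv.argsz = sizeof(cinv);
    cinv.info.argsz = sizeof(cinv.info);
    cinv.info.version = IOMMU_CACHE_INVALIDATE_INFO_VERSION_1;
    cinv.info.cache = IOMMU_CACHE_INV_TYPE_IOTLB;      /* IOTLB only */
    cinv.info.granularity = IOMMU_INV_GRANU_ADDR;      /* page-selective */
    cinv.info.granu.addr_info.flags = IOMMU_INV_ADDR_FLAGS_ARCHID |
                                      IOMMU_INV_ADDR_FLAGS_LEAF;
    cinv.info.granu.addr_info.archid = asid;           /* SMMU ASID */
    cinv.info.granu.addr_info.addr = iova;
    cinv.info.granu.addr_info.granule_size = granule_size;
    cinv.info.granu.addr_info.nb_granules = nb;

    return ioctl(container_fd, VFIO_IOMMU_CACHE_INVALIDATE, &cinv);
}

Setting @granularity to IOMMU_INV_GRANU_DOMAIN instead would make the addr_info payload unnecessary, matching the NH_ALL/NSNH_ALL case handled earlier in the series.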
++/**
++ * struct iommu_gpasid_bind_data_vtd - Intel VT-d specific data on device and guest
++ * SVA binding.
++ *
++ * @flags: VT-d PASID table entry attributes
++ * @pat: Page attribute table data to compute effective memory type
++ * @emt: Extended memory type
++ *
++ * Only guest vIOMMU selectable and effective options are passed down to
++ * the host IOMMU.
++ */
++struct iommu_gpasid_bind_data_vtd {
++#define IOMMU_SVA_VTD_GPASID_SRE (1 << 0) /* supervisor request */
++#define IOMMU_SVA_VTD_GPASID_EAFE (1 << 1) /* extended access enable */
++#define IOMMU_SVA_VTD_GPASID_PCD (1 << 2) /* page-level cache disable */
++#define IOMMU_SVA_VTD_GPASID_PWT (1 << 3) /* page-level write through */
++#define IOMMU_SVA_VTD_GPASID_EMTE (1 << 4) /* extended mem type enable */
++#define IOMMU_SVA_VTD_GPASID_CD (1 << 5) /* PASID-level cache disable */
++#define IOMMU_SVA_VTD_GPASID_LAST (1 << 6)
++ __u64 flags;
++ __u32 pat;
++ __u32 emt;
++};
++
++#define IOMMU_SVA_VTD_GPASID_MTS_MASK (IOMMU_SVA_VTD_GPASID_CD | \
++ IOMMU_SVA_VTD_GPASID_EMTE | \
++ IOMMU_SVA_VTD_GPASID_PCD | \
++ IOMMU_SVA_VTD_GPASID_PWT)
++
++/**
++ * struct iommu_gpasid_bind_data - Information about device and guest PASID binding
++ * @argsz: User filled size of this data
++ * @version: Version of this data structure
++ * @format: PASID table entry format
++ * @flags: Additional information on guest bind request
++ * @gpgd: Guest page directory base of the guest mm to bind
++ * @hpasid: Process address space ID used for the guest mm in host IOMMU
++ * @gpasid: Process address space ID used for the guest mm in guest IOMMU
++ * @addr_width: Guest virtual address width
++ * @padding: Reserved for future use (should be zero)
++ * @vtd: Intel VT-d specific data
++ *
++ * Guest to host PASID mapping can be an identity or non-identity, where guest
++ * has its own PASID space. For non-identity mapping, guest to host PASID lookup
++ * is needed when VM programs guest PASID into an assigned device. VMM may
++ * trap such PASID programming then request host IOMMU driver to convert guest
++ * PASID to host PASID based on this bind data.
++ */
++struct iommu_gpasid_bind_data {
++ __u32 argsz;
++#define IOMMU_GPASID_BIND_VERSION_1 1
++ __u32 version;
++#define IOMMU_PASID_FORMAT_INTEL_VTD 1
++#define IOMMU_PASID_FORMAT_LAST 2
++ __u32 format;
++ __u32 addr_width;
++#define IOMMU_SVA_GPASID_VAL (1 << 0) /* guest PASID valid */
++ __u64 flags;
++ __u64 gpgd;
++ __u64 hpasid;
++ __u64 gpasid;
++ __u8 padding[8];
++ /* Vendor specific data */
++ union {
++ struct iommu_gpasid_bind_data_vtd vtd;
++ } vendor;
++};
++
++/**
++ * struct iommu_pasid_smmuv3 - ARM SMMUv3 Stream Table Entry stage 1 related
++ * information
++ * @version: API version of this structure
++ * @s1fmt: STE s1fmt (format of the CD table: single CD, linear table
++ * or 2-level table)
++ * @s1dss: STE s1dss (specifies the behavior when @pasid_bits != 0
++ * and no PASID is passed along with the incoming transaction)
++ * @padding: reserved for future use (should be zero)
++ *
++ * The PASID table is referred to as the Context Descriptor (CD) table on ARM
++ * SMMUv3. Please refer to the ARM SMMU 3.x spec (ARM IHI 0070A) for full
++ * details.
++ */
++struct iommu_pasid_smmuv3 {
++#define PASID_TABLE_SMMUV3_CFG_VERSION_1 1
++ __u32 version;
++ __u8 s1fmt;
++ __u8 s1dss;
++ __u8 padding[2];
++};
++
++/**
++ * struct iommu_pasid_table_config - PASID table data used to bind guest PASID
++ * table to the host IOMMU
++ * @argsz: User filled size of this data
++ * @version: API version to prepare for future extensions
++ * @base_ptr: guest physical address of the PASID table
++ * @format: format of the PASID table
++ * @pasid_bits: number of PASID bits used in the PASID table
++ * @config: indicates whether the guest translation stage must
++ * be translated, bypassed or aborted.
++ * @padding: reserved for future use (should be zero)
++ * @vendor_data.smmuv3: table information when @format is
++ * %IOMMU_PASID_FORMAT_SMMUV3
++ */
++struct iommu_pasid_table_config {
++ __u32 argsz;
++#define PASID_TABLE_CFG_VERSION_1 1
++ __u32 version;
++ __u64 base_ptr;
++#define IOMMU_PASID_FORMAT_SMMUV3 1
++ __u32 format;
++ __u8 pasid_bits;
++#define IOMMU_PASID_CONFIG_TRANSLATE 1
++#define IOMMU_PASID_CONFIG_BYPASS 2
++#define IOMMU_PASID_CONFIG_ABORT 3
++ __u8 config;
++ __u8 padding[2];
++ union {
++ struct iommu_pasid_smmuv3 smmuv3;
++ } vendor_data;
++};
++
++#endif /* _UAPI_IOMMU_H */
+diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
+index 120387ba58..d6edfbd2f5 100644
+--- a/linux-headers/linux/vfio.h
++++ b/linux-headers/linux/vfio.h
+@@ -14,6 +14,7 @@
+
+ #include <linux/types.h>
+ #include <linux/ioctl.h>
++#include <linux/iommu.h>
+
+ #define VFIO_API_VERSION 0
+
+@@ -211,8 +212,11 @@ struct vfio_device_info {
+ #define VFIO_DEVICE_FLAGS_AMBA (1 << 3) /* vfio-amba device */
+ #define VFIO_DEVICE_FLAGS_CCW (1 << 4) /* vfio-ccw device */
+ #define VFIO_DEVICE_FLAGS_AP (1 << 5) /* vfio-ap device */
++#define VFIO_DEVICE_FLAGS_FSL_MC (1 << 6) /* vfio-fsl-mc device */
++#define VFIO_DEVICE_FLAGS_CAPS (1 << 7) /* Info supports caps */
+ __u32 num_regions; /* Max region index + 1 */
+ __u32 num_irqs; /* Max IRQ index + 1 */
++ __u32 cap_offset; /* Offset within info struct of first cap */
+ };
+ #define VFIO_DEVICE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 7)
+
+@@ -228,6 +232,15 @@ struct vfio_device_info {
+ #define VFIO_DEVICE_API_CCW_STRING "vfio-ccw"
+ #define VFIO_DEVICE_API_AP_STRING "vfio-ap"
+
++/*
++ * The following capabilities are unique to s390 zPCI devices. Their contents
++ * are further-defined in vfio_zdev.h
++ */
++#define VFIO_DEVICE_INFO_CAP_ZPCI_BASE 1
++#define VFIO_DEVICE_INFO_CAP_ZPCI_GROUP 2
++#define VFIO_DEVICE_INFO_CAP_ZPCI_UTIL 3
++#define VFIO_DEVICE_INFO_CAP_ZPCI_PFIP 4
++
+ /**
+ * VFIO_DEVICE_GET_REGION_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 8,
+ * struct vfio_region_info)
+@@ -316,6 +329,7 @@ struct vfio_region_info_cap_type {
+ #define VFIO_REGION_TYPE_GFX (1)
+ #define VFIO_REGION_TYPE_CCW (2)
+ #define VFIO_REGION_TYPE_MIGRATION (3)
++#define VFIO_REGION_TYPE_NESTED (4)
+
+ /* sub-types for VFIO_REGION_TYPE_PCI_* */
+
+@@ -340,6 +354,10 @@ struct vfio_region_info_cap_type {
+ /* sub-types for VFIO_REGION_TYPE_GFX */
+ #define VFIO_REGION_SUBTYPE_GFX_EDID (1)
+
++/* sub-types for VFIO_REGION_TYPE_NESTED */
++#define VFIO_REGION_SUBTYPE_NESTED_DMA_FAULT (1)
++#define VFIO_REGION_SUBTYPE_NESTED_DMA_FAULT_RESPONSE (2)
++
+ /**
+ * struct vfio_region_gfx_edid - EDID region layout.
+ *
+@@ -472,7 +490,7 @@ struct vfio_region_gfx_edid {
+ * 5. Resumed
+ * |--------->|
+ *
+- * 0. Default state of VFIO device is _RUNNNG when the user application starts.
++ * 0. Default state of VFIO device is _RUNNING when the user application starts.
+ * 1. During normal shutdown of the user application, the user application may
+ * optionally change the VFIO device state from _RUNNING to _STOP. This
+ * transition is optional. The vendor driver must support this transition but
+@@ -695,11 +713,30 @@ struct vfio_irq_info
+ #define VFIO_IRQ_INFO_MASKABLE (1 << 1)
+ #define VFIO_IRQ_INFO_AUTOMASKED (1 << 2)
+ #define VFIO_IRQ_INFO_NORESIZE (1 << 3)
++#define VFIO_IRQ_INFO_FLAG_CAPS (1 << 4) /* Info supports caps */
+ __u32 index; /* IRQ index */
+ __u32 count; /* Number of IRQs within this index */
++ __u32 cap_offset; /* Offset within info struct of first cap */
+ };
+ #define VFIO_DEVICE_GET_IRQ_INFO _IO(VFIO_TYPE, VFIO_BASE + 9)
+
++/*
++ * The irq type capability allows IRQs unique to a specific device or
++ * class of devices to be exposed.
++ *
++ * The structures below define version 1 of this capability.
++ */
++#define VFIO_IRQ_INFO_CAP_TYPE 3
++
++struct vfio_irq_info_cap_type {
++ struct vfio_info_cap_header header;
++ __u32 type; /* global per bus driver */
++ __u32 subtype; /* type specific */
++};
++
++#define VFIO_IRQ_TYPE_NESTED (1)
++#define VFIO_IRQ_SUBTYPE_DMA_FAULT (1)
++
+ /**
+ * VFIO_DEVICE_SET_IRQS - _IOW(VFIO_TYPE, VFIO_BASE + 10, struct vfio_irq_set)
+ *
+@@ -801,7 +838,8 @@ enum {
+ VFIO_PCI_MSIX_IRQ_INDEX,
+ VFIO_PCI_ERR_IRQ_INDEX,
+ VFIO_PCI_REQ_IRQ_INDEX,
+- VFIO_PCI_NUM_IRQS
++ VFIO_PCI_NUM_IRQS = 5 /* Fixed user ABI, IRQ indexes >=5 use */
++ /* device specific cap to define content */
+ };
+
+ /*
+@@ -985,6 +1023,68 @@ struct vfio_device_feature {
+ */
+ #define VFIO_DEVICE_FEATURE_PCI_VF_TOKEN (0)
+
++/*
++ * Capability exposed by the DMA fault region
++ * @version: ABI version
++ */
++#define VFIO_REGION_INFO_CAP_DMA_FAULT 6
++
++struct vfio_region_info_cap_fault {
++ struct vfio_info_cap_header header;
++ __u32 version;
++};
++
++/*
++ * Capability exposed by the DMA fault response region
++ * @version: ABI version
++ */
++#define VFIO_REGION_INFO_CAP_DMA_FAULT_RESPONSE 7
++
++struct vfio_region_info_cap_fault_response {
++ struct vfio_info_cap_header header;
++ __u32 version;
++};
++
++/*
++ * DMA Fault Region Layout
++ * @tail: index relative to the start of the ring buffer at which the
++ * consumer finds the next item in the buffer
++ * @entry_size: fault ring buffer entry size in bytes
++ * @nb_entries: max capacity of the fault ring buffer
++ * @offset: ring buffer offset relative to the start of the region
++ * @head: index relative to the start of the ring buffer at which the
++ * producer (kernel) inserts items into the buffers
++ */
++struct vfio_region_dma_fault {
++ /* Write-Only */
++ __u32 tail;
++ /* Read-Only */
++ __u32 entry_size;
++ __u32 nb_entries;
++ __u32 offset;
++ __u32 head;
++};
++
++/*
++ * DMA Fault Response Region Layout
++ * @head: index relative to the start of the ring buffer at which the
++ * producer (userspace) inserts responses into the buffer
++ * @entry_size: fault ring buffer entry size in bytes
++ * @nb_entries: max capacity of the fault ring buffer
++ * @offset: ring buffer offset relative to the start of the region
++ * @tail: index relative to the start of the ring buffer at which the
++ * consumer (kernel) finds the next item in the buffer
++ */
++struct vfio_region_dma_fault_response {
++ /* Write-Only */
++ __u32 head;
++ /* Read-Only */
++ __u32 entry_size;
++ __u32 nb_entries;
++ __u32 offset;
++ __u32 tail;
++};
++
+ /* -------- API for Type1 VFIO IOMMU -------- */
+
+ /**
+@@ -1049,6 +1149,21 @@ struct vfio_iommu_type1_info_cap_migration {
+ __u64 max_dirty_bitmap_size; /* in bytes */
+ };
+
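A consumer of the DMA fault region defined above would typically read the header, walk entries from @tail towards @head, and publish the new tail once the entries are handled. The sketch below illustrates that contract under stated assumptions: it accesses the region with pread/pwrite on the device fd (a real implementation may mmap the mmap-able part of the region instead), it assumes @entry_size equals sizeof(struct iommu_fault), and `fault_off`, the file offset of the VFIO_REGION_SUBTYPE_NESTED_DMA_FAULT region obtained via VFIO_DEVICE_GET_REGION_INFO, is hypothetical.

#include <stddef.h>
#include <string.h>
#include <unistd.h>
#include <linux/vfio.h>

/* Minimal sketch: drain the DMA fault ring of one device. */
static void drain_dma_faults(int device_fd, off_t fault_off)
{
    struct vfio_region_dma_fault header;
    struct iommu_fault fault;
    __u32 tail;

    /* The header sits at the start of the region. */
    if (pread(device_fd, &header, sizeof(header), fault_off) != sizeof(header))
        return;
    if (header.entry_size != sizeof(fault) || header.nb_entries == 0)
        return;

    /* @head is a single snapshot; new faults may arrive meanwhile. */
    tail = header.tail;
    while (tail != header.head) {
        /* Entries live at @offset, each @entry_size bytes. */
        off_t entry = fault_off + header.offset +
                      (off_t)tail * header.entry_size;

        if (pread(device_fd, &fault, sizeof(fault), entry) != sizeof(fault))
            break;
        if (fault.type == IOMMU_FAULT_DMA_UNRECOV) {
            /* e.g. inject the event into the guest SMMUv3 model */
        }
        tail = (tail + 1) % header.nb_entries;
    }

    /* Publish the new tail so the kernel can reuse the slots. */
    pwrite(device_fd, &tail, sizeof(tail),
           fault_off + offsetof(struct vfio_region_dma_fault, tail));
}

The response region follows the mirrored convention: userspace is the producer, filling struct iommu_page_response entries and advancing @head, while the kernel consumes from @tail.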
++/*
++ * The DMA available capability allows to report the current number of
++ * simultaneously outstanding DMA mappings that are allowed.
++ *
++ * The structure below defines version 1 of this capability.
++ *
++ * avail: specifies the current number of outstanding DMA mappings allowed.
++ */
++#define VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL 3
++
++struct vfio_iommu_type1_info_dma_avail {
++ struct vfio_info_cap_header header;
++ __u32 avail;
++};
++
+ #define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
+
+ /**
+@@ -1072,7 +1187,7 @@ struct vfio_iommu_type1_dma_map {
+ struct vfio_bitmap {
+ __u64 pgsize; /* page size for bitmap in bytes */
+ __u64 size; /* in bytes */
+- __u64 *data; /* one bit per page */
++ __u64 *data; /* one bit per page */
+ };
+
+ /**
+@@ -1188,6 +1303,134 @@ struct vfio_iommu_type1_dirty_bitmap_get {
+
+ #define VFIO_IOMMU_DIRTY_PAGES _IO(VFIO_TYPE, VFIO_BASE + 17)
+
++/*
++ * VFIO_IOMMU_BIND_PROCESS
++ *
++ * Allocate a PASID for a process address space, and use it to attach this
++ * process to all devices in the container. Devices can then tag their DMA
++ * traffic with the returned @pasid to perform transactions on the associated
++ * virtual address space. Mapping and unmapping buffers is performed by standard
++ * functions such as mmap and malloc.
++ *
++ * If flag is VFIO_IOMMU_BIND_PID, @pid contains the pid of a foreign process to
++ * bind. Otherwise the current task is bound. Given that the caller owns the
++ * device, setting this flag grants the caller read and write permissions on the
++ * entire address space of foreign process described by @pid. Therefore,
++ * permission to perform the bind operation on a foreign process is governed by
++ * the ptrace access mode PTRACE_MODE_ATTACH_REALCREDS check. See man ptrace(2)
++ * for more information.
++ *
++ * On success, VFIO writes a Process Address Space ID (PASID) into @pasid. This
++ * ID is unique to a process and can be used on all devices in the container.
++ *
++ * On fork, the child inherits the device fd and can use the bonds setup by its
++ * parent. Consequently, the child has R/W access on the address spaces bound by
++ * its parent. After an execv, the device fd is closed and the child doesn't
++ * have access to the address space anymore.
++ *
++ * To remove a bond between process and container, VFIO_IOMMU_UNBIND ioctl is
++ * issued with the same parameters. If a pid was specified in VFIO_IOMMU_BIND,
++ * it should also be present for VFIO_IOMMU_UNBIND. Otherwise unbind the current
++ * task from the container.
++ */
++struct vfio_iommu_type1_bind_process {
++ __u32 flags;
++#define VFIO_IOMMU_BIND_PID (1 << 0)
++ __u32 pasid;
++ __s32 pid;
++};
++
++/*
++ * Only mode supported at the moment is VFIO_IOMMU_BIND_PROCESS, which takes
++ * vfio_iommu_type1_bind_process in data.
++ */
++struct vfio_iommu_type1_bind {
++ __u32 argsz;
++ __u32 flags;
++#define VFIO_IOMMU_BIND_PROCESS (1 << 0)
++ __u8 data[];
++};
++
++/*
++ * VFIO_IOMMU_BIND - _IOWR(VFIO_TYPE, VFIO_BASE + 22, struct vfio_iommu_bind)
++ *
++ * Manage address spaces of devices in this container. Initially a TYPE1
++ * container can only have one address space, managed with
++ * VFIO_IOMMU_MAP/UNMAP_DMA.
++ *
++ * An IOMMU of type VFIO_TYPE1_NESTING_IOMMU can be managed by both MAP/UNMAP
++ * and BIND ioctls at the same time. MAP/UNMAP acts on the stage-2 (host) page
++ * tables, and BIND manages the stage-1 (guest) page tables. Other types of
++ * IOMMU may allow MAP/UNMAP and BIND to coexist, where MAP/UNMAP controls
++ * non-PASID traffic and BIND controls PASID traffic. But this depends on the
++ * underlying IOMMU architecture and isn't guaranteed.
++ *
++ * Availability of this feature depends on the device, its bus, the underlying
++ * IOMMU and the CPU architecture.
++ *
++ * returns: 0 on success, -errno on failure.
++ */
++#define VFIO_IOMMU_BIND _IO(VFIO_TYPE, VFIO_BASE + 22)
++
++/*
++ * VFIO_IOMMU_UNBIND - _IOWR(VFIO_TYPE, VFIO_BASE + 23, struct vfio_iommu_bind)
++ *
++ * Undo what was done by the corresponding VFIO_IOMMU_BIND ioctl.
++ */
++#define VFIO_IOMMU_UNBIND _IO(VFIO_TYPE, VFIO_BASE + 23)
++
++/*
++ * VFIO_IOMMU_SET_PASID_TABLE - _IOWR(VFIO_TYPE, VFIO_BASE + 18,
++ * struct vfio_iommu_type1_set_pasid_table)
++ *
++ * The SET operation passes a PASID table to the host while the
++ * UNSET operation detaches the one currently programmed. It is
++ * allowed to "SET" the table several times without unsetting as
++ * long as the table config does not stay IOMMU_PASID_CONFIG_TRANSLATE.
++ */
++struct vfio_iommu_type1_set_pasid_table {
++ __u32 argsz;
++ __u32 flags;
++#define VFIO_PASID_TABLE_FLAG_SET (1 << 0)
++#define VFIO_PASID_TABLE_FLAG_UNSET (1 << 1)
++ struct iommu_pasid_table_config config; /* used on SET */
++};
++
++#define VFIO_IOMMU_SET_PASID_TABLE _IO(VFIO_TYPE, VFIO_BASE + 18)
++
++/**
++ * VFIO_IOMMU_CACHE_INVALIDATE - _IOWR(VFIO_TYPE, VFIO_BASE + 19,
++ * struct vfio_iommu_type1_cache_invalidate)
++ *
++ * Propagate guest IOMMU cache invalidation to the host.
++ */
++struct vfio_iommu_type1_cache_invalidate {
++ __u32 argsz;
++ __u32 flags;
++ struct iommu_cache_invalidate_info info;
++};
++#define VFIO_IOMMU_CACHE_INVALIDATE _IO(VFIO_TYPE, VFIO_BASE + 19)
++
++/**
++ * VFIO_IOMMU_SET_MSI_BINDING - _IOWR(VFIO_TYPE, VFIO_BASE + 20,
++ * struct vfio_iommu_type1_set_msi_binding)
++ *
++ * Pass a stage 1 MSI doorbell mapping to the host so that the
++ * latter can build a nested stage2 mapping. Or conversely tear
++ * down a previously bound stage 1 MSI binding.
++ */
++struct vfio_iommu_type1_set_msi_binding {
++ __u32 argsz;
++ __u32 flags;
++#define VFIO_IOMMU_BIND_MSI (1 << 0)
++#define VFIO_IOMMU_UNBIND_MSI (1 << 1)
++ __u64 iova; /* MSI guest IOVA */
++ /* Fields below are used on BIND */
++ __u64 gpa; /* MSI guest physical address */
++ __u64 size; /* size of stage1 mapping (bytes) */
++};
++#define VFIO_IOMMU_SET_MSI_BINDING _IO(VFIO_TYPE, VFIO_BASE + 20)
++
+ /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
+
+ /*
+--
+2.27.0
+
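Taken together, VFIO_IOMMU_SET_PASID_TABLE and VFIO_IOMMU_SET_MSI_BINDING are what a VMM would drive when the guest attaches a stage-1 configuration: first point the host at the guest CD table, then bind the MSI doorbell(s). The fragment below is a minimal, hypothetical sketch of that sequence, not code from the series; `container_fd` and all parameter values are assumptions, s1fmt/s1dss are left at zero, and error handling is reduced to the bare minimum.

#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>   /* pulls in linux/iommu.h with this patch applied */

/* Hypothetical glue: attach a guest SMMUv3 CD table and one MSI binding. */
static int setup_nested_stage(int container_fd, __u64 cd_table_gpa,
                              __u64 msi_iova, __u64 msi_gpa, __u64 msi_size)
{
    struct vfio_iommu_type1_set_pasid_table spt;
    struct vfio_iommu_type1_set_msi_binding msib;

    memset(&spt, 0, sizeof(spt));
    spt.argsz = sizeof(spt);
    spt.flags = VFIO_PASID_TABLE_FLAG_SET;
    spt.config.argsz = sizeof(spt.config);
    spt.config.version = PASID_TABLE_CFG_VERSION_1;
    spt.config.format = IOMMU_PASID_FORMAT_SMMUV3;
    spt.config.base_ptr = cd_table_gpa;     /* guest PA of the CD table */
    spt.config.pasid_bits = 0;              /* single CD, no substreams */
    spt.config.config = IOMMU_PASID_CONFIG_TRANSLATE;
    spt.config.vendor_data.smmuv3.version = PASID_TABLE_SMMUV3_CFG_VERSION_1;

    if (ioctl(container_fd, VFIO_IOMMU_SET_PASID_TABLE, &spt))
        return -1;

    memset(&msib, 0, sizeof(msib));
    msib.argsz = sizeof(msib);
    msib.flags = VFIO_IOMMU_BIND_MSI;
    msib.iova = msi_iova;                   /* stage-1 output address */
    msib.gpa = msi_gpa;                     /* doorbell guest PA */
    msib.size = msi_size;
    return ioctl(container_fd, VFIO_IOMMU_SET_MSI_BINDING, &msib);
}

On guest reset the same structures would be reused with VFIO_PASID_TABLE_FLAG_UNSET and VFIO_IOMMU_UNBIND_MSI to tear the nested stage down.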