diff --git a/bios-tables-test-Allow-changes-to-q35-SSDT.dimmpxm-f.patch b/bios-tables-test-Allow-changes-to-q35-SSDT.dimmpxm-f.patch new file mode 100644 index 0000000000000000000000000000000000000000..8228abcd539d2ad17cd911cfc7a5281b7beda939 --- /dev/null +++ b/bios-tables-test-Allow-changes-to-q35-SSDT.dimmpxm-f.patch @@ -0,0 +1,23 @@ +From 00c4115a1388ee72295b99fce1f6ad49bf761134 Mon Sep 17 00:00:00 2001 +From: Yan Wang +Date: Thu, 10 Feb 2022 17:08:08 +0800 +Subject: [PATCH] bios-tables-test: Allow changes to q35/SSDT.dimmpxm file + +List test/data/acpi/q35/SSDT.dimmpxm as the expected files allowed to +be changed in tests/qtest/bios-tables-test-allowed-diff.h + +Signed-off-by: Yan Wang +--- + tests/qtest/bios-tables-test-allowed-diff.h | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/tests/qtest/bios-tables-test-allowed-diff.h b/tests/qtest/bios-tables-test-allowed-diff.h +index dfb8523c8b..81148a604f 100644 +--- a/tests/qtest/bios-tables-test-allowed-diff.h ++++ b/tests/qtest/bios-tables-test-allowed-diff.h +@@ -1 +1,2 @@ + /* List of comma-separated changed AML files to ignore */ ++"tests/data/acpi/q35/SSDT.dimmpxm", +-- +2.27.0 + diff --git a/bios-tables-test-Update-expected-q35-SSDT.dimmpxm-fi.patch b/bios-tables-test-Update-expected-q35-SSDT.dimmpxm-fi.patch new file mode 100644 index 0000000000000000000000000000000000000000..f41a702b99e3a9e8a7c3a4ff6b4cc9aeb7192cf9 --- /dev/null +++ b/bios-tables-test-Update-expected-q35-SSDT.dimmpxm-fi.patch @@ -0,0 +1,88 @@ +From 8940f11a055da0a744d10b53cf999dea7967be25 Mon Sep 17 00:00:00 2001 +From: Yan Wang +Date: Thu, 10 Feb 2022 17:12:35 +0800 +Subject: [PATCH] bios-tables-test: Update expected q35/SSDT.dimmpxm file + +Run ./tests/data/acpi/rebuild-expected-aml.sh from build directory +to update q35/SSDT.dimmpxm file. Also empty bios-tables-test-allowed-diff.h. 
+ +The disassembled differences between actual and expected SSDT.dimmpxm: + + /* + * Intel ACPI Component Architecture + * AML/ASL+ Disassembler version 20210604 (64-bit version) + * Copyright (c) 2000 - 2021 Intel Corporation + * + * Disassembling to symbolic ASL+ operators + * +- * Disassembly of tests/data/acpi/q35/SSDT.dimmpxm, Thu Feb 10 15:03:52 2022 ++ * Disassembly of /tmp/aml-CK68G1, Thu Feb 10 15:03:52 2022 + * + * Original Table Header: + * Signature "SSDT" + * Length 0x000002DE (734) + * Revision 0x01 +- * Checksum 0x06 ++ * Checksum 0x16 + * OEM ID "BOCHS " + * OEM Table ID "NVDIMM " + * OEM Revision 0x00000001 (1) + * Compiler ID "BXPC" + * Compiler Version 0x00000001 (1) + */ + DefinitionBlock ("", "SSDT", 1, "BOCHS ", "NVDIMM ", 0x00000001) + { + Scope (\_SB) + { + Device (NVDR) + { + Name (_HID, "ACPI0012" /* NVDIMM Root Device */) // _HID: Hardware ID + Method (NCAL, 5, Serialized) + { + Local6 = MEMA /* \MEMA */ +@@ -187,19 +187,19 @@ + { + Return (NCAL (Arg0, Arg1, Arg2, Arg3, 0x02)) + } + } + + Device (NV02) + { + Name (_ADR, 0x03) // _ADR: Address + Method (_DSM, 4, NotSerialized) // _DSM: Device-Specific Method + { + Return (NCAL (Arg0, Arg1, Arg2, Arg3, 0x03)) + } + } + } + } + +- Name (MEMA, 0x07FFF000) ++ Name (MEMA, 0x07FFE000) + } + +Signed-off-by: Yan Wang +--- + tests/data/acpi/q35/SSDT.dimmpxm | Bin 734 -> 734 bytes + tests/qtest/bios-tables-test-allowed-diff.h | 1 - + 2 files changed, 1 deletion(-) + +diff --git a/tests/data/acpi/q35/SSDT.dimmpxm b/tests/data/acpi/q35/SSDT.dimmpxm +index 617a1c911c7d6753bcedc8ecc52e3027a5259ad6..a50a961fa1d9b0dd8ea4096d652c83bcf04db20b 100644 +GIT binary patch +delta 23 +fcmcb|dXJSWIM^lR9uortqu55Si%iT9{<8xBSkVW4 + +delta 23 +fcmcb|dXJSWIM^lR9uortBilx +Date: Fri, 19 Mar 2021 12:22:48 -0400 +Subject: [PATCH] hw/arm/smmu-common: Allow domain invalidation for + NH_ALL/NSNH_ALL + +NH_ALL/NSNH_ALL corresponds to a domain granularity invalidation, +ie. all the notifier range gets invalidation, whatever the ASID. +So let's set the granularity to IOMMU_INV_GRAN_DOMAIN to allow +the consumer to benefit from the info if it can. + +Signed-off-by: Eric Auger +Suggested-by: chenxiang (M) +Signed-off-by: Kunkun Jiang +--- + hw/arm/smmu-common.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c +index 3a1ecf81d6..2ec4222c93 100644 +--- a/hw/arm/smmu-common.c ++++ b/hw/arm/smmu-common.c +@@ -477,6 +477,7 @@ static void smmu_unmap_notifier_range(IOMMUNotifier *n) + event.entry.iova = n->start; + event.entry.perm = IOMMU_NONE; + event.entry.addr_mask = n->end - n->start; ++ event.entry.granularity = IOMMU_INV_GRAN_DOMAIN; + + memory_region_notify_iommu_one(n, &event); + } +-- +2.27.0 + diff --git a/hw-arm-smmuv3-Advertise-MSI_TRANSLATE-attribute.patch b/hw-arm-smmuv3-Advertise-MSI_TRANSLATE-attribute.patch new file mode 100644 index 0000000000000000000000000000000000000000..e8d397824f80fff77c7260bfb4dcec67e410b250 --- /dev/null +++ b/hw-arm-smmuv3-Advertise-MSI_TRANSLATE-attribute.patch @@ -0,0 +1,32 @@ +From 5a759ab19d508361053e388694546216705d173b Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Tue, 28 Aug 2018 09:21:53 -0400 +Subject: [PATCH] hw/arm/smmuv3: Advertise MSI_TRANSLATE attribute + +The SMMUv3 has the peculiarity to translate MSI +transactionss. let's advertise the corresponding +attribute. 
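+
+For illustration, a consumer such as the VFIO code can probe this
+attribute through the existing memory_region_iommu_get_attr() helper.
+A minimal sketch follows; the wrapper function itself is hypothetical
+and not part of this patch:
+
+    #include "qemu/osdep.h"
+    #include "exec/memory.h"
+
+    /* Returns true only when the vIOMMU advertises MSI translation. */
+    static bool iommu_mr_translates_msi(IOMMUMemoryRegion *iommu_mr)
+    {
+        bool msi_translate = false;
+
+        /* memory_region_iommu_get_attr() returns 0 on success and
+         * -EINVAL when the attribute is not implemented/advertised. */
+        if (memory_region_iommu_get_attr(iommu_mr,
+                                         IOMMU_ATTR_MSI_TRANSLATE,
+                                         &msi_translate)) {
+            return false;
+        }
+        return msi_translate;
+    }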
+ +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/arm/smmuv3.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index 9b87d16217..12f354a0d5 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -1596,6 +1596,9 @@ static int smmuv3_get_attr(IOMMUMemoryRegion *iommu, + if (attr == IOMMU_ATTR_VFIO_NESTED) { + *(bool *) data = true; + return 0; ++ } else if (attr == IOMMU_ATTR_MSI_TRANSLATE) { ++ *(bool *) data = true; ++ return 0; + } + return -EINVAL; + } +-- +2.27.0 + diff --git a/hw-arm-smmuv3-Allow-MAP-notifiers.patch b/hw-arm-smmuv3-Allow-MAP-notifiers.patch new file mode 100644 index 0000000000000000000000000000000000000000..1d82532d1262f420e3d4bb39f27579d85f437026 --- /dev/null +++ b/hw-arm-smmuv3-Allow-MAP-notifiers.patch @@ -0,0 +1,37 @@ +From dc126664134989975ce9ab9e7d5d2c8916628bf6 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Wed, 18 Mar 2020 11:17:36 +0100 +Subject: [PATCH] hw/arm/smmuv3: Allow MAP notifiers + +We now have all bricks to support nested paging. This +uses MAP notifiers to map the MSIs. So let's allow MAP +notifiers to be registered. + +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/arm/smmuv3.c | 8 -------- + 1 file changed, 8 deletions(-) + +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index 9aeb420428..45f21c53fe 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -1628,14 +1628,6 @@ static int smmuv3_notify_flag_changed(IOMMUMemoryRegion *iommu, + return -EINVAL; + } + +- if (new & IOMMU_NOTIFIER_MAP) { +- error_setg(errp, +- "device %02x.%02x.%x requires iommu MAP notifier which is " +- "not currently supported", pci_bus_num(sdev->bus), +- PCI_SLOT(sdev->devfn), PCI_FUNC(sdev->devfn)); +- return -EINVAL; +- } +- + if (old == IOMMU_NOTIFIER_NONE) { + trace_smmuv3_notify_flag_add(iommu->parent_obj.name); + QLIST_INSERT_HEAD(&s->devices_with_notifiers, sdev, next); +-- +2.27.0 + diff --git a/hw-arm-smmuv3-Fill-the-IOTLBEntry-arch_id-on-NH_VA-i.patch b/hw-arm-smmuv3-Fill-the-IOTLBEntry-arch_id-on-NH_VA-i.patch new file mode 100644 index 0000000000000000000000000000000000000000..646a95bd19bab7b28a797433b0321edc45512657 --- /dev/null +++ b/hw-arm-smmuv3-Fill-the-IOTLBEntry-arch_id-on-NH_VA-i.patch @@ -0,0 +1,34 @@ +From dcda615b3d9b1acffee3d31d57974cc9e4bd0dee Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Tue, 4 Sep 2018 08:48:33 -0400 +Subject: [PATCH] hw/arm/smmuv3: Fill the IOTLBEntry arch_id on NH_VA + invalidation + +When the guest invalidates one S1 entry, it passes the asid. +When propagating this invalidation downto the host, the asid +information also must be passed. So let's fill the arch_id field +introduced for that purpose and accordingly set the flags to +indicate its presence. 
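+
+On the consumer side, arch_id is only meaningful when the matching
+flag is set. A hedged sketch of an UNMAP notifier honouring the tag
+(the function is hypothetical; the field and flag names come from the
+"memory: Add new fields in IOTLBEntry" patch of this series):
+
+    /* Consumer view of an ASID-tagged invalidation. */
+    static void example_unmap_notify(IOMMUNotifier *n,
+                                     IOMMUTLBEntry *entry)
+    {
+        if (entry->flags & IOMMU_INV_FLAGS_ARCHID) {
+            /* Only TLB entries tagged with entry->arch_id are stale. */
+        } else {
+            /* No ASID information: invalidate the range for all ASIDs. */
+        }
+    }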
+ +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/arm/smmuv3.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index 3416f6a639..696c588f08 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -833,6 +833,8 @@ static void smmuv3_notify_iova(IOMMUMemoryRegion *mr, + event.entry.iova = iova; + event.entry.addr_mask = num_pages * (1 << granule) - 1; + event.entry.perm = IOMMU_NONE; ++ event.entry.flags = IOMMU_INV_FLAGS_ARCHID; ++ event.entry.arch_id = asid; + + memory_region_notify_iommu_one(n, &event); + } +-- +2.27.0 + diff --git a/hw-arm-smmuv3-Fill-the-IOTLBEntry-leaf-field-on-NH_V.patch b/hw-arm-smmuv3-Fill-the-IOTLBEntry-leaf-field-on-NH_V.patch new file mode 100644 index 0000000000000000000000000000000000000000..f5f3db19ea9ca70944d1da3be402ce800226d08c --- /dev/null +++ b/hw-arm-smmuv3-Fill-the-IOTLBEntry-leaf-field-on-NH_V.patch @@ -0,0 +1,77 @@ +From c219274b7b6a472d7340a4f72a052ba33ed19659 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Thu, 14 Mar 2019 09:55:13 -0400 +Subject: [PATCH] hw/arm/smmuv3: Fill the IOTLBEntry leaf field on NH_VA + invalidation + +Let's propagate the leaf attribute throughout the invalidation path. +This hint is used to reduce the scope of the invalidations to the +last level of translation. Not enforcing it induces large performance +penalties in nested mode. + +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/arm/smmuv3.c | 11 ++++++----- + 1 file changed, 6 insertions(+), 5 deletions(-) + +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index 696c588f08..ad816e850c 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -800,7 +800,7 @@ epilogue: + static void smmuv3_notify_iova(IOMMUMemoryRegion *mr, + IOMMUNotifier *n, + int asid, dma_addr_t iova, +- uint8_t tg, uint64_t num_pages) ++ uint8_t tg, uint64_t num_pages, bool leaf) + { + SMMUDevice *sdev = container_of(mr, SMMUDevice, iommu); + IOMMUTLBEvent event = {}; +@@ -835,6 +835,7 @@ static void smmuv3_notify_iova(IOMMUMemoryRegion *mr, + event.entry.perm = IOMMU_NONE; + event.entry.flags = IOMMU_INV_FLAGS_ARCHID; + event.entry.arch_id = asid; ++ event.entry.leaf = leaf; + + memory_region_notify_iommu_one(n, &event); + } +@@ -866,7 +867,7 @@ static void smmuv3_notify_asid(IOMMUMemoryRegion *mr, + + /* invalidate an asid/iova range tuple in all mr's */ + static void smmuv3_inv_notifiers_iova(SMMUState *s, int asid, dma_addr_t iova, +- uint8_t tg, uint64_t num_pages) ++ uint8_t tg, uint64_t num_pages, bool leaf) + { + SMMUDevice *sdev; + +@@ -878,7 +879,7 @@ static void smmuv3_inv_notifiers_iova(SMMUState *s, int asid, dma_addr_t iova, + tg, num_pages); + + IOMMU_NOTIFIER_FOREACH(n, mr) { +- smmuv3_notify_iova(mr, n, asid, iova, tg, num_pages); ++ smmuv3_notify_iova(mr, n, asid, iova, tg, num_pages, leaf); + } + } + } +@@ -903,7 +904,7 @@ static void smmuv3_s1_range_inval(SMMUState *s, Cmd *cmd) + + if (!tg) { + trace_smmuv3_s1_range_inval(vmid, asid, addr, tg, 1, ttl, leaf); +- smmuv3_inv_notifiers_iova(s, asid, addr, tg, 1); ++ smmuv3_inv_notifiers_iova(s, asid, addr, tg, 1, leaf); + smmu_iotlb_inv_iova(s, asid, addr, tg, 1, ttl); + return; + } +@@ -921,7 +922,7 @@ static void smmuv3_s1_range_inval(SMMUState *s, Cmd *cmd) + + num_pages = (mask + 1) >> granule; + trace_smmuv3_s1_range_inval(vmid, asid, addr, tg, num_pages, ttl, leaf); +- smmuv3_inv_notifiers_iova(s, asid, addr, tg, num_pages); ++ smmuv3_inv_notifiers_iova(s, asid, addr, tg, num_pages, leaf); + smmu_iotlb_inv_iova(s, asid, addr, tg, num_pages, ttl); 
+ addr += mask + 1; + } +-- +2.27.0 + diff --git a/hw-arm-smmuv3-Implement-fault-injection.patch b/hw-arm-smmuv3-Implement-fault-injection.patch new file mode 100644 index 0000000000000000000000000000000000000000..5ecb6da751ece0b759505e47ea49e254234787ef --- /dev/null +++ b/hw-arm-smmuv3-Implement-fault-injection.patch @@ -0,0 +1,107 @@ +From d31c754470b4b651d0e19c66738fbcc8fc6abf3c Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Thu, 13 Sep 2018 14:24:45 +0200 +Subject: [PATCH] hw/arm/smmuv3: Implement fault injection + +We convert iommu_fault structs received from the kernel +into the data struct used by the emulation code and record +the evnts into the virtual event queue. + +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/arm/smmuv3.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 71 insertions(+) + +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index 58139f707d..9aeb420428 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -1660,6 +1660,76 @@ static int smmuv3_get_attr(IOMMUMemoryRegion *iommu, + return -EINVAL; + } + ++struct iommu_fault; ++ ++static inline int ++smmuv3_inject_faults(IOMMUMemoryRegion *iommu_mr, int count, ++ struct iommu_fault *buf) ++{ ++#ifdef __linux__ ++ SMMUDevice *sdev = container_of(iommu_mr, SMMUDevice, iommu); ++ SMMUv3State *s3 = sdev->smmu; ++ uint32_t sid = smmu_get_sid(sdev); ++ int i; ++ ++ for (i = 0; i < count; i++) { ++ SMMUEventInfo info = {}; ++ struct iommu_fault_unrecoverable *record; ++ ++ if (buf[i].type != IOMMU_FAULT_DMA_UNRECOV) { ++ continue; ++ } ++ ++ info.sid = sid; ++ record = &buf[i].event; ++ ++ switch (record->reason) { ++ case IOMMU_FAULT_REASON_PASID_INVALID: ++ info.type = SMMU_EVT_C_BAD_SUBSTREAMID; ++ /* TODO further fill info.u.c_bad_substream */ ++ break; ++ case IOMMU_FAULT_REASON_PASID_FETCH: ++ info.type = SMMU_EVT_F_CD_FETCH; ++ break; ++ case IOMMU_FAULT_REASON_BAD_PASID_ENTRY: ++ info.type = SMMU_EVT_C_BAD_CD; ++ /* TODO further fill info.u.c_bad_cd */ ++ break; ++ case IOMMU_FAULT_REASON_WALK_EABT: ++ info.type = SMMU_EVT_F_WALK_EABT; ++ info.u.f_walk_eabt.addr = record->addr; ++ info.u.f_walk_eabt.addr2 = record->fetch_addr; ++ break; ++ case IOMMU_FAULT_REASON_PTE_FETCH: ++ info.type = SMMU_EVT_F_TRANSLATION; ++ info.u.f_translation.addr = record->addr; ++ break; ++ case IOMMU_FAULT_REASON_OOR_ADDRESS: ++ info.type = SMMU_EVT_F_ADDR_SIZE; ++ info.u.f_addr_size.addr = record->addr; ++ break; ++ case IOMMU_FAULT_REASON_ACCESS: ++ info.type = SMMU_EVT_F_ACCESS; ++ info.u.f_access.addr = record->addr; ++ break; ++ case IOMMU_FAULT_REASON_PERMISSION: ++ info.type = SMMU_EVT_F_PERMISSION; ++ info.u.f_permission.addr = record->addr; ++ break; ++ default: ++ warn_report("%s Unexpected fault reason received from host: %d", ++ __func__, record->reason); ++ continue; ++ } ++ ++ smmuv3_record_event(s3, &info); ++ } ++ return 0; ++#else ++ return -1; ++#endif ++} ++ + static void smmuv3_iommu_memory_region_class_init(ObjectClass *klass, + void *data) + { +@@ -1668,6 +1738,7 @@ static void smmuv3_iommu_memory_region_class_init(ObjectClass *klass, + imrc->translate = smmuv3_translate; + imrc->notify_flag_changed = smmuv3_notify_flag_changed; + imrc->get_attr = smmuv3_get_attr; ++ imrc->inject_faults = smmuv3_inject_faults; + } + + static const TypeInfo smmuv3_type_info = { +-- +2.27.0 + diff --git a/hw-arm-smmuv3-Improve-stage1-ASID-invalidation.patch b/hw-arm-smmuv3-Improve-stage1-ASID-invalidation.patch new file mode 100644 index 
0000000000000000000000000000000000000000..505bec39045c01a65e90863f940dc28011f7b400 --- /dev/null +++ b/hw-arm-smmuv3-Improve-stage1-ASID-invalidation.patch @@ -0,0 +1,107 @@ +From de53feaa37a267a21ed30a642e1e64c5fcfbc4a4 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Sun, 14 Feb 2021 12:30:57 -0500 +Subject: [PATCH] hw/arm/smmuv3: Improve stage1 ASID invalidation + +At the moment ASID invalidation command (CMD_TLBI_NH_ASID) is +propagated as a domain invalidation (the whole notifier range +is invalidated independently on any ASID information). + +The new granularity field now allows to be more precise and +restrict the invalidation to a peculiar ASID. Set the corresponding +fields and flag. + +We still keep the iova and addr_mask settings for consumers that +do not support the new fields, like VHOST. + +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/arm/smmuv3.c | 44 ++++++++++++++++++++++++++++++++++++++++++-- + hw/arm/trace-events | 1 + + 2 files changed, 43 insertions(+), 2 deletions(-) + +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index 94e2c658f8..da5dac1ba5 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -836,6 +836,31 @@ static void smmuv3_notify_iova(IOMMUMemoryRegion *mr, + memory_region_notify_iommu_one(n, &event); + } + ++/** ++ * smmuv3_notify_asid - call the notifier @n for a given asid ++ * ++ * @mr: IOMMU mr region handle ++ * @n: notifier to be called ++ * @asid: address space ID or negative value if we don't care ++ */ ++static void smmuv3_notify_asid(IOMMUMemoryRegion *mr, ++ IOMMUNotifier *n, int asid) ++{ ++ IOMMUTLBEvent event = {}; ++ ++ event.type = IOMMU_NOTIFIER_UNMAP; ++ event.entry.target_as = &address_space_memory; ++ event.entry.perm = IOMMU_NONE; ++ event.entry.granularity = IOMMU_INV_GRAN_PASID; ++ event.entry.flags = IOMMU_INV_FLAGS_ARCHID; ++ event.entry.arch_id = asid; ++ event.entry.iova = n->start; ++ event.entry.addr_mask = n->end - n->start; ++ ++ memory_region_notify_iommu_one(n, &event); ++} ++ ++ + /* invalidate an asid/iova range tuple in all mr's */ + static void smmuv3_inv_notifiers_iova(SMMUState *s, int asid, dma_addr_t iova, + uint8_t tg, uint64_t num_pages) +@@ -913,6 +938,22 @@ smmuv3_invalidate_ste(gpointer key, gpointer value, gpointer user_data) + return true; + } + ++static void smmuv3_s1_asid_inval(SMMUState *s, uint16_t asid) ++{ ++ SMMUDevice *sdev; ++ ++ trace_smmuv3_s1_asid_inval(asid); ++ QLIST_FOREACH(sdev, &s->devices_with_notifiers, next) { ++ IOMMUMemoryRegion *mr = &sdev->iommu; ++ IOMMUNotifier *n; ++ ++ IOMMU_NOTIFIER_FOREACH(n, mr) { ++ smmuv3_notify_asid(mr, n, asid); ++ } ++ } ++ smmu_iotlb_inv_asid(s, asid); ++} ++ + static int smmuv3_cmdq_consume(SMMUv3State *s) + { + SMMUState *bs = ARM_SMMU(s); +@@ -1027,8 +1068,7 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) + uint16_t asid = CMD_ASID(&cmd); + + trace_smmuv3_cmdq_tlbi_nh_asid(asid); +- smmu_inv_notifiers_all(&s->smmu_state); +- smmu_iotlb_inv_asid(bs, asid); ++ smmuv3_s1_asid_inval(bs, asid); + break; + } + case SMMU_CMD_TLBI_NH_ALL: +diff --git a/hw/arm/trace-events b/hw/arm/trace-events +index 2dee296c8f..1447ad5a90 100644 +--- a/hw/arm/trace-events ++++ b/hw/arm/trace-events +@@ -46,6 +46,7 @@ smmuv3_cmdq_cfgi_cd(uint32_t sid) "sid=0x%x" + smmuv3_config_cache_hit(uint32_t sid, uint32_t hits, uint32_t misses, uint32_t perc) "Config cache HIT for sid=0x%x (hits=%d, misses=%d, hit rate=%d)" + smmuv3_config_cache_miss(uint32_t sid, uint32_t hits, uint32_t misses, uint32_t perc) "Config cache MISS for sid=0x%x (hits=%d, misses=%d, 
hit rate=%d)" + smmuv3_s1_range_inval(int vmid, int asid, uint64_t addr, uint8_t tg, uint64_t num_pages, uint8_t ttl, bool leaf) "vmid=%d asid=%d addr=0x%"PRIx64" tg=%d num_pages=0x%"PRIx64" ttl=%d leaf=%d" ++smmuv3_s1_asid_inval(int asid) "asid=%d" + smmuv3_cmdq_tlbi_nh(void) "" + smmuv3_cmdq_tlbi_nh_asid(uint16_t asid) "asid=%d" + smmuv3_config_cache_inv(uint32_t sid) "Config cache INV for sid=0x%x" +-- +2.27.0 + diff --git a/hw-arm-smmuv3-Pass-stage-1-configurations-to-the-hos.patch b/hw-arm-smmuv3-Pass-stage-1-configurations-to-the-hos.patch new file mode 100644 index 0000000000000000000000000000000000000000..012c5d0071dc9f1bce50e706958e55fb443230d2 --- /dev/null +++ b/hw-arm-smmuv3-Pass-stage-1-configurations-to-the-hos.patch @@ -0,0 +1,161 @@ +From 2e5929ec2a35a7a227dc7ba70a557a84993a366d Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Thu, 9 Aug 2018 21:04:19 +0200 +Subject: [PATCH] hw/arm/smmuv3: Pass stage 1 configurations to the host + +In case PASID PciOps are set for the device we call +the set_pasid_table() callback on each STE update. + +This allows to pass the guest stage 1 configuration +to the host and apply it at physical level. + +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/arm/smmu-internal.h | 1 + + hw/arm/smmuv3.c | 71 ++++++++++++++++++++++++++++++++++++------ + hw/arm/trace-events | 1 + + 3 files changed, 64 insertions(+), 9 deletions(-) + +diff --git a/hw/arm/smmu-internal.h b/hw/arm/smmu-internal.h +index 2d75b31953..5ef8c598c6 100644 +--- a/hw/arm/smmu-internal.h ++++ b/hw/arm/smmu-internal.h +@@ -105,6 +105,7 @@ typedef struct SMMUIOTLBPageInvInfo { + } SMMUIOTLBPageInvInfo; + + typedef struct SMMUSIDRange { ++ SMMUState *state; + uint32_t start; + uint32_t end; + } SMMUSIDRange; +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index ad816e850c..58139f707d 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -16,6 +16,10 @@ + * with this program; if not, see . 
+ */ + ++#ifdef __linux__ ++#include "linux/iommu.h" ++#endif ++ + #include "qemu/osdep.h" + #include "qemu/bitops.h" + #include "hw/irq.h" +@@ -928,6 +932,61 @@ static void smmuv3_s1_range_inval(SMMUState *s, Cmd *cmd) + } + } + ++static void smmuv3_notify_config_change(SMMUState *bs, uint32_t sid) ++{ ++#ifdef __linux__ ++ IOMMUMemoryRegion *mr = smmu_iommu_mr(bs, sid); ++ SMMUEventInfo event = {.type = SMMU_EVT_NONE, .sid = sid, ++ .inval_ste_allowed = true}; ++ IOMMUConfig iommu_config = {}; ++ SMMUTransCfg *cfg; ++ SMMUDevice *sdev; ++ ++ if (!mr) { ++ return; ++ } ++ ++ sdev = container_of(mr, SMMUDevice, iommu); ++ ++ /* flush QEMU config cache */ ++ smmuv3_flush_config(sdev); ++ ++ if (!pci_device_is_pasid_ops_set(sdev->bus, sdev->devfn)) { ++ return; ++ } ++ ++ cfg = smmuv3_get_config(sdev, &event); ++ ++ if (!cfg) { ++ return; ++ } ++ ++ iommu_config.pasid_cfg.argsz = sizeof(struct iommu_pasid_table_config); ++ iommu_config.pasid_cfg.version = PASID_TABLE_CFG_VERSION_1; ++ iommu_config.pasid_cfg.format = IOMMU_PASID_FORMAT_SMMUV3; ++ iommu_config.pasid_cfg.base_ptr = cfg->s1ctxptr; ++ iommu_config.pasid_cfg.pasid_bits = 0; ++ iommu_config.pasid_cfg.vendor_data.smmuv3.version = PASID_TABLE_SMMUV3_CFG_VERSION_1; ++ ++ if (cfg->disabled || cfg->bypassed) { ++ iommu_config.pasid_cfg.config = IOMMU_PASID_CONFIG_BYPASS; ++ } else if (cfg->aborted) { ++ iommu_config.pasid_cfg.config = IOMMU_PASID_CONFIG_ABORT; ++ } else { ++ iommu_config.pasid_cfg.config = IOMMU_PASID_CONFIG_TRANSLATE; ++ } ++ ++ trace_smmuv3_notify_config_change(mr->parent_obj.name, ++ iommu_config.pasid_cfg.config, ++ iommu_config.pasid_cfg.base_ptr); ++ ++ if (pci_device_set_pasid_table(sdev->bus, sdev->devfn, &iommu_config)) { ++ error_report("Failed to pass PASID table to host for iommu mr %s (%m)", ++ mr->parent_obj.name); ++ } ++#endif ++} ++ + static gboolean + smmuv3_invalidate_ste(gpointer key, gpointer value, gpointer user_data) + { +@@ -938,6 +997,7 @@ smmuv3_invalidate_ste(gpointer key, gpointer value, gpointer user_data) + if (sid < sid_range->start || sid > sid_range->end) { + return false; + } ++ smmuv3_notify_config_change(sid_range->state, sid); + trace_smmuv3_config_cache_inv(sid); + return true; + } +@@ -1008,22 +1068,14 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) + case SMMU_CMD_CFGI_STE: + { + uint32_t sid = CMD_SID(&cmd); +- IOMMUMemoryRegion *mr = smmu_iommu_mr(bs, sid); +- SMMUDevice *sdev; + + if (CMD_SSEC(&cmd)) { + cmd_error = SMMU_CERROR_ILL; + break; + } + +- if (!mr) { +- break; +- } +- + trace_smmuv3_cmdq_cfgi_ste(sid); +- sdev = container_of(mr, SMMUDevice, iommu); +- smmuv3_flush_config(sdev); +- ++ smmuv3_notify_config_change(bs, sid); + break; + } + case SMMU_CMD_CFGI_STE_RANGE: /* same as SMMU_CMD_CFGI_ALL */ +@@ -1038,6 +1090,7 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) + } + + mask = (1ULL << (range + 1)) - 1; ++ sid_range.state = bs; + sid_range.start = sid & ~mask; + sid_range.end = sid_range.start + mask; + +diff --git a/hw/arm/trace-events b/hw/arm/trace-events +index 1447ad5a90..d9851d663e 100644 +--- a/hw/arm/trace-events ++++ b/hw/arm/trace-events +@@ -53,4 +53,5 @@ smmuv3_config_cache_inv(uint32_t sid) "Config cache INV for sid=0x%x" + smmuv3_notify_flag_add(const char *iommu) "ADD SMMUNotifier node for iommu mr=%s" + smmuv3_notify_flag_del(const char *iommu) "DEL SMMUNotifier node for iommu mr=%s" + smmuv3_inv_notifiers_iova(const char *name, uint16_t asid, uint64_t iova, uint8_t tg, uint64_t num_pages) "iommu mr=%s asid=%d iova=0x%"PRIx64" tg=%d 
num_pages=0x%"PRIx64 ++smmuv3_notify_config_change(const char *name, uint8_t config, uint64_t s1ctxptr) "iommu mr=%s config=%d s1ctxptr=0x%"PRIx64 + +-- +2.27.0 + diff --git a/hw-arm-smmuv3-Post-load-stage-1-configurations-to-th.patch b/hw-arm-smmuv3-Post-load-stage-1-configurations-to-th.patch new file mode 100644 index 0000000000000000000000000000000000000000..0fc5f84460a60655713b48766bed2b7599042431 --- /dev/null +++ b/hw-arm-smmuv3-Post-load-stage-1-configurations-to-th.patch @@ -0,0 +1,110 @@ +From 1b95c995f032c21bf6607dda8ede0f5856bb190a Mon Sep 17 00:00:00 2001 +From: Kunkun Jiang +Date: Tue, 11 May 2021 10:08:16 +0800 +Subject: [PATCH] hw/arm/smmuv3: Post-load stage 1 configurations to the host + +In nested mode, we call the set_pasid_table() callback on each +STE update to pass the guest stage 1 configuration to the host +and apply it at physical level. + +In the case of live migration, we need to manually call the +set_pasid_table() to load the guest stage 1 configurations to +the host. If this operation fails, the migration fails. + +Signed-off-by: Kunkun Jiang +--- + hw/arm/smmuv3.c | 33 ++++++++++++++++++++++++++++----- + 1 file changed, 28 insertions(+), 5 deletions(-) + +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index 45f21c53fe..291e3a12e8 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -932,7 +932,7 @@ static void smmuv3_s1_range_inval(SMMUState *s, Cmd *cmd) + } + } + +-static void smmuv3_notify_config_change(SMMUState *bs, uint32_t sid) ++static int smmuv3_notify_config_change(SMMUState *bs, uint32_t sid) + { + #ifdef __linux__ + IOMMUMemoryRegion *mr = smmu_iommu_mr(bs, sid); +@@ -941,9 +941,10 @@ static void smmuv3_notify_config_change(SMMUState *bs, uint32_t sid) + IOMMUConfig iommu_config = {}; + SMMUTransCfg *cfg; + SMMUDevice *sdev; ++ int ret; + + if (!mr) { +- return; ++ return 0; + } + + sdev = container_of(mr, SMMUDevice, iommu); +@@ -952,13 +953,13 @@ static void smmuv3_notify_config_change(SMMUState *bs, uint32_t sid) + smmuv3_flush_config(sdev); + + if (!pci_device_is_pasid_ops_set(sdev->bus, sdev->devfn)) { +- return; ++ return 0; + } + + cfg = smmuv3_get_config(sdev, &event); + + if (!cfg) { +- return; ++ return 0; + } + + iommu_config.pasid_cfg.argsz = sizeof(struct iommu_pasid_table_config); +@@ -980,10 +981,13 @@ static void smmuv3_notify_config_change(SMMUState *bs, uint32_t sid) + iommu_config.pasid_cfg.config, + iommu_config.pasid_cfg.base_ptr); + +- if (pci_device_set_pasid_table(sdev->bus, sdev->devfn, &iommu_config)) { ++ ret = pci_device_set_pasid_table(sdev->bus, sdev->devfn, &iommu_config); ++ if (ret) { + error_report("Failed to pass PASID table to host for iommu mr %s (%m)", + mr->parent_obj.name); + } ++ ++ return ret; + #endif + } + +@@ -1553,6 +1557,24 @@ static void smmu_realize(DeviceState *d, Error **errp) + smmu_init_irq(s, dev); + } + ++static int smmuv3_post_load(void *opaque, int version_id) ++{ ++ SMMUv3State *s3 = opaque; ++ SMMUState *s = &(s3->smmu_state); ++ SMMUDevice *sdev; ++ int ret = 0; ++ ++ QLIST_FOREACH(sdev, &s->devices_with_notifiers, next) { ++ uint32_t sid = smmu_get_sid(sdev); ++ ret = smmuv3_notify_config_change(s, sid); ++ if (ret) { ++ break; ++ } ++ } ++ ++ return ret; ++} ++ + static const VMStateDescription vmstate_smmuv3_queue = { + .name = "smmuv3_queue", + .version_id = 1, +@@ -1571,6 +1593,7 @@ static const VMStateDescription vmstate_smmuv3 = { + .version_id = 1, + .minimum_version_id = 1, + .priority = MIG_PRI_IOMMU, ++ .post_load = smmuv3_post_load, + .fields = (VMStateField[]) { + 
VMSTATE_UINT32(features, SMMUv3State), + VMSTATE_UINT8(sid_size, SMMUv3State), +-- +2.27.0 + diff --git a/hw-arm-smmuv3-Store-the-PASID-table-GPA-in-the-trans.patch b/hw-arm-smmuv3-Store-the-PASID-table-GPA-in-the-trans.patch new file mode 100644 index 0000000000000000000000000000000000000000..3bbf1dadc18e41dbfe5859f337c79d42ef557c0b --- /dev/null +++ b/hw-arm-smmuv3-Store-the-PASID-table-GPA-in-the-trans.patch @@ -0,0 +1,45 @@ +From f937ce4124d57eea27d516957a2efa0e7fbdf198 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Thu, 9 Aug 2018 20:56:44 +0200 +Subject: [PATCH] hw/arm/smmuv3: Store the PASID table GPA in the translation + config + +For VFIO integration we will need to pass the Context Descriptor (CD) +table GPA to the host. The CD table is also referred to as the PASID +table. Its GPA corresponds to the s1ctrptr field of the Stream Table +Entry. So let's decode and store it in the configuration structure. + +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/arm/smmuv3.c | 1 + + include/hw/arm/smmu-common.h | 1 + + 2 files changed, 2 insertions(+) + +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index 12f354a0d5..3416f6a639 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -358,6 +358,7 @@ static int decode_ste(SMMUv3State *s, SMMUTransCfg *cfg, + "SMMUv3 S1 stalling fault model not allowed yet\n"); + goto bad_ste; + } ++ cfg->s1ctxptr = STE_CTXPTR(ste); + return 0; + + bad_ste: +diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h +index 706be3c6d0..d578339935 100644 +--- a/include/hw/arm/smmu-common.h ++++ b/include/hw/arm/smmu-common.h +@@ -76,6 +76,7 @@ typedef struct SMMUTransCfg { + uint8_t tbi; /* Top Byte Ignore */ + uint16_t asid; + SMMUTransTableInfo tt[2]; ++ dma_addr_t s1ctxptr; + uint32_t iotlb_hits; /* counts IOTLB hits for this asid */ + uint32_t iotlb_misses; /* counts IOTLB misses for this asid */ + } SMMUTransCfg; +-- +2.27.0 + diff --git a/ide-ahci-add-check-to-avoid-null-dereference-CVE-201.patch b/ide-ahci-add-check-to-avoid-null-dereference-CVE-201.patch new file mode 100644 index 0000000000000000000000000000000000000000..7cd2ccff36dc717b01363728e5d48f6be9661e0e --- /dev/null +++ b/ide-ahci-add-check-to-avoid-null-dereference-CVE-201.patch @@ -0,0 +1,41 @@ +From 9169beed83ea77059a7240aae5621dcfb3178cba Mon Sep 17 00:00:00 2001 +From: Prasad J Pandit +Date: Mon, 21 Jun 2021 09:22:35 +0800 +Subject: [PATCH] ide: ahci: add check to avoid null dereference + (CVE-2019-12067) + +Fix CVE-2019-12067 + +AHCI emulator while committing DMA buffer in ahci_commit_buf() +may do a NULL dereference if the command header 'ad->cur_cmd' +is null. Add check to avoid it. 
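+
+Reduced to a standalone sketch, the fixed path looks as follows.
+CmdHdr is a local stand-in for QEMU's AHCICmdHdr, and the byte-order
+helpers are stubbed for a little-endian host (QEMU uses le32_to_cpu()/
+cpu_to_le32() from "qemu/bswap.h"):
+
+    #include <stdint.h>
+
+    typedef struct CmdHdr { uint32_t status; } CmdHdr; /* stand-in */
+
+    static uint32_t le32_to_cpu(uint32_t v) { return v; } /* LE host */
+    static uint32_t cpu_to_le32(uint32_t v) { return v; } /* LE host */
+
+    static void commit_buf(CmdHdr *cur_cmd, uint32_t tx_bytes)
+    {
+        if (!cur_cmd) {
+            return; /* CVE-2019-12067: no command header to update */
+        }
+        cur_cmd->status =
+            cpu_to_le32(le32_to_cpu(cur_cmd->status) + tx_bytes);
+    }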
+ +Reported-by: Bugs SysSec +Signed-off-by: Prasad J Pandit + +Signed-off-by: Jiajie Li +Signed-off-by: Yan Wang +--- + hw/ide/ahci.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/hw/ide/ahci.c b/hw/ide/ahci.c +index a94c6e26fb..256b58026a 100644 +--- a/hw/ide/ahci.c ++++ b/hw/ide/ahci.c +@@ -1459,8 +1459,10 @@ static void ahci_commit_buf(const IDEDMA *dma, uint32_t tx_bytes) + { + AHCIDevice *ad = DO_UPCAST(AHCIDevice, dma, dma); + +- tx_bytes += le32_to_cpu(ad->cur_cmd->status); +- ad->cur_cmd->status = cpu_to_le32(tx_bytes); ++ if (ad->cur_cmd) { ++ tx_bytes += le32_to_cpu(ad->cur_cmd->status); ++ ad->cur_cmd->status = cpu_to_le32(tx_bytes); ++ } + } + + static int ahci_dma_rw_buf(const IDEDMA *dma, bool is_write) +-- +2.27.0 + diff --git a/iommu-Introduce-generic-header.patch b/iommu-Introduce-generic-header.patch new file mode 100644 index 0000000000000000000000000000000000000000..84f3d77c057bee4e80d68e8dbf92c473089109f0 --- /dev/null +++ b/iommu-Introduce-generic-header.patch @@ -0,0 +1,53 @@ +From 5e312f7b41ec48dc7dc9805af9f52aa8ed393bf9 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Tue, 9 Jul 2019 12:20:12 +0200 +Subject: [PATCH] iommu: Introduce generic header + +This header is meant to exposes data types used by +several IOMMU devices such as struct for SVA and +nested stage configuration. + +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + include/hw/iommu/iommu.h | 28 ++++++++++++++++++++++++++++ + 1 file changed, 28 insertions(+) + create mode 100644 include/hw/iommu/iommu.h + +diff --git a/include/hw/iommu/iommu.h b/include/hw/iommu/iommu.h +new file mode 100644 +index 0000000000..12092bda7b +--- /dev/null ++++ b/include/hw/iommu/iommu.h +@@ -0,0 +1,28 @@ ++/* ++ * common header for iommu devices ++ * ++ * Copyright Red Hat, Inc. 2019 ++ * ++ * Authors: ++ * Eric Auger ++ * ++ * This work is licensed under the terms of the GNU GPL, version 2. See ++ * the COPYING file in the top-level directory. ++ */ ++ ++#ifndef QEMU_HW_IOMMU_IOMMU_H ++#define QEMU_HW_IOMMU_IOMMU_H ++#ifdef __linux__ ++#include ++#endif ++ ++typedef struct IOMMUConfig { ++ union { ++#ifdef __linux__ ++ struct iommu_pasid_table_config pasid_cfg; ++#endif ++ }; ++} IOMMUConfig; ++ ++ ++#endif /* QEMU_HW_IOMMU_IOMMU_H */ +-- +2.27.0 + diff --git a/linux-headers-update-against-5.10-and-manual-clear-v.patch b/linux-headers-update-against-5.10-and-manual-clear-v.patch new file mode 100644 index 0000000000000000000000000000000000000000..93d300675cb54678121161bfb63e0a35bfaffc6f --- /dev/null +++ b/linux-headers-update-against-5.10-and-manual-clear-v.patch @@ -0,0 +1,82 @@ +From 40512773625a4f8ddd96a5af924f119b89a14706 Mon Sep 17 00:00:00 2001 +From: Zenghui Yu +Date: Sat, 8 May 2021 17:31:03 +0800 +Subject: [PATCH] linux-headers: update against 5.10 and manual clear vfio + dirty log series + +The new capability VFIO_DIRTY_LOG_MANUAL_CLEAR and the new ioctl +VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR and +VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP have been introduced in +the kernel, update the header to add them. 
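+
+A userspace caller on a kernel with this support might drive the new
+flag as sketched below (hypothetical helper, error handling elided;
+the caller is assumed to have filled the
+vfio_iommu_type1_dirty_bitmap_get range and bitmap buffer beforehand):
+
+    #include <stdlib.h>
+    #include <string.h>
+    #include <sys/ioctl.h>
+    #include <linux/vfio.h>
+
+    /* Fetch the dirty bitmap without clearing it; the caller later
+     * clears it with VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP. */
+    static int get_dirty_bitmap_noclear(int container_fd,
+                struct vfio_iommu_type1_dirty_bitmap_get *range)
+    {
+        size_t argsz = sizeof(struct vfio_iommu_type1_dirty_bitmap) +
+                       sizeof(*range);
+        struct vfio_iommu_type1_dirty_bitmap *db = calloc(1, argsz);
+        int ret;
+
+        if (!db) {
+            return -1;
+        }
+        db->argsz = argsz;
+        db->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR;
+        memcpy(db->data, range, sizeof(*range));
+
+        ret = ioctl(container_fd, VFIO_IOMMU_DIRTY_PAGES, db);
+        free(db);
+        return ret;
+    }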
+ +Signed-off-by: Zenghui Yu +Signed-off-by: Kunkun Jiang +--- + linux-headers/linux/vfio.h | 36 +++++++++++++++++++++++++++++++++++- + 1 file changed, 35 insertions(+), 1 deletion(-) + +diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h +index e680594f27..f4ff038e8c 100644 +--- a/linux-headers/linux/vfio.h ++++ b/linux-headers/linux/vfio.h +@@ -52,6 +52,16 @@ + /* Supports the vaddr flag for DMA map and unmap */ + #define VFIO_UPDATE_VADDR 10 + ++/* ++ * The vfio_iommu driver may support user clears dirty log manually, which means ++ * dirty log can be requested to not cleared automatically after dirty log is ++ * copied to userspace, it's user's duty to clear dirty log. ++ * ++ * Note: please refer to VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR and ++ * VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP. ++ */ ++#define VFIO_DIRTY_LOG_MANUAL_CLEAR 11 ++ + /* + * The IOCTL interface is designed for extensibility by embedding the + * structure length (argsz) and flags into structures passed between +@@ -1196,8 +1206,30 @@ struct vfio_iommu_type1_dma_unmap { + * actual bitmap. If dirty pages logging is not enabled, an error will be + * returned. + * +- * Only one of the flags _START, _STOP and _GET may be specified at a time. ++ * The VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR flag is almost same as ++ * VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP, except that it requires underlying ++ * dirty bitmap is not cleared automatically. The user can clear it manually by ++ * calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP flag set. + * ++ * Calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP flag set, ++ * instructs the IOMMU driver to clear the dirty status of pages in a bitmap ++ * for IOMMU container for a given IOVA range. The user must specify the IOVA ++ * range, the bitmap and the pgsize through the structure ++ * vfio_iommu_type1_dirty_bitmap_get in the data[] portion. This interface ++ * supports clearing a bitmap of the smallest supported pgsize only and can be ++ * modified in future to clear a bitmap of any specified supported pgsize. The ++ * user must provide a memory area for the bitmap memory and specify its size ++ * in bitmap.size. One bit is used to represent one page consecutively starting ++ * from iova offset. The user should provide page size in bitmap.pgsize field. ++ * A bit set in the bitmap indicates that the page at that offset from iova is ++ * cleared the dirty status, and dirty tracking is re-enabled for that page. The ++ * caller must set argsz to a value including the size of structure ++ * vfio_iommu_dirty_bitmap_get, but excluing the size of the actual bitmap. If ++ * dirty pages logging is not enabled, an error will be returned. Note: user ++ * should clear dirty log before handle corresponding dirty pages. ++ * ++ * Only one of the flags _START, _STOP, _GET, _GET_NOCLEAR_, and _CLEAR may be ++ * specified at a time. 
+ */ + struct vfio_iommu_type1_dirty_bitmap { + __u32 argsz; +@@ -1205,6 +1237,8 @@ struct vfio_iommu_type1_dirty_bitmap { + #define VFIO_IOMMU_DIRTY_PAGES_FLAG_START (1 << 0) + #define VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP (1 << 1) + #define VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP (1 << 2) ++#define VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR (1 << 3) ++#define VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP (1 << 4) + __u8 data[]; + }; + +-- +2.27.0 + diff --git a/log-Add-some-logs-on-VM-runtime-path.patch b/log-Add-some-logs-on-VM-runtime-path.patch new file mode 100644 index 0000000000000000000000000000000000000000..90408a3a2a9971f29abee70b9907aa1e45ad22cf --- /dev/null +++ b/log-Add-some-logs-on-VM-runtime-path.patch @@ -0,0 +1,170 @@ +From d0ed3afacd2af1cbfcfb615471ade3c8c4185c00 Mon Sep 17 00:00:00 2001 +From: Yan Wang +Date: Tue, 8 Feb 2022 15:48:01 +0800 +Subject: [PATCH] log: Add some logs on VM runtime path + +Add logs on VM runtime path, to make it easier to do trouble shooting. + +Signed-off-by: Ying Fang +Signed-off-by: Yan Wang +--- + hw/virtio/virtio-pci.c | 2 ++ + hw/virtio/virtio.c | 14 ++++++++++++-- + monitor/monitor.c | 9 +++++++++ + qapi/qmp-dispatch.c | 15 +++++++++++++++ + softmmu/qdev-monitor.c | 4 +++- + 5 files changed, 41 insertions(+), 3 deletions(-) + +diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c +index 750aa47ec1..38a5dc1ba8 100644 +--- a/hw/virtio/virtio-pci.c ++++ b/hw/virtio/virtio-pci.c +@@ -1772,7 +1772,9 @@ static void virtio_pci_device_unplugged(DeviceState *d) + VirtIOPCIProxy *proxy = VIRTIO_PCI(d); + bool modern = virtio_pci_modern(proxy); + bool modern_pio = proxy->flags & VIRTIO_PCI_FLAG_MODERN_PIO_NOTIFY; ++ VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + ++ qemu_log("unplug device name: %s\n", !vdev ? "NULL" : vdev->name); + virtio_pci_stop_ioeventfd(proxy); + + if (modern) { +diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c +index ea7c079fb0..9b4ac58a16 100644 +--- a/hw/virtio/virtio.c ++++ b/hw/virtio/virtio.c +@@ -1945,7 +1945,14 @@ int virtio_set_status(VirtIODevice *vdev, uint8_t val) + k->set_status(vdev, val); + } + vdev->status = val; +- ++ if (val) { ++ qemu_log("%s device status is %d that means %s\n", ++ vdev->name, val, ++ (val & VIRTIO_CONFIG_S_DRIVER_OK) ? "DRIVER OK" : ++ (val & VIRTIO_CONFIG_S_DRIVER) ? "DRIVER" : ++ (val & VIRTIO_CONFIG_S_ACKNOWLEDGE) ? "ACKNOWLEDGE" : ++ (val & VIRTIO_CONFIG_S_FAILED) ? 
"FAILED" : "UNKNOWN"); ++ } + return 0; + } + +@@ -2389,8 +2396,11 @@ VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size, + break; + } + +- if (i == VIRTIO_QUEUE_MAX || queue_size > VIRTQUEUE_MAX_SIZE) ++ if (i == VIRTIO_QUEUE_MAX || queue_size > VIRTQUEUE_MAX_SIZE) { ++ qemu_log("unacceptable queue_size (%d) or num (%d)\n", ++ queue_size, i); + abort(); ++ } + + vdev->vq[i].vring.num = queue_size; + vdev->vq[i].vring.num_default = queue_size; +diff --git a/monitor/monitor.c b/monitor/monitor.c +index 21c7a68758..013c628695 100644 +--- a/monitor/monitor.c ++++ b/monitor/monitor.c +@@ -29,6 +29,7 @@ + #include "qapi/qapi-emit-events.h" + #include "qapi/qapi-visit-control.h" + #include "qapi/qmp/qdict.h" ++#include "qapi/qmp/qjson.h" + #include "qemu/error-report.h" + #include "qemu/option.h" + #include "sysemu/qtest.h" +@@ -318,6 +319,7 @@ static void monitor_qapi_event_emit(QAPIEvent event, QDict *qdict) + { + Monitor *mon; + MonitorQMP *qmp_mon; ++ GString *json; + + trace_monitor_protocol_event_emit(event, qdict); + QTAILQ_FOREACH(mon, &mon_list, entry) { +@@ -328,6 +330,13 @@ static void monitor_qapi_event_emit(QAPIEvent event, QDict *qdict) + qmp_mon = container_of(mon, MonitorQMP, common); + if (qmp_mon->commands != &qmp_cap_negotiation_commands) { + qmp_send_response(qmp_mon, qdict); ++ json = qobject_to_json(QOBJECT(qdict)); ++ if (json) { ++ if (!strstr(json->str, "RTC_CHANGE")) { ++ qemu_log("%s\n", json->str); ++ } ++ g_string_free(json, true); ++ } + } + } + } +diff --git a/qapi/qmp-dispatch.c b/qapi/qmp-dispatch.c +index d378bccac7..bb005594d3 100644 +--- a/qapi/qmp-dispatch.c ++++ b/qapi/qmp-dispatch.c +@@ -25,6 +25,7 @@ + #include "qapi/qmp/qbool.h" + #include "qemu/coroutine.h" + #include "qemu/main-loop.h" ++#include "qemu/log.h" + + Visitor *qobject_input_visitor_new_qmp(QObject *obj) + { +@@ -147,6 +148,7 @@ QDict *qmp_dispatch(const QmpCommandList *cmds, QObject *request, + QObject *id; + QObject *ret = NULL; + QDict *rsp = NULL; ++ GString *json; + + dict = qobject_to(QDict, request); + if (!dict) { +@@ -204,6 +206,19 @@ QDict *qmp_dispatch(const QmpCommandList *cmds, QObject *request, + qobject_ref(args); + } + ++ json = qobject_to_json(QOBJECT(args)); ++ if (json) { ++ if ((strcmp(command, "query-block-jobs") != 0) ++ && (strcmp(command, "query-migrate") != 0) ++ && (strcmp(command, "query-blockstats") != 0) ++ && (strcmp(command, "query-balloon") != 0) ++ && (strcmp(command, "set_password") != 0)) { ++ qemu_log("qmp_cmd_name: %s, arguments: %s\n", ++ command, json->str); ++ } ++ g_string_free(json, true); ++ } ++ + assert(!(oob && qemu_in_coroutine())); + assert(monitor_cur() == NULL); + if (!!(cmd->options & QCO_COROUTINE) == qemu_in_coroutine()) { +diff --git a/softmmu/qdev-monitor.c b/softmmu/qdev-monitor.c +index 01f3834db5..dfd6429bf3 100644 +--- a/softmmu/qdev-monitor.c ++++ b/softmmu/qdev-monitor.c +@@ -36,6 +36,7 @@ + #include "qemu/option.h" + #include "qemu/qemu-print.h" + #include "qemu/option_int.h" ++#include "qemu/log.h" + #include "sysemu/block-backend.h" + #include "migration/misc.h" + #include "migration/migration.h" +@@ -635,6 +636,7 @@ DeviceState *qdev_device_add_from_qdict(const QDict *opts, + if (path != NULL) { + bus = qbus_find(path, errp); + if (!bus) { ++ error_setg(errp, "can not find bus for %s", driver); + return NULL; + } + if (!object_dynamic_cast(OBJECT(bus), dc->bus_type)) { +@@ -707,7 +709,7 @@ DeviceState *qdev_device_add_from_qdict(const QDict *opts, + if (*errp) { + goto err_del_dev; + } +- ++ qemu_log("add qdev %s:%s 
success\n", driver, dev->id ? dev->id : "none"); + if (!qdev_realize(DEVICE(dev), bus, errp)) { + goto err_del_dev; + } +-- +2.27.0 + diff --git a/memory-Add-IOMMU_ATTR_MSI_TRANSLATE-IOMMU-memory-reg.patch b/memory-Add-IOMMU_ATTR_MSI_TRANSLATE-IOMMU-memory-reg.patch new file mode 100644 index 0000000000000000000000000000000000000000..b06bc2514357ceda3aeff221ef2095f43885ada1 --- /dev/null +++ b/memory-Add-IOMMU_ATTR_MSI_TRANSLATE-IOMMU-memory-reg.patch @@ -0,0 +1,32 @@ +From 062923fd4e6d11e1b724f2dd059f8b0c6e65bf7a Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Mon, 25 Mar 2019 16:35:05 +0100 +Subject: [PATCH] memory: Add IOMMU_ATTR_MSI_TRANSLATE IOMMU memory region + attribute + +We introduce a new IOMMU Memory Region attribute, IOMMU_ATTR_MSI_TRANSLATE +which tells whether the virtual IOMMU translates MSIs. ARM SMMU +will expose this attribute since, as opposed to Intel DMAR, MSIs +are translated as any other DMA requests. + +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + include/exec/memory.h | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/include/exec/memory.h b/include/exec/memory.h +index 864bcaeb01..76ef99ed27 100644 +--- a/include/exec/memory.h ++++ b/include/exec/memory.h +@@ -323,6 +323,7 @@ typedef struct MemoryRegionClass { + enum IOMMUMemoryRegionAttr { + IOMMU_ATTR_SPAPR_TCE_FD, + IOMMU_ATTR_VFIO_NESTED, ++ IOMMU_ATTR_MSI_TRANSLATE, + }; + + /* +-- +2.27.0 + diff --git a/memory-Add-IOMMU_ATTR_VFIO_NESTED-IOMMU-memory-regio.patch b/memory-Add-IOMMU_ATTR_VFIO_NESTED-IOMMU-memory-regio.patch new file mode 100644 index 0000000000000000000000000000000000000000..bb27247096c8d16f63ff7c55a7cb2a6827dccd16 --- /dev/null +++ b/memory-Add-IOMMU_ATTR_VFIO_NESTED-IOMMU-memory-regio.patch @@ -0,0 +1,72 @@ +From b380e3e0c30fb68dbbfb1397f3c374adfff77ac4 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Mon, 1 Jul 2019 11:30:30 +0200 +Subject: [PATCH] memory: Add IOMMU_ATTR_VFIO_NESTED IOMMU memory region + attribute + +We introduce a new IOMMU Memory Region attribute, +IOMMU_ATTR_VFIO_NESTED that tells whether the virtual IOMMU +requires HW nested paging for VFIO integration. + +Current Intel virtual IOMMU device supports "Caching +Mode" and does not require 2 stages at physical level to be +integrated with VFIO. However SMMUv3 does not implement such +"caching mode" and requires to use HW nested paging. + +As such SMMUv3 is the first IOMMU device to advertise this +attribute. 
+ +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/arm/smmuv3.c | 12 ++++++++++++ + include/exec/memory.h | 3 ++- + 2 files changed, 14 insertions(+), 1 deletion(-) + +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index da5dac1ba5..9b87d16217 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -1589,6 +1589,17 @@ static int smmuv3_notify_flag_changed(IOMMUMemoryRegion *iommu, + return 0; + } + ++static int smmuv3_get_attr(IOMMUMemoryRegion *iommu, ++ enum IOMMUMemoryRegionAttr attr, ++ void *data) ++{ ++ if (attr == IOMMU_ATTR_VFIO_NESTED) { ++ *(bool *) data = true; ++ return 0; ++ } ++ return -EINVAL; ++} ++ + static void smmuv3_iommu_memory_region_class_init(ObjectClass *klass, + void *data) + { +@@ -1596,6 +1607,7 @@ static void smmuv3_iommu_memory_region_class_init(ObjectClass *klass, + + imrc->translate = smmuv3_translate; + imrc->notify_flag_changed = smmuv3_notify_flag_changed; ++ imrc->get_attr = smmuv3_get_attr; + } + + static const TypeInfo smmuv3_type_info = { +diff --git a/include/exec/memory.h b/include/exec/memory.h +index c3180075e1..864bcaeb01 100644 +--- a/include/exec/memory.h ++++ b/include/exec/memory.h +@@ -321,7 +321,8 @@ typedef struct MemoryRegionClass { + + + enum IOMMUMemoryRegionAttr { +- IOMMU_ATTR_SPAPR_TCE_FD ++ IOMMU_ATTR_SPAPR_TCE_FD, ++ IOMMU_ATTR_VFIO_NESTED, + }; + + /* +-- +2.27.0 + diff --git a/memory-Add-new-fields-in-IOTLBEntry.patch b/memory-Add-new-fields-in-IOTLBEntry.patch new file mode 100644 index 0000000000000000000000000000000000000000..5a85dbfd61ecea58ff1942d24a004b2ed82e6fdd --- /dev/null +++ b/memory-Add-new-fields-in-IOTLBEntry.patch @@ -0,0 +1,184 @@ +From da97cef20d4ee5a8f3942953836b35e7f7dd974f Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Tue, 4 Sep 2018 08:43:05 -0400 +Subject: [PATCH] memory: Add new fields in IOTLBEntry + +The current IOTLBEntry becomes too simple to interact with +some physical IOMMUs. IOTLBs can be invalidated with different +granularities: domain, pasid, addr. Current IOTLB entry only offers +page selective invalidation. Let's add a granularity field +that conveys this information. + +TLB entries are usually tagged with some ids such as the asid +or pasid. When propagating an invalidation command from the +guest to the host, we need to pass those IDs. + +Also we add a leaf field which indicates, in case of invalidation +notification, whether only cache entries for the last level of +translation are required to be invalidated. + +A flag field is introduced to inform whether those fields are set. + +To enforce all existing users do not use those new fields, +initialize the IOMMUTLBEvents when needed. 
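+
+A consumer that understands the new fields can dispatch on the
+granularity. A hedged sketch (hypothetical function, only meaningful
+for UNMAP/invalidation events):
+
+    static void example_invalidate(IOMMUTLBEntry *entry)
+    {
+        switch (entry->granularity) {
+        case IOMMU_INV_GRAN_DOMAIN:
+            /* Whole-domain invalidation: iova/arch_id are irrelevant. */
+            break;
+        case IOMMU_INV_GRAN_PASID:
+            /* entry->arch_id (or entry->pasid) selects the target,
+             * valid when the matching bit is set in entry->flags. */
+            break;
+        case IOMMU_INV_GRAN_ADDR:
+        default:
+            /* Page-selective: use entry->iova, entry->addr_mask and,
+             * when IOMMU_INV_FLAGS_LEAF is set, entry->leaf. */
+            break;
+        }
+    }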
+ +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/arm/smmu-common.c | 2 +- + hw/arm/smmuv3.c | 2 +- + hw/i386/intel_iommu.c | 6 +++--- + hw/ppc/spapr_iommu.c | 2 +- + hw/virtio/virtio-iommu.c | 4 ++-- + include/exec/memory.h | 36 +++++++++++++++++++++++++++++++++++- + 6 files changed, 43 insertions(+), 9 deletions(-) + +diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c +index 0459850a93..3a1ecf81d6 100644 +--- a/hw/arm/smmu-common.c ++++ b/hw/arm/smmu-common.c +@@ -470,7 +470,7 @@ IOMMUMemoryRegion *smmu_iommu_mr(SMMUState *s, uint32_t sid) + /* Unmap the whole notifier's range */ + static void smmu_unmap_notifier_range(IOMMUNotifier *n) + { +- IOMMUTLBEvent event; ++ IOMMUTLBEvent event = {}; + + event.type = IOMMU_NOTIFIER_UNMAP; + event.entry.target_as = &address_space_memory; +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index 01b60bee49..94e2c658f8 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -802,7 +802,7 @@ static void smmuv3_notify_iova(IOMMUMemoryRegion *mr, + uint8_t tg, uint64_t num_pages) + { + SMMUDevice *sdev = container_of(mr, SMMUDevice, iommu); +- IOMMUTLBEvent event; ++ IOMMUTLBEvent event = {}; + uint8_t granule; + + if (!tg) { +diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c +index f584449d8d..fae282ef5e 100644 +--- a/hw/i386/intel_iommu.c ++++ b/hw/i386/intel_iommu.c +@@ -1193,7 +1193,7 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start, + uint32_t offset; + uint64_t slpte; + uint64_t subpage_size, subpage_mask; +- IOMMUTLBEvent event; ++ IOMMUTLBEvent event = {}; + uint64_t iova = start; + uint64_t iova_next; + int ret = 0; +@@ -2425,7 +2425,7 @@ static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s, + VTDInvDesc *inv_desc) + { + VTDAddressSpace *vtd_dev_as; +- IOMMUTLBEvent event; ++ IOMMUTLBEvent event = {}; + struct VTDBus *vtd_bus; + hwaddr addr; + uint64_t sz; +@@ -3481,7 +3481,7 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n) + size = remain = end - start + 1; + + while (remain >= VTD_PAGE_SIZE) { +- IOMMUTLBEvent event; ++ IOMMUTLBEvent event = {}; + uint64_t mask = dma_aligned_pow2_mask(start, end, s->aw_bits); + uint64_t size = mask + 1; + +diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c +index db01071858..454df25d44 100644 +--- a/hw/ppc/spapr_iommu.c ++++ b/hw/ppc/spapr_iommu.c +@@ -449,7 +449,7 @@ static void spapr_tce_reset(DeviceState *dev) + static target_ulong put_tce_emu(SpaprTceTable *tcet, target_ulong ioba, + target_ulong tce) + { +- IOMMUTLBEvent event; ++ IOMMUTLBEvent event = {}; + hwaddr page_mask = IOMMU_PAGE_MASK(tcet->page_shift); + unsigned long index = (ioba - tcet->bus_offset) >> tcet->page_shift; + +diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c +index 1b23e8e18c..83ed2b82e6 100644 +--- a/hw/virtio/virtio-iommu.c ++++ b/hw/virtio/virtio-iommu.c +@@ -129,7 +129,7 @@ static void virtio_iommu_notify_map(IOMMUMemoryRegion *mr, hwaddr virt_start, + hwaddr virt_end, hwaddr paddr, + uint32_t flags) + { +- IOMMUTLBEvent event; ++ IOMMUTLBEvent event = {}; + IOMMUAccessFlags perm = IOMMU_ACCESS_FLAG(flags & VIRTIO_IOMMU_MAP_F_READ, + flags & VIRTIO_IOMMU_MAP_F_WRITE); + +@@ -154,7 +154,7 @@ static void virtio_iommu_notify_map(IOMMUMemoryRegion *mr, hwaddr virt_start, + static void virtio_iommu_notify_unmap(IOMMUMemoryRegion *mr, hwaddr virt_start, + hwaddr virt_end) + { +- IOMMUTLBEvent event; ++ IOMMUTLBEvent event = {}; + uint64_t delta = virt_end - virt_start; + + if (!(mr->iommu_notify_flags & IOMMU_NOTIFIER_UNMAP)) 
{ +diff --git a/include/exec/memory.h b/include/exec/memory.h +index 20f1b27377..c3180075e1 100644 +--- a/include/exec/memory.h ++++ b/include/exec/memory.h +@@ -113,14 +113,48 @@ typedef enum { + IOMMU_RW = 3, + } IOMMUAccessFlags; + ++/* Granularity of the cache invalidation */ ++typedef enum { ++ IOMMU_INV_GRAN_ADDR = 0, ++ IOMMU_INV_GRAN_PASID, ++ IOMMU_INV_GRAN_DOMAIN, ++} IOMMUInvGranularity; ++ + #define IOMMU_ACCESS_FLAG(r, w) (((r) ? IOMMU_RO : 0) | ((w) ? IOMMU_WO : 0)) + ++/** ++ * struct IOMMUTLBEntry - IOMMU TLB entry ++ * ++ * Structure used when performing a translation or when notifying MAP or ++ * UNMAP (invalidation) events ++ * ++ * @target_as: target address space ++ * @iova: IO virtual address (input) ++ * @translated_addr: translated address (output) ++ * @addr_mask: address mask (0xfff means 4K binding), must be multiple of 2 ++ * @perm: permission flag of the mapping (NONE encodes no mapping or ++ * invalidation notification) ++ * @granularity: granularity of the invalidation ++ * @flags: informs whether the following fields are set ++ * @arch_id: architecture specific ID tagging the TLB ++ * @pasid: PASID tagging the TLB ++ * @leaf: when @perm is NONE, indicates whether only caches for the last ++ * level of translation need to be invalidated. ++ */ + struct IOMMUTLBEntry { + AddressSpace *target_as; + hwaddr iova; + hwaddr translated_addr; +- hwaddr addr_mask; /* 0xfff = 4k translation */ ++ hwaddr addr_mask; + IOMMUAccessFlags perm; ++ IOMMUInvGranularity granularity; ++#define IOMMU_INV_FLAGS_PASID (1 << 0) ++#define IOMMU_INV_FLAGS_ARCHID (1 << 1) ++#define IOMMU_INV_FLAGS_LEAF (1 << 2) ++ uint32_t flags; ++ uint32_t arch_id; ++ uint32_t pasid; ++ bool leaf; + }; + + /* +-- +2.27.0 + diff --git a/memory-Introduce-IOMMU-Memory-Region-inject_faults-A.patch b/memory-Introduce-IOMMU-Memory-Region-inject_faults-A.patch new file mode 100644 index 0000000000000000000000000000000000000000..e541d9e69d716560a8d5636a25d072b3d05c765f --- /dev/null +++ b/memory-Introduce-IOMMU-Memory-Region-inject_faults-A.patch @@ -0,0 +1,88 @@ +From d2dce19165f133935ff72e209f19bc43ab4d1421 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Thu, 13 Sep 2018 14:13:04 +0200 +Subject: [PATCH] memory: Introduce IOMMU Memory Region inject_faults API + +This new API allows to inject @count iommu_faults into +the IOMMU memory region. + +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + include/exec/memory.h | 24 ++++++++++++++++++++++++ + softmmu/memory.c | 10 ++++++++++ + 2 files changed, 34 insertions(+) + +diff --git a/include/exec/memory.h b/include/exec/memory.h +index 76ef99ed27..3e84d62e40 100644 +--- a/include/exec/memory.h ++++ b/include/exec/memory.h +@@ -103,6 +103,8 @@ struct MemoryRegionSection { + bool nonvolatile; + }; + ++struct iommu_fault; ++ + typedef struct IOMMUTLBEntry IOMMUTLBEntry; + + /* See address_space_translate: bit 0 is read, bit 1 is write. 
*/ +@@ -523,6 +525,19 @@ struct IOMMUMemoryRegionClass { + int (*iommu_set_page_size_mask)(IOMMUMemoryRegion *iommu, + uint64_t page_size_mask, + Error **errp); ++ ++ /* ++ * Inject @count faults into the IOMMU memory region ++ * ++ * Optional method: if this method is not provided, then ++ * memory_region_injection_faults() will return -ENOENT ++ * ++ * @iommu: the IOMMU memory region to inject the faults in ++ * @count: number of faults to inject ++ * @buf: fault buffer ++ */ ++ int (*inject_faults)(IOMMUMemoryRegion *iommu, int count, ++ struct iommu_fault *buf); + }; + + typedef struct RamDiscardListener RamDiscardListener; +@@ -1819,6 +1834,15 @@ int memory_region_iommu_num_indexes(IOMMUMemoryRegion *iommu_mr); + int memory_region_iommu_set_page_size_mask(IOMMUMemoryRegion *iommu_mr, + uint64_t page_size_mask, + Error **errp); ++/** ++ * memory_region_inject_faults : inject @count faults stored in @buf ++ * ++ * @iommu_mr: the IOMMU memory region ++ * @count: number of faults to be injected ++ * @buf: buffer containing the faults ++ */ ++int memory_region_inject_faults(IOMMUMemoryRegion *iommu_mr, int count, ++ struct iommu_fault *buf); + + /** + * memory_region_name: get a memory region's name +diff --git a/softmmu/memory.c b/softmmu/memory.c +index 7340e19ff5..9f98209ab2 100644 +--- a/softmmu/memory.c ++++ b/softmmu/memory.c +@@ -2111,6 +2111,16 @@ void ram_discard_manager_unregister_listener(RamDiscardManager *rdm, + rdmc->unregister_listener(rdm, rdl); + } + ++int memory_region_inject_faults(IOMMUMemoryRegion *iommu_mr, int count, ++ struct iommu_fault *buf) ++{ ++ IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_GET_CLASS(iommu_mr); ++ if (!imrc->inject_faults) { ++ return -ENOENT; ++ } ++ return imrc->inject_faults(iommu_mr, count, buf); ++} ++ + void memory_region_set_log(MemoryRegion *mr, bool log, unsigned client) + { + uint8_t mask = 1 << client; +-- +2.27.0 + diff --git a/monitor-Discard-BLOCK_IO_ERROR-event-when-VM-reboote.patch b/monitor-Discard-BLOCK_IO_ERROR-event-when-VM-reboote.patch new file mode 100644 index 0000000000000000000000000000000000000000..13d16d03846771b316c9a42c3e616d473c299984 --- /dev/null +++ b/monitor-Discard-BLOCK_IO_ERROR-event-when-VM-reboote.patch @@ -0,0 +1,97 @@ +From f5af9ac3c9af4602812060759f6f95da8725314b Mon Sep 17 00:00:00 2001 +From: Yan Wang +Date: Thu, 10 Feb 2022 11:18:13 +0800 +Subject: [PATCH] monitor: Discard BLOCK_IO_ERROR event when VM rebooted + +Throttled event like QAPI_EVENT_BLOCK_IO_ERROR may be queued +to limit event rate. Event may be delivered when VM is rebooted +if the event was queued in the *monitor_qapi_event_state* hash table. +Which may casue VM pause and other related problems. 
+Such as seabios blocked during virtio-scsi initialization: + vring_add_buf(vq, sg, out_num, in_num, 0, 0); + vring_kick(vp, vq, 1); + ------------> VM paused here <----------- + /* Wait for reply */ + while (!vring_more_used(vq)) usleep(5); + +Signed-off-by: Yan Wang +--- + include/monitor/monitor.h | 2 ++ + monitor/monitor.c | 30 ++++++++++++++++++++++++++++++ + softmmu/runstate.c | 1 + + 3 files changed, 33 insertions(+) + +diff --git a/include/monitor/monitor.h b/include/monitor/monitor.h +index 12d395d62d..847445f972 100644 +--- a/include/monitor/monitor.h ++++ b/include/monitor/monitor.h +@@ -56,4 +56,6 @@ void monitor_register_hmp(const char *name, bool info, + void monitor_register_hmp_info_hrt(const char *name, + HumanReadableText *(*handler)(Error **errp)); + ++void monitor_qapi_event_discard_io_error(void); ++ + #endif /* MONITOR_H */ +diff --git a/monitor/monitor.c b/monitor/monitor.c +index 013c628695..fb4ae9531c 100644 +--- a/monitor/monitor.c ++++ b/monitor/monitor.c +@@ -34,6 +34,9 @@ + #include "qemu/option.h" + #include "sysemu/qtest.h" + #include "trace.h" ++#include "qemu/log.h" ++#include "qapi/qmp/qjson.h" ++#include "qapi/qmp/qobject.h" + + /* + * To prevent flooding clients, events can be throttled. The +@@ -767,6 +770,33 @@ int monitor_init_opts(QemuOpts *opts, Error **errp) + return ret; + } + ++void monitor_qapi_event_discard_io_error(void) ++{ ++ GHashTableIter event_iter; ++ MonitorQAPIEventState *evstate; ++ gpointer key, value; ++ GString *json; ++ ++ qemu_mutex_lock(&monitor_lock); ++ g_hash_table_iter_init(&event_iter, monitor_qapi_event_state); ++ while (g_hash_table_iter_next(&event_iter, &key, &value)) { ++ evstate = key; ++ /* Only QAPI_EVENT_BLOCK_IO_ERROR is discarded */ ++ if (evstate->event == QAPI_EVENT_BLOCK_IO_ERROR) { ++ g_hash_table_iter_remove(&event_iter); ++ json = qobject_to_json(QOBJECT(evstate->qdict)); ++ qemu_log(" %s event discarded\n", json->str); ++ timer_del(evstate->timer); ++ timer_free(evstate->timer); ++ qobject_unref(evstate->data); ++ qobject_unref(evstate->qdict); ++ g_string_free(json, true); ++ g_free(evstate); ++ } ++ } ++ qemu_mutex_unlock(&monitor_lock); ++} ++ + QemuOptsList qemu_mon_opts = { + .name = "mon", + .implied_opt_name = "chardev", +diff --git a/softmmu/runstate.c b/softmmu/runstate.c +index 10d9b7365a..5736d908db 100644 +--- a/softmmu/runstate.c ++++ b/softmmu/runstate.c +@@ -448,6 +448,7 @@ void qemu_system_reset(ShutdownCause reason) + qapi_event_send_reset(shutdown_caused_by_guest(reason), reason); + } + cpu_synchronize_all_post_reset(); ++ monitor_qapi_event_discard_io_error(); + } + + /* +-- +2.27.0 + diff --git a/monitor-limit-io-error-qmp-event-to-at-most-once-per.patch b/monitor-limit-io-error-qmp-event-to-at-most-once-per.patch new file mode 100644 index 0000000000000000000000000000000000000000..2b3b02f82514599c996d2ba05950ac2445afd773 --- /dev/null +++ b/monitor-limit-io-error-qmp-event-to-at-most-once-per.patch @@ -0,0 +1,29 @@ +From 44f45b5c163efed5387dac40e229e0a50bf5921a Mon Sep 17 00:00:00 2001 +From: Yan Wang +Date: Thu, 10 Feb 2022 11:35:58 +0800 +Subject: [PATCH] monitor: limit io error qmp event to at most once per 60s + +The speed of BLOCK IO ERROR event maybe very high (thousands per +second). If we report all BLOCK IO ERRORs, the log file will be flooded +with BLOCK IO ERROR event. So throttle it to at most once per 60s. 
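+
+The monitor throttle keeps at most one pending instance of a
+rate-limited event and emits it when the window expires. A standalone
+sketch of that semantics, with names invented for illustration:
+
+    #include <stdint.h>
+    #include <stdbool.h>
+
+    typedef struct Throttle {
+        int64_t rate_ns;       /* 0 means unthrottled */
+        int64_t last_emit_ns;
+        bool pending;          /* a coalesced event awaits the timer */
+    } Throttle;
+
+    static bool throttle_may_emit(Throttle *t, int64_t now_ns)
+    {
+        if (t->rate_ns == 0 || now_ns - t->last_emit_ns >= t->rate_ns) {
+            t->last_emit_ns = now_ns;
+            return true;       /* deliver immediately */
+        }
+        t->pending = true;     /* keep only the most recent event */
+        return false;          /* the timer will flush it later */
+    }
+
+The QAPI_EVENT_BLOCK_IO_ERROR entry added below evaluates to
+60 * 1000 * SCALE_MS = 60 seconds, i.e. at most one event per minute.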
+
+Signed-off-by: Yan Wang
+---
+ monitor/monitor.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/monitor/monitor.c b/monitor/monitor.c
+index fb4ae9531c..621e79eb66 100644
+--- a/monitor/monitor.c
++++ b/monitor/monitor.c
+@@ -300,6 +300,7 @@ static MonitorQAPIEventConf monitor_qapi_event_conf[QAPI_EVENT__MAX] = {
+     [QAPI_EVENT_QUORUM_FAILURE]    = { 1000 * SCALE_MS },
+     [QAPI_EVENT_VSERPORT_CHANGE]   = { 1000 * SCALE_MS },
+     [QAPI_EVENT_MEMORY_DEVICE_SIZE_CHANGE] = { 1000 * SCALE_MS },
++    [QAPI_EVENT_BLOCK_IO_ERROR] = { 60L * 1000 * SCALE_MS },
+ };
+ 
+ /*
+-- 
+2.27.0
+
diff --git a/net-eepro100-validate-various-address-valuesi-CVE-20.patch b/net-eepro100-validate-various-address-valuesi-CVE-20.patch
new file mode 100644
index 0000000000000000000000000000000000000000..47095713aabaf1232ac04d75dc97f189c67df4aa
--- /dev/null
+++ b/net-eepro100-validate-various-address-valuesi-CVE-20.patch
@@ -0,0 +1,58 @@
+From 5db012b1116d21c64da88ad206b3589ddf5f219b Mon Sep 17 00:00:00 2001
+From: zhouli57
+Date: Sat, 18 Dec 2021 09:39:57 +0800
+Subject: [PATCH] net: eepro100: validate various address
+ values (CVE-2021-20255)
+
+Fix CVE-2021-20255; the patch is synced from the ostms platform.
+
+patch link: https://lists.gnu.org/archive/html/qemu-devel/2021-02/msg06098.html
+
+Signed-off-by: zhouli57
+Signed-off-by: Yan Wang
+---
+ hw/net/eepro100.c | 12 ++++++++++++
+ 1 file changed, 12 insertions(+)
+
+diff --git a/hw/net/eepro100.c b/hw/net/eepro100.c
+index 16e95ef9cc..2474cf3dc2 100644
+--- a/hw/net/eepro100.c
++++ b/hw/net/eepro100.c
+@@ -279,6 +279,9 @@ typedef struct {
+     /* Quasi static device properties (no need to save them). */
+     uint16_t stats_size;
+     bool has_extended_tcb_support;
++
++    /* Flag to avoid recursions. */
++    bool busy;
+ } EEPRO100State;
+ 
+ /* Word indices in EEPROM. */
+@@ -837,6 +840,14 @@ static void action_command(EEPRO100State *s)
+        Therefore we limit the number of iterations. */
+     unsigned max_loop_count = 16;
+ 
++    if (s->busy) {
++        /* Prevent recursions. */
++        logout("recursion in %s:%u\n", __FILE__, __LINE__);
++        return;
++    }
++
++    s->busy = true;
++
+     for (;;) {
+         bool bit_el;
+         bool bit_s;
+@@ -933,6 +944,7 @@ static void action_command(EEPRO100State *s)
+     }
+     TRACE(OTHER, logout("CU list empty\n"));
+     /* List is empty. Now CU is idle or suspended.
*/ ++ s->busy = false; + } + + static void eepro100_cu_command(EEPRO100State * s, uint8_t val) +-- +2.27.0 + diff --git a/pci-Add-return_page_response-pci-ops.patch b/pci-Add-return_page_response-pci-ops.patch new file mode 100644 index 0000000000000000000000000000000000000000..25e665f8bb14c8004760a11f67e8d7b572fc1b17 --- /dev/null +++ b/pci-Add-return_page_response-pci-ops.patch @@ -0,0 +1,86 @@ +From 228345cfa59c764e725e2d3680a4bc3ecb237609 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Fri, 6 Nov 2020 14:34:35 +0100 +Subject: [PATCH] pci: Add return_page_response pci ops + +Add a new PCI operation that allows to return page responses +to registered VFIO devices + +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/pci/pci.c | 16 ++++++++++++++++ + include/hw/iommu/iommu.h | 8 ++++++++ + include/hw/pci/pci.h | 4 ++++ + 3 files changed, 28 insertions(+) + +diff --git a/hw/pci/pci.c b/hw/pci/pci.c +index 4a9374c025..64db325d6b 100644 +--- a/hw/pci/pci.c ++++ b/hw/pci/pci.c +@@ -2793,6 +2793,22 @@ int pci_device_set_pasid_table(PCIBus *bus, int32_t devfn, + return -ENOENT; + } + ++int pci_device_return_page_response(PCIBus *bus, int32_t devfn, ++ IOMMUPageResponse *resp) ++{ ++ PCIDevice *dev; ++ ++ if (!bus) { ++ return -EINVAL; ++ } ++ ++ dev = bus->devices[devfn]; ++ if (dev && dev->pasid_ops && dev->pasid_ops->return_page_response) { ++ return dev->pasid_ops->return_page_response(bus, devfn, resp); ++ } ++ return -ENOENT; ++} ++ + static void pci_dev_get_w64(PCIBus *b, PCIDevice *dev, void *opaque) + { + Range *range = opaque; +diff --git a/include/hw/iommu/iommu.h b/include/hw/iommu/iommu.h +index 12092bda7b..5890f095b1 100644 +--- a/include/hw/iommu/iommu.h ++++ b/include/hw/iommu/iommu.h +@@ -24,5 +24,13 @@ typedef struct IOMMUConfig { + }; + } IOMMUConfig; + ++typedef struct IOMMUPageResponse { ++ union { ++#ifdef __linux__ ++ struct iommu_page_response resp; ++#endif ++ }; ++} IOMMUPageResponse; ++ + + #endif /* QEMU_HW_IOMMU_IOMMU_H */ +diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h +index abffa12a99..809eb32f4a 100644 +--- a/include/hw/pci/pci.h ++++ b/include/hw/pci/pci.h +@@ -268,6 +268,8 @@ typedef struct PCIReqIDCache PCIReqIDCache; + + struct PCIPASIDOps { + int (*set_pasid_table)(PCIBus *bus, int32_t devfn, IOMMUConfig *config); ++ int (*return_page_response)(PCIBus *bus, int32_t devfn, ++ IOMMUPageResponse *resp); + }; + typedef struct PCIPASIDOps PCIPASIDOps; + +@@ -508,6 +510,8 @@ void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, void *opaque); + void pci_setup_pasid_ops(PCIDevice *dev, PCIPASIDOps *ops); + bool pci_device_is_pasid_ops_set(PCIBus *bus, int32_t devfn); + int pci_device_set_pasid_table(PCIBus *bus, int32_t devfn, IOMMUConfig *config); ++int pci_device_return_page_response(PCIBus *bus, int32_t devfn, ++ IOMMUPageResponse *resp); + + static inline void + pci_set_byte(uint8_t *config, uint8_t val) +-- +2.27.0 + diff --git a/pci-check-bus-pointer-before-dereference.patch b/pci-check-bus-pointer-before-dereference.patch new file mode 100644 index 0000000000000000000000000000000000000000..fbd30a32937a53dede688323163c8d229e93a054 --- /dev/null +++ b/pci-check-bus-pointer-before-dereference.patch @@ -0,0 +1,51 @@ +From 92da19fb18c234bb8872b9d8f7dedcc73e5fcafb Mon Sep 17 00:00:00 2001 +From: Prasad J Pandit +Date: Wed, 14 Oct 2020 15:00:20 +0800 +Subject: [PATCH] pci: check bus pointer before dereference + +fix CVE-2020-25742 + +patch link: https://lists.nongnu.org/archive/html/qemu-devel/2020-09/msg05294.html + +While mapping IRQ level 
in the pci_change_irq_level() routine,
+the code does not check whether pci_get_bus() returned a valid
+pointer; this may lead to a NULL pointer dereference. Add a check
+to avoid it.
+
+ -> https://ruhr-uni-bochum.sciebo.de/s/NNWP2GfwzYKeKwE?path=%2Flsi_nullptr1
+ ==1183858==Hint: address points to the zero page.
+ #0 pci_change_irq_level hw/pci/pci.c:259
+ #1 pci_irq_handler hw/pci/pci.c:1445
+ #2 pci_set_irq hw/pci/pci.c:1463
+ #3 lsi_set_irq hw/scsi/lsi53c895a.c:488
+ #4 lsi_update_irq hw/scsi/lsi53c895a.c:523
+ #5 lsi_script_scsi_interrupt hw/scsi/lsi53c895a.c:554
+ #6 lsi_execute_script hw/scsi/lsi53c895a.c:1149
+ #7 lsi_reg_writeb hw/scsi/lsi53c895a.c:1984
+ #8 lsi_io_write hw/scsi/lsi53c895a.c:2146
+ ...
+
+Reported-by: Ruhr-University
+Signed-off-by: Prasad J Pandit
+Signed-off-by: Yan Wang
+---
+ hw/pci/pci.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/hw/pci/pci.c b/hw/pci/pci.c
+index e5993c1ef5..6d1c39a9de 100644
+--- a/hw/pci/pci.c
++++ b/hw/pci/pci.c
+@@ -270,6 +270,9 @@ static void pci_change_irq_level(PCIDevice *pci_dev, int irq_num, int change)
+     PCIBus *bus;
+     for (;;) {
+         bus = pci_get_bus(pci_dev);
++        if (!bus) {
++            return;
++        }
+         irq_num = bus->map_irq(pci_dev, irq_num);
+         if (bus->set_irq)
+             break;
+-- 
+2.27.0
+
diff --git a/pci-introduce-PCIPASIDOps-to-PCIDevice.patch b/pci-introduce-PCIPASIDOps-to-PCIDevice.patch
new file mode 100644
index 0000000000000000000000000000000000000000..f4cb611607899f89956b788e615d2d964b3a71ec
--- /dev/null
+++ b/pci-introduce-PCIPASIDOps-to-PCIDevice.patch
@@ -0,0 +1,127 @@
+From c71485494970e7aa986be2b05bf7e2847017e264 Mon Sep 17 00:00:00 2001
+From: Liu Yi L
+Date: Fri, 5 Jul 2019 19:01:36 +0800
+Subject: [PATCH] pci: introduce PCIPASIDOps to PCIDevice
+
+This patch introduces PCIPASIDOps for IOMMU related operations.
+
+https://lists.gnu.org/archive/html/qemu-devel/2018-03/msg00078.html
+https://lists.gnu.org/archive/html/qemu-devel/2018-03/msg00940.html
+
+So far, setting up virt-SVA for an assigned SVA-capable device
+requires configuring host translation structures for a specific
+PASID (e.g. binding the guest page table to the host and enabling
+nested translation in the host). Besides, the vIOMMU emulator needs
+to forward the guest's cache invalidations to the host once nested
+translation is enabled; e.g. on VT-d the guest owns the 1st level
+translation table, so cache invalidations for the 1st level must be
+propagated to the host.
+
+This patch adds the PCIPASIDOps structure along with the
+pci_setup_pasid_ops(), pci_device_is_pasid_ops_set() and
+pci_device_set_pasid_table() helpers. The callbacks are expected to
+be implemented by device passthrough modules such as VFIO.
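+
+As an illustration, a passthrough module would register its callbacks
+roughly as follows (sketch only; vfio_iommu_set_pasid_table is a
+hypothetical callback name, pci_setup_pasid_ops is the helper added
+below):
+
+    static PCIPASIDOps vfio_pci_pasid_ops = {
+        .set_pasid_table = vfio_iommu_set_pasid_table,
+    };
+
+    /* at device realize time */
+    pci_setup_pasid_ops(pdev, &vfio_pci_pasid_ops);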
+
+Cc: Kevin Tian
+Cc: Jacob Pan
+Cc: Peter Xu
+Cc: Eric Auger
+Cc: Yi Sun
+Cc: David Gibson
+Signed-off-by: Liu Yi L
+Signed-off-by: Yi Sun
+Signed-off-by: Kunkun Jiang
+---
+ hw/pci/pci.c         | 34 ++++++++++++++++++++++++++++++++++
+ include/hw/pci/pci.h | 11 +++++++++++
+ 2 files changed, 45 insertions(+)
+
+diff --git a/hw/pci/pci.c b/hw/pci/pci.c
+index e5993c1ef5..4a9374c025 100644
+--- a/hw/pci/pci.c
++++ b/hw/pci/pci.c
+@@ -2759,6 +2759,40 @@ void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, void *opaque)
+     bus->iommu_opaque = opaque;
+ }
+ 
++void pci_setup_pasid_ops(PCIDevice *dev, PCIPASIDOps *ops)
++{
++    assert(ops && !dev->pasid_ops);
++    dev->pasid_ops = ops;
++}
++
++bool pci_device_is_pasid_ops_set(PCIBus *bus, int32_t devfn)
++{
++    PCIDevice *dev;
++
++    if (!bus) {
++        return false;
++    }
++
++    dev = bus->devices[devfn];
++    return !!(dev && dev->pasid_ops);
++}
++
++int pci_device_set_pasid_table(PCIBus *bus, int32_t devfn,
++                               IOMMUConfig *config)
++{
++    PCIDevice *dev;
++
++    if (!bus) {
++        return -EINVAL;
++    }
++
++    dev = bus->devices[devfn];
++    if (dev && dev->pasid_ops && dev->pasid_ops->set_pasid_table) {
++        return dev->pasid_ops->set_pasid_table(bus, devfn, config);
++    }
++    return -ENOENT;
++}
++
+ static void pci_dev_get_w64(PCIBus *b, PCIDevice *dev, void *opaque)
+ {
+     Range *range = opaque;
+diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
+index e7cdf2d5ec..abffa12a99 100644
+--- a/include/hw/pci/pci.h
++++ b/include/hw/pci/pci.h
+@@ -9,6 +9,7 @@
+ 
+ #include "hw/pci/pcie.h"
+ #include "qom/object.h"
++#include "hw/iommu/iommu.h"
+ 
+ extern bool pci_available;
+ 
+@@ -265,6 +266,11 @@ struct PCIReqIDCache {
+ };
+ typedef struct PCIReqIDCache PCIReqIDCache;
+ 
++struct PCIPASIDOps {
++    int (*set_pasid_table)(PCIBus *bus, int32_t devfn, IOMMUConfig *config);
++};
++typedef struct PCIPASIDOps PCIPASIDOps;
++
+ struct PCIDevice {
+     DeviceState qdev;
+     bool partially_hotplugged;
+@@ -361,6 +367,7 @@ struct PCIDevice {
+     /* ID of standby device in net_failover pair */
+     char *failover_pair_id;
+     uint32_t acpi_index;
++    PCIPASIDOps *pasid_ops;
+ };
+ 
+ void pci_register_bar(PCIDevice *pci_dev, int region_num,
+@@ -498,6 +505,10 @@ typedef AddressSpace *(*PCIIOMMUFunc)(PCIBus *, void *, int);
+ AddressSpace *pci_device_iommu_address_space(PCIDevice *dev);
+ void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, void *opaque);
+ 
++void pci_setup_pasid_ops(PCIDevice *dev, PCIPASIDOps *ops);
++bool pci_device_is_pasid_ops_set(PCIBus *bus, int32_t devfn);
++int pci_device_set_pasid_table(PCIBus *bus, int32_t devfn, IOMMUConfig *config);
++
+ static inline void
+ pci_set_byte(uint8_t *config, uint8_t val)
+ {
+-- 
+2.27.0
+
diff --git a/qdev-monitors-Fix-reundant-error_setg-of-qdev_add_de.patch b/qdev-monitors-Fix-reundant-error_setg-of-qdev_add_de.patch
new file mode 100644
index 0000000000000000000000000000000000000000..e02dbf6f365184c098159a72f29b7e256d235dac
--- /dev/null
+++ b/qdev-monitors-Fix-reundant-error_setg-of-qdev_add_de.patch
@@ -0,0 +1,31 @@
+From ada323e932c83271184a6ddba1cfd74a29378963 Mon Sep 17 00:00:00 2001
+From: Kunkun Jiang
+Date: Thu, 29 Jul 2021 15:24:48 +0800
+Subject: [PATCH] qdev/monitors: Fix redundant error_setg of qdev_add_device
+
+There is a redundant error_setg() call in qdev_add_device(). When
+hot-plugging a device, if the corresponding bus doesn't exist, the
+duplicate call triggers the assertion "assert(*errp == NULL)".
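+
+A simplified sketch of the problematic path (qbus_find() already sets
+*errp on failure, so the second error_setg() below is what trips
+error_setv()'s assert(*errp == NULL)):
+
+    bus = qbus_find(path, errp);          /* sets *errp when no bus */
+    if (!bus) {
+        /* redundant: *errp is already set at this point */
+        error_setg(errp, "can not find bus for %s", driver);
+        return NULL;
+    }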
+ +Fixes: 515a7970490 (log: Add some logs on VM runtime path) +Signed-off-by: Kunkun Jiang +Signed-off-by: Yan Wang +--- + softmmu/qdev-monitor.c | 1 - + 1 file changed, 1 deletion(-) + +diff --git a/softmmu/qdev-monitor.c b/softmmu/qdev-monitor.c +index dfd6429bf3..4a20f5dbd7 100644 +--- a/softmmu/qdev-monitor.c ++++ b/softmmu/qdev-monitor.c +@@ -636,7 +636,6 @@ DeviceState *qdev_device_add_from_qdict(const QDict *opts, + if (path != NULL) { + bus = qbus_find(path, errp); + if (!bus) { +- error_setg(errp, "can not find bus for %s", driver); + return NULL; + } + if (!object_dynamic_cast(OBJECT(bus), dc->bus_type)) { +-- +2.27.0 + diff --git a/qemu.spec b/qemu.spec index 5e4aca1016b0dff4b89bbba158d4ac265ccffed2..4fdd311ebf794191fd2091cfff1deffd8be8366b 100644 --- a/qemu.spec +++ b/qemu.spec @@ -1,6 +1,6 @@ Name: qemu Version: 6.2.0 -Release: 11 +Release: 12 Epoch: 2 Summary: QEMU is a generic and open source machine emulator and virtualizer License: GPLv2 and BSD and MIT and CC-BY-SA-4.0 @@ -86,6 +86,58 @@ Patch0073: seabios-increase-the-seabios-high-mem-zone-size.patch Patch0074: seabios-increase-the-seabios-minibiostable.patch Patch0075: IPv6-add-support-for-IPv6-protocol.patch Patch0076: Use-post-increment-only-in-inffast.c.patch +Patch0077: util-log-add-CONFIG_DISABLE_QEMU_LOG-macro.patch +Patch0078: log-Add-some-logs-on-VM-runtime-path.patch +Patch0079: qdev-monitors-Fix-reundant-error_setg-of-qdev_add_de.patch +Patch0080: bios-tables-test-Allow-changes-to-q35-SSDT.dimmpxm-f.patch +Patch0081: smbios-Add-missing-member-of-type-4-for-smbios-3.0.patch +Patch0082: bios-tables-test-Update-expected-q35-SSDT.dimmpxm-fi.patch +Patch0083: net-eepro100-validate-various-address-valuesi-CVE-20.patch +Patch0084: pci-check-bus-pointer-before-dereference.patch +Patch0085: ide-ahci-add-check-to-avoid-null-dereference-CVE-201.patch +Patch0086: tap-return-err-when-tap-TUNGETIFF-fail.patch +Patch0087: xhci-check-reg-to-avoid-OOB-read.patch +Patch0088: monitor-Discard-BLOCK_IO_ERROR-event-when-VM-reboote.patch +Patch0089: monitor-limit-io-error-qmp-event-to-at-most-once-per.patch +Patch0090: linux-headers-update-against-5.10-and-manual-clear-v.patch +Patch0091: vfio-Maintain-DMA-mapping-range-for-the-container.patch +Patch0092: vfio-migration-Add-support-for-manual-clear-vfio-dir.patch +Patch0093: update-linux-headers-Import-iommu.h.patch +Patch0094: vfio.h-and-iommu.h-header-update-against-5.10.patch +Patch0095: memory-Add-new-fields-in-IOTLBEntry.patch +Patch0096: hw-arm-smmuv3-Improve-stage1-ASID-invalidation.patch +Patch0097: hw-arm-smmu-common-Allow-domain-invalidation-for-NH_.patch +Patch0098: memory-Add-IOMMU_ATTR_VFIO_NESTED-IOMMU-memory-regio.patch +Patch0099: memory-Add-IOMMU_ATTR_MSI_TRANSLATE-IOMMU-memory-reg.patch +Patch0100: memory-Introduce-IOMMU-Memory-Region-inject_faults-A.patch +Patch0101: iommu-Introduce-generic-header.patch +Patch0102: pci-introduce-PCIPASIDOps-to-PCIDevice.patch +Patch0103: vfio-Force-nested-if-iommu-requires-it.patch +Patch0104: vfio-Introduce-hostwin_from_range-helper.patch +Patch0105: vfio-Introduce-helpers-to-DMA-map-unmap-a-RAM-sectio.patch +Patch0106: vfio-Set-up-nested-stage-mappings.patch +Patch0107: vfio-Pass-stage-1-MSI-bindings-to-the-host.patch +Patch0108: vfio-Helper-to-get-IRQ-info-including-capabilities.patch +Patch0109: vfio-pci-Register-handler-for-iommu-fault.patch +Patch0110: vfio-pci-Set-up-the-DMA-FAULT-region.patch +Patch0111: vfio-pci-Implement-the-DMA-fault-handler.patch +Patch0112: hw-arm-smmuv3-Advertise-MSI_TRANSLATE-attribute.patch 
+Patch0113: hw-arm-smmuv3-Store-the-PASID-table-GPA-in-the-trans.patch
+Patch0114: hw-arm-smmuv3-Fill-the-IOTLBEntry-arch_id-on-NH_VA-i.patch
+Patch0115: hw-arm-smmuv3-Fill-the-IOTLBEntry-leaf-field-on-NH_V.patch
+Patch0116: hw-arm-smmuv3-Pass-stage-1-configurations-to-the-hos.patch
+Patch0117: hw-arm-smmuv3-Implement-fault-injection.patch
+Patch0118: hw-arm-smmuv3-Allow-MAP-notifiers.patch
+Patch0119: pci-Add-return_page_response-pci-ops.patch
+Patch0120: vfio-pci-Implement-return_page_response-page-respons.patch
+Patch0121: vfio-common-Avoid-unmap-ram-section-at-vfio_listener.patch
+Patch0122: vfio-Introduce-helpers-to-mark-dirty-pages-of-a-RAM-.patch
+Patch0123: vfio-Add-vfio_prereg_listener_log_sync-in-nested-sta.patch
+Patch0124: vfio-Add-vfio_prereg_listener_log_clear-to-re-enable.patch
+Patch0125: vfio-Add-vfio_prereg_listener_global_log_start-stop-.patch
+Patch0126: hw-arm-smmuv3-Post-load-stage-1-configurations-to-th.patch
+Patch0127: vfio-common-Fix-incorrect-address-alignment-in-vfio_.patch
+Patch0128: vfio-common-Add-address-alignment-check-in-vfio_list.patch
 
 BuildRequires: flex
 BuildRequires: gcc
@@ -530,6 +582,64 @@ getent passwd qemu >/dev/null || \
 %endif
 
 %changelog
+* Sat Feb 12 2022 Chen Qun
+- linux-headers: update against 5.10 and manual clear vfio dirty log series
+- vfio: Maintain DMA mapping range for the container
+- vfio/migration: Add support for manual clear vfio dirty log
+- update-linux-headers: Import iommu.h
+- vfio.h and iommu.h header update against 5.10
+- memory: Add new fields in IOTLBEntry
+- hw/arm/smmuv3: Improve stage1 ASID invalidation
+- hw/arm/smmu-common: Allow domain invalidation for NH_ALL/NSNH_ALL
+- memory: Add IOMMU_ATTR_VFIO_NESTED IOMMU memory region attribute
+- memory: Add IOMMU_ATTR_MSI_TRANSLATE IOMMU memory region attribute
+- memory: Introduce IOMMU Memory Region inject_faults API
+- iommu: Introduce generic header
+- pci: introduce PCIPASIDOps to PCIDevice
+- vfio: Force nested if iommu requires it
+- vfio: Introduce hostwin_from_range helper
+- vfio: Introduce helpers to DMA map/unmap a RAM section
+- vfio: Set up nested stage mappings
+- vfio: Pass stage 1 MSI bindings to the host
+- vfio: Helper to get IRQ info including capabilities
+- vfio/pci: Register handler for iommu fault
+- vfio/pci: Set up the DMA FAULT region
+- vfio/pci: Implement the DMA fault handler
+- hw/arm/smmuv3: Advertise MSI_TRANSLATE attribute
+- hw/arm/smmuv3: Store the PASID table GPA in the translation config
+- hw/arm/smmuv3: Fill the IOTLBEntry arch_id on NH_VA invalidation
+- hw/arm/smmuv3: Fill the IOTLBEntry leaf field on NH_VA invalidation
+- hw/arm/smmuv3: Pass stage 1 configurations to the host
+- hw/arm/smmuv3: Implement fault injection
+- hw/arm/smmuv3: Allow MAP notifiers
+- pci: Add return_page_response pci ops
+- vfio/pci: Implement return_page_response page response callback
+- vfio/common: Avoid unmap ram section at vfio_listener_region_del() in nested mode
+- vfio: Introduce helpers to mark dirty pages of a RAM section
+- vfio: Add vfio_prereg_listener_log_sync in nested stage
+- vfio: Add vfio_prereg_listener_log_clear to re-enable mark dirty pages
+- vfio: Add vfio_prereg_listener_global_log_start/stop in nested stage
+- hw/arm/smmuv3: Post-load stage 1 configurations to the host
+- vfio/common: Fix incorrect address alignment in vfio_dma_map_ram_section
+- vfio/common: Add address alignment check in vfio_listener_region_del
+
+* Sat Feb 12 2022 Chen Qun
+- log: Add some logs on VM runtime path
+- qdev/monitors: Fix redundant error_setg of
qdev_add_device
+- bios-tables-test: Allow changes to q35/SSDT.dimmpxm file
+- smbios: Add missing member of type 4 for smbios 3.0
+- bios-tables-test: Update expected q35/SSDT.dimmpxm file
+- net: eepro100: validate various address values (CVE-2021-20255)
+- pci: check bus pointer before dereference
+- ide: ahci: add check to avoid null dereference (CVE-2019-12067)
+- tap: return err when tap TUNGETIFF fail
+- xhci: check reg to avoid OOB read
+- monitor: Discard BLOCK_IO_ERROR event when VM rebooted
+- monitor: limit io error qmp event to at most once per 60s
+
+* Sat Feb 12 2022 Chen Qun
+- util/log: add CONFIG_DISABLE_QEMU_LOG macro
+
 * Sat Feb 12 2022 Yan Wang
 - ipxe: IPv6 add support for IPv6 protocol
 - u-boot: Use post increment only in inffast.c
diff --git a/smbios-Add-missing-member-of-type-4-for-smbios-3.0.patch b/smbios-Add-missing-member-of-type-4-for-smbios-3.0.patch
new file mode 100644
index 0000000000000000000000000000000000000000..f1f9e91b6c6fa36fe8e362c1001c6ef3c68af72c
--- /dev/null
+++ b/smbios-Add-missing-member-of-type-4-for-smbios-3.0.patch
@@ -0,0 +1,56 @@
+From 937e22eda2480a64095928ee8df0d37b3313bb64 Mon Sep 17 00:00:00 2001
+From: Ying Fang
+Date: Tue, 14 Apr 2020 14:53:44 +0800
+Subject: [PATCH] smbios: Add missing member of type 4 for smbios 3.0
+
+According to the SMBIOS 3.0 spec, processor information (type 4)
+gains three new members (Core Count 2, Core Enabled 2, Thread Count 2).
+Without these three members we cannot get the correct cpu frequency
+from DMI, because the length check on the Processor Information entry
+in DMI fails.
+
+The corresponding kernel code looks like:
+	if (dm->type == DMI_ENTRY_PROCESSOR &&
+	    dm->length >= DMI_ENTRY_PROCESSOR_MIN_LENGTH) {
+		u16 val = (u16)get_unaligned((const u16 *)
+			(dmi_data + DMI_PROCESSOR_MAX_SPEED));
+		*mhz = val > *mhz ?
val : *mhz;
+	}
+
+Signed-off-by: zhanghailiang
+Signed-off-by: Yan Wang
+---
+ hw/smbios/smbios.c           | 4 +++-
+ include/hw/firmware/smbios.h | 3 +++
+ 2 files changed, 6 insertions(+), 1 deletion(-)
+
+diff --git a/hw/smbios/smbios.c b/hw/smbios/smbios.c
+index 7397e56737..66be9aee09 100644
+--- a/hw/smbios/smbios.c
++++ b/hw/smbios/smbios.c
+@@ -688,7 +688,9 @@ static void smbios_build_type_4_table(MachineState *ms, unsigned instance)
+     t->thread_count = ms->smp.threads;
+     t->processor_characteristics = cpu_to_le16(0x02); /* Unknown */
+     t->processor_family2 = cpu_to_le16(0x01); /* Other */
+-
++    t->corecount2 = 0;
++    t->enabledcorecount2 = 0;
++    t->threadcount2 = 0;
+     SMBIOS_BUILD_TABLE_POST;
+     smbios_type4_count++;
+ }
+diff --git a/include/hw/firmware/smbios.h b/include/hw/firmware/smbios.h
+index 5a0dd0c8cf..5a696cf75a 100644
+--- a/include/hw/firmware/smbios.h
++++ b/include/hw/firmware/smbios.h
+@@ -193,6 +193,9 @@ struct smbios_type_4 {
+     uint8_t thread_count;
+     uint16_t processor_characteristics;
+     uint16_t processor_family2;
++    uint16_t corecount2;
++    uint16_t enabledcorecount2;
++    uint16_t threadcount2;
+ } QEMU_PACKED;
+ 
+ /* SMBIOS type 11 - OEM strings */
+-- 
+2.27.0
+
diff --git a/tap-return-err-when-tap-TUNGETIFF-fail.patch b/tap-return-err-when-tap-TUNGETIFF-fail.patch
new file mode 100644
index 0000000000000000000000000000000000000000..f74fa19abc42f16414dad411c06590d66c612922
--- /dev/null
+++ b/tap-return-err-when-tap-TUNGETIFF-fail.patch
@@ -0,0 +1,30 @@
+From 48a38f409a25f26605d65346c8ed9403c4b36c80 Mon Sep 17 00:00:00 2001
+From: Yan Wang
+Date: Thu, 10 Feb 2022 10:28:59 +0800
+Subject: [PATCH] tap: return err when tap TUNGETIFF fail
+
+When hot-plugging an OVS kernel netcard, the hotplug used to carry on
+even when the tap TUNGETIFF ioctl failed, eventually leading to a QEMU
+assertion. The failure should take the free_fail path instead.
+
+Signed-off-by: miaoyubo
+Signed-off-by: Yan Wang
+---
+ net/tap.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/net/tap.c b/net/tap.c
+index f716be3e3f..c5cbeaa7a2 100644
+--- a/net/tap.c
++++ b/net/tap.c
+@@ -900,6 +900,7 @@ int net_init_tap(const Netdev *netdev, const char *name,
+         if (i == 0) {
+             vnet_hdr = tap_probe_vnet_hdr(fd, errp);
+             if (vnet_hdr < 0) {
++                ret = -1;
+                 goto free_fail;
+             }
+         } else if (vnet_hdr != tap_probe_vnet_hdr(fd, NULL)) {
+-- 
+2.27.0
+
diff --git a/update-linux-headers-Import-iommu.h.patch b/update-linux-headers-Import-iommu.h.patch
new file mode 100644
index 0000000000000000000000000000000000000000..5653e6a4ddd6b5cdb7a68dbef54f010d0e3a1cda
--- /dev/null
+++ b/update-linux-headers-Import-iommu.h.patch
@@ -0,0 +1,29 @@
+From 694acf3c321908d26ce508842b7bd076664ffbc6 Mon Sep 17 00:00:00 2001
+From: Eric Auger
+Date: Thu, 9 May 2019 10:23:42 -0400
+Subject: [PATCH] update-linux-headers: Import iommu.h
+
+Update the script to import the new iommu.h uapi header.
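+
+For reference, the headers are then regenerated with something along
+the lines of (run from the QEMU tree; the kernel path is only an
+example):
+
+    ./scripts/update-linux-headers.sh /path/to/linux-5.10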
+
+Signed-off-by: Eric Auger
+Signed-off-by: Kunkun Jiang
+---
+ scripts/update-linux-headers.sh | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/scripts/update-linux-headers.sh b/scripts/update-linux-headers.sh
+index fea4d6eb65..acde610733 100755
+--- a/scripts/update-linux-headers.sh
++++ b/scripts/update-linux-headers.sh
+@@ -144,7 +144,7 @@ done
+ 
+ rm -rf "$output/linux-headers/linux"
+ mkdir -p "$output/linux-headers/linux"
+-for header in kvm.h vfio.h vfio_ccw.h vfio_zdev.h vhost.h \
++for header in kvm.h vfio.h vfio_ccw.h vfio_zdev.h vhost.h iommu.h \
+               psci.h psp-sev.h userfaultfd.h mman.h; do
+     cp "$tmpdir/include/linux/$header" "$output/linux-headers/linux"
+ done
+-- 
+2.27.0
+
diff --git a/util-log-add-CONFIG_DISABLE_QEMU_LOG-macro.patch b/util-log-add-CONFIG_DISABLE_QEMU_LOG-macro.patch
new file mode 100644
index 0000000000000000000000000000000000000000..f6940d69d12b29d8f6740cb87dec83b5eaa1356a
--- /dev/null
+++ b/util-log-add-CONFIG_DISABLE_QEMU_LOG-macro.patch
@@ -0,0 +1,41 @@
+From 05462305ec8b9ce5b414ede1e7e680b16d1a08ad Mon Sep 17 00:00:00 2001
+From: Yan Wang
+Date: Fri, 11 Feb 2022 18:20:59 +0800
+Subject: [PATCH] util/log: add CONFIG_DISABLE_QEMU_LOG macro
+
+Use the CONFIG_DISABLE_QEMU_LOG macro to turn the qemu_log()
+function into a no-op stub.
+
+Signed-off-by: Yan Wang
+---
+ util/log.c | 7 +++++++
+ 1 file changed, 7 insertions(+)
+
+diff --git a/util/log.c b/util/log.c
+index 2ee1500bee..ed3029fe5c 100644
+--- a/util/log.c
++++ b/util/log.c
+@@ -34,6 +34,12 @@ int qemu_loglevel;
+ static int log_append = 0;
+ static GArray *debug_regions;
+ 
++#ifdef CONFIG_DISABLE_QEMU_LOG
++int qemu_log(const char *fmt, ...)
++{
++    return 0;
++}
++#else
+ /* Return the number of characters emitted. */
+ int qemu_log(const char *fmt, ...)
+ {
+@@ -56,6 +62,7 @@ int qemu_log(const char *fmt, ...)
+     rcu_read_unlock();
+     return ret;
+ }
++#endif
+ 
+ static void __attribute__((__constructor__)) qemu_logfile_init(void)
+ {
+-- 
+2.27.0
+
diff --git a/vfio-Add-vfio_prereg_listener_global_log_start-stop-.patch b/vfio-Add-vfio_prereg_listener_global_log_start-stop-.patch
new file mode 100644
index 0000000000000000000000000000000000000000..962266210c9ed50027a893c453b59313f4fedc56
--- /dev/null
+++ b/vfio-Add-vfio_prereg_listener_global_log_start-stop-.patch
@@ -0,0 +1,71 @@
+From 287c63ab540533f1f9642e753c091caa7e6e2511 Mon Sep 17 00:00:00 2001
+From: Kunkun Jiang
+Date: Tue, 11 May 2021 10:08:15 +0800
+Subject: [PATCH] vfio: Add vfio_prereg_listener_global_log_start/stop in
+ nested stage
+
+In nested mode, we set up stage 2 and stage 1 separately. In my
+opinion, vfio_memory_prereg_listener is used for stage 2 and
+vfio_memory_listener is used for stage 1. So it feels weird to call
+the global_log_start/stop interface in vfio_memory_listener to switch
+dirty tracking, although this won't cause any errors. Adding a
+global_log_start/stop interface in vfio_memory_prereg_listener
+separates stage 2 from stage 1.
+ +Signed-off-by: Kunkun Jiang +--- + hw/vfio/common.c | 24 ++++++++++++++++++++++++ + 1 file changed, 24 insertions(+) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index 20c820aa74..65f3979492 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -1501,6 +1501,17 @@ static void vfio_listener_log_global_start(MemoryListener *listener) + { + VFIOContainer *container = container_of(listener, VFIOContainer, listener); + ++ /* For nested mode, vfio_prereg_listener is used to start dirty tracking */ ++ if (container->iommu_type != VFIO_TYPE1_NESTING_IOMMU) { ++ vfio_set_dirty_page_tracking(container, true); ++ } ++} ++ ++static void vfio_prereg_listener_log_global_start(MemoryListener *listener) ++{ ++ VFIOContainer *container = ++ container_of(listener, VFIOContainer, prereg_listener); ++ + vfio_set_dirty_page_tracking(container, true); + } + +@@ -1508,6 +1519,17 @@ static void vfio_listener_log_global_stop(MemoryListener *listener) + { + VFIOContainer *container = container_of(listener, VFIOContainer, listener); + ++ /* For nested mode, vfio_prereg_listener is used to stop dirty tracking */ ++ if (container->iommu_type != VFIO_TYPE1_NESTING_IOMMU) { ++ vfio_set_dirty_page_tracking(container, false); ++ } ++} ++ ++static void vfio_prereg_listener_log_global_stop(MemoryListener *listener) ++{ ++ VFIOContainer *container = ++ container_of(listener, VFIOContainer, prereg_listener); ++ + vfio_set_dirty_page_tracking(container, false); + } + +@@ -1922,6 +1944,8 @@ static const MemoryListener vfio_memory_listener = { + static MemoryListener vfio_memory_prereg_listener = { + .region_add = vfio_prereg_listener_region_add, + .region_del = vfio_prereg_listener_region_del, ++ .log_global_start = vfio_prereg_listener_log_global_start, ++ .log_global_stop = vfio_prereg_listener_log_global_stop, + .log_sync = vfio_prereg_listener_log_sync, + .log_clear = vfio_prereg_listener_log_clear, + }; +-- +2.27.0 + diff --git a/vfio-Add-vfio_prereg_listener_log_clear-to-re-enable.patch b/vfio-Add-vfio_prereg_listener_log_clear-to-re-enable.patch new file mode 100644 index 0000000000000000000000000000000000000000..a055ed555f3a7fa213a8c16b4983a139de590ec0 --- /dev/null +++ b/vfio-Add-vfio_prereg_listener_log_clear-to-re-enable.patch @@ -0,0 +1,84 @@ +From 7086df6d90cd698a3e20cf4cf6e9a834f168cd8f Mon Sep 17 00:00:00 2001 +From: Kunkun Jiang +Date: Sat, 31 Jul 2021 09:40:24 +0800 +Subject: [PATCH] vfio: Add vfio_prereg_listener_log_clear to re-enable mark + dirty pages + +When tracking dirty pages, we just need to pay attention to stage 2 +mappings. Legacy vfio_listener_log_clear cannot be used in nested +stage. This patch adds vfio_prereg_listener_log_clear to re-enable +dirty pages in nested mode. 
+
+Signed-off-by: Kunkun Jiang
+---
+ hw/vfio/common.c | 40 +++++++++++++++++++++++++++++++++++++++-
+ 1 file changed, 39 insertions(+), 1 deletion(-)
+
+diff --git a/hw/vfio/common.c b/hw/vfio/common.c
+index 2506cd57ee..20c820aa74 100644
+--- a/hw/vfio/common.c
++++ b/hw/vfio/common.c
+@@ -1857,6 +1857,43 @@ static int vfio_physical_log_clear(VFIOContainer *container,
+     return ret;
+ }
+ 
++static void vfio_prereg_listener_log_clear(MemoryListener *listener,
++                                           MemoryRegionSection *section)
++{
++    VFIOContainer *container =
++        container_of(listener, VFIOContainer, prereg_listener);
++
++    if (!memory_region_is_ram(section->mr)) {
++        return;
++    }
++
++    vfio_physical_log_clear(container, section);
++}
++
++static int vfio_clear_dirty_bitmap(VFIOContainer *container,
++                                   MemoryRegionSection *section)
++{
++    if (memory_region_is_iommu(section->mr)) {
++        /*
++         * In nested mode, stage 2 (gpa->hpa) and stage 1 (giova->gpa) are
++         * set up separately. It is inappropriate to pass 'giova' to kernel
++         * to get dirty pages. We only need to focus on stage 2 mapping when
++         * marking dirty pages.
++         */
++        if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
++            return 0;
++        }
++
++        /*
++         * TODO: x86. With the log_clear() interface added, x86 may implement
++         * its own method.
++         */
++    }
++
++    /* Here we assume that memory_region_is_ram(section->mr) == true */
++    return vfio_physical_log_clear(container, section);
++}
++
+ static void vfio_listener_log_clear(MemoryListener *listener,
+                                     MemoryRegionSection *section)
+ {
+@@ -1868,7 +1905,7 @@ static void vfio_listener_log_clear(MemoryListener *listener,
+     }
+ 
+     if (vfio_devices_all_dirty_tracking(container)) {
+-        vfio_physical_log_clear(container, section);
++        vfio_clear_dirty_bitmap(container, section);
+     }
+ }
+ 
+@@ -1886,6 +1923,7 @@ static MemoryListener vfio_memory_prereg_listener = {
+     .region_add = vfio_prereg_listener_region_add,
+     .region_del = vfio_prereg_listener_region_del,
+     .log_sync = vfio_prereg_listener_log_sync,
++    .log_clear = vfio_prereg_listener_log_clear,
+ };
+ 
+ static void vfio_listener_release(VFIOContainer *container)
+-- 
+2.27.0
+
diff --git a/vfio-Add-vfio_prereg_listener_log_sync-in-nested-sta.patch b/vfio-Add-vfio_prereg_listener_log_sync-in-nested-sta.patch
new file mode 100644
index 0000000000000000000000000000000000000000..b1df5a3801a75edaedde9968fbb8db92713dfbd5
--- /dev/null
+++ b/vfio-Add-vfio_prereg_listener_log_sync-in-nested-sta.patch
@@ -0,0 +1,74 @@
+From f4523389bf57593484308124e06d67855bb79315 Mon Sep 17 00:00:00 2001
+From: Kunkun Jiang
+Date: Tue, 11 May 2021 10:08:14 +0800
+Subject: [PATCH] vfio: Add vfio_prereg_listener_log_sync in nested stage
+
+In nested mode, we set up stage 2 (gpa->hpa) and stage 1
+(giova->gpa) separately by vfio_prereg_listener_region_add()
+and vfio_listener_region_add(). So when marking dirty pages
+we just need to pay attention to stage 2 mappings.
+
+Legacy vfio_listener_log_sync cannot be used in nested stage.
+This patch adds vfio_prereg_listener_log_sync to mark dirty
+pages in nested mode.
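+
+In other words, a container in nested mode deals with two kinds of
+mappings (illustrative summary, not code from this patch):
+
+    stage 2 (vfio_prereg_listener):  gpa -> hpa, which is what the
+                                     kernel dirty log actually tracks
+    stage 1 (owned by the guest):    giova -> gpa, mirrored through
+                                     IOMMU notifiers only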
+ +Signed-off-by: Kunkun Jiang +--- + hw/vfio/common.c | 27 +++++++++++++++++++++++++++ + 1 file changed, 27 insertions(+) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index 6136b1ef61..2506cd57ee 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -1579,6 +1579,22 @@ static int vfio_dma_sync_ram_section_dirty_bitmap(VFIOContainer *container, + int128_get64(section->size), ram_addr); + } + ++static void vfio_prereg_listener_log_sync(MemoryListener *listener, ++ MemoryRegionSection *section) ++{ ++ VFIOContainer *container = ++ container_of(listener, VFIOContainer, prereg_listener); ++ ++ if (!memory_region_is_ram(section->mr) || ++ !container->dirty_pages_supported) { ++ return; ++ } ++ ++ if (vfio_devices_all_dirty_tracking(container)) { ++ vfio_dma_sync_ram_section_dirty_bitmap(container, section); ++ } ++} ++ + typedef struct { + IOMMUNotifier n; + VFIOGuestIOMMU *giommu; +@@ -1666,6 +1682,16 @@ static int vfio_sync_dirty_bitmap(VFIOContainer *container, + if (memory_region_is_iommu(section->mr)) { + VFIOGuestIOMMU *giommu; + ++ /* ++ * In nested mode, stage 2 (gpa->hpa) and stage 1 (giova->gpa) are ++ * set up separately. It is inappropriate to pass 'giova' to kernel ++ * to get dirty pages. We only need to focus on stage 2 mapping when ++ * marking dirty pages. ++ */ ++ if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) { ++ return 0; ++ } ++ + QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) { + if (MEMORY_REGION(giommu->iommu) == section->mr && + giommu->n.start == section->offset_within_region) { +@@ -1859,6 +1885,7 @@ static const MemoryListener vfio_memory_listener = { + static MemoryListener vfio_memory_prereg_listener = { + .region_add = vfio_prereg_listener_region_add, + .region_del = vfio_prereg_listener_region_del, ++ .log_sync = vfio_prereg_listener_log_sync, + }; + + static void vfio_listener_release(VFIOContainer *container) +-- +2.27.0 + diff --git a/vfio-Force-nested-if-iommu-requires-it.patch b/vfio-Force-nested-if-iommu-requires-it.patch new file mode 100644 index 0000000000000000000000000000000000000000..d580ae5007f6ae7be2adb090d9e6774eeeb323db --- /dev/null +++ b/vfio-Force-nested-if-iommu-requires-it.patch @@ -0,0 +1,101 @@ +From e7eef5af743a53f0415267ebe9bba2e5f0e05816 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Tue, 28 Aug 2018 16:16:20 +0200 +Subject: [PATCH] vfio: Force nested if iommu requires it + +In case we detect the address space is translated by +a virtual IOMMU which requires HW nested paging to +integrate with VFIO, let's set up the container with +the VFIO_TYPE1_NESTING_IOMMU iommu_type. 
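+
+The vIOMMU side is expected to advertise this requirement through the
+IOMMU_ATTR_VFIO_NESTED memory region attribute, e.g. (minimal sketch
+of a get_attr() hook, assuming the vIOMMU reports nested paging
+unconditionally):
+
+    static int viommu_get_attr(IOMMUMemoryRegion *iommu,
+                               enum IOMMUMemoryRegionAttr attr,
+                               void *data)
+    {
+        if (attr == IOMMU_ATTR_VFIO_NESTED) {
+            *(bool *)data = true;
+            return 0;
+        }
+        return -EINVAL;
+    }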
+ +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/vfio/common.c | 36 ++++++++++++++++++++++++++++-------- + 1 file changed, 28 insertions(+), 8 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index 6cb91e7ffd..d7533637c9 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -2045,27 +2045,38 @@ static void vfio_put_address_space(VFIOAddressSpace *space) + * vfio_get_iommu_type - selects the richest iommu_type (v2 first) + */ + static int vfio_get_iommu_type(VFIOContainer *container, ++ bool want_nested, + Error **errp) + { +- int iommu_types[] = { VFIO_TYPE1v2_IOMMU, VFIO_TYPE1_IOMMU, ++ int iommu_types[] = { VFIO_TYPE1_NESTING_IOMMU, ++ VFIO_TYPE1v2_IOMMU, VFIO_TYPE1_IOMMU, + VFIO_SPAPR_TCE_v2_IOMMU, VFIO_SPAPR_TCE_IOMMU }; +- int i; ++ int i, ret = -EINVAL; + + for (i = 0; i < ARRAY_SIZE(iommu_types); i++) { + if (ioctl(container->fd, VFIO_CHECK_EXTENSION, iommu_types[i])) { +- return iommu_types[i]; ++ if (iommu_types[i] == VFIO_TYPE1_NESTING_IOMMU && !want_nested) { ++ continue; ++ } ++ ret = iommu_types[i]; ++ break; + } + } +- error_setg(errp, "No available IOMMU models"); +- return -EINVAL; ++ if (ret < 0) { ++ error_setg(errp, "No available IOMMU models"); ++ } else if (want_nested && ret != VFIO_TYPE1_NESTING_IOMMU) { ++ error_setg(errp, "Nested mode requested but not supported"); ++ ret = -EINVAL; ++ } ++ return ret; + } + + static int vfio_init_container(VFIOContainer *container, int group_fd, +- Error **errp) ++ bool want_nested, Error **errp) + { + int iommu_type, dirty_log_manual_clear, ret; + +- iommu_type = vfio_get_iommu_type(container, errp); ++ iommu_type = vfio_get_iommu_type(container, want_nested, errp); + if (iommu_type < 0) { + return iommu_type; + } +@@ -2177,6 +2188,14 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, + VFIOContainer *container; + int ret, fd; + VFIOAddressSpace *space; ++ IOMMUMemoryRegion *iommu_mr; ++ bool nested = false; ++ ++ if (memory_region_is_iommu(as->root)) { ++ iommu_mr = IOMMU_MEMORY_REGION(as->root); ++ memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_VFIO_NESTED, ++ (void *)&nested); ++ } + + space = vfio_get_address_space(as); + +@@ -2257,7 +2276,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, + QLIST_INIT(&container->vrdl_list); + QLIST_INIT(&container->dma_list); + +- ret = vfio_init_container(container, group->fd, errp); ++ ret = vfio_init_container(container, group->fd, nested, errp); + if (ret) { + goto free_container_exit; + } +@@ -2269,6 +2288,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, + } + + switch (container->iommu_type) { ++ case VFIO_TYPE1_NESTING_IOMMU: + case VFIO_TYPE1v2_IOMMU: + case VFIO_TYPE1_IOMMU: + { +-- +2.27.0 + diff --git a/vfio-Helper-to-get-IRQ-info-including-capabilities.patch b/vfio-Helper-to-get-IRQ-info-including-capabilities.patch new file mode 100644 index 0000000000000000000000000000000000000000..3d4b1667692bf76a3c89c71cd363db5090641986 --- /dev/null +++ b/vfio-Helper-to-get-IRQ-info-including-capabilities.patch @@ -0,0 +1,178 @@ +From a4336765c99a876743c0ead89997ad6f97d7b442 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Thu, 20 Jun 2019 16:39:57 +0200 +Subject: [PATCH] vfio: Helper to get IRQ info including capabilities + +As done for vfio regions, add helpers to retrieve irq info +including their optional capabilities. 
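+
+A consumer would typically look an IRQ up by its type/subtype
+capability, e.g. (sketch; the VFIO_IRQ_TYPE_NESTED and
+VFIO_IRQ_SUBTYPE_DMA_FAULT constants come with the updated
+linux-headers and are shown for illustration only):
+
+    struct vfio_irq_info *irq_info;
+
+    if (!vfio_get_dev_irq_info(vbasedev, VFIO_IRQ_TYPE_NESTED,
+                               VFIO_IRQ_SUBTYPE_DMA_FAULT, &irq_info)) {
+        /* irq_info->index identifies the IRQ to configure */
+        g_free(irq_info);
+    }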
+ +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/vfio/common.c | 97 +++++++++++++++++++++++++++++++++++ + hw/vfio/trace-events | 1 + + include/hw/vfio/vfio-common.h | 7 +++ + 3 files changed, 105 insertions(+) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index 1f78af121d..d05a485808 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -1919,6 +1919,25 @@ bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info, + return true; + } + ++struct vfio_info_cap_header * ++vfio_get_irq_info_cap(struct vfio_irq_info *info, uint16_t id) ++{ ++ struct vfio_info_cap_header *hdr; ++ void *ptr = info; ++ ++ if (!(info->flags & VFIO_IRQ_INFO_FLAG_CAPS)) { ++ return NULL; ++ } ++ ++ for (hdr = ptr + info->cap_offset; hdr != ptr; hdr = ptr + hdr->next) { ++ if (hdr->id == id) { ++ return hdr; ++ } ++ } ++ ++ return NULL; ++} ++ + static int vfio_setup_region_sparse_mmaps(VFIORegion *region, + struct vfio_region_info *info) + { +@@ -2887,6 +2906,33 @@ retry: + return 0; + } + ++int vfio_get_irq_info(VFIODevice *vbasedev, int index, ++ struct vfio_irq_info **info) ++{ ++ size_t argsz = sizeof(struct vfio_irq_info); ++ ++ *info = g_malloc0(argsz); ++ ++ (*info)->index = index; ++retry: ++ (*info)->argsz = argsz; ++ ++ if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_IRQ_INFO, *info)) { ++ g_free(*info); ++ *info = NULL; ++ return -errno; ++ } ++ ++ if ((*info)->argsz > argsz) { ++ argsz = (*info)->argsz; ++ *info = g_realloc(*info, argsz); ++ ++ goto retry; ++ } ++ ++ return 0; ++} ++ + int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type, + uint32_t subtype, struct vfio_region_info **info) + { +@@ -2922,6 +2968,42 @@ int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type, + return -ENODEV; + } + ++int vfio_get_dev_irq_info(VFIODevice *vbasedev, uint32_t type, ++ uint32_t subtype, struct vfio_irq_info **info) ++{ ++ int i; ++ ++ for (i = 0; i < vbasedev->num_irqs; i++) { ++ struct vfio_info_cap_header *hdr; ++ struct vfio_irq_info_cap_type *cap_type; ++ ++ if (vfio_get_irq_info(vbasedev, i, info)) { ++ continue; ++ } ++ ++ hdr = vfio_get_irq_info_cap(*info, VFIO_IRQ_INFO_CAP_TYPE); ++ if (!hdr) { ++ g_free(*info); ++ continue; ++ } ++ ++ cap_type = container_of(hdr, struct vfio_irq_info_cap_type, header); ++ ++ trace_vfio_get_dev_irq(vbasedev->name, i, ++ cap_type->type, cap_type->subtype); ++ ++ if (cap_type->type == type && cap_type->subtype == subtype) { ++ return 0; ++ } ++ ++ g_free(*info); ++ } ++ ++ *info = NULL; ++ return -ENODEV; ++} ++ ++ + bool vfio_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type) + { + struct vfio_region_info *info = NULL; +@@ -2937,6 +3019,21 @@ bool vfio_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type) + return ret; + } + ++bool vfio_has_irq_cap(VFIODevice *vbasedev, int region, uint16_t cap_type) ++{ ++ struct vfio_region_info *info = NULL; ++ bool ret = false; ++ ++ if (!vfio_get_region_info(vbasedev, region, &info)) { ++ if (vfio_get_region_info_cap(info, cap_type)) { ++ ret = true; ++ } ++ g_free(info); ++ } ++ ++ return ret; ++} ++ + /* + * Interfaces for IBM EEH (Enhanced Error Handling) + */ +diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events +index 35bd415d6d..f5fe201ab5 100644 +--- a/hw/vfio/trace-events ++++ b/hw/vfio/trace-events +@@ -117,6 +117,7 @@ vfio_region_unmap(const char *name, unsigned long offset, unsigned long end) "Re + vfio_region_sparse_mmap_header(const char *name, int index, int nr_areas) "Device %s region %d: %d sparse mmap entries" + 
vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned long end) "sparse entry %d [0x%lx - 0x%lx]"
+ vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%0x8"
++vfio_get_dev_irq(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%0x8"
+ vfio_dma_unmap_overflow_workaround(void) ""
+ vfio_iommu_addr_inv_iotlb(int asid, uint64_t addr, uint64_t size, uint64_t nb_granules, bool leaf) "nested IOTLB invalidate asid=%d, addr=0x%"PRIx64" granule_size=0x%"PRIx64" nb_granules=0x%"PRIx64" leaf=%d"
+ vfio_iommu_asid_inv_iotlb(int asid) "nested IOTLB invalidate asid=%d"
+diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
+index a838a939e4..7fdca26fa0 100644
+--- a/include/hw/vfio/vfio-common.h
++++ b/include/hw/vfio/vfio-common.h
+@@ -254,6 +254,13 @@ bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info,
+                              unsigned int *avail);
+ struct vfio_info_cap_header *
+ vfio_get_device_info_cap(struct vfio_device_info *info, uint16_t id);
++int vfio_get_irq_info(VFIODevice *vbasedev, int index,
++                      struct vfio_irq_info **info);
++int vfio_get_dev_irq_info(VFIODevice *vbasedev, uint32_t type,
++                          uint32_t subtype, struct vfio_irq_info **info);
++bool vfio_has_irq_cap(VFIODevice *vbasedev, int irq, uint16_t cap_type);
++struct vfio_info_cap_header *
++vfio_get_irq_info_cap(struct vfio_irq_info *info, uint16_t id);
+ #endif
+ extern const MemoryListener vfio_prereg_listener;
+ 
+-- 
+2.27.0
+
diff --git a/vfio-Introduce-helpers-to-DMA-map-unmap-a-RAM-sectio.patch b/vfio-Introduce-helpers-to-DMA-map-unmap-a-RAM-sectio.patch
new file mode 100644
index 0000000000000000000000000000000000000000..fd6deffd16ce01ee2a97e4061ac358e974a49320
--- /dev/null
+++ b/vfio-Introduce-helpers-to-DMA-map-unmap-a-RAM-sectio.patch
@@ -0,0 +1,280 @@
+From dab969657d8ff8b175856f91b035b74849cf69ba Mon Sep 17 00:00:00 2001
+From: Eric Auger
+Date: Thu, 30 Aug 2018 15:04:25 +0200
+Subject: [PATCH] vfio: Introduce helpers to DMA map/unmap a RAM section
+
+Let's introduce two helpers that allow us to DMA map/unmap a RAM
+section. Those helpers will be called for the nested stage setup in
+another call site. This also makes the structure of
+vfio_listener_region_add/del() clearer.
+ +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/vfio/common.c | 206 +++++++++++++++++++++++++------------------ + hw/vfio/trace-events | 4 +- + 2 files changed, 123 insertions(+), 87 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index d358789f19..b3dc090840 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -922,13 +922,130 @@ hostwin_from_range(VFIOContainer *container, hwaddr iova, hwaddr end) + return NULL; + } + ++static int vfio_dma_map_ram_section(VFIOContainer *container, ++ MemoryRegionSection *section, Error **err) ++{ ++ VFIOHostDMAWindow *hostwin; ++ Int128 llend, llsize; ++ hwaddr iova, end; ++ void *vaddr; ++ int ret; ++ ++ assert(memory_region_is_ram(section->mr)); ++ ++ iova = TARGET_PAGE_ALIGN(section->offset_within_address_space); ++ llend = int128_make64(section->offset_within_address_space); ++ llend = int128_add(llend, section->size); ++ llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK)); ++ end = int128_get64(int128_sub(llend, int128_one())); ++ ++ vaddr = memory_region_get_ram_ptr(section->mr) + ++ section->offset_within_region + ++ (iova - section->offset_within_address_space); ++ ++ hostwin = hostwin_from_range(container, iova, end); ++ if (!hostwin) { ++ error_setg(err, "Container %p can't map guest IOVA region" ++ " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container, iova, end); ++ return -EFAULT; ++ } ++ ++ trace_vfio_dma_map_ram(iova, end, vaddr); ++ ++ llsize = int128_sub(llend, int128_make64(iova)); ++ ++ if (memory_region_is_ram_device(section->mr)) { ++ hwaddr pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1; ++ ++ if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) { ++ trace_vfio_listener_region_add_no_dma_map( ++ memory_region_name(section->mr), ++ section->offset_within_address_space, ++ int128_getlo(section->size), ++ pgmask + 1); ++ return 0; ++ } ++ } ++ ++ ret = vfio_dma_map(container, iova, int128_get64(llsize), ++ vaddr, section->readonly); ++ if (ret) { ++ error_setg(err, "vfio_dma_map(%p, 0x%"HWADDR_PRIx", " ++ "0x%"HWADDR_PRIx", %p) = %d (%m)", ++ container, iova, int128_get64(llsize), vaddr, ret); ++ if (memory_region_is_ram_device(section->mr)) { ++ /* Allow unexpected mappings not to be fatal for RAM devices */ ++ error_report_err(*err); ++ return 0; ++ } ++ return ret; ++ } ++ return 0; ++} ++ ++static void vfio_dma_unmap_ram_section(VFIOContainer *container, ++ MemoryRegionSection *section) ++{ ++ Int128 llend, llsize; ++ hwaddr iova, end; ++ bool try_unmap = true; ++ int ret; ++ ++ iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space); ++ llend = int128_make64(section->offset_within_address_space); ++ llend = int128_add(llend, section->size); ++ llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask)); ++ ++ if (int128_ge(int128_make64(iova), llend)) { ++ return; ++ } ++ end = int128_get64(int128_sub(llend, int128_one())); ++ ++ llsize = int128_sub(llend, int128_make64(iova)); ++ ++ trace_vfio_dma_unmap_ram(iova, end); ++ ++ if (memory_region_is_ram_device(section->mr)) { ++ hwaddr pgmask; ++ VFIOHostDMAWindow *hostwin = hostwin_from_range(container, iova, end); ++ ++ assert(hostwin); /* or region_add() would have failed */ ++ ++ pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1; ++ try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask)); ++ } else if (memory_region_has_ram_discard_manager(section->mr)) { ++ vfio_unregister_ram_discard_listener(container, section); ++ /* Unregistering will trigger an unmap. 
*/ ++ try_unmap = false; ++ } ++ ++ if (try_unmap) { ++ if (int128_eq(llsize, int128_2_64())) { ++ /* The unmap ioctl doesn't accept a full 64-bit span. */ ++ llsize = int128_rshift(llsize, 1); ++ ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL); ++ if (ret) { ++ error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " ++ "0x%"HWADDR_PRIx") = %d (%m)", ++ container, iova, int128_get64(llsize), ret); ++ } ++ iova += int128_get64(llsize); ++ } ++ ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL); ++ if (ret) { ++ error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " ++ "0x%"HWADDR_PRIx") = %d (%m)", ++ container, iova, int128_get64(llsize), ret); ++ } ++ } ++} ++ + static void vfio_listener_region_add(MemoryListener *listener, + MemoryRegionSection *section) + { + VFIOContainer *container = container_of(listener, VFIOContainer, listener); + hwaddr iova, end; +- Int128 llend, llsize; +- void *vaddr; ++ Int128 llend; + int ret; + VFIOHostDMAWindow *hostwin; + Error *err = NULL; +@@ -1092,38 +1209,7 @@ static void vfio_listener_region_add(MemoryListener *listener, + return; + } + +- vaddr = memory_region_get_ram_ptr(section->mr) + +- section->offset_within_region + +- (iova - section->offset_within_address_space); +- +- trace_vfio_listener_region_add_ram(iova, end, vaddr); +- +- llsize = int128_sub(llend, int128_make64(iova)); +- +- if (memory_region_is_ram_device(section->mr)) { +- hwaddr pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1; +- +- if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) { +- trace_vfio_listener_region_add_no_dma_map( +- memory_region_name(section->mr), +- section->offset_within_address_space, +- int128_getlo(section->size), +- pgmask + 1); +- return; +- } +- } +- +- ret = vfio_dma_map(container, iova, int128_get64(llsize), +- vaddr, section->readonly); +- if (ret) { +- error_setg(&err, "vfio_dma_map(%p, 0x%"HWADDR_PRIx", " +- "0x%"HWADDR_PRIx", %p) = %d (%m)", +- container, iova, int128_get64(llsize), vaddr, ret); +- if (memory_region_is_ram_device(section->mr)) { +- /* Allow unexpected mappings not to be fatal for RAM devices */ +- error_report_err(err); +- return; +- } ++ if (vfio_dma_map_ram_section(container, section, &err)) { + goto fail; + } + +@@ -1157,10 +1243,6 @@ static void vfio_listener_region_del(MemoryListener *listener, + MemoryRegionSection *section) + { + VFIOContainer *container = container_of(listener, VFIOContainer, listener); +- hwaddr iova, end; +- Int128 llend, llsize; +- int ret; +- bool try_unmap = true; + + if (vfio_listener_skipped_section(section)) { + trace_vfio_listener_region_del_skip( +@@ -1200,53 +1282,7 @@ static void vfio_listener_region_del(MemoryListener *listener, + */ + } + +- iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space); +- llend = int128_make64(section->offset_within_address_space); +- llend = int128_add(llend, section->size); +- llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask)); +- +- if (int128_ge(int128_make64(iova), llend)) { +- return; +- } +- end = int128_get64(int128_sub(llend, int128_one())); +- +- llsize = int128_sub(llend, int128_make64(iova)); +- +- trace_vfio_listener_region_del(iova, end); +- +- if (memory_region_is_ram_device(section->mr)) { +- hwaddr pgmask; +- VFIOHostDMAWindow *hostwin = hostwin_from_range(container, iova, end); +- +- assert(hostwin); /* or region_add() would have failed */ +- +- pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1; +- try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask)); +- } else if 
(memory_region_has_ram_discard_manager(section->mr)) { +- vfio_unregister_ram_discard_listener(container, section); +- /* Unregistering will trigger an unmap. */ +- try_unmap = false; +- } +- +- if (try_unmap) { +- if (int128_eq(llsize, int128_2_64())) { +- /* The unmap ioctl doesn't accept a full 64-bit span. */ +- llsize = int128_rshift(llsize, 1); +- ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL); +- if (ret) { +- error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " +- "0x%"HWADDR_PRIx") = %d (%m)", +- container, iova, int128_get64(llsize), ret); +- } +- iova += int128_get64(llsize); +- } +- ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL); +- if (ret) { +- error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " +- "0x%"HWADDR_PRIx") = %d (%m)", +- container, iova, int128_get64(llsize), ret); +- } +- } ++ vfio_dma_unmap_ram_section(container, section); + + memory_region_unref(section->mr); + +diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events +index 0ef1b5f4a6..a37563a315 100644 +--- a/hw/vfio/trace-events ++++ b/hw/vfio/trace-events +@@ -99,10 +99,10 @@ vfio_iommu_map_notify(const char *op, uint64_t iova_start, uint64_t iova_end) "i + vfio_listener_region_add_skip(uint64_t start, uint64_t end) "SKIPPING region_add 0x%"PRIx64" - 0x%"PRIx64 + vfio_spapr_group_attach(int groupfd, int tablefd) "Attached groupfd %d to liobn fd %d" + vfio_listener_region_add_iommu(uint64_t start, uint64_t end) "region_add [iommu] 0x%"PRIx64" - 0x%"PRIx64 +-vfio_listener_region_add_ram(uint64_t iova_start, uint64_t iova_end, void *vaddr) "region_add [ram] 0x%"PRIx64" - 0x%"PRIx64" [%p]" ++vfio_dma_map_ram(uint64_t iova_start, uint64_t iova_end, void *vaddr) "region_add [ram] 0x%"PRIx64" - 0x%"PRIx64" [%p]" + vfio_listener_region_add_no_dma_map(const char *name, uint64_t iova, uint64_t size, uint64_t page_size) "Region \"%s\" 0x%"PRIx64" size=0x%"PRIx64" is not aligned to 0x%"PRIx64" and cannot be mapped for DMA" + vfio_listener_region_del_skip(uint64_t start, uint64_t end) "SKIPPING region_del 0x%"PRIx64" - 0x%"PRIx64 +-vfio_listener_region_del(uint64_t start, uint64_t end) "region_del 0x%"PRIx64" - 0x%"PRIx64 ++vfio_dma_unmap_ram(uint64_t start, uint64_t end) "region_del 0x%"PRIx64" - 0x%"PRIx64 + vfio_disconnect_container(int fd) "close container->fd=%d" + vfio_put_group(int fd) "close group->fd=%d" + vfio_get_device(const char * name, unsigned int flags, unsigned int num_regions, unsigned int num_irqs) "Device %s flags: %u, regions: %u, irqs: %u" +-- +2.27.0 + diff --git a/vfio-Introduce-helpers-to-mark-dirty-pages-of-a-RAM-.patch b/vfio-Introduce-helpers-to-mark-dirty-pages-of-a-RAM-.patch new file mode 100644 index 0000000000000000000000000000000000000000..e77dd1d64b4bdfb606e7987df653d5e365c27c98 --- /dev/null +++ b/vfio-Introduce-helpers-to-mark-dirty-pages-of-a-RAM-.patch @@ -0,0 +1,64 @@ +From 1675d767aa9bd496178b4d74e01a40dbbd97eccb Mon Sep 17 00:00:00 2001 +From: Kunkun Jiang +Date: Tue, 11 May 2021 10:08:13 +0800 +Subject: [PATCH] vfio: Introduce helpers to mark dirty pages of a RAM section + +Extract part of the code from vfio_sync_dirty_bitmap to form a +new helper, which allows to mark dirty pages of a RAM section. +This helper will be called for nested stage. 
+
+Signed-off-by: Kunkun Jiang
+---
+ hw/vfio/common.c | 22 ++++++++++++++--------
+ 1 file changed, 14 insertions(+), 8 deletions(-)
+
+diff --git a/hw/vfio/common.c b/hw/vfio/common.c
+index bdfcc854fe..6136b1ef61 100644
+--- a/hw/vfio/common.c
++++ b/hw/vfio/common.c
+@@ -1566,6 +1566,19 @@ err_out:
+     return ret;
+ }
+ 
++static int vfio_dma_sync_ram_section_dirty_bitmap(VFIOContainer *container,
++                                                  MemoryRegionSection *section)
++{
++    ram_addr_t ram_addr;
++
++    ram_addr = memory_region_get_ram_addr(section->mr) +
++               section->offset_within_region;
++
++    return vfio_get_dirty_bitmap(container,
++                   REAL_HOST_PAGE_ALIGN(section->offset_within_address_space),
++                   int128_get64(section->size), ram_addr);
++}
++
+ typedef struct {
+     IOMMUNotifier n;
+     VFIOGuestIOMMU *giommu;
+@@ -1650,8 +1663,6 @@ static int vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainer *container,
+ static int vfio_sync_dirty_bitmap(VFIOContainer *container,
+                                   MemoryRegionSection *section)
+ {
+-    ram_addr_t ram_addr;
+-
+     if (memory_region_is_iommu(section->mr)) {
+         VFIOGuestIOMMU *giommu;
+ 
+@@ -1682,12 +1693,7 @@ static int vfio_sync_dirty_bitmap(VFIOContainer *container,
+         return vfio_sync_ram_discard_listener_dirty_bitmap(container, section);
+     }
+ 
+-    ram_addr = memory_region_get_ram_addr(section->mr) +
+-               section->offset_within_region;
+-
+-    return vfio_get_dirty_bitmap(container,
+-                   REAL_HOST_PAGE_ALIGN(section->offset_within_address_space),
+-                   int128_get64(section->size), ram_addr);
++    return vfio_dma_sync_ram_section_dirty_bitmap(container, section);
+ }
+ 
+ static void vfio_listener_log_sync(MemoryListener *listener,
+-- 
+2.27.0
+
diff --git a/vfio-Introduce-hostwin_from_range-helper.patch b/vfio-Introduce-hostwin_from_range-helper.patch
new file mode 100644
index 0000000000000000000000000000000000000000..c8c8ab76922a95fbcfdec74b92b11dc688ce2745
--- /dev/null
+++ b/vfio-Introduce-hostwin_from_range-helper.patch
@@ -0,0 +1,89 @@
+From 85232739b4852f1a51dde58c9007ed0deb17c2f2 Mon Sep 17 00:00:00 2001
+From: Eric Auger
+Date: Fri, 22 Mar 2019 18:05:23 +0100
+Subject: [PATCH] vfio: Introduce hostwin_from_range helper
+
+Let's introduce a hostwin_from_range() helper that returns the
+hostwin encapsulating an IOVA range or NULL if none is found.
+
+This improves the readability of callers and removes the usage
+of hostwin_found.
+ +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/vfio/common.c | 36 +++++++++++++++++------------------- + 1 file changed, 17 insertions(+), 19 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index d7533637c9..d358789f19 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -909,6 +909,19 @@ static void vfio_unregister_ram_discard_listener(VFIOContainer *container, + g_free(vrdl); + } + ++static VFIOHostDMAWindow * ++hostwin_from_range(VFIOContainer *container, hwaddr iova, hwaddr end) ++{ ++ VFIOHostDMAWindow *hostwin; ++ ++ QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { ++ if (hostwin->min_iova <= iova && end <= hostwin->max_iova) { ++ return hostwin; ++ } ++ } ++ return NULL; ++} ++ + static void vfio_listener_region_add(MemoryListener *listener, + MemoryRegionSection *section) + { +@@ -918,7 +931,6 @@ static void vfio_listener_region_add(MemoryListener *listener, + void *vaddr; + int ret; + VFIOHostDMAWindow *hostwin; +- bool hostwin_found; + Error *err = NULL; + + if (vfio_listener_skipped_section(section)) { +@@ -1011,15 +1023,8 @@ static void vfio_listener_region_add(MemoryListener *listener, + #endif + } + +- hostwin_found = false; +- QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { +- if (hostwin->min_iova <= iova && end <= hostwin->max_iova) { +- hostwin_found = true; +- break; +- } +- } +- +- if (!hostwin_found) { ++ hostwin = hostwin_from_range(container, iova, end); ++ if (!hostwin) { + error_setg(&err, "Container %p can't map guest IOVA region" + " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container, iova, end); + goto fail; +@@ -1211,16 +1216,9 @@ static void vfio_listener_region_del(MemoryListener *listener, + + if (memory_region_is_ram_device(section->mr)) { + hwaddr pgmask; +- VFIOHostDMAWindow *hostwin; +- bool hostwin_found = false; ++ VFIOHostDMAWindow *hostwin = hostwin_from_range(container, iova, end); + +- QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { +- if (hostwin->min_iova <= iova && end <= hostwin->max_iova) { +- hostwin_found = true; +- break; +- } +- } +- assert(hostwin_found); /* or region_add() would have failed */ ++ assert(hostwin); /* or region_add() would have failed */ + + pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1; + try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask)); +-- +2.27.0 + diff --git a/vfio-Maintain-DMA-mapping-range-for-the-container.patch b/vfio-Maintain-DMA-mapping-range-for-the-container.patch new file mode 100644 index 0000000000000000000000000000000000000000..ba8abb431c5606de905890d5be95e37f4d0ccf8e --- /dev/null +++ b/vfio-Maintain-DMA-mapping-range-for-the-container.patch @@ -0,0 +1,191 @@ +From ac1bf3edcd2b807cf81ada500716f13b1394d58e Mon Sep 17 00:00:00 2001 +From: Zenghui Yu +Date: Sat, 8 May 2021 17:31:04 +0800 +Subject: [PATCH] vfio: Maintain DMA mapping range for the container + +When synchronizing dirty bitmap from kernel VFIO we do it in a +per-iova-range fashion and we allocate the userspace bitmap for each of the +ioctl. This patch introduces `struct VFIODMARange` to describe a range of +the given DMA mapping with respect to a VFIO_IOMMU_MAP_DMA operation, and +make the bitmap cache of this range be persistent so that we don't need to +g_try_malloc0() every time. Note that the new structure is almost a copy of +`struct vfio_iommu_type1_dma_map` but only internally used by QEMU. 
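For a feel of the cost of keeping these bitmaps resident: the sizing used when a VFIODMARange is created is one bit per host page, rounded up to whole 64-bit words as the type1 UAPI expects. A standalone rendering of that computation (assuming a 4 KiB host page; the 1 GiB mapping size is a made-up example):

#include <inttypes.h>
#include <stdio.h>

#define PAGE_SIZE 4096ULL      /* assumed host page size */
#define BITS_PER_BYTE 8ULL

int main(void)
{
    uint64_t size = 0x40000000ULL; /* hypothetical 1 GiB DMA mapping */

    /* one bit per host page, rounded up to a whole number of __u64 words */
    uint64_t pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
    uint64_t bytes = ((pages + 63) / 64) * 64 / BITS_PER_BYTE;

    printf("%" PRIu64 " pages -> %" PRIu64 "-byte bitmap\n", pages, bytes);
    return 0;
}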
+ +More importantly, the cached per-iova-range dirty bitmap will be further +used when we want to add support for the CLEAR_BITMAP and this cached +bitmap will be used to guarantee we don't clear any unknown dirty bits +otherwise that can be a severe data loss issue for migration code. + +It's pretty intuitive to maintain a bitmap per container since we perform +log_sync at this granule. But I don't know how to deal with things like +memory hot-{un}plug, sparse DMA mappings, etc. Suggestions welcome. + +* yet something to-do: + - can't work with guest viommu + - no locks + - etc + +[ The idea and even the commit message are largely inherited from kvm side. + See commit 9f4bf4baa8b820c7930e23c9566c9493db7e1d25. ] + +Signed-off-by: Zenghui Yu +Signed-off-by: Kunkun Jiang +--- + hw/vfio/common.c | 62 +++++++++++++++++++++++++++++++---- + include/hw/vfio/vfio-common.h | 9 +++++ + 2 files changed, 65 insertions(+), 6 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index 080046e3f5..86ea784919 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -445,6 +445,29 @@ unmap_exit: + return ret; + } + ++static VFIODMARange *vfio_lookup_match_range(VFIOContainer *container, ++ hwaddr start_addr, hwaddr size) ++{ ++ VFIODMARange *qrange; ++ ++ QLIST_FOREACH(qrange, &container->dma_list, next) { ++ if (qrange->iova == start_addr && qrange->size == size) { ++ return qrange; ++ } ++ } ++ return NULL; ++} ++ ++static void vfio_dma_range_init_dirty_bitmap(VFIODMARange *qrange) ++{ ++ uint64_t pages, size; ++ ++ pages = REAL_HOST_PAGE_ALIGN(qrange->size) / qemu_real_host_page_size; ++ size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) / BITS_PER_BYTE; ++ ++ qrange->bitmap = g_malloc0(size); ++} ++ + /* + * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86 + */ +@@ -458,12 +481,29 @@ static int vfio_dma_unmap(VFIOContainer *container, + .iova = iova, + .size = size, + }; ++ VFIODMARange *qrange; + + if (iotlb && container->dirty_pages_supported && + vfio_devices_all_running_and_saving(container)) { + return vfio_dma_unmap_bitmap(container, iova, size, iotlb); + } + ++ /* ++ * unregister the DMA range ++ * ++ * It seems that the memory layer will give us the same section as the one ++ * used in region_add(). Otherwise it'll be complicated to manipulate the ++ * bitmap across region_{add,del}. Is there any guarantee? ++ * ++ * But there is really not such a restriction on the kernel interface ++ * (VFIO_IOMMU_DIRTY_PAGES_FLAG_{UN}MAP_DMA, etc). 
++ */ ++ qrange = vfio_lookup_match_range(container, iova, size); ++ assert(qrange); ++ g_free(qrange->bitmap); ++ QLIST_REMOVE(qrange, next); ++ g_free(qrange); ++ + while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) { + /* + * The type1 backend has an off-by-one bug in the kernel (71a7d3d78e3c +@@ -500,6 +540,14 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova, + .iova = iova, + .size = size, + }; ++ VFIODMARange *qrange; ++ ++ qrange = g_malloc0(sizeof(*qrange)); ++ qrange->iova = iova; ++ qrange->size = size; ++ QLIST_INSERT_HEAD(&container->dma_list, qrange, next); ++ /* XXX allocate the dirty bitmap on demand */ ++ vfio_dma_range_init_dirty_bitmap(qrange); + + if (!readonly) { + map.flags |= VFIO_DMA_MAP_FLAG_WRITE; +@@ -1256,9 +1304,14 @@ static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, + { + struct vfio_iommu_type1_dirty_bitmap *dbitmap; + struct vfio_iommu_type1_dirty_bitmap_get *range; ++ VFIODMARange *qrange; + uint64_t pages; + int ret; + ++ qrange = vfio_lookup_match_range(container, iova, size); ++ /* the same as vfio_dma_unmap() */ ++ assert(qrange); ++ + dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range)); + + dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range); +@@ -1277,11 +1330,8 @@ static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, + pages = REAL_HOST_PAGE_ALIGN(range->size) / qemu_real_host_page_size; + range->bitmap.size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) / + BITS_PER_BYTE; +- range->bitmap.data = g_try_malloc0(range->bitmap.size); +- if (!range->bitmap.data) { +- ret = -ENOMEM; +- goto err_out; +- } ++ ++ range->bitmap.data = (__u64 *)qrange->bitmap; + + ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap); + if (ret) { +@@ -1297,7 +1347,6 @@ static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, + trace_vfio_get_dirty_bitmap(container->fd, range->iova, range->size, + range->bitmap.size, ram_addr); + err_out: +- g_free(range->bitmap.data); + g_free(dbitmap); + + return ret; +@@ -2061,6 +2110,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, + QLIST_INIT(&container->giommu_list); + QLIST_INIT(&container->hostwin_list); + QLIST_INIT(&container->vrdl_list); ++ QLIST_INIT(&container->dma_list); + + ret = vfio_init_container(container, group->fd, errp); + if (ret) { +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index 8af11b0a76..20b9c8a1d3 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -76,6 +76,14 @@ typedef struct VFIOAddressSpace { + + struct VFIOGroup; + ++typedef struct VFIODMARange { ++ QLIST_ENTRY(VFIODMARange) next; ++ hwaddr iova; ++ size_t size; ++ void *vaddr; /* unused */ ++ unsigned long *bitmap; /* dirty bitmap cache for this range */ ++} VFIODMARange; ++ + typedef struct VFIOContainer { + VFIOAddressSpace *space; + int fd; /* /dev/vfio/vfio, empowered by the attached groups */ +@@ -93,6 +101,7 @@ typedef struct VFIOContainer { + QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; + QLIST_HEAD(, VFIOGroup) group_list; + QLIST_HEAD(, VFIORamDiscardListener) vrdl_list; ++ QLIST_HEAD(, VFIODMARange) dma_list; + QLIST_ENTRY(VFIOContainer) next; + } VFIOContainer; + +-- +2.27.0 + diff --git a/vfio-Pass-stage-1-MSI-bindings-to-the-host.patch b/vfio-Pass-stage-1-MSI-bindings-to-the-host.patch new file mode 100644 index 0000000000000000000000000000000000000000..bed28007ce19c71e58690d3ceb7ce5c9dd87a9d4 --- /dev/null +++ 
b/vfio-Pass-stage-1-MSI-bindings-to-the-host.patch @@ -0,0 +1,262 @@ +From 8b4fbe869f8a1f510896c86067d2e4fc3dc82eb9 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Tue, 14 Aug 2018 08:08:11 -0400 +Subject: [PATCH] vfio: Pass stage 1 MSI bindings to the host + +We register the stage1 MSI bindings when enabling the vectors +and we unregister them on msi disable. + +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/vfio/common.c | 59 +++++++++++++++++++++++++++ + hw/vfio/pci.c | 76 ++++++++++++++++++++++++++++++++++- + hw/vfio/trace-events | 2 + + include/hw/vfio/vfio-common.h | 12 ++++++ + 4 files changed, 147 insertions(+), 2 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index 58f8a43a43..1f78af121d 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -774,6 +774,65 @@ static void vfio_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) + } + } + ++int vfio_iommu_set_msi_binding(VFIOContainer *container, int n, ++ IOMMUTLBEntry *iotlb) ++{ ++ struct vfio_iommu_type1_set_msi_binding ustruct; ++ VFIOMSIBinding *binding; ++ int ret; ++ ++ QLIST_FOREACH(binding, &container->msibinding_list, next) { ++ if (binding->index == n) { ++ return 0; ++ } ++ } ++ ++ ustruct.argsz = sizeof(struct vfio_iommu_type1_set_msi_binding); ++ ustruct.iova = iotlb->iova; ++ ustruct.flags = VFIO_IOMMU_BIND_MSI; ++ ustruct.gpa = iotlb->translated_addr; ++ ustruct.size = iotlb->addr_mask + 1; ++ ret = ioctl(container->fd, VFIO_IOMMU_SET_MSI_BINDING , &ustruct); ++ if (ret) { ++ error_report("%s: failed to register the stage1 MSI binding (%m)", ++ __func__); ++ return ret; ++ } ++ binding = g_new0(VFIOMSIBinding, 1); ++ binding->iova = ustruct.iova; ++ binding->gpa = ustruct.gpa; ++ binding->size = ustruct.size; ++ binding->index = n; ++ ++ QLIST_INSERT_HEAD(&container->msibinding_list, binding, next); ++ return 0; ++} ++ ++int vfio_iommu_unset_msi_binding(VFIOContainer *container, int n) ++{ ++ struct vfio_iommu_type1_set_msi_binding ustruct; ++ VFIOMSIBinding *binding, *tmp; ++ int ret; ++ ++ ustruct.argsz = sizeof(struct vfio_iommu_type1_set_msi_binding); ++ QLIST_FOREACH_SAFE(binding, &container->msibinding_list, next, tmp) { ++ if (binding->index != n) { ++ continue; ++ } ++ ustruct.flags = VFIO_IOMMU_UNBIND_MSI; ++ ustruct.iova = binding->iova; ++ ret = ioctl(container->fd, VFIO_IOMMU_SET_MSI_BINDING , &ustruct); ++ if (ret) { ++ error_report("Failed to unregister the stage1 MSI binding " ++ "for iova=0x%"PRIx64" (%m)", binding->iova); ++ } ++ QLIST_REMOVE(binding, next); ++ g_free(binding); ++ return ret; ++ } ++ return 0; ++} ++ + static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) + { + VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index ae5e014e5d..99c52a0944 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -365,6 +365,65 @@ static void vfio_msi_interrupt(void *opaque) + notify(&vdev->pdev, nr); + } + ++static bool vfio_iommu_require_msi_binding(IOMMUMemoryRegion *iommu_mr) ++{ ++ bool msi_translate = false, nested = false; ++ ++ memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_MSI_TRANSLATE, ++ (void *)&msi_translate); ++ memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_VFIO_NESTED, ++ (void *)&nested); ++ if (!nested || !msi_translate) { ++ return false; ++ } ++ return true; ++} ++ ++static int vfio_register_msi_binding(VFIOPCIDevice *vdev, ++ int vector_n, bool set) ++{ ++ VFIOContainer *container = vdev->vbasedev.group->container; ++ PCIDevice *dev = &vdev->pdev; ++ 
AddressSpace *as = pci_device_iommu_address_space(dev); ++ IOMMUMemoryRegionClass *imrc; ++ IOMMUMemoryRegion *iommu_mr; ++ IOMMUTLBEntry entry; ++ MSIMessage msg; ++ ++ if (as == &address_space_memory) { ++ return 0; ++ } ++ ++ iommu_mr = IOMMU_MEMORY_REGION(as->root); ++ if (!vfio_iommu_require_msi_binding(iommu_mr)) { ++ return 0; ++ } ++ ++ /* MSI doorbell address is translated by an IOMMU */ ++ ++ if (!set) { /* unregister */ ++ trace_vfio_unregister_msi_binding(vdev->vbasedev.name, vector_n); ++ ++ return vfio_iommu_unset_msi_binding(container, vector_n); ++ } ++ ++ msg = pci_get_msi_message(dev, vector_n); ++ imrc = memory_region_get_iommu_class_nocheck(iommu_mr); ++ ++ rcu_read_lock(); ++ entry = imrc->translate(iommu_mr, msg.address, IOMMU_WO, 0); ++ rcu_read_unlock(); ++ ++ if (entry.perm == IOMMU_NONE) { ++ return -ENOENT; ++ } ++ ++ trace_vfio_register_msi_binding(vdev->vbasedev.name, vector_n, ++ msg.address, entry.translated_addr); ++ ++ return vfio_iommu_set_msi_binding(container, vector_n, &entry); ++} ++ + static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix) + { + struct vfio_irq_set *irq_set; +@@ -382,7 +441,7 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix) + fds = (int32_t *)&irq_set->data; + + for (i = 0; i < vdev->nr_vectors; i++) { +- int fd = -1; ++ int ret, fd = -1; + + /* + * MSI vs MSI-X - The guest has direct access to MSI mask and pending +@@ -391,6 +450,12 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix) + * KVM signaling path only when configured and unmasked. + */ + if (vdev->msi_vectors[i].use) { ++ ret = vfio_register_msi_binding(vdev, i, true); ++ if (ret) { ++ error_report("%s failed to register S1 MSI binding " ++ "for vector %d(%d)", vdev->vbasedev.name, i, ret); ++ goto out; ++ } + if (vdev->msi_vectors[i].virq < 0 || + (msix && msix_is_masked(&vdev->pdev, i))) { + fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt); +@@ -404,6 +469,7 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix) + + ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set); + ++out: + g_free(irq_set); + + return ret; +@@ -718,7 +784,8 @@ static void vfio_msi_disable_common(VFIOPCIDevice *vdev) + + static void vfio_msix_disable(VFIOPCIDevice *vdev) + { +- int i; ++ int ret, i; ++ + + msix_unset_vector_notifiers(&vdev->pdev); + +@@ -730,6 +797,11 @@ static void vfio_msix_disable(VFIOPCIDevice *vdev) + if (vdev->msi_vectors[i].use) { + vfio_msix_vector_release(&vdev->pdev, i); + msix_vector_unuse(&vdev->pdev, i); ++ ret = vfio_register_msi_binding(vdev, i, false); ++ if (ret) { ++ error_report("%s: failed to unregister S1 MSI binding " ++ "for vector %d(%d)", vdev->vbasedev.name, i, ret); ++ } + } + } + +diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events +index 20069935f5..35bd415d6d 100644 +--- a/hw/vfio/trace-events ++++ b/hw/vfio/trace-events +@@ -120,6 +120,8 @@ vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype + vfio_dma_unmap_overflow_workaround(void) "" + vfio_iommu_addr_inv_iotlb(int asid, uint64_t addr, uint64_t size, uint64_t nb_granules, bool leaf) "nested IOTLB invalidate asid=%d, addr=0x%"PRIx64" granule_size=0x%"PRIx64" nb_granules=0x%"PRIx64" leaf=%d" + vfio_iommu_asid_inv_iotlb(int asid) "nested IOTLB invalidate asid=%d" ++vfio_register_msi_binding(const char *name, int vector, uint64_t giova, uint64_t gdb) "%s: register vector %d gIOVA=0x%"PRIx64 "-> gDB=0x%"PRIx64" stage 1 mapping" ++vfio_unregister_msi_binding(const char *name, int vector) "%s: unregister 
vector %d stage 1 mapping" + + # platform.c + vfio_platform_base_device_init(char *name, int groupid) "%s belongs to group #%d" +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index 0234f5e1b1..a838a939e4 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -74,6 +74,14 @@ typedef struct VFIOAddressSpace { + QLIST_ENTRY(VFIOAddressSpace) list; + } VFIOAddressSpace; + ++typedef struct VFIOMSIBinding { ++ int index; ++ hwaddr iova; ++ hwaddr gpa; ++ hwaddr size; ++ QLIST_ENTRY(VFIOMSIBinding) next; ++} VFIOMSIBinding; ++ + struct VFIOGroup; + + typedef struct VFIODMARange { +@@ -103,6 +111,7 @@ typedef struct VFIOContainer { + QLIST_HEAD(, VFIOGroup) group_list; + QLIST_HEAD(, VFIORamDiscardListener) vrdl_list; + QLIST_HEAD(, VFIODMARange) dma_list; ++ QLIST_HEAD(, VFIOMSIBinding) msibinding_list; + QLIST_ENTRY(VFIOContainer) next; + } VFIOContainer; + +@@ -222,6 +231,9 @@ VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp); + void vfio_put_group(VFIOGroup *group); + int vfio_get_device(VFIOGroup *group, const char *name, + VFIODevice *vbasedev, Error **errp); ++int vfio_iommu_set_msi_binding(VFIOContainer *container, int n, ++ IOMMUTLBEntry *entry); ++int vfio_iommu_unset_msi_binding(VFIOContainer *container, int n); + + extern const MemoryRegionOps vfio_region_ops; + typedef QLIST_HEAD(VFIOGroupList, VFIOGroup) VFIOGroupList; +-- +2.27.0 + diff --git a/vfio-Set-up-nested-stage-mappings.patch b/vfio-Set-up-nested-stage-mappings.patch new file mode 100644 index 0000000000000000000000000000000000000000..c6d87f97be54c3c76b2890ee978723ee946146c6 --- /dev/null +++ b/vfio-Set-up-nested-stage-mappings.patch @@ -0,0 +1,281 @@ +From 96581a5ee46e89dbc9e1ebe247b00adefb1c7a41 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Wed, 29 Aug 2018 18:10:12 +0200 +Subject: [PATCH] vfio: Set up nested stage mappings + +In nested mode, legacy vfio_iommu_map_notify cannot be used as +there is no "caching" mode and we do not trap on map. + +On Intel, vfio_iommu_map_notify was used to DMA map the RAM +through the host single stage. + +With nested mode, we need to set up the stage 2 and the stage 1 +separately. This patch introduces a prereg_listener to set up +the stage 2 mapping. + +The stage 1 mapping, owned by the guest, is passed to the host +when the guest invalidates the stage 1 configuration, through +a dedicated PCIPASIDOps callback. Guest IOTLB invalidations +are cascaded down to the host through another IOMMU MR UNMAP +notifier.
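The net effect on vfio_listener_region_add can be condensed: a nested container registers an UNMAP-only notifier (guest stage-1 invalidations are the only events that can arrive), any other container keeps the MAP/UNMAP notifier, and only a MAP-capable notifier gets the existing mappings replayed. A compressed sketch of that decision (the flag names are stand-ins, not the QEMU enum):

#include <stdbool.h>
#include <stdio.h>

enum { NOTIFIER_UNMAP = 1 << 0, NOTIFIER_MAP = 1 << 1 }; /* stand-ins */

int main(void)
{
    bool nested = true; /* container->iommu_type == VFIO_TYPE1_NESTING_IOMMU */
    int flags = nested ? NOTIFIER_UNMAP : (NOTIFIER_UNMAP | NOTIFIER_MAP);

    /* Only a MAP-capable notifier needs existing mappings replayed;
     * the nested UNMAP-only notifier must not be replayed into. */
    if (flags & NOTIFIER_MAP) {
        printf("replay existing mappings into the notifier\n");
    } else {
        printf("skip replay; rely on guest stage-1 invalidations\n");
    }
    return 0;
}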
+ +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/vfio/common.c | 139 +++++++++++++++++++++++++++++++++++++++++-- + hw/vfio/pci.c | 21 +++++++ + hw/vfio/trace-events | 2 + + 3 files changed, 157 insertions(+), 5 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index b3dc090840..58f8a43a43 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -707,6 +707,73 @@ static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, + return true; + } + ++/* Propagate a guest IOTLB invalidation to the host (nested mode) */ ++static void vfio_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) ++{ ++ VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); ++ struct vfio_iommu_type1_cache_invalidate ustruct = {}; ++ VFIOContainer *container = giommu->container; ++ int ret; ++ ++ assert(iotlb->perm == IOMMU_NONE); ++ ++ ustruct.argsz = sizeof(ustruct); ++ ustruct.flags = 0; ++ ustruct.info.argsz = sizeof(struct iommu_cache_invalidate_info); ++ ustruct.info.version = IOMMU_CACHE_INVALIDATE_INFO_VERSION_1; ++ ustruct.info.cache = IOMMU_CACHE_INV_TYPE_IOTLB; ++ ++ switch (iotlb->granularity) { ++ case IOMMU_INV_GRAN_DOMAIN: ++ ustruct.info.granularity = IOMMU_INV_GRANU_DOMAIN; ++ break; ++ case IOMMU_INV_GRAN_PASID: ++ { ++ struct iommu_inv_pasid_info *pasid_info; ++ int archid = -1; ++ ++ pasid_info = &ustruct.info.granu.pasid_info; ++ ustruct.info.granularity = IOMMU_INV_GRANU_PASID; ++ if (iotlb->flags & IOMMU_INV_FLAGS_ARCHID) { ++ pasid_info->flags |= IOMMU_INV_ADDR_FLAGS_ARCHID; ++ archid = iotlb->arch_id; ++ } ++ pasid_info->archid = archid; ++ trace_vfio_iommu_asid_inv_iotlb(archid); ++ break; ++ } ++ case IOMMU_INV_GRAN_ADDR: ++ { ++ hwaddr start = iotlb->iova + giommu->iommu_offset; ++ struct iommu_inv_addr_info *addr_info; ++ size_t size = iotlb->addr_mask + 1; ++ int archid = -1; ++ ++ addr_info = &ustruct.info.granu.addr_info; ++ ustruct.info.granularity = IOMMU_INV_GRANU_ADDR; ++ if (iotlb->leaf) { ++ addr_info->flags |= IOMMU_INV_ADDR_FLAGS_LEAF; ++ } ++ if (iotlb->flags & IOMMU_INV_FLAGS_ARCHID) { ++ addr_info->flags |= IOMMU_INV_ADDR_FLAGS_ARCHID; ++ archid = iotlb->arch_id; ++ } ++ addr_info->archid = archid; ++ addr_info->addr = start; ++ addr_info->granule_size = size; ++ addr_info->nb_granules = 1; ++ trace_vfio_iommu_addr_inv_iotlb(archid, start, size, ++ 1, iotlb->leaf); ++ break; ++ } ++ } ++ ++ ret = ioctl(container->fd, VFIO_IOMMU_CACHE_INVALIDATE, &ustruct); ++ if (ret) { ++ error_report("%p: failed to invalidate CACHE (%d)", container, ret); ++ } ++} ++ + static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) + { + VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); +@@ -1040,6 +1107,35 @@ static void vfio_dma_unmap_ram_section(VFIOContainer *container, + } + } + ++static void vfio_prereg_listener_region_add(MemoryListener *listener, ++ MemoryRegionSection *section) ++{ ++ VFIOContainer *container = ++ container_of(listener, VFIOContainer, prereg_listener); ++ Error *err = NULL; ++ ++ if (!memory_region_is_ram(section->mr)) { ++ return; ++ } ++ ++ vfio_dma_map_ram_section(container, section, &err); ++ if (err) { ++ error_report_err(err); ++ } ++} ++static void vfio_prereg_listener_region_del(MemoryListener *listener, ++ MemoryRegionSection *section) ++{ ++ VFIOContainer *container = ++ container_of(listener, VFIOContainer, prereg_listener); ++ ++ if (!memory_region_is_ram(section->mr)) { ++ return; ++ } ++ ++ vfio_dma_unmap_ram_section(container, section); ++} ++ + static void 
vfio_listener_region_add(MemoryListener *listener, + MemoryRegionSection *section) + { +@@ -1150,9 +1246,10 @@ static void vfio_listener_region_add(MemoryListener *listener, + memory_region_ref(section->mr); + + if (memory_region_is_iommu(section->mr)) { ++ IOMMUNotify notify; + VFIOGuestIOMMU *giommu; + IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr); +- int iommu_idx; ++ int iommu_idx, flags; + + trace_vfio_listener_region_add_iommu(iova, end); + /* +@@ -1171,8 +1268,18 @@ static void vfio_listener_region_add(MemoryListener *listener, + llend = int128_sub(llend, int128_one()); + iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr, + MEMTXATTRS_UNSPECIFIED); +- iommu_notifier_init(&giommu->n, vfio_iommu_map_notify, +- IOMMU_NOTIFIER_IOTLB_EVENTS, ++ ++ if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) { ++ /* IOTLB unmap notifier to propagate guest IOTLB invalidations */ ++ flags = IOMMU_NOTIFIER_UNMAP; ++ notify = vfio_iommu_unmap_notify; ++ } else { ++ /* MAP/UNMAP IOTLB notifier */ ++ flags = IOMMU_NOTIFIER_IOTLB_EVENTS; ++ notify = vfio_iommu_map_notify; ++ } ++ ++ iommu_notifier_init(&giommu->n, notify, flags, + section->offset_within_region, + int128_get64(llend), + iommu_idx); +@@ -1192,7 +1299,9 @@ static void vfio_listener_region_add(MemoryListener *listener, + goto fail; + } + QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next); +- memory_region_iommu_replay(giommu->iommu, &giommu->n); ++ if (flags & IOMMU_NOTIFIER_MAP) { ++ memory_region_iommu_replay(giommu->iommu, &giommu->n); ++ } + + return; + } +@@ -1672,10 +1781,16 @@ static const MemoryListener vfio_memory_listener = { + .log_clear = vfio_listener_log_clear, + }; + ++static MemoryListener vfio_memory_prereg_listener = { ++ .region_add = vfio_prereg_listener_region_add, ++ .region_del = vfio_prereg_listener_region_del, ++}; ++ + static void vfio_listener_release(VFIOContainer *container) + { + memory_listener_unregister(&container->listener); +- if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { ++ if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU || ++ container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) { + memory_listener_unregister(&container->prereg_listener); + } + } +@@ -2351,6 +2466,20 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, + vfio_get_iommu_info_migration(container, info); + } + g_free(info); ++ ++ if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) { ++ container->prereg_listener = vfio_memory_prereg_listener; ++ memory_listener_register(&container->prereg_listener, ++ &address_space_memory); ++ if (container->error) { ++ memory_listener_unregister(&container->prereg_listener); ++ ret = -1; ++ error_propagate_prepend(errp, container->error, ++ "RAM memory listener initialization failed " ++ "for container"); ++ goto free_container_exit; ++ } ++ } + break; + } + case VFIO_SPAPR_TCE_v2_IOMMU: +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index 7b45353ce2..ae5e014e5d 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -2797,6 +2797,25 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev) + vdev->req_enabled = false; + } + ++static int vfio_iommu_set_pasid_table(PCIBus *bus, int32_t devfn, ++ IOMMUConfig *config) ++{ ++ PCIDevice *pdev = bus->devices[devfn]; ++ VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev); ++ VFIOContainer *container = vdev->vbasedev.group->container; ++ struct vfio_iommu_type1_set_pasid_table info; ++ ++ info.argsz = sizeof(info); ++ info.flags = VFIO_PASID_TABLE_FLAG_SET; ++ memcpy(&info.config, 
&config->pasid_cfg, sizeof(config->pasid_cfg)); ++ ++ return ioctl(container->fd, VFIO_IOMMU_SET_PASID_TABLE, &info); ++} ++ ++static PCIPASIDOps vfio_pci_pasid_ops = { ++ .set_pasid_table = vfio_iommu_set_pasid_table, ++}; ++ + static void vfio_realize(PCIDevice *pdev, Error **errp) + { + VFIOPCIDevice *vdev = VFIO_PCI(pdev); +@@ -3108,6 +3127,8 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) + vfio_register_req_notifier(vdev); + vfio_setup_resetfn_quirk(vdev); + ++ pci_setup_pasid_ops(pdev, &vfio_pci_pasid_ops); ++ + return; + + out_deregister: +diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events +index a37563a315..20069935f5 100644 +--- a/hw/vfio/trace-events ++++ b/hw/vfio/trace-events +@@ -118,6 +118,8 @@ vfio_region_sparse_mmap_header(const char *name, int index, int nr_areas) "Devic + vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned long end) "sparse entry %d [0x%lx - 0x%lx]" + vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%0x8" + vfio_dma_unmap_overflow_workaround(void) "" ++vfio_iommu_addr_inv_iotlb(int asid, uint64_t addr, uint64_t size, uint64_t nb_granules, bool leaf) "nested IOTLB invalidate asid=%d, addr=0x%"PRIx64" granule_size=0x%"PRIx64" nb_granules=0x%"PRIx64" leaf=%d" ++vfio_iommu_asid_inv_iotlb(int asid) "nested IOTLB invalidate asid=%d" + + # platform.c + vfio_platform_base_device_init(char *name, int groupid) "%s belongs to group #%d" +-- +2.27.0 + diff --git a/vfio-common-Add-address-alignment-check-in-vfio_list.patch b/vfio-common-Add-address-alignment-check-in-vfio_list.patch new file mode 100644 index 0000000000000000000000000000000000000000..288f28482314f60dd0396228b35b17c0f9d67b8e --- /dev/null +++ b/vfio-common-Add-address-alignment-check-in-vfio_list.patch @@ -0,0 +1,53 @@ +From 00c553f53657bf4bc165d859187215dba7110246 Mon Sep 17 00:00:00 2001 +From: Kunkun Jiang +Date: Tue, 14 Sep 2021 14:21:46 +0800 +Subject: [PATCH] vfio/common: Add address alignment check in + vfio_listener_region_del + +Both vfio_listener_region_add and vfio_listener_region_del have +reference counting operations on ram section->mr. If the 'iova' +and 'llend' of the ram section do not pass the alignment +check, the ram section should not be mapped or unmapped. It means +that the reference counting should not be changed. + +However, the address alignment check is missing in +vfio_listener_region_del. As a result, memory_region_unref is +called unconditionally, which causes unintended problems in some +scenarios.
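The missing check mirrors what region_add already computes before mapping. In 64-bit arithmetic (QEMU uses Int128 to dodge overflow at the top of the address space), the added test amounts to the following sketch (the ALIGN macros and the 4 KiB host page size are assumptions):

#include <stdbool.h>
#include <stdint.h>

#define HOST_PAGE_SIZE 4096ULL /* assumed host page size */
#define ALIGN_UP(a)   (((a) + HOST_PAGE_SIZE - 1) & ~(HOST_PAGE_SIZE - 1))
#define ALIGN_DOWN(a) ((a) & ~(HOST_PAGE_SIZE - 1))

/* A section whose aligned start reaches its aligned end was never mapped
 * by region_add, so region_del must neither unmap nor unref it. */
static bool section_was_mapped(uint64_t offset_in_as, uint64_t size)
{
    uint64_t iova = ALIGN_UP(offset_in_as);
    uint64_t llend = ALIGN_DOWN(offset_in_as + size);

    return iova < llend;
}

int main(void)
{
    /* sub-page runt section: never mapped, so never unmapped either */
    return section_was_mapped(0x1800, 0x400) ? 1 : 0;
}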
+ +Signed-off-by: Kunkun Jiang +--- + hw/vfio/common.c | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index 89c49f5508..4d45c2b625 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -1411,6 +1411,8 @@ static void vfio_listener_region_del(MemoryListener *listener, + MemoryRegionSection *section) + { + VFIOContainer *container = container_of(listener, VFIOContainer, listener); ++ hwaddr iova; ++ Int128 llend; + + if (vfio_listener_skipped_section(section)) { + trace_vfio_listener_region_del_skip( +@@ -1460,6 +1462,14 @@ static void vfio_listener_region_del(MemoryListener *listener, + */ + } + ++ iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space); ++ llend = int128_make64(section->offset_within_address_space); ++ llend = int128_add(llend, section->size); ++ llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask)); ++ if (int128_ge(int128_make64(iova), llend)) { ++ return; ++ } ++ + vfio_dma_unmap_ram_section(container, section); + + memory_region_unref(section->mr); +-- +2.27.0 + diff --git a/vfio-common-Avoid-unmap-ram-section-at-vfio_listener.patch b/vfio-common-Avoid-unmap-ram-section-at-vfio_listener.patch new file mode 100644 index 0000000000000000000000000000000000000000..71302b2d21e9840a0591d2960452de622cd60731 --- /dev/null +++ b/vfio-common-Avoid-unmap-ram-section-at-vfio_listener.patch @@ -0,0 +1,39 @@ +From 9d7b782a0b2c5288e82f3064b4c5b7bf18887280 Mon Sep 17 00:00:00 2001 +From: Kunkun Jiang +Date: Sat, 31 Jul 2021 10:02:18 +0800 +Subject: [PATCH] vfio/common: Avoid unmap ram section at + vfio_listener_region_del() in nested mode + +The ram section will be unmapped at vfio_prereg_listener_region_del() +in nested mode. So let's avoid unmapping the ram section at +vfio_listener_region_del(). + +Signed-off-by: Kunkun Jiang +--- + hw/vfio/common.c | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index d05a485808..bdfcc854fe 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -1441,6 +1441,16 @@ static void vfio_listener_region_del(MemoryListener *listener, + } + } + ++ /* ++ * In nested mode, stage 2 (gpa->hpa) and the stage 1 ++ * (giova->gpa) are set separately. The ram section ++ * will be unmapped in vfio_prereg_listener_region_del(). ++ * Hence it doesn't need to unmap ram section here. ++ */ ++ if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) { ++ return; ++ } ++ + /* + * FIXME: We assume the one big unmap below is adequate to + * remove any individual page mappings in the IOMMU which +-- +2.27.0 + diff --git a/vfio-common-Fix-incorrect-address-alignment-in-vfio_.patch b/vfio-common-Fix-incorrect-address-alignment-in-vfio_.patch new file mode 100644 index 0000000000000000000000000000000000000000..d61408e6242adbb0c7e95b4cd94ec70fcca19c22 --- /dev/null +++ b/vfio-common-Fix-incorrect-address-alignment-in-vfio_.patch @@ -0,0 +1,40 @@ +From c2a4ce033db6ab74256e28da382c797a98047d4b Mon Sep 17 00:00:00 2001 +From: Kunkun Jiang +Date: Tue, 7 Sep 2021 15:14:12 +0800 +Subject: [PATCH] vfio/common: Fix incorrect address alignment in + vfio_dma_map_ram_section + +The 'iova' will be passed to the host kernel for mapping with the +HPA. It is related to the host page size. So TARGET_PAGE_ALIGN +should be replaced by REAL_HOST_PAGE_ALIGN. In the case of +large granularity (64K), it may return early when mapping an MMIO RAM +section. And because of the inconsistency with +vfio_dma_unmap_ram_section, it may cause 'assert(qrange)' +in vfio_dma_unmap.
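A concrete number makes the mismatch obvious. With a 4 KiB target page but a 64 KiB host granule, the two alignments disagree, so the iova recorded at map time (target-aligned) can never match the one looked up at unmap time (host-aligned), and the exact-match lookup in vfio_lookup_match_range trips the assert. Illustration only, with made-up values:

#include <inttypes.h>
#include <stdio.h>

#define TARGET_PAGE 0x1000ULL  /* 4 KiB target page */
#define HOST_PAGE   0x10000ULL /* assumption: 64 KiB host granule */
#define ALIGN(a, s) (((a) + (s) - 1) & ~((s) - 1))

int main(void)
{
    uint64_t offset = 0x3000; /* hypothetical section offset */

    /* prints 0x3000 vs 0x10000: map and unmap would key different iovas */
    printf("TARGET_PAGE_ALIGN    -> 0x%" PRIx64 "\n", ALIGN(offset, TARGET_PAGE));
    printf("REAL_HOST_PAGE_ALIGN -> 0x%" PRIx64 "\n", ALIGN(offset, HOST_PAGE));
    return 0;
}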
+ +Signed-off-by: Kunkun Jiang +Signed-off-by: Zenghui Yu +--- + hw/vfio/common.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index 65f3979492..89c49f5508 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -1059,10 +1059,10 @@ static int vfio_dma_map_ram_section(VFIOContainer *container, + + assert(memory_region_is_ram(section->mr)); + +- iova = TARGET_PAGE_ALIGN(section->offset_within_address_space); ++ iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space); + llend = int128_make64(section->offset_within_address_space); + llend = int128_add(llend, section->size); +- llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK)); ++ llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask)); + end = int128_get64(int128_sub(llend, int128_one())); + + vaddr = memory_region_get_ram_ptr(section->mr) + +-- +2.27.0 + diff --git a/vfio-migration-Add-support-for-manual-clear-vfio-dir.patch b/vfio-migration-Add-support-for-manual-clear-vfio-dir.patch new file mode 100644 index 0000000000000000000000000000000000000000..0a5ff88f995220f5c3128eb8e56c86afeffe53e2 --- /dev/null +++ b/vfio-migration-Add-support-for-manual-clear-vfio-dir.patch @@ -0,0 +1,224 @@ +From 815258f81a660ad87272191dca4a9726cb2bf5b2 Mon Sep 17 00:00:00 2001 +From: Zenghui Yu +Date: Sat, 8 May 2021 17:31:05 +0800 +Subject: [PATCH] vfio/migration: Add support for manual clear vfio dirty log + +The new capability VFIO_DIRTY_LOG_MANUAL_CLEAR and the new ioctl +VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR and +VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP have been introduced in +the kernel, tweak the userspace side to use them. + +Check if the kernel supports VFIO_DIRTY_LOG_MANUAL_CLEAR and +provide the log_clear() hook for vfio_memory_listener. If the +kernel supports it, deliver the clear message to the kernel. + +Signed-off-by: Zenghui Yu +Signed-off-by: Kunkun Jiang +--- + hw/vfio/common.c | 149 +++++++++++++++++++++++++++++++++- + include/hw/vfio/vfio-common.h | 1 + + 2 files changed, 148 insertions(+), 2 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index 86ea784919..6cb91e7ffd 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -1315,7 +1315,9 @@ static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, + dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range)); + + dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range); +- dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP; ++ dbitmap->flags = container->dirty_log_manual_clear ? ++ VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR : ++ VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP; + range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data; + range->iova = iova; + range->size = size; +@@ -1491,6 +1493,141 @@ static void vfio_listener_log_sync(MemoryListener *listener, + } + } + ++/* ++ * I'm not sure if there's any alignment requirement for the CLEAR_BITMAP ++ * ioctl. But copy from kvm side and align {start, size} with 64 pages. ++ * ++ * I think the code can be simplified a lot if no alignment requirement.
++ */ ++#define VFIO_CLEAR_LOG_SHIFT 6 ++#define VFIO_CLEAR_LOG_ALIGN (qemu_real_host_page_size << VFIO_CLEAR_LOG_SHIFT) ++#define VFIO_CLEAR_LOG_MASK (-VFIO_CLEAR_LOG_ALIGN) ++ ++static int vfio_log_clear_one_range(VFIOContainer *container, ++ VFIODMARange *qrange, uint64_t start, uint64_t size) ++{ ++ struct vfio_iommu_type1_dirty_bitmap *dbitmap; ++ struct vfio_iommu_type1_dirty_bitmap_get *range; ++ ++ dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range)); ++ ++ dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range); ++ dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP; ++ range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data; ++ ++ /* ++ * Now let's deal with the actual bitmap, which is almost the same ++ * as the kvm side. ++ */ ++ uint64_t end, bmap_start, start_delta, bmap_npages; ++ unsigned long *bmap_clear = NULL, psize = qemu_real_host_page_size; ++ int ret; ++ ++ bmap_start = start & VFIO_CLEAR_LOG_MASK; ++ start_delta = start - bmap_start; ++ bmap_start /= psize; ++ ++ bmap_npages = DIV_ROUND_UP(size + start_delta, VFIO_CLEAR_LOG_ALIGN) ++ << VFIO_CLEAR_LOG_SHIFT; ++ end = qrange->size / psize; ++ if (bmap_npages > end - bmap_start) { ++ bmap_npages = end - bmap_start; ++ } ++ start_delta /= psize; ++ ++ if (start_delta) { ++ bmap_clear = bitmap_new(bmap_npages); ++ bitmap_copy_with_src_offset(bmap_clear, qrange->bitmap, ++ bmap_start, start_delta + size / psize); ++ bitmap_clear(bmap_clear, 0, start_delta); ++ range->bitmap.data = (__u64 *)bmap_clear; ++ } else { ++ range->bitmap.data = (__u64 *)(qrange->bitmap + BIT_WORD(bmap_start)); ++ } ++ ++ range->iova = qrange->iova + bmap_start * psize; ++ range->size = bmap_npages * psize; ++ range->bitmap.size = ROUND_UP(bmap_npages, sizeof(__u64) * BITS_PER_BYTE) / ++ BITS_PER_BYTE; ++ range->bitmap.pgsize = qemu_real_host_page_size; ++ ++ ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap); ++ if (ret) { ++ error_report("Failed to clear dirty log for iova: 0x%"PRIx64 ++ " size: 0x%"PRIx64" err: %d", (uint64_t)range->iova, ++ (uint64_t)range->size, errno); ++ goto err_out; ++ } ++ ++ bitmap_clear(qrange->bitmap, bmap_start + start_delta, size / psize); ++err_out: ++ g_free(bmap_clear); ++ g_free(dbitmap); ++ return 0; ++} ++ ++static int vfio_physical_log_clear(VFIOContainer *container, ++ MemoryRegionSection *section) ++{ ++ uint64_t start, size, offset, count; ++ VFIODMARange *qrange; ++ int ret = 0; ++ ++ if (!container->dirty_log_manual_clear) { ++ /* No need to do explicit clear */ ++ return ret; ++ } ++ ++ start = section->offset_within_address_space; ++ size = int128_get64(section->size); ++ ++ if (!size) { ++ return ret; ++ } ++ ++ QLIST_FOREACH(qrange, &container->dma_list, next) { ++ /* ++ * Discard ranges that do not overlap the section (e.g., the ++ * Memory BAR regions of the device) ++ */ ++ if (qrange->iova > start + size - 1 || ++ start > qrange->iova + qrange->size - 1) { ++ continue; ++ } ++ ++ if (start >= qrange->iova) { ++ /* The range starts before section or is aligned to it. */ ++ offset = start - qrange->iova; ++ count = MIN(qrange->size - offset, size); ++ } else { ++ /* The range starts after section. 
*/ ++ offset = 0; ++ count = MIN(qrange->size, size - (qrange->iova - start)); ++ } ++ ret = vfio_log_clear_one_range(container, qrange, offset, count); ++ if (ret < 0) { ++ break; ++ } ++ } ++ ++ return ret; ++} ++ ++static void vfio_listener_log_clear(MemoryListener *listener, ++ MemoryRegionSection *section) ++{ ++ VFIOContainer *container = container_of(listener, VFIOContainer, listener); ++ ++ if (vfio_listener_skipped_section(section) || ++ !container->dirty_pages_supported) { ++ return; ++ } ++ ++ if (vfio_devices_all_dirty_tracking(container)) { ++ vfio_physical_log_clear(container, section); ++ } ++} ++ + static const MemoryListener vfio_memory_listener = { + .name = "vfio", + .region_add = vfio_listener_region_add, +@@ -1498,6 +1635,7 @@ static const MemoryListener vfio_memory_listener = { + .log_global_start = vfio_listener_log_global_start, + .log_global_stop = vfio_listener_log_global_stop, + .log_sync = vfio_listener_log_sync, ++ .log_clear = vfio_listener_log_clear, + }; + + static void vfio_listener_release(VFIOContainer *container) +@@ -1925,7 +2063,7 @@ static int vfio_get_iommu_type(VFIOContainer *container, + static int vfio_init_container(VFIOContainer *container, int group_fd, + Error **errp) + { +- int iommu_type, ret; ++ int iommu_type, dirty_log_manual_clear, ret; + + iommu_type = vfio_get_iommu_type(container, errp); + if (iommu_type < 0) { +@@ -1954,6 +2092,13 @@ static int vfio_init_container(VFIOContainer *container, int group_fd, + } + + container->iommu_type = iommu_type; ++ ++ dirty_log_manual_clear = ioctl(container->fd, VFIO_CHECK_EXTENSION, ++ VFIO_DIRTY_LOG_MANUAL_CLEAR); ++ if (dirty_log_manual_clear) { ++ container->dirty_log_manual_clear = dirty_log_manual_clear; ++ } ++ + return 0; + } + +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index 20b9c8a1d3..0234f5e1b1 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -93,6 +93,7 @@ typedef struct VFIOContainer { + Error *error; + bool initialized; + bool dirty_pages_supported; ++ bool dirty_log_manual_clear; + uint64_t dirty_pgsizes; + uint64_t max_dirty_bitmap_size; + unsigned long pgsizes; +-- +2.27.0 + diff --git a/vfio-pci-Implement-return_page_response-page-respons.patch b/vfio-pci-Implement-return_page_response-page-respons.patch new file mode 100644 index 0000000000000000000000000000000000000000..21d88a363a4ddb9305b9990424a121e8ccb68623 --- /dev/null +++ b/vfio-pci-Implement-return_page_response-page-respons.patch @@ -0,0 +1,199 @@ +From 6bbf810edebdb89a6958519ee3adfb1888520231 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Fri, 6 Nov 2020 12:03:29 -0500 +Subject: [PATCH] vfio/pci: Implement return_page_response page response + callback + +This patch implements the page response path. The +response is written into the page response ring buffer and then +the header's head index is updated. This path is not used +by this series. It is introduced here as a POC for vSVA/ARM +integration.
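Stripped of the region plumbing, the response path is a plain ring-buffer producer: deposit the entry at the current head, advance the head modulo the number of entries, then write the new index back through the region header. A toy version of that step (stand-in types; the real code pwrite()s the index to the header afterwards):

#include <stdint.h>
#include <string.h>

typedef struct { /* the three header fields the response path uses */
    uint32_t head;
    uint32_t nb_entries;
    uint32_t entry_size;
} RingHeader;

/* Copy one response at the current head, then advance the head with
 * wrap-around; returns the new head for the caller to write back. */
static uint32_t ring_push(RingHeader *h, uint8_t *ring, const void *resp)
{
    memcpy(ring + (size_t)h->head * h->entry_size, resp, h->entry_size);
    h->head = (h->head + 1) % h->nb_entries;
    return h->head;
}

int main(void)
{
    uint8_t ring[4 * 8] = { 0 }, resp[8] = { 1 };
    RingHeader h = { .head = 0, .nb_entries = 4, .entry_size = 8 };

    return ring_push(&h, ring, resp) == 1 ? 0 : 1;
}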
+ +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/vfio/pci.c | 123 ++++++++++++++++++++++++++++++++++++++++++++++++++ + hw/vfio/pci.h | 2 + + 2 files changed, 125 insertions(+) + +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index c54e62fe8f..8e24f9c7d1 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -2693,6 +2693,61 @@ out: + g_free(fault_region_info); + } + ++static void vfio_init_fault_response_regions(VFIOPCIDevice *vdev, Error **errp) ++{ ++ struct vfio_region_info *fault_region_info = NULL; ++ struct vfio_region_info_cap_fault *cap_fault; ++ VFIODevice *vbasedev = &vdev->vbasedev; ++ struct vfio_info_cap_header *hdr; ++ char *fault_region_name; ++ int ret; ++ ++ ret = vfio_get_dev_region_info(&vdev->vbasedev, ++ VFIO_REGION_TYPE_NESTED, ++ VFIO_REGION_SUBTYPE_NESTED_DMA_FAULT_RESPONSE, ++ &fault_region_info); ++ if (ret) { ++ goto out; ++ } ++ ++ hdr = vfio_get_region_info_cap(fault_region_info, ++ VFIO_REGION_INFO_CAP_DMA_FAULT_RESPONSE); ++ if (!hdr) { ++ error_setg(errp, "failed to retrieve DMA FAULT RESPONSE capability"); ++ goto out; ++ } ++ cap_fault = container_of(hdr, struct vfio_region_info_cap_fault, ++ header); ++ if (cap_fault->version != 1) { ++ error_setg(errp, "Unsupported DMA FAULT RESPONSE API version %d", ++ cap_fault->version); ++ goto out; ++ } ++ ++ fault_region_name = g_strdup_printf("%s DMA FAULT RESPONSE %d", ++ vbasedev->name, ++ fault_region_info->index); ++ ++ ret = vfio_region_setup(OBJECT(vdev), vbasedev, ++ &vdev->dma_fault_response_region, ++ fault_region_info->index, ++ fault_region_name); ++ g_free(fault_region_name); ++ if (ret) { ++ error_setg_errno(errp, -ret, ++ "failed to set up the DMA FAULT RESPONSE region %d", ++ fault_region_info->index); ++ goto out; ++ } ++ ++ ret = vfio_region_mmap(&vdev->dma_fault_response_region); ++ if (ret) { ++ error_setg_errno(errp, -ret, "Failed to mmap the DMA FAULT RESPONSE queue"); ++ } ++out: ++ g_free(fault_region_info); ++} ++ + static void vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) + { + VFIODevice *vbasedev = &vdev->vbasedev; +@@ -2768,6 +2823,12 @@ static void vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) + return; + } + ++ vfio_init_fault_response_regions(vdev, &err); ++ if (err) { ++ error_propagate(errp, err); ++ return; ++ } ++ + irq_info.index = VFIO_PCI_ERR_IRQ_INDEX; + + ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info); +@@ -2946,8 +3007,68 @@ static int vfio_iommu_set_pasid_table(PCIBus *bus, int32_t devfn, + return ioctl(container->fd, VFIO_IOMMU_SET_PASID_TABLE, &info); + } + ++static int vfio_iommu_return_page_response(PCIBus *bus, int32_t devfn, ++ IOMMUPageResponse *resp) ++{ ++ PCIDevice *pdev = bus->devices[devfn]; ++ VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev); ++ struct iommu_page_response *response = &resp->resp; ++ struct vfio_region_dma_fault_response header; ++ struct iommu_page_response *queue; ++ char *queue_buffer = NULL; ++ ssize_t bytes; ++ ++ if (!vdev->dma_fault_response_region.mem) { ++ return -EINVAL; ++ } ++ ++ /* read the header */ ++ bytes = pread(vdev->vbasedev.fd, &header, sizeof(header), ++ vdev->dma_fault_response_region.fd_offset); ++ if (bytes != sizeof(header)) { ++ error_report("%s unable to read the fault region header (0x%lx)", ++ __func__, bytes); ++ return -1; ++ } ++ ++ /* Normally the fault queue is mmapped */ ++ queue = (struct iommu_page_response *)vdev->dma_fault_response_region.mmaps[0].mmap; ++ if (!queue) { ++ size_t queue_size = header.nb_entries * header.entry_size; ++ ++ 
error_report("%s: fault queue not mmapped: slower fault handling", ++ vdev->vbasedev.name); ++ ++ queue_buffer = g_malloc(queue_size); ++ bytes = pread(vdev->vbasedev.fd, queue_buffer, queue_size, ++ vdev->dma_fault_response_region.fd_offset + header.offset); ++ if (bytes != queue_size) { ++ error_report("%s unable to read the fault queue (0x%lx)", ++ __func__, bytes); ++ return -1; ++ } ++ ++ queue = (struct iommu_page_response *)queue_buffer; ++ } ++ /* deposit the new response in the queue and increment the head */ ++ memcpy(queue + header.head, response, header.entry_size); ++ ++ vdev->fault_response_head_index = ++ (vdev->fault_response_head_index + 1) % header.nb_entries; ++ bytes = pwrite(vdev->vbasedev.fd, &vdev->fault_response_head_index, 4, ++ vdev->dma_fault_response_region.fd_offset); ++ if (bytes != 4) { ++ error_report("%s unable to write the fault response region head index (0x%lx)", ++ __func__, bytes); ++ } ++ g_free(queue_buffer); ++ ++ return 0; ++} ++ + static PCIPASIDOps vfio_pci_pasid_ops = { + .set_pasid_table = vfio_iommu_set_pasid_table, ++ .return_page_response = vfio_iommu_return_page_response, + }; + + static void vfio_dma_fault_notifier_handler(void *opaque) +@@ -3411,6 +3532,7 @@ static void vfio_instance_finalize(Object *obj) + vfio_display_finalize(vdev); + vfio_bars_finalize(vdev); + vfio_region_finalize(&vdev->dma_fault_region); ++ vfio_region_finalize(&vdev->dma_fault_response_region); + g_free(vdev->emulated_config_bits); + g_free(vdev->rom); + /* +@@ -3432,6 +3554,7 @@ static void vfio_exitfn(PCIDevice *pdev) + vfio_unregister_err_notifier(vdev); + vfio_unregister_ext_irq_notifiers(vdev); + vfio_region_exit(&vdev->dma_fault_region); ++ vfio_region_exit(&vdev->dma_fault_response_region); + pci_device_set_intx_routing_notifier(&vdev->pdev, NULL); + if (vdev->irqchip_change_notifier.notify) { + kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier); +diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h +index 03ac8919ef..61b3bf1303 100644 +--- a/hw/vfio/pci.h ++++ b/hw/vfio/pci.h +@@ -147,6 +147,8 @@ struct VFIOPCIDevice { + VFIOPCIExtIRQ *ext_irqs; + VFIORegion dma_fault_region; + uint32_t fault_tail_index; ++ VFIORegion dma_fault_response_region; ++ uint32_t fault_response_head_index; + int (*resetfn)(struct VFIOPCIDevice *); + uint32_t vendor_id; + uint32_t device_id; +-- +2.27.0 + diff --git a/vfio-pci-Implement-the-DMA-fault-handler.patch b/vfio-pci-Implement-the-DMA-fault-handler.patch new file mode 100644 index 0000000000000000000000000000000000000000..7d7349c9088662d0e38aa22c0b0ecbea3e0f7506 --- /dev/null +++ b/vfio-pci-Implement-the-DMA-fault-handler.patch @@ -0,0 +1,96 @@ +From d33cc7eccb68c6a1488804c94ff5c1197ee0fc6e Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Tue, 5 Mar 2019 16:35:32 +0100 +Subject: [PATCH] vfio/pci: Implement the DMA fault handler + +Whenever the eventfd is triggered, we retrieve the DMA fault(s) +from the mmapped fault region and inject them in the iommu +memory region. 
+ +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/vfio/pci.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ + hw/vfio/pci.h | 1 + + 2 files changed, 51 insertions(+) + +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index 76bc9d3506..c54e62fe8f 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -2953,10 +2953,60 @@ static PCIPASIDOps vfio_pci_pasid_ops = { + static void vfio_dma_fault_notifier_handler(void *opaque) + { + VFIOPCIExtIRQ *ext_irq = opaque; ++ VFIOPCIDevice *vdev = ext_irq->vdev; ++ PCIDevice *pdev = &vdev->pdev; ++ AddressSpace *as = pci_device_iommu_address_space(pdev); ++ IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(as->root); ++ struct vfio_region_dma_fault header; ++ struct iommu_fault *queue; ++ char *queue_buffer = NULL; ++ ssize_t bytes; + + if (!event_notifier_test_and_clear(&ext_irq->notifier)) { + return; + } ++ ++ bytes = pread(vdev->vbasedev.fd, &header, sizeof(header), ++ vdev->dma_fault_region.fd_offset); ++ if (bytes != sizeof(header)) { ++ error_report("%s unable to read the fault region header (0x%lx)", ++ __func__, bytes); ++ return; ++ } ++ ++ /* Normally the fault queue is mmapped */ ++ queue = (struct iommu_fault *)vdev->dma_fault_region.mmaps[0].mmap; ++ if (!queue) { ++ size_t queue_size = header.nb_entries * header.entry_size; ++ ++ error_report("%s: fault queue not mmapped: slower fault handling", ++ vdev->vbasedev.name); ++ ++ queue_buffer = g_malloc(queue_size); ++ bytes = pread(vdev->vbasedev.fd, queue_buffer, queue_size, ++ vdev->dma_fault_region.fd_offset + header.offset); ++ if (bytes != queue_size) { ++ error_report("%s unable to read the fault queue (0x%lx)", ++ __func__, bytes); ++ return; ++ } ++ ++ queue = (struct iommu_fault *)queue_buffer; ++ } ++ ++ while (vdev->fault_tail_index != header.head) { ++ memory_region_inject_faults(iommu_mr, 1, ++ &queue[vdev->fault_tail_index]); ++ vdev->fault_tail_index = ++ (vdev->fault_tail_index + 1) % header.nb_entries; ++ } ++ bytes = pwrite(vdev->vbasedev.fd, &vdev->fault_tail_index, 4, ++ vdev->dma_fault_region.fd_offset); ++ if (bytes != 4) { ++ error_report("%s unable to write the fault region tail index (0x%lx)", ++ __func__, bytes); ++ } ++ g_free(queue_buffer); + } + + static int vfio_register_ext_irq_handler(VFIOPCIDevice *vdev, +diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h +index eef91065f1..03ac8919ef 100644 +--- a/hw/vfio/pci.h ++++ b/hw/vfio/pci.h +@@ -146,6 +146,7 @@ struct VFIOPCIDevice { + EventNotifier req_notifier; + VFIOPCIExtIRQ *ext_irqs; + VFIORegion dma_fault_region; ++ uint32_t fault_tail_index; + int (*resetfn)(struct VFIOPCIDevice *); + uint32_t vendor_id; + uint32_t device_id; +-- +2.27.0 + diff --git a/vfio-pci-Register-handler-for-iommu-fault.patch b/vfio-pci-Register-handler-for-iommu-fault.patch new file mode 100644 index 0000000000000000000000000000000000000000..7209a807ee0911af0f6af7e01e29a9ff389944dc --- /dev/null +++ b/vfio-pci-Register-handler-for-iommu-fault.patch @@ -0,0 +1,168 @@ +From 574455d1363e818905e05cd23ef0948e83a16a51 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Thu, 13 Dec 2018 04:39:30 -0500 +Subject: [PATCH] vfio/pci: Register handler for iommu fault + +We use the new extended IRQ VFIO_IRQ_TYPE_NESTED type and +VFIO_IRQ_SUBTYPE_DMA_FAULT subtype to set/unset +a notifier for physical DMA faults. The associated eventfd is +triggered, in nested mode, whenever a fault is detected at IOMMU +physical level. + +The actual handler will be implemented in subsequent patches. 
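Behind the VFIO plumbing this is the classic eventfd pattern: the kernel signals the fd when a physical fault is queued, and the handler must consume the counter before doing any work, which is what event_notifier_test_and_clear() wraps. A self-contained Linux sketch (the write() here only simulates the kernel's signal; in the real flow the fd is handed to the kernel via VFIO_DEVICE_SET_IRQS):

#include <stdint.h>
#include <sys/eventfd.h>
#include <unistd.h>

/* Clear the eventfd counter first; only then is there work to do. */
static void fault_notifier_handler(int fd)
{
    uint64_t cnt;

    if (read(fd, &cnt, sizeof(cnt)) != sizeof(cnt)) {
        return; /* nothing pending */
    }
    /* a real handler would drain the fault queue here */
}

int main(void)
{
    int fd = eventfd(0, EFD_NONBLOCK);
    uint64_t one = 1;

    if (fd < 0 || write(fd, &one, sizeof(one)) != sizeof(one)) {
        return 1; /* simulated fault signal failed */
    }
    fault_notifier_handler(fd);
    close(fd);
    return 0;
}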
+ +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/vfio/pci.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++- + hw/vfio/pci.h | 7 +++++ + 2 files changed, 87 insertions(+), 1 deletion(-) + +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index 99c52a0944..37a70932c6 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -2888,6 +2888,76 @@ static PCIPASIDOps vfio_pci_pasid_ops = { + .set_pasid_table = vfio_iommu_set_pasid_table, + }; + ++static void vfio_dma_fault_notifier_handler(void *opaque) ++{ ++ VFIOPCIExtIRQ *ext_irq = opaque; ++ ++ if (!event_notifier_test_and_clear(&ext_irq->notifier)) { ++ return; ++ } ++} ++ ++static int vfio_register_ext_irq_handler(VFIOPCIDevice *vdev, ++ uint32_t type, uint32_t subtype, ++ IOHandler *handler) ++{ ++ int32_t fd, ext_irq_index, index; ++ struct vfio_irq_info *irq_info; ++ Error *err = NULL; ++ EventNotifier *n; ++ int ret; ++ ++ ret = vfio_get_dev_irq_info(&vdev->vbasedev, type, subtype, &irq_info); ++ if (ret) { ++ return ret; ++ } ++ index = irq_info->index; ++ ext_irq_index = irq_info->index - VFIO_PCI_NUM_IRQS; ++ g_free(irq_info); ++ ++ vdev->ext_irqs[ext_irq_index].vdev = vdev; ++ vdev->ext_irqs[ext_irq_index].index = index; ++ n = &vdev->ext_irqs[ext_irq_index].notifier; ++ ++ ret = event_notifier_init(n, 0); ++ if (ret) { ++ error_report("vfio: Unable to init event notifier for ext irq %d(%d)", ++ ext_irq_index, ret); ++ return ret; ++ } ++ ++ fd = event_notifier_get_fd(n); ++ qemu_set_fd_handler(fd, vfio_dma_fault_notifier_handler, NULL, ++ &vdev->ext_irqs[ext_irq_index]); ++ ++ ret = vfio_set_irq_signaling(&vdev->vbasedev, index, 0, ++ VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err); ++ if (ret) { ++ error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); ++ qemu_set_fd_handler(fd, NULL, NULL, vdev); ++ event_notifier_cleanup(n); ++ } ++ return ret; ++} ++ ++static void vfio_unregister_ext_irq_notifiers(VFIOPCIDevice *vdev) ++{ ++ VFIODevice *vbasedev = &vdev->vbasedev; ++ Error *err = NULL; ++ int i; ++ ++ for (i = 0; i < vbasedev->num_irqs - VFIO_PCI_NUM_IRQS; i++) { ++ if (vfio_set_irq_signaling(vbasedev, i + VFIO_PCI_NUM_IRQS , 0, ++ VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) { ++ error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); ++ } ++ qemu_set_fd_handler(event_notifier_get_fd(&vdev->ext_irqs[i].notifier), ++ NULL, NULL, vdev); ++ event_notifier_cleanup(&vdev->ext_irqs[i].notifier); ++ } ++ g_free(vdev->ext_irqs); ++} ++ + static void vfio_realize(PCIDevice *pdev, Error **errp) + { + VFIOPCIDevice *vdev = VFIO_PCI(pdev); +@@ -2898,7 +2968,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) + ssize_t len; + struct stat st; + int groupid; +- int i, ret; ++ int i, ret, nb_ext_irqs; + bool is_mdev; + + if (!vdev->vbasedev.sysfsdev) { +@@ -2986,6 +3056,11 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) + goto error; + } + ++ nb_ext_irqs = vdev->vbasedev.num_irqs - VFIO_PCI_NUM_IRQS; ++ if (nb_ext_irqs > 0) { ++ vdev->ext_irqs = g_new0(VFIOPCIExtIRQ, nb_ext_irqs); ++ } ++ + vfio_populate_device(vdev, &err); + if (err) { + error_propagate(errp, err); +@@ -3197,6 +3272,9 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) + + vfio_register_err_notifier(vdev); + vfio_register_req_notifier(vdev); ++ vfio_register_ext_irq_handler(vdev, VFIO_IRQ_TYPE_NESTED, ++ VFIO_IRQ_SUBTYPE_DMA_FAULT, ++ vfio_dma_fault_notifier_handler); + vfio_setup_resetfn_quirk(vdev); + + pci_setup_pasid_ops(pdev, &vfio_pci_pasid_ops); +@@ -3239,6 +3317,7 @@ static void vfio_exitfn(PCIDevice *pdev) + + 
vfio_unregister_req_notifier(vdev); + vfio_unregister_err_notifier(vdev); ++ vfio_unregister_ext_irq_notifiers(vdev); + pci_device_set_intx_routing_notifier(&vdev->pdev, NULL); + if (vdev->irqchip_change_notifier.notify) { + kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier); +diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h +index 64777516d1..a8b06737fb 100644 +--- a/hw/vfio/pci.h ++++ b/hw/vfio/pci.h +@@ -114,6 +114,12 @@ typedef struct VFIOMSIXInfo { + unsigned long *pending; + } VFIOMSIXInfo; + ++typedef struct VFIOPCIExtIRQ { ++ struct VFIOPCIDevice *vdev; ++ EventNotifier notifier; ++ uint32_t index; ++} VFIOPCIExtIRQ; ++ + #define TYPE_VFIO_PCI "vfio-pci" + OBJECT_DECLARE_SIMPLE_TYPE(VFIOPCIDevice, VFIO_PCI) + +@@ -138,6 +144,7 @@ struct VFIOPCIDevice { + PCIHostDeviceAddress host; + EventNotifier err_notifier; + EventNotifier req_notifier; ++ VFIOPCIExtIRQ *ext_irqs; + int (*resetfn)(struct VFIOPCIDevice *); + uint32_t vendor_id; + uint32_t device_id; +-- +2.27.0 + diff --git a/vfio-pci-Set-up-the-DMA-FAULT-region.patch b/vfio-pci-Set-up-the-DMA-FAULT-region.patch new file mode 100644 index 0000000000000000000000000000000000000000..9a4757dc7bbd03571a7d7e5981828ad817da5654 --- /dev/null +++ b/vfio-pci-Set-up-the-DMA-FAULT-region.patch @@ -0,0 +1,132 @@ +From e701d0fef4fbb7935d6aa7d22d82eb2dcfee2431 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Thu, 13 Dec 2018 10:57:53 -0500 +Subject: [PATCH] vfio/pci: Set up the DMA FAULT region + +Set up the fault region which is composed of the actual fault +queue (mmappable) and a header used to handle it. The fault +queue is mmapped. + +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/vfio/pci.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++ + hw/vfio/pci.h | 1 + + 2 files changed, 65 insertions(+) + +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index 37a70932c6..76bc9d3506 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -2638,11 +2638,67 @@ int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp) + return 0; + } + ++static void vfio_init_fault_regions(VFIOPCIDevice *vdev, Error **errp) ++{ ++ struct vfio_region_info *fault_region_info = NULL; ++ struct vfio_region_info_cap_fault *cap_fault; ++ VFIODevice *vbasedev = &vdev->vbasedev; ++ struct vfio_info_cap_header *hdr; ++ char *fault_region_name; ++ int ret; ++ ++ ret = vfio_get_dev_region_info(&vdev->vbasedev, ++ VFIO_REGION_TYPE_NESTED, ++ VFIO_REGION_SUBTYPE_NESTED_DMA_FAULT, ++ &fault_region_info); ++ if (ret) { ++ goto out; ++ } ++ ++ hdr = vfio_get_region_info_cap(fault_region_info, ++ VFIO_REGION_INFO_CAP_DMA_FAULT); ++ if (!hdr) { ++ error_setg(errp, "failed to retrieve DMA FAULT capability"); ++ goto out; ++ } ++ cap_fault = container_of(hdr, struct vfio_region_info_cap_fault, ++ header); ++ if (cap_fault->version != 1) { ++ error_setg(errp, "Unsupported DMA FAULT API version %d", ++ cap_fault->version); ++ goto out; ++ } ++ ++ fault_region_name = g_strdup_printf("%s DMA FAULT %d", ++ vbasedev->name, ++ fault_region_info->index); ++ ++ ret = vfio_region_setup(OBJECT(vdev), vbasedev, ++ &vdev->dma_fault_region, ++ fault_region_info->index, ++ fault_region_name); ++ g_free(fault_region_name); ++ if (ret) { ++ error_setg_errno(errp, -ret, ++ "failed to set up the DMA FAULT region %d", ++ fault_region_info->index); ++ goto out; ++ } ++ ++ ret = vfio_region_mmap(&vdev->dma_fault_region); ++ if (ret) { ++ error_setg_errno(errp, -ret, "Failed to mmap the DMA FAULT queue"); ++ } ++out: ++ g_free(fault_region_info); ++} ++ + static void 
vfio_populate_device(VFIOPCIDevice *vdev, Error **errp)
+ {
+     VFIODevice *vbasedev = &vdev->vbasedev;
+     struct vfio_region_info *reg_info;
+     struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
++    Error *err = NULL;
+     int i, ret = -1;
+ 
+     /* Sanity check device */
+@@ -2706,6 +2762,12 @@ static void vfio_populate_device(VFIOPCIDevice *vdev, Error **errp)
+         }
+     }
+ 
++    vfio_init_fault_regions(vdev, &err);
++    if (err) {
++        error_propagate(errp, err);
++        return;
++    }
++
+     irq_info.index = VFIO_PCI_ERR_IRQ_INDEX;
+ 
+     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
+@@ -3298,6 +3360,7 @@ static void vfio_instance_finalize(Object *obj)
+ 
+     vfio_display_finalize(vdev);
+     vfio_bars_finalize(vdev);
++    vfio_region_finalize(&vdev->dma_fault_region);
+     g_free(vdev->emulated_config_bits);
+     g_free(vdev->rom);
+     /*
+@@ -3318,6 +3381,7 @@ static void vfio_exitfn(PCIDevice *pdev)
+     vfio_unregister_req_notifier(vdev);
+     vfio_unregister_err_notifier(vdev);
+     vfio_unregister_ext_irq_notifiers(vdev);
++    vfio_region_exit(&vdev->dma_fault_region);
+     pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
+     if (vdev->irqchip_change_notifier.notify) {
+         kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier);
+diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
+index a8b06737fb..eef91065f1 100644
+--- a/hw/vfio/pci.h
++++ b/hw/vfio/pci.h
+@@ -145,6 +145,7 @@ struct VFIOPCIDevice {
+     EventNotifier err_notifier;
+     EventNotifier req_notifier;
+     VFIOPCIExtIRQ *ext_irqs;
++    VFIORegion dma_fault_region;
+     int (*resetfn)(struct VFIOPCIDevice *);
+     uint32_t vendor_id;
+     uint32_t device_id;
+-- 
+2.27.0
+
diff --git a/vfio.h-and-iommu.h-header-update-against-5.10.patch b/vfio.h-and-iommu.h-header-update-against-5.10.patch
new file mode 100644
index 0000000000000000000000000000000000000000..8272a6793eb1b672efe3980f4abc3ca61cb39732
--- /dev/null
+++ b/vfio.h-and-iommu.h-header-update-against-5.10.patch
@@ -0,0 +1,701 @@
+From 36b65d7312a343cb636e6963b8262dce9420ebc6 Mon Sep 17 00:00:00 2001
+From: Kunkun Jiang
+Date: Fri, 30 Jul 2021 09:15:31 +0800
+Subject: [PATCH] vfio.h and iommu.h header update against 5.10
+
+Signed-off-by: Kunkun Jiang
+---
+ linux-headers/linux/iommu.h | 395 ++++++++++++++++++++++++++++++++++++
+ linux-headers/linux/vfio.h  | 220 +++++++++++++++++++-
+ 2 files changed, 613 insertions(+), 2 deletions(-)
+ create mode 100644 linux-headers/linux/iommu.h
+
+diff --git a/linux-headers/linux/iommu.h b/linux-headers/linux/iommu.h
+new file mode 100644
+index 0000000000..773b7dc2d6
+--- /dev/null
++++ b/linux-headers/linux/iommu.h
+@@ -0,0 +1,395 @@
++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
++/*
++ * IOMMU user API definitions
++ */
++
++#ifndef IOMMU_H
++#define IOMMU_H
++
++#include <linux/types.h>
++
++#define IOMMU_FAULT_PERM_READ (1 << 0) /* read */
++#define IOMMU_FAULT_PERM_WRITE (1 << 1) /* write */
++#define IOMMU_FAULT_PERM_EXEC (1 << 2) /* exec */
++#define IOMMU_FAULT_PERM_PRIV (1 << 3) /* privileged */
++
++/* Generic fault types, can be expanded IRQ remapping fault */
++enum iommu_fault_type {
++    IOMMU_FAULT_DMA_UNRECOV = 1, /* unrecoverable fault */
++    IOMMU_FAULT_PAGE_REQ, /* page request fault */
++};
++
++enum iommu_fault_reason {
++    IOMMU_FAULT_REASON_UNKNOWN = 0,
++
++    /* Could not access the PASID table (fetch caused external abort) */
++    IOMMU_FAULT_REASON_PASID_FETCH,
++
++    /* PASID entry is invalid or has configuration errors */
++    IOMMU_FAULT_REASON_BAD_PASID_ENTRY,
++
++    /*
++     * PASID is out of range (e.g. 
exceeds the maximum PASID ++ * supported by the IOMMU) or disabled. ++ */ ++ IOMMU_FAULT_REASON_PASID_INVALID, ++ ++ /* ++ * An external abort occurred fetching (or updating) a translation ++ * table descriptor ++ */ ++ IOMMU_FAULT_REASON_WALK_EABT, ++ ++ /* ++ * Could not access the page table entry (Bad address), ++ * actual translation fault ++ */ ++ IOMMU_FAULT_REASON_PTE_FETCH, ++ ++ /* Protection flag check failed */ ++ IOMMU_FAULT_REASON_PERMISSION, ++ ++ /* access flag check failed */ ++ IOMMU_FAULT_REASON_ACCESS, ++ ++ /* Output address of a translation stage caused Address Size fault */ ++ IOMMU_FAULT_REASON_OOR_ADDRESS, ++}; ++ ++/** ++ * struct iommu_fault_unrecoverable - Unrecoverable fault data ++ * @reason: reason of the fault, from &enum iommu_fault_reason ++ * @flags: parameters of this fault (IOMMU_FAULT_UNRECOV_* values) ++ * @pasid: Process Address Space ID ++ * @perm: requested permission access using by the incoming transaction ++ * (IOMMU_FAULT_PERM_* values) ++ * @addr: offending page address ++ * @fetch_addr: address that caused a fetch abort, if any ++ */ ++struct iommu_fault_unrecoverable { ++ __u32 reason; ++#define IOMMU_FAULT_UNRECOV_PASID_VALID (1 << 0) ++#define IOMMU_FAULT_UNRECOV_ADDR_VALID (1 << 1) ++#define IOMMU_FAULT_UNRECOV_FETCH_ADDR_VALID (1 << 2) ++ __u32 flags; ++ __u32 pasid; ++ __u32 perm; ++ __u64 addr; ++ __u64 fetch_addr; ++}; ++ ++/** ++ * struct iommu_fault_page_request - Page Request data ++ * @flags: encodes whether the corresponding fields are valid and whether this ++ * is the last page in group (IOMMU_FAULT_PAGE_REQUEST_* values). ++ * When IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID is set, the page response ++ * must have the same PASID value as the page request. When it is clear, ++ * the page response should not have a PASID. ++ * @pasid: Process Address Space ID ++ * @grpid: Page Request Group Index ++ * @perm: requested page permissions (IOMMU_FAULT_PERM_* values) ++ * @addr: page address ++ * @private_data: device-specific private information ++ */ ++struct iommu_fault_page_request { ++#define IOMMU_FAULT_PAGE_REQUEST_PASID_VALID (1 << 0) ++#define IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE (1 << 1) ++#define IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA (1 << 2) ++#define IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID (1 << 3) ++ __u32 flags; ++ __u32 pasid; ++ __u32 grpid; ++ __u32 perm; ++ __u64 addr; ++ __u64 private_data[2]; ++}; ++ ++/** ++ * struct iommu_fault - Generic fault data ++ * @type: fault type from &enum iommu_fault_type ++ * @padding: reserved for future use (should be zero) ++ * @event: fault event, when @type is %IOMMU_FAULT_DMA_UNRECOV ++ * @prm: Page Request message, when @type is %IOMMU_FAULT_PAGE_REQ ++ * @padding2: sets the fault size to allow for future extensions ++ */ ++struct iommu_fault { ++ __u32 type; ++ __u32 padding; ++ union { ++ struct iommu_fault_unrecoverable event; ++ struct iommu_fault_page_request prm; ++ __u8 padding2[56]; ++ }; ++}; ++ ++/** ++ * enum iommu_page_response_code - Return status of fault handlers ++ * @IOMMU_PAGE_RESP_SUCCESS: Fault has been handled and the page tables ++ * populated, retry the access. This is "Success" in PCI PRI. ++ * @IOMMU_PAGE_RESP_FAILURE: General error. Drop all subsequent faults from ++ * this device if possible. This is "Response Failure" in PCI PRI. ++ * @IOMMU_PAGE_RESP_INVALID: Could not handle this fault, don't retry the ++ * access. This is "Invalid Request" in PCI PRI. 
++ */ ++enum iommu_page_response_code { ++ IOMMU_PAGE_RESP_SUCCESS = 0, ++ IOMMU_PAGE_RESP_INVALID, ++ IOMMU_PAGE_RESP_FAILURE, ++}; ++ ++/** ++ * struct iommu_page_response - Generic page response information ++ * @argsz: User filled size of this data ++ * @version: API version of this structure ++ * @flags: encodes whether the corresponding fields are valid ++ * (IOMMU_FAULT_PAGE_RESPONSE_* values) ++ * @pasid: Process Address Space ID ++ * @grpid: Page Request Group Index ++ * @code: response code from &enum iommu_page_response_code ++ */ ++struct iommu_page_response { ++ __u32 argsz; ++#define IOMMU_PAGE_RESP_VERSION_1 1 ++ __u32 version; ++#define IOMMU_PAGE_RESP_PASID_VALID (1 << 0) ++ __u32 flags; ++ __u32 pasid; ++ __u32 grpid; ++ __u32 code; ++}; ++ ++/* defines the granularity of the invalidation */ ++enum iommu_inv_granularity { ++ IOMMU_INV_GRANU_DOMAIN, /* domain-selective invalidation */ ++ IOMMU_INV_GRANU_PASID, /* PASID-selective invalidation */ ++ IOMMU_INV_GRANU_ADDR, /* page-selective invalidation */ ++ IOMMU_INV_GRANU_NR, /* number of invalidation granularities */ ++}; ++ ++/** ++ * struct iommu_inv_addr_info - Address Selective Invalidation Structure ++ * ++ * @flags: indicates the granularity of the address-selective invalidation ++ * - If the PASID bit is set, the @pasid field is populated and the invalidation ++ * relates to cache entries tagged with this PASID and matching the address ++ * range. ++ * - If ARCHID bit is set, @archid is populated and the invalidation relates ++ * to cache entries tagged with this architecture specific ID and matching ++ * the address range. ++ * - Both PASID and ARCHID can be set as they may tag different caches. ++ * - If neither PASID or ARCHID is set, global addr invalidation applies. ++ * - The LEAF flag indicates whether only the leaf PTE caching needs to be ++ * invalidated and other paging structure caches can be preserved. ++ * @pasid: process address space ID ++ * @archid: architecture-specific ID ++ * @addr: first stage/level input address ++ * @granule_size: page/block size of the mapping in bytes ++ * @nb_granules: number of contiguous granules to be invalidated ++ */ ++struct iommu_inv_addr_info { ++#define IOMMU_INV_ADDR_FLAGS_PASID (1 << 0) ++#define IOMMU_INV_ADDR_FLAGS_ARCHID (1 << 1) ++#define IOMMU_INV_ADDR_FLAGS_LEAF (1 << 2) ++ __u32 flags; ++ __u32 archid; ++ __u64 pasid; ++ __u64 addr; ++ __u64 granule_size; ++ __u64 nb_granules; ++}; ++ ++/** ++ * struct iommu_inv_pasid_info - PASID Selective Invalidation Structure ++ * ++ * @flags: indicates the granularity of the PASID-selective invalidation ++ * - If the PASID bit is set, the @pasid field is populated and the invalidation ++ * relates to cache entries tagged with this PASID and matching the address ++ * range. ++ * - If the ARCHID bit is set, the @archid is populated and the invalidation ++ * relates to cache entries tagged with this architecture specific ID and ++ * matching the address range. ++ * - Both PASID and ARCHID can be set as they may tag different caches. ++ * - At least one of PASID or ARCHID must be set. 
++ * @pasid: process address space ID ++ * @archid: architecture-specific ID ++ */ ++struct iommu_inv_pasid_info { ++#define IOMMU_INV_PASID_FLAGS_PASID (1 << 0) ++#define IOMMU_INV_PASID_FLAGS_ARCHID (1 << 1) ++ __u32 flags; ++ __u32 archid; ++ __u64 pasid; ++}; ++ ++/** ++ * struct iommu_cache_invalidate_info - First level/stage invalidation ++ * information ++ * @argsz: User filled size of this data ++ * @version: API version of this structure ++ * @cache: bitfield that allows to select which caches to invalidate ++ * @granularity: defines the lowest granularity used for the invalidation: ++ * domain > PASID > addr ++ * @padding: reserved for future use (should be zero) ++ * @pasid_info: invalidation data when @granularity is %IOMMU_INV_GRANU_PASID ++ * @addr_info: invalidation data when @granularity is %IOMMU_INV_GRANU_ADDR ++ * ++ * Not all the combinations of cache/granularity are valid: ++ * ++ * +--------------+---------------+---------------+---------------+ ++ * | type / | DEV_IOTLB | IOTLB | PASID | ++ * | granularity | | | cache | ++ * +==============+===============+===============+===============+ ++ * | DOMAIN | N/A | Y | Y | ++ * +--------------+---------------+---------------+---------------+ ++ * | PASID | Y | Y | Y | ++ * +--------------+---------------+---------------+---------------+ ++ * | ADDR | Y | Y | N/A | ++ * +--------------+---------------+---------------+---------------+ ++ * ++ * Invalidations by %IOMMU_INV_GRANU_DOMAIN don't take any argument other than ++ * @version and @cache. ++ * ++ * If multiple cache types are invalidated simultaneously, they all ++ * must support the used granularity. ++ */ ++struct iommu_cache_invalidate_info { ++ __u32 argsz; ++#define IOMMU_CACHE_INVALIDATE_INFO_VERSION_1 1 ++ __u32 version; ++/* IOMMU paging structure cache */ ++#define IOMMU_CACHE_INV_TYPE_IOTLB (1 << 0) /* IOMMU IOTLB */ ++#define IOMMU_CACHE_INV_TYPE_DEV_IOTLB (1 << 1) /* Device IOTLB */ ++#define IOMMU_CACHE_INV_TYPE_PASID (1 << 2) /* PASID cache */ ++#define IOMMU_CACHE_INV_TYPE_NR (3) ++ __u8 cache; ++ __u8 granularity; ++ __u8 padding[6]; ++ union { ++ struct iommu_inv_pasid_info pasid_info; ++ struct iommu_inv_addr_info addr_info; ++ } granu; ++}; ++ ++/** ++ * struct iommu_gpasid_bind_data_vtd - Intel VT-d specific data on device and guest ++ * SVA binding. ++ * ++ * @flags: VT-d PASID table entry attributes ++ * @pat: Page attribute table data to compute effective memory type ++ * @emt: Extended memory type ++ * ++ * Only guest vIOMMU selectable and effective options are passed down to ++ * the host IOMMU. 
++ */ ++struct iommu_gpasid_bind_data_vtd { ++#define IOMMU_SVA_VTD_GPASID_SRE (1 << 0) /* supervisor request */ ++#define IOMMU_SVA_VTD_GPASID_EAFE (1 << 1) /* extended access enable */ ++#define IOMMU_SVA_VTD_GPASID_PCD (1 << 2) /* page-level cache disable */ ++#define IOMMU_SVA_VTD_GPASID_PWT (1 << 3) /* page-level write through */ ++#define IOMMU_SVA_VTD_GPASID_EMTE (1 << 4) /* extended mem type enable */ ++#define IOMMU_SVA_VTD_GPASID_CD (1 << 5) /* PASID-level cache disable */ ++#define IOMMU_SVA_VTD_GPASID_LAST (1 << 6) ++ __u64 flags; ++ __u32 pat; ++ __u32 emt; ++}; ++ ++#define IOMMU_SVA_VTD_GPASID_MTS_MASK (IOMMU_SVA_VTD_GPASID_CD | \ ++ IOMMU_SVA_VTD_GPASID_EMTE | \ ++ IOMMU_SVA_VTD_GPASID_PCD | \ ++ IOMMU_SVA_VTD_GPASID_PWT) ++ ++/** ++ * struct iommu_gpasid_bind_data - Information about device and guest PASID binding ++ * @argsz: User filled size of this data ++ * @version: Version of this data structure ++ * @format: PASID table entry format ++ * @flags: Additional information on guest bind request ++ * @gpgd: Guest page directory base of the guest mm to bind ++ * @hpasid: Process address space ID used for the guest mm in host IOMMU ++ * @gpasid: Process address space ID used for the guest mm in guest IOMMU ++ * @addr_width: Guest virtual address width ++ * @padding: Reserved for future use (should be zero) ++ * @vtd: Intel VT-d specific data ++ * ++ * Guest to host PASID mapping can be an identity or non-identity, where guest ++ * has its own PASID space. For non-identify mapping, guest to host PASID lookup ++ * is needed when VM programs guest PASID into an assigned device. VMM may ++ * trap such PASID programming then request host IOMMU driver to convert guest ++ * PASID to host PASID based on this bind data. ++ */ ++struct iommu_gpasid_bind_data { ++ __u32 argsz; ++#define IOMMU_GPASID_BIND_VERSION_1 1 ++ __u32 version; ++#define IOMMU_PASID_FORMAT_INTEL_VTD 1 ++#define IOMMU_PASID_FORMAT_LAST 2 ++ __u32 format; ++ __u32 addr_width; ++#define IOMMU_SVA_GPASID_VAL (1 << 0) /* guest PASID valid */ ++ __u64 flags; ++ __u64 gpgd; ++ __u64 hpasid; ++ __u64 gpasid; ++ __u8 padding[8]; ++ /* Vendor specific data */ ++ union { ++ struct iommu_gpasid_bind_data_vtd vtd; ++ } vendor; ++}; ++ ++/** ++ * struct iommu_pasid_smmuv3 - ARM SMMUv3 Stream Table Entry stage 1 related ++ * information ++ * @version: API version of this structure ++ * @s1fmt: STE s1fmt (format of the CD table: single CD, linear table ++ * or 2-level table) ++ * @s1dss: STE s1dss (specifies the behavior when @pasid_bits != 0 ++ * and no PASID is passed along with the incoming transaction) ++ * @padding: reserved for future use (should be zero) ++ * ++ * The PASID table is referred to as the Context Descriptor (CD) table on ARM ++ * SMMUv3. Please refer to the ARM SMMU 3.x spec (ARM IHI 0070A) for full ++ * details. ++ */ ++struct iommu_pasid_smmuv3 { ++#define PASID_TABLE_SMMUV3_CFG_VERSION_1 1 ++ __u32 version; ++ __u8 s1fmt; ++ __u8 s1dss; ++ __u8 padding[2]; ++}; ++ ++/** ++ * struct iommu_pasid_table_config - PASID table data used to bind guest PASID ++ * table to the host IOMMU ++ * @argsz: User filled size of this data ++ * @version: API version to prepare for future extensions ++ * @base_ptr: guest physical address of the PASID table ++ * @format: format of the PASID table ++ * @pasid_bits: number of PASID bits used in the PASID table ++ * @config: indicates whether the guest translation stage must ++ * be translated, bypassed or aborted. 
++ * @padding: reserved for future use (should be zero)
++ * @vendor_data.smmuv3: table information when @format is
++ * %IOMMU_PASID_FORMAT_SMMUV3
++ */
++struct iommu_pasid_table_config {
++    __u32 argsz;
++#define PASID_TABLE_CFG_VERSION_1 1
++    __u32 version;
++    __u64 base_ptr;
++#define IOMMU_PASID_FORMAT_SMMUV3 1
++    __u32 format;
++    __u8 pasid_bits;
++#define IOMMU_PASID_CONFIG_TRANSLATE 1
++#define IOMMU_PASID_CONFIG_BYPASS 2
++#define IOMMU_PASID_CONFIG_ABORT 3
++    __u8 config;
++    __u8 padding[2];
++    union {
++        struct iommu_pasid_smmuv3 smmuv3;
++    } vendor_data;
++};
++
++#endif /* IOMMU_H */
+diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
+index f4ff038e8c..cf8e208fac 100644
+--- a/linux-headers/linux/vfio.h
++++ b/linux-headers/linux/vfio.h
+@@ -14,6 +14,7 @@
+ 
+ #include <linux/types.h>
+ #include <linux/ioctl.h>
++#include <linux/iommu.h>
+ 
+ #define VFIO_API_VERSION 0
+ 
+@@ -334,6 +335,7 @@ struct vfio_region_info_cap_type {
+ #define VFIO_REGION_TYPE_GFX (1)
+ #define VFIO_REGION_TYPE_CCW (2)
+ #define VFIO_REGION_TYPE_MIGRATION (3)
++#define VFIO_REGION_TYPE_NESTED (4)
+ 
+ /* sub-types for VFIO_REGION_TYPE_PCI_* */
+ 
+@@ -362,6 +364,10 @@ struct vfio_region_info_cap_type {
+ /* sub-types for VFIO_REGION_TYPE_GFX */
+ #define VFIO_REGION_SUBTYPE_GFX_EDID (1)
+ 
++/* sub-types for VFIO_REGION_TYPE_NESTED */
++#define VFIO_REGION_SUBTYPE_NESTED_DMA_FAULT (1)
++#define VFIO_REGION_SUBTYPE_NESTED_DMA_FAULT_RESPONSE (2)
++
+ /**
+  * struct vfio_region_gfx_edid - EDID region layout.
+  *
+@@ -721,11 +727,30 @@ struct vfio_irq_info {
+ #define VFIO_IRQ_INFO_MASKABLE (1 << 1)
+ #define VFIO_IRQ_INFO_AUTOMASKED (1 << 2)
+ #define VFIO_IRQ_INFO_NORESIZE (1 << 3)
++#define VFIO_IRQ_INFO_FLAG_CAPS (1 << 4) /* Info supports caps */
+     __u32 index; /* IRQ index */
+     __u32 count; /* Number of IRQs within this index */
++    __u32 cap_offset; /* Offset within info struct of first cap */
+ };
+ #define VFIO_DEVICE_GET_IRQ_INFO _IO(VFIO_TYPE, VFIO_BASE + 9)
+ 
++/*
++ * The irq type capability allows IRQs unique to a specific device or
++ * class of devices to be exposed.
++ *
++ * The structures below define version 1 of this capability. 
++ */ ++#define VFIO_IRQ_INFO_CAP_TYPE 3 ++ ++struct vfio_irq_info_cap_type { ++ struct vfio_info_cap_header header; ++ __u32 type; /* global per bus driver */ ++ __u32 subtype; /* type specific */ ++}; ++ ++#define VFIO_IRQ_TYPE_NESTED (1) ++#define VFIO_IRQ_SUBTYPE_DMA_FAULT (1) ++ + /** + * VFIO_DEVICE_SET_IRQS - _IOW(VFIO_TYPE, VFIO_BASE + 10, struct vfio_irq_set) + * +@@ -827,7 +852,8 @@ enum { + VFIO_PCI_MSIX_IRQ_INDEX, + VFIO_PCI_ERR_IRQ_INDEX, + VFIO_PCI_REQ_IRQ_INDEX, +- VFIO_PCI_NUM_IRQS ++ VFIO_PCI_NUM_IRQS = 5 /* Fixed user ABI, IRQ indexes >=5 use */ ++ /* device specific cap to define content */ + }; + + /* +@@ -1012,6 +1038,68 @@ struct vfio_device_feature { + */ + #define VFIO_DEVICE_FEATURE_PCI_VF_TOKEN (0) + ++/* ++ * Capability exposed by the DMA fault region ++ * @version: ABI version ++ */ ++#define VFIO_REGION_INFO_CAP_DMA_FAULT 6 ++ ++struct vfio_region_info_cap_fault { ++ struct vfio_info_cap_header header; ++ __u32 version; ++}; ++ ++/* ++ * Capability exposed by the DMA fault response region ++ * @version: ABI version ++ */ ++#define VFIO_REGION_INFO_CAP_DMA_FAULT_RESPONSE 7 ++ ++struct vfio_region_info_cap_fault_response { ++ struct vfio_info_cap_header header; ++ __u32 version; ++}; ++ ++/* ++ * DMA Fault Region Layout ++ * @tail: index relative to the start of the ring buffer at which the ++ * consumer finds the next item in the buffer ++ * @entry_size: fault ring buffer entry size in bytes ++ * @nb_entries: max capacity of the fault ring buffer ++ * @offset: ring buffer offset relative to the start of the region ++ * @head: index relative to the start of the ring buffer at which the ++ * producer (kernel) inserts items into the buffers ++ */ ++struct vfio_region_dma_fault { ++ /* Write-Only */ ++ __u32 tail; ++ /* Read-Only */ ++ __u32 entry_size; ++ __u32 nb_entries; ++ __u32 offset; ++ __u32 head; ++}; ++ ++/* ++ * DMA Fault Response Region Layout ++ * @head: index relative to the start of the ring buffer at which the ++ * producer (userspace) insert responses into the buffer ++ * @entry_size: fault ring buffer entry size in bytes ++ * @nb_entries: max capacity of the fault ring buffer ++ * @offset: ring buffer offset relative to the start of the region ++ * @tail: index relative to the start of the ring buffer at which the ++ * consumer (kernel) finds the next item in the buffer ++ */ ++struct vfio_region_dma_fault_response { ++ /* Write-Only */ ++ __u32 head; ++ /* Read-Only */ ++ __u32 entry_size; ++ __u32 nb_entries; ++ __u32 offset; ++ __u32 tail; ++}; ++ + /* -------- API for Type1 VFIO IOMMU -------- */ + + /** +@@ -1124,7 +1212,7 @@ struct vfio_iommu_type1_dma_map { + struct vfio_bitmap { + __u64 pgsize; /* page size for bitmap in bytes */ + __u64 size; /* in bytes */ +- __u64 *data; /* one bit per page */ ++ __u64 *data; /* one bit per page */ + }; + + /** +@@ -1250,6 +1338,134 @@ struct vfio_iommu_type1_dirty_bitmap_get { + + #define VFIO_IOMMU_DIRTY_PAGES _IO(VFIO_TYPE, VFIO_BASE + 17) + ++/* ++ * VFIO_IOMMU_BIND_PROCESS ++ * ++ * Allocate a PASID for a process address space, and use it to attach this ++ * process to all devices in the container. Devices can then tag their DMA ++ * traffic with the returned @pasid to perform transactions on the associated ++ * virtual address space. Mapping and unmapping buffers is performed by standard ++ * functions such as mmap and malloc. ++ * ++ * If flag is VFIO_IOMMU_BIND_PID, @pid contains the pid of a foreign process to ++ * bind. Otherwise the current task is bound. 
Given that the caller owns the ++ * device, setting this flag grants the caller read and write permissions on the ++ * entire address space of foreign process described by @pid. Therefore, ++ * permission to perform the bind operation on a foreign process is governed by ++ * the ptrace access mode PTRACE_MODE_ATTACH_REALCREDS check. See man ptrace(2) ++ * for more information. ++ * ++ * On success, VFIO writes a Process Address Space ID (PASID) into @pasid. This ++ * ID is unique to a process and can be used on all devices in the container. ++ * ++ * On fork, the child inherits the device fd and can use the bonds setup by its ++ * parent. Consequently, the child has R/W access on the address spaces bound by ++ * its parent. After an execv, the device fd is closed and the child doesn't ++ * have access to the address space anymore. ++ * ++ * To remove a bond between process and container, VFIO_IOMMU_UNBIND ioctl is ++ * issued with the same parameters. If a pid was specified in VFIO_IOMMU_BIND, ++ * it should also be present for VFIO_IOMMU_UNBIND. Otherwise unbind the current ++ * task from the container. ++ */ ++struct vfio_iommu_type1_bind_process { ++ __u32 flags; ++#define VFIO_IOMMU_BIND_PID (1 << 0) ++ __u32 pasid; ++ __s32 pid; ++}; ++ ++/* ++ * Only mode supported at the moment is VFIO_IOMMU_BIND_PROCESS, which takes ++ * vfio_iommu_type1_bind_process in data. ++ */ ++struct vfio_iommu_type1_bind { ++ __u32 argsz; ++ __u32 flags; ++#define VFIO_IOMMU_BIND_PROCESS (1 << 0) ++ __u8 data[]; ++}; ++ ++/* ++ * VFIO_IOMMU_BIND - _IOWR(VFIO_TYPE, VFIO_BASE + 22, struct vfio_iommu_bind) ++ * ++ * Manage address spaces of devices in this container. Initially a TYPE1 ++ * container can only have one address space, managed with ++ * VFIO_IOMMU_MAP/UNMAP_DMA. ++ * ++ * An IOMMU of type VFIO_TYPE1_NESTING_IOMMU can be managed by both MAP/UNMAP ++ * and BIND ioctls at the same time. MAP/UNMAP acts on the stage-2 (host) page ++ * tables, and BIND manages the stage-1 (guest) page tables. Other types of ++ * IOMMU may allow MAP/UNMAP and BIND to coexist, where MAP/UNMAP controls ++ * non-PASID traffic and BIND controls PASID traffic. But this depends on the ++ * underlying IOMMU architecture and isn't guaranteed. ++ * ++ * Availability of this feature depends on the device, its bus, the underlying ++ * IOMMU and the CPU architecture. ++ * ++ * returns: 0 on success, -errno on failure. ++ */ ++#define VFIO_IOMMU_BIND _IO(VFIO_TYPE, VFIO_BASE + 22) ++ ++/* ++ * VFIO_IOMMU_UNBIND - _IOWR(VFIO_TYPE, VFIO_BASE + 23, struct vfio_iommu_bind) ++ * ++ * Undo what was done by the corresponding VFIO_IOMMU_BIND ioctl. ++ */ ++#define VFIO_IOMMU_UNBIND _IO(VFIO_TYPE, VFIO_BASE + 23) ++ ++/* ++ * VFIO_IOMMU_SET_PASID_TABLE - _IOWR(VFIO_TYPE, VFIO_BASE + 18, ++ * struct vfio_iommu_type1_set_pasid_table) ++ * ++ * The SET operation passes a PASID table to the host while the ++ * UNSET operation detaches the one currently programmed. It is ++ * allowed to "SET" the table several times without unsetting as ++ * long as the table config does not stay IOMMU_PASID_CONFIG_TRANSLATE. 
++ */ ++struct vfio_iommu_type1_set_pasid_table { ++ __u32 argsz; ++ __u32 flags; ++#define VFIO_PASID_TABLE_FLAG_SET (1 << 0) ++#define VFIO_PASID_TABLE_FLAG_UNSET (1 << 1) ++ struct iommu_pasid_table_config config; /* used on SET */ ++}; ++ ++#define VFIO_IOMMU_SET_PASID_TABLE _IO(VFIO_TYPE, VFIO_BASE + 18) ++ ++/** ++ * VFIO_IOMMU_CACHE_INVALIDATE - _IOWR(VFIO_TYPE, VFIO_BASE + 19, ++ * struct vfio_iommu_type1_cache_invalidate) ++ * ++ * Propagate guest IOMMU cache invalidation to the host. ++ */ ++struct vfio_iommu_type1_cache_invalidate { ++ __u32 argsz; ++ __u32 flags; ++ struct iommu_cache_invalidate_info info; ++}; ++#define VFIO_IOMMU_CACHE_INVALIDATE _IO(VFIO_TYPE, VFIO_BASE + 19) ++ ++/** ++ * VFIO_IOMMU_SET_MSI_BINDING - _IOWR(VFIO_TYPE, VFIO_BASE + 20, ++ * struct vfio_iommu_type1_set_msi_binding) ++ * ++ * Pass a stage 1 MSI doorbell mapping to the host so that this ++ * latter can build a nested stage2 mapping. Or conversely tear ++ * down a previously bound stage 1 MSI binding. ++ */ ++struct vfio_iommu_type1_set_msi_binding { ++ __u32 argsz; ++ __u32 flags; ++#define VFIO_IOMMU_BIND_MSI (1 << 0) ++#define VFIO_IOMMU_UNBIND_MSI (1 << 1) ++ __u64 iova; /* MSI guest IOVA */ ++ /* Fields below are used on BIND */ ++ __u64 gpa; /* MSI guest physical address */ ++ __u64 size; /* size of stage1 mapping (bytes) */ ++}; ++#define VFIO_IOMMU_SET_MSI_BINDING _IO(VFIO_TYPE, VFIO_BASE + 20) ++ + /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */ + + /* +-- +2.27.0 + diff --git a/xhci-check-reg-to-avoid-OOB-read.patch b/xhci-check-reg-to-avoid-OOB-read.patch new file mode 100644 index 0000000000000000000000000000000000000000..f6a4c356578e4f686ad8cb0dbbcdd24e6cd8b816 --- /dev/null +++ b/xhci-check-reg-to-avoid-OOB-read.patch @@ -0,0 +1,47 @@ +From a95ada20170af0a71529c1583846e402cdbb850b Mon Sep 17 00:00:00 2001 +From: Yan Wang +Date: Thu, 10 Feb 2022 10:41:40 +0800 +Subject: [PATCH] xhci: check reg to avoid OOB read + +Add a sanity check to fix OOB read access. + +Signed-off-by: Yan Wang +--- + hw/usb/hcd-xhci.c | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +diff --git a/hw/usb/hcd-xhci.c b/hw/usb/hcd-xhci.c +index e01700039b..08cd63e159 100644 +--- a/hw/usb/hcd-xhci.c ++++ b/hw/usb/hcd-xhci.c +@@ -27,6 +27,7 @@ + #include "hw/qdev-properties.h" + #include "trace.h" + #include "qapi/error.h" ++#include "qemu/log.h" + + #include "hcd-xhci.h" + +@@ -3017,14 +3018,17 @@ static void xhci_runtime_write(void *ptr, hwaddr reg, + XHCIInterrupter *intr; + int v; + +- trace_usb_xhci_runtime_write(reg, val); +- + if (reg < 0x20) { + trace_usb_xhci_unimplemented("runtime write", reg); + return; + } + v = (reg - 0x20) / 0x20; ++ if (v >= xhci->numintrs) { ++ qemu_log("intr nr out of range (%d >= %d)\n", v, xhci->numintrs); ++ return; ++ } + intr = &xhci->intr[v]; ++ trace_usb_xhci_runtime_write(reg, val); + + switch (reg & 0x1f) { + case 0x00: /* IMAN */ +-- +2.27.0 +
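
As a companion to the DMA FAULT region layout declared in the vfio.h update above, here is a minimal consumer sketch of the fault ring, assuming the region header and the fault queue have already been mmapped (as vfio_init_fault_regions() arranges). It is an illustration only, not code from this series: drain_dma_fault_queue() and process_one_fault() are hypothetical names, and per the layout comments userspace reads @head, @entry_size and @nb_entries while writing back only @tail.

static uint32_t drain_dma_fault_queue(struct vfio_region_dma_fault *hdr,
                                      void *queue, uint32_t tail)
{
    /* @head is the producer index, advanced by the kernel */
    while (tail != hdr->head) {
        struct iommu_fault *fault = (struct iommu_fault *)
            ((char *)queue + (size_t)tail * hdr->entry_size);

        process_one_fault(fault);            /* hypothetical per-record handler */
        tail = (tail + 1) % hdr->nb_entries; /* wrap around the ring */
    }
    hdr->tail = tail; /* publish the consumer index back to the kernel */
    return tail;
}

Keeping the consumer index in the caller mirrors the split documented in struct vfio_region_dma_fault: @tail is write-only from the userspace side, so its current value cannot be read back from the region and must be tracked locally.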