From 7a4587345e340072e6582f5b495e567a35c37ce6 Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Fri, 11 Feb 2022 18:20:59 +0800 Subject: [PATCH 01/56] util/log: add CONFIG_DISABLE_QEMU_LOG macro Using CONFIG_DISABLE_QEMU_LOG macro to control qemu_log function. Signed-off-by: Yan Wang --- ...og-add-CONFIG_DISABLE_QEMU_LOG-macro.patch | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 util-log-add-CONFIG_DISABLE_QEMU_LOG-macro.patch diff --git a/util-log-add-CONFIG_DISABLE_QEMU_LOG-macro.patch b/util-log-add-CONFIG_DISABLE_QEMU_LOG-macro.patch new file mode 100644 index 00000000..f6940d69 --- /dev/null +++ b/util-log-add-CONFIG_DISABLE_QEMU_LOG-macro.patch @@ -0,0 +1,41 @@ +From 05462305ec8b9ce5b414ede1e7e680b16d1a08ad Mon Sep 17 00:00:00 2001 +From: Yan Wang +Date: Fri, 11 Feb 2022 18:20:59 +0800 +Subject: [PATCH] util/log: add CONFIG_DISABLE_QEMU_LOG macro + +Using CONFIG_DISABLE_QEMU_LOG macro to control +qemu_log function. + +Signed-off-by: Yan Wang +--- + util/log.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +diff --git a/util/log.c b/util/log.c +index 2ee1500bee..ed3029fe5c 100644 +--- a/util/log.c ++++ b/util/log.c +@@ -34,6 +34,12 @@ int qemu_loglevel; + static int log_append = 0; + static GArray *debug_regions; + ++#ifdef CONFIG_DISABLE_QEMU_LOG ++int qemu_log(const char *fmt, ...) ++{ ++ return 0; ++} ++#else + /* Return the number of characters emitted. */ + int qemu_log(const char *fmt, ...) + { +@@ -56,6 +62,7 @@ int qemu_log(const char *fmt, ...) 
+ rcu_read_unlock(); + return ret; + } ++#endif + + static void __attribute__((__constructor__)) qemu_logfile_init(void) + { +-- +2.27.0 + -- Gitee From 05079f3fcc9657a1bc675bb11594db5bd28d89b9 Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Sat, 12 Feb 2022 17:20:20 +0800 Subject: [PATCH 02/56] =?UTF-8?q?spec:=20Update=20patch=20and=20changelog?= =?UTF-8?q?=20with=20!245=20=E3=80=906.2.0=E3=80=91=E5=A2=9E=E5=8A=A0?= =?UTF-8?q?=E6=8E=A7=E5=88=B6qemu=5Flog=E5=87=BD=E6=95=B0=E7=9A=84?= =?UTF-8?q?=E5=AE=8F=20=20!245?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit util/log: add CONFIG_DISABLE_QEMU_LOG macro Signed-off-by: Chen Qun --- qemu.spec | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/qemu.spec b/qemu.spec index 5e4aca10..325effa2 100644 --- a/qemu.spec +++ b/qemu.spec @@ -86,6 +86,7 @@ Patch0073: seabios-increase-the-seabios-high-mem-zone-size.patch Patch0074: seabios-increase-the-seabios-minibiostable.patch Patch0075: IPv6-add-support-for-IPv6-protocol.patch Patch0076: Use-post-increment-only-in-inffast.c.patch +Patch0077: util-log-add-CONFIG_DISABLE_QEMU_LOG-macro.patch BuildRequires: flex BuildRequires: gcc @@ -530,6 +531,9 @@ getent passwd qemu >/dev/null || \ %endif %changelog +* Sat Feb 12 2022 Chen Qun +- util/log: add CONFIG_DISABLE_QEMU_LOG macro + * Sat Feb 12 2022 Yan Wang - ipxe: IPv6 add support for IPv6 protocol - u-boot: Use post increment only in inffast.c -- Gitee From 6e5093c45b71c43afe98f3f7d4aff5f06d5f5905 Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Tue, 8 Feb 2022 15:48:01 +0800 Subject: [PATCH 03/56] log: Add some logs on VM runtime path Add logs on VM runtime path, to make it easier to do trouble shooting. 
Signed-off-by: Ying Fang Signed-off-by: Yan Wang --- log-Add-some-logs-on-VM-runtime-path.patch | 170 +++++++++++++++++++++ 1 file changed, 170 insertions(+) create mode 100644 log-Add-some-logs-on-VM-runtime-path.patch diff --git a/log-Add-some-logs-on-VM-runtime-path.patch b/log-Add-some-logs-on-VM-runtime-path.patch new file mode 100644 index 00000000..90408a3a --- /dev/null +++ b/log-Add-some-logs-on-VM-runtime-path.patch @@ -0,0 +1,170 @@ +From d0ed3afacd2af1cbfcfb615471ade3c8c4185c00 Mon Sep 17 00:00:00 2001 +From: Yan Wang +Date: Tue, 8 Feb 2022 15:48:01 +0800 +Subject: [PATCH] log: Add some logs on VM runtime path + +Add logs on VM runtime path, to make it easier to do trouble shooting. + +Signed-off-by: Ying Fang +Signed-off-by: Yan Wang +--- + hw/virtio/virtio-pci.c | 2 ++ + hw/virtio/virtio.c | 14 ++++++++++++-- + monitor/monitor.c | 9 +++++++++ + qapi/qmp-dispatch.c | 15 +++++++++++++++ + softmmu/qdev-monitor.c | 4 +++- + 5 files changed, 41 insertions(+), 3 deletions(-) + +diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c +index 750aa47ec1..38a5dc1ba8 100644 +--- a/hw/virtio/virtio-pci.c ++++ b/hw/virtio/virtio-pci.c +@@ -1772,7 +1772,9 @@ static void virtio_pci_device_unplugged(DeviceState *d) + VirtIOPCIProxy *proxy = VIRTIO_PCI(d); + bool modern = virtio_pci_modern(proxy); + bool modern_pio = proxy->flags & VIRTIO_PCI_FLAG_MODERN_PIO_NOTIFY; ++ VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + ++ qemu_log("unplug device name: %s\n", !vdev ? "NULL" : vdev->name); + virtio_pci_stop_ioeventfd(proxy); + + if (modern) { +diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c +index ea7c079fb0..9b4ac58a16 100644 +--- a/hw/virtio/virtio.c ++++ b/hw/virtio/virtio.c +@@ -1945,7 +1945,14 @@ int virtio_set_status(VirtIODevice *vdev, uint8_t val) + k->set_status(vdev, val); + } + vdev->status = val; +- ++ if (val) { ++ qemu_log("%s device status is %d that means %s\n", ++ vdev->name, val, ++ (val & VIRTIO_CONFIG_S_DRIVER_OK) ? 
"DRIVER OK" : ++ (val & VIRTIO_CONFIG_S_DRIVER) ? "DRIVER" : ++ (val & VIRTIO_CONFIG_S_ACKNOWLEDGE) ? "ACKNOWLEDGE" : ++ (val & VIRTIO_CONFIG_S_FAILED) ? "FAILED" : "UNKNOWN"); ++ } + return 0; + } + +@@ -2389,8 +2396,11 @@ VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size, + break; + } + +- if (i == VIRTIO_QUEUE_MAX || queue_size > VIRTQUEUE_MAX_SIZE) ++ if (i == VIRTIO_QUEUE_MAX || queue_size > VIRTQUEUE_MAX_SIZE) { ++ qemu_log("unacceptable queue_size (%d) or num (%d)\n", ++ queue_size, i); + abort(); ++ } + + vdev->vq[i].vring.num = queue_size; + vdev->vq[i].vring.num_default = queue_size; +diff --git a/monitor/monitor.c b/monitor/monitor.c +index 21c7a68758..013c628695 100644 +--- a/monitor/monitor.c ++++ b/monitor/monitor.c +@@ -29,6 +29,7 @@ + #include "qapi/qapi-emit-events.h" + #include "qapi/qapi-visit-control.h" + #include "qapi/qmp/qdict.h" ++#include "qapi/qmp/qjson.h" + #include "qemu/error-report.h" + #include "qemu/option.h" + #include "sysemu/qtest.h" +@@ -318,6 +319,7 @@ static void monitor_qapi_event_emit(QAPIEvent event, QDict *qdict) + { + Monitor *mon; + MonitorQMP *qmp_mon; ++ GString *json; + + trace_monitor_protocol_event_emit(event, qdict); + QTAILQ_FOREACH(mon, &mon_list, entry) { +@@ -328,6 +330,13 @@ static void monitor_qapi_event_emit(QAPIEvent event, QDict *qdict) + qmp_mon = container_of(mon, MonitorQMP, common); + if (qmp_mon->commands != &qmp_cap_negotiation_commands) { + qmp_send_response(qmp_mon, qdict); ++ json = qobject_to_json(QOBJECT(qdict)); ++ if (json) { ++ if (!strstr(json->str, "RTC_CHANGE")) { ++ qemu_log("%s\n", json->str); ++ } ++ g_string_free(json, true); ++ } + } + } + } +diff --git a/qapi/qmp-dispatch.c b/qapi/qmp-dispatch.c +index d378bccac7..bb005594d3 100644 +--- a/qapi/qmp-dispatch.c ++++ b/qapi/qmp-dispatch.c +@@ -25,6 +25,7 @@ + #include "qapi/qmp/qbool.h" + #include "qemu/coroutine.h" + #include "qemu/main-loop.h" ++#include "qemu/log.h" + + Visitor *qobject_input_visitor_new_qmp(QObject *obj) 
+ { +@@ -147,6 +148,7 @@ QDict *qmp_dispatch(const QmpCommandList *cmds, QObject *request, + QObject *id; + QObject *ret = NULL; + QDict *rsp = NULL; ++ GString *json; + + dict = qobject_to(QDict, request); + if (!dict) { +@@ -204,6 +206,19 @@ QDict *qmp_dispatch(const QmpCommandList *cmds, QObject *request, + qobject_ref(args); + } + ++ json = qobject_to_json(QOBJECT(args)); ++ if (json) { ++ if ((strcmp(command, "query-block-jobs") != 0) ++ && (strcmp(command, "query-migrate") != 0) ++ && (strcmp(command, "query-blockstats") != 0) ++ && (strcmp(command, "query-balloon") != 0) ++ && (strcmp(command, "set_password") != 0)) { ++ qemu_log("qmp_cmd_name: %s, arguments: %s\n", ++ command, json->str); ++ } ++ g_string_free(json, true); ++ } ++ + assert(!(oob && qemu_in_coroutine())); + assert(monitor_cur() == NULL); + if (!!(cmd->options & QCO_COROUTINE) == qemu_in_coroutine()) { +diff --git a/softmmu/qdev-monitor.c b/softmmu/qdev-monitor.c +index 01f3834db5..dfd6429bf3 100644 +--- a/softmmu/qdev-monitor.c ++++ b/softmmu/qdev-monitor.c +@@ -36,6 +36,7 @@ + #include "qemu/option.h" + #include "qemu/qemu-print.h" + #include "qemu/option_int.h" ++#include "qemu/log.h" + #include "sysemu/block-backend.h" + #include "migration/misc.h" + #include "migration/migration.h" +@@ -635,6 +636,7 @@ DeviceState *qdev_device_add_from_qdict(const QDict *opts, + if (path != NULL) { + bus = qbus_find(path, errp); + if (!bus) { ++ error_setg(errp, "can not find bus for %s", driver); + return NULL; + } + if (!object_dynamic_cast(OBJECT(bus), dc->bus_type)) { +@@ -707,7 +709,7 @@ DeviceState *qdev_device_add_from_qdict(const QDict *opts, + if (*errp) { + goto err_del_dev; + } +- ++ qemu_log("add qdev %s:%s success\n", driver, dev->id ? 
dev->id : "none"); + if (!qdev_realize(DEVICE(dev), bus, errp)) { + goto err_del_dev; + } +-- +2.27.0 + -- Gitee From 842953f34a3296fc4e12b5319e980ee2e806469a Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Thu, 29 Jul 2021 15:24:48 +0800 Subject: [PATCH 04/56] qdev/monitors: Fix reundant error_setg of qdev_add_device There is an extra log "error_setg" in qdev_add_device(). When hot-plug a device, if the corresponding bus doesn't exist, it will trigger an assertion "assert(*errp == NULL)". Fixes: 515a7970490 (log: Add some logs on VM runtime path) Signed-off-by: Kunkun Jiang Signed-off-by: Yan Wang --- ...x-reundant-error_setg-of-qdev_add_de.patch | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 qdev-monitors-Fix-reundant-error_setg-of-qdev_add_de.patch diff --git a/qdev-monitors-Fix-reundant-error_setg-of-qdev_add_de.patch b/qdev-monitors-Fix-reundant-error_setg-of-qdev_add_de.patch new file mode 100644 index 00000000..e02dbf6f --- /dev/null +++ b/qdev-monitors-Fix-reundant-error_setg-of-qdev_add_de.patch @@ -0,0 +1,31 @@ +From ada323e932c83271184a6ddba1cfd74a29378963 Mon Sep 17 00:00:00 2001 +From: Kunkun Jiang +Date: Thu, 29 Jul 2021 15:24:48 +0800 +Subject: [PATCH] qdev/monitors: Fix reundant error_setg of qdev_add_device + +There is an extra log "error_setg" in qdev_add_device(). When +hot-plug a device, if the corresponding bus doesn't exist, it +will trigger an assertion "assert(*errp == NULL)". 
+ +Fixes: 515a7970490 (log: Add some logs on VM runtime path) +Signed-off-by: Kunkun Jiang +Signed-off-by: Yan Wang +--- + softmmu/qdev-monitor.c | 1 - + 1 file changed, 1 deletion(-) + +diff --git a/softmmu/qdev-monitor.c b/softmmu/qdev-monitor.c +index dfd6429bf3..4a20f5dbd7 100644 +--- a/softmmu/qdev-monitor.c ++++ b/softmmu/qdev-monitor.c +@@ -636,7 +636,6 @@ DeviceState *qdev_device_add_from_qdict(const QDict *opts, + if (path != NULL) { + bus = qbus_find(path, errp); + if (!bus) { +- error_setg(errp, "can not find bus for %s", driver); + return NULL; + } + if (!object_dynamic_cast(OBJECT(bus), dc->bus_type)) { +-- +2.27.0 + -- Gitee From 7a47a3cbb89fbfe6d7ff0be852182e1a939a3e4b Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Thu, 10 Feb 2022 17:08:08 +0800 Subject: [PATCH 05/56] bios-tables-test: Allow changes to q35/SSDT.dimmpxm file List test/data/acpi/q35/SSDT.dimmpxm as the expected files allowed to be changed in tests/qtest/bios-tables-test-allowed-diff.h Signed-off-by: Yan Wang --- ...-Allow-changes-to-q35-SSDT.dimmpxm-f.patch | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 bios-tables-test-Allow-changes-to-q35-SSDT.dimmpxm-f.patch diff --git a/bios-tables-test-Allow-changes-to-q35-SSDT.dimmpxm-f.patch b/bios-tables-test-Allow-changes-to-q35-SSDT.dimmpxm-f.patch new file mode 100644 index 00000000..8228abcd --- /dev/null +++ b/bios-tables-test-Allow-changes-to-q35-SSDT.dimmpxm-f.patch @@ -0,0 +1,23 @@ +From 00c4115a1388ee72295b99fce1f6ad49bf761134 Mon Sep 17 00:00:00 2001 +From: Yan Wang +Date: Thu, 10 Feb 2022 17:08:08 +0800 +Subject: [PATCH] bios-tables-test: Allow changes to q35/SSDT.dimmpxm file + +List test/data/acpi/q35/SSDT.dimmpxm as the expected files allowed to +be changed in tests/qtest/bios-tables-test-allowed-diff.h + +Signed-off-by: Yan Wang +--- + tests/qtest/bios-tables-test-allowed-diff.h | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/tests/qtest/bios-tables-test-allowed-diff.h 
b/tests/qtest/bios-tables-test-allowed-diff.h +index dfb8523c8b..81148a604f 100644 +--- a/tests/qtest/bios-tables-test-allowed-diff.h ++++ b/tests/qtest/bios-tables-test-allowed-diff.h +@@ -1 +1,2 @@ + /* List of comma-separated changed AML files to ignore */ ++"tests/data/acpi/q35/SSDT.dimmpxm", +-- +2.27.0 + -- Gitee From 43d1eda5508a8d26662f6797d15b7e0077818c17 Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Tue, 14 Apr 2020 14:53:44 +0800 Subject: [PATCH 06/56] smbios: Add missing member of type 4 for smbios 3.0 According to smbios 3.0 spec, for processor information (type 4), it adds three new members (Core Count 2, Core enabled 2, thread count 2) for 3.0. Without these three members, we cannot get the correct CPU frequency from DMI, because the length check of Processor Information in DMI will fail. The corresponding code in the kernel looks like: if (dm->type == DMI_ENTRY_PROCESSOR && dm->length >= DMI_ENTRY_PROCESSOR_MIN_LENGTH) { u16 val = (u16)get_unaligned((const u16 *) (dmi_data + DMI_PROCESSOR_MAX_SPEED)); *mhz = val > *mhz ? 
val : *mhz; } Signed-off-by: zhanghailiang Signed-off-by: Yan Wang --- ...sing-member-of-type-4-for-smbios-3.0.patch | 56 +++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 smbios-Add-missing-member-of-type-4-for-smbios-3.0.patch diff --git a/smbios-Add-missing-member-of-type-4-for-smbios-3.0.patch b/smbios-Add-missing-member-of-type-4-for-smbios-3.0.patch new file mode 100644 index 00000000..f1f9e91b --- /dev/null +++ b/smbios-Add-missing-member-of-type-4-for-smbios-3.0.patch @@ -0,0 +1,56 @@ +From 937e22eda2480a64095928ee8df0d37b3313bb64 Mon Sep 17 00:00:00 2001 +From: Ying Fang +Date: Tue, 14 Apr 2020 14:53:44 +0800 +Subject: [PATCH] smbios: Add missing member of type 4 for smbios 3.0 + +According to smbios 3.0 spec, for processor information (type 4), +it adds three new members (Core Count 2, Core enabled 2, thread count 2) for 3.0. Without these three members, we cannot get the correct CPU frequency from DMI, +because the length check of Processor Information in DMI will fail. + +The corresponding code in the kernel looks like: + if (dm->type == DMI_ENTRY_PROCESSOR && + dm->length >= DMI_ENTRY_PROCESSOR_MIN_LENGTH) { + u16 val = (u16)get_unaligned((const u16 *) + (dmi_data + DMI_PROCESSOR_MAX_SPEED)); + *mhz = val > *mhz ? 
val : *mhz; + } + +Signed-off-by: zhanghailiang +Signed-off-by: Yan Wang +--- + hw/smbios/smbios.c | 4 +++- + include/hw/firmware/smbios.h | 3 +++ + 2 files changed, 6 insertions(+), 1 deletion(-) + +diff --git a/hw/smbios/smbios.c b/hw/smbios/smbios.c +index 7397e56737..66be9aee09 100644 +--- a/hw/smbios/smbios.c ++++ b/hw/smbios/smbios.c +@@ -688,7 +688,9 @@ static void smbios_build_type_4_table(MachineState *ms, unsigned instance) + t->thread_count = ms->smp.threads; + t->processor_characteristics = cpu_to_le16(0x02); /* Unknown */ + t->processor_family2 = cpu_to_le16(0x01); /* Other */ +- ++ t->corecount2 = 0; ++ t->enabledcorecount2 = 0; ++ t->threadcount2 = 0; + SMBIOS_BUILD_TABLE_POST; + smbios_type4_count++; + } +diff --git a/include/hw/firmware/smbios.h b/include/hw/firmware/smbios.h +index 5a0dd0c8cf..5a696cf75a 100644 +--- a/include/hw/firmware/smbios.h ++++ b/include/hw/firmware/smbios.h +@@ -193,6 +193,9 @@ struct smbios_type_4 { + uint8_t thread_count; + uint16_t processor_characteristics; + uint16_t processor_family2; ++ uint16_t corecount2; ++ uint16_t enabledcorecount2; ++ uint16_t threadcount2; + } QEMU_PACKED; + + /* SMBIOS type 11 - OEM strings */ +-- +2.27.0 + -- Gitee From 968cf98bc4c42edcd85e31d54ba0c9948412dd45 Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Thu, 10 Feb 2022 17:12:35 +0800 Subject: [PATCH 07/56] bios-tables-test: Update expected q35/SSDT.dimmpxm file Run ./tests/data/acpi/rebuild-expected-aml.sh from build directory to update q35/SSDT.dimmpxm file. Also empty bios-tables-test-allowed-diff.h. 
The disassembled differences between actual and expected SSDT.dimmpxm: /* * Intel ACPI Component Architecture * AML/ASL+ Disassembler version 20210604 (64-bit version) * Copyright (c) 2000 - 2021 Intel Corporation * * Disassembling to symbolic ASL+ operators * - * Disassembly of tests/data/acpi/q35/SSDT.dimmpxm, Thu Feb 10 15:03:52 2022 + * Disassembly of /tmp/aml-CK68G1, Thu Feb 10 15:03:52 2022 * * Original Table Header: * Signature "SSDT" * Length 0x000002DE (734) * Revision 0x01 - * Checksum 0x06 + * Checksum 0x16 * OEM ID "BOCHS " * OEM Table ID "NVDIMM " * OEM Revision 0x00000001 (1) * Compiler ID "BXPC" * Compiler Version 0x00000001 (1) */ DefinitionBlock ("", "SSDT", 1, "BOCHS ", "NVDIMM ", 0x00000001) { Scope (\_SB) { Device (NVDR) { Name (_HID, "ACPI0012" /* NVDIMM Root Device */) // _HID: Hardware ID Method (NCAL, 5, Serialized) { Local6 = MEMA /* \MEMA */ @@ -187,19 +187,19 @@ { Return (NCAL (Arg0, Arg1, Arg2, Arg3, 0x02)) } } Device (NV02) { Name (_ADR, 0x03) // _ADR: Address Method (_DSM, 4, NotSerialized) // _DSM: Device-Specific Method { Return (NCAL (Arg0, Arg1, Arg2, Arg3, 0x03)) } } } } - Name (MEMA, 0x07FFF000) + Name (MEMA, 0x07FFE000) } Signed-off-by: Yan Wang --- ...-Update-expected-q35-SSDT.dimmpxm-fi.patch | 88 +++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 bios-tables-test-Update-expected-q35-SSDT.dimmpxm-fi.patch diff --git a/bios-tables-test-Update-expected-q35-SSDT.dimmpxm-fi.patch b/bios-tables-test-Update-expected-q35-SSDT.dimmpxm-fi.patch new file mode 100644 index 00000000..f41a702b --- /dev/null +++ b/bios-tables-test-Update-expected-q35-SSDT.dimmpxm-fi.patch @@ -0,0 +1,88 @@ +From 8940f11a055da0a744d10b53cf999dea7967be25 Mon Sep 17 00:00:00 2001 +From: Yan Wang +Date: Thu, 10 Feb 2022 17:12:35 +0800 +Subject: [PATCH] bios-tables-test: Update expected q35/SSDT.dimmpxm file + +Run ./tests/data/acpi/rebuild-expected-aml.sh from build directory +to update q35/SSDT.dimmpxm file. 
Also empty bios-tables-test-allowed-diff.h. + +The disassembled differences between actual and expected SSDT.dimmpxm: + + /* + * Intel ACPI Component Architecture + * AML/ASL+ Disassembler version 20210604 (64-bit version) + * Copyright (c) 2000 - 2021 Intel Corporation + * + * Disassembling to symbolic ASL+ operators + * +- * Disassembly of tests/data/acpi/q35/SSDT.dimmpxm, Thu Feb 10 15:03:52 2022 ++ * Disassembly of /tmp/aml-CK68G1, Thu Feb 10 15:03:52 2022 + * + * Original Table Header: + * Signature "SSDT" + * Length 0x000002DE (734) + * Revision 0x01 +- * Checksum 0x06 ++ * Checksum 0x16 + * OEM ID "BOCHS " + * OEM Table ID "NVDIMM " + * OEM Revision 0x00000001 (1) + * Compiler ID "BXPC" + * Compiler Version 0x00000001 (1) + */ + DefinitionBlock ("", "SSDT", 1, "BOCHS ", "NVDIMM ", 0x00000001) + { + Scope (\_SB) + { + Device (NVDR) + { + Name (_HID, "ACPI0012" /* NVDIMM Root Device */) // _HID: Hardware ID + Method (NCAL, 5, Serialized) + { + Local6 = MEMA /* \MEMA */ +@@ -187,19 +187,19 @@ + { + Return (NCAL (Arg0, Arg1, Arg2, Arg3, 0x02)) + } + } + + Device (NV02) + { + Name (_ADR, 0x03) // _ADR: Address + Method (_DSM, 4, NotSerialized) // _DSM: Device-Specific Method + { + Return (NCAL (Arg0, Arg1, Arg2, Arg3, 0x03)) + } + } + } + } + +- Name (MEMA, 0x07FFF000) ++ Name (MEMA, 0x07FFE000) + } + +Signed-off-by: Yan Wang +--- + tests/data/acpi/q35/SSDT.dimmpxm | Bin 734 -> 734 bytes + tests/qtest/bios-tables-test-allowed-diff.h | 1 - + 2 files changed, 1 deletion(-) + +diff --git a/tests/data/acpi/q35/SSDT.dimmpxm b/tests/data/acpi/q35/SSDT.dimmpxm +index 617a1c911c7d6753bcedc8ecc52e3027a5259ad6..a50a961fa1d9b0dd8ea4096d652c83bcf04db20b 100644 +GIT binary patch +delta 23 +fcmcb|dXJSWIM^lR9uortqu55Si%iT9{<8xBSkVW4 + +delta 23 +fcmcb|dXJSWIM^lR9uortBilx Date: Sat, 18 Dec 2021 09:39:57 +0800 Subject: [PATCH 08/56] net: eepro100: validate various address valuesi(CVE-2021-20255) fix CVE-2021-20255 patch link: 
https://lists.gnu.org/archive/html/qemu-devel/2021-02/msg06098.html fix CVE-2021-20255, sync patch from ostms platform. Signed-off-by: zhouli57 Signed-off-by: Yan Wang --- ...idate-various-address-valuesi-CVE-20.patch | 58 +++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 net-eepro100-validate-various-address-valuesi-CVE-20.patch diff --git a/net-eepro100-validate-various-address-valuesi-CVE-20.patch b/net-eepro100-validate-various-address-valuesi-CVE-20.patch new file mode 100644 index 00000000..47095713 --- /dev/null +++ b/net-eepro100-validate-various-address-valuesi-CVE-20.patch @@ -0,0 +1,58 @@ +From 5db012b1116d21c64da88ad206b3589ddf5f219b Mon Sep 17 00:00:00 2001 +From: zhouli57 +Date: Sat, 18 Dec 2021 09:39:57 +0800 +Subject: [PATCH] net: eepro100: validate various address + valuesi(CVE-2021-20255) + +fix CVE-2021-20255 + +patch link: https://lists.gnu.org/archive/html/qemu-devel/2021-02/msg06098.html + +fix CVE-2021-20255, sync patch from ostms platform. + +Signed-off-by: zhouli57 +Signed-off-by: Yan Wang +--- + hw/net/eepro100.c | 12 ++++++++++++ + 1 file changed, 12 insertions(+) + +diff --git a/hw/net/eepro100.c b/hw/net/eepro100.c +index 16e95ef9cc..2474cf3dc2 100644 +--- a/hw/net/eepro100.c ++++ b/hw/net/eepro100.c +@@ -279,6 +279,9 @@ typedef struct { + /* Quasi static device properties (no need to save them). */ + uint16_t stats_size; + bool has_extended_tcb_support; ++ ++ /* Flag to avoid recursions. */ ++ bool busy; + } EEPRO100State; + + /* Word indices in EEPROM. */ +@@ -837,6 +840,14 @@ static void action_command(EEPRO100State *s) + Therefore we limit the number of iterations. */ + unsigned max_loop_count = 16; + ++ if (s->busy) { ++ /* Prevent recursions. 
*/ ++ logout("recursion in %s:%u\n", __FILE__, __LINE__); ++ return; ++ } ++ ++ s->busy = true; ++ + for (;;) { + bool bit_el; + bool bit_s; +@@ -933,6 +944,7 @@ static void action_command(EEPRO100State *s) + } + TRACE(OTHER, logout("CU list empty\n")); + /* List is empty. Now CU is idle or suspended. */ ++ s->busy = false; + } + + static void eepro100_cu_command(EEPRO100State * s, uint8_t val) +-- +2.27.0 + -- Gitee From c99f289f71ab05b282e308e22977f02e912f59cc Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Wed, 14 Oct 2020 15:00:20 +0800 Subject: [PATCH 09/56] pci: check bus pointer before dereference fix CVE-2020-25742 patch link: https://lists.nongnu.org/archive/html/qemu-devel/2020-09/msg05294.html While mapping IRQ level in pci_change_irq_level() routine, it does not check if pci_get_bus() returned a valid pointer. It may lead to a NULL pointer dereference issue. Add check to avoid it. -> https://ruhr-uni-bochum.sciebo.de/s/NNWP2GfwzYKeKwE?path=%2Flsi_nullptr1 ==1183858==Hint: address points to the zero page. #0 pci_change_irq_level hw/pci/pci.c:259 #1 pci_irq_handler hw/pci/pci.c:1445 #2 pci_set_irq hw/pci/pci.c:1463 #3 lsi_set_irq hw/scsi/lsi53c895a.c:488 #4 lsi_update_irq hw/scsi/lsi53c895a.c:523 #5 lsi_script_scsi_interrupt hw/scsi/lsi53c895a.c:554 #6 lsi_execute_script hw/scsi/lsi53c895a.c:1149 #7 lsi_reg_writeb hw/scsi/lsi53c895a.c:1984 #8 lsi_io_write hw/scsi/lsi53c895a.c:2146 ... 
Reported-by: Ruhr-University Signed-off-by: Prasad J Pandit Signed-off-by: Yan Wang --- ...check-bus-pointer-before-dereference.patch | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 pci-check-bus-pointer-before-dereference.patch diff --git a/pci-check-bus-pointer-before-dereference.patch b/pci-check-bus-pointer-before-dereference.patch new file mode 100644 index 00000000..fbd30a32 --- /dev/null +++ b/pci-check-bus-pointer-before-dereference.patch @@ -0,0 +1,51 @@ +From 92da19fb18c234bb8872b9d8f7dedcc73e5fcafb Mon Sep 17 00:00:00 2001 +From: Prasad J Pandit +Date: Wed, 14 Oct 2020 15:00:20 +0800 +Subject: [PATCH] pci: check bus pointer before dereference + +fix CVE-2020-25742 + +patch link: https://lists.nongnu.org/archive/html/qemu-devel/2020-09/msg05294.html + +While mapping IRQ level in pci_change_irq_level() routine, +it does not check if pci_get_bus() returned a valid pointer. +It may lead to a NULL pointer dereference issue. Add check to +avoid it. + + -> https://ruhr-uni-bochum.sciebo.de/s/NNWP2GfwzYKeKwE?path=%2Flsi_nullptr1 + ==1183858==Hint: address points to the zero page. + #0 pci_change_irq_level hw/pci/pci.c:259 + #1 pci_irq_handler hw/pci/pci.c:1445 + #2 pci_set_irq hw/pci/pci.c:1463 + #3 lsi_set_irq hw/scsi/lsi53c895a.c:488 + #4 lsi_update_irq hw/scsi/lsi53c895a.c:523 + #5 lsi_script_scsi_interrupt hw/scsi/lsi53c895a.c:554 + #6 lsi_execute_script hw/scsi/lsi53c895a.c:1149 + #7 lsi_reg_writeb hw/scsi/lsi53c895a.c:1984 + #8 lsi_io_write hw/scsi/lsi53c895a.c:2146 + ... 
+ +Reported-by: Ruhr-University +Signed-off-by: Prasad J Pandit +Signed-off-by: Yan Wang +--- + hw/pci/pci.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/hw/pci/pci.c b/hw/pci/pci.c +index e5993c1ef5..6d1c39a9de 100644 +--- a/hw/pci/pci.c ++++ b/hw/pci/pci.c +@@ -270,6 +270,9 @@ static void pci_change_irq_level(PCIDevice *pci_dev, int irq_num, int change) + PCIBus *bus; + for (;;) { + bus = pci_get_bus(pci_dev); ++ if (!bus) { ++ return; ++ } + irq_num = bus->map_irq(pci_dev, irq_num); + if (bus->set_irq) + break; +-- +2.27.0 + -- Gitee From d1171bc32c2ba5530a4b5f2b0694204367a9a714 Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Mon, 21 Jun 2021 09:22:35 +0800 Subject: [PATCH 10/56] ide: ahci: add check to avoid null dereference (CVE-2019-12067) Fix CVE-2019-12067 AHCI emulator while committing DMA buffer in ahci_commit_buf() may do a NULL dereference if the command header 'ad->cur_cmd' is null. Add check to avoid it. Reported-by: Bugs SysSec Signed-off-by: Prasad J Pandit Signed-off-by: Jiajie Li Signed-off-by: Yan Wang --- ...ck-to-avoid-null-dereference-CVE-201.patch | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 ide-ahci-add-check-to-avoid-null-dereference-CVE-201.patch diff --git a/ide-ahci-add-check-to-avoid-null-dereference-CVE-201.patch b/ide-ahci-add-check-to-avoid-null-dereference-CVE-201.patch new file mode 100644 index 00000000..7cd2ccff --- /dev/null +++ b/ide-ahci-add-check-to-avoid-null-dereference-CVE-201.patch @@ -0,0 +1,41 @@ +From 9169beed83ea77059a7240aae5621dcfb3178cba Mon Sep 17 00:00:00 2001 +From: Prasad J Pandit +Date: Mon, 21 Jun 2021 09:22:35 +0800 +Subject: [PATCH] ide: ahci: add check to avoid null dereference + (CVE-2019-12067) + +Fix CVE-2019-12067 + +AHCI emulator while committing DMA buffer in ahci_commit_buf() +may do a NULL dereference if the command header 'ad->cur_cmd' +is null. Add check to avoid it. 
+ +Reported-by: Bugs SysSec +Signed-off-by: Prasad J Pandit + +Signed-off-by: Jiajie Li +Signed-off-by: Yan Wang +--- + hw/ide/ahci.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/hw/ide/ahci.c b/hw/ide/ahci.c +index a94c6e26fb..256b58026a 100644 +--- a/hw/ide/ahci.c ++++ b/hw/ide/ahci.c +@@ -1459,8 +1459,10 @@ static void ahci_commit_buf(const IDEDMA *dma, uint32_t tx_bytes) + { + AHCIDevice *ad = DO_UPCAST(AHCIDevice, dma, dma); + +- tx_bytes += le32_to_cpu(ad->cur_cmd->status); +- ad->cur_cmd->status = cpu_to_le32(tx_bytes); ++ if (ad->cur_cmd) { ++ tx_bytes += le32_to_cpu(ad->cur_cmd->status); ++ ad->cur_cmd->status = cpu_to_le32(tx_bytes); ++ } + } + + static int ahci_dma_rw_buf(const IDEDMA *dma, bool is_write) +-- +2.27.0 + -- Gitee From 868171e75a29b5e7fbf0323d358d8b1d8e679a55 Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Thu, 10 Feb 2022 10:28:59 +0800 Subject: [PATCH 11/56] tap: return err when tap TUNGETIFF fail When hotplug ovs kernel netcard, even tap TUNGETIFF failed, the hotplug would go on and would lead to qemu assert. The failure should lead to the free_fail. Signed-off-by: miaoyubo Signed-off-by: Yan Wang --- tap-return-err-when-tap-TUNGETIFF-fail.patch | 30 ++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 tap-return-err-when-tap-TUNGETIFF-fail.patch diff --git a/tap-return-err-when-tap-TUNGETIFF-fail.patch b/tap-return-err-when-tap-TUNGETIFF-fail.patch new file mode 100644 index 00000000..f74fa19a --- /dev/null +++ b/tap-return-err-when-tap-TUNGETIFF-fail.patch @@ -0,0 +1,30 @@ +From 48a38f409a25f26605d65346c8ed9403c4b36c80 Mon Sep 17 00:00:00 2001 +From: Yan Wang +Date: Thu, 10 Feb 2022 10:28:59 +0800 +Subject: [PATCH] tap: return err when tap TUNGETIFF fail + +When hotplug ovs kernel netcard, even tap TUNGETIFF failed, +the hotplug would go on and would lead to qemu assert. +The failure should lead to the free_fail. 
+ +Signed-off-by: miaoyubo +Signed-off-by: Yan Wang +--- + net/tap.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/net/tap.c b/net/tap.c +index f716be3e3f..c5cbeaa7a2 100644 +--- a/net/tap.c ++++ b/net/tap.c +@@ -900,6 +900,7 @@ int net_init_tap(const Netdev *netdev, const char *name, + if (i == 0) { + vnet_hdr = tap_probe_vnet_hdr(fd, errp); + if (vnet_hdr < 0) { ++ ret = -1; + goto free_fail; + } + } else if (vnet_hdr != tap_probe_vnet_hdr(fd, NULL)) { +-- +2.27.0 + -- Gitee From b43b387858421463212b0746314d223950736b78 Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Thu, 10 Feb 2022 10:41:40 +0800 Subject: [PATCH 12/56] xhci: check reg to avoid OOB read Add a sanity check to fix OOB read access. Signed-off-by: Yan Wang --- xhci-check-reg-to-avoid-OOB-read.patch | 47 ++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 xhci-check-reg-to-avoid-OOB-read.patch diff --git a/xhci-check-reg-to-avoid-OOB-read.patch b/xhci-check-reg-to-avoid-OOB-read.patch new file mode 100644 index 00000000..f6a4c356 --- /dev/null +++ b/xhci-check-reg-to-avoid-OOB-read.patch @@ -0,0 +1,47 @@ +From a95ada20170af0a71529c1583846e402cdbb850b Mon Sep 17 00:00:00 2001 +From: Yan Wang +Date: Thu, 10 Feb 2022 10:41:40 +0800 +Subject: [PATCH] xhci: check reg to avoid OOB read + +Add a sanity check to fix OOB read access. 
+ +Signed-off-by: Yan Wang +--- + hw/usb/hcd-xhci.c | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +diff --git a/hw/usb/hcd-xhci.c b/hw/usb/hcd-xhci.c +index e01700039b..08cd63e159 100644 +--- a/hw/usb/hcd-xhci.c ++++ b/hw/usb/hcd-xhci.c +@@ -27,6 +27,7 @@ + #include "hw/qdev-properties.h" + #include "trace.h" + #include "qapi/error.h" ++#include "qemu/log.h" + + #include "hcd-xhci.h" + +@@ -3017,14 +3018,17 @@ static void xhci_runtime_write(void *ptr, hwaddr reg, + XHCIInterrupter *intr; + int v; + +- trace_usb_xhci_runtime_write(reg, val); +- + if (reg < 0x20) { + trace_usb_xhci_unimplemented("runtime write", reg); + return; + } + v = (reg - 0x20) / 0x20; ++ if (v >= xhci->numintrs) { ++ qemu_log("intr nr out of range (%d >= %d)\n", v, xhci->numintrs); ++ return; ++ } + intr = &xhci->intr[v]; ++ trace_usb_xhci_runtime_write(reg, val); + + switch (reg & 0x1f) { + case 0x00: /* IMAN */ +-- +2.27.0 + -- Gitee From bcd9bb0ecaf6cc4d2f1cade816377e2b61169906 Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Thu, 10 Feb 2022 11:18:13 +0800 Subject: [PATCH 13/56] monitor: Discard BLOCK_IO_ERROR event when VM rebooted Throttled event like QAPI_EVENT_BLOCK_IO_ERROR may be queued to limit event rate. Event may be delivered when VM is rebooted if the event was queued in the *monitor_qapi_event_state* hash table. Which may cause VM pause and other related problems. 
Such as seabios blocked during virtio-scsi initialization: vring_add_buf(vq, sg, out_num, in_num, 0, 0); vring_kick(vp, vq, 1); ------------> VM paused here <----------- /* Wait for reply */ while (!vring_more_used(vq)) usleep(5); Signed-off-by: Yan Wang --- ...BLOCK_IO_ERROR-event-when-VM-reboote.patch | 97 +++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 monitor-Discard-BLOCK_IO_ERROR-event-when-VM-reboote.patch diff --git a/monitor-Discard-BLOCK_IO_ERROR-event-when-VM-reboote.patch b/monitor-Discard-BLOCK_IO_ERROR-event-when-VM-reboote.patch new file mode 100644 index 00000000..13d16d03 --- /dev/null +++ b/monitor-Discard-BLOCK_IO_ERROR-event-when-VM-reboote.patch @@ -0,0 +1,97 @@ +From f5af9ac3c9af4602812060759f6f95da8725314b Mon Sep 17 00:00:00 2001 +From: Yan Wang +Date: Thu, 10 Feb 2022 11:18:13 +0800 +Subject: [PATCH] monitor: Discard BLOCK_IO_ERROR event when VM rebooted + +Throttled event like QAPI_EVENT_BLOCK_IO_ERROR may be queued +to limit event rate. Event may be delivered when VM is rebooted +if the event was queued in the *monitor_qapi_event_state* hash table. +Which may cause VM pause and other related problems. 
+Such as seabios blocked during virtio-scsi initialization: + vring_add_buf(vq, sg, out_num, in_num, 0, 0); + vring_kick(vp, vq, 1); + ------------> VM paused here <----------- + /* Wait for reply */ + while (!vring_more_used(vq)) usleep(5); + +Signed-off-by: Yan Wang +--- + include/monitor/monitor.h | 2 ++ + monitor/monitor.c | 30 ++++++++++++++++++++++++++++++ + softmmu/runstate.c | 1 + + 3 files changed, 33 insertions(+) + +diff --git a/include/monitor/monitor.h b/include/monitor/monitor.h +index 12d395d62d..847445f972 100644 +--- a/include/monitor/monitor.h ++++ b/include/monitor/monitor.h +@@ -56,4 +56,6 @@ void monitor_register_hmp(const char *name, bool info, + void monitor_register_hmp_info_hrt(const char *name, + HumanReadableText *(*handler)(Error **errp)); + ++void monitor_qapi_event_discard_io_error(void); ++ + #endif /* MONITOR_H */ +diff --git a/monitor/monitor.c b/monitor/monitor.c +index 013c628695..fb4ae9531c 100644 +--- a/monitor/monitor.c ++++ b/monitor/monitor.c +@@ -34,6 +34,9 @@ + #include "qemu/option.h" + #include "sysemu/qtest.h" + #include "trace.h" ++#include "qemu/log.h" ++#include "qapi/qmp/qjson.h" ++#include "qapi/qmp/qobject.h" + + /* + * To prevent flooding clients, events can be throttled. 
The +@@ -767,6 +770,33 @@ int monitor_init_opts(QemuOpts *opts, Error **errp) + return ret; + } + ++void monitor_qapi_event_discard_io_error(void) ++{ ++ GHashTableIter event_iter; ++ MonitorQAPIEventState *evstate; ++ gpointer key, value; ++ GString *json; ++ ++ qemu_mutex_lock(&monitor_lock); ++ g_hash_table_iter_init(&event_iter, monitor_qapi_event_state); ++ while (g_hash_table_iter_next(&event_iter, &key, &value)) { ++ evstate = key; ++ /* Only QAPI_EVENT_BLOCK_IO_ERROR is discarded */ ++ if (evstate->event == QAPI_EVENT_BLOCK_IO_ERROR) { ++ g_hash_table_iter_remove(&event_iter); ++ json = qobject_to_json(QOBJECT(evstate->qdict)); ++ qemu_log(" %s event discarded\n", json->str); ++ timer_del(evstate->timer); ++ timer_free(evstate->timer); ++ qobject_unref(evstate->data); ++ qobject_unref(evstate->qdict); ++ g_string_free(json, true); ++ g_free(evstate); ++ } ++ } ++ qemu_mutex_unlock(&monitor_lock); ++} ++ + QemuOptsList qemu_mon_opts = { + .name = "mon", + .implied_opt_name = "chardev", +diff --git a/softmmu/runstate.c b/softmmu/runstate.c +index 10d9b7365a..5736d908db 100644 +--- a/softmmu/runstate.c ++++ b/softmmu/runstate.c +@@ -448,6 +448,7 @@ void qemu_system_reset(ShutdownCause reason) + qapi_event_send_reset(shutdown_caused_by_guest(reason), reason); + } + cpu_synchronize_all_post_reset(); ++ monitor_qapi_event_discard_io_error(); + } + + /* +-- +2.27.0 + -- Gitee From 561aa0e1504da8adc0ec244c10fe08d29cf142ed Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Thu, 10 Feb 2022 11:35:58 +0800 Subject: [PATCH 14/56] monitor: limit io error qmp event to at most once per 60s The speed of BLOCK IO ERROR event may be very high (thousands per second). If we report all BLOCK IO ERRORs, the log file will be flooded with BLOCK IO ERROR event. So throttle it to at most once per 60s. 
Signed-off-by: Yan Wang --- ...-error-qmp-event-to-at-most-once-per.patch | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 monitor-limit-io-error-qmp-event-to-at-most-once-per.patch diff --git a/monitor-limit-io-error-qmp-event-to-at-most-once-per.patch b/monitor-limit-io-error-qmp-event-to-at-most-once-per.patch new file mode 100644 index 00000000..2b3b02f8 --- /dev/null +++ b/monitor-limit-io-error-qmp-event-to-at-most-once-per.patch @@ -0,0 +1,29 @@ +From 44f45b5c163efed5387dac40e229e0a50bf5921a Mon Sep 17 00:00:00 2001 +From: Yan Wang +Date: Thu, 10 Feb 2022 11:35:58 +0800 +Subject: [PATCH] monitor: limit io error qmp event to at most once per 60s + +The speed of BLOCK IO ERROR event may be very high (thousands per +second). If we report all BLOCK IO ERRORs, the log file will be flooded +with BLOCK IO ERROR event. So throttle it to at most once per 60s. + +Signed-off-by: Yan Wang +--- + monitor/monitor.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/monitor/monitor.c b/monitor/monitor.c +index fb4ae9531c..621e79eb66 100644 +--- a/monitor/monitor.c ++++ b/monitor/monitor.c +@@ -300,6 +300,7 @@ static MonitorQAPIEventConf monitor_qapi_event_conf[QAPI_EVENT__MAX] = { + [QAPI_EVENT_QUORUM_FAILURE] = { 1000 * SCALE_MS }, + [QAPI_EVENT_VSERPORT_CHANGE] = { 1000 * SCALE_MS }, + [QAPI_EVENT_MEMORY_DEVICE_SIZE_CHANGE] = { 1000 * SCALE_MS }, ++ [QAPI_EVENT_BLOCK_IO_ERROR] = { 60L * 1000 * SCALE_MS }, + }; + + /* +-- +2.27.0 + -- Gitee From 1c5495087e15a2c27b5f2361674eb444162f9555 Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Sat, 12 Feb 2022 17:20:24 +0800 Subject: [PATCH 15/56] =?UTF-8?q?spec:=20Update=20patch=20and=20changelog?= =?UTF-8?q?=20with=20!247=20=E3=80=906.2.0=E3=80=91IO=E7=9B=B8=E5=85=B3?= =?UTF-8?q?=E8=A1=A5=E4=B8=81=E5=9B=9E=E5=90=88=20=20!247?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit log: Add some logs on VM runtime path qdev/monitors: Fix reundant error_setg of 
qdev_add_device bios-tables-test: Allow changes to q35/SSDT.dimmpxm file smbios: Add missing member of type 4 for smbios 3.0 bios-tables-test: Update expected q35/SSDT.dimmpxm file net: eepro100: validate various address valuesi(CVE-2021-20255) pci: check bus pointer before dereference ide: ahci: add check to avoid null dereference (CVE-2019-12067) tap: return err when tap TUNGETIFF fail xhci: check reg to avoid OOB read monitor: Discard BLOCK_IO_ERROR event when VM rebooted monitor: limit io error qmp event to at most once per 60s Signed-off-by: Chen Qun --- qemu.spec | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/qemu.spec b/qemu.spec index 325effa2..11efd808 100644 --- a/qemu.spec +++ b/qemu.spec @@ -87,6 +87,18 @@ Patch0074: seabios-increase-the-seabios-minibiostable.patch Patch0075: IPv6-add-support-for-IPv6-protocol.patch Patch0076: Use-post-increment-only-in-inffast.c.patch Patch0077: util-log-add-CONFIG_DISABLE_QEMU_LOG-macro.patch +Patch0078: log-Add-some-logs-on-VM-runtime-path.patch +Patch0079: qdev-monitors-Fix-reundant-error_setg-of-qdev_add_de.patch +Patch0080: bios-tables-test-Allow-changes-to-q35-SSDT.dimmpxm-f.patch +Patch0081: smbios-Add-missing-member-of-type-4-for-smbios-3.0.patch +Patch0082: bios-tables-test-Update-expected-q35-SSDT.dimmpxm-fi.patch +Patch0083: net-eepro100-validate-various-address-valuesi-CVE-20.patch +Patch0084: pci-check-bus-pointer-before-dereference.patch +Patch0085: ide-ahci-add-check-to-avoid-null-dereference-CVE-201.patch +Patch0086: tap-return-err-when-tap-TUNGETIFF-fail.patch +Patch0087: xhci-check-reg-to-avoid-OOB-read.patch +Patch0088: monitor-Discard-BLOCK_IO_ERROR-event-when-VM-reboote.patch +Patch0089: monitor-limit-io-error-qmp-event-to-at-most-once-per.patch BuildRequires: flex BuildRequires: gcc @@ -531,6 +543,20 @@ getent passwd qemu >/dev/null || \ %endif %changelog +* Sat Feb 12 2022 Chen Qun +- log: Add some logs on VM runtime path +- qdev/monitors: Fix reundant error_setg of 
qdev_add_device +- bios-tables-test: Allow changes to q35/SSDT.dimmpxm file +- smbios: Add missing member of type 4 for smbios 3.0 +- bios-tables-test: Update expected q35/SSDT.dimmpxm file +- net: eepro100: validate various address valuesi(CVE-2021-20255) +- pci: check bus pointer before dereference +- ide: ahci: add check to avoid null dereference (CVE-2019-12067) +- tap: return err when tap TUNGETIFF fail +- xhci: check reg to avoid OOB read +- monitor: Discard BLOCK_IO_ERROR event when VM rebooted +- monitor: limit io error qmp event to at most once per 60s + * Sat Feb 12 2022 Chen Qun - util/log: add CONFIG_DISABLE_QEMU_LOG macro -- Gitee From f9a5081e369a224584be76a68c8e3d3cdcb53e96 Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Sat, 8 May 2021 17:31:03 +0800 Subject: [PATCH 16/56] linux-headers: update against 5.10 and manual clear vfio dirty log series The new capability VFIO_DIRTY_LOG_MANUAL_CLEAR and the new ioctl VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR and VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP have been introduced in the kernel, update the header to add them. 
Signed-off-by: Zenghui Yu Signed-off-by: Kunkun Jiang --- ...date-against-5.10-and-manual-clear-v.patch | 82 +++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 linux-headers-update-against-5.10-and-manual-clear-v.patch diff --git a/linux-headers-update-against-5.10-and-manual-clear-v.patch b/linux-headers-update-against-5.10-and-manual-clear-v.patch new file mode 100644 index 00000000..93d30067 --- /dev/null +++ b/linux-headers-update-against-5.10-and-manual-clear-v.patch @@ -0,0 +1,82 @@ +From 40512773625a4f8ddd96a5af924f119b89a14706 Mon Sep 17 00:00:00 2001 +From: Zenghui Yu +Date: Sat, 8 May 2021 17:31:03 +0800 +Subject: [PATCH] linux-headers: update against 5.10 and manual clear vfio + dirty log series + +The new capability VFIO_DIRTY_LOG_MANUAL_CLEAR and the new ioctl +VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR and +VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP have been introduced in +the kernel, update the header to add them. + +Signed-off-by: Zenghui Yu +Signed-off-by: Kunkun Jiang +--- + linux-headers/linux/vfio.h | 36 +++++++++++++++++++++++++++++++++++- + 1 file changed, 35 insertions(+), 1 deletion(-) + +diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h +index e680594f27..f4ff038e8c 100644 +--- a/linux-headers/linux/vfio.h ++++ b/linux-headers/linux/vfio.h +@@ -52,6 +52,16 @@ + /* Supports the vaddr flag for DMA map and unmap */ + #define VFIO_UPDATE_VADDR 10 + ++/* ++ * The vfio_iommu driver may support user clears dirty log manually, which means ++ * dirty log can be requested to not cleared automatically after dirty log is ++ * copied to userspace, it's user's duty to clear dirty log. ++ * ++ * Note: please refer to VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR and ++ * VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP. 
++ */ ++#define VFIO_DIRTY_LOG_MANUAL_CLEAR 11 ++ + /* + * The IOCTL interface is designed for extensibility by embedding the + * structure length (argsz) and flags into structures passed between +@@ -1196,8 +1206,30 @@ struct vfio_iommu_type1_dma_unmap { + * actual bitmap. If dirty pages logging is not enabled, an error will be + * returned. + * +- * Only one of the flags _START, _STOP and _GET may be specified at a time. ++ * The VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR flag is almost same as ++ * VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP, except that it requires underlying ++ * dirty bitmap is not cleared automatically. The user can clear it manually by ++ * calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP flag set. + * ++ * Calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP flag set, ++ * instructs the IOMMU driver to clear the dirty status of pages in a bitmap ++ * for IOMMU container for a given IOVA range. The user must specify the IOVA ++ * range, the bitmap and the pgsize through the structure ++ * vfio_iommu_type1_dirty_bitmap_get in the data[] portion. This interface ++ * supports clearing a bitmap of the smallest supported pgsize only and can be ++ * modified in future to clear a bitmap of any specified supported pgsize. The ++ * user must provide a memory area for the bitmap memory and specify its size ++ * in bitmap.size. One bit is used to represent one page consecutively starting ++ * from iova offset. The user should provide page size in bitmap.pgsize field. ++ * A bit set in the bitmap indicates that the page at that offset from iova is ++ * cleared the dirty status, and dirty tracking is re-enabled for that page. The ++ * caller must set argsz to a value including the size of structure ++ * vfio_iommu_dirty_bitmap_get, but excluing the size of the actual bitmap. If ++ * dirty pages logging is not enabled, an error will be returned. Note: user ++ * should clear dirty log before handle corresponding dirty pages. 
++ * ++ * Only one of the flags _START, _STOP, _GET, _GET_NOCLEAR_, and _CLEAR may be ++ * specified at a time. + */ + struct vfio_iommu_type1_dirty_bitmap { + __u32 argsz; +@@ -1205,6 +1237,8 @@ struct vfio_iommu_type1_dirty_bitmap { + #define VFIO_IOMMU_DIRTY_PAGES_FLAG_START (1 << 0) + #define VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP (1 << 1) + #define VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP (1 << 2) ++#define VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR (1 << 3) ++#define VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP (1 << 4) + __u8 data[]; + }; + +-- +2.27.0 + -- Gitee From 1c45ed55533b36ee653701f792316caa0a4dc2ac Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Sat, 8 May 2021 17:31:04 +0800 Subject: [PATCH 17/56] vfio: Maintain DMA mapping range for the container When synchronizing dirty bitmap from kernel VFIO we do it in a per-iova-range fashion and we allocate the userspace bitmap for each of the ioctl. This patch introduces `struct VFIODMARange` to describe a range of the given DMA mapping with respect to a VFIO_IOMMU_MAP_DMA operation, and make the bitmap cache of this range be persistent so that we don't need to g_try_malloc0() every time. Note that the new structure is almost a copy of `struct vfio_iommu_type1_dma_map` but only internally used by QEMU. More importantly, the cached per-iova-range dirty bitmap will be further used when we want to add support for the CLEAR_BITMAP and this cached bitmap will be used to guarantee we don't clear any unknown dirty bits otherwise that can be a severe data loss issue for migration code. It's pretty intuitive to maintain a bitmap per container since we perform log_sync at this granule. But I don't know how to deal with things like memory hot-{un}plug, sparse DMA mappings, etc. Suggestions welcome. * yet something to-do: - can't work with guest viommu - no locks - etc [ The idea and even the commit message are largely inherited from kvm side. See commit 9f4bf4baa8b820c7930e23c9566c9493db7e1d25. 
] Signed-off-by: Zenghui Yu Signed-off-by: Kunkun Jiang --- ...-DMA-mapping-range-for-the-container.patch | 191 ++++++++++++++++++ 1 file changed, 191 insertions(+) create mode 100644 vfio-Maintain-DMA-mapping-range-for-the-container.patch diff --git a/vfio-Maintain-DMA-mapping-range-for-the-container.patch b/vfio-Maintain-DMA-mapping-range-for-the-container.patch new file mode 100644 index 00000000..ba8abb43 --- /dev/null +++ b/vfio-Maintain-DMA-mapping-range-for-the-container.patch @@ -0,0 +1,191 @@ +From ac1bf3edcd2b807cf81ada500716f13b1394d58e Mon Sep 17 00:00:00 2001 +From: Zenghui Yu +Date: Sat, 8 May 2021 17:31:04 +0800 +Subject: [PATCH] vfio: Maintain DMA mapping range for the container + +When synchronizing dirty bitmap from kernel VFIO we do it in a +per-iova-range fashion and we allocate the userspace bitmap for each of the +ioctl. This patch introduces `struct VFIODMARange` to describe a range of +the given DMA mapping with respect to a VFIO_IOMMU_MAP_DMA operation, and +make the bitmap cache of this range be persistent so that we don't need to +g_try_malloc0() every time. Note that the new structure is almost a copy of +`struct vfio_iommu_type1_dma_map` but only internally used by QEMU. + +More importantly, the cached per-iova-range dirty bitmap will be further +used when we want to add support for the CLEAR_BITMAP and this cached +bitmap will be used to guarantee we don't clear any unknown dirty bits +otherwise that can be a severe data loss issue for migration code. + +It's pretty intuitive to maintain a bitmap per container since we perform +log_sync at this granule. But I don't know how to deal with things like +memory hot-{un}plug, sparse DMA mappings, etc. Suggestions welcome. + +* yet something to-do: + - can't work with guest viommu + - no locks + - etc + +[ The idea and even the commit message are largely inherited from kvm side. + See commit 9f4bf4baa8b820c7930e23c9566c9493db7e1d25. 
] + +Signed-off-by: Zenghui Yu +Signed-off-by: Kunkun Jiang +--- + hw/vfio/common.c | 62 +++++++++++++++++++++++++++++++---- + include/hw/vfio/vfio-common.h | 9 +++++ + 2 files changed, 65 insertions(+), 6 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index 080046e3f5..86ea784919 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -445,6 +445,29 @@ unmap_exit: + return ret; + } + ++static VFIODMARange *vfio_lookup_match_range(VFIOContainer *container, ++ hwaddr start_addr, hwaddr size) ++{ ++ VFIODMARange *qrange; ++ ++ QLIST_FOREACH(qrange, &container->dma_list, next) { ++ if (qrange->iova == start_addr && qrange->size == size) { ++ return qrange; ++ } ++ } ++ return NULL; ++} ++ ++static void vfio_dma_range_init_dirty_bitmap(VFIODMARange *qrange) ++{ ++ uint64_t pages, size; ++ ++ pages = REAL_HOST_PAGE_ALIGN(qrange->size) / qemu_real_host_page_size; ++ size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) / BITS_PER_BYTE; ++ ++ qrange->bitmap = g_malloc0(size); ++} ++ + /* + * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86 + */ +@@ -458,12 +481,29 @@ static int vfio_dma_unmap(VFIOContainer *container, + .iova = iova, + .size = size, + }; ++ VFIODMARange *qrange; + + if (iotlb && container->dirty_pages_supported && + vfio_devices_all_running_and_saving(container)) { + return vfio_dma_unmap_bitmap(container, iova, size, iotlb); + } + ++ /* ++ * unregister the DMA range ++ * ++ * It seems that the memory layer will give us the same section as the one ++ * used in region_add(). Otherwise it'll be complicated to manipulate the ++ * bitmap across region_{add,del}. Is there any guarantee? ++ * ++ * But there is really not such a restriction on the kernel interface ++ * (VFIO_IOMMU_DIRTY_PAGES_FLAG_{UN}MAP_DMA, etc). 
++ */ ++ qrange = vfio_lookup_match_range(container, iova, size); ++ assert(qrange); ++ g_free(qrange->bitmap); ++ QLIST_REMOVE(qrange, next); ++ g_free(qrange); ++ + while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) { + /* + * The type1 backend has an off-by-one bug in the kernel (71a7d3d78e3c +@@ -500,6 +540,14 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova, + .iova = iova, + .size = size, + }; ++ VFIODMARange *qrange; ++ ++ qrange = g_malloc0(sizeof(*qrange)); ++ qrange->iova = iova; ++ qrange->size = size; ++ QLIST_INSERT_HEAD(&container->dma_list, qrange, next); ++ /* XXX allocate the dirty bitmap on demand */ ++ vfio_dma_range_init_dirty_bitmap(qrange); + + if (!readonly) { + map.flags |= VFIO_DMA_MAP_FLAG_WRITE; +@@ -1256,9 +1304,14 @@ static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, + { + struct vfio_iommu_type1_dirty_bitmap *dbitmap; + struct vfio_iommu_type1_dirty_bitmap_get *range; ++ VFIODMARange *qrange; + uint64_t pages; + int ret; + ++ qrange = vfio_lookup_match_range(container, iova, size); ++ /* the same as vfio_dma_unmap() */ ++ assert(qrange); ++ + dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range)); + + dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range); +@@ -1277,11 +1330,8 @@ static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, + pages = REAL_HOST_PAGE_ALIGN(range->size) / qemu_real_host_page_size; + range->bitmap.size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) / + BITS_PER_BYTE; +- range->bitmap.data = g_try_malloc0(range->bitmap.size); +- if (!range->bitmap.data) { +- ret = -ENOMEM; +- goto err_out; +- } ++ ++ range->bitmap.data = (__u64 *)qrange->bitmap; + + ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap); + if (ret) { +@@ -1297,7 +1347,6 @@ static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, + trace_vfio_get_dirty_bitmap(container->fd, range->iova, range->size, + range->bitmap.size, ram_addr); + err_out: +- 
g_free(range->bitmap.data); + g_free(dbitmap); + + return ret; +@@ -2061,6 +2110,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, + QLIST_INIT(&container->giommu_list); + QLIST_INIT(&container->hostwin_list); + QLIST_INIT(&container->vrdl_list); ++ QLIST_INIT(&container->dma_list); + + ret = vfio_init_container(container, group->fd, errp); + if (ret) { +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index 8af11b0a76..20b9c8a1d3 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -76,6 +76,14 @@ typedef struct VFIOAddressSpace { + + struct VFIOGroup; + ++typedef struct VFIODMARange { ++ QLIST_ENTRY(VFIODMARange) next; ++ hwaddr iova; ++ size_t size; ++ void *vaddr; /* unused */ ++ unsigned long *bitmap; /* dirty bitmap cache for this range */ ++} VFIODMARange; ++ + typedef struct VFIOContainer { + VFIOAddressSpace *space; + int fd; /* /dev/vfio/vfio, empowered by the attached groups */ +@@ -93,6 +101,7 @@ typedef struct VFIOContainer { + QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; + QLIST_HEAD(, VFIOGroup) group_list; + QLIST_HEAD(, VFIORamDiscardListener) vrdl_list; ++ QLIST_HEAD(, VFIODMARange) dma_list; + QLIST_ENTRY(VFIOContainer) next; + } VFIOContainer; + +-- +2.27.0 + -- Gitee From 280cdc602c3169a19f6c8484b6dd81b88fd5e8d9 Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Sat, 8 May 2021 17:31:05 +0800 Subject: [PATCH 18/56] vfio/migration: Add support for manual clear vfio dirty log The new capability VFIO_DIRTY_LOG_MANUAL_CLEAR and the new ioctl VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR and VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP have been introduced in the kernel, tweak the userspace side to use them. Check if the kernel supports VFIO_DIRTY_LOG_MANUAL_CLEAR and provide the log_clear() hook for vfio_memory_listener. If the kernel supports it, deliver the clear message to kernel. 
Signed-off-by: Zenghui Yu Signed-off-by: Kunkun Jiang --- ...dd-support-for-manual-clear-vfio-dir.patch | 224 ++++++++++++++++++ 1 file changed, 224 insertions(+) create mode 100644 vfio-migration-Add-support-for-manual-clear-vfio-dir.patch diff --git a/vfio-migration-Add-support-for-manual-clear-vfio-dir.patch b/vfio-migration-Add-support-for-manual-clear-vfio-dir.patch new file mode 100644 index 00000000..0a5ff88f --- /dev/null +++ b/vfio-migration-Add-support-for-manual-clear-vfio-dir.patch @@ -0,0 +1,224 @@ +From 815258f81a660ad87272191dca4a9726cb2bf5b2 Mon Sep 17 00:00:00 2001 +From: Zenghui Yu +Date: Sat, 8 May 2021 17:31:05 +0800 +Subject: [PATCH] vfio/migration: Add support for manual clear vfio dirty log + +The new capability VFIO_DIRTY_LOG_MANUAL_CLEAR and the new ioctl +VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR and +VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP have been introduced in +the kernel, tweak the userspace side to use them. + +Check if the kernel supports VFIO_DIRTY_LOG_MANUAL_CLEAR and +provide the log_clear() hook for vfio_memory_listener. If the +kernel supports it, deliver the clear message to kernel. + +Signed-off-by: Zenghui Yu +Signed-off-by: Kunkun Jiang +--- + hw/vfio/common.c | 149 +++++++++++++++++++++++++++++++++- + include/hw/vfio/vfio-common.h | 1 + + 2 files changed, 148 insertions(+), 2 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index 86ea784919..6cb91e7ffd 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -1315,7 +1315,9 @@ static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, + dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range)); + + dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range); +- dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP; ++ dbitmap->flags = container->dirty_log_manual_clear ? 
++ VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR : ++ VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP; + range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data; + range->iova = iova; + range->size = size; +@@ -1491,6 +1493,141 @@ static void vfio_listener_log_sync(MemoryListener *listener, + } + } + ++/* ++ * I'm not sure if there's any alignment requirement for the CLEAR_BITMAP ++ * ioctl. But copy from kvm side and align {start, size} with 64 pages. ++ * ++ * I think the code can be simplified a lot if no alignment requirement. ++ */ ++#define VFIO_CLEAR_LOG_SHIFT 6 ++#define VFIO_CLEAR_LOG_ALIGN (qemu_real_host_page_size << VFIO_CLEAR_LOG_SHIFT) ++#define VFIO_CLEAR_LOG_MASK (-VFIO_CLEAR_LOG_ALIGN) ++ ++static int vfio_log_clear_one_range(VFIOContainer *container, ++ VFIODMARange *qrange, uint64_t start, uint64_t size) ++{ ++ struct vfio_iommu_type1_dirty_bitmap *dbitmap; ++ struct vfio_iommu_type1_dirty_bitmap_get *range; ++ ++ dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range)); ++ ++ dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range); ++ dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP; ++ range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data; ++ ++ /* ++ * Now let's deal with the actual bitmap, which is almost the same ++ * as the kvm side. 
++ */ ++ uint64_t end, bmap_start, start_delta, bmap_npages; ++ unsigned long *bmap_clear = NULL, psize = qemu_real_host_page_size; ++ int ret; ++ ++ bmap_start = start & VFIO_CLEAR_LOG_MASK; ++ start_delta = start - bmap_start; ++ bmap_start /= psize; ++ ++ bmap_npages = DIV_ROUND_UP(size + start_delta, VFIO_CLEAR_LOG_ALIGN) ++ << VFIO_CLEAR_LOG_SHIFT; ++ end = qrange->size / psize; ++ if (bmap_npages > end - bmap_start) { ++ bmap_npages = end - bmap_start; ++ } ++ start_delta /= psize; ++ ++ if (start_delta) { ++ bmap_clear = bitmap_new(bmap_npages); ++ bitmap_copy_with_src_offset(bmap_clear, qrange->bitmap, ++ bmap_start, start_delta + size / psize); ++ bitmap_clear(bmap_clear, 0, start_delta); ++ range->bitmap.data = (__u64 *)bmap_clear; ++ } else { ++ range->bitmap.data = (__u64 *)(qrange->bitmap + BIT_WORD(bmap_start)); ++ } ++ ++ range->iova = qrange->iova + bmap_start * psize; ++ range->size = bmap_npages * psize; ++ range->bitmap.size = ROUND_UP(bmap_npages, sizeof(__u64) * BITS_PER_BYTE) / ++ BITS_PER_BYTE; ++ range->bitmap.pgsize = qemu_real_host_page_size; ++ ++ ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap); ++ if (ret) { ++ error_report("Failed to clear dirty log for iova: 0x%"PRIx64 ++ " size: 0x%"PRIx64" err: %d", (uint64_t)range->iova, ++ (uint64_t)range->size, errno); ++ goto err_out; ++ } ++ ++ bitmap_clear(qrange->bitmap, bmap_start + start_delta, size / psize); ++err_out: ++ g_free(bmap_clear); ++ g_free(dbitmap); ++ return 0; ++} ++ ++static int vfio_physical_log_clear(VFIOContainer *container, ++ MemoryRegionSection *section) ++{ ++ uint64_t start, size, offset, count; ++ VFIODMARange *qrange; ++ int ret = 0; ++ ++ if (!container->dirty_log_manual_clear) { ++ /* No need to do explicit clear */ ++ return ret; ++ } ++ ++ start = section->offset_within_address_space; ++ size = int128_get64(section->size); ++ ++ if (!size) { ++ return ret; ++ } ++ ++ QLIST_FOREACH(qrange, &container->dma_list, next) { ++ /* ++ * Discard ranges that 
do not overlap the section (e.g., the ++ * Memory BAR regions of the device) ++ */ ++ if (qrange->iova > start + size - 1 || ++ start > qrange->iova + qrange->size - 1) { ++ continue; ++ } ++ ++ if (start >= qrange->iova) { ++ /* The range starts before section or is aligned to it. */ ++ offset = start - qrange->iova; ++ count = MIN(qrange->size - offset, size); ++ } else { ++ /* The range starts after section. */ ++ offset = 0; ++ count = MIN(qrange->size, size - (qrange->iova - start)); ++ } ++ ret = vfio_log_clear_one_range(container, qrange, offset, count); ++ if (ret < 0) { ++ break; ++ } ++ } ++ ++ return ret; ++} ++ ++static void vfio_listener_log_clear(MemoryListener *listener, ++ MemoryRegionSection *section) ++{ ++ VFIOContainer *container = container_of(listener, VFIOContainer, listener); ++ ++ if (vfio_listener_skipped_section(section) || ++ !container->dirty_pages_supported) { ++ return; ++ } ++ ++ if (vfio_devices_all_dirty_tracking(container)) { ++ vfio_physical_log_clear(container, section); ++ } ++} ++ + static const MemoryListener vfio_memory_listener = { + .name = "vfio", + .region_add = vfio_listener_region_add, +@@ -1498,6 +1635,7 @@ static const MemoryListener vfio_memory_listener = { + .log_global_start = vfio_listener_log_global_start, + .log_global_stop = vfio_listener_log_global_stop, + .log_sync = vfio_listener_log_sync, ++ .log_clear = vfio_listener_log_clear, + }; + + static void vfio_listener_release(VFIOContainer *container) +@@ -1925,7 +2063,7 @@ static int vfio_get_iommu_type(VFIOContainer *container, + static int vfio_init_container(VFIOContainer *container, int group_fd, + Error **errp) + { +- int iommu_type, ret; ++ int iommu_type, dirty_log_manual_clear, ret; + + iommu_type = vfio_get_iommu_type(container, errp); + if (iommu_type < 0) { +@@ -1954,6 +2092,13 @@ static int vfio_init_container(VFIOContainer *container, int group_fd, + } + + container->iommu_type = iommu_type; ++ ++ dirty_log_manual_clear = ioctl(container->fd, 
VFIO_CHECK_EXTENSION, ++ VFIO_DIRTY_LOG_MANUAL_CLEAR); ++ if (dirty_log_manual_clear) { ++ container->dirty_log_manual_clear = dirty_log_manual_clear; ++ } ++ + return 0; + } + +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index 20b9c8a1d3..0234f5e1b1 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -93,6 +93,7 @@ typedef struct VFIOContainer { + Error *error; + bool initialized; + bool dirty_pages_supported; ++ bool dirty_log_manual_clear; + uint64_t dirty_pgsizes; + uint64_t max_dirty_bitmap_size; + unsigned long pgsizes; +-- +2.27.0 + -- Gitee From 475dc70689bb73b51405ec2aa9ba8e21b4bcd148 Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Thu, 9 May 2019 10:23:42 -0400 Subject: [PATCH 19/56] update-linux-headers: Import iommu.h Update the script to import the new iommu.h uapi header. Signed-off-by: Eric Auger Signed-off-by: Kunkun Jiang --- update-linux-headers-Import-iommu.h.patch | 29 +++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 update-linux-headers-Import-iommu.h.patch diff --git a/update-linux-headers-Import-iommu.h.patch b/update-linux-headers-Import-iommu.h.patch new file mode 100644 index 00000000..5653e6a4 --- /dev/null +++ b/update-linux-headers-Import-iommu.h.patch @@ -0,0 +1,29 @@ +From 694acf3c321908d26ce508842b7bd076664ffbc6 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Thu, 9 May 2019 10:23:42 -0400 +Subject: [PATCH] update-linux-headers: Import iommu.h + +Update the script to import the new iommu.h uapi header. 
+ +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + scripts/update-linux-headers.sh | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/scripts/update-linux-headers.sh b/scripts/update-linux-headers.sh +index fea4d6eb65..acde610733 100755 +--- a/scripts/update-linux-headers.sh ++++ b/scripts/update-linux-headers.sh +@@ -144,7 +144,7 @@ done + + rm -rf "$output/linux-headers/linux" + mkdir -p "$output/linux-headers/linux" +-for header in kvm.h vfio.h vfio_ccw.h vfio_zdev.h vhost.h \ ++for header in kvm.h vfio.h vfio_ccw.h vfio_zdev.h vhost.h iommu.h \ + psci.h psp-sev.h userfaultfd.h mman.h; do + cp "$tmpdir/include/linux/$header" "$output/linux-headers/linux" + done +-- +2.27.0 + -- Gitee From 830dcc6d9f7a646d6b44bc68b457eb01d733f465 Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Fri, 30 Jul 2021 09:15:31 +0800 Subject: [PATCH 20/56] vfio.h and iommu.h header update against 5.10 Signed-off-by: Kunkun Jiang --- ...d-iommu.h-header-update-against-5.10.patch | 701 ++++++++++++++++++ 1 file changed, 701 insertions(+) create mode 100644 vfio.h-and-iommu.h-header-update-against-5.10.patch diff --git a/vfio.h-and-iommu.h-header-update-against-5.10.patch b/vfio.h-and-iommu.h-header-update-against-5.10.patch new file mode 100644 index 00000000..8272a679 --- /dev/null +++ b/vfio.h-and-iommu.h-header-update-against-5.10.patch @@ -0,0 +1,701 @@ +From 36b65d7312a343cb636e6963b8262dce9420ebc6 Mon Sep 17 00:00:00 2001 +From: Kunkun Jiang +Date: Fri, 30 Jul 2021 09:15:31 +0800 +Subject: [PATCH] vfio.h and iommu.h header update against 5.10 + +Signed-off-by: Kunkun Jiang +--- + linux-headers/linux/iommu.h | 395 ++++++++++++++++++++++++++++++++++++ + linux-headers/linux/vfio.h | 220 +++++++++++++++++++- + 2 files changed, 613 insertions(+), 2 deletions(-) + create mode 100644 linux-headers/linux/iommu.h + +diff --git a/linux-headers/linux/iommu.h b/linux-headers/linux/iommu.h +new file mode 100644 +index 0000000000..773b7dc2d6 +--- /dev/null ++++ 
b/linux-headers/linux/iommu.h +@@ -0,0 +1,395 @@ ++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++/* ++ * IOMMU user API definitions ++ */ ++ ++#ifndef IOMMU_H ++#define IOMMU_H ++ ++#include ++ ++#define IOMMU_FAULT_PERM_READ (1 << 0) /* read */ ++#define IOMMU_FAULT_PERM_WRITE (1 << 1) /* write */ ++#define IOMMU_FAULT_PERM_EXEC (1 << 2) /* exec */ ++#define IOMMU_FAULT_PERM_PRIV (1 << 3) /* privileged */ ++ ++/* Generic fault types, can be expanded IRQ remapping fault */ ++enum iommu_fault_type { ++ IOMMU_FAULT_DMA_UNRECOV = 1, /* unrecoverable fault */ ++ IOMMU_FAULT_PAGE_REQ, /* page request fault */ ++}; ++ ++enum iommu_fault_reason { ++ IOMMU_FAULT_REASON_UNKNOWN = 0, ++ ++ /* Could not access the PASID table (fetch caused external abort) */ ++ IOMMU_FAULT_REASON_PASID_FETCH, ++ ++ /* PASID entry is invalid or has configuration errors */ ++ IOMMU_FAULT_REASON_BAD_PASID_ENTRY, ++ ++ /* ++ * PASID is out of range (e.g. exceeds the maximum PASID ++ * supported by the IOMMU) or disabled. 
++ */ ++ IOMMU_FAULT_REASON_PASID_INVALID, ++ ++ /* ++ * An external abort occurred fetching (or updating) a translation ++ * table descriptor ++ */ ++ IOMMU_FAULT_REASON_WALK_EABT, ++ ++ /* ++ * Could not access the page table entry (Bad address), ++ * actual translation fault ++ */ ++ IOMMU_FAULT_REASON_PTE_FETCH, ++ ++ /* Protection flag check failed */ ++ IOMMU_FAULT_REASON_PERMISSION, ++ ++ /* access flag check failed */ ++ IOMMU_FAULT_REASON_ACCESS, ++ ++ /* Output address of a translation stage caused Address Size fault */ ++ IOMMU_FAULT_REASON_OOR_ADDRESS, ++}; ++ ++/** ++ * struct iommu_fault_unrecoverable - Unrecoverable fault data ++ * @reason: reason of the fault, from &enum iommu_fault_reason ++ * @flags: parameters of this fault (IOMMU_FAULT_UNRECOV_* values) ++ * @pasid: Process Address Space ID ++ * @perm: requested permission access using by the incoming transaction ++ * (IOMMU_FAULT_PERM_* values) ++ * @addr: offending page address ++ * @fetch_addr: address that caused a fetch abort, if any ++ */ ++struct iommu_fault_unrecoverable { ++ __u32 reason; ++#define IOMMU_FAULT_UNRECOV_PASID_VALID (1 << 0) ++#define IOMMU_FAULT_UNRECOV_ADDR_VALID (1 << 1) ++#define IOMMU_FAULT_UNRECOV_FETCH_ADDR_VALID (1 << 2) ++ __u32 flags; ++ __u32 pasid; ++ __u32 perm; ++ __u64 addr; ++ __u64 fetch_addr; ++}; ++ ++/** ++ * struct iommu_fault_page_request - Page Request data ++ * @flags: encodes whether the corresponding fields are valid and whether this ++ * is the last page in group (IOMMU_FAULT_PAGE_REQUEST_* values). ++ * When IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID is set, the page response ++ * must have the same PASID value as the page request. When it is clear, ++ * the page response should not have a PASID. 
++ * @pasid: Process Address Space ID ++ * @grpid: Page Request Group Index ++ * @perm: requested page permissions (IOMMU_FAULT_PERM_* values) ++ * @addr: page address ++ * @private_data: device-specific private information ++ */ ++struct iommu_fault_page_request { ++#define IOMMU_FAULT_PAGE_REQUEST_PASID_VALID (1 << 0) ++#define IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE (1 << 1) ++#define IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA (1 << 2) ++#define IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID (1 << 3) ++ __u32 flags; ++ __u32 pasid; ++ __u32 grpid; ++ __u32 perm; ++ __u64 addr; ++ __u64 private_data[2]; ++}; ++ ++/** ++ * struct iommu_fault - Generic fault data ++ * @type: fault type from &enum iommu_fault_type ++ * @padding: reserved for future use (should be zero) ++ * @event: fault event, when @type is %IOMMU_FAULT_DMA_UNRECOV ++ * @prm: Page Request message, when @type is %IOMMU_FAULT_PAGE_REQ ++ * @padding2: sets the fault size to allow for future extensions ++ */ ++struct iommu_fault { ++ __u32 type; ++ __u32 padding; ++ union { ++ struct iommu_fault_unrecoverable event; ++ struct iommu_fault_page_request prm; ++ __u8 padding2[56]; ++ }; ++}; ++ ++/** ++ * enum iommu_page_response_code - Return status of fault handlers ++ * @IOMMU_PAGE_RESP_SUCCESS: Fault has been handled and the page tables ++ * populated, retry the access. This is "Success" in PCI PRI. ++ * @IOMMU_PAGE_RESP_FAILURE: General error. Drop all subsequent faults from ++ * this device if possible. This is "Response Failure" in PCI PRI. ++ * @IOMMU_PAGE_RESP_INVALID: Could not handle this fault, don't retry the ++ * access. This is "Invalid Request" in PCI PRI. 
++ */ ++enum iommu_page_response_code { ++ IOMMU_PAGE_RESP_SUCCESS = 0, ++ IOMMU_PAGE_RESP_INVALID, ++ IOMMU_PAGE_RESP_FAILURE, ++}; ++ ++/** ++ * struct iommu_page_response - Generic page response information ++ * @argsz: User filled size of this data ++ * @version: API version of this structure ++ * @flags: encodes whether the corresponding fields are valid ++ * (IOMMU_FAULT_PAGE_RESPONSE_* values) ++ * @pasid: Process Address Space ID ++ * @grpid: Page Request Group Index ++ * @code: response code from &enum iommu_page_response_code ++ */ ++struct iommu_page_response { ++ __u32 argsz; ++#define IOMMU_PAGE_RESP_VERSION_1 1 ++ __u32 version; ++#define IOMMU_PAGE_RESP_PASID_VALID (1 << 0) ++ __u32 flags; ++ __u32 pasid; ++ __u32 grpid; ++ __u32 code; ++}; ++ ++/* defines the granularity of the invalidation */ ++enum iommu_inv_granularity { ++ IOMMU_INV_GRANU_DOMAIN, /* domain-selective invalidation */ ++ IOMMU_INV_GRANU_PASID, /* PASID-selective invalidation */ ++ IOMMU_INV_GRANU_ADDR, /* page-selective invalidation */ ++ IOMMU_INV_GRANU_NR, /* number of invalidation granularities */ ++}; ++ ++/** ++ * struct iommu_inv_addr_info - Address Selective Invalidation Structure ++ * ++ * @flags: indicates the granularity of the address-selective invalidation ++ * - If the PASID bit is set, the @pasid field is populated and the invalidation ++ * relates to cache entries tagged with this PASID and matching the address ++ * range. ++ * - If ARCHID bit is set, @archid is populated and the invalidation relates ++ * to cache entries tagged with this architecture specific ID and matching ++ * the address range. ++ * - Both PASID and ARCHID can be set as they may tag different caches. ++ * - If neither PASID or ARCHID is set, global addr invalidation applies. ++ * - The LEAF flag indicates whether only the leaf PTE caching needs to be ++ * invalidated and other paging structure caches can be preserved. 
++ * @pasid: process address space ID ++ * @archid: architecture-specific ID ++ * @addr: first stage/level input address ++ * @granule_size: page/block size of the mapping in bytes ++ * @nb_granules: number of contiguous granules to be invalidated ++ */ ++struct iommu_inv_addr_info { ++#define IOMMU_INV_ADDR_FLAGS_PASID (1 << 0) ++#define IOMMU_INV_ADDR_FLAGS_ARCHID (1 << 1) ++#define IOMMU_INV_ADDR_FLAGS_LEAF (1 << 2) ++ __u32 flags; ++ __u32 archid; ++ __u64 pasid; ++ __u64 addr; ++ __u64 granule_size; ++ __u64 nb_granules; ++}; ++ ++/** ++ * struct iommu_inv_pasid_info - PASID Selective Invalidation Structure ++ * ++ * @flags: indicates the granularity of the PASID-selective invalidation ++ * - If the PASID bit is set, the @pasid field is populated and the invalidation ++ * relates to cache entries tagged with this PASID and matching the address ++ * range. ++ * - If the ARCHID bit is set, the @archid is populated and the invalidation ++ * relates to cache entries tagged with this architecture specific ID and ++ * matching the address range. ++ * - Both PASID and ARCHID can be set as they may tag different caches. ++ * - At least one of PASID or ARCHID must be set. 
++ * @pasid: process address space ID ++ * @archid: architecture-specific ID ++ */ ++struct iommu_inv_pasid_info { ++#define IOMMU_INV_PASID_FLAGS_PASID (1 << 0) ++#define IOMMU_INV_PASID_FLAGS_ARCHID (1 << 1) ++ __u32 flags; ++ __u32 archid; ++ __u64 pasid; ++}; ++ ++/** ++ * struct iommu_cache_invalidate_info - First level/stage invalidation ++ * information ++ * @argsz: User filled size of this data ++ * @version: API version of this structure ++ * @cache: bitfield that allows to select which caches to invalidate ++ * @granularity: defines the lowest granularity used for the invalidation: ++ * domain > PASID > addr ++ * @padding: reserved for future use (should be zero) ++ * @pasid_info: invalidation data when @granularity is %IOMMU_INV_GRANU_PASID ++ * @addr_info: invalidation data when @granularity is %IOMMU_INV_GRANU_ADDR ++ * ++ * Not all the combinations of cache/granularity are valid: ++ * ++ * +--------------+---------------+---------------+---------------+ ++ * | type / | DEV_IOTLB | IOTLB | PASID | ++ * | granularity | | | cache | ++ * +==============+===============+===============+===============+ ++ * | DOMAIN | N/A | Y | Y | ++ * +--------------+---------------+---------------+---------------+ ++ * | PASID | Y | Y | Y | ++ * +--------------+---------------+---------------+---------------+ ++ * | ADDR | Y | Y | N/A | ++ * +--------------+---------------+---------------+---------------+ ++ * ++ * Invalidations by %IOMMU_INV_GRANU_DOMAIN don't take any argument other than ++ * @version and @cache. ++ * ++ * If multiple cache types are invalidated simultaneously, they all ++ * must support the used granularity. 
++ */ ++struct iommu_cache_invalidate_info { ++ __u32 argsz; ++#define IOMMU_CACHE_INVALIDATE_INFO_VERSION_1 1 ++ __u32 version; ++/* IOMMU paging structure cache */ ++#define IOMMU_CACHE_INV_TYPE_IOTLB (1 << 0) /* IOMMU IOTLB */ ++#define IOMMU_CACHE_INV_TYPE_DEV_IOTLB (1 << 1) /* Device IOTLB */ ++#define IOMMU_CACHE_INV_TYPE_PASID (1 << 2) /* PASID cache */ ++#define IOMMU_CACHE_INV_TYPE_NR (3) ++ __u8 cache; ++ __u8 granularity; ++ __u8 padding[6]; ++ union { ++ struct iommu_inv_pasid_info pasid_info; ++ struct iommu_inv_addr_info addr_info; ++ } granu; ++}; ++ ++/** ++ * struct iommu_gpasid_bind_data_vtd - Intel VT-d specific data on device and guest ++ * SVA binding. ++ * ++ * @flags: VT-d PASID table entry attributes ++ * @pat: Page attribute table data to compute effective memory type ++ * @emt: Extended memory type ++ * ++ * Only guest vIOMMU selectable and effective options are passed down to ++ * the host IOMMU. ++ */ ++struct iommu_gpasid_bind_data_vtd { ++#define IOMMU_SVA_VTD_GPASID_SRE (1 << 0) /* supervisor request */ ++#define IOMMU_SVA_VTD_GPASID_EAFE (1 << 1) /* extended access enable */ ++#define IOMMU_SVA_VTD_GPASID_PCD (1 << 2) /* page-level cache disable */ ++#define IOMMU_SVA_VTD_GPASID_PWT (1 << 3) /* page-level write through */ ++#define IOMMU_SVA_VTD_GPASID_EMTE (1 << 4) /* extended mem type enable */ ++#define IOMMU_SVA_VTD_GPASID_CD (1 << 5) /* PASID-level cache disable */ ++#define IOMMU_SVA_VTD_GPASID_LAST (1 << 6) ++ __u64 flags; ++ __u32 pat; ++ __u32 emt; ++}; ++ ++#define IOMMU_SVA_VTD_GPASID_MTS_MASK (IOMMU_SVA_VTD_GPASID_CD | \ ++ IOMMU_SVA_VTD_GPASID_EMTE | \ ++ IOMMU_SVA_VTD_GPASID_PCD | \ ++ IOMMU_SVA_VTD_GPASID_PWT) ++ ++/** ++ * struct iommu_gpasid_bind_data - Information about device and guest PASID binding ++ * @argsz: User filled size of this data ++ * @version: Version of this data structure ++ * @format: PASID table entry format ++ * @flags: Additional information on guest bind request ++ * @gpgd: Guest page directory 
base of the guest mm to bind ++ * @hpasid: Process address space ID used for the guest mm in host IOMMU ++ * @gpasid: Process address space ID used for the guest mm in guest IOMMU ++ * @addr_width: Guest virtual address width ++ * @padding: Reserved for future use (should be zero) ++ * @vtd: Intel VT-d specific data ++ * ++ * Guest to host PASID mapping can be an identity or non-identity, where guest ++ * has its own PASID space. For non-identify mapping, guest to host PASID lookup ++ * is needed when VM programs guest PASID into an assigned device. VMM may ++ * trap such PASID programming then request host IOMMU driver to convert guest ++ * PASID to host PASID based on this bind data. ++ */ ++struct iommu_gpasid_bind_data { ++ __u32 argsz; ++#define IOMMU_GPASID_BIND_VERSION_1 1 ++ __u32 version; ++#define IOMMU_PASID_FORMAT_INTEL_VTD 1 ++#define IOMMU_PASID_FORMAT_LAST 2 ++ __u32 format; ++ __u32 addr_width; ++#define IOMMU_SVA_GPASID_VAL (1 << 0) /* guest PASID valid */ ++ __u64 flags; ++ __u64 gpgd; ++ __u64 hpasid; ++ __u64 gpasid; ++ __u8 padding[8]; ++ /* Vendor specific data */ ++ union { ++ struct iommu_gpasid_bind_data_vtd vtd; ++ } vendor; ++}; ++ ++/** ++ * struct iommu_pasid_smmuv3 - ARM SMMUv3 Stream Table Entry stage 1 related ++ * information ++ * @version: API version of this structure ++ * @s1fmt: STE s1fmt (format of the CD table: single CD, linear table ++ * or 2-level table) ++ * @s1dss: STE s1dss (specifies the behavior when @pasid_bits != 0 ++ * and no PASID is passed along with the incoming transaction) ++ * @padding: reserved for future use (should be zero) ++ * ++ * The PASID table is referred to as the Context Descriptor (CD) table on ARM ++ * SMMUv3. Please refer to the ARM SMMU 3.x spec (ARM IHI 0070A) for full ++ * details. 
++ */ ++struct iommu_pasid_smmuv3 { ++#define PASID_TABLE_SMMUV3_CFG_VERSION_1 1 ++ __u32 version; ++ __u8 s1fmt; ++ __u8 s1dss; ++ __u8 padding[2]; ++}; ++ ++/** ++ * struct iommu_pasid_table_config - PASID table data used to bind guest PASID ++ * table to the host IOMMU ++ * @argsz: User filled size of this data ++ * @version: API version to prepare for future extensions ++ * @base_ptr: guest physical address of the PASID table ++ * @format: format of the PASID table ++ * @pasid_bits: number of PASID bits used in the PASID table ++ * @config: indicates whether the guest translation stage must ++ * be translated, bypassed or aborted. ++ * @padding: reserved for future use (should be zero) ++ * @vendor_data.smmuv3: table information when @format is ++ * %IOMMU_PASID_FORMAT_SMMUV3 ++ */ ++struct iommu_pasid_table_config { ++ __u32 argsz; ++#define PASID_TABLE_CFG_VERSION_1 1 ++ __u32 version; ++ __u64 base_ptr; ++#define IOMMU_PASID_FORMAT_SMMUV3 1 ++ __u32 format; ++ __u8 pasid_bits; ++#define IOMMU_PASID_CONFIG_TRANSLATE 1 ++#define IOMMU_PASID_CONFIG_BYPASS 2 ++#define IOMMU_PASID_CONFIG_ABORT 3 ++ __u8 config; ++ __u8 padding[2]; ++ union { ++ struct iommu_pasid_smmuv3 smmuv3; ++ } vendor_data; ++}; ++ ++#endif /* _UAPI_IOMMU_H */ +diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h +index f4ff038e8c..cf8e208fac 100644 +--- a/linux-headers/linux/vfio.h ++++ b/linux-headers/linux/vfio.h +@@ -14,6 +14,7 @@ + + #include + #include ++#include + + #define VFIO_API_VERSION 0 + +@@ -334,6 +335,7 @@ struct vfio_region_info_cap_type { + #define VFIO_REGION_TYPE_GFX (1) + #define VFIO_REGION_TYPE_CCW (2) + #define VFIO_REGION_TYPE_MIGRATION (3) ++#define VFIO_REGION_TYPE_NESTED (4) + + /* sub-types for VFIO_REGION_TYPE_PCI_* */ + +@@ -362,6 +364,10 @@ struct vfio_region_info_cap_type { + /* sub-types for VFIO_REGION_TYPE_GFX */ + #define VFIO_REGION_SUBTYPE_GFX_EDID (1) + ++/* sub-types for VFIO_REGION_TYPE_NESTED */ ++#define 
VFIO_REGION_SUBTYPE_NESTED_DMA_FAULT (1) ++#define VFIO_REGION_SUBTYPE_NESTED_DMA_FAULT_RESPONSE (2) ++ + /** + * struct vfio_region_gfx_edid - EDID region layout. + * +@@ -721,11 +727,30 @@ struct vfio_irq_info { + #define VFIO_IRQ_INFO_MASKABLE (1 << 1) + #define VFIO_IRQ_INFO_AUTOMASKED (1 << 2) + #define VFIO_IRQ_INFO_NORESIZE (1 << 3) ++#define VFIO_IRQ_INFO_FLAG_CAPS (1 << 4) /* Info supports caps */ + __u32 index; /* IRQ index */ + __u32 count; /* Number of IRQs within this index */ ++ __u32 cap_offset; /* Offset within info struct of first cap */ + }; + #define VFIO_DEVICE_GET_IRQ_INFO _IO(VFIO_TYPE, VFIO_BASE + 9) + ++/* ++ * The irq type capability allows IRQs unique to a specific device or ++ * class of devices to be exposed. ++ * ++ * The structures below define version 1 of this capability. ++ */ ++#define VFIO_IRQ_INFO_CAP_TYPE 3 ++ ++struct vfio_irq_info_cap_type { ++ struct vfio_info_cap_header header; ++ __u32 type; /* global per bus driver */ ++ __u32 subtype; /* type specific */ ++}; ++ ++#define VFIO_IRQ_TYPE_NESTED (1) ++#define VFIO_IRQ_SUBTYPE_DMA_FAULT (1) ++ + /** + * VFIO_DEVICE_SET_IRQS - _IOW(VFIO_TYPE, VFIO_BASE + 10, struct vfio_irq_set) + * +@@ -827,7 +852,8 @@ enum { + VFIO_PCI_MSIX_IRQ_INDEX, + VFIO_PCI_ERR_IRQ_INDEX, + VFIO_PCI_REQ_IRQ_INDEX, +- VFIO_PCI_NUM_IRQS ++ VFIO_PCI_NUM_IRQS = 5 /* Fixed user ABI, IRQ indexes >=5 use */ ++ /* device specific cap to define content */ + }; + + /* +@@ -1012,6 +1038,68 @@ struct vfio_device_feature { + */ + #define VFIO_DEVICE_FEATURE_PCI_VF_TOKEN (0) + ++/* ++ * Capability exposed by the DMA fault region ++ * @version: ABI version ++ */ ++#define VFIO_REGION_INFO_CAP_DMA_FAULT 6 ++ ++struct vfio_region_info_cap_fault { ++ struct vfio_info_cap_header header; ++ __u32 version; ++}; ++ ++/* ++ * Capability exposed by the DMA fault response region ++ * @version: ABI version ++ */ ++#define VFIO_REGION_INFO_CAP_DMA_FAULT_RESPONSE 7 ++ ++struct vfio_region_info_cap_fault_response { ++ struct 
vfio_info_cap_header header; ++ __u32 version; ++}; ++ ++/* ++ * DMA Fault Region Layout ++ * @tail: index relative to the start of the ring buffer at which the ++ * consumer finds the next item in the buffer ++ * @entry_size: fault ring buffer entry size in bytes ++ * @nb_entries: max capacity of the fault ring buffer ++ * @offset: ring buffer offset relative to the start of the region ++ * @head: index relative to the start of the ring buffer at which the ++ * producer (kernel) inserts items into the buffers ++ */ ++struct vfio_region_dma_fault { ++ /* Write-Only */ ++ __u32 tail; ++ /* Read-Only */ ++ __u32 entry_size; ++ __u32 nb_entries; ++ __u32 offset; ++ __u32 head; ++}; ++ ++/* ++ * DMA Fault Response Region Layout ++ * @head: index relative to the start of the ring buffer at which the ++ * producer (userspace) insert responses into the buffer ++ * @entry_size: fault ring buffer entry size in bytes ++ * @nb_entries: max capacity of the fault ring buffer ++ * @offset: ring buffer offset relative to the start of the region ++ * @tail: index relative to the start of the ring buffer at which the ++ * consumer (kernel) finds the next item in the buffer ++ */ ++struct vfio_region_dma_fault_response { ++ /* Write-Only */ ++ __u32 head; ++ /* Read-Only */ ++ __u32 entry_size; ++ __u32 nb_entries; ++ __u32 offset; ++ __u32 tail; ++}; ++ + /* -------- API for Type1 VFIO IOMMU -------- */ + + /** +@@ -1124,7 +1212,7 @@ struct vfio_iommu_type1_dma_map { + struct vfio_bitmap { + __u64 pgsize; /* page size for bitmap in bytes */ + __u64 size; /* in bytes */ +- __u64 *data; /* one bit per page */ ++ __u64 *data; /* one bit per page */ + }; + + /** +@@ -1250,6 +1338,134 @@ struct vfio_iommu_type1_dirty_bitmap_get { + + #define VFIO_IOMMU_DIRTY_PAGES _IO(VFIO_TYPE, VFIO_BASE + 17) + ++/* ++ * VFIO_IOMMU_BIND_PROCESS ++ * ++ * Allocate a PASID for a process address space, and use it to attach this ++ * process to all devices in the container. 
Devices can then tag their DMA ++ * traffic with the returned @pasid to perform transactions on the associated ++ * virtual address space. Mapping and unmapping buffers is performed by standard ++ * functions such as mmap and malloc. ++ * ++ * If flag is VFIO_IOMMU_BIND_PID, @pid contains the pid of a foreign process to ++ * bind. Otherwise the current task is bound. Given that the caller owns the ++ * device, setting this flag grants the caller read and write permissions on the ++ * entire address space of foreign process described by @pid. Therefore, ++ * permission to perform the bind operation on a foreign process is governed by ++ * the ptrace access mode PTRACE_MODE_ATTACH_REALCREDS check. See man ptrace(2) ++ * for more information. ++ * ++ * On success, VFIO writes a Process Address Space ID (PASID) into @pasid. This ++ * ID is unique to a process and can be used on all devices in the container. ++ * ++ * On fork, the child inherits the device fd and can use the bonds setup by its ++ * parent. Consequently, the child has R/W access on the address spaces bound by ++ * its parent. After an execv, the device fd is closed and the child doesn't ++ * have access to the address space anymore. ++ * ++ * To remove a bond between process and container, VFIO_IOMMU_UNBIND ioctl is ++ * issued with the same parameters. If a pid was specified in VFIO_IOMMU_BIND, ++ * it should also be present for VFIO_IOMMU_UNBIND. Otherwise unbind the current ++ * task from the container. ++ */ ++struct vfio_iommu_type1_bind_process { ++ __u32 flags; ++#define VFIO_IOMMU_BIND_PID (1 << 0) ++ __u32 pasid; ++ __s32 pid; ++}; ++ ++/* ++ * Only mode supported at the moment is VFIO_IOMMU_BIND_PROCESS, which takes ++ * vfio_iommu_type1_bind_process in data. 
++ */ ++struct vfio_iommu_type1_bind { ++ __u32 argsz; ++ __u32 flags; ++#define VFIO_IOMMU_BIND_PROCESS (1 << 0) ++ __u8 data[]; ++}; ++ ++/* ++ * VFIO_IOMMU_BIND - _IOWR(VFIO_TYPE, VFIO_BASE + 22, struct vfio_iommu_bind) ++ * ++ * Manage address spaces of devices in this container. Initially a TYPE1 ++ * container can only have one address space, managed with ++ * VFIO_IOMMU_MAP/UNMAP_DMA. ++ * ++ * An IOMMU of type VFIO_TYPE1_NESTING_IOMMU can be managed by both MAP/UNMAP ++ * and BIND ioctls at the same time. MAP/UNMAP acts on the stage-2 (host) page ++ * tables, and BIND manages the stage-1 (guest) page tables. Other types of ++ * IOMMU may allow MAP/UNMAP and BIND to coexist, where MAP/UNMAP controls ++ * non-PASID traffic and BIND controls PASID traffic. But this depends on the ++ * underlying IOMMU architecture and isn't guaranteed. ++ * ++ * Availability of this feature depends on the device, its bus, the underlying ++ * IOMMU and the CPU architecture. ++ * ++ * returns: 0 on success, -errno on failure. ++ */ ++#define VFIO_IOMMU_BIND _IO(VFIO_TYPE, VFIO_BASE + 22) ++ ++/* ++ * VFIO_IOMMU_UNBIND - _IOWR(VFIO_TYPE, VFIO_BASE + 23, struct vfio_iommu_bind) ++ * ++ * Undo what was done by the corresponding VFIO_IOMMU_BIND ioctl. ++ */ ++#define VFIO_IOMMU_UNBIND _IO(VFIO_TYPE, VFIO_BASE + 23) ++ ++/* ++ * VFIO_IOMMU_SET_PASID_TABLE - _IOWR(VFIO_TYPE, VFIO_BASE + 18, ++ * struct vfio_iommu_type1_set_pasid_table) ++ * ++ * The SET operation passes a PASID table to the host while the ++ * UNSET operation detaches the one currently programmed. It is ++ * allowed to "SET" the table several times without unsetting as ++ * long as the table config does not stay IOMMU_PASID_CONFIG_TRANSLATE. 
++ */ ++struct vfio_iommu_type1_set_pasid_table { ++ __u32 argsz; ++ __u32 flags; ++#define VFIO_PASID_TABLE_FLAG_SET (1 << 0) ++#define VFIO_PASID_TABLE_FLAG_UNSET (1 << 1) ++ struct iommu_pasid_table_config config; /* used on SET */ ++}; ++ ++#define VFIO_IOMMU_SET_PASID_TABLE _IO(VFIO_TYPE, VFIO_BASE + 18) ++ ++/** ++ * VFIO_IOMMU_CACHE_INVALIDATE - _IOWR(VFIO_TYPE, VFIO_BASE + 19, ++ * struct vfio_iommu_type1_cache_invalidate) ++ * ++ * Propagate guest IOMMU cache invalidation to the host. ++ */ ++struct vfio_iommu_type1_cache_invalidate { ++ __u32 argsz; ++ __u32 flags; ++ struct iommu_cache_invalidate_info info; ++}; ++#define VFIO_IOMMU_CACHE_INVALIDATE _IO(VFIO_TYPE, VFIO_BASE + 19) ++ ++/** ++ * VFIO_IOMMU_SET_MSI_BINDING - _IOWR(VFIO_TYPE, VFIO_BASE + 20, ++ * struct vfio_iommu_type1_set_msi_binding) ++ * ++ * Pass a stage 1 MSI doorbell mapping to the host so that this ++ * latter can build a nested stage2 mapping. Or conversely tear ++ * down a previously bound stage 1 MSI binding. ++ */ ++struct vfio_iommu_type1_set_msi_binding { ++ __u32 argsz; ++ __u32 flags; ++#define VFIO_IOMMU_BIND_MSI (1 << 0) ++#define VFIO_IOMMU_UNBIND_MSI (1 << 1) ++ __u64 iova; /* MSI guest IOVA */ ++ /* Fields below are used on BIND */ ++ __u64 gpa; /* MSI guest physical address */ ++ __u64 size; /* size of stage1 mapping (bytes) */ ++}; ++#define VFIO_IOMMU_SET_MSI_BINDING _IO(VFIO_TYPE, VFIO_BASE + 20) ++ + /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */ + + /* +-- +2.27.0 + -- Gitee From 1d34d98c8a5c58e77e550c89ffc67dbea6f5b3de Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Tue, 4 Sep 2018 08:43:05 -0400 Subject: [PATCH 21/56] memory: Add new fields in IOTLBEntry The current IOTLBEntry becomes too simple to interact with some physical IOMMUs. IOTLBs can be invalidated with different granularities: domain, pasid, addr. Current IOTLB entry only offers page selective invalidation. Let's add a granularity field that conveys this information. 
TLB entries are usually tagged with some ids such as the asid or pasid. When propagating an invalidation command from the guest to the host, we need to pass those IDs. Also we add a leaf field which indicates, in case of invalidation notification, whether only cache entries for the last level of translation are required to be invalidated. A flag field is introduced to inform whether those fields are set. To enforce all existing users do not use those new fields, initialize the IOMMUTLBEvents when needed. Signed-off-by: Eric Auger Signed-off-by: Kunkun Jiang --- memory-Add-new-fields-in-IOTLBEntry.patch | 184 ++++++++++++++++++++++ 1 file changed, 184 insertions(+) create mode 100644 memory-Add-new-fields-in-IOTLBEntry.patch diff --git a/memory-Add-new-fields-in-IOTLBEntry.patch b/memory-Add-new-fields-in-IOTLBEntry.patch new file mode 100644 index 00000000..5a85dbfd --- /dev/null +++ b/memory-Add-new-fields-in-IOTLBEntry.patch @@ -0,0 +1,184 @@ +From da97cef20d4ee5a8f3942953836b35e7f7dd974f Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Tue, 4 Sep 2018 08:43:05 -0400 +Subject: [PATCH] memory: Add new fields in IOTLBEntry + +The current IOTLBEntry becomes too simple to interact with +some physical IOMMUs. IOTLBs can be invalidated with different +granularities: domain, pasid, addr. Current IOTLB entry only offers +page selective invalidation. Let's add a granularity field +that conveys this information. + +TLB entries are usually tagged with some ids such as the asid +or pasid. When propagating an invalidation command from the +guest to the host, we need to pass those IDs. + +Also we add a leaf field which indicates, in case of invalidation +notification, whether only cache entries for the last level of +translation are required to be invalidated. + +A flag field is introduced to inform whether those fields are set. + +To enforce all existing users do not use those new fields, +initialize the IOMMUTLBEvents when needed. 
+ +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/arm/smmu-common.c | 2 +- + hw/arm/smmuv3.c | 2 +- + hw/i386/intel_iommu.c | 6 +++--- + hw/ppc/spapr_iommu.c | 2 +- + hw/virtio/virtio-iommu.c | 4 ++-- + include/exec/memory.h | 36 +++++++++++++++++++++++++++++++++++- + 6 files changed, 43 insertions(+), 9 deletions(-) + +diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c +index 0459850a93..3a1ecf81d6 100644 +--- a/hw/arm/smmu-common.c ++++ b/hw/arm/smmu-common.c +@@ -470,7 +470,7 @@ IOMMUMemoryRegion *smmu_iommu_mr(SMMUState *s, uint32_t sid) + /* Unmap the whole notifier's range */ + static void smmu_unmap_notifier_range(IOMMUNotifier *n) + { +- IOMMUTLBEvent event; ++ IOMMUTLBEvent event = {}; + + event.type = IOMMU_NOTIFIER_UNMAP; + event.entry.target_as = &address_space_memory; +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index 01b60bee49..94e2c658f8 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -802,7 +802,7 @@ static void smmuv3_notify_iova(IOMMUMemoryRegion *mr, + uint8_t tg, uint64_t num_pages) + { + SMMUDevice *sdev = container_of(mr, SMMUDevice, iommu); +- IOMMUTLBEvent event; ++ IOMMUTLBEvent event = {}; + uint8_t granule; + + if (!tg) { +diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c +index f584449d8d..fae282ef5e 100644 +--- a/hw/i386/intel_iommu.c ++++ b/hw/i386/intel_iommu.c +@@ -1193,7 +1193,7 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start, + uint32_t offset; + uint64_t slpte; + uint64_t subpage_size, subpage_mask; +- IOMMUTLBEvent event; ++ IOMMUTLBEvent event = {}; + uint64_t iova = start; + uint64_t iova_next; + int ret = 0; +@@ -2425,7 +2425,7 @@ static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s, + VTDInvDesc *inv_desc) + { + VTDAddressSpace *vtd_dev_as; +- IOMMUTLBEvent event; ++ IOMMUTLBEvent event = {}; + struct VTDBus *vtd_bus; + hwaddr addr; + uint64_t sz; +@@ -3481,7 +3481,7 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n) + size = 
remain = end - start + 1; + + while (remain >= VTD_PAGE_SIZE) { +- IOMMUTLBEvent event; ++ IOMMUTLBEvent event = {}; + uint64_t mask = dma_aligned_pow2_mask(start, end, s->aw_bits); + uint64_t size = mask + 1; + +diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c +index db01071858..454df25d44 100644 +--- a/hw/ppc/spapr_iommu.c ++++ b/hw/ppc/spapr_iommu.c +@@ -449,7 +449,7 @@ static void spapr_tce_reset(DeviceState *dev) + static target_ulong put_tce_emu(SpaprTceTable *tcet, target_ulong ioba, + target_ulong tce) + { +- IOMMUTLBEvent event; ++ IOMMUTLBEvent event = {}; + hwaddr page_mask = IOMMU_PAGE_MASK(tcet->page_shift); + unsigned long index = (ioba - tcet->bus_offset) >> tcet->page_shift; + +diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c +index 1b23e8e18c..83ed2b82e6 100644 +--- a/hw/virtio/virtio-iommu.c ++++ b/hw/virtio/virtio-iommu.c +@@ -129,7 +129,7 @@ static void virtio_iommu_notify_map(IOMMUMemoryRegion *mr, hwaddr virt_start, + hwaddr virt_end, hwaddr paddr, + uint32_t flags) + { +- IOMMUTLBEvent event; ++ IOMMUTLBEvent event = {}; + IOMMUAccessFlags perm = IOMMU_ACCESS_FLAG(flags & VIRTIO_IOMMU_MAP_F_READ, + flags & VIRTIO_IOMMU_MAP_F_WRITE); + +@@ -154,7 +154,7 @@ static void virtio_iommu_notify_map(IOMMUMemoryRegion *mr, hwaddr virt_start, + static void virtio_iommu_notify_unmap(IOMMUMemoryRegion *mr, hwaddr virt_start, + hwaddr virt_end) + { +- IOMMUTLBEvent event; ++ IOMMUTLBEvent event = {}; + uint64_t delta = virt_end - virt_start; + + if (!(mr->iommu_notify_flags & IOMMU_NOTIFIER_UNMAP)) { +diff --git a/include/exec/memory.h b/include/exec/memory.h +index 20f1b27377..c3180075e1 100644 +--- a/include/exec/memory.h ++++ b/include/exec/memory.h +@@ -113,14 +113,48 @@ typedef enum { + IOMMU_RW = 3, + } IOMMUAccessFlags; + ++/* Granularity of the cache invalidation */ ++typedef enum { ++ IOMMU_INV_GRAN_ADDR = 0, ++ IOMMU_INV_GRAN_PASID, ++ IOMMU_INV_GRAN_DOMAIN, ++} IOMMUInvGranularity; ++ + #define IOMMU_ACCESS_FLAG(r, w) 
(((r) ? IOMMU_RO : 0) | ((w) ? IOMMU_WO : 0)) + ++/** ++ * struct IOMMUTLBEntry - IOMMU TLB entry ++ * ++ * Structure used when performing a translation or when notifying MAP or ++ * UNMAP (invalidation) events ++ * ++ * @target_as: target address space ++ * @iova: IO virtual address (input) ++ * @translated_addr: translated address (output) ++ * @addr_mask: address mask (0xfff means 4K binding), must be multiple of 2 ++ * @perm: permission flag of the mapping (NONE encodes no mapping or ++ * invalidation notification) ++ * @granularity: granularity of the invalidation ++ * @flags: informs whether the following fields are set ++ * @arch_id: architecture specific ID tagging the TLB ++ * @pasid: PASID tagging the TLB ++ * @leaf: when @perm is NONE, indicates whether only caches for the last ++ * level of translation need to be invalidated. ++ */ + struct IOMMUTLBEntry { + AddressSpace *target_as; + hwaddr iova; + hwaddr translated_addr; +- hwaddr addr_mask; /* 0xfff = 4k translation */ ++ hwaddr addr_mask; + IOMMUAccessFlags perm; ++ IOMMUInvGranularity granularity; ++#define IOMMU_INV_FLAGS_PASID (1 << 0) ++#define IOMMU_INV_FLAGS_ARCHID (1 << 1) ++#define IOMMU_INV_FLAGS_LEAF (1 << 2) ++ uint32_t flags; ++ uint32_t arch_id; ++ uint32_t pasid; ++ bool leaf; + }; + + /* +-- +2.27.0 + -- Gitee From d905e3bb9b63d607648a8a17c30a23adc0e73c4f Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Sun, 14 Feb 2021 12:30:57 -0500 Subject: [PATCH 22/56] hw/arm/smmuv3: Improve stage1 ASID invalidation At the moment ASID invalidation command (CMD_TLBI_NH_ASID) is propagated as a domain invalidation (the whole notifier range is invalidated independently on any ASID information). The new granularity field now allows to be more precise and restrict the invalidation to a peculiar ASID. Set the corresponding fields and flag. We still keep the iova and addr_mask settings for consumers that do not support the new fields, like VHOST. 
Signed-off-by: Eric Auger Signed-off-by: Kunkun Jiang --- ...uv3-Improve-stage1-ASID-invalidation.patch | 107 ++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 hw-arm-smmuv3-Improve-stage1-ASID-invalidation.patch diff --git a/hw-arm-smmuv3-Improve-stage1-ASID-invalidation.patch b/hw-arm-smmuv3-Improve-stage1-ASID-invalidation.patch new file mode 100644 index 00000000..505bec39 --- /dev/null +++ b/hw-arm-smmuv3-Improve-stage1-ASID-invalidation.patch @@ -0,0 +1,107 @@ +From de53feaa37a267a21ed30a642e1e64c5fcfbc4a4 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Sun, 14 Feb 2021 12:30:57 -0500 +Subject: [PATCH] hw/arm/smmuv3: Improve stage1 ASID invalidation + +At the moment ASID invalidation command (CMD_TLBI_NH_ASID) is +propagated as a domain invalidation (the whole notifier range +is invalidated independently on any ASID information). + +The new granularity field now allows to be more precise and +restrict the invalidation to a peculiar ASID. Set the corresponding +fields and flag. + +We still keep the iova and addr_mask settings for consumers that +do not support the new fields, like VHOST. 
+ +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/arm/smmuv3.c | 44 ++++++++++++++++++++++++++++++++++++++++++-- + hw/arm/trace-events | 1 + + 2 files changed, 43 insertions(+), 2 deletions(-) + +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index 94e2c658f8..da5dac1ba5 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -836,6 +836,31 @@ static void smmuv3_notify_iova(IOMMUMemoryRegion *mr, + memory_region_notify_iommu_one(n, &event); + } + ++/** ++ * smmuv3_notify_asid - call the notifier @n for a given asid ++ * ++ * @mr: IOMMU mr region handle ++ * @n: notifier to be called ++ * @asid: address space ID or negative value if we don't care ++ */ ++static void smmuv3_notify_asid(IOMMUMemoryRegion *mr, ++ IOMMUNotifier *n, int asid) ++{ ++ IOMMUTLBEvent event = {}; ++ ++ event.type = IOMMU_NOTIFIER_UNMAP; ++ event.entry.target_as = &address_space_memory; ++ event.entry.perm = IOMMU_NONE; ++ event.entry.granularity = IOMMU_INV_GRAN_PASID; ++ event.entry.flags = IOMMU_INV_FLAGS_ARCHID; ++ event.entry.arch_id = asid; ++ event.entry.iova = n->start; ++ event.entry.addr_mask = n->end - n->start; ++ ++ memory_region_notify_iommu_one(n, &event); ++} ++ ++ + /* invalidate an asid/iova range tuple in all mr's */ + static void smmuv3_inv_notifiers_iova(SMMUState *s, int asid, dma_addr_t iova, + uint8_t tg, uint64_t num_pages) +@@ -913,6 +938,22 @@ smmuv3_invalidate_ste(gpointer key, gpointer value, gpointer user_data) + return true; + } + ++static void smmuv3_s1_asid_inval(SMMUState *s, uint16_t asid) ++{ ++ SMMUDevice *sdev; ++ ++ trace_smmuv3_s1_asid_inval(asid); ++ QLIST_FOREACH(sdev, &s->devices_with_notifiers, next) { ++ IOMMUMemoryRegion *mr = &sdev->iommu; ++ IOMMUNotifier *n; ++ ++ IOMMU_NOTIFIER_FOREACH(n, mr) { ++ smmuv3_notify_asid(mr, n, asid); ++ } ++ } ++ smmu_iotlb_inv_asid(s, asid); ++} ++ + static int smmuv3_cmdq_consume(SMMUv3State *s) + { + SMMUState *bs = ARM_SMMU(s); +@@ -1027,8 +1068,7 @@ static int 
smmuv3_cmdq_consume(SMMUv3State *s) + uint16_t asid = CMD_ASID(&cmd); + + trace_smmuv3_cmdq_tlbi_nh_asid(asid); +- smmu_inv_notifiers_all(&s->smmu_state); +- smmu_iotlb_inv_asid(bs, asid); ++ smmuv3_s1_asid_inval(bs, asid); + break; + } + case SMMU_CMD_TLBI_NH_ALL: +diff --git a/hw/arm/trace-events b/hw/arm/trace-events +index 2dee296c8f..1447ad5a90 100644 +--- a/hw/arm/trace-events ++++ b/hw/arm/trace-events +@@ -46,6 +46,7 @@ smmuv3_cmdq_cfgi_cd(uint32_t sid) "sid=0x%x" + smmuv3_config_cache_hit(uint32_t sid, uint32_t hits, uint32_t misses, uint32_t perc) "Config cache HIT for sid=0x%x (hits=%d, misses=%d, hit rate=%d)" + smmuv3_config_cache_miss(uint32_t sid, uint32_t hits, uint32_t misses, uint32_t perc) "Config cache MISS for sid=0x%x (hits=%d, misses=%d, hit rate=%d)" + smmuv3_s1_range_inval(int vmid, int asid, uint64_t addr, uint8_t tg, uint64_t num_pages, uint8_t ttl, bool leaf) "vmid=%d asid=%d addr=0x%"PRIx64" tg=%d num_pages=0x%"PRIx64" ttl=%d leaf=%d" ++smmuv3_s1_asid_inval(int asid) "asid=%d" + smmuv3_cmdq_tlbi_nh(void) "" + smmuv3_cmdq_tlbi_nh_asid(uint16_t asid) "asid=%d" + smmuv3_config_cache_inv(uint32_t sid) "Config cache INV for sid=0x%x" +-- +2.27.0 + -- Gitee From 84ba65f8afb7afefa869353b9511fa104aeb9ca2 Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Fri, 19 Mar 2021 12:22:48 -0400 Subject: [PATCH 23/56] hw/arm/smmu-common: Allow domain invalidation for NH_ALL/NSNH_ALL NH_ALL/NSNH_ALL corresponds to a domain granularity invalidation, ie. all the notifier range gets invalidation, whatever the ASID. So let's set the granularity to IOMMU_INV_GRAN_DOMAIN to allow the consumer to benefit from the info if it can. 
Signed-off-by: Eric Auger Suggested-by: chenxiang (M) Signed-off-by: Kunkun Jiang --- ...on-Allow-domain-invalidation-for-NH_.patch | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 hw-arm-smmu-common-Allow-domain-invalidation-for-NH_.patch diff --git a/hw-arm-smmu-common-Allow-domain-invalidation-for-NH_.patch b/hw-arm-smmu-common-Allow-domain-invalidation-for-NH_.patch new file mode 100644 index 00000000..d92b5edd --- /dev/null +++ b/hw-arm-smmu-common-Allow-domain-invalidation-for-NH_.patch @@ -0,0 +1,33 @@ +From 876d18c962f0ead31d8458cd7ac19178be78455c Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Fri, 19 Mar 2021 12:22:48 -0400 +Subject: [PATCH] hw/arm/smmu-common: Allow domain invalidation for + NH_ALL/NSNH_ALL + +NH_ALL/NSNH_ALL corresponds to a domain granularity invalidation, +ie. all the notifier range gets invalidation, whatever the ASID. +So let's set the granularity to IOMMU_INV_GRAN_DOMAIN to allow +the consumer to benefit from the info if it can. + +Signed-off-by: Eric Auger +Suggested-by: chenxiang (M) +Signed-off-by: Kunkun Jiang +--- + hw/arm/smmu-common.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c +index 3a1ecf81d6..2ec4222c93 100644 +--- a/hw/arm/smmu-common.c ++++ b/hw/arm/smmu-common.c +@@ -477,6 +477,7 @@ static void smmu_unmap_notifier_range(IOMMUNotifier *n) + event.entry.iova = n->start; + event.entry.perm = IOMMU_NONE; + event.entry.addr_mask = n->end - n->start; ++ event.entry.granularity = IOMMU_INV_GRAN_DOMAIN; + + memory_region_notify_iommu_one(n, &event); + } +-- +2.27.0 + -- Gitee From 0fe05cad27e7d1f6031f6dec4df5171f5af45352 Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Mon, 1 Jul 2019 11:30:30 +0200 Subject: [PATCH 24/56] memory: Add IOMMU_ATTR_VFIO_NESTED IOMMU memory region attribute We introduce a new IOMMU Memory Region attribute, IOMMU_ATTR_VFIO_NESTED that tells whether the virtual IOMMU requires HW nested paging for VFIO integration. 
Current Intel virtual IOMMU device supports "Caching Mode" and does not require 2 stages at physical level to be integrated with VFIO. However SMMUv3 does not implement such "caching mode" and requires to use HW nested paging. As such SMMUv3 is the first IOMMU device to advertise this attribute. Signed-off-by: Eric Auger Signed-off-by: Kunkun Jiang --- ..._ATTR_VFIO_NESTED-IOMMU-memory-regio.patch | 72 +++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 memory-Add-IOMMU_ATTR_VFIO_NESTED-IOMMU-memory-regio.patch diff --git a/memory-Add-IOMMU_ATTR_VFIO_NESTED-IOMMU-memory-regio.patch b/memory-Add-IOMMU_ATTR_VFIO_NESTED-IOMMU-memory-regio.patch new file mode 100644 index 00000000..bb272470 --- /dev/null +++ b/memory-Add-IOMMU_ATTR_VFIO_NESTED-IOMMU-memory-regio.patch @@ -0,0 +1,72 @@ +From b380e3e0c30fb68dbbfb1397f3c374adfff77ac4 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Mon, 1 Jul 2019 11:30:30 +0200 +Subject: [PATCH] memory: Add IOMMU_ATTR_VFIO_NESTED IOMMU memory region + attribute + +We introduce a new IOMMU Memory Region attribute, +IOMMU_ATTR_VFIO_NESTED that tells whether the virtual IOMMU +requires HW nested paging for VFIO integration. + +Current Intel virtual IOMMU device supports "Caching +Mode" and does not require 2 stages at physical level to be +integrated with VFIO. However SMMUv3 does not implement such +"caching mode" and requires to use HW nested paging. + +As such SMMUv3 is the first IOMMU device to advertise this +attribute. 
+ +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/arm/smmuv3.c | 12 ++++++++++++ + include/exec/memory.h | 3 ++- + 2 files changed, 14 insertions(+), 1 deletion(-) + +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index da5dac1ba5..9b87d16217 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -1589,6 +1589,17 @@ static int smmuv3_notify_flag_changed(IOMMUMemoryRegion *iommu, + return 0; + } + ++static int smmuv3_get_attr(IOMMUMemoryRegion *iommu, ++ enum IOMMUMemoryRegionAttr attr, ++ void *data) ++{ ++ if (attr == IOMMU_ATTR_VFIO_NESTED) { ++ *(bool *) data = true; ++ return 0; ++ } ++ return -EINVAL; ++} ++ + static void smmuv3_iommu_memory_region_class_init(ObjectClass *klass, + void *data) + { +@@ -1596,6 +1607,7 @@ static void smmuv3_iommu_memory_region_class_init(ObjectClass *klass, + + imrc->translate = smmuv3_translate; + imrc->notify_flag_changed = smmuv3_notify_flag_changed; ++ imrc->get_attr = smmuv3_get_attr; + } + + static const TypeInfo smmuv3_type_info = { +diff --git a/include/exec/memory.h b/include/exec/memory.h +index c3180075e1..864bcaeb01 100644 +--- a/include/exec/memory.h ++++ b/include/exec/memory.h +@@ -321,7 +321,8 @@ typedef struct MemoryRegionClass { + + + enum IOMMUMemoryRegionAttr { +- IOMMU_ATTR_SPAPR_TCE_FD ++ IOMMU_ATTR_SPAPR_TCE_FD, ++ IOMMU_ATTR_VFIO_NESTED, + }; + + /* +-- +2.27.0 + -- Gitee From 727b94e7380eafe9593bfc5afe0929983608428a Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Mon, 25 Mar 2019 16:35:05 +0100 Subject: [PATCH 25/56] memory: Add IOMMU_ATTR_MSI_TRANSLATE IOMMU memory region attribute We introduce a new IOMMU Memory Region attribute, IOMMU_ATTR_MSI_TRANSLATE which tells whether the virtual IOMMU translates MSIs. ARM SMMU will expose this attribute since, as opposed to Intel DMAR, MSIs are translated as any other DMA requests. 
Signed-off-by: Eric Auger Signed-off-by: Kunkun Jiang --- ..._ATTR_MSI_TRANSLATE-IOMMU-memory-reg.patch | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 memory-Add-IOMMU_ATTR_MSI_TRANSLATE-IOMMU-memory-reg.patch diff --git a/memory-Add-IOMMU_ATTR_MSI_TRANSLATE-IOMMU-memory-reg.patch b/memory-Add-IOMMU_ATTR_MSI_TRANSLATE-IOMMU-memory-reg.patch new file mode 100644 index 00000000..b06bc251 --- /dev/null +++ b/memory-Add-IOMMU_ATTR_MSI_TRANSLATE-IOMMU-memory-reg.patch @@ -0,0 +1,32 @@ +From 062923fd4e6d11e1b724f2dd059f8b0c6e65bf7a Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Mon, 25 Mar 2019 16:35:05 +0100 +Subject: [PATCH] memory: Add IOMMU_ATTR_MSI_TRANSLATE IOMMU memory region + attribute + +We introduce a new IOMMU Memory Region attribute, IOMMU_ATTR_MSI_TRANSLATE +which tells whether the virtual IOMMU translates MSIs. ARM SMMU +will expose this attribute since, as opposed to Intel DMAR, MSIs +are translated as any other DMA requests. + +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + include/exec/memory.h | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/include/exec/memory.h b/include/exec/memory.h +index 864bcaeb01..76ef99ed27 100644 +--- a/include/exec/memory.h ++++ b/include/exec/memory.h +@@ -323,6 +323,7 @@ typedef struct MemoryRegionClass { + enum IOMMUMemoryRegionAttr { + IOMMU_ATTR_SPAPR_TCE_FD, + IOMMU_ATTR_VFIO_NESTED, ++ IOMMU_ATTR_MSI_TRANSLATE, + }; + + /* +-- +2.27.0 + -- Gitee From 35e28ff4d119ea5427bef5dfe6773f72c132efe3 Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Thu, 13 Sep 2018 14:13:04 +0200 Subject: [PATCH 26/56] memory: Introduce IOMMU Memory Region inject_faults API This new API allows to inject @count iommu_faults into the IOMMU memory region. 
Signed-off-by: Eric Auger Signed-off-by: Kunkun Jiang --- ...-IOMMU-Memory-Region-inject_faults-A.patch | 88 +++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 memory-Introduce-IOMMU-Memory-Region-inject_faults-A.patch diff --git a/memory-Introduce-IOMMU-Memory-Region-inject_faults-A.patch b/memory-Introduce-IOMMU-Memory-Region-inject_faults-A.patch new file mode 100644 index 00000000..e541d9e6 --- /dev/null +++ b/memory-Introduce-IOMMU-Memory-Region-inject_faults-A.patch @@ -0,0 +1,88 @@ +From d2dce19165f133935ff72e209f19bc43ab4d1421 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Thu, 13 Sep 2018 14:13:04 +0200 +Subject: [PATCH] memory: Introduce IOMMU Memory Region inject_faults API + +This new API allows to inject @count iommu_faults into +the IOMMU memory region. + +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + include/exec/memory.h | 24 ++++++++++++++++++++++++ + softmmu/memory.c | 10 ++++++++++ + 2 files changed, 34 insertions(+) + +diff --git a/include/exec/memory.h b/include/exec/memory.h +index 76ef99ed27..3e84d62e40 100644 +--- a/include/exec/memory.h ++++ b/include/exec/memory.h +@@ -103,6 +103,8 @@ struct MemoryRegionSection { + bool nonvolatile; + }; + ++struct iommu_fault; ++ + typedef struct IOMMUTLBEntry IOMMUTLBEntry; + + /* See address_space_translate: bit 0 is read, bit 1 is write. 
*/ +@@ -523,6 +525,19 @@ struct IOMMUMemoryRegionClass { + int (*iommu_set_page_size_mask)(IOMMUMemoryRegion *iommu, + uint64_t page_size_mask, + Error **errp); ++ ++ /* ++ * Inject @count faults into the IOMMU memory region ++ * ++ * Optional method: if this method is not provided, then ++ * memory_region_injection_faults() will return -ENOENT ++ * ++ * @iommu: the IOMMU memory region to inject the faults in ++ * @count: number of faults to inject ++ * @buf: fault buffer ++ */ ++ int (*inject_faults)(IOMMUMemoryRegion *iommu, int count, ++ struct iommu_fault *buf); + }; + + typedef struct RamDiscardListener RamDiscardListener; +@@ -1819,6 +1834,15 @@ int memory_region_iommu_num_indexes(IOMMUMemoryRegion *iommu_mr); + int memory_region_iommu_set_page_size_mask(IOMMUMemoryRegion *iommu_mr, + uint64_t page_size_mask, + Error **errp); ++/** ++ * memory_region_inject_faults : inject @count faults stored in @buf ++ * ++ * @iommu_mr: the IOMMU memory region ++ * @count: number of faults to be injected ++ * @buf: buffer containing the faults ++ */ ++int memory_region_inject_faults(IOMMUMemoryRegion *iommu_mr, int count, ++ struct iommu_fault *buf); + + /** + * memory_region_name: get a memory region's name +diff --git a/softmmu/memory.c b/softmmu/memory.c +index 7340e19ff5..9f98209ab2 100644 +--- a/softmmu/memory.c ++++ b/softmmu/memory.c +@@ -2111,6 +2111,16 @@ void ram_discard_manager_unregister_listener(RamDiscardManager *rdm, + rdmc->unregister_listener(rdm, rdl); + } + ++int memory_region_inject_faults(IOMMUMemoryRegion *iommu_mr, int count, ++ struct iommu_fault *buf) ++{ ++ IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_GET_CLASS(iommu_mr); ++ if (!imrc->inject_faults) { ++ return -ENOENT; ++ } ++ return imrc->inject_faults(iommu_mr, count, buf); ++} ++ + void memory_region_set_log(MemoryRegion *mr, bool log, unsigned client) + { + uint8_t mask = 1 << client; +-- +2.27.0 + -- Gitee From 8e33e36be42a41171a5f9ba046a72f75875e29ba Mon Sep 17 00:00:00 2001 From: 
Chen Qun Date: Tue, 9 Jul 2019 12:20:12 +0200 Subject: [PATCH 27/56] iommu: Introduce generic header This header is meant to expose data types used by several IOMMU devices such as struct for SVA and nested stage configuration. Signed-off-by: Eric Auger Signed-off-by: Kunkun Jiang --- iommu-Introduce-generic-header.patch | 53 ++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 iommu-Introduce-generic-header.patch diff --git a/iommu-Introduce-generic-header.patch b/iommu-Introduce-generic-header.patch new file mode 100644 index 00000000..84f3d77c --- /dev/null +++ b/iommu-Introduce-generic-header.patch @@ -0,0 +1,53 @@ +From 5e312f7b41ec48dc7dc9805af9f52aa8ed393bf9 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Tue, 9 Jul 2019 12:20:12 +0200 +Subject: [PATCH] iommu: Introduce generic header + +This header is meant to expose data types used by +several IOMMU devices such as struct for SVA and +nested stage configuration. + +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + include/hw/iommu/iommu.h | 28 ++++++++++++++++++++++++++++ + 1 file changed, 28 insertions(+) + create mode 100644 include/hw/iommu/iommu.h + +diff --git a/include/hw/iommu/iommu.h b/include/hw/iommu/iommu.h +new file mode 100644 +index 0000000000..12092bda7b +--- /dev/null ++++ b/include/hw/iommu/iommu.h +@@ -0,0 +1,28 @@ ++/* ++ * common header for iommu devices ++ * ++ * Copyright Red Hat, Inc. 2019 ++ * ++ * Authors: ++ * Eric Auger ++ * ++ * This work is licensed under the terms of the GNU GPL, version 2. See ++ * the COPYING file in the top-level directory.
++ */ ++ ++#ifndef QEMU_HW_IOMMU_IOMMU_H ++#define QEMU_HW_IOMMU_IOMMU_H ++#ifdef __linux__ ++#include <linux/iommu.h> ++#endif ++ ++typedef struct IOMMUConfig { ++ union { ++#ifdef __linux__ ++ struct iommu_pasid_table_config pasid_cfg; ++#endif ++ }; ++} IOMMUConfig; ++ ++ ++#endif /* QEMU_HW_IOMMU_IOMMU_H */ +-- +2.27.0 + -- Gitee From fdf9a8f9886f0c89f962d28e496780a58cb8676c Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Fri, 5 Jul 2019 19:01:36 +0800 Subject: [PATCH 28/56] pci: introduce PCIPASIDOps to PCIDevice This patch introduces PCIPASIDOps for IOMMU related operations. https://lists.gnu.org/archive/html/qemu-devel/2018-03/msg00078.html https://lists.gnu.org/archive/html/qemu-devel/2018-03/msg00940.html So far, to setup virt-SVA for assigned SVA capable device, needs to configure host translation structures for specific pasid. (e.g. bind guest page table to host and enable nested translation in host). Besides, vIOMMU emulator needs to forward guest's cache invalidation to host since host nested translation is enabled. e.g. on VT-d, guest owns 1st level translation table, thus cache invalidation for 1st level should be propagated to host. This patch adds two functions: alloc_pasid and free_pasid to support guest pasid allocation and free. The implementations of the callbacks would be device passthru modules. Like vfio.
Cc: Kevin Tian Cc: Jacob Pan Cc: Peter Xu Cc: Eric Auger Cc: Yi Sun Cc: David Gibson Signed-off-by: Liu Yi L Signed-off-by: Yi Sun Signed-off-by: Kunkun Jiang --- pci-introduce-PCIPASIDOps-to-PCIDevice.patch | 127 +++++++++++++++++++ 1 file changed, 127 insertions(+) create mode 100644 pci-introduce-PCIPASIDOps-to-PCIDevice.patch diff --git a/pci-introduce-PCIPASIDOps-to-PCIDevice.patch b/pci-introduce-PCIPASIDOps-to-PCIDevice.patch new file mode 100644 index 00000000..f4cb6116 --- /dev/null +++ b/pci-introduce-PCIPASIDOps-to-PCIDevice.patch @@ -0,0 +1,127 @@ +From c71485494970e7aa986be2b05bf7e2847017e264 Mon Sep 17 00:00:00 2001 +From: Liu Yi L +Date: Fri, 5 Jul 2019 19:01:36 +0800 +Subject: [PATCH] pci: introduce PCIPASIDOps to PCIDevice + +This patch introduces PCIPASIDOps for IOMMU related operations. + +https://lists.gnu.org/archive/html/qemu-devel/2018-03/msg00078.html +https://lists.gnu.org/archive/html/qemu-devel/2018-03/msg00940.html + +So far, to setup virt-SVA for assigned SVA capable device, needs to +configure host translation structures for specific pasid. (e.g. bind +guest page table to host and enable nested translation in host). +Besides, vIOMMU emulator needs to forward guest's cache invalidation +to host since host nested translation is enabled. e.g. on VT-d, guest +owns 1st level translation table, thus cache invalidation for 1st +level should be propagated to host. + +This patch adds two functions: alloc_pasid and free_pasid to support +guest pasid allocation and free. The implementations of the callbacks +would be device passthru modules. Like vfio. 
+ +Cc: Kevin Tian +Cc: Jacob Pan +Cc: Peter Xu +Cc: Eric Auger +Cc: Yi Sun +Cc: David Gibson +Signed-off-by: Liu Yi L +Signed-off-by: Yi Sun +Signed-off-by: Kunkun Jiang +--- + hw/pci/pci.c | 34 ++++++++++++++++++++++++++++++++++ + include/hw/pci/pci.h | 11 +++++++++++ + 2 files changed, 45 insertions(+) + +diff --git a/hw/pci/pci.c b/hw/pci/pci.c +index e5993c1ef5..4a9374c025 100644 +--- a/hw/pci/pci.c ++++ b/hw/pci/pci.c +@@ -2759,6 +2759,40 @@ void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, void *opaque) + bus->iommu_opaque = opaque; + } + ++void pci_setup_pasid_ops(PCIDevice *dev, PCIPASIDOps *ops) ++{ ++ assert(ops && !dev->pasid_ops); ++ dev->pasid_ops = ops; ++} ++ ++bool pci_device_is_pasid_ops_set(PCIBus *bus, int32_t devfn) ++{ ++ PCIDevice *dev; ++ ++ if (!bus) { ++ return false; ++ } ++ ++ dev = bus->devices[devfn]; ++ return !!(dev && dev->pasid_ops); ++} ++ ++int pci_device_set_pasid_table(PCIBus *bus, int32_t devfn, ++ IOMMUConfig *config) ++{ ++ PCIDevice *dev; ++ ++ if (!bus) { ++ return -EINVAL; ++ } ++ ++ dev = bus->devices[devfn]; ++ if (dev && dev->pasid_ops && dev->pasid_ops->set_pasid_table) { ++ return dev->pasid_ops->set_pasid_table(bus, devfn, config); ++ } ++ return -ENOENT; ++} ++ + static void pci_dev_get_w64(PCIBus *b, PCIDevice *dev, void *opaque) + { + Range *range = opaque; +diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h +index e7cdf2d5ec..abffa12a99 100644 +--- a/include/hw/pci/pci.h ++++ b/include/hw/pci/pci.h +@@ -9,6 +9,7 @@ + + #include "hw/pci/pcie.h" + #include "qom/object.h" ++#include "hw/iommu/iommu.h" + + extern bool pci_available; + +@@ -265,6 +266,11 @@ struct PCIReqIDCache { + }; + typedef struct PCIReqIDCache PCIReqIDCache; + ++struct PCIPASIDOps { ++ int (*set_pasid_table)(PCIBus *bus, int32_t devfn, IOMMUConfig *config); ++}; ++typedef struct PCIPASIDOps PCIPASIDOps; ++ + struct PCIDevice { + DeviceState qdev; + bool partially_hotplugged; +@@ -361,6 +367,7 @@ struct PCIDevice { + /* ID of standby device 
in net_failover pair */ + char *failover_pair_id; + uint32_t acpi_index; ++ PCIPASIDOps *pasid_ops; + }; + + void pci_register_bar(PCIDevice *pci_dev, int region_num, +@@ -498,6 +505,10 @@ typedef AddressSpace *(*PCIIOMMUFunc)(PCIBus *, void *, int); + AddressSpace *pci_device_iommu_address_space(PCIDevice *dev); + void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, void *opaque); + ++void pci_setup_pasid_ops(PCIDevice *dev, PCIPASIDOps *ops); ++bool pci_device_is_pasid_ops_set(PCIBus *bus, int32_t devfn); ++int pci_device_set_pasid_table(PCIBus *bus, int32_t devfn, IOMMUConfig *config); ++ + static inline void + pci_set_byte(uint8_t *config, uint8_t val) + { +-- +2.27.0 + -- Gitee From 985689746cd720332a318c1ea3878cd90eebf1d7 Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Tue, 28 Aug 2018 16:16:20 +0200 Subject: [PATCH 29/56] vfio: Force nested if iommu requires it In case we detect the address space is translated by a virtual IOMMU which requires HW nested paging to integrate with VFIO, let's set up the container with the VFIO_TYPE1_NESTING_IOMMU iommu_type. Signed-off-by: Eric Auger Signed-off-by: Kunkun Jiang --- vfio-Force-nested-if-iommu-requires-it.patch | 101 +++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 vfio-Force-nested-if-iommu-requires-it.patch diff --git a/vfio-Force-nested-if-iommu-requires-it.patch b/vfio-Force-nested-if-iommu-requires-it.patch new file mode 100644 index 00000000..d580ae50 --- /dev/null +++ b/vfio-Force-nested-if-iommu-requires-it.patch @@ -0,0 +1,101 @@ +From e7eef5af743a53f0415267ebe9bba2e5f0e05816 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Tue, 28 Aug 2018 16:16:20 +0200 +Subject: [PATCH] vfio: Force nested if iommu requires it + +In case we detect the address space is translated by +a virtual IOMMU which requires HW nested paging to +integrate with VFIO, let's set up the container with +the VFIO_TYPE1_NESTING_IOMMU iommu_type. 
+ +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/vfio/common.c | 36 ++++++++++++++++++++++++++++-------- + 1 file changed, 28 insertions(+), 8 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index 6cb91e7ffd..d7533637c9 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -2045,27 +2045,38 @@ static void vfio_put_address_space(VFIOAddressSpace *space) + * vfio_get_iommu_type - selects the richest iommu_type (v2 first) + */ + static int vfio_get_iommu_type(VFIOContainer *container, ++ bool want_nested, + Error **errp) + { +- int iommu_types[] = { VFIO_TYPE1v2_IOMMU, VFIO_TYPE1_IOMMU, ++ int iommu_types[] = { VFIO_TYPE1_NESTING_IOMMU, ++ VFIO_TYPE1v2_IOMMU, VFIO_TYPE1_IOMMU, + VFIO_SPAPR_TCE_v2_IOMMU, VFIO_SPAPR_TCE_IOMMU }; +- int i; ++ int i, ret = -EINVAL; + + for (i = 0; i < ARRAY_SIZE(iommu_types); i++) { + if (ioctl(container->fd, VFIO_CHECK_EXTENSION, iommu_types[i])) { +- return iommu_types[i]; ++ if (iommu_types[i] == VFIO_TYPE1_NESTING_IOMMU && !want_nested) { ++ continue; ++ } ++ ret = iommu_types[i]; ++ break; + } + } +- error_setg(errp, "No available IOMMU models"); +- return -EINVAL; ++ if (ret < 0) { ++ error_setg(errp, "No available IOMMU models"); ++ } else if (want_nested && ret != VFIO_TYPE1_NESTING_IOMMU) { ++ error_setg(errp, "Nested mode requested but not supported"); ++ ret = -EINVAL; ++ } ++ return ret; + } + + static int vfio_init_container(VFIOContainer *container, int group_fd, +- Error **errp) ++ bool want_nested, Error **errp) + { + int iommu_type, dirty_log_manual_clear, ret; + +- iommu_type = vfio_get_iommu_type(container, errp); ++ iommu_type = vfio_get_iommu_type(container, want_nested, errp); + if (iommu_type < 0) { + return iommu_type; + } +@@ -2177,6 +2188,14 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, + VFIOContainer *container; + int ret, fd; + VFIOAddressSpace *space; ++ IOMMUMemoryRegion *iommu_mr; ++ bool nested = false; ++ ++ if 
(memory_region_is_iommu(as->root)) { ++ iommu_mr = IOMMU_MEMORY_REGION(as->root); ++ memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_VFIO_NESTED, ++ (void *)&nested); ++ } + + space = vfio_get_address_space(as); + +@@ -2257,7 +2276,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, + QLIST_INIT(&container->vrdl_list); + QLIST_INIT(&container->dma_list); + +- ret = vfio_init_container(container, group->fd, errp); ++ ret = vfio_init_container(container, group->fd, nested, errp); + if (ret) { + goto free_container_exit; + } +@@ -2269,6 +2288,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, + } + + switch (container->iommu_type) { ++ case VFIO_TYPE1_NESTING_IOMMU: + case VFIO_TYPE1v2_IOMMU: + case VFIO_TYPE1_IOMMU: + { +-- +2.27.0 + -- Gitee From 421fe54648bb29ead64e64bdbaa847464bc8a165 Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Fri, 22 Mar 2019 18:05:23 +0100 Subject: [PATCH 30/56] vfio: Introduce hostwin_from_range helper Let's introduce a hostwin_from_range() helper that returns the hostwin encapsulating an IOVA range or NULL if none is found. This improves the readibility of callers and removes the usage of hostwin_found. Signed-off-by: Eric Auger Signed-off-by: Kunkun Jiang --- ...-Introduce-hostwin_from_range-helper.patch | 89 +++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 vfio-Introduce-hostwin_from_range-helper.patch diff --git a/vfio-Introduce-hostwin_from_range-helper.patch b/vfio-Introduce-hostwin_from_range-helper.patch new file mode 100644 index 00000000..c8c8ab76 --- /dev/null +++ b/vfio-Introduce-hostwin_from_range-helper.patch @@ -0,0 +1,89 @@ +From 85232739b4852f1a51dde58c9007ed0deb17c2f2 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Fri, 22 Mar 2019 18:05:23 +0100 +Subject: [PATCH] vfio: Introduce hostwin_from_range helper + +Let's introduce a hostwin_from_range() helper that returns the +hostwin encapsulating an IOVA range or NULL if none is found. 
+ +This improves the readability of callers and removes the usage +of hostwin_found. + +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/vfio/common.c | 36 +++++++++++++++++------------------- + 1 file changed, 17 insertions(+), 19 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index d7533637c9..d358789f19 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -909,6 +909,19 @@ static void vfio_unregister_ram_discard_listener(VFIOContainer *container, + g_free(vrdl); + } + ++static VFIOHostDMAWindow * ++hostwin_from_range(VFIOContainer *container, hwaddr iova, hwaddr end) ++{ ++ VFIOHostDMAWindow *hostwin; ++ ++ QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { ++ if (hostwin->min_iova <= iova && end <= hostwin->max_iova) { ++ return hostwin; ++ } ++ } ++ return NULL; ++} ++ + static void vfio_listener_region_add(MemoryListener *listener, + MemoryRegionSection *section) + { +@@ -918,7 +931,6 @@ static void vfio_listener_region_add(MemoryListener *listener, + void *vaddr; + int ret; + VFIOHostDMAWindow *hostwin; +- bool hostwin_found; + Error *err = NULL; + + if (vfio_listener_skipped_section(section)) { +@@ -1011,15 +1023,8 @@ static void vfio_listener_region_add(MemoryListener *listener, + #endif + } + +- hostwin_found = false; +- QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { +- if (hostwin->min_iova <= iova && end <= hostwin->max_iova) { +- hostwin_found = true; +- break; +- } +- } +- +- if (!hostwin_found) { ++ hostwin = hostwin_from_range(container, iova, end); ++ if (!hostwin) { + error_setg(&err, "Container %p can't map guest IOVA region" + " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container, iova, end); + goto fail; +@@ -1211,16 +1216,9 @@ static void vfio_listener_region_del(MemoryListener *listener, + + if (memory_region_is_ram_device(section->mr)) { + hwaddr pgmask; +- VFIOHostDMAWindow *hostwin; +- bool hostwin_found = false; ++ VFIOHostDMAWindow *hostwin = hostwin_from_range(container,
iova, end); + +- QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { +- if (hostwin->min_iova <= iova && end <= hostwin->max_iova) { +- hostwin_found = true; +- break; +- } +- } +- assert(hostwin_found); /* or region_add() would have failed */ ++ assert(hostwin); /* or region_add() would have failed */ + + pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1; + try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask)); +-- +2.27.0 + -- Gitee From 0ea8a70c731cfced01d00903829a450f55ba3e1f Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Thu, 30 Aug 2018 15:04:25 +0200 Subject: [PATCH 31/56] vfio: Introduce helpers to DMA map/unmap a RAM section Let's introduce two helpers that allow to DMA map/unmap a RAM section. Those helpers will be called for nested stage setup in another call site. Also the vfio_listener_region_add/del() structure may be clearer. Signed-off-by: Eric Auger Signed-off-by: Kunkun Jiang --- ...elpers-to-DMA-map-unmap-a-RAM-sectio.patch | 280 ++++++++++++++++++ 1 file changed, 280 insertions(+) create mode 100644 vfio-Introduce-helpers-to-DMA-map-unmap-a-RAM-sectio.patch diff --git a/vfio-Introduce-helpers-to-DMA-map-unmap-a-RAM-sectio.patch b/vfio-Introduce-helpers-to-DMA-map-unmap-a-RAM-sectio.patch new file mode 100644 index 00000000..fd6deffd --- /dev/null +++ b/vfio-Introduce-helpers-to-DMA-map-unmap-a-RAM-sectio.patch @@ -0,0 +1,280 @@ +From dab969657d8ff8b175856f91b035b74849cf69ba Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Thu, 30 Aug 2018 15:04:25 +0200 +Subject: [PATCH] vfio: Introduce helpers to DMA map/unmap a RAM section + +Let's introduce two helpers that allow to DMA map/unmap a RAM +section. Those helpers will be called for nested stage setup in +another call site. Also the vfio_listener_region_add/del() +structure may be clearer. 
+ +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/vfio/common.c | 206 +++++++++++++++++++++++++------------------ + hw/vfio/trace-events | 4 +- + 2 files changed, 123 insertions(+), 87 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index d358789f19..b3dc090840 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -922,13 +922,130 @@ hostwin_from_range(VFIOContainer *container, hwaddr iova, hwaddr end) + return NULL; + } + ++static int vfio_dma_map_ram_section(VFIOContainer *container, ++ MemoryRegionSection *section, Error **err) ++{ ++ VFIOHostDMAWindow *hostwin; ++ Int128 llend, llsize; ++ hwaddr iova, end; ++ void *vaddr; ++ int ret; ++ ++ assert(memory_region_is_ram(section->mr)); ++ ++ iova = TARGET_PAGE_ALIGN(section->offset_within_address_space); ++ llend = int128_make64(section->offset_within_address_space); ++ llend = int128_add(llend, section->size); ++ llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK)); ++ end = int128_get64(int128_sub(llend, int128_one())); ++ ++ vaddr = memory_region_get_ram_ptr(section->mr) + ++ section->offset_within_region + ++ (iova - section->offset_within_address_space); ++ ++ hostwin = hostwin_from_range(container, iova, end); ++ if (!hostwin) { ++ error_setg(err, "Container %p can't map guest IOVA region" ++ " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container, iova, end); ++ return -EFAULT; ++ } ++ ++ trace_vfio_dma_map_ram(iova, end, vaddr); ++ ++ llsize = int128_sub(llend, int128_make64(iova)); ++ ++ if (memory_region_is_ram_device(section->mr)) { ++ hwaddr pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1; ++ ++ if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) { ++ trace_vfio_listener_region_add_no_dma_map( ++ memory_region_name(section->mr), ++ section->offset_within_address_space, ++ int128_getlo(section->size), ++ pgmask + 1); ++ return 0; ++ } ++ } ++ ++ ret = vfio_dma_map(container, iova, int128_get64(llsize), ++ vaddr, section->readonly); ++ if (ret) { ++ 
error_setg(err, "vfio_dma_map(%p, 0x%"HWADDR_PRIx", " ++ "0x%"HWADDR_PRIx", %p) = %d (%m)", ++ container, iova, int128_get64(llsize), vaddr, ret); ++ if (memory_region_is_ram_device(section->mr)) { ++ /* Allow unexpected mappings not to be fatal for RAM devices */ ++ error_report_err(*err); ++ return 0; ++ } ++ return ret; ++ } ++ return 0; ++} ++ ++static void vfio_dma_unmap_ram_section(VFIOContainer *container, ++ MemoryRegionSection *section) ++{ ++ Int128 llend, llsize; ++ hwaddr iova, end; ++ bool try_unmap = true; ++ int ret; ++ ++ iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space); ++ llend = int128_make64(section->offset_within_address_space); ++ llend = int128_add(llend, section->size); ++ llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask)); ++ ++ if (int128_ge(int128_make64(iova), llend)) { ++ return; ++ } ++ end = int128_get64(int128_sub(llend, int128_one())); ++ ++ llsize = int128_sub(llend, int128_make64(iova)); ++ ++ trace_vfio_dma_unmap_ram(iova, end); ++ ++ if (memory_region_is_ram_device(section->mr)) { ++ hwaddr pgmask; ++ VFIOHostDMAWindow *hostwin = hostwin_from_range(container, iova, end); ++ ++ assert(hostwin); /* or region_add() would have failed */ ++ ++ pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1; ++ try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask)); ++ } else if (memory_region_has_ram_discard_manager(section->mr)) { ++ vfio_unregister_ram_discard_listener(container, section); ++ /* Unregistering will trigger an unmap. */ ++ try_unmap = false; ++ } ++ ++ if (try_unmap) { ++ if (int128_eq(llsize, int128_2_64())) { ++ /* The unmap ioctl doesn't accept a full 64-bit span. 
*/ ++ llsize = int128_rshift(llsize, 1); ++ ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL); ++ if (ret) { ++ error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " ++ "0x%"HWADDR_PRIx") = %d (%m)", ++ container, iova, int128_get64(llsize), ret); ++ } ++ iova += int128_get64(llsize); ++ } ++ ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL); ++ if (ret) { ++ error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " ++ "0x%"HWADDR_PRIx") = %d (%m)", ++ container, iova, int128_get64(llsize), ret); ++ } ++ } ++} ++ + static void vfio_listener_region_add(MemoryListener *listener, + MemoryRegionSection *section) + { + VFIOContainer *container = container_of(listener, VFIOContainer, listener); + hwaddr iova, end; +- Int128 llend, llsize; +- void *vaddr; ++ Int128 llend; + int ret; + VFIOHostDMAWindow *hostwin; + Error *err = NULL; +@@ -1092,38 +1209,7 @@ static void vfio_listener_region_add(MemoryListener *listener, + return; + } + +- vaddr = memory_region_get_ram_ptr(section->mr) + +- section->offset_within_region + +- (iova - section->offset_within_address_space); +- +- trace_vfio_listener_region_add_ram(iova, end, vaddr); +- +- llsize = int128_sub(llend, int128_make64(iova)); +- +- if (memory_region_is_ram_device(section->mr)) { +- hwaddr pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1; +- +- if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) { +- trace_vfio_listener_region_add_no_dma_map( +- memory_region_name(section->mr), +- section->offset_within_address_space, +- int128_getlo(section->size), +- pgmask + 1); +- return; +- } +- } +- +- ret = vfio_dma_map(container, iova, int128_get64(llsize), +- vaddr, section->readonly); +- if (ret) { +- error_setg(&err, "vfio_dma_map(%p, 0x%"HWADDR_PRIx", " +- "0x%"HWADDR_PRIx", %p) = %d (%m)", +- container, iova, int128_get64(llsize), vaddr, ret); +- if (memory_region_is_ram_device(section->mr)) { +- /* Allow unexpected mappings not to be fatal for RAM devices */ +- error_report_err(err); +- 
return; +- } ++ if (vfio_dma_map_ram_section(container, section, &err)) { + goto fail; + } + +@@ -1157,10 +1243,6 @@ static void vfio_listener_region_del(MemoryListener *listener, + MemoryRegionSection *section) + { + VFIOContainer *container = container_of(listener, VFIOContainer, listener); +- hwaddr iova, end; +- Int128 llend, llsize; +- int ret; +- bool try_unmap = true; + + if (vfio_listener_skipped_section(section)) { + trace_vfio_listener_region_del_skip( +@@ -1200,53 +1282,7 @@ static void vfio_listener_region_del(MemoryListener *listener, + */ + } + +- iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space); +- llend = int128_make64(section->offset_within_address_space); +- llend = int128_add(llend, section->size); +- llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask)); +- +- if (int128_ge(int128_make64(iova), llend)) { +- return; +- } +- end = int128_get64(int128_sub(llend, int128_one())); +- +- llsize = int128_sub(llend, int128_make64(iova)); +- +- trace_vfio_listener_region_del(iova, end); +- +- if (memory_region_is_ram_device(section->mr)) { +- hwaddr pgmask; +- VFIOHostDMAWindow *hostwin = hostwin_from_range(container, iova, end); +- +- assert(hostwin); /* or region_add() would have failed */ +- +- pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1; +- try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask)); +- } else if (memory_region_has_ram_discard_manager(section->mr)) { +- vfio_unregister_ram_discard_listener(container, section); +- /* Unregistering will trigger an unmap. */ +- try_unmap = false; +- } +- +- if (try_unmap) { +- if (int128_eq(llsize, int128_2_64())) { +- /* The unmap ioctl doesn't accept a full 64-bit span. 
*/ +- llsize = int128_rshift(llsize, 1); +- ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL); +- if (ret) { +- error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " +- "0x%"HWADDR_PRIx") = %d (%m)", +- container, iova, int128_get64(llsize), ret); +- } +- iova += int128_get64(llsize); +- } +- ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL); +- if (ret) { +- error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " +- "0x%"HWADDR_PRIx") = %d (%m)", +- container, iova, int128_get64(llsize), ret); +- } +- } ++ vfio_dma_unmap_ram_section(container, section); + + memory_region_unref(section->mr); + +diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events +index 0ef1b5f4a6..a37563a315 100644 +--- a/hw/vfio/trace-events ++++ b/hw/vfio/trace-events +@@ -99,10 +99,10 @@ vfio_iommu_map_notify(const char *op, uint64_t iova_start, uint64_t iova_end) "i + vfio_listener_region_add_skip(uint64_t start, uint64_t end) "SKIPPING region_add 0x%"PRIx64" - 0x%"PRIx64 + vfio_spapr_group_attach(int groupfd, int tablefd) "Attached groupfd %d to liobn fd %d" + vfio_listener_region_add_iommu(uint64_t start, uint64_t end) "region_add [iommu] 0x%"PRIx64" - 0x%"PRIx64 +-vfio_listener_region_add_ram(uint64_t iova_start, uint64_t iova_end, void *vaddr) "region_add [ram] 0x%"PRIx64" - 0x%"PRIx64" [%p]" ++vfio_dma_map_ram(uint64_t iova_start, uint64_t iova_end, void *vaddr) "region_add [ram] 0x%"PRIx64" - 0x%"PRIx64" [%p]" + vfio_listener_region_add_no_dma_map(const char *name, uint64_t iova, uint64_t size, uint64_t page_size) "Region \"%s\" 0x%"PRIx64" size=0x%"PRIx64" is not aligned to 0x%"PRIx64" and cannot be mapped for DMA" + vfio_listener_region_del_skip(uint64_t start, uint64_t end) "SKIPPING region_del 0x%"PRIx64" - 0x%"PRIx64 +-vfio_listener_region_del(uint64_t start, uint64_t end) "region_del 0x%"PRIx64" - 0x%"PRIx64 ++vfio_dma_unmap_ram(uint64_t start, uint64_t end) "region_del 0x%"PRIx64" - 0x%"PRIx64 + vfio_disconnect_container(int fd) "close 
container->fd=%d" + vfio_put_group(int fd) "close group->fd=%d" + vfio_get_device(const char * name, unsigned int flags, unsigned int num_regions, unsigned int num_irqs) "Device %s flags: %u, regions: %u, irqs: %u" +-- +2.27.0 + -- Gitee From b3c432fcc5a1706d2cc43500bed1714b2e39c5bd Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Wed, 29 Aug 2018 18:10:12 +0200 Subject: [PATCH 32/56] vfio: Set up nested stage mappings In nested mode, legacy vfio_iommu_map_notify cannot be used as there is no "caching" mode and we do not trap on map. On Intel, vfio_iommu_map_notify was used to DMA map the RAM through the host single stage. With nested mode, we need to setup the stage 2 and the stage 1 separately. This patch introduces a prereg_listener to setup the stage 2 mapping. The stage 1 mapping, owned by the guest, is passed to the host when the guest invalidates the stage 1 configuration, through a dedicated PCIPASIDOps callback. Guest IOTLB invalidations are cascaded downto the host through another IOMMU MR UNMAP notifier. Signed-off-by: Eric Auger Signed-off-by: Kunkun Jiang --- vfio-Set-up-nested-stage-mappings.patch | 281 ++++++++++++++++++++++++ 1 file changed, 281 insertions(+) create mode 100644 vfio-Set-up-nested-stage-mappings.patch diff --git a/vfio-Set-up-nested-stage-mappings.patch b/vfio-Set-up-nested-stage-mappings.patch new file mode 100644 index 00000000..c6d87f97 --- /dev/null +++ b/vfio-Set-up-nested-stage-mappings.patch @@ -0,0 +1,281 @@ +From 96581a5ee46e89dbc9e1ebe247b00adefb1c7a41 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Wed, 29 Aug 2018 18:10:12 +0200 +Subject: [PATCH] vfio: Set up nested stage mappings + +In nested mode, legacy vfio_iommu_map_notify cannot be used as +there is no "caching" mode and we do not trap on map. + +On Intel, vfio_iommu_map_notify was used to DMA map the RAM +through the host single stage. + +With nested mode, we need to setup the stage 2 and the stage 1 +separately. 
This patch introduces a prereg_listener to setup +the stage 2 mapping. + +The stage 1 mapping, owned by the guest, is passed to the host +when the guest invalidates the stage 1 configuration, through +a dedicated PCIPASIDOps callback. Guest IOTLB invalidations +are cascaded downto the host through another IOMMU MR UNMAP +notifier. + +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/vfio/common.c | 139 +++++++++++++++++++++++++++++++++++++++++-- + hw/vfio/pci.c | 21 +++++++ + hw/vfio/trace-events | 2 + + 3 files changed, 157 insertions(+), 5 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index b3dc090840..58f8a43a43 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -707,6 +707,73 @@ static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, + return true; + } + ++/* Propagate a guest IOTLB invalidation to the host (nested mode) */ ++static void vfio_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) ++{ ++ VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); ++ struct vfio_iommu_type1_cache_invalidate ustruct = {}; ++ VFIOContainer *container = giommu->container; ++ int ret; ++ ++ assert(iotlb->perm == IOMMU_NONE); ++ ++ ustruct.argsz = sizeof(ustruct); ++ ustruct.flags = 0; ++ ustruct.info.argsz = sizeof(struct iommu_cache_invalidate_info); ++ ustruct.info.version = IOMMU_CACHE_INVALIDATE_INFO_VERSION_1; ++ ustruct.info.cache = IOMMU_CACHE_INV_TYPE_IOTLB; ++ ++ switch (iotlb->granularity) { ++ case IOMMU_INV_GRAN_DOMAIN: ++ ustruct.info.granularity = IOMMU_INV_GRANU_DOMAIN; ++ break; ++ case IOMMU_INV_GRAN_PASID: ++ { ++ struct iommu_inv_pasid_info *pasid_info; ++ int archid = -1; ++ ++ pasid_info = &ustruct.info.granu.pasid_info; ++ ustruct.info.granularity = IOMMU_INV_GRANU_PASID; ++ if (iotlb->flags & IOMMU_INV_FLAGS_ARCHID) { ++ pasid_info->flags |= IOMMU_INV_ADDR_FLAGS_ARCHID; ++ archid = iotlb->arch_id; ++ } ++ pasid_info->archid = archid; ++ trace_vfio_iommu_asid_inv_iotlb(archid); ++ break; 
++ } ++ case IOMMU_INV_GRAN_ADDR: ++ { ++ hwaddr start = iotlb->iova + giommu->iommu_offset; ++ struct iommu_inv_addr_info *addr_info; ++ size_t size = iotlb->addr_mask + 1; ++ int archid = -1; ++ ++ addr_info = &ustruct.info.granu.addr_info; ++ ustruct.info.granularity = IOMMU_INV_GRANU_ADDR; ++ if (iotlb->leaf) { ++ addr_info->flags |= IOMMU_INV_ADDR_FLAGS_LEAF; ++ } ++ if (iotlb->flags & IOMMU_INV_FLAGS_ARCHID) { ++ addr_info->flags |= IOMMU_INV_ADDR_FLAGS_ARCHID; ++ archid = iotlb->arch_id; ++ } ++ addr_info->archid = archid; ++ addr_info->addr = start; ++ addr_info->granule_size = size; ++ addr_info->nb_granules = 1; ++ trace_vfio_iommu_addr_inv_iotlb(archid, start, size, ++ 1, iotlb->leaf); ++ break; ++ } ++ } ++ ++ ret = ioctl(container->fd, VFIO_IOMMU_CACHE_INVALIDATE, &ustruct); ++ if (ret) { ++ error_report("%p: failed to invalidate CACHE (%d)", container, ret); ++ } ++} ++ + static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) + { + VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); +@@ -1040,6 +1107,35 @@ static void vfio_dma_unmap_ram_section(VFIOContainer *container, + } + } + ++static void vfio_prereg_listener_region_add(MemoryListener *listener, ++ MemoryRegionSection *section) ++{ ++ VFIOContainer *container = ++ container_of(listener, VFIOContainer, prereg_listener); ++ Error *err = NULL; ++ ++ if (!memory_region_is_ram(section->mr)) { ++ return; ++ } ++ ++ vfio_dma_map_ram_section(container, section, &err); ++ if (err) { ++ error_report_err(err); ++ } ++} ++static void vfio_prereg_listener_region_del(MemoryListener *listener, ++ MemoryRegionSection *section) ++{ ++ VFIOContainer *container = ++ container_of(listener, VFIOContainer, prereg_listener); ++ ++ if (!memory_region_is_ram(section->mr)) { ++ return; ++ } ++ ++ vfio_dma_unmap_ram_section(container, section); ++} ++ + static void vfio_listener_region_add(MemoryListener *listener, + MemoryRegionSection *section) + { +@@ -1150,9 +1246,10 @@ static void 
vfio_listener_region_add(MemoryListener *listener, + memory_region_ref(section->mr); + + if (memory_region_is_iommu(section->mr)) { ++ IOMMUNotify notify; + VFIOGuestIOMMU *giommu; + IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr); +- int iommu_idx; ++ int iommu_idx, flags; + + trace_vfio_listener_region_add_iommu(iova, end); + /* +@@ -1171,8 +1268,18 @@ static void vfio_listener_region_add(MemoryListener *listener, + llend = int128_sub(llend, int128_one()); + iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr, + MEMTXATTRS_UNSPECIFIED); +- iommu_notifier_init(&giommu->n, vfio_iommu_map_notify, +- IOMMU_NOTIFIER_IOTLB_EVENTS, ++ ++ if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) { ++ /* IOTLB unmap notifier to propagate guest IOTLB invalidations */ ++ flags = IOMMU_NOTIFIER_UNMAP; ++ notify = vfio_iommu_unmap_notify; ++ } else { ++ /* MAP/UNMAP IOTLB notifier */ ++ flags = IOMMU_NOTIFIER_IOTLB_EVENTS; ++ notify = vfio_iommu_map_notify; ++ } ++ ++ iommu_notifier_init(&giommu->n, notify, flags, + section->offset_within_region, + int128_get64(llend), + iommu_idx); +@@ -1192,7 +1299,9 @@ static void vfio_listener_region_add(MemoryListener *listener, + goto fail; + } + QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next); +- memory_region_iommu_replay(giommu->iommu, &giommu->n); ++ if (flags & IOMMU_NOTIFIER_MAP) { ++ memory_region_iommu_replay(giommu->iommu, &giommu->n); ++ } + + return; + } +@@ -1672,10 +1781,16 @@ static const MemoryListener vfio_memory_listener = { + .log_clear = vfio_listener_log_clear, + }; + ++static MemoryListener vfio_memory_prereg_listener = { ++ .region_add = vfio_prereg_listener_region_add, ++ .region_del = vfio_prereg_listener_region_del, ++}; ++ + static void vfio_listener_release(VFIOContainer *container) + { + memory_listener_unregister(&container->listener); +- if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { ++ if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU || ++ container->iommu_type 
== VFIO_TYPE1_NESTING_IOMMU) { + memory_listener_unregister(&container->prereg_listener); + } + } +@@ -2351,6 +2466,20 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, + vfio_get_iommu_info_migration(container, info); + } + g_free(info); ++ ++ if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) { ++ container->prereg_listener = vfio_memory_prereg_listener; ++ memory_listener_register(&container->prereg_listener, ++ &address_space_memory); ++ if (container->error) { ++ memory_listener_unregister(&container->prereg_listener); ++ ret = -1; ++ error_propagate_prepend(errp, container->error, ++ "RAM memory listener initialization failed " ++ "for container"); ++ goto free_container_exit; ++ } ++ } + break; + } + case VFIO_SPAPR_TCE_v2_IOMMU: +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index 7b45353ce2..ae5e014e5d 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -2797,6 +2797,25 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev) + vdev->req_enabled = false; + } + ++static int vfio_iommu_set_pasid_table(PCIBus *bus, int32_t devfn, ++ IOMMUConfig *config) ++{ ++ PCIDevice *pdev = bus->devices[devfn]; ++ VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev); ++ VFIOContainer *container = vdev->vbasedev.group->container; ++ struct vfio_iommu_type1_set_pasid_table info; ++ ++ info.argsz = sizeof(info); ++ info.flags = VFIO_PASID_TABLE_FLAG_SET; ++ memcpy(&info.config, &config->pasid_cfg, sizeof(config->pasid_cfg)); ++ ++ return ioctl(container->fd, VFIO_IOMMU_SET_PASID_TABLE, &info); ++} ++ ++static PCIPASIDOps vfio_pci_pasid_ops = { ++ .set_pasid_table = vfio_iommu_set_pasid_table, ++}; ++ + static void vfio_realize(PCIDevice *pdev, Error **errp) + { + VFIOPCIDevice *vdev = VFIO_PCI(pdev); +@@ -3108,6 +3127,8 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) + vfio_register_req_notifier(vdev); + vfio_setup_resetfn_quirk(vdev); + ++ pci_setup_pasid_ops(pdev, &vfio_pci_pasid_ops); ++ + return; + + out_deregister: 
+diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events +index a37563a315..20069935f5 100644 +--- a/hw/vfio/trace-events ++++ b/hw/vfio/trace-events +@@ -118,6 +118,8 @@ vfio_region_sparse_mmap_header(const char *name, int index, int nr_areas) "Devic + vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned long end) "sparse entry %d [0x%lx - 0x%lx]" + vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%0x8" + vfio_dma_unmap_overflow_workaround(void) "" ++vfio_iommu_addr_inv_iotlb(int asid, uint64_t addr, uint64_t size, uint64_t nb_granules, bool leaf) "nested IOTLB invalidate asid=%d, addr=0x%"PRIx64" granule_size=0x%"PRIx64" nb_granules=0x%"PRIx64" leaf=%d" ++vfio_iommu_asid_inv_iotlb(int asid) "nested IOTLB invalidate asid=%d" + + # platform.c + vfio_platform_base_device_init(char *name, int groupid) "%s belongs to group #%d" +-- +2.27.0 + -- Gitee From fe17c2d208550788882fa9c2226e6a4c9e840257 Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Tue, 14 Aug 2018 08:08:11 -0400 Subject: [PATCH 33/56] vfio: Pass stage 1 MSI bindings to the host We register the stage1 MSI bindings when enabling the vectors and we unregister them on msi disable. Signed-off-by: Eric Auger Signed-off-by: Kunkun Jiang --- ...ass-stage-1-MSI-bindings-to-the-host.patch | 262 ++++++++++++++++++ 1 file changed, 262 insertions(+) create mode 100644 vfio-Pass-stage-1-MSI-bindings-to-the-host.patch diff --git a/vfio-Pass-stage-1-MSI-bindings-to-the-host.patch b/vfio-Pass-stage-1-MSI-bindings-to-the-host.patch new file mode 100644 index 00000000..bed28007 --- /dev/null +++ b/vfio-Pass-stage-1-MSI-bindings-to-the-host.patch @@ -0,0 +1,262 @@ +From 8b4fbe869f8a1f510896c86067d2e4fc3dc82eb9 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Tue, 14 Aug 2018 08:08:11 -0400 +Subject: [PATCH] vfio: Pass stage 1 MSI bindings to the host + +We register the stage1 MSI bindings when enabling the vectors +and we unregister them on msi disable. 
+ +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/vfio/common.c | 59 +++++++++++++++++++++++++++ + hw/vfio/pci.c | 76 ++++++++++++++++++++++++++++++++++- + hw/vfio/trace-events | 2 + + include/hw/vfio/vfio-common.h | 12 ++++++ + 4 files changed, 147 insertions(+), 2 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index 58f8a43a43..1f78af121d 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -774,6 +774,65 @@ static void vfio_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) + } + } + ++int vfio_iommu_set_msi_binding(VFIOContainer *container, int n, ++ IOMMUTLBEntry *iotlb) ++{ ++ struct vfio_iommu_type1_set_msi_binding ustruct; ++ VFIOMSIBinding *binding; ++ int ret; ++ ++ QLIST_FOREACH(binding, &container->msibinding_list, next) { ++ if (binding->index == n) { ++ return 0; ++ } ++ } ++ ++ ustruct.argsz = sizeof(struct vfio_iommu_type1_set_msi_binding); ++ ustruct.iova = iotlb->iova; ++ ustruct.flags = VFIO_IOMMU_BIND_MSI; ++ ustruct.gpa = iotlb->translated_addr; ++ ustruct.size = iotlb->addr_mask + 1; ++ ret = ioctl(container->fd, VFIO_IOMMU_SET_MSI_BINDING , &ustruct); ++ if (ret) { ++ error_report("%s: failed to register the stage1 MSI binding (%m)", ++ __func__); ++ return ret; ++ } ++ binding = g_new0(VFIOMSIBinding, 1); ++ binding->iova = ustruct.iova; ++ binding->gpa = ustruct.gpa; ++ binding->size = ustruct.size; ++ binding->index = n; ++ ++ QLIST_INSERT_HEAD(&container->msibinding_list, binding, next); ++ return 0; ++} ++ ++int vfio_iommu_unset_msi_binding(VFIOContainer *container, int n) ++{ ++ struct vfio_iommu_type1_set_msi_binding ustruct; ++ VFIOMSIBinding *binding, *tmp; ++ int ret; ++ ++ ustruct.argsz = sizeof(struct vfio_iommu_type1_set_msi_binding); ++ QLIST_FOREACH_SAFE(binding, &container->msibinding_list, next, tmp) { ++ if (binding->index != n) { ++ continue; ++ } ++ ustruct.flags = VFIO_IOMMU_UNBIND_MSI; ++ ustruct.iova = binding->iova; ++ ret = ioctl(container->fd, 
VFIO_IOMMU_SET_MSI_BINDING , &ustruct); ++ if (ret) { ++ error_report("Failed to unregister the stage1 MSI binding " ++ "for iova=0x%"PRIx64" (%m)", binding->iova); ++ } ++ QLIST_REMOVE(binding, next); ++ g_free(binding); ++ return ret; ++ } ++ return 0; ++} ++ + static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) + { + VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index ae5e014e5d..99c52a0944 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -365,6 +365,65 @@ static void vfio_msi_interrupt(void *opaque) + notify(&vdev->pdev, nr); + } + ++static bool vfio_iommu_require_msi_binding(IOMMUMemoryRegion *iommu_mr) ++{ ++ bool msi_translate = false, nested = false; ++ ++ memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_MSI_TRANSLATE, ++ (void *)&msi_translate); ++ memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_VFIO_NESTED, ++ (void *)&nested); ++ if (!nested || !msi_translate) { ++ return false; ++ } ++ return true; ++} ++ ++static int vfio_register_msi_binding(VFIOPCIDevice *vdev, ++ int vector_n, bool set) ++{ ++ VFIOContainer *container = vdev->vbasedev.group->container; ++ PCIDevice *dev = &vdev->pdev; ++ AddressSpace *as = pci_device_iommu_address_space(dev); ++ IOMMUMemoryRegionClass *imrc; ++ IOMMUMemoryRegion *iommu_mr; ++ IOMMUTLBEntry entry; ++ MSIMessage msg; ++ ++ if (as == &address_space_memory) { ++ return 0; ++ } ++ ++ iommu_mr = IOMMU_MEMORY_REGION(as->root); ++ if (!vfio_iommu_require_msi_binding(iommu_mr)) { ++ return 0; ++ } ++ ++ /* MSI doorbell address is translated by an IOMMU */ ++ ++ if (!set) { /* unregister */ ++ trace_vfio_unregister_msi_binding(vdev->vbasedev.name, vector_n); ++ ++ return vfio_iommu_unset_msi_binding(container, vector_n); ++ } ++ ++ msg = pci_get_msi_message(dev, vector_n); ++ imrc = memory_region_get_iommu_class_nocheck(iommu_mr); ++ ++ rcu_read_lock(); ++ entry = imrc->translate(iommu_mr, msg.address, IOMMU_WO, 0); ++ rcu_read_unlock(); 
++ ++ if (entry.perm == IOMMU_NONE) { ++ return -ENOENT; ++ } ++ ++ trace_vfio_register_msi_binding(vdev->vbasedev.name, vector_n, ++ msg.address, entry.translated_addr); ++ ++ return vfio_iommu_set_msi_binding(container, vector_n, &entry); ++} ++ + static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix) + { + struct vfio_irq_set *irq_set; +@@ -382,7 +441,7 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix) + fds = (int32_t *)&irq_set->data; + + for (i = 0; i < vdev->nr_vectors; i++) { +- int fd = -1; ++ int fd = -1; + + /* + * MSI vs MSI-X - The guest has direct access to MSI mask and pending +@@ -391,6 +450,12 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix) + * KVM signaling path only when configured and unmasked. + */ + if (vdev->msi_vectors[i].use) { ++ ret = vfio_register_msi_binding(vdev, i, true); ++ if (ret) { ++ error_report("%s failed to register S1 MSI binding " ++ "for vector %d(%d)", vdev->vbasedev.name, i, ret); ++ goto out; ++ } + if (vdev->msi_vectors[i].virq < 0 || + (msix && msix_is_masked(&vdev->pdev, i))) { + fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt); +@@ -404,6 +469,7 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix) + + ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set); + ++out: + g_free(irq_set); + + return ret; +@@ -718,7 +784,8 @@ static void vfio_msi_disable_common(VFIOPCIDevice *vdev) + + static void vfio_msix_disable(VFIOPCIDevice *vdev) + { +- int i; ++ int ret, i; ++ + + msix_unset_vector_notifiers(&vdev->pdev); + +@@ -730,6 +797,11 @@ static void vfio_msix_disable(VFIOPCIDevice *vdev) + if (vdev->msi_vectors[i].use) { + vfio_msix_vector_release(&vdev->pdev, i); + msix_vector_unuse(&vdev->pdev, i); ++ ret = vfio_register_msi_binding(vdev, i, false); ++ if (ret) { ++ error_report("%s: failed to unregister S1 MSI binding " ++ "for vector %d(%d)", vdev->vbasedev.name, i, ret); ++ } + } + } + +diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
+index 20069935f5..35bd415d6d 100644 +--- a/hw/vfio/trace-events ++++ b/hw/vfio/trace-events +@@ -120,6 +120,8 @@ vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype + vfio_dma_unmap_overflow_workaround(void) "" + vfio_iommu_addr_inv_iotlb(int asid, uint64_t addr, uint64_t size, uint64_t nb_granules, bool leaf) "nested IOTLB invalidate asid=%d, addr=0x%"PRIx64" granule_size=0x%"PRIx64" nb_granules=0x%"PRIx64" leaf=%d" + vfio_iommu_asid_inv_iotlb(int asid) "nested IOTLB invalidate asid=%d" ++vfio_register_msi_binding(const char *name, int vector, uint64_t giova, uint64_t gdb) "%s: register vector %d gIOVA=0x%"PRIx64 "-> gDB=0x%"PRIx64" stage 1 mapping" ++vfio_unregister_msi_binding(const char *name, int vector) "%s: unregister vector %d stage 1 mapping" + + # platform.c + vfio_platform_base_device_init(char *name, int groupid) "%s belongs to group #%d" +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index 0234f5e1b1..a838a939e4 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -74,6 +74,14 @@ typedef struct VFIOAddressSpace { + QLIST_ENTRY(VFIOAddressSpace) list; + } VFIOAddressSpace; + ++typedef struct VFIOMSIBinding { ++ int index; ++ hwaddr iova; ++ hwaddr gpa; ++ hwaddr size; ++ QLIST_ENTRY(VFIOMSIBinding) next; ++} VFIOMSIBinding; ++ + struct VFIOGroup; + + typedef struct VFIODMARange { +@@ -103,6 +111,7 @@ typedef struct VFIOContainer { + QLIST_HEAD(, VFIOGroup) group_list; + QLIST_HEAD(, VFIORamDiscardListener) vrdl_list; + QLIST_HEAD(, VFIODMARange) dma_list; ++ QLIST_HEAD(, VFIOMSIBinding) msibinding_list; + QLIST_ENTRY(VFIOContainer) next; + } VFIOContainer; + +@@ -222,6 +231,9 @@ VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp); + void vfio_put_group(VFIOGroup *group); + int vfio_get_device(VFIOGroup *group, const char *name, + VFIODevice *vbasedev, Error **errp); ++int vfio_iommu_set_msi_binding(VFIOContainer *container, int n, ++ 
IOMMUTLBEntry *entry); ++int vfio_iommu_unset_msi_binding(VFIOContainer *container, int n); + + extern const MemoryRegionOps vfio_region_ops; + typedef QLIST_HEAD(VFIOGroupList, VFIOGroup) VFIOGroupList; +-- +2.27.0 + -- Gitee From 0d5561bc12d5734c73b41efda833402069cfd1ca Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Thu, 20 Jun 2019 16:39:57 +0200 Subject: [PATCH 34/56] vfio: Helper to get IRQ info including capabilities As done for vfio regions, add helpers to retrieve irq info including their optional capabilities. Signed-off-by: Eric Auger Signed-off-by: Kunkun Jiang --- ...-get-IRQ-info-including-capabilities.patch | 178 ++++++++++++++++++ 1 file changed, 178 insertions(+) create mode 100644 vfio-Helper-to-get-IRQ-info-including-capabilities.patch diff --git a/vfio-Helper-to-get-IRQ-info-including-capabilities.patch b/vfio-Helper-to-get-IRQ-info-including-capabilities.patch new file mode 100644 index 00000000..3d4b1667 --- /dev/null +++ b/vfio-Helper-to-get-IRQ-info-including-capabilities.patch @@ -0,0 +1,178 @@ +From a4336765c99a876743c0ead89997ad6f97d7b442 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Thu, 20 Jun 2019 16:39:57 +0200 +Subject: [PATCH] vfio: Helper to get IRQ info including capabilities + +As done for vfio regions, add helpers to retrieve irq info +including their optional capabilities. 
+ +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/vfio/common.c | 97 +++++++++++++++++++++++++++++++++++ + hw/vfio/trace-events | 1 + + include/hw/vfio/vfio-common.h | 7 +++ + 3 files changed, 105 insertions(+) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index 1f78af121d..d05a485808 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -1919,6 +1919,25 @@ bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info, + return true; + } + ++struct vfio_info_cap_header * ++vfio_get_irq_info_cap(struct vfio_irq_info *info, uint16_t id) ++{ ++ struct vfio_info_cap_header *hdr; ++ void *ptr = info; ++ ++ if (!(info->flags & VFIO_IRQ_INFO_FLAG_CAPS)) { ++ return NULL; ++ } ++ ++ for (hdr = ptr + info->cap_offset; hdr != ptr; hdr = ptr + hdr->next) { ++ if (hdr->id == id) { ++ return hdr; ++ } ++ } ++ ++ return NULL; ++} ++ + static int vfio_setup_region_sparse_mmaps(VFIORegion *region, + struct vfio_region_info *info) + { +@@ -2887,6 +2906,33 @@ retry: + return 0; + } + ++int vfio_get_irq_info(VFIODevice *vbasedev, int index, ++ struct vfio_irq_info **info) ++{ ++ size_t argsz = sizeof(struct vfio_irq_info); ++ ++ *info = g_malloc0(argsz); ++ ++ (*info)->index = index; ++retry: ++ (*info)->argsz = argsz; ++ ++ if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_IRQ_INFO, *info)) { ++ g_free(*info); ++ *info = NULL; ++ return -errno; ++ } ++ ++ if ((*info)->argsz > argsz) { ++ argsz = (*info)->argsz; ++ *info = g_realloc(*info, argsz); ++ ++ goto retry; ++ } ++ ++ return 0; ++} ++ + int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type, + uint32_t subtype, struct vfio_region_info **info) + { +@@ -2922,6 +2968,42 @@ int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type, + return -ENODEV; + } + ++int vfio_get_dev_irq_info(VFIODevice *vbasedev, uint32_t type, ++ uint32_t subtype, struct vfio_irq_info **info) ++{ ++ int i; ++ ++ for (i = 0; i < vbasedev->num_irqs; i++) { ++ struct vfio_info_cap_header *hdr; ++ struct 
vfio_irq_info_cap_type *cap_type; ++ ++ if (vfio_get_irq_info(vbasedev, i, info)) { ++ continue; ++ } ++ ++ hdr = vfio_get_irq_info_cap(*info, VFIO_IRQ_INFO_CAP_TYPE); ++ if (!hdr) { ++ g_free(*info); ++ continue; ++ } ++ ++ cap_type = container_of(hdr, struct vfio_irq_info_cap_type, header); ++ ++ trace_vfio_get_dev_irq(vbasedev->name, i, ++ cap_type->type, cap_type->subtype); ++ ++ if (cap_type->type == type && cap_type->subtype == subtype) { ++ return 0; ++ } ++ ++ g_free(*info); ++ } ++ ++ *info = NULL; ++ return -ENODEV; ++} ++ ++ + bool vfio_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type) + { + struct vfio_region_info *info = NULL; +@@ -2937,6 +3019,21 @@ bool vfio_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type) + return ret; + } + ++bool vfio_has_irq_cap(VFIODevice *vbasedev, int region, uint16_t cap_type) ++{ ++ struct vfio_irq_info *info = NULL; ++ bool ret = false; ++ ++ if (!vfio_get_irq_info(vbasedev, region, &info)) { ++ if (vfio_get_irq_info_cap(info, cap_type)) { ++ ret = true; ++ } ++ g_free(info); ++ } ++ ++ return ret; ++} ++ + /* + * Interfaces for IBM EEH (Enhanced Error Handling) + */ +diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events +index 35bd415d6d..f5fe201ab5 100644 +--- a/hw/vfio/trace-events ++++ b/hw/vfio/trace-events +@@ -117,6 +117,7 @@ vfio_region_unmap(const char *name, unsigned long offset, unsigned long end) "Re + vfio_region_sparse_mmap_header(const char *name, int index, int nr_areas) "Device %s region %d: %d sparse mmap entries" + vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned long end) "sparse entry %d [0x%lx - 0x%lx]" + vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%0x8" ++vfio_get_dev_irq(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%08x" + vfio_dma_unmap_overflow_workaround(void) "" + vfio_iommu_addr_inv_iotlb(int asid, uint64_t addr, uint64_t size, uint64_t
nb_granules, bool leaf) "nested IOTLB invalidate asid=%d, addr=0x%"PRIx64" granule_size=0x%"PRIx64" nb_granules=0x%"PRIx64" leaf=%d" + vfio_iommu_asid_inv_iotlb(int asid) "nested IOTLB invalidate asid=%d" +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index a838a939e4..7fdca26fa0 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -254,6 +254,13 @@ bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info, + unsigned int *avail); + struct vfio_info_cap_header * + vfio_get_device_info_cap(struct vfio_device_info *info, uint16_t id); ++int vfio_get_irq_info(VFIODevice *vbasedev, int index, ++ struct vfio_irq_info **info); ++int vfio_get_dev_irq_info(VFIODevice *vbasedev, uint32_t type, ++ uint32_t subtype, struct vfio_irq_info **info); ++bool vfio_has_irq_cap(VFIODevice *vbasedev, int irq, uint16_t cap_type); ++struct vfio_info_cap_header * ++vfio_get_irq_info_cap(struct vfio_irq_info *info, uint16_t id); + #endif + extern const MemoryListener vfio_prereg_listener; + +-- +2.27.0 + -- Gitee From d90ca3a16537fc936b0279cd18c22dec4f11286b Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Thu, 13 Dec 2018 04:39:30 -0500 Subject: [PATCH 35/56] vfio/pci: Register handler for iommu fault We use the new extended IRQ VFIO_IRQ_TYPE_NESTED type and VFIO_IRQ_SUBTYPE_DMA_FAULT subtype to set/unset a notifier for physical DMA faults. The associated eventfd is triggered, in nested mode, whenever a fault is detected at IOMMU physical level. The actual handler will be implemented in subsequent patches. 
Signed-off-by: Eric Auger Signed-off-by: Kunkun Jiang --- ...pci-Register-handler-for-iommu-fault.patch | 168 ++++++++++++++++++ 1 file changed, 168 insertions(+) create mode 100644 vfio-pci-Register-handler-for-iommu-fault.patch diff --git a/vfio-pci-Register-handler-for-iommu-fault.patch b/vfio-pci-Register-handler-for-iommu-fault.patch new file mode 100644 index 00000000..7209a807 --- /dev/null +++ b/vfio-pci-Register-handler-for-iommu-fault.patch @@ -0,0 +1,168 @@ +From 574455d1363e818905e05cd23ef0948e83a16a51 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Thu, 13 Dec 2018 04:39:30 -0500 +Subject: [PATCH] vfio/pci: Register handler for iommu fault + +We use the new extended IRQ VFIO_IRQ_TYPE_NESTED type and +VFIO_IRQ_SUBTYPE_DMA_FAULT subtype to set/unset +a notifier for physical DMA faults. The associated eventfd is +triggered, in nested mode, whenever a fault is detected at IOMMU +physical level. + +The actual handler will be implemented in subsequent patches. + +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/vfio/pci.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++- + hw/vfio/pci.h | 7 +++++ + 2 files changed, 87 insertions(+), 1 deletion(-) + +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index 99c52a0944..37a70932c6 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -2888,6 +2888,76 @@ static PCIPASIDOps vfio_pci_pasid_ops = { + .set_pasid_table = vfio_iommu_set_pasid_table, + }; + ++static void vfio_dma_fault_notifier_handler(void *opaque) ++{ ++ VFIOPCIExtIRQ *ext_irq = opaque; ++ ++ if (!event_notifier_test_and_clear(&ext_irq->notifier)) { ++ return; ++ } ++} ++ ++static int vfio_register_ext_irq_handler(VFIOPCIDevice *vdev, ++ uint32_t type, uint32_t subtype, ++ IOHandler *handler) ++{ ++ int32_t fd, ext_irq_index, index; ++ struct vfio_irq_info *irq_info; ++ Error *err = NULL; ++ EventNotifier *n; ++ int ret; ++ ++ ret = vfio_get_dev_irq_info(&vdev->vbasedev, type, subtype, &irq_info); ++ if (ret) { ++ return ret; ++ } ++ 
index = irq_info->index; ++ ext_irq_index = irq_info->index - VFIO_PCI_NUM_IRQS; ++ g_free(irq_info); ++ ++ vdev->ext_irqs[ext_irq_index].vdev = vdev; ++ vdev->ext_irqs[ext_irq_index].index = index; ++ n = &vdev->ext_irqs[ext_irq_index].notifier; ++ ++ ret = event_notifier_init(n, 0); ++ if (ret) { ++ error_report("vfio: Unable to init event notifier for ext irq %d(%d)", ++ ext_irq_index, ret); ++ return ret; ++ } ++ ++ fd = event_notifier_get_fd(n); ++ qemu_set_fd_handler(fd, vfio_dma_fault_notifier_handler, NULL, ++ &vdev->ext_irqs[ext_irq_index]); ++ ++ ret = vfio_set_irq_signaling(&vdev->vbasedev, index, 0, ++ VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err); ++ if (ret) { ++ error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); ++ qemu_set_fd_handler(fd, NULL, NULL, vdev); ++ event_notifier_cleanup(n); ++ } ++ return ret; ++} ++ ++static void vfio_unregister_ext_irq_notifiers(VFIOPCIDevice *vdev) ++{ ++ VFIODevice *vbasedev = &vdev->vbasedev; ++ Error *err = NULL; ++ int i; ++ ++ for (i = 0; i < vbasedev->num_irqs - VFIO_PCI_NUM_IRQS; i++) { ++ if (vfio_set_irq_signaling(vbasedev, i + VFIO_PCI_NUM_IRQS , 0, ++ VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) { ++ error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); ++ } ++ qemu_set_fd_handler(event_notifier_get_fd(&vdev->ext_irqs[i].notifier), ++ NULL, NULL, vdev); ++ event_notifier_cleanup(&vdev->ext_irqs[i].notifier); ++ } ++ g_free(vdev->ext_irqs); ++} ++ + static void vfio_realize(PCIDevice *pdev, Error **errp) + { + VFIOPCIDevice *vdev = VFIO_PCI(pdev); +@@ -2898,7 +2968,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) + ssize_t len; + struct stat st; + int groupid; +- int i, ret; ++ int i, ret, nb_ext_irqs; + bool is_mdev; + + if (!vdev->vbasedev.sysfsdev) { +@@ -2986,6 +3056,11 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) + goto error; + } + ++ nb_ext_irqs = vdev->vbasedev.num_irqs - VFIO_PCI_NUM_IRQS; ++ if (nb_ext_irqs > 0) { ++ vdev->ext_irqs = g_new0(VFIOPCIExtIRQ, 
nb_ext_irqs); ++ } ++ + vfio_populate_device(vdev, &err); + if (err) { + error_propagate(errp, err); +@@ -3197,6 +3272,9 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) + + vfio_register_err_notifier(vdev); + vfio_register_req_notifier(vdev); ++ vfio_register_ext_irq_handler(vdev, VFIO_IRQ_TYPE_NESTED, ++ VFIO_IRQ_SUBTYPE_DMA_FAULT, ++ vfio_dma_fault_notifier_handler); + vfio_setup_resetfn_quirk(vdev); + + pci_setup_pasid_ops(pdev, &vfio_pci_pasid_ops); +@@ -3239,6 +3317,7 @@ static void vfio_exitfn(PCIDevice *pdev) + + vfio_unregister_req_notifier(vdev); + vfio_unregister_err_notifier(vdev); ++ vfio_unregister_ext_irq_notifiers(vdev); + pci_device_set_intx_routing_notifier(&vdev->pdev, NULL); + if (vdev->irqchip_change_notifier.notify) { + kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier); +diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h +index 64777516d1..a8b06737fb 100644 +--- a/hw/vfio/pci.h ++++ b/hw/vfio/pci.h +@@ -114,6 +114,12 @@ typedef struct VFIOMSIXInfo { + unsigned long *pending; + } VFIOMSIXInfo; + ++typedef struct VFIOPCIExtIRQ { ++ struct VFIOPCIDevice *vdev; ++ EventNotifier notifier; ++ uint32_t index; ++} VFIOPCIExtIRQ; ++ + #define TYPE_VFIO_PCI "vfio-pci" + OBJECT_DECLARE_SIMPLE_TYPE(VFIOPCIDevice, VFIO_PCI) + +@@ -138,6 +144,7 @@ struct VFIOPCIDevice { + PCIHostDeviceAddress host; + EventNotifier err_notifier; + EventNotifier req_notifier; ++ VFIOPCIExtIRQ *ext_irqs; + int (*resetfn)(struct VFIOPCIDevice *); + uint32_t vendor_id; + uint32_t device_id; +-- +2.27.0 + -- Gitee From b5cd35e7e106b3679d3259b25869ce1caa5ea5cc Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Thu, 13 Dec 2018 10:57:53 -0500 Subject: [PATCH 36/56] vfio/pci: Set up the DMA FAULT region Set up the fault region which is composed of the actual fault queue (mmappable) and a header used to handle it. The fault queue is mmapped. 
Signed-off-by: Eric Auger Signed-off-by: Kunkun Jiang --- vfio-pci-Set-up-the-DMA-FAULT-region.patch | 132 +++++++++++++++++++++ 1 file changed, 132 insertions(+) create mode 100644 vfio-pci-Set-up-the-DMA-FAULT-region.patch diff --git a/vfio-pci-Set-up-the-DMA-FAULT-region.patch b/vfio-pci-Set-up-the-DMA-FAULT-region.patch new file mode 100644 index 00000000..9a4757dc --- /dev/null +++ b/vfio-pci-Set-up-the-DMA-FAULT-region.patch @@ -0,0 +1,132 @@ +From e701d0fef4fbb7935d6aa7d22d82eb2dcfee2431 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Thu, 13 Dec 2018 10:57:53 -0500 +Subject: [PATCH] vfio/pci: Set up the DMA FAULT region + +Set up the fault region which is composed of the actual fault +queue (mmappable) and a header used to handle it. The fault +queue is mmapped. + +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/vfio/pci.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++ + hw/vfio/pci.h | 1 + + 2 files changed, 65 insertions(+) + +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index 37a70932c6..76bc9d3506 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -2638,11 +2638,67 @@ int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp) + return 0; + } + ++static void vfio_init_fault_regions(VFIOPCIDevice *vdev, Error **errp) ++{ ++ struct vfio_region_info *fault_region_info = NULL; ++ struct vfio_region_info_cap_fault *cap_fault; ++ VFIODevice *vbasedev = &vdev->vbasedev; ++ struct vfio_info_cap_header *hdr; ++ char *fault_region_name; ++ int ret; ++ ++ ret = vfio_get_dev_region_info(&vdev->vbasedev, ++ VFIO_REGION_TYPE_NESTED, ++ VFIO_REGION_SUBTYPE_NESTED_DMA_FAULT, ++ &fault_region_info); ++ if (ret) { ++ goto out; ++ } ++ ++ hdr = vfio_get_region_info_cap(fault_region_info, ++ VFIO_REGION_INFO_CAP_DMA_FAULT); ++ if (!hdr) { ++ error_setg(errp, "failed to retrieve DMA FAULT capability"); ++ goto out; ++ } ++ cap_fault = container_of(hdr, struct vfio_region_info_cap_fault, ++ header); ++ if (cap_fault->version != 1) { ++ 
error_setg(errp, "Unsupported DMA FAULT API version %d", ++ cap_fault->version); ++ goto out; ++ } ++ ++ fault_region_name = g_strdup_printf("%s DMA FAULT %d", ++ vbasedev->name, ++ fault_region_info->index); ++ ++ ret = vfio_region_setup(OBJECT(vdev), vbasedev, ++ &vdev->dma_fault_region, ++ fault_region_info->index, ++ fault_region_name); ++ g_free(fault_region_name); ++ if (ret) { ++ error_setg_errno(errp, -ret, ++ "failed to set up the DMA FAULT region %d", ++ fault_region_info->index); ++ goto out; ++ } ++ ++ ret = vfio_region_mmap(&vdev->dma_fault_region); ++ if (ret) { ++ error_setg_errno(errp, -ret, "Failed to mmap the DMA FAULT queue"); ++ } ++out: ++ g_free(fault_region_info); ++} ++ + static void vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) + { + VFIODevice *vbasedev = &vdev->vbasedev; + struct vfio_region_info *reg_info; + struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) }; ++ Error *err = NULL; + int i, ret = -1; + + /* Sanity check device */ +@@ -2706,6 +2762,12 @@ static void vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) + } + } + ++ vfio_init_fault_regions(vdev, &err); ++ if (err) { ++ error_propagate(errp, err); ++ return; ++ } ++ + irq_info.index = VFIO_PCI_ERR_IRQ_INDEX; + + ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info); +@@ -3298,6 +3360,7 @@ static void vfio_instance_finalize(Object *obj) + + vfio_display_finalize(vdev); + vfio_bars_finalize(vdev); ++ vfio_region_finalize(&vdev->dma_fault_region); + g_free(vdev->emulated_config_bits); + g_free(vdev->rom); + /* +@@ -3318,6 +3381,7 @@ static void vfio_exitfn(PCIDevice *pdev) + vfio_unregister_req_notifier(vdev); + vfio_unregister_err_notifier(vdev); + vfio_unregister_ext_irq_notifiers(vdev); ++ vfio_region_exit(&vdev->dma_fault_region); + pci_device_set_intx_routing_notifier(&vdev->pdev, NULL); + if (vdev->irqchip_change_notifier.notify) { + kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier); +diff --git a/hw/vfio/pci.h 
b/hw/vfio/pci.h +index a8b06737fb..eef91065f1 100644 +--- a/hw/vfio/pci.h ++++ b/hw/vfio/pci.h +@@ -145,6 +145,7 @@ struct VFIOPCIDevice { + EventNotifier err_notifier; + EventNotifier req_notifier; + VFIOPCIExtIRQ *ext_irqs; ++ VFIORegion dma_fault_region; + int (*resetfn)(struct VFIOPCIDevice *); + uint32_t vendor_id; + uint32_t device_id; +-- +2.27.0 + -- Gitee From 5528a39488b0630f0cb89b02e29b5ef1cb145a61 Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Tue, 5 Mar 2019 16:35:32 +0100 Subject: [PATCH 37/56] vfio/pci: Implement the DMA fault handler Whenever the eventfd is triggered, we retrieve the DMA fault(s) from the mmapped fault region and inject them in the iommu memory region. Signed-off-by: Eric Auger Signed-off-by: Kunkun Jiang --- ...-pci-Implement-the-DMA-fault-handler.patch | 96 +++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 vfio-pci-Implement-the-DMA-fault-handler.patch diff --git a/vfio-pci-Implement-the-DMA-fault-handler.patch b/vfio-pci-Implement-the-DMA-fault-handler.patch new file mode 100644 index 00000000..7d7349c9 --- /dev/null +++ b/vfio-pci-Implement-the-DMA-fault-handler.patch @@ -0,0 +1,96 @@ +From d33cc7eccb68c6a1488804c94ff5c1197ee0fc6e Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Tue, 5 Mar 2019 16:35:32 +0100 +Subject: [PATCH] vfio/pci: Implement the DMA fault handler + +Whenever the eventfd is triggered, we retrieve the DMA fault(s) +from the mmapped fault region and inject them in the iommu +memory region. 
+ +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/vfio/pci.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ + hw/vfio/pci.h | 1 + + 2 files changed, 51 insertions(+) + +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index 76bc9d3506..c54e62fe8f 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -2953,10 +2953,60 @@ static PCIPASIDOps vfio_pci_pasid_ops = { + static void vfio_dma_fault_notifier_handler(void *opaque) + { + VFIOPCIExtIRQ *ext_irq = opaque; ++ VFIOPCIDevice *vdev = ext_irq->vdev; ++ PCIDevice *pdev = &vdev->pdev; ++ AddressSpace *as = pci_device_iommu_address_space(pdev); ++ IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(as->root); ++ struct vfio_region_dma_fault header; ++ struct iommu_fault *queue; ++ char *queue_buffer = NULL; ++ ssize_t bytes; + + if (!event_notifier_test_and_clear(&ext_irq->notifier)) { + return; + } ++ ++ bytes = pread(vdev->vbasedev.fd, &header, sizeof(header), ++ vdev->dma_fault_region.fd_offset); ++ if (bytes != sizeof(header)) { ++ error_report("%s unable to read the fault region header (0x%lx)", ++ __func__, bytes); ++ return; ++ } ++ ++ /* Normally the fault queue is mmapped */ ++ queue = (struct iommu_fault *)vdev->dma_fault_region.mmaps[0].mmap; ++ if (!queue) { ++ size_t queue_size = header.nb_entries * header.entry_size; ++ ++ error_report("%s: fault queue not mmapped: slower fault handling", ++ vdev->vbasedev.name); ++ ++ queue_buffer = g_malloc(queue_size); ++ bytes = pread(vdev->vbasedev.fd, queue_buffer, queue_size, ++ vdev->dma_fault_region.fd_offset + header.offset); ++ if (bytes != queue_size) { ++ error_report("%s unable to read the fault queue (0x%lx)", ++ __func__, bytes); ++ return; ++ } ++ ++ queue = (struct iommu_fault *)queue_buffer; ++ } ++ ++ while (vdev->fault_tail_index != header.head) { ++ memory_region_inject_faults(iommu_mr, 1, ++ &queue[vdev->fault_tail_index]); ++ vdev->fault_tail_index = ++ (vdev->fault_tail_index + 1) % header.nb_entries; ++ } ++ bytes = 
pwrite(vdev->vbasedev.fd, &vdev->fault_tail_index, 4, ++ vdev->dma_fault_region.fd_offset); ++ if (bytes != 4) { ++ error_report("%s unable to write the fault region tail index (0x%lx)", ++ __func__, bytes); ++ } ++ g_free(queue_buffer); + } + + static int vfio_register_ext_irq_handler(VFIOPCIDevice *vdev, +diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h +index eef91065f1..03ac8919ef 100644 +--- a/hw/vfio/pci.h ++++ b/hw/vfio/pci.h +@@ -146,6 +146,7 @@ struct VFIOPCIDevice { + EventNotifier req_notifier; + VFIOPCIExtIRQ *ext_irqs; + VFIORegion dma_fault_region; ++ uint32_t fault_tail_index; + int (*resetfn)(struct VFIOPCIDevice *); + uint32_t vendor_id; + uint32_t device_id; +-- +2.27.0 + -- Gitee From 4b7118bfa7c1c751098a6c794025d45f73013a98 Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Tue, 28 Aug 2018 09:21:53 -0400 Subject: [PATCH 38/56] hw/arm/smmuv3: Advertise MSI_TRANSLATE attribute The SMMUv3 has the peculiarity to translate MSI transactions. Let's advertise the corresponding attribute. Signed-off-by: Eric Auger Signed-off-by: Kunkun Jiang --- ...v3-Advertise-MSI_TRANSLATE-attribute.patch | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 hw-arm-smmuv3-Advertise-MSI_TRANSLATE-attribute.patch diff --git a/hw-arm-smmuv3-Advertise-MSI_TRANSLATE-attribute.patch b/hw-arm-smmuv3-Advertise-MSI_TRANSLATE-attribute.patch new file mode 100644 index 00000000..e8d39782 --- /dev/null +++ b/hw-arm-smmuv3-Advertise-MSI_TRANSLATE-attribute.patch @@ -0,0 +1,32 @@ +From 5a759ab19d508361053e388694546216705d173b Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Tue, 28 Aug 2018 09:21:53 -0400 +Subject: [PATCH] hw/arm/smmuv3: Advertise MSI_TRANSLATE attribute + +The SMMUv3 has the peculiarity to translate MSI +transactions. Let's advertise the corresponding +attribute. 
+ +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/arm/smmuv3.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index 9b87d16217..12f354a0d5 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -1596,6 +1596,9 @@ static int smmuv3_get_attr(IOMMUMemoryRegion *iommu, + if (attr == IOMMU_ATTR_VFIO_NESTED) { + *(bool *) data = true; + return 0; ++ } else if (attr == IOMMU_ATTR_MSI_TRANSLATE) { ++ *(bool *) data = true; ++ return 0; + } + return -EINVAL; + } +-- +2.27.0 + -- Gitee From 15942990aaea2e47c5db45a49e7a69587baad50c Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Thu, 9 Aug 2018 20:56:44 +0200 Subject: [PATCH 39/56] hw/arm/smmuv3: Store the PASID table GPA in the translation config For VFIO integration we will need to pass the Context Descriptor (CD) table GPA to the host. The CD table is also referred to as the PASID table. Its GPA corresponds to the s1ctrptr field of the Stream Table Entry. So let's decode and store it in the configuration structure. Signed-off-by: Eric Auger Signed-off-by: Kunkun Jiang --- ...ore-the-PASID-table-GPA-in-the-trans.patch | 45 +++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 hw-arm-smmuv3-Store-the-PASID-table-GPA-in-the-trans.patch diff --git a/hw-arm-smmuv3-Store-the-PASID-table-GPA-in-the-trans.patch b/hw-arm-smmuv3-Store-the-PASID-table-GPA-in-the-trans.patch new file mode 100644 index 00000000..3bbf1dad --- /dev/null +++ b/hw-arm-smmuv3-Store-the-PASID-table-GPA-in-the-trans.patch @@ -0,0 +1,45 @@ +From f937ce4124d57eea27d516957a2efa0e7fbdf198 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Thu, 9 Aug 2018 20:56:44 +0200 +Subject: [PATCH] hw/arm/smmuv3: Store the PASID table GPA in the translation + config + +For VFIO integration we will need to pass the Context Descriptor (CD) +table GPA to the host. The CD table is also referred to as the PASID +table. Its GPA corresponds to the s1ctrptr field of the Stream Table +Entry. 
So let's decode and store it in the configuration structure. + +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/arm/smmuv3.c | 1 + + include/hw/arm/smmu-common.h | 1 + + 2 files changed, 2 insertions(+) + +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index 12f354a0d5..3416f6a639 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -358,6 +358,7 @@ static int decode_ste(SMMUv3State *s, SMMUTransCfg *cfg, + "SMMUv3 S1 stalling fault model not allowed yet\n"); + goto bad_ste; + } ++ cfg->s1ctxptr = STE_CTXPTR(ste); + return 0; + + bad_ste: +diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h +index 706be3c6d0..d578339935 100644 +--- a/include/hw/arm/smmu-common.h ++++ b/include/hw/arm/smmu-common.h +@@ -76,6 +76,7 @@ typedef struct SMMUTransCfg { + uint8_t tbi; /* Top Byte Ignore */ + uint16_t asid; + SMMUTransTableInfo tt[2]; ++ dma_addr_t s1ctxptr; + uint32_t iotlb_hits; /* counts IOTLB hits for this asid */ + uint32_t iotlb_misses; /* counts IOTLB misses for this asid */ + } SMMUTransCfg; +-- +2.27.0 + -- Gitee From c1d79de1d0eab37a386078ec30660158c7f586a9 Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Tue, 4 Sep 2018 08:48:33 -0400 Subject: [PATCH 40/56] hw/arm/smmuv3: Fill the IOTLBEntry arch_id on NH_VA invalidation When the guest invalidates one S1 entry, it passes the asid. When propagating this invalidation down to the host, the asid information also must be passed. So let's fill the arch_id field introduced for that purpose and accordingly set the flags to indicate its presence. 
Signed-off-by: Eric Auger Signed-off-by: Kunkun Jiang --- ...ll-the-IOTLBEntry-arch_id-on-NH_VA-i.patch | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 hw-arm-smmuv3-Fill-the-IOTLBEntry-arch_id-on-NH_VA-i.patch diff --git a/hw-arm-smmuv3-Fill-the-IOTLBEntry-arch_id-on-NH_VA-i.patch b/hw-arm-smmuv3-Fill-the-IOTLBEntry-arch_id-on-NH_VA-i.patch new file mode 100644 index 00000000..646a95bd --- /dev/null +++ b/hw-arm-smmuv3-Fill-the-IOTLBEntry-arch_id-on-NH_VA-i.patch @@ -0,0 +1,34 @@ +From dcda615b3d9b1acffee3d31d57974cc9e4bd0dee Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Tue, 4 Sep 2018 08:48:33 -0400 +Subject: [PATCH] hw/arm/smmuv3: Fill the IOTLBEntry arch_id on NH_VA + invalidation + +When the guest invalidates one S1 entry, it passes the asid. +When propagating this invalidation down to the host, the asid +information also must be passed. So let's fill the arch_id field +introduced for that purpose and accordingly set the flags to +indicate its presence. + +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/arm/smmuv3.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index 3416f6a639..696c588f08 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -833,6 +833,8 @@ static void smmuv3_notify_iova(IOMMUMemoryRegion *mr, + event.entry.iova = iova; + event.entry.addr_mask = num_pages * (1 << granule) - 1; + event.entry.perm = IOMMU_NONE; ++ event.entry.flags = IOMMU_INV_FLAGS_ARCHID; ++ event.entry.arch_id = asid; + + memory_region_notify_iommu_one(n, &event); + } +-- +2.27.0 + -- Gitee From c9eb893c549263c48072ff1cfa0aadeefb10062b Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Thu, 14 Mar 2019 09:55:13 -0400 Subject: [PATCH 41/56] hw/arm/smmuv3: Fill the IOTLBEntry leaf field on NH_VA invalidation Let's propagate the leaf attribute throughout the invalidation path. This hint is used to reduce the scope of the invalidations to the last level of translation. 
Not enforcing it induces large performance penalties in nested mode. Signed-off-by: Eric Auger Signed-off-by: Kunkun Jiang --- ...ll-the-IOTLBEntry-leaf-field-on-NH_V.patch | 77 +++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 hw-arm-smmuv3-Fill-the-IOTLBEntry-leaf-field-on-NH_V.patch diff --git a/hw-arm-smmuv3-Fill-the-IOTLBEntry-leaf-field-on-NH_V.patch b/hw-arm-smmuv3-Fill-the-IOTLBEntry-leaf-field-on-NH_V.patch new file mode 100644 index 00000000..f5f3db19 --- /dev/null +++ b/hw-arm-smmuv3-Fill-the-IOTLBEntry-leaf-field-on-NH_V.patch @@ -0,0 +1,77 @@ +From c219274b7b6a472d7340a4f72a052ba33ed19659 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Thu, 14 Mar 2019 09:55:13 -0400 +Subject: [PATCH] hw/arm/smmuv3: Fill the IOTLBEntry leaf field on NH_VA + invalidation + +Let's propagate the leaf attribute throughout the invalidation path. +This hint is used to reduce the scope of the invalidations to the +last level of translation. Not enforcing it induces large performance +penalties in nested mode. 
+ +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/arm/smmuv3.c | 11 ++++++----- + 1 file changed, 6 insertions(+), 5 deletions(-) + +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index 696c588f08..ad816e850c 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -800,7 +800,7 @@ epilogue: + static void smmuv3_notify_iova(IOMMUMemoryRegion *mr, + IOMMUNotifier *n, + int asid, dma_addr_t iova, +- uint8_t tg, uint64_t num_pages) ++ uint8_t tg, uint64_t num_pages, bool leaf) + { + SMMUDevice *sdev = container_of(mr, SMMUDevice, iommu); + IOMMUTLBEvent event = {}; +@@ -835,6 +835,7 @@ static void smmuv3_notify_iova(IOMMUMemoryRegion *mr, + event.entry.perm = IOMMU_NONE; + event.entry.flags = IOMMU_INV_FLAGS_ARCHID; + event.entry.arch_id = asid; ++ event.entry.leaf = leaf; + + memory_region_notify_iommu_one(n, &event); + } +@@ -866,7 +867,7 @@ static void smmuv3_notify_asid(IOMMUMemoryRegion *mr, + + /* invalidate an asid/iova range tuple in all mr's */ + static void smmuv3_inv_notifiers_iova(SMMUState *s, int asid, dma_addr_t iova, +- uint8_t tg, uint64_t num_pages) ++ uint8_t tg, uint64_t num_pages, bool leaf) + { + SMMUDevice *sdev; + +@@ -878,7 +879,7 @@ static void smmuv3_inv_notifiers_iova(SMMUState *s, int asid, dma_addr_t iova, + tg, num_pages); + + IOMMU_NOTIFIER_FOREACH(n, mr) { +- smmuv3_notify_iova(mr, n, asid, iova, tg, num_pages); ++ smmuv3_notify_iova(mr, n, asid, iova, tg, num_pages, leaf); + } + } + } +@@ -903,7 +904,7 @@ static void smmuv3_s1_range_inval(SMMUState *s, Cmd *cmd) + + if (!tg) { + trace_smmuv3_s1_range_inval(vmid, asid, addr, tg, 1, ttl, leaf); +- smmuv3_inv_notifiers_iova(s, asid, addr, tg, 1); ++ smmuv3_inv_notifiers_iova(s, asid, addr, tg, 1, leaf); + smmu_iotlb_inv_iova(s, asid, addr, tg, 1, ttl); + return; + } +@@ -921,7 +922,7 @@ static void smmuv3_s1_range_inval(SMMUState *s, Cmd *cmd) + + num_pages = (mask + 1) >> granule; + trace_smmuv3_s1_range_inval(vmid, asid, addr, tg, num_pages, ttl, leaf); +- 
smmuv3_inv_notifiers_iova(s, asid, addr, tg, num_pages); ++ smmuv3_inv_notifiers_iova(s, asid, addr, tg, num_pages, leaf); + smmu_iotlb_inv_iova(s, asid, addr, tg, num_pages, ttl); + addr += mask + 1; + } +-- +2.27.0 + -- Gitee From f7fc457afbd51e86184a01438874009a42cc2c1c Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Thu, 9 Aug 2018 21:04:19 +0200 Subject: [PATCH 42/56] hw/arm/smmuv3: Pass stage 1 configurations to the host In case PASID PciOps are set for the device we call the set_pasid_table() callback on each STE update. This allows to pass the guest stage 1 configuration to the host and apply it at physical level. Signed-off-by: Eric Auger Signed-off-by: Kunkun Jiang --- ...ss-stage-1-configurations-to-the-hos.patch | 161 ++++++++++++++++++ 1 file changed, 161 insertions(+) create mode 100644 hw-arm-smmuv3-Pass-stage-1-configurations-to-the-hos.patch diff --git a/hw-arm-smmuv3-Pass-stage-1-configurations-to-the-hos.patch b/hw-arm-smmuv3-Pass-stage-1-configurations-to-the-hos.patch new file mode 100644 index 00000000..012c5d00 --- /dev/null +++ b/hw-arm-smmuv3-Pass-stage-1-configurations-to-the-hos.patch @@ -0,0 +1,161 @@ +From 2e5929ec2a35a7a227dc7ba70a557a84993a366d Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Thu, 9 Aug 2018 21:04:19 +0200 +Subject: [PATCH] hw/arm/smmuv3: Pass stage 1 configurations to the host + +In case PASID PciOps are set for the device we call +the set_pasid_table() callback on each STE update. + +This allows to pass the guest stage 1 configuration +to the host and apply it at physical level. 
+ +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/arm/smmu-internal.h | 1 + + hw/arm/smmuv3.c | 71 ++++++++++++++++++++++++++++++++++++------ + hw/arm/trace-events | 1 + + 3 files changed, 64 insertions(+), 9 deletions(-) + +diff --git a/hw/arm/smmu-internal.h b/hw/arm/smmu-internal.h +index 2d75b31953..5ef8c598c6 100644 +--- a/hw/arm/smmu-internal.h ++++ b/hw/arm/smmu-internal.h +@@ -105,6 +105,7 @@ typedef struct SMMUIOTLBPageInvInfo { + } SMMUIOTLBPageInvInfo; + + typedef struct SMMUSIDRange { ++ SMMUState *state; + uint32_t start; + uint32_t end; + } SMMUSIDRange; +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index ad816e850c..58139f707d 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -16,6 +16,10 @@ + * with this program; if not, see . + */ + ++#ifdef __linux__ ++#include "linux/iommu.h" ++#endif ++ + #include "qemu/osdep.h" + #include "qemu/bitops.h" + #include "hw/irq.h" +@@ -928,6 +932,61 @@ static void smmuv3_s1_range_inval(SMMUState *s, Cmd *cmd) + } + } + ++static void smmuv3_notify_config_change(SMMUState *bs, uint32_t sid) ++{ ++#ifdef __linux__ ++ IOMMUMemoryRegion *mr = smmu_iommu_mr(bs, sid); ++ SMMUEventInfo event = {.type = SMMU_EVT_NONE, .sid = sid, ++ .inval_ste_allowed = true}; ++ IOMMUConfig iommu_config = {}; ++ SMMUTransCfg *cfg; ++ SMMUDevice *sdev; ++ ++ if (!mr) { ++ return; ++ } ++ ++ sdev = container_of(mr, SMMUDevice, iommu); ++ ++ /* flush QEMU config cache */ ++ smmuv3_flush_config(sdev); ++ ++ if (!pci_device_is_pasid_ops_set(sdev->bus, sdev->devfn)) { ++ return; ++ } ++ ++ cfg = smmuv3_get_config(sdev, &event); ++ ++ if (!cfg) { ++ return; ++ } ++ ++ iommu_config.pasid_cfg.argsz = sizeof(struct iommu_pasid_table_config); ++ iommu_config.pasid_cfg.version = PASID_TABLE_CFG_VERSION_1; ++ iommu_config.pasid_cfg.format = IOMMU_PASID_FORMAT_SMMUV3; ++ iommu_config.pasid_cfg.base_ptr = cfg->s1ctxptr; ++ iommu_config.pasid_cfg.pasid_bits = 0; ++ iommu_config.pasid_cfg.vendor_data.smmuv3.version = 
PASID_TABLE_SMMUV3_CFG_VERSION_1; ++ ++ if (cfg->disabled || cfg->bypassed) { ++ iommu_config.pasid_cfg.config = IOMMU_PASID_CONFIG_BYPASS; ++ } else if (cfg->aborted) { ++ iommu_config.pasid_cfg.config = IOMMU_PASID_CONFIG_ABORT; ++ } else { ++ iommu_config.pasid_cfg.config = IOMMU_PASID_CONFIG_TRANSLATE; ++ } ++ ++ trace_smmuv3_notify_config_change(mr->parent_obj.name, ++ iommu_config.pasid_cfg.config, ++ iommu_config.pasid_cfg.base_ptr); ++ ++ if (pci_device_set_pasid_table(sdev->bus, sdev->devfn, &iommu_config)) { ++ error_report("Failed to pass PASID table to host for iommu mr %s (%m)", ++ mr->parent_obj.name); ++ } ++#endif ++} ++ + static gboolean + smmuv3_invalidate_ste(gpointer key, gpointer value, gpointer user_data) + { +@@ -938,6 +997,7 @@ smmuv3_invalidate_ste(gpointer key, gpointer value, gpointer user_data) + if (sid < sid_range->start || sid > sid_range->end) { + return false; + } ++ smmuv3_notify_config_change(sid_range->state, sid); + trace_smmuv3_config_cache_inv(sid); + return true; + } +@@ -1008,22 +1068,14 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) + case SMMU_CMD_CFGI_STE: + { + uint32_t sid = CMD_SID(&cmd); +- IOMMUMemoryRegion *mr = smmu_iommu_mr(bs, sid); +- SMMUDevice *sdev; + + if (CMD_SSEC(&cmd)) { + cmd_error = SMMU_CERROR_ILL; + break; + } + +- if (!mr) { +- break; +- } +- + trace_smmuv3_cmdq_cfgi_ste(sid); +- sdev = container_of(mr, SMMUDevice, iommu); +- smmuv3_flush_config(sdev); +- ++ smmuv3_notify_config_change(bs, sid); + break; + } + case SMMU_CMD_CFGI_STE_RANGE: /* same as SMMU_CMD_CFGI_ALL */ +@@ -1038,6 +1090,7 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) + } + + mask = (1ULL << (range + 1)) - 1; ++ sid_range.state = bs; + sid_range.start = sid & ~mask; + sid_range.end = sid_range.start + mask; + +diff --git a/hw/arm/trace-events b/hw/arm/trace-events +index 1447ad5a90..d9851d663e 100644 +--- a/hw/arm/trace-events ++++ b/hw/arm/trace-events +@@ -53,4 +53,5 @@ smmuv3_config_cache_inv(uint32_t sid) "Config cache 
INV for sid=0x%x" + smmuv3_notify_flag_add(const char *iommu) "ADD SMMUNotifier node for iommu mr=%s" + smmuv3_notify_flag_del(const char *iommu) "DEL SMMUNotifier node for iommu mr=%s" + smmuv3_inv_notifiers_iova(const char *name, uint16_t asid, uint64_t iova, uint8_t tg, uint64_t num_pages) "iommu mr=%s asid=%d iova=0x%"PRIx64" tg=%d num_pages=0x%"PRIx64 ++smmuv3_notify_config_change(const char *name, uint8_t config, uint64_t s1ctxptr) "iommu mr=%s config=%d s1ctxptr=0x%"PRIx64 + +-- +2.27.0 + -- Gitee From aab5085da5de3ccfc2b3ef4119775d8aebad1252 Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Thu, 13 Sep 2018 14:24:45 +0200 Subject: [PATCH 43/56] hw/arm/smmuv3: Implement fault injection We convert iommu_fault structs received from the kernel into the data struct used by the emulation code and record the events into the virtual event queue. Signed-off-by: Eric Auger Signed-off-by: Kunkun Jiang --- hw-arm-smmuv3-Implement-fault-injection.patch | 107 ++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 hw-arm-smmuv3-Implement-fault-injection.patch diff --git a/hw-arm-smmuv3-Implement-fault-injection.patch b/hw-arm-smmuv3-Implement-fault-injection.patch new file mode 100644 index 00000000..5ecb6da7 --- /dev/null +++ b/hw-arm-smmuv3-Implement-fault-injection.patch @@ -0,0 +1,107 @@ +From d31c754470b4b651d0e19c66738fbcc8fc6abf3c Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Thu, 13 Sep 2018 14:24:45 +0200 +Subject: [PATCH] hw/arm/smmuv3: Implement fault injection + +We convert iommu_fault structs received from the kernel +into the data struct used by the emulation code and record +the events into the virtual event queue. 
+ +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/arm/smmuv3.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 71 insertions(+) + +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index 58139f707d..9aeb420428 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -1660,6 +1660,76 @@ static int smmuv3_get_attr(IOMMUMemoryRegion *iommu, + return -EINVAL; + } + ++struct iommu_fault; ++ ++static inline int ++smmuv3_inject_faults(IOMMUMemoryRegion *iommu_mr, int count, ++ struct iommu_fault *buf) ++{ ++#ifdef __linux__ ++ SMMUDevice *sdev = container_of(iommu_mr, SMMUDevice, iommu); ++ SMMUv3State *s3 = sdev->smmu; ++ uint32_t sid = smmu_get_sid(sdev); ++ int i; ++ ++ for (i = 0; i < count; i++) { ++ SMMUEventInfo info = {}; ++ struct iommu_fault_unrecoverable *record; ++ ++ if (buf[i].type != IOMMU_FAULT_DMA_UNRECOV) { ++ continue; ++ } ++ ++ info.sid = sid; ++ record = &buf[i].event; ++ ++ switch (record->reason) { ++ case IOMMU_FAULT_REASON_PASID_INVALID: ++ info.type = SMMU_EVT_C_BAD_SUBSTREAMID; ++ /* TODO further fill info.u.c_bad_substream */ ++ break; ++ case IOMMU_FAULT_REASON_PASID_FETCH: ++ info.type = SMMU_EVT_F_CD_FETCH; ++ break; ++ case IOMMU_FAULT_REASON_BAD_PASID_ENTRY: ++ info.type = SMMU_EVT_C_BAD_CD; ++ /* TODO further fill info.u.c_bad_cd */ ++ break; ++ case IOMMU_FAULT_REASON_WALK_EABT: ++ info.type = SMMU_EVT_F_WALK_EABT; ++ info.u.f_walk_eabt.addr = record->addr; ++ info.u.f_walk_eabt.addr2 = record->fetch_addr; ++ break; ++ case IOMMU_FAULT_REASON_PTE_FETCH: ++ info.type = SMMU_EVT_F_TRANSLATION; ++ info.u.f_translation.addr = record->addr; ++ break; ++ case IOMMU_FAULT_REASON_OOR_ADDRESS: ++ info.type = SMMU_EVT_F_ADDR_SIZE; ++ info.u.f_addr_size.addr = record->addr; ++ break; ++ case IOMMU_FAULT_REASON_ACCESS: ++ info.type = SMMU_EVT_F_ACCESS; ++ info.u.f_access.addr = record->addr; ++ break; ++ case IOMMU_FAULT_REASON_PERMISSION: ++ info.type = SMMU_EVT_F_PERMISSION; ++ info.u.f_permission.addr = 
record->addr; ++ break; ++ default: ++ warn_report("%s Unexpected fault reason received from host: %d", ++ __func__, record->reason); ++ continue; ++ } ++ ++ smmuv3_record_event(s3, &info); ++ } ++ return 0; ++#else ++ return -1; ++#endif ++} ++ + static void smmuv3_iommu_memory_region_class_init(ObjectClass *klass, + void *data) + { +@@ -1668,6 +1738,7 @@ static void smmuv3_iommu_memory_region_class_init(ObjectClass *klass, + imrc->translate = smmuv3_translate; + imrc->notify_flag_changed = smmuv3_notify_flag_changed; + imrc->get_attr = smmuv3_get_attr; ++ imrc->inject_faults = smmuv3_inject_faults; + } + + static const TypeInfo smmuv3_type_info = { +-- +2.27.0 + -- Gitee From 5af5d8bdfa02a11d8213a50a078e6db944fcc335 Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Wed, 18 Mar 2020 11:17:36 +0100 Subject: [PATCH 44/56] hw/arm/smmuv3: Allow MAP notifiers We now have all bricks to support nested paging. This uses MAP notifiers to map the MSIs. So let's allow MAP notifiers to be registered. Signed-off-by: Eric Auger Signed-off-by: Kunkun Jiang --- hw-arm-smmuv3-Allow-MAP-notifiers.patch | 37 +++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 hw-arm-smmuv3-Allow-MAP-notifiers.patch diff --git a/hw-arm-smmuv3-Allow-MAP-notifiers.patch b/hw-arm-smmuv3-Allow-MAP-notifiers.patch new file mode 100644 index 00000000..1d82532d --- /dev/null +++ b/hw-arm-smmuv3-Allow-MAP-notifiers.patch @@ -0,0 +1,37 @@ +From dc126664134989975ce9ab9e7d5d2c8916628bf6 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Wed, 18 Mar 2020 11:17:36 +0100 +Subject: [PATCH] hw/arm/smmuv3: Allow MAP notifiers + +We now have all bricks to support nested paging. This +uses MAP notifiers to map the MSIs. So let's allow MAP +notifiers to be registered. 
+ +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/arm/smmuv3.c | 8 -------- + 1 file changed, 8 deletions(-) + +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index 9aeb420428..45f21c53fe 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -1628,14 +1628,6 @@ static int smmuv3_notify_flag_changed(IOMMUMemoryRegion *iommu, + return -EINVAL; + } + +- if (new & IOMMU_NOTIFIER_MAP) { +- error_setg(errp, +- "device %02x.%02x.%x requires iommu MAP notifier which is " +- "not currently supported", pci_bus_num(sdev->bus), +- PCI_SLOT(sdev->devfn), PCI_FUNC(sdev->devfn)); +- return -EINVAL; +- } +- + if (old == IOMMU_NOTIFIER_NONE) { + trace_smmuv3_notify_flag_add(iommu->parent_obj.name); + QLIST_INSERT_HEAD(&s->devices_with_notifiers, sdev, next); +-- +2.27.0 + -- Gitee From d5dd3551ffdb14d30c8b04dfad4c56646eda414f Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Fri, 6 Nov 2020 14:34:35 +0100 Subject: [PATCH 45/56] pci: Add return_page_response pci ops Add a new PCI operation that allows to return page responses to registered VFIO devices Signed-off-by: Eric Auger Signed-off-by: Kunkun Jiang --- pci-Add-return_page_response-pci-ops.patch | 86 ++++++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 pci-Add-return_page_response-pci-ops.patch diff --git a/pci-Add-return_page_response-pci-ops.patch b/pci-Add-return_page_response-pci-ops.patch new file mode 100644 index 00000000..25e665f8 --- /dev/null +++ b/pci-Add-return_page_response-pci-ops.patch @@ -0,0 +1,86 @@ +From 228345cfa59c764e725e2d3680a4bc3ecb237609 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Fri, 6 Nov 2020 14:34:35 +0100 +Subject: [PATCH] pci: Add return_page_response pci ops + +Add a new PCI operation that allows to return page responses +to registered VFIO devices + +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/pci/pci.c | 16 ++++++++++++++++ + include/hw/iommu/iommu.h | 8 ++++++++ + include/hw/pci/pci.h | 4 ++++ + 3 files changed, 28 
insertions(+) + +diff --git a/hw/pci/pci.c b/hw/pci/pci.c +index 4a9374c025..64db325d6b 100644 +--- a/hw/pci/pci.c ++++ b/hw/pci/pci.c +@@ -2793,6 +2793,22 @@ int pci_device_set_pasid_table(PCIBus *bus, int32_t devfn, + return -ENOENT; + } + ++int pci_device_return_page_response(PCIBus *bus, int32_t devfn, ++ IOMMUPageResponse *resp) ++{ ++ PCIDevice *dev; ++ ++ if (!bus) { ++ return -EINVAL; ++ } ++ ++ dev = bus->devices[devfn]; ++ if (dev && dev->pasid_ops && dev->pasid_ops->return_page_response) { ++ return dev->pasid_ops->return_page_response(bus, devfn, resp); ++ } ++ return -ENOENT; ++} ++ + static void pci_dev_get_w64(PCIBus *b, PCIDevice *dev, void *opaque) + { + Range *range = opaque; +diff --git a/include/hw/iommu/iommu.h b/include/hw/iommu/iommu.h +index 12092bda7b..5890f095b1 100644 +--- a/include/hw/iommu/iommu.h ++++ b/include/hw/iommu/iommu.h +@@ -24,5 +24,13 @@ typedef struct IOMMUConfig { + }; + } IOMMUConfig; + ++typedef struct IOMMUPageResponse { ++ union { ++#ifdef __linux__ ++ struct iommu_page_response resp; ++#endif ++ }; ++} IOMMUPageResponse; ++ + + #endif /* QEMU_HW_IOMMU_IOMMU_H */ +diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h +index abffa12a99..809eb32f4a 100644 +--- a/include/hw/pci/pci.h ++++ b/include/hw/pci/pci.h +@@ -268,6 +268,8 @@ typedef struct PCIReqIDCache PCIReqIDCache; + + struct PCIPASIDOps { + int (*set_pasid_table)(PCIBus *bus, int32_t devfn, IOMMUConfig *config); ++ int (*return_page_response)(PCIBus *bus, int32_t devfn, ++ IOMMUPageResponse *resp); + }; + typedef struct PCIPASIDOps PCIPASIDOps; + +@@ -508,6 +510,8 @@ void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, void *opaque); + void pci_setup_pasid_ops(PCIDevice *dev, PCIPASIDOps *ops); + bool pci_device_is_pasid_ops_set(PCIBus *bus, int32_t devfn); + int pci_device_set_pasid_table(PCIBus *bus, int32_t devfn, IOMMUConfig *config); ++int pci_device_return_page_response(PCIBus *bus, int32_t devfn, ++ IOMMUPageResponse *resp); + + static inline void + 
pci_set_byte(uint8_t *config, uint8_t val) +-- +2.27.0 + -- Gitee From 7715bfc0bafa45f711495cca8ddf9cd1e13c9a73 Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Fri, 6 Nov 2020 12:03:29 -0500 Subject: [PATCH 46/56] vfio/pci: Implement return_page_response page response callback This patch implements the page response path. The response is written into the page response ring buffer and then the header's head index is updated. This path is not used by this series. It is introduced here as a POC for vSVA/ARM integration. Signed-off-by: Eric Auger Signed-off-by: Kunkun Jiang --- ...nt-return_page_response-page-respons.patch | 199 ++++++++++++++++++ 1 file changed, 199 insertions(+) create mode 100644 vfio-pci-Implement-return_page_response-page-respons.patch diff --git a/vfio-pci-Implement-return_page_response-page-respons.patch b/vfio-pci-Implement-return_page_response-page-respons.patch new file mode 100644 index 00000000..21d88a36 --- /dev/null +++ b/vfio-pci-Implement-return_page_response-page-respons.patch @@ -0,0 +1,199 @@ +From 6bbf810edebdb89a6958519ee3adfb1888520231 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Fri, 6 Nov 2020 12:03:29 -0500 +Subject: [PATCH] vfio/pci: Implement return_page_response page response + callback + +This patch implements the page response path. The +response is written into the page response ring buffer and then +the header's head index is updated. This path is not used +by this series. It is introduced here as a POC for vSVA/ARM +integration. 
+ +Signed-off-by: Eric Auger +Signed-off-by: Kunkun Jiang +--- + hw/vfio/pci.c | 123 ++++++++++++++++++++++++++++++++++++++++++++++++++ + hw/vfio/pci.h | 2 + + 2 files changed, 125 insertions(+) + +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index c54e62fe8f..8e24f9c7d1 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -2693,6 +2693,61 @@ out: + g_free(fault_region_info); + } + ++static void vfio_init_fault_response_regions(VFIOPCIDevice *vdev, Error **errp) ++{ ++ struct vfio_region_info *fault_region_info = NULL; ++ struct vfio_region_info_cap_fault *cap_fault; ++ VFIODevice *vbasedev = &vdev->vbasedev; ++ struct vfio_info_cap_header *hdr; ++ char *fault_region_name; ++ int ret; ++ ++ ret = vfio_get_dev_region_info(&vdev->vbasedev, ++ VFIO_REGION_TYPE_NESTED, ++ VFIO_REGION_SUBTYPE_NESTED_DMA_FAULT_RESPONSE, ++ &fault_region_info); ++ if (ret) { ++ goto out; ++ } ++ ++ hdr = vfio_get_region_info_cap(fault_region_info, ++ VFIO_REGION_INFO_CAP_DMA_FAULT_RESPONSE); ++ if (!hdr) { ++ error_setg(errp, "failed to retrieve DMA FAULT RESPONSE capability"); ++ goto out; ++ } ++ cap_fault = container_of(hdr, struct vfio_region_info_cap_fault, ++ header); ++ if (cap_fault->version != 1) { ++ error_setg(errp, "Unsupported DMA FAULT RESPONSE API version %d", ++ cap_fault->version); ++ goto out; ++ } ++ ++ fault_region_name = g_strdup_printf("%s DMA FAULT RESPONSE %d", ++ vbasedev->name, ++ fault_region_info->index); ++ ++ ret = vfio_region_setup(OBJECT(vdev), vbasedev, ++ &vdev->dma_fault_response_region, ++ fault_region_info->index, ++ fault_region_name); ++ g_free(fault_region_name); ++ if (ret) { ++ error_setg_errno(errp, -ret, ++ "failed to set up the DMA FAULT RESPONSE region %d", ++ fault_region_info->index); ++ goto out; ++ } ++ ++ ret = vfio_region_mmap(&vdev->dma_fault_response_region); ++ if (ret) { ++ error_setg_errno(errp, -ret, "Failed to mmap the DMA FAULT RESPONSE queue"); ++ } ++out: ++ g_free(fault_region_info); ++} ++ + static void 
vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) + { + VFIODevice *vbasedev = &vdev->vbasedev; +@@ -2768,6 +2823,12 @@ static void vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) + return; + } + ++ vfio_init_fault_response_regions(vdev, &err); ++ if (err) { ++ error_propagate(errp, err); ++ return; ++ } ++ + irq_info.index = VFIO_PCI_ERR_IRQ_INDEX; + + ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info); +@@ -2946,8 +3007,68 @@ static int vfio_iommu_set_pasid_table(PCIBus *bus, int32_t devfn, + return ioctl(container->fd, VFIO_IOMMU_SET_PASID_TABLE, &info); + } + ++static int vfio_iommu_return_page_response(PCIBus *bus, int32_t devfn, ++ IOMMUPageResponse *resp) ++{ ++ PCIDevice *pdev = bus->devices[devfn]; ++ VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev); ++ struct iommu_page_response *response = &resp->resp; ++ struct vfio_region_dma_fault_response header; ++ struct iommu_page_response *queue; ++ char *queue_buffer = NULL; ++ ssize_t bytes; ++ ++ if (!vdev->dma_fault_response_region.mem) { ++ return -EINVAL; ++ } ++ ++ /* read the header */ ++ bytes = pread(vdev->vbasedev.fd, &header, sizeof(header), ++ vdev->dma_fault_response_region.fd_offset); ++ if (bytes != sizeof(header)) { ++ error_report("%s unable to read the fault region header (0x%lx)", ++ __func__, bytes); ++ return -1; ++ } ++ ++ /* Normally the fault queue is mmapped */ ++ queue = (struct iommu_page_response *)vdev->dma_fault_response_region.mmaps[0].mmap; ++ if (!queue) { ++ size_t queue_size = header.nb_entries * header.entry_size; ++ ++ error_report("%s: fault queue not mmapped: slower fault handling", ++ vdev->vbasedev.name); ++ ++ queue_buffer = g_malloc(queue_size); ++ bytes = pread(vdev->vbasedev.fd, queue_buffer, queue_size, ++ vdev->dma_fault_response_region.fd_offset + header.offset); ++ if (bytes != queue_size) { ++ error_report("%s unable to read the fault queue (0x%lx)", ++ __func__, bytes); ++ return -1; ++ } ++ ++ queue = (struct 
iommu_page_response *)queue_buffer; ++ } ++ /* deposit the new response in the queue and increment the head */ ++ memcpy(queue + header.head, response, header.entry_size); ++ ++ vdev->fault_response_head_index = ++ (vdev->fault_response_head_index + 1) % header.nb_entries; ++ bytes = pwrite(vdev->vbasedev.fd, &vdev->fault_response_head_index, 4, ++ vdev->dma_fault_response_region.fd_offset); ++ if (bytes != 4) { ++ error_report("%s unable to write the fault response region head index (0x%lx)", ++ __func__, bytes); ++ } ++ g_free(queue_buffer); ++ ++ return 0; ++} ++ + static PCIPASIDOps vfio_pci_pasid_ops = { + .set_pasid_table = vfio_iommu_set_pasid_table, ++ .return_page_response = vfio_iommu_return_page_response, + }; + + static void vfio_dma_fault_notifier_handler(void *opaque) +@@ -3411,6 +3532,7 @@ static void vfio_instance_finalize(Object *obj) + vfio_display_finalize(vdev); + vfio_bars_finalize(vdev); + vfio_region_finalize(&vdev->dma_fault_region); ++ vfio_region_finalize(&vdev->dma_fault_response_region); + g_free(vdev->emulated_config_bits); + g_free(vdev->rom); + /* +@@ -3432,6 +3554,7 @@ static void vfio_exitfn(PCIDevice *pdev) + vfio_unregister_err_notifier(vdev); + vfio_unregister_ext_irq_notifiers(vdev); + vfio_region_exit(&vdev->dma_fault_region); ++ vfio_region_exit(&vdev->dma_fault_response_region); + pci_device_set_intx_routing_notifier(&vdev->pdev, NULL); + if (vdev->irqchip_change_notifier.notify) { + kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier); +diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h +index 03ac8919ef..61b3bf1303 100644 +--- a/hw/vfio/pci.h ++++ b/hw/vfio/pci.h +@@ -147,6 +147,8 @@ struct VFIOPCIDevice { + VFIOPCIExtIRQ *ext_irqs; + VFIORegion dma_fault_region; + uint32_t fault_tail_index; ++ VFIORegion dma_fault_response_region; ++ uint32_t fault_response_head_index; + int (*resetfn)(struct VFIOPCIDevice *); + uint32_t vendor_id; + uint32_t device_id; +-- +2.27.0 + -- Gitee From 
161957e3b1051034a879fe4f1555f998e2624a84 Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Sat, 31 Jul 2021 10:02:18 +0800 Subject: [PATCH 47/56] vfio/common: Avoid unmap ram section at vfio_listener_region_del() in nested mode The ram section will be unmapped at vfio_prereg_listener_region_del() in nested mode. So let's avoid unmap ram section at vfio_listener_region_del(). Signed-off-by: Kunkun Jiang --- ...d-unmap-ram-section-at-vfio_listener.patch | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 vfio-common-Avoid-unmap-ram-section-at-vfio_listener.patch diff --git a/vfio-common-Avoid-unmap-ram-section-at-vfio_listener.patch b/vfio-common-Avoid-unmap-ram-section-at-vfio_listener.patch new file mode 100644 index 00000000..71302b2d --- /dev/null +++ b/vfio-common-Avoid-unmap-ram-section-at-vfio_listener.patch @@ -0,0 +1,39 @@ +From 9d7b782a0b2c5288e82f3064b4c5b7bf18887280 Mon Sep 17 00:00:00 2001 +From: Kunkun Jiang +Date: Sat, 31 Jul 2021 10:02:18 +0800 +Subject: [PATCH] vfio/common: Avoid unmap ram section at + vfio_listener_region_del() in nested mode + +The ram section will be unmapped at vfio_prereg_listener_region_del() +in nested mode. So let's avoid unmap ram section at +vfio_listener_region_del(). + +Signed-off-by: Kunkun Jiang +--- + hw/vfio/common.c | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index d05a485808..bdfcc854fe 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -1441,6 +1441,16 @@ static void vfio_listener_region_del(MemoryListener *listener, + } + } + ++ /* ++ * In nested mode, stage 2 (gpa->hpa) and the stage 1 ++ * (giova->gpa) are set separately. The ram section ++ * will be unmapped in vfio_prereg_listener_region_del(). ++ * Hence it doesn't need to unmap ram section here. 
++ */ ++ if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) { ++ return; ++ } ++ + /* + * FIXME: We assume the one big unmap below is adequate to + * remove any individual page mappings in the IOMMU which +-- +2.27.0 + -- Gitee From a19b346a89c355edb25e9c34a3571065d9266d5d Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Tue, 11 May 2021 10:08:13 +0800 Subject: [PATCH 48/56] vfio: Introduce helpers to mark dirty pages of a RAM section Extract part of the code from vfio_sync_dirty_bitmap to form a new helper, which allows to mark dirty pages of a RAM section. This helper will be called for nested stage. Signed-off-by: Kunkun Jiang --- ...elpers-to-mark-dirty-pages-of-a-RAM-.patch | 64 +++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 vfio-Introduce-helpers-to-mark-dirty-pages-of-a-RAM-.patch diff --git a/vfio-Introduce-helpers-to-mark-dirty-pages-of-a-RAM-.patch b/vfio-Introduce-helpers-to-mark-dirty-pages-of-a-RAM-.patch new file mode 100644 index 00000000..e77dd1d6 --- /dev/null +++ b/vfio-Introduce-helpers-to-mark-dirty-pages-of-a-RAM-.patch @@ -0,0 +1,64 @@ +From 1675d767aa9bd496178b4d74e01a40dbbd97eccb Mon Sep 17 00:00:00 2001 +From: Kunkun Jiang +Date: Tue, 11 May 2021 10:08:13 +0800 +Subject: [PATCH] vfio: Introduce helpers to mark dirty pages of a RAM section + +Extract part of the code from vfio_sync_dirty_bitmap to form a +new helper, which allows to mark dirty pages of a RAM section. +This helper will be called for nested stage. 
+ +Signed-off-by: Kunkun Jiang +--- + hw/vfio/common.c | 22 ++++++++++++++-------- + 1 file changed, 14 insertions(+), 8 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index bdfcc854fe..6136b1ef61 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -1566,6 +1566,19 @@ err_out: + return ret; + } + ++static int vfio_dma_sync_ram_section_dirty_bitmap(VFIOContainer *container, ++ MemoryRegionSection *section) ++{ ++ ram_addr_t ram_addr; ++ ++ ram_addr = memory_region_get_ram_addr(section->mr) + ++ section->offset_within_region; ++ ++ return vfio_get_dirty_bitmap(container, ++ REAL_HOST_PAGE_ALIGN(section->offset_within_address_space), ++ int128_get64(section->size), ram_addr); ++} ++ + typedef struct { + IOMMUNotifier n; + VFIOGuestIOMMU *giommu; +@@ -1650,8 +1663,6 @@ static int vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainer *container, + static int vfio_sync_dirty_bitmap(VFIOContainer *container, + MemoryRegionSection *section) + { +- ram_addr_t ram_addr; +- + if (memory_region_is_iommu(section->mr)) { + VFIOGuestIOMMU *giommu; + +@@ -1682,12 +1693,7 @@ static int vfio_sync_dirty_bitmap(VFIOContainer *container, + return vfio_sync_ram_discard_listener_dirty_bitmap(container, section); + } + +- ram_addr = memory_region_get_ram_addr(section->mr) + +- section->offset_within_region; +- +- return vfio_get_dirty_bitmap(container, +- REAL_HOST_PAGE_ALIGN(section->offset_within_address_space), +- int128_get64(section->size), ram_addr); ++ return vfio_dma_sync_ram_section_dirty_bitmap(container, section); + } + + static void vfio_listener_log_sync(MemoryListener *listener, +-- +2.27.0 + -- Gitee From 8ec78805e6e897ddc1edd8426c35de833b596ee5 Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Tue, 11 May 2021 10:08:14 +0800 Subject: [PATCH 49/56] vfio: Add vfio_prereg_listener_log_sync in nested stage In nested mode, we set up the stage 2 (gpa->hpa)and stage 1 (giova->gpa) separately by vfio_prereg_listener_region_add() and 
vfio_listener_region_add(). So when marking dirty pages we just need to pay attention to stage 2 mappings. Legacy vfio_listener_log_sync cannot be used in nested stage. This patch adds vfio_prereg_listener_log_sync to mark dirty pages in nested mode. Signed-off-by: Kunkun Jiang --- ...ereg_listener_log_sync-in-nested-sta.patch | 74 +++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 vfio-Add-vfio_prereg_listener_log_sync-in-nested-sta.patch diff --git a/vfio-Add-vfio_prereg_listener_log_sync-in-nested-sta.patch b/vfio-Add-vfio_prereg_listener_log_sync-in-nested-sta.patch new file mode 100644 index 00000000..b1df5a38 --- /dev/null +++ b/vfio-Add-vfio_prereg_listener_log_sync-in-nested-sta.patch @@ -0,0 +1,74 @@ +From f4523389bf57593484308124e06d67855bb79315 Mon Sep 17 00:00:00 2001 +From: Kunkun Jiang +Date: Tue, 11 May 2021 10:08:14 +0800 +Subject: [PATCH] vfio: Add vfio_prereg_listener_log_sync in nested stage + +In nested mode, we set up the stage 2 (gpa->hpa)and stage 1 +(giova->gpa) separately by vfio_prereg_listener_region_add() +and vfio_listener_region_add(). So when marking dirty pages +we just need to pay attention to stage 2 mappings. + +Legacy vfio_listener_log_sync cannot be used in nested stage. +This patch adds vfio_prereg_listener_log_sync to mark dirty +pages in nested mode. 
+ +Signed-off-by: Kunkun Jiang +--- + hw/vfio/common.c | 27 +++++++++++++++++++++++++++ + 1 file changed, 27 insertions(+) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index 6136b1ef61..2506cd57ee 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -1579,6 +1579,22 @@ static int vfio_dma_sync_ram_section_dirty_bitmap(VFIOContainer *container, + int128_get64(section->size), ram_addr); + } + ++static void vfio_prereg_listener_log_sync(MemoryListener *listener, ++ MemoryRegionSection *section) ++{ ++ VFIOContainer *container = ++ container_of(listener, VFIOContainer, prereg_listener); ++ ++ if (!memory_region_is_ram(section->mr) || ++ !container->dirty_pages_supported) { ++ return; ++ } ++ ++ if (vfio_devices_all_dirty_tracking(container)) { ++ vfio_dma_sync_ram_section_dirty_bitmap(container, section); ++ } ++} ++ + typedef struct { + IOMMUNotifier n; + VFIOGuestIOMMU *giommu; +@@ -1666,6 +1682,16 @@ static int vfio_sync_dirty_bitmap(VFIOContainer *container, + if (memory_region_is_iommu(section->mr)) { + VFIOGuestIOMMU *giommu; + ++ /* ++ * In nested mode, stage 2 (gpa->hpa) and stage 1 (giova->gpa) are ++ * set up separately. It is inappropriate to pass 'giova' to kernel ++ * to get dirty pages. We only need to focus on stage 2 mapping when ++ * marking dirty pages. 
++ */ ++ if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) { ++ return 0; ++ } ++ + QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) { + if (MEMORY_REGION(giommu->iommu) == section->mr && + giommu->n.start == section->offset_within_region) { +@@ -1859,6 +1885,7 @@ static const MemoryListener vfio_memory_listener = { + static MemoryListener vfio_memory_prereg_listener = { + .region_add = vfio_prereg_listener_region_add, + .region_del = vfio_prereg_listener_region_del, ++ .log_sync = vfio_prereg_listener_log_sync, + }; + + static void vfio_listener_release(VFIOContainer *container) +-- +2.27.0 + -- Gitee From 93a3a98d266fa46e29bf69e0fd6a659dd931d1d5 Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Sat, 31 Jul 2021 09:40:24 +0800 Subject: [PATCH 50/56] vfio: Add vfio_prereg_listener_log_clear to re-enable mark dirty pages When tracking dirty pages, we just need to pay attention to stage 2 mappings. Legacy vfio_listener_log_clear cannot be used in nested stage. This patch adds vfio_prereg_listener_log_clear to re-enable dirty pages in nested mode. Signed-off-by: Kunkun Jiang --- ...ereg_listener_log_clear-to-re-enable.patch | 84 +++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 vfio-Add-vfio_prereg_listener_log_clear-to-re-enable.patch diff --git a/vfio-Add-vfio_prereg_listener_log_clear-to-re-enable.patch b/vfio-Add-vfio_prereg_listener_log_clear-to-re-enable.patch new file mode 100644 index 00000000..a055ed55 --- /dev/null +++ b/vfio-Add-vfio_prereg_listener_log_clear-to-re-enable.patch @@ -0,0 +1,84 @@ +From 7086df6d90cd698a3e20cf4cf6e9a834f168cd8f Mon Sep 17 00:00:00 2001 +From: Kunkun Jiang +Date: Sat, 31 Jul 2021 09:40:24 +0800 +Subject: [PATCH] vfio: Add vfio_prereg_listener_log_clear to re-enable mark + dirty pages + +When tracking dirty pages, we just need to pay attention to stage 2 +mappings. Legacy vfio_listener_log_clear cannot be used in nested +stage. 
This patch adds vfio_prereg_listener_log_clear to re-enable +dirty pages in nested mode. + +Signed-off-by: Kunkun Jiang +--- + hw/vfio/common.c | 40 +++++++++++++++++++++++++++++++++++++++- + 1 file changed, 39 insertions(+), 1 deletion(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index 2506cd57ee..20c820aa74 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -1857,6 +1857,43 @@ static int vfio_physical_log_clear(VFIOContainer *container, + return ret; + } + ++static void vfio_prereg_listener_log_clear(MemoryListener *listener, ++ MemoryRegionSection *section) ++{ ++ VFIOContainer *container = ++ container_of(listener, VFIOContainer, prereg_listener); ++ ++ if (!memory_region_is_ram(section->mr)) { ++ return; ++ } ++ ++ vfio_physical_log_clear(container, section); ++} ++ ++static int vfio_clear_dirty_bitmap(VFIOContainer *container, ++ MemoryRegionSection *section) ++{ ++ if (memory_region_is_iommu(section->mr)) { ++ /* ++ * In nested mode, stage 2 (gpa->hpa) and stage 1 (giova->gpa) are ++ * set up separately. It is inappropriate to pass 'giova' to kernel ++ * to get dirty pages. We only need to focus on stage 2 mapping when ++ * marking dirty pages. ++ */ ++ if (container->iommu_type == VFIO_TYPE1_NESTING_IOMMU) { ++ return 0; ++ } ++ ++ /* ++ * TODO: x86. With the log_clear() interface added, x86 may inplement ++ * its own method. 
++ */ ++ } ++ ++ /* Here we assume that memory_region_is_ram(section->mr) == true */ ++ return vfio_physical_log_clear(container, section); ++} ++ + static void vfio_listener_log_clear(MemoryListener *listener, + MemoryRegionSection *section) + { +@@ -1868,7 +1905,7 @@ static void vfio_listener_log_clear(MemoryListener *listener, + } + + if (vfio_devices_all_dirty_tracking(container)) { +- vfio_physical_log_clear(container, section); ++ vfio_clear_dirty_bitmap(container, section); + } + } + +@@ -1886,6 +1923,7 @@ static MemoryListener vfio_memory_prereg_listener = { + .region_add = vfio_prereg_listener_region_add, + .region_del = vfio_prereg_listener_region_del, + .log_sync = vfio_prereg_listener_log_sync, ++ .log_clear = vfio_prereg_listener_log_clear, + }; + + static void vfio_listener_release(VFIOContainer *container) +-- +2.27.0 + -- Gitee From 58dc63c59a408ebbb1fc7153960fad4b7c7f4b9c Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Tue, 11 May 2021 10:08:15 +0800 Subject: [PATCH 51/56] vfio: Add vfio_prereg_listener_global_log_start/stop in nested stage In nested mode, we set up the stage 2 and stage 1 separately. In my opinion, vfio_memory_prereg_listener is used for stage 2 and vfio_memory_listener is used for stage 1. So it feels weird to call the global_log_start/stop interface in vfio_memory_listener to switch dirty tracking, although this won't cause any errors. Add global_log_start/stop interface in vfio_memory_prereg_listener can separate stage 2 from stage 1. 
Signed-off-by: Kunkun Jiang --- ...ereg_listener_global_log_start-stop-.patch | 71 +++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 vfio-Add-vfio_prereg_listener_global_log_start-stop-.patch diff --git a/vfio-Add-vfio_prereg_listener_global_log_start-stop-.patch b/vfio-Add-vfio_prereg_listener_global_log_start-stop-.patch new file mode 100644 index 00000000..96226621 --- /dev/null +++ b/vfio-Add-vfio_prereg_listener_global_log_start-stop-.patch @@ -0,0 +1,71 @@ +From 287c63ab540533f1f9642e753c091caa7e6e2511 Mon Sep 17 00:00:00 2001 +From: Kunkun Jiang +Date: Tue, 11 May 2021 10:08:15 +0800 +Subject: [PATCH] vfio: Add vfio_prereg_listener_global_log_start/stop in + nested stage + +In nested mode, we set up the stage 2 and stage 1 separately. In my +opinion, vfio_memory_prereg_listener is used for stage 2 and +vfio_memory_listener is used for stage 1. So it feels weird to call +the global_log_start/stop interface in vfio_memory_listener to switch +dirty tracking, although this won't cause any errors. Add +global_log_start/stop interface in vfio_memory_prereg_listener +can separate stage 2 from stage 1. 
+ +Signed-off-by: Kunkun Jiang +--- + hw/vfio/common.c | 24 ++++++++++++++++++++++++ + 1 file changed, 24 insertions(+) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index 20c820aa74..65f3979492 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -1501,6 +1501,17 @@ static void vfio_listener_log_global_start(MemoryListener *listener) + { + VFIOContainer *container = container_of(listener, VFIOContainer, listener); + ++ /* For nested mode, vfio_prereg_listener is used to start dirty tracking */ ++ if (container->iommu_type != VFIO_TYPE1_NESTING_IOMMU) { ++ vfio_set_dirty_page_tracking(container, true); ++ } ++} ++ ++static void vfio_prereg_listener_log_global_start(MemoryListener *listener) ++{ ++ VFIOContainer *container = ++ container_of(listener, VFIOContainer, prereg_listener); ++ + vfio_set_dirty_page_tracking(container, true); + } + +@@ -1508,6 +1519,17 @@ static void vfio_listener_log_global_stop(MemoryListener *listener) + { + VFIOContainer *container = container_of(listener, VFIOContainer, listener); + ++ /* For nested mode, vfio_prereg_listener is used to stop dirty tracking */ ++ if (container->iommu_type != VFIO_TYPE1_NESTING_IOMMU) { ++ vfio_set_dirty_page_tracking(container, false); ++ } ++} ++ ++static void vfio_prereg_listener_log_global_stop(MemoryListener *listener) ++{ ++ VFIOContainer *container = ++ container_of(listener, VFIOContainer, prereg_listener); ++ + vfio_set_dirty_page_tracking(container, false); + } + +@@ -1922,6 +1944,8 @@ static const MemoryListener vfio_memory_listener = { + static MemoryListener vfio_memory_prereg_listener = { + .region_add = vfio_prereg_listener_region_add, + .region_del = vfio_prereg_listener_region_del, ++ .log_global_start = vfio_prereg_listener_log_global_start, ++ .log_global_stop = vfio_prereg_listener_log_global_stop, + .log_sync = vfio_prereg_listener_log_sync, + .log_clear = vfio_prereg_listener_log_clear, + }; +-- +2.27.0 + -- Gitee From 7dfeb242f2618846618c49cb4906129ffaf48d16 Mon Sep 17 
00:00:00 2001 From: Chen Qun Date: Tue, 11 May 2021 10:08:16 +0800 Subject: [PATCH 52/56] hw/arm/smmuv3: Post-load stage 1 configurations to the host In nested mode, we call the set_pasid_table() callback on each STE update to pass the guest stage 1 configuration to the host and apply it at physical level. In the case of live migration, we need to manually call the set_pasid_table() to load the guest stage 1 configurations to the host. If this operation fails, the migration fails. Signed-off-by: Kunkun Jiang --- ...st-load-stage-1-configurations-to-th.patch | 110 ++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 hw-arm-smmuv3-Post-load-stage-1-configurations-to-th.patch diff --git a/hw-arm-smmuv3-Post-load-stage-1-configurations-to-th.patch b/hw-arm-smmuv3-Post-load-stage-1-configurations-to-th.patch new file mode 100644 index 00000000..0fc5f844 --- /dev/null +++ b/hw-arm-smmuv3-Post-load-stage-1-configurations-to-th.patch @@ -0,0 +1,110 @@ +From 1b95c995f032c21bf6607dda8ede0f5856bb190a Mon Sep 17 00:00:00 2001 +From: Kunkun Jiang +Date: Tue, 11 May 2021 10:08:16 +0800 +Subject: [PATCH] hw/arm/smmuv3: Post-load stage 1 configurations to the host + +In nested mode, we call the set_pasid_table() callback on each +STE update to pass the guest stage 1 configuration to the host +and apply it at physical level. + +In the case of live migration, we need to manually call the +set_pasid_table() to load the guest stage 1 configurations to +the host. If this operation fails, the migration fails. 
+ +Signed-off-by: Kunkun Jiang +--- + hw/arm/smmuv3.c | 33 ++++++++++++++++++++++++++++----- + 1 file changed, 28 insertions(+), 5 deletions(-) + +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index 45f21c53fe..291e3a12e8 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -932,7 +932,7 @@ static void smmuv3_s1_range_inval(SMMUState *s, Cmd *cmd) + } + } + +-static void smmuv3_notify_config_change(SMMUState *bs, uint32_t sid) ++static int smmuv3_notify_config_change(SMMUState *bs, uint32_t sid) + { + #ifdef __linux__ + IOMMUMemoryRegion *mr = smmu_iommu_mr(bs, sid); +@@ -941,9 +941,10 @@ static void smmuv3_notify_config_change(SMMUState *bs, uint32_t sid) + IOMMUConfig iommu_config = {}; + SMMUTransCfg *cfg; + SMMUDevice *sdev; ++ int ret; + + if (!mr) { +- return; ++ return 0; + } + + sdev = container_of(mr, SMMUDevice, iommu); +@@ -952,13 +953,13 @@ static void smmuv3_notify_config_change(SMMUState *bs, uint32_t sid) + smmuv3_flush_config(sdev); + + if (!pci_device_is_pasid_ops_set(sdev->bus, sdev->devfn)) { +- return; ++ return 0; + } + + cfg = smmuv3_get_config(sdev, &event); + + if (!cfg) { +- return; ++ return 0; + } + + iommu_config.pasid_cfg.argsz = sizeof(struct iommu_pasid_table_config); +@@ -980,10 +981,13 @@ static void smmuv3_notify_config_change(SMMUState *bs, uint32_t sid) + iommu_config.pasid_cfg.config, + iommu_config.pasid_cfg.base_ptr); + +- if (pci_device_set_pasid_table(sdev->bus, sdev->devfn, &iommu_config)) { ++ ret = pci_device_set_pasid_table(sdev->bus, sdev->devfn, &iommu_config); ++ if (ret) { + error_report("Failed to pass PASID table to host for iommu mr %s (%m)", + mr->parent_obj.name); + } ++ ++ return ret; + #endif + } + +@@ -1553,6 +1557,24 @@ static void smmu_realize(DeviceState *d, Error **errp) + smmu_init_irq(s, dev); + } + ++static int smmuv3_post_load(void *opaque, int version_id) ++{ ++ SMMUv3State *s3 = opaque; ++ SMMUState *s = &(s3->smmu_state); ++ SMMUDevice *sdev; ++ int ret = 0; ++ ++ QLIST_FOREACH(sdev, 
&s->devices_with_notifiers, next) { ++ uint32_t sid = smmu_get_sid(sdev); ++ ret = smmuv3_notify_config_change(s, sid); ++ if (ret) { ++ break; ++ } ++ } ++ ++ return ret; ++} ++ + static const VMStateDescription vmstate_smmuv3_queue = { + .name = "smmuv3_queue", + .version_id = 1, +@@ -1571,6 +1593,7 @@ static const VMStateDescription vmstate_smmuv3 = { + .version_id = 1, + .minimum_version_id = 1, + .priority = MIG_PRI_IOMMU, ++ .post_load = smmuv3_post_load, + .fields = (VMStateField[]) { + VMSTATE_UINT32(features, SMMUv3State), + VMSTATE_UINT8(sid_size, SMMUv3State), +-- +2.27.0 + -- Gitee From 764be3cac4c9f6caf2534c8ddce96b6cca6336c8 Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Tue, 7 Sep 2021 15:14:12 +0800 Subject: [PATCH 53/56] vfio/common: Fix incorrect address alignment in vfio_dma_map_ram_section The 'iova' will be passed to host kernel for mapping with the HPA. It is related to the host page size. So TARGET_PAGE_ALIGN should be replaced by REAL_HOST_PAGE_ALIGN. In the case of large granularity (64K), it may return early when map MMIO RAM section. And because of the inconsistency with vfio_dma_unmap_ram_section, it may cause 'assert(qrange)' in vfio_dma_unmap. Signed-off-by: Kunkun Jiang Signed-off-by: Zenghui Yu --- ...incorrect-address-alignment-in-vfio_.patch | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 vfio-common-Fix-incorrect-address-alignment-in-vfio_.patch diff --git a/vfio-common-Fix-incorrect-address-alignment-in-vfio_.patch b/vfio-common-Fix-incorrect-address-alignment-in-vfio_.patch new file mode 100644 index 00000000..d61408e6 --- /dev/null +++ b/vfio-common-Fix-incorrect-address-alignment-in-vfio_.patch @@ -0,0 +1,40 @@ +From c2a4ce033db6ab74256e28da382c797a98047d4b Mon Sep 17 00:00:00 2001 +From: Kunkun Jiang +Date: Tue, 7 Sep 2021 15:14:12 +0800 +Subject: [PATCH] vfio/common: Fix incorrect address alignment in + vfio_dma_map_ram_section + +The 'iova' will be passed to host kernel for mapping with the +HPA. 
It is related to the host page size. So TARGET_PAGE_ALIGN +should be replaced by REAL_HOST_PAGE_ALIGN. In the case of +large granularity (64K), it may return early when map MMIO RAM +section. And because of the inconsistency with +vfio_dma_unmap_ram_section, it may cause 'assert(qrange)' +in vfio_dma_unmap. + +Signed-off-by: Kunkun Jiang +Signed-off-by: Zenghui Yu +--- + hw/vfio/common.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index 65f3979492..89c49f5508 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -1059,10 +1059,10 @@ static int vfio_dma_map_ram_section(VFIOContainer *container, + + assert(memory_region_is_ram(section->mr)); + +- iova = TARGET_PAGE_ALIGN(section->offset_within_address_space); ++ iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space); + llend = int128_make64(section->offset_within_address_space); + llend = int128_add(llend, section->size); +- llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK)); ++ llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask)); + end = int128_get64(int128_sub(llend, int128_one())); + + vaddr = memory_region_get_ram_ptr(section->mr) + +-- +2.27.0 + -- Gitee From df2e32ce9f796ea5724d587e0eb1989732fb4641 Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Tue, 14 Sep 2021 14:21:46 +0800 Subject: [PATCH 54/56] vfio/common: Add address alignment check in vfio_listener_region_del Both vfio_listener_region_add and vfio_listener_region_del have reference counting operations on ram section->mr. If the 'iova' and 'llend' of the ram section do not pass the alignment check, the ram section should not be mapped or unmapped. It means that the reference counting should not be changed. However, the address alignment check is missing in vfio_listener_region_del. As a result, memory_region_unref is called unconditionally, which causes unintended problems in some scenarios.
Signed-off-by: Kunkun Jiang --- ...address-alignment-check-in-vfio_list.patch | 53 +++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 vfio-common-Add-address-alignment-check-in-vfio_list.patch diff --git a/vfio-common-Add-address-alignment-check-in-vfio_list.patch b/vfio-common-Add-address-alignment-check-in-vfio_list.patch new file mode 100644 index 00000000..288f2848 --- /dev/null +++ b/vfio-common-Add-address-alignment-check-in-vfio_list.patch @@ -0,0 +1,53 @@ +From 00c553f53657bf4bc165d859187215dba7110246 Mon Sep 17 00:00:00 2001 +From: Kunkun Jiang +Date: Tue, 14 Sep 2021 14:21:46 +0800 +Subject: [PATCH] vfio/common: Add address alignment check in + vfio_listener_region_del + +Both vfio_listener_region_add and vfio_listener_region_del have +reference counting operations on ram section->mr. If the 'iova' +and 'llend' of the ram section do not pass the alignment +check, the ram section should not be mapped or unmapped. It means +that the reference counting should not be changed. + +However, the address alignment check is missing in +vfio_listener_region_del. This makes memory_region_unref will +be unconditional called and causes unintended problems in some +scenarios. 
+ +Signed-off-by: Kunkun Jiang +--- + hw/vfio/common.c | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index 89c49f5508..4d45c2b625 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -1411,6 +1411,8 @@ static void vfio_listener_region_del(MemoryListener *listener, + MemoryRegionSection *section) + { + VFIOContainer *container = container_of(listener, VFIOContainer, listener); ++ hwaddr iova; ++ Int128 llend; + + if (vfio_listener_skipped_section(section)) { + trace_vfio_listener_region_del_skip( +@@ -1460,6 +1462,14 @@ static void vfio_listener_region_del(MemoryListener *listener, + */ + } + ++ iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space); ++ llend = int128_make64(section->offset_within_address_space); ++ llend = int128_add(llend, section->size); ++ llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask)); ++ if (int128_ge(int128_make64(iova), llend)) { ++ return; ++ } ++ + vfio_dma_unmap_ram_section(container, section); + + memory_region_unref(section->mr); +-- +2.27.0 + -- Gitee From 3905b41992b8871f92c0b99dd87e7c7a69a06816 Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Sat, 12 Feb 2022 17:20:31 +0800 Subject: [PATCH 55/56] spec: Update patch and changelog with !243 Support VFIO migration manual clear interface & vSMMUv3/pSMMUv3 2 stage VFIO integration & Support migration in SMMUv3 nested mode !243 linux-headers: update against 5.10 and manual clear vfio dirty log series vfio: Maintain DMA mapping range for the container vfio/migration: Add support for manual clear vfio dirty log update-linux-headers: Import iommu.h vfio.h and iommu.h header update against 5.10 memory: Add new fields in IOTLBEntry hw/arm/smmuv3: Improve stage1 ASID invalidation hw/arm/smmu-common: Allow domain invalidation for NH_ALL/NSNH_ALL memory: Add IOMMU_ATTR_VFIO_NESTED IOMMU memory region attribute memory: Add IOMMU_ATTR_MSI_TRANSLATE IOMMU memory region attribute memory: Introduce IOMMU Memory 
Region inject_faults API iommu: Introduce generic header pci: introduce PCIPASIDOps to PCIDevice vfio: Force nested if iommu requires it vfio: Introduce hostwin_from_range helper vfio: Introduce helpers to DMA map/unmap a RAM section vfio: Set up nested stage mappings vfio: Pass stage 1 MSI bindings to the host vfio: Helper to get IRQ info including capabilities vfio/pci: Register handler for iommu fault vfio/pci: Set up the DMA FAULT region vfio/pci: Implement the DMA fault handler hw/arm/smmuv3: Advertise MSI_TRANSLATE attribute hw/arm/smmuv3: Store the PASID table GPA in the translation config hw/arm/smmuv3: Fill the IOTLBEntry arch_id on NH_VA invalidation hw/arm/smmuv3: Fill the IOTLBEntry leaf field on NH_VA invalidation hw/arm/smmuv3: Pass stage 1 configurations to the host hw/arm/smmuv3: Implement fault injection hw/arm/smmuv3: Allow MAP notifiers pci: Add return_page_response pci ops vfio/pci: Implement return_page_response page response callback vfio/common: Avoid unmap ram section at vfio_listener_region_del() in nested mode vfio: Introduce helpers to mark dirty pages of a RAM section vfio: Add vfio_prereg_listener_log_sync in nested stage vfio: Add vfio_prereg_listener_log_clear to re-enable mark dirty pages vfio: Add vfio_prereg_listener_global_log_start/stop in nested stage hw/arm/smmuv3: Post-load stage 1 configurations to the host vfio/common: Fix incorrect address alignment in vfio_dma_map_ram_section vfio/common: Add address alignment check in vfio_listener_region_del Signed-off-by: Chen Qun --- qemu.spec | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/qemu.spec b/qemu.spec index 11efd808..08279f8e 100644 --- a/qemu.spec +++ b/qemu.spec @@ -99,6 +99,45 @@ Patch0086: tap-return-err-when-tap-TUNGETIFF-fail.patch Patch0087: xhci-check-reg-to-avoid-OOB-read.patch Patch0088: monitor-Discard-BLOCK_IO_ERROR-event-when-VM-reboote.patch Patch0089: 
monitor-limit-io-error-qmp-event-to-at-most-once-per.patch +Patch0090: linux-headers-update-against-5.10-and-manual-clear-v.patch +Patch0091: vfio-Maintain-DMA-mapping-range-for-the-container.patch +Patch0092: vfio-migration-Add-support-for-manual-clear-vfio-dir.patch +Patch0093: update-linux-headers-Import-iommu.h.patch +Patch0094: vfio.h-and-iommu.h-header-update-against-5.10.patch +Patch0095: memory-Add-new-fields-in-IOTLBEntry.patch +Patch0096: hw-arm-smmuv3-Improve-stage1-ASID-invalidation.patch +Patch0097: hw-arm-smmu-common-Allow-domain-invalidation-for-NH_.patch +Patch0098: memory-Add-IOMMU_ATTR_VFIO_NESTED-IOMMU-memory-regio.patch +Patch0099: memory-Add-IOMMU_ATTR_MSI_TRANSLATE-IOMMU-memory-reg.patch +Patch0100: memory-Introduce-IOMMU-Memory-Region-inject_faults-A.patch +Patch0101: iommu-Introduce-generic-header.patch +Patch0102: pci-introduce-PCIPASIDOps-to-PCIDevice.patch +Patch0103: vfio-Force-nested-if-iommu-requires-it.patch +Patch0104: vfio-Introduce-hostwin_from_range-helper.patch +Patch0105: vfio-Introduce-helpers-to-DMA-map-unmap-a-RAM-sectio.patch +Patch0106: vfio-Set-up-nested-stage-mappings.patch +Patch0107: vfio-Pass-stage-1-MSI-bindings-to-the-host.patch +Patch0108: vfio-Helper-to-get-IRQ-info-including-capabilities.patch +Patch0109: vfio-pci-Register-handler-for-iommu-fault.patch +Patch0110: vfio-pci-Set-up-the-DMA-FAULT-region.patch +Patch0111: vfio-pci-Implement-the-DMA-fault-handler.patch +Patch0112: hw-arm-smmuv3-Advertise-MSI_TRANSLATE-attribute.patch +Patch0113: hw-arm-smmuv3-Store-the-PASID-table-GPA-in-the-trans.patch +Patch0114: hw-arm-smmuv3-Fill-the-IOTLBEntry-arch_id-on-NH_VA-i.patch +Patch0115: hw-arm-smmuv3-Fill-the-IOTLBEntry-leaf-field-on-NH_V.patch +Patch0116: hw-arm-smmuv3-Pass-stage-1-configurations-to-the-hos.patch +Patch0117: hw-arm-smmuv3-Implement-fault-injection.patch +Patch0118: hw-arm-smmuv3-Allow-MAP-notifiers.patch +Patch0119: pci-Add-return_page_response-pci-ops.patch +Patch0120: 
vfio-pci-Implement-return_page_response-page-respons.patch +Patch0121: vfio-common-Avoid-unmap-ram-section-at-vfio_listener.patch +Patch0122: vfio-Introduce-helpers-to-mark-dirty-pages-of-a-RAM-.patch +Patch0123: vfio-Add-vfio_prereg_listener_log_sync-in-nested-sta.patch +Patch0124: vfio-Add-vfio_prereg_listener_log_clear-to-re-enable.patch +Patch0125: vfio-Add-vfio_prereg_listener_global_log_start-stop-.patch +Patch0126: hw-arm-smmuv3-Post-load-stage-1-configurations-to-th.patch +Patch0127: vfio-common-Fix-incorrect-address-alignment-in-vfio_.patch +Patch0128: vfio-common-Add-address-alignment-check-in-vfio_list.patch BuildRequires: flex BuildRequires: gcc @@ -543,6 +582,47 @@ getent passwd qemu >/dev/null || \ %endif %changelog +* Sat Feb 12 2022 Chen Qun +- linux-headers: update against 5.10 and manual clear vfio dirty log series +- vfio: Maintain DMA mapping range for the container +- vfio/migration: Add support for manual clear vfio dirty log +- update-linux-headers: Import iommu.h +- vfio.h and iommu.h header update against 5.10 +- memory: Add new fields in IOTLBEntry +- hw/arm/smmuv3: Improve stage1 ASID invalidation +- hw/arm/smmu-common: Allow domain invalidation for NH_ALL/NSNH_ALL +- memory: Add IOMMU_ATTR_VFIO_NESTED IOMMU memory region attribute +- memory: Add IOMMU_ATTR_MSI_TRANSLATE IOMMU memory region attribute +- memory: Introduce IOMMU Memory Region inject_faults API +- iommu: Introduce generic header +- pci: introduce PCIPASIDOps to PCIDevice +- vfio: Force nested if iommu requires it +- vfio: Introduce hostwin_from_range helper +- vfio: Introduce helpers to DMA map/unmap a RAM section +- vfio: Set up nested stage mappings +- vfio: Pass stage 1 MSI bindings to the host +- vfio: Helper to get IRQ info including capabilities +- vfio/pci: Register handler for iommu fault +- vfio/pci: Set up the DMA FAULT region +- vfio/pci: Implement the DMA fault handler +- hw/arm/smmuv3: Advertise MSI_TRANSLATE attribute +- hw/arm/smmuv3: Store the PASID table GPA 
in the translation config +- hw/arm/smmuv3: Fill the IOTLBEntry arch_id on NH_VA invalidation +- hw/arm/smmuv3: Fill the IOTLBEntry leaf field on NH_VA invalidation +- hw/arm/smmuv3: Pass stage 1 configurations to the host +- hw/arm/smmuv3: Implement fault injection +- hw/arm/smmuv3: Allow MAP notifiers +- pci: Add return_page_response pci ops +- vfio/pci: Implement return_page_response page response callback +- vfio/common: Avoid unmap ram section at vfio_listener_region_del() in nested mode +- vfio: Introduce helpers to mark dirty pages of a RAM section +- vfio: Add vfio_prereg_listener_log_sync in nested stage +- vfio: Add vfio_prereg_listener_log_clear to re-enable mark dirty pages +- vfio: Add vfio_prereg_listener_global_log_start/stop in nested stage +- hw/arm/smmuv3: Post-load stage 1 configurations to the host +- vfio/common: Fix incorrect address alignment in vfio_dma_map_ram_section +- vfio/common: Add address alignment check in vfio_listener_region_del + * Sat Feb 12 2022 Chen Qun - log: Add some logs on VM runtime path - qdev/monitors: Fix reundant error_setg of qdev_add_device -- Gitee From 5ea7c5e4598f3cd8391a977364af8bbcf92131c9 Mon Sep 17 00:00:00 2001 From: Chen Qun Date: Sat, 12 Feb 2022 17:20:34 +0800 Subject: [PATCH 56/56] spec: Update release version with !245 !247 !243 increase release version by one Signed-off-by: Chen Qun --- qemu.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qemu.spec b/qemu.spec index 08279f8e..4fdd311e 100644 --- a/qemu.spec +++ b/qemu.spec @@ -1,6 +1,6 @@ Name: qemu Version: 6.2.0 -Release: 11 +Release: 12 Epoch: 2 Summary: QEMU is a generic and open source machine emulator and virtualizer License: GPLv2 and BSD and MIT and CC-BY-SA-4.0 -- Gitee