From ebafa8d737b5f08e787803375d6e942ecdaef1a9 Mon Sep 17 00:00:00 2001 From: Yabin Li Date: Fri, 4 Aug 2023 21:09:08 +0800 Subject: [PATCH 001/939] hw/vfio: add device hct based on vfio. add hct device based on vfio, used to simulate ccp devices Signed-off-by: Yabin Li Signed-off-by: yangdepei --- hw/vfio/Kconfig | 6 + hw/vfio/hct.c | 543 ++++++++++++++++++++++++++++++++++++++++++++ hw/vfio/meson.build | 1 + 3 files changed, 550 insertions(+) create mode 100644 hw/vfio/hct.c diff --git a/hw/vfio/Kconfig b/hw/vfio/Kconfig index 7cdba0560a..5f0d3c2d2b 100644 --- a/hw/vfio/Kconfig +++ b/hw/vfio/Kconfig @@ -41,3 +41,9 @@ config VFIO_IGD bool default y if PC_PCI depends on VFIO_PCI + +config VFIO_HCT + bool + default y + select VFIO + depends on LINUX && PCI diff --git a/hw/vfio/hct.c b/hw/vfio/hct.c new file mode 100644 index 0000000000..476e86c61d --- /dev/null +++ b/hw/vfio/hct.c @@ -0,0 +1,543 @@ +/* + * vfio based mediated ccp(hct) assignment support + * + * Copyright 2023 HYGON Corp. + * + * This work is licensed under the terms of the GNU GPL, version 2 or (at + * your option) any later version. See the COPYING file in the top-level + * directory. + */ + +#include +#include +#include +#include +#include + +#include "qemu/osdep.h" +#include "qemu/queue.h" +#include "qemu/main-loop.h" +#include "qemu/log.h" +#include "trace.h" +#include "hw/pci/pci.h" +#include "hw/vfio/pci.h" +#include "qemu/range.h" +#include "sysemu/kvm.h" +#include "hw/pci/msi.h" +#include "qemu/error-report.h" +#include "qapi/error.h" +#include "hw/qdev-properties.h" + +#define MAX_CCP_CNT 16 +#define PAGE_SIZE 4096 +#define HCT_SHARED_MEMORY_SIZE (PAGE_SIZE * MAX_CCP_CNT) +#define CCP_INDEX_BYTES 4 +#define PATH_MAX 4096 +#define TYPE_HCT_DEV "hct" +#define PCI_HCT_DEV(obj) OBJECT_CHECK(HCTDevState, (obj), TYPE_HCT_DEV) +#define HCT_MMIO_SIZE (1 << 20) +#define HCT_MAX_PASID (1 << 8) + +#define PCI_VENDOR_ID_HYGON_CCP 0x1d94 +#define PCI_DEVICE_ID_HYGON_CCP 0x1468 + +#define HCT_SHARE_DEV "/dev/hct_share" + +#define HCT_VERSION_STRING "0.2" +#define DEF_VERSION_STRING "0.1" +#define VERSION_SIZE 16 + +#define HCT_SHARE_IOC_TYPE 'C' +#define HCT_SHARE_OP_TYPE 0x01 +#define HCT_SHARE_OP _IOWR(HCT_SHARE_IOC_TYPE, \ + HCT_SHARE_OP_TYPE, \ + struct hct_dev_ctrl) +#define HCT_SHARE_OP_DMA_MAP 0x01 +#define HCT_SHARE_OP_GET_ID 0x03 +#define HCT_SHARE_OP_GET_PASID 0x04 +#define HCT_SHARE_OP_DMA_UNMAP 0x05 +#define HCT_SHARE_OP_GET_VERSION 0x06 + +/* BARS */ +#define HCT_REG_BAR_IDX 2 +#define HCT_SHARED_BAR_IDX 3 +#define HCT_PASID_BAR_IDX 4 + +#define PASID_OFFSET 40 + +static volatile struct hct_data { + int init; + int hct_fd; + unsigned long pasid; + uint8_t *pasid_memory; + uint8_t *hct_shared_memory; + uint8_t ccp_index[MAX_CCP_CNT]; + uint8_t ccp_cnt; +} hct_data; + +typedef struct SharedDevice { + PCIDevice dev; + int shared_memory_offset; +} SharedDevice; + +typedef struct HctDevState { + SharedDevice sdev; + VFIODevice vdev; + MemoryRegion mmio; + MemoryRegion shared; + MemoryRegion pasid; + void *maps[PCI_NUM_REGIONS]; +} HCTDevState; + +struct hct_dev_ctrl { + unsigned char op; + unsigned char rsvd[3]; + union { + unsigned char version[VERSION_SIZE]; + struct { + unsigned long vaddr; + unsigned long iova; + unsigned long size; + }; + unsigned int id; + }; +}; + +static int pasid_get_and_init(HCTDevState *state) +{ + struct hct_dev_ctrl ctrl; + int ret; + + ctrl.op = HCT_SHARE_OP_GET_PASID; + ctrl.id = -1; + ret = ioctl(hct_data.hct_fd, HCT_SHARE_OP, &ctrl); + if (ret < 0) { + ret = -errno; + 
error_report("GET_PASID fail: %d", -errno); + goto out; + } + + *hct_data.pasid_memory = ctrl.id; + hct_data.pasid = ctrl.id; + +out: + return ret; +} + +static const MemoryRegionOps hct_mmio_ops = { + .endianness = DEVICE_NATIVE_ENDIAN, + .valid = + { + .min_access_size = 4, + .max_access_size = 4, + }, +}; + +static void vfio_hct_detach_device(HCTDevState *state) +{ + vfio_detach_device(&state->vdev); + g_free(state->vdev.name); +} + +static void vfio_hct_exit(PCIDevice *dev) +{ + HCTDevState *state = PCI_HCT_DEV(dev); + + vfio_hct_detach_device(state); + + if (hct_data.hct_fd) { + qemu_close(hct_data.hct_fd); + hct_data.hct_fd = 0; + } +} + +static Property vfio_hct_properties[] = { + DEFINE_PROP_STRING("sysfsdev", HCTDevState, vdev.sysfsdev), + DEFINE_PROP_END_OF_LIST(), +}; + +static void vfio_ccp_compute_needs_reset(VFIODevice *vdev) +{ + vdev->needs_reset = false; +} + +struct VFIODeviceOps vfio_ccp_ops = { + .vfio_compute_needs_reset = vfio_ccp_compute_needs_reset, +}; + +/* create BAR2, BAR3 and BAR4 space for the virtual machine. */ +static int vfio_hct_region_mmap(HCTDevState *state) +{ + int ret; + int i; + struct vfio_region_info *info; + + for (i = 0; i < PCI_ROM_SLOT; i++) { + ret = vfio_get_region_info(&state->vdev, i, &info); + if (ret) + goto out; + + if (info->size) { + state->maps[i] = mmap(NULL, info->size, PROT_READ | PROT_WRITE, + MAP_SHARED, state->vdev.fd, info->offset); + if (state->maps[i] == MAP_FAILED) { + ret = -errno; + g_free(info); + error_report("vfio mmap fail\n"); + goto out; + } + } + g_free(info); + } + + memory_region_init_io(&state->mmio, OBJECT(state), &hct_mmio_ops, state, + "hct mmio", HCT_MMIO_SIZE); + memory_region_init_ram_device_ptr(&state->mmio, OBJECT(state), "hct mmio", + HCT_MMIO_SIZE, + state->maps[HCT_REG_BAR_IDX]); + + memory_region_init_io(&state->shared, OBJECT(state), &hct_mmio_ops, state, + "hct shared memory", PAGE_SIZE); + memory_region_init_ram_device_ptr( + &state->shared, OBJECT(state), "hct shared memory", PAGE_SIZE, + (void *)hct_data.hct_shared_memory + + state->sdev.shared_memory_offset * PAGE_SIZE); + + memory_region_init_io(&state->pasid, OBJECT(state), &hct_mmio_ops, state, + "hct pasid", PAGE_SIZE); + memory_region_init_ram_device_ptr(&state->pasid, OBJECT(state), "hct pasid", + PAGE_SIZE, hct_data.pasid_memory); + + pci_register_bar(&state->sdev.dev, HCT_REG_BAR_IDX, + PCI_BASE_ADDRESS_SPACE_MEMORY, &state->mmio); + pci_register_bar(&state->sdev.dev, HCT_SHARED_BAR_IDX, + PCI_BASE_ADDRESS_SPACE_MEMORY, &state->shared); + pci_register_bar(&state->sdev.dev, HCT_PASID_BAR_IDX, + PCI_BASE_ADDRESS_SPACE_MEMORY, &state->pasid); +out: + return ret; +} + +static int hct_check_duplicated_index(int index) +{ + int cnt; + for (cnt = 0; cnt < hct_data.ccp_cnt; cnt++) { + if (hct_data.ccp_index[cnt] == index) { + error_report("many mdev shouldn't be mapped to one ccp in a " + "virtual machine!\n"); + return -1; + } + } + + hct_data.ccp_index[hct_data.ccp_cnt++] = index; + return 0; +} + +static int hct_get_ccp_index(HCTDevState *state) +{ + char path[PATH_MAX]; + char buf[CCP_INDEX_BYTES]; + int fd; + int ret; + int ccp_index; + + snprintf(path, PATH_MAX, "%s/vendor/id", state->vdev.sysfsdev); + fd = qemu_open_old(path, O_RDONLY); + if (fd < 0) { + error_report("open %s fail\n", path); + return -errno; + } + + ret = read(fd, buf, sizeof(buf)); + if (ret < 0) { + ret = -errno; + error_report("read %s fail\n", path); + goto out; + } + + if (1 != sscanf(buf, "%d", &ccp_index)) { + ret = -errno; + error_report("format addr %s fail\n", 
buf); + goto out; + } + + if (!hct_check_duplicated_index(ccp_index)) { + state->sdev.shared_memory_offset = ccp_index; + } else { + ret = -1; + } + +out: + qemu_close(fd); + return ret; +} + +static int hct_api_version_check(void) +{ + struct hct_dev_ctrl ctrl; + int ret; + + ctrl.op = HCT_SHARE_OP_GET_VERSION; + memcpy(ctrl.version, DEF_VERSION_STRING, sizeof(DEF_VERSION_STRING)); + ret = ioctl(hct_data.hct_fd, HCT_SHARE_OP, &ctrl); + if (ret < 0) { + error_report("ret %d, errno %d: fail to get hct.ko version, please " + "update hct.ko to version 0.2.\n", + ret, errno); + return -1; + } else if (memcmp(ctrl.version, HCT_VERSION_STRING, + sizeof(HCT_VERSION_STRING)) < 0) { + error_report("The API version %s is larger than hct.ko version %s, " + "please update hct.ko to version 0.2\n", + HCT_VERSION_STRING, ctrl.version); + return -1; + } + + return 0; +} + +static int hct_shared_memory_init(void) +{ + int ret = 0; + + hct_data.hct_shared_memory = + mmap(NULL, HCT_SHARED_MEMORY_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, + hct_data.hct_fd, 0); + if (hct_data.hct_shared_memory == MAP_FAILED) { + ret = -errno; + error_report("map hct shared memory fail\n"); + goto out; + } + +out: + return ret; +} + +static void hct_listener_region_add(MemoryListener *listener, + MemoryRegionSection *section) +{ + struct hct_dev_ctrl ctrl; + hwaddr iova; + Int128 llend, llsize; + void *vaddr; + int ret; + + iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space); + llend = int128_make64(section->offset_within_address_space); + llend = int128_add(llend, section->size); + llend = int128_add(llend, int128_exts64(qemu_real_host_page_mask())); + + if (int128_ge(int128_make64(iova), llend)) { + return; + } + + if (!section->mr->ram) { + return; + } + + vaddr = memory_region_get_ram_ptr(section->mr) + + section->offset_within_region + + (iova - section->offset_within_address_space); + llsize = int128_sub(llend, int128_make64(iova)); + + ctrl.op = HCT_SHARE_OP_DMA_MAP; + ctrl.iova = iova | (hct_data.pasid << PASID_OFFSET); + ctrl.vaddr = (uint64_t)vaddr; + ctrl.size = llsize; + ret = ioctl(hct_data.hct_fd, HCT_SHARE_OP, &ctrl); + if (ret < 0) + error_report("VFIO_MAP_DMA: %d, iova=%lx", -errno, iova); +} + +static void hct_listener_region_del(MemoryListener *listener, + MemoryRegionSection *section) +{ + struct hct_dev_ctrl ctrl; + hwaddr iova; + Int128 llend, llsize; + int ret; + + iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space); + llend = int128_make64(section->offset_within_address_space); + llend = int128_add(llend, section->size); + llend = int128_add(llend, int128_exts64(qemu_real_host_page_mask())); + + if (int128_ge(int128_make64(iova), llend)) { + return; + } + + if (!section->mr->ram) { + return; + } + + llsize = int128_sub(llend, int128_make64(iova)); + + ctrl.op = HCT_SHARE_OP_DMA_UNMAP; + ctrl.iova = iova | (hct_data.pasid << PASID_OFFSET); + ctrl.size = llsize; + ret = ioctl(hct_data.hct_fd, HCT_SHARE_OP, &ctrl); + if (ret < 0) + error_report("VFIO_UNMAP_DMA: %d", -errno); +} + +static MemoryListener hct_memory_listener = { + .region_add = hct_listener_region_add, + .region_del = hct_listener_region_del, +}; + +static void hct_data_uninit(HCTDevState *state) +{ + if (hct_data.hct_fd) { + qemu_close(hct_data.hct_fd); + hct_data.hct_fd = 0; + } + + if (hct_data.pasid) { + hct_data.pasid = 0; + } + + if (hct_data.pasid_memory) { + munmap(hct_data.pasid_memory, PAGE_SIZE); + hct_data.pasid_memory = NULL; + } + + if (hct_data.hct_shared_memory) { + munmap((void 
*)hct_data.hct_shared_memory, HCT_SHARED_MEMORY_SIZE); + hct_data.hct_shared_memory = NULL; + } + + memory_listener_unregister(&hct_memory_listener); +} + +static int hct_data_init(HCTDevState *state) +{ + int ret; + + if (hct_data.init == 0) { + + hct_data.hct_fd = qemu_open_old(HCT_SHARE_DEV, O_RDWR); + if (hct_data.hct_fd < 0) { + error_report("fail to open %s, errno %d.", HCT_SHARE_DEV, errno); + ret = -errno; + goto out; + } + + /* The hct.ko version number needs not to be less than 0.2. */ + ret = hct_api_version_check(); + if (ret) + goto out; + + /* assign a page to the virtual BAR3 of each CCP. */ + ret = hct_shared_memory_init(); + if (ret) + goto out; + + hct_data.pasid_memory = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (hct_data.pasid_memory < 0) + goto unmap_shared_memory_exit; + + /* assign a unique pasid to each virtual machine. */ + ret = pasid_get_and_init(state); + if (ret < 0) + goto unmap_pasid_memory_exit; + + /* perform DMA_MAP and DMA_UNMAP operations on all memories of the + * virtual machine. */ + memory_listener_register(&hct_memory_listener, &address_space_memory); + + hct_data.init = 1; + } + + return hct_get_ccp_index(state); + +unmap_pasid_memory_exit: + munmap(hct_data.pasid_memory, PAGE_SIZE); + +unmap_shared_memory_exit: + munmap((void *)hct_data.hct_shared_memory, HCT_SHARED_MEMORY_SIZE); + +out: + return ret; +} + +/* When device is loaded */ +static void vfio_hct_realize(PCIDevice *pci_dev, Error **errp) +{ + int ret; + char *mdevid; + Error *err = NULL; + HCTDevState *state = PCI_HCT_DEV(pci_dev); + + /* parsing mdev device name from startup scripts */ + mdevid = g_path_get_basename(state->vdev.sysfsdev); + state->vdev.name = g_strdup_printf("%s", mdevid); + + ret = hct_data_init(state); + if (ret < 0) { + g_free(state->vdev.name); + goto out; + } + + ret = vfio_attach_device(state->vdev.name, &state->vdev, + pci_device_iommu_address_space(pci_dev), &err); + + if (ret) { + error_report("attach device failed, name = %s", state->vdev.name); + goto data_uninit_out; + } + + state->vdev.ops = &vfio_ccp_ops; + state->vdev.dev = &state->sdev.dev.qdev; + + ret = vfio_hct_region_mmap(state); + if (ret < 0) + goto detach_device_out; + + return; + +detach_device_out: + vfio_hct_detach_device(state); + +data_uninit_out: + hct_data_uninit(state); + +out: + return; +} + +static void hct_dev_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass); + + dc->desc = "HCT Device"; + device_class_set_props(dc, vfio_hct_properties); + + pdc->realize = vfio_hct_realize; + pdc->exit = vfio_hct_exit; + pdc->vendor_id = PCI_VENDOR_ID_HYGON_CCP; + pdc->device_id = PCI_DEVICE_ID_HYGON_CCP; + pdc->class_id = PCI_CLASS_CRYPT_OTHER; + set_bit(DEVICE_CATEGORY_MISC, dc->categories); + + return; +} + +static const TypeInfo pci_hct_info = { + .name = TYPE_HCT_DEV, + .parent = TYPE_PCI_DEVICE, + .instance_size = sizeof(HCTDevState), + .class_init = hct_dev_class_init, + .interfaces = + (InterfaceInfo[]){ + {INTERFACE_CONVENTIONAL_PCI_DEVICE}, + {}, + }, +}; + +static void hct_register_types(void) { + type_register_static(&pci_hct_info); +} + +type_init(hct_register_types); diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build index 2a6912c940..b1db4c8605 100644 --- a/hw/vfio/meson.build +++ b/hw/vfio/meson.build @@ -17,5 +17,6 @@ vfio_ss.add(when: 'CONFIG_VFIO_XGMAC', if_true: files('calxeda-xgmac.c')) vfio_ss.add(when: 'CONFIG_VFIO_AMD_XGBE', if_true: 
files('amd-xgbe.c')) vfio_ss.add(when: 'CONFIG_VFIO_AP', if_true: files('ap.c')) vfio_ss.add(when: 'CONFIG_VFIO_IGD', if_true: files('igd.c')) +vfio_ss.add(when: 'CONFIG_VFIO_HCT', if_true: files('hct.c')) specific_ss.add_all(when: 'CONFIG_VFIO', if_true: vfio_ss) -- Gitee From d95cbdd8738d61b8bc7c9a1541dade42c1f48314 Mon Sep 17 00:00:00 2001 From: adttil <2429917001@qq.com> Date: Thu, 1 Feb 2024 21:53:58 +0800 Subject: [PATCH 002/939] tests/qemu-iotests: resolve the failure of test case 108 in containers The loop device cannot be created in the container build environment. Therefore, a check for the existence of /dev/loop-control is added to the initialization of the loopdev variable. Signed-off-by: Adttil <2429917001@qq.com> --- tests/qemu-iotests/108 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/qemu-iotests/108 b/tests/qemu-iotests/108 index 54e935acf2..a6fe261265 100755 --- a/tests/qemu-iotests/108 +++ b/tests/qemu-iotests/108 @@ -55,7 +55,7 @@ _supported_os Linux _unsupported_imgopts 'refcount_bits=\([^1]\|.\([^6]\|$\)\)' data_file # This test either needs sudo -n losetup or FUSE exports to work -if sudo -n losetup &>/dev/null; then +if test -c "/dev/loop-control" && sudo -n losetup &>/dev/null; then loopdev=true else loopdev=false -- Gitee From f06b930da5d2acf70d142f1212ef4ee09d643b21 Mon Sep 17 00:00:00 2001 From: dinglimin Date: Tue, 27 Feb 2024 16:18:43 +0800 Subject: [PATCH 003/939] hw/usb: Style cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cherry-picked from 455177ffc457098b0103d2a09cb7ba5e260dfcdd We are going to modify these lines; fix their style first in order to avoid checkpatch.pl warnings. Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Richard Henderson Signed-off-by: Michael Tokarev Signed-off-by: dinglimin --- hw/usb/hcd-ehci.c | 3 ++- hw/usb/hcd-uhci.c | 6 ++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/hw/usb/hcd-ehci.c b/hw/usb/hcd-ehci.c index 19b4534c20..7b093acd98 100644 --- a/hw/usb/hcd-ehci.c +++ b/hw/usb/hcd-ehci.c @@ -1086,8 +1086,9 @@ static void ehci_opreg_write(void *ptr, hwaddr addr, case CONFIGFLAG: val &= 0x1; if (val) { - for(i = 0; i < NB_PORTS; i++) + for (i = 0; i < NB_PORTS; i++) { handle_port_owner_write(s, i, 0); + } } break; diff --git a/hw/usb/hcd-uhci.c b/hw/usb/hcd-uhci.c index 77baaa7a6b..6975966c3f 100644 --- a/hw/usb/hcd-uhci.c +++ b/hw/usb/hcd-uhci.c @@ -457,8 +457,9 @@ static void uhci_port_write(void *opaque, hwaddr addr, int n; n = (addr >> 1) & 7; - if (n >= NB_PORTS) + if (n >= NB_PORTS) { return; + } port = &s->ports[n]; dev = port->port.dev; if (dev && dev->attached) { @@ -513,8 +514,9 @@ static uint64_t uhci_port_read(void *opaque, hwaddr addr, unsigned size) UHCIPort *port; int n; n = (addr >> 1) & 7; - if (n >= NB_PORTS) + if (n >= NB_PORTS) { goto read_default; + } port = &s->ports[n]; val = port->ctrl; } -- Gitee From ffb0dcccbf5f6e662e7c0b6afa4fe7308d96cc06 Mon Sep 17 00:00:00 2001 From: dinglimin Date: Tue, 27 Feb 2024 17:06:01 +0800 Subject: [PATCH 004/939] virtio-gpu: Correct virgl_renderer_resource_get_info() error check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cherry-picked from 574b64aa6754ba491f51024c5a823a674d48a658 virgl_renderer_resource_get_info() returns errno and not -1 on error. Correct the return-value check.
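The corrected check follows this pattern (a minimal illustrative sketch; the exact call sites are updated in the hunks below):

    ret = virgl_renderer_resource_get_info(resource_id, &info);
    if (ret) {
        /* failure is reported as a non-zero errno value, not -1 */
        return ret;
    }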
Reviewed-by: Marc-André Lureau Signed-off-by: Dmitry Osipenko Message-Id: <20240129073921.446869-1-dmitry.osipenko@collabora.com> Cc: qemu-stable@nongnu.org Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Signed-off-by: dinglimin --- contrib/vhost-user-gpu/virgl.c | 6 +++--- hw/display/virtio-gpu-virgl.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/contrib/vhost-user-gpu/virgl.c b/contrib/vhost-user-gpu/virgl.c index d1ccdf7d06..51da0e3667 100644 --- a/contrib/vhost-user-gpu/virgl.c +++ b/contrib/vhost-user-gpu/virgl.c @@ -327,7 +327,7 @@ virgl_get_resource_info_modifiers(uint32_t resource_id, #ifdef VIRGL_RENDERER_RESOURCE_INFO_EXT_VERSION struct virgl_renderer_resource_info_ext info_ext; ret = virgl_renderer_resource_get_info_ext(resource_id, &info_ext); - if (ret < 0) { + if (ret) { return ret; } @@ -335,7 +335,7 @@ virgl_get_resource_info_modifiers(uint32_t resource_id, *modifiers = info_ext.modifiers; #else ret = virgl_renderer_resource_get_info(resource_id, info); - if (ret < 0) { + if (ret) { return ret; } @@ -372,7 +372,7 @@ virgl_cmd_set_scanout(VuGpu *g, uint64_t modifiers = 0; ret = virgl_get_resource_info_modifiers(ss.resource_id, &info, &modifiers); - if (ret == -1) { + if (ret) { g_critical("%s: illegal resource specified %d\n", __func__, ss.resource_id); cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID; diff --git a/hw/display/virtio-gpu-virgl.c b/hw/display/virtio-gpu-virgl.c index 8bb7a2c21f..9f34d0e661 100644 --- a/hw/display/virtio-gpu-virgl.c +++ b/hw/display/virtio-gpu-virgl.c @@ -181,7 +181,7 @@ static void virgl_cmd_set_scanout(VirtIOGPU *g, memset(&info, 0, sizeof(info)); ret = virgl_renderer_resource_get_info(ss.resource_id, &info); #endif - if (ret == -1) { + if (ret) { qemu_log_mask(LOG_GUEST_ERROR, "%s: illegal resource specified %d\n", __func__, ss.resource_id); -- Gitee From f8ed9dd954fbd558d549c7c2e2ab7322107218a1 Mon Sep 17 00:00:00 2001 From: dinglimin Date: Tue, 27 Feb 2024 17:40:21 +0800 Subject: [PATCH 005/939] hw/i2c/smbus_slave: Add object path on error prints MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cherry-picked from fcc8299e29816c9b6f8d9766254fce6e8a50ee52 The current logging doesn't tell us which specific smbus device is in an error state. Signed-off-by: Joe Komlodi Reviewed-by: Peter Maydell Reviewed-by: Philippe Mathieu-Daudé Message-ID: <20240202204847.2062798-3-komlodi@google.com> Signed-off-by: Philippe Mathieu-Daudé Signed-off-by: dinglimin --- hw/i2c/smbus_slave.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/hw/i2c/smbus_slave.c b/hw/i2c/smbus_slave.c index 2ef2c7c5f6..b35516a404 100644 --- a/hw/i2c/smbus_slave.c +++ b/hw/i2c/smbus_slave.c @@ -25,11 +25,15 @@ #define DPRINTF(fmt, ...) \ do { printf("smbus(%02x): " fmt , dev->i2c.address, ## __VA_ARGS__); } while (0) #define BADF(fmt, ...) \ -do { fprintf(stderr, "smbus: error: " fmt , ## __VA_ARGS__); exit(1);} while (0) +do { g_autofree char *qom_path = object_get_canonical_path(OBJECT(dev)); \ + fprintf(stderr, "%s: smbus: error: " fmt , qom_path, ## __VA_ARGS__); \ + exit(1); } while (0) #else #define DPRINTF(fmt, ...) do {} while(0) #define BADF(fmt, ...) 
\ -do { fprintf(stderr, "smbus: error: " fmt , ## __VA_ARGS__);} while (0) +do { g_autofree char *qom_path = object_get_canonical_path(OBJECT(dev)); \ + fprintf(stderr, "%s: smbus: error: " fmt , qom_path, ## __VA_ARGS__); \ + } while (0) #endif enum { -- Gitee From c93d512dddb00e3eed2ce9484c55f5f1fbb54c8b Mon Sep 17 00:00:00 2001 From: dinglimin Date: Tue, 27 Feb 2024 19:02:52 +0800 Subject: [PATCH 006/939] blkio: Respect memory-alignment for bounce buffer allocations cheery-pick from 10b2393e5e7f4c1d633f1ac8578465681c333efb blkio_alloc_mem_region() requires that the requested buffer size is a multiple of the memory-alignment property. If it isn't, the allocation fails with a return value of -EINVAL. Fix the call in blkio_resize_bounce_pool() to make sure the requested size is properly aligned. I observed this problem with vhost-vdpa, which requires page aligned memory. As the virtio-blk device behind it still had 512 byte blocks, we got bs->bl.request_alignment = 512, but actually any request that needed a bounce buffer and was not aligned to 4k would fail without this fix. Suggested-by: Stefano Garzarella Signed-off-by: Kevin Wolf Message-ID: <20240131173140.42398-1-kwolf@redhat.com> Reviewed-by: Stefan Hajnoczi Reviewed-by: Stefano Garzarella Signed-off-by: Kevin Wolf Signed-off-by: dinglimin --- block/blkio.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/block/blkio.c b/block/blkio.c index 0a0a6c0f5f..b989617608 100644 --- a/block/blkio.c +++ b/block/blkio.c @@ -89,6 +89,9 @@ static int blkio_resize_bounce_pool(BDRVBlkioState *s, int64_t bytes) /* Pad size to reduce frequency of resize calls */ bytes += 128 * 1024; + /* Align the pool size to avoid blkio_alloc_mem_region() failure */ + bytes = QEMU_ALIGN_UP(bytes, s->mem_region_alignment); + WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { int ret; -- Gitee From 70e7ffec16e91138309ad3f76588cbd10c084394 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Mon, 27 Nov 2023 12:02:31 +0800 Subject: [PATCH 007/939] hw/loongarch/virt: Align high memory base address with super page size With LoongArch virt machine, there is low memory space with region 0--0x10000000, and high memory space with started from 0x90000000. High memory space is aligned with 256M, it will be better if it is aligned with 1G, which is super page aligned for 4K page size. Currently linux kernel and uefi bios has no limitation with high memory base address, it is ok to set high memory base address with 0x80000000. Signed-off-by: Bibo Mao Reviewed-by: Song Gao Message-Id: <20231127040231.4123715-1-maobibo@loongson.cn> Signed-off-by: Song Gao --- include/hw/loongarch/virt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/hw/loongarch/virt.h b/include/hw/loongarch/virt.h index 674f4655e0..db0831b471 100644 --- a/include/hw/loongarch/virt.h +++ b/include/hw/loongarch/virt.h @@ -25,7 +25,7 @@ #define VIRT_LOWMEM_BASE 0 #define VIRT_LOWMEM_SIZE 0x10000000 -#define VIRT_HIGHMEM_BASE 0x90000000 +#define VIRT_HIGHMEM_BASE 0x80000000 #define VIRT_GED_EVT_ADDR 0x100e0000 #define VIRT_GED_MEM_ADDR (VIRT_GED_EVT_ADDR + ACPI_GED_EVT_SEL_LEN) #define VIRT_GED_REG_ADDR (VIRT_GED_MEM_ADDR + MEMORY_HOTPLUG_IO_LEN) -- Gitee From 8a43c9379651fbf9d015240d6dc7c4b90ce98683 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Wed, 6 Dec 2023 16:18:39 +0800 Subject: [PATCH 008/939] target/loongarch: Add timer information dump support Timer emulation sometimes is problematic especially when vm is running in kvm mode. 
This patch adds dump support for the timer-related registers, so that it is easier to track down such problems. Signed-off-by: Bibo Mao Reviewed-by: Song Gao Message-Id: <20231206081839.2290178-1-maobibo@loongson.cn> Signed-off-by: Song Gao --- target/loongarch/cpu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c index fc075952e6..db9a421cc4 100644 --- a/target/loongarch/cpu.c +++ b/target/loongarch/cpu.c @@ -762,6 +762,8 @@ void loongarch_cpu_dump_state(CPUState *cs, FILE *f, int flags) qemu_fprintf(f, "TLBRENTRY=%016" PRIx64 "\n", env->CSR_TLBRENTRY); qemu_fprintf(f, "TLBRBADV=%016" PRIx64 "\n", env->CSR_TLBRBADV); qemu_fprintf(f, "TLBRERA=%016" PRIx64 "\n", env->CSR_TLBRERA); + qemu_fprintf(f, "TCFG=%016" PRIx64 "\n", env->CSR_TCFG); + qemu_fprintf(f, "TVAL=%016" PRIx64 "\n", env->CSR_TVAL); /* fpr */ if (flags & CPU_DUMP_FPU) { -- Gitee From ae65e1281aa67713bde6bce323a3a8d06f27c636 Mon Sep 17 00:00:00 2001 From: Song Gao Date: Tue, 2 Jan 2024 10:01:59 +0800 Subject: [PATCH 009/939] target/loongarch/meson: move gdbstub.c to loongarch.ss MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gdbstub.c is not specific to TCG and can be used by other accelerators, such as the KVM accelerator. Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Song Gao Message-Id: <20240102020200.3462097-1-gaosong@loongson.cn> --- target/loongarch/meson.build | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target/loongarch/meson.build b/target/loongarch/meson.build index 18e8191e2b..b3a0fb12fb 100644 --- a/target/loongarch/meson.build +++ b/target/loongarch/meson.build @@ -3,6 +3,7 @@ gen = decodetree.process('insns.decode') loongarch_ss = ss.source_set() loongarch_ss.add(files( 'cpu.c', + 'gdbstub.c', )) loongarch_tcg_ss = ss.source_set() loongarch_tcg_ss.add(gen) @@ -10,7 +11,6 @@ loongarch_tcg_ss.add(files( 'fpu_helper.c', 'op_helper.c', 'translate.c', - 'gdbstub.c', 'vec_helper.c', )) loongarch_tcg_ss.add(zlib) -- Gitee From eef77dd5b0d292d8a0276c820fc8fee24de0d898 Mon Sep 17 00:00:00 2001 From: Song Gao Date: Tue, 2 Jan 2024 10:02:00 +0800 Subject: [PATCH 010/939] target/loongarch: move translate modules to tcg/ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce the target/loongarch/tcg directory. 
Its purpose is to hold the TCG code that is selected by CONFIG_TCG Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Song Gao Message-Id: <20240102020200.3462097-2-gaosong@loongson.cn> --- target/loongarch/meson.build | 15 +-------------- target/loongarch/{ => tcg}/constant_timer.c | 0 target/loongarch/{ => tcg}/csr_helper.c | 0 target/loongarch/{ => tcg}/fpu_helper.c | 0 .../{ => tcg}/insn_trans/trans_arith.c.inc | 0 .../{ => tcg}/insn_trans/trans_atomic.c.inc | 0 .../{ => tcg}/insn_trans/trans_bit.c.inc | 0 .../{ => tcg}/insn_trans/trans_branch.c.inc | 0 .../{ => tcg}/insn_trans/trans_extra.c.inc | 0 .../{ => tcg}/insn_trans/trans_farith.c.inc | 0 .../{ => tcg}/insn_trans/trans_fcmp.c.inc | 0 .../{ => tcg}/insn_trans/trans_fcnv.c.inc | 0 .../{ => tcg}/insn_trans/trans_fmemory.c.inc | 0 .../{ => tcg}/insn_trans/trans_fmov.c.inc | 0 .../{ => tcg}/insn_trans/trans_memory.c.inc | 0 .../insn_trans/trans_privileged.c.inc | 0 .../{ => tcg}/insn_trans/trans_shift.c.inc | 0 .../{ => tcg}/insn_trans/trans_vec.c.inc | 0 target/loongarch/{ => tcg}/iocsr_helper.c | 0 target/loongarch/tcg/meson.build | 19 +++++++++++++++++++ target/loongarch/{ => tcg}/op_helper.c | 0 target/loongarch/{ => tcg}/tlb_helper.c | 0 target/loongarch/{ => tcg}/translate.c | 0 target/loongarch/{ => tcg}/vec_helper.c | 0 24 files changed, 20 insertions(+), 14 deletions(-) rename target/loongarch/{ => tcg}/constant_timer.c (100%) rename target/loongarch/{ => tcg}/csr_helper.c (100%) rename target/loongarch/{ => tcg}/fpu_helper.c (100%) rename target/loongarch/{ => tcg}/insn_trans/trans_arith.c.inc (100%) rename target/loongarch/{ => tcg}/insn_trans/trans_atomic.c.inc (100%) rename target/loongarch/{ => tcg}/insn_trans/trans_bit.c.inc (100%) rename target/loongarch/{ => tcg}/insn_trans/trans_branch.c.inc (100%) rename target/loongarch/{ => tcg}/insn_trans/trans_extra.c.inc (100%) rename target/loongarch/{ => tcg}/insn_trans/trans_farith.c.inc (100%) rename target/loongarch/{ => tcg}/insn_trans/trans_fcmp.c.inc (100%) rename target/loongarch/{ => tcg}/insn_trans/trans_fcnv.c.inc (100%) rename target/loongarch/{ => tcg}/insn_trans/trans_fmemory.c.inc (100%) rename target/loongarch/{ => tcg}/insn_trans/trans_fmov.c.inc (100%) rename target/loongarch/{ => tcg}/insn_trans/trans_memory.c.inc (100%) rename target/loongarch/{ => tcg}/insn_trans/trans_privileged.c.inc (100%) rename target/loongarch/{ => tcg}/insn_trans/trans_shift.c.inc (100%) rename target/loongarch/{ => tcg}/insn_trans/trans_vec.c.inc (100%) rename target/loongarch/{ => tcg}/iocsr_helper.c (100%) create mode 100644 target/loongarch/tcg/meson.build rename target/loongarch/{ => tcg}/op_helper.c (100%) rename target/loongarch/{ => tcg}/tlb_helper.c (100%) rename target/loongarch/{ => tcg}/translate.c (100%) rename target/loongarch/{ => tcg}/vec_helper.c (100%) diff --git a/target/loongarch/meson.build b/target/loongarch/meson.build index b3a0fb12fb..e84e4c51f4 100644 --- a/target/loongarch/meson.build +++ b/target/loongarch/meson.build @@ -5,29 +5,16 @@ loongarch_ss.add(files( 'cpu.c', 'gdbstub.c', )) -loongarch_tcg_ss = ss.source_set() -loongarch_tcg_ss.add(gen) -loongarch_tcg_ss.add(files( - 'fpu_helper.c', - 'op_helper.c', - 'translate.c', - 'vec_helper.c', -)) -loongarch_tcg_ss.add(zlib) loongarch_system_ss = ss.source_set() loongarch_system_ss.add(files( 'loongarch-qmp-cmds.c', 'machine.c', - 'tlb_helper.c', - 'constant_timer.c', - 'csr_helper.c', - 'iocsr_helper.c', )) common_ss.add(when: 'CONFIG_LOONGARCH_DIS', if_true: [files('disas.c'), gen]) 
-loongarch_ss.add_all(when: 'CONFIG_TCG', if_true: [loongarch_tcg_ss]) +subdir('tcg') target_arch += {'loongarch': loongarch_ss} target_system_arch += {'loongarch': loongarch_system_ss} diff --git a/target/loongarch/constant_timer.c b/target/loongarch/tcg/constant_timer.c similarity index 100% rename from target/loongarch/constant_timer.c rename to target/loongarch/tcg/constant_timer.c diff --git a/target/loongarch/csr_helper.c b/target/loongarch/tcg/csr_helper.c similarity index 100% rename from target/loongarch/csr_helper.c rename to target/loongarch/tcg/csr_helper.c diff --git a/target/loongarch/fpu_helper.c b/target/loongarch/tcg/fpu_helper.c similarity index 100% rename from target/loongarch/fpu_helper.c rename to target/loongarch/tcg/fpu_helper.c diff --git a/target/loongarch/insn_trans/trans_arith.c.inc b/target/loongarch/tcg/insn_trans/trans_arith.c.inc similarity index 100% rename from target/loongarch/insn_trans/trans_arith.c.inc rename to target/loongarch/tcg/insn_trans/trans_arith.c.inc diff --git a/target/loongarch/insn_trans/trans_atomic.c.inc b/target/loongarch/tcg/insn_trans/trans_atomic.c.inc similarity index 100% rename from target/loongarch/insn_trans/trans_atomic.c.inc rename to target/loongarch/tcg/insn_trans/trans_atomic.c.inc diff --git a/target/loongarch/insn_trans/trans_bit.c.inc b/target/loongarch/tcg/insn_trans/trans_bit.c.inc similarity index 100% rename from target/loongarch/insn_trans/trans_bit.c.inc rename to target/loongarch/tcg/insn_trans/trans_bit.c.inc diff --git a/target/loongarch/insn_trans/trans_branch.c.inc b/target/loongarch/tcg/insn_trans/trans_branch.c.inc similarity index 100% rename from target/loongarch/insn_trans/trans_branch.c.inc rename to target/loongarch/tcg/insn_trans/trans_branch.c.inc diff --git a/target/loongarch/insn_trans/trans_extra.c.inc b/target/loongarch/tcg/insn_trans/trans_extra.c.inc similarity index 100% rename from target/loongarch/insn_trans/trans_extra.c.inc rename to target/loongarch/tcg/insn_trans/trans_extra.c.inc diff --git a/target/loongarch/insn_trans/trans_farith.c.inc b/target/loongarch/tcg/insn_trans/trans_farith.c.inc similarity index 100% rename from target/loongarch/insn_trans/trans_farith.c.inc rename to target/loongarch/tcg/insn_trans/trans_farith.c.inc diff --git a/target/loongarch/insn_trans/trans_fcmp.c.inc b/target/loongarch/tcg/insn_trans/trans_fcmp.c.inc similarity index 100% rename from target/loongarch/insn_trans/trans_fcmp.c.inc rename to target/loongarch/tcg/insn_trans/trans_fcmp.c.inc diff --git a/target/loongarch/insn_trans/trans_fcnv.c.inc b/target/loongarch/tcg/insn_trans/trans_fcnv.c.inc similarity index 100% rename from target/loongarch/insn_trans/trans_fcnv.c.inc rename to target/loongarch/tcg/insn_trans/trans_fcnv.c.inc diff --git a/target/loongarch/insn_trans/trans_fmemory.c.inc b/target/loongarch/tcg/insn_trans/trans_fmemory.c.inc similarity index 100% rename from target/loongarch/insn_trans/trans_fmemory.c.inc rename to target/loongarch/tcg/insn_trans/trans_fmemory.c.inc diff --git a/target/loongarch/insn_trans/trans_fmov.c.inc b/target/loongarch/tcg/insn_trans/trans_fmov.c.inc similarity index 100% rename from target/loongarch/insn_trans/trans_fmov.c.inc rename to target/loongarch/tcg/insn_trans/trans_fmov.c.inc diff --git a/target/loongarch/insn_trans/trans_memory.c.inc b/target/loongarch/tcg/insn_trans/trans_memory.c.inc similarity index 100% rename from target/loongarch/insn_trans/trans_memory.c.inc rename to target/loongarch/tcg/insn_trans/trans_memory.c.inc diff --git 
a/target/loongarch/insn_trans/trans_privileged.c.inc b/target/loongarch/tcg/insn_trans/trans_privileged.c.inc similarity index 100% rename from target/loongarch/insn_trans/trans_privileged.c.inc rename to target/loongarch/tcg/insn_trans/trans_privileged.c.inc diff --git a/target/loongarch/insn_trans/trans_shift.c.inc b/target/loongarch/tcg/insn_trans/trans_shift.c.inc similarity index 100% rename from target/loongarch/insn_trans/trans_shift.c.inc rename to target/loongarch/tcg/insn_trans/trans_shift.c.inc diff --git a/target/loongarch/insn_trans/trans_vec.c.inc b/target/loongarch/tcg/insn_trans/trans_vec.c.inc similarity index 100% rename from target/loongarch/insn_trans/trans_vec.c.inc rename to target/loongarch/tcg/insn_trans/trans_vec.c.inc diff --git a/target/loongarch/iocsr_helper.c b/target/loongarch/tcg/iocsr_helper.c similarity index 100% rename from target/loongarch/iocsr_helper.c rename to target/loongarch/tcg/iocsr_helper.c diff --git a/target/loongarch/tcg/meson.build b/target/loongarch/tcg/meson.build new file mode 100644 index 0000000000..1a3cd589fb --- /dev/null +++ b/target/loongarch/tcg/meson.build @@ -0,0 +1,19 @@ +if 'CONFIG_TCG' not in config_all + subdir_done() +endif + +loongarch_ss.add([zlib, gen]) + +loongarch_ss.add(files( + 'fpu_helper.c', + 'op_helper.c', + 'translate.c', + 'vec_helper.c', +)) + +loongarch_system_ss.add(files( + 'constant_timer.c', + 'csr_helper.c', + 'iocsr_helper.c', + 'tlb_helper.c', +)) diff --git a/target/loongarch/op_helper.c b/target/loongarch/tcg/op_helper.c similarity index 100% rename from target/loongarch/op_helper.c rename to target/loongarch/tcg/op_helper.c diff --git a/target/loongarch/tlb_helper.c b/target/loongarch/tcg/tlb_helper.c similarity index 100% rename from target/loongarch/tlb_helper.c rename to target/loongarch/tcg/tlb_helper.c diff --git a/target/loongarch/translate.c b/target/loongarch/tcg/translate.c similarity index 100% rename from target/loongarch/translate.c rename to target/loongarch/tcg/translate.c diff --git a/target/loongarch/vec_helper.c b/target/loongarch/tcg/vec_helper.c similarity index 100% rename from target/loongarch/vec_helper.c rename to target/loongarch/tcg/vec_helper.c -- Gitee From 9904eb7d4559baca2da713346cd505a80af7e776 Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Mon, 18 Dec 2023 17:43:18 -0300 Subject: [PATCH 011/939] linux-headers: Update to Linux v6.7-rc5 We'll add a new RISC-V linux-header file, but first let's update all headers. Headers for 'asm-loongarch' were added in this update. 
Signed-off-by: Daniel Henrique Barboza Acked-by: Alistair Francis Message-ID: <20231218204321.75757-2-dbarboza@ventanamicro.com> Signed-off-by: Alistair Francis --- include/standard-headers/drm/drm_fourcc.h | 2 + include/standard-headers/linux/pci_regs.h | 24 ++- include/standard-headers/linux/vhost_types.h | 7 + .../standard-headers/linux/virtio_config.h | 5 + include/standard-headers/linux/virtio_pci.h | 11 ++ linux-headers/asm-arm64/kvm.h | 32 ++++ linux-headers/asm-generic/unistd.h | 14 +- linux-headers/asm-loongarch/bitsperlong.h | 1 + linux-headers/asm-loongarch/kvm.h | 108 +++++++++++ linux-headers/asm-loongarch/mman.h | 1 + linux-headers/asm-loongarch/unistd.h | 5 + linux-headers/asm-mips/unistd_n32.h | 4 + linux-headers/asm-mips/unistd_n64.h | 4 + linux-headers/asm-mips/unistd_o32.h | 4 + linux-headers/asm-powerpc/unistd_32.h | 4 + linux-headers/asm-powerpc/unistd_64.h | 4 + linux-headers/asm-riscv/kvm.h | 12 ++ linux-headers/asm-s390/unistd_32.h | 4 + linux-headers/asm-s390/unistd_64.h | 4 + linux-headers/asm-x86/unistd_32.h | 4 + linux-headers/asm-x86/unistd_64.h | 3 + linux-headers/asm-x86/unistd_x32.h | 3 + linux-headers/linux/iommufd.h | 180 +++++++++++++++++- linux-headers/linux/kvm.h | 11 ++ linux-headers/linux/psp-sev.h | 1 + linux-headers/linux/stddef.h | 9 +- linux-headers/linux/userfaultfd.h | 9 +- linux-headers/linux/vfio.h | 47 +++-- linux-headers/linux/vhost.h | 8 + 29 files changed, 498 insertions(+), 27 deletions(-) create mode 100644 linux-headers/asm-loongarch/bitsperlong.h create mode 100644 linux-headers/asm-loongarch/kvm.h create mode 100644 linux-headers/asm-loongarch/mman.h create mode 100644 linux-headers/asm-loongarch/unistd.h diff --git a/include/standard-headers/drm/drm_fourcc.h b/include/standard-headers/drm/drm_fourcc.h index 72279f4d25..3afb70160f 100644 --- a/include/standard-headers/drm/drm_fourcc.h +++ b/include/standard-headers/drm/drm_fourcc.h @@ -322,6 +322,8 @@ extern "C" { * index 1 = Cr:Cb plane, [39:0] Cr1:Cb1:Cr0:Cb0 little endian */ #define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5') /* 2x2 subsampled Cr:Cb plane */ +#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0') /* 2x1 subsampled Cr:Cb plane */ +#define DRM_FORMAT_NV30 fourcc_code('N', 'V', '3', '0') /* non-subsampled Cr:Cb plane */ /* * 2 plane YCbCr MSB aligned diff --git a/include/standard-headers/linux/pci_regs.h b/include/standard-headers/linux/pci_regs.h index e5f558d964..a39193213f 100644 --- a/include/standard-headers/linux/pci_regs.h +++ b/include/standard-headers/linux/pci_regs.h @@ -80,6 +80,7 @@ #define PCI_HEADER_TYPE_NORMAL 0 #define PCI_HEADER_TYPE_BRIDGE 1 #define PCI_HEADER_TYPE_CARDBUS 2 +#define PCI_HEADER_TYPE_MFD 0x80 /* Multi-Function Device (possible) */ #define PCI_BIST 0x0f /* 8 bits */ #define PCI_BIST_CODE_MASK 0x0f /* Return result */ @@ -637,6 +638,7 @@ #define PCI_EXP_RTCAP 0x1e /* Root Capabilities */ #define PCI_EXP_RTCAP_CRSVIS 0x0001 /* CRS Software Visibility capability */ #define PCI_EXP_RTSTA 0x20 /* Root Status */ +#define PCI_EXP_RTSTA_PME_RQ_ID 0x0000ffff /* PME Requester ID */ #define PCI_EXP_RTSTA_PME 0x00010000 /* PME status */ #define PCI_EXP_RTSTA_PENDING 0x00020000 /* PME pending */ /* @@ -930,12 +932,13 @@ /* Process Address Space ID */ #define PCI_PASID_CAP 0x04 /* PASID feature register */ -#define PCI_PASID_CAP_EXEC 0x02 /* Exec permissions Supported */ -#define PCI_PASID_CAP_PRIV 0x04 /* Privilege Mode Supported */ +#define PCI_PASID_CAP_EXEC 0x0002 /* Exec permissions Supported */ +#define PCI_PASID_CAP_PRIV 0x0004 /* 
Privilege Mode Supported */ +#define PCI_PASID_CAP_WIDTH 0x1f00 #define PCI_PASID_CTRL 0x06 /* PASID control register */ -#define PCI_PASID_CTRL_ENABLE 0x01 /* Enable bit */ -#define PCI_PASID_CTRL_EXEC 0x02 /* Exec permissions Enable */ -#define PCI_PASID_CTRL_PRIV 0x04 /* Privilege Mode Enable */ +#define PCI_PASID_CTRL_ENABLE 0x0001 /* Enable bit */ +#define PCI_PASID_CTRL_EXEC 0x0002 /* Exec permissions Enable */ +#define PCI_PASID_CTRL_PRIV 0x0004 /* Privilege Mode Enable */ #define PCI_EXT_CAP_PASID_SIZEOF 8 /* Single Root I/O Virtualization */ @@ -975,6 +978,8 @@ #define PCI_LTR_VALUE_MASK 0x000003ff #define PCI_LTR_SCALE_MASK 0x00001c00 #define PCI_LTR_SCALE_SHIFT 10 +#define PCI_LTR_NOSNOOP_VALUE 0x03ff0000 /* Max No-Snoop Latency Value */ +#define PCI_LTR_NOSNOOP_SCALE 0x1c000000 /* Scale for Max Value */ #define PCI_EXT_CAP_LTR_SIZEOF 8 /* Access Control Service */ @@ -1042,9 +1047,16 @@ #define PCI_EXP_DPC_STATUS 0x08 /* DPC Status */ #define PCI_EXP_DPC_STATUS_TRIGGER 0x0001 /* Trigger Status */ #define PCI_EXP_DPC_STATUS_TRIGGER_RSN 0x0006 /* Trigger Reason */ +#define PCI_EXP_DPC_STATUS_TRIGGER_RSN_UNCOR 0x0000 /* Uncorrectable error */ +#define PCI_EXP_DPC_STATUS_TRIGGER_RSN_NFE 0x0002 /* Rcvd ERR_NONFATAL */ +#define PCI_EXP_DPC_STATUS_TRIGGER_RSN_FE 0x0004 /* Rcvd ERR_FATAL */ +#define PCI_EXP_DPC_STATUS_TRIGGER_RSN_IN_EXT 0x0006 /* Reason in Trig Reason Extension field */ #define PCI_EXP_DPC_STATUS_INTERRUPT 0x0008 /* Interrupt Status */ #define PCI_EXP_DPC_RP_BUSY 0x0010 /* Root Port Busy */ #define PCI_EXP_DPC_STATUS_TRIGGER_RSN_EXT 0x0060 /* Trig Reason Extension */ +#define PCI_EXP_DPC_STATUS_TRIGGER_RSN_RP_PIO 0x0000 /* RP PIO error */ +#define PCI_EXP_DPC_STATUS_TRIGGER_RSN_SW_TRIGGER 0x0020 /* DPC SW Trigger bit */ +#define PCI_EXP_DPC_RP_PIO_FEP 0x1f00 /* RP PIO First Err Ptr */ #define PCI_EXP_DPC_SOURCE_ID 0x0A /* DPC Source Identifier */ @@ -1088,6 +1100,8 @@ #define PCI_L1SS_CTL1_LTR_L12_TH_VALUE 0x03ff0000 /* LTR_L1.2_THRESHOLD_Value */ #define PCI_L1SS_CTL1_LTR_L12_TH_SCALE 0xe0000000 /* LTR_L1.2_THRESHOLD_Scale */ #define PCI_L1SS_CTL2 0x0c /* Control 2 Register */ +#define PCI_L1SS_CTL2_T_PWR_ON_SCALE 0x00000003 /* T_POWER_ON Scale */ +#define PCI_L1SS_CTL2_T_PWR_ON_VALUE 0x000000f8 /* T_POWER_ON Value */ /* Designated Vendor-Specific (DVSEC, PCI_EXT_CAP_ID_DVSEC) */ #define PCI_DVSEC_HEADER1 0x4 /* Designated Vendor-Specific Header1 */ diff --git a/include/standard-headers/linux/vhost_types.h b/include/standard-headers/linux/vhost_types.h index 5ad07e134a..fd54044936 100644 --- a/include/standard-headers/linux/vhost_types.h +++ b/include/standard-headers/linux/vhost_types.h @@ -185,5 +185,12 @@ struct vhost_vdpa_iova_range { * DRIVER_OK */ #define VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK 0x6 +/* Device may expose the virtqueue's descriptor area, driver area and + * device area to a different group for ASID binding than where its + * buffers may reside. Requires VHOST_BACKEND_F_IOTLB_ASID. 
+ */ +#define VHOST_BACKEND_F_DESC_ASID 0x7 +/* IOTLB don't flush memory mapping across device reset */ +#define VHOST_BACKEND_F_IOTLB_PERSIST 0x8 #endif diff --git a/include/standard-headers/linux/virtio_config.h b/include/standard-headers/linux/virtio_config.h index 8a7d0dc8b0..bfd1ca643e 100644 --- a/include/standard-headers/linux/virtio_config.h +++ b/include/standard-headers/linux/virtio_config.h @@ -103,6 +103,11 @@ */ #define VIRTIO_F_NOTIFICATION_DATA 38 +/* This feature indicates that the driver uses the data provided by the device + * as a virtqueue identifier in available buffer notifications. + */ +#define VIRTIO_F_NOTIF_CONFIG_DATA 39 + /* * This feature indicates that the driver can reset a queue individually. */ diff --git a/include/standard-headers/linux/virtio_pci.h b/include/standard-headers/linux/virtio_pci.h index be912cfc95..b7fdfd0668 100644 --- a/include/standard-headers/linux/virtio_pci.h +++ b/include/standard-headers/linux/virtio_pci.h @@ -166,6 +166,17 @@ struct virtio_pci_common_cfg { uint32_t queue_used_hi; /* read-write */ }; +/* + * Warning: do not use sizeof on this: use offsetofend for + * specific fields you need. + */ +struct virtio_pci_modern_common_cfg { + struct virtio_pci_common_cfg cfg; + + uint16_t queue_notify_data; /* read-write */ + uint16_t queue_reset; /* read-write */ +}; + /* Fields in VIRTIO_PCI_CAP_PCI_CFG: */ struct virtio_pci_cfg_cap { struct virtio_pci_cap cap; diff --git a/linux-headers/asm-arm64/kvm.h b/linux-headers/asm-arm64/kvm.h index 38e5957526..c59ea55cd8 100644 --- a/linux-headers/asm-arm64/kvm.h +++ b/linux-headers/asm-arm64/kvm.h @@ -491,6 +491,38 @@ struct kvm_smccc_filter { #define KVM_HYPERCALL_EXIT_SMC (1U << 0) #define KVM_HYPERCALL_EXIT_16BIT (1U << 1) +/* + * Get feature ID registers userspace writable mask. + * + * From DDI0487J.a, D19.2.66 ("ID_AA64MMFR2_EL1, AArch64 Memory Model + * Feature Register 2"): + * + * "The Feature ID space is defined as the System register space in + * AArch64 with op0==3, op1=={0, 1, 3}, CRn==0, CRm=={0-7}, + * op2=={0-7}." + * + * This covers all currently known R/O registers that indicate + * anything useful feature wise, including the ID registers. + * + * If we ever need to introduce a new range, it will be described as + * such in the range field. 
+ */ +#define KVM_ARM_FEATURE_ID_RANGE_IDX(op0, op1, crn, crm, op2) \ + ({ \ + __u64 __op1 = (op1) & 3; \ + __op1 -= (__op1 == 3); \ + (__op1 << 6 | ((crm) & 7) << 3 | (op2)); \ + }) + +#define KVM_ARM_FEATURE_ID_RANGE 0 +#define KVM_ARM_FEATURE_ID_RANGE_SIZE (3 * 8 * 8) + +struct reg_mask_range { + __u64 addr; /* Pointer to mask array */ + __u32 range; /* Requested range */ + __u32 reserved[13]; +}; + #endif #endif /* __ARM_KVM_H__ */ diff --git a/linux-headers/asm-generic/unistd.h b/linux-headers/asm-generic/unistd.h index abe087c53b..756b013fb8 100644 --- a/linux-headers/asm-generic/unistd.h +++ b/linux-headers/asm-generic/unistd.h @@ -71,7 +71,7 @@ __SYSCALL(__NR_fremovexattr, sys_fremovexattr) #define __NR_getcwd 17 __SYSCALL(__NR_getcwd, sys_getcwd) #define __NR_lookup_dcookie 18 -__SC_COMP(__NR_lookup_dcookie, sys_lookup_dcookie, compat_sys_lookup_dcookie) +__SYSCALL(__NR_lookup_dcookie, sys_ni_syscall) #define __NR_eventfd2 19 __SYSCALL(__NR_eventfd2, sys_eventfd2) #define __NR_epoll_create1 20 @@ -816,15 +816,21 @@ __SYSCALL(__NR_process_mrelease, sys_process_mrelease) __SYSCALL(__NR_futex_waitv, sys_futex_waitv) #define __NR_set_mempolicy_home_node 450 __SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node) - #define __NR_cachestat 451 __SYSCALL(__NR_cachestat, sys_cachestat) - #define __NR_fchmodat2 452 __SYSCALL(__NR_fchmodat2, sys_fchmodat2) +#define __NR_map_shadow_stack 453 +__SYSCALL(__NR_map_shadow_stack, sys_map_shadow_stack) +#define __NR_futex_wake 454 +__SYSCALL(__NR_futex_wake, sys_futex_wake) +#define __NR_futex_wait 455 +__SYSCALL(__NR_futex_wait, sys_futex_wait) +#define __NR_futex_requeue 456 +__SYSCALL(__NR_futex_requeue, sys_futex_requeue) #undef __NR_syscalls -#define __NR_syscalls 453 +#define __NR_syscalls 457 /* * 32 bit systems traditionally used different diff --git a/linux-headers/asm-loongarch/bitsperlong.h b/linux-headers/asm-loongarch/bitsperlong.h new file mode 100644 index 0000000000..6dc0bb0c13 --- /dev/null +++ b/linux-headers/asm-loongarch/bitsperlong.h @@ -0,0 +1 @@ +#include diff --git a/linux-headers/asm-loongarch/kvm.h b/linux-headers/asm-loongarch/kvm.h new file mode 100644 index 0000000000..c6ad2ee610 --- /dev/null +++ b/linux-headers/asm-loongarch/kvm.h @@ -0,0 +1,108 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright (C) 2020-2023 Loongson Technology Corporation Limited + */ + +#ifndef __UAPI_ASM_LOONGARCH_KVM_H +#define __UAPI_ASM_LOONGARCH_KVM_H + +#include + +/* + * KVM LoongArch specific structures and definitions. + * + * Some parts derived from the x86 version of this file. + */ + +#define __KVM_HAVE_READONLY_MEM + +#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 +#define KVM_DIRTY_LOG_PAGE_OFFSET 64 + +/* + * for KVM_GET_REGS and KVM_SET_REGS + */ +struct kvm_regs { + /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ + __u64 gpr[32]; + __u64 pc; +}; + +/* + * for KVM_GET_FPU and KVM_SET_FPU + */ +struct kvm_fpu { + __u32 fcsr; + __u64 fcc; /* 8x8 */ + struct kvm_fpureg { + __u64 val64[4]; + } fpr[32]; +}; + +/* + * For LoongArch, we use KVM_SET_ONE_REG and KVM_GET_ONE_REG to access various + * registers. The id field is broken down as follows: + * + * bits[63..52] - As per linux/kvm.h + * bits[51..32] - Must be zero. + * bits[31..16] - Register set. + * + * Register set = 0: GP registers from kvm_regs (see definitions below). + * + * Register set = 1: CSR registers. + * + * Register set = 2: KVM specific registers (see definitions below). 
+ * + * Register set = 3: FPU / SIMD registers (see definitions below). + * + * Other sets registers may be added in the future. Each set would + * have its own identifier in bits[31..16]. + */ + +#define KVM_REG_LOONGARCH_GPR (KVM_REG_LOONGARCH | 0x00000ULL) +#define KVM_REG_LOONGARCH_CSR (KVM_REG_LOONGARCH | 0x10000ULL) +#define KVM_REG_LOONGARCH_KVM (KVM_REG_LOONGARCH | 0x20000ULL) +#define KVM_REG_LOONGARCH_FPSIMD (KVM_REG_LOONGARCH | 0x30000ULL) +#define KVM_REG_LOONGARCH_CPUCFG (KVM_REG_LOONGARCH | 0x40000ULL) +#define KVM_REG_LOONGARCH_MASK (KVM_REG_LOONGARCH | 0x70000ULL) +#define KVM_CSR_IDX_MASK 0x7fff +#define KVM_CPUCFG_IDX_MASK 0x7fff + +/* + * KVM_REG_LOONGARCH_KVM - KVM specific control registers. + */ + +#define KVM_REG_LOONGARCH_COUNTER (KVM_REG_LOONGARCH_KVM | KVM_REG_SIZE_U64 | 1) +#define KVM_REG_LOONGARCH_VCPU_RESET (KVM_REG_LOONGARCH_KVM | KVM_REG_SIZE_U64 | 2) + +#define LOONGARCH_REG_SHIFT 3 +#define LOONGARCH_REG_64(TYPE, REG) (TYPE | KVM_REG_SIZE_U64 | (REG << LOONGARCH_REG_SHIFT)) +#define KVM_IOC_CSRID(REG) LOONGARCH_REG_64(KVM_REG_LOONGARCH_CSR, REG) +#define KVM_IOC_CPUCFG(REG) LOONGARCH_REG_64(KVM_REG_LOONGARCH_CPUCFG, REG) + +struct kvm_debug_exit_arch { +}; + +/* for KVM_SET_GUEST_DEBUG */ +struct kvm_guest_debug_arch { +}; + +/* definition of registers in kvm_run */ +struct kvm_sync_regs { +}; + +/* dummy definition */ +struct kvm_sregs { +}; + +struct kvm_iocsr_entry { + __u32 addr; + __u32 pad; + __u64 data; +}; + +#define KVM_NR_IRQCHIPS 1 +#define KVM_IRQCHIP_NUM_PINS 64 +#define KVM_MAX_CORES 256 + +#endif /* __UAPI_ASM_LOONGARCH_KVM_H */ diff --git a/linux-headers/asm-loongarch/mman.h b/linux-headers/asm-loongarch/mman.h new file mode 100644 index 0000000000..8eebf89f5a --- /dev/null +++ b/linux-headers/asm-loongarch/mman.h @@ -0,0 +1 @@ +#include diff --git a/linux-headers/asm-loongarch/unistd.h b/linux-headers/asm-loongarch/unistd.h new file mode 100644 index 0000000000..fcb668984f --- /dev/null +++ b/linux-headers/asm-loongarch/unistd.h @@ -0,0 +1,5 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#define __ARCH_WANT_SYS_CLONE +#define __ARCH_WANT_SYS_CLONE3 + +#include diff --git a/linux-headers/asm-mips/unistd_n32.h b/linux-headers/asm-mips/unistd_n32.h index 46d8500654..994b6f008f 100644 --- a/linux-headers/asm-mips/unistd_n32.h +++ b/linux-headers/asm-mips/unistd_n32.h @@ -381,5 +381,9 @@ #define __NR_set_mempolicy_home_node (__NR_Linux + 450) #define __NR_cachestat (__NR_Linux + 451) #define __NR_fchmodat2 (__NR_Linux + 452) +#define __NR_map_shadow_stack (__NR_Linux + 453) +#define __NR_futex_wake (__NR_Linux + 454) +#define __NR_futex_wait (__NR_Linux + 455) +#define __NR_futex_requeue (__NR_Linux + 456) #endif /* _ASM_UNISTD_N32_H */ diff --git a/linux-headers/asm-mips/unistd_n64.h b/linux-headers/asm-mips/unistd_n64.h index c2f7ac673b..41dcf5877a 100644 --- a/linux-headers/asm-mips/unistd_n64.h +++ b/linux-headers/asm-mips/unistd_n64.h @@ -357,5 +357,9 @@ #define __NR_set_mempolicy_home_node (__NR_Linux + 450) #define __NR_cachestat (__NR_Linux + 451) #define __NR_fchmodat2 (__NR_Linux + 452) +#define __NR_map_shadow_stack (__NR_Linux + 453) +#define __NR_futex_wake (__NR_Linux + 454) +#define __NR_futex_wait (__NR_Linux + 455) +#define __NR_futex_requeue (__NR_Linux + 456) #endif /* _ASM_UNISTD_N64_H */ diff --git a/linux-headers/asm-mips/unistd_o32.h b/linux-headers/asm-mips/unistd_o32.h index 757c68f2ad..ae9d334d96 100644 --- a/linux-headers/asm-mips/unistd_o32.h +++ b/linux-headers/asm-mips/unistd_o32.h @@ -427,5 
+427,9 @@ #define __NR_set_mempolicy_home_node (__NR_Linux + 450) #define __NR_cachestat (__NR_Linux + 451) #define __NR_fchmodat2 (__NR_Linux + 452) +#define __NR_map_shadow_stack (__NR_Linux + 453) +#define __NR_futex_wake (__NR_Linux + 454) +#define __NR_futex_wait (__NR_Linux + 455) +#define __NR_futex_requeue (__NR_Linux + 456) #endif /* _ASM_UNISTD_O32_H */ diff --git a/linux-headers/asm-powerpc/unistd_32.h b/linux-headers/asm-powerpc/unistd_32.h index 8ef94bbac1..b9b23d66d7 100644 --- a/linux-headers/asm-powerpc/unistd_32.h +++ b/linux-headers/asm-powerpc/unistd_32.h @@ -434,6 +434,10 @@ #define __NR_set_mempolicy_home_node 450 #define __NR_cachestat 451 #define __NR_fchmodat2 452 +#define __NR_map_shadow_stack 453 +#define __NR_futex_wake 454 +#define __NR_futex_wait 455 +#define __NR_futex_requeue 456 #endif /* _ASM_UNISTD_32_H */ diff --git a/linux-headers/asm-powerpc/unistd_64.h b/linux-headers/asm-powerpc/unistd_64.h index 0e7ee43e88..cbb4b3e8f7 100644 --- a/linux-headers/asm-powerpc/unistd_64.h +++ b/linux-headers/asm-powerpc/unistd_64.h @@ -406,6 +406,10 @@ #define __NR_set_mempolicy_home_node 450 #define __NR_cachestat 451 #define __NR_fchmodat2 452 +#define __NR_map_shadow_stack 453 +#define __NR_futex_wake 454 +#define __NR_futex_wait 455 +#define __NR_futex_requeue 456 #endif /* _ASM_UNISTD_64_H */ diff --git a/linux-headers/asm-riscv/kvm.h b/linux-headers/asm-riscv/kvm.h index 992c5e4071..60d3b21dea 100644 --- a/linux-headers/asm-riscv/kvm.h +++ b/linux-headers/asm-riscv/kvm.h @@ -80,6 +80,7 @@ struct kvm_riscv_csr { unsigned long sip; unsigned long satp; unsigned long scounteren; + unsigned long senvcfg; }; /* AIA CSR registers for KVM_GET_ONE_REG and KVM_SET_ONE_REG */ @@ -93,6 +94,11 @@ struct kvm_riscv_aia_csr { unsigned long iprio2h; }; +/* Smstateen CSR for KVM_GET_ONE_REG and KVM_SET_ONE_REG */ +struct kvm_riscv_smstateen_csr { + unsigned long sstateen0; +}; + /* TIMER registers for KVM_GET_ONE_REG and KVM_SET_ONE_REG */ struct kvm_riscv_timer { __u64 frequency; @@ -131,6 +137,8 @@ enum KVM_RISCV_ISA_EXT_ID { KVM_RISCV_ISA_EXT_ZICSR, KVM_RISCV_ISA_EXT_ZIFENCEI, KVM_RISCV_ISA_EXT_ZIHPM, + KVM_RISCV_ISA_EXT_SMSTATEEN, + KVM_RISCV_ISA_EXT_ZICOND, KVM_RISCV_ISA_EXT_MAX, }; @@ -148,6 +156,7 @@ enum KVM_RISCV_SBI_EXT_ID { KVM_RISCV_SBI_EXT_PMU, KVM_RISCV_SBI_EXT_EXPERIMENTAL, KVM_RISCV_SBI_EXT_VENDOR, + KVM_RISCV_SBI_EXT_DBCN, KVM_RISCV_SBI_EXT_MAX, }; @@ -178,10 +187,13 @@ enum KVM_RISCV_SBI_EXT_ID { #define KVM_REG_RISCV_CSR (0x03 << KVM_REG_RISCV_TYPE_SHIFT) #define KVM_REG_RISCV_CSR_GENERAL (0x0 << KVM_REG_RISCV_SUBTYPE_SHIFT) #define KVM_REG_RISCV_CSR_AIA (0x1 << KVM_REG_RISCV_SUBTYPE_SHIFT) +#define KVM_REG_RISCV_CSR_SMSTATEEN (0x2 << KVM_REG_RISCV_SUBTYPE_SHIFT) #define KVM_REG_RISCV_CSR_REG(name) \ (offsetof(struct kvm_riscv_csr, name) / sizeof(unsigned long)) #define KVM_REG_RISCV_CSR_AIA_REG(name) \ (offsetof(struct kvm_riscv_aia_csr, name) / sizeof(unsigned long)) +#define KVM_REG_RISCV_CSR_SMSTATEEN_REG(name) \ + (offsetof(struct kvm_riscv_smstateen_csr, name) / sizeof(unsigned long)) /* Timer registers are mapped as type 4 */ #define KVM_REG_RISCV_TIMER (0x04 << KVM_REG_RISCV_TYPE_SHIFT) diff --git a/linux-headers/asm-s390/unistd_32.h b/linux-headers/asm-s390/unistd_32.h index 716fa368ca..c093e6d5f9 100644 --- a/linux-headers/asm-s390/unistd_32.h +++ b/linux-headers/asm-s390/unistd_32.h @@ -425,5 +425,9 @@ #define __NR_set_mempolicy_home_node 450 #define __NR_cachestat 451 #define __NR_fchmodat2 452 +#define __NR_map_shadow_stack 453 +#define 
__NR_futex_wake 454 +#define __NR_futex_wait 455 +#define __NR_futex_requeue 456 #endif /* _ASM_S390_UNISTD_32_H */ diff --git a/linux-headers/asm-s390/unistd_64.h b/linux-headers/asm-s390/unistd_64.h index b2a11b1d13..114c0569a4 100644 --- a/linux-headers/asm-s390/unistd_64.h +++ b/linux-headers/asm-s390/unistd_64.h @@ -373,5 +373,9 @@ #define __NR_set_mempolicy_home_node 450 #define __NR_cachestat 451 #define __NR_fchmodat2 452 +#define __NR_map_shadow_stack 453 +#define __NR_futex_wake 454 +#define __NR_futex_wait 455 +#define __NR_futex_requeue 456 #endif /* _ASM_S390_UNISTD_64_H */ diff --git a/linux-headers/asm-x86/unistd_32.h b/linux-headers/asm-x86/unistd_32.h index d749ad1c24..329649c377 100644 --- a/linux-headers/asm-x86/unistd_32.h +++ b/linux-headers/asm-x86/unistd_32.h @@ -443,6 +443,10 @@ #define __NR_set_mempolicy_home_node 450 #define __NR_cachestat 451 #define __NR_fchmodat2 452 +#define __NR_map_shadow_stack 453 +#define __NR_futex_wake 454 +#define __NR_futex_wait 455 +#define __NR_futex_requeue 456 #endif /* _ASM_UNISTD_32_H */ diff --git a/linux-headers/asm-x86/unistd_64.h b/linux-headers/asm-x86/unistd_64.h index cea67282eb..4583606ce6 100644 --- a/linux-headers/asm-x86/unistd_64.h +++ b/linux-headers/asm-x86/unistd_64.h @@ -366,6 +366,9 @@ #define __NR_cachestat 451 #define __NR_fchmodat2 452 #define __NR_map_shadow_stack 453 +#define __NR_futex_wake 454 +#define __NR_futex_wait 455 +#define __NR_futex_requeue 456 #endif /* _ASM_UNISTD_64_H */ diff --git a/linux-headers/asm-x86/unistd_x32.h b/linux-headers/asm-x86/unistd_x32.h index 5b2e79bf4c..146d74d8e4 100644 --- a/linux-headers/asm-x86/unistd_x32.h +++ b/linux-headers/asm-x86/unistd_x32.h @@ -318,6 +318,9 @@ #define __NR_set_mempolicy_home_node (__X32_SYSCALL_BIT + 450) #define __NR_cachestat (__X32_SYSCALL_BIT + 451) #define __NR_fchmodat2 (__X32_SYSCALL_BIT + 452) +#define __NR_futex_wake (__X32_SYSCALL_BIT + 454) +#define __NR_futex_wait (__X32_SYSCALL_BIT + 455) +#define __NR_futex_requeue (__X32_SYSCALL_BIT + 456) #define __NR_rt_sigaction (__X32_SYSCALL_BIT + 512) #define __NR_rt_sigreturn (__X32_SYSCALL_BIT + 513) #define __NR_ioctl (__X32_SYSCALL_BIT + 514) diff --git a/linux-headers/linux/iommufd.h b/linux-headers/linux/iommufd.h index 218bf7ac98..806d98d09c 100644 --- a/linux-headers/linux/iommufd.h +++ b/linux-headers/linux/iommufd.h @@ -47,6 +47,8 @@ enum { IOMMUFD_CMD_VFIO_IOAS, IOMMUFD_CMD_HWPT_ALLOC, IOMMUFD_CMD_GET_HW_INFO, + IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING, + IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP, }; /** @@ -347,20 +349,86 @@ struct iommu_vfio_ioas { }; #define IOMMU_VFIO_IOAS _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VFIO_IOAS) +/** + * enum iommufd_hwpt_alloc_flags - Flags for HWPT allocation + * @IOMMU_HWPT_ALLOC_NEST_PARENT: If set, allocate a HWPT that can serve as + * the parent HWPT in a nesting configuration. 
+ * @IOMMU_HWPT_ALLOC_DIRTY_TRACKING: Dirty tracking support for device IOMMU is + * enforced on device attachment + */ +enum iommufd_hwpt_alloc_flags { + IOMMU_HWPT_ALLOC_NEST_PARENT = 1 << 0, + IOMMU_HWPT_ALLOC_DIRTY_TRACKING = 1 << 1, +}; + +/** + * enum iommu_hwpt_vtd_s1_flags - Intel VT-d stage-1 page table + * entry attributes + * @IOMMU_VTD_S1_SRE: Supervisor request + * @IOMMU_VTD_S1_EAFE: Extended access enable + * @IOMMU_VTD_S1_WPE: Write protect enable + */ +enum iommu_hwpt_vtd_s1_flags { + IOMMU_VTD_S1_SRE = 1 << 0, + IOMMU_VTD_S1_EAFE = 1 << 1, + IOMMU_VTD_S1_WPE = 1 << 2, +}; + +/** + * struct iommu_hwpt_vtd_s1 - Intel VT-d stage-1 page table + * info (IOMMU_HWPT_DATA_VTD_S1) + * @flags: Combination of enum iommu_hwpt_vtd_s1_flags + * @pgtbl_addr: The base address of the stage-1 page table. + * @addr_width: The address width of the stage-1 page table + * @__reserved: Must be 0 + */ +struct iommu_hwpt_vtd_s1 { + __aligned_u64 flags; + __aligned_u64 pgtbl_addr; + __u32 addr_width; + __u32 __reserved; +}; + +/** + * enum iommu_hwpt_data_type - IOMMU HWPT Data Type + * @IOMMU_HWPT_DATA_NONE: no data + * @IOMMU_HWPT_DATA_VTD_S1: Intel VT-d stage-1 page table + */ +enum iommu_hwpt_data_type { + IOMMU_HWPT_DATA_NONE, + IOMMU_HWPT_DATA_VTD_S1, +}; + /** * struct iommu_hwpt_alloc - ioctl(IOMMU_HWPT_ALLOC) * @size: sizeof(struct iommu_hwpt_alloc) - * @flags: Must be 0 + * @flags: Combination of enum iommufd_hwpt_alloc_flags * @dev_id: The device to allocate this HWPT for - * @pt_id: The IOAS to connect this HWPT to + * @pt_id: The IOAS or HWPT to connect this HWPT to * @out_hwpt_id: The ID of the new HWPT * @__reserved: Must be 0 + * @data_type: One of enum iommu_hwpt_data_type + * @data_len: Length of the type specific data + * @data_uptr: User pointer to the type specific data * * Explicitly allocate a hardware page table object. This is the same object * type that is returned by iommufd_device_attach() and represents the * underlying iommu driver's iommu_domain kernel object. * - * A HWPT will be created with the IOVA mappings from the given IOAS. + * A kernel-managed HWPT will be created with the mappings from the given + * IOAS via the @pt_id. The @data_type for this allocation must be set to + * IOMMU_HWPT_DATA_NONE. The HWPT can be allocated as a parent HWPT for a + * nesting configuration by passing IOMMU_HWPT_ALLOC_NEST_PARENT via @flags. + * + * A user-managed nested HWPT will be created from a given parent HWPT via + * @pt_id, in which the parent HWPT must be allocated previously via the + * same ioctl from a given IOAS (@pt_id). In this case, the @data_type + * must be set to a pre-defined type corresponding to an I/O page table + * type supported by the underlying IOMMU hardware. + * + * If the @data_type is set to IOMMU_HWPT_DATA_NONE, @data_len and + * @data_uptr should be zero. Otherwise, both @data_len and @data_uptr + * must be given. */ struct iommu_hwpt_alloc { __u32 size; @@ -369,13 +437,26 @@ struct iommu_hwpt_alloc { __u32 pt_id; __u32 out_hwpt_id; __u32 __reserved; + __u32 data_type; + __u32 data_len; + __aligned_u64 data_uptr; }; #define IOMMU_HWPT_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_ALLOC) +/** + * enum iommu_hw_info_vtd_flags - Flags for VT-d hw_info + * @IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17: If set, disallow read-only mappings + * on a nested_parent domain. 
+ * https://www.intel.com/content/www/us/en/content-details/772415/content-details.html + */ +enum iommu_hw_info_vtd_flags { + IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17 = 1 << 0, +}; + /** * struct iommu_hw_info_vtd - Intel VT-d hardware information * - * @flags: Must be 0 + * @flags: Combination of enum iommu_hw_info_vtd_flags * @__reserved: Must be 0 * * @cap_reg: Value of Intel VT-d capability register defined in VT-d spec @@ -404,6 +485,20 @@ enum iommu_hw_info_type { IOMMU_HW_INFO_TYPE_INTEL_VTD, }; +/** + * enum iommufd_hw_capabilities + * @IOMMU_HW_CAP_DIRTY_TRACKING: IOMMU hardware support for dirty tracking + * If available, it means the following APIs + * are supported: + * + * IOMMU_HWPT_GET_DIRTY_BITMAP + * IOMMU_HWPT_SET_DIRTY_TRACKING + * + */ +enum iommufd_hw_capabilities { + IOMMU_HW_CAP_DIRTY_TRACKING = 1 << 0, +}; + /** * struct iommu_hw_info - ioctl(IOMMU_GET_HW_INFO) * @size: sizeof(struct iommu_hw_info) @@ -415,6 +510,8 @@ enum iommu_hw_info_type { * the iommu type specific hardware information data * @out_data_type: Output the iommu hardware info type as defined in the enum * iommu_hw_info_type. + * @out_capabilities: Output the generic iommu capability info type as defined + * in the enum iommu_hw_capabilities. * @__reserved: Must be 0 * * Query an iommu type specific hardware information data from an iommu behind @@ -439,6 +536,81 @@ struct iommu_hw_info { __aligned_u64 data_uptr; __u32 out_data_type; __u32 __reserved; + __aligned_u64 out_capabilities; }; #define IOMMU_GET_HW_INFO _IO(IOMMUFD_TYPE, IOMMUFD_CMD_GET_HW_INFO) + +/* + * enum iommufd_hwpt_set_dirty_tracking_flags - Flags for steering dirty + * tracking + * @IOMMU_HWPT_DIRTY_TRACKING_ENABLE: Enable dirty tracking + */ +enum iommufd_hwpt_set_dirty_tracking_flags { + IOMMU_HWPT_DIRTY_TRACKING_ENABLE = 1, +}; + +/** + * struct iommu_hwpt_set_dirty_tracking - ioctl(IOMMU_HWPT_SET_DIRTY_TRACKING) + * @size: sizeof(struct iommu_hwpt_set_dirty_tracking) + * @flags: Combination of enum iommufd_hwpt_set_dirty_tracking_flags + * @hwpt_id: HW pagetable ID that represents the IOMMU domain + * @__reserved: Must be 0 + * + * Toggle dirty tracking on an HW pagetable. + */ +struct iommu_hwpt_set_dirty_tracking { + __u32 size; + __u32 flags; + __u32 hwpt_id; + __u32 __reserved; +}; +#define IOMMU_HWPT_SET_DIRTY_TRACKING _IO(IOMMUFD_TYPE, \ + IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING) + +/** + * enum iommufd_hwpt_get_dirty_bitmap_flags - Flags for getting dirty bits + * @IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR: Just read the PTEs without clearing + * any dirty bits metadata. This flag + * can be passed in the expectation + * where the next operation is an unmap + * of the same IOVA range. + * + */ +enum iommufd_hwpt_get_dirty_bitmap_flags { + IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR = 1, +}; + +/** + * struct iommu_hwpt_get_dirty_bitmap - ioctl(IOMMU_HWPT_GET_DIRTY_BITMAP) + * @size: sizeof(struct iommu_hwpt_get_dirty_bitmap) + * @hwpt_id: HW pagetable ID that represents the IOMMU domain + * @flags: Combination of enum iommufd_hwpt_get_dirty_bitmap_flags + * @__reserved: Must be 0 + * @iova: base IOVA of the bitmap first bit + * @length: IOVA range size + * @page_size: page size granularity of each bit in the bitmap + * @data: bitmap where to set the dirty bits. The bitmap bits each + * represent a page_size which you deviate from an arbitrary iova. 
+ * + * Checking a given IOVA is dirty: + * + * data[(iova / page_size) / 64] & (1ULL << ((iova / page_size) % 64)) + * + * Walk the IOMMU pagetables for a given IOVA range to return a bitmap + * with the dirty IOVAs. In doing so it will also by default clear any + * dirty bit metadata set in the IOPTE. + */ +struct iommu_hwpt_get_dirty_bitmap { + __u32 size; + __u32 hwpt_id; + __u32 flags; + __u32 __reserved; + __aligned_u64 iova; + __aligned_u64 length; + __aligned_u64 page_size; + __aligned_u64 data; +}; +#define IOMMU_HWPT_GET_DIRTY_BITMAP _IO(IOMMUFD_TYPE, \ + IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP) + #endif diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h index 0d74ee999a..549fea3a97 100644 --- a/linux-headers/linux/kvm.h +++ b/linux-headers/linux/kvm.h @@ -264,6 +264,7 @@ struct kvm_xen_exit { #define KVM_EXIT_RISCV_SBI 35 #define KVM_EXIT_RISCV_CSR 36 #define KVM_EXIT_NOTIFY 37 +#define KVM_EXIT_LOONGARCH_IOCSR 38 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -336,6 +337,13 @@ struct kvm_run { __u32 len; __u8 is_write; } mmio; + /* KVM_EXIT_LOONGARCH_IOCSR */ + struct { + __u64 phys_addr; + __u8 data[8]; + __u32 len; + __u8 is_write; + } iocsr_io; /* KVM_EXIT_HYPERCALL */ struct { __u64 nr; @@ -1188,6 +1196,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_COUNTER_OFFSET 227 #define KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE 228 #define KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES 229 +#define KVM_CAP_ARM_SUPPORTED_REG_MASK_RANGES 230 #ifdef KVM_CAP_IRQ_ROUTING @@ -1358,6 +1367,7 @@ struct kvm_dirty_tlb { #define KVM_REG_ARM64 0x6000000000000000ULL #define KVM_REG_MIPS 0x7000000000000000ULL #define KVM_REG_RISCV 0x8000000000000000ULL +#define KVM_REG_LOONGARCH 0x9000000000000000ULL #define KVM_REG_SIZE_SHIFT 52 #define KVM_REG_SIZE_MASK 0x00f0000000000000ULL @@ -1558,6 +1568,7 @@ struct kvm_s390_ucas_mapping { #define KVM_ARM_MTE_COPY_TAGS _IOR(KVMIO, 0xb4, struct kvm_arm_copy_mte_tags) /* Available with KVM_CAP_COUNTER_OFFSET */ #define KVM_ARM_SET_COUNTER_OFFSET _IOW(KVMIO, 0xb5, struct kvm_arm_counter_offset) +#define KVM_ARM_GET_REG_WRITABLE_MASKS _IOR(KVMIO, 0xb6, struct reg_mask_range) /* ioctl for vm fd */ #define KVM_CREATE_DEVICE _IOWR(KVMIO, 0xe0, struct kvm_create_device) diff --git a/linux-headers/linux/psp-sev.h b/linux-headers/linux/psp-sev.h index 12ccb70099..bcb21339ee 100644 --- a/linux-headers/linux/psp-sev.h +++ b/linux-headers/linux/psp-sev.h @@ -68,6 +68,7 @@ typedef enum { SEV_RET_INVALID_PARAM, SEV_RET_RESOURCE_LIMIT, SEV_RET_SECURE_DATA_INVALID, + SEV_RET_INVALID_KEY = 0x27, SEV_RET_MAX, } sev_ret_code; diff --git a/linux-headers/linux/stddef.h b/linux-headers/linux/stddef.h index 9bb07083ac..bf9749dd14 100644 --- a/linux-headers/linux/stddef.h +++ b/linux-headers/linux/stddef.h @@ -27,8 +27,13 @@ union { \ struct { MEMBERS } ATTRS; \ struct TAG { MEMBERS } ATTRS NAME; \ - } + } ATTRS +#ifdef __cplusplus +/* sizeof(struct{}) is 1 in C++, not 0, can't use C version of the macro. 
*/ +#define __DECLARE_FLEX_ARRAY(T, member) \ + T member[0] +#else /** * __DECLARE_FLEX_ARRAY() - Declare a flexible array usable in a union * @@ -49,3 +54,5 @@ #ifndef __counted_by #define __counted_by(m) #endif + +#endif /* _LINUX_STDDEF_H */ diff --git a/linux-headers/linux/userfaultfd.h b/linux-headers/linux/userfaultfd.h index 59978fbaae..953c75feda 100644 --- a/linux-headers/linux/userfaultfd.h +++ b/linux-headers/linux/userfaultfd.h @@ -40,7 +40,8 @@ UFFD_FEATURE_EXACT_ADDRESS | \ UFFD_FEATURE_WP_HUGETLBFS_SHMEM | \ UFFD_FEATURE_WP_UNPOPULATED | \ - UFFD_FEATURE_POISON) + UFFD_FEATURE_POISON | \ + UFFD_FEATURE_WP_ASYNC) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -216,6 +217,11 @@ struct uffdio_api { * (i.e. empty ptes). This will be the default behavior for shmem * & hugetlbfs, so this flag only affects anonymous memory behavior * when userfault write-protection mode is registered. + * + * UFFD_FEATURE_WP_ASYNC indicates that userfaultfd write-protection + * asynchronous mode is supported in which the write fault is + * automatically resolved and write-protection is un-set. + * It implies UFFD_FEATURE_WP_UNPOPULATED. */ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) @@ -232,6 +238,7 @@ struct uffdio_api { #define UFFD_FEATURE_WP_HUGETLBFS_SHMEM (1<<12) #define UFFD_FEATURE_WP_UNPOPULATED (1<<13) #define UFFD_FEATURE_POISON (1<<14) +#define UFFD_FEATURE_WP_ASYNC (1<<15) __u64 features; __u64 ioctls; diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h index acf72b4999..8e175ece31 100644 --- a/linux-headers/linux/vfio.h +++ b/linux-headers/linux/vfio.h @@ -277,8 +277,8 @@ struct vfio_region_info { #define VFIO_REGION_INFO_FLAG_CAPS (1 << 3) /* Info supports caps */ __u32 index; /* Region index */ __u32 cap_offset; /* Offset within info struct of first cap */ - __u64 size; /* Region size (bytes) */ - __u64 offset; /* Region offset from start of device fd */ + __aligned_u64 size; /* Region size (bytes) */ + __aligned_u64 offset; /* Region offset from start of device fd */ }; #define VFIO_DEVICE_GET_REGION_INFO _IO(VFIO_TYPE, VFIO_BASE + 8) @@ -294,8 +294,8 @@ struct vfio_region_info { #define VFIO_REGION_INFO_CAP_SPARSE_MMAP 1 struct vfio_region_sparse_mmap_area { - __u64 offset; /* Offset of mmap'able area within region */ - __u64 size; /* Size of mmap'able area */ + __aligned_u64 offset; /* Offset of mmap'able area within region */ + __aligned_u64 size; /* Size of mmap'able area */ }; struct vfio_region_info_cap_sparse_mmap { @@ -450,9 +450,9 @@ struct vfio_device_migration_info { VFIO_DEVICE_STATE_V1_RESUMING) __u32 reserved; - __u64 pending_bytes; - __u64 data_offset; - __u64 data_size; + __aligned_u64 pending_bytes; + __aligned_u64 data_offset; + __aligned_u64 data_size; }; /* @@ -476,7 +476,7 @@ struct vfio_device_migration_info { struct vfio_region_info_cap_nvlink2_ssatgt { struct vfio_info_cap_header header; - __u64 tgt; + __aligned_u64 tgt; }; /* @@ -816,7 +816,7 @@ struct vfio_device_gfx_plane_info { __u32 drm_plane_type; /* type of plane: DRM_PLANE_TYPE_* */ /* out */ __u32 drm_format; /* drm format of plane */ - __u64 drm_format_mod; /* tiled mode */ + __aligned_u64 drm_format_mod; /* tiled mode */ __u32 width; /* width of plane */ __u32 height; /* height of plane */ __u32 stride; /* stride of plane */ @@ -829,6 +829,7 @@ struct vfio_device_gfx_plane_info { __u32 region_index; /* region index */ __u32 dmabuf_id; /* dma-buf id */ }; + __u32 reserved; }; #define 
VFIO_DEVICE_QUERY_GFX_PLANE _IO(VFIO_TYPE, VFIO_BASE + 14) @@ -863,9 +864,10 @@ struct vfio_device_ioeventfd { #define VFIO_DEVICE_IOEVENTFD_32 (1 << 2) /* 4-byte write */ #define VFIO_DEVICE_IOEVENTFD_64 (1 << 3) /* 8-byte write */ #define VFIO_DEVICE_IOEVENTFD_SIZE_MASK (0xf) - __u64 offset; /* device fd offset of write */ - __u64 data; /* data to be written */ + __aligned_u64 offset; /* device fd offset of write */ + __aligned_u64 data; /* data to be written */ __s32 fd; /* -1 for de-assignment */ + __u32 reserved; }; #define VFIO_DEVICE_IOEVENTFD _IO(VFIO_TYPE, VFIO_BASE + 16) @@ -1434,6 +1436,27 @@ struct vfio_device_feature_mig_data_size { #define VFIO_DEVICE_FEATURE_MIG_DATA_SIZE 9 +/** + * Upon VFIO_DEVICE_FEATURE_SET, set or clear the BUS mastering for the device + * based on the operation specified in op flag. + * + * The functionality is incorporated for devices that needs bus master control, + * but the in-band device interface lacks the support. Consequently, it is not + * applicable to PCI devices, as bus master control for PCI devices is managed + * in-band through the configuration space. At present, this feature is supported + * only for CDX devices. + * When the device's BUS MASTER setting is configured as CLEAR, it will result in + * blocking all incoming DMA requests from the device. On the other hand, configuring + * the device's BUS MASTER setting as SET (enable) will grant the device the + * capability to perform DMA to the host memory. + */ +struct vfio_device_feature_bus_master { + __u32 op; +#define VFIO_DEVICE_FEATURE_CLEAR_MASTER 0 /* Clear Bus Master */ +#define VFIO_DEVICE_FEATURE_SET_MASTER 1 /* Set Bus Master */ +}; +#define VFIO_DEVICE_FEATURE_BUS_MASTER 10 + /* -------- API for Type1 VFIO IOMMU -------- */ /** @@ -1449,7 +1472,7 @@ struct vfio_iommu_type1_info { __u32 flags; #define VFIO_IOMMU_INFO_PGSIZES (1 << 0) /* supported page sizes info */ #define VFIO_IOMMU_INFO_CAPS (1 << 1) /* Info supports caps */ - __u64 iova_pgsizes; /* Bitmap of supported page sizes */ + __aligned_u64 iova_pgsizes; /* Bitmap of supported page sizes */ __u32 cap_offset; /* Offset within info struct of first cap */ __u32 pad; }; diff --git a/linux-headers/linux/vhost.h b/linux-headers/linux/vhost.h index f5c48b61ab..649560c685 100644 --- a/linux-headers/linux/vhost.h +++ b/linux-headers/linux/vhost.h @@ -219,4 +219,12 @@ */ #define VHOST_VDPA_RESUME _IO(VHOST_VIRTIO, 0x7E) +/* Get the group for the descriptor table including driver & device areas + * of a virtqueue: read index, write group in num. + * The virtqueue index is stored in the index field of vhost_vring_state. + * The group ID of the descriptor table for this specific virtqueue + * is returned via num field of vhost_vring_state. + */ +#define VHOST_VDPA_GET_VRING_DESC_GROUP _IOWR(VHOST_VIRTIO, 0x7F, \ + struct vhost_vring_state) #endif -- Gitee From 280cba84e3eaed10f095f0c88dab27b7799558e5 Mon Sep 17 00:00:00 2001 From: Tianrui Zhao Date: Fri, 5 Jan 2024 15:57:56 +0800 Subject: [PATCH 012/939] linux-headers: Synchronize linux headers from linux v6.7.0-rc8 Use the scripts/update-linux-headers.sh to synchronize linux headers from linux v6.7.0-rc8. We mainly want to add the loongarch linux headers and then add the loongarch kvm support based on it. 
Signed-off-by: Tianrui Zhao Acked-by: Song Gao Message-Id: <20240105075804.1228596-2-zhaotianrui@loongson.cn> Signed-off-by: Song Gao --- include/standard-headers/linux/fuse.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/include/standard-headers/linux/fuse.h b/include/standard-headers/linux/fuse.h index 6b9793842c..fc0dcd10ae 100644 --- a/include/standard-headers/linux/fuse.h +++ b/include/standard-headers/linux/fuse.h @@ -209,7 +209,7 @@ * - add FUSE_HAS_EXPIRE_ONLY * * 7.39 - * - add FUSE_DIRECT_IO_RELAX + * - add FUSE_DIRECT_IO_ALLOW_MMAP * - add FUSE_STATX and related structures */ @@ -405,8 +405,7 @@ struct fuse_file_lock { * FUSE_CREATE_SUPP_GROUP: add supplementary group info to create, mkdir, * symlink and mknod (single group that matches parent) * FUSE_HAS_EXPIRE_ONLY: kernel supports expiry-only entry invalidation - * FUSE_DIRECT_IO_RELAX: relax restrictions in FOPEN_DIRECT_IO mode, for now - * allow shared mmap + * FUSE_DIRECT_IO_ALLOW_MMAP: allow shared mmap in FOPEN_DIRECT_IO mode. */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -445,7 +444,10 @@ struct fuse_file_lock { #define FUSE_HAS_INODE_DAX (1ULL << 33) #define FUSE_CREATE_SUPP_GROUP (1ULL << 34) #define FUSE_HAS_EXPIRE_ONLY (1ULL << 35) -#define FUSE_DIRECT_IO_RELAX (1ULL << 36) +#define FUSE_DIRECT_IO_ALLOW_MMAP (1ULL << 36) + +/* Obsolete alias for FUSE_DIRECT_IO_ALLOW_MMAP */ +#define FUSE_DIRECT_IO_RELAX FUSE_DIRECT_IO_ALLOW_MMAP /** * CUSE INIT request/reply flags -- Gitee From 623a99084843f47723cb799d4bcef8e1359d59ad Mon Sep 17 00:00:00 2001 From: Tianrui Zhao Date: Fri, 5 Jan 2024 15:57:57 +0800 Subject: [PATCH 013/939] target/loongarch: Define some kvm_arch interfaces Define some functions in target/loongarch/kvm/kvm.c, such as kvm_arch_put_registers, kvm_arch_get_registers and kvm_arch_handle_exit, etc. which are needed by kvm/kvm-all.c. Now the most functions has no content and they will be implemented in the next patches. 
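For context, the hooks stubbed out here are driven by the generic accelerator code in accel/kvm/kvm-all.c, so they have to exist and return 0 even before they do anything useful. A simplified sketch (from memory, not copied from kvm-all.c) of how the register-sync hook ends up being called:

    /* Simplified illustration of the generic caller; kvm_arch_get_registers()
     * is the per-target hook defined (as a stub, for now) in this patch. */
    static void do_kvm_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
    {
        if (!cpu->vcpu_dirty) {
            kvm_arch_get_registers(cpu);    /* returns 0 until later patches */
            cpu->vcpu_dirty = true;
        }
    }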
Signed-off-by: Tianrui Zhao Signed-off-by: xianglai li Reviewed-by: Richard Henderson Reviewed-by: Song Gao Message-Id: <20240105075804.1228596-3-zhaotianrui@loongson.cn> Signed-off-by: Song Gao --- target/loongarch/kvm/kvm.c | 131 +++++++++++++++++++++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 target/loongarch/kvm/kvm.c diff --git a/target/loongarch/kvm/kvm.c b/target/loongarch/kvm/kvm.c new file mode 100644 index 0000000000..0d67322fd9 --- /dev/null +++ b/target/loongarch/kvm/kvm.c @@ -0,0 +1,131 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * QEMU LoongArch KVM + * + * Copyright (c) 2023 Loongson Technology Corporation Limited + */ + +#include "qemu/osdep.h" +#include +#include + +#include "qemu/timer.h" +#include "qemu/error-report.h" +#include "qemu/main-loop.h" +#include "sysemu/sysemu.h" +#include "sysemu/kvm.h" +#include "sysemu/kvm_int.h" +#include "hw/pci/pci.h" +#include "exec/memattrs.h" +#include "exec/address-spaces.h" +#include "hw/boards.h" +#include "hw/irq.h" +#include "qemu/log.h" +#include "hw/loader.h" +#include "migration/migration.h" +#include "sysemu/runstate.h" +#include "cpu-csr.h" +#include "kvm_loongarch.h" + +static bool cap_has_mp_state; +const KVMCapabilityInfo kvm_arch_required_capabilities[] = { + KVM_CAP_LAST_INFO +}; + +int kvm_arch_get_registers(CPUState *cs) +{ + return 0; +} +int kvm_arch_put_registers(CPUState *cs, int level) +{ + return 0; +} + +int kvm_arch_init_vcpu(CPUState *cs) +{ + return 0; +} + +int kvm_arch_destroy_vcpu(CPUState *cs) +{ + return 0; +} + +unsigned long kvm_arch_vcpu_id(CPUState *cs) +{ + return cs->cpu_index; +} + +int kvm_arch_release_virq_post(int virq) +{ + return 0; +} + +int kvm_arch_msi_data_to_gsi(uint32_t data) +{ + abort(); +} + +int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route, + uint64_t address, uint32_t data, PCIDevice *dev) +{ + return 0; +} + +int kvm_arch_add_msi_route_post(struct kvm_irq_routing_entry *route, + int vector, PCIDevice *dev) +{ + return 0; +} + +void kvm_arch_init_irq_routing(KVMState *s) +{ +} + +int kvm_arch_get_default_type(MachineState *ms) +{ + return 0; +} + +int kvm_arch_init(MachineState *ms, KVMState *s) +{ + return 0; +} + +int kvm_arch_irqchip_create(KVMState *s) +{ + return 0; +} + +void kvm_arch_pre_run(CPUState *cs, struct kvm_run *run) +{ +} + +MemTxAttrs kvm_arch_post_run(CPUState *cs, struct kvm_run *run) +{ + return MEMTXATTRS_UNSPECIFIED; +} + +int kvm_arch_process_async_events(CPUState *cs) +{ + return cs->halted; +} + +bool kvm_arch_stop_on_emulation_error(CPUState *cs) +{ + return true; +} + +bool kvm_arch_cpu_check_are_resettable(void) +{ + return true; +} + +int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) +{ + return 0; +} + +void kvm_arch_accel_class_init(ObjectClass *oc) +{ +} -- Gitee From 48dae5f461bf2cde206e879d52df6cf1bad3ac6e Mon Sep 17 00:00:00 2001 From: Tianrui Zhao Date: Fri, 5 Jan 2024 15:57:58 +0800 Subject: [PATCH 014/939] target/loongarch: Supplement vcpu env initial when vcpu reset Supplement vcpu env initial when vcpu reset, including init vcpu CSR_CPUID,CSR_TID to cpu->cpu_index. The two regs will be used in kvm_get/set_csr_ioctl. 
Signed-off-by: Tianrui Zhao Signed-off-by: xianglai li Reviewed-by: Song Gao Message-Id: <20240105075804.1228596-4-zhaotianrui@loongson.cn> Signed-off-by: Song Gao --- target/loongarch/cpu.c | 2 ++ target/loongarch/cpu.h | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c index db9a421cc4..021592798a 100644 --- a/target/loongarch/cpu.c +++ b/target/loongarch/cpu.c @@ -531,10 +531,12 @@ static void loongarch_cpu_reset_hold(Object *obj) env->CSR_ESTAT = env->CSR_ESTAT & (~MAKE_64BIT_MASK(0, 2)); env->CSR_RVACFG = FIELD_DP64(env->CSR_RVACFG, CSR_RVACFG, RBITS, 0); + env->CSR_CPUID = cs->cpu_index; env->CSR_TCFG = FIELD_DP64(env->CSR_TCFG, CSR_TCFG, EN, 0); env->CSR_LLBCTL = FIELD_DP64(env->CSR_LLBCTL, CSR_LLBCTL, KLO, 0); env->CSR_TLBRERA = FIELD_DP64(env->CSR_TLBRERA, CSR_TLBRERA, ISTLBR, 0); env->CSR_MERRCTL = FIELD_DP64(env->CSR_MERRCTL, CSR_MERRCTL, ISMERR, 0); + env->CSR_TID = cs->cpu_index; env->CSR_PRCFG3 = FIELD_DP64(env->CSR_PRCFG3, CSR_PRCFG3, TLB_TYPE, 2); env->CSR_PRCFG3 = FIELD_DP64(env->CSR_PRCFG3, CSR_PRCFG3, MTLB_ENTRY, 63); diff --git a/target/loongarch/cpu.h b/target/loongarch/cpu.h index 00d1fba597..f6d5ef0852 100644 --- a/target/loongarch/cpu.h +++ b/target/loongarch/cpu.h @@ -319,6 +319,7 @@ typedef struct CPUArchState { uint64_t CSR_PWCH; uint64_t CSR_STLBPS; uint64_t CSR_RVACFG; + uint64_t CSR_CPUID; uint64_t CSR_PRCFG1; uint64_t CSR_PRCFG2; uint64_t CSR_PRCFG3; @@ -350,7 +351,6 @@ typedef struct CPUArchState { uint64_t CSR_DBG; uint64_t CSR_DERA; uint64_t CSR_DSAVE; - uint64_t CSR_CPUID; #ifndef CONFIG_USER_ONLY LoongArchTLB tlb[LOONGARCH_TLB_MAX]; -- Gitee From 0884653d8583aaaa5585caf38246518439bcfdfd Mon Sep 17 00:00:00 2001 From: Tianrui Zhao Date: Fri, 5 Jan 2024 15:57:59 +0800 Subject: [PATCH 015/939] target/loongarch: Implement kvm get/set registers Implement kvm_arch_get/set_registers interfaces, many regs can be get/set in the function, such as core regs, csr regs, fpu regs, mp state, etc. 
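One detail in the kvm_loongarch_put_regs_fp() hunk below may be worth a second look: the eight condition flags are packed one byte each into the 64-bit fpu.fcc word with fpu.fcc |= env->cf[i] << (8 * i). Assuming the cf[] elements are byte-sized (the 0xFF masking in the matching get path suggests they are), the left operand is promoted to int, so shift counts of 32-56 are formally undefined behaviour in C. A sketch of the packing kept in 64-bit arithmetic, offered as a suggestion rather than as what this patch does:

    /* Sketch only: widen before shifting so flags 4..7 are packed with a
     * well-defined 64-bit shift instead of an int-width one. */
    fpu.fcc = 0;
    for (i = 0; i < 8; i++) {
        fpu.fcc |= (uint64_t)env->cf[i] << (8 * i);
    }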
Signed-off-by: Tianrui Zhao Signed-off-by: xianglai li Reviewed-by: Song Gao Change-Id: Ia8fc48fe08b1768853f7729e77d37cdf270031e4 Message-Id: <20240105075804.1228596-5-zhaotianrui@loongson.cn> Signed-off-by: Song Gao --- meson.build | 1 + target/loongarch/cpu.c | 3 + target/loongarch/cpu.h | 1 + target/loongarch/internals.h | 5 +- target/loongarch/kvm/kvm.c | 580 +++++++++++++++++++++++++++++++++- target/loongarch/trace-events | 11 + target/loongarch/trace.h | 1 + 7 files changed, 599 insertions(+), 3 deletions(-) create mode 100644 target/loongarch/trace-events create mode 100644 target/loongarch/trace.h diff --git a/meson.build b/meson.build index 6c77d9687d..445f2b7c2b 100644 --- a/meson.build +++ b/meson.build @@ -3358,6 +3358,7 @@ if have_system or have_user 'target/hppa', 'target/i386', 'target/i386/kvm', + 'target/loongarch', 'target/mips/tcg', 'target/nios2', 'target/ppc', diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c index 021592798a..275833eec8 100644 --- a/target/loongarch/cpu.c +++ b/target/loongarch/cpu.c @@ -553,6 +553,9 @@ static void loongarch_cpu_reset_hold(Object *obj) #ifndef CONFIG_USER_ONLY env->pc = 0x1c000000; memset(env->tlb, 0, sizeof(env->tlb)); + if (kvm_enabled()) { + kvm_arch_reset_vcpu(env); + } #endif restore_fp_status(env); diff --git a/target/loongarch/cpu.h b/target/loongarch/cpu.h index f6d5ef0852..f4a89bd626 100644 --- a/target/loongarch/cpu.h +++ b/target/loongarch/cpu.h @@ -360,6 +360,7 @@ typedef struct CPUArchState { MemoryRegion iocsr_mem; bool load_elf; uint64_t elf_address; + uint32_t mp_state; /* Store ipistate to access from this struct */ DeviceState *ipistate; #endif diff --git a/target/loongarch/internals.h b/target/loongarch/internals.h index c492863cc5..0beb034748 100644 --- a/target/loongarch/internals.h +++ b/target/loongarch/internals.h @@ -31,8 +31,10 @@ void G_NORETURN do_raise_exception(CPULoongArchState *env, const char *loongarch_exception_name(int32_t exception); +#ifdef CONFIG_TCG int ieee_ex_to_loongarch(int xcpt); void restore_fp_status(CPULoongArchState *env); +#endif #ifndef CONFIG_USER_ONLY extern const VMStateDescription vmstate_loongarch_cpu; @@ -44,12 +46,13 @@ uint64_t cpu_loongarch_get_constant_timer_counter(LoongArchCPU *cpu); uint64_t cpu_loongarch_get_constant_timer_ticks(LoongArchCPU *cpu); void cpu_loongarch_store_constant_timer_config(LoongArchCPU *cpu, uint64_t value); - +#ifdef CONFIG_TCG bool loongarch_cpu_tlb_fill(CPUState *cs, vaddr address, int size, MMUAccessType access_type, int mmu_idx, bool probe, uintptr_t retaddr); hwaddr loongarch_cpu_get_phys_page_debug(CPUState *cpu, vaddr addr); +#endif #endif /* !CONFIG_USER_ONLY */ uint64_t read_fcc(CPULoongArchState *env); diff --git a/target/loongarch/kvm/kvm.c b/target/loongarch/kvm/kvm.c index 0d67322fd9..e7c9ef830c 100644 --- a/target/loongarch/kvm/kvm.c +++ b/target/loongarch/kvm/kvm.c @@ -26,19 +26,595 @@ #include "sysemu/runstate.h" #include "cpu-csr.h" #include "kvm_loongarch.h" +#include "trace.h" static bool cap_has_mp_state; const KVMCapabilityInfo kvm_arch_required_capabilities[] = { KVM_CAP_LAST_INFO }; +static int kvm_loongarch_get_regs_core(CPUState *cs) +{ + int ret = 0; + int i; + struct kvm_regs regs; + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + CPULoongArchState *env = &cpu->env; + + /* Get the current register set as KVM seems it */ + ret = kvm_vcpu_ioctl(cs, KVM_GET_REGS, ®s); + if (ret < 0) { + trace_kvm_failed_get_regs_core(strerror(errno)); + return ret; + } + /* gpr[0] value is always 0 */ + env->gpr[0] = 0; + for (i = 1; i < 
32; i++) { + env->gpr[i] = regs.gpr[i]; + } + + env->pc = regs.pc; + return ret; +} + +static int kvm_loongarch_put_regs_core(CPUState *cs) +{ + int ret = 0; + int i; + struct kvm_regs regs; + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + CPULoongArchState *env = &cpu->env; + + /* Set the registers based on QEMU's view of things */ + for (i = 0; i < 32; i++) { + regs.gpr[i] = env->gpr[i]; + } + + regs.pc = env->pc; + ret = kvm_vcpu_ioctl(cs, KVM_SET_REGS, ®s); + if (ret < 0) { + trace_kvm_failed_put_regs_core(strerror(errno)); + } + + return ret; +} + +static int kvm_loongarch_get_csr(CPUState *cs) +{ + int ret = 0; + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + CPULoongArchState *env = &cpu->env; + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_CRMD), + &env->CSR_CRMD); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PRMD), + &env->CSR_PRMD); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_EUEN), + &env->CSR_EUEN); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_MISC), + &env->CSR_MISC); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_ECFG), + &env->CSR_ECFG); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_ESTAT), + &env->CSR_ESTAT); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_ERA), + &env->CSR_ERA); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_BADV), + &env->CSR_BADV); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_BADI), + &env->CSR_BADI); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_EENTRY), + &env->CSR_EENTRY); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBIDX), + &env->CSR_TLBIDX); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBEHI), + &env->CSR_TLBEHI); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBELO0), + &env->CSR_TLBELO0); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBELO1), + &env->CSR_TLBELO1); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_ASID), + &env->CSR_ASID); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PGDL), + &env->CSR_PGDL); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PGDH), + &env->CSR_PGDH); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PGD), + &env->CSR_PGD); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PWCL), + &env->CSR_PWCL); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PWCH), + &env->CSR_PWCH); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_STLBPS), + &env->CSR_STLBPS); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_RVACFG), + &env->CSR_RVACFG); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_CPUID), + &env->CSR_CPUID); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PRCFG1), + &env->CSR_PRCFG1); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PRCFG2), + &env->CSR_PRCFG2); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PRCFG3), + &env->CSR_PRCFG3); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(0)), + &env->CSR_SAVE[0]); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(1)), + &env->CSR_SAVE[1]); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(2)), + &env->CSR_SAVE[2]); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(3)), + &env->CSR_SAVE[3]); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(4)), + &env->CSR_SAVE[4]); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(5)), + &env->CSR_SAVE[5]); + + ret |= kvm_get_one_reg(cs, 
KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(6)), + &env->CSR_SAVE[6]); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(7)), + &env->CSR_SAVE[7]); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TID), + &env->CSR_TID); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_CNTC), + &env->CSR_CNTC); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TICLR), + &env->CSR_TICLR); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_LLBCTL), + &env->CSR_LLBCTL); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_IMPCTL1), + &env->CSR_IMPCTL1); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_IMPCTL2), + &env->CSR_IMPCTL2); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBRENTRY), + &env->CSR_TLBRENTRY); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBRBADV), + &env->CSR_TLBRBADV); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBRERA), + &env->CSR_TLBRERA); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBRSAVE), + &env->CSR_TLBRSAVE); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBRELO0), + &env->CSR_TLBRELO0); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBRELO1), + &env->CSR_TLBRELO1); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBREHI), + &env->CSR_TLBREHI); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBRPRMD), + &env->CSR_TLBRPRMD); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_DMW(0)), + &env->CSR_DMW[0]); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_DMW(1)), + &env->CSR_DMW[1]); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_DMW(2)), + &env->CSR_DMW[2]); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_DMW(3)), + &env->CSR_DMW[3]); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TVAL), + &env->CSR_TVAL); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TCFG), + &env->CSR_TCFG); + + return ret; +} + +static int kvm_loongarch_put_csr(CPUState *cs) +{ + int ret = 0; + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + CPULoongArchState *env = &cpu->env; + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_CRMD), + &env->CSR_CRMD); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PRMD), + &env->CSR_PRMD); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_EUEN), + &env->CSR_EUEN); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_MISC), + &env->CSR_MISC); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_ECFG), + &env->CSR_ECFG); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_ESTAT), + &env->CSR_ESTAT); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_ERA), + &env->CSR_ERA); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_BADV), + &env->CSR_BADV); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_BADI), + &env->CSR_BADI); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_EENTRY), + &env->CSR_EENTRY); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBIDX), + &env->CSR_TLBIDX); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBEHI), + &env->CSR_TLBEHI); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBELO0), + &env->CSR_TLBELO0); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBELO1), + &env->CSR_TLBELO1); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_ASID), + &env->CSR_ASID); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PGDL), + &env->CSR_PGDL); + + ret |= kvm_set_one_reg(cs, 
KVM_IOC_CSRID(LOONGARCH_CSR_PGDH), + &env->CSR_PGDH); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PGD), + &env->CSR_PGD); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PWCL), + &env->CSR_PWCL); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PWCH), + &env->CSR_PWCH); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_STLBPS), + &env->CSR_STLBPS); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_RVACFG), + &env->CSR_RVACFG); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_CPUID), + &env->CSR_CPUID); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PRCFG1), + &env->CSR_PRCFG1); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PRCFG2), + &env->CSR_PRCFG2); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PRCFG3), + &env->CSR_PRCFG3); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(0)), + &env->CSR_SAVE[0]); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(1)), + &env->CSR_SAVE[1]); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(2)), + &env->CSR_SAVE[2]); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(3)), + &env->CSR_SAVE[3]); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(4)), + &env->CSR_SAVE[4]); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(5)), + &env->CSR_SAVE[5]); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(6)), + &env->CSR_SAVE[6]); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(7)), + &env->CSR_SAVE[7]); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TID), + &env->CSR_TID); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_CNTC), + &env->CSR_CNTC); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TICLR), + &env->CSR_TICLR); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_LLBCTL), + &env->CSR_LLBCTL); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_IMPCTL1), + &env->CSR_IMPCTL1); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_IMPCTL2), + &env->CSR_IMPCTL2); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBRENTRY), + &env->CSR_TLBRENTRY); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBRBADV), + &env->CSR_TLBRBADV); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBRERA), + &env->CSR_TLBRERA); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBRSAVE), + &env->CSR_TLBRSAVE); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBRELO0), + &env->CSR_TLBRELO0); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBRELO1), + &env->CSR_TLBRELO1); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBREHI), + &env->CSR_TLBREHI); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBRPRMD), + &env->CSR_TLBRPRMD); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_DMW(0)), + &env->CSR_DMW[0]); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_DMW(1)), + &env->CSR_DMW[1]); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_DMW(2)), + &env->CSR_DMW[2]); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_DMW(3)), + &env->CSR_DMW[3]); + /* + * timer cfg must be put at last since it is used to enable + * guest timer + */ + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TVAL), + &env->CSR_TVAL); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TCFG), + &env->CSR_TCFG); + return ret; +} + +static int kvm_loongarch_get_regs_fp(CPUState *cs) +{ + int ret, i; + struct kvm_fpu 
fpu; + + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + CPULoongArchState *env = &cpu->env; + + ret = kvm_vcpu_ioctl(cs, KVM_GET_FPU, &fpu); + if (ret < 0) { + trace_kvm_failed_get_fpu(strerror(errno)); + return ret; + } + + env->fcsr0 = fpu.fcsr; + for (i = 0; i < 32; i++) { + env->fpr[i].vreg.UD[0] = fpu.fpr[i].val64[0]; + } + for (i = 0; i < 8; i++) { + env->cf[i] = fpu.fcc & 0xFF; + fpu.fcc = fpu.fcc >> 8; + } + + return ret; +} + +static int kvm_loongarch_put_regs_fp(CPUState *cs) +{ + int ret, i; + struct kvm_fpu fpu; + + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + CPULoongArchState *env = &cpu->env; + + fpu.fcsr = env->fcsr0; + fpu.fcc = 0; + for (i = 0; i < 32; i++) { + fpu.fpr[i].val64[0] = env->fpr[i].vreg.UD[0]; + } + + for (i = 0; i < 8; i++) { + fpu.fcc |= env->cf[i] << (8 * i); + } + + ret = kvm_vcpu_ioctl(cs, KVM_SET_FPU, &fpu); + if (ret < 0) { + trace_kvm_failed_put_fpu(strerror(errno)); + } + + return ret; +} + +void kvm_arch_reset_vcpu(CPULoongArchState *env) +{ + env->mp_state = KVM_MP_STATE_RUNNABLE; +} + +static int kvm_loongarch_get_mpstate(CPUState *cs) +{ + int ret = 0; + struct kvm_mp_state mp_state; + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + CPULoongArchState *env = &cpu->env; + + if (cap_has_mp_state) { + ret = kvm_vcpu_ioctl(cs, KVM_GET_MP_STATE, &mp_state); + if (ret) { + trace_kvm_failed_get_mpstate(strerror(errno)); + return ret; + } + env->mp_state = mp_state.mp_state; + } + + return ret; +} + +static int kvm_loongarch_put_mpstate(CPUState *cs) +{ + int ret = 0; + + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + CPULoongArchState *env = &cpu->env; + + struct kvm_mp_state mp_state = { + .mp_state = env->mp_state + }; + + if (cap_has_mp_state) { + ret = kvm_vcpu_ioctl(cs, KVM_SET_MP_STATE, &mp_state); + if (ret) { + trace_kvm_failed_put_mpstate(strerror(errno)); + } + } + + return ret; +} + +static int kvm_loongarch_get_cpucfg(CPUState *cs) +{ + int i, ret = 0; + uint64_t val; + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + CPULoongArchState *env = &cpu->env; + + for (i = 0; i < 21; i++) { + ret = kvm_get_one_reg(cs, KVM_IOC_CPUCFG(i), &val); + if (ret < 0) { + trace_kvm_failed_get_cpucfg(strerror(errno)); + } + env->cpucfg[i] = (uint32_t)val; + } + return ret; +} + +static int kvm_loongarch_put_cpucfg(CPUState *cs) +{ + int i, ret = 0; + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + CPULoongArchState *env = &cpu->env; + uint64_t val; + + for (i = 0; i < 21; i++) { + val = env->cpucfg[i]; + /* LSX and LASX and LBT are not supported in kvm now */ + if (i == 2) { + val &= ~(BIT(R_CPUCFG2_LSX_SHIFT) | BIT(R_CPUCFG2_LASX_SHIFT)); + val &= ~(BIT(R_CPUCFG2_LBT_X86_SHIFT) | + BIT(R_CPUCFG2_LBT_ARM_SHIFT) | + BIT(R_CPUCFG2_LBT_MIPS_SHIFT)); + } + ret = kvm_set_one_reg(cs, KVM_IOC_CPUCFG(i), &val); + if (ret < 0) { + trace_kvm_failed_put_cpucfg(strerror(errno)); + } + } + return ret; +} + int kvm_arch_get_registers(CPUState *cs) { - return 0; + int ret; + + ret = kvm_loongarch_get_regs_core(cs); + if (ret) { + return ret; + } + + ret = kvm_loongarch_get_csr(cs); + if (ret) { + return ret; + } + + ret = kvm_loongarch_get_regs_fp(cs); + if (ret) { + return ret; + } + + ret = kvm_loongarch_get_mpstate(cs); + if (ret) { + return ret; + } + + ret = kvm_loongarch_get_cpucfg(cs); + return ret; } + int kvm_arch_put_registers(CPUState *cs, int level) { - return 0; + int ret; + + ret = kvm_loongarch_put_regs_core(cs); + if (ret) { + return ret; + } + + ret = kvm_loongarch_put_csr(cs); + if (ret) { + return ret; + } + + ret = kvm_loongarch_put_regs_fp(cs); + if (ret) { + return ret; + } + + ret = 
kvm_loongarch_put_mpstate(cs); + if (ret) { + return ret; + } + + ret = kvm_loongarch_put_cpucfg(cs); + return ret; } int kvm_arch_init_vcpu(CPUState *cs) diff --git a/target/loongarch/trace-events b/target/loongarch/trace-events new file mode 100644 index 0000000000..6827ab566a --- /dev/null +++ b/target/loongarch/trace-events @@ -0,0 +1,11 @@ +# See docs/devel/tracing.rst for syntax documentation. + +#kvm.c +kvm_failed_get_regs_core(const char *msg) "Failed to get core regs from KVM: %s" +kvm_failed_put_regs_core(const char *msg) "Failed to put core regs into KVM: %s" +kvm_failed_get_fpu(const char *msg) "Failed to get fpu from KVM: %s" +kvm_failed_put_fpu(const char *msg) "Failed to put fpu into KVM: %s" +kvm_failed_get_mpstate(const char *msg) "Failed to get mp_state from KVM: %s" +kvm_failed_put_mpstate(const char *msg) "Failed to put mp_state into KVM: %s" +kvm_failed_get_cpucfg(const char *msg) "Failed to get cpucfg from KVM: %s" +kvm_failed_put_cpucfg(const char *msg) "Failed to put cpucfg into KVM: %s" diff --git a/target/loongarch/trace.h b/target/loongarch/trace.h new file mode 100644 index 0000000000..c2ecb78f08 --- /dev/null +++ b/target/loongarch/trace.h @@ -0,0 +1 @@ +#include "trace/trace-target_loongarch.h" -- Gitee From 3a87dbd5e0343ee777bac0f18888a5a2d51254db Mon Sep 17 00:00:00 2001 From: Tianrui Zhao Date: Fri, 5 Jan 2024 15:58:00 +0800 Subject: [PATCH 016/939] target/loongarch: Implement kvm_arch_init function Implement the kvm_arch_init of loongarch, in the function, the KVM_CAP_MP_STATE cap is checked by kvm ioctl. Signed-off-by: Tianrui Zhao Signed-off-by: xianglai li Reviewed-by: Richard Henderson Reviewed-by: Song Gao Message-Id: <20240105075804.1228596-6-zhaotianrui@loongson.cn> Signed-off-by: Song Gao --- target/loongarch/kvm/kvm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/target/loongarch/kvm/kvm.c b/target/loongarch/kvm/kvm.c index e7c9ef830c..29944b9ef8 100644 --- a/target/loongarch/kvm/kvm.c +++ b/target/loongarch/kvm/kvm.c @@ -665,6 +665,7 @@ int kvm_arch_get_default_type(MachineState *ms) int kvm_arch_init(MachineState *ms, KVMState *s) { + cap_has_mp_state = kvm_check_extension(s, KVM_CAP_MP_STATE); return 0; } -- Gitee From d7d47c044c9854675b75b91ade678d03316d9271 Mon Sep 17 00:00:00 2001 From: Tianrui Zhao Date: Fri, 5 Jan 2024 15:58:01 +0800 Subject: [PATCH 017/939] target/loongarch: Implement kvm_arch_init_vcpu Implement kvm_arch_init_vcpu interface for loongarch, in this function, we register VM change state handler. And when VM state changes to running, the counter value should be put into kvm to keep consistent with kvm, and when state change to stop, counter value should be refreshed from kvm. 
Signed-off-by: Tianrui Zhao Signed-off-by: xianglai li Reviewed-by: Song Gao Message-Id: <20240105075804.1228596-7-zhaotianrui@loongson.cn> Signed-off-by: Song Gao --- target/loongarch/cpu.h | 2 ++ target/loongarch/kvm/kvm.c | 23 +++++++++++++++++++++++ target/loongarch/trace-events | 2 ++ 3 files changed, 27 insertions(+) diff --git a/target/loongarch/cpu.h b/target/loongarch/cpu.h index f4a89bd626..8ebd6fa1a7 100644 --- a/target/loongarch/cpu.h +++ b/target/loongarch/cpu.h @@ -381,6 +381,8 @@ struct ArchCPU { /* 'compatible' string for this CPU for Linux device trees */ const char *dtb_compatible; + /* used by KVM_REG_LOONGARCH_COUNTER ioctl to access guest time counters */ + uint64_t kvm_state_counter; }; /** diff --git a/target/loongarch/kvm/kvm.c b/target/loongarch/kvm/kvm.c index 29944b9ef8..85e7aeb083 100644 --- a/target/loongarch/kvm/kvm.c +++ b/target/loongarch/kvm/kvm.c @@ -617,8 +617,31 @@ int kvm_arch_put_registers(CPUState *cs, int level) return ret; } +static void kvm_loongarch_vm_stage_change(void *opaque, bool running, + RunState state) +{ + int ret; + CPUState *cs = opaque; + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + + if (running) { + ret = kvm_set_one_reg(cs, KVM_REG_LOONGARCH_COUNTER, + &cpu->kvm_state_counter); + if (ret < 0) { + trace_kvm_failed_put_counter(strerror(errno)); + } + } else { + ret = kvm_get_one_reg(cs, KVM_REG_LOONGARCH_COUNTER, + &cpu->kvm_state_counter); + if (ret < 0) { + trace_kvm_failed_get_counter(strerror(errno)); + } + } +} + int kvm_arch_init_vcpu(CPUState *cs) { + qemu_add_vm_change_state_handler(kvm_loongarch_vm_stage_change, cs); return 0; } diff --git a/target/loongarch/trace-events b/target/loongarch/trace-events index 6827ab566a..937c3c7c0c 100644 --- a/target/loongarch/trace-events +++ b/target/loongarch/trace-events @@ -7,5 +7,7 @@ kvm_failed_get_fpu(const char *msg) "Failed to get fpu from KVM: %s" kvm_failed_put_fpu(const char *msg) "Failed to put fpu into KVM: %s" kvm_failed_get_mpstate(const char *msg) "Failed to get mp_state from KVM: %s" kvm_failed_put_mpstate(const char *msg) "Failed to put mp_state into KVM: %s" +kvm_failed_get_counter(const char *msg) "Failed to get counter from KVM: %s" +kvm_failed_put_counter(const char *msg) "Failed to put counter into KVM: %s" kvm_failed_get_cpucfg(const char *msg) "Failed to get cpucfg from KVM: %s" kvm_failed_put_cpucfg(const char *msg) "Failed to put cpucfg into KVM: %s" -- Gitee From 3feeca228b010716aacdf7159df10ea63f7e34cd Mon Sep 17 00:00:00 2001 From: Tianrui Zhao Date: Fri, 5 Jan 2024 15:58:02 +0800 Subject: [PATCH 018/939] target/loongarch: Implement kvm_arch_handle_exit Implement kvm_arch_handle_exit for loongarch. In this function, the KVM_EXIT_LOONGARCH_IOCSR is handled, we read or write the iocsr address space by the addr, length and is_write argument in kvm_run. 
Signed-off-by: Tianrui Zhao Signed-off-by: xianglai li Reviewed-by: Richard Henderson Reviewed-by: Song Gao Message-Id: <20240105075804.1228596-8-zhaotianrui@loongson.cn> Signed-off-by: Song Gao --- target/loongarch/kvm/kvm.c | 24 +++++++++++++++++++++++- target/loongarch/trace-events | 1 + 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/target/loongarch/kvm/kvm.c b/target/loongarch/kvm/kvm.c index 85e7aeb083..d2dab3fef4 100644 --- a/target/loongarch/kvm/kvm.c +++ b/target/loongarch/kvm/kvm.c @@ -723,7 +723,29 @@ bool kvm_arch_cpu_check_are_resettable(void) int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) { - return 0; + int ret = 0; + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + CPULoongArchState *env = &cpu->env; + MemTxAttrs attrs = {}; + + attrs.requester_id = env_cpu(env)->cpu_index; + + trace_kvm_arch_handle_exit(run->exit_reason); + switch (run->exit_reason) { + case KVM_EXIT_LOONGARCH_IOCSR: + address_space_rw(&env->address_space_iocsr, + run->iocsr_io.phys_addr, + attrs, + run->iocsr_io.data, + run->iocsr_io.len, + run->iocsr_io.is_write); + break; + default: + ret = -1; + warn_report("KVM: unknown exit reason %d", run->exit_reason); + break; + } + return ret; } void kvm_arch_accel_class_init(ObjectClass *oc) diff --git a/target/loongarch/trace-events b/target/loongarch/trace-events index 937c3c7c0c..021839880e 100644 --- a/target/loongarch/trace-events +++ b/target/loongarch/trace-events @@ -11,3 +11,4 @@ kvm_failed_get_counter(const char *msg) "Failed to get counter from KVM: %s" kvm_failed_put_counter(const char *msg) "Failed to put counter into KVM: %s" kvm_failed_get_cpucfg(const char *msg) "Failed to get cpucfg from KVM: %s" kvm_failed_put_cpucfg(const char *msg) "Failed to put cpucfg into KVM: %s" +kvm_arch_handle_exit(int num) "kvm arch handle exit, the reason number: %d" -- Gitee From 2d0d05b7d5925f71d7ddd4df9f1ac12add453298 Mon Sep 17 00:00:00 2001 From: qihao Date: Thu, 7 Mar 2024 10:39:23 +0800 Subject: [PATCH 019/939] chardev/char-socket: Fix TLS io channels sending too much data to the backend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cheery-pick from 462945cd22d2bcd233401ed3aa167d83a8e35b05 Commit ffda5db65a ("io/channel-tls: fix handling of bigger read buffers") changed the behavior of the TLS io channels to schedule a second reading attempt if there is still incoming data pending. This caused a regression with backends like the sclpconsole that check in their read function that the sender does not try to write more bytes to it than the device can currently handle. 
The problem can be reproduced like this: 1) In one terminal, do this: mkdir qemu-pki cd qemu-pki openssl genrsa 2048 > ca-key.pem openssl req -new -x509 -nodes -days 365000 -key ca-key.pem -out ca-cert.pem # enter some dummy value for the cert openssl genrsa 2048 > server-key.pem openssl req -new -x509 -nodes -days 365000 -key server-key.pem \ -out server-cert.pem # enter some other dummy values for the cert gnutls-serv --echo --x509cafile ca-cert.pem --x509keyfile server-key.pem \ --x509certfile server-cert.pem -p 8338 2) In another terminal, do this: wget https://download.fedoraproject.org/pub/fedora-secondary/releases/39/Cloud/s390x/images/Fedora-Cloud-Base-39-1.5.s390x.qcow2 qemu-system-s390x -nographic -nodefaults \ -hda Fedora-Cloud-Base-39-1.5.s390x.qcow2 \ -object tls-creds-x509,id=tls0,endpoint=client,verify-peer=false,dir=$PWD/qemu-pki \ -chardev socket,id=tls_chardev,host=localhost,port=8338,tls-creds=tls0 \ -device sclpconsole,chardev=tls_chardev,id=tls_serial QEMU then aborts after a second or two with: qemu-system-s390x: ../hw/char/sclpconsole.c:73: chr_read: Assertion `size <= SIZE_BUFFER_VT220 - scon->iov_data_len' failed. Aborted (core dumped) It looks like the second read does not trigger the chr_can_read() function to be called before the second read, which should normally always be done before sending bytes to a character device to see how much it can handle, so the s->max_size in tcp_chr_read() still contains the old value from the previous read. Let's make sure that we use the up-to-date value by calling tcp_chr_read_poll() again here. Fixes: ffda5db65a ("io/channel-tls: fix handling of bigger read buffers") Buglink: https://issues.redhat.com/browse/RHEL-24614 Reviewed-by: "Daniel P. Berrangé" Message-ID: <20240229104339.42574-1-thuth@redhat.com> Reviewed-by: Antoine Damhet Tested-by: Antoine Damhet Reviewed-by: Marc-André Lureau Signed-off-by: Thomas Huth Signed-off-by: qihao_yewu --- chardev/char-socket.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/chardev/char-socket.c b/chardev/char-socket.c index 73947da188..034840593d 100644 --- a/chardev/char-socket.c +++ b/chardev/char-socket.c @@ -492,9 +492,9 @@ static gboolean tcp_chr_read(QIOChannel *chan, GIOCondition cond, void *opaque) s->max_size <= 0) { return TRUE; } - len = sizeof(buf); - if (len > s->max_size) { - len = s->max_size; + len = tcp_chr_read_poll(opaque); + if (len > sizeof(buf)) { + len = sizeof(buf); } size = tcp_chr_recv(chr, (void *)buf, len); if (size == 0 || (size == -1 && errno != EAGAIN)) { -- Gitee From c952c9acfab98a83122b4e6d406f4a7a0dfe871f Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Mon, 15 Jan 2024 04:13:24 -0500 Subject: [PATCH 020/939] i386/cpu: Clear FEAT_XSAVE_XSS_LO/HI leafs when CPUID_EXT_XSAVE is not available commit 81f5cad3858f27623b1b14467926032d229b76cc upstream. Leaf FEAT_XSAVE_XSS_LO and FEAT_XSAVE_XSS_HI also need to be cleared when CPUID_EXT_XSAVE is not set. 
Fixes: 301e90675c3f ("target/i386: Enable support for XSAVES based features") Signed-off-by: Xiaoyao Li Reviewed-by: Yang Weijiang Message-ID: <20240115091325.1904229-2-xiaoyao.li@intel.com> Cc: qemu-stable@nongnu.org Signed-off-by: Paolo Bonzini Signed-off-by: Jason Zeng --- target/i386/cpu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index cd16cb893d..8b9ef218d3 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -6927,6 +6927,8 @@ static void x86_cpu_enable_xsave_components(X86CPU *cpu) if (!(env->features[FEAT_1_ECX] & CPUID_EXT_XSAVE)) { env->features[FEAT_XSAVE_XCR0_LO] = 0; env->features[FEAT_XSAVE_XCR0_HI] = 0; + env->features[FEAT_XSAVE_XSS_LO] = 0; + env->features[FEAT_XSAVE_XSS_HI] = 0; return; } -- Gitee From 26ddb3428182503b28ac87cad7543eb241a9d353 Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Mon, 15 Jan 2024 04:13:25 -0500 Subject: [PATCH 021/939] i386/cpu: Mask with XCR0/XSS mask for FEAT_XSAVE_XCR0_HI and FEAT_XSAVE_XSS_HI leafs commit a11a365159b944e05be76f3ec3b98c8b38cb70fd upstream. The value of FEAT_XSAVE_XCR0_HI leaf and FEAT_XSAVE_XSS_HI leaf also need to be masked by XCR0 and XSS mask respectively, to make it logically correct. Fixes: 301e90675c3f ("target/i386: Enable support for XSAVES based features") Signed-off-by: Xiaoyao Li Reviewed-by: Yang Weijiang Message-ID: <20240115091325.1904229-3-xiaoyao.li@intel.com> Cc: qemu-stable@nongnu.org Signed-off-by: Paolo Bonzini Signed-off-by: Jason Zeng --- target/i386/cpu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 8b9ef218d3..a66e5a357b 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -6947,9 +6947,9 @@ static void x86_cpu_enable_xsave_components(X86CPU *cpu) } env->features[FEAT_XSAVE_XCR0_LO] = mask & CPUID_XSTATE_XCR0_MASK; - env->features[FEAT_XSAVE_XCR0_HI] = mask >> 32; + env->features[FEAT_XSAVE_XCR0_HI] = (mask & CPUID_XSTATE_XCR0_MASK) >> 32; env->features[FEAT_XSAVE_XSS_LO] = mask & CPUID_XSTATE_XSS_MASK; - env->features[FEAT_XSAVE_XSS_HI] = mask >> 32; + env->features[FEAT_XSAVE_XSS_HI] = (mask & CPUID_XSTATE_XSS_MASK) >> 32; } /***** Steps involved on loading and filtering CPUID data -- Gitee From 576170252c3cbd79ed918f688d088f1ccd15602a Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Wed, 24 Jan 2024 21:40:14 -0500 Subject: [PATCH 022/939] i386/cpuid: Decrease cpuid_i when skipping CPUID leaf 1F commit 10f92799af8ba3c3cef2352adcd4780f13fbab31 upstream. Existing code misses a decrement of cpuid_i when skip leaf 0x1F. There's a blank CPUID entry(with leaf, subleaf as 0, and all fields stuffed 0s) left in the CPUID array. It conflicts with correct CPUID leaf 0. Signed-off-by: Xiaoyao Li Reviewed-by:Yang Weijiang Message-ID: <20240125024016.2521244-2-xiaoyao.li@intel.com> Cc: qemu-stable@nongnu.org Signed-off-by: Paolo Bonzini Signed-off-by: Jason Zeng --- target/i386/kvm/kvm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index 4ce80555b4..e68eb8f5e6 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -1914,6 +1914,7 @@ int kvm_arch_init_vcpu(CPUState *cs) } case 0x1f: if (env->nr_dies < 2) { + cpuid_i--; break; } /* fallthrough */ -- Gitee From bf3d3ecf9ff5808d1f03e83a363c8295f7abad76 Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Wed, 24 Jan 2024 21:40:16 -0500 Subject: [PATCH 023/939] i386/cpuid: Move leaf 7 to correct group commit 0729857c707535847d7fe31d3d91eb8b2a118e3c upstream. 
CPUID leaf 7 was grouped together with SGX leaf 0x12 by commit b9edbadefb9e ("i386: Propagate SGX CPUID sub-leafs to KVM") by mistake. SGX leaf 0x12 has its specific logic to check if subleaf (starting from 2) is valid or not by checking the bit 0:3 of corresponding EAX is 1 or not. Leaf 7 follows the logic that EAX of subleaf 0 enumerates the maximum valid subleaf. Fixes: b9edbadefb9e ("i386: Propagate SGX CPUID sub-leafs to KVM") Signed-off-by: Xiaoyao Li Message-ID: <20240125024016.2521244-4-xiaoyao.li@intel.com> Cc: qemu-stable@nongnu.org Signed-off-by: Paolo Bonzini Signed-off-by: Jason Zeng --- target/i386/kvm/kvm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index e68eb8f5e6..a0bc9ea7b1 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -1955,7 +1955,6 @@ int kvm_arch_init_vcpu(CPUState *cs) c = &cpuid_data.entries[cpuid_i++]; } break; - case 0x7: case 0x12: for (j = 0; ; j++) { c->function = i; @@ -1975,6 +1974,7 @@ int kvm_arch_init_vcpu(CPUState *cs) c = &cpuid_data.entries[cpuid_i++]; } break; + case 0x7: case 0x14: case 0x1d: case 0x1e: { -- Gitee From 773ea71519da1413ca2e0e60857272164e156a47 Mon Sep 17 00:00:00 2001 From: Tianrui Zhao Date: Wed, 10 Jan 2024 10:41:51 +0100 Subject: [PATCH 024/939] target/loongarch: Restrict TCG-specific code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation of supporting KVM in the next commit. Conflict: diff --cc target/loongarch/cpu.c index 275833eec8,70dd4622aa..0000000000 --- a/target/loongarch/cpu.c +++ b/target/loongarch/cpu.c @@@ -17,9 -17,14 +17,17 @@@ #include "internals.h" #include "fpu/softfloat-helpers.h" #include "cpu-csr.h" -#ifndef CONFIG_USER_ONLY #include "sysemu/reset.h" ++<<<<<<< HEAD +#include "tcg/tcg.h" ++======= + #endif ++>>>>>>> target/loongarch: Restrict TCG-specific code #include "vec.h" + #ifdef CONFIG_TCG + #include "exec/cpu_ldst.h" + #include "tcg/tcg.h" + #endif Solve: drop: ++<<<<<<< HEAD +#include "tcg/tcg.h" ++======= + #endif ++>>>>>>> target/loongarch: Restrict TCG-specific code Signed-off-by: Tianrui Zhao Signed-off-by: xianglai li Reviewed-by: Song Gao Message-ID: <20240105075804.1228596-9-zhaotianrui@loongson.cn> [PMD: Split from bigger patch, part 1] Signed-off-by: Philippe Mathieu-Daudé Message-Id: <20240110094152.52138-1-philmd@linaro.org> Signed-off-by: Song Gao --- target/loongarch/cpu.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c index 275833eec8..60f2636b43 100644 --- a/target/loongarch/cpu.c +++ b/target/loongarch/cpu.c @@ -11,15 +11,18 @@ #include "qapi/error.h" #include "qemu/module.h" #include "sysemu/qtest.h" -#include "exec/cpu_ldst.h" +#include "sysemu/tcg.h" #include "exec/exec-all.h" #include "cpu.h" #include "internals.h" #include "fpu/softfloat-helpers.h" #include "cpu-csr.h" #include "sysemu/reset.h" -#include "tcg/tcg.h" #include "vec.h" +#ifdef CONFIG_TCG +#include "exec/cpu_ldst.h" +#include "tcg/tcg.h" +#endif const char * const regnames[32] = { "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", @@ -108,12 +111,13 @@ void loongarch_cpu_set_irq(void *opaque, int irq, int level) return; } - env->CSR_ESTAT = deposit64(env->CSR_ESTAT, irq, 1, level != 0); - - if (FIELD_EX64(env->CSR_ESTAT, CSR_ESTAT, IS)) { - cpu_interrupt(cs, CPU_INTERRUPT_HARD); - } else { - cpu_reset_interrupt(cs, CPU_INTERRUPT_HARD); + if (tcg_enabled()) { + env->CSR_ESTAT = 
deposit64(env->CSR_ESTAT, irq, 1, level != 0); + if (FIELD_EX64(env->CSR_ESTAT, CSR_ESTAT, IS)) { + cpu_interrupt(cs, CPU_INTERRUPT_HARD); + } else { + cpu_reset_interrupt(cs, CPU_INTERRUPT_HARD); + } } } @@ -138,7 +142,10 @@ static inline bool cpu_loongarch_hw_interrupts_pending(CPULoongArchState *env) return (pending & status) != 0; } +#endif +#ifdef CONFIG_TCG +#ifndef CONFIG_USER_ONLY static void loongarch_cpu_do_interrupt(CPUState *cs) { LoongArchCPU *cpu = LOONGARCH_CPU(cs); @@ -320,7 +327,6 @@ static bool loongarch_cpu_exec_interrupt(CPUState *cs, int interrupt_request) } #endif -#ifdef CONFIG_TCG static void loongarch_cpu_synchronize_from_tb(CPUState *cs, const TranslationBlock *tb) { @@ -558,7 +564,9 @@ static void loongarch_cpu_reset_hold(Object *obj) } #endif +#ifdef CONFIG_TCG restore_fp_status(env); +#endif cs->exception_index = -1; } @@ -701,8 +709,10 @@ static void loongarch_cpu_init(Object *obj) CPULoongArchState *env = &cpu->env; qdev_init_gpio_in(DEVICE(cpu), loongarch_cpu_set_irq, N_IRQS); +#ifdef CONFIG_TCG timer_init_ns(&cpu->timer, QEMU_CLOCK_VIRTUAL, &loongarch_constant_timer_cb, cpu); +#endif memory_region_init_io(&env->system_iocsr, OBJECT(cpu), NULL, env, "iocsr", UINT64_MAX); address_space_init(&env->address_space_iocsr, &env->system_iocsr, "IOCSR"); @@ -802,7 +812,9 @@ static struct TCGCPUOps loongarch_tcg_ops = { #include "hw/core/sysemu-cpu-ops.h" static const struct SysemuCPUOps loongarch_sysemu_ops = { +#ifdef CONFIG_TCG .get_phys_page_debug = loongarch_cpu_get_phys_page_debug, +#endif }; static int64_t loongarch_cpu_get_arch_id(CPUState *cs) -- Gitee From 5f4c8b31db442e6ac39fbfe4b29d5479ab3567aa Mon Sep 17 00:00:00 2001 From: Tianrui Zhao Date: Wed, 10 Jan 2024 10:41:52 +0100 Subject: [PATCH 025/939] target/loongarch: Implement set vcpu intr for kvm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement loongarch kvm set vcpu interrupt interface, when a irq is set in vcpu, we use the KVM_INTERRUPT ioctl to set intr into kvm. 
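For context, the level is folded into the sign of the irq number handed to KVM: a positive value asserts the vcpu interrupt line and a negative value deasserts it, as the kvm_loongarch_set_interrupt() hunk below shows. A standalone sketch of that encoding, using the generic Linux UAPI directly rather than QEMU's kvm_vcpu_ioctl() wrapper (the vcpu_fd argument is assumed to be an already-created vCPU file descriptor):

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Assert (level != 0) or deassert (level == 0) an interrupt line on a vCPU.
 * struct kvm_interrupt carries a single irq field; the sign encodes level. */
static int set_vcpu_irq(int vcpu_fd, int irq, int level)
{
    struct kvm_interrupt intr;

    intr.irq = level ? irq : -irq;
    return ioctl(vcpu_fd, KVM_INTERRUPT, &intr);
}
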
Signed-off-by: Tianrui Zhao Signed-off-by: xianglai li Reviewed-by: Song Gao Message-ID: <20240105075804.1228596-9-zhaotianrui@loongson.cn> [PMD: Split from bigger patch, part 2] Signed-off-by: Philippe Mathieu-Daudé Message-Id: <20240110094152.52138-2-philmd@linaro.org> Signed-off-by: Song Gao --- target/loongarch/cpu.c | 9 ++++++++- target/loongarch/kvm/kvm.c | 15 +++++++++++++++ target/loongarch/kvm/kvm_loongarch.h | 16 ++++++++++++++++ target/loongarch/trace-events | 1 + 4 files changed, 40 insertions(+), 1 deletion(-) create mode 100644 target/loongarch/kvm/kvm_loongarch.h diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c index 60f2636b43..413414392b 100644 --- a/target/loongarch/cpu.c +++ b/target/loongarch/cpu.c @@ -12,6 +12,8 @@ #include "qemu/module.h" #include "sysemu/qtest.h" #include "sysemu/tcg.h" +#include "sysemu/kvm.h" +#include "kvm/kvm_loongarch.h" #include "exec/exec-all.h" #include "cpu.h" #include "internals.h" @@ -19,6 +21,9 @@ #include "cpu-csr.h" #include "sysemu/reset.h" #include "vec.h" +#ifdef CONFIG_KVM +#include +#endif #ifdef CONFIG_TCG #include "exec/cpu_ldst.h" #include "tcg/tcg.h" @@ -111,7 +116,9 @@ void loongarch_cpu_set_irq(void *opaque, int irq, int level) return; } - if (tcg_enabled()) { + if (kvm_enabled()) { + kvm_loongarch_set_interrupt(cpu, irq, level); + } else if (tcg_enabled()) { env->CSR_ESTAT = deposit64(env->CSR_ESTAT, irq, 1, level != 0); if (FIELD_EX64(env->CSR_ESTAT, CSR_ESTAT, IS)) { cpu_interrupt(cs, CPU_INTERRUPT_HARD); diff --git a/target/loongarch/kvm/kvm.c b/target/loongarch/kvm/kvm.c index d2dab3fef4..bd33ec2114 100644 --- a/target/loongarch/kvm/kvm.c +++ b/target/loongarch/kvm/kvm.c @@ -748,6 +748,21 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) return ret; } +int kvm_loongarch_set_interrupt(LoongArchCPU *cpu, int irq, int level) +{ + struct kvm_interrupt intr; + CPUState *cs = CPU(cpu); + + if (level) { + intr.irq = irq; + } else { + intr.irq = -irq; + } + + trace_kvm_set_intr(irq, level); + return kvm_vcpu_ioctl(cs, KVM_INTERRUPT, &intr); +} + void kvm_arch_accel_class_init(ObjectClass *oc) { } diff --git a/target/loongarch/kvm/kvm_loongarch.h b/target/loongarch/kvm/kvm_loongarch.h new file mode 100644 index 0000000000..d945b6bb82 --- /dev/null +++ b/target/loongarch/kvm/kvm_loongarch.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * QEMU LoongArch kvm interface + * + * Copyright (c) 2023 Loongson Technology Corporation Limited + */ + +#include "cpu.h" + +#ifndef QEMU_KVM_LOONGARCH_H +#define QEMU_KVM_LOONGARCH_H + +int kvm_loongarch_set_interrupt(LoongArchCPU *cpu, int irq, int level); +void kvm_arch_reset_vcpu(CPULoongArchState *env); + +#endif diff --git a/target/loongarch/trace-events b/target/loongarch/trace-events index 021839880e..dea11edc0f 100644 --- a/target/loongarch/trace-events +++ b/target/loongarch/trace-events @@ -12,3 +12,4 @@ kvm_failed_put_counter(const char *msg) "Failed to put counter into KVM: %s" kvm_failed_get_cpucfg(const char *msg) "Failed to get cpucfg from KVM: %s" kvm_failed_put_cpucfg(const char *msg) "Failed to put cpucfg into KVM: %s" kvm_arch_handle_exit(int num) "kvm arch handle exit, the reason number: %d" +kvm_set_intr(int irq, int level) "kvm set interrupt, irq num: %d, level: %d" -- Gitee From 49a7ae85d6ac42f8ef556a0d42802508c28adfcc Mon Sep 17 00:00:00 2001 From: Tianrui Zhao Date: Fri, 5 Jan 2024 15:58:04 +0800 Subject: [PATCH 026/939] target/loongarch: Add loongarch kvm into meson build Add kvm.c into meson.build to compile it when kvm 
is configed. Meanwhile in meson.build, we set the kvm_targets to loongarch64-softmmu when the cpu is loongarch. And fix the compiling error when config is enable-kvm,disable-tcg. Signed-off-by: Tianrui Zhao Signed-off-by: xianglai li Reviewed-by: Richard Henderson Reviewed-by: Song Gao Message-Id: <20240105075804.1228596-10-zhaotianrui@loongson.cn> Signed-off-by: Song Gao --- meson.build | 2 ++ target/loongarch/kvm/meson.build | 1 + target/loongarch/meson.build | 1 + 3 files changed, 4 insertions(+) create mode 100644 target/loongarch/kvm/meson.build diff --git a/meson.build b/meson.build index 445f2b7c2b..0c62b4156d 100644 --- a/meson.build +++ b/meson.build @@ -114,6 +114,8 @@ elif cpu in ['riscv32'] kvm_targets = ['riscv32-softmmu'] elif cpu in ['riscv64'] kvm_targets = ['riscv64-softmmu'] +elif cpu in ['loongarch64'] + kvm_targets = ['loongarch64-softmmu'] else kvm_targets = [] endif diff --git a/target/loongarch/kvm/meson.build b/target/loongarch/kvm/meson.build new file mode 100644 index 0000000000..2266de6ca9 --- /dev/null +++ b/target/loongarch/kvm/meson.build @@ -0,0 +1 @@ +loongarch_ss.add(when: 'CONFIG_KVM', if_true: files('kvm.c')) diff --git a/target/loongarch/meson.build b/target/loongarch/meson.build index e84e4c51f4..db310f6022 100644 --- a/target/loongarch/meson.build +++ b/target/loongarch/meson.build @@ -18,3 +18,4 @@ subdir('tcg') target_arch += {'loongarch': loongarch_ss} target_system_arch += {'loongarch': loongarch_system_ss} +subdir('kvm') -- Gitee From b8f53cfa91e86d5163318f8ade1cca18e94f3eb7 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Wed, 13 Dec 2023 12:12:01 +0800 Subject: [PATCH 027/939] hw/intc/loongarch_ipi: Use MemTxAttrs interface for ipi ops There are two interface pairs for MemoryRegionOps, read/write and read_with_attrs/write_with_attrs. The later is better for ipi device emulation since initial cpu can be parsed from attrs.requester_id. And requester_id can be overrided for IOCSR_IPI_SEND and mail_send function when it is to forward message to another vcpu. 
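The practical difference between the two interface pairs is that read_with_attrs/write_with_attrs receive a MemTxAttrs argument and return a MemTxResult, so a handler can tell which CPU issued the access (attrs.requester_id) and can report decode errors instead of failing silently. A minimal standalone model of that idea, independent of QEMU's real types (the struct, enum and MAX_CPUS names here are purely illustrative):

#include <stdint.h>
#include <stdio.h>

typedef struct { int requester_id; } tx_attrs;        /* stand-in for MemTxAttrs */
enum tx_result { TX_OK = 0, TX_DECODE_ERROR = 1 };    /* stand-in for MemTxResult */

#define MAX_CPUS 4
static uint32_t mailbox[MAX_CPUS];                    /* one mailbox word per cpu */

/* Write handler: destination cpu id lives in val[25:16], as in IOCSR_MAIL_SEND. */
static enum tx_result mailbox_write(uint64_t val, tx_attrs attrs)
{
    unsigned target = (val >> 16) & 0x3ff;

    if (target >= MAX_CPUS) {
        return TX_DECODE_ERROR;       /* a bad cpu id is now reportable */
    }
    attrs.requester_id = target;      /* override the requester when forwarding */
    mailbox[attrs.requester_id] = (uint32_t)val;
    return TX_OK;
}

int main(void)
{
    tx_attrs attrs = { .requester_id = 0 };

    printf("%d\n", mailbox_write((2u << 16) | 0x5, attrs));  /* prints 0 (TX_OK) */
    printf("%d\n", mailbox_write((9u << 16) | 0x5, attrs));  /* prints 1 (error) */
    return 0;
}
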
Signed-off-by: Bibo Mao Reviewed-by: Song Gao Message-Id: <20231215100333.3933632-2-maobibo@loongson.cn> Signed-off-by: Song Gao --- hw/intc/loongarch_ipi.c | 136 +++++++++++++++++++++++----------------- 1 file changed, 77 insertions(+), 59 deletions(-) diff --git a/hw/intc/loongarch_ipi.c b/hw/intc/loongarch_ipi.c index 67858b521c..221246c5cb 100644 --- a/hw/intc/loongarch_ipi.c +++ b/hw/intc/loongarch_ipi.c @@ -17,14 +17,16 @@ #include "target/loongarch/internals.h" #include "trace.h" -static void loongarch_ipi_writel(void *, hwaddr, uint64_t, unsigned); - -static uint64_t loongarch_ipi_readl(void *opaque, hwaddr addr, unsigned size) +static MemTxResult loongarch_ipi_readl(void *opaque, hwaddr addr, + uint64_t *data, + unsigned size, MemTxAttrs attrs) { - IPICore *s = opaque; + IPICore *s; + LoongArchIPI *ipi = opaque; uint64_t ret = 0; int index = 0; + s = &ipi->ipi_core; addr &= 0xff; switch (addr) { case CORE_STATUS_OFF: @@ -49,10 +51,12 @@ static uint64_t loongarch_ipi_readl(void *opaque, hwaddr addr, unsigned size) } trace_loongarch_ipi_read(size, (uint64_t)addr, ret); - return ret; + *data = ret; + return MEMTX_OK; } -static void send_ipi_data(CPULoongArchState *env, uint64_t val, hwaddr addr) +static void send_ipi_data(CPULoongArchState *env, uint64_t val, hwaddr addr, + MemTxAttrs attrs) { int i, mask = 0, data = 0; @@ -62,7 +66,7 @@ static void send_ipi_data(CPULoongArchState *env, uint64_t val, hwaddr addr) */ if ((val >> 27) & 0xf) { data = address_space_ldl(&env->address_space_iocsr, addr, - MEMTXATTRS_UNSPECIFIED, NULL); + attrs, NULL); for (i = 0; i < 4; i++) { /* get mask for byte writing */ if (val & (0x1 << (27 + i))) { @@ -74,7 +78,7 @@ static void send_ipi_data(CPULoongArchState *env, uint64_t val, hwaddr addr) data &= mask; data |= (val >> 32) & ~mask; address_space_stl(&env->address_space_iocsr, addr, - data, MEMTXATTRS_UNSPECIFIED, NULL); + data, attrs, NULL); } static int archid_cmp(const void *a, const void *b) @@ -103,80 +107,72 @@ static CPUState *ipi_getcpu(int arch_id) CPUArchId *archid; archid = find_cpu_by_archid(machine, arch_id); - return CPU(archid->cpu); -} - -static void ipi_send(uint64_t val) -{ - uint32_t cpuid; - uint8_t vector; - CPUState *cs; - LoongArchCPU *cpu; - LoongArchIPI *s; - - cpuid = extract32(val, 16, 10); - if (cpuid >= LOONGARCH_MAX_CPUS) { - trace_loongarch_ipi_unsupported_cpuid("IOCSR_IPI_SEND", cpuid); - return; + if (archid) { + return CPU(archid->cpu); } - /* IPI status vector */ - vector = extract8(val, 0, 5); - - cs = ipi_getcpu(cpuid); - cpu = LOONGARCH_CPU(cs); - s = LOONGARCH_IPI(cpu->env.ipistate); - loongarch_ipi_writel(&s->ipi_core, CORE_SET_OFF, BIT(vector), 4); + return NULL; } -static void mail_send(uint64_t val) +static MemTxResult mail_send(uint64_t val, MemTxAttrs attrs) { uint32_t cpuid; hwaddr addr; - CPULoongArchState *env; CPUState *cs; - LoongArchCPU *cpu; cpuid = extract32(val, 16, 10); if (cpuid >= LOONGARCH_MAX_CPUS) { trace_loongarch_ipi_unsupported_cpuid("IOCSR_MAIL_SEND", cpuid); - return; + return MEMTX_DECODE_ERROR; } - addr = 0x1020 + (val & 0x1c); cs = ipi_getcpu(cpuid); - cpu = LOONGARCH_CPU(cs); - env = &cpu->env; - send_ipi_data(env, val, addr); + if (cs == NULL) { + return MEMTX_DECODE_ERROR; + } + + /* override requester_id */ + addr = SMP_IPI_MAILBOX + CORE_BUF_20 + (val & 0x1c); + attrs.requester_id = cs->cpu_index; + send_ipi_data(&LOONGARCH_CPU(cs)->env, val, addr, attrs); + return MEMTX_OK; } -static void any_send(uint64_t val) +static MemTxResult any_send(uint64_t val, MemTxAttrs attrs) { 
uint32_t cpuid; hwaddr addr; - CPULoongArchState *env; CPUState *cs; - LoongArchCPU *cpu; cpuid = extract32(val, 16, 10); if (cpuid >= LOONGARCH_MAX_CPUS) { trace_loongarch_ipi_unsupported_cpuid("IOCSR_ANY_SEND", cpuid); - return; + return MEMTX_DECODE_ERROR; } - addr = val & 0xffff; cs = ipi_getcpu(cpuid); - cpu = LOONGARCH_CPU(cs); - env = &cpu->env; - send_ipi_data(env, val, addr); + if (cs == NULL) { + return MEMTX_DECODE_ERROR; + } + + /* override requester_id */ + addr = val & 0xffff; + attrs.requester_id = cs->cpu_index; + send_ipi_data(&LOONGARCH_CPU(cs)->env, val, addr, attrs); + return MEMTX_OK; } -static void loongarch_ipi_writel(void *opaque, hwaddr addr, uint64_t val, - unsigned size) +static MemTxResult loongarch_ipi_writel(void *opaque, hwaddr addr, uint64_t val, + unsigned size, MemTxAttrs attrs) { - IPICore *s = opaque; + LoongArchIPI *ipi = opaque; + IPICore *s; int index = 0; + uint32_t cpuid; + uint8_t vector; + CPUState *cs; + s = &ipi->ipi_core; addr &= 0xff; trace_loongarch_ipi_write(size, (uint64_t)addr, val); switch (addr) { @@ -203,17 +199,35 @@ static void loongarch_ipi_writel(void *opaque, hwaddr addr, uint64_t val, s->buf[index] = val; break; case IOCSR_IPI_SEND: - ipi_send(val); + cpuid = extract32(val, 16, 10); + if (cpuid >= LOONGARCH_MAX_CPUS) { + trace_loongarch_ipi_unsupported_cpuid("IOCSR_IPI_SEND", cpuid); + return MEMTX_DECODE_ERROR; + } + + /* IPI status vector */ + vector = extract8(val, 0, 5); + cs = ipi_getcpu(cpuid); + if (cs == NULL) { + return MEMTX_DECODE_ERROR; + } + + /* override requester_id */ + attrs.requester_id = cs->cpu_index; + ipi = LOONGARCH_IPI(LOONGARCH_CPU(cs)->env.ipistate); + loongarch_ipi_writel(ipi, CORE_SET_OFF, BIT(vector), 4, attrs); break; default: qemu_log_mask(LOG_UNIMP, "invalid write: %x", (uint32_t)addr); break; } + + return MEMTX_OK; } static const MemoryRegionOps loongarch_ipi_ops = { - .read = loongarch_ipi_readl, - .write = loongarch_ipi_writel, + .read_with_attrs = loongarch_ipi_readl, + .write_with_attrs = loongarch_ipi_writel, .impl.min_access_size = 4, .impl.max_access_size = 4, .valid.min_access_size = 4, @@ -222,24 +236,28 @@ static const MemoryRegionOps loongarch_ipi_ops = { }; /* mail send and any send only support writeq */ -static void loongarch_ipi_writeq(void *opaque, hwaddr addr, uint64_t val, - unsigned size) +static MemTxResult loongarch_ipi_writeq(void *opaque, hwaddr addr, uint64_t val, + unsigned size, MemTxAttrs attrs) { + MemTxResult ret = MEMTX_OK; + addr &= 0xfff; switch (addr) { case MAIL_SEND_OFFSET: - mail_send(val); + ret = mail_send(val, attrs); break; case ANY_SEND_OFFSET: - any_send(val); + ret = any_send(val, attrs); break; default: break; } + + return ret; } static const MemoryRegionOps loongarch_ipi64_ops = { - .write = loongarch_ipi_writeq, + .write_with_attrs = loongarch_ipi_writeq, .impl.min_access_size = 8, .impl.max_access_size = 8, .valid.min_access_size = 8, @@ -253,7 +271,7 @@ static void loongarch_ipi_init(Object *obj) SysBusDevice *sbd = SYS_BUS_DEVICE(obj); memory_region_init_io(&s->ipi_iocsr_mem, obj, &loongarch_ipi_ops, - &s->ipi_core, "loongarch_ipi_iocsr", 0x48); + s, "loongarch_ipi_iocsr", 0x48); /* loongarch_ipi_iocsr performs re-entrant IO through ipi_send */ s->ipi_iocsr_mem.disable_reentrancy_guard = true; @@ -261,7 +279,7 @@ static void loongarch_ipi_init(Object *obj) sysbus_init_mmio(sbd, &s->ipi_iocsr_mem); memory_region_init_io(&s->ipi64_iocsr_mem, obj, &loongarch_ipi64_ops, - &s->ipi_core, "loongarch_ipi64_iocsr", 0x118); + s, "loongarch_ipi64_iocsr", 
0x118); sysbus_init_mmio(sbd, &s->ipi64_iocsr_mem); qdev_init_gpio_out(DEVICE(obj), &s->ipi_core.irq, 1); } -- Gitee From 43100bba2bfd9de0c3bab7c3e815b02faa69242d Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Wed, 13 Dec 2023 12:13:14 +0800 Subject: [PATCH 028/939] hw/loongarch/virt: Set iocsr address space per-board rather than percpu LoongArch system has iocsr address space, most iocsr registers are per-board, however some iocsr register spaces banked for percpu such as ipi mailbox and extioi interrupt status. For banked iocsr space, each cpu has the same iocsr space, but separate data. This patch changes iocsr address space per-board rather percpu, for iocsr registers specified for cpu, MemTxAttrs.requester_id can be parsed for the cpu. With this patches, the total address space on board will be simple, only iocsr address space and system memory, rather than the number of cpu and system memory. confict: +<<<<<<< HEAD + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_STRUCT(ipi_core, LoongArchIPI, 0, vmstate_ipi_core, IPICore), ++======= + .version_id = 2, + .minimum_version_id = 2, + .fields = (const VMStateField[]) { + VMSTATE_STRUCT_VARRAY_POINTER_UINT32(cpu, LoongArchIPI, num_cpu, + vmstate_ipi_core, IPICore), ++>>>>>>> hw/loongarch/virt: Set iocsr address space per-board rather than percpu solve: save: hw/loongarch/virt: Set iocsr address space per-board rather than percpu Signed-off-by: Bibo Mao Reviewed-by: Song Gao Message-Id: <20231215100333.3933632-3-maobibo@loongson.cn> Signed-off-by: Song Gao --- hw/intc/loongarch_extioi.c | 3 - hw/intc/loongarch_ipi.c | 63 +++++++++++++++----- hw/loongarch/virt.c | 91 +++++++++++++++++++++-------- include/hw/intc/loongarch_extioi.h | 1 - include/hw/intc/loongarch_ipi.h | 3 +- include/hw/loongarch/virt.h | 3 + target/loongarch/cpu.c | 48 --------------- target/loongarch/cpu.h | 4 +- target/loongarch/kvm/kvm.c | 2 +- target/loongarch/tcg/iocsr_helper.c | 16 ++--- 10 files changed, 129 insertions(+), 105 deletions(-) diff --git a/hw/intc/loongarch_extioi.c b/hw/intc/loongarch_extioi.c index 24fb3af8cc..77b4776958 100644 --- a/hw/intc/loongarch_extioi.c +++ b/hw/intc/loongarch_extioi.c @@ -282,9 +282,6 @@ static void loongarch_extioi_instance_init(Object *obj) qdev_init_gpio_in(DEVICE(obj), extioi_setirq, EXTIOI_IRQS); for (cpu = 0; cpu < EXTIOI_CPUS; cpu++) { - memory_region_init_io(&s->extioi_iocsr_mem[cpu], OBJECT(s), &extioi_ops, - s, "extioi_iocsr", 0x900); - sysbus_init_mmio(dev, &s->extioi_iocsr_mem[cpu]); for (pin = 0; pin < LS3A_INTC_IP; pin++) { qdev_init_gpio_out(DEVICE(obj), &s->parent_irq[cpu][pin], 1); } diff --git a/hw/intc/loongarch_ipi.c b/hw/intc/loongarch_ipi.c index 221246c5cb..e228669aa5 100644 --- a/hw/intc/loongarch_ipi.c +++ b/hw/intc/loongarch_ipi.c @@ -9,6 +9,7 @@ #include "hw/sysbus.h" #include "hw/intc/loongarch_ipi.h" #include "hw/irq.h" +#include "hw/qdev-properties.h" #include "qapi/error.h" #include "qemu/log.h" #include "exec/address-spaces.h" @@ -26,7 +27,7 @@ static MemTxResult loongarch_ipi_readl(void *opaque, hwaddr addr, uint64_t ret = 0; int index = 0; - s = &ipi->ipi_core; + s = &ipi->cpu[attrs.requester_id]; addr &= 0xff; switch (addr) { case CORE_STATUS_OFF: @@ -65,7 +66,7 @@ static void send_ipi_data(CPULoongArchState *env, uint64_t val, hwaddr addr, * if the mask is 0, we need not to do anything. 
*/ if ((val >> 27) & 0xf) { - data = address_space_ldl(&env->address_space_iocsr, addr, + data = address_space_ldl(env->address_space_iocsr, addr, attrs, NULL); for (i = 0; i < 4; i++) { /* get mask for byte writing */ @@ -77,7 +78,7 @@ static void send_ipi_data(CPULoongArchState *env, uint64_t val, hwaddr addr, data &= mask; data |= (val >> 32) & ~mask; - address_space_stl(&env->address_space_iocsr, addr, + address_space_stl(env->address_space_iocsr, addr, data, attrs, NULL); } @@ -172,7 +173,7 @@ static MemTxResult loongarch_ipi_writel(void *opaque, hwaddr addr, uint64_t val, uint8_t vector; CPUState *cs; - s = &ipi->ipi_core; + s = &ipi->cpu[attrs.requester_id]; addr &= 0xff; trace_loongarch_ipi_write(size, (uint64_t)addr, val); switch (addr) { @@ -214,7 +215,6 @@ static MemTxResult loongarch_ipi_writel(void *opaque, hwaddr addr, uint64_t val, /* override requester_id */ attrs.requester_id = cs->cpu_index; - ipi = LOONGARCH_IPI(LOONGARCH_CPU(cs)->env.ipistate); loongarch_ipi_writel(ipi, CORE_SET_OFF, BIT(vector), 4, attrs); break; default: @@ -265,12 +265,18 @@ static const MemoryRegionOps loongarch_ipi64_ops = { .endianness = DEVICE_LITTLE_ENDIAN, }; -static void loongarch_ipi_init(Object *obj) +static void loongarch_ipi_realize(DeviceState *dev, Error **errp) { - LoongArchIPI *s = LOONGARCH_IPI(obj); - SysBusDevice *sbd = SYS_BUS_DEVICE(obj); + LoongArchIPI *s = LOONGARCH_IPI(dev); + SysBusDevice *sbd = SYS_BUS_DEVICE(dev); + int i; + + if (s->num_cpu == 0) { + error_setg(errp, "num-cpu must be at least 1"); + return; + } - memory_region_init_io(&s->ipi_iocsr_mem, obj, &loongarch_ipi_ops, + memory_region_init_io(&s->ipi_iocsr_mem, OBJECT(dev), &loongarch_ipi_ops, s, "loongarch_ipi_iocsr", 0x48); /* loongarch_ipi_iocsr performs re-entrant IO through ipi_send */ @@ -278,10 +284,20 @@ static void loongarch_ipi_init(Object *obj) sysbus_init_mmio(sbd, &s->ipi_iocsr_mem); - memory_region_init_io(&s->ipi64_iocsr_mem, obj, &loongarch_ipi64_ops, + memory_region_init_io(&s->ipi64_iocsr_mem, OBJECT(dev), + &loongarch_ipi64_ops, s, "loongarch_ipi64_iocsr", 0x118); sysbus_init_mmio(sbd, &s->ipi64_iocsr_mem); - qdev_init_gpio_out(DEVICE(obj), &s->ipi_core.irq, 1); + + s->cpu = g_new0(IPICore, s->num_cpu); + if (s->cpu == NULL) { + error_setg(errp, "Memory allocation for ExtIOICore faile"); + return; + } + + for (i = 0; i < s->num_cpu; i++) { + qdev_init_gpio_out(dev, &s->cpu[i].irq, 1); + } } static const VMStateDescription vmstate_ipi_core = { @@ -300,27 +316,42 @@ static const VMStateDescription vmstate_ipi_core = { static const VMStateDescription vmstate_loongarch_ipi = { .name = TYPE_LOONGARCH_IPI, - .version_id = 1, - .minimum_version_id = 1, - .fields = (VMStateField[]) { - VMSTATE_STRUCT(ipi_core, LoongArchIPI, 0, vmstate_ipi_core, IPICore), + .version_id = 2, + .minimum_version_id = 2, + .fields = (const VMStateField[]) { + VMSTATE_STRUCT_VARRAY_POINTER_UINT32(cpu, LoongArchIPI, num_cpu, + vmstate_ipi_core, IPICore), VMSTATE_END_OF_LIST() } }; +static Property ipi_properties[] = { + DEFINE_PROP_UINT32("num-cpu", LoongArchIPI, num_cpu, 1), + DEFINE_PROP_END_OF_LIST(), +}; + static void loongarch_ipi_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); + dc->realize = loongarch_ipi_realize; + device_class_set_props(dc, ipi_properties); dc->vmsd = &vmstate_loongarch_ipi; } +static void loongarch_ipi_finalize(Object *obj) +{ + LoongArchIPI *s = LOONGARCH_IPI(obj); + + g_free(s->cpu); +} + static const TypeInfo loongarch_ipi_info = { .name = TYPE_LOONGARCH_IPI, 
.parent = TYPE_SYS_BUS_DEVICE, .instance_size = sizeof(LoongArchIPI), - .instance_init = loongarch_ipi_init, .class_init = loongarch_ipi_class_init, + .instance_finalize = loongarch_ipi_finalize, }; static void loongarch_ipi_register_types(void) diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index 4b7dc67a2d..13d19b6da3 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -535,9 +535,6 @@ static void loongarch_irq_init(LoongArchMachineState *lams) CPUState *cpu_state; int cpu, pin, i, start, num; - extioi = qdev_new(TYPE_LOONGARCH_EXTIOI); - sysbus_realize_and_unref(SYS_BUS_DEVICE(extioi), &error_fatal); - /* * The connection of interrupts: * +-----+ +---------+ +-------+ @@ -559,36 +556,36 @@ static void loongarch_irq_init(LoongArchMachineState *lams) * | UARTs | | Devices | | Devices | * +--------+ +---------+ +---------+ */ + + /* Create IPI device */ + ipi = qdev_new(TYPE_LOONGARCH_IPI); + qdev_prop_set_uint32(ipi, "num-cpu", ms->smp.cpus); + sysbus_realize_and_unref(SYS_BUS_DEVICE(ipi), &error_fatal); + + /* IPI iocsr memory region */ + memory_region_add_subregion(&lams->system_iocsr, SMP_IPI_MAILBOX, + sysbus_mmio_get_region(SYS_BUS_DEVICE(ipi), 0)); + memory_region_add_subregion(&lams->system_iocsr, MAIL_SEND_ADDR, + sysbus_mmio_get_region(SYS_BUS_DEVICE(ipi), 1)); + for (cpu = 0; cpu < ms->smp.cpus; cpu++) { cpu_state = qemu_get_cpu(cpu); cpudev = DEVICE(cpu_state); lacpu = LOONGARCH_CPU(cpu_state); env = &(lacpu->env); - - ipi = qdev_new(TYPE_LOONGARCH_IPI); - sysbus_realize_and_unref(SYS_BUS_DEVICE(ipi), &error_fatal); + env->address_space_iocsr = &lams->as_iocsr; /* connect ipi irq to cpu irq */ - qdev_connect_gpio_out(ipi, 0, qdev_get_gpio_in(cpudev, IRQ_IPI)); - /* IPI iocsr memory region */ - memory_region_add_subregion(&env->system_iocsr, SMP_IPI_MAILBOX, - sysbus_mmio_get_region(SYS_BUS_DEVICE(ipi), - 0)); - memory_region_add_subregion(&env->system_iocsr, MAIL_SEND_ADDR, - sysbus_mmio_get_region(SYS_BUS_DEVICE(ipi), - 1)); - /* - * extioi iocsr memory region - * only one extioi is added on loongarch virt machine - * external device interrupt can only be routed to cpu 0-3 - */ - if (cpu < EXTIOI_CPUS) - memory_region_add_subregion(&env->system_iocsr, APIC_BASE, - sysbus_mmio_get_region(SYS_BUS_DEVICE(extioi), - cpu)); + qdev_connect_gpio_out(ipi, cpu, qdev_get_gpio_in(cpudev, IRQ_IPI)); env->ipistate = ipi; } + /* Create EXTIOI device */ + extioi = qdev_new(TYPE_LOONGARCH_EXTIOI); + sysbus_realize_and_unref(SYS_BUS_DEVICE(extioi), &error_fatal); + memory_region_add_subregion(&lams->system_iocsr, APIC_BASE, + sysbus_mmio_get_region(SYS_BUS_DEVICE(extioi), 0)); + /* * connect ext irq to the cpu irq * cpu_pin[9:2] <= intc_pin[7:0] @@ -733,6 +730,43 @@ static void loongarch_direct_kernel_boot(LoongArchMachineState *lams, } } +static void loongarch_qemu_write(void *opaque, hwaddr addr, + uint64_t val, unsigned size) +{ +} + +static uint64_t loongarch_qemu_read(void *opaque, hwaddr addr, unsigned size) +{ + switch (addr) { + case VERSION_REG: + return 0x11ULL; + case FEATURE_REG: + return 1ULL << IOCSRF_MSI | 1ULL << IOCSRF_EXTIOI | + 1ULL << IOCSRF_CSRIPI; + case VENDOR_REG: + return 0x6e6f73676e6f6f4cULL; /* "Loongson" */ + case CPUNAME_REG: + return 0x303030354133ULL; /* "3A5000" */ + case MISC_FUNC_REG: + return 1ULL << IOCSRM_EXTIOI_EN; + } + return 0ULL; +} + +static const MemoryRegionOps loongarch_qemu_ops = { + .read = loongarch_qemu_read, + .write = loongarch_qemu_write, + .endianness = DEVICE_LITTLE_ENDIAN, + .valid = { + .min_access_size = 4, + 
.max_access_size = 8, + }, + .impl = { + .min_access_size = 8, + .max_access_size = 8, + }, +}; + static void loongarch_init(MachineState *machine) { LoongArchCPU *lacpu; @@ -761,8 +795,17 @@ static void loongarch_init(MachineState *machine) exit(1); } create_fdt(lams); - /* Init CPUs */ + /* Create IOCSR space */ + memory_region_init_io(&lams->system_iocsr, OBJECT(machine), NULL, + machine, "iocsr", UINT64_MAX); + address_space_init(&lams->as_iocsr, &lams->system_iocsr, "IOCSR"); + memory_region_init_io(&lams->iocsr_mem, OBJECT(machine), + &loongarch_qemu_ops, + machine, "iocsr_misc", 0x428); + memory_region_add_subregion(&lams->system_iocsr, 0, &lams->iocsr_mem); + + /* Init CPUs */ possible_cpus = mc->possible_cpu_arch_ids(machine); for (i = 0; i < possible_cpus->len; i++) { cpu = cpu_create(machine->cpu_type); diff --git a/include/hw/intc/loongarch_extioi.h b/include/hw/intc/loongarch_extioi.h index fbdef9a7b3..110e5e8873 100644 --- a/include/hw/intc/loongarch_extioi.h +++ b/include/hw/intc/loongarch_extioi.h @@ -58,7 +58,6 @@ struct LoongArchExtIOI { uint8_t sw_coremap[EXTIOI_IRQS]; qemu_irq parent_irq[EXTIOI_CPUS][LS3A_INTC_IP]; qemu_irq irq[EXTIOI_IRQS]; - MemoryRegion extioi_iocsr_mem[EXTIOI_CPUS]; MemoryRegion extioi_system_mem; }; #endif /* LOONGARCH_EXTIOI_H */ diff --git a/include/hw/intc/loongarch_ipi.h b/include/hw/intc/loongarch_ipi.h index 6c6194786e..1c1e834849 100644 --- a/include/hw/intc/loongarch_ipi.h +++ b/include/hw/intc/loongarch_ipi.h @@ -47,7 +47,8 @@ struct LoongArchIPI { SysBusDevice parent_obj; MemoryRegion ipi_iocsr_mem; MemoryRegion ipi64_iocsr_mem; - IPICore ipi_core; + uint32_t num_cpu; + IPICore *cpu; }; #endif diff --git a/include/hw/loongarch/virt.h b/include/hw/loongarch/virt.h index db0831b471..6ef9a92394 100644 --- a/include/hw/loongarch/virt.h +++ b/include/hw/loongarch/virt.h @@ -50,6 +50,9 @@ struct LoongArchMachineState { DeviceState *platform_bus_dev; PCIBus *pci_bus; PFlashCFI01 *flash; + MemoryRegion system_iocsr; + MemoryRegion iocsr_mem; + AddressSpace as_iocsr; }; #define TYPE_LOONGARCH_MACHINE MACHINE_TYPE_NAME("virt") diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c index 413414392b..6611d137a1 100644 --- a/target/loongarch/cpu.c +++ b/target/loongarch/cpu.c @@ -602,47 +602,6 @@ static void loongarch_cpu_realizefn(DeviceState *dev, Error **errp) lacc->parent_realize(dev, errp); } -#ifndef CONFIG_USER_ONLY -static void loongarch_qemu_write(void *opaque, hwaddr addr, - uint64_t val, unsigned size) -{ - qemu_log_mask(LOG_UNIMP, "[%s]: Unimplemented reg 0x%" HWADDR_PRIx "\n", - __func__, addr); -} - -static uint64_t loongarch_qemu_read(void *opaque, hwaddr addr, unsigned size) -{ - switch (addr) { - case VERSION_REG: - return 0x11ULL; - case FEATURE_REG: - return 1ULL << IOCSRF_MSI | 1ULL << IOCSRF_EXTIOI | - 1ULL << IOCSRF_CSRIPI; - case VENDOR_REG: - return 0x6e6f73676e6f6f4cULL; /* "Loongson" */ - case CPUNAME_REG: - return 0x303030354133ULL; /* "3A5000" */ - case MISC_FUNC_REG: - return 1ULL << IOCSRM_EXTIOI_EN; - } - return 0ULL; -} - -static const MemoryRegionOps loongarch_qemu_ops = { - .read = loongarch_qemu_read, - .write = loongarch_qemu_write, - .endianness = DEVICE_LITTLE_ENDIAN, - .valid = { - .min_access_size = 4, - .max_access_size = 8, - }, - .impl = { - .min_access_size = 8, - .max_access_size = 8, - }, -}; -#endif - static bool loongarch_get_lsx(Object *obj, Error **errp) { LoongArchCPU *cpu = LOONGARCH_CPU(obj); @@ -713,19 +672,12 @@ static void loongarch_cpu_init(Object *obj) { #ifndef CONFIG_USER_ONLY 
LoongArchCPU *cpu = LOONGARCH_CPU(obj); - CPULoongArchState *env = &cpu->env; qdev_init_gpio_in(DEVICE(cpu), loongarch_cpu_set_irq, N_IRQS); #ifdef CONFIG_TCG timer_init_ns(&cpu->timer, QEMU_CLOCK_VIRTUAL, &loongarch_constant_timer_cb, cpu); #endif - memory_region_init_io(&env->system_iocsr, OBJECT(cpu), NULL, - env, "iocsr", UINT64_MAX); - address_space_init(&env->address_space_iocsr, &env->system_iocsr, "IOCSR"); - memory_region_init_io(&env->iocsr_mem, OBJECT(cpu), &loongarch_qemu_ops, - NULL, "iocsr_misc", 0x428); - memory_region_add_subregion(&env->system_iocsr, 0, &env->iocsr_mem); #endif } diff --git a/target/loongarch/cpu.h b/target/loongarch/cpu.h index 8ebd6fa1a7..4aba8aba4c 100644 --- a/target/loongarch/cpu.h +++ b/target/loongarch/cpu.h @@ -355,9 +355,7 @@ typedef struct CPUArchState { #ifndef CONFIG_USER_ONLY LoongArchTLB tlb[LOONGARCH_TLB_MAX]; - AddressSpace address_space_iocsr; - MemoryRegion system_iocsr; - MemoryRegion iocsr_mem; + AddressSpace *address_space_iocsr; bool load_elf; uint64_t elf_address; uint32_t mp_state; diff --git a/target/loongarch/kvm/kvm.c b/target/loongarch/kvm/kvm.c index bd33ec2114..84bcdf5f86 100644 --- a/target/loongarch/kvm/kvm.c +++ b/target/loongarch/kvm/kvm.c @@ -733,7 +733,7 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) trace_kvm_arch_handle_exit(run->exit_reason); switch (run->exit_reason) { case KVM_EXIT_LOONGARCH_IOCSR: - address_space_rw(&env->address_space_iocsr, + address_space_rw(env->address_space_iocsr, run->iocsr_io.phys_addr, attrs, run->iocsr_io.data, diff --git a/target/loongarch/tcg/iocsr_helper.c b/target/loongarch/tcg/iocsr_helper.c index 6cd01d5f09..b6916f53d2 100644 --- a/target/loongarch/tcg/iocsr_helper.c +++ b/target/loongarch/tcg/iocsr_helper.c @@ -17,52 +17,52 @@ uint64_t helper_iocsrrd_b(CPULoongArchState *env, target_ulong r_addr) { - return address_space_ldub(&env->address_space_iocsr, r_addr, + return address_space_ldub(env->address_space_iocsr, r_addr, GET_MEMTXATTRS(env), NULL); } uint64_t helper_iocsrrd_h(CPULoongArchState *env, target_ulong r_addr) { - return address_space_lduw(&env->address_space_iocsr, r_addr, + return address_space_lduw(env->address_space_iocsr, r_addr, GET_MEMTXATTRS(env), NULL); } uint64_t helper_iocsrrd_w(CPULoongArchState *env, target_ulong r_addr) { - return address_space_ldl(&env->address_space_iocsr, r_addr, + return address_space_ldl(env->address_space_iocsr, r_addr, GET_MEMTXATTRS(env), NULL); } uint64_t helper_iocsrrd_d(CPULoongArchState *env, target_ulong r_addr) { - return address_space_ldq(&env->address_space_iocsr, r_addr, + return address_space_ldq(env->address_space_iocsr, r_addr, GET_MEMTXATTRS(env), NULL); } void helper_iocsrwr_b(CPULoongArchState *env, target_ulong w_addr, target_ulong val) { - address_space_stb(&env->address_space_iocsr, w_addr, + address_space_stb(env->address_space_iocsr, w_addr, val, GET_MEMTXATTRS(env), NULL); } void helper_iocsrwr_h(CPULoongArchState *env, target_ulong w_addr, target_ulong val) { - address_space_stw(&env->address_space_iocsr, w_addr, + address_space_stw(env->address_space_iocsr, w_addr, val, GET_MEMTXATTRS(env), NULL); } void helper_iocsrwr_w(CPULoongArchState *env, target_ulong w_addr, target_ulong val) { - address_space_stl(&env->address_space_iocsr, w_addr, + address_space_stl(env->address_space_iocsr, w_addr, val, GET_MEMTXATTRS(env), NULL); } void helper_iocsrwr_d(CPULoongArchState *env, target_ulong w_addr, target_ulong val) { - address_space_stq(&env->address_space_iocsr, w_addr, + 
address_space_stq(env->address_space_iocsr, w_addr, val, GET_MEMTXATTRS(env), NULL); } -- Gitee From 4440ab99f7f7b04ef79f6b35b8330edf7fe66002 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Fri, 15 Dec 2023 11:07:36 +0800 Subject: [PATCH 029/939] hw/intc/loongarch_extioi: Add dynamic cpu number support On LoongArch physical machine, one extioi interrupt controller only supports 4 cpus. With processor more than 4 cpus, there are multiple extioi interrupt controllers; if interrupts need to be routed to other cpus, they are forwarded from extioi node0 to other extioi nodes. On virt machine model, there is simple extioi interrupt device model. All cpus can access register of extioi interrupt controller, however interrupt can only be route to 4 vcpu for compatible with old kernel. This patch adds dynamic cpu number support about extioi interrupt. With old kernel legacy extioi model is used, however kernel can detect and choose new route method in future, so that interrupt can be routed to all vcpus. confict: ++<<<<<<< HEAD + .fields = (VMStateField[]) { ++======= + .fields = (const VMStateField[]) { + VMSTATE_UINT32_ARRAY(coreisr, ExtIOICore, EXTIOI_IRQS_GROUP_COUNT), + VMSTATE_END_OF_LIST() + } + }; + + static const VMStateDescription vmstate_loongarch_extioi = { + .name = TYPE_LOONGARCH_EXTIOI, + .version_id = 2, + .minimum_version_id = 2, + .fields = (const VMStateField[]) { ++>>>>>>> hw/intc/loongarch_extioi: Add dynamic cpu number support solve: save: hw/intc/loongarch_extioi: Add dynamic cpu number support Signed-off-by: Bibo Mao Reviewed-by: Song Gao Message-Id: <20231215100333.3933632-4-maobibo@loongson.cn> Signed-off-by: Song Gao --- hw/intc/loongarch_extioi.c | 109 +++++++++++++++++++---------- hw/loongarch/virt.c | 3 +- include/hw/intc/loongarch_extioi.h | 11 ++- 3 files changed, 82 insertions(+), 41 deletions(-) diff --git a/hw/intc/loongarch_extioi.c b/hw/intc/loongarch_extioi.c index 77b4776958..28802bf3ef 100644 --- a/hw/intc/loongarch_extioi.c +++ b/hw/intc/loongarch_extioi.c @@ -8,6 +8,7 @@ #include "qemu/osdep.h" #include "qemu/module.h" #include "qemu/log.h" +#include "qapi/error.h" #include "hw/irq.h" #include "hw/sysbus.h" #include "hw/loongarch/virt.h" @@ -32,23 +33,23 @@ static void extioi_update_irq(LoongArchExtIOI *s, int irq, int level) if (((s->enable[irq_index]) & irq_mask) == 0) { return; } - s->coreisr[cpu][irq_index] |= irq_mask; - found = find_first_bit(s->sw_isr[cpu][ipnum], EXTIOI_IRQS); - set_bit(irq, s->sw_isr[cpu][ipnum]); + s->cpu[cpu].coreisr[irq_index] |= irq_mask; + found = find_first_bit(s->cpu[cpu].sw_isr[ipnum], EXTIOI_IRQS); + set_bit(irq, s->cpu[cpu].sw_isr[ipnum]); if (found < EXTIOI_IRQS) { /* other irq is handling, need not update parent irq level */ return; } } else { - s->coreisr[cpu][irq_index] &= ~irq_mask; - clear_bit(irq, s->sw_isr[cpu][ipnum]); - found = find_first_bit(s->sw_isr[cpu][ipnum], EXTIOI_IRQS); + s->cpu[cpu].coreisr[irq_index] &= ~irq_mask; + clear_bit(irq, s->cpu[cpu].sw_isr[ipnum]); + found = find_first_bit(s->cpu[cpu].sw_isr[ipnum], EXTIOI_IRQS); if (found < EXTIOI_IRQS) { /* other irq is handling, need not update parent irq level */ return; } } - qemu_set_irq(s->parent_irq[cpu][ipnum], level); + qemu_set_irq(s->cpu[cpu].parent_irq[ipnum], level); } static void extioi_setirq(void *opaque, int irq, int level) @@ -96,7 +97,7 @@ static MemTxResult extioi_readw(void *opaque, hwaddr addr, uint64_t *data, index = (offset - EXTIOI_COREISR_START) >> 2; /* using attrs to get current cpu index */ cpu = attrs.requester_id; - *data = 
s->coreisr[cpu][index]; + *data = s->cpu[cpu].coreisr[index]; break; case EXTIOI_COREMAP_START ... EXTIOI_COREMAP_END - 1: index = (offset - EXTIOI_COREMAP_START) >> 2; @@ -189,8 +190,8 @@ static MemTxResult extioi_writew(void *opaque, hwaddr addr, index = (offset - EXTIOI_COREISR_START) >> 2; /* using attrs to get current cpu index */ cpu = attrs.requester_id; - old_data = s->coreisr[cpu][index]; - s->coreisr[cpu][index] = old_data & ~val; + old_data = s->cpu[cpu].coreisr[index]; + s->cpu[cpu].coreisr[index] = old_data & ~val; /* write 1 to clear interrupt */ old_data &= val; irq = ctz32(old_data); @@ -248,14 +249,61 @@ static const MemoryRegionOps extioi_ops = { .endianness = DEVICE_LITTLE_ENDIAN, }; -static const VMStateDescription vmstate_loongarch_extioi = { - .name = TYPE_LOONGARCH_EXTIOI, +static void loongarch_extioi_realize(DeviceState *dev, Error **errp) +{ + LoongArchExtIOI *s = LOONGARCH_EXTIOI(dev); + SysBusDevice *sbd = SYS_BUS_DEVICE(dev); + int i, pin; + + if (s->num_cpu == 0) { + error_setg(errp, "num-cpu must be at least 1"); + return; + } + + for (i = 0; i < EXTIOI_IRQS; i++) { + sysbus_init_irq(sbd, &s->irq[i]); + } + + qdev_init_gpio_in(dev, extioi_setirq, EXTIOI_IRQS); + memory_region_init_io(&s->extioi_system_mem, OBJECT(s), &extioi_ops, + s, "extioi_system_mem", 0x900); + sysbus_init_mmio(sbd, &s->extioi_system_mem); + s->cpu = g_new0(ExtIOICore, s->num_cpu); + if (s->cpu == NULL) { + error_setg(errp, "Memory allocation for ExtIOICore faile"); + return; + } + + for (i = 0; i < s->num_cpu; i++) { + for (pin = 0; pin < LS3A_INTC_IP; pin++) { + qdev_init_gpio_out(dev, &s->cpu[i].parent_irq[pin], 1); + } + } +} + +static void loongarch_extioi_finalize(Object *obj) +{ + LoongArchExtIOI *s = LOONGARCH_EXTIOI(obj); + + g_free(s->cpu); +} + +static const VMStateDescription vmstate_extioi_core = { + .name = "extioi-core", .version_id = 1, .minimum_version_id = 1, - .fields = (VMStateField[]) { + .fields = (const VMStateField[]) { + VMSTATE_UINT32_ARRAY(coreisr, ExtIOICore, EXTIOI_IRQS_GROUP_COUNT), + VMSTATE_END_OF_LIST() + } +}; + +static const VMStateDescription vmstate_loongarch_extioi = { + .name = TYPE_LOONGARCH_EXTIOI, + .version_id = 2, + .minimum_version_id = 2, + .fields = (const VMStateField[]) { VMSTATE_UINT32_ARRAY(bounce, LoongArchExtIOI, EXTIOI_IRQS_GROUP_COUNT), - VMSTATE_UINT32_2DARRAY(coreisr, LoongArchExtIOI, EXTIOI_CPUS, - EXTIOI_IRQS_GROUP_COUNT), VMSTATE_UINT32_ARRAY(nodetype, LoongArchExtIOI, EXTIOI_IRQS_NODETYPE_COUNT / 2), VMSTATE_UINT32_ARRAY(enable, LoongArchExtIOI, EXTIOI_IRQS / 32), @@ -265,45 +313,32 @@ static const VMStateDescription vmstate_loongarch_extioi = { VMSTATE_UINT8_ARRAY(sw_ipmap, LoongArchExtIOI, EXTIOI_IRQS_IPMAP_SIZE), VMSTATE_UINT8_ARRAY(sw_coremap, LoongArchExtIOI, EXTIOI_IRQS), + VMSTATE_STRUCT_VARRAY_POINTER_UINT32(cpu, LoongArchExtIOI, num_cpu, + vmstate_extioi_core, ExtIOICore), VMSTATE_END_OF_LIST() } }; -static void loongarch_extioi_instance_init(Object *obj) -{ - SysBusDevice *dev = SYS_BUS_DEVICE(obj); - LoongArchExtIOI *s = LOONGARCH_EXTIOI(obj); - int i, cpu, pin; - - for (i = 0; i < EXTIOI_IRQS; i++) { - sysbus_init_irq(dev, &s->irq[i]); - } - - qdev_init_gpio_in(DEVICE(obj), extioi_setirq, EXTIOI_IRQS); - - for (cpu = 0; cpu < EXTIOI_CPUS; cpu++) { - for (pin = 0; pin < LS3A_INTC_IP; pin++) { - qdev_init_gpio_out(DEVICE(obj), &s->parent_irq[cpu][pin], 1); - } - } - memory_region_init_io(&s->extioi_system_mem, OBJECT(s), &extioi_ops, - s, "extioi_system_mem", 0x900); - sysbus_init_mmio(dev, &s->extioi_system_mem); -} 
+static Property extioi_properties[] = { + DEFINE_PROP_UINT32("num-cpu", LoongArchExtIOI, num_cpu, 1), + DEFINE_PROP_END_OF_LIST(), +}; static void loongarch_extioi_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); + dc->realize = loongarch_extioi_realize; + device_class_set_props(dc, extioi_properties); dc->vmsd = &vmstate_loongarch_extioi; } static const TypeInfo loongarch_extioi_info = { .name = TYPE_LOONGARCH_EXTIOI, .parent = TYPE_SYS_BUS_DEVICE, - .instance_init = loongarch_extioi_instance_init, .instance_size = sizeof(struct LoongArchExtIOI), .class_init = loongarch_extioi_class_init, + .instance_finalize = loongarch_extioi_finalize, }; static void loongarch_extioi_register_types(void) diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index 13d19b6da3..c9a680e61a 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -582,6 +582,7 @@ static void loongarch_irq_init(LoongArchMachineState *lams) /* Create EXTIOI device */ extioi = qdev_new(TYPE_LOONGARCH_EXTIOI); + qdev_prop_set_uint32(extioi, "num-cpu", ms->smp.cpus); sysbus_realize_and_unref(SYS_BUS_DEVICE(extioi), &error_fatal); memory_region_add_subregion(&lams->system_iocsr, APIC_BASE, sysbus_mmio_get_region(SYS_BUS_DEVICE(extioi), 0)); @@ -590,7 +591,7 @@ static void loongarch_irq_init(LoongArchMachineState *lams) * connect ext irq to the cpu irq * cpu_pin[9:2] <= intc_pin[7:0] */ - for (cpu = 0; cpu < MIN(ms->smp.cpus, EXTIOI_CPUS); cpu++) { + for (cpu = 0; cpu < ms->smp.cpus; cpu++) { cpudev = DEVICE(qemu_get_cpu(cpu)); for (pin = 0; pin < LS3A_INTC_IP; pin++) { qdev_connect_gpio_out(extioi, (cpu * 8 + pin), diff --git a/include/hw/intc/loongarch_extioi.h b/include/hw/intc/loongarch_extioi.h index 110e5e8873..a0a46b888c 100644 --- a/include/hw/intc/loongarch_extioi.h +++ b/include/hw/intc/loongarch_extioi.h @@ -40,24 +40,29 @@ #define EXTIOI_COREMAP_START (0xC00 - APIC_OFFSET) #define EXTIOI_COREMAP_END (0xD00 - APIC_OFFSET) +typedef struct ExtIOICore { + uint32_t coreisr[EXTIOI_IRQS_GROUP_COUNT]; + DECLARE_BITMAP(sw_isr[LS3A_INTC_IP], EXTIOI_IRQS); + qemu_irq parent_irq[LS3A_INTC_IP]; +} ExtIOICore; + #define TYPE_LOONGARCH_EXTIOI "loongarch.extioi" OBJECT_DECLARE_SIMPLE_TYPE(LoongArchExtIOI, LOONGARCH_EXTIOI) struct LoongArchExtIOI { SysBusDevice parent_obj; + uint32_t num_cpu; /* hardware state */ uint32_t nodetype[EXTIOI_IRQS_NODETYPE_COUNT / 2]; uint32_t bounce[EXTIOI_IRQS_GROUP_COUNT]; uint32_t isr[EXTIOI_IRQS / 32]; - uint32_t coreisr[EXTIOI_CPUS][EXTIOI_IRQS_GROUP_COUNT]; uint32_t enable[EXTIOI_IRQS / 32]; uint32_t ipmap[EXTIOI_IRQS_IPMAP_SIZE / 4]; uint32_t coremap[EXTIOI_IRQS / 4]; uint32_t sw_pending[EXTIOI_IRQS / 32]; - DECLARE_BITMAP(sw_isr[EXTIOI_CPUS][LS3A_INTC_IP], EXTIOI_IRQS); uint8_t sw_ipmap[EXTIOI_IRQS_IPMAP_SIZE]; uint8_t sw_coremap[EXTIOI_IRQS]; - qemu_irq parent_irq[EXTIOI_CPUS][LS3A_INTC_IP]; qemu_irq irq[EXTIOI_IRQS]; + ExtIOICore *cpu; MemoryRegion extioi_system_mem; }; #endif /* LOONGARCH_EXTIOI_H */ -- Gitee From db8c355d923c218c5ca373c4acd5d13493152889 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Fri, 15 Dec 2023 17:42:58 +0800 Subject: [PATCH 030/939] hw/intc/loongarch_extioi: Add vmstate post_load support There are elements sw_ipmap and sw_coremap, which is usd to speed up irq injection flow. They are saved and restored in vmstate during migration, indeed they can calculated from hw registers. Here post_load is added for get sw_ipmap and sw_coremap from extioi hw state. 
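In other words, sw_ipmap and sw_coremap are derived caches: they can always be recomputed from the architected ipmap/coremap registers, so they are dropped from the migration stream and rebuilt on the destination. A standalone sketch of the rebuild step for the ipmap case (the fallback-to-pin-0 rule follows the extioi_update_sw_ipmap() hunk below; the word count and everything else here are illustrative):

#include <stdint.h>
#include <stdio.h>

#define IPMAP_WORDS 2

/* Rebuild the parent-pin table from the raw ipmap registers: each byte
 * selects a pin via its lowest set bit; zero or a pin >= 4 falls back to 0. */
static void rebuild_sw_ipmap(const uint32_t *ipmap, uint8_t *sw_ipmap)
{
    for (int i = 0; i < IPMAP_WORDS; i++) {
        uint32_t val = ipmap[i];
        for (int j = 0; j < 4; j++) {
            uint8_t byte = val & 0xff;
            uint8_t pin = byte ? (uint8_t)__builtin_ctz(byte) : 0;
            sw_ipmap[i * 4 + j] = (pin >= 4) ? 0 : pin;
            val >>= 8;
        }
    }
}

int main(void)
{
    uint32_t ipmap[IPMAP_WORDS] = { 0x08040201, 0 };
    uint8_t sw_ipmap[IPMAP_WORDS * 4];

    rebuild_sw_ipmap(ipmap, sw_ipmap);
    for (int i = 0; i < IPMAP_WORDS * 4; i++) {
        printf("%u ", sw_ipmap[i]);      /* 0 1 2 3 0 0 0 0 */
    }
    printf("\n");
    return 0;
}
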
Signed-off-by: Bibo Mao Reviewed-by: Song Gao Message-Id: <20231215100333.3933632-5-maobibo@loongson.cn> Signed-off-by: Song Gao --- hw/intc/loongarch_extioi.c | 120 +++++++++++++++++++++++-------------- 1 file changed, 76 insertions(+), 44 deletions(-) diff --git a/hw/intc/loongarch_extioi.c b/hw/intc/loongarch_extioi.c index 28802bf3ef..bdfa3b481e 100644 --- a/hw/intc/loongarch_extioi.c +++ b/hw/intc/loongarch_extioi.c @@ -130,12 +130,66 @@ static inline void extioi_enable_irq(LoongArchExtIOI *s, int index,\ } } +static inline void extioi_update_sw_coremap(LoongArchExtIOI *s, int irq, + uint64_t val, bool notify) +{ + int i, cpu; + + /* + * loongarch only support little endian, + * so we paresd the value with little endian. + */ + val = cpu_to_le64(val); + + for (i = 0; i < 4; i++) { + cpu = val & 0xff; + cpu = ctz32(cpu); + cpu = (cpu >= 4) ? 0 : cpu; + val = val >> 8; + + if (s->sw_coremap[irq + i] == cpu) { + continue; + } + + if (notify && test_bit(irq, (unsigned long *)s->isr)) { + /* + * lower irq at old cpu and raise irq at new cpu + */ + extioi_update_irq(s, irq + i, 0); + s->sw_coremap[irq + i] = cpu; + extioi_update_irq(s, irq + i, 1); + } else { + s->sw_coremap[irq + i] = cpu; + } + } +} + +static inline void extioi_update_sw_ipmap(LoongArchExtIOI *s, int index, + uint64_t val) +{ + int i; + uint8_t ipnum; + + /* + * loongarch only support little endian, + * so we paresd the value with little endian. + */ + val = cpu_to_le64(val); + for (i = 0; i < 4; i++) { + ipnum = val & 0xff; + ipnum = ctz32(ipnum); + ipnum = (ipnum >= 4) ? 0 : ipnum; + s->sw_ipmap[index * 4 + i] = ipnum; + val = val >> 8; + } +} + static MemTxResult extioi_writew(void *opaque, hwaddr addr, uint64_t val, unsigned size, MemTxAttrs attrs) { LoongArchExtIOI *s = LOONGARCH_EXTIOI(opaque); - int i, cpu, index, old_data, irq; + int cpu, index, old_data, irq; uint32_t offset; trace_loongarch_extioi_writew(addr, val); @@ -153,20 +207,7 @@ static MemTxResult extioi_writew(void *opaque, hwaddr addr, */ index = (offset - EXTIOI_IPMAP_START) >> 2; s->ipmap[index] = val; - /* - * loongarch only support little endian, - * so we paresd the value with little endian. - */ - val = cpu_to_le64(val); - for (i = 0; i < 4; i++) { - uint8_t ipnum; - ipnum = val & 0xff; - ipnum = ctz32(ipnum); - ipnum = (ipnum >= 4) ? 0 : ipnum; - s->sw_ipmap[index * 4 + i] = ipnum; - val = val >> 8; - } - + extioi_update_sw_ipmap(s, index, val); break; case EXTIOI_ENABLE_START ... EXTIOI_ENABLE_END - 1: index = (offset - EXTIOI_ENABLE_START) >> 2; @@ -205,33 +246,8 @@ static MemTxResult extioi_writew(void *opaque, hwaddr addr, irq = offset - EXTIOI_COREMAP_START; index = irq / 4; s->coremap[index] = val; - /* - * loongarch only support little endian, - * so we paresd the value with little endian. - */ - val = cpu_to_le64(val); - - for (i = 0; i < 4; i++) { - cpu = val & 0xff; - cpu = ctz32(cpu); - cpu = (cpu >= 4) ? 
0 : cpu; - val = val >> 8; - - if (s->sw_coremap[irq + i] == cpu) { - continue; - } - - if (test_bit(irq, (unsigned long *)s->isr)) { - /* - * lower irq at old cpu and raise irq at new cpu - */ - extioi_update_irq(s, irq + i, 0); - s->sw_coremap[irq + i] = cpu; - extioi_update_irq(s, irq + i, 1); - } else { - s->sw_coremap[irq + i] = cpu; - } - } + + extioi_update_sw_coremap(s, irq, val, true); break; default: break; @@ -288,6 +304,23 @@ static void loongarch_extioi_finalize(Object *obj) g_free(s->cpu); } +static int vmstate_extioi_post_load(void *opaque, int version_id) +{ + LoongArchExtIOI *s = LOONGARCH_EXTIOI(opaque); + int i, start_irq; + + for (i = 0; i < (EXTIOI_IRQS / 4); i++) { + start_irq = i * 4; + extioi_update_sw_coremap(s, start_irq, s->coremap[i], false); + } + + for (i = 0; i < (EXTIOI_IRQS_IPMAP_SIZE / 4); i++) { + extioi_update_sw_ipmap(s, i, s->ipmap[i]); + } + + return 0; +} + static const VMStateDescription vmstate_extioi_core = { .name = "extioi-core", .version_id = 1, @@ -302,6 +335,7 @@ static const VMStateDescription vmstate_loongarch_extioi = { .name = TYPE_LOONGARCH_EXTIOI, .version_id = 2, .minimum_version_id = 2, + .post_load = vmstate_extioi_post_load, .fields = (const VMStateField[]) { VMSTATE_UINT32_ARRAY(bounce, LoongArchExtIOI, EXTIOI_IRQS_GROUP_COUNT), VMSTATE_UINT32_ARRAY(nodetype, LoongArchExtIOI, @@ -310,8 +344,6 @@ static const VMStateDescription vmstate_loongarch_extioi = { VMSTATE_UINT32_ARRAY(isr, LoongArchExtIOI, EXTIOI_IRQS / 32), VMSTATE_UINT32_ARRAY(ipmap, LoongArchExtIOI, EXTIOI_IRQS_IPMAP_SIZE / 4), VMSTATE_UINT32_ARRAY(coremap, LoongArchExtIOI, EXTIOI_IRQS / 4), - VMSTATE_UINT8_ARRAY(sw_ipmap, LoongArchExtIOI, EXTIOI_IRQS_IPMAP_SIZE), - VMSTATE_UINT8_ARRAY(sw_coremap, LoongArchExtIOI, EXTIOI_IRQS), VMSTATE_STRUCT_VARRAY_POINTER_UINT32(cpu, LoongArchExtIOI, num_cpu, vmstate_extioi_core, ExtIOICore), -- Gitee From b21a705562867cc9dcbf0012ffa200caad8458ba Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Tue, 16 Jan 2024 09:39:52 +0800 Subject: [PATCH 031/939] configure: Add linux header compile support for LoongArch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When compiling qemu with system KVM mode for LoongArch, header files in directory linux-headers/asm-loongarch should be used firstly. Otherwise it fails to find kvm.h on system with old glibc, since latest kernel header files are not installed. This patch adds linux_arch definition for LoongArch system so that header files in directory linux-headers/asm-loongarch can be included. Fixes: 714b03c125 ("target/loongarch: Add loongarch kvm into meson build") Signed-off-by: Bibo Mao Reviewed-by: Philippe Mathieu-Daudé Message-ID: <20240116013952.264474-1-maobibo@loongson.cn> Signed-off-by: Philippe Mathieu-Daudé --- configure | 1 + 1 file changed, 1 insertion(+) diff --git a/configure b/configure index bdda912f36..6036de83a4 100755 --- a/configure +++ b/configure @@ -445,6 +445,7 @@ case "$cpu" in loongarch*) cpu=loongarch64 host_arch=loongarch64 + linux_arch=loongarch ;; mips64*) -- Gitee From d271f623205c2984a30cfb12e160e219b2bbe974 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Mon, 15 Jan 2024 16:51:21 +0800 Subject: [PATCH 032/939] target/loongarch: Set cpuid CSR register only once with kvm mode CSR cpuid register is used for routing irq to different vcpus, its value is kept unchanged since poweron. So it is not necessary to set CSR cpuid register after system resets, and it is only set at vm creation stage. 
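The mechanism behind "only once" is the level argument of kvm_arch_put_registers(): runtime syncs, reset and full-state pushes arrive with increasing levels, and full state is requested for the initial synchronization (and, with it, after loading an incoming migration stream). Gating the write on level >= KVM_PUT_FULL_STATE, as the hunk below does, therefore confines it to those points. A tiny standalone model of the pattern (the enum names and values here are illustrative; only their ordering matters):

#include <stdio.h>

enum put_level {                 /* mirrors the ordering of QEMU's put levels */
    PUT_RUNTIME_STATE = 1,
    PUT_RESET_STATE   = 2,
    PUT_FULL_STATE    = 3,
};

static int cpuid_writes;         /* how often the constant register is pushed */

static void put_csrs(enum put_level level)
{
    /* mutable CSRs would be written unconditionally here */
    if (level >= PUT_FULL_STATE) {
        cpuid_writes++;          /* constant, poweron-fixed register */
    }
}

int main(void)
{
    put_csrs(PUT_FULL_STATE);    /* vcpu creation / state load */
    put_csrs(PUT_RUNTIME_STATE); /* periodic sync: skipped */
    put_csrs(PUT_RESET_STATE);   /* system reset: skipped */
    printf("cpuid written %d time(s)\n", cpuid_writes);   /* 1 */
    return 0;
}
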
Signed-off-by: Bibo Mao Reviewed-by: Song Gao Message-Id: <20240115085121.180524-1-maobibo@loongson.cn> Signed-off-by: Song Gao --- target/loongarch/kvm/kvm.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/target/loongarch/kvm/kvm.c b/target/loongarch/kvm/kvm.c index 84bcdf5f86..2230f029d0 100644 --- a/target/loongarch/kvm/kvm.c +++ b/target/loongarch/kvm/kvm.c @@ -250,7 +250,7 @@ static int kvm_loongarch_get_csr(CPUState *cs) return ret; } -static int kvm_loongarch_put_csr(CPUState *cs) +static int kvm_loongarch_put_csr(CPUState *cs, int level) { int ret = 0; LoongArchCPU *cpu = LOONGARCH_CPU(cs); @@ -322,8 +322,11 @@ static int kvm_loongarch_put_csr(CPUState *cs) ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_RVACFG), &env->CSR_RVACFG); - ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_CPUID), + /* CPUID is constant after poweron, it should be set only once */ + if (level >= KVM_PUT_FULL_STATE) { + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_CPUID), &env->CSR_CPUID); + } ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PRCFG1), &env->CSR_PRCFG1); @@ -598,7 +601,7 @@ int kvm_arch_put_registers(CPUState *cs, int level) return ret; } - ret = kvm_loongarch_put_csr(cs); + ret = kvm_loongarch_put_csr(cs, level); if (ret) { return ret; } -- Gitee From 6e503b590e42ad7c522cf937b83e1f8f715dbd1a Mon Sep 17 00:00:00 2001 From: Song Gao Date: Mon, 22 Jan 2024 17:02:06 +0800 Subject: [PATCH 033/939] target/loongarch/kvm: Enable LSX/LASX extension The kernel had already support LSX and LASX [1], but QEMU is disable LSX/LASX for kvm. This patch adds kvm_check_cpucfg2() to check CPUCFG2. [1]: https://lore.kernel.org/all/CABgObfZHRf7E_7Jk4uPRmSyxTy3EiuuYwHC35jQncNL9s-zTDA@mail.gmail.com/ Signed-off-by: Song Gao Reviewed-by: Bibo Mao Message-Id: <20240122090206.1083584-1-gaosong@loongson.cn> --- linux-headers/asm-loongarch/kvm.h | 1 + target/loongarch/kvm/kvm.c | 45 ++++++++++++++++++++++++++----- 2 files changed, 39 insertions(+), 7 deletions(-) diff --git a/linux-headers/asm-loongarch/kvm.h b/linux-headers/asm-loongarch/kvm.h index c6ad2ee610..923d0bd382 100644 --- a/linux-headers/asm-loongarch/kvm.h +++ b/linux-headers/asm-loongarch/kvm.h @@ -79,6 +79,7 @@ struct kvm_fpu { #define LOONGARCH_REG_64(TYPE, REG) (TYPE | KVM_REG_SIZE_U64 | (REG << LOONGARCH_REG_SHIFT)) #define KVM_IOC_CSRID(REG) LOONGARCH_REG_64(KVM_REG_LOONGARCH_CSR, REG) #define KVM_IOC_CPUCFG(REG) LOONGARCH_REG_64(KVM_REG_LOONGARCH_CPUCFG, REG) +#define KVM_LOONGARCH_VCPU_CPUCFG 0 struct kvm_debug_exit_arch { }; diff --git a/target/loongarch/kvm/kvm.c b/target/loongarch/kvm/kvm.c index 2230f029d0..c19978a970 100644 --- a/target/loongarch/kvm/kvm.c +++ b/target/loongarch/kvm/kvm.c @@ -540,6 +540,38 @@ static int kvm_loongarch_get_cpucfg(CPUState *cs) return ret; } +static int kvm_check_cpucfg2(CPUState *cs) +{ + int ret; + uint64_t val; + struct kvm_device_attr attr = { + .group = KVM_LOONGARCH_VCPU_CPUCFG, + .attr = 2, + .addr = (uint64_t)&val, + }; + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + CPULoongArchState *env = &cpu->env; + + ret = kvm_vcpu_ioctl(cs, KVM_HAS_DEVICE_ATTR, &attr); + + if (!ret) { + kvm_vcpu_ioctl(cs, KVM_GET_DEVICE_ATTR, &attr); + env->cpucfg[2] &= val; + + if (FIELD_EX32(env->cpucfg[2], CPUCFG2, FP)) { + /* The FP minimal version is 1. */ + env->cpucfg[2] = FIELD_DP32(env->cpucfg[2], CPUCFG2, FP_VER, 1); + } + + if (FIELD_EX32(env->cpucfg[2], CPUCFG2, LLFTP)) { + /* The LLFTP minimal version is 1. 
*/ + env->cpucfg[2] = FIELD_DP32(env->cpucfg[2], CPUCFG2, LLFTP_VER, 1); + } + } + + return ret; +} + static int kvm_loongarch_put_cpucfg(CPUState *cs) { int i, ret = 0; @@ -548,14 +580,13 @@ static int kvm_loongarch_put_cpucfg(CPUState *cs) uint64_t val; for (i = 0; i < 21; i++) { + if (i == 2) { + ret = kvm_check_cpucfg2(cs); + if (ret) { + return ret; + } + } val = env->cpucfg[i]; - /* LSX and LASX and LBT are not supported in kvm now */ - if (i == 2) { - val &= ~(BIT(R_CPUCFG2_LSX_SHIFT) | BIT(R_CPUCFG2_LASX_SHIFT)); - val &= ~(BIT(R_CPUCFG2_LBT_X86_SHIFT) | - BIT(R_CPUCFG2_LBT_ARM_SHIFT) | - BIT(R_CPUCFG2_LBT_MIPS_SHIFT)); - } ret = kvm_set_one_reg(cs, KVM_IOC_CPUCFG(i), &val); if (ret < 0) { trace_kvm_failed_put_cpucfg(strerror(errno)); -- Gitee From d2381abc2c78de68e765a29a55282707541e315d Mon Sep 17 00:00:00 2001 From: Song Gao Date: Thu, 25 Jan 2024 14:14:01 +0800 Subject: [PATCH 034/939] target/loongarch: Fix qtest test-hmp error when KVM-only build The cc->sysemu_ops->get_phys_page_debug() is NULL when KVM-only build. this patch fixes it. Signed-off-by: Song Gao Tested-by: Bibo Mao Message-Id: <20240125061401.52526-1-gaosong@loongson.cn> --- target/loongarch/cpu.c | 2 - target/loongarch/cpu_helper.c | 231 ++++++++++++++++++++++++++++++ target/loongarch/internals.h | 20 ++- target/loongarch/meson.build | 1 + target/loongarch/tcg/tlb_helper.c | 230 ----------------------------- 5 files changed, 250 insertions(+), 234 deletions(-) create mode 100644 target/loongarch/cpu_helper.c diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c index 6611d137a1..b098b1c6f3 100644 --- a/target/loongarch/cpu.c +++ b/target/loongarch/cpu.c @@ -771,9 +771,7 @@ static struct TCGCPUOps loongarch_tcg_ops = { #include "hw/core/sysemu-cpu-ops.h" static const struct SysemuCPUOps loongarch_sysemu_ops = { -#ifdef CONFIG_TCG .get_phys_page_debug = loongarch_cpu_get_phys_page_debug, -#endif }; static int64_t loongarch_cpu_get_arch_id(CPUState *cs) diff --git a/target/loongarch/cpu_helper.c b/target/loongarch/cpu_helper.c new file mode 100644 index 0000000000..f68d63f466 --- /dev/null +++ b/target/loongarch/cpu_helper.c @@ -0,0 +1,231 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * LoongArch CPU helpers for qemu + * + * Copyright (c) 2024 Loongson Technology Corporation Limited + * + */ + +#include "qemu/osdep.h" +#include "cpu.h" +#include "internals.h" +#include "cpu-csr.h" + +static int loongarch_map_tlb_entry(CPULoongArchState *env, hwaddr *physical, + int *prot, target_ulong address, + int access_type, int index, int mmu_idx) +{ + LoongArchTLB *tlb = &env->tlb[index]; + uint64_t plv = mmu_idx; + uint64_t tlb_entry, tlb_ppn; + uint8_t tlb_ps, n, tlb_v, tlb_d, tlb_plv, tlb_nx, tlb_nr, tlb_rplv; + + if (index >= LOONGARCH_STLB) { + tlb_ps = FIELD_EX64(tlb->tlb_misc, TLB_MISC, PS); + } else { + tlb_ps = FIELD_EX64(env->CSR_STLBPS, CSR_STLBPS, PS); + } + n = (address >> tlb_ps) & 0x1;/* Odd or even */ + + tlb_entry = n ? 
tlb->tlb_entry1 : tlb->tlb_entry0; + tlb_v = FIELD_EX64(tlb_entry, TLBENTRY, V); + tlb_d = FIELD_EX64(tlb_entry, TLBENTRY, D); + tlb_plv = FIELD_EX64(tlb_entry, TLBENTRY, PLV); + if (is_la64(env)) { + tlb_ppn = FIELD_EX64(tlb_entry, TLBENTRY_64, PPN); + tlb_nx = FIELD_EX64(tlb_entry, TLBENTRY_64, NX); + tlb_nr = FIELD_EX64(tlb_entry, TLBENTRY_64, NR); + tlb_rplv = FIELD_EX64(tlb_entry, TLBENTRY_64, RPLV); + } else { + tlb_ppn = FIELD_EX64(tlb_entry, TLBENTRY_32, PPN); + tlb_nx = 0; + tlb_nr = 0; + tlb_rplv = 0; + } + + /* Remove sw bit between bit12 -- bit PS*/ + tlb_ppn = tlb_ppn & ~(((0x1UL << (tlb_ps - 12)) -1)); + + /* Check access rights */ + if (!tlb_v) { + return TLBRET_INVALID; + } + + if (access_type == MMU_INST_FETCH && tlb_nx) { + return TLBRET_XI; + } + + if (access_type == MMU_DATA_LOAD && tlb_nr) { + return TLBRET_RI; + } + + if (((tlb_rplv == 0) && (plv > tlb_plv)) || + ((tlb_rplv == 1) && (plv != tlb_plv))) { + return TLBRET_PE; + } + + if ((access_type == MMU_DATA_STORE) && !tlb_d) { + return TLBRET_DIRTY; + } + + *physical = (tlb_ppn << R_TLBENTRY_64_PPN_SHIFT) | + (address & MAKE_64BIT_MASK(0, tlb_ps)); + *prot = PAGE_READ; + if (tlb_d) { + *prot |= PAGE_WRITE; + } + if (!tlb_nx) { + *prot |= PAGE_EXEC; + } + return TLBRET_MATCH; +} + +/* + * One tlb entry holds an adjacent odd/even pair, the vpn is the + * content of the virtual page number divided by 2. So the + * compare vpn is bit[47:15] for 16KiB page. while the vppn + * field in tlb entry contains bit[47:13], so need adjust. + * virt_vpn = vaddr[47:13] + */ +bool loongarch_tlb_search(CPULoongArchState *env, target_ulong vaddr, + int *index) +{ + LoongArchTLB *tlb; + uint16_t csr_asid, tlb_asid, stlb_idx; + uint8_t tlb_e, tlb_ps, tlb_g, stlb_ps; + int i, compare_shift; + uint64_t vpn, tlb_vppn; + + csr_asid = FIELD_EX64(env->CSR_ASID, CSR_ASID, ASID); + stlb_ps = FIELD_EX64(env->CSR_STLBPS, CSR_STLBPS, PS); + vpn = (vaddr & TARGET_VIRT_MASK) >> (stlb_ps + 1); + stlb_idx = vpn & 0xff; /* VA[25:15] <==> TLBIDX.index for 16KiB Page */ + compare_shift = stlb_ps + 1 - R_TLB_MISC_VPPN_SHIFT; + + /* Search STLB */ + for (i = 0; i < 8; ++i) { + tlb = &env->tlb[i * 256 + stlb_idx]; + tlb_e = FIELD_EX64(tlb->tlb_misc, TLB_MISC, E); + if (tlb_e) { + tlb_vppn = FIELD_EX64(tlb->tlb_misc, TLB_MISC, VPPN); + tlb_asid = FIELD_EX64(tlb->tlb_misc, TLB_MISC, ASID); + tlb_g = FIELD_EX64(tlb->tlb_entry0, TLBENTRY, G); + + if ((tlb_g == 1 || tlb_asid == csr_asid) && + (vpn == (tlb_vppn >> compare_shift))) { + *index = i * 256 + stlb_idx; + return true; + } + } + } + + /* Search MTLB */ + for (i = LOONGARCH_STLB; i < LOONGARCH_TLB_MAX; ++i) { + tlb = &env->tlb[i]; + tlb_e = FIELD_EX64(tlb->tlb_misc, TLB_MISC, E); + if (tlb_e) { + tlb_vppn = FIELD_EX64(tlb->tlb_misc, TLB_MISC, VPPN); + tlb_ps = FIELD_EX64(tlb->tlb_misc, TLB_MISC, PS); + tlb_asid = FIELD_EX64(tlb->tlb_misc, TLB_MISC, ASID); + tlb_g = FIELD_EX64(tlb->tlb_entry0, TLBENTRY, G); + compare_shift = tlb_ps + 1 - R_TLB_MISC_VPPN_SHIFT; + vpn = (vaddr & TARGET_VIRT_MASK) >> (tlb_ps + 1); + if ((tlb_g == 1 || tlb_asid == csr_asid) && + (vpn == (tlb_vppn >> compare_shift))) { + *index = i; + return true; + } + } + } + return false; +} + +static int loongarch_map_address(CPULoongArchState *env, hwaddr *physical, + int *prot, target_ulong address, + MMUAccessType access_type, int mmu_idx) +{ + int index, match; + + match = loongarch_tlb_search(env, address, &index); + if (match) { + return loongarch_map_tlb_entry(env, physical, prot, + address, access_type, index, mmu_idx); + } + + 
return TLBRET_NOMATCH; +} + +static hwaddr dmw_va2pa(CPULoongArchState *env, target_ulong va, + target_ulong dmw) +{ + if (is_la64(env)) { + return va & TARGET_VIRT_MASK; + } else { + uint32_t pseg = FIELD_EX32(dmw, CSR_DMW_32, PSEG); + return (va & MAKE_64BIT_MASK(0, R_CSR_DMW_32_VSEG_SHIFT)) | \ + (pseg << R_CSR_DMW_32_VSEG_SHIFT); + } +} + +int get_physical_address(CPULoongArchState *env, hwaddr *physical, + int *prot, target_ulong address, + MMUAccessType access_type, int mmu_idx) +{ + int user_mode = mmu_idx == MMU_IDX_USER; + int kernel_mode = mmu_idx == MMU_IDX_KERNEL; + uint32_t plv, base_c, base_v; + int64_t addr_high; + uint8_t da = FIELD_EX64(env->CSR_CRMD, CSR_CRMD, DA); + uint8_t pg = FIELD_EX64(env->CSR_CRMD, CSR_CRMD, PG); + + /* Check PG and DA */ + if (da & !pg) { + *physical = address & TARGET_PHYS_MASK; + *prot = PAGE_READ | PAGE_WRITE | PAGE_EXEC; + return TLBRET_MATCH; + } + + plv = kernel_mode | (user_mode << R_CSR_DMW_PLV3_SHIFT); + if (is_la64(env)) { + base_v = address >> R_CSR_DMW_64_VSEG_SHIFT; + } else { + base_v = address >> R_CSR_DMW_32_VSEG_SHIFT; + } + /* Check direct map window */ + for (int i = 0; i < 4; i++) { + if (is_la64(env)) { + base_c = FIELD_EX64(env->CSR_DMW[i], CSR_DMW_64, VSEG); + } else { + base_c = FIELD_EX64(env->CSR_DMW[i], CSR_DMW_32, VSEG); + } + if ((plv & env->CSR_DMW[i]) && (base_c == base_v)) { + *physical = dmw_va2pa(env, address, env->CSR_DMW[i]); + *prot = PAGE_READ | PAGE_WRITE | PAGE_EXEC; + return TLBRET_MATCH; + } + } + + /* Check valid extension */ + addr_high = sextract64(address, TARGET_VIRT_ADDR_SPACE_BITS, 16); + if (!(addr_high == 0 || addr_high == -1)) { + return TLBRET_BADADDR; + } + + /* Mapped address */ + return loongarch_map_address(env, physical, prot, address, + access_type, mmu_idx); +} + +hwaddr loongarch_cpu_get_phys_page_debug(CPUState *cs, vaddr addr) +{ + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + CPULoongArchState *env = &cpu->env; + hwaddr phys_addr; + int prot; + + if (get_physical_address(env, &phys_addr, &prot, addr, MMU_DATA_LOAD, + cpu_mmu_index(env, false)) != 0) { + return -1; + } + return phys_addr; +} diff --git a/target/loongarch/internals.h b/target/loongarch/internals.h index 0beb034748..a2fc54c8a7 100644 --- a/target/loongarch/internals.h +++ b/target/loongarch/internals.h @@ -37,6 +37,17 @@ void restore_fp_status(CPULoongArchState *env); #endif #ifndef CONFIG_USER_ONLY +enum { + TLBRET_MATCH = 0, + TLBRET_BADADDR = 1, + TLBRET_NOMATCH = 2, + TLBRET_INVALID = 3, + TLBRET_DIRTY = 4, + TLBRET_RI = 5, + TLBRET_XI = 6, + TLBRET_PE = 7, +}; + extern const VMStateDescription vmstate_loongarch_cpu; void loongarch_cpu_set_irq(void *opaque, int irq, int level); @@ -46,12 +57,17 @@ uint64_t cpu_loongarch_get_constant_timer_counter(LoongArchCPU *cpu); uint64_t cpu_loongarch_get_constant_timer_ticks(LoongArchCPU *cpu); void cpu_loongarch_store_constant_timer_config(LoongArchCPU *cpu, uint64_t value); +bool loongarch_tlb_search(CPULoongArchState *env, target_ulong vaddr, + int *index); +int get_physical_address(CPULoongArchState *env, hwaddr *physical, + int *prot, target_ulong address, + MMUAccessType access_type, int mmu_idx); +hwaddr loongarch_cpu_get_phys_page_debug(CPUState *cpu, vaddr addr); + #ifdef CONFIG_TCG bool loongarch_cpu_tlb_fill(CPUState *cs, vaddr address, int size, MMUAccessType access_type, int mmu_idx, bool probe, uintptr_t retaddr); - -hwaddr loongarch_cpu_get_phys_page_debug(CPUState *cpu, vaddr addr); #endif #endif /* !CONFIG_USER_ONLY */ diff --git a/target/loongarch/meson.build 
b/target/loongarch/meson.build index db310f6022..e002e9aaf6 100644 --- a/target/loongarch/meson.build +++ b/target/loongarch/meson.build @@ -8,6 +8,7 @@ loongarch_ss.add(files( loongarch_system_ss = ss.source_set() loongarch_system_ss.add(files( + 'cpu_helper.c', 'loongarch-qmp-cmds.c', 'machine.c', )) diff --git a/target/loongarch/tcg/tlb_helper.c b/target/loongarch/tcg/tlb_helper.c index 449043c68b..804ab7a263 100644 --- a/target/loongarch/tcg/tlb_helper.c +++ b/target/loongarch/tcg/tlb_helper.c @@ -17,236 +17,6 @@ #include "exec/log.h" #include "cpu-csr.h" -enum { - TLBRET_MATCH = 0, - TLBRET_BADADDR = 1, - TLBRET_NOMATCH = 2, - TLBRET_INVALID = 3, - TLBRET_DIRTY = 4, - TLBRET_RI = 5, - TLBRET_XI = 6, - TLBRET_PE = 7, -}; - -static int loongarch_map_tlb_entry(CPULoongArchState *env, hwaddr *physical, - int *prot, target_ulong address, - int access_type, int index, int mmu_idx) -{ - LoongArchTLB *tlb = &env->tlb[index]; - uint64_t plv = mmu_idx; - uint64_t tlb_entry, tlb_ppn; - uint8_t tlb_ps, n, tlb_v, tlb_d, tlb_plv, tlb_nx, tlb_nr, tlb_rplv; - - if (index >= LOONGARCH_STLB) { - tlb_ps = FIELD_EX64(tlb->tlb_misc, TLB_MISC, PS); - } else { - tlb_ps = FIELD_EX64(env->CSR_STLBPS, CSR_STLBPS, PS); - } - n = (address >> tlb_ps) & 0x1;/* Odd or even */ - - tlb_entry = n ? tlb->tlb_entry1 : tlb->tlb_entry0; - tlb_v = FIELD_EX64(tlb_entry, TLBENTRY, V); - tlb_d = FIELD_EX64(tlb_entry, TLBENTRY, D); - tlb_plv = FIELD_EX64(tlb_entry, TLBENTRY, PLV); - if (is_la64(env)) { - tlb_ppn = FIELD_EX64(tlb_entry, TLBENTRY_64, PPN); - tlb_nx = FIELD_EX64(tlb_entry, TLBENTRY_64, NX); - tlb_nr = FIELD_EX64(tlb_entry, TLBENTRY_64, NR); - tlb_rplv = FIELD_EX64(tlb_entry, TLBENTRY_64, RPLV); - } else { - tlb_ppn = FIELD_EX64(tlb_entry, TLBENTRY_32, PPN); - tlb_nx = 0; - tlb_nr = 0; - tlb_rplv = 0; - } - - /* Remove sw bit between bit12 -- bit PS*/ - tlb_ppn = tlb_ppn & ~(((0x1UL << (tlb_ps - 12)) -1)); - - /* Check access rights */ - if (!tlb_v) { - return TLBRET_INVALID; - } - - if (access_type == MMU_INST_FETCH && tlb_nx) { - return TLBRET_XI; - } - - if (access_type == MMU_DATA_LOAD && tlb_nr) { - return TLBRET_RI; - } - - if (((tlb_rplv == 0) && (plv > tlb_plv)) || - ((tlb_rplv == 1) && (plv != tlb_plv))) { - return TLBRET_PE; - } - - if ((access_type == MMU_DATA_STORE) && !tlb_d) { - return TLBRET_DIRTY; - } - - *physical = (tlb_ppn << R_TLBENTRY_64_PPN_SHIFT) | - (address & MAKE_64BIT_MASK(0, tlb_ps)); - *prot = PAGE_READ; - if (tlb_d) { - *prot |= PAGE_WRITE; - } - if (!tlb_nx) { - *prot |= PAGE_EXEC; - } - return TLBRET_MATCH; -} - -/* - * One tlb entry holds an adjacent odd/even pair, the vpn is the - * content of the virtual page number divided by 2. So the - * compare vpn is bit[47:15] for 16KiB page. while the vppn - * field in tlb entry contains bit[47:13], so need adjust. 
- * virt_vpn = vaddr[47:13] - */ -static bool loongarch_tlb_search(CPULoongArchState *env, target_ulong vaddr, - int *index) -{ - LoongArchTLB *tlb; - uint16_t csr_asid, tlb_asid, stlb_idx; - uint8_t tlb_e, tlb_ps, tlb_g, stlb_ps; - int i, compare_shift; - uint64_t vpn, tlb_vppn; - - csr_asid = FIELD_EX64(env->CSR_ASID, CSR_ASID, ASID); - stlb_ps = FIELD_EX64(env->CSR_STLBPS, CSR_STLBPS, PS); - vpn = (vaddr & TARGET_VIRT_MASK) >> (stlb_ps + 1); - stlb_idx = vpn & 0xff; /* VA[25:15] <==> TLBIDX.index for 16KiB Page */ - compare_shift = stlb_ps + 1 - R_TLB_MISC_VPPN_SHIFT; - - /* Search STLB */ - for (i = 0; i < 8; ++i) { - tlb = &env->tlb[i * 256 + stlb_idx]; - tlb_e = FIELD_EX64(tlb->tlb_misc, TLB_MISC, E); - if (tlb_e) { - tlb_vppn = FIELD_EX64(tlb->tlb_misc, TLB_MISC, VPPN); - tlb_asid = FIELD_EX64(tlb->tlb_misc, TLB_MISC, ASID); - tlb_g = FIELD_EX64(tlb->tlb_entry0, TLBENTRY, G); - - if ((tlb_g == 1 || tlb_asid == csr_asid) && - (vpn == (tlb_vppn >> compare_shift))) { - *index = i * 256 + stlb_idx; - return true; - } - } - } - - /* Search MTLB */ - for (i = LOONGARCH_STLB; i < LOONGARCH_TLB_MAX; ++i) { - tlb = &env->tlb[i]; - tlb_e = FIELD_EX64(tlb->tlb_misc, TLB_MISC, E); - if (tlb_e) { - tlb_vppn = FIELD_EX64(tlb->tlb_misc, TLB_MISC, VPPN); - tlb_ps = FIELD_EX64(tlb->tlb_misc, TLB_MISC, PS); - tlb_asid = FIELD_EX64(tlb->tlb_misc, TLB_MISC, ASID); - tlb_g = FIELD_EX64(tlb->tlb_entry0, TLBENTRY, G); - compare_shift = tlb_ps + 1 - R_TLB_MISC_VPPN_SHIFT; - vpn = (vaddr & TARGET_VIRT_MASK) >> (tlb_ps + 1); - if ((tlb_g == 1 || tlb_asid == csr_asid) && - (vpn == (tlb_vppn >> compare_shift))) { - *index = i; - return true; - } - } - } - return false; -} - -static int loongarch_map_address(CPULoongArchState *env, hwaddr *physical, - int *prot, target_ulong address, - MMUAccessType access_type, int mmu_idx) -{ - int index, match; - - match = loongarch_tlb_search(env, address, &index); - if (match) { - return loongarch_map_tlb_entry(env, physical, prot, - address, access_type, index, mmu_idx); - } - - return TLBRET_NOMATCH; -} - -static hwaddr dmw_va2pa(CPULoongArchState *env, target_ulong va, - target_ulong dmw) -{ - if (is_la64(env)) { - return va & TARGET_VIRT_MASK; - } else { - uint32_t pseg = FIELD_EX32(dmw, CSR_DMW_32, PSEG); - return (va & MAKE_64BIT_MASK(0, R_CSR_DMW_32_VSEG_SHIFT)) | \ - (pseg << R_CSR_DMW_32_VSEG_SHIFT); - } -} - -static int get_physical_address(CPULoongArchState *env, hwaddr *physical, - int *prot, target_ulong address, - MMUAccessType access_type, int mmu_idx) -{ - int user_mode = mmu_idx == MMU_IDX_USER; - int kernel_mode = mmu_idx == MMU_IDX_KERNEL; - uint32_t plv, base_c, base_v; - int64_t addr_high; - uint8_t da = FIELD_EX64(env->CSR_CRMD, CSR_CRMD, DA); - uint8_t pg = FIELD_EX64(env->CSR_CRMD, CSR_CRMD, PG); - - /* Check PG and DA */ - if (da & !pg) { - *physical = address & TARGET_PHYS_MASK; - *prot = PAGE_READ | PAGE_WRITE | PAGE_EXEC; - return TLBRET_MATCH; - } - - plv = kernel_mode | (user_mode << R_CSR_DMW_PLV3_SHIFT); - if (is_la64(env)) { - base_v = address >> R_CSR_DMW_64_VSEG_SHIFT; - } else { - base_v = address >> R_CSR_DMW_32_VSEG_SHIFT; - } - /* Check direct map window */ - for (int i = 0; i < 4; i++) { - if (is_la64(env)) { - base_c = FIELD_EX64(env->CSR_DMW[i], CSR_DMW_64, VSEG); - } else { - base_c = FIELD_EX64(env->CSR_DMW[i], CSR_DMW_32, VSEG); - } - if ((plv & env->CSR_DMW[i]) && (base_c == base_v)) { - *physical = dmw_va2pa(env, address, env->CSR_DMW[i]); - *prot = PAGE_READ | PAGE_WRITE | PAGE_EXEC; - return TLBRET_MATCH; - } - } - - /* 
Check valid extension */ - addr_high = sextract64(address, TARGET_VIRT_ADDR_SPACE_BITS, 16); - if (!(addr_high == 0 || addr_high == -1)) { - return TLBRET_BADADDR; - } - - /* Mapped address */ - return loongarch_map_address(env, physical, prot, address, - access_type, mmu_idx); -} - -hwaddr loongarch_cpu_get_phys_page_debug(CPUState *cs, vaddr addr) -{ - LoongArchCPU *cpu = LOONGARCH_CPU(cs); - CPULoongArchState *env = &cpu->env; - hwaddr phys_addr; - int prot; - - if (get_physical_address(env, &phys_addr, &prot, addr, MMU_DATA_LOAD, - cpu_mmu_index(env, false)) != 0) { - return -1; - } - return phys_addr; -} - static void raise_mmu_exception(CPULoongArchState *env, target_ulong address, MMUAccessType access_type, int tlb_error) { -- Gitee From 4a5a9bef6eff5837dcccd216172957d8470b6245 Mon Sep 17 00:00:00 2001 From: Xianglai Li Date: Mon, 19 Feb 2024 18:34:14 +0800 Subject: [PATCH 035/939] loongarch: Change the UEFI loading mode to loongarch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The UEFI loading mode in loongarch is very different from that in other architectures:loongarch's UEFI code is in rom, while other architectures' UEFI code is in flash. loongarch UEFI can be loaded as follows: -machine virt,pflash=pflash0-format -bios ./QEMU_EFI.fd Other architectures load UEFI using the following methods: -machine virt,pflash0=pflash0-format,pflash1=pflash1-format loongarch's UEFI loading method makes qemu and libvirt incompatible when using NVRAM, and the cost of loongarch's current loading method far outweighs the benefits, so we decided to use the same UEFI loading scheme as other architectures. Cc: Andrea Bolognani Cc: maobibo@loongson.cn Cc: Philippe Mathieu-Daudé Cc: Song Gao Cc: zhaotianrui@loongson.cn Signed-off-by: Xianglai Li Tested-by: Andrea Bolognani Reviewed-by: Song Gao Message-Id: <0bd892aa9b88e0f4cc904cb70efd0251fc1cde29.1708336919.git.lixianglai@loongson.cn> Signed-off-by: Song Gao --- hw/loongarch/acpi-build.c | 29 +++++++++-- hw/loongarch/virt.c | 101 ++++++++++++++++++++++++++---------- include/hw/loongarch/virt.h | 10 ++-- 3 files changed, 107 insertions(+), 33 deletions(-) diff --git a/hw/loongarch/acpi-build.c b/hw/loongarch/acpi-build.c index ae292fc543..f990405d04 100644 --- a/hw/loongarch/acpi-build.c +++ b/hw/loongarch/acpi-build.c @@ -314,16 +314,39 @@ static void build_pci_device_aml(Aml *scope, LoongArchMachineState *lams) static void build_flash_aml(Aml *scope, LoongArchMachineState *lams) { Aml *dev, *crs; + MemoryRegion *flash_mem; - hwaddr flash_base = VIRT_FLASH_BASE; - hwaddr flash_size = VIRT_FLASH_SIZE; + hwaddr flash0_base; + hwaddr flash0_size; + + hwaddr flash1_base; + hwaddr flash1_size; + + flash_mem = pflash_cfi01_get_memory(lams->flash[0]); + flash0_base = flash_mem->addr; + flash0_size = memory_region_size(flash_mem); + + flash_mem = pflash_cfi01_get_memory(lams->flash[1]); + flash1_base = flash_mem->addr; + flash1_size = memory_region_size(flash_mem); dev = aml_device("FLS0"); aml_append(dev, aml_name_decl("_HID", aml_string("LNRO0015"))); aml_append(dev, aml_name_decl("_UID", aml_int(0))); crs = aml_resource_template(); - aml_append(crs, aml_memory32_fixed(flash_base, flash_size, AML_READ_WRITE)); + aml_append(crs, aml_memory32_fixed(flash0_base, flash0_size, + AML_READ_WRITE)); + aml_append(dev, aml_name_decl("_CRS", crs)); + aml_append(scope, dev); + + dev = aml_device("FLS1"); + aml_append(dev, aml_name_decl("_HID", aml_string("LNRO0015"))); + aml_append(dev, aml_name_decl("_UID", aml_int(1))); + 
+ crs = aml_resource_template(); + aml_append(crs, aml_memory32_fixed(flash1_base, flash1_size, + AML_READ_WRITE)); aml_append(dev, aml_name_decl("_CRS", crs)); aml_append(scope, dev); } diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index c9a680e61a..6ef40fa24a 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -54,7 +54,9 @@ struct loaderparams { const char *initrd_filename; }; -static void virt_flash_create(LoongArchMachineState *lams) +static PFlashCFI01 *virt_flash_create1(LoongArchMachineState *lams, + const char *name, + const char *alias_prop_name) { DeviceState *dev = qdev_new(TYPE_PFLASH_CFI01); @@ -66,45 +68,78 @@ static void virt_flash_create(LoongArchMachineState *lams) qdev_prop_set_uint16(dev, "id1", 0x18); qdev_prop_set_uint16(dev, "id2", 0x00); qdev_prop_set_uint16(dev, "id3", 0x00); - qdev_prop_set_string(dev, "name", "virt.flash"); - object_property_add_child(OBJECT(lams), "virt.flash", OBJECT(dev)); - object_property_add_alias(OBJECT(lams), "pflash", + qdev_prop_set_string(dev, "name", name); + object_property_add_child(OBJECT(lams), name, OBJECT(dev)); + object_property_add_alias(OBJECT(lams), alias_prop_name, OBJECT(dev), "drive"); + return PFLASH_CFI01(dev); +} - lams->flash = PFLASH_CFI01(dev); +static void virt_flash_create(LoongArchMachineState *lams) +{ + lams->flash[0] = virt_flash_create1(lams, "virt.flash0", "pflash0"); + lams->flash[1] = virt_flash_create1(lams, "virt.flash1", "pflash1"); } -static void virt_flash_map(LoongArchMachineState *lams, - MemoryRegion *sysmem) +static void virt_flash_map1(PFlashCFI01 *flash, + hwaddr base, hwaddr size, + MemoryRegion *sysmem) { - PFlashCFI01 *flash = lams->flash; DeviceState *dev = DEVICE(flash); - hwaddr base = VIRT_FLASH_BASE; - hwaddr size = VIRT_FLASH_SIZE; + BlockBackend *blk; + hwaddr real_size = size; + + blk = pflash_cfi01_get_blk(flash); + if (blk) { + real_size = blk_getlength(blk); + assert(real_size && real_size <= size); + } - assert(QEMU_IS_ALIGNED(size, VIRT_FLASH_SECTOR_SIZE)); - assert(size / VIRT_FLASH_SECTOR_SIZE <= UINT32_MAX); + assert(QEMU_IS_ALIGNED(real_size, VIRT_FLASH_SECTOR_SIZE)); + assert(real_size / VIRT_FLASH_SECTOR_SIZE <= UINT32_MAX); - qdev_prop_set_uint32(dev, "num-blocks", size / VIRT_FLASH_SECTOR_SIZE); + qdev_prop_set_uint32(dev, "num-blocks", real_size / VIRT_FLASH_SECTOR_SIZE); sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal); memory_region_add_subregion(sysmem, base, sysbus_mmio_get_region(SYS_BUS_DEVICE(dev), 0)); +} +static void virt_flash_map(LoongArchMachineState *lams, + MemoryRegion *sysmem) +{ + PFlashCFI01 *flash0 = lams->flash[0]; + PFlashCFI01 *flash1 = lams->flash[1]; + + virt_flash_map1(flash0, VIRT_FLASH0_BASE, VIRT_FLASH0_SIZE, sysmem); + virt_flash_map1(flash1, VIRT_FLASH1_BASE, VIRT_FLASH1_SIZE, sysmem); } static void fdt_add_flash_node(LoongArchMachineState *lams) { MachineState *ms = MACHINE(lams); char *nodename; + MemoryRegion *flash_mem; + + hwaddr flash0_base; + hwaddr flash0_size; - hwaddr flash_base = VIRT_FLASH_BASE; - hwaddr flash_size = VIRT_FLASH_SIZE; + hwaddr flash1_base; + hwaddr flash1_size; - nodename = g_strdup_printf("/flash@%" PRIx64, flash_base); + flash_mem = pflash_cfi01_get_memory(lams->flash[0]); + flash0_base = flash_mem->addr; + flash0_size = memory_region_size(flash_mem); + + flash_mem = pflash_cfi01_get_memory(lams->flash[1]); + flash1_base = flash_mem->addr; + flash1_size = memory_region_size(flash_mem); + + nodename = g_strdup_printf("/flash@%" PRIx64, flash0_base); qemu_fdt_add_subnode(ms->fdt, 
nodename); qemu_fdt_setprop_string(ms->fdt, nodename, "compatible", "cfi-flash"); qemu_fdt_setprop_sized_cells(ms->fdt, nodename, "reg", - 2, flash_base, 2, flash_size); + 2, flash0_base, 2, flash0_size, + 2, flash1_base, 2, flash1_size); qemu_fdt_setprop_cell(ms->fdt, nodename, "bank-width", 4); g_free(nodename); } @@ -639,12 +674,32 @@ static void loongarch_firmware_init(LoongArchMachineState *lams) { char *filename = MACHINE(lams)->firmware; char *bios_name = NULL; - int bios_size; + int bios_size, i; + BlockBackend *pflash_blk0; + MemoryRegion *mr; lams->bios_loaded = false; + /* Map legacy -drive if=pflash to machine properties */ + for (i = 0; i < ARRAY_SIZE(lams->flash); i++) { + pflash_cfi01_legacy_drive(lams->flash[i], + drive_get(IF_PFLASH, 0, i)); + } + virt_flash_map(lams, get_system_memory()); + pflash_blk0 = pflash_cfi01_get_blk(lams->flash[0]); + + if (pflash_blk0) { + if (filename) { + error_report("cannot use both '-bios' and '-drive if=pflash'" + "options at once"); + exit(1); + } + lams->bios_loaded = true; + return; + } + if (filename) { bios_name = qemu_find_file(QEMU_FILE_TYPE_BIOS, filename); if (!bios_name) { @@ -652,21 +707,15 @@ static void loongarch_firmware_init(LoongArchMachineState *lams) exit(1); } - bios_size = load_image_targphys(bios_name, VIRT_BIOS_BASE, VIRT_BIOS_SIZE); + mr = sysbus_mmio_get_region(SYS_BUS_DEVICE(lams->flash[0]), 0); + bios_size = load_image_mr(bios_name, mr); if (bios_size < 0) { error_report("Could not load ROM image '%s'", bios_name); exit(1); } - g_free(bios_name); - - memory_region_init_ram(&lams->bios, NULL, "loongarch.bios", - VIRT_BIOS_SIZE, &error_fatal); - memory_region_set_readonly(&lams->bios, true); - memory_region_add_subregion(get_system_memory(), VIRT_BIOS_BASE, &lams->bios); lams->bios_loaded = true; } - } static void reset_load_elf(void *opaque) diff --git a/include/hw/loongarch/virt.h b/include/hw/loongarch/virt.h index 6ef9a92394..252f7df7f4 100644 --- a/include/hw/loongarch/virt.h +++ b/include/hw/loongarch/virt.h @@ -18,10 +18,12 @@ #define VIRT_FWCFG_BASE 0x1e020000UL #define VIRT_BIOS_BASE 0x1c000000UL -#define VIRT_BIOS_SIZE (4 * MiB) +#define VIRT_BIOS_SIZE (16 * MiB) #define VIRT_FLASH_SECTOR_SIZE (128 * KiB) -#define VIRT_FLASH_BASE 0x1d000000UL -#define VIRT_FLASH_SIZE (16 * MiB) +#define VIRT_FLASH0_BASE VIRT_BIOS_BASE +#define VIRT_FLASH0_SIZE VIRT_BIOS_SIZE +#define VIRT_FLASH1_BASE 0x1d000000UL +#define VIRT_FLASH1_SIZE (16 * MiB) #define VIRT_LOWMEM_BASE 0 #define VIRT_LOWMEM_SIZE 0x10000000 @@ -49,7 +51,7 @@ struct LoongArchMachineState { int fdt_size; DeviceState *platform_bus_dev; PCIBus *pci_bus; - PFlashCFI01 *flash; + PFlashCFI01 *flash[2]; MemoryRegion system_iocsr; MemoryRegion iocsr_mem; AddressSpace as_iocsr; -- Gitee From 105ea4d8301791bbb5a76df1f527fb5df439c565 Mon Sep 17 00:00:00 2001 From: dinglimin Date: Tue, 27 Feb 2024 16:01:50 +0800 Subject: [PATCH 036/939] hw/acpi/cpu: Use CPUState typedef MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cheery-pick from b8492bd430ecc1ceb80cac19b46870d423f1e854 QEMU coding style recommend using structure typedefs: https://www.qemu.org/docs/master/devel/style.html#typedefs Signed-off-by: Philippe Mathieu-Daudé Signed-off-by: dinglimin --- include/hw/acpi/cpu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/hw/acpi/cpu.h b/include/hw/acpi/cpu.h index bc901660fb..209e1773f8 100644 --- a/include/hw/acpi/cpu.h +++ b/include/hw/acpi/cpu.h @@ -19,7 +19,7 @@ #include "hw/hotplug.h" typedef 
struct AcpiCpuStatus { - struct CPUState *cpu; + CPUState *cpu; uint64_t arch_id; bool is_inserting; bool is_removing; -- Gitee From 9558ea5d0bded6c9189adf2ce317cca205604c15 Mon Sep 17 00:00:00 2001 From: Binfeng Wu Date: Tue, 8 Feb 2022 17:00:39 +0800 Subject: [PATCH 037/939] vfio/pci: Ascend310 need 4Bytes quirk in bar4 --- hw/vfio/pci-quirks.c | 75 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c index 84b1a7b948..8fb190ce3c 100644 --- a/hw/vfio/pci-quirks.c +++ b/hw/vfio/pci-quirks.c @@ -1209,6 +1209,80 @@ int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev, return 0; } +#define PCI_VENDOR_ID_HUAWEI 0x19e5 +#define PCI_DEVICE_ID_ASCEND310 0xd100 +#define ASCEND310_XLOADER_SIZE 4 +#define ASCEND310_XLOADER_OFFSET 0x400 + +typedef struct VFIOAscendBarQuirk { + struct VFIOPCIDevice *vdev; + pcibus_t offset; + uint8_t bar; + MemoryRegion *mem; +} VFIOAscendBarQuirk; + +static uint64_t vfio_ascend_quirk_read(void *opaque, + hwaddr addr, unsigned size) +{ + VFIOAscendBarQuirk *quirk = opaque; + VFIOPCIDevice *vdev = quirk->vdev; + + qemu_log("read RO region! addr=0x%" HWADDR_PRIx ", size=%d\n", + addr + quirk->offset, size); + + return vfio_region_read(&vdev->bars[quirk->bar].region, + addr + quirk->offset, size); +} + +static void vfio_ascend_quirk_write(void *opaque, hwaddr addr, + uint64_t data, unsigned size) +{ + VFIOAscendBarQuirk *quirk = opaque; + + qemu_log("modifying RO region is not allowed! addr=0x%" + HWADDR_PRIx ", data=0x%" PRIx64 ", size=%d\n", + addr + quirk->offset, data, size); +} + +static const MemoryRegionOps vfio_ascend_intercept_regs_quirk = { + .read = vfio_ascend_quirk_read, + .write = vfio_ascend_quirk_write, + .endianness = DEVICE_LITTLE_ENDIAN, +}; + +static void vfio_probe_ascend310_bar4_quirk(VFIOPCIDevice *vdev, int nr) +{ + VFIOQuirk *quirk; + VFIOAscendBarQuirk *bar4_quirk; + + if (vdev->vendor_id != PCI_VENDOR_ID_HUAWEI || nr != 4 || + vdev->device_id != PCI_DEVICE_ID_ASCEND310) { + return; + } + + quirk = g_malloc0(sizeof(*quirk)); + quirk->nr_mem = 1; + quirk->mem = g_new0(MemoryRegion, quirk->nr_mem); + bar4_quirk = quirk->data = g_new0(typeof(*bar4_quirk), quirk->nr_mem); + bar4_quirk[0].vdev = vdev; + bar4_quirk[0].offset = ASCEND310_XLOADER_OFFSET; + bar4_quirk[0].bar = nr; + + /* + * intercept w/r to the xloader-updating register, + * so the vm can't enable xloader-updating + */ + memory_region_init_io(&quirk->mem[0], OBJECT(vdev), + &vfio_ascend_intercept_regs_quirk, + &bar4_quirk[0], + "vfio-ascend310-bar4-intercept-regs-quirk", + ASCEND310_XLOADER_SIZE); + memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, + bar4_quirk[0].offset, + &quirk->mem[0], 1); + QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); +} + /* * Common quirk probe entry points. 
*/ @@ -1261,6 +1335,7 @@ void vfio_bar_quirk_setup(VFIOPCIDevice *vdev, int nr) #ifdef CONFIG_VFIO_IGD vfio_probe_igd_bar4_quirk(vdev, nr); #endif + vfio_probe_ascend310_bar4_quirk(vdev, nr); } void vfio_bar_quirk_exit(VFIOPCIDevice *vdev, int nr) -- Gitee From f999392631e7f9fb15493f17b535a8a42ac88be2 Mon Sep 17 00:00:00 2001 From: Binfeng Wu Date: Tue, 8 Feb 2022 17:16:04 +0800 Subject: [PATCH 038/939] vfio/pci: Ascend710 need 4Bytes quirk in bar0 --- hw/vfio/pci-quirks.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c index 8fb190ce3c..9ef4b63e82 100644 --- a/hw/vfio/pci-quirks.c +++ b/hw/vfio/pci-quirks.c @@ -1210,7 +1210,10 @@ int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev, } #define PCI_VENDOR_ID_HUAWEI 0x19e5 +#define PCI_DEVICE_ID_ASCEND710 0xd500 #define PCI_DEVICE_ID_ASCEND310 0xd100 +#define ASCEND710_XLOADER_SIZE 4 +#define ASCEND710_XLOADER_OFFSET 0x20430 #define ASCEND310_XLOADER_SIZE 4 #define ASCEND310_XLOADER_OFFSET 0x400 @@ -1250,6 +1253,39 @@ static const MemoryRegionOps vfio_ascend_intercept_regs_quirk = { .endianness = DEVICE_LITTLE_ENDIAN, }; +static void vfio_probe_ascend710_bar0_quirk(VFIOPCIDevice *vdev, int nr) +{ + VFIOQuirk *quirk; + VFIOAscendBarQuirk *bar0_quirk; + + if (vdev->vendor_id != PCI_VENDOR_ID_HUAWEI || nr != 0 || + vdev->device_id != PCI_DEVICE_ID_ASCEND710) { + return; + } + + quirk = g_malloc0(sizeof(*quirk)); + quirk->nr_mem = 1; + quirk->mem = g_new0(MemoryRegion, quirk->nr_mem); + bar0_quirk = quirk->data = g_new0(typeof(*bar0_quirk), quirk->nr_mem); + bar0_quirk[0].vdev = vdev; + bar0_quirk[0].offset = ASCEND710_XLOADER_OFFSET; + bar0_quirk[0].bar = nr; + + /* + * intercept w/r to the xloader-updating register, + * so the vm can't enable xloader-updating + */ + memory_region_init_io(&quirk->mem[0], OBJECT(vdev), + &vfio_ascend_intercept_regs_quirk, + &bar0_quirk[0], + "vfio-ascend710-bar0-intercept-regs-quirk", + ASCEND710_XLOADER_SIZE); + memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, + bar0_quirk[0].offset, + &quirk->mem[0], 1); + QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); +} + static void vfio_probe_ascend310_bar4_quirk(VFIOPCIDevice *vdev, int nr) { VFIOQuirk *quirk; @@ -1335,6 +1371,7 @@ void vfio_bar_quirk_setup(VFIOPCIDevice *vdev, int nr) #ifdef CONFIG_VFIO_IGD vfio_probe_igd_bar4_quirk(vdev, nr); #endif + vfio_probe_ascend710_bar0_quirk(vdev, nr); vfio_probe_ascend310_bar4_quirk(vdev, nr); } -- Gitee From 5b068100780cf91cc1696589d2115ba3078f9d38 Mon Sep 17 00:00:00 2001 From: Binfeng Wu Date: Tue, 8 Feb 2022 19:20:36 +0800 Subject: [PATCH 039/939] vfio/pci: Ascend910 need 4Bytes quirk in bar0 --- hw/vfio/pci-quirks.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c index 9ef4b63e82..ba4d8f020c 100644 --- a/hw/vfio/pci-quirks.c +++ b/hw/vfio/pci-quirks.c @@ -1210,8 +1210,11 @@ int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev, } #define PCI_VENDOR_ID_HUAWEI 0x19e5 +#define PCI_DEVICE_ID_ASCEND910 0xd801 #define PCI_DEVICE_ID_ASCEND710 0xd500 #define PCI_DEVICE_ID_ASCEND310 0xd100 +#define ASCEND910_XLOADER_SIZE 4 +#define ASCEND910_XLOADER_OFFSET 0x80400 #define ASCEND710_XLOADER_SIZE 4 #define ASCEND710_XLOADER_OFFSET 0x20430 #define ASCEND310_XLOADER_SIZE 4 @@ -1253,6 +1256,39 @@ static const MemoryRegionOps vfio_ascend_intercept_regs_quirk = { .endianness = DEVICE_LITTLE_ENDIAN, }; +static void 
vfio_probe_ascend910_bar0_quirk(VFIOPCIDevice *vdev, int nr) +{ + VFIOQuirk *quirk; + VFIOAscendBarQuirk *bar0_quirk; + + if (vdev->vendor_id != PCI_VENDOR_ID_HUAWEI || nr != 0 || + vdev->device_id != PCI_DEVICE_ID_ASCEND910) { + return; + } + + quirk = g_malloc0(sizeof(*quirk)); + quirk->nr_mem = 1; + quirk->mem = g_new0(MemoryRegion, quirk->nr_mem); + bar0_quirk = quirk->data = g_new0(typeof(*bar0_quirk), quirk->nr_mem); + bar0_quirk[0].vdev = vdev; + bar0_quirk[0].offset = ASCEND910_XLOADER_OFFSET; + bar0_quirk[0].bar = nr; + + /* + * intercept w/r to the xloader-updating register, + * so the vm can't enable xloader-updating + */ + memory_region_init_io(&quirk->mem[0], OBJECT(vdev), + &vfio_ascend_intercept_regs_quirk, + &bar0_quirk[0], + "vfio-ascend910-bar0-intercept-regs-quirk", + ASCEND910_XLOADER_SIZE); + memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, + bar0_quirk[0].offset, + &quirk->mem[0], 1); + QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); +} + static void vfio_probe_ascend710_bar0_quirk(VFIOPCIDevice *vdev, int nr) { VFIOQuirk *quirk; @@ -1371,6 +1407,7 @@ void vfio_bar_quirk_setup(VFIOPCIDevice *vdev, int nr) #ifdef CONFIG_VFIO_IGD vfio_probe_igd_bar4_quirk(vdev, nr); #endif + vfio_probe_ascend910_bar0_quirk(vdev, nr); vfio_probe_ascend710_bar0_quirk(vdev, nr); vfio_probe_ascend310_bar4_quirk(vdev, nr); } -- Gitee From 782040a627d0c3a44a9259a9055610e25c1f44fe Mon Sep 17 00:00:00 2001 From: Wu Binfeng Date: Mon, 25 Apr 2022 15:17:48 +0800 Subject: [PATCH 040/939] vfio/pci: Ascend710 change to bar2 quirk Change Ascend710's quirk regions to bar2 for internal causes. And support Ascend710 2P format now. --- hw/vfio/pci-quirks.c | 64 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 51 insertions(+), 13 deletions(-) diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c index ba4d8f020c..a71ebe26b4 100644 --- a/hw/vfio/pci-quirks.c +++ b/hw/vfio/pci-quirks.c @@ -1213,10 +1213,17 @@ int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev, #define PCI_DEVICE_ID_ASCEND910 0xd801 #define PCI_DEVICE_ID_ASCEND710 0xd500 #define PCI_DEVICE_ID_ASCEND310 0xd100 +#define PCI_SUB_DEVICE_ID_ASCEND710_1P_MIN 0x100 +#define PCI_SUB_DEVICE_ID_ASCEND710_1P_MAX 0x10f +#define PCI_SUB_DEVICE_ID_ASCEND710_2P_MIN 0x110 +#define PCI_SUB_DEVICE_ID_ASCEND710_2P_MAX 0x11f #define ASCEND910_XLOADER_SIZE 4 #define ASCEND910_XLOADER_OFFSET 0x80400 +#define ASCEND710_2P_BASE (128 * 1024 * 1024) +#define ASCEND710_1P_DEVNUM 1 +#define ASCEND710_2P_DEVNUM 2 #define ASCEND710_XLOADER_SIZE 4 -#define ASCEND710_XLOADER_OFFSET 0x20430 +#define ASCEND710_XLOADER_OFFSET 0x100430 #define ASCEND310_XLOADER_SIZE 4 #define ASCEND310_XLOADER_OFFSET 0x400 @@ -1289,23 +1296,38 @@ static void vfio_probe_ascend910_bar0_quirk(VFIOPCIDevice *vdev, int nr) QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); } -static void vfio_probe_ascend710_bar0_quirk(VFIOPCIDevice *vdev, int nr) +static void vfio_probe_ascend710_bar2_quirk(VFIOPCIDevice *vdev, int nr) { VFIOQuirk *quirk; - VFIOAscendBarQuirk *bar0_quirk; + VFIOAscendBarQuirk *bar2_quirk; + int sub_device_id; + int devnum = 0; - if (vdev->vendor_id != PCI_VENDOR_ID_HUAWEI || nr != 0 || + if (vdev->vendor_id != PCI_VENDOR_ID_HUAWEI || nr != 2 || vdev->device_id != PCI_DEVICE_ID_ASCEND710) { return; } + sub_device_id = pci_get_word(vdev->pdev.config + PCI_SUBSYSTEM_ID); + if (sub_device_id >= PCI_SUB_DEVICE_ID_ASCEND710_1P_MIN && + sub_device_id <= PCI_SUB_DEVICE_ID_ASCEND710_1P_MAX) { + devnum = ASCEND710_1P_DEVNUM; + } else if 
(sub_device_id >= PCI_SUB_DEVICE_ID_ASCEND710_2P_MIN && + sub_device_id <= PCI_SUB_DEVICE_ID_ASCEND710_2P_MAX) { + devnum = ASCEND710_2P_DEVNUM; + } + + if (devnum != ASCEND710_1P_DEVNUM && devnum != ASCEND710_2P_DEVNUM) { + return; + } + quirk = g_malloc0(sizeof(*quirk)); - quirk->nr_mem = 1; + quirk->nr_mem = devnum; quirk->mem = g_new0(MemoryRegion, quirk->nr_mem); - bar0_quirk = quirk->data = g_new0(typeof(*bar0_quirk), quirk->nr_mem); - bar0_quirk[0].vdev = vdev; - bar0_quirk[0].offset = ASCEND710_XLOADER_OFFSET; - bar0_quirk[0].bar = nr; + bar2_quirk = quirk->data = g_new0(typeof(*bar2_quirk), quirk->nr_mem); + bar2_quirk[0].vdev = vdev; + bar2_quirk[0].offset = ASCEND710_XLOADER_OFFSET; + bar2_quirk[0].bar = nr; /* * intercept w/r to the xloader-updating register, @@ -1313,12 +1335,28 @@ static void vfio_probe_ascend710_bar0_quirk(VFIOPCIDevice *vdev, int nr) */ memory_region_init_io(&quirk->mem[0], OBJECT(vdev), &vfio_ascend_intercept_regs_quirk, - &bar0_quirk[0], - "vfio-ascend710-bar0-intercept-regs-quirk", + &bar2_quirk[0], + "vfio-ascend710-bar2-1p-intercept-regs-quirk", ASCEND710_XLOADER_SIZE); memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, - bar0_quirk[0].offset, + bar2_quirk[0].offset, &quirk->mem[0], 1); + + if (devnum == ASCEND710_2P_DEVNUM) { + bar2_quirk[1].vdev = vdev; + bar2_quirk[1].offset = (ASCEND710_2P_BASE + ASCEND710_XLOADER_OFFSET); + bar2_quirk[1].bar = nr; + + memory_region_init_io(&quirk->mem[1], OBJECT(vdev), + &vfio_ascend_intercept_regs_quirk, + &bar2_quirk[1], + "vfio-ascend710-bar2-2p-intercept-regs-quirk", + ASCEND710_XLOADER_SIZE); + memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, + bar2_quirk[1].offset, + &quirk->mem[1], 1); + } + QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); } @@ -1408,7 +1446,7 @@ void vfio_bar_quirk_setup(VFIOPCIDevice *vdev, int nr) vfio_probe_igd_bar4_quirk(vdev, nr); #endif vfio_probe_ascend910_bar0_quirk(vdev, nr); - vfio_probe_ascend710_bar0_quirk(vdev, nr); + vfio_probe_ascend710_bar2_quirk(vdev, nr); vfio_probe_ascend310_bar4_quirk(vdev, nr); } -- Gitee From 77b2f29dce6ddedcc13488eb80add2f9023b4b89 Mon Sep 17 00:00:00 2001 From: dinglimin Date: Wed, 13 Mar 2024 11:23:35 +0800 Subject: [PATCH 041/939] virtio-gpu: remove needless condition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cheery-pick from cab47b210598c11b76053a01316df9835b94dc09 qemu_create_displaysurface_pixman() never returns NULL. 
Signed-off-by: Marc-André Lureau Signed-off-by: dinglimin --- hw/display/virtio-gpu.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/hw/display/virtio-gpu.c b/hw/display/virtio-gpu.c index b016d3bac8..b02d1e3a4c 100644 --- a/hw/display/virtio-gpu.c +++ b/hw/display/virtio-gpu.c @@ -679,10 +679,6 @@ static void virtio_gpu_do_set_scanout(VirtIOGPU *g, /* realloc the surface ptr */ scanout->ds = qemu_create_displaysurface_pixman(rect); - if (!scanout->ds) { - *error = VIRTIO_GPU_RESP_ERR_UNSPEC; - return; - } #ifdef WIN32 qemu_displaysurface_win32_set_handle(scanout->ds, res->handle, fb->offset); #endif @@ -1418,9 +1414,6 @@ static int virtio_gpu_post_load(void *opaque, int version_id) return -EINVAL; } scanout->ds = qemu_create_displaysurface_pixman(res->image); - if (!scanout->ds) { - return -EINVAL; - } #ifdef WIN32 qemu_displaysurface_win32_set_handle(scanout->ds, res->handle, 0); #endif -- Gitee From 3fe9a15feba924675ffcc5b797185091cfb8a007 Mon Sep 17 00:00:00 2001 From: libai Date: Mon, 4 Dec 2023 14:49:53 +0800 Subject: [PATCH 042/939] vhost-vdpa: add VHOST_BACKEND_F_BYTEMAPLOG support Add VHOST_BACKEND_F_BYTEMAPLOG to support vhost device bytemap logging. Signed-off-by: libai --- hw/virtio/vhost-vdpa.c | 9 +++++---- include/standard-headers/linux/vhost_types.h | 2 ++ 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index 819b2d811a..ce8ff7f417 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -829,10 +829,11 @@ static int vhost_vdpa_set_features(struct vhost_dev *dev, static int vhost_vdpa_set_backend_cap(struct vhost_dev *dev) { uint64_t features; - uint64_t f = 0x1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2 | - 0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH | - 0x1ULL << VHOST_BACKEND_F_IOTLB_ASID | - 0x1ULL << VHOST_BACKEND_F_SUSPEND; + uint64_t f = BIT_ULL(VHOST_BACKEND_F_IOTLB_MSG_V2) | + BIT_ULL(VHOST_BACKEND_F_IOTLB_BATCH) | + BIT_ULL(VHOST_BACKEND_F_IOTLB_ASID) | + BIT_ULL(VHOST_BACKEND_F_SUSPEND) | + BIT_ULL(VHOST_BACKEND_F_BYTEMAPLOG); int r; if (vhost_vdpa_call(dev, VHOST_GET_BACKEND_FEATURES, &features)) { diff --git a/include/standard-headers/linux/vhost_types.h b/include/standard-headers/linux/vhost_types.h index fd54044936..46fc53cd83 100644 --- a/include/standard-headers/linux/vhost_types.h +++ b/include/standard-headers/linux/vhost_types.h @@ -192,5 +192,7 @@ struct vhost_vdpa_iova_range { #define VHOST_BACKEND_F_DESC_ASID 0x7 /* IOTLB don't flush memory mapping across device reset */ #define VHOST_BACKEND_F_IOTLB_PERSIST 0x8 +/* device can use bytemap log */ +#define VHOST_BACKEND_F_BYTEMAPLOG 0x3f #endif -- Gitee From 3bc7a4e430e01fd90b427bf74a904664eda9ece6 Mon Sep 17 00:00:00 2001 From: libai Date: Mon, 4 Dec 2023 15:04:25 +0800 Subject: [PATCH 043/939] vhost-vdpa: add migration log ops for VhostOps Implement vhost_set_log_size for setting the log buffer size. Implement vhost_set_log_fd to specify an eventfd to signal on log writes. Implement vhost_log_sync for retrieving the dirty map logged by the vhost backend. 
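For illustration only (this sketch is not part of the patch): once the generic vhost layer is taught about these ops, as done by a later patch in this series, a backend that advertises bytemap logging would be driven roughly as follows; here dev, log and log_size stand in for the usual struct vhost_dev, struct vhost_log and the log size in chunks tracked by hw/virtio/vhost.c, and vhost_set_log_fd follows the same NULL-checked pattern:

    /* resize path: hand the (re)allocated log buffer to the backend */
    r = dev->vhost_ops->vhost_set_log_base(dev, (uintptr_t)log->log, log);
    if (!r && dev->vhost_ops->vhost_set_log_size) {
        r = dev->vhost_ops->vhost_set_log_size(dev, log_size, log);
    }

    /*
     * log_sync path: ask the backend to publish its dirty map before
     * QEMU scans it from the memory listener's log_sync hook.
     */
    if (dev->vhost_ops->vhost_log_sync) {
        r = dev->vhost_ops->vhost_log_sync(dev);
    }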
Signed-off-by: libai --- hw/virtio/vhost-vdpa.c | 37 +++++++++++++++++++++++++++++++ include/hw/virtio/vhost-backend.h | 8 +++++++ linux-headers/linux/vhost.h | 4 ++++ 3 files changed, 49 insertions(+) diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index ce8ff7f417..037a9c6e4c 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -1355,6 +1355,30 @@ static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base, return vhost_vdpa_call(dev, VHOST_SET_LOG_BASE, &base); } +static int vhost_vdpa_set_log_fd(struct vhost_dev *dev, int fd, + struct vhost_log *log) +{ + struct vhost_vdpa *v = dev->opaque; + if (v->shadow_vqs_enabled || !vhost_vdpa_first_dev(dev)) { + return 0; + } + + return vhost_vdpa_call(dev, VHOST_SET_LOG_FD, &fd); +} + +static int vhost_vdpa_set_log_size(struct vhost_dev *dev, uint64_t size, + struct vhost_log *log) +{ + struct vhost_vdpa *v = dev->opaque; + uint64_t logsize = size * sizeof(*(log->log)); + + if (v->shadow_vqs_enabled || !vhost_vdpa_first_dev(dev)) { + return 0; + } + + return vhost_vdpa_call(dev, VHOST_SET_LOG_SIZE, &logsize); +} + static int vhost_vdpa_set_vring_addr(struct vhost_dev *dev, struct vhost_vring_addr *addr) { @@ -1489,11 +1513,23 @@ static bool vhost_vdpa_force_iommu(struct vhost_dev *dev) return true; } +static int vhost_vdpa_log_sync(struct vhost_dev *dev) +{ + struct vhost_vdpa *v = dev->opaque; + if (v->shadow_vqs_enabled || !vhost_vdpa_first_dev(dev)) { + return 0; + } + + return vhost_vdpa_call(dev, VHOST_LOG_SYNC, NULL); +} + const VhostOps vdpa_ops = { .backend_type = VHOST_BACKEND_TYPE_VDPA, .vhost_backend_init = vhost_vdpa_init, .vhost_backend_cleanup = vhost_vdpa_cleanup, .vhost_set_log_base = vhost_vdpa_set_log_base, + .vhost_set_log_size = vhost_vdpa_set_log_size, + .vhost_set_log_fd = vhost_vdpa_set_log_fd, .vhost_set_vring_addr = vhost_vdpa_set_vring_addr, .vhost_set_vring_num = vhost_vdpa_set_vring_num, .vhost_set_vring_base = vhost_vdpa_set_vring_base, @@ -1520,6 +1556,7 @@ const VhostOps vdpa_ops = { .vhost_get_device_id = vhost_vdpa_get_device_id, .vhost_vq_get_addr = vhost_vdpa_vq_get_addr, .vhost_force_iommu = vhost_vdpa_force_iommu, + .vhost_log_sync = vhost_vdpa_log_sync, .vhost_set_config_call = vhost_vdpa_set_config_call, .vhost_reset_status = vhost_vdpa_reset_status, }; diff --git a/include/hw/virtio/vhost-backend.h b/include/hw/virtio/vhost-backend.h index a86d103f82..71b02e4a12 100644 --- a/include/hw/virtio/vhost-backend.h +++ b/include/hw/virtio/vhost-backend.h @@ -65,6 +65,11 @@ typedef int (*vhost_scsi_get_abi_version_op)(struct vhost_dev *dev, int *version); typedef int (*vhost_set_log_base_op)(struct vhost_dev *dev, uint64_t base, struct vhost_log *log); +typedef int (*vhost_set_log_size_op)(struct vhost_dev *dev, uint64_t size, + struct vhost_log *log); +typedef int (*vhost_set_log_fd_op)(struct vhost_dev *dev, int fd, + struct vhost_log *log); +typedef int (*vhost_log_sync_op)(struct vhost_dev *dev); typedef int (*vhost_set_mem_table_op)(struct vhost_dev *dev, struct vhost_memory *mem); typedef int (*vhost_set_vring_addr_op)(struct vhost_dev *dev, @@ -162,6 +167,9 @@ typedef struct VhostOps { vhost_scsi_clear_endpoint_op vhost_scsi_clear_endpoint; vhost_scsi_get_abi_version_op vhost_scsi_get_abi_version; vhost_set_log_base_op vhost_set_log_base; + vhost_set_log_size_op vhost_set_log_size; + vhost_set_log_fd_op vhost_set_log_fd; + vhost_log_sync_op vhost_log_sync; vhost_set_mem_table_op vhost_set_mem_table; vhost_set_vring_addr_op vhost_set_vring_addr; 
vhost_set_vring_endian_op vhost_set_vring_endian; diff --git a/linux-headers/linux/vhost.h b/linux-headers/linux/vhost.h index 649560c685..19dc7fd36c 100644 --- a/linux-headers/linux/vhost.h +++ b/linux-headers/linux/vhost.h @@ -43,6 +43,10 @@ * The bit is set using an atomic 32 bit operation. */ /* Set base address for logging. */ #define VHOST_SET_LOG_BASE _IOW(VHOST_VIRTIO, 0x04, __u64) +/* Set buffer size for logging */ +#define VHOST_SET_LOG_SIZE _IOW(VHOST_VIRTIO, 0x05, __u64) +/* Logging sync */ +#define VHOST_LOG_SYNC _IO(VHOST_VIRTIO, 0x06) /* Specify an eventfd file descriptor to signal on log write. */ #define VHOST_SET_LOG_FD _IOW(VHOST_VIRTIO, 0x07, int) /* By default, a device gets one vhost_worker that its virtqueues share. This -- Gitee From 962acd498b11ae5ccc040d76ec89990add119dec Mon Sep 17 00:00:00 2001 From: libai Date: Mon, 4 Dec 2023 15:09:26 +0800 Subject: [PATCH 044/939] vhost: introduce bytemap for vhost backend logging As vhost backend may use bytemap for logging, when get log_size of vhost device, check whether vhost device support VHOST_BACKEND_F_BYTEMAPLOG. If vhost device support, use bytemap for logging. By the way, add log_resize func pointer check and vhost_log_sync return value check. Signed-off-by: libai --- hw/virtio/vhost.c | 89 ++++++++++++++++++++++++++++++++++++--- include/exec/memory.h | 9 ++++ include/exec/ram_addr.h | 44 +++++++++++++++++++ include/hw/virtio/vhost.h | 1 + system/physmem.c | 11 +++++ 5 files changed, 148 insertions(+), 6 deletions(-) diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index 038ac37dd0..438182d850 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -29,6 +29,7 @@ #include "migration/migration.h" #include "sysemu/dma.h" #include "trace.h" +#include "qapi/qapi-commands-migration.h" /* enabled until disconnected backend stabilizes */ #define _VHOST_DEBUG 1 @@ -44,6 +45,11 @@ do { } while (0) #endif +static inline bool vhost_bytemap_log_support(struct vhost_dev *dev) +{ + return (dev->backend_cap & BIT_ULL(VHOST_BACKEND_F_BYTEMAPLOG)); +} + static struct vhost_log *vhost_log; static struct vhost_log *vhost_log_shm; @@ -232,12 +238,40 @@ static int vhost_sync_dirty_bitmap(struct vhost_dev *dev, return 0; } +static int vhost_sync_dirty_bytemap(struct vhost_dev *dev, + MemoryRegionSection *section) +{ + unsigned long *bytemap = dev->log->log; + return memory_section_set_dirty_bytemap(section, bytemap); +} + static void vhost_log_sync(MemoryListener *listener, MemoryRegionSection *section) { struct vhost_dev *dev = container_of(listener, struct vhost_dev, memory_listener); - vhost_sync_dirty_bitmap(dev, section, 0x0, ~0x0ULL); + MigrationState *ms = migrate_get_current(); + + if (!dev->log_enabled || !dev->started) { + return; + } + + if (dev->vhost_ops->vhost_log_sync) { + int r = dev->vhost_ops->vhost_log_sync(dev); + if (r < 0) { + error_report("Failed to sync dirty log: 0x%x\n", r); + if (migration_is_running(ms->state)) { + qmp_migrate_cancel(NULL); + } + return; + } + } + + if (vhost_bytemap_log_support(dev)) { + vhost_sync_dirty_bytemap(dev, section); + } else { + vhost_sync_dirty_bitmap(dev, section, 0x0, ~0x0ULL); + } } static void vhost_log_sync_range(struct vhost_dev *dev, @@ -247,7 +281,11 @@ static void vhost_log_sync_range(struct vhost_dev *dev, /* FIXME: this is N^2 in number of sections */ for (i = 0; i < dev->n_mem_sections; ++i) { MemoryRegionSection *section = &dev->mem_sections[i]; - vhost_sync_dirty_bitmap(dev, section, first, last); + if (vhost_bytemap_log_support(dev)) { + 
vhost_sync_dirty_bytemap(dev, section); + } else { + vhost_sync_dirty_bitmap(dev, section, first, last); + } } } @@ -255,11 +293,19 @@ static uint64_t vhost_get_log_size(struct vhost_dev *dev) { uint64_t log_size = 0; int i; + uint64_t vhost_log_chunk_size; + + if (vhost_bytemap_log_support(dev)) { + vhost_log_chunk_size = VHOST_LOG_CHUNK_BYTES; + } else { + vhost_log_chunk_size = VHOST_LOG_CHUNK; + } + for (i = 0; i < dev->mem->nregions; ++i) { struct vhost_memory_region *reg = dev->mem->regions + i; uint64_t last = range_get_last(reg->guest_phys_addr, reg->memory_size); - log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1); + log_size = MAX(log_size, last / vhost_log_chunk_size + 1); } return log_size; } @@ -377,12 +423,21 @@ static bool vhost_dev_log_is_shared(struct vhost_dev *dev) dev->vhost_ops->vhost_requires_shm_log(dev); } -static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size) +static inline int vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size) { struct vhost_log *log = vhost_log_get(size, vhost_dev_log_is_shared(dev)); - uint64_t log_base = (uintptr_t)log->log; + uint64_t log_base; + int log_fd; int r; + if (!log) { + r = -ENOMEM; + goto out; + } + + log_base = (uint64_t)log->log; + log_fd = log_fd; + /* inform backend of log switching, this must be done before releasing the current log, to ensure no logging is lost */ r = dev->vhost_ops->vhost_set_log_base(dev, log_base, log); @@ -390,9 +445,19 @@ static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size) VHOST_OPS_DEBUG(r, "vhost_set_log_base failed"); } + if (dev->vhost_ops->vhost_set_log_size) { + r = dev->vhost_ops->vhost_set_log_size(dev, size, dev->log); + if (r < 0) { + VHOST_OPS_DEBUG(r, "vhost_set_log_size failed"); + } + } + vhost_log_put(dev, true); dev->log = log; dev->log_size = size; + +out: + return r; } static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr, @@ -1018,7 +1083,11 @@ static int vhost_migration_log(MemoryListener *listener, bool enable) } vhost_log_put(dev, false); } else { - vhost_dev_log_resize(dev, vhost_get_log_size(dev)); + r = vhost_dev_log_resize(dev, vhost_get_log_size(dev)); + if ( r < 0 ) { + return r; + } + r = vhost_dev_set_log(dev, true); if (r < 0) { goto check_dev_state; @@ -2057,6 +2126,14 @@ int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings) VHOST_OPS_DEBUG(r, "vhost_set_log_base failed"); goto fail_log; } + + if (hdev->vhost_ops->vhost_set_log_size) { + r = hdev->vhost_ops->vhost_set_log_size(hdev, hdev->log_size, hdev->log); + if (r < 0) { + VHOST_OPS_DEBUG(r, "vhost_set_log_size failed"); + goto fail_log; + } + } } if (vrings) { r = vhost_dev_set_vring_enable(hdev, true); diff --git a/include/exec/memory.h b/include/exec/memory.h index 831f7c996d..e131c2682c 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -2594,6 +2594,15 @@ MemTxResult memory_region_dispatch_write(MemoryRegion *mr, MemOp op, MemTxAttrs attrs); +/** + * memory_section_set_dirty_bytemap: Mark a range of bytes as dirty for a memory section + * using a bytemap + * + * @section: the memory section being dirtied. + * @bytemap: bytemap that stores dirty page range information. 
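+ *
+ * Returns: the number of pages marked dirty for migration from @bytemap.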
+ */ +int64_t memory_section_set_dirty_bytemap(MemoryRegionSection *section, unsigned long *bytemap); + /** * address_space_init: initializes an address space * diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h index 90676093f5..ef6988b445 100644 --- a/include/exec/ram_addr.h +++ b/include/exec/ram_addr.h @@ -535,5 +535,49 @@ uint64_t cpu_physical_memory_sync_dirty_bitmap(RAMBlock *rb, return num_dirty; } + +#define BYTES_PER_LONG (sizeof(unsigned long)) +#define BYTE_WORD(nr) ((nr) / BYTES_PER_LONG) +#define BYTES_TO_LONGS(nr) DIV_ROUND_UP(nr, BYTES_PER_LONG) + +static inline int64_t _set_dirty_bytemap_atomic(unsigned long *bytemap, unsigned long cur_pfn) +{ + char *byte_of_long = (char *)bytemap; + int i; + int64_t dirty_num = 0; + + for (i = 0; i < BYTES_PER_LONG; i++) { + if (byte_of_long[i]) { + cpu_physical_memory_set_dirty_range((cur_pfn + i) << TARGET_PAGE_BITS, + TARGET_PAGE_SIZE, + 1 << DIRTY_MEMORY_MIGRATION); + /* Per byte ops, no need to atomic_xchg */ + byte_of_long[i] = 0; + dirty_num++; + } + } + + return dirty_num; +} + +static inline int64_t cpu_physical_memory_set_dirty_bytemap(unsigned long *bytemap, + ram_addr_t start, + ram_addr_t pages) +{ + unsigned long i; + unsigned long len = BYTES_TO_LONGS(pages); + unsigned long pfn = (start >> TARGET_PAGE_BITS) / + BYTES_PER_LONG * BYTES_PER_LONG; + int64_t dirty_mig_bits = 0; + + for (i = 0; i < len; i++) { + if (bytemap[i]) { + dirty_mig_bits += _set_dirty_bytemap_atomic(&bytemap[i], + pfn + BYTES_PER_LONG * i); + } + } + + return dirty_mig_bits; +} #endif #endif diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h index 444ca0ad42..6ae86833e3 100644 --- a/include/hw/virtio/vhost.h +++ b/include/hw/virtio/vhost.h @@ -43,6 +43,7 @@ typedef unsigned long vhost_log_chunk_t; #define VHOST_LOG_PAGE 0x1000 #define VHOST_LOG_BITS (8 * sizeof(vhost_log_chunk_t)) #define VHOST_LOG_CHUNK (VHOST_LOG_PAGE * VHOST_LOG_BITS) +#define VHOST_LOG_CHUNK_BYTES (VHOST_LOG_PAGE * sizeof(vhost_log_chunk_t)) #define VHOST_INVALID_FEATURE_BIT (0xff) #define VHOST_QUEUE_NUM_CONFIG_INR 0 diff --git a/system/physmem.c b/system/physmem.c index f14d64819b..247c252e53 100644 --- a/system/physmem.c +++ b/system/physmem.c @@ -2602,6 +2602,17 @@ static void invalidate_and_set_dirty(MemoryRegion *mr, hwaddr addr, cpu_physical_memory_set_dirty_range(addr, length, dirty_log_mask); } +int64_t memory_section_set_dirty_bytemap(MemoryRegionSection *section, unsigned long *bytemap) +{ + ram_addr_t start = section->offset_within_region + + memory_region_get_ram_addr(section->mr); + ram_addr_t pages = int128_get64(section->size) >> TARGET_PAGE_BITS; + + hwaddr idx = BYTE_WORD( + section->offset_within_address_space >> TARGET_PAGE_BITS); + return cpu_physical_memory_set_dirty_bytemap(bytemap + idx, start, pages); +} + void memory_region_flush_rom_device(MemoryRegion *mr, hwaddr addr, hwaddr size) { /* -- Gitee From b0a62a84bd1c6ad5d4c11463371fcf267b56d902 Mon Sep 17 00:00:00 2001 From: libai Date: Mon, 4 Dec 2023 15:13:41 +0800 Subject: [PATCH 045/939] vhost: add vhost_dev_suspend/resume_op Introduce new vhost interface to support vhost device suspend & resume Signed-off-by: libai --- include/hw/virtio/vhost-backend.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/hw/virtio/vhost-backend.h b/include/hw/virtio/vhost-backend.h index 71b02e4a12..84b8fa1075 100644 --- a/include/hw/virtio/vhost-backend.h +++ b/include/hw/virtio/vhost-backend.h @@ -155,6 +155,9 @@ typedef int (*vhost_set_device_state_fd_op)(struct vhost_dev *dev, 
Error **errp); typedef int (*vhost_check_device_state_op)(struct vhost_dev *dev, Error **errp); +typedef int (*vhost_dev_suspend_op)(struct vhost_dev *dev); +typedef int (*vhost_dev_resume_op)(struct vhost_dev *dev); + typedef struct VhostOps { VhostBackendType backend_type; vhost_backend_init vhost_backend_init; @@ -208,6 +211,8 @@ typedef struct VhostOps { vhost_supports_device_state_op vhost_supports_device_state; vhost_set_device_state_fd_op vhost_set_device_state_fd; vhost_check_device_state_op vhost_check_device_state; + vhost_dev_suspend_op vhost_dev_suspend; + vhost_dev_resume_op vhost_dev_resume; } VhostOps; int vhost_backend_update_device_iotlb(struct vhost_dev *dev, -- Gitee From a7f9a67ee98a5261f7639619055034f40bccfef0 Mon Sep 17 00:00:00 2001 From: libai Date: Mon, 4 Dec 2023 15:22:20 +0800 Subject: [PATCH 046/939] vhost: implement vhost-vdpa suspend/resume vhost-vdpa implements the vhost_dev_suspend interface, which will be called during the shutdown phase of the live migration source virtual machine to suspend the device but not reset the device information. vhost-vdpa implements the vhost_dev_resume interface. If the live migration fails, it will be called during the startup phase of the source virtual machine. Enable the device but set the status, etc. Signed-off-by: libai --- hw/virtio/vhost-vdpa.c | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index 037a9c6e4c..063e941544 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -1513,6 +1513,45 @@ static bool vhost_vdpa_force_iommu(struct vhost_dev *dev) return true; } +static int vhost_vdpa_suspend_device(struct vhost_dev *dev) +{ + struct vhost_vdpa *v = dev->opaque; + int ret; + + vhost_vdpa_svqs_stop(dev); + vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs); + + if (dev->vq_index + dev->nvqs != dev->vq_index_end) { + return 0; + } + + ret = vhost_vdpa_call(dev, VHOST_VDPA_SUSPEND, NULL); + memory_listener_unregister(&v->listener); + return ret; +} + +static int vhost_vdpa_resume_device(struct vhost_dev *dev) +{ + struct vhost_vdpa *v = dev->opaque; + bool ok; + + vhost_vdpa_host_notifiers_init(dev); + ok = vhost_vdpa_svqs_start(dev); + if (unlikely(!ok)) { + return -1; + } + for (int i = 0; i < v->dev->nvqs; ++i) { + vhost_vdpa_set_vring_ready(v, v->dev->vq_index + i); + } + + if (dev->vq_index + dev->nvqs != dev->vq_index_end) { + return 0; + } + + memory_listener_register(&v->listener, &address_space_memory); + return vhost_vdpa_call(dev, VHOST_VDPA_RESUME, NULL); +} + static int vhost_vdpa_log_sync(struct vhost_dev *dev) { struct vhost_vdpa *v = dev->opaque; @@ -1559,4 +1598,6 @@ const VhostOps vdpa_ops = { .vhost_log_sync = vhost_vdpa_log_sync, .vhost_set_config_call = vhost_vdpa_set_config_call, .vhost_reset_status = vhost_vdpa_reset_status, + .vhost_dev_suspend = vhost_vdpa_suspend_device, + .vhost_dev_resume = vhost_vdpa_resume_device, }; -- Gitee From 4c5a9a0703e227186639124f09cdf7214e40ea7d Mon Sep 17 00:00:00 2001 From: libai Date: Mon, 4 Dec 2023 15:27:34 +0800 Subject: [PATCH 047/939] vhost: implement vhost_vdpa_device_suspend/resume Implement vhost device suspend & resume interface Signed-off-by: jiangdongxu Signed-off-by: fangyi Signed-off-by: libai --- hw/virtio/meson.build | 2 +- hw/virtio/vdpa-dev-mig.c | 178 +++++++++++++++++++++++++++++++ hw/virtio/vhost.c | 138 ++++++++++++++++++++++++ include/hw/virtio/vdpa-dev-mig.h | 16 +++ include/hw/virtio/vdpa-dev.h | 1 + include/hw/virtio/vhost.h | 3 + 
migration/migration.c | 3 +- migration/migration.h | 2 + 8 files changed, 340 insertions(+), 3 deletions(-) create mode 100644 hw/virtio/vdpa-dev-mig.c create mode 100644 include/hw/virtio/vdpa-dev-mig.h diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build index c0055a7832..596651d113 100644 --- a/hw/virtio/meson.build +++ b/hw/virtio/meson.build @@ -5,7 +5,7 @@ system_virtio_ss.add(when: 'CONFIG_VIRTIO_MMIO', if_true: files('virtio-mmio.c') system_virtio_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('virtio-crypto.c')) system_virtio_ss.add(when: 'CONFIG_VHOST_VSOCK_COMMON', if_true: files('vhost-vsock-common.c')) system_virtio_ss.add(when: 'CONFIG_VIRTIO_IOMMU', if_true: files('virtio-iommu.c')) -system_virtio_ss.add(when: 'CONFIG_VHOST_VDPA_DEV', if_true: files('vdpa-dev.c')) +system_virtio_ss.add(when: 'CONFIG_VHOST_VDPA_DEV', if_true: files('vdpa-dev.c', 'vdpa-dev-mig.c')) specific_virtio_ss = ss.source_set() specific_virtio_ss.add(files('virtio.c')) diff --git a/hw/virtio/vdpa-dev-mig.c b/hw/virtio/vdpa-dev-mig.c new file mode 100644 index 0000000000..1d2bed2571 --- /dev/null +++ b/hw/virtio/vdpa-dev-mig.c @@ -0,0 +1,178 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2023. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + + * You should have received a copy of the GNU General Public License along + * with this program; if not, see . + */ + +#include +#include +#include "qemu/osdep.h" +#include "hw/virtio/vhost.h" +#include "hw/virtio/vdpa-dev.h" +#include "hw/virtio/virtio-bus.h" +#include "migration/migration.h" +#include "qemu/error-report.h" +#include "hw/virtio/vdpa-dev-mig.h" + +static int vhost_vdpa_call(struct vhost_dev *dev, unsigned long int request, + void *arg) +{ + struct vhost_vdpa *v = dev->opaque; + int fd = v->device_fd; + + if (dev->vhost_ops->backend_type != VHOST_BACKEND_TYPE_VDPA) { + error_report("backend type isn't VDPA. 
Operation not permitted!\n"); + return -EPERM; + } + + return ioctl(fd, request, arg); +} + +static int vhost_vdpa_device_suspend(VhostVdpaDevice *vdpa) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(vdpa); + BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); + int ret; + + if (!vdpa->started) { + return -EFAULT; + } + + if (!k->set_guest_notifiers) { + return -EFAULT; + } + + vdpa->started = false; + + ret = vhost_dev_suspend(&vdpa->dev, vdev, false); + if (ret) { + goto suspend_fail; + } + + ret = k->set_guest_notifiers(qbus->parent, vdpa->dev.nvqs, false); + if (ret < 0) { + error_report("vhost guest notifier cleanup failed: %d\n", ret); + goto set_guest_notifiers_fail; + } + + vhost_dev_disable_notifiers(&vdpa->dev, vdev); + return ret; + +set_guest_notifiers_fail: + ret = k->set_guest_notifiers(qbus->parent, vdpa->dev.nvqs, true); + if (ret) { + error_report("vhost guest notifier restore failed: %d\n", ret); + } + +suspend_fail: + vdpa->started = true; + return ret; +} + +static int vhost_vdpa_device_resume(VhostVdpaDevice *vdpa) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(vdpa); + BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); + int i, ret; + + if (!k->set_guest_notifiers) { + error_report("binding does not support guest notifiers\n"); + return -ENOSYS; + } + + ret = vhost_dev_enable_notifiers(&vdpa->dev, vdev); + if (ret < 0) { + error_report("Error enabling host notifiers: %d\n", ret); + return ret; + } + + ret = k->set_guest_notifiers(qbus->parent, vdpa->dev.nvqs, true); + if (ret < 0) { + error_report("Error binding guest notifier: %d\n", ret); + goto err_host_notifiers; + } + + vdpa->dev.acked_features = vdev->guest_features; + + ret = vhost_dev_resume(&vdpa->dev, vdev, false); + if (ret < 0) { + error_report("Error starting vhost: %d\n", ret); + goto err_guest_notifiers; + } + vdpa->started = true; + + /* + * guest_notifier_mask/pending not used yet, so just unmask + * everything here. virtio-pci will do the right thing by + * enabling/disabling irqfd. 
+ */ + for (i = 0; i < vdpa->dev.nvqs; i++) { + vhost_virtqueue_mask(&vdpa->dev, vdev, i, false); + } + + return ret; + +err_guest_notifiers: + k->set_guest_notifiers(qbus->parent, vdpa->dev.nvqs, false); +err_host_notifiers: + vhost_dev_disable_notifiers(&vdpa->dev, vdev); + return ret; +} + +static void vdpa_dev_vmstate_change(void *opaque, bool running, RunState state) +{ + VhostVdpaDevice *vdpa = VHOST_VDPA_DEVICE(opaque); + struct vhost_dev *hdev = &vdpa->dev; + int ret; + MigrationState *ms = migrate_get_current(); + MigrationIncomingState *mis = migration_incoming_get_current(); + + if (!running) { + if (ms->state == RUN_STATE_PAUSED) { + ret = vhost_vdpa_device_suspend(vdpa); + if (ret) { + error_report("suspend vdpa device failed: %d\n", ret); + if (ms->migration_thread_running) { + migrate_fd_cancel(ms); + } + } + } + } else { + if (ms->state == RUN_STATE_RESTORE_VM) { + ret = vhost_vdpa_device_resume(vdpa); + if (ret) { + error_report("migration dest resume device failed, abort!\n"); + exit(EXIT_FAILURE); + } + } + + if (mis->state == RUN_STATE_RESTORE_VM) { + vhost_vdpa_call(hdev, VHOST_VDPA_RESUME, NULL); + } + } +} + +void vdpa_migration_register(VhostVdpaDevice *vdev) +{ + vdev->vmstate = qdev_add_vm_change_state_handler(DEVICE(vdev), + vdpa_dev_vmstate_change, + DEVICE(vdev)); +} + +void vdpa_migration_unregister(VhostVdpaDevice *vdev) +{ + qemu_del_vm_change_state_handler(vdev->vmstate); +} diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index 438182d850..d073a6d5a5 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -2492,3 +2492,141 @@ bool used_memslots_is_exceeded(void) { return used_memslots_exceeded; } + +int vhost_dev_resume(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings) +{ + int i, r; + EventNotifier *e = &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier; + + /* should only be called after backend is connected */ + if (!hdev->vhost_ops) { + error_report("Missing vhost_ops! Operation not permitted!\n"); + return -EPERM; + } + + vdev->vhost_started = true; + hdev->started = true; + hdev->vdev = vdev; + + if (vhost_dev_has_iommu(hdev)) { + memory_listener_register(&hdev->iommu_listener, vdev->dma_as); + } + + r = hdev->vhost_ops->vhost_set_mem_table(hdev, hdev->mem); + if (r < 0) { + VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed"); + goto fail_mem; + } + for (i = 0; i < hdev->nvqs; ++i) { + r = vhost_virtqueue_start(hdev, + vdev, + hdev->vqs + i, + hdev->vq_index + i); + if (r < 0) { + goto fail_vq; + } + } + + r = event_notifier_init(e, 0); + if (r < 0) { + return r; + } + event_notifier_test_and_clear(e); + if (!vdev->use_guest_notifier_mask) { + vhost_config_mask(hdev, vdev, true); + } + if (vrings) { + r = vhost_dev_set_vring_enable(hdev, true); + if (r) { + goto fail_vq; + } + } + if (hdev->vhost_ops->vhost_dev_resume) { + r = hdev->vhost_ops->vhost_dev_resume(hdev); + if (r) { + goto fail_start; + } + } + if (vhost_dev_has_iommu(hdev)) { + hdev->vhost_ops->vhost_set_iotlb_callback(hdev, true); + + /* + * Update used ring information for IOTLB to work correctly, + * vhost-kernel code requires for this. 
+ */ + for (i = 0; i < hdev->nvqs; ++i) { + struct vhost_virtqueue *vq = hdev->vqs + i; + vhost_device_iotlb_miss(hdev, vq->used_phys, true); + } + } + vhost_start_config_intr(hdev); + return 0; +fail_start: + if (vrings) { + vhost_dev_set_vring_enable(hdev, false); + } +fail_vq: + while (--i >= 0) { + vhost_virtqueue_stop(hdev, + vdev, + hdev->vqs + i, + hdev->vq_index + i); + } + +fail_mem: + vdev->vhost_started = false; + hdev->started = false; + return r; +} + +int vhost_dev_suspend(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings) +{ + int i; + int ret = 0; + EventNotifier *e = &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier; + + /* should only be called after backend is connected */ + if (!hdev->vhost_ops) { + error_report("Missing vhost_ops! Operation not permitted!\n"); + return -EPERM; + } + + event_notifier_test_and_clear(e); + event_notifier_test_and_clear(&vdev->config_notifier); + + if (hdev->vhost_ops->vhost_dev_suspend) { + ret = hdev->vhost_ops->vhost_dev_suspend(hdev); + if (ret) { + goto fail_suspend; + } + } + if (vrings) { + ret = vhost_dev_set_vring_enable(hdev, false); + if (ret) { + goto fail_suspend; + } + } + for (i = 0; i < hdev->nvqs; ++i) { + vhost_virtqueue_stop(hdev, + vdev, + hdev->vqs + i, + hdev->vq_index + i); + } + + if (vhost_dev_has_iommu(hdev)) { + hdev->vhost_ops->vhost_set_iotlb_callback(hdev, false); + memory_listener_unregister(&hdev->iommu_listener); + } + vhost_stop_config_intr(hdev); + vhost_log_put(hdev, true); + hdev->started = false; + vdev->vhost_started = false; + hdev->vdev = NULL; + + return ret; + +fail_suspend: + event_notifier_test_and_clear(e); + + return ret; +} diff --git a/include/hw/virtio/vdpa-dev-mig.h b/include/hw/virtio/vdpa-dev-mig.h new file mode 100644 index 0000000000..89665ca747 --- /dev/null +++ b/include/hw/virtio/vdpa-dev-mig.h @@ -0,0 +1,16 @@ +/* + * Vhost Vdpa Device Migration Header + * + * Copyright (c) Huawei Technologies Co., Ltd. 2023. All Rights Reserved. 
+ */ + +#ifndef _VHOST_VDPA_MIGRATION_H +#define _VHOST_VDPA_MIGRATION_H + +#include "hw/virtio/vdpa-dev.h" + +void vdpa_migration_register(VhostVdpaDevice *vdev); + +void vdpa_migration_unregister(VhostVdpaDevice *vdev); + +#endif /* _VHOST_VDPA_MIGRATION_H */ diff --git a/include/hw/virtio/vdpa-dev.h b/include/hw/virtio/vdpa-dev.h index 4dbf98195c..43cbcef81b 100644 --- a/include/hw/virtio/vdpa-dev.h +++ b/include/hw/virtio/vdpa-dev.h @@ -38,6 +38,7 @@ struct VhostVdpaDevice { uint16_t queue_size; bool started; int (*post_init)(VhostVdpaDevice *v, Error **errp); + VMChangeStateEntry *vmstate; }; #endif diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h index 6ae86833e3..9ca5819deb 100644 --- a/include/hw/virtio/vhost.h +++ b/include/hw/virtio/vhost.h @@ -466,4 +466,7 @@ int vhost_save_backend_state(struct vhost_dev *dev, QEMUFile *f, Error **errp); */ int vhost_load_backend_state(struct vhost_dev *dev, QEMUFile *f, Error **errp); +int vhost_dev_resume(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings); +int vhost_dev_suspend(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings); + #endif diff --git a/migration/migration.c b/migration/migration.c index 23d9233bbe..dce22c2da5 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -99,7 +99,6 @@ static bool migration_object_check(MigrationState *ms, Error **errp); static int migration_maybe_pause(MigrationState *s, int *current_active_state, int new_state); -static void migrate_fd_cancel(MigrationState *s); static bool close_return_path_on_source(MigrationState *s); static void migration_downtime_start(MigrationState *s) @@ -1386,7 +1385,7 @@ void migrate_fd_error(MigrationState *s, const Error *error) migrate_set_error(s, error); } -static void migrate_fd_cancel(MigrationState *s) +void migrate_fd_cancel(MigrationState *s) { int old_state ; diff --git a/migration/migration.h b/migration/migration.h index 6aafa04314..2f26c9509b 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -551,4 +551,6 @@ void migration_rp_kick(MigrationState *s); int migration_stop_vm(RunState state); +void migrate_fd_cancel(MigrationState *s); + #endif -- Gitee From 556aaa9632862505548d5083d369e92590fb2087 Mon Sep 17 00:00:00 2001 From: libai Date: Mon, 4 Dec 2023 15:53:28 +0800 Subject: [PATCH 048/939] vhost: implement savevm_handler for vdpa device Register savevm_handler ops for vdpa devices to support migration:x Signed-off-by: libai --- hw/virtio/vdpa-dev-mig.c | 175 +++++++++++++++++++++++++++++++ include/hw/virtio/vdpa-dev-mig.h | 13 +++ linux-headers/linux/vhost.h | 9 ++ 3 files changed, 197 insertions(+) diff --git a/hw/virtio/vdpa-dev-mig.c b/hw/virtio/vdpa-dev-mig.c index 1d2bed2571..662d4a29dc 100644 --- a/hw/virtio/vdpa-dev-mig.c +++ b/hw/virtio/vdpa-dev-mig.c @@ -21,9 +21,21 @@ #include "hw/virtio/vhost.h" #include "hw/virtio/vdpa-dev.h" #include "hw/virtio/virtio-bus.h" +#include "migration/register.h" #include "migration/migration.h" #include "qemu/error-report.h" #include "hw/virtio/vdpa-dev-mig.h" +#include "migration/qemu-file-types.h" + +/* + * Flags used as delimiter: + * 0xffffffff => MSB 32-bit all 1s + * 0xef10 => emulated (virtual) function IO + * 0x0000 => 16-bits reserved for flags + */ +#define VDPA_MIG_FLAG_END_OF_STATE (0xffffffffef100001ULL) +#define VDPA_MIG_FLAG_DEV_CONFIG_STATE (0xffffffffef100002ULL) +#define VDPA_MIG_FLAG_DEV_SETUP_STATE (0xffffffffef100003ULL) static int vhost_vdpa_call(struct vhost_dev *dev, unsigned long int request, void *arg) @@ -39,6 +51,80 @@ static int 
vhost_vdpa_call(struct vhost_dev *dev, unsigned long int request, return ioctl(fd, request, arg); } +static int vhost_vdpa_set_mig_state(struct vhost_dev *dev, uint8_t state) +{ + return vhost_vdpa_call(dev, VHOST_VDPA_SET_MIG_STATE, &state); +} + +static int vhost_vdpa_dev_buffer_size(struct vhost_dev *dev, uint32_t *size) +{ + return vhost_vdpa_call(dev, VHOST_GET_DEV_BUFFER_SIZE, size); +} + +static int vhost_vdpa_dev_buffer_save(struct vhost_dev *dev, QEMUFile *f) +{ + struct vhost_vdpa_config *config; + unsigned long config_size = offsetof(struct vhost_vdpa_config, buf); + uint32_t buffer_size = 0; + int ret; + + ret = vhost_vdpa_dev_buffer_size(dev, &buffer_size); + if (ret) { + error_report("get dev buffer size failed: %d\n", ret); + return ret; + } + + qemu_put_be32(f, buffer_size); + + config = g_malloc(buffer_size + config_size); + config->off = 0; + config->len = buffer_size; + + ret = vhost_vdpa_call(dev, VHOST_GET_DEV_BUFFER, config); + if (ret) { + error_report("get dev buffer failed: %d\n", ret); + goto free; + } + + qemu_put_buffer(f, config->buf, buffer_size); +free: + g_free(config); + + return ret; +} + +static int vhost_vdpa_dev_buffer_load(struct vhost_dev *dev, QEMUFile *f) +{ + struct vhost_vdpa_config *config; + unsigned long config_size = offsetof(struct vhost_vdpa_config, buf); + uint32_t buffer_size, recv_size; + int ret; + + buffer_size = qemu_get_be32(f); + + config = g_malloc(buffer_size + config_size); + config->off = 0; + config->len = buffer_size; + + recv_size = qemu_get_buffer(f, config->buf, buffer_size); + if (recv_size != buffer_size) { + error_report("read dev mig buffer failed, buffer_size: %u, " + "recv_size: %u\n", buffer_size, recv_size); + ret = -EINVAL; + goto free; + } + + ret = vhost_vdpa_call(dev, VHOST_SET_DEV_BUFFER, config); + if (ret) { + error_report("set dev buffer failed: %d\n", ret); + } + +free: + g_free(config); + + return ret; +} + static int vhost_vdpa_device_suspend(VhostVdpaDevice *vdpa) { VirtIODevice *vdev = VIRTIO_DEVICE(vdpa); @@ -165,14 +251,103 @@ static void vdpa_dev_vmstate_change(void *opaque, bool running, RunState state) } } +static int vdpa_save_setup(QEMUFile *f, void *opaque) +{ + qemu_put_be64(f, VDPA_MIG_FLAG_DEV_SETUP_STATE); + qemu_put_be64(f, VDPA_MIG_FLAG_END_OF_STATE); + + return qemu_file_get_error(f); +} + +static int vdpa_save_complete_precopy(QEMUFile *f, void *opaque) +{ + VhostVdpaDevice *vdev = VHOST_VDPA_DEVICE(opaque); + struct vhost_dev *hdev = &vdev->dev; + int ret; + + qemu_put_be64(f, VDPA_MIG_FLAG_DEV_CONFIG_STATE); + ret = vhost_vdpa_dev_buffer_save(hdev, f); + if (ret) { + error_report("Save vdpa device buffer failed: %d\n", ret); + return ret; + } + qemu_put_be64(f, VDPA_MIG_FLAG_END_OF_STATE); + + return qemu_file_get_error(f); +} + +static int vdpa_load_state(QEMUFile *f, void *opaque, int version_id) +{ + VhostVdpaDevice *vdev = VHOST_VDPA_DEVICE(opaque); + struct vhost_dev *hdev = &vdev->dev; + + int ret; + uint64_t data; + + data = qemu_get_be64(f); + while (data != VDPA_MIG_FLAG_END_OF_STATE) { + if (data == VDPA_MIG_FLAG_DEV_SETUP_STATE) { + data = qemu_get_be64(f); + if (data == VDPA_MIG_FLAG_END_OF_STATE) { + return 0; + } else { + error_report("SETUP STATE: EOS not found 0x%lx\n", data); + return -EINVAL; + } + } else if (data == VDPA_MIG_FLAG_DEV_CONFIG_STATE) { + ret = vhost_vdpa_dev_buffer_load(hdev, f); + if (ret) { + error_report("fail to restore device buffer.\n"); + return ret; + } + } + + ret = qemu_file_get_error(f); + if (ret) { + error_report("qemu file error: %d\n", 
ret); + return ret; + } + data = qemu_get_be64(f); + } + + return 0; +} + +static int vdpa_load_setup(QEMUFile *f, void *opaque) +{ + VhostVdpaDevice *v = VHOST_VDPA_DEVICE(opaque); + struct vhost_dev *hdev = &v->dev; + int ret = 0; + + ret = vhost_vdpa_set_mig_state(hdev, VDPA_DEVICE_PRE_START); + if (ret) { + error_report("pre start device failed: %d\n", ret); + goto out; + } + + return qemu_file_get_error(f); +out: + return ret; +} + +static SaveVMHandlers savevm_vdpa_handlers = { + .save_setup = vdpa_save_setup, + .save_live_complete_precopy = vdpa_save_complete_precopy, + .load_state = vdpa_load_state, + .load_setup = vdpa_load_setup, +}; + void vdpa_migration_register(VhostVdpaDevice *vdev) { vdev->vmstate = qdev_add_vm_change_state_handler(DEVICE(vdev), vdpa_dev_vmstate_change, DEVICE(vdev)); + register_savevm_live("vdpa", -1, 1, + &savevm_vdpa_handlers, DEVICE(vdev)); } void vdpa_migration_unregister(VhostVdpaDevice *vdev) { + unregister_savevm(VMSTATE_IF(&vdev->parent_obj.parent_obj), "vdpa", DEVICE(vdev)); qemu_del_vm_change_state_handler(vdev->vmstate); } diff --git a/include/hw/virtio/vdpa-dev-mig.h b/include/hw/virtio/vdpa-dev-mig.h index 89665ca747..adc1d657f7 100644 --- a/include/hw/virtio/vdpa-dev-mig.h +++ b/include/hw/virtio/vdpa-dev-mig.h @@ -9,6 +9,19 @@ #include "hw/virtio/vdpa-dev.h" +enum { + VDPA_DEVICE_START, + VDPA_DEVICE_STOP, + VDPA_DEVICE_PRE_START, + VDPA_DEVICE_PRE_STOP, + VDPA_DEVICE_CANCEL, + VDPA_DEVICE_POST_START, + VDPA_DEVICE_START_ASYNC, + VDPA_DEVICE_STOP_ASYNC, + VDPA_DEVICE_PRE_START_ASYNC, + VDPA_DEVICE_QUERY_OP_STATE, +}; + void vdpa_migration_register(VhostVdpaDevice *vdev); void vdpa_migration_unregister(VhostVdpaDevice *vdev); diff --git a/linux-headers/linux/vhost.h b/linux-headers/linux/vhost.h index 19dc7fd36c..a08e980a1e 100644 --- a/linux-headers/linux/vhost.h +++ b/linux-headers/linux/vhost.h @@ -231,4 +231,13 @@ */ #define VHOST_VDPA_GET_VRING_DESC_GROUP _IOWR(VHOST_VIRTIO, 0x7F, \ struct vhost_vring_state) + +/* set and get device buffer */ +#define VHOST_GET_DEV_BUFFER _IOR(VHOST_VIRTIO, 0xb0, struct vhost_vdpa_config) +#define VHOST_SET_DEV_BUFFER _IOW(VHOST_VIRTIO, 0xb1, struct vhost_vdpa_config) +#define VHOST_GET_DEV_BUFFER_SIZE _IOR(VHOST_VIRTIO, 0xb3, __u32) + +/* set device migtration state */ +#define VHOST_VDPA_SET_MIG_STATE _IOW(VHOST_VIRTIO, 0xb2, __u8) + #endif -- Gitee From 229737ca91d4e81b4a14143da9981bd59b80a539 Mon Sep 17 00:00:00 2001 From: libai Date: Mon, 4 Dec 2023 15:57:35 +0800 Subject: [PATCH 049/939] vhost: implement post resume bh Set vdpa device mig state to post start when vm post start Signed-off-by: libai --- hw/virtio/vdpa-dev-mig.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/hw/virtio/vdpa-dev-mig.c b/hw/virtio/vdpa-dev-mig.c index 662d4a29dc..1872f11f3f 100644 --- a/hw/virtio/vdpa-dev-mig.c +++ b/hw/virtio/vdpa-dev-mig.c @@ -26,6 +26,7 @@ #include "qemu/error-report.h" #include "hw/virtio/vdpa-dev-mig.h" #include "migration/qemu-file-types.h" +#include "qemu/main-loop.h" /* * Flags used as delimiter: @@ -218,6 +219,18 @@ err_host_notifiers: return ret; } +static void vdpa_dev_migration_handle_incoming_bh(void *opaque) +{ + struct vhost_dev *hdev = opaque; + int ret; + + /* Post start device, unsupport rollback if failed! 
*/ + ret = vhost_vdpa_set_mig_state(hdev, VDPA_DEVICE_POST_START); + if (ret) { + error_report("Failed to set state: POST_START\n"); + } +} + static void vdpa_dev_vmstate_change(void *opaque, bool running, RunState state) { VhostVdpaDevice *vdpa = VHOST_VDPA_DEVICE(opaque); @@ -247,6 +260,10 @@ static void vdpa_dev_vmstate_change(void *opaque, bool running, RunState state) if (mis->state == RUN_STATE_RESTORE_VM) { vhost_vdpa_call(hdev, VHOST_VDPA_RESUME, NULL); + /* post resume */ + mis->bh = qemu_bh_new(vdpa_dev_migration_handle_incoming_bh, + hdev); + qemu_bh_schedule(mis->bh); } } } -- Gitee From 5a4e9ad98edc1ba5c1e93f0e24753c1a8355ffce Mon Sep 17 00:00:00 2001 From: dinglimin Date: Wed, 13 Mar 2024 13:49:37 +0800 Subject: [PATCH 050/939] target/i386/sev: Fix missing ERRP_GUARD() for error_prepend() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cheery-pick from f55cceac8c03e639711490f08996c32861591435 As the comment in qapi/error, passing @errp to error_prepend() requires ERRP_GUARD(): * = Why, when and how to use ERRP_GUARD() = * * Without ERRP_GUARD(), use of the @errp parameter is restricted: ... * - It should not be passed to error_prepend(), error_vprepend() or * error_append_hint(), because that doesn't work with &error_fatal. * ERRP_GUARD() lifts these restrictions. * * To use ERRP_GUARD(), add it right at the beginning of the function. * @errp can then be used without worrying about the argument being * NULL or &error_fatal. ERRP_GUARD() could avoid the case when @errp is the pointer of error_fatal, the user can't see this additional information, because exit() happens in error_setg earlier than information is added [1]. The sev_inject_launch_secret() passes @errp to error_prepend(), and as an APIs defined in target/i386/sev.h, it is necessary to protect its @errp with ERRP_GUARD(). To avoid the issue like [1] said, add missing ERRP_GUARD() at the beginning of this function. [1]: Issue description in the commit message of commit ae7c80a7bd73 ("error: New macro ERRP_GUARD()"). Cc: Paolo Bonzini Cc: Marcelo Tosatti Signed-off-by: Zhao Liu Reviewed-by: Thomas Huth Message-ID: <20240229143914.1977550-17-zhao1.liu@linux.intel.com> Signed-off-by: Philippe Mathieu-Daudé Signed-off-by: dinglimin --- target/i386/sev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/target/i386/sev.c b/target/i386/sev.c index 9a71246682..1a9d1db7a8 100644 --- a/target/i386/sev.c +++ b/target/i386/sev.c @@ -1044,6 +1044,7 @@ sev_encrypt_flash(uint8_t *ptr, uint64_t len, Error **errp) int sev_inject_launch_secret(const char *packet_hdr, const char *secret, uint64_t gpa, Error **errp) { + ERRP_GUARD(); struct kvm_sev_launch_secret input; g_autofree guchar *data = NULL, *hdr = NULL; int error, ret = 1; -- Gitee From 6203b11d2a900c60d2ee3c3a980d2c385050eb62 Mon Sep 17 00:00:00 2001 From: yexiao Date: Thu, 10 Feb 2022 21:37:49 +0800 Subject: [PATCH 051/939] block/mirror: fix file-system went to read-only after block-mirror config vm disk with prdm, keep the disk writing data continuously during block-mirror, the file-system will went to read-only after block-mirror, fix it. 
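For illustration only (the device and target names below are made up, not taken from the original report), the affected scenario is a raw passthrough (PRDM) LUN that the guest keeps writing to while it is mirrored to another raw device, e.g. via a QMP command along these lines:
  { "execute": "drive-mirror",
    "arguments": { "device": "drive-scsi0-0-0-1",
                   "target": "/dev/mapper/mpathb",
                   "format": "raw", "mode": "existing", "sync": "full" } }
The change below additionally opens the mirror filter node with BDRV_O_NOCACHE, presumably so the filter does not introduce cached I/O on top of a cache=none passthrough backend.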
Signed-off-by: caojinhua Signed-off-by: jiangdongxu --- block/mirror.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/mirror.c b/block/mirror.c index cd9d3ad4a8..20b3e8e5d8 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -1774,7 +1774,7 @@ static BlockJob *mirror_start_job( * reads on the top, while disabling it in the intermediate nodes, and make * the backing chain writable. */ mirror_top_bs = bdrv_new_open_driver(&bdrv_mirror_top, filter_node_name, - BDRV_O_RDWR, errp); + BDRV_O_RDWR | BDRV_O_NOCACHE, errp); if (mirror_top_bs == NULL) { return NULL; } -- Gitee From 652325f9a04143ffabf5e9a418253a05e927ec37 Mon Sep 17 00:00:00 2001 From: WangJian Date: Wed, 9 Feb 2022 11:18:21 +0800 Subject: [PATCH 052/939] block: enable cache mode of empty cdrom Enable the cache mode even if the cdrom is empty. Signed-off-by: wangjian161 --- blockdev.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/blockdev.c b/blockdev.c index c91f49e7b6..bc2099e9da 100644 --- a/blockdev.c +++ b/blockdev.c @@ -493,6 +493,7 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts, QDict *interval_dict = NULL; QList *interval_list = NULL; const char *id; + const char *cache; BlockdevDetectZeroesOptions detect_zeroes = BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF; const char *throttling_group = NULL; @@ -580,6 +581,21 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts, read_only = qemu_opt_get_bool(opts, BDRV_OPT_READ_ONLY, false); + if (!file || !*file) { + cache = qdict_get_try_str(bs_opts, BDRV_OPT_CACHE_NO_FLUSH); + if (cache && !strcmp(cache, "on")) { + bdrv_flags |= BDRV_O_NO_FLUSH; + } + + cache = qdict_get_try_str(bs_opts, BDRV_OPT_CACHE_DIRECT); + if (cache && !strcmp(cache, "on")) { + bdrv_flags |= BDRV_O_NOCACHE; + } + + qdict_del(bs_opts, BDRV_OPT_CACHE_NO_FLUSH); + qdict_del(bs_opts, BDRV_OPT_CACHE_DIRECT); + } + /* init */ if ((!file || !*file) && !qdict_size(bs_opts)) { BlockBackendRootState *blk_rs; -- Gitee From 85307e997e4ee7a50a87ac2ac218911c0058d8e3 Mon Sep 17 00:00:00 2001 From: Pan Nengyuan Date: Mon, 13 Jan 2020 15:53:32 +0800 Subject: [PATCH 053/939] scsi-disk: define props in scsi_block_disk to avoid memleaks scsi_block_realize() uses scsi_realize() to init some props, but these props are not defined in scsi_block_properties, so they will not be freed. This patch defines these props in scsi_block_properties to avoid memleaks. Signed-off-by: Pan Nengyuan Signed-off-by: Yan Wang Signed-off-by: shaodenghui --- hw/scsi/scsi-disk.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c index 6691f5edb8..f638854ebf 100644 --- a/hw/scsi/scsi-disk.c +++ b/hw/scsi/scsi-disk.c @@ -3241,9 +3241,7 @@ static const TypeInfo scsi_cd_info = { #ifdef __linux__ static Property scsi_block_properties[] = { - DEFINE_BLOCK_ERROR_PROPERTIES(SCSIDiskState, qdev.conf), - DEFINE_PROP_DRIVE("drive", SCSIDiskState, qdev.conf.blk), - DEFINE_PROP_BOOL("share-rw", SCSIDiskState, qdev.conf.share_rw, false), + DEFINE_SCSI_DISK_PROPERTIES(), DEFINE_PROP_UINT16("rotation_rate", SCSIDiskState, rotation_rate, 0), DEFINE_PROP_UINT64("max_unmap_size", SCSIDiskState, max_unmap_size, DEFAULT_MAX_UNMAP_SIZE), -- Gitee From 48f32788794e061ab0b359fe194c964849bb3040 Mon Sep 17 00:00:00 2001 From: WangJian Date: Wed, 9 Feb 2022 11:10:42 +0800 Subject: [PATCH 054/939] qemu-pr: fix ioctl failure for multipath disk We use ioctl to detect multipath devices. 
However, we only set flags in struct dm_ioctl (the argument to ioctl) and left other fields in random, which may cause the failure of calling ioctl. Hence, we set other fields to 0 to avoid the failure. Signed-off-by: wangjian161 Signed-off-by: shaodenghui --- scsi/qemu-pr-helper.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scsi/qemu-pr-helper.c b/scsi/qemu-pr-helper.c index c6c6347e9b..655404fd07 100644 --- a/scsi/qemu-pr-helper.c +++ b/scsi/qemu-pr-helper.c @@ -285,9 +285,12 @@ static void multipath_pr_init(void) static int is_mpath(int fd) { - struct dm_ioctl dm = { .flags = DM_NOFLUSH_FLAG }; + struct dm_ioctl dm; struct dm_target_spec *tgt; + memset(&dm, 0, sizeof(struct dm_ioctl)); + dm.flags = DM_NOFLUSH_FLAG; + tgt = dm_dev_ioctl(fd, DM_TABLE_STATUS, &dm); if (!tgt) { if (errno == ENXIO) { -- Gitee From aac11bd40369aa31c9b3efb701242cc307ce5645 Mon Sep 17 00:00:00 2001 From: WangJian Date: Wed, 9 Feb 2022 11:42:47 +0800 Subject: [PATCH 055/939] scsi: cdrom: Fix crash after remote cdrom detached There is a small window between the twice blk_is_available in scsi_disk_emulate_command which would cause crash due to the later assertion if the remote cdrom is detached in this window. So this patch replaces assertions with return to avoid qemu crash. Signed-off-by: wangjian161 Signed-off-by: shaodenghui --- hw/scsi/scsi-disk.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c index f638854ebf..7f581efce8 100644 --- a/hw/scsi/scsi-disk.c +++ b/hw/scsi/scsi-disk.c @@ -2021,7 +2021,10 @@ static int32_t scsi_disk_emulate_command(SCSIRequest *req, uint8_t *buf) memset(outbuf, 0, r->buflen); switch (req->cmd.buf[0]) { case TEST_UNIT_READY: - assert(blk_is_available(s->qdev.conf.blk)); + if (!blk_is_available(s->qdev.conf.blk)) { + scsi_check_condition(r, SENSE_CODE(NO_MEDIUM)); + return 0; + } break; case INQUIRY: buflen = scsi_disk_emulate_inquiry(req, outbuf); -- Gitee From f2837d186532fb82ed01dbe32bdcf9dda6b06258 Mon Sep 17 00:00:00 2001 From: WangJian Date: Wed, 9 Feb 2022 16:34:05 +0800 Subject: [PATCH 056/939] scsi: bugfix: fix division by zero Error of PRDM disk may cause divide by zero in scsi_read_complete(), so add LOG and assert(). Signed-off-by: wangjian161 Signed-off-by: shaodenghui --- hw/scsi/scsi-generic.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/hw/scsi/scsi-generic.c b/hw/scsi/scsi-generic.c index 2417f0ad84..22efcd09a6 100644 --- a/hw/scsi/scsi-generic.c +++ b/hw/scsi/scsi-generic.c @@ -192,6 +192,10 @@ static int scsi_handle_inquiry_reply(SCSIGenericReq *r, SCSIDevice *s, int len) (r->req.cmd.buf[1] & 0x01)) { page = r->req.cmd.buf[2]; if (page == 0xb0 && r->buflen >= 8) { + if (s->blocksize == 0) { + qemu_log("device blocksize is 0!\n"); + abort(); + } uint8_t buf[16] = {}; uint8_t buf_used = MIN(r->buflen, 16); uint64_t max_transfer = calculate_max_transfer(s); @@ -326,11 +330,23 @@ static void scsi_read_complete(void * opaque, int ret) /* Snoop READ CAPACITY output to set the blocksize. */ if (r->req.cmd.buf[0] == READ_CAPACITY_10 && (ldl_be_p(&r->buf[0]) != 0xffffffffU || s->max_lba == 0)) { - s->blocksize = ldl_be_p(&r->buf[4]); + int new_blocksize = ldl_be_p(&r->buf[4]); + if (s->blocksize != new_blocksize) { + qemu_log("device id=%s type=%d: blocksize %d change to %d\n", + s->qdev.id ? 
s->qdev.id : "null", s->type, + s->blocksize, new_blocksize); + } + s->blocksize = new_blocksize; s->max_lba = ldl_be_p(&r->buf[0]) & 0xffffffffULL; } else if (r->req.cmd.buf[0] == SERVICE_ACTION_IN_16 && (r->req.cmd.buf[1] & 31) == SAI_READ_CAPACITY_16) { - s->blocksize = ldl_be_p(&r->buf[8]); + int new_blocksize = ldl_be_p(&r->buf[8]); + if (s->blocksize != new_blocksize) { + qemu_log("device id=%s type=%d: blocksize %d change to %d\n", + s->qdev.id ? s->qdev.id : "null", s->type, + s->blocksize, new_blocksize); + } + s->blocksize = new_blocksize; s->max_lba = ldq_be_p(&r->buf[0]); } -- Gitee From cfc15dc456126a6fb811f0c51af8d8ce5c4a4a1b Mon Sep 17 00:00:00 2001 From: yexiao Date: Thu, 21 Jan 2021 15:46:45 +0800 Subject: [PATCH 057/939] qapi/block-core: Add retry option for error action Add a new error action 'retry' to support retry on errors. Signed-off-by: Jiahui Cen Signed-off-by: Ying Fang Signed-off-by: Alex Chen --- blockdev.c | 2 ++ qapi/block-core.json | 8 ++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/blockdev.c b/blockdev.c index c91f49e7b6..2817f73fad 100644 --- a/blockdev.c +++ b/blockdev.c @@ -326,6 +326,8 @@ static int parse_block_error_action(const char *buf, bool is_read, Error **errp) return BLOCKDEV_ON_ERROR_STOP; } else if (!strcmp(buf, "report")) { return BLOCKDEV_ON_ERROR_REPORT; + } else if (!strcmp(buf, "retry")) { + return BLOCKDEV_ON_ERROR_RETRY; } else { error_setg(errp, "'%s' invalid %s error action", buf, is_read ? "read" : "write"); diff --git a/qapi/block-core.json b/qapi/block-core.json index 1444624590..ded6f0f6d2 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -1286,10 +1286,12 @@ # # @auto: inherit the error handling policy of the backend (since: 2.7) # +# @retry: retrying IO with errors +# # Since: 1.3 ## { 'enum': 'BlockdevOnError', - 'data': ['report', 'ignore', 'enospc', 'stop', 'auto'] } + 'data': ['report', 'ignore', 'enospc', 'stop', 'auto', 'retry'] } ## # @MirrorSyncMode: @@ -5480,10 +5482,12 @@ # # @stop: error caused VM to be stopped # +# @retry: retry IO with errors +# # Since: 2.1 ## { 'enum': 'BlockErrorAction', - 'data': [ 'ignore', 'report', 'stop' ] } + 'data': [ 'ignore', 'report', 'stop', 'retry' ] } ## # @BLOCK_IMAGE_CORRUPTED: -- Gitee From 9567fce96050342f393f546d3c5131118c3cad7c Mon Sep 17 00:00:00 2001 From: yexiao Date: Thu, 21 Jan 2021 15:46:46 +0800 Subject: [PATCH 058/939] block-backend: Introduce retry timer Add a timer to regularly trigger retry on errors. Signed-off-by: Jiahui Cen Signed-off-by: Ying Fang Signed-off-by: Alex Chen --- block/block-backend.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/block/block-backend.c b/block/block-backend.c index ec21148806..2f56cc8382 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -33,6 +33,9 @@ #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ +/* block backend default retry interval */ +#define BLOCK_BACKEND_DEFAULT_RETRY_INTERVAL 1000 + typedef struct BlockBackendAioNotifier { void (*attached_aio_context)(AioContext *new_context, void *opaque); void (*detach_aio_context)(void *opaque); @@ -92,6 +95,15 @@ struct BlockBackend { * Accessed with atomic ops. */ unsigned int in_flight; + + /* Timer for retry on errors. */ + QEMUTimer *retry_timer; + /* Interval in ms to trigger next retry. */ + int64_t retry_interval; + /* Start time of the first error. Used to check timeout. */ + int64_t retry_start_time; + /* Retry timeout. 0 represents infinite retry. 
*/ + int64_t retry_timeout; }; typedef struct BlockBackendAIOCB { @@ -368,6 +380,11 @@ BlockBackend *blk_new(AioContext *ctx, uint64_t perm, uint64_t shared_perm) blk->on_read_error = BLOCKDEV_ON_ERROR_REPORT; blk->on_write_error = BLOCKDEV_ON_ERROR_ENOSPC; + blk->retry_timer = NULL; + blk->retry_interval = BLOCK_BACKEND_DEFAULT_RETRY_INTERVAL; + blk->retry_start_time = 0; + blk->retry_timeout = 0; + block_acct_init(&blk->stats); qemu_mutex_init(&blk->queued_requests_lock); @@ -508,6 +525,10 @@ static void blk_delete(BlockBackend *blk) QTAILQ_REMOVE(&block_backends, blk, link); drive_info_del(blk->legacy_dinfo); block_acct_cleanup(&blk->stats); + if (blk->retry_timer) { + timer_del(blk->retry_timer); + timer_free(blk->retry_timer); + } g_free(blk); } -- Gitee From 94580294f0fda3c715caa19f4b33718212c9c531 Mon Sep 17 00:00:00 2001 From: yexiao Date: Thu, 21 Jan 2021 15:46:47 +0800 Subject: [PATCH 059/939] block-backend: Add device specific retry callback Add retry_request_cb in BlockDevOps to do device specific retry action. Backend's timer would be registered only when the backend is set 'retry' on errors and the device supports retry action. Signed-off-by: Jiahui Cen Signed-off-by: Ying Fang Signed-off-by: Alex Chen --- block/block-backend.c | 8 ++++++++ include/sysemu/block-backend-common.h | 4 ++++ 2 files changed, 12 insertions(+) diff --git a/block/block-backend.c b/block/block-backend.c index 2f56cc8382..7e25d5a058 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -1123,6 +1123,14 @@ void blk_set_dev_ops(BlockBackend *blk, const BlockDevOps *ops, blk->dev_ops = ops; blk->dev_opaque = opaque; + if ((blk->on_read_error == BLOCKDEV_ON_ERROR_RETRY || + blk->on_write_error == BLOCKDEV_ON_ERROR_RETRY) && + ops->retry_request_cb) { + blk->retry_timer = aio_timer_new(blk->ctx, QEMU_CLOCK_REALTIME, + SCALE_MS, ops->retry_request_cb, + opaque); + } + /* Are we currently quiesced? Should we enforce this right now? */ if (qatomic_read(&blk->quiesce_counter) && ops && ops->drained_begin) { ops->drained_begin(opaque); diff --git a/include/sysemu/block-backend-common.h b/include/sysemu/block-backend-common.h index 780cea7305..b76df8834a 100644 --- a/include/sysemu/block-backend-common.h +++ b/include/sysemu/block-backend-common.h @@ -71,6 +71,10 @@ typedef struct BlockDevOps { * Is the device still busy? */ bool (*drained_poll)(void *opaque); + /* + * Runs when retrying failed requests. + */ + void (*retry_request_cb)(void *opaque); /* * I/O API functions. These functions are thread-safe. -- Gitee From 7bcf4385f518580509990ff71c8209505c887abc Mon Sep 17 00:00:00 2001 From: yexiao Date: Thu, 21 Jan 2021 15:46:48 +0800 Subject: [PATCH 060/939] block-backend: Enable retry action on errors Enable retry action when backend's retry timer is available. It would trigger the timer to do device specific retry action. Signed-off-by: Jiahui Cen Signed-off-by: Ying Fang Signed-off-by: Alex Chen --- block/block-backend.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/block/block-backend.c b/block/block-backend.c index 7e25d5a058..e62808fc03 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -2179,6 +2179,9 @@ BlockErrorAction blk_get_error_action(BlockBackend *blk, bool is_read, return BLOCK_ERROR_ACTION_REPORT; case BLOCKDEV_ON_ERROR_IGNORE: return BLOCK_ERROR_ACTION_IGNORE; + case BLOCKDEV_ON_ERROR_RETRY: + return (blk->retry_timer) ? 
+ BLOCK_ERROR_ACTION_RETRY : BLOCK_ERROR_ACTION_REPORT; case BLOCKDEV_ON_ERROR_AUTO: default: abort(); @@ -2227,6 +2230,10 @@ void blk_error_action(BlockBackend *blk, BlockErrorAction action, qemu_system_vmstop_request_prepare(); send_qmp_error_event(blk, action, is_read, error); qemu_system_vmstop_request(RUN_STATE_IO_ERROR); + } else if (action == BLOCK_ERROR_ACTION_RETRY) { + timer_mod(blk->retry_timer, qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + + blk->retry_interval); + send_qmp_error_event(blk, action, is_read, error); } else { send_qmp_error_event(blk, action, is_read, error); } -- Gitee From b4bb154e6587b6d3fef819efcced803e309c4e05 Mon Sep 17 00:00:00 2001 From: yexiao Date: Thu, 21 Jan 2021 15:46:49 +0800 Subject: [PATCH 061/939] block-backend: Add timeout support for retry Retry should only be triggered when timeout is not reached, so let's check timeout before retry. Device should also reset retry_start_time after successful retry. Signed-off-by: Jiahui Cen Signed-off-by: Ying Fang Signed-off-by: Alex Chen --- block/block-backend.c | 25 ++++++++++++++++++++- include/sysemu/block-backend-global-state.h | 1 + 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/block/block-backend.c b/block/block-backend.c index e62808fc03..919699bb70 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -2149,6 +2149,29 @@ void blk_drain_all(void) bdrv_drain_all_end(); } +static bool blk_error_retry_timeout(BlockBackend *blk) +{ + /* No timeout set, infinite retries. */ + if (!blk->retry_timeout) { + return false; + } + + /* The first time an error occurs. */ + if (!blk->retry_start_time) { + blk->retry_start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); + return false; + } + + return qemu_clock_get_ms(QEMU_CLOCK_REALTIME) > (blk->retry_start_time + + blk->retry_timeout); +} + +void blk_error_retry_reset_timeout(BlockBackend *blk) +{ + if (blk->retry_timer && blk->retry_start_time) + blk->retry_start_time = 0; +} + void blk_set_on_error(BlockBackend *blk, BlockdevOnError on_read_error, BlockdevOnError on_write_error) { @@ -2180,7 +2203,7 @@ BlockErrorAction blk_get_error_action(BlockBackend *blk, bool is_read, case BLOCKDEV_ON_ERROR_IGNORE: return BLOCK_ERROR_ACTION_IGNORE; case BLOCKDEV_ON_ERROR_RETRY: - return (blk->retry_timer) ? + return (blk->retry_timer && !blk_error_retry_timeout(blk)) ? BLOCK_ERROR_ACTION_RETRY : BLOCK_ERROR_ACTION_REPORT; case BLOCKDEV_ON_ERROR_AUTO: default: diff --git a/include/sysemu/block-backend-global-state.h b/include/sysemu/block-backend-global-state.h index 49c12b0fa9..7f59fd411d 100644 --- a/include/sysemu/block-backend-global-state.h +++ b/include/sysemu/block-backend-global-state.h @@ -84,6 +84,7 @@ int blk_commit_all(void); bool blk_in_drain(BlockBackend *blk); void blk_drain(BlockBackend *blk); void blk_drain_all(void); +void blk_error_retry_reset_timeout(BlockBackend *blk); void blk_set_on_error(BlockBackend *blk, BlockdevOnError on_read_error, BlockdevOnError on_write_error); bool blk_supports_write_perm(BlockBackend *blk); -- Gitee From d777d1585603aa7599ae8bac4492fafdf1e4b109 Mon Sep 17 00:00:00 2001 From: yexiao Date: Thu, 21 Jan 2021 15:46:50 +0800 Subject: [PATCH 062/939] block: Add error retry param setting Add "retry_interval" and "retry_timeout" parameter for drive and device option. These parameter are valid only when werror/rerror=retry. eg. 
-device device_name,drive=drive_id,rerror=retry,retry_interval=1000,retry_timeout=5000 Signed-off-by: Jiahui Cen Signed-off-by: Ying Fang Signed-off-by: Alex Chen --- block/block-backend.c | 13 ++++-- blockdev.c | 50 +++++++++++++++++++++ hw/block/block.c | 10 +++++ include/hw/block/block.h | 7 ++- include/sysemu/block-backend-common.h | 3 ++ include/sysemu/block-backend-global-state.h | 2 + 6 files changed, 81 insertions(+), 4 deletions(-) diff --git a/block/block-backend.c b/block/block-backend.c index 919699bb70..85d732de7e 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -33,9 +33,6 @@ #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ -/* block backend default retry interval */ -#define BLOCK_BACKEND_DEFAULT_RETRY_INTERVAL 1000 - typedef struct BlockBackendAioNotifier { void (*attached_aio_context)(AioContext *new_context, void *opaque); void (*detach_aio_context)(void *opaque); @@ -2149,6 +2146,16 @@ void blk_drain_all(void) bdrv_drain_all_end(); } +void blk_set_on_error_retry_interval(BlockBackend *blk, int64_t interval) +{ + blk->retry_interval = interval; +} + +void blk_set_on_error_retry_timeout(BlockBackend *blk, int64_t timeout) +{ + blk->retry_timeout = timeout; +} + static bool blk_error_retry_timeout(BlockBackend *blk) { /* No timeout set, infinite retries. */ diff --git a/blockdev.c b/blockdev.c index 2817f73fad..6a229e77a5 100644 --- a/blockdev.c +++ b/blockdev.c @@ -484,6 +484,7 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts, const char *buf; int bdrv_flags = 0; int on_read_error, on_write_error; + int64_t retry_interval, retry_timeout; OnOffAuto account_invalid, account_failed; bool writethrough, read_only; BlockBackend *blk; @@ -576,6 +577,10 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts, } } + retry_interval = qemu_opt_get_number(opts, "retry_interval", + BLOCK_BACKEND_DEFAULT_RETRY_INTERVAL); + retry_timeout = qemu_opt_get_number(opts, "retry_timeout", 0); + if (snapshot) { bdrv_flags |= BDRV_O_SNAPSHOT; } @@ -639,6 +644,11 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts, blk_set_enable_write_cache(blk, !writethrough); blk_set_on_error(blk, on_read_error, on_write_error); + if (on_read_error == BLOCKDEV_ON_ERROR_RETRY || + on_write_error == BLOCKDEV_ON_ERROR_RETRY) { + blk_set_on_error_retry_interval(blk, retry_interval); + blk_set_on_error_retry_timeout(blk, retry_timeout); + } if (!monitor_add_blk(blk, id, errp)) { blk_unref(blk); @@ -773,6 +783,14 @@ QemuOptsList qemu_legacy_drive_opts = { .name = "werror", .type = QEMU_OPT_STRING, .help = "write error action", + },{ + .name = "retry_interval", + .type = QEMU_OPT_NUMBER, + .help = "interval for retry action in millisecond", + },{ + .name = "retry_timeout", + .type = QEMU_OPT_NUMBER, + .help = "timeout for retry action in millisecond", },{ .name = "copy-on-read", .type = QEMU_OPT_BOOL, @@ -795,6 +813,7 @@ DriveInfo *drive_new(QemuOpts *all_opts, BlockInterfaceType block_default_type, BlockInterfaceType type; int max_devs, bus_id, unit_id, index; const char *werror, *rerror; + int64_t retry_interval, retry_timeout; bool read_only = false; bool copy_on_read; const char *filename; @@ -1013,6 +1032,29 @@ DriveInfo *drive_new(QemuOpts *all_opts, BlockInterfaceType block_default_type, qdict_put_str(bs_opts, "rerror", rerror); } + if (qemu_opt_find(legacy_opts, "retry_interval")) { + if ((werror == NULL || strcmp(werror, "retry")) && + (rerror == NULL || strcmp(rerror, "retry"))) { + error_setg(errp, 
"retry_interval is only supported " + "by werror/rerror=retry"); + goto fail; + } + retry_interval = qemu_opt_get_number(legacy_opts, "retry_interval", + BLOCK_BACKEND_DEFAULT_RETRY_INTERVAL); + qdict_put_int(bs_opts, "retry_interval", retry_interval); + } + + if (qemu_opt_find(legacy_opts, "retry_timeout")) { + if ((werror == NULL || strcmp(werror, "retry")) && + (rerror == NULL || strcmp(rerror, "retry"))) { + error_setg(errp, "retry_timeout is only supported " + "by werror/rerror=retry"); + goto fail; + } + retry_timeout = qemu_opt_get_number(legacy_opts, "retry_timeout", 0); + qdict_put_int(bs_opts, "retry_timeout", retry_timeout); + } + /* Actual block device init: Functionality shared with blockdev-add */ blk = blockdev_init(filename, bs_opts, errp); bs_opts = NULL; @@ -3794,6 +3836,14 @@ QemuOptsList qemu_common_drive_opts = { .name = "werror", .type = QEMU_OPT_STRING, .help = "write error action", + },{ + .name = "retry_interval", + .type = QEMU_OPT_NUMBER, + .help = "interval for retry action in millisecond", + },{ + .name = "retry_timeout", + .type = QEMU_OPT_NUMBER, + .help = "timeout for retry action in millisecond", },{ .name = BDRV_OPT_READ_ONLY, .type = QEMU_OPT_BOOL, diff --git a/hw/block/block.c b/hw/block/block.c index 9f52ee6e72..6bece87709 100644 --- a/hw/block/block.c +++ b/hw/block/block.c @@ -239,6 +239,16 @@ bool blkconf_apply_backend_options(BlockConf *conf, bool readonly, blk_set_enable_write_cache(blk, wce); blk_set_on_error(blk, rerror, werror); + if (rerror == BLOCKDEV_ON_ERROR_RETRY || + werror == BLOCKDEV_ON_ERROR_RETRY) { + if (conf->retry_interval >= 0) { + blk_set_on_error_retry_interval(blk, conf->retry_interval); + } + if (conf->retry_timeout >= 0) { + blk_set_on_error_retry_timeout(blk, conf->retry_timeout); + } + } + block_acct_setup(blk_get_stats(blk), conf->account_invalid, conf->account_failed); return true; diff --git a/include/hw/block/block.h b/include/hw/block/block.h index 15fff66435..fb8c0df4a5 100644 --- a/include/hw/block/block.h +++ b/include/hw/block/block.h @@ -34,6 +34,8 @@ typedef struct BlockConf { OnOffAuto account_invalid, account_failed; BlockdevOnError rerror; BlockdevOnError werror; + int64_t retry_interval; + int64_t retry_timeout; } BlockConf; static inline unsigned int get_physical_block_exp(BlockConf *conf) @@ -84,7 +86,10 @@ static inline unsigned int get_physical_block_exp(BlockConf *conf) DEFINE_PROP_BLOCKDEV_ON_ERROR("rerror", _state, _conf.rerror, \ BLOCKDEV_ON_ERROR_AUTO), \ DEFINE_PROP_BLOCKDEV_ON_ERROR("werror", _state, _conf.werror, \ - BLOCKDEV_ON_ERROR_AUTO) + BLOCKDEV_ON_ERROR_AUTO), \ + DEFINE_PROP_INT64("retry_interval", _state, _conf.retry_interval, \ + -1), \ + DEFINE_PROP_INT64("retry_timeout", _state, _conf.retry_timeout, -1) /* Backend access helpers */ diff --git a/include/sysemu/block-backend-common.h b/include/sysemu/block-backend-common.h index b76df8834a..5a1cdac9c4 100644 --- a/include/sysemu/block-backend-common.h +++ b/include/sysemu/block-backend-common.h @@ -16,6 +16,9 @@ #include "qemu/iov.h" #include "block/throttle-groups.h" +/* block backend default retry interval */ +#define BLOCK_BACKEND_DEFAULT_RETRY_INTERVAL 1000 + /* * TODO Have to include block/block.h for a bunch of block layer * types. 
Unfortunately, this pulls in the whole BlockDriverState diff --git a/include/sysemu/block-backend-global-state.h b/include/sysemu/block-backend-global-state.h index 7f59fd411d..d56592c22e 100644 --- a/include/sysemu/block-backend-global-state.h +++ b/include/sysemu/block-backend-global-state.h @@ -84,6 +84,8 @@ int blk_commit_all(void); bool blk_in_drain(BlockBackend *blk); void blk_drain(BlockBackend *blk); void blk_drain_all(void); +void blk_set_on_error_retry_interval(BlockBackend *blk, int64_t interval); +void blk_set_on_error_retry_timeout(BlockBackend *blk, int64_t timeout); void blk_error_retry_reset_timeout(BlockBackend *blk); void blk_set_on_error(BlockBackend *blk, BlockdevOnError on_read_error, BlockdevOnError on_write_error); -- Gitee From 0da112402efe63e09fdd6ed43aa026d5b625988f Mon Sep 17 00:00:00 2001 From: yexiao Date: Thu, 21 Jan 2021 15:46:53 +0800 Subject: [PATCH 063/939] virtio_blk: Add support for retry on errors Insert failed requests into device's list for later retry and handle queued requests to implement retry_request_cb. Signed-off-by: Jiahui Cen Signed-off-by: Ying Fang Signed-off-by: Alex Chen --- hw/block/virtio-blk.c | 47 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 44 insertions(+), 3 deletions(-) diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c index a1f8e15522..1ebc9188c0 100644 --- a/hw/block/virtio-blk.c +++ b/hw/block/virtio-blk.c @@ -90,6 +90,10 @@ static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error, block_acct_failed(blk_get_stats(s->blk), &req->acct); } virtio_blk_free_request(req); + } else if (action == BLOCK_ERROR_ACTION_RETRY) { + req->mr_next = NULL; + req->next = s->rq; + s->rq = req; } blk_error_action(s->blk, action, is_read, error); @@ -131,6 +135,7 @@ static void virtio_blk_rw_complete(void *opaque, int ret) } } + blk_error_retry_reset_timeout(s->blk); virtio_blk_req_complete(req, VIRTIO_BLK_S_OK); block_acct_done(blk_get_stats(s->blk), &req->acct); virtio_blk_free_request(req); @@ -150,6 +155,7 @@ static void virtio_blk_flush_complete(void *opaque, int ret) } } + blk_error_retry_reset_timeout(s->blk); virtio_blk_req_complete(req, VIRTIO_BLK_S_OK); block_acct_done(blk_get_stats(s->blk), &req->acct); virtio_blk_free_request(req); @@ -172,6 +178,7 @@ static void virtio_blk_discard_write_zeroes_complete(void *opaque, int ret) } } + blk_error_retry_reset_timeout(s->blk); virtio_blk_req_complete(req, VIRTIO_BLK_S_OK); if (is_write_zeroes) { block_acct_done(blk_get_stats(s->blk), &req->acct); @@ -1183,12 +1190,12 @@ static void virtio_blk_dma_restart_bh(void *opaque) { VirtIOBlock *s = opaque; - VirtIOBlockReq *req = s->rq; + VirtIOBlockReq *req; MultiReqBuffer mrb = {}; - s->rq = NULL; - aio_context_acquire(blk_get_aio_context(s->conf.conf.blk)); + req = s->rq; + s->rq = NULL; while (req) { VirtIOBlockReq *next = req->next; if (virtio_blk_handle_request(req, &mrb)) { @@ -1541,10 +1548,44 @@ static void virtio_blk_drained_end(void *opaque) } } +static void virtio_blk_retry_request(void *opaque) +{ + VirtIOBlock *s = VIRTIO_BLK(opaque); + + VirtIOBlockReq *req; + MultiReqBuffer mrb = {}; + + aio_context_acquire(blk_get_aio_context(s->conf.conf.blk)); + req = s->rq; + s->rq = NULL; + while (req) { + VirtIOBlockReq *next = req->next; + if (virtio_blk_handle_request(req, &mrb)) { + /* Device is now broken and won't do any processing until it gets + * reset. Already queued requests will be lost: let's purge them. 
+ */ + while (req) { + next = req->next; + virtqueue_detach_element(req->vq, &req->elem, 0); + virtio_blk_free_request(req); + req = next; + } + break; + } + req = next; + } + + if (mrb.num_reqs) { + virtio_blk_submit_multireq(s, &mrb); + } + aio_context_release(blk_get_aio_context(s->conf.conf.blk)); +} + static const BlockDevOps virtio_block_ops = { .resize_cb = virtio_blk_resize, .drained_begin = virtio_blk_drained_begin, .drained_end = virtio_blk_drained_end, + .retry_request_cb = virtio_blk_retry_request, }; static void virtio_blk_device_realize(DeviceState *dev, Error **errp) -- Gitee From d69428c793ca7311c55d0efdaa82100247e35dcc Mon Sep 17 00:00:00 2001 From: Jiahui Cen Date: Thu, 21 Jan 2021 15:46:54 +0800 Subject: [PATCH 064/939] scsi-bus: Refactor the code that retries requests Move the code that retries requests from scsi_dma_restart_bh() to its own, non-static, function. This will allow us to call it from the retry_request_cb() of scsi-disk in a future patch. Signed-off-by: Jiahui Cen Signed-off-by: Ying Fang Signed-off-by: Alex Chen --- hw/scsi/scsi-bus.c | 16 +++++++++++----- include/hw/scsi/scsi.h | 1 + 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c index fc4b77fdb0..cecb02ae7e 100644 --- a/hw/scsi/scsi-bus.c +++ b/hw/scsi/scsi-bus.c @@ -144,14 +144,10 @@ void scsi_bus_init_named(SCSIBus *bus, size_t bus_size, DeviceState *host, qbus_set_bus_hotplug_handler(BUS(bus)); } -static void scsi_dma_restart_bh(void *opaque) +void scsi_retry_requests(SCSIDevice *s) { - SCSIDevice *s = opaque; SCSIRequest *req, *next; - qemu_bh_delete(s->bh); - s->bh = NULL; - aio_context_acquire(blk_get_aio_context(s->conf.blk)); QTAILQ_FOREACH_SAFE(req, &s->requests, next, next) { scsi_req_ref(req); @@ -175,6 +171,16 @@ static void scsi_dma_restart_bh(void *opaque) object_unref(OBJECT(s)); } +static void scsi_dma_restart_bh(void *opaque) +{ + SCSIDevice *s = opaque; + + qemu_bh_delete(s->bh); + s->bh = NULL; + + scsi_retry_requests(s); +} + void scsi_req_retry(SCSIRequest *req) { /* No need to save a reference, because scsi_dma_restart_bh just diff --git a/include/hw/scsi/scsi.h b/include/hw/scsi/scsi.h index 3692ca82f3..6ec18bf12b 100644 --- a/include/hw/scsi/scsi.h +++ b/include/hw/scsi/scsi.h @@ -226,6 +226,7 @@ void scsi_req_cancel_complete(SCSIRequest *req); void scsi_req_cancel(SCSIRequest *req); void scsi_req_cancel_async(SCSIRequest *req, Notifier *notifier); void scsi_req_retry(SCSIRequest *req); +void scsi_retry_requests(SCSIDevice *s); void scsi_device_drained_begin(SCSIDevice *sdev); void scsi_device_drained_end(SCSIDevice *sdev); void scsi_device_purge_requests(SCSIDevice *sdev, SCSISense sense); -- Gitee From 6100f909506025563ecec29b25f64cce75fc2353 Mon Sep 17 00:00:00 2001 From: Jiahui Cen Date: Thu, 21 Jan 2021 15:46:55 +0800 Subject: [PATCH 065/939] scsi-disk: Add support for retry on errors Mark failed requests as to be retried and implement retry_request_cb to handle these requests. 
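As a usage sketch only (the property names follow the earlier "block: Add error retry param setting" patch in this series; the backend path is hypothetical), a scsi-hd device could opt in to the retry policy like this:
  -drive file=/dev/mapper/mpatha,format=raw,if=none,id=drive0,cache=none \
  -device scsi-hd,drive=drive0,rerror=retry,werror=retry,retry_interval=2000,retry_timeout=60000
Here retry_interval and retry_timeout are in milliseconds; with retry_timeout=0 the device keeps retrying indefinitely.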
Signed-off-by: Jiahui Cen Signed-off-by: Ying Fang Signed-off-by: Alex Chen --- hw/scsi/scsi-disk.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c index 6691f5edb8..97d8c5bb30 100644 --- a/hw/scsi/scsi-disk.c +++ b/hw/scsi/scsi-disk.c @@ -249,6 +249,10 @@ static bool scsi_handle_rw_error(SCSIDiskReq *r, int ret, bool acct_failed) scsi_req_retry(&r->req); return true; + case BLOCK_ERROR_ACTION_RETRY: + scsi_req_retry(&r->req); + return true; + default: g_assert_not_reached(); } @@ -256,6 +260,8 @@ static bool scsi_handle_rw_error(SCSIDiskReq *r, int ret, bool acct_failed) static bool scsi_disk_req_check_error(SCSIDiskReq *r, int ret, bool acct_failed) { + SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev); + if (r->req.io_canceled) { scsi_req_cancel_complete(&r->req); return true; @@ -265,6 +271,7 @@ static bool scsi_disk_req_check_error(SCSIDiskReq *r, int ret, bool acct_failed) return scsi_handle_rw_error(r, ret, acct_failed); } + blk_error_retry_reset_timeout(s->qdev.conf.blk); return false; } @@ -2391,6 +2398,13 @@ static void scsi_disk_resize_cb(void *opaque) } } +static void scsi_disk_retry_request(void *opaque) +{ + SCSIDiskState *s = opaque; + + scsi_retry_requests(&s->qdev); +} + static void scsi_cd_change_media_cb(void *opaque, bool load, Error **errp) { SCSIDiskState *s = opaque; @@ -2440,12 +2454,14 @@ static const BlockDevOps scsi_disk_removable_block_ops = { .is_medium_locked = scsi_cd_is_medium_locked, .is_tray_open = scsi_cd_is_tray_open, .resize_cb = scsi_disk_resize_cb, + .retry_request_cb = scsi_disk_retry_request, }; static const BlockDevOps scsi_disk_block_ops = { .drained_begin = scsi_disk_drained_begin, .drained_end = scsi_disk_drained_end, .resize_cb = scsi_disk_resize_cb, + .retry_request_cb = scsi_disk_retry_request, }; static void scsi_disk_unit_attention_reported(SCSIDevice *dev) -- Gitee From bbac66be575c76216c18d68c558e0dc80a078f68 Mon Sep 17 00:00:00 2001 From: Jiahui Cen Date: Thu, 25 Feb 2021 18:03:57 +0800 Subject: [PATCH 066/939] block-backend: Stop retrying when draining Retrying failed requests when draining would make the draining hung. So it is better not to trigger the retry timer when draining. And after the virtual devices go back to work, they would retry those queued requests. 
Signed-off-by: Jiahui Cen Signed-off-by: Ying Fang Signed-off-by: Alex Chen --- block/block-backend.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/block/block-backend.c b/block/block-backend.c index 85d732de7e..bfbbb18af1 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -2261,9 +2261,11 @@ void blk_error_action(BlockBackend *blk, BlockErrorAction action, send_qmp_error_event(blk, action, is_read, error); qemu_system_vmstop_request(RUN_STATE_IO_ERROR); } else if (action == BLOCK_ERROR_ACTION_RETRY) { - timer_mod(blk->retry_timer, qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + - blk->retry_interval); - send_qmp_error_event(blk, action, is_read, error); + if (!blk->quiesce_counter) { + timer_mod(blk->retry_timer, qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + + blk->retry_interval); + send_qmp_error_event(blk, action, is_read, error); + } } else { send_qmp_error_event(blk, action, is_read, error); } -- Gitee From e880fc334edb8d07593679cf0c6a9af810c51d0d Mon Sep 17 00:00:00 2001 From: Jiahui Cen Date: Thu, 18 Mar 2021 19:45:11 +0800 Subject: [PATCH 067/939] block: Add sanity check when setting retry parameters Add sanity check when setting retry parameters to avoid invalid retry configuration. Signed-off-by: Jiahui Cen Signed-off-by: Alex Chen --- hw/core/qdev-prop-internal.h | 2 ++ hw/core/qdev-properties-system.c | 45 +++++++++++++++++++++++++++++ hw/core/qdev-properties.c | 4 +-- include/hw/block/block.h | 7 +++-- include/hw/qdev-properties-system.h | 8 +++++ 5 files changed, 61 insertions(+), 5 deletions(-) diff --git a/hw/core/qdev-prop-internal.h b/hw/core/qdev-prop-internal.h index d7b77844fe..68b1b9d10c 100644 --- a/hw/core/qdev-prop-internal.h +++ b/hw/core/qdev-prop-internal.h @@ -22,6 +22,8 @@ void qdev_propinfo_set_default_value_uint(ObjectProperty *op, void qdev_propinfo_get_int32(Object *obj, Visitor *v, const char *name, void *opaque, Error **errp); +void qdev_propinfo_get_int64(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp); void qdev_propinfo_get_size32(Object *obj, Visitor *v, const char *name, void *opaque, Error **errp); diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-system.c index 1473ab3d5e..f2e2718c74 100644 --- a/hw/core/qdev-properties-system.c +++ b/hw/core/qdev-properties-system.c @@ -635,6 +635,51 @@ const PropertyInfo qdev_prop_blockdev_on_error = { .set_default_value = qdev_propinfo_set_default_value_enum, }; +static void set_retry_time(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + DeviceState *dev = DEVICE(obj); + Property *prop = opaque; + int64_t value, *ptr = object_field_prop_ptr(obj, prop); + Error *local_err = NULL; + + if (dev->realized) { + qdev_prop_set_after_realize(dev, name, errp); + return; + } + + visit_type_int64(v, name, &value, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + + /* value should not be negative */ + if (value < 0) { + error_setg(errp, QERR_PROPERTY_VALUE_OUT_OF_RANGE, + dev->id ? 
: "", name, (int64_t)value, 0L, LONG_MAX); + return; + } + + *ptr = value; +} + +const PropertyInfo qdev_prop_blockdev_retry_interval = { + .name = "BlockdevRetryInterval", + .description = "Interval for retry error handling policy", + .get = qdev_propinfo_get_int64, + .set = set_retry_time, + .set_default_value = qdev_propinfo_set_default_value_int, +}; + +const PropertyInfo qdev_prop_blockdev_retry_timeout = { + .name = "BlockdevRetryTimeout", + .description = "Timeout for retry error handling policy", + .get = qdev_propinfo_get_int64, + .set = set_retry_time, + .set_default_value = qdev_propinfo_set_default_value_int, +}; + /* --- BIOS CHS translation */ QEMU_BUILD_BUG_ON(sizeof(BiosAtaTranslation) != sizeof(int)); diff --git a/hw/core/qdev-properties.c b/hw/core/qdev-properties.c index 840006e953..19b7450b4d 100644 --- a/hw/core/qdev-properties.c +++ b/hw/core/qdev-properties.c @@ -398,7 +398,7 @@ static void set_uint64(Object *obj, Visitor *v, const char *name, visit_type_uint64(v, name, ptr, errp); } -static void get_int64(Object *obj, Visitor *v, const char *name, +void qdev_propinfo_get_int64(Object *obj, Visitor *v, const char *name, void *opaque, Error **errp) { Property *prop = opaque; @@ -425,7 +425,7 @@ const PropertyInfo qdev_prop_uint64 = { const PropertyInfo qdev_prop_int64 = { .name = "int64", - .get = get_int64, + .get = qdev_propinfo_get_int64, .set = set_int64, .set_default_value = qdev_propinfo_set_default_value_int, }; diff --git a/include/hw/block/block.h b/include/hw/block/block.h index fb8c0df4a5..844e87495a 100644 --- a/include/hw/block/block.h +++ b/include/hw/block/block.h @@ -87,9 +87,10 @@ static inline unsigned int get_physical_block_exp(BlockConf *conf) BLOCKDEV_ON_ERROR_AUTO), \ DEFINE_PROP_BLOCKDEV_ON_ERROR("werror", _state, _conf.werror, \ BLOCKDEV_ON_ERROR_AUTO), \ - DEFINE_PROP_INT64("retry_interval", _state, _conf.retry_interval, \ - -1), \ - DEFINE_PROP_INT64("retry_timeout", _state, _conf.retry_timeout, -1) + DEFINE_PROP_BLOCKDEV_RETRY_INTERVAL("retry_interval", _state, \ + _conf.retry_interval, 1000), \ + DEFINE_PROP_BLOCKDEV_RETRY_TIMEOUT("retry_timeout", _state, \ + _conf.retry_timeout, 0) /* Backend access helpers */ diff --git a/include/hw/qdev-properties-system.h b/include/hw/qdev-properties-system.h index 91f7a2452d..7cf27e51b9 100644 --- a/include/hw/qdev-properties-system.h +++ b/include/hw/qdev-properties-system.h @@ -10,6 +10,8 @@ extern const PropertyInfo qdev_prop_multifd_compression; extern const PropertyInfo qdev_prop_mig_mode; extern const PropertyInfo qdev_prop_losttickpolicy; extern const PropertyInfo qdev_prop_blockdev_on_error; +extern const PropertyInfo qdev_prop_blockdev_retry_interval; +extern const PropertyInfo qdev_prop_blockdev_retry_timeout; extern const PropertyInfo qdev_prop_bios_chs_trans; extern const PropertyInfo qdev_prop_fdc_drive_type; extern const PropertyInfo qdev_prop_drive; @@ -52,6 +54,12 @@ extern const PropertyInfo qdev_prop_cpus390entitlement; #define DEFINE_PROP_BLOCKDEV_ON_ERROR(_n, _s, _f, _d) \ DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_blockdev_on_error, \ BlockdevOnError) +#define DEFINE_PROP_BLOCKDEV_RETRY_INTERVAL(_n, _s, _f, _d) \ + DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_blockdev_retry_interval, \ + int64_t) +#define DEFINE_PROP_BLOCKDEV_RETRY_TIMEOUT(_n, _s, _f, _d) \ + DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_blockdev_retry_timeout, \ + int64_t) #define DEFINE_PROP_BIOS_CHS_TRANS(_n, _s, _f, _d) \ DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_bios_chs_trans, int) #define 
DEFINE_PROP_BLOCKSIZE(_n, _s, _f) \ -- Gitee From c2f55f210d4e021121865ea31037d2751188befd Mon Sep 17 00:00:00 2001 From: Yan Wang Date: Tue, 1 Mar 2022 20:12:12 +0800 Subject: [PATCH 068/939] scsi-bus: fix unmatched object_unref() Fix commit 391dd8f1 ("scsi-bus: Refactor the code that retries requests"), which split scsi_dma_restart_bh(); the object_unref() belongs to scsi_dma_restart_bh(), so we should move it from scsi_retry_requests() to scsi_dma_restart_bh(). Signed-off-by: Yan Wang Signed-off-by: shaodenghui --- hw/scsi/scsi-bus.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c index cecb02ae7e..7b60ac11f5 100644 --- a/hw/scsi/scsi-bus.c +++ b/hw/scsi/scsi-bus.c @@ -167,8 +167,6 @@ void scsi_retry_requests(SCSIDevice *s) scsi_req_unref(req); } aio_context_release(blk_get_aio_context(s->conf.blk)); - /* Drop the reference that was acquired in scsi_dma_restart_cb */ - object_unref(OBJECT(s)); } static void scsi_dma_restart_bh(void *opaque) @@ -179,6 +177,9 @@ static void scsi_dma_restart_bh(void *opaque) s->bh = NULL; scsi_retry_requests(s); + + /* Drop the reference that was acquired in scsi_dma_restart_cb */ + object_unref(OBJECT(s)); } void scsi_req_retry(SCSIRequest *req) -- Gitee From 60181b02c77f533105f904ab9e023bc22f65ad48 Mon Sep 17 00:00:00 2001 From: Yan Wang Date: Tue, 29 Mar 2022 12:05:56 +0800 Subject: [PATCH 069/939] scsi-bus: fix incorrect call for blk_error_retry_reset_timeout() Fix commit 52115ca0 ("scsi-disk: Add support for retry on errors"). Call Stack: ... scsi_read_data() scsi_do_read(r, 0) scsi_disk_req_check_error() blk_error_retry_reset_timeout() blk->retry_start_time = 0; This causes an I/O hang when the storage network is disconnected: until the storage network recovers, the upper call stack keeps resetting retry_start_time, so the next I/O operation does not return immediately. 
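To see why that call is misplaced, here is a minimal standalone model of the retry window (simplified types, not the actual block-backend helpers): retry_start_time marks the first failure, and clearing it while the error persists keeps the window open forever.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef struct {
        int64_t retry_start_time;   /* 0 means "no error in progress" */
        int64_t retry_timeout;      /* 0 means "retry forever" */
    } RetryModel;

    /* Returns true once the error has persisted longer than retry_timeout. */
    bool retry_window_expired(RetryModel *m, int64_t now_ms)
    {
        if (m->retry_start_time == 0) {
            m->retry_start_time = now_ms;   /* first failure opens the window */
            return false;
        }
        return m->retry_timeout && now_ms - m->retry_start_time > m->retry_timeout;
    }

    int main(void)
    {
        RetryModel m = { .retry_start_time = 0, .retry_timeout = 5000 };
        /* First failure at t=0 opens the window; it expires after 5000 ms. */
        printf("%d %d\n", retry_window_expired(&m, 0), retry_window_expired(&m, 6000));
        return 0;
    }

    /*
     * If another path sets retry_start_time back to 0 while the backend is
     * still unreachable (as the blk_error_retry_reset_timeout() call in the
     * read path did), the window restarts on every retry and the guest I/O
     * never completes with an error.
     */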
Signed-off-by: Yan Wang Signed-off-by: shaodenghui --- hw/scsi/scsi-disk.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c index 97d8c5bb30..845a2a7d5d 100644 --- a/hw/scsi/scsi-disk.c +++ b/hw/scsi/scsi-disk.c @@ -258,10 +258,8 @@ static bool scsi_handle_rw_error(SCSIDiskReq *r, int ret, bool acct_failed) } } -static bool scsi_disk_req_check_error(SCSIDiskReq *r, int ret, bool acct_failed) +static bool scsi_disk_req_handle_error(SCSIDiskReq *r, int ret, bool acct_failed) { - SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev); - if (r->req.io_canceled) { scsi_req_cancel_complete(&r->req); return true; @@ -271,6 +269,17 @@ static bool scsi_disk_req_check_error(SCSIDiskReq *r, int ret, bool acct_failed) return scsi_handle_rw_error(r, ret, acct_failed); } + return false; +} + +static bool scsi_disk_req_check_error(SCSIDiskReq *r, int ret, bool acct_failed) +{ + SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev); + + if (r->req.io_canceled || ret < 0) { + return scsi_disk_req_handle_error(r, ret, acct_failed); + } + blk_error_retry_reset_timeout(s->qdev.conf.blk); return false; } @@ -423,7 +432,7 @@ static void scsi_do_read(SCSIDiskReq *r, int ret) SCSIDiskClass *sdc = (SCSIDiskClass *) object_get_class(OBJECT(s)); assert (r->req.aiocb == NULL); - if (scsi_disk_req_check_error(r, ret, false)) { + if (scsi_disk_req_handle_error(r, ret, false)) { goto done; } @@ -464,6 +473,9 @@ static void scsi_do_read_cb(void *opaque, int ret) block_acct_failed(blk_get_stats(s->qdev.conf.blk), &r->acct); } else { block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct); + if (!r->req.io_canceled) { + blk_error_retry_reset_timeout(s->qdev.conf.blk); + } } scsi_do_read(opaque, ret); aio_context_release(blk_get_aio_context(s->qdev.conf.blk)); -- Gitee From ea0feb8a262383582416283ad1af1819c1e0e22a Mon Sep 17 00:00:00 2001 From: WangJian Date: Wed, 9 Feb 2022 16:10:22 +0800 Subject: [PATCH 070/939] block: bugfix: Don't pause vm when NOSPACE EIO happened When backend disk is FULL and disk IO type is 'dataplane', QEMU will pause the vm, and this may cause endless-loop in QEMU main thread if we do the snapshot merge now. When backend disk is FULL, only reporting an error rather than pausing the virtual machine. Signed-off-by: wangjian161 --- blockdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/blockdev.c b/blockdev.c index bc2099e9da..455ae8606d 100644 --- a/blockdev.c +++ b/blockdev.c @@ -557,7 +557,7 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts, qdict_put_str(bs_opts, "driver", buf); } - on_write_error = BLOCKDEV_ON_ERROR_ENOSPC; + on_write_error = BLOCKDEV_ON_ERROR_REPORT; if ((buf = qemu_opt_get(opts, "werror")) != NULL) { on_write_error = parse_block_error_action(buf, 0, &error); if (error) { -- Gitee From f9aef3909d23af6a33c604f59dccfcb764090f01 Mon Sep 17 00:00:00 2001 From: WangJian Date: Wed, 9 Feb 2022 11:29:15 +0800 Subject: [PATCH 071/939] block: disallow block jobs when there is a BDRV_O_INACTIVE flag Currently, migration will put a BDRV_O_INACTIVE flag on bs's open_flags until another resume being called. In that case, any IO from vm or block jobs will cause a qemu crash with an assert 'assert(!(bs->open_flags & BDRV_O_INACTIVE))' failure in bdrv_co_pwritev function. we hereby disallow block jobs by faking a blocker. 
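As a rough standalone sketch of the behaviour this buys (placeholder flag value and helper names, not the real block-layer API), refusing the job up front replaces an abort on the write path:

    #include <stdbool.h>
    #include <stdio.h>

    #define MODEL_O_INACTIVE (1u << 11)   /* placeholder for BDRV_O_INACTIVE */

    /* Model of the faked blocker: an inactivated node refuses block jobs. */
    bool job_allowed(unsigned open_flags)
    {
        if (open_flags & MODEL_O_INACTIVE) {
            fprintf(stderr, "block device is in use by migration\n");
            return false;                 /* job refused with a clean error */
        }
        return true;                      /* active node: the job may start */
    }

    int main(void)
    {
        printf("%d\n", job_allowed(0));                 /* 1: job may start */
        printf("%d\n", job_allowed(MODEL_O_INACTIVE));  /* 0: job refused */
        return 0;
    }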
Signed-off-by: wangjian161 --- block.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/block.c b/block.c index bfb0861ec6..b7cb963929 100644 --- a/block.c +++ b/block.c @@ -7298,6 +7298,22 @@ bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp) bdrv_get_device_or_node_name(bs)); return true; } + + /* + * When migration puts a BDRV_O_INACTIVE flag on driver's open_flags, + * we fake a blocker that doesn't exist. From now on, block jobs + * will not be permitted. + */ + if ((op == BLOCK_OP_TYPE_RESIZE || op == BLOCK_OP_TYPE_COMMIT_SOURCE || + op == BLOCK_OP_TYPE_MIRROR_SOURCE || op == BLOCK_OP_TYPE_MIRROR_TARGET) && + (bs->open_flags & BDRV_O_INACTIVE)) { + if (errp) { + error_setg(errp, "block device is in use by migration with" + " a driver BDRV_O_INACTIVE flag setted"); + } + return true; + } + return false; } -- Gitee From 6a32c9764439093fe4b53f87059c35761d711e39 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Wed, 28 Feb 2024 20:33:12 +0900 Subject: [PATCH 072/939] hw/nvme: Use pcie_sriov_num_vfs() (CVE-2024-26328) nvme_sriov_pre_write_ctrl() used to directly inspect SR-IOV configurations to know the number of VFs being disabled due to SR-IOV configuration writes, but the logic was flawed and resulted in out-of-bound memory access. It assumed PCI_SRIOV_NUM_VF always has the number of currently enabled VFs, but it actually doesn't in the following cases: - PCI_SRIOV_NUM_VF has been set but PCI_SRIOV_CTRL_VFE has never been. - PCI_SRIOV_NUM_VF was written after PCI_SRIOV_CTRL_VFE was set. - VFs were only partially enabled because of realization failure. It is a responsibility of pcie_sriov to interpret SR-IOV configurations and pcie_sriov does it correctly, so use pcie_sriov_num_vfs(), which it provides, to get the number of enabled VFs before and after SR-IOV configuration writes. Cc: qemu-stable@nongnu.org Fixes: CVE-2024-26328 Fixes: 11871f53ef8e ("hw/nvme: Add support for the Virtualization Management command") Suggested-by: Michael S. Tsirkin Signed-off-by: Akihiko Odaki Message-Id: <20240228-reuse-v8-1-282660281e60@daynix.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin --- hw/nvme/ctrl.c | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index f026245d1e..7a56e7b79b 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -8466,36 +8466,26 @@ static void nvme_pci_reset(DeviceState *qdev) nvme_ctrl_reset(n, NVME_RESET_FUNCTION); } -static void nvme_sriov_pre_write_ctrl(PCIDevice *dev, uint32_t address, - uint32_t val, int len) +static void nvme_sriov_post_write_config(PCIDevice *dev, uint16_t old_num_vfs) { NvmeCtrl *n = NVME(dev); NvmeSecCtrlEntry *sctrl; - uint16_t sriov_cap = dev->exp.sriov_cap; - uint32_t off = address - sriov_cap; - int i, num_vfs; + int i; - if (!sriov_cap) { - return; - } - - if (range_covers_byte(off, len, PCI_SRIOV_CTRL)) { - if (!(val & PCI_SRIOV_CTRL_VFE)) { - num_vfs = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF); - for (i = 0; i < num_vfs; i++) { - sctrl = &n->sec_ctrl_list.sec[i]; - nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false); - } - } + for (i = pcie_sriov_num_vfs(dev); i < old_num_vfs; i++) { + sctrl = &n->sec_ctrl_list.sec[i]; + nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false); } } static void nvme_pci_write_config(PCIDevice *dev, uint32_t address, uint32_t val, int len) { - nvme_sriov_pre_write_ctrl(dev, address, val, len); + uint16_t old_num_vfs = pcie_sriov_num_vfs(dev); + pci_default_write_config(dev, address, val, len); pcie_cap_flr_write_config(dev, address, val, len); + nvme_sriov_post_write_config(dev, old_num_vfs); } static const VMStateDescription nvme_vmstate = { -- Gitee From 632ec38ed57b76baf3e499d1789aeea0f74df0a5 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Wed, 28 Feb 2024 20:33:13 +0900 Subject: [PATCH 073/939] pcie_sriov: Validate NumVFs (CVE-2024-26327) The guest may write NumVFs greater than TotalVFs and that can lead to buffer overflow in VF implementations. Cc: qemu-stable@nongnu.org Fixes: CVE-2024-26327 Fixes: 7c0fa8dff811 ("pcie: Add support for Single Root I/O Virtualization (SR/IOV)") Signed-off-by: Akihiko Odaki Message-Id: <20240228-reuse-v8-2-282660281e60@daynix.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Sriram Yagnaraman --- hw/pci/pcie_sriov.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hw/pci/pcie_sriov.c b/hw/pci/pcie_sriov.c index a1fe65f5d8..da209b7f47 100644 --- a/hw/pci/pcie_sriov.c +++ b/hw/pci/pcie_sriov.c @@ -176,6 +176,9 @@ static void register_vfs(PCIDevice *dev) assert(sriov_cap > 0); num_vfs = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF); + if (num_vfs > pci_get_word(dev->config + sriov_cap + PCI_SRIOV_TOTAL_VF)) { + return; + } dev->exp.sriov_pf.vf = g_new(PCIDevice *, num_vfs); -- Gitee From db37bc0d85e141a666dd287cdc562a47f29b4343 Mon Sep 17 00:00:00 2001 From: Jinhua Cao Date: Mon, 18 Mar 2024 10:01:28 +0800 Subject: [PATCH 074/939] Revert "file-posix: Remove unused s->discard_zeroes" This reverts commit a7ca2eb488ff149c898f43abe103f8bd8e3ca3c4. 
--- block/file-posix.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/block/file-posix.c b/block/file-posix.c index b862406c71..01ae5fd88c 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -158,6 +158,7 @@ typedef struct BDRVRawState { bool has_discard:1; bool has_write_zeroes:1; + bool discard_zeroes:1; bool use_linux_aio:1; bool use_linux_io_uring:1; int page_cache_inconsistent; /* errno from fdatasync failure */ @@ -765,6 +766,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, ret = -EINVAL; goto fail; } else { + s->discard_zeroes = true; s->has_fallocate = true; } } else { @@ -790,12 +792,19 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, #endif if (S_ISBLK(st.st_mode)) { +#ifdef BLKDISCARDZEROES + unsigned int arg; + if (ioctl(s->fd, BLKDISCARDZEROES, &arg) == 0 && arg) { + s->discard_zeroes = true; + } +#endif #ifdef __linux__ /* On Linux 3.10, BLKDISCARD leaves stale data in the page cache. Do * not rely on the contents of discarded blocks unless using O_DIRECT. * Same for BLKZEROOUT. */ if (!(bs->open_flags & BDRV_O_NOCACHE)) { + s->discard_zeroes = false; s->has_write_zeroes = false; } #endif -- Gitee From 912641a75955a75f37ab8695a0753b1571762717 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 2 Jan 2024 11:29:01 +0800 Subject: [PATCH 075/939] virtio-net: correctly copy vnet header when flushing TX (CVE-2023-6693) When HASH_REPORT is negotiated, the guest_hdr_len might be larger than the size of the mergeable rx buffer header. Using virtio_net_hdr_mrg_rxbuf during the header swap might lead a stack overflow in this case. Fixing this by using virtio_net_hdr_v1_hash instead. Reported-by: Xiao Lei Cc: Yuri Benditovich Cc: qemu-stable@nongnu.org Cc: Mauro Matteo Cascella Fixes: CVE-2023-6693 Fixes: e22f0603fb2f ("virtio-net: reference implementation of hash report") Reviewed-by: Michael Tokarev Signed-off-by: Jason Wang --- hw/net/virtio-net.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index 80c56f0cfc..73024babd4 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -674,6 +674,11 @@ static void virtio_net_set_mrg_rx_bufs(VirtIONet *n, int mergeable_rx_bufs, n->mergeable_rx_bufs = mergeable_rx_bufs; + /* + * Note: when extending the vnet header, please make sure to + * change the vnet header copying logic in virtio_net_flush_tx() + * as well. + */ if (version_1) { n->guest_hdr_len = hash_report ? 
sizeof(struct virtio_net_hdr_v1_hash) : @@ -2693,7 +2698,7 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q) ssize_t ret; unsigned int out_num; struct iovec sg[VIRTQUEUE_MAX_SIZE], sg2[VIRTQUEUE_MAX_SIZE + 1], *out_sg; - struct virtio_net_hdr_mrg_rxbuf mhdr; + struct virtio_net_hdr_v1_hash vhdr; elem = virtqueue_pop(q->tx_vq, sizeof(VirtQueueElement)); if (!elem) { @@ -2710,7 +2715,7 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q) } if (n->has_vnet_hdr) { - if (iov_to_buf(out_sg, out_num, 0, &mhdr, n->guest_hdr_len) < + if (iov_to_buf(out_sg, out_num, 0, &vhdr, n->guest_hdr_len) < n->guest_hdr_len) { virtio_error(vdev, "virtio-net header incorrect"); virtqueue_detach_element(q->tx_vq, elem, 0); @@ -2718,8 +2723,8 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q) return -EINVAL; } if (n->needs_vnet_hdr_swap) { - virtio_net_hdr_swap(vdev, (void *) &mhdr); - sg2[0].iov_base = &mhdr; + virtio_net_hdr_swap(vdev, (void *) &vhdr); + sg2[0].iov_base = &vhdr; sg2[0].iov_len = n->guest_hdr_len; out_num = iov_copy(&sg2[1], ARRAY_SIZE(sg2) - 1, out_sg, out_num, -- Gitee From 66d91f8cb6c9668744cf0acda4402f75c5e533e0 Mon Sep 17 00:00:00 2001 From: qihao Date: Tue, 19 Mar 2024 14:36:46 +0800 Subject: [PATCH 076/939] hw/cxl/cxl-host: Fix missing ERRP_GUARD() in cxl_fixed_memory_window_config() cheery-pick from 2a0e0a35002db7ac64f4e82ea2a4ad2fb6d934b0 As the comment in qapi/error, dereferencing @errp requires ERRP_GUARD(): * = Why, when and how to use ERRP_GUARD() = * * Without ERRP_GUARD(), use of the @errp parameter is restricted: * - It must not be dereferenced, because it may be null. ... * ERRP_GUARD() lifts these restrictions. * * To use ERRP_GUARD(), add it right at the beginning of the function. * @errp can then be used without worrying about the argument being * NULL or &error_fatal. * * Using it when it's not needed is safe, but please avoid cluttering * the source with useless code. But in cxl_fixed_memory_window_config(), @errp is dereferenced in 2 places without ERRP_GUARD(): fw->enc_int_ways = cxl_interleave_ways_enc(fw->num_targets, errp); if (*errp) { return; } and fw->enc_int_gran = cxl_interleave_granularity_enc(object->interleave_granularity, errp); if (*errp) { return; } For the above 2 places, we check "*errp", because neither function returns a suitable error code. And since machine_set_cfmw() - the caller of cxl_fixed_memory_window_config() - doesn't get the NULL @errp parameter as the "set" method of object property, cxl_fixed_memory_window_config() hasn't triggered the bug that dereferencing the NULL @errp. To follow the requirement of @errp, add missing ERRP_GUARD() in cxl_fixed_memory_window_config(). Suggested-by: Markus Armbruster Signed-off-by: Zhao Liu Reviewed-by: Markus Armbruster Message-Id: <20240223085653.1255438-2-zhao1.liu@linux.intel.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin Acked-by: Jonathan Cameron Signed-off-by: qihao_yewu --- hw/cxl/cxl-host.c | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/cxl/cxl-host.c b/hw/cxl/cxl-host.c index 2aa776c79c..c5f5fcfd64 100644 --- a/hw/cxl/cxl-host.c +++ b/hw/cxl/cxl-host.c @@ -26,6 +26,7 @@ static void cxl_fixed_memory_window_config(CXLState *cxl_state, CXLFixedMemoryWindowOptions *object, Error **errp) { + ERRP_GUARD(); g_autofree CXLFixedWindow *fw = g_malloc0(sizeof(*fw)); strList *target; int i; -- Gitee From 855f7f30de962f79393f0b9f8b0355b996d72de7 Mon Sep 17 00:00:00 2001 From: Fiona Ebner Date: Wed, 24 Jan 2024 11:57:48 +0100 Subject: [PATCH 077/939] ui/clipboard: mark type as not available when there is no data (CVE-2023-6683) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With VNC, a client can send a non-extended VNC_MSG_CLIENT_CUT_TEXT message with len=0. In qemu_clipboard_set_data(), the clipboard info will be updated setting data to NULL (because g_memdup(data, size) returns NULL when size is 0). If the client does not set the VNC_ENCODING_CLIPBOARD_EXT feature when setting up the encodings, then the 'request' callback for the clipboard peer is not initialized. Later, because data is NULL, qemu_clipboard_request() can be reached via vdagent_chr_write() and vdagent_clipboard_recv_request() and there, the clipboard owner's 'request' callback will be attempted to be called, but that is a NULL pointer. In particular, this can happen when using the KRDC (22.12.3) VNC client. Another scenario leading to the same issue is with two clients (say noVNC and KRDC): The noVNC client sets the extension VNC_FEATURE_CLIPBOARD_EXT and initializes its cbpeer. The KRDC client does not, but triggers a vnc_client_cut_text() (note it's not the _ext variant)). There, a new clipboard info with it as the 'owner' is created and via qemu_clipboard_set_data() is called, which in turn calls qemu_clipboard_update() with that info. In qemu_clipboard_update(), the notifier for the noVNC client will be called, i.e. vnc_clipboard_notify() and also set vs->cbinfo for the noVNC client. The 'owner' in that clipboard info is the clipboard peer for the KRDC client, which did not initialize the 'request' function. That sounds correct to me, it is the owner of that clipboard info. Then when noVNC sends a VNC_MSG_CLIENT_CUT_TEXT message (it did set the VNC_FEATURE_CLIPBOARD_EXT feature correctly, so a check for it passes), that clipboard info is passed to qemu_clipboard_request() and the original segfault still happens. Fix the issue by handling updates with size 0 differently. In particular, mark in the clipboard info that the type is not available. While at it, switch to g_memdup2(), because g_memdup() is deprecated. 
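The zero-length corner case follows directly from GLib's allocator semantics; a short standalone check (needs GLib >= 2.68 for g_memdup2(); separate from the ui/clipboard.c change itself):

    #include <glib.h>

    int main(void)
    {
        const char buf[] = "clipboard";

        /* A 0-byte copy yields NULL, which is why the clipboard type has to
         * be marked unavailable rather than treated as owned data. */
        g_assert(g_memdup2(buf, 0) == NULL);

        gpointer copy = g_memdup2(buf, sizeof(buf));
        g_assert(copy != NULL);
        g_free(copy);
        return 0;
    }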
Cc: qemu-stable@nongnu.org Fixes: CVE-2023-6683 Reported-by: Markus Frank Suggested-by: Marc-André Lureau Signed-off-by: Fiona Ebner Reviewed-by: Marc-André Lureau Tested-by: Markus Frank Message-ID: <20240124105749.204610-1-f.ebner@proxmox.com> Signed-off-by: liuxiangdong --- ui/clipboard.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/ui/clipboard.c b/ui/clipboard.c index 3d14bffaf8..b3f6fa3c9e 100644 --- a/ui/clipboard.c +++ b/ui/clipboard.c @@ -163,9 +163,15 @@ void qemu_clipboard_set_data(QemuClipboardPeer *peer, } g_free(info->types[type].data); - info->types[type].data = g_memdup(data, size); - info->types[type].size = size; - info->types[type].available = true; + if (size) { + info->types[type].data = g_memdup2(data, size); + info->types[type].size = size; + info->types[type].available = true; + } else { + info->types[type].data = NULL; + info->types[type].size = 0; + info->types[type].available = false; + } if (update) { qemu_clipboard_update(info); -- Gitee From c9ee283913cc9df8998a21544a68ac1d2f86aa49 Mon Sep 17 00:00:00 2001 From: qihao Date: Tue, 19 Mar 2024 15:07:51 +0800 Subject: [PATCH 078/939] hw/display/macfb: Fix missing ERRP_GUARD() in macfb_nubus_realize() cheery-pick from 5aa4a6417b0f7acbfd7f4c21dca26293bc3d9348 As the comment in qapi/error, dereferencing @errp requires ERRP_GUARD(): * = Why, when and how to use ERRP_GUARD() = * * Without ERRP_GUARD(), use of the @errp parameter is restricted: * - It must not be dereferenced, because it may be null. ... * ERRP_GUARD() lifts these restrictions. * * To use ERRP_GUARD(), add it right at the beginning of the function. * @errp can then be used without worrying about the argument being * NULL or &error_fatal. * * Using it when it's not needed is safe, but please avoid cluttering * the source with useless code. But in macfb_nubus_realize(), @errp is dereferenced without ERRP_GUARD(): ndc->parent_realize(dev, errp); if (*errp) { return; } Here we check *errp, because the ndc->parent_realize(), as a DeviceClass.realize() callback, returns void. And since macfb_nubus_realize(), also as a DeviceClass.realize(), doesn't get the NULL @errp parameter, it hasn't triggered the bug that dereferencing the NULL @errp. To follow the requirement of @errp, add missing ERRP_GUARD() in macfb_nubus_realize(). Suggested-by: Markus Armbruster Signed-off-by: Zhao Liu Reviewed-by: Markus Armbruster Message-Id: <20240223085653.1255438-3-zhao1.liu@linux.intel.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Signed-off-by: qihao_yewu --- hw/display/macfb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/display/macfb.c b/hw/display/macfb.c index d61541ccb5..170da35757 100644 --- a/hw/display/macfb.c +++ b/hw/display/macfb.c @@ -714,6 +714,7 @@ static void macfb_nubus_set_irq(void *opaque, int n, int level) static void macfb_nubus_realize(DeviceState *dev, Error **errp) { + ERRP_GUARD(); NubusDevice *nd = NUBUS_DEVICE(dev); MacfbNubusState *s = NUBUS_MACFB(dev); MacfbNubusDeviceClass *ndc = NUBUS_MACFB_GET_CLASS(dev); -- Gitee From 48c792a802c8cb0ab670ddf92920e2e5e96747a4 Mon Sep 17 00:00:00 2001 From: Jinhua Cao Date: Mon, 18 Mar 2024 10:04:42 +0800 Subject: [PATCH 079/939] qemu-img block: set zero flags only when discard_zeros of the block supported zero flags set for block discard_zeros, only when the block support discard_zeros need set these flags. 
old commit info: qemu-img: block: dont blk_make_zero if discard_zeroes false Signed-off-by: Jinhua Cao --- block/file-posix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/file-posix.c b/block/file-posix.c index 01ae5fd88c..4782aba59f 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -822,7 +822,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, #endif s->needs_alignment = raw_needs_alignment(bs); - bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK; + bs->supported_zero_flags = s->discard_zeroes ? (BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) : 0; if (S_ISREG(st.st_mode)) { /* When extending regular files, we get zeros from the OS */ bs->supported_truncate_flags = BDRV_REQ_ZERO_WRITE; -- Gitee From 422ac7d67a7ced985b1beef4b33cc43b48d1f240 Mon Sep 17 00:00:00 2001 From: Jinhua Cao Date: Mon, 18 Mar 2024 10:18:07 +0800 Subject: [PATCH 080/939] qemu-img: add qemu-img direct create Introduce buffer_size when creating a raw file, so we can control the speed of direct writes with: qemu-img create -t 'cache' -o buffer_size='num' Signed-off-by: Jinhua Cao --- block/file-posix.c | 65 ++++++++++++++++++-- include/block/block_int-common.h | 2 + qapi/block-core.json | 6 +- qemu-img-cmds.hx | 4 +- qemu-img.c | 14 ++++- tests/qemu-iotests/049.out | 102 +++++++++++++++---------------- tests/qemu-iotests/099.out | 2 +- 7 files changed, 134 insertions(+), 61 deletions(-) diff --git a/block/file-posix.c b/block/file-posix.c index 4782aba59f..4ac8f684f1 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -128,6 +128,10 @@ #define FTYPE_CD 1 #define MAX_BLOCKSIZE 4096 +#define DEFAULT_BUFFER_SIZE 65536 +#define BUFFER_ALIGN_SIZE 65536 +#define MIN_BUFFER_SIZE 65536 +#define MAX_BUFFER_SIZE 16777216 /* Posix file locking bytes. Libvirt takes byte 0, we start from higher bytes, * leaving a few more bytes for its future use. 
*/ @@ -203,6 +207,8 @@ typedef struct RawPosixAIOData { off_t aio_offset; uint64_t aio_nbytes; + size_t buffer_size; + union { struct { struct iovec *iov; @@ -2630,7 +2636,8 @@ static void raw_close(BlockDriverState *bs) */ static int coroutine_fn raw_regular_truncate(BlockDriverState *bs, int fd, int64_t offset, - PreallocMode prealloc, Error **errp) + PreallocMode prealloc, size_t buffer_size, + Error **errp) { RawPosixAIOData acb; @@ -2639,6 +2646,7 @@ raw_regular_truncate(BlockDriverState *bs, int fd, int64_t offset, .aio_fildes = fd, .aio_type = QEMU_AIO_TRUNCATE, .aio_offset = offset, + .buffer_size = buffer_size, .truncate = { .prealloc = prealloc, .errp = errp, @@ -2664,7 +2672,8 @@ static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset, if (S_ISREG(st.st_mode)) { /* Always resizes to the exact @offset */ - return raw_regular_truncate(bs, s->fd, offset, prealloc, errp); + return raw_regular_truncate(bs, s->fd, offset, prealloc, + DEFAULT_BUFFER_SIZE, errp); } if (prealloc != PREALLOC_MODE_OFF) { @@ -2882,6 +2891,8 @@ raw_co_create(BlockdevCreateOptions *options, Error **errp) int fd; uint64_t perm, shared; int result = 0; + int flags = O_RDWR | O_BINARY; + size_t buffer_size = DEFAULT_BUFFER_SIZE; /* Validate options and set default values */ assert(options->driver == BLOCKDEV_DRIVER_FILE); @@ -2901,9 +2912,19 @@ raw_co_create(BlockdevCreateOptions *options, Error **errp) error_setg(errp, "Extent size hint is too large"); goto out; } + if (!file_opts->cache) { + file_opts->cache = g_strdup("writeback"); + } + if (file_opts->preallocation == PREALLOC_MODE_FULL && + !strcmp(file_opts->cache, "none")) { + flags |= O_DIRECT; + } + if (file_opts->has_buffersize) { + buffer_size = file_opts->buffersize; + } /* Create file */ - fd = qemu_create(file_opts->filename, O_RDWR | O_BINARY, 0644, errp); + fd = qemu_create(file_opts->filename, flags, 0644, errp); if (fd < 0) { result = -errno; goto out; @@ -2938,7 +2959,8 @@ raw_co_create(BlockdevCreateOptions *options, Error **errp) } /* Clear the file by truncating it to 0 */ - result = raw_regular_truncate(NULL, fd, 0, PREALLOC_MODE_OFF, errp); + result = raw_regular_truncate(NULL, fd, 0, PREALLOC_MODE_OFF, + buffer_size, errp); if (result < 0) { goto out_unlock; } @@ -2982,7 +3004,8 @@ raw_co_create(BlockdevCreateOptions *options, Error **errp) /* Resize and potentially preallocate the file to the desired * final size */ result = raw_regular_truncate(NULL, fd, file_opts->size, - file_opts->preallocation, errp); + file_opts->preallocation, + buffer_size, errp); if (result < 0) { goto out_unlock; } @@ -3003,6 +3026,8 @@ out_close: error_setg_errno(errp, -result, "Could not close the new file"); } out: + g_free(file_opts->cache); + file_opts->cache = NULL; return result; } @@ -3018,6 +3043,8 @@ raw_co_create_opts(BlockDriver *drv, const char *filename, PreallocMode prealloc; char *buf = NULL; Error *local_err = NULL; + size_t buffersize = DEFAULT_BUFFER_SIZE; + char *cache = NULL; /* Skip file: protocol prefix */ strstart(filename, "file:", &filename); @@ -3040,6 +3067,21 @@ raw_co_create_opts(BlockDriver *drv, const char *filename, return -EINVAL; } + buffersize = qemu_opt_get_size_del(opts, BLOCK_OPT_BUFFER_SIZE, + DEFAULT_BUFFER_SIZE); + if (buffersize < MIN_BUFFER_SIZE || buffersize > MAX_BUFFER_SIZE) { + error_setg_errno(errp, EINVAL, "Buffer size must be between %d " + "and %d", MIN_BUFFER_SIZE, MAX_BUFFER_SIZE); + return -EINVAL; + } + + cache = qemu_opt_get_del(opts, BLOCK_OPT_CACHE); + if (!cache) { + cache = 
g_strdup("writeback"); + } + + buffersize = ROUND_UP(buffersize, BUFFER_ALIGN_SIZE); + options = (BlockdevCreateOptions) { .driver = BLOCKDEV_DRIVER_FILE, .u.file = { @@ -3051,6 +3093,9 @@ raw_co_create_opts(BlockDriver *drv, const char *filename, .nocow = nocow, .has_extent_size_hint = has_extent_size_hint, .extent_size_hint = extent_size_hint, + .has_buffersize = true, + .buffersize = buffersize, + .cache = cache, }, }; return raw_co_create(&options, errp); @@ -3741,6 +3786,16 @@ static QemuOptsList raw_create_opts = { .type = QEMU_OPT_SIZE, .help = "Extent size hint for the image file, 0 to disable" }, + { + .name = BLOCK_OPT_CACHE, + .type = QEMU_OPT_STRING, + .help = "Cache mode (allowed values: writeback, none)" + }, + { + .name = BLOCK_OPT_BUFFER_SIZE, + .type = QEMU_OPT_SIZE, + .help = "write buffer size" + }, { /* end of list */ } } }; diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h index 4e31d161c5..a6e2436524 100644 --- a/include/block/block_int-common.h +++ b/include/block/block_int-common.h @@ -57,6 +57,8 @@ #define BLOCK_OPT_DATA_FILE_RAW "data_file_raw" #define BLOCK_OPT_COMPRESSION_TYPE "compression_type" #define BLOCK_OPT_EXTL2 "extended_l2" +#define BLOCK_OPT_CACHE "cache" +#define BLOCK_OPT_BUFFER_SIZE "buffer_size" #define BLOCK_PROBE_BUF_SIZE 512 diff --git a/qapi/block-core.json b/qapi/block-core.json index ca390c5700..1444624590 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -4906,6 +4906,8 @@ # # @extent-size-hint: Extent size hint to add to the image file; 0 for # not adding an extent size hint (default: 1 MB, since 5.1) +# @cache: Cache mode used to write the output disk image +# @buffersize: Buffer size for creating image # # Since: 2.12 ## @@ -4914,7 +4916,9 @@ 'size': 'size', '*preallocation': 'PreallocMode', '*nocow': 'bool', - '*extent-size-hint': 'size'} } + '*extent-size-hint': 'size', + '*cache': 'str', + '*buffersize': 'size'} } ## # @BlockdevCreateOptionsGluster: diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx index 068692d13e..20bdcd7b82 100644 --- a/qemu-img-cmds.hx +++ b/qemu-img-cmds.hx @@ -52,9 +52,9 @@ SRST ERST DEF("create", img_create, - "create [--object objectdef] [-q] [-f fmt] [-b backing_file [-F backing_fmt]] [-u] [-o options] filename [size]") + "create [--object objectdef] [-q] [-f fmt] [-b backing_file [-F backing_fmt]] [-u] [-t cache] [-o options] filename [size]") SRST -.. option:: create [--object OBJECTDEF] [-q] [-f FMT] [-b BACKING_FILE [-F BACKING_FMT]] [-u] [-o OPTIONS] FILENAME [SIZE] +.. 
option:: create [--object OBJECTDEF] [-q] [-f FMT] [-b BACKING_FILE [-F BACKING_FMT]] [-u] [-t CACHE] [-o OPTIONS] FILENAME [SIZE] ERST DEF("dd", img_dd, diff --git a/qemu-img.c b/qemu-img.c index 5a77f67719..80adee2620 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -516,6 +516,7 @@ static int img_create(int argc, char **argv) const char *base_fmt = NULL; const char *filename; const char *base_filename = NULL; + const char *cache = BDRV_DEFAULT_CACHE; char *options = NULL; Error *local_err = NULL; bool quiet = false; @@ -527,7 +528,7 @@ static int img_create(int argc, char **argv) {"object", required_argument, 0, OPTION_OBJECT}, {0, 0, 0, 0} }; - c = getopt_long(argc, argv, ":F:b:f:ho:qu", + c = getopt_long(argc, argv, ":F:b:f:t:ho:qu", long_options, NULL); if (c == -1) { break; @@ -551,6 +552,9 @@ static int img_create(int argc, char **argv) case 'f': fmt = optarg; break; + case 't': + cache = optarg; + break; case 'o': if (accumulate_options(&options, optarg) < 0) { goto fail; @@ -594,6 +598,14 @@ static int img_create(int argc, char **argv) error_exit("Unexpected argument: %s", argv[optind]); } + if (!options) { + options = g_strdup_printf(BLOCK_OPT_CACHE"=%s", cache); + } else { + char *old_options = options; + options = g_strdup_printf("%s,"BLOCK_OPT_CACHE"=%s", options, cache); + g_free(old_options); + } + bdrv_img_create(filename, fmt, base_filename, base_fmt, options, img_size, flags, quiet, &local_err); if (local_err) { diff --git a/tests/qemu-iotests/049.out b/tests/qemu-iotests/049.out index 34e1b452e6..b4a9705ec2 100644 --- a/tests/qemu-iotests/049.out +++ b/tests/qemu-iotests/049.out @@ -4,90 +4,90 @@ QA output created by 049 == 1. Traditional size parameter == qemu-img create -f qcow2 TEST_DIR/t.qcow2 1024 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 TEST_DIR/t.qcow2 1024b -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 TEST_DIR/t.qcow2 1k -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 TEST_DIR/t.qcow2 1K -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 TEST_DIR/t.qcow2 1M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1048576 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1048576 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 TEST_DIR/t.qcow2 1G -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 
cluster_size=65536 extended_l2=off compression_type=zlib size=1073741824 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1073741824 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 TEST_DIR/t.qcow2 1T -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1099511627776 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1099511627776 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 TEST_DIR/t.qcow2 1024.0 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 TEST_DIR/t.qcow2 1024.0b -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 TEST_DIR/t.qcow2 1.5k -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1536 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1536 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 TEST_DIR/t.qcow2 1.5K -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1536 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1536 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 TEST_DIR/t.qcow2 1.5M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1572864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1572864 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 TEST_DIR/t.qcow2 1.5G -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1610612736 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1610612736 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 TEST_DIR/t.qcow2 1.5T -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1649267441664 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1649267441664 lazy_refcounts=off refcount_bits=16 cache=writeback == 2. 
Specifying size via -o == qemu-img create -f qcow2 -o size=1024 TEST_DIR/t.qcow2 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o size=1024b TEST_DIR/t.qcow2 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o size=1k TEST_DIR/t.qcow2 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o size=1K TEST_DIR/t.qcow2 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o size=1M TEST_DIR/t.qcow2 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1048576 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1048576 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o size=1G TEST_DIR/t.qcow2 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1073741824 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1073741824 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o size=1T TEST_DIR/t.qcow2 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1099511627776 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1099511627776 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o size=1024.0 TEST_DIR/t.qcow2 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o size=1024.0b TEST_DIR/t.qcow2 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o size=1.5k TEST_DIR/t.qcow2 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1536 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 
cluster_size=65536 extended_l2=off compression_type=zlib size=1536 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o size=1.5K TEST_DIR/t.qcow2 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1536 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1536 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o size=1.5M TEST_DIR/t.qcow2 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1572864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1572864 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o size=1.5G TEST_DIR/t.qcow2 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1610612736 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1610612736 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o size=1.5T TEST_DIR/t.qcow2 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1649267441664 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1649267441664 lazy_refcounts=off refcount_bits=16 cache=writeback == 3. Invalid sizes == @@ -132,84 +132,84 @@ qemu-img: TEST_DIR/t.qcow2: The image size must be specified only once == Check correct interpretation of suffixes for cluster size == qemu-img create -f qcow2 -o cluster_size=1024 TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=1024 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=1024 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o cluster_size=1024b TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=1024 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=1024 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o cluster_size=1k TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=1024 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=1024 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o cluster_size=1K TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=1024 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=1024 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o cluster_size=1M TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=1048576 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 
cluster_size=1048576 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o cluster_size=1024.0 TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=1024 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=1024 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o cluster_size=1024.0b TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=1024 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=1024 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o cluster_size=0.5k TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=512 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=512 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o cluster_size=0.5K TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=512 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=512 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o cluster_size=0.5M TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=524288 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=524288 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 cache=writeback == Check compat level option == qemu-img create -f qcow2 -o compat=0.10 TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=0.10 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=0.10 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o compat=1.1 TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=1.1 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=1.1 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o compat=0.42 TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=0.42 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=0.42 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img: TEST_DIR/t.qcow2: Parameter 'version' does not accept value '0.42' qemu-img create -f qcow2 -o compat=foobar TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=foobar lazy_refcounts=off refcount_bits=16 
+Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=foobar lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img: TEST_DIR/t.qcow2: Parameter 'version' does not accept value 'foobar' == Check preallocation option == qemu-img create -f qcow2 -o preallocation=off TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off preallocation=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off preallocation=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o preallocation=metadata TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off preallocation=metadata compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off preallocation=metadata compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o preallocation=1234 TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off preallocation=1234 compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off preallocation=1234 compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img: TEST_DIR/t.qcow2: Parameter 'preallocation' does not accept value '1234' == Check encryption option == qemu-img create -f qcow2 -o encryption=off TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 encryption=off cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 encryption=off cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 --object secret,id=sec0,data=123456 -o encryption=on,encrypt.key-secret=sec0 TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 encryption=on encrypt.key-secret=sec0 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 encryption=on encrypt.key-secret=sec0 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 cache=writeback == Check lazy_refcounts option (only with v3) == qemu-img create -f qcow2 -o compat=1.1,lazy_refcounts=off TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=1.1 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=1.1 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o compat=1.1,lazy_refcounts=on TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=1.1 lazy_refcounts=on refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=1.1 lazy_refcounts=on refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o compat=0.10,lazy_refcounts=off TEST_DIR/t.qcow2 64M 
-Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=0.10 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=0.10 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o compat=0.10,lazy_refcounts=on TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=0.10 lazy_refcounts=on refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=0.10 lazy_refcounts=on refcount_bits=16 cache=writeback qemu-img: TEST_DIR/t.qcow2: Lazy refcounts only supported with compatibility level 1.1 and above (use version=v3 or greater) == Expect error when backing file name is empty string == diff --git a/tests/qemu-iotests/099.out b/tests/qemu-iotests/099.out index 8cce627529..f6f8f25957 100644 --- a/tests/qemu-iotests/099.out +++ b/tests/qemu-iotests/099.out @@ -1,6 +1,6 @@ QA output created by 099 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=131072 -Formatting 'TEST_DIR/t.IMGFMT.compare', fmt=raw size=131072 +Formatting 'TEST_DIR/t.IMGFMT.compare', fmt=raw size=131072 cache=writeback === Testing simple filename for blkverify === -- Gitee From 9ca9391acb780f15a6d8769339e7cd0edf457529 Mon Sep 17 00:00:00 2001 From: Jinhua Cao Date: Thu, 24 Mar 2022 17:12:49 +0800 Subject: [PATCH 081/939] qemu-img create: 'cache' parameter only used for regular file images The parameter 'cache' is invalid for a host device (/dev/xxx). If 'qemu-img create' is performed on a host device, which does not support 'cache', the create operation would fail. 
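A standalone illustration of the distinction this patch draws (it mirrors the is_reg_file() helper added below; the file names are hypothetical):

    #include <stdbool.h>
    #include <stdio.h>
    #include <sys/stat.h>
    #include <unistd.h>

    /* Only regular files, or paths that do not exist yet, should receive the
     * extra cache/buffer_size creation options. */
    bool looks_like_regular_image(const char *filename)
    {
        struct stat st;

        if (access(filename, F_OK) == -1) {
            return true;                  /* will be created as a regular file */
        }
        return stat(filename, &st) == 0 && S_ISREG(st.st_mode);
    }

    int main(void)
    {
        printf("/dev/null -> %d\n", looks_like_regular_image("/dev/null")); /* 0: device node */
        printf("new.img   -> %d\n", looks_like_regular_image("new.img"));   /* 1: regular (new) file */
        return 0;
    }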
Signed-off-by: Jinhua Cao --- qemu-img.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/qemu-img.c b/qemu-img.c index 80adee2620..49d914c9c4 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -508,6 +508,22 @@ static int64_t cvtnum(const char *name, const char *value) return cvtnum_full(name, value, 0, INT64_MAX); } +static bool is_reg_file(const char *filename) +{ + struct stat st; + + /* file not exist, file will be create later, so it's a reg file */ + if (access(filename, F_OK) == -1) { + return true; + } + + /* file exist, check file type */ + if (stat(filename, &st) >= 0 && S_ISREG(st.st_mode)) { + return true; + } + return false; +} + static int img_create(int argc, char **argv) { int c; @@ -598,12 +614,14 @@ static int img_create(int argc, char **argv) error_exit("Unexpected argument: %s", argv[optind]); } - if (!options) { - options = g_strdup_printf(BLOCK_OPT_CACHE"=%s", cache); - } else { - char *old_options = options; - options = g_strdup_printf("%s,"BLOCK_OPT_CACHE"=%s", options, cache); - g_free(old_options); + if (is_reg_file(filename)) { + if (!options) { + options = g_strdup_printf(BLOCK_OPT_CACHE"=%s", cache); + } else { + char *old_options = options; + options = g_strdup_printf("%s,"BLOCK_OPT_CACHE"=%s", options, cache); + g_free(old_options); + } } bdrv_img_create(filename, fmt, base_filename, base_fmt, -- Gitee From 6588c017de54bab8a11509d43e2ddabf065cfa50 Mon Sep 17 00:00:00 2001 From: jiangdongxu Date: Thu, 10 Feb 2022 21:50:28 +0800 Subject: [PATCH 082/939] bugfix: fix eventfds may double free when vm_id reused in ivshmem As the ivshmem Server-Client Protol describes, when a client disconnects from the server, server sends disconnect notifications to the other clients. And the other clients will free the eventfds of the disconnected client according to the client ID. If the client ID is reused, the eventfds may be double freed. It will be solved by setting eventfds to NULL after freeing and allocating memory for it when it's used. Signed-off-by: Peng Liang Signed-off-by: jiangdongxu Signed-off-by: Adttil --- hw/misc/ivshmem.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/hw/misc/ivshmem.c b/hw/misc/ivshmem.c index 0447888029..ad9a3c546e 100644 --- a/hw/misc/ivshmem.c +++ b/hw/misc/ivshmem.c @@ -400,6 +400,7 @@ static void close_peer_eventfds(IVShmemState *s, int posn) } g_free(s->peers[posn].eventfds); + s->peers[posn].eventfds = NULL; s->peers[posn].nb_eventfds = 0; } @@ -533,6 +534,10 @@ static void process_msg_connect(IVShmemState *s, uint16_t posn, int fd, close(fd); return; } + if (peer->eventfds == NULL) { + peer->eventfds = g_new0(EventNotifier, s->vectors); + peer->nb_eventfds = 0; + } vector = peer->nb_eventfds++; IVSHMEM_DPRINTF("eventfds[%d][%d] = %d\n", posn, vector, fd); -- Gitee From 9d683f1ea8961d89cececf1fdc3345663744067f Mon Sep 17 00:00:00 2001 From: Yan Wang Date: Tue, 8 Feb 2022 15:48:01 +0800 Subject: [PATCH 083/939] log: Add some logs on VM runtime path Add logs on VM runtime path, to make it easier to do trouble shooting. 
Signed-off-by: Ying Fang Signed-off-by: Yan Wang Signed-off-by: Adttil --- hw/virtio/virtio-pci.c | 2 ++ hw/virtio/virtio.c | 14 ++++++++++++-- monitor/monitor.c | 9 +++++++++ qapi/qmp-dispatch.c | 15 +++++++++++++++ system/qdev-monitor.c | 4 +++- 5 files changed, 41 insertions(+), 3 deletions(-) diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index e433879542..134a8eaef6 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -2082,7 +2082,9 @@ static void virtio_pci_device_unplugged(DeviceState *d) VirtIOPCIProxy *proxy = VIRTIO_PCI(d); bool modern = virtio_pci_modern(proxy); bool modern_pio = proxy->flags & VIRTIO_PCI_FLAG_MODERN_PIO_NOTIFY; + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + qemu_log("unplug device name: %s\n", !vdev ? "NULL" : vdev->name); virtio_pci_stop_ioeventfd(proxy); if (modern) { diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 3a160f86ed..a9aa0c4f66 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -2048,7 +2048,14 @@ int virtio_set_status(VirtIODevice *vdev, uint8_t val) k->set_status(vdev, val); } vdev->status = val; - + if (val) { + qemu_log("%s device status is %d that means %s\n", + vdev->name, val, + (val & VIRTIO_CONFIG_S_DRIVER_OK) ? "DRIVER OK" : + (val & VIRTIO_CONFIG_S_DRIVER) ? "DRIVER" : + (val & VIRTIO_CONFIG_S_ACKNOWLEDGE) ? "ACKNOWLEDGE" : + (val & VIRTIO_CONFIG_S_FAILED) ? "FAILED" : "UNKNOWN"); + } return 0; } @@ -2326,8 +2333,11 @@ VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size, break; } - if (i == VIRTIO_QUEUE_MAX || queue_size > VIRTQUEUE_MAX_SIZE) + if (i == VIRTIO_QUEUE_MAX || queue_size > VIRTQUEUE_MAX_SIZE) { + qemu_log("unacceptable queue_size (%d) or num (%d)\n", + queue_size, i); abort(); + } vdev->vq[i].vring.num = queue_size; vdev->vq[i].vring.num_default = queue_size; diff --git a/monitor/monitor.c b/monitor/monitor.c index 01ede1babd..e540c1334a 100644 --- a/monitor/monitor.c +++ b/monitor/monitor.c @@ -29,6 +29,7 @@ #include "qapi/qapi-emit-events.h" #include "qapi/qapi-visit-control.h" #include "qapi/qmp/qdict.h" +#include "qapi/qmp/qjson.h" #include "qemu/error-report.h" #include "qemu/option.h" #include "sysemu/qtest.h" @@ -338,6 +339,7 @@ static void monitor_qapi_event_emit(QAPIEvent event, QDict *qdict) { Monitor *mon; MonitorQMP *qmp_mon; + GString *json; trace_monitor_protocol_event_emit(event, qdict); QTAILQ_FOREACH(mon, &mon_list, entry) { @@ -348,6 +350,13 @@ static void monitor_qapi_event_emit(QAPIEvent event, QDict *qdict) qmp_mon = container_of(mon, MonitorQMP, common); if (qmp_mon->commands != &qmp_cap_negotiation_commands) { qmp_send_response(qmp_mon, qdict); + json = qobject_to_json(QOBJECT(qdict)); + if (json) { + if (!strstr(json->str, "RTC_CHANGE")) { + qemu_log("%s\n", json->str); + } + g_string_free(json, true); + } } } } diff --git a/qapi/qmp-dispatch.c b/qapi/qmp-dispatch.c index 555528b6bb..7a215cbfd7 100644 --- a/qapi/qmp-dispatch.c +++ b/qapi/qmp-dispatch.c @@ -24,6 +24,7 @@ #include "qapi/qmp/qbool.h" #include "qemu/coroutine.h" #include "qemu/main-loop.h" +#include "qemu/log.h" Visitor *qobject_input_visitor_new_qmp(QObject *obj) { @@ -146,6 +147,7 @@ QDict *coroutine_mixed_fn qmp_dispatch(const QmpCommandList *cmds, QObject *requ QObject *id; QObject *ret = NULL; QDict *rsp = NULL; + GString *json; dict = qobject_to(QDict, request); if (!dict) { @@ -203,6 +205,19 @@ QDict *coroutine_mixed_fn qmp_dispatch(const QmpCommandList *cmds, QObject *requ qobject_ref(args); } + json = qobject_to_json(QOBJECT(args)); + if (json) { + if 
((strcmp(command, "query-block-jobs") != 0) + && (strcmp(command, "query-migrate") != 0) + && (strcmp(command, "query-blockstats") != 0) + && (strcmp(command, "query-balloon") != 0) + && (strcmp(command, "set_password") != 0)) { + qemu_log("qmp_cmd_name: %s, arguments: %s\n", + command, json->str); + } + g_string_free(json, true); + } + assert(!(oob && qemu_in_coroutine())); assert(monitor_cur() == NULL); if (!!(cmd->options & QCO_COROUTINE) == qemu_in_coroutine()) { diff --git a/system/qdev-monitor.c b/system/qdev-monitor.c index a13db763e5..c885175b66 100644 --- a/system/qdev-monitor.c +++ b/system/qdev-monitor.c @@ -36,6 +36,7 @@ #include "qemu/option.h" #include "qemu/qemu-print.h" #include "qemu/option_int.h" +#include "qemu/log.h" #include "sysemu/block-backend.h" #include "migration/misc.h" #include "migration/migration.h" @@ -643,6 +644,7 @@ DeviceState *qdev_device_add_from_qdict(const QDict *opts, if (path != NULL) { bus = qbus_find(path, errp); if (!bus) { + error_setg(errp, "can not find bus for %s", driver); return NULL; } if (!object_dynamic_cast(OBJECT(bus), dc->bus_type)) { @@ -715,7 +717,7 @@ DeviceState *qdev_device_add_from_qdict(const QDict *opts, if (*errp) { goto err_del_dev; } - + qemu_log("add qdev %s:%s success\n", driver, dev->id ? dev->id : "none"); if (!qdev_realize(dev, bus, errp)) { goto err_del_dev; } -- Gitee From 28763d8df34c20cab60baec8f4f5615cbea8c0df Mon Sep 17 00:00:00 2001 From: Yan Wang Date: Fri, 11 Feb 2022 18:20:59 +0800 Subject: [PATCH 084/939] util/log: add CONFIG_DISABLE_QEMU_LOG macro Using CONFIG_DISABLE_QEMU_LOG macro to control qemu_log function. Signed-off-by: Yan Wang Signed-off-by: Adttil --- util/log.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/util/log.c b/util/log.c index d36c98da0b..78b6cf225f 100644 --- a/util/log.c +++ b/util/log.c @@ -143,6 +143,12 @@ void qemu_log_unlock(FILE *logfile) } } +#ifdef CONFIG_DISABLE_QEMU_LOG +void qemu_log(const char *fmt, ...) +{ + return; +} +#else void qemu_log(const char *fmt, ...) { FILE *f = qemu_log_trylock(); @@ -155,6 +161,7 @@ void qemu_log(const char *fmt, ...) 
qemu_log_unlock(f); } } +#endif static void __attribute__((__constructor__)) startup(void) { -- Gitee From 35054aa25a0d7758a35d75e3298555b502e37b0f Mon Sep 17 00:00:00 2001 From: jiangdongxu Date: Thu, 10 Feb 2022 21:32:37 +0800 Subject: [PATCH 085/939] bugfix: fix some illegal memory access and memory leak Signed-off-by: yuxiating Signed-off-by: jiangdongxu Signed-off-by: Adttil --- util/range.c | 1 + 1 file changed, 1 insertion(+) diff --git a/util/range.c b/util/range.c index f3f40098d5..2ea640662b 100644 --- a/util/range.c +++ b/util/range.c @@ -61,6 +61,7 @@ GList *range_list_insert(GList *list, Range *data) range_extend(l->data, l->next->data); g_free(l->next->data); new_l = g_list_delete_link(list, l->next); + l->next = NULL; assert(new_l == list); } -- Gitee From e6a20580801314e9d47682d7b8d8161c030eab04 Mon Sep 17 00:00:00 2001 From: jiangdongxu Date: Thu, 10 Feb 2022 22:12:50 +0800 Subject: [PATCH 086/939] bugfix: fix possible memory leak Signed-off-by: caojinhua Signed-off-by: jiangdongxu Signed-off-by: Adttil --- migration/savevm.c | 2 ++ qga/main.c | 18 +++++++++++++----- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/migration/savevm.c b/migration/savevm.c index eec5503a42..477a19719f 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -1553,6 +1553,7 @@ int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, ret = vmstate_save(f, se, vmdesc); if (ret) { qemu_file_set_error(f, ret); + json_writer_free(vmdesc); return ret; } @@ -1572,6 +1573,7 @@ int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, migrate_set_error(ms, local_err); error_report_err(local_err); qemu_file_set_error(f, ret); + json_writer_free(vmdesc); return ret; } } diff --git a/qga/main.c b/qga/main.c index 8668b9f3d3..c4dcbb86be 100644 --- a/qga/main.c +++ b/qga/main.c @@ -1399,7 +1399,7 @@ static GAState *initialize_agent(GAConfig *config, int socket_activation) if (g_mkdir_with_parents(config->state_dir, S_IRWXU) == -1) { g_critical("unable to create (an ancestor of) the state directory" " '%s': %s", config->state_dir, strerror(errno)); - return NULL; + goto failed; } #endif @@ -1424,7 +1424,7 @@ static GAState *initialize_agent(GAConfig *config, int socket_activation) if (!log_file) { g_critical("unable to open specified log file: %s", strerror(errno)); - return NULL; + goto failed; } s->log_file = log_file; } @@ -1435,7 +1435,7 @@ static GAState *initialize_agent(GAConfig *config, int socket_activation) s->pstate_filepath, ga_is_frozen(s))) { g_critical("failed to load persistent state"); - return NULL; + goto failed; } if (config->allowedrpcs) { @@ -1465,7 +1465,7 @@ static GAState *initialize_agent(GAConfig *config, int socket_activation) #ifndef _WIN32 if (!register_signal_handlers()) { g_critical("failed to register signal handlers"); - return NULL; + goto failed; } #endif @@ -1478,12 +1478,20 @@ static GAState *initialize_agent(GAConfig *config, int socket_activation) s->wakeup_event = CreateEvent(NULL, TRUE, FALSE, TEXT("WakeUp")); if (s->wakeup_event == NULL) { g_critical("CreateEvent failed"); - return NULL; + goto failed; } #endif ga_state = s; return s; +failed: + g_free(s->pstate_filepath); + g_free(s->state_filepath_isfrozen); + if (s->log_file) { + fclose(s->log_file); + } + g_free(s); + return NULL; } static void cleanup_agent(GAState *s) -- Gitee From 65435e107fc8eee37c61a3a7d1adebd013ad466f Mon Sep 17 00:00:00 2001 From: Ming Yang Date: Sat, 23 Mar 2024 16:18:03 +0800 Subject: [PATCH 087/939] memory: [backup] Modify the VM's physical bits value 
set policy. backup code from qemu-6.2 to qemu-8.2 old info: commit id : a09c3928b33b0c53831bd9eeb56f8171c26057bc messages: target-i386: Modify the VM's physical bits value set policy. To resolve the problem that a VM with large memory capacity fails to be live migrated, determine whether the VM is a large memory capacity based on the memory size (4 TB). If yes, set the bus width of the VM address to 46 bits. If no, set the bus width to 42 bits. Signed-off-by: Jinhua Cao Signed-off-by: Jiajie Li Signed-off-by: Ming Yang --- target/i386/cpu.c | 20 +++++++++++++++++++- target/i386/cpu.h | 6 ++++++ target/i386/host-cpu.c | 13 +++++++------ 3 files changed, 32 insertions(+), 7 deletions(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index a66e5a357b..fc61a84b1e 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -7666,6 +7666,24 @@ static void x86_cpu_set_pc(CPUState *cs, vaddr value) cpu->env.eip = value; } + +/* At present, we check the vm is *LARGE* or not, i.e. whether + * the memory size is more than 4T or not. + */ +const uint64_t large_vm_mem_size = 0x40000000000UL; +void x86_cpu_adjuest_by_ram_size(ram_addr_t ram_size, X86CPU *cpu) +{ + /* If there is not a large vm, we set the phys_bits to 42 bits, + * otherwise, we increase the phys_bits to 46 bits. + */ + if (ram_size < large_vm_mem_size) { + cpu->phys_bits = DEFAULT_VM_CPU_PHYS_BITS; + } else { + cpu->phys_bits = LARGE_VM_CPU_PHYS_BITS; + cpu->fill_mtrr_mask = true; + } +} + static vaddr x86_cpu_get_pc(CPUState *cs) { X86CPU *cpu = X86_CPU(cs); @@ -7868,7 +7886,7 @@ static Property x86_cpu_properties[] = { DEFINE_PROP_UINT32("phys-bits", X86CPU, phys_bits, 0), DEFINE_PROP_BOOL("host-phys-bits", X86CPU, host_phys_bits, false), DEFINE_PROP_UINT8("host-phys-bits-limit", X86CPU, host_phys_bits_limit, 0), - DEFINE_PROP_BOOL("fill-mtrr-mask", X86CPU, fill_mtrr_mask, true), + DEFINE_PROP_BOOL("fill-mtrr-mask", X86CPU, fill_mtrr_mask, false), DEFINE_PROP_UINT32("level-func7", X86CPU, env.cpuid_level_func7, UINT32_MAX), DEFINE_PROP_UINT32("level", X86CPU, env.cpuid_level, UINT32_MAX), diff --git a/target/i386/cpu.h b/target/i386/cpu.h index ef987f344c..6993552cd9 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -24,6 +24,7 @@ #include "cpu-qom.h" #include "kvm/hyperv-proto.h" #include "exec/cpu-defs.h" +#include "exec/cpu-common.h" #include "qapi/qapi-types-common.h" #include "qemu/cpu-float.h" #include "qemu/timer.h" @@ -2081,6 +2082,11 @@ struct X86CPUClass { extern const VMStateDescription vmstate_x86_cpu; #endif +#define DEFAULT_VM_CPU_PHYS_BITS 42 +#define LARGE_VM_CPU_PHYS_BITS 46 + +void x86_cpu_adjuest_by_ram_size(ram_addr_t ram_size, X86CPU *cpu); + int x86_cpu_pending_interrupt(CPUState *cs, int interrupt_request); int x86_cpu_write_elf64_note(WriteCoreDumpFunction f, CPUState *cpu, diff --git a/target/i386/host-cpu.c b/target/i386/host-cpu.c index 92ecb7254b..07738bf857 100644 --- a/target/i386/host-cpu.c +++ b/target/i386/host-cpu.c @@ -13,6 +13,7 @@ #include "qapi/error.h" #include "qemu/error-report.h" #include "sysemu/sysemu.h" +#include "hw/boards.h" /* Note: Only safe for use on x86(-64) hosts */ static uint32_t host_cpu_phys_bits(void) @@ -57,14 +58,14 @@ static uint32_t host_cpu_adjust_phys_bits(X86CPU *cpu) uint32_t phys_bits = cpu->phys_bits; static bool warned; - /* - * Print a warning if the user set it to a value that's not the - * host value. - */ - if (phys_bits != host_phys_bits && phys_bits != 0 && + /* adjust x86 cpu phys_bits according to ram_size. 
*/ + x86_cpu_adjuest_by_ram_size(current_machine->ram_size, cpu); + + /* Print a warning if the host value less than the user set. */ + if (phys_bits > host_phys_bits && phys_bits != 0 && !warned) { warn_report("Host physical bits (%u)" - " does not match phys-bits property (%u)", + " less than phys-bits property (%u)", host_phys_bits, phys_bits); warned = true; } -- Gitee From c28455a0bac4bbf171d1f19e162557377a85e96c Mon Sep 17 00:00:00 2001 From: Ming Yang Date: Sat, 23 Mar 2024 16:32:46 +0800 Subject: [PATCH 088/939] [backup] memory: bakcup hugepages: hugepages files maybe leftover old info: commit id: 3cb1b0ce091998532a30793e3272925da4e6f3aa old messages: hugepages: hugepages files maybe leftover Before qemu uses the hugepage memory directory /dev/hugepages/libvirt/qemu/xxx, The directory may be deleted because of the destroy virtual machine. Cause qemu to create files directly under /dev/hugepages/libvirt/qemu/. After the file is created, the file is not cleaned up by unlink, and when the virtual machine is destroyed, libvirt will only clean up /dev/hugepages/libvirt/qemu/xxx directory. After creating the hugepage file, execute unlink to clean up the file to fix the problem. Signed-off-by: Jinhua Cao Signed-off-by: Jiajie Li Signed-off-by: Ming Yang --- include/qemu/mmap-alloc.h | 4 ++++ system/physmem.c | 9 ++++++++- util/mmap-alloc.c | 22 ++++++++++++++++++++++ 3 files changed, 34 insertions(+), 1 deletion(-) diff --git a/include/qemu/mmap-alloc.h b/include/qemu/mmap-alloc.h index 8344daaa03..63e4edfd2f 100644 --- a/include/qemu/mmap-alloc.h +++ b/include/qemu/mmap-alloc.h @@ -1,6 +1,10 @@ #ifndef QEMU_MMAP_ALLOC_H #define QEMU_MMAP_ALLOC_H +#define HUGETLBFS_MAGIC 0x958458f6 + +size_t qemu_fd_getfiletype(int fd); + typedef enum { QEMU_FS_TYPE_UNKNOWN = 0, QEMU_FS_TYPE_TMPFS, diff --git a/system/physmem.c b/system/physmem.c index a63853a7bc..f14d64819b 100644 --- a/system/physmem.c +++ b/system/physmem.c @@ -1329,7 +1329,14 @@ static int file_ram_open(const char *path, /* @path names a file that doesn't exist, create it */ fd = open(path, O_RDWR | O_CREAT | O_EXCL, 0644); if (fd >= 0) { - *created = true; + info_report("open %s success \n", path); + /* if fd file type is HUGETLBFS_MAGIC, unlink it, */ + /* in case to prevent residue after qemu killed */ + if (qemu_fd_getfiletype(fd) == HUGETLBFS_MAGIC) { + unlink(path); + } else { + *created = true; + } break; } } else if (errno == EISDIR) { diff --git a/util/mmap-alloc.c b/util/mmap-alloc.c index ed14f9c64d..6890ad676c 100644 --- a/util/mmap-alloc.c +++ b/util/mmap-alloc.c @@ -30,6 +30,28 @@ #include #endif +size_t qemu_fd_getfiletype(int fd) +{ + struct statfs fs; + int ret; + + if (fd != -1) { + do { + ret = fstatfs(fd, &fs); + } while (ret != 0 && errno == EINTR); + + if (ret != 0) { + fprintf(stderr, "Couldn't fstatfs() fd: %s\n", + strerror(errno)); + return -1; + } + return fs.f_type; + } else { + fprintf(stderr, "fd is invalid \n"); + return -1; + } +} + QemuFsType qemu_fd_getfs(int fd) { #ifdef CONFIG_LINUX -- Gitee From c03415f3b75e6a37c7eb392ef62bf92b94267b4d Mon Sep 17 00:00:00 2001 From: gaojiazhen Date: Mon, 25 Mar 2024 17:26:52 +0800 Subject: [PATCH 089/939] travis-ci: Rename SOFTMMU -> SYSTEM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cheery-pick from 47833f817cc597db124c690bd14600bb5d00e824 Since we *might* have user emulation with softmmu, rename MAIN_SOFTMMU_TARGETS as MAIN_SYSTEM_TARGETS to express 'system emulation targets'. 
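Relatedly, the hugetlbfs patch earlier in this series ("hugepages: hugepages files maybe leftover") relies on the classic create-then-unlink idiom: once the file is open, the descriptor and any mapping made from it keep the hugepage memory alive, so the directory entry can be removed immediately and nothing is left behind if QEMU is killed. A self-contained sketch of that idiom with an illustrative path and page size; this is not the QEMU code, which performs the check inside file_ram_open():

    #include <stdio.h>
    #include <stdlib.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/vfs.h>
    #include <sys/mman.h>

    #define HUGETLBFS_MAGIC 0x958458f6  /* same value the patch defines */

    int main(void)
    {
        const char *path = "/dev/hugepages/example";   /* illustrative path */
        size_t len = 2 * 1024 * 1024;                  /* one 2M hugepage */
        struct statfs fs;

        int fd = open(path, O_RDWR | O_CREAT | O_EXCL, 0644);
        if (fd < 0) {
            perror("open");
            return 1;
        }
        /* If the file really lives on hugetlbfs, unlink it right away:
         * the fd and the mapping keep the memory, but no file remains
         * in the directory if the process dies. */
        if (fstatfs(fd, &fs) == 0 && fs.f_type == HUGETLBFS_MAGIC) {
            unlink(path);
        }
        if (ftruncate(fd, len) < 0) {
            perror("ftruncate");
            return 1;
        }
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED) {
            perror("mmap");
            return 1;
        }
        /* ... use the mapping ... */
        munmap(p, len);
        close(fd);
        return 0;
    }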
Signed-off-by: Philippe Mathieu-Daudé Message-ID: <20240313213339.82071-3-philmd@linaro.org> Reviewed-by: Thomas Huth Reviewed-by: Richard Henderson Signed-off-by: Thomas Huth Signed-off-by: Gao Jiazhen --- .travis.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index 76859d48da..597d151b80 100644 --- a/.travis.yml +++ b/.travis.yml @@ -35,7 +35,7 @@ env: - TEST_BUILD_CMD="" - TEST_CMD="make check V=1" # This is broadly a list of "mainline" system targets which have support across the major distros - - MAIN_SOFTMMU_TARGETS="aarch64-softmmu,mips64-softmmu,ppc64-softmmu,riscv64-softmmu,s390x-softmmu,x86_64-softmmu" + - MAIN_SYSTEM_TARGETS="aarch64-softmmu,mips64-softmmu,ppc64-softmmu,riscv64-softmmu,s390x-softmmu,x86_64-softmmu" - CCACHE_SLOPPINESS="include_file_ctime,include_file_mtime" - CCACHE_MAXSIZE=1G - G_MESSAGES_DEBUG=error @@ -114,7 +114,7 @@ jobs: env: - TEST_CMD="make check check-tcg V=1" - CONFIG="--disable-containers --enable-fdt=system - --target-list=${MAIN_SOFTMMU_TARGETS} --cxx=/bin/false" + --target-list=${MAIN_SYSTEM_TARGETS} --cxx=/bin/false" - UNRELIABLE=true - name: "[ppc64] GCC check-tcg" @@ -185,7 +185,7 @@ jobs: env: - TEST_CMD="make check check-tcg V=1" - CONFIG="--disable-containers --enable-fdt=system - --target-list=${MAIN_SOFTMMU_TARGETS},s390x-linux-user" + --target-list=${MAIN_SYSTEM_TARGETS},s390x-linux-user" - UNRELIABLE=true script: - BUILD_RC=0 && make -j${JOBS} || BUILD_RC=$? @@ -226,7 +226,7 @@ jobs: - genisoimage env: - CONFIG="--disable-containers --enable-fdt=system --audio-drv-list=sdl - --disable-user --target-list-exclude=${MAIN_SOFTMMU_TARGETS}" + --disable-user --target-list-exclude=${MAIN_SYSTEM_TARGETS}" - name: "[s390x] GCC (user)" arch: s390x -- Gitee From e2a4aed3ef07b05302ab4d15017b720fec97905f Mon Sep 17 00:00:00 2001 From: gaojiazhen Date: Mon, 25 Mar 2024 18:04:40 +0800 Subject: [PATCH 090/939] ppc/pnv: I2C controller is not user creatablei MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cherry picked from commit 5b2b9450a2f83668bedd092b43233ad35f0d40bd The I2C controller is a subunit of the processor. Make it so and avoid QEMU crashes. $ build/qemu-system-ppc64 -S -machine powernv9 -device pnv-i2c qemu-system-ppc64: ../hw/ppc/pnv_i2c.c:521: pnv_i2c_realize: Assertion `i2c->chip' failed. 
Aborted (core dumped) Fixes: 263b81e ("ppc/pnv: Add an I2C controller model") Cc: Glenn Miles Reported-by: Thomas Huth Reviewed-by: Thomas Huth Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Glenn Miles Signed-off-by: Cédric Le Goater Signed-off-by: Gao Jiazhen --- hw/ppc/pnv_i2c.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hw/ppc/pnv_i2c.c b/hw/ppc/pnv_i2c.c index 656a48eebe..0ac6aa5c06 100644 --- a/hw/ppc/pnv_i2c.c +++ b/hw/ppc/pnv_i2c.c @@ -673,6 +673,9 @@ static void pnv_i2c_class_init(ObjectClass *klass, void *data) xscomc->dt_xscom = pnv_i2c_dt_xscom; + /* Reason: This device is part of the CPU and cannot be used separately */ + dc->user_creatable = false; + dc->desc = "PowerNV I2C"; dc->realize = pnv_i2c_realize; device_class_set_props(dc, pnv_i2c_properties); -- Gitee From 7212ca27f0dc957f83fe29858430ee2927e0175c Mon Sep 17 00:00:00 2001 From: root Date: Mon, 25 Mar 2024 21:31:32 +0800 Subject: [PATCH 091/939] =?UTF-8?q?iotests:=20adapt=20to=20output=20change?= =?UTF-8?q?=20for=20recently=20introduced=20'detached=20hea=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cheery-pick from 39a94d7c34ce9d222fa9c0c99a14e20a567456d7 …der' field Failure was noticed when running the tests for the qcow2 image format. Fixes: 0bd779e ("crypto: Introduce 'detached-header' field in QCryptoBlockInfoLUKS") Signed-off-by: Fiona Ebner Message-ID: <20240216101415.293769-1-f.ebner@proxmox.com> Reviewed-by: Daniel P. Berrangé Reviewed-by: Kevin Wolf Signed-off-by: Kevin Wolf Signed-off-by: Gao Jiazhen --- tests/qemu-iotests/198.out | 2 ++ tests/qemu-iotests/206.out | 1 + 2 files changed, 3 insertions(+) diff --git a/tests/qemu-iotests/198.out b/tests/qemu-iotests/198.out index 805494916f..62fb73fa3e 100644 --- a/tests/qemu-iotests/198.out +++ b/tests/qemu-iotests/198.out @@ -39,6 +39,7 @@ Format specific information: compression type: COMPRESSION_TYPE encrypt: ivgen alg: plain64 + detached header: false hash alg: sha256 cipher alg: aes-256 uuid: 00000000-0000-0000-0000-000000000000 @@ -84,6 +85,7 @@ Format specific information: compression type: COMPRESSION_TYPE encrypt: ivgen alg: plain64 + detached header: false hash alg: sha256 cipher alg: aes-256 uuid: 00000000-0000-0000-0000-000000000000 diff --git a/tests/qemu-iotests/206.out b/tests/qemu-iotests/206.out index 7e95694777..979f00f9bf 100644 --- a/tests/qemu-iotests/206.out +++ b/tests/qemu-iotests/206.out @@ -114,6 +114,7 @@ Format specific information: refcount bits: 16 encrypt: ivgen alg: plain64 + detached header: false hash alg: sha1 cipher alg: aes-128 uuid: XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX -- Gitee From 4506b31c0fff0b7a69ec4c7e264715ed70df75a8 Mon Sep 17 00:00:00 2001 From: gaojiazhen Date: Mon, 25 Mar 2024 22:13:43 +0800 Subject: [PATCH 092/939] migration: Skip only empty block devicesi MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cheery-pick from 2e128776dc56f502c2ee41750afe83938f389528 The block .save_setup() handler calls a helper routine init_blk_migration() which builds a list of block devices to take into account for migration. When one device is found to be empty (sectors == 0), the loop exits and all the remaining devices are ignored. This is a regression introduced when bdrv_iterate() was removed. Change that by skipping only empty devices. 
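In other words, a zero sector count is a normal condition (an empty device) while only a negative count is an error, and only the latter should end the scan. A tiny standalone illustration of that skip-versus-abort distinction, using an array of fake sector counts in place of the real bdrv_next() iteration:

    #include <stdio.h>

    /* Fake per-device sector counts: 0 = empty device, <0 = error. */
    static long long sectors_of[] = { 2048, 0, 4096, 0, 1024 };

    int main(void)
    {
        int n = sizeof(sectors_of) / sizeof(sectors_of[0]);

        for (int i = 0; i < n; i++) {
            long long sectors = sectors_of[i];

            if (sectors == 0) {
                continue;               /* empty: skip it, keep scanning */
            }
            if (sectors < 0) {
                fprintf(stderr, "device %d: error %lld\n", i, sectors);
                return 1;               /* real error: abort setup */
            }
            printf("will migrate device %d (%lld sectors)\n", i, sectors);
        }
        return 0;
    }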
Cc: Markus Armbruster Cc: qemu-stable Suggested-by: Kevin Wolf Fixes: fea68bb ("block: Eliminate bdrv_iterate(), use bdrv_next()") Signed-off-by: Cédric Le Goater Reviewed-by: Stefan Hajnoczi Reviewed-by: Kevin Wolf Link: https://lore.kernel.org/r/20240312120431.550054-1-clg@redhat.com [peterx: fix "Suggested-by:"] Signed-off-by: Peter Xu Signed-off-by: Gao Jiazhen --- migration/block.c | 5 ++++- tests/qemu-iotests/198.out | 2 -- tests/qemu-iotests/206.out | 1 - 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/migration/block.c b/migration/block.c index a15f9bddcb..710ef6f490 100644 --- a/migration/block.c +++ b/migration/block.c @@ -409,7 +409,10 @@ static int init_blk_migration(QEMUFile *f) } sectors = bdrv_nb_sectors(bs); - if (sectors <= 0) { + if (sectors == 0) { + continue; + } + if (sectors < 0) { ret = sectors; bdrv_next_cleanup(&it); goto out; diff --git a/tests/qemu-iotests/198.out b/tests/qemu-iotests/198.out index 62fb73fa3e..805494916f 100644 --- a/tests/qemu-iotests/198.out +++ b/tests/qemu-iotests/198.out @@ -39,7 +39,6 @@ Format specific information: compression type: COMPRESSION_TYPE encrypt: ivgen alg: plain64 - detached header: false hash alg: sha256 cipher alg: aes-256 uuid: 00000000-0000-0000-0000-000000000000 @@ -85,7 +84,6 @@ Format specific information: compression type: COMPRESSION_TYPE encrypt: ivgen alg: plain64 - detached header: false hash alg: sha256 cipher alg: aes-256 uuid: 00000000-0000-0000-0000-000000000000 diff --git a/tests/qemu-iotests/206.out b/tests/qemu-iotests/206.out index 979f00f9bf..7e95694777 100644 --- a/tests/qemu-iotests/206.out +++ b/tests/qemu-iotests/206.out @@ -114,7 +114,6 @@ Format specific information: refcount bits: 16 encrypt: ivgen alg: plain64 - detached header: false hash alg: sha1 cipher alg: aes-128 uuid: XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX -- Gitee From 59f038d21c1901245ba0be417f6285cec465d6c1 Mon Sep 17 00:00:00 2001 From: Chuan Zheng Date: Wed, 9 Feb 2022 11:24:32 +0800 Subject: [PATCH 093/939] Currently, while kvm and qemu can not handle some kvm exit, qemu will do vm_stop, which will make vm in pause state. This action make vm unrecoverable, so send guest panic to libvirt instead. 
--- accel/kvm/kvm-all.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index e39a810a4e..33f4c6d547 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -2993,7 +2993,7 @@ int kvm_cpu_exec(CPUState *cpu) if (ret < 0) { cpu_dump_state(cpu, stderr, CPU_DUMP_CODE); - vm_stop(RUN_STATE_INTERNAL_ERROR); + qemu_system_guest_panicked(cpu_get_crash_info(cpu)); } qatomic_set(&cpu->exit_request, 0); -- Gitee From 0a54d68547df3f276dc242b52d54e8549d0a84a0 Mon Sep 17 00:00:00 2001 From: Chuan Zheng Date: Wed, 9 Feb 2022 11:21:28 +0800 Subject: [PATCH 094/939] ps2: fix oob in ps2 kbd fix oob in ps2 kbd --- hw/input/ps2.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/input/ps2.c b/hw/input/ps2.c index c8fd23cf36..b647561069 100644 --- a/hw/input/ps2.c +++ b/hw/input/ps2.c @@ -167,7 +167,7 @@ void ps2_queue_noirq(PS2State *s, int b) } q->data[q->wptr] = b; - if (++q->wptr == PS2_BUFFER_SIZE) { + if (++q->wptr >= PS2_BUFFER_SIZE) { q->wptr = 0; } q->count++; @@ -557,7 +557,7 @@ uint32_t ps2_read_data(PS2State *s) val = q->data[index]; } else { val = q->data[q->rptr]; - if (++q->rptr == PS2_BUFFER_SIZE) { + if (++q->rptr >= PS2_BUFFER_SIZE) { q->rptr = 0; } q->count--; -- Gitee From c6b183a4c3c63454dea39be26b0fb773ec04887e Mon Sep 17 00:00:00 2001 From: Chuan Zheng Date: Wed, 9 Feb 2022 14:13:05 +0800 Subject: [PATCH 095/939] monitor/qmp: drop inflight rsp if qmp client broken If libvirt restart while qemu is handle qmp message, libvirt will reconnect qemu monitor socket, and query status of qemu by qmp. But qemu may return last qmp respond to new connect socket, and libvirt recv unexpected respond, So libvirt think qemu is abnormal, and will kill qemu. This patch add qmp connect id, while reconnect id will change. While respond to libvirt, judge if id is same, if not, drop this respond. --- monitor/monitor-internal.h | 1 + monitor/qmp.c | 19 +++++++++++-------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/monitor/monitor-internal.h b/monitor/monitor-internal.h index 252de85681..d7842fa464 100644 --- a/monitor/monitor-internal.h +++ b/monitor/monitor-internal.h @@ -144,6 +144,7 @@ typedef struct { const QmpCommandList *commands; bool capab_offered[QMP_CAPABILITY__MAX]; /* capabilities offered */ bool capab[QMP_CAPABILITY__MAX]; /* offered and accepted */ + uint64_t qmp_client_id; /*qmp client id, update if peer disconnect */ /* * Protects qmp request/response queue. * Take monitor_lock first when you need both. diff --git a/monitor/qmp.c b/monitor/qmp.c index 6eee450fe4..8f7671c5f1 100644 --- a/monitor/qmp.c +++ b/monitor/qmp.c @@ -149,18 +149,19 @@ void qmp_send_response(MonitorQMP *mon, const QDict *rsp) * Null @rsp can only happen for commands with QCO_NO_SUCCESS_RESP. * Nothing is emitted then. */ -static void monitor_qmp_respond(MonitorQMP *mon, QDict *rsp) +static void monitor_qmp_respond(MonitorQMP *mon, QDict *rsp, uint64_t req_client_id) { - if (rsp) { - qmp_send_response(mon, rsp); + if (!rsp || (mon->qmp_client_id != req_client_id)) { + return; } + qmp_send_response(mon, rsp); } /* * Runs outside of coroutine context for OOB commands, but in * coroutine context for everything else. 
*/ -static void monitor_qmp_dispatch(MonitorQMP *mon, QObject *req) +static void monitor_qmp_dispatch(MonitorQMP *mon, QObject *req, uint64_t req_client_id) { QDict *rsp; QDict *error; @@ -180,7 +181,7 @@ static void monitor_qmp_dispatch(MonitorQMP *mon, QObject *req) } } - monitor_qmp_respond(mon, rsp); + monitor_qmp_respond(mon, rsp, req_client_id); qobject_unref(rsp); } @@ -340,13 +341,13 @@ void coroutine_fn monitor_qmp_dispatcher_co(void *data) trace_monitor_qmp_cmd_in_band(id_json->str); g_string_free(id_json, true); } - monitor_qmp_dispatch(mon, req_obj->req); + monitor_qmp_dispatch(mon, req_obj->req, mon->qmp_client_id); } else { assert(req_obj->err); trace_monitor_qmp_err_in_band(error_get_pretty(req_obj->err)); rsp = qmp_error_response(req_obj->err); req_obj->err = NULL; - monitor_qmp_respond(mon, rsp); + monitor_qmp_respond(mon, rsp, mon->qmp_client_id); qobject_unref(rsp); } @@ -402,7 +403,7 @@ static void handle_qmp_command(void *opaque, QObject *req, Error *err) trace_monitor_qmp_cmd_out_of_band(id_json->str); g_string_free(id_json, true); } - monitor_qmp_dispatch(mon, req); + monitor_qmp_dispatch(mon, req, mon->qmp_client_id); qobject_unref(req); return; } @@ -486,6 +487,7 @@ static void monitor_qmp_event(void *opaque, QEMUChrEvent event) mon_refcount++; break; case CHR_EVENT_CLOSED: + mon->qmp_client_id++; /* * Note: this is only useful when the output of the chardev * backend is still open. For example, when the backend is @@ -539,6 +541,7 @@ void monitor_init_qmp(Chardev *chr, bool pretty, Error **errp) } qemu_chr_fe_set_echo(&mon->common.chr, true); + mon->qmp_client_id = 1; /* Note: we run QMP monitor in I/O thread when @chr supports that */ monitor_data_init(&mon->common, true, false, qemu_chr_has_feature(chr, QEMU_CHAR_FEATURE_GCONTEXT)); -- Gitee From b6c45f5ea5d1a379ac0a507cf59345c573b27cc8 Mon Sep 17 00:00:00 2001 From: Chuan Zheng Date: Wed, 9 Feb 2022 14:21:39 +0800 Subject: [PATCH 096/939] oslib-posix: optimise vm startup time for 1G hugepage It takes quit a long time to clear 1G-hugepage, which makes glibc pthread_create quit slow. Create touch_pages threads in advance, and then handle the touch_pages callback. Only read lock is held here. 
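The guard added here makes each worker spin (with smp_mb()) until started_num_threads reaches the expected count, so that clearing the first 1G pages does not slow down creation of the remaining threads. A standalone sketch of the same "create everything first, then start the work" idea, expressed with a standard pthread barrier rather than QEMU's hand-rolled counter:

    #include <pthread.h>
    #include <stdio.h>

    #define NTHREADS 4

    static pthread_barrier_t start_barrier;

    static void *touch_worker(void *arg)
    {
        (void)arg;
        /* Wait until every worker has been created before doing the
         * expensive page-touching work, so thread creation itself is
         * not serialized behind it. */
        pthread_barrier_wait(&start_barrier);
        printf("worker started\n");
        /* ... touch the pages assigned to this worker ... */
        return NULL;
    }

    int main(void)
    {
        pthread_t th[NTHREADS];

        pthread_barrier_init(&start_barrier, NULL, NTHREADS);
        for (int i = 0; i < NTHREADS; i++) {
            pthread_create(&th[i], NULL, touch_worker, NULL);
        }
        for (int i = 0; i < NTHREADS; i++) {
            pthread_join(th[i], NULL);
        }
        pthread_barrier_destroy(&start_barrier);
        return 0;
    }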
--- util/oslib-posix.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/util/oslib-posix.c b/util/oslib-posix.c index e86fd64e09..9ca3fee2b8 100644 --- a/util/oslib-posix.c +++ b/util/oslib-posix.c @@ -88,6 +88,8 @@ static QemuMutex sigbus_mutex; static QemuMutex page_mutex; static QemuCond page_cond; +static int started_num_threads; + int qemu_get_thread_id(void) { #if defined(__linux__) @@ -344,6 +346,10 @@ static void *do_touch_pages(void *arg) } qemu_mutex_unlock(&page_mutex); + while (started_num_threads != memset_args->context.num_threads) { + smp_mb(); + } + /* unblock SIGBUS */ sigemptyset(&set); sigaddset(&set, SIGBUS); @@ -448,7 +454,7 @@ static int touch_all_pages(char *area, size_t hpagesize, size_t numpages, context.threads = g_new0(MemsetThread, context.num_threads); numpages_per_thread = numpages / context.num_threads; leftover = numpages % context.num_threads; - for (i = 0; i < context.num_threads; i++) { + for (i = 0, started_num_threads = 0; i < context.num_threads; i++) { context.threads[i].addr = addr; context.threads[i].numpages = numpages_per_thread + (i < leftover); context.threads[i].hpagesize = hpagesize; @@ -464,6 +470,7 @@ static int touch_all_pages(char *area, size_t hpagesize, size_t numpages, QEMU_THREAD_JOINABLE); } addr += context.threads[i].numpages * hpagesize; + started_num_threads++; } if (!use_madv_populate_write) { -- Gitee From dfb9372702b2fb994392b8a6e8a39964c2656ae6 Mon Sep 17 00:00:00 2001 From: Chuan Zheng Date: Wed, 9 Feb 2022 08:49:41 +0800 Subject: [PATCH 097/939] migration: skip cache_drop for bios bootloader and nvram template Qemu enabled page cache dropping for raw device on the destionation host during shared storage migration. However, fsync may take 300ms to multiple seconds to return in multiple-migration scene, because all domains in a host share bios bootloader file, skip cache_drop for bios bootloader and nvram template to avoid downtime increase. --- block.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/block.c b/block.c index b7cb963929..3bfd4be6b4 100644 --- a/block.c +++ b/block.c @@ -68,6 +68,9 @@ #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ +#define DEFAULT_BIOS_BOOT_LOADER_DIR "/usr/share/edk2" +#define DEFAULT_NVRAM_TEMPLATE_DIR "/var/lib/libvirt/qemu/nvram" + /* Protected by BQL */ static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states = QTAILQ_HEAD_INITIALIZER(graph_bdrv_states); @@ -7017,7 +7020,13 @@ int coroutine_fn bdrv_co_invalidate_cache(BlockDriverState *bs, Error **errp) assert(!(bs->open_flags & BDRV_O_INACTIVE)); assert_bdrv_graph_readable(); - if (bs->drv->bdrv_co_invalidate_cache) { + /* + * It's not necessary for bios bootloader and nvram template to drop cache + * when migration, skip this step for them to avoid dowtime increase. + */ + if (bs->drv->bdrv_co_invalidate_cache && + !strstr(bs->filename, DEFAULT_BIOS_BOOT_LOADER_DIR) && + !strstr(bs->filename, DEFAULT_NVRAM_TEMPLATE_DIR)) { bs->drv->bdrv_co_invalidate_cache(bs, &local_err); if (local_err) { error_propagate(errp, local_err); -- Gitee From c2402b63ecb10b9a25695b710f2664dbcbc01ec4 Mon Sep 17 00:00:00 2001 From: Chuan Zheng Date: Sat, 30 Jan 2021 14:57:54 +0800 Subject: [PATCH 098/939] migration: Add multi-thread compress method A multi-thread compress method parameter is added to hold the method we are going to use. By default the 'zlib' method is used to maintain the compatibility as before. 
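As a usage sketch (assuming a QEMU built with this series on both source and destination), the method would be picked together with the existing compression knobs before starting the migration, for example from the HMP monitor:

    migrate_set_capability compress on
    migrate_set_parameter compress-method zlib
    migrate_set_parameter compress-level 1

The same parameter is also reachable through QMP migrate-set-parameters, since it is added to MigrateSetParameters in qapi/migration.json below.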
Signed-off-by: Chuan Zheng Signed-off-by: Zeyu Jin Signed-off-by: Ying Fang --- hw/core/qdev-properties-system.c | 11 +++++++++++ include/hw/qdev-properties.h | 4 ++++ migration/migration-hmp-cmds.c | 13 +++++++++++++ migration/options.c | 15 +++++++++++++++ monitor/hmp-cmds.c | 1 + qapi/migration.json | 32 ++++++++++++++++++++++++++++++-- util/oslib-posix.c | 2 +- 7 files changed, 75 insertions(+), 3 deletions(-) diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-system.c index f2e2718c74..cd5571fcfb 100644 --- a/hw/core/qdev-properties-system.c +++ b/hw/core/qdev-properties-system.c @@ -1202,6 +1202,17 @@ const PropertyInfo qdev_prop_uuid = { .set_default_value = set_default_uuid_auto, }; +/* --- CompressMethod --- */ +const PropertyInfo qdev_prop_compress_method = { + .name = "CompressMethod", + .description = "multi-thread compression method, " + "zlib", + .enum_table = &CompressMethod_lookup, + .get = qdev_propinfo_get_enum, + .set = qdev_propinfo_set_enum, + .set_default_value = qdev_propinfo_set_default_value_enum, +}; + /* --- s390 cpu entitlement policy --- */ QEMU_BUILD_BUG_ON(sizeof(CpuS390Entitlement) != sizeof(int)); diff --git a/include/hw/qdev-properties.h b/include/hw/qdev-properties.h index 25743a29a0..63602c2c74 100644 --- a/include/hw/qdev-properties.h +++ b/include/hw/qdev-properties.h @@ -60,6 +60,7 @@ extern const PropertyInfo qdev_prop_int64; extern const PropertyInfo qdev_prop_size; extern const PropertyInfo qdev_prop_string; extern const PropertyInfo qdev_prop_on_off_auto; +extern const PropertyInfo qdev_prop_compress_method; extern const PropertyInfo qdev_prop_size32; extern const PropertyInfo qdev_prop_array; extern const PropertyInfo qdev_prop_link; @@ -168,6 +169,9 @@ extern const PropertyInfo qdev_prop_link; DEFINE_PROP(_n, _s, _f, qdev_prop_string, char*) #define DEFINE_PROP_ON_OFF_AUTO(_n, _s, _f, _d) \ DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_on_off_auto, OnOffAuto) +#define DEFINE_PROP_COMPRESS_METHOD(_n, _s, _f, _d) \ + DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_compress_method, \ + CompressMethod) #define DEFINE_PROP_SIZE32(_n, _s, _f, _d) \ DEFINE_PROP_UNSIGNED(_n, _s, _f, _d, qdev_prop_size32, uint32_t) diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c index 86ae832176..261ec1e35c 100644 --- a/migration/migration-hmp-cmds.c +++ b/migration/migration-hmp-cmds.c @@ -22,6 +22,7 @@ #include "qapi/qapi-commands-migration.h" #include "qapi/qapi-visit-migration.h" #include "qapi/qmp/qdict.h" +#include "qapi/qapi-visit-migration.h" #include "qapi/string-input-visitor.h" #include "qapi/string-output-visitor.h" #include "qemu/cutils.h" @@ -291,6 +292,9 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict) MigrationParameter_str(MIGRATION_PARAMETER_DECOMPRESS_THREADS), params->decompress_threads); assert(params->has_throttle_trigger_threshold); + monitor_printf(mon, "%s: %s\n", + MigrationParameter_str(MIGRATION_PARAMETER_COMPRESS_METHOD), + CompressMethod_str(params->compress_method)); monitor_printf(mon, "%s: %u\n", MigrationParameter_str(MIGRATION_PARAMETER_THROTTLE_TRIGGER_THRESHOLD), params->throttle_trigger_threshold); @@ -519,6 +523,7 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) MigrateSetParameters *p = g_new0(MigrateSetParameters, 1); uint64_t valuebw = 0; uint64_t cache_size; + CompressMethod compress_method; Error *err = NULL; int val, ret; @@ -544,6 +549,14 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) p->has_decompress_threads = true; 
visit_type_uint8(v, param, &p->decompress_threads, &err); break; + case MIGRATION_PARAMETER_COMPRESS_METHOD: + p->has_compress_method = true; + visit_type_CompressMethod(v, param, &compress_method, &err); + if (err) { + break; + } + p->compress_method = compress_method; + break; case MIGRATION_PARAMETER_THROTTLE_TRIGGER_THRESHOLD: p->has_throttle_trigger_threshold = true; visit_type_uint8(v, param, &p->throttle_trigger_threshold, &err); diff --git a/migration/options.c b/migration/options.c index 8d8ec73ad9..af7ea7b346 100644 --- a/migration/options.c +++ b/migration/options.c @@ -47,6 +47,7 @@ #define DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT 2 /*0: means nocompress, 1: best speed, ... 9: best compress ratio */ #define DEFAULT_MIGRATE_COMPRESS_LEVEL 1 +#define DEFAULT_MIGRATE_COMPRESS_METHOD COMPRESS_METHOD_ZLIB /* Define default autoconverge cpu throttle migration parameters */ #define DEFAULT_MIGRATE_THROTTLE_TRIGGER_THRESHOLD 50 #define DEFAULT_MIGRATE_CPU_THROTTLE_INITIAL 20 @@ -113,6 +114,9 @@ Property migration_properties[] = { DEFINE_PROP_UINT8("x-decompress-threads", MigrationState, parameters.decompress_threads, DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT), + DEFINE_PROP_COMPRESS_METHOD("compress-method", MigrationState, + parameters.compress_method, + DEFAULT_MIGRATE_COMPRESS_METHOD), DEFINE_PROP_UINT8("x-throttle-trigger-threshold", MigrationState, parameters.throttle_trigger_threshold, DEFAULT_MIGRATE_THROTTLE_TRIGGER_THRESHOLD), @@ -953,6 +957,8 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp) params->compress_wait_thread = s->parameters.compress_wait_thread; params->has_decompress_threads = true; params->decompress_threads = s->parameters.decompress_threads; + params->has_compress_method = true; + params->compress_method = s->parameters.compress_method; params->has_throttle_trigger_threshold = true; params->throttle_trigger_threshold = s->parameters.throttle_trigger_threshold; params->has_cpu_throttle_initial = true; @@ -1025,6 +1031,7 @@ void migrate_params_init(MigrationParameters *params) params->has_compress_threads = true; params->has_compress_wait_thread = true; params->has_decompress_threads = true; + params->has_compress_method = true; params->has_throttle_trigger_threshold = true; params->has_cpu_throttle_initial = true; params->has_cpu_throttle_increment = true; @@ -1259,6 +1266,10 @@ static void migrate_params_test_apply(MigrateSetParameters *params, dest->decompress_threads = params->decompress_threads; } + if (params->has_compress_method) { + dest->compress_method = params->compress_method; + } + if (params->has_throttle_trigger_threshold) { dest->throttle_trigger_threshold = params->throttle_trigger_threshold; } @@ -1380,6 +1391,10 @@ static void migrate_params_apply(MigrateSetParameters *params, Error **errp) s->parameters.decompress_threads = params->decompress_threads; } + if (params->has_compress_method) { + s->parameters.compress_method = params->compress_method; + } + if (params->has_throttle_trigger_threshold) { s->parameters.throttle_trigger_threshold = params->throttle_trigger_threshold; } diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c index 871898ac46..5bb3c9cd46 100644 --- a/monitor/hmp-cmds.c +++ b/monitor/hmp-cmds.c @@ -24,6 +24,7 @@ #include "qapi/qapi-commands-control.h" #include "qapi/qapi-commands-misc.h" #include "qapi/qmp/qdict.h" +#include "qapi/qapi-visit-migration.h" #include "qemu/cutils.h" #include "hw/intc/intc.h" #include "qemu/log.h" diff --git a/qapi/migration.json b/qapi/migration.json index eb2f883513..cafaa5ccb3 
100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -708,6 +708,19 @@ 'bitmaps': [ 'BitmapMigrationBitmapAlias' ] } } +## +# @CompressMethod: +# +# An enumeration of multi-thread compression methods. +# +# @zlib: use zlib compression method. +# +# Since: 5.0 +# +## +{ 'enum': 'CompressMethod', + 'data': [ 'zlib' ] } + ## # @MigrationParameter: # @@ -746,6 +759,9 @@ # fast as compression, so set the decompress-threads to the number # about 1/4 of compress-threads is adequate. # +# @compress-method: Which multi-thread compression method to use. +# Defaults to none. (Since 5.0) +# # @throttle-trigger-threshold: The ratio of bytes_dirty_period and # bytes_xfer_period to trigger throttling. It is expressed as # percentage. The default value is 50. (Since 5.0) @@ -892,6 +908,7 @@ { 'name': 'compress-level', 'features': [ 'deprecated' ] }, { 'name': 'compress-threads', 'features': [ 'deprecated' ] }, { 'name': 'decompress-threads', 'features': [ 'deprecated' ] }, + { 'name': 'compress-method', 'features': [ 'deprecated' ] }, { 'name': 'compress-wait-thread', 'features': [ 'deprecated' ] }, 'throttle-trigger-threshold', 'cpu-throttle-initial', 'cpu-throttle-increment', @@ -935,6 +952,9 @@ # # @decompress-threads: decompression thread count # +# @compress-method: Set compression method to use in multi-thread compression. +# Defaults to none. (Since 5.0) +# # @throttle-trigger-threshold: The ratio of bytes_dirty_period and # bytes_xfer_period to trigger throttling. It is expressed as # percentage. The default value is 50. (Since 5.0) @@ -1066,8 +1086,9 @@ # # @deprecated: Member @block-incremental is deprecated. Use # blockdev-mirror with NBD instead. Members @compress-level, -# @compress-threads, @decompress-threads and @compress-wait-thread -# are deprecated because @compression is deprecated. +# @compress-threads, @decompress-threads, @compress-method +# and @compress-wait-thread are deprecated because +# @compression is deprecated. # # @unstable: Members @x-checkpoint-delay and @x-vcpu-dirty-limit-period # are experimental. @@ -1090,6 +1111,8 @@ 'features': [ 'deprecated' ] }, '*decompress-threads': { 'type': 'uint8', 'features': [ 'deprecated' ] }, + '*compress-method': { 'type': 'CompressMethod', + 'features': [ 'deprecated' ] }, '*throttle-trigger-threshold': 'uint8', '*cpu-throttle-initial': 'uint8', '*cpu-throttle-increment': 'uint8', @@ -1161,6 +1184,9 @@ # # @decompress-threads: decompression thread count # +# @compress-method: Which multi-thread compression method to use. +# Defaults to none. (Since 5.0) +# # @throttle-trigger-threshold: The ratio of bytes_dirty_period and # bytes_xfer_period to trigger throttling. It is expressed as # percentage. The default value is 50. 
(Since 5.0) @@ -1315,6 +1341,8 @@ 'features': [ 'deprecated' ] }, '*decompress-threads': { 'type': 'uint8', 'features': [ 'deprecated' ] }, + '*compress-method': { 'type': 'CompressMethod', + 'features': [ 'deprecated' ] }, '*throttle-trigger-threshold': 'uint8', '*cpu-throttle-initial': 'uint8', '*cpu-throttle-increment': 'uint8', diff --git a/util/oslib-posix.c b/util/oslib-posix.c index 9ca3fee2b8..43af077fed 100644 --- a/util/oslib-posix.c +++ b/util/oslib-posix.c @@ -346,7 +346,7 @@ static void *do_touch_pages(void *arg) } qemu_mutex_unlock(&page_mutex); - while (started_num_threads != memset_args->context.num_threads) { + while (started_num_threads != memset_args->context->num_threads) { smp_mb(); } -- Gitee From cf6f31249817380e91cbc4e55b189216645fac18 Mon Sep 17 00:00:00 2001 From: Chuan Zheng Date: Sat, 30 Jan 2021 15:21:17 +0800 Subject: [PATCH 099/939] migration: Refactoring multi-thread compress migration Code refactor for the compression procedure which includes: 1. Move qemu_compress_data and qemu_put_compression_data from qemu-file.c to ram.c, for the reason that most part of the code logical has nothing to do with qemu-file. Besides, the decompression code is located at ram.c only. 2. Simplify the function input arguments for compression and decompression. Wrap the input into the param structure which already exists. This change also makes the function much more flexible for other compression methods. Signed-off-by: Chuan Zheng Signed-off-by: Zeyu Jin Signed-off-by: Ying Fang --- migration/meson.build | 4 +- migration/migration-hmp-cmds.c | 1 - migration/qemu-file.c | 61 +++++------------------- migration/qemu-file.h | 4 +- migration/ram-compress.c | 87 ++++++++++++++++++++++++---------- 5 files changed, 77 insertions(+), 80 deletions(-) diff --git a/migration/meson.build b/migration/meson.build index 92b1cc4297..d9b46ef0df 100644 --- a/migration/meson.build +++ b/migration/meson.build @@ -22,7 +22,6 @@ system_ss.add(files( 'migration.c', 'multifd.c', 'multifd-zlib.c', - 'ram-compress.c', 'options.c', 'postcopy-ram.c', 'savevm.c', @@ -43,4 +42,5 @@ system_ss.add(when: zstd, if_true: files('multifd-zstd.c')) specific_ss.add(when: 'CONFIG_SYSTEM_ONLY', if_true: files('ram.c', - 'target.c')) + 'target.c', + 'ram-compress.c')) diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c index 261ec1e35c..1fa6a5f478 100644 --- a/migration/migration-hmp-cmds.c +++ b/migration/migration-hmp-cmds.c @@ -22,7 +22,6 @@ #include "qapi/qapi-commands-migration.h" #include "qapi/qapi-visit-migration.h" #include "qapi/qmp/qdict.h" -#include "qapi/qapi-visit-migration.h" #include "qapi/string-input-visitor.h" #include "qapi/string-output-visitor.h" #include "qemu/cutils.h" diff --git a/migration/qemu-file.c b/migration/qemu-file.c index 94231ff295..bd1dbc3db1 100644 --- a/migration/qemu-file.c +++ b/migration/qemu-file.c @@ -669,55 +669,6 @@ uint64_t qemu_get_be64(QEMUFile *f) return v; } -/* return the size after compression, or negative value on error */ -static int qemu_compress_data(z_stream *stream, uint8_t *dest, size_t dest_len, - const uint8_t *source, size_t source_len) -{ - int err; - - err = deflateReset(stream); - if (err != Z_OK) { - return -1; - } - - stream->avail_in = source_len; - stream->next_in = (uint8_t *)source; - stream->avail_out = dest_len; - stream->next_out = dest; - - err = deflate(stream, Z_FINISH); - if (err != Z_STREAM_END) { - return -1; - } - - return stream->next_out - dest; -} - -/* Compress size bytes of data start at p and store the 
compressed - * data to the buffer of f. - * - * Since the file is dummy file with empty_ops, return -1 if f has no space to - * save the compressed data. - */ -ssize_t qemu_put_compression_data(QEMUFile *f, z_stream *stream, - const uint8_t *p, size_t size) -{ - ssize_t blen = IO_BUF_SIZE - f->buf_index - sizeof(int32_t); - - if (blen < compressBound(size)) { - return -1; - } - - blen = qemu_compress_data(stream, f->buf + f->buf_index + sizeof(int32_t), - blen, p, size); - if (blen < 0) { - return -1; - } - - qemu_put_be32(f, blen); - add_buf_to_iovec(f, blen); - return blen + sizeof(int32_t); -} /* Put the data in the buffer of f_src to the buffer of f_des, and * then reset the buf_index of f_src to 0. @@ -834,3 +785,15 @@ int qemu_file_get_to_fd(QEMUFile *f, int fd, size_t size) return 0; } + +ssize_t qemu_put_compress_start(QEMUFile *f, uint8_t **dest_ptr) +{ + *dest_ptr = f->buf + f->buf_index + sizeof(int32_t); + return IO_BUF_SIZE - f->buf_index - sizeof(int32_t); +} + +void qemu_put_compress_end(QEMUFile *f, unsigned int v) +{ + qemu_put_be32(f, v); + add_buf_to_iovec(f, v); +} diff --git a/migration/qemu-file.h b/migration/qemu-file.h index 8aec9fabf7..8afa95732b 100644 --- a/migration/qemu-file.h +++ b/migration/qemu-file.h @@ -54,8 +54,8 @@ void qemu_put_buffer_async(QEMUFile *f, const uint8_t *buf, size_t size, size_t coroutine_mixed_fn qemu_peek_buffer(QEMUFile *f, uint8_t **buf, size_t size, size_t offset); size_t coroutine_mixed_fn qemu_get_buffer_in_place(QEMUFile *f, uint8_t **buf, size_t size); -ssize_t qemu_put_compression_data(QEMUFile *f, z_stream *stream, - const uint8_t *p, size_t size); +ssize_t qemu_put_compress_start(QEMUFile *f, uint8_t **dest_ptr); +void qemu_put_compress_end(QEMUFile *f, unsigned int v); int qemu_put_qemu_file(QEMUFile *f_des, QEMUFile *f_src); bool qemu_file_buffer_empty(QEMUFile *file); diff --git a/migration/ram-compress.c b/migration/ram-compress.c index fa4388f6a6..2be344acbc 100644 --- a/migration/ram-compress.c +++ b/migration/ram-compress.c @@ -28,7 +28,6 @@ #include "qemu/osdep.h" #include "qemu/cutils.h" - #include "ram-compress.h" #include "qemu/error-report.h" @@ -40,6 +39,7 @@ #include "exec/ramblock.h" #include "ram.h" #include "migration-stats.h" +#include "exec/ram_addr.h" static struct { int64_t pages; @@ -83,28 +83,22 @@ static QemuThread *decompress_threads; static QemuMutex decomp_done_lock; static QemuCond decomp_done_cond; -static CompressResult do_compress_ram_page(QEMUFile *f, z_stream *stream, - RAMBlock *block, ram_addr_t offset, - uint8_t *source_buf); +static CompressResult do_compress_ram_page(CompressParam *param, RAMBlock *block); static void *do_data_compress(void *opaque) { CompressParam *param = opaque; RAMBlock *block; - ram_addr_t offset; CompressResult result; qemu_mutex_lock(¶m->mutex); while (!param->quit) { if (param->trigger) { block = param->block; - offset = param->offset; param->trigger = false; qemu_mutex_unlock(¶m->mutex); - result = do_compress_ram_page(param->file, ¶m->stream, - block, offset, param->originbuf); - + result = do_compress_ram_page(param, block); qemu_mutex_lock(&comp_done_lock); param->done = true; param->result = result; @@ -204,15 +198,57 @@ exit: return -1; } -static CompressResult do_compress_ram_page(QEMUFile *f, z_stream *stream, - RAMBlock *block, ram_addr_t offset, - uint8_t *source_buf) +/* + * Compress size bytes of data start at p and store the compressed + * data to the buffer of f. 
+ * + * Since the file is dummy file with empty_ops, return -1 if f has no space to + * save the compressed data. + */ +static ssize_t qemu_put_compression_data(CompressParam *param, size_t size) +{ + int err; + uint8_t *dest = NULL; + z_stream *stream = ¶m->stream; + uint8_t *p = param->originbuf; + QEMUFile *f = f = param->file; + ssize_t blen = qemu_put_compress_start(f, &dest); + + if (blen < compressBound(size)) { + return -1; + } + + err = deflateReset(stream); + if (err != Z_OK) { + return -1; + } + + stream->avail_in = size; + stream->next_in = p; + stream->avail_out = blen; + stream->next_out = dest; + + err = deflate(stream, Z_FINISH); + if (err != Z_STREAM_END) { + return -1; + } + + blen = stream->next_out - dest; + if (blen < 0) { + return -1; + } + + qemu_put_compress_end(f, blen); + return blen + sizeof(int32_t); +} + +static CompressResult do_compress_ram_page(CompressParam *param, RAMBlock *block) { - uint8_t *p = block->host + offset; + uint8_t *p = block->host + (param->offset & TARGET_PAGE_MASK); size_t page_size = qemu_target_page_size(); int ret; - assert(qemu_file_buffer_empty(f)); + assert(qemu_file_buffer_empty(param->file)); if (buffer_is_zero(p, page_size)) { return RES_ZEROPAGE; @@ -223,12 +259,12 @@ static CompressResult do_compress_ram_page(QEMUFile *f, z_stream *stream, * so that we can catch up the error during compression and * decompression */ - memcpy(source_buf, p, page_size); - ret = qemu_put_compression_data(f, stream, source_buf, page_size); + memcpy(param->originbuf, p, page_size); + ret = qemu_put_compression_data(param, page_size); if (ret < 0) { qemu_file_set_error(migrate_get_current()->to_dst_file, ret); error_report("compressed data failed!"); - qemu_fflush(f); + qemu_fflush(param->file); return RES_NONE; } return RES_COMPRESS; @@ -322,19 +358,20 @@ bool compress_page_with_multi_thread(RAMBlock *block, ram_addr_t offset, /* return the size after decompression, or negative value on error */ static int -qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len, - const uint8_t *source, size_t source_len) +qemu_uncompress_data(DecompressParam *param, uint8_t *dest, size_t pagesize) { int err; + z_stream *stream = ¶m->stream; + err = inflateReset(stream); if (err != Z_OK) { return -1; } - stream->avail_in = source_len; - stream->next_in = (uint8_t *)source; - stream->avail_out = dest_len; + stream->avail_in = param->len; + stream->next_in = param->compbuf; + stream->avail_out = pagesize; stream->next_out = dest; err = inflate(stream, Z_NO_FLUSH); @@ -350,20 +387,18 @@ static void *do_data_decompress(void *opaque) DecompressParam *param = opaque; unsigned long pagesize; uint8_t *des; - int len, ret; + int ret; qemu_mutex_lock(¶m->mutex); while (!param->quit) { if (param->des) { des = param->des; - len = param->len; param->des = 0; qemu_mutex_unlock(¶m->mutex); pagesize = qemu_target_page_size(); - ret = qemu_uncompress_data(¶m->stream, des, pagesize, - param->compbuf, len); + ret = qemu_uncompress_data(param, des, pagesize); if (ret < 0 && migrate_get_current()->decompress_error_check) { error_report("decompress data failed"); qemu_file_set_error(decomp_file, ret); -- Gitee From 5896dedf32c7e4417bd7f3e889ca41a34b06f5db Mon Sep 17 00:00:00 2001 From: Chuan Zheng Date: Sat, 30 Jan 2021 15:57:31 +0800 Subject: [PATCH 100/939] migration: Add multi-thread compress ops Add the MigrationCompressOps and MigrationDecompressOps structures to make the compression method configurable for multi-thread compression migration. 
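The MigrationCompressOps/MigrationDecompressOps split described here is the usual C dispatch-table pattern: select a table of function pointers once, based on the configured method, and let the rest of the code call through it without knowing which method is active. A stripped-down, self-contained sketch of the pattern; the names below are illustrative, not the structures added by this patch:

    #include <stdio.h>
    #include <stddef.h>
    #include <sys/types.h>

    typedef enum { METHOD_ZLIB, METHOD_ZSTD } Method;

    typedef struct CompressOps {
        int     (*setup)(void);
        ssize_t (*compress)(const void *src, size_t len);
        void    (*cleanup)(void);
    } CompressOps;

    static int zlib_setup(void) { puts("zlib setup"); return 0; }
    static ssize_t zlib_compress(const void *src, size_t len)
    {
        (void)src;          /* a real backend would deflate here */
        return (ssize_t)len;
    }
    static void zlib_cleanup(void) { puts("zlib cleanup"); }

    static const CompressOps zlib_ops = {
        .setup = zlib_setup, .compress = zlib_compress, .cleanup = zlib_cleanup,
    };

    /* Pick the implementation once; callers never test the method again. */
    static const CompressOps *select_ops(Method m)
    {
        switch (m) {
        case METHOD_ZLIB:
            return &zlib_ops;
        default:
            return NULL;    /* method unknown or not compiled in */
        }
    }

    int main(void)
    {
        const CompressOps *ops = select_ops(METHOD_ZLIB);
        if (!ops || ops->setup() < 0) {
            return 1;
        }
        ops->compress("data", 4);
        ops->cleanup();
        return 0;
    }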
Signed-off-by: Chuan Zheng Signed-off-by: Zeyu Jin Signed-off-by: Ying Fang --- migration/options.c | 9 ++ migration/options.h | 1 + migration/ram-compress.c | 261 ++++++++++++++++++++++++++------------- migration/ram-compress.h | 31 ++++- migration/ram.c | 4 +- 5 files changed, 215 insertions(+), 91 deletions(-) diff --git a/migration/options.c b/migration/options.c index af7ea7b346..6aaee702dc 100644 --- a/migration/options.c +++ b/migration/options.c @@ -799,6 +799,15 @@ int migrate_decompress_threads(void) return s->parameters.decompress_threads; } +CompressMethod migrate_compress_method(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->parameters.compress_method; +} + uint64_t migrate_downtime_limit(void) { MigrationState *s = migrate_get_current(); diff --git a/migration/options.h b/migration/options.h index 246c160aee..9aca5e41ad 100644 --- a/migration/options.h +++ b/migration/options.h @@ -78,6 +78,7 @@ uint8_t migrate_cpu_throttle_increment(void); uint8_t migrate_cpu_throttle_initial(void); bool migrate_cpu_throttle_tailslow(void); int migrate_decompress_threads(void); +CompressMethod migrate_compress_method(void); uint64_t migrate_downtime_limit(void); uint8_t migrate_max_cpu_throttle(void); uint64_t migrate_max_bandwidth(void); diff --git a/migration/ram-compress.c b/migration/ram-compress.c index 2be344acbc..6e37b22492 100644 --- a/migration/ram-compress.c +++ b/migration/ram-compress.c @@ -65,26 +65,167 @@ static QemuThread *compress_threads; static QemuMutex comp_done_lock; static QemuCond comp_done_cond; -struct DecompressParam { - bool done; - bool quit; - QemuMutex mutex; - QemuCond cond; - void *des; - uint8_t *compbuf; - int len; - z_stream stream; -}; -typedef struct DecompressParam DecompressParam; - static QEMUFile *decomp_file; static DecompressParam *decomp_param; static QemuThread *decompress_threads; +MigrationCompressOps *compress_ops; +MigrationDecompressOps *decompress_ops; static QemuMutex decomp_done_lock; static QemuCond decomp_done_cond; static CompressResult do_compress_ram_page(CompressParam *param, RAMBlock *block); +static int zlib_save_setup(CompressParam *param) +{ + if (deflateInit(¶m->stream, + migrate_compress_level()) != Z_OK) { + return -1; + } + + return 0; +} + +static ssize_t zlib_compress_data(CompressParam *param, size_t size) +{ + int err; + uint8_t *dest = NULL; + z_stream *stream = ¶m->stream; + uint8_t *p = param->originbuf; + QEMUFile *f = f = param->file; + ssize_t blen = qemu_put_compress_start(f, &dest); + + if (blen < compressBound(size)) { + return -1; + } + + err = deflateReset(stream); + if (err != Z_OK) { + return -1; + } + + stream->avail_in = size; + stream->next_in = p; + stream->avail_out = blen; + stream->next_out = dest; + + err = deflate(stream, Z_FINISH); + if (err != Z_STREAM_END) { + return -1; + } + + blen = stream->next_out - dest; + if (blen < 0) { + return -1; + } + + qemu_put_compress_end(f, blen); + return blen + sizeof(int32_t); +} + +static void zlib_save_cleanup(CompressParam *param) +{ + deflateEnd(¶m->stream); +} + +static int zlib_load_setup(DecompressParam *param) +{ + if (inflateInit(¶m->stream) != Z_OK) { + return -1; + } + + return 0; +} + +static int +zlib_decompress_data(DecompressParam *param, uint8_t *dest, size_t size) +{ + int err; + + z_stream *stream = ¶m->stream; + + err = inflateReset(stream); + if (err != Z_OK) { + return -1; + } + + stream->avail_in = param->len; + stream->next_in = param->compbuf; + stream->avail_out = size; + stream->next_out = dest; + + err = 
inflate(stream, Z_NO_FLUSH); + if (err != Z_STREAM_END) { + return -1; + } + + return stream->total_out; +} + +static void zlib_load_cleanup(DecompressParam *param) +{ + inflateEnd(¶m->stream); +} + +static int zlib_check_len(int len) +{ + return len < 0 || len > compressBound(TARGET_PAGE_SIZE); +} + +static int set_compress_ops(void) +{ + compress_ops = g_new0(MigrationCompressOps, 1); + + switch (migrate_compress_method()) { + case COMPRESS_METHOD_ZLIB: + compress_ops->save_setup = zlib_save_setup; + compress_ops->save_cleanup = zlib_save_cleanup; + compress_ops->compress_data = zlib_compress_data; + break; + default: + return -1; + } + + return 0; +} + +static int set_decompress_ops(void) +{ + decompress_ops = g_new0(MigrationDecompressOps, 1); + + switch (migrate_compress_method()) { + case COMPRESS_METHOD_ZLIB: + decompress_ops->load_setup = zlib_load_setup; + decompress_ops->load_cleanup = zlib_load_cleanup; + decompress_ops->decompress_data = zlib_decompress_data; + decompress_ops->check_len = zlib_check_len; + break; + default: + return -1; + } + + return 0; +} + +static void clean_compress_ops(void) +{ + compress_ops->save_setup = NULL; + compress_ops->save_cleanup = NULL; + compress_ops->compress_data = NULL; + + g_free(compress_ops); + compress_ops = NULL; +} + +static void clean_decompress_ops(void) +{ + decompress_ops->load_setup = NULL; + decompress_ops->load_cleanup = NULL; + decompress_ops->decompress_data = NULL; + + g_free(decompress_ops); + decompress_ops = NULL; +} + static void *do_data_compress(void *opaque) { CompressParam *param = opaque; @@ -141,7 +282,7 @@ void compress_threads_save_cleanup(void) qemu_thread_join(compress_threads + i); qemu_mutex_destroy(&comp_param[i].mutex); qemu_cond_destroy(&comp_param[i].cond); - deflateEnd(&comp_param[i].stream); + compress_ops->save_cleanup(&comp_param[i]); g_free(comp_param[i].originbuf); qemu_fclose(comp_param[i].file); comp_param[i].file = NULL; @@ -152,6 +293,7 @@ void compress_threads_save_cleanup(void) g_free(comp_param); compress_threads = NULL; comp_param = NULL; + clean_compress_ops(); } int compress_threads_save_setup(void) @@ -161,6 +303,12 @@ int compress_threads_save_setup(void) if (!migrate_compress()) { return 0; } + + if (set_compress_ops() < 0) { + clean_compress_ops(); + return -1; + } + thread_count = migrate_compress_threads(); compress_threads = g_new0(QemuThread, thread_count); comp_param = g_new0(CompressParam, thread_count); @@ -172,8 +320,7 @@ int compress_threads_save_setup(void) goto exit; } - if (deflateInit(&comp_param[i].stream, - migrate_compress_level()) != Z_OK) { + if (compress_ops->save_setup(&comp_param[i]) < 0) { g_free(comp_param[i].originbuf); goto exit; } @@ -198,50 +345,6 @@ exit: return -1; } -/* - * Compress size bytes of data start at p and store the compressed - * data to the buffer of f. - * - * Since the file is dummy file with empty_ops, return -1 if f has no space to - * save the compressed data. 
- */ -static ssize_t qemu_put_compression_data(CompressParam *param, size_t size) -{ - int err; - uint8_t *dest = NULL; - z_stream *stream = ¶m->stream; - uint8_t *p = param->originbuf; - QEMUFile *f = f = param->file; - ssize_t blen = qemu_put_compress_start(f, &dest); - - if (blen < compressBound(size)) { - return -1; - } - - err = deflateReset(stream); - if (err != Z_OK) { - return -1; - } - - stream->avail_in = size; - stream->next_in = p; - stream->avail_out = blen; - stream->next_out = dest; - - err = deflate(stream, Z_FINISH); - if (err != Z_STREAM_END) { - return -1; - } - - blen = stream->next_out - dest; - if (blen < 0) { - return -1; - } - - qemu_put_compress_end(f, blen); - return blen + sizeof(int32_t); -} - static CompressResult do_compress_ram_page(CompressParam *param, RAMBlock *block) { uint8_t *p = block->host + (param->offset & TARGET_PAGE_MASK); @@ -260,7 +363,7 @@ static CompressResult do_compress_ram_page(CompressParam *param, RAMBlock *block * decompression */ memcpy(param->originbuf, p, page_size); - ret = qemu_put_compression_data(param, page_size); + ret = compress_ops->compress_data(param, page_size); if (ret < 0) { qemu_file_set_error(migrate_get_current()->to_dst_file, ret); error_report("compressed data failed!"); @@ -356,32 +459,6 @@ bool compress_page_with_multi_thread(RAMBlock *block, ram_addr_t offset, } } -/* return the size after decompression, or negative value on error */ -static int -qemu_uncompress_data(DecompressParam *param, uint8_t *dest, size_t pagesize) -{ - int err; - - z_stream *stream = ¶m->stream; - - err = inflateReset(stream); - if (err != Z_OK) { - return -1; - } - - stream->avail_in = param->len; - stream->next_in = param->compbuf; - stream->avail_out = pagesize; - stream->next_out = dest; - - err = inflate(stream, Z_NO_FLUSH); - if (err != Z_STREAM_END) { - return -1; - } - - return stream->total_out; -} - static void *do_data_decompress(void *opaque) { DecompressParam *param = opaque; @@ -398,7 +475,7 @@ static void *do_data_decompress(void *opaque) pagesize = qemu_target_page_size(); - ret = qemu_uncompress_data(param, des, pagesize); + ret = decompress_ops->decompress_data(param, des, pagesize); if (ret < 0 && migrate_get_current()->decompress_error_check) { error_report("decompress data failed"); qemu_file_set_error(decomp_file, ret); @@ -466,7 +543,7 @@ void compress_threads_load_cleanup(void) qemu_thread_join(decompress_threads + i); qemu_mutex_destroy(&decomp_param[i].mutex); qemu_cond_destroy(&decomp_param[i].cond); - inflateEnd(&decomp_param[i].stream); + decompress_ops->load_cleanup(&decomp_param[i]); g_free(decomp_param[i].compbuf); decomp_param[i].compbuf = NULL; } @@ -475,6 +552,7 @@ void compress_threads_load_cleanup(void) decompress_threads = NULL; decomp_param = NULL; decomp_file = NULL; + clean_decompress_ops(); } int compress_threads_load_setup(QEMUFile *f) @@ -485,6 +563,11 @@ int compress_threads_load_setup(QEMUFile *f) return 0; } + if (set_decompress_ops() < 0) { + clean_decompress_ops(); + return -1; + } + /* * set compression_counters memory to zero for a new migration */ @@ -497,7 +580,7 @@ int compress_threads_load_setup(QEMUFile *f) qemu_cond_init(&decomp_done_cond); decomp_file = f; for (i = 0; i < thread_count; i++) { - if (inflateInit(&decomp_param[i].stream) != Z_OK) { + if (decompress_ops->load_setup(&decomp_param[i]) < 0) { goto exit; } diff --git a/migration/ram-compress.h b/migration/ram-compress.h index 0d89a2f55e..daf241987f 100644 --- a/migration/ram-compress.h +++ b/migration/ram-compress.h @@ 
-39,6 +39,20 @@ enum CompressResult { }; typedef enum CompressResult CompressResult; +struct DecompressParam { + bool done; + bool quit; + QemuMutex mutex; + QemuCond cond; + void *des; + uint8_t *compbuf; + int len; + + /* for zlib compression */ + z_stream stream; +}; +typedef struct DecompressParam DecompressParam; + struct CompressParam { bool done; bool quit; @@ -51,11 +65,26 @@ struct CompressParam { ram_addr_t offset; /* internally used fields */ - z_stream stream; uint8_t *originbuf; + + /* for zlib compression */ + z_stream stream; }; typedef struct CompressParam CompressParam; +typedef struct { + int (*save_setup)(CompressParam *param); + void (*save_cleanup)(CompressParam *param); + ssize_t (*compress_data)(CompressParam *param, size_t size); +} MigrationCompressOps; + +typedef struct { + int (*load_setup)(DecompressParam *param); + void (*load_cleanup)(DecompressParam *param); + int (*decompress_data)(DecompressParam *param, uint8_t *dest, size_t size); + int (*check_len)(int len); +} MigrationDecompressOps; + void compress_threads_save_cleanup(void); int compress_threads_save_setup(void); diff --git a/migration/ram.c b/migration/ram.c index 8c7886ab79..f9b2b9b985 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -96,6 +96,8 @@ XBZRLECacheStats xbzrle_counters; +extern MigrationDecompressOps *decompress_ops; + /* used by the search for pages to send */ struct PageSearchStatus { /* The migration channel used for a specific host page */ @@ -3979,7 +3981,7 @@ static int ram_load_precopy(QEMUFile *f) case RAM_SAVE_FLAG_COMPRESS_PAGE: len = qemu_get_be32(f); - if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) { + if (decompress_ops->check_len(len)) { error_report("Invalid compressed data length: %d", len); ret = -EINVAL; break; -- Gitee From 8c9603270184d8dadf64ec6de263268e846f8c18 Mon Sep 17 00:00:00 2001 From: Chuan Zheng Date: Sat, 30 Jan 2021 16:15:10 +0800 Subject: [PATCH 101/939] migration: Add zstd support in multi-thread compression This patch enables zstd option in multi-thread compression. 
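The new ops are built on zstd's streaming interface. As a point of reference, the following standalone sketch (illustrative only, not part of the patch; build with: cc zstd_page.c -lzstd) compresses a single 4 KiB buffer the same way zstd_compress_data() below does, looping on ZSTD_compressStream2() with ZSTD_e_end until the whole frame has been emitted:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <zstd.h>

    #define PAGE_SIZE 4096

    int main(void)
    {
        uint8_t page[PAGE_SIZE];
        memset(page, 0x5a, sizeof(page));        /* stand-in for a guest page */

        size_t bound = ZSTD_compressBound(sizeof(page));
        uint8_t *dst = malloc(bound);
        ZSTD_CStream *cs = ZSTD_createCStream();

        if (!dst || !cs || ZSTD_isError(ZSTD_initCStream(cs, 1))) {
            return 1;                            /* level 1: best speed */
        }

        ZSTD_inBuffer in = { .src = page, .size = sizeof(page), .pos = 0 };
        ZSTD_outBuffer out = { .dst = dst, .size = bound, .pos = 0 };
        size_t ret;

        do {
            ret = ZSTD_compressStream2(cs, &out, &in, ZSTD_e_end);
        } while (ret > 0 && in.pos < in.size && out.pos < out.size);

        if (ZSTD_isError(ret) || in.pos < in.size) {
            return 1;                            /* maps to the -1 error path */
        }
        printf("compressed %zu -> %zu bytes\n", in.size, out.pos);

        ZSTD_freeCStream(cs);
        free(dst);
        return 0;
    }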
Signed-off-by: Chuan Zheng Signed-off-by: Zeyu Jin Signed-off-by: Ying Fang --- hw/core/qdev-properties-system.c | 2 +- migration/ram-compress.c | 112 +++++++++++++++++++++++++++++++ migration/ram-compress.h | 15 +++++ qapi/migration.json | 3 +- 4 files changed, 130 insertions(+), 2 deletions(-) diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-system.c index cd5571fcfb..c581d46f2e 100644 --- a/hw/core/qdev-properties-system.c +++ b/hw/core/qdev-properties-system.c @@ -1206,7 +1206,7 @@ const PropertyInfo qdev_prop_uuid = { const PropertyInfo qdev_prop_compress_method = { .name = "CompressMethod", .description = "multi-thread compression method, " - "zlib", + "zlib/zstd", .enum_table = &CompressMethod_lookup, .get = qdev_propinfo_get_enum, .set = qdev_propinfo_set_enum, diff --git a/migration/ram-compress.c b/migration/ram-compress.c index 6e37b22492..74703f0ec4 100644 --- a/migration/ram-compress.c +++ b/migration/ram-compress.c @@ -171,6 +171,103 @@ static int zlib_check_len(int len) return len < 0 || len > compressBound(TARGET_PAGE_SIZE); } +#ifdef CONFIG_ZSTD +static int zstd_save_setup(CompressParam *param) +{ + int res; + param->zstd_cs = ZSTD_createCStream(); + if (!param->zstd_cs) { + return -1; + } + res = ZSTD_initCStream(param->zstd_cs, migrate_compress_level()); + if (ZSTD_isError(res)) { + return -1; + } + return 0; +} +static void zstd_save_cleanup(CompressParam *param) +{ + ZSTD_freeCStream(param->zstd_cs); + param->zstd_cs = NULL; +} +static ssize_t zstd_compress_data(CompressParam *param, size_t size) +{ + int ret; + uint8_t *dest = NULL; + uint8_t *p = param->originbuf; + QEMUFile *f = f = param->file; + ssize_t blen = qemu_put_compress_start(f, &dest); + if (blen < ZSTD_compressBound(size)) { + return -1; + } + param->out.dst = dest; + param->out.size = blen; + param->out.pos = 0; + param->in.src = p; + param->in.size = size; + param->in.pos = 0; + do { + ret = ZSTD_compressStream2(param->zstd_cs, ¶m->out, + ¶m->in, ZSTD_e_end); + } while (ret > 0 && (param->in.size - param->in.pos > 0) + && (param->out.size - param->out.pos > 0)); + if (ret > 0 && (param->in.size - param->in.pos > 0)) { + return -1; + } + if (ZSTD_isError(ret)) { + return -1; + } + blen = param->out.pos; + qemu_put_compress_end(f, blen); + return blen + sizeof(int32_t); +} + +static int zstd_load_setup(DecompressParam *param) +{ + int ret; + param->zstd_ds = ZSTD_createDStream(); + if (!param->zstd_ds) { + return -1; + } + ret = ZSTD_initDStream(param->zstd_ds); + if (ZSTD_isError(ret)) { + return -1; + } + return 0; +} +static void zstd_load_cleanup(DecompressParam *param) +{ + ZSTD_freeDStream(param->zstd_ds); + param->zstd_ds = NULL; +} +static int +zstd_decompress_data(DecompressParam *param, uint8_t *dest, size_t size) +{ + int ret; + param->out.dst = dest; + param->out.size = size; + param->out.pos = 0; + param->in.src = param->compbuf; + param->in.size = param->len; + param->in.pos = 0; + do { + ret = ZSTD_decompressStream(param->zstd_ds, ¶m->out, ¶m->in); + } while (ret > 0 && (param->in.size - param->in.pos > 0) + && (param->out.size - param->out.pos > 0)); + if (ret > 0 && (param->in.size - param->in.pos > 0)) { + return -1; + } + if (ZSTD_isError(ret)) { + return -1; + } + return ret; +} +static int zstd_check_len(int len) +{ + return len < 0 || len > ZSTD_compressBound(TARGET_PAGE_SIZE); +} +#endif + static int set_compress_ops(void) { compress_ops = g_new0(MigrationCompressOps, 1); @@ -181,6 +278,13 @@ static int set_compress_ops(void) compress_ops->save_cleanup = 
zlib_save_cleanup; compress_ops->compress_data = zlib_compress_data; break; +#ifdef CONFIG_ZSTD + case COMPRESS_METHOD_ZSTD: + compress_ops->save_setup = zstd_save_setup; + compress_ops->save_cleanup = zstd_save_cleanup; + compress_ops->compress_data = zstd_compress_data; + break; +#endif default: return -1; } @@ -199,6 +303,14 @@ static int set_decompress_ops(void) decompress_ops->decompress_data = zlib_decompress_data; decompress_ops->check_len = zlib_check_len; break; +#ifdef CONFIG_ZSTD + case COMPRESS_METHOD_ZSTD: + decompress_ops->load_setup = zstd_load_setup; + decompress_ops->load_cleanup = zstd_load_cleanup; + decompress_ops->decompress_data = zstd_decompress_data; + decompress_ops->check_len = zstd_check_len; + break; +#endif default: return -1; } diff --git a/migration/ram-compress.h b/migration/ram-compress.h index daf241987f..e8700eb36f 100644 --- a/migration/ram-compress.h +++ b/migration/ram-compress.h @@ -29,6 +29,10 @@ #ifndef QEMU_MIGRATION_COMPRESS_H #define QEMU_MIGRATION_COMPRESS_H +#ifdef CONFIG_ZSTD +#include +#include +#endif #include "qemu-file.h" #include "qapi/qapi-types-migration.h" @@ -50,6 +54,11 @@ struct DecompressParam { /* for zlib compression */ z_stream stream; +#ifdef CONFIG_ZSTD + ZSTD_DStream *zstd_ds; + ZSTD_inBuffer in; + ZSTD_outBuffer out; +#endif }; typedef struct DecompressParam DecompressParam; @@ -69,6 +78,12 @@ struct CompressParam { /* for zlib compression */ z_stream stream; + +#ifdef CONFIG_ZSTD + ZSTD_CStream *zstd_cs; + ZSTD_inBuffer in; + ZSTD_outBuffer out; +#endif }; typedef struct CompressParam CompressParam; diff --git a/qapi/migration.json b/qapi/migration.json index cafaa5ccb3..29af841f4e 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -714,12 +714,13 @@ # An enumeration of multi-thread compression methods. # # @zlib: use zlib compression method. +# @zstd: use zstd compression method. # # Since: 5.0 # ## { 'enum': 'CompressMethod', - 'data': [ 'zlib' ] } + 'data': [ 'zlib', { 'name': 'zstd', 'if': 'CONFIG_ZSTD' } ] } ## # @MigrationParameter: -- Gitee From 79863c5ccdd4c635657d2e32e91bc02aa49655e0 Mon Sep 17 00:00:00 2001 From: Chuan Zheng Date: Sat, 30 Jan 2021 16:23:15 +0800 Subject: [PATCH 102/939] migration: Add compress_level sanity check Zlib compression has level from 1 to 9. However Zstd compression has level from 1 to 22 (level >= 20 not recommanded). Let's do sanity check here to make sure a vaild compress_level is given by user. 
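For reference, the ranges enforced below reduce to the following predicate (standalone sketch only; the enum is illustrative, the real check switches on params->compress_method):

    #include <stdbool.h>

    typedef enum { METHOD_ZLIB, METHOD_ZSTD } Method;   /* illustrative */

    /* zlib accepts levels 1..9; zstd is capped at 19 here because
     * levels >= 20 are not recommended for this use case. */
    static bool compress_level_is_valid(Method method, int level)
    {
        switch (method) {
        case METHOD_ZLIB:
            return level >= 1 && level <= 9;
        case METHOD_ZSTD:
            return level >= 1 && level <= 19;
        default:
            return false;
        }
    }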
Signed-off-by: Chuan Zheng Signed-off-by: Zeyu Jin Signed-off-by: Ying Fang --- migration/options.c | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/migration/options.c b/migration/options.c index 6aaee702dc..9b68962a65 100644 --- a/migration/options.c +++ b/migration/options.c @@ -1065,16 +1065,40 @@ void migrate_params_init(MigrationParameters *params) params->has_mode = true; } +static bool compress_level_check(MigrationParameters *params, Error **errp) +{ + switch (params->compress_method) { + case COMPRESS_METHOD_ZLIB: + if (params->compress_level > 9 || params->compress_level < 1) { + error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "compress_level", + "a value in the range of 0 to 9 for Zlib method"); + return false; + } + break; +#ifdef CONFIG_ZSTD + case COMPRESS_METHOD_ZSTD: + if (params->compress_level > 19 || params->compress_level < 1) { + error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "compress_level", + "a value in the range of 1 to 19 for Zstd method"); + return false; + } + break; +#endif + default: + error_setg(errp, "Checking compress_level failed for unknown reason"); + return false; + } + + return true; +} + /* * Check whether the parameters are valid. Error will be put into errp * (if provided). Return true if valid, otherwise false. */ bool migrate_params_check(MigrationParameters *params, Error **errp) { - if (params->has_compress_level && - (params->compress_level > 9)) { - error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "compress_level", - "a value between 0 and 9"); + if (params->has_compress_level && !compress_level_check(params, errp)) { return false; } -- Gitee From 55e5f8cafda3c7d4a91e9d58c7b3259476e0dab9 Mon Sep 17 00:00:00 2001 From: Chuan Zheng Date: Sat, 30 Jan 2021 16:36:47 +0800 Subject: [PATCH 103/939] doc: Update multi-thread compression doc Modify the doc to fit the previous changes. Signed-off-by: Chuan Zheng Signed-off-by: Zeyu Jin Signed-off-by: Ying Fang --- docs/multi-thread-compression.txt | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/docs/multi-thread-compression.txt b/docs/multi-thread-compression.txt index 95b1556f67..450e5de469 100644 --- a/docs/multi-thread-compression.txt +++ b/docs/multi-thread-compression.txt @@ -33,14 +33,15 @@ thread compression can be used to accelerate the compression process. The decompression speed of Zlib is at least 4 times as quick as compression, if the source and destination CPU have equal speed, -keeping the compression thread count 4 times the decompression -thread count can avoid resource waste. +and you choose Zlib as compression method, keeping the compression +thread count 4 times the decompression thread count can avoid resource waste. Compression level can be used to control the compression speed and the -compression ratio. High compression ratio will take more time, level 0 -stands for no compression, level 1 stands for the best compression -speed, and level 9 stands for the best compression ratio. Users can -select a level number between 0 and 9. +compression ratio. High compression ratio will take more time, +level 1 stands for the best compression speed, and higher level means higher +compression ration. For Zlib, users can select a level number between 0 and 9, +where level 0 stands for no compression. For Zstd, users can select a +level number between 1 and 22. 
When to use the multiple thread compression in live migration @@ -116,16 +117,19 @@ to support the multiple thread compression migration: 2. Activate compression on the source: {qemu} migrate_set_capability compress on -3. Set the compression thread count on source: +3. Set the compression method: + {qemu} migrate_set_parameter compress_method zstd + +4. Set the compression thread count on source: {qemu} migrate_set_parameter compress-threads 12 -4. Set the compression level on the source: +5. Set the compression level on the source: {qemu} migrate_set_parameter compress-level 1 -5. Set the decompression thread count on destination: +6. Set the decompression thread count on destination: {qemu} migrate_set_parameter decompress-threads 3 -6. Start outgoing migration: +7. Start outgoing migration: {qemu} migrate -d tcp:destination.host:4444 {qemu} info migrate Capabilities: ... compress: on @@ -136,6 +140,7 @@ The following are the default settings: compress-threads: 8 decompress-threads: 2 compress-level: 1 (which means best speed) + compress_method: zlib So, only the first two steps are required to use the multiple thread compression in migration. You can do more if the default @@ -143,7 +148,7 @@ settings are not appropriate. TODO ==== -Some faster (de)compression method such as LZ4 and Quicklz can help -to reduce the CPU consumption when doing (de)compression. If using -these faster (de)compression method, less (de)compression threads +Comparing to Zlib, Some faster (de)compression method such as LZ4 +and Quicklz can help to reduce the CPU consumption when doing (de)compression. +If using these faster (de)compression method, less (de)compression threads are needed when doing the migration. -- Gitee From 9ebad9c3020625df0a178e6a2d06eaae15ef767c Mon Sep 17 00:00:00 2001 From: Chuan Zheng Date: Wed, 9 Feb 2022 12:51:19 +0800 Subject: [PATCH 104/939] cpu/features: fix bug for memory leakage strList hash not free after used, Fix it. 
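In other words, the strList filled in by x86_cpu_list_feature_names() is only borrowed by the visitor, so the caller must release it. Reduced sketch of the corrected pattern (QEMU-internal QAPI helpers, shown only to make the ownership rule explicit):

    strList *result = NULL;

    x86_cpu_list_feature_names(xc->filtered_features, &result);
    visit_type_strList(v, "unavailable-features", &result, errp);
    /* the visitor does not take ownership, so free the list here */
    qapi_free_strList(result);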
--- target/i386/cpu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index fc61a84b1e..f94405c02b 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -5475,6 +5475,7 @@ static void x86_cpu_get_unavailable_features(Object *obj, Visitor *v, x86_cpu_list_feature_names(xc->filtered_features, &result); visit_type_strList(v, "unavailable-features", &result, errp); + qapi_free_strList(result); } /* Print all cpuid feature names in featureset -- Gitee From 7caa5d818e0fa0e1cee2513f2fde4e81f8b5cc13 Mon Sep 17 00:00:00 2001 From: zhengchuan Date: Mon, 5 Dec 2022 20:52:25 +0800 Subject: [PATCH 105/939] migration: report migration related thread pid to libvirt in order to control migration thread cgroup, we need to report migration related thread pid to libvirt Signed-off-by:zhengchuan --- migration/migration.c | 3 +++ qapi/migration.json | 12 ++++++++++++ 2 files changed, 15 insertions(+) diff --git a/migration/migration.c b/migration/migration.c index 3ce04b2aaf..7c2fdde26b 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -3299,6 +3299,9 @@ static void *migration_thread(void *opaque) MigThrError thr_error; bool urgent = false; + /* report migration thread pid to libvirt */ + qapi_event_send_migration_pid(qemu_get_thread_id()); + thread = migration_threads_add("live_migration", qemu_get_thread_id()); rcu_register_thread(); diff --git a/qapi/migration.json b/qapi/migration.json index 29af841f4e..b442d0d878 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -1447,6 +1447,18 @@ { 'event': 'MIGRATION_PASS', 'data': { 'pass': 'int' } } +## +# @MIGRATION_PID: +# +# Emitted when migration thread appear +# +# @pid: pid of migration thread +# +# Since: EulerOS Virtual +## +{ 'event': 'MIGRATION_PID', + 'data': { 'pid': 'int' } } + ## # @COLOMessage: # -- Gitee From e387eaeef8845993a437ad19eaf988fb101d3fdd Mon Sep 17 00:00:00 2001 From: zhengchuan Date: Mon, 5 Dec 2022 20:56:35 +0800 Subject: [PATCH 106/939] migration: report multiFd related thread pid to libvirt report multiFd related thread pid to libvirt in order to pin multiFd thread to different cpu. 
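On the receiving side, libvirt (or any other QMP client) takes the thread ID from the MIGRATION_PID / MIGRATION_MULTIFD_PID events and pins or cgroup-classifies that thread. A minimal consumer-side sketch using only standard Linux APIs (event parsing is assumed to have happened already; names are illustrative):

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/types.h>

    /* Pin one migration/multifd thread, identified by the kernel TID
     * reported in the QMP event, onto a single host CPU.
     * sched_setaffinity() accepts a TID, not just a process ID. */
    static int pin_migration_tid(pid_t tid, int host_cpu)
    {
        cpu_set_t set;

        CPU_ZERO(&set);
        CPU_SET(host_cpu, &set);
        if (sched_setaffinity(tid, sizeof(set), &set) != 0) {
            perror("sched_setaffinity");
            return -1;
        }
        return 0;
    }

    int main(int argc, char **argv)
    {
        if (argc != 3) {
            fprintf(stderr, "usage: %s <tid> <cpu>\n", argv[0]);
            return 1;
        }
        return pin_migration_tid(atoi(argv[1]), atoi(argv[2])) ? 1 : 0;
    }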
Signed-off-by:zhengchuan --- migration/multifd.c | 4 ++++ qapi/migration.json | 12 ++++++++++++ 2 files changed, 16 insertions(+) diff --git a/migration/multifd.c b/migration/multifd.c index 409460684f..7d373a245e 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -17,6 +17,7 @@ #include "exec/ramblock.h" #include "qemu/error-report.h" #include "qapi/error.h" +#include "qapi/qapi-events-migration.h" #include "ram.h" #include "migration.h" #include "migration-stats.h" @@ -657,6 +658,9 @@ static void *multifd_send_thread(void *opaque) thread = migration_threads_add(p->name, qemu_get_thread_id()); + /* report multifd thread pid to libvirt */ + qapi_event_send_migration_multifd_pid(qemu_get_thread_id()); + trace_multifd_send_thread_start(p->id); rcu_register_thread(); diff --git a/qapi/migration.json b/qapi/migration.json index b442d0d878..5d0855a1d8 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -1447,6 +1447,18 @@ { 'event': 'MIGRATION_PASS', 'data': { 'pass': 'int' } } +## +# @MIGRATION_MULTIFD_PID: +# +# Emitted when multifd thread appear +# +# @pid: pid of multifd thread +# +# Since: EulerOS Virtual +## +{ 'event': 'MIGRATION_MULTIFD_PID', + 'data': { 'pid': 'int' } } + ## # @MIGRATION_PID: # -- Gitee From 302401ee7eb437712b69caff44ce684c88573dc6 Mon Sep 17 00:00:00 2001 From: Chuan Zheng Date: Mon, 29 Jul 2019 16:22:12 +0800 Subject: [PATCH 107/939] vhost: cancel migration when vhost-user restarted during migraiton Qemu will abort when vhost-user process is restarted during migration when vhost_log_global_start/stop is called. The reason is clear that vhost_dev_set_log returns -1 because network connection is temporarily lost. Let's cancel migraiton and report it to user in this abnormal situation. Signed-off-by: Ying Fang --- hw/virtio/vhost.c | 9 +++++++-- migration/migration.c | 2 +- migration/migration.h | 1 + 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index 2c9ac79468..a8adc149ad 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -26,6 +26,7 @@ #include "hw/mem/memory-device.h" #include "migration/blocker.h" #include "migration/qemu-file-types.h" +#include "migration/migration.h" #include "sysemu/dma.h" #include "trace.h" @@ -1047,20 +1048,24 @@ check_dev_state: static void vhost_log_global_start(MemoryListener *listener) { int r; + Error *errp = NULL; r = vhost_migration_log(listener, true); if (r < 0) { - abort(); + error_setg(&errp, "Failed to start vhost migration log"); + migrate_fd_error(migrate_get_current(), errp); } } static void vhost_log_global_stop(MemoryListener *listener) { int r; + Error *errp = NULL; r = vhost_migration_log(listener, false); if (r < 0) { - abort(); + error_setg(&errp, "Failed to stop vhost migration log"); + migrate_fd_error(migrate_get_current(), errp); } } diff --git a/migration/migration.c b/migration/migration.c index 3ce04b2aaf..71a03b3248 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -1377,7 +1377,7 @@ static void migrate_error_free(MigrationState *s) } } -static void migrate_fd_error(MigrationState *s, const Error *error) +void migrate_fd_error(MigrationState *s, const Error *error) { trace_migrate_fd_error(error_get_pretty(error)); assert(s->to_dst_file == NULL); diff --git a/migration/migration.h b/migration/migration.h index cf2c9c88e0..6aafa04314 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -482,6 +482,7 @@ bool migration_has_all_channels(void); uint64_t migrate_max_downtime(void); +void 
migrate_fd_error(MigrationState *s, const Error *error); void migrate_set_error(MigrationState *s, const Error *error); bool migrate_has_error(MigrationState *s); -- Gitee From a57cbe41cd8b2d8bc31eac33ee74a3ac058d67dd Mon Sep 17 00:00:00 2001 From: qihao Date: Thu, 28 Mar 2024 15:24:25 +0800 Subject: [PATCH 108/939] hw/scsi/scsi-generic: Fix io_timeout property not applying MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cheery-pick from 7c7a9f578e4fb1adff7ac8d9acaaaedb87474e76 The io_timeout property, introduced in c9b6609 (part of 6.0) is silently overwritten by the hardcoded default value of 30 seconds (DEFAULT_IO_TIMEOUT) in scsi_generic_realize because that function is being called after the properties have already been applied. The property definition already has a default value which is applied correctly when no value is explicitly set, so we can just remove the code which overrides the io_timeout completely. This has been tested by stracing SG_IO operations with the io_timeout property set and unset and now sets the timeout field in the ioctl request to the proper value. Fixes: c9b6609b69facad ("scsi: make io_timeout configurable") Signed-off-by: Lorenz Brun Message-ID: <20240315145831.2531695-1-lorenz@brun.one> Reviewed-by: Alex Bennée Signed-off-by: Philippe Mathieu-Daudé Signed-off-by: qihao_yewu --- hw/scsi/scsi-generic.c | 1 - 1 file changed, 1 deletion(-) diff --git a/hw/scsi/scsi-generic.c b/hw/scsi/scsi-generic.c index 22efcd09a6..12fdd8e748 100644 --- a/hw/scsi/scsi-generic.c +++ b/hw/scsi/scsi-generic.c @@ -782,7 +782,6 @@ static void scsi_generic_realize(SCSIDevice *s, Error **errp) /* Only used by scsi-block, but initialize it nevertheless to be clean. */ s->default_scsi_version = -1; - s->io_timeout = DEFAULT_IO_TIMEOUT; scsi_generic_read_device_inquiry(s); } -- Gitee From b57e956ea522b487081d1c94aa2e4af6a3314d20 Mon Sep 17 00:00:00 2001 From: Jinhua Cao Date: Thu, 10 Feb 2022 11:09:36 +0800 Subject: [PATCH 109/939] virtio: check descriptor numbers Check if the vring num is normal in virtio_save(), and add LOG the vm push the wrong viring num down through writing IO Port. Signed-off-by: Jinhua Cao --- hw/virtio/virtio.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index a9aa0c4f66..27ceab92be 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -2797,6 +2797,22 @@ static const VMStateDescription vmstate_virtio = { } }; +static void check_vring_avail_num(VirtIODevice *vdev, int index) +{ + uint16_t nheads; + + /* Check it isn't doing strange things with descriptor numbers. 
*/ + nheads = vring_avail_idx(&vdev->vq[index]) - vdev->vq[index].last_avail_idx; + if (nheads > vdev->vq[index].vring.num) { + qemu_log("VQ %d size 0x%x Guest index 0x%x " + "inconsistent with Host index 0x%x: " + "delta 0x%x\n", + index, vdev->vq[index].vring.num, + vring_avail_idx(&vdev->vq[index]), + vdev->vq[index].last_avail_idx, nheads); + } +} + int virtio_save(VirtIODevice *vdev, QEMUFile *f) { BusState *qbus = qdev_get_parent_bus(DEVICE(vdev)); @@ -2827,6 +2843,8 @@ int virtio_save(VirtIODevice *vdev, QEMUFile *f) if (vdev->vq[i].vring.num == 0) break; + check_vring_avail_num(vdev, i); + qemu_put_be32(f, vdev->vq[i].vring.num); if (k->has_variable_vring_alignment) { qemu_put_be32(f, vdev->vq[i].vring.align); -- Gitee From 7b4a9547e68147291e68258db9415ef5a20fe06b Mon Sep 17 00:00:00 2001 From: Jinhua Cao Date: Thu, 10 Feb 2022 11:16:26 +0800 Subject: [PATCH 110/939] virtio: bugfix: add rcu_read_lock when vring_avail_idx is called viring_avail_idx should be called within rcu_read_lock(), or may get NULL caches in vring_get_region_caches() and trigger assert(). Signed-off-by: Jinhua Cao --- hw/virtio/virtio.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 27ceab92be..ec09d515c2 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -2801,6 +2801,7 @@ static void check_vring_avail_num(VirtIODevice *vdev, int index) { uint16_t nheads; + rcu_read_lock(); /* Check it isn't doing strange things with descriptor numbers. */ nheads = vring_avail_idx(&vdev->vq[index]) - vdev->vq[index].last_avail_idx; if (nheads > vdev->vq[index].vring.num) { @@ -2811,6 +2812,7 @@ static void check_vring_avail_num(VirtIODevice *vdev, int index) vring_avail_idx(&vdev->vq[index]), vdev->vq[index].last_avail_idx, nheads); } + rcu_read_unlock(); } int virtio_save(VirtIODevice *vdev, QEMUFile *f) -- Gitee From b24730e9abe34898483fa62b24c26abb9d98570c Mon Sep 17 00:00:00 2001 From: Jinhua Cao Date: Thu, 10 Feb 2022 14:16:17 +0800 Subject: [PATCH 111/939] virtio: print the guest virtio_net features that host does not support print the guest virtio_net features that host does not support For example: Please check host config, because host does not support required feature bits 0x1983 virtio_net_feature: csum, guest_csum, guest_tso4, guest_tso6, host_tso4, host_tso6 Features 0xef99a3 unsupported. Allowed features: 0x40ff8024 Signed-off-by: Jinhua Cao --- hw/net/virtio-net.c | 41 ++++++++++++++++++++++++++++++++++++++ hw/virtio/virtio.c | 7 +++++++ include/hw/virtio/virtio.h | 1 + 3 files changed, 49 insertions(+) diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index 80c56f0cfc..7f69a4b842 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -3952,6 +3952,46 @@ static Property virtio_net_properties[] = { DEFINE_PROP_END_OF_LIST(), }; +static void virtio_net_print_features(uint64_t features) +{ + Property *props = virtio_net_properties; + int feature_cnt = 0; + + if (!features) { + return; + } + printf("virtio_net_feature: "); + + for (; features && props->name; props++) { + /* The bitnr of property may be default(0) besides 'csum' property. */ + if (props->bitnr == 0 && strcmp(props->name, "csum")) { + continue; + } + + /* Features only support 64bit. 
*/ + if (props->bitnr > 63) { + continue; + } + + if (virtio_has_feature(features, props->bitnr)) { + virtio_clear_feature(&features, props->bitnr); + if (feature_cnt != 0) { + printf(", "); + } + printf("%s", props->name); + feature_cnt++; + } + } + + if (features) { + if (feature_cnt != 0) { + printf(", "); + } + printf("unkown bits 0x%." PRIx64, features); + } + printf("\n"); +} + static void virtio_net_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); @@ -3966,6 +4006,7 @@ static void virtio_net_class_init(ObjectClass *klass, void *data) vdc->set_config = virtio_net_set_config; vdc->get_features = virtio_net_get_features; vdc->set_features = virtio_net_set_features; + vdc->print_features = virtio_net_print_features; vdc->bad_features = virtio_net_bad_features; vdc->reset = virtio_net_reset; vdc->queue_reset = virtio_net_queue_reset; diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index ec09d515c2..1f78b74c00 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -2905,6 +2905,13 @@ static int virtio_set_features_nocheck(VirtIODevice *vdev, uint64_t val) { VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev); bool bad = (val & ~(vdev->host_features)) != 0; + uint64_t feat = val & ~(vdev->host_features); + + if (bad && k->print_features) { + qemu_log("error: Please check host config, "\ + "because host does not support required feature bits 0x%" PRIx64 "\n", feat); + k->print_features(feat); + } val &= vdev->host_features; if (k->set_features) { diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h index c8f72850bc..7c35bb841b 100644 --- a/include/hw/virtio/virtio.h +++ b/include/hw/virtio/virtio.h @@ -182,6 +182,7 @@ struct VirtioDeviceClass { int (*validate_features)(VirtIODevice *vdev); void (*get_config)(VirtIODevice *vdev, uint8_t *config); void (*set_config)(VirtIODevice *vdev, const uint8_t *config); + void (*print_features)(uint64_t features); void (*reset)(VirtIODevice *vdev); void (*set_status)(VirtIODevice *vdev, uint8_t val); /* Device must validate queue_index. */ -- Gitee From f6b3e8ea39d00d25ab979f7b24842dc24e263ed8 Mon Sep 17 00:00:00 2001 From: Jinhua Cao Date: Thu, 10 Feb 2022 14:37:52 +0800 Subject: [PATCH 112/939] virtio: bugfix: check the value of caches before accessing it Vring caches may be NULL in check_vring_avail_num() if virtio_reset() is called at the same time, such as when the virtual machine starts. So check it before accessing it in vring_avail_idx(). Signed-off-by: Jinhua Cao --- hw/virtio/virtio.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 1f78b74c00..d93ea62723 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -2800,8 +2800,19 @@ static const VMStateDescription vmstate_virtio = { static void check_vring_avail_num(VirtIODevice *vdev, int index) { uint16_t nheads; + VRingMemoryRegionCaches *caches; rcu_read_lock(); + caches = qatomic_rcu_read(&vdev->vq[index].vring.caches); + if (caches == NULL) { + /* + * caches may be NULL if virtio_reset is called at the same time, + * such as when the virtual machine starts. + */ + rcu_read_unlock(); + return; + } + /* Check it isn't doing strange things with descriptor numbers. 
*/ nheads = vring_avail_idx(&vdev->vq[index]) - vdev->vq[index].last_avail_idx; if (nheads > vdev->vq[index].vring.num) { -- Gitee From 4e5de00fb124d82f9c4ce2ac433ed3d691783c01 Mon Sep 17 00:00:00 2001 From: Jinhua Cao Date: Wed, 9 Feb 2022 19:58:21 +0800 Subject: [PATCH 113/939] virtio-scsi: bugfix: fix qemu crash for hotplug scsi disk with dataplane The vm will trigger a disk sweep operation after plugging a controller who's io type is iothread. If attach a scsi disk immediately, the sg_inqury request in vm will trigger the assert in virtio_scsi_ctx_check(), which is called by virtio_scsi_handle_cmd_req_prepare(). Add judgment in virtio_scsi_handle_cmd_req_prepare() and return IO Error directly if the device has not been initialized. Signed-off-by: Jinhua Cao --- hw/scsi/virtio-scsi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c index 9c751bf296..bc7feb404a 100644 --- a/hw/scsi/virtio-scsi.c +++ b/hw/scsi/virtio-scsi.c @@ -781,7 +781,7 @@ static int virtio_scsi_handle_cmd_req_prepare(VirtIOSCSI *s, VirtIOSCSIReq *req) req->req.cmd.tag, req->req.cmd.cdb[0]); d = virtio_scsi_device_get(s, req->req.cmd.lun); - if (!d) { + if (!d || !d->qdev.realized) { req->resp.cmd.response = VIRTIO_SCSI_S_BAD_TARGET; virtio_scsi_complete_cmd_req(req); return -ENOENT; -- Gitee From 81b4091eee81fe3871d836b1a684e27828cdc2be Mon Sep 17 00:00:00 2001 From: WangJian Date: Wed, 9 Feb 2022 10:42:33 +0800 Subject: [PATCH 114/939] nbd/server.c: fix invalid read after client was already free In the process of NBD equipment pressurization, executing QEMU NBD will lead to the failure of IO distribution and go to NBD_ Out process of trip(). If two or more IO go to the out process, client NBD will release in nbd_request_put(). The user after free problem that is read again in close(). Through the NBD_ Save the value of client > closing before the out process in trip to solve the use after free problem. Signed-off-by: wangjian161 --- nbd/server.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/nbd/server.c b/nbd/server.c index 895cf0a752..e8baed9705 100644 --- a/nbd/server.c +++ b/nbd/server.c @@ -2939,6 +2939,7 @@ static coroutine_fn void nbd_trip(void *opaque) NBDRequestData *req; NBDRequest request = { 0 }; /* GCC thinks it can be used uninitialized */ int ret; + bool client_closing; Error *local_err = NULL; trace_nbd_trip(); @@ -3023,8 +3024,11 @@ disconnect: if (local_err) { error_reportf_err(local_err, "Disconnect client, due to: "); } + client_closing = client->closing; nbd_request_put(req); - client_close(client, true); + if (!client_closing) { + client_close(client, true); + } nbd_client_put(client); } -- Gitee From 0e610831d584d9485eb0655168d08d8234bbb555 Mon Sep 17 00:00:00 2001 From: WangJian Date: Wed, 9 Feb 2022 10:48:58 +0800 Subject: [PATCH 115/939] qemu-nbd: make native as the default aio mode When the file system is dealing with multithreading concurrent writing to a file, the performance will be degraded because of the lock. At present, the default AIO mode of QEMU NBD is threads. In the case of large blocks, because IO is divided into small pieces and multiple queues, it will become multithreading concurrent writing the same file. Due to the file system, the performance will be greatly reduced. If you change to native mode, this problem will not exist. 
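Concretely, once this change is in, an export opened with cache=none defaults to Linux native AIO unless an explicit --aio= option is given, e.g. (device path and image name are illustrative):

    qemu-nbd --cache=none --format=raw --connect=/dev/nbd0 /images/test.raw
    qemu-nbd --cache=none --aio=threads --connect=/dev/nbd0 /images/test.raw

The first invocation now behaves as if --aio=native had been passed; the second keeps the old thread-pool behaviour for users who still want it.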
Signed-off-by: wangjian161 --- qemu-nbd.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/qemu-nbd.c b/qemu-nbd.c index 186e6468b1..acccf2977f 100644 --- a/qemu-nbd.c +++ b/qemu-nbd.c @@ -843,6 +843,10 @@ int main(int argc, char **argv) trace_init_file(); qemu_set_log(LOG_TRACE, &error_fatal); + if (!seen_aio && (flags & BDRV_O_NOCACHE)) { + flags |= BDRV_O_NATIVE_AIO; + } + socket_activation = check_socket_activation(); if (socket_activation == 0) { if (!sockpath) { -- Gitee From d6aa08ac3693be3e08f2c8d3ad5a356ea6e9dead Mon Sep 17 00:00:00 2001 From: WangJian Date: Wed, 9 Feb 2022 10:55:08 +0800 Subject: [PATCH 116/939] qemu-nbd: set timeout to qemu-nbd socket In case of insufficient memory and kill-9, the NBD socket cannot be processed and stuck all the time. Signed-off-by: wangjian161 --- nbd/client.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/nbd/client.c b/nbd/client.c index 29ffc609a4..987dde43c7 100644 --- a/nbd/client.c +++ b/nbd/client.c @@ -24,6 +24,8 @@ #include "nbd-internal.h" #include "qemu/cutils.h" +#define NBD_TIMEOUT_SECONDS 30 + /* Definitions for opaque data types */ static QTAILQ_HEAD(, NBDExport) exports = QTAILQ_HEAD_INITIALIZER(exports); @@ -1310,6 +1312,12 @@ int nbd_init(int fd, QIOChannelSocket *sioc, NBDExportInfo *info, } } + if (ioctl(fd, NBD_SET_TIMEOUT, NBD_TIMEOUT_SECONDS) < 0) { + int serrno = errno; + error_setg(errp, "Failed setting timeout"); + return -serrno; + } + trace_nbd_init_finish(); return 0; -- Gitee From 172d79d8ebb343fa144987d2c50d90655d5aa5f9 Mon Sep 17 00:00:00 2001 From: Kunkun Jiang Date: Thu, 29 Jul 2021 15:24:48 +0800 Subject: [PATCH 117/939] qdev/monitors: Fix reundant error_setg of qdev_add_device There is an extra log "error_setg" in qdev_add_device(). When hot-plug a device, if the corresponding bus doesn't exist, it will trigger an asseration "assert(*errp == NULL)". Fixes: 515a7970490 (log: Add some logs on VM runtime path) Signed-off-by: Kunkun Jiang Signed-off-by: Yan Wang --- system/qdev-monitor.c | 1 - 1 file changed, 1 deletion(-) diff --git a/system/qdev-monitor.c b/system/qdev-monitor.c index c885175b66..b10e483a9a 100644 --- a/system/qdev-monitor.c +++ b/system/qdev-monitor.c @@ -644,7 +644,6 @@ DeviceState *qdev_device_add_from_qdict(const QDict *opts, if (path != NULL) { bus = qbus_find(path, errp); if (!bus) { - error_setg(errp, "can not find bus for %s", driver); return NULL; } if (!object_dynamic_cast(OBJECT(bus), dc->bus_type)) { -- Gitee From 6c72e65d57dc2a7d811f76a126a9a006abd0ab75 Mon Sep 17 00:00:00 2001 From: fangying Date: Wed, 18 Mar 2020 12:51:33 +0800 Subject: [PATCH 118/939] pcie: Compat with devices which do not support Link Width, such as ioh3420 We hack into PCI_EXP_LNKCAP to support device fast plug/unplug for pcie-root-port. However some devices like ioh3420 does not suport it, so PCI_EXP_LNKCAP is not set for such devices. 
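For readers less familiar with the register involved: the two Link Capabilities fields that are no longer rewritten unconditionally are Maximum Link Width (bits 9:4) and Supported Link Speeds (bits 3:0). A self-contained decoding sketch, with the masks copied from the standard linux/pci_regs.h values:

    #include <stdint.h>
    #include <stdio.h>

    #define PCI_EXP_LNKCAP_SLS 0x0000000f  /* Supported Link Speeds, bits 3:0 */
    #define PCI_EXP_LNKCAP_MLW 0x000003f0  /* Maximum Link Width, bits 9:4 */

    static void decode_lnkcap(uint32_t lnkcap)
    {
        unsigned width = (lnkcap & PCI_EXP_LNKCAP_MLW) >> 4;
        unsigned speed = lnkcap & PCI_EXP_LNKCAP_SLS;

        /* speed codes: 1 = 2.5 GT/s, 2 = 5 GT/s, 3 = 8 GT/s, 4 = 16 GT/s */
        printf("link width x%u, speed code %u\n", width, speed);
    }

    int main(void)
    {
        decode_lnkcap(0x0042);   /* example value: x4 width, 5 GT/s */
        return 0;
    }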
Signed-off-by: Ying Fang Signed-off-by: Yan Wang --- hw/pci/pcie.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c index 6db0cf69cd..dccf204451 100644 --- a/hw/pci/pcie.c +++ b/hw/pci/pcie.c @@ -97,13 +97,6 @@ static void pcie_cap_fill_slot_lnk(PCIDevice *dev) return; } - /* Clear and fill LNKCAP from what was configured above */ - pci_long_test_and_clear_mask(exp_cap + PCI_EXP_LNKCAP, - PCI_EXP_LNKCAP_MLW | PCI_EXP_LNKCAP_SLS); - pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP, - QEMU_PCI_EXP_LNKCAP_MLW(s->width) | - QEMU_PCI_EXP_LNKCAP_MLS(s->speed)); - /* * Link bandwidth notification is required for all root ports and * downstream ports supporting links wider than x1 or multiple link @@ -111,6 +104,12 @@ static void pcie_cap_fill_slot_lnk(PCIDevice *dev) */ if (s->width > QEMU_PCI_EXP_LNK_X1 || s->speed > QEMU_PCI_EXP_LNK_2_5GT) { + /* Clear and fill LNKCAP from what was configured above */ + pci_long_test_and_clear_mask(exp_cap + PCI_EXP_LNKCAP, + PCI_EXP_LNKCAP_MLW | PCI_EXP_LNKCAP_SLS); + pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP, + QEMU_PCI_EXP_LNKCAP_MLW(s->width) | + QEMU_PCI_EXP_LNKCAP_MLS(s->speed)); pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP, PCI_EXP_LNKCAP_LBNC); } -- Gitee From 3c4b4c4fc3c71b375490233bb9209763d7094ee9 Mon Sep 17 00:00:00 2001 From: Yan Wang Date: Tue, 8 Feb 2022 16:10:31 +0800 Subject: [PATCH 119/939] pcie: Add pcie-root-port fast plug/unplug feature If a device is plugged in the pcie-root-port when VM kernel is booting, the kernel may wrongly disable the device. This bug was brought in by two patches of the linux kernel: https://patchwork.kernel.org/patch/10575355/ https://patchwork.kernel.org/patch/10766219/ VM runtime like kata uses this feature to boot microVM, so we must fix it up. We hack into the pcie native hotplug patch so that hotplug/unplug will work under this circumstance. 
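Both knobs are exposed as pcie-root-port properties (default 0, and pinned to 0 for 3.1-and-older machine types via hw_compat_3_1). An illustrative command-line fragment enabling them; the IDs and chassis/slot numbers are examples only:

    -device pcie-root-port,id=rp1,bus=pcie.0,chassis=1,slot=1,fast-plug=1,fast-unplug=1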
Signed-off-by: Ying Fang Signed-off-by: Yan Wang --- hw/core/machine.c | 2 ++ hw/pci-bridge/gen_pcie_root_port.c | 2 ++ hw/pci/pcie.c | 13 ++++++++++++- include/hw/pci/pcie_port.h | 3 +++ 4 files changed, 19 insertions(+), 1 deletion(-) diff --git a/hw/core/machine.c b/hw/core/machine.c index 0c17398141..965682619b 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -160,6 +160,8 @@ const size_t hw_compat_4_0_len = G_N_ELEMENTS(hw_compat_4_0); GlobalProperty hw_compat_3_1[] = { { "pcie-root-port", "x-speed", "2_5" }, { "pcie-root-port", "x-width", "1" }, + { "pcie-root-port", "fast-plug", "0" }, + { "pcie-root-port", "fast-unplug", "0" }, { "memory-backend-file", "x-use-canonical-path-for-ramblock-id", "true" }, { "memory-backend-memfd", "x-use-canonical-path-for-ramblock-id", "true" }, { "tpm-crb", "ppi", "false" }, diff --git a/hw/pci-bridge/gen_pcie_root_port.c b/hw/pci-bridge/gen_pcie_root_port.c index 1ce4e7beba..1e1ab5bb19 100644 --- a/hw/pci-bridge/gen_pcie_root_port.c +++ b/hw/pci-bridge/gen_pcie_root_port.c @@ -145,6 +145,8 @@ static Property gen_rp_props[] = { speed, PCIE_LINK_SPEED_16), DEFINE_PROP_PCIE_LINK_WIDTH("x-width", PCIESlot, width, PCIE_LINK_WIDTH_32), + DEFINE_PROP_UINT8("fast-plug", PCIESlot, fast_plug, 0), + DEFINE_PROP_UINT8("fast-unplug", PCIESlot, fast_unplug, 0), DEFINE_PROP_END_OF_LIST() }; diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c index dccf204451..04fbd794a8 100644 --- a/hw/pci/pcie.c +++ b/hw/pci/pcie.c @@ -555,6 +555,7 @@ void pcie_cap_slot_unplug_request_cb(HotplugHandler *hotplug_dev, uint8_t *exp_cap = hotplug_pdev->config + hotplug_pdev->exp.exp_cap; uint32_t sltcap = pci_get_word(exp_cap + PCI_EXP_SLTCAP); uint16_t sltctl = pci_get_word(exp_cap + PCI_EXP_SLTCTL); + PCIESlot *s = PCIE_SLOT(hotplug_pdev); /* Check if hot-unplug is disabled on the slot */ if ((sltcap & PCI_EXP_SLTCAP_HPC) == 0) { @@ -600,7 +601,17 @@ void pcie_cap_slot_unplug_request_cb(HotplugHandler *hotplug_dev, return; } - pcie_cap_slot_push_attention_button(hotplug_pdev); + if ((pci_dev->cap_present & QEMU_PCIE_LNKSTA_DLLLA) && s->fast_plug) { + pci_word_test_and_clear_mask(pci_dev->config + pci_dev->exp.exp_cap + PCI_EXP_LNKSTA, + PCI_EXP_LNKSTA_DLLLA); + } + + if (s->fast_unplug) { + pcie_cap_slot_event(hotplug_pdev, + PCI_EXP_HP_EV_PDC | PCI_EXP_HP_EV_ABP); + } else { + pcie_cap_slot_push_attention_button(hotplug_pdev); + } } /* pci express slot for pci express root/downstream port diff --git a/include/hw/pci/pcie_port.h b/include/hw/pci/pcie_port.h index 90e6cf45b8..7148a0959b 100644 --- a/include/hw/pci/pcie_port.h +++ b/include/hw/pci/pcie_port.h @@ -56,6 +56,9 @@ struct PCIESlot { uint8_t chassis; uint16_t slot; + uint8_t fast_plug; + uint8_t fast_unplug; + PCIExpLinkSpeed speed; PCIExpLinkWidth width; -- Gitee From 6999f07558308ee6b7d63e46ca554a0b702948d6 Mon Sep 17 00:00:00 2001 From: liuxiangdong Date: Tue, 8 Feb 2022 15:10:25 +0800 Subject: [PATCH 120/939] net/dump.c: Suppress spurious compiler warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Compiling with gcc version 11.2.0 (Ubuntu 11.2.0-13ubuntu1) results in a (spurious) warning: In function ‘dump_receive_iov’, inlined from ‘filter_dump_receive_iov’ at ../net/dump.c:157:5: ../net/dump.c:89:9: error: ‘writev’ specified size 18446744073709551600 exceeds maximum object size 9223372036854775807 [-Werror=stringop-overflow=] 89 | if (writev(s->fd, dumpiov, cnt + 1) != sizeof(hdr) + caplen) { | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In file included from 
/home/ptomsich/qemu/include/qemu/osdep.h:108, from ../net/dump.c:25: ../net/dump.c: In function ‘filter_dump_receive_iov’: /usr/include/x86_64-linux-gnu/sys/uio.h:52:16: note: in a call to function ‘writev’ declared with attribute ‘read_only (2, 3)’ 52 | extern ssize_t writev (int __fd, const struct iovec *__iovec, int __count) | ^~~~~~ cc1: all warnings being treated as errors This change helps that version of GCC to understand what is going on and suppresses this warning. Signed-off-by: Philipp Tomsich --- net/dump.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/dump.c b/net/dump.c index 16073f2458..d880a7e299 100644 --- a/net/dump.c +++ b/net/dump.c @@ -87,7 +87,7 @@ static ssize_t dump_receive_iov(DumpState *s, const struct iovec *iov, int cnt, dumpiov[0].iov_len = sizeof(hdr); cnt = iov_copy(&dumpiov[1], cnt, iov, cnt, offset, caplen); - if (writev(s->fd, dumpiov, cnt + 1) != sizeof(hdr) + caplen) { + if (writev(s->fd, &dumpiov[0], cnt + 1) != sizeof(hdr) + caplen) { error_report("network dump write error - stopping dump"); close(s->fd); s->fd = -1; -- Gitee From c3f204e02eacdd3e9ec6ac55396ccc7f115ad63e Mon Sep 17 00:00:00 2001 From: Qiang Ning Date: Mon, 12 Jul 2021 17:30:45 +0800 Subject: [PATCH 121/939] hw/net/rocker_of_dpa: fix double free bug of rocker device The of_dpa_cmd_add_l2_flood function of the rocker device releases the memory of group->l2_flood.group_ids before applying for new memory. If the l2_group configured by the guest does not match the input group->l2_flood.group_ids, the err_out branch is redirected to release the memory of the group->l2_flood.group_ids branch. The pointer is not set to NULL after the memory is freed. When the guest accesses the of_dpa_cmd_add_l2_flood function again, the memory of group->l2_flood.group_ids is released again. As a result, the memory is double free. Fix that by setting group->l2_flood.group_ids to NULL after free. Signed-off-by: Jiajie Li Signed-off-by: Qiang Ning Signed-off-by: Yan Wang --- hw/net/rocker/rocker_of_dpa.c | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/net/rocker/rocker_of_dpa.c b/hw/net/rocker/rocker_of_dpa.c index 5e16056be6..c25438cccc 100644 --- a/hw/net/rocker/rocker_of_dpa.c +++ b/hw/net/rocker/rocker_of_dpa.c @@ -2070,6 +2070,7 @@ static int of_dpa_cmd_add_l2_flood(OfDpa *of_dpa, OfDpaGroup *group, err_out: group->l2_flood.group_count = 0; g_free(group->l2_flood.group_ids); + group->l2_flood.group_ids = NULL; g_free(tlvs); return err; -- Gitee From 06fc5eb48668a1c83e6a4e76c1a71403917b1835 Mon Sep 17 00:00:00 2001 From: Jinhua Cao Date: Fri, 11 Feb 2022 20:33:47 +0800 Subject: [PATCH 122/939] i6300esb watchdog: bugfix: Add a runstate transition QEMU will abort() for the reasons now: invalid runstate transition: 'prelaunch' -> 'postmigrate' Aborted This happens when: |<- watchdog timeout happened, then sets reset_requested to | SHUTDOWN_CAUSE_GUEST_RESET; |<- hot-migration thread sets vm state to RUN_STATE_FINISH_MIGRATE | before the last time of migration; |<- main thread gets the change of reset_requested and triggers | reset, then sets vm state to RUN_STATE_PRELAUNCH; |<- hot-migration thread sets vm state to RUN_STATE_POSTMIGRATE. Then 'prelaunch' -> 'postmigrate' runstate transition will happen. It is legal so add this transition to runstate_transitions_def. 
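For context, the abort comes from the runstate core only permitting from/to pairs that are listed in runstate_transitions_def[]; anything else is reported as an invalid transition and the process aborts. A reduced model of that check (illustration only, not the exact QEMU code, which precomputes a lookup table):

    /* Any from/to pair missing from runstate_transitions_def[] is
     * rejected; the caller then prints "invalid runstate transition"
     * and abort()s, which is what the watchdog/migration race hits. */
    static bool runstate_transition_allowed(RunState from, RunState to)
    {
        const RunStateTransition *p;

        for (p = &runstate_transitions_def[0]; p->from != RUN_STATE__MAX; p++) {
            if (p->from == from && p->to == to) {
                return true;
            }
        }
        return false;
    }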
Signed-off-by: Jinhua Cao --- system/runstate.c | 1 + 1 file changed, 1 insertion(+) diff --git a/system/runstate.c b/system/runstate.c index ea9d6c2a32..9d3f627fee 100644 --- a/system/runstate.c +++ b/system/runstate.c @@ -116,6 +116,7 @@ static const RunStateTransition runstate_transitions_def[] = { { RUN_STATE_PRELAUNCH, RUN_STATE_RUNNING }, { RUN_STATE_PRELAUNCH, RUN_STATE_FINISH_MIGRATE }, { RUN_STATE_PRELAUNCH, RUN_STATE_INMIGRATE }, + { RUN_STATE_PRELAUNCH, RUN_STATE_POSTMIGRATE }, { RUN_STATE_FINISH_MIGRATE, RUN_STATE_RUNNING }, { RUN_STATE_FINISH_MIGRATE, RUN_STATE_PAUSED }, -- Gitee From 0154183e118169be5945cb5ebec2b79379071591 Mon Sep 17 00:00:00 2001 From: Jinhua Cao Date: Fri, 11 Feb 2022 18:49:21 +0800 Subject: [PATCH 123/939] vhost-user: Set the acked_features to vm's featrue Fix the problem when vm restart, the ovs restart and lead to the net unreachable. The soluation is set the acked_features to vm's featrue just the same as guest virtio-net mod load. Signed-off-by: Jinhua Cao --- hw/net/vhost_net.c | 58 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c index e8e1661646..1b08b02477 100644 --- a/hw/net/vhost_net.c +++ b/hw/net/vhost_net.c @@ -167,9 +167,26 @@ static int vhost_net_get_fd(NetClientState *backend) } } +static uint64_t vhost_get_mask_features(const int *feature_bits, uint64_t features) +{ + const int *bit = feature_bits; + uint64_t out_features = 0; + + while (*bit != VHOST_INVALID_FEATURE_BIT) { + uint64_t bit_mask = (1ULL << *bit); + if (features & bit_mask) { + out_features |= bit_mask; + } + bit++; + } + return out_features; +} + struct vhost_net *vhost_net_init(VhostNetOptions *options) { int r; + VirtIONet *n; + VirtIODevice *vdev; bool backend_kernel = options->backend_type == VHOST_BACKEND_TYPE_KERNEL; struct vhost_net *net = g_new0(struct vhost_net, 1); uint64_t features = 0; @@ -195,7 +212,46 @@ struct vhost_net *vhost_net_init(VhostNetOptions *options) net->backend = r; net->dev.protocol_features = 0; } else { - net->dev.backend_features = 0; + /* for ovs restart when vm start. + * Normal situation: + * 1.vm start. + * 2.vhost_net_init init ok, then dev.acked_features is 0x40000000. + * 3.guest virtio-net mod load. qemu will call virtio_net_set_features set + * dev.acked_features to 0x40408000. + * 4.feature set to ovs's vhostuser(0x40408000). + * 5.ovs restart. + * 6.vhost_user_stop will save net->dev.acked_features(0x40408000) to + * VhostUserState's acked_features(0x40408000). + * 7.restart ok. + * 8.vhost_net_init fun call vhost_user_get_acked_features get the save + * features, and set to net->dev.acked_features. + * Abnormal situation: + * 1.vm start. + * 2.vhost_net_init init ok, then dev.acked_features is 0x40000000. + * 3.ovs restart. + * 4.vhost_user_stop will save net->dev.acked_features(0x40000000) to + * VhostUserState's acked_features(0x40000000). + * 5.guest virtio-net mod load. qemu will call virtio_net_set_features set + * dev.acked_features to 0x40408000. + * 6.restart ok. + * 7.vhost_net_init fun call vhost_user_get_acked_features get the save + * features(0x40000000), and set to net->dev.acked_features(0x40000000). + * 8.feature set to ovs's vhostuser(0x40000000). + * + * in abnormal situation, qemu set the wrong features to ovs's vhostuser, + * then the vm's network will be down. 
+ * in abnormal situation, we found it just lost the guest feartures in + * acked_features, so hear we set the acked_features to vm's featrue + * just the same as guest virtio-net mod load. + */ + if (options->net_backend->peer) { + n = qemu_get_nic_opaque(options->net_backend->peer); + vdev = VIRTIO_DEVICE(n); + net->dev.backend_features = vhost_get_mask_features(vhost_net_get_feature_bits(net), + vdev->guest_features); + } else { + net->dev.backend_features = 0; + } net->dev.protocol_features = 0; net->backend = -1; -- Gitee From 0bc608ab4117818b32d2a1aaf2d4f5c2aeb54af7 Mon Sep 17 00:00:00 2001 From: Jinhua Cao Date: Fri, 11 Feb 2022 18:05:47 +0800 Subject: [PATCH 124/939] vhost-user: Add support reconnect vhost-user socket Add support reconnect vhost-user socket, the reconnect time is set to be 3 seconds. Signed-off-by: Jinhua Cao --- chardev/char-socket.c | 19 ++++++++++++++++++- hw/net/vhost_net.c | 4 +++- hw/virtio/vhost-user.c | 6 ++++++ include/chardev/char.h | 16 ++++++++++++++++ net/vhost-user.c | 3 +++ 5 files changed, 46 insertions(+), 2 deletions(-) diff --git a/chardev/char-socket.c b/chardev/char-socket.c index 034840593d..9c60e15c8e 100644 --- a/chardev/char-socket.c +++ b/chardev/char-socket.c @@ -337,6 +337,22 @@ static GSource *tcp_chr_add_watch(Chardev *chr, GIOCondition cond) return qio_channel_create_watch(s->ioc, cond); } +static void tcp_chr_set_reconnect_time(Chardev *chr, + int64_t reconnect_time) +{ + SocketChardev *s = SOCKET_CHARDEV(chr); + s->reconnect_time = reconnect_time; +} + +void qemu_chr_set_reconnect_time(Chardev *chr, int64_t reconnect_time) +{ + ChardevClass *cc = CHARDEV_GET_CLASS(chr); + + if (cc->chr_set_reconnect_time) { + cc->chr_set_reconnect_time(chr, reconnect_time); + } +} + static void remove_hup_source(SocketChardev *s) { if (s->hup_source != NULL) { @@ -537,7 +553,7 @@ static int tcp_chr_sync_read(Chardev *chr, const uint8_t *buf, int len) if (s->state != TCP_CHARDEV_STATE_DISCONNECTED) { qio_channel_set_blocking(s->ioc, false, NULL); } - if (size == 0) { + if (size == 0 && chr->chr_for_flag != CHR_FOR_VHOST_USER) { /* connection closed */ tcp_chr_disconnect(chr); } @@ -1543,6 +1559,7 @@ static void char_socket_class_init(ObjectClass *oc, void *data) cc->set_msgfds = tcp_set_msgfds; cc->chr_add_client = tcp_chr_add_client; cc->chr_add_watch = tcp_chr_add_watch; + cc->chr_set_reconnect_time = tcp_chr_set_reconnect_time; cc->chr_update_read_handler = tcp_chr_update_read_handler; object_class_property_add(oc, "addr", "SocketAddress", diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c index 1b08b02477..e48c373b14 100644 --- a/hw/net/vhost_net.c +++ b/hw/net/vhost_net.c @@ -459,7 +459,9 @@ int vhost_net_start(VirtIODevice *dev, NetClientState *ncs, peer = qemu_get_peer(ncs, n->max_queue_pairs); } - if (peer->vring_enable) { + /* ovs needs to restore all states of vring */ + if (peer->vring_enable || + ncs[i].peer->info->type == NET_CLIENT_DRIVER_VHOST_USER) { /* restore vring enable state */ r = vhost_set_vring_enable(peer, peer->vring_enable); diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index f214df804b..05e14e1eff 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -2126,9 +2126,15 @@ static int vhost_user_backend_init(struct vhost_dev *dev, void *opaque, struct vhost_user *u; VhostUserState *vus = (VhostUserState *) opaque; int err; + Chardev *chr; assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER); + chr = qemu_chr_fe_get_driver(((VhostUserState *)opaque)->chr); + if (chr) { + 
chr->chr_for_flag = CHR_FOR_VHOST_USER; + } + u = g_new0(struct vhost_user, 1); u->user = vus; u->dev = dev; diff --git a/include/chardev/char.h b/include/chardev/char.h index 01df55f9e8..f8bd469466 100644 --- a/include/chardev/char.h +++ b/include/chardev/char.h @@ -14,6 +14,8 @@ #define IAC_SB 250 #define IAC 255 +#define CHR_FOR_VHOST_USER 0x32a1 + /* character device */ typedef struct CharBackend CharBackend; @@ -70,6 +72,7 @@ struct Chardev { GSource *gsource; GMainContext *gcontext; DECLARE_BITMAP(features, QEMU_CHAR_FEATURE_LAST); + int chr_for_flag; }; /** @@ -227,6 +230,16 @@ int qemu_chr_write(Chardev *s, const uint8_t *buf, int len, bool write_all); #define qemu_chr_write_all(s, buf, len) qemu_chr_write(s, buf, len, true) int qemu_chr_wait_connected(Chardev *chr, Error **errp); +/** + * @qemu_chr_set_reconnect_time: + * + * Set reconnect time for char disconnect. + * Currently, only vhost user will call it. + * + * @reconnect_time the reconnect_time to be set + */ +void qemu_chr_set_reconnect_time(Chardev *chr, int64_t reconnect_time); + #define TYPE_CHARDEV "chardev" OBJECT_DECLARE_TYPE(Chardev, ChardevClass, CHARDEV) @@ -306,6 +319,9 @@ struct ChardevClass { /* handle various events */ void (*chr_be_event)(Chardev *s, QEMUChrEvent event); + + /* set reconnect time */ + void (*chr_set_reconnect_time)(Chardev *chr, int64_t reconnect_time); }; Chardev *qemu_chardev_new(const char *id, const char *typename, diff --git a/net/vhost-user.c b/net/vhost-user.c index 12555518e8..51fa8c678f 100644 --- a/net/vhost-user.c +++ b/net/vhost-user.c @@ -21,6 +21,8 @@ #include "qemu/option.h" #include "trace.h" +#define VHOST_USER_RECONNECT_TIME (3) + typedef struct NetVhostUserState { NetClientState nc; CharBackend chr; /* only queue index 0 */ @@ -292,6 +294,7 @@ static void net_vhost_user_event(void *opaque, QEMUChrEvent event) trace_vhost_user_event(chr->label, event); switch (event) { case CHR_EVENT_OPENED: + qemu_chr_set_reconnect_time(chr, VHOST_USER_RECONNECT_TIME); if (vhost_user_start(queues, ncs, s->vhost_user) < 0) { qemu_chr_fe_disconnect(&s->chr); return; -- Gitee From 97335ac382e36db18a61d3891f1fafd15475822e Mon Sep 17 00:00:00 2001 From: caojinhuahw Date: Mon, 19 Dec 2022 12:35:50 +0000 Subject: [PATCH 125/939] fix qemu-core when vhost-user-net config with server mode commit 3a223111d7 set default reconnect for vhost-user-net device, if vhost-user-net config with server mode will casuse the core when ovs client stop. tcp_chr_disconnect ---> set tcp_char state disconnect tcp_chr start reconnect ---> set tcp_char state connecting tcp_char is listen ---> call tcp_chr_accept() fun tcp_char_accept() set tcp_char state to connecting, but current tcp_char state already is connecting, assert failed in tcp_char_change_state() raise qemu core assert(s->state == TCP_CHARDEV_STATE_DISCONNECTED) this commit check tcp_char mode, if tcp_char config with server mode, dont set reconnect time for tcp_chr. 
fix: 3a223111d7 vhost-user: Add support reconnect vhost-user socket Signed-off-by: caojinhuahw --- chardev/char-socket.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/chardev/char-socket.c b/chardev/char-socket.c index 9c60e15c8e..0c9ab069ae 100644 --- a/chardev/char-socket.c +++ b/chardev/char-socket.c @@ -347,6 +347,12 @@ static void tcp_chr_set_reconnect_time(Chardev *chr, void qemu_chr_set_reconnect_time(Chardev *chr, int64_t reconnect_time) { ChardevClass *cc = CHARDEV_GET_CLASS(chr); + SocketChardev *s = SOCKET_CHARDEV(chr); + + /* if sock dev is listen, dont set reconnect time */ + if (s->is_listen) { + return; + } if (cc->chr_set_reconnect_time) { cc->chr_set_reconnect_time(chr, reconnect_time); -- Gitee From 90d4333d4bbde45a10892bf9004979d239d39e28 Mon Sep 17 00:00:00 2001 From: Jinhua Cao Date: Fri, 11 Feb 2022 19:24:30 +0800 Subject: [PATCH 126/939] vhost-user: quit infinite loop while used memslots is more than the backend limit When used memslots is more than the backend limit, the vhost-user netcard would attach fail and quit infinite loop. Signed-off-by: Jinhua Cao --- hw/virtio/vhost.c | 10 ++++++++++ include/hw/virtio/vhost.h | 1 + net/vhost-user.c | 5 +++++ 3 files changed, 16 insertions(+) diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index a8adc149ad..038ac37dd0 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -56,6 +56,8 @@ static unsigned int used_shared_memslots; static QLIST_HEAD(, vhost_dev) vhost_devices = QLIST_HEAD_INITIALIZER(vhost_devices); +bool used_memslots_exceeded; + unsigned int vhost_get_max_memslots(void) { unsigned int max = UINT_MAX; @@ -1569,8 +1571,11 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque, error_setg(errp, "vhost backend memory slots limit (%d) is less" " than current number of used (%d) and reserved (%d)" " memory slots for memory devices.", limit, used, reserved); + used_memslots_exceeded = true; r = -EINVAL; goto fail_busyloop; + } else { + used_memslots_exceeded = false; } return 0; @@ -2405,3 +2410,8 @@ fail: return ret; } + +bool used_memslots_is_exceeded(void) +{ + return used_memslots_exceeded; +} diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h index 02477788df..444ca0ad42 100644 --- a/include/hw/virtio/vhost.h +++ b/include/hw/virtio/vhost.h @@ -340,6 +340,7 @@ int vhost_dev_set_inflight(struct vhost_dev *dev, struct vhost_inflight *inflight); int vhost_dev_get_inflight(struct vhost_dev *dev, uint16_t queue_size, struct vhost_inflight *inflight); +bool used_memslots_is_exceeded(void); bool vhost_dev_has_iommu(struct vhost_dev *dev); #ifdef CONFIG_VHOST diff --git a/net/vhost-user.c b/net/vhost-user.c index 51fa8c678f..86fd5056ab 100644 --- a/net/vhost-user.c +++ b/net/vhost-user.c @@ -20,6 +20,7 @@ #include "qemu/error-report.h" #include "qemu/option.h" #include "trace.h" +#include "include/hw/virtio/vhost.h" #define VHOST_USER_RECONNECT_TIME (3) @@ -373,6 +374,10 @@ static int net_vhost_user_init(NetClientState *peer, const char *device, qemu_chr_fe_set_handlers(&s->chr, NULL, NULL, net_vhost_user_event, NULL, nc0->name, NULL, true); + if (used_memslots_is_exceeded()) { + error_report("used memslots exceeded the backend limit, quit loop"); + goto err; + } } while (!s->started); assert(s->vhost_net); -- Gitee From 12cf5e9ece9cb0825f14ca80f6b1c5d1eb95c3e5 Mon Sep 17 00:00:00 2001 From: Jinhua Cao Date: Fri, 11 Feb 2022 18:59:34 +0800 Subject: [PATCH 127/939] vhost-user: add vhost_set_mem_table when vm load_setup at destination When migrate huge vm, packages lost are 90+. 
During the load_setup of the destination vm, pass the vm mem structure to ovs, the netcard could be enabled when the migration finish state shifting. Signed-off-by: Jinhua Cao --- hw/virtio/vhost-user.c | 24 ++++++++++++++++++++++++ tests/qtest/vhost-user-test.c | 35 ++++++++++++++++++----------------- 2 files changed, 42 insertions(+), 17 deletions(-) diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index f214df804b..6739dfc98e 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -28,6 +28,7 @@ #include "sysemu/cryptodev.h" #include "migration/migration.h" #include "migration/postcopy-ram.h" +#include "migration/register.h" #include "trace.h" #include "exec/ramblock.h" @@ -2119,6 +2120,28 @@ static int vhost_user_postcopy_notifier(NotifierWithReturn *notifier, return 0; } +static int vhost_user_load_setup(QEMUFile *f, void *opaque) +{ + struct vhost_dev *hdev = opaque; + int r; + + if (hdev->vhost_ops && hdev->vhost_ops->vhost_set_mem_table) { + r = hdev->vhost_ops->vhost_set_mem_table(hdev, hdev->mem); + if (r < 0) { + qemu_log("error: vhost_set_mem_table failed: %s(%d)\n", + strerror(errno), errno); + return r; + } else { + qemu_log("info: vhost_set_mem_table OK\n"); + } + } + return 0; +} + +SaveVMHandlers savevm_vhost_user_handlers = { + .load_setup = vhost_user_load_setup, +}; + static int vhost_user_backend_init(struct vhost_dev *dev, void *opaque, Error **errp) { @@ -2255,6 +2278,7 @@ static int vhost_user_backend_init(struct vhost_dev *dev, void *opaque, u->postcopy_notifier.notify = vhost_user_postcopy_notifier; postcopy_add_notifier(&u->postcopy_notifier); + register_savevm_live("vhost-user", -1, 1, &savevm_vhost_user_handlers, dev); return 0; } diff --git a/tests/qtest/vhost-user-test.c b/tests/qtest/vhost-user-test.c index d4e437265f..fadf3f0f2e 100644 --- a/tests/qtest/vhost-user-test.c +++ b/tests/qtest/vhost-user-test.c @@ -799,6 +799,23 @@ static void test_read_guest_mem(void *obj, void *arg, QGuestAllocator *alloc) read_guest_mem_server(global_qtest, server); } +static void wait_for_rings_started(TestServer *s, size_t count) +{ + gint64 end_time; + + g_mutex_lock(&s->data_mutex); + end_time = g_get_monotonic_time() + 5 * G_TIME_SPAN_SECOND; + while (ctpop64(s->rings) != count) { + if (!g_cond_wait_until(&s->data_cond, &s->data_mutex, end_time)) { + /* timeout has passed */ + g_assert_cmpint(ctpop64(s->rings), ==, count); + break; + } + } + + g_mutex_unlock(&s->data_mutex); +} + static void test_migrate(void *obj, void *arg, QGuestAllocator *alloc) { TestServer *s = arg; @@ -869,6 +886,7 @@ static void test_migrate(void *obj, void *arg, QGuestAllocator *alloc) qtest_qmp_eventwait(to, "RESUME"); g_assert(wait_for_fds(dest)); + wait_for_rings_started(dest, 2); read_guest_mem_server(to, dest); g_source_destroy(source); @@ -880,23 +898,6 @@ static void test_migrate(void *obj, void *arg, QGuestAllocator *alloc) g_string_free(dest_cmdline, true); } -static void wait_for_rings_started(TestServer *s, size_t count) -{ - gint64 end_time; - - g_mutex_lock(&s->data_mutex); - end_time = g_get_monotonic_time() + 5 * G_TIME_SPAN_SECOND; - while (ctpop64(s->rings) != count) { - if (!g_cond_wait_until(&s->data_cond, &s->data_mutex, end_time)) { - /* timeout has passed */ - g_assert_cmpint(ctpop64(s->rings), ==, count); - break; - } - } - - g_mutex_unlock(&s->data_mutex); -} - static inline void test_server_connect(TestServer *server) { test_server_create_chr(server, ",reconnect=1"); -- Gitee From c65ff10063a6c599b88cba27fd70a72e2e0cc0ff Mon Sep 17 00:00:00 2001 
From: Jinhua Cao Date: Thu, 10 Feb 2022 20:21:33 +0800 Subject: [PATCH 128/939] vhost-user: add unregister_savevm when vhost-user cleanup commit 12cf5e9ece ("vhost-user: add vhost_set_mem_table when vm load_setup at destination") only register savevm handler but not unregister it, which will cause the number of handers increase when vhost-user devices hotplug, so this commit add unregister_savevm when vhost-user cleanup. Fixes: 12cf5e9ece ("vhost-user: add vhost_set_mem_table when vm load_setup at destination") Signed-off-by: Jinhua Cao --- hw/virtio/vhost-user.c | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index 6739dfc98e..e589ee3572 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -2310,6 +2310,7 @@ static int vhost_user_backend_cleanup(struct vhost_dev *dev) u->region_rb_len = 0; g_free(u); dev->opaque = 0; + unregister_savevm(NULL, "vhost-user", dev); return 0; } -- Gitee From a344d8636168ba5f034a908d3394ef88d36133dd Mon Sep 17 00:00:00 2001 From: Yan Wang Date: Thu, 10 Feb 2022 11:18:13 +0800 Subject: [PATCH 129/939] monitor: Discard BLOCK_IO_ERROR event when VM rebooted Throttled event like QAPI_EVENT_BLOCK_IO_ERROR may be queued to limit event rate. Event may be delivered when VM is rebooted if the event was queued in the *monitor_qapi_event_state* hash table. Which may casue VM pause and other related problems. Such as seabios blocked during virtio-scsi initialization: vring_add_buf(vq, sg, out_num, in_num, 0, 0); vring_kick(vp, vq, 1); ------------> VM paused here <----------- /* Wait for reply */ while (!vring_more_used(vq)) usleep(5); Signed-off-by: Yan Wang --- include/monitor/monitor.h | 2 ++ monitor/monitor.c | 29 +++++++++++++++++++++++++++++ system/runstate.c | 1 + 3 files changed, 32 insertions(+) diff --git a/include/monitor/monitor.h b/include/monitor/monitor.h index 965f5d5450..60079086a8 100644 --- a/include/monitor/monitor.h +++ b/include/monitor/monitor.h @@ -63,4 +63,6 @@ void monitor_register_hmp_info_hrt(const char *name, int error_vprintf_unless_qmp(const char *fmt, va_list ap) G_GNUC_PRINTF(1, 0); int error_printf_unless_qmp(const char *fmt, ...) G_GNUC_PRINTF(1, 2); +void monitor_qapi_event_discard_io_error(void); + #endif /* MONITOR_H */ diff --git a/monitor/monitor.c b/monitor/monitor.c index e540c1334a..8d59a76612 100644 --- a/monitor/monitor.c +++ b/monitor/monitor.c @@ -34,6 +34,8 @@ #include "qemu/option.h" #include "sysemu/qtest.h" #include "trace.h" +#include "qemu/log.h" +#include "qapi/qmp/qobject.h" /* * To prevent flooding clients, events can be throttled. 
The @@ -787,6 +789,33 @@ int monitor_init_opts(QemuOpts *opts, Error **errp) return ret; } +void monitor_qapi_event_discard_io_error(void) +{ + GHashTableIter event_iter; + MonitorQAPIEventState *evstate; + gpointer key, value; + GString *json; + + qemu_mutex_lock(&monitor_lock); + g_hash_table_iter_init(&event_iter, monitor_qapi_event_state); + while (g_hash_table_iter_next(&event_iter, &key, &value)) { + evstate = key; + /* Only QAPI_EVENT_BLOCK_IO_ERROR is discarded */ + if (evstate->event == QAPI_EVENT_BLOCK_IO_ERROR) { + g_hash_table_iter_remove(&event_iter); + json = qobject_to_json(QOBJECT(evstate->qdict)); + qemu_log(" %s event discarded\n", json->str); + timer_del(evstate->timer); + timer_free(evstate->timer); + qobject_unref(evstate->data); + qobject_unref(evstate->qdict); + g_string_free(json, true); + g_free(evstate); + } + } + qemu_mutex_unlock(&monitor_lock); +} + QemuOptsList qemu_mon_opts = { .name = "mon", .implied_opt_name = "chardev", diff --git a/system/runstate.c b/system/runstate.c index 9d3f627fee..62e6db8d42 100644 --- a/system/runstate.c +++ b/system/runstate.c @@ -503,6 +503,7 @@ void qemu_system_reset(ShutdownCause reason) qapi_event_send_reset(shutdown_caused_by_guest(reason), reason); } cpu_synchronize_all_post_reset(); + monitor_qapi_event_discard_io_error(); } /* -- Gitee From 3cd74fd83d58aa88f9a006980c73844d6b79d1fb Mon Sep 17 00:00:00 2001 From: Jinhua Cao Date: Thu, 10 Feb 2022 10:31:38 +0800 Subject: [PATCH 130/939] virtio-net: bugfix: do not delete netdev before virtio net For the vhost-user net-card, it is allow to delete its network backend while the virtio-net device still exists. However, when the status of the device changes in guest, QEMU will check whether the network backend exists, otherwise it will crash. So do not allowed to delete the network backend directly without delete virtio-net device. Signed-off-by: Jinhua Cao --- net/net.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/net.c b/net/net.c index 0520bc1681..bcd3d7e04c 100644 --- a/net/net.c +++ b/net/net.c @@ -1322,6 +1322,12 @@ void qmp_netdev_del(const char *id, Error **errp) return; } + if (nc->info->type == NET_CLIENT_DRIVER_VHOST_USER && nc->peer) { + error_setg(errp, "Device '%s' is a netdev for vhostuser," + "please delete the peer front-end device (virtio-net) first.", id); + return; + } + qemu_del_net_client(nc); /* -- Gitee From 4321c9f8b85c6a4c1549399aa11e351b66bd1879 Mon Sep 17 00:00:00 2001 From: Jinhua Cao Date: Thu, 10 Feb 2022 10:48:27 +0800 Subject: [PATCH 131/939] virtio-net: fix max vring buf size when set ring num Set the max vring buf size of virtio-net devices to 4096 Signed-off-by: Jinhua Cao --- hw/virtio/virtio.c | 9 +++++++-- include/hw/virtio/virtio.h | 1 + 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index d93ea62723..267c1e6fd0 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -2196,12 +2196,17 @@ void virtio_queue_set_rings(VirtIODevice *vdev, int n, hwaddr desc, void virtio_queue_set_num(VirtIODevice *vdev, int n, int num) { + int vq_max_size = VIRTQUEUE_MAX_SIZE; + + if (!strcmp(vdev->name, "virtio-net")) { + vq_max_size = VIRTIO_NET_VQ_MAX_SIZE; + } + /* Don't allow guest to flip queue between existent and * nonexistent states, or to set it to an invalid size. 
*/ if (!!num != !!vdev->vq[n].vring.num || - num > VIRTQUEUE_MAX_SIZE || - num < 0) { + num > vq_max_size || num < 0) { return; } vdev->vq[n].vring.num = num; diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h index 7c35bb841b..e612441357 100644 --- a/include/hw/virtio/virtio.h +++ b/include/hw/virtio/virtio.h @@ -60,6 +60,7 @@ size_t virtio_get_config_size(const VirtIOConfigSizeParams *params, typedef struct VirtQueue VirtQueue; #define VIRTQUEUE_MAX_SIZE 1024 +#define VIRTIO_NET_VQ_MAX_SIZE (4096) typedef struct VirtQueueElement { -- Gitee From 58fe483bf5824db177843675629ed955051078fd Mon Sep 17 00:00:00 2001 From: Jinhua Cao Date: Sat, 12 Feb 2022 17:22:38 +0800 Subject: [PATCH 132/939] virtio-net: set the max of queue size to 4096 set the max of virtio-net queue size to 4096. Now the queue_size of virtio-net is set by rx_queue_size and tx_queue_size Signed-off-by: Jinhua Cao --- hw/net/virtio-net.c | 5 +++-- hw/virtio/virtio.c | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index 7f69a4b842..0ae2ddc002 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -710,6 +710,7 @@ static int virtio_net_max_tx_queue_size(VirtIONet *n) switch(peer->info->type) { case NET_CLIENT_DRIVER_VHOST_USER: + return VIRTIO_NET_VQ_MAX_SIZE; case NET_CLIENT_DRIVER_VHOST_VDPA: return VIRTQUEUE_MAX_SIZE; default: @@ -3638,12 +3639,12 @@ static void virtio_net_device_realize(DeviceState *dev, Error **errp) * help from us (using virtio 1 and up). */ if (n->net_conf.rx_queue_size < VIRTIO_NET_RX_QUEUE_MIN_SIZE || - n->net_conf.rx_queue_size > VIRTQUEUE_MAX_SIZE || + n->net_conf.rx_queue_size > VIRTIO_NET_VQ_MAX_SIZE || !is_power_of_2(n->net_conf.rx_queue_size)) { error_setg(errp, "Invalid rx_queue_size (= %" PRIu16 "), " "must be a power of 2 between %d and %d.", n->net_conf.rx_queue_size, VIRTIO_NET_RX_QUEUE_MIN_SIZE, - VIRTQUEUE_MAX_SIZE); + VIRTIO_NET_VQ_MAX_SIZE); virtio_cleanup(vdev); return; } diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 267c1e6fd0..d00effe4d5 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -2338,7 +2338,7 @@ VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size, break; } - if (i == VIRTIO_QUEUE_MAX || queue_size > VIRTQUEUE_MAX_SIZE) { + if (i == VIRTIO_QUEUE_MAX) { qemu_log("unacceptable queue_size (%d) or num (%d)\n", queue_size, i); abort(); -- Gitee From c2221815b79be9847c4729709809779b4b0550a7 Mon Sep 17 00:00:00 2001 From: Jinhua Cao Date: Thu, 10 Feb 2022 17:28:49 +0800 Subject: [PATCH 133/939] virtio-net: update the default and max of rx/tx_queue_size Set the max of tx_queue_size to 4096 even if the backends are not vhost-user. Set the default of rx/tx_queue_size to 2048 if the backends are vhost-user, otherwise to 4096. 
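For illustration only (not part of this change), the default-size policy can be written as a small standalone C sketch. The constant values mirror the ones used in the diff; the helper name pick_default_queue_size is invented for the example.

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define VHOST_USER_DEFAULT_SIZE 2048   /* default when the peer is vhost-user */
    #define VQ_MAX_SIZE             4096   /* default (and ceiling) otherwise */

    /* Return the queue size to use when the property was left at 0. */
    static uint16_t pick_default_queue_size(uint16_t requested, bool vhost_user_peer)
    {
        if (requested != 0) {
            return requested;              /* an explicit user setting wins */
        }
        return vhost_user_peer ? VHOST_USER_DEFAULT_SIZE : VQ_MAX_SIZE;
    }

    int main(void)
    {
        assert(pick_default_queue_size(0, true) == 2048);
        assert(pick_default_queue_size(0, false) == 4096);
        assert(pick_default_queue_size(256, true) == 256);
        printf("queue size policy ok\n");
        return 0;
    }
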
Signed-off-by: Jinhua Cao --- hw/net/virtio-net.c | 43 ++++++++++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 11 deletions(-) diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index 0ae2ddc002..523d01746d 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -50,12 +50,11 @@ #define VIRTIO_NET_VM_VERSION 11 /* previously fixed value */ -#define VIRTIO_NET_RX_QUEUE_DEFAULT_SIZE 256 -#define VIRTIO_NET_TX_QUEUE_DEFAULT_SIZE 256 +#define VIRTIO_NET_VHOST_USER_DEFAULT_SIZE 2048 /* for now, only allow larger queue_pairs; with virtio-1, guest can downsize */ -#define VIRTIO_NET_RX_QUEUE_MIN_SIZE VIRTIO_NET_RX_QUEUE_DEFAULT_SIZE -#define VIRTIO_NET_TX_QUEUE_MIN_SIZE VIRTIO_NET_TX_QUEUE_DEFAULT_SIZE +#define VIRTIO_NET_RX_QUEUE_MIN_SIZE 256 +#define VIRTIO_NET_TX_QUEUE_MIN_SIZE 256 #define VIRTIO_NET_IP4_ADDR_SIZE 8 /* ipv4 saddr + daddr */ @@ -696,6 +695,28 @@ static void virtio_net_set_mrg_rx_bufs(VirtIONet *n, int mergeable_rx_bufs, } } +static void virtio_net_set_default_queue_size(VirtIONet *n) +{ + NetClientState *peer = n->nic_conf.peers.ncs[0]; + + /* Default value is 0 if not set */ + if (n->net_conf.rx_queue_size == 0) { + if (peer && peer->info->type == NET_CLIENT_DRIVER_VHOST_USER) { + n->net_conf.rx_queue_size = VIRTIO_NET_VHOST_USER_DEFAULT_SIZE; + } else { + n->net_conf.rx_queue_size = VIRTIO_NET_VQ_MAX_SIZE; + } + } + + if (n->net_conf.tx_queue_size == 0) { + if (peer && peer->info->type == NET_CLIENT_DRIVER_VHOST_USER) { + n->net_conf.tx_queue_size = VIRTIO_NET_VHOST_USER_DEFAULT_SIZE; + } else { + n->net_conf.tx_queue_size = VIRTIO_NET_VQ_MAX_SIZE; + } + } +} + static int virtio_net_max_tx_queue_size(VirtIONet *n) { NetClientState *peer = n->nic_conf.peers.ncs[0]; @@ -705,16 +726,16 @@ static int virtio_net_max_tx_queue_size(VirtIONet *n) * size. */ if (!peer) { - return VIRTIO_NET_TX_QUEUE_DEFAULT_SIZE; + return VIRTIO_NET_VQ_MAX_SIZE; } switch(peer->info->type) { case NET_CLIENT_DRIVER_VHOST_USER: return VIRTIO_NET_VQ_MAX_SIZE; case NET_CLIENT_DRIVER_VHOST_VDPA: - return VIRTQUEUE_MAX_SIZE; + return VIRTIO_NET_VQ_MAX_SIZE; default: - return VIRTIO_NET_TX_QUEUE_DEFAULT_SIZE; + return VIRTIO_NET_VQ_MAX_SIZE; }; } @@ -3633,6 +3654,8 @@ static void virtio_net_device_realize(DeviceState *dev, Error **errp) virtio_net_set_config_size(n, n->host_features); virtio_init(vdev, VIRTIO_ID_NET, n->config_size); + virtio_net_set_default_queue_size(n); + /* * We set a lower limit on RX queue size to what it always was. 
* Guests that want a smaller ring can always resize it without @@ -3934,10 +3957,8 @@ static Property virtio_net_properties[] = { TX_TIMER_INTERVAL), DEFINE_PROP_INT32("x-txburst", VirtIONet, net_conf.txburst, TX_BURST), DEFINE_PROP_STRING("tx", VirtIONet, net_conf.tx), - DEFINE_PROP_UINT16("rx_queue_size", VirtIONet, net_conf.rx_queue_size, - VIRTIO_NET_RX_QUEUE_DEFAULT_SIZE), - DEFINE_PROP_UINT16("tx_queue_size", VirtIONet, net_conf.tx_queue_size, - VIRTIO_NET_TX_QUEUE_DEFAULT_SIZE), + DEFINE_PROP_UINT16("rx_queue_size", VirtIONet, net_conf.rx_queue_size, 0), + DEFINE_PROP_UINT16("tx_queue_size", VirtIONet, net_conf.tx_queue_size, 0), DEFINE_PROP_UINT16("host_mtu", VirtIONet, net_conf.mtu, 0), DEFINE_PROP_BOOL("x-mtu-bypass-backend", VirtIONet, mtu_bypass_backend, true), -- Gitee From dc7e40b2841132b0bc43d25c2c31f41ae3fa2c68 Mon Sep 17 00:00:00 2001 From: eillon Date: Tue, 8 Feb 2022 22:43:59 -0500 Subject: [PATCH 134/939] hw/usb: reduce the vpcu cost of UHCI when VNC disconnect Reduce the vpcu cost by set a lower FRAME_TIMER_FREQ of the UHCI when VNC client disconnected. This can reduce about 3% cost of vcpu thread. Signed-off-by: eillon --- hw/usb/core.c | 5 ++-- hw/usb/desc.c | 7 +++-- hw/usb/dev-hid.c | 2 +- hw/usb/hcd-uhci.c | 63 ++++++++++++++++++++++++++++++++++------ hw/usb/hcd-uhci.h | 1 + hw/usb/host-libusb.c | 32 ++++++++++++++++++++ include/hw/usb.h | 1 + include/qemu/timer.h | 28 ++++++++++++++++++ ui/vnc.c | 4 +++ util/qemu-timer.c | 69 ++++++++++++++++++++++++++++++++++++++++++++ 10 files changed, 197 insertions(+), 15 deletions(-) diff --git a/hw/usb/core.c b/hw/usb/core.c index 975f76250a..51b36126ca 100644 --- a/hw/usb/core.c +++ b/hw/usb/core.c @@ -87,7 +87,7 @@ void usb_device_reset(USBDevice *dev) return; } usb_device_handle_reset(dev); - dev->remote_wakeup = 0; + dev->remote_wakeup &= ~USB_DEVICE_REMOTE_WAKEUP; dev->addr = 0; dev->state = USB_STATE_DEFAULT; } @@ -105,7 +105,8 @@ void usb_wakeup(USBEndpoint *ep, unsigned int stream) */ return; } - if (dev->remote_wakeup && dev->port && dev->port->ops->wakeup) { + if ((dev->remote_wakeup & USB_DEVICE_REMOTE_WAKEUP) + && dev->port && dev->port->ops->wakeup) { dev->port->ops->wakeup(dev->port); } if (bus->ops->wakeup_endpoint) { diff --git a/hw/usb/desc.c b/hw/usb/desc.c index f2bdc05a95..333f73fff1 100644 --- a/hw/usb/desc.c +++ b/hw/usb/desc.c @@ -752,7 +752,7 @@ int usb_desc_handle_control(USBDevice *dev, USBPacket *p, if (config->bmAttributes & USB_CFG_ATT_SELFPOWER) { data[0] |= 1 << USB_DEVICE_SELF_POWERED; } - if (dev->remote_wakeup) { + if (dev->remote_wakeup & USB_DEVICE_REMOTE_WAKEUP) { data[0] |= 1 << USB_DEVICE_REMOTE_WAKEUP; } data[1] = 0x00; @@ -762,14 +762,15 @@ int usb_desc_handle_control(USBDevice *dev, USBPacket *p, } case DeviceOutRequest | USB_REQ_CLEAR_FEATURE: if (value == USB_DEVICE_REMOTE_WAKEUP) { - dev->remote_wakeup = 0; + dev->remote_wakeup &= ~USB_DEVICE_REMOTE_WAKEUP; ret = 0; } trace_usb_clear_device_feature(dev->addr, value, ret); break; case DeviceOutRequest | USB_REQ_SET_FEATURE: + dev->remote_wakeup |= USB_DEVICE_REMOTE_WAKEUP_IS_SUPPORTED; if (value == USB_DEVICE_REMOTE_WAKEUP) { - dev->remote_wakeup = 1; + dev->remote_wakeup |= USB_DEVICE_REMOTE_WAKEUP; ret = 0; } trace_usb_set_device_feature(dev->addr, value, ret); diff --git a/hw/usb/dev-hid.c b/hw/usb/dev-hid.c index bdd6d1ffaf..cc68d1ce9e 100644 --- a/hw/usb/dev-hid.c +++ b/hw/usb/dev-hid.c @@ -745,7 +745,7 @@ static int usb_ptr_post_load(void *opaque, int version_id) { USBHIDState *s = opaque; - if (s->dev.remote_wakeup) { + if 
(s->dev.remote_wakeup & USB_DEVICE_REMOTE_WAKEUP) { hid_pointer_activate(&s->hid); } return 0; diff --git a/hw/usb/hcd-uhci.c b/hw/usb/hcd-uhci.c index 6975966c3f..a92581ff5f 100644 --- a/hw/usb/hcd-uhci.c +++ b/hw/usb/hcd-uhci.c @@ -44,6 +44,8 @@ #include "hcd-uhci.h" #define FRAME_TIMER_FREQ 1000 +#define FRAME_TIMER_FREQ_LAZY 10 +#define USB_DEVICE_NEED_NORMAL_FREQ "QEMU USB Tablet" #define FRAME_MAX_LOOPS 256 @@ -109,6 +111,22 @@ static void uhci_async_cancel(UHCIAsync *async); static void uhci_queue_fill(UHCIQueue *q, UHCI_TD *td); static void uhci_resume(void *opaque); +static int64_t uhci_frame_timer_freq = FRAME_TIMER_FREQ_LAZY; + +static void uhci_set_frame_freq(int freq) +{ + if (freq <= 0) { + return; + } + + uhci_frame_timer_freq = freq; +} + +static qemu_usb_controller qemu_uhci = { + .name = "uhci", + .qemu_set_freq = uhci_set_frame_freq, +}; + static inline int32_t uhci_queue_token(UHCI_TD *td) { if ((td->token & (0xf << 15)) == 0) { @@ -351,7 +369,7 @@ static int uhci_post_load(void *opaque, int version_id) if (version_id < 2) { s->expire_time = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + - (NANOSECONDS_PER_SECOND / FRAME_TIMER_FREQ); + (NANOSECONDS_PER_SECOND / uhci_frame_timer_freq); } return 0; } @@ -392,8 +410,29 @@ static void uhci_port_write(void *opaque, hwaddr addr, if ((val & UHCI_CMD_RS) && !(s->cmd & UHCI_CMD_RS)) { /* start frame processing */ trace_usb_uhci_schedule_start(); - s->expire_time = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + - (NANOSECONDS_PER_SECOND / FRAME_TIMER_FREQ); + + /* + * If the frequency of frame_timer is too slow, Guest OS (Win2012) would become + * blue-screen after hotplugging some vcpus. + * If this USB device support the remote-wakeup, the UHCI controller + * will enter global suspend mode when there is no input for several seconds. + * In this case, Qemu will delete the frame_timer. Since the frame_timer has been deleted, + * there is no influence to the performance of Vms. So, we can change the frequency to 1000. + * After that the frequency will be safe when we trigger the frame_timer again. + * Excepting this, there are two ways to change the frequency: + * 1)VNC connect/disconnect;2)attach/detach USB device. 
+ */ + if ((uhci_frame_timer_freq != FRAME_TIMER_FREQ) + && (s->ports[0].port.dev) + && (!memcmp(s->ports[0].port.dev->product_desc, + USB_DEVICE_NEED_NORMAL_FREQ, strlen(USB_DEVICE_NEED_NORMAL_FREQ))) + && (s->ports[0].port.dev->remote_wakeup & USB_DEVICE_REMOTE_WAKEUP_IS_SUPPORTED)) { + qemu_log("turn up the frequency of UHCI controller to %d\n", FRAME_TIMER_FREQ); + uhci_frame_timer_freq = FRAME_TIMER_FREQ; + } + + s->frame_time = NANOSECONDS_PER_SECOND / FRAME_TIMER_FREQ; + s->expire_time = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + s->frame_time; timer_mod(s->frame_timer, s->expire_time); s->status &= ~UHCI_STS_HCHALTED; } else if (!(val & UHCI_CMD_RS)) { @@ -1083,7 +1122,6 @@ static void uhci_frame_timer(void *opaque) UHCIState *s = opaque; uint64_t t_now, t_last_run; int i, frames; - const uint64_t frame_t = NANOSECONDS_PER_SECOND / FRAME_TIMER_FREQ; s->completions_only = false; qemu_bh_cancel(s->bh); @@ -1099,14 +1137,14 @@ static void uhci_frame_timer(void *opaque) } /* We still store expire_time in our state, for migration */ - t_last_run = s->expire_time - frame_t; + t_last_run = s->expire_time - s->frame_time; t_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); /* Process up to MAX_FRAMES_PER_TICK frames */ - frames = (t_now - t_last_run) / frame_t; + frames = (t_now - t_last_run) / s->frame_time; if (frames > s->maxframes) { int skipped = frames - s->maxframes; - s->expire_time += skipped * frame_t; + s->expire_time += skipped * s->frame_time; s->frnum = (s->frnum + skipped) & 0x7ff; frames -= skipped; } @@ -1123,7 +1161,7 @@ static void uhci_frame_timer(void *opaque) /* The spec says frnum is the frame currently being processed, and * the guest must look at frnum - 1 on interrupt, so inc frnum now */ s->frnum = (s->frnum + 1) & 0x7ff; - s->expire_time += frame_t; + s->expire_time += s->frame_time; } /* Complete the previous frame(s) */ @@ -1134,7 +1172,12 @@ static void uhci_frame_timer(void *opaque) } s->pending_int_mask = 0; - timer_mod(s->frame_timer, t_now + frame_t); + /* expire_time is calculated from last frame_time, we should calculate it + * according to new frame_time which equals to + * NANOSECONDS_PER_SECOND / uhci_frame_timer_freq */ + s->expire_time -= s->frame_time - NANOSECONDS_PER_SECOND / uhci_frame_timer_freq; + s->frame_time = NANOSECONDS_PER_SECOND / uhci_frame_timer_freq; + timer_mod(s->frame_timer, t_now + s->frame_time); } static const MemoryRegionOps uhci_ioport_ops = { @@ -1195,8 +1238,10 @@ void usb_uhci_common_realize(PCIDevice *dev, Error **errp) s->bh = qemu_bh_new_guarded(uhci_bh, s, &DEVICE(dev)->mem_reentrancy_guard); s->frame_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, uhci_frame_timer, s); s->num_ports_vmstate = NB_PORTS; + s->frame_time = NANOSECONDS_PER_SECOND / uhci_frame_timer_freq; QTAILQ_INIT(&s->queues); + qemu_register_usb_controller(&qemu_uhci, QEMU_USB_CONTROLLER_UHCI); memory_region_init_io(&s->io_bar, OBJECT(s), &uhci_ioport_ops, s, "uhci", 0x20); diff --git a/hw/usb/hcd-uhci.h b/hw/usb/hcd-uhci.h index 69f8b40c49..0918719911 100644 --- a/hw/usb/hcd-uhci.h +++ b/hw/usb/hcd-uhci.h @@ -50,6 +50,7 @@ typedef struct UHCIState { uint16_t status; uint16_t intr; /* interrupt enable register */ uint16_t frnum; /* frame number */ + uint64_t frame_time; /* frame time in ns */ uint32_t fl_base_addr; /* frame list base address */ uint8_t sof_timing; uint8_t status2; /* bit 0 and 1 are used to generate UHCI_STS_USBINT */ diff --git a/hw/usb/host-libusb.c b/hw/usb/host-libusb.c index d7060a42d5..dba469c1ef 100644 --- a/hw/usb/host-libusb.c +++ 
b/hw/usb/host-libusb.c @@ -945,6 +945,30 @@ static void usb_host_ep_update(USBHostDevice *s) libusb_free_config_descriptor(conf); } +static unsigned int usb_get_controller_type(int speed) +{ + unsigned int type = MAX_USB_CONTROLLER_TYPES; + + switch (speed) { + case USB_SPEED_SUPER: + type = QEMU_USB_CONTROLLER_XHCI; + break; + case USB_SPEED_HIGH: + type = QEMU_USB_CONTROLLER_EHCI; + break; + case USB_SPEED_FULL: + type = QEMU_USB_CONTROLLER_UHCI; + break; + case USB_SPEED_LOW: + type = QEMU_USB_CONTROLLER_OHCI; + break; + default: + break; + } + + return type; +} + static int usb_host_open(USBHostDevice *s, libusb_device *dev, int hostfd) { USBDevice *udev = USB_DEVICE(s); @@ -1054,6 +1078,12 @@ static int usb_host_open(USBHostDevice *s, libusb_device *dev, int hostfd) } trace_usb_host_open_success(bus_num, addr); + + /* change ehci frame time freq when USB passthrough */ + qemu_log("usb host speed is %d\n", udev->speed); + qemu_timer_set_mode(QEMU_TIMER_USB_NORMAL_MODE, + usb_get_controller_type(udev->speed)); + return 0; fail: @@ -1129,6 +1159,8 @@ static int usb_host_close(USBHostDevice *s) } usb_host_auto_check(NULL); + qemu_timer_set_mode(QEMU_TIMER_USB_LAZY_MODE, + usb_get_controller_type(udev->speed)); return 0; } diff --git a/include/hw/usb.h b/include/hw/usb.h index 32c23a5ca2..911179158d 100644 --- a/include/hw/usb.h +++ b/include/hw/usb.h @@ -142,6 +142,7 @@ #define USB_DEVICE_SELF_POWERED 0 #define USB_DEVICE_REMOTE_WAKEUP 1 +#define USB_DEVICE_REMOTE_WAKEUP_IS_SUPPORTED 2 #define USB_DT_DEVICE 0x01 #define USB_DT_CONFIG 0x02 diff --git a/include/qemu/timer.h b/include/qemu/timer.h index 9a366e551f..475c2a3f18 100644 --- a/include/qemu/timer.h +++ b/include/qemu/timer.h @@ -91,6 +91,34 @@ struct QEMUTimer { int scale; }; +#define QEMU_USB_NORMAL_FREQ 1000 +#define QEMU_USB_LAZY_FREQ 10 +#define MAX_USB_CONTROLLER_TYPES 4 +#define QEMU_USB_CONTROLLER_OHCI 0 +#define QEMU_USB_CONTROLLER_UHCI 1 +#define QEMU_USB_CONTROLLER_EHCI 2 +#define QEMU_USB_CONTROLLER_XHCI 3 + +typedef void (*QEMUSetFreqHandler) (int freq); + +typedef struct qemu_usb_controller { + const char *name; + QEMUSetFreqHandler qemu_set_freq; +} qemu_usb_controller; + +typedef qemu_usb_controller* qemu_usb_controller_ptr; + +enum qemu_timer_mode { + QEMU_TIMER_USB_NORMAL_MODE = 1 << 0, /* Set when VNC connect or + * with usb dev passthrough + */ + QEMU_TIMER_USB_LAZY_MODE = 1 << 1, /* Set when VNC disconnect */ +}; + +int qemu_register_usb_controller(qemu_usb_controller_ptr controller, + unsigned int type); +int qemu_timer_set_mode(enum qemu_timer_mode mode, unsigned int type); + extern QEMUTimerListGroup main_loop_tlg; /* diff --git a/ui/vnc.c b/ui/vnc.c index 4f23a0fa79..5dd77e73cb 100644 --- a/ui/vnc.c +++ b/ui/vnc.c @@ -1365,6 +1365,8 @@ void vnc_disconnect_finish(VncState *vs) g_free(vs->zrle); g_free(vs->tight); g_free(vs); + + qemu_timer_set_mode(QEMU_TIMER_USB_LAZY_MODE, QEMU_USB_CONTROLLER_UHCI); } size_t vnc_client_io_error(VncState *vs, ssize_t ret, Error *err) @@ -3341,6 +3343,8 @@ static void vnc_connect(VncDisplay *vd, QIOChannelSocket *sioc, } } } + + qemu_timer_set_mode(QEMU_TIMER_USB_NORMAL_MODE, QEMU_USB_CONTROLLER_UHCI); } void vnc_start_protocol(VncState *vs) diff --git a/util/qemu-timer.c b/util/qemu-timer.c index 6a0de33dd2..dc891cc557 100644 --- a/util/qemu-timer.c +++ b/util/qemu-timer.c @@ -23,6 +23,7 @@ */ #include "qemu/osdep.h" +#include "qemu/log.h" #include "qemu/main-loop.h" #include "qemu/timer.h" #include "qemu/lockable.h" @@ -75,6 +76,74 @@ struct QEMUTimerList { QemuEvent 
timers_done_ev; }; +typedef struct qemu_controller_timer_state { + qemu_usb_controller_ptr controller; + int refs; +} controller_timer_state; + +typedef controller_timer_state* controller_timer_state_ptr; + +static controller_timer_state uhci_timer_state = { + .controller = NULL, + .refs = 0, +}; + +static controller_timer_state_ptr \ + qemu_usb_controller_tab[MAX_USB_CONTROLLER_TYPES] = {NULL, + &uhci_timer_state, + NULL, NULL}; + +int qemu_register_usb_controller(qemu_usb_controller_ptr controller, + unsigned int type) +{ + if (type != QEMU_USB_CONTROLLER_UHCI) { + return 0; + } + + /* for companion EHCI controller will create three UHCI controllers, + * we init it only once. + */ + if (!qemu_usb_controller_tab[type]->controller) { + qemu_log("the usb controller (%d) registed frame handler\n", type); + qemu_usb_controller_tab[type]->controller = controller; + } + + return 0; +} + +int qemu_timer_set_mode(enum qemu_timer_mode mode, unsigned int type) +{ + if (type != QEMU_USB_CONTROLLER_UHCI) { + qemu_log("the usb controller (%d) no need change frame frep\n", type); + return 0; + } + + if (!qemu_usb_controller_tab[type]->controller) { + qemu_log("the usb controller (%d) not registed yet\n", type); + return 0; + } + + if (mode == QEMU_TIMER_USB_NORMAL_MODE) { + if (qemu_usb_controller_tab[type]->refs++ > 0) { + return 0; + } + qemu_usb_controller_tab[type]->controller-> + qemu_set_freq(QEMU_USB_NORMAL_FREQ); + qemu_log("Set the controller (%d) of freq %d HZ,\n", + type, QEMU_USB_NORMAL_FREQ); + } else { + if (--qemu_usb_controller_tab[type]->refs > 0) { + return 0; + } + qemu_usb_controller_tab[type]->controller-> + qemu_set_freq(QEMU_USB_LAZY_FREQ); + qemu_log("Set the controller(type:%d) of freq %d HZ,\n", + type, QEMU_USB_LAZY_FREQ); + } + + return 0; +} + /** * qemu_clock_ptr: * @type: type of clock -- Gitee From 3ef6dc341d6921a95564e9089f41ddbd79cd2a94 Mon Sep 17 00:00:00 2001 From: libai Date: Mon, 4 Dec 2023 15:55:53 +0800 Subject: [PATCH 135/939] vhost: implement migration state notifier for vdpa device Register migration state notifier to support triggered by migration exceptions Signed-off-by: libai --- hw/virtio/vdpa-dev-mig.c | 29 +++++++++++++++++++++++++++++ include/hw/virtio/vdpa-dev.h | 1 + 2 files changed, 30 insertions(+) diff --git a/hw/virtio/vdpa-dev-mig.c b/hw/virtio/vdpa-dev-mig.c index 1872f11f3f..9b47e3ed45 100644 --- a/hw/virtio/vdpa-dev-mig.c +++ b/hw/virtio/vdpa-dev-mig.c @@ -23,6 +23,7 @@ #include "hw/virtio/virtio-bus.h" #include "migration/register.h" #include "migration/migration.h" +#include "migration/misc.h" #include "qemu/error-report.h" #include "hw/virtio/vdpa-dev-mig.h" #include "migration/qemu-file-types.h" @@ -354,6 +355,31 @@ static SaveVMHandlers savevm_vdpa_handlers = { .load_setup = vdpa_load_setup, }; +static void vdpa_migration_state_notifier(Notifier *notifier, void *data) +{ + MigrationState *s = data; + VhostVdpaDevice *vdev = container_of(notifier, + VhostVdpaDevice, + migration_state); + struct vhost_dev *hdev = &vdev->dev; + int ret; + + switch (s->state) { + case MIGRATION_STATUS_CANCELLING: + case MIGRATION_STATUS_CANCELLED: + case MIGRATION_STATUS_FAILED: + ret = vhost_vdpa_set_mig_state(hdev, VDPA_DEVICE_CANCEL); + if (ret) { + error_report("Failed to set state CANCEL\n"); + } + + break; + case MIGRATION_STATUS_COMPLETED: + default: + break; + } +} + void vdpa_migration_register(VhostVdpaDevice *vdev) { vdev->vmstate = qdev_add_vm_change_state_handler(DEVICE(vdev), @@ -361,10 +387,13 @@ void vdpa_migration_register(VhostVdpaDevice 
*vdev) DEVICE(vdev)); register_savevm_live("vdpa", -1, 1, &savevm_vdpa_handlers, DEVICE(vdev)); + vdev->migration_state.notify = vdpa_migration_state_notifier; + migration_add_notifier(&vdev->migration_state, vdpa_migration_state_notifier); } void vdpa_migration_unregister(VhostVdpaDevice *vdev) { + migration_remove_notifier(&vdev->migration_state); unregister_savevm(VMSTATE_IF(&vdev->parent_obj.parent_obj), "vdpa", DEVICE(vdev)); qemu_del_vm_change_state_handler(vdev->vmstate); } diff --git a/include/hw/virtio/vdpa-dev.h b/include/hw/virtio/vdpa-dev.h index 43cbcef81b..20f50c76c6 100644 --- a/include/hw/virtio/vdpa-dev.h +++ b/include/hw/virtio/vdpa-dev.h @@ -39,6 +39,7 @@ struct VhostVdpaDevice { bool started; int (*post_init)(VhostVdpaDevice *v, Error **errp); VMChangeStateEntry *vmstate; + Notifier migration_state; }; #endif -- Gitee From 4688e12c57a34801010abf2a4cf528fcef3b9ec0 Mon Sep 17 00:00:00 2001 From: libai Date: Mon, 4 Dec 2023 15:59:56 +0800 Subject: [PATCH 136/939] vdpa: implement vdpa device migration Integrate the live migration code, call the registered live migration function, and open the vdpa live migration prototype Signed-off-by: libai --- hw/virtio/vdpa-dev.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/hw/virtio/vdpa-dev.c b/hw/virtio/vdpa-dev.c index f22d5d5bc0..6af78a4229 100644 --- a/hw/virtio/vdpa-dev.c +++ b/hw/virtio/vdpa-dev.c @@ -28,6 +28,8 @@ #include "hw/virtio/vdpa-dev.h" #include "sysemu/sysemu.h" #include "sysemu/runstate.h" +#include "hw/virtio/vdpa-dev-mig.h" +#include "migration/migration.h" static void vhost_vdpa_device_dummy_handle_output(VirtIODevice *vdev, VirtQueue *vq) @@ -154,6 +156,8 @@ static void vhost_vdpa_device_realize(DeviceState *dev, Error **errp) vhost_vdpa_device_dummy_handle_output); } + vdpa_migration_register(v); + return; free_config: @@ -173,6 +177,7 @@ static void vhost_vdpa_device_unrealize(DeviceState *dev) VhostVdpaDevice *s = VHOST_VDPA_DEVICE(vdev); int i; + vdpa_migration_unregister(s); virtio_set_status(vdev, 0); for (i = 0; i < s->num_queues; i++) { @@ -308,6 +313,7 @@ static void vhost_vdpa_device_stop(VirtIODevice *vdev) static void vhost_vdpa_device_set_status(VirtIODevice *vdev, uint8_t status) { VhostVdpaDevice *s = VHOST_VDPA_DEVICE(vdev); + MigrationState *ms = migrate_get_current(); bool should_start = virtio_device_started(vdev, status); Error *local_err = NULL; int ret; @@ -320,6 +326,11 @@ static void vhost_vdpa_device_set_status(VirtIODevice *vdev, uint8_t status) return; } + if (ms->state == RUN_STATE_PAUSED || + ms->state == RUN_STATE_RESTORE_VM) { + return; + } + if (should_start) { ret = vhost_vdpa_device_start(vdev, &local_err); if (ret < 0) { @@ -338,7 +349,7 @@ static Property vhost_vdpa_device_properties[] = { static const VMStateDescription vmstate_vhost_vdpa_device = { .name = "vhost-vdpa-device", - .unmigratable = 1, + .unmigratable = 0, .minimum_version_id = 1, .version_id = 1, .fields = (VMStateField[]) { -- Gitee From 587f42300488af4478d7aa1b62e2b351155621db Mon Sep 17 00:00:00 2001 From: libai Date: Mon, 4 Dec 2023 16:01:16 +0800 Subject: [PATCH 137/939] vdpa: move memory listener to the realize stage Move the memory listener registration of vdpa from the start stage to the realize stage. Avoid that in the start phase, the memory listener callback function has not yet been processed. 
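As an aside (not part of this change), the ordering issue can be shown with a toy listener in standalone C: if registration only happens on the start path, the region-add callback may not have run by the time start needs the mappings, whereas registering at realize guarantees it has. All names below are invented for the sketch.

    #include <stdbool.h>
    #include <stdio.h>

    /* Toy stand-in for a memory listener: one callback plus a "mapped" flag. */
    struct toy_listener {
        bool mapped;
    };

    static void region_add(struct toy_listener *l)
    {
        l->mapped = true;               /* pretend the DMA mapping was set up */
    }

    static void listener_register(struct toy_listener *l)
    {
        region_add(l);                  /* registration replays existing regions */
    }

    static void device_realize(struct toy_listener *l)
    {
        listener_register(l);           /* this change: register here ... */
    }

    static void device_start(struct toy_listener *l)
    {
        /* ... so mappings are already in place when the device starts. */
        printf("start: mappings %s\n", l->mapped ? "ready" : "MISSING");
    }

    int main(void)
    {
        struct toy_listener l = { false };
        device_realize(&l);
        device_start(&l);
        return 0;
    }
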
Signed-off-by: libai --- hw/virtio/vdpa-dev.c | 4 ++++ hw/virtio/vhost-vdpa.c | 5 ----- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/hw/virtio/vdpa-dev.c b/hw/virtio/vdpa-dev.c index 6af78a4229..877bf7464f 100644 --- a/hw/virtio/vdpa-dev.c +++ b/hw/virtio/vdpa-dev.c @@ -30,6 +30,7 @@ #include "sysemu/runstate.h" #include "hw/virtio/vdpa-dev-mig.h" #include "migration/migration.h" +#include "exec/address-spaces.h" static void vhost_vdpa_device_dummy_handle_output(VirtIODevice *vdev, VirtQueue *vq) @@ -125,6 +126,7 @@ static void vhost_vdpa_device_realize(DeviceState *dev, Error **errp) goto free_vqs; } + memory_listener_register(&v->vdpa.listener, &address_space_memory); v->config_size = vhost_vdpa_device_get_u32(v->vhostfd, VHOST_VDPA_GET_CONFIG_SIZE, errp); @@ -163,6 +165,7 @@ static void vhost_vdpa_device_realize(DeviceState *dev, Error **errp) free_config: g_free(v->config); vhost_cleanup: + memory_listener_unregister(&v->vdpa.listener); vhost_dev_cleanup(&v->dev); free_vqs: g_free(vqs); @@ -188,6 +191,7 @@ static void vhost_vdpa_device_unrealize(DeviceState *dev) g_free(s->config); g_free(s->dev.vqs); + memory_listener_unregister(&s->vdpa.listener); vhost_dev_cleanup(&s->dev); qemu_close(s->vhostfd); s->vhostfd = -1; diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index 063e941544..30408f2069 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -1320,8 +1320,6 @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started) "IOMMU and try again"); return -1; } - memory_listener_register(&v->listener, dev->vdev->dma_as); - return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK); } @@ -1515,7 +1513,6 @@ static bool vhost_vdpa_force_iommu(struct vhost_dev *dev) static int vhost_vdpa_suspend_device(struct vhost_dev *dev) { - struct vhost_vdpa *v = dev->opaque; int ret; vhost_vdpa_svqs_stop(dev); @@ -1526,7 +1523,6 @@ static int vhost_vdpa_suspend_device(struct vhost_dev *dev) } ret = vhost_vdpa_call(dev, VHOST_VDPA_SUSPEND, NULL); - memory_listener_unregister(&v->listener); return ret; } @@ -1548,7 +1544,6 @@ static int vhost_vdpa_resume_device(struct vhost_dev *dev) return 0; } - memory_listener_register(&v->listener, &address_space_memory); return vhost_vdpa_call(dev, VHOST_VDPA_RESUME, NULL); } -- Gitee From fe771abc365ba0cb62dd1726f1aa5274f1807876 Mon Sep 17 00:00:00 2001 From: Jiabo Feng Date: Sat, 30 Mar 2024 16:24:45 +0800 Subject: [PATCH 138/939] disable keyring option Due to the default prohibition of some syscall(e.g. add_key) in the Docker compilation environment, the testcases in test-crypto-secret.c cannot pass. 
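For context only (not part of this change), whether an environment permits the add_key(2) syscall can be checked with a short standalone probe; under Docker's default seccomp profile the call typically fails with EPERM, which is what breaks test-crypto-secret. The probe below is an illustration, not something the build runs.

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* From <linux/keyctl.h>; spelled out to keep the probe self-contained. */
    #define KEY_SPEC_PROCESS_KEYRING (-2)

    int main(void)
    {
    #ifdef SYS_add_key
        long id = syscall(SYS_add_key, "user", "probe", "x", 1,
                          KEY_SPEC_PROCESS_KEYRING);
        if (id < 0) {
            /* Docker's default seccomp profile usually yields EPERM here. */
            printf("add_key unavailable: %s\n", strerror(errno));
            return 1;
        }
        printf("add_key works, key id %ld\n", id);
    #else
        printf("SYS_add_key not defined on this platform\n");
    #endif
        return 0;
    }
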
Signed-off-by: Jiabo Feng --- meson_options.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/meson_options.txt b/meson_options.txt index c9baeda639..cf9706c411 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -121,7 +121,7 @@ option('avx512f', type: 'feature', value: 'disabled', description: 'AVX512F optimizations') option('avx512bw', type: 'feature', value: 'auto', description: 'AVX512BW optimizations') -option('keyring', type: 'feature', value: 'auto', +option('keyring', type: 'feature', value: 'disabled', description: 'Linux keyring support') option('libkeyutils', type: 'feature', value: 'auto', description: 'Linux keyutils support') -- Gitee From e58b48ab2bb679f4c661301019d6f94bd39f93e5 Mon Sep 17 00:00:00 2001 From: libai Date: Tue, 19 Dec 2023 20:18:03 +0800 Subject: [PATCH 139/939] vdpa: support vdpa device suspend/resume only implement suspend and resume interface used for migration. The current implementation still has bugs when suspend/resume a virtual machine. Fix it. Fixes: 4c5a9a0703 (""vhost: implement vhost_vdpa_device_suspend/resume) Signed-off-by: libai --- hw/virtio/vdpa-dev-mig.c | 16 +++++++++++----- hw/virtio/vdpa-dev.c | 8 +------- include/hw/virtio/vdpa-dev.h | 1 + 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/hw/virtio/vdpa-dev-mig.c b/hw/virtio/vdpa-dev-mig.c index 9b47e3ed45..8b13f89c85 100644 --- a/hw/virtio/vdpa-dev-mig.c +++ b/hw/virtio/vdpa-dev-mig.c @@ -143,6 +143,7 @@ static int vhost_vdpa_device_suspend(VhostVdpaDevice *vdpa) } vdpa->started = false; + vdpa->suspended = true; ret = vhost_dev_suspend(&vdpa->dev, vdev, false); if (ret) { @@ -165,6 +166,7 @@ set_guest_notifiers_fail: } suspend_fail: + vdpa->suspended = false; vdpa->started = true; return ret; } @@ -201,6 +203,7 @@ static int vhost_vdpa_device_resume(VhostVdpaDevice *vdpa) goto err_guest_notifiers; } vdpa->started = true; + vdpa->suspended = false; /* * guest_notifier_mask/pending not used yet, so just unmask @@ -241,7 +244,7 @@ static void vdpa_dev_vmstate_change(void *opaque, bool running, RunState state) MigrationIncomingState *mis = migration_incoming_get_current(); if (!running) { - if (ms->state == RUN_STATE_PAUSED) { + if (ms->state == MIGRATION_STATUS_ACTIVE || state == RUN_STATE_PAUSED) { ret = vhost_vdpa_device_suspend(vdpa); if (ret) { error_report("suspend vdpa device failed: %d\n", ret); @@ -251,16 +254,19 @@ static void vdpa_dev_vmstate_change(void *opaque, bool running, RunState state) } } } else { - if (ms->state == RUN_STATE_RESTORE_VM) { + if (vdpa->suspended) { ret = vhost_vdpa_device_resume(vdpa); if (ret) { - error_report("migration dest resume device failed, abort!\n"); - exit(EXIT_FAILURE); + error_report("vhost vdpa device resume failed: %d\n", ret); } } if (mis->state == RUN_STATE_RESTORE_VM) { - vhost_vdpa_call(hdev, VHOST_VDPA_RESUME, NULL); + ret = vhost_vdpa_call(hdev, VHOST_VDPA_RESUME, NULL); + if (ret) { + error_report("migration dest resume device failed: %d\n", ret); + exit(EXIT_FAILURE); + } /* post resume */ mis->bh = qemu_bh_new(vdpa_dev_migration_handle_incoming_bh, hdev); diff --git a/hw/virtio/vdpa-dev.c b/hw/virtio/vdpa-dev.c index 877bf7464f..91e71847b0 100644 --- a/hw/virtio/vdpa-dev.c +++ b/hw/virtio/vdpa-dev.c @@ -317,7 +317,6 @@ static void vhost_vdpa_device_stop(VirtIODevice *vdev) static void vhost_vdpa_device_set_status(VirtIODevice *vdev, uint8_t status) { VhostVdpaDevice *s = VHOST_VDPA_DEVICE(vdev); - MigrationState *ms = migrate_get_current(); bool should_start = virtio_device_started(vdev, 
status); Error *local_err = NULL; int ret; @@ -326,12 +325,7 @@ static void vhost_vdpa_device_set_status(VirtIODevice *vdev, uint8_t status) should_start = false; } - if (s->started == should_start) { - return; - } - - if (ms->state == RUN_STATE_PAUSED || - ms->state == RUN_STATE_RESTORE_VM) { + if (s->started == should_start || s->suspended) { return; } diff --git a/include/hw/virtio/vdpa-dev.h b/include/hw/virtio/vdpa-dev.h index 20f50c76c6..60e9c3f3fe 100644 --- a/include/hw/virtio/vdpa-dev.h +++ b/include/hw/virtio/vdpa-dev.h @@ -37,6 +37,7 @@ struct VhostVdpaDevice { int config_size; uint16_t queue_size; bool started; + bool suspended; int (*post_init)(VhostVdpaDevice *v, Error **errp); VMChangeStateEntry *vmstate; Notifier migration_state; -- Gitee From a78602118043eb9923996504d5b2e1b14a1ec38d Mon Sep 17 00:00:00 2001 From: libai Date: Thu, 21 Dec 2023 11:03:37 +0800 Subject: [PATCH 140/939] vdpa: suspend function return 0 when the vdpa device is stopped When vhost vdpa device is stopped(vdpa->started is false), suspend operation do nothing and return success, instead of return failure. The same goes for resume function. Signed-off-by: libai --- hw/virtio/vdpa-dev-mig.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/hw/virtio/vdpa-dev-mig.c b/hw/virtio/vdpa-dev-mig.c index 8b13f89c85..b889dd4715 100644 --- a/hw/virtio/vdpa-dev-mig.c +++ b/hw/virtio/vdpa-dev-mig.c @@ -134,8 +134,8 @@ static int vhost_vdpa_device_suspend(VhostVdpaDevice *vdpa) VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); int ret; - if (!vdpa->started) { - return -EFAULT; + if (!vdpa->started || vdpa->suspended) { + return 0; } if (!k->set_guest_notifiers) { @@ -178,6 +178,10 @@ static int vhost_vdpa_device_resume(VhostVdpaDevice *vdpa) VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); int i, ret; + if (vdpa->started || !vdpa->suspended) { + return 0; + } + if (!k->set_guest_notifiers) { error_report("binding does not support guest notifiers\n"); return -ENOSYS; -- Gitee From 5714aaddcbc313e63da435a253d9d472984d7b49 Mon Sep 17 00:00:00 2001 From: libai Date: Thu, 14 Dec 2023 11:22:54 +0800 Subject: [PATCH 141/939] vdpa: correct param passed in when unregister save The idstr passed in the unregister_savevm function is inconsisten with the idstr passed in when register_savevm_live registration. Needs to be modified, otherwise migration will fail after hotunplug all vdpa devices. Signed-off-by: libai --- hw/virtio/vdpa-dev-mig.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/virtio/vdpa-dev-mig.c b/hw/virtio/vdpa-dev-mig.c index b889dd4715..1d299019da 100644 --- a/hw/virtio/vdpa-dev-mig.c +++ b/hw/virtio/vdpa-dev-mig.c @@ -404,6 +404,6 @@ void vdpa_migration_register(VhostVdpaDevice *vdev) void vdpa_migration_unregister(VhostVdpaDevice *vdev) { migration_remove_notifier(&vdev->migration_state); - unregister_savevm(VMSTATE_IF(&vdev->parent_obj.parent_obj), "vdpa", DEVICE(vdev)); + unregister_savevm(NULL, "vdpa", DEVICE(vdev)); qemu_del_vm_change_state_handler(vdev->vmstate); } -- Gitee From b82f02e93d5efa2ea62dd135c508cb707fdd35a7 Mon Sep 17 00:00:00 2001 From: libai Date: Tue, 19 Dec 2023 20:32:00 +0800 Subject: [PATCH 142/939] vdpa: don't suspend/resume device when vdpa device not started When vdpa device not started, we don't need to suspend vdpa device and send vdpa device state information. Therefore, add the suspended flag of vdpa device to distinguish whether the device is suspended and use it to determine whether the device needs to resume in dest qemu. 
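For illustration (not part of this change), the stream framing added here (a 16-bit suspended flag followed, only when it is set, by the device buffer) must be mirrored exactly on the load side. The sketch below replays that symmetry against a plain memory buffer; the helper names are invented and the real code goes through QEMUFile.

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static uint8_t stream[64];
    static size_t wpos, rpos;

    static void put_be16(uint16_t v)
    {
        stream[wpos++] = v >> 8;
        stream[wpos++] = v & 0xff;
    }

    static uint16_t get_be16(void)
    {
        uint16_t v = (uint16_t)((stream[rpos] << 8) | stream[rpos + 1]);
        rpos += 2;
        return v;
    }

    /* Save side: always write the flag, write device state only if suspended. */
    static void save(int suspended, const char *state)
    {
        put_be16((uint16_t)suspended);
        if (suspended) {
            size_t len = strlen(state);
            put_be16((uint16_t)len);
            memcpy(&stream[wpos], state, len);
            wpos += len;
        }
    }

    /* Load side: read the flag first, then decide whether more data follows. */
    static void load(void)
    {
        if (get_be16()) {
            uint16_t len = get_be16();
            printf("restoring %u bytes of device state\n", len);
            rpos += len;
        } else {
            printf("device was not suspended, nothing to restore\n");
        }
    }

    int main(void)
    {
        save(1, "vq-state");
        load();
        return 0;
    }
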
Signed-off-by: libai --- hw/virtio/vdpa-dev-mig.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/hw/virtio/vdpa-dev-mig.c b/hw/virtio/vdpa-dev-mig.c index 1d299019da..887c96a201 100644 --- a/hw/virtio/vdpa-dev-mig.c +++ b/hw/virtio/vdpa-dev-mig.c @@ -294,10 +294,13 @@ static int vdpa_save_complete_precopy(QEMUFile *f, void *opaque) int ret; qemu_put_be64(f, VDPA_MIG_FLAG_DEV_CONFIG_STATE); - ret = vhost_vdpa_dev_buffer_save(hdev, f); - if (ret) { - error_report("Save vdpa device buffer failed: %d\n", ret); - return ret; + qemu_put_be16(f, (uint16_t)vdev->suspended); + if (vdev->suspended) { + ret = vhost_vdpa_dev_buffer_save(hdev, f); + if (ret) { + error_report("Save vdpa device buffer failed: %d\n", ret); + return ret; + } } qemu_put_be64(f, VDPA_MIG_FLAG_END_OF_STATE); @@ -311,6 +314,7 @@ static int vdpa_load_state(QEMUFile *f, void *opaque, int version_id) int ret; uint64_t data; + uint16_t suspended; data = qemu_get_be64(f); while (data != VDPA_MIG_FLAG_END_OF_STATE) { @@ -323,10 +327,13 @@ static int vdpa_load_state(QEMUFile *f, void *opaque, int version_id) return -EINVAL; } } else if (data == VDPA_MIG_FLAG_DEV_CONFIG_STATE) { - ret = vhost_vdpa_dev_buffer_load(hdev, f); - if (ret) { - error_report("fail to restore device buffer.\n"); - return ret; + suspended = qemu_get_be16(f); + if (suspended) { + ret = vhost_vdpa_dev_buffer_load(hdev, f); + if (ret) { + error_report("fail to restore device buffer.\n"); + return ret; + } } } -- Gitee From 28ed79b98f08b5701dcaab7c6ad1015602b28e02 Mon Sep 17 00:00:00 2001 From: libai Date: Sat, 12 Nov 2022 22:40:13 +0800 Subject: [PATCH 143/939] docs: Add generic vhost-vdpa device documentation Add the description of the generic vhost-vdpa device Signed-off-by: libai --- docs/system/device-emulation.rst | 1 + .../devices/vhost-vdpa-generic-device.rst | 46 +++++++++++++++++++ 2 files changed, 47 insertions(+) create mode 100644 docs/system/devices/vhost-vdpa-generic-device.rst diff --git a/docs/system/device-emulation.rst b/docs/system/device-emulation.rst index d1f3277cb0..e1b2d18fb1 100644 --- a/docs/system/device-emulation.rst +++ b/docs/system/device-emulation.rst @@ -98,3 +98,4 @@ Emulated Devices devices/canokey.rst devices/usb-u2f.rst devices/igb.rst + devices/vhost-vdpa-generic-device.rst diff --git a/docs/system/devices/vhost-vdpa-generic-device.rst b/docs/system/devices/vhost-vdpa-generic-device.rst new file mode 100644 index 0000000000..25fbcac60e --- /dev/null +++ b/docs/system/devices/vhost-vdpa-generic-device.rst @@ -0,0 +1,46 @@ + +========================= +vhost-vDPA generic device +========================= + +This document explains the usage of the vhost-vDPA generic device. + +Description +----------- + +vDPA(virtio data path acceleration) device is a device that uses a datapath +which complies with the virtio specifications with vendor specific control +path. + +QEMU provides two types of vhost-vDPA devices to enable the vDPA device, one +is type sensitive which means QEMU needs to know the actual device type +(e.g. net, blk, scsi) and another is called "vhost-vDPA generic device" which +is type insensitive + +The vhost-vDPA generic device builds on the vhost-vdpa subsystem and virtio +subsystem. It is quite small, but it can support any type of virtio device. 
+ +Examples +-------- + +Prepare the vhost-vDPA backends first: + +:: + host# ls -l /dev/vhost-vdpa-* + crw------- 1 root root 236, 0 Nov 2 00:49 /dev/vhost-vdpa-0 + +Start QEMU with virtio-mmio bus: + +:: + host# qemu-system \ + -M microvm -m 512 -smp 2 -kernel ... -initrd ... \ + -device vhost-vdpa-device,vhostdev=/dev/vhost-vdpa-0 \ + ... + +Start QEMU with virtio-pci bus: + +:: + host# qemu-system \ + -M pc -m 512 -smp 2 \ + -device vhost-vdpa-device-pci,vhostdev=/dev/vhost-vdpa-0 \ + ...\ -- Gitee From 0f515ff831f46ef34cd83aa145e547e48d8b3b56 Mon Sep 17 00:00:00 2001 From: libai Date: Thu, 14 Dec 2023 11:05:52 +0800 Subject: [PATCH 144/939] vdpa: set vring enable only if the vring address has already been set Currently, vhost-vdpa does not determine the status of each vring when performing the enable operation on vring. When the vBIOS(EDK2) is running, the driver will not enable all vrings. In this case, setting all vrings to enable is isconsistent with the actual situation. Add logic when enabling vring, make a judement on the vring status. If the vring address is not set, the vring will not enabled. Signed-off-by: libai --- hw/virtio/vhost-vdpa.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index 30408f2069..d49826845f 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -890,6 +890,11 @@ int vhost_vdpa_set_vring_ready(struct vhost_vdpa *v, unsigned idx) .index = idx, .num = 1, }; + hwaddr addr = virtio_queue_get_desc_addr(dev->vdev, idx); + if (addr == 0) { + return 0; + } + int r = vhost_vdpa_call(dev, VHOST_VDPA_SET_VRING_ENABLE, &state); trace_vhost_vdpa_set_vring_ready(dev, idx, r); -- Gitee From 6689eebbb520dc75bc65e0914c4e05e40a4efc1d Mon Sep 17 00:00:00 2001 From: Prasad J Pandit Date: Mon, 21 Jun 2021 09:22:35 +0800 Subject: [PATCH 145/939] ide: ahci: add check to avoid null dereference (CVE-2019-12067) Fix CVE-2019-12067 AHCI emulator while committing DMA buffer in ahci_commit_buf() may do a NULL dereference if the command header 'ad->cur_cmd' is null. Add check to avoid it. Reported-by: Bugs SysSec Signed-off-by: Prasad J Pandit Signed-off-by: Jiajie Li Signed-off-by: Yan Wang Signed-off-by: Adttil --- hw/ide/ahci.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/hw/ide/ahci.c b/hw/ide/ahci.c index afdc44b8e0..8062e1743c 100644 --- a/hw/ide/ahci.c +++ b/hw/ide/ahci.c @@ -1519,8 +1519,10 @@ static void ahci_commit_buf(const IDEDMA *dma, uint32_t tx_bytes) { AHCIDevice *ad = DO_UPCAST(AHCIDevice, dma, dma); - tx_bytes += le32_to_cpu(ad->cur_cmd->status); - ad->cur_cmd->status = cpu_to_le32(tx_bytes); + if (ad->cur_cmd) { + tx_bytes += le32_to_cpu(ad->cur_cmd->status); + ad->cur_cmd->status = cpu_to_le32(tx_bytes); + } } static int ahci_dma_rw_buf(const IDEDMA *dma, bool is_write) -- Gitee From 6e6215b3ad0c8eac918bca9e2b5bb661e27f2fed Mon Sep 17 00:00:00 2001 From: zhouli57 Date: Sat, 18 Dec 2021 09:39:57 +0800 Subject: [PATCH 146/939] net: eepro100: validate various address valuesi(CVE-2021-20255) fix CVE-2021-20255 patch link: https://lists.gnu.org/archive/html/qemu-devel/2021-02/msg06098.html fix CVE-2021-20255, sync patch from ostms platform. 
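Side note, not part of this change: the fix is the classic re-entrancy guard, a busy flag taken before the command loop and cleared once the list is drained. A stripped-down sketch of the pattern, with invented names:

    #include <stdbool.h>
    #include <stdio.h>

    static bool busy;
    static int depth;

    /* Process a command list; the emulation path may call back into us. */
    static void action_command(void)
    {
        if (busy) {
            /* Re-entered from our own processing: bail out rather than
             * recursing without bound (the CVE-2021-20255 scenario). */
            printf("recursion blocked at depth %d\n", depth);
            return;
        }
        busy = true;

        depth++;
        if (depth < 3) {
            action_command();           /* simulate the guest-driven re-entry */
        }
        depth--;

        busy = false;
    }

    int main(void)
    {
        action_command();
        return 0;
    }
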
Signed-off-by: zhouli57 Signed-off-by: Yan Wang --- hw/net/eepro100.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/hw/net/eepro100.c b/hw/net/eepro100.c index 69e1c4bb89..f6204ec059 100644 --- a/hw/net/eepro100.c +++ b/hw/net/eepro100.c @@ -279,6 +279,9 @@ typedef struct { /* Quasi static device properties (no need to save them). */ uint16_t stats_size; bool has_extended_tcb_support; + + /* Flag to avoid recursions. */ + bool busy; } EEPRO100State; /* Word indices in EEPROM. */ @@ -844,6 +847,14 @@ static void action_command(EEPRO100State *s) Therefore we limit the number of iterations. */ unsigned max_loop_count = 16; + if (s->busy) { + /* Prevent recursions. */ + logout("recursion in %s:%u\n", __FILE__, __LINE__); + return; + } + + s->busy = true; + for (;;) { bool bit_el; bool bit_s; @@ -940,6 +951,7 @@ static void action_command(EEPRO100State *s) } TRACE(OTHER, logout("CU list empty\n")); /* List is empty. Now CU is idle or suspended. */ + s->busy = false; } static void eepro100_cu_command(EEPRO100State * s, uint8_t val) -- Gitee From 56bfcb77a384419dbd09ca37075a3cf4ba2e9f19 Mon Sep 17 00:00:00 2001 From: Elen Avan Date: Fri, 22 Dec 2023 22:17:21 +0300 Subject: [PATCH 147/939] include/ui/rect.h: fix qemu_rect_init() mis-assignment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Elen Avan Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2051 Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2050 Fixes: a200d53b1fde "virtio-gpu: replace PIXMAN for region/rect test" Cc: qemu-stable@nongnu.org Reviewed-by: Michael Tokarev Reviewed-by: Marc-André Lureau Signed-off-by: Michael Tokarev --- include/ui/rect.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ui/rect.h b/include/ui/rect.h index 94898f92d0..68f05d78a8 100644 --- a/include/ui/rect.h +++ b/include/ui/rect.h @@ -19,7 +19,7 @@ static inline void qemu_rect_init(QemuRect *rect, uint16_t width, uint16_t height) { rect->x = x; - rect->y = x; + rect->y = y; rect->width = width; rect->height = height; } -- Gitee From e4ae54316651bf6af12de263da158c5ec4ed0401 Mon Sep 17 00:00:00 2001 From: Xu Yandong Date: Mon, 18 Mar 2024 17:31:31 +0800 Subject: [PATCH 148/939] cpu: add Kunpeng-920 cpu support Add the Kunpeng-920 CPU model Signed-off-by: Xu Yandong Signed-off-by: Mingwang Li Signed-off-by: Yuan Zhang --- hw/arm/virt.c | 1 + target/arm/cpu64.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+) diff --git a/hw/arm/virt.c b/hw/arm/virt.c index be2856c018..500a15aa5b 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -220,6 +220,7 @@ static const char *valid_cpus[] = { #endif ARM_CPU_TYPE_NAME("cortex-a53"), ARM_CPU_TYPE_NAME("cortex-a57"), + ARM_CPU_TYPE_NAME("Kunpeng-920"), ARM_CPU_TYPE_NAME("host"), ARM_CPU_TYPE_NAME("max"), }; diff --git a/target/arm/cpu64.c b/target/arm/cpu64.c index 1e9c6c85ae..922eac3b61 100644 --- a/target/arm/cpu64.c +++ b/target/arm/cpu64.c @@ -705,6 +705,77 @@ static void aarch64_a53_initfn(Object *obj) define_cortex_a72_a57_a53_cp_reginfo(cpu); } +static void aarch64_a72_initfn(Object *obj) +{ + ARMCPU *cpu = ARM_CPU(obj); + + cpu->dtb_compatible = "arm,cortex-a72"; + set_feature(&cpu->env, ARM_FEATURE_V8); + set_feature(&cpu->env, ARM_FEATURE_NEON); + set_feature(&cpu->env, ARM_FEATURE_GENERIC_TIMER); + set_feature(&cpu->env, ARM_FEATURE_AARCH64); + set_feature(&cpu->env, ARM_FEATURE_CBAR_RO); + set_feature(&cpu->env, ARM_FEATURE_EL2); + set_feature(&cpu->env, 
ARM_FEATURE_EL3); + set_feature(&cpu->env, ARM_FEATURE_PMU); + cpu->midr = 0x410fd083; + cpu->revidr = 0x00000000; + cpu->reset_fpsid = 0x41034080; + cpu->isar.mvfr0 = 0x10110222; + cpu->isar.mvfr1 = 0x12111111; + cpu->isar.mvfr2 = 0x00000043; + cpu->ctr = 0x8444c004; + cpu->reset_sctlr = 0x00c50838; + cpu->isar.id_pfr0 = 0x00000131; + cpu->isar.id_pfr1 = 0x00011011; + cpu->isar.id_dfr0 = 0x03010066; + cpu->id_afr0 = 0x00000000; + cpu->isar.id_mmfr0 = 0x10201105; + cpu->isar.id_mmfr1 = 0x40000000; + cpu->isar.id_mmfr2 = 0x01260000; + cpu->isar.id_mmfr3 = 0x02102211; + cpu->isar.id_isar0 = 0x02101110; + cpu->isar.id_isar1 = 0x13112111; + cpu->isar.id_isar2 = 0x21232042; + cpu->isar.id_isar3 = 0x01112131; + cpu->isar.id_isar4 = 0x00011142; + cpu->isar.id_isar5 = 0x00011121; + cpu->isar.id_aa64pfr0 = 0x00002222; + cpu->isar.id_aa64dfr0 = 0x10305106; + cpu->isar.id_aa64isar0 = 0x00011120; + cpu->isar.id_aa64mmfr0 = 0x00001124; + cpu->isar.dbgdidr = 0x3516d000; + cpu->clidr = 0x0a200023; + cpu->ccsidr[0] = 0x701fe00a; /* 32KB L1 dcache */ + cpu->ccsidr[1] = 0x201fe012; /* 48KB L1 icache */ + cpu->ccsidr[2] = 0x707fe07a; /* 1MB L2 cache */ + cpu->dcz_blocksize = 4; /* 64 bytes */ + cpu->gic_num_lrs = 4; + cpu->gic_vpribits = 5; + cpu->gic_vprebits = 5; + define_cortex_a72_a57_a53_cp_reginfo(cpu); +} + +static void aarch64_kunpeng_920_initfn(Object *obj) +{ + ARMCPU *cpu = ARM_CPU(obj); + + /* + * Hisilicon Kunpeng-920 CPU is similar to cortex-a72, + * so first initialize cpu data as cortex-a72, + * and then update the special register. + */ + aarch64_a72_initfn(obj); + + cpu->midr = 0x480fd010; + cpu->ctr = 0x84448004; + cpu->isar.id_aa64pfr0 = 0x11001111; + cpu->isar.id_aa64dfr0 = 0x110305408; + cpu->isar.id_aa64isar0 = 0x10211120; + cpu->isar.id_aa64mmfr0 = 0x101125; + cpu->kvm_target = KVM_ARM_TARGET_GENERIC_V8; +} + static void aarch64_host_initfn(Object *obj) { #if defined(CONFIG_KVM) @@ -744,6 +815,7 @@ static void aarch64_max_initfn(Object *obj) static const ARMCPUInfo aarch64_cpus[] = { { .name = "cortex-a57", .initfn = aarch64_a57_initfn }, { .name = "cortex-a53", .initfn = aarch64_a53_initfn }, + { .name = "Kunpeng-920", .initfn = aarch64_kunpeng_920_initfn}, { .name = "max", .initfn = aarch64_max_initfn }, #if defined(CONFIG_KVM) || defined(CONFIG_HVF) { .name = "host", .initfn = aarch64_host_initfn }, -- Gitee From 5853333c9513caea541701c95a4ac691bb97452f Mon Sep 17 00:00:00 2001 From: Xu Yandong Date: Tue, 19 Mar 2024 10:45:56 +0800 Subject: [PATCH 149/939] cpu: add Cortex-A72 processor kvm target support The ARM Cortex-A72 is ARMv8-A micro-architecture, add kvm target to ARM Cortex-A72 processor definition. 
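As background, not part of this change: the MISMATCH_CHECK lines exist so the userspace mirror of a kernel constant can never drift silently; the same idea is a one-line C11 static assertion. The value 5 matches the kernel's KVM_ARM_TARGET_GENERIC_V8; the second macro name below is invented to stand in for the kernel header.

    #include <stdio.h>

    /* Userspace mirror of the kernel constant. */
    #define QEMU_KVM_ARM_TARGET_GENERIC_V8 5

    /* Stand-in for the value normally pulled from <linux/kvm.h>. */
    #define KERNEL_KVM_ARM_TARGET_GENERIC_V8 5

    /* Compile-time guarantee that the two definitions never drift apart. */
    _Static_assert(QEMU_KVM_ARM_TARGET_GENERIC_V8 == KERNEL_KVM_ARM_TARGET_GENERIC_V8,
                   "KVM target constant mismatch");

    int main(void)
    {
        printf("generic v8 target = %d\n", QEMU_KVM_ARM_TARGET_GENERIC_V8);
        return 0;
    }
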
Signed-off-by: Xu Yandong Signed-off-by: Mingwang Li Signed-off-by: Yuan Zhang --- target/arm/cpu64.c | 2 +- target/arm/kvm-consts.h | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/target/arm/cpu64.c b/target/arm/cpu64.c index 922eac3b61..471014b5a9 100644 --- a/target/arm/cpu64.c +++ b/target/arm/cpu64.c @@ -710,6 +710,7 @@ static void aarch64_a72_initfn(Object *obj) ARMCPU *cpu = ARM_CPU(obj); cpu->dtb_compatible = "arm,cortex-a72"; + cpu->kvm_target = QEMU_KVM_ARM_TARGET_GENERIC_V8; set_feature(&cpu->env, ARM_FEATURE_V8); set_feature(&cpu->env, ARM_FEATURE_NEON); set_feature(&cpu->env, ARM_FEATURE_GENERIC_TIMER); @@ -773,7 +774,6 @@ static void aarch64_kunpeng_920_initfn(Object *obj) cpu->isar.id_aa64dfr0 = 0x110305408; cpu->isar.id_aa64isar0 = 0x10211120; cpu->isar.id_aa64mmfr0 = 0x101125; - cpu->kvm_target = KVM_ARM_TARGET_GENERIC_V8; } static void aarch64_host_initfn(Object *obj) diff --git a/target/arm/kvm-consts.h b/target/arm/kvm-consts.h index 7c6adc14f6..c034823170 100644 --- a/target/arm/kvm-consts.h +++ b/target/arm/kvm-consts.h @@ -133,6 +133,8 @@ MISMATCH_CHECK(QEMU_PSCI_RET_DISABLED, PSCI_RET_DISABLED); #define QEMU_KVM_ARM_TARGET_CORTEX_A57 2 #define QEMU_KVM_ARM_TARGET_XGENE_POTENZA 3 #define QEMU_KVM_ARM_TARGET_CORTEX_A53 4 +/* Generic ARM v8 target */ +#define QEMU_KVM_ARM_TARGET_GENERIC_V8 5 /* There's no kernel define for this: sentinel value which * matches no KVM target value for either 64 or 32 bit @@ -144,6 +146,7 @@ MISMATCH_CHECK(QEMU_KVM_ARM_TARGET_FOUNDATION_V8, KVM_ARM_TARGET_FOUNDATION_V8); MISMATCH_CHECK(QEMU_KVM_ARM_TARGET_CORTEX_A57, KVM_ARM_TARGET_CORTEX_A57); MISMATCH_CHECK(QEMU_KVM_ARM_TARGET_XGENE_POTENZA, KVM_ARM_TARGET_XGENE_POTENZA); MISMATCH_CHECK(QEMU_KVM_ARM_TARGET_CORTEX_A53, KVM_ARM_TARGET_CORTEX_A53); +MISMATCH_CHECK(QEMU_KVM_ARM_TARGET_GENERIC_V8, KVM_ARM_TARGET_GENERIC_V8); #define CP_REG_ARM64 0x6000000000000000ULL #define CP_REG_ARM_COPROC_MASK 0x000000000FFF0000 -- Gitee From 3402740cb4f6d6b9baabfde0a7667b4990b010a5 Mon Sep 17 00:00:00 2001 From: Kunkun Jiang Date: Sat, 30 Mar 2024 19:21:59 +0800 Subject: [PATCH 150/939] tests: virt: Allow changes to PPTT test table Allow changes to test/data/acpi/virt/PPTT*, prepare to change the building policy of the cluster topology. Signed-off-by: Kunkun Jiang --- tests/qtest/bios-tables-test-allowed-diff.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/qtest/bios-tables-test-allowed-diff.h b/tests/qtest/bios-tables-test-allowed-diff.h index dfb8523c8b..18d02a710d 100644 --- a/tests/qtest/bios-tables-test-allowed-diff.h +++ b/tests/qtest/bios-tables-test-allowed-diff.h @@ -1 +1,4 @@ /* List of comma-separated changed AML files to ignore */ +"tests/data/acpi/virt/PPTT", +"tests/data/acpi/virt/PPTT.acpihmatvirt", +"tests/data/acpi/virt/PPTT.topology", -- Gitee From 7d3d37d3af4278aee627952d6a81b63dec6ac62b Mon Sep 17 00:00:00 2001 From: Ying Fang Date: Sun, 17 Mar 2024 18:56:09 +0800 Subject: [PATCH 151/939] hw/arm64: add vcpu cache info support Support VCPU Cache info by dtb and PPTT table, including L1, L2 and L3 Cache. 
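For illustration (not part of this change), the hardcoded cache constants can be cross-checked with the usual relation size = sets x ways x line_size. The numbers below are copied from the new L1 data cache and L2 cache defines; the check is only a sanity aid, since neither PPTT nor the dtb format enforces that relation.

    #include <assert.h>
    #include <stdio.h>

    struct cache {
        const char *name;
        unsigned size, sets, ways, line;   /* bytes, sets, associativity, bytes */
    };

    int main(void)
    {
        /* Values taken from the ARM_L1DCACHE and ARM_L2CACHE defines. */
        struct cache caches[] = {
            { "L1D", 65536,  256,  4, 64 },
            { "L2",  524288, 1024, 8, 64 },
        };

        for (unsigned i = 0; i < sizeof(caches) / sizeof(caches[0]); i++) {
            unsigned derived = caches[i].sets * caches[i].ways * caches[i].line;
            printf("%s: declared %u, sets*ways*line %u\n",
                   caches[i].name, caches[i].size, derived);
            assert(derived == caches[i].size);   /* holds for these two levels */
        }
        return 0;
    }
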
Signed-off-by: zhanghailiang Signed-off-by: Honghao Signed-off-by: Ying Fang Signed-off-by: Yanan Wang Signed-off-by: Yuan Zhang --- hw/acpi/aml-build.c | 158 ++++++++++++++++++++++++++++++++++++ hw/arm/virt.c | 72 ++++++++++++++++ include/hw/acpi/aml-build.h | 47 +++++++++++ 3 files changed, 277 insertions(+) diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c index af66bde0f5..2968df5562 100644 --- a/hw/acpi/aml-build.c +++ b/hw/acpi/aml-build.c @@ -1994,6 +1994,163 @@ static void build_processor_hierarchy_node(GArray *tbl, uint32_t flags, } } +#ifdef __aarch64__ +/* + * ACPI spec, Revision 6.3 + * 5.2.29.2 Cache Type Structure (Type 1) + */ +static void build_cache_hierarchy_node(GArray *tbl, uint32_t next_level, + uint32_t cache_type) +{ + build_append_byte(tbl, 1); + build_append_byte(tbl, 24); + build_append_int_noprefix(tbl, 0, 2); + build_append_int_noprefix(tbl, 127, 4); + build_append_int_noprefix(tbl, next_level, 4); + + switch (cache_type) { + case ARM_L1D_CACHE: /* L1 dcache info */ + build_append_int_noprefix(tbl, ARM_L1DCACHE_SIZE, 4); + build_append_int_noprefix(tbl, ARM_L1DCACHE_SETS, 4); + build_append_byte(tbl, ARM_L1DCACHE_ASSOCIATIVITY); + build_append_byte(tbl, ARM_L1DCACHE_ATTRIBUTES); + build_append_int_noprefix(tbl, ARM_L1DCACHE_LINE_SIZE, 2); + break; + case ARM_L1I_CACHE: /* L1 icache info */ + build_append_int_noprefix(tbl, ARM_L1ICACHE_SIZE, 4); + build_append_int_noprefix(tbl, ARM_L1ICACHE_SETS, 4); + build_append_byte(tbl, ARM_L1ICACHE_ASSOCIATIVITY); + build_append_byte(tbl, ARM_L1ICACHE_ATTRIBUTES); + build_append_int_noprefix(tbl, ARM_L1ICACHE_LINE_SIZE, 2); + break; + case ARM_L2_CACHE: /* L2 cache info */ + build_append_int_noprefix(tbl, ARM_L2CACHE_SIZE, 4); + build_append_int_noprefix(tbl, ARM_L2CACHE_SETS, 4); + build_append_byte(tbl, ARM_L2CACHE_ASSOCIATIVITY); + build_append_byte(tbl, ARM_L2CACHE_ATTRIBUTES); + build_append_int_noprefix(tbl, ARM_L2CACHE_LINE_SIZE, 2); + break; + case ARM_L3_CACHE: /* L3 cache info */ + build_append_int_noprefix(tbl, ARM_L3CACHE_SIZE, 4); + build_append_int_noprefix(tbl, ARM_L3CACHE_SETS, 4); + build_append_byte(tbl, ARM_L3CACHE_ASSOCIATIVITY); + build_append_byte(tbl, ARM_L3CACHE_ATTRIBUTES); + build_append_int_noprefix(tbl, ARM_L3CACHE_LINE_SIZE, 2); + break; + default: + build_append_int_noprefix(tbl, 0, 4); + build_append_int_noprefix(tbl, 0, 4); + build_append_byte(tbl, 0); + build_append_byte(tbl, 0); + build_append_int_noprefix(tbl, 0, 2); + } +} + +/* + * ACPI spec, Revision 6.3 + * 5.2.29 Processor Properties Topology Table (PPTT) + */ +void build_pptt(GArray *table_data, BIOSLinker *linker, MachineState *ms, + const char *oem_id, const char *oem_table_id) +{ + MachineClass *mc = MACHINE_GET_CLASS(ms); + GQueue *list = g_queue_new(); + guint pptt_start = table_data->len; + guint parent_offset; + guint length, i; + int uid = 0; + int socket; + AcpiTable table = { .sig = "PPTT", .rev = 2, + .oem_id = oem_id, .oem_table_id = oem_table_id }; + + acpi_table_begin(&table, table_data); + + for (socket = 0; socket < ms->smp.sockets; socket++) { + uint32_t l3_cache_offset = table_data->len - pptt_start; + build_cache_hierarchy_node(table_data, 0, ARM_L3_CACHE); + + g_queue_push_tail(list, + GUINT_TO_POINTER(table_data->len - pptt_start)); + build_processor_hierarchy_node( + table_data, + /* + * Physical package - represents the boundary + * of a physical package + */ + (1 << 0), + 0, socket, &l3_cache_offset, 1); + } + + if (mc->smp_props.clusters_supported) { + length = g_queue_get_length(list); + for (i = 0; i < 
length; i++) { + int cluster; + + parent_offset = GPOINTER_TO_UINT(g_queue_pop_head(list)); + for (cluster = 0; cluster < ms->smp.clusters; cluster++) { + g_queue_push_tail(list, + GUINT_TO_POINTER(table_data->len - pptt_start)); + build_processor_hierarchy_node( + table_data, + (0 << 0), /* not a physical package */ + parent_offset, cluster, NULL, 0); + } + } + } + + length = g_queue_get_length(list); + for (i = 0; i < length; i++) { + int core; + + parent_offset = GPOINTER_TO_UINT(g_queue_pop_head(list)); + for (core = 0; core < ms->smp.cores; core++) { + uint32_t priv_rsrc[3] = {}; + priv_rsrc[0] = table_data->len - pptt_start; /* L2 cache offset */ + build_cache_hierarchy_node(table_data, 0, ARM_L2_CACHE); + + priv_rsrc[1] = table_data->len - pptt_start; /* L1 dcache offset */ + build_cache_hierarchy_node(table_data, priv_rsrc[0], ARM_L1D_CACHE); + + priv_rsrc[2] = table_data->len - pptt_start; /* L1 icache offset */ + build_cache_hierarchy_node(table_data, priv_rsrc[0], ARM_L1I_CACHE); + + if (ms->smp.threads > 1) { + g_queue_push_tail(list, + GUINT_TO_POINTER(table_data->len - pptt_start)); + build_processor_hierarchy_node( + table_data, + (0 << 0), /* not a physical package */ + parent_offset, core, priv_rsrc, 3); + } else { + build_processor_hierarchy_node( + table_data, + (1 << 1) | /* ACPI Processor ID valid */ + (1 << 3), /* Node is a Leaf */ + parent_offset, uid++, priv_rsrc, 3); + } + } + } + + length = g_queue_get_length(list); + for (i = 0; i < length; i++) { + int thread; + + parent_offset = GPOINTER_TO_UINT(g_queue_pop_head(list)); + for (thread = 0; thread < ms->smp.threads; thread++) { + build_processor_hierarchy_node( + table_data, + (1 << 1) | /* ACPI Processor ID valid */ + (1 << 2) | /* Processor is a Thread */ + (1 << 3), /* Node is a Leaf */ + parent_offset, uid++, NULL, 0); + } + } + + g_queue_free(list); + acpi_table_end(linker, &table); +} + +#else /* * ACPI spec, Revision 6.3 * 5.2.29 Processor Properties Topology Table (PPTT) @@ -2069,6 +2226,7 @@ void build_pptt(GArray *table_data, BIOSLinker *linker, MachineState *ms, acpi_table_end(linker, &table); } +#endif /* build rev1/rev3/rev5.1/rev6.0 FADT */ void build_fadt(GArray *tbl, BIOSLinker *linker, const AcpiFadtData *f, diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 500a15aa5b..b82bd1b8c8 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -379,6 +379,72 @@ static void fdt_add_timer_nodes(const VirtMachineState *vms) INTID_TO_PPI(ARCH_TIMER_NS_EL2_IRQ), irqflags); } +static void fdt_add_l3cache_nodes(const VirtMachineState *vms) +{ + int i; + const MachineState *ms = MACHINE(vms); + int cpus_per_socket = ms->smp.clusters * ms->smp.cores * ms->smp.threads; + int sockets = (ms->smp.cpus + cpus_per_socket - 1) / cpus_per_socket; + + for (i = 0; i < sockets; i++) { + char *nodename = g_strdup_printf("/cpus/l3-cache%d", i); + + qemu_fdt_add_subnode(ms->fdt, nodename); + qemu_fdt_setprop_string(ms->fdt, nodename, "compatible", "cache"); + qemu_fdt_setprop_string(ms->fdt, nodename, "cache-unified", "true"); + qemu_fdt_setprop_cell(ms->fdt, nodename, "cache-level", 3); + qemu_fdt_setprop_cell(ms->fdt, nodename, "cache-size", 0x2000000); + qemu_fdt_setprop_cell(ms->fdt, nodename, "cache-line-size", 128); + qemu_fdt_setprop_cell(ms->fdt, nodename, "cache-sets", 2048); + qemu_fdt_setprop_cell(ms->fdt, nodename, "phandle", + qemu_fdt_alloc_phandle(ms->fdt)); + g_free(nodename); + } +} + +static void fdt_add_l2cache_nodes(const VirtMachineState *vms) +{ + const MachineState *ms = MACHINE(vms); + int cpus_per_socket = 
ms->smp.clusters * ms->smp.cores * ms->smp.threads; + int cpu; + + for (cpu = 0; cpu < ms->smp.cpus; cpu++) { + char *next_path = g_strdup_printf("/cpus/l3-cache%d", + cpu / cpus_per_socket); + char *nodename = g_strdup_printf("/cpus/l2-cache%d", cpu); + + qemu_fdt_add_subnode(ms->fdt, nodename); + qemu_fdt_setprop_string(ms->fdt, nodename, "compatible", "cache"); + qemu_fdt_setprop_cell(ms->fdt, nodename, "cache-size", 0x80000); + qemu_fdt_setprop_cell(ms->fdt, nodename, "cache-line-size", 64); + qemu_fdt_setprop_cell(ms->fdt, nodename, "cache-sets", 1024); + qemu_fdt_setprop_phandle(ms->fdt, nodename, "next-level-cache", + next_path); + qemu_fdt_setprop_cell(ms->fdt, nodename, "phandle", + qemu_fdt_alloc_phandle(ms->fdt)); + + g_free(next_path); + g_free(nodename); + } +} + +static void fdt_add_l1cache_prop(const VirtMachineState *vms, + char *nodename, int cpu) +{ + const MachineState *ms = MACHINE(vms); + char *cachename = g_strdup_printf("/cpus/l2-cache%d", cpu); + + qemu_fdt_setprop_cell(ms->fdt, nodename, "d-cache-size", 0x10000); + qemu_fdt_setprop_cell(ms->fdt, nodename, "d-cache-line-size", 64); + qemu_fdt_setprop_cell(ms->fdt, nodename, "d-cache-sets", 256); + qemu_fdt_setprop_cell(ms->fdt, nodename, "i-cache-size", 0x10000); + qemu_fdt_setprop_cell(ms->fdt, nodename, "i-cache-line-size", 64); + qemu_fdt_setprop_cell(ms->fdt, nodename, "i-cache-sets", 256); + qemu_fdt_setprop_phandle(ms->fdt, nodename, "next-level-cache", + cachename); + g_free(cachename); +} + static void fdt_add_cpu_nodes(const VirtMachineState *vms) { int cpu; @@ -413,6 +479,11 @@ static void fdt_add_cpu_nodes(const VirtMachineState *vms) qemu_fdt_setprop_cell(ms->fdt, "/cpus", "#address-cells", addr_cells); qemu_fdt_setprop_cell(ms->fdt, "/cpus", "#size-cells", 0x0); + if (!vmc->no_cpu_topology) { + fdt_add_l3cache_nodes(vms); + fdt_add_l2cache_nodes(vms); + } + for (cpu = smp_cpus - 1; cpu >= 0; cpu--) { char *nodename = g_strdup_printf("/cpus/cpu@%d", cpu); ARMCPU *armcpu = ARM_CPU(qemu_get_cpu(cpu)); @@ -442,6 +513,7 @@ static void fdt_add_cpu_nodes(const VirtMachineState *vms) } if (!vmc->no_cpu_topology) { + fdt_add_l1cache_prop(vms, nodename, cpu); qemu_fdt_setprop_cell(ms->fdt, nodename, "phandle", qemu_fdt_alloc_phandle(ms->fdt)); } diff --git a/include/hw/acpi/aml-build.h b/include/hw/acpi/aml-build.h index ff2a310270..84ded2ecd3 100644 --- a/include/hw/acpi/aml-build.h +++ b/include/hw/acpi/aml-build.h @@ -221,6 +221,53 @@ struct AcpiBuildTables { BIOSLinker *linker; } AcpiBuildTables; +#ifdef __aarch64__ +/* Definitions of the hardcoded cache info*/ + +typedef enum { + ARM_L1D_CACHE, + ARM_L1I_CACHE, + ARM_L2_CACHE, + ARM_L3_CACHE +} ArmCacheType; + +/* L1 data cache: */ +#define ARM_L1DCACHE_SIZE 65536 +#define ARM_L1DCACHE_SETS 256 +#define ARM_L1DCACHE_ASSOCIATIVITY 4 +#define ARM_L1DCACHE_ATTRIBUTES 2 +#define ARM_L1DCACHE_LINE_SIZE 64 + +/* L1 instruction cache: */ +#define ARM_L1ICACHE_SIZE 65536 +#define ARM_L1ICACHE_SETS 256 +#define ARM_L1ICACHE_ASSOCIATIVITY 4 +#define ARM_L1ICACHE_ATTRIBUTES 4 +#define ARM_L1ICACHE_LINE_SIZE 64 + +/* Level 2 unified cache: */ +#define ARM_L2CACHE_SIZE 524288 +#define ARM_L2CACHE_SETS 1024 +#define ARM_L2CACHE_ASSOCIATIVITY 8 +#define ARM_L2CACHE_ATTRIBUTES 10 +#define ARM_L2CACHE_LINE_SIZE 64 + +/* Level 3 unified cache: */ +#define ARM_L3CACHE_SIZE 33554432 +#define ARM_L3CACHE_SETS 2048 +#define ARM_L3CACHE_ASSOCIATIVITY 15 +#define ARM_L3CACHE_ATTRIBUTES 10 +#define ARM_L3CACHE_LINE_SIZE 128 + +struct offset_status { + uint32_t parent; + uint32_t 
l2_offset; + uint32_t l1d_offset; + uint32_t l1i_offset; +}; + +#endif + typedef struct CrsRangeEntry { uint64_t base; -- Gitee From ebe05c34a66969e4cacc4d6c030dfe93ace89cb2 Mon Sep 17 00:00:00 2001 From: Ying Fang Date: Tue, 19 Mar 2024 14:35:55 +0800 Subject: [PATCH 152/939] arm64: Add the cpufreq device to show cpufreq info to guest On ARM64 platform, cpu frequency is retrieved via ACPI CPPC. A virtual cpufreq device based on ACPI CPPC is created to present cpu frequency info to the guest. The default frequency is set to host cpu nominal frequency, which is obtained from the host CPPC sysfs. Other performance data are set to the same value, since we don't support guest performance scaling here. Performance counters are also not emulated and they simply return 1 if read, and guest should fallback to use desired performance value as the current performance. Guest kernel version above 4.18 is required to make it work. This series is backported from: https://patchwork.kernel.org/cover/11379943/ Signed-off-by: Ying Fang Signed-off-by: Yanan Wang Signed-off-by: Yuan Zhang --- configs/devices/aarch64-softmmu/default.mak | 1 + hw/acpi/aml-build.c | 22 ++ hw/acpi/cpufreq.c | 283 ++++++++++++++++++++ hw/acpi/meson.build | 1 + hw/arm/virt-acpi-build.c | 79 +++++- hw/arm/virt.c | 13 + hw/char/Kconfig | 4 + include/hw/acpi/acpi-defs.h | 40 +++ include/hw/acpi/aml-build.h | 3 + include/hw/arm/virt.h | 1 + 10 files changed, 444 insertions(+), 3 deletions(-) create mode 100644 hw/acpi/cpufreq.c diff --git a/configs/devices/aarch64-softmmu/default.mak b/configs/devices/aarch64-softmmu/default.mak index f82a04c27d..8d66d0f1af 100644 --- a/configs/devices/aarch64-softmmu/default.mak +++ b/configs/devices/aarch64-softmmu/default.mak @@ -8,3 +8,4 @@ include ../arm-softmmu/default.mak # CONFIG_XLNX_ZYNQMP_ARM=n # CONFIG_XLNX_VERSAL=n # CONFIG_SBSA_REF=n +# CONFIG_CPUFREQ=n diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c index 2968df5562..714498165a 100644 --- a/hw/acpi/aml-build.c +++ b/hw/acpi/aml-build.c @@ -1554,6 +1554,28 @@ Aml *aml_sleep(uint64_t msec) return var; } +/* ACPI 5.0b: 6.4.3.7 Generic Register Descriptor */ +Aml *aml_generic_register(AmlRegionSpace rs, uint8_t reg_width, + uint8_t reg_offset, AmlAccessType type, uint64_t addr) +{ + int i; + Aml *var = aml_alloc(); + build_append_byte(var->buf, 0x82); /* Generic Register Descriptor */ + build_append_byte(var->buf, 0x0C); /* Length, bits[7:0] value = 0x0C */ + build_append_byte(var->buf, 0); /* Length, bits[15:8] value = 0 */ + build_append_byte(var->buf, rs); /* Address Space ID */ + build_append_byte(var->buf, reg_width); /* Register Bit Width */ + build_append_byte(var->buf, reg_offset); /* Register Bit Offset */ + build_append_byte(var->buf, type); /* Access Size */ + + /* Register address */ + for (i = 0; i < 8; i++) { + build_append_byte(var->buf, extract64(addr, i * 8, 8)); + } + + return var; +} + static uint8_t Hex2Byte(const char *src) { int hi, lo; diff --git a/hw/acpi/cpufreq.c b/hw/acpi/cpufreq.c new file mode 100644 index 0000000000..a84db490b3 --- /dev/null +++ b/hw/acpi/cpufreq.c @@ -0,0 +1,283 @@ +/* + * ACPI CPPC register device + * + * Support for showing CPU frequency in guest OS. + * + * Copyright (c) 2019 HUAWEI TECHNOLOGIES CO.,LTD. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + + * You should have received a copy of the GNU General Public License along + * with this program; if not, see . + */ + +#include "qemu/osdep.h" +#include "hw/sysbus.h" +#include "chardev/char.h" +#include "qemu/log.h" +#include "trace.h" +#include "qemu/option.h" +#include "sysemu/sysemu.h" +#include "hw/acpi/acpi-defs.h" +#include "qemu/cutils.h" +#include "qemu/error-report.h" +#include "hw/boards.h" + +#define TYPE_CPUFREQ "cpufreq" +#define CPUFREQ(obj) OBJECT_CHECK(CpuhzState, (obj), TYPE_CPUFREQ) +#define NOMINAL_FREQ_FILE "/sys/devices/system/cpu/cpu0/acpi_cppc/nominal_freq" +#define CPU_MAX_FREQ_FILE "/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq" +#define HZ_MAX_LENGTH 1024 +#define MAX_SUPPORT_SPACE 0x10000 + +/* + * Since Hi1616 will not support CPPC, we simply use its nominal frequency as + * the default. + */ +#define DEFAULT_HZ 2400 + +int cppc_regs_offset[CPPC_REG_COUNT] = { + [HIGHEST_PERF] = 0, + [NOMINAL_PERF] = 4, + [LOW_NON_LINEAR_PERF] = 8, + [LOWEST_PERF] = 12, + [GUARANTEED_PERF] = 16, + [DESIRED_PERF] = 20, + [MIN_PERF] = -1, + [MAX_PERF] = -1, + [PERF_REDUC_TOLERANCE] = -1, + [TIME_WINDOW] = -1, + [CTR_WRAP_TIME] = -1, + [REFERENCE_CTR] = 24, + [DELIVERED_CTR] = 32, + [PERF_LIMITED] = 40, + [ENABLE] = -1, + [AUTO_SEL_ENABLE] = -1, + [AUTO_ACT_WINDOW] = -1, + [ENERGY_PERF] = -1, + [REFERENCE_PERF] = -1, + [LOWEST_FREQ] = 44, + [NOMINAL_FREQ] = 48, +}; + +typedef struct CpuhzState { + SysBusDevice parent_obj; + + MemoryRegion iomem; + uint32_t HighestPerformance; + uint32_t NominalPerformance; + uint32_t LowestNonlinearPerformance; + uint32_t LowestPerformance; + uint32_t GuaranteedPerformance; + uint32_t DesiredPerformance; + uint64_t ReferencePerformanceCounter; + uint64_t DeliveredPerformanceCounter; + uint32_t PerformanceLimited; + uint32_t LowestFreq; + uint32_t NominalFreq; + uint32_t reg_size; +} CpuhzState; + + +static uint64_t cpufreq_read(void *opaque, hwaddr offset, unsigned size) +{ + CpuhzState *s = (CpuhzState *)opaque; + uint64_t r; + uint64_t n; + + MachineState *ms = MACHINE(qdev_get_machine()); + unsigned int smp_cpus = ms->smp.cpus; + + if (offset >= smp_cpus * CPPC_REG_PER_CPU_STRIDE) { + warn_report("cpufreq_read: offset 0x%lx out of range", offset); + return 0; + } + + n = offset % CPPC_REG_PER_CPU_STRIDE; + switch (n) { + case 0: + r = s->HighestPerformance; + break; + case 4: + r = s->NominalPerformance; + break; + case 8: + r = s->LowestNonlinearPerformance; + break; + case 12: + r = s->LowestPerformance; + break; + case 16: + r = s->GuaranteedPerformance; + break; + case 20: + r = s->DesiredPerformance; + break; + /* + * We don't have real counters and it is hard to emulate, so always set the + * counter value to 1 to rely on Linux to use the DesiredPerformance value + * directly. + */ + case 24: + r = s->ReferencePerformanceCounter; + break; + /* + * Guest may still access the register by 32bit; add the process to + * eliminate unnecessary warnings. 
+ */ + case 28: + r = s->ReferencePerformanceCounter >> 32; + break; + case 32: + r = s->DeliveredPerformanceCounter; + break; + case 36: + r = s->DeliveredPerformanceCounter >> 32; + break; + + case 40: + r = s->PerformanceLimited; + break; + case 44: + r = s->LowestFreq; + break; + case 48: + r = s->NominalFreq; + break; + default: + error_printf("cpufreq_read: Bad offset 0x%lx\n", offset); + r = 0; + break; + } + return r; +} + +static void cpufreq_write(void *opaque, hwaddr offset, + uint64_t value, unsigned size) +{ + uint64_t n; + MachineState *ms = MACHINE(qdev_get_machine()); + unsigned int smp_cpus = ms->smp.cpus; + + if (offset >= smp_cpus * CPPC_REG_PER_CPU_STRIDE) { + error_printf("cpufreq_write: offset 0x%lx out of range", offset); + return; + } + + n = offset % CPPC_REG_PER_CPU_STRIDE; + + switch (n) { + case 20: + break; + default: + error_printf("cpufreq_write: Bad offset 0x%lx\n", offset); + } +} + +static uint32_t CPPC_Read(const char *hostpath) +{ + int fd; + char buffer[HZ_MAX_LENGTH] = { 0 }; + uint64_t hz; + int len; + const char *endptr = NULL; + int ret; + + fd = qemu_open_old(hostpath, O_RDONLY); + if (fd < 0) { + return 0; + } + + len = read(fd, buffer, HZ_MAX_LENGTH); + qemu_close(fd); + if (len <= 0) { + return 0; + } + ret = qemu_strtoul(buffer, &endptr, 0, &hz); + if (ret < 0) { + return 0; + } + return (uint32_t)hz; +} + +static const MemoryRegionOps cpufreq_ops = { + .read = cpufreq_read, + .write = cpufreq_write, + .endianness = DEVICE_NATIVE_ENDIAN, +}; + +static void hz_init(CpuhzState *s) +{ + uint32_t hz; + + hz = CPPC_Read(NOMINAL_FREQ_FILE); + if (hz == 0) { + hz = CPPC_Read(CPU_MAX_FREQ_FILE); + if (hz == 0) { + hz = DEFAULT_HZ; + } else { + /* Value in CpuMaxFrequency is in KHz unit; convert to MHz */ + hz = hz / 1000; + } + } + + s->HighestPerformance = hz; + s->NominalPerformance = hz; + s->LowestNonlinearPerformance = hz; + s->LowestPerformance = hz; + s->GuaranteedPerformance = hz; + s->DesiredPerformance = hz; + s->ReferencePerformanceCounter = 1; + s->DeliveredPerformanceCounter = 1; + s->PerformanceLimited = 0; + s->LowestFreq = hz; + s->NominalFreq = hz; +} + +static void cpufreq_init(Object *obj) +{ + SysBusDevice *sbd = SYS_BUS_DEVICE(obj); + CpuhzState *s = CPUFREQ(obj); + + MachineState *ms = MACHINE(qdev_get_machine()); + unsigned int smp_cpus = ms->smp.cpus; + + s->reg_size = smp_cpus * CPPC_REG_PER_CPU_STRIDE; + if (s->reg_size > MAX_SUPPORT_SPACE) { + error_report("Required space 0x%x excesses the max support 0x%x", + s->reg_size, MAX_SUPPORT_SPACE); + goto err_end; + } + + memory_region_init_io(&s->iomem, OBJECT(s), &cpufreq_ops, s, "cpufreq", + s->reg_size); + sysbus_init_mmio(sbd, &s->iomem); + hz_init(s); + return; + +err_end: + /* Set desired perf register offset to -1 to indicate no support for CPPC */ + cppc_regs_offset[DESIRED_PERF] = -1; +} + +static const TypeInfo cpufreq_arm_info = { + .name = TYPE_CPUFREQ, + .parent = TYPE_SYS_BUS_DEVICE, + .instance_size = sizeof(CpuhzState), + .instance_init = cpufreq_init, +}; + +static void cpufreq_register_types(void) +{ + type_register_static(&cpufreq_arm_info); +} + +type_init(cpufreq_register_types) diff --git a/hw/acpi/meson.build b/hw/acpi/meson.build index fc1b952379..d36b10ea3c 100644 --- a/hw/acpi/meson.build +++ b/hw/acpi/meson.build @@ -27,6 +27,7 @@ acpi_ss.add(when: 'CONFIG_ACPI_ICH9', if_true: files('ich9.c', 'ich9_tco.c')) acpi_ss.add(when: 'CONFIG_ACPI_ERST', if_true: files('erst.c')) acpi_ss.add(when: 'CONFIG_IPMI', if_true: files('ipmi.c'), if_false: 
files('ipmi-stub.c')) acpi_ss.add(when: 'CONFIG_PC', if_false: files('acpi-x86-stub.c')) +acpi_ss.add(when: 'CONFIG_CPUFREQ', if_true: files('cpufreq.c')) if have_tpm acpi_ss.add(files('tpm.c')) endif diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c index 8bc35a483c..3cb50bdc65 100644 --- a/hw/arm/virt-acpi-build.c +++ b/hw/arm/virt-acpi-build.c @@ -63,7 +63,68 @@ #define ACPI_BUILD_TABLE_SIZE 0x20000 -static void acpi_dsdt_add_cpus(Aml *scope, VirtMachineState *vms) +static void acpi_dsdt_add_psd(Aml *dev, int cpus) +{ + Aml *pkg; + Aml *sub; + + sub = aml_package(5); + aml_append(sub, aml_int(5)); + aml_append(sub, aml_int(0)); + /* Assume all vCPUs belong to the same domain */ + aml_append(sub, aml_int(0)); + /* SW_ANY: OSPM coordinate, initiate on any processor */ + aml_append(sub, aml_int(0xFD)); + aml_append(sub, aml_int(cpus)); + + pkg = aml_package(1); + aml_append(pkg, sub); + + aml_append(dev, aml_name_decl("_PSD", pkg)); +} + +static void acpi_dsdt_add_cppc(Aml *dev, uint64_t cpu_base, int *regs_offset) +{ + Aml *cpc; + int i; + + /* Use version 3 of CPPC table from ACPI 6.3 */ + cpc = aml_package(23); + aml_append(cpc, aml_int(23)); + aml_append(cpc, aml_int(3)); + + for (i = 0; i < CPPC_REG_COUNT; i++) { + Aml *res; + uint8_t reg_width; + uint8_t acc_type; + uint64_t addr; + + if (regs_offset[i] == -1) { + reg_width = 0; + acc_type = AML_ANY_ACC; + addr = 0; + } else { + addr = cpu_base + regs_offset[i]; + if (i == REFERENCE_CTR || i == DELIVERED_CTR) { + reg_width = 64; + acc_type = AML_QWORD_ACC; + } else { + reg_width = 32; + acc_type = AML_DWORD_ACC; + } + } + + res = aml_resource_template(); + aml_append(res, aml_generic_register(AML_SYSTEM_MEMORY, reg_width, 0, + acc_type, addr)); + aml_append(cpc, res); + } + + aml_append(dev, aml_name_decl("_CPC", cpc)); +} + +static void acpi_dsdt_add_cpus(Aml *scope, VirtMachineState *vms, + const MemMapEntry *cppc_memmap) { MachineState *ms = MACHINE(vms); uint16_t i; @@ -72,7 +133,19 @@ static void acpi_dsdt_add_cpus(Aml *scope, VirtMachineState *vms) Aml *dev = aml_device("C%.03X", i); aml_append(dev, aml_name_decl("_HID", aml_string("ACPI0007"))); aml_append(dev, aml_name_decl("_UID", aml_int(i))); - aml_append(scope, dev); + + /* + * Append _CPC and _PSD to support CPU frequence show + * Check CPPC available by DESIRED_PERF register + */ + if (cppc_regs_offset[DESIRED_PERF] != -1) { + acpi_dsdt_add_cppc(dev, + cppc_memmap->base + i * CPPC_REG_PER_CPU_STRIDE, + cppc_regs_offset); + acpi_dsdt_add_psd(dev, ms->smp.cpus); + } + + aml_append(scope, dev); } } @@ -858,7 +931,7 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) * the RTC ACPI device at all when using UEFI. 
*/ scope = aml_scope("\\_SB"); - acpi_dsdt_add_cpus(scope, vms); + acpi_dsdt_add_cpus(scope, vms, &memmap[VIRT_CPUFREQ]); acpi_dsdt_add_uart(scope, &memmap[VIRT_UART], (irqmap[VIRT_UART] + ARM_SPI_BASE)); if (vmc->acpi_expose_flash) { diff --git a/hw/arm/virt.c b/hw/arm/virt.c index b82bd1b8c8..c19cacec8b 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -157,6 +157,7 @@ static const MemMapEntry base_memmap[] = { [VIRT_PVTIME] = { 0x090a0000, 0x00010000 }, [VIRT_SECURE_GPIO] = { 0x090b0000, 0x00001000 }, [VIRT_MMIO] = { 0x0a000000, 0x00000200 }, + [VIRT_CPUFREQ] = { 0x0b000000, 0x00010000 }, /* ...repeating for a total of NUM_VIRTIO_TRANSPORTS, each of that size */ [VIRT_PLATFORM_BUS] = { 0x0c000000, 0x02000000 }, [VIRT_SECURE_MEM] = { 0x0e000000, 0x01000000 }, @@ -980,6 +981,16 @@ static void create_uart(const VirtMachineState *vms, int uart, g_free(nodename); } +static void create_cpufreq(const VirtMachineState *vms, MemoryRegion *mem) +{ + hwaddr base = vms->memmap[VIRT_CPUFREQ].base; + DeviceState *dev = qdev_new("cpufreq"); + SysBusDevice *s = SYS_BUS_DEVICE(dev); + + sysbus_realize_and_unref(s, &error_fatal); + memory_region_add_subregion(mem, base, sysbus_mmio_get_region(s, 0)); +} + static void create_rtc(const VirtMachineState *vms) { char *nodename; @@ -2346,6 +2357,8 @@ static void machvirt_init(MachineState *machine) create_uart(vms, VIRT_UART, sysmem, serial_hd(0)); + create_cpufreq(vms, sysmem); + if (vms->secure) { create_secure_ram(vms, secure_sysmem, secure_tag_sysmem); create_uart(vms, VIRT_SECURE_UART, secure_sysmem, serial_hd(1)); diff --git a/hw/char/Kconfig b/hw/char/Kconfig index 6b6cf2fc1d..335a60c2c1 100644 --- a/hw/char/Kconfig +++ b/hw/char/Kconfig @@ -71,3 +71,7 @@ config GOLDFISH_TTY config SHAKTI_UART bool + +config CPUFREQ + bool + default y diff --git a/include/hw/acpi/acpi-defs.h b/include/hw/acpi/acpi-defs.h index 2b42e4192b..b1f389fb4b 100644 --- a/include/hw/acpi/acpi-defs.h +++ b/include/hw/acpi/acpi-defs.h @@ -93,4 +93,44 @@ typedef struct AcpiFadtData { #define ACPI_FADT_ARM_PSCI_COMPLIANT (1 << 0) #define ACPI_FADT_ARM_PSCI_USE_HVC (1 << 1) +/* + * CPPC register definition from kernel header + * include/acpi/cppc_acpi.h + * The last element is newly added for easy use + */ +enum cppc_regs { + HIGHEST_PERF, + NOMINAL_PERF, + LOW_NON_LINEAR_PERF, + LOWEST_PERF, + GUARANTEED_PERF, + DESIRED_PERF, + MIN_PERF, + MAX_PERF, + PERF_REDUC_TOLERANCE, + TIME_WINDOW, + CTR_WRAP_TIME, + REFERENCE_CTR, + DELIVERED_CTR, + PERF_LIMITED, + ENABLE, + AUTO_SEL_ENABLE, + AUTO_ACT_WINDOW, + ENERGY_PERF, + REFERENCE_PERF, + LOWEST_FREQ, + NOMINAL_FREQ, + CPPC_REG_COUNT, +}; + +#define CPPC_REG_PER_CPU_STRIDE 0x40 + +/* + * Offset for each CPPC register; -1 for unavailable + * + * Offset for each CPPC register; -1 for unavailable + * The whole register space is unavailable if desired perf offset is -1. + */ +extern int cppc_regs_offset[CPPC_REG_COUNT]; + #endif diff --git a/include/hw/acpi/aml-build.h b/include/hw/acpi/aml-build.h index 84ded2ecd3..200cb113de 100644 --- a/include/hw/acpi/aml-build.h +++ b/include/hw/acpi/aml-build.h @@ -429,6 +429,9 @@ Aml *aml_dma(AmlDmaType typ, AmlDmaBusMaster bm, AmlTransferSize sz, uint8_t channel); Aml *aml_sleep(uint64_t msec); Aml *aml_i2c_serial_bus_device(uint16_t address, const char *resource_source); +Aml *aml_generic_register(AmlRegionSpace rs, uint8_t reg_width, + uint8_t reg_offset, AmlAccessType type, + uint64_t addr); /* Block AML object primitives */ Aml *aml_scope(const char *name_format, ...) 
G_GNUC_PRINTF(1, 2); diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h index f69239850e..e944d434c4 100644 --- a/include/hw/arm/virt.h +++ b/include/hw/arm/virt.h @@ -60,6 +60,7 @@ enum { VIRT_GIC_REDIST, VIRT_SMMU, VIRT_UART, + VIRT_CPUFREQ, VIRT_MMIO, VIRT_RTC, VIRT_FW_CFG, -- Gitee From b062e2f182af4c44fbd3a03eda9c934686037032 Mon Sep 17 00:00:00 2001 From: Kunkun Jiang Date: Sat, 30 Mar 2024 20:16:32 +0800 Subject: [PATCH 153/939] tests: virt: Update expected ACPI tables for virt test Update the ACPI tables according to the acpi aml_build change, also empty bios-tables-test-allowed-diff.h. Signed-off-by: Kunkun Jiang --- tests/data/acpi/virt/DSDT | Bin 5196 -> 5669 bytes tests/data/acpi/virt/DSDT.acpihmatvirt | Bin 5282 -> 7178 bytes tests/data/acpi/virt/DSDT.memhp | Bin 6557 -> 7030 bytes tests/data/acpi/virt/DSDT.pxb | Bin 7679 -> 8152 bytes tests/data/acpi/virt/DSDT.topology | Bin 5398 -> 9190 bytes tests/data/acpi/virt/PPTT | Bin 76 -> 208 bytes tests/data/acpi/virt/PPTT.acpihmatvirt | Bin 156 -> 588 bytes tests/data/acpi/virt/PPTT.topology | Bin 336 -> 700 bytes tests/qtest/bios-tables-test-allowed-diff.h | 3 --- 9 files changed, 3 deletions(-) diff --git a/tests/data/acpi/virt/DSDT b/tests/data/acpi/virt/DSDT index c47503990715d389914fdf9c8bccb510761741ac..dd8573f0312918ea1ba17496e9f3a275e98ab214 100644 GIT binary patch delta 514 zcmX@3u~di4CDC{9nX`Y4z#+dkEr}*e5XZeW+lJy+%&H>Is4l?3g;#|yv zB3yz^JPZs949pA+4BSA>P|1KK$bwCf1Dhbw5KQd?n1b~T%pw?~AbX(zVLeO)NK;MF zfuWv70>gR-1{MVjL12(*Uo4O*Pi7GSAkZ%L delta 38 tcmZ3gb4G*9CD#+dkEr}*e5sfmV?o0XWDMF7oq3Hks4 diff --git a/tests/data/acpi/virt/DSDT.acpihmatvirt b/tests/data/acpi/virt/DSDT.acpihmatvirt index aee6ba017cd730948bfa93e91551eb10a6809293..2581d1937d531db5ad4c8146755a75199ebc0e5a 100644 GIT binary patch literal 7178 zcmcJU%WoT16vpqwPGXNAu^lJQvreH@1xn>d)1;+^*kdPgi4)^Vn@XwVN(j^v7Llls zN`(|6D5YWn4MHq~6bTkA_zO@aSb)T$3nY-(ut7qI1v^$?zB@B^&gq=&$VeH_`R3l= zz2Dq3p5sZyuB@69O7-N+E7qbtRld@&rmz`GDSfOz6YqX)Ty5Fq*7|lOYvppeP;0SP ziIuEI4Z(~GwJy~v*jo)NH8+tlGiD%}Hj~>?rKVH>fC&E7Ek(qjB!(mr5P$cCB&L0$ z9@g9TNAD>3{NKbm6&&%64spsS;)0y;iH9*Ik%`ey`bLMC>x!XP!>*)aM#}6pdzCW( zF@xT!=cQFQ4qElOE|IKyMG_Y!aY+&zlDO&x;QuKEMB9Z`#Q*Gjk`b zbU6@K38mieQ-*OA(Ji&UJv+Z<&AS=%Vzqt=ML_|5?bVVQLB?1m{gePAX~tbdPZ&>2 zr)Md6x!I{FmX@0<=sLYWY`5qB;HK*F2G3E1_i*9}J**GWqxKPb4^h^$@V@Cd!uh82 zF@l6qpbYP}d4!>F8y};OFp89M+P#pEFoF^zL>Q+jV_IbNN(_@QrYYl$$mo+8QNlPw z88afIUt+`wV}>$jMMg+sBnV@cGR}&Ou*66a##zG1dpE};>tafbG+~^h3^7glP0~sA zVZwNpGUPmsO7j>YjPsNs=V?r0j1k5K%8>IkE-@wu<2lNZ^K?LB93hP7DMQZFgv6L4 zj5*4X^E4?j9wUqrWypD&k{HJbqf8lco(@WkIkBQZ`9#sX!?c{(C7a)hx+8FHSEN{j+w)F?yF(=my0 znlLU>hMcG465|YEyg(Upo=!-NS;APN3^`9HCB`|zs1rs}&eJ0j<2+$3Q-++US&8u+ zVXSzJ<d&6`JKxsfa7X7y_fyxVZ2qOM}93)q*-%iXfXUTr3`=0rnTs-|96Q!0nQ z3Xa}TH`Gh`R2t$u@Egt5!KK&DtyCl!X}8saUCSx=*uZgKrJ`Cldk_>lK>$HfrDDNo zyKN}l2kpE__d>(Cd(P>Hrkzmt1m9_kD#>l@MY?CFDM}zcBAPlq5$c|uQSidXZR_+5 zDBZKuG~o6`Q>Q0F-4io%+6JATof!n3o}H#BnLL+HPlVJb-ZYRI^g2B|K6~AsoyjAl zKKqzY6s7Lmok37yQJ)B@&wl2!pZVJuULIlz1lFrNe5Cqn8IKWA`-I(eI5K2cIqp9rbXB=ebMK9k%hLh3Wc ze5ROBl%UinLh5sn`5a_E2f0s#)MuLcOf#P-S*cHi)aMZMImCPpai0jO&tc|snE6DB zOMN1wJ~PZ`hWX5Jp9rZ>{J13RIl_FRB&I$QQlF#D=P2_z%6%fFKF65PG3FB`H1&y) z`W$CI$C=M@?h_&PiP!eT=LGYKlAHQONPSK+pOehzB=?Ds`aHsX9$`LFqEnv;sn0C) znPon++$TcU=lXUns4cob-}FP-%FTBl#0|Vth?^1TRm9`YBOT9OpO)@x*-hKy`L<1EcG9($-+GV delta 73 zcmeCOSft7266_MPNQ8lbDP|&u`b|B?!U|={|kwt`sF}g{5 YvMh@fP*55uXbcn-Wn|vW&cQ4K0Iq@%+5i9m diff --git a/tests/data/acpi/virt/DSDT.memhp b/tests/data/acpi/virt/DSDT.memhp index 
bae36cdd397473afe3923c52f030641a5ab19d5d..d764481440adea59d928a1a48afe0ba1ce135597 100644 GIT binary patch delta 514 zcmbPh{LPHZCDGeb|X`aTdj4|=SPVv!A&hirtBFt7`TC$p^^bfkOi9{2R1>VA(+|)Fa_%wm_;x|LH0rc!g`npkfxfT z14BKF1cvnv3@i#5g1{irz!0R|P0=tT2>1zTy$*);KtThzAV+*au!|56qYx)67b^n; L*I&kso*|L|A3`us delta 38 ucmexnHrJTTCD8Drvuo#LaLq$V0lZdPInkpuwgSPN7D diff --git a/tests/data/acpi/virt/DSDT.pxb b/tests/data/acpi/virt/DSDT.pxb index fbd78f44c4785d19759daea909fe6d6f9a6e6b01..9ff22b5ea465d2f678beb2ce9d905861d69a5b87 100644 GIT binary patch delta 514 zcmexweZ!v1CD6}CMj=jCE>;Ex MuD^^MJ-5jM0391Kw*UYD delta 38 ucmca%|KFO+CDrRARvbsYu-*Rjp9Pf?Zdwd)~}=a&O-CuC^50_sqQC zeP`~o=gv6!N`CRrT}rk5o?EbIDkHfwC3^&qfKp1o*RQc_M_SHSE4k{@YCdJB)9Fxk zW;P$4vP-iF##N|#ayE}=yJROOhkC7Et2Lgq;;Rv*MpP>Ro%mB%6cK}x7?MOl%;7Of zOd6t2>peEJ*A=|}Z{nm1_8HkBju;{i$X-Ki#*{=RvhOppLrnW(s9LJz6VX7zYO~su zvi`9`Uez;F)dP*HKIs!l)$@`#BZ+g8Sdzp=Llm7iLqvhk7}+77GejKX^M=@rDTz#E zzhq>Gxa^B!)eHKwJ}On+YH3vV6(32;eq9o8O5$xvyd#PC3{kY+3=tLn!N?AA-4Jn% z?;Bz>rX(_v{eh7k;zM6-T=uM3_SQyaf8--c*&j>d6G?n3iO(eQxgmKM*ASaAC6S2=?=!MPO#5QvvX6;nZ);TcNgqi*?RiO@k;FMkEJ@;`A&S-(jO-B486uAHc|&Z*ltd=7Uox^oT=vDrWj`pEy}eP{S9~NX`*lgYDT%iw@s1?k zGepsPGelJQ2O~Sgbwk84zHf-ln3Bju_6J6Gh!1_SaoNYkvUfBp`y(Go%KlgqpGe|U zNqi=W&ka$u-V6~Heqm&X_|gz@j9(dIGo~amk^Qxi9pW2b6w7|tUMSWt+RMe6bUKsS zyl5|1;saLijj+;}`e7AQ>f3e|2<$|3MJ=sPOrN%=-4*k3p?DG({G%9KS)8&uv105g zeFX?a-0E@>En!>bAOzV%_Om3+bFL|0GybDi`&}j`si`qnF zJVZ{`)#sp;?5*cBM zksypigpu(+99!mLNsJ_69HtC$neq=weW~{l#s`!kuhWP$k3PaULK*TpjY^CG!Z=D9 z@;dF37(;|{j56ePx=mv2AdC+wLtdvbi7`SLlawK^)40TVi!i1rLtdu|iLsk7a+D#j z({72ehcIly7?sy)Qex~S4BU5fzLE8J;i$Y$w@Zw738O$6@;dF282bognlj{d+AA^k z6UK4Mkk@IS#7Gmy3}wjcv|nP362>fL$m?`KVjLih6OJjI6v)cSwvQgfUMU@;XgPjAMkc;4$WZD%rE%JWBJW^i+C6|3rb` zE!-%pyO_dpJm+%rep^ymEXPyUP)XTpR(+~QR2qMIy!%pJP@mvcX^3-zE0q_!=f1F3 z5}m=$T1_3V%%+w5K7jWbm56BFYC$ks4_XmqRU#UU)M^2x$6!A%(xb3rm%Ggwhh6KT z9tpm)FS;amUoX-l>${=_(iyRN9cNQ8PMX5{P} zbVk-^5OhY?cSXzOxpYP%q(1R00+~UZGqUcp%^g{vJVNTTo%uvl>dxIsf)5Mw!nj^BLtn5mKLB%x4$#i58LiL`Z#ZV?MVr zpWC=kgw!W4XK;k->o&%GqNSuh5mKLV<}=QG#<@>~)MtYEOfa8lL8(uK)Mq#I+0A@* zbDs#Q&m{AiWIoZdQlAK^&+W|TcII|s88m`}90)F(pfvzPhoWj=ekPlVJb zE-pzu`a(Bu>}Njvxle@D=K%9LzGralo;pM%WjAoDrMeIlejhnUYH z<`XS9^@))B9A-X;na^SF6Cw4vgZbRSe4<6CJ`qx%Ddsc9e5SZhgs#t})!CpfrF;3N zPs(U3^J>~c=&IPV%-vb|alhdc_X#L!w}5grA9X^{%4={L zO5ZP_thWW0t8tds=yDeB8&K9Bf#qr*D;cZWxhb`Q?@$$+#FKkCW;(3PH>0WrU*~w# z^i$v)e7Yj&E}TcKR{K|9mf=y`k1c@+dI0xR%`aCM<#2X9?ecK&o`eWqjWe7xI=Efs&u{+$|{{QmUisYgeD{p{97 zD~KIN@fyNwc3B6Squ*pT>Z*RSQu=v4c6aSU;A?EjID2Q-d+x5)_jBc#-nguH*80Z9 zAG=&#C&_GFOuk>uSBk;)o$hR&jsNmTW$dfscfEfV`+v5J>*2M})keHE5YSJ!(NzqV z3l+8+^^GpQ=CW4WI6h7daCB0Wn}%2AajyrUvP)1*hiiW~gaOqm74(&pvFgKDMt`<%$hL?44Zaubk^_->qy&I*| T<(##$wA!!V(9e_Y=2HIwn^<{F delta 118 zcmaFnK23|uCD7$_*p!3^P;fH^D>jwzVK3gMW6I4o>H&g2igQkx(1Gm8KKiF+I8 diff --git a/tests/data/acpi/virt/PPTT b/tests/data/acpi/virt/PPTT index 7a1258ecf123555b24462c98ccbb76b4ac1d0c2b..b89b2a9c71e0bc2713fc38f5de68fbc39b6302cb 100644 GIT binary patch literal 208 zcmWFt2no5sz`($y>E!S15v<@85#X!<1dKp25F11@N-!|g18FE=V&Gt4;OA;!U;v7P z1UT7A73xEDj6|3JeTfK(!%23G#4~^x046;S0RR91 literal 76 zcmWFt2nq3FU|?Wc;pFe^5v<@85#X!<1dKp25F11@h%hjKX%HI*fMQ%gwhD|7qyeJ> B2LS*8 diff --git a/tests/data/acpi/virt/PPTT.acpihmatvirt b/tests/data/acpi/virt/PPTT.acpihmatvirt index 4eef303a5b6168c6bc3795c2e2c53f65b4c4cfd4..945a73fff3b5c93367999199f06dbe7294f594b4 100644 GIT binary patch literal 588 zcma)(F%E)26hw!Wg3#DdSW;Me1PcNd7A9^Yg-5XP2qs#11Pc$rF+7H)$FOh)(6Ac` z{4l$lVg7$hu(m8W1~7+wTda3!zPCln3IOP=J(vM!?rj-Fz%;%9^!5H*I?t&w`Fbw1 
zrJ55hz_Qtzwe^l2&z;Llgg_K$KVDU5Ihjvrdi*d0QXppbN~tE4dLPu28sa3nydWOO w@jZ@O#8GBJt*9Z+Nlkw^j^E+jndYyoIm+CqY6R!i!*P6%qi>Ovc~c3&AN*}20RR91 literal 156 zcmWFt2nm_Pz`(%t&&l7}BUr&HBEVSz2pEB4AU23*5Mf{d(;zks0L8d~Y!w(EL?em8 b)g$Re76a)`0AeN}1_P+x1R#eQBEkRwWK9VH diff --git a/tests/data/acpi/virt/PPTT.topology b/tests/data/acpi/virt/PPTT.topology index 3fbcae5ff08aaf16fedf4da45e941661d79c1174..b0762c0a73eab106bab475f2a177e159b0b01c67 100644 GIT binary patch literal 700 zcmaiyI}XAy5JZPyKB9mo1!W3KjzEE+p+F)MDL4QHhd_cuL&rG>8oYKrs+dflv?45YUMh?!vef$Csl%t&G&Cde(wdO>1GKm-gx_1*yTS+Iz)B8h>R aAic=uf$S9l3b27BK>%tVNQ@mK!T Date: Mon, 29 Jul 2019 16:16:35 +0800 Subject: [PATCH 154/939] pl011: reset read FIFO when UARTTIMSC=0 & UARTICR=0xffff We can enable ACPI when AArch64 Linux is booted with QEMU and UEFI (AAVMF). When VM is booting and the SBSA driver has not initialized, writting data that exceds 32 bytes will cause the read FIFO full and proceeding data will be lost. The searil port appears to be stuck in this abnormal situation. A hack to reset read FIFO when UARTTIMSC=0 & UARTICR=0xffff appears to resolve the issue. The question is fully discussed at https://www.spinics.net/lists/linux-serial/msg23163.html Signed-off-by: Haibin Wang Reviewed-by: Shannon Zhao Reviewed-by: Ying Fang Signed-off-by: Yan Wang --- hw/char/pl011.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/hw/char/pl011.c b/hw/char/pl011.c index 58edeb9ddb..bc65d778d2 100644 --- a/hw/char/pl011.c +++ b/hw/char/pl011.c @@ -314,6 +314,10 @@ static void pl011_write(void *opaque, hwaddr offset, case 17: /* UARTICR */ s->int_level &= ~value; pl011_update(s); + if (!s->int_enabled && !s->int_level) { + s->read_count = 0; + s->read_pos = 0; + } break; case 18: /* UARTDMACR */ s->dmacr = value; -- Gitee From c4829aa6fce007c995b21cfbd86de0473263c19a Mon Sep 17 00:00:00 2001 From: Dongxu Sun Date: Sat, 30 Mar 2024 12:49:05 +0800 Subject: [PATCH 155/939] shadow_dev: introduce shadow dev for virtio-net device for virtio net devices, create the shadow device for vlpi bypass inject supported. 
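In outline, enabling the bypass path means handing the host kernel every MSI-X message the guest has programmed for the virtio-net function, keyed by the device's PCI requester ID, so backend interrupts can be translated straight into vLPIs instead of round-tripping through userspace. A condensed sketch of that registration, using the helpers added later in this patch (capability check and error handling omitted):

    /* sketch only: one kvm_msi entry per allocated MSI-X vector */
    uint32_t nvec = msix_nr_vectors_allocated(dev);
    struct kvm_master_dev_info *mdi =
        g_malloc0(sizeof(uint32_t) + sizeof(struct kvm_msi) * nvec);

    mdi->nvectors = nvec;
    for (uint32_t v = 0; v < nvec; v++) {
        MSIMessage m = msix_get_message(dev, v);          /* guest-programmed vector */
        mdi->msi[v].address_lo = extract64(m.address, 0, 32);
        mdi->msi[v].address_hi = extract64(m.address, 32, 32);
        mdi->msi[v].data = le32_to_cpu(m.data);
        mdi->msi[v].flags = KVM_MSI_VALID_DEVID;
        mdi->msi[v].devid = pci_requester_id(dev);        /* identifies the function */
    }
    kvm_vm_ioctl(kvm_state, KVM_CREATE_SHADOW_DEV, mdi);  /* register with the host */
    g_free(mdi);

The real implementation below additionally guards this with a KVM_CAP_ARM_VIRT_MSI_BYPASS check and tears the shadow device down again when the vectors are released.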
Signed-off-by: Wang Haibin Signed-off-by: Yu Zenghui Signed-off-by: Chen Qun Signed-off-by: KunKun Jiang Signed-off-by: Dongxu Sun Signed-off-by: Yuan Zhang --- hw/virtio/virtio-pci.c | 32 ++++++++++++++++++++++++++ include/sysemu/kvm.h | 5 +++++ linux-headers/linux/kvm.h | 13 +++++++++++ target/arm/kvm.c | 47 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 97 insertions(+) diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index 134a8eaef6..f8adb0520a 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -922,18 +922,44 @@ undo: } return ret; } + +#ifdef __aarch64__ +int __attribute__((weak)) kvm_create_shadow_device(PCIDevice *dev) +{ + return 0; +} + +int __attribute__((weak)) kvm_delete_shadow_device(PCIDevice *dev) +{ + return 0; +} +#endif + static int kvm_virtio_pci_vector_vq_use(VirtIOPCIProxy *proxy, int nvqs) { int queue_no; int ret = 0; VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); +#ifdef __aarch64__ + if (!strcmp(vdev->name, "virtio-net")) { + kvm_create_shadow_device(&proxy->pci_dev); + } +#endif + for (queue_no = 0; queue_no < nvqs; queue_no++) { if (!virtio_queue_get_num(vdev, queue_no)) { return -1; } ret = kvm_virtio_pci_vector_use_one(proxy, queue_no); } + +#ifdef __aarch64__ + if (!strcmp(vdev->name, "virtio-net") && ret != 0) { + kvm_delete_shadow_device(&proxy->pci_dev); + } +#endif + return ret; } @@ -976,6 +1002,12 @@ static void kvm_virtio_pci_vector_vq_release(VirtIOPCIProxy *proxy, int nvqs) } kvm_virtio_pci_vector_release_one(proxy, queue_no); } + +#ifdef __aarch64__ + if (!strcmp(vdev->name, "virtio-net")) { + kvm_delete_shadow_device(&proxy->pci_dev); + } +#endif } static void kvm_virtio_pci_vector_config_release(VirtIOPCIProxy *proxy) diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index d614878164..b46d6203b4 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -538,4 +538,9 @@ bool kvm_arch_cpu_check_are_resettable(void); bool kvm_dirty_ring_enabled(void); uint32_t kvm_dirty_ring_size(void); + +#ifdef __aarch64__ +int kvm_create_shadow_device(PCIDevice *dev); +int kvm_delete_shadow_device(PCIDevice *dev); +#endif #endif diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h index 549fea3a97..56f6b2583f 100644 --- a/linux-headers/linux/kvm.h +++ b/linux-headers/linux/kvm.h @@ -1198,6 +1198,8 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES 229 #define KVM_CAP_ARM_SUPPORTED_REG_MASK_RANGES 230 +#define KVM_CAP_ARM_VIRT_MSI_BYPASS 799 + #ifdef KVM_CAP_IRQ_ROUTING struct kvm_irq_routing_irqchip { @@ -1524,6 +1526,17 @@ struct kvm_s390_ucas_mapping { #define KVM_XEN_HVM_CONFIG _IOW(KVMIO, 0x7a, struct kvm_xen_hvm_config) #define KVM_SET_CLOCK _IOW(KVMIO, 0x7b, struct kvm_clock_data) #define KVM_GET_CLOCK _IOR(KVMIO, 0x7c, struct kvm_clock_data) + +#ifdef __aarch64__ +struct kvm_master_dev_info +{ + __u32 nvectors; /* number of msi vectors */ + struct kvm_msi msi[0]; +}; +#define KVM_CREATE_SHADOW_DEV _IOW(KVMIO, 0xf0, struct kvm_master_dev_info) +#define KVM_DEL_SHADOW_DEV _IOW(KVMIO, 0xf1, __u32) +#endif + /* Available with KVM_CAP_PIT_STATE2 */ #define KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct kvm_pit_state2) #define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, struct kvm_pit_state2) diff --git a/target/arm/kvm.c b/target/arm/kvm.c index 7903e2ddde..f59f4f81b2 100644 --- a/target/arm/kvm.c +++ b/target/arm/kvm.c @@ -26,6 +26,8 @@ #include "trace.h" #include "internals.h" #include "hw/pci/pci.h" +#include "hw/pci/msi.h" +#include "hw/pci/msix.h" #include "exec/memattrs.h" #include 
"exec/address-spaces.h" #include "hw/boards.h" @@ -1053,6 +1055,51 @@ int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route, return 0; } +int kvm_create_shadow_device(PCIDevice *dev) +{ + KVMState *s = kvm_state; + struct kvm_master_dev_info *mdi; + MSIMessage msg; + uint32_t vector, nvectors = msix_nr_vectors_allocated(dev); + uint32_t request_id; + int ret; + + if (!kvm_vm_check_extension(s, KVM_CAP_ARM_VIRT_MSI_BYPASS) || !nvectors) { + return 0; + } + + mdi = g_malloc0(sizeof(uint32_t) + sizeof(struct kvm_msi) * nvectors); + mdi->nvectors = nvectors; + request_id = pci_requester_id(dev); + + for (vector = 0; vector < nvectors; vector++) { + msg = msix_get_message(dev, vector); + mdi->msi[vector].address_lo = extract64(msg.address, 0, 32); + mdi->msi[vector].address_hi = extract64(msg.address, 32, 32); + mdi->msi[vector].data = le32_to_cpu(msg.data); + mdi->msi[vector].flags = KVM_MSI_VALID_DEVID; + mdi->msi[vector].devid = request_id; + memset(mdi->msi[vector].pad, 0, sizeof(mdi->msi[vector].pad)); + } + + ret = kvm_vm_ioctl(s, KVM_CREATE_SHADOW_DEV, mdi); + g_free(mdi); + return ret; +} + +int kvm_delete_shadow_device(PCIDevice *dev) +{ + KVMState *s = kvm_state; + uint32_t request_id, nvectors = msix_nr_vectors_allocated(dev); + + if (!kvm_vm_check_extension(s, KVM_CAP_ARM_VIRT_MSI_BYPASS) || !nvectors) { + return 0; + } + + request_id = pci_requester_id(dev); + return kvm_vm_ioctl(s, KVM_DEL_SHADOW_DEV, &request_id); +} + int kvm_arch_add_msi_route_post(struct kvm_irq_routing_entry *route, int vector, PCIDevice *dev) { -- Gitee From bad33579c56b73d56e0b220c98faad7893609b85 Mon Sep 17 00:00:00 2001 From: Ying Fang Date: Mon, 18 Mar 2024 10:21:04 +0800 Subject: [PATCH 156/939] tests: Disable filemonitor testcase Since filemonitor testcase requires that host kernel being a LTS version, we cannot guarantee that on OBS system. Lets disable it by default. Signed-off-by: Ying Fang Signed-off-by: Jinhao Gao Signed-off-by: Yuan Zhang --- tests/unit/meson.build | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/unit/meson.build b/tests/unit/meson.build index a05d471090..598ba41bb9 100644 --- a/tests/unit/meson.build +++ b/tests/unit/meson.build @@ -142,9 +142,6 @@ if have_system 'test-vmstate': [migration, io], 'test-yank': ['socket-helpers.c', qom, io, chardev] } - if config_host_data.get('CONFIG_INOTIFY1') - tests += {'test-util-filemonitor': []} - endif # Some tests: test-char, test-qdev-global-props, and test-qga, # are not runnable under TSan due to a known issue. -- Gitee From 0a6baf4799dd6e70d7959002ea6ddb998eddbc6d Mon Sep 17 00:00:00 2001 From: "shenghualong@huawei.com" Date: Mon, 18 Mar 2024 15:53:43 +0800 Subject: [PATCH 157/939] freeclock: add qmp command to get time offset of vm in seconds When setting the system time in VM, a RTC_CHANGE event will be reported. However, if libvirt is restarted while the event is be reporting, the event will be lost and we will get the old time (not the time we set in VM) after rebooting the VM. We save the delta time in QEMU and add a rtc-date-diff qmp to get the delta time so that libvirt can get the latest time in VM according to the qmp after libvirt is restarted. 
Signed-off-by: Peng Liang Signed-off-by: zhangxinhao Signed-off-by: Yuan Zhang --- hw/core/machine-qmp-cmds.c | 6 ++++++ include/sysemu/rtc.h | 4 +++- qapi/misc.json | 9 +++++++++ qapi/pragma.json | 3 ++- system/rtc.c | 11 +++++++++++ 5 files changed, 31 insertions(+), 2 deletions(-) diff --git a/hw/core/machine-qmp-cmds.c b/hw/core/machine-qmp-cmds.c index 3860a50c3b..f1389ef644 100644 --- a/hw/core/machine-qmp-cmds.c +++ b/hw/core/machine-qmp-cmds.c @@ -8,6 +8,7 @@ */ #include "qemu/osdep.h" +#include "sysemu/rtc.h" #include "hw/acpi/vmgenid.h" #include "hw/boards.h" #include "hw/intc/intc.h" @@ -373,6 +374,11 @@ HumanReadableText *qmp_x_query_irq(Error **errp) return human_readable_text_from_str(buf); } +int64_t qmp_query_rtc_date_diff(Error **errp) +{ + return get_rtc_date_diff(); +} + GuidInfo *qmp_query_vm_generation_id(Error **errp) { GuidInfo *info; diff --git a/include/sysemu/rtc.h b/include/sysemu/rtc.h index 0fc8ad6fdf..3edae762d4 100644 --- a/include/sysemu/rtc.h +++ b/include/sysemu/rtc.h @@ -54,5 +54,7 @@ void qemu_get_timedate(struct tm *tm, time_t offset); * then this function will return 3600. */ time_t qemu_timedate_diff(struct tm *tm); - +time_t get_rtc_date_diff(void); +void set_rtc_date_diff(time_t diff); +int64_t qmp_query_rtc_date_diff(Error **errp); #endif diff --git a/qapi/misc.json b/qapi/misc.json index cda2effa81..1832d5f460 100644 --- a/qapi/misc.json +++ b/qapi/misc.json @@ -550,6 +550,15 @@ 'returns': ['CommandLineOptionInfo'], 'allow-preconfig': true} +## +# @query-rtc-date-diff: +# +# get vm's time offset +# +# Since: 2.8 +## +{ 'command': 'query-rtc-date-diff', 'returns': 'int64' } + ## # @RTC_CHANGE: # diff --git a/qapi/pragma.json b/qapi/pragma.json index 0aa4eeddd3..7a07b44bb1 100644 --- a/qapi/pragma.json +++ b/qapi/pragma.json @@ -30,7 +30,8 @@ 'qom-get', 'query-tpm-models', 'query-tpm-types', - 'ringbuf-read' ], + 'ringbuf-read', + 'query-rtc-date-diff'], # Externally visible types whose member names may use uppercase 'member-name-exceptions': [ # visible in: 'ACPISlotType', # query-acpi-ospm-status diff --git a/system/rtc.c b/system/rtc.c index 4904581abe..e16b5fffc5 100644 --- a/system/rtc.c +++ b/system/rtc.c @@ -44,6 +44,7 @@ static time_t rtc_ref_start_datetime; static int rtc_realtime_clock_offset; /* used only with QEMU_CLOCK_REALTIME */ static int rtc_host_datetime_offset = -1; /* valid & used only with RTC_BASE_DATETIME */ +static time_t rtc_date_diff = 0; QEMUClockType rtc_clock; /***********************************************************/ /* RTC reference time/date access */ @@ -108,6 +109,16 @@ time_t qemu_timedate_diff(struct tm *tm) return seconds - qemu_ref_timedate(QEMU_CLOCK_HOST); } +time_t get_rtc_date_diff(void) +{ + return rtc_date_diff; +} + +void set_rtc_date_diff(time_t diff) +{ + rtc_date_diff = diff; +} + static void configure_rtc_base_datetime(const char *startdate) { time_t rtc_start_datetime; -- Gitee From 156be254a48d1d9b7aadcbfa4423485c592bc75d Mon Sep 17 00:00:00 2001 From: "shenghualong@huawei.com" Date: Thu, 21 Mar 2024 11:21:14 +0800 Subject: [PATCH 158/939] freeclock: set rtc_date_diff for arm Set rtc_date_diff in pl031. 
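The change follows the same pattern on every RTC model that reports guest time updates: compute the guest/host delta at the point where the guest writes the clock, remember it, and feed the remembered value to the RTC_CHANGE event. Roughly (pl031 here, mc146818rtc in the next patch):

    /* sketch of the common pattern */
    qemu_get_timedate(&tm, s->tick_offset);
    set_rtc_date_diff(qemu_timedate_diff(&tm));            /* remember the offset */
    qapi_event_send_rtc_change(get_rtc_date_diff(), qom_path);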
Signed-off-by: Peng Liang Signed-off-by: zhangxinhao Signed-off-by: Yuan Zhang --- hw/rtc/pl031.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hw/rtc/pl031.c b/hw/rtc/pl031.c index b01d0e75d1..f2e6baebba 100644 --- a/hw/rtc/pl031.c +++ b/hw/rtc/pl031.c @@ -144,7 +144,8 @@ static void pl031_write(void * opaque, hwaddr offset, s->tick_offset += value - pl031_get_count(s); qemu_get_timedate(&tm, s->tick_offset); - qapi_event_send_rtc_change(qemu_timedate_diff(&tm), qom_path); + set_rtc_date_diff(qemu_timedate_diff(&tm)); + qapi_event_send_rtc_change(get_rtc_date_diff(), qom_path); pl031_set_alarm(s); break; -- Gitee From 0a0010fe0656a63e82aea495ab0a59145d3b5750 Mon Sep 17 00:00:00 2001 From: "shenghualong@huawei.com" Date: Thu, 21 Mar 2024 12:26:38 +0800 Subject: [PATCH 159/939] freeclock: set rtc_date_diff for X86 Set rtc_date_diff in mc146818rtc. Signed-off-by: l00500761 Signed-off-by: zhangxinhao Signed-off-by: Yuan Zhang --- hw/rtc/mc146818rtc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hw/rtc/mc146818rtc.c b/hw/rtc/mc146818rtc.c index 2d391a8396..e61c76d060 100644 --- a/hw/rtc/mc146818rtc.c +++ b/hw/rtc/mc146818rtc.c @@ -606,7 +606,8 @@ static void rtc_set_time(MC146818RtcState *s) s->base_rtc = mktimegm(&tm); s->last_update = qemu_clock_get_ns(rtc_clock); - qapi_event_send_rtc_change(qemu_timedate_diff(&tm), qom_path); + set_rtc_date_diff(qemu_timedate_diff(&tm)); + qapi_event_send_rtc_change(get_rtc_date_diff(), qom_path); } static void rtc_set_cmos(MC146818RtcState *s, const struct tm *tm) -- Gitee From ff43e9201aba8f4047e6fd5edb93a4861cc8fed2 Mon Sep 17 00:00:00 2001 From: Yanan Wang Date: Thu, 28 Mar 2024 18:57:56 +0800 Subject: [PATCH 160/939] i386: cache passthrough: Update AMD 8000_001D.EAX[25:14] based on vCPU topo On AMD target, when host cache passthrough is disabled we will emulate the guest caches with default values and initialize the shared cpu list of the caches based on vCPU topology. However when host cache passthrough is enabled, the shared cpu list is consistent with host regardless what the vCPU topology is. For example, when cache passthrough is enabled, running a guest with vThreads=1 on a host with pThreads=2, we will get that there are every *two* logical vCPUs sharing a L1/L2 cache, which is not consistent with the vCPU topology (vThreads=1). So let's reinitialize BITs[25:14] of AMD CPUID 8000_001D.EAX based on the actual vCPU topology instead of host pCPU topology. Signed-off-by: Yanan Wang Signed-off-by: Yuan Zhang --- target/i386/cpu.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index f94405c02b..491cf40cc7 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -6597,9 +6597,31 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count, } break; case 0x8000001D: + /* Populate AMD Processor Cache Information */ *eax = 0; if (cpu->cache_info_passthrough) { x86_cpu_get_cache_cpuid(index, count, eax, ebx, ecx, edx); + + /* + * Clear BITs[25:14] and then update them based on the guest + * vCPU topology, like what we do in encode_cache_cpuid8000001d + * when cache_info_passthrough is not enabled. 
+ */ + *eax &= ~0x03FFC000; + switch (count) { + case 0: /* L1 dcache info */ + case 1: /* L1 icache info */ + case 2: /* L2 cache info */ + *eax |= ((topo_info.threads_per_core - 1) << 14); + break; + case 3: /* L3 cache info */ + *eax |= ((topo_info.cores_per_die * + topo_info.threads_per_core - 1) << 14); + break; + default: /* end of info */ + *eax = *ebx = *ecx = *edx = 0; + break; + } break; } switch (count) { -- Gitee From 48a328ee1a5a71b7048e4591310471c759fc5af6 Mon Sep 17 00:00:00 2001 From: Keqian Zhu Date: Mon, 27 Jul 2020 20:39:07 +0800 Subject: [PATCH 161/939] bugfix: irq: Avoid covering object refcount of qemu_irq Avoid covering object refcount of qemu_irq, otherwise it may causes memory leak. Signed-off-by: Keqian Zhu --- hw/core/irq.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/hw/core/irq.c b/hw/core/irq.c index 3f14e2dda7..df9b5dac9b 100644 --- a/hw/core/irq.c +++ b/hw/core/irq.c @@ -110,7 +110,10 @@ void qemu_irq_intercept_in(qemu_irq *gpio_in, qemu_irq_handler handler, int n) int i; qemu_irq *old_irqs = qemu_allocate_irqs(NULL, NULL, n); for (i = 0; i < n; i++) { - *old_irqs[i] = *gpio_in[i]; + old_irqs[i]->handler = gpio_in[i]->handler; + old_irqs[i]->opaque = gpio_in[i]->opaque; + old_irqs[i]->n = gpio_in[i]->n; + gpio_in[i]->handler = handler; gpio_in[i]->opaque = &old_irqs[i]; } -- Gitee From 16c4b8946903985e3dfd470d0e04b79d473505bc Mon Sep 17 00:00:00 2001 From: "wanghaibin.wang" Date: Sun, 17 Mar 2024 15:53:57 +0800 Subject: [PATCH 162/939] log: Add log at boot & cpu init for aarch64 Add log at boot & cpu init for aarch64 Signed-off-by: miaoyubo Signed-off-by: Jingyi Wang Signed-off-by: Yuan Zhang --- hw/arm/boot.c | 4 ++++ hw/arm/virt.c | 3 +++ 2 files changed, 7 insertions(+) diff --git a/hw/arm/boot.c b/hw/arm/boot.c index 84ea6a807a..d1671e1d42 100644 --- a/hw/arm/boot.c +++ b/hw/arm/boot.c @@ -11,6 +11,7 @@ #include "qemu/datadir.h" #include "qemu/error-report.h" #include "qapi/error.h" +#include "qemu/log.h" #include #include "hw/arm/boot.h" #include "hw/arm/linux-boot-if.h" @@ -1226,6 +1227,9 @@ void arm_load_kernel(ARMCPU *cpu, MachineState *ms, struct arm_boot_info *info) * doesn't support secure. */ assert(!(info->secure_board_setup && kvm_enabled())); + + qemu_log("load the kernel\n"); + info->kernel_filename = ms->kernel_filename; info->kernel_cmdline = ms->kernel_cmdline; info->initrd_filename = ms->initrd_filename; diff --git a/hw/arm/virt.c b/hw/arm/virt.c index c19cacec8b..f4c3d47f30 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -32,6 +32,7 @@ #include "qemu/datadir.h" #include "qemu/units.h" #include "qemu/option.h" +#include "qemu/log.h" #include "monitor/qdev.h" #include "hw/sysbus.h" #include "hw/arm/boot.h" @@ -1020,6 +1021,7 @@ static void virt_powerdown_req(Notifier *n, void *opaque) { VirtMachineState *s = container_of(n, VirtMachineState, powerdown_notifier); + qemu_log("send powerdown to vm.\n"); if (s->acpi_dev) { acpi_send_event(s->acpi_dev, ACPI_POWER_DOWN_STATUS); } else { @@ -2240,6 +2242,7 @@ static void machvirt_init(MachineState *machine) } create_fdt(vms); + qemu_log("cpu init start\n"); assert(possible_cpus->len == max_cpus); for (n = 0; n < possible_cpus->len; n++) { -- Gitee From 30cc47b6dd3e9ff4842eb1c2a918bbabfd8c593b Mon Sep 17 00:00:00 2001 From: "wangxinxin.wang@huawei.com" Date: Sun, 17 Mar 2024 15:44:28 +0800 Subject: [PATCH 163/939] feature: Add log for each modules add log for each modules. 
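All of the added messages go through the existing qemu_log() facility, so they are printed unconditionally (no log mask) and land on stderr by default, or in the file given with -D; no new logging infrastructure is introduced. A representative call, taken from the virtio-scsi hunk below:

    #include "qemu/log.h"

    /* plain progress logging, same style as the other hunks in this patch */
    qemu_log("virtio scsi HBA %s begin to initialize.\n",
             !proxy->id ? "NULL" : proxy->id);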
Signed-off-by: miaoyubo Signed-off-by: Jingyi Wang Signed-off-by: Yuan Zhang --- accel/kvm/kvm-all.c | 5 ++++- hw/char/virtio-serial-bus.c | 5 +++++ hw/pci/pci.c | 1 + hw/usb/bus.c | 6 ++++++ hw/usb/host-libusb.c | 5 +++++ hw/virtio/virtio-scsi-pci.c | 3 +++ monitor/qmp-cmds.c | 3 +++ os-posix.c | 1 + qapi/qmp-dispatch.c | 15 +++++++++++++++ system/qdev-monitor.c | 5 +++++ 10 files changed, 48 insertions(+), 1 deletion(-) diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index 33f4c6d547..d900df93a4 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -1834,7 +1834,10 @@ void kvm_irqchip_commit_routes(KVMState *s) s->irq_routes->flags = 0; trace_kvm_irqchip_commit_routes(); ret = kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes); - assert(ret == 0); + if (ret < 0) { + error_report("Set GSI routing failed: %m"); + abort(); + } } static void kvm_add_routing_entry(KVMState *s, diff --git a/hw/char/virtio-serial-bus.c b/hw/char/virtio-serial-bus.c index dd619f0731..44906057be 100644 --- a/hw/char/virtio-serial-bus.c +++ b/hw/char/virtio-serial-bus.c @@ -257,6 +257,8 @@ static size_t send_control_event(VirtIOSerial *vser, uint32_t port_id, virtio_stw_p(vdev, &cpkt.value, value); trace_virtio_serial_send_control_event(port_id, event, value); + qemu_log("virtio serial port %d send control message" + " event = %d, value = %d\n", port_id, event, value); return send_control_msg(vser, &cpkt, sizeof(cpkt)); } @@ -364,6 +366,9 @@ static void handle_control_message(VirtIOSerial *vser, void *buf, size_t len) cpkt.value = virtio_lduw_p(vdev, &gcpkt->value); trace_virtio_serial_handle_control_message(cpkt.event, cpkt.value); + qemu_log("virtio serial port '%u' handle control message" + " event = %d, value = %d\n", + virtio_ldl_p(vdev, &gcpkt->id), cpkt.event, cpkt.value); if (cpkt.event == VIRTIO_CONSOLE_DEVICE_READY) { if (!cpkt.value) { diff --git a/hw/pci/pci.c b/hw/pci/pci.c index c49417abb2..9da41088df 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -2411,6 +2411,7 @@ static void pci_add_option_rom(PCIDevice *pdev, bool is_default_rom, snprintf(name, sizeof(name), "%s.rom", vmsd ? 
vmsd->name : object_get_typename(OBJECT(pdev))); + qemu_log("add rom file: %s\n", name); pdev->has_rom = true; memory_region_init_rom(&pdev->rom, OBJECT(pdev), name, pdev->romsize, &error_fatal); diff --git a/hw/usb/bus.c b/hw/usb/bus.c index 92d6ed5626..20cd9b6e6f 100644 --- a/hw/usb/bus.c +++ b/hw/usb/bus.c @@ -536,6 +536,10 @@ void usb_check_attach(USBDevice *dev, Error **errp) bus->qbus.name, port->path, portspeed); return; } + + qemu_log("attach usb device \"%s\" (%s speed) to VM bus \"%s\", " + "port \"%s\" (%s speed)\n", dev->product_desc, devspeed, + bus->qbus.name, port->path, portspeed); } void usb_device_attach(USBDevice *dev, Error **errp) @@ -564,6 +568,8 @@ int usb_device_detach(USBDevice *dev) usb_detach(port); dev->attached = false; + qemu_log("detach usb device \"%s\" from VM bus \"%s\", port \"%s\"\n", + dev->product_desc, bus->qbus.name, port->path); return 0; } diff --git a/hw/usb/host-libusb.c b/hw/usb/host-libusb.c index dba469c1ef..11a246ac72 100644 --- a/hw/usb/host-libusb.c +++ b/hw/usb/host-libusb.c @@ -992,6 +992,8 @@ static int usb_host_open(USBHostDevice *s, libusb_device *dev, int hostfd) rc = libusb_open(dev, &s->dh); if (rc != 0) { + qemu_log("libusb open usb device bus %d, device %d failed\n", + bus_num, addr); goto fail; } } else { @@ -1019,6 +1021,7 @@ static int usb_host_open(USBHostDevice *s, libusb_device *dev, int hostfd) libusb_get_device_descriptor(dev, &s->ddesc); usb_host_get_port(s->dev, s->port, sizeof(s->port)); + qemu_log("open a host usb device on bus %d, device %d\n", bus_num, addr); usb_ep_init(udev); usb_host_ep_update(s); @@ -1146,6 +1149,8 @@ static int usb_host_close(USBHostDevice *s) usb_device_detach(udev); } + qemu_log("begin to reset the usb device, bus : %d, device : %d\n", + s->bus_num, s->addr); usb_host_release_interfaces(s); libusb_reset_device(s->dh); usb_host_attach_kernel(s); diff --git a/hw/virtio/virtio-scsi-pci.c b/hw/virtio/virtio-scsi-pci.c index e8e3442f38..e542d47162 100644 --- a/hw/virtio/virtio-scsi-pci.c +++ b/hw/virtio/virtio-scsi-pci.c @@ -20,6 +20,7 @@ #include "qemu/module.h" #include "hw/virtio/virtio-pci.h" #include "qom/object.h" +#include "qemu/log.h" typedef struct VirtIOSCSIPCI VirtIOSCSIPCI; @@ -51,6 +52,8 @@ static void virtio_scsi_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) VirtIOSCSIConf *conf = &dev->vdev.parent_obj.conf; char *bus_name; + qemu_log("virtio scsi HBA %s begin to initialize.\n", + !proxy->id ? 
"NULL" : proxy->id); if (conf->num_queues == VIRTIO_SCSI_AUTO_NUM_QUEUES) { conf->num_queues = virtio_pci_optimal_num_queues(VIRTIO_SCSI_VQ_NUM_FIXED); diff --git a/monitor/qmp-cmds.c b/monitor/qmp-cmds.c index b0f948d337..e78462b857 100644 --- a/monitor/qmp-cmds.c +++ b/monitor/qmp-cmds.c @@ -32,6 +32,7 @@ #include "hw/mem/memory-device.h" #include "hw/intc/intc.h" #include "hw/rdma/rdma.h" +#include "qemu/log.h" NameInfo *qmp_query_name(Error **errp) { @@ -110,8 +111,10 @@ void qmp_cont(Error **errp) } if (runstate_check(RUN_STATE_INMIGRATE)) { + qemu_log("qmp cont is received in migration\n"); autostart = 1; } else { + qemu_log("qmp cont is received and vm is started\n"); vm_start(); } } diff --git a/os-posix.c b/os-posix.c index 52ef6990ff..8f70ee0534 100644 --- a/os-posix.c +++ b/os-posix.c @@ -306,6 +306,7 @@ int os_mlock(void) #ifdef HAVE_MLOCKALL int ret = 0; + qemu_log("do mlockall\n"); ret = mlockall(MCL_CURRENT | MCL_FUTURE); if (ret < 0) { error_report("mlockall: %s", strerror(errno)); diff --git a/qapi/qmp-dispatch.c b/qapi/qmp-dispatch.c index 7a215cbfd7..e33efd3740 100644 --- a/qapi/qmp-dispatch.c +++ b/qapi/qmp-dispatch.c @@ -25,6 +25,7 @@ #include "qemu/coroutine.h" #include "qemu/main-loop.h" #include "qemu/log.h" +#include "qapi/qmp/qstring.h" Visitor *qobject_input_visitor_new_qmp(QObject *obj) { @@ -220,6 +221,20 @@ QDict *coroutine_mixed_fn qmp_dispatch(const QmpCommandList *cmds, QObject *requ assert(!(oob && qemu_in_coroutine())); assert(monitor_cur() == NULL); + + json = qobject_to_json(QOBJECT(args)); + if (json) { + if ((strcmp(command, "query-block-jobs") != 0) + && (strcmp(command, "query-migrate") != 0) + && (strcmp(command, "query-blockstats") != 0) + && (strcmp(command, "query-balloon") != 0) + && (strcmp(command, "set_password") != 0)) { + qemu_log("qmp_cmd_name: %s, arguments: %s\n", + command, json->str); + } + g_string_free(json, true); + } + if (!!(cmd->options & QCO_COROUTINE) == qemu_in_coroutine()) { monitor_set_cur(qemu_coroutine_self(), cur_mon); cmd->fn(args, &ret, &err); diff --git a/system/qdev-monitor.c b/system/qdev-monitor.c index b10e483a9a..5b35704b5e 100644 --- a/system/qdev-monitor.c +++ b/system/qdev-monitor.c @@ -644,6 +644,7 @@ DeviceState *qdev_device_add_from_qdict(const QDict *opts, if (path != NULL) { bus = qbus_find(path, errp); if (!bus) { + qemu_log("can not find bus for %s\n", driver); return NULL; } if (!object_dynamic_cast(OBJECT(bus), dc->bus_type)) { @@ -714,6 +715,8 @@ DeviceState *qdev_device_add_from_qdict(const QDict *opts, object_set_properties_from_keyval(&dev->parent_obj, dev->opts, from_json, errp); if (*errp) { + qemu_log("the bus %s -driver %s set property failed\n", + bus ? bus->name : "None", driver); goto err_del_dev; } qemu_log("add qdev %s:%s success\n", driver, dev->id ? dev->id : "none"); @@ -738,6 +741,8 @@ DeviceState *qdev_device_add(QemuOpts *opts, Error **errp) ret = qdev_device_add_from_qdict(qdict, false, errp); if (ret) { + qemu_log("add qdev %s:%s success\n", qemu_opt_get(opts, "driver"), + qemu_opts_id(opts) ? 
qemu_opts_id(opts) : "none"); qemu_opts_del(opts); } qobject_unref(qdict); -- Gitee From 9a47271fb6c855ec92e087d59d65f3cc0c684725 Mon Sep 17 00:00:00 2001 From: "wangxinxin.wang@huawei.com" Date: Sun, 17 Mar 2024 15:04:09 +0800 Subject: [PATCH 164/939] feature: Add logs for vm start and destroy Add QEMU_LOG for vm start and destroy Signed-off-by: miaoyubo Signed-off-by: Jingyi Wang Signed-off-by: Yuan Zhang --- hw/acpi/core.c | 4 ++++ hw/core/reset.c | 2 ++ system/main.c | 2 ++ system/runstate.c | 2 ++ system/vl.c | 6 ++++++ 5 files changed, 16 insertions(+) diff --git a/hw/acpi/core.c b/hw/acpi/core.c index ec5e127d17..b6241f70e9 100644 --- a/hw/acpi/core.c +++ b/hw/acpi/core.c @@ -24,6 +24,7 @@ #include "hw/acpi/acpi.h" #include "hw/nvram/fw_cfg.h" #include "qemu/config-file.h" +#include "qemu/log.h" #include "qapi/error.h" #include "qapi/opts-visitor.h" #include "qapi/qapi-events-run-state.h" @@ -588,13 +589,16 @@ static void acpi_pm_cnt_write(void *opaque, hwaddr addr, uint64_t val, uint16_t sus_typ = (val >> 10) & 7; switch (sus_typ) { case 0: /* soft power off */ + qemu_log("VM will be soft power off\n"); qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN); break; case 1: + qemu_log("VM will be suspend state\n"); qemu_system_suspend_request(); break; default: if (sus_typ == ar->pm1.cnt.s4_val) { /* S4 request */ + qemu_log("VM will be S4 state\n"); qapi_event_send_suspend_disk(); qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN); } diff --git a/hw/core/reset.c b/hw/core/reset.c index d3263b613e..fa63bfedb7 100644 --- a/hw/core/reset.c +++ b/hw/core/reset.c @@ -25,6 +25,7 @@ #include "qemu/osdep.h" #include "qemu/queue.h" +#include "qemu/log.h" #include "sysemu/reset.h" /* reset/shutdown handler */ @@ -75,6 +76,7 @@ void qemu_devices_reset(ShutdownCause reason) { QEMUResetEntry *re, *nre; + qemu_log("reset all devices\n"); /* reset all devices */ QTAILQ_FOREACH_SAFE(re, &reset_handlers, entry, nre) { if (reason == SHUTDOWN_CAUSE_SNAPSHOT_LOAD && diff --git a/system/main.c b/system/main.c index 9b91d21ea8..28bb283ebf 100644 --- a/system/main.c +++ b/system/main.c @@ -23,6 +23,7 @@ */ #include "qemu/osdep.h" +#include "qemu/log.h" #include "qemu-main.h" #include "sysemu/sysemu.h" @@ -34,6 +35,7 @@ int qemu_default_main(void) { int status; + qemu_log("qemu enter main_loop\n"); status = qemu_main_loop(); qemu_cleanup(status); diff --git a/system/runstate.c b/system/runstate.c index 62e6db8d42..538c645326 100644 --- a/system/runstate.c +++ b/system/runstate.c @@ -769,9 +769,11 @@ static bool main_loop_should_exit(int *status) } if (qemu_powerdown_requested()) { qemu_system_powerdown(); + qemu_log("domain is power down by outside operation\n"); } if (qemu_vmstop_requested(&r)) { vm_stop(r); + qemu_log("domain is stopped by outside operation\n"); } return false; } diff --git a/system/vl.c b/system/vl.c index 2bcd9efb9a..165c3cae8a 100644 --- a/system/vl.c +++ b/system/vl.c @@ -26,6 +26,7 @@ #include "qemu/help-texts.h" #include "qemu/datadir.h" #include "qemu/units.h" +#include "qemu/log.h" #include "exec/cpu-common.h" #include "exec/page-vary.h" #include "hw/qdev-properties.h" @@ -2633,6 +2634,7 @@ static void qemu_create_cli_devices(void) } /* init generic devices */ + qemu_log("device init start\n"); rom_set_order_override(FW_CFG_ORDER_OVERRIDE_DEVICE); qemu_opts_foreach(qemu_find_opts("device"), device_init_func, NULL, &error_fatal); @@ -2778,6 +2780,7 @@ void qemu_init(int argc, char **argv) qemu_init_subsystems(); + qemu_log("qemu pid is %d, options parsing start\n", 
getpid()); /* first pass of option parsing */ optind = 1; while (optind < argc) { @@ -2997,6 +3000,7 @@ void qemu_init(int argc, char **argv) exit(0); break; case QEMU_OPTION_m: + qemu_log("memory options parse start\n"); opts = qemu_opts_parse_noisily(qemu_find_opts("memory"), optarg, true); if (opts == NULL) { exit(1); @@ -3714,6 +3718,7 @@ void qemu_init(int argc, char **argv) */ machine_class = MACHINE_GET_CLASS(current_machine); + qemu_log("configure accelerator %s start\n", machine_class->name); if (!qtest_enabled() && machine_class->deprecation_reason) { warn_report("Machine type '%s' is deprecated: %s", machine_class->name, machine_class->deprecation_reason); @@ -3732,6 +3737,7 @@ void qemu_init(int argc, char **argv) */ migration_object_init(); + qemu_log("machine init start\n"); /* parse features once if machine provides default cpu_type */ current_machine->cpu_type = machine_class->default_cpu_type; if (cpu_option) { -- Gitee From 8e30e81c4268103d502587de565842b9632a7965 Mon Sep 17 00:00:00 2001 From: Jinhao Gao Date: Tue, 15 Feb 2022 17:02:08 +0800 Subject: [PATCH 165/939] pl031: support rtc-timer property for pl031 This patch adds the rtc-timer property for pl031, we can get the rtc time (UTC) through qmp command "qom-get date" with this property. Signed-off-by: Haibin Wang Reviewed-by: Shannon Zhao Reviewed-by: Ying Fang Signed-off-by: Keqian Zhu Signed-off-by: Jinhao Gao Signed-off-by: Yuan Zhang --- hw/rtc/pl031.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/hw/rtc/pl031.c b/hw/rtc/pl031.c index f2e6baebba..57e9a35616 100644 --- a/hw/rtc/pl031.c +++ b/hw/rtc/pl031.c @@ -63,6 +63,15 @@ static uint32_t pl031_get_count(PL031State *s) return s->tick_offset + now / NANOSECONDS_PER_SECOND; } +static void pl031_get_date(Object *obj, struct tm *current_tm, Error **errp) +{ + PL031State *s = PL031(obj); + time_t ti = pl031_get_count(s); + + /* Changed to UTC time */ + gmtime_r(&ti, current_tm); +} + static void pl031_set_alarm(PL031State *s) { uint32_t ticks; @@ -202,6 +211,20 @@ static void pl031_init(Object *obj) qemu_clock_get_ns(rtc_clock) / NANOSECONDS_PER_SECOND; s->timer = timer_new_ns(rtc_clock, pl031_interrupt, s); + object_property_add_tm(OBJECT(s), "date", pl031_get_date); +} + +static void pl031_realize(DeviceState *d, Error **errp) +{ + object_property_add_alias(qdev_get_machine(), "rtc-time", + OBJECT(d), "date"); +} + +static void pl031_unrealize(DeviceState *d) +{ + if (object_property_find(qdev_get_machine(), "rtc-time")) { + object_property_del(qdev_get_machine(), "rtc-time"); + } } static void pl031_finalize(Object *obj) @@ -338,6 +361,8 @@ static void pl031_class_init(ObjectClass *klass, void *data) DeviceClass *dc = DEVICE_CLASS(klass); dc->vmsd = &vmstate_pl031; + dc->realize = pl031_realize; + dc->unrealize = pl031_unrealize; device_class_set_props(dc, pl031_properties); } -- Gitee From d269fb9a41abf5888a9bfeec2f8d1684b2d4dfb0 Mon Sep 17 00:00:00 2001 From: saarloos <9090-90-90-9090@163.com> Date: Sat, 30 Mar 2024 21:32:27 +0800 Subject: [PATCH 166/939] arm/acpi: Fix when make qemu-system-aarch64 at x86_64 host bios_tables_test fail reason: __aarch64__ macro let build_pptt at x86_64 and aarch64 host build different function that let bios_tables_test fail. 
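As a toy illustration (standalone C, not QEMU code), keying a table builder off the build host rather than the target machine is what produces host-dependent ACPI output:

    /* toy.c - illustration only; compile once on an x86_64 host and once
     * on an aarch64 host to see the build-host dependence that
     * bios_tables_test trips over. */
    #include <stdio.h>

    static int pptt_layout(void)
    {
    #ifdef __aarch64__
        return 2;   /* cache-aware layout compiled in on aarch64 hosts */
    #else
        return 1;   /* plain layout compiled in on all other hosts */
    #endif
    }

    int main(void)
    {
        printf("PPTT layout: %d\n", pptt_layout());
        return 0;
    }

Dropping the #ifdef and selecting the builder per machine (build_pptt_arm for arm/virt, build_pptt otherwise) keeps the generated tables identical regardless of the build host.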
Signed-off-by: Yangzi Zhang Signed-off-by: Yuan Zhang --- hw/acpi/aml-build.c | 5 +---- hw/arm/virt-acpi-build.c | 2 +- include/hw/acpi/aml-build.h | 5 +++-- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c index 714498165a..bf9c59f544 100644 --- a/hw/acpi/aml-build.c +++ b/hw/acpi/aml-build.c @@ -2016,7 +2016,6 @@ static void build_processor_hierarchy_node(GArray *tbl, uint32_t flags, } } -#ifdef __aarch64__ /* * ACPI spec, Revision 6.3 * 5.2.29.2 Cache Type Structure (Type 1) @@ -2072,7 +2071,7 @@ static void build_cache_hierarchy_node(GArray *tbl, uint32_t next_level, * ACPI spec, Revision 6.3 * 5.2.29 Processor Properties Topology Table (PPTT) */ -void build_pptt(GArray *table_data, BIOSLinker *linker, MachineState *ms, +void build_pptt_arm(GArray *table_data, BIOSLinker *linker, MachineState *ms, const char *oem_id, const char *oem_table_id) { MachineClass *mc = MACHINE_GET_CLASS(ms); @@ -2172,7 +2171,6 @@ void build_pptt(GArray *table_data, BIOSLinker *linker, MachineState *ms, acpi_table_end(linker, &table); } -#else /* * ACPI spec, Revision 6.3 * 5.2.29 Processor Properties Topology Table (PPTT) @@ -2248,7 +2246,6 @@ void build_pptt(GArray *table_data, BIOSLinker *linker, MachineState *ms, acpi_table_end(linker, &table); } -#endif /* build rev1/rev3/rev5.1/rev6.0 FADT */ void build_fadt(GArray *tbl, BIOSLinker *linker, const AcpiFadtData *f, diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c index 3cb50bdc65..48fc77fb83 100644 --- a/hw/arm/virt-acpi-build.c +++ b/hw/arm/virt-acpi-build.c @@ -1024,7 +1024,7 @@ void virt_acpi_build(VirtMachineState *vms, AcpiBuildTables *tables) if (!vmc->no_cpu_topology) { acpi_add_table(table_offsets, tables_blob); - build_pptt(tables_blob, tables->linker, ms, + build_pptt_arm(tables_blob, tables->linker, ms, vms->oem_id, vms->oem_table_id); } diff --git a/include/hw/acpi/aml-build.h b/include/hw/acpi/aml-build.h index 200cb113de..7281c281f6 100644 --- a/include/hw/acpi/aml-build.h +++ b/include/hw/acpi/aml-build.h @@ -221,7 +221,6 @@ struct AcpiBuildTables { BIOSLinker *linker; } AcpiBuildTables; -#ifdef __aarch64__ /* Definitions of the hardcoded cache info*/ typedef enum { @@ -266,7 +265,6 @@ struct offset_status { uint32_t l1i_offset; }; -#endif typedef struct CrsRangeEntry { @@ -542,6 +540,9 @@ void build_slit(GArray *table_data, BIOSLinker *linker, MachineState *ms, void build_pptt(GArray *table_data, BIOSLinker *linker, MachineState *ms, const char *oem_id, const char *oem_table_id); +void build_pptt_arm(GArray *table_data, BIOSLinker *linker, MachineState *ms, + const char *oem_id, const char *oem_table_id); + void build_fadt(GArray *tbl, BIOSLinker *linker, const AcpiFadtData *f, const char *oem_id, const char *oem_table_id); -- Gitee From 2ccd1ec0d18070727ad9b9647da6b6937f16de2a Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Sat, 8 May 2021 17:31:03 +0800 Subject: [PATCH 167/939] linux-headers: update against 5.10 and manual clear vfio dirty log series The new capability VFIO_DIRTY_LOG_MANUAL_CLEAR and the new ioctl VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR and VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP have been introduced in the kernel, update the header to add them. 
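As a hedged sketch of how userspace is expected to consume these additions (assuming only the updated header and ioctl(2); error handling abridged), the extension can be probed before the NOCLEAR/CLEAR_BITMAP flags are used:

    #include <stdbool.h>
    #include <sys/ioctl.h>
    #include <linux/vfio.h>

    /* True when the vfio_iommu driver lets userspace clear the dirty log
     * manually, i.e. the GET_BITMAP_NOCLEAR and CLEAR_BITMAP flags are
     * usable on this container. */
    static bool dirty_log_manual_clear_supported(int container_fd)
    {
        return ioctl(container_fd, VFIO_CHECK_EXTENSION,
                     VFIO_DIRTY_LOG_MANUAL_CLEAR) > 0;
    }

This mirrors the VFIO_CHECK_EXTENSION probe the QEMU side performs later in this series.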
Signed-off-by: Zenghui Yu Signed-off-by: Kunkun Jiang --- linux-headers/linux/vfio.h | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h index 8e175ece31..956154e509 100644 --- a/linux-headers/linux/vfio.h +++ b/linux-headers/linux/vfio.h @@ -56,6 +56,16 @@ */ #define VFIO_UPDATE_VADDR 10 +/* + * The vfio_iommu driver may support user clears dirty log manually, which means + * dirty log can be requested to not cleared automatically after dirty log is + * copied to userspace, it's user's duty to clear dirty log. + * + * Note: please refer to VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR and + * VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP. + */ +#define VFIO_DIRTY_LOG_MANUAL_CLEAR 11 + /* * The IOCTL interface is designed for extensibility by embedding the * structure length (argsz) and flags into structures passed between @@ -1651,8 +1661,30 @@ struct vfio_iommu_type1_dma_unmap { * actual bitmap. If dirty pages logging is not enabled, an error will be * returned. * - * Only one of the flags _START, _STOP and _GET may be specified at a time. + * The VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR flag is almost same as + * VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP, except that it requires underlying + * dirty bitmap is not cleared automatically. The user can clear it manually by + * calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP flag set. * + * Calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP flag set, + * instructs the IOMMU driver to clear the dirty status of pages in a bitmap + * for IOMMU container for a given IOVA range. The user must specify the IOVA + * range, the bitmap and the pgsize through the structure + * vfio_iommu_type1_dirty_bitmap_get in the data[] portion. This interface + * supports clearing a bitmap of the smallest supported pgsize only and can be + * modified in future to clear a bitmap of any specified supported pgsize. The + * user must provide a memory area for the bitmap memory and specify its size + * in bitmap.size. One bit is used to represent one page consecutively starting + * from iova offset. The user should provide page size in bitmap.pgsize field. + * A bit set in the bitmap indicates that the page at that offset from iova is + * cleared the dirty status, and dirty tracking is re-enabled for that page. The + * caller must set argsz to a value including the size of structure + * vfio_iommu_dirty_bitmap_get, but excluing the size of the actual bitmap. If + * dirty pages logging is not enabled, an error will be returned. Note: user + * should clear dirty log before handle corresponding dirty pages. + * + * Only one of the flags _START, _STOP, _GET, _GET_NOCLEAR_, and _CLEAR may be + * specified at a time. 
*/ struct vfio_iommu_type1_dirty_bitmap { __u32 argsz; @@ -1660,6 +1692,8 @@ struct vfio_iommu_type1_dirty_bitmap { #define VFIO_IOMMU_DIRTY_PAGES_FLAG_START (1 << 0) #define VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP (1 << 1) #define VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP (1 << 2) +#define VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR (1 << 3) +#define VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP (1 << 4) __u8 data[]; }; -- Gitee From bd2d81775edf285149346bf793d9b71236d7cf34 Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Sat, 8 May 2021 17:31:04 +0800 Subject: [PATCH 168/939] vfio: Maintain DMA mapping range for the container When synchronizing dirty bitmap from kernel VFIO we do it in a per-iova-range fashion and we allocate the userspace bitmap for each of the ioctl. This patch introduces `struct VFIODMARange` to describe a range of the given DMA mapping with respect to a VFIO_IOMMU_MAP_DMA operation, and make the bitmap cache of this range be persistent so that we don't need to g_try_malloc0() every time. Note that the new structure is almost a copy of `struct vfio_iommu_type1_dma_map` but only internally used by QEMU. More importantly, the cached per-iova-range dirty bitmap will be further used when we want to add support for the CLEAR_BITMAP and this cached bitmap will be used to guarantee we don't clear any unknown dirty bits otherwise that can be a severe data loss issue for migration code. It's pretty intuitive to maintain a bitmap per container since we perform log_sync at this granule. But I don't know how to deal with things like memory hot-{un}plug, sparse DMA mappings, etc. Suggestions welcome. * yet something to-do: - can't work with guest viommu - no locks - etc [ The idea and even the commit message are largely inherited from kvm side. See commit 9f4bf4baa8b820c7930e23c9566c9493db7e1d25. 
] Signed-off-by: Zenghui Yu Signed-off-by: Kunkun Jiang --- hw/vfio/common.c | 9 +++++-- hw/vfio/container.c | 49 +++++++++++++++++++++++++++++++++++ include/hw/vfio/vfio-common.h | 12 +++++++++ 3 files changed, 68 insertions(+), 2 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index e70fdf5e0c..564e933135 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -1156,6 +1156,7 @@ int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, vfio_devices_all_device_dirty_tracking(container); uint64_t dirty_pages; VFIOBitmap vbmap; + VFIODMARange *qrange; int ret; if (!container->dirty_pages_supported && !all_device_dirty_tracking) { @@ -1165,10 +1166,16 @@ int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, return 0; } + qrange = vfio_lookup_match_range(container, iova, size); + /* the same as vfio_dma_unmap() */ + assert(qrange); + ret = vfio_bitmap_alloc(&vbmap, size); if (ret) { return ret; } + g_free(vbmap.bitmap); + vbmap.bitmap = qrange->bitmap; if (all_device_dirty_tracking) { ret = vfio_devices_query_dirty_bitmap(container, &vbmap, iova, size); @@ -1186,8 +1193,6 @@ int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, trace_vfio_get_dirty_bitmap(container->fd, iova, size, vbmap.size, ram_addr, dirty_pages); out: - g_free(vbmap.bitmap); - return ret; } diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 242010036a..9a176a0d33 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -112,6 +112,29 @@ unmap_exit: return ret; } +VFIODMARange *vfio_lookup_match_range(VFIOContainer *container, + hwaddr start_addr, hwaddr size) +{ + VFIODMARange *qrange; + + QLIST_FOREACH(qrange, &container->dma_list, next) { + if (qrange->iova == start_addr && qrange->size == size) { + return qrange; + } + } + return NULL; +} + +void vfio_dma_range_init_dirty_bitmap(VFIODMARange *qrange) +{ + uint64_t pages, size; + + pages = REAL_HOST_PAGE_ALIGN(qrange->size) / qemu_real_host_page_size(); + size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) / BITS_PER_BYTE; + + qrange->bitmap = g_malloc0(size); +} + /* * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86 */ @@ -124,6 +147,7 @@ int vfio_dma_unmap(VFIOContainer *container, hwaddr iova, .iova = iova, .size = size, }; + VFIODMARange *qrange; bool need_dirty_sync = false; int ret; @@ -136,6 +160,22 @@ int vfio_dma_unmap(VFIOContainer *container, hwaddr iova, need_dirty_sync = true; } + /* + * unregister the DMA range + * + * It seems that the memory layer will give us the same section as the one + * used in region_add(). Otherwise it'll be complicated to manipulate the + * bitmap across region_{add,del}. Is there any guarantee? + * + * But there is really not such a restriction on the kernel interface + * (VFIO_IOMMU_DIRTY_PAGES_FLAG_{UN}MAP_DMA, etc). 
+ */ + qrange = vfio_lookup_match_range(container, iova, size); + assert(qrange); + g_free(qrange->bitmap); + QLIST_REMOVE(qrange, next); + g_free(qrange); + while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) { /* * The type1 backend has an off-by-one bug in the kernel (71a7d3d78e3c @@ -180,6 +220,14 @@ int vfio_dma_map(VFIOContainer *container, hwaddr iova, .iova = iova, .size = size, }; + VFIODMARange *qrange; + + qrange = g_malloc0(sizeof(*qrange)); + qrange->iova = iova; + qrange->size = size; + QLIST_INSERT_HEAD(&container->dma_list, qrange, next); + /* XXX allocate the dirty bitmap on demand */ + vfio_dma_range_init_dirty_bitmap(qrange); if (!readonly) { map.flags |= VFIO_DMA_MAP_FLAG_WRITE; @@ -552,6 +600,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, container->iova_ranges = NULL; QLIST_INIT(&container->giommu_list); QLIST_INIT(&container->vrdl_list); + QLIST_INIT(&container->dma_list); ret = vfio_init_container(container, group->fd, errp); if (ret) { diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index a4a22accb9..b131d04c9c 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -80,6 +80,14 @@ typedef struct VFIOAddressSpace { struct VFIOGroup; +typedef struct VFIODMARange { + QLIST_ENTRY(VFIODMARange) next; + hwaddr iova; + size_t size; + void *vaddr; /* unused */ + unsigned long *bitmap; /* dirty bitmap cache for this range */ +} VFIODMARange; + typedef struct VFIOContainer { VFIOAddressSpace *space; int fd; /* /dev/vfio/vfio, empowered by the attached groups */ @@ -97,6 +105,7 @@ typedef struct VFIOContainer { QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; QLIST_HEAD(, VFIOGroup) group_list; QLIST_HEAD(, VFIORamDiscardListener) vrdl_list; + QLIST_HEAD(, VFIODMARange) dma_list; QLIST_ENTRY(VFIOContainer) next; QLIST_HEAD(, VFIODevice) device_list; GList *iova_ranges; @@ -212,6 +221,9 @@ void vfio_put_address_space(VFIOAddressSpace *space); bool vfio_devices_all_running_and_saving(VFIOContainer *container); /* container->fd */ +VFIODMARange *vfio_lookup_match_range(VFIOContainer *container, + hwaddr start_addr, hwaddr size); +void vfio_dma_range_init_dirty_bitmap(VFIODMARange *qrange); int vfio_dma_unmap(VFIOContainer *container, hwaddr iova, ram_addr_t size, IOMMUTLBEntry *iotlb); int vfio_dma_map(VFIOContainer *container, hwaddr iova, -- Gitee From 24c3ff779f35b40967d195e4764d4cb605c1a304 Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Sat, 8 May 2021 17:31:05 +0800 Subject: [PATCH 169/939] vfio/migration: Add support for manual clear vfio dirty log The new capability VFIO_DIRTY_LOG_MANUAL_CLEAR and the new ioctl VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR and VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP have been introduced in the kernel, tweak the userspace side to use them. Check if the kernel supports VFIO_DIRTY_LOG_MANUAL_CLEAR and provide the log_clear() hook for vfio_memory_listener. If the kernel supports it, deliever the clear message to kernel. Signed-off-by: Zenghui Yu Signed-off-by: Kunkun Jiang --- hw/vfio/common.c | 136 ++++++++++++++++++++++++++++++++++ hw/vfio/container.c | 13 +++- include/hw/vfio/vfio-common.h | 1 + 3 files changed, 148 insertions(+), 2 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 564e933135..e08b147b3d 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -1344,6 +1344,141 @@ static void vfio_listener_log_sync(MemoryListener *listener, } } +/* + * I'm not sure if there's any alignment requirement for the CLEAR_BITMAP + * ioctl. 
But copy from kvm side and align {start, size} with 64 pages. + * + * I think the code can be simplified a lot if no alignment requirement. + */ +#define VFIO_CLEAR_LOG_SHIFT 6 +#define VFIO_CLEAR_LOG_ALIGN (qemu_real_host_page_size() << VFIO_CLEAR_LOG_SHIFT) +#define VFIO_CLEAR_LOG_MASK (-VFIO_CLEAR_LOG_ALIGN) + +static int vfio_log_clear_one_range(VFIOContainer *container,VFIODMARange *qrange, + uint64_t start, uint64_t size) +{ + struct vfio_iommu_type1_dirty_bitmap *dbitmap; + struct vfio_iommu_type1_dirty_bitmap_get *range; + + dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range)); + + dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range); + dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP; + range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data; + + /* + * Now let's deal with the actual bitmap, which is almost the same + * as the kvm side. + */ + uint64_t end, bmap_start, start_delta, bmap_npages; + unsigned long *bmap_clear = NULL, psize = qemu_real_host_page_size(); + int ret; + + bmap_start = start & VFIO_CLEAR_LOG_MASK; + start_delta = start - bmap_start; + bmap_start /= psize; + + bmap_npages = DIV_ROUND_UP(size + start_delta, VFIO_CLEAR_LOG_ALIGN) + << VFIO_CLEAR_LOG_SHIFT; + end = qrange->size / psize; + if (bmap_npages > end - bmap_start) { + bmap_npages = end - bmap_start; + } + start_delta /= psize; + + if (start_delta) { + bmap_clear = bitmap_new(bmap_npages); + bitmap_copy_with_src_offset(bmap_clear, qrange->bitmap, + bmap_start, start_delta + size / psize); + bitmap_clear(bmap_clear, 0, start_delta); + range->bitmap.data = (__u64 *)bmap_clear; + } else { + range->bitmap.data = (__u64 *)(qrange->bitmap + BIT_WORD(bmap_start)); + } + + range->iova = qrange->iova + bmap_start * psize; + range->size = bmap_npages * psize; + range->bitmap.size = ROUND_UP(bmap_npages, sizeof(__u64) * BITS_PER_BYTE) / + BITS_PER_BYTE; + range->bitmap.pgsize = qemu_real_host_page_size(); + + ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap); + if (ret) { + error_report("Failed to clear dirty log for iova: 0x%"PRIx64 + " size: 0x%"PRIx64" err: %d", (uint64_t)range->iova, + (uint64_t)range->size, errno); + goto err_out; + } + + bitmap_clear(qrange->bitmap, bmap_start + start_delta, size / psize); +err_out: + g_free(bmap_clear); + g_free(dbitmap); + return 0; +} + +static int vfio_physical_log_clear(VFIOContainer *container, + MemoryRegionSection *section) +{ + uint64_t start, size, offset, count; + VFIODMARange *qrange; + int ret = 0; + + if (!container->dirty_log_manual_clear) { + /* No need to do explicit clear */ + return ret; + } + + start = section->offset_within_address_space; + size = int128_get64(section->size); + + if (!size) { + return ret; + } + + QLIST_FOREACH(qrange, &container->dma_list, next) { + /* + * Discard ranges that do not overlap the section (e.g., the + * Memory BAR regions of the device) + */ + if (qrange->iova > start + size - 1 || + start > qrange->iova + qrange->size - 1) { + continue; + } + + if (start >= qrange->iova) { + /* The range starts before section or is aligned to it. */ + offset = start - qrange->iova; + count = MIN(qrange->size - offset, size); + } else { + /* The range starts after section. 
*/ + offset = 0; + count = MIN(qrange->size, size - (qrange->iova - start)); + } + ret = vfio_log_clear_one_range(container, qrange, offset, count); + if (ret < 0) { + break; + } + } + + return ret; +} + +static void vfio_listener_log_clear(MemoryListener *listener, + MemoryRegionSection *section) +{ + VFIOContainer *container = container_of(listener, VFIOContainer, listener); + + if (vfio_listener_skipped_section(section) || + !container->dirty_pages_supported) { + return; + } + + if (vfio_devices_all_dirty_tracking(container)) { + vfio_physical_log_clear(container, section); + } +} + const MemoryListener vfio_memory_listener = { .name = "vfio", .region_add = vfio_listener_region_add, @@ -1351,6 +1486,7 @@ const MemoryListener vfio_memory_listener = { .log_global_start = vfio_listener_log_global_start, .log_global_stop = vfio_listener_log_global_stop, .log_sync = vfio_listener_log_sync, + .log_clear = vfio_listener_log_clear, }; void vfio_reset_handler(void *opaque) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 9a176a0d33..d8b9117f4f 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -285,7 +285,9 @@ int vfio_query_dirty_bitmap(VFIOContainer *container, VFIOBitmap *vbmap, dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range)); dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range); - dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP; + dbitmap->flags = container->dirty_log_manual_clear ? + VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR : + VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP; range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data; range->iova = iova; range->size = size; @@ -409,7 +411,7 @@ static int vfio_get_iommu_type(VFIOContainer *container, static int vfio_init_container(VFIOContainer *container, int group_fd, Error **errp) { - int iommu_type, ret; + int iommu_type, dirty_log_manual_clear, ret; iommu_type = vfio_get_iommu_type(container, errp); if (iommu_type < 0) { @@ -438,6 +440,13 @@ static int vfio_init_container(VFIOContainer *container, int group_fd, } container->iommu_type = iommu_type; + + dirty_log_manual_clear = ioctl(container->fd, VFIO_CHECK_EXTENSION, + VFIO_DIRTY_LOG_MANUAL_CLEAR); + if (dirty_log_manual_clear) { + container->dirty_log_manual_clear = dirty_log_manual_clear; + } + return 0; } diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index b131d04c9c..fd9828d50b 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -97,6 +97,7 @@ typedef struct VFIOContainer { Error *error; bool initialized; bool dirty_pages_supported; + bool dirty_log_manual_clear; uint64_t dirty_pgsizes; uint64_t max_dirty_bitmap_size; unsigned long pgsizes; -- Gitee From c8e062285078e688e692214baf97b35246fc2552 Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Tue, 5 May 2020 23:19:17 +0100 Subject: [PATCH 170/939] arm/virt,target/arm: Add new ARMCPU {socket,cluster,core,thread}-id property This shall be used to store user specified topology{socket,cluster,core,thread} and shall be converted to a unique 'vcpu-id' which is used as slot-index during hot(un)plug of vCPU. 
Co-developed-by: Salil Mehta Signed-off-by: Salil Mehta Co-developed-by: Keqian Zhu Signed-off-by: Keqian Zhu Signed-off-by: Salil Mehta --- hw/arm/virt.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++ target/arm/cpu.c | 4 +++ target/arm/cpu.h | 4 +++ 3 files changed, 71 insertions(+) diff --git a/hw/arm/virt.c b/hw/arm/virt.c index f4c3d47f30..94481d45d4 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -227,6 +227,11 @@ static const char *valid_cpus[] = { ARM_CPU_TYPE_NAME("max"), }; +static int virt_get_socket_id(const MachineState *ms, int cpu_index); +static int virt_get_cluster_id(const MachineState *ms, int cpu_index); +static int virt_get_core_id(const MachineState *ms, int cpu_index); +static int virt_get_thread_id(const MachineState *ms, int cpu_index); + static bool cpu_type_valid(const char *cpu) { int i; @@ -2264,6 +2269,14 @@ static void machvirt_init(MachineState *machine) &error_fatal); aarch64 &= object_property_get_bool(cpuobj, "aarch64", NULL); + object_property_set_int(cpuobj, "socket-id", + virt_get_socket_id(machine, n), NULL); + object_property_set_int(cpuobj, "cluster-id", + virt_get_cluster_id(machine, n), NULL); + object_property_set_int(cpuobj, "core-id", + virt_get_core_id(machine, n), NULL); + object_property_set_int(cpuobj, "thread-id", + virt_get_thread_id(machine, n), NULL); if (!vms->secure) { object_property_set_bool(cpuobj, "has_el3", false, NULL); @@ -2750,10 +2763,59 @@ static int64_t virt_get_default_cpu_node_id(const MachineState *ms, int idx) return socket_id % ms->numa_state->num_nodes; } +static int virt_get_socket_id(const MachineState *ms, int cpu_index) +{ + assert(cpu_index >= 0 && cpu_index < ms->possible_cpus->len); + + return ms->possible_cpus->cpus[cpu_index].props.socket_id; +} + +static int virt_get_cluster_id(const MachineState *ms, int cpu_index) +{ + assert(cpu_index >= 0 && cpu_index < ms->possible_cpus->len); + + return ms->possible_cpus->cpus[cpu_index].props.cluster_id; +} + +static int virt_get_core_id(const MachineState *ms, int cpu_index) +{ + assert(cpu_index >= 0 && cpu_index < ms->possible_cpus->len); + + return ms->possible_cpus->cpus[cpu_index].props.core_id; +} + +static int virt_get_thread_id(const MachineState *ms, int cpu_index) +{ + assert(cpu_index >= 0 && cpu_index < ms->possible_cpus->len); + + return ms->possible_cpus->cpus[cpu_index].props.thread_id; +} + +static int +virt_get_cpu_id_from_cpu_topo(const MachineState *ms, DeviceState *dev) +{ + int cpu_id, sock_vcpu_num, clus_vcpu_num, core_vcpu_num; + ARMCPU *cpu = ARM_CPU(dev); + + /* calculate total logical cpus across socket/cluster/core */ + sock_vcpu_num = cpu->socket_id * (ms->smp.threads * ms->smp.cores * + ms->smp.clusters); + clus_vcpu_num = cpu->cluster_id * (ms->smp.threads * ms->smp.cores); + core_vcpu_num = cpu->core_id * ms->smp.threads; + + /* get vcpu-id(logical cpu index) for this vcpu from this topology */ + cpu_id = (sock_vcpu_num + clus_vcpu_num + core_vcpu_num) + cpu->thread_id; + + assert(cpu_id >= 0 && cpu_id < ms->possible_cpus->len); + + return cpu_id; +} + static const CPUArchIdList *virt_possible_cpu_arch_ids(MachineState *ms) { int n; unsigned int max_cpus = ms->smp.max_cpus; + unsigned int smp_threads = ms->smp.threads; VirtMachineState *vms = VIRT_MACHINE(ms); MachineClass *mc = MACHINE_GET_CLASS(vms); @@ -2767,6 +2829,7 @@ static const CPUArchIdList *virt_possible_cpu_arch_ids(MachineState *ms) ms->possible_cpus->len = max_cpus; for (n = 0; n < ms->possible_cpus->len; n++) { ms->possible_cpus->cpus[n].type = ms->cpu_type; + 
ms->possible_cpus->cpus[n].vcpus_count = smp_threads; ms->possible_cpus->cpus[n].arch_id = virt_cpu_mp_affinity(vms, n); diff --git a/target/arm/cpu.c b/target/arm/cpu.c index efb22a87f9..cce315c18a 100644 --- a/target/arm/cpu.c +++ b/target/arm/cpu.c @@ -2422,6 +2422,10 @@ static Property arm_cpu_properties[] = { DEFINE_PROP_UINT64("mp-affinity", ARMCPU, mp_affinity, ARM64_AFFINITY_INVALID), DEFINE_PROP_INT32("node-id", ARMCPU, node_id, CPU_UNSET_NUMA_NODE_ID), + DEFINE_PROP_INT32("socket-id", ARMCPU, socket_id, 0), + DEFINE_PROP_INT32("cluster-id", ARMCPU, cluster_id, 0), + DEFINE_PROP_INT32("core-id", ARMCPU, core_id, 0), + DEFINE_PROP_INT32("thread-id", ARMCPU, thread_id, 0), DEFINE_PROP_INT32("core-count", ARMCPU, core_count, -1), DEFINE_PROP_END_OF_LIST() }; diff --git a/target/arm/cpu.h b/target/arm/cpu.h index a0282e0d28..145d3dbf13 100644 --- a/target/arm/cpu.h +++ b/target/arm/cpu.h @@ -1096,6 +1096,10 @@ struct ArchCPU { QLIST_HEAD(, ARMELChangeHook) el_change_hooks; int32_t node_id; /* NUMA node this CPU belongs to */ + int32_t socket_id; + int32_t cluster_id; + int32_t core_id; + int32_t thread_id; /* Used to synchronize KVM and QEMU in-kernel device levels */ uint8_t device_irq_level; -- Gitee From 444de91551c1e141a76bf3dae4cebee9dbd57b49 Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Wed, 6 May 2020 02:48:49 +0100 Subject: [PATCH 171/939] cpus-common: Add common CPU utility for possible vCPUs Adds various utility functions which might be required to fetch or check the state of the possible vCPUs. This also introduces concept of *disabled* vCPUs, which are part of the *possible* vCPUs but are not part of the *present* vCPU. This state shall be used during machine init time to check the presence of vcpus. Co-developed-by: Salil Mehta Signed-off-by: Salil Mehta Co-developed-by: Keqian Zhu Signed-off-by: Keqian Zhu Signed-off-by: Salil Mehta --- cpu-common.c | 31 +++++++++++++++++++++++++ include/hw/core/cpu.h | 53 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+) diff --git a/cpu-common.c b/cpu-common.c index c81fd72d16..d041a351ab 100644 --- a/cpu-common.c +++ b/cpu-common.c @@ -24,6 +24,7 @@ #include "sysemu/cpus.h" #include "qemu/lockable.h" #include "trace/trace-root.h" +#include "hw/boards.h" QemuMutex qemu_cpu_list_lock; static QemuCond exclusive_cond; @@ -107,6 +108,36 @@ void cpu_list_remove(CPUState *cpu) cpu_list_generation_id++; } +CPUState *qemu_get_possible_cpu(int index) +{ + MachineState *ms = MACHINE(qdev_get_machine()); + const CPUArchIdList *possible_cpus = ms->possible_cpus; + + assert((index >= 0) && (index < possible_cpus->len)); + + return CPU(possible_cpus->cpus[index].cpu); +} + +bool qemu_present_cpu(CPUState *cpu) +{ + return cpu; +} + +bool qemu_enabled_cpu(CPUState *cpu) +{ + return cpu && !cpu->disabled; +} + +uint64_t qemu_get_cpu_archid(int cpu_index) +{ + MachineState *ms = MACHINE(qdev_get_machine()); + const CPUArchIdList *possible_cpus = ms->possible_cpus; + + assert((cpu_index >= 0) && (cpu_index < possible_cpus->len)); + + return possible_cpus->cpus[cpu_index].arch_id; +} + CPUState *qemu_get_cpu(int index) { CPUState *cpu; diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h index c0c8320413..c30636a936 100644 --- a/include/hw/core/cpu.h +++ b/include/hw/core/cpu.h @@ -538,6 +538,17 @@ struct CPUState { GArray *plugin_mem_cbs; #endif + /* + * Some architectures do not allow *presence* of vCPUs to be changed + * after guest has booted using information specified by VMM/firmware + * via ACPI MADT at the 
boot time. Thus to enable vCPU hotplug on these + * architectures possible vCPU can have CPUState object in 'disabled' + * state or can also not have CPUState object at all. This is possible + * when vCPU Hotplug is supported and vCPUs are 'yet-to-be-plugged' in + * the QOM or have been hot-unplugged. + * By default every CPUState is enabled as of now across all archs. + */ + bool disabled; /* TODO Move common fields from CPUArchState here. */ int cpu_index; int cluster_index; @@ -913,6 +924,48 @@ static inline bool cpu_in_exclusive_context(const CPUState *cpu) */ CPUState *qemu_get_cpu(int index); +/** + * qemu_get_possible_cpu: + * @index: The CPUState@cpu_index value of the CPU to obtain. + * Input index MUST be in range [0, Max Possible CPUs) + * + * If CPUState object exists,then it gets a CPU matching + * @index in the possible CPU array. + * + * Returns: The possible CPU or %NULL if CPU does not exist. + */ +CPUState *qemu_get_possible_cpu(int index); + +/** + * qemu_present_cpu: + * @cpu: The vCPU to check + * + * Checks if the vCPU is amongst the present possible vcpus. + * + * Returns: True if it is present possible vCPU else false + */ +bool qemu_present_cpu(CPUState *cpu); + +/** + * qemu_enabled_cpu: + * @cpu: The vCPU to check + * + * Checks if the vCPU is enabled. + * + * Returns: True if it is 'enabled' else false + */ +bool qemu_enabled_cpu(CPUState *cpu); + +/** + * qemu_get_cpu_archid: + * @cpu_index: possible vCPU for which arch-id needs to be retreived + * + * Fetches the vCPU arch-id from the present possible vCPUs. + * + * Returns: arch-id of the possible vCPU + */ +uint64_t qemu_get_cpu_archid(int cpu_index); + /** * cpu_exists: * @id: Guest-exposed CPU ID to lookup. -- Gitee From 8daa90ad502b79e232377f831f67df456a743304 Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Sat, 26 Aug 2023 01:29:37 +0000 Subject: [PATCH 172/939] hw/arm/virt: Move setting of common CPU properties in a function Factor out CPU properties code common for {hot,cold}-plugged CPUs. This allows code reuse. 
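For illustration, the two call sites this helper ends up serving later in the series look roughly as follows (abridged fragments, not complete functions):

    /* machine init, for possible vCPUs that are not realized yet */
    virt_cpu_set_properties(cpuobj, cpu_slot, &error_fatal);

    /* (hot)plug path, from virt_cpu_pre_plug() */
    virt_cpu_set_properties(OBJECT(cs), cpu_slot, errp);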
Signed-off-by: Salil Mehta --- hw/arm/virt.c | 220 ++++++++++++++++++++++++++---------------- include/hw/arm/virt.h | 4 + 2 files changed, 140 insertions(+), 84 deletions(-) diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 94481d45d4..8f647422d8 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -2113,16 +2113,130 @@ static void virt_cpu_post_init(VirtMachineState *vms, MemoryRegion *sysmem) } } +static void virt_cpu_set_properties(Object *cpuobj, const CPUArchId *cpu_slot, + Error **errp) +{ + MachineState *ms = MACHINE(qdev_get_machine()); + VirtMachineState *vms = VIRT_MACHINE(ms); + Error *local_err = NULL; + VirtMachineClass *vmc; + + vmc = VIRT_MACHINE_GET_CLASS(ms); + + /* now, set the cpu object property values */ + numa_cpu_pre_plug(cpu_slot, DEVICE(cpuobj), &local_err); + if (local_err) { + goto out; + } + + object_property_set_int(cpuobj, "mp-affinity", cpu_slot->arch_id, NULL); + + if (!vms->secure) { + object_property_set_bool(cpuobj, "has_el3", false, NULL); + } + + if (!vms->virt && object_property_find(cpuobj, "has_el2")) { + object_property_set_bool(cpuobj, "has_el2", false, NULL); + } + + if (vmc->kvm_no_adjvtime && + object_property_find(cpuobj, "kvm-no-adjvtime")) { + object_property_set_bool(cpuobj, "kvm-no-adjvtime", true, NULL); + } + + if (vmc->no_kvm_steal_time && + object_property_find(cpuobj, "kvm-steal-time")) { + object_property_set_bool(cpuobj, "kvm-steal-time", false, NULL); + } + + if (vmc->no_pmu && object_property_find(cpuobj, "pmu")) { + object_property_set_bool(cpuobj, "pmu", false, NULL); + } + + if (vmc->no_tcg_lpa2 && object_property_find(cpuobj, "lpa2")) { + object_property_set_bool(cpuobj, "lpa2", false, NULL); + } + + if (object_property_find(cpuobj, "reset-cbar")) { + object_property_set_int(cpuobj, "reset-cbar", + vms->memmap[VIRT_CPUPERIPHS].base, + &local_err); + if (local_err) { + goto out; + } + } + + /* link already initialized {secure,tag}-memory regions to this cpu */ + object_property_set_link(cpuobj, "memory", OBJECT(vms->sysmem), &local_err); + if (local_err) { + goto out; + } + + if (vms->secure) { + object_property_set_link(cpuobj, "secure-memory", + OBJECT(vms->secure_sysmem), &local_err); + if (local_err) { + goto out; + } + } + + if (vms->mte) { + if (!object_property_find(cpuobj, "tag-memory")) { + error_setg(&local_err, "MTE requested, but not supported " + "by the guest CPU"); + if (local_err) { + goto out; + } + } + + object_property_set_link(cpuobj, "tag-memory", OBJECT(vms->tag_sysmem), + &local_err); + if (local_err) { + goto out; + } + + if (vms->secure) { + object_property_set_link(cpuobj, "secure-tag-memory", + OBJECT(vms->secure_tag_sysmem), + &local_err); + if (local_err) { + goto out; + } + } + } + + /* + * RFC: Question: this must only be called for the hotplugged cpus. For the + * cold booted secondary cpus this is being taken care in arm_load_kernel() + * in boot.c. Perhaps we should remove that code now? 
+ */ + if (vms->psci_conduit != QEMU_PSCI_CONDUIT_DISABLED) { + object_property_set_int(cpuobj, "psci-conduit", vms->psci_conduit, + NULL); + + /* Secondary CPUs start in PSCI powered-down state */ + if (CPU(cpuobj)->cpu_index > 0) { + object_property_set_bool(cpuobj, "start-powered-off", true, NULL); + } + } + +out: + if (local_err) { + error_propagate(errp, local_err); + } + return; +} + static void machvirt_init(MachineState *machine) { VirtMachineState *vms = VIRT_MACHINE(machine); VirtMachineClass *vmc = VIRT_MACHINE_GET_CLASS(machine); MachineClass *mc = MACHINE_GET_CLASS(machine); const CPUArchIdList *possible_cpus; - MemoryRegion *sysmem = get_system_memory(); + MemoryRegion *secure_tag_sysmem = NULL; MemoryRegion *secure_sysmem = NULL; MemoryRegion *tag_sysmem = NULL; - MemoryRegion *secure_tag_sysmem = NULL; + MemoryRegion *sysmem; int n, virt_max_cpus; bool firmware_loaded; bool aarch64 = true; @@ -2166,6 +2280,8 @@ static void machvirt_init(MachineState *machine) */ finalize_gic_version(vms); + sysmem = vms->sysmem = get_system_memory(); + if (vms->secure) { /* * The Secure view of the world is the same as the NonSecure, @@ -2173,7 +2289,7 @@ static void machvirt_init(MachineState *machine) * containing the system memory at low priority; any secure-only * devices go in at higher priority and take precedence. */ - secure_sysmem = g_new(MemoryRegion, 1); + secure_sysmem = vms->secure_sysmem = g_new(MemoryRegion, 1); memory_region_init(secure_sysmem, OBJECT(machine), "secure-memory", UINT64_MAX); memory_region_add_subregion_overlap(secure_sysmem, 0, sysmem, -1); @@ -2246,6 +2362,23 @@ static void machvirt_init(MachineState *machine) exit(1); } + if (vms->mte) { + /* Create the memory region only once, but link to all cpus later */ + tag_sysmem = vms->tag_sysmem = g_new(MemoryRegion, 1); + memory_region_init(tag_sysmem, OBJECT(machine), + "tag-memory", UINT64_MAX / 32); + + if (vms->secure) { + secure_tag_sysmem = vms->secure_tag_sysmem = g_new(MemoryRegion, 1); + memory_region_init(secure_tag_sysmem, OBJECT(machine), + "secure-tag-memory", UINT64_MAX / 32); + + /* As with ram, secure-tag takes precedence over tag. 
*/ + memory_region_add_subregion_overlap(secure_tag_sysmem, 0, + tag_sysmem, -1); + } + } + create_fdt(vms); qemu_log("cpu init start\n"); @@ -2259,15 +2392,10 @@ static void machvirt_init(MachineState *machine) } cpuobj = object_new(possible_cpus->cpus[n].type); - object_property_set_int(cpuobj, "mp-affinity", - possible_cpus->cpus[n].arch_id, NULL); cs = CPU(cpuobj); cs->cpu_index = n; - numa_cpu_pre_plug(&possible_cpus->cpus[cs->cpu_index], DEVICE(cpuobj), - &error_fatal); - aarch64 &= object_property_get_bool(cpuobj, "aarch64", NULL); object_property_set_int(cpuobj, "socket-id", virt_get_socket_id(machine, n), NULL); @@ -2278,82 +2406,6 @@ static void machvirt_init(MachineState *machine) object_property_set_int(cpuobj, "thread-id", virt_get_thread_id(machine, n), NULL); - if (!vms->secure) { - object_property_set_bool(cpuobj, "has_el3", false, NULL); - } - - if (!vms->virt && object_property_find(cpuobj, "has_el2")) { - object_property_set_bool(cpuobj, "has_el2", false, NULL); - } - - if (vmc->kvm_no_adjvtime && - object_property_find(cpuobj, "kvm-no-adjvtime")) { - object_property_set_bool(cpuobj, "kvm-no-adjvtime", true, NULL); - } - - if (vmc->no_kvm_steal_time && - object_property_find(cpuobj, "kvm-steal-time")) { - object_property_set_bool(cpuobj, "kvm-steal-time", false, NULL); - } - - if (vmc->no_pmu && object_property_find(cpuobj, "pmu")) { - object_property_set_bool(cpuobj, "pmu", false, NULL); - } - - if (vmc->no_tcg_lpa2 && object_property_find(cpuobj, "lpa2")) { - object_property_set_bool(cpuobj, "lpa2", false, NULL); - } - - if (object_property_find(cpuobj, "reset-cbar")) { - object_property_set_int(cpuobj, "reset-cbar", - vms->memmap[VIRT_CPUPERIPHS].base, - &error_abort); - } - - object_property_set_link(cpuobj, "memory", OBJECT(sysmem), - &error_abort); - if (vms->secure) { - object_property_set_link(cpuobj, "secure-memory", - OBJECT(secure_sysmem), &error_abort); - } - - if (vms->mte) { - /* Create the memory region only once, but link to all cpus. */ - if (!tag_sysmem) { - /* - * The property exists only if MemTag is supported. - * If it is, we must allocate the ram to back that up. - */ - if (!object_property_find(cpuobj, "tag-memory")) { - error_report("MTE requested, but not supported " - "by the guest CPU"); - exit(1); - } - - tag_sysmem = g_new(MemoryRegion, 1); - memory_region_init(tag_sysmem, OBJECT(machine), - "tag-memory", UINT64_MAX / 32); - - if (vms->secure) { - secure_tag_sysmem = g_new(MemoryRegion, 1); - memory_region_init(secure_tag_sysmem, OBJECT(machine), - "secure-tag-memory", UINT64_MAX / 32); - - /* As with ram, secure-tag takes precedence over tag. 
*/ - memory_region_add_subregion_overlap(secure_tag_sysmem, 0, - tag_sysmem, -1); - } - } - - object_property_set_link(cpuobj, "tag-memory", OBJECT(tag_sysmem), - &error_abort); - if (vms->secure) { - object_property_set_link(cpuobj, "secure-tag-memory", - OBJECT(secure_tag_sysmem), - &error_abort); - } - } - qdev_realize(DEVICE(cpuobj), NULL, &error_fatal); object_unref(cpuobj); } diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h index e944d434c4..49d1ec8656 100644 --- a/include/hw/arm/virt.h +++ b/include/hw/arm/virt.h @@ -139,6 +139,10 @@ struct VirtMachineState { DeviceState *platform_bus_dev; FWCfgState *fw_cfg; PFlashCFI01 *flash[2]; + MemoryRegion *sysmem; + MemoryRegion *secure_sysmem; + MemoryRegion *tag_sysmem; + MemoryRegion *secure_tag_sysmem; bool secure; bool highmem; bool highmem_compact; -- Gitee From 7cd2d7ef7bb7f6c6a97988d86b97922ff700ab06 Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Wed, 6 May 2020 00:13:31 +0100 Subject: [PATCH 173/939] arm/virt,target/arm: Machine init time change common to vCPU {cold|hot}-plug Refactor and introduce the common logic required during the initialization of both cold and hot plugged vCPUs. Also initialize the *disabled* state of the vCPUs which shall be used further during init phases of various other components like GIC, PMU, ACPI etc as part of the virt machine initialization. KVM vCPUs corresponding to unplugged/yet-to-be-plugged QOM CPUs are kept in powered-off state in the KVM Host and do not run the guest code. Plugged vCPUs are also kept in powered-off state but vCPU threads exist and is kept sleeping. TBD: For the cold booted vCPUs, this change also exists in the arm_load_kernel() in boot.c but for the hotplugged CPUs this change should still remain part of the pre-plug phase. We are duplicating the powering-off of the cold booted CPUs. Shall we remove the duplicate change from boot.c? 
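As a hedged sketch of how later init-time consumers (GIC, PMU, ACPI) are expected to honour the disabled state, assuming the possible-vCPU helpers introduced earlier in this series (not a complete QEMU function):

    static int count_enabled_possible_vcpus(MachineState *ms)
    {
        int n, enabled = 0;

        for (n = 0; n < ms->possible_cpus->len; n++) {
            CPUState *cs = qemu_get_possible_cpu(n);

            /* a slot may be empty (not present), or present but disabled */
            if (qemu_present_cpu(cs) && qemu_enabled_cpu(cs)) {
                enabled++;
            }
        }
        return enabled;
    }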
Co-developed-by: Salil Mehta Signed-off-by: Salil Mehta Co-developed-by: Keqian Zhu Signed-off-by: Keqian Zhu Reported-by: Gavin Shan [GS: pointed the assertion due to wrong range check] Signed-off-by: Salil Mehta --- hw/arm/virt.c | 149 ++++++++++++++++++++++++++++++++++++++++----- target/arm/cpu.c | 7 +++ target/arm/cpu64.c | 14 +++++ 3 files changed, 156 insertions(+), 14 deletions(-) diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 8f647422d8..2f04bc7666 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -227,6 +227,7 @@ static const char *valid_cpus[] = { ARM_CPU_TYPE_NAME("max"), }; +static CPUArchId *virt_find_cpu_slot(MachineState *ms, int vcpuid); static int virt_get_socket_id(const MachineState *ms, int cpu_index); static int virt_get_cluster_id(const MachineState *ms, int cpu_index); static int virt_get_core_id(const MachineState *ms, int cpu_index); @@ -2249,6 +2250,14 @@ static void machvirt_init(MachineState *machine) exit(1); } + finalize_gic_version(vms); + if (tcg_enabled() || hvf_enabled() || qtest_enabled() || + (vms->gic_version < VIRT_GIC_VERSION_3)) { + machine->smp.max_cpus = smp_cpus; + mc->has_hotpluggable_cpus = false; + warn_report("cpu hotplug feature has been disabled"); + } + possible_cpus = mc->possible_cpu_arch_ids(machine); /* @@ -2275,11 +2284,6 @@ static void machvirt_init(MachineState *machine) virt_set_memmap(vms, pa_bits); } - /* We can probe only here because during property set - * KVM is not available yet - */ - finalize_gic_version(vms); - sysmem = vms->sysmem = get_system_memory(); if (vms->secure) { @@ -2385,17 +2389,9 @@ static void machvirt_init(MachineState *machine) assert(possible_cpus->len == max_cpus); for (n = 0; n < possible_cpus->len; n++) { Object *cpuobj; - CPUState *cs; - - if (n >= smp_cpus) { - break; - } cpuobj = object_new(possible_cpus->cpus[n].type); - cs = CPU(cpuobj); - cs->cpu_index = n; - aarch64 &= object_property_get_bool(cpuobj, "aarch64", NULL); object_property_set_int(cpuobj, "socket-id", virt_get_socket_id(machine, n), NULL); @@ -2902,6 +2898,50 @@ static const CPUArchIdList *virt_possible_cpu_arch_ids(MachineState *ms) return ms->possible_cpus; } +static CPUArchId *virt_find_cpu_slot(MachineState *ms, int vcpuid) +{ + VirtMachineState *vms = VIRT_MACHINE(ms); + CPUArchId *found_cpu; + uint64_t mp_affinity; + + assert(vcpuid >= 0 && vcpuid < ms->possible_cpus->len); + + /* + * RFC: Question: + * TBD: Should mp-affinity be treated as MPIDR? + */ + mp_affinity = virt_cpu_mp_affinity(vms, vcpuid); + found_cpu = &ms->possible_cpus->cpus[vcpuid]; + + assert(found_cpu->arch_id == mp_affinity); + + /* + * RFC: Question: + * Slot-id is the index where vCPU with certain arch-id(=mpidr/ap-affinity) + * is plugged. For Host KVM, MPIDR for vCPU is derived using vcpu-id. + * As I understand, MPIDR and vcpu-id are property of vCPU but slot-id is + * more related to machine? Current code assumes slot-id and vcpu-id are + * same i.e. meaning of slot is bit vague. + * + * Q1: Is there any requirement to clearly represent slot and dissociate it + * from vcpu-id? + * Q2: Should we make MPIDR within host KVM user configurable? 
+ * + * +----+----+----+----+----+----+----+----+ + * MPIDR ||| Res | Aff2 | Aff1 | Aff0 | + * +----+----+----+----+----+----+----+----+ + * \ \ \ | | + * \ 8bit \ 8bit \ |4bit| + * \<------->\<------->\ |<-->| + * \ \ \| | + * +----+----+----+----+----+----+----+----+ + * VCPU-ID | Byte4 | Byte2 | Byte1 | Byte0 | + * +----+----+----+----+----+----+----+----+ + */ + + return found_cpu; +} + static void virt_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { @@ -2945,6 +2985,81 @@ static void virt_memory_plug(HotplugHandler *hotplug_dev, dev, &error_abort); } +static void virt_cpu_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, + Error **errp) +{ + VirtMachineState *vms = VIRT_MACHINE(hotplug_dev); + MachineState *ms = MACHINE(hotplug_dev); + ARMCPU *cpu = ARM_CPU(dev); + CPUState *cs = CPU(dev); + CPUArchId *cpu_slot; + int32_t min_cpuid = 0; + int32_t max_cpuid; + + /* sanity check the cpu */ + if (!object_dynamic_cast(OBJECT(cpu), ms->cpu_type)) { + error_setg(errp, "Invalid CPU type, expected cpu type: '%s'", + ms->cpu_type); + return; + } + + if ((cpu->thread_id < 0) || (cpu->thread_id >= ms->smp.threads)) { + error_setg(errp, "Invalid thread-id %u specified, correct range 0:%u", + cpu->thread_id, ms->smp.threads - 1); + return; + } + + max_cpuid = ms->possible_cpus->len - 1; + if (!dev->hotplugged) { + min_cpuid = vms->acpi_dev ? ms->smp.cpus : 0; + max_cpuid = vms->acpi_dev ? max_cpuid : ms->smp.cpus - 1; + } + + if ((cpu->core_id < min_cpuid) || (cpu->core_id > max_cpuid)) { + error_setg(errp, "Invalid core-id %d specified, correct range %d:%d", + cpu->core_id, min_cpuid, max_cpuid); + return; + } + + if ((cpu->cluster_id < 0) || (cpu->cluster_id >= ms->smp.clusters)) { + error_setg(errp, "Invalid cluster-id %u specified, correct range 0:%u", + cpu->cluster_id, ms->smp.clusters - 1); + return; + } + + if ((cpu->socket_id < 0) || (cpu->socket_id >= ms->smp.sockets)) { + error_setg(errp, "Invalid socket-id %u specified, correct range 0:%u", + cpu->socket_id, ms->smp.sockets - 1); + return; + } + + cs->cpu_index = virt_get_cpu_id_from_cpu_topo(ms, dev); + + cpu_slot = virt_find_cpu_slot(ms, cs->cpu_index); + if (qemu_present_cpu(CPU(cpu_slot->cpu))) { + error_setg(errp, "cpu(id%d=%d:%d:%d:%d) with arch-id %" PRIu64 " exist", + cs->cpu_index, cpu->socket_id, cpu->cluster_id, cpu->core_id, + cpu->thread_id, cpu_slot->arch_id); + return; + } + virt_cpu_set_properties(OBJECT(cs), cpu_slot, errp); +} + +static void virt_cpu_plug(HotplugHandler *hotplug_dev, DeviceState *dev, + Error **errp) +{ + MachineState *ms = MACHINE(hotplug_dev); + CPUState *cs = CPU(dev); + CPUArchId *cpu_slot; + + /* insert the cold/hot-plugged vcpu in the slot */ + cpu_slot = virt_find_cpu_slot(ms, cs->cpu_index); + cpu_slot->cpu = OBJECT(dev); + + cs->disabled = false; + return; +} + static void virt_machine_device_pre_plug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { @@ -2987,6 +3102,8 @@ static void virt_machine_device_pre_plug_cb(HotplugHandler *hotplug_dev, qlist_append_str(reserved_regions, resv_prop_str); qdev_prop_set_array(dev, "reserved-regions", reserved_regions); g_free(resv_prop_str); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { + virt_cpu_pre_plug(hotplug_dev, dev, errp); } } @@ -3008,6 +3125,8 @@ static void virt_machine_device_plug_cb(HotplugHandler *hotplug_dev, virt_memory_plug(hotplug_dev, dev, errp); } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_PCI)) { virtio_md_pci_plug(VIRTIO_MD_PCI(dev), MACHINE(hotplug_dev), 
errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { + virt_cpu_plug(hotplug_dev, dev, errp); } if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI)) { @@ -3092,7 +3211,8 @@ static HotplugHandler *virt_machine_get_hotplug_handler(MachineState *machine, if (device_is_dynamic_sysbus(mc, dev) || object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM) || object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_PCI) || - object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI)) { + object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI) || + object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { return HOTPLUG_HANDLER(machine); } return NULL; @@ -3169,6 +3289,7 @@ static void virt_machine_class_init(ObjectClass *oc, void *data) #endif mc->get_default_cpu_node_id = virt_get_default_cpu_node_id; mc->kvm_type = virt_kvm_type; + mc->has_hotpluggable_cpus = true; assert(!mc->get_hotplug_handler); mc->get_hotplug_handler = virt_machine_get_hotplug_handler; hc->pre_plug = virt_machine_device_pre_plug_cb; diff --git a/target/arm/cpu.c b/target/arm/cpu.c index cce315c18a..18b8a79c8f 100644 --- a/target/arm/cpu.c +++ b/target/arm/cpu.c @@ -2477,6 +2477,12 @@ static const struct TCGCPUOps arm_tcg_ops = { }; #endif /* CONFIG_TCG */ +static int64_t arm_cpu_get_arch_id(CPUState *cs) +{ + ARMCPU *cpu = ARM_CPU(cs); + return cpu->mp_affinity; +} + static void arm_cpu_class_init(ObjectClass *oc, void *data) { ARMCPUClass *acc = ARM_CPU_CLASS(oc); @@ -2495,6 +2501,7 @@ static void arm_cpu_class_init(ObjectClass *oc, void *data) cc->class_by_name = arm_cpu_class_by_name; cc->has_work = arm_cpu_has_work; cc->dump_state = arm_cpu_dump_state; + cc->get_arch_id = arm_cpu_get_arch_id; cc->set_pc = arm_cpu_set_pc; cc->get_pc = arm_cpu_get_pc; cc->gdb_read_register = arm_cpu_gdb_read_register; diff --git a/target/arm/cpu64.c b/target/arm/cpu64.c index 471014b5a9..e226b60b72 100644 --- a/target/arm/cpu64.c +++ b/target/arm/cpu64.c @@ -850,6 +850,17 @@ static void aarch64_cpu_set_aarch64(Object *obj, bool value, Error **errp) } } +static void aarch64_cpu_initfn(Object *obj) +{ + CPUState *cs = CPU(obj); + + /* + * we start every ARM64 vcpu as disabled possible vCPU. It needs to be + * enabled explicitly + */ + cs->disabled = true; +} + static void aarch64_cpu_finalizefn(Object *obj) { } @@ -862,7 +873,9 @@ static const gchar *aarch64_gdb_arch_name(CPUState *cs) static void aarch64_cpu_class_init(ObjectClass *oc, void *data) { CPUClass *cc = CPU_CLASS(oc); + DeviceClass *dc = DEVICE_CLASS(oc); + dc->user_creatable = true; cc->gdb_read_register = aarch64_cpu_gdb_read_register; cc->gdb_write_register = aarch64_cpu_gdb_write_register; cc->gdb_num_core_regs = 34; @@ -908,6 +921,7 @@ void aarch64_cpu_register(const ARMCPUInfo *info) static const TypeInfo aarch64_cpu_type_info = { .name = TYPE_AARCH64_CPU, .parent = TYPE_ARM_CPU, + .instance_init = aarch64_cpu_initfn, .instance_finalize = aarch64_cpu_finalizefn, .abstract = true, .class_init = aarch64_cpu_class_init, -- Gitee From 6999ced63ca3bb05a1cbc4a667bd9fd27eeaeaee Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Sat, 9 Sep 2023 00:04:04 +0000 Subject: [PATCH 174/939] accel/kvm: Extract common KVM vCPU {creation,parking} code KVM vCPU creation is done once during the initialization of the VM when Qemu threads are spawned. This is common to all the architectures. If the architecture supports vCPU hot-{un}plug then this KVM vCPU creation could be deferred to later point as well. Some architectures might in any case create KVM vCPUs for the yet-to-be plugged vCPUs (i.e. 
QoM Object & thread does not exists) during VM init time and park them. Hot-unplug of vCPU results in destruction of the vCPU objects in QOM but the KVM vCPU objects in the Host KVM are not destroyed and their representative KVM vCPU objects in Qemu are parked. Signed-off-by: Salil Mehta --- accel/kvm/kvm-all.c | 61 ++++++++++++++++++++++++++++++++++---------- include/sysemu/kvm.h | 2 ++ 2 files changed, 49 insertions(+), 14 deletions(-) diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index d900df93a4..6d503aa614 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -136,6 +136,7 @@ static QemuMutex kml_slots_lock; #define kvm_slots_unlock() qemu_mutex_unlock(&kml_slots_lock) static void kvm_slot_init_dirty_bitmap(KVMSlot *mem); +static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id); static inline void kvm_resample_fd_remove(int gsi) { @@ -324,11 +325,51 @@ err: return ret; } +void kvm_park_vcpu(CPUState *cpu) +{ + unsigned long vcpu_id = cpu->cpu_index; + struct KVMParkedVcpu *vcpu; + + vcpu = g_malloc0(sizeof(*vcpu)); + vcpu->vcpu_id = vcpu_id; + vcpu->kvm_fd = cpu->kvm_fd; + QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node); +} + +int kvm_create_vcpu(CPUState *cpu) +{ + unsigned long vcpu_id = cpu->cpu_index; + KVMState *s = kvm_state; + int ret; + + DPRINTF("kvm_create_vcpu\n"); + + /* check if the KVM vCPU already exist but is parked */ + ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu)); + if (ret > 0) { + goto found; + } + + /* create a new KVM vcpu */ + ret = kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id); + if (ret < 0) { + return ret; + } + +found: + cpu->vcpu_dirty = true; + cpu->kvm_fd = ret; + cpu->kvm_state = s; + cpu->dirty_pages = 0; + cpu->throttle_us_per_full = 0; + + return 0; +} + static int do_kvm_destroy_vcpu(CPUState *cpu) { KVMState *s = kvm_state; long mmap_size; - struct KVMParkedVcpu *vcpu = NULL; int ret = 0; DPRINTF("kvm_destroy_vcpu\n"); @@ -357,10 +398,7 @@ static int do_kvm_destroy_vcpu(CPUState *cpu) } } - vcpu = g_malloc0(sizeof(*vcpu)); - vcpu->vcpu_id = kvm_arch_vcpu_id(cpu); - vcpu->kvm_fd = cpu->kvm_fd; - QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node); + kvm_park_vcpu(cpu); err: return ret; } @@ -388,7 +426,7 @@ static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id) } } - return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id); + return -1; } int kvm_init_vcpu(CPUState *cpu, Error **errp) @@ -399,19 +437,14 @@ int kvm_init_vcpu(CPUState *cpu, Error **errp) trace_kvm_init_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu)); - ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu)); + ret = kvm_create_vcpu(cpu); if (ret < 0) { - error_setg_errno(errp, -ret, "kvm_init_vcpu: kvm_get_vcpu failed (%lu)", + error_setg_errno(errp, -ret, + "kvm_init_vcpu: kvm_create_vcpu failed (%lu)", kvm_arch_vcpu_id(cpu)); goto err; } - cpu->kvm_fd = ret; - cpu->kvm_state = s; - cpu->vcpu_dirty = true; - cpu->dirty_pages = 0; - cpu->throttle_us_per_full = 0; - mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0); if (mmap_size < 0) { ret = mmap_size; diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index b46d6203b4..e534411ddc 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -434,6 +434,8 @@ void kvm_set_sigmask_len(KVMState *s, unsigned int sigmask_len); int kvm_physical_memory_addr_from_host(KVMState *s, void *ram_addr, hwaddr *phys_addr); +int kvm_create_vcpu(CPUState *cpu); +void kvm_park_vcpu(CPUState *cpu); #endif /* NEED_CPU_H */ -- Gitee From 2669fd26cbc36e24ebfc844c240b45ad831701cc Mon Sep 17 00:00:00 2001 From: 
Salil Mehta Date: Tue, 5 May 2020 18:44:59 +0100 Subject: [PATCH 175/939] arm/virt,kvm: Pre-create disabled possible vCPUs @machine init In ARMv8 architecture, GIC needs all the vCPUs to be created and present when it is initialized. This is because: 1. GICC and MPIDR association must be fixed at the VM initialization time. This is represented by register GIC_TYPER(mp_afffinity, proc_num) 2. GICC(cpu interfaces), GICR(redistributors) etc all must be initialized at the boot time as well. 3. Memory regions associated with GICR etc. cannot be changed(add/del/mod) after VM has inited. This patch adds the support to pre-create all such possible vCPUs within the host using the KVM interface as part of the virt machine initialization. These vCPUs could later be attached to QOM/ACPI while they are actually hot plugged and made present. Co-developed-by: Salil Mehta Signed-off-by: Salil Mehta Co-developed-by: Keqian Zhu Signed-off-by: Keqian Zhu Reported-by: Vishnu Pajjuri [VP: Identified CPU stall issue & suggested probable fix] Signed-off-by: Salil Mehta --- hw/arm/virt.c | 53 +++++++++++++++++++++++++++++++++++++++++-- include/hw/core/cpu.h | 1 + target/arm/cpu64.c | 1 + target/arm/kvm.c | 32 ++++++++++++++++++++++++++ target/arm/kvm64.c | 9 +++++++- target/arm/kvm_arm.h | 11 +++++++++ 6 files changed, 104 insertions(+), 3 deletions(-) diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 2f04bc7666..f10d75366b 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -2389,8 +2389,10 @@ static void machvirt_init(MachineState *machine) assert(possible_cpus->len == max_cpus); for (n = 0; n < possible_cpus->len; n++) { Object *cpuobj; + CPUState *cs; cpuobj = object_new(possible_cpus->cpus[n].type); + cs = CPU(cpuobj); aarch64 &= object_property_get_bool(cpuobj, "aarch64", NULL); object_property_set_int(cpuobj, "socket-id", @@ -2402,8 +2404,55 @@ static void machvirt_init(MachineState *machine) object_property_set_int(cpuobj, "thread-id", virt_get_thread_id(machine, n), NULL); - qdev_realize(DEVICE(cpuobj), NULL, &error_fatal); - object_unref(cpuobj); + if (n < smp_cpus) { + qdev_realize(DEVICE(cpuobj), NULL, &error_fatal); + object_unref(cpuobj); + } else { + CPUArchId *cpu_slot; + + /* handling for vcpus which are yet to be hot-plugged */ + cs->cpu_index = n; + cpu_slot = virt_find_cpu_slot(machine, cs->cpu_index); + + /* + * ARM host vCPU features need to be fixed at the boot time. But as + * per current approach this CPU object will be destroyed during + * cpu_post_init(). During hotplug of vCPUs these properties are + * initialized again. + */ + virt_cpu_set_properties(cpuobj, cpu_slot, &error_fatal); + + /* + * For KVM, we shall be pre-creating the now disabled/un-plugged + * possbile host vcpus and park them till the time they are + * actually hot plugged. This is required to pre-size the host + * GICC and GICR with the all possible vcpus for this VM. + */ + if (kvm_enabled()) { + kvm_arm_create_host_vcpu(ARM_CPU(cs)); + } + /* + * Add disabled vCPU to CPU slot during the init phase of the virt + * machine + * 1. We need this ARMCPU object during the GIC init. This object + * will facilitate in pre-realizing the GIC. Any info like + * mp-affinity(required to derive gicr_type) etc. could still be + * fetched while preserving QOM abstraction akin to realized + * vCPUs. + * 2. Now, after initialization of the virt machine is complete we + * could use two approaches to deal with this ARMCPU object: + * (i) re-use this ARMCPU object during hotplug of this vCPU. 
+ * OR + * (ii) defer release this ARMCPU object after gic has been + * initialized or during pre-plug phase when a vCPU is + * hotplugged. + * + * We will use the (ii) approach and release the ARMCPU objects + * after GIC and machine has been fully initialized during + * machine_init_done() phase. + */ + cpu_slot->cpu = OBJECT(cs); + } } fdt_add_timer_nodes(vms); fdt_add_cpu_nodes(vms); diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h index c30636a936..fdfb952259 100644 --- a/include/hw/core/cpu.h +++ b/include/hw/core/cpu.h @@ -528,6 +528,7 @@ struct CPUState { uint32_t kvm_fetch_index; uint64_t dirty_pages; int kvm_vcpu_stats_fd; + VMChangeStateEntry *vmcse; /* Use by accel-block: CPU is executing an ioctl() */ QemuLockCnt in_ioctl_lock; diff --git a/target/arm/cpu64.c b/target/arm/cpu64.c index e226b60b72..5d28838175 100644 --- a/target/arm/cpu64.c +++ b/target/arm/cpu64.c @@ -859,6 +859,7 @@ static void aarch64_cpu_initfn(Object *obj) * enabled explicitly */ cs->disabled = true; + cs->thread_id = 0; } static void aarch64_cpu_finalizefn(Object *obj) diff --git a/target/arm/kvm.c b/target/arm/kvm.c index f59f4f81b2..70cf15b550 100644 --- a/target/arm/kvm.c +++ b/target/arm/kvm.c @@ -659,6 +659,38 @@ void kvm_arm_reset_vcpu(ARMCPU *cpu) write_list_to_cpustate(cpu); } +void kvm_arm_create_host_vcpu(ARMCPU *cpu) +{ + CPUState *cs = CPU(cpu); + unsigned long vcpu_id = cs->cpu_index; + int ret; + + ret = kvm_create_vcpu(cs); + if (ret < 0) { + error_report("Failed to create host vcpu %ld", vcpu_id); + abort(); + } + + /* + * Initialize the vCPU in the host. This will reset the sys regs + * for this vCPU and related registers like MPIDR_EL1 etc. also + * gets programmed during this call to host. These are referred + * later while setting device attributes of the GICR during GICv3 + * reset + */ + ret = kvm_arch_init_vcpu(cs); + if (ret < 0) { + error_report("Failed to initialize host vcpu %ld", vcpu_id); + abort(); + } + + /* + * park the created vCPU. shall be used during kvm_get_vcpu() when + * threads are created during realization of ARM vCPUs. + */ + kvm_park_vcpu(cs); +} + /* * Update KVM's MP_STATE based on what QEMU thinks it is */ diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c index 3c175c93a7..03ce1e7525 100644 --- a/target/arm/kvm64.c +++ b/target/arm/kvm64.c @@ -562,7 +562,14 @@ int kvm_arch_init_vcpu(CPUState *cs) return -EINVAL; } - qemu_add_vm_change_state_handler(kvm_arm_vm_state_change, cs); + /* + * Install VM change handler only when vCPU thread has been spawned + * i.e. vCPU is being realized + */ + if (cs->thread_id) { + cs->vmcse = qemu_add_vm_change_state_handler(kvm_arm_vm_state_change, + cs); + } /* Determine init features for this CPU */ memset(cpu->kvm_init_features, 0, sizeof(cpu->kvm_init_features)); diff --git a/target/arm/kvm_arm.h b/target/arm/kvm_arm.h index 051a0da41c..31408499b3 100644 --- a/target/arm/kvm_arm.h +++ b/target/arm/kvm_arm.h @@ -163,6 +163,17 @@ void kvm_arm_cpu_post_load(ARMCPU *cpu); */ void kvm_arm_reset_vcpu(ARMCPU *cpu); +/** + * kvm_arm_create_host_vcpu: + * @cpu: ARMCPU + * + * Called at to pre create all possible kvm vCPUs within the the host at the + * virt machine init time. This will also init this pre-created vCPU and + * hence result in vCPU reset at host. These pre created and inited vCPUs + * shall be parked for use when ARM vCPUs are actually realized. 
+ */ +void kvm_arm_create_host_vcpu(ARMCPU *cpu); + /** * kvm_arm_init_serror_injection: * @cs: CPUState -- Gitee From fe61cbaf2dc92b062c8d147b05c3ce213734c24a Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Wed, 6 May 2020 02:20:23 +0100 Subject: [PATCH 176/939] arm/virt,gicv3: Changes to pre-size GIC with possible vcpus @machine init GIC needs to be pre-sized with possible vcpus at the initialization time. This is necessary because Memory regions and resources associated with GICC/GICR etc cannot be changed (add/del/modified) after VM has inited. Also, GIC_TYPER needs to be initialized with mp_affinity and cpu interface number association. This cannot be changed after GIC has initialized. Once all the cpu interfaces of the GIC has been inited it needs to be ensured that any updates to the GICC during reset only takes place for the present vcpus and not the disabled ones. Therefore, proper checks are required at various places. Co-developed-by: Salil Mehta Signed-off-by: Salil Mehta Co-developed-by: Keqian Zhu Signed-off-by: Keqian Zhu Signed-off-by: Jean-Philippe Brucker [changed the comment in arm_gicv3_icc_reset] Signed-off-by: Salil Mehta --- hw/arm/virt.c | 13 +++++++------ hw/intc/arm_gicv3_common.c | 7 +++++-- hw/intc/arm_gicv3_cpuif.c | 8 ++++++++ hw/intc/arm_gicv3_kvm.c | 34 +++++++++++++++++++++++++++++++--- include/hw/arm/virt.h | 2 +- 5 files changed, 52 insertions(+), 12 deletions(-) diff --git a/hw/arm/virt.c b/hw/arm/virt.c index f10d75366b..08ba255317 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -802,6 +802,7 @@ static void create_gic(VirtMachineState *vms, MemoryRegion *mem) const char *gictype; int i; unsigned int smp_cpus = ms->smp.cpus; + unsigned int max_cpus = ms->smp.max_cpus; uint32_t nb_redist_regions = 0; int revision; @@ -826,7 +827,7 @@ static void create_gic(VirtMachineState *vms, MemoryRegion *mem) } vms->gic = qdev_new(gictype); qdev_prop_set_uint32(vms->gic, "revision", revision); - qdev_prop_set_uint32(vms->gic, "num-cpu", smp_cpus); + qdev_prop_set_uint32(vms->gic, "num-cpu", max_cpus); /* Note that the num-irq property counts both internal and external * interrupts; there are always 32 of the former (mandated by GIC spec). 
*/ @@ -838,7 +839,7 @@ static void create_gic(VirtMachineState *vms, MemoryRegion *mem) if (vms->gic_version != VIRT_GIC_VERSION_2) { QList *redist_region_count; uint32_t redist0_capacity = virt_redist_capacity(vms, VIRT_GIC_REDIST); - uint32_t redist0_count = MIN(smp_cpus, redist0_capacity); + uint32_t redist0_count = MIN(max_cpus, redist0_capacity); nb_redist_regions = virt_gicv3_redist_region_count(vms); @@ -915,7 +916,7 @@ static void create_gic(VirtMachineState *vms, MemoryRegion *mem) } else if (vms->virt) { qemu_irq irq = qdev_get_gpio_in(vms->gic, intidbase + ARCH_GIC_MAINT_IRQ); - sysbus_connect_irq(gicbusdev, i + 4 * smp_cpus, irq); + sysbus_connect_irq(gicbusdev, i + 4 * max_cpus, irq); } qdev_connect_gpio_out_named(cpudev, "pmu-interrupt", 0, @@ -923,11 +924,11 @@ static void create_gic(VirtMachineState *vms, MemoryRegion *mem) + VIRTUAL_PMU_IRQ)); sysbus_connect_irq(gicbusdev, i, qdev_get_gpio_in(cpudev, ARM_CPU_IRQ)); - sysbus_connect_irq(gicbusdev, i + smp_cpus, + sysbus_connect_irq(gicbusdev, i + max_cpus, qdev_get_gpio_in(cpudev, ARM_CPU_FIQ)); - sysbus_connect_irq(gicbusdev, i + 2 * smp_cpus, + sysbus_connect_irq(gicbusdev, i + 2 * max_cpus, qdev_get_gpio_in(cpudev, ARM_CPU_VIRQ)); - sysbus_connect_irq(gicbusdev, i + 3 * smp_cpus, + sysbus_connect_irq(gicbusdev, i + 3 * max_cpus, qdev_get_gpio_in(cpudev, ARM_CPU_VFIQ)); } diff --git a/hw/intc/arm_gicv3_common.c b/hw/intc/arm_gicv3_common.c index 2ebf880ead..ebd99af610 100644 --- a/hw/intc/arm_gicv3_common.c +++ b/hw/intc/arm_gicv3_common.c @@ -392,10 +392,13 @@ static void arm_gicv3_common_realize(DeviceState *dev, Error **errp) s->cpu = g_new0(GICv3CPUState, s->num_cpu); for (i = 0; i < s->num_cpu; i++) { - CPUState *cpu = qemu_get_cpu(i); + CPUState *cpu = qemu_get_possible_cpu(i); uint64_t cpu_affid; - s->cpu[i].cpu = cpu; + if (qemu_enabled_cpu(cpu)) { + s->cpu[i].cpu = cpu; + } + s->cpu[i].gic = s; /* Store GICv3CPUState in CPUARMState gicv3state pointer */ gicv3_set_gicv3state(cpu, &s->cpu[i]); diff --git a/hw/intc/arm_gicv3_cpuif.c b/hw/intc/arm_gicv3_cpuif.c index ab1a00508e..0d0eb2f62f 100644 --- a/hw/intc/arm_gicv3_cpuif.c +++ b/hw/intc/arm_gicv3_cpuif.c @@ -934,6 +934,10 @@ void gicv3_cpuif_update(GICv3CPUState *cs) ARMCPU *cpu = ARM_CPU(cs->cpu); CPUARMState *env = &cpu->env; + if (!qemu_enabled_cpu(cs->cpu)) { + return; + } + g_assert(qemu_mutex_iothread_locked()); trace_gicv3_cpuif_update(gicv3_redist_affid(cs), cs->hppi.irq, @@ -1826,6 +1830,10 @@ static void icc_generate_sgi(CPUARMState *env, GICv3CPUState *cs, for (i = 0; i < s->num_cpu; i++) { GICv3CPUState *ocs = &s->cpu[i]; + if (!qemu_enabled_cpu(ocs->cpu)) { + continue; + } + if (irm) { /* IRM == 1 : route to all CPUs except self */ if (cs == ocs) { diff --git a/hw/intc/arm_gicv3_kvm.c b/hw/intc/arm_gicv3_kvm.c index 77eb37e131..db06c75e2b 100644 --- a/hw/intc/arm_gicv3_kvm.c +++ b/hw/intc/arm_gicv3_kvm.c @@ -24,6 +24,7 @@ #include "hw/intc/arm_gicv3_common.h" #include "qemu/error-report.h" #include "qemu/module.h" +#include "sysemu/cpus.h" #include "sysemu/kvm.h" #include "sysemu/runstate.h" #include "kvm_arm.h" @@ -458,6 +459,18 @@ static void kvm_arm_gicv3_put(GICv3State *s) GICv3CPUState *c = &s->cpu[ncpu]; int num_pri_bits; + /* + * To support hotplug of vcpus we need to make sure all gic cpuif/GICC + * are initialized at machvirt init time. Once the init is done we + * release the ARMCPU object for disabled vcpus but this leg could hit + * during reset of GICC later as well i.e. 
after init has happened and + * in all of these cases we want to make sure we don't access the GICC for + * the disabled vCPUs. + */ + if (!qemu_enabled_cpu(c->cpu)) { + continue; + } + kvm_gicc_access(s, ICC_SRE_EL1, ncpu, &c->icc_sre_el1, true); kvm_gicc_access(s, ICC_CTLR_EL1, ncpu, &c->icc_ctlr_el1[GICV3_NS], true); @@ -616,6 +629,11 @@ static void kvm_arm_gicv3_get(GICv3State *s) GICv3CPUState *c = &s->cpu[ncpu]; int num_pri_bits; + /* don't access GICC for the disabled vCPUs. */ + if (!qemu_enabled_cpu(c->cpu)) { + continue; + } + kvm_gicc_access(s, ICC_SRE_EL1, ncpu, &c->icc_sre_el1, false); kvm_gicc_access(s, ICC_CTLR_EL1, ncpu, &c->icc_ctlr_el1[GICV3_NS], false); @@ -695,10 +713,19 @@ static void arm_gicv3_icc_reset(CPUARMState *env, const ARMCPRegInfo *ri) return; } + /* + * This shall be called even when vcpu is being hotplugged or onlined and + * other vcpus might be running. Host kernel KVM code to handle device + * access of IOCTLs KVM_{GET|SET}_DEVICE_ATTR might fail due to inability to + * grab vcpu locks for all the vcpus. Hence, we need to pause all vcpus to + * facilitate locking within host. + */ + pause_all_vcpus(); /* Initialize to actual HW supported configuration */ kvm_device_access(s->dev_fd, KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS, KVM_VGIC_ATTR(ICC_CTLR_EL1, c->gicr_typer), &c->icc_ctlr_el1[GICV3_NS], false, &error_abort); + resume_all_vcpus(); c->icc_ctlr_el1[GICV3_S] = c->icc_ctlr_el1[GICV3_NS]; } @@ -808,9 +835,10 @@ static void kvm_arm_gicv3_realize(DeviceState *dev, Error **errp) gicv3_init_irqs_and_mmio(s, kvm_arm_gicv3_set_irq, NULL); for (i = 0; i < s->num_cpu; i++) { - ARMCPU *cpu = ARM_CPU(qemu_get_cpu(i)); - - define_arm_cp_regs(cpu, gicv3_cpuif_reginfo); + CPUState *cs = qemu_get_cpu(i); + if (qemu_enabled_cpu(cs)) { + define_arm_cp_regs(ARM_CPU(cs), gicv3_cpuif_reginfo); + } } /* Try to create the device via the device control API */ diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h index 49d1ec8656..a6977bade5 100644 --- a/include/hw/arm/virt.h +++ b/include/hw/arm/virt.h @@ -208,7 +208,7 @@ static inline int virt_gicv3_redist_region_count(VirtMachineState *vms) assert(vms->gic_version != VIRT_GIC_VERSION_2); - return (MACHINE(vms)->smp.cpus > redist0_capacity && + return (MACHINE(vms)->smp.max_cpus > redist0_capacity && vms->highmem_redists) ? 2 : 1; } -- Gitee From c375e6fdc49f7d3d0232786e4cfd8b792379107c Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Wed, 6 May 2020 14:12:34 +0100 Subject: [PATCH 177/939] arm/virt: Init PMU at host for all possible vcpus PMU for all possible vCPUs must be initialized at the VM initialization time. Refactor the existing code to accommodate possible vCPUs. This also assumes that all processors being used are identical. 
Past discussion for reference: Link: https://lists.gnu.org/archive/html/qemu-devel/2020-06/msg00131.html Co-developed-by: Salil Mehta Signed-off-by: Salil Mehta Co-developed-by: Keqian Zhu Signed-off-by: Keqian Zhu Signed-off-by: Salil Mehta --- hw/arm/virt.c | 12 ++++++++---- include/hw/arm/virt.h | 1 + 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 08ba255317..78ed3c4ba8 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -2055,12 +2055,14 @@ static void finalize_gic_version(VirtMachineState *vms) */ static void virt_cpu_post_init(VirtMachineState *vms, MemoryRegion *sysmem) { + CPUArchIdList *possible_cpus = vms->parent.possible_cpus; int max_cpus = MACHINE(vms)->smp.max_cpus; - bool aarch64, pmu, steal_time; + bool aarch64, steal_time; CPUState *cpu; + int n; aarch64 = object_property_get_bool(OBJECT(first_cpu), "aarch64", NULL); - pmu = object_property_get_bool(OBJECT(first_cpu), "pmu", NULL); + vms->pmu = object_property_get_bool(OBJECT(first_cpu), "pmu", NULL); steal_time = object_property_get_bool(OBJECT(first_cpu), "kvm-steal-time", NULL); @@ -2087,8 +2089,10 @@ static void virt_cpu_post_init(VirtMachineState *vms, MemoryRegion *sysmem) memory_region_add_subregion(sysmem, pvtime_reg_base, pvtime); } - CPU_FOREACH(cpu) { - if (pmu) { + for (n = 0; n < possible_cpus->len; n++) { + cpu = qemu_get_possible_cpu(n); + + if (vms->pmu) { assert(arm_feature(&ARM_CPU(cpu)->env, ARM_FEATURE_PMU)); if (kvm_irqchip_in_kernel()) { kvm_arm_pmu_set_irq(cpu, VIRTUAL_PMU_IRQ); diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h index a6977bade5..c2fde0522c 100644 --- a/include/hw/arm/virt.h +++ b/include/hw/arm/virt.h @@ -155,6 +155,7 @@ struct VirtMachineState { bool ras; bool mte; bool dtb_randomness; + bool pmu; OnOffAuto acpi; VirtGICType gic_version; VirtIOMMUType iommu; -- Gitee From fd6e7e7278e1c0fb08e0a09d9e22157e11b36ece Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Sun, 20 Aug 2023 17:11:04 +0000 Subject: [PATCH 178/939] hw/acpi: Move CPU ctrl-dev MMIO region len macro to common header file CPU ctrl-dev MMIO region length could be used in ACPI GED (common ACPI code across architectures) and various other architecture specific places. To make these code places independent of compilation order, ACPI_CPU_HOTPLUG_REG_LEN macro should be moved to a header file. 
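For reference, the 12-byte region this macro sizes is the per-machine CPU hotplug control block laid out in hw/acpi/cpu.c. A rough sketch of its layout is given below; the selector, flags and command offsets mirror the existing defines visible in this patch, while the command-data offset at byte 8 is an assumption added here purely for illustration:

    /* Rough sketch of the ACPI_CPU_HOTPLUG_REG_LEN (12-byte) register block.
     * The first three offsets mirror the existing defines in hw/acpi/cpu.c;
     * the command-data offset is assumed for illustration only. */
    #define ACPI_CPU_SELECTOR_OFFSET_WR  0   /* DWORD, write-only: select a CPU slot */
    #define ACPI_CPU_FLAGS_OFFSET_RW     4   /* BYTE, read/write: enable/insert/remove flags */
    #define ACPI_CPU_CMD_OFFSET_WR       5   /* BYTE, write-only: command register */
    #define ACPI_CPU_CMD_DATA_OFFSET_RW  8   /* DWORD, read/write: command data (assumed) */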
Signed-off-by: Salil Mehta --- hw/acpi/cpu.c | 2 +- include/hw/acpi/cpu_hotplug.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/hw/acpi/cpu.c b/hw/acpi/cpu.c index 011d2c6c2d..4b24a25003 100644 --- a/hw/acpi/cpu.c +++ b/hw/acpi/cpu.c @@ -1,13 +1,13 @@ #include "qemu/osdep.h" #include "migration/vmstate.h" #include "hw/acpi/cpu.h" +#include "hw/acpi/cpu_hotplug.h" #include "hw/core/cpu.h" #include "qapi/error.h" #include "qapi/qapi-events-acpi.h" #include "trace.h" #include "sysemu/numa.h" -#define ACPI_CPU_HOTPLUG_REG_LEN 12 #define ACPI_CPU_SELECTOR_OFFSET_WR 0 #define ACPI_CPU_FLAGS_OFFSET_RW 4 #define ACPI_CPU_CMD_OFFSET_WR 5 diff --git a/include/hw/acpi/cpu_hotplug.h b/include/hw/acpi/cpu_hotplug.h index 3b932abbbb..48b291e45e 100644 --- a/include/hw/acpi/cpu_hotplug.h +++ b/include/hw/acpi/cpu_hotplug.h @@ -19,6 +19,8 @@ #include "hw/hotplug.h" #include "hw/acpi/cpu.h" +#define ACPI_CPU_HOTPLUG_REG_LEN 12 + typedef struct AcpiCpuHotplug { Object *device; MemoryRegion io; -- Gitee From 37aab238363c8242aa76853396c4f272b5508bca Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Mon, 8 Jun 2020 15:25:35 +0100 Subject: [PATCH 179/939] arm/acpi: Enable ACPI support for vcpu hotplug ACPI is required to interface QEMU with the guest. This roughly falls into the cases below: 1. Convey the possible vcpus config at machine init time to the guest using various ACPI tables like MADT etc. 2. Convey vcpu hotplug events to the guest (using GED) 3. Assist in the evaluation of various ACPI methods (like _EVT, _STA, _OST, _EJ0, _MAT etc.) 4. Provide ACPI cpu hotplug state and a 12-byte memory-mapped cpu hotplug control register interface to the OSPM/guest corresponding to each possible vcpu. The register interface consists of various R/W fields and their handling operations. These are called whenever register fields or memory regions are accessed (i.e. read or written) by the OSPM while it evaluates various ACPI methods. Note: a lot of this framework code is inherited from the changes already done for x86, but some minor changes are still required to make it compatible with ARM64. This patch enables the ACPI support for virtual cpu hotplug. ACPI changes required will follow in subsequent patches. Co-developed-by: Salil Mehta Signed-off-by: Salil Mehta Co-developed-by: Keqian Zhu Signed-off-by: Keqian Zhu Signed-off-by: Salil Mehta --- hw/arm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/arm/Kconfig b/hw/arm/Kconfig index 3ada335a24..c0a7d0bd58 100644 --- a/hw/arm/Kconfig +++ b/hw/arm/Kconfig @@ -29,6 +29,7 @@ config ARM_VIRT select ACPI_HW_REDUCED select ACPI_APEI select ACPI_VIOT + select ACPI_CPU_HOTPLUG select VIRTIO_MEM_SUPPORTED select ACPI_CXL select ACPI_HMAT -- Gitee From e442d0f8670dc4218ab4beebe645e369f925410d Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Sat, 19 Aug 2023 00:26:20 +0000 Subject: [PATCH 180/939] hw/acpi: Add ACPI CPU hotplug init stub ACPI CPU hotplug related initialization should only happen if ACPI_CPU_HOTPLUG support has been enabled for a particular architecture. Add a cpu_hotplug_hw_init() stub to avoid a compilation break. 
Signed-off-by: Salil Mehta --- hw/acpi/acpi-cpu-hotplug-stub.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/hw/acpi/acpi-cpu-hotplug-stub.c b/hw/acpi/acpi-cpu-hotplug-stub.c index 3fc4b14c26..c6c61bb9cd 100644 --- a/hw/acpi/acpi-cpu-hotplug-stub.c +++ b/hw/acpi/acpi-cpu-hotplug-stub.c @@ -19,6 +19,12 @@ void legacy_acpi_cpu_hotplug_init(MemoryRegion *parent, Object *owner, return; } +void cpu_hotplug_hw_init(MemoryRegion *as, Object *owner, + CPUHotplugState *state, hwaddr base_addr) +{ + return; +} + void acpi_cpu_ospm_status(CPUHotplugState *cpu_st, ACPIOSTInfoList ***list) { return; -- Gitee From 576a2a88625978f1befde11f0823f32bbc54cad1 Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Mon, 28 Aug 2023 20:00:08 +0000 Subject: [PATCH 181/939] hw/acpi: Use qemu_present_cpu() API in ACPI CPU hotplug init ACPI CPU Hotplug code assumes a virtual CPU is unplugged if the CPUState object is absent from the list of the possible CPUs (CPUArchIdList *possible_cpus) maintained on a per-machine basis. Use the earlier introduced qemu_present_cpu() API to check this state. This change should have no bearing on the functionality of any architecture and is merely a representational change. Signed-off-by: Salil Mehta --- hw/acpi/cpu.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/hw/acpi/cpu.c b/hw/acpi/cpu.c index 4b24a25003..cabeb4e86b 100644 --- a/hw/acpi/cpu.c +++ b/hw/acpi/cpu.c @@ -226,7 +226,10 @@ void cpu_hotplug_hw_init(MemoryRegion *as, Object *owner, state->dev_count = id_list->len; state->devs = g_new0(typeof(*state->devs), state->dev_count); for (i = 0; i < id_list->len; i++) { - state->devs[i].cpu = CPU(id_list->cpus[i].cpu); + struct CPUState *cpu = CPU(id_list->cpus[i].cpu); + if (qemu_present_cpu(cpu)) { + state->devs[i].cpu = cpu; + } state->devs[i].arch_id = id_list->cpus[i].arch_id; } memory_region_init_io(&state->ctrl_reg, owner, &cpu_hotplug_ops, state, -- Gitee From de1c8d6be3de67ff9854e9b008a000e1898aaacb Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Mon, 8 Jun 2020 21:50:08 +0100 Subject: [PATCH 182/939] hw/acpi: Init GED framework with cpu hotplug events ACPI GED (as described in the ACPI 6.2 spec) can be used to generate ACPI events when the OSPM/guest receives an interrupt listed in the _CRS object of GED. The OSPM then maps or demultiplexes the event by evaluating the _EVT method. This change adds support for cpu hotplug event initialization in the existing GED framework. 
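As a rough sketch (not part of this patch) of the path a CPU hotplug event takes once this wiring exists: the hotplug handler raises the generic CPU hotplug status, the GED device maps it to its event-select bit and injects the GED interrupt, and the guest's _EVT method demultiplexes it. The helper below is hypothetical and only wraps a call that appears later in this series:

    /* Hypothetical helper, for illustration only; assumes the usual QEMU ACPI
     * and hotplug headers are available in the including file. */
    static void example_signal_cpu_hotplug(HotplugHandler *hotplug_dev)
    {
        /* Sets ACPI_CPU_HOTPLUG_STATUS; acpi_ged_send_event() then selects
         * ACPI_GED_CPU_HOTPLUG_EVT, raises the GED IRQ, and the guest's
         * _EVT method runs the CPU scan in response. */
        acpi_send_event(DEVICE(hotplug_dev), ACPI_CPU_HOTPLUG_STATUS);
    }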
Co-developed-by: Salil Mehta Signed-off-by: Salil Mehta Co-developed-by: Keqian Zhu Signed-off-by: Keqian Zhu Signed-off-by: Salil Mehta --- hw/acpi/generic_event_device.c | 8 ++++++++ include/hw/acpi/generic_event_device.h | 5 +++++ 2 files changed, 13 insertions(+) diff --git a/hw/acpi/generic_event_device.c b/hw/acpi/generic_event_device.c index a3d31631fe..d2fa1d0e4a 100644 --- a/hw/acpi/generic_event_device.c +++ b/hw/acpi/generic_event_device.c @@ -25,6 +25,7 @@ static const uint32_t ged_supported_events[] = { ACPI_GED_MEM_HOTPLUG_EVT, ACPI_GED_PWR_DOWN_EVT, ACPI_GED_NVDIMM_HOTPLUG_EVT, + ACPI_GED_CPU_HOTPLUG_EVT, }; /* @@ -400,6 +401,13 @@ static void acpi_ged_initfn(Object *obj) memory_region_init_io(&ged_st->regs, obj, &ged_regs_ops, ged_st, TYPE_ACPI_GED "-regs", ACPI_GED_REG_COUNT); sysbus_init_mmio(sbd, &ged_st->regs); + + s->cpuhp.device = OBJECT(s); + memory_region_init(&s->container_cpuhp, OBJECT(dev), "cpuhp container", + ACPI_CPU_HOTPLUG_REG_LEN); + sysbus_init_mmio(SYS_BUS_DEVICE(dev), &s->container_cpuhp); + cpu_hotplug_hw_init(&s->container_cpuhp, OBJECT(dev), + &s->cpuhp_state, 0); } static void acpi_ged_class_init(ObjectClass *class, void *data) diff --git a/include/hw/acpi/generic_event_device.h b/include/hw/acpi/generic_event_device.h index ba84ce0214..a803ea818e 100644 --- a/include/hw/acpi/generic_event_device.h +++ b/include/hw/acpi/generic_event_device.h @@ -60,6 +60,7 @@ #define HW_ACPI_GENERIC_EVENT_DEVICE_H #include "hw/sysbus.h" +#include "hw/acpi/cpu_hotplug.h" #include "hw/acpi/memory_hotplug.h" #include "hw/acpi/ghes.h" #include "qom/object.h" @@ -95,6 +96,7 @@ OBJECT_DECLARE_SIMPLE_TYPE(AcpiGedState, ACPI_GED) #define ACPI_GED_MEM_HOTPLUG_EVT 0x1 #define ACPI_GED_PWR_DOWN_EVT 0x2 #define ACPI_GED_NVDIMM_HOTPLUG_EVT 0x4 +#define ACPI_GED_CPU_HOTPLUG_EVT 0x8 typedef struct GEDState { MemoryRegion evt; @@ -106,6 +108,9 @@ struct AcpiGedState { SysBusDevice parent_obj; MemHotplugState memhp_state; MemoryRegion container_memhp; + CPUHotplugState cpuhp_state; + MemoryRegion container_cpuhp; + AcpiCpuHotplug cpuhp; GEDState ged_state; uint32_t ged_event_bitmap; qemu_irq irq; -- Gitee From f8914ec04d4d892520aa443eaf8018c80516adee Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Sun, 6 Aug 2023 16:27:01 +0000 Subject: [PATCH 183/939] arm/virt: Add cpu hotplug events to GED during creation Add CPU Hotplug event to the set of supported ged-events during the creation of GED device during VM init. Also initialize the memory map for CPU Hotplug control device used in event exchanges between Qemu/VMM and the guest. 
Signed-off-by: Salil Mehta --- hw/arm/virt.c | 5 ++++- include/hw/arm/virt.h | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 78ed3c4ba8..155000f22f 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -78,6 +78,7 @@ #include "hw/mem/pc-dimm.h" #include "hw/mem/nvdimm.h" #include "hw/acpi/generic_event_device.h" +#include "hw/acpi/cpu_hotplug.h" #include "hw/virtio/virtio-md-pci.h" #include "hw/virtio/virtio-iommu.h" #include "hw/char/pl011.h" @@ -157,6 +158,7 @@ static const MemMapEntry base_memmap[] = { [VIRT_NVDIMM_ACPI] = { 0x09090000, NVDIMM_ACPI_IO_LEN}, [VIRT_PVTIME] = { 0x090a0000, 0x00010000 }, [VIRT_SECURE_GPIO] = { 0x090b0000, 0x00001000 }, + [VIRT_CPUHP_ACPI] = { 0x090c0000, ACPI_CPU_HOTPLUG_REG_LEN}, [VIRT_MMIO] = { 0x0a000000, 0x00000200 }, [VIRT_CPUFREQ] = { 0x0b000000, 0x00010000 }, /* ...repeating for a total of NUM_VIRTIO_TRANSPORTS, each of that size */ @@ -725,7 +727,7 @@ static inline DeviceState *create_acpi_ged(VirtMachineState *vms) DeviceState *dev; MachineState *ms = MACHINE(vms); int irq = vms->irqmap[VIRT_ACPI_GED]; - uint32_t event = ACPI_GED_PWR_DOWN_EVT; + uint32_t event = ACPI_GED_PWR_DOWN_EVT | ACPI_GED_CPU_HOTPLUG_EVT; if (ms->ram_slots) { event |= ACPI_GED_MEM_HOTPLUG_EVT; @@ -741,6 +743,7 @@ static inline DeviceState *create_acpi_ged(VirtMachineState *vms) sysbus_mmio_map(SYS_BUS_DEVICE(dev), 0, vms->memmap[VIRT_ACPI_GED].base); sysbus_mmio_map(SYS_BUS_DEVICE(dev), 1, vms->memmap[VIRT_PCDIMM_ACPI].base); + sysbus_mmio_map(SYS_BUS_DEVICE(dev), 3, vms->memmap[VIRT_CPUHP_ACPI].base); sysbus_connect_irq(SYS_BUS_DEVICE(dev), 0, qdev_get_gpio_in(vms->gic, irq)); return dev; diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h index c2fde0522c..5de0185063 100644 --- a/include/hw/arm/virt.h +++ b/include/hw/arm/virt.h @@ -76,6 +76,7 @@ enum { VIRT_PCDIMM_ACPI, VIRT_ACPI_GED, VIRT_NVDIMM_ACPI, + VIRT_CPUHP_ACPI, VIRT_PVTIME, VIRT_LOWMEMMAP_LAST, }; -- Gitee From 028d71744dfeedabfa67d629c71a6ed5e494cc68 Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Tue, 29 Aug 2023 00:47:05 +0000 Subject: [PATCH 184/939] arm/virt: Create GED dev before *disabled* CPU Objs are destroyed ACPI CPU hotplug state (is_present=_STA.PRESENT, is_enabled=_STA.ENABLED) for all the possible vCPUs MUST be initialized during machine init. This is done during the creation of the GED device. The VMM/Qemu MUST always expose/fake the ACPI state of the disabled vCPUs to the Guest kernel as 'present' (_STA.PRESENT), i.e. ACPI persistent. If the 'disabled' vCPU objects are destroyed before the GED device has been created, then their ACPI hotplug state might not get initialized correctly, as the acpi_persistent flag is part of the CPUState. This would expose the wrong status of the unplugged vCPUs to the Guest kernel. Hence, move the GED device creation before the disabled vCPU objects get destroyed as part of the post CPU init routine. 
Signed-off-by: Salil Mehta --- hw/arm/virt.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 155000f22f..818398e753 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -2472,6 +2472,12 @@ static void machvirt_init(MachineState *machine) create_gic(vms, sysmem); + has_ged = has_ged && aarch64 && firmware_loaded && + virt_is_acpi_enabled(vms); + if (has_ged) { + vms->acpi_dev = create_acpi_ged(vms); + } + virt_cpu_post_init(vms, sysmem); fdt_add_pmu_nodes(vms); @@ -2496,9 +2502,7 @@ static void machvirt_init(MachineState *machine) create_pcie(vms); - if (has_ged && aarch64 && firmware_loaded && virt_is_acpi_enabled(vms)) { - vms->acpi_dev = create_acpi_ged(vms); - } else { + if (!has_ged) { create_gpio_devices(vms, VIRT_GPIO, sysmem); } -- Gitee From 06059c960d863c21c7d9cf4829ad2078692ed9e1 Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Fri, 8 May 2020 13:27:57 +0100 Subject: [PATCH 185/939] hw/acpi: Update CPUs AML with cpu-(ctrl)dev change The CPUs Control device (\\_SB.PCI0) register interface for the x86 arch is PCI- and IO-port-based; hence, the existing cpus AML code assumes that _CRS objects would evaluate to a system resource which describes an IO Port address. But on the ARM arch the CPUs control device (\\_SB.PRES) register interface is memory-mapped, hence the _CRS object should evaluate to a system resource which describes a memory-mapped base address. This cpus AML code change updates the existing interface of the build cpus AML function to accept both IO/MEMORY type regions and updates the _CRS object correspondingly. NOTE: Besides the above, a CPU scan shall be triggered when the OSPM evaluates the _EVT method (part of the GED framework), which is covered in a subsequent patch. Co-developed-by: Salil Mehta Signed-off-by: Salil Mehta Co-developed-by: Keqian Zhu Signed-off-by: Keqian Zhu Signed-off-by: Salil Mehta --- hw/acpi/cpu.c | 23 ++++++++++++++++------- hw/i386/acpi-build.c | 3 ++- include/hw/acpi/cpu.h | 5 +++-- 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/hw/acpi/cpu.c b/hw/acpi/cpu.c index cabeb4e86b..cf0c7e8538 100644 --- a/hw/acpi/cpu.c +++ b/hw/acpi/cpu.c @@ -342,9 +342,10 @@ const VMStateDescription vmstate_cpu_hotplug = { #define CPU_FW_EJECT_EVENT "CEJF" void build_cpus_aml(Aml *table, MachineState *machine, CPUHotplugFeatures opts, - build_madt_cpu_fn build_madt_cpu, hwaddr io_base, + build_madt_cpu_fn build_madt_cpu, hwaddr base_addr, const char *res_root, - const char *event_handler_method) + const char *event_handler_method, + AmlRegionSpace rs) { Aml *ifctx; Aml *field; @@ -369,13 +370,19 @@ void build_cpus_aml(Aml *table, MachineState *machine, CPUHotplugFeatures opts, aml_append(cpu_ctrl_dev, aml_mutex(CPU_LOCK, 0)); crs = aml_resource_template(); - aml_append(crs, aml_io(AML_DECODE16, io_base, io_base, 1, + if (rs == AML_SYSTEM_IO) { + aml_append(crs, aml_io(AML_DECODE16, base_addr, base_addr, 1, ACPI_CPU_HOTPLUG_REG_LEN)); + } else { + aml_append(crs, aml_memory32_fixed(base_addr, + ACPI_CPU_HOTPLUG_REG_LEN, AML_READ_WRITE)); + } + aml_append(cpu_ctrl_dev, aml_name_decl("_CRS", crs)); /* declare CPU hotplug MMIO region with related access fields */ aml_append(cpu_ctrl_dev, - aml_operation_region("PRST", AML_SYSTEM_IO, aml_int(io_base), + aml_operation_region("PRST", rs, aml_int(base_addr), ACPI_CPU_HOTPLUG_REG_LEN)); field = aml_field("PRST", AML_BYTE_ACC, AML_NOLOCK, @@ -699,9 +706,11 @@ void build_cpus_aml(Aml *table, MachineState *machine, CPUHotplugFeatures opts, aml_append(sb_scope, cpus_dev); aml_append(table, 
sb_scope); - method = aml_method(event_handler_method, 0, AML_NOTSERIALIZED); - aml_append(method, aml_call0("\\_SB.CPUS." CPU_SCAN_METHOD)); - aml_append(table, method); + if (event_handler_method) { + method = aml_method(event_handler_method, 0, AML_NOTSERIALIZED); + aml_append(method, aml_call0("\\_SB.CPUS." CPU_SCAN_METHOD)); + aml_append(table, method); + } g_free(cphp_res_path); } diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index 80db183b78..db4ca8a66a 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -1546,7 +1546,8 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, .fw_unplugs_cpu = pm->smi_on_cpu_unplug, }; build_cpus_aml(dsdt, machine, opts, pc_madt_cpu_entry, - pm->cpu_hp_io_base, "\\_SB.PCI0", "\\_GPE._E02"); + pm->cpu_hp_io_base, "\\_SB.PCI0", "\\_GPE._E02", + AML_SYSTEM_IO); } if (pcms->memhp_io_base && nr_mem) { diff --git a/include/hw/acpi/cpu.h b/include/hw/acpi/cpu.h index 209e1773f8..76bc7eb251 100644 --- a/include/hw/acpi/cpu.h +++ b/include/hw/acpi/cpu.h @@ -60,9 +60,10 @@ typedef void (*build_madt_cpu_fn)(int uid, const CPUArchIdList *apic_ids, GArray *entry, bool force_enabled); void build_cpus_aml(Aml *table, MachineState *machine, CPUHotplugFeatures opts, - build_madt_cpu_fn build_madt_cpu, hwaddr io_base, + build_madt_cpu_fn build_madt_cpu, hwaddr base_addr, const char *res_root, - const char *event_handler_method); + const char *event_handler_method, + AmlRegionSpace rs); void acpi_cpu_ospm_status(CPUHotplugState *cpu_st, ACPIOSTInfoList ***list); -- Gitee From 2d5040ce21af5fc02a8588456be7316fcd5bc2a0 Mon Sep 17 00:00:00 2001 From: Keqian Zhu Date: Tue, 2 Apr 2024 16:36:38 +0800 Subject: [PATCH 186/939] arm/virt/acpi: Factor out CPPC building from DSDT CPU aml When CPU hotplug is enabled, we will use build_cpus_aml instead of acpi_dsdt_add_cpus, so factor out CPPC building to reuse it. 
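As a forward reference (illustrative only, mirroring a call that appears in a later patch of this series), the helper factored out here is eventually passed to the generic CPU AML builder as its CPPC callback, so per-CPU _CPC/_PSD objects keep being generated once build_cpus_aml is used on arm/virt:

    /* Sketch of the eventual arm/virt call site (see the later
     * virt-acpi-build.c change in this series); virt_acpi_dsdt_cpu_cppc is
     * the helper factored out by this patch. */
    build_cpus_aml(scope, ms, opts, NULL, virt_acpi_dsdt_cpu_cppc,
                   memmap[VIRT_CPUHP_ACPI].base, "\\_SB", NULL,
                   AML_SYSTEM_MEMORY);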
Signed-off-by: Keqian Zhu --- hw/arm/virt-acpi-build.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c index 48fc77fb83..084c8abc7c 100644 --- a/hw/arm/virt-acpi-build.c +++ b/hw/arm/virt-acpi-build.c @@ -123,8 +123,23 @@ static void acpi_dsdt_add_cppc(Aml *dev, uint64_t cpu_base, int *regs_offset) aml_append(dev, aml_name_decl("_CPC", cpc)); } -static void acpi_dsdt_add_cpus(Aml *scope, VirtMachineState *vms, - const MemMapEntry *cppc_memmap) +static void virt_acpi_dsdt_cpu_cppc(int ncpu, int num_cpu, Aml *dev) { + VirtMachineState *vms = VIRT_MACHINE(qdev_get_machine()); + const MemMapEntry *cppc_memmap = &vms->memmap[VIRT_CPUFREQ]; + + /* + * Append _CPC and _PSD to support CPU frequence show + * Check CPPC available by DESIRED_PERF register + */ + if (cppc_regs_offset[DESIRED_PERF] != -1) { + acpi_dsdt_add_cppc(dev, + cppc_memmap->base + ncpu * CPPC_REG_PER_CPU_STRIDE, + cppc_regs_offset); + acpi_dsdt_add_psd(dev, num_cpu); + } +} + +static void acpi_dsdt_add_cpus(Aml *scope, VirtMachineState *vms) { MachineState *ms = MACHINE(vms); uint16_t i; @@ -134,18 +149,9 @@ static void acpi_dsdt_add_cpus(Aml *scope, VirtMachineState *vms, aml_append(dev, aml_name_decl("_HID", aml_string("ACPI0007"))); aml_append(dev, aml_name_decl("_UID", aml_int(i))); - /* - * Append _CPC and _PSD to support CPU frequence show - * Check CPPC available by DESIRED_PERF register - */ - if (cppc_regs_offset[DESIRED_PERF] != -1) { - acpi_dsdt_add_cppc(dev, - cppc_memmap->base + i * CPPC_REG_PER_CPU_STRIDE, - cppc_regs_offset); - acpi_dsdt_add_psd(dev, ms->smp.cpus); - } + virt_acpi_dsdt_cpu_cppc(i, ms->smp.cpus, dev); - aml_append(scope, dev); + aml_append(scope, dev); } } @@ -931,7 +937,7 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) * the RTC ACPI device at all when using UEFI. 
*/ scope = aml_scope("\\_SB"); - acpi_dsdt_add_cpus(scope, vms, &memmap[VIRT_CPUFREQ]); + acpi_dsdt_add_cpus(scope, vms); acpi_dsdt_add_uart(scope, &memmap[VIRT_UART], (irqmap[VIRT_UART] + ARM_SPI_BASE)); if (vmc->acpi_expose_flash) { -- Gitee From c75a0102a1bb00190b07b06ede8b1f9fa0bdaa3c Mon Sep 17 00:00:00 2001 From: Keqian Zhu Date: Tue, 2 Apr 2024 16:52:10 +0800 Subject: [PATCH 187/939] acpi/cpu: Add cpu_cppc building support Signed-off-by: Keqian Zhu --- hw/acpi/cpu.c | 8 +++++++- hw/i386/acpi-build.c | 2 +- include/hw/acpi/cpu.h | 6 +++++- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/hw/acpi/cpu.c b/hw/acpi/cpu.c index cf0c7e8538..c8c11e51c6 100644 --- a/hw/acpi/cpu.c +++ b/hw/acpi/cpu.c @@ -342,7 +342,9 @@ const VMStateDescription vmstate_cpu_hotplug = { #define CPU_FW_EJECT_EVENT "CEJF" void build_cpus_aml(Aml *table, MachineState *machine, CPUHotplugFeatures opts, - build_madt_cpu_fn build_madt_cpu, hwaddr base_addr, + build_madt_cpu_fn build_madt_cpu, + build_cpu_cppc_fn build_cpu_cppc, + hwaddr base_addr, const char *res_root, const char *event_handler_method, AmlRegionSpace rs) @@ -668,6 +670,10 @@ void build_cpus_aml(Aml *table, MachineState *machine, CPUHotplugFeatures opts, aml_append(dev, aml_name_decl("_UID", uid)); } + if (build_cpu_cppc) { + build_cpu_cppc(i, arch_ids->len, dev); + } + method = aml_method("_STA", 0, AML_SERIALIZED); aml_append(method, aml_return(aml_call1(CPU_STS_METHOD, uid))); aml_append(dev, method); diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index db4ca8a66a..e10799ecc6 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -1545,7 +1545,7 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, .smi_path = pm->smi_on_cpuhp ? "\\_SB.PCI0.SMI0.SMIC" : NULL, .fw_unplugs_cpu = pm->smi_on_cpu_unplug, }; - build_cpus_aml(dsdt, machine, opts, pc_madt_cpu_entry, + build_cpus_aml(dsdt, machine, opts, pc_madt_cpu_entry, NULL, pm->cpu_hp_io_base, "\\_SB.PCI0", "\\_GPE._E02", AML_SYSTEM_IO); } diff --git a/include/hw/acpi/cpu.h b/include/hw/acpi/cpu.h index 76bc7eb251..b31a2e50d9 100644 --- a/include/hw/acpi/cpu.h +++ b/include/hw/acpi/cpu.h @@ -59,8 +59,12 @@ typedef struct CPUHotplugFeatures { typedef void (*build_madt_cpu_fn)(int uid, const CPUArchIdList *apic_ids, GArray *entry, bool force_enabled); +typedef void (*build_cpu_cppc_fn)(int uid, int num_cpu, Aml *dev); + void build_cpus_aml(Aml *table, MachineState *machine, CPUHotplugFeatures opts, - build_madt_cpu_fn build_madt_cpu, hwaddr base_addr, + build_madt_cpu_fn build_madt_cpu, + build_cpu_cppc_fn build_cpu_cppc, + hwaddr base_addr, const char *res_root, const char *event_handler_method, AmlRegionSpace rs); -- Gitee From 6cfe9afcaceb7d9fb7d54f08b2362fc654b54d12 Mon Sep 17 00:00:00 2001 From: Keqian Zhu Date: Tue, 2 Apr 2024 17:23:18 +0800 Subject: [PATCH 188/939] tests/acpi/bios-tables-test: Allow changes to virt/DSDT file Prepare to change of cpu aml. 
Signed-off-by: Keqian Zhu --- tests/qtest/bios-tables-test-allowed-diff.h | 40 +++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/tests/qtest/bios-tables-test-allowed-diff.h b/tests/qtest/bios-tables-test-allowed-diff.h index dfb8523c8b..c7406e395a 100644 --- a/tests/qtest/bios-tables-test-allowed-diff.h +++ b/tests/qtest/bios-tables-test-allowed-diff.h @@ -1 +1,41 @@ /* List of comma-separated changed AML files to ignore */ +"tests/data/acpi/pc/DSDT", +"tests/data/acpi/pc/DSDT.acpierst", +"tests/data/acpi/pc/DSDT.acpihmat", +"tests/data/acpi/pc/DSDT.bridge", +"tests/data/acpi/pc/DSDT.cphp", +"tests/data/acpi/pc/DSDT.dimmpxm", +"tests/data/acpi/pc/DSDT.hpbridge", +"tests/data/acpi/pc/DSDT.hpbrroot", +"tests/data/acpi/pc/DSDT.ipmikcs", +"tests/data/acpi/pc/DSDT.memhp", +"tests/data/acpi/pc/DSDT.nohpet", +"tests/data/acpi/pc/DSDT.numamem", +"tests/data/acpi/pc/DSDT.roothp", +"tests/data/acpi/q35/DSDT", +"tests/data/acpi/q35/DSDT.acpierst", +"tests/data/acpi/q35/DSDT.acpihmat", +"tests/data/acpi/q35/DSDT.acpihmat-noinitiator", +"tests/data/acpi/q35/DSDT.applesmc", +"tests/data/acpi/q35/DSDT.bridge", +"tests/data/acpi/q35/DSDT.cphp", +"tests/data/acpi/q35/DSDT.cxl", +"tests/data/acpi/q35/DSDT.dimmpxm", +"tests/data/acpi/q35/DSDT.ipmibt", +"tests/data/acpi/q35/DSDT.ipmismbus", +"tests/data/acpi/q35/DSDT.ivrs", +"tests/data/acpi/q35/DSDT.memhp", +"tests/data/acpi/q35/DSDT.mmio64", +"tests/data/acpi/q35/DSDT.multi-bridge", +"tests/data/acpi/q35/DSDT.noacpihp", +"tests/data/acpi/q35/DSDT.nohpet", +"tests/data/acpi/q35/DSDT.numamem", +"tests/data/acpi/q35/DSDT.pvpanic-isa", +"tests/data/acpi/q35/DSDT.tis.tpm12", +"tests/data/acpi/q35/DSDT.tis.tpm2", +"tests/data/acpi/q35/DSDT.viot", +"tests/data/acpi/virt/DSDT", +"tests/data/acpi/virt/DSDT.acpihmatvirt", +"tests/data/acpi/virt/DSDT.memhp", +"tests/data/acpi/virt/DSDT.pxb", +"tests/data/acpi/virt/DSDT.topology", \ No newline at end of file -- Gitee From bea23b0f82cedbd860b66c7b9e1f6bb0ca85d1cf Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Sun, 6 Aug 2023 17:05:30 +0000 Subject: [PATCH 189/939] arm/virt/acpi: Build CPUs AML with CPU Hotplug support Support of vCPU Hotplug requires sequence of ACPI handshakes between Qemu and Guest kernel when a vCPU is plugged or unplugged. Most of the AML code to support these handshakes already exists. This AML need to be build during VM init for ARM architecture as well if the GED support exists. Signed-off-by: Salil Mehta --- hw/arm/virt-acpi-build.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c index 084c8abc7c..d88f3cded1 100644 --- a/hw/arm/virt-acpi-build.c +++ b/hw/arm/virt-acpi-build.c @@ -937,7 +937,19 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) * the RTC ACPI device at all when using UEFI. 
*/ scope = aml_scope("\\_SB"); - acpi_dsdt_add_cpus(scope, vms); + /* if GED is enabled then cpus AML shall be added as part build_cpus_aml */ + if (vms->acpi_dev) { + CPUHotplugFeatures opts = { + .acpi_1_compatible = false, + .has_legacy_cphp = false + }; + + build_cpus_aml(scope, ms, opts, NULL, virt_acpi_dsdt_cpu_cppc, + memmap[VIRT_CPUHP_ACPI].base, + "\\_SB", NULL, AML_SYSTEM_MEMORY); + } else { + acpi_dsdt_add_cpus(scope, vms); + } acpi_dsdt_add_uart(scope, &memmap[VIRT_UART], (irqmap[VIRT_UART] + ARM_SPI_BASE)); if (vmc->acpi_expose_flash) { -- Gitee From 3780dddd4fc8f0471525c50893e24846d1474692 Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Tue, 8 Aug 2023 00:43:18 +0000 Subject: [PATCH 190/939] arm/virt: Make ARM vCPU *present* status ACPI *persistent* ARM arch does not allow CPUs presence to be changed [1] after kernel has booted. Hence, firmware/ACPI/Qemu must ensure persistent view of the vCPUs to the Guest kernel even when they are not present in the QoM i.e. are unplugged or are yet-to-be-plugged References: [1] Check comment 5 in the bugzilla entry Link: https://bugzilla.tianocore.org/show_bug.cgi?id=4481#c5 Signed-off-by: Salil Mehta --- cpu-common.c | 6 ++++++ hw/arm/virt.c | 7 +++++++ include/hw/core/cpu.h | 20 ++++++++++++++++++++ 3 files changed, 33 insertions(+) diff --git a/cpu-common.c b/cpu-common.c index d041a351ab..da52e45760 100644 --- a/cpu-common.c +++ b/cpu-common.c @@ -128,6 +128,12 @@ bool qemu_enabled_cpu(CPUState *cpu) return cpu && !cpu->disabled; } +bool qemu_persistent_cpu(CPUState *cpu) +{ + /* cpu state can be faked to the guest via acpi */ + return cpu->acpi_persistent; +} + uint64_t qemu_get_cpu_archid(int cpu_index) { MachineState *ms = MACHINE(qdev_get_machine()); diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 818398e753..91b2653c03 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -3104,6 +3104,13 @@ static void virt_cpu_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, return; } virt_cpu_set_properties(OBJECT(cs), cpu_slot, errp); + + /* + * To give persistent presence view of vCPUs to the guest, ACPI might need + * to fake the presence of the vCPUs to the guest but keep them disabled. + * This shall be used during the init of ACPI Hotplug state and hot-unplug + */ + cs->acpi_persistent = true; } static void virt_cpu_plug(HotplugHandler *hotplug_dev, DeviceState *dev, diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h index fdfb952259..0ca778eb75 100644 --- a/include/hw/core/cpu.h +++ b/include/hw/core/cpu.h @@ -550,6 +550,13 @@ struct CPUState { * By default every CPUState is enabled as of now across all archs. */ bool disabled; + /* + * On certain architectures, to give persistent view of the 'presence' of + * vCPUs to the guest, ACPI might need to fake the 'presence' of the vCPUs + * but keep them ACPI disabled to the guest. This is done by returning + * _STA.PRES=True and _STA.Ena=False for the unplugged vCPUs in QEMU QoM. + */ + bool acpi_persistent; /* TODO Move common fields from CPUArchState here. */ int cpu_index; int cluster_index; @@ -957,6 +964,19 @@ bool qemu_present_cpu(CPUState *cpu); */ bool qemu_enabled_cpu(CPUState *cpu); +/** + * qemu_persistent_cpu: + * @cpu: The vCPU to check + * + * Checks if the vCPU state should always be reflected as *present* via ACPI + * to the Guest. By default, this is False on all architectures and has to be + * explicity set during initialization. 
+ * + * Returns: True if it is ACPI 'persistent' CPU + * + */ +bool qemu_persistent_cpu(CPUState *cpu); + /** * qemu_get_cpu_archid: * @cpu_index: possible vCPU for which arch-id needs to be retreived -- Gitee From 19a8fbccbc997110f472df308813ad2d7738065c Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Mon, 14 Nov 2022 02:25:28 +0000 Subject: [PATCH 191/939] hw/acpi: ACPI/AML Changes to reflect the correct _STA.{PRES,ENA} Bits to Guest ACPI AML changes to properly reflect the _STA.PRES and _STA.ENA Bits to the guest during initialzation, when CPUs are hotplugged and after CPUs are hot-unplugged. Signed-off-by: Salil Mehta --- hw/acpi/cpu.c | 49 +++++++++++++++++++++++++++++++--- hw/acpi/generic_event_device.c | 11 ++++++++ include/hw/acpi/cpu.h | 2 ++ 3 files changed, 58 insertions(+), 4 deletions(-) diff --git a/hw/acpi/cpu.c b/hw/acpi/cpu.c index c8c11e51c6..991f1d4181 100644 --- a/hw/acpi/cpu.c +++ b/hw/acpi/cpu.c @@ -64,10 +64,11 @@ static uint64_t cpu_hotplug_rd(void *opaque, hwaddr addr, unsigned size) cdev = &cpu_st->devs[cpu_st->selector]; switch (addr) { case ACPI_CPU_FLAGS_OFFSET_RW: /* pack and return is_* fields */ - val |= cdev->cpu ? 1 : 0; + val |= cdev->is_enabled ? 1 : 0; val |= cdev->is_inserting ? 2 : 0; val |= cdev->is_removing ? 4 : 0; val |= cdev->fw_remove ? 16 : 0; + val |= cdev->is_present ? 32 : 0; trace_cpuhp_acpi_read_flags(cpu_st->selector, val); break; case ACPI_CPU_CMD_DATA_OFFSET_RW: @@ -229,7 +230,21 @@ void cpu_hotplug_hw_init(MemoryRegion *as, Object *owner, struct CPUState *cpu = CPU(id_list->cpus[i].cpu); if (qemu_present_cpu(cpu)) { state->devs[i].cpu = cpu; + state->devs[i].is_present = true; + } else { + if (qemu_persistent_cpu(cpu)) { + state->devs[i].is_present = true; + } else { + state->devs[i].is_present = false; + } } + + if (qemu_enabled_cpu(cpu)) { + state->devs[i].is_enabled = true; + } else { + state->devs[i].is_enabled = false; + } + state->devs[i].arch_id = id_list->cpus[i].arch_id; } memory_region_init_io(&state->ctrl_reg, owner, &cpu_hotplug_ops, state, @@ -262,6 +277,8 @@ void acpi_cpu_plug_cb(HotplugHandler *hotplug_dev, } cdev->cpu = CPU(dev); + cdev->is_present = true; + cdev->is_enabled = true; if (dev->hotplugged) { cdev->is_inserting = true; acpi_send_event(DEVICE(hotplug_dev), ACPI_CPU_HOTPLUG_STATUS); @@ -293,6 +310,11 @@ void acpi_cpu_unplug_cb(CPUHotplugState *cpu_st, return; } + cdev->is_enabled = false; + if (!qemu_persistent_cpu(CPU(dev))) { + cdev->is_present = false; + } + cdev->cpu = NULL; } @@ -303,6 +325,8 @@ static const VMStateDescription vmstate_cpuhp_sts = { .fields = (VMStateField[]) { VMSTATE_BOOL(is_inserting, AcpiCpuStatus), VMSTATE_BOOL(is_removing, AcpiCpuStatus), + VMSTATE_BOOL(is_present, AcpiCpuStatus), + VMSTATE_BOOL(is_enabled, AcpiCpuStatus), VMSTATE_UINT32(ost_event, AcpiCpuStatus), VMSTATE_UINT32(ost_status, AcpiCpuStatus), VMSTATE_END_OF_LIST() @@ -340,6 +364,7 @@ const VMStateDescription vmstate_cpu_hotplug = { #define CPU_REMOVE_EVENT "CRMV" #define CPU_EJECT_EVENT "CEJ0" #define CPU_FW_EJECT_EVENT "CEJF" +#define CPU_PRESENT "CPRS" void build_cpus_aml(Aml *table, MachineState *machine, CPUHotplugFeatures opts, build_madt_cpu_fn build_madt_cpu, @@ -400,7 +425,9 @@ void build_cpus_aml(Aml *table, MachineState *machine, CPUHotplugFeatures opts, aml_append(field, aml_named_field(CPU_EJECT_EVENT, 1)); /* tell firmware to do device eject, write only */ aml_append(field, aml_named_field(CPU_FW_EJECT_EVENT, 1)); - aml_append(field, aml_reserved_field(3)); + /* 1 if present, read only */ + 
aml_append(field, aml_named_field(CPU_PRESENT, 1)); + aml_append(field, aml_reserved_field(2)); aml_append(field, aml_named_field(CPU_COMMAND, 8)); aml_append(cpu_ctrl_dev, field); @@ -430,6 +457,7 @@ void build_cpus_aml(Aml *table, MachineState *machine, CPUHotplugFeatures opts, Aml *ctrl_lock = aml_name("%s.%s", cphp_res_path, CPU_LOCK); Aml *cpu_selector = aml_name("%s.%s", cphp_res_path, CPU_SELECTOR); Aml *is_enabled = aml_name("%s.%s", cphp_res_path, CPU_ENABLED); + Aml *is_present = aml_name("%s.%s", cphp_res_path, CPU_PRESENT); Aml *cpu_cmd = aml_name("%s.%s", cphp_res_path, CPU_COMMAND); Aml *cpu_data = aml_name("%s.%s", cphp_res_path, CPU_DATA); Aml *ins_evt = aml_name("%s.%s", cphp_res_path, CPU_INSERT_EVENT); @@ -458,13 +486,26 @@ void build_cpus_aml(Aml *table, MachineState *machine, CPUHotplugFeatures opts, { Aml *idx = aml_arg(0); Aml *sta = aml_local(0); + Aml *ifctx2; + Aml *else_ctx; aml_append(method, aml_acquire(ctrl_lock, 0xFFFF)); aml_append(method, aml_store(idx, cpu_selector)); aml_append(method, aml_store(zero, sta)); - ifctx = aml_if(aml_equal(is_enabled, one)); + ifctx = aml_if(aml_equal(is_present, one)); { - aml_append(ifctx, aml_store(aml_int(0xF), sta)); + ifctx2 = aml_if(aml_equal(is_enabled, one)); + { + /* cpu is present and enabled */ + aml_append(ifctx2, aml_store(aml_int(0xF), sta)); + } + aml_append(ifctx, ifctx2); + else_ctx = aml_else(); + { + /* cpu is present but disabled */ + aml_append(else_ctx, aml_store(aml_int(0xD), sta)); + } + aml_append(ifctx, else_ctx); } aml_append(method, ifctx); aml_append(method, aml_release(ctrl_lock)); diff --git a/hw/acpi/generic_event_device.c b/hw/acpi/generic_event_device.c index d2fa1d0e4a..b84602b238 100644 --- a/hw/acpi/generic_event_device.c +++ b/hw/acpi/generic_event_device.c @@ -319,6 +319,16 @@ static const VMStateDescription vmstate_memhp_state = { } }; +static const VMStateDescription vmstate_cpuhp_state = { + .name = "acpi-ged/cpuhp", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_CPU_HOTPLUG(cpuhp_state, AcpiGedState), + VMSTATE_END_OF_LIST() + } +}; + static const VMStateDescription vmstate_ged_state = { .name = "acpi-ged-state", .version_id = 1, @@ -367,6 +377,7 @@ static const VMStateDescription vmstate_acpi_ged = { }, .subsections = (const VMStateDescription * []) { &vmstate_memhp_state, + &vmstate_cpuhp_state, &vmstate_ghes_state, NULL } diff --git a/include/hw/acpi/cpu.h b/include/hw/acpi/cpu.h index b31a2e50d9..fced952152 100644 --- a/include/hw/acpi/cpu.h +++ b/include/hw/acpi/cpu.h @@ -23,6 +23,8 @@ typedef struct AcpiCpuStatus { uint64_t arch_id; bool is_inserting; bool is_removing; + bool is_present; + bool is_enabled; bool fw_remove; uint32_t ost_event; uint32_t ost_status; -- Gitee From cfdb0f24431ae0f5115f905a1411509c01a50e88 Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Tue, 9 Jun 2020 00:50:36 +0100 Subject: [PATCH 192/939] hw/acpi: Update GED _EVT method AML with cpu scan OSPM evaluates _EVT method to map the event. The cpu hotplug event eventually results in start of the cpu scan. Scan figures out the cpu and the kind of event(plug/unplug) and notifies it back to the guest. The change in this patch updates the GED AML _EVT method with the call to \\_SB.CPUS.CSCN which will do above. 
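For illustration only, the generated method ends up with roughly the following shape when decompiled from the guest's DSDT (an approximation; the real _EVT also takes the GED lock and handles the other event bits):

    /*
     * Approximate guest-visible ASL, shown here only to illustrate the flow:
     *
     *   Method (_EVT, 1, Serialized) {
     *       Local0 = ESEL              // GED event-select register
     *       If (Local0 & 0x08) {       // ACPI_GED_CPU_HOTPLUG_EVT
     *           \_SB.CPUS.CSCN ()      // ACPI_CPU_CONTAINER "." ACPI_CPU_SCAN_METHOD
     *       }
     *   }
     */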
Co-developed-by: Salil Mehta Signed-off-by: Salil Mehta Co-developed-by: Keqian Zhu Signed-off-by: Keqian Zhu Signed-off-by: Salil Mehta --- hw/acpi/generic_event_device.c | 4 ++++ include/hw/acpi/cpu_hotplug.h | 2 ++ 2 files changed, 6 insertions(+) diff --git a/hw/acpi/generic_event_device.c b/hw/acpi/generic_event_device.c index b84602b238..ad252e6a91 100644 --- a/hw/acpi/generic_event_device.c +++ b/hw/acpi/generic_event_device.c @@ -108,6 +108,10 @@ void build_ged_aml(Aml *table, const char *name, HotplugHandler *hotplug_dev, aml_append(if_ctx, aml_call0(MEMORY_DEVICES_CONTAINER "." MEMORY_SLOT_SCAN_METHOD)); break; + case ACPI_GED_CPU_HOTPLUG_EVT: + aml_append(if_ctx, aml_call0(ACPI_CPU_CONTAINER "." + ACPI_CPU_SCAN_METHOD)); + break; case ACPI_GED_PWR_DOWN_EVT: aml_append(if_ctx, aml_notify(aml_name(ACPI_POWER_BUTTON_DEVICE), diff --git a/include/hw/acpi/cpu_hotplug.h b/include/hw/acpi/cpu_hotplug.h index 48b291e45e..ef631750b4 100644 --- a/include/hw/acpi/cpu_hotplug.h +++ b/include/hw/acpi/cpu_hotplug.h @@ -20,6 +20,8 @@ #include "hw/acpi/cpu.h" #define ACPI_CPU_HOTPLUG_REG_LEN 12 +#define ACPI_CPU_SCAN_METHOD "CSCN" +#define ACPI_CPU_CONTAINER "\\_SB.CPUS" typedef struct AcpiCpuHotplug { Object *device; -- Gitee From 8e1b8d624128523654786953b381557c82654a57 Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Wed, 6 May 2020 18:03:11 +0100 Subject: [PATCH 193/939] hw/arm: MADT Tbl change to size the guest with possible vCPUs Changes required during building of MADT Table by QEMU to accomodate disabled possible vCPUs. This info shall be used by the guest kernel to size up its resources during boot time. This pre-sizing of the guest kernel done on possible vCPUs will facilitate hotplug of the disabled vCPUs. This change also caters ACPI MADT GIC CPU Interface flag related changes recently introduced in the UEFI ACPI 6.5 Specification which allows deferred virtual CPU online'ing in the Guest Kernel. Link: https://uefi.org/specs/ACPI/6.5/05_ACPI_Software_Programming_Model.html#gic-cpu-interface-gicc-structure Co-developed-by: Salil Mehta Signed-off-by: Salil Mehta Co-developed-by: Keqian Zhu Signed-off-by: Keqian Zhu Signed-off-by: Salil Mehta --- hw/arm/virt-acpi-build.c | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c index d88f3cded1..2870c1ec5a 100644 --- a/hw/arm/virt-acpi-build.c +++ b/hw/arm/virt-acpi-build.c @@ -779,6 +779,29 @@ static void build_append_gicr(GArray *table_data, uint64_t base, uint32_t size) build_append_int_noprefix(table_data, size, 4); /* Discovery Range Length */ } +static uint32_t virt_acpi_get_gicc_flags(CPUState *cpu) +{ + MachineClass *mc = MACHINE_GET_CLASS(qdev_get_machine()); + + /* can only exist in 'enabled' state */ + if (!mc->has_hotpluggable_cpus) { + return 1; + } + + /* + * ARM GIC CPU Interface can be 'online-capable' or 'enabled' at boot + * We MUST set 'online-capable' Bit for all hotpluggable CPUs except the + * first/boot CPU. Cold-booted CPUs without 'Id' can also be unplugged. + * Though as-of-now this is only used as a debugging feature. + * + * UEFI ACPI Specification 6.5 + * Section: 5.2.12.14. GIC CPU Interface (GICC) Structure + * Table: 5.37 GICC CPU Interface Flags + * Link: https://uefi.org/specs/ACPI/6.5 + */ + return cpu && !cpu->cpu_index ? 
1 : (1 << 3); +} + static void build_madt(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) { @@ -805,12 +828,13 @@ build_madt(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) build_append_int_noprefix(table_data, vms->gic_version, 1); build_append_int_noprefix(table_data, 0, 3); /* Reserved */ - for (i = 0; i < MACHINE(vms)->smp.cpus; i++) { - ARMCPU *armcpu = ARM_CPU(qemu_get_cpu(i)); + for (i = 0; i < MACHINE(vms)->smp.max_cpus; i++) { + CPUState *cpu = qemu_get_possible_cpu(i); uint64_t physical_base_address = 0, gich = 0, gicv = 0; uint32_t vgic_interrupt = vms->virt ? ARCH_GIC_MAINT_IRQ : 0; - uint32_t pmu_interrupt = arm_feature(&armcpu->env, ARM_FEATURE_PMU) ? - VIRTUAL_PMU_IRQ : 0; + uint32_t pmu_interrupt = vms->pmu ? VIRTUAL_PMU_IRQ : 0; + uint32_t flags = virt_acpi_get_gicc_flags(cpu); + uint64_t mpidr = qemu_get_cpu_archid(i); if (vms->gic_version == VIRT_GIC_VERSION_2) { physical_base_address = memmap[VIRT_GIC_CPU].base; @@ -825,7 +849,7 @@ build_madt(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) build_append_int_noprefix(table_data, i, 4); /* GIC ID */ build_append_int_noprefix(table_data, i, 4); /* ACPI Processor UID */ /* Flags */ - build_append_int_noprefix(table_data, 1, 4); /* Enabled */ + build_append_int_noprefix(table_data, flags, 4); /* Parking Protocol Version */ build_append_int_noprefix(table_data, 0, 4); /* Performance Interrupt GSIV */ @@ -839,7 +863,7 @@ build_madt(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) build_append_int_noprefix(table_data, vgic_interrupt, 4); build_append_int_noprefix(table_data, 0, 8); /* GICR Base Address*/ /* MPIDR */ - build_append_int_noprefix(table_data, armcpu->mp_affinity, 8); + build_append_int_noprefix(table_data, mpidr, 8); /* Processor Power Efficiency Class */ build_append_int_noprefix(table_data, 0, 1); /* Reserved */ -- Gitee From e9b0d476172e872bf695780a9ffa8072faeb3cd0 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Mon, 25 Apr 2022 17:40:57 +0100 Subject: [PATCH 194/939] hw/acpi: Make _MAT method optional The GICC interface on arm64 vCPUs is statically defined in the MADT, and doesn't require a _MAT entry. Although the GICC is indicated as present by the MADT entry, it can only be used from vCPU sysregs, which aren't accessible until hot-add. 
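For context, the x86 call site (shown earlier in this series) keeps supplying its MADT-entry builder, while the arm/virt call site added later passes NULL for build_madt_cpu (see the build_cpus_aml sketch after the CPPC factoring patch above), which is the case this patch guards against:

    /* Sketch of the existing i386 caller, as it appears earlier in this
     * series; arm/virt instead passes NULL so that no _MAT object is built. */
    build_cpus_aml(dsdt, machine, opts, pc_madt_cpu_entry, NULL,
                   pm->cpu_hp_io_base, "\\_SB.PCI0", "\\_GPE._E02",
                   AML_SYSTEM_IO);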
Co-developed-by: Jean-Philippe Brucker Signed-off-by: Jean-Philippe Brucker Co-developed-by: Jonathan Cameron Signed-off-by: Jonathan Cameron Signed-off-by: Salil Mehta --- hw/acpi/cpu.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/hw/acpi/cpu.c b/hw/acpi/cpu.c index 991f1d4181..c922c380aa 100644 --- a/hw/acpi/cpu.c +++ b/hw/acpi/cpu.c @@ -720,9 +720,11 @@ void build_cpus_aml(Aml *table, MachineState *machine, CPUHotplugFeatures opts, aml_append(dev, method); /* build _MAT object */ - build_madt_cpu(i, arch_ids, madt_buf, true); /* set enabled flag */ - aml_append(dev, aml_name_decl("_MAT", - aml_buffer(madt_buf->len, (uint8_t *)madt_buf->data))); + if (build_madt_cpu) { + build_madt_cpu(i, arch_ids, madt_buf, true); /* set enabled flag */ + aml_append(dev, aml_name_decl("_MAT", + aml_buffer(madt_buf->len, (uint8_t *)madt_buf->data))); + } g_array_free(madt_buf, true); if (CPU(arch_ids->cpus[i].cpu) != first_cpu) { -- Gitee From 097e3b46a7eede0182a846f7b993e14d3eed83b7 Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Tue, 9 Jun 2020 03:01:08 +0100 Subject: [PATCH 195/939] arm/virt: Release objects for *disabled* possible vCPUs after init During machvirt_init(), QOM ARMCPU objects are also pre-created along with the corresponding KVM vCPUs in the host for all possible vCPUs. This is necessary because of the architectural constraint: KVM restricts the deferred creation of the KVM vCPUs and VGIC initialization/sizing after VM init. Hence, VGIC is pre-sized with possible vCPUs. After initialization of the machine is complete, the disabled possible KVM vCPUs are then parked at the per-virt-machine list "kvm_parked_vcpus" and we release the QOM ARMCPU objects for the disabled vCPUs. These shall be re-created when a vCPU is hotplugged again; the QOM ARMCPU object is then re-attached to the corresponding parked KVM vCPU. Alternatively, we could've never released the QOM CPU objects and kept on reusing them. This approach might require some modifications of the qdevice_add() interface to get the old ARMCPU object instead of creating a new one for the hotplug request. Each of the above approaches comes with its own pros and cons. This prototype uses the 1st approach (suggestions are welcome!). Co-developed-by: Salil Mehta Signed-off-by: Salil Mehta Co-developed-by: Keqian Zhu Signed-off-by: Keqian Zhu Signed-off-by: Salil Mehta --- hw/arm/virt.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 91b2653c03..bf385a469c 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -2060,6 +2060,7 @@ static void virt_cpu_post_init(VirtMachineState *vms, MemoryRegion *sysmem) { CPUArchIdList *possible_cpus = vms->parent.possible_cpus; int max_cpus = MACHINE(vms)->smp.max_cpus; + MachineState *ms = MACHINE(vms); bool aarch64, steal_time; CPUState *cpu; int n; @@ -2120,6 +2121,37 @@ static void virt_cpu_post_init(VirtMachineState *vms, MemoryRegion *sysmem) } } } + + if (kvm_enabled() || tcg_enabled()) { + for (n = 0; n < possible_cpus->len; n++) { + cpu = qemu_get_possible_cpu(n); + + /* + * Now, GIC has been sized with possible CPUs and we don't require + * disabled vCPU objects to be represented in the QOM. Release the + * disabled ARMCPU objects earlier used during init for pre-sizing. + * + * We fake to the guest through ACPI about the presence (_STA.PRES=1) + * of these non-existent vCPUs at VMM/qemu and present these as + * disabled vCPUs (_STA.ENA=0) so that they can't be used. 
These vCPUs + * can be later added to the guest through hotplug exchanges when + * ARMCPU objects are created back again using 'device_add' QMP + * command. + */ + /* + * RFC: Question: Other approach could've been to keep them forever + * and release it only once when qemu exits as part of finalize or + * when new vCPU is hotplugged. In the later old could be released + * for the newly created object for the same vCPU? + */ + if (!qemu_enabled_cpu(cpu)) { + CPUArchId *cpu_slot; + cpu_slot = virt_find_cpu_slot(ms, cpu->cpu_index); + cpu_slot->cpu = NULL; + object_unref(OBJECT(cpu)); + } + } + } } static void virt_cpu_set_properties(Object *cpuobj, const CPUArchId *cpu_slot, -- Gitee From 0bdb1861985704af9b82e35053b5ab99f7880eb6 Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Thu, 7 May 2020 21:30:09 +0100 Subject: [PATCH 196/939] hw/acpi: Update ACPI GED framework to support vCPU Hotplug ACPI GED shall be used to convey to the guest kernel about any CPU hot-(un)plug events. Therefore, existing ACPI GED framework inside QEMU needs to be enhanced to support CPU hotplug state and events. Co-developed-by: Salil Mehta Signed-off-by: Salil Mehta Co-developed-by: Keqian Zhu Signed-off-by: Keqian Zhu Signed-off-by: Salil Mehta --- hw/acpi/generic_event_device.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/hw/acpi/generic_event_device.c b/hw/acpi/generic_event_device.c index ad252e6a91..0266733a54 100644 --- a/hw/acpi/generic_event_device.c +++ b/hw/acpi/generic_event_device.c @@ -12,6 +12,7 @@ #include "qemu/osdep.h" #include "qapi/error.h" #include "hw/acpi/acpi.h" +#include "hw/acpi/cpu.h" #include "hw/acpi/generic_event_device.h" #include "hw/irq.h" #include "hw/mem/pc-dimm.h" @@ -239,6 +240,8 @@ static void acpi_ged_device_plug_cb(HotplugHandler *hotplug_dev, } else { acpi_memory_plug_cb(hotplug_dev, &s->memhp_state, dev, errp); } + } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { + acpi_cpu_plug_cb(hotplug_dev, &s->cpuhp_state, dev, errp); } else { error_setg(errp, "virt: device plug request for unsupported device" " type: %s", object_get_typename(OBJECT(dev))); @@ -253,6 +256,8 @@ static void acpi_ged_unplug_request_cb(HotplugHandler *hotplug_dev, if ((object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM) && !(object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM)))) { acpi_memory_unplug_request_cb(hotplug_dev, &s->memhp_state, dev, errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { + acpi_cpu_unplug_request_cb(hotplug_dev, &s->cpuhp_state, dev, errp); } else { error_setg(errp, "acpi: device unplug request for unsupported device" " type: %s", object_get_typename(OBJECT(dev))); @@ -266,6 +271,8 @@ static void acpi_ged_unplug_cb(HotplugHandler *hotplug_dev, if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) { acpi_memory_unplug_cb(&s->memhp_state, dev, errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { + acpi_cpu_unplug_cb(&s->cpuhp_state, dev, errp); } else { error_setg(errp, "acpi: device unplug for unsupported device" " type: %s", object_get_typename(OBJECT(dev))); @@ -277,6 +284,7 @@ static void acpi_ged_ospm_status(AcpiDeviceIf *adev, ACPIOSTInfoList ***list) AcpiGedState *s = ACPI_GED(adev); acpi_memory_ospm_status(&s->memhp_state, list); + acpi_cpu_ospm_status(&s->cpuhp_state, list); } static void acpi_ged_send_event(AcpiDeviceIf *adev, AcpiEventStatusBits ev) @@ -291,6 +299,8 @@ static void acpi_ged_send_event(AcpiDeviceIf *adev, AcpiEventStatusBits ev) sel = ACPI_GED_PWR_DOWN_EVT; } else if (ev & ACPI_NVDIMM_HOTPLUG_STATUS) { sel = 
ACPI_GED_NVDIMM_HOTPLUG_EVT; + } else if (ev & ACPI_CPU_HOTPLUG_STATUS) { + sel = ACPI_GED_CPU_HOTPLUG_EVT; } else { /* Unknown event. Return without generating interrupt. */ warn_report("GED: Unsupported event %d. No irq injected", ev); -- Gitee From 724ab355c047cfb3e970d9ea78577087568eb095 Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Fri, 8 May 2020 18:40:19 +0100 Subject: [PATCH 197/939] arm/virt: Add/update basic hot-(un)plug framework Add CPU hot-unplug hooks and update hotplug hooks with additional sanity checks for use in hotplug paths. Note, Functional contents of the hooks(now left with TODO comment) shall be gradually filled in the subsequent patches in an incremental approach to patch and logic building which would be roughly as follows: 1. (Un-)wiring of interrupts between vCPU<->GIC 2. Sending events to Guest for hot-(un)plug so that guest can take appropriate actions. 3. Notifying GIC about hot-(un)plug action so that vCPU could be (un-)stitched to the GIC CPU interface. 4. Updating the Guest with Next boot info for this vCPU in the firmware. Co-developed-by: Salil Mehta Signed-off-by: Salil Mehta Co-developed-by: Keqian Zhu Signed-off-by: Keqian Zhu Signed-off-by: Salil Mehta --- hw/arm/virt.c | 104 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) diff --git a/hw/arm/virt.c b/hw/arm/virt.c index bf385a469c..ed354be326 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -83,6 +83,7 @@ #include "hw/virtio/virtio-iommu.h" #include "hw/char/pl011.h" #include "qemu/guest-random.h" +#include "qapi/qmp/qdict.h" #define DEFINE_VIRT_MACHINE_LATEST(major, minor, latest) \ static void virt_##major##_##minor##_class_init(ObjectClass *oc, \ @@ -3083,12 +3084,23 @@ static void virt_cpu_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, { VirtMachineState *vms = VIRT_MACHINE(hotplug_dev); MachineState *ms = MACHINE(hotplug_dev); + MachineClass *mc = MACHINE_GET_CLASS(ms); ARMCPU *cpu = ARM_CPU(dev); CPUState *cs = CPU(dev); CPUArchId *cpu_slot; int32_t min_cpuid = 0; int32_t max_cpuid; + if (dev->hotplugged && !vms->acpi_dev) { + error_setg(errp, "GED acpi device does not exists"); + return; + } + + if (dev->hotplugged && !mc->has_hotpluggable_cpus) { + error_setg(errp, "CPU hotplug not supported on this machine"); + return; + } + /* sanity check the cpu */ if (!object_dynamic_cast(OBJECT(cpu), ms->cpu_type)) { error_setg(errp, "Invalid CPU type, expected cpu type: '%s'", @@ -3137,6 +3149,22 @@ static void virt_cpu_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, } virt_cpu_set_properties(OBJECT(cs), cpu_slot, errp); + /* + * Fix the GIC for this new vCPU being plugged. The QOM CPU object for the + * new vCPU need to be updated in the corresponding QOM GICv3CPUState object + * We also need to re-wire the IRQs for this new CPU object. This update + * is limited to the QOM only and does not affects the KVM. Later has + * already been pre-sized with possible CPU at VM init time. This is a + * workaround to the constraints posed by ARM architecture w.r.t supporting + * CPU Hotplug. Specification does not exist for the later. + * This patch-up is required both for {cold,hot}-plugged vCPUs. Cold-inited + * vCPUs have their GIC state initialized during machvit_init(). + */ + if (vms->acpi_dev) { + /* TODO: update GIC about this hotplug change here */ + /* TODO: wire the GIC<->CPU irqs */ + } + /* * To give persistent presence view of vCPUs to the guest, ACPI might need * to fake the presence of the vCPUs to the guest but keep them disabled. 
@@ -3148,6 +3176,7 @@ static void virt_cpu_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, static void virt_cpu_plug(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { + VirtMachineState *vms = VIRT_MACHINE(hotplug_dev); MachineState *ms = MACHINE(hotplug_dev); CPUState *cs = CPU(dev); CPUArchId *cpu_slot; @@ -3156,10 +3185,81 @@ static void virt_cpu_plug(HotplugHandler *hotplug_dev, DeviceState *dev, cpu_slot = virt_find_cpu_slot(ms, cs->cpu_index); cpu_slot->cpu = OBJECT(dev); + /* + * Update the ACPI Hotplug state both for vCPUs being {hot,cold}-plugged. + * vCPUs can be cold-plugged using '-device' option. For vCPUs being hot + * plugged, guest is also notified. + */ + if (vms->acpi_dev) { + /* TODO: update acpi hotplug state. Send cpu hotplug event to guest */ + /* TODO: register cpu for reset & update F/W info for the next boot */ + } + cs->disabled = false; return; } +static void virt_cpu_unplug_request(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + MachineClass *mc = MACHINE_GET_CLASS(qdev_get_machine()); + VirtMachineState *vms = VIRT_MACHINE(hotplug_dev); + ARMCPU *cpu = ARM_CPU(dev); + CPUState *cs = CPU(dev); + + if (!vms->acpi_dev || !dev->realized) { + error_setg(errp, "GED does not exists or device is not realized!"); + return; + } + + if (!mc->has_hotpluggable_cpus) { + error_setg(errp, "CPU hot(un)plug not supported on this machine"); + return; + } + + if (cs->cpu_index == first_cpu->cpu_index) { + error_setg(errp, "Boot CPU(id%d=%d:%d:%d:%d) hot-unplug not supported", + first_cpu->cpu_index, cpu->socket_id, cpu->cluster_id, + cpu->core_id, cpu->thread_id); + return; + } + + /* TODO: request cpu hotplug from guest */ + + return; +} + +static void virt_cpu_unplug(HotplugHandler *hotplug_dev, DeviceState *dev, + Error **errp) +{ + VirtMachineState *vms = VIRT_MACHINE(hotplug_dev); + MachineState *ms = MACHINE(hotplug_dev); + CPUState *cs = CPU(dev); + CPUArchId *cpu_slot; + + if (!vms->acpi_dev || !dev->realized) { + error_setg(errp, "GED does not exists or device is not realized!"); + return; + } + + cpu_slot = virt_find_cpu_slot(ms, cs->cpu_index); + + /* TODO: update the acpi cpu hotplug state for cpu hot-unplug */ + + /* TODO: unwire the gic-cpu irqs here */ + /* TODO: update the GIC about this hot unplug change */ + + /* TODO: unregister cpu for reset & update F/W info for the next boot */ + + qobject_unref(dev->opts); + dev->opts = NULL; + + cpu_slot->cpu = NULL; + cs->disabled = true; + + return; +} + static void virt_machine_device_pre_plug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { @@ -3284,6 +3384,8 @@ static void virt_machine_device_unplug_request_cb(HotplugHandler *hotplug_dev, } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_PCI)) { virtio_md_pci_unplug_request(VIRTIO_MD_PCI(dev), MACHINE(hotplug_dev), errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { + virt_cpu_unplug_request(hotplug_dev, dev, errp); } else { error_setg(errp, "device unplug request for unsupported device" " type: %s", object_get_typename(OBJECT(dev))); @@ -3297,6 +3399,8 @@ static void virt_machine_device_unplug_cb(HotplugHandler *hotplug_dev, virt_dimm_unplug(hotplug_dev, dev, errp); } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_PCI)) { virtio_md_pci_unplug(VIRTIO_MD_PCI(dev), MACHINE(hotplug_dev), errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { + virt_cpu_unplug(hotplug_dev, dev, errp); } else { error_setg(errp, "virt: device unplug for unsupported device" " type: %s", 
object_get_typename(OBJECT(dev))); -- Gitee From a68abeefcbd78daaf7179b922f6b9040b4b63101 Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Sat, 9 May 2020 15:50:33 +0100 Subject: [PATCH 198/939] arm/virt: Changes to (un)wire GICC<->vCPU IRQs during hot-(un)plug Refactors the existing GIC create code to extract common code to wire the vcpu<->gic interrupts. This function could be used with cold-plug case and also used when vCPU is hot-plugged. It also introduces a new function to unwire the vcpu<->gic interrupts for the vCPU hot-unplug cases. Co-developed-by: Salil Mehta Signed-off-by: Salil Mehta Co-developed-by: Keqian Zhu Signed-off-by: Keqian Zhu Signed-off-by: Salil Mehta --- hw/arm/virt.c | 138 ++++++++++++++++++++++++++++------------- hw/core/gpio.c | 2 +- include/hw/qdev-core.h | 2 + 3 files changed, 99 insertions(+), 43 deletions(-) diff --git a/hw/arm/virt.c b/hw/arm/virt.c index ed354be326..97bf4cca11 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -798,6 +798,99 @@ static void create_v2m(VirtMachineState *vms) vms->msi_controller = VIRT_MSI_CTRL_GICV2M; } +/* + * Mapping from the output timer irq lines from the CPU to the GIC PPI inputs + * we use for the virt board. + */ +const int timer_irq[] = { + [GTIMER_PHYS] = ARCH_TIMER_NS_EL1_IRQ, + [GTIMER_VIRT] = ARCH_TIMER_VIRT_IRQ, + [GTIMER_HYP] = ARCH_TIMER_NS_EL2_IRQ, + [GTIMER_SEC] = ARCH_TIMER_S_EL1_IRQ, +}; + +static void unwire_gic_cpu_irqs(VirtMachineState *vms, CPUState *cs) +{ + MachineState *ms = MACHINE(vms); + unsigned int max_cpus = ms->smp.max_cpus; + DeviceState *cpudev = DEVICE(cs); + DeviceState *gicdev = vms->gic; + int cpu = CPU(cs)->cpu_index; + int type = vms->gic_version; + int irq; + + for (irq = 0; irq < ARRAY_SIZE(timer_irq); irq++) { + qdev_disconnect_gpio_out_named(cpudev, NULL, irq); + } + + if (type != VIRT_GIC_VERSION_2) { + qdev_disconnect_gpio_out_named(cpudev, "gicv3-maintenance-interrupt", + 0); + } else if (vms->virt) { + qdev_disconnect_gpio_out_named(gicdev, SYSBUS_DEVICE_GPIO_IRQ, + cpu + 4 * max_cpus); + } + + /* + * RFC: Question: This currently does not takes care of intimating the + * devices which might be sitting on system bus. Do we need a + * sysbus_disconnect_irq() which also does the job of notification beside + * disconnection? 
+ */ + qdev_disconnect_gpio_out_named(cpudev, "pmu-interrupt", 0); + qdev_disconnect_gpio_out_named(gicdev, SYSBUS_DEVICE_GPIO_IRQ, cpu); + qdev_disconnect_gpio_out_named(gicdev, + SYSBUS_DEVICE_GPIO_IRQ, cpu + max_cpus); + qdev_disconnect_gpio_out_named(gicdev, SYSBUS_DEVICE_GPIO_IRQ, + cpu + 2 * max_cpus); + qdev_disconnect_gpio_out_named(gicdev, SYSBUS_DEVICE_GPIO_IRQ, + cpu + 3 * max_cpus); +} + +static void wire_gic_cpu_irqs(VirtMachineState *vms, CPUState *cs) +{ + MachineState *ms = MACHINE(vms); + unsigned int max_cpus = ms->smp.max_cpus; + DeviceState *cpudev = DEVICE(cs); + DeviceState *gicdev = vms->gic; + int cpu = CPU(cs)->cpu_index; + int type = vms->gic_version; + SysBusDevice *gicbusdev; + int intidbase; + int irq; + + intidbase = NUM_IRQS + cpu * GIC_INTERNAL; + + for (irq = 0; irq < ARRAY_SIZE(timer_irq); irq++) { + qdev_connect_gpio_out(cpudev, irq, + qdev_get_gpio_in(gicdev, + intidbase + timer_irq[irq])); + } + + gicbusdev = SYS_BUS_DEVICE(gicdev); + if (type != VIRT_GIC_VERSION_2) { + qemu_irq qirq = qdev_get_gpio_in(gicdev, + intidbase + ARCH_GIC_MAINT_IRQ); + qdev_connect_gpio_out_named(cpudev, "gicv3-maintenance-interrupt", + 0, qirq); + } else if (vms->virt) { + qemu_irq qirq = qdev_get_gpio_in(gicdev, + intidbase + ARCH_GIC_MAINT_IRQ); + sysbus_connect_irq(gicbusdev, cpu + 4 * max_cpus, qirq); + } + + qdev_connect_gpio_out_named(cpudev, "pmu-interrupt", 0, + qdev_get_gpio_in(gicdev, + intidbase + VIRTUAL_PMU_IRQ)); + sysbus_connect_irq(gicbusdev, cpu, qdev_get_gpio_in(cpudev, ARM_CPU_IRQ)); + sysbus_connect_irq(gicbusdev, cpu + max_cpus, + qdev_get_gpio_in(cpudev, ARM_CPU_FIQ)); + sysbus_connect_irq(gicbusdev, cpu + 2 * max_cpus, + qdev_get_gpio_in(cpudev, ARM_CPU_VIRQ)); + sysbus_connect_irq(gicbusdev, cpu + 3 * max_cpus, + qdev_get_gpio_in(cpudev, ARM_CPU_VFIQ)); +} + static void create_gic(VirtMachineState *vms, MemoryRegion *mem) { MachineState *ms = MACHINE(vms); @@ -894,46 +987,7 @@ static void create_gic(VirtMachineState *vms, MemoryRegion *mem) * and the GIC's IRQ/FIQ/VIRQ/VFIQ interrupt outputs to the CPU's inputs. */ for (i = 0; i < smp_cpus; i++) { - DeviceState *cpudev = DEVICE(qemu_get_cpu(i)); - int intidbase = NUM_IRQS + i * GIC_INTERNAL; - /* Mapping from the output timer irq lines from the CPU to the - * GIC PPI inputs we use for the virt board. 
- */ - const int timer_irq[] = { - [GTIMER_PHYS] = ARCH_TIMER_NS_EL1_IRQ, - [GTIMER_VIRT] = ARCH_TIMER_VIRT_IRQ, - [GTIMER_HYP] = ARCH_TIMER_NS_EL2_IRQ, - [GTIMER_SEC] = ARCH_TIMER_S_EL1_IRQ, - }; - - for (unsigned irq = 0; irq < ARRAY_SIZE(timer_irq); irq++) { - qdev_connect_gpio_out(cpudev, irq, - qdev_get_gpio_in(vms->gic, - intidbase + timer_irq[irq])); - } - - if (vms->gic_version != VIRT_GIC_VERSION_2) { - qemu_irq irq = qdev_get_gpio_in(vms->gic, - intidbase + ARCH_GIC_MAINT_IRQ); - qdev_connect_gpio_out_named(cpudev, "gicv3-maintenance-interrupt", - 0, irq); - } else if (vms->virt) { - qemu_irq irq = qdev_get_gpio_in(vms->gic, - intidbase + ARCH_GIC_MAINT_IRQ); - sysbus_connect_irq(gicbusdev, i + 4 * max_cpus, irq); - } - - qdev_connect_gpio_out_named(cpudev, "pmu-interrupt", 0, - qdev_get_gpio_in(vms->gic, intidbase - + VIRTUAL_PMU_IRQ)); - - sysbus_connect_irq(gicbusdev, i, qdev_get_gpio_in(cpudev, ARM_CPU_IRQ)); - sysbus_connect_irq(gicbusdev, i + max_cpus, - qdev_get_gpio_in(cpudev, ARM_CPU_FIQ)); - sysbus_connect_irq(gicbusdev, i + 2 * max_cpus, - qdev_get_gpio_in(cpudev, ARM_CPU_VIRQ)); - sysbus_connect_irq(gicbusdev, i + 3 * max_cpus, - qdev_get_gpio_in(cpudev, ARM_CPU_VFIQ)); + wire_gic_cpu_irqs(vms, qemu_get_cpu(i)); } fdt_add_gic_node(vms); @@ -3162,7 +3216,7 @@ static void virt_cpu_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, */ if (vms->acpi_dev) { /* TODO: update GIC about this hotplug change here */ - /* TODO: wire the GIC<->CPU irqs */ + wire_gic_cpu_irqs(vms, cs); } /* @@ -3246,7 +3300,7 @@ static void virt_cpu_unplug(HotplugHandler *hotplug_dev, DeviceState *dev, /* TODO: update the acpi cpu hotplug state for cpu hot-unplug */ - /* TODO: unwire the gic-cpu irqs here */ + unwire_gic_cpu_irqs(vms, cs); /* TODO: update the GIC about this hot unplug change */ /* TODO: unregister cpu for reset & update F/W info for the next boot */ diff --git a/hw/core/gpio.c b/hw/core/gpio.c index 80d07a6ec9..abb164d5c0 100644 --- a/hw/core/gpio.c +++ b/hw/core/gpio.c @@ -143,7 +143,7 @@ qemu_irq qdev_get_gpio_out_connector(DeviceState *dev, const char *name, int n) /* disconnect a GPIO output, returning the disconnected input (if any) */ -static qemu_irq qdev_disconnect_gpio_out_named(DeviceState *dev, +qemu_irq qdev_disconnect_gpio_out_named(DeviceState *dev, const char *name, int n) { char *propname = g_strdup_printf("%s[%d]", diff --git a/include/hw/qdev-core.h b/include/hw/qdev-core.h index 151d968238..2d3661d6cd 100644 --- a/include/hw/qdev-core.h +++ b/include/hw/qdev-core.h @@ -739,6 +739,8 @@ qemu_irq qdev_get_gpio_out_connector(DeviceState *dev, const char *name, int n); */ qemu_irq qdev_intercept_gpio_out(DeviceState *dev, qemu_irq icpt, const char *name, int n); +qemu_irq qdev_disconnect_gpio_out_named(DeviceState *dev, + const char *name, int n); BusState *qdev_get_child_bus(DeviceState *dev, const char *name); -- Gitee From 8ad397f33f8b7d82c0ef72608ef8dc3e0ecba1c2 Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Sat, 9 May 2020 14:38:38 +0100 Subject: [PATCH 199/939] hw/arm,gicv3: Changes to update GIC with vCPU hot-plug notification vCPU hot-(un)plug events MUST be notified to the GIC. Introduce a notfication mechanism to update any such events to GIC so that it can update its vCPU to GIC CPU interface association. This is required to implement a workaround to the limitations posed by the ARM architecture. 
For details about the constraints and workarounds please check below slides: Link: https://kvm-forum.qemu.org/2023/talk/9SMPDQ/ Signed-off-by: Salil Mehta Co-developed-by: Keqian Zhu Signed-off-by: Keqian Zhu Signed-off-by: Salil Mehta --- hw/arm/virt.c | 27 +++++++++++++-- hw/intc/arm_gicv3_common.c | 54 +++++++++++++++++++++++++++++- hw/intc/arm_gicv3_cpuif_common.c | 5 +++ hw/intc/gicv3_internal.h | 1 + include/hw/arm/virt.h | 1 + include/hw/intc/arm_gicv3_common.h | 22 ++++++++++++ 6 files changed, 107 insertions(+), 3 deletions(-) diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 97bf4cca11..0312fa366d 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -750,6 +750,16 @@ static inline DeviceState *create_acpi_ged(VirtMachineState *vms) return dev; } +static void virt_add_gic_cpuhp_notifier(VirtMachineState *vms) +{ + MachineClass *mc = MACHINE_GET_CLASS(vms); + + if (mc->has_hotpluggable_cpus) { + Notifier *cpuhp_notifier = gicv3_cpuhp_notifier(vms->gic); + notifier_list_add(&vms->cpuhp_notifiers, cpuhp_notifier); + } +} + static void create_its(VirtMachineState *vms) { const char *itsclass = its_class_name(); @@ -997,6 +1007,9 @@ static void create_gic(VirtMachineState *vms, MemoryRegion *mem) } else if (vms->gic_version == VIRT_GIC_VERSION_2) { create_v2m(vms); } + + /* add GIC CPU hot(un)plug update notifier */ + virt_add_gic_cpuhp_notifier(vms); } static void create_uart(const VirtMachineState *vms, int uart, @@ -2481,6 +2494,8 @@ static void machvirt_init(MachineState *machine) create_fdt(vms); qemu_log("cpu init start\n"); + notifier_list_init(&vms->cpuhp_notifiers); + possible_cpus = mc->possible_cpu_arch_ids(machine); assert(possible_cpus->len == max_cpus); for (n = 0; n < possible_cpus->len; n++) { Object *cpuobj; @@ -3133,6 +3148,14 @@ static void virt_memory_plug(HotplugHandler *hotplug_dev, dev, &error_abort); } +static void virt_update_gic(VirtMachineState *vms, CPUState *cs) +{ + GICv3CPUHotplugInfo gic_info = { .gic = vms->gic, .cpu = cs }; + + /* notify gic to stitch GICC to this new cpu */ + notifier_list_notify(&vms->cpuhp_notifiers, &gic_info); +} + static void virt_cpu_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { @@ -3215,7 +3238,7 @@ static void virt_cpu_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, * vCPUs have their GIC state initialized during machvit_init(). 
*/ if (vms->acpi_dev) { - /* TODO: update GIC about this hotplug change here */ + virt_update_gic(vms, cs); wire_gic_cpu_irqs(vms, cs); } @@ -3301,7 +3324,7 @@ static void virt_cpu_unplug(HotplugHandler *hotplug_dev, DeviceState *dev, /* TODO: update the acpi cpu hotplug state for cpu hot-unplug */ unwire_gic_cpu_irqs(vms, cs); - /* TODO: update the GIC about this hot unplug change */ + virt_update_gic(vms, cs); /* TODO: unregister cpu for reset & update F/W info for the next boot */ diff --git a/hw/intc/arm_gicv3_common.c b/hw/intc/arm_gicv3_common.c index ebd99af610..fc87fa9369 100644 --- a/hw/intc/arm_gicv3_common.c +++ b/hw/intc/arm_gicv3_common.c @@ -33,7 +33,6 @@ #include "hw/arm/linux-boot-if.h" #include "sysemu/kvm.h" - static void gicv3_gicd_no_migration_shift_bug_post_load(GICv3State *cs) { if (cs->gicd_no_migration_shift_bug) { @@ -322,6 +321,56 @@ void gicv3_init_irqs_and_mmio(GICv3State *s, qemu_irq_handler handler, } } +static int arm_gicv3_get_proc_num(GICv3State *s, CPUState *cpu) +{ + uint64_t mp_affinity; + uint64_t gicr_typer; + uint64_t cpu_affid; + int i; + + mp_affinity = object_property_get_uint(OBJECT(cpu), "mp-affinity", NULL); + /* match the cpu mp-affinity to get the gic cpuif number */ + for (i = 0; i < s->num_cpu; i++) { + gicr_typer = s->cpu[i].gicr_typer; + cpu_affid = (gicr_typer >> 32) & 0xFFFFFF; + if (cpu_affid == mp_affinity) { + return i; + } + } + + return -1; +} + +static void arm_gicv3_cpu_update_notifier(Notifier *notifier, void * data) +{ + GICv3CPUHotplugInfo *gic_info = (GICv3CPUHotplugInfo *)data; + CPUState *cpu = gic_info->cpu; + int gic_cpuif_num; + GICv3State *s; + + s = ARM_GICV3_COMMON(gic_info->gic); + + /* this shall get us mapped gicv3 cpuif corresponding to mpidr */ + gic_cpuif_num = arm_gicv3_get_proc_num(s, cpu); + if (gic_cpuif_num < 0) { + error_report("Failed to associate cpu %d with any GIC cpuif", + cpu->cpu_index); + abort(); + } + + /* check if update is for vcpu hot-unplug */ + if (qemu_enabled_cpu(cpu)) { + s->cpu[gic_cpuif_num].cpu = NULL; + return; + } + + /* re-stitch the gic cpuif to this new cpu */ + gicv3_set_gicv3state(cpu, &s->cpu[gic_cpuif_num]); + gicv3_set_cpustate(&s->cpu[gic_cpuif_num], cpu); + + /* TODO: initialize the registers info for this newly added cpu */ +} + static void arm_gicv3_common_realize(DeviceState *dev, Error **errp) { GICv3State *s = ARM_GICV3_COMMON(dev); @@ -444,6 +493,8 @@ static void arm_gicv3_common_realize(DeviceState *dev, Error **errp) s->cpu[cpuidx - 1].gicr_typer |= GICR_TYPER_LAST; } + s->cpu_update_notifier.notify = arm_gicv3_cpu_update_notifier; + s->itslist = g_ptr_array_new(); } @@ -451,6 +502,7 @@ static void arm_gicv3_finalize(Object *obj) { GICv3State *s = ARM_GICV3_COMMON(obj); + notifier_remove(&s->cpu_update_notifier); g_free(s->redist_region_count); } diff --git a/hw/intc/arm_gicv3_cpuif_common.c b/hw/intc/arm_gicv3_cpuif_common.c index ff1239f65d..381cf2754b 100644 --- a/hw/intc/arm_gicv3_cpuif_common.c +++ b/hw/intc/arm_gicv3_cpuif_common.c @@ -20,3 +20,8 @@ void gicv3_set_gicv3state(CPUState *cpu, GICv3CPUState *s) env->gicv3state = (void *)s; }; + +void gicv3_set_cpustate(GICv3CPUState *s, CPUState *cpu) +{ + s->cpu = cpu; +} diff --git a/hw/intc/gicv3_internal.h b/hw/intc/gicv3_internal.h index 29d5cdc1b6..9d4c1209bd 100644 --- a/hw/intc/gicv3_internal.h +++ b/hw/intc/gicv3_internal.h @@ -848,5 +848,6 @@ static inline void gicv3_cache_all_target_cpustates(GICv3State *s) } void gicv3_set_gicv3state(CPUState *cpu, GICv3CPUState *s); +void 
gicv3_set_cpustate(GICv3CPUState *s, CPUState *cpu); #endif /* QEMU_ARM_GICV3_INTERNAL_H */ diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h index 5de0185063..069c9f2a09 100644 --- a/include/hw/arm/virt.h +++ b/include/hw/arm/virt.h @@ -180,6 +180,7 @@ struct VirtMachineState { PCIBus *bus; char *oem_id; char *oem_table_id; + NotifierList cpuhp_notifiers; }; #define VIRT_ECAM_ID(high) (high ? VIRT_HIGH_PCIE_ECAM : VIRT_PCIE_ECAM) diff --git a/include/hw/intc/arm_gicv3_common.h b/include/hw/intc/arm_gicv3_common.h index 4e2fb518e7..97a48f44b9 100644 --- a/include/hw/intc/arm_gicv3_common.h +++ b/include/hw/intc/arm_gicv3_common.h @@ -280,6 +280,7 @@ struct GICv3State { GICv3CPUState *gicd_irouter_target[GICV3_MAXIRQ]; uint32_t gicd_nsacr[DIV_ROUND_UP(GICV3_MAXIRQ, 16)]; + Notifier cpu_update_notifier; GICv3CPUState *cpu; /* List of all ITSes connected to this GIC */ GPtrArray *itslist; @@ -328,6 +329,27 @@ struct ARMGICv3CommonClass { void gicv3_init_irqs_and_mmio(GICv3State *s, qemu_irq_handler handler, const MemoryRegionOps *ops); +/** + * Structure used by GICv3 CPU hotplug notifier + */ +typedef struct GICv3CPUHotplugInfo { + DeviceState *gic; /* GICv3State */ + CPUState *cpu; +} GICv3CPUHotplugInfo; + +/** + * gicv3_cpuhp_notifier + * + * Returns CPU hotplug notifier which could be used to update GIC about any + * CPU hot(un)plug events. + * + * Returns: Notifier initialized with CPU Hot(un)plug update function + */ +static inline Notifier *gicv3_cpuhp_notifier(DeviceState *dev) +{ + GICv3State *s = ARM_GICV3_COMMON(dev); + return &s->cpu_update_notifier; +} /** * gicv3_class_name -- Gitee From 4e0a4443b7c36608fc30dcaaf0db120220111dd2 Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Sat, 9 May 2020 15:26:27 +0100 Subject: [PATCH 200/939] hw/intc/arm-gicv3*: Changes required to (re)init the vCPU register info vCPU register info needs to be re-initialized each time a vCPU is hot-plugged. This has to be done for both the emulation/TCG and KVM cases, and it is done in the context of the GIC update notification for any vCPU hot-(un)plug event. This change adds that support and refactors the existing code to maximize reuse.
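Condensed here for illustration (the wrapper name below is hypothetical; the real logic lives in the GICv3 common code's CPU update notifier added earlier in this series), the dispatch introduced by this patch boils down to:

    /* Sketch: per-model dispatch used when a hot-plugged vCPU shows up. */
    static void gic_reinit_cpu_reginfo(GICv3State *s, CPUState *cs)
    {
        ARMGICv3CommonClass *agcc = ARM_GICV3_COMMON_GET_CLASS(s);

        /*
         * The TCG GIC wires init_cpu_reginfo to gicv3_init_cpu_reginfo(),
         * the KVM GIC to kvm_gicv3_init_cpu_reginfo(); a model may leave
         * it NULL.
         */
        if (agcc->init_cpu_reginfo) {
            agcc->init_cpu_reginfo(cs);
        }
    }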
Co-developed-by: Salil Mehta Signed-off-by: Salil Mehta Co-developed-by: Keqian Zhu Signed-off-by: Keqian Zhu Signed-off-by: Salil Mehta --- hw/intc/arm_gicv3.c | 1 + hw/intc/arm_gicv3_common.c | 7 +- hw/intc/arm_gicv3_cpuif.c | 257 +++++++++++++++-------------- hw/intc/arm_gicv3_kvm.c | 7 +- hw/intc/gicv3_internal.h | 1 + include/hw/intc/arm_gicv3_common.h | 1 + 6 files changed, 150 insertions(+), 124 deletions(-) diff --git a/hw/intc/arm_gicv3.c b/hw/intc/arm_gicv3.c index 0b8f79a122..e1c7c8c4bc 100644 --- a/hw/intc/arm_gicv3.c +++ b/hw/intc/arm_gicv3.c @@ -410,6 +410,7 @@ static void arm_gicv3_class_init(ObjectClass *klass, void *data) ARMGICv3Class *agc = ARM_GICV3_CLASS(klass); agcc->post_load = arm_gicv3_post_load; + agcc->init_cpu_reginfo = gicv3_init_cpu_reginfo; device_class_set_parent_realize(dc, arm_gic_realize, &agc->parent_realize); } diff --git a/hw/intc/arm_gicv3_common.c b/hw/intc/arm_gicv3_common.c index fc87fa9369..d051024a30 100644 --- a/hw/intc/arm_gicv3_common.c +++ b/hw/intc/arm_gicv3_common.c @@ -345,10 +345,12 @@ static void arm_gicv3_cpu_update_notifier(Notifier *notifier, void * data) { GICv3CPUHotplugInfo *gic_info = (GICv3CPUHotplugInfo *)data; CPUState *cpu = gic_info->cpu; + ARMGICv3CommonClass *c; int gic_cpuif_num; GICv3State *s; s = ARM_GICV3_COMMON(gic_info->gic); + c = ARM_GICV3_COMMON_GET_CLASS(s); /* this shall get us mapped gicv3 cpuif corresponding to mpidr */ gic_cpuif_num = arm_gicv3_get_proc_num(s, cpu); @@ -368,7 +370,10 @@ static void arm_gicv3_cpu_update_notifier(Notifier *notifier, void * data) gicv3_set_gicv3state(cpu, &s->cpu[gic_cpuif_num]); gicv3_set_cpustate(&s->cpu[gic_cpuif_num], cpu); - /* TODO: initialize the registers info for this newly added cpu */ + /* initialize the registers info for this newly added cpu */ + if (c->init_cpu_reginfo) { + c->init_cpu_reginfo(cpu); + } } static void arm_gicv3_common_realize(DeviceState *dev, Error **errp) diff --git a/hw/intc/arm_gicv3_cpuif.c b/hw/intc/arm_gicv3_cpuif.c index 0d0eb2f62f..a013510074 100644 --- a/hw/intc/arm_gicv3_cpuif.c +++ b/hw/intc/arm_gicv3_cpuif.c @@ -2782,6 +2782,127 @@ static const ARMCPRegInfo gicv3_cpuif_ich_apxr23_reginfo[] = { }, }; +void gicv3_init_cpu_reginfo(CPUState *cs) +{ + ARMCPU *cpu = ARM_CPU(cs); + GICv3CPUState *gcs = icc_cs_from_env(&cpu->env); + + /* + * If the CPU doesn't define a GICv3 configuration, probably because + * in real hardware it doesn't have one, then we use default values + * matching the one used by most Arm CPUs. This applies to: + * cpu->gic_num_lrs + * cpu->gic_vpribits + * cpu->gic_vprebits + * cpu->gic_pribits + */ + + /* + * Note that we can't just use the GICv3CPUState as an opaque pointer + * in define_arm_cp_regs_with_opaque(), because when we're called back + * it might be with code translated by CPU 0 but run by CPU 1, in + * which case we'd get the wrong value. + * So instead we define the regs with no ri->opaque info, and + * get back to the GICv3CPUState from the CPUARMState. + */ + define_arm_cp_regs(cpu, gicv3_cpuif_reginfo); + + /* + * The CPU implementation specifies the number of supported + * bits of physical priority. For backwards compatibility + * of migration, we have a compat property that forces use + * of 8 priority bits regardless of what the CPU really has. 
+ */ + if (gcs->gic->force_8bit_prio) { + gcs->pribits = 8; + } else { + gcs->pribits = cpu->gic_pribits ?: 5; + } + + /* + * The GICv3 has separate ID register fields for virtual priority + * and preemption bit values, but only a single ID register field + * for the physical priority bits. The preemption bit count is + * always the same as the priority bit count, except that 8 bits + * of priority means 7 preemption bits. We precalculate the + * preemption bits because it simplifies the code and makes the + * parallels between the virtual and physical bits of the GIC + * a bit clearer. + */ + gcs->prebits = gcs->pribits; + if (gcs->prebits == 8) { + gcs->prebits--; + } + /* + * Check that CPU code defining pribits didn't violate + * architectural constraints our implementation relies on. + */ + g_assert(gcs->pribits >= 4 && gcs->pribits <= 8); + + /* + * gicv3_cpuif_reginfo[] defines ICC_AP*R0_EL1; add definitions + * for ICC_AP*R{1,2,3}_EL1 if the prebits value requires them. + */ + if (gcs->prebits >= 6) { + define_arm_cp_regs(cpu, gicv3_cpuif_icc_apxr1_reginfo); + } + if (gcs->prebits == 7) { + define_arm_cp_regs(cpu, gicv3_cpuif_icc_apxr23_reginfo); + } + + if (arm_feature(&cpu->env, ARM_FEATURE_EL2)) { + int j; + + gcs->num_list_regs = cpu->gic_num_lrs ?: 4; + gcs->vpribits = cpu->gic_vpribits ?: 5; + gcs->vprebits = cpu->gic_vprebits ?: 5; + + /* + * Check against architectural constraints: getting these + * wrong would be a bug in the CPU code defining these, + * and the implementation relies on them holding. + */ + g_assert(gcs->vprebits <= gcs->vpribits); + g_assert(gcs->vprebits >= 5 && gcs->vprebits <= 7); + g_assert(gcs->vpribits >= 5 && gcs->vpribits <= 8); + + define_arm_cp_regs(cpu, gicv3_cpuif_hcr_reginfo); + + for (j = 0; j < gcs->num_list_regs; j++) { + /* + * Note that the AArch64 LRs are 64-bit; the AArch32 LRs + * are split into two cp15 regs, LR (the low part, with the + * same encoding as the AArch64 LR) and LRC (the high part). + */ + ARMCPRegInfo lr_regset[] = { + { .name = "ICH_LRn_EL2", .state = ARM_CP_STATE_BOTH, + .opc0 = 3, .opc1 = 4, .crn = 12, + .crm = 12 + (j >> 3), .opc2 = j & 7, + .type = ARM_CP_IO | ARM_CP_NO_RAW, + .access = PL2_RW, + .readfn = ich_lr_read, + .writefn = ich_lr_write, + }, + { .name = "ICH_LRCn_EL2", .state = ARM_CP_STATE_AA32, + .cp = 15, .opc1 = 4, .crn = 12, + .crm = 14 + (j >> 3), .opc2 = j & 7, + .type = ARM_CP_IO | ARM_CP_NO_RAW, + .access = PL2_RW, + .readfn = ich_lr_read, + .writefn = ich_lr_write, + }, + }; + define_arm_cp_regs(cpu, lr_regset); + } + if (gcs->vprebits >= 6) { + define_arm_cp_regs(cpu, gicv3_cpuif_ich_apxr1_reginfo); + } + if (gcs->vprebits == 7) { + define_arm_cp_regs(cpu, gicv3_cpuif_ich_apxr23_reginfo); + } + } +} + static void gicv3_cpuif_el_change_hook(ARMCPU *cpu, void *opaque) { GICv3CPUState *cs = opaque; @@ -2804,131 +2925,23 @@ void gicv3_init_cpuif(GICv3State *s) for (i = 0; i < s->num_cpu; i++) { ARMCPU *cpu = ARM_CPU(qemu_get_cpu(i)); - GICv3CPUState *cs = &s->cpu[i]; - - /* - * If the CPU doesn't define a GICv3 configuration, probably because - * in real hardware it doesn't have one, then we use default values - * matching the one used by most Arm CPUs. 
This applies to: - * cpu->gic_num_lrs - * cpu->gic_vpribits - * cpu->gic_vprebits - * cpu->gic_pribits - */ - - /* Note that we can't just use the GICv3CPUState as an opaque pointer - * in define_arm_cp_regs_with_opaque(), because when we're called back - * it might be with code translated by CPU 0 but run by CPU 1, in - * which case we'd get the wrong value. - * So instead we define the regs with no ri->opaque info, and - * get back to the GICv3CPUState from the CPUARMState. - * - * These CP regs callbacks can be called from either TCG or HVF code. - */ - define_arm_cp_regs(cpu, gicv3_cpuif_reginfo); - - /* - * The CPU implementation specifies the number of supported - * bits of physical priority. For backwards compatibility - * of migration, we have a compat property that forces use - * of 8 priority bits regardless of what the CPU really has. - */ - if (s->force_8bit_prio) { - cs->pribits = 8; - } else { - cs->pribits = cpu->gic_pribits ?: 5; - } - - /* - * The GICv3 has separate ID register fields for virtual priority - * and preemption bit values, but only a single ID register field - * for the physical priority bits. The preemption bit count is - * always the same as the priority bit count, except that 8 bits - * of priority means 7 preemption bits. We precalculate the - * preemption bits because it simplifies the code and makes the - * parallels between the virtual and physical bits of the GIC - * a bit clearer. - */ - cs->prebits = cs->pribits; - if (cs->prebits == 8) { - cs->prebits--; - } - /* - * Check that CPU code defining pribits didn't violate - * architectural constraints our implementation relies on. - */ - g_assert(cs->pribits >= 4 && cs->pribits <= 8); - /* - * gicv3_cpuif_reginfo[] defines ICC_AP*R0_EL1; add definitions - * for ICC_AP*R{1,2,3}_EL1 if the prebits value requires them. - */ - if (cs->prebits >= 6) { - define_arm_cp_regs(cpu, gicv3_cpuif_icc_apxr1_reginfo); - } - if (cs->prebits == 7) { - define_arm_cp_regs(cpu, gicv3_cpuif_icc_apxr23_reginfo); - } - - if (arm_feature(&cpu->env, ARM_FEATURE_EL2)) { - int j; - - cs->num_list_regs = cpu->gic_num_lrs ?: 4; - cs->vpribits = cpu->gic_vpribits ?: 5; - cs->vprebits = cpu->gic_vprebits ?: 5; - - /* Check against architectural constraints: getting these - * wrong would be a bug in the CPU code defining these, - * and the implementation relies on them holding. - */ - g_assert(cs->vprebits <= cs->vpribits); - g_assert(cs->vprebits >= 5 && cs->vprebits <= 7); - g_assert(cs->vpribits >= 5 && cs->vpribits <= 8); - - define_arm_cp_regs(cpu, gicv3_cpuif_hcr_reginfo); - - for (j = 0; j < cs->num_list_regs; j++) { - /* Note that the AArch64 LRs are 64-bit; the AArch32 LRs - * are split into two cp15 regs, LR (the low part, with the - * same encoding as the AArch64 LR) and LRC (the high part). + if (qemu_enabled_cpu(CPU(cpu))) { + GICv3CPUState *cs = icc_cs_from_env(&cpu->env); + gicv3_init_cpu_reginfo(CPU(cpu)); + if (tcg_enabled() || qtest_enabled()) { + /* + * We can only trap EL changes with TCG. However the GIC + * interrupt state only changes on EL changes involving EL2 or + * EL3, so for the non-TCG case this is OK, as EL2 and EL3 can't + * exist. 
*/ - ARMCPRegInfo lr_regset[] = { - { .name = "ICH_LRn_EL2", .state = ARM_CP_STATE_BOTH, - .opc0 = 3, .opc1 = 4, .crn = 12, - .crm = 12 + (j >> 3), .opc2 = j & 7, - .type = ARM_CP_IO | ARM_CP_NO_RAW, - .access = PL2_RW, - .readfn = ich_lr_read, - .writefn = ich_lr_write, - }, - { .name = "ICH_LRCn_EL2", .state = ARM_CP_STATE_AA32, - .cp = 15, .opc1 = 4, .crn = 12, - .crm = 14 + (j >> 3), .opc2 = j & 7, - .type = ARM_CP_IO | ARM_CP_NO_RAW, - .access = PL2_RW, - .readfn = ich_lr_read, - .writefn = ich_lr_write, - }, - }; - define_arm_cp_regs(cpu, lr_regset); - } - if (cs->vprebits >= 6) { - define_arm_cp_regs(cpu, gicv3_cpuif_ich_apxr1_reginfo); - } - if (cs->vprebits == 7) { - define_arm_cp_regs(cpu, gicv3_cpuif_ich_apxr23_reginfo); + arm_register_el_change_hook(cpu, gicv3_cpuif_el_change_hook, + cs); + } else { + assert(!arm_feature(&cpu->env, ARM_FEATURE_EL2)); + assert(!arm_feature(&cpu->env, ARM_FEATURE_EL3)); } } - if (tcg_enabled() || qtest_enabled()) { - /* - * We can only trap EL changes with TCG. However the GIC interrupt - * state only changes on EL changes involving EL2 or EL3, so for - * the non-TCG case this is OK, as EL2 and EL3 can't exist. - */ - arm_register_el_change_hook(cpu, gicv3_cpuif_el_change_hook, cs); - } else { - assert(!arm_feature(&cpu->env, ARM_FEATURE_EL2)); - assert(!arm_feature(&cpu->env, ARM_FEATURE_EL3)); - } } } diff --git a/hw/intc/arm_gicv3_kvm.c b/hw/intc/arm_gicv3_kvm.c index db06c75e2b..dd2a60fa20 100644 --- a/hw/intc/arm_gicv3_kvm.c +++ b/hw/intc/arm_gicv3_kvm.c @@ -804,6 +804,10 @@ static void vm_change_state_handler(void *opaque, bool running, } } +static void kvm_gicv3_init_cpu_reginfo(CPUState *cs) +{ + define_arm_cp_regs(ARM_CPU(cs), gicv3_cpuif_reginfo); +} static void kvm_arm_gicv3_realize(DeviceState *dev, Error **errp) { @@ -837,7 +841,7 @@ static void kvm_arm_gicv3_realize(DeviceState *dev, Error **errp) for (i = 0; i < s->num_cpu; i++) { CPUState *cs = qemu_get_cpu(i); if (qemu_enabled_cpu(cs)) { - define_arm_cp_regs(ARM_CPU(cs), gicv3_cpuif_reginfo); + kvm_gicv3_init_cpu_reginfo(cs); } } @@ -925,6 +929,7 @@ static void kvm_arm_gicv3_class_init(ObjectClass *klass, void *data) agcc->pre_save = kvm_arm_gicv3_get; agcc->post_load = kvm_arm_gicv3_put; + agcc->init_cpu_reginfo = kvm_gicv3_init_cpu_reginfo; device_class_set_parent_realize(dc, kvm_arm_gicv3_realize, &kgc->parent_realize); resettable_class_set_parent_phases(rc, NULL, kvm_arm_gicv3_reset_hold, NULL, diff --git a/hw/intc/gicv3_internal.h b/hw/intc/gicv3_internal.h index 9d4c1209bd..0bed0f6e2a 100644 --- a/hw/intc/gicv3_internal.h +++ b/hw/intc/gicv3_internal.h @@ -709,6 +709,7 @@ void gicv3_redist_vinvall(GICv3CPUState *cs, uint64_t vptaddr); void gicv3_redist_send_sgi(GICv3CPUState *cs, int grp, int irq, bool ns); void gicv3_init_cpuif(GICv3State *s); +void gicv3_init_cpu_reginfo(CPUState *cs); /** * gicv3_cpuif_update: diff --git a/include/hw/intc/arm_gicv3_common.h b/include/hw/intc/arm_gicv3_common.h index 97a48f44b9..b5f8ba17ff 100644 --- a/include/hw/intc/arm_gicv3_common.h +++ b/include/hw/intc/arm_gicv3_common.h @@ -325,6 +325,7 @@ struct ARMGICv3CommonClass { void (*pre_save)(GICv3State *s); void (*post_load)(GICv3State *s); + void (*init_cpu_reginfo)(CPUState *cs); }; void gicv3_init_irqs_and_mmio(GICv3State *s, qemu_irq_handler handler, -- Gitee From afb71c88d935349cdf9763e8f51f77334ab615ec Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Fri, 8 May 2020 18:54:10 +0100 Subject: [PATCH 201/939] arm/virt: Update the guest(via GED) about CPU hot-(un)plug events During any 
vCPU hot-(un)plug, running guest VM needs to be intimated about the new vCPU being added or request the deletion of the vCPU which is already part of the guest VM. This is done using the ACPI GED event which eventually gets demultiplexed to a CPU hotplug event and further to specific hot-(un)plug event of a particular vCPU. This change adds the ACPI calls to the existing hot-(un)plug hooks to trigger ACPI GED events from QEMU to guest VM. Co-developed-by: Salil Mehta Signed-off-by: Salil Mehta Co-developed-by: Keqian Zhu Signed-off-by: Keqian Zhu Signed-off-by: Salil Mehta --- hw/arm/virt.c | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 0312fa366d..60cd560ab9 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -3256,6 +3256,7 @@ static void virt_cpu_plug(HotplugHandler *hotplug_dev, DeviceState *dev, VirtMachineState *vms = VIRT_MACHINE(hotplug_dev); MachineState *ms = MACHINE(hotplug_dev); CPUState *cs = CPU(dev); + Error *local_err = NULL; CPUArchId *cpu_slot; /* insert the cold/hot-plugged vcpu in the slot */ @@ -3268,12 +3269,20 @@ static void virt_cpu_plug(HotplugHandler *hotplug_dev, DeviceState *dev, * plugged, guest is also notified. */ if (vms->acpi_dev) { - /* TODO: update acpi hotplug state. Send cpu hotplug event to guest */ + HotplugHandlerClass *hhc; + /* update acpi hotplug state and send cpu hotplug event to guest */ + hhc = HOTPLUG_HANDLER_GET_CLASS(vms->acpi_dev); + hhc->plug(HOTPLUG_HANDLER(vms->acpi_dev), dev, &local_err); + if (local_err) { + goto fail; + } /* TODO: register cpu for reset & update F/W info for the next boot */ } cs->disabled = false; return; +fail: + error_propagate(errp, local_err); } static void virt_cpu_unplug_request(HotplugHandler *hotplug_dev, @@ -3281,8 +3290,10 @@ static void virt_cpu_unplug_request(HotplugHandler *hotplug_dev, { MachineClass *mc = MACHINE_GET_CLASS(qdev_get_machine()); VirtMachineState *vms = VIRT_MACHINE(hotplug_dev); + HotplugHandlerClass *hhc; ARMCPU *cpu = ARM_CPU(dev); CPUState *cs = CPU(dev); + Error *local_err = NULL; if (!vms->acpi_dev || !dev->realized) { error_setg(errp, "GED does not exists or device is not realized!"); @@ -3301,9 +3312,16 @@ static void virt_cpu_unplug_request(HotplugHandler *hotplug_dev, return; } - /* TODO: request cpu hotplug from guest */ + /* request cpu hotplug from guest */ + hhc = HOTPLUG_HANDLER_GET_CLASS(vms->acpi_dev); + hhc->unplug_request(HOTPLUG_HANDLER(vms->acpi_dev), dev, &local_err); + if (local_err) { + goto fail; + } return; +fail: + error_propagate(errp, local_err); } static void virt_cpu_unplug(HotplugHandler *hotplug_dev, DeviceState *dev, @@ -3311,7 +3329,9 @@ static void virt_cpu_unplug(HotplugHandler *hotplug_dev, DeviceState *dev, { VirtMachineState *vms = VIRT_MACHINE(hotplug_dev); MachineState *ms = MACHINE(hotplug_dev); + HotplugHandlerClass *hhc; CPUState *cs = CPU(dev); + Error *local_err = NULL; CPUArchId *cpu_slot; if (!vms->acpi_dev || !dev->realized) { @@ -3321,7 +3341,12 @@ static void virt_cpu_unplug(HotplugHandler *hotplug_dev, DeviceState *dev, cpu_slot = virt_find_cpu_slot(ms, cs->cpu_index); - /* TODO: update the acpi cpu hotplug state for cpu hot-unplug */ + /* update the acpi cpu hotplug state for cpu hot-unplug */ + hhc = HOTPLUG_HANDLER_GET_CLASS(vms->acpi_dev); + hhc->unplug(HOTPLUG_HANDLER(vms->acpi_dev), dev, &local_err); + if (local_err) { + goto fail; + } unwire_gic_cpu_irqs(vms, cs); virt_update_gic(vms, cs); @@ -3335,6 +3360,8 @@ static void 
virt_cpu_unplug(HotplugHandler *hotplug_dev, DeviceState *dev, cs->disabled = true; return; +fail: + error_propagate(errp, local_err); } static void virt_machine_device_pre_plug_cb(HotplugHandler *hotplug_dev, -- Gitee From 3e5f043c493fa4765c5637bec66be2bd620bc53f Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Sat, 9 May 2020 18:10:24 +0100 Subject: [PATCH 202/939] hw/arm: Changes required for reset and to support next boot Updates the firmware config with the next boot cpus information and also registers the reset callback to be called when guest reboots to reset the cpu. Co-developed-by: Salil Mehta Signed-off-by: Salil Mehta Co-developed-by: Keqian Zhu Signed-off-by: Keqian Zhu Signed-off-by: Salil Mehta --- hw/arm/boot.c | 2 +- hw/arm/virt.c | 18 +++++++++++++++--- include/hw/arm/boot.h | 2 ++ include/hw/arm/virt.h | 1 + 4 files changed, 19 insertions(+), 4 deletions(-) diff --git a/hw/arm/boot.c b/hw/arm/boot.c index d1671e1d42..345c7cfa19 100644 --- a/hw/arm/boot.c +++ b/hw/arm/boot.c @@ -683,7 +683,7 @@ fail: return -1; } -static void do_cpu_reset(void *opaque) +void do_cpu_reset(void *opaque) { ARMCPU *cpu = opaque; CPUState *cs = CPU(cpu); diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 60cd560ab9..eedff8e525 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -46,6 +46,8 @@ #include "sysemu/device_tree.h" #include "sysemu/numa.h" #include "sysemu/runstate.h" +#include "sysemu/reset.h" +#include "sysemu/sysemu.h" #include "sysemu/tpm.h" #include "sysemu/tcg.h" #include "sysemu/kvm.h" @@ -1453,7 +1455,7 @@ static FWCfgState *create_fw_cfg(const VirtMachineState *vms, AddressSpace *as) char *nodename; fw_cfg = fw_cfg_init_mem_wide(base + 8, base, 8, base + 16, as); - fw_cfg_add_i16(fw_cfg, FW_CFG_NB_CPUS, (uint16_t)ms->smp.cpus); + fw_cfg_add_i16(fw_cfg, FW_CFG_NB_CPUS, vms->boot_cpus); nodename = g_strdup_printf("/fw-cfg@%" PRIx64, base); qemu_fdt_add_subnode(ms->fdt, nodename); @@ -3276,7 +3278,13 @@ static void virt_cpu_plug(HotplugHandler *hotplug_dev, DeviceState *dev, if (local_err) { goto fail; } - /* TODO: register cpu for reset & update F/W info for the next boot */ + /* register this cpu for reset & update F/W info for the next boot */ + qemu_register_reset(do_cpu_reset, ARM_CPU(cs)); + } + + vms->boot_cpus++; + if (vms->fw_cfg) { + fw_cfg_modify_i16(vms->fw_cfg, FW_CFG_NB_CPUS, vms->boot_cpus); } cs->disabled = false; @@ -3351,7 +3359,11 @@ static void virt_cpu_unplug(HotplugHandler *hotplug_dev, DeviceState *dev, unwire_gic_cpu_irqs(vms, cs); virt_update_gic(vms, cs); - /* TODO: unregister cpu for reset & update F/W info for the next boot */ + qemu_unregister_reset(do_cpu_reset, ARM_CPU(cs)); + vms->boot_cpus--; + if (vms->fw_cfg) { + fw_cfg_modify_i16(vms->fw_cfg, FW_CFG_NB_CPUS, vms->boot_cpus); + } qobject_unref(dev->opts); dev->opts = NULL; diff --git a/include/hw/arm/boot.h b/include/hw/arm/boot.h index 80c492d742..f81326a1dc 100644 --- a/include/hw/arm/boot.h +++ b/include/hw/arm/boot.h @@ -178,6 +178,8 @@ AddressSpace *arm_boot_address_space(ARMCPU *cpu, int arm_load_dtb(hwaddr addr, const struct arm_boot_info *binfo, hwaddr addr_limit, AddressSpace *as, MachineState *ms); +void do_cpu_reset(void *opaque); + /* Write a secure board setup routine with a dummy handler for SMCs */ void arm_write_secure_board_setup_dummy_smc(ARMCPU *cpu, const struct arm_boot_info *info, diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h index 069c9f2a09..ae0f5beb26 100644 --- a/include/hw/arm/virt.h +++ b/include/hw/arm/virt.h @@ -167,6 +167,7 @@ struct VirtMachineState { 
MemMapEntry *memmap; char *pciehb_nodename; const int *irqmap; + uint16_t boot_cpus; int fdt_size; uint32_t clock_phandle; uint32_t gic_phandle; -- Gitee From 8fa5af7de07d9bc2535ea8fab087d509795e3579 Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Sun, 6 Aug 2023 22:12:52 +0000 Subject: [PATCH 203/939] physmem,gdbstub: Common helping funcs/changes to *unrealize* vCPU Supporting vCPU Hotplug for ARM arch also means introducing new functionality of unrealizing the ARMCPU. This requires some new common functions. Defining them as part of architecture independent change so that this code could be reused by other interested parties. Signed-off-by: Salil Mehta --- gdbstub/gdbstub.c | 6 ++++++ include/exec/cpu-common.h | 8 ++++++++ include/exec/gdbstub.h | 1 + include/hw/core/cpu.h | 1 + system/physmem.c | 25 +++++++++++++++++++++++++ 5 files changed, 41 insertions(+) diff --git a/gdbstub/gdbstub.c b/gdbstub/gdbstub.c index 46d752bbc2..f16006d2a8 100644 --- a/gdbstub/gdbstub.c +++ b/gdbstub/gdbstub.c @@ -582,6 +582,12 @@ void gdb_register_coprocessor(CPUState *cpu, } } +void gdb_unregister_coprocessor_all(CPUState *cpu) +{ + g_array_free(cpu->gdb_regs, true); + cpu->gdb_regs = NULL; +} + static void gdb_process_breakpoint_remove_all(GDBProcess *p) { CPUState *cpu = gdb_get_first_cpu_in_process(p); diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h index 41115d8919..2a3d4aa1c8 100644 --- a/include/exec/cpu-common.h +++ b/include/exec/cpu-common.h @@ -139,6 +139,14 @@ size_t qemu_ram_pagesize_largest(void); */ void cpu_address_space_init(CPUState *cpu, int asidx, const char *prefix, MemoryRegion *mr); +/** + * cpu_address_space_destroy: + * @cpu: CPU for which address space needs to be destroyed + * @asidx: integer index of this address space + * + * Note that with KVM only one address space is supported. 
+ */ +void cpu_address_space_destroy(CPUState *cpu, int asidx); void cpu_physical_memory_rw(hwaddr addr, void *buf, hwaddr len, bool is_write); diff --git a/include/exec/gdbstub.h b/include/exec/gdbstub.h index d8a3c56fa2..d123b838c2 100644 --- a/include/exec/gdbstub.h +++ b/include/exec/gdbstub.h @@ -39,6 +39,7 @@ typedef int (*gdb_set_reg_cb)(CPUArchState *env, uint8_t *buf, int reg); void gdb_register_coprocessor(CPUState *cpu, gdb_get_reg_cb get_reg, gdb_set_reg_cb set_reg, int num_regs, const char *xml, int g_pos); +void gdb_unregister_coprocessor_all(CPUState *cpu); /** * gdbserver_start: start the gdb server diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h index 0ca778eb75..6dbe163548 100644 --- a/include/hw/core/cpu.h +++ b/include/hw/core/cpu.h @@ -496,6 +496,7 @@ struct CPUState { QSIMPLEQ_HEAD(, qemu_work_item) work_list; CPUAddressSpace *cpu_ases; + int cpu_ases_ref_count; int num_ases; AddressSpace *as; MemoryRegion *memory; diff --git a/system/physmem.c b/system/physmem.c index 247c252e53..299174ad91 100644 --- a/system/physmem.c +++ b/system/physmem.c @@ -761,6 +761,7 @@ void cpu_address_space_init(CPUState *cpu, int asidx, if (!cpu->cpu_ases) { cpu->cpu_ases = g_new0(CPUAddressSpace, cpu->num_ases); + cpu->cpu_ases_ref_count = cpu->num_ases; } newas = &cpu->cpu_ases[asidx]; @@ -774,6 +775,30 @@ void cpu_address_space_init(CPUState *cpu, int asidx, } } +void cpu_address_space_destroy(CPUState *cpu, int asidx) +{ + CPUAddressSpace *cpuas; + + assert(asidx < cpu->num_ases); + assert(asidx == 0 || !kvm_enabled()); + assert(cpu->cpu_ases); + + cpuas = &cpu->cpu_ases[asidx]; + if (tcg_enabled()) { + memory_listener_unregister(&cpuas->tcg_as_listener); + } + + address_space_destroy(cpuas->as); + g_free_rcu(cpuas->as, rcu); + + if (cpu->cpu_ases_ref_count == 1) { + g_free(cpu->cpu_ases); + cpu->cpu_ases = NULL; + } + + cpu->cpu_ases_ref_count--; +} + AddressSpace *cpu_get_address_space(CPUState *cpu, int asidx) { /* Return the AddressSpace corresponding to the specified index */ -- Gitee From b311feda2078e7ee8f060531d4d061beccbc2f77 Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Sat, 9 May 2020 20:13:10 +0100 Subject: [PATCH 204/939] target/arm: Add support of *unrealize* ARMCPU during vCPU Hot-unplug vCPU Hot-unplug will result in QOM CPU object unrealization which will do away with all the vCPU thread creations, allocations, registrations that happened as part of the realization process. This change introduces the ARM CPU unrealize function taking care of exactly that. Note, initialized KVM vCPUs are not destroyed in host KVM but their Qemu context is parked at the QEMU KVM layer. 
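For context, a minimal sketch of the hot-unplug path that exercises this new unrealize leg (the helper name is illustrative; in the series the call is reached via the virt machine's unplug handler and the QOM teardown of the vCPU device):

    /* Sketch: tearing down a vCPU device ends up in arm_cpu_unrealizefn(). */
    static void example_unplug_vcpu(DeviceState *dev)
    {
        /* GIC wiring and ACPI hotplug state were torn down by the machine */
        qdev_unrealize(dev);        /* dc->unrealize == arm_cpu_unrealizefn */
        object_unref(OBJECT(dev));  /* drop the final QOM reference */
    }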
Co-developed-by: Salil Mehta Signed-off-by: Salil Mehta Co-developed-by: Keqian Zhu Signed-off-by: Keqian Zhu Reported-by: Vishnu Pajjuri [VP: Identified CPU stall issue & suggested probable fix] Signed-off-by: Salil Mehta --- target/arm/cpu.c | 101 +++++++++++++++++++++++++++++++++++++++++ target/arm/cpu.h | 14 ++++++ target/arm/gdbstub.c | 6 +++ target/arm/helper.c | 25 ++++++++++ target/arm/internals.h | 3 ++ target/arm/kvm64.c | 4 ++ 6 files changed, 153 insertions(+) diff --git a/target/arm/cpu.c b/target/arm/cpu.c index 18b8a79c8f..501f88eb2f 100644 --- a/target/arm/cpu.c +++ b/target/arm/cpu.c @@ -142,6 +142,16 @@ void arm_register_pre_el_change_hook(ARMCPU *cpu, ARMELChangeHookFn *hook, QLIST_INSERT_HEAD(&cpu->pre_el_change_hooks, entry, node); } +void arm_unregister_pre_el_change_hooks(ARMCPU *cpu) +{ + ARMELChangeHook *entry, *next; + + QLIST_FOREACH_SAFE(entry, &cpu->pre_el_change_hooks, node, next) { + QLIST_REMOVE(entry, node); + g_free(entry); + } +} + void arm_register_el_change_hook(ARMCPU *cpu, ARMELChangeHookFn *hook, void *opaque) { @@ -153,6 +163,16 @@ void arm_register_el_change_hook(ARMCPU *cpu, ARMELChangeHookFn *hook, QLIST_INSERT_HEAD(&cpu->el_change_hooks, entry, node); } +void arm_unregister_el_change_hooks(ARMCPU *cpu) +{ + ARMELChangeHook *entry, *next; + + QLIST_FOREACH_SAFE(entry, &cpu->el_change_hooks, node, next) { + QLIST_REMOVE(entry, node); + g_free(entry); + } +} + static void cp_reg_reset(gpointer key, gpointer value, gpointer opaque) { /* Reset a single ARMCPRegInfo register */ @@ -2390,6 +2410,85 @@ static void arm_cpu_realizefn(DeviceState *dev, Error **errp) acc->parent_realize(dev, errp); } +static void arm_cpu_unrealizefn(DeviceState *dev) +{ + ARMCPUClass *acc = ARM_CPU_GET_CLASS(dev); + ARMCPU *cpu = ARM_CPU(dev); + CPUARMState *env = &cpu->env; + CPUState *cs = CPU(dev); + bool has_secure; + + has_secure = cpu->has_el3 || arm_feature(env, ARM_FEATURE_M_SECURITY); + + /* rock 'n' un-roll, whatever happened in the arm_cpu_realizefn cleanly */ + cpu_address_space_destroy(cs, ARMASIdx_NS); + + if (cpu->tag_memory != NULL) { + cpu_address_space_destroy(cs, ARMASIdx_TagNS); + if (has_secure) { + cpu_address_space_destroy(cs, ARMASIdx_TagS); + } + } + + if (has_secure) { + cpu_address_space_destroy(cs, ARMASIdx_S); + } + + destroy_cpreg_list(cpu); + arm_cpu_unregister_gdb_regs(cpu); + unregister_cp_regs_for_features(cpu); + + if (cpu->sau_sregion && arm_feature(env, ARM_FEATURE_M_SECURITY)) { + g_free(env->sau.rbar); + g_free(env->sau.rlar); + } + + if (arm_feature(env, ARM_FEATURE_PMSA) && + arm_feature(env, ARM_FEATURE_V7) && + cpu->pmsav7_dregion) { + if (arm_feature(env, ARM_FEATURE_V8)) { + g_free(env->pmsav8.rbar[M_REG_NS]); + g_free(env->pmsav8.rlar[M_REG_NS]); + if (arm_feature(env, ARM_FEATURE_M_SECURITY)) { + g_free(env->pmsav8.rbar[M_REG_S]); + g_free(env->pmsav8.rlar[M_REG_S]); + } + } else { + g_free(env->pmsav7.drbar); + g_free(env->pmsav7.drsr); + g_free(env->pmsav7.dracr); + } + if (cpu->pmsav8r_hdregion) { + g_free(env->pmsav8.hprbar); + g_free(env->pmsav8.hprlar); + } + } + + if (arm_feature(env, ARM_FEATURE_PMU)) { + if (!kvm_enabled()) { + arm_unregister_pre_el_change_hooks(cpu); + arm_unregister_el_change_hooks(cpu); + } + +#ifndef CONFIG_USER_ONLY + if (cpu->pmu_timer) { + timer_del(cpu->pmu_timer); + } +#endif + } + + cpu_remove_sync(CPU(dev)); + acc->parent_unrealize(dev); + +#ifndef CONFIG_USER_ONLY + timer_del(cpu->gt_timer[GTIMER_PHYS]); + timer_del(cpu->gt_timer[GTIMER_VIRT]); + timer_del(cpu->gt_timer[GTIMER_HYP]); + 
timer_del(cpu->gt_timer[GTIMER_SEC]); + timer_del(cpu->gt_timer[GTIMER_HYPVIRT]); +#endif +} + static ObjectClass *arm_cpu_class_by_name(const char *cpu_model) { ObjectClass *oc; @@ -2492,6 +2591,8 @@ static void arm_cpu_class_init(ObjectClass *oc, void *data) device_class_set_parent_realize(dc, arm_cpu_realizefn, &acc->parent_realize); + device_class_set_parent_unrealize(dc, arm_cpu_unrealizefn, + &acc->parent_unrealize); device_class_set_props(dc, arm_cpu_properties); diff --git a/target/arm/cpu.h b/target/arm/cpu.h index 145d3dbf13..c51a0e3467 100644 --- a/target/arm/cpu.h +++ b/target/arm/cpu.h @@ -1138,6 +1138,7 @@ struct ARMCPUClass { const ARMCPUInfo *info; DeviceRealize parent_realize; + DeviceUnrealize parent_unrealize; ResettablePhases parent_phases; }; @@ -3359,6 +3360,13 @@ static inline AddressSpace *arm_addressspace(CPUState *cs, MemTxAttrs attrs) */ void arm_register_pre_el_change_hook(ARMCPU *cpu, ARMELChangeHookFn *hook, void *opaque); +/** + * arm_unregister_pre_el_change_hook: + * unregister all pre EL change hook functions. Generally called during + * unrealize'ing leg + */ +void arm_unregister_pre_el_change_hooks(ARMCPU *cpu); + /** * arm_register_el_change_hook: * Register a hook function which will be called immediately after this @@ -3371,6 +3379,12 @@ void arm_register_pre_el_change_hook(ARMCPU *cpu, ARMELChangeHookFn *hook, */ void arm_register_el_change_hook(ARMCPU *cpu, ARMELChangeHookFn *hook, void *opaque); +/** + * arm_unregister_el_change_hook: + * unregister all EL change hook functions. Generally called during + * unrealize'ing leg + */ +void arm_unregister_el_change_hooks(ARMCPU *cpu); /** * arm_rebuild_hflags: diff --git a/target/arm/gdbstub.c b/target/arm/gdbstub.c index 28f546a5ff..5ba1e28e34 100644 --- a/target/arm/gdbstub.c +++ b/target/arm/gdbstub.c @@ -553,3 +553,9 @@ void arm_cpu_register_gdb_regs_for_features(ARMCPU *cpu) } #endif /* CONFIG_TCG */ } + +void arm_cpu_unregister_gdb_regs(ARMCPU *cpu) +{ + CPUState *cs = CPU(cpu); + gdb_unregister_coprocessor_all(cs); +} diff --git a/target/arm/helper.c b/target/arm/helper.c index 2746d3fdac..e47498828c 100644 --- a/target/arm/helper.c +++ b/target/arm/helper.c @@ -263,6 +263,19 @@ void init_cpreg_list(ARMCPU *cpu) g_list_free(keys); } +void destroy_cpreg_list(ARMCPU *cpu) +{ + assert(cpu->cpreg_indexes); + assert(cpu->cpreg_values); + assert(cpu->cpreg_vmstate_indexes); + assert(cpu->cpreg_vmstate_values); + + g_free(cpu->cpreg_indexes); + g_free(cpu->cpreg_values); + g_free(cpu->cpreg_vmstate_indexes); + g_free(cpu->cpreg_vmstate_values); +} + /* * Some registers are not accessible from AArch32 EL3 if SCR.NS == 0. */ @@ -9438,6 +9451,18 @@ void register_cp_regs_for_features(ARMCPU *cpu) #endif } +void unregister_cp_regs_for_features(ARMCPU *cpu) +{ + CPUARMState *env = &cpu->env; + if (arm_feature(env, ARM_FEATURE_M)) { + /* M profile has no coprocessor registers */ + return; + } + + /* empty it all. unregister all the coprocessor registers */ + g_hash_table_remove_all(cpu->cp_regs); +} + /* Sort alphabetically by type name, except for "any". 
*/ static gint arm_cpu_list_compare(gconstpointer a, gconstpointer b) { diff --git a/target/arm/internals.h b/target/arm/internals.h index 143d57c0fe..c3a7682f05 100644 --- a/target/arm/internals.h +++ b/target/arm/internals.h @@ -187,9 +187,12 @@ void arm_cpu_register(const ARMCPUInfo *info); void aarch64_cpu_register(const ARMCPUInfo *info); void register_cp_regs_for_features(ARMCPU *cpu); +void unregister_cp_regs_for_features(ARMCPU *cpu); void init_cpreg_list(ARMCPU *cpu); +void destroy_cpreg_list(ARMCPU *cpu); void arm_cpu_register_gdb_regs_for_features(ARMCPU *cpu); +void arm_cpu_unregister_gdb_regs(ARMCPU *cpu); void arm_translate_init(void); void arm_restore_state_to_opc(CPUState *cs, diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c index 03ce1e7525..9c3a35d63a 100644 --- a/target/arm/kvm64.c +++ b/target/arm/kvm64.c @@ -647,6 +647,10 @@ int kvm_arch_init_vcpu(CPUState *cs) int kvm_arch_destroy_vcpu(CPUState *cs) { + if (cs->thread_id) { + qemu_del_vm_change_state_handler(cs->vmcse); + } + return 0; } -- Gitee From a079801cd3ae6484cad6826f20bcf4ecc7e97ead Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Wed, 5 May 2021 15:43:27 +0200 Subject: [PATCH 205/939] target/arm/kvm: Write CPU state back to KVM on reset When a KVM vCPU is reset following a PSCI CPU_ON call, its power state is not synchronized with KVM at the moment. Because the vCPU is not marked dirty, we miss the call to kvm_arch_put_registers() that writes to KVM's MP_STATE. Force mp_state synchronization. Signed-off-by: Jean-Philippe Brucker Signed-off-by: Salil Mehta --- target/arm/kvm.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/target/arm/kvm.c b/target/arm/kvm.c index 70cf15b550..aca652621f 100644 --- a/target/arm/kvm.c +++ b/target/arm/kvm.c @@ -636,11 +636,12 @@ void kvm_arm_cpu_post_load(ARMCPU *cpu) void kvm_arm_reset_vcpu(ARMCPU *cpu) { int ret; + CPUState *cs = CPU(cpu); /* Re-init VCPU so that all registers are set to * their respective reset values. */ - ret = kvm_arm_vcpu_init(CPU(cpu)); + ret = kvm_arm_vcpu_init(cs); if (ret < 0) { fprintf(stderr, "kvm_arm_vcpu_init failed: %s\n", strerror(-ret)); abort(); @@ -657,6 +658,12 @@ void kvm_arm_reset_vcpu(ARMCPU *cpu) * for the same reason we do so in kvm_arch_get_registers(). */ write_list_to_cpustate(cpu); + + /* + * Ensure we call kvm_arch_put_registers(). The vCPU isn't marked dirty if + * it was parked in KVM and is now booting from a PSCI CPU_ON call. 
+ */ + cs->vcpu_dirty = true; } void kvm_arm_create_host_vcpu(ARMCPU *cpu) -- Gitee From 9c4a7c44c3c9e89c6aeab85b00c72a09a0c13940 Mon Sep 17 00:00:00 2001 From: Author Salil Mehta Date: Sat, 27 May 2023 22:13:13 +0200 Subject: [PATCH 206/939] target/arm/kvm,tcg: Register/Handle SMCCC hypercall exits to VMM/Qemu Add registration and Handling of HVC/SMC hypercall exits to VMM Co-developed-by: Salil Mehta Signed-off-by: Salil Mehta Co-developed-by: Jean-Philippe Brucker Signed-off-by: Jean-Philippe Brucker Signed-off-by: Salil Mehta --- target/arm/arm-powerctl.c | 51 +++++++++++++++++++++++++++++------- target/arm/helper.c | 2 +- target/arm/internals.h | 11 -------- target/arm/kvm.c | 52 +++++++++++++++++++++++++++++++++++++ target/arm/kvm64.c | 46 +++++++++++++++++++++++++++++--- target/arm/kvm_arm.h | 13 ++++++++++ target/arm/meson.build | 1 + target/arm/{tcg => }/psci.c | 8 ++++++ target/arm/tcg/meson.build | 4 --- 9 files changed, 160 insertions(+), 28 deletions(-) rename target/arm/{tcg => }/psci.c (97%) diff --git a/target/arm/arm-powerctl.c b/target/arm/arm-powerctl.c index c078849403..fb19b04189 100644 --- a/target/arm/arm-powerctl.c +++ b/target/arm/arm-powerctl.c @@ -16,6 +16,7 @@ #include "qemu/log.h" #include "qemu/main-loop.h" #include "sysemu/tcg.h" +#include "hw/boards.h" #ifndef DEBUG_ARM_POWERCTL #define DEBUG_ARM_POWERCTL 0 @@ -28,18 +29,37 @@ } \ } while (0) +static CPUArchId *arm_get_archid_by_id(uint64_t id) +{ + int n; + CPUArchId *arch_id; + MachineState *ms = MACHINE(qdev_get_machine()); + + /* + * At this point disabled CPUs don't have a CPUState, but their CPUArchId + * exists. + * + * TODO: Is arch_id == mp_affinity? This needs work. + */ + for (n = 0; n < ms->possible_cpus->len; n++) { + arch_id = &ms->possible_cpus->cpus[n]; + + if (arch_id->arch_id == id) { + return arch_id; + } + } + return NULL; +} + CPUState *arm_get_cpu_by_id(uint64_t id) { - CPUState *cpu; + CPUArchId *arch_id; DPRINTF("cpu %" PRId64 "\n", id); - CPU_FOREACH(cpu) { - ARMCPU *armcpu = ARM_CPU(cpu); - - if (armcpu->mp_affinity == id) { - return cpu; - } + arch_id = arm_get_archid_by_id(id); + if (arch_id && arch_id->cpu) { + return CPU(arch_id->cpu); } qemu_log_mask(LOG_GUEST_ERROR, @@ -97,6 +117,7 @@ int arm_set_cpu_on(uint64_t cpuid, uint64_t entry, uint64_t context_id, { CPUState *target_cpu_state; ARMCPU *target_cpu; + CPUArchId *arch_id; struct CpuOnInfo *info; assert(qemu_mutex_iothread_locked()); @@ -117,12 +138,24 @@ int arm_set_cpu_on(uint64_t cpuid, uint64_t entry, uint64_t context_id, } /* Retrieve the cpu we are powering up */ - target_cpu_state = arm_get_cpu_by_id(cpuid); - if (!target_cpu_state) { + arch_id = arm_get_archid_by_id(cpuid); + if (!arch_id) { /* The cpu was not found */ return QEMU_ARM_POWERCTL_INVALID_PARAM; } + target_cpu_state = CPU(arch_id->cpu); + if (!qemu_enabled_cpu(target_cpu_state)) { + /* + * The cpu is not plugged in or disabled. 
We should return appropriate + * value as introduced in DEN0022E PSCI 1.2 issue E + */ + qemu_log_mask(LOG_GUEST_ERROR, + "[ARM]%s: Denying attempt to online removed/disabled " + "CPU%" PRId64"\n", __func__, cpuid); + return QEMU_ARM_POWERCTL_IS_OFF; + } + target_cpu = ARM_CPU(target_cpu_state); if (target_cpu->power_state == PSCI_ON) { qemu_log_mask(LOG_GUEST_ERROR, diff --git a/target/arm/helper.c b/target/arm/helper.c index e47498828c..793aa89cc6 100644 --- a/target/arm/helper.c +++ b/target/arm/helper.c @@ -11346,7 +11346,7 @@ void arm_cpu_do_interrupt(CPUState *cs) env->exception.syndrome); } - if (tcg_enabled() && arm_is_psci_call(cpu, cs->exception_index)) { + if (arm_is_psci_call(cpu, cs->exception_index)) { arm_handle_psci_call(cpu); qemu_log_mask(CPU_LOG_INT, "...handled as PSCI call\n"); return; diff --git a/target/arm/internals.h b/target/arm/internals.h index c3a7682f05..20b9c1da38 100644 --- a/target/arm/internals.h +++ b/target/arm/internals.h @@ -314,21 +314,10 @@ vaddr arm_adjust_watchpoint_address(CPUState *cs, vaddr addr, int len); /* Callback function for when a watchpoint or breakpoint triggers. */ void arm_debug_excp_handler(CPUState *cs); -#if defined(CONFIG_USER_ONLY) || !defined(CONFIG_TCG) -static inline bool arm_is_psci_call(ARMCPU *cpu, int excp_type) -{ - return false; -} -static inline void arm_handle_psci_call(ARMCPU *cpu) -{ - g_assert_not_reached(); -} -#else /* Return true if the r0/x0 value indicates that this SMC/HVC is a PSCI call. */ bool arm_is_psci_call(ARMCPU *cpu, int excp_type); /* Actually handle a PSCI call */ void arm_handle_psci_call(ARMCPU *cpu); -#endif /** * arm_clear_exclusive: clear the exclusive monitor diff --git a/target/arm/kvm.c b/target/arm/kvm.c index aca652621f..66caf9e5e7 100644 --- a/target/arm/kvm.c +++ b/target/arm/kvm.c @@ -260,6 +260,7 @@ int kvm_arch_get_default_type(MachineState *ms) int kvm_arch_init(MachineState *ms, KVMState *s) { int ret = 0; + /* For ARM interrupt delivery is always asynchronous, * whether we are using an in-kernel VGIC or not. */ @@ -310,6 +311,22 @@ int kvm_arch_init(MachineState *ms, KVMState *s) } } + /* + * To be able to handle PSCI CPU ON calls in QEMU, we need to install SMCCC + * filter in the Host KVM. This is required to support features like + * virtual CPU Hotplug on ARM platforms. + */ + if (kvm_arm_set_smccc_filter(PSCI_0_2_FN64_CPU_ON, + KVM_SMCCC_FILTER_FWD_TO_USER)) { + error_report("CPU On PSCI-to-user-space fwd filter install failed"); + abort(); + } + if (kvm_arm_set_smccc_filter(PSCI_0_2_FN_CPU_OFF, + KVM_SMCCC_FILTER_FWD_TO_USER)) { + error_report("CPU Off PSCI-to-user-space fwd filter install failed"); + abort(); + } + kvm_arm_init_debug(s); return ret; @@ -966,6 +983,38 @@ static int kvm_arm_handle_dabt_nisv(CPUState *cs, uint64_t esr_iss, return -1; } +static int kvm_arm_handle_hypercall(CPUState *cs, struct kvm_run *run) +{ + ARMCPU *cpu = ARM_CPU(cs); + CPUARMState *env = &cpu->env; + + kvm_cpu_synchronize_state(cs); + + /* + * hard coding immediate to 0 as we dont expect non-zero value as of now + * This might change in future versions. Hence, KVM_GET_ONE_REG could be + * used in such cases but it must be enhanced then only synchronize will + * also fetch ESR_EL2 value. 
+ */ + if (run->hypercall.flags == KVM_HYPERCALL_EXIT_SMC) { + cs->exception_index = EXCP_SMC; + env->exception.syndrome = syn_aa64_smc(0); + } else { + cs->exception_index = EXCP_HVC; + env->exception.syndrome = syn_aa64_hvc(0); + } + env->exception.target_el = 1; + qemu_mutex_lock_iothread(); + arm_cpu_do_interrupt(cs); + qemu_mutex_unlock_iothread(); + + /* + * For PSCI, exit the kvm_run loop and process the work. Especially + * important if this was a CPU_OFF command and we can't return to the guest. + */ + return EXCP_INTERRUPT; +} + int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) { int ret = 0; @@ -981,6 +1030,9 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) ret = kvm_arm_handle_dabt_nisv(cs, run->arm_nisv.esr_iss, run->arm_nisv.fault_ipa); break; + case KVM_EXIT_HYPERCALL: + ret = kvm_arm_handle_hypercall(cs, run); + break; default: qemu_log_mask(LOG_UNIMP, "%s: un-handled exit reason %d\n", __func__, run->exit_reason); diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c index 9c3a35d63a..00b257bb4b 100644 --- a/target/arm/kvm64.c +++ b/target/arm/kvm64.c @@ -111,6 +111,25 @@ bool kvm_arm_hw_debug_active(CPUState *cs) return ((cur_hw_wps > 0) || (cur_hw_bps > 0)); } +static bool kvm_arm_set_vm_attr(struct kvm_device_attr *attr, const char *name) +{ + int err; + + err = kvm_vm_ioctl(kvm_state, KVM_HAS_DEVICE_ATTR, attr); + if (err != 0) { + error_report("%s: KVM_HAS_DEVICE_ATTR: %s", name, strerror(-err)); + return false; + } + + err = kvm_vm_ioctl(kvm_state, KVM_SET_DEVICE_ATTR, attr); + if (err != 0) { + error_report("%s: KVM_SET_DEVICE_ATTR: %s", name, strerror(-err)); + return false; + } + + return true; +} + static bool kvm_arm_set_device_attr(CPUState *cs, struct kvm_device_attr *attr, const char *name) { @@ -181,6 +200,28 @@ void kvm_arm_pvtime_init(CPUState *cs, uint64_t ipa) } } +int kvm_arm_set_smccc_filter(uint64_t func, uint8_t faction) +{ + struct kvm_smccc_filter filter = { + .base = func, + .nr_functions = 1, + .action = faction, + }; + struct kvm_device_attr attr = { + .group = KVM_ARM_VM_SMCCC_CTRL, + .attr = KVM_ARM_VM_SMCCC_FILTER, + .flags = 0, + .addr = (uintptr_t)&filter, + }; + + if (!kvm_arm_set_vm_attr(&attr, "SMCCC Filter")) { + error_report("failed to set SMCCC filter in KVM Host"); + return -1; + } + + return 0; +} + static int read_sys_reg32(int fd, uint32_t *pret, uint64_t id) { uint64_t ret; @@ -629,9 +670,8 @@ int kvm_arch_init_vcpu(CPUState *cs) } /* - * When KVM is in use, PSCI is emulated in-kernel and not by qemu. - * Currently KVM has its own idea about MPIDR assignment, so we - * override our defaults with what we get from KVM. + * KVM may emulate PSCI in-kernel. Currently KVM has its own idea about + * MPIDR assignment, so we override our defaults with what we get from KVM. 
*/ ret = kvm_get_one_reg(cs, ARM64_SYS_REG(ARM_CPU_ID_MPIDR), &mpidr); if (ret) { diff --git a/target/arm/kvm_arm.h b/target/arm/kvm_arm.h index 31408499b3..bf4df54c96 100644 --- a/target/arm/kvm_arm.h +++ b/target/arm/kvm_arm.h @@ -388,6 +388,15 @@ void kvm_arm_pvtime_init(CPUState *cs, uint64_t ipa); int kvm_arm_set_irq(int cpu, int irqtype, int irq, int level); +/** + * kvm_arm_set_smccc_filter + * @func: funcion + * @faction: SMCCC filter action(handle, deny, fwd-to-user) to be deployed + * + * Sets the ARMs SMC-CC filter in KVM Host for selective hypercall exits + */ +int kvm_arm_set_smccc_filter(uint64_t func, uint8_t faction); + #else /* @@ -462,6 +471,10 @@ static inline uint32_t kvm_arm_sve_get_vls(CPUState *cs) g_assert_not_reached(); } +static inline int kvm_arm_set_smccc_filter(uint64_t func, uint8_t faction) +{ + g_assert_not_reached(); +} #endif /** diff --git a/target/arm/meson.build b/target/arm/meson.build index 5d04a8e94f..d1dd4932ed 100644 --- a/target/arm/meson.build +++ b/target/arm/meson.build @@ -23,6 +23,7 @@ arm_system_ss.add(files( 'arm-qmp-cmds.c', 'cortex-regs.c', 'machine.c', + 'psci.c', 'ptw.c', )) diff --git a/target/arm/tcg/psci.c b/target/arm/psci.c similarity index 97% rename from target/arm/tcg/psci.c rename to target/arm/psci.c index 6c1239bb96..a8690a16af 100644 --- a/target/arm/tcg/psci.c +++ b/target/arm/psci.c @@ -21,7 +21,9 @@ #include "exec/helper-proto.h" #include "kvm-consts.h" #include "qemu/main-loop.h" +#include "qemu/error-report.h" #include "sysemu/runstate.h" +#include "sysemu/tcg.h" #include "internals.h" #include "arm-powerctl.h" @@ -157,6 +159,11 @@ void arm_handle_psci_call(ARMCPU *cpu) case QEMU_PSCI_0_1_FN_CPU_SUSPEND: case QEMU_PSCI_0_2_FN_CPU_SUSPEND: case QEMU_PSCI_0_2_FN64_CPU_SUSPEND: + if (!tcg_enabled()) { + warn_report("CPU suspend not supported in non-tcg mode"); + break; + } +#ifdef CONFIG_TCG /* Affinity levels are not supported in QEMU */ if (param[1] & 0xfffe0000) { ret = QEMU_PSCI_RET_INVALID_PARAMS; @@ -169,6 +176,7 @@ void arm_handle_psci_call(ARMCPU *cpu) env->regs[0] = 0; } helper_wfi(env, 4); +#endif break; case QEMU_PSCI_1_0_FN_PSCI_FEATURES: switch (param[1]) { diff --git a/target/arm/tcg/meson.build b/target/arm/tcg/meson.build index 6fca38f2cc..ad3cfcb3bd 100644 --- a/target/arm/tcg/meson.build +++ b/target/arm/tcg/meson.build @@ -51,7 +51,3 @@ arm_ss.add(when: 'TARGET_AARCH64', if_true: files( 'sme_helper.c', 'sve_helper.c', )) - -arm_system_ss.add(files( - 'psci.c', -)) -- Gitee From c5dfec0bfd78f7e8f84a527a1aa73896f69b2367 Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Thu, 10 Aug 2023 01:15:31 +0000 Subject: [PATCH 207/939] hw/arm: Support hotplug capability check using _OSC method Physical CPU hotplug results in (un)setting of ACPI _STA.Present bit. AARCH64 platforms do not support physical CPU hotplug. Virtual CPU hotplug support being implemented toggles ACPI _STA.Enabled Bit to achieve hotplug functionality. This is not same as physical CPU hotplug support. In future, if ARM architecture supports physical CPU hotplug then the current design of virtual CPU hotplug can be used unchanged. Hence, there is a need for firmware/VMM/Qemu to support evaluation of platform wide capabilitiy related to the *type* of CPU hotplug support present on the platform. OSPM might need this during boot time to correctly initialize the CPUs and other related components in the kernel. NOTE: This implementation will be improved to add the support of *query* in the subsequent versions. 
This is very minimal support to assist kernel. ASL for the implemented _OSC method: Method (_OSC, 4, NotSerialized) // _OSC: Operating System Capabilities { CreateDWordField (Arg3, Zero, CDW1) If ((Arg0 == ToUUID ("0811b06e-4a27-44f9-8d60-3cbbc22e7b48") /* Platform-wide Capabilities */)) { CreateDWordField (Arg3, 0x04, CDW2) Local0 = CDW2 /* \_SB_._OSC.CDW2 */ If ((Arg1 != One)) { CDW1 |= 0x08 } Local0 &= 0x00800000 If ((CDW2 != Local0)) { CDW1 |= 0x10 } CDW2 = Local0 } Else { CDW1 |= 0x04 } Return (Arg3) } Signed-off-by: Salil Mehta --- hw/arm/virt-acpi-build.c | 52 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c index 2870c1ec5a..c402e102c4 100644 --- a/hw/arm/virt-acpi-build.c +++ b/hw/arm/virt-acpi-build.c @@ -940,6 +940,55 @@ static void build_fadt_rev6(GArray *table_data, BIOSLinker *linker, build_fadt(table_data, linker, &fadt, vms->oem_id, vms->oem_table_id); } +static void build_virt_osc_method(Aml *scope, VirtMachineState *vms) +{ + Aml *if_uuid, *else_uuid, *if_rev, *if_caps_masked, *method; + Aml *a_cdw1 = aml_name("CDW1"); + Aml *a_cdw2 = aml_local(0); + + method = aml_method("_OSC", 4, AML_NOTSERIALIZED); + aml_append(method, aml_create_dword_field(aml_arg(3), aml_int(0), "CDW1")); + + /* match UUID */ + if_uuid = aml_if(aml_equal( + aml_arg(0), aml_touuid("0811B06E-4A27-44F9-8D60-3CBBC22E7B48"))); + + aml_append(if_uuid, aml_create_dword_field(aml_arg(3), aml_int(4), "CDW2")); + aml_append(if_uuid, aml_store(aml_name("CDW2"), a_cdw2)); + + /* check unknown revision in arg(1) */ + if_rev = aml_if(aml_lnot(aml_equal(aml_arg(1), aml_int(1)))); + /* set revision error bits, DWORD1 Bit[3] */ + aml_append(if_rev, aml_or(a_cdw1, aml_int(0x08), a_cdw1)); + aml_append(if_uuid, if_rev); + + /* + * check support for vCPU hotplug type(=enabled) platform-wide capability + * in DWORD2 as sepcified in the below ACPI Specification ECR, + * # https://bugzilla.tianocore.org/show_bug.cgi?id=4481 + */ + if (vms->acpi_dev) { + aml_append(if_uuid, aml_and(a_cdw2, aml_int(0x800000), a_cdw2)); + /* check if OSPM specified hotplug capability bits were masked */ + if_caps_masked = aml_if(aml_lnot(aml_equal(aml_name("CDW2"), a_cdw2))); + aml_append(if_caps_masked, aml_or(a_cdw1, aml_int(0x10), a_cdw1)); + aml_append(if_uuid, if_caps_masked); + } + aml_append(if_uuid, aml_store(a_cdw2, aml_name("CDW2"))); + + aml_append(method, if_uuid); + else_uuid = aml_else(); + + /* set unrecognized UUID error bits, DWORD1 Bit[2] */ + aml_append(else_uuid, aml_or(a_cdw1, aml_int(4), a_cdw1)); + aml_append(method, else_uuid); + + aml_append(method, aml_return(aml_arg(3))); + aml_append(scope, method); + + return; +} + /* DSDT */ static void build_dsdt(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) @@ -974,6 +1023,9 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) } else { acpi_dsdt_add_cpus(scope, vms); } + + build_virt_osc_method(scope, vms); + acpi_dsdt_add_uart(scope, &memmap[VIRT_UART], (irqmap[VIRT_UART] + ARM_SPI_BASE)); if (vmc->acpi_expose_flash) { -- Gitee From f797e2713a94b48de59324d00c851d89f4438fc0 Mon Sep 17 00:00:00 2001 From: Miguel Luis Date: Fri, 3 Feb 2023 12:33:41 -0100 Subject: [PATCH 208/939] tcg/mttcg: enable threads to unregister in tcg_ctxs[] [This patch is just for reference. 
It has problems as it does not takes care of the TranslationBlocks and their assigned regions during CPU unrealize] When using TCG acceleration in a multi-threaded context each vCPU has its own thread registered in tcg_ctxs[] upon creation and tcg_cur_ctxs stores the current number of threads that got created. Although, the lack of a mechanism to unregister these threads is a problem when exercising vCPU hotplug/unplug due to the fact that tcg_cur_ctxs gets incremented everytime a vCPU gets hotplugged but never gets decremented everytime a vCPU gets unplugged, therefore breaking the assert stating tcg_cur_ctxs < tcg_max_ctxs after a certain amount of vCPU hotplugs. Suggested-by: Salil Mehta [SM: Check Things To Do Section, https://lore.kernel.org/all/20200613213629.21984-1-salil.mehta@huawei.com/] Signed-off-by: Miguel Luis --- accel/tcg/tcg-accel-ops-mttcg.c | 1 + include/tcg/startup.h | 5 +++++ tcg/tcg.c | 23 +++++++++++++++++++++++ 3 files changed, 29 insertions(+) diff --git a/accel/tcg/tcg-accel-ops-mttcg.c b/accel/tcg/tcg-accel-ops-mttcg.c index fac80095bb..73866990ce 100644 --- a/accel/tcg/tcg-accel-ops-mttcg.c +++ b/accel/tcg/tcg-accel-ops-mttcg.c @@ -122,6 +122,7 @@ static void *mttcg_cpu_thread_fn(void *arg) qemu_mutex_unlock_iothread(); rcu_remove_force_rcu_notifier(&force_rcu.notifier); rcu_unregister_thread(); + tcg_unregister_thread(); return NULL; } diff --git a/include/tcg/startup.h b/include/tcg/startup.h index f71305765c..c6cb1d92a7 100644 --- a/include/tcg/startup.h +++ b/include/tcg/startup.h @@ -45,6 +45,11 @@ void tcg_init(size_t tb_size, int splitwx, unsigned max_cpus); */ void tcg_register_thread(void); +/** + * tcg_register_thread: Unregister this thread with the TCG runtime + */ +void tcg_unregister_thread(void); + /** * tcg_prologue_init(): Generate the code for the TCG prologue * diff --git a/tcg/tcg.c b/tcg/tcg.c index 896a36caeb..61fcf8597d 100644 --- a/tcg/tcg.c +++ b/tcg/tcg.c @@ -764,6 +764,14 @@ static void alloc_tcg_plugin_context(TCGContext *s) #endif } +static void free_tcg_plugin_context(TCGContext *s) +{ +#ifdef CONFIG_PLUGIN + g_ptr_array_unref(s->plugin_tb->insns); + g_free(s->plugin_tb); +#endif +} + /* * All TCG threads except the parent (i.e. the one that called tcg_context_init * and registered the target's TCG globals) must register with this function @@ -814,6 +822,21 @@ void tcg_register_thread(void) tcg_ctx = s; } + +void tcg_unregister_thread(void) +{ + TCGContext *s = tcg_ctx; + unsigned int n; + + /* Unclaim an entry in tcg_ctxs */ + n = qatomic_fetch_dec(&tcg_cur_ctxs); + g_assert(n > 1); + qatomic_store_release(&tcg_ctxs[n - 1], 0); + + free_tcg_plugin_context(s); + + g_free(s); +} #endif /* !CONFIG_USER_ONLY */ /* pool based memory allocation */ -- Gitee From 837b04877be49b930a2d437f55e2ae15ff820421 Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Sat, 23 Sep 2023 22:31:49 +0000 Subject: [PATCH 209/939] hw/arm/virt: Expose cold-booted CPUs as MADT GICC Enabled Hotpluggable CPUs MUST be exposed as 'online-capable' as per the new change. But cold booted CPUs if made 'online-capable' during boot time might not get detected in the legacy OS. Hence, can cause compatibility problems. Original Change Link: https://bugzilla.tianocore.org/show_bug.cgi?id=3706 Specification change might take time and hence disabling the support of unplugging any cold booted CPUs to preserve the compatibility with legacy OS. 
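For reference, the two MADT GICC interface flag bits being weighed here are bit 0 ('Enabled') and bit 3 ('Online Capable') of the GICC flags field in ACPI 6.5. A minimal sketch of the resulting policy follows; the enum names and helper are illustrative only and do not appear in this series:

    /* Illustrative sketch, not code from this patch. */
    enum {
        ACPI_GICC_ENABLED        = 1 << 0, /* present and enabled at boot       */
        ACPI_GICC_ONLINE_CAPABLE = 1 << 3, /* may be enabled (hotplugged) later */
    };

    static uint32_t gicc_flags_sketch(bool cold_booted)
    {
        /* Cold-booted vCPUs stay 'Enabled' for legacy OS compatibility; only
         * hotpluggable vCPUs are reported as 'online-capable'. */
        return cold_booted ? ACPI_GICC_ENABLED : ACPI_GICC_ONLINE_CAPABLE;
    }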
Signed-off-by: Salil Mehta --- hw/arm/virt-acpi-build.c | 19 ++++++++++++++----- hw/arm/virt.c | 16 ++++++++++++++++ include/hw/core/cpu.h | 2 ++ 3 files changed, 32 insertions(+), 5 deletions(-) diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c index c402e102c4..590afcfa98 100644 --- a/hw/arm/virt-acpi-build.c +++ b/hw/arm/virt-acpi-build.c @@ -789,17 +789,26 @@ static uint32_t virt_acpi_get_gicc_flags(CPUState *cpu) } /* - * ARM GIC CPU Interface can be 'online-capable' or 'enabled' at boot - * We MUST set 'online-capable' Bit for all hotpluggable CPUs except the - * first/boot CPU. Cold-booted CPUs without 'Id' can also be unplugged. - * Though as-of-now this is only used as a debugging feature. + * ARM GIC CPU Interface can be 'online-capable' or 'enabled' at boot. We + * MUST set 'online-capable' bit for all hotpluggable CPUs. + * Change Link: https://bugzilla.tianocore.org/show_bug.cgi?id=3706 * * UEFI ACPI Specification 6.5 * Section: 5.2.12.14. GIC CPU Interface (GICC) Structure * Table: 5.37 GICC CPU Interface Flags * Link: https://uefi.org/specs/ACPI/6.5 + * + * Cold-booted CPUs, except for the first/boot CPU, SHOULD be allowed to be + * hot(un)plug as well but for this to happen these MUST have + * 'online-capable' bit set. Later creates compatibility problem with legacy + * OS as it might ignore online-capable' bits during boot time and hence + * some CPUs might not get detected. To fix this MADT GIC CPU interface flag + * should be allowed to have both bits set i.e. 'online-capable' and + * 'Enabled' bits together. This change will require UEFI ACPI standard + * change. Till this happens exposing all cold-booted CPUs as 'enabled' only + * */ - return cpu && !cpu->cpu_index ? 1 : (1 << 3); + return cpu && cpu->cold_booted ? 1 : (1 << 3); } static void diff --git a/hw/arm/virt.c b/hw/arm/virt.c index eedff8e525..ed437ce0e8 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -3250,6 +3250,10 @@ static void virt_cpu_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, * This shall be used during the init of ACPI Hotplug state and hot-unplug */ cs->acpi_persistent = true; + + if (!dev->hotplugged) { + cs->cold_booted = true; + } } static void virt_cpu_plug(HotplugHandler *hotplug_dev, DeviceState *dev, @@ -3313,6 +3317,18 @@ static void virt_cpu_unplug_request(HotplugHandler *hotplug_dev, return; } + /* + * UEFI ACPI standard change is required to make both 'enabled' and the + * 'online-capable' bit co-exist instead of being mutually exclusive. + * check virt_acpi_get_gicc_flags() for more details. + * + * Disable the unplugging of cold-booted vCPUs as a temporary mitigation. 
+ */ + if (cs->cold_booted) { + error_setg(errp, "Hot-unplug of cold-booted CPU not supported!"); + return; + } + if (cs->cpu_index == first_cpu->cpu_index) { error_setg(errp, "Boot CPU(id%d=%d:%d:%d:%d) hot-unplug not supported", first_cpu->cpu_index, cpu->socket_id, cpu->cluster_id, diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h index 6dbe163548..ee04ee44c2 100644 --- a/include/hw/core/cpu.h +++ b/include/hw/core/cpu.h @@ -565,6 +565,8 @@ struct CPUState { uint32_t halted; int32_t exception_index; + bool cold_booted; + AccelCPUState *accel; /* shared by kvm and hvf */ bool vcpu_dirty; -- Gitee From 5f7464524d0fb2c25c9bacfb550df92bef9bb3bf Mon Sep 17 00:00:00 2001 From: Keqian Zhu Date: Tue, 26 Mar 2024 14:11:05 +0800 Subject: [PATCH 210/939] system/physmem: Fix possible double free when destroy cpu as address_space_destroy() and g_free_rcu() both operate cpuas->as at rcu thread context asynchronously, each one is a rcu task that have different callback (the first callback is do_address_ space_destroy() and the second callback is g_free()). It's possible that while the first task is pending and the second task overwrites the rcu callback (as the second task operates on the same object). Then the g_free will be called twice on cpuas->as. Signed-off-by: Keqian Zhu --- include/exec/memory.h | 1 + system/memory.c | 3 +++ system/physmem.c | 2 +- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/include/exec/memory.h b/include/exec/memory.h index e131c2682c..91c42c9a6a 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -1114,6 +1114,7 @@ struct AddressSpace { struct rcu_head rcu; char *name; MemoryRegion *root; + bool free_in_rcu; /* Accessed via RCU. */ struct FlatView *current_map; diff --git a/system/memory.c b/system/memory.c index 798b6c0a17..fb817e54bc 100644 --- a/system/memory.c +++ b/system/memory.c @@ -3130,6 +3130,9 @@ static void do_address_space_destroy(AddressSpace *as) g_free(as->name); g_free(as->ioeventfds); memory_region_unref(as->root); + if (as->free_in_rcu) { + g_free(as); + } } void address_space_destroy(AddressSpace *as) diff --git a/system/physmem.c b/system/physmem.c index 299174ad91..cbe838f203 100644 --- a/system/physmem.c +++ b/system/physmem.c @@ -788,8 +788,8 @@ void cpu_address_space_destroy(CPUState *cpu, int asidx) memory_listener_unregister(&cpuas->tcg_as_listener); } + cpuas->as->free_in_rcu = true; address_space_destroy(cpuas->as); - g_free_rcu(cpuas->as, rcu); if (cpu->cpu_ases_ref_count == 1) { g_free(cpu->cpu_ases); -- Gitee From b394996c99c0af0de870a5d79fff69f01d504b0c Mon Sep 17 00:00:00 2001 From: Keqian Zhu Date: Tue, 26 Mar 2024 14:47:07 +0800 Subject: [PATCH 211/939] arm/cpu: Some fixes for arm_cpu_unrealizefn() Some minor fixes for arm_cpu_unrealizefn(). 
Signed-off-by: Keqian Zhu --- target/arm/cpu.c | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/target/arm/cpu.c b/target/arm/cpu.c index 501f88eb2f..9dd61c10ea 100644 --- a/target/arm/cpu.c +++ b/target/arm/cpu.c @@ -2418,6 +2418,7 @@ static void arm_cpu_unrealizefn(DeviceState *dev) CPUState *cs = CPU(dev); bool has_secure; +#ifndef CONFIG_USER_ONLY has_secure = cpu->has_el3 || arm_feature(env, ARM_FEATURE_M_SECURITY); /* rock 'n' un-roll, whatever happened in the arm_cpu_realizefn cleanly */ @@ -2433,30 +2434,38 @@ static void arm_cpu_unrealizefn(DeviceState *dev) if (has_secure) { cpu_address_space_destroy(cs, ARMASIdx_S); } +#endif destroy_cpreg_list(cpu); arm_cpu_unregister_gdb_regs(cpu); unregister_cp_regs_for_features(cpu); +#ifndef CONFIG_USER_ONLY + if (tcg_enabled() && cpu_isar_feature(aa64_rme, cpu)) { + arm_unregister_el_change_hooks(cpu); + } +#endif + if (cpu->sau_sregion && arm_feature(env, ARM_FEATURE_M_SECURITY)) { g_free(env->sau.rbar); g_free(env->sau.rlar); } if (arm_feature(env, ARM_FEATURE_PMSA) && - arm_feature(env, ARM_FEATURE_V7) && - cpu->pmsav7_dregion) { - if (arm_feature(env, ARM_FEATURE_V8)) { - g_free(env->pmsav8.rbar[M_REG_NS]); - g_free(env->pmsav8.rlar[M_REG_NS]); - if (arm_feature(env, ARM_FEATURE_M_SECURITY)) { - g_free(env->pmsav8.rbar[M_REG_S]); - g_free(env->pmsav8.rlar[M_REG_S]); + arm_feature(env, ARM_FEATURE_V7)) { + if (cpu->pmsav7_dregion) { + if (arm_feature(env, ARM_FEATURE_V8)) { + g_free(env->pmsav8.rbar[M_REG_NS]); + g_free(env->pmsav8.rlar[M_REG_NS]); + if (arm_feature(env, ARM_FEATURE_M_SECURITY)) { + g_free(env->pmsav8.rbar[M_REG_S]); + g_free(env->pmsav8.rlar[M_REG_S]); + } + } else { + g_free(env->pmsav7.drbar); + g_free(env->pmsav7.drsr); + g_free(env->pmsav7.dracr); } - } else { - g_free(env->pmsav7.drbar); - g_free(env->pmsav7.drsr); - g_free(env->pmsav7.dracr); } if (cpu->pmsav8r_hdregion) { g_free(env->pmsav8.hprbar); -- Gitee From 14c4062c4acc7d417d163276b65e59073ba18eeb Mon Sep 17 00:00:00 2001 From: Keqian Zhu Date: Tue, 26 Mar 2024 14:51:18 +0800 Subject: [PATCH 212/939] acpi/cpu: Fix cpu_hotplug_hw_init() For the present but disabled vCPUs, they will be released after cpu_hotplug_hw_init(), we should not assign it to AcpiCpuStatus. Signed-off-by: Keqian Zhu --- hw/acpi/cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/acpi/cpu.c b/hw/acpi/cpu.c index c922c380aa..b258396e01 100644 --- a/hw/acpi/cpu.c +++ b/hw/acpi/cpu.c @@ -229,7 +229,6 @@ void cpu_hotplug_hw_init(MemoryRegion *as, Object *owner, for (i = 0; i < id_list->len; i++) { struct CPUState *cpu = CPU(id_list->cpus[i].cpu); if (qemu_present_cpu(cpu)) { - state->devs[i].cpu = cpu; state->devs[i].is_present = true; } else { if (qemu_persistent_cpu(cpu)) { @@ -240,6 +239,7 @@ void cpu_hotplug_hw_init(MemoryRegion *as, Object *owner, } if (qemu_enabled_cpu(cpu)) { + state->devs[i].cpu = cpu; state->devs[i].is_enabled = true; } else { state->devs[i].is_enabled = false; -- Gitee From 401e145800134d0310d613f48c4962a108b8ddda Mon Sep 17 00:00:00 2001 From: Keqian Zhu Date: Sun, 17 Mar 2024 16:37:03 +0800 Subject: [PATCH 213/939] system/cpus: Fix pause_all_vcpus() under concurrent environment Both main loop thread and vCPU thread are allowed to call pause_all_vcpus(), and in general resume_all_vcpus() is called after it. Two issues live in pause_all_vcpus(): 1. 
There is possibility that during thread T1 waits on qemu_pause_cond with bql unlocked, other thread has called pause_all_vcpus() and resume_all_vcpus(), then thread T1 will stuck, because the condition all_vcpus_paused() is always false. 2. After all_vcpus_paused() has been checked as true, we will unlock bql to relock replay_mutex. During the bql was unlocked, the vcpu's state may has been changed by other thread, so we must retry. Signed-off-by: Keqian Zhu --- system/cpus.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/system/cpus.c b/system/cpus.c index a444a747f0..7c5369fa9c 100644 --- a/system/cpus.c +++ b/system/cpus.c @@ -551,12 +551,14 @@ static bool all_vcpus_paused(void) return true; } -void pause_all_vcpus(void) +static void request_pause_all_vcpus(void) { CPUState *cpu; - qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false); CPU_FOREACH(cpu) { + if (cpu->stopped) { + continue; + } if (qemu_cpu_is_self(cpu)) { qemu_cpu_stop(cpu, true); } else { @@ -564,6 +566,14 @@ void pause_all_vcpus(void) qemu_cpu_kick(cpu); } } +} + +void pause_all_vcpus(void) +{ + qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false); + +retry: + request_pause_all_vcpus(); /* We need to drop the replay_lock so any vCPU threads woken up * can finish their replay tasks @@ -572,14 +582,23 @@ void pause_all_vcpus(void) while (!all_vcpus_paused()) { qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex); - CPU_FOREACH(cpu) { - qemu_cpu_kick(cpu); - } + /* During we waited on qemu_pause_cond the bql was unlocked, + * the vcpu's state may has been changed by other thread, so + * we must request the pause state on all vcpus again. + */ + request_pause_all_vcpus(); } qemu_mutex_unlock_iothread(); replay_mutex_lock(); qemu_mutex_lock_iothread(); + + /* During the bql was unlocked, the vcpu's state may has been + * changed by other thread, so we must retry. + */ + if (!all_vcpus_paused()) { + goto retry; + } } void cpu_resume(CPUState *cpu) -- Gitee From a29922f76c9b5064ddd2e686fa725b96c435e889 Mon Sep 17 00:00:00 2001 From: Keqian Zhu Date: Sun, 17 Mar 2024 16:37:04 +0800 Subject: [PATCH 214/939] system/cpus: Fix resume_all_vcpus() under vCPU hotplug condition For vCPU being hotplugged, qemu_init_vcpu() is called. In this function, we set vcpu state as stopped, and then wait vcpu thread to be created. As the vcpu state is stopped, it will inform us it has been created and then wait on halt_cond. After we has realized vcpu object, we will resume the vcpu thread. However, during we wait vcpu thread to be created, the bql is unlocked, and other thread is allowed to call resume_all_vcpus(), which will resume the un-realized vcpu. This fixes the issue by filter out un-realized vcpu during resume_all_vcpus(). 
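For context, the window described above comes from the wait in qemu_init_vcpu(); a condensed paraphrase (simplified from system/cpus.c, not a hunk of this patch) is:

    cpu->stopped = true;
    cpus_accel->create_vcpu_thread(cpu);
    while (!cpu->created) {
        /* qemu_cond_wait() releases the BQL while sleeping, so another thread
         * can call pause_all_vcpus()/resume_all_vcpus() before this vCPU has
         * been realized. */
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }

The hunk below therefore skips any vCPU whose "realized" property is still false when resuming.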
Signed-off-by: Keqian Zhu --- system/cpus.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/system/cpus.c b/system/cpus.c index 7c5369fa9c..f2289e9545 100644 --- a/system/cpus.c +++ b/system/cpus.c @@ -618,6 +618,9 @@ void resume_all_vcpus(void) qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true); CPU_FOREACH(cpu) { + if (!object_property_get_bool(OBJECT(cpu), "realized", &error_abort)) { + continue; + } cpu_resume(cpu); } } -- Gitee From 25438f2cdb13d07c1bd228fcf4223c21da368548 Mon Sep 17 00:00:00 2001 From: Keqian Zhu Date: Tue, 26 Mar 2024 15:15:31 +0800 Subject: [PATCH 215/939] arm/virt.c: Convey local_err when set psci-conduit Signed-off-by: Keqian Zhu --- hw/arm/virt.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/hw/arm/virt.c b/hw/arm/virt.c index ed437ce0e8..934b0412ef 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -2323,7 +2323,10 @@ static void virt_cpu_set_properties(Object *cpuobj, const CPUArchId *cpu_slot, */ if (vms->psci_conduit != QEMU_PSCI_CONDUIT_DISABLED) { object_property_set_int(cpuobj, "psci-conduit", vms->psci_conduit, - NULL); + &local_err); + if (local_err) { + goto out; + } /* Secondary CPUs start in PSCI powered-down state */ if (CPU(cpuobj)->cpu_index > 0) { -- Gitee From 00a78edf572783c18a1d4945758371c0f175e321 Mon Sep 17 00:00:00 2001 From: Keqian Zhu Date: Tue, 26 Mar 2024 15:41:14 +0800 Subject: [PATCH 216/939] arm/virt: Fix adjudgement of core_id for vcpu hotplugged The core_id should between 0 and ms->smp.cores - 1. Signed-off-by: Keqian Zhu --- hw/arm/virt.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 934b0412ef..e60f3431f9 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -3170,8 +3170,6 @@ static void virt_cpu_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, ARMCPU *cpu = ARM_CPU(dev); CPUState *cs = CPU(dev); CPUArchId *cpu_slot; - int32_t min_cpuid = 0; - int32_t max_cpuid; if (dev->hotplugged && !vms->acpi_dev) { error_setg(errp, "GED acpi device does not exists"); @@ -3196,15 +3194,9 @@ static void virt_cpu_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, return; } - max_cpuid = ms->possible_cpus->len - 1; - if (!dev->hotplugged) { - min_cpuid = vms->acpi_dev ? ms->smp.cpus : 0; - max_cpuid = vms->acpi_dev ? max_cpuid : ms->smp.cpus - 1; - } - - if ((cpu->core_id < min_cpuid) || (cpu->core_id > max_cpuid)) { - error_setg(errp, "Invalid core-id %d specified, correct range %d:%d", - cpu->core_id, min_cpuid, max_cpuid); + if ((cpu->core_id < 0) || (cpu->core_id >= ms->smp.cores)) { + error_setg(errp, "Invalid core-id %d specified, correct range 0:%u", + cpu->core_id, ms->smp.cores - 1); return; } -- Gitee From 9de26d69c52db67f48619ad20b8cb9d8ee71e42c Mon Sep 17 00:00:00 2001 From: Keqian Zhu Date: Tue, 26 Mar 2024 15:42:57 +0800 Subject: [PATCH 217/939] accel/kvm: Use correct id for parked vcpu kvm_arch_vcpu_id is correct for all platform. 
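The id stored when parking matters because the later lookup is keyed on kvm_arch_vcpu_id(), not on cpu_index. A condensed paraphrase of that lookup in accel/kvm/kvm-all.c (function name and the final fallback are simplified here for illustration):

    static int kvm_get_parked_fd_sketch(KVMState *s, unsigned long vcpu_id)
    {
        struct KVMParkedVcpu *cpu;

        /* Callers pass kvm_arch_vcpu_id(cs); an entry parked under cpu_index
         * would never match on targets where the two ids differ. */
        QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
            if (cpu->vcpu_id == vcpu_id) {
                int kvm_fd = cpu->kvm_fd;

                QLIST_REMOVE(cpu, node);
                g_free(cpu);
                return kvm_fd;
            }
        }
        return -1; /* nothing parked; a fresh vCPU fd is created instead */
    }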
Signed-off-by: Keqian Zhu --- accel/kvm/kvm-all.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index 6d503aa614..75a3075c14 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -327,11 +327,10 @@ err: void kvm_park_vcpu(CPUState *cpu) { - unsigned long vcpu_id = cpu->cpu_index; struct KVMParkedVcpu *vcpu; vcpu = g_malloc0(sizeof(*vcpu)); - vcpu->vcpu_id = vcpu_id; + vcpu->vcpu_id = kvm_arch_vcpu_id(cpu); vcpu->kvm_fd = cpu->kvm_fd; QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node); } -- Gitee From 85e8e1ee8560e587845142342f81b218e44cba6a Mon Sep 17 00:00:00 2001 From: Keqian Zhu Date: Tue, 26 Mar 2024 22:07:33 +0800 Subject: [PATCH 218/939] arm/kvm: Set psci smccc filter only with vcpu hotplug The smccc filter mechanism is supported by newer Linux kernel, don't try to do it unconditionaly. Signed-off-by: Keqian Zhu --- hw/arm/virt.c | 4 +++- target/arm/kvm.c | 21 ++++++++++++--------- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/hw/arm/virt.c b/hw/arm/virt.c index e60f3431f9..38b5d214a1 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -2366,8 +2366,10 @@ static void machvirt_init(MachineState *machine) finalize_gic_version(vms); if (tcg_enabled() || hvf_enabled() || qtest_enabled() || (vms->gic_version < VIRT_GIC_VERSION_3)) { - machine->smp.max_cpus = smp_cpus; mc->has_hotpluggable_cpus = false; + } + if (!mc->has_hotpluggable_cpus) { + machine->smp.max_cpus = smp_cpus; warn_report("cpu hotplug feature has been disabled"); } diff --git a/target/arm/kvm.c b/target/arm/kvm.c index 66caf9e5e7..19783d567f 100644 --- a/target/arm/kvm.c +++ b/target/arm/kvm.c @@ -259,6 +259,7 @@ int kvm_arch_get_default_type(MachineState *ms) int kvm_arch_init(MachineState *ms, KVMState *s) { + MachineClass *mc = MACHINE_GET_CLASS(ms); int ret = 0; /* For ARM interrupt delivery is always asynchronous, @@ -316,15 +317,17 @@ int kvm_arch_init(MachineState *ms, KVMState *s) * filter in the Host KVM. This is required to support features like * virtual CPU Hotplug on ARM platforms. */ - if (kvm_arm_set_smccc_filter(PSCI_0_2_FN64_CPU_ON, - KVM_SMCCC_FILTER_FWD_TO_USER)) { - error_report("CPU On PSCI-to-user-space fwd filter install failed"); - abort(); - } - if (kvm_arm_set_smccc_filter(PSCI_0_2_FN_CPU_OFF, - KVM_SMCCC_FILTER_FWD_TO_USER)) { - error_report("CPU Off PSCI-to-user-space fwd filter install failed"); - abort(); + if (mc->has_hotpluggable_cpus && ms->smp.max_cpus > ms->smp.cpus) { + if (kvm_arm_set_smccc_filter(PSCI_0_2_FN64_CPU_ON, + KVM_SMCCC_FILTER_FWD_TO_USER)) { + error_report("CPU On PSCI-to-user-space fwd filter install failed"); + mc->has_hotpluggable_cpus = false; + } + if (kvm_arm_set_smccc_filter(PSCI_0_2_FN_CPU_OFF, + KVM_SMCCC_FILTER_FWD_TO_USER)) { + error_report("CPU Off PSCI-to-user-space fwd filter install failed"); + mc->has_hotpluggable_cpus = false; + } } kvm_arm_init_debug(s); -- Gitee From 343b61303152b06f9e1ba6d09a405faeaa3fcc98 Mon Sep 17 00:00:00 2001 From: Keqian Zhu Date: Tue, 26 Mar 2024 22:12:58 +0800 Subject: [PATCH 219/939] intc/gicv3: Fixes for vcpu hotplug 1. Some types of machine don't support possible_cpus callback. 2. The cpu_update_notifier is register only when machine support vcpu hotplug, so do notifier_remove() unconditi- onally is wrong. 
Signed-off-by: Keqian Zhu --- cpu-common.c | 4 ++++ hw/intc/arm_gicv3_common.c | 9 +++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/cpu-common.c b/cpu-common.c index da52e45760..54e63b3f77 100644 --- a/cpu-common.c +++ b/cpu-common.c @@ -113,6 +113,10 @@ CPUState *qemu_get_possible_cpu(int index) MachineState *ms = MACHINE(qdev_get_machine()); const CPUArchIdList *possible_cpus = ms->possible_cpus; + if (possible_cpus == NULL) { + return qemu_get_cpu(index); + } + assert((index >= 0) && (index < possible_cpus->len)); return CPU(possible_cpus->cpus[index].cpu); diff --git a/hw/intc/arm_gicv3_common.c b/hw/intc/arm_gicv3_common.c index d051024a30..5667d9f40b 100644 --- a/hw/intc/arm_gicv3_common.c +++ b/hw/intc/arm_gicv3_common.c @@ -25,6 +25,7 @@ #include "qapi/error.h" #include "qemu/module.h" #include "qemu/error-report.h" +#include "hw/boards.h" #include "hw/core/cpu.h" #include "hw/intc/arm_gicv3_common.h" #include "hw/qdev-properties.h" @@ -446,7 +447,7 @@ static void arm_gicv3_common_realize(DeviceState *dev, Error **errp) s->cpu = g_new0(GICv3CPUState, s->num_cpu); for (i = 0; i < s->num_cpu; i++) { - CPUState *cpu = qemu_get_possible_cpu(i); + CPUState *cpu = qemu_get_possible_cpu(i) ? : qemu_get_cpu(i); uint64_t cpu_affid; if (qemu_enabled_cpu(cpu)) { @@ -506,8 +507,12 @@ static void arm_gicv3_common_realize(DeviceState *dev, Error **errp) static void arm_gicv3_finalize(Object *obj) { GICv3State *s = ARM_GICV3_COMMON(obj); + Object *ms = qdev_get_machine(); + MachineClass *mc = MACHINE_GET_CLASS(ms); - notifier_remove(&s->cpu_update_notifier); + if (mc->has_hotpluggable_cpus) { + notifier_remove(&s->cpu_update_notifier); + } g_free(s->redist_region_count); } -- Gitee From 6e17d32d6df25d4fac1a31da61d89e0bb9c8c7da Mon Sep 17 00:00:00 2001 From: Keqian Zhu Date: Tue, 26 Mar 2024 22:20:20 +0800 Subject: [PATCH 220/939] acpi/ged: Init cpu hotplug only when machine support it Signed-off-by: Keqian Zhu --- hw/acpi/generic_event_device.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/hw/acpi/generic_event_device.c b/hw/acpi/generic_event_device.c index 0266733a54..6e4f5f075f 100644 --- a/hw/acpi/generic_event_device.c +++ b/hw/acpi/generic_event_device.c @@ -403,6 +403,7 @@ static void acpi_ged_initfn(Object *obj) AcpiGedState *s = ACPI_GED(dev); SysBusDevice *sbd = SYS_BUS_DEVICE(obj); GEDState *ged_st = &s->ged_state; + MachineClass *mc; memory_region_init_io(&ged_st->evt, obj, &ged_evt_ops, ged_st, TYPE_ACPI_GED, ACPI_GED_EVT_SEL_LEN); @@ -427,12 +428,15 @@ static void acpi_ged_initfn(Object *obj) TYPE_ACPI_GED "-regs", ACPI_GED_REG_COUNT); sysbus_init_mmio(sbd, &ged_st->regs); - s->cpuhp.device = OBJECT(s); - memory_region_init(&s->container_cpuhp, OBJECT(dev), "cpuhp container", - ACPI_CPU_HOTPLUG_REG_LEN); - sysbus_init_mmio(SYS_BUS_DEVICE(dev), &s->container_cpuhp); - cpu_hotplug_hw_init(&s->container_cpuhp, OBJECT(dev), - &s->cpuhp_state, 0); + mc = MACHINE_GET_CLASS(qdev_get_machine()); + if (mc->possible_cpu_arch_ids) { + s->cpuhp.device = OBJECT(s); + memory_region_init(&s->container_cpuhp, OBJECT(dev), "cpuhp container", + ACPI_CPU_HOTPLUG_REG_LEN); + sysbus_init_mmio(SYS_BUS_DEVICE(dev), &s->container_cpuhp); + cpu_hotplug_hw_init(&s->container_cpuhp, OBJECT(dev), + &s->cpuhp_state, 0); + } } static void acpi_ged_class_init(ObjectClass *class, void *data) -- Gitee From 7af2722536b4b0d80f6c508066e8e77158869923 Mon Sep 17 00:00:00 2001 From: Keqian Zhu Date: Tue, 26 Mar 2024 23:34:01 +0800 Subject: [PATCH 221/939] 
acpi/ged: Remove cpuhp field of ged It's unused. Signed-off-by: Keqian Zhu --- hw/acpi/generic_event_device.c | 1 - include/hw/acpi/generic_event_device.h | 1 - 2 files changed, 2 deletions(-) diff --git a/hw/acpi/generic_event_device.c b/hw/acpi/generic_event_device.c index 6e4f5f075f..4731a614a3 100644 --- a/hw/acpi/generic_event_device.c +++ b/hw/acpi/generic_event_device.c @@ -430,7 +430,6 @@ static void acpi_ged_initfn(Object *obj) mc = MACHINE_GET_CLASS(qdev_get_machine()); if (mc->possible_cpu_arch_ids) { - s->cpuhp.device = OBJECT(s); memory_region_init(&s->container_cpuhp, OBJECT(dev), "cpuhp container", ACPI_CPU_HOTPLUG_REG_LEN); sysbus_init_mmio(SYS_BUS_DEVICE(dev), &s->container_cpuhp); diff --git a/include/hw/acpi/generic_event_device.h b/include/hw/acpi/generic_event_device.h index a803ea818e..90fc41cbb8 100644 --- a/include/hw/acpi/generic_event_device.h +++ b/include/hw/acpi/generic_event_device.h @@ -110,7 +110,6 @@ struct AcpiGedState { MemoryRegion container_memhp; CPUHotplugState cpuhp_state; MemoryRegion container_cpuhp; - AcpiCpuHotplug cpuhp; GEDState ged_state; uint32_t ged_event_bitmap; qemu_irq irq; -- Gitee From 0bee56446962676992d11e5879f6fbac57e785e8 Mon Sep 17 00:00:00 2001 From: Keqian Zhu Date: Tue, 26 Mar 2024 23:38:31 +0800 Subject: [PATCH 222/939] arm/virt-acpi: Require possible_cpu_arch_ids for build_cpus_aml() As the acpi_dev requires possible_cpu_arch_ids to support vcpu hotplug. Signed-off-by: Keqian Zhu --- hw/arm/virt-acpi-build.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c index 590afcfa98..46642efac4 100644 --- a/hw/arm/virt-acpi-build.c +++ b/hw/arm/virt-acpi-build.c @@ -1003,6 +1003,7 @@ static void build_dsdt(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) { VirtMachineClass *vmc = VIRT_MACHINE_GET_CLASS(vms); + MachineClass *mc = MACHINE_GET_CLASS(vms); Aml *scope, *dsdt; MachineState *ms = MACHINE(vms); const MemMapEntry *memmap = vms->memmap; @@ -1020,7 +1021,7 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) */ scope = aml_scope("\\_SB"); /* if GED is enabled then cpus AML shall be added as part build_cpus_aml */ - if (vms->acpi_dev) { + if (mc->has_hotpluggable_cpus) { CPUHotplugFeatures opts = { .acpi_1_compatible = false, .has_legacy_cphp = false -- Gitee From baa26f2fc075522f91c3e9a332fc4fa3f3b167bf Mon Sep 17 00:00:00 2001 From: Keqian Zhu Date: Tue, 26 Mar 2024 22:55:49 +0800 Subject: [PATCH 223/939] arm/virt: Consider has_ged when set mc->has_hotpluggable_cpus Vcpu hotplug relies on ged device. 
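Taken together with the previous patches, the gate being built up here condenses to roughly the following (paraphrasing the machvirt_init() hunk below; 'aarch64_cpu_model' stands in for the object_class_dynamic_cast() check and is not a real variable):

    /* vCPU hotplug stays available only when every dependency holds. */
    bool hotpluggable = has_ged &&          /* ACPI GED (needs firmware + ACPI) */
                        aarch64_cpu_model &&
                        vms->gic_version >= VIRT_GIC_VERSION_3 &&
                        !tcg_enabled() && !hvf_enabled() && !qtest_enabled();

Otherwise mc->has_hotpluggable_cpus is cleared and machine->smp.max_cpus is pinned to the cold-booted count.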
Signed-off-by: Keqian Zhu --- hw/arm/virt.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 38b5d214a1..00e57f2d75 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -2357,6 +2357,7 @@ static void machvirt_init(MachineState *machine) bool has_ged = !vmc->no_ged; unsigned int smp_cpus = machine->smp.cpus; unsigned int max_cpus = machine->smp.max_cpus; + ObjectClass *cpu_class; if (!cpu_type_valid(machine->cpu_type)) { error_report("mach-virt: CPU type %s not supported", machine->cpu_type); @@ -2364,14 +2365,6 @@ static void machvirt_init(MachineState *machine) } finalize_gic_version(vms); - if (tcg_enabled() || hvf_enabled() || qtest_enabled() || - (vms->gic_version < VIRT_GIC_VERSION_3)) { - mc->has_hotpluggable_cpus = false; - } - if (!mc->has_hotpluggable_cpus) { - machine->smp.max_cpus = smp_cpus; - warn_report("cpu hotplug feature has been disabled"); - } possible_cpus = mc->possible_cpu_arch_ids(machine); @@ -2501,6 +2494,21 @@ static void machvirt_init(MachineState *machine) create_fdt(vms); qemu_log("cpu init start\n"); + cpu_class = object_class_by_name(machine->cpu_type); + has_ged = has_ged && firmware_loaded && + virt_is_acpi_enabled(vms) && + !!object_class_dynamic_cast(cpu_class, TYPE_AARCH64_CPU); + if (tcg_enabled() || hvf_enabled() || qtest_enabled() || + (vms->gic_version < VIRT_GIC_VERSION_3) || !has_ged) { + mc->has_hotpluggable_cpus = false; + } + if (!mc->has_hotpluggable_cpus) { + if (machine->smp.max_cpus > smp_cpus) { + warn_report("cpu hotplug feature has been disabled"); + } + machine->smp.max_cpus = smp_cpus; + } + notifier_list_init(&vms->cpuhp_notifiers); possible_cpus = mc->possible_cpu_arch_ids(machine); assert(possible_cpus->len == max_cpus); @@ -2581,8 +2589,6 @@ static void machvirt_init(MachineState *machine) create_gic(vms, sysmem); - has_ged = has_ged && aarch64 && firmware_loaded && - virt_is_acpi_enabled(vms); if (has_ged) { vms->acpi_dev = create_acpi_ged(vms); } -- Gitee From 519699c61eeb980bb7d7f443eb95c0406aae82da Mon Sep 17 00:00:00 2001 From: Keqian Zhu Date: Tue, 26 Mar 2024 23:05:39 +0800 Subject: [PATCH 224/939] arm/virt: Require mc->has_hotpluggable_cpus for cold-plugged vcpu Cold-plugged vCPU also need mc->has_hotpluggable_cpus. 
Signed-off-by: Keqian Zhu --- hw/arm/virt.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 00e57f2d75..73b29c7f73 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -3179,16 +3179,6 @@ static void virt_cpu_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, CPUState *cs = CPU(dev); CPUArchId *cpu_slot; - if (dev->hotplugged && !vms->acpi_dev) { - error_setg(errp, "GED acpi device does not exists"); - return; - } - - if (dev->hotplugged && !mc->has_hotpluggable_cpus) { - error_setg(errp, "CPU hotplug not supported on this machine"); - return; - } - /* sanity check the cpu */ if (!object_dynamic_cast(OBJECT(cpu), ms->cpu_type)) { error_setg(errp, "Invalid CPU type, expected cpu type: '%s'", @@ -3222,6 +3212,17 @@ static void virt_cpu_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, cs->cpu_index = virt_get_cpu_id_from_cpu_topo(ms, dev); + /* Except for cold-booted vCPUs, this should check presence of ACPI GED */ + if (cs->cpu_index >= ms->smp.cpus && !vms->acpi_dev) { + error_setg(errp, "GED acpi device does not exists"); + return; + } + + if (cs->cpu_index >= ms->smp.cpus && !mc->has_hotpluggable_cpus) { + error_setg(errp, "CPU [cold|hot]plug not supported on this machine"); + return; + } + cpu_slot = virt_find_cpu_slot(ms, cs->cpu_index); if (qemu_present_cpu(CPU(cpu_slot->cpu))) { error_setg(errp, "cpu(id%d=%d:%d:%d:%d) with arch-id %" PRIu64 " exist", -- Gitee From cecec52ca38fa98a821c2a833e71a5fae1cc735d Mon Sep 17 00:00:00 2001 From: Keqian Zhu Date: Tue, 2 Apr 2024 20:10:51 +0800 Subject: [PATCH 225/939] tests/acpi: Update expected ACPI tables for vcpu hotplug Update the ACPI tables for vcpu hotplug. Signed-off-by: Keqian Zhu --- tests/data/acpi/pc/DSDT | Bin 6830 -> 6864 bytes tests/data/acpi/pc/DSDT.acpierst | Bin 6741 -> 6775 bytes tests/data/acpi/pc/DSDT.acpihmat | Bin 8155 -> 8189 bytes tests/data/acpi/pc/DSDT.bridge | Bin 13701 -> 13735 bytes tests/data/acpi/pc/DSDT.cphp | Bin 7294 -> 7328 bytes tests/data/acpi/pc/DSDT.dimmpxm | Bin 8484 -> 8518 bytes tests/data/acpi/pc/DSDT.hpbridge | Bin 6781 -> 6815 bytes tests/data/acpi/pc/DSDT.hpbrroot | Bin 3337 -> 3371 bytes tests/data/acpi/pc/DSDT.ipmikcs | Bin 6902 -> 6936 bytes tests/data/acpi/pc/DSDT.memhp | Bin 8189 -> 8223 bytes tests/data/acpi/pc/DSDT.nohpet | Bin 6688 -> 6722 bytes tests/data/acpi/pc/DSDT.numamem | Bin 6836 -> 6870 bytes tests/data/acpi/pc/DSDT.roothp | Bin 10623 -> 10657 bytes tests/data/acpi/q35/DSDT | Bin 8355 -> 8389 bytes tests/data/acpi/q35/DSDT.acpierst | Bin 8372 -> 8406 bytes tests/data/acpi/q35/DSDT.acpihmat | Bin 9680 -> 9714 bytes tests/data/acpi/q35/DSDT.acpihmat-noinitiator | Bin 8634 -> 8668 bytes tests/data/acpi/q35/DSDT.applesmc | Bin 8401 -> 8435 bytes tests/data/acpi/q35/DSDT.bridge | Bin 11968 -> 12002 bytes tests/data/acpi/q35/DSDT.cphp | Bin 8819 -> 8853 bytes tests/data/acpi/q35/DSDT.cxl | Bin 9713 -> 9747 bytes tests/data/acpi/q35/DSDT.dimmpxm | Bin 10009 -> 10043 bytes tests/data/acpi/q35/DSDT.ipmibt | Bin 8430 -> 8464 bytes tests/data/acpi/q35/DSDT.ipmismbus | Bin 8443 -> 8477 bytes tests/data/acpi/q35/DSDT.ivrs | Bin 8372 -> 8406 bytes tests/data/acpi/q35/DSDT.memhp | Bin 9714 -> 9748 bytes tests/data/acpi/q35/DSDT.mmio64 | Bin 9485 -> 9519 bytes tests/data/acpi/q35/DSDT.multi-bridge | Bin 13208 -> 13242 bytes tests/data/acpi/q35/DSDT.noacpihp | Bin 8235 -> 8269 bytes tests/data/acpi/q35/DSDT.nohpet | Bin 8213 -> 8247 bytes tests/data/acpi/q35/DSDT.numamem | Bin 8361 -> 8395 bytes 
tests/data/acpi/q35/DSDT.pvpanic-isa | Bin 8456 -> 8490 bytes tests/data/acpi/q35/DSDT.tis.tpm12 | Bin 8961 -> 8995 bytes tests/data/acpi/q35/DSDT.tis.tpm2 | Bin 8987 -> 9021 bytes tests/data/acpi/q35/DSDT.viot | Bin 9464 -> 9498 bytes tests/data/acpi/virt/DSDT | Bin 5669 -> 5814 bytes tests/data/acpi/virt/DSDT.acpihmatvirt | Bin 7178 -> 7323 bytes tests/data/acpi/virt/DSDT.memhp | Bin 7030 -> 7175 bytes tests/data/acpi/virt/DSDT.pxb | Bin 8152 -> 8297 bytes tests/data/acpi/virt/DSDT.topology | Bin 9190 -> 9335 bytes tests/qtest/bios-tables-test-allowed-diff.h | 40 ------------------ 41 files changed, 40 deletions(-) diff --git a/tests/data/acpi/pc/DSDT b/tests/data/acpi/pc/DSDT index c93ad6b7f83a168a1833d7dba1112dd2ab8a431f..714279255b1c17241109cfaaf25a1e1ccb41259e 100644 GIT binary patch delta 125 zcmZ2ydcl;-CDAX8JH%o zWz%5tGM#*#&5X&FV=_OxshRoY7=0Fyh5%*%X)AX8JH%o zWz%5tGM#*#&5X&FV=_OxshRoY7=0Fyh5%;M1& diff --git a/tests/data/acpi/pc/DSDT.acpihmat b/tests/data/acpi/pc/DSDT.acpihmat index 9d3695ff289036856886a093733926667a32a058..295e3d7bf86d080cefb9d1571cb84c7325a1d98d 100644 GIT binary patch delta 108 zcmca@|JR<&CD|Nj6Rg_xNC^_~<6z$vSLtjJlJ%*%a8F1A>AX8JH%o zWz%5tw4QvO&5_BKW3m*xshIiX7=0Fyh5%>p099}t#{d8T delta 73 zcmV-P0Ji`AKifYFL{mgm+aCY`0m!im83qYXL|;=vUt5AmlPm^a0VtD;1|R|e1Cz1_ fC<075liUVD0!Rjv9|tv)YX=&W@d*{Ppa)$TEBF;t diff --git a/tests/data/acpi/pc/DSDT.bridge b/tests/data/acpi/pc/DSDT.bridge index 840b45f354ac14c858d0af8fbd31e97949a65d4b..ff9694ee4cbd7213b1f9f9e684c6c9feb9564327 100644 GIT binary patch delta 125 zcmZq8UY^b666_MP+?0WVF>522BpaurV|=hvd~}oVWF59RM%~HXYzpko0YSlx3`~>P zvS~1RnNGgWX2#^oF`1v;)XaQxj6MrULx8iVK|qjeFj(mVDNJEkKgI$s{)C0B1zfzF IW7#$I09t+_egFUf delta 76 zcmZ3U-I~qi66_MvYRbUCD7leKl8w{fE)EB*7P1y_@g_{-65YI> H{k#YOSeG66 delta 69 zcmV-L0J{I6IsP~bL{mgmejES*0dKJi83qYYTVGQ`Ut5AmlPm^a0VtD;1|R|e1Cz1_ bC<0ATliUV!0!RjvPzN=WoCg}Sya(hIsSp$s diff --git a/tests/data/acpi/pc/DSDT.dimmpxm b/tests/data/acpi/pc/DSDT.dimmpxm index 1294f655d418dbdccc095e0d47ab220869a61a07..6b74c6cdcb01aa2cb781715a8b595072da5c2fab 100644 GIT binary patch delta 103 zcmZ4Dbj*p%CDAX8JH#? 
zWYb`BcAxy3Et1KVW3n5&shIiX7=0Fyh5%gvs@sqMOgKS4jc@)EycW diff --git a/tests/data/acpi/pc/DSDT.hpbridge b/tests/data/acpi/pc/DSDT.hpbridge index 8012b5eb3155377dc7995b73059ecb267d19232c..39e93ce02b6cd6db664f5469c6b974675205ced8 100644 GIT binary patch delta 125 zcmexsGT)TTCDAX8JH#~ zuxc=QnNIFvHDhz-a1IV}oUG5LXUUMTzOkr0)#sV(> QgoUgHT)dkF*nYAB02x0casU7T delta 98 zcmZ22)hWg066_Mf$;-gNsJD^p3=5~fU3{=pd~}n?U_1CDAX8JH%o zWz%5tGM#*#&5X&FV=_OxshRoY7=0Fyh5%P zvS~0mm`%RUX2#^oF`1v;)XaQxj6MrULx8iVK|qjeFj(mVDNJEkKgI$s{)C0B1zfzF JW7!u-000gPAuRv^ delta 76 zcmV-S0JHy}K>a@oL{mgm{T~1T0sXNG83qYOKVMTqUt5AmlPm^a0VtD;1|R|e1Cz1_ iC;~$?liUV221o`&Q&d5do(C=h0AQ0}2OG112cH;ceig<5 diff --git a/tests/data/acpi/pc/DSDT.nohpet b/tests/data/acpi/pc/DSDT.nohpet index 1754c6878839fc657230e1e714cd7c5142e0a77e..b8b37e810e17176cfc3ce105bf3a8c97de627f8d 100644 GIT binary patch delta 125 zcmZ2ra>#_sCD=d&A diff --git a/tests/data/acpi/pc/DSDT.numamem b/tests/data/acpi/pc/DSDT.numamem index 9fc731d3d2bcde5e2612a8ccd81e12098134afe9..6283080304b6b6d80346457a0f428867a9e4a09b 100644 GIT binary patch delta 125 zcmdmDdd-x}CDE^a0kyFT83qYOKVMTqUt5AmlPm^a0VtD;1|R|e1Cz1_ iC;~$?liUV221o`&Q&d5do(C=h0AQ0}2OG112UZg{ITX79 diff --git a/tests/data/acpi/pc/DSDT.roothp b/tests/data/acpi/pc/DSDT.roothp index e654c83ebe40c413b204c711adcefe3f04655e8c..19b5f0ceea9ad696e6fff21f0cb9955217b017ac 100644 GIT binary patch delta 125 zcmew#v@n>*%X)0CDAX8JH%| zmegSKGM#)>(u~QKWAa}~Q!(?&G5RbZ4FS%c1_42?!Cn7P1y_@g_{(C?mQ# IQOb!80A}|c$^ZZW delta 76 zcmX@=xY&`)CDu1FqxnWIc1ccuyZB(I_~<5&$x@PWj2e?GB^8($m?tlg)L`;8 go_tEujLn11IXJ{|a*vcA6GOseUn%L$c~aqQ02K=qHUIzs diff --git a/tests/data/acpi/q35/DSDT.acpierst b/tests/data/acpi/q35/DSDT.acpierst index 46fd25400b7c00ee9149ddb64cb5d5bd73f6a82b..f91cbe55fcfeea319babf7c9a0c6a6ccdc3320d1 100644 GIT binary patch delta 108 zcmdnuc+HW^CDAX8JH%| zmegSKGM#)>(u~QKWAa}~Q!(?&G5RbZ4FS%c1_42?!Cn7P1y_@g_{(C?mQ# IQ7V`X09c|Ot^fc4 delta 76 zcmccSxW$pnCD2b8v{`AX8JH%| zmegSKw4Qua(vj1Z!#Oy_kuzF(vZ9Re*F7d%m@zG5llcgl%7&RtWN-8ihFi&0}slnu9 zIr)^NBa;W)WL_y#DXGaZ`Ya%|0nVNV0YR?8&H=7|j0If$36mXVL^tL{mgmx*-4n0ll#b2N?-QP+wC*Ut5AmlNuRb0VtDs86W}x1CyW` iC;~)9lgb%c21o`&Q&d5duNp1_0AQ1F8XL2b8cPRMBNdqd diff --git a/tests/data/acpi/q35/DSDT.applesmc b/tests/data/acpi/q35/DSDT.applesmc index 944209adeaa5bbb722431161c404cb51b8209993..a5d032b7d96113c9393036b2ba831adb6d584142 100644 GIT binary patch delta 108 zcmccU_}P)mCDAX8JH%| zmegSKGM#)>(u~QKWAa}~Q!(?&G5RbZ4FS%c1_42?!Cn7P1y_@g_{(C?mQ# IQL2Cq0GY8KasU7T delta 76 zcmezDc+ru|CDQ`zV&nCD4FZB(gTYEC8%jyDEo3d=;!T*mQATug IqEx&H0I`T3t^fc4 delta 76 zcmaD9dmxs}CDQ!(?&G5RbZ4FS%c1_42?!CzZ%vD0Bsc$ diff --git a/tests/data/acpi/q35/DSDT.cxl b/tests/data/acpi/q35/DSDT.cxl index 145301c52af9a17242bb306c210f8a7e0f01b827..27a1726af2d71417e0cd41b2a14bea0d9c410225 100644 GIT binary patch delta 108 zcmez9J=urLCD!(S0KqyCkQhV|=hvd~}oVWGTrwM%~Gkk_zn30YSlx3`~<} zOKLEAnNB_`X~yKrG5N2gshIiX7=0Fyh5%AX8JH%o zm(*Z#cAtDxGLp@e!#Oy_aq>bbJxhj!1?H1u^jSdq0-QY!0)kwF!8#U5VG6tYF&1#~ QCoE(w;NsofBsHA}0HD?-HUIzs delta 91 zcmdn(H`9;HCDNE4v?QlTKzy)Md~}n?WIf3^Mvck6k_t==%#*iBYB0IE tPQE7@$>hN{*;2|>QfhLHJ_|@~fU~DTK#*&&bAYQKV*wZc<|R_oc>rS!7_9&R diff --git a/tests/data/acpi/q35/DSDT.ipmibt b/tests/data/acpi/q35/DSDT.ipmibt index 45f911ada5645f158f3d6c0c430ec1d52cadc5d8..25f43ae8efb55364a739e6b5e3cb4e71e61862b0 100644 GIT binary patch delta 108 zcmaFoIKhd_CDAX8JH%| zmegSKGM#)>(u~QKWAa}~Q!(?&G5RbZ4FS%c1_42?!Cn7P1y_@g_{(C?mQ# IQL2{>06xVXs{jB1 delta 76 zcmbQ>^v;pXCD`WA;Wac1ccuyZB(I_~<5&$x@PWj2e?GB^8($m?tlg)L`;8 go_tEujLn11IXJ{|a*vcA6GOseUn%L$c~Xnm06Z5J7XSbN diff --git 
a/tests/data/acpi/q35/DSDT.ipmismbus b/tests/data/acpi/q35/DSDT.ipmismbus index e5d6811bee1233d74236453c49060390d74d4416..32bcd25bda9e9d2775790385f8da6a11e9d5cb46 100644 GIT binary patch delta 108 zcmezEIM<2GCD02)FaPyhe` delta 76 zcmV-S0JHy{Li<4qL{mgm`yc=S0o<_)2N?-ZK3`KpUt5AmlNuRb0VtDs86W}x1CyW` iC<0A0lgb%321o`&Q&d5dj2bQi0AQ0%8XL228m|Uoloh-H diff --git a/tests/data/acpi/q35/DSDT.ivrs b/tests/data/acpi/q35/DSDT.ivrs index 46fd25400b7c00ee9149ddb64cb5d5bd73f6a82b..f91cbe55fcfeea319babf7c9a0c6a6ccdc3320d1 100644 GIT binary patch delta 108 zcmdnuc+HW^CDAX8JH%| zmegSKGM#)>(u~QKWAa}~Q!(?&G5RbZ4FS%c1_42?!Cn7P1y_@g_{(C?mQ# IQ7V`X09c|Ot^fc4 delta 76 zcmccSxW$pnCD2b8v{`50iyCkQ(V|=hvd~}oVWGTrwM%~Gkk_zn30YSlx3`~<} zOKLDVm`y$^X~yKrG5N2gshIiX7=0Fyh5%66_M9ugbu{sJ)SkU6Rw?F+SKSKDxTqweHNNd4FZB(gTYEC8%jyDEo3d=;!T*mQATug IqLjHL0EhJ+B>(^b delta 76 zcmdm$J|msWCDNGPy72LuH(GB8c{ zlh9!DGM!u@VaDXjF?oZ8shRoY7=0Fyh5%u-bvkCDnCDoVSr39y=V|=hvd~}oVAX8JH&X zN@_59nNBvBG-Gn*n4B+ZYGyt;MxOBc1ccm$M|5U_~<6z$x@PWjJlI6B^B761A>AX8JH%| zmegQ!Fq?c-(u~QKWAa}~Q!(?&G5RbZ4FS%c1_42?!Cn7P1y_@g_{(C?mQ# IQObi207+UMdH?_b delta 76 zcmX@@xYCi!CD2b8v{`TqweHNNdUP0Qtck6951J delta 76 zcmZ4G)ZxVC66_Mfp~%3%_;Mo`yCkQ-U3{=pd~}n?WGTrwMvcjpk_t==%##;LYB2d4 gPd+7S#^%B1930{}xkpNmi6LRKuaxxWJgGfw010Fie*gdg diff --git a/tests/data/acpi/q35/DSDT.tis.tpm12 b/tests/data/acpi/q35/DSDT.tis.tpm12 index e381ce4cbf2b11f56a2d0537db4d21acc97450c9..29a416f0508655d2bfde01fff4d25ad7f89581d9 100644 GIT binary patch delta 108 zcmZp4TkOW=66_M9tjxf`xML$1yCkQhV|=hvd~}oVWGTrwM%~Gkk_zn30YSlx3`~<} zOKLEAnNB_`X~yKrG5N2gshIiX7=0Fyh5%TqsHV)Nd+bb=E(~rHJE&j gC!dluWAk8h4i0gg+#{vO#E>xAS4w(wp43(j0Oqn2?EnA( diff --git a/tests/data/acpi/q35/DSDT.tis.tpm2 b/tests/data/acpi/q35/DSDT.tis.tpm2 index a09253042ce4a715922027245de8a2ab7449c5b7..59288f02c43cf2efc1555599131fde05dbbaa1cd 100644 GIT binary patch delta 108 zcmbR3w%3izCD0PFk|4*&oF diff --git a/tests/data/acpi/q35/DSDT.viot b/tests/data/acpi/q35/DSDT.viot index 64e81f571120e3eb2b8c6c9545293a78c75b7bbd..d4d05ed620114ce793a59285e6237b566dd1390a 100644 GIT binary patch delta 108 zcmez2Im?U7CDpF delta 76 zcmV-S0JHy^O87|%L{mgm_#^-T0pzg?2N?-ZK3`KpUt5AmlNuRb0VtDs86W}x1CyW` iC<0A0lgb%321o`&Q&d5dj2bQi0AQ0%8XL228mtOupB2ym diff --git a/tests/data/acpi/virt/DSDT b/tests/data/acpi/virt/DSDT index dd8573f0312918ea1ba17496e9f3a275e98ab214..d49ead54fa2d8fcd6c0f25ba74e748d90fec3551 100644 GIT binary patch delta 201 zcmZ3gvrU)FCDqXqzm5LunT!@$tMkgz~_ z(qym-h@1dOt{?#)q`gBi^xzY~>_Tp%&I eI)RIWfr}$XpE*9*Dc(6CG}t-V*>5tJm^1)(;5q#O delta 71 zcmdm{yHtnECDC{9nX{LsRjq1M{H_u`!6J~N^pDZL6EGrxD8WzG?;2P}e bldwQwa&-b169X4hTR?b_6Ia9J*<#WF*GLom diff --git a/tests/data/acpi/virt/DSDT.acpihmatvirt b/tests/data/acpi/virt/DSDT.acpihmatvirt index 2581d1937d531db5ad4c8146755a75199ebc0e5a..c753f34bb050d146f4dc3195ec850ea26b3141ba 100644 GIT binary patch delta 224 zcmeCOm~F}B66_K(TZVywY3)QVX{NU08`U4M*NeEb$NL96vvg%MIJ<-!F7RZXoFOR8 zC6Kp4kVD<;r%P{w&F({b)gE2hTr5BZMg<@uVS&J;$(f9`ARSyBAhJ4vhk>DiAz^{= zq{(0v5IF&mTtNazWFZek2xN-klI-XvD`$USgUx&#`NB-z9Fyy%gU!t1UBf~+3tWR; yeSkJhu1?@$V&GzG3kVN#;%ZnR0TSR~;NpnUXO0hcigykO4R#K8_M7}(S{eXj*+F^$ delta 47 zcmbPj*=51y66_MfCBwkLWH^yanyGc)M)e2mo9j8sgqfV!Cx4d?W|W=mEh96zK}H$? 
DI@k?o diff --git a/tests/data/acpi/virt/DSDT.memhp b/tests/data/acpi/virt/DSDT.memhp index d764481440adea59d928a1a48afe0ba1ce135597..8c6ae4dcd57481b374607a21a45b95665c3661cb 100644 GIT binary patch delta 203 zcmexn)^5S&66_MfF2lgU)H0DvnyG2uM)lu}^&;-<@&3WiEM3_Q&Mx7G3p`mTX9x;& z3FK`MC&5Ev-^- gT%Ewh!NA24qt6^4>=f@D5E|?p?CiJMQp}bW00Y4~+yDRo delta 49 zcmZp-_-4lC66_LECe6UW^m-zfG*jc&jq1M{H_u`!6J~Pcnk*z1%;du|IbFY~ FRse_R4;cUe diff --git a/tests/data/acpi/virt/DSDT.pxb b/tests/data/acpi/virt/DSDT.pxb index 9ff22b5ea465d2f678beb2ce9d905861d69a5b87..1e91767c3045bb8569fd7d5dfa991348ed625944 100644 GIT binary patch delta 202 zcmca%|I&fWCD za|z^a5adwz`svb}V6*#>UbRP8HWv#}fl&d7NLU~+X>ulGEl39!2Z*dr;9+2BU`SXX zJZUmm1w>8&Bv+6C5?ROt5dxWFxFkEe$;#Q^*I=_IlaefxH^*dO`Cvw~$qVIWBo|0b fu1?_MVBq42(Pxeic8Yfn2n}`)cJ`aRS6&(bZLK-@ delta 48 zcmaFqaKoOSDV%$87NmG`|iGA`>`Cvxb$&v~(ll>H= E0hq!MH2?qr diff --git a/tests/data/acpi/virt/DSDT.topology b/tests/data/acpi/virt/DSDT.topology index f54f686161935f70897a3c98c631a4a01994d6a4..4fc186675dd2ac21f2f0cff75465474941180ae1 100644 GIT binary patch delta 201 zcmaFn{@sJiCDC&5Ev-^-*x=!Hi~;*DK3NE|8d9 eoxsJxz{L@x&m14@6z?1m8tfeG>^J$evNQnFggb%& delta 71 zcmezF@ywmeCD Date: Wed, 3 Apr 2024 16:34:39 +0800 Subject: [PATCH 226/939] hw/net/virtio-net: fix qemu set used ring flag even vhost started MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cheery-pick from 4c54f5bc8e1d38f15cc35b6a6932d8fbe219c692 When vhost-user or vhost-kernel is handling virtio net datapath, QEMU should not touch used ring. But with vhost-user socket reconnect scenario, in a very rare case (has pending kick event). VRING_USED_F_NO_NOTIFY is set by QEMU in following code path: #0 virtio_queue_split_set_notification (vq=0x7ff5f4c920a8, enable=0) at ../hw/virtio/virtio.c:511 #1 0x0000559d6dbf033b in virtio_queue_set_notification (vq=0x7ff5f4c920a8, enable=0) at ../hw/virtio/virtio.c:576 #2 0x0000559d6dbbbdbc in virtio_net_handle_tx_bh (vdev=0x559d703a6aa0, vq=0x7ff5f4c920a8) at ../hw/net/virtio-net.c:2801 #3 0x0000559d6dbf4791 in virtio_queue_notify_vq (vq=0x7ff5f4c920a8) at ../hw/virtio/virtio.c:2248 #4 0x0000559d6dbf79da in virtio_queue_host_notifier_read (n=0x7ff5f4c9211c) at ../hw/virtio/virtio.c:3525 #5 0x0000559d6d9a5814 in virtio_bus_cleanup_host_notifier (bus=0x559d703a6a20, n=1) at ../hw/virtio/virtio-bus.c:321 #6 0x0000559d6dbf83c9 in virtio_device_stop_ioeventfd_impl (vdev=0x559d703a6aa0) at ../hw/virtio/virtio.c:3774 #7 0x0000559d6d9a55c8 in virtio_bus_stop_ioeventfd (bus=0x559d703a6a20) at ../hw/virtio/virtio-bus.c:259 #8 0x0000559d6d9a53e8 in virtio_bus_grab_ioeventfd (bus=0x559d703a6a20) at ../hw/virtio/virtio-bus.c:199 #9 0x0000559d6dbf841c in virtio_device_grab_ioeventfd (vdev=0x559d703a6aa0) at ../hw/virtio/virtio.c:3783 #10 0x0000559d6d9bde18 in vhost_dev_enable_notifiers (hdev=0x559d707edd70, vdev=0x559d703a6aa0) at ../hw/virtio/vhost.c:1592 #11 0x0000559d6d89a0b8 in vhost_net_start_one (net=0x559d707edd70, dev=0x559d703a6aa0) at ../hw/net/vhost_net.c:266 #12 0x0000559d6d89a6df in vhost_net_start (dev=0x559d703a6aa0, ncs=0x559d7048d890, data_queue_pairs=31, cvq=0) at ../hw/net/vhost_net.c:412 #13 0x0000559d6dbb5b89 in virtio_net_vhost_status (n=0x559d703a6aa0, status=15 '\017') at ../hw/net/virtio-net.c:311 #14 0x0000559d6dbb5e34 in virtio_net_set_status (vdev=0x559d703a6aa0, status=15 '\017') at ../hw/net/virtio-net.c:392 #15 0x0000559d6dbb60d8 in virtio_net_set_link_status (nc=0x559d7048d890) at ../hw/net/virtio-net.c:455 #16 0x0000559d6da64863 in qmp_set_link 
(name=0x559d6f0b83d0 "hostnet1", up=true, errp=0x7ffdd76569f0) at ../net/net.c:1459 #17 0x0000559d6da7226e in net_vhost_user_event (opaque=0x559d6f0b83d0, event=CHR_EVENT_OPENED) at ../net/vhost-user.c:301 #18 0x0000559d6ddc7f63 in chr_be_event (s=0x559d6f2ffea0, event=CHR_EVENT_OPENED) at ../chardev/char.c:62 #19 0x0000559d6ddc7fdc in qemu_chr_be_event (s=0x559d6f2ffea0, event=CHR_EVENT_OPENED) at ../chardev/char.c:82 This issue causes guest kernel stop kicking device and traffic stop. Add vhost_started check in virtio_net_handle_tx_bh to fix this wrong VRING_USED_F_NO_NOTIFY set. Signed-off-by: Yajun Wu Reviewed-by: Jiri Pirko Acked-by: Michael S. Tsirkin Message-ID: <20240402045109.97729-1-yajunw@nvidia.com> [PMD: Use unlikely()] Signed-off-by: Philippe Mathieu-Daudé Signed-off-by: qihao_yewu --- hw/net/virtio-net.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index 9559b3386a..c0a54f2d61 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -2831,6 +2831,10 @@ static void virtio_net_handle_tx_bh(VirtIODevice *vdev, VirtQueue *vq) VirtIONet *n = VIRTIO_NET(vdev); VirtIONetQueue *q = &n->vqs[vq2q(virtio_get_queue_index(vq))]; + if (unlikely(n->vhost_started)) { + return; + } + if (unlikely((n->status & VIRTIO_NET_S_LINK_UP) == 0)) { virtio_net_drop_tx_queue_data(vdev, vq); return; -- Gitee From c2b377814e7874811d7eb98462d5153e966281cf Mon Sep 17 00:00:00 2001 From: Fei Xu Date: Wed, 3 Apr 2024 18:05:25 +0800 Subject: [PATCH 227/939] coro: support live patch for libcare Signed-off-by: Dawei Jiang --- include/qemu/coroutine_int.h | 3 ++- util/coroutine-ucontext.c | 52 ++++++++++++++++++++++++++++++++++++ util/qemu-coroutine.c | 4 +++ 3 files changed, 58 insertions(+), 1 deletion(-) diff --git a/include/qemu/coroutine_int.h b/include/qemu/coroutine_int.h index 1da148552f..11b550a0fc 100644 --- a/include/qemu/coroutine_int.h +++ b/include/qemu/coroutine_int.h @@ -73,5 +73,6 @@ Coroutine *qemu_coroutine_new(void); void qemu_coroutine_delete(Coroutine *co); CoroutineAction qemu_coroutine_switch(Coroutine *from, Coroutine *to, CoroutineAction action); - +void qemu_coroutine_info_add(const Coroutine *co_); +void qemu_coroutine_info_delete(const Coroutine *co_); #endif diff --git a/util/coroutine-ucontext.c b/util/coroutine-ucontext.c index 7b304c79d9..650c21846d 100644 --- a/util/coroutine-ucontext.c +++ b/util/coroutine-ucontext.c @@ -80,6 +80,19 @@ union cc_arg { int i[2]; }; +/** + * coroutines list for libcare + */ +struct CoroutineInformation { + sigjmp_buf *env; + QLIST_ENTRY(CoroutineInformation) next; +}; + +static QemuMutex coro_mtx; +QLIST_HEAD(, CoroutineInformation) coro_info_list = QLIST_HEAD_INITIALIZER(pool); +int coro_env_offset = offsetof(struct CoroutineInformation, env); +int coro_next_offset = offsetof(struct CoroutineInformation, next); + /* * QEMU_ALWAYS_INLINE only does so if __OPTIMIZE__, so we cannot use it. * always_inline is required to avoid TSan runtime fatal errors. 
@@ -340,3 +353,42 @@ bool qemu_in_coroutine(void) return self && self->caller; } + +static void __attribute__((constructor)) coro_mutex_init(void) +{ + qemu_mutex_init(&coro_mtx); +} + +void qemu_coroutine_info_add(const Coroutine *co_) +{ + CoroutineUContext *co; + struct CoroutineInformation *coro_info; + + /* save coroutine env to coro_info_list */ + co = DO_UPCAST(CoroutineUContext, base, co_); + coro_info = g_malloc0(sizeof(struct CoroutineInformation)); + coro_info->env = &co->env; + + qemu_mutex_lock(&coro_mtx); + QLIST_INSERT_HEAD(&coro_info_list, coro_info, next); + qemu_mutex_unlock(&coro_mtx); +} + +void qemu_coroutine_info_delete(const Coroutine *co_) +{ + CoroutineUContext *co; + struct CoroutineInformation *coro_info; + + /* Remove relative coroutine env info from coro_info_list */ + co = DO_UPCAST(CoroutineUContext, base, co_); + + qemu_mutex_lock(&coro_mtx); + QLIST_FOREACH(coro_info, &coro_info_list, next) { + if (coro_info->env == &co->env) { + QLIST_REMOVE(coro_info, next); + g_free(coro_info); + break; + } + } + qemu_mutex_unlock(&coro_mtx); +} diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c index 5fd2dbaf8b..f550214484 100644 --- a/util/qemu-coroutine.c +++ b/util/qemu-coroutine.c @@ -89,6 +89,8 @@ Coroutine *qemu_coroutine_create(CoroutineEntry *entry, void *opaque) co = qemu_coroutine_new(); } + qemu_coroutine_info_add(co); + co->entry = entry; co->entry_arg = opaque; QSIMPLEQ_INIT(&co->co_queue_wakeup); @@ -99,6 +101,8 @@ static void coroutine_delete(Coroutine *co) { co->caller = NULL; + qemu_coroutine_info_delete(co); + if (IS_ENABLED(CONFIG_COROUTINE_POOL)) { if (release_pool_size < qatomic_read(&pool_max_size) * 2) { QSLIST_INSERT_HEAD_ATOMIC(&release_pool, co, pool_next); -- Gitee From b54d853396820150735294107e2e3d060724de04 Mon Sep 17 00:00:00 2001 From: qihao Date: Mon, 8 Apr 2024 14:39:43 +0800 Subject: [PATCH 228/939] block/virtio-blk: Fix memory leak from virtio_blk_zone_report cheery-pick from bbdf9023665f409113cb07b463732861af63fb47 This modification ensures that in scenarios where the buffer size is insufficient for a zone report, the function will now properly set an error status and proceed to a cleanup label, instead of merely returning. 
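For background, the fix follows the usual single-exit cleanup idiom in C: record an error status and fall through to the common completion path, so the request that was already popped from the virtqueue is always released. A minimal stand-alone sketch of that pattern (simplified stand-in types, not the QEMU code):

    /*
     * Sketch only: "struct req" and the helpers stand in for the real
     * VirtIOBlockReq handling; the point is the goto-to-cleanup shape.
     */
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct req {
        size_t in_len;              /* size of the guest-supplied buffer */
    };

    static void complete_and_free(struct req *r, uint8_t status)
    {
        printf("completed with status %d\n", status);
        free(r);                    /* cleanup happens exactly once, here */
    }

    static void handle_zone_report(struct req *r, size_t min_len)
    {
        uint8_t err_status = 0;

        if (r->in_len < min_len) {
            err_status = 1;         /* e.g. "invalid command" */
            goto out;               /* a bare return here would leak r */
        }

        /* ... fill in the zone report ... */

    out:
        complete_and_free(r, err_status);
    }

    int main(void)
    {
        struct req *r = malloc(sizeof(*r));

        r->in_len = 4;              /* deliberately too small */
        handle_zone_report(r, 16);
        return 0;
    }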
The following ASAN log reveals it: ==1767400==ERROR: LeakSanitizer: detected memory leaks Direct leak of 312 byte(s) in 1 object(s) allocated from: #0 0x64ac7b3280cd in malloc llvm/compiler-rt/lib/asan/asan_malloc_linux.cpp:129:3 #1 0x735b02fb9738 in g_malloc (/lib/x86_64-linux-gnu/libglib-2.0.so.0+0x5e738) #2 0x64ac7d23be96 in virtqueue_split_pop hw/virtio/virtio.c:1612:12 #3 0x64ac7d23728a in virtqueue_pop hw/virtio/virtio.c:1783:16 #4 0x64ac7cfcaacd in virtio_blk_get_request hw/block/virtio-blk.c:228:27 #5 0x64ac7cfca7c7 in virtio_blk_handle_vq hw/block/virtio-blk.c:1123:23 #6 0x64ac7cfecb95 in virtio_blk_handle_output hw/block/virtio-blk.c:1157:5 Signed-off-by: Zheyu Ma Message-id: 20240404120040.1951466-1-zheyuma97@gmail.com Signed-off-by: Stefan Hajnoczi Signed-off-by: qihao_yewu --- hw/block/virtio-blk.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c index 1ebc9188c0..2eb096a6dc 100644 --- a/hw/block/virtio-blk.c +++ b/hw/block/virtio-blk.c @@ -790,7 +790,8 @@ static void virtio_blk_handle_zone_report(VirtIOBlockReq *req, sizeof(struct virtio_blk_zone_report) + sizeof(struct virtio_blk_zone_descriptor)) { virtio_error(vdev, "in buffer too small for zone report"); - return; + err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD; + goto out; } /* start byte offset of the zone report */ -- Gitee From 2fc8029b9e274a0dbedc55b6b114b29e003b32ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E5=A9=A720201110?= Date: Mon, 8 Apr 2024 04:32:11 -0400 Subject: [PATCH 229/939] hw/nvme: fix -Werror=maybe-uninitialized MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ../hw/nvme/ctrl.c:6081:21: error: ‘result’ may be used uninitialized [-Werror=maybe-uninitialized] It's not obvious that 'result' is set in all code paths. When &result is a returned argument, it's even less clear. Looking at various assignments, 0 seems to be a suitable default value. Signed-off-by: Marc-André Lureau Signed-off-by: Liu Jing --- hw/nvme/ctrl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index 7a56e7b79b..237b5c8871 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -5882,7 +5882,7 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req) uint32_t dw10 = le32_to_cpu(cmd->cdw10); uint32_t dw11 = le32_to_cpu(cmd->cdw11); uint32_t nsid = le32_to_cpu(cmd->nsid); - uint32_t result; + uint32_t result = 0; uint8_t fid = NVME_GETSETFEAT_FID(dw10); NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10); uint16_t iv; -- Gitee From 8c1ad2043705184da00d39250402a70f403d14a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Thu, 4 Apr 2024 20:56:11 +0200 Subject: [PATCH 230/939] hw/virtio: Introduce virtio_bh_new_guarded() helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce virtio_bh_new_guarded(), similar to qemu_bh_new_guarded() but using the transport memory guard, instead of the device one (there can only be one virtio device per virtio bus). Inspired-by: Gerd Hoffmann Reviewed-by: Gerd Hoffmann Acked-by: Michael S. Tsirkin Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Michael S. 
Tsirkin Message-Id: <20240409105537.18308-2-philmd@linaro.org> --- hw/virtio/virtio.c | 10 ++++++++++ include/hw/virtio/virtio.h | 7 +++++++ 2 files changed, 17 insertions(+) diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index d00effe4d5..202aae868e 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -4148,3 +4148,13 @@ static void virtio_register_types(void) } type_init(virtio_register_types) + +QEMUBH *virtio_bh_new_guarded_full(DeviceState *dev, + QEMUBHFunc *cb, void *opaque, + const char *name) +{ + DeviceState *transport = qdev_get_parent_bus(dev)->parent; + + return qemu_bh_new_full(cb, opaque, name, + &transport->mem_reentrancy_guard); +} diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h index e612441357..60494aed62 100644 --- a/include/hw/virtio/virtio.h +++ b/include/hw/virtio/virtio.h @@ -22,6 +22,7 @@ #include "standard-headers/linux/virtio_config.h" #include "standard-headers/linux/virtio_ring.h" #include "qom/object.h" +#include "block/aio.h" /* * A guest should never accept this. It implies negotiation is broken @@ -510,4 +511,10 @@ static inline bool virtio_device_disabled(VirtIODevice *vdev) bool virtio_legacy_allowed(VirtIODevice *vdev); bool virtio_legacy_check_disabled(VirtIODevice *vdev); +QEMUBH *virtio_bh_new_guarded_full(DeviceState *dev, + QEMUBHFunc *cb, void *opaque, + const char *name); +#define virtio_bh_new_guarded(dev, cb, opaque) \ + virtio_bh_new_guarded_full((dev), (cb), (opaque), (stringify(cb))) + #endif -- Gitee From e72177cc2b3a4425c4be5ca8cc12bc99e63e2788 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Thu, 4 Apr 2024 20:56:27 +0200 Subject: [PATCH 231/939] hw/display/virtio-gpu: Protect from DMA re-entrancy bugs(CVE-2024-3446) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace qemu_bh_new_guarded() by virtio_bh_new_guarded() so the bus and device use the same guard. Otherwise the DMA-reentrancy protection can be bypassed: $ cat << EOF | qemu-system-i386 -display none -nodefaults \ -machine q35,accel=qtest \ -m 512M \ -device virtio-gpu \ -qtest stdio outl 0xcf8 0x80000820 outl 0xcfc 0xe0004000 outl 0xcf8 0x80000804 outw 0xcfc 0x06 write 0xe0004030 0x4 0x024000e0 write 0xe0004028 0x1 0xff write 0xe0004020 0x4 0x00009300 write 0xe000401c 0x1 0x01 write 0x101 0x1 0x04 write 0x103 0x1 0x1c write 0x9301c8 0x1 0x18 write 0x105 0x1 0x1c write 0x107 0x1 0x1c write 0x109 0x1 0x1c write 0x10b 0x1 0x00 write 0x10d 0x1 0x00 write 0x10f 0x1 0x00 write 0x111 0x1 0x00 write 0x113 0x1 0x00 write 0x115 0x1 0x00 write 0x117 0x1 0x00 write 0x119 0x1 0x00 write 0x11b 0x1 0x00 write 0x11d 0x1 0x00 write 0x11f 0x1 0x00 write 0x121 0x1 0x00 write 0x123 0x1 0x00 write 0x125 0x1 0x00 write 0x127 0x1 0x00 write 0x129 0x1 0x00 write 0x12b 0x1 0x00 write 0x12d 0x1 0x00 write 0x12f 0x1 0x00 write 0x131 0x1 0x00 write 0x133 0x1 0x00 write 0x135 0x1 0x00 write 0x137 0x1 0x00 write 0x139 0x1 0x00 write 0xe0007003 0x1 0x00 EOF ... 
================================================================= ==276099==ERROR: AddressSanitizer: heap-use-after-free on address 0x60d000011178 at pc 0x562cc3b736c7 bp 0x7ffed49dee60 sp 0x7ffed49dee58 READ of size 8 at 0x60d000011178 thread T0 #0 0x562cc3b736c6 in virtio_gpu_ctrl_response hw/display/virtio-gpu.c:180:42 #1 0x562cc3b7c40b in virtio_gpu_ctrl_response_nodata hw/display/virtio-gpu.c:192:5 #2 0x562cc3b7c40b in virtio_gpu_simple_process_cmd hw/display/virtio-gpu.c:1015:13 #3 0x562cc3b82873 in virtio_gpu_process_cmdq hw/display/virtio-gpu.c:1050:9 #4 0x562cc4a85514 in aio_bh_call util/async.c:169:5 #5 0x562cc4a85c52 in aio_bh_poll util/async.c:216:13 #6 0x562cc4a1a79b in aio_dispatch util/aio-posix.c:423:5 #7 0x562cc4a8a2da in aio_ctx_dispatch util/async.c:358:5 #8 0x7f36840547a8 in g_main_context_dispatch (/lib/x86_64-linux-gnu/libglib-2.0.so.0+0x547a8) #9 0x562cc4a8b753 in glib_pollfds_poll util/main-loop.c:290:9 #10 0x562cc4a8b753 in os_host_main_loop_wait util/main-loop.c:313:5 #11 0x562cc4a8b753 in main_loop_wait util/main-loop.c:592:11 #12 0x562cc3938186 in qemu_main_loop system/runstate.c:782:9 #13 0x562cc43b7af5 in qemu_default_main system/main.c:37:14 #14 0x7f3683a6c189 in __libc_start_call_main csu/../sysdeps/nptl/libc_start_call_main.h:58:16 #15 0x7f3683a6c244 in __libc_start_main csu/../csu/libc-start.c:381:3 #16 0x562cc2a58ac0 in _start (qemu-system-i386+0x231bac0) 0x60d000011178 is located 56 bytes inside of 136-byte region [0x60d000011140,0x60d0000111c8) freed by thread T0 here: #0 0x562cc2adb662 in __interceptor_free (qemu-system-i386+0x239e662) #1 0x562cc3b86b21 in virtio_gpu_reset hw/display/virtio-gpu.c:1524:9 #2 0x562cc416e20e in virtio_reset hw/virtio/virtio.c:2145:9 #3 0x562cc37c5644 in virtio_pci_reset hw/virtio/virtio-pci.c:2249:5 #4 0x562cc4233758 in memory_region_write_accessor system/memory.c:497:5 #5 0x562cc4232eea in access_with_adjusted_size system/memory.c:573:18 previously allocated by thread T0 here: #0 0x562cc2adb90e in malloc (qemu-system-i386+0x239e90e) #1 0x7f368405a678 in g_malloc (/lib/x86_64-linux-gnu/libglib-2.0.so.0+0x5a678) #2 0x562cc4163ffc in virtqueue_split_pop hw/virtio/virtio.c:1612:12 #3 0x562cc4163ffc in virtqueue_pop hw/virtio/virtio.c:1783:16 #4 0x562cc3b91a95 in virtio_gpu_handle_ctrl hw/display/virtio-gpu.c:1112:15 #5 0x562cc4a85514 in aio_bh_call util/async.c:169:5 #6 0x562cc4a85c52 in aio_bh_poll util/async.c:216:13 #7 0x562cc4a1a79b in aio_dispatch util/aio-posix.c:423:5 SUMMARY: AddressSanitizer: heap-use-after-free hw/display/virtio-gpu.c:180:42 in virtio_gpu_ctrl_response With this change, the same reproducer triggers: qemu-system-i386: warning: Blocked re-entrant IO on MemoryRegion: virtio-pci-common-virtio-gpu at addr: 0x6 Fixes: CVE-2024-3446 Cc: qemu-stable@nongnu.org Reported-by: Alexander Bulekov Reported-by: Yongkang Jia Reported-by: Xiao Lei Reported-by: Yiming Tao Buglink: https://bugs.launchpad.net/qemu/+bug/1888606 Reviewed-by: Gerd Hoffmann Acked-by: Michael S. Tsirkin Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Michael S. 
Tsirkin Message-Id: <20240409105537.18308-3-philmd@linaro.org> --- hw/display/virtio-gpu.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/hw/display/virtio-gpu.c b/hw/display/virtio-gpu.c index b02d1e3a4c..a714638822 100644 --- a/hw/display/virtio-gpu.c +++ b/hw/display/virtio-gpu.c @@ -1456,10 +1456,8 @@ void virtio_gpu_device_realize(DeviceState *qdev, Error **errp) g->ctrl_vq = virtio_get_queue(vdev, 0); g->cursor_vq = virtio_get_queue(vdev, 1); - g->ctrl_bh = qemu_bh_new_guarded(virtio_gpu_ctrl_bh, g, - &qdev->mem_reentrancy_guard); - g->cursor_bh = qemu_bh_new_guarded(virtio_gpu_cursor_bh, g, - &qdev->mem_reentrancy_guard); + g->ctrl_bh = virtio_bh_new_guarded(qdev, virtio_gpu_ctrl_bh, g); + g->cursor_bh = virtio_bh_new_guarded(qdev, virtio_gpu_cursor_bh, g); g->reset_bh = qemu_bh_new(virtio_gpu_reset_bh, g); qemu_cond_init(&g->reset_cond); QTAILQ_INIT(&g->reslist); -- Gitee From fa62831c301fa2a1d4226e0fefdeb6b7a280fca6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Thu, 4 Apr 2024 20:56:35 +0200 Subject: [PATCH 232/939] hw/char/virtio-serial-bus: Protect from DMA re-entrancy bugs(CVE-2024-3446) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace qemu_bh_new_guarded() by virtio_bh_new_guarded() so the bus and device use the same guard. Otherwise the DMA-reentrancy protection can be bypassed. Fixes: CVE-2024-3446 Cc: qemu-stable@nongnu.org Suggested-by: Alexander Bulekov Reviewed-by: Gerd Hoffmann Acked-by: Michael S. Tsirkin Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Michael S. Tsirkin Message-Id: <20240409105537.18308-4-philmd@linaro.org> --- hw/char/virtio-serial-bus.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/hw/char/virtio-serial-bus.c b/hw/char/virtio-serial-bus.c index 44906057be..096214b11b 100644 --- a/hw/char/virtio-serial-bus.c +++ b/hw/char/virtio-serial-bus.c @@ -990,8 +990,7 @@ static void virtser_port_device_realize(DeviceState *dev, Error **errp) return; } - port->bh = qemu_bh_new_guarded(flush_queued_data_bh, port, - &dev->mem_reentrancy_guard); + port->bh = virtio_bh_new_guarded(dev, flush_queued_data_bh, port); port->elem = NULL; } -- Gitee From edb30c972ba68b03cc5febefc880698573a17b04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Thu, 4 Apr 2024 20:56:41 +0200 Subject: [PATCH 233/939] hw/virtio/virtio-crypto: Protect from DMA re-entrancy bugs(CVE-2024-3446) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace qemu_bh_new_guarded() by virtio_bh_new_guarded() so the bus and device use the same guard. Otherwise the DMA-reentrancy protection can be bypassed. Fixes: CVE-2024-3446 Cc: qemu-stable@nongnu.org Suggested-by: Alexander Bulekov Reviewed-by: Gerd Hoffmann Acked-by: Michael S. Tsirkin Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Michael S. 
Tsirkin Message-Id: <20240409105537.18308-5-philmd@linaro.org> --- hw/virtio/virtio-crypto.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/virtio/virtio-crypto.c b/hw/virtio/virtio-crypto.c index 0e2cc8d5a8..4aaced74be 100644 --- a/hw/virtio/virtio-crypto.c +++ b/hw/virtio/virtio-crypto.c @@ -1080,8 +1080,8 @@ static void virtio_crypto_device_realize(DeviceState *dev, Error **errp) vcrypto->vqs[i].dataq = virtio_add_queue(vdev, 1024, virtio_crypto_handle_dataq_bh); vcrypto->vqs[i].dataq_bh = - qemu_bh_new_guarded(virtio_crypto_dataq_bh, &vcrypto->vqs[i], - &dev->mem_reentrancy_guard); + virtio_bh_new_guarded(dev, virtio_crypto_dataq_bh, + &vcrypto->vqs[i]); vcrypto->vqs[i].vcrypto = vcrypto; } -- Gitee From b628859b936c6d6348d2af9e6b6d2887c697b9b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Tue, 9 Apr 2024 16:19:27 +0200 Subject: [PATCH 234/939] hw/sd/sdhci: Do not update TRNMOD when Command Inhibit (DAT) is set(CVE-2024-3447) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per "SD Host Controller Standard Specification Version 3.00": * 2.2.5 Transfer Mode Register (Offset 00Ch) Writes to this register shall be ignored when the Command Inhibit (DAT) in the Present State register is 1. Do not update the TRNMOD register when Command Inhibit (DAT) bit is set to avoid the present-status register going out of sync, leading to malicious guest using DMA mode and overflowing the FIFO buffer: $ cat << EOF | qemu-system-i386 \ -display none -nographic -nodefaults \ -machine accel=qtest -m 512M \ -device sdhci-pci,sd-spec-version=3 \ -device sd-card,drive=mydrive \ -drive if=none,index=0,file=null-co://,format=raw,id=mydrive \ -qtest stdio outl 0xcf8 0x80001013 outl 0xcfc 0x91 outl 0xcf8 0x80001001 outl 0xcfc 0x06000000 write 0x9100002c 0x1 0x05 write 0x91000058 0x1 0x16 write 0x91000005 0x1 0x04 write 0x91000028 0x1 0x08 write 0x16 0x1 0x21 write 0x19 0x1 0x20 write 0x9100000c 0x1 0x01 write 0x9100000e 0x1 0x20 write 0x9100000f 0x1 0x00 write 0x9100000c 0x1 0x00 write 0x91000020 0x1 0x00 EOF Stack trace (part): ================================================================= ==89993==ERROR: AddressSanitizer: heap-buffer-overflow on address 0x615000029900 at pc 0x55d5f885700d bp 0x7ffc1e1e9470 sp 0x7ffc1e1e9468 WRITE of size 1 at 0x615000029900 thread T0 #0 0x55d5f885700c in sdhci_write_dataport hw/sd/sdhci.c:564:39 #1 0x55d5f8849150 in sdhci_write hw/sd/sdhci.c:1223:13 #2 0x55d5fa01db63 in memory_region_write_accessor system/memory.c:497:5 #3 0x55d5fa01d245 in access_with_adjusted_size system/memory.c:573:18 #4 0x55d5fa01b1a9 in memory_region_dispatch_write system/memory.c:1521:16 #5 0x55d5fa09f5c9 in flatview_write_continue system/physmem.c:2711:23 #6 0x55d5fa08f78b in flatview_write system/physmem.c:2753:12 #7 0x55d5fa08f258 in address_space_write system/physmem.c:2860:18 ... 
0x615000029900 is located 0 bytes to the right of 512-byte region [0x615000029700,0x615000029900) allocated by thread T0 here: #0 0x55d5f7237b27 in __interceptor_calloc #1 0x7f9e36dd4c50 in g_malloc0 #2 0x55d5f88672f7 in sdhci_pci_realize hw/sd/sdhci-pci.c:36:5 #3 0x55d5f844b582 in pci_qdev_realize hw/pci/pci.c:2092:9 #4 0x55d5fa2ee74b in device_set_realized hw/core/qdev.c:510:13 #5 0x55d5fa325bfb in property_set_bool qom/object.c:2358:5 #6 0x55d5fa31ea45 in object_property_set qom/object.c:1472:5 #7 0x55d5fa332509 in object_property_set_qobject om/qom-qobject.c:28:10 #8 0x55d5fa31f6ed in object_property_set_bool qom/object.c:1541:15 #9 0x55d5fa2e2948 in qdev_realize hw/core/qdev.c:292:12 #10 0x55d5f8eed3f1 in qdev_device_add_from_qdict system/qdev-monitor.c:719:10 #11 0x55d5f8eef7ff in qdev_device_add system/qdev-monitor.c:738:11 #12 0x55d5f8f211f0 in device_init_func system/vl.c:1200:11 #13 0x55d5fad0877d in qemu_opts_foreach util/qemu-option.c:1135:14 #14 0x55d5f8f0df9c in qemu_create_cli_devices system/vl.c:2638:5 #15 0x55d5f8f0db24 in qmp_x_exit_preconfig system/vl.c:2706:5 #16 0x55d5f8f14dc0 in qemu_init system/vl.c:3737:9 ... SUMMARY: AddressSanitizer: heap-buffer-overflow hw/sd/sdhci.c:564:39 in sdhci_write_dataport Add assertions to ensure the fifo_buffer[] is not overflowed by malicious accesses to the Buffer Data Port register. Fixes: CVE-2024-3447 Cc: qemu-stable@nongnu.org Fixes: d7dfca0807 ("hw/sdhci: introduce standard SD host controller") Buglink: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=58813 Reported-by: Alexander Bulekov Reported-by: Chuhong Yuan Signed-off-by: Peter Maydell Message-Id: Signed-off-by: Philippe Mathieu-Daudé Message-Id: <20240409145524.27913-1-philmd@linaro.org> --- hw/sd/sdhci.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/hw/sd/sdhci.c b/hw/sd/sdhci.c index 40473b0db0..e95ea34895 100644 --- a/hw/sd/sdhci.c +++ b/hw/sd/sdhci.c @@ -473,6 +473,7 @@ static uint32_t sdhci_read_dataport(SDHCIState *s, unsigned size) } for (i = 0; i < size; i++) { + assert(s->data_count < s->buf_maxsz); value |= s->fifo_buffer[s->data_count] << i * 8; s->data_count++; /* check if we've read all valid data (blksize bytes) from buffer */ @@ -561,6 +562,7 @@ static void sdhci_write_dataport(SDHCIState *s, uint32_t value, unsigned size) } for (i = 0; i < size; i++) { + assert(s->data_count < s->buf_maxsz); s->fifo_buffer[s->data_count] = value & 0xFF; s->data_count++; value >>= 8; @@ -1208,6 +1210,12 @@ sdhci_write(void *opaque, hwaddr offset, uint64_t val, unsigned size) if (!(s->capareg & R_SDHC_CAPAB_SDMA_MASK)) { value &= ~SDHC_TRNS_DMA; } + + /* TRNMOD writes are inhibited while Command Inhibit (DAT) is true */ + if (s->prnsts & SDHC_DATA_INHIBIT) { + mask |= 0xffff; + } + MASKED_WRITE(s->trnmod, mask, value & SDHC_TRNMOD_MASK); MASKED_WRITE(s->cmdreg, mask >> 16, value >> 16); -- Gitee From 0ec1c95eea8c68243919ee4f8cd28b9a97dfc2f0 Mon Sep 17 00:00:00 2001 From: Keqian Zhu Date: Mon, 15 Apr 2024 22:37:53 +0800 Subject: [PATCH 235/939] arm/virt: Use separate filed to identify cpu-hotplug enable The mc->has_hotpluggable_cpus should not be modified after machine class init. 
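The underlying distinction: a MachineClass is class-level state shared by every machine instance and set up once at class init, while each instantiated machine owns its own VirtMachineState, so the patch records the runtime decision in the per-instance vms->cpu_hotplug_enabled flag instead of flipping the shared class field. A minimal stand-alone sketch of that split (simplified stand-in types, not the QEMU structures):

    #include <stdbool.h>
    #include <stdio.h>

    typedef struct MachineClassSketch {     /* shared, initialized once */
        bool has_hotpluggable_cpus;
    } MachineClassSketch;

    typedef struct MachineStateSketch {     /* one per instantiated machine */
        const MachineClassSketch *mc;
        bool cpu_hotplug_enabled;           /* per-instance decision */
    } MachineStateSketch;

    static void machine_init(MachineStateSketch *ms, bool runtime_support)
    {
        /* decide per instance; never write through ms->mc */
        ms->cpu_hotplug_enabled = ms->mc->has_hotpluggable_cpus &&
                                  runtime_support;
    }

    int main(void)
    {
        MachineClassSketch mc = { .has_hotpluggable_cpus = true };
        MachineStateSketch a = { .mc = &mc };
        MachineStateSketch b = { .mc = &mc };

        machine_init(&a, false);    /* e.g. missing KVM support */
        machine_init(&b, true);

        printf("a: %d, b: %d, class untouched: %d\n",
               a.cpu_hotplug_enabled, b.cpu_hotplug_enabled,
               mc.has_hotpluggable_cpus);
        return 0;
    }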
Signed-off-by: Keqian Zhu --- accel/kvm/kvm-all.c | 6 ++++++ hw/arm/virt-acpi-build.c | 13 +++++-------- hw/arm/virt.c | 20 +++++++++++++------- include/hw/arm/virt.h | 1 + include/sysemu/kvm.h | 2 ++ include/sysemu/kvm_int.h | 1 + target/arm/kvm.c | 7 +++---- 7 files changed, 31 insertions(+), 19 deletions(-) diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index 75a3075c14..b791aad1d6 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -3603,6 +3603,11 @@ bool kvm_kernel_irqchip_split(void) return kvm_state->kernel_irqchip_split == ON_OFF_AUTO_ON; } +bool kvm_smccc_filter_enabled(void) +{ + return kvm_state->kvm_smccc_filter_enabled; +} + static void kvm_get_dirty_ring_size(Object *obj, Visitor *v, const char *name, void *opaque, Error **errp) @@ -3648,6 +3653,7 @@ static void kvm_accel_instance_init(Object *obj) /* KVM dirty ring is by default off */ s->kvm_dirty_ring_size = 0; s->kvm_dirty_ring_with_bitmap = false; + s->kvm_smccc_filter_enabled = false; s->kvm_eager_split_size = 0; s->notify_vmexit = NOTIFY_VMEXIT_OPTION_RUN; s->notify_window = 0; diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c index 46642efac4..99296fc6d8 100644 --- a/hw/arm/virt-acpi-build.c +++ b/hw/arm/virt-acpi-build.c @@ -779,12 +779,10 @@ static void build_append_gicr(GArray *table_data, uint64_t base, uint32_t size) build_append_int_noprefix(table_data, size, 4); /* Discovery Range Length */ } -static uint32_t virt_acpi_get_gicc_flags(CPUState *cpu) +static uint32_t virt_acpi_get_gicc_flags(CPUState *cpu, VirtMachineState *vms) { - MachineClass *mc = MACHINE_GET_CLASS(qdev_get_machine()); - /* can only exist in 'enabled' state */ - if (!mc->has_hotpluggable_cpus) { + if (!vms->cpu_hotplug_enabled) { return 1; } @@ -842,7 +840,7 @@ build_madt(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) uint64_t physical_base_address = 0, gich = 0, gicv = 0; uint32_t vgic_interrupt = vms->virt ? ARCH_GIC_MAINT_IRQ : 0; uint32_t pmu_interrupt = vms->pmu ? VIRTUAL_PMU_IRQ : 0; - uint32_t flags = virt_acpi_get_gicc_flags(cpu); + uint32_t flags = virt_acpi_get_gicc_flags(cpu, vms); uint64_t mpidr = qemu_get_cpu_archid(i); if (vms->gic_version == VIRT_GIC_VERSION_2) { @@ -1003,7 +1001,6 @@ static void build_dsdt(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) { VirtMachineClass *vmc = VIRT_MACHINE_GET_CLASS(vms); - MachineClass *mc = MACHINE_GET_CLASS(vms); Aml *scope, *dsdt; MachineState *ms = MACHINE(vms); const MemMapEntry *memmap = vms->memmap; @@ -1020,8 +1017,8 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) * the RTC ACPI device at all when using UEFI. 
*/ scope = aml_scope("\\_SB"); - /* if GED is enabled then cpus AML shall be added as part build_cpus_aml */ - if (mc->has_hotpluggable_cpus) { + + if (vms->cpu_hotplug_enabled) { CPUHotplugFeatures opts = { .acpi_1_compatible = false, .has_legacy_cphp = false diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 73b29c7f73..44931355d6 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -756,7 +756,7 @@ static void virt_add_gic_cpuhp_notifier(VirtMachineState *vms) { MachineClass *mc = MACHINE_GET_CLASS(vms); - if (mc->has_hotpluggable_cpus) { + if (mc->has_hotpluggable_cpus && vms->gic_version >= VIRT_GIC_VERSION_3) { Notifier *cpuhp_notifier = gicv3_cpuhp_notifier(vms->gic); notifier_list_add(&vms->cpuhp_notifiers, cpuhp_notifier); } @@ -2498,11 +2498,16 @@ static void machvirt_init(MachineState *machine) has_ged = has_ged && firmware_loaded && virt_is_acpi_enabled(vms) && !!object_class_dynamic_cast(cpu_class, TYPE_AARCH64_CPU); + if (tcg_enabled() || hvf_enabled() || qtest_enabled() || + (kvm_enabled() && !kvm_smccc_filter_enabled()) || (vms->gic_version < VIRT_GIC_VERSION_3) || !has_ged) { - mc->has_hotpluggable_cpus = false; + vms->cpu_hotplug_enabled = false; + } else { + vms->cpu_hotplug_enabled = true; } - if (!mc->has_hotpluggable_cpus) { + + if (!vms->cpu_hotplug_enabled) { if (machine->smp.max_cpus > smp_cpus) { warn_report("cpu hotplug feature has been disabled"); } @@ -3174,7 +3179,6 @@ static void virt_cpu_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, { VirtMachineState *vms = VIRT_MACHINE(hotplug_dev); MachineState *ms = MACHINE(hotplug_dev); - MachineClass *mc = MACHINE_GET_CLASS(ms); ARMCPU *cpu = ARM_CPU(dev); CPUState *cs = CPU(dev); CPUArchId *cpu_slot; @@ -3218,7 +3222,7 @@ static void virt_cpu_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, return; } - if (cs->cpu_index >= ms->smp.cpus && !mc->has_hotpluggable_cpus) { + if (cs->cpu_index >= ms->smp.cpus && !vms->cpu_hotplug_enabled) { error_setg(errp, "CPU [cold|hot]plug not supported on this machine"); return; } @@ -3304,7 +3308,6 @@ fail: static void virt_cpu_unplug_request(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { - MachineClass *mc = MACHINE_GET_CLASS(qdev_get_machine()); VirtMachineState *vms = VIRT_MACHINE(hotplug_dev); HotplugHandlerClass *hhc; ARMCPU *cpu = ARM_CPU(dev); @@ -3316,7 +3319,7 @@ static void virt_cpu_unplug_request(HotplugHandler *hotplug_dev, return; } - if (!mc->has_hotpluggable_cpus) { + if (!vms->cpu_hotplug_enabled) { error_setg(errp, "CPU hot(un)plug not supported on this machine"); return; } @@ -3780,6 +3783,9 @@ static void virt_instance_init(Object *obj) /* EL2 is also disabled by default, for similar reasons */ vms->virt = false; + /* CPU hotplug is enabled by default */ + vms->cpu_hotplug_enabled = true; + /* High memory is enabled by default */ vms->highmem = true; vms->highmem_compact = !vmc->no_highmem_compact; diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h index ae0f5beb26..138531f9c1 100644 --- a/include/hw/arm/virt.h +++ b/include/hw/arm/virt.h @@ -153,6 +153,7 @@ struct VirtMachineState { bool its; bool tcg_its; bool virt; + bool cpu_hotplug_enabled; bool ras; bool mte; bool dtb_randomness; diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index e534411ddc..cfa77cc15b 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -492,6 +492,8 @@ bool kvm_kernel_irqchip_allowed(void); bool kvm_kernel_irqchip_required(void); bool kvm_kernel_irqchip_split(void); +bool kvm_smccc_filter_enabled(void); + /** * 
kvm_arch_irqchip_create: * @KVMState: The KVMState pointer diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h index fd846394be..b2d2c59477 100644 --- a/include/sysemu/kvm_int.h +++ b/include/sysemu/kvm_int.h @@ -112,6 +112,7 @@ struct KVMState uint64_t kvm_dirty_ring_bytes; /* Size of the per-vcpu dirty ring */ uint32_t kvm_dirty_ring_size; /* Number of dirty GFNs per ring */ bool kvm_dirty_ring_with_bitmap; + bool kvm_smccc_filter_enabled; uint64_t kvm_eager_split_size; /* Eager Page Splitting chunk size */ struct KVMDirtyRingReaper reaper; NotifyVmexitOption notify_vmexit; diff --git a/target/arm/kvm.c b/target/arm/kvm.c index 19783d567f..12c1b4b328 100644 --- a/target/arm/kvm.c +++ b/target/arm/kvm.c @@ -321,12 +321,11 @@ int kvm_arch_init(MachineState *ms, KVMState *s) if (kvm_arm_set_smccc_filter(PSCI_0_2_FN64_CPU_ON, KVM_SMCCC_FILTER_FWD_TO_USER)) { error_report("CPU On PSCI-to-user-space fwd filter install failed"); - mc->has_hotpluggable_cpus = false; - } - if (kvm_arm_set_smccc_filter(PSCI_0_2_FN_CPU_OFF, + } else if (kvm_arm_set_smccc_filter(PSCI_0_2_FN_CPU_OFF, KVM_SMCCC_FILTER_FWD_TO_USER)) { error_report("CPU Off PSCI-to-user-space fwd filter install failed"); - mc->has_hotpluggable_cpus = false; + } else { + s->kvm_smccc_filter_enabled = true; } } -- Gitee From 4a3d9e9dc874f6825b8b5f18a4dece1609d48d2f Mon Sep 17 00:00:00 2001 From: Keqian Zhu Date: Mon, 15 Apr 2024 22:40:29 +0800 Subject: [PATCH 236/939] arm/virt: Use max_cpus to calculate redist1_count When cpu hotplug is enabled, the redist1_count should include all possible cpus. Signed-off-by: Keqian Zhu --- hw/arm/virt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 44931355d6..e4473354d4 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -959,7 +959,7 @@ static void create_gic(VirtMachineState *vms, MemoryRegion *mem) virt_redist_capacity(vms, VIRT_HIGH_GIC_REDIST2); qlist_append_int(redist_region_count, - MIN(smp_cpus - redist0_count, redist1_capacity)); + MIN(max_cpus - redist0_count, redist1_capacity)); } qdev_prop_set_array(vms->gic, "redist-region-count", redist_region_count); -- Gitee From c23034c79ad8632388bc00dd4268e429638eee9e Mon Sep 17 00:00:00 2001 From: qihao Date: Thu, 18 Apr 2024 14:45:15 +0800 Subject: [PATCH 237/939] hw/net/net_tx_pkt: Fix overrun in update_sctp_checksum() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cheery-pick from 83ddb3dbba2ee0f1767442ae6ee665058aeb1093 If a fragmented packet size is too short, do not try to calculate its checksum. Reproduced using: $ cat << EOF | qemu-system-i386 -display none -nodefaults \ -machine q35,accel=qtest -m 32M \ -device igb,netdev=net0 \ -netdev user,id=net0 \ -qtest stdio outl 0xcf8 0x80000810 outl 0xcfc 0xe0000000 outl 0xcf8 0x80000804 outw 0xcfc 0x06 write 0xe0000403 0x1 0x02 writel 0xe0003808 0xffffffff write 0xe000381a 0x1 0x5b write 0xe000381b 0x1 0x00 EOF Assertion failed: (offset == 0), function iov_from_buf_full, file util/iov.c, line 39. 
#1 0x5575e81e952a in iov_from_buf_full qemu/util/iov.c:39:5 #2 0x5575e6500768 in net_tx_pkt_update_sctp_checksum qemu/hw/net/net_tx_pkt.c:144:9 #3 0x5575e659f3e1 in igb_setup_tx_offloads qemu/hw/net/igb_core.c:478:11 #4 0x5575e659f3e1 in igb_tx_pkt_send qemu/hw/net/igb_core.c:552:10 #5 0x5575e659f3e1 in igb_process_tx_desc qemu/hw/net/igb_core.c:671:17 #6 0x5575e659f3e1 in igb_start_xmit qemu/hw/net/igb_core.c:903:9 #7 0x5575e659f3e1 in igb_set_tdt qemu/hw/net/igb_core.c:2812:5 #8 0x5575e657d6a4 in igb_core_write qemu/hw/net/igb_core.c:4248:9 Fixes: CVE-2024-3567 Cc: qemu-stable@nongnu.org Reported-by: Zheyu Ma Fixes: f199b13bc1 ("igb: Implement Tx SCTP CSO") Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2273 Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Akihiko Odaki Acked-by: Jason Wang Message-Id: <20240410070459.49112-1-philmd@linaro.org> Signed-off-by: qihao_yewu --- hw/net/net_tx_pkt.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/hw/net/net_tx_pkt.c b/hw/net/net_tx_pkt.c index 2e5f58b3c9..d40d508a11 100644 --- a/hw/net/net_tx_pkt.c +++ b/hw/net/net_tx_pkt.c @@ -141,6 +141,10 @@ bool net_tx_pkt_update_sctp_checksum(struct NetTxPkt *pkt) uint32_t csum = 0; struct iovec *pl_start_frag = pkt->vec + NET_TX_PKT_PL_START_FRAG; + if (iov_size(pl_start_frag, pkt->payload_frags) < 8 + sizeof(csum)) { + return false; + } + if (iov_from_buf(pl_start_frag, pkt->payload_frags, 8, &csum, sizeof(csum)) < sizeof(csum)) { return false; } -- Gitee From b59b75fc9f7ed73323179305363f0c2e00613863 Mon Sep 17 00:00:00 2001 From: Zhao Liu Date: Tue, 28 Nov 2023 00:02:02 +0800 Subject: [PATCH 238/939] tests: bios-tables-test: Rename smbios type 4 related test functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In fact, type4-count, core-count, core-count2, thread-count and thread-count2 are tested with KVM not TCG. Rename these test functions to reflect KVM base instead of TCG. Signed-off-by: Zhao Liu Message-Id: <20231127160202.1037290-1-zhao1.liu@linux.intel.com> Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Igor Mammedov Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin --- tests/qtest/bios-tables-test.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/qtest/bios-tables-test.c b/tests/qtest/bios-tables-test.c index fe6a9a8563..21811a1ab5 100644 --- a/tests/qtest/bios-tables-test.c +++ b/tests/qtest/bios-tables-test.c @@ -1015,7 +1015,7 @@ static void test_acpi_q35_tcg(void) free_test_data(&data); } -static void test_acpi_q35_tcg_type4_count(void) +static void test_acpi_q35_kvm_type4_count(void) { test_data data = { .machine = MACHINE_Q35, @@ -1031,7 +1031,7 @@ static void test_acpi_q35_tcg_type4_count(void) free_test_data(&data); } -static void test_acpi_q35_tcg_core_count(void) +static void test_acpi_q35_kvm_core_count(void) { test_data data = { .machine = MACHINE_Q35, @@ -1048,7 +1048,7 @@ static void test_acpi_q35_tcg_core_count(void) free_test_data(&data); } -static void test_acpi_q35_tcg_core_count2(void) +static void test_acpi_q35_kvm_core_count2(void) { test_data data = { .machine = MACHINE_Q35, @@ -1065,7 +1065,7 @@ static void test_acpi_q35_tcg_core_count2(void) free_test_data(&data); } -static void test_acpi_q35_tcg_thread_count(void) +static void test_acpi_q35_kvm_thread_count(void) { test_data data = { .machine = MACHINE_Q35, @@ -1082,7 +1082,7 @@ static void test_acpi_q35_tcg_thread_count(void) free_test_data(&data); } -static void test_acpi_q35_tcg_thread_count2(void) +static void test_acpi_q35_kvm_thread_count2(void) { test_data data = { .machine = MACHINE_Q35, @@ -2262,15 +2262,15 @@ int main(int argc, char *argv[]) qtest_add_func("acpi/q35/kvm/xapic", test_acpi_q35_kvm_xapic); qtest_add_func("acpi/q35/kvm/dmar", test_acpi_q35_kvm_dmar); qtest_add_func("acpi/q35/type4-count", - test_acpi_q35_tcg_type4_count); + test_acpi_q35_kvm_type4_count); qtest_add_func("acpi/q35/core-count", - test_acpi_q35_tcg_core_count); + test_acpi_q35_kvm_core_count); qtest_add_func("acpi/q35/core-count2", - test_acpi_q35_tcg_core_count2); + test_acpi_q35_kvm_core_count2); qtest_add_func("acpi/q35/thread-count", - test_acpi_q35_tcg_thread_count); + test_acpi_q35_kvm_thread_count); qtest_add_func("acpi/q35/thread-count2", - test_acpi_q35_tcg_thread_count2); + test_acpi_q35_kvm_thread_count2); } if (qtest_has_device("virtio-iommu-pci")) { qtest_add_func("acpi/q35/viot", test_acpi_q35_viot); -- Gitee From 74817cbc4ccb4e3b0f6d7b464b5707d3fbc5f686 Mon Sep 17 00:00:00 2001 From: qihao Date: Tue, 23 Apr 2024 10:40:32 +0800 Subject: [PATCH 239/939] hw/isa/vt82c686: Keep track of PIRQ/PINT pins separately MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cheery-pick from f33274265a242df5d9fdb00915fe72fbb1b2a3c4 Move calculation of mask after the switch which sets the function number for PIRQ/PINT pins to make sure the state of these pins are kept track of separately and IRQ is raised if any of them is active. 
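In other words, BIT(f) must be evaluated only after the switch has had a chance to remap f for the PIRQ/PINT inputs; computing it at the declaration made different sources share one state bit, so releasing one input could wrongly lower the line while another was still active. A stand-alone sketch of the corrected ordering (simplified, not the actual VIA device model):

    #include <stdint.h>
    #include <stdio.h>

    #define BIT(n) (1u << (n))

    static uint16_t irq_state;              /* one bit per interrupt source */

    static void set_irq(int f, int pin, int level)
    {
        uint16_t mask;

        switch (f) {
        case 0:                             /* PIRQ/PINT inputs */
            f = 8 + pin;                    /* hypothetical per-input slots */
            break;
        }

        mask = BIT(f);                      /* derived only after f is final */
        if (level) {
            irq_state |= mask;
        } else {
            irq_state &= ~mask;
        }
        printf("state=0x%04x, line active=%d\n",
               (unsigned)irq_state, irq_state != 0);
    }

    int main(void)
    {
        set_irq(0, 0, 1);   /* first input asserted */
        set_irq(0, 1, 1);   /* second input asserted, tracked in its own bit */
        set_irq(0, 0, 0);   /* first released; line stays up for the second */
        return 0;
    }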
Cc: qemu-stable@nongnu.org Fixes: 7e01bd80c1 hw/isa/vt82c686: Bring back via_isa_set_irq() Signed-off-by: BALATON Zoltan Reviewed-by: Philippe Mathieu-Daudé Message-ID: <20240410222543.0EA534E6005@zero.eik.bme.hu> Signed-off-by: Philippe Mathieu-Daudé Signed-off-by: qihao_yewu --- hw/isa/vt82c686.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hw/isa/vt82c686.c b/hw/isa/vt82c686.c index 9c2333a277..0334431219 100644 --- a/hw/isa/vt82c686.c +++ b/hw/isa/vt82c686.c @@ -613,7 +613,7 @@ void via_isa_set_irq(PCIDevice *d, int pin, int level) ViaISAState *s = VIA_ISA(pci_get_function_0(d)); uint8_t irq = d->config[PCI_INTERRUPT_LINE], max_irq = 15; int f = PCI_FUNC(d->devfn); - uint16_t mask = BIT(f); + uint16_t mask; switch (f) { case 0: /* PIRQ/PINT inputs */ @@ -628,6 +628,7 @@ void via_isa_set_irq(PCIDevice *d, int pin, int level) } /* Keep track of the state of all sources */ + mask = BIT(f); if (level) { s->irq_state[0] |= mask; } else { -- Gitee From 3af7045d3aea901d366f4f6dee51e70998351698 Mon Sep 17 00:00:00 2001 From: Yabin Li Date: Tue, 23 Apr 2024 15:38:48 +0800 Subject: [PATCH 240/939] hw/vfio/hct: update support ccp count to 48. Signed-off-by: Yabin Li Signed-off-by: yangdepei --- hw/vfio/hct.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/hw/vfio/hct.c b/hw/vfio/hct.c index 476e86c61d..790bb78439 100644 --- a/hw/vfio/hct.c +++ b/hw/vfio/hct.c @@ -28,7 +28,7 @@ #include "qapi/error.h" #include "hw/qdev-properties.h" -#define MAX_CCP_CNT 16 +#define MAX_CCP_CNT 48 #define PAGE_SIZE 4096 #define HCT_SHARED_MEMORY_SIZE (PAGE_SIZE * MAX_CCP_CNT) #define CCP_INDEX_BYTES 4 @@ -43,7 +43,7 @@ #define HCT_SHARE_DEV "/dev/hct_share" -#define HCT_VERSION_STRING "0.2" +#define HCT_VERSION_STRING "0.5" #define DEF_VERSION_STRING "0.1" #define VERSION_SIZE 16 @@ -281,15 +281,14 @@ static int hct_api_version_check(void) memcpy(ctrl.version, DEF_VERSION_STRING, sizeof(DEF_VERSION_STRING)); ret = ioctl(hct_data.hct_fd, HCT_SHARE_OP, &ctrl); if (ret < 0) { - error_report("ret %d, errno %d: fail to get hct.ko version, please " - "update hct.ko to version 0.2.\n", - ret, errno); + error_report("ret %d, errno %d: fail to get hct.ko version.\n", ret, + errno); return -1; } else if (memcmp(ctrl.version, HCT_VERSION_STRING, sizeof(HCT_VERSION_STRING)) < 0) { - error_report("The API version %s is larger than hct.ko version %s, " - "please update hct.ko to version 0.2\n", - HCT_VERSION_STRING, ctrl.version); + error_report("The hct.ko version is %s, please upgrade to version %s " + "or higher.\n", + ctrl.version, HCT_VERSION_STRING); return -1; } -- Gitee From c2eb1176fe06f359a8102bbacb54760c9c1d5aae Mon Sep 17 00:00:00 2001 From: Keqian Zhu Date: Sun, 28 Apr 2024 12:50:09 +0800 Subject: [PATCH 241/939] acpi/cpu: Fix detection of present cpu When qemu_present_cpu is false. it means cpu object is null and then calling of qemu_persistent_cpu() will cause null pointer access. 
Signed-off-by: Keqian Zhu --- hw/acpi/cpu.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/hw/acpi/cpu.c b/hw/acpi/cpu.c index b258396e01..292e1daca2 100644 --- a/hw/acpi/cpu.c +++ b/hw/acpi/cpu.c @@ -231,11 +231,7 @@ void cpu_hotplug_hw_init(MemoryRegion *as, Object *owner, if (qemu_present_cpu(cpu)) { state->devs[i].is_present = true; } else { - if (qemu_persistent_cpu(cpu)) { - state->devs[i].is_present = true; - } else { - state->devs[i].is_present = false; - } + state->devs[i].is_present = false; } if (qemu_enabled_cpu(cpu)) { -- Gitee From 52909d74ec37e851df3762a6eab1d7a6eeb89fba Mon Sep 17 00:00:00 2001 From: Keqian Zhu Date: Sun, 28 Apr 2024 12:56:47 +0800 Subject: [PATCH 242/939] arm/virt: Don't modify smp.max_cpus when vcpu hotplug disabled The smp.max_cpus has been used when create possible_cpus, so we must not change it after that. We should use smp.cpus when create cpu and acpi table if vcpu hotplug is disabled, instead of change smp.max_cpus to smp.cpus and use it everywhere. Signed-off-by: Keqian Zhu --- hw/arm/virt-acpi-build.c | 8 +++++++- hw/arm/virt.c | 24 ++++++++++++++++++++++-- include/hw/arm/virt.h | 8 +++++++- 3 files changed, 36 insertions(+), 4 deletions(-) diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c index 99296fc6d8..179600d4fe 100644 --- a/hw/arm/virt-acpi-build.c +++ b/hw/arm/virt-acpi-build.c @@ -814,9 +814,15 @@ build_madt(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) { int i; VirtMachineClass *vmc = VIRT_MACHINE_GET_CLASS(vms); + MachineState *ms = MACHINE(vms); const MemMapEntry *memmap = vms->memmap; AcpiTable table = { .sig = "APIC", .rev = 4, .oem_id = vms->oem_id, .oem_table_id = vms->oem_table_id }; + unsigned int max_cpus = ms->smp.max_cpus; + + if (!vms->cpu_hotplug_enabled) { + max_cpus = ms->smp.cpus; + } acpi_table_begin(&table, table_data); /* Local Interrupt Controller Address */ @@ -835,7 +841,7 @@ build_madt(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) build_append_int_noprefix(table_data, vms->gic_version, 1); build_append_int_noprefix(table_data, 0, 3); /* Reserved */ - for (i = 0; i < MACHINE(vms)->smp.max_cpus; i++) { + for (i = 0; i < max_cpus; i++) { CPUState *cpu = qemu_get_possible_cpu(i); uint64_t physical_base_address = 0, gich = 0, gicv = 0; uint32_t vgic_interrupt = vms->virt ? 
ARCH_GIC_MAINT_IRQ : 0; diff --git a/hw/arm/virt.c b/hw/arm/virt.c index e4473354d4..507b09d96c 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -831,6 +831,10 @@ static void unwire_gic_cpu_irqs(VirtMachineState *vms, CPUState *cs) int type = vms->gic_version; int irq; + if (!vms->cpu_hotplug_enabled) { + max_cpus = ms->smp.cpus; + } + for (irq = 0; irq < ARRAY_SIZE(timer_irq); irq++) { qdev_disconnect_gpio_out_named(cpudev, NULL, irq); } @@ -871,6 +875,10 @@ static void wire_gic_cpu_irqs(VirtMachineState *vms, CPUState *cs) int intidbase; int irq; + if (!vms->cpu_hotplug_enabled) { + max_cpus = ms->smp.cpus; + } + intidbase = NUM_IRQS + cpu * GIC_INTERNAL; for (irq = 0; irq < ARRAY_SIZE(timer_irq); irq++) { @@ -915,6 +923,10 @@ static void create_gic(VirtMachineState *vms, MemoryRegion *mem) uint32_t nb_redist_regions = 0; int revision; + if (!vms->cpu_hotplug_enabled) { + max_cpus = ms->smp.cpus; + } + if (vms->gic_version == VIRT_GIC_VERSION_2) { gictype = gic_class_name(); } else { @@ -2165,6 +2177,9 @@ static void virt_cpu_post_init(VirtMachineState *vms, MemoryRegion *sysmem) for (n = 0; n < possible_cpus->len; n++) { cpu = qemu_get_possible_cpu(n); + if (!qemu_present_cpu(cpu)) { + continue; + } if (vms->pmu) { assert(arm_feature(&ARM_CPU(cpu)->env, ARM_FEATURE_PMU)); @@ -2195,6 +2210,9 @@ static void virt_cpu_post_init(VirtMachineState *vms, MemoryRegion *sysmem) if (kvm_enabled() || tcg_enabled()) { for (n = 0; n < possible_cpus->len; n++) { cpu = qemu_get_possible_cpu(n); + if (!qemu_present_cpu(cpu)) { + continue; + } /* * Now, GIC has been sized with possible CPUs and we dont require @@ -2511,16 +2529,18 @@ static void machvirt_init(MachineState *machine) if (machine->smp.max_cpus > smp_cpus) { warn_report("cpu hotplug feature has been disabled"); } - machine->smp.max_cpus = smp_cpus; } notifier_list_init(&vms->cpuhp_notifiers); - possible_cpus = mc->possible_cpu_arch_ids(machine); assert(possible_cpus->len == max_cpus); for (n = 0; n < possible_cpus->len; n++) { Object *cpuobj; CPUState *cs; + if (!vms->cpu_hotplug_enabled && n >= smp_cpus) { + break; + } + cpuobj = object_new(possible_cpus->cpus[n].type); cs = CPU(cpuobj); diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h index 138531f9c1..7a734f07f7 100644 --- a/include/hw/arm/virt.h +++ b/include/hw/arm/virt.h @@ -210,10 +210,16 @@ static uint32_t virt_redist_capacity(VirtMachineState *vms, int region) static inline int virt_gicv3_redist_region_count(VirtMachineState *vms) { uint32_t redist0_capacity = virt_redist_capacity(vms, VIRT_GIC_REDIST); + MachineState *ms = MACHINE(vms); + unsigned int max_cpus = ms->smp.max_cpus; + + if (!vms->cpu_hotplug_enabled) { + max_cpus = ms->smp.cpus; + } assert(vms->gic_version != VIRT_GIC_VERSION_2); - return (MACHINE(vms)->smp.max_cpus > redist0_capacity && + return (max_cpus > redist0_capacity && vms->highmem_redists) ? 2 : 1; } -- Gitee From 1228f5c7cfcb78b19f163551aae0612602ac2d7d Mon Sep 17 00:00:00 2001 From: Keqian Zhu Date: Sun, 28 Apr 2024 13:01:48 +0800 Subject: [PATCH 243/939] kvm/arm: Fix SVE related logic for vcpu hotplug feature 1. Must finalize SVE setting before kvm_arch_init_vcpu(). 2. Must not finalize KVM SVE repeatly for hotplugged vcpu. 
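A minimal stand-alone sketch of those two rules, i.e. finalize features before the per-vCPU KVM init and perform the one-time SVE vector-length setup only for cold-plugged vCPUs (simplified stand-in types, not the QEMU/KVM interfaces):

    #include <stdbool.h>
    #include <stdio.h>

    struct vcpu {
        bool hotplugged;
        bool features_finalized;
    };

    static void finalize_features(struct vcpu *c)
    {
        c->features_finalized = true;       /* rule 1: do this first */
    }

    static int init_vcpu(struct vcpu *c)
    {
        if (!c->features_finalized) {
            return -1;                      /* would program stale features */
        }
        if (!c->hotplugged) {
            printf("one-time SVE vector-length setup\n");   /* rule 2 */
        }
        printf("vcpu initialized (hotplugged=%d)\n", c->hotplugged);
        return 0;
    }

    int main(void)
    {
        struct vcpu cold = { .hotplugged = false };
        struct vcpu hot  = { .hotplugged = true };

        finalize_features(&cold);
        init_vcpu(&cold);

        finalize_features(&hot);
        init_vcpu(&hot);                    /* skips the one-time setup */
        return 0;
    }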
Signed-off-by: Keqian Zhu --- target/arm/kvm.c | 1 + target/arm/kvm64.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/target/arm/kvm.c b/target/arm/kvm.c index 12c1b4b328..1ceb72a1c1 100644 --- a/target/arm/kvm.c +++ b/target/arm/kvm.c @@ -704,6 +704,7 @@ void kvm_arm_create_host_vcpu(ARMCPU *cpu) * later while setting device attributes of the GICR during GICv3 * reset */ + arm_cpu_finalize_features(cpu, &error_abort); ret = kvm_arch_init_vcpu(cs); if (ret < 0) { error_report("Failed to initialize host vcpu %ld", vcpu_id); diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c index 00b257bb4b..615e8bbbdf 100644 --- a/target/arm/kvm64.c +++ b/target/arm/kvm64.c @@ -647,7 +647,7 @@ int kvm_arch_init_vcpu(CPUState *cs) return ret; } - if (cpu_isar_feature(aa64_sve, cpu)) { + if (cpu_isar_feature(aa64_sve, cpu) && !DEVICE(cpu)->hotplugged) { ret = kvm_arm_sve_set_vls(cs); if (ret) { return ret; -- Gitee From fb27704692362d151eb191f0c687ded09b04e04c Mon Sep 17 00:00:00 2001 From: Keqian Zhu Date: Sun, 28 Apr 2024 14:14:07 +0800 Subject: [PATCH 244/939] arm/virt/acpi: Extend cpufreq to support max_cpus We support vcpu hotplug now, so extend memory region size to allow hotplugged CPU access cpufreq space. Signed-off-by: Keqian Zhu --- hw/acpi/cpufreq.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/hw/acpi/cpufreq.c b/hw/acpi/cpufreq.c index a84db490b3..a76f7b8fa2 100644 --- a/hw/acpi/cpufreq.c +++ b/hw/acpi/cpufreq.c @@ -83,6 +83,7 @@ typedef struct CpuhzState { uint32_t PerformanceLimited; uint32_t LowestFreq; uint32_t NominalFreq; + uint32_t num_cpu; uint32_t reg_size; } CpuhzState; @@ -93,10 +94,7 @@ static uint64_t cpufreq_read(void *opaque, hwaddr offset, unsigned size) uint64_t r; uint64_t n; - MachineState *ms = MACHINE(qdev_get_machine()); - unsigned int smp_cpus = ms->smp.cpus; - - if (offset >= smp_cpus * CPPC_REG_PER_CPU_STRIDE) { + if (offset >= s->num_cpu * CPPC_REG_PER_CPU_STRIDE) { warn_report("cpufreq_read: offset 0x%lx out of range", offset); return 0; } @@ -163,11 +161,10 @@ static uint64_t cpufreq_read(void *opaque, hwaddr offset, unsigned size) static void cpufreq_write(void *opaque, hwaddr offset, uint64_t value, unsigned size) { + CpuhzState *s = CPUFREQ(opaque); uint64_t n; - MachineState *ms = MACHINE(qdev_get_machine()); - unsigned int smp_cpus = ms->smp.cpus; - if (offset >= smp_cpus * CPPC_REG_PER_CPU_STRIDE) { + if (offset >= s->num_cpu * CPPC_REG_PER_CPU_STRIDE) { error_printf("cpufreq_write: offset 0x%lx out of range", offset); return; } @@ -248,9 +245,9 @@ static void cpufreq_init(Object *obj) CpuhzState *s = CPUFREQ(obj); MachineState *ms = MACHINE(qdev_get_machine()); - unsigned int smp_cpus = ms->smp.cpus; + s->num_cpu = ms->smp.max_cpus; - s->reg_size = smp_cpus * CPPC_REG_PER_CPU_STRIDE; + s->reg_size = s->num_cpu * CPPC_REG_PER_CPU_STRIDE; if (s->reg_size > MAX_SUPPORT_SPACE) { error_report("Required space 0x%x excesses the max support 0x%x", s->reg_size, MAX_SUPPORT_SPACE); -- Gitee From 8f2e7e0ebc4351d61091669137a4e26b78f3cb27 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 20 Mar 2024 17:31:38 +0800 Subject: [PATCH 245/939] target/i386: Introduce Icelake-Server-v7 to enable TSX commit c895fa54e3060c5ac6f3888dce96c9b78626072b upstream. When start L2 guest with both L1/L2 using Icelake-Server-v3 or above, QEMU reports below warning: "warning: host doesn't support requested feature: MSR(10AH).taa-no [bit 8]" Reason is QEMU Icelake-Server-v3 has TSX feature disabled but enables taa-no bit. 
It's meaningless that TSX isn't supported but still claim TSX is secure. So L1 KVM doesn't expose taa-no to L2 if TSX is unsupported, then starting L2 triggers the warning. Fix it by introducing a new version Icelake-Server-v7 which has both TSX and taa-no features. Then guest can use TSX securely when it see taa-no. This matches the production Icelake which supports TSX and isn't susceptible to TSX Async Abort (TAA) vulnerabilities, a.k.a, taa-no. Ideally, TSX should have being enabled together with taa-no since v3, but for compatibility, we'd better to add v7 to enable it. Fixes: d965dc35592d ("target/i386: Add ARCH_CAPABILITIES related bits into Icelake-Server CPU model") Intel-SIG: commit c895fa54e306 target/i386: Introduce Icelake-Server-v7 to enable TSX. 8.2.0-Add SRF CPU module support Tested-by: Xiangfei Ma Signed-off-by: Zhenzhong Duan Message-ID: <20240320093138.80267-2-zhenzhong.duan@intel.com> Signed-off-by: Paolo Bonzini [ Quanxian Wang: amend commit log ] Signed-off-by: Quanxian Wang --- target/i386/cpu.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 491cf40cc7..6abe33946c 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -3822,6 +3822,16 @@ static const X86CPUDefinition builtin_x86_defs[] = { { /* end of list */ } }, }, + { + .version = 7, + .note = "TSX, taa-no", + .props = (PropValue[]) { + /* Restore TSX features removed by -v2 above */ + { "hle", "on" }, + { "rtm", "on" }, + { /* end of list */ } + }, + }, { /* end of list */ } } }, -- Gitee From c61eabb8aa86fed57c2cd5394e0e89e350c99c5e Mon Sep 17 00:00:00 2001 From: Tao Su Date: Wed, 20 Mar 2024 10:10:44 +0800 Subject: [PATCH 246/939] target/i386: Add new CPU model SierraForest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 6e82d3b6220777667968a04c87e1667f164ebe88 upstream. According to table 1-2 in Intel Architecture Instruction Set Extensions and Future Features (rev 051) [1], SierraForest has the following new features which have already been virtualized: - CMPCCXADD CPUID.(EAX=7,ECX=1):EAX[bit 7] - AVX-IFMA CPUID.(EAX=7,ECX=1):EAX[bit 23] - AVX-VNNI-INT8 CPUID.(EAX=7,ECX=1):EDX[bit 4] - AVX-NE-CONVERT CPUID.(EAX=7,ECX=1):EDX[bit 5] Add above features to new CPU model SierraForest. 
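For reference, once a guest is started with the new model (e.g. -cpu SierraForest), the four leaf-7 subleaf-1 bits listed above can be checked from inside the guest. A minimal sketch, assuming a GCC or Clang toolchain that provides <cpuid.h>; the bit positions come from the list above:

    #include <cpuid.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;

        if (!__get_cpuid_count(7, 1, &eax, &ebx, &ecx, &edx)) {
            puts("CPUID leaf 7 not supported");
            return 1;
        }
        printf("CMPCCXADD:      %u\n", (eax >> 7) & 1);
        printf("AVX-IFMA:       %u\n", (eax >> 23) & 1);
        printf("AVX-VNNI-INT8:  %u\n", (edx >> 4) & 1);
        printf("AVX-NE-CONVERT: %u\n", (edx >> 5) & 1);
        return 0;
    }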
Comparing with GraniteRapids CPU model, SierraForest bare-metal removes the following features: - HLE CPUID.(EAX=7,ECX=0):EBX[bit 4] - RTM CPUID.(EAX=7,ECX=0):EBX[bit 11] - AVX512F CPUID.(EAX=7,ECX=0):EBX[bit 16] - AVX512DQ CPUID.(EAX=7,ECX=0):EBX[bit 17] - AVX512_IFMA CPUID.(EAX=7,ECX=0):EBX[bit 21] - AVX512CD CPUID.(EAX=7,ECX=0):EBX[bit 28] - AVX512BW CPUID.(EAX=7,ECX=0):EBX[bit 30] - AVX512VL CPUID.(EAX=7,ECX=0):EBX[bit 31] - AVX512_VBMI CPUID.(EAX=7,ECX=0):ECX[bit 1] - AVX512_VBMI2 CPUID.(EAX=7,ECX=0):ECX[bit 6] - AVX512_VNNI CPUID.(EAX=7,ECX=0):ECX[bit 11] - AVX512_BITALG CPUID.(EAX=7,ECX=0):ECX[bit 12] - AVX512_VPOPCNTDQ CPUID.(EAX=7,ECX=0):ECX[bit 14] - LA57 CPUID.(EAX=7,ECX=0):ECX[bit 16] - TSXLDTRK CPUID.(EAX=7,ECX=0):EDX[bit 16] - AMX-BF16 CPUID.(EAX=7,ECX=0):EDX[bit 22] - AVX512_FP16 CPUID.(EAX=7,ECX=0):EDX[bit 23] - AMX-TILE CPUID.(EAX=7,ECX=0):EDX[bit 24] - AMX-INT8 CPUID.(EAX=7,ECX=0):EDX[bit 25] - AVX512_BF16 CPUID.(EAX=7,ECX=1):EAX[bit 5] - fast zero-length MOVSB CPUID.(EAX=7,ECX=1):EAX[bit 10] - fast short CMPSB, SCASB CPUID.(EAX=7,ECX=1):EAX[bit 12] - AMX-FP16 CPUID.(EAX=7,ECX=1):EAX[bit 21] - PREFETCHI CPUID.(EAX=7,ECX=1):EDX[bit 14] - XFD CPUID.(EAX=0xD,ECX=1):EAX[bit 4] - EPT_PAGE_WALK_LENGTH_5 VMX_EPT_VPID_CAP(0x48c)[bit 7] Add all features of GraniteRapids CPU model except above features to SierraForest CPU model. SierraForest doesn’t support TSX and RTM but supports TAA_NO. When RTM is not enabled in host, KVM will not report TAA_NO. So, just don't include TAA_NO in SierraForest CPU model. [1] https://cdrdv2.intel.com/v1/dl/getContent/671368 Intel-SIG: commit 6e82d3b62207 target/i386: Add new CPU model SierraForest. 8.2.0-Add SRF CPU module support Reviewed-by: Zhao Liu Reviewed-by: Xiaoyao Li Signed-off-by: Tao Su Message-ID: <20240320021044.508263-1-tao1.su@linux.intel.com> Signed-off-by: Paolo Bonzini [ Quanxian Wang: amend commit log ] Signed-off-by: Quanxian Wang --- target/i386/cpu.c | 126 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 126 insertions(+) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 6abe33946c..57a832cea2 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -4109,6 +4109,132 @@ static const X86CPUDefinition builtin_x86_defs[] = { { /* end of list */ }, }, }, + { + .name = "SierraForest", + .level = 0x23, + .vendor = CPUID_VENDOR_INTEL, + .family = 6, + .model = 175, + .stepping = 0, + /* + * please keep the ascending order so that we can have a clear view of + * bit position of each feature. 
+ */ + .features[FEAT_1_EDX] = + CPUID_FP87 | CPUID_VME | CPUID_DE | CPUID_PSE | CPUID_TSC | + CPUID_MSR | CPUID_PAE | CPUID_MCE | CPUID_CX8 | CPUID_APIC | + CPUID_SEP | CPUID_MTRR | CPUID_PGE | CPUID_MCA | CPUID_CMOV | + CPUID_PAT | CPUID_PSE36 | CPUID_CLFLUSH | CPUID_MMX | CPUID_FXSR | + CPUID_SSE | CPUID_SSE2, + .features[FEAT_1_ECX] = + CPUID_EXT_SSE3 | CPUID_EXT_PCLMULQDQ | CPUID_EXT_SSSE3 | + CPUID_EXT_FMA | CPUID_EXT_CX16 | CPUID_EXT_PCID | CPUID_EXT_SSE41 | + CPUID_EXT_SSE42 | CPUID_EXT_X2APIC | CPUID_EXT_MOVBE | + CPUID_EXT_POPCNT | CPUID_EXT_TSC_DEADLINE_TIMER | CPUID_EXT_AES | + CPUID_EXT_XSAVE | CPUID_EXT_AVX | CPUID_EXT_F16C | CPUID_EXT_RDRAND, + .features[FEAT_8000_0001_EDX] = + CPUID_EXT2_SYSCALL | CPUID_EXT2_NX | CPUID_EXT2_PDPE1GB | + CPUID_EXT2_RDTSCP | CPUID_EXT2_LM, + .features[FEAT_8000_0001_ECX] = + CPUID_EXT3_LAHF_LM | CPUID_EXT3_ABM | CPUID_EXT3_3DNOWPREFETCH, + .features[FEAT_8000_0008_EBX] = + CPUID_8000_0008_EBX_WBNOINVD, + .features[FEAT_7_0_EBX] = + CPUID_7_0_EBX_FSGSBASE | CPUID_7_0_EBX_BMI1 | CPUID_7_0_EBX_AVX2 | + CPUID_7_0_EBX_SMEP | CPUID_7_0_EBX_BMI2 | CPUID_7_0_EBX_ERMS | + CPUID_7_0_EBX_INVPCID | CPUID_7_0_EBX_RDSEED | CPUID_7_0_EBX_ADX | + CPUID_7_0_EBX_SMAP | CPUID_7_0_EBX_CLFLUSHOPT | CPUID_7_0_EBX_CLWB | + CPUID_7_0_EBX_SHA_NI, + .features[FEAT_7_0_ECX] = + CPUID_7_0_ECX_UMIP | CPUID_7_0_ECX_PKU | CPUID_7_0_ECX_GFNI | + CPUID_7_0_ECX_VAES | CPUID_7_0_ECX_VPCLMULQDQ | + CPUID_7_0_ECX_RDPID | CPUID_7_0_ECX_BUS_LOCK_DETECT, + .features[FEAT_7_0_EDX] = + CPUID_7_0_EDX_FSRM | CPUID_7_0_EDX_SERIALIZE | + CPUID_7_0_EDX_SPEC_CTRL | CPUID_7_0_EDX_ARCH_CAPABILITIES | + CPUID_7_0_EDX_SPEC_CTRL_SSBD, + .features[FEAT_ARCH_CAPABILITIES] = + MSR_ARCH_CAP_RDCL_NO | MSR_ARCH_CAP_IBRS_ALL | + MSR_ARCH_CAP_SKIP_L1DFL_VMENTRY | MSR_ARCH_CAP_MDS_NO | + MSR_ARCH_CAP_PSCHANGE_MC_NO | MSR_ARCH_CAP_SBDR_SSDP_NO | + MSR_ARCH_CAP_FBSDP_NO | MSR_ARCH_CAP_PSDP_NO | + MSR_ARCH_CAP_PBRSB_NO, + .features[FEAT_XSAVE] = + CPUID_XSAVE_XSAVEOPT | CPUID_XSAVE_XSAVEC | + CPUID_XSAVE_XGETBV1 | CPUID_XSAVE_XSAVES, + .features[FEAT_6_EAX] = + CPUID_6_EAX_ARAT, + .features[FEAT_7_1_EAX] = + CPUID_7_1_EAX_AVX_VNNI | CPUID_7_1_EAX_CMPCCXADD | + CPUID_7_1_EAX_FSRS | CPUID_7_1_EAX_AVX_IFMA, + .features[FEAT_7_1_EDX] = + CPUID_7_1_EDX_AVX_VNNI_INT8 | CPUID_7_1_EDX_AVX_NE_CONVERT, + .features[FEAT_7_2_EDX] = + CPUID_7_2_EDX_MCDT_NO, + .features[FEAT_VMX_BASIC] = + MSR_VMX_BASIC_INS_OUTS | MSR_VMX_BASIC_TRUE_CTLS, + .features[FEAT_VMX_ENTRY_CTLS] = + VMX_VM_ENTRY_LOAD_DEBUG_CONTROLS | VMX_VM_ENTRY_IA32E_MODE | + VMX_VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | + VMX_VM_ENTRY_LOAD_IA32_PAT | VMX_VM_ENTRY_LOAD_IA32_EFER, + .features[FEAT_VMX_EPT_VPID_CAPS] = + MSR_VMX_EPT_EXECONLY | MSR_VMX_EPT_PAGE_WALK_LENGTH_4 | + MSR_VMX_EPT_WB | MSR_VMX_EPT_2MB | MSR_VMX_EPT_1GB | + MSR_VMX_EPT_INVEPT | MSR_VMX_EPT_AD_BITS | + MSR_VMX_EPT_INVEPT_SINGLE_CONTEXT | MSR_VMX_EPT_INVEPT_ALL_CONTEXT | + MSR_VMX_EPT_INVVPID | MSR_VMX_EPT_INVVPID_SINGLE_ADDR | + MSR_VMX_EPT_INVVPID_SINGLE_CONTEXT | + MSR_VMX_EPT_INVVPID_ALL_CONTEXT | + MSR_VMX_EPT_INVVPID_SINGLE_CONTEXT_NOGLOBALS, + .features[FEAT_VMX_EXIT_CTLS] = + VMX_VM_EXIT_SAVE_DEBUG_CONTROLS | + VMX_VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | + VMX_VM_EXIT_ACK_INTR_ON_EXIT | VMX_VM_EXIT_SAVE_IA32_PAT | + VMX_VM_EXIT_LOAD_IA32_PAT | VMX_VM_EXIT_SAVE_IA32_EFER | + VMX_VM_EXIT_LOAD_IA32_EFER | VMX_VM_EXIT_SAVE_VMX_PREEMPTION_TIMER, + .features[FEAT_VMX_MISC] = + MSR_VMX_MISC_STORE_LMA | MSR_VMX_MISC_ACTIVITY_HLT | + MSR_VMX_MISC_VMWRITE_VMEXIT, + .features[FEAT_VMX_PINBASED_CTLS] 
= + VMX_PIN_BASED_EXT_INTR_MASK | VMX_PIN_BASED_NMI_EXITING | + VMX_PIN_BASED_VIRTUAL_NMIS | VMX_PIN_BASED_VMX_PREEMPTION_TIMER | + VMX_PIN_BASED_POSTED_INTR, + .features[FEAT_VMX_PROCBASED_CTLS] = + VMX_CPU_BASED_VIRTUAL_INTR_PENDING | + VMX_CPU_BASED_USE_TSC_OFFSETING | VMX_CPU_BASED_HLT_EXITING | + VMX_CPU_BASED_INVLPG_EXITING | VMX_CPU_BASED_MWAIT_EXITING | + VMX_CPU_BASED_RDPMC_EXITING | VMX_CPU_BASED_RDTSC_EXITING | + VMX_CPU_BASED_CR3_LOAD_EXITING | VMX_CPU_BASED_CR3_STORE_EXITING | + VMX_CPU_BASED_CR8_LOAD_EXITING | VMX_CPU_BASED_CR8_STORE_EXITING | + VMX_CPU_BASED_TPR_SHADOW | VMX_CPU_BASED_VIRTUAL_NMI_PENDING | + VMX_CPU_BASED_MOV_DR_EXITING | VMX_CPU_BASED_UNCOND_IO_EXITING | + VMX_CPU_BASED_USE_IO_BITMAPS | VMX_CPU_BASED_MONITOR_TRAP_FLAG | + VMX_CPU_BASED_USE_MSR_BITMAPS | VMX_CPU_BASED_MONITOR_EXITING | + VMX_CPU_BASED_PAUSE_EXITING | + VMX_CPU_BASED_ACTIVATE_SECONDARY_CONTROLS, + .features[FEAT_VMX_SECONDARY_CTLS] = + VMX_SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + VMX_SECONDARY_EXEC_ENABLE_EPT | VMX_SECONDARY_EXEC_DESC | + VMX_SECONDARY_EXEC_RDTSCP | + VMX_SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | + VMX_SECONDARY_EXEC_ENABLE_VPID | VMX_SECONDARY_EXEC_WBINVD_EXITING | + VMX_SECONDARY_EXEC_UNRESTRICTED_GUEST | + VMX_SECONDARY_EXEC_APIC_REGISTER_VIRT | + VMX_SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | + VMX_SECONDARY_EXEC_RDRAND_EXITING | + VMX_SECONDARY_EXEC_ENABLE_INVPCID | + VMX_SECONDARY_EXEC_ENABLE_VMFUNC | VMX_SECONDARY_EXEC_SHADOW_VMCS | + VMX_SECONDARY_EXEC_RDSEED_EXITING | VMX_SECONDARY_EXEC_ENABLE_PML | + VMX_SECONDARY_EXEC_XSAVES, + .features[FEAT_VMX_VMFUNC] = + MSR_VMX_VMFUNC_EPT_SWITCHING, + .xlevel = 0x80000008, + .model_id = "Intel Xeon Processor (SierraForest)", + .versions = (X86CPUVersionDefinition[]) { + { .version = 1 }, + { /* end of list */ }, + }, + }, { .name = "Denverton", .level = 21, -- Gitee From b167617657fa078c4ea14cf54138ff5a4ce180f3 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Wed, 13 Mar 2024 07:53:23 -0700 Subject: [PATCH 247/939] target/i386: Export RFDS bit to guests commit 41bdd9812863c150284a9339a048ed88c40f4df7 upstream. Register File Data Sampling (RFDS) is a CPU side-channel vulnerability that may expose stale register value. CPUs that set RFDS_NO bit in MSR IA32_ARCH_CAPABILITIES indicate that they are not vulnerable to RFDS. Similarly, RFDS_CLEAR indicates that CPU is affected by RFDS, and has the microcode to help mitigate RFDS. Make RFDS_CLEAR and RFDS_NO bits available to guests. Intel-SIG: commit 41bdd9812863 target/i386: Export RFDS bit to guests. 
8.2.0-Add SRF CPU module support Signed-off-by: Pawan Gupta Reviewed-by: Xiaoyao Li Reviewed-by: Zhao Liu Message-ID: <9a38877857392b5c2deae7e7db1b170d15510314.1710341348.git.pawan.kumar.gupta@linux.intel.com> Signed-off-by: Paolo Bonzini [ Quanxian Wang: amend commit log ] Signed-off-by: Quanxian Wang --- target/i386/cpu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 57a832cea2..fd32c64f99 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -1157,8 +1157,8 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { NULL, "sbdr-ssdp-no", "fbsdp-no", "psdp-no", NULL, "fb-clear", NULL, NULL, NULL, NULL, NULL, NULL, - "pbrsb-no", NULL, "gds-no", NULL, - NULL, NULL, NULL, NULL, + "pbrsb-no", NULL, "gds-no", "rfds-no", + "rfds-clear", NULL, NULL, NULL, }, .msr = { .index = MSR_IA32_ARCH_CAPABILITIES, -- Gitee From baacc5ed528a5259286622482a01e3e848aed57e Mon Sep 17 00:00:00 2001 From: Keqian Zhu Date: Mon, 29 Apr 2024 17:14:47 +0800 Subject: [PATCH 248/939] kvm/arm: Fix compatibility of cold-plug CPU with SVE For the arm virt machine, besides hotplugged vcpus, the kvm state of cold-plugged CPUs is also pre-initialized and thus SVE is already finalized. Add a flag to the ARMCPU state and skip finalizing SVE again. Signed-off-by: Keqian Zhu --- hw/arm/virt.c | 5 +++++ target/arm/cpu.h | 3 +++ target/arm/kvm64.c | 2 +- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 507b09d96c..dfe4d9e129 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -3282,6 +3282,11 @@ static void virt_cpu_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, if (!dev->hotplugged) { cs->cold_booted = true; } +#ifdef CONFIG_KVM + if (cs->cpu_index >= ms->smp.cpus) { + cpu->kvm_sve_finalized = true; + } +#endif } static void virt_cpu_plug(HotplugHandler *hotplug_dev, DeviceState *dev, diff --git a/target/arm/cpu.h b/target/arm/cpu.h index c51a0e3467..a5ba7f2a26 100644 --- a/target/arm/cpu.h +++ b/target/arm/cpu.h @@ -971,6 +971,9 @@ struct ArchCPU { /* KVM steal time */ OnOffAuto kvm_steal_time; + + /* KVM SVE has been finalized for this CPU */ + bool kvm_sve_finalized; #endif /* CONFIG_KVM */ /* Uniprocessor system with MP extensions */ diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c index 615e8bbbdf..8f01d485b0 100644 --- a/target/arm/kvm64.c +++ b/target/arm/kvm64.c @@ -647,7 +647,7 @@ int kvm_arch_init_vcpu(CPUState *cs) return ret; } - if (cpu_isar_feature(aa64_sve, cpu) && !DEVICE(cpu)->hotplugged) { + if (cpu_isar_feature(aa64_sve, cpu) && !cpu->kvm_sve_finalized) { ret = kvm_arm_sve_set_vls(cs); if (ret) { return ret; -- Gitee From 73fecb1c0fab9a1e0593b769c36bdc795c9316ae Mon Sep 17 00:00:00 2001 From: qihao Date: Wed, 15 May 2024 15:52:28 +0800 Subject: [PATCH 249/939] hw/ufs: Fix buffer overflow bug cherry-pick from f2c8aeb1afefcda92054c448b21fc59cdd99db30 It fixes the buffer overflow vulnerability in the ufs device. The bug was detected by sanitizers.
You can reproduce it by: cat << EOF |\ qemu-system-x86_64 \ -display none -machine accel=qtest -m 512M -M q35 -nodefaults -drive \ file=null-co://,if=none,id=disk0 -device ufs,id=ufs_bus -device \ ufs-lu,drive=disk0,bus=ufs_bus -qtest stdio outl 0xcf8 0x80000810 outl 0xcfc 0xe0000000 outl 0xcf8 0x80000804 outw 0xcfc 0x06 write 0xe0000058 0x1 0xa7 write 0xa 0x1 0x50 EOF Resolves: #2299 Fixes: 329f16624499 ("hw/ufs: Support for Query Transfer Requests") Reported-by: Zheyu Ma Signed-off-by: Jeuk Kim Signed-off-by: qihao_yewu --- hw/ufs/ufs.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/hw/ufs/ufs.c b/hw/ufs/ufs.c index eccdb852a0..bac78a32bb 100644 --- a/hw/ufs/ufs.c +++ b/hw/ufs/ufs.c @@ -126,6 +126,10 @@ static MemTxResult ufs_dma_read_req_upiu(UfsRequest *req) copy_size = sizeof(UtpUpiuHeader) + UFS_TRANSACTION_SPECIFIC_FIELD_SIZE + data_segment_length; + if (copy_size > sizeof(req->req_upiu)) { + copy_size = sizeof(req->req_upiu); + } + ret = ufs_addr_read(u, req_upiu_base_addr, &req->req_upiu, copy_size); if (ret) { trace_ufs_err_dma_read_req_upiu(req->slot, req_upiu_base_addr); @@ -225,6 +229,10 @@ static MemTxResult ufs_dma_write_rsp_upiu(UfsRequest *req) copy_size = rsp_upiu_byte_len; } + if (copy_size > sizeof(req->rsp_upiu)) { + copy_size = sizeof(req->rsp_upiu); + } + ret = ufs_addr_write(u, rsp_upiu_base_addr, &req->rsp_upiu, copy_size); if (ret) { trace_ufs_err_dma_write_rsp_upiu(req->slot, rsp_upiu_base_addr); -- Gitee From 3db0118d3663c5d56841dac30e4bf95ccfff21bd Mon Sep 17 00:00:00 2001 From: Song Gao Date: Tue, 2 Apr 2024 09:39:36 +0800 Subject: [PATCH 250/939] target/loongarch: Fix qemu-system-loongarch64 assert failed with the option '-d int' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit qemu-system-loongarch64 assert failed with the option '-d int', the helper_idle() raise an exception EXCP_HLT, but the exception name is undefined. 
----- merge patch: 0cbb322f70e8a87e4acbffecef5ea8f9448f3513(target/loongarch/cpu.c: typo fix: expection) Signed-off-by: Song Gao Reviewed-by: Philippe Mathieu-Daudé Message-Id: <20240321123606.1704900-1-gaosong@loongson.cn> --- target/loongarch/cpu.c | 74 +++++++++++++++++++++++------------------- 1 file changed, 40 insertions(+), 34 deletions(-) diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c index b098b1c6f3..0b3f954b64 100644 --- a/target/loongarch/cpu.c +++ b/target/loongarch/cpu.c @@ -43,33 +43,45 @@ const char * const fregnames[32] = { "f24", "f25", "f26", "f27", "f28", "f29", "f30", "f31", }; -static const char * const excp_names[] = { - [EXCCODE_INT] = "Interrupt", - [EXCCODE_PIL] = "Page invalid exception for load", - [EXCCODE_PIS] = "Page invalid exception for store", - [EXCCODE_PIF] = "Page invalid exception for fetch", - [EXCCODE_PME] = "Page modified exception", - [EXCCODE_PNR] = "Page Not Readable exception", - [EXCCODE_PNX] = "Page Not Executable exception", - [EXCCODE_PPI] = "Page Privilege error", - [EXCCODE_ADEF] = "Address error for instruction fetch", - [EXCCODE_ADEM] = "Address error for Memory access", - [EXCCODE_SYS] = "Syscall", - [EXCCODE_BRK] = "Break", - [EXCCODE_INE] = "Instruction Non-Existent", - [EXCCODE_IPE] = "Instruction privilege error", - [EXCCODE_FPD] = "Floating Point Disabled", - [EXCCODE_FPE] = "Floating Point Exception", - [EXCCODE_DBP] = "Debug breakpoint", - [EXCCODE_BCE] = "Bound Check Exception", - [EXCCODE_SXD] = "128 bit vector instructions Disable exception", - [EXCCODE_ASXD] = "256 bit vector instructions Disable exception", +struct TypeExcp { + int32_t exccode; + const char * const name; +}; + +static const struct TypeExcp excp_names[] = { + {EXCCODE_INT, "Interrupt"}, + {EXCCODE_PIL, "Page invalid exception for load"}, + {EXCCODE_PIS, "Page invalid exception for store"}, + {EXCCODE_PIF, "Page invalid exception for fetch"}, + {EXCCODE_PME, "Page modified exception"}, + {EXCCODE_PNR, "Page Not Readable exception"}, + {EXCCODE_PNX, "Page Not Executable exception"}, + {EXCCODE_PPI, "Page Privilege error"}, + {EXCCODE_ADEF, "Address error for instruction fetch"}, + {EXCCODE_ADEM, "Address error for Memory access"}, + {EXCCODE_SYS, "Syscall"}, + {EXCCODE_BRK, "Break"}, + {EXCCODE_INE, "Instruction Non-Existent"}, + {EXCCODE_IPE, "Instruction privilege error"}, + {EXCCODE_FPD, "Floating Point Disabled"}, + {EXCCODE_FPE, "Floating Point Exception"}, + {EXCCODE_DBP, "Debug breakpoint"}, + {EXCCODE_BCE, "Bound Check Exception"}, + {EXCCODE_SXD, "128 bit vector instructions Disable exception"}, + {EXCCODE_ASXD, "256 bit vector instructions Disable exception"}, + {EXCP_HLT, "EXCP_HLT"}, }; const char *loongarch_exception_name(int32_t exception) { - assert(excp_names[exception]); - return excp_names[exception]; + int i; + + for (i = 0; i < ARRAY_SIZE(excp_names); i++) { + if (excp_names[i].exccode == exception) { + return excp_names[i].name; + } + } + return "Unknown"; } void G_NORETURN do_raise_exception(CPULoongArchState *env, @@ -78,7 +90,7 @@ void G_NORETURN do_raise_exception(CPULoongArchState *env, { CPUState *cs = env_cpu(env); - qemu_log_mask(CPU_LOG_INT, "%s: %d (%s)\n", + qemu_log_mask(CPU_LOG_INT, "%s: exception: %d (%s)\n", __func__, exception, loongarch_exception_name(exception)); @@ -159,22 +171,16 @@ static void loongarch_cpu_do_interrupt(CPUState *cs) CPULoongArchState *env = &cpu->env; bool update_badinstr = 1; int cause = -1; - const char *name; bool tlbfill = FIELD_EX64(env->CSR_TLBRERA, CSR_TLBRERA, ISTLBR); uint32_t 
vec_size = FIELD_EX64(env->CSR_ECFG, CSR_ECFG, VS); if (cs->exception_index != EXCCODE_INT) { - if (cs->exception_index < 0 || - cs->exception_index >= ARRAY_SIZE(excp_names)) { - name = "unknown"; - } else { - name = excp_names[cs->exception_index]; - } - qemu_log_mask(CPU_LOG_INT, "%s enter: pc " TARGET_FMT_lx " ERA " TARGET_FMT_lx - " TLBRERA " TARGET_FMT_lx " %s exception\n", __func__, - env->pc, env->CSR_ERA, env->CSR_TLBRERA, name); + " TLBRERA " TARGET_FMT_lx " exception: %d (%s)\n", + __func__, env->pc, env->CSR_ERA, env->CSR_TLBRERA, + cs->exception_index, + loongarch_exception_name(cs->exception_index)); } switch (cs->exception_index) { -- Gitee From 1d283874a1009c3c244330c46dfa7440c9c1127c Mon Sep 17 00:00:00 2001 From: Song Gao Date: Wed, 20 Mar 2024 09:39:55 +0800 Subject: [PATCH 251/939] target/loongarch: Fix qemu-loongarch64 hang when executing 'll.d $t0, $t0, 0' On gen_ll, if a->imm is zero, make_address_x return src1, but the load to destination may clobber src1. We use a new destination to fix this problem. Fixes: c5af6628f4be (target/loongarch: Extract make_address_i() helper) Reviewed-by: Richard Henderson Suggested-by: Richard Henderson Signed-off-by: Song Gao Message-Id: <20240320013955.1561311-1-gaosong@loongson.cn> --- target/loongarch/tcg/insn_trans/trans_atomic.c.inc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/target/loongarch/tcg/insn_trans/trans_atomic.c.inc b/target/loongarch/tcg/insn_trans/trans_atomic.c.inc index 80c2e286fd..974bc2a70f 100644 --- a/target/loongarch/tcg/insn_trans/trans_atomic.c.inc +++ b/target/loongarch/tcg/insn_trans/trans_atomic.c.inc @@ -5,14 +5,14 @@ static bool gen_ll(DisasContext *ctx, arg_rr_i *a, MemOp mop) { - TCGv dest = gpr_dst(ctx, a->rd, EXT_NONE); + TCGv t1 = tcg_temp_new(); TCGv src1 = gpr_src(ctx, a->rj, EXT_NONE); TCGv t0 = make_address_i(ctx, src1, a->imm); - tcg_gen_qemu_ld_i64(dest, t0, ctx->mem_idx, mop); + tcg_gen_qemu_ld_i64(t1, t0, ctx->mem_idx, mop); tcg_gen_st_tl(t0, tcg_env, offsetof(CPULoongArchState, lladdr)); - tcg_gen_st_tl(dest, tcg_env, offsetof(CPULoongArchState, llval)); - gen_set_gpr(a->rd, dest, EXT_NONE); + tcg_gen_st_tl(t1, tcg_env, offsetof(CPULoongArchState, llval)); + gen_set_gpr(a->rd, t1, EXT_NONE); return true; } -- Gitee From 9c858f7d6e34b00486e3493f643fd19cb87ee290 Mon Sep 17 00:00:00 2001 From: Xianglai Li Date: Mon, 18 Mar 2024 15:03:32 +0800 Subject: [PATCH 252/939] target/loongarch: Fix tlb huge page loading issue When we use qemu tcg simulation, the page size of bios is 4KB. When using the level 2 super huge page (page size is 1G) to create the page table, it is found that the content of the corresponding address space is abnormal, resulting in the bios can not start the operating system and graphical interface normally. The lddir and ldpte instruction emulation has a problem with the use of super huge page processing above level 2. The page size is not correctly calculated, resulting in the wrong page size of the table entry found by tlb. 
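To make the size calculation concrete, here is a minimal sketch of how the per-level directory base and width from CSR.PWCL/CSR.PWCH determine the TLB page size of a huge page entry (illustrative only; huge_page_bits() is a hypothetical name, get_dir_base_width() is the helper introduced by this patch below):

    /*
     * Illustrative sketch, not part of the patch: log2 page size that a
     * huge page entry recorded at 'level' must be loaded with.  Such an
     * entry spans dir_base + dir_width address bits for that level, and
     * because a huge page is split into an even/odd pair when written
     * into the TLB, each TLB entry uses one bit less -- the
     * 'ps = dir_base + dir_width - 1' computed in helper_ldpte() below.
     */
    static uint64_t huge_page_bits(CPULoongArchState *env, target_ulong level)
    {
        uint64_t dir_base, dir_width;

        get_dir_base_width(env, &dir_base, &dir_width, level);
        return dir_base + dir_width - 1;
    }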
Signed-off-by: Xianglai Li Reviewed-by: Richard Henderson Signed-off-by: Song Gao Message-Id: <20240318070332.1273939-1-lixianglai@loongson.cn> --- target/loongarch/cpu-csr.h | 3 + target/loongarch/internals.h | 5 -- target/loongarch/tcg/tlb_helper.c | 113 +++++++++++++++++++++--------- 3 files changed, 82 insertions(+), 39 deletions(-) diff --git a/target/loongarch/cpu-csr.h b/target/loongarch/cpu-csr.h index c59d7a9fcb..0834e91f30 100644 --- a/target/loongarch/cpu-csr.h +++ b/target/loongarch/cpu-csr.h @@ -67,6 +67,9 @@ FIELD(TLBENTRY, D, 1, 1) FIELD(TLBENTRY, PLV, 2, 2) FIELD(TLBENTRY, MAT, 4, 2) FIELD(TLBENTRY, G, 6, 1) +FIELD(TLBENTRY, HUGE, 6, 1) +FIELD(TLBENTRY, HGLOBAL, 12, 1) +FIELD(TLBENTRY, LEVEL, 13, 2) FIELD(TLBENTRY_32, PPN, 8, 24) FIELD(TLBENTRY_64, PPN, 12, 36) FIELD(TLBENTRY_64, NR, 61, 1) diff --git a/target/loongarch/internals.h b/target/loongarch/internals.h index a2fc54c8a7..944153b180 100644 --- a/target/loongarch/internals.h +++ b/target/loongarch/internals.h @@ -16,11 +16,6 @@ #define TARGET_PHYS_MASK MAKE_64BIT_MASK(0, TARGET_PHYS_ADDR_SPACE_BITS) #define TARGET_VIRT_MASK MAKE_64BIT_MASK(0, TARGET_VIRT_ADDR_SPACE_BITS) -/* Global bit used for lddir/ldpte */ -#define LOONGARCH_PAGE_HUGE_SHIFT 6 -/* Global bit for huge page */ -#define LOONGARCH_HGLOBAL_SHIFT 12 - void loongarch_translate_init(void); void loongarch_cpu_dump_state(CPUState *cpu, FILE *f, int flags); diff --git a/target/loongarch/tcg/tlb_helper.c b/target/loongarch/tcg/tlb_helper.c index 804ab7a263..eedd1ac376 100644 --- a/target/loongarch/tcg/tlb_helper.c +++ b/target/loongarch/tcg/tlb_helper.c @@ -17,6 +17,34 @@ #include "exec/log.h" #include "cpu-csr.h" +static void get_dir_base_width(CPULoongArchState *env, uint64_t *dir_base, + uint64_t *dir_width, target_ulong level) +{ + switch (level) { + case 1: + *dir_base = FIELD_EX64(env->CSR_PWCL, CSR_PWCL, DIR1_BASE); + *dir_width = FIELD_EX64(env->CSR_PWCL, CSR_PWCL, DIR1_WIDTH); + break; + case 2: + *dir_base = FIELD_EX64(env->CSR_PWCL, CSR_PWCL, DIR2_BASE); + *dir_width = FIELD_EX64(env->CSR_PWCL, CSR_PWCL, DIR2_WIDTH); + break; + case 3: + *dir_base = FIELD_EX64(env->CSR_PWCH, CSR_PWCH, DIR3_BASE); + *dir_width = FIELD_EX64(env->CSR_PWCH, CSR_PWCH, DIR3_WIDTH); + break; + case 4: + *dir_base = FIELD_EX64(env->CSR_PWCH, CSR_PWCH, DIR4_BASE); + *dir_width = FIELD_EX64(env->CSR_PWCH, CSR_PWCH, DIR4_WIDTH); + break; + default: + /* level may be zero for ldpte */ + *dir_base = FIELD_EX64(env->CSR_PWCL, CSR_PWCL, PTBASE); + *dir_width = FIELD_EX64(env->CSR_PWCL, CSR_PWCL, PTWIDTH); + break; + } +} + static void raise_mmu_exception(CPULoongArchState *env, target_ulong address, MMUAccessType access_type, int tlb_error) { @@ -486,7 +514,25 @@ target_ulong helper_lddir(CPULoongArchState *env, target_ulong base, target_ulong badvaddr, index, phys, ret; int shift; uint64_t dir_base, dir_width; - bool huge = (base >> LOONGARCH_PAGE_HUGE_SHIFT) & 0x1; + + if (unlikely((level == 0) || (level > 4))) { + qemu_log_mask(LOG_GUEST_ERROR, + "Attepted LDDIR with level %"PRId64"\n", level); + return base; + } + + if (FIELD_EX64(base, TLBENTRY, HUGE)) { + if (unlikely(level == 4)) { + qemu_log_mask(LOG_GUEST_ERROR, + "Attempted use of level 4 huge page\n"); + } + + if (FIELD_EX64(base, TLBENTRY, LEVEL)) { + return base; + } else { + return FIELD_DP64(base, TLBENTRY, LEVEL, level); + } + } badvaddr = env->CSR_TLBRBADV; base = base & TARGET_PHYS_MASK; @@ -495,30 +541,7 @@ target_ulong helper_lddir(CPULoongArchState *env, target_ulong base, shift = FIELD_EX64(env->CSR_PWCL, 
CSR_PWCL, PTEWIDTH); shift = (shift + 1) * 3; - if (huge) { - return base; - } - switch (level) { - case 1: - dir_base = FIELD_EX64(env->CSR_PWCL, CSR_PWCL, DIR1_BASE); - dir_width = FIELD_EX64(env->CSR_PWCL, CSR_PWCL, DIR1_WIDTH); - break; - case 2: - dir_base = FIELD_EX64(env->CSR_PWCL, CSR_PWCL, DIR2_BASE); - dir_width = FIELD_EX64(env->CSR_PWCL, CSR_PWCL, DIR2_WIDTH); - break; - case 3: - dir_base = FIELD_EX64(env->CSR_PWCH, CSR_PWCH, DIR3_BASE); - dir_width = FIELD_EX64(env->CSR_PWCH, CSR_PWCH, DIR3_WIDTH); - break; - case 4: - dir_base = FIELD_EX64(env->CSR_PWCH, CSR_PWCH, DIR4_BASE); - dir_width = FIELD_EX64(env->CSR_PWCH, CSR_PWCH, DIR4_WIDTH); - break; - default: - do_raise_exception(env, EXCCODE_INE, GETPC()); - return 0; - } + get_dir_base_width(env, &dir_base, &dir_width, level); index = (badvaddr >> dir_base) & ((1 << dir_width) - 1); phys = base | index << shift; ret = ldq_phys(cs->as, phys) & TARGET_PHYS_MASK; @@ -531,20 +554,42 @@ void helper_ldpte(CPULoongArchState *env, target_ulong base, target_ulong odd, CPUState *cs = env_cpu(env); target_ulong phys, tmp0, ptindex, ptoffset0, ptoffset1, ps, badv; int shift; - bool huge = (base >> LOONGARCH_PAGE_HUGE_SHIFT) & 0x1; uint64_t ptbase = FIELD_EX64(env->CSR_PWCL, CSR_PWCL, PTBASE); uint64_t ptwidth = FIELD_EX64(env->CSR_PWCL, CSR_PWCL, PTWIDTH); + uint64_t dir_base, dir_width; + /* + * The parameter "base" has only two types, + * one is the page table base address, + * whose bit 6 should be 0, + * and the other is the huge page entry, + * whose bit 6 should be 1. + */ base = base & TARGET_PHYS_MASK; + if (FIELD_EX64(base, TLBENTRY, HUGE)) { + /* + * Gets the huge page level and Gets huge page size. + * Clears the huge page level information in the entry. + * Clears huge page bit. + * Move HGLOBAL bit to GLOBAL bit. + */ + get_dir_base_width(env, &dir_base, &dir_width, + FIELD_EX64(base, TLBENTRY, LEVEL)); + + base = FIELD_DP64(base, TLBENTRY, LEVEL, 0); + base = FIELD_DP64(base, TLBENTRY, HUGE, 0); + if (FIELD_EX64(base, TLBENTRY, HGLOBAL)) { + base = FIELD_DP64(base, TLBENTRY, HGLOBAL, 0); + base = FIELD_DP64(base, TLBENTRY, G, 1); + } - if (huge) { - /* Huge Page. base is paddr */ - tmp0 = base ^ (1 << LOONGARCH_PAGE_HUGE_SHIFT); - /* Move Global bit */ - tmp0 = ((tmp0 & (1 << LOONGARCH_HGLOBAL_SHIFT)) >> - LOONGARCH_HGLOBAL_SHIFT) << R_TLBENTRY_G_SHIFT | - (tmp0 & (~(1 << LOONGARCH_HGLOBAL_SHIFT))); - ps = ptbase + ptwidth - 1; + ps = dir_base + dir_width - 1; + /* + * Huge pages are evenly split into parity pages + * when loaded into the tlb, + * so the tlb page size needs to be divided by 2. + */ + tmp0 = base; if (odd) { tmp0 += MAKE_64BIT_MASK(ps, 1); } -- Gitee From 24ca459bf142516195cc968db17291938309c278 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Sun, 18 Feb 2024 15:00:25 +0800 Subject: [PATCH 253/939] target/loongarch/kvm: Add software breakpoint support With KVM virtualization, the debug exception is passed through to the guest kernel rather than to host mode. Here a hypercall instruction with a special hypercall code is used for sw breakpoint usage. Now only software breakpoints are supported, and it is allowed to insert/remove software breakpoints. Hardware breakpoints will be added later.
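The mechanism is the usual gdbstub software-breakpoint dance; a condensed sketch of what the hunks below implement (sw_breakpoint_arm()/sw_breakpoint_disarm() are hypothetical names used only for illustration, error reporting trimmed):

    /*
     * brk_insn is read once from the KVM_REG_LOONGARCH_DEBUG_INST register
     * at vcpu init, as kvm_arch_init_vcpu() does below.
     */
    static int sw_breakpoint_arm(CPUState *cs, struct kvm_sw_breakpoint *bp)
    {
        /* save the original 4-byte instruction, then patch in the break insn */
        if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn, 4, 0) ||
            cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&brk_insn, 4, 1)) {
            return -EINVAL;
        }
        return 0;
    }

    static int sw_breakpoint_disarm(CPUState *cs, struct kvm_sw_breakpoint *bp)
    {
        /* write the saved instruction back so the guest runs unmodified */
        if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn, 4, 1)) {
            return -EINVAL;
        }
        return 0;
    }

When the guest hits the patched instruction, KVM exits with KVM_EXIT_DEBUG and the handler matches env->pc against the registered breakpoints, as the exit-handling hunk below shows.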
Signed-off-by: Bibo Mao --- target/loongarch/kvm/kvm.c | 77 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/target/loongarch/kvm/kvm.c b/target/loongarch/kvm/kvm.c index c19978a970..49d02076ad 100644 --- a/target/loongarch/kvm/kvm.c +++ b/target/loongarch/kvm/kvm.c @@ -29,6 +29,7 @@ #include "trace.h" static bool cap_has_mp_state; +static unsigned int brk_insn; const KVMCapabilityInfo kvm_arch_required_capabilities[] = { KVM_CAP_LAST_INFO }; @@ -675,7 +676,14 @@ static void kvm_loongarch_vm_stage_change(void *opaque, bool running, int kvm_arch_init_vcpu(CPUState *cs) { + uint64_t val; + qemu_add_vm_change_state_handler(kvm_loongarch_vm_stage_change, cs); + + if (!kvm_get_one_reg(cs, KVM_REG_LOONGARCH_DEBUG_INST, &val)) { + brk_insn = val; + } + return 0; } @@ -755,6 +763,68 @@ bool kvm_arch_cpu_check_are_resettable(void) return true; } + +void kvm_arch_update_guest_debug(CPUState *cpu, struct kvm_guest_debug *dbg) +{ + if (kvm_sw_breakpoints_active(cpu)) { + dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP; + } +} + +int kvm_arch_insert_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp) +{ + if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn, 4, 0) || + cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&brk_insn, 4, 1)) { + error_report("%s failed", __func__); + return -EINVAL; + } + return 0; +} + +int kvm_arch_remove_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp) +{ + static uint32_t brk; + + if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&brk, 4, 0) || + brk != brk_insn || + cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn, 4, 1)) { + error_report("%s failed", __func__); + return -EINVAL; + } + return 0; +} + +int kvm_arch_insert_hw_breakpoint(vaddr addr, vaddr len, int type) +{ + return -ENOSYS; +} + +int kvm_arch_remove_hw_breakpoint(vaddr addr, vaddr len, int type) +{ + return -ENOSYS; +} + +void kvm_arch_remove_all_hw_breakpoints(void) +{ +} + +static bool kvm_loongarch_handle_debug(CPUState *cs, struct kvm_run *run) +{ + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + CPULoongArchState *env = &cpu->env; + + kvm_cpu_synchronize_state(cs); + if (cs->singlestep_enabled) { + return true; + } + + if (kvm_find_sw_breakpoint(cs, env->pc)) { + return true; + } + + return false; +} + int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) { int ret = 0; @@ -774,6 +844,13 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) run->iocsr_io.len, run->iocsr_io.is_write); break; + + case KVM_EXIT_DEBUG: + if (kvm_loongarch_handle_debug(cs, run)) { + ret = EXCP_DEBUG; + } + break; + default: ret = -1; warn_report("KVM: unknown exit reason %d", run->exit_reason); -- Gitee From 9e7059885812f60263d82ea2822877a7e916f175 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Wed, 13 Mar 2024 10:04:33 +0800 Subject: [PATCH 254/939] target/loongarch/kvm: sync kernel header files sync kernel header files. 
Signed-off-by: Bibo Mao --- linux-headers/asm-loongarch/kvm.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/linux-headers/asm-loongarch/kvm.h b/linux-headers/asm-loongarch/kvm.h index 923d0bd382..4cec8c1601 100644 --- a/linux-headers/asm-loongarch/kvm.h +++ b/linux-headers/asm-loongarch/kvm.h @@ -15,10 +15,12 @@ */ #define __KVM_HAVE_READONLY_MEM +#define __KVM_HAVE_GUEST_DEBUG #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 #define KVM_DIRTY_LOG_PAGE_OFFSET 64 +#define KVM_GUESTDBG_USE_SW_BP 0x00010000 /* * for KVM_GET_REGS and KVM_SET_REGS */ @@ -74,6 +76,8 @@ struct kvm_fpu { #define KVM_REG_LOONGARCH_COUNTER (KVM_REG_LOONGARCH_KVM | KVM_REG_SIZE_U64 | 1) #define KVM_REG_LOONGARCH_VCPU_RESET (KVM_REG_LOONGARCH_KVM | KVM_REG_SIZE_U64 | 2) +/* Debugging: Special instruction for software breakpoint */ +#define KVM_REG_LOONGARCH_DEBUG_INST (KVM_REG_LOONGARCH_KVM | KVM_REG_SIZE_U64 | 3) #define LOONGARCH_REG_SHIFT 3 #define LOONGARCH_REG_64(TYPE, REG) (TYPE | KVM_REG_SIZE_U64 | (REG << LOONGARCH_REG_SHIFT)) -- Gitee From 04aef27ede108edd63d288dd3bb395e22a603f42 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Mon, 11 Mar 2024 15:01:31 +0800 Subject: [PATCH 255/939] hw/intc/loongarch_extioi: Add virt extension support With hardware extioi, irq can be routed to four vcpus with hardware extioi. This patch adds virt extension support, sot that irq can be routed to 256 vcpus. Signed-off-by: Song Gao Signed-off-by: Bibo Mao --- hw/intc/loongarch_extioi.c | 88 ++++++++++++++++++++- hw/loongarch/virt.c | 122 ++++++++++++++++++++++++++--- include/hw/intc/loongarch_extioi.h | 21 +++++ include/hw/loongarch/virt.h | 3 + target/loongarch/cpu.h | 1 + 5 files changed, 220 insertions(+), 15 deletions(-) diff --git a/hw/intc/loongarch_extioi.c b/hw/intc/loongarch_extioi.c index bdfa3b481e..fa23e247ca 100644 --- a/hw/intc/loongarch_extioi.c +++ b/hw/intc/loongarch_extioi.c @@ -143,15 +143,17 @@ static inline void extioi_update_sw_coremap(LoongArchExtIOI *s, int irq, for (i = 0; i < 4; i++) { cpu = val & 0xff; - cpu = ctz32(cpu); - cpu = (cpu >= 4) ? 0 : cpu; + if (!(s->status & BIT(EXTIOI_ENABLE_CPU_ENCODE))) { + cpu = ctz32(cpu); + cpu = (cpu >= 4) ? 
0 : cpu; + } val = val >> 8; if (s->sw_coremap[irq + i] == cpu) { continue; } - if (notify && test_bit(irq, (unsigned long *)s->isr)) { + if (notify && test_bit(irq + i, (unsigned long *)s->isr)) { /* * lower irq at old cpu and raise irq at new cpu */ @@ -265,6 +267,61 @@ static const MemoryRegionOps extioi_ops = { .endianness = DEVICE_LITTLE_ENDIAN, }; +static MemTxResult extioi_virt_readw(void *opaque, hwaddr addr, uint64_t *data, + unsigned size, MemTxAttrs attrs) +{ + LoongArchExtIOI *s = LOONGARCH_EXTIOI(opaque); + + switch (addr) { + case EXTIOI_VIRT_FEATURES: + *data = s->features; + break; + case EXTIOI_VIRT_CONFIG: + *data = s->status; + break; + default: + break; + } + + return MEMTX_OK; +} + +static MemTxResult extioi_virt_writew(void *opaque, hwaddr addr, + uint64_t val, unsigned size, + MemTxAttrs attrs) +{ + LoongArchExtIOI *s = LOONGARCH_EXTIOI(opaque); + + switch (addr) { + case EXTIOI_VIRT_FEATURES: + return MEMTX_ACCESS_ERROR; + + case EXTIOI_VIRT_CONFIG: + /* + * extioi features can only be set at disabled status + */ + if ((s->status & BIT(EXTIOI_ENABLE)) && val) { + return MEMTX_ACCESS_ERROR; + } + + s->status = val & s->features; + break; + default: + break; + } + return MEMTX_OK; +} + +static const MemoryRegionOps extioi_virt_ops = { + .read_with_attrs = extioi_virt_readw, + .write_with_attrs = extioi_virt_writew, + .impl.min_access_size = 4, + .impl.max_access_size = 4, + .valid.min_access_size = 4, + .valid.max_access_size = 8, + .endianness = DEVICE_LITTLE_ENDIAN, +}; + static void loongarch_extioi_realize(DeviceState *dev, Error **errp) { LoongArchExtIOI *s = LOONGARCH_EXTIOI(dev); @@ -284,6 +341,16 @@ static void loongarch_extioi_realize(DeviceState *dev, Error **errp) memory_region_init_io(&s->extioi_system_mem, OBJECT(s), &extioi_ops, s, "extioi_system_mem", 0x900); sysbus_init_mmio(sbd, &s->extioi_system_mem); + + if (s->features & BIT(EXTIOI_HAS_VIRT_EXTENSION)) { + memory_region_init_io(&s->virt_extend, OBJECT(s), &extioi_virt_ops, + s, "extioi_virt", EXTIOI_VIRT_SIZE); + sysbus_init_mmio(sbd, &s->virt_extend); + s->features |= EXTIOI_VIRT_HAS_FEATURES; + } else { + s->status |= BIT(EXTIOI_ENABLE); + } + s->cpu = g_new0(ExtIOICore, s->num_cpu); if (s->cpu == NULL) { error_setg(errp, "Memory allocation for ExtIOICore faile"); @@ -304,6 +371,16 @@ static void loongarch_extioi_finalize(Object *obj) g_free(s->cpu); } +static void loongarch_extioi_reset(DeviceState *d) +{ + LoongArchExtIOI *s = LOONGARCH_EXTIOI(d); + + /* use legacy interrupt routing method by default */ + if (s->features & BIT(EXTIOI_HAS_VIRT_EXTENSION)) { + s->status = 0; + } +} + static int vmstate_extioi_post_load(void *opaque, int version_id) { LoongArchExtIOI *s = LOONGARCH_EXTIOI(opaque); @@ -347,12 +424,16 @@ static const VMStateDescription vmstate_loongarch_extioi = { VMSTATE_STRUCT_VARRAY_POINTER_UINT32(cpu, LoongArchExtIOI, num_cpu, vmstate_extioi_core, ExtIOICore), + VMSTATE_UINT32(features, LoongArchExtIOI), + VMSTATE_UINT32(status, LoongArchExtIOI), VMSTATE_END_OF_LIST() } }; static Property extioi_properties[] = { DEFINE_PROP_UINT32("num-cpu", LoongArchExtIOI, num_cpu, 1), + DEFINE_PROP_BIT("has-virtualization-extension", LoongArchExtIOI, features, + EXTIOI_HAS_VIRT_EXTENSION, 0), DEFINE_PROP_END_OF_LIST(), }; @@ -361,6 +442,7 @@ static void loongarch_extioi_class_init(ObjectClass *klass, void *data) DeviceClass *dc = DEVICE_CLASS(klass); dc->realize = loongarch_extioi_realize; + dc->reset = loongarch_extioi_reset; device_class_set_props(dc, extioi_properties); dc->vmsd = 
&vmstate_loongarch_extioi; } diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index 6ef40fa24a..01e59f3a95 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -15,6 +15,8 @@ #include "sysemu/runstate.h" #include "sysemu/reset.h" #include "sysemu/rtc.h" +#include "sysemu/tcg.h" +#include "sysemu/kvm.h" #include "hw/loongarch/virt.h" #include "exec/address-spaces.h" #include "hw/irq.h" @@ -54,6 +56,31 @@ struct loaderparams { const char *initrd_filename; }; +static bool virt_is_veiointc_enabled(LoongArchMachineState *lams) +{ + if (lams->veiointc == ON_OFF_AUTO_OFF) { + return false; + } + return true; +} + +static void virt_get_veiointc(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + LoongArchMachineState *lams = LOONGARCH_MACHINE(obj); + OnOffAuto veiointc = lams->veiointc; + + visit_type_OnOffAuto(v, name, &veiointc, errp); +} + +static void virt_set_veiointc(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + LoongArchMachineState *lams = LOONGARCH_MACHINE(obj); + + visit_type_OnOffAuto(v, name, &lams->veiointc, errp); +} + static PFlashCFI01 *virt_flash_create1(LoongArchMachineState *lams, const char *name, const char *alias_prop_name) @@ -618,9 +645,18 @@ static void loongarch_irq_init(LoongArchMachineState *lams) /* Create EXTIOI device */ extioi = qdev_new(TYPE_LOONGARCH_EXTIOI); qdev_prop_set_uint32(extioi, "num-cpu", ms->smp.cpus); + if (virt_is_veiointc_enabled(lams)) { + qdev_prop_set_bit(extioi, "has-virtualization-extension", true); + } sysbus_realize_and_unref(SYS_BUS_DEVICE(extioi), &error_fatal); + memory_region_add_subregion(&lams->system_iocsr, APIC_BASE, sysbus_mmio_get_region(SYS_BUS_DEVICE(extioi), 0)); + if (virt_is_veiointc_enabled(lams)) { + memory_region_add_subregion(&lams->system_iocsr, EXTIOI_VIRT_BASE, + sysbus_mmio_get_region(SYS_BUS_DEVICE(extioi), 1)); + } + lams->extioi = extioi; /* * connect ext irq to the cpu irq @@ -780,32 +816,87 @@ static void loongarch_direct_kernel_boot(LoongArchMachineState *lams, } } -static void loongarch_qemu_write(void *opaque, hwaddr addr, - uint64_t val, unsigned size) +static MemTxResult loongarch_qemu_write(void *opaque, hwaddr addr, uint64_t val, + unsigned size, MemTxAttrs attrs) { + LoongArchMachineState *lams = LOONGARCH_MACHINE(opaque); + uint64_t features; + + switch (addr) { + case MISC_FUNC_REG: + if (!virt_is_veiointc_enabled(lams)) { + return MEMTX_OK; + } + + features = address_space_ldl(&lams->as_iocsr, + EXTIOI_VIRT_BASE + EXTIOI_VIRT_CONFIG, + attrs, NULL); + if (val & BIT_ULL(IOCSRM_EXTIOI_EN)) { + features |= BIT(EXTIOI_ENABLE); + } + if (val & BIT_ULL(IOCSRM_EXTIOI_INT_ENCODE)) { + features |= BIT(EXTIOI_ENABLE_INT_ENCODE); + } + + address_space_stl(&lams->as_iocsr, + EXTIOI_VIRT_BASE + EXTIOI_VIRT_CONFIG, + features, attrs, NULL); + } + + return MEMTX_OK; } -static uint64_t loongarch_qemu_read(void *opaque, hwaddr addr, unsigned size) +static MemTxResult loongarch_qemu_read(void *opaque, hwaddr addr, + uint64_t *data, + unsigned size, MemTxAttrs attrs) { + LoongArchMachineState *lams = LOONGARCH_MACHINE(opaque); + uint64_t ret = 0; + int features; + switch (addr) { case VERSION_REG: - return 0x11ULL; + ret = 0x11ULL; + break; case FEATURE_REG: - return 1ULL << IOCSRF_MSI | 1ULL << IOCSRF_EXTIOI | - 1ULL << IOCSRF_CSRIPI; + ret = 1ULL << IOCSRF_MSI | 1ULL << IOCSRF_EXTIOI | + 1ULL << IOCSRF_CSRIPI; + if (kvm_enabled()) { + ret |= 1ULL << IOCSRF_VM; + } + break; case VENDOR_REG: - return 0x6e6f73676e6f6f4cULL; /* "Loongson" */ + ret = 
0x6e6f73676e6f6f4cULL; /* "Loongson" */ + break; case CPUNAME_REG: - return 0x303030354133ULL; /* "3A5000" */ + ret = 0x303030354133ULL; /* "3A5000" */ + break; case MISC_FUNC_REG: - return 1ULL << IOCSRM_EXTIOI_EN; + if (!virt_is_veiointc_enabled(lams)) { + ret |= BIT_ULL(IOCSRM_EXTIOI_EN); + break; + } + + features = address_space_ldl(&lams->as_iocsr, + EXTIOI_VIRT_BASE + EXTIOI_VIRT_CONFIG, + attrs, NULL); + if (features & BIT(EXTIOI_ENABLE)) { + ret |= BIT_ULL(IOCSRM_EXTIOI_EN); + } + + if (features & BIT(EXTIOI_ENABLE_INT_ENCODE)) { + ret |= BIT_ULL(IOCSRM_EXTIOI_INT_ENCODE); + } + break; } - return 0ULL; + + *data = ret; + return MEMTX_OK; } static const MemoryRegionOps loongarch_qemu_ops = { - .read = loongarch_qemu_read, - .write = loongarch_qemu_write, + .read_with_attrs = loongarch_qemu_read, + .write_with_attrs = loongarch_qemu_write, .endianness = DEVICE_LITTLE_ENDIAN, .valid = { .min_access_size = 4, @@ -1010,6 +1101,9 @@ static void loongarch_machine_initfn(Object *obj) { LoongArchMachineState *lams = LOONGARCH_MACHINE(obj); + if (tcg_enabled()) { + lams->veiointc = ON_OFF_AUTO_OFF; + } lams->acpi = ON_OFF_AUTO_AUTO; lams->oem_id = g_strndup(ACPI_BUILD_APPNAME6, 6); lams->oem_table_id = g_strndup(ACPI_BUILD_APPNAME8, 8); @@ -1197,6 +1291,10 @@ static void loongarch_class_init(ObjectClass *oc, void *data) NULL, NULL); object_class_property_set_description(oc, "acpi", "Enable ACPI"); + object_class_property_add(oc, "v-eiointc", "OnOffAuto", + virt_get_veiointc, virt_set_veiointc, NULL, NULL); + object_class_property_set_description(oc, "v-eiointc", + "Enable Virt Extend I/O Interrupt Controller"); machine_class_allow_dynamic_sysbus_dev(mc, TYPE_RAMFB_DEVICE); #ifdef CONFIG_TPM machine_class_allow_dynamic_sysbus_dev(mc, TYPE_TPM_TIS_SYSBUS); diff --git a/include/hw/intc/loongarch_extioi.h b/include/hw/intc/loongarch_extioi.h index a0a46b888c..98f348c49d 100644 --- a/include/hw/intc/loongarch_extioi.h +++ b/include/hw/intc/loongarch_extioi.h @@ -40,6 +40,24 @@ #define EXTIOI_COREMAP_START (0xC00 - APIC_OFFSET) #define EXTIOI_COREMAP_END (0xD00 - APIC_OFFSET) +#define EXTIOI_VIRT_BASE (0x40000000) +#define EXTIOI_VIRT_SIZE (0x1000) +#define EXTIOI_VIRT_FEATURES (0x0) +#define EXTIOI_HAS_VIRT_EXTENSION (0) +#define EXTIOI_HAS_ENABLE_OPTION (1) +#define EXTIOI_HAS_INT_ENCODE (2) +#define EXTIOI_HAS_CPU_ENCODE (3) +#define EXTIOI_VIRT_HAS_FEATURES (BIT(EXTIOI_HAS_VIRT_EXTENSION) \ + | BIT(EXTIOI_HAS_ENABLE_OPTION)\ + | BIT(EXTIOI_HAS_INT_ENCODE) \ + | BIT(EXTIOI_HAS_CPU_ENCODE)) +#define EXTIOI_VIRT_CONFIG (0x4) +#define EXTIOI_ENABLE (1) +#define EXTIOI_ENABLE_INT_ENCODE (2) +#define EXTIOI_ENABLE_CPU_ENCODE (3) +#define EXTIOI_VIRT_COREMAP_START (0x40) +#define EXTIOI_VIRT_COREMAP_END (0x240) + typedef struct ExtIOICore { uint32_t coreisr[EXTIOI_IRQS_GROUP_COUNT]; DECLARE_BITMAP(sw_isr[LS3A_INTC_IP], EXTIOI_IRQS); @@ -51,6 +69,8 @@ OBJECT_DECLARE_SIMPLE_TYPE(LoongArchExtIOI, LOONGARCH_EXTIOI) struct LoongArchExtIOI { SysBusDevice parent_obj; uint32_t num_cpu; + uint32_t features; + uint32_t status; /* hardware state */ uint32_t nodetype[EXTIOI_IRQS_NODETYPE_COUNT / 2]; uint32_t bounce[EXTIOI_IRQS_GROUP_COUNT]; @@ -64,5 +84,6 @@ struct LoongArchExtIOI { qemu_irq irq[EXTIOI_IRQS]; ExtIOICore *cpu; MemoryRegion extioi_system_mem; + MemoryRegion virt_extend; }; #endif /* LOONGARCH_EXTIOI_H */ diff --git a/include/hw/loongarch/virt.h b/include/hw/loongarch/virt.h index 252f7df7f4..99447fd1d6 100644 --- a/include/hw/loongarch/virt.h +++ b/include/hw/loongarch/virt.h @@ -45,16 +45,19 @@ 
struct LoongArchMachineState { Notifier machine_done; Notifier powerdown_notifier; OnOffAuto acpi; + OnOffAuto veiointc; char *oem_id; char *oem_table_id; DeviceState *acpi_ged; int fdt_size; DeviceState *platform_bus_dev; + DeviceState *extioi; PCIBus *pci_bus; PFlashCFI01 *flash[2]; MemoryRegion system_iocsr; MemoryRegion iocsr_mem; AddressSpace as_iocsr; + int features; }; #define TYPE_LOONGARCH_MACHINE MACHINE_TYPE_NAME("virt") diff --git a/target/loongarch/cpu.h b/target/loongarch/cpu.h index 4aba8aba4c..4749d41c8c 100644 --- a/target/loongarch/cpu.h +++ b/target/loongarch/cpu.h @@ -36,6 +36,7 @@ #define CPUNAME_REG 0x20 #define MISC_FUNC_REG 0x420 #define IOCSRM_EXTIOI_EN 48 +#define IOCSRM_EXTIOI_INT_ENCODE 49 #define IOCSR_MEM_SIZE 0x428 -- Gitee From 6a2fea8f4e2a4c80ca28924db7b66e0446f9181c Mon Sep 17 00:00:00 2001 From: Song Gao Date: Fri, 22 Mar 2024 19:26:35 +0800 Subject: [PATCH 256/939] target/loongarch/kvm: Add pmu support This patch adds PMU support e.g '... -cpu max,pmu=on,pmnum=[1-16]'; '... -cpu max,pmu=on' (default pmnum = 4); '... -cpu max,pmu=off' (disable PMU) Signed-off-by: Song Gao --- target/loongarch/cpu.c | 64 +++++++++++++++++++++++++++ target/loongarch/cpu.h | 2 + target/loongarch/kvm/kvm.c | 55 ++++++++++++++++++++++- target/loongarch/loongarch-qmp-cmds.c | 2 +- 4 files changed, 121 insertions(+), 2 deletions(-) diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c index 0b3f954b64..8e7c8332da 100644 --- a/target/loongarch/cpu.c +++ b/target/loongarch/cpu.c @@ -8,6 +8,7 @@ #include "qemu/osdep.h" #include "qemu/log.h" #include "qemu/qemu-print.h" +#include "qemu/error-report.h" #include "qapi/error.h" #include "qemu/module.h" #include "sysemu/qtest.h" @@ -19,6 +20,7 @@ #include "internals.h" #include "fpu/softfloat-helpers.h" #include "cpu-csr.h" +#include "qapi/visitor.h" #include "sysemu/reset.h" #include "vec.h" #ifdef CONFIG_KVM @@ -426,6 +428,14 @@ static void loongarch_la464_initfn(Object *obj) data = FIELD_DP32(data, CPUCFG5, CC_DIV, 1); env->cpucfg[5] = data; + if (kvm_enabled()) { + data = 0; + data = FIELD_DP32(data, CPUCFG6, PMP, 1); + data = FIELD_DP32(data, CPUCFG6, PMNUM, 3); + data = FIELD_DP32(data, CPUCFG6, PMBITS, 63); + env->cpucfg[6] = data; + } + data = 0; data = FIELD_DP32(data, CPUCFG16, L1_IUPRE, 1); data = FIELD_DP32(data, CPUCFG16, L1_DPRE, 1); @@ -660,6 +670,48 @@ static void loongarch_set_lasx(Object *obj, bool value, Error **errp) } } +static bool loongarch_get_pmu(Object *obj, Error **errp) +{ + LoongArchCPU *cpu = LOONGARCH_CPU(obj); + + return !!(FIELD_EX32(cpu->env.cpucfg[6], CPUCFG6, PMP)); +} + +static void loongarch_set_pmu(Object *obj, bool value, Error **errp) +{ + LoongArchCPU *cpu = LOONGARCH_CPU(obj); + + cpu->env.cpucfg[6] = FIELD_DP32(cpu->env.cpucfg[6], CPUCFG6, PMP, value); +} + +static void loongarch_get_pmnum(Object *obj, Visitor *v, + const char *name, void *opaque, + Error **errp) +{ + LoongArchCPU *cpu = LOONGARCH_CPU(obj); + uint32_t value = FIELD_EX32(cpu->env.cpucfg[6], CPUCFG6, PMNUM); + + visit_type_uint32(v, name, &value, errp); +} + +static void loongarch_set_pmnum(Object *obj, Visitor *v, + const char *name, void *opaque, + Error **errp) +{ + LoongArchCPU *cpu = LOONGARCH_CPU(obj); + uint32_t *value= opaque; + + if (!visit_type_uint32(v, name, value, errp)) { + return; + } + if ((*value <= PMNUM_MAX) && (*value > 0)) { + cpu->env.cpucfg[6] = FIELD_DP32(cpu->env.cpucfg[6], CPUCFG6, PMNUM, *value -1); + } else { + error_report("Performance counter number need be in [1- %d]\n", PMNUM_MAX); + 
exit(EXIT_FAILURE); + } +} + void loongarch_cpu_post_init(Object *obj) { LoongArchCPU *cpu = LOONGARCH_CPU(obj); @@ -672,6 +724,18 @@ void loongarch_cpu_post_init(Object *obj) object_property_add_bool(obj, "lasx", loongarch_get_lasx, loongarch_set_lasx); } + + if (kvm_enabled()) { + object_property_add_bool(obj, "pmu", loongarch_get_pmu, + loongarch_set_pmu); + if (FIELD_EX32(cpu->env.cpucfg[6], CPUCFG6, PMP)) { + uint32_t value = 4; + object_property_add(obj, "pmnum", "uint32", + loongarch_get_pmnum, + loongarch_set_pmnum, NULL, + (void *)&value); + } + } } static void loongarch_cpu_init(Object *obj) diff --git a/target/loongarch/cpu.h b/target/loongarch/cpu.h index 4749d41c8c..80cad24fa1 100644 --- a/target/loongarch/cpu.h +++ b/target/loongarch/cpu.h @@ -186,6 +186,8 @@ FIELD(CPUCFG6, PMNUM, 4, 4) FIELD(CPUCFG6, PMBITS, 8, 6) FIELD(CPUCFG6, UPM, 14, 1) +#define PMNUM_MAX 16 + /* cpucfg[16] bits */ FIELD(CPUCFG16, L1_IUPRE, 0, 1) FIELD(CPUCFG16, L1_IUUNIFY, 1, 1) diff --git a/target/loongarch/kvm/kvm.c b/target/loongarch/kvm/kvm.c index 49d02076ad..5dda631b2b 100644 --- a/target/loongarch/kvm/kvm.c +++ b/target/loongarch/kvm/kvm.c @@ -573,6 +573,53 @@ static int kvm_check_cpucfg2(CPUState *cs) return ret; } +static int kvm_check_cpucfg6(CPUState *cs) +{ + int ret; + uint64_t val; + struct kvm_device_attr attr = { + .group = KVM_LOONGARCH_VCPU_CPUCFG, + .attr = 6, + .addr = (uint64_t)&val, + }; + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + CPULoongArchState *env = &cpu->env; + + ret = kvm_vcpu_ioctl(cs, KVM_HAS_DEVICE_ATTR, &attr); + if (!ret) { + kvm_vcpu_ioctl(cs, KVM_GET_DEVICE_ATTR, &attr); + + if (FIELD_EX32(env->cpucfg[6], CPUCFG6, PMP)) { + /* Check PMP */ + if (!FIELD_EX32(val, CPUCFG6, PMP)) { + error_report("'pmu' feature not supported by KVM on this host" + " Please disable 'pmu' with " + "'... 
-cpu XXX,pmu=off ...'\n"); + exit(EXIT_FAILURE); + } + /* Check PMNUM */ + int guest_pmnum = FIELD_EX32(env->cpucfg[6], CPUCFG6, PMNUM); + int host_pmnum = FIELD_EX32(val, CPUCFG6, PMNUM); + if (guest_pmnum > host_pmnum){ + warn_report("The guest pmnum %d larger than KVM support %d\n", + guest_pmnum, host_pmnum); + env->cpucfg[6] = FIELD_DP32(env->cpucfg[6], CPUCFG6, + PMNUM, host_pmnum); + } + /* Check PMBITS */ + int guest_pmbits = FIELD_EX32(env->cpucfg[6], CPUCFG6, PMBITS); + int host_pmbits = FIELD_EX32(val, CPUCFG6, PMBITS); + if (guest_pmbits != host_pmbits) { + warn_report("The host not support PMBITS %d\n", guest_pmbits); + env->cpucfg[6] = FIELD_DP32(env->cpucfg[6], CPUCFG6, + PMBITS, host_pmbits); + } + } + } + + return ret; +} + static int kvm_loongarch_put_cpucfg(CPUState *cs) { int i, ret = 0; @@ -586,7 +633,13 @@ static int kvm_loongarch_put_cpucfg(CPUState *cs) if (ret) { return ret; } - } + } + if (i == 6) { + ret = kvm_check_cpucfg6(cs); + if (ret) { + return ret; + } + } val = env->cpucfg[i]; ret = kvm_set_one_reg(cs, KVM_IOC_CPUCFG(i), &val); if (ret < 0) { diff --git a/target/loongarch/loongarch-qmp-cmds.c b/target/loongarch/loongarch-qmp-cmds.c index 645672ff59..2612f43de9 100644 --- a/target/loongarch/loongarch-qmp-cmds.c +++ b/target/loongarch/loongarch-qmp-cmds.c @@ -42,7 +42,7 @@ CpuDefinitionInfoList *qmp_query_cpu_definitions(Error **errp) } static const char *cpu_model_advertised_features[] = { - "lsx", "lasx", NULL + "lsx", "lasx", "pmu", "pmnum", NULL }; CpuModelExpansionInfo *qmp_query_cpu_model_expansion(CpuModelExpansionType type, -- Gitee From b3c8a372c63bd1384a5896abeb4e8b0ce21d93cf Mon Sep 17 00:00:00 2001 From: Song Gao Date: Wed, 24 Apr 2024 14:18:46 +0800 Subject: [PATCH 257/939] target/loongarch/kvm: Fix vm restore failed The vmstate_loongarch_cpu need kvm_state_counter. 
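For reference, migrating one extra per-CPU value in QEMU normally takes a single VMSTATE_* entry in the owning VMStateDescription; a minimal sketch of the idiom (ExampleState and counter are made-up names, not the actual change below; the macros come from migration/vmstate.h):

    typedef struct ExampleState {
        uint64_t counter;   /* value that must survive save/restore */
    } ExampleState;

    static const VMStateDescription vmstate_example = {
        .name = "example",
        .version_id = 1,
        .minimum_version_id = 1,
        .fields = (VMStateField[]) {
            VMSTATE_UINT64(counter, ExampleState), /* field name, owning struct */
            VMSTATE_END_OF_LIST()
        }
    };

Without such an entry the destination never receives the value, which is what broke VM restore here.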
Signed-off-by: Song Gao --- target/loongarch/machine.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/target/loongarch/machine.c b/target/loongarch/machine.c index 1c4e01d076..4443caed2d 100644 --- a/target/loongarch/machine.c +++ b/target/loongarch/machine.c @@ -191,6 +191,8 @@ const VMStateDescription vmstate_loongarch_cpu = { VMSTATE_STRUCT_ARRAY(env.tlb, LoongArchCPU, LOONGARCH_TLB_MAX, 0, vmstate_tlb, LoongArchTLB), + VMSTATE_UINT64(kvm_state_counter, LoongArchCPU), + VMSTATE_END_OF_LIST() }, .subsections = (const VMStateDescription*[]) { -- Gitee From 8b69a1b340da95cacdff252927ca8aef9d43c33a Mon Sep 17 00:00:00 2001 From: Song Gao Date: Wed, 24 Apr 2024 16:06:33 +0800 Subject: [PATCH 258/939] target/loongarch/kvm: Add pv steal time support Signed-off-by: Song Gao --- linux-headers/asm-loongarch/kvm.h | 2 ++ target/loongarch/cpu.h | 3 ++ target/loongarch/kvm/kvm.c | 50 ++++++++++++++++++++++++++++ target/loongarch/kvm/kvm_loongarch.h | 2 ++ target/loongarch/machine.c | 25 ++++++++++++++ 5 files changed, 82 insertions(+) diff --git a/linux-headers/asm-loongarch/kvm.h b/linux-headers/asm-loongarch/kvm.h index 4cec8c1601..81fec85f0a 100644 --- a/linux-headers/asm-loongarch/kvm.h +++ b/linux-headers/asm-loongarch/kvm.h @@ -84,6 +84,8 @@ struct kvm_fpu { #define KVM_IOC_CSRID(REG) LOONGARCH_REG_64(KVM_REG_LOONGARCH_CSR, REG) #define KVM_IOC_CPUCFG(REG) LOONGARCH_REG_64(KVM_REG_LOONGARCH_CPUCFG, REG) #define KVM_LOONGARCH_VCPU_CPUCFG 0 +#define KVM_LOONGARCH_VCPU_PVTIME_CTRL 1 +#define KVM_LOONGARCH_VCPU_PVTIME_GPA 0 struct kvm_debug_exit_arch { }; diff --git a/target/loongarch/cpu.h b/target/loongarch/cpu.h index 80cad24fa1..0ed24051af 100644 --- a/target/loongarch/cpu.h +++ b/target/loongarch/cpu.h @@ -365,6 +365,9 @@ typedef struct CPUArchState { /* Store ipistate to access from this struct */ DeviceState *ipistate; #endif + struct { + uint64_t guest_addr; + } st; } CPULoongArchState; /** diff --git a/target/loongarch/kvm/kvm.c b/target/loongarch/kvm/kvm.c index 5dda631b2b..e1d521a1de 100644 --- a/target/loongarch/kvm/kvm.c +++ b/target/loongarch/kvm/kvm.c @@ -649,6 +649,56 @@ static int kvm_loongarch_put_cpucfg(CPUState *cs) return ret; } +int kvm_loongarch_put_pvtime(LoongArchCPU *cpu) +{ + CPULoongArchState *env = &cpu->env; + int err; + struct kvm_device_attr attr = { + .group = KVM_LOONGARCH_VCPU_PVTIME_CTRL, + .attr = KVM_LOONGARCH_VCPU_PVTIME_GPA, + .addr = (uint64_t)&env->st.guest_addr, + }; + + err = kvm_vcpu_ioctl(CPU(cpu), KVM_HAS_DEVICE_ATTR, attr); + if (err != 0) { + /* It's ok even though kvm has not such attr */ + return 0; + } + + err = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_DEVICE_ATTR, attr); + if (err != 0) { + error_report("PVTIME IPA: KVM_SET_DEVICE_ATTR: %s", strerror(-err)); + return err; + } + + return 0; +} + +int kvm_loongarch_get_pvtime(LoongArchCPU *cpu) +{ + CPULoongArchState *env = &cpu->env; + int err; + struct kvm_device_attr attr = { + .group = KVM_LOONGARCH_VCPU_PVTIME_CTRL, + .attr = KVM_LOONGARCH_VCPU_PVTIME_GPA, + .addr = (uint64_t)&env->st.guest_addr, + }; + + err = kvm_vcpu_ioctl(CPU(cpu), KVM_HAS_DEVICE_ATTR, attr); + if (err != 0) { + /* It's ok even though kvm has not such attr */ + return 0; + } + + err = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_DEVICE_ATTR, attr); + if (err != 0) { + error_report("PVTIME IPA: KVM_GET_DEVICE_ATTR: %s", strerror(-err)); + return err; + } + + return 0; +} + int kvm_arch_get_registers(CPUState *cs) { int ret; diff --git a/target/loongarch/kvm/kvm_loongarch.h b/target/loongarch/kvm/kvm_loongarch.h index 
d945b6bb82..551878a725 100644 --- a/target/loongarch/kvm/kvm_loongarch.h +++ b/target/loongarch/kvm/kvm_loongarch.h @@ -12,5 +12,7 @@ int kvm_loongarch_set_interrupt(LoongArchCPU *cpu, int irq, int level); void kvm_arch_reset_vcpu(CPULoongArchState *env); +int kvm_loongarch_put_pvtime(LoongArchCPU *cpu); +int kvm_loongarch_get_pvtime(LoongArchCPU *cpu); #endif diff --git a/target/loongarch/machine.c b/target/loongarch/machine.c index 4443caed2d..ec5abe56db 100644 --- a/target/loongarch/machine.c +++ b/target/loongarch/machine.c @@ -9,6 +9,8 @@ #include "cpu.h" #include "migration/cpu.h" #include "vec.h" +#include "kvm/kvm_loongarch.h" +#include "sysemu/kvm.h" static const VMStateDescription vmstate_fpu_reg = { .name = "fpu_reg", @@ -122,15 +124,38 @@ const VMStateDescription vmstate_tlb = { } }; +static int cpu_post_load(void *opaque, int version_id) +{ +#ifdef CONFIG_KVM + LoongArchCPU *cpu = opaque; + kvm_loongarch_put_pvtime(cpu); +#endif + return 0; +} + +static int cpu_pre_save(void *opaque) +{ +#ifdef CONFIG_KVM + LoongArchCPU *cpu = opaque; + kvm_loongarch_get_pvtime(cpu); +#endif + return 0; +} + /* LoongArch CPU state */ const VMStateDescription vmstate_loongarch_cpu = { .name = "cpu", .version_id = 1, .minimum_version_id = 1, + .post_load = cpu_post_load, + .pre_save = cpu_pre_save, .fields = (VMStateField[]) { VMSTATE_UINTTL_ARRAY(env.gpr, LoongArchCPU, 32), VMSTATE_UINTTL(env.pc, LoongArchCPU), + /* PV time */ + VMSTATE_UINT64(env.st.guest_addr, LoongArchCPU), + /* Remaining CSRs */ VMSTATE_UINT64(env.CSR_CRMD, LoongArchCPU), VMSTATE_UINT64(env.CSR_PRMD, LoongArchCPU), -- Gitee From 2e5fd7f2e6027899e84984bc31f52d4dda3b89ed Mon Sep 17 00:00:00 2001 From: qihao Date: Tue, 21 May 2024 14:35:19 +0800 Subject: [PATCH 259/939] ui/gtk: Fix mouse/motion event scaling issue with GTK display backend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cheery-pick from 37e91415018db3656b46cdea8f9e4d47b3ff130d Remove gtk_widget_get_scale_factor() usage from the calculation of the motion events in the GTK backend to make it work correctly on environments that have `gtk_widget_get_scale_factor() != 1`. This scale factor usage had been introduced in the commit f14aab420c and at that time the window size was used for calculating the things and it was working correctly. However, in the commit 2f31663ed4 the logic switched to use the widget size instead of window size and because of the change the usage of scale factor becomes invalid (since widgets use `vc->gfx.scale_{x, y}` for scaling). Tested on Crostini on ChromeOS (15823.51.0) with an external display. 
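As a worked example with made-up numbers: for a 1280-pixel-wide frame buffer centered in a 1600-pixel-wide widget with vc->gfx.scale_x == 1.0, the padding is mx = (1600 - 1280) / 2 = 160, so a motion event at widget x = 800 now maps to guest x = (800 - 160) / 1.0 = 640; the removed multiplication by the GTK scale factor would have produced 1280 on a factor-2 hidpi display and pushed the pointer off the 1280-pixel-wide guest screen.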
Fixes: 2f31663ed4 ("ui/gtk: use widget size for cursor motion event") Fixes: f14aab420c ("ui: fix incorrect pointer position on highdpi with gtk") Signed-off-by: hikalium Acked-by: Marc-André Lureau Message-Id: <20240512111435.30121-3-hikalium@hikalium.com> Signed-off-by: qihao_yewu --- ui/gtk.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/ui/gtk.c b/ui/gtk.c index 810d7fc796..1a69f6fc37 100644 --- a/ui/gtk.c +++ b/ui/gtk.c @@ -887,7 +887,7 @@ static gboolean gd_motion_event(GtkWidget *widget, GdkEventMotion *motion, int x, y; int mx, my; int fbh, fbw; - int ww, wh, ws; + int ww, wh; if (!vc->gfx.ds) { return TRUE; @@ -898,8 +898,13 @@ static gboolean gd_motion_event(GtkWidget *widget, GdkEventMotion *motion, ww = gtk_widget_get_allocated_width(widget); wh = gtk_widget_get_allocated_height(widget); - ws = gtk_widget_get_scale_factor(widget); + /* + * `widget` may not have the same size with the frame buffer. + * In such cases, some paddings are needed around the `vc`. + * To achieve that, `vc` will be displayed at (mx, my) + * so that it is displayed at the center of the widget. + */ mx = my = 0; if (ww > fbw) { mx = (ww - fbw) / 2; @@ -908,8 +913,12 @@ static gboolean gd_motion_event(GtkWidget *widget, GdkEventMotion *motion, my = (wh - fbh) / 2; } - x = (motion->x - mx) / vc->gfx.scale_x * ws; - y = (motion->y - my) / vc->gfx.scale_y * ws; + /* + * `motion` is reported in `widget` coordinates + * so translating it to the coordinates in `vc`. + */ + x = (motion->x - mx) / vc->gfx.scale_x; + y = (motion->y - my) / vc->gfx.scale_y; if (qemu_input_is_absolute(vc->gfx.dcl.con)) { if (x < 0 || y < 0 || -- Gitee From 7a3573ce009afa271168829da86e2c70c63fa58a Mon Sep 17 00:00:00 2001 From: Song Gao Date: Tue, 14 May 2024 19:07:52 +0800 Subject: [PATCH 260/939] target/loongarch/kvm: fpu save the vreg registers high 192bit On kvm side, get_fpu/set_fpu save the vreg registers high 192bits, but QEMU missing. Signed-off-by: Song Gao Reviewed-by: Bibo Mao Message-Id: <20240514110752.989572-1-gaosong@loongson.cn> --- target/loongarch/kvm/kvm.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/target/loongarch/kvm/kvm.c b/target/loongarch/kvm/kvm.c index e1d521a1de..5c88270132 100644 --- a/target/loongarch/kvm/kvm.c +++ b/target/loongarch/kvm/kvm.c @@ -444,6 +444,9 @@ static int kvm_loongarch_get_regs_fp(CPUState *cs) env->fcsr0 = fpu.fcsr; for (i = 0; i < 32; i++) { env->fpr[i].vreg.UD[0] = fpu.fpr[i].val64[0]; + env->fpr[i].vreg.UD[1] = fpu.fpr[i].val64[1]; + env->fpr[i].vreg.UD[2] = fpu.fpr[i].val64[2]; + env->fpr[i].vreg.UD[3] = fpu.fpr[i].val64[3]; } for (i = 0; i < 8; i++) { env->cf[i] = fpu.fcc & 0xFF; @@ -465,6 +468,9 @@ static int kvm_loongarch_put_regs_fp(CPUState *cs) fpu.fcc = 0; for (i = 0; i < 32; i++) { fpu.fpr[i].val64[0] = env->fpr[i].vreg.UD[0]; + fpu.fpr[i].val64[1] = env->fpr[i].vreg.UD[1]; + fpu.fpr[i].val64[2] = env->fpr[i].vreg.UD[2]; + fpu.fpr[i].val64[3] = env->fpr[i].vreg.UD[3]; } for (i = 0; i < 8; i++) { -- Gitee From 85d1711807bc1ec0118cdc9f7cbf9a6e6b96db76 Mon Sep 17 00:00:00 2001 From: Keqian Zhu Date: Thu, 23 May 2024 15:51:35 +0800 Subject: [PATCH 261/939] arm/virt: Set vcpus_count of CPU as 1 to compatible with libvirt If vcpus_count is greater than 1, use libvirt to hotplug vcpu will fail: "An error occurred, but the cause is unknown". 
Signed-off-by: Keqian Zhu --- hw/arm/virt.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/hw/arm/virt.c b/hw/arm/virt.c index dfe4d9e129..a6e324c6f8 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -3064,7 +3064,6 @@ static const CPUArchIdList *virt_possible_cpu_arch_ids(MachineState *ms) { int n; unsigned int max_cpus = ms->smp.max_cpus; - unsigned int smp_threads = ms->smp.threads; VirtMachineState *vms = VIRT_MACHINE(ms); MachineClass *mc = MACHINE_GET_CLASS(vms); @@ -3078,7 +3077,7 @@ static const CPUArchIdList *virt_possible_cpu_arch_ids(MachineState *ms) ms->possible_cpus->len = max_cpus; for (n = 0; n < ms->possible_cpus->len; n++) { ms->possible_cpus->cpus[n].type = ms->cpu_type; - ms->possible_cpus->cpus[n].vcpus_count = smp_threads; + ms->possible_cpus->cpus[n].vcpus_count = 1; ms->possible_cpus->cpus[n].arch_id = virt_cpu_mp_affinity(vms, n); -- Gitee From 7d4bc795419a69457ee5f2e32674183dc009d48f Mon Sep 17 00:00:00 2001 From: Yanjing Zhou Date: Wed, 15 May 2024 13:49:19 +0800 Subject: [PATCH 262/939] target/i386: Add Hygon Dhyana-v3 CPU model Add the following feature bits for Dhyana CPU model: perfctr-core, clzero, xsaveerptr, aes, pclmulqdq, sha-ni Disable xsaves feature bit for Erratum 1386 Signed-off-by: Yanjing Zhou --- target/i386/cpu.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index fd32c64f99..f4c22f32c6 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -4793,6 +4793,20 @@ static const X86CPUDefinition builtin_x86_defs[] = { { /* end of list */ } }, }, + { .version = 3, + .props = (PropValue[]) { + { "xsaves", "off" }, + { "perfctr-core", "on" }, + { "clzero", "on" }, + { "xsaveerptr", "on" }, + { "aes", "on" }, + { "pclmulqdq", "on" }, + { "sha-ni", "on" }, + { "model-id", + "Hygon Dhyana-v3 processor" }, + { /* end of list */ } + }, + }, { /* end of list */ } } }, -- Gitee From f4d31d640491c66bb1277e12d3c1d0e7ebc7cae5 Mon Sep 17 00:00:00 2001 From: Yanjing Zhou Date: Wed, 15 May 2024 13:50:17 +0800 Subject: [PATCH 263/939] target/i386: Add new Hygon 'Dharma' CPU model Add the following feature bits compare to Dhyana CPU model: stibp, ibrs, umip, ssbd Signed-off-by: Yanjing Zhou --- target/i386/cpu.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index f4c22f32c6..711370d9b8 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -2162,6 +2162,56 @@ static const CPUCaches epyc_genoa_cache_info = { }, }; +static const CPUCaches dharma_cache_info = { + .l1d_cache = &(CPUCacheInfo) { + .type = DATA_CACHE, + .level = 1, + .size = 32 * KiB, + .line_size = 64, + .associativity = 8, + .partitions = 1, + .sets = 64, + .lines_per_tag = 1, + .self_init = 1, + .no_invd_sharing = true, + }, + .l1i_cache = &(CPUCacheInfo) { + .type = INSTRUCTION_CACHE, + .level = 1, + .size = 32 * KiB, + .line_size = 64, + .associativity = 8, + .partitions = 1, + .sets = 64, + .lines_per_tag = 1, + .self_init = 1, + .no_invd_sharing = true, + }, + .l2_cache = &(CPUCacheInfo) { + .type = UNIFIED_CACHE, + .level = 2, + .size = 512 * KiB, + .line_size = 64, + .associativity = 8, + .partitions = 1, + .sets = 1024, + .lines_per_tag = 1, + }, + .l3_cache = &(CPUCacheInfo) { + .type = UNIFIED_CACHE, + .level = 3, + .size = 16 * MiB, + .line_size = 64, + .associativity = 16, + .partitions = 1, + .sets = 16384, + .lines_per_tag = 1, + .self_init = true, + .inclusive = true, + .complex_indexing = true, + }, +}; + /* The 
following VMX features are not supported by KVM and are left out in the * CPU definitions: * @@ -5038,6 +5088,55 @@ static const X86CPUDefinition builtin_x86_defs[] = { .model_id = "AMD EPYC-Genoa Processor", .cache_info = &epyc_genoa_cache_info, }, + { + .name = "Dharma", + .level = 0xd, + .vendor = CPUID_VENDOR_HYGON, + .family = 24, + .model = 4, + .stepping = 0, + .features[FEAT_1_EDX] = + CPUID_SSE2 | CPUID_SSE | CPUID_FXSR | CPUID_MMX | CPUID_CLFLUSH | + CPUID_PSE36 | CPUID_PAT | CPUID_CMOV | CPUID_MCA | CPUID_PGE | + CPUID_MTRR | CPUID_SEP | CPUID_APIC | CPUID_CX8 | CPUID_MCE | + CPUID_PAE | CPUID_MSR | CPUID_TSC | CPUID_PSE | CPUID_DE | + CPUID_VME | CPUID_FP87, + .features[FEAT_1_ECX] = + CPUID_EXT_RDRAND | CPUID_EXT_F16C | CPUID_EXT_AVX | + CPUID_EXT_XSAVE | CPUID_EXT_AES | CPUID_EXT_POPCNT | + CPUID_EXT_MOVBE | CPUID_EXT_SSE42 | CPUID_EXT_SSE41 | + CPUID_EXT_CX16 | CPUID_EXT_FMA | CPUID_EXT_SSSE3 | + CPUID_EXT_MONITOR | CPUID_EXT_PCLMULQDQ | CPUID_EXT_SSE3, + .features[FEAT_8000_0001_EDX] = + CPUID_EXT2_LM | CPUID_EXT2_RDTSCP | CPUID_EXT2_PDPE1GB | + CPUID_EXT2_FFXSR | CPUID_EXT2_MMXEXT | CPUID_EXT2_NX | + CPUID_EXT2_SYSCALL, + .features[FEAT_8000_0001_ECX] = + CPUID_EXT3_OSVW | CPUID_EXT3_3DNOWPREFETCH | + CPUID_EXT3_MISALIGNSSE | CPUID_EXT3_SSE4A | CPUID_EXT3_ABM | + CPUID_EXT3_CR8LEG | CPUID_EXT3_SVM | CPUID_EXT3_LAHF_LM | + CPUID_EXT3_TOPOEXT | CPUID_EXT3_PERFCORE, + .features[FEAT_8000_0008_EBX] = + CPUID_8000_0008_EBX_CLZERO | CPUID_8000_0008_EBX_XSAVEERPTR | + CPUID_8000_0008_EBX_IBPB | CPUID_8000_0008_EBX_IBRS | + CPUID_8000_0008_EBX_STIBP | CPUID_8000_0008_EBX_AMD_SSBD, + .features[FEAT_7_0_EBX] = + CPUID_7_0_EBX_FSGSBASE | CPUID_7_0_EBX_BMI1 | CPUID_7_0_EBX_AVX2 | + CPUID_7_0_EBX_SMEP | CPUID_7_0_EBX_BMI2 | CPUID_7_0_EBX_RDSEED | + CPUID_7_0_EBX_ADX | CPUID_7_0_EBX_SMAP | CPUID_7_0_EBX_CLFLUSHOPT | + CPUID_7_0_EBX_SHA_NI, + .features[FEAT_7_0_ECX] = CPUID_7_0_ECX_UMIP, + .features[FEAT_XSAVE] = + CPUID_XSAVE_XSAVEOPT | CPUID_XSAVE_XSAVEC | + CPUID_XSAVE_XGETBV1, + .features[FEAT_6_EAX] = + CPUID_6_EAX_ARAT, + .features[FEAT_SVM] = + CPUID_SVM_NPT | CPUID_SVM_NRIPSAVE, + .xlevel = 0x8000001E, + .model_id = "Hygon Dharma Processor", + .cache_info = &dharma_cache_info, + }, }; /* -- Gitee From c7c526af0bb4de631e2e5f1d38518beb8fa5a8a4 Mon Sep 17 00:00:00 2001 From: qihao Date: Wed, 5 Jun 2024 15:21:06 +0800 Subject: [PATCH 264/939] target/riscv/cpu.c: fix Zvkb extension config MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cheery-pick from ff33b7a9699e977a050a1014c617a89da1bf8295 This code has a typo that writes zvkb to zvkg, causing users can't enable zvkb through the config. This patch gets this fixed. 
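As a sketch of the user-visible effect (the cpu model and the rest of the
command line are placeholders):

    $ qemu-system-riscv64 -cpu rv64,zvkb=on ...

Before this fix the "zvkb" property silently toggled ext_zvkg instead of
ext_zvkb, so Zvkb could not actually be enabled this way.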
Signed-off-by: Yangyu Chen Fixes: ea61ef7097d0 ("target/riscv: Move vector crypto extensions to riscv_cpu_extensions") Reviewed-by: LIU Zhiwei Reviewed-by: Alistair Francis Reviewed-by: Max Chou Reviewed-by:  Weiwei Li Message-ID: Cc: qemu-stable Signed-off-by: Alistair Francis Signed-off-by: qihao_yewu --- target/riscv/cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c index 83c7c0cf07..77cb59b8a1 100644 --- a/target/riscv/cpu.c +++ b/target/riscv/cpu.c @@ -1359,7 +1359,7 @@ const RISCVCPUMultiExtConfig riscv_cpu_extensions[] = { /* Vector cryptography extensions */ MULTI_EXT_CFG_BOOL("zvbb", ext_zvbb, false), MULTI_EXT_CFG_BOOL("zvbc", ext_zvbc, false), - MULTI_EXT_CFG_BOOL("zvkb", ext_zvkg, false), + MULTI_EXT_CFG_BOOL("zvkb", ext_zvkb, false), MULTI_EXT_CFG_BOOL("zvkg", ext_zvkg, false), MULTI_EXT_CFG_BOOL("zvkned", ext_zvkned, false), MULTI_EXT_CFG_BOOL("zvknha", ext_zvknha, false), -- Gitee From c36b2fb64446013ce8ded7f6bca5787795a17de1 Mon Sep 17 00:00:00 2001 From: qihao Date: Thu, 13 Jun 2024 10:31:49 +0800 Subject: [PATCH 265/939] target/hexagon: idef-parser fix leak of init_list cheery-pick from 95408ad8e24c4364086f185285039e89927dad6c gen_inst_init_args() is called for instructions using a predicate as an rvalue. Upon first call, the list of arguments which might need initialization init_list is freed to indicate that they have been processed. For instructions without an rvalue predicate, gen_inst_init_args() isn't called and init_list will never be freed. Free init_list from free_instruction() if it hasn't already been freed. A comment in free_instruction is also updated. Signed-off-by: Anton Johansson Reviewed-by: Taylor Simpson Reviewed-by: Brian Cain Message-Id: <20240523125901.27797-4-anjo@rev.ng> Signed-off-by: Brian Cain Signed-off-by: qihao_yewu --- target/hexagon/idef-parser/parser-helpers.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/target/hexagon/idef-parser/parser-helpers.c b/target/hexagon/idef-parser/parser-helpers.c index 4af020933a..a83099de6b 100644 --- a/target/hexagon/idef-parser/parser-helpers.c +++ b/target/hexagon/idef-parser/parser-helpers.c @@ -2123,9 +2123,16 @@ void free_instruction(Context *c) g_string_free(g_array_index(c->inst.strings, GString*, i), TRUE); } g_array_free(c->inst.strings, TRUE); + /* + * Free list of arguments that might need initialization, if they haven't + * already been freed. + */ + if (c->inst.init_list) { + g_array_free(c->inst.init_list, TRUE); + } /* Free INAME token value */ g_string_free(c->inst.name, TRUE); - /* Free variables and registers */ + /* Free declared TCGv variables */ g_array_free(c->inst.allocated, TRUE); /* Initialize instruction-specific portion of the context */ memset(&(c->inst), 0, sizeof(Inst)); -- Gitee From 44b6911233ea62a6a57afd90b259064fac3855ea Mon Sep 17 00:00:00 2001 From: qihao Date: Tue, 18 Jun 2024 09:50:38 +0800 Subject: [PATCH 266/939] migration/dirtyrate: Fix segmentation fault cheery-pick from e65152d5483b2c847ec7a947ed52650152cfdcc0 Since the kvm_dirty_ring_enabled function accesses a null kvm_state pointer when the KVM acceleration parameter is not specified, running calc_dirty_rate with the -r or -b option causes a segmentation fault. Signed-off-by: Masato Imai Message-ID: <20240507025010.1968881-1-mii@sfc.wide.ad.jp> [Assert kvm_state when kvm_dirty_ring_enabled was called to fix it. 
- Hyman] Signed-off-by: Hyman Huang Signed-off-by: qihao_yewu --- accel/kvm/kvm-all.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index b791aad1d6..ade7841ca3 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -2343,7 +2343,7 @@ bool kvm_vcpu_id_is_valid(int vcpu_id) bool kvm_dirty_ring_enabled(void) { - return kvm_state->kvm_dirty_ring_size ? true : false; + return kvm_state && kvm_state->kvm_dirty_ring_size; } static void query_stats_cb(StatsResultList **result, StatsTarget target, -- Gitee From 1163031f9e9662c0882c986e5e76d20a7cd9d579 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 11 Apr 2024 15:06:01 +0200 Subject: [PATCH 267/939] qcow2: Don't open data_file with BDRV_O_NO_IO (CVE-2024-4467) One use case for 'qemu-img info' is verifying that untrusted images don't reference an unwanted external file, be it as a backing file or an external data file. To make sure that calling 'qemu-img info' can't already have undesired side effects with a malicious image, just don't open the data file at all with BDRV_O_NO_IO. If nothing ever tries to do I/O, we don't need to have it open. This changes the output of iotests case 061, which used 'qemu-img info' to show that opening an image with an invalid data file fails. After this patch, it succeeds. Replace this part of the test with a qemu-io call, but keep the final 'qemu-img info' to show that the invalid data file is correctly displayed in the output. Fixes: CVE-2024-4467 Cc: qemu-stable@nongnu.org Signed-off-by: Kevin Wolf Reviewed-by: Eric Blake Reviewed-by: Stefan Hajnoczi Reviewed-by: Hanna Czenczek --- block/qcow2.c | 17 ++++++++++++++++- tests/qemu-iotests/061 | 6 ++++-- tests/qemu-iotests/061.out | 8 ++++++-- 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index 13e032bd5e..7af7c0bee4 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -1636,7 +1636,22 @@ qcow2_do_open(BlockDriverState *bs, QDict *options, int flags, goto fail; } - if (open_data_file) { + if (open_data_file && (flags & BDRV_O_NO_IO)) { + /* + * Don't open the data file for 'qemu-img info' so that it can be used + * to verify that an untrusted qcow2 image doesn't refer to external + * files. + * + * Note: This still makes has_data_file() return true. 
+ */ + if (s->incompatible_features & QCOW2_INCOMPAT_DATA_FILE) { + s->data_file = NULL; + } else { + s->data_file = bs->file; + } + qdict_extract_subqdict(options, NULL, "data-file."); + qdict_del(options, "data-file"); + } else if (open_data_file) { /* Open external data file */ bdrv_graph_co_rdunlock(); s->data_file = bdrv_co_open_child(NULL, options, "data-file", bs, diff --git a/tests/qemu-iotests/061 b/tests/qemu-iotests/061 index 53c7d428e3..b71ac097d1 100755 --- a/tests/qemu-iotests/061 +++ b/tests/qemu-iotests/061 @@ -326,12 +326,14 @@ $QEMU_IMG amend -o "data_file=foo" "$TEST_IMG" echo _make_test_img -o "compat=1.1,data_file=$TEST_IMG.data" 64M $QEMU_IMG amend -o "data_file=foo" "$TEST_IMG" -_img_info --format-specific +$QEMU_IO -c "read 0 4k" "$TEST_IMG" 2>&1 | _filter_testdir | _filter_imgfmt +$QEMU_IO -c "open -o data-file.filename=$TEST_IMG.data,file.filename=$TEST_IMG" -c "read 0 4k" | _filter_qemu_io TEST_IMG="data-file.filename=$TEST_IMG.data,file.filename=$TEST_IMG" _img_info --format-specific --image-opts echo $QEMU_IMG amend -o "data_file=" --image-opts "data-file.filename=$TEST_IMG.data,file.filename=$TEST_IMG" -_img_info --format-specific +$QEMU_IO -c "read 0 4k" "$TEST_IMG" 2>&1 | _filter_testdir | _filter_imgfmt +$QEMU_IO -c "open -o data-file.filename=$TEST_IMG.data,file.filename=$TEST_IMG" -c "read 0 4k" | _filter_qemu_io TEST_IMG="data-file.filename=$TEST_IMG.data,file.filename=$TEST_IMG" _img_info --format-specific --image-opts echo diff --git a/tests/qemu-iotests/061.out b/tests/qemu-iotests/061.out index 139fc68177..24c33add7c 100644 --- a/tests/qemu-iotests/061.out +++ b/tests/qemu-iotests/061.out @@ -545,7 +545,9 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 qemu-img: data-file can only be set for images that use an external data file Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 data_file=TEST_DIR/t.IMGFMT.data -qemu-img: Could not open 'TEST_DIR/t.IMGFMT': Could not open 'foo': No such file or directory +qemu-io: can't open device TEST_DIR/t.IMGFMT: Could not open 'foo': No such file or directory +read 4096/4096 bytes at offset 0 +4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) image: TEST_DIR/t.IMGFMT file format: IMGFMT virtual size: 64 MiB (67108864 bytes) @@ -560,7 +562,9 @@ Format specific information: corrupt: false extended l2: false -qemu-img: Could not open 'TEST_DIR/t.IMGFMT': 'data-file' is required for this image +qemu-io: can't open device TEST_DIR/t.IMGFMT: 'data-file' is required for this image +read 4096/4096 bytes at offset 0 +4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) image: TEST_DIR/t.IMGFMT file format: IMGFMT virtual size: 64 MiB (67108864 bytes) -- Gitee From 905b918d99f2b60834b55f24738728ce9972ea29 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 25 Apr 2024 14:49:40 +0200 Subject: [PATCH 268/939] iotests/244: Don't store data-file with protocol in image (CVE-2024-4467) We want to disable filename parsing for data files because it's too easy to abuse in malicious image files. Make the test ready for the change by passing the data file explicitly in command line options. 
Cc: qemu-stable@nongnu.org Signed-off-by: Kevin Wolf Reviewed-by: Eric Blake Reviewed-by: Stefan Hajnoczi Reviewed-by: Hanna Czenczek --- tests/qemu-iotests/244 | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/tests/qemu-iotests/244 b/tests/qemu-iotests/244 index 3e61fa25bb..bb9cc6512f 100755 --- a/tests/qemu-iotests/244 +++ b/tests/qemu-iotests/244 @@ -215,9 +215,22 @@ $QEMU_IMG convert -f $IMGFMT -O $IMGFMT -n -C "$TEST_IMG.src" "$TEST_IMG" $QEMU_IMG compare -f $IMGFMT -F $IMGFMT "$TEST_IMG.src" "$TEST_IMG" # blkdebug doesn't support copy offloading, so this tests the error path -$QEMU_IMG amend -f $IMGFMT -o "data_file=blkdebug::$TEST_IMG.data" "$TEST_IMG" -$QEMU_IMG convert -f $IMGFMT -O $IMGFMT -n -C "$TEST_IMG.src" "$TEST_IMG" -$QEMU_IMG compare -f $IMGFMT -F $IMGFMT "$TEST_IMG.src" "$TEST_IMG" +test_img_with_blkdebug="json:{ + 'driver': 'qcow2', + 'file': { + 'driver': 'file', + 'filename': '$TEST_IMG' + }, + 'data-file': { + 'driver': 'blkdebug', + 'image': { + 'driver': 'file', + 'filename': '$TEST_IMG.data' + } + } +}" +$QEMU_IMG convert -f $IMGFMT -O $IMGFMT -n -C "$TEST_IMG.src" "$test_img_with_blkdebug" +$QEMU_IMG compare -f $IMGFMT -F $IMGFMT "$TEST_IMG.src" "$test_img_with_blkdebug" echo echo "=== Flushing should flush the data file ===" -- Gitee From db48de0be2e1f4b476ffcaa94a4bd2c4b222f077 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 25 Apr 2024 14:49:40 +0200 Subject: [PATCH 269/939] iotests/270: Don't store data-file with json: prefix in image (CVE-2024-4467) We want to disable filename parsing for data files because it's too easy to abuse in malicious image files. Make the test ready for the change by passing the data file explicitly in command line options. Cc: qemu-stable@nongnu.org Signed-off-by: Kevin Wolf Reviewed-by: Eric Blake Reviewed-by: Stefan Hajnoczi Reviewed-by: Hanna Czenczek --- tests/qemu-iotests/270 | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/tests/qemu-iotests/270 b/tests/qemu-iotests/270 index 74352342db..c37b674aa2 100755 --- a/tests/qemu-iotests/270 +++ b/tests/qemu-iotests/270 @@ -60,8 +60,16 @@ _make_test_img -o cluster_size=2M,data_file="$TEST_IMG.orig" \ # "write" 2G of data without using any space. # (qemu-img create does not like it, though, because null-co does not # support image creation.) -$QEMU_IMG amend -o data_file="json:{'driver':'null-co',,'size':'4294967296'}" \ - "$TEST_IMG" +test_img_with_null_data="json:{ + 'driver': '$IMGFMT', + 'file': { + 'filename': '$TEST_IMG' + }, + 'data-file': { + 'driver': 'null-co', + 'size':'4294967296' + } +}" # This gives us a range of: # 2^31 - 512 + 768 - 1 = 2^31 + 255 > 2^31 @@ -74,7 +82,7 @@ $QEMU_IMG amend -o data_file="json:{'driver':'null-co',,'size':'4294967296'}" \ # on L2 boundaries, we need large L2 tables; hence the cluster size of # 2 MB. (Anything from 256 kB should work, though, because then one L2 # table covers 8 GB.) -$QEMU_IO -c "write 768 $((2 ** 31 - 512))" "$TEST_IMG" | _filter_qemu_io +$QEMU_IO -c "write 768 $((2 ** 31 - 512))" "$test_img_with_null_data" | _filter_qemu_io _check_test_img -- Gitee From fc74f24988cc2160d6115337330e8549df3aad0d Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 25 Apr 2024 14:56:02 +0200 Subject: [PATCH 270/939] block: Parse filenames only when explicitly requested (CVE-2024-4467) When handling image filenames from legacy options such as -drive or from tools, these filenames are parsed for protocol prefixes, including for the json:{} pseudo-protocol. 
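As a sketch of the kind of string this refers to -- the image name is a
placeholder and the driver/size mirror the pattern just removed from
iotests/270 above:

    $ qemu-img amend -o data_file="json:{'driver':'null-co',,'size':'4294967296'}" untrusted.qcow2

Any later open that parses the stored data_file string builds an arbitrary
block graph instead of opening a literal local file.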
This behaviour is intended for filenames that come directly from the command line and for backing files, which may come from the image file itself. Higher level management tools generally take care to verify that untrusted images don't contain a bad (or any) backing file reference; 'qemu-img info' is a suitable tool for this. However, for other files that can be referenced in images, such as qcow2 data files or VMDK extents, the string from the image file is usually not verified by management tools - and 'qemu-img info' wouldn't be suitable because in contrast to backing files, it already opens these other referenced files. So here the string should be interpreted as a literal local filename. More complex configurations need to be specified explicitly on the command line or in QMP. This patch changes bdrv_open_inherit() so that it only parses filenames if a new parameter parse_filename is true. It is set for the top level in bdrv_open(), for the file child and for the backing file child. All other callers pass false and disable filename parsing this way. Cc: qemu-stable@nongnu.org Signed-off-by: Kevin Wolf Reviewed-by: Eric Blake Reviewed-by: Stefan Hajnoczi Reviewed-by: Hanna Czenczek Signed-off-by: liuxiangdong --- block.c | 98 +++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 61 insertions(+), 37 deletions(-) diff --git a/block.c b/block.c index 3bfd4be6b4..6a2abfabcb 100644 --- a/block.c +++ b/block.c @@ -89,6 +89,7 @@ static BlockDriverState *bdrv_open_inherit(const char *filename, BlockDriverState *parent, const BdrvChildClass *child_class, BdrvChildRole child_role, + bool parse_filename, Error **errp); static bool bdrv_recurse_has_child(BlockDriverState *bs, @@ -2050,7 +2051,8 @@ static void parse_json_protocol(QDict *options, const char **pfilename, * block driver has been specified explicitly. 
*/ static int bdrv_fill_options(QDict **options, const char *filename, - int *flags, Error **errp) + int *flags, bool allow_parse_filename, + Error **errp) { const char *drvname; bool protocol = *flags & BDRV_O_PROTOCOL; @@ -2092,7 +2094,7 @@ static int bdrv_fill_options(QDict **options, const char *filename, if (protocol && filename) { if (!qdict_haskey(*options, "filename")) { qdict_put_str(*options, "filename", filename); - parse_filename = true; + parse_filename = allow_parse_filename; } else { error_setg(errp, "Can't specify 'file' and 'filename' options at " "the same time"); @@ -3678,7 +3680,8 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options, } backing_hd = bdrv_open_inherit(backing_filename, reference, options, 0, bs, - &child_of_bds, bdrv_backing_role(bs), errp); + &child_of_bds, bdrv_backing_role(bs), true, + errp); if (!backing_hd) { bs->open_flags |= BDRV_O_NO_BACKING; error_prepend(errp, "Could not open backing file: "); @@ -3715,7 +3718,8 @@ free_exit: static BlockDriverState * bdrv_open_child_bs(const char *filename, QDict *options, const char *bdref_key, BlockDriverState *parent, const BdrvChildClass *child_class, - BdrvChildRole child_role, bool allow_none, Error **errp) + BdrvChildRole child_role, bool allow_none, + bool parse_filename, Error **errp) { BlockDriverState *bs = NULL; QDict *image_options; @@ -3746,7 +3750,8 @@ bdrv_open_child_bs(const char *filename, QDict *options, const char *bdref_key, } bs = bdrv_open_inherit(filename, reference, image_options, 0, - parent, child_class, child_role, errp); + parent, child_class, child_role, parse_filename, + errp); if (!bs) { goto done; } @@ -3756,6 +3761,37 @@ done: return bs; } +static BdrvChild *bdrv_open_child_common(const char *filename, + QDict *options, const char *bdref_key, + BlockDriverState *parent, + const BdrvChildClass *child_class, + BdrvChildRole child_role, + bool allow_none, bool parse_filename, + Error **errp) +{ + BlockDriverState *bs; + BdrvChild *child; + AioContext *ctx; + + GLOBAL_STATE_CODE(); + + bs = bdrv_open_child_bs(filename, options, bdref_key, parent, child_class, + child_role, allow_none, parse_filename, errp); + if (bs == NULL) { + return NULL; + } + + bdrv_graph_wrlock(NULL); + ctx = bdrv_get_aio_context(bs); + aio_context_acquire(ctx); + child = bdrv_attach_child(parent, bs, bdref_key, child_class, child_role, + errp); + aio_context_release(ctx); + bdrv_graph_wrunlock(NULL); + + return child; +} + /* * Opens a disk image whose options are given as BlockdevRef in another block * device's options. @@ -3781,31 +3817,15 @@ BdrvChild *bdrv_open_child(const char *filename, BdrvChildRole child_role, bool allow_none, Error **errp) { - BlockDriverState *bs; - BdrvChild *child; - AioContext *ctx; - - GLOBAL_STATE_CODE(); - - bs = bdrv_open_child_bs(filename, options, bdref_key, parent, child_class, - child_role, allow_none, errp); - if (bs == NULL) { - return NULL; - } - - bdrv_graph_wrlock(NULL); - ctx = bdrv_get_aio_context(bs); - aio_context_acquire(ctx); - child = bdrv_attach_child(parent, bs, bdref_key, child_class, child_role, - errp); - aio_context_release(ctx); - bdrv_graph_wrunlock(NULL); - - return child; + return bdrv_open_child_common(filename, options, bdref_key, parent, + child_class, child_role, allow_none, false, + errp); } /* - * Wrapper on bdrv_open_child() for most popular case: open primary child of bs. + * This does mostly the same as bdrv_open_child(), but for opening the primary + * child of a node. 
A notable difference from bdrv_open_child() is that it + * enables filename parsing for protocol names (including json:). * * The caller must hold the lock of the main AioContext and no other AioContext. * @parent can move to a different AioContext in this function. Callers must @@ -3822,8 +3842,8 @@ int bdrv_open_file_child(const char *filename, role = parent->drv->is_filter ? (BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY) : BDRV_CHILD_IMAGE; - if (!bdrv_open_child(filename, options, bdref_key, parent, - &child_of_bds, role, false, errp)) + if (!bdrv_open_child_common(filename, options, bdref_key, parent, + &child_of_bds, role, false, true, errp)) { return -EINVAL; } @@ -3868,7 +3888,8 @@ BlockDriverState *bdrv_open_blockdev_ref(BlockdevRef *ref, Error **errp) } - bs = bdrv_open_inherit(NULL, reference, qdict, 0, NULL, NULL, 0, errp); + bs = bdrv_open_inherit(NULL, reference, qdict, 0, NULL, NULL, 0, false, + errp); obj = NULL; qobject_unref(obj); visit_free(v); @@ -3965,7 +3986,7 @@ static BlockDriverState * no_coroutine_fn bdrv_open_inherit(const char *filename, const char *reference, QDict *options, int flags, BlockDriverState *parent, const BdrvChildClass *child_class, BdrvChildRole child_role, - Error **errp) + bool parse_filename, Error **errp) { int ret; BlockBackend *file = NULL; @@ -4014,9 +4035,11 @@ bdrv_open_inherit(const char *filename, const char *reference, QDict *options, } /* json: syntax counts as explicit options, as if in the QDict */ - parse_json_protocol(options, &filename, &local_err); - if (local_err) { - goto fail; + if (parse_filename) { + parse_json_protocol(options, &filename, &local_err); + if (local_err) { + goto fail; + } } bs->explicit_options = qdict_clone_shallow(options); @@ -4041,7 +4064,8 @@ bdrv_open_inherit(const char *filename, const char *reference, QDict *options, parent->open_flags, parent->options); } - ret = bdrv_fill_options(&options, filename, &flags, &local_err); + ret = bdrv_fill_options(&options, filename, &flags, parse_filename, + &local_err); if (ret < 0) { goto fail; } @@ -4110,7 +4134,7 @@ bdrv_open_inherit(const char *filename, const char *reference, QDict *options, file_bs = bdrv_open_child_bs(filename, options, "file", bs, &child_of_bds, BDRV_CHILD_IMAGE, - true, &local_err); + true, true, &local_err); if (local_err) { goto fail; } @@ -4273,7 +4297,7 @@ BlockDriverState *bdrv_open(const char *filename, const char *reference, GLOBAL_STATE_CODE(); return bdrv_open_inherit(filename, reference, options, flags, NULL, - NULL, 0, errp); + NULL, 0, true, errp); } /* Return true if the NULL-terminated @list contains @str */ -- Gitee From 39eae397a6b573505c0e84cc808cd9765a950908 Mon Sep 17 00:00:00 2001 From: guping Date: Mon, 15 Jul 2024 00:54:12 +0000 Subject: [PATCH 271/939] physmem: Bail out qemu_ram_block_from_host() for invalid ram addrs cherry-pick from 596ccccdbfa124adb42be8c2faf0c74f4849c7a6 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bail out in qemu_ram_block_from_host() when xen_ram_addr_from_mapcache() does not find an existing mapping. Signed-off-by: default avatarEdgar E. 
Iglesias Reviewed-by: default avatarAlex Bennée Reviewed-by: default avatarStefano Stabellini Signed-off-by: guping --- system/physmem.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/system/physmem.c b/system/physmem.c index cbe838f203..0c629233bd 100644 --- a/system/physmem.c +++ b/system/physmem.c @@ -2263,6 +2263,10 @@ RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset, ram_addr_t ram_addr; RCU_READ_LOCK_GUARD(); ram_addr = xen_ram_addr_from_mapcache(ptr); + if (ram_addr == RAM_ADDR_INVALID) { + return NULL; + } + block = qemu_get_ram_block(ram_addr); if (block) { *offset = ram_addr - block->offset; -- Gitee From 2afaa29abe368d51bbd553e3ebacd7e310c8e5c7 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Thu, 1 Aug 2024 16:49:20 -0500 Subject: [PATCH 272/939] nbd: Minor style and typo fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Touch up a comment with the wrong type name, and an over-long line, both noticed while working on upcoming patches. Signed-off-by: Eric Blake Message-ID: <20240807174943.771624-10-eblake@redhat.com> Reviewed-by: Daniel P. Berrangé --- nbd/server.c | 2 +- qemu-nbd.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/nbd/server.c b/nbd/server.c index e8baed9705..7cf61e5aa7 100644 --- a/nbd/server.c +++ b/nbd/server.c @@ -1865,7 +1865,7 @@ static void nbd_export_request_shutdown(BlockExport *blk_exp) blk_exp_ref(&exp->common); /* - * TODO: Should we expand QMP NbdServerRemoveNode enum to allow a + * TODO: Should we expand QMP BlockExportRemoveMode enum to allow a * close mode that stops advertising the export to new clients but * still permits existing clients to run to completion? Because of * that possibility, nbd_export_close() can be called more than diff --git a/qemu-nbd.c b/qemu-nbd.c index acccf2977f..bfcc653d13 100644 --- a/qemu-nbd.c +++ b/qemu-nbd.c @@ -587,7 +587,8 @@ int main(int argc, char **argv) pthread_t client_thread; const char *fmt = NULL; Error *local_err = NULL; - BlockdevDetectZeroesOptions detect_zeroes = BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF; + BlockdevDetectZeroesOptions detect_zeroes = + BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF; QDict *options = NULL; const char *export_name = NULL; /* defaults to "" later for server mode */ const char *export_description = NULL; -- Gitee From da0bf4171d0b386d1e7a22ad5b78a3ad48927471 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Wed, 7 Aug 2024 08:50:01 -0500 Subject: [PATCH 273/939] nbd/server: Plumb in new args to nbd_client_add() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Upcoming patches to fix a CVE need to track an opaque pointer passed in by the owner of a client object, as well as request for a time limit on how fast negotiation must complete. Prepare for that by changing the signature of nbd_client_new() and adding an accessor to get at the opaque pointer, although for now the two servers (qemu-nbd.c and blockdev-nbd.c) do not change behavior even though they pass in a new default timeout value. Suggested-by: Vladimir Sementsov-Ogievskiy Signed-off-by: Eric Blake Message-ID: <20240807174943.771624-11-eblake@redhat.com> Reviewed-by: Daniel P. 
Berrangé [eblake: s/LIMIT/MAX_SECS/ as suggested by Dan] Signed-off-by: Eric Blake --- blockdev-nbd.c | 6 ++++-- include/block/nbd.h | 11 ++++++++++- nbd/server.c | 20 +++++++++++++++++--- qemu-nbd.c | 4 +++- 4 files changed, 34 insertions(+), 7 deletions(-) diff --git a/blockdev-nbd.c b/blockdev-nbd.c index 213012435f..267a1de903 100644 --- a/blockdev-nbd.c +++ b/blockdev-nbd.c @@ -64,8 +64,10 @@ static void nbd_accept(QIONetListener *listener, QIOChannelSocket *cioc, nbd_update_server_watch(nbd_server); qio_channel_set_name(QIO_CHANNEL(cioc), "nbd-server"); - nbd_client_new(cioc, nbd_server->tlscreds, nbd_server->tlsauthz, - nbd_blockdev_client_closed); + /* TODO - expose handshake timeout as QMP option */ + nbd_client_new(cioc, NBD_DEFAULT_HANDSHAKE_MAX_SECS, + nbd_server->tlscreds, nbd_server->tlsauthz, + nbd_blockdev_client_closed, NULL); } static void nbd_update_server_watch(NBDServerData *s) diff --git a/include/block/nbd.h b/include/block/nbd.h index 4e7bd6342f..1d4d65922d 100644 --- a/include/block/nbd.h +++ b/include/block/nbd.h @@ -33,6 +33,12 @@ typedef struct NBDMetaContexts NBDMetaContexts; extern const BlockExportDriver blk_exp_nbd; +/* + * NBD_DEFAULT_HANDSHAKE_MAX_SECS: Number of seconds in which client must + * succeed at NBD_OPT_GO before being forcefully dropped as too slow. + */ +#define NBD_DEFAULT_HANDSHAKE_MAX_SECS 10 + /* Handshake phase structs - this struct is passed on the wire */ typedef struct NBDOption { @@ -403,9 +409,12 @@ AioContext *nbd_export_aio_context(NBDExport *exp); NBDExport *nbd_export_find(const char *name); void nbd_client_new(QIOChannelSocket *sioc, + uint32_t handshake_max_secs, QCryptoTLSCreds *tlscreds, const char *tlsauthz, - void (*close_fn)(NBDClient *, bool)); + void (*close_fn)(NBDClient *, bool), + void *owner); +void *nbd_client_owner(NBDClient *client); void nbd_client_get(NBDClient *client); void nbd_client_put(NBDClient *client); diff --git a/nbd/server.c b/nbd/server.c index 7cf61e5aa7..b3c4ba2c30 100644 --- a/nbd/server.c +++ b/nbd/server.c @@ -124,10 +124,12 @@ struct NBDMetaContexts { struct NBDClient { int refcount; void (*close_fn)(NBDClient *client, bool negotiated); + void *owner; NBDExport *exp; QCryptoTLSCreds *tlscreds; char *tlsauthz; + uint32_t handshake_max_secs; QIOChannelSocket *sioc; /* The underlying data channel */ QIOChannel *ioc; /* The current I/O channel which may differ (eg TLS) */ @@ -3049,6 +3051,7 @@ static coroutine_fn void nbd_co_client_start(void *opaque) qemu_co_mutex_init(&client->send_lock); + /* TODO - utilize client->handshake_max_secs */ if (nbd_negotiate(client, &local_err)) { if (local_err) { error_report_err(local_err); @@ -3061,14 +3064,17 @@ static coroutine_fn void nbd_co_client_start(void *opaque) } /* - * Create a new client listener using the given channel @sioc. + * Create a new client listener using the given channel @sioc and @owner. * Begin servicing it in a coroutine. When the connection closes, call - * @close_fn with an indication of whether the client completed negotiation. + * @close_fn with an indication of whether the client completed negotiation + * within @handshake_max_secs seconds (0 for unbounded). 
*/ void nbd_client_new(QIOChannelSocket *sioc, + uint32_t handshake_max_secs, QCryptoTLSCreds *tlscreds, const char *tlsauthz, - void (*close_fn)(NBDClient *, bool)) + void (*close_fn)(NBDClient *, bool), + void *owner) { NBDClient *client; Coroutine *co; @@ -3080,13 +3086,21 @@ void nbd_client_new(QIOChannelSocket *sioc, object_ref(OBJECT(client->tlscreds)); } client->tlsauthz = g_strdup(tlsauthz); + client->handshake_max_secs = handshake_max_secs; client->sioc = sioc; qio_channel_set_delay(QIO_CHANNEL(sioc), false); object_ref(OBJECT(client->sioc)); client->ioc = QIO_CHANNEL(sioc); object_ref(OBJECT(client->ioc)); client->close_fn = close_fn; + client->owner = owner; co = qemu_coroutine_create(nbd_co_client_start, client); qemu_coroutine_enter(co); } + +void * +nbd_client_owner(NBDClient *client) +{ + return client->owner; +} diff --git a/qemu-nbd.c b/qemu-nbd.c index bfcc653d13..8b09cb5e2a 100644 --- a/qemu-nbd.c +++ b/qemu-nbd.c @@ -389,7 +389,9 @@ static void nbd_accept(QIONetListener *listener, QIOChannelSocket *cioc, nb_fds++; nbd_update_server_watch(); - nbd_client_new(cioc, tlscreds, tlsauthz, nbd_client_closed); + /* TODO - expose handshake timeout as command line option */ + nbd_client_new(cioc, NBD_DEFAULT_HANDSHAKE_MAX_SECS, + tlscreds, tlsauthz, nbd_client_closed, NULL); } static void nbd_update_server_watch(void) -- Gitee From cfbbd9903e2ea12d365105648ec8e3dfd07b6194 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Tue, 6 Aug 2024 13:53:00 -0500 Subject: [PATCH 274/939] nbd/server: CVE-2024-7409: Cap default max-connections to 100 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allowing an unlimited number of clients to any web service is a recipe for a rudimentary denial of service attack: the client merely needs to open lots of sockets without closing them, until qemu no longer has any more fds available to allocate. For qemu-nbd, we default to allowing only 1 connection unless more are explicitly asked for (-e or --shared); this was historically picked as a nice default (without an explicit -t, a non-persistent qemu-nbd goes away after a client disconnects, without needing any additional follow-up commands), and we are not going to change that interface now (besides, someday we want to point people towards qemu-storage-daemon instead of qemu-nbd). But for qemu proper, and the newer qemu-storage-daemon, the QMP nbd-server-start command has historically had a default of unlimited number of connections, in part because unlike qemu-nbd it is inherently persistent until nbd-server-stop. Allowing multiple client sockets is particularly useful for clients that can take advantage of MULTI_CONN (creating parallel sockets to increase throughput), although known clients that do so (such as libnbd's nbdcopy) typically use only 8 or 16 connections (the benefits of scaling diminish once more sockets are competing for kernel attention). Picking a number large enough for typical use cases, but not unlimited, makes it slightly harder for a malicious client to perform a denial of service merely by opening lots of connections withot progressing through the handshake. This change does not eliminate CVE-2024-7409 on its own, but reduces the chance for fd exhaustion or unlimited memory usage as an attack surface. 
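(For comparison, qemu-nbd has always required a larger limit to be asked
for explicitly; a sketch with a placeholder image name:

    $ qemu-nbd --persistent --shared=16 -f raw disk.img

QMP users needing more than the new default of 100 can likewise pass an
explicit max-connections to nbd-server-start.)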
On the other hand, by itself, it makes it more obvious that with a finite limit, we have the problem of an unauthenticated client holding 100 fds opened as a way to block out a legitimate client from being able to connect; thus, later patches will further add timeouts to reject clients that are not making progress. This is an INTENTIONAL change in behavior, and will break any client of nbd-server-start that was not passing an explicit max-connections parameter, yet expects more than 100 simultaneous connections. We are not aware of any such client (as stated above, most clients aware of MULTI_CONN get by just fine on 8 or 16 connections, and probably cope with later connections failing by relying on the earlier connections; libvirt has not yet been passing max-connections, but generally creates NBD servers with the intent for a single client for the sake of live storage migration; meanwhile, the KubeSAN project anticipates a large cluster sharing multiple clients [up to 8 per node, and up to 100 nodes in a cluster], but it currently uses qemu-nbd with an explicit --shared=0 rather than qemu-storage-daemon with nbd-server-start). We considered using a deprecation period (declare that omitting max-parameters is deprecated, and make it mandatory in 3 releases - then we don't need to pick an arbitrary default); that has zero risk of breaking any apps that accidentally depended on more than 100 connections, and where such breakage might not be noticed under unit testing but only under the larger loads of production usage. But it does not close the denial-of-service hole until far into the future, and requires all apps to change to add the parameter even if 100 was good enough. It also has a drawback that any app (like libvirt) that is accidentally relying on an unlimited default should seriously consider their own CVE now, at which point they are going to change to pass explicit max-connections sooner than waiting for 3 qemu releases. Finally, if our changed default breaks an app, that app can always pass in an explicit max-parameters with a larger value. It is also intentional that the HMP interface to nbd-server-start is not changed to expose max-connections (any client needing to fine-tune things should be using QMP). Suggested-by: Daniel P. Berrangé Signed-off-by: Eric Blake Message-ID: <20240807174943.771624-12-eblake@redhat.com> Reviewed-by: Daniel P. 
Berrangé [ericb: Expand commit message to summarize Dan's argument for why we break corner-case back-compat behavior without a deprecation period] Signed-off-by: Eric Blake --- block/monitor/block-hmp-cmds.c | 3 ++- blockdev-nbd.c | 8 ++++++++ include/block/nbd.h | 7 +++++++ qapi/block-export.json | 4 ++-- 4 files changed, 19 insertions(+), 3 deletions(-) diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c index c729cbf1eb..78a6975852 100644 --- a/block/monitor/block-hmp-cmds.c +++ b/block/monitor/block-hmp-cmds.c @@ -415,7 +415,8 @@ void hmp_nbd_server_start(Monitor *mon, const QDict *qdict) goto exit; } - nbd_server_start(addr, NULL, NULL, 0, &local_err); + nbd_server_start(addr, NULL, NULL, NBD_DEFAULT_MAX_CONNECTIONS, + &local_err); qapi_free_SocketAddress(addr); if (local_err != NULL) { goto exit; diff --git a/blockdev-nbd.c b/blockdev-nbd.c index 267a1de903..24ba5382db 100644 --- a/blockdev-nbd.c +++ b/blockdev-nbd.c @@ -170,6 +170,10 @@ void nbd_server_start(SocketAddress *addr, const char *tls_creds, void nbd_server_start_options(NbdServerOptions *arg, Error **errp) { + if (!arg->has_max_connections) { + arg->max_connections = NBD_DEFAULT_MAX_CONNECTIONS; + } + nbd_server_start(arg->addr, arg->tls_creds, arg->tls_authz, arg->max_connections, errp); } @@ -182,6 +186,10 @@ void qmp_nbd_server_start(SocketAddressLegacy *addr, { SocketAddress *addr_flat = socket_address_flatten(addr); + if (!has_max_connections) { + max_connections = NBD_DEFAULT_MAX_CONNECTIONS; + } + nbd_server_start(addr_flat, tls_creds, tls_authz, max_connections, errp); qapi_free_SocketAddress(addr_flat); } diff --git a/include/block/nbd.h b/include/block/nbd.h index 1d4d65922d..d4f8b21aec 100644 --- a/include/block/nbd.h +++ b/include/block/nbd.h @@ -39,6 +39,13 @@ extern const BlockExportDriver blk_exp_nbd; */ #define NBD_DEFAULT_HANDSHAKE_MAX_SECS 10 +/* + * NBD_DEFAULT_MAX_CONNECTIONS: Number of client sockets to allow at + * once; must be large enough to allow a MULTI_CONN-aware client like + * nbdcopy to create its typical number of 8-16 sockets. + */ +#define NBD_DEFAULT_MAX_CONNECTIONS 100 + /* Handshake phase structs - this struct is passed on the wire */ typedef struct NBDOption { diff --git a/qapi/block-export.json b/qapi/block-export.json index 7874a49ba7..1d255d77e3 100644 --- a/qapi/block-export.json +++ b/qapi/block-export.json @@ -28,7 +28,7 @@ # @max-connections: The maximum number of connections to allow at the # same time, 0 for unlimited. Setting this to 1 also stops the # server from advertising multiple client support (since 5.2; -# default: 0) +# default: 100) # # Since: 4.2 ## @@ -63,7 +63,7 @@ # @max-connections: The maximum number of connections to allow at the # same time, 0 for unlimited. Setting this to 1 also stops the # server from advertising multiple client support (since 5.2; -# default: 0). +# default: 100). # # Returns: error if the server is already running. 
# -- Gitee From 5f89a59b5f877d6795bd417c9193efa65fb83c3f Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Thu, 8 Aug 2024 16:05:08 -0500 Subject: [PATCH 275/939] nbd/server: CVE-2024-7409: Drop non-negotiating clients MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A client that opens a socket but does not negotiate is merely hogging qemu's resources (an open fd and a small amount of memory); and a malicious client that can access the port where NBD is listening can attempt a denial of service attack by intentionally opening and abandoning lots of unfinished connections. The previous patch put a default bound on the number of such ongoing connections, but once that limit is hit, no more clients can connect (including legitimate ones). The solution is to insist that clients complete handshake within a reasonable time limit, defaulting to 10 seconds. A client that has not successfully completed NBD_OPT_GO by then (including the case of where the client didn't know TLS credentials to even reach the point of NBD_OPT_GO) is wasting our time and does not deserve to stay connected. Later patches will allow fine-tuning the limit away from the default value (including disabling it for doing integration testing of the handshake process itself). Note that this patch in isolation actually makes it more likely to see qemu SEGV after nbd-server-stop, as any client socket still connected when the server shuts down will now be closed after 10 seconds rather than at the client's whims. That will be addressed in the next patch. For a demo of this patch in action: $ qemu-nbd -f raw -r -t -e 10 file & $ nbdsh --opt-mode -c ' H = list() for i in range(20): print(i) H.insert(i, nbd.NBD()) H[i].set_opt_mode(True) H[i].connect_uri("nbd://localhost") ' $ kill $! where later connections get to start progressing once earlier ones are forcefully dropped for taking too long, rather than hanging. Suggested-by: Daniel P. Berrangé Signed-off-by: Eric Blake Message-ID: <20240807174943.771624-13-eblake@redhat.com> Reviewed-by: Daniel P. Berrangé [eblake: rebase to changes earlier in series, reduce scope of timer] Signed-off-by: Eric Blake Signed-off-by: liuxiangdong --- nbd/server.c | 28 +++++++++++++++++++++++++++- nbd/trace-events | 1 + 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/nbd/server.c b/nbd/server.c index b3c4ba2c30..d1b3c35b59 100644 --- a/nbd/server.c +++ b/nbd/server.c @@ -3044,22 +3044,48 @@ static void nbd_client_receive_next_request(NBDClient *client) } } +static void nbd_handshake_timer_cb(void *opaque) +{ + QIOChannel *ioc = opaque; + + trace_nbd_handshake_timer_cb(); + qio_channel_shutdown(ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL); +} + static coroutine_fn void nbd_co_client_start(void *opaque) { NBDClient *client = opaque; Error *local_err = NULL; + QEMUTimer *handshake_timer = NULL; qemu_co_mutex_init(&client->send_lock); - /* TODO - utilize client->handshake_max_secs */ + /* + * Create a timer to bound the time spent in negotiation. If the + * timer expires, it is likely nbd_negotiate will fail because the + * socket was shutdown. 
+ */ + if (client->handshake_max_secs > 0) { + handshake_timer = aio_timer_new(qemu_get_aio_context(), + QEMU_CLOCK_REALTIME, + SCALE_NS, + nbd_handshake_timer_cb, + client->sioc); + timer_mod(handshake_timer, + qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + + client->handshake_max_secs * NANOSECONDS_PER_SECOND); + } + if (nbd_negotiate(client, &local_err)) { if (local_err) { error_report_err(local_err); } + timer_free(handshake_timer); client_close(client, false); return; } + timer_free(handshake_timer); nbd_client_receive_next_request(client); } diff --git a/nbd/trace-events b/nbd/trace-events index 00ae3216a1..cbd0a4ab7e 100644 --- a/nbd/trace-events +++ b/nbd/trace-events @@ -76,6 +76,7 @@ nbd_co_receive_request_payload_received(uint64_t cookie, uint64_t len) "Payload nbd_co_receive_ext_payload_compliance(uint64_t from, uint64_t len) "client sent non-compliant write without payload flag: from=0x%" PRIx64 ", len=0x%" PRIx64 nbd_co_receive_align_compliance(const char *op, uint64_t from, uint64_t len, uint32_t align) "client sent non-compliant unaligned %s request: from=0x%" PRIx64 ", len=0x%" PRIx64 ", align=0x%" PRIx32 nbd_trip(void) "Reading request" +nbd_handshake_timer_cb(void) "client took too long to negotiate" # client-connection.c nbd_connect_thread_sleep(uint64_t timeout) "timeout %" PRIu64 -- Gitee From fc5e00fc5ff2e7c454a576a81236131b8c74d042 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Wed, 7 Aug 2024 12:23:13 -0500 Subject: [PATCH 276/939] nbd/server: CVE-2024-7409: Close stray clients at server-stop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A malicious client can attempt to connect to an NBD server, and then intentionally delay progress in the handshake, including if it does not know the TLS secrets. Although the previous two patches reduce this behavior by capping the default max-connections parameter and killing slow clients, they did not eliminate the possibility of a client waiting to close the socket until after the QMP nbd-server-stop command is executed, at which point qemu would SEGV when trying to dereference the NULL nbd_server global which is no longer present. This amounts to a denial of service attack. Worse, if another NBD server is started before the malicious client disconnects, I cannot rule out additional adverse effects when the old client interferes with the connection count of the new server (although the most likely is a crash due to an assertion failure when checking nbd_server->connections > 0). For environments without this patch, the CVE can be mitigated by ensuring (such as via a firewall) that only trusted clients can connect to an NBD server. Note that using frameworks like libvirt that ensure that TLS is used and that nbd-server-stop is not executed while any trusted clients are still connected will only help if there is also no possibility for an untrusted client to open a connection but then stall on the NBD handshake. Given the previous patches, it would be possible to guarantee that no clients remain connected by having nbd-server-stop sleep for longer than the default handshake deadline before finally freeing the global nbd_server object, but that could make QMP non-responsive for a long time. So intead, this patch fixes the problem by tracking all client sockets opened while the server is running, and forcefully closing any such sockets remaining without a completed handshake at the time of nbd-server-stop, then waiting until the coroutines servicing those sockets notice the state change. 
nbd-server-stop now has a second AIO_WAIT_WHILE_UNLOCKED (the first is indirectly through the blk_exp_close_all_type() that disconnects all clients that completed handshakes), but forced socket shutdown is enough to progress the coroutines and quickly tear down all clients before the server is freed, thus finally fixing the CVE. This patch relies heavily on the fact that nbd/server.c guarantees that it only calls nbd_blockdev_client_closed() from the main loop (see the assertion in nbd_client_put() and the hoops used in nbd_client_put_nonzero() to achieve that); if we did not have that guarantee, we would also need a mutex protecting our accesses of the list of connections to survive re-entrancy from independent iothreads. Although I did not actually try to test old builds, it looks like this problem has existed since at least commit 862172f45c (v2.12.0, 2017) - even back when that patch started using a QIONetListener to handle listening on multiple sockets, nbd_server_free() was already unaware that the nbd_blockdev_client_closed callback can be reached later by a client thread that has not completed handshakes (and therefore the client's socket never got added to the list closed in nbd_export_close_all), despite that patch intentionally tearing down the QIONetListener to prevent new clients. Reported-by: Alexander Ivanov Fixes: CVE-2024-7409 CC: qemu-stable@nongnu.org Signed-off-by: Eric Blake Message-ID: <20240807174943.771624-14-eblake@redhat.com> Reviewed-by: Daniel P. Berrangé --- blockdev-nbd.c | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/blockdev-nbd.c b/blockdev-nbd.c index 24ba5382db..f73409ae49 100644 --- a/blockdev-nbd.c +++ b/blockdev-nbd.c @@ -21,12 +21,18 @@ #include "io/channel-socket.h" #include "io/net-listener.h" +typedef struct NBDConn { + QIOChannelSocket *cioc; + QLIST_ENTRY(NBDConn) next; +} NBDConn; + typedef struct NBDServerData { QIONetListener *listener; QCryptoTLSCreds *tlscreds; char *tlsauthz; uint32_t max_connections; uint32_t connections; + QLIST_HEAD(, NBDConn) conns; } NBDServerData; static NBDServerData *nbd_server; @@ -51,6 +57,14 @@ int nbd_server_max_connections(void) static void nbd_blockdev_client_closed(NBDClient *client, bool ignored) { + NBDConn *conn = nbd_client_owner(client); + + assert(qemu_in_main_thread() && nbd_server); + + object_unref(OBJECT(conn->cioc)); + QLIST_REMOVE(conn, next); + g_free(conn); + nbd_client_put(client); assert(nbd_server->connections > 0); nbd_server->connections--; @@ -60,14 +74,20 @@ static void nbd_blockdev_client_closed(NBDClient *client, bool ignored) static void nbd_accept(QIONetListener *listener, QIOChannelSocket *cioc, gpointer opaque) { + NBDConn *conn = g_new0(NBDConn, 1); + + assert(qemu_in_main_thread() && nbd_server); nbd_server->connections++; + object_ref(OBJECT(cioc)); + conn->cioc = cioc; + QLIST_INSERT_HEAD(&nbd_server->conns, conn, next); nbd_update_server_watch(nbd_server); qio_channel_set_name(QIO_CHANNEL(cioc), "nbd-server"); /* TODO - expose handshake timeout as QMP option */ nbd_client_new(cioc, NBD_DEFAULT_HANDSHAKE_MAX_SECS, nbd_server->tlscreds, nbd_server->tlsauthz, - nbd_blockdev_client_closed, NULL); + nbd_blockdev_client_closed, conn); } static void nbd_update_server_watch(NBDServerData *s) @@ -81,12 +101,25 @@ static void nbd_update_server_watch(NBDServerData *s) static void nbd_server_free(NBDServerData *server) { + NBDConn *conn, *tmp; + if (!server) { return; } + /* + * Forcefully close the listener socket, and any clients 
that have + * not yet disconnected on their own. + */ qio_net_listener_disconnect(server->listener); object_unref(OBJECT(server->listener)); + QLIST_FOREACH_SAFE(conn, &server->conns, next, tmp) { + qio_channel_shutdown(QIO_CHANNEL(conn->cioc), QIO_CHANNEL_SHUTDOWN_BOTH, + NULL); + } + + AIO_WAIT_WHILE_UNLOCKED(NULL, server->connections > 0); + if (server->tlscreds) { object_unref(OBJECT(server->tlscreds)); } -- Gitee From 550d304465b366a116e02d2cb006475ea453a98a Mon Sep 17 00:00:00 2001 From: guping Date: Mon, 22 Jul 2024 00:37:30 +0000 Subject: [PATCH 277/939] hvf: arm: Do not advance PC when raising an exception cherry-pick from 30a1690f2402e6c1582d5b3ebcf7940bfe2fad4b hvf did not advance PC when raising an exception for most unhandled system registers, but it mistakenly advanced PC when raising an exception for GICv3 registers. Cc: qemu-stable@nongnu.org Fixes: a2260983 ("hvf: arm: Add support for GICv3") Signed-off-by: default avatarAkihiko Odaki Message-id: 20240716-pmu-v3-4-8c7c1858a227@daynix.com Reviewed-by: default avatarPeter Maydell Signed-off-by: default avatarPeter Maydell Signed-off-by: guping --- target/arm/hvf/hvf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/target/arm/hvf/hvf.c b/target/arm/hvf/hvf.c index 757e13b0f9..b4e98a99e2 100644 --- a/target/arm/hvf/hvf.c +++ b/target/arm/hvf/hvf.c @@ -1272,6 +1272,7 @@ static int hvf_sysreg_read(CPUState *cpu, uint32_t reg, uint32_t rt) /* Call the TCG sysreg handler. This is only safe for GICv3 regs. */ if (!hvf_sysreg_read_cp(cpu, reg, &val)) { hvf_raise_exception(cpu, EXCP_UDEF, syn_uncategorized()); + return 1; } break; case SYSREG_DBGBVR0_EL1: -- Gitee From 0c23d22ea9f160a8e0e0e48b6cb400d7964ae868 Mon Sep 17 00:00:00 2001 From: qihao Date: Tue, 23 Jul 2024 21:06:08 +0800 Subject: [PATCH 278/939] hw/nvme: fix memory leak in nvme_dsm cheery-pick from c510fe78f1b7c966524489d6ba752107423b20c8 The allocated memory to hold LBA ranges leaks in the nvme_dsm function. This happens because the allocated memory for iocb->range is not freed in all error handling paths. Fix this by adding a free to ensure that the allocated memory is properly freed. 
ASAN log: ==3075137==ERROR: LeakSanitizer: detected memory leaks Direct leak of 480 byte(s) in 6 object(s) allocated from: #0 0x55f1f8a0eddd in malloc llvm/compiler-rt/lib/asan/asan_malloc_linux.cpp:129:3 #1 0x7f531e0f6738 in g_malloc (/lib/x86_64-linux-gnu/libglib-2.0.so.0+0x5e738) #2 0x55f1faf1f091 in blk_aio_get block/block-backend.c:2583:12 #3 0x55f1f945c74b in nvme_dsm hw/nvme/ctrl.c:2609:30 #4 0x55f1f945831b in nvme_io_cmd hw/nvme/ctrl.c:4470:16 #5 0x55f1f94561b7 in nvme_process_sq hw/nvme/ctrl.c:7039:29 Cc: qemu-stable@nongnu.org Fixes: d7d1474fd85d ("hw/nvme: reimplement dsm to allow cancellation") Signed-off-by: Zheyu Ma Reviewed-by: Klaus Jensen Signed-off-by: Klaus Jensen Signed-off-by: qihao_yewu --- hw/nvme/ctrl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index 237b5c8871..dd1c962f93 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -2592,6 +2592,7 @@ next: done: iocb->aiocb = NULL; iocb->common.cb(iocb->common.opaque, iocb->ret); + g_free(iocb->range); qemu_aio_unref(iocb); } -- Gitee From 041c319f2f91c85aeb4ed0cefa6afa76773fe960 Mon Sep 17 00:00:00 2001 From: qihao Date: Thu, 25 Jul 2024 09:57:01 +0800 Subject: [PATCH 279/939] aspeed/smc: Fix possible integer overflow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cheery-pick from 13951ccfcdf0f31902a93859506ccf8c0ef66583 Coverity reports a possible integer overflow because routine aspeeed_smc_hclk_divisor() has a codepath returning 0, which could lead to an integer overflow when computing variable 'hclk_shift' in the caller aspeed_smc_dma_calibration(). The value passed to aspeed_smc_hclk_divisor() is always between 0 and 15 and, in this case, there is always a matching hclk divisor. Remove the return 0 and use g_assert_not_reached() instead. Fixes: Coverity CID 1547822 Suggested-by: Peter Maydell Signed-off-by: Cédric Le Goater Reviewed-by: Peter Maydell Signed-off-by: qihao_yewu --- hw/ssi/aspeed_smc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/hw/ssi/aspeed_smc.c b/hw/ssi/aspeed_smc.c index 2a4001b774..8af919a970 100644 --- a/hw/ssi/aspeed_smc.c +++ b/hw/ssi/aspeed_smc.c @@ -764,8 +764,7 @@ static uint8_t aspeed_smc_hclk_divisor(uint8_t hclk_mask) } } - aspeed_smc_error("invalid HCLK mask %x", hclk_mask); - return 0; + g_assert_not_reached(); } /* -- Gitee From 1d3ea28fd7da9a23e278be70c7e028fbd2b69bf3 Mon Sep 17 00:00:00 2001 From: qihao Date: Thu, 25 Jul 2024 10:29:20 +0800 Subject: [PATCH 280/939] hw/display/bcm2835_fb: fix fb_use_offsets condition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cheery-pick from 345acc443905eda8008a1d328dd89b73c4a3f89e It is common practice when implementing double-buffering on VideoCore to do so by multiplying the height of the virtual buffer by the number of virtual screens desired (i.e., two - in the case of double-bufferring). At present, this won't work in QEMU because the logic in fb_use_offsets require that both the virtual width and height exceed their physical counterparts. This appears to be unintentional/a typo and indeed the comment states; "Experimentally, the hardware seems to do this only if the viewport size is larger than the physical screen". The viewport/virtual size would be larger than the physical size if either virtual dimension were larger than their physical counterparts and not necessarily both. 
Signed-off-by: SamJakob Message-id: 20240713160353.62410-1-me@samjakob.com Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Peter Maydell Signed-off-by: qihao_yewu --- hw/display/bcm2835_fb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/display/bcm2835_fb.c b/hw/display/bcm2835_fb.c index a05277674f..c45da149d9 100644 --- a/hw/display/bcm2835_fb.c +++ b/hw/display/bcm2835_fb.c @@ -145,7 +145,7 @@ static bool fb_use_offsets(BCM2835FBConfig *config) * viewport size is larger than the physical screen. (It doesn't * prevent the guest setting this silly viewport setting, though...) */ - return config->xres_virtual > config->xres && + return config->xres_virtual > config->xres || config->yres_virtual > config->yres; } -- Gitee From ef42d79d805e430e24df57d46c156f9a7e3e1bed Mon Sep 17 00:00:00 2001 From: qihao Date: Thu, 25 Jul 2024 14:11:12 +0800 Subject: [PATCH 281/939] vl: fix "type is NULL" in -vga help MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cherry-pick from a99dc9cd611cbaf10edee6260272e299626d0871 Don't pass NULL to module_object_class_by_name(), when the interface is unavailable. Signed-off-by: Marc-André Lureau Reviewed-by: Philippe Mathieu-Daudé Message-ID: <20240715114420.2062870-1-marcandre.lureau@redhat.com> Signed-off-by: Philippe Mathieu-Daudé Signed-off-by: qihao_yewu --- system/vl.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/system/vl.c b/system/vl.c index 165c3cae8a..8e3357c578 100644 --- a/system/vl.c +++ b/system/vl.c @@ -994,9 +994,16 @@ static bool vga_interface_available(VGAInterfaceType t) const VGAInterfaceInfo *ti = &vga_interfaces[t]; assert(t < VGA_TYPE_MAX); - return !ti->class_names[0] || - module_object_class_by_name(ti->class_names[0]) || - module_object_class_by_name(ti->class_names[1]); + + if (!ti->class_names[0] || module_object_class_by_name(ti->class_names[0])) { + return true; + } + + if (ti->class_names[1] && module_object_class_by_name(ti->class_names[1])) { + return true; + } + + return false; } static const char * -- Gitee From ad1d68502c41ff6a966ae89ae5ac008050602e2a Mon Sep 17 00:00:00 2001 From: qihao Date: Mon, 29 Jul 2024 10:38:46 +0800 Subject: [PATCH 282/939] ppc/vof: Fix unaligned FDT property access cherry-pick from 785c8637f9d2362a8addf4ded853d975955a9d6b FDT properties are aligned by 4 bytes, not 8 bytes. Signed-off-by: Akihiko Odaki Reviewed-by: Peter Maydell Reviewed-by: Michael S. Tsirkin Signed-off-by: Nicholas Piggin Signed-off-by: qihao_yewu --- hw/ppc/vof.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/ppc/vof.c b/hw/ppc/vof.c index e3b430a81f..b5b6514d79 100644 --- a/hw/ppc/vof.c +++ b/hw/ppc/vof.c @@ -646,7 +646,7 @@ static void vof_dt_memory_available(void *fdt, GArray *claimed, uint64_t base) mem0_reg = fdt_getprop(fdt, offset, "reg", &proplen); g_assert(mem0_reg && proplen == sizeof(uint32_t) * (ac + sc)); if (sc == 2) { - mem0_end = be64_to_cpu(*(uint64_t *)(mem0_reg + sizeof(uint32_t) * ac)); + mem0_end = ldq_be_p(mem0_reg + sizeof(uint32_t) * ac); } else { mem0_end = be32_to_cpu(*(uint32_t *)(mem0_reg + sizeof(uint32_t) * ac)); } -- Gitee From f402887e0c3e97dcbd6d1929ca9908ec57e2bb1f Mon Sep 17 00:00:00 2001 From: Hyman Huang Date: Thu, 7 Dec 2023 23:47:35 +0800 Subject: [PATCH 283/939] crypto: Introduce SM4 symmetric cipher algorithm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce the SM4 cipher algorithms (OSCCA GB/T 32907-2016).
SM4 (GBT.32907-2016) is a cryptographic standard issued by the Organization of State Commercial Administration of China (OSCCA) as an authorized cryptographic algorithms for the use within China. Detect the SM4 cipher algorithms and enable the feature silently if it is available. Signed-off-by: Hyman Huang Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Daniel P. Berrangé Signed-off-by: Daniel P. Berrangé Signed-off-by: cheliequan --- crypto/block-luks.c | 11 ++++++++ crypto/cipher-gcrypt.c.inc | 8 ++++++ crypto/cipher-nettle.c.inc | 49 +++++++++++++++++++++++++++++++++ crypto/cipher.c | 6 ++++ meson.build | 26 +++++++++++++++++ qapi/crypto.json | 5 +++- tests/unit/test-crypto-cipher.c | 13 +++++++++ 7 files changed, 117 insertions(+), 1 deletion(-) diff --git a/crypto/block-luks.c b/crypto/block-luks.c index fb01ec38bb..f0813d69b4 100644 --- a/crypto/block-luks.c +++ b/crypto/block-luks.c @@ -95,12 +95,23 @@ qcrypto_block_luks_cipher_size_map_twofish[] = { { 0, 0 }, }; +#ifdef CONFIG_CRYPTO_SM4 +static const QCryptoBlockLUKSCipherSizeMap +qcrypto_block_luks_cipher_size_map_sm4[] = { + { 16, QCRYPTO_CIPHER_ALG_SM4}, + { 0, 0 }, +}; +#endif + static const QCryptoBlockLUKSCipherNameMap qcrypto_block_luks_cipher_name_map[] = { { "aes", qcrypto_block_luks_cipher_size_map_aes }, { "cast5", qcrypto_block_luks_cipher_size_map_cast5 }, { "serpent", qcrypto_block_luks_cipher_size_map_serpent }, { "twofish", qcrypto_block_luks_cipher_size_map_twofish }, +#ifdef CONFIG_CRYPTO_SM4 + { "sm4", qcrypto_block_luks_cipher_size_map_sm4}, +#endif }; QEMU_BUILD_BUG_ON(sizeof(struct QCryptoBlockLUKSKeySlot) != 48); diff --git a/crypto/cipher-gcrypt.c.inc b/crypto/cipher-gcrypt.c.inc index a6a0117717..1377cbaf14 100644 --- a/crypto/cipher-gcrypt.c.inc +++ b/crypto/cipher-gcrypt.c.inc @@ -35,6 +35,9 @@ bool qcrypto_cipher_supports(QCryptoCipherAlgorithm alg, case QCRYPTO_CIPHER_ALG_SERPENT_256: case QCRYPTO_CIPHER_ALG_TWOFISH_128: case QCRYPTO_CIPHER_ALG_TWOFISH_256: +#ifdef CONFIG_CRYPTO_SM4 + case QCRYPTO_CIPHER_ALG_SM4: +#endif break; default: return false; @@ -219,6 +222,11 @@ static QCryptoCipher *qcrypto_cipher_ctx_new(QCryptoCipherAlgorithm alg, case QCRYPTO_CIPHER_ALG_TWOFISH_256: gcryalg = GCRY_CIPHER_TWOFISH; break; +#ifdef CONFIG_CRYPTO_SM4 + case QCRYPTO_CIPHER_ALG_SM4: + gcryalg = GCRY_CIPHER_SM4; + break; +#endif default: error_setg(errp, "Unsupported cipher algorithm %s", QCryptoCipherAlgorithm_str(alg)); diff --git a/crypto/cipher-nettle.c.inc b/crypto/cipher-nettle.c.inc index 24cc61f87b..42b39e18a2 100644 --- a/crypto/cipher-nettle.c.inc +++ b/crypto/cipher-nettle.c.inc @@ -33,6 +33,9 @@ #ifndef CONFIG_QEMU_PRIVATE_XTS #include #endif +#ifdef CONFIG_CRYPTO_SM4 +#include +#endif static inline bool qcrypto_length_check(size_t len, size_t blocksize, Error **errp) @@ -426,6 +429,30 @@ DEFINE_ECB_CBC_CTR_XTS(qcrypto_nettle_twofish, QCryptoNettleTwofish, TWOFISH_BLOCK_SIZE, twofish_encrypt_native, twofish_decrypt_native) +#ifdef CONFIG_CRYPTO_SM4 +typedef struct QCryptoNettleSm4 { + QCryptoCipher base; + struct sm4_ctx key[2]; +} QCryptoNettleSm4; + +static void sm4_encrypt_native(void *ctx, size_t length, + uint8_t *dst, const uint8_t *src) +{ + struct sm4_ctx *keys = ctx; + sm4_crypt(&keys[0], length, dst, src); +} + +static void sm4_decrypt_native(void *ctx, size_t length, + uint8_t *dst, const uint8_t *src) +{ + struct sm4_ctx *keys = ctx; + sm4_crypt(&keys[1], length, dst, src); +} + +DEFINE_ECB(qcrypto_nettle_sm4, + QCryptoNettleSm4, SM4_BLOCK_SIZE, + sm4_encrypt_native, sm4_decrypt_native) 
+#endif bool qcrypto_cipher_supports(QCryptoCipherAlgorithm alg, QCryptoCipherMode mode) @@ -443,6 +470,9 @@ bool qcrypto_cipher_supports(QCryptoCipherAlgorithm alg, case QCRYPTO_CIPHER_ALG_TWOFISH_128: case QCRYPTO_CIPHER_ALG_TWOFISH_192: case QCRYPTO_CIPHER_ALG_TWOFISH_256: +#ifdef CONFIG_CRYPTO_SM4 + case QCRYPTO_CIPHER_ALG_SM4: +#endif break; default: return false; @@ -701,6 +731,25 @@ static QCryptoCipher *qcrypto_cipher_ctx_new(QCryptoCipherAlgorithm alg, return &ctx->base; } +#ifdef CONFIG_CRYPTO_SM4 + case QCRYPTO_CIPHER_ALG_SM4: + { + QCryptoNettleSm4 *ctx = g_new0(QCryptoNettleSm4, 1); + + switch (mode) { + case QCRYPTO_CIPHER_MODE_ECB: + ctx->base.driver = &qcrypto_nettle_sm4_driver_ecb; + break; + default: + goto bad_cipher_mode; + } + + sm4_set_encrypt_key(&ctx->key[0], key); + sm4_set_decrypt_key(&ctx->key[1], key); + + return &ctx->base; + } +#endif default: error_setg(errp, "Unsupported cipher algorithm %s", diff --git a/crypto/cipher.c b/crypto/cipher.c index 74b09a5b26..5f512768ea 100644 --- a/crypto/cipher.c +++ b/crypto/cipher.c @@ -38,6 +38,9 @@ static const size_t alg_key_len[QCRYPTO_CIPHER_ALG__MAX] = { [QCRYPTO_CIPHER_ALG_TWOFISH_128] = 16, [QCRYPTO_CIPHER_ALG_TWOFISH_192] = 24, [QCRYPTO_CIPHER_ALG_TWOFISH_256] = 32, +#ifdef CONFIG_CRYPTO_SM4 + [QCRYPTO_CIPHER_ALG_SM4] = 16, +#endif }; static const size_t alg_block_len[QCRYPTO_CIPHER_ALG__MAX] = { @@ -53,6 +56,9 @@ static const size_t alg_block_len[QCRYPTO_CIPHER_ALG__MAX] = { [QCRYPTO_CIPHER_ALG_TWOFISH_128] = 16, [QCRYPTO_CIPHER_ALG_TWOFISH_192] = 16, [QCRYPTO_CIPHER_ALG_TWOFISH_256] = 16, +#ifdef CONFIG_CRYPTO_SM4 + [QCRYPTO_CIPHER_ALG_SM4] = 16, +#endif }; static const bool mode_need_iv[QCRYPTO_CIPHER_MODE__MAX] = { diff --git a/meson.build b/meson.build index 0c62b4156d..089f45d386 100644 --- a/meson.build +++ b/meson.build @@ -1485,6 +1485,7 @@ endif gcrypt = not_found nettle = not_found hogweed = not_found +crypto_sm4 = not_found xts = 'none' if get_option('nettle').enabled() and get_option('gcrypt').enabled() @@ -1510,6 +1511,17 @@ if not gnutls_crypto.found() cc.find_library('gpg-error', required: true)], version: gcrypt.version()) endif + crypto_sm4 = gcrypt + # SM4 ALG is available in libgcrypt >= 1.9 + if gcrypt.found() and not cc.links(''' + #include + int main(void) { + gcry_cipher_hd_t handler; + gcry_cipher_open(&handler, GCRY_CIPHER_SM4, GCRY_CIPHER_MODE_ECB, 0); + return 0; + }''', dependencies: gcrypt) + crypto_sm4 = not_found + endif endif if (not get_option('nettle').auto() or have_system) and not gcrypt.found() nettle = dependency('nettle', version: '>=3.4', @@ -1518,6 +1530,18 @@ if not gnutls_crypto.found() if nettle.found() and not cc.has_header('nettle/xts.h', dependencies: nettle) xts = 'private' endif + crypto_sm4 = nettle + # SM4 ALG is available in nettle >= 3.9 + if nettle.found() and not cc.links(''' + #include + int main(void) { + struct sm4_ctx ctx; + unsigned char key[16] = {0}; + sm4_set_encrypt_key(&ctx, key); + return 0; + }''', dependencies: nettle) + crypto_sm4 = not_found + endif endif endif @@ -2204,6 +2228,7 @@ config_host_data.set('CONFIG_GNUTLS_CRYPTO', gnutls_crypto.found()) config_host_data.set('CONFIG_TASN1', tasn1.found()) config_host_data.set('CONFIG_GCRYPT', gcrypt.found()) config_host_data.set('CONFIG_NETTLE', nettle.found()) +config_host_data.set('CONFIG_CRYPTO_SM4', crypto_sm4.found()) config_host_data.set('CONFIG_HOGWEED', hogweed.found()) config_host_data.set('CONFIG_QEMU_PRIVATE_XTS', xts == 'private') config_host_data.set('CONFIG_MALLOC_TRIM', 
has_malloc_trim) @@ -4280,6 +4305,7 @@ summary_info += {'nettle': nettle} if nettle.found() summary_info += {' XTS': xts != 'private'} endif +summary_info += {'SM4 ALG support': crypto_sm4} summary_info += {'AF_ALG support': have_afalg} summary_info += {'rng-none': get_option('rng_none')} summary_info += {'Linux keyring': have_keyring} diff --git a/qapi/crypto.json b/qapi/crypto.json index fd3d46ebd1..2f2aeff5fd 100644 --- a/qapi/crypto.json +++ b/qapi/crypto.json @@ -94,6 +94,8 @@ # # @twofish-256: Twofish with 256 bit / 32 byte keys # +# @sm4: SM4 with 128 bit / 16 byte keys (since 9.0) +# # Since: 2.6 ## { 'enum': 'QCryptoCipherAlgorithm', @@ -102,7 +104,8 @@ 'des', '3des', 'cast5-128', 'serpent-128', 'serpent-192', 'serpent-256', - 'twofish-128', 'twofish-192', 'twofish-256']} + 'twofish-128', 'twofish-192', 'twofish-256', + 'sm4']} ## # @QCryptoCipherMode: diff --git a/tests/unit/test-crypto-cipher.c b/tests/unit/test-crypto-cipher.c index d9d9d078ff..11ab1a54fc 100644 --- a/tests/unit/test-crypto-cipher.c +++ b/tests/unit/test-crypto-cipher.c @@ -382,6 +382,19 @@ static QCryptoCipherTestData test_data[] = { .plaintext = "90afe91bb288544f2c32dc239b2635e6", .ciphertext = "6cb4561c40bf0a9705931cb6d408e7fa", }, +#ifdef CONFIG_CRYPTO_SM4 + { + /* SM4, GB/T 32907-2016, Appendix A.1 */ + .path = "/crypto/cipher/sm4", + .alg = QCRYPTO_CIPHER_ALG_SM4, + .mode = QCRYPTO_CIPHER_MODE_ECB, + .key = "0123456789abcdeffedcba9876543210", + .plaintext = + "0123456789abcdeffedcba9876543210", + .ciphertext = + "681edf34d206965e86b3e94f536e4246", + }, +#endif { /* #1 32 byte key, 32 byte PTX */ .path = "/crypto/cipher/aes-xts-128-1", -- Gitee From 3696b12c582440669de12d127701187828c5598f Mon Sep 17 00:00:00 2001 From: Xu Zheng Date: Fri, 19 Jul 2024 22:11:17 +0800 Subject: [PATCH 284/939] hw/nvme: fix number of PIDs for FDP RUH update The number of PIDs is in the upper 16 bits of cdw10. So we need to right-shift by 16 bits instead of only a single bit. Fixes: 73064edfb864 ("hw/nvme: flexible data placement emulation") cherry-pick from 3936bbdf9a2e9233875f850c7576c79d06add261 Signed-off-by: Vincent Fu Signed-off-by: Klaus Jensen Signed-off-by: Michael Tokarev Signed-off-by: Xu Zheng --- hw/nvme/ctrl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index 237b5c8871..d7e83c3d55 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -4352,7 +4352,7 @@ static uint16_t nvme_io_mgmt_send_ruh_update(NvmeCtrl *n, NvmeRequest *req) NvmeNamespace *ns = req->ns; uint32_t cdw10 = le32_to_cpu(cmd->cdw10); uint16_t ret = NVME_SUCCESS; - uint32_t npid = (cdw10 >> 1) + 1; + uint32_t npid = (cdw10 >> 16) + 1; unsigned int i = 0; g_autofree uint16_t *pids = NULL; uint32_t maxnpid; -- Gitee From a222f9c1eea20db470c55f534d85987df27a1654 Mon Sep 17 00:00:00 2001 From: Xu Zheng Date: Fri, 19 Jul 2024 22:45:21 +0800 Subject: [PATCH 285/939] target/sparc: use signed denominator in sdiv helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The result has to be done with the signed denominator (b32) instead of the unsigned value passed in argument (b). 
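A worked example of the failure mode (values chosen for this note, not taken from the patch): if the 64-bit dividend a64 is 4 and the divisor argument b holds 0xffffffff, then b32 is -1 and the expected signed quotient is -4. Dividing by the unsigned argument instead computes 4 / 4294967295 == 0, which is why the division has to use the sign-extended b32.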
cherry-pick from 6b4965373e561b77f91cfbdf41353635c9661358 Fixes: 1326010322d6 ("target/sparc: Remove CC_OP_DIV") Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2319 Signed-off-by: Clément Chigot Reviewed-by: Richard Henderson Message-Id: <20240606144331.698361-1-chigot@adacore.com> Signed-off-by: Richard Henderson (cherry picked from commit 6b4965373e561b77f91cfbdf41353635c9661358) Signed-off-by: Michael Tokarev Signed-off-by: Xu Zheng --- target/sparc/helper.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target/sparc/helper.c b/target/sparc/helper.c index bd10b60e4b..8820c59e7c 100644 --- a/target/sparc/helper.c +++ b/target/sparc/helper.c @@ -121,7 +121,7 @@ uint64_t helper_sdiv(CPUSPARCState *env, target_ulong a, target_ulong b) return (uint32_t)(b32 < 0 ? INT32_MAX : INT32_MIN) | (-1ull << 32); } - a64 /= b; + a64 /= b32; r = a64; if (unlikely(r != a64)) { return (uint32_t)(a64 < 0 ? INT32_MIN : INT32_MAX) | (-1ull << 32); -- Gitee From 5db954cb188d3775aec053fad8a39bf4c26a2b92 Mon Sep 17 00:00:00 2001 From: liupingwei Date: Fri, 2 Aug 2024 11:55:43 +0800 Subject: [PATCH 286/939] Add support for the virtcca cvm feature. With this commit,we can append new startup parameters :"cma=64M cvm_guest=1" and "kvm_type=cvm" to use virtcca cvm feature. Here is a full example of the append parameters for a cvm : -M virt,gic-version=3,accel=kvm,kernel_irqchip=on,kvm_type=cvm \ -append "swiotlb=force console=tty0 console=ttyAMA0 kaslr.disabled=1 cma=64M cvm_guest=1 rodata=off rootfstype=ext4 root=/dev/vad rw" \ Additionally,the SVE and PMU are optional configurations for cvm,here is an example: -object tmm-guest,id=tmm0,sve-vector-length=128,num-pmu-counters=1 Signed-off-by: liupingwei --- accel/kvm/kvm-all.c | 36 ++++ hw/arm/boot.c | 49 +++++ hw/arm/virt.c | 61 +++++- hw/virtio/virtio-bus.c | 6 + include/hw/arm/boot.h | 1 + include/hw/arm/virt.h | 1 + include/sysemu/kvm.h | 9 + linux-headers/asm-arm64/kvm.h | 62 ++++++ linux-headers/linux/kvm.h | 32 +++- qapi/qom.json | 29 ++- target/arm/kvm-tmm.c | 344 ++++++++++++++++++++++++++++++++++ target/arm/kvm.c | 6 +- target/arm/kvm64.c | 5 + target/arm/kvm_arm.h | 16 ++ target/arm/meson.build | 1 + 15 files changed, 651 insertions(+), 7 deletions(-) create mode 100644 target/arm/kvm-tmm.c diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index ade7841ca3..dc3605e648 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -52,6 +52,8 @@ #include "hw/boards.h" #include "sysemu/stats.h" +#include "sysemu/kvm.h" + /* This check must be after config-host.h is included */ #ifdef CONFIG_EVENTFD #include @@ -86,6 +88,9 @@ struct KVMParkedVcpu { }; KVMState *kvm_state; + +bool virtcca_cvm_allowed = false; + bool kvm_kernel_irqchip; bool kvm_split_irqchip; bool kvm_async_interrupts_allowed; @@ -2355,6 +2360,11 @@ uint32_t kvm_dirty_ring_size(void) return kvm_state->kvm_dirty_ring_size; } +static inline bool kvm_is_virtcca_cvm_type(int type) +{ + return type & VIRTCCA_CVM_TYPE; +} + static int kvm_init(MachineState *ms) { MachineClass *mc = MACHINE_GET_CLASS(ms); @@ -2447,6 +2457,10 @@ static int kvm_init(MachineState *ms) goto err; } + if (kvm_is_virtcca_cvm_type(type)) { + virtcca_cvm_allowed = true; + } + do { ret = kvm_ioctl(s, KVM_CREATE_VM, type); } while (ret == -EINTR); @@ -3503,6 +3517,28 @@ int kvm_get_one_reg(CPUState *cs, uint64_t id, void *target) return r; } +int kvm_load_user_data(hwaddr loader_start, hwaddr image_end, hwaddr initrd_start, hwaddr dtb_end, hwaddr ram_size, + struct kvm_numa_info *numa_info) +{ + 
KVMState *state = kvm_state; + struct kvm_user_data data; + int ret; + + data.loader_start = loader_start; + data.image_end = image_end; + data.initrd_start = initrd_start; + data.dtb_end = dtb_end; + data.ram_size = ram_size; + memcpy(&data.numa_info, numa_info, sizeof(struct kvm_numa_info)); + + ret = kvm_vm_ioctl(state, KVM_LOAD_USER_DATA, &data); + if (ret < 0) { + error_report("%s: KVM_LOAD_USER_DATA failed!\n", __func__); + } + + return ret; +} + static bool kvm_accel_has_memory(MachineState *ms, AddressSpace *as, hwaddr start_addr, hwaddr size) { diff --git a/hw/arm/boot.c b/hw/arm/boot.c index 345c7cfa19..42110b0f18 100644 --- a/hw/arm/boot.c +++ b/hw/arm/boot.c @@ -27,6 +27,7 @@ #include "qemu/config-file.h" #include "qemu/option.h" #include "qemu/units.h" +#include "kvm_arm.h" /* Kernel boot protocol is specified in the kernel docs * Documentation/arm/Booting and Documentation/arm64/booting.txt @@ -1142,6 +1143,16 @@ static void arm_setup_direct_kernel_boot(ARMCPU *cpu, for (cs = first_cpu; cs; cs = CPU_NEXT(cs)) { ARM_CPU(cs)->env.boot_info = info; } + + if (kvm_enabled() && virtcca_cvm_enabled()) { + if (info->dtb_limit == 0) { + info->dtb_limit = info->dtb_start + 0x200000; + } + kvm_load_user_data(info->loader_start, image_high_addr, info->initrd_start, + info->dtb_limit, info->ram_size, (struct kvm_numa_info *)info->numa_info); + tmm_add_ram_region(info->loader_start, image_high_addr - info->loader_start, + info->initrd_start, info->dtb_limit - info->initrd_start, true); + } } static void arm_setup_firmware_boot(ARMCPU *cpu, struct arm_boot_info *info) @@ -1235,6 +1246,39 @@ void arm_load_kernel(ARMCPU *cpu, MachineState *ms, struct arm_boot_info *info) info->initrd_filename = ms->initrd_filename; info->dtb_filename = ms->dtb; info->dtb_limit = 0; + if (kvm_enabled() && virtcca_cvm_enabled()) { + info->ram_size = ms->ram_size; + info->numa_info = g_malloc(sizeof(struct kvm_numa_info)); + struct kvm_numa_info *numa_info = (struct kvm_numa_info *) info->numa_info; + if (ms->numa_state != NULL && ms->numa_state->num_nodes > 0) { + numa_info->numa_cnt = ms->numa_state->num_nodes; + uint64_t mem_base = info->loader_start; + for (int64_t i = 0; i < ms->numa_state->num_nodes && i < MAX_NUMA_NODE; i++) { + uint64_t mem_len = ms->numa_state->nodes[i].node_mem; + numa_info->numa_nodes[i].numa_id = i; + numa_info->numa_nodes[i].ipa_start = mem_base; + numa_info->numa_nodes[i].ipa_size = mem_len; + memcpy(numa_info->numa_nodes[i].host_numa_nodes, ms->numa_state->nodes[i].node_memdev->host_nodes, + MAX_NODES / BITS_PER_LONG * sizeof(uint64_t)); + mem_base += mem_len; + } + } else { + numa_info->numa_cnt = 1; + numa_info->numa_nodes[0].numa_id = 0; + numa_info->numa_nodes[0].ipa_start = info->loader_start; + numa_info->numa_nodes[0].ipa_size = info->ram_size; + memset(numa_info->numa_nodes[0].host_numa_nodes, 0, MAX_NODES / BITS_PER_LONG * sizeof(uint64_t)); + } + + for (int cpu_idx = ms->smp.cpus - 1; cpu_idx >= 0; cpu_idx--) { + ARMCPU *armcpu = ARM_CPU(qemu_get_cpu(cpu_idx)); + CPUState *local_cs = CPU(armcpu); + uint64_t node_id = 0; + if (ms->possible_cpus->cpus[local_cs->cpu_index].props.has_node_id) + node_id = ms->possible_cpus->cpus[local_cs->cpu_index].props.node_id; + bitmap_set((unsigned long *)numa_info->numa_nodes[node_id].cpu_id, cpu_idx, 1); + } + } /* Load the kernel. 
*/ if (!info->kernel_filename || info->firmware_loaded) { @@ -1243,6 +1287,11 @@ void arm_load_kernel(ARMCPU *cpu, MachineState *ms, struct arm_boot_info *info) arm_setup_direct_kernel_boot(cpu, info); } + if (kvm_enabled() && virtcca_cvm_enabled()) { + g_free(info->numa_info); + info->numa_info = NULL; + } + /* * Disable the PSCI conduit if it is set up to target the same * or a lower EL than the one we're going to start the guest code in. diff --git a/hw/arm/virt.c b/hw/arm/virt.c index a6e324c6f8..e73a795d3d 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -285,8 +285,16 @@ static void create_fdt(VirtMachineState *vms) /* /chosen must exist for load_dtb to fill in necessary properties later */ qemu_fdt_add_subnode(fdt, "/chosen"); + + g_autofree char *kvm_type = NULL; + if (object_property_find(OBJECT(current_machine), "kvm-type")) { + kvm_type = object_property_get_str(OBJECT(current_machine), + "kvm-type", &error_abort); + } if (vms->dtb_randomness) { - create_randomness(ms, "/chosen"); + if (!(kvm_type && !strcmp(kvm_type, "cvm"))) { + create_randomness(ms, "/chosen"); + } } if (vms->secure) { @@ -1953,6 +1961,19 @@ static void virt_set_memmap(VirtMachineState *vms, int pa_bits) vms->memmap[i] = base_memmap[i]; } + /* fix VIRT_MEM range */ + if (object_property_find(OBJECT(current_machine), "kvm-type")) { + g_autofree char *kvm_type = object_property_get_str(OBJECT(current_machine), + "kvm-type", &error_abort); + + if (!strcmp(kvm_type, "cvm")) { + vms->memmap[VIRT_MEM].base = 3 * GiB; + vms->memmap[VIRT_MEM].size = ms->ram_size; + info_report("[qemu] fix VIRT_MEM range 0x%llx - 0x%llx\n", (unsigned long long)(vms->memmap[VIRT_MEM].base), + (unsigned long long)(vms->memmap[VIRT_MEM].base + ms->ram_size)); + } + } + if (ms->ram_slots > ACPI_MAX_RAM_SLOTS) { error_report("unsupported number of memory slots: %"PRIu64, ms->ram_slots); @@ -2440,7 +2461,7 @@ static void machvirt_init(MachineState *machine) */ if (vms->secure && firmware_loaded) { vms->psci_conduit = QEMU_PSCI_CONDUIT_DISABLED; - } else if (vms->virt) { + } else if (vms->virt || virtcca_cvm_enabled()) { vms->psci_conduit = QEMU_PSCI_CONDUIT_SMC; } else { vms->psci_conduit = QEMU_PSCI_CONDUIT_HVC; @@ -2509,6 +2530,13 @@ static void machvirt_init(MachineState *machine) } } + if (virtcca_cvm_enabled()) { + int ret = kvm_arm_tmm_init(machine->cgs, &error_fatal); + if (ret != 0) { + error_report("fail to initialize TMM"); + exit(1); + } + } create_fdt(vms); qemu_log("cpu init start\n"); @@ -3592,6 +3620,15 @@ static HotplugHandler *virt_machine_get_hotplug_handler(MachineState *machine, static int virt_kvm_type(MachineState *ms, const char *type_str) { VirtMachineState *vms = VIRT_MACHINE(ms); + int virtcca_cvm_type = 0; + if (object_property_find(OBJECT(current_machine), "kvm-type")) { + g_autofree char *kvm_type = object_property_get_str(OBJECT(current_machine), + "kvm-type", &error_abort); + + if (!strcmp(kvm_type, "cvm")) { + virtcca_cvm_type = VIRTCCA_CVM_TYPE; + } + } int max_vm_pa_size, requested_pa_size; bool fixed_ipa; @@ -3621,7 +3658,9 @@ static int virt_kvm_type(MachineState *ms, const char *type_str) * the implicit legacy 40b IPA setting, in which case the kvm_type * must be 0. */ - return fixed_ipa ? 0 : requested_pa_size; + return strcmp(type_str, "cvm") == 0 ? + ((fixed_ipa ? 0 : requested_pa_size) | virtcca_cvm_type) : + (fixed_ipa ? 
0 : requested_pa_size); } static void virt_machine_class_init(ObjectClass *oc, void *data) @@ -3793,6 +3832,19 @@ static void virt_machine_class_init(ObjectClass *oc, void *data) } +static char *virt_get_kvm_type(Object *obj, Error **errp G_GNUC_UNUSED) +{ + VirtMachineState *vms = VIRT_MACHINE(obj); + return g_strdup(vms->kvm_type); +} + +static void virt_set_kvm_type(Object *obj, const char *value, Error **errp G_GNUC_UNUSED) +{ + VirtMachineState *vms = VIRT_MACHINE(obj); + g_free(vms->kvm_type); + vms->kvm_type = g_strdup(value); +} + static void virt_instance_init(Object *obj) { VirtMachineState *vms = VIRT_MACHINE(obj); @@ -3853,6 +3905,9 @@ static void virt_instance_init(Object *obj) vms->oem_id = g_strndup(ACPI_BUILD_APPNAME6, 6); vms->oem_table_id = g_strndup(ACPI_BUILD_APPNAME8, 8); + + object_property_add_str(obj, "kvm-type", virt_get_kvm_type, virt_set_kvm_type); + object_property_set_description(obj, "kvm-type", "CVM or Normal VM"); } static const TypeInfo virt_machine_info = { diff --git a/hw/virtio/virtio-bus.c b/hw/virtio/virtio-bus.c index 896feb37a1..7e750d073d 100644 --- a/hw/virtio/virtio-bus.c +++ b/hw/virtio/virtio-bus.c @@ -25,6 +25,7 @@ #include "qemu/osdep.h" #include "qemu/error-report.h" #include "qemu/module.h" +#include "sysemu/kvm.h" #include "qapi/error.h" #include "hw/virtio/virtio-bus.h" #include "hw/virtio/virtio.h" @@ -81,6 +82,11 @@ void virtio_bus_device_plugged(VirtIODevice *vdev, Error **errp) vdev->dma_as = &address_space_memory; if (has_iommu) { vdev_has_iommu = virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM); + + if (virtcca_cvm_enabled() && (strcmp(vdev->name, "vhost-user-fs") == 0)) { + vdev_has_iommu = true; + } + /* * Present IOMMU_PLATFORM to the driver iff iommu_plattform=on and * device operational. 
If the driver does not accept IOMMU_PLATFORM diff --git a/include/hw/arm/boot.h b/include/hw/arm/boot.h index f81326a1dc..4491b1f85b 100644 --- a/include/hw/arm/boot.h +++ b/include/hw/arm/boot.h @@ -39,6 +39,7 @@ void armv7m_load_kernel(ARMCPU *cpu, const char *kernel_filename, /* arm_boot.c */ struct arm_boot_info { uint64_t ram_size; + void *numa_info; const char *kernel_filename; const char *kernel_cmdline; const char *initrd_filename; diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h index 7a734f07f7..27f5333772 100644 --- a/include/hw/arm/virt.h +++ b/include/hw/arm/virt.h @@ -182,6 +182,7 @@ struct VirtMachineState { PCIBus *bus; char *oem_id; char *oem_table_id; + char *kvm_type; NotifierList cpuhp_notifiers; }; diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index cfa77cc15b..31af5f0e24 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -19,6 +19,7 @@ #include "exec/memattrs.h" #include "qemu/accel.h" #include "qom/object.h" +#include "linux-headers/linux/kvm.h" #ifdef NEED_CPU_H # ifdef CONFIG_KVM @@ -32,6 +33,7 @@ #ifdef CONFIG_KVM_IS_POSSIBLE extern bool kvm_allowed; +extern bool virtcca_cvm_allowed; extern bool kvm_kernel_irqchip; extern bool kvm_split_irqchip; extern bool kvm_async_interrupts_allowed; @@ -44,6 +46,8 @@ extern bool kvm_readonly_mem_allowed; extern bool kvm_msi_use_devid; #define kvm_enabled() (kvm_allowed) +#define virtcca_cvm_enabled() (virtcca_cvm_allowed) +#define VIRTCCA_CVM_TYPE (1UL << 8) /** * kvm_irqchip_in_kernel: * @@ -146,6 +150,8 @@ extern bool kvm_msi_use_devid; #else #define kvm_enabled() (0) +#define virtcca_cvm_enabled() (0) +#define VIRTCCA_CVM_TYPE (0) #define kvm_irqchip_in_kernel() (false) #define kvm_irqchip_is_split() (false) #define kvm_async_interrupts_enabled() (false) @@ -543,6 +549,9 @@ bool kvm_dirty_ring_enabled(void); uint32_t kvm_dirty_ring_size(void); +int kvm_load_user_data(hwaddr loader_start, hwaddr image_end, hwaddr initrd_start, hwaddr dtb_end, hwaddr ram_size, + struct kvm_numa_info *numa_info); + #ifdef __aarch64__ int kvm_create_shadow_device(PCIDevice *dev); int kvm_delete_shadow_device(PCIDevice *dev); diff --git a/linux-headers/asm-arm64/kvm.h b/linux-headers/asm-arm64/kvm.h index c59ea55cd8..2b040b5d60 100644 --- a/linux-headers/asm-arm64/kvm.h +++ b/linux-headers/asm-arm64/kvm.h @@ -110,6 +110,7 @@ struct kvm_regs { #define KVM_ARM_VCPU_PTRAUTH_ADDRESS 5 /* VCPU uses address authentication */ #define KVM_ARM_VCPU_PTRAUTH_GENERIC 6 /* VCPU uses generic authentication */ #define KVM_ARM_VCPU_HAS_EL2 7 /* Support nested virtualization */ +#define KVM_ARM_VCPU_TEC 8 /* VCPU TEC state as part of cvm */ struct kvm_vcpu_init { __u32 target; @@ -523,6 +524,67 @@ struct reg_mask_range { __u32 reserved[13]; }; +/* KVM_CAP_ARM_TMM on VM fd */ +#define KVM_CAP_ARM_TMM_CONFIG_CVM 0 +#define KVM_CAP_ARM_TMM_CREATE_RD 1 +#define KVM_CAP_ARM_TMM_POPULATE_CVM 2 +#define KVM_CAP_ARM_TMM_ACTIVATE_CVM 3 + +#define KVM_CAP_ARM_TMM_MEASUREMENT_ALGO_SHA256 0 +#define KVM_CAP_ARM_TMM_MEASUREMENT_ALGO_SHA512 1 + +#define KVM_CAP_ARM_TMM_RPV_SIZE 64 + +/* List of configuration items accepted for KVM_CAP_ARM_RME_CONFIG_REALM */ +#define KVM_CAP_ARM_TMM_CFG_RPV 0 +#define KVM_CAP_ARM_TMM_CFG_HASH_ALGO 1 +#define KVM_CAP_ARM_TMM_CFG_SVE 2 +#define KVM_CAP_ARM_TMM_CFG_DBG 3 +#define KVM_CAP_ARM_TMM_CFG_PMU 4 + +struct kvm_cap_arm_tmm_config_item { + __u32 cfg; + union { + /* cfg == KVM_CAP_ARM_TMM_CFG_RPV */ + struct { + __u8 rpv[KVM_CAP_ARM_TMM_RPV_SIZE]; + }; + + /* cfg == KVM_CAP_ARM_TMM_CFG_HASH_ALGO */ + 
struct { + __u32 hash_algo; + }; + + /* cfg == KVM_CAP_ARM_TMM_CFG_SVE */ + struct { + __u32 sve_vq; + }; + + /* cfg == KVM_CAP_ARM_TMM_CFG_DBG */ + struct { + __u32 num_brps; + __u32 num_wrps; + }; + + /* cfg == KVM_CAP_ARM_TMM_CFG_PMU */ + struct { + __u32 num_pmu_cntrs; + }; + /* Fix the size of the union */ + __u8 reserved[256]; + }; +}; + +#define KVM_ARM_TMM_POPULATE_FLAGS_MEASURE (1U << 0) +struct kvm_cap_arm_tmm_populate_region_args { + __u64 populate_ipa_base1; + __u64 populate_ipa_size1; + __u64 populate_ipa_base2; + __u64 populate_ipa_size2; + __u32 flags; + __u32 reserved[3]; +}; + #endif #endif /* __ARM_KVM_H__ */ diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h index 56f6b2583f..8d12435e41 100644 --- a/linux-headers/linux/kvm.h +++ b/linux-headers/linux/kvm.h @@ -14,6 +14,8 @@ #include #include +#include "sysemu/numa.h" + #define KVM_API_VERSION 12 /* *** Deprecated interfaces *** */ @@ -1198,6 +1200,8 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES 229 #define KVM_CAP_ARM_SUPPORTED_REG_MASK_RANGES 230 +#define KVM_CAP_ARM_TMM 300 + #define KVM_CAP_ARM_VIRT_MSI_BYPASS 799 #ifdef KVM_CAP_IRQ_ROUTING @@ -1469,6 +1473,32 @@ struct kvm_vfio_spapr_tce { __s32 tablefd; }; +#define MAX_NUMA_NODE 8 +#define MAX_CPU_BIT_MAP 4 +#define MAX_NODE_BIT_MAP (MAX_NODES / BITS_PER_LONG) + +struct kvm_numa_node { + __u64 numa_id; + __u64 ipa_start; + __u64 ipa_size; + __u64 host_numa_nodes[MAX_NODE_BIT_MAP]; + __u64 cpu_id[MAX_CPU_BIT_MAP]; +}; + +struct kvm_numa_info { + __u64 numa_cnt; + struct kvm_numa_node numa_nodes[MAX_NUMA_NODE]; +}; + +struct kvm_user_data { + __u64 loader_start; + __u64 image_end; + __u64 initrd_start; + __u64 dtb_end; + __u64 ram_size; + struct kvm_numa_info numa_info; +}; + /* * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns * a vcpu fd. @@ -1481,7 +1511,7 @@ struct kvm_vfio_spapr_tce { struct kvm_userspace_memory_region) #define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47) #define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO, 0x48, __u64) - +#define KVM_LOAD_USER_DATA _IOW(KVMIO, 0x49, struct kvm_user_data) /* enable ucontrol for s390 */ struct kvm_s390_ucas_mapping { __u64 user_addr; diff --git a/qapi/qom.json b/qapi/qom.json index c53ef978ff..213edd8db2 100644 --- a/qapi/qom.json +++ b/qapi/qom.json @@ -899,6 +899,29 @@ 'data': { '*cpu-affinity': ['uint16'], '*node-affinity': ['uint16'] } } +## +# @TmmGuestMeasurementAlgo: +# +# Algorithm to use for cvm measurements +# +# Since: FIXME +## +{ 'enum': 'TmmGuestMeasurementAlgo', +'data': ['default', 'sha256', 'sha512'] } + +## +# @TmmGuestProperties: +# +# Properties for tmm-guest objects. 
+# +# @sve-vector-length: SVE vector length (default: 0, SVE disabled) +# +# Since: FIXME +## +{ 'struct': 'TmmGuestProperties', + 'data': { '*sve-vector-length': 'uint32', + '*num-pmu-counters': 'uint32', + '*measurement-algo': 'TmmGuestMeasurementAlgo' } } ## # @ObjectType: @@ -962,7 +985,8 @@ 'tls-creds-x509', 'tls-cipher-suites', { 'name': 'x-remote-object', 'features': [ 'unstable' ] }, - { 'name': 'x-vfio-user-server', 'features': [ 'unstable' ] } + { 'name': 'x-vfio-user-server', 'features': [ 'unstable' ] }, + 'tmm-guest' ] } ## @@ -1029,7 +1053,8 @@ 'tls-creds-x509': 'TlsCredsX509Properties', 'tls-cipher-suites': 'TlsCredsProperties', 'x-remote-object': 'RemoteObjectProperties', - 'x-vfio-user-server': 'VfioUserServerProperties' + 'x-vfio-user-server': 'VfioUserServerProperties', + 'tmm-guest': 'TmmGuestProperties' } } ## diff --git a/target/arm/kvm-tmm.c b/target/arm/kvm-tmm.c new file mode 100644 index 0000000000..efe2ca0006 --- /dev/null +++ b/target/arm/kvm-tmm.c @@ -0,0 +1,344 @@ +/* + * QEMU add virtcca cvm feature. + * + * Copyright (c) Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" +#include "exec/confidential-guest-support.h" +#include "hw/boards.h" +#include "hw/core/cpu.h" +#include "kvm_arm.h" +#include "migration/blocker.h" +#include "qapi/error.h" +#include "qom/object_interfaces.h" +#include "sysemu/kvm.h" +#include "sysemu/runstate.h" +#include "hw/loader.h" + +#define TYPE_TMM_GUEST "tmm-guest" +OBJECT_DECLARE_SIMPLE_TYPE(TmmGuest, TMM_GUEST) + +#define TMM_PAGE_SIZE qemu_real_host_page_size() +#define TMM_MAX_PMU_CTRS 0x20 +#define TMM_MAX_CFG 5 + +struct TmmGuest { + ConfidentialGuestSupport parent_obj; + GSList *ram_regions; + TmmGuestMeasurementAlgo measurement_algo; + uint32_t sve_vl; + uint32_t num_pmu_cntrs; +}; + +typedef struct { + hwaddr base1; + hwaddr len1; + hwaddr base2; + hwaddr len2; + bool populate; +} TmmRamRegion; + +static TmmGuest *tmm_guest; + +bool kvm_arm_tmm_enabled(void) +{ + return !!tmm_guest; +} + +static int tmm_configure_one(TmmGuest *guest, uint32_t cfg, Error **errp) +{ + int ret = 1; + const char *cfg_str; + struct kvm_cap_arm_tmm_config_item args = { + .cfg = cfg, + }; + + switch (cfg) { + case KVM_CAP_ARM_TMM_CFG_RPV: + return 0; + case KVM_CAP_ARM_TMM_CFG_HASH_ALGO: + switch (guest->measurement_algo) { + case TMM_GUEST_MEASUREMENT_ALGO_DEFAULT: + return 0; + case TMM_GUEST_MEASUREMENT_ALGO_SHA256: + args.hash_algo = KVM_CAP_ARM_TMM_MEASUREMENT_ALGO_SHA256; + break; + case TMM_GUEST_MEASUREMENT_ALGO_SHA512: + args.hash_algo = KVM_CAP_ARM_TMM_MEASUREMENT_ALGO_SHA512; + break; + default: + g_assert_not_reached(); + } + cfg_str = "hash algorithm"; + break; + case KVM_CAP_ARM_TMM_CFG_SVE: + if (!guest->sve_vl) { + return 0; + } + args.sve_vq = guest->sve_vl / 128; + cfg_str = "SVE"; + break; + case KVM_CAP_ARM_TMM_CFG_DBG: + return 0; + case KVM_CAP_ARM_TMM_CFG_PMU: + if (!guest->num_pmu_cntrs) { + return 0; + } + args.num_pmu_cntrs = guest->num_pmu_cntrs; + cfg_str = "PMU"; + break; + default: + g_assert_not_reached(); + } + + ret = kvm_vm_enable_cap(kvm_state, KVM_CAP_ARM_TMM, 0, + KVM_CAP_ARM_TMM_CONFIG_CVM, (intptr_t)&args); + if (ret) { + error_setg_errno(errp, -ret, "TMM: failed to configure %s", cfg_str); + } + + return ret; +} + +static gint tmm_compare_ram_regions(gconstpointer a, gconstpointer b) +{ + const TmmRamRegion *ra = a; + const 
TmmRamRegion *rb = b; + + g_assert(ra->base1 != rb->base1); + return ra->base1 < rb->base1 ? -1 : 1; +} + +void tmm_add_ram_region(hwaddr base1, hwaddr len1, hwaddr base2, hwaddr len2, bool populate) +{ + TmmRamRegion *region; + + region = g_new0(TmmRamRegion, 1); + region->base1 = QEMU_ALIGN_DOWN(base1, TMM_PAGE_SIZE); + region->len1 = QEMU_ALIGN_UP(len1, TMM_PAGE_SIZE); + region->base2 = QEMU_ALIGN_DOWN(base2, TMM_PAGE_SIZE); + region->len2 = QEMU_ALIGN_UP(len2, TMM_PAGE_SIZE); + region->populate = populate; + + tmm_guest->ram_regions = g_slist_insert_sorted(tmm_guest->ram_regions, + region, tmm_compare_ram_regions); +} + +static void tmm_populate_region(gpointer data, gpointer unused) +{ + int ret; + const TmmRamRegion *region = data; + struct kvm_cap_arm_tmm_populate_region_args populate_args = { + .populate_ipa_base1 = region->base1, + .populate_ipa_size1 = region->len1, + .populate_ipa_base2 = region->base2, + .populate_ipa_size2 = region->len2, + .flags = KVM_ARM_TMM_POPULATE_FLAGS_MEASURE, + }; + + if (!region->populate) { + return; + } + + ret = kvm_vm_enable_cap(kvm_state, KVM_CAP_ARM_TMM, 0, + KVM_CAP_ARM_TMM_POPULATE_CVM, + (intptr_t)&populate_args); + if (ret) { + error_report("TMM: failed to populate cvm region (0x%"HWADDR_PRIx", 0x%"HWADDR_PRIx", 0x%"HWADDR_PRIx", 0x%"HWADDR_PRIx"): %s", + region->base1, region->len1, region->base2, region->len2, strerror(-ret)); + exit(1); + } +} + +static int tmm_create_rd(Error **errp) +{ + int ret = kvm_vm_enable_cap(kvm_state, KVM_CAP_ARM_TMM, 0, + KVM_CAP_ARM_TMM_CREATE_RD); + if (ret) { + error_setg_errno(errp, -ret, "TMM: failed to create tmm Descriptor"); + } + return ret; +} + +static void tmm_vm_state_change(void *opaque, bool running, RunState state) +{ + int ret; + CPUState *cs; + + if (!running) { + return; + } + + g_slist_foreach(tmm_guest->ram_regions, tmm_populate_region, NULL); + g_slist_free_full(g_steal_pointer(&tmm_guest->ram_regions), g_free); + + CPU_FOREACH(cs) { + ret = kvm_arm_vcpu_finalize(cs, KVM_ARM_VCPU_TEC); + if (ret) { + error_report("TMM: failed to finalize vCPU: %s", strerror(-ret)); + exit(1); + } + } + + ret = kvm_vm_enable_cap(kvm_state, KVM_CAP_ARM_TMM, 0, + KVM_CAP_ARM_TMM_ACTIVATE_CVM); + if (ret) { + error_report("TMM: failed to activate cvm: %s", strerror(-ret)); + exit(1); + } +} + +int kvm_arm_tmm_init(ConfidentialGuestSupport *cgs, Error **errp) +{ + int ret; + int cfg; + + if (!tmm_guest) { + return -ENODEV; + } + + if (!kvm_check_extension(kvm_state, KVM_CAP_ARM_TMM)) { + error_setg(errp, "KVM does not support TMM"); + return -ENODEV; + } + + for (cfg = 0; cfg < TMM_MAX_CFG; cfg++) { + ret = tmm_configure_one(tmm_guest, cfg, &error_abort); + if (ret) { + return ret; + } + } + + ret = tmm_create_rd(&error_abort); + if (ret) { + return ret; + } + + qemu_add_vm_change_state_handler(tmm_vm_state_change, NULL); + return 0; +} + +static void tmm_get_sve_vl(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + TmmGuest *guest = TMM_GUEST(obj); + + visit_type_uint32(v, name, &guest->sve_vl, errp); +} + +static void tmm_set_sve_vl(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + TmmGuest *guest = TMM_GUEST(obj); + uint32_t value; + + if (!visit_type_uint32(v, name, &value, errp)) { + return; + } + + if (value & 0x7f || value >= ARM_MAX_VQ * 128) { + error_setg(errp, "invalid SVE vector length"); + return; + } + + guest->sve_vl = value; +} + +static void tmm_get_num_pmu_cntrs(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + 
TmmGuest *guest = TMM_GUEST(obj); + + visit_type_uint32(v, name, &guest->num_pmu_cntrs, errp); +} + +static void tmm_set_num_pmu_cntrs(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + TmmGuest *guest = TMM_GUEST(obj); + uint32_t value; + + if (!visit_type_uint32(v, name, &value, errp)) { + return; + } + + if (value >= TMM_MAX_PMU_CTRS) { + error_setg(errp, "invalid number of PMU counters"); + return; + } + + guest->num_pmu_cntrs = value; +} + +static int tmm_get_measurement_algo(Object *obj, Error **errp G_GNUC_UNUSED) +{ + TmmGuest *guest = TMM_GUEST(obj); + + return guest->measurement_algo; +} + +static void tmm_set_measurement_algo(Object *obj, int algo, Error **errp G_GNUC_UNUSED) +{ + TmmGuest *guest = TMM_GUEST(obj); + + guest->measurement_algo = algo; +} + +static void tmm_guest_class_init(ObjectClass *oc, void *data) +{ + object_class_property_add_enum(oc, "measurement-algo", + "TmmGuestMeasurementAlgo", + &TmmGuestMeasurementAlgo_lookup, + tmm_get_measurement_algo, + tmm_set_measurement_algo); + object_class_property_set_description(oc, "measurement-algo", + "cvm measurement algorithm ('sha256', 'sha512')"); + /* + * This is not ideal. Normally SVE parameters are given to -cpu, but the + * cvm parameters are needed much earlier than CPU initialization. We also + * don't have a way to discover what is supported at the moment, the idea is + * that the user knows exactly what hardware it is running on because these + * parameters are part of the measurement and play in the attestation. + */ + object_class_property_add(oc, "sve-vector-length", "uint32", tmm_get_sve_vl, + tmm_set_sve_vl, NULL, NULL); + object_class_property_set_description(oc, "sve-vector-length", + "SVE vector length. 0 disables SVE (the default)"); + object_class_property_add(oc, "num-pmu-counters", "uint32", + tmm_get_num_pmu_cntrs, tmm_set_num_pmu_cntrs, + NULL, NULL); + object_class_property_set_description(oc, "num-pmu-counters", + "Number of PMU counters"); +} + +static void tmm_guest_instance_init(Object *obj) +{ + if (tmm_guest) { + error_report("a single instance of TmmGuest is supported"); + exit(1); + } + tmm_guest = TMM_GUEST(obj); +} + +static const TypeInfo tmm_guest_info = { + .parent = TYPE_CONFIDENTIAL_GUEST_SUPPORT, + .name = TYPE_TMM_GUEST, + .instance_size = sizeof(struct TmmGuest), + .instance_init = tmm_guest_instance_init, + .class_init = tmm_guest_class_init, + .interfaces = (InterfaceInfo[]) { + { TYPE_USER_CREATABLE }, + { } + } +}; + +static void tmm_register_types(void) +{ + type_register_static(&tmm_guest_info); +} +type_init(tmm_register_types); diff --git a/target/arm/kvm.c b/target/arm/kvm.c index 1ceb72a1c1..ee5ba68305 100644 --- a/target/arm/kvm.c +++ b/target/arm/kvm.c @@ -613,6 +613,10 @@ bool write_list_to_kvmstate(ARMCPU *cpu, int level) continue; } + if (virtcca_cvm_enabled() && regidx == KVM_REG_ARM_TIMER_CNT) { + continue; + } + switch (regidx & KVM_REG_SIZE_MASK) { case KVM_REG_SIZE_U32: v32 = cpu->cpreg_values[i]; @@ -1212,7 +1216,7 @@ int kvm_arch_msi_data_to_gsi(uint32_t data) bool kvm_arch_cpu_check_are_resettable(void) { - return true; + return !virtcca_cvm_enabled(); } static void kvm_arch_get_eager_split_size(Object *obj, Visitor *v, diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c index 8f01d485b0..b099287ed0 100644 --- a/target/arm/kvm64.c +++ b/target/arm/kvm64.c @@ -584,6 +584,11 @@ static int kvm_arm_sve_set_vls(CPUState *cs) assert(cpu->sve_max_vq <= KVM_ARM64_SVE_VQ_MAX); + if (virtcca_cvm_enabled()) { + /* Already set through tmm 
config */ + return 0; + } + return kvm_set_one_reg(cs, KVM_REG_ARM64_SVE_VLS, &vls[0]); } diff --git a/target/arm/kvm_arm.h b/target/arm/kvm_arm.h index bf4df54c96..d6c7139f4a 100644 --- a/target/arm/kvm_arm.h +++ b/target/arm/kvm_arm.h @@ -388,6 +388,11 @@ void kvm_arm_pvtime_init(CPUState *cs, uint64_t ipa); int kvm_arm_set_irq(int cpu, int irqtype, int irq, int level); +void tmm_add_ram_region(hwaddr base1, hwaddr len1, hwaddr base2, hwaddr len2, bool populate); + +int kvm_arm_tmm_init(ConfidentialGuestSupport *cgs, Error **errp); +bool kvm_arm_tmm_enabled(void); + /** * kvm_arm_set_smccc_filter * @func: funcion @@ -475,6 +480,17 @@ static inline int kvm_arm_set_smccc_filter(uint64_t func, uint8_t faction) { g_assert_not_reached(); } + +static inline int kvm_arm_tmm_init(ConfidentialGuestSupport *cgs, Error **errp G_GNUC_UNUSED) +{ + g_assert_not_reached(); +} + +static inline void tmm_add_ram_region(hwaddr base1, hwaddr len1, hwaddr base2, + hwaddr len2, bool populate) +{ + g_assert_not_reached(); +} #endif /** diff --git a/target/arm/meson.build b/target/arm/meson.build index d1dd4932ed..389ee54658 100644 --- a/target/arm/meson.build +++ b/target/arm/meson.build @@ -10,6 +10,7 @@ arm_ss.add(zlib) arm_ss.add(when: 'CONFIG_KVM', if_true: files('hyp_gdbstub.c', 'kvm.c', 'kvm64.c'), if_false: files('kvm-stub.c')) arm_ss.add(when: 'CONFIG_HVF', if_true: files('hyp_gdbstub.c')) +arm_ss.add(when: 'CONFIG_KVM', if_true: files('kvm.c', 'kvm64.c', 'kvm-tmm.c'), if_false: files('kvm-stub.c')) arm_ss.add(when: 'TARGET_AARCH64', if_true: files( 'cpu64.c', -- Gitee From 8bc3dd094a9daa348d49436dc4d0867b7b514ba7 Mon Sep 17 00:00:00 2001 From: Robert Hoo Date: Fri, 12 Jan 2024 14:00:41 +0800 Subject: [PATCH 287/939] target/i386: add support for LAM in CPUID enumeration commit ba6780905943696d790cc880c8e5684b51f027fe upstream. Linear Address Masking (LAM) is a new Intel CPU feature, which allows software to make use of the untranslated address bits for metadata. The bit definition: CPUID.(EAX=7,ECX=1):EAX[26] Add CPUID definition for LAM. Note that the LAM feature is not supported for TCG of target-i386; the LAM CPUID bit will not be added to TCG_7_1_EAX_FEATURES. More info can be found in Intel ISE Chapter "LINEAR ADDRESS MASKING(LAM)" https://cdrdv2.intel.com/v1/dl/getContent/671368 Intel-SIG: commit ba6780905943 target/i386: add support for LAM in CPUID enumeration Backport Qemu Linear Address Masking (LAM) support.
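As a usage note (an assumption based on the feature name added in the hunk below, not something stated in the patch): once the host kernel reports LAM support, the new flag can be toggled like any other CPUID feature bit, for example

    qemu-system-x86_64 -accel kvm -cpu <model>,+lam ...

where <model> stands in for whichever CPU model is in use.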
Signed-off-by: Robert Hoo Co-developed-by: Binbin Wu Signed-off-by: Binbin Wu Tested-by: Xuelian Guo Reviewed-by: Xiaoyao Li Reviewed-by: Zhao Liu Message-ID: <20240112060042.19925-2-binbin.wu@linux.intel.com> Signed-off-by: Paolo Bonzini [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- target/i386/cpu.c | 2 +- target/i386/cpu.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 711370d9b8..19ebd49e8c 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -967,7 +967,7 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { "fsrc", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "amx-fp16", NULL, "avx-ifma", - NULL, NULL, NULL, NULL, + NULL, NULL, "lam", NULL, NULL, NULL, NULL, NULL, }, .cpuid = { diff --git a/target/i386/cpu.h b/target/i386/cpu.h index 6993552cd9..8dbcb4a35f 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -926,6 +926,8 @@ uint64_t x86_cpu_get_supported_feature_word(FeatureWord w, #define CPUID_7_1_EAX_AMX_FP16 (1U << 21) /* Support for VPMADD52[H,L]UQ */ #define CPUID_7_1_EAX_AVX_IFMA (1U << 23) +/* Linear Address Masking */ +#define CPUID_7_1_EAX_LAM (1U << 26) /* Support for VPDPB[SU,UU,SS]D[,S] */ #define CPUID_7_1_EDX_AVX_VNNI_INT8 (1U << 4) -- Gitee From 03e73f225c44daa067ff1c57845dcd4678897a49 Mon Sep 17 00:00:00 2001 From: Binbin Wu Date: Fri, 12 Jan 2024 14:00:42 +0800 Subject: [PATCH 288/939] target/i386: add control bits support for LAM commit 0117067131f99acaab4f4d2cca0290c5510e37cf upstream. LAM uses CR3[61] and CR3[62] to configure/enable LAM on user pointers. LAM uses CR4[28] to configure/enable LAM on supervisor pointers. For CR3 LAM bits, no additional handling needed: - TCG LAM is not supported for TCG of target-i386. helper_write_crN() and helper_vmrun() check max physical address bits before calling cpu_x86_update_cr3(), no change needed, i.e. CR3 LAM bits are not allowed to be set in TCG. - gdbstub x86_cpu_gdb_write_register() will call cpu_x86_update_cr3() to update cr3. Allow gdb to set the LAM bit(s) to CR3, if vcpu doesn't support LAM, KVM_SET_SREGS will fail as other reserved bits. For CR4 LAM bit, its reservation depends on vcpu supporting LAM feature or not. - TCG LAM is not supported for TCG of target-i386. helper_write_crN() and helper_vmrun() check CR4 reserved bit before calling cpu_x86_update_cr4(), i.e. CR4 LAM bit is not allowed to be set in TCG. - gdbstub x86_cpu_gdb_write_register() will call cpu_x86_update_cr4() to update cr4. Mask out LAM bit on CR4 if vcpu doesn't support LAM. - x86_cpu_reset_hold() doesn't need special handling. Intel-SIG: commit 0117067131f9 target/i386: add control bits support for LAM Backport Qemu Linear Address Masking (LAM) support. 
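For reference, a short sketch of the control bits this message refers to (the CR3 bit names follow Intel's LAM documentation and are only illustrative here; the patch itself adds just the CR4 mask shown in the hunk below):

    /* CR3 LAM bits for user pointers: not masked by QEMU; KVM_SET_SREGS
     * rejects them as reserved if the vCPU does not support LAM. */
    #define CR3_LAM_U57_BIT 61    /* LAM for user pointers, 57-bit addressing */
    #define CR3_LAM_U48_BIT 62    /* LAM for user pointers, 48-bit addressing */
    /* CR4 LAM bit for supervisor pointers, added below as CR4_LAM_SUP_MASK */
    #define CR4_LAM_SUP_MASK (1U << 28)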
Signed-off-by: Binbin Wu Tested-by: Xuelian Guo Reviewed-by: Xiaoyao Li Reviewed-by: Zhao Liu Message-ID: <20240112060042.19925-3-binbin.wu@linux.intel.com> Signed-off-by: Paolo Bonzini [ Zhiquan Li: amend commit log ] Signed-off-by: Zhiquan Li --- target/i386/cpu.h | 7 ++++++- target/i386/helper.c | 4 ++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/target/i386/cpu.h b/target/i386/cpu.h index 8dbcb4a35f..b0666167d2 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -262,6 +262,7 @@ typedef enum X86Seg { #define CR4_SMAP_MASK (1U << 21) #define CR4_PKE_MASK (1U << 22) #define CR4_PKS_MASK (1U << 24) +#define CR4_LAM_SUP_MASK (1U << 28) #define CR4_RESERVED_MASK \ (~(target_ulong)(CR4_VME_MASK | CR4_PVI_MASK | CR4_TSD_MASK \ @@ -270,7 +271,8 @@ typedef enum X86Seg { | CR4_OSFXSR_MASK | CR4_OSXMMEXCPT_MASK | CR4_UMIP_MASK \ | CR4_LA57_MASK \ | CR4_FSGSBASE_MASK | CR4_PCIDE_MASK | CR4_OSXSAVE_MASK \ - | CR4_SMEP_MASK | CR4_SMAP_MASK | CR4_PKE_MASK | CR4_PKS_MASK)) + | CR4_SMEP_MASK | CR4_SMAP_MASK | CR4_PKE_MASK | CR4_PKS_MASK \ + | CR4_LAM_SUP_MASK)) #define DR6_BD (1 << 13) #define DR6_BS (1 << 14) @@ -2527,6 +2529,9 @@ static inline uint64_t cr4_reserved_bits(CPUX86State *env) if (!(env->features[FEAT_7_0_ECX] & CPUID_7_0_ECX_PKS)) { reserved_bits |= CR4_PKS_MASK; } + if (!(env->features[FEAT_7_1_EAX] & CPUID_7_1_EAX_LAM)) { + reserved_bits |= CR4_LAM_SUP_MASK; + } return reserved_bits; } diff --git a/target/i386/helper.c b/target/i386/helper.c index 2070dd0dda..1da7a7d315 100644 --- a/target/i386/helper.c +++ b/target/i386/helper.c @@ -219,6 +219,10 @@ void cpu_x86_update_cr4(CPUX86State *env, uint32_t new_cr4) new_cr4 &= ~CR4_PKS_MASK; } + if (!(env->features[FEAT_7_1_EAX] & CPUID_7_1_EAX_LAM)) { + new_cr4 &= ~CR4_LAM_SUP_MASK; + } + env->cr[4] = new_cr4; env->hflags = hflags; -- Gitee From 2da2e7ebea456360cc41881ff2e4a81a03b6d10c Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Thu, 7 May 2020 22:26:17 +0000 Subject: [PATCH 289/939] doc: update AMD SEV to include Live migration flow cherry-picked from https://github.com/AMDESE/qemu/commit/0e2b3d80e3. Reviewed-by: Dr. David Alan Gilbert Signed-off-by: Brijesh Singh Signed-off-by: Ashish Kalra Signed-off-by: hanliyang --- docs/system/i386/amd-memory-encryption.rst | 40 +++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/docs/system/i386/amd-memory-encryption.rst b/docs/system/i386/amd-memory-encryption.rst index e9bc142bc1..b7e3f46ff6 100644 --- a/docs/system/i386/amd-memory-encryption.rst +++ b/docs/system/i386/amd-memory-encryption.rst @@ -177,7 +177,45 @@ TODO Live Migration --------------- -TODO +AMD SEV encrypts the memory of VMs and because a different key is used +in each VM, the hypervisor will be unable to simply copy the +ciphertext from one VM to another to migrate the VM. Instead the AMD SEV Key +Management API provides sets of function which the hypervisor can use +to package a guest page for migration, while maintaining the confidentiality +provided by AMD SEV. + +SEV guest VMs have the concept of private and shared memory. The private +memory is encrypted with the guest-specific key, while shared memory may +be encrypted with the hypervisor key. The migration APIs provided by the +SEV API spec should be used for migrating the private pages. The +KVM_GET_PAGE_ENC_BITMAP ioctl can be used to get the guest page encryption +bitmap. The bitmap can be used to check if the given guest page is +private or shared. 
+ +Before initiating the migration, we need to know the targets machine's public +Diffie-Hellman key (PDH) and certificate chain. It can be retrieved +with the 'query-sev-capabilities' QMP command or using the sev-tool. The +migrate-set-parameter can be used to pass the target machine's PDH and +certificate chain. + +During the migration flow, the SEND_START is called on the source hypervisor +to create an outgoing encryption context. The SEV guest policy dictates whether +the certificate passed through the migrate-sev-set-info command will be +validated. SEND_UPDATE_DATA is called to encrypt the guest private pages. +After migration is completed, SEND_FINISH is called to destroy the encryption +context and make the VM non-runnable to protect it against cloning. + +On the target machine, RECEIVE_START is called first to create an +incoming encryption context. The RECEIVE_UPDATE_DATA is called to copy +the received encrypted page into guest memory. After migration has +completed, RECEIVE_FINISH is called to make the VM runnable. + +For more information about the migration see SEV API Appendix A +Usage flow (Live migration section). + +NOTE: +To protect against the memory clone SEV APIs are designed to make the VM +unrunnable in case of the migration failure. References ---------- -- Gitee From 5ff59a5649385672da42097b24a2428bc2348d9b Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Tue, 27 Jul 2021 11:27:00 +0000 Subject: [PATCH 290/939] migration.json: add AMD SEV specific migration parameters cherry-picked from https://github.com/AMDESE/qemu/commit/d6a23bde6b6e. AMD SEV migration flow requires that target machine's public Diffie-Hellman key (PDH) and certificate chain must be passed before initiating the guest migration. User can use QMP 'migrate-set-parameters' to pass the certificate chain. The certificate chain will be used while creating the outgoing encryption context. Signed-off-by: Brijesh Singh Signed-off-by: Ashish Kalra [ Fix conflicts and qapi errors. 
] Signed-off-by: hanliyang --- migration/migration-hmp-cmds.c | 28 ++++++++++++++++ migration/options.c | 60 ++++++++++++++++++++++++++++++++++ qapi/migration.json | 41 +++++++++++++++++++++-- 3 files changed, 126 insertions(+), 3 deletions(-) diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c index 1fa6a5f478..7ce0446d46 100644 --- a/migration/migration-hmp-cmds.c +++ b/migration/migration-hmp-cmds.c @@ -395,6 +395,19 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict) monitor_printf(mon, "%s: %s\n", MigrationParameter_str(MIGRATION_PARAMETER_MODE), qapi_enum_lookup(&MigMode_lookup, params->mode)); + + assert(params->sev_pdh); + monitor_printf(mon, "%s: %s\n", + MigrationParameter_str(MIGRATION_PARAMETER_SEV_PDH), + params->sev_pdh); + assert(params->sev_plat_cert); + monitor_printf(mon, "%s: %s\n", + MigrationParameter_str(MIGRATION_PARAMETER_SEV_PLAT_CERT), + params->sev_plat_cert); + assert(params->sev_amd_cert); + monitor_printf(mon, "%s: %s\n", + MigrationParameter_str(MIGRATION_PARAMETER_SEV_AMD_CERT), + params->sev_amd_cert); } qapi_free_MigrationParameters(params); @@ -691,6 +704,21 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) p->has_mode = true; visit_type_MigMode(v, param, &p->mode, &err); break; + case MIGRATION_PARAMETER_SEV_PDH: + p->sev_pdh = g_new0(StrOrNull, 1); + p->sev_pdh->type = QTYPE_QSTRING; + visit_type_str(v, param, &p->sev_pdh->u.s, &err); + break; + case MIGRATION_PARAMETER_SEV_PLAT_CERT: + p->sev_plat_cert = g_new0(StrOrNull, 1); + p->sev_plat_cert->type = QTYPE_QSTRING; + visit_type_str(v, param, &p->sev_plat_cert->u.s, &err); + break; + case MIGRATION_PARAMETER_SEV_AMD_CERT: + p->sev_amd_cert = g_new0(StrOrNull, 1); + p->sev_amd_cert->type = QTYPE_QSTRING; + visit_type_str(v, param, &p->sev_amd_cert->u.s, &err); + break; default: assert(0); } diff --git a/migration/options.c b/migration/options.c index 9b68962a65..71e71ea801 100644 --- a/migration/options.c +++ b/migration/options.c @@ -183,6 +183,9 @@ Property migration_properties[] = { DEFINE_PROP_MIG_MODE("mode", MigrationState, parameters.mode, MIG_MODE_NORMAL), + DEFINE_PROP_STRING("sev-pdh", MigrationState, parameters.sev_pdh), + DEFINE_PROP_STRING("sev-plat-cert", MigrationState, parameters.sev_plat_cert), + DEFINE_PROP_STRING("sev-amd-cert", MigrationState, parameters.sev_amd_cert), /* Migration capabilities */ DEFINE_PROP_MIG_CAP("x-xbzrle", MIGRATION_CAPABILITY_XBZRLE), @@ -1012,6 +1015,9 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp) params->announce_rounds = s->parameters.announce_rounds; params->has_announce_step = true; params->announce_step = s->parameters.announce_step; + params->sev_pdh = g_strdup(s->parameters.sev_pdh); + params->sev_plat_cert = g_strdup(s->parameters.sev_plat_cert); + params->sev_amd_cert = g_strdup(s->parameters.sev_amd_cert); if (s->parameters.has_block_bitmap_mapping) { params->has_block_bitmap_mapping = true; @@ -1063,6 +1069,10 @@ void migrate_params_init(MigrationParameters *params) params->has_x_vcpu_dirty_limit_period = true; params->has_vcpu_dirty_limit = true; params->has_mode = true; + + params->sev_pdh = g_strdup(""); + params->sev_plat_cert = g_strdup(""); + params->sev_amd_cert = g_strdup(""); } static bool compress_level_check(MigrationParameters *params, Error **errp) @@ -1392,6 +1402,19 @@ static void migrate_params_test_apply(MigrateSetParameters *params, if (params->has_mode) { dest->mode = params->mode; } + + if (params->sev_pdh) { + assert(params->sev_pdh->type == 
QTYPE_QSTRING); + dest->sev_pdh = params->sev_pdh->u.s; + } + if (params->sev_plat_cert) { + assert(params->sev_plat_cert->type == QTYPE_QSTRING); + dest->sev_plat_cert = params->sev_plat_cert->u.s; + } + if (params->sev_amd_cert) { + assert(params->sev_amd_cert->type == QTYPE_QSTRING); + dest->sev_amd_cert = params->sev_amd_cert->u.s; + } } static void migrate_params_apply(MigrateSetParameters *params, Error **errp) @@ -1540,6 +1563,22 @@ static void migrate_params_apply(MigrateSetParameters *params, Error **errp) if (params->has_mode) { s->parameters.mode = params->mode; } + + if (params->sev_pdh) { + g_free(s->parameters.sev_pdh); + assert(params->sev_pdh->type == QTYPE_QSTRING); + s->parameters.sev_pdh = g_strdup(params->sev_pdh->u.s); + } + if (params->sev_plat_cert) { + g_free(s->parameters.sev_plat_cert); + assert(params->sev_plat_cert->type == QTYPE_QSTRING); + s->parameters.sev_plat_cert = g_strdup(params->sev_plat_cert->u.s); + } + if (params->sev_amd_cert) { + g_free(s->parameters.sev_amd_cert); + assert(params->sev_amd_cert->type == QTYPE_QSTRING); + s->parameters.sev_amd_cert = g_strdup(params->sev_amd_cert->u.s); + } } void qmp_migrate_set_parameters(MigrateSetParameters *params, Error **errp) @@ -1565,6 +1604,27 @@ void qmp_migrate_set_parameters(MigrateSetParameters *params, Error **errp) params->tls_authz->type = QTYPE_QSTRING; params->tls_authz->u.s = strdup(""); } + /* TODO Rewrite "" to null instead */ + if (params->sev_pdh + && params->sev_pdh->type == QTYPE_QNULL) { + qobject_unref(params->sev_pdh->u.n); + params->sev_pdh->type = QTYPE_QSTRING; + params->sev_pdh->u.s = strdup(""); + } + /* TODO Rewrite "" to null instead */ + if (params->sev_plat_cert + && params->sev_plat_cert->type == QTYPE_QNULL) { + qobject_unref(params->sev_plat_cert->u.n); + params->sev_plat_cert->type = QTYPE_QSTRING; + params->sev_plat_cert->u.s = strdup(""); + } + /* TODO Rewrite "" to null instead */ + if (params->sev_amd_cert + && params->sev_amd_cert->type == QTYPE_QNULL) { + qobject_unref(params->sev_amd_cert->u.n); + params->sev_amd_cert->type = QTYPE_QSTRING; + params->sev_amd_cert->u.s = strdup(""); + } migrate_params_test_apply(params, &tmp); diff --git a/qapi/migration.json b/qapi/migration.json index 5d0855a1d8..038e99cba3 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -891,6 +891,15 @@ # @mode: Migration mode. See description in @MigMode. Default is 'normal'. # (Since 8.2) # +# @sev-pdh: The target host platform diffie-hellman key encoded in base64 +# (Since 4.2) +# +# @sev-plat-cert: The target host platform certificate chain encoded in base64 +# (Since 4.2) +# +# @sev-amd-cert: AMD certificate chain which include ASK and OCA encoded in +# base64 (Since 4.2) +# # Features: # # @deprecated: Member @block-incremental is deprecated. Use @@ -925,7 +934,8 @@ 'block-bitmap-mapping', { 'name': 'x-vcpu-dirty-limit-period', 'features': ['unstable'] }, 'vcpu-dirty-limit', - 'mode'] } + 'mode', + 'sev-pdh', 'sev-plat-cert', 'sev-amd-cert'] } ## # @MigrateSetParameters: @@ -1083,6 +1093,15 @@ # @mode: Migration mode. See description in @MigMode. Default is 'normal'. # (Since 8.2) # +# @sev-pdh: The target host platform diffie-hellman key encoded in base64 +# (Since 4.2) +# +# @sev-plat-cert: The target host platform certificate chain encoded in base64 +# (Since 4.2) +# +# @sev-amd-cert: AMD certificate chain which include ASK and OCA encoded in +# base64 (Since 4.2) +# # Features: # # @deprecated: Member @block-incremental is deprecated. 
Use @@ -1139,7 +1158,11 @@ '*x-vcpu-dirty-limit-period': { 'type': 'uint64', 'features': [ 'unstable' ] }, '*vcpu-dirty-limit': 'uint64', - '*mode': 'MigMode'} } + '*mode': 'MigMode', + '*sev-pdh': 'StrOrNull', + '*sev-plat-cert': 'StrOrNull', + '*sev-amd-cert' : 'StrOrNull' } } + ## # @migrate-set-parameters: @@ -1317,6 +1340,15 @@ # @mode: Migration mode. See description in @MigMode. Default is 'normal'. # (Since 8.2) # +# @sev-pdh: The target host platform diffie-hellman key encoded in base64 +# (Since 4.2) +# +# @sev-plat-cert: The target host platform certificate chain encoded in base64 +# (Since 4.2) +# +# @sev-amd-cert: AMD certificate chain which include ASK and OCA encoded in +# base64 (Since 4.2) +# # Features: # # @deprecated: Member @block-incremental is deprecated. Use @@ -1369,7 +1401,10 @@ '*x-vcpu-dirty-limit-period': { 'type': 'uint64', 'features': [ 'unstable' ] }, '*vcpu-dirty-limit': 'uint64', - '*mode': 'MigMode'} } + '*mode': 'MigMode', + '*sev-pdh': 'str', + '*sev-plat-cert': 'str', + '*sev-amd-cert' : 'str'} } ## # @query-migrate-parameters: -- Gitee From da96618de3227b87ddd78388b80278bde230ce79 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Tue, 27 Jul 2021 11:41:37 +0000 Subject: [PATCH 291/939] confidential guest support: introduce ConfidentialGuestMemoryEncryptionOps for encrypted VMs cherry-picked from https://github.com/AMDESE/qemu/commit/74fce7be9bd. When memory encryption is enabled in VM, the guest RAM will be encrypted with the guest-specific key, to protect the confidentiality of data while in transit we need to platform specific hooks to save or migrate the guest RAM. Introduce the new ConfidentialGuestMemoryEncryptionOps in this patch which will be later used by the encrypted guest for migration. Signed-off-by: Brijesh Singh Co-developed-by: Ashish Kalra Signed-off-by: Ashish Kalra Signed-off-by: hanliyang --- include/exec/confidential-guest-support.h | 27 +++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/include/exec/confidential-guest-support.h b/include/exec/confidential-guest-support.h index ba2dd4b5df..343f686fc2 100644 --- a/include/exec/confidential-guest-support.h +++ b/include/exec/confidential-guest-support.h @@ -53,8 +53,35 @@ struct ConfidentialGuestSupport { bool ready; }; +/** + * The functions registers with ConfidentialGuestMemoryEncryptionOps will be + * used during the encrypted guest migration. 
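 *
 * Roughly how these hooks are wired up, as a sketch based on the SEV
 * patches later in this series (the names below are the ones used there):
 *
 *   - the platform code registers its implementation at init time, e.g.
 *       cgs_class->memory_encryption_ops = &sev_memory_encryption_ops;
 *   - migration/ram.c reaches the ops through the machine's cgs object and
 *     calls save_setup() once, save_outgoing_page() for each private page,
 *     and save_outgoing_shared_regions_list() at completion; the incoming
 *     side mirrors this with load_incoming_page() and
 *     load_incoming_shared_regions_list().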
+ */ +struct ConfidentialGuestMemoryEncryptionOps { + /* Initialize the platform specific state before starting the migration */ + int (*save_setup)(const char *pdh, const char *plat_cert, + const char *amd_cert); + + /* Write the encrypted page and metadata associated with it */ + int (*save_outgoing_page)(QEMUFile *f, uint8_t *ptr, uint32_t size, + uint64_t *bytes_sent); + + /* Load the incoming encrypted page into guest memory */ + int (*load_incoming_page)(QEMUFile *f, uint8_t *ptr); + + /* Check if gfn is in shared/unencrypted region */ + bool (*is_gfn_in_unshared_region)(unsigned long gfn); + + /* Write the shared regions list */ + int (*save_outgoing_shared_regions_list)(QEMUFile *f); + + /* Load the shared regions list */ + int (*load_incoming_shared_regions_list)(QEMUFile *f); +}; + typedef struct ConfidentialGuestSupportClass { ObjectClass parent; + struct ConfidentialGuestMemoryEncryptionOps *memory_encryption_ops; } ConfidentialGuestSupportClass; #endif /* !CONFIG_USER_ONLY */ -- Gitee From f6753191237118294d04193908db503bb87619f7 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Tue, 27 Jul 2021 12:10:23 +0000 Subject: [PATCH 292/939] target/i386: sev: provide callback to setup outgoing context cherry-picked from https://github.com/AMDESE/qemu/commit/7521883afc0. The user provides the target machine's Platform Diffie-Hellman key (PDH) and certificate chain before starting the SEV guest migration. Cache the certificate chain as we need them while creating the outgoing context. Signed-off-by: Brijesh Singh Co-developed-by: Ashish Kalra Signed-off-by: Ashish Kalra [ Fix conflict. ] Signed-off-by: hanliyang --- target/i386/sev.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++ target/i386/sev.h | 2 ++ 2 files changed, 61 insertions(+) diff --git a/target/i386/sev.c b/target/i386/sev.c index 1a9d1db7a8..10233511cf 100644 --- a/target/i386/sev.c +++ b/target/i386/sev.c @@ -73,6 +73,12 @@ struct SevGuestState { int sev_fd; SevState state; gchar *measurement; + guchar *remote_pdh; + size_t remote_pdh_len; + guchar *remote_plat_cert; + size_t remote_plat_cert_len; + guchar *amd_cert; + size_t amd_cert_len; uint32_t reset_cs; uint32_t reset_ip; @@ -157,6 +163,12 @@ static const char *const sev_fw_errlist[] = { #define SEV_FW_MAX_ERROR ARRAY_SIZE(sev_fw_errlist) +#define SEV_FW_BLOB_MAX_SIZE 0x4000 /* 16KB */ + +static struct ConfidentialGuestMemoryEncryptionOps sev_memory_encryption_ops = { + .save_setup = sev_save_setup, +}; + static int sev_ioctl(int fd, int cmd, void *data, int *error) { @@ -906,6 +918,48 @@ sev_vm_state_change(void *opaque, bool running, RunState state) } } +static inline bool check_blob_length(size_t value) +{ + if (value > SEV_FW_BLOB_MAX_SIZE) { + error_report("invalid length max=%d got=%ld", + SEV_FW_BLOB_MAX_SIZE, value); + return false; + } + + return true; +} + +int sev_save_setup(const char *pdh, const char *plat_cert, + const char *amd_cert) +{ + SevGuestState *s = sev_guest; + + s->remote_pdh = g_base64_decode(pdh, &s->remote_pdh_len); + if (!check_blob_length(s->remote_pdh_len)) { + goto error; + } + + s->remote_plat_cert = g_base64_decode(plat_cert, + &s->remote_plat_cert_len); + if (!check_blob_length(s->remote_plat_cert_len)) { + goto error; + } + + s->amd_cert = g_base64_decode(amd_cert, &s->amd_cert_len); + if (!check_blob_length(s->amd_cert_len)) { + goto error; + } + + return 0; + +error: + g_free(s->remote_pdh); + g_free(s->remote_plat_cert); + g_free(s->amd_cert); + + return 1; +} + int sev_kvm_init(ConfidentialGuestSupport *cgs, Error 
**errp) { SevGuestState *sev @@ -920,6 +974,9 @@ int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp) return 0; } + ConfidentialGuestSupportClass *cgs_class = + (ConfidentialGuestSupportClass *) object_get_class(OBJECT(cgs)); + ret = ram_block_discard_disable(true); if (ret) { error_report("%s: cannot disable RAM discard", __func__); @@ -1013,6 +1070,8 @@ int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp) qemu_add_machine_init_done_notifier(&sev_machine_done_notify); qemu_add_vm_change_state_handler(sev_vm_state_change, sev); + cgs_class->memory_encryption_ops = &sev_memory_encryption_ops; + cgs->ready = true; return 0; diff --git a/target/i386/sev.h b/target/i386/sev.h index e7499c95b1..e96de021f5 100644 --- a/target/i386/sev.h +++ b/target/i386/sev.h @@ -51,6 +51,8 @@ uint32_t sev_get_reduced_phys_bits(void); bool sev_add_kernel_loader_hashes(SevKernelLoaderContext *ctx, Error **errp); int sev_encrypt_flash(uint8_t *ptr, uint64_t len, Error **errp); +int sev_save_setup(const char *pdh, const char *plat_cert, + const char *amd_cert); int sev_inject_launch_secret(const char *hdr, const char *secret, uint64_t gpa, Error **errp); -- Gitee From c8a6d5f18c45079575b707db8f017cce22acc970 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Tue, 27 Jul 2021 12:16:09 +0000 Subject: [PATCH 293/939] target/i386: sev: do not create launch context for an incoming guest cherry-picked from https://github.com/AMDESE/qemu/commit/b85694233495. The LAUNCH_START is used for creating an encryption context to encrypt newly created guest, for an incoming guest the RECEIVE_START should be used. Reviewed-by: Dr. David Alan Gilbert Signed-off-by: Brijesh Singh Signed-off-by: Ashish Kalra [ Fix conflict. ] Signed-off-by: hanliyang --- target/i386/sev.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/target/i386/sev.c b/target/i386/sev.c index 10233511cf..65984f013a 100644 --- a/target/i386/sev.c +++ b/target/i386/sev.c @@ -1060,10 +1060,16 @@ int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp) goto err; } - ret = sev_launch_start(sev); - if (ret) { - error_setg(errp, "%s: failed to create encryption context", __func__); - goto err; + /* + * The LAUNCH context is used for new guest, if its an incoming guest + * then RECEIVE context will be created after the connection is established. + */ + if (!runstate_check(RUN_STATE_INMIGRATE)) { + ret = sev_launch_start(sev); + if (ret) { + error_setg(errp, "%s: failed to create encryption context", __func__); + goto err; + } } ram_block_notifier_add(&sev_ram_notifier); -- Gitee From 0a7dde8450d9b6a6d0c75cef11e4bbff65e95edc Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Tue, 27 Jul 2021 12:55:25 +0000 Subject: [PATCH 294/939] target/i386: sev: add support to encrypt the outgoing page cherry-picked from https://github.com/AMDESE/qemu/commit/5187c6f86bd. The sev_save_outgoing_page() provide the implementation to encrypt the guest private pages during the transit. The routines uses the SEND_START command to create the outgoing encryption context on the first call then uses the SEND_UPDATE_DATA command to encrypt the data before writing it to the socket. While encrypting the data SEND_UPDATE_DATA produces some metadata (e.g MAC, IV). The metadata is also sent to the target machine. After migration is completed, we issue the SEND_FINISH command to transition the SEV guest state from sending to unrunnable state. Signed-off-by: Brijesh Singh Co-developed-by: Ashish Kalra Signed-off-by: Ashish Kalra [ Fix conflict. 
] Signed-off-by: hanliyang --- target/i386/sev.c | 219 +++++++++++++++++++++++++++++++++++++++ target/i386/sev.h | 2 + target/i386/trace-events | 3 + 3 files changed, 224 insertions(+) diff --git a/target/i386/sev.c b/target/i386/sev.c index 65984f013a..e1fa0ec5e5 100644 --- a/target/i386/sev.c +++ b/target/i386/sev.c @@ -31,6 +31,8 @@ #include "sysemu/runstate.h" #include "trace.h" #include "migration/blocker.h" +#include "migration/qemu-file.h" +#include "migration/misc.h" #include "qom/object.h" #include "monitor/monitor.h" #include "monitor/hmp-target.h" @@ -79,6 +81,8 @@ struct SevGuestState { size_t remote_plat_cert_len; guchar *amd_cert; size_t amd_cert_len; + gchar *send_packet_hdr; + size_t send_packet_hdr_len; uint32_t reset_cs; uint32_t reset_ip; @@ -167,6 +171,7 @@ static const char *const sev_fw_errlist[] = { static struct ConfidentialGuestMemoryEncryptionOps sev_memory_encryption_ops = { .save_setup = sev_save_setup, + .save_outgoing_page = sev_save_outgoing_page, }; static int @@ -960,6 +965,38 @@ error: return 1; } +static void +sev_send_finish(void) +{ + int ret, error; + + trace_kvm_sev_send_finish(); + ret = sev_ioctl(sev_guest->sev_fd, KVM_SEV_SEND_FINISH, 0, &error); + if (ret) { + error_report("%s: SEND_FINISH ret=%d fw_error=%d '%s'", + __func__, ret, error, fw_error_to_str(error)); + } + + g_free(sev_guest->send_packet_hdr); + sev_set_guest_state(sev_guest, SEV_STATE_RUNNING); +} + +static void +sev_migration_state_notifier(Notifier *notifier, void *data) +{ + MigrationState *s = data; + + if (migration_has_finished(s) || + migration_in_postcopy_after_devices(s) || + migration_has_failed(s)) { + if (sev_check_state(sev_guest, SEV_STATE_SEND_UPDATE)) { + sev_send_finish(); + } + } +} + +static Notifier sev_migration_state; + int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp) { SevGuestState *sev @@ -1075,6 +1112,7 @@ int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp) ram_block_notifier_add(&sev_ram_notifier); qemu_add_machine_init_done_notifier(&sev_machine_done_notify); qemu_add_vm_change_state_handler(sev_vm_state_change, sev); + migration_add_notifier(&sev_migration_state, sev_migration_state_notifier); cgs_class->memory_encryption_ops = &sev_memory_encryption_ops; @@ -1317,6 +1355,187 @@ int sev_es_save_reset_vector(void *flash_ptr, uint64_t flash_size) return 0; } +static int +sev_get_send_session_length(void) +{ + int ret, fw_err = 0; + struct kvm_sev_send_start start = {}; + + ret = sev_ioctl(sev_guest->sev_fd, KVM_SEV_SEND_START, &start, &fw_err); + if (fw_err != SEV_RET_INVALID_LEN) { + ret = -1; + error_report("%s: failed to get session length ret=%d fw_error=%d '%s'", + __func__, ret, fw_err, fw_error_to_str(fw_err)); + goto err; + } + + ret = start.session_len; +err: + return ret; +} + +static int +sev_send_start(SevGuestState *s, QEMUFile *f, uint64_t *bytes_sent) +{ + gsize pdh_len = 0, plat_cert_len; + int session_len, ret, fw_error; + struct kvm_sev_send_start start = { }; + guchar *pdh = NULL, *plat_cert = NULL, *session = NULL; + Error *local_err = NULL; + + if (!s->remote_pdh || !s->remote_plat_cert || !s->amd_cert_len) { + error_report("%s: missing remote PDH or PLAT_CERT", __func__); + return 1; + } + + start.pdh_cert_uaddr = (uintptr_t) s->remote_pdh; + start.pdh_cert_len = s->remote_pdh_len; + + start.plat_certs_uaddr = (uintptr_t)s->remote_plat_cert; + start.plat_certs_len = s->remote_plat_cert_len; + + start.amd_certs_uaddr = (uintptr_t)s->amd_cert; + start.amd_certs_len = s->amd_cert_len; + + /* get the session length 
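       (done by sev_get_send_session_length() above: it issues SEND_START
       with a zeroed request and expects the firmware to answer with
       SEV_RET_INVALID_LEN while filling in the required session_len)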
*/ + session_len = sev_get_send_session_length(); + if (session_len < 0) { + ret = 1; + goto err; + } + + session = g_new0(guchar, session_len); + start.session_uaddr = (unsigned long)session; + start.session_len = session_len; + + /* Get our PDH certificate */ + ret = sev_get_pdh_info(s->sev_fd, &pdh, &pdh_len, + &plat_cert, &plat_cert_len, &local_err); + if (ret) { + error_report("Failed to get our PDH cert"); + goto err; + } + + trace_kvm_sev_send_start(start.pdh_cert_uaddr, start.pdh_cert_len, + start.plat_certs_uaddr, start.plat_certs_len, + start.amd_certs_uaddr, start.amd_certs_len); + + ret = sev_ioctl(s->sev_fd, KVM_SEV_SEND_START, &start, &fw_error); + if (ret < 0) { + error_report("%s: SEND_START ret=%d fw_error=%d '%s'", + __func__, ret, fw_error, fw_error_to_str(fw_error)); + goto err; + } + + qemu_put_be32(f, start.policy); + qemu_put_be32(f, pdh_len); + qemu_put_buffer(f, (uint8_t *)pdh, pdh_len); + qemu_put_be32(f, start.session_len); + qemu_put_buffer(f, (uint8_t *)start.session_uaddr, start.session_len); + *bytes_sent = 12 + pdh_len + start.session_len; + + sev_set_guest_state(s, SEV_STATE_SEND_UPDATE); + +err: + g_free(pdh); + g_free(plat_cert); + return ret; +} + +static int +sev_send_get_packet_len(int *fw_err) +{ + int ret; + struct kvm_sev_send_update_data update = { 0, }; + + ret = sev_ioctl(sev_guest->sev_fd, KVM_SEV_SEND_UPDATE_DATA, + &update, fw_err); + if (*fw_err != SEV_RET_INVALID_LEN) { + ret = -1; + error_report("%s: failed to get session length ret=%d fw_error=%d '%s'", + __func__, ret, *fw_err, fw_error_to_str(*fw_err)); + goto err; + } + + ret = update.hdr_len; + +err: + return ret; +} + +static int +sev_send_update_data(SevGuestState *s, QEMUFile *f, uint8_t *ptr, uint32_t size, + uint64_t *bytes_sent) +{ + int ret, fw_error; + guchar *trans; + struct kvm_sev_send_update_data update = { }; + + /* + * If this is first call then query the packet header bytes and allocate + * the packet buffer. + */ + if (!s->send_packet_hdr) { + s->send_packet_hdr_len = sev_send_get_packet_len(&fw_error); + if (s->send_packet_hdr_len < 1) { + error_report("%s: SEND_UPDATE fw_error=%d '%s'", + __func__, fw_error, fw_error_to_str(fw_error)); + return 1; + } + + s->send_packet_hdr = g_new(gchar, s->send_packet_hdr_len); + } + + /* allocate transport buffer */ + trans = g_new(guchar, size); + + update.hdr_uaddr = (uintptr_t)s->send_packet_hdr; + update.hdr_len = s->send_packet_hdr_len; + update.guest_uaddr = (uintptr_t)ptr; + update.guest_len = size; + update.trans_uaddr = (uintptr_t)trans; + update.trans_len = size; + + trace_kvm_sev_send_update_data(ptr, trans, size); + + ret = sev_ioctl(s->sev_fd, KVM_SEV_SEND_UPDATE_DATA, &update, &fw_error); + if (ret) { + error_report("%s: SEND_UPDATE_DATA ret=%d fw_error=%d '%s'", + __func__, ret, fw_error, fw_error_to_str(fw_error)); + goto err; + } + + qemu_put_be32(f, update.hdr_len); + qemu_put_buffer(f, (uint8_t *)update.hdr_uaddr, update.hdr_len); + *bytes_sent = 4 + update.hdr_len; + + qemu_put_be32(f, update.trans_len); + qemu_put_buffer(f, (uint8_t *)update.trans_uaddr, update.trans_len); + *bytes_sent += (4 + update.trans_len); + +err: + g_free(trans); + return ret; +} + +int sev_save_outgoing_page(QEMUFile *f, uint8_t *ptr, + uint32_t sz, uint64_t *bytes_sent) +{ + SevGuestState *s = sev_guest; + + /* + * If this is a first buffer then create outgoing encryption context + * and write our PDH, policy and session data. 
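 *
 * The resulting stream layout, as produced by sev_send_start() and
 * sev_send_update_data() above, is roughly:
 *
 *   first page only: be32 policy | be32 pdh_len | pdh | be32 session_len | session
 *   every page:      be32 hdr_len | packet hdr (MAC/IV metadata) | be32 trans_len | encrypted page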
+ */ + if (!sev_check_state(s, SEV_STATE_SEND_UPDATE) && + sev_send_start(s, f, bytes_sent)) { + error_report("Failed to create outgoing context"); + return 1; + } + + return sev_send_update_data(s, f, ptr, sz, bytes_sent); +} + static const QemuUUID sev_hash_table_header_guid = { .data = UUID_LE(0x9438d606, 0x4f22, 0x4cc9, 0xb4, 0x79, 0xa7, 0x93, 0xd4, 0x11, 0xfd, 0x21) diff --git a/target/i386/sev.h b/target/i386/sev.h index e96de021f5..463e94bb81 100644 --- a/target/i386/sev.h +++ b/target/i386/sev.h @@ -53,6 +53,8 @@ bool sev_add_kernel_loader_hashes(SevKernelLoaderContext *ctx, Error **errp); int sev_encrypt_flash(uint8_t *ptr, uint64_t len, Error **errp); int sev_save_setup(const char *pdh, const char *plat_cert, const char *amd_cert); +int sev_save_outgoing_page(QEMUFile *f, uint8_t *ptr, + uint32_t size, uint64_t *bytes_sent); int sev_inject_launch_secret(const char *hdr, const char *secret, uint64_t gpa, Error **errp); diff --git a/target/i386/trace-events b/target/i386/trace-events index 2cd8726eeb..e8d4aec125 100644 --- a/target/i386/trace-events +++ b/target/i386/trace-events @@ -11,3 +11,6 @@ kvm_sev_launch_measurement(const char *value) "data %s" kvm_sev_launch_finish(void) "" kvm_sev_launch_secret(uint64_t hpa, uint64_t hva, uint64_t secret, int len) "hpa 0x%" PRIx64 " hva 0x%" PRIx64 " data 0x%" PRIx64 " len %d" kvm_sev_attestation_report(const char *mnonce, const char *data) "mnonce %s data %s" +kvm_sev_send_start(uint64_t pdh, int l1, uint64_t plat, int l2, uint64_t amd, int l3) "pdh 0x%" PRIx64 " len %d plat 0x%" PRIx64 " len %d amd 0x%" PRIx64 " len %d" +kvm_sev_send_update_data(void *src, void *dst, int len) "guest %p trans %p len %d" +kvm_sev_send_finish(void) "" -- Gitee From 778457c2f0f91b6a52e5db02dd3dc1f35ae64526 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Tue, 27 Jul 2021 13:00:50 +0000 Subject: [PATCH 295/939] target/i386: sev: add support to load incoming encrypted page cherry-picked from https://github.com/AMDESE/qemu/commit/e86e5dccb045. The sev_load_incoming_page() provide the implementation to read the incoming guest private pages from the socket and load it into the guest memory. The routines uses the RECEIVE_START command to create the incoming encryption context on the first call then uses the RECEIEVE_UPDATE_DATA command to load the encrypted pages into the guest memory. After migration is completed, we issue the RECEIVE_FINISH command to transition the SEV guest to the runnable state so that it can be executed. Signed-off-by: Brijesh Singh Co-developed-by: Ashish Kalra Signed-off-by: Ashish Kalra [ Fix conflicts. 
] Signed-off-by: hanliyang --- target/i386/sev.c | 137 ++++++++++++++++++++++++++++++++++++++- target/i386/sev.h | 1 + target/i386/trace-events | 3 + 3 files changed, 140 insertions(+), 1 deletion(-) diff --git a/target/i386/sev.c b/target/i386/sev.c index e1fa0ec5e5..de1a4b271e 100644 --- a/target/i386/sev.c +++ b/target/i386/sev.c @@ -172,6 +172,7 @@ static const char *const sev_fw_errlist[] = { static struct ConfidentialGuestMemoryEncryptionOps sev_memory_encryption_ops = { .save_setup = sev_save_setup, .save_outgoing_page = sev_save_outgoing_page, + .load_incoming_page = sev_load_incoming_page, }; static int @@ -911,13 +912,33 @@ sev_launch_finish(SevGuestState *sev) migrate_add_blocker(&sev_mig_blocker, &error_fatal); } +static int +sev_receive_finish(SevGuestState *s) +{ + int error, ret = 1; + + trace_kvm_sev_receive_finish(); + ret = sev_ioctl(s->sev_fd, KVM_SEV_RECEIVE_FINISH, 0, &error); + if (ret) { + error_report("%s: RECEIVE_FINISH ret=%d fw_error=%d '%s'", + __func__, ret, error, fw_error_to_str(error)); + goto err; + } + + sev_set_guest_state(s, SEV_STATE_RUNNING); +err: + return ret; +} + static void sev_vm_state_change(void *opaque, bool running, RunState state) { SevGuestState *sev = opaque; if (running) { - if (!sev_check_state(sev, SEV_STATE_RUNNING)) { + if (sev_check_state(sev, SEV_STATE_RECEIVE_UPDATE)) { + sev_receive_finish(sev); + } else if (!sev_check_state(sev, SEV_STATE_RUNNING)) { sev_launch_finish(sev); } } @@ -1536,6 +1557,120 @@ int sev_save_outgoing_page(QEMUFile *f, uint8_t *ptr, return sev_send_update_data(s, f, ptr, sz, bytes_sent); } +static int +sev_receive_start(SevGuestState *sev, QEMUFile *f) +{ + int ret = 1; + int fw_error; + struct kvm_sev_receive_start start = { }; + gchar *session = NULL, *pdh_cert = NULL; + + /* get SEV guest handle */ + start.handle = object_property_get_int(OBJECT(sev), "handle", + &error_abort); + + /* get the source policy */ + start.policy = qemu_get_be32(f); + + /* get source PDH key */ + start.pdh_len = qemu_get_be32(f); + if (!check_blob_length(start.pdh_len)) { + return 1; + } + + pdh_cert = g_new(gchar, start.pdh_len); + qemu_get_buffer(f, (uint8_t *)pdh_cert, start.pdh_len); + start.pdh_uaddr = (uintptr_t)pdh_cert; + + /* get source session data */ + start.session_len = qemu_get_be32(f); + if (!check_blob_length(start.session_len)) { + return 1; + } + session = g_new(gchar, start.session_len); + qemu_get_buffer(f, (uint8_t *)session, start.session_len); + start.session_uaddr = (uintptr_t)session; + + trace_kvm_sev_receive_start(start.policy, session, pdh_cert); + + ret = sev_ioctl(sev_guest->sev_fd, KVM_SEV_RECEIVE_START, + &start, &fw_error); + if (ret < 0) { + error_report("Error RECEIVE_START ret=%d fw_error=%d '%s'", + ret, fw_error, fw_error_to_str(fw_error)); + goto err; + } + + object_property_set_int(OBJECT(sev), "handle", start.handle, &error_abort); + sev_set_guest_state(sev, SEV_STATE_RECEIVE_UPDATE); +err: + g_free(session); + g_free(pdh_cert); + + return ret; +} + +static int sev_receive_update_data(QEMUFile *f, uint8_t *ptr) +{ + int ret = 1, fw_error = 0; + gchar *hdr = NULL, *trans = NULL; + struct kvm_sev_receive_update_data update = {}; + + /* get packet header */ + update.hdr_len = qemu_get_be32(f); + if (!check_blob_length(update.hdr_len)) { + return 1; + } + + hdr = g_new(gchar, update.hdr_len); + qemu_get_buffer(f, (uint8_t *)hdr, update.hdr_len); + update.hdr_uaddr = (uintptr_t)hdr; + + /* get transport buffer */ + update.trans_len = qemu_get_be32(f); + if 
(!check_blob_length(update.trans_len)) { + goto err; + } + + trans = g_new(gchar, update.trans_len); + update.trans_uaddr = (uintptr_t)trans; + qemu_get_buffer(f, (uint8_t *)update.trans_uaddr, update.trans_len); + + update.guest_uaddr = (uintptr_t) ptr; + update.guest_len = update.trans_len; + + trace_kvm_sev_receive_update_data(trans, ptr, update.guest_len, + hdr, update.hdr_len); + + ret = sev_ioctl(sev_guest->sev_fd, KVM_SEV_RECEIVE_UPDATE_DATA, + &update, &fw_error); + if (ret) { + error_report("Error RECEIVE_UPDATE_DATA ret=%d fw_error=%d '%s'", + ret, fw_error, fw_error_to_str(fw_error)); + goto err; + } +err: + g_free(trans); + g_free(hdr); + return ret; +} + +int sev_load_incoming_page(QEMUFile *f, uint8_t *ptr) +{ + SevGuestState *s = sev_guest; + + /* + * If this is first buffer and SEV is not in recieiving state then + * use RECEIVE_START command to create a encryption context. + */ + if (!sev_check_state(s, SEV_STATE_RECEIVE_UPDATE) && + sev_receive_start(s, f)) { + return 1; + } + + return sev_receive_update_data(f, ptr); +} + static const QemuUUID sev_hash_table_header_guid = { .data = UUID_LE(0x9438d606, 0x4f22, 0x4cc9, 0xb4, 0x79, 0xa7, 0x93, 0xd4, 0x11, 0xfd, 0x21) diff --git a/target/i386/sev.h b/target/i386/sev.h index 463e94bb81..d94da2956b 100644 --- a/target/i386/sev.h +++ b/target/i386/sev.h @@ -55,6 +55,7 @@ int sev_save_setup(const char *pdh, const char *plat_cert, const char *amd_cert); int sev_save_outgoing_page(QEMUFile *f, uint8_t *ptr, uint32_t size, uint64_t *bytes_sent); +int sev_load_incoming_page(QEMUFile *f, uint8_t *ptr); int sev_inject_launch_secret(const char *hdr, const char *secret, uint64_t gpa, Error **errp); diff --git a/target/i386/trace-events b/target/i386/trace-events index e8d4aec125..475de65ad4 100644 --- a/target/i386/trace-events +++ b/target/i386/trace-events @@ -14,3 +14,6 @@ kvm_sev_attestation_report(const char *mnonce, const char *data) "mnonce %s data kvm_sev_send_start(uint64_t pdh, int l1, uint64_t plat, int l2, uint64_t amd, int l3) "pdh 0x%" PRIx64 " len %d plat 0x%" PRIx64 " len %d amd 0x%" PRIx64 " len %d" kvm_sev_send_update_data(void *src, void *dst, int len) "guest %p trans %p len %d" kvm_sev_send_finish(void) "" +kvm_sev_receive_start(int policy, void *session, void *pdh) "policy 0x%x session %p pdh %p" +kvm_sev_receive_update_data(void *src, void *dst, int len, void *hdr, int hdr_len) "guest %p trans %p len %d hdr %p hdr_len %d" +kvm_sev_receive_finish(void) "" -- Gitee From 02e6bfc88ce5e944ce36b8ccb7d2af103a969980 Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Tue, 27 Jul 2021 15:05:49 +0000 Subject: [PATCH 296/939] kvm: Add support for SEV shared regions list and KVM_EXIT_HYPERCALL. cherry-picked from https://github.com/AMDESE/qemu/commit/fcbbd9b19ac. KVM_HC_MAP_GPA_RANGE hypercall is used by the SEV guest to notify a change in the page encryption status to the hypervisor. The hypercall should be invoked only when the encryption attribute is changed from encrypted -> decrypted and vice versa. By default all guest pages are considered encrypted. The hypercall exits to userspace with KVM_EXIT_HYPERCALL exit code, currently this is used only by SEV guests for guest page encryptiion status tracking. Add support to handle this exit and invoke SEV shared regions list handlers. Add support for SEV guest shared regions and implementation of the SEV shared regions list. Signed-off-by: Ashish Kalra [ Fix conflicts. 
] Signed-off-by: hanliyang --- linux-headers/linux/kvm.h | 3 ++ target/i386/kvm/kvm.c | 48 +++++++++++++++++ target/i386/kvm/sev-stub.c | 11 ++++ target/i386/sev.c | 106 +++++++++++++++++++++++++++++++++++++ target/i386/sev.h | 3 ++ 5 files changed, 171 insertions(+) diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h index 8d12435e41..9489a20835 100644 --- a/linux-headers/linux/kvm.h +++ b/linux-headers/linux/kvm.h @@ -348,6 +348,7 @@ struct kvm_run { } iocsr_io; /* KVM_EXIT_HYPERCALL */ struct { +#define KVM_HC_MAP_GPA_RANGE 12 __u64 nr; __u64 args[6]; __u64 ret; @@ -1204,6 +1205,8 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_ARM_VIRT_MSI_BYPASS 799 +#define KVM_EXIT_HYPERCALL_VALID_MASK (1 << KVM_HC_MAP_GPA_RANGE) + #ifdef KVM_CAP_IRQ_ROUTING struct kvm_irq_routing_irqchip { diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index a0bc9ea7b1..82f6d3b048 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -148,6 +148,7 @@ static int has_xcrs; static int has_sregs2; static int has_exception_payload; static int has_triple_fault_event; +static int has_map_gpa_range; static bool has_msr_mcg_ext_ctl; @@ -2191,6 +2192,17 @@ int kvm_arch_init_vcpu(CPUState *cs) c->eax = MAX(c->eax, KVM_CPUID_SIGNATURE | 0x10); } + if (sev_enabled()) { + c = cpuid_find_entry(&cpuid_data.cpuid, + KVM_CPUID_FEATURES | kvm_base, 0); + if (c) { + c->eax |= (1 << KVM_FEATURE_MIGRATION_CONTROL); + if (has_map_gpa_range) { + c->eax |= (1 << KVM_FEATURE_HC_MAP_GPA_RANGE); + } + } + } + cpuid_data.cpuid.nent = cpuid_i; cpuid_data.cpuid.padding = 0; @@ -2584,6 +2596,17 @@ int kvm_arch_init(MachineState *ms, KVMState *s) #endif } + has_map_gpa_range = kvm_check_extension(s, KVM_CAP_EXIT_HYPERCALL); + if (has_map_gpa_range) { + ret = kvm_vm_enable_cap(s, KVM_CAP_EXIT_HYPERCALL, 0, + KVM_EXIT_HYPERCALL_VALID_MASK); + if (ret < 0) { + error_report("kvm: Failed to enable MAP_GPA_RANGE cap: %s", + strerror(-ret)); + return ret; + } + } + ret = kvm_get_supported_msrs(s); if (ret < 0) { return ret; @@ -4936,6 +4959,28 @@ static int kvm_handle_tpr_access(X86CPU *cpu) return 1; } +static int kvm_handle_exit_hypercall(X86CPU *cpu, struct kvm_run *run) +{ + /* + * Currently this exit is only used by SEV guests for + * guest page encryption status tracking. 
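 *
 * For KVM_HC_MAP_GPA_RANGE the guest's arguments are consumed below as:
 *   args[0] = GPA of the first page, args[1] = number of pages,
 *   args[2] = attributes, used here simply as non-zero (the range becomes
 *             encrypted/private) or zero (the range becomes shared).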
+ */ + if (run->hypercall.nr == KVM_HC_MAP_GPA_RANGE) { + unsigned long enc = run->hypercall.args[2]; + unsigned long gpa = run->hypercall.args[0]; + unsigned long npages = run->hypercall.args[1]; + unsigned long gfn_start = gpa >> TARGET_PAGE_BITS; + unsigned long gfn_end = gfn_start + npages; + + if (enc) { + sev_remove_shared_regions_list(gfn_start, gfn_end); + } else { + sev_add_shared_regions_list(gfn_start, gfn_end); + } + } + return 0; +} + int kvm_arch_insert_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp) { static const uint8_t int3 = 0xcc; @@ -5359,6 +5404,9 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) ret = kvm_xen_handle_exit(cpu, &run->xen); break; #endif + case KVM_EXIT_HYPERCALL: + ret = kvm_handle_exit_hypercall(cpu, run); + break; default: fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason); ret = -1; diff --git a/target/i386/kvm/sev-stub.c b/target/i386/kvm/sev-stub.c index 1be5341e8a..1282d242a7 100644 --- a/target/i386/kvm/sev-stub.c +++ b/target/i386/kvm/sev-stub.c @@ -19,3 +19,14 @@ int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp) /* If we get here, cgs must be some non-SEV thing */ return 0; } + +int sev_remove_shared_regions_list(unsigned long gfn_start, + unsigned long gfn_end) +{ + return 0; +} + +int sev_add_shared_regions_list(unsigned long gfn_start, unsigned long gfn_end) +{ + return 0; +} diff --git a/target/i386/sev.c b/target/i386/sev.c index de1a4b271e..8525a7351f 100644 --- a/target/i386/sev.c +++ b/target/i386/sev.c @@ -44,6 +44,11 @@ #define TYPE_SEV_GUEST "sev-guest" OBJECT_DECLARE_SIMPLE_TYPE(SevGuestState, SEV_GUEST) +struct shared_region { + unsigned long gfn_start, gfn_end; + QTAILQ_ENTRY(shared_region) list; +}; + /** * SevGuestState: @@ -87,6 +92,8 @@ struct SevGuestState { uint32_t reset_cs; uint32_t reset_ip; bool reset_data_valid; + + QTAILQ_HEAD(, shared_region) shared_regions_list; }; #define DEFAULT_GUEST_POLICY 0x1 /* disable debug */ @@ -1136,6 +1143,7 @@ int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp) migration_add_notifier(&sev_migration_state, sev_migration_state_notifier); cgs_class->memory_encryption_ops = &sev_memory_encryption_ops; + QTAILQ_INIT(&sev->shared_regions_list); cgs->ready = true; @@ -1671,6 +1679,104 @@ int sev_load_incoming_page(QEMUFile *f, uint8_t *ptr) return sev_receive_update_data(f, ptr); } +int sev_remove_shared_regions_list(unsigned long start, unsigned long end) +{ + SevGuestState *s = sev_guest; + struct shared_region *pos; + + QTAILQ_FOREACH(pos, &s->shared_regions_list, list) { + unsigned long l, r; + unsigned long curr_gfn_end = pos->gfn_end; + + /* + * Find if any intersection exists ? 
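 * (illustrative: with an existing shared region [0x100, 0x200) and a
 *  request to re-encrypt [0x140, 0x180), l/r come out as 0x140/0x180 and
 *  the de-merge branch below splits the region into [0x100, 0x140) and
 *  [0x180, 0x200))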
+ * left bound for intersecting segment + */ + l = MAX(start, pos->gfn_start); + /* right bound for intersecting segment */ + r = MIN(end, pos->gfn_end); + if (l <= r) { + if (pos->gfn_start == l && pos->gfn_end == r) { + QTAILQ_REMOVE(&s->shared_regions_list, pos, list); + } else if (l == pos->gfn_start) { + pos->gfn_start = r; + } else if (r == pos->gfn_end) { + pos->gfn_end = l; + } else { + /* Do a de-merge -- split linked list nodes */ + struct shared_region *shrd_region; + + pos->gfn_end = l; + shrd_region = g_malloc0(sizeof(*shrd_region)); + if (!shrd_region) { + return 0; + } + shrd_region->gfn_start = r; + shrd_region->gfn_end = curr_gfn_end; + QTAILQ_INSERT_AFTER(&s->shared_regions_list, pos, + shrd_region, list); + } + } + if (end <= curr_gfn_end) { + break; + } + } + return 0; +} + +int sev_add_shared_regions_list(unsigned long start, unsigned long end) +{ + struct shared_region *shrd_region; + struct shared_region *pos; + SevGuestState *s = sev_guest; + + if (QTAILQ_EMPTY(&s->shared_regions_list)) { + shrd_region = g_malloc0(sizeof(*shrd_region)); + if (!shrd_region) { + return -1; + } + shrd_region->gfn_start = start; + shrd_region->gfn_end = end; + QTAILQ_INSERT_TAIL(&s->shared_regions_list, shrd_region, list); + return 0; + } + + /* + * shared regions list is a sorted list in ascending order + * of guest PA's and also merges consecutive range of guest PA's + */ + QTAILQ_FOREACH(pos, &s->shared_regions_list, list) { + /* handle duplicate overlapping regions */ + if (start >= pos->gfn_start && end <= pos->gfn_end) { + return 0; + } + if (pos->gfn_end < start) { + continue; + } + /* merge consecutive guest PA(s) -- forward merge */ + if (pos->gfn_start <= start && pos->gfn_end >= start) { + pos->gfn_end = end; + return 0; + } + break; + } + /* + * Add a new node + */ + shrd_region = g_malloc0(sizeof(*shrd_region)); + if (!shrd_region) { + return -1; + } + shrd_region->gfn_start = start; + shrd_region->gfn_end = end; + if (pos) { + QTAILQ_INSERT_BEFORE(pos, shrd_region, list); + } else { + QTAILQ_INSERT_TAIL(&s->shared_regions_list, shrd_region, list); + } + return 1; +} + static const QemuUUID sev_hash_table_header_guid = { .data = UUID_LE(0x9438d606, 0x4f22, 0x4cc9, 0xb4, 0x79, 0xa7, 0x93, 0xd4, 0x11, 0xfd, 0x21) diff --git a/target/i386/sev.h b/target/i386/sev.h index d94da2956b..acf69d4e6f 100644 --- a/target/i386/sev.h +++ b/target/i386/sev.h @@ -61,6 +61,9 @@ int sev_inject_launch_secret(const char *hdr, const char *secret, int sev_es_save_reset_vector(void *flash_ptr, uint64_t flash_size); void sev_es_set_reset_vector(CPUState *cpu); +int sev_remove_shared_regions_list(unsigned long gfn_start, + unsigned long gfn_end); +int sev_add_shared_regions_list(unsigned long gfn_start, unsigned long gfn_end); int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp); -- Gitee From 0f85e3a486c2d0130cb3be322900aa839d77d4bd Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Tue, 27 Jul 2021 16:31:36 +0000 Subject: [PATCH 297/939] migration: add support to migrate shared regions list cherry-picked from https://github.com/AMDESE/qemu/commit/9236f522e48b6. When memory encryption is enabled, the hypervisor maintains a shared regions list which is referred by hypervisor during migration to check if page is private or shared. This list is built during the VM bootup and must be migrated to the target host so that hypervisor on target host can use it for future migration. Signed-off-by: Brijesh Singh Co-developed-by: Ashish Kalra Signed-off-by: Ashish Kalra [ Fix conflicts. 
] Signed-off-by: hanliyang --- include/exec/confidential-guest-support.h | 2 +- target/i386/sev.c | 45 +++++++++++++++++++++++ target/i386/sev.h | 2 + 3 files changed, 48 insertions(+), 1 deletion(-) diff --git a/include/exec/confidential-guest-support.h b/include/exec/confidential-guest-support.h index 343f686fc2..dd4887f65f 100644 --- a/include/exec/confidential-guest-support.h +++ b/include/exec/confidential-guest-support.h @@ -73,7 +73,7 @@ struct ConfidentialGuestMemoryEncryptionOps { bool (*is_gfn_in_unshared_region)(unsigned long gfn); /* Write the shared regions list */ - int (*save_outgoing_shared_regions_list)(QEMUFile *f); + int (*save_outgoing_shared_regions_list)(QEMUFile *f, uint64_t *bytes_sent); /* Load the shared regions list */ int (*load_incoming_shared_regions_list)(QEMUFile *f); diff --git a/target/i386/sev.c b/target/i386/sev.c index 8525a7351f..92aedf0503 100644 --- a/target/i386/sev.c +++ b/target/i386/sev.c @@ -176,10 +176,15 @@ static const char *const sev_fw_errlist[] = { #define SEV_FW_BLOB_MAX_SIZE 0x4000 /* 16KB */ +#define SHARED_REGION_LIST_CONT 0x1 +#define SHARED_REGION_LIST_END 0x2 + static struct ConfidentialGuestMemoryEncryptionOps sev_memory_encryption_ops = { .save_setup = sev_save_setup, .save_outgoing_page = sev_save_outgoing_page, .load_incoming_page = sev_load_incoming_page, + .save_outgoing_shared_regions_list = sev_save_outgoing_shared_regions_list, + .load_incoming_shared_regions_list = sev_load_incoming_shared_regions_list, }; static int @@ -1777,6 +1782,46 @@ int sev_add_shared_regions_list(unsigned long start, unsigned long end) return 1; } +int sev_save_outgoing_shared_regions_list(QEMUFile *f, uint64_t *bytes_sent) +{ + SevGuestState *s = sev_guest; + struct shared_region *pos; + + QTAILQ_FOREACH(pos, &s->shared_regions_list, list) { + qemu_put_be32(f, SHARED_REGION_LIST_CONT); + qemu_put_be32(f, pos->gfn_start); + qemu_put_be32(f, pos->gfn_end); + *bytes_sent += 12; + } + + qemu_put_be32(f, SHARED_REGION_LIST_END); + *bytes_sent += 4; + return 0; +} + +int sev_load_incoming_shared_regions_list(QEMUFile *f) +{ + SevGuestState *s = sev_guest; + struct shared_region *shrd_region; + int status; + + status = qemu_get_be32(f); + while (status == SHARED_REGION_LIST_CONT) { + + shrd_region = g_malloc0(sizeof(*shrd_region)); + if (!shrd_region) { + return 0; + } + shrd_region->gfn_start = qemu_get_be32(f); + shrd_region->gfn_end = qemu_get_be32(f); + + QTAILQ_INSERT_TAIL(&s->shared_regions_list, shrd_region, list); + + status = qemu_get_be32(f); + } + return 0; +} + static const QemuUUID sev_hash_table_header_guid = { .data = UUID_LE(0x9438d606, 0x4f22, 0x4cc9, 0xb4, 0x79, 0xa7, 0x93, 0xd4, 0x11, 0xfd, 0x21) diff --git a/target/i386/sev.h b/target/i386/sev.h index acf69d4e6f..5b4231c859 100644 --- a/target/i386/sev.h +++ b/target/i386/sev.h @@ -64,6 +64,8 @@ void sev_es_set_reset_vector(CPUState *cpu); int sev_remove_shared_regions_list(unsigned long gfn_start, unsigned long gfn_end); int sev_add_shared_regions_list(unsigned long gfn_start, unsigned long gfn_end); +int sev_save_outgoing_shared_regions_list(QEMUFile *f, uint64_t *bytes_sent); +int sev_load_incoming_shared_regions_list(QEMUFile *f); int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp); -- Gitee From af3077a2f19f0604c4e7f8b94eb0338b7f1f85d6 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Tue, 27 Jul 2021 16:53:19 +0000 Subject: [PATCH 298/939] migration/ram: add support to send encrypted pages cherry-picked from https://github.com/AMDESE/qemu/commit/2d6bda0d4cf. 
When memory encryption is enabled, the guest memory will be encrypted with the guest specific key. The patch introduces RAM_SAVE_FLAG_ENCRYPTED_PAGE flag to distinguish the encrypted data from plaintext. Encrypted pages may need special handling. The sev_save_outgoing_page() is used by the sender to write the encrypted pages onto the socket, similarly the sev_load_incoming_page() is used by the target to read the encrypted pages from the socket and load into the guest memory. Signed-off-by: Brijesh Singh Co-developed-by: Ashish Kalra Signed-off-by: Ashish Kalra [ Fix conflicts. ] Signed-off-by: hanliyang --- migration/migration.h | 2 + migration/ram.c | 174 +++++++++++++++++++++++++++++++++++++++++- target/i386/sev.c | 14 ++++ target/i386/sev.h | 4 + 4 files changed, 192 insertions(+), 2 deletions(-) diff --git a/migration/migration.h b/migration/migration.h index 2f26c9509b..eeddb7c0bd 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -553,4 +553,6 @@ int migration_stop_vm(RunState state); void migrate_fd_cancel(MigrationState *s); +bool memcrypt_enabled(void); + #endif diff --git a/migration/ram.c b/migration/ram.c index f9b2b9b985..beac7ea2c0 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -63,6 +63,10 @@ #include "options.h" #include "sysemu/dirtylimit.h" #include "sysemu/kvm.h" +#include "exec/confidential-guest-support.h" + +/* Defines RAM_SAVE_ENCRYPTED_PAGE and RAM_SAVE_SHARED_REGION_LIST */ +#include "target/i386/sev.h" #include "hw/boards.h" /* for machine_dump_guest_core() */ @@ -92,7 +96,16 @@ /* 0x80 is reserved in rdma.h for RAM_SAVE_FLAG_HOOK */ #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100 #define RAM_SAVE_FLAG_MULTIFD_FLUSH 0x200 -/* We can't use any flag that is bigger than 0x200 */ +#define RAM_SAVE_FLAG_ENCRYPTED_DATA 0x400 + +bool memcrypt_enabled(void) +{ + MachineState *ms = MACHINE(qdev_get_machine()); + if(ms->cgs) + return ms->cgs->ready; + else + return false; +} XBZRLECacheStats xbzrle_counters; @@ -1206,6 +1219,88 @@ static int save_normal_page(PageSearchStatus *pss, RAMBlock *block, return 1; } +/** + * ram_save_encrypted_page - send the given encrypted page to the stream + */ +static int ram_save_encrypted_page(RAMState *rs, PageSearchStatus *pss) +{ + QEMUFile *file = pss->pss_channel; + int ret; + uint8_t *p; + RAMBlock *block = pss->block; + ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; + uint64_t bytes_xmit = 0; + MachineState *ms = MACHINE(qdev_get_machine()); + ConfidentialGuestSupportClass *cgs_class = + (ConfidentialGuestSupportClass *) object_get_class(OBJECT(ms->cgs)); + struct ConfidentialGuestMemoryEncryptionOps *ops = + cgs_class->memory_encryption_ops; + + p = block->host + offset; + trace_ram_save_page(block->idstr, (uint64_t)offset, p); + + ram_transferred_add(save_page_header(pss, file, block, + offset | RAM_SAVE_FLAG_ENCRYPTED_DATA)); + qemu_put_be32(file, RAM_SAVE_ENCRYPTED_PAGE); + ret = ops->save_outgoing_page(file, p, TARGET_PAGE_SIZE, &bytes_xmit); + if (ret) { + return -1; + } + ram_transferred_add(4 + bytes_xmit); + stat64_add(&mig_stats.normal_pages, 1); + + return 1; +} + +/** + * ram_save_shared_region_list: send the shared region list + */ +static int ram_save_shared_region_list(RAMState *rs, QEMUFile *f) +{ + int ret; + uint64_t bytes_xmit = 0; + PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY]; + MachineState *ms = MACHINE(qdev_get_machine()); + ConfidentialGuestSupportClass *cgs_class = + (ConfidentialGuestSupportClass *) object_get_class(OBJECT(ms->cgs)); + struct 
ConfidentialGuestMemoryEncryptionOps *ops = + cgs_class->memory_encryption_ops; + + ram_transferred_add(save_page_header(pss, f, + pss->last_sent_block, + RAM_SAVE_FLAG_ENCRYPTED_DATA)); + qemu_put_be32(f, RAM_SAVE_SHARED_REGIONS_LIST); + ret = ops->save_outgoing_shared_regions_list(f, &bytes_xmit); + if (ret < 0) { + return ret; + } + ram_transferred_add(4 + bytes_xmit); + + return 0; +} + +static int load_encrypted_data(QEMUFile *f, uint8_t *ptr) +{ + MachineState *ms = MACHINE(qdev_get_machine()); + ConfidentialGuestSupportClass *cgs_class = + (ConfidentialGuestSupportClass *) object_get_class(OBJECT(ms->cgs)); + struct ConfidentialGuestMemoryEncryptionOps *ops = + cgs_class->memory_encryption_ops; + + int flag; + + flag = qemu_get_be32(f); + + if (flag == RAM_SAVE_ENCRYPTED_PAGE) { + return ops->load_incoming_page(f, ptr); + } else if (flag == RAM_SAVE_SHARED_REGIONS_LIST) { + return ops->load_incoming_shared_regions_list(f); + } else { + error_report("unknown encrypted flag %x", flag); + return 1; + } +} + /** * ram_save_page: send the given page to the stream * @@ -2036,6 +2131,35 @@ static bool save_compress_page(RAMState *rs, PageSearchStatus *pss, compress_send_queued_data); } +/** + * encrypted_test_list: check if the page is encrypted + * + * Returns a bool indicating whether the page is encrypted. + */ +static bool encrypted_test_list(RAMState *rs, RAMBlock *block, + unsigned long page) +{ + MachineState *ms = MACHINE(qdev_get_machine()); + ConfidentialGuestSupportClass *cgs_class = + (ConfidentialGuestSupportClass *) object_get_class(OBJECT(ms->cgs)); + struct ConfidentialGuestMemoryEncryptionOps *ops = + cgs_class->memory_encryption_ops; + unsigned long gfn; + + /* ROM devices contains the unencrypted data */ + if (memory_region_is_rom(block->mr)) { + return false; + } + + /* + * Translate page in ram_addr_t address space to GPA address + * space using memory region. + */ + gfn = page + (block->mr->addr >> TARGET_PAGE_BITS); + + return ops->is_gfn_in_unshared_region(gfn); +} + /** * ram_save_target_page_legacy: save one target page * @@ -2054,6 +2178,17 @@ static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss) return res; } + /* + * If memory encryption is enabled then use memory encryption APIs + * to write the outgoing buffer to the wire. The encryption APIs + * will take care of accessing the guest memory and re-encrypt it + * for the transport purposes. + */ + if (memcrypt_enabled() && + encrypted_test_list(rs, pss->block, pss->page)) { + return ram_save_encrypted_page(rs, pss); + } + if (save_compress_page(rs, pss, offset)) { return 1; } @@ -2919,6 +3054,18 @@ void qemu_guest_free_page_hint(void *addr, size_t len) } } +static int ram_encrypted_save_setup(void) +{ + MachineState *ms = MACHINE(qdev_get_machine()); + ConfidentialGuestSupportClass *cgs_class = + (ConfidentialGuestSupportClass *) object_get_class(OBJECT(ms->cgs)); + struct ConfidentialGuestMemoryEncryptionOps *ops = + cgs_class->memory_encryption_ops; + MigrationParameters *p = &migrate_get_current()->parameters; + + return ops->save_setup(p->sev_pdh, p->sev_plat_cert, p->sev_amd_cert); +} + /* * Each of ram_save_setup, ram_save_iterate and ram_save_complete has * long-running RCU critical section. 
When rcu-reclaims in the code @@ -2954,6 +3101,13 @@ static int ram_save_setup(QEMUFile *f, void *opaque) (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f; WITH_RCU_READ_LOCK_GUARD() { + + if (memcrypt_enabled()) { + if (ram_encrypted_save_setup()) { + return -1; + } + } + qemu_put_be64(f, ram_bytes_total_with_ignored() | RAM_SAVE_FLAG_MEM_SIZE); @@ -3183,6 +3337,15 @@ static int ram_save_complete(QEMUFile *f, void *opaque) qemu_file_set_error(f, ret); return ret; } + + /* send the shared regions list */ + if (memcrypt_enabled()) { + ret = ram_save_shared_region_list(rs, f); + if (ret < 0) { + qemu_file_set_error(f, ret); + return ret; + } + } } ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel); @@ -3920,7 +4083,8 @@ static int ram_load_precopy(QEMUFile *f) } if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE | - RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) { + RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE | + RAM_SAVE_FLAG_ENCRYPTED_DATA)) { RAMBlock *block = ram_block_from_stream(mis, f, flags, RAM_CHANNEL_PRECOPY); @@ -4013,6 +4177,12 @@ static int ram_load_precopy(QEMUFile *f) qemu_file_set_error(f, ret); } break; + case RAM_SAVE_FLAG_ENCRYPTED_DATA: + if (load_encrypted_data(f, host)) { + error_report("Failed to load encrypted data"); + ret = -EINVAL; + } + break; default: error_report("Unknown combination of migration flags: 0x%x", flags); ret = -EINVAL; diff --git a/target/i386/sev.c b/target/i386/sev.c index 92aedf0503..47f41aefe7 100644 --- a/target/i386/sev.c +++ b/target/i386/sev.c @@ -183,6 +183,7 @@ static struct ConfidentialGuestMemoryEncryptionOps sev_memory_encryption_ops = { .save_setup = sev_save_setup, .save_outgoing_page = sev_save_outgoing_page, .load_incoming_page = sev_load_incoming_page, + .is_gfn_in_unshared_region = sev_is_gfn_in_unshared_region, .save_outgoing_shared_regions_list = sev_save_outgoing_shared_regions_list, .load_incoming_shared_regions_list = sev_load_incoming_shared_regions_list, }; @@ -1822,6 +1823,19 @@ int sev_load_incoming_shared_regions_list(QEMUFile *f) return 0; } +bool sev_is_gfn_in_unshared_region(unsigned long gfn) +{ + SevGuestState *s = sev_guest; + struct shared_region *pos; + + QTAILQ_FOREACH(pos, &s->shared_regions_list, list) { + if (gfn >= pos->gfn_start && gfn < pos->gfn_end) { + return false; + } + } + return true; +} + static const QemuUUID sev_hash_table_header_guid = { .data = UUID_LE(0x9438d606, 0x4f22, 0x4cc9, 0xb4, 0x79, 0xa7, 0x93, 0xd4, 0x11, 0xfd, 0x21) diff --git a/target/i386/sev.h b/target/i386/sev.h index 5b4231c859..b9c2afb799 100644 --- a/target/i386/sev.h +++ b/target/i386/sev.h @@ -38,6 +38,9 @@ typedef struct SevKernelLoaderContext { size_t cmdline_size; } SevKernelLoaderContext; +#define RAM_SAVE_ENCRYPTED_PAGE 0x1 +#define RAM_SAVE_SHARED_REGIONS_LIST 0x2 + #ifdef CONFIG_SEV bool sev_enabled(void); bool sev_es_enabled(void); @@ -66,6 +69,7 @@ int sev_remove_shared_regions_list(unsigned long gfn_start, int sev_add_shared_regions_list(unsigned long gfn_start, unsigned long gfn_end); int sev_save_outgoing_shared_regions_list(QEMUFile *f, uint64_t *bytes_sent); int sev_load_incoming_shared_regions_list(QEMUFile *f); +bool sev_is_gfn_in_unshared_region(unsigned long gfn); int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp); -- Gitee From cbbac2aa57d5609c254f99bf247d16e4b9fd7de3 Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Tue, 27 Jul 2021 18:05:25 +0000 Subject: [PATCH 299/939] migration/ram: Force encrypted status for flash0 & flash1 devices. 
cherry-picked from https://github.com/AMDESE/qemu/commit/803d6a4c8d. Currently OVMF clears the C-bit and marks NonExistent memory space as decrypted in the page encryption bitmap. By marking the NonExistent memory space as decrypted it gurantees any future MMIO adds will work correctly, but this marks flash0 device space as decrypted. At reset the SEV core will be in forced encrypted state, so this decrypted marking of flash0 device space will cause VCPU reset to fail as flash0 device pages will be migrated incorrectly. Signed-off-by: Ashish Kalra Signed-off-by: hanliyang --- migration/ram.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/migration/ram.c b/migration/ram.c index beac7ea2c0..9ecd8580c5 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -2151,6 +2151,14 @@ static bool encrypted_test_list(RAMState *rs, RAMBlock *block, return false; } + if (!strcmp(memory_region_name(block->mr), "system.flash0")) { + return true; + } + + if (!strcmp(memory_region_name(block->mr), "system.flash1")) { + return false; + } + /* * Translate page in ram_addr_t address space to GPA address * space using memory region. -- Gitee From 7aced2a5fff91e0fcff97bb5eafddafece0cb983 Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Tue, 27 Jul 2021 17:59:33 +0000 Subject: [PATCH 300/939] kvm: Add support for userspace MSR filtering and handling of MSR_KVM_MIGRATION_CONTROL. cherry-picked from https://github.com/AMDESE/qemu/commit/67935c3fd5f. Add support for userspace MSR filtering using KVM_X86_SET_MSR_FILTER ioctl and handling of MSRs in userspace. Currently this is only used for SEV guests which use MSR_KVM_MIGRATION_CONTROL to indicate if the guest is enabled and ready for migration. KVM arch code calls into SEV guest specific code to delete the SEV migrate blocker which has been setup at SEV_LAUNCH_FINISH. Signed-off-by: Ashish Kalra [ Fix conflicts. ] Signed-off-by: hanliyang --- target/i386/kvm/kvm.c | 35 +++++++++++++++++++++++++++++++++++ target/i386/kvm/sev-stub.c | 4 ++++ target/i386/sev.c | 6 ++++++ target/i386/sev.h | 1 + 4 files changed, 46 insertions(+) diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index 82f6d3b048..a5a755db01 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -2488,6 +2488,32 @@ static bool kvm_rdmsr_core_thread_count(X86CPU *cpu, uint32_t msr, return true; } +/* + * Currently this exit is only used by SEV guests for + * MSR_KVM_MIGRATION_CONTROL to indicate if the guest + * is ready for migration. 
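 *
 * The value is written by the guest through its paravirtual MSR interface;
 * QEMU just latches it (reads below return the last value written) and,
 * when the guest writes KVM_MIGRATION_READY, drops the SEV migration
 * blocker that was installed at LAUNCH_FINISH.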
+ */ +static uint64_t msr_kvm_migration_control; + +static bool kvm_rdmsr_kvm_migration_control(X86CPU *cpu, uint32_t msr, + uint64_t *val) +{ + *val = msr_kvm_migration_control; + + return true; +} + +static bool kvm_wrmsr_kvm_migration_control(X86CPU *cpu, uint32_t msr, + uint64_t val) +{ + msr_kvm_migration_control = val; + + if (val == KVM_MIGRATION_READY) + sev_del_migrate_blocker(); + + return true; +} + static Notifier smram_machine_done; static KVMMemoryListener smram_listener; static AddressSpace smram_address_space; @@ -2735,6 +2761,15 @@ int kvm_arch_init(MachineState *ms, KVMState *s) strerror(-ret)); exit(1); } + + r = kvm_filter_msr(s, MSR_KVM_MIGRATION_CONTROL, + kvm_rdmsr_kvm_migration_control, + kvm_wrmsr_kvm_migration_control); + if (!r) { + error_report("Could not install MSR_KVM_MIGRATION_CONTROL handler: %s", + strerror(-ret)); + exit(1); + } } return 0; diff --git a/target/i386/kvm/sev-stub.c b/target/i386/kvm/sev-stub.c index 1282d242a7..99899688e4 100644 --- a/target/i386/kvm/sev-stub.c +++ b/target/i386/kvm/sev-stub.c @@ -30,3 +30,7 @@ int sev_add_shared_regions_list(unsigned long gfn_start, unsigned long gfn_end) { return 0; } + +void sev_del_migrate_blocker(void) +{ +} diff --git a/target/i386/sev.c b/target/i386/sev.c index 47f41aefe7..98b0d3937a 100644 --- a/target/i386/sev.c +++ b/target/i386/sev.c @@ -925,6 +925,12 @@ sev_launch_finish(SevGuestState *sev) migrate_add_blocker(&sev_mig_blocker, &error_fatal); } +void +sev_del_migrate_blocker(void) +{ + migrate_del_blocker(&sev_mig_blocker); +} + static int sev_receive_finish(SevGuestState *s) { diff --git a/target/i386/sev.h b/target/i386/sev.h index b9c2afb799..84e3bdf2df 100644 --- a/target/i386/sev.h +++ b/target/i386/sev.h @@ -70,6 +70,7 @@ int sev_add_shared_regions_list(unsigned long gfn_start, unsigned long gfn_end); int sev_save_outgoing_shared_regions_list(QEMUFile *f, uint64_t *bytes_sent); int sev_load_incoming_shared_regions_list(QEMUFile *f); bool sev_is_gfn_in_unshared_region(unsigned long gfn); +void sev_del_migrate_blocker(void); int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp); -- Gitee From ccca5618025567c4168630459b90bf11bf96cca4 Mon Sep 17 00:00:00 2001 From: hanliyang Date: Wed, 31 Jan 2024 07:26:57 +0800 Subject: [PATCH 301/939] target/i386: sev: Return 0 if sev_send_get_packet_len() fails The send_packet_hdr_len of struct SEVState is of type size_t which is an unsigned class type. If the send_packet_hdr_len is assigned as -1, then it will be a huge number and the QEMU process will crash when allocating packet buffer with the huge size. For example, the following code could cause crash described above. ``` static int sev_send_update_data(SEVState *s, QEMUFile *f, uint8_t *ptr, uint32_t size, uint64_t *bytes_sent) { ...... if (!s->send_packet_hdr) { s->send_packet_hdr_len = sev_send_get_packet_len(&fw_error); if (s->send_packet_hdr_len < 1) { error_report("%s: SEND_UPDATE fw_error=%d '%s'", __func__, fw_error, fw_error_to_str(fw_error)); return 1; } s->send_packet_hdr = g_new(gchar, s->send_packet_hdr_len); } ...... 
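    /*
     * Note: send_packet_hdr_len is a size_t, so a -1 returned by
     * sev_send_get_packet_len() wraps to a huge unsigned value before the
     * "< 1" check runs; only 0 is caught, and g_new() is then asked for a
     * near-SIZE_MAX allocation, which crashes the QEMU process.
     */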
} ``` Signed-off-by: hanliyang --- target/i386/sev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target/i386/sev.c b/target/i386/sev.c index 98b0d3937a..6ccb22c00a 100644 --- a/target/i386/sev.c +++ b/target/i386/sev.c @@ -1492,7 +1492,7 @@ sev_send_get_packet_len(int *fw_err) ret = sev_ioctl(sev_guest->sev_fd, KVM_SEV_SEND_UPDATE_DATA, &update, fw_err); if (*fw_err != SEV_RET_INVALID_LEN) { - ret = -1; + ret = 0; error_report("%s: failed to get session length ret=%d fw_error=%d '%s'", __func__, ret, *fw_err, fw_error_to_str(*fw_err)); goto err; -- Gitee From e6a20047ca9f61d7fc544e4f0b9b26aa268ccda7 Mon Sep 17 00:00:00 2001 From: hanliyang Date: Tue, 8 Dec 2020 22:57:46 -0500 Subject: [PATCH 302/939] migration/ram: Force encrypted status for VGA vram The VGA vram memory region act as frame buffer of VM. This memory is decrypted in the QEMU process. For CSV VM live migration, we should avoid memory encryption status check on VGA vram. Signed-off-by: hanliyang --- migration/ram.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/migration/ram.c b/migration/ram.c index 9ecd8580c5..66a36736ad 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -2159,6 +2159,10 @@ static bool encrypted_test_list(RAMState *rs, RAMBlock *block, return false; } + if (!strcmp(memory_region_name(block->mr), "vga.vram")) { + return false; + } + /* * Translate page in ram_addr_t address space to GPA address * space using memory region. -- Gitee From e98147762cb47645c590ee000dbc12c654a6cc2d Mon Sep 17 00:00:00 2001 From: hanliyang Date: Sun, 16 Jan 2022 19:57:58 -0500 Subject: [PATCH 303/939] target/i386: sev: Clear shared_regions_list when reboot CSV Guest Also fix memory leak in sev_remove_shared_regions_list(). Signed-off-by: hanliyang --- target/i386/kvm/kvm.c | 5 +++++ target/i386/sev.c | 5 +++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index a5a755db01..5730d0e0c0 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -2270,6 +2270,11 @@ void kvm_arch_reset_vcpu(X86CPU *cpu) env->mp_state = KVM_MP_STATE_RUNNABLE; } + if (cpu_is_bsp(cpu) && + sev_enabled() && has_map_gpa_range) { + sev_remove_shared_regions_list(0, -1); + } + /* enabled by default */ env->poll_control_msr = 1; diff --git a/target/i386/sev.c b/target/i386/sev.c index 6ccb22c00a..0b0f589aee 100644 --- a/target/i386/sev.c +++ b/target/i386/sev.c @@ -1694,9 +1694,9 @@ int sev_load_incoming_page(QEMUFile *f, uint8_t *ptr) int sev_remove_shared_regions_list(unsigned long start, unsigned long end) { SevGuestState *s = sev_guest; - struct shared_region *pos; + struct shared_region *pos, *next_pos; - QTAILQ_FOREACH(pos, &s->shared_regions_list, list) { + QTAILQ_FOREACH_SAFE(pos, &s->shared_regions_list, list, next_pos) { unsigned long l, r; unsigned long curr_gfn_end = pos->gfn_end; @@ -1710,6 +1710,7 @@ int sev_remove_shared_regions_list(unsigned long start, unsigned long end) if (l <= r) { if (pos->gfn_start == l && pos->gfn_end == r) { QTAILQ_REMOVE(&s->shared_regions_list, pos, list); + g_free(pos); } else if (l == pos->gfn_start) { pos->gfn_start = r; } else if (r == pos->gfn_end) { -- Gitee From ec2518709b8d461c3a165c1722ccd2e585cec161 Mon Sep 17 00:00:00 2001 From: hanliyang Date: Sun, 16 Jan 2022 20:05:02 -0500 Subject: [PATCH 304/939] migration/ram: Fix calculation of gfn correpond to a page in ramblock A RAMBlock contains a host memory region which may consist of many discontiguous MemoryRegion in AddressSpace of a Guest, so we cannot get gpa 
by MemoryRegion.addr. Since KVM memslot records the relationship between gpa and hva, so we can pass the hva of page in RAMBlock to kvm_phisical_memory_addr_from_host() to get the expected gpa. Signed-off-by: hanliyang --- migration/ram.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/migration/ram.c b/migration/ram.c index 66a36736ad..1abe8476f7 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -67,6 +67,7 @@ /* Defines RAM_SAVE_ENCRYPTED_PAGE and RAM_SAVE_SHARED_REGION_LIST */ #include "target/i386/sev.h" +#include "sysemu/kvm.h" #include "hw/boards.h" /* for machine_dump_guest_core() */ @@ -2145,6 +2146,8 @@ static bool encrypted_test_list(RAMState *rs, RAMBlock *block, struct ConfidentialGuestMemoryEncryptionOps *ops = cgs_class->memory_encryption_ops; unsigned long gfn; + hwaddr paddr = 0; + int ret; /* ROM devices contains the unencrypted data */ if (memory_region_is_rom(block->mr)) { @@ -2167,7 +2170,14 @@ static bool encrypted_test_list(RAMState *rs, RAMBlock *block, * Translate page in ram_addr_t address space to GPA address * space using memory region. */ - gfn = page + (block->mr->addr >> TARGET_PAGE_BITS); + if (kvm_enabled()) { + ret = kvm_physical_memory_addr_from_host(kvm_state, + block->host + (page << TARGET_PAGE_BITS), &paddr); + if (ret == 0) { + return false; + } + } + gfn = paddr >> TARGET_PAGE_BITS; return ops->is_gfn_in_unshared_region(gfn); } -- Gitee From 2bdf07593dbec66205f2f20fa5430595678ded89 Mon Sep 17 00:00:00 2001 From: hanliyang Date: Thu, 14 Mar 2024 19:21:11 +0800 Subject: [PATCH 305/939] target/i386: Introduce header file csv.h This header file is used to provide common helper functions and data structures for Hygon CSV. Signed-off-by: hanliyang --- configs/devices/i386-softmmu/default.mak | 1 + hw/i386/Kconfig | 5 +++ target/i386/csv.h | 47 ++++++++++++++++++++++++ 3 files changed, 53 insertions(+) create mode 100644 target/i386/csv.h diff --git a/configs/devices/i386-softmmu/default.mak b/configs/devices/i386-softmmu/default.mak index 598c6646df..db83ffcab9 100644 --- a/configs/devices/i386-softmmu/default.mak +++ b/configs/devices/i386-softmmu/default.mak @@ -23,6 +23,7 @@ #CONFIG_TPM_TIS_ISA=n #CONFIG_VTD=n #CONFIG_SGX=n +#CONFIG_CSV=n # Boards: # diff --git a/hw/i386/Kconfig b/hw/i386/Kconfig index 55850791df..08f3ae43f8 100644 --- a/hw/i386/Kconfig +++ b/hw/i386/Kconfig @@ -10,6 +10,10 @@ config SGX bool depends on KVM +config CSV + bool + depends on SEV + config PC bool imply APPLESMC @@ -26,6 +30,7 @@ config PC imply QXL imply SEV imply SGX + imply CSV imply TEST_DEVICES imply TPM_CRB imply TPM_TIS_ISA diff --git a/target/i386/csv.h b/target/i386/csv.h new file mode 100644 index 0000000000..f935babe97 --- /dev/null +++ b/target/i386/csv.h @@ -0,0 +1,47 @@ +/* + * QEMU CSV support + * + * Copyright: Hygon Info Technologies Ltd. 2022 + * + * Author: + * Jiang Xin + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ * + */ + +#ifndef I386_CSV_H +#define I386_CSV_H + +#ifdef CONFIG_CSV + +#include "cpu.h" + +#define CPUID_VENDOR_HYGON_EBX 0x6f677948 /* "Hygo" */ +#define CPUID_VENDOR_HYGON_ECX 0x656e6975 /* "uine" */ +#define CPUID_VENDOR_HYGON_EDX 0x6e65476e /* "nGen" */ + +static bool __attribute__((unused)) is_hygon_cpu(void) +{ + uint32_t ebx = 0; + uint32_t ecx = 0; + uint32_t edx = 0; + + host_cpuid(0, 0, NULL, &ebx, &ecx, &edx); + + if (ebx == CPUID_VENDOR_HYGON_EBX && + ecx == CPUID_VENDOR_HYGON_ECX && + edx == CPUID_VENDOR_HYGON_EDX) + return true; + else + return false; +} + +#else + +#define is_hygon_cpu() (false) + +#endif + +#endif -- Gitee From d23c6a2bcc836587620bd35726ca4d5f71c0a844 Mon Sep 17 00:00:00 2001 From: hanliyang Date: Mon, 13 Nov 2023 21:55:33 +0000 Subject: [PATCH 306/939] target/i386: csv: Read cert chain from file when prepared for CSV live migration The cert chain is too long when encoded with base64, use the filename of cert chain instead of the encoded string when prepared for CSV live migration. [ Fix conflicts. ] Signed-off-by: hanliyang --- qapi/migration.json | 24 +++++++++++++++--------- target/i386/sev.c | 30 ++++++++++++++++++++++++++---- 2 files changed, 41 insertions(+), 13 deletions(-) diff --git a/qapi/migration.json b/qapi/migration.json index 038e99cba3..3aed216c3b 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -891,14 +891,16 @@ # @mode: Migration mode. See description in @MigMode. Default is 'normal'. # (Since 8.2) # -# @sev-pdh: The target host platform diffie-hellman key encoded in base64 +# @sev-pdh: The target host platform diffie-hellman key encoded in base64, or +# pdh filename for hygon # (Since 4.2) # -# @sev-plat-cert: The target host platform certificate chain encoded in base64 +# @sev-plat-cert: The target host platform certificate chain encoded in base64, +# or plat cert filename for hygon # (Since 4.2) # # @sev-amd-cert: AMD certificate chain which include ASK and OCA encoded in -# base64 (Since 4.2) +# base64, or vendor cert filename for hygon (Since 4.2) # # Features: # @@ -1093,14 +1095,16 @@ # @mode: Migration mode. See description in @MigMode. Default is 'normal'. # (Since 8.2) # -# @sev-pdh: The target host platform diffie-hellman key encoded in base64 +# @sev-pdh: The target host platform diffie-hellman key encoded in base64, or +# pdh filename for hygon # (Since 4.2) # -# @sev-plat-cert: The target host platform certificate chain encoded in base64 +# @sev-plat-cert: The target host platform certificate chain encoded in base64, +# or plat cert filename for hygon # (Since 4.2) # # @sev-amd-cert: AMD certificate chain which include ASK and OCA encoded in -# base64 (Since 4.2) +# base64, or vendor cert filename for hygon (Since 4.2) # # Features: # @@ -1340,14 +1344,16 @@ # @mode: Migration mode. See description in @MigMode. Default is 'normal'. 
# (Since 8.2) # -# @sev-pdh: The target host platform diffie-hellman key encoded in base64 +# @sev-pdh: The target host platform diffie-hellman key encoded in base64, or +# pdh filename for hygon # (Since 4.2) # -# @sev-plat-cert: The target host platform certificate chain encoded in base64 +# @sev-plat-cert: The target host platform certificate chain encoded in base64, +# or plat cert filename for hygon # (Since 4.2) # # @sev-amd-cert: AMD certificate chain which include ASK and OCA encoded in -# base64 (Since 4.2) +# base64, or vendor cert filename for hygon (Since 4.2) # # Features: # diff --git a/target/i386/sev.c b/target/i386/sev.c index 0b0f589aee..331dfa4516 100644 --- a/target/i386/sev.c +++ b/target/i386/sev.c @@ -27,6 +27,7 @@ #include "crypto/hash.h" #include "sysemu/kvm.h" #include "sev.h" +#include "csv.h" #include "sysemu/sysemu.h" #include "sysemu/runstate.h" #include "trace.h" @@ -979,18 +980,39 @@ int sev_save_setup(const char *pdh, const char *plat_cert, { SevGuestState *s = sev_guest; - s->remote_pdh = g_base64_decode(pdh, &s->remote_pdh_len); + if (is_hygon_cpu()) { + if (sev_read_file_base64(pdh, &s->remote_pdh, + &s->remote_pdh_len) < 0) { + goto error; + } + } else { + s->remote_pdh = g_base64_decode(pdh, &s->remote_pdh_len); + } if (!check_blob_length(s->remote_pdh_len)) { goto error; } - s->remote_plat_cert = g_base64_decode(plat_cert, - &s->remote_plat_cert_len); + if (is_hygon_cpu()) { + if (sev_read_file_base64(plat_cert, &s->remote_plat_cert, + &s->remote_plat_cert_len) < 0) { + goto error; + } + } else { + s->remote_plat_cert = g_base64_decode(plat_cert, + &s->remote_plat_cert_len); + } if (!check_blob_length(s->remote_plat_cert_len)) { goto error; } - s->amd_cert = g_base64_decode(amd_cert, &s->amd_cert_len); + if (is_hygon_cpu()) { + if (sev_read_file_base64(amd_cert, &s->amd_cert, + &s->amd_cert_len) < 0) { + goto error; + } + } else { + s->amd_cert = g_base64_decode(amd_cert, &s->amd_cert_len); + } if (!check_blob_length(s->amd_cert_len)) { goto error; } -- Gitee From e6d587b63c3950f5d5af9002a8ae14e0904d62c3 Mon Sep 17 00:00:00 2001 From: fangbaoshun Date: Mon, 2 Aug 2021 11:00:07 +0800 Subject: [PATCH 307/939] target/i386: csv: add support to queue the outgoing page into a list The csv_queue_outgoing_page() provide the implementation to queue the guest private pages during transmission. The routines queues the outgoing pages into a listi, and then issues the KVM_CSV_COMMAND_BATCH command to encrypt the pages togather before writing them to the socket. 
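For illustration, here is a minimal standalone sketch (not the QEMU code itself) of the singly linked batch list this patch builds: each node records the userspace address of a per-page SEND_UPDATE_DATA command and is chained through next_cmd_addr so the whole list can later be handed to the kernel in one request. Struct and helper names are simplified placeholders.

```
/*
 * Simplified sketch of the batch list: nodes are chained through
 * next_cmd_addr and each one points at a per-page command structure.
 */
#include <stdint.h>
#include <stdlib.h>

struct batch_node {
    uint64_t cmd_data_addr;   /* address of the per-page command struct */
    uint64_t addr;            /* guest page offset/flags for the stream */
    uint64_t next_cmd_addr;   /* address of the next node, 0 at the tail */
};

struct batch_list {
    struct batch_node *head;
    struct batch_node *tail;
};

static void batch_list_append(struct batch_list *list, struct batch_node *node)
{
    node->next_cmd_addr = 0;
    if (!list->head) {
        list->head = list->tail = node;
    } else {
        /* link the new node behind the current tail */
        list->tail->next_cmd_addr = (uint64_t)(uintptr_t)node;
        list->tail = node;
    }
}

static void batch_list_free(struct batch_list *list)
{
    struct batch_node *node = list->head;

    while (node) {
        struct batch_node *next =
            (struct batch_node *)(uintptr_t)node->next_cmd_addr;
        free((void *)(uintptr_t)node->cmd_data_addr);
        free(node);
        node = next;
    }
    list->head = list->tail = NULL;
}
```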
Signed-off-by: hanliyang --- include/exec/confidential-guest-support.h | 3 + linux-headers/linux/kvm.h | 6 + target/i386/csv.h | 11 ++ target/i386/sev.c | 161 ++++++++++++++++++++++ 4 files changed, 181 insertions(+) diff --git a/include/exec/confidential-guest-support.h b/include/exec/confidential-guest-support.h index dd4887f65f..8949568acc 100644 --- a/include/exec/confidential-guest-support.h +++ b/include/exec/confidential-guest-support.h @@ -77,6 +77,9 @@ struct ConfidentialGuestMemoryEncryptionOps { /* Load the shared regions list */ int (*load_incoming_shared_regions_list)(QEMUFile *f); + + /* Queue the encrypted page and metadata associated with it into a list */ + int (*queue_outgoing_page)(uint8_t *ptr, uint32_t size, uint64_t addr); }; typedef struct ConfidentialGuestSupportClass { diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h index 9489a20835..ca78fdc8b6 100644 --- a/linux-headers/linux/kvm.h +++ b/linux-headers/linux/kvm.h @@ -2067,6 +2067,12 @@ struct kvm_sev_receive_update_data { __u32 trans_len; }; +struct kvm_csv_batch_list_node { + __u64 cmd_data_addr; + __u64 addr; + __u64 next_cmd_addr; +}; + #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) #define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) #define KVM_DEV_ASSIGN_MASK_INTX (1 << 2) diff --git a/target/i386/csv.h b/target/i386/csv.h index f935babe97..4c1ef20029 100644 --- a/target/i386/csv.h +++ b/target/i386/csv.h @@ -44,4 +44,15 @@ static bool __attribute__((unused)) is_hygon_cpu(void) #endif +typedef struct CsvBatchCmdList CsvBatchCmdList; +typedef void (*CsvDestroyCmdNodeFn) (void *data); + +struct CsvBatchCmdList { + struct kvm_csv_batch_list_node *head; + struct kvm_csv_batch_list_node *tail; + CsvDestroyCmdNodeFn destroy_fn; +}; + +int csv_queue_outgoing_page(uint8_t *ptr, uint32_t sz, uint64_t addr); + #endif diff --git a/target/i386/sev.c b/target/i386/sev.c index 331dfa4516..7dd35d64ee 100644 --- a/target/i386/sev.c +++ b/target/i386/sev.c @@ -95,6 +95,9 @@ struct SevGuestState { bool reset_data_valid; QTAILQ_HEAD(, shared_region) shared_regions_list; + + /* link list used for HYGON CSV */ + CsvBatchCmdList *csv_batch_cmd_list; }; #define DEFAULT_GUEST_POLICY 0x1 /* disable debug */ @@ -187,6 +190,7 @@ static struct ConfidentialGuestMemoryEncryptionOps sev_memory_encryption_ops = { .is_gfn_in_unshared_region = sev_is_gfn_in_unshared_region, .save_outgoing_shared_regions_list = sev_save_outgoing_shared_regions_list, .load_incoming_shared_regions_list = sev_load_incoming_shared_regions_list, + .queue_outgoing_page = csv_queue_outgoing_page, }; static int @@ -1865,6 +1869,163 @@ bool sev_is_gfn_in_unshared_region(unsigned long gfn) return true; } +static CsvBatchCmdList * +csv_batch_cmd_list_create(struct kvm_csv_batch_list_node *head, + CsvDestroyCmdNodeFn func) +{ + CsvBatchCmdList *csv_batch_cmd_list = + g_malloc0(sizeof(*csv_batch_cmd_list)); + + if (!csv_batch_cmd_list) { + return NULL; + } + + csv_batch_cmd_list->head = head; + csv_batch_cmd_list->tail = head; + csv_batch_cmd_list->destroy_fn = func; + + return csv_batch_cmd_list; +} + +static int +csv_batch_cmd_list_add_after(CsvBatchCmdList *list, + struct kvm_csv_batch_list_node *new_node) +{ + list->tail->next_cmd_addr = (__u64)new_node; + list->tail = new_node; + + return 0; +} + +static struct kvm_csv_batch_list_node * +csv_batch_cmd_list_node_create(uint64_t cmd_data_addr, uint64_t addr) +{ + struct kvm_csv_batch_list_node *new_node = + g_malloc0(sizeof(struct kvm_csv_batch_list_node)); + + if (!new_node) { + return NULL; + } + + 
new_node->cmd_data_addr = cmd_data_addr; + new_node->addr = addr; + new_node->next_cmd_addr = 0; + + return new_node; +} + +static int csv_batch_cmd_list_destroy(CsvBatchCmdList *list) +{ + struct kvm_csv_batch_list_node *node = list->head; + + while (node != NULL) { + if (list->destroy_fn != NULL) + list->destroy_fn((void *)node->cmd_data_addr); + + list->head = (struct kvm_csv_batch_list_node *)node->next_cmd_addr; + g_free(node); + node = list->head; + } + + g_free(list); + return 0; +} + +static void send_update_data_free(void *data) +{ + struct kvm_sev_send_update_data *update = + (struct kvm_sev_send_update_data *)data; + g_free((guchar *)update->hdr_uaddr); + g_free((guchar *)update->trans_uaddr); + g_free(update); +} + +static int +csv_send_queue_data(SevGuestState *s, uint8_t *ptr, + uint32_t size, uint64_t addr) +{ + int ret = 0; + int fw_error; + guchar *trans; + guchar *packet_hdr; + struct kvm_sev_send_update_data *update; + struct kvm_csv_batch_list_node *new_node = NULL; + + /* If this is first call then query the packet header bytes and allocate + * the packet buffer. + */ + if (s->send_packet_hdr_len < 1) { + s->send_packet_hdr_len = sev_send_get_packet_len(&fw_error); + if (s->send_packet_hdr_len < 1) { + error_report("%s: SEND_UPDATE fw_error=%d '%s'", + __func__, fw_error, fw_error_to_str(fw_error)); + return 1; + } + } + + packet_hdr = g_new(guchar, s->send_packet_hdr_len); + memset(packet_hdr, 0, s->send_packet_hdr_len); + + update = g_new0(struct kvm_sev_send_update_data, 1); + + /* allocate transport buffer */ + trans = g_new(guchar, size); + + update->hdr_uaddr = (unsigned long)packet_hdr; + update->hdr_len = s->send_packet_hdr_len; + update->guest_uaddr = (unsigned long)ptr; + update->guest_len = size; + update->trans_uaddr = (unsigned long)trans; + update->trans_len = size; + + new_node = csv_batch_cmd_list_node_create((uint64_t)update, addr); + if (!new_node) { + ret = -ENOMEM; + goto err; + } + + if (s->csv_batch_cmd_list == NULL) { + s->csv_batch_cmd_list = csv_batch_cmd_list_create(new_node, + send_update_data_free); + if (s->csv_batch_cmd_list == NULL) { + ret = -ENOMEM; + goto err; + } + } else { + /* Add new_node's command address to the last_node */ + csv_batch_cmd_list_add_after(s->csv_batch_cmd_list, new_node); + } + + trace_kvm_sev_send_update_data(ptr, trans, size); + + return ret; + +err: + g_free(trans); + g_free(update); + g_free(packet_hdr); + g_free(new_node); + if (s->csv_batch_cmd_list) { + csv_batch_cmd_list_destroy(s->csv_batch_cmd_list); + s->csv_batch_cmd_list = NULL; + } + return ret; +} + +int +csv_queue_outgoing_page(uint8_t *ptr, uint32_t sz, uint64_t addr) +{ + SevGuestState *s = sev_guest; + + /* Only support for HYGON CSV */ + if (!is_hygon_cpu()) { + error_report("Only support enqueue pages for HYGON CSV"); + return -EINVAL; + } + + return csv_send_queue_data(s, ptr, sz, addr); +} + static const QemuUUID sev_hash_table_header_guid = { .data = UUID_LE(0x9438d606, 0x4f22, 0x4cc9, 0xb4, 0x79, 0xa7, 0x93, 0xd4, 0x11, 0xfd, 0x21) -- Gitee From b2091d245563f4bd2974c8d8e6ef186de614f8e2 Mon Sep 17 00:00:00 2001 From: fangbaoshun Date: Mon, 2 Aug 2021 11:41:58 +0800 Subject: [PATCH 308/939] target/i386: csv: add support to encrypt the outgoing pages in the list queued before. The csv_save_queued_outgoing_pages() provide the implementation to encrypt the guest private pages during transmission. 
The routines uses SEND_START command to create the outgoing encryption context on the first call then uses COMMAND_BATCH command to send the SEND_UPDATE_DATA commands queued in the list to encrypt the data before writing it to the socket. While encrypting the data SEND_UPDATE_DATA produces some metadata (e.g MAC, IV). The metadata is also sent to the target machine. After migration is completed, we issue the SEND_FINISH command to transition the SEV guest state from sending to unrunnable state. Signed-off-by: hanliyang --- include/exec/confidential-guest-support.h | 4 ++ linux-headers/linux/kvm.h | 8 +++ target/i386/csv.h | 1 + target/i386/sev.c | 88 +++++++++++++++++++++++ target/i386/sev.h | 3 + 5 files changed, 104 insertions(+) diff --git a/include/exec/confidential-guest-support.h b/include/exec/confidential-guest-support.h index 8949568acc..c84f8c1efc 100644 --- a/include/exec/confidential-guest-support.h +++ b/include/exec/confidential-guest-support.h @@ -80,6 +80,10 @@ struct ConfidentialGuestMemoryEncryptionOps { /* Queue the encrypted page and metadata associated with it into a list */ int (*queue_outgoing_page)(uint8_t *ptr, uint32_t size, uint64_t addr); + + /* Write the list queued with encrypted pages and metadata associated + * with them */ + int (*save_queued_outgoing_pages)(QEMUFile *f, uint64_t *bytes_sent); }; typedef struct ConfidentialGuestSupportClass { diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h index ca78fdc8b6..fcd09126a1 100644 --- a/linux-headers/linux/kvm.h +++ b/linux-headers/linux/kvm.h @@ -1971,6 +1971,9 @@ enum sev_cmd_id { /* Guest Migration Extension */ KVM_SEV_SEND_CANCEL, + /* Hygon CSV batch command */ + KVM_CSV_COMMAND_BATCH = 0x18, + KVM_SEV_NR_MAX, }; @@ -2073,6 +2076,11 @@ struct kvm_csv_batch_list_node { __u64 next_cmd_addr; }; +struct kvm_csv_command_batch { + __u32 command_id; + __u64 csv_batch_list_uaddr; +}; + #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) #define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) #define KVM_DEV_ASSIGN_MASK_INTX (1 << 2) diff --git a/target/i386/csv.h b/target/i386/csv.h index 4c1ef20029..2a3a3119d9 100644 --- a/target/i386/csv.h +++ b/target/i386/csv.h @@ -54,5 +54,6 @@ struct CsvBatchCmdList { }; int csv_queue_outgoing_page(uint8_t *ptr, uint32_t sz, uint64_t addr); +int csv_save_queued_outgoing_pages(QEMUFile *f, uint64_t *bytes_sent); #endif diff --git a/target/i386/sev.c b/target/i386/sev.c index 7dd35d64ee..1e2bbafe36 100644 --- a/target/i386/sev.c +++ b/target/i386/sev.c @@ -191,6 +191,7 @@ static struct ConfidentialGuestMemoryEncryptionOps sev_memory_encryption_ops = { .save_outgoing_shared_regions_list = sev_save_outgoing_shared_regions_list, .load_incoming_shared_regions_list = sev_load_incoming_shared_regions_list, .queue_outgoing_page = csv_queue_outgoing_page, + .save_queued_outgoing_pages = csv_save_queued_outgoing_pages, }; static int @@ -2012,6 +2013,69 @@ err: return ret; } +static int +csv_command_batch(uint32_t cmd_id, uint64_t head_uaddr, int *fw_err) +{ + int ret; + struct kvm_csv_command_batch command_batch = { }; + + command_batch.command_id = cmd_id; + command_batch.csv_batch_list_uaddr = head_uaddr; + + ret = sev_ioctl(sev_guest->sev_fd, KVM_CSV_COMMAND_BATCH, + &command_batch, fw_err); + if (ret) { + error_report("%s: COMMAND_BATCH ret=%d fw_err=%d '%s'", + __func__, ret, *fw_err, fw_error_to_str(*fw_err)); + } + + return ret; +} + +static int +csv_send_update_data_batch(SevGuestState *s, QEMUFile *f, uint64_t *bytes_sent) +{ + int ret, fw_error = 0; + struct 
kvm_sev_send_update_data *update; + struct kvm_csv_batch_list_node *node; + + ret = csv_command_batch(KVM_SEV_SEND_UPDATE_DATA, + (uint64_t)s->csv_batch_cmd_list->head, &fw_error); + if (ret) { + error_report("%s: csv_command_batch ret=%d fw_error=%d '%s'", + __func__, ret, fw_error, fw_error_to_str(fw_error)); + goto err; + } + + for (node = s->csv_batch_cmd_list->head; + node != NULL; + node = (struct kvm_csv_batch_list_node *)node->next_cmd_addr) { + if (node != s->csv_batch_cmd_list->head) { + /* head's page header is saved before send_update_data */ + qemu_put_be64(f, node->addr); + *bytes_sent += 8; + if (node->next_cmd_addr != 0) + qemu_put_be32(f, RAM_SAVE_ENCRYPTED_PAGE_BATCH); + else + qemu_put_be32(f, RAM_SAVE_ENCRYPTED_PAGE_BATCH_END); + *bytes_sent += 4; + } + update = (struct kvm_sev_send_update_data *)node->cmd_data_addr; + qemu_put_be32(f, update->hdr_len); + qemu_put_buffer(f, (uint8_t *)update->hdr_uaddr, update->hdr_len); + *bytes_sent += (4 + update->hdr_len); + + qemu_put_be32(f, update->trans_len); + qemu_put_buffer(f, (uint8_t *)update->trans_uaddr, update->trans_len); + *bytes_sent += (4 + update->trans_len); + } + +err: + csv_batch_cmd_list_destroy(s->csv_batch_cmd_list); + s->csv_batch_cmd_list = NULL; + return ret; +} + int csv_queue_outgoing_page(uint8_t *ptr, uint32_t sz, uint64_t addr) { @@ -2026,6 +2090,30 @@ csv_queue_outgoing_page(uint8_t *ptr, uint32_t sz, uint64_t addr) return csv_send_queue_data(s, ptr, sz, addr); } +int +csv_save_queued_outgoing_pages(QEMUFile *f, uint64_t *bytes_sent) +{ + SevGuestState *s = sev_guest; + + /* Only support for HYGON CSV */ + if (!is_hygon_cpu()) { + error_report("Only support transfer queued pages for HYGON CSV"); + return -EINVAL; + } + + /* + * If this is a first buffer then create outgoing encryption context + * and write our PDH, policy and session data. + */ + if (!sev_check_state(s, SEV_STATE_SEND_UPDATE) && + sev_send_start(s, f, bytes_sent)) { + error_report("Failed to create outgoing context"); + return 1; + } + + return csv_send_update_data_batch(s, f, bytes_sent); +} + static const QemuUUID sev_hash_table_header_guid = { .data = UUID_LE(0x9438d606, 0x4f22, 0x4cc9, 0xb4, 0x79, 0xa7, 0x93, 0xd4, 0x11, 0xfd, 0x21) diff --git a/target/i386/sev.h b/target/i386/sev.h index 84e3bdf2df..f7886116e7 100644 --- a/target/i386/sev.h +++ b/target/i386/sev.h @@ -41,6 +41,9 @@ typedef struct SevKernelLoaderContext { #define RAM_SAVE_ENCRYPTED_PAGE 0x1 #define RAM_SAVE_SHARED_REGIONS_LIST 0x2 +#define RAM_SAVE_ENCRYPTED_PAGE_BATCH 0x4 +#define RAM_SAVE_ENCRYPTED_PAGE_BATCH_END 0x5 + #ifdef CONFIG_SEV bool sev_enabled(void); bool sev_es_enabled(void); -- Gitee From 8125145bcd3b8348e69686e26f482cf16b16ec98 Mon Sep 17 00:00:00 2001 From: fangbaoshun Date: Mon, 2 Aug 2021 13:49:48 +0800 Subject: [PATCH 309/939] target/i386: csv: add support to queue the incoming page into a list The csv_queue_incoming_page() provide the implementation to queue the guest private pages during transmission. The routines queues the incoming socket which contains the guest private pages into a list then uses the COMMAND_BATCH command to load the encrypted pages into the guest memory. 
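As a rough illustration of what each queued entry carries, the sketch below models the per-page framing on the incoming stream: a big-endian header length, the packet header bytes (MAC/IV metadata), then a big-endian transport length and the encrypted payload. This is a simplified standalone reader, not the QEMU implementation; read_be32() and the FILE*-based I/O stand in for qemu_get_be32()/qemu_get_buffer().

```
/*
 * Standalone sketch of the per-page framing on the migration stream:
 * hdr_len, hdr bytes, trans_len, trans bytes. Uses stdio in place of
 * QEMUFile; a real loader would also fail on short reads.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct page_packet {
    uint32_t hdr_len;
    uint8_t *hdr;     /* packet header produced by SEND_UPDATE_DATA */
    uint32_t trans_len;
    uint8_t *trans;   /* transport-encrypted page payload */
};

/* Read one big-endian u32, like qemu_get_be32() on a QEMUFile. */
static uint32_t read_be32(FILE *f)
{
    uint8_t b[4] = { 0 };

    if (fread(b, 1, sizeof(b), f) != sizeof(b)) {
        return 0;
    }
    return ((uint32_t)b[0] << 24) | ((uint32_t)b[1] << 16) |
           ((uint32_t)b[2] << 8) | b[3];
}

static struct page_packet *read_page_packet(FILE *f)
{
    struct page_packet *p = calloc(1, sizeof(*p));

    p->hdr_len = read_be32(f);
    p->hdr = malloc(p->hdr_len);
    if (fread(p->hdr, 1, p->hdr_len, f) != p->hdr_len) {
        /* truncated stream; error handling elided in this sketch */
    }

    p->trans_len = read_be32(f);
    p->trans = malloc(p->trans_len);
    if (fread(p->trans, 1, p->trans_len, f) != p->trans_len) {
        /* truncated stream; error handling elided in this sketch */
    }

    return p;
}
```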
Signed-off-by: hanliyang --- include/exec/confidential-guest-support.h | 3 + target/i386/csv.h | 1 + target/i386/sev.c | 92 +++++++++++++++++++++++ 3 files changed, 96 insertions(+) diff --git a/include/exec/confidential-guest-support.h b/include/exec/confidential-guest-support.h index c84f8c1efc..101cc5220a 100644 --- a/include/exec/confidential-guest-support.h +++ b/include/exec/confidential-guest-support.h @@ -84,6 +84,9 @@ struct ConfidentialGuestMemoryEncryptionOps { /* Write the list queued with encrypted pages and metadata associated * with them */ int (*save_queued_outgoing_pages)(QEMUFile *f, uint64_t *bytes_sent); + + /* Queue the incoming encrypted page into a list */ + int (*queue_incoming_page)(QEMUFile *f, uint8_t *ptr); }; typedef struct ConfidentialGuestSupportClass { diff --git a/target/i386/csv.h b/target/i386/csv.h index 2a3a3119d9..d1bcc8bc16 100644 --- a/target/i386/csv.h +++ b/target/i386/csv.h @@ -55,5 +55,6 @@ struct CsvBatchCmdList { int csv_queue_outgoing_page(uint8_t *ptr, uint32_t sz, uint64_t addr); int csv_save_queued_outgoing_pages(QEMUFile *f, uint64_t *bytes_sent); +int csv_queue_incoming_page(QEMUFile *f, uint8_t *ptr); #endif diff --git a/target/i386/sev.c b/target/i386/sev.c index 1e2bbafe36..606aaad328 100644 --- a/target/i386/sev.c +++ b/target/i386/sev.c @@ -192,6 +192,7 @@ static struct ConfidentialGuestMemoryEncryptionOps sev_memory_encryption_ops = { .load_incoming_shared_regions_list = sev_load_incoming_shared_regions_list, .queue_outgoing_page = csv_queue_outgoing_page, .save_queued_outgoing_pages = csv_save_queued_outgoing_pages, + .queue_incoming_page = csv_queue_incoming_page, }; static int @@ -1941,6 +1942,15 @@ static void send_update_data_free(void *data) g_free(update); } +static void receive_update_data_free(void *data) +{ + struct kvm_sev_receive_update_data *update = + (struct kvm_sev_receive_update_data *)data; + g_free((guchar *)update->hdr_uaddr); + g_free((guchar *)update->trans_uaddr); + g_free(update); +} + static int csv_send_queue_data(SevGuestState *s, uint8_t *ptr, uint32_t size, uint64_t addr) @@ -2013,6 +2023,66 @@ err: return ret; } +static int +csv_receive_queue_data(SevGuestState *s, QEMUFile *f, uint8_t *ptr) +{ + int ret = 0; + gchar *hdr = NULL, *trans = NULL; + struct kvm_sev_receive_update_data *update; + struct kvm_csv_batch_list_node *new_node = NULL; + + update = g_new0(struct kvm_sev_receive_update_data, 1); + /* get packet header */ + update->hdr_len = qemu_get_be32(f); + hdr = g_new(gchar, update->hdr_len); + qemu_get_buffer(f, (uint8_t *)hdr, update->hdr_len); + update->hdr_uaddr = (unsigned long)hdr; + + /* get transport buffer */ + update->trans_len = qemu_get_be32(f); + trans = g_new(gchar, update->trans_len); + update->trans_uaddr = (unsigned long)trans; + qemu_get_buffer(f, (uint8_t *)update->trans_uaddr, update->trans_len); + + /* set guest address,guest len is page_size */ + update->guest_uaddr = (uint64_t)ptr; + update->guest_len = TARGET_PAGE_SIZE; + + new_node = csv_batch_cmd_list_node_create((uint64_t)update, 0); + if (!new_node) { + ret = -ENOMEM; + goto err; + } + + if (s->csv_batch_cmd_list == NULL) { + s->csv_batch_cmd_list = csv_batch_cmd_list_create(new_node, + receive_update_data_free); + if (s->csv_batch_cmd_list == NULL) { + ret = -ENOMEM; + goto err; + } + } else { + /* Add new_node's command address to the last_node */ + csv_batch_cmd_list_add_after(s->csv_batch_cmd_list, new_node); + } + + trace_kvm_sev_receive_update_data(trans, (void *)ptr, update->guest_len, + (void *)hdr, 
update->hdr_len); + + return ret; + +err: + g_free(trans); + g_free(update); + g_free(hdr); + g_free(new_node); + if (s->csv_batch_cmd_list) { + csv_batch_cmd_list_destroy(s->csv_batch_cmd_list); + s->csv_batch_cmd_list = NULL; + } + return ret; +} + static int csv_command_batch(uint32_t cmd_id, uint64_t head_uaddr, int *fw_err) { @@ -2090,6 +2160,28 @@ csv_queue_outgoing_page(uint8_t *ptr, uint32_t sz, uint64_t addr) return csv_send_queue_data(s, ptr, sz, addr); } +int csv_queue_incoming_page(QEMUFile *f, uint8_t *ptr) +{ + SevGuestState *s = sev_guest; + + /* Only support for HYGON CSV */ + if (!is_hygon_cpu()) { + error_report("Only support enqueue received pages for HYGON CSV"); + return -EINVAL; + } + + /* + * If this is first buffer and SEV is not in recieiving state then + * use RECEIVE_START command to create a encryption context. + */ + if (!sev_check_state(s, SEV_STATE_RECEIVE_UPDATE) && + sev_receive_start(s, f)) { + return 1; + } + + return csv_receive_queue_data(s, f, ptr); +} + int csv_save_queued_outgoing_pages(QEMUFile *f, uint64_t *bytes_sent) { -- Gitee From cb5c1c9c70110639eda0ff50c8dfcf24b0be561d Mon Sep 17 00:00:00 2001 From: fangbaoshun Date: Mon, 2 Aug 2021 14:11:43 +0800 Subject: [PATCH 310/939] target/i386: csv: add support to load incoming encrypted pages queued in the CMD list The csv_load_queued_incoming_pages() provide the implementation to read the incoming guest private pages from the socket queued in the CMD list and load them into the guest memory. The routines uses the RECEIVE_START command to create the incoming encryption context on the first call then uses the COMMAND_BATCH carried with RECEIEVE_UPDATE_DATA commands to load the encrypted pages into the guest memory. After migration is completed, we issue the RECEIVE_FINISH command to transition the SEV guest to the runnable state so that it can be executed. 
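For reference, a simplified standalone sketch of how a COMMAND_BATCH request wraps the queued chain is shown below. The two structures mirror the kvm_csv_batch_list_node and kvm_csv_command_batch layouts added to linux-headers earlier in this series; issue_batch() is a hypothetical wrapper, and the actual ioctl call is only indicated in a comment.

```
/*
 * Sketch of a COMMAND_BATCH request: one sub-command id plus the
 * userspace address of the head node; the firmware walks the chain
 * through next_cmd_addr until it reaches 0.
 */
#include <stdint.h>

struct batch_node {
    uint64_t cmd_data_addr;   /* e.g. a kvm_sev_receive_update_data */
    uint64_t addr;
    uint64_t next_cmd_addr;   /* 0 terminates the chain */
};

struct command_batch {
    uint32_t command_id;        /* sub-command, e.g. RECEIVE_UPDATE_DATA */
    uint64_t batch_list_uaddr;  /* userspace address of the list head */
};

/* Hypothetical wrapper: one request covers every queued page. */
static int issue_batch(int sev_fd, uint32_t sub_cmd, struct batch_node *head)
{
    struct command_batch cb = {
        .command_id = sub_cmd,
        .batch_list_uaddr = (uint64_t)(uintptr_t)head,
    };

    /*
     * The real code hands &cb to the kernel here, roughly:
     * sev_ioctl(sev_fd, KVM_CSV_COMMAND_BATCH, &cb, &fw_err);
     */
    (void)sev_fd;
    (void)cb;
    return 0;
}
```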
Signed-off-by: hanliyang --- include/exec/confidential-guest-support.h | 3 +++ target/i386/csv.h | 1 + target/i386/sev.c | 32 +++++++++++++++++++++++ 3 files changed, 36 insertions(+) diff --git a/include/exec/confidential-guest-support.h b/include/exec/confidential-guest-support.h index 101cc5220a..cb14b815cb 100644 --- a/include/exec/confidential-guest-support.h +++ b/include/exec/confidential-guest-support.h @@ -87,6 +87,9 @@ struct ConfidentialGuestMemoryEncryptionOps { /* Queue the incoming encrypted page into a list */ int (*queue_incoming_page)(QEMUFile *f, uint8_t *ptr); + + /* Load the incoming encrypted pages queued in list into guest memory */ + int (*load_queued_incoming_pages)(QEMUFile *f); }; typedef struct ConfidentialGuestSupportClass { diff --git a/target/i386/csv.h b/target/i386/csv.h index d1bcc8bc16..977f08b982 100644 --- a/target/i386/csv.h +++ b/target/i386/csv.h @@ -56,5 +56,6 @@ struct CsvBatchCmdList { int csv_queue_outgoing_page(uint8_t *ptr, uint32_t sz, uint64_t addr); int csv_save_queued_outgoing_pages(QEMUFile *f, uint64_t *bytes_sent); int csv_queue_incoming_page(QEMUFile *f, uint8_t *ptr); +int csv_load_queued_incoming_pages(QEMUFile *f); #endif diff --git a/target/i386/sev.c b/target/i386/sev.c index 606aaad328..2dee46d852 100644 --- a/target/i386/sev.c +++ b/target/i386/sev.c @@ -193,6 +193,7 @@ static struct ConfidentialGuestMemoryEncryptionOps sev_memory_encryption_ops = { .queue_outgoing_page = csv_queue_outgoing_page, .save_queued_outgoing_pages = csv_save_queued_outgoing_pages, .queue_incoming_page = csv_queue_incoming_page, + .load_queued_incoming_pages = csv_load_queued_incoming_pages, }; static int @@ -2146,6 +2147,24 @@ err: return ret; } +static int +csv_receive_update_data_batch(SevGuestState *s) +{ + int ret; + int fw_error; + + ret = csv_command_batch(KVM_SEV_RECEIVE_UPDATE_DATA, + (uint64_t)s->csv_batch_cmd_list->head, &fw_error); + if (ret) { + error_report("%s: csv_command_batch ret=%d fw_error=%d '%s'", + __func__, ret, fw_error, fw_error_to_str(fw_error)); + } + + csv_batch_cmd_list_destroy(s->csv_batch_cmd_list); + s->csv_batch_cmd_list = NULL; + return ret; +} + int csv_queue_outgoing_page(uint8_t *ptr, uint32_t sz, uint64_t addr) { @@ -2206,6 +2225,19 @@ csv_save_queued_outgoing_pages(QEMUFile *f, uint64_t *bytes_sent) return csv_send_update_data_batch(s, f, bytes_sent); } +int csv_load_queued_incoming_pages(QEMUFile *f) +{ + SevGuestState *s = sev_guest; + + /* Only support for HYGON CSV */ + if (!is_hygon_cpu()) { + error_report("Only support load queued pages for HYGON CSV"); + return -EINVAL; + } + + return csv_receive_update_data_batch(s); +} + static const QemuUUID sev_hash_table_header_guid = { .data = UUID_LE(0x9438d606, 0x4f22, 0x4cc9, 0xb4, 0x79, 0xa7, 0x93, 0xd4, 0x11, 0xfd, 0x21) -- Gitee From e2b3943bf75d34f5e913e05fbdf8116179812866 Mon Sep 17 00:00:00 2001 From: fangbaoshun Date: Mon, 2 Aug 2021 14:35:51 +0800 Subject: [PATCH 311/939] migration/ram: Accelerate the transmission of CSV guest's encrypted pages When memory encryption is enabled, the guest memory will be encrypted with the guest specific key. The patch introduces an accelerate solution which queued the pages into list and send them togather by COMMAND_BATCH. 
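The batching policy can be summarized by the sketch below: dirty encrypted pages are queued until the accumulated payload reaches a window limit, then the whole batch is flushed in one batched command, with any remaining tail flushed at the end. This is an illustrative standalone model only; queue_page()/flush_batch() are placeholders for the queue_outgoing_page/save_queued_outgoing_pages callbacks, and SKETCH_WINDOW_SIZE plays the role of CSV_OUTGOING_PAGE_WINDOW_SIZE.

```
/*
 * Illustrative batching policy: queue dirty encrypted pages until the
 * window fills, then flush them all with one batched command.
 */
#include <stddef.h>
#include <stdint.h>

#define SKETCH_PAGE_SIZE   4096
#define SKETCH_WINDOW_SIZE (4094 * SKETCH_PAGE_SIZE)

typedef int (*queue_page_fn)(uint8_t *page, size_t len);
typedef int (*flush_batch_fn)(void);

static int send_dirty_range(uint8_t *pages, size_t npages,
                            queue_page_fn queue_page,
                            flush_batch_fn flush_batch)
{
    size_t queued = 0;

    for (size_t i = 0; i < npages; i++) {
        if (queue_page(pages + i * SKETCH_PAGE_SIZE, SKETCH_PAGE_SIZE)) {
            return -1;
        }
        queued += SKETCH_PAGE_SIZE;

        /* flush once the window is full; the tail is flushed after the loop */
        if (queued >= SKETCH_WINDOW_SIZE) {
            if (flush_batch()) {
                return -1;
            }
            queued = 0;
        }
    }
    return queued ? flush_batch() : 0;
}
```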
Signed-off-by: hanliyang --- configs/devices/i386-softmmu/default.mak | 1 + hw/i386/Kconfig | 5 + migration/ram.c | 119 +++++++++++++++++++++++ target/i386/csv.h | 2 + 4 files changed, 127 insertions(+) diff --git a/configs/devices/i386-softmmu/default.mak b/configs/devices/i386-softmmu/default.mak index db83ffcab9..e948e54e4e 100644 --- a/configs/devices/i386-softmmu/default.mak +++ b/configs/devices/i386-softmmu/default.mak @@ -24,6 +24,7 @@ #CONFIG_VTD=n #CONFIG_SGX=n #CONFIG_CSV=n +#CONFIG_HYGON_CSV_MIG_ACCEL=n # Boards: # diff --git a/hw/i386/Kconfig b/hw/i386/Kconfig index 08f3ae43f8..682e324f1c 100644 --- a/hw/i386/Kconfig +++ b/hw/i386/Kconfig @@ -12,8 +12,13 @@ config SGX config CSV bool + select HYGON_CSV_MIG_ACCEL depends on SEV +config HYGON_CSV_MIG_ACCEL + bool + depends on CSV + config PC bool imply APPLESMC diff --git a/migration/ram.c b/migration/ram.c index 1abe8476f7..7747f5af3a 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -67,6 +67,7 @@ /* Defines RAM_SAVE_ENCRYPTED_PAGE and RAM_SAVE_SHARED_REGION_LIST */ #include "target/i386/sev.h" +#include "target/i386/csv.h" #include "sysemu/kvm.h" #include "hw/boards.h" /* for machine_dump_guest_core() */ @@ -2336,6 +2337,112 @@ out: return ret; } +#ifdef CONFIG_HYGON_CSV_MIG_ACCEL +/** + * ram_save_encrypted_pages_in_batch: send the given encrypted pages to + * the stream. + * + * Sending pages of 4K size in batch. The saving stops at the end of + * the block. + * + * The caller must be with ram_state.bitmap_mutex held to call this + * function. + * + * Returns the number of pages written or negative on error + * + * @rs: current RAM state + * @pss: data about the page we want to send + */ +static int +ram_save_encrypted_pages_in_batch(RAMState *rs, PageSearchStatus *pss) +{ + bool page_dirty; + int ret; + int tmppages, pages = 0; + uint8_t *p; + uint32_t host_len = 0; + uint64_t bytes_xmit = 0; + ram_addr_t offset, start_offset = 0; + MachineState *ms = MACHINE(qdev_get_machine()); + ConfidentialGuestSupportClass *cgs_class = + (ConfidentialGuestSupportClass *)object_get_class(OBJECT(ms->cgs)); + struct ConfidentialGuestMemoryEncryptionOps *ops = + cgs_class->memory_encryption_ops; + + do { + page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page); + + /* Check the pages is dirty and if it is send it */ + if (page_dirty) { + /* Process the unencrypted page */ + if (!encrypted_test_list(rs, pss->block, pss->page)) { + tmppages = migration_ops->ram_save_target_page(rs, pss); + } else { + /* Caculate the offset and host virtual address of the page */ + offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; + p = pss->block->host + offset; + + /* Record the offset and host virtual address of the first + * page in this loop which will be used below. + */ + if (host_len == 0) { + start_offset = offset | RAM_SAVE_FLAG_ENCRYPTED_DATA; + } else { + offset |= (RAM_SAVE_FLAG_ENCRYPTED_DATA | RAM_SAVE_FLAG_CONTINUE); + } + + /* Queue the outgoing page if the page is not zero page. + * If the queued pages are up to the outgoing page window size, + * process them below. 
+ */ + if (ops->queue_outgoing_page(p, TARGET_PAGE_SIZE, offset)) + return -1; + + tmppages = 1; + host_len += TARGET_PAGE_SIZE; + + stat64_add(&mig_stats.normal_pages, 1); + } + } else { + tmppages = 0; + } + + if (tmppages >= 0) { + pages += tmppages; + } else { + return tmppages; + } + + pss_find_next_dirty(pss); + } while (offset_in_ramblock(pss->block, + ((ram_addr_t)pss->page) << TARGET_PAGE_BITS) && + host_len < CSV_OUTGOING_PAGE_WINDOW_SIZE); + + /* Check if there are any queued pages */ + if (host_len != 0) { + ram_transferred_add(save_page_header(pss, pss->pss_channel, + pss->block, start_offset)); + /* if only one page queued, flag is BATCH_END, else flag is BATCH */ + if (host_len > TARGET_PAGE_SIZE) + qemu_put_be32(pss->pss_channel, RAM_SAVE_ENCRYPTED_PAGE_BATCH); + else + qemu_put_be32(pss->pss_channel, RAM_SAVE_ENCRYPTED_PAGE_BATCH_END); + ram_transferred_add(4); + /* Process the queued pages in batch */ + ret = ops->save_queued_outgoing_pages(pss->pss_channel, &bytes_xmit); + if (ret) { + return -1; + } + ram_transferred_add(bytes_xmit); + } + + /* The offset we leave with is the last one we looked at */ + pss->page--; + + return pages; +} +#endif + /** * ram_save_host_page: save a whole host page * @@ -2371,6 +2478,18 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss) return 0; } +#ifdef CONFIG_HYGON_CSV_MIG_ACCEL + /* + * If command_batch function is enabled and memory encryption is enabled + * then use command batch APIs to accelerate the sending process + * to write the outgoing buffer to the wire. The encryption APIs + * will re-encrypt the data with transport key so that data is prototect + * on the wire. + */ + if (memcrypt_enabled() && is_hygon_cpu() && !migration_in_postcopy()) + return ram_save_encrypted_pages_in_batch(rs, pss); +#endif + /* Update host page boundary information */ pss_host_page_prepare(pss); diff --git a/target/i386/csv.h b/target/i386/csv.h index 977f08b982..74a54f9b9c 100644 --- a/target/i386/csv.h +++ b/target/i386/csv.h @@ -44,6 +44,8 @@ static bool __attribute__((unused)) is_hygon_cpu(void) #endif +#define CSV_OUTGOING_PAGE_WINDOW_SIZE (4094 * TARGET_PAGE_SIZE) + typedef struct CsvBatchCmdList CsvBatchCmdList; typedef void (*CsvDestroyCmdNodeFn) (void *data); -- Gitee From eac3cab8dcd005b33365b5196801268d696a11bc Mon Sep 17 00:00:00 2001 From: fangbaoshun Date: Mon, 2 Aug 2021 14:49:45 +0800 Subject: [PATCH 312/939] migration/ram: Accelerate the loading of CSV guest's encrypted pages When memory encryption is enabled, the guest memory will be encrypted with the guest specific key. The patch introduces an accelerate solution which queued the pages into list and load them togather by COMMAND_BATCH. 
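Conceptually, the receive side dispatches on the per-chunk flag as in the simplified sketch below: plain encrypted pages are loaded directly, BATCH entries are only queued, and the BATCH_END entry queues the final page and then triggers the batched load. The enc_ops struct is a stand-in for ConfidentialGuestMemoryEncryptionOps, and the flag values follow the RAM_SAVE_ENCRYPTED_PAGE* definitions in this series; this is not the actual QEMU code.

```
/*
 * Simplified dispatch on the incoming encrypted-data flag: plain pages
 * are loaded directly, BATCH entries are queued, and BATCH_END queues
 * the last page and then loads the whole queue in one batched command.
 */
#include <stdint.h>

#define FLAG_ENCRYPTED_PAGE   0x1
#define FLAG_PAGE_BATCH       0x4
#define FLAG_PAGE_BATCH_END   0x5

struct enc_ops {
    int (*load_incoming_page)(void *f, uint8_t *ptr);
    int (*queue_incoming_page)(void *f, uint8_t *ptr);
    int (*load_queued_incoming_pages)(void *f);
};

static int dispatch_encrypted_data(const struct enc_ops *ops, uint32_t flag,
                                   void *f, uint8_t *ptr)
{
    switch (flag) {
    case FLAG_ENCRYPTED_PAGE:
        return ops->load_incoming_page(f, ptr);
    case FLAG_PAGE_BATCH:
        return ops->queue_incoming_page(f, ptr);
    case FLAG_PAGE_BATCH_END:
        if (ops->queue_incoming_page(f, ptr)) {
            return -1;
        }
        return ops->load_queued_incoming_pages(f);
    default:
        return -1;  /* unknown flag */
    }
}
```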
Signed-off-by: hanliyang --- migration/ram.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/migration/ram.c b/migration/ram.c index 7747f5af3a..790c0413c1 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -1297,6 +1297,14 @@ static int load_encrypted_data(QEMUFile *f, uint8_t *ptr) return ops->load_incoming_page(f, ptr); } else if (flag == RAM_SAVE_SHARED_REGIONS_LIST) { return ops->load_incoming_shared_regions_list(f); + } else if (flag == RAM_SAVE_ENCRYPTED_PAGE_BATCH) { + return ops->queue_incoming_page(f, ptr); + } else if (flag == RAM_SAVE_ENCRYPTED_PAGE_BATCH_END) { + if (ops->queue_incoming_page(f, ptr)) { + error_report("Failed to queue incoming data"); + return -EINVAL; + } + return ops->load_queued_incoming_pages(f); } else { error_report("unknown encrypted flag %x", flag); return 1; -- Gitee From 940858a3ab39575a0c1d91d4aa5bb65607259a8f Mon Sep 17 00:00:00 2001 From: hanliyang Date: Tue, 7 Jun 2022 15:19:32 +0800 Subject: [PATCH 313/939] target/i386: csv: Add support for migrate VMSA for CSV2 guest CSV2 can protect guest's cpu state through memory encryption. Each vcpu has its corresponding memory, which is also called VMSA, and is encrypted by guest's specific encrytion key. When CSV2 guest exit to host, the vcpu's state will be encrypted and saved to VMSA, and the VMSA will be decrypted and loaded to cpu when the guest's vcpu running at next time. If user wants to migrate one CSV2 guest to target machine, the VMSA of the vcpus also should be migrated to target. CSV firmware provides SEND_UPDATE_VMSA/RECEIVE_UPDATE_VMSA API through which VMSA can be converted into secure data and transmitted to the remote end (for example, network transmission). The migration of cpu state is identified by CPUState.cpu_index which may not equals to vcpu id from KVM's perspective. When migrate the VMSA, the source QEMU will invoke SEND_UPDATE_VMSA to generate data correspond to VMSA, after target QEMU received the data, it will calc target vcpu id in the KVM by CPUState.cpu_index, and then invoke RECEIVE_UPDATE_VMSA to restore VMSA correspond to vcpu. 
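A small sketch of the identity mapping described above: the source keys each VMSA record by CPUState.cpu_index, and the target must translate that index back to the KVM vcpu id before issuing RECEIVE_UPDATE_VMSA. The lookup table below is a stand-in for QEMU's CPU_FOREACH()/kvm_arch_vcpu_id() walk and is illustrative only.

```
/*
 * Sketch of resolving the migrated cpu_index back to a KVM vcpu id on
 * the target before RECEIVE_UPDATE_VMSA is issued for that vCPU.
 */
#include <stddef.h>
#include <stdint.h>

struct vcpu {
    uint32_t cpu_index;     /* stable identifier carried in the stream */
    uint32_t kvm_vcpu_id;   /* identifier the KVM/CSV interface expects */
};

static int resolve_kvm_vcpu_id(const struct vcpu *vcpus, size_t n,
                               uint32_t cpu_index, uint32_t *kvm_id)
{
    for (size_t i = 0; i < n; i++) {
        if (vcpus[i].cpu_index == cpu_index) {
            *kvm_id = vcpus[i].kvm_vcpu_id;
            return 0;
        }
    }
    return -1;  /* vCPU not present on the target */
}
```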
Signed-off-by: hanliyang --- include/exec/confidential-guest-support.h | 6 + linux-headers/linux/kvm.h | 16 ++ migration/ram.c | 42 +++++ target/i386/csv.h | 2 + target/i386/sev.c | 201 ++++++++++++++++++++++ target/i386/sev.h | 1 + target/i386/trace-events | 2 + 7 files changed, 270 insertions(+) diff --git a/include/exec/confidential-guest-support.h b/include/exec/confidential-guest-support.h index cb14b815cb..2cba27642f 100644 --- a/include/exec/confidential-guest-support.h +++ b/include/exec/confidential-guest-support.h @@ -90,6 +90,12 @@ struct ConfidentialGuestMemoryEncryptionOps { /* Load the incoming encrypted pages queued in list into guest memory */ int (*load_queued_incoming_pages)(QEMUFile *f); + + /* Write the encrypted cpu state */ + int (*save_outgoing_cpu_state)(QEMUFile *f, uint64_t *bytes_sent); + + /* Load the encrypted cpu state */ + int (*load_incoming_cpu_state)(QEMUFile *f); }; typedef struct ConfidentialGuestSupportClass { diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h index fcd09126a1..e9cd0ebaf1 100644 --- a/linux-headers/linux/kvm.h +++ b/linux-headers/linux/kvm.h @@ -2052,6 +2052,14 @@ struct kvm_sev_send_update_data { __u32 trans_len; }; +struct kvm_sev_send_update_vmsa { + __u32 vcpu_id; + __u64 hdr_uaddr; + __u32 hdr_len; + __u64 trans_uaddr; + __u32 trans_len; +}; + struct kvm_sev_receive_start { __u32 handle; __u32 policy; @@ -2070,6 +2078,14 @@ struct kvm_sev_receive_update_data { __u32 trans_len; }; +struct kvm_sev_receive_update_vmsa { + __u32 vcpu_id; + __u64 hdr_uaddr; + __u32 hdr_len; + __u64 trans_uaddr; + __u32 trans_len; +}; + struct kvm_csv_batch_list_node { __u64 cmd_data_addr; __u64 addr; diff --git a/migration/ram.c b/migration/ram.c index 790c0413c1..1377b9eb37 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -1281,6 +1281,33 @@ static int ram_save_shared_region_list(RAMState *rs, QEMUFile *f) return 0; } +/** + * ram_save_encrypted_cpu_state: send the encrypted cpu state + */ +static int ram_save_encrypted_cpu_state(RAMState *rs, QEMUFile *f) +{ + int ret; + uint64_t bytes_xmit = 0; + PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY]; + MachineState *ms = MACHINE(qdev_get_machine()); + ConfidentialGuestSupportClass *cgs_class = + (ConfidentialGuestSupportClass *) object_get_class(OBJECT(ms->cgs)); + struct ConfidentialGuestMemoryEncryptionOps *ops = + cgs_class->memory_encryption_ops; + + ram_transferred_add(save_page_header(pss, f, + pss->last_sent_block, + RAM_SAVE_FLAG_ENCRYPTED_DATA)); + qemu_put_be32(f, RAM_SAVE_ENCRYPTED_CPU_STATE); + ret = ops->save_outgoing_cpu_state(f, &bytes_xmit); + if (ret < 0) { + return ret; + } + ram_transferred_add(4 + bytes_xmit); + + return 0; +} + static int load_encrypted_data(QEMUFile *f, uint8_t *ptr) { MachineState *ms = MACHINE(qdev_get_machine()); @@ -1305,6 +1332,8 @@ static int load_encrypted_data(QEMUFile *f, uint8_t *ptr) return -EINVAL; } return ops->load_queued_incoming_pages(f); + } else if (flag == RAM_SAVE_ENCRYPTED_CPU_STATE) { + return ops->load_incoming_cpu_state(f); } else { error_report("unknown encrypted flag %x", flag); return 1; @@ -3494,6 +3523,19 @@ static int ram_save_complete(QEMUFile *f, void *opaque) qemu_file_set_error(f, ret); return ret; } + + /* + * send the encrypted cpu state, for example, CSV2 guest's + * vmsa for each vcpu. 
+ */ + if (is_hygon_cpu()) { + ret = ram_save_encrypted_cpu_state(rs, f); + if (ret < 0) { + error_report("Failed to save encrypted cpu state"); + qemu_file_set_error(f, ret); + return ret; + } + } } } diff --git a/target/i386/csv.h b/target/i386/csv.h index 74a54f9b9c..47741a0a4f 100644 --- a/target/i386/csv.h +++ b/target/i386/csv.h @@ -59,5 +59,7 @@ int csv_queue_outgoing_page(uint8_t *ptr, uint32_t sz, uint64_t addr); int csv_save_queued_outgoing_pages(QEMUFile *f, uint64_t *bytes_sent); int csv_queue_incoming_page(QEMUFile *f, uint8_t *ptr); int csv_load_queued_incoming_pages(QEMUFile *f); +int csv_save_outgoing_cpu_state(QEMUFile *f, uint64_t *bytes_sent); +int csv_load_incoming_cpu_state(QEMUFile *f); #endif diff --git a/target/i386/sev.c b/target/i386/sev.c index 2dee46d852..6ba71c91d7 100644 --- a/target/i386/sev.c +++ b/target/i386/sev.c @@ -90,6 +90,10 @@ struct SevGuestState { gchar *send_packet_hdr; size_t send_packet_hdr_len; + /* needed by live migration of HYGON CSV2 guest */ + gchar *send_vmsa_packet_hdr; + size_t send_vmsa_packet_hdr_len; + uint32_t reset_cs; uint32_t reset_ip; bool reset_data_valid; @@ -183,6 +187,9 @@ static const char *const sev_fw_errlist[] = { #define SHARED_REGION_LIST_CONT 0x1 #define SHARED_REGION_LIST_END 0x2 +#define ENCRYPTED_CPU_STATE_CONT 0x1 +#define ENCRYPTED_CPU_STATE_END 0x2 + static struct ConfidentialGuestMemoryEncryptionOps sev_memory_encryption_ops = { .save_setup = sev_save_setup, .save_outgoing_page = sev_save_outgoing_page, @@ -194,6 +201,8 @@ static struct ConfidentialGuestMemoryEncryptionOps sev_memory_encryption_ops = { .save_queued_outgoing_pages = csv_save_queued_outgoing_pages, .queue_incoming_page = csv_queue_incoming_page, .load_queued_incoming_pages = csv_load_queued_incoming_pages, + .save_outgoing_cpu_state = csv_save_outgoing_cpu_state, + .load_incoming_cpu_state = csv_load_incoming_cpu_state, }; static int @@ -1047,6 +1056,9 @@ sev_send_finish(void) } g_free(sev_guest->send_packet_hdr); + if (sev_es_enabled() && is_hygon_cpu()) { + g_free(sev_guest->send_vmsa_packet_hdr); + } sev_set_guest_state(sev_guest, SEV_STATE_RUNNING); } @@ -2238,6 +2250,195 @@ int csv_load_queued_incoming_pages(QEMUFile *f) return csv_receive_update_data_batch(s); } +static int +sev_send_vmsa_get_packet_len(int *fw_err) +{ + int ret; + struct kvm_sev_send_update_vmsa update = { 0, }; + + ret = sev_ioctl(sev_guest->sev_fd, KVM_SEV_SEND_UPDATE_VMSA, + &update, fw_err); + if (*fw_err != SEV_RET_INVALID_LEN) { + ret = 0; + error_report("%s: failed to get session length ret=%d fw_error=%d '%s'", + __func__, ret, *fw_err, fw_error_to_str(*fw_err)); + goto err; + } + + ret = update.hdr_len; + +err: + return ret; +} + +static int +sev_send_update_vmsa(SevGuestState *s, QEMUFile *f, uint32_t cpu_id, + uint32_t cpu_index, uint32_t size, uint64_t *bytes_sent) +{ + int ret, fw_error; + guchar *trans = NULL; + struct kvm_sev_send_update_vmsa update = {}; + + /* + * If this is first call then query the packet header bytes and allocate + * the packet buffer. 
+ */ + if (!s->send_vmsa_packet_hdr) { + s->send_vmsa_packet_hdr_len = sev_send_vmsa_get_packet_len(&fw_error); + if (s->send_vmsa_packet_hdr_len < 1) { + error_report("%s: SEND_UPDATE_VMSA fw_error=%d '%s'", + __func__, fw_error, fw_error_to_str(fw_error)); + return 1; + } + + s->send_vmsa_packet_hdr = g_new(gchar, s->send_vmsa_packet_hdr_len); + } + + /* allocate transport buffer */ + trans = g_new(guchar, size); + + update.vcpu_id = cpu_id; + update.hdr_uaddr = (uintptr_t)s->send_vmsa_packet_hdr; + update.hdr_len = s->send_vmsa_packet_hdr_len; + update.trans_uaddr = (uintptr_t)trans; + update.trans_len = size; + + trace_kvm_sev_send_update_vmsa(cpu_id, cpu_index, trans, size); + + ret = sev_ioctl(s->sev_fd, KVM_SEV_SEND_UPDATE_VMSA, &update, &fw_error); + if (ret) { + error_report("%s: SEND_UPDATE_VMSA ret=%d fw_error=%d '%s'", + __func__, ret, fw_error, fw_error_to_str(fw_error)); + goto err; + } + + /* + * Migration of vCPU's VMState according to the instance_id + * (i.e. CPUState.cpu_index) + */ + qemu_put_be32(f, sizeof(uint32_t)); + qemu_put_buffer(f, (uint8_t *)&cpu_index, sizeof(uint32_t)); + *bytes_sent += 4 + sizeof(uint32_t); + + qemu_put_be32(f, update.hdr_len); + qemu_put_buffer(f, (uint8_t *)update.hdr_uaddr, update.hdr_len); + *bytes_sent += 4 + update.hdr_len; + + qemu_put_be32(f, update.trans_len); + qemu_put_buffer(f, (uint8_t *)update.trans_uaddr, update.trans_len); + *bytes_sent += 4 + update.trans_len; + +err: + g_free(trans); + return ret; +} + +int csv_save_outgoing_cpu_state(QEMUFile *f, uint64_t *bytes_sent) +{ + SevGuestState *s = sev_guest; + CPUState *cpu; + int ret = 0; + + /* Only support migrate VMSAs for HYGON CSV2 guest */ + if (!sev_es_enabled() || !is_hygon_cpu()) { + return 0; + } + + CPU_FOREACH(cpu) { + qemu_put_be32(f, ENCRYPTED_CPU_STATE_CONT); + *bytes_sent += 4; + ret = sev_send_update_vmsa(s, f, kvm_arch_vcpu_id(cpu), + cpu->cpu_index, TARGET_PAGE_SIZE, bytes_sent); + if (ret) { + goto err; + } + } + + qemu_put_be32(f, ENCRYPTED_CPU_STATE_END); + *bytes_sent += 4; + +err: + return ret; +} + +static int sev_receive_update_vmsa(QEMUFile *f) +{ + int ret = 1, fw_error = 0; + CPUState *cpu; + uint32_t cpu_index, cpu_id = 0; + gchar *hdr = NULL, *trans = NULL; + struct kvm_sev_receive_update_vmsa update = {}; + + /* get cpu index buffer */ + assert(qemu_get_be32(f) == sizeof(uint32_t)); + qemu_get_buffer(f, (uint8_t *)&cpu_index, sizeof(uint32_t)); + + CPU_FOREACH(cpu) { + if (cpu->cpu_index == cpu_index) { + cpu_id = kvm_arch_vcpu_id(cpu); + break; + } + } + update.vcpu_id = cpu_id; + + /* get packet header */ + update.hdr_len = qemu_get_be32(f); + if (!check_blob_length(update.hdr_len)) { + return 1; + } + + hdr = g_new(gchar, update.hdr_len); + qemu_get_buffer(f, (uint8_t *)hdr, update.hdr_len); + update.hdr_uaddr = (uintptr_t)hdr; + + /* get transport buffer */ + update.trans_len = qemu_get_be32(f); + if (!check_blob_length(update.trans_len)) { + goto err; + } + + trans = g_new(gchar, update.trans_len); + update.trans_uaddr = (uintptr_t)trans; + qemu_get_buffer(f, (uint8_t *)update.trans_uaddr, update.trans_len); + + trace_kvm_sev_receive_update_vmsa(cpu_id, cpu_index, + trans, update.trans_len, hdr, update.hdr_len); + + ret = sev_ioctl(sev_guest->sev_fd, KVM_SEV_RECEIVE_UPDATE_VMSA, + &update, &fw_error); + if (ret) { + error_report("Error RECEIVE_UPDATE_VMSA ret=%d fw_error=%d '%s'", + ret, fw_error, fw_error_to_str(fw_error)); + } + +err: + g_free(trans); + g_free(hdr); + return ret; +} + +int csv_load_incoming_cpu_state(QEMUFile *f) +{ + int 
status, ret = 0; + + /* Only support migrate VMSAs for HYGON CSV2 guest */ + if (!sev_es_enabled() || !is_hygon_cpu()) { + return 0; + } + + status = qemu_get_be32(f); + while (status == ENCRYPTED_CPU_STATE_CONT) { + ret = sev_receive_update_vmsa(f); + if (ret) { + break; + } + + status = qemu_get_be32(f); + } + + return ret; +} + static const QemuUUID sev_hash_table_header_guid = { .data = UUID_LE(0x9438d606, 0x4f22, 0x4cc9, 0xb4, 0x79, 0xa7, 0x93, 0xd4, 0x11, 0xfd, 0x21) diff --git a/target/i386/sev.h b/target/i386/sev.h index f7886116e7..209c92fd6f 100644 --- a/target/i386/sev.h +++ b/target/i386/sev.h @@ -43,6 +43,7 @@ typedef struct SevKernelLoaderContext { #define RAM_SAVE_ENCRYPTED_PAGE_BATCH 0x4 #define RAM_SAVE_ENCRYPTED_PAGE_BATCH_END 0x5 +#define RAM_SAVE_ENCRYPTED_CPU_STATE 0x6 #ifdef CONFIG_SEV bool sev_enabled(void); diff --git a/target/i386/trace-events b/target/i386/trace-events index 475de65ad4..87b765c73c 100644 --- a/target/i386/trace-events +++ b/target/i386/trace-events @@ -17,3 +17,5 @@ kvm_sev_send_finish(void) "" kvm_sev_receive_start(int policy, void *session, void *pdh) "policy 0x%x session %p pdh %p" kvm_sev_receive_update_data(void *src, void *dst, int len, void *hdr, int hdr_len) "guest %p trans %p len %d hdr %p hdr_len %d" kvm_sev_receive_finish(void) "" +kvm_sev_send_update_vmsa(uint32_t cpu_id, uint32_t cpu_index, void *dst, int len) "cpu_id %d cpu_index %d trans %p len %d" +kvm_sev_receive_update_vmsa(uint32_t cpu_id, uint32_t cpu_index, void *src, int len, void *hdr, int hdr_len) "cpu_id %d cpu_index %d trans %p len %d hdr %p hdr_len %d" -- Gitee From 6a8b58a3ce6dc162cae4b74ca8f39392672e6cba Mon Sep 17 00:00:00 2001 From: panpingsheng Date: Sat, 12 Jun 2021 15:15:29 +0800 Subject: [PATCH 314/939] target/i386: get/set/migrate GHCB state GHCB state is necessary to CSV2 guest when migrating to target. Add GHCB related definition, it also adds corresponding part to kvm_get/put, and vmstate. Signed-off-by: hanliyang --- linux-headers/linux/kvm.h | 2 ++ target/i386/cpu.h | 5 +++++ target/i386/kvm/kvm.c | 11 +++++++++++ target/i386/kvm/sev-stub.c | 2 ++ target/i386/machine.c | 24 ++++++++++++++++++++++++ target/i386/sev.c | 10 ++++++++++ target/i386/sev.h | 2 ++ 7 files changed, 56 insertions(+) diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h index e9cd0ebaf1..e796105b76 100644 --- a/linux-headers/linux/kvm.h +++ b/linux-headers/linux/kvm.h @@ -1203,6 +1203,8 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_ARM_TMM 300 +#define KVM_CAP_SEV_ES_GHCB 500 + #define KVM_CAP_ARM_VIRT_MSI_BYPASS 799 #define KVM_EXIT_HYPERCALL_VALID_MASK (1 << KVM_HC_MAP_GPA_RANGE) diff --git a/target/i386/cpu.h b/target/i386/cpu.h index 6993552cd9..a9a646bba2 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -520,6 +520,8 @@ typedef enum X86Seg { #define MSR_VM_HSAVE_PA 0xc0010117 +#define MSR_AMD64_SEV_ES_GHCB 0xc0010130 + #define MSR_IA32_XFD 0x000001c4 #define MSR_IA32_XFD_ERR 0x000001c5 @@ -1885,6 +1887,9 @@ typedef struct CPUArchState { /* Number of dies within this CPU package. 
*/ unsigned nr_dies; + + /* GHCB guest physical address info */ + uint64_t ghcb_gpa; } CPUX86State; struct kvm_msrs; diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index 5730d0e0c0..9e65242739 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -3625,6 +3625,10 @@ static int kvm_put_msrs(X86CPU *cpu, int level) } } + if (sev_kvm_has_msr_ghcb) { + kvm_msr_entry_add(cpu, MSR_AMD64_SEV_ES_GHCB, env->ghcb_gpa); + } + return kvm_buf_set_msrs(cpu); } @@ -3999,6 +4003,10 @@ static int kvm_get_msrs(X86CPU *cpu) } } + if (sev_kvm_has_msr_ghcb) { + kvm_msr_entry_add(cpu, MSR_AMD64_SEV_ES_GHCB, 0); + } + ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, cpu->kvm_msr_buf); if (ret < 0) { return ret; @@ -4319,6 +4327,9 @@ static int kvm_get_msrs(X86CPU *cpu) case MSR_ARCH_LBR_INFO_0 ... MSR_ARCH_LBR_INFO_0 + 31: env->lbr_records[index - MSR_ARCH_LBR_INFO_0].info = msrs[i].data; break; + case MSR_AMD64_SEV_ES_GHCB: + env->ghcb_gpa = msrs[i].data; + break; } } diff --git a/target/i386/kvm/sev-stub.c b/target/i386/kvm/sev-stub.c index 99899688e4..a0aac1117f 100644 --- a/target/i386/kvm/sev-stub.c +++ b/target/i386/kvm/sev-stub.c @@ -14,6 +14,8 @@ #include "qemu/osdep.h" #include "sev.h" +bool sev_kvm_has_msr_ghcb; + int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp) { /* If we get here, cgs must be some non-SEV thing */ diff --git a/target/i386/machine.c b/target/i386/machine.c index a1041ef828..9a1cb8f3b8 100644 --- a/target/i386/machine.c +++ b/target/i386/machine.c @@ -1605,6 +1605,27 @@ static const VMStateDescription vmstate_triple_fault = { } }; +#if defined(CONFIG_KVM) && defined(TARGET_X86_64) +static bool msr_ghcb_gpa_needed(void *opaque) +{ + X86CPU *cpu = opaque; + CPUX86State *env = &cpu->env; + + return env->ghcb_gpa != 0; +} + +static const VMStateDescription vmstate_msr_ghcb_gpa = { + .name = "cpu/svm_msr_ghcb_gpa", + .version_id = 1, + .minimum_version_id = 1, + .needed = msr_ghcb_gpa_needed, + .fields = (VMStateField[]) { + VMSTATE_UINT64(env.ghcb_gpa, X86CPU), + VMSTATE_END_OF_LIST() + } +}; +#endif + const VMStateDescription vmstate_x86_cpu = { .name = "cpu", .version_id = 12, @@ -1751,6 +1772,9 @@ const VMStateDescription vmstate_x86_cpu = { #endif &vmstate_arch_lbr, &vmstate_triple_fault, +#if defined(CONFIG_KVM) && defined(TARGET_X86_64) + &vmstate_msr_ghcb_gpa, +#endif NULL } }; diff --git a/target/i386/sev.c b/target/i386/sev.c index 6ba71c91d7..7744378112 100644 --- a/target/i386/sev.c +++ b/target/i386/sev.c @@ -152,6 +152,8 @@ QEMU_BUILD_BUG_ON(sizeof(PaddedSevHashTable) % 16 != 0); static SevGuestState *sev_guest; static Error *sev_mig_blocker; +bool sev_kvm_has_msr_ghcb; + static const char *const sev_fw_errlist[] = { [SEV_RET_SUCCESS] = "", [SEV_RET_INVALID_PLATFORM_STATE] = "Platform state is invalid", @@ -1198,6 +1200,14 @@ int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp) cgs_class->memory_encryption_ops = &sev_memory_encryption_ops; QTAILQ_INIT(&sev->shared_regions_list); + /* Determine whether support MSR_AMD64_SEV_ES_GHCB */ + if (sev_es_enabled()) { + sev_kvm_has_msr_ghcb = + kvm_vm_check_extension(kvm_state, KVM_CAP_SEV_ES_GHCB); + } else { + sev_kvm_has_msr_ghcb = false; + } + cgs->ready = true; return 0; diff --git a/target/i386/sev.h b/target/i386/sev.h index 209c92fd6f..0bfe3879ef 100644 --- a/target/i386/sev.h +++ b/target/i386/sev.h @@ -78,4 +78,6 @@ void sev_del_migrate_blocker(void); int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp); +extern bool sev_kvm_has_msr_ghcb; + #endif -- Gitee From 
366c11c56875ae053043c48c8b93349c6e3125cc Mon Sep 17 00:00:00 2001 From: hanliyang Date: Sun, 19 Jun 2022 16:49:45 +0800 Subject: [PATCH 315/939] target/i386/kvm: Fix the resettable info when emulate Hygon CSV2 guest SEV-ES guest will be terminated by QEMU when receive reboot request. In order to support reboot for CSV2 guest, report resettable in kvm_arch_cpu_check_are_resettable(). But the CSV2 guest is still not resettable if it was migrated to target machine. Signed-off-by: hanliyang --- target/i386/csv-sysemu-stub.c | 16 ++++++++++++++++ target/i386/csv.c | 20 ++++++++++++++++++++ target/i386/csv.h | 2 ++ target/i386/kvm/csv-stub.c | 17 +++++++++++++++++ target/i386/kvm/kvm.c | 4 ++++ target/i386/kvm/meson.build | 1 + target/i386/meson.build | 1 + target/i386/sev.c | 9 +++++++++ 8 files changed, 70 insertions(+) create mode 100644 target/i386/csv-sysemu-stub.c create mode 100644 target/i386/csv.c create mode 100644 target/i386/kvm/csv-stub.c diff --git a/target/i386/csv-sysemu-stub.c b/target/i386/csv-sysemu-stub.c new file mode 100644 index 0000000000..5874e4cc1d --- /dev/null +++ b/target/i386/csv-sysemu-stub.c @@ -0,0 +1,16 @@ +/* + * QEMU CSV system stub + * + * Copyright: Hygon Info Technologies Ltd. 2022 + * + * Author: + * Jiang Xin + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" +#include "sev.h" +#include "csv.h" diff --git a/target/i386/csv.c b/target/i386/csv.c new file mode 100644 index 0000000000..88fb05ac37 --- /dev/null +++ b/target/i386/csv.c @@ -0,0 +1,20 @@ +/* + * QEMU CSV support + * + * Copyright: Hygon Info Technologies Ltd. 2022 + * + * Author: + * Jiang Xin + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" + +#include "cpu.h" +#include "sev.h" +#include "csv.h" + +bool csv_kvm_cpu_reset_inhibit; diff --git a/target/i386/csv.h b/target/i386/csv.h index 47741a0a4f..ac4bb5bee1 100644 --- a/target/i386/csv.h +++ b/target/i386/csv.h @@ -46,6 +46,8 @@ static bool __attribute__((unused)) is_hygon_cpu(void) #define CSV_OUTGOING_PAGE_WINDOW_SIZE (4094 * TARGET_PAGE_SIZE) +extern bool csv_kvm_cpu_reset_inhibit; + typedef struct CsvBatchCmdList CsvBatchCmdList; typedef void (*CsvDestroyCmdNodeFn) (void *data); diff --git a/target/i386/kvm/csv-stub.c b/target/i386/kvm/csv-stub.c new file mode 100644 index 0000000000..4d1376f268 --- /dev/null +++ b/target/i386/kvm/csv-stub.c @@ -0,0 +1,17 @@ +/* + * QEMU CSV stub + * + * Copyright Hygon Info Technologies Ltd. 2024 + * + * Authors: + * Han Liyang + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ * + */ + +#include "qemu/osdep.h" +#include "csv.h" + +bool csv_kvm_cpu_reset_inhibit; diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index 9e65242739..2866a6d0ec 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -32,6 +32,7 @@ #include "sysemu/runstate.h" #include "kvm_i386.h" #include "sev.h" +#include "csv.h" #include "xen-emu.h" #include "hyperv.h" #include "hyperv-proto.h" @@ -5710,6 +5711,9 @@ bool kvm_has_waitpkg(void) bool kvm_arch_cpu_check_are_resettable(void) { + if (is_hygon_cpu()) + return !csv_kvm_cpu_reset_inhibit; + return !sev_es_enabled(); } diff --git a/target/i386/kvm/meson.build b/target/i386/kvm/meson.build index 84d9143e60..3c3f8cf93c 100644 --- a/target/i386/kvm/meson.build +++ b/target/i386/kvm/meson.build @@ -8,6 +8,7 @@ i386_kvm_ss.add(files( i386_kvm_ss.add(when: 'CONFIG_XEN_EMU', if_true: files('xen-emu.c')) i386_kvm_ss.add(when: 'CONFIG_SEV', if_false: files('sev-stub.c')) +i386_kvm_ss.add(when: 'CONFIG_CSV', if_false: files('csv-stub.c')) i386_system_ss.add(when: 'CONFIG_HYPERV', if_true: files('hyperv.c'), if_false: files('hyperv-stub.c')) diff --git a/target/i386/meson.build b/target/i386/meson.build index 7c74bfa859..594a0a6abf 100644 --- a/target/i386/meson.build +++ b/target/i386/meson.build @@ -21,6 +21,7 @@ i386_system_ss.add(files( 'cpu-sysemu.c', )) i386_system_ss.add(when: 'CONFIG_SEV', if_true: files('sev.c'), if_false: files('sev-sysemu-stub.c')) +i386_system_ss.add(when: 'CONFIG_CSV', if_true: files('csv.c'), if_false: files('csv-sysemu-stub.c')) i386_user_ss = ss.source_set() diff --git a/target/i386/sev.c b/target/i386/sev.c index 7744378112..2c6aecd1a3 100644 --- a/target/i386/sev.c +++ b/target/i386/sev.c @@ -1190,6 +1190,15 @@ int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp) error_setg(errp, "%s: failed to create encryption context", __func__); goto err; } + } else { + /* + * The CSV2 guest is not resettable after migrated to target machine, + * set csv_kvm_cpu_reset_inhibit to true to indicate the CSV2 guest is + * not resettable. + */ + if (is_hygon_cpu() && sev_es_enabled()) { + csv_kvm_cpu_reset_inhibit = true; + } } ram_block_notifier_add(&sev_ram_notifier); -- Gitee From 09934a231a513289caaae68e68912b735cb44b75 Mon Sep 17 00:00:00 2001 From: hanliyang Date: Thu, 15 Apr 2021 08:32:24 -0400 Subject: [PATCH 316/939] kvm: Add support for CSV2 reboot Linux will set vcpu.arch.guest_state_protected to true after execute LAUNCH_UPDATE_VMSA successfully, and then KVM will prevent any changes to VMCB State Save Area. In order to support CSV2 guest reboot, calls cpus_control_pre_system_reset() to set vcpu.arch.guest_state_protected to false, and calls cpus_control_post_system_reset() to restore VMSA of guest's vcpu with data generated by LAUNCH_UPDATE_VMSA. In addition, for memory encrypted guest, additional works may be required during system reset, such as flushing the cache. The function cpus_control_post_system_reset() hints linux to flush caches of guest memory. 
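For orientation, a minimal sketch of how an accelerator opts into these hooks follows. The accelerator name and callback bodies are hypothetical; only the AccelOpsClass members and the NULL-guarded dispatch come from this patch, and an accelerator that leaves the pointers unset is simply skipped by cpus_control_pre_system_reset()/cpus_control_post_system_reset().

/* Sketch only: a hypothetical accelerator wiring up the optional hooks. */
#include "qemu/osdep.h"
#include "qom/object.h"
#include "sysemu/accel-ops.h"

static void foo_cpus_control_pre_system_reset(void)
{
    /* e.g. ask the kernel to allow protected vCPU state to be rewritten */
}

static void foo_cpus_control_post_system_reset(void)
{
    /* e.g. restore the launch-time vCPU state and flush guest memory caches */
}

static void foo_accel_ops_class_init(ObjectClass *oc, void *data)
{
    AccelOpsClass *ops = ACCEL_OPS_CLASS(oc);

    /* Both hooks are optional; the dispatchers in system/cpus.c check the
     * pointers for NULL before calling them. */
    ops->control_pre_system_reset = foo_cpus_control_pre_system_reset;
    ops->control_post_system_reset = foo_cpus_control_post_system_reset;
}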
Signed-off-by: hanliyang --- accel/kvm/kvm-accel-ops.c | 3 +++ accel/kvm/kvm-all.c | 10 ++++++++++ accel/kvm/kvm-cpus.h | 3 +++ include/sysemu/accel-ops.h | 3 +++ include/sysemu/cpus.h | 2 ++ linux-headers/linux/kvm.h | 4 ++++ system/cpus.c | 14 ++++++++++++++ system/runstate.c | 5 +++++ 8 files changed, 44 insertions(+) diff --git a/accel/kvm/kvm-accel-ops.c b/accel/kvm/kvm-accel-ops.c index 6195150a0b..54f19028b8 100644 --- a/accel/kvm/kvm-accel-ops.c +++ b/accel/kvm/kvm-accel-ops.c @@ -112,6 +112,9 @@ static void kvm_accel_ops_class_init(ObjectClass *oc, void *data) ops->remove_breakpoint = kvm_remove_breakpoint; ops->remove_all_breakpoints = kvm_remove_all_breakpoints; #endif + + ops->control_pre_system_reset = kvm_cpus_control_pre_system_reset; + ops->control_post_system_reset = kvm_cpus_control_post_system_reset; } static const TypeInfo kvm_accel_ops_type = { diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index dc3605e648..8077630825 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -2810,6 +2810,16 @@ void kvm_cpu_synchronize_pre_loadvm(CPUState *cpu) run_on_cpu(cpu, do_kvm_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL); } +void kvm_cpus_control_pre_system_reset(void) +{ + kvm_vm_ioctl(kvm_state, KVM_CONTROL_VCPU_PRE_SYSTEM_RESET, NULL); +} + +void kvm_cpus_control_post_system_reset(void) +{ + kvm_vm_ioctl(kvm_state, KVM_CONTROL_VCPU_POST_SYSTEM_RESET, NULL); +} + #ifdef KVM_HAVE_MCE_INJECTION static __thread void *pending_sigbus_addr; static __thread int pending_sigbus_code; diff --git a/accel/kvm/kvm-cpus.h b/accel/kvm/kvm-cpus.h index ca40add32c..27b9d0d9db 100644 --- a/accel/kvm/kvm-cpus.h +++ b/accel/kvm/kvm-cpus.h @@ -23,4 +23,7 @@ int kvm_insert_breakpoint(CPUState *cpu, int type, vaddr addr, vaddr len); int kvm_remove_breakpoint(CPUState *cpu, int type, vaddr addr, vaddr len); void kvm_remove_all_breakpoints(CPUState *cpu); +void kvm_cpus_control_pre_system_reset(void); +void kvm_cpus_control_post_system_reset(void); + #endif /* KVM_CPUS_H */ diff --git a/include/sysemu/accel-ops.h b/include/sysemu/accel-ops.h index ef91fc28bb..7a32e7f820 100644 --- a/include/sysemu/accel-ops.h +++ b/include/sysemu/accel-ops.h @@ -53,6 +53,9 @@ struct AccelOpsClass { int (*insert_breakpoint)(CPUState *cpu, int type, vaddr addr, vaddr len); int (*remove_breakpoint)(CPUState *cpu, int type, vaddr addr, vaddr len); void (*remove_all_breakpoints)(CPUState *cpu); + + void (*control_pre_system_reset)(void); + void (*control_post_system_reset)(void); }; #endif /* ACCEL_OPS_H */ diff --git a/include/sysemu/cpus.h b/include/sysemu/cpus.h index b4a566cfe7..f24d27daf5 100644 --- a/include/sysemu/cpus.h +++ b/include/sysemu/cpus.h @@ -44,6 +44,8 @@ extern int icount_align_option; void qemu_cpu_kick_self(void); bool cpus_are_resettable(void); +void cpus_control_pre_system_reset(void); +void cpus_control_post_system_reset(void); void cpu_synchronize_all_states(void); void cpu_synchronize_all_post_reset(void); diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h index e796105b76..eb30402c2d 100644 --- a/linux-headers/linux/kvm.h +++ b/linux-headers/linux/kvm.h @@ -1626,6 +1626,10 @@ struct kvm_master_dev_info #define KVM_GET_DEVICE_ATTR _IOW(KVMIO, 0xe2, struct kvm_device_attr) #define KVM_HAS_DEVICE_ATTR _IOW(KVMIO, 0xe3, struct kvm_device_attr) +/* ioctls for control vcpu setup during system reset */ +#define KVM_CONTROL_VCPU_PRE_SYSTEM_RESET _IO(KVMIO, 0xe8) +#define KVM_CONTROL_VCPU_POST_SYSTEM_RESET _IO(KVMIO, 0xe9) + /* * ioctls for vcpu fds */ diff --git 
a/system/cpus.c b/system/cpus.c index f2289e9545..d9de09b9e8 100644 --- a/system/cpus.c +++ b/system/cpus.c @@ -193,6 +193,20 @@ void cpu_synchronize_pre_loadvm(CPUState *cpu) } } +void cpus_control_pre_system_reset(void) +{ + if (cpus_accel->control_pre_system_reset) { + cpus_accel->control_pre_system_reset(); + } +} + +void cpus_control_post_system_reset(void) +{ + if (cpus_accel->control_post_system_reset) { + cpus_accel->control_post_system_reset(); + } +} + bool cpus_are_resettable(void) { if (cpus_accel->cpus_are_resettable) { diff --git a/system/runstate.c b/system/runstate.c index 538c645326..7e41626bb1 100644 --- a/system/runstate.c +++ b/system/runstate.c @@ -487,6 +487,8 @@ void qemu_system_reset(ShutdownCause reason) mc = current_machine ? MACHINE_GET_CLASS(current_machine) : NULL; + cpus_control_pre_system_reset(); + cpu_synchronize_all_states(); if (mc && mc->reset) { @@ -503,6 +505,9 @@ void qemu_system_reset(ShutdownCause reason) qapi_event_send_reset(shutdown_caused_by_guest(reason), reason); } cpu_synchronize_all_post_reset(); + + cpus_control_post_system_reset(); + monitor_qapi_event_discard_io_error(); } -- Gitee From fcd3ff011e62739b824c2e465e01b98c47e364f5 Mon Sep 17 00:00:00 2001 From: qihao Date: Fri, 16 Aug 2024 17:01:07 +0800 Subject: [PATCH 317/939] hw/core/ptimer: fix timer zero period condition for freq > 1GHz cheery-pick from 446e5e8b4515e9a7be69ef6a29852975289bb6f0 The real period is zero when both period and period_frac are zero. Check the method ptimer_set_freq, if freq is larger than 1000 MHz, the period is zero, but the period_frac is not, in this case, the ptimer will work but the current code incorrectly recognizes that the ptimer is disabled. Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2306 Signed-off-by: JianZhou Yue Message-id: 3DA024AEA8B57545AF1B3CAA37077D0FB75E82C8@SHASXM03.verisilicon.com Reviewed-by: Peter Maydell Signed-off-by: Peter Maydell Signed-off-by: qihao_yewu --- hw/core/ptimer.c | 4 ++-- tests/unit/ptimer-test.c | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/hw/core/ptimer.c b/hw/core/ptimer.c index e03165febf..7177ecfab0 100644 --- a/hw/core/ptimer.c +++ b/hw/core/ptimer.c @@ -83,7 +83,7 @@ static void ptimer_reload(ptimer_state *s, int delta_adjust) delta = s->delta = s->limit; } - if (s->period == 0) { + if (s->period == 0 && s->period_frac == 0) { if (!qtest_enabled()) { fprintf(stderr, "Timer with period zero, disabling\n"); } @@ -309,7 +309,7 @@ void ptimer_run(ptimer_state *s, int oneshot) assert(s->in_transaction); - if (was_disabled && s->period == 0) { + if (was_disabled && s->period == 0 && s->period_frac == 0) { if (!qtest_enabled()) { fprintf(stderr, "Timer with period zero, disabling\n"); } diff --git a/tests/unit/ptimer-test.c b/tests/unit/ptimer-test.c index 04b5f4e3d0..08240594bb 100644 --- a/tests/unit/ptimer-test.c +++ b/tests/unit/ptimer-test.c @@ -763,6 +763,33 @@ static void check_oneshot_with_load_0(gconstpointer arg) ptimer_free(ptimer); } +static void check_freq_more_than_1000M(gconstpointer arg) +{ + const uint8_t *policy = arg; + ptimer_state *ptimer = ptimer_init(ptimer_trigger, NULL, *policy); + bool no_round_down = (*policy & PTIMER_POLICY_NO_COUNTER_ROUND_DOWN); + + triggered = false; + + ptimer_transaction_begin(ptimer); + ptimer_set_freq(ptimer, 2000000000); + ptimer_set_limit(ptimer, 8, 1); + ptimer_run(ptimer, 1); + ptimer_transaction_commit(ptimer); + + qemu_clock_step(3); + + g_assert_cmpuint(ptimer_get_count(ptimer), ==, 
no_round_down ? 3 : 2); + g_assert_false(triggered); + + qemu_clock_step(1); + + g_assert_cmpuint(ptimer_get_count(ptimer), ==, 0); + g_assert_true(triggered); + + ptimer_free(ptimer); +} + static void add_ptimer_tests(uint8_t policy) { char policy_name[256] = ""; @@ -857,6 +884,12 @@ static void add_ptimer_tests(uint8_t policy) policy_name), g_memdup2(&policy, 1), check_oneshot_with_load_0, g_free); g_free(tmp); + + g_test_add_data_func_full( + tmp = g_strdup_printf("/ptimer/freq_more_than_1000M policy=%s", + policy_name), + g_memdup2(&policy, 1), check_freq_more_than_1000M, g_free); + g_free(tmp); } static void add_all_ptimer_policies_comb_tests(void) -- Gitee From db722158867b3b7541ed788b0a0f42a29a839ee4 Mon Sep 17 00:00:00 2001 From: qihao Date: Fri, 16 Aug 2024 17:51:45 +0800 Subject: [PATCH 318/939] vvfat: Fix bug in writing to middle of file cheery-pick from b881cf00c99e03bc8a3648581f97736ff275b18b Before this commit, the behavior when calling `commit_one_file` for example with `offset=0x2000` (second cluster), what will happen is that we won't fetch the next cluster from the fat, and instead use the first cluster for the read operation. This is due to off-by-one error here, where `i=0x2000 !< offset=0x2000`, thus not fetching the next cluster. Signed-off-by: Amjad Alsharafi Reviewed-by: Kevin Wolf Tested-by: Kevin Wolf Message-ID: Signed-off-by: Kevin Wolf Signed-off-by: qihao_yewu --- block/vvfat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/vvfat.c b/block/vvfat.c index 9d050ba3ae..9010f3f33f 100644 --- a/block/vvfat.c +++ b/block/vvfat.c @@ -2525,7 +2525,7 @@ commit_one_file(BDRVVVFATState* s, int dir_index, uint32_t offset) return -1; } - for (i = s->cluster_size; i < offset; i += s->cluster_size) + for (i = 0; i < offset; i += s->cluster_size) c = modified_fat_get(s, c); fd = qemu_open_old(mapping->path, O_RDWR | O_CREAT | O_BINARY, 0666); -- Gitee From f74cee44cd57da213a790f7711a68da0f4de061a Mon Sep 17 00:00:00 2001 From: xiongmengbiao Date: Thu, 30 Nov 2023 13:47:21 +0800 Subject: [PATCH 319/939] hw/misc: support vpsp simulate a psp misc device for support tkm's key isolation Signed-off-by: xiongmengbiao --- hw/misc/Kconfig | 4 ++ hw/misc/meson.build | 1 + hw/misc/psp.c | 141 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 146 insertions(+) create mode 100644 hw/misc/psp.c diff --git a/hw/misc/Kconfig b/hw/misc/Kconfig index cc8a8c1418..2ea5c68eb5 100644 --- a/hw/misc/Kconfig +++ b/hw/misc/Kconfig @@ -200,4 +200,8 @@ config IOSB config XLNX_VERSAL_TRNG bool +config PSP_DEV + bool + default y + source macio/Kconfig diff --git a/hw/misc/meson.build b/hw/misc/meson.build index 36c20d5637..28cba0ac28 100644 --- a/hw/misc/meson.build +++ b/hw/misc/meson.build @@ -9,6 +9,7 @@ system_ss.add(when: 'CONFIG_UNIMP', if_true: files('unimp.c')) system_ss.add(when: 'CONFIG_EMPTY_SLOT', if_true: files('empty_slot.c')) system_ss.add(when: 'CONFIG_LED', if_true: files('led.c')) system_ss.add(when: 'CONFIG_PVPANIC_COMMON', if_true: files('pvpanic.c')) +system_ss.add(when: 'CONFIG_PSP_DEV', if_true: files('psp.c')) # ARM devices system_ss.add(when: 'CONFIG_PL310', if_true: files('arm_l2x0.c')) diff --git a/hw/misc/psp.c b/hw/misc/psp.c new file mode 100644 index 0000000000..6ff2ceec10 --- /dev/null +++ b/hw/misc/psp.c @@ -0,0 +1,141 @@ +/* + * hygon psp device emulation + * + * Copyright 2024 HYGON Corp. + * + * This work is licensed under the terms of the GNU GPL, version 2 or (at + * your option) any later version. 
See the COPYING file in the top-level + * directory. + */ + +#include "qemu/osdep.h" +#include "qemu/compiler.h" +#include "qemu/error-report.h" +#include "qapi/error.h" +#include "migration/vmstate.h" +#include "hw/qdev-properties.h" +#include "sysemu/runstate.h" +#include + +#define TYPE_PSP_DEV "psp" +OBJECT_DECLARE_SIMPLE_TYPE(PSPDevState, PSP_DEV) + +struct PSPDevState { + /* Private */ + DeviceState pdev; + + /* Public */ + Notifier shutdown_notifier; + int dev_fd; + bool enabled; + + /** + * vid is used to identify a virtual machine in qemu. + * When a virtual machine accesses a tkm key, + * the TKM module uses different key spaces based on different vids. + */ + uint32_t vid; +}; + +#define PSP_DEV_PATH "/dev/hygon_psp_config" +#define HYGON_PSP_IOC_TYPE 'H' +#define PSP_IOC_MUTEX_ENABLE _IOWR(HYGON_PSP_IOC_TYPE, 1, NULL) +#define PSP_IOC_MUTEX_DISABLE _IOWR(HYGON_PSP_IOC_TYPE, 2, NULL) +#define PSP_IOC_VPSP_OPT _IOWR(HYGON_PSP_IOC_TYPE, 3, NULL) + +enum VPSP_DEV_CTRL_OPCODE { + VPSP_OP_VID_ADD, + VPSP_OP_VID_DEL, +}; + +struct psp_dev_ctrl { + unsigned char op; + union { + unsigned int vid; + unsigned char reserved[128]; + } data; +}; + +static void psp_dev_destroy(PSPDevState *state) +{ + struct psp_dev_ctrl ctrl = { 0 }; + if (state && state->dev_fd) { + if (state->enabled) { + ctrl.op = VPSP_OP_VID_DEL; + if (ioctl(state->dev_fd, PSP_IOC_VPSP_OPT, &ctrl) < 0) { + error_report("VPSP_OP_VID_DEL: %d", -errno); + } else { + state->enabled = false; + } + } + qemu_close(state->dev_fd); + state->dev_fd = 0; + } +} + +/** + * Guest OS performs shut down operations through 'shutdown' and 'powerdown' event. + * The 'powerdown' event will also trigger 'shutdown' in the end, + * so only attention to the 'shutdown' event. + * + * When Guest OS trigger 'reboot' or 'reset' event, to do nothing. 
+*/ +static void psp_dev_shutdown_notify(Notifier *notifier, void *data) +{ + PSPDevState *state = container_of(notifier, PSPDevState, shutdown_notifier); + psp_dev_destroy(state); +} + +static void psp_dev_realize(DeviceState *dev, Error **errp) +{ + struct psp_dev_ctrl ctrl = { 0 }; + PSPDevState *state = PSP_DEV(dev); + + state->dev_fd = qemu_open_old(PSP_DEV_PATH, O_RDWR); + if (state->dev_fd < 0) { + error_setg(errp, "fail to open %s, errno %d.", PSP_DEV_PATH, errno); + goto end; + } + + ctrl.op = VPSP_OP_VID_ADD; + ctrl.data.vid = state->vid; + if (ioctl(state->dev_fd, PSP_IOC_VPSP_OPT, &ctrl) < 0) { + error_setg(errp, "psp_dev_realize VPSP_OP_VID_ADD vid %d, return %d", ctrl.data.vid, -errno); + goto end; + } + + state->enabled = true; + state->shutdown_notifier.notify = psp_dev_shutdown_notify; + qemu_register_shutdown_notifier(&state->shutdown_notifier); +end: + return; +} + +static struct Property psp_dev_properties[] = { + DEFINE_PROP_UINT32("vid", PSPDevState, vid, 0), + DEFINE_PROP_END_OF_LIST(), +}; + +static void psp_dev_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + + dc->desc = "PSP Device"; + dc->realize = psp_dev_realize; + set_bit(DEVICE_CATEGORY_MISC, dc->categories); + device_class_set_props(dc, psp_dev_properties); +} + +static const TypeInfo psp_dev_info = { + .name = TYPE_PSP_DEV, + .parent = TYPE_DEVICE, + .instance_size = sizeof(PSPDevState), + .class_init = psp_dev_class_init, +}; + +static void psp_dev_register_types(void) +{ + type_register_static(&psp_dev_info); +} + +type_init(psp_dev_register_types) -- Gitee From 7b7742e137fbf9283cbbfb823fcf2ebe14df3154 Mon Sep 17 00:00:00 2001 From: gaochuanji Date: Mon, 19 Aug 2024 10:52:49 +0800 Subject: [PATCH 320/939] crypto: Introduce SM3 hash hmac pbkdf algorithm Introduce the SM3 cryptographic hash algorithm (GB/T 32905-2016). SM3 (GB/T 32905-2016) is a cryptographic standard issued by the Organization of State Commercial Cryptography Administration (OSCCA) as an authorized cryptographic algorithm for use within China. Detect the SM3 cryptographic hash algorithm and enable the feature silently if it is available. 
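As a usage illustration (not part of this patch), the new algorithm is reached through the existing qcrypto entry points; the helper name below is hypothetical, while qcrypto_hash_supports() and qcrypto_hash_digest() are the existing API and QCRYPTO_HASH_ALG_SM3 is the enum value this series adds.

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "crypto/hash.h"

/* Sketch: compute an SM3 digest of a buffer, reporting an error when the
 * configured crypto backend (gcrypt/nettle) lacks SM3 support. */
static char *example_sm3_hexdigest(const char *buf, size_t len, Error **errp)
{
    char *hex = NULL;

    if (!qcrypto_hash_supports(QCRYPTO_HASH_ALG_SM3)) {
        error_setg(errp, "SM3 not supported by this crypto backend");
        return NULL;
    }
    if (qcrypto_hash_digest(QCRYPTO_HASH_ALG_SM3, buf, len, &hex, errp) < 0) {
        return NULL;
    }
    /* 64 lowercase hex characters for the 32-byte SM3 digest; caller g_free()s it. */
    return hex;
}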
Signed-off-by: cheliequan --- crypto/hash-gcrypt.c | 3 +++ crypto/hash-nettle.c | 14 ++++++++++++ crypto/hash.c | 3 +++ crypto/hmac-gcrypt.c | 3 +++ crypto/hmac-nettle.c | 11 ++++++++++ crypto/pbkdf-gcrypt.c | 6 ++++++ crypto/pbkdf-nettle.c | 13 ++++++++++++ meson.build | 39 ++++++++++++++++++++++++++++++++++ qapi/crypto.json | 4 +++- tests/unit/test-crypto-hash.c | 16 ++++++++++++++ tests/unit/test-crypto-hmac.c | 8 +++++++ tests/unit/test-crypto-pbkdf.c | 16 ++++++++++++++ 12 files changed, 135 insertions(+), 1 deletion(-) diff --git a/crypto/hash-gcrypt.c b/crypto/hash-gcrypt.c index 829e48258d..d3bdfe5633 100644 --- a/crypto/hash-gcrypt.c +++ b/crypto/hash-gcrypt.c @@ -33,6 +33,9 @@ static int qcrypto_hash_alg_map[QCRYPTO_HASH_ALG__MAX] = { [QCRYPTO_HASH_ALG_SHA384] = GCRY_MD_SHA384, [QCRYPTO_HASH_ALG_SHA512] = GCRY_MD_SHA512, [QCRYPTO_HASH_ALG_RIPEMD160] = GCRY_MD_RMD160, +#ifdef CONFIG_CRYPTO_SM3 + [QCRYPTO_HASH_ALG_SM3] = GCRY_MD_SM3, +#endif }; gboolean qcrypto_hash_supports(QCryptoHashAlgorithm alg) diff --git a/crypto/hash-nettle.c b/crypto/hash-nettle.c index 1ca1a41062..0c2f8ce86c 100644 --- a/crypto/hash-nettle.c +++ b/crypto/hash-nettle.c @@ -25,6 +25,9 @@ #include #include #include +#ifdef CONFIG_CRYPTO_SM3 +#include +#endif typedef void (*qcrypto_nettle_init)(void *ctx); typedef void (*qcrypto_nettle_write)(void *ctx, @@ -42,6 +45,9 @@ union qcrypto_hash_ctx { struct sha384_ctx sha384; struct sha512_ctx sha512; struct ripemd160_ctx ripemd160; +#ifdef CONFIG_CRYPTO_SM3 + struct sm3_ctx sm3; +#endif }; struct qcrypto_hash_alg { @@ -92,6 +98,14 @@ struct qcrypto_hash_alg { .result = (qcrypto_nettle_result)ripemd160_digest, .len = RIPEMD160_DIGEST_SIZE, }, +#ifdef CONFIG_CRYPTO_SM3 + [QCRYPTO_HASH_ALG_SM3] = { + .init = (qcrypto_nettle_init)sm3_init, + .write = (qcrypto_nettle_write)sm3_update, + .result = (qcrypto_nettle_result)sm3_digest, + .len = SM3_DIGEST_SIZE, + }, +#endif }; gboolean qcrypto_hash_supports(QCryptoHashAlgorithm alg) diff --git a/crypto/hash.c b/crypto/hash.c index b0f8228bdc..8f1502ce68 100644 --- a/crypto/hash.c +++ b/crypto/hash.c @@ -30,6 +30,9 @@ static size_t qcrypto_hash_alg_size[QCRYPTO_HASH_ALG__MAX] = { [QCRYPTO_HASH_ALG_SHA384] = 48, [QCRYPTO_HASH_ALG_SHA512] = 64, [QCRYPTO_HASH_ALG_RIPEMD160] = 20, +#ifdef CONFIG_CRYPTO_SM3 + [QCRYPTO_HASH_ALG_SM3] = 32, +#endif }; size_t qcrypto_hash_digest_len(QCryptoHashAlgorithm alg) diff --git a/crypto/hmac-gcrypt.c b/crypto/hmac-gcrypt.c index 0c6f979711..888afb86ed 100644 --- a/crypto/hmac-gcrypt.c +++ b/crypto/hmac-gcrypt.c @@ -26,6 +26,9 @@ static int qcrypto_hmac_alg_map[QCRYPTO_HASH_ALG__MAX] = { [QCRYPTO_HASH_ALG_SHA384] = GCRY_MAC_HMAC_SHA384, [QCRYPTO_HASH_ALG_SHA512] = GCRY_MAC_HMAC_SHA512, [QCRYPTO_HASH_ALG_RIPEMD160] = GCRY_MAC_HMAC_RMD160, +#ifdef CONFIG_CRYPTO_SM3 + [QCRYPTO_HASH_ALG_SM3] = GCRY_MAC_HMAC_SM3, +#endif }; typedef struct QCryptoHmacGcrypt QCryptoHmacGcrypt; diff --git a/crypto/hmac-nettle.c b/crypto/hmac-nettle.c index 1ad6c4f253..e51e3319ab 100644 --- a/crypto/hmac-nettle.c +++ b/crypto/hmac-nettle.c @@ -38,6 +38,9 @@ struct QCryptoHmacNettle { struct hmac_sha256_ctx sha256_ctx; /* equals hmac_sha224_ctx */ struct hmac_sha512_ctx sha512_ctx; /* equals hmac_sha384_ctx */ struct hmac_ripemd160_ctx ripemd160_ctx; +#ifdef CONFIG_CRYPTO_SM3 + struct hmac_sm3_ctx ctx; +#endif } u; }; @@ -89,6 +92,14 @@ struct qcrypto_nettle_hmac_alg { .digest = (qcrypto_nettle_hmac_digest)hmac_ripemd160_digest, .len = RIPEMD160_DIGEST_SIZE, }, +#ifdef CONFIG_CRYPTO_SM3 + [QCRYPTO_HASH_ALG_SM3] = { + 
.setkey = (qcrypto_nettle_hmac_setkey)hmac_sm3_set_key, + .update = (qcrypto_nettle_hmac_update)hmac_sm3_update, + .digest = (qcrypto_nettle_hmac_digest)hmac_sm3_digest, + .len = SM3_DIGEST_SIZE, + }, +#endif }; bool qcrypto_hmac_supports(QCryptoHashAlgorithm alg) diff --git a/crypto/pbkdf-gcrypt.c b/crypto/pbkdf-gcrypt.c index a8d8e64f4d..09b38d0d6e 100644 --- a/crypto/pbkdf-gcrypt.c +++ b/crypto/pbkdf-gcrypt.c @@ -33,6 +33,9 @@ bool qcrypto_pbkdf2_supports(QCryptoHashAlgorithm hash) case QCRYPTO_HASH_ALG_SHA384: case QCRYPTO_HASH_ALG_SHA512: case QCRYPTO_HASH_ALG_RIPEMD160: +#ifdef CONFIG_CRYPTO_SM3 + case QCRYPTO_HASH_ALG_SM3: +#endif return true; default: return false; @@ -54,6 +57,9 @@ int qcrypto_pbkdf2(QCryptoHashAlgorithm hash, [QCRYPTO_HASH_ALG_SHA384] = GCRY_MD_SHA384, [QCRYPTO_HASH_ALG_SHA512] = GCRY_MD_SHA512, [QCRYPTO_HASH_ALG_RIPEMD160] = GCRY_MD_RMD160, +#ifdef CONFIG_CRYPTO_SM3 + [QCRYPTO_HASH_ALG_SM3] = GCRY_MD_SM3, +#endif }; int ret; diff --git a/crypto/pbkdf-nettle.c b/crypto/pbkdf-nettle.c index d6293c25a1..5fea570bd3 100644 --- a/crypto/pbkdf-nettle.c +++ b/crypto/pbkdf-nettle.c @@ -34,6 +34,9 @@ bool qcrypto_pbkdf2_supports(QCryptoHashAlgorithm hash) case QCRYPTO_HASH_ALG_SHA384: case QCRYPTO_HASH_ALG_SHA512: case QCRYPTO_HASH_ALG_RIPEMD160: +#ifdef CONFIG_CRYPTO_SM3 + case QCRYPTO_HASH_ALG_SM3: +#endif return true; default: return false; @@ -55,6 +58,9 @@ int qcrypto_pbkdf2(QCryptoHashAlgorithm hash, struct hmac_sha384_ctx sha384; struct hmac_sha512_ctx sha512; struct hmac_ripemd160_ctx ripemd160; +#ifdef CONFIG_CRYPTO_SM3 + struct hmac_sm3_ctx sm3; +#endif } ctx; if (iterations > UINT_MAX) { @@ -106,6 +112,13 @@ int qcrypto_pbkdf2(QCryptoHashAlgorithm hash, PBKDF2(&ctx.ripemd160, hmac_ripemd160_update, hmac_ripemd160_digest, RIPEMD160_DIGEST_SIZE, iterations, nsalt, salt, nout, out); break; +#ifdef CONFIG_CRYPTO_SM3 + case QCRYPTO_HASH_ALG_SM3: + hmac_sm3_set_key(&ctx.sm3, nkey, key); + PBKDF2(&ctx.sm3, hmac_sm3_update, hmac_sm3_digest, + SM3_DIGEST_SIZE, iterations, nsalt, salt, nout, out); + break; +#endif default: error_setg_errno(errp, ENOSYS, diff --git a/meson.build b/meson.build index 089f45d386..4024f9a4bb 100644 --- a/meson.build +++ b/meson.build @@ -1486,6 +1486,7 @@ gcrypt = not_found nettle = not_found hogweed = not_found crypto_sm4 = not_found +crypto_sm3 = not_found xts = 'none' if get_option('nettle').enabled() and get_option('gcrypt').enabled() @@ -1522,6 +1523,17 @@ if not gnutls_crypto.found() }''', dependencies: gcrypt) crypto_sm4 = not_found endif + crypto_sm3 = gcrypt + # SM3 ALG is available in libgcrypt >= 1.8 + if gcrypt.found() and not cc.links(''' + #include + int main(void) { + gcry_md_hd_t handler; + gcry_md_open(&handler, GCRY_MD_SM3, 0); + return 0; + }''', dependencies: gcrypt) + crypto_sm3 = not_found + endif endif if (not get_option('nettle').auto() or have_system) and not gcrypt.found() nettle = dependency('nettle', version: '>=3.4', @@ -1542,6 +1554,31 @@ if not gnutls_crypto.found() }''', dependencies: nettle) crypto_sm4 = not_found endif + crypto_sm3 = nettle + # SM3 ALG is available in nettle >= 3.4 + if nettle.found() and not cc.links(''' + #include + #include + int main(void) { + struct sm3_ctx ctx; + struct hmac_sm3_ctx hmac_ctx; + unsigned char data[64] = {0}; + unsigned char output[32]; + + // SM3 hash function test + sm3_init(&ctx); + sm3_update(&ctx, 64, data); + sm3_digest(&ctx, 32, data); + + // HMAC-SM3 test + hmac_sm3_set_key(&hmac_ctx, 32, data); + hmac_sm3_update(&hmac_ctx, 64, data); + hmac_sm3_digest(&hmac_ctx, 
32, output); + + return 0; + }''', dependencies: nettle) + crypto_sm3 = not_found + endif endif endif @@ -2229,6 +2266,7 @@ config_host_data.set('CONFIG_TASN1', tasn1.found()) config_host_data.set('CONFIG_GCRYPT', gcrypt.found()) config_host_data.set('CONFIG_NETTLE', nettle.found()) config_host_data.set('CONFIG_CRYPTO_SM4', crypto_sm4.found()) +config_host_data.set('CONFIG_CRYPTO_SM3', crypto_sm3.found()) config_host_data.set('CONFIG_HOGWEED', hogweed.found()) config_host_data.set('CONFIG_QEMU_PRIVATE_XTS', xts == 'private') config_host_data.set('CONFIG_MALLOC_TRIM', has_malloc_trim) @@ -4306,6 +4344,7 @@ if nettle.found() summary_info += {' XTS': xts != 'private'} endif summary_info += {'SM4 ALG support': crypto_sm4} +summary_info += {'SM3 ALG support': crypto_sm3} summary_info += {'AF_ALG support': have_afalg} summary_info += {'rng-none': get_option('rng_none')} summary_info += {'Linux keyring': have_keyring} diff --git a/qapi/crypto.json b/qapi/crypto.json index 2f2aeff5fd..af38f0a4bd 100644 --- a/qapi/crypto.json +++ b/qapi/crypto.json @@ -58,11 +58,13 @@ # # @ripemd160: RIPEMD-160. (since 2.7) # +# @sm3: SM3. (since 8.2.0) +# # Since: 2.6 ## { 'enum': 'QCryptoHashAlgorithm', 'prefix': 'QCRYPTO_HASH_ALG', - 'data': ['md5', 'sha1', 'sha224', 'sha256', 'sha384', 'sha512', 'ripemd160']} + 'data': ['md5', 'sha1', 'sha224', 'sha256', 'sha384', 'sha512', 'ripemd160', 'sm3']} ## # @QCryptoCipherAlgorithm: diff --git a/tests/unit/test-crypto-hash.c b/tests/unit/test-crypto-hash.c index 1f4abb822b..61908e1769 100644 --- a/tests/unit/test-crypto-hash.c +++ b/tests/unit/test-crypto-hash.c @@ -42,6 +42,9 @@ "63b54e4cb2d2032b393994aa263c0dbb" \ "e00a9f2fe9ef6037352232a1eec55ee7" #define OUTPUT_RIPEMD160 "f3d658fad3fdfb2b52c9369cf0d441249ddfa8a0" +#ifdef CONFIG_CRYPTO_SM3 +#define OUTPUT_SM3 "d4a97db105b477b84c4f20ec9c31a6c814e2705a0b83a5a89748d75f0ef456a1" +#endif #define OUTPUT_MD5_B64 "Yo0gY3FWMDWrjvYvSSveyQ==" #define OUTPUT_SHA1_B64 "sudPJnWKOkIeUJzuBFJEt4dTzAI=" @@ -54,6 +57,10 @@ "7sVe5w==" #define OUTPUT_RIPEMD160_B64 "89ZY+tP9+ytSyTac8NRBJJ3fqKA=" +#ifdef CONFIG_CRYPTO_SM3 +#define OUTPUT_SM3_B64 "1Kl9sQW0d7hMTyDsnDGmyBTicFoLg6Wol0jXXw70VqE=" +#endif + static const char *expected_outputs[] = { [QCRYPTO_HASH_ALG_MD5] = OUTPUT_MD5, [QCRYPTO_HASH_ALG_SHA1] = OUTPUT_SHA1, @@ -62,6 +69,9 @@ static const char *expected_outputs[] = { [QCRYPTO_HASH_ALG_SHA384] = OUTPUT_SHA384, [QCRYPTO_HASH_ALG_SHA512] = OUTPUT_SHA512, [QCRYPTO_HASH_ALG_RIPEMD160] = OUTPUT_RIPEMD160, +#ifdef CONFIG_CRYPTO_SM3 + [QCRYPTO_HASH_ALG_SM3] = OUTPUT_SM3, +#endif }; static const char *expected_outputs_b64[] = { [QCRYPTO_HASH_ALG_MD5] = OUTPUT_MD5_B64, @@ -71,6 +81,9 @@ static const char *expected_outputs_b64[] = { [QCRYPTO_HASH_ALG_SHA384] = OUTPUT_SHA384_B64, [QCRYPTO_HASH_ALG_SHA512] = OUTPUT_SHA512_B64, [QCRYPTO_HASH_ALG_RIPEMD160] = OUTPUT_RIPEMD160_B64, +#ifdef CONFIG_CRYPTO_SM3 + [QCRYPTO_HASH_ALG_SM3] = OUTPUT_SM3_B64, +#endif }; static const int expected_lens[] = { [QCRYPTO_HASH_ALG_MD5] = 16, @@ -80,6 +93,9 @@ static const int expected_lens[] = { [QCRYPTO_HASH_ALG_SHA384] = 48, [QCRYPTO_HASH_ALG_SHA512] = 64, [QCRYPTO_HASH_ALG_RIPEMD160] = 20, +#ifdef CONFIG_CRYPTO_SM3 + [QCRYPTO_HASH_ALG_SM3] = 32, +#endif }; static const char hex[] = "0123456789abcdef"; diff --git a/tests/unit/test-crypto-hmac.c b/tests/unit/test-crypto-hmac.c index 23eb724d94..b1d04e9fcc 100644 --- a/tests/unit/test-crypto-hmac.c +++ b/tests/unit/test-crypto-hmac.c @@ -76,6 +76,14 @@ static QCryptoHmacTestData test_data[] = { 
"94964ed4c1155b62b668c241d67279e5" "8a711676", }, +#ifdef CONFIG_CRYPTO_SM3 + { + .alg = QCRYPTO_HASH_ALG_SM3, + .hex_digest = + "760e3799332bc913819b930085360ddb" + "c05529261313d5b15b75bab4fd7ae91e", + }, +#endif }; static const char hex[] = "0123456789abcdef"; diff --git a/tests/unit/test-crypto-pbkdf.c b/tests/unit/test-crypto-pbkdf.c index 43c417f6b4..3d76593c86 100644 --- a/tests/unit/test-crypto-pbkdf.c +++ b/tests/unit/test-crypto-pbkdf.c @@ -326,6 +326,22 @@ static QCryptoPbkdfTestData test_data[] = { "\xce\xbf\x91\x14\x8b\x5c\x48\x41", .nout = 32 }, +#ifdef CONFIG_CRYPTO_SM3 + { + .path = "/crypto/pbkdf/nonrfc/sm3/iter2", + .hash = QCRYPTO_HASH_ALG_SM3, + .iterations = 2, + .key = "password", + .nkey = 8, + .salt = "ATHENA.MIT.EDUraeburn", + .nsalt = 21, + .out = "\x48\x71\x1b\x58\xa3\xcb\xce\x06" + "\xba\xad\x77\xa8\xb5\xb9\xd8\x07" + "\x6a\xe2\xb3\x5b\x95\xce\xc8\xce" + "\xe7\xb1\xcb\xee\x61\xdf\x04\xea", + .nout = 32 + }, +#endif #if 0 { .path = "/crypto/pbkdf/nonrfc/whirlpool/iter1200", -- Gitee From 282d63f9b5915f0529e9d0ae54b47c0ceacc58c3 Mon Sep 17 00:00:00 2001 From: liupingwei Date: Mon, 19 Aug 2024 15:38:23 +0800 Subject: [PATCH 321/939] cvm : bug-fix for incorrect device name check for vhost-user-fs The 'vhost-user-fs' was being parsed as 'virtio-user-fs' during the compilation and this caused the device to erroneously trigger the error branch. Fixes: 5db954cb188d3775aec053fad8a39bf4c26a2b92("Add support for the virtcca cvm feature.) Signed-off-by: liupingwei --- hw/virtio/virtio-bus.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/virtio/virtio-bus.c b/hw/virtio/virtio-bus.c index 7e750d073d..4f16e7ef77 100644 --- a/hw/virtio/virtio-bus.c +++ b/hw/virtio/virtio-bus.c @@ -83,7 +83,7 @@ void virtio_bus_device_plugged(VirtIODevice *vdev, Error **errp) if (has_iommu) { vdev_has_iommu = virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM); - if (virtcca_cvm_enabled() && (strcmp(vdev->name, "vhost-user-fs") == 0)) { + if (virtcca_cvm_enabled() && (strcmp(vdev->name, "virtio-user-fs") == 0)) { vdev_has_iommu = true; } -- Gitee From 8f6c35e3acb54208564fcb773cf79809d7412cf5 Mon Sep 17 00:00:00 2001 From: qihao Date: Tue, 20 Aug 2024 09:48:42 +0800 Subject: [PATCH 322/939] virtio-net: Use virtual time for RSC timers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cheery-pick from 44bc14fa1e78f01bfddcb265fc41c29204ebbfd8 Receive coalescing is visible to the target machine, so its timers should use virtual time like other timers in virtio-net, to be compatible with record-replay. Signed-off-by: Nicholas Piggin Message-Id: <20240813050638.446172-10-npiggin@gmail.com> Acked-by: Michael S. 
Tsirkin Signed-off-by: Alex Bennée Message-Id: <20240813202329.1237572-18-alex.bennee@linaro.org> Signed-off-by: qihao_yewu --- hw/net/virtio-net.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index c0a54f2d61..91c1504544 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -2141,7 +2141,7 @@ static void virtio_net_rsc_purge(void *opq) chain->stat.timer++; if (!QTAILQ_EMPTY(&chain->buffers)) { timer_mod(chain->drain_timer, - qemu_clock_get_ns(QEMU_CLOCK_HOST) + chain->n->rsc_timeout); + qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + chain->n->rsc_timeout); } } @@ -2377,7 +2377,7 @@ static size_t virtio_net_rsc_do_coalesce(VirtioNetRscChain *chain, chain->stat.empty_cache++; virtio_net_rsc_cache_buf(chain, nc, buf, size); timer_mod(chain->drain_timer, - qemu_clock_get_ns(QEMU_CLOCK_HOST) + chain->n->rsc_timeout); + qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + chain->n->rsc_timeout); return size; } @@ -2615,7 +2615,7 @@ static VirtioNetRscChain *virtio_net_rsc_lookup_chain(VirtIONet *n, chain->max_payload = VIRTIO_NET_MAX_IP6_PAYLOAD; chain->gso_type = VIRTIO_NET_HDR_GSO_TCPV6; } - chain->drain_timer = timer_new_ns(QEMU_CLOCK_HOST, + chain->drain_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, virtio_net_rsc_purge, chain); memset(&chain->stat, 0, sizeof(chain->stat)); -- Gitee From b78860242162ab5ef1e73973eeca36e0261bfeb5 Mon Sep 17 00:00:00 2001 From: xiaoyuliang Date: Wed, 21 Aug 2024 11:26:41 +0800 Subject: [PATCH 323/939] Add if condition to avoid assertion failed error in blockdev_init --- blockdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/blockdev.c b/blockdev.c index bc7a947dea..d2fe5c361c 100644 --- a/blockdev.c +++ b/blockdev.c @@ -588,7 +588,7 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts, read_only = qemu_opt_get_bool(opts, BDRV_OPT_READ_ONLY, false); - if (!file || !*file) { + if ((!file || !*file) && qdict_size(bs_opts) == 2) { cache = qdict_get_try_str(bs_opts, BDRV_OPT_CACHE_NO_FLUSH); if (cache && !strcmp(cache, "on")) { bdrv_flags |= BDRV_O_NO_FLUSH; -- Gitee From f2efa9729b4cb4ec98f93c1eafe38459fd82e7ae Mon Sep 17 00:00:00 2001 From: qihao Date: Mon, 26 Aug 2024 09:34:05 +0800 Subject: [PATCH 324/939] hw/display/vhost-user-gpu.c: fix vhost_user_gpu_chr_read() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cheery-pick from d6192f3f7593536a4285e8ab6c6cf3f34973ce62 fix vhost_user_gpu_chr_read() where `size` was incorrectly passed to `msg->flags`. 
Fixes: 267f664658 ("hw/display: add vhost-user-vga & gpu-pci") Signed-off-by: Haoran Zhang Reviewed-by: Marc-André Lureau Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Michael Tokarev Signed-off-by: Michael Tokarev Signed-off-by: qihao_yewu --- hw/display/vhost-user-gpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/display/vhost-user-gpu.c b/hw/display/vhost-user-gpu.c index 709c8a02a1..373f04a7b4 100644 --- a/hw/display/vhost-user-gpu.c +++ b/hw/display/vhost-user-gpu.c @@ -385,7 +385,7 @@ vhost_user_gpu_chr_read(void *opaque) } msg->request = request; - msg->flags = size; + msg->flags = flags; msg->size = size; if (request == VHOST_USER_GPU_CURSOR_UPDATE || -- Gitee From 80f4d02d7afa212fba4420a3af04f3a670b9a5d4 Mon Sep 17 00:00:00 2001 From: qihao Date: Mon, 26 Aug 2024 10:40:40 +0800 Subject: [PATCH 325/939] hw/nvme: fix leak of uninitialized memory in io_mgmt_recv cheery-pick from 6a22121c4f25b181e99479f65958ecde65da1c92 Yutaro Shimizu from the Cyber Defense Institute discovered a bug in the NVMe emulation that leaks contents of an uninitialized heap buffer if subsystem and FDP emulation are enabled. Cc: qemu-stable@nongnu.org Reported-by: Yutaro Shimizu Signed-off-by: Klaus Jensen Signed-off-by: qihao_yewu --- hw/nvme/ctrl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index aecf7c37bb..104aebc5ea 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -4302,7 +4302,7 @@ static uint16_t nvme_io_mgmt_recv_ruhs(NvmeCtrl *n, NvmeRequest *req, nruhsd = ns->fdp.nphs * endgrp->fdp.nrg; trans_len = sizeof(NvmeRuhStatus) + nruhsd * sizeof(NvmeRuhStatusDescr); - buf = g_malloc(trans_len); + buf = g_malloc0(trans_len); trans_len = MIN(trans_len, len); -- Gitee From ec07000764f578bb7cd21fe73c8e649a183d7674 Mon Sep 17 00:00:00 2001 From: qihao Date: Mon, 26 Aug 2024 10:56:57 +0800 Subject: [PATCH 326/939] crypto/tlscredspsk: Free username on finalize MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cheery-pick from 87e012f29f2e47dcd8c385ff8bb8188f9e06d4ea When the creds->username property is set we allocate memory for it in qcrypto_tls_creds_psk_prop_set_username(), but we never free this when the QCryptoTLSCredsPSK is destroyed. Free the memory in finalize. 
This fixes a LeakSanitizer complaint in migration-test: $ (cd build/asan; ASAN_OPTIONS="fast_unwind_on_malloc=0" QTEST_QEMU_BINARY=./qemu-system-x86_64 ./tests/qtest/migration-test --tap -k -p /x86_64/migration/precopy/unix/tls/psk) ================================================================= ==3867512==ERROR: LeakSanitizer: detected memory leaks Direct leak of 5 byte(s) in 1 object(s) allocated from: #0 0x5624e5c99dee in malloc (/mnt/nvmedisk/linaro/qemu-from-laptop/qemu/build/asan/qemu-system-x86_64+0x218edee) (BuildId: a9e623fa1009a9435c0142c037cd7b8c1ad04ce3) #1 0x7fb199ae9738 in g_malloc debian/build/deb/../../../glib/gmem.c:128:13 #2 0x7fb199afe583 in g_strdup debian/build/deb/../../../glib/gstrfuncs.c:361:17 #3 0x5624e82ea919 in qcrypto_tls_creds_psk_prop_set_username /mnt/nvmedisk/linaro/qemu-from-laptop/qemu/build/asan/../../crypto/tlscredspsk.c:255:23 #4 0x5624e812c6b5 in property_set_str /mnt/nvmedisk/linaro/qemu-from-laptop/qemu/build/asan/../../qom/object.c:2277:5 #5 0x5624e8125ce5 in object_property_set /mnt/nvmedisk/linaro/qemu-from-laptop/qemu/build/asan/../../qom/object.c:1463:5 #6 0x5624e8136e7c in object_set_properties_from_qdict /mnt/nvmedisk/linaro/qemu-from-laptop/qemu/build/asan/../../qom/object_interfaces.c:55:14 #7 0x5624e81372d2 in user_creatable_add_type /mnt/nvmedisk/linaro/qemu-from-laptop/qemu/build/asan/../../qom/object_interfaces.c:112:5 #8 0x5624e8137964 in user_creatable_add_qapi /mnt/nvmedisk/linaro/qemu-from-laptop/qemu/build/asan/../../qom/object_interfaces.c:157:11 #9 0x5624e891ba3c in qmp_object_add /mnt/nvmedisk/linaro/qemu-from-laptop/qemu/build/asan/../../qom/qom-qmp-cmds.c:227:5 #10 0x5624e8af9118 in qmp_marshal_object_add /mnt/nvmedisk/linaro/qemu-from-laptop/qemu/build/asan/qapi/qapi-commands-qom.c:337:5 #11 0x5624e8bd1d49 in do_qmp_dispatch_bh /mnt/nvmedisk/linaro/qemu-from-laptop/qemu/build/asan/../../qapi/qmp-dispatch.c:128:5 #12 0x5624e8cb2531 in aio_bh_call /mnt/nvmedisk/linaro/qemu-from-laptop/qemu/build/asan/../../util/async.c:171:5 #13 0x5624e8cb340c in aio_bh_poll /mnt/nvmedisk/linaro/qemu-from-laptop/qemu/build/asan/../../util/async.c:218:13 #14 0x5624e8c0be98 in aio_dispatch /mnt/nvmedisk/linaro/qemu-from-laptop/qemu/build/asan/../../util/aio-posix.c:423:5 #15 0x5624e8cba3ce in aio_ctx_dispatch /mnt/nvmedisk/linaro/qemu-from-laptop/qemu/build/asan/../../util/async.c:360:5 #16 0x7fb199ae0d3a in g_main_dispatch debian/build/deb/../../../glib/gmain.c:3419:28 #17 0x7fb199ae0d3a in g_main_context_dispatch debian/build/deb/../../../glib/gmain.c:4137:7 #18 0x5624e8cbe1d9 in glib_pollfds_poll /mnt/nvmedisk/linaro/qemu-from-laptop/qemu/build/asan/../../util/main-loop.c:287:9 #19 0x5624e8cbcb13 in os_host_main_loop_wait /mnt/nvmedisk/linaro/qemu-from-laptop/qemu/build/asan/../../util/main-loop.c:310:5 #20 0x5624e8cbc6dc in main_loop_wait /mnt/nvmedisk/linaro/qemu-from-laptop/qemu/build/asan/../../util/main-loop.c:589:11 #21 0x5624e6f3f917 in qemu_main_loop /mnt/nvmedisk/linaro/qemu-from-laptop/qemu/build/asan/../../system/runstate.c:801:9 #22 0x5624e893379c in qemu_default_main /mnt/nvmedisk/linaro/qemu-from-laptop/qemu/build/asan/../../system/main.c:37:14 #23 0x5624e89337e7 in main /mnt/nvmedisk/linaro/qemu-from-laptop/qemu/build/asan/../../system/main.c:48:12 #24 0x7fb197972d8f in __libc_start_call_main csu/../sysdeps/nptl/libc_start_call_main.h:58:16 #25 0x7fb197972e3f in __libc_start_main csu/../csu/libc-start.c:392:3 #26 0x5624e5c16fa4 in _start (/mnt/nvmedisk/linaro/qemu-from-laptop/qemu/build/asan/qemu-system-x86_64+0x210bfa4) 
(BuildId: a9e623fa1009a9435c0142c037cd7b8c1ad04ce3) SUMMARY: AddressSanitizer: 5 byte(s) leaked in 1 allocation(s). Cc: qemu-stable@nongnu.org Signed-off-by: Peter Maydell Reviewed-by: Daniel P. Berrangé Message-ID: <20240819145021.38524-1-peter.maydell@linaro.org> Signed-off-by: Philippe Mathieu-Daudé Signed-off-by: qihao_yewu --- crypto/tlscredspsk.c | 1 + 1 file changed, 1 insertion(+) diff --git a/crypto/tlscredspsk.c b/crypto/tlscredspsk.c index 546cad1c5a..0d6b71a37c 100644 --- a/crypto/tlscredspsk.c +++ b/crypto/tlscredspsk.c @@ -243,6 +243,7 @@ qcrypto_tls_creds_psk_finalize(Object *obj) QCryptoTLSCredsPSK *creds = QCRYPTO_TLS_CREDS_PSK(obj); qcrypto_tls_creds_psk_unload(creds); + g_free(creds->username); } static void -- Gitee From d29bc8738131dcaaa1a1ae2870ea29b59a137f30 Mon Sep 17 00:00:00 2001 From: xiongmengbiao Date: Wed, 29 May 2024 00:05:44 +0800 Subject: [PATCH 327/939] hw/i386: add mem2 option for qemu The '-mem2' option is used to create a set of hugepages of memory and map them to a fixed address range of the guest. This allows some devices to easily obtain continuous host physical address ranges for performing DMA operations. Signed-off-by: xiongmengbiao --- hw/i386/pc.c | 121 ++++++++++++++++++++++++++++++++++++++++++++ include/hw/boards.h | 2 + qemu-options.hx | 12 +++++ system/vl.c | 76 ++++++++++++++++++++++++++++ 4 files changed, 211 insertions(+) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 29b9964733..204e34db86 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -743,6 +743,111 @@ void xen_load_linux(PCMachineState *pcms) x86ms->fw_cfg = fw_cfg; } +static int try_create_2MB_page(uint32_t page_num) +{ + char nr_hp_num_s[256] = {0}; + char free_hp_num_s[256] = {0}; + const char *nr_hugepages_dir = "/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages"; + const char *free_hugepages_dir = "/sys/kernel/mm/hugepages/hugepages-2048kB/free_hugepages"; + int nr_hp_num = -1, free_hp_num = -1, ret = -1; + int nr_fd = qemu_open_old(nr_hugepages_dir, O_RDWR); + int free_fd = qemu_open_old(free_hugepages_dir, O_RDONLY); + + if (nr_fd < 0 || free_fd < 0) { + error_report("%s: qemu_open failed: %s\n", __func__, strerror(errno)); + goto end; + } + + if (read(nr_fd, nr_hp_num_s, 256) < 0) + goto end; + if (read(free_fd, free_hp_num_s, 256) < 0) + goto end; + + nr_hp_num = atoi(nr_hp_num_s); + free_hp_num = atoi(free_hp_num_s); + if (nr_hp_num < 0 || free_hp_num < 0) + goto end; + + if (page_num <= free_hp_num) { + ret = 0; + goto end; + } + + nr_hp_num += (page_num - free_hp_num); + snprintf (nr_hp_num_s, 256, "%d", nr_hp_num); + if (write(nr_fd, nr_hp_num_s, strlen(nr_hp_num_s)) < 0) + goto end; + + ret = 0; +end: + if (nr_fd >= 0) + close(nr_fd); + if (free_fd >= 0) + close(free_fd); + return ret; +} + +#define HUGEPAGE_NUM_MAX 128 +#define HUGEPAGE_SIZE (1024*1024*2) +static void mem2_init(MachineState *ms, MemoryRegion *system_memory) +{ + MemoryRegion *mem2_mr; + char mr_name[128] = {0}; + void *ram = NULL; + int ret = 0, lock_fd; + const char *lock_file = "/sys/kernel/mm/hugepages/hugepages-2048kB/nr_overcommit_hugepages"; + uint32_t page_num = ms->ram2_size / HUGEPAGE_SIZE, i; + + if (HUGEPAGE_NUM_MAX < page_num) { + error_report("\"-mem2 'size='\" needs to Less than %dM\n", + (HUGEPAGE_SIZE * HUGEPAGE_NUM_MAX) / (1024 * 1024)); + exit(EXIT_FAILURE); + } + + // Apply for hugepages from OS and use them, which needs to be synchronized + lock_fd = qemu_open_old(lock_file, O_WRONLY); + if (lock_fd < 0) { + error_report("%s: open %s failed: %s\n", __func__, lock_file, 
strerror(errno)); + exit(EXIT_FAILURE); + } + + while (qemu_lock_fd(lock_fd, 0, 0, true)) { + if (errno != EACCES && errno != EAGAIN) { + error_report("qemu_lock_fd failed: %s\n", strerror(errno)); + exit(EXIT_FAILURE); + } + } + + /** try to create hugepage. + * If there are enough free hugepages, then do nothing. + */ + ret = try_create_2MB_page(page_num); + if (ret) { + error_report("%s: Failed to allocate hugepage\n", __func__); + goto unlock; + } + + for (i = 0; i < page_num; ++i) { + mem2_mr = g_malloc(sizeof(*mem2_mr)); + ram = mmap(NULL, HUGEPAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE | MAP_HUGETLB, -1, 0); + if (ram == MAP_FAILED) { + error_report("%s: mmap failed: %s", __func__, strerror(errno)); + goto unlock; + } + + sprintf(mr_name, "mem2-%d", i); + memory_region_init_ram_ptr(mem2_mr, NULL, mr_name, HUGEPAGE_SIZE, ram); + memory_region_add_subregion(system_memory, ms->ram2_base + (i * HUGEPAGE_SIZE), mem2_mr); + } + + ret = 0; +unlock: + qemu_unlock_fd(lock_fd, 0, 0); + if (ret) + exit(EXIT_FAILURE); +} + #define PC_ROM_MIN_VGA 0xc0000 #define PC_ROM_MIN_OPTION 0xc8000 #define PC_ROM_MAX 0xe0000 @@ -965,6 +1070,22 @@ void pc_memory_init(PCMachineState *pcms, E820_RAM); } + if (machine->ram2_size && machine->ram2_base) { + if (0x100000000ULL + x86ms->above_4g_mem_size > machine->ram2_base) { + error_report("\"-mem2 'base'\" needs to greater 0x%llx\n", + 0x100000000ULL + x86ms->above_4g_mem_size); + exit(EXIT_FAILURE); + } + if (machine->ram2_base & (HUGEPAGE_SIZE - 1) || + machine->ram2_size & (HUGEPAGE_SIZE - 1)) { + error_report("\"-mem2 'base|size'\" needs to aligned to 0x%x\n", HUGEPAGE_SIZE); + exit(EXIT_FAILURE); + } + + mem2_init(machine, system_memory); + e820_add_entry(machine->ram2_base, machine->ram2_size, E820_RAM); + } + if (pcms->sgx_epc.size != 0) { e820_add_entry(pcms->sgx_epc.base, pcms->sgx_epc.size, E820_RESERVED); } diff --git a/include/hw/boards.h b/include/hw/boards.h index da85f86efb..8ac8cad2a2 100644 --- a/include/hw/boards.h +++ b/include/hw/boards.h @@ -389,6 +389,8 @@ struct MachineState { ram_addr_t ram_size; ram_addr_t maxram_size; + ram_addr_t ram2_base; + ram_addr_t ram2_size; uint64_t ram_slots; BootConfiguration boot_config; char *kernel_filename; diff --git a/qemu-options.hx b/qemu-options.hx index 42fd09e4de..bc8e66a037 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -5845,6 +5845,18 @@ SRST (qemu) qom-set /objects/iothread1 poll-max-ns 100000 ERST +DEF("mem2", HAS_ARG, QEMU_OPTION_mem2, + "-mem2 base=addr[G],size=n[MG]\n" + " Map guest memory using host hugepages\n" + " base: starting position of guest physical address\n" + " size: the size of mmaped memory\n" + "NOTE: Both `base` and `size` need to be aligned according to 2MB\n", + QEMU_ARCH_I386) +SRST +``-mem2 base=addr[G],size=n[MG]`` + Map the host's large page memory at the specified guest address + so that some devices can use larger contiguous physical memory. +ERST HXCOMM This is the last statement. Insert new options before this line! 
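As a usage illustration only (the command line and values below are hypothetical, not taken from the patch), a guest could be started with:

    qemu-system-x86_64 -m 4G -mem2 base=16G,size=128M ...

subject to the checks added in pc_memory_init() and mem2_init() above: base must lie above the end of the above-4G RAM region, base and size must be 2 MiB aligned, and size may not exceed HUGEPAGE_NUM_MAX (128) hugepages, i.e. 256 MiB.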
diff --git a/system/vl.c b/system/vl.c index 8e3357c578..a1e5e68773 100644 --- a/system/vl.c +++ b/system/vl.c @@ -173,6 +173,8 @@ static QemuPluginList plugin_list = QTAILQ_HEAD_INITIALIZER(plugin_list); static BlockdevOptionsQueue bdo_queue = QSIMPLEQ_HEAD_INITIALIZER(bdo_queue); static bool nographic = false; static int mem_prealloc; /* force preallocation of physical target memory */ +static ram_addr_t ram2_base; +static ram_addr_t ram2_size; static const char *vga_model = NULL; static DisplayOptions dpy; static int num_serial_hds; @@ -504,6 +506,23 @@ static QemuOptsList qemu_action_opts = { }, }; +static QemuOptsList qemu_mem2_opts = { + .name = "mem2", + .merge_lists = true, + .head = QTAILQ_HEAD_INITIALIZER(qemu_mem2_opts.head), + .desc = { + { + .name = "base", + .type = QEMU_OPT_SIZE, + }, + { + .name = "size", + .type = QEMU_OPT_SIZE, + }, + { /* end of list */ } + }, +}; + const char *qemu_get_vm_name(void) { return qemu_name; @@ -1932,6 +1951,9 @@ static void qemu_apply_machine_options(QDict *qdict) { object_set_properties_from_keyval(OBJECT(current_machine), qdict, false, &error_fatal); + current_machine->ram2_size = ram2_size; + current_machine->ram2_base = ram2_base; + if (semihosting_enabled(false) && !semihosting_get_argc()) { /* fall back to the -kernel/-append */ semihosting_arg_fallback(current_machine->kernel_filename, current_machine->kernel_cmdline); @@ -2094,11 +2116,57 @@ static void parse_memory_options(void) loc_pop(&loc); } +static void set_mem2_options(void) +{ + uint64_t sz, base; + const char *mem_str; + QemuOpts *opts = qemu_find_opts_singleton("mem2"); + Location loc; + + loc_push_none(&loc); + qemu_opts_loc_restore(opts); + + mem_str = qemu_opt_get(opts, "base"); + if (mem_str) { + if (!*mem_str) { + error_report("missing 'base' option value"); + exit(EXIT_FAILURE); + } + + base = qemu_opt_get_size(opts, "base", ram2_base); + ram2_base = base; + } + + mem_str = qemu_opt_get(opts, "size"); + if (mem_str) { + if (!*mem_str) { + error_report("missing 'base' option value"); + exit(EXIT_FAILURE); + } + + sz = qemu_opt_get_size(opts, "size", ram2_size); + ram2_size = sz; + } + + if (ram2_base && !ram2_size){ + error_report("missing 'size' option value"); + exit(EXIT_FAILURE); + } + if (!ram2_base && ram2_size){ + error_report("missing 'base' option value"); + exit(EXIT_FAILURE); + } + + loc_pop(&loc); +} + static void qemu_create_machine(QDict *qdict) { MachineClass *machine_class = select_machine(qdict, &error_fatal); object_set_machine_compat_props(machine_class->compat_props); + set_mem2_options(); + current_machine = MACHINE(object_new_with_class(OBJECT_CLASS(machine_class))); object_property_add_child(object_get_root(), "machine", OBJECT(current_machine)); @@ -2777,6 +2845,7 @@ void qemu_init(int argc, char **argv) qemu_add_opts(&qemu_semihosting_config_opts); qemu_add_opts(&qemu_fw_cfg_opts); qemu_add_opts(&qemu_action_opts); + qemu_add_opts(&qemu_mem2_opts); qemu_add_run_with_opts(); module_call_init(MODULE_INIT_OPTS); @@ -3596,6 +3665,13 @@ void qemu_init(int argc, char **argv) case QEMU_OPTION_nouserconfig: /* Nothing to be parsed here. Especially, do not error out below. 
*/ break; + case QEMU_OPTION_mem2: + opts = qemu_opts_parse_noisily(qemu_find_opts("mem2"), + optarg, false); + if (!opts) { + exit(EXIT_FAILURE); + } + break; #if defined(CONFIG_POSIX) case QEMU_OPTION_runas: if (!os_set_runas(optarg)) { -- Gitee From 884c4d6bc101454f0e0f3c779bc1155024b056c3 Mon Sep 17 00:00:00 2001 From: xiongmengbiao Date: Wed, 29 May 2024 15:18:55 +0800 Subject: [PATCH 328/939] hw/misc: support tkm use mem2 memory Signed-off-by: xiongmengbiao --- hw/misc/psp.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 63 insertions(+), 1 deletion(-) diff --git a/hw/misc/psp.c b/hw/misc/psp.c index 6ff2ceec10..4eb5ca0e0b 100644 --- a/hw/misc/psp.c +++ b/hw/misc/psp.c @@ -15,6 +15,9 @@ #include "migration/vmstate.h" #include "hw/qdev-properties.h" #include "sysemu/runstate.h" +#include "exec/memory.h" +#include "exec/address-spaces.h" +#include "hw/i386/e820_memory_layout.h" #include #define TYPE_PSP_DEV "psp" @@ -46,14 +49,24 @@ struct PSPDevState { enum VPSP_DEV_CTRL_OPCODE { VPSP_OP_VID_ADD, VPSP_OP_VID_DEL, + VPSP_OP_SET_DEFAULT_VID_PERMISSION, + VPSP_OP_GET_DEFAULT_VID_PERMISSION, + VPSP_OP_SET_GPA, }; struct psp_dev_ctrl { unsigned char op; + unsigned char resv[3]; union { unsigned int vid; + // Set or check the permissions for the default VID + unsigned int def_vid_perm; + struct { + uint64_t gpa_start; + uint64_t gpa_end; + } gpa; unsigned char reserved[128]; - } data; + } __attribute__ ((packed)) data; }; static void psp_dev_destroy(PSPDevState *state) @@ -86,10 +99,32 @@ static void psp_dev_shutdown_notify(Notifier *notifier, void *data) psp_dev_destroy(state); } +static MemoryRegion *find_memory_region_by_name(MemoryRegion *root, const char *name) { + MemoryRegion *subregion; + MemoryRegion *result; + + if (strcmp(root->name, name) == 0) + return root; + + QTAILQ_FOREACH(subregion, &root->subregions, subregions_link) { + result = find_memory_region_by_name(subregion, name); + if (result) { + return result; + } + } + + return NULL; +} + static void psp_dev_realize(DeviceState *dev, Error **errp) { + int i; + char mr_name[128] = {0}; struct psp_dev_ctrl ctrl = { 0 }; PSPDevState *state = PSP_DEV(dev); + MemoryRegion *root_mr = get_system_memory(); + MemoryRegion *find_mr = NULL; + uint64_t ram2_start = 0, ram2_end = 0; state->dev_fd = qemu_open_old(PSP_DEV_PATH, O_RDWR); if (state->dev_fd < 0) { @@ -104,9 +139,36 @@ static void psp_dev_realize(DeviceState *dev, Error **errp) goto end; } + for (i = 0 ;; ++i) { + sprintf(mr_name, "mem2-%d", i); + find_mr = find_memory_region_by_name(root_mr, mr_name); + if (!find_mr) + break; + + if (!ram2_start) + ram2_start = find_mr->addr; + ram2_end = find_mr->addr + find_mr->size - 1; + } + + if (ram2_start != ram2_end) { + ctrl.op = VPSP_OP_SET_GPA; + ctrl.data.gpa.gpa_start = ram2_start; + ctrl.data.gpa.gpa_end = ram2_end; + if (ioctl(state->dev_fd, PSP_IOC_VPSP_OPT, &ctrl) < 0) { + error_setg(errp, "psp_dev_realize VPSP_OP_SET_GPA (start 0x%lx, end 0x%lx), return %d", + ram2_start, ram2_end, -errno); + goto del_vid; + } + } + state->enabled = true; state->shutdown_notifier.notify = psp_dev_shutdown_notify; qemu_register_shutdown_notifier(&state->shutdown_notifier); + + return; +del_vid: + ctrl.op = VPSP_OP_VID_DEL; + ioctl(state->dev_fd, PSP_IOC_VPSP_OPT, &ctrl); end: return; } -- Gitee From 384b3f41fd69ed6f5bf376ff1aac1a12deeea0fb Mon Sep 17 00:00:00 2001 From: liupingwei Date: Fri, 16 Aug 2024 18:06:10 +0800 Subject: [PATCH 329/939] cvm : Implement command blacklist for cvm security enhancement Added a new 
feature to intercept and block specific virsh commands(virsh save,virsh restore,virsh dump,virsh suspend,virsh resume)that can impact the security of cvm. Signed-off-by: liupingwei --- dump/dump.c | 7 +++++++ migration/migration-hmp-cmds.c | 6 ++++++ migration/savevm.c | 6 ++++++ monitor/qmp-cmds.c | 6 ++++++ 4 files changed, 25 insertions(+) diff --git a/dump/dump.c b/dump/dump.c index 4819050764..787059ac2c 100644 --- a/dump/dump.c +++ b/dump/dump.c @@ -20,6 +20,7 @@ #include "sysemu/dump.h" #include "sysemu/runstate.h" #include "sysemu/cpus.h" +#include "sysemu/kvm.h" #include "qapi/error.h" #include "qapi/qapi-commands-dump.h" #include "qapi/qapi-events-dump.h" @@ -2065,6 +2066,12 @@ void qmp_dump_guest_memory(bool paging, const char *protocol, Error **errp) { ERRP_GUARD(); + + if (virtcca_cvm_enabled()) { + error_setg(errp, "The dump-guest-memory command is temporarily unsupported in cvm."); + return; + } + const char *p; int fd; DumpState *s; diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c index 1fa6a5f478..386ba7fc98 100644 --- a/migration/migration-hmp-cmds.c +++ b/migration/migration-hmp-cmds.c @@ -30,6 +30,7 @@ #include "sysemu/runstate.h" #include "ui/qemu-spice.h" #include "sysemu/sysemu.h" +#include "sysemu/kvm.h" #include "options.h" #include "migration.h" @@ -406,6 +407,11 @@ void hmp_loadvm(Monitor *mon, const QDict *qdict) const char *name = qdict_get_str(qdict, "name"); Error *err = NULL; + if (virtcca_cvm_enabled()) { + error_setg(&err, "The loadvm command is temporarily unsupported in cvm."); + return; + } + vm_stop(RUN_STATE_RESTORE_VM); if (load_snapshot(name, NULL, false, NULL, &err) && saved_vm_running) { diff --git a/migration/savevm.c b/migration/savevm.c index 477a19719f..cc65da605e 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -61,6 +61,7 @@ #include "sysemu/replay.h" #include "sysemu/runstate.h" #include "sysemu/sysemu.h" +#include "sysemu/kvm.h" #include "sysemu/xen.h" #include "migration/colo.h" #include "qemu/bitmap.h" @@ -3044,6 +3045,11 @@ int qemu_loadvm_approve_switchover(void) bool save_snapshot(const char *name, bool overwrite, const char *vmstate, bool has_devices, strList *devices, Error **errp) { + if (virtcca_cvm_enabled()) { + error_setg(errp, "The savevm command is temporarily unsupported in cvm."); + return false; + } + BlockDriverState *bs; QEMUSnapshotInfo sn1, *sn = &sn1; int ret = -1, ret2; diff --git a/monitor/qmp-cmds.c b/monitor/qmp-cmds.c index e78462b857..c0b66f11bf 100644 --- a/monitor/qmp-cmds.c +++ b/monitor/qmp-cmds.c @@ -23,6 +23,7 @@ #include "sysemu/runstate.h" #include "sysemu/runstate-action.h" #include "sysemu/block-backend.h" +#include "sysemu/kvm.h" #include "qapi/error.h" #include "qapi/qapi-init-commands.h" #include "qapi/qapi-commands-control.h" @@ -50,6 +51,11 @@ void qmp_quit(Error **errp) void qmp_stop(Error **errp) { + if (virtcca_cvm_enabled()) { + error_setg(errp, "The stop command is temporarily unsupported in cvm."); + return; + } + /* if there is a dump in background, we should wait until the dump * finished */ if (qemu_system_dump_in_progress()) { -- Gitee From c31f85b015326ad6619c707ada5cea2713970741 Mon Sep 17 00:00:00 2001 From: lixiang_yewu Date: Mon, 2 Sep 2024 07:35:57 +0000 Subject: [PATCH 330/939] update docs/tools/virtfs-proxy-helper.rst. This place is spelled wrong. 
Signed-off-by: lixiang_yewu --- docs/tools/virtfs-proxy-helper.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tools/virtfs-proxy-helper.rst b/docs/tools/virtfs-proxy-helper.rst index bd310ebb07..175b480926 100644 --- a/docs/tools/virtfs-proxy-helper.rst +++ b/docs/tools/virtfs-proxy-helper.rst @@ -55,7 +55,7 @@ The following options are supported: .. option:: -f, --fd SOCKET_ID Use given file descriptor as socket descriptor for communicating with - qemu proxy fs drier. Usually a helper like libvirt will create + qemu proxy fs driver. Usually a helper like libvirt will create socketpair and pass one of the fds as parameter to this option. .. option:: -s, --socket SOCKET_FILE -- Gitee From c6b96a0e10db061c9ab790b443f0bfd8220d7d3c Mon Sep 17 00:00:00 2001 From: lixiang_yewu Date: Mon, 2 Sep 2024 07:39:00 +0000 Subject: [PATCH 331/939] update io/trace-events. Parameters should remain consistent. Signed-off-by: lixiang_yewu --- io/trace-events | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io/trace-events b/io/trace-events index 3cc5cf1efd..79e1a19af7 100644 --- a/io/trace-events +++ b/io/trace-events @@ -38,7 +38,7 @@ qio_channel_file_new_path(void *ioc, const char *path, int flags, int mode, int # channel-tls.c qio_channel_tls_new_client(void *ioc, void *master, void *creds, const char *hostname) "TLS new client ioc=%p master=%p creds=%p hostname=%s" -qio_channel_tls_new_server(void *ioc, void *master, void *creds, const char *aclname) "TLS new client ioc=%p master=%p creds=%p acltname=%s" +qio_channel_tls_new_server(void *ioc, void *master, void *creds, const char *aclname) "TLS new client ioc=%p master=%p creds=%p aclname=%s" qio_channel_tls_handshake_start(void *ioc) "TLS handshake start ioc=%p" qio_channel_tls_handshake_pending(void *ioc, int status) "TLS handshake pending ioc=%p status=%d" qio_channel_tls_handshake_fail(void *ioc) "TLS handshake fail ioc=%p" -- Gitee From 87dfbca72fe11b7a8d3f1afce52a7925be0e0b01 Mon Sep 17 00:00:00 2001 From: liupingwei Date: Wed, 4 Sep 2024 14:29:02 +0800 Subject: [PATCH 332/939] cvm : bug fix for undefined reference to 'virtcca_cvm_allowed' while compiling. Fixes a linking error due to an undefined reference to 'virtcca_cvm_allowed' when KVM is not enabled. Signed-off-by: liupingwei --- accel/stubs/kvm-stub.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/accel/stubs/kvm-stub.c b/accel/stubs/kvm-stub.c index 1b37d9a302..ad39a434c4 100644 --- a/accel/stubs/kvm-stub.c +++ b/accel/stubs/kvm-stub.c @@ -25,6 +25,8 @@ bool kvm_allowed; bool kvm_readonly_mem_allowed; bool kvm_msi_use_devid; +bool virtcca_cvm_allowed; + void kvm_flush_coalesced_mmio_buffer(void) { } -- Gitee From 5da793de60f37cf0daaffee3fe8300a1a20bf36b Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Thu, 22 Aug 2024 09:35:29 -0500 Subject: [PATCH 333/939] nbd/server: CVE-2024-7409: Avoid use-after-free when closing server Commit 3e7ef738 plugged the use-after-free of the global nbd_server object, but overlooked a use-after-free of nbd_server->listener. Although this race is harder to hit, notice that our shutdown path first drops the reference count of nbd_server->listener, then triggers actions that can result in a pending client reaching the nbd_blockdev_client_closed() callback, which in turn calls qio_net_listener_set_client_func on a potentially stale object. 
If we know we don't want any more clients to connect, and have already told the listener socket to shut down, then we should not be trying to update the listener socket's associated function. Reproducer: > #!/usr/bin/python3 > > import os > from threading import Thread > > def start_stop(): > while 1: > os.system('virsh qemu-monitor-command VM \'{"execute": "nbd-server-start", +"arguments":{"addr":{"type":"unix","data":{"path":"/tmp/nbd-sock"}}}}\'') > os.system('virsh qemu-monitor-command VM \'{"execute": "nbd-server-stop"}\'') > > def nbd_list(): > while 1: > os.system('/path/to/build/qemu-nbd -L -k /tmp/nbd-sock') > > def test(): > sst = Thread(target=start_stop) > sst.start() > nlt = Thread(target=nbd_list) > nlt.start() > > sst.join() > nlt.join() > > test() Fixes: CVE-2024-7409 Fixes: 3e7ef738c8 ("nbd/server: CVE-2024-7409: Close stray clients at server-stop") CC: qemu-stable@nongnu.org Reported-by: Andrey Drobyshev Signed-off-by: Eric Blake Message-ID: <20240822143617.800419-2-eblake@redhat.com> Reviewed-by: Stefan Hajnoczi --- blockdev-nbd.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/blockdev-nbd.c b/blockdev-nbd.c index f73409ae49..b36f41b7c5 100644 --- a/blockdev-nbd.c +++ b/blockdev-nbd.c @@ -92,10 +92,13 @@ static void nbd_accept(QIONetListener *listener, QIOChannelSocket *cioc, static void nbd_update_server_watch(NBDServerData *s) { - if (!s->max_connections || s->connections < s->max_connections) { - qio_net_listener_set_client_func(s->listener, nbd_accept, NULL, NULL); - } else { - qio_net_listener_set_client_func(s->listener, NULL, NULL, NULL); + if (s->listener) { + if (!s->max_connections || s->connections < s->max_connections) { + qio_net_listener_set_client_func(s->listener, nbd_accept, NULL, + NULL); + } else { + qio_net_listener_set_client_func(s->listener, NULL, NULL, NULL); + } } } @@ -113,6 +116,7 @@ static void nbd_server_free(NBDServerData *server) */ qio_net_listener_disconnect(server->listener); object_unref(OBJECT(server->listener)); + server->listener = NULL; QLIST_FOREACH_SAFE(conn, &server->conns, next, tmp) { qio_channel_shutdown(QIO_CHANNEL(conn->cioc), QIO_CHANNEL_SHUTDOWN_BOTH, NULL); -- Gitee From a8bc17bf7f94f684ba518c56e56b41974c50305e Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Mon, 1 Jul 2024 20:58:04 +0900 Subject: [PATCH 334/939] virtio-net: Ensure queue index fits with RSS (CVE-2024-6505) Ensure the queue index points to a valid queue when software RSS enabled. The new calculation matches with the behavior of Linux's TAP device with the RSS eBPF program. Fixes: 4474e37a5b3a ("virtio-net: implement RX RSS processing") Reported-by: Zhibin Hu Cc: qemu-stable@nongnu.org Signed-off-by: Akihiko Odaki Reviewed-by: Michael S. 
Tsirkin Signed-off-by: Jason Wang --- hw/net/virtio-net.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index 91c1504544..432c433540 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -1931,7 +1931,8 @@ static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf, if (!no_rss && n->rss_data.enabled && n->rss_data.enabled_software_rss) { int index = virtio_net_process_rss(nc, buf, size); if (index >= 0) { - NetClientState *nc2 = qemu_get_subqueue(n->nic, index); + NetClientState *nc2 = + qemu_get_subqueue(n->nic, index % n->curr_queue_pairs); return virtio_net_receive_rcu(nc2, buf, size, true); } } -- Gitee From 3323c09d283e02c10bbf6e8dfc43ea9f41e746db Mon Sep 17 00:00:00 2001 From: Lei Wang Date: Wed, 24 Apr 2024 03:29:12 -0400 Subject: [PATCH 335/939] target/i386: Introduce SapphireRapids-v3 to add missing features commit b10b2481738304db13d28252e86c10555121a5b3 upstream. Add the missing features(ss, tsc-adjust, cldemote, movdiri, movdir64b) in the SapphireRapids-v3 CPU model. Intel-SIG: commit b10b24817383 target/i386: Introduce SapphireRapids-v3 to add missing features. 8.2-SPR new model support Signed-off-by: Lei Wang Message-ID: <20240424072912.43188-1-lei4.wang@intel.com> Signed-off-by: Paolo Bonzini [ Quanxian Wang: amend commit log ] Signed-off-by: Quanxian Wang --- target/i386/cpu.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 19ebd49e8c..ca7e5337b0 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -4020,6 +4020,17 @@ static const X86CPUDefinition builtin_x86_defs[] = { { /* end of list */ } } }, + { + .version = 3, + .props = (PropValue[]) { + { "ss", "on" }, + { "tsc-adjust", "on" }, + { "cldemote", "on" }, + { "movdiri", "on" }, + { "movdir64b", "on" }, + { /* end of list */ } + } + }, { /* end of list */ } } }, -- Gitee From 2414b74bec88f4db58040a683191d3c3828f81ab Mon Sep 17 00:00:00 2001 From: Song Gao Date: Fri, 26 Apr 2024 17:15:35 +0800 Subject: [PATCH 336/939] hw/loongarch: Move boot functions to boot.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move some boot functions to boot.c and struct loongarch_boot_info into struct LoongArchMachineState. Signed-off-by: Song Gao Reviewed-by: Bibo Mao Reviewed-by: Philippe Mathieu-Daudé Message-Id: <20240426091551.2397867-2-gaosong@loongson.cn> --- hw/loongarch/boot.c | 128 ++++++++++++++++++++++++++++++++++++ hw/loongarch/meson.build | 1 + hw/loongarch/virt.c | 121 +++------------------------------- include/hw/loongarch/boot.h | 21 ++++++ include/hw/loongarch/virt.h | 2 + 5 files changed, 160 insertions(+), 113 deletions(-) create mode 100644 hw/loongarch/boot.c create mode 100644 include/hw/loongarch/boot.h diff --git a/hw/loongarch/boot.c b/hw/loongarch/boot.c new file mode 100644 index 0000000000..9feed17db3 --- /dev/null +++ b/hw/loongarch/boot.c @@ -0,0 +1,128 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * LoongArch boot helper functions. 
+ * + * Copyright (c) 2023 Loongson Technology Corporation Limited + */ + +#include "qemu/osdep.h" +#include "qemu/units.h" +#include "target/loongarch/cpu.h" +#include "hw/loongarch/virt.h" +#include "hw/loader.h" +#include "elf.h" +#include "qemu/error-report.h" +#include "sysemu/reset.h" +#include "sysemu/qtest.h" + +static uint64_t cpu_loongarch_virt_to_phys(void *opaque, uint64_t addr) +{ + return addr & MAKE_64BIT_MASK(0, TARGET_PHYS_ADDR_SPACE_BITS); +} + +static int64_t load_kernel_info(struct loongarch_boot_info *info) +{ + uint64_t kernel_entry, kernel_low, kernel_high; + ssize_t kernel_size; + + kernel_size = load_elf(info->kernel_filename, NULL, + cpu_loongarch_virt_to_phys, NULL, + &kernel_entry, &kernel_low, + &kernel_high, NULL, 0, + EM_LOONGARCH, 1, 0); + + if (kernel_size < 0) { + error_report("could not load kernel '%s': %s", + info->kernel_filename, + load_elf_strerror(kernel_size)); + exit(1); + } + return kernel_entry; +} + +static void reset_load_elf(void *opaque) +{ + LoongArchCPU *cpu = opaque; + CPULoongArchState *env = &cpu->env; + + cpu_reset(CPU(cpu)); + if (env->load_elf) { + cpu_set_pc(CPU(cpu), env->elf_address); + } +} + +static void fw_cfg_add_kernel_info(struct loongarch_boot_info *info, + FWCfgState *fw_cfg) +{ + /* + * Expose the kernel, the command line, and the initrd in fw_cfg. + * We don't process them here at all, it's all left to the + * firmware. + */ + load_image_to_fw_cfg(fw_cfg, + FW_CFG_KERNEL_SIZE, FW_CFG_KERNEL_DATA, + info->kernel_filename, + false); + + if (info->initrd_filename) { + load_image_to_fw_cfg(fw_cfg, + FW_CFG_INITRD_SIZE, FW_CFG_INITRD_DATA, + info->initrd_filename, false); + } + + if (info->kernel_cmdline) { + fw_cfg_add_i32(fw_cfg, FW_CFG_CMDLINE_SIZE, + strlen(info->kernel_cmdline) + 1); + fw_cfg_add_string(fw_cfg, FW_CFG_CMDLINE_DATA, + info->kernel_cmdline); + } +} + +static void loongarch_firmware_boot(LoongArchMachineState *lams, + struct loongarch_boot_info *info) +{ + fw_cfg_add_kernel_info(info, lams->fw_cfg); +} + +static void loongarch_direct_kernel_boot(struct loongarch_boot_info *info) +{ + int64_t kernel_addr = 0; + LoongArchCPU *lacpu; + CPUState *cs; + + if (info->kernel_filename) { + kernel_addr = load_kernel_info(info); + } else { + if(!qtest_enabled()) { + error_report("Need kernel filename\n"); + exit(1); + } + } + + CPU_FOREACH(cs) { + lacpu = LOONGARCH_CPU(cs); + lacpu->env.load_elf = true; + lacpu->env.elf_address = kernel_addr; + } +} + +void loongarch_load_kernel(MachineState *ms, struct loongarch_boot_info *info) +{ + LoongArchMachineState *lams = LOONGARCH_MACHINE(ms); + int i; + + /* register reset function */ + for (i = 0; i < ms->smp.cpus; i++) { + qemu_register_reset(reset_load_elf, LOONGARCH_CPU(qemu_get_cpu(i))); + } + + info->kernel_filename = ms->kernel_filename; + info->kernel_cmdline = ms->kernel_cmdline; + info->initrd_filename = ms->initrd_filename; + + if (lams->bios_loaded) { + loongarch_firmware_boot(lams, info); + } else { + loongarch_direct_kernel_boot(info); + } +} diff --git a/hw/loongarch/meson.build b/hw/loongarch/meson.build index c0421502ab..d306d82c2e 100644 --- a/hw/loongarch/meson.build +++ b/hw/loongarch/meson.build @@ -1,6 +1,7 @@ loongarch_ss = ss.source_set() loongarch_ss.add(files( 'fw_cfg.c', + 'boot.c', )) loongarch_ss.add(when: 'CONFIG_LOONGARCH_VIRT', if_true: [files('virt.c'), fdt]) loongarch_ss.add(when: 'CONFIG_ACPI', if_true: files('acpi-build.c')) diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index eca3b94581..a0aee28f41 100644 --- 
a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -48,14 +48,6 @@ #include "hw/block/flash.h" #include "qemu/error-report.h" - -struct loaderparams { - uint64_t ram_size; - const char *kernel_filename; - const char *kernel_cmdline; - const char *initrd_filename; -}; - static bool virt_is_veiointc_enabled(LoongArchMachineState *lams) { if (lams->veiointc == ON_OFF_AUTO_OFF) { @@ -439,31 +431,6 @@ static void memmap_add_entry(uint64_t address, uint64_t length, uint32_t type) memmap_entries++; } -static uint64_t cpu_loongarch_virt_to_phys(void *opaque, uint64_t addr) -{ - return addr & MAKE_64BIT_MASK(0, TARGET_PHYS_ADDR_SPACE_BITS); -} - -static int64_t load_kernel_info(const struct loaderparams *loaderparams) -{ - uint64_t kernel_entry, kernel_low, kernel_high; - ssize_t kernel_size; - - kernel_size = load_elf(loaderparams->kernel_filename, NULL, - cpu_loongarch_virt_to_phys, NULL, - &kernel_entry, &kernel_low, - &kernel_high, NULL, 0, - EM_LOONGARCH, 1, 0); - - if (kernel_size < 0) { - error_report("could not load kernel '%s': %s", - loaderparams->kernel_filename, - load_elf_strerror(kernel_size)); - exit(1); - } - return kernel_entry; -} - static DeviceState *create_acpi_ged(DeviceState *pch_pic, LoongArchMachineState *lams) { DeviceState *dev; @@ -755,67 +722,6 @@ static void loongarch_firmware_init(LoongArchMachineState *lams) } } -static void reset_load_elf(void *opaque) -{ - LoongArchCPU *cpu = opaque; - CPULoongArchState *env = &cpu->env; - - cpu_reset(CPU(cpu)); - if (env->load_elf) { - cpu_set_pc(CPU(cpu), env->elf_address); - } -} - -static void fw_cfg_add_kernel_info(const struct loaderparams *loaderparams, - FWCfgState *fw_cfg) -{ - /* - * Expose the kernel, the command line, and the initrd in fw_cfg. - * We don't process them here at all, it's all left to the - * firmware. 
- */ - load_image_to_fw_cfg(fw_cfg, - FW_CFG_KERNEL_SIZE, FW_CFG_KERNEL_DATA, - loaderparams->kernel_filename, - false); - - if (loaderparams->initrd_filename) { - load_image_to_fw_cfg(fw_cfg, - FW_CFG_INITRD_SIZE, FW_CFG_INITRD_DATA, - loaderparams->initrd_filename, false); - } - - if (loaderparams->kernel_cmdline) { - fw_cfg_add_i32(fw_cfg, FW_CFG_CMDLINE_SIZE, - strlen(loaderparams->kernel_cmdline) + 1); - fw_cfg_add_string(fw_cfg, FW_CFG_CMDLINE_DATA, - loaderparams->kernel_cmdline); - } -} - -static void loongarch_firmware_boot(LoongArchMachineState *lams, - const struct loaderparams *loaderparams) -{ - fw_cfg_add_kernel_info(loaderparams, lams->fw_cfg); -} - -static void loongarch_direct_kernel_boot(LoongArchMachineState *lams, - const struct loaderparams *loaderparams) -{ - MachineState *machine = MACHINE(lams); - int64_t kernel_addr = 0; - LoongArchCPU *lacpu; - int i; - - kernel_addr = load_kernel_info(loaderparams); - if (!machine->firmware) { - for (i = 0; i < machine->smp.cpus; i++) { - lacpu = LOONGARCH_CPU(qemu_get_cpu(i)); - lacpu->env.load_elf = true; - lacpu->env.elf_address = kernel_addr; - } - } -} static MemTxResult loongarch_qemu_write(void *opaque, hwaddr addr, uint64_t val, unsigned size, MemTxAttrs attrs) @@ -925,7 +831,6 @@ static void loongarch_init(MachineState *machine) const CPUArchIdList *possible_cpus; MachineClass *mc = MACHINE_GET_CLASS(machine); CPUState *cpu; - struct loaderparams loaderparams = { }; if (!cpu_model) { cpu_model = LOONGARCH_CPU_TYPE_NAME("la464"); @@ -1028,24 +933,8 @@ static void loongarch_init(MachineState *machine) sizeof(struct memmap_entry) * (memmap_entries)); } fdt_add_fw_cfg_node(lams); - loaderparams.ram_size = ram_size; - loaderparams.kernel_filename = machine->kernel_filename; - loaderparams.kernel_cmdline = machine->kernel_cmdline; - loaderparams.initrd_filename = machine->initrd_filename; - /* load the kernel. */ - if (loaderparams.kernel_filename) { - if (lams->bios_loaded) { - loongarch_firmware_boot(lams, &loaderparams); - } else { - loongarch_direct_kernel_boot(lams, &loaderparams); - } - } fdt_add_flash_node(lams); - /* register reset function */ - for (i = 0; i < machine->smp.cpus; i++) { - lacpu = LOONGARCH_CPU(qemu_get_cpu(i)); - qemu_register_reset(reset_load_elf, lacpu); - } + /* Initialize the IO interrupt subsystem */ loongarch_irq_init(lams); fdt_add_irqchip_node(lams); @@ -1069,7 +958,13 @@ static void loongarch_init(MachineState *machine) */ fdt_base = 1 * MiB; qemu_fdt_dumpdtb(machine->fdt, lams->fdt_size); - rom_add_blob_fixed("fdt", machine->fdt, lams->fdt_size, fdt_base); + rom_add_blob_fixed_as("fdt", machine->fdt, lams->fdt_size, fdt_base, + &address_space_memory); + qemu_register_reset_nosnapshotload(qemu_fdt_randomize_seeds, + rom_ptr_for_as(&address_space_memory, fdt_base, lams->fdt_size)); + + lams->bootinfo.ram_size = ram_size; + loongarch_load_kernel(machine, &lams->bootinfo); } bool loongarch_is_acpi_enabled(LoongArchMachineState *lams) diff --git a/include/hw/loongarch/boot.h b/include/hw/loongarch/boot.h new file mode 100644 index 0000000000..3275c1e295 --- /dev/null +++ b/include/hw/loongarch/boot.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Definitions for LoongArch boot. 
+ * + * Copyright (C) 2023 Loongson Technology Corporation Limited + */ + +#ifndef HW_LOONGARCH_BOOT_H +#define HW_LOONGARCH_BOOT_H + +struct loongarch_boot_info { + uint64_t ram_size; + const char *kernel_filename; + const char *kernel_cmdline; + const char *initrd_filename; + uint64_t a0, a1, a2; +}; + +void loongarch_load_kernel(MachineState *ms, struct loongarch_boot_info *info); + +#endif /* HW_LOONGARCH_BOOT_H */ diff --git a/include/hw/loongarch/virt.h b/include/hw/loongarch/virt.h index 99447fd1d6..02c8234b8d 100644 --- a/include/hw/loongarch/virt.h +++ b/include/hw/loongarch/virt.h @@ -13,6 +13,7 @@ #include "qemu/queue.h" #include "hw/intc/loongarch_ipi.h" #include "hw/block/flash.h" +#include "hw/loongarch/boot.h" #define LOONGARCH_MAX_CPUS 256 @@ -58,6 +59,7 @@ struct LoongArchMachineState { MemoryRegion iocsr_mem; AddressSpace as_iocsr; int features; + struct loongarch_boot_info bootinfo; }; #define TYPE_LOONGARCH_MACHINE MACHINE_TYPE_NAME("virt") -- Gitee From 02c5f52da7f9458c0fc41e43f181f6e9b7101b57 Mon Sep 17 00:00:00 2001 From: Song Gao Date: Fri, 26 Apr 2024 17:15:36 +0800 Subject: [PATCH 337/939] hw/loongarch: Add load initrd we load initrd ramdisk after kernel_high address Signed-off-by: Song Gao Reviewed-by: Bibo Mao Message-Id: <20240426091551.2397867-3-gaosong@loongson.cn> --- hw/loongarch/boot.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/hw/loongarch/boot.c b/hw/loongarch/boot.c index 9feed17db3..a5135fe542 100644 --- a/hw/loongarch/boot.c +++ b/hw/loongarch/boot.c @@ -22,7 +22,8 @@ static uint64_t cpu_loongarch_virt_to_phys(void *opaque, uint64_t addr) static int64_t load_kernel_info(struct loongarch_boot_info *info) { - uint64_t kernel_entry, kernel_low, kernel_high; + uint64_t kernel_entry, kernel_low, kernel_high, initrd_size; + ram_addr_t initrd_offset; ssize_t kernel_size; kernel_size = load_elf(info->kernel_filename, NULL, @@ -37,6 +38,31 @@ static int64_t load_kernel_info(struct loongarch_boot_info *info) load_elf_strerror(kernel_size)); exit(1); } + + if (info->initrd_filename) { + initrd_size = get_image_size(info->initrd_filename); + if (initrd_size > 0) { + initrd_offset = ROUND_UP(kernel_high + 4 * kernel_size, 64 * KiB); + + if (initrd_offset + initrd_size > info->ram_size) { + error_report("memory too small for initial ram disk '%s'", + info->initrd_filename); + exit(1); + } + + initrd_size = load_image_targphys(info->initrd_filename, initrd_offset, + info->ram_size - initrd_offset); + } + + if (initrd_size == (target_ulong)-1) { + error_report("could not load initial ram disk '%s'", + info->initrd_filename); + exit(1); + } + } else { + initrd_size = 0; + } + return kernel_entry; } -- Gitee From 2e3e7bcf92284f41c08fce29f6c6d45849721e71 Mon Sep 17 00:00:00 2001 From: Song Gao Date: Fri, 26 Apr 2024 17:15:37 +0800 Subject: [PATCH 338/939] hw/loongarch: Add slave cpu boot_code Load the slave CPU boot code at pflash0 and set the slave CPU elf_address to VIRT_FLASH0_BASE. Signed-off-by: Song Gao Reviewed-by: Bibo Mao Message-Id: <20240426091551.2397867-4-gaosong@loongson.cn> --- hw/loongarch/boot.c | 62 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 61 insertions(+), 1 deletion(-) diff --git a/hw/loongarch/boot.c b/hw/loongarch/boot.c index a5135fe542..fb6effbaff 100644 --- a/hw/loongarch/boot.c +++ b/hw/loongarch/boot.c @@ -15,6 +15,54 @@ #include "sysemu/reset.h" #include "sysemu/qtest.h" +static const unsigned int slave_boot_code[] = { + /* Configure reset ebase. 
*/ + 0x0400302c, /* csrwr $t0, LOONGARCH_CSR_EENTRY */ + + /* Disable interrupt. */ + 0x0380100c, /* ori $t0, $zero,0x4 */ + 0x04000180, /* csrxchg $zero, $t0, LOONGARCH_CSR_CRMD */ + + /* Clear mailbox. */ + 0x1400002d, /* lu12i.w $t1, 1(0x1) */ + 0x038081ad, /* ori $t1, $t1, CORE_BUF_20 */ + 0x06481da0, /* iocsrwr.d $zero, $t1 */ + + /* Enable IPI interrupt. */ + 0x1400002c, /* lu12i.w $t0, 1(0x1) */ + 0x0400118c, /* csrxchg $t0, $t0, LOONGARCH_CSR_ECFG */ + 0x02fffc0c, /* addi.d $t0, $r0,-1(0xfff) */ + 0x1400002d, /* lu12i.w $t1, 1(0x1) */ + 0x038011ad, /* ori $t1, $t1, CORE_EN_OFF */ + 0x064819ac, /* iocsrwr.w $t0, $t1 */ + 0x1400002d, /* lu12i.w $t1, 1(0x1) */ + 0x038081ad, /* ori $t1, $t1, CORE_BUF_20 */ + + /* Wait for wakeup <.L11>: */ + 0x06488000, /* idle 0x0 */ + 0x03400000, /* andi $zero, $zero, 0x0 */ + 0x064809ac, /* iocsrrd.w $t0, $t1 */ + 0x43fff59f, /* beqz $t0, -12(0x7ffff4) # 48 <.L11> */ + + /* Read and clear IPI interrupt. */ + 0x1400002d, /* lu12i.w $t1, 1(0x1) */ + 0x064809ac, /* iocsrrd.w $t0, $t1 */ + 0x1400002d, /* lu12i.w $t1, 1(0x1) */ + 0x038031ad, /* ori $t1, $t1, CORE_CLEAR_OFF */ + 0x064819ac, /* iocsrwr.w $t0, $t1 */ + + /* Disable IPI interrupt. */ + 0x1400002c, /* lu12i.w $t0, 1(0x1) */ + 0x04001180, /* csrxchg $zero, $t0, LOONGARCH_CSR_ECFG */ + + /* Read mail buf and jump to specified entry */ + 0x1400002d, /* lu12i.w $t1, 1(0x1) */ + 0x038081ad, /* ori $t1, $t1, CORE_BUF_20 */ + 0x06480dac, /* iocsrrd.d $t0, $t1 */ + 0x00150181, /* move $ra, $t0 */ + 0x4c000020, /* jirl $zero, $ra,0 */ +}; + static uint64_t cpu_loongarch_virt_to_phys(void *opaque, uint64_t addr) { return addr & MAKE_64BIT_MASK(0, TARGET_PHYS_ADDR_SPACE_BITS); @@ -125,11 +173,23 @@ static void loongarch_direct_kernel_boot(struct loongarch_boot_info *info) } } + /* Load slave boot code at pflash0 . 
*/ + void *boot_code = g_malloc0(VIRT_FLASH0_SIZE); + memcpy(boot_code, &slave_boot_code, sizeof(slave_boot_code)); + rom_add_blob_fixed("boot_code", boot_code, VIRT_FLASH0_SIZE, VIRT_FLASH0_BASE); + CPU_FOREACH(cs) { lacpu = LOONGARCH_CPU(cs); lacpu->env.load_elf = true; - lacpu->env.elf_address = kernel_addr; + if (cs == first_cpu) { + lacpu->env.elf_address = kernel_addr; + } else { + lacpu->env.elf_address = VIRT_FLASH0_BASE; + } + lacpu->env.boot_info = info; } + + g_free(boot_code); } void loongarch_load_kernel(MachineState *ms, struct loongarch_boot_info *info) -- Gitee From 206b799cb8c218c744f4dcdaf161d11f14c21e0f Mon Sep 17 00:00:00 2001 From: Song Gao Date: Fri, 26 Apr 2024 17:15:38 +0800 Subject: [PATCH 339/939] hw/loongarch: Add init_cmdline Add init_cmline and set boot_info->a0, a1 Signed-off-by: Song Gao Reviewed-by: Bibo Mao Message-Id: <20240426091551.2397867-5-gaosong@loongson.cn> --- hw/loongarch/boot.c | 30 ++++++++++++++++++++++++++++++ include/hw/loongarch/virt.h | 2 ++ target/loongarch/cpu.h | 2 ++ 3 files changed, 34 insertions(+) diff --git a/hw/loongarch/boot.c b/hw/loongarch/boot.c index fb6effbaff..127085bcc4 100644 --- a/hw/loongarch/boot.c +++ b/hw/loongarch/boot.c @@ -63,6 +63,16 @@ static const unsigned int slave_boot_code[] = { 0x4c000020, /* jirl $zero, $ra,0 */ }; +static void init_cmdline(struct loongarch_boot_info *info, void *p, void *start) +{ + hwaddr cmdline_addr = p - start; + + info->a0 = 1; + info->a1 = cmdline_addr; + + memcpy(p, info->kernel_cmdline, COMMAND_LINE_SIZE); +} + static uint64_t cpu_loongarch_virt_to_phys(void *opaque, uint64_t addr) { return addr & MAKE_64BIT_MASK(0, TARGET_PHYS_ADDR_SPACE_BITS); @@ -121,6 +131,10 @@ static void reset_load_elf(void *opaque) cpu_reset(CPU(cpu)); if (env->load_elf) { + if (cpu == LOONGARCH_CPU(first_cpu)) { + env->gpr[4] = env->boot_info->a0; + env->gpr[5] = env->boot_info->a1; + } cpu_set_pc(CPU(cpu), env->elf_address); } } @@ -158,8 +172,17 @@ static void loongarch_firmware_boot(LoongArchMachineState *lams, fw_cfg_add_kernel_info(info, lams->fw_cfg); } +static void init_boot_rom(struct loongarch_boot_info *info, void *p) +{ + void *start = p; + + init_cmdline(info, p, start); + p += COMMAND_LINE_SIZE; +} + static void loongarch_direct_kernel_boot(struct loongarch_boot_info *info) { + void *p, *bp; int64_t kernel_addr = 0; LoongArchCPU *lacpu; CPUState *cs; @@ -173,6 +196,12 @@ static void loongarch_direct_kernel_boot(struct loongarch_boot_info *info) } } + /* Load cmdline and system tables at [0 - 1 MiB] */ + p = g_malloc0(1 * MiB); + bp = p; + init_boot_rom(info, p); + rom_add_blob_fixed_as("boot_info", bp, 1 * MiB, 0, &address_space_memory); + /* Load slave boot code at pflash0 . 
*/ void *boot_code = g_malloc0(VIRT_FLASH0_SIZE); memcpy(boot_code, &slave_boot_code, sizeof(slave_boot_code)); @@ -190,6 +219,7 @@ static void loongarch_direct_kernel_boot(struct loongarch_boot_info *info) } g_free(boot_code); + g_free(bp); } void loongarch_load_kernel(MachineState *ms, struct loongarch_boot_info *info) diff --git a/include/hw/loongarch/virt.h b/include/hw/loongarch/virt.h index 02c8234b8d..ffff075f63 100644 --- a/include/hw/loongarch/virt.h +++ b/include/hw/loongarch/virt.h @@ -33,6 +33,8 @@ #define VIRT_GED_MEM_ADDR (VIRT_GED_EVT_ADDR + ACPI_GED_EVT_SEL_LEN) #define VIRT_GED_REG_ADDR (VIRT_GED_MEM_ADDR + MEMORY_HOTPLUG_IO_LEN) +#define COMMAND_LINE_SIZE 512 + struct LoongArchMachineState { /*< private >*/ MachineState parent_obj; diff --git a/target/loongarch/cpu.h b/target/loongarch/cpu.h index 0ed24051af..e3a15c593f 100644 --- a/target/loongarch/cpu.h +++ b/target/loongarch/cpu.h @@ -364,6 +364,8 @@ typedef struct CPUArchState { uint32_t mp_state; /* Store ipistate to access from this struct */ DeviceState *ipistate; + + struct loongarch_boot_info *boot_info; #endif struct { uint64_t guest_addr; -- Gitee From 65ae44689bfa6a1b697fea6ec0e72027fdddee95 Mon Sep 17 00:00:00 2001 From: Song Gao Date: Fri, 26 Apr 2024 17:15:39 +0800 Subject: [PATCH 340/939] hw/loongarch: Init efi_system_table Add init_systab and set boot_info->a2 Signed-off-by: Song Gao Reviewed-by: Bibo Mao Message-Id: <20240426091551.2397867-6-gaosong@loongson.cn> --- hw/loongarch/boot.c | 22 +++++++++++++++++ include/hw/loongarch/boot.h | 48 +++++++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) diff --git a/hw/loongarch/boot.c b/hw/loongarch/boot.c index 127085bcc4..59889dbc90 100644 --- a/hw/loongarch/boot.c +++ b/hw/loongarch/boot.c @@ -63,6 +63,25 @@ static const unsigned int slave_boot_code[] = { 0x4c000020, /* jirl $zero, $ra,0 */ }; +static void init_systab(struct loongarch_boot_info *info, void *p, void *start) +{ + struct efi_system_table *systab = p; + + info->a2 = p - start; + + systab->hdr.signature = EFI_SYSTEM_TABLE_SIGNATURE; + systab->hdr.revision = EFI_SPECIFICATION_VERSION; + systab->hdr.revision = sizeof(struct efi_system_table), + systab->fw_revision = FW_VERSION << 16 | FW_PATCHLEVEL << 8; + systab->runtime = 0; + systab->boottime = 0; + systab->nr_tables = 0; + + p += ROUND_UP(sizeof(struct efi_system_table), 64 * KiB); + + systab->tables = p; +} + static void init_cmdline(struct loongarch_boot_info *info, void *p, void *start) { hwaddr cmdline_addr = p - start; @@ -134,6 +153,7 @@ static void reset_load_elf(void *opaque) if (cpu == LOONGARCH_CPU(first_cpu)) { env->gpr[4] = env->boot_info->a0; env->gpr[5] = env->boot_info->a1; + env->gpr[6] = env->boot_info->a2; } cpu_set_pc(CPU(cpu), env->elf_address); } @@ -178,6 +198,8 @@ static void init_boot_rom(struct loongarch_boot_info *info, void *p) init_cmdline(info, p, start); p += COMMAND_LINE_SIZE; + + init_systab(info, p, start); } static void loongarch_direct_kernel_boot(struct loongarch_boot_info *info) diff --git a/include/hw/loongarch/boot.h b/include/hw/loongarch/boot.h index 3275c1e295..cf0e4d4f91 100644 --- a/include/hw/loongarch/boot.h +++ b/include/hw/loongarch/boot.h @@ -8,6 +8,54 @@ #ifndef HW_LOONGARCH_BOOT_H #define HW_LOONGARCH_BOOT_H +/* UEFI 2.10 */ +#define EFI_SYSTEM_TABLE_SIGNATURE 0x5453595320494249 +#define EFI_2_100_SYSTEM_TABLE_REVISION ((2<<16) | (100)) +#define EFI_SPECIFICATION_VERSION EFI_SYSTEM_TABLE_REVISION +#define EFI_SYSTEM_TABLE_REVISION EFI_2_100_SYSTEM_TABLE_REVISION + +#define 
FW_VERSION 0x1 +#define FW_PATCHLEVEL 0x0 + +typedef struct { + uint8_t b[16]; +} efi_guid_t QEMU_ALIGNED(8); + +struct efi_config_table { + efi_guid_t guid; + uint64_t *ptr; + const char name[16]; +}; + +typedef struct { + uint64_t signature; + uint32_t revision; + uint32_t headersize; + uint32_t crc32; + uint32_t reserved; +} efi_table_hdr_t; + +struct efi_configuration_table { + efi_guid_t guid; + void *table; +}; + +struct efi_system_table { + efi_table_hdr_t hdr; + uint64_t fw_vendor; /* physical addr of CHAR16 vendor string */ + uint32_t fw_revision; + uint64_t con_in_handle; + uint64_t *con_in; + uint64_t con_out_handle; + uint64_t *con_out; + uint64_t stderr_handle; + uint64_t stderr_placeholder; + uint64_t *runtime; + uint64_t *boottime; + uint64_t nr_tables; + struct efi_configuration_table *tables; +}; + struct loongarch_boot_info { uint64_t ram_size; const char *kernel_filename; -- Gitee From 0245881108803abedf50e954d34ebcfff294d1c3 Mon Sep 17 00:00:00 2001 From: Song Gao Date: Fri, 26 Apr 2024 17:15:40 +0800 Subject: [PATCH 341/939] hw/loongarch: Init efi_boot_memmap table The efi_system_table adds a efi_boot_memmap configuration table. Signed-off-by: Song Gao Reviewed-by: Bibo Mao Message-Id: <20240426091551.2397867-7-gaosong@loongson.cn> --- hw/loongarch/boot.c | 40 +++++++++++++++++++++++++++++++++++++ hw/loongarch/virt.c | 11 ++-------- include/hw/loongarch/boot.h | 27 +++++++++++++++++++++++++ include/hw/loongarch/virt.h | 10 ++++++++++ 4 files changed, 79 insertions(+), 9 deletions(-) diff --git a/hw/loongarch/boot.c b/hw/loongarch/boot.c index 59889dbc90..527fc9c0be 100644 --- a/hw/loongarch/boot.c +++ b/hw/loongarch/boot.c @@ -63,8 +63,41 @@ static const unsigned int slave_boot_code[] = { 0x4c000020, /* jirl $zero, $ra,0 */ }; +static inline void *guidcpy(void *dst, const void *src) +{ + return memcpy(dst, src, sizeof(efi_guid_t)); +} + +static void init_efi_boot_memmap(struct efi_system_table *systab, + void *p, void *start) +{ + unsigned i; + struct efi_boot_memmap *boot_memmap = p; + efi_guid_t tbl_guid = LINUX_EFI_BOOT_MEMMAP_GUID; + + /* efi_configuration_table 1 */ + guidcpy(&systab->tables[0].guid, &tbl_guid); + systab->tables[0].table = (struct efi_configuration_table *)(p - start); + systab->nr_tables = 1; + + boot_memmap->desc_size = sizeof(efi_memory_desc_t); + boot_memmap->desc_ver = 1; + boot_memmap->map_size = 0; + + efi_memory_desc_t *map = p + sizeof(struct efi_boot_memmap); + for (i = 0; i < memmap_entries; i++) { + map = (void *)boot_memmap + sizeof(*map); + map[i].type = memmap_table[i].type; + map[i].phys_addr = ROUND_UP(memmap_table[i].address, 64 * KiB); + map[i].num_pages = ROUND_DOWN(memmap_table[i].address + + memmap_table[i].length - map[i].phys_addr, 64 * KiB); + p += sizeof(efi_memory_desc_t); + } +} + static void init_systab(struct loongarch_boot_info *info, void *p, void *start) { + void *bp_tables_start; struct efi_system_table *systab = p; info->a2 = p - start; @@ -80,6 +113,13 @@ static void init_systab(struct loongarch_boot_info *info, void *p, void *start) p += ROUND_UP(sizeof(struct efi_system_table), 64 * KiB); systab->tables = p; + bp_tables_start = p; + + init_efi_boot_memmap(systab, p, start); + p += ROUND_UP(sizeof(struct efi_boot_memmap) + + sizeof(efi_memory_desc_t) * memmap_entries, 64 * KiB); + + systab->tables = (struct efi_configuration_table *)(bp_tables_start - start); } static void init_cmdline(struct loongarch_boot_info *info, void *p, void *start) diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index 
a0aee28f41..028356acf5 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -405,15 +405,8 @@ static void virt_powerdown_req(Notifier *notifier, void *opaque) acpi_send_event(s->acpi_ged, ACPI_POWER_DOWN_STATUS); } -struct memmap_entry { - uint64_t address; - uint64_t length; - uint32_t type; - uint32_t reserved; -}; - -static struct memmap_entry *memmap_table; -static unsigned memmap_entries; +struct memmap_entry *memmap_table; +unsigned memmap_entries; static void memmap_add_entry(uint64_t address, uint64_t length, uint32_t type) { diff --git a/include/hw/loongarch/boot.h b/include/hw/loongarch/boot.h index cf0e4d4f91..76622af2e2 100644 --- a/include/hw/loongarch/boot.h +++ b/include/hw/loongarch/boot.h @@ -21,6 +21,15 @@ typedef struct { uint8_t b[16]; } efi_guid_t QEMU_ALIGNED(8); +#define EFI_GUID(a, b, c, d...) (efi_guid_t){ { \ + (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \ + (b) & 0xff, ((b) >> 8) & 0xff, \ + (c) & 0xff, ((c) >> 8) & 0xff, d } } + +#define LINUX_EFI_BOOT_MEMMAP_GUID \ + EFI_GUID(0x800f683f, 0xd08b, 0x423a, 0xa2, 0x93, \ + 0x96, 0x5c, 0x3c, 0x6f, 0xe2, 0xb4) + struct efi_config_table { efi_guid_t guid; uint64_t *ptr; @@ -56,6 +65,24 @@ struct efi_system_table { struct efi_configuration_table *tables; }; +typedef struct { + uint32_t type; + uint32_t pad; + uint64_t phys_addr; + uint64_t virt_addr; + uint64_t num_pages; + uint64_t attribute; +} efi_memory_desc_t; + +struct efi_boot_memmap { + uint64_t map_size; + uint64_t desc_size; + uint32_t desc_ver; + uint64_t map_key; + uint64_t buff_size; + efi_memory_desc_t map[32]; +}; + struct loongarch_boot_info { uint64_t ram_size; const char *kernel_filename; diff --git a/include/hw/loongarch/virt.h b/include/hw/loongarch/virt.h index ffff075f63..2f9eaf4b0e 100644 --- a/include/hw/loongarch/virt.h +++ b/include/hw/loongarch/virt.h @@ -35,6 +35,16 @@ #define COMMAND_LINE_SIZE 512 +extern struct memmap_entry *memmap_table; +extern unsigned memmap_entries; + +struct memmap_entry { + uint64_t address; + uint64_t length; + uint32_t type; + uint32_t reserved; +}; + struct LoongArchMachineState { /*< private >*/ MachineState parent_obj; -- Gitee From ad674827da4ab972a30d51818f7768de47336984 Mon Sep 17 00:00:00 2001 From: Song Gao Date: Fri, 26 Apr 2024 17:15:41 +0800 Subject: [PATCH 342/939] hw/loongarch: Init efi_initrd table The efi_system_table adds a efi_initrd configuration table. Signed-off-by: Song Gao Reviewed-by: Bibo Mao Message-Id: <20240426091551.2397867-8-gaosong@loongson.cn> --- hw/loongarch/boot.c | 23 +++++++++++++++++++++-- include/hw/loongarch/boot.h | 9 +++++++++ 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/hw/loongarch/boot.c b/hw/loongarch/boot.c index 527fc9c0be..c8b3e742b4 100644 --- a/hw/loongarch/boot.c +++ b/hw/loongarch/boot.c @@ -15,6 +15,9 @@ #include "sysemu/reset.h" #include "sysemu/qtest.h" +ram_addr_t initrd_offset; +uint64_t initrd_size; + static const unsigned int slave_boot_code[] = { /* Configure reset ebase. 
*/ 0x0400302c, /* csrwr $t0, LOONGARCH_CSR_EENTRY */ @@ -95,6 +98,21 @@ static void init_efi_boot_memmap(struct efi_system_table *systab, } } +static void init_efi_initrd_table(struct efi_system_table *systab, + void *p, void *start) +{ + efi_guid_t tbl_guid = LINUX_EFI_INITRD_MEDIA_GUID; + struct efi_initrd *initrd_table = p; + + /* efi_configuration_table 2 */ + guidcpy(&systab->tables[1].guid, &tbl_guid); + systab->tables[1].table = (struct efi_configuration_table *)(p - start); + systab->nr_tables = 2; + + initrd_table->base = initrd_offset; + initrd_table->size = initrd_size; +} + static void init_systab(struct loongarch_boot_info *info, void *p, void *start) { void *bp_tables_start; @@ -118,6 +136,8 @@ static void init_systab(struct loongarch_boot_info *info, void *p, void *start) init_efi_boot_memmap(systab, p, start); p += ROUND_UP(sizeof(struct efi_boot_memmap) + sizeof(efi_memory_desc_t) * memmap_entries, 64 * KiB); + init_efi_initrd_table(systab, p, start); + p += ROUND_UP(sizeof(struct efi_initrd), 64 * KiB); systab->tables = (struct efi_configuration_table *)(bp_tables_start - start); } @@ -139,8 +159,7 @@ static uint64_t cpu_loongarch_virt_to_phys(void *opaque, uint64_t addr) static int64_t load_kernel_info(struct loongarch_boot_info *info) { - uint64_t kernel_entry, kernel_low, kernel_high, initrd_size; - ram_addr_t initrd_offset; + uint64_t kernel_entry, kernel_low, kernel_high; ssize_t kernel_size; kernel_size = load_elf(info->kernel_filename, NULL, diff --git a/include/hw/loongarch/boot.h b/include/hw/loongarch/boot.h index 76622af2e2..42d1ee3663 100644 --- a/include/hw/loongarch/boot.h +++ b/include/hw/loongarch/boot.h @@ -30,6 +30,10 @@ typedef struct { EFI_GUID(0x800f683f, 0xd08b, 0x423a, 0xa2, 0x93, \ 0x96, 0x5c, 0x3c, 0x6f, 0xe2, 0xb4) +#define LINUX_EFI_INITRD_MEDIA_GUID \ + EFI_GUID(0x5568e427, 0x68fc, 0x4f3d, 0xac, 0x74, \ + 0xca, 0x55, 0x52, 0x31, 0xcc, 0x68) + struct efi_config_table { efi_guid_t guid; uint64_t *ptr; @@ -83,6 +87,11 @@ struct efi_boot_memmap { efi_memory_desc_t map[32]; }; +struct efi_initrd { + uint64_t base; + uint64_t size; +}; + struct loongarch_boot_info { uint64_t ram_size; const char *kernel_filename; -- Gitee From 605b2b372f972fffa2d198d8dee4cf37f335559d Mon Sep 17 00:00:00 2001 From: Song Gao Date: Fri, 26 Apr 2024 17:15:42 +0800 Subject: [PATCH 343/939] hw/loongarch: Init efi_fdt table The efi_system_table adds a efi_fdt configuration table. 
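A minimal consumer-side sketch (illustrative only, not part of this patch) of how the FDT entry can be located once the tables are in place, assuming the efi_* definitions this series adds to include/hw/loongarch/boot.h and that the offsets QEMU stores in systab->tables have already been translated into dereferenceable pointers:

    #include <string.h>                 /* memcmp */
    #include "hw/loongarch/boot.h"      /* efi_guid_t, struct efi_system_table, DEVICE_TREE_GUID */

    static void *find_fdt_config_table(struct efi_system_table *systab)
    {
        efi_guid_t fdt_guid = DEVICE_TREE_GUID;
        uint64_t i;

        for (i = 0; i < systab->nr_tables; i++) {
            if (!memcmp(&systab->tables[i].guid, &fdt_guid, sizeof(fdt_guid))) {
                /* In this series the entry holds FDT_BASE (1 MiB). */
                return systab->tables[i].table;
            }
        }
        return NULL;
    }

The same walk applies to the boot memmap and initrd entries added earlier in the series, keyed by LINUX_EFI_BOOT_MEMMAP_GUID and LINUX_EFI_INITRD_MEDIA_GUID.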
Signed-off-by: Song Gao Reviewed-by: Bibo Mao Message-Id: <20240426091551.2397867-9-gaosong@loongson.cn> --- hw/loongarch/boot.c | 11 +++++++++++ hw/loongarch/virt.c | 6 ++---- include/hw/loongarch/boot.h | 4 ++++ include/hw/loongarch/virt.h | 2 ++ 4 files changed, 19 insertions(+), 4 deletions(-) diff --git a/hw/loongarch/boot.c b/hw/loongarch/boot.c index c8b3e742b4..7d1630b2e7 100644 --- a/hw/loongarch/boot.c +++ b/hw/loongarch/boot.c @@ -113,6 +113,16 @@ static void init_efi_initrd_table(struct efi_system_table *systab, initrd_table->size = initrd_size; } +static void init_efi_fdt_table(struct efi_system_table *systab) +{ + efi_guid_t tbl_guid = DEVICE_TREE_GUID; + + /* efi_configuration_table 3 */ + guidcpy(&systab->tables[2].guid, &tbl_guid); + systab->tables[2].table = (void *)FDT_BASE; + systab->nr_tables = 3; +} + static void init_systab(struct loongarch_boot_info *info, void *p, void *start) { void *bp_tables_start; @@ -138,6 +148,7 @@ static void init_systab(struct loongarch_boot_info *info, void *p, void *start) sizeof(efi_memory_desc_t) * memmap_entries, 64 * KiB); init_efi_initrd_table(systab, p, start); p += ROUND_UP(sizeof(struct efi_initrd), 64 * KiB); + init_efi_fdt_table(systab); systab->tables = (struct efi_configuration_table *)(bp_tables_start - start); } diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index 028356acf5..99a3dc8696 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -820,7 +820,6 @@ static void loongarch_init(MachineState *machine) int nb_numa_nodes = machine->numa_state->num_nodes; NodeInfo *numa_info = machine->numa_state->nodes; int i; - hwaddr fdt_base; const CPUArchIdList *possible_cpus; MachineClass *mc = MACHINE_GET_CLASS(machine); CPUState *cpu; @@ -949,12 +948,11 @@ static void loongarch_init(MachineState *machine) * Put the FDT into the memory map as a ROM image: this will ensure * the FDT is copied again upon reset, even if addr points into RAM. 
*/ - fdt_base = 1 * MiB; qemu_fdt_dumpdtb(machine->fdt, lams->fdt_size); - rom_add_blob_fixed_as("fdt", machine->fdt, lams->fdt_size, fdt_base, + rom_add_blob_fixed_as("fdt", machine->fdt, lams->fdt_size, FDT_BASE, &address_space_memory); qemu_register_reset_nosnapshotload(qemu_fdt_randomize_seeds, - rom_ptr_for_as(&address_space_memory, fdt_base, lams->fdt_size)); + rom_ptr_for_as(&address_space_memory, FDT_BASE, lams->fdt_size)); lams->bootinfo.ram_size = ram_size; loongarch_load_kernel(machine, &lams->bootinfo); diff --git a/include/hw/loongarch/boot.h b/include/hw/loongarch/boot.h index 42d1ee3663..4ebcc89dcf 100644 --- a/include/hw/loongarch/boot.h +++ b/include/hw/loongarch/boot.h @@ -34,6 +34,10 @@ typedef struct { EFI_GUID(0x5568e427, 0x68fc, 0x4f3d, 0xac, 0x74, \ 0xca, 0x55, 0x52, 0x31, 0xcc, 0x68) +#define DEVICE_TREE_GUID \ + EFI_GUID(0xb1b621d5, 0xf19c, 0x41a5, 0x83, 0x0b, \ + 0xd9, 0x15, 0x2c, 0x69, 0xaa, 0xe0) + struct efi_config_table { efi_guid_t guid; uint64_t *ptr; diff --git a/include/hw/loongarch/virt.h b/include/hw/loongarch/virt.h index 2f9eaf4b0e..673b57aa2b 100644 --- a/include/hw/loongarch/virt.h +++ b/include/hw/loongarch/virt.h @@ -35,6 +35,8 @@ #define COMMAND_LINE_SIZE 512 +#define FDT_BASE 0x100000 + extern struct memmap_entry *memmap_table; extern unsigned memmap_entries; -- Gitee From cd506fbf0d9a00aa0f25de1e7bd26ad4335c8257 Mon Sep 17 00:00:00 2001 From: Song Gao Date: Fri, 26 Apr 2024 17:15:44 +0800 Subject: [PATCH 344/939] hw/loongarch: fdt adds cpu interrupt controller node fdt adds cpu interrupt controller node, we use 'loongson,cpu-interrupt-controller'. See: https://github.com/torvalds/linux/blob/v6.7/drivers/irqchip/irq-loongarch-cpu.c https://lore.kernel.org/r/20221114113824.1880-2-liupeibao@loongson.cn Signed-off-by: Song Gao Reviewed-by: Bibo Mao Message-Id: <20240426091551.2397867-11-gaosong@loongson.cn> --- hw/loongarch/virt.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index 99a3dc8696..fdc4a5d708 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -133,6 +133,23 @@ static void virt_flash_map(LoongArchMachineState *lams, virt_flash_map1(flash1, VIRT_FLASH1_BASE, VIRT_FLASH1_SIZE, sysmem); } +static void fdt_add_cpuic_node(LoongArchMachineState *lams, + uint32_t *cpuintc_phandle) +{ + MachineState *ms = MACHINE(lams); + char *nodename; + + *cpuintc_phandle = qemu_fdt_alloc_phandle(ms->fdt); + nodename = g_strdup_printf("/cpuic"); + qemu_fdt_add_subnode(ms->fdt, nodename); + qemu_fdt_setprop_cell(ms->fdt, nodename, "phandle", *cpuintc_phandle); + qemu_fdt_setprop_string(ms->fdt, nodename, "compatible", + "loongson,cpu-interrupt-controller"); + qemu_fdt_setprop(ms->fdt, nodename, "interrupt-controller", NULL, 0); + qemu_fdt_setprop_cell(ms->fdt, nodename, "#interrupt-cells", 1); + g_free(nodename); +} + static void fdt_add_flash_node(LoongArchMachineState *lams) { MachineState *ms = MACHINE(lams); @@ -557,6 +574,7 @@ static void loongarch_irq_init(LoongArchMachineState *lams) CPULoongArchState *env; CPUState *cpu_state; int cpu, pin, i, start, num; + uint32_t cpuintc_phandle; /* * The connection of interrupts: @@ -591,6 +609,9 @@ static void loongarch_irq_init(LoongArchMachineState *lams) memory_region_add_subregion(&lams->system_iocsr, MAIL_SEND_ADDR, sysbus_mmio_get_region(SYS_BUS_DEVICE(ipi), 1)); + /* Add cpu interrupt-controller */ + fdt_add_cpuic_node(lams, &cpuintc_phandle); + for (cpu = 0; cpu < ms->smp.cpus; cpu++) { cpu_state = qemu_get_cpu(cpu); cpudev = 
DEVICE(cpu_state); -- Gitee From ed42940a2d943fd0e666e46bbc9b599b9ed1bd75 Mon Sep 17 00:00:00 2001 From: Song Gao Date: Fri, 26 Apr 2024 17:15:45 +0800 Subject: [PATCH 345/939] hw/loongarch: fdt adds Extend I/O Interrupt Controller fdt adds Extend I/O Interrupt Controller, we use 'loongson,ls2k2000-eiointc'. See: https://github.com/torvalds/linux/blob/v6.7/drivers/irqchip/irq-loongson-eiointc.c https://lore.kernel.org/r/764e02d924094580ac0f1d15535f4b98308705c6.1683279769.git.zhoubinbin@loongson.cn Signed-off-by: Song Gao Reviewed-by: Bibo Mao Message-Id: <20240426091551.2397867-12-gaosong@loongson.cn> --- hw/loongarch/virt.c | 30 +++++++++++++++++++++++++++++- include/hw/intc/loongarch_extioi.h | 1 + 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index fdc4a5d708..820eb52cba 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -150,6 +150,31 @@ static void fdt_add_cpuic_node(LoongArchMachineState *lams, g_free(nodename); } +static void fdt_add_eiointc_node(LoongArchMachineState *lams, + uint32_t *cpuintc_phandle, + uint32_t *eiointc_phandle) +{ + MachineState *ms = MACHINE(lams); + char *nodename; + hwaddr extioi_base = APIC_BASE; + hwaddr extioi_size = EXTIOI_SIZE; + + *eiointc_phandle = qemu_fdt_alloc_phandle(ms->fdt); + nodename = g_strdup_printf("/eiointc@%" PRIx64, extioi_base); + qemu_fdt_add_subnode(ms->fdt, nodename); + qemu_fdt_setprop_cell(ms->fdt, nodename, "phandle", *eiointc_phandle); + qemu_fdt_setprop_string(ms->fdt, nodename, "compatible", + "loongson,ls2k2000-eiointc"); + qemu_fdt_setprop(ms->fdt, nodename, "interrupt-controller", NULL, 0); + qemu_fdt_setprop_cell(ms->fdt, nodename, "#interrupt-cells", 1); + qemu_fdt_setprop_cell(ms->fdt, nodename, "interrupt-parent", + *cpuintc_phandle); + qemu_fdt_setprop_cell(ms->fdt, nodename, "interrupts", 3); + qemu_fdt_setprop_cells(ms->fdt, nodename, "reg", 0x0, + extioi_base, 0x0, extioi_size); + g_free(nodename); +} + static void fdt_add_flash_node(LoongArchMachineState *lams) { MachineState *ms = MACHINE(lams); @@ -574,7 +599,7 @@ static void loongarch_irq_init(LoongArchMachineState *lams) CPULoongArchState *env; CPUState *cpu_state; int cpu, pin, i, start, num; - uint32_t cpuintc_phandle; + uint32_t cpuintc_phandle, eiointc_phandle; /* * The connection of interrupts: @@ -652,6 +677,9 @@ static void loongarch_irq_init(LoongArchMachineState *lams) } } + /* Add Extend I/O Interrupt Controller node */ + fdt_add_eiointc_node(lams, &cpuintc_phandle, &eiointc_phandle); + pch_pic = qdev_new(TYPE_LOONGARCH_PCH_PIC); num = VIRT_PCH_PIC_IRQ_NUM; qdev_prop_set_uint32(pch_pic, "pch_pic_irq_num", num); diff --git a/include/hw/intc/loongarch_extioi.h b/include/hw/intc/loongarch_extioi.h index 98f348c49d..722ffee1bc 100644 --- a/include/hw/intc/loongarch_extioi.h +++ b/include/hw/intc/loongarch_extioi.h @@ -39,6 +39,7 @@ #define EXTIOI_COREISR_END (0xB20 - APIC_OFFSET) #define EXTIOI_COREMAP_START (0xC00 - APIC_OFFSET) #define EXTIOI_COREMAP_END (0xD00 - APIC_OFFSET) +#define EXTIOI_SIZE 0x800 #define EXTIOI_VIRT_BASE (0x40000000) #define EXTIOI_VIRT_SIZE (0x1000) -- Gitee From 78222abb3bde044b4520f23c6fc2f0f0bd805d2a Mon Sep 17 00:00:00 2001 From: Song Gao Date: Fri, 26 Apr 2024 17:15:46 +0800 Subject: [PATCH 346/939] hw/loongarch: fdt adds pch_pic Controller fdt adds pch pic controller, we use 'loongson,pch-pic-1.0' See: https://github.com/torvalds/linux/blob/v6.7/drivers/irqchip/irq-loongson-pch-pic.c https://lore.kernel.org/r/20200528152757.1028711-4-jiaxun.yang@flygoat.com 
Signed-off-by: Song Gao Reviewed-by: Bibo Mao Message-Id: <20240426091551.2397867-13-gaosong@loongson.cn> --- hw/loongarch/virt.c | 30 +++++++++++++++++++++++++++++- include/hw/pci-host/ls7a.h | 1 + 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index 820eb52cba..36fcfd12eb 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -175,6 +175,31 @@ static void fdt_add_eiointc_node(LoongArchMachineState *lams, g_free(nodename); } +static void fdt_add_pch_pic_node(LoongArchMachineState *lams, + uint32_t *eiointc_phandle, + uint32_t *pch_pic_phandle) +{ + MachineState *ms = MACHINE(lams); + char *nodename; + hwaddr pch_pic_base = VIRT_PCH_REG_BASE; + hwaddr pch_pic_size = VIRT_PCH_REG_SIZE; + + *pch_pic_phandle = qemu_fdt_alloc_phandle(ms->fdt); + nodename = g_strdup_printf("/platic@%" PRIx64, pch_pic_base); + qemu_fdt_add_subnode(ms->fdt, nodename); + qemu_fdt_setprop_cell(ms->fdt, nodename, "phandle", *pch_pic_phandle); + qemu_fdt_setprop_string(ms->fdt, nodename, "compatible", + "loongson,pch-pic-1.0"); + qemu_fdt_setprop_cells(ms->fdt, nodename, "reg", 0, + pch_pic_base, 0, pch_pic_size); + qemu_fdt_setprop(ms->fdt, nodename, "interrupt-controller", NULL, 0); + qemu_fdt_setprop_cell(ms->fdt, nodename, "#interrupt-cells", 2); + qemu_fdt_setprop_cell(ms->fdt, nodename, "interrupt-parent", + *eiointc_phandle); + qemu_fdt_setprop_cell(ms->fdt, nodename, "loongson,pic-base-vec", 0); + g_free(nodename); +} + static void fdt_add_flash_node(LoongArchMachineState *lams) { MachineState *ms = MACHINE(lams); @@ -599,7 +624,7 @@ static void loongarch_irq_init(LoongArchMachineState *lams) CPULoongArchState *env; CPUState *cpu_state; int cpu, pin, i, start, num; - uint32_t cpuintc_phandle, eiointc_phandle; + uint32_t cpuintc_phandle, eiointc_phandle, pch_pic_phandle; /* * The connection of interrupts: @@ -699,6 +724,9 @@ static void loongarch_irq_init(LoongArchMachineState *lams) qdev_connect_gpio_out(DEVICE(d), i, qdev_get_gpio_in(extioi, i)); } + /* Add PCH PIC node */ + fdt_add_pch_pic_node(lams, &eiointc_phandle, &pch_pic_phandle); + pch_msi = qdev_new(TYPE_LOONGARCH_PCH_MSI); start = num; num = EXTIOI_IRQS - start; diff --git a/include/hw/pci-host/ls7a.h b/include/hw/pci-host/ls7a.h index e753449593..fe260f0183 100644 --- a/include/hw/pci-host/ls7a.h +++ b/include/hw/pci-host/ls7a.h @@ -24,6 +24,7 @@ #define VIRT_PCH_REG_BASE 0x10000000UL #define VIRT_IOAPIC_REG_BASE (VIRT_PCH_REG_BASE) #define VIRT_PCH_MSI_ADDR_LOW 0x2FF00000UL +#define VIRT_PCH_REG_SIZE 0x400 /* * GSI_BASE is hard-coded with 64 in linux kernel, else kernel fails to boot -- Gitee From ea34d3896abfaf67cdf7fdb3cb205cc5a0e2e708 Mon Sep 17 00:00:00 2001 From: Song Gao Date: Fri, 26 Apr 2024 17:15:47 +0800 Subject: [PATCH 347/939] hw/loongarch: fdt adds pch_msi Controller fdt adds pch msi controller, we use 'loongson,pch-msi-1.0'. 
See: https://github.com/torvalds/linux/blob/v6.7/drivers/irqchip/irq-loongson-pch-msi.c https://lore.kernel.org/r/20200528152757.1028711-6-jiaxun.yang@flygoat.com Signed-off-by: Song Gao Reviewed-by: Bibo Mao Message-Id: <20240426091551.2397867-14-gaosong@loongson.cn> --- hw/loongarch/virt.c | 33 ++++++++++++++++++++++++++++++++- include/hw/pci-host/ls7a.h | 1 + 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index 36fcfd12eb..032106ebad 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -200,6 +200,34 @@ static void fdt_add_pch_pic_node(LoongArchMachineState *lams, g_free(nodename); } +static void fdt_add_pch_msi_node(LoongArchMachineState *lams, + uint32_t *eiointc_phandle, + uint32_t *pch_msi_phandle) +{ + MachineState *ms = MACHINE(lams); + char *nodename; + hwaddr pch_msi_base = VIRT_PCH_MSI_ADDR_LOW; + hwaddr pch_msi_size = VIRT_PCH_MSI_SIZE; + + *pch_msi_phandle = qemu_fdt_alloc_phandle(ms->fdt); + nodename = g_strdup_printf("/msi@%" PRIx64, pch_msi_base); + qemu_fdt_add_subnode(ms->fdt, nodename); + qemu_fdt_setprop_cell(ms->fdt, nodename, "phandle", *pch_msi_phandle); + qemu_fdt_setprop_string(ms->fdt, nodename, "compatible", + "loongson,pch-msi-1.0"); + qemu_fdt_setprop_cells(ms->fdt, nodename, "reg", + 0, pch_msi_base, + 0, pch_msi_size); + qemu_fdt_setprop(ms->fdt, nodename, "interrupt-controller", NULL, 0); + qemu_fdt_setprop_cell(ms->fdt, nodename, "interrupt-parent", + *eiointc_phandle); + qemu_fdt_setprop_cell(ms->fdt, nodename, "loongson,msi-base-vec", + VIRT_PCH_PIC_IRQ_NUM); + qemu_fdt_setprop_cell(ms->fdt, nodename, "loongson,msi-num-vecs", + EXTIOI_IRQS - VIRT_PCH_PIC_IRQ_NUM); + g_free(nodename); +} + static void fdt_add_flash_node(LoongArchMachineState *lams) { MachineState *ms = MACHINE(lams); @@ -624,7 +652,7 @@ static void loongarch_irq_init(LoongArchMachineState *lams) CPULoongArchState *env; CPUState *cpu_state; int cpu, pin, i, start, num; - uint32_t cpuintc_phandle, eiointc_phandle, pch_pic_phandle; + uint32_t cpuintc_phandle, eiointc_phandle, pch_pic_phandle, pch_msi_phandle; /* * The connection of interrupts: @@ -741,6 +769,9 @@ static void loongarch_irq_init(LoongArchMachineState *lams) qdev_get_gpio_in(extioi, i + start)); } + /* Add PCH MSI node */ + fdt_add_pch_msi_node(lams, &eiointc_phandle, &pch_msi_phandle); + loongarch_devices_init(pch_pic, lams); } diff --git a/include/hw/pci-host/ls7a.h b/include/hw/pci-host/ls7a.h index fe260f0183..cd7c9ec7bc 100644 --- a/include/hw/pci-host/ls7a.h +++ b/include/hw/pci-host/ls7a.h @@ -25,6 +25,7 @@ #define VIRT_IOAPIC_REG_BASE (VIRT_PCH_REG_BASE) #define VIRT_PCH_MSI_ADDR_LOW 0x2FF00000UL #define VIRT_PCH_REG_SIZE 0x400 +#define VIRT_PCH_MSI_SIZE 0x8 /* * GSI_BASE is hard-coded with 64 in linux kernel, else kernel fails to boot -- Gitee From 1325effbd595781b9ab75dceab9f87944156c606 Mon Sep 17 00:00:00 2001 From: Song Gao Date: Fri, 26 Apr 2024 17:15:48 +0800 Subject: [PATCH 348/939] hw/loongarch: fdt adds pcie irq_map node This patch adds pcie irq_map node for FDT. 
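The interrupt-map entries added below follow the standard per-slot swizzle; a small standalone sketch (illustrative only, assuming GPEX_NUM_IRQS is 4 as for QEMU's generic PCIe host, and that 16 is the first pch_pic input used for PCIe INTx, matching gpex_set_irq_num(..., 16 + i)):

    #include <stdio.h>

    #define GPEX_NUM_IRQS   4                       /* assumed value */
    #define PCI_SLOT(devfn) (((devfn) >> 3) & 0x1f) /* standard PCI encoding */

    int main(void)
    {
        int dev, pin;

        for (dev = 0; dev < GPEX_NUM_IRQS; dev++) {
            int devfn = dev * 0x8;

            for (pin = 0; pin < GPEX_NUM_IRQS; pin++) {
                /* Same formula as fdt_add_pcie_irq_map_node() below. */
                int irq_nr = 16 + ((pin + PCI_SLOT(devfn)) % GPEX_NUM_IRQS);

                printf("slot %d INT%c -> pch_pic input %d\n",
                       PCI_SLOT(devfn), 'A' + pin, irq_nr);
            }
        }
        return 0;
    }

Slot 0 INTA thus lands on input 16, slot 1 INTA on 17, and so on, wrapping every GPEX_NUM_IRQS slots, so one table entry per slot/pin pair is enough once interrupt-map-mask is applied.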
Signed-off-by: Song Gao Reviewed-by: Bibo Mao Message-Id: <20240426091551.2397867-15-gaosong@loongson.cn> --- hw/loongarch/virt.c | 73 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 69 insertions(+), 4 deletions(-) diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index 032106ebad..c32cc3c818 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -379,7 +379,62 @@ static void fdt_add_fw_cfg_node(const LoongArchMachineState *lams) g_free(nodename); } -static void fdt_add_pcie_node(const LoongArchMachineState *lams) +static void fdt_add_pcie_irq_map_node(const LoongArchMachineState *lams, + char *nodename, + uint32_t *pch_pic_phandle) +{ + int pin, dev; + uint32_t irq_map_stride = 0; + uint32_t full_irq_map[GPEX_NUM_IRQS *GPEX_NUM_IRQS * 10] = {}; + uint32_t *irq_map = full_irq_map; + const MachineState *ms = MACHINE(lams); + + /* This code creates a standard swizzle of interrupts such that + * each device's first interrupt is based on it's PCI_SLOT number. + * (See pci_swizzle_map_irq_fn()) + * + * We only need one entry per interrupt in the table (not one per + * possible slot) seeing the interrupt-map-mask will allow the table + * to wrap to any number of devices. + */ + + for (dev = 0; dev < GPEX_NUM_IRQS; dev++) { + int devfn = dev * 0x8; + + for (pin = 0; pin < GPEX_NUM_IRQS; pin++) { + int irq_nr = 16 + ((pin + PCI_SLOT(devfn)) % GPEX_NUM_IRQS); + int i = 0; + + /* Fill PCI address cells */ + irq_map[i] = cpu_to_be32(devfn << 8); + i += 3; + + /* Fill PCI Interrupt cells */ + irq_map[i] = cpu_to_be32(pin + 1); + i += 1; + + /* Fill interrupt controller phandle and cells */ + irq_map[i++] = cpu_to_be32(*pch_pic_phandle); + irq_map[i++] = cpu_to_be32(irq_nr); + + if (!irq_map_stride) { + irq_map_stride = i; + } + irq_map += irq_map_stride; + } + } + + + qemu_fdt_setprop(ms->fdt, nodename, "interrupt-map", full_irq_map, + GPEX_NUM_IRQS * GPEX_NUM_IRQS * + irq_map_stride * sizeof(uint32_t)); + qemu_fdt_setprop_cells(ms->fdt, nodename, "interrupt-map-mask", + 0x1800, 0, 0, 0x7); +} + +static void fdt_add_pcie_node(const LoongArchMachineState *lams, + uint32_t *pch_pic_phandle, + uint32_t *pch_msi_phandle) { char *nodename; hwaddr base_mmio = VIRT_PCI_MEM_BASE; @@ -410,6 +465,11 @@ static void fdt_add_pcie_node(const LoongArchMachineState *lams) 2, base_pio, 2, size_pio, 1, FDT_PCI_RANGE_MMIO, 2, base_mmio, 2, base_mmio, 2, size_mmio); + qemu_fdt_setprop_cells(ms->fdt, nodename, "msi-map", + 0, *pch_msi_phandle, 0, 0x10000); + + fdt_add_pcie_irq_map_node(lams, nodename, pch_pic_phandle); + g_free(nodename); } @@ -569,7 +629,10 @@ static DeviceState *create_platform_bus(DeviceState *pch_pic) return dev; } -static void loongarch_devices_init(DeviceState *pch_pic, LoongArchMachineState *lams) +static void loongarch_devices_init(DeviceState *pch_pic, + LoongArchMachineState *lams, + uint32_t *pch_pic_phandle, + uint32_t *pch_msi_phandle) { MachineClass *mc = MACHINE_GET_CLASS(lams); DeviceState *gpex_dev; @@ -615,6 +678,9 @@ static void loongarch_devices_init(DeviceState *pch_pic, LoongArchMachineState * gpex_set_irq_num(GPEX_HOST(gpex_dev), i, 16 + i); } + /* Add pcie node */ + fdt_add_pcie_node(lams, pch_pic_phandle, pch_msi_phandle); + serial_mm_init(get_system_memory(), VIRT_UART_BASE, 0, qdev_get_gpio_in(pch_pic, VIRT_UART_IRQ - VIRT_GSI_BASE), @@ -772,7 +838,7 @@ static void loongarch_irq_init(LoongArchMachineState *lams) /* Add PCH MSI node */ fdt_add_pch_msi_node(lams, &eiointc_phandle, &pch_msi_phandle); - loongarch_devices_init(pch_pic, lams); + 
loongarch_devices_init(pch_pic, lams, &pch_pic_phandle, &pch_msi_phandle); } static void loongarch_firmware_init(LoongArchMachineState *lams) @@ -1048,7 +1114,6 @@ static void loongarch_init(MachineState *machine) lams->powerdown_notifier.notify = virt_powerdown_req; qemu_register_powerdown_notifier(&lams->powerdown_notifier); - fdt_add_pcie_node(lams); /* * Since lowmem region starts from 0 and Linux kernel legacy start address * at 2 MiB, FDT base address is located at 1 MiB to avoid NULL pointer -- Gitee From e87697c72641ab2209d4004f573f47283d118235 Mon Sep 17 00:00:00 2001 From: Song Gao Date: Fri, 26 Apr 2024 17:15:49 +0800 Subject: [PATCH 349/939] hw/loongarch: fdt remove unused irqchip node This patch removes the unused fdt irqchip node. Signed-off-by: Song Gao Reviewed-by: Bibo Mao Message-Id: <20240426091551.2397867-16-gaosong@loongson.cn> --- hw/loongarch/virt.c | 31 +------------------------------ 1 file changed, 1 insertion(+), 30 deletions(-) diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index c32cc3c818..ff9513034b 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -473,34 +473,6 @@ static void fdt_add_pcie_node(const LoongArchMachineState *lams, g_free(nodename); } -static void fdt_add_irqchip_node(LoongArchMachineState *lams) -{ - MachineState *ms = MACHINE(lams); - char *nodename; - uint32_t irqchip_phandle; - - irqchip_phandle = qemu_fdt_alloc_phandle(ms->fdt); - qemu_fdt_setprop_cell(ms->fdt, "/", "interrupt-parent", irqchip_phandle); - - nodename = g_strdup_printf("/intc@%lx", VIRT_IOAPIC_REG_BASE); - qemu_fdt_add_subnode(ms->fdt, nodename); - qemu_fdt_setprop_cell(ms->fdt, nodename, "#interrupt-cells", 3); - qemu_fdt_setprop(ms->fdt, nodename, "interrupt-controller", NULL, 0); - qemu_fdt_setprop_cell(ms->fdt, nodename, "#address-cells", 0x2); - qemu_fdt_setprop_cell(ms->fdt, nodename, "#size-cells", 0x2); - qemu_fdt_setprop(ms->fdt, nodename, "ranges", NULL, 0); - - qemu_fdt_setprop_string(ms->fdt, nodename, "compatible", - "loongarch,ls7a"); - - qemu_fdt_setprop_sized_cells(ms->fdt, nodename, "reg", - 2, VIRT_IOAPIC_REG_BASE, - 2, PCH_PIC_ROUTE_ENTRY_OFFSET); - - qemu_fdt_setprop_cell(ms->fdt, nodename, "phandle", irqchip_phandle); - g_free(nodename); -} - static void fdt_add_memory_node(MachineState *ms, uint64_t base, uint64_t size, int node_id) { @@ -1103,8 +1075,7 @@ static void loongarch_init(MachineState *machine) /* Initialize the IO interrupt subsystem */ loongarch_irq_init(lams); - fdt_add_irqchip_node(lams); - platform_bus_add_all_fdt_nodes(machine->fdt, "/intc", + platform_bus_add_all_fdt_nodes(machine->fdt, "/platic", VIRT_PLATFORM_BUS_BASEADDRESS, VIRT_PLATFORM_BUS_SIZE, VIRT_PLATFORM_BUS_IRQ); -- Gitee From 33994eff45e75e91acf0a4753fec77ad0027e4dd Mon Sep 17 00:00:00 2001 From: Song Gao Date: Fri, 26 Apr 2024 17:15:50 +0800 Subject: [PATCH 350/939] hw/loongarch: Add cells missing from uart node uart node need interrupts and interrupt-parent cells. 
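The rtc node in the following patch needs the same two properties, so the pattern can be read as a tiny helper. The sketch below is hypothetical (no such helper exists in this series) and only reuses the qemu_fdt_setprop_cells()/qemu_fdt_setprop_cell() calls already present in virt.c; the trailing 0x4 cell in "interrupts" is the usual level-triggered, active-high flag of the common interrupt binding:

/*
 * Hypothetical helper, not part of the series: route a device node's
 * GSI to the pch-pic through its phandle, level-triggered/active-high.
 */
static void fdt_connect_pch_pic_irq(MachineState *ms, const char *nodename,
                                    uint32_t pch_pic_phandle, int gsi_offset)
{
    qemu_fdt_setprop_cells(ms->fdt, nodename, "interrupts", gsi_offset, 0x4);
    qemu_fdt_setprop_cell(ms->fdt, nodename, "interrupt-parent",
                          pch_pic_phandle);
}

The uart patch below passes VIRT_UART_IRQ - VIRT_GSI_BASE as the offset; the rtc patch passes VIRT_RTC_IRQ - VIRT_GSI_BASE.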
Signed-off-by: Song Gao Reviewed-by: Bibo Mao Message-Id: <20240426091551.2397867-17-gaosong@loongson.cn> --- hw/loongarch/virt.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index ff9513034b..a6aea52ebb 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -272,7 +272,8 @@ static void fdt_add_rtc_node(LoongArchMachineState *lams) g_free(nodename); } -static void fdt_add_uart_node(LoongArchMachineState *lams) +static void fdt_add_uart_node(LoongArchMachineState *lams, + uint32_t *pch_pic_phandle) { char *nodename; hwaddr base = VIRT_UART_BASE; @@ -285,6 +286,10 @@ static void fdt_add_uart_node(LoongArchMachineState *lams) qemu_fdt_setprop_cells(ms->fdt, nodename, "reg", 0x0, base, 0x0, size); qemu_fdt_setprop_cell(ms->fdt, nodename, "clock-frequency", 100000000); qemu_fdt_setprop_string(ms->fdt, "/chosen", "stdout-path", nodename); + qemu_fdt_setprop_cells(ms->fdt, nodename, "interrupts", + VIRT_UART_IRQ - VIRT_GSI_BASE, 0x4); + qemu_fdt_setprop_cell(ms->fdt, nodename, "interrupt-parent", + *pch_pic_phandle); g_free(nodename); } @@ -657,7 +662,7 @@ static void loongarch_devices_init(DeviceState *pch_pic, qdev_get_gpio_in(pch_pic, VIRT_UART_IRQ - VIRT_GSI_BASE), 115200, serial_hd(0), DEVICE_LITTLE_ENDIAN); - fdt_add_uart_node(lams); + fdt_add_uart_node(lams, pch_pic_phandle); /* Network init */ for (i = 0; i < nb_nics; i++) { -- Gitee From 7266141c658cd00426922534a7de4dd5d89486b2 Mon Sep 17 00:00:00 2001 From: Song Gao Date: Fri, 26 Apr 2024 17:15:51 +0800 Subject: [PATCH 351/939] hw/loongarch: Add cells missing from rtc node rtc node need interrupts and interrupt-parent cells. Signed-off-by: Song Gao Reviewed-by: Bibo Mao Message-Id: <20240426091551.2397867-18-gaosong@loongson.cn> --- hw/loongarch/virt.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index a6aea52ebb..0972ebd150 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -258,7 +258,8 @@ static void fdt_add_flash_node(LoongArchMachineState *lams) g_free(nodename); } -static void fdt_add_rtc_node(LoongArchMachineState *lams) +static void fdt_add_rtc_node(LoongArchMachineState *lams, + uint32_t *pch_pic_phandle) { char *nodename; hwaddr base = VIRT_RTC_REG_BASE; @@ -267,8 +268,13 @@ static void fdt_add_rtc_node(LoongArchMachineState *lams) nodename = g_strdup_printf("/rtc@%" PRIx64, base); qemu_fdt_add_subnode(ms->fdt, nodename); - qemu_fdt_setprop_string(ms->fdt, nodename, "compatible", "loongson,ls7a-rtc"); + qemu_fdt_setprop_string(ms->fdt, nodename, "compatible", + "loongson,ls7a-rtc"); qemu_fdt_setprop_sized_cells(ms->fdt, nodename, "reg", 2, base, 2, size); + qemu_fdt_setprop_cells(ms->fdt, nodename, "interrupts", + VIRT_RTC_IRQ - VIRT_GSI_BASE , 0x4); + qemu_fdt_setprop_cell(ms->fdt, nodename, "interrupt-parent", + *pch_pic_phandle); g_free(nodename); } @@ -677,7 +683,7 @@ static void loongarch_devices_init(DeviceState *pch_pic, sysbus_create_simple("ls7a_rtc", VIRT_RTC_REG_BASE, qdev_get_gpio_in(pch_pic, VIRT_RTC_IRQ - VIRT_GSI_BASE)); - fdt_add_rtc_node(lams); + fdt_add_rtc_node(lams, pch_pic_phandle); /* acpi ged */ lams->acpi_ged = create_acpi_ged(pch_pic, lams); -- Gitee From 0e0326de88282a601ea5178d421242d5b77afbfa Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 25 Jan 2024 13:36:37 +0100 Subject: [PATCH 352/939] loongarch: switch boards to "default y" Some targets use "default y" for boards to filter out those that require TCG. 
For consistency we are switching all other targets to do the same. Continue with Loongarch. No changes to generated config-devices.mak file. Signed-off-by: Paolo Bonzini --- .gitlab-ci.d/buildtest.yml | 2 ++ configs/devices/loongarch64-softmmu/default.mak | 6 +++++- hw/loongarch/Kconfig | 2 ++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/.gitlab-ci.d/buildtest.yml b/.gitlab-ci.d/buildtest.yml index 91663946de..3fb99e79e9 100644 --- a/.gitlab-ci.d/buildtest.yml +++ b/.gitlab-ci.d/buildtest.yml @@ -579,6 +579,8 @@ build-tci: - make check-tcg # Check our reduced build configurations +# requires libfdt: aarch64, arm, i386, loongarch64, x86_64 +# does not build without boards: i386, loongarch64, x86_64 build-without-defaults: extends: .native_build_job_template needs: diff --git a/configs/devices/loongarch64-softmmu/default.mak b/configs/devices/loongarch64-softmmu/default.mak index 928bc117ef..ffe705836f 100644 --- a/configs/devices/loongarch64-softmmu/default.mak +++ b/configs/devices/loongarch64-softmmu/default.mak @@ -1,3 +1,7 @@ # Default configuration for loongarch64-softmmu -CONFIG_LOONGARCH_VIRT=y +# Uncomment the following lines to disable these optional devices: +# CONFIG_PCI_DEVICES=n + +# Boards are selected by default, uncomment to keep out of the build. +# CONFIG_LOONGARCH_VIRT=n diff --git a/hw/loongarch/Kconfig b/hw/loongarch/Kconfig index 5727efed6d..7864050563 100644 --- a/hw/loongarch/Kconfig +++ b/hw/loongarch/Kconfig @@ -1,5 +1,7 @@ config LOONGARCH_VIRT bool + default y + depends on LOONGARCH64 select PCI select PCI_EXPRESS_GENERIC_BRIDGE imply VIRTIO_VGA -- Gitee From 5e4d612de23539499b9a22986bebe9a3007edae1 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 7 May 2024 16:51:35 +0200 Subject: [PATCH 353/939] hw/loongarch: move memory map to boot.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ensure that it can be used even if virt.c is not included in the build, as is the case for --without-default-devices. 
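Concretely this is a declaration/definition split: boot.c (which, per this commit message, remains in the build even when virt.c is left out) now owns the single definition, while the header only declares the symbols that board code fills in. A reduced sketch of the resulting layout, with contents trimmed to the essentials of the hunks below:

/* include/hw/loongarch/boot.h -- type and declarations only */
struct memmap_entry {
    uint64_t address;
    uint64_t length;
    uint32_t type;
    uint32_t reserved;
};
extern struct memmap_entry *memmap_table;
extern unsigned memmap_entries;

/* hw/loongarch/boot.c -- the single definition */
struct memmap_entry *memmap_table;
unsigned memmap_entries;

/* hw/loongarch/virt.c -- board code only fills the table via memmap_add_entry() */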
Signed-off-by: Paolo Bonzini Acked-by: Richard Henderson Message-ID: <20240507145135.270803-1-pbonzini@redhat.com> Signed-off-by: Philippe Mathieu-Daudé --- .gitlab-ci.d/buildtest.yml | 5 +++-- hw/loongarch/boot.c | 3 +++ hw/loongarch/virt.c | 3 --- include/hw/loongarch/boot.h | 10 ++++++++++ include/hw/loongarch/virt.h | 10 ---------- 5 files changed, 16 insertions(+), 15 deletions(-) diff --git a/.gitlab-ci.d/buildtest.yml b/.gitlab-ci.d/buildtest.yml index 3fb99e79e9..983c3c132e 100644 --- a/.gitlab-ci.d/buildtest.yml +++ b/.gitlab-ci.d/buildtest.yml @@ -579,8 +579,9 @@ build-tci: - make check-tcg # Check our reduced build configurations -# requires libfdt: aarch64, arm, i386, loongarch64, x86_64 -# does not build without boards: i386, loongarch64, x86_64 +# requires libfdt: aarch64, arm, i386, loongarch64, microblaze, microblazeel, +# mips64el, or1k, ppc, ppc64, riscv32, riscv64, rx, x86_64 +# does not build without boards: i386, s390x, sh4, sh4eb, x86_64 build-without-defaults: extends: .native_build_job_template needs: diff --git a/hw/loongarch/boot.c b/hw/loongarch/boot.c index 7d1630b2e7..03f6301a77 100644 --- a/hw/loongarch/boot.c +++ b/hw/loongarch/boot.c @@ -15,6 +15,9 @@ #include "sysemu/reset.h" #include "sysemu/qtest.h" +struct memmap_entry *memmap_table; +unsigned memmap_entries; + ram_addr_t initrd_offset; uint64_t initrd_size; diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index 0972ebd150..76b36539e2 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -543,9 +543,6 @@ static void virt_powerdown_req(Notifier *notifier, void *opaque) acpi_send_event(s->acpi_ged, ACPI_POWER_DOWN_STATUS); } -struct memmap_entry *memmap_table; -unsigned memmap_entries; - static void memmap_add_entry(uint64_t address, uint64_t length, uint32_t type) { /* Ensure there are no duplicate entries. */ diff --git a/include/hw/loongarch/boot.h b/include/hw/loongarch/boot.h index 4ebcc89dcf..b3b870df1f 100644 --- a/include/hw/loongarch/boot.h +++ b/include/hw/loongarch/boot.h @@ -104,6 +104,16 @@ struct loongarch_boot_info { uint64_t a0, a1, a2; }; +extern struct memmap_entry *memmap_table; +extern unsigned memmap_entries; + +struct memmap_entry { + uint64_t address; + uint64_t length; + uint32_t type; + uint32_t reserved; +}; + void loongarch_load_kernel(MachineState *ms, struct loongarch_boot_info *info); #endif /* HW_LOONGARCH_BOOT_H */ diff --git a/include/hw/loongarch/virt.h b/include/hw/loongarch/virt.h index 673b57aa2b..36158c758f 100644 --- a/include/hw/loongarch/virt.h +++ b/include/hw/loongarch/virt.h @@ -37,16 +37,6 @@ #define FDT_BASE 0x100000 -extern struct memmap_entry *memmap_table; -extern unsigned memmap_entries; - -struct memmap_entry { - uint64_t address; - uint64_t length; - uint32_t type; - uint32_t reserved; -}; - struct LoongArchMachineState { /*< private >*/ MachineState parent_obj; -- Gitee From 8e2986a6fc5dda2afbe33f723efdacd01f147b7a Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Wed, 8 May 2024 11:11:06 +0800 Subject: [PATCH 354/939] hw/loongarch: Rename LOONGARCH_MACHINE with LOONGARCH_VIRT_MACHINE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On LoongArch system, there is only virt machine type now, name LOONGARCH_MACHINE is confused, rename it with LOONGARCH_VIRT_MACHINE. Machine name about Other real hw boards can be added in future. 
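The name being changed is the QOM cast generated from OBJECT_DECLARE_SIMPLE_TYPE(LoongArchMachineState, LOONGARCH_MACHINE) in virt.h, which is why the patch is almost entirely mechanical call-site churn. Simplifying the helpers that include/qom/object.h actually declares, the macro provides a checked downcast of roughly this shape:

/* Simplified sketch of what the renamed QOM cast provides. */
static inline LoongArchMachineState *LOONGARCH_VIRT_MACHINE(const void *obj)
{
    return OBJECT_CHECK(LoongArchMachineState, obj,
                        TYPE_LOONGARCH_VIRT_MACHINE);
}

Every LOONGARCH_MACHINE(...) call site therefore has to change in the same commit; the follow-up patch does the analogous rename for the struct and variable names.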
Signed-off-by: Bibo Mao Reviewed-by: Philippe Mathieu-Daudé Message-ID: <20240508031110.2507477-2-maobibo@loongson.cn> Signed-off-by: Philippe Mathieu-Daudé --- hw/loongarch/acpi-build.c | 8 ++++---- hw/loongarch/boot.c | 2 +- hw/loongarch/virt.c | 19 +++++++++---------- include/hw/loongarch/virt.h | 4 ++-- 4 files changed, 16 insertions(+), 17 deletions(-) diff --git a/hw/loongarch/acpi-build.c b/hw/loongarch/acpi-build.c index f990405d04..fff3497c62 100644 --- a/hw/loongarch/acpi-build.c +++ b/hw/loongarch/acpi-build.c @@ -167,7 +167,7 @@ build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine) int i, arch_id, node_id; uint64_t mem_len, mem_base; int nb_numa_nodes = machine->numa_state->num_nodes; - LoongArchMachineState *lams = LOONGARCH_MACHINE(machine); + LoongArchMachineState *lams = LOONGARCH_VIRT_MACHINE(machine); MachineClass *mc = MACHINE_GET_CLASS(lams); const CPUArchIdList *arch_ids = mc->possible_cpu_arch_ids(machine); AcpiTable table = { .sig = "SRAT", .rev = 1, .oem_id = lams->oem_id, @@ -279,7 +279,7 @@ static void build_la_ged_aml(Aml *dsdt, MachineState *machine) { uint32_t event; - LoongArchMachineState *lams = LOONGARCH_MACHINE(machine); + LoongArchMachineState *lams = LOONGARCH_VIRT_MACHINE(machine); build_ged_aml(dsdt, "\\_SB."GED_DEVICE, HOTPLUG_HANDLER(lams->acpi_ged), @@ -391,7 +391,7 @@ static void build_dsdt(GArray *table_data, BIOSLinker *linker, MachineState *machine) { Aml *dsdt, *scope, *pkg; - LoongArchMachineState *lams = LOONGARCH_MACHINE(machine); + LoongArchMachineState *lams = LOONGARCH_VIRT_MACHINE(machine); AcpiTable table = { .sig = "DSDT", .rev = 1, .oem_id = lams->oem_id, .oem_table_id = lams->oem_table_id }; @@ -421,7 +421,7 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, MachineState *machine) static void acpi_build(AcpiBuildTables *tables, MachineState *machine) { - LoongArchMachineState *lams = LOONGARCH_MACHINE(machine); + LoongArchMachineState *lams = LOONGARCH_VIRT_MACHINE(machine); GArray *table_offsets; AcpiFadtData fadt_data; unsigned facs, rsdt, dsdt; diff --git a/hw/loongarch/boot.c b/hw/loongarch/boot.c index 03f6301a77..e37512729d 100644 --- a/hw/loongarch/boot.c +++ b/hw/loongarch/boot.c @@ -319,7 +319,7 @@ static void loongarch_direct_kernel_boot(struct loongarch_boot_info *info) void loongarch_load_kernel(MachineState *ms, struct loongarch_boot_info *info) { - LoongArchMachineState *lams = LOONGARCH_MACHINE(ms); + LoongArchMachineState *lams = LOONGARCH_VIRT_MACHINE(ms); int i; /* register reset function */ diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index 76b36539e2..cca220cb5b 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -970,7 +970,7 @@ static void loongarch_init(MachineState *machine) ram_addr_t ram_size = machine->ram_size; uint64_t highram_size = 0, phyAddr = 0; MemoryRegion *address_space_mem = get_system_memory(); - LoongArchMachineState *lams = LOONGARCH_MACHINE(machine); + LoongArchMachineState *lams = LOONGARCH_VIRT_MACHINE(machine); int nb_numa_nodes = machine->numa_state->num_nodes; NodeInfo *numa_info = machine->numa_state->nodes; int i; @@ -1121,7 +1121,7 @@ bool loongarch_is_acpi_enabled(LoongArchMachineState *lams) static void loongarch_get_acpi(Object *obj, Visitor *v, const char *name, void *opaque, Error **errp) { - LoongArchMachineState *lams = LOONGARCH_MACHINE(obj); + LoongArchMachineState *lams = LOONGARCH_VIRT_MACHINE(obj); OnOffAuto acpi = lams->acpi; visit_type_OnOffAuto(v, name, &acpi, errp); @@ -1130,14 +1130,14 @@ static void loongarch_get_acpi(Object 
*obj, Visitor *v, const char *name, static void loongarch_set_acpi(Object *obj, Visitor *v, const char *name, void *opaque, Error **errp) { - LoongArchMachineState *lams = LOONGARCH_MACHINE(obj); + LoongArchMachineState *lams = LOONGARCH_VIRT_MACHINE(obj); visit_type_OnOffAuto(v, name, &lams->acpi, errp); } static void loongarch_machine_initfn(Object *obj) { - LoongArchMachineState *lams = LOONGARCH_MACHINE(obj); + LoongArchMachineState *lams = LOONGARCH_VIRT_MACHINE(obj); if (tcg_enabled()) { lams->veiointc = ON_OFF_AUTO_OFF; @@ -1172,7 +1172,7 @@ static void virt_machine_device_pre_plug(HotplugHandler *hotplug_dev, static void virt_mem_unplug_request(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { - LoongArchMachineState *lams = LOONGARCH_MACHINE(hotplug_dev); + LoongArchMachineState *lams = LOONGARCH_VIRT_MACHINE(hotplug_dev); /* the acpi ged is always exist */ hotplug_handler_unplug_request(HOTPLUG_HANDLER(lams->acpi_ged), dev, @@ -1190,7 +1190,7 @@ static void virt_machine_device_unplug_request(HotplugHandler *hotplug_dev, static void virt_mem_unplug(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { - LoongArchMachineState *lams = LOONGARCH_MACHINE(hotplug_dev); + LoongArchMachineState *lams = LOONGARCH_VIRT_MACHINE(hotplug_dev); hotplug_handler_unplug(HOTPLUG_HANDLER(lams->acpi_ged), dev, errp); pc_dimm_unplug(PC_DIMM(dev), MACHINE(lams)); @@ -1208,7 +1208,7 @@ static void virt_machine_device_unplug(HotplugHandler *hotplug_dev, static void virt_mem_plug(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { - LoongArchMachineState *lams = LOONGARCH_MACHINE(hotplug_dev); + LoongArchMachineState *lams = LOONGARCH_VIRT_MACHINE(hotplug_dev); pc_dimm_plug(PC_DIMM(dev), MACHINE(lams)); hotplug_handler_plug(HOTPLUG_HANDLER(lams->acpi_ged), @@ -1218,7 +1218,7 @@ static void virt_mem_plug(HotplugHandler *hotplug_dev, static void loongarch_machine_device_plug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { - LoongArchMachineState *lams = LOONGARCH_MACHINE(hotplug_dev); + LoongArchMachineState *lams = LOONGARCH_VIRT_MACHINE(hotplug_dev); MachineClass *mc = MACHINE_GET_CLASS(lams); if (device_is_dynamic_sysbus(mc, dev)) { @@ -1300,7 +1300,6 @@ static void loongarch_class_init(ObjectClass *oc, void *data) MachineClass *mc = MACHINE_CLASS(oc); HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(oc); - mc->desc = "Loongson-3A5000 LS7A1000 machine"; mc->init = loongarch_init; mc->default_ram_size = 1 * GiB; mc->default_cpu_type = LOONGARCH_CPU_TYPE_NAME("la464"); @@ -1341,7 +1340,7 @@ static void loongarch_class_init(ObjectClass *oc, void *data) static const TypeInfo loongarch_machine_types[] = { { - .name = TYPE_LOONGARCH_MACHINE, + .name = TYPE_LOONGARCH_VIRT_MACHINE, .parent = TYPE_MACHINE, .instance_size = sizeof(LoongArchMachineState), .class_init = loongarch_class_init, diff --git a/include/hw/loongarch/virt.h b/include/hw/loongarch/virt.h index 36158c758f..0509b9a9af 100644 --- a/include/hw/loongarch/virt.h +++ b/include/hw/loongarch/virt.h @@ -66,8 +66,8 @@ struct LoongArchMachineState { struct loongarch_boot_info bootinfo; }; -#define TYPE_LOONGARCH_MACHINE MACHINE_TYPE_NAME("virt") -OBJECT_DECLARE_SIMPLE_TYPE(LoongArchMachineState, LOONGARCH_MACHINE) +#define TYPE_LOONGARCH_VIRT_MACHINE MACHINE_TYPE_NAME("virt") +OBJECT_DECLARE_SIMPLE_TYPE(LoongArchMachineState, LOONGARCH_VIRT_MACHINE) bool loongarch_is_acpi_enabled(LoongArchMachineState *lams); void loongarch_acpi_setup(LoongArchMachineState *lams); #endif -- Gitee From 
a501582ef5e986bfa9dc198c63582b3e35332643 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Wed, 8 May 2024 11:11:07 +0800 Subject: [PATCH 355/939] hw/loongarch: Rename LoongArchMachineState with LoongArchVirtMachineState MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename LoongArchMachineState with LoongArchVirtMachineState, and change variable name LoongArchMachineState *lams with LoongArchVirtMachineState *lvms. Rename function specific for virtmachine loongarch_xxx() with virt_xxx(). However some common functions keep unchanged such as loongarch_acpi_setup()/loongarch_load_kernel(), since there functions can be used for real hw boards. Signed-off-by: Bibo Mao Reviewed-by: Philippe Mathieu-Daudé Message-ID: <20240508031110.2507477-3-maobibo@loongson.cn> Signed-off-by: Philippe Mathieu-Daudé --- hw/loongarch/acpi-build.c | 89 +++++---- hw/loongarch/boot.c | 10 +- hw/loongarch/fw_cfg.c | 2 +- hw/loongarch/fw_cfg.h | 2 +- hw/loongarch/virt.c | 366 ++++++++++++++++++------------------ include/hw/loongarch/virt.h | 7 +- 6 files changed, 239 insertions(+), 237 deletions(-) diff --git a/hw/loongarch/acpi-build.c b/hw/loongarch/acpi-build.c index fff3497c62..2b4e09bf37 100644 --- a/hw/loongarch/acpi-build.c +++ b/hw/loongarch/acpi-build.c @@ -105,14 +105,15 @@ build_facs(GArray *table_data) /* build MADT */ static void -build_madt(GArray *table_data, BIOSLinker *linker, LoongArchMachineState *lams) +build_madt(GArray *table_data, BIOSLinker *linker, + LoongArchVirtMachineState *lvms) { - MachineState *ms = MACHINE(lams); + MachineState *ms = MACHINE(lvms); MachineClass *mc = MACHINE_GET_CLASS(ms); const CPUArchIdList *arch_ids = mc->possible_cpu_arch_ids(ms); int i, arch_id; - AcpiTable table = { .sig = "APIC", .rev = 1, .oem_id = lams->oem_id, - .oem_table_id = lams->oem_table_id }; + AcpiTable table = { .sig = "APIC", .rev = 1, .oem_id = lvms->oem_id, + .oem_table_id = lvms->oem_table_id }; acpi_table_begin(&table, table_data); @@ -167,11 +168,11 @@ build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine) int i, arch_id, node_id; uint64_t mem_len, mem_base; int nb_numa_nodes = machine->numa_state->num_nodes; - LoongArchMachineState *lams = LOONGARCH_VIRT_MACHINE(machine); - MachineClass *mc = MACHINE_GET_CLASS(lams); + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(machine); + MachineClass *mc = MACHINE_GET_CLASS(lvms); const CPUArchIdList *arch_ids = mc->possible_cpu_arch_ids(machine); - AcpiTable table = { .sig = "SRAT", .rev = 1, .oem_id = lams->oem_id, - .oem_table_id = lams->oem_table_id }; + AcpiTable table = { .sig = "SRAT", .rev = 1, .oem_id = lvms->oem_id, + .oem_table_id = lvms->oem_table_id }; acpi_table_begin(&table, table_data); build_append_int_noprefix(table_data, 1, 4); /* Reserved */ @@ -279,13 +280,13 @@ static void build_la_ged_aml(Aml *dsdt, MachineState *machine) { uint32_t event; - LoongArchMachineState *lams = LOONGARCH_VIRT_MACHINE(machine); + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(machine); build_ged_aml(dsdt, "\\_SB."GED_DEVICE, - HOTPLUG_HANDLER(lams->acpi_ged), + HOTPLUG_HANDLER(lvms->acpi_ged), VIRT_SCI_IRQ, AML_SYSTEM_MEMORY, VIRT_GED_EVT_ADDR); - event = object_property_get_uint(OBJECT(lams->acpi_ged), + event = object_property_get_uint(OBJECT(lvms->acpi_ged), "ged-event", &error_abort); if (event & ACPI_GED_MEM_HOTPLUG_EVT) { build_memory_hotplug_aml(dsdt, machine->ram_slots, "\\_SB", NULL, @@ -295,7 +296,7 @@ build_la_ged_aml(Aml *dsdt, MachineState *machine) 
acpi_dsdt_add_power_button(dsdt); } -static void build_pci_device_aml(Aml *scope, LoongArchMachineState *lams) +static void build_pci_device_aml(Aml *scope, LoongArchVirtMachineState *lvms) { struct GPEXConfig cfg = { .mmio64.base = VIRT_PCI_MEM_BASE, @@ -305,13 +306,13 @@ static void build_pci_device_aml(Aml *scope, LoongArchMachineState *lams) .ecam.base = VIRT_PCI_CFG_BASE, .ecam.size = VIRT_PCI_CFG_SIZE, .irq = VIRT_GSI_BASE + VIRT_DEVICE_IRQS, - .bus = lams->pci_bus, + .bus = lvms->pci_bus, }; acpi_dsdt_add_gpex(scope, &cfg); } -static void build_flash_aml(Aml *scope, LoongArchMachineState *lams) +static void build_flash_aml(Aml *scope, LoongArchVirtMachineState *lvms) { Aml *dev, *crs; MemoryRegion *flash_mem; @@ -322,11 +323,11 @@ static void build_flash_aml(Aml *scope, LoongArchMachineState *lams) hwaddr flash1_base; hwaddr flash1_size; - flash_mem = pflash_cfi01_get_memory(lams->flash[0]); + flash_mem = pflash_cfi01_get_memory(lvms->flash[0]); flash0_base = flash_mem->addr; flash0_size = memory_region_size(flash_mem); - flash_mem = pflash_cfi01_get_memory(lams->flash[1]); + flash_mem = pflash_cfi01_get_memory(lvms->flash[1]); flash1_base = flash_mem->addr; flash1_size = memory_region_size(flash_mem); @@ -352,7 +353,7 @@ static void build_flash_aml(Aml *scope, LoongArchMachineState *lams) } #ifdef CONFIG_TPM -static void acpi_dsdt_add_tpm(Aml *scope, LoongArchMachineState *vms) +static void acpi_dsdt_add_tpm(Aml *scope, LoongArchVirtMachineState *vms) { PlatformBusDevice *pbus = PLATFORM_BUS_DEVICE(vms->platform_bus_dev); hwaddr pbus_base = VIRT_PLATFORM_BUS_BASEADDRESS; @@ -391,18 +392,18 @@ static void build_dsdt(GArray *table_data, BIOSLinker *linker, MachineState *machine) { Aml *dsdt, *scope, *pkg; - LoongArchMachineState *lams = LOONGARCH_VIRT_MACHINE(machine); - AcpiTable table = { .sig = "DSDT", .rev = 1, .oem_id = lams->oem_id, - .oem_table_id = lams->oem_table_id }; + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(machine); + AcpiTable table = { .sig = "DSDT", .rev = 1, .oem_id = lvms->oem_id, + .oem_table_id = lvms->oem_table_id }; acpi_table_begin(&table, table_data); dsdt = init_aml_allocator(); build_uart_device_aml(dsdt); - build_pci_device_aml(dsdt, lams); + build_pci_device_aml(dsdt, lvms); build_la_ged_aml(dsdt, machine); - build_flash_aml(dsdt, lams); + build_flash_aml(dsdt, lvms); #ifdef CONFIG_TPM - acpi_dsdt_add_tpm(dsdt, lams); + acpi_dsdt_add_tpm(dsdt, lvms); #endif /* System State Package */ scope = aml_scope("\\"); @@ -421,7 +422,7 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, MachineState *machine) static void acpi_build(AcpiBuildTables *tables, MachineState *machine) { - LoongArchMachineState *lams = LOONGARCH_VIRT_MACHINE(machine); + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(machine); GArray *table_offsets; AcpiFadtData fadt_data; unsigned facs, rsdt, dsdt; @@ -455,14 +456,14 @@ static void acpi_build(AcpiBuildTables *tables, MachineState *machine) fadt_data.dsdt_tbl_offset = &dsdt; fadt_data.xdsdt_tbl_offset = &dsdt; build_fadt(tables_blob, tables->linker, &fadt_data, - lams->oem_id, lams->oem_table_id); + lvms->oem_id, lvms->oem_table_id); acpi_add_table(table_offsets, tables_blob); - build_madt(tables_blob, tables->linker, lams); + build_madt(tables_blob, tables->linker, lvms); acpi_add_table(table_offsets, tables_blob); build_pptt(tables_blob, tables->linker, machine, - lams->oem_id, lams->oem_table_id); + lvms->oem_id, lvms->oem_table_id); acpi_add_table(table_offsets, tables_blob); build_srat(tables_blob, 
tables->linker, machine); @@ -470,13 +471,13 @@ static void acpi_build(AcpiBuildTables *tables, MachineState *machine) if (machine->numa_state->num_nodes) { if (machine->numa_state->have_numa_distance) { acpi_add_table(table_offsets, tables_blob); - build_slit(tables_blob, tables->linker, machine, lams->oem_id, - lams->oem_table_id); + build_slit(tables_blob, tables->linker, machine, lvms->oem_id, + lvms->oem_table_id); } if (machine->numa_state->hmat_enabled) { acpi_add_table(table_offsets, tables_blob); build_hmat(tables_blob, tables->linker, machine->numa_state, - lams->oem_id, lams->oem_table_id); + lvms->oem_id, lvms->oem_table_id); } } @@ -486,8 +487,8 @@ static void acpi_build(AcpiBuildTables *tables, MachineState *machine) .base = cpu_to_le64(VIRT_PCI_CFG_BASE), .size = cpu_to_le64(VIRT_PCI_CFG_SIZE), }; - build_mcfg(tables_blob, tables->linker, &mcfg, lams->oem_id, - lams->oem_table_id); + build_mcfg(tables_blob, tables->linker, &mcfg, lvms->oem_id, + lvms->oem_table_id); } #ifdef CONFIG_TPM @@ -495,8 +496,8 @@ static void acpi_build(AcpiBuildTables *tables, MachineState *machine) if (tpm_get_version(tpm_find()) == TPM_VERSION_2_0) { acpi_add_table(table_offsets, tables_blob); build_tpm2(tables_blob, tables->linker, - tables->tcpalog, lams->oem_id, - lams->oem_table_id); + tables->tcpalog, lvms->oem_id, + lvms->oem_table_id); } #endif /* Add tables supplied by user (if any) */ @@ -510,13 +511,13 @@ static void acpi_build(AcpiBuildTables *tables, MachineState *machine) /* RSDT is pointed to by RSDP */ rsdt = tables_blob->len; build_rsdt(tables_blob, tables->linker, table_offsets, - lams->oem_id, lams->oem_table_id); + lvms->oem_id, lvms->oem_table_id); /* RSDP is in FSEG memory, so allocate it separately */ { AcpiRsdpData rsdp_data = { .revision = 0, - .oem_id = lams->oem_id, + .oem_id = lvms->oem_id, .xsdt_tbl_offset = NULL, .rsdt_tbl_offset = &rsdt, }; @@ -593,17 +594,25 @@ static const VMStateDescription vmstate_acpi_build = { }, }; -void loongarch_acpi_setup(LoongArchMachineState *lams) +static bool loongarch_is_acpi_enabled(LoongArchVirtMachineState *lvms) +{ + if (lvms->acpi == ON_OFF_AUTO_OFF) { + return false; + } + return true; +} + +void loongarch_acpi_setup(LoongArchVirtMachineState *lvms) { AcpiBuildTables tables; AcpiBuildState *build_state; - if (!lams->fw_cfg) { + if (!lvms->fw_cfg) { ACPI_BUILD_DPRINTF("No fw cfg. Bailing out.\n"); return; } - if (!loongarch_is_acpi_enabled(lams)) { + if (!loongarch_is_acpi_enabled(lvms)) { ACPI_BUILD_DPRINTF("ACPI disabled. 
Bailing out.\n"); return; } @@ -611,7 +620,7 @@ void loongarch_acpi_setup(LoongArchMachineState *lams) build_state = g_malloc0(sizeof *build_state); acpi_build_tables_init(&tables); - acpi_build(&tables, MACHINE(lams)); + acpi_build(&tables, MACHINE(lvms)); /* Now expose it all to Guest */ build_state->table_mr = acpi_add_rom_blob(acpi_build_update, diff --git a/hw/loongarch/boot.c b/hw/loongarch/boot.c index e37512729d..b8e1aa18d5 100644 --- a/hw/loongarch/boot.c +++ b/hw/loongarch/boot.c @@ -259,10 +259,10 @@ static void fw_cfg_add_kernel_info(struct loongarch_boot_info *info, } } -static void loongarch_firmware_boot(LoongArchMachineState *lams, +static void loongarch_firmware_boot(LoongArchVirtMachineState *lvms, struct loongarch_boot_info *info) { - fw_cfg_add_kernel_info(info, lams->fw_cfg); + fw_cfg_add_kernel_info(info, lvms->fw_cfg); } static void init_boot_rom(struct loongarch_boot_info *info, void *p) @@ -319,7 +319,7 @@ static void loongarch_direct_kernel_boot(struct loongarch_boot_info *info) void loongarch_load_kernel(MachineState *ms, struct loongarch_boot_info *info) { - LoongArchMachineState *lams = LOONGARCH_VIRT_MACHINE(ms); + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(ms); int i; /* register reset function */ @@ -331,8 +331,8 @@ void loongarch_load_kernel(MachineState *ms, struct loongarch_boot_info *info) info->kernel_cmdline = ms->kernel_cmdline; info->initrd_filename = ms->initrd_filename; - if (lams->bios_loaded) { - loongarch_firmware_boot(lams, info); + if (lvms->bios_loaded) { + loongarch_firmware_boot(lvms, info); } else { loongarch_direct_kernel_boot(info); } diff --git a/hw/loongarch/fw_cfg.c b/hw/loongarch/fw_cfg.c index f15a17416c..35aeb2decb 100644 --- a/hw/loongarch/fw_cfg.c +++ b/hw/loongarch/fw_cfg.c @@ -17,7 +17,7 @@ static void fw_cfg_boot_set(void *opaque, const char *boot_device, fw_cfg_modify_i16(opaque, FW_CFG_BOOT_DEVICE, boot_device[0]); } -FWCfgState *loongarch_fw_cfg_init(ram_addr_t ram_size, MachineState *ms) +FWCfgState *virt_fw_cfg_init(ram_addr_t ram_size, MachineState *ms) { FWCfgState *fw_cfg; int max_cpus = ms->smp.max_cpus; diff --git a/hw/loongarch/fw_cfg.h b/hw/loongarch/fw_cfg.h index 7c0de4db4a..27ee68286e 100644 --- a/hw/loongarch/fw_cfg.h +++ b/hw/loongarch/fw_cfg.h @@ -11,5 +11,5 @@ #include "hw/boards.h" #include "hw/nvram/fw_cfg.h" -FWCfgState *loongarch_fw_cfg_init(ram_addr_t ram_size, MachineState *ms); +FWCfgState *virt_fw_cfg_init(ram_addr_t ram_size, MachineState *ms); #endif diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index cca220cb5b..e39989193e 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -48,9 +48,9 @@ #include "hw/block/flash.h" #include "qemu/error-report.h" -static bool virt_is_veiointc_enabled(LoongArchMachineState *lams) +static bool virt_is_veiointc_enabled(LoongArchVirtMachineState *lvms) { - if (lams->veiointc == ON_OFF_AUTO_OFF) { + if (lvms->veiointc == ON_OFF_AUTO_OFF) { return false; } return true; @@ -59,8 +59,8 @@ static bool virt_is_veiointc_enabled(LoongArchMachineState *lams) static void virt_get_veiointc(Object *obj, Visitor *v, const char *name, void *opaque, Error **errp) { - LoongArchMachineState *lams = LOONGARCH_MACHINE(obj); - OnOffAuto veiointc = lams->veiointc; + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(obj); + OnOffAuto veiointc = lvms->veiointc; visit_type_OnOffAuto(v, name, &veiointc, errp); } @@ -68,12 +68,12 @@ static void virt_get_veiointc(Object *obj, Visitor *v, const char *name, static void virt_set_veiointc(Object *obj, Visitor 
*v, const char *name, void *opaque, Error **errp) { - LoongArchMachineState *lams = LOONGARCH_MACHINE(obj); + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(obj); - visit_type_OnOffAuto(v, name, &lams->veiointc, errp); + visit_type_OnOffAuto(v, name, &lvms->veiointc, errp); } -static PFlashCFI01 *virt_flash_create1(LoongArchMachineState *lams, +static PFlashCFI01 *virt_flash_create1(LoongArchVirtMachineState *lvms, const char *name, const char *alias_prop_name) { @@ -88,16 +88,16 @@ static PFlashCFI01 *virt_flash_create1(LoongArchMachineState *lams, qdev_prop_set_uint16(dev, "id2", 0x00); qdev_prop_set_uint16(dev, "id3", 0x00); qdev_prop_set_string(dev, "name", name); - object_property_add_child(OBJECT(lams), name, OBJECT(dev)); - object_property_add_alias(OBJECT(lams), alias_prop_name, + object_property_add_child(OBJECT(lvms), name, OBJECT(dev)); + object_property_add_alias(OBJECT(lvms), alias_prop_name, OBJECT(dev), "drive"); return PFLASH_CFI01(dev); } -static void virt_flash_create(LoongArchMachineState *lams) +static void virt_flash_create(LoongArchVirtMachineState *lvms) { - lams->flash[0] = virt_flash_create1(lams, "virt.flash0", "pflash0"); - lams->flash[1] = virt_flash_create1(lams, "virt.flash1", "pflash1"); + lvms->flash[0] = virt_flash_create1(lvms, "virt.flash0", "pflash0"); + lvms->flash[1] = virt_flash_create1(lvms, "virt.flash1", "pflash1"); } static void virt_flash_map1(PFlashCFI01 *flash, @@ -123,20 +123,20 @@ static void virt_flash_map1(PFlashCFI01 *flash, sysbus_mmio_get_region(SYS_BUS_DEVICE(dev), 0)); } -static void virt_flash_map(LoongArchMachineState *lams, +static void virt_flash_map(LoongArchVirtMachineState *lvms, MemoryRegion *sysmem) { - PFlashCFI01 *flash0 = lams->flash[0]; - PFlashCFI01 *flash1 = lams->flash[1]; + PFlashCFI01 *flash0 = lvms->flash[0]; + PFlashCFI01 *flash1 = lvms->flash[1]; virt_flash_map1(flash0, VIRT_FLASH0_BASE, VIRT_FLASH0_SIZE, sysmem); virt_flash_map1(flash1, VIRT_FLASH1_BASE, VIRT_FLASH1_SIZE, sysmem); } -static void fdt_add_cpuic_node(LoongArchMachineState *lams, +static void fdt_add_cpuic_node(LoongArchVirtMachineState *lvms, uint32_t *cpuintc_phandle) { - MachineState *ms = MACHINE(lams); + MachineState *ms = MACHINE(lvms); char *nodename; *cpuintc_phandle = qemu_fdt_alloc_phandle(ms->fdt); @@ -150,11 +150,11 @@ static void fdt_add_cpuic_node(LoongArchMachineState *lams, g_free(nodename); } -static void fdt_add_eiointc_node(LoongArchMachineState *lams, +static void fdt_add_eiointc_node(LoongArchVirtMachineState *lvms, uint32_t *cpuintc_phandle, uint32_t *eiointc_phandle) { - MachineState *ms = MACHINE(lams); + MachineState *ms = MACHINE(lvms); char *nodename; hwaddr extioi_base = APIC_BASE; hwaddr extioi_size = EXTIOI_SIZE; @@ -175,11 +175,11 @@ static void fdt_add_eiointc_node(LoongArchMachineState *lams, g_free(nodename); } -static void fdt_add_pch_pic_node(LoongArchMachineState *lams, +static void fdt_add_pch_pic_node(LoongArchVirtMachineState *lvms, uint32_t *eiointc_phandle, uint32_t *pch_pic_phandle) { - MachineState *ms = MACHINE(lams); + MachineState *ms = MACHINE(lvms); char *nodename; hwaddr pch_pic_base = VIRT_PCH_REG_BASE; hwaddr pch_pic_size = VIRT_PCH_REG_SIZE; @@ -200,11 +200,11 @@ static void fdt_add_pch_pic_node(LoongArchMachineState *lams, g_free(nodename); } -static void fdt_add_pch_msi_node(LoongArchMachineState *lams, +static void fdt_add_pch_msi_node(LoongArchVirtMachineState *lvms, uint32_t *eiointc_phandle, uint32_t *pch_msi_phandle) { - MachineState *ms = MACHINE(lams); + MachineState *ms = 
MACHINE(lvms); char *nodename; hwaddr pch_msi_base = VIRT_PCH_MSI_ADDR_LOW; hwaddr pch_msi_size = VIRT_PCH_MSI_SIZE; @@ -228,9 +228,9 @@ static void fdt_add_pch_msi_node(LoongArchMachineState *lams, g_free(nodename); } -static void fdt_add_flash_node(LoongArchMachineState *lams) +static void fdt_add_flash_node(LoongArchVirtMachineState *lvms) { - MachineState *ms = MACHINE(lams); + MachineState *ms = MACHINE(lvms); char *nodename; MemoryRegion *flash_mem; @@ -240,11 +240,11 @@ static void fdt_add_flash_node(LoongArchMachineState *lams) hwaddr flash1_base; hwaddr flash1_size; - flash_mem = pflash_cfi01_get_memory(lams->flash[0]); + flash_mem = pflash_cfi01_get_memory(lvms->flash[0]); flash0_base = flash_mem->addr; flash0_size = memory_region_size(flash_mem); - flash_mem = pflash_cfi01_get_memory(lams->flash[1]); + flash_mem = pflash_cfi01_get_memory(lvms->flash[1]); flash1_base = flash_mem->addr; flash1_size = memory_region_size(flash_mem); @@ -258,13 +258,13 @@ static void fdt_add_flash_node(LoongArchMachineState *lams) g_free(nodename); } -static void fdt_add_rtc_node(LoongArchMachineState *lams, +static void fdt_add_rtc_node(LoongArchVirtMachineState *lvms, uint32_t *pch_pic_phandle) { char *nodename; hwaddr base = VIRT_RTC_REG_BASE; hwaddr size = VIRT_RTC_LEN; - MachineState *ms = MACHINE(lams); + MachineState *ms = MACHINE(lvms); nodename = g_strdup_printf("/rtc@%" PRIx64, base); qemu_fdt_add_subnode(ms->fdt, nodename); @@ -278,13 +278,13 @@ static void fdt_add_rtc_node(LoongArchMachineState *lams, g_free(nodename); } -static void fdt_add_uart_node(LoongArchMachineState *lams, +static void fdt_add_uart_node(LoongArchVirtMachineState *lvms, uint32_t *pch_pic_phandle) { char *nodename; hwaddr base = VIRT_UART_BASE; hwaddr size = VIRT_UART_SIZE; - MachineState *ms = MACHINE(lams); + MachineState *ms = MACHINE(lvms); nodename = g_strdup_printf("/serial@%" PRIx64, base); qemu_fdt_add_subnode(ms->fdt, nodename); @@ -299,11 +299,11 @@ static void fdt_add_uart_node(LoongArchMachineState *lams, g_free(nodename); } -static void create_fdt(LoongArchMachineState *lams) +static void create_fdt(LoongArchVirtMachineState *lvms) { - MachineState *ms = MACHINE(lams); + MachineState *ms = MACHINE(lvms); - ms->fdt = create_device_tree(&lams->fdt_size); + ms->fdt = create_device_tree(&lvms->fdt_size); if (!ms->fdt) { error_report("create_device_tree() failed"); exit(1); @@ -317,10 +317,10 @@ static void create_fdt(LoongArchMachineState *lams) qemu_fdt_add_subnode(ms->fdt, "/chosen"); } -static void fdt_add_cpu_nodes(const LoongArchMachineState *lams) +static void fdt_add_cpu_nodes(const LoongArchVirtMachineState *lvms) { int num; - const MachineState *ms = MACHINE(lams); + const MachineState *ms = MACHINE(lvms); int smp_cpus = ms->smp.cpus; qemu_fdt_add_subnode(ms->fdt, "/cpus"); @@ -374,11 +374,11 @@ static void fdt_add_cpu_nodes(const LoongArchMachineState *lams) } } -static void fdt_add_fw_cfg_node(const LoongArchMachineState *lams) +static void fdt_add_fw_cfg_node(const LoongArchVirtMachineState *lvms) { char *nodename; hwaddr base = VIRT_FWCFG_BASE; - const MachineState *ms = MACHINE(lams); + const MachineState *ms = MACHINE(lvms); nodename = g_strdup_printf("/fw_cfg@%" PRIx64, base); qemu_fdt_add_subnode(ms->fdt, nodename); @@ -390,7 +390,7 @@ static void fdt_add_fw_cfg_node(const LoongArchMachineState *lams) g_free(nodename); } -static void fdt_add_pcie_irq_map_node(const LoongArchMachineState *lams, +static void fdt_add_pcie_irq_map_node(const LoongArchVirtMachineState *lvms, char *nodename, 
uint32_t *pch_pic_phandle) { @@ -398,7 +398,7 @@ static void fdt_add_pcie_irq_map_node(const LoongArchMachineState *lams, uint32_t irq_map_stride = 0; uint32_t full_irq_map[GPEX_NUM_IRQS *GPEX_NUM_IRQS * 10] = {}; uint32_t *irq_map = full_irq_map; - const MachineState *ms = MACHINE(lams); + const MachineState *ms = MACHINE(lvms); /* This code creates a standard swizzle of interrupts such that * each device's first interrupt is based on it's PCI_SLOT number. @@ -443,7 +443,7 @@ static void fdt_add_pcie_irq_map_node(const LoongArchMachineState *lams, 0x1800, 0, 0, 0x7); } -static void fdt_add_pcie_node(const LoongArchMachineState *lams, +static void fdt_add_pcie_node(const LoongArchVirtMachineState *lvms, uint32_t *pch_pic_phandle, uint32_t *pch_msi_phandle) { @@ -456,7 +456,7 @@ static void fdt_add_pcie_node(const LoongArchMachineState *lams, hwaddr size_pcie = VIRT_PCI_CFG_SIZE; hwaddr base = base_pcie; - const MachineState *ms = MACHINE(lams); + const MachineState *ms = MACHINE(lvms); nodename = g_strdup_printf("/pcie@%" PRIx64, base); qemu_fdt_add_subnode(ms->fdt, nodename); @@ -479,7 +479,7 @@ static void fdt_add_pcie_node(const LoongArchMachineState *lams, qemu_fdt_setprop_cells(ms->fdt, nodename, "msi-map", 0, *pch_msi_phandle, 0, 0x10000); - fdt_add_pcie_irq_map_node(lams, nodename, pch_pic_phandle); + fdt_add_pcie_irq_map_node(lvms, nodename, pch_pic_phandle); g_free(nodename); } @@ -501,15 +501,15 @@ static void fdt_add_memory_node(MachineState *ms, g_free(nodename); } -static void virt_build_smbios(LoongArchMachineState *lams) +static void virt_build_smbios(LoongArchVirtMachineState *lvms) { - MachineState *ms = MACHINE(lams); - MachineClass *mc = MACHINE_GET_CLASS(lams); + MachineState *ms = MACHINE(lvms); + MachineClass *mc = MACHINE_GET_CLASS(lvms); uint8_t *smbios_tables, *smbios_anchor; size_t smbios_tables_len, smbios_anchor_len; const char *product = "QEMU Virtual Machine"; - if (!lams->fw_cfg) { + if (!lvms->fw_cfg) { return; } @@ -520,26 +520,26 @@ static void virt_build_smbios(LoongArchMachineState *lams) &smbios_anchor, &smbios_anchor_len, &error_fatal); if (smbios_anchor) { - fw_cfg_add_file(lams->fw_cfg, "etc/smbios/smbios-tables", + fw_cfg_add_file(lvms->fw_cfg, "etc/smbios/smbios-tables", smbios_tables, smbios_tables_len); - fw_cfg_add_file(lams->fw_cfg, "etc/smbios/smbios-anchor", + fw_cfg_add_file(lvms->fw_cfg, "etc/smbios/smbios-anchor", smbios_anchor, smbios_anchor_len); } } -static void virt_machine_done(Notifier *notifier, void *data) +static void virt_done(Notifier *notifier, void *data) { - LoongArchMachineState *lams = container_of(notifier, - LoongArchMachineState, machine_done); - virt_build_smbios(lams); - loongarch_acpi_setup(lams); + LoongArchVirtMachineState *lvms = container_of(notifier, + LoongArchVirtMachineState, machine_done); + virt_build_smbios(lvms); + loongarch_acpi_setup(lvms); } static void virt_powerdown_req(Notifier *notifier, void *opaque) { - LoongArchMachineState *s = container_of(notifier, - LoongArchMachineState, powerdown_notifier); + LoongArchVirtMachineState *s; + s = container_of(notifier, LoongArchVirtMachineState, powerdown_notifier); acpi_send_event(s->acpi_ged, ACPI_POWER_DOWN_STATUS); } @@ -559,10 +559,11 @@ static void memmap_add_entry(uint64_t address, uint64_t length, uint32_t type) memmap_entries++; } -static DeviceState *create_acpi_ged(DeviceState *pch_pic, LoongArchMachineState *lams) +static DeviceState *create_acpi_ged(DeviceState *pch_pic, + LoongArchVirtMachineState *lvms) { DeviceState *dev; - MachineState *ms = 
MACHINE(lams); + MachineState *ms = MACHINE(lvms); uint32_t event = ACPI_GED_PWR_DOWN_EVT; if (ms->ram_slots) { @@ -609,12 +610,12 @@ static DeviceState *create_platform_bus(DeviceState *pch_pic) return dev; } -static void loongarch_devices_init(DeviceState *pch_pic, - LoongArchMachineState *lams, +static void virt_devices_init(DeviceState *pch_pic, + LoongArchVirtMachineState *lvms, uint32_t *pch_pic_phandle, uint32_t *pch_msi_phandle) { - MachineClass *mc = MACHINE_GET_CLASS(lams); + MachineClass *mc = MACHINE_GET_CLASS(lvms); DeviceState *gpex_dev; SysBusDevice *d; PCIBus *pci_bus; @@ -626,7 +627,7 @@ static void loongarch_devices_init(DeviceState *pch_pic, d = SYS_BUS_DEVICE(gpex_dev); sysbus_realize_and_unref(d, &error_fatal); pci_bus = PCI_HOST_BRIDGE(gpex_dev)->bus; - lams->pci_bus = pci_bus; + lvms->pci_bus = pci_bus; /* Map only part size_ecam bytes of ECAM space */ ecam_alias = g_new0(MemoryRegion, 1); @@ -659,13 +660,13 @@ static void loongarch_devices_init(DeviceState *pch_pic, } /* Add pcie node */ - fdt_add_pcie_node(lams, pch_pic_phandle, pch_msi_phandle); + fdt_add_pcie_node(lvms, pch_pic_phandle, pch_msi_phandle); serial_mm_init(get_system_memory(), VIRT_UART_BASE, 0, qdev_get_gpio_in(pch_pic, VIRT_UART_IRQ - VIRT_GSI_BASE), 115200, serial_hd(0), DEVICE_LITTLE_ENDIAN); - fdt_add_uart_node(lams, pch_pic_phandle); + fdt_add_uart_node(lvms, pch_pic_phandle); /* Network init */ for (i = 0; i < nb_nics; i++) { @@ -680,17 +681,17 @@ static void loongarch_devices_init(DeviceState *pch_pic, sysbus_create_simple("ls7a_rtc", VIRT_RTC_REG_BASE, qdev_get_gpio_in(pch_pic, VIRT_RTC_IRQ - VIRT_GSI_BASE)); - fdt_add_rtc_node(lams, pch_pic_phandle); + fdt_add_rtc_node(lvms, pch_pic_phandle); /* acpi ged */ - lams->acpi_ged = create_acpi_ged(pch_pic, lams); + lvms->acpi_ged = create_acpi_ged(pch_pic, lvms); /* platform bus */ - lams->platform_bus_dev = create_platform_bus(pch_pic); + lvms->platform_bus_dev = create_platform_bus(pch_pic); } -static void loongarch_irq_init(LoongArchMachineState *lams) +static void virt_irq_init(LoongArchVirtMachineState *lvms) { - MachineState *ms = MACHINE(lams); + MachineState *ms = MACHINE(lvms); DeviceState *pch_pic, *pch_msi, *cpudev; DeviceState *ipi, *extioi; SysBusDevice *d; @@ -728,20 +729,20 @@ static void loongarch_irq_init(LoongArchMachineState *lams) sysbus_realize_and_unref(SYS_BUS_DEVICE(ipi), &error_fatal); /* IPI iocsr memory region */ - memory_region_add_subregion(&lams->system_iocsr, SMP_IPI_MAILBOX, + memory_region_add_subregion(&lvms->system_iocsr, SMP_IPI_MAILBOX, sysbus_mmio_get_region(SYS_BUS_DEVICE(ipi), 0)); - memory_region_add_subregion(&lams->system_iocsr, MAIL_SEND_ADDR, + memory_region_add_subregion(&lvms->system_iocsr, MAIL_SEND_ADDR, sysbus_mmio_get_region(SYS_BUS_DEVICE(ipi), 1)); /* Add cpu interrupt-controller */ - fdt_add_cpuic_node(lams, &cpuintc_phandle); + fdt_add_cpuic_node(lvms, &cpuintc_phandle); for (cpu = 0; cpu < ms->smp.cpus; cpu++) { cpu_state = qemu_get_cpu(cpu); cpudev = DEVICE(cpu_state); lacpu = LOONGARCH_CPU(cpu_state); env = &(lacpu->env); - env->address_space_iocsr = &lams->as_iocsr; + env->address_space_iocsr = &lvms->as_iocsr; /* connect ipi irq to cpu irq */ qdev_connect_gpio_out(ipi, cpu, qdev_get_gpio_in(cpudev, IRQ_IPI)); @@ -751,18 +752,18 @@ static void loongarch_irq_init(LoongArchMachineState *lams) /* Create EXTIOI device */ extioi = qdev_new(TYPE_LOONGARCH_EXTIOI); qdev_prop_set_uint32(extioi, "num-cpu", ms->smp.cpus); - if (virt_is_veiointc_enabled(lams)) { + if (virt_is_veiointc_enabled(lvms)) 
{ qdev_prop_set_bit(extioi, "has-virtualization-extension", true); } sysbus_realize_and_unref(SYS_BUS_DEVICE(extioi), &error_fatal); - memory_region_add_subregion(&lams->system_iocsr, APIC_BASE, + memory_region_add_subregion(&lvms->system_iocsr, APIC_BASE, sysbus_mmio_get_region(SYS_BUS_DEVICE(extioi), 0)); - if (virt_is_veiointc_enabled(lams)) { - memory_region_add_subregion(&lams->system_iocsr, EXTIOI_VIRT_BASE, + if (virt_is_veiointc_enabled(lvms)) { + memory_region_add_subregion(&lvms->system_iocsr, EXTIOI_VIRT_BASE, sysbus_mmio_get_region(SYS_BUS_DEVICE(extioi), 1)); } - lams->extioi = extioi; + lvms->extioi = extioi; /* * connect ext irq to the cpu irq @@ -777,7 +778,7 @@ static void loongarch_irq_init(LoongArchMachineState *lams) } /* Add Extend I/O Interrupt Controller node */ - fdt_add_eiointc_node(lams, &cpuintc_phandle, &eiointc_phandle); + fdt_add_eiointc_node(lvms, &cpuintc_phandle, &eiointc_phandle); pch_pic = qdev_new(TYPE_LOONGARCH_PCH_PIC); num = VIRT_PCH_PIC_IRQ_NUM; @@ -799,7 +800,7 @@ static void loongarch_irq_init(LoongArchMachineState *lams) } /* Add PCH PIC node */ - fdt_add_pch_pic_node(lams, &eiointc_phandle, &pch_pic_phandle); + fdt_add_pch_pic_node(lvms, &eiointc_phandle, &pch_pic_phandle); pch_msi = qdev_new(TYPE_LOONGARCH_PCH_MSI); start = num; @@ -816,30 +817,30 @@ static void loongarch_irq_init(LoongArchMachineState *lams) } /* Add PCH MSI node */ - fdt_add_pch_msi_node(lams, &eiointc_phandle, &pch_msi_phandle); + fdt_add_pch_msi_node(lvms, &eiointc_phandle, &pch_msi_phandle); - loongarch_devices_init(pch_pic, lams, &pch_pic_phandle, &pch_msi_phandle); + virt_devices_init(pch_pic, lvms, &pch_pic_phandle, &pch_msi_phandle); } -static void loongarch_firmware_init(LoongArchMachineState *lams) +static void virt_firmware_init(LoongArchVirtMachineState *lvms) { - char *filename = MACHINE(lams)->firmware; + char *filename = MACHINE(lvms)->firmware; char *bios_name = NULL; int bios_size, i; BlockBackend *pflash_blk0; MemoryRegion *mr; - lams->bios_loaded = false; + lvms->bios_loaded = false; /* Map legacy -drive if=pflash to machine properties */ - for (i = 0; i < ARRAY_SIZE(lams->flash); i++) { - pflash_cfi01_legacy_drive(lams->flash[i], + for (i = 0; i < ARRAY_SIZE(lvms->flash); i++) { + pflash_cfi01_legacy_drive(lvms->flash[i], drive_get(IF_PFLASH, 0, i)); } - virt_flash_map(lams, get_system_memory()); + virt_flash_map(lvms, get_system_memory()); - pflash_blk0 = pflash_cfi01_get_blk(lams->flash[0]); + pflash_blk0 = pflash_cfi01_get_blk(lvms->flash[0]); if (pflash_blk0) { if (filename) { @@ -847,7 +848,7 @@ static void loongarch_firmware_init(LoongArchMachineState *lams) "options at once"); exit(1); } - lams->bios_loaded = true; + lvms->bios_loaded = true; return; } @@ -858,14 +859,14 @@ static void loongarch_firmware_init(LoongArchMachineState *lams) exit(1); } - mr = sysbus_mmio_get_region(SYS_BUS_DEVICE(lams->flash[0]), 0); + mr = sysbus_mmio_get_region(SYS_BUS_DEVICE(lvms->flash[0]), 0); bios_size = load_image_mr(bios_name, mr); if (bios_size < 0) { error_report("Could not load ROM image '%s'", bios_name); exit(1); } g_free(bios_name); - lams->bios_loaded = true; + lvms->bios_loaded = true; } } @@ -873,16 +874,16 @@ static void loongarch_firmware_init(LoongArchMachineState *lams) static MemTxResult loongarch_qemu_write(void *opaque, hwaddr addr, uint64_t val, unsigned size, MemTxAttrs attrs) { - LoongArchMachineState *lams = LOONGARCH_MACHINE(opaque); + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(opaque); uint64_t features; switch (addr) { case 
MISC_FUNC_REG: - if (!virt_is_veiointc_enabled(lams)) { + if (!virt_is_veiointc_enabled(lvms)) { return MEMTX_OK; } - features = address_space_ldl(&lams->as_iocsr, + features = address_space_ldl(&lvms->as_iocsr, EXTIOI_VIRT_BASE + EXTIOI_VIRT_CONFIG, attrs, NULL); if (val & BIT_ULL(IOCSRM_EXTIOI_EN)) { @@ -892,7 +893,7 @@ static MemTxResult loongarch_qemu_write(void *opaque, hwaddr addr, uint64_t val, features |= BIT(EXTIOI_ENABLE_INT_ENCODE); } - address_space_stl(&lams->as_iocsr, + address_space_stl(&lvms->as_iocsr, EXTIOI_VIRT_BASE + EXTIOI_VIRT_CONFIG, features, attrs, NULL); } @@ -904,7 +905,7 @@ static MemTxResult loongarch_qemu_read(void *opaque, hwaddr addr, uint64_t *data, unsigned size, MemTxAttrs attrs) { - LoongArchMachineState *lams = LOONGARCH_MACHINE(opaque); + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(opaque); uint64_t ret = 0; int features; @@ -926,12 +927,12 @@ static MemTxResult loongarch_qemu_read(void *opaque, hwaddr addr, ret = 0x303030354133ULL; /* "3A5000" */ break; case MISC_FUNC_REG: - if (!virt_is_veiointc_enabled(lams)) { + if (!virt_is_veiointc_enabled(lvms)) { ret |= BIT_ULL(IOCSRM_EXTIOI_EN); break; } - features = address_space_ldl(&lams->as_iocsr, + features = address_space_ldl(&lvms->as_iocsr, EXTIOI_VIRT_BASE + EXTIOI_VIRT_CONFIG, attrs, NULL); if (features & BIT(EXTIOI_ENABLE)) { @@ -948,7 +949,7 @@ static MemTxResult loongarch_qemu_read(void *opaque, hwaddr addr, return MEMTX_OK; } -static const MemoryRegionOps loongarch_qemu_ops = { +static const MemoryRegionOps virt_iocsr_misc_ops = { .read_with_attrs = loongarch_qemu_read, .write_with_attrs = loongarch_qemu_write, .endianness = DEVICE_LITTLE_ENDIAN, @@ -962,7 +963,7 @@ static const MemoryRegionOps loongarch_qemu_ops = { }, }; -static void loongarch_init(MachineState *machine) +static void virt_init(MachineState *machine) { LoongArchCPU *lacpu; const char *cpu_model = machine->cpu_type; @@ -970,7 +971,7 @@ static void loongarch_init(MachineState *machine) ram_addr_t ram_size = machine->ram_size; uint64_t highram_size = 0, phyAddr = 0; MemoryRegion *address_space_mem = get_system_memory(); - LoongArchMachineState *lams = LOONGARCH_VIRT_MACHINE(machine); + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(machine); int nb_numa_nodes = machine->numa_state->num_nodes; NodeInfo *numa_info = machine->numa_state->nodes; int i; @@ -986,16 +987,16 @@ static void loongarch_init(MachineState *machine) error_report("ram_size must be greater than 1G."); exit(1); } - create_fdt(lams); + create_fdt(lvms); /* Create IOCSR space */ - memory_region_init_io(&lams->system_iocsr, OBJECT(machine), NULL, + memory_region_init_io(&lvms->system_iocsr, OBJECT(machine), NULL, machine, "iocsr", UINT64_MAX); - address_space_init(&lams->as_iocsr, &lams->system_iocsr, "IOCSR"); - memory_region_init_io(&lams->iocsr_mem, OBJECT(machine), - &loongarch_qemu_ops, + address_space_init(&lvms->as_iocsr, &lvms->system_iocsr, "IOCSR"); + memory_region_init_io(&lvms->iocsr_mem, OBJECT(machine), + &virt_iocsr_misc_ops, machine, "iocsr_misc", 0x428); - memory_region_add_subregion(&lams->system_iocsr, 0, &lams->iocsr_mem); + memory_region_add_subregion(&lvms->system_iocsr, 0, &lvms->iocsr_mem); /* Init CPUs */ possible_cpus = mc->possible_cpu_arch_ids(machine); @@ -1006,14 +1007,14 @@ static void loongarch_init(MachineState *machine) lacpu = LOONGARCH_CPU(cpu); lacpu->phy_id = machine->possible_cpus->cpus[i].arch_id; } - fdt_add_cpu_nodes(lams); + fdt_add_cpu_nodes(lvms); /* Node0 memory */ memmap_add_entry(VIRT_LOWMEM_BASE, 
VIRT_LOWMEM_SIZE, 1); fdt_add_memory_node(machine, VIRT_LOWMEM_BASE, VIRT_LOWMEM_SIZE, 0); - memory_region_init_alias(&lams->lowmem, NULL, "loongarch.node0.lowram", + memory_region_init_alias(&lvms->lowmem, NULL, "loongarch.node0.lowram", machine->ram, offset, VIRT_LOWMEM_SIZE); - memory_region_add_subregion(address_space_mem, phyAddr, &lams->lowmem); + memory_region_add_subregion(address_space_mem, phyAddr, &lvms->lowmem); offset += VIRT_LOWMEM_SIZE; if (nb_numa_nodes > 0) { @@ -1025,9 +1026,9 @@ static void loongarch_init(MachineState *machine) phyAddr = VIRT_HIGHMEM_BASE; memmap_add_entry(phyAddr, highram_size, 1); fdt_add_memory_node(machine, phyAddr, highram_size, 0); - memory_region_init_alias(&lams->highmem, NULL, "loongarch.node0.highram", + memory_region_init_alias(&lvms->highmem, NULL, "loongarch.node0.highram", machine->ram, offset, highram_size); - memory_region_add_subregion(address_space_mem, phyAddr, &lams->highmem); + memory_region_add_subregion(address_space_mem, phyAddr, &lvms->highmem); /* Node1 - Nodemax memory */ offset += highram_size; @@ -1068,30 +1069,30 @@ static void loongarch_init(MachineState *machine) } /* load the BIOS image. */ - loongarch_firmware_init(lams); + virt_firmware_init(lvms); /* fw_cfg init */ - lams->fw_cfg = loongarch_fw_cfg_init(ram_size, machine); - rom_set_fw(lams->fw_cfg); - if (lams->fw_cfg != NULL) { - fw_cfg_add_file(lams->fw_cfg, "etc/memmap", + lvms->fw_cfg = virt_fw_cfg_init(ram_size, machine); + rom_set_fw(lvms->fw_cfg); + if (lvms->fw_cfg != NULL) { + fw_cfg_add_file(lvms->fw_cfg, "etc/memmap", memmap_table, sizeof(struct memmap_entry) * (memmap_entries)); } - fdt_add_fw_cfg_node(lams); - fdt_add_flash_node(lams); + fdt_add_fw_cfg_node(lvms); + fdt_add_flash_node(lvms); /* Initialize the IO interrupt subsystem */ - loongarch_irq_init(lams); + virt_irq_init(lvms); platform_bus_add_all_fdt_nodes(machine->fdt, "/platic", VIRT_PLATFORM_BUS_BASEADDRESS, VIRT_PLATFORM_BUS_SIZE, VIRT_PLATFORM_BUS_IRQ); - lams->machine_done.notify = virt_machine_done; - qemu_add_machine_init_done_notifier(&lams->machine_done); + lvms->machine_done.notify = virt_done; + qemu_add_machine_init_done_notifier(&lvms->machine_done); /* connect powerdown request */ - lams->powerdown_notifier.notify = virt_powerdown_req; - qemu_register_powerdown_notifier(&lams->powerdown_notifier); + lvms->powerdown_notifier.notify = virt_powerdown_req; + qemu_register_powerdown_notifier(&lvms->powerdown_notifier); /* * Since lowmem region starts from 0 and Linux kernel legacy start address @@ -1100,52 +1101,44 @@ static void loongarch_init(MachineState *machine) * Put the FDT into the memory map as a ROM image: this will ensure * the FDT is copied again upon reset, even if addr points into RAM. 
*/ - qemu_fdt_dumpdtb(machine->fdt, lams->fdt_size); - rom_add_blob_fixed_as("fdt", machine->fdt, lams->fdt_size, FDT_BASE, + qemu_fdt_dumpdtb(machine->fdt, lvms->fdt_size); + rom_add_blob_fixed_as("fdt", machine->fdt, lvms->fdt_size, FDT_BASE, &address_space_memory); qemu_register_reset_nosnapshotload(qemu_fdt_randomize_seeds, - rom_ptr_for_as(&address_space_memory, FDT_BASE, lams->fdt_size)); + rom_ptr_for_as(&address_space_memory, FDT_BASE, lvms->fdt_size)); - lams->bootinfo.ram_size = ram_size; - loongarch_load_kernel(machine, &lams->bootinfo); + lvms->bootinfo.ram_size = ram_size; + loongarch_load_kernel(machine, &lvms->bootinfo); } -bool loongarch_is_acpi_enabled(LoongArchMachineState *lams) +static void virt_get_acpi(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) { - if (lams->acpi == ON_OFF_AUTO_OFF) { - return false; - } - return true; -} - -static void loongarch_get_acpi(Object *obj, Visitor *v, const char *name, - void *opaque, Error **errp) -{ - LoongArchMachineState *lams = LOONGARCH_VIRT_MACHINE(obj); - OnOffAuto acpi = lams->acpi; + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(obj); + OnOffAuto acpi = lvms->acpi; visit_type_OnOffAuto(v, name, &acpi, errp); } -static void loongarch_set_acpi(Object *obj, Visitor *v, const char *name, +static void virt_set_acpi(Object *obj, Visitor *v, const char *name, void *opaque, Error **errp) { - LoongArchMachineState *lams = LOONGARCH_VIRT_MACHINE(obj); + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(obj); - visit_type_OnOffAuto(v, name, &lams->acpi, errp); + visit_type_OnOffAuto(v, name, &lvms->acpi, errp); } -static void loongarch_machine_initfn(Object *obj) +static void virt_initfn(Object *obj) { - LoongArchMachineState *lams = LOONGARCH_VIRT_MACHINE(obj); + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(obj); if (tcg_enabled()) { - lams->veiointc = ON_OFF_AUTO_OFF; + lvms->veiointc = ON_OFF_AUTO_OFF; } - lams->acpi = ON_OFF_AUTO_AUTO; - lams->oem_id = g_strndup(ACPI_BUILD_APPNAME6, 6); - lams->oem_table_id = g_strndup(ACPI_BUILD_APPNAME8, 8); - virt_flash_create(lams); + lvms->acpi = ON_OFF_AUTO_AUTO; + lvms->oem_id = g_strndup(ACPI_BUILD_APPNAME6, 6); + lvms->oem_table_id = g_strndup(ACPI_BUILD_APPNAME8, 8); + virt_flash_create(lvms); } static bool memhp_type_supported(DeviceState *dev) @@ -1161,7 +1154,7 @@ static void virt_mem_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, pc_dimm_pre_plug(PC_DIMM(dev), MACHINE(hotplug_dev), NULL, errp); } -static void virt_machine_device_pre_plug(HotplugHandler *hotplug_dev, +static void virt_device_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { if (memhp_type_supported(dev)) { @@ -1172,14 +1165,14 @@ static void virt_machine_device_pre_plug(HotplugHandler *hotplug_dev, static void virt_mem_unplug_request(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { - LoongArchMachineState *lams = LOONGARCH_VIRT_MACHINE(hotplug_dev); + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(hotplug_dev); /* the acpi ged is always exist */ - hotplug_handler_unplug_request(HOTPLUG_HANDLER(lams->acpi_ged), dev, + hotplug_handler_unplug_request(HOTPLUG_HANDLER(lvms->acpi_ged), dev, errp); } -static void virt_machine_device_unplug_request(HotplugHandler *hotplug_dev, +static void virt_device_unplug_request(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { if (memhp_type_supported(dev)) { @@ -1190,14 +1183,14 @@ static void virt_machine_device_unplug_request(HotplugHandler *hotplug_dev, static 
void virt_mem_unplug(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { - LoongArchMachineState *lams = LOONGARCH_VIRT_MACHINE(hotplug_dev); + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(hotplug_dev); - hotplug_handler_unplug(HOTPLUG_HANDLER(lams->acpi_ged), dev, errp); - pc_dimm_unplug(PC_DIMM(dev), MACHINE(lams)); + hotplug_handler_unplug(HOTPLUG_HANDLER(lvms->acpi_ged), dev, errp); + pc_dimm_unplug(PC_DIMM(dev), MACHINE(lvms)); qdev_unrealize(dev); } -static void virt_machine_device_unplug(HotplugHandler *hotplug_dev, +static void virt_device_unplug(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { if (memhp_type_supported(dev)) { @@ -1208,31 +1201,32 @@ static void virt_machine_device_unplug(HotplugHandler *hotplug_dev, static void virt_mem_plug(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { - LoongArchMachineState *lams = LOONGARCH_VIRT_MACHINE(hotplug_dev); + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(hotplug_dev); - pc_dimm_plug(PC_DIMM(dev), MACHINE(lams)); - hotplug_handler_plug(HOTPLUG_HANDLER(lams->acpi_ged), + pc_dimm_plug(PC_DIMM(dev), MACHINE(lvms)); + hotplug_handler_plug(HOTPLUG_HANDLER(lvms->acpi_ged), dev, &error_abort); } -static void loongarch_machine_device_plug_cb(HotplugHandler *hotplug_dev, +static void virt_device_plug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { - LoongArchMachineState *lams = LOONGARCH_VIRT_MACHINE(hotplug_dev); - MachineClass *mc = MACHINE_GET_CLASS(lams); + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(hotplug_dev); + MachineClass *mc = MACHINE_GET_CLASS(lvms); + PlatformBusDevice *pbus; if (device_is_dynamic_sysbus(mc, dev)) { - if (lams->platform_bus_dev) { - platform_bus_link_device(PLATFORM_BUS_DEVICE(lams->platform_bus_dev), - SYS_BUS_DEVICE(dev)); + if (lvms->platform_bus_dev) { + pbus = PLATFORM_BUS_DEVICE(lvms->platform_bus_dev); + platform_bus_link_device(pbus, SYS_BUS_DEVICE(dev)); } } else if (memhp_type_supported(dev)) { virt_mem_plug(hotplug_dev, dev, errp); } } -static HotplugHandler *virt_machine_get_hotplug_handler(MachineState *machine, - DeviceState *dev) +static HotplugHandler *virt_get_hotplug_handler(MachineState *machine, + DeviceState *dev) { MachineClass *mc = MACHINE_GET_CLASS(machine); @@ -1272,8 +1266,8 @@ static const CPUArchIdList *virt_possible_cpu_arch_ids(MachineState *ms) return ms->possible_cpus; } -static CpuInstanceProperties -virt_cpu_index_to_props(MachineState *ms, unsigned cpu_index) +static CpuInstanceProperties virt_cpu_index_to_props(MachineState *ms, + unsigned cpu_index) { MachineClass *mc = MACHINE_GET_CLASS(ms); const CPUArchIdList *possible_cpus = mc->possible_cpu_arch_ids(ms); @@ -1295,12 +1289,12 @@ static int64_t virt_get_default_cpu_node_id(const MachineState *ms, int idx) return nidx; } -static void loongarch_class_init(ObjectClass *oc, void *data) +static void virt_class_init(ObjectClass *oc, void *data) { MachineClass *mc = MACHINE_CLASS(oc); HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(oc); - mc->init = loongarch_init; + mc->init = virt_init; mc->default_ram_size = 1 * GiB; mc->default_cpu_type = LOONGARCH_CPU_TYPE_NAME("la464"); mc->default_ram_id = "loongarch.ram"; @@ -1316,15 +1310,15 @@ static void loongarch_class_init(ObjectClass *oc, void *data) mc->numa_mem_supported = true; mc->auto_enable_numa_with_memhp = true; mc->auto_enable_numa_with_memdev = true; - mc->get_hotplug_handler = virt_machine_get_hotplug_handler; + mc->get_hotplug_handler = virt_get_hotplug_handler; 
mc->default_nic = "virtio-net-pci"; - hc->plug = loongarch_machine_device_plug_cb; - hc->pre_plug = virt_machine_device_pre_plug; - hc->unplug_request = virt_machine_device_unplug_request; - hc->unplug = virt_machine_device_unplug; + hc->plug = virt_device_plug_cb; + hc->pre_plug = virt_device_pre_plug; + hc->unplug_request = virt_device_unplug_request; + hc->unplug = virt_device_unplug; object_class_property_add(oc, "acpi", "OnOffAuto", - loongarch_get_acpi, loongarch_set_acpi, + virt_get_acpi, virt_set_acpi, NULL, NULL); object_class_property_set_description(oc, "acpi", "Enable ACPI"); @@ -1338,13 +1332,13 @@ static void loongarch_class_init(ObjectClass *oc, void *data) #endif } -static const TypeInfo loongarch_machine_types[] = { +static const TypeInfo virt_machine_types[] = { { .name = TYPE_LOONGARCH_VIRT_MACHINE, .parent = TYPE_MACHINE, - .instance_size = sizeof(LoongArchMachineState), - .class_init = loongarch_class_init, - .instance_init = loongarch_machine_initfn, + .instance_size = sizeof(LoongArchVirtMachineState), + .class_init = virt_class_init, + .instance_init = virt_initfn, .interfaces = (InterfaceInfo[]) { { TYPE_HOTPLUG_HANDLER }, { } @@ -1352,4 +1346,4 @@ static const TypeInfo loongarch_machine_types[] = { } }; -DEFINE_TYPES(loongarch_machine_types) +DEFINE_TYPES(virt_machine_types) diff --git a/include/hw/loongarch/virt.h b/include/hw/loongarch/virt.h index 0509b9a9af..0a4d9a25f0 100644 --- a/include/hw/loongarch/virt.h +++ b/include/hw/loongarch/virt.h @@ -37,7 +37,7 @@ #define FDT_BASE 0x100000 -struct LoongArchMachineState { +struct LoongArchVirtMachineState { /*< private >*/ MachineState parent_obj; @@ -67,7 +67,6 @@ struct LoongArchMachineState { }; #define TYPE_LOONGARCH_VIRT_MACHINE MACHINE_TYPE_NAME("virt") -OBJECT_DECLARE_SIMPLE_TYPE(LoongArchMachineState, LOONGARCH_VIRT_MACHINE) -bool loongarch_is_acpi_enabled(LoongArchMachineState *lams); -void loongarch_acpi_setup(LoongArchMachineState *lams); +OBJECT_DECLARE_SIMPLE_TYPE(LoongArchVirtMachineState, LOONGARCH_VIRT_MACHINE) +void loongarch_acpi_setup(LoongArchVirtMachineState *lvms); #endif -- Gitee From a9f9a4a0a60596f2e738e6e434c20a3f5266fa17 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Tue, 19 Mar 2024 10:26:06 +0800 Subject: [PATCH 356/939] hw/loongarch: Refine default numa id calculation With numa_test test case, there is subcase named test_def_cpu_split(), there are 8 sockets and 2 numa nodes. Here is command line: "-machine smp.cpus=8,smp.sockets=8 -numa node,memdev=ram -numa node" The required result is: node 0 cpus: 0 2 4 6 node 1 cpus: 1 3 5 7 Test case numa_test fails on LoongArch, since the actual result is: node 0 cpus: 0 1 2 3 node 1 cpus: 4 5 6 7 It will be better if all the cpus in one socket share the same numa node. Here socket id is used to calculate numa id in function virt_get_default_cpu_node_id(). 
Signed-off-by: Bibo Mao Reviewed-by: Song Gao Message-Id: <20240319022606.2994565-1-maobibo@loongson.cn> Signed-off-by: Song Gao --- hw/loongarch/virt.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index e39989193e..e82e3b6792 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -1278,15 +1278,14 @@ static CpuInstanceProperties virt_cpu_index_to_props(MachineState *ms, static int64_t virt_get_default_cpu_node_id(const MachineState *ms, int idx) { - int64_t nidx = 0; + int64_t socket_id; if (ms->numa_state->num_nodes) { - nidx = idx / (ms->smp.cpus / ms->numa_state->num_nodes); - if (ms->numa_state->num_nodes <= nidx) { - nidx = ms->numa_state->num_nodes - 1; - } + socket_id = ms->possible_cpus->cpus[idx].props.socket_id; + return socket_id % ms->numa_state->num_nodes; + } else { + return 0; } - return nidx; } static void virt_class_init(ObjectClass *oc, void *data) -- Gitee From 0437c11a20b3c66882770e468518d33ff71a932a Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Tue, 14 May 2024 10:51:09 +0800 Subject: [PATCH 357/939] hw/loongarch: Add VM mode in IOCSR feature register in kvm mode If VM runs in kvm mode, VM mode is added in IOCSR feature register. So guest can detect kvm hypervisor type and enable possible pv functions. Signed-off-by: Bibo Mao Reviewed-by: Song Gao Message-Id: <20240514025109.3238398-1-maobibo@loongson.cn> Signed-off-by: Song Gao --- hw/loongarch/virt.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index e82e3b6792..c3514f9293 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -10,6 +10,7 @@ #include "qapi/error.h" #include "hw/boards.h" #include "hw/char/serial.h" +#include "sysemu/kvm.h" #include "sysemu/sysemu.h" #include "sysemu/qtest.h" #include "sysemu/runstate.h" @@ -914,12 +915,11 @@ static MemTxResult loongarch_qemu_read(void *opaque, hwaddr addr, ret = 0x11ULL; break; case FEATURE_REG: - ret = 1ULL << IOCSRF_MSI | 1ULL << IOCSRF_EXTIOI | - 1ULL << IOCSRF_CSRIPI; + ret = BIT(IOCSRF_MSI) | BIT(IOCSRF_EXTIOI) | BIT(IOCSRF_CSRIPI); if (kvm_enabled()) { - ret |= 1ULL << IOCSRF_VM; + ret |= BIT(IOCSRF_VM); } - break; + return ret; case VENDOR_REG: ret = 0x6e6f73676e6f6f4cULL; /* "Loongson" */ break; -- Gitee From 1c9b7b7e76a63738721ac1092fdfff12ae87993a Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Wed, 15 May 2024 17:39:22 +0800 Subject: [PATCH 358/939] hw/loongarch: Refine acpi srat table for numa memory One LoongArch virt machine platform, there is limitation for memory map information. The minimum memory size is 256M and minimum memory size for numa node0 is 256M also. With qemu numa qtest, it is possible that memory size of numa node0 is 128M. Limitations for minimum memory size for both total memory and numa node0 is removed for acpi srat table creation. 
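A worked illustration (editor's sketch, not part of the patch; the constants are stand-ins for VIRT_LOWMEM_BASE, VIRT_LOWMEM_SIZE and VIRT_HIGHMEM_BASE): with the 256M minimums dropped, a node's memory simply fills as much of the low window as fits and the remainder continues at the high base, so a 128M node0 now yields a single SRAT entry.

    #include <stdio.h>
    #include <stdint.h>

    #define EX_LOWMEM_BASE   0x00000000ULL
    #define EX_LOWMEM_SIZE   0x10000000ULL   /* 256M low window (stand-in) */
    #define EX_HIGHMEM_BASE  0x90000000ULL   /* stand-in for the high base */

    /* Print the one or two ranges that node0 of 'len' bytes occupies. */
    static void example_node0_ranges(uint64_t len)
    {
        if (len <= EX_LOWMEM_SIZE) {
            printf("[0x%llx, +0x%llx)\n",
                   (unsigned long long)EX_LOWMEM_BASE,
                   (unsigned long long)len);
        } else {
            printf("[0x%llx, +0x%llx) and [0x%llx, +0x%llx)\n",
                   (unsigned long long)EX_LOWMEM_BASE,
                   (unsigned long long)EX_LOWMEM_SIZE,
                   (unsigned long long)EX_HIGHMEM_BASE,
                   (unsigned long long)(len - EX_LOWMEM_SIZE));
        }
    }

    int main(void)
    {
        example_node0_ranges(0x08000000ULL);  /* 128M node0: one entry        */
        example_node0_ranges(0x40000000ULL);  /* 1G node0: low + high entries */
        return 0;
    }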
Signed-off-by: Bibo Mao Reviewed-by: Song Gao Message-Id: <20240515093927.3453674-2-maobibo@loongson.cn> Signed-off-by: Song Gao --- hw/loongarch/acpi-build.c | 58 +++++++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 24 deletions(-) diff --git a/hw/loongarch/acpi-build.c b/hw/loongarch/acpi-build.c index 2b4e09bf37..2555c6763c 100644 --- a/hw/loongarch/acpi-build.c +++ b/hw/loongarch/acpi-build.c @@ -166,8 +166,9 @@ static void build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine) { int i, arch_id, node_id; - uint64_t mem_len, mem_base; - int nb_numa_nodes = machine->numa_state->num_nodes; + hwaddr len, base, gap; + NodeInfo *numa_info; + int nodes, nb_numa_nodes = machine->numa_state->num_nodes; LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(machine); MachineClass *mc = MACHINE_GET_CLASS(lvms); const CPUArchIdList *arch_ids = mc->possible_cpu_arch_ids(machine); @@ -196,35 +197,44 @@ build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine) build_append_int_noprefix(table_data, 0, 4); /* Reserved */ } - /* Node0 */ - build_srat_memory(table_data, VIRT_LOWMEM_BASE, VIRT_LOWMEM_SIZE, - 0, MEM_AFFINITY_ENABLED); - mem_base = VIRT_HIGHMEM_BASE; - if (!nb_numa_nodes) { - mem_len = machine->ram_size - VIRT_LOWMEM_SIZE; - } else { - mem_len = machine->numa_state->nodes[0].node_mem - VIRT_LOWMEM_SIZE; + base = VIRT_LOWMEM_BASE; + gap = VIRT_LOWMEM_SIZE; + numa_info = machine->numa_state->nodes; + nodes = nb_numa_nodes; + if (!nodes) { + nodes = 1; } - if (mem_len) - build_srat_memory(table_data, mem_base, mem_len, 0, MEM_AFFINITY_ENABLED); - - /* Node1 - Nodemax */ - if (nb_numa_nodes) { - mem_base += mem_len; - for (i = 1; i < nb_numa_nodes; ++i) { - if (machine->numa_state->nodes[i].node_mem > 0) { - build_srat_memory(table_data, mem_base, - machine->numa_state->nodes[i].node_mem, i, - MEM_AFFINITY_ENABLED); - mem_base += machine->numa_state->nodes[i].node_mem; - } + + for (i = 0; i < nodes; i++) { + if (nb_numa_nodes) { + len = numa_info[i].node_mem; + } else { + len = machine->ram_size; + } + + /* + * memory for the node splited into two part + * lowram: [base, +gap) + * highram: [VIRT_HIGHMEM_BASE, +(len - gap)) + */ + if (len >= gap) { + build_srat_memory(table_data, base, len, i, MEM_AFFINITY_ENABLED); + len -= gap; + base = VIRT_HIGHMEM_BASE; + gap = machine->ram_size - VIRT_LOWMEM_SIZE; + } + + if (len) { + build_srat_memory(table_data, base, len, i, MEM_AFFINITY_ENABLED); + base += len; + gap -= len; } } if (machine->device_memory) { build_srat_memory(table_data, machine->device_memory->base, memory_region_size(&machine->device_memory->mr), - nb_numa_nodes - 1, + nodes - 1, MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED); } -- Gitee From d39247ec5d4ef52a4b9422aaecccc284cbd1a5dd Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Wed, 15 May 2024 17:39:23 +0800 Subject: [PATCH 359/939] hw/loongarch: Refine fadt memory table for numa memory One LoongArch virt machine platform, there is limitation for memory map information. The minimum memory size is 256M and minimum memory size for numa node0 is 256M also. With qemu numa qtest, it is possible that memory size of numa node0 is 128M. Limitations for minimum memory size for both total memory and numa node0 is removed for fadt numa memory table creation. 
Signed-off-by: Bibo Mao Reviewed-by: Song Gao Message-Id: <20240515093927.3453674-3-maobibo@loongson.cn> Signed-off-by: Song Gao --- hw/loongarch/virt.c | 46 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 3 deletions(-) diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index c3514f9293..31a2598e7c 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -502,6 +502,48 @@ static void fdt_add_memory_node(MachineState *ms, g_free(nodename); } +static void fdt_add_memory_nodes(MachineState *ms) +{ + hwaddr base, size, ram_size, gap; + int i, nb_numa_nodes, nodes; + NodeInfo *numa_info; + + ram_size = ms->ram_size; + base = VIRT_LOWMEM_BASE; + gap = VIRT_LOWMEM_SIZE; + nodes = nb_numa_nodes = ms->numa_state->num_nodes; + numa_info = ms->numa_state->nodes; + if (!nodes) { + nodes = 1; + } + + for (i = 0; i < nodes; i++) { + if (nb_numa_nodes) { + size = numa_info[i].node_mem; + } else { + size = ram_size; + } + + /* + * memory for the node splited into two part + * lowram: [base, +gap) + * highram: [VIRT_HIGHMEM_BASE, +(len - gap)) + */ + if (size >= gap) { + fdt_add_memory_node(ms, base, gap, i); + size -= gap; + base = VIRT_HIGHMEM_BASE; + gap = ram_size - VIRT_LOWMEM_SIZE; + } + + if (size) { + fdt_add_memory_node(ms, base, size, i); + base += size; + gap -= size; + } + } +} + static void virt_build_smbios(LoongArchVirtMachineState *lvms) { MachineState *ms = MACHINE(lvms); @@ -1008,10 +1050,10 @@ static void virt_init(MachineState *machine) lacpu->phy_id = machine->possible_cpus->cpus[i].arch_id; } fdt_add_cpu_nodes(lvms); + fdt_add_memory_nodes(machine); /* Node0 memory */ memmap_add_entry(VIRT_LOWMEM_BASE, VIRT_LOWMEM_SIZE, 1); - fdt_add_memory_node(machine, VIRT_LOWMEM_BASE, VIRT_LOWMEM_SIZE, 0); memory_region_init_alias(&lvms->lowmem, NULL, "loongarch.node0.lowram", machine->ram, offset, VIRT_LOWMEM_SIZE); memory_region_add_subregion(address_space_mem, phyAddr, &lvms->lowmem); @@ -1025,7 +1067,6 @@ static void virt_init(MachineState *machine) } phyAddr = VIRT_HIGHMEM_BASE; memmap_add_entry(phyAddr, highram_size, 1); - fdt_add_memory_node(machine, phyAddr, highram_size, 0); memory_region_init_alias(&lvms->highmem, NULL, "loongarch.node0.highram", machine->ram, offset, highram_size); memory_region_add_subregion(address_space_mem, phyAddr, &lvms->highmem); @@ -1041,7 +1082,6 @@ static void virt_init(MachineState *machine) offset, numa_info[i].node_mem); memory_region_add_subregion(address_space_mem, phyAddr, nodemem); memmap_add_entry(phyAddr, numa_info[i].node_mem, 1); - fdt_add_memory_node(machine, phyAddr, numa_info[i].node_mem, i); offset += numa_info[i].node_mem; phyAddr += numa_info[i].node_mem; } -- Gitee From 88b12e40d6a479dfb376fb6a91ef24e07a59d33a Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Wed, 15 May 2024 17:39:24 +0800 Subject: [PATCH 360/939] hw/loongarch: Refine fwcfg memory map Memory map table for fwcfg is used for UEFI BIOS, UEFI BIOS uses the first entry from fwcfg memory map as the first memory HOB, the second memory HOB will be used if the first memory HOB is used up. Memory map table for fwcfg does not care about numa node, however in generic the first memory HOB is part of numa node0, so that runtime memory of UEFI which is allocated from the first memory HOB is located at numa node0. 
Signed-off-by: Bibo Mao Reviewed-by: Song Gao Message-Id: <20240515093927.3453674-4-maobibo@loongson.cn> Signed-off-by: Song Gao --- hw/loongarch/virt.c | 60 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 57 insertions(+), 3 deletions(-) diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index 31a2598e7c..7e89921431 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -1005,6 +1005,62 @@ static const MemoryRegionOps virt_iocsr_misc_ops = { }, }; +static void fw_cfg_add_memory(MachineState *ms) +{ + hwaddr base, size, ram_size, gap; + int nb_numa_nodes, nodes; + NodeInfo *numa_info; + + ram_size = ms->ram_size; + base = VIRT_LOWMEM_BASE; + gap = VIRT_LOWMEM_SIZE; + nodes = nb_numa_nodes = ms->numa_state->num_nodes; + numa_info = ms->numa_state->nodes; + if (!nodes) { + nodes = 1; + } + + /* add fw_cfg memory map of node0 */ + if (nb_numa_nodes) { + size = numa_info[0].node_mem; + } else { + size = ram_size; + } + + if (size >= gap) { + memmap_add_entry(base, gap, 1); + size -= gap; + base = VIRT_HIGHMEM_BASE; + gap = ram_size - VIRT_LOWMEM_SIZE; + } + + if (size) { + memmap_add_entry(base, size, 1); + base += size; + } + + if (nodes < 2) { + return; + } + + /* add fw_cfg memory map of other nodes */ + size = ram_size - numa_info[0].node_mem; + gap = VIRT_LOWMEM_BASE + VIRT_LOWMEM_SIZE; + if (base < gap && (base + size) > gap) { + /* + * memory map for the maining nodes splited into two part + * lowram: [base, +(gap - base)) + * highram: [VIRT_HIGHMEM_BASE, +(size - (gap - base))) + */ + memmap_add_entry(base, gap - base, 1); + size -= gap - base; + base = VIRT_HIGHMEM_BASE; + } + + if (size) + memmap_add_entry(base, size, 1); +} + static void virt_init(MachineState *machine) { LoongArchCPU *lacpu; @@ -1051,9 +1107,9 @@ static void virt_init(MachineState *machine) } fdt_add_cpu_nodes(lvms); fdt_add_memory_nodes(machine); + fw_cfg_add_memory(machine); /* Node0 memory */ - memmap_add_entry(VIRT_LOWMEM_BASE, VIRT_LOWMEM_SIZE, 1); memory_region_init_alias(&lvms->lowmem, NULL, "loongarch.node0.lowram", machine->ram, offset, VIRT_LOWMEM_SIZE); memory_region_add_subregion(address_space_mem, phyAddr, &lvms->lowmem); @@ -1066,7 +1122,6 @@ static void virt_init(MachineState *machine) highram_size = ram_size - VIRT_LOWMEM_SIZE; } phyAddr = VIRT_HIGHMEM_BASE; - memmap_add_entry(phyAddr, highram_size, 1); memory_region_init_alias(&lvms->highmem, NULL, "loongarch.node0.highram", machine->ram, offset, highram_size); memory_region_add_subregion(address_space_mem, phyAddr, &lvms->highmem); @@ -1081,7 +1136,6 @@ static void virt_init(MachineState *machine) memory_region_init_alias(nodemem, NULL, ramName, machine->ram, offset, numa_info[i].node_mem); memory_region_add_subregion(address_space_mem, phyAddr, nodemem); - memmap_add_entry(phyAddr, numa_info[i].node_mem, 1); offset += numa_info[i].node_mem; phyAddr += numa_info[i].node_mem; } -- Gitee From 1a7f567308756a2a26020802b24fe7fd106cf84a Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Wed, 15 May 2024 17:39:25 +0800 Subject: [PATCH 361/939] hw/loongarch: Refine system dram memory region For system dram memory region, it is not necessary to use numa node information. There is only low memory region and high memory region. Remove numa node information for ddr memory region here, it can reduce memory region number on LoongArch virt machine. 
Signed-off-by: Bibo Mao Reviewed-by: Song Gao Message-Id: <20240515093927.3453674-5-maobibo@loongson.cn> Signed-off-by: Song Gao --- hw/loongarch/virt.c | 53 +++++++++++++++------------------------------ 1 file changed, 17 insertions(+), 36 deletions(-) diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index 7e89921431..96755f5deb 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -1065,14 +1065,10 @@ static void virt_init(MachineState *machine) { LoongArchCPU *lacpu; const char *cpu_model = machine->cpu_type; - ram_addr_t offset = 0; - ram_addr_t ram_size = machine->ram_size; - uint64_t highram_size = 0, phyAddr = 0; MemoryRegion *address_space_mem = get_system_memory(); LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(machine); - int nb_numa_nodes = machine->numa_state->num_nodes; - NodeInfo *numa_info = machine->numa_state->nodes; int i; + hwaddr base, size, ram_size = machine->ram_size; const CPUArchIdList *possible_cpus; MachineClass *mc = MACHINE_GET_CLASS(machine); CPUState *cpu; @@ -1110,40 +1106,27 @@ static void virt_init(MachineState *machine) fw_cfg_add_memory(machine); /* Node0 memory */ - memory_region_init_alias(&lvms->lowmem, NULL, "loongarch.node0.lowram", - machine->ram, offset, VIRT_LOWMEM_SIZE); - memory_region_add_subregion(address_space_mem, phyAddr, &lvms->lowmem); - - offset += VIRT_LOWMEM_SIZE; - if (nb_numa_nodes > 0) { - assert(numa_info[0].node_mem > VIRT_LOWMEM_SIZE); - highram_size = numa_info[0].node_mem - VIRT_LOWMEM_SIZE; - } else { - highram_size = ram_size - VIRT_LOWMEM_SIZE; + size = ram_size; + base = VIRT_LOWMEM_BASE; + if (size > VIRT_LOWMEM_SIZE) { + size = VIRT_LOWMEM_SIZE; } - phyAddr = VIRT_HIGHMEM_BASE; - memory_region_init_alias(&lvms->highmem, NULL, "loongarch.node0.highram", - machine->ram, offset, highram_size); - memory_region_add_subregion(address_space_mem, phyAddr, &lvms->highmem); - - /* Node1 - Nodemax memory */ - offset += highram_size; - phyAddr += highram_size; - - for (i = 1; i < nb_numa_nodes; i++) { - MemoryRegion *nodemem = g_new(MemoryRegion, 1); - g_autofree char *ramName = g_strdup_printf("loongarch.node%d.ram", i); - memory_region_init_alias(nodemem, NULL, ramName, machine->ram, - offset, numa_info[i].node_mem); - memory_region_add_subregion(address_space_mem, phyAddr, nodemem); - offset += numa_info[i].node_mem; - phyAddr += numa_info[i].node_mem; + + memory_region_init_alias(&lvms->lowmem, NULL, "loongarch.lowram", + machine->ram, base, size); + memory_region_add_subregion(address_space_mem, base, &lvms->lowmem); + base += size; + if (ram_size - size) { + base = VIRT_HIGHMEM_BASE; + memory_region_init_alias(&lvms->highmem, NULL, "loongarch.highram", + machine->ram, VIRT_LOWMEM_BASE + size, ram_size - size); + memory_region_add_subregion(address_space_mem, base, &lvms->highmem); + base += ram_size - size; } /* initialize device memory address space */ if (machine->ram_size < machine->maxram_size) { ram_addr_t device_mem_size = machine->maxram_size - machine->ram_size; - hwaddr device_mem_base; if (machine->ram_slots > ACPI_MAX_RAM_SLOTS) { error_report("unsupported amount of memory slots: %"PRIu64, @@ -1157,9 +1140,7 @@ static void virt_init(MachineState *machine) "%d bytes", TARGET_PAGE_SIZE); exit(EXIT_FAILURE); } - /* device memory base is the top of high memory address. 
*/ - device_mem_base = ROUND_UP(VIRT_HIGHMEM_BASE + highram_size, 1 * GiB); - machine_memory_devices_init(machine, device_mem_base, device_mem_size); + machine_memory_devices_init(machine, base, device_mem_size); } /* load the BIOS image. */ -- Gitee From 858f16ea09fbbac9966ca73b6b86d290a36be6f5 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Wed, 15 May 2024 17:39:26 +0800 Subject: [PATCH 362/939] hw/loongarch: Remove minimum and default memory size Some qtest test cases such as numa use the default memory size of the generic machine class, which is 128M by default. Here the generic default memory size is used, and the original 1G minimum memory size requirement is also removed. Signed-off-by: Bibo Mao Reviewed-by: Song Gao Message-Id: <20240515093927.3453674-6-maobibo@loongson.cn> Signed-off-by: Song Gao --- hw/loongarch/virt.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index 96755f5deb..11ba879e52 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -1077,10 +1077,6 @@ static void virt_init(MachineState *machine) cpu_model = LOONGARCH_CPU_TYPE_NAME("la464"); } - if (ram_size < 1 * GiB) { - error_report("ram_size must be greater than 1G."); - exit(1); - } create_fdt(lvms); /* Create IOCSR space */ @@ -1369,7 +1365,6 @@ static void virt_class_init(ObjectClass *oc, void *data) HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(oc); mc->init = virt_init; - mc->default_ram_size = 1 * GiB; mc->default_cpu_type = LOONGARCH_CPU_TYPE_NAME("la464"); mc->default_ram_id = "loongarch.ram"; mc->max_cpus = LOONGARCH_MAX_CPUS; -- Gitee From 254957f2de480901a063759d762d4b1eca5b5bb0 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Tue, 28 May 2024 16:20:53 +0800 Subject: [PATCH 363/939] tests/libqos: Add loongarch virt machine node Add the loongarch virt machine to the graph. It is a modified copy of the existing riscv virt machine in riscv-virt-machine.c. It contains a generic-pcihost controller, and an extra function loongarch_config_qpci_bus() to configure GPEX pci host controller information, such as ecam and pio_base addresses. Also, hotplug handler checking for the TYPE_VIRTIO_IOMMU_PCI device is added to the loongarch virt machine, since the virtio-iommu-pci device requires it.
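A usage note (editor's addition; the build target and binary paths follow the usual QEMU conventions and are shown only as an illustration): once this node is registered, the generic qos-graph tests can be run against a LoongArch build with

    make check-qtest-loongarch64

or by invoking the qos test binary directly:

    QTEST_QEMU_BINARY=./qemu-system-loongarch64 ./tests/qtest/qos-test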
Signed-off-by: Bibo Mao Acked-by: Thomas Huth Message-Id: <20240528082053.938564-1-maobibo@loongson.cn> Signed-off-by: Song Gao --- hw/loongarch/virt.c | 2 + tests/qtest/libqos/loongarch-virt-machine.c | 114 ++++++++++++++++++++ tests/qtest/libqos/meson.build | 1 + 3 files changed, 117 insertions(+) create mode 100644 tests/qtest/libqos/loongarch-virt-machine.c diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index 11ba879e52..f7874bccf9 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -47,6 +47,7 @@ #include "sysemu/tpm.h" #include "sysemu/block-backend.h" #include "hw/block/flash.h" +#include "hw/virtio/virtio-iommu.h" #include "qemu/error-report.h" static bool virt_is_veiointc_enabled(LoongArchVirtMachineState *lvms) @@ -1302,6 +1303,7 @@ static HotplugHandler *virt_get_hotplug_handler(MachineState *machine, MachineClass *mc = MACHINE_GET_CLASS(machine); if (device_is_dynamic_sysbus(mc, dev) || + object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI) || memhp_type_supported(dev)) { return HOTPLUG_HANDLER(machine); } diff --git a/tests/qtest/libqos/loongarch-virt-machine.c b/tests/qtest/libqos/loongarch-virt-machine.c new file mode 100644 index 0000000000..c12089c015 --- /dev/null +++ b/tests/qtest/libqos/loongarch-virt-machine.c @@ -0,0 +1,114 @@ +/* + * libqos driver framework + * + * Copyright (c) 2018 Emanuele Giuseppe Esposito + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1 as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see + */ + +#include "qemu/osdep.h" +#include "../libqtest.h" +#include "qemu/module.h" +#include "libqos-malloc.h" +#include "qgraph.h" +#include "virtio-mmio.h" +#include "generic-pcihost.h" +#include "hw/pci/pci_regs.h" + +#define LOONGARCH_PAGE_SIZE 0x1000 +#define LOONGARCH_VIRT_RAM_ADDR 0x100000 +#define LOONGARCH_VIRT_RAM_SIZE 0xFF00000 + +#define LOONGARCH_VIRT_PIO_BASE 0x18000000 +#define LOONGARCH_VIRT_PCIE_PIO_OFFSET 0x4000 +#define LOONGARCH_VIRT_PCIE_PIO_LIMIT 0x10000 +#define LOONGARCH_VIRT_PCIE_ECAM_BASE 0x20000000 +#define LOONGARCH_VIRT_PCIE_MMIO32_BASE 0x40000000 +#define LOONGARCH_VIRT_PCIE_MMIO32_LIMIT 0x80000000 + +typedef struct QVirtMachine QVirtMachine; + +struct QVirtMachine { + QOSGraphObject obj; + QGuestAllocator alloc; + QVirtioMMIODevice virtio_mmio; + QGenericPCIHost bridge; +}; + +static void virt_destructor(QOSGraphObject *obj) +{ + QVirtMachine *machine = (QVirtMachine *) obj; + alloc_destroy(&machine->alloc); +} + +static void *virt_get_driver(void *object, const char *interface) +{ + QVirtMachine *machine = object; + if (!g_strcmp0(interface, "memory")) { + return &machine->alloc; + } + + fprintf(stderr, "%s not present in loongarch/virtio\n", interface); + g_assert_not_reached(); +} + +static QOSGraphObject *virt_get_device(void *obj, const char *device) +{ + QVirtMachine *machine = obj; + if (!g_strcmp0(device, "generic-pcihost")) { + return &machine->bridge.obj; + } else if (!g_strcmp0(device, "virtio-mmio")) { + return &machine->virtio_mmio.obj; + } + + fprintf(stderr, "%s not present in loongarch/virt\n", device); + g_assert_not_reached(); +} + +static void loongarch_config_qpci_bus(QGenericPCIBus *qpci) +{ + qpci->gpex_pio_base = LOONGARCH_VIRT_PIO_BASE; + qpci->bus.pio_alloc_ptr = LOONGARCH_VIRT_PCIE_PIO_OFFSET; + qpci->bus.pio_limit = LOONGARCH_VIRT_PCIE_PIO_LIMIT; + qpci->bus.mmio_alloc_ptr = LOONGARCH_VIRT_PCIE_MMIO32_BASE; + qpci->bus.mmio_limit = LOONGARCH_VIRT_PCIE_MMIO32_LIMIT; + qpci->ecam_alloc_ptr = LOONGARCH_VIRT_PCIE_ECAM_BASE; +} + +static void *qos_create_machine_loongarch_virt(QTestState *qts) +{ + QVirtMachine *machine = g_new0(QVirtMachine, 1); + + alloc_init(&machine->alloc, 0, + LOONGARCH_VIRT_RAM_ADDR, + LOONGARCH_VIRT_RAM_ADDR + LOONGARCH_VIRT_RAM_SIZE, + LOONGARCH_PAGE_SIZE); + + qos_create_generic_pcihost(&machine->bridge, qts, &machine->alloc); + loongarch_config_qpci_bus(&machine->bridge.pci); + + machine->obj.get_device = virt_get_device; + machine->obj.get_driver = virt_get_driver; + machine->obj.destructor = virt_destructor; + return machine; +} + +static void virt_machine_register_nodes(void) +{ + qos_node_create_machine_args("loongarch64/virt", + qos_create_machine_loongarch_virt, + " -cpu la464"); + qos_node_contains("loongarch64/virt", "generic-pcihost", NULL); +} + +libqos_init(virt_machine_register_nodes); diff --git a/tests/qtest/libqos/meson.build b/tests/qtest/libqos/meson.build index 90aae42a22..482c9b2aab 100644 --- a/tests/qtest/libqos/meson.build +++ b/tests/qtest/libqos/meson.build @@ -60,6 +60,7 @@ libqos_srcs = files( 'arm-xilinx-zynq-a9-machine.c', 'ppc64_pseries-machine.c', 'x86_64_pc-machine.c', + 'loongarch-virt-machine.c', ) if have_virtfs -- Gitee From b63b7b0b6c9bed8e1a316f3838aab7db2e8f2037 Mon Sep 17 00:00:00 2001 From: Song Gao Date: Tue, 28 May 2024 16:38:54 +0800 Subject: [PATCH 364/939] hw/loongarch/virt: Use MemTxAttrs interface for misc ops Use MemTxAttrs interface 
read_with_attrs/write_with_attrs for virt_iocsr_misc_ops. Signed-off-by: Song Gao Reviewed-by: Bibo Mao Message-Id: <20240528083855.1912757-3-gaosong@loongson.cn> --- hw/loongarch/virt.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index f7874bccf9..12816c6023 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -915,8 +915,8 @@ static void virt_firmware_init(LoongArchVirtMachineState *lvms) } -static MemTxResult loongarch_qemu_write(void *opaque, hwaddr addr, uint64_t val, - unsigned size, MemTxAttrs attrs) +static MemTxResult virt_iocsr_misc_write(void *opaque, hwaddr addr, uint64_t val, + unsigned size, MemTxAttrs attrs) { LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(opaque); uint64_t features; @@ -945,9 +945,9 @@ static MemTxResult loongarch_qemu_write(void *opaque, hwaddr addr, uint64_t val, return MEMTX_OK; } -static MemTxResult loongarch_qemu_read(void *opaque, hwaddr addr, - uint64_t *data, - unsigned size, MemTxAttrs attrs) +static MemTxResult virt_iocsr_misc_read(void *opaque, hwaddr addr, + uint64_t *data, + unsigned size, MemTxAttrs attrs) { LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(opaque); uint64_t ret = 0; @@ -962,7 +962,7 @@ static MemTxResult loongarch_qemu_read(void *opaque, hwaddr addr, if (kvm_enabled()) { ret |= BIT(IOCSRF_VM); } - return ret; + break; case VENDOR_REG: ret = 0x6e6f73676e6f6f4cULL; /* "Loongson" */ break; @@ -986,6 +986,8 @@ static MemTxResult loongarch_qemu_read(void *opaque, hwaddr addr, ret |= BIT_ULL(IOCSRM_EXTIOI_INT_ENCODE); } break; + default: + g_assert_not_reached(); } *data = ret; @@ -993,8 +995,8 @@ static MemTxResult loongarch_qemu_read(void *opaque, hwaddr addr, } static const MemoryRegionOps virt_iocsr_misc_ops = { - .read_with_attrs = loongarch_qemu_read, - .write_with_attrs = loongarch_qemu_write, + .read_with_attrs = virt_iocsr_misc_read, + .write_with_attrs = virt_iocsr_misc_write, .endianness = DEVICE_LITTLE_ENDIAN, .valid = { .min_access_size = 4, -- Gitee From f9cc704bbcf8bb8a06095289921dc88944d0fe94 Mon Sep 17 00:00:00 2001 From: Dmitry Frolov Date: Fri, 28 Jun 2024 15:39:10 +0300 Subject: [PATCH 365/939] hw/loongarch/boot.c: fix out-of-bound reading memcpy() is trying to READ 512 bytes from memory, pointed to by info->kernel_cmdline, which was (presumably) allocated by g_strdup(""); Found with ASAN while running make check with sanitizers enabled. Signed-off-by: Dmitry Frolov Reviewed-by: Song Gao Message-Id: <20240628123910.577740-1-frolov@swemel.ru> Signed-off-by: Song Gao --- hw/loongarch/boot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/loongarch/boot.c b/hw/loongarch/boot.c index b8e1aa18d5..cb668703bd 100644 --- a/hw/loongarch/boot.c +++ b/hw/loongarch/boot.c @@ -163,7 +163,7 @@ static void init_cmdline(struct loongarch_boot_info *info, void *p, void *start) info->a0 = 1; info->a1 = cmdline_addr; - memcpy(p, info->kernel_cmdline, COMMAND_LINE_SIZE); + g_strlcpy(p, info->kernel_cmdline, COMMAND_LINE_SIZE); } static uint64_t cpu_loongarch_virt_to_phys(void *opaque, uint64_t addr) -- Gitee From 67c68371e457a85e460221a8c56d8dc93186f79f Mon Sep 17 00:00:00 2001 From: Xianglai Li Date: Mon, 24 Jun 2024 11:23:00 +0800 Subject: [PATCH 366/939] hw/loongarch: Change the tpm support by default Add devices that support TPM by default, and fix incomplete TPM ACPI table information.
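A hedged usage sketch (editor's addition; the device and option names follow the generic QEMU TPM documentation and swtpm, the socket path is a placeholder, and the exact command line is not taken from this patch): with TPM_TIS_SYSBUS now selectable by default, a guest TPM would typically be attached roughly like this

    swtpm socket --tpmstate dir=/tmp/mytpm --ctrl type=unixio,path=/tmp/mytpm/sock &
    qemu-system-loongarch64 -M virt ... \
        -chardev socket,id=chrtpm,path=/tmp/mytpm/sock \
        -tpmdev emulator,id=tpm0,chardev=chrtpm \
        -device tpm-tis-device,tpmdev=tpm0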
Signed-off-by: Xianglai Li Reviewed-by: Song Gao Message-Id: <20240624032300.999157-1-lixianglai@loongson.cn> Signed-off-by: Song Gao --- hw/loongarch/Kconfig | 1 + hw/loongarch/acpi-build.c | 3 +++ 2 files changed, 4 insertions(+) diff --git a/hw/loongarch/Kconfig b/hw/loongarch/Kconfig index 7864050563..b2a3adb7dc 100644 --- a/hw/loongarch/Kconfig +++ b/hw/loongarch/Kconfig @@ -7,6 +7,7 @@ config LOONGARCH_VIRT imply VIRTIO_VGA imply PCI_DEVICES imply NVDIMM + imply TPM_TIS_SYSBUS select SERIAL select VIRTIO_PCI select PLATFORM_BUS diff --git a/hw/loongarch/acpi-build.c b/hw/loongarch/acpi-build.c index 2555c6763c..6593476409 100644 --- a/hw/loongarch/acpi-build.c +++ b/hw/loongarch/acpi-build.c @@ -646,6 +646,9 @@ void loongarch_acpi_setup(LoongArchVirtMachineState *lvms) build_state, tables.rsdp, ACPI_BUILD_RSDP_FILE); + fw_cfg_add_file(lvms->fw_cfg, ACPI_BUILD_TPMLOG_FILE, tables.tcpalog->data, + acpi_data_len(tables.tcpalog)); + qemu_register_reset(acpi_build_reset, build_state); acpi_build_reset(build_state); vmstate_register(NULL, 0, &vmstate_acpi_build, build_state); -- Gitee From 5e0ec61ac98a025124912fc47552550b471ab638 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Wed, 12 Jun 2024 11:36:37 +0800 Subject: [PATCH 367/939] hw/loongarch/virt: Remove unused assignment There is abuse usage about local variable gap. Remove duplicated assignment and solve Coverity reported error. Resolves: Coverity CID 1546441 Fixes: 3cc451cbce ("hw/loongarch: Refine fwcfg memory map") Signed-off-by: Bibo Mao Reviewed-by: Song Gao Message-Id: <20240612033637.167787-1-maobibo@loongson.cn> Signed-off-by: Song Gao --- hw/loongarch/virt.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index 12816c6023..a7283e6755 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -1034,7 +1034,6 @@ static void fw_cfg_add_memory(MachineState *ms) memmap_add_entry(base, gap, 1); size -= gap; base = VIRT_HIGHMEM_BASE; - gap = ram_size - VIRT_LOWMEM_SIZE; } if (size) { @@ -1047,17 +1046,17 @@ static void fw_cfg_add_memory(MachineState *ms) } /* add fw_cfg memory map of other nodes */ - size = ram_size - numa_info[0].node_mem; - gap = VIRT_LOWMEM_BASE + VIRT_LOWMEM_SIZE; - if (base < gap && (base + size) > gap) { + if (numa_info[0].node_mem < gap && ram_size > gap) { /* * memory map for the maining nodes splited into two part - * lowram: [base, +(gap - base)) - * highram: [VIRT_HIGHMEM_BASE, +(size - (gap - base))) + * lowram: [base, +(gap - numa_info[0].node_mem)) + * highram: [VIRT_HIGHMEM_BASE, +(ram_size - gap)) */ - memmap_add_entry(base, gap - base, 1); - size -= gap - base; + memmap_add_entry(base, gap - numa_info[0].node_mem, 1); + size = ram_size - gap; base = VIRT_HIGHMEM_BASE; + } else { + size = ram_size - numa_info[0].node_mem; } if (size) -- Gitee From 087201cd62e71801855775c3aa6395c7e1c00cee Mon Sep 17 00:00:00 2001 From: Jiaxun Yang Date: Tue, 20 Aug 2024 19:42:33 +0100 Subject: [PATCH 368/939] hw/loongarch: Fix length for lowram in ACPI SRAT The size of lowram should be "gap" instead of the whole node. 
This is failing kernel's sanity check: [ 0.000000] ACPI: SRAT: Node 0 PXM 0 [mem 0x00000000-0xffffffff] [ 0.000000] ACPI: SRAT: Node 0 PXM 0 [mem 0x80000000-0x16fffffff] [ 0.000000] ACPI: SRAT: Node 1 PXM 1 [mem 0x170000000-0x26fffffff] [ 0.000000] Warning: node 0 [mem 0x00000000-0xffffffff] overlaps with itself [mem 0x80000000-0x16fffffff] Fixes: fc100011f38d ("hw/loongarch: Refine acpi srat table for numa memory") Signed-off-by: Jiaxun Yang Reviewed-by: Bibo Mao Signed-off-by: Song Gao --- hw/loongarch/acpi-build.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/loongarch/acpi-build.c b/hw/loongarch/acpi-build.c index 6593476409..1a9d25fc51 100644 --- a/hw/loongarch/acpi-build.c +++ b/hw/loongarch/acpi-build.c @@ -218,7 +218,7 @@ build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine) * highram: [VIRT_HIGHMEM_BASE, +(len - gap)) */ if (len >= gap) { - build_srat_memory(table_data, base, len, i, MEM_AFFINITY_ENABLED); + build_srat_memory(table_data, base, gap, i, MEM_AFFINITY_ENABLED); len -= gap; base = VIRT_HIGHMEM_BASE; gap = machine->ram_size - VIRT_LOWMEM_SIZE; -- Gitee From 94fa0b75c098ca6fc987f103760c1e07695ffd1a Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Fri, 23 Aug 2024 15:30:50 +0800 Subject: [PATCH 369/939] hw/loongarch: Remove default enable with VIRTIO_VGA device For virtio VGA deivce libvirt will select VIRTIO_VGA firstly rather than VIRTIO_GPU, VIRTIO_VGA device supports frame buffer however it requires legacy VGA compatible support. Frame buffer area 0xa0000 -- 0xc0000 conflicts with low memory area 0 -- 0x10000000. Here remove default support for VIRTIO_VGA device, VIRTIO_GPU is prefered on LoongArch system. For frame buffer video card support, standard VGA can be used. Signed-off-by: Bibo Mao Reviewed-by: Song Gao Message-Id: <20240823073050.2619484-1-maobibo@loongson.cn> Signed-off-by: Song Gao --- hw/loongarch/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/hw/loongarch/Kconfig b/hw/loongarch/Kconfig index b2a3adb7dc..40944a8365 100644 --- a/hw/loongarch/Kconfig +++ b/hw/loongarch/Kconfig @@ -4,7 +4,6 @@ config LOONGARCH_VIRT depends on LOONGARCH64 select PCI select PCI_EXPRESS_GENERIC_BRIDGE - imply VIRTIO_VGA imply PCI_DEVICES imply NVDIMM imply TPM_TIS_SYSBUS -- Gitee From 04895c794652c5da1ece0cad82741bed9aa8ad02 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Sat, 7 Sep 2024 16:34:39 +0200 Subject: [PATCH 370/939] hw/loongarch: virt: support up to 4 serial ports In order to support additional channels of communication using `-serial`, add several serial ports, up to the standard 4 generally supported by the 8250 driver. Fixed: https://lore.kernel.org/all/20240907143439.2792924-1-Jason@zx2c4.com/ Signed-off-by: Jason A. 
Donenfeld Tested-by: Bibo Mao [gaosong: ACPI uart need't reverse order] Signed-off-by: Song Gao Message-Id: <20240907143439.2792924-1-Jason@zx2c4.com> --- hw/loongarch/acpi-build.c | 23 +++++++++++++++-------- hw/loongarch/virt.c | 27 +++++++++++++++++---------- include/hw/pci-host/ls7a.h | 9 +++++---- 3 files changed, 37 insertions(+), 22 deletions(-) diff --git a/hw/loongarch/acpi-build.c b/hw/loongarch/acpi-build.c index 1a9d25fc51..33a92223d8 100644 --- a/hw/loongarch/acpi-build.c +++ b/hw/loongarch/acpi-build.c @@ -31,6 +31,7 @@ #include "hw/acpi/generic_event_device.h" #include "hw/pci-host/gpex.h" +#include "sysemu/sysemu.h" #include "sysemu/tpm.h" #include "hw/platform-bus.h" #include "hw/acpi/aml-build.h" @@ -252,23 +253,27 @@ struct AcpiBuildState { MemoryRegion *linker_mr; } AcpiBuildState; -static void build_uart_device_aml(Aml *table) +static void build_uart_device_aml(Aml *table, int index) { Aml *dev; Aml *crs; Aml *pkg0, *pkg1, *pkg2; - uint32_t uart_irq = VIRT_UART_IRQ; - - Aml *scope = aml_scope("_SB"); - dev = aml_device("COMA"); + Aml *scope; + uint32_t uart_irq; + uint64_t base; + + uart_irq = VIRT_UART_IRQ + index; + base = VIRT_UART_BASE + index * VIRT_UART_SIZE; + scope = aml_scope("_SB"); + dev = aml_device("COM%d", index); aml_append(dev, aml_name_decl("_HID", aml_string("PNP0501"))); - aml_append(dev, aml_name_decl("_UID", aml_int(0))); + aml_append(dev, aml_name_decl("_UID", aml_int(index))); aml_append(dev, aml_name_decl("_CCA", aml_int(1))); crs = aml_resource_template(); aml_append(crs, aml_qword_memory(AML_POS_DECODE, AML_MIN_FIXED, AML_MAX_FIXED, AML_NON_CACHEABLE, AML_READ_WRITE, - 0, VIRT_UART_BASE, VIRT_UART_BASE + VIRT_UART_SIZE - 1, + 0, base, base + VIRT_UART_SIZE - 1, 0, VIRT_UART_SIZE)); aml_append(crs, aml_interrupt(AML_CONSUMER, AML_LEVEL, AML_ACTIVE_HIGH, AML_SHARED, &uart_irq, 1)); @@ -401,6 +406,7 @@ static void acpi_dsdt_add_tpm(Aml *scope, LoongArchVirtMachineState *vms) static void build_dsdt(GArray *table_data, BIOSLinker *linker, MachineState *machine) { + int i; Aml *dsdt, *scope, *pkg; LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(machine); AcpiTable table = { .sig = "DSDT", .rev = 1, .oem_id = lvms->oem_id, @@ -408,7 +414,8 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, MachineState *machine) acpi_table_begin(&table, table_data); dsdt = init_aml_allocator(); - build_uart_device_aml(dsdt); + for (i = 0; i < VIRT_UART_COUNT; i++) + build_uart_device_aml(dsdt, i); build_pci_device_aml(dsdt, lvms); build_la_ged_aml(dsdt, machine); build_flash_aml(dsdt, lvms); diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index a7283e6755..a6e9309064 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -281,10 +281,10 @@ static void fdt_add_rtc_node(LoongArchVirtMachineState *lvms, } static void fdt_add_uart_node(LoongArchVirtMachineState *lvms, - uint32_t *pch_pic_phandle) + uint32_t *pch_pic_phandle, hwaddr base, + int irq, bool chosen) { char *nodename; - hwaddr base = VIRT_UART_BASE; hwaddr size = VIRT_UART_SIZE; MachineState *ms = MACHINE(lvms); @@ -293,9 +293,9 @@ static void fdt_add_uart_node(LoongArchVirtMachineState *lvms, qemu_fdt_setprop_string(ms->fdt, nodename, "compatible", "ns16550a"); qemu_fdt_setprop_cells(ms->fdt, nodename, "reg", 0x0, base, 0x0, size); qemu_fdt_setprop_cell(ms->fdt, nodename, "clock-frequency", 100000000); - qemu_fdt_setprop_string(ms->fdt, "/chosen", "stdout-path", nodename); - qemu_fdt_setprop_cells(ms->fdt, nodename, "interrupts", - VIRT_UART_IRQ - VIRT_GSI_BASE, 0x4); + if (chosen) + 
qemu_fdt_setprop_string(ms->fdt, "/chosen", "stdout-path", nodename); + qemu_fdt_setprop_cells(ms->fdt, nodename, "interrupts", irq, 0x4); qemu_fdt_setprop_cell(ms->fdt, nodename, "interrupt-parent", *pch_pic_phandle); g_free(nodename); @@ -706,11 +706,18 @@ static void virt_devices_init(DeviceState *pch_pic, /* Add pcie node */ fdt_add_pcie_node(lvms, pch_pic_phandle, pch_msi_phandle); - serial_mm_init(get_system_memory(), VIRT_UART_BASE, 0, - qdev_get_gpio_in(pch_pic, - VIRT_UART_IRQ - VIRT_GSI_BASE), - 115200, serial_hd(0), DEVICE_LITTLE_ENDIAN); - fdt_add_uart_node(lvms, pch_pic_phandle); + /* + * Create uart fdt node in reverse order so that they appear + * in the finished device tree lowest address first + */ + for (i = VIRT_UART_COUNT; i --> 0;) { + hwaddr base = VIRT_UART_BASE + i * VIRT_UART_SIZE; + int irq = VIRT_UART_IRQ + i - VIRT_GSI_BASE; + serial_mm_init(get_system_memory(), base, 0, + qdev_get_gpio_in(pch_pic, irq), + 115200, serial_hd(i), DEVICE_LITTLE_ENDIAN); + fdt_add_uart_node(lvms, pch_pic_phandle, base, irq, i == 0); + } /* Network init */ for (i = 0; i < nb_nics; i++) { diff --git a/include/hw/pci-host/ls7a.h b/include/hw/pci-host/ls7a.h index cd7c9ec7bc..79d4ea8501 100644 --- a/include/hw/pci-host/ls7a.h +++ b/include/hw/pci-host/ls7a.h @@ -36,17 +36,18 @@ #define VIRT_PCH_PIC_IRQ_NUM 32 #define VIRT_GSI_BASE 64 #define VIRT_DEVICE_IRQS 16 +#define VIRT_UART_COUNT 4 #define VIRT_UART_IRQ (VIRT_GSI_BASE + 2) #define VIRT_UART_BASE 0x1fe001e0 -#define VIRT_UART_SIZE 0X100 -#define VIRT_RTC_IRQ (VIRT_GSI_BASE + 3) +#define VIRT_UART_SIZE 0x100 +#define VIRT_RTC_IRQ (VIRT_GSI_BASE + 6) #define VIRT_MISC_REG_BASE (VIRT_PCH_REG_BASE + 0x00080000) #define VIRT_RTC_REG_BASE (VIRT_MISC_REG_BASE + 0x00050100) #define VIRT_RTC_LEN 0x100 -#define VIRT_SCI_IRQ (VIRT_GSI_BASE + 4) +#define VIRT_SCI_IRQ (VIRT_GSI_BASE + 7) #define VIRT_PLATFORM_BUS_BASEADDRESS 0x16000000 #define VIRT_PLATFORM_BUS_SIZE 0x2000000 #define VIRT_PLATFORM_BUS_NUM_IRQS 2 -#define VIRT_PLATFORM_BUS_IRQ (VIRT_GSI_BASE + 5) +#define VIRT_PLATFORM_BUS_IRQ (VIRT_GSI_BASE + 8) #endif -- Gitee From 573f3bec8137caf829457620380d794165c96a92 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Thu, 5 Sep 2024 17:33:16 +0200 Subject: [PATCH 371/939] hw/loongarch: virt: pass random seed to fdt If the FDT contains /chosen/rng-seed, then the Linux RNG will use it to initialize early. Set this using the usual guest random number generation function. This is the same procedure that's done in b91b6b5a2c ("hw/microblaze: pass random seed to fdt"), e4b4f0b71c ("hw/riscv: virt: pass random seed to fdt"), c6fe3e6b4c ("hw/openrisc: virt: pass random seed to fdt"), 67f7e426e5 ("hw/i386: pass RNG seed via setup_data entry"), c287941a4d ("hw/rx: pass random seed to fdt"), 5e19cc68fb ("hw/mips: boston: pass random seed to fdt"), 6b23a67916 ("hw/nios2: virt: pass random seed to fdt") c4b075318e ("hw/ppc: pass random seed to fdt"), and 5242876f37 ("hw/arm/virt: dt: add rng-seed property"). These earlier commits later were amended to rerandomize the RNG seed on snapshot load, but the LoongArch code somehow already does that, despite not having this patch here, presumably due to some lucky copy and pasting. Signed-off-by: Jason A. 
Donenfeld Reviewed-by: Song Gao Message-Id: <20240905153316.2038769-1-Jason@zx2c4.com> Signed-off-by: Song Gao --- hw/loongarch/virt.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index a6e9309064..79b16953d2 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -49,6 +49,7 @@ #include "hw/block/flash.h" #include "hw/virtio/virtio-iommu.h" #include "qemu/error-report.h" +#include "qemu/guest-random.h" static bool virt_is_veiointc_enabled(LoongArchVirtMachineState *lvms) { @@ -304,6 +305,7 @@ static void fdt_add_uart_node(LoongArchVirtMachineState *lvms, static void create_fdt(LoongArchVirtMachineState *lvms) { MachineState *ms = MACHINE(lvms); + uint8_t rng_seed[32]; ms->fdt = create_device_tree(&lvms->fdt_size); if (!ms->fdt) { @@ -317,6 +319,10 @@ static void create_fdt(LoongArchVirtMachineState *lvms) qemu_fdt_setprop_cell(ms->fdt, "/", "#address-cells", 0x2); qemu_fdt_setprop_cell(ms->fdt, "/", "#size-cells", 0x2); qemu_fdt_add_subnode(ms->fdt, "/chosen"); + + /* Pass seed to RNG */ + qemu_guest_getrandom_nofail(rng_seed, sizeof(rng_seed)); + qemu_fdt_setprop(ms->fdt, "/chosen", "rng-seed", rng_seed, sizeof(rng_seed)); } static void fdt_add_cpu_nodes(const LoongArchVirtMachineState *lvms) -- Gitee From fe22e0efe4c1c99fc876a42446cb2c87f9457afb Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Sat, 7 Sep 2024 15:30:37 +0800 Subject: [PATCH 372/939] hw/loongarch: Add acpi SPCR table support Serial port console redirection table can be used for default serial port selection, like chosen stdout-path selection with FDT method. With acpi SPCR table added, early debug console can be parsed from SPCR table with simple kernel parameter earlycon rather than earlycon=uart,mmio,0x1fe001e0 Signed-off-by: Bibo Mao Reviewed-by: Song Gao Message-Id: <20240907073037.243353-1-maobibo@loongson.cn> Signed-off-by: Song Gao --- hw/loongarch/acpi-build.c | 40 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/hw/loongarch/acpi-build.c b/hw/loongarch/acpi-build.c index 33a92223d8..bcdec2e1cb 100644 --- a/hw/loongarch/acpi-build.c +++ b/hw/loongarch/acpi-build.c @@ -242,6 +242,44 @@ build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine) acpi_table_end(linker, &table); } +/* + * Serial Port Console Redirection Table (SPCR) + * https://learn.microsoft.com/en-us/windows-hardware/drivers/serports/serial-port-console-redirection-table + */ +static void +spcr_setup(GArray *table_data, BIOSLinker *linker, MachineState *machine) +{ + LoongArchVirtMachineState *lvms; + AcpiSpcrData serial = { + .interface_type = 0, /* 16550 compatible */ + .base_addr.id = AML_AS_SYSTEM_MEMORY, + .base_addr.width = 32, + .base_addr.offset = 0, + .base_addr.size = 1, + .base_addr.addr = VIRT_UART_BASE, + .interrupt_type = 0, /* Interrupt not supported */ + .pc_interrupt = 0, + .interrupt = VIRT_UART_IRQ, + .baud_rate = 7, /* 115200 */ + .parity = 0, + .stop_bits = 1, + .flow_control = 0, + .terminal_type = 3, /* ANSI */ + .language = 0, /* Language */ + .pci_device_id = 0xffff, /* not a PCI device*/ + .pci_vendor_id = 0xffff, /* not a PCI device*/ + .pci_bus = 0, + .pci_device = 0, + .pci_function = 0, + .pci_flags = 0, + .pci_segment = 0, + }; + + lvms = LOONGARCH_VIRT_MACHINE(machine); + build_spcr(table_data, linker, &serial, 2, lvms->oem_id, + lvms->oem_table_id); +} + typedef struct AcpiBuildState { /* Copy of table in RAM (for patching). 
*/ @@ -484,6 +522,8 @@ static void acpi_build(AcpiBuildTables *tables, MachineState *machine) acpi_add_table(table_offsets, tables_blob); build_srat(tables_blob, tables->linker, machine); + acpi_add_table(table_offsets, tables_blob); + spcr_setup(tables_blob, tables->linker, machine); if (machine->numa_state->num_nodes) { if (machine->numa_state->have_numa_distance) { -- Gitee From 080ca7865257d70b6be671cbc17a97c5ebffbd68 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Fri, 13 Sep 2024 17:52:02 +0800 Subject: [PATCH 373/939] hw/loongarch/virt: Add description for virt machine type The description of the virt machine type was removed by mistake; add a new description here. Here is the output of the command "./qemu-system-loongarch64 -M help": Supported machines are: none empty machine virt QEMU LoongArch Virtual Machine (default) x-remote Experimental remote machine Without the patch, it shows as follows: Supported machines are: none empty machine virt (null) (default) x-remote Experimental remote machine Fixes: ef2f11454c(hw/loongarch/virt: Replace Loongson IPI with LoongArch IPI) Signed-off-by: Bibo Mao Reviewed-by: Thomas Huth Reviewed-by: Michael Tokarev Signed-off-by: Michael Tokarev --- hw/loongarch/virt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index 79b16953d2..9f47107379 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -1383,6 +1383,7 @@ static void virt_class_init(ObjectClass *oc, void *data) mc->init = virt_init; mc->default_cpu_type = LOONGARCH_CPU_TYPE_NAME("la464"); mc->default_ram_id = "loongarch.ram"; + mc->desc = "QEMU LoongArch Virtual Machine"; mc->max_cpus = LOONGARCH_MAX_CPUS; mc->is_default = 1; mc->default_kernel_irqchip_split = false; -- Gitee From fa276847efb3fd47a730d279f1b14705fe3991b1 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Wed, 18 Sep 2024 09:42:06 +0800 Subject: [PATCH 374/939] hw/loongarch/virt: Add FDT table support with acpi ged pm register ACPI GED is used for power management on the LoongArch virt platform; in general it is parsed from the ACPI table. However, if the system boots directly from an ELF kernel, no UEFI BIOS is provided and the ACPI table cannot be used either. Here the ACPI GED pm register is exposed through the FDT table; it is compatible with the syscon method in the FDT table, except that the ACPI GED pm register is accessed in 8-bit mode rather than 32-bit mode.
Signed-off-by: Bibo Mao Reviewed-by: Song Gao Tested-by: Song Gao Message-Id: <20240918014206.2165821-3-maobibo@loongson.cn> Signed-off-by: Song Gao --- hw/loongarch/virt.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index 9f47107379..9510aa7a7e 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -281,6 +281,44 @@ static void fdt_add_rtc_node(LoongArchVirtMachineState *lvms, g_free(nodename); } +static void fdt_add_ged_reset(LoongArchVirtMachineState *lvms) +{ + char *name; + uint32_t ged_handle; + MachineState *ms = MACHINE(lvms); + hwaddr base = VIRT_GED_REG_ADDR; + hwaddr size = ACPI_GED_REG_COUNT; + + ged_handle = qemu_fdt_alloc_phandle(ms->fdt); + name = g_strdup_printf("/ged@%" PRIx64, base); + qemu_fdt_add_subnode(ms->fdt, name); + qemu_fdt_setprop_string(ms->fdt, name, "compatible", "syscon"); + qemu_fdt_setprop_cells(ms->fdt, name, "reg", 0x0, base, 0x0, size); + /* 8 bit registers */ + qemu_fdt_setprop_cell(ms->fdt, name, "reg-shift", 0); + qemu_fdt_setprop_cell(ms->fdt, name, "reg-io-width", 1); + qemu_fdt_setprop_cell(ms->fdt, name, "phandle", ged_handle); + ged_handle = qemu_fdt_get_phandle(ms->fdt, name); + g_free(name); + + name = g_strdup_printf("/reboot"); + qemu_fdt_add_subnode(ms->fdt, name); + qemu_fdt_setprop_string(ms->fdt, name, "compatible", "syscon-reboot"); + qemu_fdt_setprop_cell(ms->fdt, name, "regmap", ged_handle); + qemu_fdt_setprop_cell(ms->fdt, name, "offset", ACPI_GED_REG_RESET); + qemu_fdt_setprop_cell(ms->fdt, name, "value", ACPI_GED_RESET_VALUE); + g_free(name); + + name = g_strdup_printf("/poweroff"); + qemu_fdt_add_subnode(ms->fdt, name); + qemu_fdt_setprop_string(ms->fdt, name, "compatible", "syscon-poweroff"); + qemu_fdt_setprop_cell(ms->fdt, name, "regmap", ged_handle); + qemu_fdt_setprop_cell(ms->fdt, name, "offset", ACPI_GED_REG_SLEEP_CTL); + qemu_fdt_setprop_cell(ms->fdt, name, "value", ACPI_GED_SLP_EN | + (ACPI_GED_SLP_TYP_S5 << ACPI_GED_SLP_TYP_POS)); + g_free(name); +} + static void fdt_add_uart_node(LoongArchVirtMachineState *lvms, uint32_t *pch_pic_phandle, hwaddr base, int irq, bool chosen) @@ -739,6 +777,7 @@ static void virt_devices_init(DeviceState *pch_pic, qdev_get_gpio_in(pch_pic, VIRT_RTC_IRQ - VIRT_GSI_BASE)); fdt_add_rtc_node(lvms, pch_pic_phandle); + fdt_add_ged_reset(lvms); /* acpi ged */ lvms->acpi_ged = create_acpi_ged(pch_pic, lvms); -- Gitee From c10277bf2fb63f1c1cc206711651bc6d7c268c53 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Wed, 18 Sep 2024 09:42:05 +0800 Subject: [PATCH 375/939] acpi: ged: Add macro for acpi sleep control register Macro definition is added for acpi sleep control register, ged emulation driver can use the macro , also it can be used in FDT table if ged is exposed with FDT table. 
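A small worked example (editor's sketch; the macro values are the ones added to generic_event_device.h in the hunk below): an S5 poweroff request is the byte 0x34, i.e. SLP_EN with SLP_TYP set to 5, which is also the value the FDT poweroff node above advertises to the guest's syscon-poweroff driver.

    #include <assert.h>
    #include <stdio.h>

    #define ACPI_GED_SLP_TYP_POS  0x2   /* SLP_TYPx bit offset   */
    #define ACPI_GED_SLP_TYP_MASK 0x07  /* SLP_TYPx 3-bit mask   */
    #define ACPI_GED_SLP_TYP_S5   0x05  /* soft off              */
    #define ACPI_GED_SLP_EN       0x20  /* SLP_EN write-only bit */

    int main(void)
    {
        unsigned data = ACPI_GED_SLP_EN |
                        (ACPI_GED_SLP_TYP_S5 << ACPI_GED_SLP_TYP_POS);
        unsigned slp_typ = (data >> ACPI_GED_SLP_TYP_POS) & ACPI_GED_SLP_TYP_MASK;
        unsigned slp_en = !!(data & ACPI_GED_SLP_EN);

        printf("write 0x%x -> slp_typ %u, slp_en %u\n", data, slp_typ, slp_en);
        assert(data == 0x34 && slp_typ == ACPI_GED_SLP_TYP_S5 && slp_en == 1);
        return 0;
    }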
Signed-off-by: Bibo Mao Reviewed-by: Igor Mammedov Message-Id: <20240918014206.2165821-2-maobibo@loongson.cn> Signed-off-by: Song Gao --- hw/acpi/generic_event_device.c | 6 +++--- include/hw/acpi/generic_event_device.h | 7 +++++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/hw/acpi/generic_event_device.c b/hw/acpi/generic_event_device.c index 4731a614a3..2ce7031f1a 100644 --- a/hw/acpi/generic_event_device.c +++ b/hw/acpi/generic_event_device.c @@ -203,9 +203,9 @@ static void ged_regs_write(void *opaque, hwaddr addr, uint64_t data, switch (addr) { case ACPI_GED_REG_SLEEP_CTL: - slp_typ = (data >> 2) & 0x07; - slp_en = (data >> 5) & 0x01; - if (slp_en && slp_typ == 5) { + slp_typ = (data >> ACPI_GED_SLP_TYP_POS) & ACPI_GED_SLP_TYP_MASK; + slp_en = !!(data & ACPI_GED_SLP_EN); + if (slp_en && slp_typ == ACPI_GED_SLP_TYP_S5) { qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN); } return; diff --git a/include/hw/acpi/generic_event_device.h b/include/hw/acpi/generic_event_device.h index 90fc41cbb8..8ed9534c57 100644 --- a/include/hw/acpi/generic_event_device.h +++ b/include/hw/acpi/generic_event_device.h @@ -81,8 +81,11 @@ OBJECT_DECLARE_SIMPLE_TYPE(AcpiGedState, ACPI_GED) /* ACPI_GED_REG_RESET value for reset*/ #define ACPI_GED_RESET_VALUE 0x42 -/* ACPI_GED_REG_SLEEP_CTL.SLP_TYP value for S5 (aka poweroff) */ -#define ACPI_GED_SLP_TYP_S5 0x05 +/* [ACPI 5.0 Chapter 4.8.3.7] Sleep Control and Status Register */ +#define ACPI_GED_SLP_TYP_POS 0x2 /* SLP_TYPx Bit Offset */ +#define ACPI_GED_SLP_TYP_MASK 0x07 /* SLP_TYPx 3-bit mask */ +#define ACPI_GED_SLP_TYP_S5 0x05 /* System _S5 State (Soft Off) */ +#define ACPI_GED_SLP_EN 0x20 /* SLP_EN write-only bit */ #define GED_DEVICE "GED" #define AML_GED_EVT_REG "EREG" -- Gitee From 478c205000b3df3710a7d30f3e3e5bd4f35b0156 Mon Sep 17 00:00:00 2001 From: Sia Jee Heng Date: Sun, 28 Jan 2024 18:14:39 -0800 Subject: [PATCH 376/939] hw/arm/virt-acpi-build.c: Migrate SPCR creation to common location RISC-V should also generate the SPCR in a manner similar to ARM. Therefore, instead of replicating the code, relocate this function to the common AML build. 
Signed-off-by: Sia Jee Heng Reviewed-by: Alistair Francis Message-ID: <20240129021440.17640-2-jeeheng.sia@starfivetech.com> [ Changes by AF: - Add missing Language SPCR entry ] Signed-off-by: Alistair Francis --- hw/acpi/aml-build.c | 53 +++++++++++++++++++++++++++++ hw/arm/virt-acpi-build.c | 68 +++++++++++++++---------------------- include/hw/acpi/acpi-defs.h | 33 ++++++++++++++++++ include/hw/acpi/aml-build.h | 4 +++ 4 files changed, 117 insertions(+), 41 deletions(-) diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c index bf9c59f544..b0b68efa02 100644 --- a/hw/acpi/aml-build.c +++ b/hw/acpi/aml-build.c @@ -2016,6 +2016,59 @@ static void build_processor_hierarchy_node(GArray *tbl, uint32_t flags, } } +void build_spcr(GArray *table_data, BIOSLinker *linker, + const AcpiSpcrData *f, const uint8_t rev, + const char *oem_id, const char *oem_table_id) +{ + AcpiTable table = { .sig = "SPCR", .rev = rev, .oem_id = oem_id, + .oem_table_id = oem_table_id }; + + acpi_table_begin(&table, table_data); + /* Interface type */ + build_append_int_noprefix(table_data, f->interface_type, 1); + /* Reserved */ + build_append_int_noprefix(table_data, 0, 3); + /* Base Address */ + build_append_gas(table_data, f->base_addr.id, f->base_addr.width, + f->base_addr.offset, f->base_addr.size, + f->base_addr.addr); + /* Interrupt type */ + build_append_int_noprefix(table_data, f->interrupt_type, 1); + /* IRQ */ + build_append_int_noprefix(table_data, f->pc_interrupt, 1); + /* Global System Interrupt */ + build_append_int_noprefix(table_data, f->interrupt, 4); + /* Baud Rate */ + build_append_int_noprefix(table_data, f->baud_rate, 1); + /* Parity */ + build_append_int_noprefix(table_data, f->parity, 1); + /* Stop Bits */ + build_append_int_noprefix(table_data, f->stop_bits, 1); + /* Flow Control */ + build_append_int_noprefix(table_data, f->flow_control, 1); + /* Language */ + build_append_int_noprefix(table_data, f->language, 1); + /* Terminal Type */ + build_append_int_noprefix(table_data, f->terminal_type, 1); + /* PCI Device ID */ + build_append_int_noprefix(table_data, f->pci_device_id, 2); + /* PCI Vendor ID */ + build_append_int_noprefix(table_data, f->pci_vendor_id, 2); + /* PCI Bus Number */ + build_append_int_noprefix(table_data, f->pci_bus, 1); + /* PCI Device Number */ + build_append_int_noprefix(table_data, f->pci_device, 1); + /* PCI Function Number */ + build_append_int_noprefix(table_data, f->pci_function, 1); + /* PCI Flags */ + build_append_int_noprefix(table_data, f->pci_flags, 4); + /* PCI Segment */ + build_append_int_noprefix(table_data, f->pci_segment, 1); + /* Reserved */ + build_append_int_noprefix(table_data, 0, 4); + + acpi_table_end(linker, &table); +} /* * ACPI spec, Revision 6.3 * 5.2.29.2 Cache Type Structure (Type 1) diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c index 179600d4fe..bc637b8619 100644 --- a/hw/arm/virt-acpi-build.c +++ b/hw/arm/virt-acpi-build.c @@ -550,48 +550,34 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) * Rev: 1.07 */ static void -build_spcr(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) +spcr_setup(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) { - AcpiTable table = { .sig = "SPCR", .rev = 2, .oem_id = vms->oem_id, - .oem_table_id = vms->oem_table_id }; - - acpi_table_begin(&table, table_data); - - /* Interface Type */ - build_append_int_noprefix(table_data, 3, 1); /* ARM PL011 UART */ - build_append_int_noprefix(table_data, 0, 3); /* Reserved */ - /* Base Address */ - 
build_append_gas(table_data, AML_AS_SYSTEM_MEMORY, 32, 0, 3, - vms->memmap[VIRT_UART].base); - /* Interrupt Type */ - build_append_int_noprefix(table_data, - (1 << 3) /* Bit[3] ARMH GIC interrupt */, 1); - build_append_int_noprefix(table_data, 0, 1); /* IRQ */ - /* Global System Interrupt */ - build_append_int_noprefix(table_data, - vms->irqmap[VIRT_UART] + ARM_SPI_BASE, 4); - build_append_int_noprefix(table_data, 3 /* 9600 */, 1); /* Baud Rate */ - build_append_int_noprefix(table_data, 0 /* No Parity */, 1); /* Parity */ - /* Stop Bits */ - build_append_int_noprefix(table_data, 1 /* 1 Stop bit */, 1); - /* Flow Control */ - build_append_int_noprefix(table_data, - (1 << 1) /* RTS/CTS hardware flow control */, 1); - /* Terminal Type */ - build_append_int_noprefix(table_data, 0 /* VT100 */, 1); - build_append_int_noprefix(table_data, 0, 1); /* Language */ - /* PCI Device ID */ - build_append_int_noprefix(table_data, 0xffff /* not a PCI device*/, 2); - /* PCI Vendor ID */ - build_append_int_noprefix(table_data, 0xffff /* not a PCI device*/, 2); - build_append_int_noprefix(table_data, 0, 1); /* PCI Bus Number */ - build_append_int_noprefix(table_data, 0, 1); /* PCI Device Number */ - build_append_int_noprefix(table_data, 0, 1); /* PCI Function Number */ - build_append_int_noprefix(table_data, 0, 4); /* PCI Flags */ - build_append_int_noprefix(table_data, 0, 1); /* PCI Segment */ - build_append_int_noprefix(table_data, 0, 4); /* Reserved */ + AcpiSpcrData serial = { + .interface_type = 3, /* ARM PL011 UART */ + .base_addr.id = AML_AS_SYSTEM_MEMORY, + .base_addr.width = 32, + .base_addr.offset = 0, + .base_addr.size = 3, + .base_addr.addr = vms->memmap[VIRT_UART].base, + .interrupt_type = (1 << 3),/* Bit[3] ARMH GIC interrupt*/ + .pc_interrupt = 0, /* IRQ */ + .interrupt = (vms->irqmap[VIRT_UART] + ARM_SPI_BASE), + .baud_rate = 3, /* 9600 */ + .parity = 0, /* No Parity */ + .stop_bits = 1, /* 1 Stop bit */ + .flow_control = 1 << 1, /* RTS/CTS hardware flow control */ + .terminal_type = 0, /* VT100 */ + .language = 0, /* Language */ + .pci_device_id = 0xffff, /* not a PCI device*/ + .pci_vendor_id = 0xffff, /* not a PCI device*/ + .pci_bus = 0, + .pci_device = 0, + .pci_function = 0, + .pci_flags = 0, + .pci_segment = 0, + }; - acpi_table_end(linker, &table); + build_spcr(table_data, linker, &serial, 2, vms->oem_id, vms->oem_table_id); } /* @@ -1149,7 +1135,7 @@ void virt_acpi_build(VirtMachineState *vms, AcpiBuildTables *tables) } acpi_add_table(table_offsets, tables_blob); - build_spcr(tables_blob, tables->linker, vms); + spcr_setup(tables_blob, tables->linker, vms); acpi_add_table(table_offsets, tables_blob); build_dbg2(tables_blob, tables->linker, vms); diff --git a/include/hw/acpi/acpi-defs.h b/include/hw/acpi/acpi-defs.h index b1f389fb4b..7a8b708cda 100644 --- a/include/hw/acpi/acpi-defs.h +++ b/include/hw/acpi/acpi-defs.h @@ -90,6 +90,39 @@ typedef struct AcpiFadtData { unsigned *xdsdt_tbl_offset; } AcpiFadtData; +typedef struct AcpiGas { + uint8_t id; /* Address space ID */ + uint8_t width; /* Register bit width */ + uint8_t offset; /* Register bit offset */ + uint8_t size; /* Access size */ + uint64_t addr; /* Address */ +} AcpiGas; + +/* SPCR (Serial Port Console Redirection table) */ +typedef struct AcpiSpcrData { + uint8_t interface_type; + uint8_t reserved[3]; + struct AcpiGas base_addr; + uint8_t interrupt_type; + uint8_t pc_interrupt; + uint32_t interrupt; /* Global system interrupt */ + uint8_t baud_rate; + uint8_t parity; + uint8_t stop_bits; + uint8_t flow_control; + uint8_t 
terminal_type; + uint8_t language; + uint8_t reserved1; + uint16_t pci_device_id; /* Must be 0xffff if not PCI device */ + uint16_t pci_vendor_id; /* Must be 0xffff if not PCI device */ + uint8_t pci_bus; + uint8_t pci_device; + uint8_t pci_function; + uint32_t pci_flags; + uint8_t pci_segment; + uint32_t reserved2; +} AcpiSpcrData; + #define ACPI_FADT_ARM_PSCI_COMPLIANT (1 << 0) #define ACPI_FADT_ARM_PSCI_USE_HVC (1 << 1) diff --git a/include/hw/acpi/aml-build.h b/include/hw/acpi/aml-build.h index 7281c281f6..c0b852cbd2 100644 --- a/include/hw/acpi/aml-build.h +++ b/include/hw/acpi/aml-build.h @@ -548,4 +548,8 @@ void build_fadt(GArray *tbl, BIOSLinker *linker, const AcpiFadtData *f, void build_tpm2(GArray *table_data, BIOSLinker *linker, GArray *tcpalog, const char *oem_id, const char *oem_table_id); + +void build_spcr(GArray *table_data, BIOSLinker *linker, + const AcpiSpcrData *f, const uint8_t rev, + const char *oem_id, const char *oem_table_id); #endif -- Gitee From 033e2a67885cf7347473e09454a6704074e05878 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Mon, 6 May 2024 09:19:12 +0800 Subject: [PATCH 377/939] target/loongarch: Add TCG macro in structure CPUArchState In structure CPUArchState some struct elements are only used in TCG mode, and it is not used in KVM mode. Macro CONFIG_TCG is added to make it simpiler in KVM mode, also there is the same modification in c code when these structure elements are used. When VM runs in KVM mode, TLB entries are not used and do not need migrate. It is only useful when it runs in TCG mode. Signed-off-by: Bibo Mao Reviewed-by: Richard Henderson Message-Id: <20240506011912.2108842-1-maobibo@loongson.cn> Signed-off-by: Song Gao --- target/loongarch/cpu.c | 7 +++-- target/loongarch/cpu.h | 16 +++++++---- target/loongarch/cpu_helper.c | 9 ++++++ target/loongarch/machine.c | 52 ++++++++++++++++++++++++----------- 4 files changed, 60 insertions(+), 24 deletions(-) diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c index f7b5dae7ed..220d40fb01 100644 --- a/target/loongarch/cpu.c +++ b/target/loongarch/cpu.c @@ -536,7 +536,9 @@ static void loongarch_cpu_reset_hold(Object *obj) lacc->parent_phases.hold(obj); } +#ifdef CONFIG_TCG env->fcsr0_mask = FCSR0_M1 | FCSR0_M2 | FCSR0_M3; +#endif env->fcsr0 = 0x0; int n; @@ -581,7 +583,9 @@ static void loongarch_cpu_reset_hold(Object *obj) #ifndef CONFIG_USER_ONLY env->pc = 0x1c000000; +#ifdef CONFIG_TCG memset(env->tlb, 0, sizeof(env->tlb)); +#endif if (kvm_enabled()) { kvm_arch_reset_vcpu(env); } @@ -778,8 +782,7 @@ void loongarch_cpu_dump_state(CPUState *cs, FILE *f, int flags) int i; qemu_fprintf(f, " PC=%016" PRIx64 " ", env->pc); - qemu_fprintf(f, " FCSR0 0x%08x fp_status 0x%02x\n", env->fcsr0, - get_float_exception_flags(&env->fp_status)); + qemu_fprintf(f, " FCSR0 0x%08x\n", env->fcsr0); /* gpr */ for (i = 0; i < 32; i++) { diff --git a/target/loongarch/cpu.h b/target/loongarch/cpu.h index e3a15c593f..19bcad28de 100644 --- a/target/loongarch/cpu.h +++ b/target/loongarch/cpu.h @@ -275,6 +275,7 @@ union fpr_t { VReg vreg; }; +#ifdef CONFIG_TCG struct LoongArchTLB { uint64_t tlb_misc; /* Fields corresponding to CSR_TLBELO0/1 */ @@ -282,23 +283,18 @@ struct LoongArchTLB { uint64_t tlb_entry1; }; typedef struct LoongArchTLB LoongArchTLB; +#endif typedef struct CPUArchState { uint64_t gpr[32]; uint64_t pc; fpr_t fpr[32]; - float_status fp_status; bool cf[8]; - uint32_t fcsr0; - uint32_t fcsr0_mask; uint32_t cpucfg[21]; - uint64_t lladdr; /* LL virtual address compared against SC */ - uint64_t llval; - /* 
LoongArch CSRs */ uint64_t CSR_CRMD; uint64_t CSR_PRMD; @@ -355,8 +351,16 @@ typedef struct CPUArchState { uint64_t CSR_DERA; uint64_t CSR_DSAVE; +#ifdef CONFIG_TCG + float_status fp_status; + uint32_t fcsr0_mask; + uint64_t lladdr; /* LL virtual address compared against SC */ + uint64_t llval; +#endif #ifndef CONFIG_USER_ONLY +#ifdef CONFIG_TCG LoongArchTLB tlb[LOONGARCH_TLB_MAX]; +#endif AddressSpace *address_space_iocsr; bool load_elf; diff --git a/target/loongarch/cpu_helper.c b/target/loongarch/cpu_helper.c index f68d63f466..39037eecb4 100644 --- a/target/loongarch/cpu_helper.c +++ b/target/loongarch/cpu_helper.c @@ -11,6 +11,7 @@ #include "internals.h" #include "cpu-csr.h" +#ifdef CONFIG_TCG static int loongarch_map_tlb_entry(CPULoongArchState *env, hwaddr *physical, int *prot, target_ulong address, int access_type, int index, int mmu_idx) @@ -154,6 +155,14 @@ static int loongarch_map_address(CPULoongArchState *env, hwaddr *physical, return TLBRET_NOMATCH; } +#else +static int loongarch_map_address(CPULoongArchState *env, hwaddr *physical, + int *prot, target_ulong address, + MMUAccessType access_type, int mmu_idx) +{ + return TLBRET_NOMATCH; +} +#endif static hwaddr dmw_va2pa(CPULoongArchState *env, target_ulong va, target_ulong dmw) diff --git a/target/loongarch/machine.c b/target/loongarch/machine.c index ec5abe56db..4bbf495d6b 100644 --- a/target/loongarch/machine.c +++ b/target/loongarch/machine.c @@ -8,6 +8,7 @@ #include "qemu/osdep.h" #include "cpu.h" #include "migration/cpu.h" +#include "sysemu/tcg.h" #include "vec.h" #include "kvm/kvm_loongarch.h" #include "sysemu/kvm.h" @@ -111,19 +112,6 @@ static const VMStateDescription vmstate_lasx = { }, }; -/* TLB state */ -const VMStateDescription vmstate_tlb = { - .name = "cpu/tlb", - .version_id = 0, - .minimum_version_id = 0, - .fields = (VMStateField[]) { - VMSTATE_UINT64(tlb_misc, LoongArchTLB), - VMSTATE_UINT64(tlb_entry0, LoongArchTLB), - VMSTATE_UINT64(tlb_entry1, LoongArchTLB), - VMSTATE_END_OF_LIST() - } -}; - static int cpu_post_load(void *opaque, int version_id) { #ifdef CONFIG_KVM @@ -142,6 +130,38 @@ static int cpu_pre_save(void *opaque) return 0; } +#if defined(CONFIG_TCG) && !defined(CONFIG_USER_ONLY) +static bool tlb_needed(void *opaque) +{ + return tcg_enabled(); +} + +/* TLB state */ +static const VMStateDescription vmstate_tlb_entry = { + .name = "cpu/tlb_entry", + .version_id = 0, + .minimum_version_id = 0, + .fields = (VMStateField[]) { + VMSTATE_UINT64(tlb_misc, LoongArchTLB), + VMSTATE_UINT64(tlb_entry0, LoongArchTLB), + VMSTATE_UINT64(tlb_entry1, LoongArchTLB), + VMSTATE_END_OF_LIST() + } +}; + +static const VMStateDescription vmstate_tlb = { + .name = "cpu/tlb", + .version_id = 0, + .minimum_version_id = 0, + .needed = tlb_needed, + .fields = (const VMStateField[]) { + VMSTATE_STRUCT_ARRAY(env.tlb, LoongArchCPU, LOONGARCH_TLB_MAX, + 0, vmstate_tlb_entry, LoongArchTLB), + VMSTATE_END_OF_LIST() + } +}; +#endif + /* LoongArch CPU state */ const VMStateDescription vmstate_loongarch_cpu = { .name = "cpu", @@ -212,9 +232,6 @@ const VMStateDescription vmstate_loongarch_cpu = { VMSTATE_UINT64(env.CSR_DBG, LoongArchCPU), VMSTATE_UINT64(env.CSR_DERA, LoongArchCPU), VMSTATE_UINT64(env.CSR_DSAVE, LoongArchCPU), - /* TLB */ - VMSTATE_STRUCT_ARRAY(env.tlb, LoongArchCPU, LOONGARCH_TLB_MAX, - 0, vmstate_tlb, LoongArchTLB), VMSTATE_UINT64(kvm_state_counter, LoongArchCPU), @@ -224,6 +241,9 @@ const VMStateDescription vmstate_loongarch_cpu = { &vmstate_fpu, &vmstate_lsx, &vmstate_lasx, +#if defined(CONFIG_TCG) && 
!defined(CONFIG_USER_ONLY) + &vmstate_tlb, +#endif NULL } }; -- Gitee From 717faefc8f56490ad94ef69b42c2d2491225ace8 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Sun, 28 Apr 2024 11:16:51 +0800 Subject: [PATCH 378/939] target/loongarch: Put cpucfg operation before CSR register On Loongarch, cpucfg is register for cpu feature, some other registers depend on cpucfg feature such as perf CSR registers. Here put cpucfg read/write operations before CSR register, so that KVM knows how many perf CSR registers are valid from pre-set cpucfg feature information. Signed-off-by: Bibo Mao Reviewed-by: Song Gao Message-Id: <20240428031651.1354587-1-maobibo@loongson.cn> Signed-off-by: Song Gao --- target/loongarch/kvm/kvm.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/target/loongarch/kvm/kvm.c b/target/loongarch/kvm/kvm.c index 5c88270132..407d454919 100644 --- a/target/loongarch/kvm/kvm.c +++ b/target/loongarch/kvm/kvm.c @@ -714,22 +714,22 @@ int kvm_arch_get_registers(CPUState *cs) return ret; } - ret = kvm_loongarch_get_csr(cs); + ret = kvm_loongarch_get_cpucfg(cs); if (ret) { return ret; } - ret = kvm_loongarch_get_regs_fp(cs); + ret = kvm_loongarch_get_csr(cs); if (ret) { return ret; } - ret = kvm_loongarch_get_mpstate(cs); + ret = kvm_loongarch_get_regs_fp(cs); if (ret) { return ret; } - ret = kvm_loongarch_get_cpucfg(cs); + ret = kvm_loongarch_get_mpstate(cs); return ret; } @@ -742,22 +742,22 @@ int kvm_arch_put_registers(CPUState *cs, int level) return ret; } - ret = kvm_loongarch_put_csr(cs, level); + ret = kvm_loongarch_put_cpucfg(cs); if (ret) { return ret; } - ret = kvm_loongarch_put_regs_fp(cs); + ret = kvm_loongarch_put_csr(cs, level); if (ret) { return ret; } - ret = kvm_loongarch_put_mpstate(cs); + ret = kvm_loongarch_put_regs_fp(cs); if (ret) { return ret; } - ret = kvm_loongarch_put_cpucfg(cs); + ret = kvm_loongarch_put_mpstate(cs); return ret; } -- Gitee From 520e792f674a7ab192a9237519c4e0c8f50abc71 Mon Sep 17 00:00:00 2001 From: Song Gao Date: Wed, 8 May 2024 10:47:32 +0800 Subject: [PATCH 379/939] target/loongarch/kvm: Fix VM recovery from disk failures vmstate does not save kvm_state_conter, which can cause VM recovery from disk to fail. Cc: qemu-stable@nongnu.org Signed-off-by: Song Gao Acked-by: Peter Xu Message-Id: <20240508024732.3127792-1-gaosong@loongson.cn> --- target/loongarch/machine.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/target/loongarch/machine.c b/target/loongarch/machine.c index 4bbf495d6b..97e1152ffd 100644 --- a/target/loongarch/machine.c +++ b/target/loongarch/machine.c @@ -165,11 +165,11 @@ static const VMStateDescription vmstate_tlb = { /* LoongArch CPU state */ const VMStateDescription vmstate_loongarch_cpu = { .name = "cpu", - .version_id = 1, - .minimum_version_id = 1, + .version_id = 2, + .minimum_version_id = 2, .post_load = cpu_post_load, .pre_save = cpu_pre_save, - .fields = (VMStateField[]) { + .fields = (const VMStateField[]) { VMSTATE_UINTTL_ARRAY(env.gpr, LoongArchCPU, 32), VMSTATE_UINTTL(env.pc, LoongArchCPU), -- Gitee From f572c385e0d368cbf12acf7d6f0b33b5f2efd7f0 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Tue, 21 May 2024 16:05:48 +0800 Subject: [PATCH 380/939] target/loongarch: Add loongarch vector property unconditionally Currently LSX/LASX vector property is decided by the default value. Instead vector property should be added unconditionally, and it is irrelative with its default value. If vector is disabled by default, vector also can be enabled from command line. 
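Once the properties are registered unconditionally, a vector unit can be switched on or off from the command line (for example -cpu la464,lasx=off, or lsx=on even when the model's default cpucfg leaves the bit clear); done programmatically, the same switch is just a QOM property write. A minimal sketch, assuming the usual QOM helper object_property_set_bool():

    /* Sketch: toggle the "lsx" property on an already-created LoongArch CPU.
     * Works regardless of the cpucfg[2] default now that the property is
     * always registered. */
    static void set_lsx_example(LoongArchCPU *cpu, bool enable, Error **errp)
    {
        object_property_set_bool(OBJECT(cpu), "lsx", enable, errp);
    }
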
Signed-off-by: Bibo Mao Reviewed-by: Song Gao Message-Id: <20240521080549.434197-2-maobibo@loongson.cn> Signed-off-by: Song Gao --- target/loongarch/cpu.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c index 220d40fb01..f89740a5aa 100644 --- a/target/loongarch/cpu.c +++ b/target/loongarch/cpu.c @@ -720,14 +720,10 @@ void loongarch_cpu_post_init(Object *obj) { LoongArchCPU *cpu = LOONGARCH_CPU(obj); - if (FIELD_EX32(cpu->env.cpucfg[2], CPUCFG2, LSX)) { - object_property_add_bool(obj, "lsx", loongarch_get_lsx, - loongarch_set_lsx); - } - if (FIELD_EX32(cpu->env.cpucfg[2], CPUCFG2, LASX)) { - object_property_add_bool(obj, "lasx", loongarch_get_lasx, - loongarch_set_lasx); - } + object_property_add_bool(obj, "lsx", loongarch_get_lsx, + loongarch_set_lsx); + object_property_add_bool(obj, "lasx", loongarch_get_lasx, + loongarch_set_lasx); if (kvm_enabled()) { object_property_add_bool(obj, "pmu", loongarch_get_pmu, -- Gitee From 9a6ef31fa2fcf1f1257fb849cc6cabe2b4c440e0 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Fri, 7 Jun 2024 11:50:16 +0800 Subject: [PATCH 381/939] target/loongarch/kvm: Add software breakpoint support With KVM virtualization, a debug exception for a normal break instruction is injected into the guest kernel rather than the host. Here a hypercall instruction with a special code is used for software breakpoints, and the exact instruction encoding comes from the KVM kernel through the user API KVM_REG_LOONGARCH_DEBUG_INST. Only software breakpoints are supported for now; they can be inserted and removed, so the guest kernel can be debugged with gdb once it is loaded. Hardware breakpoint support will be added later. Signed-off-by: Bibo Mao Reviewed-by: Song Gao Tested-by: Song Gao Message-Id: <20240607035016.2975799-1-maobibo@loongson.cn> Signed-off-by: Song Gao --- configs/targets/loongarch64-softmmu.mak | 1 + 1 file changed, 1 insertion(+) diff --git a/configs/targets/loongarch64-softmmu.mak b/configs/targets/loongarch64-softmmu.mak index f23780fdd8..0034c33620 100644 --- a/configs/targets/loongarch64-softmmu.mak +++ b/configs/targets/loongarch64-softmmu.mak @@ -1,5 +1,6 @@ TARGET_ARCH=loongarch64 TARGET_BASE_ARCH=loongarch +TARGET_KVM_HAVE_GUEST_DEBUG=y TARGET_SUPPORTS_MTTCG=y TARGET_XML_FILES= gdb-xml/loongarch-base32.xml gdb-xml/loongarch-base64.xml gdb-xml/loongarch-fpu.xml TARGET_NEED_FDT=y -- Gitee From 3b3fdfa6d5439298b883e2e223fa04a2209612f5 Mon Sep 17 00:00:00 2001 From: Feiyang Chen Date: Fri, 28 Jun 2024 13:33:57 +1000 Subject: [PATCH 382/939] target/loongarch: Remove avail_64 in trans_srai_w() and simplify it Since srai.w is a valid instruction on la32, remove the avail_64 check and simplify trans_srai_w().
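The rewritten helper below is simply a 32-bit arithmetic shift whose result stays sign-extended in the 64-bit register. A stand-alone C model of the sextract form, assuming the usual two's-complement, arithmetic-shift behaviour of the host compiler:

    #include <assert.h>
    #include <stdint.h>

    /* Model of gen_sari_w(): take bits [imm, 31] of the low word and sign-extend. */
    static int64_t sari_w(int64_t src, unsigned imm)
    {
        return (int64_t)((int32_t)src >> imm);
    }

    int main(void)
    {
        /* srai.w of 0x80000000 by 4 yields 0xfffffffff8000000 in a 64-bit GPR. */
        assert(sari_w(0x0000000080000000ll, 4) == (int64_t)0xfffffffff8000000ull);
        /* imm == 0 degenerates to a plain sign-extension of the low 32 bits. */
        assert(sari_w(0x00000000fffffffell, 0) == -2);
        return 0;
    }
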
Fixes: c0c0461e3a06 ("target/loongarch: Add avail_64 to check la64-only instructions") Reviewed-by: Richard Henderson Signed-off-by: Feiyang Chen Message-Id: <20240628033357.50027-1-chris.chenfeiyang@gmail.com> Signed-off-by: Song Gao --- target/loongarch/tcg/insn_trans/trans_shift.c.inc | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/target/loongarch/tcg/insn_trans/trans_shift.c.inc b/target/loongarch/tcg/insn_trans/trans_shift.c.inc index 2f4bd6ff28..377307785a 100644 --- a/target/loongarch/tcg/insn_trans/trans_shift.c.inc +++ b/target/loongarch/tcg/insn_trans/trans_shift.c.inc @@ -67,19 +67,9 @@ static void gen_rotr_d(TCGv dest, TCGv src1, TCGv src2) tcg_gen_rotr_tl(dest, src1, t0); } -static bool trans_srai_w(DisasContext *ctx, arg_srai_w *a) +static void gen_sari_w(TCGv dest, TCGv src1, target_long imm) { - TCGv dest = gpr_dst(ctx, a->rd, EXT_NONE); - TCGv src1 = gpr_src(ctx, a->rj, EXT_ZERO); - - if (!avail_64(ctx)) { - return false; - } - - tcg_gen_sextract_tl(dest, src1, a->imm, 32 - a->imm); - gen_set_gpr(a->rd, dest, EXT_NONE); - - return true; + tcg_gen_sextract_tl(dest, src1, imm, 32 - imm); } TRANS(sll_w, ALL, gen_rrr, EXT_ZERO, EXT_NONE, EXT_SIGN, gen_sll_w) @@ -94,6 +84,7 @@ TRANS(slli_w, ALL, gen_rri_c, EXT_NONE, EXT_SIGN, tcg_gen_shli_tl) TRANS(slli_d, 64, gen_rri_c, EXT_NONE, EXT_NONE, tcg_gen_shli_tl) TRANS(srli_w, ALL, gen_rri_c, EXT_ZERO, EXT_SIGN, tcg_gen_shri_tl) TRANS(srli_d, 64, gen_rri_c, EXT_NONE, EXT_NONE, tcg_gen_shri_tl) +TRANS(srai_w, ALL, gen_rri_c, EXT_NONE, EXT_NONE, gen_sari_w) TRANS(srai_d, 64, gen_rri_c, EXT_NONE, EXT_NONE, tcg_gen_sari_tl) TRANS(rotri_w, 64, gen_rri_v, EXT_NONE, EXT_NONE, gen_rotr_w) TRANS(rotri_d, 64, gen_rri_c, EXT_NONE, EXT_NONE, tcg_gen_rotri_tl) -- Gitee From f677a8f2311e823a87ec70dbdbc07712d54e5a85 Mon Sep 17 00:00:00 2001 From: Song Gao Date: Fri, 5 Jul 2024 10:18:38 +0800 Subject: [PATCH 383/939] target/loongarch: Set CSR_PRCFG1 and CSR_PRCFG2 values We set the value of register CSR_PRCFG3, but left out CSR_PRCFG1 and CSR_PRCFG2. Set CSR_PRCFG1 and CSR_PRCFG2 according to the default values of the physical machine. 
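For reference, the CSR_PRCFG2 value chosen below, 0x3ffff000, reads as a page-size bitmap: assuming the architectural encoding in which bit n set means a 2^n-byte page size is supported, it advertises every power-of-two size from 4 KiB (bit 12) up to 512 MiB (bit 29). A small check of that reading:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t prcfg2 = 0x3ffff000;      /* la464 value set by this patch */
        uint64_t mask = 0;

        for (int n = 12; n <= 29; n++) {   /* 4 KiB ... 512 MiB */
            mask |= 1ULL << n;
        }
        assert(prcfg2 == mask);
        return 0;
    }
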
Signed-off-by: Song Gao Reviewed-by: Bibo Mao Message-Id: <20240705021839.1004374-1-gaosong@loongson.cn> --- target/loongarch/cpu.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c index f89740a5aa..5bb9e5656a 100644 --- a/target/loongarch/cpu.c +++ b/target/loongarch/cpu.c @@ -472,6 +472,18 @@ static void loongarch_la464_initfn(Object *obj) env->cpucfg[20] = data; env->CSR_ASID = FIELD_DP64(0, CSR_ASID, ASIDBITS, 0xa); + + env->CSR_PRCFG1 = FIELD_DP64(env->CSR_PRCFG1, CSR_PRCFG1, SAVE_NUM, 8); + env->CSR_PRCFG1 = FIELD_DP64(env->CSR_PRCFG1, CSR_PRCFG1, TIMER_BITS, 0x2f); + env->CSR_PRCFG1 = FIELD_DP64(env->CSR_PRCFG1, CSR_PRCFG1, VSMAX, 7); + + env->CSR_PRCFG2 = 0x3ffff000; + + env->CSR_PRCFG3 = FIELD_DP64(env->CSR_PRCFG3, CSR_PRCFG3, TLB_TYPE, 2); + env->CSR_PRCFG3 = FIELD_DP64(env->CSR_PRCFG3, CSR_PRCFG3, MTLB_ENTRY, 63); + env->CSR_PRCFG3 = FIELD_DP64(env->CSR_PRCFG3, CSR_PRCFG3, STLB_WAYS, 7); + env->CSR_PRCFG3 = FIELD_DP64(env->CSR_PRCFG3, CSR_PRCFG3, STLB_SETS, 8); + loongarch_cpu_post_init(obj); } @@ -569,11 +581,6 @@ static void loongarch_cpu_reset_hold(Object *obj) env->CSR_MERRCTL = FIELD_DP64(env->CSR_MERRCTL, CSR_MERRCTL, ISMERR, 0); env->CSR_TID = cs->cpu_index; - env->CSR_PRCFG3 = FIELD_DP64(env->CSR_PRCFG3, CSR_PRCFG3, TLB_TYPE, 2); - env->CSR_PRCFG3 = FIELD_DP64(env->CSR_PRCFG3, CSR_PRCFG3, MTLB_ENTRY, 63); - env->CSR_PRCFG3 = FIELD_DP64(env->CSR_PRCFG3, CSR_PRCFG3, STLB_WAYS, 7); - env->CSR_PRCFG3 = FIELD_DP64(env->CSR_PRCFG3, CSR_PRCFG3, STLB_SETS, 8); - for (n = 0; n < 4; n++) { env->CSR_DMW[n] = FIELD_DP64(env->CSR_DMW[n], CSR_DMW, PLV0, 0); env->CSR_DMW[n] = FIELD_DP64(env->CSR_DMW[n], CSR_DMW, PLV1, 0); -- Gitee From d909e6bfef50fc67708358e455a3b53d869249e6 Mon Sep 17 00:00:00 2001 From: Song Gao Date: Fri, 5 Jul 2024 10:18:39 +0800 Subject: [PATCH 384/939] target/loongarch: Fix cpu_reset set wrong CSR_CRMD After cpu_reset, DATF in CSR_CRMD is 0, DATM is 0. See the manual[1] 6.4. [1]: https://github.com/loongson/LoongArch-Documentation/releases/download/2023.04.20/LoongArch-Vol1-v1.10-EN.pdf Signed-off-by: Song Gao Reviewed-by: Bibo Mao Message-Id: <20240705021839.1004374-2-gaosong@loongson.cn> --- target/loongarch/cpu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c index 5bb9e5656a..d8a31929b4 100644 --- a/target/loongarch/cpu.c +++ b/target/loongarch/cpu.c @@ -554,13 +554,13 @@ static void loongarch_cpu_reset_hold(Object *obj) env->fcsr0 = 0x0; int n; - /* Set csr registers value after reset */ + /* Set csr registers value after reset, see the manual 6.4. 
*/ env->CSR_CRMD = FIELD_DP64(env->CSR_CRMD, CSR_CRMD, PLV, 0); env->CSR_CRMD = FIELD_DP64(env->CSR_CRMD, CSR_CRMD, IE, 0); env->CSR_CRMD = FIELD_DP64(env->CSR_CRMD, CSR_CRMD, DA, 1); env->CSR_CRMD = FIELD_DP64(env->CSR_CRMD, CSR_CRMD, PG, 0); - env->CSR_CRMD = FIELD_DP64(env->CSR_CRMD, CSR_CRMD, DATF, 1); - env->CSR_CRMD = FIELD_DP64(env->CSR_CRMD, CSR_CRMD, DATM, 1); + env->CSR_CRMD = FIELD_DP64(env->CSR_CRMD, CSR_CRMD, DATF, 0); + env->CSR_CRMD = FIELD_DP64(env->CSR_CRMD, CSR_CRMD, DATM, 0); env->CSR_EUEN = FIELD_DP64(env->CSR_EUEN, CSR_EUEN, FPE, 0); env->CSR_EUEN = FIELD_DP64(env->CSR_EUEN, CSR_EUEN, SXE, 0); -- Gitee From 2c6cf54ea2f52774f2587e7e66eed9beba3a3dec Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Tue, 27 Aug 2024 11:58:07 +0800 Subject: [PATCH 385/939] target/loongarch: Add compatible support about VM reboot With edk2-stable202408 LoongArch UEFI bios, CSR PGD register is set only if its value is equal to zero for boot cpu, it causes reboot issue. Since CSR PGD register is changed with linux kernel, UEFI BIOS cannot use it. Add workaround to clear CSR registers relative with TLB in function loongarch_cpu_reset_hold(), so that VM can reboot with edk2-stable202408 UEFI bios. Signed-off-by: Bibo Mao Reviewed-by: Song Gao Message-Id: <20240827035807.3326293-1-maobibo@loongson.cn> Signed-off-by: Song Gao --- target/loongarch/cpu.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c index d8a31929b4..2038984d02 100644 --- a/target/loongarch/cpu.c +++ b/target/loongarch/cpu.c @@ -580,6 +580,20 @@ static void loongarch_cpu_reset_hold(Object *obj) env->CSR_TLBRERA = FIELD_DP64(env->CSR_TLBRERA, CSR_TLBRERA, ISTLBR, 0); env->CSR_MERRCTL = FIELD_DP64(env->CSR_MERRCTL, CSR_MERRCTL, ISMERR, 0); env->CSR_TID = cs->cpu_index; + /* + * Workaround for edk2-stable202408, CSR PGD register is set only if + * its value is equal to zero for boot cpu, it causes reboot issue. + * + * Here clear CSR registers relative with TLB. + */ + env->CSR_PGDH = 0; + env->CSR_PGDL = 0; + env->CSR_PWCL = 0; + env->CSR_PWCH = 0; + env->CSR_STLBPS = 0; + env->CSR_EENTRY = 0; + env->CSR_TLBRENTRY = 0; + env->CSR_MERRENTRY = 0; for (n = 0; n < 4; n++) { env->CSR_DMW[n] = FIELD_DP64(env->CSR_DMW[n], CSR_DMW, PLV0, 0); -- Gitee From ad00cc7da8ab03d6d612a3bd7ec0c4b7af594894 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Thu, 22 Aug 2024 10:28:27 +0800 Subject: [PATCH 386/939] target/loongarch/kvm: Add vCPU reset function KVM provides interface KVM_REG_LOONGARCH_VCPU_RESET to reset vCPU, it can be used to clear internal state about kvm kernel. vCPU reset function is added here for kvm mode. 
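For background, kvm_set_one_reg() is a thin wrapper around the generic KVM_SET_ONE_REG ioctl, so the one-line reset request added below amounts to roughly the following raw call (a sketch using the standard KVM uapi structures; the kernel side presumably ignores the written value and performs the reset as a side effect of the write):

    #include <linux/kvm.h>
    #include <sys/ioctl.h>
    #include <stdint.h>

    /* Sketch of what kvm_set_one_reg(cs, KVM_REG_LOONGARCH_VCPU_RESET, 0) issues. */
    static int vcpu_reset_raw(int vcpu_fd, uint64_t reg_id)
    {
        uint64_t dummy = 0;
        struct kvm_one_reg reg = {
            .id   = reg_id,               /* KVM_REG_LOONGARCH_VCPU_RESET */
            .addr = (uintptr_t)&dummy,
        };

        return ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);
    }
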
Signed-off-by: Bibo Mao Reviewed-by: Song Gao Message-Id: <20240822022827.2273534-1-maobibo@loongson.cn> Signed-off-by: Song Gao --- target/loongarch/cpu.c | 2 +- target/loongarch/kvm/kvm.c | 5 ++++- target/loongarch/kvm/kvm_loongarch.h | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c index 2038984d02..63d1f65608 100644 --- a/target/loongarch/cpu.c +++ b/target/loongarch/cpu.c @@ -608,7 +608,7 @@ static void loongarch_cpu_reset_hold(Object *obj) memset(env->tlb, 0, sizeof(env->tlb)); #endif if (kvm_enabled()) { - kvm_arch_reset_vcpu(env); + kvm_arch_reset_vcpu(cs); } #endif diff --git a/target/loongarch/kvm/kvm.c b/target/loongarch/kvm/kvm.c index 407d454919..90c8379c46 100644 --- a/target/loongarch/kvm/kvm.c +++ b/target/loongarch/kvm/kvm.c @@ -485,9 +485,12 @@ static int kvm_loongarch_put_regs_fp(CPUState *cs) return ret; } -void kvm_arch_reset_vcpu(CPULoongArchState *env) +void kvm_arch_reset_vcpu(CPUState *cs) { + CPULoongArchState *env = cpu_env(cs); + env->mp_state = KVM_MP_STATE_RUNNABLE; + kvm_set_one_reg(cs, KVM_REG_LOONGARCH_VCPU_RESET, 0); } static int kvm_loongarch_get_mpstate(CPUState *cs) diff --git a/target/loongarch/kvm/kvm_loongarch.h b/target/loongarch/kvm/kvm_loongarch.h index 551878a725..8482f9308d 100644 --- a/target/loongarch/kvm/kvm_loongarch.h +++ b/target/loongarch/kvm/kvm_loongarch.h @@ -11,8 +11,8 @@ #define QEMU_KVM_LOONGARCH_H int kvm_loongarch_set_interrupt(LoongArchCPU *cpu, int irq, int level); -void kvm_arch_reset_vcpu(CPULoongArchState *env); int kvm_loongarch_put_pvtime(LoongArchCPU *cpu); int kvm_loongarch_get_pvtime(LoongArchCPU *cpu); +void kvm_arch_reset_vcpu(CPUState *cs); #endif -- Gitee From 2f19b259a16985ce515727c819c3a7eb4f41e6d0 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Thu, 22 Aug 2024 14:52:45 +0800 Subject: [PATCH 387/939] target/loongarch: Support QMP dump-guest-memory Add the support needed for creating prstatus elf notes. This allows us to use QMP dump-guest-memory. Now ELF notes of LoongArch only supports general elf notes, LSX and LASX is not supported, since it is mainly used to dump guest memory. Signed-off-by: Bibo Mao Reviewed-by: Song Gao Tested-by: Song Gao Message-Id: <20240822065245.2286214-1-maobibo@loongson.cn> Signed-off-by: Song Gao --- target/loongarch/arch_dump.c | 167 +++++++++++++++++++++++++++++++++++ target/loongarch/cpu.c | 1 + target/loongarch/internals.h | 2 + target/loongarch/meson.build | 1 + 4 files changed, 171 insertions(+) create mode 100644 target/loongarch/arch_dump.c diff --git a/target/loongarch/arch_dump.c b/target/loongarch/arch_dump.c new file mode 100644 index 0000000000..4986db970e --- /dev/null +++ b/target/loongarch/arch_dump.c @@ -0,0 +1,167 @@ +/* + * Support for writing ELF notes for LoongArch architectures + * + * Copyright (c) 2023 Loongarch Technology + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2 or later, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . 
+ * + */ + +#include "qemu/osdep.h" +#include "cpu.h" +#include "elf.h" +#include "sysemu/dump.h" +#include "internals.h" + +/* struct user_pt_regs from arch/loongarch/include/uapi/asm/ptrace.h */ +struct loongarch_user_regs { + uint64_t gpr[32]; + uint64_t pad1[1]; + /* Special CSR registers. */ + uint64_t csr_era; + uint64_t csr_badv; + uint64_t pad2[10]; +} QEMU_PACKED; + +QEMU_BUILD_BUG_ON(sizeof(struct loongarch_user_regs) != 360); + +/* struct elf_prstatus from include/uapi/linux/elfcore.h */ +struct loongarch_elf_prstatus { + char pad1[32]; /* 32 == offsetof(struct elf_prstatus, pr_pid) */ + uint32_t pr_pid; + /* + * 76 == offsetof(struct elf_prstatus, pr_reg) - + * offsetof(struct elf_prstatus, pr_ppid) + */ + char pad2[76]; + struct loongarch_user_regs pr_reg; + uint32_t pr_fpvalid; + char pad3[4]; +} QEMU_PACKED; + +QEMU_BUILD_BUG_ON(sizeof(struct loongarch_elf_prstatus) != 480); + +/* struct user_fp_state from arch/loongarch/include/uapi/asm/ptrace.h */ +struct loongarch_fpu_struct { + uint64_t fpr[32]; + uint64_t fcc; + unsigned int fcsr; +} QEMU_PACKED; + +QEMU_BUILD_BUG_ON(sizeof(struct loongarch_fpu_struct) != 268); + +struct loongarch_note { + Elf64_Nhdr hdr; + char name[8]; /* align_up(sizeof("CORE"), 4) */ + union { + struct loongarch_elf_prstatus prstatus; + struct loongarch_fpu_struct fpu; + }; +} QEMU_PACKED; + +#define LOONGARCH_NOTE_HEADER_SIZE offsetof(struct loongarch_note, prstatus) +#define LOONGARCH_PRSTATUS_NOTE_SIZE \ + (LOONGARCH_NOTE_HEADER_SIZE + sizeof(struct loongarch_elf_prstatus)) +#define LOONGARCH_PRFPREG_NOTE_SIZE \ + (LOONGARCH_NOTE_HEADER_SIZE + sizeof(struct loongarch_fpu_struct)) + +static void loongarch_note_init(struct loongarch_note *note, DumpState *s, + const char *name, Elf64_Word namesz, + Elf64_Word type, Elf64_Word descsz) +{ + memset(note, 0, sizeof(*note)); + + note->hdr.n_namesz = cpu_to_dump32(s, namesz); + note->hdr.n_descsz = cpu_to_dump32(s, descsz); + note->hdr.n_type = cpu_to_dump32(s, type); + + memcpy(note->name, name, namesz); +} + +static int loongarch_write_elf64_fprpreg(WriteCoreDumpFunction f, + CPULoongArchState *env, int cpuid, + DumpState *s) +{ + struct loongarch_note note; + int ret, i; + + loongarch_note_init(¬e, s, "CORE", 5, NT_PRFPREG, sizeof(note.fpu)); + note.fpu.fcsr = cpu_to_dump64(s, env->fcsr0); + + for (i = 0; i < 8; i++) { + note.fpu.fcc |= env->cf[i] << (8 * i); + } + note.fpu.fcc = cpu_to_dump64(s, note.fpu.fcc); + + for (i = 0; i < 32; ++i) { + note.fpu.fpr[i] = cpu_to_dump64(s, env->fpr[i].vreg.UD[0]); + } + + ret = f(¬e, LOONGARCH_PRFPREG_NOTE_SIZE, s); + if (ret < 0) { + return -1; + } + + return 0; +} + +int loongarch_cpu_write_elf64_note(WriteCoreDumpFunction f, CPUState *cs, + int cpuid, DumpState *s) +{ + struct loongarch_note note; + CPULoongArchState *env = &LOONGARCH_CPU(cs)->env; + int ret, i; + + loongarch_note_init(¬e, s, "CORE", 5, NT_PRSTATUS, + sizeof(note.prstatus)); + note.prstatus.pr_pid = cpu_to_dump32(s, cpuid); + note.prstatus.pr_fpvalid = cpu_to_dump32(s, 1); + + for (i = 0; i < 32; ++i) { + note.prstatus.pr_reg.gpr[i] = cpu_to_dump64(s, env->gpr[i]); + } + note.prstatus.pr_reg.csr_era = cpu_to_dump64(s, env->CSR_ERA); + note.prstatus.pr_reg.csr_badv = cpu_to_dump64(s, env->CSR_BADV); + ret = f(¬e, LOONGARCH_PRSTATUS_NOTE_SIZE, s); + if (ret < 0) { + return -1; + } + + ret = loongarch_write_elf64_fprpreg(f, env, cpuid, s); + if (ret < 0) { + return -1; + } + + return ret; +} + +int cpu_get_dump_info(ArchDumpInfo *info, + const GuestPhysBlockList *guest_phys_blocks) +{ + 
info->d_machine = EM_LOONGARCH; + info->d_endian = ELFDATA2LSB; + info->d_class = ELFCLASS64; + + return 0; +} + +ssize_t cpu_get_note_size(int class, int machine, int nr_cpus) +{ + size_t note_size = 0; + + if (class == ELFCLASS64) { + note_size = LOONGARCH_PRSTATUS_NOTE_SIZE + LOONGARCH_PRFPREG_NOTE_SIZE; + } + + return note_size * nr_cpus; +} diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c index 63d1f65608..d6a13de901 100644 --- a/target/loongarch/cpu.c +++ b/target/loongarch/cpu.c @@ -861,6 +861,7 @@ static struct TCGCPUOps loongarch_tcg_ops = { #include "hw/core/sysemu-cpu-ops.h" static const struct SysemuCPUOps loongarch_sysemu_ops = { + .write_elf64_note = loongarch_cpu_write_elf64_note, .get_phys_page_debug = loongarch_cpu_get_phys_page_debug, }; diff --git a/target/loongarch/internals.h b/target/loongarch/internals.h index 944153b180..1a02427627 100644 --- a/target/loongarch/internals.h +++ b/target/loongarch/internals.h @@ -72,5 +72,7 @@ void write_fcc(CPULoongArchState *env, uint64_t val); int loongarch_cpu_gdb_read_register(CPUState *cs, GByteArray *mem_buf, int n); int loongarch_cpu_gdb_write_register(CPUState *cs, uint8_t *mem_buf, int n); void loongarch_cpu_register_gdb_regs_for_features(CPUState *cs); +int loongarch_cpu_write_elf64_note(WriteCoreDumpFunction f, CPUState *cpu, + int cpuid, DumpState *s); #endif diff --git a/target/loongarch/meson.build b/target/loongarch/meson.build index e002e9aaf6..7817318287 100644 --- a/target/loongarch/meson.build +++ b/target/loongarch/meson.build @@ -8,6 +8,7 @@ loongarch_ss.add(files( loongarch_system_ss = ss.source_set() loongarch_system_ss.add(files( + 'arch_dump.c', 'cpu_helper.c', 'loongarch-qmp-cmds.c', 'machine.c', -- Gitee From 1b5bad7f9b10bba438fe12082c8aa29805c03092 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= Date: Tue, 24 Sep 2024 15:49:47 +0400 Subject: [PATCH 388/939] target/loongarch: fix -Werror=maybe-uninitialized false-positive MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ../target/loongarch/gdbstub.c:55:20: error: ‘val’ may be used uninitialized [-Werror=maybe-uninitialized] 55 | return gdb_get_reg32(mem_buf, val); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~ ../target/loongarch/gdbstub.c:39:18: note: ‘val’ was declared here 39 | uint64_t val; Signed-off-by: Marc-André Lureau Reviewed-by: Vladimir Sementsov-Ogievskiy --- target/loongarch/gdbstub.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/target/loongarch/gdbstub.c b/target/loongarch/gdbstub.c index 5fc2f19e96..f8e3324bae 100644 --- a/target/loongarch/gdbstub.c +++ b/target/loongarch/gdbstub.c @@ -33,28 +33,29 @@ void write_fcc(CPULoongArchState *env, uint64_t val) int loongarch_cpu_gdb_read_register(CPUState *cs, GByteArray *mem_buf, int n) { - LoongArchCPU *cpu = LOONGARCH_CPU(cs); - CPULoongArchState *env = &cpu->env; - uint64_t val; - - if (0 <= n && n < 32) { - val = env->gpr[n]; - } else if (n == 32) { - /* orig_a0 */ - val = 0; - } else if (n == 33) { - val = env->pc; - } else if (n == 34) { - val = env->CSR_BADV; - } + CPULoongArchState *env = cpu_env(cs); if (0 <= n && n <= 34) { + uint64_t val; + + if (n < 32) { + val = env->gpr[n]; + } else if (n == 32) { + /* orig_a0 */ + val = 0; + } else if (n == 33) { + val = env->pc; + } else /* if (n == 34) */ { + val = env->CSR_BADV; + } + if (is_la64(env)) { return gdb_get_reg64(mem_buf, val); } else { return gdb_get_reg32(mem_buf, val); } } + return 0; } -- Gitee From 
43ac751187131f91b043ecf611ec795422b42c6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Fri, 4 Oct 2024 11:59:56 +0200 Subject: [PATCH 389/939] target/loongarch: Use explicit little-endian LD/ST API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The LoongArch architecture uses little endianness. Directly use the little-endian LD/ST API. Mechanical change using: $ end=le; \ for acc in uw w l q tul; do \ sed -i -e "s/ld${acc}_p(/ld${acc}_${end}_p(/" \ -e "s/st${acc}_p(/st${acc}_${end}_p(/" \ $(git grep -wlE '(ld|st)t?u?[wlq]_p' target/loongarch/); \ done Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Richard Henderson Message-Id: <20241004163042.85922-13-philmd@linaro.org> --- target/loongarch/gdbstub.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/target/loongarch/gdbstub.c b/target/loongarch/gdbstub.c index f8e3324bae..cc72680c38 100644 --- a/target/loongarch/gdbstub.c +++ b/target/loongarch/gdbstub.c @@ -68,10 +68,10 @@ int loongarch_cpu_gdb_write_register(CPUState *cs, uint8_t *mem_buf, int n) int length = 0; if (is_la64(env)) { - tmp = ldq_p(mem_buf); + tmp = ldq_le_p(mem_buf); read_length = 8; } else { - tmp = ldl_p(mem_buf); + tmp = ldl_le_p(mem_buf); read_length = 4; } @@ -104,13 +104,13 @@ static int loongarch_gdb_set_fpu(CPULoongArchState *env, int length = 0; if (0 <= n && n < 32) { - env->fpr[n].vreg.D(0) = ldq_p(mem_buf); + env->fpr[n].vreg.D(0) = ldq_le_p(mem_buf); length = 8; } else if (32 <= n && n < 40) { env->cf[n - 32] = ldub_p(mem_buf); length = 1; } else if (n == 40) { - env->fcsr0 = ldl_p(mem_buf); + env->fcsr0 = ldl_le_p(mem_buf); length = 4; } return length; -- Gitee From fa79379bd4c5b72e11f14f24439d5d501b8cc98b Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Sat, 14 Sep 2024 14:46:45 +0800 Subject: [PATCH 390/939] target/loongarch: Avoid bits shift exceeding width of bool type Variable env->cf[i] is defined as bool type, it is treated as int type with shift operation. However the max possible width is 56 for the shift operation, exceeding the width of int type. And there is existing api read_fcc() which is converted to u64 type with bitwise shift, it can be used to dump fp registers into coredump note segment. 
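The problem and the fix fit in a few lines: env->cf[i] is a bool, so in cf[i] << (8 * i) it only promotes to int and a shift of 32 or more bits is undefined, whereas read_fcc() accumulates into a 64-bit value. A stand-alone sketch mirroring what read_fcc() is assumed to do:

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    /* Pack eight one-byte condition flags into a 64-bit word, widening first. */
    static uint64_t pack_fcc(const bool cf[8])
    {
        uint64_t ret = 0;

        for (int i = 0; i < 8; i++) {
            ret |= (uint64_t)cf[i] << (i * 8);   /* widen *before* shifting */
        }
        return ret;
    }

    int main(void)
    {
        const bool cf[8] = { true, false, false, false, false, false, false, true };

        assert(pack_fcc(cf) == 0x0100000000000001ull);
        return 0;
    }
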
Resolves: Coverity CID 1561133 Signed-off-by: Bibo Mao Reviewed-by: Richard Henderson Message-Id: <20240914064645.2099169-1-maobibo@loongson.cn> Signed-off-by: Song Gao --- target/loongarch/arch_dump.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/target/loongarch/arch_dump.c b/target/loongarch/arch_dump.c index 4986db970e..d9e1120333 100644 --- a/target/loongarch/arch_dump.c +++ b/target/loongarch/arch_dump.c @@ -97,11 +97,7 @@ static int loongarch_write_elf64_fprpreg(WriteCoreDumpFunction f, loongarch_note_init(¬e, s, "CORE", 5, NT_PRFPREG, sizeof(note.fpu)); note.fpu.fcsr = cpu_to_dump64(s, env->fcsr0); - - for (i = 0; i < 8; i++) { - note.fpu.fcc |= env->cf[i] << (8 * i); - } - note.fpu.fcc = cpu_to_dump64(s, note.fpu.fcc); + note.fpu.fcc = cpu_to_dump64(s, read_fcc(env)); for (i = 0; i < 32; ++i) { note.fpu.fpr[i] = cpu_to_dump64(s, env->fpr[i].vreg.UD[0]); -- Gitee From 23cede66eaa62e8ec559cfa538a59e72375c9fa8 Mon Sep 17 00:00:00 2001 From: gaosong Date: Sun, 8 Sep 2024 03:28:16 +0800 Subject: [PATCH 391/939] sync loongarch linux-headers Signed-off-by: gaosong --- linux-headers/asm-loongarch/kvm.h | 36 +++++++++++++++++++++++++++- linux-headers/asm-loongarch/unistd.h | 1 + 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/linux-headers/asm-loongarch/kvm.h b/linux-headers/asm-loongarch/kvm.h index 81fec85f0a..13c1280662 100644 --- a/linux-headers/asm-loongarch/kvm.h +++ b/linux-headers/asm-loongarch/kvm.h @@ -19,6 +19,7 @@ #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 #define KVM_DIRTY_LOG_PAGE_OFFSET 64 +#define __KVM_HAVE_IRQ_LINE #define KVM_GUESTDBG_USE_SW_BP 0x00010000 /* @@ -66,6 +67,7 @@ struct kvm_fpu { #define KVM_REG_LOONGARCH_KVM (KVM_REG_LOONGARCH | 0x20000ULL) #define KVM_REG_LOONGARCH_FPSIMD (KVM_REG_LOONGARCH | 0x30000ULL) #define KVM_REG_LOONGARCH_CPUCFG (KVM_REG_LOONGARCH | 0x40000ULL) +#define KVM_REG_LOONGARCH_LBT (KVM_REG_LOONGARCH | 0x50000ULL) #define KVM_REG_LOONGARCH_MASK (KVM_REG_LOONGARCH | 0x70000ULL) #define KVM_CSR_IDX_MASK 0x7fff #define KVM_CPUCFG_IDX_MASK 0x7fff @@ -79,13 +81,34 @@ struct kvm_fpu { /* Debugging: Special instruction for software breakpoint */ #define KVM_REG_LOONGARCH_DEBUG_INST (KVM_REG_LOONGARCH_KVM | KVM_REG_SIZE_U64 | 3) +/* LBT registers */ +#define KVM_REG_LOONGARCH_LBT_SCR0 (KVM_REG_LOONGARCH_LBT | KVM_REG_SIZE_U64 | 1) +#define KVM_REG_LOONGARCH_LBT_SCR1 (KVM_REG_LOONGARCH_LBT | KVM_REG_SIZE_U64 | 2) +#define KVM_REG_LOONGARCH_LBT_SCR2 (KVM_REG_LOONGARCH_LBT | KVM_REG_SIZE_U64 | 3) +#define KVM_REG_LOONGARCH_LBT_SCR3 (KVM_REG_LOONGARCH_LBT | KVM_REG_SIZE_U64 | 4) +#define KVM_REG_LOONGARCH_LBT_EFLAGS (KVM_REG_LOONGARCH_LBT | KVM_REG_SIZE_U64 | 5) +#define KVM_REG_LOONGARCH_LBT_FTOP (KVM_REG_LOONGARCH_LBT | KVM_REG_SIZE_U64 | 6) + #define LOONGARCH_REG_SHIFT 3 #define LOONGARCH_REG_64(TYPE, REG) (TYPE | KVM_REG_SIZE_U64 | (REG << LOONGARCH_REG_SHIFT)) #define KVM_IOC_CSRID(REG) LOONGARCH_REG_64(KVM_REG_LOONGARCH_CSR, REG) #define KVM_IOC_CPUCFG(REG) LOONGARCH_REG_64(KVM_REG_LOONGARCH_CPUCFG, REG) + +/* Device Control API on vm fd */ +#define KVM_LOONGARCH_VM_FEAT_CTRL 0 +#define KVM_LOONGARCH_VM_FEAT_LSX 0 +#define KVM_LOONGARCH_VM_FEAT_LASX 1 +#define KVM_LOONGARCH_VM_FEAT_X86BT 2 +#define KVM_LOONGARCH_VM_FEAT_ARMBT 3 +#define KVM_LOONGARCH_VM_FEAT_MIPSBT 4 +#define KVM_LOONGARCH_VM_FEAT_PMU 5 +#define KVM_LOONGARCH_VM_FEAT_PV_IPI 6 +#define KVM_LOONGARCH_VM_FEAT_PV_STEALTIME 7 + +/* Device Control API on vcpu fd */ #define KVM_LOONGARCH_VCPU_CPUCFG 0 #define KVM_LOONGARCH_VCPU_PVTIME_CTRL 1 
-#define KVM_LOONGARCH_VCPU_PVTIME_GPA 0 +#define KVM_LOONGARCH_VCPU_PVTIME_GPA 0 struct kvm_debug_exit_arch { }; @@ -112,4 +135,15 @@ struct kvm_iocsr_entry { #define KVM_IRQCHIP_NUM_PINS 64 #define KVM_MAX_CORES 256 +#define KVM_LOONGARCH_VM_HAVE_IRQCHIP 0x40000001 + +#define KVM_DEV_LOONGARCH_IPI_GRP_REGS 0x40000002 + +#define KVM_DEV_LOONGARCH_EXTIOI_GRP_REGS 0x40000003 + +#define KVM_DEV_LOONGARCH_PCH_PIC_GRP_CTRL 0x40000004 +#define KVM_DEV_LOONGARCH_PCH_PIC_CTRL_INIT 0 + +#define KVM_DEV_LOONGARCH_PCH_PIC_GRP_REGS 0x40000005 + #endif /* __UAPI_ASM_LOONGARCH_KVM_H */ diff --git a/linux-headers/asm-loongarch/unistd.h b/linux-headers/asm-loongarch/unistd.h index fcb668984f..b344b1f917 100644 --- a/linux-headers/asm-loongarch/unistd.h +++ b/linux-headers/asm-loongarch/unistd.h @@ -1,4 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#define __ARCH_WANT_NEW_STAT #define __ARCH_WANT_SYS_CLONE #define __ARCH_WANT_SYS_CLONE3 -- Gitee From 962f649aa5a06169f0ac23f61e273f0860942ebb Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Sun, 29 Sep 2024 15:04:04 +0800 Subject: [PATCH 392/939] target/loongarch: Add loongson binary translation feature Loongson Binary Translation (LBT) is used to accelerate binary translation, which contains 4 scratch registers (scr0 to scr3), x86/ARM eflags (eflags) and x87 fpu stack pointer (ftop). Now LBT feature is added in kvm mode, not supported in TCG mode since it is not emulated. Feature variable lbt is added with OnOffAuto type, If lbt feature is not supported with KVM host, it reports error if there is lbt=on command line. If there is no any command line about lbt parameter, it checks whether KVM host supports lbt feature and set the corresponding value in cpucfg. Signed-off-by: Bibo Mao Reviewed-by: Song Gao Message-Id: <20240929070405.235200-2-maobibo@loongson.cn> Signed-off-by: Song Gao --- target/loongarch/cpu.c | 20 ++++++++++ target/loongarch/cpu.h | 6 +++ target/loongarch/kvm/kvm.c | 57 ++++++++++++++++++++++++++- target/loongarch/loongarch-qmp-cmds.c | 2 +- 4 files changed, 83 insertions(+), 2 deletions(-) diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c index d6a13de901..a57067938d 100644 --- a/target/loongarch/cpu.c +++ b/target/loongarch/cpu.c @@ -737,6 +737,18 @@ static void loongarch_set_pmnum(Object *obj, Visitor *v, } } +static bool loongarch_get_lbt(Object *obj, Error **errp) +{ + return LOONGARCH_CPU(obj)->lbt != ON_OFF_AUTO_OFF; +} + +static void loongarch_set_lbt(Object *obj, bool value, Error **errp) +{ + LoongArchCPU *cpu = LOONGARCH_CPU(obj); + + cpu->lbt = value ? 
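The requested behaviour maps onto the CPUCFG2 bits declared in the diff below: LBT_X86 is bit 18, LBT_ARM bit 19 and LBT_MIPS bit 20, so writing 7 into the new 3-bit LBT_ALL field (for example after -cpu la464,lbt=on) enables all three translation flavours at once. A quick check of that encoding:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint32_t cpucfg2 = 0;

        cpucfg2 |= 7u << 18;              /* FIELD_DP32(cpucfg2, CPUCFG2, LBT_ALL, 7) */

        assert(cpucfg2 & (1u << 18));     /* LBT_X86 */
        assert(cpucfg2 & (1u << 19));     /* LBT_ARM */
        assert(cpucfg2 & (1u << 20));     /* LBT_MIPS */
        return 0;
    }
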
ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF; +} + void loongarch_cpu_post_init(Object *obj) { LoongArchCPU *cpu = LOONGARCH_CPU(obj); @@ -756,6 +768,14 @@ void loongarch_cpu_post_init(Object *obj) loongarch_set_pmnum, NULL, (void *)&value); } + + cpu->lbt = ON_OFF_AUTO_AUTO; + object_property_add_bool(obj, "lbt", loongarch_get_lbt, + loongarch_set_lbt); + object_property_set_description(obj, "lbt", + "Set off to disable Binary Tranlation."); + } else { + cpu->lbt = ON_OFF_AUTO_OFF; } } diff --git a/target/loongarch/cpu.h b/target/loongarch/cpu.h index 19bcad28de..3e2bcbf608 100644 --- a/target/loongarch/cpu.h +++ b/target/loongarch/cpu.h @@ -155,6 +155,7 @@ FIELD(CPUCFG2, LLFTP_VER, 15, 3) FIELD(CPUCFG2, LBT_X86, 18, 1) FIELD(CPUCFG2, LBT_ARM, 19, 1) FIELD(CPUCFG2, LBT_MIPS, 20, 1) +FIELD(CPUCFG2, LBT_ALL, 18, 3) FIELD(CPUCFG2, LSPW, 21, 1) FIELD(CPUCFG2, LAM, 22, 1) @@ -285,6 +286,10 @@ struct LoongArchTLB { typedef struct LoongArchTLB LoongArchTLB; #endif +enum loongarch_features { + LOONGARCH_FEATURE_LBT, /* loongson binary translation extension */ +}; + typedef struct CPUArchState { uint64_t gpr[32]; uint64_t pc; @@ -388,6 +393,7 @@ struct ArchCPU { CPULoongArchState env; QEMUTimer timer; uint32_t phy_id; + OnOffAuto lbt; /* 'compatible' string for this CPU for Linux device trees */ const char *dtb_compatible; diff --git a/target/loongarch/kvm/kvm.c b/target/loongarch/kvm/kvm.c index 90c8379c46..567404bdb5 100644 --- a/target/loongarch/kvm/kvm.c +++ b/target/loongarch/kvm/kvm.c @@ -9,6 +9,7 @@ #include #include +#include "qapi/error.h" #include "qemu/timer.h" #include "qemu/error-report.h" #include "qemu/main-loop.h" @@ -786,17 +787,71 @@ static void kvm_loongarch_vm_stage_change(void *opaque, bool running, } } +static bool kvm_feature_supported(CPUState *cs, enum loongarch_features feature) +{ + int ret; + struct kvm_device_attr attr; + + switch (feature) { + case LOONGARCH_FEATURE_LBT: + /* + * Return all if all the LBT features are supported such as: + * KVM_LOONGARCH_VM_FEAT_X86BT + * KVM_LOONGARCH_VM_FEAT_ARMBT + * KVM_LOONGARCH_VM_FEAT_MIPSBT + */ + attr.group = KVM_LOONGARCH_VM_FEAT_CTRL; + attr.attr = KVM_LOONGARCH_VM_FEAT_X86BT; + ret = kvm_vm_ioctl(kvm_state, KVM_HAS_DEVICE_ATTR, &attr); + attr.attr = KVM_LOONGARCH_VM_FEAT_ARMBT; + ret |= kvm_vm_ioctl(kvm_state, KVM_HAS_DEVICE_ATTR, &attr); + attr.attr = KVM_LOONGARCH_VM_FEAT_MIPSBT; + ret |= kvm_vm_ioctl(kvm_state, KVM_HAS_DEVICE_ATTR, &attr); + return (ret == 0); + default: + return false; + } +} + +static int kvm_cpu_check_lbt(CPUState *cs, Error **errp) +{ + CPULoongArchState *env = cpu_env(cs); + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + bool kvm_supported; + + kvm_supported = kvm_feature_supported(cs, LOONGARCH_FEATURE_LBT); + if (cpu->lbt == ON_OFF_AUTO_ON) { + if (kvm_supported) { + env->cpucfg[2] = FIELD_DP32(env->cpucfg[2], CPUCFG2, LBT_ALL, 7); + } else { + error_setg(errp, "'lbt' feature not supported by KVM on this host"); + return -ENOTSUP; + } + } else if ((cpu->lbt == ON_OFF_AUTO_AUTO) && kvm_supported) { + env->cpucfg[2] = FIELD_DP32(env->cpucfg[2], CPUCFG2, LBT_ALL, 7); + } + + return 0; +} + int kvm_arch_init_vcpu(CPUState *cs) { uint64_t val; + int ret; + Error *local_err = NULL; + ret = 0; qemu_add_vm_change_state_handler(kvm_loongarch_vm_stage_change, cs); if (!kvm_get_one_reg(cs, KVM_REG_LOONGARCH_DEBUG_INST, &val)) { brk_insn = val; } - return 0; + ret = kvm_cpu_check_lbt(cs, &local_err); + if (ret < 0) { + error_report_err(local_err); + } + return ret; } int kvm_arch_destroy_vcpu(CPUState *cs) diff --git 
a/target/loongarch/loongarch-qmp-cmds.c b/target/loongarch/loongarch-qmp-cmds.c index 2612f43de9..644b528824 100644 --- a/target/loongarch/loongarch-qmp-cmds.c +++ b/target/loongarch/loongarch-qmp-cmds.c @@ -42,7 +42,7 @@ CpuDefinitionInfoList *qmp_query_cpu_definitions(Error **errp) } static const char *cpu_model_advertised_features[] = { - "lsx", "lasx", "pmu", "pmnum", NULL + "lsx", "lasx", "lbt", "pmu", "pmnum", NULL }; CpuModelExpansionInfo *qmp_query_cpu_model_expansion(CpuModelExpansionType type, -- Gitee From a7b08284143f7ace3635036bf0366cbec4d52c99 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Sun, 29 Sep 2024 15:04:05 +0800 Subject: [PATCH 393/939] target/loongarch: Implement lbt registers save/restore function Six registers scr0 - scr3, eflags and ftop are added in percpu vmstate. And two functions kvm_loongarch_get_lbt/kvm_loongarch_put_lbt are added to save/restore lbt registers. Signed-off-by: Bibo Mao Reviewed-by: Song Gao Message-Id: <20240929070405.235200-3-maobibo@loongson.cn> Signed-off-by: Song Gao --- target/loongarch/cpu.h | 13 ++++++++ target/loongarch/kvm/kvm.c | 62 ++++++++++++++++++++++++++++++++++++++ target/loongarch/machine.c | 24 +++++++++++++++ 3 files changed, 99 insertions(+) diff --git a/target/loongarch/cpu.h b/target/loongarch/cpu.h index 3e2bcbf608..2f8c5cf2dd 100644 --- a/target/loongarch/cpu.h +++ b/target/loongarch/cpu.h @@ -18,6 +18,7 @@ #endif #include "cpu-csr.h" #include "cpu-qom.h" +#include "qapi/qapi-types-common.h" #define IOCSRF_TEMP 0 #define IOCSRF_NODECNT 1 @@ -290,6 +291,17 @@ enum loongarch_features { LOONGARCH_FEATURE_LBT, /* loongson binary translation extension */ }; +typedef struct LoongArchBT { + /* scratch registers */ + uint64_t scr0; + uint64_t scr1; + uint64_t scr2; + uint64_t scr3; + /* loongarch eflags */ + uint32_t eflags; + uint32_t ftop; +} lbt_t; + typedef struct CPUArchState { uint64_t gpr[32]; uint64_t pc; @@ -297,6 +309,7 @@ typedef struct CPUArchState { fpr_t fpr[32]; bool cf[8]; uint32_t fcsr0; + lbt_t lbt; uint32_t cpucfg[21]; diff --git a/target/loongarch/kvm/kvm.c b/target/loongarch/kvm/kvm.c index 567404bdb5..118f66f742 100644 --- a/target/loongarch/kvm/kvm.c +++ b/target/loongarch/kvm/kvm.c @@ -486,6 +486,58 @@ static int kvm_loongarch_put_regs_fp(CPUState *cs) return ret; } +static int kvm_loongarch_put_lbt(CPUState *cs) +{ + CPULoongArchState *env = cpu_env(cs); + uint64_t val; + int ret; + + /* check whether vm support LBT firstly */ + if (FIELD_EX32(env->cpucfg[2], CPUCFG2, LBT_ALL) != 7) { + return 0; + } + + /* set six LBT registers including scr0-scr3, eflags, ftop */ + ret = kvm_set_one_reg(cs, KVM_REG_LOONGARCH_LBT_SCR0, &env->lbt.scr0); + ret |= kvm_set_one_reg(cs, KVM_REG_LOONGARCH_LBT_SCR1, &env->lbt.scr1); + ret |= kvm_set_one_reg(cs, KVM_REG_LOONGARCH_LBT_SCR2, &env->lbt.scr2); + ret |= kvm_set_one_reg(cs, KVM_REG_LOONGARCH_LBT_SCR3, &env->lbt.scr3); + /* + * Be cautious, KVM_REG_LOONGARCH_LBT_FTOP is defined as 64-bit however + * lbt.ftop is 32-bit; the same with KVM_REG_LOONGARCH_LBT_EFLAGS register + */ + val = env->lbt.eflags; + ret |= kvm_set_one_reg(cs, KVM_REG_LOONGARCH_LBT_EFLAGS, &val); + val = env->lbt.ftop; + ret |= kvm_set_one_reg(cs, KVM_REG_LOONGARCH_LBT_FTOP, &val); + + return ret; +} + +static int kvm_loongarch_get_lbt(CPUState *cs) +{ + CPULoongArchState *env = cpu_env(cs); + uint64_t val; + int ret; + + /* check whether vm support LBT firstly */ + if (FIELD_EX32(env->cpucfg[2], CPUCFG2, LBT_ALL) != 7) { + return 0; + } + + /* get six LBT registers including scr0-scr3, eflags, ftop 
*/ + ret = kvm_get_one_reg(cs, KVM_REG_LOONGARCH_LBT_SCR0, &env->lbt.scr0); + ret |= kvm_get_one_reg(cs, KVM_REG_LOONGARCH_LBT_SCR1, &env->lbt.scr1); + ret |= kvm_get_one_reg(cs, KVM_REG_LOONGARCH_LBT_SCR2, &env->lbt.scr2); + ret |= kvm_get_one_reg(cs, KVM_REG_LOONGARCH_LBT_SCR3, &env->lbt.scr3); + ret |= kvm_get_one_reg(cs, KVM_REG_LOONGARCH_LBT_EFLAGS, &val); + env->lbt.eflags = (uint32_t)val; + ret |= kvm_get_one_reg(cs, KVM_REG_LOONGARCH_LBT_FTOP, &val); + env->lbt.ftop = (uint32_t)val; + + return ret; +} + void kvm_arch_reset_vcpu(CPUState *cs) { CPULoongArchState *env = cpu_env(cs); @@ -733,6 +785,11 @@ int kvm_arch_get_registers(CPUState *cs) return ret; } + ret = kvm_loongarch_get_lbt(cs); + if (ret) { + return ret; + } + ret = kvm_loongarch_get_mpstate(cs); return ret; } @@ -761,6 +818,11 @@ int kvm_arch_put_registers(CPUState *cs, int level) return ret; } + ret = kvm_loongarch_put_lbt(cs); + if (ret) { + return ret; + } + ret = kvm_loongarch_put_mpstate(cs); return ret; } diff --git a/target/loongarch/machine.c b/target/loongarch/machine.c index 97e1152ffd..5d62aabd51 100644 --- a/target/loongarch/machine.c +++ b/target/loongarch/machine.c @@ -130,6 +130,29 @@ static int cpu_pre_save(void *opaque) return 0; } +static bool lbt_needed(void *opaque) +{ + LoongArchCPU *cpu = opaque; + + return !!FIELD_EX64(cpu->env.cpucfg[2], CPUCFG2, LBT_ALL); +} + +static const VMStateDescription vmstate_lbt = { + .name = "cpu/lbt", + .version_id = 0, + .minimum_version_id = 0, + .needed = lbt_needed, + .fields = (const VMStateField[]) { + VMSTATE_UINT64(env.lbt.scr0, LoongArchCPU), + VMSTATE_UINT64(env.lbt.scr1, LoongArchCPU), + VMSTATE_UINT64(env.lbt.scr2, LoongArchCPU), + VMSTATE_UINT64(env.lbt.scr3, LoongArchCPU), + VMSTATE_UINT32(env.lbt.eflags, LoongArchCPU), + VMSTATE_UINT32(env.lbt.ftop, LoongArchCPU), + VMSTATE_END_OF_LIST() + }, +}; + #if defined(CONFIG_TCG) && !defined(CONFIG_USER_ONLY) static bool tlb_needed(void *opaque) { @@ -244,6 +267,7 @@ const VMStateDescription vmstate_loongarch_cpu = { #if defined(CONFIG_TCG) && !defined(CONFIG_USER_ONLY) &vmstate_tlb, #endif + &vmstate_lbt, NULL } }; -- Gitee From b87b4782e8147fd481becd946ca909edaaa58b41 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Wed, 18 Sep 2024 16:23:15 +0800 Subject: [PATCH 394/939] target/loongarch/kvm: Implement LoongArch PMU extension Implement PMU extension for LoongArch kvm mode. Use OnOffAuto type variable pmu to check the PMU feature. If the PMU Feature is not supported with KVM host, it reports error if there is pmu=on command line. If there is no any command line about pmu parameter, it checks whether KVM host supports the PMU Feature and set the corresponding value in cpucfg. 
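The on/off/auto resolution described above is small enough to model directly; the sketch below restates the decision made in kvm_cpu_check_pmu() as stand-alone C (the OnOffAuto enum here is a local stand-in for the QAPI type):

    #include <assert.h>
    #include <stdbool.h>

    typedef enum { AUTO, OFF, ON } OnOffAuto;    /* stand-in for qapi OnOffAuto */

    /* 1 = PMU enabled, 0 = disabled, -1 = error (pmu=on without host support). */
    static int resolve_pmu(OnOffAuto request, bool host_supported)
    {
        if (request == ON) {
            return host_supported ? 1 : -1;
        }
        if (request == AUTO) {
            return host_supported ? 1 : 0;
        }
        return 0;                                /* explicit pmu=off */
    }

    int main(void)
    {
        assert(resolve_pmu(ON, false) == -1);    /* -cpu ...,pmu=on on a host without PMU */
        assert(resolve_pmu(AUTO, true) == 1);    /* default follows host capability */
        assert(resolve_pmu(OFF, true) == 0);
        return 0;
    }
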
This patch is based on lbt patch located at https://lore.kernel.org/qemu-devel/20240904061859.86615-1-maobibo@loongson.cn Co-developed-by: Song Gao Signed-off-by: Bibo Mao Reviewed-by: Song Gao Message-Id: <20240918082315.2345034-1-maobibo@loongson.cn> Signed-off-by: Song Gao --- target/loongarch/cpu.c | 63 +++++++-------------------- target/loongarch/cpu.h | 2 + target/loongarch/kvm/kvm.c | 41 +++++++++++++++++ target/loongarch/loongarch-qmp-cmds.c | 2 +- 4 files changed, 59 insertions(+), 49 deletions(-) diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c index a57067938d..2ee1d63989 100644 --- a/target/loongarch/cpu.c +++ b/target/loongarch/cpu.c @@ -695,58 +695,28 @@ static void loongarch_set_lasx(Object *obj, bool value, Error **errp) } } -static bool loongarch_get_pmu(Object *obj, Error **errp) -{ - LoongArchCPU *cpu = LOONGARCH_CPU(obj); - - return !!(FIELD_EX32(cpu->env.cpucfg[6], CPUCFG6, PMP)); -} - -static void loongarch_set_pmu(Object *obj, bool value, Error **errp) -{ - LoongArchCPU *cpu = LOONGARCH_CPU(obj); - - cpu->env.cpucfg[6] = FIELD_DP32(cpu->env.cpucfg[6], CPUCFG6, PMP, value); -} - -static void loongarch_get_pmnum(Object *obj, Visitor *v, - const char *name, void *opaque, - Error **errp) +static bool loongarch_get_lbt(Object *obj, Error **errp) { - LoongArchCPU *cpu = LOONGARCH_CPU(obj); - uint32_t value = FIELD_EX32(cpu->env.cpucfg[6], CPUCFG6, PMNUM); - - visit_type_uint32(v, name, &value, errp); + return LOONGARCH_CPU(obj)->lbt != ON_OFF_AUTO_OFF; } -static void loongarch_set_pmnum(Object *obj, Visitor *v, - const char *name, void *opaque, - Error **errp) +static void loongarch_set_lbt(Object *obj, bool value, Error **errp) { LoongArchCPU *cpu = LOONGARCH_CPU(obj); - uint32_t *value= opaque; - if (!visit_type_uint32(v, name, value, errp)) { - return; - } - if ((*value <= PMNUM_MAX) && (*value > 0)) { - cpu->env.cpucfg[6] = FIELD_DP32(cpu->env.cpucfg[6], CPUCFG6, PMNUM, *value -1); - } else { - error_report("Performance counter number need be in [1- %d]\n", PMNUM_MAX); - exit(EXIT_FAILURE); - } + cpu->lbt = value ? ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF; } -static bool loongarch_get_lbt(Object *obj, Error **errp) +static bool loongarch_get_pmu(Object *obj, Error **errp) { - return LOONGARCH_CPU(obj)->lbt != ON_OFF_AUTO_OFF; + return LOONGARCH_CPU(obj)->pmu != ON_OFF_AUTO_OFF; } -static void loongarch_set_lbt(Object *obj, bool value, Error **errp) +static void loongarch_set_pmu(Object *obj, bool value, Error **errp) { LoongArchCPU *cpu = LOONGARCH_CPU(obj); - cpu->lbt = value ? ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF; + cpu->pmu = value ? 
ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF; } void loongarch_cpu_post_init(Object *obj) @@ -759,21 +729,18 @@ void loongarch_cpu_post_init(Object *obj) loongarch_set_lasx); if (kvm_enabled()) { - object_property_add_bool(obj, "pmu", loongarch_get_pmu, - loongarch_set_pmu); - if (FIELD_EX32(cpu->env.cpucfg[6], CPUCFG6, PMP)) { - uint32_t value = 4; - object_property_add(obj, "pmnum", "uint32", - loongarch_get_pmnum, - loongarch_set_pmnum, NULL, - (void *)&value); - } - cpu->lbt = ON_OFF_AUTO_AUTO; object_property_add_bool(obj, "lbt", loongarch_get_lbt, loongarch_set_lbt); object_property_set_description(obj, "lbt", "Set off to disable Binary Tranlation."); + + cpu->pmu = ON_OFF_AUTO_AUTO; + object_property_add_bool(obj, "pmu", loongarch_get_pmu, + loongarch_set_pmu); + object_property_set_description(obj, "pmu", + "Set off to performance monitor unit."); + } else { cpu->lbt = ON_OFF_AUTO_OFF; } diff --git a/target/loongarch/cpu.h b/target/loongarch/cpu.h index 2f8c5cf2dd..8ff00d17e1 100644 --- a/target/loongarch/cpu.h +++ b/target/loongarch/cpu.h @@ -289,6 +289,7 @@ typedef struct LoongArchTLB LoongArchTLB; enum loongarch_features { LOONGARCH_FEATURE_LBT, /* loongson binary translation extension */ + LOONGARCH_FEATURE_PMU, }; typedef struct LoongArchBT { @@ -407,6 +408,7 @@ struct ArchCPU { QEMUTimer timer; uint32_t phy_id; OnOffAuto lbt; + OnOffAuto pmu; /* 'compatible' string for this CPU for Linux device trees */ const char *dtb_compatible; diff --git a/target/loongarch/kvm/kvm.c b/target/loongarch/kvm/kvm.c index 118f66f742..8b0f86a201 100644 --- a/target/loongarch/kvm/kvm.c +++ b/target/loongarch/kvm/kvm.c @@ -870,9 +870,18 @@ static bool kvm_feature_supported(CPUState *cs, enum loongarch_features feature) attr.attr = KVM_LOONGARCH_VM_FEAT_MIPSBT; ret |= kvm_vm_ioctl(kvm_state, KVM_HAS_DEVICE_ATTR, &attr); return (ret == 0); + + case LOONGARCH_FEATURE_PMU: + attr.group = KVM_LOONGARCH_VM_FEAT_CTRL; + attr.attr = KVM_LOONGARCH_VM_FEAT_PMU; + ret = kvm_vm_ioctl(kvm_state, KVM_HAS_DEVICE_ATTR, &attr); + return (ret == 0); + default: return false; } + + return false; } static int kvm_cpu_check_lbt(CPUState *cs, Error **errp) @@ -896,6 +905,32 @@ static int kvm_cpu_check_lbt(CPUState *cs, Error **errp) return 0; } +static int kvm_cpu_check_pmu(CPUState *cs, Error **errp) +{ + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + CPULoongArchState *env = cpu_env(cs); + bool kvm_supported; + + kvm_supported = kvm_feature_supported(cs, LOONGARCH_FEATURE_PMU); + if (cpu->pmu == ON_OFF_AUTO_ON) { + if (!kvm_supported) { + error_setg(errp, "'pmu' feature not supported by KVM on the host"); + return -ENOTSUP; + } + } else if (cpu->pmu != ON_OFF_AUTO_AUTO) { + /* disable pmu if ON_OFF_AUTO_OFF is set */ + kvm_supported = false; + } + + if (kvm_supported) { + env->cpucfg[6] = FIELD_DP32(env->cpucfg[6], CPUCFG6, PMP, 1); + env->cpucfg[6] = FIELD_DP32(env->cpucfg[6], CPUCFG6, PMNUM, 3); + env->cpucfg[6] = FIELD_DP32(env->cpucfg[6], CPUCFG6, PMBITS, 63); + env->cpucfg[6] = FIELD_DP32(env->cpucfg[6], CPUCFG6, UPM, 1); + } + return 0; +} + int kvm_arch_init_vcpu(CPUState *cs) { uint64_t val; @@ -913,6 +948,12 @@ int kvm_arch_init_vcpu(CPUState *cs) if (ret < 0) { error_report_err(local_err); } + + ret = kvm_cpu_check_pmu(cs, &local_err); + if (ret < 0) { + error_report_err(local_err); + } + return ret; } diff --git a/target/loongarch/loongarch-qmp-cmds.c b/target/loongarch/loongarch-qmp-cmds.c index 644b528824..dc78a3ffa2 100644 --- a/target/loongarch/loongarch-qmp-cmds.c +++ b/target/loongarch/loongarch-qmp-cmds.c @@ -42,7 
+42,7 @@ CpuDefinitionInfoList *qmp_query_cpu_definitions(Error **errp) } static const char *cpu_model_advertised_features[] = { - "lsx", "lasx", "lbt", "pmu", "pmnum", NULL + "lsx", "lasx", "lbt", "pmu", NULL }; CpuModelExpansionInfo *qmp_query_cpu_model_expansion(CpuModelExpansionType type, -- Gitee From 734b877ee97c73c7cbeeb02c560b9b4e6a8c0dda Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Thu, 17 Oct 2024 10:07:07 +0800 Subject: [PATCH 395/939] linux-headers: loongarch: Add kvm_para.h and unistd_64.h KVM LBT supports on LoongArch depends on the linux-header file kvm_para.h, also unistd_64.h is required by unistd.h on LoongArch since 6.11, otherwise there will be compiling error such as: linux-headers/asm/unistd.h:3:10: fatal error: asm/unistd_64.h: No such file or directory #include Signed-off-by: Bibo Mao Acked-by: Song Gao Message-Id: <20241017020708.1728620-2-maobibo@loongson.cn> Signed-off-by: Song Gao --- scripts/update-linux-headers.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/update-linux-headers.sh b/scripts/update-linux-headers.sh index 34295c0fe5..88c76b8f69 100755 --- a/scripts/update-linux-headers.sh +++ b/scripts/update-linux-headers.sh @@ -156,6 +156,10 @@ for arch in $ARCHLIST; do cp_portable "$tmpdir/bootparam.h" \ "$output/include/standard-headers/asm-$arch" fi + if [ $arch = loongarch ]; then + cp "$hdrdir/include/asm/kvm_para.h" "$output/linux-headers/asm-loongarch/" + cp "$hdrdir/include/asm/unistd_64.h" "$output/linux-headers/asm-loongarch/" + fi done rm -rf "$output/linux-headers/linux" -- Gitee From 8febab6bcb01e3e10ca4ac0021bae2a812a4452b Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Mon, 30 Sep 2024 14:40:40 +0800 Subject: [PATCH 396/939] target/loongarch: Add steal time support on migration With pv steal time supported, VM machine needs get physical address of each vcpu and notify new host during migration. Here two functions kvm_get_stealtime/kvm_set_stealtime, and guest steal time physical address is only updated on KVM_PUT_FULL_STATE stage. 
Signed-off-by: Bibo Mao Reviewed-by: Song Gao Message-ID: <20240930064040.753929-1-maobibo@loongson.cn> --- target/loongarch/cpu.h | 3 ++ target/loongarch/kvm/kvm.c | 65 ++++++++++++++++++++++++++++++++++++++ target/loongarch/machine.c | 6 ++-- 3 files changed, 72 insertions(+), 2 deletions(-) diff --git a/target/loongarch/cpu.h b/target/loongarch/cpu.h index 8ff00d17e1..4c90cf9ef3 100644 --- a/target/loongarch/cpu.h +++ b/target/loongarch/cpu.h @@ -369,6 +369,9 @@ typedef struct CPUArchState { uint64_t CSR_DBG; uint64_t CSR_DERA; uint64_t CSR_DSAVE; + struct { + uint64_t guest_addr; + } stealtime; #ifdef CONFIG_TCG float_status fp_status; diff --git a/target/loongarch/kvm/kvm.c b/target/loongarch/kvm/kvm.c index 8b0f86a201..550f14269e 100644 --- a/target/loongarch/kvm/kvm.c +++ b/target/loongarch/kvm/kvm.c @@ -35,6 +35,55 @@ const KVMCapabilityInfo kvm_arch_required_capabilities[] = { KVM_CAP_LAST_INFO }; +static int kvm_get_stealtime(CPUState *cs) +{ + CPULoongArchState *env = cpu_env(cs); + int err; + struct kvm_device_attr attr = { + .group = KVM_LOONGARCH_VCPU_PVTIME_CTRL, + .attr = KVM_LOONGARCH_VCPU_PVTIME_GPA, + .addr = (uint64_t)&env->stealtime.guest_addr, + }; + + err = kvm_vcpu_ioctl(cs, KVM_HAS_DEVICE_ATTR, attr); + if (err) { + return 0; + } + + err = kvm_vcpu_ioctl(cs, KVM_GET_DEVICE_ATTR, attr); + if (err) { + error_report("PVTIME: KVM_GET_DEVICE_ATTR: %s", strerror(errno)); + return err; + } + + return 0; +} + +static int kvm_set_stealtime(CPUState *cs) +{ + CPULoongArchState *env = cpu_env(cs); + int err; + struct kvm_device_attr attr = { + .group = KVM_LOONGARCH_VCPU_PVTIME_CTRL, + .attr = KVM_LOONGARCH_VCPU_PVTIME_GPA, + .addr = (uint64_t)&env->stealtime.guest_addr, + }; + + err = kvm_vcpu_ioctl(cs, KVM_HAS_DEVICE_ATTR, attr); + if (err) { + return 0; + } + + err = kvm_vcpu_ioctl(cs, KVM_SET_DEVICE_ATTR, attr); + if (err) { + error_report("PVTIME: KVM_SET_DEVICE_ATTR %s with gpa "TARGET_FMT_lx, + strerror(errno), env->stealtime.guest_addr); + return err; + } + + return 0; +} + static int kvm_loongarch_get_regs_core(CPUState *cs) { int ret = 0; @@ -790,6 +839,11 @@ int kvm_arch_get_registers(CPUState *cs) return ret; } + ret = kvm_get_stealtime(cs); + if (ret) { + return ret; + } + ret = kvm_loongarch_get_mpstate(cs); return ret; } @@ -823,6 +877,17 @@ int kvm_arch_put_registers(CPUState *cs, int level) return ret; } + if (level >= KVM_PUT_FULL_STATE) { + /* + * only KVM_PUT_FULL_STATE is required, kvm kernel will clear + * guest_addr for KVM_PUT_RESET_STATE + */ + ret = kvm_set_stealtime(cs); + if (ret) { + return ret; + } + } + ret = kvm_loongarch_put_mpstate(cs); return ret; } diff --git a/target/loongarch/machine.c b/target/loongarch/machine.c index 5d62aabd51..fd69ea05dc 100644 --- a/target/loongarch/machine.c +++ b/target/loongarch/machine.c @@ -188,8 +188,8 @@ static const VMStateDescription vmstate_tlb = { /* LoongArch CPU state */ const VMStateDescription vmstate_loongarch_cpu = { .name = "cpu", - .version_id = 2, - .minimum_version_id = 2, + .version_id = 3, + .minimum_version_id = 3, .post_load = cpu_post_load, .pre_save = cpu_pre_save, .fields = (const VMStateField[]) { @@ -257,6 +257,8 @@ const VMStateDescription vmstate_loongarch_cpu = { VMSTATE_UINT64(env.CSR_DSAVE, LoongArchCPU), VMSTATE_UINT64(kvm_state_counter, LoongArchCPU), + /* PV steal time */ + VMSTATE_UINT64(env.stealtime.guest_addr, LoongArchCPU), VMSTATE_END_OF_LIST() }, -- Gitee From 2464d0d6115e1794468ff455e3acdb98e0d71a31 Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Tue, 16 Jul 2024 
12:14:56 +0100 Subject: [PATCH 397/939] accel/kvm: Extract common KVM vCPU {creation,parking} code KVM vCPU creation is done once during the vCPU realization when Qemu vCPU thread is spawned. This is common to all the architectures as of now. Hot-unplug of vCPU results in destruction of the vCPU object in QOM but the corresponding KVM vCPU object in the Host KVM is not destroyed as KVM doesn't support vCPU removal. Therefore, its representative KVM vCPU object/context in Qemu is parked. Refactor architecture common logic so that some APIs could be reused by vCPU Hotplug code of some architectures likes ARM, Loongson etc. Update new/old APIs with trace events. New APIs qemu_{create,park,unpark}_vcpu() can be externally called. No functional change is intended here. Signed-off-by: Salil Mehta Reviewed-by: Gavin Shan Tested-by: Vishnu Pajjuri Reviewed-by: Jonathan Cameron Tested-by: Xianglai Li Tested-by: Miguel Luis Reviewed-by: Shaoqin Huang Reviewed-by: Vishnu Pajjuri Reviewed-by: Nicholas Piggin Tested-by: Zhao Liu Reviewed-by: Zhao Liu Reviewed-by: Harsh Prateek Bora Reviewed-by: Igor Mammedov Message-Id: <20240716111502.202344-2-salil.mehta@huawei.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- accel/kvm/kvm-all.c | 71 +++++++++++++++++++++--------------------- accel/kvm/trace-events | 11 +++++++ include/sysemu/kvm.h | 27 ++++++++++++++-- 3 files changed, 71 insertions(+), 38 deletions(-) diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index 8077630825..8dea8f98bb 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -141,7 +141,6 @@ static QemuMutex kml_slots_lock; #define kvm_slots_unlock() qemu_mutex_unlock(&kml_slots_lock) static void kvm_slot_init_dirty_bitmap(KVMSlot *mem); -static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id); static inline void kvm_resample_fd_remove(int gsi) { @@ -334,39 +333,57 @@ void kvm_park_vcpu(CPUState *cpu) { struct KVMParkedVcpu *vcpu; + trace_kvm_park_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu)); + vcpu = g_malloc0(sizeof(*vcpu)); vcpu->vcpu_id = kvm_arch_vcpu_id(cpu); vcpu->kvm_fd = cpu->kvm_fd; QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node); } +int kvm_unpark_vcpu(KVMState *s, unsigned long vcpu_id) +{ + struct KVMParkedVcpu *cpu; + int kvm_fd = -ENOENT; + + QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) { + if (cpu->vcpu_id == vcpu_id) { + QLIST_REMOVE(cpu, node); + kvm_fd = cpu->kvm_fd; + g_free(cpu); + } + } + + trace_kvm_unpark_vcpu(vcpu_id, kvm_fd > 0 ? 
"unparked" : "!found parked"); + + return kvm_fd; +} + int kvm_create_vcpu(CPUState *cpu) { - unsigned long vcpu_id = cpu->cpu_index; + unsigned long vcpu_id = kvm_arch_vcpu_id(cpu); KVMState *s = kvm_state; - int ret; - - DPRINTF("kvm_create_vcpu\n"); + int kvm_fd; /* check if the KVM vCPU already exist but is parked */ - ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu)); - if (ret > 0) { - goto found; - } - - /* create a new KVM vcpu */ - ret = kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id); - if (ret < 0) { - return ret; + kvm_fd = kvm_unpark_vcpu(s, vcpu_id); + if (kvm_fd < 0) { + /* vCPU not parked: create a new KVM vCPU */ + kvm_fd = kvm_vm_ioctl(s, KVM_CREATE_VCPU, vcpu_id); + if (kvm_fd < 0) { + error_report("KVM_CREATE_VCPU IOCTL failed for vCPU %lu", vcpu_id); + return kvm_fd; + } } -found: - cpu->vcpu_dirty = true; - cpu->kvm_fd = ret; + cpu->kvm_fd = kvm_fd; cpu->kvm_state = s; + cpu->vcpu_dirty = true; cpu->dirty_pages = 0; cpu->throttle_us_per_full = 0; + trace_kvm_create_vcpu(cpu->cpu_index, vcpu_id, kvm_fd); + return 0; } @@ -376,7 +393,7 @@ static int do_kvm_destroy_vcpu(CPUState *cpu) long mmap_size; int ret = 0; - DPRINTF("kvm_destroy_vcpu\n"); + trace_kvm_destroy_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu)); ret = kvm_arch_destroy_vcpu(cpu); if (ret < 0) { @@ -415,24 +432,6 @@ void kvm_destroy_vcpu(CPUState *cpu) } } -static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id) -{ - struct KVMParkedVcpu *cpu; - - QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) { - if (cpu->vcpu_id == vcpu_id) { - int kvm_fd; - - QLIST_REMOVE(cpu, node); - kvm_fd = cpu->kvm_fd; - g_free(cpu); - return kvm_fd; - } - } - - return -1; -} - int kvm_init_vcpu(CPUState *cpu, Error **errp) { KVMState *s = kvm_state; diff --git a/accel/kvm/trace-events b/accel/kvm/trace-events index 399aaeb0ec..9c880fdcf4 100644 --- a/accel/kvm/trace-events +++ b/accel/kvm/trace-events @@ -9,6 +9,10 @@ kvm_device_ioctl(int fd, int type, void *arg) "dev fd %d, type 0x%x, arg %p" kvm_failed_reg_get(uint64_t id, const char *msg) "Warning: Unable to retrieve ONEREG %" PRIu64 " from KVM: %s" kvm_failed_reg_set(uint64_t id, const char *msg) "Warning: Unable to set ONEREG %" PRIu64 " to KVM: %s" kvm_init_vcpu(int cpu_index, unsigned long arch_cpu_id) "index: %d id: %lu" +kvm_create_vcpu(int cpu_index, unsigned long arch_cpu_id, int kvm_fd) "index: %d, id: %lu, kvm fd: %d" +kvm_destroy_vcpu(int cpu_index, unsigned long arch_cpu_id) "index: %d id: %lu" +kvm_park_vcpu(int cpu_index, unsigned long arch_cpu_id) "index: %d id: %lu" +kvm_unpark_vcpu(unsigned long arch_cpu_id, const char *msg) "id: %lu %s" kvm_irqchip_commit_routes(void) "" kvm_irqchip_add_msi_route(char *name, int vector, int virq) "dev %s vector %d virq %d" kvm_irqchip_update_msi_route(int virq) "Updating MSI route virq=%d" @@ -26,3 +30,10 @@ kvm_dirty_ring_reap(uint64_t count, int64_t t) "reaped %"PRIu64" pages (took %"P kvm_dirty_ring_reaper_kick(const char *reason) "%s" kvm_dirty_ring_flush(int finished) "%d" +kvm_failed_get_vcpu_mmap_size(void) "" +kvm_cpu_exec(void) "" +kvm_interrupt_exit_request(void) "" +kvm_io_window_exit(void) "" +kvm_run_exit_system_event(int cpu_index, uint32_t event_type) "cpu_index %d, system_even_type %"PRIu32 +kvm_convert_memory(uint64_t start, uint64_t size, const char *msg) "start 0x%" PRIx64 " size 0x%" PRIx64 " %s" +kvm_memory_fault(uint64_t start, uint64_t size, uint64_t flags) "start 0x%" PRIx64 " size 0x%" PRIx64 " flags 0x%" PRIx64 diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index 31af5f0e24..7ffb5e4992 100644 
--- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -319,6 +319,31 @@ int kvm_create_device(KVMState *s, uint64_t type, bool test); */ bool kvm_device_supported(int vmfd, uint64_t type); +/** + * kvm_create_vcpu - Gets a parked KVM vCPU or creates a KVM vCPU + * @cpu: QOM CPUState object for which KVM vCPU has to be fetched/created. + * + * @returns: 0 when success, errno (<0) when failed. + */ +int kvm_create_vcpu(CPUState *cpu); + +/** + * kvm_park_vcpu - Park QEMU KVM vCPU context + * @cpu: QOM CPUState object for which QEMU KVM vCPU context has to be parked. + * + * @returns: none + */ +void kvm_park_vcpu(CPUState *cpu); + +/** + * kvm_unpark_vcpu - unpark QEMU KVM vCPU context + * @s: KVM State + * @vcpu_id: Architecture vCPU ID of the parked vCPU + * + * @returns: KVM fd + */ +int kvm_unpark_vcpu(KVMState *s, unsigned long vcpu_id); + /* Arch specific hooks */ extern const KVMCapabilityInfo kvm_arch_required_capabilities[]; @@ -440,8 +465,6 @@ void kvm_set_sigmask_len(KVMState *s, unsigned int sigmask_len); int kvm_physical_memory_addr_from_host(KVMState *s, void *ram_addr, hwaddr *phys_addr); -int kvm_create_vcpu(CPUState *cpu); -void kvm_park_vcpu(CPUState *cpu); #endif /* NEED_CPU_H */ -- Gitee From a8416845f721aa5ba03446b3ccf83b096b7a0d77 Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Tue, 16 Jul 2024 12:14:57 +0100 Subject: [PATCH 398/939] hw/acpi: Move CPU ctrl-dev MMIO region len macro to common header file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CPU ctrl-dev MMIO region length could be used in ACPI GED and various other architecture specific places. Move ACPI_CPU_HOTPLUG_REG_LEN macro to more appropriate common header file. Signed-off-by: Salil Mehta Reviewed-by: Alex Bennée Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Reviewed-by: David Hildenbrand Reviewed-by: Shaoqin Huang Tested-by: Vishnu Pajjuri Tested-by: Xianglai Li Tested-by: Miguel Luis Tested-by: Zhao Liu Reviewed-by: Zhao Liu Reviewed-by: Igor Mammedov Message-Id: <20240716111502.202344-3-salil.mehta@huawei.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- include/hw/acpi/cpu.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/hw/acpi/cpu.h b/include/hw/acpi/cpu.h index fced952152..fa5b5e5f01 100644 --- a/include/hw/acpi/cpu.h +++ b/include/hw/acpi/cpu.h @@ -18,6 +18,8 @@ #include "hw/boards.h" #include "hw/hotplug.h" +#define ACPI_CPU_HOTPLUG_REG_LEN 12 + typedef struct AcpiCpuStatus { CPUState *cpu; uint64_t arch_id; -- Gitee From ac96f216155002d0c874ff88e301e83495093085 Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Tue, 16 Jul 2024 12:14:58 +0100 Subject: [PATCH 399/939] hw/acpi: Update ACPI GED framework to support vCPU Hotplug ACPI GED (as described in the ACPI 6.4 spec) uses an interrupt listed in the _CRS object of GED to intimate OSPM about an event. Later then demultiplexes the notified event by evaluating ACPI _EVT method to know the type of event. Use ACPI GED to also notify the guest kernel about any CPU hot(un)plug events. Note, GED interface is used by many hotplug events like memory hotplug, NVDIMM hotplug and non-hotplug events like system power down event. Each of these can be selected using a bit in the 32 bit GED IO interface. A bit has been reserved for the CPU hotplug event. ACPI CPU hotplug related initialization should only happen if ACPI_CPU_HOTPLUG support has been enabled for particular architecture. Add cpu_hotplug_hw_init() stub to avoid compilation break. 
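For reference, an architecture/board opts in simply by setting the new bit in
the GED device's "ged-event" property and mapping the extra MMIO region that
acpi_ged_realize() registers for the cpuhp control block. A rough sketch of
such a call site is below; BOARD_GED_CPUHP_ADDR is a made-up placeholder, and
the real user (the LoongArch virt machine) is added later in this series:

  /* sketch: board code requesting CPU hotplug support from ACPI GED */
  uint32_t event = ACPI_GED_PWR_DOWN_EVT | ACPI_GED_CPU_HOTPLUG_EVT;
  DeviceState *ged = qdev_new(TYPE_ACPI_GED);

  qdev_prop_set_uint32(ged, "ged-event", event);
  sysbus_realize_and_unref(SYS_BUS_DEVICE(ged), &error_fatal);
  /* the cpuhp container ends up as sysbus MMIO region 3 of the GED device */
  sysbus_mmio_map(SYS_BUS_DEVICE(ged), 3, BOARD_GED_CPUHP_ADDR /* hypothetical */);
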
Co-developed-by: Keqian Zhu Signed-off-by: Keqian Zhu Signed-off-by: Salil Mehta Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Reviewed-by: David Hildenbrand Reviewed-by: Shaoqin Huang Tested-by: Vishnu Pajjuri Tested-by: Xianglai Li Tested-by: Miguel Luis Reviewed-by: Vishnu Pajjuri Tested-by: Zhao Liu Reviewed-by: Zhao Liu Message-Id: <20240716111502.202344-4-salil.mehta@huawei.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Acked-by: Igor Mammedov --- docs/specs/acpi_hw_reduced_hotplug.rst | 3 ++- hw/acpi/generic_event_device.c | 37 ++++++++++++++++++++++++++ include/hw/acpi/generic_event_device.h | 1 + 3 files changed, 40 insertions(+), 1 deletion(-) diff --git a/docs/specs/acpi_hw_reduced_hotplug.rst b/docs/specs/acpi_hw_reduced_hotplug.rst index 0bd3f9399f..3acd6fcd8b 100644 --- a/docs/specs/acpi_hw_reduced_hotplug.rst +++ b/docs/specs/acpi_hw_reduced_hotplug.rst @@ -64,7 +64,8 @@ GED IO interface (4 byte access) 0: Memory hotplug event 1: System power down event 2: NVDIMM hotplug event - 3-31: Reserved + 3: CPU hotplug event + 4-31: Reserved **write_access:** diff --git a/hw/acpi/generic_event_device.c b/hw/acpi/generic_event_device.c index 2ce7031f1a..755653dc26 100644 --- a/hw/acpi/generic_event_device.c +++ b/hw/acpi/generic_event_device.c @@ -397,6 +397,42 @@ static const VMStateDescription vmstate_acpi_ged = { } }; +static void acpi_ged_realize(DeviceState *dev, Error **errp) +{ + SysBusDevice *sbd = SYS_BUS_DEVICE(dev); + AcpiGedState *s = ACPI_GED(dev); + uint32_t ged_events; + int i; + + ged_events = ctpop32(s->ged_event_bitmap); + + for (i = 0; i < ARRAY_SIZE(ged_supported_events) && ged_events; i++) { + uint32_t event = s->ged_event_bitmap & ged_supported_events[i]; + + if (!event) { + continue; + } + + switch (event) { + case ACPI_GED_CPU_HOTPLUG_EVT: + /* initialize CPU Hotplug related regions */ + memory_region_init(&s->container_cpuhp, OBJECT(dev), + "cpuhp container", + ACPI_CPU_HOTPLUG_REG_LEN); + sysbus_init_mmio(sbd, &s->container_cpuhp); + cpu_hotplug_hw_init(&s->container_cpuhp, OBJECT(dev), + &s->cpuhp_state, 0); + break; + } + ged_events--; + } + + if (ged_events) { + error_report("Unsupported events specified"); + abort(); + } +} + static void acpi_ged_initfn(Object *obj) { DeviceState *dev = DEVICE(obj); @@ -447,6 +483,7 @@ static void acpi_ged_class_init(ObjectClass *class, void *data) dc->desc = "ACPI Generic Event Device"; device_class_set_props(dc, acpi_ged_properties); dc->vmsd = &vmstate_acpi_ged; + dc->realize = acpi_ged_realize; hc->plug = acpi_ged_device_plug_cb; hc->unplug_request = acpi_ged_unplug_request_cb; diff --git a/include/hw/acpi/generic_event_device.h b/include/hw/acpi/generic_event_device.h index 8ed9534c57..d1df3c12e5 100644 --- a/include/hw/acpi/generic_event_device.h +++ b/include/hw/acpi/generic_event_device.h @@ -63,6 +63,7 @@ #include "hw/acpi/cpu_hotplug.h" #include "hw/acpi/memory_hotplug.h" #include "hw/acpi/ghes.h" +#include "hw/acpi/cpu.h" #include "qom/object.h" #define ACPI_POWER_BUTTON_DEVICE "PWRB" -- Gitee From 16d44ddb63becd559cc2185549c4b18d26feab60 Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Tue, 16 Jul 2024 12:15:00 +0100 Subject: [PATCH 400/939] hw/acpi: Update CPUs AML with cpu-(ctrl)dev change CPUs Control device(\\_SB.PCI0) register interface for the x86 arch is IO port based and existing CPUs AML code assumes _CRS objects would evaluate to a system resource which describes IO Port address. 
But on ARM arch CPUs control device(\\_SB.PRES) register interface is memory-mapped hence _CRS object should evaluate to system resource which describes memory-mapped base address. Update build CPUs AML function to accept both IO/MEMORY region spaces and accordingly update the _CRS object. Co-developed-by: Keqian Zhu Signed-off-by: Keqian Zhu Signed-off-by: Salil Mehta Reviewed-by: Gavin Shan Tested-by: Vishnu Pajjuri Reviewed-by: Jonathan Cameron Tested-by: Xianglai Li Tested-by: Miguel Luis Reviewed-by: Shaoqin Huang Tested-by: Zhao Liu Reviewed-by: Igor Mammedov Message-Id: <20240716111502.202344-6-salil.mehta@huawei.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/acpi/cpu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hw/acpi/cpu.c b/hw/acpi/cpu.c index 292e1daca2..5e9093991e 100644 --- a/hw/acpi/cpu.c +++ b/hw/acpi/cpu.c @@ -392,11 +392,13 @@ void build_cpus_aml(Aml *table, MachineState *machine, CPUHotplugFeatures opts, aml_name_decl("_UID", aml_string("CPU Hotplug resources"))); aml_append(cpu_ctrl_dev, aml_mutex(CPU_LOCK, 0)); + assert((rs == AML_SYSTEM_IO) || (rs == AML_SYSTEM_MEMORY)); + crs = aml_resource_template(); if (rs == AML_SYSTEM_IO) { aml_append(crs, aml_io(AML_DECODE16, base_addr, base_addr, 1, ACPI_CPU_HOTPLUG_REG_LEN)); - } else { + } else if (rs == AML_SYSTEM_MEMORY) { aml_append(crs, aml_memory32_fixed(base_addr, ACPI_CPU_HOTPLUG_REG_LEN, AML_READ_WRITE)); } -- Gitee From 7efd5d829730d0481659cda91f725df3b141f469 Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Tue, 16 Jul 2024 12:15:01 +0100 Subject: [PATCH 401/939] physmem: Add helper function to destroy CPU AddressSpace Virtual CPU Hot-unplug leads to unrealization of a CPU object. This also involves destruction of the CPU AddressSpace. Add common function to help destroy the CPU AddressSpace. Signed-off-by: Salil Mehta Tested-by: Vishnu Pajjuri Reviewed-by: Gavin Shan Tested-by: Xianglai Li Tested-by: Miguel Luis Reviewed-by: Shaoqin Huang Tested-by: Zhao Liu Acked-by: Igor Mammedov Message-Id: <20240716111502.202344-7-salil.mehta@huawei.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- include/hw/core/cpu.h | 4 ++-- system/physmem.c | 18 +++++++++++------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h index ee04ee44c2..37f3a469c8 100644 --- a/include/hw/core/cpu.h +++ b/include/hw/core/cpu.h @@ -495,8 +495,8 @@ struct CPUState { QemuMutex work_mutex; QSIMPLEQ_HEAD(, qemu_work_item) work_list; - CPUAddressSpace *cpu_ases; - int cpu_ases_ref_count; + struct CPUAddressSpace *cpu_ases; + int cpu_ases_count; int num_ases; AddressSpace *as; MemoryRegion *memory; diff --git a/system/physmem.c b/system/physmem.c index 2c8b83f811..c50ac24786 100644 --- a/system/physmem.c +++ b/system/physmem.c @@ -761,7 +761,7 @@ void cpu_address_space_init(CPUState *cpu, int asidx, if (!cpu->cpu_ases) { cpu->cpu_ases = g_new0(CPUAddressSpace, cpu->num_ases); - cpu->cpu_ases_ref_count = cpu->num_ases; + cpu->cpu_ases_count = cpu->num_ases; } newas = &cpu->cpu_ases[asidx]; @@ -779,24 +779,28 @@ void cpu_address_space_destroy(CPUState *cpu, int asidx) { CPUAddressSpace *cpuas; - assert(asidx < cpu->num_ases); - assert(asidx == 0 || !kvm_enabled()); assert(cpu->cpu_ases); + assert(asidx >= 0 && asidx < cpu->num_ases); + /* KVM cannot currently support multiple address spaces. 
*/ + assert(asidx == 0 || !kvm_enabled()); cpuas = &cpu->cpu_ases[asidx]; if (tcg_enabled()) { memory_listener_unregister(&cpuas->tcg_as_listener); } - cpuas->as->free_in_rcu = true; address_space_destroy(cpuas->as); + g_free_rcu(cpuas->as, rcu); - if (cpu->cpu_ases_ref_count == 1) { + if (asidx == 0) { + /* reset the convenience alias for address space 0 */ + cpu->as = NULL; + } + + if (--cpu->cpu_ases_count == 0) { g_free(cpu->cpu_ases); cpu->cpu_ases = NULL; } - - cpu->cpu_ases_ref_count--; } AddressSpace *cpu_get_address_space(CPUState *cpu, int asidx) -- Gitee From 7754cf384417295dc74add4e774c506d751671a9 Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Tue, 16 Jul 2024 12:15:02 +0100 Subject: [PATCH 402/939] gdbstub: Add helper function to unregister GDB register space Add common function to help unregister the GDB register space. This shall be done in context to the CPU unrealization. Note: These are common functions exported to arch specific code. For example, for ARM this code is being referred in associated arch specific patch-set: Link: https://lore.kernel.org/qemu-devel/20230926103654.34424-1-salil.mehta@huawei.com/ Signed-off-by: Salil Mehta Tested-by: Vishnu Pajjuri Reviewed-by: Gavin Shan Tested-by: Xianglai Li Tested-by: Miguel Luis Reviewed-by: Shaoqin Huang Reviewed-by: Vishnu Pajjuri Tested-by: Zhao Liu Acked-by: Igor Mammedov Message-Id: <20240716111502.202344-8-salil.mehta@huawei.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- gdbstub/gdbstub.c | 7 +++++++ hw/core/cpu-common.c | 4 ++++ include/exec/gdbstub.h | 5 +++++ 3 files changed, 16 insertions(+) diff --git a/gdbstub/gdbstub.c b/gdbstub/gdbstub.c index f16006d2a8..31c3dae525 100644 --- a/gdbstub/gdbstub.c +++ b/gdbstub/gdbstub.c @@ -584,8 +584,15 @@ void gdb_register_coprocessor(CPUState *cpu, void gdb_unregister_coprocessor_all(CPUState *cpu) { + /* + * Safe to nuke everything. 
GDBRegisterState::xml is static const char so + * it won't be freed + */ g_array_free(cpu->gdb_regs, true); + cpu->gdb_regs = NULL; + cpu->gdb_num_regs = 0; + cpu->gdb_num_g_regs = 0; } static void gdb_process_breakpoint_remove_all(GDBProcess *p) diff --git a/hw/core/cpu-common.c b/hw/core/cpu-common.c index 82dae51a55..e36ca2c207 100644 --- a/hw/core/cpu-common.c +++ b/hw/core/cpu-common.c @@ -262,6 +262,10 @@ static void cpu_common_finalize(Object *obj) { CPUState *cpu = CPU(obj); + /* If cleanup didn't happen in context to gdb_unregister_coprocessor_all */ + if (cpu->gdb_regs) { + g_array_free(cpu->gdb_regs, TRUE); + } qemu_lockcnt_destroy(&cpu->in_ioctl_lock); qemu_mutex_destroy(&cpu->work_mutex); } diff --git a/include/exec/gdbstub.h b/include/exec/gdbstub.h index d123b838c2..e2e8dff051 100644 --- a/include/exec/gdbstub.h +++ b/include/exec/gdbstub.h @@ -39,6 +39,11 @@ typedef int (*gdb_set_reg_cb)(CPUArchState *env, uint8_t *buf, int reg); void gdb_register_coprocessor(CPUState *cpu, gdb_get_reg_cb get_reg, gdb_set_reg_cb set_reg, int num_regs, const char *xml, int g_pos); + +/** + * gdb_unregister_coprocessor_all() - unregisters supplemental set of registers + * @cpu - the CPU associated with registers + */ void gdb_unregister_coprocessor_all(CPUState *cpu); /** -- Gitee From 9bbc73e18d36d75c5dd842e478ed1f1b47ed4222 Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Thu, 1 Aug 2024 10:15:03 +0100 Subject: [PATCH 403/939] accel/kvm/kvm-all: Fixes the missing break in vCPU unpark logic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Loop should exit prematurely on successfully finding out the parked vCPU (struct KVMParkedVcpu) in the 'struct KVMState' maintained 'kvm_parked_vcpus' list of parked vCPUs. Fixes: Coverity CID 1558552 Fixes: 08c3286822 ("accel/kvm: Extract common KVM vCPU {creation,parking} code") Reported-by: Peter Maydell Signed-off-by: Salil Mehta Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Gavin Shan Reviewed-by: Zhao Liu Reviewed-by: Igor Mammedov Message-id: 20240725145132.99355-1-salil.mehta@huawei.com Suggested-by: Peter Maydell Message-ID: Signed-off-by: Salil Mehta Signed-off-by: Peter Maydell --- accel/kvm/kvm-all.c | 1 + 1 file changed, 1 insertion(+) diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index 8dea8f98bb..79d5671841 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -351,6 +351,7 @@ int kvm_unpark_vcpu(KVMState *s, unsigned long vcpu_id) QLIST_REMOVE(cpu, node); kvm_fd = cpu->kvm_fd; g_free(cpu); + break; } } -- Gitee From 8d440efd992fd6be0aca55118a9b60c224f6eade Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Wed, 23 Oct 2024 15:13:10 +0800 Subject: [PATCH 404/939] hw/loongarch/virt: Add CPU topology support Add topological relationships for Loongarch VCPU and initialize topology member variables. Also physical cpu id calculation method comes from its topo information. Co-developed-by: Xianglai Li Signed-off-by: Bibo Mao Message-ID: <20241023071312.881866-2-maobibo@loongson.cn> --- docs/system/loongarch/virt.rst | 31 +++++++++++++ hw/loongarch/virt.c | 82 ++++++++++++++++++++++++++++------ target/loongarch/cpu.c | 12 +++++ target/loongarch/cpu.h | 11 +++++ 4 files changed, 122 insertions(+), 14 deletions(-) diff --git a/docs/system/loongarch/virt.rst b/docs/system/loongarch/virt.rst index c37268b404..aa4719d4bd 100644 --- a/docs/system/loongarch/virt.rst +++ b/docs/system/loongarch/virt.rst @@ -28,6 +28,37 @@ The ``qemu-system-loongarch64`` provides emulation for virt machine. 
You can specify the machine type ``virt`` and cpu type ``la464``. +CPU Topology +------------ + +The ``LA464`` type CPUs have the concept of Socket Core and Thread. + +For example: + +``-smp 1,maxcpus=M,sockets=S,cores=C,threads=T`` + +The above parameters indicate that the machine has a maximum of ``M`` vCPUs and +``S`` sockets, each socket has ``C`` cores, each core has ``T`` threads, +and each thread corresponds to a vCPU. + +Then ``M`` ``S`` ``C`` ``T`` has the following relationship: + +``M = S * C * T`` + +In the CPU topology relationship, When we know the ``socket_id`` ``core_id`` +and ``thread_id`` of the CPU, we can calculate its ``arch_id``: + +``arch_id = (socket_id * S) + (core_id * C) + (thread_id * T)`` + +Similarly, when we know the ``arch_id`` of the CPU, +we can also get its ``socket_id`` ``core_id`` and ``thread_id``: + +``socket_id = arch_id / (C * T)`` + +``core_id = (arch_id / T) % C`` + +``thread_id = arch_id % T`` + Boot options ------------ diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index 9510aa7a7e..8d1e53ff62 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -1123,9 +1123,7 @@ static void virt_init(MachineState *machine) LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(machine); int i; hwaddr base, size, ram_size = machine->ram_size; - const CPUArchIdList *possible_cpus; MachineClass *mc = MACHINE_GET_CLASS(machine); - CPUState *cpu; if (!cpu_model) { cpu_model = LOONGARCH_CPU_TYPE_NAME("la464"); @@ -1143,14 +1141,39 @@ static void virt_init(MachineState *machine) memory_region_add_subregion(&lvms->system_iocsr, 0, &lvms->iocsr_mem); /* Init CPUs */ - possible_cpus = mc->possible_cpu_arch_ids(machine); - for (i = 0; i < possible_cpus->len; i++) { - cpu = cpu_create(machine->cpu_type); - cpu->cpu_index = i; - machine->possible_cpus->cpus[i].cpu = OBJECT(cpu); - lacpu = LOONGARCH_CPU(cpu); + mc->possible_cpu_arch_ids(machine); + for (i = 0; i < machine->smp.cpus; i++) { + Object *cpuobj; + cpuobj = object_new(machine->cpu_type); + lacpu = LOONGARCH_CPU(cpuobj); + lacpu->phy_id = machine->possible_cpus->cpus[i].arch_id; + object_property_set_int(cpuobj, "socket-id", + machine->possible_cpus->cpus[i].props.socket_id, + NULL); + object_property_set_int(cpuobj, "core-id", + machine->possible_cpus->cpus[i].props.core_id, + NULL); + object_property_set_int(cpuobj, "thread-id", + machine->possible_cpus->cpus[i].props.thread_id, + NULL); + /* + * The CPU in place at the time of machine startup will also enter + * the CPU hot-plug process when it is created, but at this time, + * the GED device has not been created, resulting in exit in the CPU + * hot-plug process, which can avoid the incumbent CPU repeatedly + * applying for resources. + * + * The interrupt resource of the in-place CPU will be requested at + * the current function call loongarch_irq_init(). + * + * The interrupt resource of the subsequently inserted CPU will be + * requested in the CPU hot-plug process. + */ + qdev_realize(DEVICE(cpuobj), NULL, &error_fatal); + object_unref(cpuobj); } + fdt_add_cpu_nodes(lvms); fdt_add_memory_nodes(machine); fw_cfg_add_memory(machine); @@ -1266,6 +1289,27 @@ static void virt_initfn(Object *obj) virt_flash_create(lvms); } +static int virt_get_arch_id_from_topo(MachineState *ms, LoongArchCPUTopo *topo) +{ + int arch_id, sock_vcpu_num, core_vcpu_num; + + /* + * calculate total logical cpus across socket/core/thread. 
+ * For more information on how to calculate the arch_id, + * you can refer to the CPU Topology chapter of the + * docs/system/loongarch/virt.rst document. + */ + sock_vcpu_num = topo->socket_id * (ms->smp.threads * ms->smp.cores); + core_vcpu_num = topo->core_id * ms->smp.threads; + + /* get vcpu-id(logical cpu index) for this vcpu from this topology */ + arch_id = (sock_vcpu_num + core_vcpu_num) + topo->thread_id; + + assert(arch_id >= 0 && arch_id < ms->possible_cpus->len); + + return arch_id; +} + static bool memhp_type_supported(DeviceState *dev) { /* we only support pc dimm now */ @@ -1363,10 +1407,19 @@ static HotplugHandler *virt_get_hotplug_handler(MachineState *machine, return NULL; } +static void virt_get_cpu_topo_from_index(MachineState *ms, + LoongArchCPUTopo *topo, int index) +{ + topo->socket_id = index / (ms->smp.cores * ms->smp.threads); + topo->core_id = index / ms->smp.threads % ms->smp.cores; + topo->thread_id = index % ms->smp.threads; +} + static const CPUArchIdList *virt_possible_cpu_arch_ids(MachineState *ms) { int n; unsigned int max_cpus = ms->smp.max_cpus; + LoongArchCPUTopo topo; if (ms->possible_cpus) { assert(ms->possible_cpus->len == max_cpus); @@ -1377,17 +1430,18 @@ static const CPUArchIdList *virt_possible_cpu_arch_ids(MachineState *ms) sizeof(CPUArchId) * max_cpus); ms->possible_cpus->len = max_cpus; for (n = 0; n < ms->possible_cpus->len; n++) { + ms->possible_cpus->cpus[n].vcpus_count = ms->smp.threads; ms->possible_cpus->cpus[n].type = ms->cpu_type; - ms->possible_cpus->cpus[n].arch_id = n; + virt_get_cpu_topo_from_index(ms, &topo, n); ms->possible_cpus->cpus[n].props.has_socket_id = true; - ms->possible_cpus->cpus[n].props.socket_id = - n / (ms->smp.cores * ms->smp.threads); + ms->possible_cpus->cpus[n].props.socket_id = topo.socket_id; ms->possible_cpus->cpus[n].props.has_core_id = true; - ms->possible_cpus->cpus[n].props.core_id = - n / ms->smp.threads % ms->smp.cores; + ms->possible_cpus->cpus[n].props.core_id = topo.core_id; ms->possible_cpus->cpus[n].props.has_thread_id = true; - ms->possible_cpus->cpus[n].props.thread_id = n % ms->smp.threads; + ms->possible_cpus->cpus[n].props.thread_id = topo.thread_id; + ms->possible_cpus->cpus[n].arch_id = + virt_get_arch_id_from_topo(ms, &topo); } return ms->possible_cpus; } diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c index 2ee1d63989..673ed8ea18 100644 --- a/target/loongarch/cpu.c +++ b/target/loongarch/cpu.c @@ -17,6 +17,7 @@ #include "kvm/kvm_loongarch.h" #include "exec/exec-all.h" #include "cpu.h" +#include "hw/qdev-properties.h" #include "internals.h" #include "fpu/softfloat-helpers.h" #include "cpu-csr.h" @@ -860,6 +861,15 @@ static int64_t loongarch_cpu_get_arch_id(CPUState *cs) } #endif +static Property loongarch_cpu_properties[] = { + DEFINE_PROP_INT32("socket-id", LoongArchCPU, socket_id, 0), + DEFINE_PROP_INT32("core-id", LoongArchCPU, core_id, 0), + DEFINE_PROP_INT32("thread-id", LoongArchCPU, thread_id, 0), + DEFINE_PROP_INT32("node-id", LoongArchCPU, node_id, CPU_UNSET_NUMA_NODE_ID), + + DEFINE_PROP_END_OF_LIST() +}; + static void loongarch_cpu_class_init(ObjectClass *c, void *data) { LoongArchCPUClass *lacc = LOONGARCH_CPU_CLASS(c); @@ -867,6 +877,7 @@ static void loongarch_cpu_class_init(ObjectClass *c, void *data) DeviceClass *dc = DEVICE_CLASS(c); ResettableClass *rc = RESETTABLE_CLASS(c); + device_class_set_props(dc, loongarch_cpu_properties); device_class_set_parent_realize(dc, loongarch_cpu_realizefn, &lacc->parent_realize); resettable_class_set_parent_phases(rc, NULL, 
loongarch_cpu_reset_hold, NULL, @@ -890,6 +901,7 @@ static void loongarch_cpu_class_init(ObjectClass *c, void *data) #ifdef CONFIG_TCG cc->tcg_ops = &loongarch_tcg_ops; #endif + dc->user_creatable = true; } static const gchar *loongarch32_gdb_arch_name(CPUState *cs) diff --git a/target/loongarch/cpu.h b/target/loongarch/cpu.h index 4c90cf9ef3..9af622aba5 100644 --- a/target/loongarch/cpu.h +++ b/target/loongarch/cpu.h @@ -398,6 +398,12 @@ typedef struct CPUArchState { } st; } CPULoongArchState; +typedef struct LoongArchCPUTopo { + int32_t socket_id; /* socket-id of this VCPU */ + int32_t core_id; /* core-id of this VCPU */ + int32_t thread_id; /* thread-id of this VCPU */ +} LoongArchCPUTopo; + /** * LoongArchCPU: * @env: #CPULoongArchState @@ -412,6 +418,10 @@ struct ArchCPU { uint32_t phy_id; OnOffAuto lbt; OnOffAuto pmu; + int32_t socket_id; /* socket-id of this VCPU */ + int32_t core_id; /* core-id of this VCPU */ + int32_t thread_id; /* thread-id of this VCPU */ + int32_t node_id; /* NUMA node this CPU belongs to */ /* 'compatible' string for this CPU for Linux device trees */ const char *dtb_compatible; @@ -430,6 +440,7 @@ struct LoongArchCPUClass { CPUClass parent_class; DeviceRealize parent_realize; + DeviceUnrealize parent_unrealize; ResettablePhases parent_phases; }; -- Gitee From 212ea93178ad1e65e625ec6942ee9aff93dd5321 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Wed, 23 Oct 2024 15:13:11 +0800 Subject: [PATCH 405/939] hw/loongarch/virt: Add basic CPU plug support Implement interface for cpu hotplug function, and enable cpu hotplug feature on virt machine. Co-developed-by: Xianglai Li Signed-off-by: Bibo Mao Message-ID: <20241023071312.881866-3-maobibo@loongson.cn> --- hw/loongarch/Kconfig | 1 + hw/loongarch/virt.c | 193 +++++++++++++++++++++++++++++++++++- include/hw/loongarch/virt.h | 1 + target/loongarch/cpu.c | 13 +++ 4 files changed, 206 insertions(+), 2 deletions(-) diff --git a/hw/loongarch/Kconfig b/hw/loongarch/Kconfig index 40944a8365..b42a8573d4 100644 --- a/hw/loongarch/Kconfig +++ b/hw/loongarch/Kconfig @@ -16,6 +16,7 @@ config LOONGARCH_VIRT select LOONGARCH_EXTIOI select LS7A_RTC select SMBIOS + select ACPI_CPU_HOTPLUG select ACPI_PCI select ACPI_HW_REDUCED select FW_CFG_DMA diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index 8d1e53ff62..e7734ed3c0 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -821,7 +821,7 @@ static void virt_irq_init(LoongArchVirtMachineState *lvms) /* Create IPI device */ ipi = qdev_new(TYPE_LOONGARCH_IPI); - qdev_prop_set_uint32(ipi, "num-cpu", ms->smp.cpus); + qdev_prop_set_uint32(ipi, "num-cpu", ms->smp.max_cpus); sysbus_realize_and_unref(SYS_BUS_DEVICE(ipi), &error_fatal); /* IPI iocsr memory region */ @@ -845,9 +845,11 @@ static void virt_irq_init(LoongArchVirtMachineState *lvms) env->ipistate = ipi; } + lvms->ipi = ipi; + /* Create EXTIOI device */ extioi = qdev_new(TYPE_LOONGARCH_EXTIOI); - qdev_prop_set_uint32(extioi, "num-cpu", ms->smp.cpus); + qdev_prop_set_uint32(extioi, "num-cpu", ms->smp.max_cpus); if (virt_is_veiointc_enabled(lvms)) { qdev_prop_set_bit(extioi, "has-virtualization-extension", true); } @@ -873,6 +875,8 @@ static void virt_irq_init(LoongArchVirtMachineState *lvms) } } + lvms->extioi = extioi; + /* Add Extend I/O Interrupt Controller node */ fdt_add_eiointc_node(lvms, &cpuintc_phandle, &eiointc_phandle); @@ -1310,6 +1314,181 @@ static int virt_get_arch_id_from_topo(MachineState *ms, LoongArchCPUTopo *topo) return arch_id; } +/* find cpu slot in machine->possible_cpus by arch_id */ +static 
CPUArchId *virt_find_cpu_slot(MachineState *ms, int arch_id, int *index) +{ + int n; + for (n = 0; n < ms->possible_cpus->len; n++) { + if (ms->possible_cpus->cpus[n].arch_id == arch_id) { + if (index) { + *index = n; + } + return &ms->possible_cpus->cpus[n]; + } + } + + return NULL; +} + +static void virt_cpu_pre_plug(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + MachineState *ms = MACHINE(OBJECT(hotplug_dev)); + MachineClass *mc = MACHINE_GET_CLASS(hotplug_dev); + LoongArchCPU *cpu = LOONGARCH_CPU(dev); + CPUState *cs = CPU(dev); + CPUArchId *cpu_slot; + Error *local_err = NULL; + LoongArchCPUTopo topo; + int arch_id, index; + + if (dev->hotplugged && !mc->has_hotpluggable_cpus) { + error_setg(&local_err, "CPU hotplug not supported for this machine"); + goto out; + } + + /* sanity check the cpu */ + if (!object_dynamic_cast(OBJECT(cpu), ms->cpu_type)) { + error_setg(&local_err, "Invalid CPU type, expected cpu type: '%s'", + ms->cpu_type); + goto out; + } + + if ((cpu->thread_id < 0) || (cpu->thread_id >= ms->smp.threads)) { + error_setg(&local_err, + "Invalid thread-id %u specified, must be in range 1:%u", + cpu->thread_id, ms->smp.threads - 1); + goto out; + } + + if ((cpu->core_id < 0) || (cpu->core_id >= ms->smp.cores)) { + error_setg(&local_err, + "Invalid core-id %u specified, must be in range 1:%u", + cpu->core_id, ms->smp.cores - 1); + goto out; + } + + if ((cpu->socket_id < 0) || (cpu->socket_id >= ms->smp.sockets)) { + error_setg(&local_err, + "Invalid socket-id %u specified, must be in range 1:%u", + cpu->socket_id, ms->smp.sockets - 1); + goto out; + } + + topo.socket_id = cpu->socket_id; + topo.core_id = cpu->core_id; + topo.thread_id = cpu->thread_id; + arch_id = virt_get_arch_id_from_topo(ms, &topo); + cpu_slot = virt_find_cpu_slot(ms, arch_id, &index); + if (CPU(cpu_slot->cpu)) { + error_setg(&local_err, + "cpu(id%d=%d:%d:%d) with arch-id %" PRIu64 " exists", + cs->cpu_index, cpu->socket_id, cpu->core_id, + cpu->thread_id, cpu_slot->arch_id); + goto out; + } + cpu->phy_id = arch_id; + /* + * update cpu_index calculation method since it is easily used as index + * with possible_cpus array by function virt_cpu_index_to_props + */ + cs->cpu_index = index; + numa_cpu_pre_plug(cpu_slot, dev, &local_err); + return ; + +out: + error_propagate(errp, local_err); +} + +static void virt_cpu_unplug_request(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(hotplug_dev); + Error *local_err = NULL; + HotplugHandlerClass *hhc; + LoongArchCPU *cpu = LOONGARCH_CPU(dev); + CPUState *cs = CPU(dev); + + if (!lvms->acpi_ged) { + error_setg(&local_err, "CPU hot unplug not supported without ACPI"); + error_propagate(errp, local_err); + return; + } + + if (cs->cpu_index == 0) { + error_setg(&local_err, + "hot-unplug of boot cpu(id%d=%d:%d:%d) not supported", + cs->cpu_index, cpu->socket_id, + cpu->core_id, cpu->thread_id); + error_propagate(errp, local_err); + return; + } + + hhc = HOTPLUG_HANDLER_GET_CLASS(lvms->acpi_ged); + hhc->unplug_request(HOTPLUG_HANDLER(lvms->acpi_ged), dev, &local_err); +} + +static void virt_cpu_unplug(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + CPUArchId *cpu_slot; + HotplugHandlerClass *hhc; + Error *local_err = NULL; + LoongArchCPU *cpu = LOONGARCH_CPU(dev); + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(hotplug_dev); + + hhc = HOTPLUG_HANDLER_GET_CLASS(lvms->acpi_ged); + hhc->unplug(HOTPLUG_HANDLER(lvms->acpi_ged), dev, &local_err); + 
if (local_err) { + error_propagate(errp, local_err); + return; + } + + cpu_slot = virt_find_cpu_slot(MACHINE(lvms), cpu->phy_id, NULL); + cpu_slot->cpu = NULL; + return; +} + +static void virt_cpu_plug(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + CPUArchId *cpu_slot; + HotplugHandlerClass *hhc; + Error *local_err = NULL; + LoongArchCPU *cpu = LOONGARCH_CPU(dev); + CPUState *cs = CPU(cpu); + CPULoongArchState *env; + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(hotplug_dev); + int pin; + + if (lvms->acpi_ged) { + env = &(cpu->env); + env->address_space_iocsr = &lvms->as_iocsr; + + env->ipistate = lvms->ipi; + if (!(kvm_enabled() && kvm_irqchip_in_kernel())) { + /* connect ipi irq to cpu irq, logic cpu index used here */ + qdev_connect_gpio_out(lvms->ipi, cs->cpu_index, + qdev_get_gpio_in(dev, IRQ_IPI)); + + for (pin = 0; pin < LS3A_INTC_IP; pin++) { + qdev_connect_gpio_out(lvms->extioi, (cs->cpu_index * 8 + pin), + qdev_get_gpio_in(dev, pin + 2)); + } + } + hhc = HOTPLUG_HANDLER_GET_CLASS(lvms->acpi_ged); + hhc->plug(HOTPLUG_HANDLER(lvms->acpi_ged), dev, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + } + + cpu_slot = virt_find_cpu_slot(MACHINE(lvms), cpu->phy_id, NULL); + cpu_slot->cpu = OBJECT(dev); + return; +} + static bool memhp_type_supported(DeviceState *dev) { /* we only support pc dimm now */ @@ -1328,6 +1507,8 @@ static void virt_device_pre_plug(HotplugHandler *hotplug_dev, { if (memhp_type_supported(dev)) { virt_mem_pre_plug(hotplug_dev, dev, errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_LOONGARCH_CPU)) { + virt_cpu_pre_plug(hotplug_dev, dev, errp); } } @@ -1346,6 +1527,8 @@ static void virt_device_unplug_request(HotplugHandler *hotplug_dev, { if (memhp_type_supported(dev)) { virt_mem_unplug_request(hotplug_dev, dev, errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_LOONGARCH_CPU)) { + virt_cpu_unplug_request(hotplug_dev, dev, errp); } } @@ -1364,6 +1547,8 @@ static void virt_device_unplug(HotplugHandler *hotplug_dev, { if (memhp_type_supported(dev)) { virt_mem_unplug(hotplug_dev, dev, errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_LOONGARCH_CPU)) { + virt_cpu_unplug(hotplug_dev, dev, errp); } } @@ -1391,6 +1576,8 @@ static void virt_device_plug_cb(HotplugHandler *hotplug_dev, } } else if (memhp_type_supported(dev)) { virt_mem_plug(hotplug_dev, dev, errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_LOONGARCH_CPU)) { + virt_cpu_plug(hotplug_dev, dev, errp); } } @@ -1400,6 +1587,7 @@ static HotplugHandler *virt_get_hotplug_handler(MachineState *machine, MachineClass *mc = MACHINE_GET_CLASS(machine); if (device_is_dynamic_sysbus(mc, dev) || + object_dynamic_cast(OBJECT(dev), TYPE_LOONGARCH_CPU) || object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI) || memhp_type_supported(dev)) { return HOTPLUG_HANDLER(machine); @@ -1489,6 +1677,7 @@ static void virt_class_init(ObjectClass *oc, void *data) mc->numa_mem_supported = true; mc->auto_enable_numa_with_memhp = true; mc->auto_enable_numa_with_memdev = true; + mc->has_hotpluggable_cpus = true; mc->get_hotplug_handler = virt_get_hotplug_handler; mc->default_nic = "virtio-net-pci"; hc->plug = virt_device_plug_cb; diff --git a/include/hw/loongarch/virt.h b/include/hw/loongarch/virt.h index 0a4d9a25f0..27c52af9f3 100644 --- a/include/hw/loongarch/virt.h +++ b/include/hw/loongarch/virt.h @@ -64,6 +64,7 @@ struct LoongArchVirtMachineState { AddressSpace as_iocsr; int features; struct loongarch_boot_info bootinfo; + DeviceState *ipi; }; 
#define TYPE_LOONGARCH_VIRT_MACHINE MACHINE_TYPE_NAME("virt") diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c index 673ed8ea18..ee764f0bc7 100644 --- a/target/loongarch/cpu.c +++ b/target/loongarch/cpu.c @@ -644,6 +644,17 @@ static void loongarch_cpu_realizefn(DeviceState *dev, Error **errp) lacc->parent_realize(dev, errp); } +static void loongarch_cpu_unrealizefn(DeviceState *dev) +{ + LoongArchCPUClass *mcc = LOONGARCH_CPU_GET_CLASS(dev); + +#ifndef CONFIG_USER_ONLY + cpu_remove_sync(CPU(dev)); +#endif + + mcc->parent_unrealize(dev); +} + static bool loongarch_get_lsx(Object *obj, Error **errp) { LoongArchCPU *cpu = LOONGARCH_CPU(obj); @@ -880,6 +891,8 @@ static void loongarch_cpu_class_init(ObjectClass *c, void *data) device_class_set_props(dc, loongarch_cpu_properties); device_class_set_parent_realize(dc, loongarch_cpu_realizefn, &lacc->parent_realize); + device_class_set_parent_unrealize(dc, loongarch_cpu_unrealizefn, + &lacc->parent_unrealize); resettable_class_set_parent_phases(rc, NULL, loongarch_cpu_reset_hold, NULL, &lacc->parent_phases); -- Gitee From a3728999125cd9fc9e3e841b66a1677663933c27 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Wed, 23 Oct 2024 15:13:12 +0800 Subject: [PATCH 406/939] hw/loongarch/virt: Update the ACPI table for hotplug cpu On LoongArch virt machine, ACPI GED hardware is used for cpu hotplug, here cpu hotplug support feature is added on GED device, also cpu scan and reject method is added about CPU device in DSDT table. Co-developed-by: Xianglai Li Signed-off-by: Bibo Mao Message-ID: <20241023071312.881866-4-maobibo@loongson.cn> --- hw/loongarch/acpi-build.c | 35 +++++++++++++++++++++++++++++++++-- hw/loongarch/virt.c | 10 ++++++++++ include/hw/loongarch/virt.h | 1 + 3 files changed, 44 insertions(+), 2 deletions(-) diff --git a/hw/loongarch/acpi-build.c b/hw/loongarch/acpi-build.c index bcdec2e1cb..a54c5e0e70 100644 --- a/hw/loongarch/acpi-build.c +++ b/hw/loongarch/acpi-build.c @@ -47,6 +47,22 @@ #define ACPI_BUILD_DPRINTF(fmt, ...) #endif +static void virt_madt_cpu_entry(int uid, + const CPUArchIdList *apic_ids, + GArray *entry, bool force_enabled) +{ + uint32_t flags, apic_id = apic_ids->cpus[uid].arch_id; + + flags = apic_ids->cpus[uid].cpu || force_enabled ? 1 /* Enabled */ : 0; + + /* Rev 1.0b, Table 5-13 Processor Local APIC Structure */ + build_append_int_noprefix(entry, 0, 1); /* Type */ + build_append_int_noprefix(entry, 8, 1); /* Length */ + build_append_int_noprefix(entry, uid, 1); /* ACPI Processor ID */ + build_append_int_noprefix(entry, apic_id, 1); /* APIC ID */ + build_append_int_noprefix(entry, flags, 4); /* Flags */ +} + /* build FADT */ static void init_common_fadt_data(AcpiFadtData *data) { @@ -123,15 +139,17 @@ build_madt(GArray *table_data, BIOSLinker *linker, build_append_int_noprefix(table_data, 1 /* PCAT_COMPAT */, 4); /* Flags */ for (i = 0; i < arch_ids->len; i++) { + uint32_t flags; + /* Processor Core Interrupt Controller Structure */ arch_id = arch_ids->cpus[i].arch_id; - + flags = arch_ids->cpus[i].cpu ? 
1 : 0; build_append_int_noprefix(table_data, 17, 1); /* Type */ build_append_int_noprefix(table_data, 15, 1); /* Length */ build_append_int_noprefix(table_data, 1, 1); /* Version */ build_append_int_noprefix(table_data, i, 4); /* ACPI Processor ID */ build_append_int_noprefix(table_data, arch_id, 4); /* Core ID */ - build_append_int_noprefix(table_data, 1, 4); /* Flags */ + build_append_int_noprefix(table_data, flags, 4); /* Flags */ } /* Extend I/O Interrupt Controller Structure */ @@ -334,6 +352,7 @@ build_la_ged_aml(Aml *dsdt, MachineState *machine) { uint32_t event; LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(machine); + CPUHotplugFeatures opts; build_ged_aml(dsdt, "\\_SB."GED_DEVICE, HOTPLUG_HANDLER(lvms->acpi_ged), @@ -346,6 +365,18 @@ build_la_ged_aml(Aml *dsdt, MachineState *machine) AML_SYSTEM_MEMORY, VIRT_GED_MEM_ADDR); } + + if (event & ACPI_GED_CPU_HOTPLUG_EVT) { + opts.acpi_1_compatible = false; + opts.has_legacy_cphp = false; + opts.fw_unplugs_cpu = false; + opts.smi_path = NULL; + + build_cpus_aml(dsdt, machine, opts, virt_madt_cpu_entry, NULL, + VIRT_GED_CPUHP_ADDR, "\\_SB", + NULL, AML_SYSTEM_MEMORY); + } + acpi_dsdt_add_power_button(dsdt); } diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index e7734ed3c0..6159fd9470 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -652,11 +652,17 @@ static DeviceState *create_acpi_ged(DeviceState *pch_pic, { DeviceState *dev; MachineState *ms = MACHINE(lvms); + MachineClass *mc = MACHINE_GET_CLASS(lvms); uint32_t event = ACPI_GED_PWR_DOWN_EVT; if (ms->ram_slots) { event |= ACPI_GED_MEM_HOTPLUG_EVT; } + + if (mc->has_hotpluggable_cpus) { + event |= ACPI_GED_CPU_HOTPLUG_EVT; + } + dev = qdev_new(TYPE_ACPI_GED); qdev_prop_set_uint32(dev, "ged-event", event); sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal); @@ -668,6 +674,10 @@ static DeviceState *create_acpi_ged(DeviceState *pch_pic, /* ged regs used for reset and power down */ sysbus_mmio_map(SYS_BUS_DEVICE(dev), 2, VIRT_GED_REG_ADDR); + if (mc->has_hotpluggable_cpus) { + sysbus_mmio_map(SYS_BUS_DEVICE(dev), 3, VIRT_GED_CPUHP_ADDR); + } + sysbus_connect_irq(SYS_BUS_DEVICE(dev), 0, qdev_get_gpio_in(pch_pic, VIRT_SCI_IRQ - VIRT_GSI_BASE)); return dev; diff --git a/include/hw/loongarch/virt.h b/include/hw/loongarch/virt.h index 27c52af9f3..98c990327b 100644 --- a/include/hw/loongarch/virt.h +++ b/include/hw/loongarch/virt.h @@ -32,6 +32,7 @@ #define VIRT_GED_EVT_ADDR 0x100e0000 #define VIRT_GED_MEM_ADDR (VIRT_GED_EVT_ADDR + ACPI_GED_EVT_SEL_LEN) #define VIRT_GED_REG_ADDR (VIRT_GED_MEM_ADDR + MEMORY_HOTPLUG_IO_LEN) +#define VIRT_GED_CPUHP_ADDR (VIRT_GED_REG_ADDR + ACPI_GED_REG_COUNT) #define COMMAND_LINE_SIZE 512 -- Gitee From 93959a5378f57190fb79dd1ccdefb8d8cd095b58 Mon Sep 17 00:00:00 2001 From: Gao Jiazhen Date: Thu, 12 Sep 2024 10:29:32 +0800 Subject: [PATCH 407/939] hw/misc/bcm2835_property: Fix handling of FRAMEBUFFER_SET_PALETTE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cherry picked from commit 0892fffc2abaadfb5d8b79bb0250ae1794862560 The documentation of the "Set palette" mailbox property at https://github.com/raspberrypi/firmware/wiki/Mailbox-property-interface#set-palette says it has the form: Length: 24..1032 Value: u32: offset: first palette index to set (0-255) u32: length: number of palette entries to set (1-256) u32...: RGBA palette values (offset to offset+length-1) We get this wrong in a couple of ways: * we aren't checking the offset and length are in range, so the guest can make us 
spin for a long time by providing a large length * the bounds check on our loop is wrong: we should iterate through 'length' palette entries, not 'length - offset' entries Fix the loop to implement the bounds checks and get the loop condition right. In the process, make the variables local to this switch case, rather than function-global, so it's clearer what type they are when reading the code. Cc: qemu-stable@nongnu.org Signed-off-by: Peter Maydell Reviewed-by: Philippe Mathieu-Daudé Message-id: 20240723131029.1159908-2-peter.maydell@linaro.org Signed-off-by: Gao Jiazhen --- hw/misc/bcm2835_property.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/hw/misc/bcm2835_property.c b/hw/misc/bcm2835_property.c index ff55a4e2cd..12a1bc558a 100644 --- a/hw/misc/bcm2835_property.c +++ b/hw/misc/bcm2835_property.c @@ -28,8 +28,6 @@ static void bcm2835_property_mbox_push(BCM2835PropertyState *s, uint32_t value) uint32_t tot_len; size_t resplen; uint32_t tmp; - int n; - uint32_t offset, length, color; /* * Copy the current state of the framebuffer config; we will update @@ -264,18 +262,25 @@ static void bcm2835_property_mbox_push(BCM2835PropertyState *s, uint32_t value) resplen = 16; break; case RPI_FWREQ_FRAMEBUFFER_SET_PALETTE: - offset = ldl_le_phys(&s->dma_as, value + 12); - length = ldl_le_phys(&s->dma_as, value + 16); - n = 0; - while (n < length - offset) { - color = ldl_le_phys(&s->dma_as, value + 20 + (n << 2)); - stl_le_phys(&s->dma_as, - s->fbdev->vcram_base + ((offset + n) << 2), color); - n++; + { + uint32_t offset = ldl_le_phys(&s->dma_as, value + 12); + uint32_t length = ldl_le_phys(&s->dma_as, value + 16); + int resp; + + if (offset > 255 || length < 1 || length > 256) { + resp = 1; /* invalid request */ + } else { + for (uint32_t e = 0; e < length; e++) { + uint32_t color = ldl_le_phys(&s->dma_as, value + 20 + (e << 2)); + stl_le_phys(&s->dma_as, + s->fbdev->vcram_base + ((offset + e) << 2), color); + } + resp = 0; } - stl_le_phys(&s->dma_as, value + 12, 0); + stl_le_phys(&s->dma_as, value + 12, resp); resplen = 4; break; + } case RPI_FWREQ_FRAMEBUFFER_GET_NUM_DISPLAYS: stl_le_phys(&s->dma_as, value + 12, 1); resplen = 4; -- Gitee From 073620787702404e2d71486c30967455c3c7904c Mon Sep 17 00:00:00 2001 From: Gao Jiazhen Date: Thu, 12 Sep 2024 10:57:38 +0800 Subject: [PATCH 408/939] hw/loongarch: Fix fdt memory node wrong 'reg' cherry picked from commitd b11f9814526b833b3a052be2559457b1affad7f5 The right fdt memory node like [1], not [2] [1] memory@0 { device_type = "memory"; reg = <0x00 0x00 0x00 0x10000000>; }; [2] memory@0 { device_type = "memory"; reg = <0x02 0x00 0x02 0x10000000>; }; Reviewed-by: Bibo Mao Signed-off-by: Song Gao Message-Id: <20240426091551.2397867-10-gaosong@loongson.cn> Signed-off-by: Gao Jiazhen --- hw/loongarch/virt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index 01e59f3a95..fc7b70ed4e 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -360,7 +360,7 @@ static void fdt_add_memory_node(MachineState *ms, char *nodename = g_strdup_printf("/memory@%" PRIx64, base); qemu_fdt_add_subnode(ms->fdt, nodename); - qemu_fdt_setprop_cells(ms->fdt, nodename, "reg", 2, base, 2, size); + qemu_fdt_setprop_cells(ms->fdt, nodename, "reg", 0, base, 0, size); qemu_fdt_setprop_string(ms->fdt, nodename, "device_type", "memory"); if (ms->numa_state && ms->numa_state->num_nodes) { -- Gitee From 004e0a984118380ff89ceaabb6ace1ebbfb1eb6d Mon Sep 17 00:00:00 2001 
From: Gao Jiazhen Date: Thu, 12 Sep 2024 11:08:13 +0800 Subject: [PATCH 409/939] target/i386: no single-step exception after MOV or POP SS cherry picked from commitd f0f0136abba688a6516647a79cc91e03fad6d5d7 Intel SDM 18.3.1.4 "If an occurrence of the MOV or POP instruction loads the SS register executes with EFLAGS.TF = 1, no single-step debug exception occurs following the MOV or POP instruction." Cc: qemu-stable@nongnu.org Signed-off-by: Paolo Bonzini Signed-off-by: Gao Jiazhen --- target/i386/tcg/translate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index 037bc47e7c..dc672d7995 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -2790,7 +2790,7 @@ do_gen_eob_worker(DisasContext *s, bool inhibit, bool recheck_tf, bool jr) if (recheck_tf) { gen_helper_rechecking_single_step(tcg_env); tcg_gen_exit_tb(NULL, 0); - } else if (s->flags & HF_TF_MASK) { + } else if ((s->flags & HF_TF_MASK) && !inhibit) { gen_helper_single_step(tcg_env); } else if (jr) { tcg_gen_lookup_and_goto_ptr(); -- Gitee From a113ddc33b432c8b4d21160dccb54ba19580ab01 Mon Sep 17 00:00:00 2001 From: Gao Jiazhen Date: Thu, 12 Sep 2024 11:22:56 +0800 Subject: [PATCH 410/939] target/arm: Disable SVE extensions when SVE is disabled cherry picked from commit daf9748ac002ec35258e5986b6257961fd04b565 Cc: qemu-stable@nongnu.org Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2304 Reported-by: Marcin Juszkiewicz Signed-off-by: Richard Henderson Signed-off-by: Marcin Juszkiewicz Message-id: 20240526204551.553282-1-richard.henderson@linaro.org Reviewed-by: Peter Maydell Signed-off-by: Peter Maydell Signed-off-by: Gao Jiazhen --- target/arm/cpu64.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/target/arm/cpu64.c b/target/arm/cpu64.c index 5d28838175..6eca55ac29 100644 --- a/target/arm/cpu64.c +++ b/target/arm/cpu64.c @@ -110,6 +110,11 @@ void arm_cpu_sve_finalize(ARMCPU *cpu, Error **errp) */ if (!cpu_isar_feature(aa64_sve, cpu)) { /* SVE is disabled and so are all vector lengths. Good. */ + /* + * SVE is disabled and so are all vector lengths. Good. + * Disable all SVE extensions as well. + */ + cpu->isar.id_aa64zfr0 = 0; return; } -- Gitee From 4f76ccdc5bdad57b9c70da7a4fc00502cc335060 Mon Sep 17 00:00:00 2001 From: Gao Jiazhen Date: Thu, 12 Sep 2024 11:27:12 +0800 Subject: [PATCH 411/939] target/loongarch: fix a wrong print in cpu dump cherry picked from commit 78f932ea1f7b3b9b0ac628dc2a91281318fe51fa description: loongarch_cpu_dump_state() want to dump all loongarch cpu state registers, but there is a tiny typographical error when printing "PRCFG2". 
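The slip is easy to make and hard for tooling to catch: all three arguments are same-typed 64-bit CSR values, so -Wall -Wformat sees nothing wrong when one of them is passed twice. A standalone illustration (not from the patch, values arbitrary) of the buggy versus fixed call:

#include <inttypes.h>
#include <stdio.h>

static void dump_prcfg(uint64_t prcfg1, uint64_t prcfg2, uint64_t prcfg3)
{
    /* buggy: the PRCFG2 column silently shows PRCFG3's value */
    printf("PRCFG1=%016" PRIx64 ", PRCFG2=%016" PRIx64 ", PRCFG3=%016" PRIx64 "\n",
           prcfg1, prcfg3, prcfg3);
    /* fixed, matching the one-line change below */
    printf("PRCFG1=%016" PRIx64 ", PRCFG2=%016" PRIx64 ", PRCFG3=%016" PRIx64 "\n",
           prcfg1, prcfg2, prcfg3);
}

int main(void)
{
    dump_prcfg(0x1, 0x2, 0x3);
    return 0;
}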
Cc: qemu-stable@nongnu.org Signed-off-by: lanyanzhi Reviewed-by: Richard Henderson Reviewed-by: Song Gao Message-Id: <20240604073831.666690-1-lanyanzhi22b@ict.ac.cn> Signed-off-by: Song Gao Signed-off-by: Gao Jiazhen --- target/loongarch/cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c index 8e7c8332da..f7b5dae7ed 100644 --- a/target/loongarch/cpu.c +++ b/target/loongarch/cpu.c @@ -802,7 +802,7 @@ void loongarch_cpu_dump_state(CPUState *cs, FILE *f, int flags) qemu_fprintf(f, "EENTRY=%016" PRIx64 "\n", env->CSR_EENTRY); qemu_fprintf(f, "PRCFG1=%016" PRIx64 ", PRCFG2=%016" PRIx64 "," " PRCFG3=%016" PRIx64 "\n", - env->CSR_PRCFG1, env->CSR_PRCFG3, env->CSR_PRCFG3); + env->CSR_PRCFG1, env->CSR_PRCFG2, env->CSR_PRCFG3); qemu_fprintf(f, "TLBRENTRY=%016" PRIx64 "\n", env->CSR_TLBRENTRY); qemu_fprintf(f, "TLBRBADV=%016" PRIx64 "\n", env->CSR_TLBRBADV); qemu_fprintf(f, "TLBRERA=%016" PRIx64 "\n", env->CSR_TLBRERA); -- Gitee From 015fc431353ae348e7e9cef2036b674a4e33eb1c Mon Sep 17 00:00:00 2001 From: Gao Jiazhen Date: Thu, 12 Sep 2024 15:04:16 +0800 Subject: [PATCH 412/939] =?UTF-8?q?migration/colo:=20Fix=20bdrv=5Fgraph=5F?= =?UTF-8?q?rdlock=5Fmain=5Floop:=20Assertion=20`!qemu=5Fin=5F=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cherry picked from commit 2cc637f1ea08d2a1b19fc5b1a30bc609f948de93 …coroutine()' failed. bdrv_activate_all() should not be called from the coroutine context, move it to the QEMU thread colo_process_incoming_thread() with the bql_lock protected. The backtrace is as follows: #4 0x0000561af7948362 in bdrv_graph_rdlock_main_loop () at ../block/graph-lock.c:260 #5 0x0000561af7907a68 in graph_lockable_auto_lock_mainloop (x=0x7fd29810be7b) at /patch/to/qemu/include/block/graph-lock.h:259 #6 0x0000561af79167d1 in bdrv_activate_all (errp=0x7fd29810bed0) at ../block.c:6906 #7 0x0000561af762b4af in colo_incoming_co () at ../migration/colo.c:935 #8 0x0000561af7607e57 in process_incoming_migration_co (opaque=0x0) at ../migration/migration.c:793 #9 0x0000561af7adbeeb in coroutine_trampoline (i0=-106876144, i1=22042) at ../util/coroutine-ucontext.c:175 #10 0x00007fd2a5cf21c0 in () at /lib64/libc.so.6 Cc: qemu-stable@nongnu.org Cc: Fabiano Rosas Closes: https://gitlab.com/qemu-project/qemu/-/issues/2277 Fixes: 2b3912f ("block: Mark bdrv_first_blk() and bdrv_is_root_node() GRAPH_RDLOCK") Signed-off-by: Li Zhijian Reviewed-by: Zhang Chen Tested-by: Zhang Chen Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240417025634.1014582-1-lizhijian@fujitsu.com Signed-off-by: Peter Xu Signed-off-by: Gao Jiazhen --- migration/colo.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/migration/colo.c b/migration/colo.c index 4447e34914..8f301b7e57 100644 --- a/migration/colo.c +++ b/migration/colo.c @@ -830,6 +830,16 @@ static void *colo_process_incoming_thread(void *opaque) return NULL; } + /* Make sure all file formats throw away their mutable metadata */ + qemu_mutex_lock_iothread(); + bdrv_activate_all(&local_err); + if (local_err) { + qemu_mutex_unlock_iothread(); + error_report_err(local_err); + return NULL; + } + qemu_mutex_unlock_iothread(); + failover_init_state(); mis->to_src_file = qemu_file_get_return_path(mis->from_src_file); @@ -917,7 +927,6 @@ out: int coroutine_fn colo_incoming_co(void) { MigrationIncomingState *mis = migration_incoming_get_current(); - Error *local_err = NULL; QemuThread th; 
assert(qemu_mutex_iothread_locked()); @@ -926,13 +935,6 @@ int coroutine_fn colo_incoming_co(void) return 0; } - /* Make sure all file formats throw away their mutable metadata */ - bdrv_activate_all(&local_err); - if (local_err) { - error_report_err(local_err); - return -EINVAL; - } - qemu_thread_create(&th, "COLO incoming", colo_process_incoming_thread, mis, QEMU_THREAD_JOINABLE); -- Gitee From 11e71bc99d8811644ddf1a854e556170bb8f5db3 Mon Sep 17 00:00:00 2001 From: Gao Jiazhen Date: Thu, 12 Sep 2024 16:01:04 +0800 Subject: [PATCH 413/939] virtio-pci: fix use of a released vector MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cherry picked from commit 2ce6cff94df2650c460f809e5ad263f1d22507c0 During the booting process of the non-standard image, the behavior of the called function in qemu is as follows: 1. vhost_net_stop() was triggered by guest image. This will call the function virtio_pci_set_guest_notifiers() with assgin= false, virtio_pci_set_guest_notifiers() will release the irqfd for vector 0 2. virtio_reset() was triggered, this will set configure vector to VIRTIO_NO_VECTOR 3.vhost_net_start() was called (at this time, the configure vector is still VIRTIO_NO_VECTOR) and then call virtio_pci_set_guest_notifiers() with assgin=true, so the irqfd for vector 0 is still not "init" during this process 4. The system continues to boot and sets the vector back to 0. After that msix_fire_vector_notifier() was triggered to unmask the vector 0 and meet the crash To fix the issue, we need to support changing the vector after VIRTIO_CONFIG_S_DRIVER_OK is set. (gdb) bt 0 __pthread_kill_implementation (threadid=, signo=signo@entry=6, no_tid=no_tid@entry=0) at pthread_kill.c:44 1 0x00007fc87148ec53 in __pthread_kill_internal (signo=6, threadid=) at pthread_kill.c:78 2 0x00007fc87143e956 in __GI_raise (sig=sig@entry=6) at ../sysdeps/posix/raise.c:26 3 0x00007fc8714287f4 in __GI_abort () at abort.c:79 4 0x00007fc87142871b in __assert_fail_base (fmt=0x7fc8715bbde0 "%s%s%s:%u: %s%sAssertion `%s' failed.\n%n", assertion=0x5606413efd53 "ret == 0", file=0x5606413ef87d "../accel/kvm/kvm-all.c", line=1837, function=) at assert.c:92 5 0x00007fc871437536 in __GI___assert_fail (assertion=0x5606413efd53 "ret == 0", file=0x5606413ef87d "../accel/kvm/kvm-all.c", line=1837, function=0x5606413f06f0 <__PRETTY_FUNCTION__.19> "kvm_irqchip_commit_routes") at assert.c:101 6 0x0000560640f884b5 in kvm_irqchip_commit_routes (s=0x560642cae1f0) at ../accel/kvm/kvm-all.c:1837 7 0x0000560640c98f8e in virtio_pci_one_vector_unmask (proxy=0x560643c65f00, queue_no=4294967295, vector=0, msg=..., n=0x560643c6e4c8) at ../hw/virtio/virtio-pci.c:1005 8 0x0000560640c99201 in virtio_pci_vector_unmask (dev=0x560643c65f00, vector=0, msg=...) at ../hw/virtio/virtio-pci.c:1070 9 0x0000560640bc402e in msix_fire_vector_notifier (dev=0x560643c65f00, vector=0, is_masked=false) at ../hw/pci/msix.c:120 10 0x0000560640bc40f1 in msix_handle_mask_update (dev=0x560643c65f00, vector=0, was_masked=true) at ../hw/pci/msix.c:140 11 0x0000560640bc4503 in msix_table_mmio_write (opaque=0x560643c65f00, addr=12, val=0, size=4) at ../hw/pci/msix.c:231 12 0x0000560640f26d83 in memory_region_write_accessor (mr=0x560643c66540, addr=12, value=0x7fc86b7bc628, size=4, shift=0, mask=4294967295, attrs=...) at ../system/memory.c:497 13 0x0000560640f270a6 in access_with_adjusted_size (addr=12, value=0x7fc86b7bc628, size=4, access_size_min=1, access_size_max=4, access_fn=0x560640f26c8d , mr=0x560643c66540, attrs=...) 
at ../system/memory.c:573 14 0x0000560640f2a2b5 in memory_region_dispatch_write (mr=0x560643c66540, addr=12, data=0, op=MO_32, attrs=...) at ../system/memory.c:1521 15 0x0000560640f37bac in flatview_write_continue (fv=0x7fc65805e0b0, addr=4273803276, attrs=..., ptr=0x7fc871e9c028, len=4, addr1=12, l=4, mr=0x560643c66540) at ../system/physmem.c:2714 16 0x0000560640f37d0f in flatview_write (fv=0x7fc65805e0b0, addr=4273803276, attrs=..., buf=0x7fc871e9c028, len=4) at ../system/physmem.c:2756 17 0x0000560640f380bf in address_space_write (as=0x560642161ae0 , addr=4273803276, attrs=..., buf=0x7fc871e9c028, len=4) at ../system/physmem.c:2863 18 0x0000560640f3812c in address_space_rw (as=0x560642161ae0 , addr=4273803276, attrs=..., buf=0x7fc871e9c028, len=4, is_write=true) at ../system/physmem.c:2873 --Type for more, q to quit, c to continue without paging-- 19 0x0000560640f8aa55 in kvm_cpu_exec (cpu=0x560642f205e0) at ../accel/kvm/kvm-all.c:2915 20 0x0000560640f8d731 in kvm_vcpu_thread_fn (arg=0x560642f205e0) at ../accel/kvm/kvm-accel-ops.c:51 21 0x00005606411949f4 in qemu_thread_start (args=0x560642f292b0) at ../util/qemu-thread-posix.c:541 22 0x00007fc87148cdcd in start_thread (arg=) at pthread_create.c:442 23 0x00007fc871512630 in clone3 () at ../sysdeps/unix/sysv/linux/x86_64/clone3.S:81 (gdb) MST: coding style and typo fixups Fixes: f9a09ca ("vhost: add support for configure interrupt") Cc: qemu-stable@nongnu.org Signed-off-by: Cindy Lu Message-ID: <2321ade5f601367efe7380c04e3f61379c59b48f.1713173550.git.mst@redhat.com> Cc: Lei Yang Cc: Jason Wang Signed-off-by: Michael S. Tsirkin Tested-by: Cindy Lu Signed-off-by: Gao Jiazhen --- hw/virtio/virtio-pci.c | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index f8adb0520a..3ad7487411 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -1456,6 +1456,38 @@ static int virtio_pci_add_mem_cap(VirtIOPCIProxy *proxy, return offset; } +static void virtio_pci_set_vector(VirtIODevice *vdev, + VirtIOPCIProxy *proxy, + int queue_no, uint16_t old_vector, + uint16_t new_vector) +{ + bool kvm_irqfd = (vdev->status & VIRTIO_CONFIG_S_DRIVER_OK) && + msix_enabled(&proxy->pci_dev) && kvm_msi_via_irqfd_enabled(); + + if (new_vector == old_vector) { + return; + } + + /* + * If the device uses irqfd and the vector changes after DRIVER_OK is + * set, we need to release the old vector and set up the new one. + * Otherwise just need to set the new vector on the device. + */ + if (kvm_irqfd && old_vector != VIRTIO_NO_VECTOR) { + kvm_virtio_pci_vector_release_one(proxy, queue_no); + } + /* Set the new vector on the device. */ + if (queue_no == VIRTIO_CONFIG_IRQ_IDX) { + vdev->config_vector = new_vector; + } else { + virtio_queue_set_vector(vdev, queue_no, new_vector); + } + /* If the new vector changed need to set it up. 
*/ + if (kvm_irqfd && new_vector != VIRTIO_NO_VECTOR) { + kvm_virtio_pci_vector_use_one(proxy, queue_no); + } +} + int virtio_pci_add_shm_cap(VirtIOPCIProxy *proxy, uint8_t bar, uint64_t offset, uint64_t length, uint8_t id) @@ -1602,7 +1634,8 @@ static void virtio_pci_common_write(void *opaque, hwaddr addr, } else { val = VIRTIO_NO_VECTOR; } - vdev->config_vector = val; + virtio_pci_set_vector(vdev, proxy, VIRTIO_CONFIG_IRQ_IDX, + vdev->config_vector, val); break; case VIRTIO_PCI_COMMON_STATUS: if (!(val & VIRTIO_CONFIG_S_DRIVER_OK)) { @@ -1642,7 +1675,7 @@ static void virtio_pci_common_write(void *opaque, hwaddr addr, } else { val = VIRTIO_NO_VECTOR; } - virtio_queue_set_vector(vdev, vdev->queue_sel, val); + virtio_pci_set_vector(vdev, proxy, vdev->queue_sel, vector, val); break; case VIRTIO_PCI_COMMON_Q_ENABLE: if (val == 1) { -- Gitee From 2651409cf43002dc497483ae3ae227d4c602ca45 Mon Sep 17 00:00:00 2001 From: Gao Jiazhen Date: Thu, 12 Sep 2024 17:02:38 +0800 Subject: [PATCH 414/939] load_elf: fix iterator's type for elf file processing cherry picked from commit 410c2a4d75f52f6a2fe978eda5a9b6f854afe5ea j is used while loading an ELF file to byteswap segments' data. If data is larger than 2GB an overflow may happen. So j should be elf_word. This commit fixes a minor bug: it's unlikely anybody is trying to load ELF files with 2GB+ segments for wrong-endianness targets, but if they did, it wouldn't work correctly. Found by Linux Verification Center (linuxtesting.org) with SVACE. Cc: qemu-stable@nongnu.org Fixes: 7ef295e ("loader: Add data swap option to load-elf") Signed-off-by: Anastasia Belova Reviewed-by: Peter Maydell Signed-off-by: Peter Maydell Signed-off-by: Gao Jiazhen --- include/hw/elf_ops.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/hw/elf_ops.h b/include/hw/elf_ops.h index 0a5c258fe6..9c35d1b9da 100644 --- a/include/hw/elf_ops.h +++ b/include/hw/elf_ops.h @@ -500,7 +500,7 @@ static ssize_t glue(load_elf, SZ)(const char *name, int fd, } if (data_swab) { - int j; + elf_word j; for (j = 0; j < file_size; j += (1 << data_swab)) { uint8_t *dp = data + j; switch (data_swab) { -- Gitee From 5661b12a28b650226cca100aeddd92d5cc788153 Mon Sep 17 00:00:00 2001 From: Gao Jiazhen Date: Thu, 12 Sep 2024 20:41:18 +0800 Subject: [PATCH 415/939] char-stdio: Restore blocking mode of stdout on exit cherry picked from commit a0124e333e2176640f233e5ea57a2f413985d9b5 qemu_chr_open_fd() sets stdout into non-blocking mode. Restore the old fd flags on exit to avoid breaking unsuspecting applications that run on the same terminal after qemu and don't expect to get EAGAIN. While at at, also ensure term_exit is called once (at the moment it's called both from char_stdio_finalize() and as the atexit() hook. 
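To make the failure mode concrete, here is a small illustrative program (not part of the patch; chunk size and count are arbitrary) reproducing the symptom described above: once O_NONBLOCK has been left set on fd 1, plain write() calls to the terminal can start failing with EAGAIN, which is exactly what restoring old_fd1_flags in term_exit() prevents for whatever runs on the tty after QEMU exits.

/*
 * Illustrative only: simulate the state a program leaves behind when it puts
 * the terminal into non-blocking mode and never restores it.  With O_NONBLOCK
 * set on fd 1, large writes to a slow tty eventually fail with EAGAIN, which
 * naive programs (and stdio) treat as a hard error.
 */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
    char buf[64 * 1024];
    int old_flags = fcntl(STDOUT_FILENO, F_GETFL);

    memset(buf, '#', sizeof(buf));
    /* what an exiting program that forgot to clean up leaves behind */
    fcntl(STDOUT_FILENO, F_SETFL, old_flags | O_NONBLOCK);

    for (int i = 0; i < 64; i++) {
        if (write(STDOUT_FILENO, buf, sizeof(buf)) < 0 && errno == EAGAIN) {
            fprintf(stderr, "\nwrite to stdout failed with EAGAIN (chunk %d)\n", i);
            break;
        }
    }

    /* the restore this patch adds for fd 1 in term_exit() */
    fcntl(STDOUT_FILENO, F_SETFL, old_flags);
    return 0;
}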
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2423 Signed-off-by: Maxim Mikityanskiy Link: https://lore.kernel.org/r/20240703190812.3459514-1-m Signed-off-by: Gao Jiazhen --- chardev/char-stdio.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/chardev/char-stdio.c b/chardev/char-stdio.c index 3c648678ab..b960ddd4e4 100644 --- a/chardev/char-stdio.c +++ b/chardev/char-stdio.c @@ -41,6 +41,7 @@ /* init terminal so that we can grab keys */ static struct termios oldtty; static int old_fd0_flags; +static int old_fd1_flags; static bool stdio_in_use; static bool stdio_allow_signal; static bool stdio_echo_state; @@ -50,6 +51,8 @@ static void term_exit(void) if (stdio_in_use) { tcsetattr(0, TCSANOW, &oldtty); fcntl(0, F_SETFL, old_fd0_flags); + fcntl(1, F_SETFL, old_fd1_flags); + stdio_in_use = false; } } @@ -102,6 +105,7 @@ static void qemu_chr_open_stdio(Chardev *chr, stdio_in_use = true; old_fd0_flags = fcntl(0, F_GETFL); + old_fd1_flags = fcntl(1, F_GETFL); tcgetattr(0, &oldtty); if (!g_unix_set_fd_nonblocking(0, true, NULL)) { error_setg_errno(errp, errno, "Failed to set FD nonblocking"); -- Gitee From 2781f5673cc43d13b73e66fb266e7ea0b945429d Mon Sep 17 00:00:00 2001 From: Gao Jiazhen Date: Thu, 12 Sep 2024 20:55:38 +0800 Subject: [PATCH 416/939] backends/cryptodev-builtin: Fix local_error leaks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cherry picked from commit 06479dbf3d7d245572c4b3016e5a1d923ff04d66 backends/cryptodev-builtin: Fix local_error leaks It seems that this error does not need to be propagated to the upper, directly output the error to avoid the leaks Closes: https://gitlab.com/qemu-project/qemu/-/issues/2283 Fixes: 2fda101 ("virtio-crypto: Support asynchronous mode") Signed-off-by: Li Zhijian Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: zhenwei pi Reviewed-by: Michael Tokarev Signed-off-by: Michael Tokarev Signed-off-by: Gao Jiazhen --- backends/cryptodev-builtin.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/backends/cryptodev-builtin.c b/backends/cryptodev-builtin.c index 39d0455280..0822f198d9 100644 --- a/backends/cryptodev-builtin.c +++ b/backends/cryptodev-builtin.c @@ -23,6 +23,7 @@ #include "qemu/osdep.h" #include "sysemu/cryptodev.h" +#include "qemu/error-report.h" #include "qapi/error.h" #include "standard-headers/linux/virtio_crypto.h" #include "crypto/cipher.h" @@ -396,8 +397,8 @@ static int cryptodev_builtin_create_session( case VIRTIO_CRYPTO_HASH_CREATE_SESSION: case VIRTIO_CRYPTO_MAC_CREATE_SESSION: default: - error_setg(&local_error, "Unsupported opcode :%" PRIu32 "", - sess_info->op_code); + error_report("Unsupported opcode :%" PRIu32 "", + sess_info->op_code); return -VIRTIO_CRYPTO_NOTSUPP; } @@ -552,8 +553,8 @@ static int cryptodev_builtin_operation( if (op_info->session_id >= MAX_NUM_SESSIONS || builtin->sessions[op_info->session_id] == NULL) { - error_setg(&local_error, "Cannot find a valid session id: %" PRIu64 "", - op_info->session_id); + error_report("Cannot find a valid session id: %" PRIu64 "", + op_info->session_id); return -VIRTIO_CRYPTO_INVSESS; } -- Gitee From 24e4e6742bdc8d804760e84f4e4bde5460e1e024 Mon Sep 17 00:00:00 2001 From: gaosong Date: Sun, 8 Sep 2024 09:29:00 +0800 Subject: [PATCH 417/939] hw/loongarch: Add KVM IPI device support Added ipi interrupt controller for kvm emulation. The main process is to send the command word for creating an ipi device to the kernel. 
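For readers less familiar with the in-kernel irqchip plumbing: the QEMU helpers this patch relies on, kvm_vm_ioctl(..., KVM_CREATE_DEVICE, ...) and kvm_device_access(), reduce to two generic KVM ioctls on the VM and device file descriptors. The sketch below shows that raw interface for one register read of the save path; it is illustrative only, omits error handling, and assumes the LoongArch-specific constants (KVM_DEV_TYPE_LA_IPI, KVM_DEV_LOONGARCH_IPI_GRP_REGS, CORE_STATUS_OFF and friends) provided by the updated headers in this series.

#include <linux/kvm.h>
#include <stdint.h>
#include <sys/ioctl.h>

/* Create the in-kernel IPI device; the returned fd is used for all later
 * state accesses (this is the "command word" the commit message refers to). */
static int ipi_device_create(int vm_fd)
{
    struct kvm_create_device cd = { .type = KVM_DEV_TYPE_LA_IPI };

    if (ioctl(vm_fd, KVM_CREATE_DEVICE, &cd) < 0) {
        return -1;
    }
    return cd.fd;
}

/* Fetch one per-vCPU IPI register from the kernel, as pre_save does, e.g.
 * reg_off = CORE_STATUS_OFF; post_load pushes the same attributes back with
 * KVM_SET_DEVICE_ATTR. */
static int ipi_reg_get(int ipi_fd, int cpu_id, uint64_t reg_off, uint32_t *val)
{
    struct kvm_device_attr attr = {
        .group = KVM_DEV_LOONGARCH_IPI_GRP_REGS,
        .attr  = ((uint64_t)cpu_id << 16) | reg_off,   /* encoding used by the patch */
        .addr  = (uint64_t)(uintptr_t)val,
    };

    return ioctl(ipi_fd, KVM_GET_DEVICE_ATTR, &attr);
}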
When the VM is saved, the ioctl obtains the ipi interrupt controller data in the kernel and saves it. When the VM is recovered, the saved data is sent to the kernel. Signed-off-by: gaosong --- hw/intc/Kconfig | 3 + hw/intc/loongarch_ipi_kvm.c | 207 ++++++++++++++++++++++++++++++++ hw/intc/meson.build | 1 + hw/loongarch/Kconfig | 1 + hw/loongarch/virt.c | 35 ++++-- include/hw/intc/loongarch_ipi.h | 23 ++++ linux-headers/linux/kvm.h | 2 + target/loongarch/kvm/kvm.c | 4 + 8 files changed, 263 insertions(+), 13 deletions(-) create mode 100644 hw/intc/loongarch_ipi_kvm.c diff --git a/hw/intc/Kconfig b/hw/intc/Kconfig index 97d550b06b..cbba74c22e 100644 --- a/hw/intc/Kconfig +++ b/hw/intc/Kconfig @@ -93,6 +93,9 @@ config NIOS2_VIC config LOONGARCH_IPI bool +config LOONGARCH_IPI_KVM + bool + config LOONGARCH_PCH_PIC bool select UNIMP diff --git a/hw/intc/loongarch_ipi_kvm.c b/hw/intc/loongarch_ipi_kvm.c new file mode 100644 index 0000000000..fd308eb0c0 --- /dev/null +++ b/hw/intc/loongarch_ipi_kvm.c @@ -0,0 +1,207 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * LoongArch kvm ipi interrupt support + * + * Copyright (C) 2024 Loongson Technology Corporation Limited + */ + +#include "qemu/osdep.h" +#include "hw/qdev-properties.h" +#include "qemu/typedefs.h" +#include "hw/intc/loongarch_ipi.h" +#include "hw/sysbus.h" +#include "linux/kvm.h" +#include "migration/vmstate.h" +#include "qapi/error.h" +#include "sysemu/kvm.h" + +#define IPI_DEV_FD_UNDEF -1 + +static void kvm_ipi_access_regs(int fd, uint64_t addr, + uint32_t *val, int is_write) +{ + kvm_device_access(fd, KVM_DEV_LOONGARCH_IPI_GRP_REGS, + addr, val, is_write, &error_abort); +} + +static int kvm_loongarch_ipi_pre_save(void *opaque) +{ + KVMLoongArchIPI *ipi = (KVMLoongArchIPI *)opaque; + KVMLoongArchIPIClass *ipi_class = KVM_LOONGARCH_IPI_GET_CLASS(ipi); + IPICore *cpu; + uint64_t attr; + int cpu_id = 0; + int fd = ipi_class->dev_fd; + + for (cpu_id = 0; cpu_id < ipi->num_cpu; cpu_id++) { + cpu = &ipi->cpu[cpu_id]; + attr = (cpu_id << 16) | CORE_STATUS_OFF; + kvm_ipi_access_regs(fd, attr, &cpu->status, false); + + attr = (cpu_id << 16) | CORE_EN_OFF; + kvm_ipi_access_regs(fd, attr, &cpu->en, false); + + attr = (cpu_id << 16) | CORE_SET_OFF; + kvm_ipi_access_regs(fd, attr, &cpu->set, false); + + attr = (cpu_id << 16) | CORE_CLEAR_OFF; + kvm_ipi_access_regs(fd, attr, &cpu->clear, false); + + attr = (cpu_id << 16) | CORE_BUF_20; + kvm_ipi_access_regs(fd, attr, &cpu->buf[0], false); + + attr = (cpu_id << 16) | CORE_BUF_28; + kvm_ipi_access_regs(fd, attr, &cpu->buf[2], false); + + attr = (cpu_id << 16) | CORE_BUF_30; + kvm_ipi_access_regs(fd, attr, &cpu->buf[4], false); + + attr = (cpu_id << 16) | CORE_BUF_38; + kvm_ipi_access_regs(fd, attr, &cpu->buf[6], false); + } + + return 0; +} + +static int kvm_loongarch_ipi_post_load(void *opaque, int version_id) +{ + KVMLoongArchIPI *ipi = (KVMLoongArchIPI *)opaque; + KVMLoongArchIPIClass *ipi_class = KVM_LOONGARCH_IPI_GET_CLASS(ipi); + IPICore *cpu; + uint64_t attr; + int cpu_id = 0; + int fd = ipi_class->dev_fd; + + for (cpu_id = 0; cpu_id < ipi->num_cpu; cpu_id++) { + cpu = &ipi->cpu[cpu_id]; + attr = (cpu_id << 16) | CORE_STATUS_OFF; + kvm_ipi_access_regs(fd, attr, &cpu->status, true); + + attr = (cpu_id << 16) | CORE_EN_OFF; + kvm_ipi_access_regs(fd, attr, &cpu->en, true); + + attr = (cpu_id << 16) | CORE_SET_OFF; + kvm_ipi_access_regs(fd, attr, &cpu->set, true); + + attr = (cpu_id << 16) | CORE_CLEAR_OFF; + kvm_ipi_access_regs(fd, attr, &cpu->clear, true); + + attr = (cpu_id << 16) | 
CORE_BUF_20; + kvm_ipi_access_regs(fd, attr, &cpu->buf[0], true); + + attr = (cpu_id << 16) | CORE_BUF_28; + kvm_ipi_access_regs(fd, attr, &cpu->buf[2], true); + + attr = (cpu_id << 16) | CORE_BUF_30; + kvm_ipi_access_regs(fd, attr, &cpu->buf[4], true); + + attr = (cpu_id << 16) | CORE_BUF_38; + kvm_ipi_access_regs(fd, attr, &cpu->buf[6], true); + } + + return 0; +} + +static void kvm_loongarch_ipi_realize(DeviceState *dev, Error **errp) +{ + KVMLoongArchIPI *ipi = KVM_LOONGARCH_IPI(dev); + KVMLoongArchIPIClass *ipi_class = KVM_LOONGARCH_IPI_GET_CLASS(dev); + struct kvm_create_device cd = {0}; + Error *err = NULL; + int ret; + + if (ipi->num_cpu == 0) { + error_setg(errp, "num-cpu must be at least 1"); + return; + } + + ipi_class->parent_realize(dev, &err); + if (err) { + error_propagate(errp, err); + return; + } + + ipi->cpu = g_new0(IPICore, ipi->num_cpu); + if (ipi->cpu == NULL) { + error_setg(errp, "Memory allocation for ExtIOICore faile"); + return; + } + + if (!ipi_class->is_created) { + cd.type = KVM_DEV_TYPE_LA_IPI; + ret = kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd); + if (ret < 0) { + error_setg_errno(errp, errno, "Creating the KVM device failed"); + return; + } + ipi_class->is_created = true; + ipi_class->dev_fd = cd.fd; + fprintf(stdout, "Create LoongArch IPI irqchip in KVM done!\n"); + } + + assert(ipi_class->dev_fd != IPI_DEV_FD_UNDEF); +} + +static Property kvm_loongarch_ipi_properties[] = { + DEFINE_PROP_UINT32("num-cpu", KVMLoongArchIPI, num_cpu, 1), + DEFINE_PROP_END_OF_LIST() +}; + +static const VMStateDescription vmstate_kvm_ipi_core = { + .name = "kvm-ipi-single", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_UINT32(status, IPICore), + VMSTATE_UINT32(en, IPICore), + VMSTATE_UINT32(set, IPICore), + VMSTATE_UINT32(clear, IPICore), + VMSTATE_UINT32_ARRAY(buf, IPICore, 8), + VMSTATE_END_OF_LIST() + } +}; + +static const VMStateDescription vmstate_kvm_loongarch_ipi = { + .name = TYPE_KVM_LOONGARCH_IPI, + .version_id = 1, + .minimum_version_id = 1, + .pre_save = kvm_loongarch_ipi_pre_save, + .post_load = kvm_loongarch_ipi_post_load, + .fields = (VMStateField[]) { + VMSTATE_STRUCT_VARRAY_POINTER_UINT32(cpu, KVMLoongArchIPI, num_cpu, + vmstate_kvm_ipi_core, IPICore), + + VMSTATE_END_OF_LIST() + } +}; + +static void kvm_loongarch_ipi_class_init(ObjectClass *oc, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(oc); + KVMLoongArchIPIClass *ipi_class = KVM_LOONGARCH_IPI_CLASS(oc); + + ipi_class->parent_realize = dc->realize; + dc->realize = kvm_loongarch_ipi_realize; + + ipi_class->is_created = false; + ipi_class->dev_fd = IPI_DEV_FD_UNDEF; + + device_class_set_props(dc, kvm_loongarch_ipi_properties); + + dc->vmsd = &vmstate_kvm_loongarch_ipi; +} + +static const TypeInfo kvm_loongarch_ipi_info = { + .name = TYPE_KVM_LOONGARCH_IPI, + .parent = TYPE_SYS_BUS_DEVICE, + .instance_size = sizeof(KVMLoongArchIPI), + .class_size = sizeof(KVMLoongArchIPIClass), + .class_init = kvm_loongarch_ipi_class_init, +}; + +static void kvm_loongarch_ipi_register_types(void) +{ + type_register_static(&kvm_loongarch_ipi_info); +} + +type_init(kvm_loongarch_ipi_register_types) diff --git a/hw/intc/meson.build b/hw/intc/meson.build index ed355941d1..9deeeb51bb 100644 --- a/hw/intc/meson.build +++ b/hw/intc/meson.build @@ -70,6 +70,7 @@ specific_ss.add(when: ['CONFIG_KVM', 'CONFIG_XIVE'], specific_ss.add(when: 'CONFIG_M68K_IRQC', if_true: files('m68k_irqc.c')) specific_ss.add(when: 'CONFIG_NIOS2_VIC', if_true: files('nios2_vic.c')) specific_ss.add(when: 
'CONFIG_LOONGARCH_IPI', if_true: files('loongarch_ipi.c')) +specific_ss.add(when: 'CONFIG_LOONGARCH_IPI_KVM', if_true: files('loongarch_ipi_kvm.c')) specific_ss.add(when: 'CONFIG_LOONGARCH_PCH_PIC', if_true: files('loongarch_pch_pic.c')) specific_ss.add(when: 'CONFIG_LOONGARCH_PCH_MSI', if_true: files('loongarch_pch_msi.c')) specific_ss.add(when: 'CONFIG_LOONGARCH_EXTIOI', if_true: files('loongarch_extioi.c')) diff --git a/hw/loongarch/Kconfig b/hw/loongarch/Kconfig index b42a8573d4..1e761624c6 100644 --- a/hw/loongarch/Kconfig +++ b/hw/loongarch/Kconfig @@ -14,6 +14,7 @@ config LOONGARCH_VIRT select LOONGARCH_PCH_PIC select LOONGARCH_PCH_MSI select LOONGARCH_EXTIOI + select LOONGARCH_IPI_KVM if KVM select LS7A_RTC select SMBIOS select ACPI_CPU_HOTPLUG diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index 6159fd9470..f065eb75f8 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -829,16 +829,28 @@ static void virt_irq_init(LoongArchVirtMachineState *lvms) * +--------+ +---------+ +---------+ */ - /* Create IPI device */ - ipi = qdev_new(TYPE_LOONGARCH_IPI); - qdev_prop_set_uint32(ipi, "num-cpu", ms->smp.max_cpus); - sysbus_realize_and_unref(SYS_BUS_DEVICE(ipi), &error_fatal); - - /* IPI iocsr memory region */ - memory_region_add_subregion(&lvms->system_iocsr, SMP_IPI_MAILBOX, - sysbus_mmio_get_region(SYS_BUS_DEVICE(ipi), 0)); - memory_region_add_subregion(&lvms->system_iocsr, MAIL_SEND_ADDR, - sysbus_mmio_get_region(SYS_BUS_DEVICE(ipi), 1)); + if (kvm_enabled() && kvm_irqchip_in_kernel()) { + ipi = qdev_new(TYPE_KVM_LOONGARCH_IPI); + qdev_prop_set_int32(ipi, "num-cpu", ms->smp.max_cpus); + sysbus_realize_and_unref(SYS_BUS_DEVICE(ipi), &error_fatal); + } else { + ipi = qdev_new(TYPE_LOONGARCH_IPI); + qdev_prop_set_uint32(ipi, "num-cpu", ms->smp.max_cpus); + sysbus_realize_and_unref(SYS_BUS_DEVICE(ipi), &error_fatal); + + /* IPI iocsr memory region */ + memory_region_add_subregion(&lvms->system_iocsr, SMP_IPI_MAILBOX, + sysbus_mmio_get_region(SYS_BUS_DEVICE(ipi), 0)); + memory_region_add_subregion(&lvms->system_iocsr, MAIL_SEND_ADDR, + sysbus_mmio_get_region(SYS_BUS_DEVICE(ipi), 1)); + for (cpu = 0; cpu < ms->smp.cpus; cpu++) { + cpu_state = qemu_get_cpu(cpu); + cpudev = DEVICE(cpu_state); + + /* connect ipi irq to cpu irq */ + qdev_connect_gpio_out(ipi, cpu, qdev_get_gpio_in(cpudev, IRQ_IPI)); + } + } /* Add cpu interrupt-controller */ fdt_add_cpuic_node(lvms, &cpuintc_phandle); @@ -849,9 +861,6 @@ static void virt_irq_init(LoongArchVirtMachineState *lvms) lacpu = LOONGARCH_CPU(cpu_state); env = &(lacpu->env); env->address_space_iocsr = &lvms->as_iocsr; - - /* connect ipi irq to cpu irq */ - qdev_connect_gpio_out(ipi, cpu, qdev_get_gpio_in(cpudev, IRQ_IPI)); env->ipistate = ipi; } diff --git a/include/hw/intc/loongarch_ipi.h b/include/hw/intc/loongarch_ipi.h index 1c1e834849..601b4f18a7 100644 --- a/include/hw/intc/loongarch_ipi.h +++ b/include/hw/intc/loongarch_ipi.h @@ -32,6 +32,7 @@ #define TYPE_LOONGARCH_IPI "loongarch_ipi" OBJECT_DECLARE_SIMPLE_TYPE(LoongArchIPI, LOONGARCH_IPI) +#define TYPE_KVM_LOONGARCH_IPI "loongarch-ipi-kvm" typedef struct IPICore { uint32_t status; @@ -51,4 +52,26 @@ struct LoongArchIPI { IPICore *cpu; }; +struct KVMLoongArchIPI { + SysBusDevice parent_obj; + uint32_t num_cpu; + IPICore *cpu; +}; +typedef struct KVMLoongArchIPI KVMLoongArchIPI; +DECLARE_INSTANCE_CHECKER(KVMLoongArchIPI, KVM_LOONGARCH_IPI, + TYPE_KVM_LOONGARCH_IPI) + +struct KVMLoongArchIPIClass { + SysBusDeviceClass parent_class; + DeviceRealize parent_realize; + + bool 
is_created; + int dev_fd; + +}; +typedef struct KVMLoongArchIPIClass KVMLoongArchIPIClass; +DECLARE_CLASS_CHECKERS(KVMLoongArchIPIClass, KVM_LOONGARCH_IPI, + TYPE_KVM_LOONGARCH_IPI) + + #endif diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h index eb30402c2d..ea1f821a9f 100644 --- a/linux-headers/linux/kvm.h +++ b/linux-headers/linux/kvm.h @@ -1470,6 +1470,8 @@ enum kvm_device_type { #define KVM_DEV_TYPE_ARM_PV_TIME KVM_DEV_TYPE_ARM_PV_TIME KVM_DEV_TYPE_RISCV_AIA, #define KVM_DEV_TYPE_RISCV_AIA KVM_DEV_TYPE_RISCV_AIA + KVM_DEV_TYPE_LA_IPI, +#define KVM_DEV_TYPE_LA_IPI KVM_DEV_TYPE_LA_IPI KVM_DEV_TYPE_MAX, }; diff --git a/target/loongarch/kvm/kvm.c b/target/loongarch/kvm/kvm.c index 550f14269e..ab1ea3d4fd 100644 --- a/target/loongarch/kvm/kvm.c +++ b/target/loongarch/kvm/kvm.c @@ -1066,6 +1066,10 @@ int kvm_arch_get_default_type(MachineState *ms) int kvm_arch_init(MachineState *ms, KVMState *s) { cap_has_mp_state = kvm_check_extension(s, KVM_CAP_MP_STATE); + if(!kvm_vm_check_attr(kvm_state, KVM_LOONGARCH_VM_HAVE_IRQCHIP, KVM_LOONGARCH_VM_HAVE_IRQCHIP)) { + s->kernel_irqchip_allowed = false; + } + return 0; } -- Gitee From 833cdea8037d9124cd2e0328739de1b85aaec2a2 Mon Sep 17 00:00:00 2001 From: gaosong Date: Sun, 8 Sep 2024 09:50:50 +0800 Subject: [PATCH 418/939] hw/loongarch: Add KVM extioi device support Added extioi interrupt controller for kvm emulation. The main process is to send the command word for creating an extioi device to the kernel. When the VM is saved, the ioctl obtains the related data of the extioi interrupt controller in the kernel and saves it. When the VM is recovered, the saved data is sent to the kernel. Signed-off-by: gaosong --- hw/intc/Kconfig | 3 + hw/intc/loongarch_extioi_kvm.c | 150 +++++++++++++++++++++++++++++ hw/intc/meson.build | 1 + hw/loongarch/Kconfig | 1 + hw/loongarch/virt.c | 50 +++++----- include/hw/intc/loongarch_extioi.h | 36 ++++++- include/hw/loongarch/virt.h | 15 +++ linux-headers/linux/kvm.h | 2 + 8 files changed, 232 insertions(+), 26 deletions(-) create mode 100644 hw/intc/loongarch_extioi_kvm.c diff --git a/hw/intc/Kconfig b/hw/intc/Kconfig index cbba74c22e..f1e8bd2fc9 100644 --- a/hw/intc/Kconfig +++ b/hw/intc/Kconfig @@ -107,3 +107,6 @@ config LOONGARCH_PCH_MSI config LOONGARCH_EXTIOI bool + +config LOONGARCH_EXTIOI_KVM + bool diff --git a/hw/intc/loongarch_extioi_kvm.c b/hw/intc/loongarch_extioi_kvm.c new file mode 100644 index 0000000000..f5bbc33255 --- /dev/null +++ b/hw/intc/loongarch_extioi_kvm.c @@ -0,0 +1,150 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * LoongArch kvm extioi interrupt support + * + * Copyright (C) 2024 Loongson Technology Corporation Limited + */ + +#include "qemu/osdep.h" +#include "hw/qdev-properties.h" +#include "qemu/typedefs.h" +#include "hw/intc/loongarch_extioi.h" +#include "hw/sysbus.h" +#include "linux/kvm.h" +#include "migration/vmstate.h" +#include "qapi/error.h" +#include "sysemu/kvm.h" + +static void kvm_extioi_access_regs(int fd, uint64_t addr, + void *val, int is_write) +{ + kvm_device_access(fd, KVM_DEV_LOONGARCH_EXTIOI_GRP_REGS, + addr, val, is_write, &error_abort); +} + +static int kvm_loongarch_extioi_pre_save(void *opaque) +{ + KVMLoongArchExtIOI *s = (KVMLoongArchExtIOI *)opaque; + KVMLoongArchExtIOIClass *class = KVM_LOONGARCH_EXTIOI_GET_CLASS(s); + int fd = class->dev_fd; + + kvm_extioi_access_regs(fd, EXTIOI_NODETYPE_START, + (void *)s->nodetype, false); + kvm_extioi_access_regs(fd, EXTIOI_IPMAP_START, (void *)s->ipmap, false); + kvm_extioi_access_regs(fd, 
EXTIOI_ENABLE_START, (void *)s->enable, false); + kvm_extioi_access_regs(fd, EXTIOI_BOUNCE_START, (void *)s->bounce, false); + kvm_extioi_access_regs(fd, EXTIOI_ISR_START, (void *)s->isr, false); + kvm_extioi_access_regs(fd, EXTIOI_COREMAP_START, + (void *)s->coremap, false); + kvm_extioi_access_regs(fd, EXTIOI_SW_COREMAP_FLAG, + (void *)s->sw_coremap, false); + kvm_extioi_access_regs(fd, EXTIOI_COREISR_START, + (void *)s->coreisr, false); + + return 0; +} + +static int kvm_loongarch_extioi_post_load(void *opaque, int version_id) +{ + KVMLoongArchExtIOI *s = (KVMLoongArchExtIOI *)opaque; + KVMLoongArchExtIOIClass *class = KVM_LOONGARCH_EXTIOI_GET_CLASS(s); + int fd = class->dev_fd; + + kvm_extioi_access_regs(fd, EXTIOI_NODETYPE_START, + (void *)s->nodetype, true); + kvm_extioi_access_regs(fd, EXTIOI_IPMAP_START, (void *)s->ipmap, true); + kvm_extioi_access_regs(fd, EXTIOI_ENABLE_START, (void *)s->enable, true); + kvm_extioi_access_regs(fd, EXTIOI_BOUNCE_START, (void *)s->bounce, true); + kvm_extioi_access_regs(fd, EXTIOI_ISR_START, (void *)s->isr, true); + kvm_extioi_access_regs(fd, EXTIOI_COREMAP_START, (void *)s->coremap, true); + kvm_extioi_access_regs(fd, EXTIOI_SW_COREMAP_FLAG, + (void *)s->sw_coremap, true); + kvm_extioi_access_regs(fd, EXTIOI_COREISR_START, (void *)s->coreisr, true); + + return 0; +} + +static void kvm_loongarch_extioi_realize(DeviceState *dev, Error **errp) +{ + KVMLoongArchExtIOIClass *extioi_class = KVM_LOONGARCH_EXTIOI_GET_CLASS(dev); + struct kvm_create_device cd = {0}; + Error *err = NULL; + int ret,i; + + extioi_class->parent_realize(dev, &err); + if (err) { + error_propagate(errp, err); + return; + } + + if (!extioi_class->is_created) { + cd.type = KVM_DEV_TYPE_LA_EXTIOI; + ret = kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd); + if (ret < 0) { + error_setg_errno(errp, errno, + "Creating the KVM extioi device failed"); + return; + } + extioi_class->is_created = true; + extioi_class->dev_fd = cd.fd; + fprintf(stdout, "Create LoongArch extioi irqchip in KVM done!\n"); + } + + kvm_async_interrupts_allowed = true; + kvm_msi_via_irqfd_allowed = kvm_irqfds_enabled(); + if (kvm_has_gsi_routing()) { + for (i = 0; i < 64; ++i) { + kvm_irqchip_add_irq_route(kvm_state, i, 0, i); + } + kvm_gsi_routing_allowed = true; + } +} + +static const VMStateDescription vmstate_kvm_extioi_core = { + .name = "kvm-extioi-single", + .version_id = 1, + .minimum_version_id = 1, + .pre_save = kvm_loongarch_extioi_pre_save, + .post_load = kvm_loongarch_extioi_post_load, + .fields = (VMStateField[]) { + VMSTATE_UINT32_ARRAY(nodetype, KVMLoongArchExtIOI, + EXTIOI_IRQS_NODETYPE_COUNT / 2), + VMSTATE_UINT32_ARRAY(bounce, KVMLoongArchExtIOI, + EXTIOI_IRQS_GROUP_COUNT), + VMSTATE_UINT32_ARRAY(isr, KVMLoongArchExtIOI, EXTIOI_IRQS / 32), + VMSTATE_UINT32_2DARRAY(coreisr, KVMLoongArchExtIOI, EXTIOI_CPUS, + EXTIOI_IRQS_GROUP_COUNT), + VMSTATE_UINT32_ARRAY(enable, KVMLoongArchExtIOI, EXTIOI_IRQS / 32), + VMSTATE_UINT32_ARRAY(ipmap, KVMLoongArchExtIOI, + EXTIOI_IRQS_IPMAP_SIZE / 4), + VMSTATE_UINT32_ARRAY(coremap, KVMLoongArchExtIOI, EXTIOI_IRQS / 4), + VMSTATE_UINT8_ARRAY(sw_coremap, KVMLoongArchExtIOI, EXTIOI_IRQS), + VMSTATE_END_OF_LIST() + } +}; + +static void kvm_loongarch_extioi_class_init(ObjectClass *oc, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(oc); + KVMLoongArchExtIOIClass *extioi_class = KVM_LOONGARCH_EXTIOI_CLASS(oc); + + extioi_class->parent_realize = dc->realize; + dc->realize = kvm_loongarch_extioi_realize; + extioi_class->is_created = false; + dc->vmsd = 
&vmstate_kvm_extioi_core; +} + +static const TypeInfo kvm_loongarch_extioi_info = { + .name = TYPE_KVM_LOONGARCH_EXTIOI, + .parent = TYPE_SYS_BUS_DEVICE, + .instance_size = sizeof(KVMLoongArchExtIOI), + .class_size = sizeof(KVMLoongArchExtIOIClass), + .class_init = kvm_loongarch_extioi_class_init, +}; + +static void kvm_loongarch_extioi_register_types(void) +{ + type_register_static(&kvm_loongarch_extioi_info); +} + +type_init(kvm_loongarch_extioi_register_types) diff --git a/hw/intc/meson.build b/hw/intc/meson.build index 9deeeb51bb..a37d7da8aa 100644 --- a/hw/intc/meson.build +++ b/hw/intc/meson.build @@ -74,3 +74,4 @@ specific_ss.add(when: 'CONFIG_LOONGARCH_IPI_KVM', if_true: files('loongarch_ipi_ specific_ss.add(when: 'CONFIG_LOONGARCH_PCH_PIC', if_true: files('loongarch_pch_pic.c')) specific_ss.add(when: 'CONFIG_LOONGARCH_PCH_MSI', if_true: files('loongarch_pch_msi.c')) specific_ss.add(when: 'CONFIG_LOONGARCH_EXTIOI', if_true: files('loongarch_extioi.c')) +specific_ss.add(when: 'CONFIG_LOONGARCH_EXTIOI_KVM', if_true: files('loongarch_extioi_kvm.c')) diff --git a/hw/loongarch/Kconfig b/hw/loongarch/Kconfig index 1e761624c6..1a47d44a64 100644 --- a/hw/loongarch/Kconfig +++ b/hw/loongarch/Kconfig @@ -15,6 +15,7 @@ config LOONGARCH_VIRT select LOONGARCH_PCH_MSI select LOONGARCH_EXTIOI select LOONGARCH_IPI_KVM if KVM + select LOONGARCH_EXTIOI_KVM if KVM select LS7A_RTC select SMBIOS select ACPI_CPU_HOTPLUG diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index f065eb75f8..71e2a3735c 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -867,31 +867,33 @@ static void virt_irq_init(LoongArchVirtMachineState *lvms) lvms->ipi = ipi; /* Create EXTIOI device */ - extioi = qdev_new(TYPE_LOONGARCH_EXTIOI); - qdev_prop_set_uint32(extioi, "num-cpu", ms->smp.max_cpus); - if (virt_is_veiointc_enabled(lvms)) { - qdev_prop_set_bit(extioi, "has-virtualization-extension", true); - } - sysbus_realize_and_unref(SYS_BUS_DEVICE(extioi), &error_fatal); - - memory_region_add_subregion(&lvms->system_iocsr, APIC_BASE, - sysbus_mmio_get_region(SYS_BUS_DEVICE(extioi), 0)); - if (virt_is_veiointc_enabled(lvms)) { - memory_region_add_subregion(&lvms->system_iocsr, EXTIOI_VIRT_BASE, - sysbus_mmio_get_region(SYS_BUS_DEVICE(extioi), 1)); - } - lvms->extioi = extioi; - - /* - * connect ext irq to the cpu irq - * cpu_pin[9:2] <= intc_pin[7:0] - */ - for (cpu = 0; cpu < ms->smp.cpus; cpu++) { - cpudev = DEVICE(qemu_get_cpu(cpu)); - for (pin = 0; pin < LS3A_INTC_IP; pin++) { - qdev_connect_gpio_out(extioi, (cpu * 8 + pin), - qdev_get_gpio_in(cpudev, pin + 2)); + if (kvm_enabled() && kvm_irqchip_in_kernel()) { + extioi = qdev_new(TYPE_KVM_LOONGARCH_EXTIOI); + sysbus_realize_and_unref(SYS_BUS_DEVICE(extioi), &error_fatal); + } else { + extioi = qdev_new(TYPE_LOONGARCH_EXTIOI); + qdev_prop_set_uint32(extioi, "num-cpu", ms->smp.max_cpus); + if (virt_is_veiointc_enabled(lvms)) { + qdev_prop_set_bit(extioi, "has-virtualization-extension", true); } + sysbus_realize_and_unref(SYS_BUS_DEVICE(extioi), &error_fatal); + memory_region_add_subregion(&lvms->system_iocsr, APIC_BASE, + sysbus_mmio_get_region(SYS_BUS_DEVICE(extioi), 0)); + if (virt_is_veiointc_enabled(lvms)) { + memory_region_add_subregion(&lvms->system_iocsr, EXTIOI_VIRT_BASE, + sysbus_mmio_get_region(SYS_BUS_DEVICE(extioi), 1)); + } + /* + * connect ext irq to the cpu irq + * cpu_pin[9:2] <= intc_pin[7:0] + */ + for (cpu = 0; cpu < ms->smp.cpus; cpu++) { + cpudev = DEVICE(qemu_get_cpu(cpu)); + for (pin = 0; pin < LS3A_INTC_IP; pin++) { + 
qdev_connect_gpio_out(extioi, (cpu * 8 + pin), + qdev_get_gpio_in(cpudev, pin + 2)); + } + } } lvms->extioi = extioi; diff --git a/include/hw/intc/loongarch_extioi.h b/include/hw/intc/loongarch_extioi.h index 722ffee1bc..9966cd98d3 100644 --- a/include/hw/intc/loongarch_extioi.h +++ b/include/hw/intc/loongarch_extioi.h @@ -15,7 +15,7 @@ #define EXTIOI_IRQS (256) #define EXTIOI_IRQS_BITMAP_SIZE (256 / 8) /* irq from EXTIOI is routed to no more than 4 cpus */ -#define EXTIOI_CPUS (4) +#define EXTIOI_CPUS (256) /* map to ipnum per 32 irqs */ #define EXTIOI_IRQS_IPMAP_SIZE (256 / 32) #define EXTIOI_IRQS_COREMAP_SIZE 256 @@ -59,13 +59,17 @@ #define EXTIOI_VIRT_COREMAP_START (0x40) #define EXTIOI_VIRT_COREMAP_END (0x240) +#define EXTIOI_SW_COREMAP_FLAG (1 << 0) + typedef struct ExtIOICore { uint32_t coreisr[EXTIOI_IRQS_GROUP_COUNT]; DECLARE_BITMAP(sw_isr[LS3A_INTC_IP], EXTIOI_IRQS); qemu_irq parent_irq[LS3A_INTC_IP]; } ExtIOICore; -#define TYPE_LOONGARCH_EXTIOI "loongarch.extioi" +#define TYPE_LOONGARCH_EXTIOI "loongarch-extioi" +#define TYPE_KVM_LOONGARCH_EXTIOI "loongarch-kvm-extioi" + OBJECT_DECLARE_SIMPLE_TYPE(LoongArchExtIOI, LOONGARCH_EXTIOI) struct LoongArchExtIOI { SysBusDevice parent_obj; @@ -87,4 +91,32 @@ struct LoongArchExtIOI { MemoryRegion extioi_system_mem; MemoryRegion virt_extend; }; + +struct KVMLoongArchExtIOI { + SysBusDevice parent_obj; + /* hardware state */ + uint32_t nodetype[EXTIOI_IRQS_NODETYPE_COUNT / 2]; + uint32_t bounce[EXTIOI_IRQS_GROUP_COUNT]; + uint32_t isr[EXTIOI_IRQS / 32]; + uint32_t coreisr[EXTIOI_CPUS][EXTIOI_IRQS_GROUP_COUNT]; + uint32_t enable[EXTIOI_IRQS / 32]; + uint32_t ipmap[EXTIOI_IRQS_IPMAP_SIZE / 4]; + uint32_t coremap[EXTIOI_IRQS / 4]; + uint8_t sw_coremap[EXTIOI_IRQS]; +}; +typedef struct KVMLoongArchExtIOI KVMLoongArchExtIOI; +DECLARE_INSTANCE_CHECKER(KVMLoongArchExtIOI, KVM_LOONGARCH_EXTIOI, + TYPE_KVM_LOONGARCH_EXTIOI) + +struct KVMLoongArchExtIOIClass { + SysBusDeviceClass parent_class; + DeviceRealize parent_realize; + + bool is_created; + int dev_fd; +}; +typedef struct KVMLoongArchExtIOIClass KVMLoongArchExtIOIClass; +DECLARE_CLASS_CHECKERS(KVMLoongArchExtIOIClass, KVM_LOONGARCH_EXTIOI, + TYPE_KVM_LOONGARCH_EXTIOI) + #endif /* LOONGARCH_EXTIOI_H */ diff --git a/include/hw/loongarch/virt.h b/include/hw/loongarch/virt.h index 98c990327b..168b40c31b 100644 --- a/include/hw/loongarch/virt.h +++ b/include/hw/loongarch/virt.h @@ -38,6 +38,21 @@ #define FDT_BASE 0x100000 +/* KVM_IRQ_LINE irq field index values */ +#define KVM_LOONGARCH_IRQ_TYPE_SHIFT 24 +#define KVM_LOONGARCH_IRQ_TYPE_MASK 0xff +#define KVM_LOONGARCH_IRQ_VCPU_SHIFT 16 +#define KVM_LOONGARCH_IRQ_VCPU_MASK 0xff +#define KVM_LOONGARCH_IRQ_NUM_SHIFT 0 +#define KVM_LOONGARCH_IRQ_NUM_MASK 0xffff + +/* irq_type field */ +#define KVM_LOONGARCH_IRQ_TYPE_CPU_IP 0 +#define KVM_LOONGARCH_IRQ_TYPE_CPU_IO 1 +#define KVM_LOONGARCH_IRQ_TYPE_HT 2 +#define KVM_LOONGARCH_IRQ_TYPE_MSI 3 +#define KVM_LOONGARCH_IRQ_TYPE_IOAPIC 4 + struct LoongArchVirtMachineState { /*< private >*/ MachineState parent_obj; diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h index ea1f821a9f..0c0b82d1ef 100644 --- a/linux-headers/linux/kvm.h +++ b/linux-headers/linux/kvm.h @@ -1472,6 +1472,8 @@ enum kvm_device_type { #define KVM_DEV_TYPE_RISCV_AIA KVM_DEV_TYPE_RISCV_AIA KVM_DEV_TYPE_LA_IPI, #define KVM_DEV_TYPE_LA_IPI KVM_DEV_TYPE_LA_IPI + KVM_DEV_TYPE_LA_EXTIOI, +#define KVM_DEV_TYPE_LA_EXTIOI KVM_DEV_TYPE_LA_EXTIOI KVM_DEV_TYPE_MAX, }; -- Gitee From 30f88e80a47d9bcde08c44c0d752c22c11f2224c Mon Sep 17 
00:00:00 2001 From: gaosong Date: Sun, 8 Sep 2024 10:13:29 +0800 Subject: [PATCH 419/939] hw/loongarch: Add KVM pch pic device support Added pch_pic interrupt controller for kvm emulation. The main process is to send the command word for creating an pch_pic device to the kernel, Delivers the pch pic interrupt controller configuration register base address to the kernel. When the VM is saved, the ioctl obtains the pch_pic interrupt controller data in the kernel and saves it. When the VM is recovered, the saved data is sent to the kernel. Signed-off-by: gaosong --- hw/intc/Kconfig | 3 + hw/intc/loongarch_pch_pic.c | 24 +++- hw/intc/loongarch_pch_pic_kvm.c | 189 ++++++++++++++++++++++++++++ hw/intc/meson.build | 1 + hw/loongarch/Kconfig | 1 + hw/loongarch/virt.c | 70 ++++++----- include/hw/intc/loongarch_pch_pic.h | 51 +++++++- linux-headers/linux/kvm.h | 2 + 8 files changed, 303 insertions(+), 38 deletions(-) create mode 100644 hw/intc/loongarch_pch_pic_kvm.c diff --git a/hw/intc/Kconfig b/hw/intc/Kconfig index f1e8bd2fc9..91c7aa668e 100644 --- a/hw/intc/Kconfig +++ b/hw/intc/Kconfig @@ -100,6 +100,9 @@ config LOONGARCH_PCH_PIC bool select UNIMP +config LOONGARCH_PCH_PIC_KVM + bool + config LOONGARCH_PCH_MSI select MSI_NONBROKEN bool diff --git a/hw/intc/loongarch_pch_pic.c b/hw/intc/loongarch_pch_pic.c index 6aa4cadfa4..beb4ac188d 100644 --- a/hw/intc/loongarch_pch_pic.c +++ b/hw/intc/loongarch_pch_pic.c @@ -16,19 +16,28 @@ #include "migration/vmstate.h" #include "trace.h" #include "qapi/error.h" +#include "sysemu/kvm.h" static void pch_pic_update_irq(LoongArchPCHPIC *s, uint64_t mask, int level) { uint64_t val; int irq; + int kvm_irq; if (level) { val = mask & s->intirr & ~s->int_mask; if (val) { irq = ctz64(val); s->intisr |= MAKE_64BIT_MASK(irq, 1); - qemu_set_irq(s->parent_irq[s->htmsi_vector[irq]], 1); - } + if (kvm_enabled() && kvm_irqchip_in_kernel()) { + kvm_irq = ( + KVM_LOONGARCH_IRQ_TYPE_IOAPIC << KVM_LOONGARCH_IRQ_TYPE_SHIFT) + | (0 << KVM_LOONGARCH_IRQ_VCPU_SHIFT) | s->htmsi_vector[irq]; + kvm_set_irq(kvm_state, kvm_irq, !!level); + } else { + qemu_set_irq(s->parent_irq[s->htmsi_vector[irq]], 1); + } + } } else { /* * intirr means requested pending irq @@ -38,8 +47,15 @@ static void pch_pic_update_irq(LoongArchPCHPIC *s, uint64_t mask, int level) if (val) { irq = ctz64(val); s->intisr &= ~MAKE_64BIT_MASK(irq, 1); - qemu_set_irq(s->parent_irq[s->htmsi_vector[irq]], 0); - } + if (kvm_enabled() && kvm_irqchip_in_kernel()) { + kvm_irq = ( + KVM_LOONGARCH_IRQ_TYPE_IOAPIC << KVM_LOONGARCH_IRQ_TYPE_SHIFT) + | (0 << KVM_LOONGARCH_IRQ_VCPU_SHIFT) | s->htmsi_vector[irq]; + kvm_set_irq(kvm_state, kvm_irq, !!level); + } else { + qemu_set_irq(s->parent_irq[s->htmsi_vector[irq]], 0); + } + } } } diff --git a/hw/intc/loongarch_pch_pic_kvm.c b/hw/intc/loongarch_pch_pic_kvm.c new file mode 100644 index 0000000000..8f66d9a01f --- /dev/null +++ b/hw/intc/loongarch_pch_pic_kvm.c @@ -0,0 +1,189 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * LoongArch kvm pch pic interrupt support + * + * Copyright (C) 2024 Loongson Technology Corporation Limited + */ + +#include "qemu/osdep.h" +#include "hw/qdev-properties.h" +#include "qemu/typedefs.h" +#include "hw/intc/loongarch_pch_pic.h" +#include "hw/sysbus.h" +#include "linux/kvm.h" +#include "migration/vmstate.h" +#include "qapi/error.h" +#include "sysemu/kvm.h" +#include "hw/loongarch/virt.h" +#include "hw/pci-host/ls7a.h" +#include "qemu/error-report.h" + +static void kvm_pch_pic_access_regs(int fd, uint64_t addr, + void *val, int is_write) 
+{ + kvm_device_access(fd, KVM_DEV_LOONGARCH_PCH_PIC_GRP_REGS, + addr, val, is_write, &error_abort); +} + +static int kvm_loongarch_pch_pic_pre_save(void *opaque) +{ + KVMLoongArchPCHPIC *s = (KVMLoongArchPCHPIC *)opaque; + KVMLoongArchPCHPICClass *class = KVM_LOONGARCH_PCH_PIC_GET_CLASS(s); + int fd = class->dev_fd; + + kvm_pch_pic_access_regs(fd, PCH_PIC_MASK_START, + (void *)&s->int_mask, false); + kvm_pch_pic_access_regs(fd, PCH_PIC_HTMSI_EN_START, + (void *)&s->htmsi_en, false); + kvm_pch_pic_access_regs(fd, PCH_PIC_EDGE_START, + (void *)&s->intedge, false); + kvm_pch_pic_access_regs(fd, PCH_PIC_AUTO_CTRL0_START, + (void *)&s->auto_crtl0, false); + kvm_pch_pic_access_regs(fd, PCH_PIC_AUTO_CTRL1_START, + (void *)&s->auto_crtl1, false); + kvm_pch_pic_access_regs(fd, PCH_PIC_ROUTE_ENTRY_START, + (void *)s->route_entry, false); + kvm_pch_pic_access_regs(fd, PCH_PIC_HTMSI_VEC_START, + (void *)s->htmsi_vector, false); + kvm_pch_pic_access_regs(fd, PCH_PIC_INT_IRR_START, + (void *)&s->intirr, false); + kvm_pch_pic_access_regs(fd, PCH_PIC_INT_ISR_START, + (void *)&s->intisr, false); + kvm_pch_pic_access_regs(fd, PCH_PIC_POLARITY_START, + (void *)&s->int_polarity, false); + + return 0; +} + +static int kvm_loongarch_pch_pic_post_load(void *opaque, int version_id) +{ + KVMLoongArchPCHPIC *s = (KVMLoongArchPCHPIC *)opaque; + KVMLoongArchPCHPICClass *class = KVM_LOONGARCH_PCH_PIC_GET_CLASS(s); + int fd = class->dev_fd; + + kvm_pch_pic_access_regs(fd, PCH_PIC_MASK_START, + (void *)&s->int_mask, true); + kvm_pch_pic_access_regs(fd, PCH_PIC_HTMSI_EN_START, + (void *)&s->htmsi_en, true); + kvm_pch_pic_access_regs(fd, PCH_PIC_EDGE_START, + (void *)&s->intedge, true); + kvm_pch_pic_access_regs(fd, PCH_PIC_AUTO_CTRL0_START, + (void *)&s->auto_crtl0, true); + kvm_pch_pic_access_regs(fd, PCH_PIC_AUTO_CTRL1_START, + (void *)&s->auto_crtl1, true); + kvm_pch_pic_access_regs(fd, PCH_PIC_ROUTE_ENTRY_START, + (void *)s->route_entry, true); + kvm_pch_pic_access_regs(fd, PCH_PIC_HTMSI_VEC_START, + (void *)s->htmsi_vector, true); + kvm_pch_pic_access_regs(fd, PCH_PIC_INT_IRR_START, + (void *)&s->intirr, true); + kvm_pch_pic_access_regs(fd, PCH_PIC_INT_ISR_START, + (void *)&s->intisr, true); + kvm_pch_pic_access_regs(fd, PCH_PIC_POLARITY_START, + (void *)&s->int_polarity, true); + + return 0; +} + +static void kvm_pch_pic_handler(void *opaque, int irq, int level) +{ + int kvm_irq; + + if (kvm_enabled()) { + kvm_irq = \ + (KVM_LOONGARCH_IRQ_TYPE_IOAPIC << KVM_LOONGARCH_IRQ_TYPE_SHIFT) + | (0 << KVM_LOONGARCH_IRQ_VCPU_SHIFT) | irq; + kvm_set_irq(kvm_state, kvm_irq, !!level); + } +} + +static void kvm_loongarch_pch_pic_realize(DeviceState *dev, Error **errp) +{ + KVMLoongArchPCHPICClass *pch_pic_class = + KVM_LOONGARCH_PCH_PIC_GET_CLASS(dev); + struct kvm_create_device cd = {0}; + uint64_t pch_pic_base = VIRT_PCH_REG_BASE; + Error *err = NULL; + int ret; + + pch_pic_class->parent_realize(dev, &err); + if (err) { + error_propagate(errp, err); + return; + } + + if (!pch_pic_class->is_created) { + cd.type = KVM_DEV_TYPE_LA_PCH_PIC; + ret = kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd); + if (ret < 0) { + error_setg_errno(errp, errno, + "Creating the KVM pch pic device failed"); + return; + } + pch_pic_class->is_created = true; + pch_pic_class->dev_fd = cd.fd; + fprintf(stdout, "Create LoongArch pch pic irqchip in KVM done!\n"); + + ret = kvm_device_access(cd.fd, KVM_DEV_LOONGARCH_PCH_PIC_GRP_CTRL, + KVM_DEV_LOONGARCH_PCH_PIC_CTRL_INIT, + &pch_pic_base, true, NULL); + if (ret < 0) { + error_report( + "KVM EXTIOI: failed 
to set the base address of EXTIOI"); + exit(1); + } + + qdev_init_gpio_in(dev, kvm_pch_pic_handler, VIRT_PCH_PIC_IRQ_NUM); + } +} + +static const VMStateDescription vmstate_kvm_loongarch_pch_pic = { + .name = TYPE_LOONGARCH_PCH_PIC, + .version_id = 1, + .minimum_version_id = 1, + .pre_save = kvm_loongarch_pch_pic_pre_save, + .post_load = kvm_loongarch_pch_pic_post_load, + .fields = (const VMStateField[]) { + VMSTATE_UINT64(int_mask, KVMLoongArchPCHPIC), + VMSTATE_UINT64(htmsi_en, KVMLoongArchPCHPIC), + VMSTATE_UINT64(intedge, KVMLoongArchPCHPIC), + VMSTATE_UINT64(intclr, KVMLoongArchPCHPIC), + VMSTATE_UINT64(auto_crtl0, KVMLoongArchPCHPIC), + VMSTATE_UINT64(auto_crtl1, KVMLoongArchPCHPIC), + VMSTATE_UINT8_ARRAY(route_entry, KVMLoongArchPCHPIC, 64), + VMSTATE_UINT8_ARRAY(htmsi_vector, KVMLoongArchPCHPIC, 64), + VMSTATE_UINT64(last_intirr, KVMLoongArchPCHPIC), + VMSTATE_UINT64(intirr, KVMLoongArchPCHPIC), + VMSTATE_UINT64(intisr, KVMLoongArchPCHPIC), + VMSTATE_UINT64(int_polarity, KVMLoongArchPCHPIC), + VMSTATE_END_OF_LIST() + } +}; + + +static void kvm_loongarch_pch_pic_class_init(ObjectClass *oc, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(oc); + KVMLoongArchPCHPICClass *pch_pic_class = KVM_LOONGARCH_PCH_PIC_CLASS(oc); + + pch_pic_class->parent_realize = dc->realize; + dc->realize = kvm_loongarch_pch_pic_realize; + pch_pic_class->is_created = false; + dc->vmsd = &vmstate_kvm_loongarch_pch_pic; + +} + +static const TypeInfo kvm_loongarch_pch_pic_info = { + .name = TYPE_KVM_LOONGARCH_PCH_PIC, + .parent = TYPE_SYS_BUS_DEVICE, + .instance_size = sizeof(KVMLoongArchPCHPIC), + .class_size = sizeof(KVMLoongArchPCHPICClass), + .class_init = kvm_loongarch_pch_pic_class_init, +}; + +static void kvm_loongarch_pch_pic_register_types(void) +{ + type_register_static(&kvm_loongarch_pch_pic_info); +} + +type_init(kvm_loongarch_pch_pic_register_types) diff --git a/hw/intc/meson.build b/hw/intc/meson.build index a37d7da8aa..49b4501315 100644 --- a/hw/intc/meson.build +++ b/hw/intc/meson.build @@ -75,3 +75,4 @@ specific_ss.add(when: 'CONFIG_LOONGARCH_PCH_PIC', if_true: files('loongarch_pch_ specific_ss.add(when: 'CONFIG_LOONGARCH_PCH_MSI', if_true: files('loongarch_pch_msi.c')) specific_ss.add(when: 'CONFIG_LOONGARCH_EXTIOI', if_true: files('loongarch_extioi.c')) specific_ss.add(when: 'CONFIG_LOONGARCH_EXTIOI_KVM', if_true: files('loongarch_extioi_kvm.c')) +specific_ss.add(when: 'CONFIG_LOONGARCH_PCH_PIC_KVM', if_true: files('loongarch_pch_pic_kvm.c')) diff --git a/hw/loongarch/Kconfig b/hw/loongarch/Kconfig index 1a47d44a64..16c854c0d5 100644 --- a/hw/loongarch/Kconfig +++ b/hw/loongarch/Kconfig @@ -15,6 +15,7 @@ config LOONGARCH_VIRT select LOONGARCH_PCH_MSI select LOONGARCH_EXTIOI select LOONGARCH_IPI_KVM if KVM + select LOONGARCH_PCH_PIC_KVM if KVM select LOONGARCH_EXTIOI_KVM if KVM select LS7A_RTC select SMBIOS diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index 71e2a3735c..270dcfd38f 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -901,45 +901,49 @@ static void virt_irq_init(LoongArchVirtMachineState *lvms) /* Add Extend I/O Interrupt Controller node */ fdt_add_eiointc_node(lvms, &cpuintc_phandle, &eiointc_phandle); - pch_pic = qdev_new(TYPE_LOONGARCH_PCH_PIC); - num = VIRT_PCH_PIC_IRQ_NUM; - qdev_prop_set_uint32(pch_pic, "pch_pic_irq_num", num); - d = SYS_BUS_DEVICE(pch_pic); - sysbus_realize_and_unref(d, &error_fatal); - memory_region_add_subregion(get_system_memory(), VIRT_IOAPIC_REG_BASE, - sysbus_mmio_get_region(d, 0)); - memory_region_add_subregion(get_system_memory(), 
- VIRT_IOAPIC_REG_BASE + PCH_PIC_ROUTE_ENTRY_OFFSET, - sysbus_mmio_get_region(d, 1)); - memory_region_add_subregion(get_system_memory(), - VIRT_IOAPIC_REG_BASE + PCH_PIC_INT_STATUS_LO, - sysbus_mmio_get_region(d, 2)); - - /* Connect pch_pic irqs to extioi */ - for (i = 0; i < num; i++) { - qdev_connect_gpio_out(DEVICE(d), i, qdev_get_gpio_in(extioi, i)); - } - /* Add PCH PIC node */ fdt_add_pch_pic_node(lvms, &eiointc_phandle, &pch_pic_phandle); - pch_msi = qdev_new(TYPE_LOONGARCH_PCH_MSI); - start = num; - num = EXTIOI_IRQS - start; - qdev_prop_set_uint32(pch_msi, "msi_irq_base", start); - qdev_prop_set_uint32(pch_msi, "msi_irq_num", num); - d = SYS_BUS_DEVICE(pch_msi); - sysbus_realize_and_unref(d, &error_fatal); - sysbus_mmio_map(d, 0, VIRT_PCH_MSI_ADDR_LOW); - for (i = 0; i < num; i++) { - /* Connect pch_msi irqs to extioi */ - qdev_connect_gpio_out(DEVICE(d), i, - qdev_get_gpio_in(extioi, i + start)); - } - /* Add PCH MSI node */ fdt_add_pch_msi_node(lvms, &eiointc_phandle, &pch_msi_phandle); + if (kvm_enabled() && kvm_irqchip_in_kernel()) { + pch_pic = qdev_new(TYPE_KVM_LOONGARCH_PCH_PIC); + sysbus_realize_and_unref(SYS_BUS_DEVICE(pch_pic), &error_fatal); + } else { + pch_pic = qdev_new(TYPE_LOONGARCH_PCH_PIC); + num = VIRT_PCH_PIC_IRQ_NUM; + qdev_prop_set_uint32(pch_pic, "pch_pic_irq_num", num); + d = SYS_BUS_DEVICE(pch_pic); + sysbus_realize_and_unref(d, &error_fatal); + memory_region_add_subregion(get_system_memory(), VIRT_IOAPIC_REG_BASE, + sysbus_mmio_get_region(d, 0)); + memory_region_add_subregion(get_system_memory(), + VIRT_IOAPIC_REG_BASE + PCH_PIC_ROUTE_ENTRY_OFFSET, + sysbus_mmio_get_region(d, 1)); + memory_region_add_subregion(get_system_memory(), + VIRT_IOAPIC_REG_BASE + PCH_PIC_INT_STATUS_LO, + sysbus_mmio_get_region(d, 2)); + /* Connect pch_pic irqs to extioi */ + for (i = 0; i < num; i++) { + qdev_connect_gpio_out(DEVICE(d), i, qdev_get_gpio_in(extioi, i)); + } + + pch_msi = qdev_new(TYPE_LOONGARCH_PCH_MSI); + start = num; + num = EXTIOI_IRQS - start; + qdev_prop_set_uint32(pch_msi, "msi_irq_base", start); + qdev_prop_set_uint32(pch_msi, "msi_irq_num", num); + d = SYS_BUS_DEVICE(pch_msi); + sysbus_realize_and_unref(d, &error_fatal); + sysbus_mmio_map(d, 0, VIRT_PCH_MSI_ADDR_LOW); + for (i = 0; i < num; i++) { + /* Connect pch_msi irqs to extioi */ + qdev_connect_gpio_out(DEVICE(d), i, + qdev_get_gpio_in(extioi, i + start)); + } + } + virt_devices_init(pch_pic, lvms, &pch_pic_phandle, &pch_msi_phandle); } diff --git a/include/hw/intc/loongarch_pch_pic.h b/include/hw/intc/loongarch_pch_pic.h index d5437e88f2..77f4cd74a1 100644 --- a/include/hw/intc/loongarch_pch_pic.h +++ b/include/hw/intc/loongarch_pch_pic.h @@ -7,7 +7,8 @@ #include "hw/sysbus.h" -#define TYPE_LOONGARCH_PCH_PIC "loongarch_pch_pic" +#define TYPE_LOONGARCH_PCH_PIC "loongarch_pch_pic" +#define TYPE_KVM_LOONGARCH_PCH_PIC "loongarch_kvm_pch_pic" #define PCH_PIC_NAME(name) TYPE_LOONGARCH_PCH_PIC#name OBJECT_DECLARE_SIMPLE_TYPE(LoongArchPCHPIC, LOONGARCH_PCH_PIC) @@ -37,6 +38,19 @@ OBJECT_DECLARE_SIMPLE_TYPE(LoongArchPCHPIC, LOONGARCH_PCH_PIC) #define PCH_PIC_INT_POL_LO 0x3e0 #define PCH_PIC_INT_POL_HI 0x3e4 +#define PCH_PIC_INT_ID_START PCH_PIC_INT_ID_LO +#define PCH_PIC_MASK_START PCH_PIC_INT_MASK_LO +#define PCH_PIC_HTMSI_EN_START PCH_PIC_HTMSI_EN_LO +#define PCH_PIC_EDGE_START PCH_PIC_INT_EDGE_LO +#define PCH_PIC_CLEAR_START PCH_PIC_INT_CLEAR_LO +#define PCH_PIC_AUTO_CTRL0_START PCH_PIC_AUTO_CTRL0_LO +#define PCH_PIC_AUTO_CTRL1_START PCH_PIC_AUTO_CTRL1_LO +#define PCH_PIC_ROUTE_ENTRY_START 
PCH_PIC_ROUTE_ENTRY_OFFSET +#define PCH_PIC_HTMSI_VEC_START PCH_PIC_HTMSI_VEC_OFFSET +#define PCH_PIC_INT_IRR_START 0x380 +#define PCH_PIC_INT_ISR_START PCH_PIC_INT_STATUS_LO +#define PCH_PIC_POLARITY_START PCH_PIC_INT_POL_LO + #define STATUS_LO_START 0 #define STATUS_HI_START 0x4 #define POL_LO_START 0x40 @@ -67,3 +81,38 @@ struct LoongArchPCHPIC { MemoryRegion iomem8; unsigned int irq_num; }; + +struct KVMLoongArchPCHPIC { + SysBusDevice parent_obj; + uint64_t int_mask; /*0x020 interrupt mask register*/ + uint64_t htmsi_en; /*0x040 1=msi*/ + uint64_t intedge; /*0x060 edge=1 level =0*/ + uint64_t intclr; /*0x080 for clean edge int,set 1 clean,set 0 is noused*/ + uint64_t auto_crtl0; /*0x0c0*/ + uint64_t auto_crtl1; /*0x0e0*/ + uint64_t last_intirr; /* edge detection */ + uint64_t intirr; /* 0x380 interrupt request register */ + uint64_t intisr; /* 0x3a0 interrupt service register */ + /* + * 0x3e0 interrupt level polarity selection + * register 0 for high level trigger + */ + uint64_t int_polarity; + + uint8_t route_entry[64]; /*0x100 - 0x138*/ + uint8_t htmsi_vector[64]; /*0x200 - 0x238*/ +}; +typedef struct KVMLoongArchPCHPIC KVMLoongArchPCHPIC; +DECLARE_INSTANCE_CHECKER(KVMLoongArchPCHPIC, KVM_LOONGARCH_PCH_PIC, + TYPE_KVM_LOONGARCH_PCH_PIC) + +struct KVMLoongArchPCHPICClass { + SysBusDeviceClass parent_class; + DeviceRealize parent_realize; + + bool is_created; + int dev_fd; +}; +typedef struct KVMLoongArchPCHPICClass KVMLoongArchPCHPICClass; +DECLARE_CLASS_CHECKERS(KVMLoongArchPCHPICClass, KVM_LOONGARCH_PCH_PIC, + TYPE_KVM_LOONGARCH_PCH_PIC) diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h index 0c0b82d1ef..887f8268e7 100644 --- a/linux-headers/linux/kvm.h +++ b/linux-headers/linux/kvm.h @@ -1470,6 +1470,8 @@ enum kvm_device_type { #define KVM_DEV_TYPE_ARM_PV_TIME KVM_DEV_TYPE_ARM_PV_TIME KVM_DEV_TYPE_RISCV_AIA, #define KVM_DEV_TYPE_RISCV_AIA KVM_DEV_TYPE_RISCV_AIA + KVM_DEV_TYPE_LA_PCH_PIC = 0x100, +#define KVM_DEV_TYPE_LA_PCH_PIC KVM_DEV_TYPE_LA_PCH_PIC KVM_DEV_TYPE_LA_IPI, #define KVM_DEV_TYPE_LA_IPI KVM_DEV_TYPE_LA_IPI KVM_DEV_TYPE_LA_EXTIOI, -- Gitee From 24bd774f8146247c7ac6071492f6016140a97267 Mon Sep 17 00:00:00 2001 From: gaosong Date: Sun, 8 Sep 2024 22:18:50 +0800 Subject: [PATCH 420/939] hw/loongarch: Add KVM pch msi device support Added pch_msi interrupt controller handling during kernel emulation of irq chip. 
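With the in-kernel irqchip the MSI write is no longer turned into an extioi GPIO pulse in userspace; it is forwarded to KVM as an MSI message. A condensed sketch of the dispatch this patch adds inside loongarch_msi_mem_write() (simplified from the diff below, return-value checks omitted):

    MSIMessage msg = { .address = addr, .data = val };

    if (kvm_enabled() && kvm_irqchip_in_kernel()) {
        /* in-kernel irqchip: hand the MSI straight to KVM */
        kvm_irqchip_send_msi(kvm_state, msg);
    } else {
        /* userspace irqchip: keep raising the extioi input as before */
        int irq_num = (val & 0xff) - s->irq_base;
        qemu_set_irq(s->pch_msi_irq[irq_num], 1);
    }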
Signed-off-by: gaosong --- hw/intc/loongarch_pch_msi.c | 39 ++++++++++++++++++++++------- hw/loongarch/virt.c | 22 +++++++++------- include/hw/intc/loongarch_pch_msi.h | 2 +- 3 files changed, 44 insertions(+), 19 deletions(-) diff --git a/hw/intc/loongarch_pch_msi.c b/hw/intc/loongarch_pch_msi.c index ecf3ed0267..901c2c21be 100644 --- a/hw/intc/loongarch_pch_msi.c +++ b/hw/intc/loongarch_pch_msi.c @@ -14,6 +14,8 @@ #include "hw/misc/unimp.h" #include "migration/vmstate.h" #include "trace.h" +#include "sysemu/kvm.h" +#include "hw/loongarch/virt.h" static uint64_t loongarch_msi_mem_read(void *opaque, hwaddr addr, unsigned size) { @@ -26,14 +28,24 @@ static void loongarch_msi_mem_write(void *opaque, hwaddr addr, LoongArchPCHMSI *s = (LoongArchPCHMSI *)opaque; int irq_num; - /* - * vector number is irq number from upper extioi intc - * need subtract irq base to get msi vector offset - */ - irq_num = (val & 0xff) - s->irq_base; - trace_loongarch_msi_set_irq(irq_num); - assert(irq_num < s->irq_num); - qemu_set_irq(s->pch_msi_irq[irq_num], 1); + MSIMessage msg = { + .address = addr, + .data = val, + }; + + if (kvm_enabled() && kvm_irqchip_in_kernel()) { + kvm_irqchip_send_msi(kvm_state, msg); + } else { + /* + * vector number is irq number from upper extioi intc + * need subtract irq base to get msi vector offset + */ + irq_num = (val & 0xff) - s->irq_base; + trace_loongarch_msi_set_irq(irq_num); + assert(irq_num < s->irq_num); + + qemu_set_irq(s->pch_msi_irq[irq_num], 1); + } } static const MemoryRegionOps loongarch_pch_msi_ops = { @@ -46,7 +58,16 @@ static void pch_msi_irq_handler(void *opaque, int irq, int level) { LoongArchPCHMSI *s = LOONGARCH_PCH_MSI(opaque); - qemu_set_irq(s->pch_msi_irq[irq], level); + MSIMessage msg = { + .address = 0, + .data = irq, + }; + + if (kvm_enabled() && kvm_irqchip_in_kernel()) { + kvm_irqchip_send_msi(kvm_state, msg); + } else { + qemu_set_irq(s->pch_msi_irq[irq], level); + } } static void loongarch_pch_msi_realize(DeviceState *dev, Error **errp) diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index 270dcfd38f..5b0468f6cb 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -928,22 +928,26 @@ static void virt_irq_init(LoongArchVirtMachineState *lvms) for (i = 0; i < num; i++) { qdev_connect_gpio_out(DEVICE(d), i, qdev_get_gpio_in(extioi, i)); } + } - pch_msi = qdev_new(TYPE_LOONGARCH_PCH_MSI); - start = num; - num = EXTIOI_IRQS - start; - qdev_prop_set_uint32(pch_msi, "msi_irq_base", start); - qdev_prop_set_uint32(pch_msi, "msi_irq_num", num); - d = SYS_BUS_DEVICE(pch_msi); - sysbus_realize_and_unref(d, &error_fatal); - sysbus_mmio_map(d, 0, VIRT_PCH_MSI_ADDR_LOW); + pch_msi = qdev_new(TYPE_LOONGARCH_PCH_MSI); + num = VIRT_PCH_PIC_IRQ_NUM; + start = num; + num = EXTIOI_IRQS - start; + qdev_prop_set_uint32(pch_msi, "msi_irq_base", start); + qdev_prop_set_uint32(pch_msi, "msi_irq_num", num); + d = SYS_BUS_DEVICE(pch_msi); + sysbus_realize_and_unref(d, &error_fatal); + + if (!(kvm_enabled() && kvm_irqchip_in_kernel())) { + /* Connect pch_msi irqs to extioi */ for (i = 0; i < num; i++) { - /* Connect pch_msi irqs to extioi */ qdev_connect_gpio_out(DEVICE(d), i, qdev_get_gpio_in(extioi, i + start)); } } + sysbus_mmio_map(d, 0, VIRT_PCH_MSI_ADDR_LOW); virt_devices_init(pch_pic, lvms, &pch_pic_phandle, &pch_msi_phandle); } diff --git a/include/hw/intc/loongarch_pch_msi.h b/include/hw/intc/loongarch_pch_msi.h index b8586fb3b6..fd4ea97a83 100644 --- a/include/hw/intc/loongarch_pch_msi.h +++ b/include/hw/intc/loongarch_pch_msi.h @@ -7,7 +7,7 @@ #include 
"hw/sysbus.h" -#define TYPE_LOONGARCH_PCH_MSI "loongarch_pch_msi" +#define TYPE_LOONGARCH_PCH_MSI "loongarch_pch_msi" OBJECT_DECLARE_SIMPLE_TYPE(LoongArchPCHMSI, LOONGARCH_PCH_MSI) /* MSI irq start from 32 to 255 */ -- Gitee From 4a74147e1b2e276eb2ad2855bafc3c0136bc18a3 Mon Sep 17 00:00:00 2001 From: gaosong Date: Sun, 8 Sep 2024 22:34:57 +0800 Subject: [PATCH 421/939] hw/loongarch: clean code remove some unused code Signed-off-by: gaosong --- target/loongarch/kvm/kvm.c | 103 --------------------------- target/loongarch/kvm/kvm_loongarch.h | 2 - target/loongarch/machine.c | 20 ------ 3 files changed, 125 deletions(-) diff --git a/target/loongarch/kvm/kvm.c b/target/loongarch/kvm/kvm.c index ab1ea3d4fd..0acdd5c4c1 100644 --- a/target/loongarch/kvm/kvm.c +++ b/target/loongarch/kvm/kvm.c @@ -684,53 +684,6 @@ static int kvm_check_cpucfg2(CPUState *cs) return ret; } -static int kvm_check_cpucfg6(CPUState *cs) -{ - int ret; - uint64_t val; - struct kvm_device_attr attr = { - .group = KVM_LOONGARCH_VCPU_CPUCFG, - .attr = 6, - .addr = (uint64_t)&val, - }; - LoongArchCPU *cpu = LOONGARCH_CPU(cs); - CPULoongArchState *env = &cpu->env; - - ret = kvm_vcpu_ioctl(cs, KVM_HAS_DEVICE_ATTR, &attr); - if (!ret) { - kvm_vcpu_ioctl(cs, KVM_GET_DEVICE_ATTR, &attr); - - if (FIELD_EX32(env->cpucfg[6], CPUCFG6, PMP)) { - /* Check PMP */ - if (!FIELD_EX32(val, CPUCFG6, PMP)) { - error_report("'pmu' feature not supported by KVM on this host" - " Please disable 'pmu' with " - "'... -cpu XXX,pmu=off ...'\n"); - exit(EXIT_FAILURE); - } - /* Check PMNUM */ - int guest_pmnum = FIELD_EX32(env->cpucfg[6], CPUCFG6, PMNUM); - int host_pmnum = FIELD_EX32(val, CPUCFG6, PMNUM); - if (guest_pmnum > host_pmnum){ - warn_report("The guest pmnum %d larger than KVM support %d\n", - guest_pmnum, host_pmnum); - env->cpucfg[6] = FIELD_DP32(env->cpucfg[6], CPUCFG6, - PMNUM, host_pmnum); - } - /* Check PMBITS */ - int guest_pmbits = FIELD_EX32(env->cpucfg[6], CPUCFG6, PMBITS); - int host_pmbits = FIELD_EX32(val, CPUCFG6, PMBITS); - if (guest_pmbits != host_pmbits) { - warn_report("The host not support PMBITS %d\n", guest_pmbits); - env->cpucfg[6] = FIELD_DP32(env->cpucfg[6], CPUCFG6, - PMBITS, host_pmbits); - } - } - } - - return ret; -} - static int kvm_loongarch_put_cpucfg(CPUState *cs) { int i, ret = 0; @@ -745,12 +698,6 @@ static int kvm_loongarch_put_cpucfg(CPUState *cs) return ret; } } - if (i == 6) { - ret = kvm_check_cpucfg6(cs); - if (ret) { - return ret; - } - } val = env->cpucfg[i]; ret = kvm_set_one_reg(cs, KVM_IOC_CPUCFG(i), &val); if (ret < 0) { @@ -760,56 +707,6 @@ static int kvm_loongarch_put_cpucfg(CPUState *cs) return ret; } -int kvm_loongarch_put_pvtime(LoongArchCPU *cpu) -{ - CPULoongArchState *env = &cpu->env; - int err; - struct kvm_device_attr attr = { - .group = KVM_LOONGARCH_VCPU_PVTIME_CTRL, - .attr = KVM_LOONGARCH_VCPU_PVTIME_GPA, - .addr = (uint64_t)&env->st.guest_addr, - }; - - err = kvm_vcpu_ioctl(CPU(cpu), KVM_HAS_DEVICE_ATTR, attr); - if (err != 0) { - /* It's ok even though kvm has not such attr */ - return 0; - } - - err = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_DEVICE_ATTR, attr); - if (err != 0) { - error_report("PVTIME IPA: KVM_SET_DEVICE_ATTR: %s", strerror(-err)); - return err; - } - - return 0; -} - -int kvm_loongarch_get_pvtime(LoongArchCPU *cpu) -{ - CPULoongArchState *env = &cpu->env; - int err; - struct kvm_device_attr attr = { - .group = KVM_LOONGARCH_VCPU_PVTIME_CTRL, - .attr = KVM_LOONGARCH_VCPU_PVTIME_GPA, - .addr = (uint64_t)&env->st.guest_addr, - }; - - err = kvm_vcpu_ioctl(CPU(cpu), 
KVM_HAS_DEVICE_ATTR, attr); - if (err != 0) { - /* It's ok even though kvm has not such attr */ - return 0; - } - - err = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_DEVICE_ATTR, attr); - if (err != 0) { - error_report("PVTIME IPA: KVM_GET_DEVICE_ATTR: %s", strerror(-err)); - return err; - } - - return 0; -} - int kvm_arch_get_registers(CPUState *cs) { int ret; diff --git a/target/loongarch/kvm/kvm_loongarch.h b/target/loongarch/kvm/kvm_loongarch.h index 8482f9308d..1051a341ec 100644 --- a/target/loongarch/kvm/kvm_loongarch.h +++ b/target/loongarch/kvm/kvm_loongarch.h @@ -11,8 +11,6 @@ #define QEMU_KVM_LOONGARCH_H int kvm_loongarch_set_interrupt(LoongArchCPU *cpu, int irq, int level); -int kvm_loongarch_put_pvtime(LoongArchCPU *cpu); -int kvm_loongarch_get_pvtime(LoongArchCPU *cpu); void kvm_arch_reset_vcpu(CPUState *cs); #endif diff --git a/target/loongarch/machine.c b/target/loongarch/machine.c index fd69ea05dc..57abdddc09 100644 --- a/target/loongarch/machine.c +++ b/target/loongarch/machine.c @@ -112,24 +112,6 @@ static const VMStateDescription vmstate_lasx = { }, }; -static int cpu_post_load(void *opaque, int version_id) -{ -#ifdef CONFIG_KVM - LoongArchCPU *cpu = opaque; - kvm_loongarch_put_pvtime(cpu); -#endif - return 0; -} - -static int cpu_pre_save(void *opaque) -{ -#ifdef CONFIG_KVM - LoongArchCPU *cpu = opaque; - kvm_loongarch_get_pvtime(cpu); -#endif - return 0; -} - static bool lbt_needed(void *opaque) { LoongArchCPU *cpu = opaque; @@ -190,8 +172,6 @@ const VMStateDescription vmstate_loongarch_cpu = { .name = "cpu", .version_id = 3, .minimum_version_id = 3, - .post_load = cpu_post_load, - .pre_save = cpu_pre_save, .fields = (const VMStateField[]) { VMSTATE_UINTTL_ARRAY(env.gpr, LoongArchCPU, 32), VMSTATE_UINTTL(env.pc, LoongArchCPU), -- Gitee From b7217c8f9b3f1d611485bad1263e109484a743e3 Mon Sep 17 00:00:00 2001 From: Song Gao Date: Wed, 30 Oct 2024 09:23:59 +0800 Subject: [PATCH 422/939] hw/loongarch/boot: Use warn_report when no kernel filename MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When we run “qemu-system-loongarch64 -qmp stdio -vnc none -S”, we get an error message “Need kernel filename” and then we can't use qmp cmd to query some information. So, we just throw a warning and then the cpus starts running from address VIRT_FLASH0_BASE. 
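The effect is easiest to see as the boot-entry selection; a condensed sketch, where pick_boot_entry() is an illustrative name and the real logic sits inline in loongarch_direct_kernel_boot():

    static int64_t pick_boot_entry(struct loongarch_boot_info *info)
    {
        /* default: CPUs start fetching from the first flash device */
        int64_t kernel_addr = VIRT_FLASH0_BASE;

        if (info->kernel_filename) {
            kernel_addr = load_kernel_info(info);
        } else if (!qtest_enabled()) {
            /* a warning instead of exit(1), so QMP-only runs keep working */
            warn_report("No kernel provided, booting from flash drive.");
        }
        return kernel_addr;
    }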
Signed-off-by: Song Gao Reviewed-by: Philippe Mathieu-Daudé Message-ID: <20241030012359.4040817-1-gaosong@loongson.cn> --- hw/loongarch/boot.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/hw/loongarch/boot.c b/hw/loongarch/boot.c index cb668703bd..f258eefe9a 100644 --- a/hw/loongarch/boot.c +++ b/hw/loongarch/boot.c @@ -278,7 +278,7 @@ static void init_boot_rom(struct loongarch_boot_info *info, void *p) static void loongarch_direct_kernel_boot(struct loongarch_boot_info *info) { void *p, *bp; - int64_t kernel_addr = 0; + int64_t kernel_addr = VIRT_FLASH0_BASE; LoongArchCPU *lacpu; CPUState *cs; @@ -286,8 +286,7 @@ static void loongarch_direct_kernel_boot(struct loongarch_boot_info *info) kernel_addr = load_kernel_info(info); } else { if(!qtest_enabled()) { - error_report("Need kernel filename\n"); - exit(1); + warn_report("No kernel provided, booting from flash drive."); } } -- Gitee From f3f7b49a8a323ebfe2be176985336aaf2c97c6c2 Mon Sep 17 00:00:00 2001 From: gaosong Date: Mon, 9 Sep 2024 04:14:49 +0800 Subject: [PATCH 423/939] hw/loongarch: fix cpu hotplug reset Signed-off-by: gaosong --- hw/loongarch/boot.c | 2 +- hw/loongarch/virt.c | 1 + include/hw/loongarch/virt.h | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/hw/loongarch/boot.c b/hw/loongarch/boot.c index f258eefe9a..53dcefbb55 100644 --- a/hw/loongarch/boot.c +++ b/hw/loongarch/boot.c @@ -216,7 +216,7 @@ static int64_t load_kernel_info(struct loongarch_boot_info *info) return kernel_entry; } -static void reset_load_elf(void *opaque) +void reset_load_elf(void *opaque) { LoongArchCPU *cpu = opaque; CPULoongArchState *env = &cpu->env; diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index 5b0468f6cb..0c24e632bb 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -1494,6 +1494,7 @@ static void virt_cpu_plug(HotplugHandler *hotplug_dev, env = &(cpu->env); env->address_space_iocsr = &lvms->as_iocsr; + qemu_register_reset(reset_load_elf, LOONGARCH_CPU(qemu_get_cpu(cs->cpu_index))); env->ipistate = lvms->ipi; if (!(kvm_enabled() && kvm_irqchip_in_kernel())) { /* connect ipi irq to cpu irq, logic cpu index used here */ diff --git a/include/hw/loongarch/virt.h b/include/hw/loongarch/virt.h index 168b40c31b..a79ad41663 100644 --- a/include/hw/loongarch/virt.h +++ b/include/hw/loongarch/virt.h @@ -86,4 +86,5 @@ struct LoongArchVirtMachineState { #define TYPE_LOONGARCH_VIRT_MACHINE MACHINE_TYPE_NAME("virt") OBJECT_DECLARE_SIMPLE_TYPE(LoongArchVirtMachineState, LOONGARCH_VIRT_MACHINE) void loongarch_acpi_setup(LoongArchVirtMachineState *lvms); +void reset_load_elf(void *opaque); #endif -- Gitee From b9e94d97025251cfd13b3ad859b97002504285ce Mon Sep 17 00:00:00 2001 From: Gao Jiazhen Date: Fri, 13 Sep 2024 18:57:20 +0800 Subject: [PATCH 424/939] hw/loongarch/virt: Fix FDT memory node address width cherry picked from commitd 6204af704a071ea68d3af55c0502b112a7af9546 Higher bits for memory nodes were omitted at qemu_fdt_setprop_cells. 
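Each FDT "reg" cell is 32 bits, and the node is described with two address and two size cells, so a 64-bit base/size pair has to be split into (hi, lo) halves; hard-coding the high cells to 0 silently truncates memory nodes above 4 GiB. The corrected call, annotated:

    /* reg = <base_hi base_lo size_hi size_lo> */
    qemu_fdt_setprop_cells(ms->fdt, nodename, "reg",
                           base >> 32, base, size >> 32, size);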
Cc: mailto:qemu-stable@nongnu.org Signed-off-by: Jiaxun Yang jiaxun.yang@flygoat.com Reviewed-by: Song Gao gaosong@loongson.cn Message-Id: 20240520-loongarch-fdt-memnode-v1-1-5ea9be93911e@flygoat.com Signed-off-by: Song Gao gaosong@loongson.cn Signed-off-by: Gao Jiazhen gaojiazhen_yewu@cmss.chinamobile.com --- hw/loongarch/virt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index fc7b70ed4e..5d4fcb7a55 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -360,7 +360,8 @@ static void fdt_add_memory_node(MachineState *ms, char *nodename = g_strdup_printf("/memory@%" PRIx64, base); qemu_fdt_add_subnode(ms->fdt, nodename); - qemu_fdt_setprop_cells(ms->fdt, nodename, "reg", 0, base, 0, size); + qemu_fdt_setprop_cells(ms->fdt, nodename, "reg", base >> 32, base, + size >> 32, size); qemu_fdt_setprop_string(ms->fdt, nodename, "device_type", "memory"); if (ms->numa_state && ms->numa_state->num_nodes) { -- Gitee From 9fd6abb40a7223f83244cdad4edf1f8ba21071aa Mon Sep 17 00:00:00 2001 From: Mattias Nissler Date: Thu, 7 Sep 2023 06:04:23 -0700 Subject: [PATCH 425/939] system/physmem: Propagate AddressSpace to MapClient helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cherry-pick from 5c62719710bab66a98f68ebdba333e2240ed6668 Propagate AddressSpace handler to following helpers: - register_map_client() - unregister_map_client() - notify_map_clients[_locked]() Rename them using 'address_space_' prefix instead of 'cpu_'. The AddressSpace argument will be used in the next commit. Reviewed-by: Peter Xu Tested-by: Jonathan Cameron Signed-off-by: Mattias Nissler Message-ID: <20240507094210.300566-2-mnissler@rivosinc.com> [PMD: Split patch, part 1/2] Signed-off-by: Philippe Mathieu-Daudé Signed-off-by: liuxiangdong --- include/exec/cpu-common.h | 2 -- include/exec/memory.h | 26 ++++++++++++++++++++++++-- system/dma-helpers.c | 4 ++-- system/physmem.c | 24 ++++++++++++------------ 4 files changed, 38 insertions(+), 18 deletions(-) diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h index 2a3d4aa1c8..c7fd30d5b9 100644 --- a/include/exec/cpu-common.h +++ b/include/exec/cpu-common.h @@ -165,8 +165,6 @@ void *cpu_physical_memory_map(hwaddr addr, bool is_write); void cpu_physical_memory_unmap(void *buffer, hwaddr len, bool is_write, hwaddr access_len); -void cpu_register_map_client(QEMUBH *bh); -void cpu_unregister_map_client(QEMUBH *bh); bool cpu_physical_memory_is_io(hwaddr phys_addr); diff --git a/include/exec/memory.h b/include/exec/memory.h index 91c42c9a6a..4b7dc7f055 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -2916,8 +2916,8 @@ bool address_space_access_valid(AddressSpace *as, hwaddr addr, hwaddr len, * May return %NULL and set *@plen to zero(0), if resources needed to perform * the mapping are exhausted. * Use only for reads OR writes - not for read-modify-write operations. - * Use cpu_register_map_client() to know when retrying the map operation is - * likely to succeed. + * Use address_space_register_map_client() to know when retrying the map + * operation is likely to succeed. 
* * @as: #AddressSpace to be accessed * @addr: address within that address space @@ -2942,6 +2942,28 @@ void *address_space_map(AddressSpace *as, hwaddr addr, void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len, bool is_write, hwaddr access_len); +/* + * address_space_register_map_client: Register a callback to invoke when + * resources for address_space_map() are available again. + * + * address_space_map may fail when there are not enough resources available, + * such as when bounce buffer memory would exceed the limit. The callback can + * be used to retry the address_space_map operation. Note that the callback + * gets automatically removed after firing. + * + * @as: #AddressSpace to be accessed + * @bh: callback to invoke when address_space_map() retry is appropriate + */ +void address_space_register_map_client(AddressSpace *as, QEMUBH *bh); + +/* + * address_space_unregister_map_client: Unregister a callback that has + * previously been registered and not fired yet. + * + * @as: #AddressSpace to be accessed + * @bh: callback to unregister + */ +void address_space_unregister_map_client(AddressSpace *as, QEMUBH *bh); /* Internal functions, part of the implementation of address_space_read. */ MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr, diff --git a/system/dma-helpers.c b/system/dma-helpers.c index 36211acc7e..611ea04ffb 100644 --- a/system/dma-helpers.c +++ b/system/dma-helpers.c @@ -167,7 +167,7 @@ static void dma_blk_cb(void *opaque, int ret) if (dbs->iov.size == 0) { trace_dma_map_wait(dbs); dbs->bh = aio_bh_new(ctx, reschedule_dma, dbs); - cpu_register_map_client(dbs->bh); + address_space_register_map_client(dbs->sg->as, dbs->bh); goto out; } @@ -197,7 +197,7 @@ static void dma_aio_cancel(BlockAIOCB *acb) } if (dbs->bh) { - cpu_unregister_map_client(dbs->bh); + address_space_unregister_map_client(dbs->sg->as, dbs->bh); qemu_bh_delete(dbs->bh); dbs->bh = NULL; } diff --git a/system/physmem.c b/system/physmem.c index 0c629233bd..1d01e7a32b 100644 --- a/system/physmem.c +++ b/system/physmem.c @@ -3040,24 +3040,24 @@ QemuMutex map_client_list_lock; static QLIST_HEAD(, MapClient) map_client_list = QLIST_HEAD_INITIALIZER(map_client_list); -static void cpu_unregister_map_client_do(MapClient *client) +static void address_space_unregister_map_client_do(MapClient *client) { QLIST_REMOVE(client, link); g_free(client); } -static void cpu_notify_map_clients_locked(void) +static void address_space_notify_map_clients_locked(AddressSpace *as) { MapClient *client; while (!QLIST_EMPTY(&map_client_list)) { client = QLIST_FIRST(&map_client_list); qemu_bh_schedule(client->bh); - cpu_unregister_map_client_do(client); + address_space_unregister_map_client_do(client); } } -void cpu_register_map_client(QEMUBH *bh) +void address_space_register_map_client(AddressSpace *as, QEMUBH *bh) { MapClient *client = g_malloc(sizeof(*client)); @@ -3067,7 +3067,7 @@ void cpu_register_map_client(QEMUBH *bh) /* Write map_client_list before reading in_use. 
*/ smp_mb(); if (!qatomic_read(&bounce.in_use)) { - cpu_notify_map_clients_locked(); + address_space_notify_map_clients_locked(as); } qemu_mutex_unlock(&map_client_list_lock); } @@ -3088,24 +3088,24 @@ void cpu_exec_init_all(void) qemu_mutex_init(&map_client_list_lock); } -void cpu_unregister_map_client(QEMUBH *bh) +void address_space_unregister_map_client(AddressSpace *as, QEMUBH *bh) { MapClient *client; qemu_mutex_lock(&map_client_list_lock); QLIST_FOREACH(client, &map_client_list, link) { if (client->bh == bh) { - cpu_unregister_map_client_do(client); + address_space_unregister_map_client_do(client); break; } } qemu_mutex_unlock(&map_client_list_lock); } -static void cpu_notify_map_clients(void) +static void address_space_notify_map_clients(AddressSpace *as) { qemu_mutex_lock(&map_client_list_lock); - cpu_notify_map_clients_locked(); + address_space_notify_map_clients_locked(as); qemu_mutex_unlock(&map_client_list_lock); } @@ -3173,8 +3173,8 @@ flatview_extend_translation(FlatView *fv, hwaddr addr, * May map a subset of the requested range, given by and returned in *plen. * May return NULL if resources needed to perform the mapping are exhausted. * Use only for reads OR writes - not for read-modify-write operations. - * Use cpu_register_map_client() to know when retrying the map operation is - * likely to succeed. + * Use address_space_register_map_client() to know when retrying the map + * operation is likely to succeed. */ void *address_space_map(AddressSpace *as, hwaddr addr, @@ -3257,7 +3257,7 @@ void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len, memory_region_unref(bounce.mr); /* Clear in_use before reading map_client_list. */ qatomic_set_mb(&bounce.in_use, false); - cpu_notify_map_clients(); + address_space_notify_map_clients(as); } void *cpu_physical_memory_map(hwaddr addr, -- Gitee From 215731d484366474a90a2e14f3a75bb84fd314a3 Mon Sep 17 00:00:00 2001 From: Mattias Nissler Date: Thu, 7 Sep 2023 06:04:23 -0700 Subject: [PATCH 426/939] system/physmem: Per-AddressSpace bounce buffering MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cherry-pick from 69e78f1b3484e429274352a464a94fa1d78be339 Instead of using a single global bounce buffer, give each AddressSpace its own bounce buffer. The MapClient callback mechanism moves to AddressSpace accordingly. This is in preparation for generalizing bounce buffer handling further to allow multiple bounce buffers, with a total allocation limit configured per AddressSpace. 
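The map-client list is the retry mechanism for address_space_map() callers: when bounce-buffer resources are exhausted the map returns NULL, and the caller can register a one-shot bottom half that fires once resources free up again (the callback is removed automatically after firing). A minimal usage sketch following the dma-helpers.c pattern, where start_dma() and retry_map() are illustrative names:

    static void retry_map(void *opaque);   /* re-runs the mapping attempt */

    static void start_dma(AddressSpace *as, hwaddr addr, hwaddr len, void *opaque)
    {
        hwaddr plen = len;
        void *buf = address_space_map(as, addr, &plen, false,
                                      MEMTXATTRS_UNSPECIFIED);

        if (!buf) {
            /* resources exhausted: queue a one-shot retry callback */
            address_space_register_map_client(as, qemu_bh_new(retry_map, opaque));
            return;
        }

        /* ... read up to plen bytes from buf ..., then release the mapping */
        address_space_unmap(as, buf, plen, false, plen);
    }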
Reviewed-by: Peter Xu Tested-by: Jonathan Cameron Signed-off-by: Mattias Nissler Message-ID: <20240507094210.300566-2-mnissler@rivosinc.com> Reviewed-by: Philippe Mathieu-Daudé [PMD: Split patch, part 2/2] Signed-off-by: Philippe Mathieu-Daudé Signed-off-by: liuxiangdong --- include/exec/memory.h | 19 +++++++++++ system/memory.c | 7 ++++ system/physmem.c | 79 ++++++++++++++++--------------------------- 3 files changed, 56 insertions(+), 49 deletions(-) diff --git a/include/exec/memory.h b/include/exec/memory.h index 4b7dc7f055..40dcf70530 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -1106,6 +1106,19 @@ struct MemoryListener { QTAILQ_ENTRY(MemoryListener) link_as; }; +typedef struct AddressSpaceMapClient { + QEMUBH *bh; + QLIST_ENTRY(AddressSpaceMapClient) link; +} AddressSpaceMapClient; + +typedef struct { + MemoryRegion *mr; + void *buffer; + hwaddr addr; + hwaddr len; + bool in_use; +} BounceBuffer; + /** * struct AddressSpace: describes a mapping of addresses to #MemoryRegion objects */ @@ -1124,6 +1137,12 @@ struct AddressSpace { struct MemoryRegionIoeventfd *ioeventfds; QTAILQ_HEAD(, MemoryListener) listeners; QTAILQ_ENTRY(AddressSpace) address_spaces_link; + + /* Bounce buffer to use for this address space. */ + BounceBuffer bounce; + /* List of callbacks to invoke when buffers free up */ + QemuMutex map_client_list_lock; + QLIST_HEAD(, AddressSpaceMapClient) map_client_list; }; typedef struct AddressSpaceDispatch AddressSpaceDispatch; diff --git a/system/memory.c b/system/memory.c index fb817e54bc..026e47dcb8 100644 --- a/system/memory.c +++ b/system/memory.c @@ -3117,6 +3117,9 @@ void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name) as->ioeventfds = NULL; QTAILQ_INIT(&as->listeners); QTAILQ_INSERT_TAIL(&address_spaces, as, address_spaces_link); + as->bounce.in_use = false; + qemu_mutex_init(&as->map_client_list_lock); + QLIST_INIT(&as->map_client_list); as->name = g_strdup(name ? 
name : "anonymous"); address_space_update_topology(as); address_space_update_ioeventfds(as); @@ -3124,6 +3127,10 @@ void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name) static void do_address_space_destroy(AddressSpace *as) { + assert(!qatomic_read(&as->bounce.in_use)); + assert(QLIST_EMPTY(&as->map_client_list)); + qemu_mutex_destroy(&as->map_client_list_lock); + assert(QTAILQ_EMPTY(&as->listeners)); flatview_unref(as->current_map); diff --git a/system/physmem.c b/system/physmem.c index 1d01e7a32b..4491a7dbd1 100644 --- a/system/physmem.c +++ b/system/physmem.c @@ -3021,26 +3021,8 @@ void cpu_flush_icache_range(hwaddr start, hwaddr len) NULL, len, FLUSH_CACHE); } -typedef struct { - MemoryRegion *mr; - void *buffer; - hwaddr addr; - hwaddr len; - bool in_use; -} BounceBuffer; - -static BounceBuffer bounce; - -typedef struct MapClient { - QEMUBH *bh; - QLIST_ENTRY(MapClient) link; -} MapClient; - -QemuMutex map_client_list_lock; -static QLIST_HEAD(, MapClient) map_client_list - = QLIST_HEAD_INITIALIZER(map_client_list); - -static void address_space_unregister_map_client_do(MapClient *client) +static void +address_space_unregister_map_client_do(AddressSpaceMapClient *client) { QLIST_REMOVE(client, link); g_free(client); @@ -3048,10 +3030,10 @@ static void address_space_unregister_map_client_do(MapClient *client) static void address_space_notify_map_clients_locked(AddressSpace *as) { - MapClient *client; + AddressSpaceMapClient *client; - while (!QLIST_EMPTY(&map_client_list)) { - client = QLIST_FIRST(&map_client_list); + while (!QLIST_EMPTY(&as->map_client_list)) { + client = QLIST_FIRST(&as->map_client_list); qemu_bh_schedule(client->bh); address_space_unregister_map_client_do(client); } @@ -3059,17 +3041,17 @@ static void address_space_notify_map_clients_locked(AddressSpace *as) void address_space_register_map_client(AddressSpace *as, QEMUBH *bh) { - MapClient *client = g_malloc(sizeof(*client)); + AddressSpaceMapClient *client = g_malloc(sizeof(*client)); - qemu_mutex_lock(&map_client_list_lock); + qemu_mutex_lock(&as->map_client_list_lock); client->bh = bh; - QLIST_INSERT_HEAD(&map_client_list, client, link); + QLIST_INSERT_HEAD(&as->map_client_list, client, link); /* Write map_client_list before reading in_use. 
*/ smp_mb(); - if (!qatomic_read(&bounce.in_use)) { + if (!qatomic_read(&as->bounce.in_use)) { address_space_notify_map_clients_locked(as); } - qemu_mutex_unlock(&map_client_list_lock); + qemu_mutex_unlock(&as->map_client_list_lock); } void cpu_exec_init_all(void) @@ -3085,28 +3067,27 @@ void cpu_exec_init_all(void) finalize_target_page_bits(); io_mem_init(); memory_map_init(); - qemu_mutex_init(&map_client_list_lock); } void address_space_unregister_map_client(AddressSpace *as, QEMUBH *bh) { - MapClient *client; + AddressSpaceMapClient *client; - qemu_mutex_lock(&map_client_list_lock); - QLIST_FOREACH(client, &map_client_list, link) { + qemu_mutex_lock(&as->map_client_list_lock); + QLIST_FOREACH(client, &as->map_client_list, link) { if (client->bh == bh) { address_space_unregister_map_client_do(client); break; } } - qemu_mutex_unlock(&map_client_list_lock); + qemu_mutex_unlock(&as->map_client_list_lock); } static void address_space_notify_map_clients(AddressSpace *as) { - qemu_mutex_lock(&map_client_list_lock); + qemu_mutex_lock(&as->map_client_list_lock); address_space_notify_map_clients_locked(as); - qemu_mutex_unlock(&map_client_list_lock); + qemu_mutex_unlock(&as->map_client_list_lock); } static bool flatview_access_valid(FlatView *fv, hwaddr addr, hwaddr len, @@ -3197,25 +3178,25 @@ void *address_space_map(AddressSpace *as, mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs); if (!memory_access_is_direct(mr, is_write)) { - if (qatomic_xchg(&bounce.in_use, true)) { + if (qatomic_xchg(&as->bounce.in_use, true)) { *plen = 0; return NULL; } /* Avoid unbounded allocations */ l = MIN(l, TARGET_PAGE_SIZE); - bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l); - bounce.addr = addr; - bounce.len = l; + as->bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l); + as->bounce.addr = addr; + as->bounce.len = l; memory_region_ref(mr); - bounce.mr = mr; + as->bounce.mr = mr; if (!is_write) { flatview_read(fv, addr, MEMTXATTRS_UNSPECIFIED, - bounce.buffer, l); + as->bounce.buffer, l); } *plen = l; - return bounce.buffer; + return as->bounce.buffer; } @@ -3233,7 +3214,7 @@ void *address_space_map(AddressSpace *as, void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len, bool is_write, hwaddr access_len) { - if (buffer != bounce.buffer) { + if (buffer != as->bounce.buffer) { MemoryRegion *mr; ram_addr_t addr1; @@ -3249,14 +3230,14 @@ void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len, return; } if (is_write) { - address_space_write(as, bounce.addr, MEMTXATTRS_UNSPECIFIED, - bounce.buffer, access_len); + address_space_write(as, as->bounce.addr, MEMTXATTRS_UNSPECIFIED, + as->bounce.buffer, access_len); } - qemu_vfree(bounce.buffer); - bounce.buffer = NULL; - memory_region_unref(bounce.mr); + qemu_vfree(as->bounce.buffer); + as->bounce.buffer = NULL; + memory_region_unref(as->bounce.mr); /* Clear in_use before reading map_client_list. 
*/ - qatomic_set_mb(&bounce.in_use, false); + qatomic_set_mb(&as->bounce.in_use, false); address_space_notify_map_clients(as); } -- Gitee From 17ba0dab19bd20d6388ce26e71b02c211e1d4690 Mon Sep 17 00:00:00 2001 From: Mattias Nissler Date: Mon, 19 Aug 2024 06:54:54 -0700 Subject: [PATCH 427/939] softmmu: Support concurrent bounce buffers(CVE-2024-8612) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cherry-pick from 637b0aa139565cb82a7b9269e62214f87082635c When DMA memory can't be directly accessed, as is the case when running the device model in a separate process without shareable DMA file descriptors, bounce buffering is used. It is not uncommon for device models to request mapping of several DMA regions at the same time. Examples include: * net devices, e.g. when transmitting a packet that is split across several TX descriptors (observed with igb) * USB host controllers, when handling a packet with multiple data TRBs (observed with xhci) Previously, qemu only provided a single bounce buffer per AddressSpace and would fail DMA map requests while the buffer was already in use. In turn, this would cause DMA failures that ultimately manifest as hardware errors from the guest perspective. This change allocates DMA bounce buffers dynamically instead of supporting only a single buffer. Thus, multiple DMA mappings work correctly also when RAM can't be mmap()-ed. The total bounce buffer allocation size is limited individually for each AddressSpace. The default limit is 4096 bytes, matching the previous maximum buffer size. A new x-max-bounce-buffer-size parameter is provided to configure the limit for PCI devices. Signed-off-by: Mattias Nissler Reviewed-by: Philippe Mathieu-Daudé Acked-by: Peter Xu Link: https://lore.kernel.org/r/20240819135455.2957406-1-mnissler@rivosinc.com Signed-off-by: Peter Xu --- hw/pci/pci.c | 8 ++++ include/exec/memory.h | 14 +++---- include/hw/pci/pci_device.h | 3 ++ system/memory.c | 5 ++- system/physmem.c | 82 ++++++++++++++++++++++++++----------- 5 files changed, 76 insertions(+), 36 deletions(-) diff --git a/hw/pci/pci.c b/hw/pci/pci.c index 9da41088df..7467a2a9de 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -85,6 +85,8 @@ static Property pci_props[] = { QEMU_PCIE_ERR_UNC_MASK_BITNR, true), DEFINE_PROP_BIT("x-pcie-ari-nextfn-1", PCIDevice, cap_present, QEMU_PCIE_ARI_NEXTFN_1_BITNR, false), + DEFINE_PROP_SIZE32("x-max-bounce-buffer-size", PCIDevice, + max_bounce_buffer_size, DEFAULT_MAX_BOUNCE_BUFFER_SIZE), DEFINE_PROP_END_OF_LIST() }; @@ -1201,6 +1203,8 @@ static PCIDevice *do_pci_register_device(PCIDevice *pci_dev, "bus master container", UINT64_MAX); address_space_init(&pci_dev->bus_master_as, &pci_dev->bus_master_container_region, pci_dev->name); + pci_dev->bus_master_as.max_bounce_buffer_size = + pci_dev->max_bounce_buffer_size; if (phase_check(PHASE_MACHINE_READY)) { pci_init_bus_master(pci_dev); @@ -2658,6 +2662,10 @@ static void pci_device_class_init(ObjectClass *klass, void *data) k->unrealize = pci_qdev_unrealize; k->bus_type = TYPE_PCI_BUS; device_class_set_props(k, pci_props); + object_class_property_set_description( + klass, "x-max-bounce-buffer-size", + "Maximum buffer size allocated for bounce buffers used for mapped " + "access to indirect DMA memory"); } static void pci_device_class_base_init(ObjectClass *klass, void *data) diff --git a/include/exec/memory.h b/include/exec/memory.h index 40dcf70530..73d274d8f3 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -1111,13 +1111,7 @@ typedef 
struct AddressSpaceMapClient { QLIST_ENTRY(AddressSpaceMapClient) link; } AddressSpaceMapClient; -typedef struct { - MemoryRegion *mr; - void *buffer; - hwaddr addr; - hwaddr len; - bool in_use; -} BounceBuffer; +#define DEFAULT_MAX_BOUNCE_BUFFER_SIZE (4096) /** * struct AddressSpace: describes a mapping of addresses to #MemoryRegion objects @@ -1138,8 +1132,10 @@ struct AddressSpace { QTAILQ_HEAD(, MemoryListener) listeners; QTAILQ_ENTRY(AddressSpace) address_spaces_link; - /* Bounce buffer to use for this address space. */ - BounceBuffer bounce; + /* Maximum DMA bounce buffer size used for indirect memory map requests */ + size_t max_bounce_buffer_size; + /* Total size of bounce buffers currently allocated, atomically accessed */ + size_t bounce_buffer_size; /* List of callbacks to invoke when buffers free up */ QemuMutex map_client_list_lock; QLIST_HEAD(, AddressSpaceMapClient) map_client_list; diff --git a/include/hw/pci/pci_device.h b/include/hw/pci/pci_device.h index d3dd0f64b2..253b48a688 100644 --- a/include/hw/pci/pci_device.h +++ b/include/hw/pci/pci_device.h @@ -160,6 +160,9 @@ struct PCIDevice { /* ID of standby device in net_failover pair */ char *failover_pair_id; uint32_t acpi_index; + + /* Maximum DMA bounce buffer size used for indirect memory map requests */ + uint32_t max_bounce_buffer_size; }; static inline int pci_intx(PCIDevice *pci_dev) diff --git a/system/memory.c b/system/memory.c index 026e47dcb8..1ae03074f3 100644 --- a/system/memory.c +++ b/system/memory.c @@ -3117,7 +3117,8 @@ void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name) as->ioeventfds = NULL; QTAILQ_INIT(&as->listeners); QTAILQ_INSERT_TAIL(&address_spaces, as, address_spaces_link); - as->bounce.in_use = false; + as->max_bounce_buffer_size = DEFAULT_MAX_BOUNCE_BUFFER_SIZE; + as->bounce_buffer_size = 0; qemu_mutex_init(&as->map_client_list_lock); QLIST_INIT(&as->map_client_list); as->name = g_strdup(name ? name : "anonymous"); @@ -3127,7 +3128,7 @@ void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name) static void do_address_space_destroy(AddressSpace *as) { - assert(!qatomic_read(&as->bounce.in_use)); + assert(qatomic_read(&as->bounce_buffer_size) == 0); assert(QLIST_EMPTY(&as->map_client_list)); qemu_mutex_destroy(&as->map_client_list_lock); diff --git a/system/physmem.c b/system/physmem.c index 4491a7dbd1..2c8b83f811 100644 --- a/system/physmem.c +++ b/system/physmem.c @@ -3021,6 +3021,20 @@ void cpu_flush_icache_range(hwaddr start, hwaddr len) NULL, len, FLUSH_CACHE); } +/* + * A magic value stored in the first 8 bytes of the bounce buffer struct. Used + * to detect illegal pointers passed to address_space_unmap. + */ +#define BOUNCE_BUFFER_MAGIC 0xb4017ceb4ffe12ed + +typedef struct { + uint64_t magic; + MemoryRegion *mr; + hwaddr addr; + size_t len; + uint8_t buffer[]; +} BounceBuffer; + static void address_space_unregister_map_client_do(AddressSpaceMapClient *client) { @@ -3046,9 +3060,9 @@ void address_space_register_map_client(AddressSpace *as, QEMUBH *bh) qemu_mutex_lock(&as->map_client_list_lock); client->bh = bh; QLIST_INSERT_HEAD(&as->map_client_list, client, link); - /* Write map_client_list before reading in_use. */ + /* Write map_client_list before reading bounce_buffer_size. 
*/ smp_mb(); - if (!qatomic_read(&as->bounce.in_use)) { + if (qatomic_read(&as->bounce_buffer_size) < as->max_bounce_buffer_size) { address_space_notify_map_clients_locked(as); } qemu_mutex_unlock(&as->map_client_list_lock); @@ -3178,28 +3192,40 @@ void *address_space_map(AddressSpace *as, mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs); if (!memory_access_is_direct(mr, is_write)) { - if (qatomic_xchg(&as->bounce.in_use, true)) { + size_t used = qatomic_read(&as->bounce_buffer_size); + for (;;) { + hwaddr alloc = MIN(as->max_bounce_buffer_size - used, l); + size_t new_size = used + alloc; + size_t actual = + qatomic_cmpxchg(&as->bounce_buffer_size, used, new_size); + if (actual == used) { + l = alloc; + break; + } + used = actual; + } + + if (l == 0) { *plen = 0; return NULL; } - /* Avoid unbounded allocations */ - l = MIN(l, TARGET_PAGE_SIZE); - as->bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l); - as->bounce.addr = addr; - as->bounce.len = l; + BounceBuffer *bounce = g_malloc0(l + sizeof(BounceBuffer)); + bounce->magic = BOUNCE_BUFFER_MAGIC; memory_region_ref(mr); - as->bounce.mr = mr; + bounce->mr = mr; + bounce->addr = addr; + bounce->len = l; + if (!is_write) { flatview_read(fv, addr, MEMTXATTRS_UNSPECIFIED, - as->bounce.buffer, l); + bounce->buffer, l); } *plen = l; - return as->bounce.buffer; + return bounce->buffer; } - memory_region_ref(mr); *plen = flatview_extend_translation(fv, addr, len, mr, xlat, l, is_write, attrs); @@ -3214,12 +3240,11 @@ void *address_space_map(AddressSpace *as, void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len, bool is_write, hwaddr access_len) { - if (buffer != as->bounce.buffer) { - MemoryRegion *mr; - ram_addr_t addr1; + MemoryRegion *mr; + ram_addr_t addr1; - mr = memory_region_from_host(buffer, &addr1); - assert(mr != NULL); + mr = memory_region_from_host(buffer, &addr1); + if (mr != NULL) { if (is_write) { invalidate_and_set_dirty(mr, addr1, access_len); } @@ -3229,15 +3254,22 @@ void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len, memory_region_unref(mr); return; } + + + BounceBuffer *bounce = container_of(buffer, BounceBuffer, buffer); + assert(bounce->magic == BOUNCE_BUFFER_MAGIC); + if (is_write) { - address_space_write(as, as->bounce.addr, MEMTXATTRS_UNSPECIFIED, - as->bounce.buffer, access_len); - } - qemu_vfree(as->bounce.buffer); - as->bounce.buffer = NULL; - memory_region_unref(as->bounce.mr); - /* Clear in_use before reading map_client_list. */ - qatomic_set_mb(&as->bounce.in_use, false); + address_space_write(as, bounce->addr, MEMTXATTRS_UNSPECIFIED, + bounce->buffer, access_len); + } + + qatomic_sub(&as->bounce_buffer_size, bounce->len); + bounce->magic = ~BOUNCE_BUFFER_MAGIC; + memory_region_unref(bounce->mr); + g_free(bounce); + /* Write bounce_buffer_size before reading map_client_list. */ + smp_mb(); address_space_notify_map_clients(as); } -- Gitee From 234034ba7e8ab516f12cb199fc45cfe7229eb281 Mon Sep 17 00:00:00 2001 From: Mattias Nissler Date: Mon, 16 Sep 2024 10:57:08 -0700 Subject: [PATCH 428/939] mac_dbdma: Remove leftover `dma_memory_unmap` calls(CVE-2024-8612) cherry-pick from 2d0a071e625d7234e8c5623b7e7bf445e1bef72c These were passing a NULL buffer pointer unconditionally, which happens to behave in a mostly benign way (except for the chance of an excess memory region unref and a bounce buffer leak). Per the function comment, this was never meant to be accepted though, and triggers an assertion with the "softmmu: Support concurrent bounce buffers" change. 
Given that the code in question never sets up any mappings, just remove the unnecessary dma_memory_unmap calls along with the DBDMA_io struct fields that are now entirely unused. Signed-off-by: Mattias Nissler Message-Id: <20240916175708.1829059-1-mnissler@rivosinc.com> Fixes: be1e343995 ("macio: switch over to new byte-aligned DMA helpers") Reviewed-by: Mark Cave-Ayland Tested-by: Mark Cave-Ayland Signed-off-by: Mark Cave-Ayland --- hw/ide/macio.c | 6 ------ include/hw/ppc/mac_dbdma.h | 4 ---- 2 files changed, 10 deletions(-) diff --git a/hw/ide/macio.c b/hw/ide/macio.c index dca1cc9efc..3d895c07f4 100644 --- a/hw/ide/macio.c +++ b/hw/ide/macio.c @@ -119,9 +119,6 @@ static void pmac_ide_atapi_transfer_cb(void *opaque, int ret) return; done: - dma_memory_unmap(&address_space_memory, io->dma_mem, io->dma_len, - io->dir, io->dma_len); - if (ret < 0) { block_acct_failed(blk_get_stats(s->blk), &s->acct); } else { @@ -202,9 +199,6 @@ static void pmac_ide_transfer_cb(void *opaque, int ret) return; done: - dma_memory_unmap(&address_space_memory, io->dma_mem, io->dma_len, - io->dir, io->dma_len); - if (s->dma_cmd == IDE_DMA_READ || s->dma_cmd == IDE_DMA_WRITE) { if (ret < 0) { block_acct_failed(blk_get_stats(s->blk), &s->acct); diff --git a/include/hw/ppc/mac_dbdma.h b/include/hw/ppc/mac_dbdma.h index 4a3f644516..c774f6bf84 100644 --- a/include/hw/ppc/mac_dbdma.h +++ b/include/hw/ppc/mac_dbdma.h @@ -44,10 +44,6 @@ struct DBDMA_io { DBDMA_end dma_end; /* DMA is in progress, don't start another one */ bool processing; - /* DMA request */ - void *dma_mem; - dma_addr_t dma_len; - DMADirection dir; }; /* -- Gitee From a0c5ce95e94a4621b12262423bfa021accb07625 Mon Sep 17 00:00:00 2001 From: dinglimin Date: Mon, 16 Sep 2024 17:22:02 +0800 Subject: [PATCH 429/939] =?UTF-8?q?crypto:=20avoid=20leak=20of=20ctx=20whe?= =?UTF-8?q?n=20bad=20cipher=20mode=20is=20given=20Fixes:=20Coverity=20CID?= =?UTF-8?q?=201546884=20cherry=20picked=20from=20586ac2c67d707c2588766c519?= =?UTF-8?q?5d94fa553cc25af=20Reviewed-by:=20Peter=20Maydell=20=20Reviewed-by:=20Philippe=20Mathieu-Daud=C3=A9?= =?UTF-8?q?=20=20Signed-off-by:=20Daniel=20P.=20Berrang?= =?UTF-8?q?=C3=A9=20=20Signed-off-by:=20dinglimin=20?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crypto/cipher-nettle.c.inc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/crypto/cipher-nettle.c.inc b/crypto/cipher-nettle.c.inc index 42b39e18a2..766de036ba 100644 --- a/crypto/cipher-nettle.c.inc +++ b/crypto/cipher-nettle.c.inc @@ -734,16 +734,19 @@ static QCryptoCipher *qcrypto_cipher_ctx_new(QCryptoCipherAlgorithm alg, #ifdef CONFIG_CRYPTO_SM4 case QCRYPTO_CIPHER_ALG_SM4: { - QCryptoNettleSm4 *ctx = g_new0(QCryptoNettleSm4, 1); + QCryptoNettleSm4 *ctx; + const QCryptoCipherDriver *drv; switch (mode) { case QCRYPTO_CIPHER_MODE_ECB: - ctx->base.driver = &qcrypto_nettle_sm4_driver_ecb; + drv = &qcrypto_nettle_sm4_driver_ecb; break; default: goto bad_cipher_mode; } + ctx = g_new0(QCryptoNettleSm4, 1); + ctx->base.driver = drv; sm4_set_encrypt_key(&ctx->key[0], key); sm4_set_decrypt_key(&ctx->key[1], key); -- Gitee From 450b67a5dd3954db8441a1ad65a5e4594ba2e405 Mon Sep 17 00:00:00 2001 From: dinglimin Date: Mon, 16 Sep 2024 17:34:34 +0800 Subject: [PATCH 430/939] hw/ufs: add basic info of query response upiu cherry picked from de2cc4078240f8b745a7caeed461b02f2577e2d2 Modify to fill the opcode, idn, index, selector information of all Query Response UPIU. 
because attr and flag operation of query response upiu need these information too. Signed-off-by: KyoungrulKim Reviewed-by: Minwoo Im Reviewed-by: Jeuk Kim Signed-off-by: Jeuk Kim Signed-off-by: dinglimin --- hw/ufs/ufs.c | 13 +++++++++---- hw/ufs/ufs.h | 1 + 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/hw/ufs/ufs.c b/hw/ufs/ufs.c index bac78a32bb..068895b27b 100644 --- a/hw/ufs/ufs.c +++ b/hw/ufs/ufs.c @@ -455,6 +455,14 @@ void ufs_build_upiu_header(UfsRequest *req, uint8_t trans_type, uint8_t flags, req->rsp_upiu.header.data_segment_length = cpu_to_be16(data_segment_length); } +void ufs_build_query_response(UfsRequest *req) +{ + req->rsp_upiu.qr.opcode = req->req_upiu.qr.opcode; + req->rsp_upiu.qr.idn = req->req_upiu.qr.idn; + req->rsp_upiu.qr.index = req->req_upiu.qr.index; + req->rsp_upiu.qr.selector = req->req_upiu.qr.selector; +} + static UfsReqResult ufs_exec_scsi_cmd(UfsRequest *req) { UfsHc *u = req->hc; @@ -931,10 +939,6 @@ static QueryRespCode ufs_read_desc(UfsRequest *req) if (length > req->rsp_upiu.qr.data[0]) { length = req->rsp_upiu.qr.data[0]; } - req->rsp_upiu.qr.opcode = req->req_upiu.qr.opcode; - req->rsp_upiu.qr.idn = req->req_upiu.qr.idn; - req->rsp_upiu.qr.index = req->req_upiu.qr.index; - req->rsp_upiu.qr.selector = req->req_upiu.qr.selector; req->rsp_upiu.qr.length = cpu_to_be16(length); return status; @@ -1015,6 +1019,7 @@ static UfsReqResult ufs_exec_query_cmd(UfsRequest *req) data_segment_length = be16_to_cpu(req->rsp_upiu.qr.length); ufs_build_upiu_header(req, UFS_UPIU_TRANSACTION_QUERY_RSP, 0, status, 0, data_segment_length); + ufs_build_query_response(req); if (status != UFS_QUERY_RESULT_SUCCESS) { return UFS_REQUEST_FAIL; diff --git a/hw/ufs/ufs.h b/hw/ufs/ufs.h index 8fda94f4ef..8a74b4c2ab 100644 --- a/hw/ufs/ufs.h +++ b/hw/ufs/ufs.h @@ -132,6 +132,7 @@ static inline bool is_wlun(uint8_t lun) void ufs_build_upiu_header(UfsRequest *req, uint8_t trans_type, uint8_t flags, uint8_t response, uint8_t scsi_status, uint16_t data_segment_length); +void ufs_build_query_response(UfsRequest *req); void ufs_complete_req(UfsRequest *req, UfsReqResult req_result); void ufs_init_wlu(UfsLu *wlu, uint8_t wlun); #endif /* HW_UFS_UFS_H */ -- Gitee From fc5b9cb39257527568911f65c64d80e23f9f6ae3 Mon Sep 17 00:00:00 2001 From: qihao Date: Wed, 18 Sep 2024 10:32:42 -0400 Subject: [PATCH 431/939] hw/block: fix uint32 overflow cheery-pick from 89cd6254b80784a1b3f574407192493ef92fe65f The product bs->bl.zone_size * (bs->bl.nr_zones - 1) may overflow uint32. Found by Linux Verification Center (linuxtesting.org) with SVACE. Signed-off-by: Dmitry Frolov Message-id: 20240917080356.270576-2-frolov@swemel.ru Signed-off-by: Stefan Hajnoczi Signed-off-by: qihao_yewu --- hw/block/virtio-blk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c index 2eb096a6dc..beedc0cf5f 100644 --- a/hw/block/virtio-blk.c +++ b/hw/block/virtio-blk.c @@ -860,7 +860,7 @@ static int virtio_blk_handle_zone_mgmt(VirtIOBlockReq *req, BlockZoneOp op) } else { if (bs->bl.zone_size > capacity - offset) { /* The zoned device allows the last smaller zone. 
*/ - len = capacity - bs->bl.zone_size * (bs->bl.nr_zones - 1); + len = capacity - bs->bl.zone_size * (bs->bl.nr_zones - 1ull); } else { len = bs->bl.zone_size; } -- Gitee From c357946fa7c1d45a09b40214b5113f689bf7bbd0 Mon Sep 17 00:00:00 2001 From: hanliyang Date: Fri, 1 Mar 2024 14:12:44 +0800 Subject: [PATCH 432/939] target/i386: sev: Fix incompatibility between SEV and CSV on the GET_ID API If the length of GET_ID request is too small, Hygon CSV will return SEV_RET_INVALID_PARAM. This return code doesn't comply with SEV API Spec. Hygon will consider to fix the compitibility issue of return value of the GET_ID API, so also check whether the return value is SEV_RET_INVALID_LEN on Hygon CPUs. Signed-off-by: hanliyang --- target/i386/sev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/target/i386/sev.c b/target/i386/sev.c index 2c6aecd1a3..04888bc3a8 100644 --- a/target/i386/sev.c +++ b/target/i386/sev.c @@ -589,7 +589,8 @@ static int sev_get_cpu0_id(int fd, guchar **id, size_t *id_len, Error **errp) /* query the ID length */ r = sev_platform_ioctl(fd, SEV_GET_ID2, &get_id2, &err); - if (r < 0 && err != SEV_RET_INVALID_LEN) { + if (r < 0 && err != SEV_RET_INVALID_LEN && + !(is_hygon_cpu() && err == SEV_RET_INVALID_PARAM)) { error_setg(errp, "SEV: Failed to get ID ret=%d fw_err=%d (%s)", r, err, fw_error_to_str(err)); return 1; -- Gitee From 8f4f8a2071e69130f0b9327ce8f9b92a5ae42c8d Mon Sep 17 00:00:00 2001 From: appleLin Date: Wed, 3 Aug 2022 21:02:41 +0800 Subject: [PATCH 433/939] target/i386: sev: Add support for reuse ASID for different CSV guests In you want to reuse one ASID for many CSV guests, you should provide a label (i.e. userid) and the length of the label when launch CSV guest. The CSV guests which were provided the same userid will share the same ASID. 
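Concretely, the label travels from the sev-guest object property into the SEV INIT ioctl as a struct kvm_csv_init; a condensed sketch of the flow added in sev_kvm_init() (simplified from the diff below, error handling omitted):

    /* only taken when the guest policy requests ASID reuse on Hygon CSV */
    char *user_id = object_property_get_str(OBJECT(sev), "user-id", NULL);

    if (user_id && strlen(user_id)) {
        struct kvm_csv_init init = {
            .userid_addr = (__u64)user_id, /* guests with the same label share an ASID */
            .len = strlen(user_id),
        };
        ret = sev_ioctl(sev->sev_fd, cmd, &init, &fw_error);
    } else {
        ret = sev_ioctl(sev->sev_fd, cmd, NULL, &fw_error);
    }
    g_free(user_id);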
Signed-off-by: hanliyang --- linux-headers/linux/kvm.h | 5 +++++ qapi/qom.json | 6 ++++- qemu-options.hx | 5 ++++- target/i386/csv.h | 2 ++ target/i386/sev.c | 46 ++++++++++++++++++++++++++++++++++++++- 5 files changed, 61 insertions(+), 3 deletions(-) diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h index eb30402c2d..8dc00808ec 100644 --- a/linux-headers/linux/kvm.h +++ b/linux-headers/linux/kvm.h @@ -2103,6 +2103,11 @@ struct kvm_csv_command_batch { __u64 csv_batch_list_uaddr; }; +struct kvm_csv_init { + __u64 userid_addr; + __u32 len; +}; + #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) #define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) #define KVM_DEV_ASSIGN_MASK_INTX (1 << 2) diff --git a/qapi/qom.json b/qapi/qom.json index 213edd8db2..8c7461a113 100644 --- a/qapi/qom.json +++ b/qapi/qom.json @@ -866,6 +866,9 @@ # designated guest firmware page for measured boot with -kernel # (default: false) (since 6.2) # +# @user-id: the user id of the guest owner, only support on Hygon CPUs +# (since 8.2) +# # Since: 2.12 ## { 'struct': 'SevGuestProperties', @@ -876,7 +879,8 @@ '*handle': 'uint32', '*cbitpos': 'uint32', 'reduced-phys-bits': 'uint32', - '*kernel-hashes': 'bool' } } + '*kernel-hashes': 'bool', + '*user-id': 'str' } } ## # @ThreadContextProperties: diff --git a/qemu-options.hx b/qemu-options.hx index 42fd09e4de..9829b1020a 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -5637,7 +5637,7 @@ SRST -object secret,id=sec0,keyid=secmaster0,format=base64,\\ data=$SECRET,iv=$(dh_cert_file = g_strdup(value); } +static char * +sev_guest_get_user_id(Object *obj, Error **errp) +{ + SevGuestState *s = SEV_GUEST(obj); + + return g_strdup(s->user_id); +} + +static void +sev_guest_set_user_id(Object *obj, const char *value, Error **errp) +{ + SevGuestState *s = SEV_GUEST(obj); + + s->user_id = g_strdup(value); +} + static char * sev_guest_get_sev_device(Object *obj, Error **errp) { @@ -426,6 +443,11 @@ sev_guest_class_init(ObjectClass *oc, void *data) sev_guest_set_kernel_hashes); object_class_property_set_description(oc, "kernel-hashes", "add kernel hashes to guest firmware for measured Linux boot"); + object_class_property_add_str(oc, "user-id", + sev_guest_get_user_id, + sev_guest_set_user_id); + object_class_property_set_description(oc, "user-id", + "user id of the guest owner"); } static void @@ -1174,7 +1196,29 @@ int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp) } trace_kvm_sev_init(); - ret = sev_ioctl(sev->sev_fd, cmd, NULL, &fw_error); + + /* Only support reuse asid for CSV/CSV2 guest */ + if (is_hygon_cpu() && + (sev_guest->policy & GUEST_POLICY_REUSE_ASID)) { + char *user_id = NULL; + struct kvm_csv_init *init_cmd_buf = NULL; + + user_id = object_property_get_str(OBJECT(sev), "user-id", NULL); + if (user_id && strlen(user_id)) { + init_cmd_buf = g_new0(struct kvm_csv_init, 1); + init_cmd_buf->len = strlen(user_id); + init_cmd_buf->userid_addr = (__u64)user_id; + } + ret = sev_ioctl(sev->sev_fd, cmd, init_cmd_buf, &fw_error); + + if (user_id) { + g_free(user_id); + g_free(init_cmd_buf); + } + } else { + ret = sev_ioctl(sev->sev_fd, cmd, NULL, &fw_error); + } + if (ret) { error_setg(errp, "%s: failed to initialize ret=%d fw_error=%d '%s'", __func__, ret, fw_error, fw_error_to_str(fw_error)); -- Gitee From 1f0c212191d0f63744ef61e0725ab4c859b1d189 Mon Sep 17 00:00:00 2001 From: yangxiangkai Date: Mon, 23 Sep 2024 19:23:37 +0800 Subject: [PATCH 434/939] Added CoDA feature support in the context of CVM. When virtcca cvm is enabled, the iommu is tagged as secure. 
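Two checks implement the tagging: the container must come up with the secure TYPE1v2 IOMMU when a virtcca CVM is running, and a device flagged secure must not be handed to a normal VM. Condensed from the container.c hunks below:

    /* vfio_get_iommu_type(): CVM guests get the secure IOMMU, no fallback */
    if (virtcca_cvm_enabled()) {
        if (ioctl(container->fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_S_IOMMU)) {
            return VFIO_TYPE1v2_S_IOMMU;
        }
        return -errno;
    }

    /* vfio_get_device(): a secure (CoDA) device is refused for normal VMs */
    if (!virtcca_cvm_enabled() && (info->flags & VFIO_DEVICE_FLAGS_SECURE)) {
        error_setg(errp, "Normal vm cannot use confidential device.");
        return -1;
    }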
--- hw/vfio/container.c | 15 +++++++++++++++ hw/virtio/virtio-bus.c | 7 +++++++ linux-headers/linux/vfio.h | 2 ++ 3 files changed, 24 insertions(+) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index d8b9117f4f..422235a221 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -33,6 +33,7 @@ #include "trace.h" #include "qapi/error.h" #include "migration/migration.h" +#include "sysemu/kvm.h" VFIOGroupList vfio_group_list = QLIST_HEAD_INITIALIZER(vfio_group_list); @@ -399,6 +400,14 @@ static int vfio_get_iommu_type(VFIOContainer *container, VFIO_SPAPR_TCE_v2_IOMMU, VFIO_SPAPR_TCE_IOMMU }; int i; + if (virtcca_cvm_enabled()) { + if (ioctl(container->fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_S_IOMMU)) { + return VFIO_TYPE1v2_S_IOMMU; + } else { + return -errno; + } + } + for (i = 0; i < ARRAY_SIZE(iommu_types); i++) { if (ioctl(container->fd, VFIO_CHECK_EXTENSION, iommu_types[i])) { return iommu_types[i]; @@ -625,6 +634,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, switch (container->iommu_type) { case VFIO_TYPE1v2_IOMMU: case VFIO_TYPE1_IOMMU: + case VFIO_TYPE1v2_S_IOMMU: { struct vfio_iommu_type1_info *info; @@ -857,6 +867,11 @@ static int vfio_get_device(VFIOGroup *group, const char *name, return -1; } + if (!virtcca_cvm_enabled() && (info->flags & VFIO_DEVICE_FLAGS_SECURE)) { + error_setg(errp, "Normal vm cannot use confidential device."); + return -1; + } + /* * Set discarding of RAM as not broken for this group if the driver knows * the device operates compatibly with discarding. Setting must be diff --git a/hw/virtio/virtio-bus.c b/hw/virtio/virtio-bus.c index 4f16e7ef77..749df6478e 100644 --- a/hw/virtio/virtio-bus.c +++ b/hw/virtio/virtio-bus.c @@ -30,6 +30,7 @@ #include "hw/virtio/virtio-bus.h" #include "hw/virtio/virtio.h" #include "exec/address-spaces.h" +#include "sysemu/kvm.h" /* #define DEBUG_VIRTIO_BUS */ @@ -71,6 +72,12 @@ void virtio_bus_device_plugged(VirtIODevice *vdev, Error **errp) return; } + if (virtcca_cvm_enabled() && (strcmp(vdev->name, "vhost-user-fs") == 0)) { + /* VIRTIO_F_IOMMU_PLATFORM should be enabled for vhost-user-fs using swiotlb */ + error_setg(errp, "iommu_platform is not supported by this device"); + return; + } + if (klass->device_plugged != NULL) { klass->device_plugged(qbus->parent, &local_err); } diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h index 956154e509..c27a43d74b 100644 --- a/linux-headers/linux/vfio.h +++ b/linux-headers/linux/vfio.h @@ -25,6 +25,7 @@ #define VFIO_TYPE1_IOMMU 1 #define VFIO_SPAPR_TCE_IOMMU 2 #define VFIO_TYPE1v2_IOMMU 3 +#define VFIO_TYPE1v2_S_IOMMU 12 /* * IOMMU enforces DMA cache coherence (ex. PCIe NoSnoop stripping). This * capability is subject to change as groups are added or removed. 
@@ -224,6 +225,7 @@ struct vfio_device_info { #define VFIO_DEVICE_FLAGS_FSL_MC (1 << 6) /* vfio-fsl-mc device */ #define VFIO_DEVICE_FLAGS_CAPS (1 << 7) /* Info supports caps */ #define VFIO_DEVICE_FLAGS_CDX (1 << 8) /* vfio-cdx device */ +#define VFIO_DEVICE_FLAGS_SECURE (1 << 9) /* secure pci device */ __u32 num_regions; /* Max region index + 1 */ __u32 num_irqs; /* Max IRQ index + 1 */ __u32 cap_offset; /* Offset within info struct of first cap */ -- Gitee From 360bd43ff3c4e4938ee8af1a5ccf981152f7ca95 Mon Sep 17 00:00:00 2001 From: yangdepei Date: Mon, 26 Aug 2024 15:40:25 +0800 Subject: [PATCH 435/939] hw/vfio/hct: fix ccp_index error caused by uninitialized buf Signed-off-by: yangdepei --- hw/vfio/hct.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/vfio/hct.c b/hw/vfio/hct.c index 790bb78439..9374e95e85 100644 --- a/hw/vfio/hct.c +++ b/hw/vfio/hct.c @@ -235,8 +235,8 @@ static int hct_check_duplicated_index(int index) static int hct_get_ccp_index(HCTDevState *state) { - char path[PATH_MAX]; - char buf[CCP_INDEX_BYTES]; + char path[PATH_MAX] = {0}; + char buf[CCP_INDEX_BYTES] = {0}; int fd; int ret; int ccp_index; -- Gitee From 32855e315c3050f09388f1335c0869bba065fbae Mon Sep 17 00:00:00 2001 From: yangdepei Date: Fri, 27 Sep 2024 17:08:08 +0800 Subject: [PATCH 436/939] hw/vfio/hct: qemu startup terminate once error happened in hct Signed-off-by: yangdepei --- hw/vfio/hct.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/hw/vfio/hct.c b/hw/vfio/hct.c index 9374e95e85..7fd3977182 100644 --- a/hw/vfio/hct.c +++ b/hw/vfio/hct.c @@ -136,7 +136,9 @@ static const MemoryRegionOps hct_mmio_ops = { static void vfio_hct_detach_device(HCTDevState *state) { vfio_detach_device(&state->vdev); - g_free(state->vdev.name); + + if (state->vdev.name) + g_free(state->vdev.name); } static void vfio_hct_exit(PCIDevice *dev) @@ -413,7 +415,6 @@ static int hct_data_init(HCTDevState *state) int ret; if (hct_data.init == 0) { - hct_data.hct_fd = qemu_open_old(HCT_SHARE_DEV, O_RDWR); if (hct_data.hct_fd < 0) { error_report("fail to open %s, errno %d.", HCT_SHARE_DEV, errno); @@ -465,7 +466,6 @@ static void vfio_hct_realize(PCIDevice *pci_dev, Error **errp) { int ret; char *mdevid; - Error *err = NULL; HCTDevState *state = PCI_HCT_DEV(pci_dev); /* parsing mdev device name from startup scripts */ @@ -475,14 +475,18 @@ static void vfio_hct_realize(PCIDevice *pci_dev, Error **errp) ret = hct_data_init(state); if (ret < 0) { g_free(state->vdev.name); + state->vdev.name = NULL; + error_setg(errp, "hct data init failed"); goto out; } ret = vfio_attach_device(state->vdev.name, &state->vdev, - pci_device_iommu_address_space(pci_dev), &err); + pci_device_iommu_address_space(pci_dev), errp); if (ret) { - error_report("attach device failed, name = %s", state->vdev.name); + g_free(state->vdev.name); + state->vdev.name = NULL; + error_setg(errp, "attach device failed, name = %s", state->vdev.name); goto data_uninit_out; } @@ -491,7 +495,12 @@ static void vfio_hct_realize(PCIDevice *pci_dev, Error **errp) ret = vfio_hct_region_mmap(state); if (ret < 0) + { + g_free(state->vdev.name); + state->vdev.name = NULL; + error_setg(errp, "region mmap failed, name = %s", state->vdev.name); goto detach_device_out; + } return; -- Gitee From 67ce79a910ab02d8c1e08a9ebfa6c5aae2e9d5af Mon Sep 17 00:00:00 2001 From: qihao_ss Date: Sun, 29 Sep 2024 06:44:29 -0400 Subject: [PATCH 437/939] target/ppc: Fix lxvx/stxvx facility check MIME-Version: 1.0 Content-Type: text/plain; 
charset=UTF-8 Content-Transfer-Encoding: 8bit cherry-pick from 8bded2e73e80823a67f730140788a3c5e60bf4b5 The XT check for the lxvx/stxvx instructions is currently inverted. This was introduced during the move to decodetree. >From the ISA: Chapter 7. Vector-Scalar Extension Facility Load VSX Vector Indexed X-form lxvx XT,RA,RB if TX=0 & MSR.VSX=0 then VSX_Unavailable() if TX=1 & MSR.VEC=0 then Vector_Unavailable() ... Let XT be the value 32×TX + T. The code currently does the opposite: if (paired || a->rt >= 32) { REQUIRE_VSX(ctx); } else { REQUIRE_VECTOR(ctx); } This was already fixed for lxv/stxv at commit "2cc0e449d1 (target/ppc: Fix lxv/stxv MSR facility check)", but the indexed forms were missed. Cc: qemu-stable@nongnu.org Fixes: 70426b5bb7 ("target/ppc: moved stxvx and lxvx from legacy to decodtree") Signed-off-by: Fabiano Rosas Reviewed-by: Claudio Fontana Acked-by: Ilya Leoshkevich Reviewed-by: Fabiano Rosas Message-ID: <20240911141651.6914-1-farosas@suse.de> Signed-off-by: Richard Henderson Signed-off-by: qihao_yewu --- target/ppc/translate/vsx-impl.c.inc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target/ppc/translate/vsx-impl.c.inc b/target/ppc/translate/vsx-impl.c.inc index 6db87ab336..a2020da9fd 100644 --- a/target/ppc/translate/vsx-impl.c.inc +++ b/target/ppc/translate/vsx-impl.c.inc @@ -2292,7 +2292,7 @@ static bool do_lstxv_PLS_D(DisasContext *ctx, arg_PLS_D *a, static bool do_lstxv_X(DisasContext *ctx, arg_X *a, bool store, bool paired) { - if (paired || a->rt >= 32) { + if (paired || a->rt < 32) { REQUIRE_VSX(ctx); } else { REQUIRE_VECTOR(ctx); -- Gitee From a8b171a0e5be721ee173a533f98594f62b0f0250 Mon Sep 17 00:00:00 2001 From: qihao_yewu Date: Sun, 29 Sep 2024 07:07:36 -0400 Subject: [PATCH 438/939] target/ppc: Fix lxv/stxv MSR facility check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cherry-pick from 2cc0e449d17310877fb28a942d4627ad22bb68ea The move to decodetree flipped the inequality test for the VEC / VSX MSR facility check. This caused application crashes under Linux, where these facility unavailable interrupts are used for lazy-switching of VEC/VSX register sets. Getting the incorrect interrupt would result in wrong registers being loaded, potentially overwriting live values and/or exposing stale ones.
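A minimal sketch of the facility rule quoted above, using an invented helper name purely for illustration (it is not part of either patch): XT values 0..31 correspond to TX=0 and are gated by MSR.VSX, while XT values 32..63 correspond to TX=1 and are gated by MSR.VEC, which is why the corrected check requires VSX only when a->rt < 32.

    #include <stdbool.h>

    /* Illustrative only: which MSR facility gates a VSX load/store,
     * per the ISA rule XT = 32*TX + T. */
    static bool xt_gated_by_vsx(int xt)
    {
        return xt < 32;   /* TX=0 -> MSR.VSX; TX=1 (xt >= 32) -> MSR.VEC */
    }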
Cc: qemu-stable@nongnu.org Reported-by: Joel Stanley Fixes: 70426b5bb738 ("target/ppc: moved stxvx and lxvx from legacy to decodtree") Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1769 Reviewed-by: Harsh Prateek Bora Tested-by: Harsh Prateek Bora Reviewed-by: Cédric Le Goater Tested-by: Cédric Le Goater Signed-off-by: Nicholas Piggin Signed-off-by: qihao_yewu --- target/ppc/translate/vsx-impl.c.inc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target/ppc/translate/vsx-impl.c.inc b/target/ppc/translate/vsx-impl.c.inc index 6db87ab336..0266f09119 100644 --- a/target/ppc/translate/vsx-impl.c.inc +++ b/target/ppc/translate/vsx-impl.c.inc @@ -2268,7 +2268,7 @@ static bool do_lstxv(DisasContext *ctx, int ra, TCGv displ, static bool do_lstxv_D(DisasContext *ctx, arg_D *a, bool store, bool paired) { - if (paired || a->rt >= 32) { + if (paired || a->rt < 32) { REQUIRE_VSX(ctx); } else { REQUIRE_VECTOR(ctx); -- Gitee From 5651eb5cfd3a49506be4be97f8def3fed713c641 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 30 Apr 2024 13:53:33 +0300 Subject: [PATCH 439/939] virtio-net: drop too short packets early Reproducer from https://gitlab.com/qemu-project/qemu/-/issues/1451 creates small packet (1 segment, len = 10 == n->guest_hdr_len), then destroys queue. "if (n->host_hdr_len != n->guest_hdr_len)" is triggered, if body creates zero length/zero segment packet as there is nothing after guest header. qemu_sendv_packet_async() tries to send it. slirp discards it because it is smaller than Ethernet header, but returns 0 because tx hooks are supposed to return total length of data. 0 is propagated upwards and is interpreted as "packet has been sent" which is terrible because queue is being destroyed, nobody is waiting for TX to complete and assert it triggered. Fix is discard such empty packets instead of sending them. Length 1 packets will go via different codepath: virtqueue_push(q->tx_vq, elem, 0); virtio_notify(vdev, q->tx_vq); g_free(elem); and aren't problematic. 
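The arithmetic behind the report, as a stand-alone illustration (the numbers follow the reproducer described above; none of this is QEMU code): a single 10-byte segment that consists solely of the guest header leaves zero bytes and zero segments once that header is skipped for the host, and that empty packet is what used to reach the backend.

    #include <stdio.h>
    #include <sys/uio.h>

    int main(void)
    {
        char elem[10] = {0};                          /* element length == guest_hdr_len */
        struct iovec out_sg = { elem, sizeof(elem) };
        size_t guest_hdr_len = 10;

        size_t payload = out_sg.iov_len - guest_hdr_len;
        printf("payload bytes after skipping guest header: %zu\n", payload); /* 0 */
        return 0;                                     /* the fix drops such packets */
    }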
Signed-off-by: Alexey Dobriyan Signed-off-by: Jason Wang (cherry picked from commit 2c3e4e2de699cd4d9f6c71f30a22d8f125cd6164) Signed-off-by: zhujun2 --- hw/net/virtio-net.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index 432c433540..b17137a686 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -2732,18 +2732,14 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q) out_sg = elem->out_sg; if (out_num < 1) { virtio_error(vdev, "virtio-net header not in first element"); - virtqueue_detach_element(q->tx_vq, elem, 0); - g_free(elem); - return -EINVAL; + goto detach; } if (n->has_vnet_hdr) { if (iov_to_buf(out_sg, out_num, 0, &vhdr, n->guest_hdr_len) < n->guest_hdr_len) { virtio_error(vdev, "virtio-net header incorrect"); - virtqueue_detach_element(q->tx_vq, elem, 0); - g_free(elem); - return -EINVAL; + goto detach; } if (n->needs_vnet_hdr_swap) { virtio_net_hdr_swap(vdev, (void *) &vhdr); @@ -2774,6 +2770,11 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q) n->guest_hdr_len, -1); out_num = sg_num; out_sg = sg; + + if (out_num < 1) { + virtio_error(vdev, "virtio-net nothing to send"); + goto detach; + } } ret = qemu_sendv_packet_async(qemu_get_subqueue(n->nic, queue_index), @@ -2794,6 +2795,11 @@ drop: } } return num_packets; + +detach: + virtqueue_detach_element(q->tx_vq, elem, 0); + g_free(elem); + return -EINVAL; } static void virtio_net_tx_timer(void *opaque); -- Gitee From 6d0eefdf70135a01476b787df50f34da77ae5529 Mon Sep 17 00:00:00 2001 From: Mark Cave-Ayland Date: Thu, 6 Jun 2024 10:53:19 +0100 Subject: [PATCH 440/939] target/i386: fix size of EBP writeback in gen_enter() The calculation of FrameTemp is done using the size indicated by mo_pushpop() before being written back to EBP, but the final writeback to EBP is done using the size indicated by mo_stacksize(). In the case where mo_pushpop() is MO_32 and mo_stacksize() is MO_16 then the final writeback to EBP is done using MO_16 which can leave junk in the top 16-bits of EBP after executing ENTER. Change the writeback of EBP to use the same size indicated by mo_pushpop() to ensure that the full value is written back. Signed-off-by: Mark Cave-Ayland Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2198 Message-ID: <20240606095319.229650-5-mark.cave-ayland@ilande.co.uk> Cc: qemu-stable@nongnu.org Signed-off-by: Paolo Bonzini (cherry picked from commit 3973615e7fbaeef1deeaa067577e373781ced70a) Signed-off-by: zhujun2 --- target/i386/tcg/translate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index dc672d7995..19b8250452 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -2661,7 +2661,7 @@ static void gen_enter(DisasContext *s, int esp_addend, int level) } /* Copy the FrameTemp value to EBP. */ - gen_op_mov_reg_v(s, a_ot, R_EBP, s->T1); + gen_op_mov_reg_v(s, d_ot, R_EBP, s->T1); /* Compute the final value of ESP. */ tcg_gen_subi_tl(s->T1, s->T1, esp_addend + size * level); -- Gitee From 228e14db9a85e7e978c38b97ae622302a0d4f784 Mon Sep 17 00:00:00 2001 From: Dongwon Kim Date: Fri, 26 Apr 2024 15:50:59 -0700 Subject: [PATCH 441/939] ui/gtk: Draw guest frame at refresh cycle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Draw routine needs to be manually invoked in the next refresh if there is a scanout blob from the guest. 
This is to prevent a situation where there is a scheduled draw event but it won't happen bacause the window is currently in inactive state (minimized or tabified). If draw is not done for a long time, gl_block timeout and/or fence timeout (on the guest) will happen eventually. v2: Use gd_gl_area_draw(vc) in gtk-gl-area.c Suggested-by: Vivek Kasireddy Cc: Gerd Hoffmann Cc: Marc-André Lureau Cc: Daniel P. Berrangé Signed-off-by: Dongwon Kim Acked-by: Marc-André Lureau Message-Id: <20240426225059.3871283-1-dongwon.kim@intel.com> (cherry picked from commit 77bf310084dad38b3a2badf01766c659056f1cf2) Signed-off-by: zhujun2 --- ui/gtk-egl.c | 1 + ui/gtk-gl-area.c | 1 + 2 files changed, 2 insertions(+) diff --git a/ui/gtk-egl.c b/ui/gtk-egl.c index 3af5ac5bcf..75f6b9011a 100644 --- a/ui/gtk-egl.c +++ b/ui/gtk-egl.c @@ -150,6 +150,7 @@ void gd_egl_refresh(DisplayChangeListener *dcl) vc, vc->window ? vc->window : vc->gfx.drawing_area); if (vc->gfx.guest_fb.dmabuf && vc->gfx.guest_fb.dmabuf->draw_submitted) { + gd_egl_draw(vc); return; } diff --git a/ui/gtk-gl-area.c b/ui/gtk-gl-area.c index 52dcac161e..4fff957c3f 100644 --- a/ui/gtk-gl-area.c +++ b/ui/gtk-gl-area.c @@ -126,6 +126,7 @@ void gd_gl_area_refresh(DisplayChangeListener *dcl) gd_update_monitor_refresh_rate(vc, vc->window ? vc->window : vc->gfx.drawing_area); if (vc->gfx.guest_fb.dmabuf && vc->gfx.guest_fb.dmabuf->draw_submitted) { + gd_gl_area_draw(vc); return; } -- Gitee From 540314912566c91341226d9eb6df5b782f277813 Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Wed, 5 Jun 2024 15:14:41 +0200 Subject: [PATCH 442/939] stdvga: fix screen blanking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In case the display surface uses a shared buffer (i.e. uses vga vram directly instead of a shadow) go unshare the buffer before clearing it. This avoids vga memory corruption, which in turn fixes unblanking not working properly with X11. Cc: qemu-stable@nongnu.org Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2067 Signed-off-by: Gerd Hoffmann Reviewed-by: Marc-André Lureau Message-ID: <20240605131444.797896-2-kraxel@redhat.com> Signed-off-by: Philippe Mathieu-Daudé (cherry picked from commit b1cf266c82cb1211ee2785f1813a6a3f3e693390) Signed-off-by: zhujun2 --- hw/display/vga.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/hw/display/vga.c b/hw/display/vga.c index 37557c3442..cb6b6ee2ca 100644 --- a/hw/display/vga.c +++ b/hw/display/vga.c @@ -1748,6 +1748,13 @@ static void vga_draw_blank(VGACommonState *s, int full_update) if (s->last_scr_width <= 0 || s->last_scr_height <= 0) return; + if (is_buffer_shared(surface)) { + /* unshare buffer, otherwise the blanking corrupts vga vram */ + surface = qemu_create_displaysurface(s->last_scr_width, + s->last_scr_height); + dpy_gfx_replace_surface(s->con, surface); + } + w = s->last_scr_width * surface_bytes_per_pixel(surface); d = surface_data(surface); for(i = 0; i < s->last_scr_height; i++) { -- Gitee From 8ee63ce50289adb4ea346901366bd30aa23e412a Mon Sep 17 00:00:00 2001 From: "yang.zhang" Date: Tue, 9 Apr 2024 09:44:45 +0800 Subject: [PATCH 443/939] hw/intc/riscv_aplic: APLICs should add child earlier than realize Since only root APLICs can have hw IRQ lines, aplic->parent should be initialized first. 
Fixes: e8f79343cf ("hw/intc: Add RISC-V AIA APLIC device emulation") Reviewed-by: Daniel Henrique Barboza Signed-off-by: yang.zhang Cc: qemu-stable Message-ID: <20240409014445.278-1-gaoshanliukou@163.com> Signed-off-by: Alistair Francis (cherry picked from commit c76b121840c6ca79dc6305a5f4bcf17c72217d9c) Signed-off-by: zhujun2 --- hw/intc/riscv_aplic.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hw/intc/riscv_aplic.c b/hw/intc/riscv_aplic.c index c677b5cfbb..2fdf85444e 100644 --- a/hw/intc/riscv_aplic.c +++ b/hw/intc/riscv_aplic.c @@ -974,16 +974,16 @@ DeviceState *riscv_aplic_create(hwaddr addr, hwaddr size, qdev_prop_set_bit(dev, "msimode", msimode); qdev_prop_set_bit(dev, "mmode", mmode); + if (parent) { + riscv_aplic_add_child(parent, dev); + } + sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal); if (!is_kvm_aia(msimode)) { sysbus_mmio_map(SYS_BUS_DEVICE(dev), 0, addr); } - if (parent) { - riscv_aplic_add_child(parent, dev); - } - if (!msimode) { for (i = 0; i < num_harts; i++) { CPUState *cpu = cpu_by_arch_id(hartid_base + i); -- Gitee From 7d0006839846bef68fa3d96886b8e5d8f8ec52f1 Mon Sep 17 00:00:00 2001 From: Song Gao Date: Tue, 7 May 2024 10:22:39 +0800 Subject: [PATCH 444/939] hw/loongarch/virt: Fix memory leak MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The char pointer 'ramName' point to a block of memory, but never free it. Use 'g_autofree' to automatically free it. Resolves: Coverity CID 1544773 Fixes: 0cf1478d6 ("hw/loongarch: Add numa support") Signed-off-by: Song Gao Reviewed-by: Peter Maydell Reviewed-by: Philippe Mathieu-Daudé Message-ID: <20240507022239.3113987-1-gaosong@loongson.cn> Signed-off-by: Philippe Mathieu-Daudé (cherry picked from commit 54c52ec719fb8c83bbde54cb87b58688ab27c166) Signed-off-by: zhujun2 --- hw/loongarch/virt.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index 5d4fcb7a55..eca3b94581 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -925,7 +925,6 @@ static void loongarch_init(MachineState *machine) const CPUArchIdList *possible_cpus; MachineClass *mc = MACHINE_GET_CLASS(machine); CPUState *cpu; - char *ramName = NULL; struct loaderparams loaderparams = { }; if (!cpu_model) { @@ -985,7 +984,7 @@ static void loongarch_init(MachineState *machine) for (i = 1; i < nb_numa_nodes; i++) { MemoryRegion *nodemem = g_new(MemoryRegion, 1); - ramName = g_strdup_printf("loongarch.node%d.ram", i); + g_autofree char *ramName = g_strdup_printf("loongarch.node%d.ram", i); memory_region_init_alias(nodemem, NULL, ramName, machine->ram, offset, numa_info[i].node_mem); memory_region_add_subregion(address_space_mem, phyAddr, nodemem); -- Gitee From 6165cf85acd2600c8e0edb062d627e4cb42083af Mon Sep 17 00:00:00 2001 From: Mattias Nissler Date: Wed, 23 Aug 2023 02:29:30 -0700 Subject: [PATCH 445/939] hw/remote/vfio-user: Fix config space access byte order MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PCI config space is little-endian, so on a big-endian host we need to perform byte swaps for values as they are passed to and received from the generic PCI config space access machinery. 
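For readers unfamiliar with the helpers used in the fix, the following stand-alone stand-ins sketch what ldn_le_p()/stn_le_p() provide (the real QEMU implementations differ): an n-byte value is always assembled from, or scattered into, the buffer in little-endian order regardless of host byte order, which is exactly what a plain memcpy() fails to guarantee on a big-endian host.

    #include <stdint.h>
    #include <stddef.h>

    /* Sketch only: read/write an n-byte little-endian value (n <= 8). */
    static uint64_t ldn_le_sketch(const uint8_t *p, size_t n)
    {
        uint64_t v = 0;
        for (size_t i = 0; i < n; i++) {
            v |= (uint64_t)p[i] << (8 * i);
        }
        return v;
    }

    static void stn_le_sketch(uint8_t *p, size_t n, uint64_t v)
    {
        for (size_t i = 0; i < n; i++) {
            p[i] = v >> (8 * i);   /* low byte first */
        }
    }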
Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Stefan Hajnoczi Reviewed-by: Jagannathan Raman Signed-off-by: Mattias Nissler Message-ID: <20240507094210.300566-6-mnissler@rivosinc.com> Signed-off-by: Philippe Mathieu-Daudé (cherry picked from commit e6578f1f68a0e90789a841ada532c3e494c9a04c) Signed-off-by: zhujun2 --- hw/remote/vfio-user-obj.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/remote/vfio-user-obj.c b/hw/remote/vfio-user-obj.c index 8b10c32a3c..8b708422fe 100644 --- a/hw/remote/vfio-user-obj.c +++ b/hw/remote/vfio-user-obj.c @@ -281,7 +281,7 @@ static ssize_t vfu_object_cfg_access(vfu_ctx_t *vfu_ctx, char * const buf, while (bytes > 0) { len = (bytes > pci_access_width) ? pci_access_width : bytes; if (is_write) { - memcpy(&val, ptr, len); + val = ldn_le_p(ptr, len); pci_host_config_write_common(o->pci_dev, offset, pci_config_size(o->pci_dev), val, len); @@ -289,7 +289,7 @@ static ssize_t vfu_object_cfg_access(vfu_ctx_t *vfu_ctx, char * const buf, } else { val = pci_host_config_read_common(o->pci_dev, offset, pci_config_size(o->pci_dev), len); - memcpy(ptr, &val, len); + stn_le_p(ptr, len, val); trace_vfu_cfg_read(offset, val); } offset += len; -- Gitee From 400e9dbe4dae8efc110a2363590ce35fd11d7d29 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Mon, 1 Jul 2024 09:52:08 +0200 Subject: [PATCH 446/939] virtio: remove virtio_tswap16s() call in vring_packed_event_read() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit d152cdd6f6 ("virtio: use virtio accessor to access packed event") switched using of address_space_read_cached() to virito_lduw_phys_cached() to access packed descriptor event. When we used address_space_read_cached(), we needed to call virtio_tswap16s() to handle the endianess of the field, but virito_lduw_phys_cached() already handles it internally, so we no longer need to call virtio_tswap16s() (as the commit had done for `off_wrap`, but forgot for `flags`). Fixes: d152cdd6f6 ("virtio: use virtio accessor to access packed event") Cc: jasowang@redhat.com Cc: qemu-stable@nongnu.org Reported-by: Xoykie Link: https://lore.kernel.org/qemu-devel/CAFU8RB_pjr77zMLsM0Unf9xPNxfr_--Tjr49F_eX32ZBc5o2zQ@mail.gmail.com Signed-off-by: Stefano Garzarella Message-Id: <20240701075208.19634-1-sgarzare@redhat.com> Acked-by: Jason Wang Reviewed-by: Peter Maydell Reviewed-by: Eugenio Pérez Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin (cherry picked from commit 7aa6492401e95fb296dec7cda81e67d91f6037d7) Signed-off-by: zhujun2 --- hw/virtio/virtio.c | 1 - 1 file changed, 1 deletion(-) diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 202aae868e..8c3b6b87aa 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -322,7 +322,6 @@ static void vring_packed_event_read(VirtIODevice *vdev, /* Make sure flags is seen before off_wrap */ smp_rmb(); e->off_wrap = virtio_lduw_phys_cached(vdev, cache, off_off); - virtio_tswap16s(vdev, &e->flags); } static void vring_packed_off_wrap_write(VirtIODevice *vdev, -- Gitee From 33d8e65f37caa34bf0c18a3ecbaa48d3b706564b Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Mon, 22 Apr 2024 14:14:25 -0300 Subject: [PATCH 447/939] target/riscv/kvm: tolerate KVM disable ext errors Running a KVM guest using a 6.9-rc3 kernel, in a 6.8 host that has zkr enabled, will fail with a kernel oops SIGILL right at the start. The reason is that we can't expose zkr without implementing the SEED CSR. 
Disabling zkr in the guest would be a workaround, but if the KVM doesn't allow it we'll error out and never boot. In hindsight this is too strict. If we keep proceeding, despite not disabling the extension in the KVM vcpu, we'll not add the extension in the riscv,isa. The guest kernel will be unaware of the extension, i.e. it doesn't matter if the KVM vcpu has it enabled underneath or not. So it's ok to keep booting in this case. Change our current logic to not error out if we fail to disable an extension in kvm_set_one_reg(), but show a warning and keep booting. It is important to throw a warning because we must make the user aware that the extension is still available in the vcpu, meaning that an ill-behaved guest can ignore the riscv,isa settings and use the extension. The case we're handling happens with an EINVAL error code. If we fail to disable the extension in KVM for any other reason, error out. We'll also keep erroring out when we fail to enable an extension in KVM, since adding the extension in riscv,isa at this point will cause a guest malfunction because the extension isn't enabled in the vcpu. Suggested-by: Andrew Jones Signed-off-by: Daniel Henrique Barboza Reviewed-by: Andrew Jones Cc: qemu-stable Message-ID: <20240422171425.333037-2-dbarboza@ventanamicro.com> Signed-off-by: Alistair Francis (cherry picked from commit 1215d45b2aa97512a2867e401aa59f3d0c23cb23) Signed-off-by: zhujun2 --- target/riscv/kvm/kvm-cpu.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/target/riscv/kvm/kvm-cpu.c b/target/riscv/kvm/kvm-cpu.c index 45b6cf1cfa..b3dc2070f9 100644 --- a/target/riscv/kvm/kvm-cpu.c +++ b/target/riscv/kvm/kvm-cpu.c @@ -369,10 +369,14 @@ static void kvm_riscv_update_cpu_cfg_isa_ext(RISCVCPU *cpu, CPUState *cs) reg = kvm_cpu_cfg_get(cpu, multi_ext_cfg); ret = kvm_set_one_reg(cs, id, ®); if (ret != 0) { - error_report("Unable to %s extension %s in KVM, error %d", - reg ? 
"enable" : "disable", - multi_ext_cfg->name, ret); - exit(EXIT_FAILURE); + if (!reg && ret == -EINVAL) { + warn_report("KVM cannot disable extension %s", + multi_ext_cfg->name); + } else { + error_report("Unable to enable extension %s in KVM, error %d", + multi_ext_cfg->name, ret); + exit(EXIT_FAILURE); + } } } } -- Gitee From 2edda423f2bca2348595e99a4ef9f5c73e262e77 Mon Sep 17 00:00:00 2001 From: qihao_yewu Date: Wed, 9 Oct 2024 07:25:22 -0400 Subject: [PATCH 448/939] block: fix -Werror=maybe-uninitialized false-positive MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cheery-pick from eb5d28c783078ad2d7fb42349e146190cd98678b ../block/file-posix.c:1405:17: error: ‘zoned’ may be used uninitialized [-Werror=maybe-uninitialized] 1405 | if (ret < 0 || zoned == BLK_Z_NONE) { Signed-off-by: Marc-André Lureau Reviewed-by: Vladimir Sementsov-Ogievskiy Signed-off-by: qihao_yewu --- block/file-posix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/file-posix.c b/block/file-posix.c index 4ac8f684f1..787f613d52 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -1423,7 +1423,7 @@ static void raw_refresh_zoned_limits(BlockDriverState *bs, struct stat *st, Error **errp) { BDRVRawState *s = bs->opaque; - BlockZoneModel zoned; + BlockZoneModel zoned = BLK_Z_NONE; int ret; ret = get_sysfs_zoned_model(st, &zoned); -- Gitee From 855f389c98787baaa8afd1139fb82e0710167d9a Mon Sep 17 00:00:00 2001 From: Gert Wollny Date: Wed, 11 Sep 2024 09:14:30 +0000 Subject: [PATCH 449/939] ui/sdl2: set swap interval explicitly when OpenGL is enabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before 176e3783f2ab (ui/sdl2: OpenGL window context) SDL_CreateRenderer was called unconditionally setting the swap interval to 0. Since SDL_CreateRenderer is now no longer called when OpenGL is enabled, the swap interval is no longer set explicitly and vsync handling depends on the environment settings which may lead to a performance regression with virgl as reported in https://gitlab.com/qemu-project/qemu/-/issues/2565 Restore the old vsync handling by explicitly calling SDL_GL_SetSwapInterval if OpenGL is enabled. 
Fixes: 176e3783f2ab (ui/sdl2: OpenGL window context) Closes: https://gitlab.com/qemu-project/qemu/-/issues/2565 Signed-off-by: Gert Wollny Acked-by: Marc-André Lureau Message-ID: <01020191e05ce6df-84da6386-62c2-4ce8-840e-ad216ac253dd-000000@eu-west-1.amazonses.com> Signed-off-by: Philippe Mathieu-Daudé (cherry picked from commit ae23cd00170baaa2777eb1ee87b70f472dbb3c44) Signed-off-by: zhujun2 --- ui/sdl2.c | 1 + 1 file changed, 1 insertion(+) diff --git a/ui/sdl2.c b/ui/sdl2.c index 4971963f00..cc44d2708b 100644 --- a/ui/sdl2.c +++ b/ui/sdl2.c @@ -115,6 +115,7 @@ void sdl2_window_create(struct sdl2_console *scon) SDL_SetHint(SDL_HINT_RENDER_BATCHING, "1"); scon->winctx = SDL_GL_CreateContext(scon->real_window); + SDL_GL_SetSwapInterval(0); } else { /* The SDL renderer is only used by sdl2-2D, when OpenGL is disabled */ scon->real_renderer = SDL_CreateRenderer(scon->real_window, -1, 0); -- Gitee From 0cb9a00d295cbf0ade0a55cea1039aec793fddf0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Kl=C3=B6tzke?= Date: Fri, 13 Sep 2024 15:31:50 +0100 Subject: [PATCH 450/939] hw/intc/arm_gic: fix spurious level triggered interrupts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On GICv2 and later, level triggered interrupts are pending when either the interrupt line is asserted or the interrupt was made pending by a GICD_ISPENDRn write. Making a level triggered interrupt pending by software persists until either the interrupt is acknowledged or cleared by writing GICD_ICPENDRn. As long as the interrupt line is asserted, the interrupt is pending in any case. This logic is transparently implemented in gic_test_pending() for GICv1 and GICv2. The function combines the "pending" irq_state flag (used for edge triggered interrupts and software requests) and the line status (tracked in the "level" field). However, we also incorrectly set the pending flag on a guest write to GICD_ISENABLERn if the line of a level triggered interrupt was asserted. This keeps the interrupt pending even if the line is de-asserted after some time. This incorrect logic is a leftover of the initial 11MPCore GIC implementation. That handles things slightly differently to the architected GICv1 and GICv2. The 11MPCore TRM does not give a lot of detail on the corner cases of its GIC's behaviour, and historically we have not wanted to investigate exactly what it does in reality, so QEMU's GIC model takes the approach of "retain our existing behaviour for 11MPCore, and implement the architectural standard for later GIC revisions". On that basis, commit 8d999995e45c10 in 2013 is where we added the "level-triggered interrupt with the line asserted" handling to gic_test_pending(), and we deliberately kept the old behaviour of gic_test_pending() for REV_11MPCORE. That commit should have added the "only if 11MPCore" condition to the setting of the pending bit on writes to GICD_ISENABLERn, but forgot it. Add the missing "if REV_11MPCORE" condition, so that our behaviour on GICv1 and GICv2 matches the GIC architecture requirements. 
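The GICv1/v2 rule that gic_test_pending() already implements can be sketched as follows (an illustrative helper, not the QEMU function itself): a level-triggered interrupt is pending while its line is asserted or while software has latched it pending, so the enable path no longer needs to latch the pending bit on those GIC revisions.

    #include <stdbool.h>

    /* Sketch of the architected rule handled in gic_test_pending():
     * pending = software-latched pending
     *           OR (level-triggered AND line currently asserted). */
    static bool gicv2_level_pending(bool sw_pending, bool line_level,
                                    bool edge_triggered)
    {
        return sw_pending || (!edge_triggered && line_level);
    }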
Cc: qemu-stable@nongnu.org Fixes: 8d999995e45c10 ("arm_gic: Fix GIC pending behavior") Signed-off-by: Jan Klötzke Message-id: 20240911114826.3558302-1-jan.kloetzke@kernkonzept.com Reviewed-by: Peter Maydell [PMM: expanded comment a little and converted to coding-style form; expanded commit message with the historical backstory] Signed-off-by: Peter Maydell (cherry picked from commit 110684c9a69a02cbabfbddcd3afa921826ad565c) Signed-off-by: zhujun2 --- hw/intc/arm_gic.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/hw/intc/arm_gic.c b/hw/intc/arm_gic.c index 074cf50af2..dfe7a0a729 100644 --- a/hw/intc/arm_gic.c +++ b/hw/intc/arm_gic.c @@ -1263,9 +1263,14 @@ static void gic_dist_writeb(void *opaque, hwaddr offset, trace_gic_enable_irq(irq + i); } GIC_DIST_SET_ENABLED(irq + i, cm); - /* If a raised level triggered IRQ enabled then mark - is as pending. */ - if (GIC_DIST_TEST_LEVEL(irq + i, mask) + /* + * If a raised level triggered IRQ enabled then mark + * it as pending on 11MPCore. For other GIC revisions we + * handle the "level triggered and line asserted" check + * at the other end in gic_test_pending(). + */ + if (s->revision == REV_11MPCORE + && GIC_DIST_TEST_LEVEL(irq + i, mask) && !GIC_DIST_TEST_EDGE_TRIGGER(irq + i)) { DPRINTF("Set %d pending mask %x\n", irq + i, mask); GIC_DIST_SET_PENDING(irq + i, mask); -- Gitee From 0981edabf57b5728211deeca459fb15927e7cc36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Volker=20R=C3=BCmelin?= Date: Sun, 1 Sep 2024 15:01:12 +0200 Subject: [PATCH 451/939] hw/audio/virtio-sound: fix heap buffer overflow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, the guest may write to the device configuration space, whereas the virtio sound device specification in chapter 5.14.4 clearly states that the fields in the device configuration space are driver-read-only. Remove the set_config function from the virtio_snd class. This also prevents a heap buffer overflow. See QEMU issue #2296. Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2296 Signed-off-by: Volker Rümelin Message-Id: <20240901130112.8242-1-vr_qemu@t-online.de> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin (cherry picked from commit 7fc6611cad3e9627b23ce83e550b668abba6c886) Signed-off-by: zhujun2 --- hw/audio/trace-events | 1 - hw/audio/virtio-snd.c | 24 ------------------------ 2 files changed, 25 deletions(-) diff --git a/hw/audio/trace-events b/hw/audio/trace-events index b1870ff224..b8ef572767 100644 --- a/hw/audio/trace-events +++ b/hw/audio/trace-events @@ -41,7 +41,6 @@ asc_update_irq(int irq, int a, int b) "set IRQ to %d (A: 0x%x B: 0x%x)" #virtio-snd.c virtio_snd_get_config(void *vdev, uint32_t jacks, uint32_t streams, uint32_t chmaps) "snd %p: get_config jacks=%"PRIu32" streams=%"PRIu32" chmaps=%"PRIu32"" -virtio_snd_set_config(void *vdev, uint32_t jacks, uint32_t new_jacks, uint32_t streams, uint32_t new_streams, uint32_t chmaps, uint32_t new_chmaps) "snd %p: set_config jacks from %"PRIu32"->%"PRIu32", streams from %"PRIu32"->%"PRIu32", chmaps from %"PRIu32"->%"PRIu32 virtio_snd_get_features(void *vdev, uint64_t features) "snd %p: get_features 0x%"PRIx64 virtio_snd_vm_state_running(void) "vm state running" virtio_snd_vm_state_stopped(void) "vm state stopped" diff --git a/hw/audio/virtio-snd.c b/hw/audio/virtio-snd.c index 137fa77a01..cb7049abb3 100644 --- a/hw/audio/virtio-snd.c +++ b/hw/audio/virtio-snd.c @@ -107,29 +107,6 @@ virtio_snd_get_config(VirtIODevice *vdev, uint8_t *config) } -static void -virtio_snd_set_config(VirtIODevice *vdev, const uint8_t *config) -{ - VirtIOSound *s = VIRTIO_SND(vdev); - const virtio_snd_config *sndconfig = - (const virtio_snd_config *)config; - - - trace_virtio_snd_set_config(vdev, - s->snd_conf.jacks, - sndconfig->jacks, - s->snd_conf.streams, - sndconfig->streams, - s->snd_conf.chmaps, - sndconfig->chmaps); - - memcpy(&s->snd_conf, sndconfig, sizeof(virtio_snd_config)); - le32_to_cpus(&s->snd_conf.jacks); - le32_to_cpus(&s->snd_conf.streams); - le32_to_cpus(&s->snd_conf.chmaps); - -} - static void virtio_snd_pcm_buffer_free(VirtIOSoundPCMBuffer *buffer) { @@ -1399,7 +1376,6 @@ static void virtio_snd_class_init(ObjectClass *klass, void *data) vdc->realize = virtio_snd_realize; vdc->unrealize = virtio_snd_unrealize; vdc->get_config = virtio_snd_get_config; - vdc->set_config = virtio_snd_set_config; vdc->get_features = get_features; vdc->reset = virtio_snd_reset; vdc->legacy_features = 0; -- Gitee From d199d3a9af9f5bd7877a6ace1243c77097264f1a Mon Sep 17 00:00:00 2001 From: Tiago Pasqualini Date: Wed, 4 Sep 2024 20:52:30 -0300 Subject: [PATCH 452/939] crypto: run qcrypto_pbkdf2_count_iters in a new thread MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CPU time accounting in the kernel has been demonstrated to have a sawtooth pattern[1][2]. This can cause the getrusage system call to not be as accurate as we are expecting, which can cause this calculation to stall. The kernel discussions shows that this inaccuracy happens when CPU time gets big enough, so this patch changes qcrypto_pbkdf2_count_iters to run in a fresh thread to avoid this inaccuracy. It also adds a sanity check to fail the process if CPU time is not accounted. [1] https://lore.kernel.org/lkml/159231011694.16989.16351419333851309713.tip-bot2@tip-bot2/ [2] https://lore.kernel.org/lkml/20221226031010.4079885-1-maxing.lan@bytedance.com/t/#m1c7f2fdc0ea742776a70fd1aa2a2e414c437f534 Resolves: #2398 Signed-off-by: Tiago Pasqualini Signed-off-by: Daniel P. 
Berrangé (cherry picked from commit c72cab5ad9f849bbcfcf4be7952b8b8946cc626e) Signed-off-by: zhujun2 --- crypto/pbkdf.c | 53 +++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 46 insertions(+), 7 deletions(-) diff --git a/crypto/pbkdf.c b/crypto/pbkdf.c index 8d198c152c..d1c06ef3ed 100644 --- a/crypto/pbkdf.c +++ b/crypto/pbkdf.c @@ -19,6 +19,7 @@ */ #include "qemu/osdep.h" +#include "qemu/thread.h" #include "qapi/error.h" #include "crypto/pbkdf.h" #ifndef _WIN32 @@ -85,12 +86,28 @@ static int qcrypto_pbkdf2_get_thread_cpu(unsigned long long *val_ms, #endif } -uint64_t qcrypto_pbkdf2_count_iters(QCryptoHashAlgorithm hash, - const uint8_t *key, size_t nkey, - const uint8_t *salt, size_t nsalt, - size_t nout, - Error **errp) +typedef struct CountItersData { + QCryptoHashAlgorithm hash; + const uint8_t *key; + size_t nkey; + const uint8_t *salt; + size_t nsalt; + size_t nout; + uint64_t iterations; + Error **errp; +} CountItersData; + +static void *threaded_qcrypto_pbkdf2_count_iters(void *data) { + CountItersData *iters_data = (CountItersData *) data; + QCryptoHashAlgorithm hash = iters_data->hash; + const uint8_t *key = iters_data->key; + size_t nkey = iters_data->nkey; + const uint8_t *salt = iters_data->salt; + size_t nsalt = iters_data->nsalt; + size_t nout = iters_data->nout; + Error **errp = iters_data->errp; + uint64_t ret = -1; g_autofree uint8_t *out = g_new(uint8_t, nout); uint64_t iterations = (1 << 15); @@ -114,7 +131,10 @@ uint64_t qcrypto_pbkdf2_count_iters(QCryptoHashAlgorithm hash, delta_ms = end_ms - start_ms; - if (delta_ms > 500) { + if (delta_ms == 0) { /* sanity check */ + error_setg(errp, "Unable to get accurate CPU usage"); + goto cleanup; + } else if (delta_ms > 500) { break; } else if (delta_ms < 100) { iterations = iterations * 10; @@ -129,5 +149,24 @@ uint64_t qcrypto_pbkdf2_count_iters(QCryptoHashAlgorithm hash, cleanup: memset(out, 0, nout); - return ret; + iters_data->iterations = ret; + return NULL; +} + +uint64_t qcrypto_pbkdf2_count_iters(QCryptoHashAlgorithm hash, + const uint8_t *key, size_t nkey, + const uint8_t *salt, size_t nsalt, + size_t nout, + Error **errp) +{ + CountItersData data = { + hash, key, nkey, salt, nsalt, nout, 0, errp + }; + QemuThread thread; + + qemu_thread_create(&thread, "pbkdf2", threaded_qcrypto_pbkdf2_count_iters, + &data, QEMU_THREAD_JOINABLE); + qemu_thread_join(&thread); + + return data.iterations; } -- Gitee From c6d6cbb2c33c3c7b2574c3baa2d2477d9d4ac91c Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 28 Aug 2024 11:07:43 +0200 Subject: [PATCH 453/939] softmmu/physmem: fix memory leak in dirty_memory_extend() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As reported by Peter, we might be leaking memory when removing the highest RAMBlock (in the weird ram_addr_t space), and adding a new one. We will fail to realize that we already allocated bitmaps for more dirty memory blocks, and effectively discard the pointers to them. Fix it by getting rid of last_ram_page() and by remembering the number of dirty memory blocks that have been allocated already. While at it, let's use "unsigned int" for the number of blocks, which should be sufficient until we reach ~32 exabytes. Looks like this leak was introduced as we switched from using a single bitmap_zero_extend() to allocating multiple bitmaps: bitmap_zero_extend() relies on g_renew() which should have taken care of this. 
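A sketch of the bookkeeping change (names shortened, not the actual code): instead of recomputing the old block count from the current RAM layout, which under-counts after the highest RAMBlock has been removed, the list remembers how many dirty-bitmap blocks were ever allocated and only extends past that.

    /* Sketch: extend only beyond what was already allocated. */
    static unsigned int allocated_blocks;   /* mirrors ram_list.num_dirty_blocks */

    static void dirty_memory_extend_sketch(unsigned int new_num_blocks)
    {
        if (new_num_blocks <= allocated_blocks) {
            return;                          /* bitmaps already exist */
        }
        /* ... allocate bitmaps for blocks [allocated_blocks, new_num_blocks) ... */
        allocated_blocks = new_num_blocks;
    }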
Resolves: https://lkml.kernel.org/r/CAFEAcA-k7a+VObGAfCFNygQNfCKL=AfX6A4kScq=VSSK0peqPg@mail.gmail.com Reported-by: Peter Maydell Fixes: 5b82b703b69a ("memory: RCU ram_list.dirty_memory[] for safe RAM hotplug") Reviewed-by: Stefan Hajnoczi Reviewed-by: Peter Xu Tested-by: Peter Maydell Cc: qemu-stable@nongnu.org Cc: Stefan Hajnoczi Cc: Paolo Bonzini Cc: Peter Xu Cc: Philippe Mathieu-Daudé Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20240828090743.128647-1-david@redhat.com Signed-off-by: Peter Xu (cherry picked from commit b84f06c2bee727b3870b4eeccbe3a45c5aea14c1) Signed-off-by: Michael Tokarev Signed-off-by: zhujun2 --- include/exec/ramlist.h | 1 + system/physmem.c | 35 +++++++++-------------------------- 2 files changed, 10 insertions(+), 26 deletions(-) diff --git a/include/exec/ramlist.h b/include/exec/ramlist.h index 2ad2a81acc..d9cfe530be 100644 --- a/include/exec/ramlist.h +++ b/include/exec/ramlist.h @@ -50,6 +50,7 @@ typedef struct RAMList { /* RCU-enabled, writes protected by the ramlist lock. */ QLIST_HEAD(, RAMBlock) blocks; DirtyMemoryBlocks *dirty_memory[DIRTY_MEMORY_NUM]; + unsigned int num_dirty_blocks; uint32_t version; QLIST_HEAD(, RAMBlockNotifier) ramblock_notifiers; } RAMList; diff --git a/system/physmem.c b/system/physmem.c index 2c8b83f811..87f49e70c1 100644 --- a/system/physmem.c +++ b/system/physmem.c @@ -1531,18 +1531,6 @@ static ram_addr_t find_ram_offset(ram_addr_t size) return offset; } -static unsigned long last_ram_page(void) -{ - RAMBlock *block; - ram_addr_t last = 0; - - RCU_READ_LOCK_GUARD(); - RAMBLOCK_FOREACH(block) { - last = MAX(last, block->offset + block->max_length); - } - return last >> TARGET_PAGE_BITS; -} - static void qemu_ram_setup_dump(void *addr, ram_addr_t size) { int ret; @@ -1795,13 +1783,11 @@ void qemu_ram_msync(RAMBlock *block, ram_addr_t start, ram_addr_t length) } /* Called with ram_list.mutex held */ -static void dirty_memory_extend(ram_addr_t old_ram_size, - ram_addr_t new_ram_size) +static void dirty_memory_extend(ram_addr_t new_ram_size) { - ram_addr_t old_num_blocks = DIV_ROUND_UP(old_ram_size, - DIRTY_MEMORY_BLOCK_SIZE); - ram_addr_t new_num_blocks = DIV_ROUND_UP(new_ram_size, - DIRTY_MEMORY_BLOCK_SIZE); + unsigned int old_num_blocks = ram_list.num_dirty_blocks; + unsigned int new_num_blocks = DIV_ROUND_UP(new_ram_size, + DIRTY_MEMORY_BLOCK_SIZE); int i; /* Only need to extend if block count increased */ @@ -1833,6 +1819,8 @@ static void dirty_memory_extend(ram_addr_t old_ram_size, g_free_rcu(old_blocks, rcu); } } + + ram_list.num_dirty_blocks = new_num_blocks; } static void ram_block_add(RAMBlock *new_block, Error **errp) @@ -1841,11 +1829,9 @@ static void ram_block_add(RAMBlock *new_block, Error **errp) const bool shared = qemu_ram_is_shared(new_block); RAMBlock *block; RAMBlock *last_block = NULL; - ram_addr_t old_ram_size, new_ram_size; + ram_addr_t ram_size; Error *err = NULL; - old_ram_size = last_ram_page(); - qemu_mutex_lock_ramlist(); new_block->offset = find_ram_offset(new_block->max_length); @@ -1873,11 +1859,8 @@ static void ram_block_add(RAMBlock *new_block, Error **errp) } } - new_ram_size = MAX(old_ram_size, - (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS); - if (new_ram_size > old_ram_size) { - dirty_memory_extend(old_ram_size, new_ram_size); - } + ram_size = (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS; + dirty_memory_extend(ram_size); /* Keep the list sorted from biggest to smallest block. 
Unlike QTAILQ, * QLIST (which has an RCU-friendly variant) does not have insertion at * tail, so save the last element in last_block. -- Gitee From 10794f7a9bc3c88c8a26f094e5d3ef42e9fd290f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=A5=9A=E5=90=9B?= Date: Fri, 11 Oct 2024 14:04:54 +0800 Subject: [PATCH 454/939] hw/gpio/aspeed_gpio: Avoid shift into sign bit In aspeed_gpio_update() we calculate "mask = 1 << gpio", where gpio can be between 0 and 31. Coverity complains about this because 1 << 31 won't fit in a signed integer. For QEMU this isn't an error because we enable -fwrapv, but we can keep Coverity happy by doing the shift on unsigned numbers. Resolves: Coverity CID 1547742 Signed-off-by: Peter Maydell Reviewed-by: Cedric Le Goater Signed-off-by: zhangchujun --- hw/gpio/aspeed_gpio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/gpio/aspeed_gpio.c b/hw/gpio/aspeed_gpio.c index 1e267dd482..0fc3d4c05f 100644 --- a/hw/gpio/aspeed_gpio.c +++ b/hw/gpio/aspeed_gpio.c @@ -281,7 +281,7 @@ static void aspeed_gpio_update(AspeedGPIOState *s, GPIOSets *regs, diff &= mode_mask; if (diff) { for (gpio = 0; gpio < ASPEED_GPIOS_PER_SET; gpio++) { - uint32_t mask = 1 << gpio; + uint32_t mask = 1U << gpio; /* If the gpio needs to be updated... */ if (!(diff & mask)) { -- Gitee From 7bd04536327357a97206d8048f5d9341780bbe5a Mon Sep 17 00:00:00 2001 From: dinglimin Date: Sat, 12 Oct 2024 11:26:16 +0800 Subject: [PATCH 455/939] crypto: use consistent error reporting pattern for unsupported cipher modes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Not all paths in qcrypto_cipher_ctx_new() were correctly distinguishing between valid user input for cipher mode (which should report a user facing error), vs program logic errors (which should assert). Reported-by: Peter Maydell Signed-off-by: Daniel P.
Berrangé Signed-off-by: dinglimin --- crypto/cipher-nettle.c.inc | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/crypto/cipher-nettle.c.inc b/crypto/cipher-nettle.c.inc index 766de036ba..2654b439c1 100644 --- a/crypto/cipher-nettle.c.inc +++ b/crypto/cipher-nettle.c.inc @@ -525,8 +525,10 @@ static QCryptoCipher *qcrypto_cipher_ctx_new(QCryptoCipherAlgorithm alg, case QCRYPTO_CIPHER_MODE_CTR: drv = &qcrypto_nettle_des_driver_ctr; break; - default: + case QCRYPTO_CIPHER_MODE_XTS: goto bad_cipher_mode; + default: + g_assert_not_reached(); } ctx = g_new0(QCryptoNettleDES, 1); @@ -551,8 +553,10 @@ static QCryptoCipher *qcrypto_cipher_ctx_new(QCryptoCipherAlgorithm alg, case QCRYPTO_CIPHER_MODE_CTR: drv = &qcrypto_nettle_des3_driver_ctr; break; - default: + case QCRYPTO_CIPHER_MODE_XTS: goto bad_cipher_mode; + default: + g_assert_not_reached(); } ctx = g_new0(QCryptoNettleDES3, 1); @@ -663,8 +667,10 @@ static QCryptoCipher *qcrypto_cipher_ctx_new(QCryptoCipherAlgorithm alg, case QCRYPTO_CIPHER_MODE_CTR: drv = &qcrypto_nettle_cast128_driver_ctr; break; - default: + case QCRYPTO_CIPHER_MODE_XTS: goto bad_cipher_mode; + default: + g_assert_not_reached(); } ctx = g_new0(QCryptoNettleCAST128, 1); @@ -741,8 +747,12 @@ static QCryptoCipher *qcrypto_cipher_ctx_new(QCryptoCipherAlgorithm alg, case QCRYPTO_CIPHER_MODE_ECB: drv = &qcrypto_nettle_sm4_driver_ecb; break; - default: + case QCRYPTO_CIPHER_MODE_CBC: + case QCRYPTO_CIPHER_MODE_CTR: + case QCRYPTO_CIPHER_MODE_XTS: goto bad_cipher_mode; + default: + g_assert_not_reached(); } ctx = g_new0(QCryptoNettleSm4, 1); -- Gitee From c64bd463b120056ff1e6c32e48fa24b6afd17f23 Mon Sep 17 00:00:00 2001 From: dinglimin Date: Sat, 12 Oct 2024 13:47:25 +0800 Subject: [PATCH 456/939] crypto: drop gnutls debug logging support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GNUTLS already supports dynamically enabling its logging at runtime by setting the env var 'GNUTLS_DEBUG_LEVEL=10', so there is no need to re-invent this logic in QEMU in a way that requires a re-compile. Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Daniel P. 
Berrangé Signed-off-by: dinglimin --- crypto/init.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/crypto/init.c b/crypto/init.c index fb7f1bff10..674d237fa9 100644 --- a/crypto/init.c +++ b/crypto/init.c @@ -34,14 +34,11 @@ #include "crypto/random.h" -/* #define DEBUG_GNUTLS */ -#ifdef DEBUG_GNUTLS -static void qcrypto_gnutls_log(int level, const char *str) -{ - fprintf(stderr, "%d: %s", level, str); -} -#endif +/* + * To debug GNUTLS see env vars listed in + * https://gnutls.org/manual/html_node/Debugging-and-auditing.html + */ int qcrypto_init(Error **errp) { #ifdef CONFIG_GNUTLS @@ -53,10 +50,6 @@ int qcrypto_init(Error **errp) gnutls_strerror(ret)); return -1; } -#ifdef DEBUG_GNUTLS - gnutls_global_set_log_level(10); - gnutls_global_set_log_function(qcrypto_gnutls_log); -#endif #endif #ifdef CONFIG_GCRYPT -- Gitee From e1aaa51fc2de072871cce45dd165e2cb38515978 Mon Sep 17 00:00:00 2001 From: dinglimin Date: Sat, 12 Oct 2024 14:00:08 +0800 Subject: [PATCH 457/939] crypto: factor out conversion of QAPI to gcrypt constants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The conversion of cipher mode will shortly be required in more than one place. Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Thomas Huth Signed-off-by: Daniel P. Berrangé Signed-off-by: dinglimin --- crypto/cipher-gcrypt.c.inc | 116 +++++++++++++++++++------------------ 1 file changed, 60 insertions(+), 56 deletions(-) diff --git a/crypto/cipher-gcrypt.c.inc b/crypto/cipher-gcrypt.c.inc index 1377cbaf14..6b82280f90 100644 --- a/crypto/cipher-gcrypt.c.inc +++ b/crypto/cipher-gcrypt.c.inc @@ -20,6 +20,56 @@ #include +static int qcrypto_cipher_alg_to_gcry_alg(QCryptoCipherAlgorithm alg) +{ + switch (alg) { + case QCRYPTO_CIPHER_ALG_DES: + return GCRY_CIPHER_DES; + case QCRYPTO_CIPHER_ALG_3DES: + return GCRY_CIPHER_3DES; + case QCRYPTO_CIPHER_ALG_AES_128: + return GCRY_CIPHER_AES128; + case QCRYPTO_CIPHER_ALG_AES_192: + return GCRY_CIPHER_AES192; + case QCRYPTO_CIPHER_ALG_AES_256: + return GCRY_CIPHER_AES256; + case QCRYPTO_CIPHER_ALG_CAST5_128: + return GCRY_CIPHER_CAST5; + case QCRYPTO_CIPHER_ALG_SERPENT_128: + return GCRY_CIPHER_SERPENT128; + case QCRYPTO_CIPHER_ALG_SERPENT_192: + return GCRY_CIPHER_SERPENT192; + case QCRYPTO_CIPHER_ALG_SERPENT_256: + return GCRY_CIPHER_SERPENT256; + case QCRYPTO_CIPHER_ALG_TWOFISH_128: + return GCRY_CIPHER_TWOFISH128; + case QCRYPTO_CIPHER_ALG_TWOFISH_256: + return GCRY_CIPHER_TWOFISH; +#ifdef CONFIG_CRYPTO_SM4 + case QCRYPTO_CIPHER_ALG_SM4: + return GCRY_CIPHER_SM4; +#endif + default: + return GCRY_CIPHER_NONE; + } +} + +static int qcrypto_cipher_mode_to_gcry_mode(QCryptoCipherMode mode) +{ + switch (mode) { + case QCRYPTO_CIPHER_MODE_ECB: + return GCRY_CIPHER_MODE_ECB; + case QCRYPTO_CIPHER_MODE_XTS: + return GCRY_CIPHER_MODE_XTS; + case QCRYPTO_CIPHER_MODE_CBC: + return GCRY_CIPHER_MODE_CBC; + case QCRYPTO_CIPHER_MODE_CTR: + return GCRY_CIPHER_MODE_CTR; + default: + return GCRY_CIPHER_MODE_NONE; + } +} + bool qcrypto_cipher_supports(QCryptoCipherAlgorithm alg, QCryptoCipherMode mode) { @@ -188,72 +238,26 @@ static QCryptoCipher *qcrypto_cipher_ctx_new(QCryptoCipherAlgorithm alg, return NULL; } - switch (alg) { - case QCRYPTO_CIPHER_ALG_DES: - gcryalg = GCRY_CIPHER_DES; - break; - case QCRYPTO_CIPHER_ALG_3DES: - gcryalg = GCRY_CIPHER_3DES; - break; - case QCRYPTO_CIPHER_ALG_AES_128: - gcryalg = GCRY_CIPHER_AES128; - 
break; - case QCRYPTO_CIPHER_ALG_AES_192: - gcryalg = GCRY_CIPHER_AES192; - break; - case QCRYPTO_CIPHER_ALG_AES_256: - gcryalg = GCRY_CIPHER_AES256; - break; - case QCRYPTO_CIPHER_ALG_CAST5_128: - gcryalg = GCRY_CIPHER_CAST5; - break; - case QCRYPTO_CIPHER_ALG_SERPENT_128: - gcryalg = GCRY_CIPHER_SERPENT128; - break; - case QCRYPTO_CIPHER_ALG_SERPENT_192: - gcryalg = GCRY_CIPHER_SERPENT192; - break; - case QCRYPTO_CIPHER_ALG_SERPENT_256: - gcryalg = GCRY_CIPHER_SERPENT256; - break; - case QCRYPTO_CIPHER_ALG_TWOFISH_128: - gcryalg = GCRY_CIPHER_TWOFISH128; - break; - case QCRYPTO_CIPHER_ALG_TWOFISH_256: - gcryalg = GCRY_CIPHER_TWOFISH; - break; -#ifdef CONFIG_CRYPTO_SM4 - case QCRYPTO_CIPHER_ALG_SM4: - gcryalg = GCRY_CIPHER_SM4; - break; -#endif - default: + gcryalg = qcrypto_cipher_alg_to_gcry_alg(alg); + if (gcryalg == GCRY_CIPHER_NONE) { error_setg(errp, "Unsupported cipher algorithm %s", QCryptoCipherAlgorithm_str(alg)); return NULL; } - drv = &qcrypto_gcrypt_driver; - switch (mode) { - case QCRYPTO_CIPHER_MODE_ECB: - gcrymode = GCRY_CIPHER_MODE_ECB; - break; - case QCRYPTO_CIPHER_MODE_XTS: - gcrymode = GCRY_CIPHER_MODE_XTS; - break; - case QCRYPTO_CIPHER_MODE_CBC: - gcrymode = GCRY_CIPHER_MODE_CBC; - break; - case QCRYPTO_CIPHER_MODE_CTR: - drv = &qcrypto_gcrypt_ctr_driver; - gcrymode = GCRY_CIPHER_MODE_CTR; - break; - default: + gcrymode = qcrypto_cipher_mode_to_gcry_mode(mode); + if (gcrymode == GCRY_CIPHER_MODE_NONE) { error_setg(errp, "Unsupported cipher mode %s", QCryptoCipherMode_str(mode)); return NULL; } + if (mode == QCRYPTO_CIPHER_MODE_CTR) { + drv = &qcrypto_gcrypt_ctr_driver; + } else { + drv = &qcrypto_gcrypt_driver; + } + ctx = g_new0(QCryptoCipherGcrypt, 1); ctx->base.driver = drv; -- Gitee From 60b9463e35fe801e49db14539ccb8c9a6057e5c3 Mon Sep 17 00:00:00 2001 From: dinglimin Date: Sat, 12 Oct 2024 14:12:17 +0800 Subject: [PATCH 458/939] Consider discard option when writing zeros When opening an image with discard=off, we punch hole in the image when writing zeroes, making the image sparse. This breaks users that want to ensure that writes cannot fail with ENOSPACE by using fully allocated images[1]. bdrv_co_pwrite_zeroes() correctly disables BDRV_REQ_MAY_UNMAP if we opened the child without discard=unmap or discard=on. But we don't go through this function when accessing the top node. Move the check down to bdrv_co_do_pwrite_zeroes() which seems to be used in all code paths. This change implements the documented behavior, punching holes only when opening the image with discard=on or discard=unmap. This may not be the best default but can improve it later. The test depends on a file system supporting discard, deallocating the entire file when punching hole with the length of the entire file. Tested with xfs, ext4, and tmpfs. 
[1] https://lists.nongnu.org/archive/html/qemu-discuss/2024-06/msg00003.html Signed-off-by: Nir Soffer Message-id: 20240628202058.1964986-3-nsoffer@redhat.com Signed-off-by: Stefan Hajnoczi Signed-off-by: dinglimin --- block/io.c | 9 +- tests/qemu-iotests/tests/write-zeroes-unmap | 127 ++++++++++++++++++ .../qemu-iotests/tests/write-zeroes-unmap.out | 81 +++++++++++ 3 files changed, 213 insertions(+), 4 deletions(-) create mode 100644 tests/qemu-iotests/tests/write-zeroes-unmap create mode 100644 tests/qemu-iotests/tests/write-zeroes-unmap.out diff --git a/block/io.c b/block/io.c index 7e62fabbf5..a280a5a4c9 100644 --- a/block/io.c +++ b/block/io.c @@ -1885,6 +1885,11 @@ bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int64_t bytes, return -EINVAL; } + /* If opened with discard=off we should never unmap. */ + if (!(bs->open_flags & BDRV_O_UNMAP)) { + flags &= ~BDRV_REQ_MAY_UNMAP; + } + /* Invalidate the cached block-status data range if this write overlaps */ bdrv_bsc_invalidate_range(bs, offset, bytes); @@ -2338,10 +2343,6 @@ int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset, trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags); assert_bdrv_graph_readable(); - if (!(child->bs->open_flags & BDRV_O_UNMAP)) { - flags &= ~BDRV_REQ_MAY_UNMAP; - } - return bdrv_co_pwritev(child, offset, bytes, NULL, BDRV_REQ_ZERO_WRITE | flags); } diff --git a/tests/qemu-iotests/tests/write-zeroes-unmap b/tests/qemu-iotests/tests/write-zeroes-unmap new file mode 100644 index 0000000000..7cfeeaf839 --- /dev/null +++ b/tests/qemu-iotests/tests/write-zeroes-unmap @@ -0,0 +1,127 @@ +#!/usr/bin/env bash +# group: quick +# +# Test write zeros unmap. +# +# Copyright (C) Red Hat, Inc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +seq="$(basename $0)" +echo "QA output created by $seq" + +trap _cleanup_test_img exit + +# get standard environment, filters and checks +cd .. +. ./common.rc +. 
./common.filter + +_supported_fmt raw +_supported_proto file +_supported_os Linux + +create_test_image() { + _make_test_img -f $IMGFMT 1m +} + +filter_command() { + _filter_testdir | _filter_qemu_io | _filter_qemu | _filter_hmp +} + +print_disk_usage() { + du -sh $TEST_IMG | _filter_testdir +} + +echo +echo "=== defaults - write zeros ===" +echo + +create_test_image +echo -e 'qemu-io none0 "write -z 0 1m"\nquit' \ + | $QEMU -monitor stdio -drive if=none,file=$TEST_IMG,format=$IMGFMT \ + | filter_command +print_disk_usage + +echo +echo "=== defaults - write zeros unmap ===" +echo + +create_test_image +echo -e 'qemu-io none0 "write -zu 0 1m"\nquit' \ + | $QEMU -monitor stdio -drive if=none,file=$TEST_IMG,format=$IMGFMT \ + | filter_command +print_disk_usage + + +echo +echo "=== defaults - write actual zeros ===" +echo + +create_test_image +echo -e 'qemu-io none0 "write -P 0 0 1m"\nquit' \ + | $QEMU -monitor stdio -drive if=none,file=$TEST_IMG,format=$IMGFMT \ + | filter_command +print_disk_usage + +echo +echo "=== discard=off - write zeroes unmap ===" +echo + +create_test_image +echo -e 'qemu-io none0 "write -zu 0 1m"\nquit' \ + | $QEMU -monitor stdio -drive if=none,file=$TEST_IMG,format=$IMGFMT,discard=off \ + | filter_command +print_disk_usage + +echo +echo "=== detect-zeroes=on - write actual zeros ===" +echo + +create_test_image +echo -e 'qemu-io none0 "write -P 0 0 1m"\nquit' \ + | $QEMU -monitor stdio -drive if=none,file=$TEST_IMG,format=$IMGFMT,detect-zeroes=on \ + | filter_command +print_disk_usage + +echo +echo "=== detect-zeroes=on,discard=on - write actual zeros ===" +echo + +create_test_image +echo -e 'qemu-io none0 "write -P 0 0 1m"\nquit' \ + | $QEMU -monitor stdio -drive if=none,file=$TEST_IMG,format=$IMGFMT,detect-zeroes=on,discard=on \ + | filter_command +print_disk_usage + +echo +echo "=== discard=on - write zeroes ===" +echo + +create_test_image +echo -e 'qemu-io none0 "write -z 0 1m"\nquit' \ + | $QEMU -monitor stdio -drive if=none,file=$TEST_IMG,format=$IMGFMT,discard=on \ + | filter_command +print_disk_usage + +echo +echo "=== discard=on - write zeroes unmap ===" +echo + +create_test_image +echo -e 'qemu-io none0 "write -zu 0 1m"\nquit' \ + | $QEMU -monitor stdio -drive if=none,file=$TEST_IMG,format=$IMGFMT,discard=on \ + | filter_command +print_disk_usage diff --git a/tests/qemu-iotests/tests/write-zeroes-unmap.out b/tests/qemu-iotests/tests/write-zeroes-unmap.out new file mode 100644 index 0000000000..c931994897 --- /dev/null +++ b/tests/qemu-iotests/tests/write-zeroes-unmap.out @@ -0,0 +1,81 @@ +QA output created by write-zeroes-unmap + +=== defaults - write zeros === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 +QEMU X.Y.Z monitor - type 'help' for more information +(qemu) qemu-io none0 "write -z 0 1m" +wrote 1048576/1048576 bytes at offset 0 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +(qemu) quit +1.0M TEST_DIR/t.raw + +=== defaults - write zeros unmap === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 +QEMU X.Y.Z monitor - type 'help' for more information +(qemu) qemu-io none0 "write -zu 0 1m" +wrote 1048576/1048576 bytes at offset 0 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +(qemu) quit +1.0M TEST_DIR/t.raw + +=== defaults - write actual zeros === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 +QEMU X.Y.Z monitor - type 'help' for more information +(qemu) qemu-io none0 "write -P 0 0 1m" +wrote 1048576/1048576 bytes at offset 0 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +(qemu) quit 
+1.0M TEST_DIR/t.raw + +=== discard=off - write zeroes unmap === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 +QEMU X.Y.Z monitor - type 'help' for more information +(qemu) qemu-io none0 "write -zu 0 1m" +wrote 1048576/1048576 bytes at offset 0 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +(qemu) quit +1.0M TEST_DIR/t.raw + +=== detect-zeroes=on - write actual zeros === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 +QEMU X.Y.Z monitor - type 'help' for more information +(qemu) qemu-io none0 "write -P 0 0 1m" +wrote 1048576/1048576 bytes at offset 0 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +(qemu) quit +1.0M TEST_DIR/t.raw + +=== detect-zeroes=on,discard=on - write actual zeros === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 +QEMU X.Y.Z monitor - type 'help' for more information +(qemu) qemu-io none0 "write -P 0 0 1m" +wrote 1048576/1048576 bytes at offset 0 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +(qemu) quit +1.0M TEST_DIR/t.raw + +=== discard=on - write zeroes === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 +QEMU X.Y.Z monitor - type 'help' for more information +(qemu) qemu-io none0 "write -z 0 1m" +wrote 1048576/1048576 bytes at offset 0 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +(qemu) quit +1.0M TEST_DIR/t.raw + +=== discard=on - write zeroes unmap === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 +QEMU X.Y.Z monitor - type 'help' for more information +(qemu) qemu-io none0 "write -zu 0 1m" +wrote 1048576/1048576 bytes at offset 0 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +(qemu) quit +0 TEST_DIR/t.raw -- Gitee From 56a588dad8d085a89b24fe2103bd623d4260e02d Mon Sep 17 00:00:00 2001 From: Manos Pitsidianakis Date: Mon, 8 Jul 2024 10:09:49 +0300 Subject: [PATCH 459/939] virtio-snd: add max size bounds check in input cb(CVE-2024-7730) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cherry-pick from 98e77e3dd8dd6e7aa9a7dffa60f49c8c8a49d4e3 When reading input audio in the virtio-snd input callback, virtio_snd_pcm_in_cb(), we do not check whether the iov can actually fit the data buffer. This is because we use the buffer->size field as a total-so-far accumulator instead of byte-size-left like in TX buffers. This triggers an out of bounds write if the size of the virtio queue element is equal to virtio_snd_pcm_status, which makes the available space for audio data zero. This commit adds a check for reaching the maximum buffer size before attempting any writes. Reported-by: Zheyu Ma Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2427 Signed-off-by: Manos Pitsidianakis Message-Id: Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin --- hw/audio/virtio-snd.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/hw/audio/virtio-snd.c b/hw/audio/virtio-snd.c index 137fa77a01..15986af41e 100644 --- a/hw/audio/virtio-snd.c +++ b/hw/audio/virtio-snd.c @@ -1274,7 +1274,7 @@ static void virtio_snd_pcm_in_cb(void *data, int available) { VirtIOSoundPCMStream *stream = data; VirtIOSoundPCMBuffer *buffer; - size_t size; + size_t size, max_size; WITH_QEMU_LOCK_GUARD(&stream->queue_mutex) { while (!QSIMPLEQ_EMPTY(&stream->queue)) { @@ -1288,7 +1288,12 @@ static void virtio_snd_pcm_in_cb(void *data, int available) continue; } + max_size = iov_size(buffer->elem->in_sg, buffer->elem->in_num); for (;;) { + if (buffer->size >= max_size) { + return_rx_buffer(stream, buffer); + break; + } size = AUD_read(stream->voice.in, buffer->data + buffer->size, MIN(available, (stream->params.period_bytes - -- Gitee From ef3d2918827d6c5204af06e1597dc4dbde22414a Mon Sep 17 00:00:00 2001 From: Zhang Jiao Date: Thu, 17 Oct 2024 09:43:01 +0800 Subject: [PATCH 460/939] util/userfaultfd: Remove unused uffd_poll_events chery-pick from ccf6b78275816c9dec84d3a40e9aa3b6ba6ebc06 uffd_poll_events has been unused since it was added; it's also just a wrapper around a plain old poll call, so doesn't add anything. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20240919134626.166183-8-dave@treblig.org Signed-off-by: Peter Xu Signed-off-by: Zhang Jiao --- include/qemu/userfaultfd.h | 1 - util/userfaultfd.c | 28 ---------------------------- 2 files changed, 29 deletions(-) diff --git a/include/qemu/userfaultfd.h b/include/qemu/userfaultfd.h index 18a4314212..a1979308d7 100644 --- a/include/qemu/userfaultfd.h +++ b/include/qemu/userfaultfd.h @@ -39,7 +39,6 @@ int uffd_copy_page(int uffd_fd, void *dst_addr, void *src_addr, int uffd_zero_page(int uffd_fd, void *addr, uint64_t length, bool dont_wake); int uffd_wakeup(int uffd_fd, void *addr, uint64_t length); int uffd_read_events(int uffd_fd, struct uffd_msg *msgs, int count); -bool uffd_poll_events(int uffd_fd, int tmo); #endif /* CONFIG_LINUX */ diff --git a/util/userfaultfd.c b/util/userfaultfd.c index fdff4867e8..b7d320d0b1 100644 --- a/util/userfaultfd.c +++ b/util/userfaultfd.c @@ -356,31 +356,3 @@ int uffd_read_events(int uffd_fd, struct uffd_msg *msgs, int count) return (int) (res / sizeof(struct uffd_msg)); } - -/** - * uffd_poll_events: poll UFFD file descriptor for read - * - * Returns true if events are available for read, false otherwise - * - * @uffd_fd: UFFD file descriptor - * @tmo: timeout value - */ -bool uffd_poll_events(int uffd_fd, int tmo) -{ - int res; - struct pollfd poll_fd = { .fd = uffd_fd, .events = POLLIN, .revents = 0 }; - - do { - res = poll(&poll_fd, 1, tmo); - } while (res < 0 && errno == EINTR); - - if (res == 0) { - return false; - } - if (res < 0) { - error_report("uffd_poll_events() failed: errno=%i", errno); - return false; - } - - return (poll_fd.revents & POLLIN) != 0; -} -- Gitee From 0f62625a0f8b6244203fbd2838b8e9c29efc2eea Mon Sep 17 00:00:00 2001 From: Zhang Jiao Date: Thu, 17 Oct 2024 10:49:54 +0800 Subject: [PATCH 461/939] tests/avocado: fix typo in replay_linux MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cheery-pick from 2d8508bbab39bf342fe80e73c0b528eb3960fa37 Reviewed-by: Pavel Dovgalyuk Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Richard Henderson Signed-off-by: Alex Bennée Message-Id: <20231211091346.14616-3-alex.bennee@linaro.org> Signed-off-by: Zhang 
Jiao --- tests/avocado/replay_linux.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/avocado/replay_linux.py b/tests/avocado/replay_linux.py index 270ccc1eae..e95bff3299 100644 --- a/tests/avocado/replay_linux.py +++ b/tests/avocado/replay_linux.py @@ -94,7 +94,7 @@ def launch_and_wait(self, record, args, shift): else: vm.event_wait('SHUTDOWN', self.timeout) vm.wait() - logger.info('successfully fihished the replay') + logger.info('successfully finished the replay') elapsed = time.time() - start_time logger.info('elapsed time %.2f sec' % elapsed) return elapsed -- Gitee From 0b89dd1ae05d17f0bacbd34218799f00d04c8174 Mon Sep 17 00:00:00 2001 From: Zhang Jiao Date: Thu, 17 Oct 2024 11:13:06 +0800 Subject: [PATCH 462/939] hw/net/can/sja1000: fix bug for single acceptance filter and standard frame cherry-pick from 25145a7d7735344a469551946fc2a7f19eb4aa3d A CAN sja1000 standard frame filter mask has been computed and applied incorrectly for standard frames when single Acceptance Filter Mode (MOD_AFM = 1) has been selected. The problem has not been found by Linux kernel testing because it uses dual filter mode (MOD_AFM = 0) and leaves filters fully open. The problem has been noticed by Grant Ramsay when testing with Zephyr RTOS which uses single filter mode. Signed-off-by: Pavel Pisa Reported-by: Grant Ramsay Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2028 Fixes: 733210e754 ("hw/net/can: SJA1000 chip register level emulation") Message-ID: <20240103231426.5685-1-pisa@fel.cvut.cz> Signed-off-by: Zhang Jiao --- hw/net/can/can_sja1000.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/net/can/can_sja1000.c b/hw/net/can/can_sja1000.c index 73201f9139..575df7d2f8 100644 --- a/hw/net/can/can_sja1000.c +++ b/hw/net/can/can_sja1000.c @@ -108,7 +108,7 @@ void can_sja_single_filter(struct qemu_can_filter *filter, } filter->can_mask = (uint32_t)amr[0] << 3; - filter->can_mask |= (uint32_t)amr[1] << 5; + filter->can_mask |= (uint32_t)amr[1] >> 5; filter->can_mask = ~filter->can_mask & QEMU_CAN_SFF_MASK; if (!(amr[1] & 0x10)) { filter->can_mask |= QEMU_CAN_RTR_FLAG; -- Gitee From 0b23e1ad9e27fa60525b3d014da0425d2c24885f Mon Sep 17 00:00:00 2001 From: Zhang Jiao Date: Thu, 17 Oct 2024 13:19:51 +0800 Subject: [PATCH 463/939] platform-bus: fix refcount leak cherry-pick from 99ec7b440a1d6a6ef07450b68687d24d13a25fb5 memory_region_find() returns an MR which it is the caller's responsibility to unref, but platform_bus_map_mmio() was forgetting to do so, thus leaking the MR.
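For illustration only, the corrected scan follows this shape (a rough sketch of the hunk below, reusing the function's existing variables; the diff is authoritative):

    MemoryRegion *mr = memory_region_find(&pbus->mmio, off, size).mr;
    if (!mr) {
        /* Nothing mapped at this offset: a free slot was found. */
        found_region = true;
    } else {
        /* Slot occupied: drop the reference memory_region_find() took. */
        memory_region_unref(mr);
    }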
Signed-off-by: Gao Shiyuan gaoshiyuan@baidu.com Message-id: 20240829131005.9196-1-gaoshiyuan@baidu.com Reviewed-by: Peter Maydell peter.maydell@linaro.org [PMM: tweaked commit message] Signed-off-by: Peter Maydell peter.maydell@linaro.org Signed-off-by: Zhang Jiao --- hw/core/platform-bus.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/hw/core/platform-bus.c b/hw/core/platform-bus.c index b8487b26b6..dc58bf505a 100644 --- a/hw/core/platform-bus.c +++ b/hw/core/platform-bus.c @@ -145,9 +145,12 @@ static void platform_bus_map_mmio(PlatformBusDevice *pbus, SysBusDevice *sbdev, * the target device's memory region */ for (off = 0; off < pbus->mmio_size; off += alignment) { - if (!memory_region_find(&pbus->mmio, off, size).mr) { + MemoryRegion *mr = memory_region_find(&pbus->mmio, off, size).mr; + if (!mr) { found_region = true; break; + } else { + memory_region_unref(mr); } } -- Gitee From edf3b2b0a9b9aa992592951a979d1b4642026fe5 Mon Sep 17 00:00:00 2001 From: Zhang Jiao Date: Fri, 18 Oct 2024 09:12:50 +0800 Subject: [PATCH 464/939] edu: fix DMA range upper bound check cheery-pick from 2c5107e1b455d4a157124f021826ead4e04b4aea The edu_check_range function checks that start <= end1 < end2, where end1 is the upper bound (exclusive) of the guest-supplied DMA range and end2 is the upper bound (exclusive) of the device's allowed DMA range. When the guest tries to transfer exactly DMA_SIZE (4096) bytes, end1 will be equal to end2, so the check fails and QEMU aborts with this puzzling error message (newlines added for formatting): qemu: hardware error: EDU: DMA range 0x0000000000040000-0x0000000000040fff out of bounds (0x0000000000040000-0x0000000000040fff)! By checking end1 <= end2 instead, guests will be allowed to transfer exactly 4096 bytes. It is not necessary to explicitly check for start <= end1 because the previous two checks (within(addr, start, end2) and end1 > addr) imply start < end1. Fixes: b30934cb52a7 ("hw: misc, add educational driver", 2015-01-21) Signed-off-by: Max Erenberg Signed-off-by: Michael Tokarev Signed-off-by: Zhang Jiao --- hw/misc/edu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/misc/edu.c b/hw/misc/edu.c index a1f8bc77e7..e64a246d3f 100644 --- a/hw/misc/edu.c +++ b/hw/misc/edu.c @@ -115,7 +115,7 @@ static void edu_check_range(uint64_t addr, uint64_t size1, uint64_t start, uint64_t end2 = start + size2; if (within(addr, start, end2) && - end1 > addr && within(end1, start, end2)) { + end1 > addr && end1 <= end2) { return; } -- Gitee From d490ccc1254c7d4dbe8ab40dd78e189108155ae0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=A5=9A=E5=90=9B?= Date: Fri, 18 Oct 2024 10:10:17 +0800 Subject: [PATCH 465/939] dma: Fix function names in documentation Ensure the function names match. Signed-off-by: Akihiko Odaki Message-id: 20241012-dma-v2-1-6afddf5f3c8d@daynix.com Reviewed-by: Peter Maydell Signed-off-by: Peter Maydell Signed-off-by: Zhang Chujun --- include/sysemu/dma.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/include/sysemu/dma.h b/include/sysemu/dma.h index a1ac5bc1b5..5a49a30628 100644 --- a/include/sysemu/dma.h +++ b/include/sysemu/dma.h @@ -152,7 +152,7 @@ static inline MemTxResult dma_memory_read(AddressSpace *as, dma_addr_t addr, } /** - * address_space_write: Write to address space from DMA controller. + * dma_memory_write: Write to address space from DMA controller. 
* * Return a MemTxResult indicating whether the operation succeeded * or failed (eg unassigned memory, device rejected the transaction, @@ -189,7 +189,7 @@ MemTxResult dma_memory_set(AddressSpace *as, dma_addr_t addr, uint8_t c, dma_addr_t len, MemTxAttrs attrs); /** - * address_space_map: Map a physical memory region into a host virtual address. + * dma_memory_map: Map a physical memory region into a host virtual address. * * May map a subset of the requested range, given by and returned in @plen. * May return %NULL and set *@plen to zero(0), if resources needed to perform @@ -216,16 +216,15 @@ static inline void *dma_memory_map(AddressSpace *as, } /** - * address_space_unmap: Unmaps a memory region previously mapped - * by dma_memory_map() + * dma_memory_unmap: Unmaps a memory region previously mapped by dma_memory_map() * * Will also mark the memory as dirty if @dir == %DMA_DIRECTION_FROM_DEVICE. * @access_len gives the amount of memory that was actually read or written * by the caller. * * @as: #AddressSpace used - * @buffer: host pointer as returned by address_space_map() - * @len: buffer length as returned by address_space_map() + * @buffer: host pointer as returned by dma_memory_map() + * @len: buffer length as returned by dma_memory_map() * @dir: indicates the transfer direction * @access_len: amount of data actually transferred */ -- Gitee From fab03a72da74e938a2a476f1824ac0acd4a1fee2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=A5=9A=E5=90=9B?= Date: Fri, 18 Oct 2024 10:17:10 +0800 Subject: [PATCH 466/939] audio/pw: Report more accurate error when connecting to PipeWire fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to its man page [1], pw_context_connect() sets errno on failure: Returns a Core on success or NULL with errno set on error. It may be handy to see errno when figuring out why PipeWire failed to connect. That leaves us with just one possible path to reach 'fail_error' label which is then moved to that path and also its error message is adjusted slightly. 
1: https://docs.pipewire.org/group__pw__core.html#ga5994e3a54e4ec718094ca02a1234815b Signed-off-by: Michal Privoznik Reviewed-by: Manos Pitsidianakis Reviewed-by: Marc-André Lureau Message-ID: <3a78811ad5b0e87816b7616ab21d2eeef00b9c52.1726647033.git.mprivozn@redhat.com> Signed-off-by: Zhang Chujun --- audio/pwaudio.c | 8 +- audio/pwaudio.c.orig | 858 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 862 insertions(+), 4 deletions(-) create mode 100644 audio/pwaudio.c.orig diff --git a/audio/pwaudio.c b/audio/pwaudio.c index 3ce5f6507b..5d1c7126d3 100644 --- a/audio/pwaudio.c +++ b/audio/pwaudio.c @@ -770,13 +770,15 @@ qpw_audio_init(Audiodev *dev, Error **errp) pw->core = pw_context_connect(pw->context, NULL, 0); if (pw->core == NULL) { pw_thread_loop_unlock(pw->thread_loop); - goto fail_error; + error_setg_errno(errp, errno, "Failed to connect to PipeWire instance"); + goto fail; } if (pw_core_add_listener(pw->core, &pw->core_listener, &core_events, pw) < 0) { pw_thread_loop_unlock(pw->thread_loop); - goto fail_error; + error_setg(errp, "Failed to add PipeWire listener"); + goto fail; } if (wait_resync(pw) < 0) { pw_thread_loop_unlock(pw->thread_loop); @@ -786,8 +788,6 @@ qpw_audio_init(Audiodev *dev, Error **errp) return g_steal_pointer(&pw); -fail_error: - error_setg(errp, "Failed to initialize PW context"); fail: if (pw->thread_loop) { pw_thread_loop_stop(pw->thread_loop); diff --git a/audio/pwaudio.c.orig b/audio/pwaudio.c.orig new file mode 100644 index 0000000000..3ce5f6507b --- /dev/null +++ b/audio/pwaudio.c.orig @@ -0,0 +1,858 @@ +/* + * QEMU PipeWire audio driver + * + * Copyright (c) 2023 Red Hat Inc. + * + * Author: Dorinda Bassey + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "qemu/module.h" +#include "audio.h" +#include +#include "qemu/error-report.h" +#include "qapi/error.h" +#include +#include +#include +#include + +#include +#include "trace.h" + +#define AUDIO_CAP "pipewire" +#define RINGBUFFER_SIZE (1u << 22) +#define RINGBUFFER_MASK (RINGBUFFER_SIZE - 1) + +#include "audio_int.h" + +typedef struct pwvolume { + uint32_t channels; + float values[SPA_AUDIO_MAX_CHANNELS]; +} pwvolume; + +typedef struct pwaudio { + Audiodev *dev; + struct pw_thread_loop *thread_loop; + struct pw_context *context; + + struct pw_core *core; + struct spa_hook core_listener; + int last_seq, pending_seq, error; +} pwaudio; + +typedef struct PWVoice { + pwaudio *g; + struct pw_stream *stream; + struct spa_hook stream_listener; + struct spa_audio_info_raw info; + uint32_t highwater_mark; + uint32_t frame_size, req; + struct spa_ringbuffer ring; + uint8_t buffer[RINGBUFFER_SIZE]; + + pwvolume volume; + bool muted; +} PWVoice; + +typedef struct PWVoiceOut { + HWVoiceOut hw; + PWVoice v; +} PWVoiceOut; + +typedef struct PWVoiceIn { + HWVoiceIn hw; + PWVoice v; +} PWVoiceIn; + +#define PW_VOICE_IN(v) ((PWVoiceIn *)v) +#define PW_VOICE_OUT(v) ((PWVoiceOut *)v) + +static void +stream_destroy(void *data) +{ + PWVoice *v = (PWVoice *) data; + spa_hook_remove(&v->stream_listener); + v->stream = NULL; +} + +/* output data processing function to read stuffs from the buffer */ +static void +playback_on_process(void *data) +{ + PWVoice *v = data; + void *p; + struct pw_buffer *b; + struct spa_buffer *buf; + uint32_t req, index, n_bytes; + int32_t avail; + + assert(v->stream); + + /* obtain a buffer to read from */ + b = pw_stream_dequeue_buffer(v->stream); + if (b == NULL) { + error_report("out of buffers: %s", strerror(errno)); + return; + } + + buf = 
b->buffer; + p = buf->datas[0].data; + if (p == NULL) { + return; + } + /* calculate the total no of bytes to read data from buffer */ + req = b->requested * v->frame_size; + if (req == 0) { + req = v->req; + } + n_bytes = SPA_MIN(req, buf->datas[0].maxsize); + + /* get no of available bytes to read data from buffer */ + avail = spa_ringbuffer_get_read_index(&v->ring, &index); + + if (avail <= 0) { + PWVoiceOut *vo = container_of(data, PWVoiceOut, v); + audio_pcm_info_clear_buf(&vo->hw.info, p, n_bytes / v->frame_size); + } else { + if ((uint32_t) avail < n_bytes) { + /* + * PipeWire immediately calls this callback again if we provide + * less than n_bytes. Then audio_pcm_info_clear_buf() fills the + * rest of the buffer with silence. + */ + n_bytes = avail; + } + + spa_ringbuffer_read_data(&v->ring, + v->buffer, RINGBUFFER_SIZE, + index & RINGBUFFER_MASK, p, n_bytes); + + index += n_bytes; + spa_ringbuffer_read_update(&v->ring, index); + + } + buf->datas[0].chunk->offset = 0; + buf->datas[0].chunk->stride = v->frame_size; + buf->datas[0].chunk->size = n_bytes; + + /* queue the buffer for playback */ + pw_stream_queue_buffer(v->stream, b); +} + +/* output data processing function to generate stuffs in the buffer */ +static void +capture_on_process(void *data) +{ + PWVoice *v = (PWVoice *) data; + void *p; + struct pw_buffer *b; + struct spa_buffer *buf; + int32_t filled; + uint32_t index, offs, n_bytes; + + assert(v->stream); + + /* obtain a buffer */ + b = pw_stream_dequeue_buffer(v->stream); + if (b == NULL) { + error_report("out of buffers: %s", strerror(errno)); + return; + } + + /* Write data into buffer */ + buf = b->buffer; + p = buf->datas[0].data; + if (p == NULL) { + return; + } + offs = SPA_MIN(buf->datas[0].chunk->offset, buf->datas[0].maxsize); + n_bytes = SPA_MIN(buf->datas[0].chunk->size, buf->datas[0].maxsize - offs); + + filled = spa_ringbuffer_get_write_index(&v->ring, &index); + + + if (filled < 0) { + error_report("%p: underrun write:%u filled:%d", p, index, filled); + } else { + if ((uint32_t) filled + n_bytes > RINGBUFFER_SIZE) { + error_report("%p: overrun write:%u filled:%d + size:%u > max:%u", + p, index, filled, n_bytes, RINGBUFFER_SIZE); + } + } + spa_ringbuffer_write_data(&v->ring, + v->buffer, RINGBUFFER_SIZE, + index & RINGBUFFER_MASK, + SPA_PTROFF(p, offs, void), n_bytes); + index += n_bytes; + spa_ringbuffer_write_update(&v->ring, index); + + /* queue the buffer for playback */ + pw_stream_queue_buffer(v->stream, b); +} + +static void +on_stream_state_changed(void *data, enum pw_stream_state old, + enum pw_stream_state state, const char *error) +{ + PWVoice *v = (PWVoice *) data; + + trace_pw_state_changed(pw_stream_get_node_id(v->stream), + pw_stream_state_as_string(state)); +} + +static const struct pw_stream_events capture_stream_events = { + PW_VERSION_STREAM_EVENTS, + .destroy = stream_destroy, + .state_changed = on_stream_state_changed, + .process = capture_on_process +}; + +static const struct pw_stream_events playback_stream_events = { + PW_VERSION_STREAM_EVENTS, + .destroy = stream_destroy, + .state_changed = on_stream_state_changed, + .process = playback_on_process +}; + +static size_t +qpw_read(HWVoiceIn *hw, void *data, size_t len) +{ + PWVoiceIn *pw = (PWVoiceIn *) hw; + PWVoice *v = &pw->v; + pwaudio *c = v->g; + const char *error = NULL; + size_t l; + int32_t avail; + uint32_t index; + + pw_thread_loop_lock(c->thread_loop); + if (pw_stream_get_state(v->stream, &error) != PW_STREAM_STATE_STREAMING) { + /* wait for stream to become ready */ + l 
= 0; + goto done_unlock; + } + /* get no of available bytes to read data from buffer */ + avail = spa_ringbuffer_get_read_index(&v->ring, &index); + + trace_pw_read(avail, index, len); + + if (avail < (int32_t) len) { + len = avail; + } + + spa_ringbuffer_read_data(&v->ring, + v->buffer, RINGBUFFER_SIZE, + index & RINGBUFFER_MASK, data, len); + index += len; + spa_ringbuffer_read_update(&v->ring, index); + l = len; + +done_unlock: + pw_thread_loop_unlock(c->thread_loop); + return l; +} + +static size_t qpw_buffer_get_free(HWVoiceOut *hw) +{ + PWVoiceOut *pw = (PWVoiceOut *)hw; + PWVoice *v = &pw->v; + pwaudio *c = v->g; + const char *error = NULL; + int32_t filled, avail; + uint32_t index; + + pw_thread_loop_lock(c->thread_loop); + if (pw_stream_get_state(v->stream, &error) != PW_STREAM_STATE_STREAMING) { + /* wait for stream to become ready */ + avail = 0; + goto done_unlock; + } + + filled = spa_ringbuffer_get_write_index(&v->ring, &index); + avail = v->highwater_mark - filled; + +done_unlock: + pw_thread_loop_unlock(c->thread_loop); + return avail; +} + +static size_t +qpw_write(HWVoiceOut *hw, void *data, size_t len) +{ + PWVoiceOut *pw = (PWVoiceOut *) hw; + PWVoice *v = &pw->v; + pwaudio *c = v->g; + const char *error = NULL; + int32_t filled, avail; + uint32_t index; + + pw_thread_loop_lock(c->thread_loop); + if (pw_stream_get_state(v->stream, &error) != PW_STREAM_STATE_STREAMING) { + /* wait for stream to become ready */ + len = 0; + goto done_unlock; + } + filled = spa_ringbuffer_get_write_index(&v->ring, &index); + avail = v->highwater_mark - filled; + + trace_pw_write(filled, avail, index, len); + + if (len > avail) { + len = avail; + } + + if (filled < 0) { + error_report("%p: underrun write:%u filled:%d", pw, index, filled); + } else { + if ((uint32_t) filled + len > RINGBUFFER_SIZE) { + error_report("%p: overrun write:%u filled:%d + size:%zu > max:%u", + pw, index, filled, len, RINGBUFFER_SIZE); + } + } + + spa_ringbuffer_write_data(&v->ring, + v->buffer, RINGBUFFER_SIZE, + index & RINGBUFFER_MASK, data, len); + index += len; + spa_ringbuffer_write_update(&v->ring, index); + +done_unlock: + pw_thread_loop_unlock(c->thread_loop); + return len; +} + +static int +audfmt_to_pw(AudioFormat fmt, int endianness) +{ + int format; + + switch (fmt) { + case AUDIO_FORMAT_S8: + format = SPA_AUDIO_FORMAT_S8; + break; + case AUDIO_FORMAT_U8: + format = SPA_AUDIO_FORMAT_U8; + break; + case AUDIO_FORMAT_S16: + format = endianness ? SPA_AUDIO_FORMAT_S16_BE : SPA_AUDIO_FORMAT_S16_LE; + break; + case AUDIO_FORMAT_U16: + format = endianness ? SPA_AUDIO_FORMAT_U16_BE : SPA_AUDIO_FORMAT_U16_LE; + break; + case AUDIO_FORMAT_S32: + format = endianness ? SPA_AUDIO_FORMAT_S32_BE : SPA_AUDIO_FORMAT_S32_LE; + break; + case AUDIO_FORMAT_U32: + format = endianness ? SPA_AUDIO_FORMAT_U32_BE : SPA_AUDIO_FORMAT_U32_LE; + break; + case AUDIO_FORMAT_F32: + format = endianness ? 
SPA_AUDIO_FORMAT_F32_BE : SPA_AUDIO_FORMAT_F32_LE; + break; + default: + dolog("Internal logic error: Bad audio format %d\n", fmt); + format = SPA_AUDIO_FORMAT_U8; + break; + } + return format; +} + +static AudioFormat +pw_to_audfmt(enum spa_audio_format fmt, int *endianness, + uint32_t *sample_size) +{ + switch (fmt) { + case SPA_AUDIO_FORMAT_S8: + *sample_size = 1; + return AUDIO_FORMAT_S8; + case SPA_AUDIO_FORMAT_U8: + *sample_size = 1; + return AUDIO_FORMAT_U8; + case SPA_AUDIO_FORMAT_S16_BE: + *sample_size = 2; + *endianness = 1; + return AUDIO_FORMAT_S16; + case SPA_AUDIO_FORMAT_S16_LE: + *sample_size = 2; + *endianness = 0; + return AUDIO_FORMAT_S16; + case SPA_AUDIO_FORMAT_U16_BE: + *sample_size = 2; + *endianness = 1; + return AUDIO_FORMAT_U16; + case SPA_AUDIO_FORMAT_U16_LE: + *sample_size = 2; + *endianness = 0; + return AUDIO_FORMAT_U16; + case SPA_AUDIO_FORMAT_S32_BE: + *sample_size = 4; + *endianness = 1; + return AUDIO_FORMAT_S32; + case SPA_AUDIO_FORMAT_S32_LE: + *sample_size = 4; + *endianness = 0; + return AUDIO_FORMAT_S32; + case SPA_AUDIO_FORMAT_U32_BE: + *sample_size = 4; + *endianness = 1; + return AUDIO_FORMAT_U32; + case SPA_AUDIO_FORMAT_U32_LE: + *sample_size = 4; + *endianness = 0; + return AUDIO_FORMAT_U32; + case SPA_AUDIO_FORMAT_F32_BE: + *sample_size = 4; + *endianness = 1; + return AUDIO_FORMAT_F32; + case SPA_AUDIO_FORMAT_F32_LE: + *sample_size = 4; + *endianness = 0; + return AUDIO_FORMAT_F32; + default: + *sample_size = 1; + dolog("Internal logic error: Bad spa_audio_format %d\n", fmt); + return AUDIO_FORMAT_U8; + } +} + +static int +qpw_stream_new(pwaudio *c, PWVoice *v, const char *stream_name, + const char *name, enum spa_direction dir) +{ + int res; + uint32_t n_params; + const struct spa_pod *params[2]; + uint8_t buffer[1024]; + struct spa_pod_builder b; + uint64_t buf_samples; + struct pw_properties *props; + + props = pw_properties_new(NULL, NULL); + if (!props) { + error_report("Failed to create PW properties: %s", g_strerror(errno)); + return -1; + } + + /* 75% of the timer period for faster updates */ + buf_samples = (uint64_t)v->g->dev->timer_period * v->info.rate + * 3 / 4 / 1000000; + pw_properties_setf(props, PW_KEY_NODE_LATENCY, "%" PRIu64 "/%u", + buf_samples, v->info.rate); + + trace_pw_period(buf_samples, v->info.rate); + if (name) { + pw_properties_set(props, PW_KEY_TARGET_OBJECT, name); + } + v->stream = pw_stream_new(c->core, stream_name, props); + if (v->stream == NULL) { + error_report("Failed to create PW stream: %s", g_strerror(errno)); + return -1; + } + + if (dir == SPA_DIRECTION_INPUT) { + pw_stream_add_listener(v->stream, + &v->stream_listener, &capture_stream_events, v); + } else { + pw_stream_add_listener(v->stream, + &v->stream_listener, &playback_stream_events, v); + } + + n_params = 0; + spa_pod_builder_init(&b, buffer, sizeof(buffer)); + params[n_params++] = spa_format_audio_raw_build(&b, + SPA_PARAM_EnumFormat, + &v->info); + + /* connect the stream to a sink or source */ + res = pw_stream_connect(v->stream, + dir == + SPA_DIRECTION_INPUT ? 
PW_DIRECTION_INPUT : + PW_DIRECTION_OUTPUT, PW_ID_ANY, + PW_STREAM_FLAG_AUTOCONNECT | + PW_STREAM_FLAG_INACTIVE | + PW_STREAM_FLAG_MAP_BUFFERS | + PW_STREAM_FLAG_RT_PROCESS, params, n_params); + if (res < 0) { + error_report("Failed to connect PW stream: %s", g_strerror(errno)); + pw_stream_destroy(v->stream); + return -1; + } + + return 0; +} + +static void +qpw_set_position(uint32_t channels, uint32_t position[SPA_AUDIO_MAX_CHANNELS]) +{ + memcpy(position, (uint32_t[SPA_AUDIO_MAX_CHANNELS]) { SPA_AUDIO_CHANNEL_UNKNOWN, }, + sizeof(uint32_t) * SPA_AUDIO_MAX_CHANNELS); + /* + * TODO: This currently expects the only frontend supporting more than 2 + * channels is the usb-audio. We will need some means to set channel + * order when a new frontend gains multi-channel support. + */ + switch (channels) { + case 8: + position[6] = SPA_AUDIO_CHANNEL_SL; + position[7] = SPA_AUDIO_CHANNEL_SR; + /* fallthrough */ + case 6: + position[2] = SPA_AUDIO_CHANNEL_FC; + position[3] = SPA_AUDIO_CHANNEL_LFE; + position[4] = SPA_AUDIO_CHANNEL_RL; + position[5] = SPA_AUDIO_CHANNEL_RR; + /* fallthrough */ + case 2: + position[0] = SPA_AUDIO_CHANNEL_FL; + position[1] = SPA_AUDIO_CHANNEL_FR; + break; + case 1: + position[0] = SPA_AUDIO_CHANNEL_MONO; + break; + default: + dolog("Internal error: unsupported channel count %d\n", channels); + } +} + +static int +qpw_init_out(HWVoiceOut *hw, struct audsettings *as, void *drv_opaque) +{ + PWVoiceOut *pw = (PWVoiceOut *) hw; + PWVoice *v = &pw->v; + struct audsettings obt_as = *as; + pwaudio *c = v->g = drv_opaque; + AudiodevPipewireOptions *popts = &c->dev->u.pipewire; + AudiodevPipewirePerDirectionOptions *ppdo = popts->out; + int r; + + pw_thread_loop_lock(c->thread_loop); + + v->info.format = audfmt_to_pw(as->fmt, as->endianness); + v->info.channels = as->nchannels; + qpw_set_position(as->nchannels, v->info.position); + v->info.rate = as->freq; + + obt_as.fmt = + pw_to_audfmt(v->info.format, &obt_as.endianness, &v->frame_size); + v->frame_size *= as->nchannels; + + v->req = (uint64_t)c->dev->timer_period * v->info.rate + * 1 / 2 / 1000000 * v->frame_size; + + /* call the function that creates a new stream for playback */ + r = qpw_stream_new(c, v, ppdo->stream_name ? : c->dev->id, + ppdo->name, SPA_DIRECTION_OUTPUT); + if (r < 0) { + pw_thread_loop_unlock(c->thread_loop); + return -1; + } + + /* report the audio format we support */ + audio_pcm_init_info(&hw->info, &obt_as); + + /* report the buffer size to qemu */ + hw->samples = audio_buffer_frames( + qapi_AudiodevPipewirePerDirectionOptions_base(ppdo), &obt_as, 46440); + v->highwater_mark = MIN(RINGBUFFER_SIZE, + (ppdo->has_latency ? 
ppdo->latency : 46440) + * (uint64_t)v->info.rate / 1000000 * v->frame_size); + + pw_thread_loop_unlock(c->thread_loop); + return 0; +} + +static int +qpw_init_in(HWVoiceIn *hw, struct audsettings *as, void *drv_opaque) +{ + PWVoiceIn *pw = (PWVoiceIn *) hw; + PWVoice *v = &pw->v; + struct audsettings obt_as = *as; + pwaudio *c = v->g = drv_opaque; + AudiodevPipewireOptions *popts = &c->dev->u.pipewire; + AudiodevPipewirePerDirectionOptions *ppdo = popts->in; + int r; + + pw_thread_loop_lock(c->thread_loop); + + v->info.format = audfmt_to_pw(as->fmt, as->endianness); + v->info.channels = as->nchannels; + qpw_set_position(as->nchannels, v->info.position); + v->info.rate = as->freq; + + obt_as.fmt = + pw_to_audfmt(v->info.format, &obt_as.endianness, &v->frame_size); + v->frame_size *= as->nchannels; + + /* call the function that creates a new stream for recording */ + r = qpw_stream_new(c, v, ppdo->stream_name ? : c->dev->id, + ppdo->name, SPA_DIRECTION_INPUT); + if (r < 0) { + pw_thread_loop_unlock(c->thread_loop); + return -1; + } + + /* report the audio format we support */ + audio_pcm_init_info(&hw->info, &obt_as); + + /* report the buffer size to qemu */ + hw->samples = audio_buffer_frames( + qapi_AudiodevPipewirePerDirectionOptions_base(ppdo), &obt_as, 46440); + + pw_thread_loop_unlock(c->thread_loop); + return 0; +} + +static void +qpw_voice_fini(PWVoice *v) +{ + pwaudio *c = v->g; + + if (!v->stream) { + return; + } + pw_thread_loop_lock(c->thread_loop); + pw_stream_destroy(v->stream); + v->stream = NULL; + pw_thread_loop_unlock(c->thread_loop); +} + +static void +qpw_fini_out(HWVoiceOut *hw) +{ + qpw_voice_fini(&PW_VOICE_OUT(hw)->v); +} + +static void +qpw_fini_in(HWVoiceIn *hw) +{ + qpw_voice_fini(&PW_VOICE_IN(hw)->v); +} + +static void +qpw_voice_set_enabled(PWVoice *v, bool enable) +{ + pwaudio *c = v->g; + pw_thread_loop_lock(c->thread_loop); + pw_stream_set_active(v->stream, enable); + pw_thread_loop_unlock(c->thread_loop); +} + +static void +qpw_enable_out(HWVoiceOut *hw, bool enable) +{ + qpw_voice_set_enabled(&PW_VOICE_OUT(hw)->v, enable); +} + +static void +qpw_enable_in(HWVoiceIn *hw, bool enable) +{ + qpw_voice_set_enabled(&PW_VOICE_IN(hw)->v, enable); +} + +static void +qpw_voice_set_volume(PWVoice *v, Volume *vol) +{ + pwaudio *c = v->g; + int i, ret; + + pw_thread_loop_lock(c->thread_loop); + v->volume.channels = vol->channels; + + for (i = 0; i < vol->channels; ++i) { + v->volume.values[i] = (float)vol->vol[i] / 255; + } + + ret = pw_stream_set_control(v->stream, + SPA_PROP_channelVolumes, v->volume.channels, v->volume.values, 0); + trace_pw_vol(ret == 0 ? "success" : "failed"); + + v->muted = vol->mute; + float val = v->muted ? 
1.f : 0.f; + ret = pw_stream_set_control(v->stream, SPA_PROP_mute, 1, &val, 0); + pw_thread_loop_unlock(c->thread_loop); +} + +static void +qpw_volume_out(HWVoiceOut *hw, Volume *vol) +{ + qpw_voice_set_volume(&PW_VOICE_OUT(hw)->v, vol); +} + +static void +qpw_volume_in(HWVoiceIn *hw, Volume *vol) +{ + qpw_voice_set_volume(&PW_VOICE_IN(hw)->v, vol); +} + +static int wait_resync(pwaudio *pw) +{ + int res; + pw->pending_seq = pw_core_sync(pw->core, PW_ID_CORE, pw->pending_seq); + + while (true) { + pw_thread_loop_wait(pw->thread_loop); + + res = pw->error; + if (res < 0) { + pw->error = 0; + return res; + } + if (pw->pending_seq == pw->last_seq) { + break; + } + } + return 0; +} + +static void +on_core_error(void *data, uint32_t id, int seq, int res, const char *message) +{ + pwaudio *pw = data; + + error_report("error id:%u seq:%d res:%d (%s): %s", + id, seq, res, spa_strerror(res), message); + + /* stop and exit the thread loop */ + pw_thread_loop_signal(pw->thread_loop, FALSE); +} + +static void +on_core_done(void *data, uint32_t id, int seq) +{ + pwaudio *pw = data; + assert(id == PW_ID_CORE); + pw->last_seq = seq; + if (pw->pending_seq == seq) { + /* stop and exit the thread loop */ + pw_thread_loop_signal(pw->thread_loop, FALSE); + } +} + +static const struct pw_core_events core_events = { + PW_VERSION_CORE_EVENTS, + .done = on_core_done, + .error = on_core_error, +}; + +static void * +qpw_audio_init(Audiodev *dev, Error **errp) +{ + g_autofree pwaudio *pw = g_new0(pwaudio, 1); + + assert(dev->driver == AUDIODEV_DRIVER_PIPEWIRE); + trace_pw_audio_init(); + + pw_init(NULL, NULL); + + pw->dev = dev; + pw->thread_loop = pw_thread_loop_new("PipeWire thread loop", NULL); + if (pw->thread_loop == NULL) { + error_setg_errno(errp, errno, "Could not create PipeWire loop"); + goto fail; + } + + pw->context = + pw_context_new(pw_thread_loop_get_loop(pw->thread_loop), NULL, 0); + if (pw->context == NULL) { + error_setg_errno(errp, errno, "Could not create PipeWire context"); + goto fail; + } + + if (pw_thread_loop_start(pw->thread_loop) < 0) { + error_setg_errno(errp, errno, "Could not start PipeWire loop"); + goto fail; + } + + pw_thread_loop_lock(pw->thread_loop); + + pw->core = pw_context_connect(pw->context, NULL, 0); + if (pw->core == NULL) { + pw_thread_loop_unlock(pw->thread_loop); + goto fail_error; + } + + if (pw_core_add_listener(pw->core, &pw->core_listener, + &core_events, pw) < 0) { + pw_thread_loop_unlock(pw->thread_loop); + goto fail_error; + } + if (wait_resync(pw) < 0) { + pw_thread_loop_unlock(pw->thread_loop); + } + + pw_thread_loop_unlock(pw->thread_loop); + + return g_steal_pointer(&pw); + +fail_error: + error_setg(errp, "Failed to initialize PW context"); +fail: + if (pw->thread_loop) { + pw_thread_loop_stop(pw->thread_loop); + } + g_clear_pointer(&pw->context, pw_context_destroy); + g_clear_pointer(&pw->thread_loop, pw_thread_loop_destroy); + return NULL; +} + +static void +qpw_audio_fini(void *opaque) +{ + pwaudio *pw = opaque; + + if (pw->thread_loop) { + pw_thread_loop_stop(pw->thread_loop); + } + + if (pw->core) { + spa_hook_remove(&pw->core_listener); + spa_zero(pw->core_listener); + pw_core_disconnect(pw->core); + } + + if (pw->context) { + pw_context_destroy(pw->context); + } + pw_thread_loop_destroy(pw->thread_loop); + + g_free(pw); +} + +static struct audio_pcm_ops qpw_pcm_ops = { + .init_out = qpw_init_out, + .fini_out = qpw_fini_out, + .write = qpw_write, + .buffer_get_free = qpw_buffer_get_free, + .run_buffer_out = audio_generic_run_buffer_out, + .enable_out = 
qpw_enable_out, + .volume_out = qpw_volume_out, + .volume_in = qpw_volume_in, + + .init_in = qpw_init_in, + .fini_in = qpw_fini_in, + .read = qpw_read, + .run_buffer_in = audio_generic_run_buffer_in, + .enable_in = qpw_enable_in +}; + +static struct audio_driver pw_audio_driver = { + .name = "pipewire", + .descr = "http://www.pipewire.org/", + .init = qpw_audio_init, + .fini = qpw_audio_fini, + .pcm_ops = &qpw_pcm_ops, + .max_voices_out = INT_MAX, + .max_voices_in = INT_MAX, + .voice_size_out = sizeof(PWVoiceOut), + .voice_size_in = sizeof(PWVoiceIn), +}; + +static void +register_audio_pw(void) +{ + audio_driver_register(&pw_audio_driver); +} + +type_init(register_audio_pw); -- Gitee From 6adb429abb287b3143ed447b334aa89c1a1c0d71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=A5=9A=E5=90=9B?= Date: Fri, 18 Oct 2024 10:29:16 +0800 Subject: [PATCH 467/939] audio/pw: Report more accurate error when connecting to PipeWire fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to its man page [1], pw_context_connect() sets errno on failure: Returns a Core on success or NULL with errno set on error. It may be handy to see errno when figuring out why PipeWire failed to connect. That leaves us with just one possible path to reach 'fail_error' label which is then moved to that path and also its error message is adjusted slightly. 1: https://docs.pipewire.org/group__pw__core.html#ga5994e3a54e4ec718094ca02a1234815b Signed-off-by: Michal Privoznik Reviewed-by: Manos Pitsidianakis Reviewed-by: Marc-André Lureau Message-ID: <3a78811ad5b0e87816b7616ab21d2eeef00b9c52.1726647033.git.mprivozn@redhat.com> Signed-off-by: Zhang Chujun --- audio/pwaudio.c.orig | 858 ------------------------------------------- 1 file changed, 858 deletions(-) delete mode 100644 audio/pwaudio.c.orig diff --git a/audio/pwaudio.c.orig b/audio/pwaudio.c.orig deleted file mode 100644 index 3ce5f6507b..0000000000 --- a/audio/pwaudio.c.orig +++ /dev/null @@ -1,858 +0,0 @@ -/* - * QEMU PipeWire audio driver - * - * Copyright (c) 2023 Red Hat Inc. 
- * - * Author: Dorinda Bassey - * - * SPDX-License-Identifier: GPL-2.0-or-later - */ - -#include "qemu/osdep.h" -#include "qemu/module.h" -#include "audio.h" -#include -#include "qemu/error-report.h" -#include "qapi/error.h" -#include -#include -#include -#include - -#include -#include "trace.h" - -#define AUDIO_CAP "pipewire" -#define RINGBUFFER_SIZE (1u << 22) -#define RINGBUFFER_MASK (RINGBUFFER_SIZE - 1) - -#include "audio_int.h" - -typedef struct pwvolume { - uint32_t channels; - float values[SPA_AUDIO_MAX_CHANNELS]; -} pwvolume; - -typedef struct pwaudio { - Audiodev *dev; - struct pw_thread_loop *thread_loop; - struct pw_context *context; - - struct pw_core *core; - struct spa_hook core_listener; - int last_seq, pending_seq, error; -} pwaudio; - -typedef struct PWVoice { - pwaudio *g; - struct pw_stream *stream; - struct spa_hook stream_listener; - struct spa_audio_info_raw info; - uint32_t highwater_mark; - uint32_t frame_size, req; - struct spa_ringbuffer ring; - uint8_t buffer[RINGBUFFER_SIZE]; - - pwvolume volume; - bool muted; -} PWVoice; - -typedef struct PWVoiceOut { - HWVoiceOut hw; - PWVoice v; -} PWVoiceOut; - -typedef struct PWVoiceIn { - HWVoiceIn hw; - PWVoice v; -} PWVoiceIn; - -#define PW_VOICE_IN(v) ((PWVoiceIn *)v) -#define PW_VOICE_OUT(v) ((PWVoiceOut *)v) - -static void -stream_destroy(void *data) -{ - PWVoice *v = (PWVoice *) data; - spa_hook_remove(&v->stream_listener); - v->stream = NULL; -} - -/* output data processing function to read stuffs from the buffer */ -static void -playback_on_process(void *data) -{ - PWVoice *v = data; - void *p; - struct pw_buffer *b; - struct spa_buffer *buf; - uint32_t req, index, n_bytes; - int32_t avail; - - assert(v->stream); - - /* obtain a buffer to read from */ - b = pw_stream_dequeue_buffer(v->stream); - if (b == NULL) { - error_report("out of buffers: %s", strerror(errno)); - return; - } - - buf = b->buffer; - p = buf->datas[0].data; - if (p == NULL) { - return; - } - /* calculate the total no of bytes to read data from buffer */ - req = b->requested * v->frame_size; - if (req == 0) { - req = v->req; - } - n_bytes = SPA_MIN(req, buf->datas[0].maxsize); - - /* get no of available bytes to read data from buffer */ - avail = spa_ringbuffer_get_read_index(&v->ring, &index); - - if (avail <= 0) { - PWVoiceOut *vo = container_of(data, PWVoiceOut, v); - audio_pcm_info_clear_buf(&vo->hw.info, p, n_bytes / v->frame_size); - } else { - if ((uint32_t) avail < n_bytes) { - /* - * PipeWire immediately calls this callback again if we provide - * less than n_bytes. Then audio_pcm_info_clear_buf() fills the - * rest of the buffer with silence. 
- */ - n_bytes = avail; - } - - spa_ringbuffer_read_data(&v->ring, - v->buffer, RINGBUFFER_SIZE, - index & RINGBUFFER_MASK, p, n_bytes); - - index += n_bytes; - spa_ringbuffer_read_update(&v->ring, index); - - } - buf->datas[0].chunk->offset = 0; - buf->datas[0].chunk->stride = v->frame_size; - buf->datas[0].chunk->size = n_bytes; - - /* queue the buffer for playback */ - pw_stream_queue_buffer(v->stream, b); -} - -/* output data processing function to generate stuffs in the buffer */ -static void -capture_on_process(void *data) -{ - PWVoice *v = (PWVoice *) data; - void *p; - struct pw_buffer *b; - struct spa_buffer *buf; - int32_t filled; - uint32_t index, offs, n_bytes; - - assert(v->stream); - - /* obtain a buffer */ - b = pw_stream_dequeue_buffer(v->stream); - if (b == NULL) { - error_report("out of buffers: %s", strerror(errno)); - return; - } - - /* Write data into buffer */ - buf = b->buffer; - p = buf->datas[0].data; - if (p == NULL) { - return; - } - offs = SPA_MIN(buf->datas[0].chunk->offset, buf->datas[0].maxsize); - n_bytes = SPA_MIN(buf->datas[0].chunk->size, buf->datas[0].maxsize - offs); - - filled = spa_ringbuffer_get_write_index(&v->ring, &index); - - - if (filled < 0) { - error_report("%p: underrun write:%u filled:%d", p, index, filled); - } else { - if ((uint32_t) filled + n_bytes > RINGBUFFER_SIZE) { - error_report("%p: overrun write:%u filled:%d + size:%u > max:%u", - p, index, filled, n_bytes, RINGBUFFER_SIZE); - } - } - spa_ringbuffer_write_data(&v->ring, - v->buffer, RINGBUFFER_SIZE, - index & RINGBUFFER_MASK, - SPA_PTROFF(p, offs, void), n_bytes); - index += n_bytes; - spa_ringbuffer_write_update(&v->ring, index); - - /* queue the buffer for playback */ - pw_stream_queue_buffer(v->stream, b); -} - -static void -on_stream_state_changed(void *data, enum pw_stream_state old, - enum pw_stream_state state, const char *error) -{ - PWVoice *v = (PWVoice *) data; - - trace_pw_state_changed(pw_stream_get_node_id(v->stream), - pw_stream_state_as_string(state)); -} - -static const struct pw_stream_events capture_stream_events = { - PW_VERSION_STREAM_EVENTS, - .destroy = stream_destroy, - .state_changed = on_stream_state_changed, - .process = capture_on_process -}; - -static const struct pw_stream_events playback_stream_events = { - PW_VERSION_STREAM_EVENTS, - .destroy = stream_destroy, - .state_changed = on_stream_state_changed, - .process = playback_on_process -}; - -static size_t -qpw_read(HWVoiceIn *hw, void *data, size_t len) -{ - PWVoiceIn *pw = (PWVoiceIn *) hw; - PWVoice *v = &pw->v; - pwaudio *c = v->g; - const char *error = NULL; - size_t l; - int32_t avail; - uint32_t index; - - pw_thread_loop_lock(c->thread_loop); - if (pw_stream_get_state(v->stream, &error) != PW_STREAM_STATE_STREAMING) { - /* wait for stream to become ready */ - l = 0; - goto done_unlock; - } - /* get no of available bytes to read data from buffer */ - avail = spa_ringbuffer_get_read_index(&v->ring, &index); - - trace_pw_read(avail, index, len); - - if (avail < (int32_t) len) { - len = avail; - } - - spa_ringbuffer_read_data(&v->ring, - v->buffer, RINGBUFFER_SIZE, - index & RINGBUFFER_MASK, data, len); - index += len; - spa_ringbuffer_read_update(&v->ring, index); - l = len; - -done_unlock: - pw_thread_loop_unlock(c->thread_loop); - return l; -} - -static size_t qpw_buffer_get_free(HWVoiceOut *hw) -{ - PWVoiceOut *pw = (PWVoiceOut *)hw; - PWVoice *v = &pw->v; - pwaudio *c = v->g; - const char *error = NULL; - int32_t filled, avail; - uint32_t index; - - pw_thread_loop_lock(c->thread_loop); - if 
(pw_stream_get_state(v->stream, &error) != PW_STREAM_STATE_STREAMING) { - /* wait for stream to become ready */ - avail = 0; - goto done_unlock; - } - - filled = spa_ringbuffer_get_write_index(&v->ring, &index); - avail = v->highwater_mark - filled; - -done_unlock: - pw_thread_loop_unlock(c->thread_loop); - return avail; -} - -static size_t -qpw_write(HWVoiceOut *hw, void *data, size_t len) -{ - PWVoiceOut *pw = (PWVoiceOut *) hw; - PWVoice *v = &pw->v; - pwaudio *c = v->g; - const char *error = NULL; - int32_t filled, avail; - uint32_t index; - - pw_thread_loop_lock(c->thread_loop); - if (pw_stream_get_state(v->stream, &error) != PW_STREAM_STATE_STREAMING) { - /* wait for stream to become ready */ - len = 0; - goto done_unlock; - } - filled = spa_ringbuffer_get_write_index(&v->ring, &index); - avail = v->highwater_mark - filled; - - trace_pw_write(filled, avail, index, len); - - if (len > avail) { - len = avail; - } - - if (filled < 0) { - error_report("%p: underrun write:%u filled:%d", pw, index, filled); - } else { - if ((uint32_t) filled + len > RINGBUFFER_SIZE) { - error_report("%p: overrun write:%u filled:%d + size:%zu > max:%u", - pw, index, filled, len, RINGBUFFER_SIZE); - } - } - - spa_ringbuffer_write_data(&v->ring, - v->buffer, RINGBUFFER_SIZE, - index & RINGBUFFER_MASK, data, len); - index += len; - spa_ringbuffer_write_update(&v->ring, index); - -done_unlock: - pw_thread_loop_unlock(c->thread_loop); - return len; -} - -static int -audfmt_to_pw(AudioFormat fmt, int endianness) -{ - int format; - - switch (fmt) { - case AUDIO_FORMAT_S8: - format = SPA_AUDIO_FORMAT_S8; - break; - case AUDIO_FORMAT_U8: - format = SPA_AUDIO_FORMAT_U8; - break; - case AUDIO_FORMAT_S16: - format = endianness ? SPA_AUDIO_FORMAT_S16_BE : SPA_AUDIO_FORMAT_S16_LE; - break; - case AUDIO_FORMAT_U16: - format = endianness ? SPA_AUDIO_FORMAT_U16_BE : SPA_AUDIO_FORMAT_U16_LE; - break; - case AUDIO_FORMAT_S32: - format = endianness ? SPA_AUDIO_FORMAT_S32_BE : SPA_AUDIO_FORMAT_S32_LE; - break; - case AUDIO_FORMAT_U32: - format = endianness ? SPA_AUDIO_FORMAT_U32_BE : SPA_AUDIO_FORMAT_U32_LE; - break; - case AUDIO_FORMAT_F32: - format = endianness ? 
SPA_AUDIO_FORMAT_F32_BE : SPA_AUDIO_FORMAT_F32_LE; - break; - default: - dolog("Internal logic error: Bad audio format %d\n", fmt); - format = SPA_AUDIO_FORMAT_U8; - break; - } - return format; -} - -static AudioFormat -pw_to_audfmt(enum spa_audio_format fmt, int *endianness, - uint32_t *sample_size) -{ - switch (fmt) { - case SPA_AUDIO_FORMAT_S8: - *sample_size = 1; - return AUDIO_FORMAT_S8; - case SPA_AUDIO_FORMAT_U8: - *sample_size = 1; - return AUDIO_FORMAT_U8; - case SPA_AUDIO_FORMAT_S16_BE: - *sample_size = 2; - *endianness = 1; - return AUDIO_FORMAT_S16; - case SPA_AUDIO_FORMAT_S16_LE: - *sample_size = 2; - *endianness = 0; - return AUDIO_FORMAT_S16; - case SPA_AUDIO_FORMAT_U16_BE: - *sample_size = 2; - *endianness = 1; - return AUDIO_FORMAT_U16; - case SPA_AUDIO_FORMAT_U16_LE: - *sample_size = 2; - *endianness = 0; - return AUDIO_FORMAT_U16; - case SPA_AUDIO_FORMAT_S32_BE: - *sample_size = 4; - *endianness = 1; - return AUDIO_FORMAT_S32; - case SPA_AUDIO_FORMAT_S32_LE: - *sample_size = 4; - *endianness = 0; - return AUDIO_FORMAT_S32; - case SPA_AUDIO_FORMAT_U32_BE: - *sample_size = 4; - *endianness = 1; - return AUDIO_FORMAT_U32; - case SPA_AUDIO_FORMAT_U32_LE: - *sample_size = 4; - *endianness = 0; - return AUDIO_FORMAT_U32; - case SPA_AUDIO_FORMAT_F32_BE: - *sample_size = 4; - *endianness = 1; - return AUDIO_FORMAT_F32; - case SPA_AUDIO_FORMAT_F32_LE: - *sample_size = 4; - *endianness = 0; - return AUDIO_FORMAT_F32; - default: - *sample_size = 1; - dolog("Internal logic error: Bad spa_audio_format %d\n", fmt); - return AUDIO_FORMAT_U8; - } -} - -static int -qpw_stream_new(pwaudio *c, PWVoice *v, const char *stream_name, - const char *name, enum spa_direction dir) -{ - int res; - uint32_t n_params; - const struct spa_pod *params[2]; - uint8_t buffer[1024]; - struct spa_pod_builder b; - uint64_t buf_samples; - struct pw_properties *props; - - props = pw_properties_new(NULL, NULL); - if (!props) { - error_report("Failed to create PW properties: %s", g_strerror(errno)); - return -1; - } - - /* 75% of the timer period for faster updates */ - buf_samples = (uint64_t)v->g->dev->timer_period * v->info.rate - * 3 / 4 / 1000000; - pw_properties_setf(props, PW_KEY_NODE_LATENCY, "%" PRIu64 "/%u", - buf_samples, v->info.rate); - - trace_pw_period(buf_samples, v->info.rate); - if (name) { - pw_properties_set(props, PW_KEY_TARGET_OBJECT, name); - } - v->stream = pw_stream_new(c->core, stream_name, props); - if (v->stream == NULL) { - error_report("Failed to create PW stream: %s", g_strerror(errno)); - return -1; - } - - if (dir == SPA_DIRECTION_INPUT) { - pw_stream_add_listener(v->stream, - &v->stream_listener, &capture_stream_events, v); - } else { - pw_stream_add_listener(v->stream, - &v->stream_listener, &playback_stream_events, v); - } - - n_params = 0; - spa_pod_builder_init(&b, buffer, sizeof(buffer)); - params[n_params++] = spa_format_audio_raw_build(&b, - SPA_PARAM_EnumFormat, - &v->info); - - /* connect the stream to a sink or source */ - res = pw_stream_connect(v->stream, - dir == - SPA_DIRECTION_INPUT ? 
PW_DIRECTION_INPUT : - PW_DIRECTION_OUTPUT, PW_ID_ANY, - PW_STREAM_FLAG_AUTOCONNECT | - PW_STREAM_FLAG_INACTIVE | - PW_STREAM_FLAG_MAP_BUFFERS | - PW_STREAM_FLAG_RT_PROCESS, params, n_params); - if (res < 0) { - error_report("Failed to connect PW stream: %s", g_strerror(errno)); - pw_stream_destroy(v->stream); - return -1; - } - - return 0; -} - -static void -qpw_set_position(uint32_t channels, uint32_t position[SPA_AUDIO_MAX_CHANNELS]) -{ - memcpy(position, (uint32_t[SPA_AUDIO_MAX_CHANNELS]) { SPA_AUDIO_CHANNEL_UNKNOWN, }, - sizeof(uint32_t) * SPA_AUDIO_MAX_CHANNELS); - /* - * TODO: This currently expects the only frontend supporting more than 2 - * channels is the usb-audio. We will need some means to set channel - * order when a new frontend gains multi-channel support. - */ - switch (channels) { - case 8: - position[6] = SPA_AUDIO_CHANNEL_SL; - position[7] = SPA_AUDIO_CHANNEL_SR; - /* fallthrough */ - case 6: - position[2] = SPA_AUDIO_CHANNEL_FC; - position[3] = SPA_AUDIO_CHANNEL_LFE; - position[4] = SPA_AUDIO_CHANNEL_RL; - position[5] = SPA_AUDIO_CHANNEL_RR; - /* fallthrough */ - case 2: - position[0] = SPA_AUDIO_CHANNEL_FL; - position[1] = SPA_AUDIO_CHANNEL_FR; - break; - case 1: - position[0] = SPA_AUDIO_CHANNEL_MONO; - break; - default: - dolog("Internal error: unsupported channel count %d\n", channels); - } -} - -static int -qpw_init_out(HWVoiceOut *hw, struct audsettings *as, void *drv_opaque) -{ - PWVoiceOut *pw = (PWVoiceOut *) hw; - PWVoice *v = &pw->v; - struct audsettings obt_as = *as; - pwaudio *c = v->g = drv_opaque; - AudiodevPipewireOptions *popts = &c->dev->u.pipewire; - AudiodevPipewirePerDirectionOptions *ppdo = popts->out; - int r; - - pw_thread_loop_lock(c->thread_loop); - - v->info.format = audfmt_to_pw(as->fmt, as->endianness); - v->info.channels = as->nchannels; - qpw_set_position(as->nchannels, v->info.position); - v->info.rate = as->freq; - - obt_as.fmt = - pw_to_audfmt(v->info.format, &obt_as.endianness, &v->frame_size); - v->frame_size *= as->nchannels; - - v->req = (uint64_t)c->dev->timer_period * v->info.rate - * 1 / 2 / 1000000 * v->frame_size; - - /* call the function that creates a new stream for playback */ - r = qpw_stream_new(c, v, ppdo->stream_name ? : c->dev->id, - ppdo->name, SPA_DIRECTION_OUTPUT); - if (r < 0) { - pw_thread_loop_unlock(c->thread_loop); - return -1; - } - - /* report the audio format we support */ - audio_pcm_init_info(&hw->info, &obt_as); - - /* report the buffer size to qemu */ - hw->samples = audio_buffer_frames( - qapi_AudiodevPipewirePerDirectionOptions_base(ppdo), &obt_as, 46440); - v->highwater_mark = MIN(RINGBUFFER_SIZE, - (ppdo->has_latency ? 
ppdo->latency : 46440) - * (uint64_t)v->info.rate / 1000000 * v->frame_size); - - pw_thread_loop_unlock(c->thread_loop); - return 0; -} - -static int -qpw_init_in(HWVoiceIn *hw, struct audsettings *as, void *drv_opaque) -{ - PWVoiceIn *pw = (PWVoiceIn *) hw; - PWVoice *v = &pw->v; - struct audsettings obt_as = *as; - pwaudio *c = v->g = drv_opaque; - AudiodevPipewireOptions *popts = &c->dev->u.pipewire; - AudiodevPipewirePerDirectionOptions *ppdo = popts->in; - int r; - - pw_thread_loop_lock(c->thread_loop); - - v->info.format = audfmt_to_pw(as->fmt, as->endianness); - v->info.channels = as->nchannels; - qpw_set_position(as->nchannels, v->info.position); - v->info.rate = as->freq; - - obt_as.fmt = - pw_to_audfmt(v->info.format, &obt_as.endianness, &v->frame_size); - v->frame_size *= as->nchannels; - - /* call the function that creates a new stream for recording */ - r = qpw_stream_new(c, v, ppdo->stream_name ? : c->dev->id, - ppdo->name, SPA_DIRECTION_INPUT); - if (r < 0) { - pw_thread_loop_unlock(c->thread_loop); - return -1; - } - - /* report the audio format we support */ - audio_pcm_init_info(&hw->info, &obt_as); - - /* report the buffer size to qemu */ - hw->samples = audio_buffer_frames( - qapi_AudiodevPipewirePerDirectionOptions_base(ppdo), &obt_as, 46440); - - pw_thread_loop_unlock(c->thread_loop); - return 0; -} - -static void -qpw_voice_fini(PWVoice *v) -{ - pwaudio *c = v->g; - - if (!v->stream) { - return; - } - pw_thread_loop_lock(c->thread_loop); - pw_stream_destroy(v->stream); - v->stream = NULL; - pw_thread_loop_unlock(c->thread_loop); -} - -static void -qpw_fini_out(HWVoiceOut *hw) -{ - qpw_voice_fini(&PW_VOICE_OUT(hw)->v); -} - -static void -qpw_fini_in(HWVoiceIn *hw) -{ - qpw_voice_fini(&PW_VOICE_IN(hw)->v); -} - -static void -qpw_voice_set_enabled(PWVoice *v, bool enable) -{ - pwaudio *c = v->g; - pw_thread_loop_lock(c->thread_loop); - pw_stream_set_active(v->stream, enable); - pw_thread_loop_unlock(c->thread_loop); -} - -static void -qpw_enable_out(HWVoiceOut *hw, bool enable) -{ - qpw_voice_set_enabled(&PW_VOICE_OUT(hw)->v, enable); -} - -static void -qpw_enable_in(HWVoiceIn *hw, bool enable) -{ - qpw_voice_set_enabled(&PW_VOICE_IN(hw)->v, enable); -} - -static void -qpw_voice_set_volume(PWVoice *v, Volume *vol) -{ - pwaudio *c = v->g; - int i, ret; - - pw_thread_loop_lock(c->thread_loop); - v->volume.channels = vol->channels; - - for (i = 0; i < vol->channels; ++i) { - v->volume.values[i] = (float)vol->vol[i] / 255; - } - - ret = pw_stream_set_control(v->stream, - SPA_PROP_channelVolumes, v->volume.channels, v->volume.values, 0); - trace_pw_vol(ret == 0 ? "success" : "failed"); - - v->muted = vol->mute; - float val = v->muted ? 
1.f : 0.f; - ret = pw_stream_set_control(v->stream, SPA_PROP_mute, 1, &val, 0); - pw_thread_loop_unlock(c->thread_loop); -} - -static void -qpw_volume_out(HWVoiceOut *hw, Volume *vol) -{ - qpw_voice_set_volume(&PW_VOICE_OUT(hw)->v, vol); -} - -static void -qpw_volume_in(HWVoiceIn *hw, Volume *vol) -{ - qpw_voice_set_volume(&PW_VOICE_IN(hw)->v, vol); -} - -static int wait_resync(pwaudio *pw) -{ - int res; - pw->pending_seq = pw_core_sync(pw->core, PW_ID_CORE, pw->pending_seq); - - while (true) { - pw_thread_loop_wait(pw->thread_loop); - - res = pw->error; - if (res < 0) { - pw->error = 0; - return res; - } - if (pw->pending_seq == pw->last_seq) { - break; - } - } - return 0; -} - -static void -on_core_error(void *data, uint32_t id, int seq, int res, const char *message) -{ - pwaudio *pw = data; - - error_report("error id:%u seq:%d res:%d (%s): %s", - id, seq, res, spa_strerror(res), message); - - /* stop and exit the thread loop */ - pw_thread_loop_signal(pw->thread_loop, FALSE); -} - -static void -on_core_done(void *data, uint32_t id, int seq) -{ - pwaudio *pw = data; - assert(id == PW_ID_CORE); - pw->last_seq = seq; - if (pw->pending_seq == seq) { - /* stop and exit the thread loop */ - pw_thread_loop_signal(pw->thread_loop, FALSE); - } -} - -static const struct pw_core_events core_events = { - PW_VERSION_CORE_EVENTS, - .done = on_core_done, - .error = on_core_error, -}; - -static void * -qpw_audio_init(Audiodev *dev, Error **errp) -{ - g_autofree pwaudio *pw = g_new0(pwaudio, 1); - - assert(dev->driver == AUDIODEV_DRIVER_PIPEWIRE); - trace_pw_audio_init(); - - pw_init(NULL, NULL); - - pw->dev = dev; - pw->thread_loop = pw_thread_loop_new("PipeWire thread loop", NULL); - if (pw->thread_loop == NULL) { - error_setg_errno(errp, errno, "Could not create PipeWire loop"); - goto fail; - } - - pw->context = - pw_context_new(pw_thread_loop_get_loop(pw->thread_loop), NULL, 0); - if (pw->context == NULL) { - error_setg_errno(errp, errno, "Could not create PipeWire context"); - goto fail; - } - - if (pw_thread_loop_start(pw->thread_loop) < 0) { - error_setg_errno(errp, errno, "Could not start PipeWire loop"); - goto fail; - } - - pw_thread_loop_lock(pw->thread_loop); - - pw->core = pw_context_connect(pw->context, NULL, 0); - if (pw->core == NULL) { - pw_thread_loop_unlock(pw->thread_loop); - goto fail_error; - } - - if (pw_core_add_listener(pw->core, &pw->core_listener, - &core_events, pw) < 0) { - pw_thread_loop_unlock(pw->thread_loop); - goto fail_error; - } - if (wait_resync(pw) < 0) { - pw_thread_loop_unlock(pw->thread_loop); - } - - pw_thread_loop_unlock(pw->thread_loop); - - return g_steal_pointer(&pw); - -fail_error: - error_setg(errp, "Failed to initialize PW context"); -fail: - if (pw->thread_loop) { - pw_thread_loop_stop(pw->thread_loop); - } - g_clear_pointer(&pw->context, pw_context_destroy); - g_clear_pointer(&pw->thread_loop, pw_thread_loop_destroy); - return NULL; -} - -static void -qpw_audio_fini(void *opaque) -{ - pwaudio *pw = opaque; - - if (pw->thread_loop) { - pw_thread_loop_stop(pw->thread_loop); - } - - if (pw->core) { - spa_hook_remove(&pw->core_listener); - spa_zero(pw->core_listener); - pw_core_disconnect(pw->core); - } - - if (pw->context) { - pw_context_destroy(pw->context); - } - pw_thread_loop_destroy(pw->thread_loop); - - g_free(pw); -} - -static struct audio_pcm_ops qpw_pcm_ops = { - .init_out = qpw_init_out, - .fini_out = qpw_fini_out, - .write = qpw_write, - .buffer_get_free = qpw_buffer_get_free, - .run_buffer_out = audio_generic_run_buffer_out, - .enable_out = 
qpw_enable_out, - .volume_out = qpw_volume_out, - .volume_in = qpw_volume_in, - - .init_in = qpw_init_in, - .fini_in = qpw_fini_in, - .read = qpw_read, - .run_buffer_in = audio_generic_run_buffer_in, - .enable_in = qpw_enable_in -}; - -static struct audio_driver pw_audio_driver = { - .name = "pipewire", - .descr = "http://www.pipewire.org/", - .init = qpw_audio_init, - .fini = qpw_audio_fini, - .pcm_ops = &qpw_pcm_ops, - .max_voices_out = INT_MAX, - .max_voices_in = INT_MAX, - .voice_size_out = sizeof(PWVoiceOut), - .voice_size_in = sizeof(PWVoiceIn), -}; - -static void -register_audio_pw(void) -{ - audio_driver_register(&pw_audio_driver); -} - -type_init(register_audio_pw); -- Gitee From ac7182ca1b9ed7dbb524da734a9f426b2ca07503 Mon Sep 17 00:00:00 2001 From: Zhang Jiao Date: Mon, 21 Oct 2024 09:48:30 +0800 Subject: [PATCH 468/939] docs/tools/qemu-img.rst: fix typo (sumarizes) cheery-pick from 8a8be21dde814e7cef43acac8140a7ccd0c4f6fb Signed-off-by: Samuel Tardieu Reviewed-by: Zhao Liu Signed-off-by: Michael Tokarev Signed-off-by: Zhang Jiao --- docs/tools/qemu-img.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tools/qemu-img.rst b/docs/tools/qemu-img.rst index 4459c065f1..3653adb963 100644 --- a/docs/tools/qemu-img.rst +++ b/docs/tools/qemu-img.rst @@ -406,7 +406,7 @@ Command description: Compare exits with ``0`` in case the images are equal and with ``1`` in case the images differ. Other exit codes mean an error occurred during execution and standard error output should contain an error message. - The following table sumarizes all exit codes of the compare subcommand: + The following table summarizes all exit codes of the compare subcommand: 0 Images are identical (or requested help was printed) -- Gitee From 358b772c1289c1bf42dfe8c62b04b8a28d60ebf1 Mon Sep 17 00:00:00 2001 From: Zhang Jiao Date: Mon, 21 Oct 2024 14:28:13 +0800 Subject: [PATCH 469/939] s390x/sclp: Simplify get_sclp_device() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cheery-pick from 3d9836e46dbe1e46c39fe76a62d3085a71ddbf7a get_sclp_device() scans the whole machine to find a TYPE_SCLP object. Now that the SCLPDevice instance is available under the machine state, use it to simplify the lookup. While at it, remove the inline to let the compiler decide on how to optimize. 
Signed-off-by: Cédric Le Goater Message-ID: <20240502131533.377719-4-clg@redhat.com> Reviewed-by: Thomas Huth Signed-off-by: Thomas Huth Signed-off-by: Zhang Jiao --- hw/s390x/sclp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/hw/s390x/sclp.c b/hw/s390x/sclp.c index 893e71a41b..69bf04e23a 100644 --- a/hw/s390x/sclp.c +++ b/hw/s390x/sclp.c @@ -21,13 +21,14 @@ #include "hw/s390x/s390-pci-bus.h" #include "hw/s390x/ipl.h" #include "hw/s390x/cpu-topology.h" +#include "hw/s390x/s390-virtio-ccw.h" -static inline SCLPDevice *get_sclp_device(void) +static SCLPDevice *get_sclp_device(void) { static SCLPDevice *sclp; if (!sclp) { - sclp = SCLP(object_resolve_path_type("", TYPE_SCLP, NULL)); + sclp = S390_CCW_MACHINE(qdev_get_machine())->sclp; } return sclp; } -- Gitee From c761dac5d72f0d7c4643125e0611c75334b4ec4e Mon Sep 17 00:00:00 2001 From: Zhang Jiao Date: Mon, 21 Oct 2024 15:58:54 +0800 Subject: [PATCH 470/939] ui: remove break after g_assert_not_reached() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cheery-pick from b3372e0ec818d7747963a2ec7ae04fd1a8152afd Use of assert(false) can trip spurious control flow warnings from some versions of GCC (i.e. using -fsanitize=thread with gcc-12): error: control reaches end of non-void function [-Werror=return-type] default: g_assert_not_reached(); break; | ^^^^^ Solve that by removing the unreachable 'break' statement, unifying the code base on g_assert_not_reached() instead. Signed-off-by: Pierrick Bouvier Reviewed-by: Richard Henderson Reviewed-by: Philippe Mathieu-Daudé Message-ID: <20240910221606.1817478-37-pierrick.bouvier@linaro.org> [PMD: Add description suggested by Eric Blake] Signed-off-by: Philippe Mathieu-Daudé Signed-off-by: Zhang Jiao --- ui/qemu-pixman.c | 1 - 1 file changed, 1 deletion(-) diff --git a/ui/qemu-pixman.c b/ui/qemu-pixman.c index 5ca55dd199..6cada8b45e 100644 --- a/ui/qemu-pixman.c +++ b/ui/qemu-pixman.c @@ -49,7 +49,6 @@ PixelFormat qemu_pixelformat_from_pixman(pixman_format_code_t format) break; default: g_assert_not_reached(); - break; } pf.amax = (1 << pf.abits) - 1; -- Gitee From ddaa38853d386e5b9f9fa1c3813048872c8ad687 Mon Sep 17 00:00:00 2001 From: niuyongwen Date: Sun, 29 Sep 2024 09:45:15 +0800 Subject: [PATCH 471/939] hw/misc/psp: Pin the hugepage memory specified by mem2 during use for psp Signed-off-by: niuyongwen --- hw/misc/psp.c | 138 +++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 121 insertions(+), 17 deletions(-) diff --git a/hw/misc/psp.c b/hw/misc/psp.c index 4eb5ca0e0b..03e8663027 100644 --- a/hw/misc/psp.c +++ b/hw/misc/psp.c @@ -17,6 +17,7 @@ #include "sysemu/runstate.h" #include "exec/memory.h" #include "exec/address-spaces.h" +#include "exec/ramblock.h" #include "hw/i386/e820_memory_layout.h" #include @@ -38,6 +39,8 @@ struct PSPDevState { * the TKM module uses different key spaces based on different vids. 
*/ uint32_t vid; + /* pinned hugepage numbers */ + int hp_num; }; #define PSP_DEV_PATH "/dev/hygon_psp_config" @@ -45,6 +48,8 @@ struct PSPDevState { #define PSP_IOC_MUTEX_ENABLE _IOWR(HYGON_PSP_IOC_TYPE, 1, NULL) #define PSP_IOC_MUTEX_DISABLE _IOWR(HYGON_PSP_IOC_TYPE, 2, NULL) #define PSP_IOC_VPSP_OPT _IOWR(HYGON_PSP_IOC_TYPE, 3, NULL) +#define PSP_IOC_PIN_USER_PAGE _IOWR(HYGON_PSP_IOC_TYPE, 4, NULL) +#define PSP_IOC_UNPIN_USER_PAGE _IOWR(HYGON_PSP_IOC_TYPE, 5, NULL) enum VPSP_DEV_CTRL_OPCODE { VPSP_OP_VID_ADD, @@ -69,6 +74,109 @@ struct psp_dev_ctrl { } __attribute__ ((packed)) data; }; +static MemoryRegion *find_memory_region_by_name(MemoryRegion *root, const char *name) { + MemoryRegion *subregion; + MemoryRegion *result; + + if (strcmp(root->name, name) == 0) + return root; + + QTAILQ_FOREACH(subregion, &root->subregions, subregions_link) { + result = find_memory_region_by_name(subregion, name); + if (result) { + return result; + } + } + + return NULL; +} + +static int pin_user_hugepage(int fd, uint64_t vaddr) +{ + int ret; + + ret = ioctl(fd, PSP_IOC_PIN_USER_PAGE, vaddr); + /* 22: Invalid argument, some old kernel doesn't support this ioctl command */ + if (ret != 0 && errno == EINVAL) { + ret = 0; + } + return ret; +} + +static int unpin_user_hugepage(int fd, uint64_t vaddr) +{ + int ret; + + ret = ioctl(fd, PSP_IOC_UNPIN_USER_PAGE, vaddr); + /* 22: Invalid argument, some old kernel doesn't support this ioctl command */ + if (ret != 0 && errno == EINVAL) { + ret = 0; + } + return ret; +} + +static int pin_psp_user_hugepages(struct PSPDevState *state, MemoryRegion *root) +{ + int ret = 0; + char mr_name[128] = {0}; + int i, pinned_num; + MemoryRegion *find_mr = NULL; + + for (i = 0 ; i < state->hp_num; ++i) { + sprintf(mr_name, "mem2-%d", i); + find_mr = find_memory_region_by_name(root, mr_name); + if (!find_mr) { + error_report("fail to find memory region by name %s.", mr_name); + ret = -ENOMEM; + goto end; + } + + ret = pin_user_hugepage(state->dev_fd, (uint64_t)find_mr->ram_block->host); + if (ret) { + error_report("fail to pin_user_hugepage, ret: %d.", ret); + goto end; + } + } +end: + if (ret) { + pinned_num = i; + for (i = 0 ; i < pinned_num; ++i) { + sprintf(mr_name, "mem2-%d", i); + find_mr = find_memory_region_by_name(root, mr_name); + if (!find_mr) { + continue; + } + unpin_user_hugepage(state->dev_fd, (uint64_t)find_mr->ram_block->host); + } + + } + return ret; +} + +static int unpin_psp_user_hugepages(struct PSPDevState *state, MemoryRegion *root) +{ + int ret = 0; + char mr_name[128] = {0}; + int i; + MemoryRegion *find_mr = NULL; + + for (i = 0 ; i < state->hp_num; ++i) { + sprintf(mr_name, "mem2-%d", i); + find_mr = find_memory_region_by_name(root, mr_name); + if (!find_mr) { + continue; + } + + ret = unpin_user_hugepage(state->dev_fd, (uint64_t)find_mr->ram_block->host); + if (ret) { + error_report("fail to unpin_user_hugepage, ret: %d.", ret); + goto end; + } + } +end: + return ret; +} + static void psp_dev_destroy(PSPDevState *state) { struct psp_dev_ctrl ctrl = { 0 }; @@ -77,6 +185,11 @@ static void psp_dev_destroy(PSPDevState *state) ctrl.op = VPSP_OP_VID_DEL; if (ioctl(state->dev_fd, PSP_IOC_VPSP_OPT, &ctrl) < 0) { error_report("VPSP_OP_VID_DEL: %d", -errno); + } + + /* Unpin hugepage memory */ + if (unpin_psp_user_hugepages(state, get_system_memory())) { + error_report("unpin_psp_user_hugepages failed"); } else { state->enabled = false; } @@ -99,23 +212,6 @@ static void psp_dev_shutdown_notify(Notifier *notifier, void *data) psp_dev_destroy(state); } -static 
MemoryRegion *find_memory_region_by_name(MemoryRegion *root, const char *name) { - MemoryRegion *subregion; - MemoryRegion *result; - - if (strcmp(root->name, name) == 0) - return root; - - QTAILQ_FOREACH(subregion, &root->subregions, subregions_link) { - result = find_memory_region_by_name(subregion, name); - if (result) { - return result; - } - } - - return NULL; -} - static void psp_dev_realize(DeviceState *dev, Error **errp) { int i; @@ -150,6 +246,8 @@ static void psp_dev_realize(DeviceState *dev, Error **errp) ram2_end = find_mr->addr + find_mr->size - 1; } + state->hp_num = i; + if (ram2_start != ram2_end) { ctrl.op = VPSP_OP_SET_GPA; ctrl.data.gpa.gpa_start = ram2_start; @@ -159,6 +257,12 @@ static void psp_dev_realize(DeviceState *dev, Error **errp) ram2_start, ram2_end, -errno); goto del_vid; } + + /* Pin hugepage memory */ + if(pin_psp_user_hugepages(state, root_mr)) { + error_setg(errp, "pin_psp_user_hugepages failed."); + goto del_vid; + } } state->enabled = true; -- Gitee From 55ea1e473095ea5be692bb4ba2e44131a4a88e73 Mon Sep 17 00:00:00 2001 From: Zhang Jiao Date: Wed, 23 Oct 2024 13:40:51 +0800 Subject: [PATCH 472/939] hw/pci-bridge: Add a Kconfig switch for the normal PCI bridge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cheery-pick from e779e5c05ad5d8237e2a7d8ba8b432cd24c1708b The pci-bridge device is not usable on s390x, so introduce a Kconfig switch that allows to disable it. Message-ID: <20240913144844.427899-1-thuth@redhat.com> Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Cédric Le Goater Signed-off-by: Thomas Huth Signed-off-by: Zhang Jiao --- hw/pci-bridge/Kconfig | 5 +++++ hw/pci-bridge/meson.build | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/hw/pci-bridge/Kconfig b/hw/pci-bridge/Kconfig index 67077366cc..449ec98643 100644 --- a/hw/pci-bridge/Kconfig +++ b/hw/pci-bridge/Kconfig @@ -1,3 +1,8 @@ +config PCI_BRIDGE + bool + default y if PCI_DEVICES + depends on PCI + config PCIE_PORT bool default y if PCI_DEVICES diff --git a/hw/pci-bridge/meson.build b/hw/pci-bridge/meson.build index 6d5ad9f37b..a8b88e9099 100644 --- a/hw/pci-bridge/meson.build +++ b/hw/pci-bridge/meson.build @@ -1,5 +1,5 @@ pci_ss = ss.source_set() -pci_ss.add(files('pci_bridge_dev.c')) +pci_ss.add(when: 'CONFIG_PCI_BRIDGE', if_true: files('pci_bridge_dev.c')) pci_ss.add(when: 'CONFIG_I82801B11', if_true: files('i82801b11.c')) pci_ss.add(when: 'CONFIG_IOH3420', if_true: files('ioh3420.c')) pci_ss.add(when: 'CONFIG_PCIE_PORT', if_true: files('pcie_root_port.c', 'gen_pcie_root_port.c')) -- Gitee From 15b6c032ed2f92aa3210fe30376119eb468af039 Mon Sep 17 00:00:00 2001 From: Zhang Jiao Date: Wed, 23 Oct 2024 14:19:00 +0800 Subject: [PATCH 473/939] hw/intc/openpic: Improve errors for out of bounds property values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cheery-pick from 627c1e012cb3f14745f9b7d991642894a4402d5c The error message doesn't matter much, as the "openpic" device isn't user-creatable. But it's the last use of QERR_PROPERTY_VALUE_OUT_OF_RANGE, which has to go. Change the message just like the previous commit did for x86 CPUs. Signed-off-by: Markus Armbruster Message-ID: <20241010150144.986655-7-armbru@redhat.com> Reviewed-by: Daniel P. 
Berrangé Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Zhang Jiao --- hw/intc/openpic.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/hw/intc/openpic.c b/hw/intc/openpic.c index a6f91d4bcd..0f99b77a17 100644 --- a/hw/intc/openpic.c +++ b/hw/intc/openpic.c @@ -41,7 +41,6 @@ #include "hw/pci/msi.h" #include "qapi/error.h" #include "qemu/bitops.h" -#include "qapi/qmp/qerror.h" #include "qemu/module.h" #include "qemu/timer.h" #include "qemu/error-report.h" @@ -1535,9 +1534,7 @@ static void openpic_realize(DeviceState *dev, Error **errp) }; if (opp->nb_cpus > MAX_CPU) { - error_setg(errp, QERR_PROPERTY_VALUE_OUT_OF_RANGE, - TYPE_OPENPIC, "nb_cpus", (uint64_t)opp->nb_cpus, - (uint64_t)0, (uint64_t)MAX_CPU); + error_setg(errp, "property 'nb_cpus' can be at most %d", MAX_CPU); return; } -- Gitee From 07fa80eacaa17d3cc3865050244b79d39cc61944 Mon Sep 17 00:00:00 2001 From: Zhang Jiao Date: Wed, 23 Oct 2024 14:34:56 +0800 Subject: [PATCH 474/939] acpi: ged: Add macro for acpi sleep control register cheery-pick from edafc90ba481c586d0a649f34dcb8cd1f29c4259 Macro definition is added for acpi sleep control register, ged emulation driver can use the macro , also it can be used in FDT table if ged is exposed with FDT table. Signed-off-by: Bibo Mao Reviewed-by: Igor Mammedov Message-Id: <20240918014206.2165821-2-maobibo@loongson.cn> Signed-off-by: Song Gao Signed-off-by: Zhang Jiao --- hw/acpi/generic_event_device.c | 6 +++--- include/hw/acpi/generic_event_device.h | 7 +++++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/hw/acpi/generic_event_device.c b/hw/acpi/generic_event_device.c index 4731a614a3..2ce7031f1a 100644 --- a/hw/acpi/generic_event_device.c +++ b/hw/acpi/generic_event_device.c @@ -203,9 +203,9 @@ static void ged_regs_write(void *opaque, hwaddr addr, uint64_t data, switch (addr) { case ACPI_GED_REG_SLEEP_CTL: - slp_typ = (data >> 2) & 0x07; - slp_en = (data >> 5) & 0x01; - if (slp_en && slp_typ == 5) { + slp_typ = (data >> ACPI_GED_SLP_TYP_POS) & ACPI_GED_SLP_TYP_MASK; + slp_en = !!(data & ACPI_GED_SLP_EN); + if (slp_en && slp_typ == ACPI_GED_SLP_TYP_S5) { qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN); } return; diff --git a/include/hw/acpi/generic_event_device.h b/include/hw/acpi/generic_event_device.h index 90fc41cbb8..8ed9534c57 100644 --- a/include/hw/acpi/generic_event_device.h +++ b/include/hw/acpi/generic_event_device.h @@ -81,8 +81,11 @@ OBJECT_DECLARE_SIMPLE_TYPE(AcpiGedState, ACPI_GED) /* ACPI_GED_REG_RESET value for reset*/ #define ACPI_GED_RESET_VALUE 0x42 -/* ACPI_GED_REG_SLEEP_CTL.SLP_TYP value for S5 (aka poweroff) */ -#define ACPI_GED_SLP_TYP_S5 0x05 +/* [ACPI 5.0 Chapter 4.8.3.7] Sleep Control and Status Register */ +#define ACPI_GED_SLP_TYP_POS 0x2 /* SLP_TYPx Bit Offset */ +#define ACPI_GED_SLP_TYP_MASK 0x07 /* SLP_TYPx 3-bit mask */ +#define ACPI_GED_SLP_TYP_S5 0x05 /* System _S5 State (Soft Off) */ +#define ACPI_GED_SLP_EN 0x20 /* SLP_EN write-only bit */ #define GED_DEVICE "GED" #define AML_GED_EVT_REG "EREG" -- Gitee From c4d618ea0dc507084d9c1e2b61e58691a73c2cf4 Mon Sep 17 00:00:00 2001 From: Susanooo Date: Thu, 24 Oct 2024 10:10:34 +0800 Subject: [PATCH 475/939] tests: Wait for migration completion on destination QEMU to avoid failures MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rather than waiting for the completion of migration on the source side, wait for it on the destination QEMU side to avoid accessing the TPM TIS memory mapped registers before QEMU could 
restore their state. This error condition could be triggered on busy systems where the destination QEMU did not have enough time to restore the TIS state while the test case was already reading its registers. The test case was for example reading the STS register and received an unexpected value (0xffffffff), which led to a segmentation fault later on due to trying to read 0xffff bytes from the TIS into a buffer. Cc: Reported-by: Fabiano Rosas Reviewed-by: Daniel P. Berrangé Signed-off-by: Stefan Berger Signed-off-by: zhangchujun --- tests/qtest/tpm-tests.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/qtest/tpm-tests.c b/tests/qtest/tpm-tests.c index fb94496bbd..197714f8d9 100644 --- a/tests/qtest/tpm-tests.c +++ b/tests/qtest/tpm-tests.c @@ -114,7 +114,7 @@ void tpm_test_swtpm_migration_test(const char *src_tpm_path, sizeof(tpm_pcrread_resp)); tpm_util_migrate(src_qemu, uri); - tpm_util_wait_for_migration_complete(src_qemu); + tpm_util_wait_for_migration_complete(dst_qemu); tpm_util_pcrread(dst_qemu, tx, tpm_pcrread_resp, sizeof(tpm_pcrread_resp)); -- Gitee From cbd62b91ecdd0ec5f4ccb4c726e0adcdd2808270 Mon Sep 17 00:00:00 2001 From: Susanooo Date: Thu, 24 Oct 2024 10:34:17 +0800 Subject: [PATCH 476/939] hw/loongarch/virt: Remove unnecessary 'cpu.h' inclusion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Song Gao Reviewed-by: Thomas Huth Message-Id: <20240927213254.17552-2-philmd@linaro.org> Signed-off-by: Song Gao Signed-off-by: zhangchujun --- include/hw/loongarch/virt.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/hw/loongarch/virt.h b/include/hw/loongarch/virt.h index 99447fd1d6..17a792e596 100644 --- a/include/hw/loongarch/virt.h +++ b/include/hw/loongarch/virt.h @@ -8,7 +8,6 @@ #ifndef HW_LOONGARCH_H #define HW_LOONGARCH_H -#include "target/loongarch/cpu.h" #include "hw/boards.h" #include "qemu/queue.h" #include "hw/intc/loongarch_ipi.h" -- Gitee From 16fb3ec642af7ec7980b7ceff1b25abee3fecee2 Mon Sep 17 00:00:00 2001 From: Susanooo Date: Fri, 25 Oct 2024 09:20:38 +0800 Subject: [PATCH 477/939] raw-format: Fix error message for invalid offset/size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit s->offset and s->size are only set at the end of the function and still contain the old values when formatting the error message. Print the parameters with the new values that we actually checked instead. Fixes: 500e243 ('raw-format: Split raw_read_options()') Signed-off-by: Kevin Wolf Message-ID: <20240829185527.47152-1-kwolf@redhat.com> Reviewed-by: Daniel P. 
Berrangé Reviewed-by: Hanna Czenczek Signed-off-by: Kevin Wolf Signed-off-by: zhangchujun --- block/raw-format.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/raw-format.c b/block/raw-format.c index 1111dffd54..8195ed87cc 100644 --- a/block/raw-format.c +++ b/block/raw-format.c @@ -111,7 +111,7 @@ raw_apply_options(BlockDriverState *bs, BDRVRawState *s, uint64_t offset, if (offset > real_size) { error_setg(errp, "Offset (%" PRIu64 ") cannot be greater than " "size of the containing file (%" PRId64 ")", - s->offset, real_size); + offset, real_size); return -EINVAL; } @@ -119,7 +119,7 @@ raw_apply_options(BlockDriverState *bs, BDRVRawState *s, uint64_t offset, error_setg(errp, "The sum of offset (%" PRIu64 ") and size " "(%" PRIu64 ") has to be smaller or equal to the " " actual size of the containing file (%" PRId64 ")", - s->offset, s->size, real_size); + offset, size, real_size); return -EINVAL; } -- Gitee From 34af051406f75bdef6f2ef598cde51e756ea8489 Mon Sep 17 00:00:00 2001 From: Susanooo Date: Fri, 25 Oct 2024 09:26:25 +0800 Subject: [PATCH 478/939] linux-user: Clean up unused header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Clean up unused (already commented-out) header from syscall.c. Signed-off-by: Gustavo Romero Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Richard Henderson Reviewed-by: Michael Tokarev Signed-off-by: Michael Tokarev Signed-off-by: zhangchujun --- linux-user/syscall.c | 1 - 1 file changed, 1 deletion(-) diff --git a/linux-user/syscall.c b/linux-user/syscall.c index e384e14248..513996e6fa 100644 --- a/linux-user/syscall.c +++ b/linux-user/syscall.c @@ -53,7 +53,6 @@ #include #include #include -//#include #include #include #include -- Gitee From e41395594aab30a22ffaf1556d19ee623a33e6ec Mon Sep 17 00:00:00 2001 From: Susanooo Date: Fri, 25 Oct 2024 09:33:41 +0800 Subject: [PATCH 479/939] ui/console-vc: Silence warning about sprintf() on OpenBSD MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The linker on OpenBSD complains: ld: warning: console-vc.c:824 (../src/ui/console-vc.c:824)([...]): warning: sprintf() is often misused, please use snprintf() Using g_strdup_printf() is certainly better here, so let's switch to that function instead. 
Signed-off-by: Thomas Huth Reviewed-by: Marc-André Lureau Reviewed-by: Alex Bennée Reviewed-by: Richard Henderson Reviewed-by: Michael Tokarev Signed-off-by: Michael Tokarev Signed-off-by: zhangchujun --- ui/console-vc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ui/console-vc.c b/ui/console-vc.c index 9c13cc2981..b1903c3e48 100644 --- a/ui/console-vc.c +++ b/ui/console-vc.c @@ -648,7 +648,7 @@ static void vc_putchar(VCChardev *vc, int ch) QemuTextConsole *s = vc->console; int i; int x, y; - char response[40]; + g_autofree char *response = NULL; switch(vc->state) { case TTY_STATE_NORM: @@ -821,7 +821,7 @@ static void vc_putchar(VCChardev *vc, int ch) break; case 6: /* report cursor position */ - sprintf(response, "\033[%d;%dR", + response = g_strdup_printf("\033[%d;%dR", (s->y_base + s->y) % s->total_height + 1, s->x + 1); vc_respond_str(vc, response); -- Gitee From 199dcd16027e3573f5eeaa4396c361cfec91cbe1 Mon Sep 17 00:00:00 2001 From: Susanooo Date: Fri, 25 Oct 2024 09:44:21 +0800 Subject: [PATCH 480/939] meson.build: Remove ncurses workaround for OpenBSD MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit meson.build: Remove ncurses workaround for OpenBSD OpenBSD 7.5 has upgraded to ncurses 6.4. Signed-off-by: Brad Smith Reviewed-by: Daniel P. Berrangé Reviewed-by: Michael Tokarev Signed-off-by: Michael Tokarev Signed-off-by: zhangchujun --- meson.build | 2 +- ui/curses.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/meson.build b/meson.build index 4024f9a4bb..b3ee125b72 100644 --- a/meson.build +++ b/meson.build @@ -1139,7 +1139,7 @@ iconv = not_found curses = not_found if have_system and get_option('curses').allowed() curses_test = ''' - #if defined(__APPLE__) || defined(__OpenBSD__) + #ifdef __APPLE__ #define _XOPEN_SOURCE_EXTENDED 1 #endif #include diff --git a/ui/curses.c b/ui/curses.c index 8bde8c5cf7..26438486fc 100644 --- a/ui/curses.c +++ b/ui/curses.c @@ -38,7 +38,7 @@ #include "ui/input.h" #include "sysemu/sysemu.h" -#if defined(__APPLE__) || defined(__OpenBSD__) +#ifdef __APPLE__ #define _XOPEN_SOURCE_EXTENDED 1 #endif -- Gitee From cc875acdbf0ab210ce467f27c621fe7dc2159110 Mon Sep 17 00:00:00 2001 From: zhangchujun Date: Wed, 30 Oct 2024 10:57:05 +0800 Subject: [PATCH 481/939] virtio-net: Avoid indirection_table_mask overflow We compute indirections_len by adding 1 to indirection_table_mask, but it may overflow if indirection_table_mask is UINT16_MAX. Check if indirection_table_mask is small enough before adding 1. 
Fixes: 5907902 ("virtio-net: implement RSS configuration command") Signed-off-by: Akihiko Odaki Signed-off-by: Jason Wang Signed-off-by: zhangchujun --- hw/net/virtio-net.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index 432c433540..d5008b65ec 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -1400,17 +1400,17 @@ static uint16_t virtio_net_handle_rss(VirtIONet *n, n->rss_data.hash_types = virtio_ldl_p(vdev, &cfg.hash_types); n->rss_data.indirections_len = virtio_lduw_p(vdev, &cfg.indirection_table_mask); - n->rss_data.indirections_len++; if (!do_rss) { - n->rss_data.indirections_len = 1; + n->rss_data.indirections_len = 0; } - if (!is_power_of_2(n->rss_data.indirections_len)) { - err_msg = "Invalid size of indirection table"; + if (n->rss_data.indirections_len >= VIRTIO_NET_RSS_MAX_TABLE_LEN) { + err_msg = "Too large indirection table"; err_value = n->rss_data.indirections_len; goto error; } - if (n->rss_data.indirections_len > VIRTIO_NET_RSS_MAX_TABLE_LEN) { - err_msg = "Too large indirection table"; + n->rss_data.indirections_len++; + if (!is_power_of_2(n->rss_data.indirections_len)) { + err_msg = "Invalid size of indirection table"; err_value = n->rss_data.indirections_len; goto error; } -- Gitee From f6ad72a5b215bc5b2d8df86cd537bf1c0f468108 Mon Sep 17 00:00:00 2001 From: zhangchujun Date: Wed, 30 Oct 2024 13:33:58 +0800 Subject: [PATCH 482/939] Fix calculation of minimum in colo_compare_tcp GitHub's CodeQL reports a critical error which is fixed by using the MIN macro: Unsigned difference expression compared to zero Signed-off-by: Stefan Weil Cc: qemu-stable@nongnu.org Reviewed-by: Zhang Chen Signed-off-by: Jason Wang Signed-off-by: zhangchujun --- net/colo-compare.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/colo-compare.c b/net/colo-compare.c index 7f9e6f89ce..d4e51cb306 100644 --- a/net/colo-compare.c +++ b/net/colo-compare.c @@ -413,8 +413,7 @@ static void colo_compare_tcp(CompareState *s, Connection *conn) * can ensure that the packet's payload is acknowledged by * primary and secondary. */ - uint32_t min_ack = conn->pack - conn->sack > 0 ? - conn->sack : conn->pack; + uint32_t min_ack = MIN(conn->pack, conn->sack); pri: if (g_queue_is_empty(&conn->primary_list)) { -- Gitee From 6b1b8553ea3810e497d225d64e98dd6eac7b4e2c Mon Sep 17 00:00:00 2001 From: qihao_yewu Date: Tue, 5 Nov 2024 03:21:00 -0500 Subject: [PATCH 483/939] target/riscv/csr.c: Fix an access to VXSAT cheery-pick from 5a60026cad4e9dba929cab4f63229e4b9110cf0a The register VXSAT should be RW only to the first bit. The remaining bits should be 0. The RISC-V Instruction Set Manual Volume I: Unprivileged Architecture The vxsat CSR has a single read-write least-significant bit (vxsat[0]) that indicates if a fixed-point instruction has had to saturate an output value to fit into a destination format. Bits vxsat[XLEN-1:1] should be written as zeros. 
Signed-off-by: Evgenii Prokopiev Reviewed-by: Daniel Henrique Barboza Reviewed-by: Alistair Francis Message-ID: <20241002084436.89347-1-evgenii.prokopiev@syntacore.com> Signed-off-by: Alistair Francis Signed-off-by: qihao_yewu --- target/riscv/csr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/target/riscv/csr.c b/target/riscv/csr.c index fde7ce1a53..d1bb7bc0d3 100644 --- a/target/riscv/csr.c +++ b/target/riscv/csr.c @@ -704,7 +704,7 @@ static RISCVException write_vxrm(CPURISCVState *env, int csrno, static RISCVException read_vxsat(CPURISCVState *env, int csrno, target_ulong *val) { - *val = env->vxsat; + *val = env->vxsat & BIT(0); return RISCV_EXCP_NONE; } @@ -714,7 +714,7 @@ static RISCVException write_vxsat(CPURISCVState *env, int csrno, #if !defined(CONFIG_USER_ONLY) env->mstatus |= MSTATUS_VS; #endif - env->vxsat = val; + env->vxsat = val & BIT(0); return RISCV_EXCP_NONE; } -- Gitee From 28bf94c86d3914b8b517dae483d1d69b3afabacc Mon Sep 17 00:00:00 2001 From: qihao_yewu Date: Tue, 5 Nov 2024 07:03:48 -0500 Subject: [PATCH 484/939] hw/audio/hda: free timer on exit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cheery-pick from f27206ceedbe2efae37c8d143c5eb2db05251508 Fixes: 280c1e1cd ("audio/hda: create millisecond timers that handle IO") Signed-off-by: Marc-André Lureau Reviewed-by: Akihiko Odaki Message-ID: <20241008125028.1177932-2-marcandre.lureau@redhat.com> Signed-off-by: qihao_yewu --- hw/audio/hda-codec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/audio/hda-codec.c b/hw/audio/hda-codec.c index 0bc20d49f6..19f401cabe 100644 --- a/hw/audio/hda-codec.c +++ b/hw/audio/hda-codec.c @@ -751,7 +751,7 @@ static void hda_audio_exit(HDACodecDevice *hda) continue; } if (a->use_timer) { - timer_del(st->buft); + timer_free(st->buft); } if (st->output) { AUD_close_out(&a->card, st->voice.out); -- Gitee From 66eb68e54a521bc0dac015415a9eca25fe479543 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Mon, 4 Nov 2024 20:55:34 +0800 Subject: [PATCH 485/939] intel_iommu: Send IQE event when setting reserved bit in IQT_TAIL According to VTD spec, Figure 11-22, Invalidation Queue Tail Register, "When Descriptor Width (DW) field in Invalidation Queue Address Register (IQA_REG) is Set (256-bit descriptors), hardware treats bit-4 as reserved and a value of 1 in the bit will result in invalidation queue error." Current code missed to send IQE event to guest, fix it. Fixes: c0c1d351849b ("intel_iommu: add 256 bits qi_desc support") Suggested-by: Yi Liu Signed-off-by: Zhenzhong Duan Message-Id: <20241104125536.1236118-2-zhenzhong.duan@intel.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin Signed-off-by: Zhongrui Tang --- hw/i386/intel_iommu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 5085a6fee3..3da56e439e 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -2813,6 +2813,7 @@ static void vtd_handle_iqt_write(IntelIOMMUState *s) if (s->iq_dw && (val & VTD_IQT_QT_256_RSV_BIT)) { error_report_once("%s: RSV bit is set: val=0x%"PRIx64, __func__, val); + vtd_handle_inv_queue_error(s); return; } s->iq_tail = VTD_IQT_QT(s->iq_dw, val); -- Gitee From 3c108b874b8b142a42939d785d6706a44e7035d7 Mon Sep 17 00:00:00 2001 From: Roque Arcudia Hernandez Date: Fri, 1 Nov 2024 21:59:23 +0000 Subject: [PATCH 486/939] hw/pci: Add parenthesis to PCI_BUILD_BDF macro MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bus parameter in the macro PCI_BUILD_BDF is not surrounded by parenthesis. This can create a compile error when warnings are treated as errors or can potentially create runtime errors due to the operator precedence. For instance: file.c:x:32: error: suggest parentheses around '-' inside '<<' [-Werror=parentheses] 171 | uint16_t bdf = PCI_BUILD_BDF(a - b, sdev->devfn); | ~~^~~ include/hw/pci/pci.h:19:41: note: in definition of macro 'PCI_BUILD_BDF' 19 | #define PCI_BUILD_BDF(bus, devfn) ((bus << 8) | (devfn)) | ^~~ cc1: all warnings being treated as errors Signed-off-by: Roque Arcudia Hernandez Reviewed-by: Nabih Estefan Message-Id: <20241101215923.3399311-1-roqueh@google.com> Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Signed-off-by: Zhongrui Tang --- include/hw/pci/pci.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h index fa6313aabc..7cf7b5619a 100644 --- a/include/hw/pci/pci.h +++ b/include/hw/pci/pci.h @@ -15,7 +15,7 @@ extern bool pci_available; #define PCI_BUS_NUM(x) (((x) >> 8) & 0xff) #define PCI_SLOT(devfn) (((devfn) >> 3) & 0x1f) #define PCI_FUNC(devfn) ((devfn) & 0x07) -#define PCI_BUILD_BDF(bus, devfn) ((bus << 8) | (devfn)) +#define PCI_BUILD_BDF(bus, devfn) (((bus) << 8) | (devfn)) #define PCI_BDF_TO_DEVFN(x) ((x) & 0xff) #define PCI_BUS_MAX 256 #define PCI_DEVFN_MAX 256 -- Gitee From 830009038a73e496598c26679b7e30d7e931a1cf Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Fri, 1 Nov 2024 13:39:16 +0000 Subject: [PATCH 487/939] hw/cxl: Ensure there is enough data for the header in cmd_ccls_set_lsa() The properties of the requested set command cannot be established if len_in is less than the size of the header. Reported-by: Esifiel Signed-off-by: Jonathan Cameron Message-Id: <20241101133917.27634-10-Jonathan.Cameron@huawei.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin Signed-off-by: Zhongrui Tang --- hw/cxl/cxl-mailbox-utils.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c index 6eff56fb1b..9f2304389b 100644 --- a/hw/cxl/cxl-mailbox-utils.c +++ b/hw/cxl/cxl-mailbox-utils.c @@ -897,8 +897,8 @@ static CXLRetCode cmd_ccls_set_lsa(const struct cxl_cmd *cmd, const size_t hdr_len = offsetof(struct set_lsa_pl, data); *len_out = 0; - if (!len_in) { - return CXL_MBOX_SUCCESS; + if (len_in < hdr_len) { + return CXL_MBOX_INVALID_PAYLOAD_LENGTH; } if (set_lsa_payload->offset + len_in > cvc->get_lsa_size(ct3d) + hdr_len) { -- Gitee From d96c34e132df55ca7be458095f23d81dfc14e0d5 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Fri, 1 Nov 2024 13:39:17 +0000 Subject: [PATCH 488/939] hw/cxl: Ensure there is enough data to read the input header in cmd_get_physical_port_state() If len_in is smaller than the header length then the accessing the number of ports will result in an out of bounds access. Add a check to avoid this. Reported-by: Esifiel Signed-off-by: Jonathan Cameron Message-Id: <20241101133917.27634-11-Jonathan.Cameron@huawei.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Signed-off-by: Zhongrui Tang --- hw/cxl/cxl-mailbox-utils.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c index 6eff56fb1b..11a26525a2 100644 --- a/hw/cxl/cxl-mailbox-utils.c +++ b/hw/cxl/cxl-mailbox-utils.c @@ -505,6 +505,9 @@ static CXLRetCode cmd_get_physical_port_state(const struct cxl_cmd *cmd, in = (struct cxl_fmapi_get_phys_port_state_req_pl *)payload_in; out = (struct cxl_fmapi_get_phys_port_state_resp_pl *)payload_out; + if (len_in < sizeof(*in)) { + return CXL_MBOX_INVALID_PAYLOAD_LENGTH; + } /* Check if what was requested can fit */ if (sizeof(*out) + sizeof(*out->ports) * in->num_ports > cci->payload_max) { return CXL_MBOX_INVALID_INPUT; -- Gitee From 8c7e606ff2e59df7be719b13f28fe629414fcb30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Tue, 5 Mar 2024 12:09:37 +0000 Subject: [PATCH 489/939] tests: bump QOS_PATH_MAX_ELEMENT_SIZE again MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We "fixed" a bug with LTO builds with 100c459f194 (tests/qtest: bump up QOS_PATH_MAX_ELEMENT_SIZE) but it seems it has triggered again. The array is sized according to the maximum anticipated length of a path on the graph. However, the worst case for a depth-first search is to push all nodes on the graph. So it's not really LTO, it depends on the ordering of the constructors. Lets be more assertive raising QOS_PATH_MAX_ELEMENT_SIZE to make it go away again. 
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1186 (again) Reviewed-by: Thomas Huth Signed-off-by: Alex Bennée Message-Id: <20240305121005.3528075-2-alex.bennee@linaro.org> --- tests/qtest/libqos/qgraph.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/qtest/libqos/qgraph.h b/tests/qtest/libqos/qgraph.h index 287022a67c..1b5de02e7b 100644 --- a/tests/qtest/libqos/qgraph.h +++ b/tests/qtest/libqos/qgraph.h @@ -24,7 +24,7 @@ #include "libqos-malloc.h" /* maximum path length */ -#define QOS_PATH_MAX_ELEMENT_SIZE 64 +#define QOS_PATH_MAX_ELEMENT_SIZE 128 typedef struct QOSGraphObject QOSGraphObject; typedef struct QOSGraphNode QOSGraphNode; -- Gitee From 9cd544b83ccd37b9dd7977717a245437533830cd Mon Sep 17 00:00:00 2001 From: Cindy Lu Date: Tue, 6 Aug 2024 17:37:12 +0800 Subject: [PATCH 490/939] virtio-pci: Fix the use of an uninitialized irqfd The crash was reported in MAC OS and NixOS, here is the link for this bug https://gitlab.com/qemu-project/qemu/-/issues/2334 https://gitlab.com/qemu-project/qemu/-/issues/2321 In this bug, they are using the virtio_input device. The guest notifier was not supported for this device, The function virtio_pci_set_guest_notifiers() was not called, and the vector_irqfd was not initialized. So the fix is adding the check for vector_irqfd in virtio_pci_get_notifier() The function virtio_pci_get_notifier() can be used in various devices. It could also be called when VIRTIO_CONFIG_S_DRIVER_OK is not set. In this situation, the vector_irqfd being NULL is acceptable. We can allow the device continue to boot If the vector_irqfd still hasn't been initialized after VIRTIO_CONFIG_S_DRIVER_OK is set, it means that the function set_guest_notifiers was not called before the driver started. This indicates that the device is not using the notifier. At this point, we will let the check fail. This fix is verified in vyatta,MacOS,NixOS,fedora system. The bt tree for this bug is: Thread 6 "CPU 0/KVM" received signal SIGSEGV, Segmentation fault. [Switching to Thread 0x7c817be006c0 (LWP 1269146)] kvm_virtio_pci_vq_vector_use () at ../qemu-9.0.0/hw/virtio/virtio-pci.c:817 817 if (irqfd->users == 0) { (gdb) thread apply all bt ... Thread 6 (Thread 0x7c817be006c0 (LWP 1269146) "CPU 0/KVM"): 0 kvm_virtio_pci_vq_vector_use () at ../qemu-9.0.0/hw/virtio/virtio-pci.c:817 1 kvm_virtio_pci_vector_use_one () at ../qemu-9.0.0/hw/virtio/virtio-pci.c:893 2 0x00005983657045e2 in memory_region_write_accessor () at ../qemu-9.0.0/system/memory.c:497 3 0x0000598365704ba6 in access_with_adjusted_size () at ../qemu-9.0.0/system/memory.c:573 4 0x0000598365705059 in memory_region_dispatch_write () at ../qemu-9.0.0/system/memory.c:1528 5 0x00005983659b8e1f in flatview_write_continue_step.isra.0 () at ../qemu-9.0.0/system/physmem.c:2713 6 0x000059836570ba7d in flatview_write_continue () at ../qemu-9.0.0/system/physmem.c:2743 7 flatview_write () at ../qemu-9.0.0/system/physmem.c:2774 8 0x000059836570bb76 in address_space_write () at ../qemu-9.0.0/system/physmem.c:2894 9 0x0000598365763afe in address_space_rw () at ../qemu-9.0.0/system/physmem.c:2904 10 kvm_cpu_exec () at ../qemu-9.0.0/accel/kvm/kvm-all.c:2917 11 0x000059836576656e in kvm_vcpu_thread_fn () at ../qemu-9.0.0/accel/kvm/kvm-accel-ops.c:50 12 0x0000598365926ca8 in qemu_thread_start () at ../qemu-9.0.0/util/qemu-thread-posix.c:541 13 0x00007c8185bcd1cf in ??? 
() at /usr/lib/libc.so.6 14 0x00007c8185c4e504 in clone () at /usr/lib/libc.so.6 Fixes: 2ce6cff94d ("virtio-pci: fix use of a released vector") Cc: qemu-stable@nongnu.org Signed-off-by: Cindy Lu Message-Id: <20240806093715.65105-1-lulu@redhat.com> Acked-by: Jason Wang Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin (cherry picked from commit a8e63ff289d137197ad7a701a587cc432872d798) Signed-off-by: zhujun2 --- hw/virtio/virtio-pci.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index 3ad7487411..06b125ec62 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -860,6 +860,9 @@ static int virtio_pci_get_notifier(VirtIOPCIProxy *proxy, int queue_no, VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); VirtQueue *vq; + if (!proxy->vector_irqfd && vdev->status & VIRTIO_CONFIG_S_DRIVER_OK) + return -1; + if (queue_no == VIRTIO_CONFIG_IRQ_IDX) { *n = virtio_config_get_guest_notifier(vdev); *vector = vdev->config_vector; -- Gitee From 9daf2b936101d612a295217822791d323e908fc9 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Thu, 8 Aug 2024 10:05:45 +0200 Subject: [PATCH 491/939] block/blkio: use FUA flag on write zeroes only if supported MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit libblkio supports BLKIO_REQ_FUA with write zeros requests only since version 1.4.0, so let's inform the block layer that the blkio driver supports it only in this case. Otherwise we can have runtime errors as reported in https://issues.redhat.com/browse/RHEL-32878 Fixes: fd66dbd424 ("blkio: add libblkio block driver") Cc: qemu-stable@nongnu.org Buglink: https://issues.redhat.com/browse/RHEL-32878 Signed-off-by: Stefano Garzarella Reviewed-by: Eric Blake Reviewed-by: Philippe Mathieu-Daudé Message-id: 20240808080545.40744-1-sgarzare@redhat.com Signed-off-by: Stefan Hajnoczi (cherry picked from commit 547c4e50929ec6c091d9c16a7b280e829b12b463) Signed-off-by: zhujun2 --- block/blkio.c | 6 ++++-- meson.build | 2 ++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/block/blkio.c b/block/blkio.c index b989617608..027c16ceb6 100644 --- a/block/blkio.c +++ b/block/blkio.c @@ -899,8 +899,10 @@ static int blkio_file_open(BlockDriverState *bs, QDict *options, int flags, } bs->supported_write_flags = BDRV_REQ_FUA | BDRV_REQ_REGISTERED_BUF; - bs->supported_zero_flags = BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | - BDRV_REQ_NO_FALLBACK; + bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK; +#ifdef CONFIG_BLKIO_WRITE_ZEROS_FUA + bs->supported_zero_flags |= BDRV_REQ_FUA; +#endif qemu_mutex_init(&s->blkio_lock); qemu_co_mutex_init(&s->bounce_lock); diff --git a/meson.build b/meson.build index 4024f9a4bb..ce2fd07963 100644 --- a/meson.build +++ b/meson.build @@ -2181,6 +2181,8 @@ config_host_data.set('CONFIG_BLKIO', blkio.found()) if blkio.found() config_host_data.set('CONFIG_BLKIO_VHOST_VDPA_FD', blkio.version().version_compare('>=1.3.0')) + config_host_data.set('CONFIG_BLKIO_WRITE_ZEROS_FUA', + blkio.version().version_compare('>=1.4.0')) endif config_host_data.set('CONFIG_CURL', curl.found()) config_host_data.set('CONFIG_CURSES', curses.found()) -- Gitee From e16c3aa63a203e376a40404314252a11e85a5bda Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Mon, 29 Jul 2024 13:05:33 +0100 Subject: [PATCH 492/939] docs/sphinx/depfile.py: Handle env.doc2path() returning a Path not a str MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In newer 
versions of Sphinx the env.doc2path() API is going to change to return a Path object rather than a str. This was originally visible in Sphinx 8.0.0rc1, but has been rolled back for the final 8.0.0 release. However it will probably emit a deprecation warning and is likely to change for good in 9.0: https://github.com/sphinx-doc/sphinx/issues/12686 Our use in depfile.py assumes a str, and if it is passed a Path it will fall over: Handler for event 'build-finished' threw an exception (exception: unsupported operand type(s) for +: 'PosixPath' and 'str') Wrapping the env.doc2path() call in str() will coerce a Path object to the str we expect, and have no effect in older Sphinx versions that do return a str. Cc: qemu-stable@nongnu.org Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2458 Signed-off-by: Peter Maydell Reviewed-by: Philippe Mathieu-Daudé Message-ID: <20240729120533.2486427-1-peter.maydell@linaro.org> Signed-off-by: Philippe Mathieu-Daudé (cherry picked from commit 48e5b5f994bccf161dd88a67fdd819d4bfb400f1) Signed-off-by: zhujun2 --- docs/sphinx/depfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/sphinx/depfile.py b/docs/sphinx/depfile.py index afdcbcec6e..e74be6af98 100644 --- a/docs/sphinx/depfile.py +++ b/docs/sphinx/depfile.py @@ -19,7 +19,7 @@ def get_infiles(env): for x in env.found_docs: - yield env.doc2path(x) + yield str(env.doc2path(x)) yield from ((os.path.join(env.srcdir, dep) for dep in env.dependencies[x])) for mod in sys.modules.values(): -- Gitee From 95f371c36858dd003c0c6a3d4f6ddfbc299dda9f Mon Sep 17 00:00:00 2001 From: qihao_yewu Date: Thu, 7 Nov 2024 20:56:18 -0500 Subject: [PATCH 493/939] target/arm: Fix SVE SDOT/UDOT/USDOT (4-way, indexed) cheery-pick from e6b2fa1b81ac6b05c4397237c846a295a9857920 Our implementation of the indexed version of SVE SDOT/UDOT/USDOT got the calculation of the inner loop terminator wrong. Although we correctly account for the element size when we calculate the terminator for the first iteration: intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n); we don't do that when we move it forward after the first inner loop completes. The intention is that we process the vector in 128-bit segments, which for a 64-bit element size should mean (1, 2), (3, 4), (5, 6), etc. This bug meant that we would iterate (1, 2), (3, 4, 5, 6), (7, 8, 9, 10) etc and apply the wrong indexed element to some of the operations, and also index off the end of the vector. You don't see this bug if the vector length is small enough that we don't need to iterate the outer loop, i.e. if it is only 128 bits, or if it is the 64-bit special case from AA32/AA64 AdvSIMD. If the vector length is 256 bits then we calculate the right results for the elements in the vector but do index off the end of the vector. Vector lengths greater than 256 bits see wrong answers. The instructions that produce 32-bit results behave correctly. Fix the recalculation of 'segend' for subsequent iterations, and restore a version of the comment that was lost in the refactor of commit 7020ffd656a5 that explains why we only need to clamp segend to opr_sz_n for the first iteration, not the later ones. 
Cc: qemu-stable@nongnu.org Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2595 Fixes: 7020ffd656a5 ("target/arm: Macroize helper_gvec_{s,u}dot_idx_{b,h}") Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20241101185544.2130972-1-peter.maydell@linaro.org Signed-off-by: qihao_yewu --- target/arm/tcg/vec_helper.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c index 1f93510b85..11e874c05a 100644 --- a/target/arm/tcg/vec_helper.c +++ b/target/arm/tcg/vec_helper.c @@ -692,6 +692,13 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ { \ intptr_t i = 0, opr_sz = simd_oprsz(desc); \ intptr_t opr_sz_n = opr_sz / sizeof(TYPED); \ + /* \ + * Special case: opr_sz == 8 from AA64/AA32 advsimd means the \ + * first iteration might not be a full 16 byte segment. But \ + * for vector lengths beyond that this must be SVE and we know \ + * opr_sz is a multiple of 16, so we need not clamp segend \ + * to opr_sz_n when we advance it at the end of the loop. \ + */ \ intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n); \ intptr_t index = simd_data(desc); \ TYPED *d = vd, *a = va; \ @@ -709,7 +716,7 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ n[i * 4 + 2] * m2 + \ n[i * 4 + 3] * m3); \ } while (++i < segend); \ - segend = i + 4; \ + segend = i + (16 / sizeof(TYPED)); \ } while (i < opr_sz_n); \ clear_tail(d, opr_sz, simd_maxsz(desc)); \ } -- Gitee From ea21c12b545ad6eecded5f34472d3f226f5a2e15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Tue, 10 Sep 2024 18:38:52 +0100 Subject: [PATCH 494/939] tests/docker: update debian i686 and mipsel images to bookworm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Whatever issues there were which stopped these being updates when the rest were have now been resolved. However mips64el continues to be broken so don't update it here. 
Reviewed-by: Pierrick Bouvier Reviewed-by: Richard Henderson Signed-off-by: Alex Bennée Message-Id: <20240910173900.4154726-3-alex.bennee@linaro.org> (cherry picked from commit 19d2111059c87d3f58349f27b9be9dee81fc1681) Signed-off-by: zhujun2 --- tests/docker/dockerfiles/debian-i686-cross.docker | 10 ++++------ tests/docker/dockerfiles/debian-mipsel-cross.docker | 10 ++++------ tests/lcitool/refresh | 4 ++-- 3 files changed, 10 insertions(+), 14 deletions(-) diff --git a/tests/docker/dockerfiles/debian-i686-cross.docker b/tests/docker/dockerfiles/debian-i686-cross.docker index 3fc4e15acd..e1c8e2b494 100644 --- a/tests/docker/dockerfiles/debian-i686-cross.docker +++ b/tests/docker/dockerfiles/debian-i686-cross.docker @@ -1,10 +1,10 @@ # THIS FILE WAS AUTO-GENERATED # -# $ lcitool dockerfile --layers all --cross-arch i686 debian-11 qemu +# $ lcitool dockerfile --layers all --cross-arch i686 debian-12 qemu # # https://gitlab.com/libvirt/libvirt-ci -FROM docker.io/library/debian:11-slim +FROM docker.io/library/debian:12-slim RUN export DEBIAN_FRONTEND=noninteractive && \ apt-get update && \ @@ -47,16 +47,15 @@ RUN export DEBIAN_FRONTEND=noninteractive && \ python3-opencv \ python3-pillow \ python3-pip \ - python3-setuptools \ python3-sphinx \ python3-sphinx-rtd-theme \ python3-venv \ - python3-wheel \ python3-yaml \ rpm2cpio \ sed \ socat \ sparse \ + swtpm \ tar \ tesseract-ocr \ tesseract-ocr-eng \ @@ -67,8 +66,6 @@ RUN export DEBIAN_FRONTEND=noninteractive && \ sed -Ei 's,^# (en_US\.UTF-8 .*)$,\1,' /etc/locale.gen && \ dpkg-reconfigure locales -RUN /usr/bin/pip3 install tomli - ENV CCACHE_WRAPPERSDIR "/usr/libexec/ccache-wrappers" ENV LANG "en_US.UTF-8" ENV MAKE "/usr/bin/make" @@ -145,6 +142,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \ libvdeplug-dev:i386 \ libvirglrenderer-dev:i386 \ libvte-2.91-dev:i386 \ + libxdp-dev:i386 \ libzstd-dev:i386 \ nettle-dev:i386 \ systemtap-sdt-dev:i386 \ diff --git a/tests/docker/dockerfiles/debian-mipsel-cross.docker b/tests/docker/dockerfiles/debian-mipsel-cross.docker index 5fcd641f15..79ce4ae503 100644 --- a/tests/docker/dockerfiles/debian-mipsel-cross.docker +++ b/tests/docker/dockerfiles/debian-mipsel-cross.docker @@ -1,10 +1,10 @@ # THIS FILE WAS AUTO-GENERATED # -# $ lcitool dockerfile --layers all --cross-arch mipsel debian-11 qemu +# $ lcitool dockerfile --layers all --cross-arch mipsel debian-12 qemu # # https://gitlab.com/libvirt/libvirt-ci -FROM docker.io/library/debian:11-slim +FROM docker.io/library/debian:12-slim RUN export DEBIAN_FRONTEND=noninteractive && \ apt-get update && \ @@ -47,16 +47,15 @@ RUN export DEBIAN_FRONTEND=noninteractive && \ python3-opencv \ python3-pillow \ python3-pip \ - python3-setuptools \ python3-sphinx \ python3-sphinx-rtd-theme \ python3-venv \ - python3-wheel \ python3-yaml \ rpm2cpio \ sed \ socat \ sparse \ + swtpm \ tar \ tesseract-ocr \ tesseract-ocr-eng \ @@ -67,8 +66,6 @@ RUN export DEBIAN_FRONTEND=noninteractive && \ sed -Ei 's,^# (en_US\.UTF-8 .*)$,\1,' /etc/locale.gen && \ dpkg-reconfigure locales -RUN /usr/bin/pip3 install tomli - ENV CCACHE_WRAPPERSDIR "/usr/libexec/ccache-wrappers" ENV LANG "en_US.UTF-8" ENV MAKE "/usr/bin/make" @@ -143,6 +140,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \ libvdeplug-dev:mipsel \ libvirglrenderer-dev:mipsel \ libvte-2.91-dev:mipsel \ + libxdp-dev:mipsel \ libzstd-dev:mipsel \ nettle-dev:mipsel \ systemtap-sdt-dev:mipsel \ diff --git a/tests/lcitool/refresh b/tests/lcitool/refresh index 0c93557ad6..42ed7eba1d 100755 --- a/tests/lcitool/refresh +++ 
b/tests/lcitool/refresh @@ -159,7 +159,7 @@ try: trailer=cross_build("arm-linux-gnueabihf-", "arm-softmmu,arm-linux-user")) - generate_dockerfile("debian-i686-cross", "debian-11", + generate_dockerfile("debian-i686-cross", "debian-12", cross="i686", trailer=cross_build("x86_64-linux-gnu-", "x86_64-softmmu," @@ -171,7 +171,7 @@ try: trailer=cross_build("mips64el-linux-gnuabi64-", "mips64el-softmmu,mips64el-linux-user")) - generate_dockerfile("debian-mipsel-cross", "debian-11", + generate_dockerfile("debian-mipsel-cross", "debian-12", cross="mipsel", trailer=cross_build("mipsel-linux-gnu-", "mipsel-softmmu,mipsel-linux-user")) -- Gitee From 7187ed9e2010adfe937d6444eb79d8025c118c2c Mon Sep 17 00:00:00 2001 From: Alexander Ivanov Date: Fri, 9 Aug 2024 14:13:40 +0200 Subject: [PATCH 495/939] module: Prevent crash by resetting local_err in module_load_qom_all() Set local_err to NULL after it has been freed in error_report_err(). This avoids triggering assert(*errp == NULL) failure in error_setv() when local_err is reused in the loop. Signed-off-by: Alexander Ivanov Reviewed-by: Claudio Fontana Reviewed-by: Denis V. Lunev Link: https://lore.kernel.org/r/20240809121340.992049-2-alexander.ivanov@virtuozzo.com [Do the same by moving the declaration instead. - Paolo] Cc: qemu-stable@nongnu.org Signed-off-by: Paolo Bonzini (cherry picked from commit 940d802b24e63650e0eacad3714e2ce171cba17c) Signed-off-by: zhujun2 --- util/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/module.c b/util/module.c index 32e263163c..3eb0f06df1 100644 --- a/util/module.c +++ b/util/module.c @@ -354,13 +354,13 @@ int module_load_qom(const char *type, Error **errp) void module_load_qom_all(void) { const QemuModinfo *modinfo; - Error *local_err = NULL; if (module_loaded_qom_all) { return; } for (modinfo = module_info; modinfo->name != NULL; modinfo++) { + Error *local_err = NULL; if (!modinfo->objs) { continue; } -- Gitee From 87ff72f354301147e35009dabdb8be68e9dfa30c Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Tue, 13 Aug 2024 11:42:49 +0100 Subject: [PATCH 496/939] target/arm: Clear high SVE elements in handle_vec_simd_wshli AdvSIMD instructions are supposed to zero bits beyond 128. Affects SSHLL, USHLL, SSHLL2, USHLL2. Cc: qemu-stable@nongnu.org Signed-off-by: Richard Henderson Message-id: 20240717060903.205098-15-richard.henderson@linaro.org Reviewed-by: Peter Maydell Signed-off-by: Peter Maydell (cherry picked from commit 8e0c9a9efa21a16190cbac288e414bbf1d80f639) Signed-off-by: zhujun2 --- target/arm/tcg/translate-a64.c | 1 + 1 file changed, 1 insertion(+) diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c index a2e49c39f9..5560a53630 100644 --- a/target/arm/tcg/translate-a64.c +++ b/target/arm/tcg/translate-a64.c @@ -10141,6 +10141,7 @@ static void handle_vec_simd_wshli(DisasContext *s, bool is_q, bool is_u, tcg_gen_shli_i64(tcg_rd, tcg_rd, shift); write_vec_element(s, tcg_rd, rd, i, size + 1); } + clear_vec_high(s, true, rd); } /* SHRN/RSHRN - Shift right with narrowing (and potential rounding) */ -- Gitee From fa282d002c45e1cc1cca6a2541b75ab5889c8c01 Mon Sep 17 00:00:00 2001 From: Arman Nabiev Date: Thu, 22 Aug 2024 19:56:53 +0300 Subject: [PATCH 497/939] target/ppc: Fix migration of CPUs with TLB_EMB TLB type In vmstate_tlbemb a cut-and-paste error meant we gave this vmstate subsection the same "cpu/tlb6xx" name as the vmstate_tlb6xx subsection. 
This breaks migration load for any CPU using the TLB_EMB CPU type, because when we see the "tlb6xx" name in the incoming data we try to interpret it as a vmstate_tlb6xx subsection, which it isn't the right format for: $ qemu-system-ppc -drive if=none,format=qcow2,file=/home/petmay01/test-images/virt/dummy.qcow2 -monitor stdio -M bamboo QEMU 9.0.92 monitor - type 'help' for more information (qemu) savevm foo (qemu) loadvm foo Missing section footer for cpu Error: Error -22 while loading VM state Correct the incorrect vmstate section name. Since migration for these CPU types was completely broken before, we don't need to care that this is a migration compatibility break. This affects the PPC 405, 440, 460 and e200 CPU families. Cc: qemu-stable@nongnu.org Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2522 Reviewed-by: Peter Maydell Signed-off-by: Arman Nabiev Signed-off-by: Fabiano Rosas (cherry picked from commit 203beb6f047467a4abfc8267c234393cea3f471c) Signed-off-by: zhujun2 --- target/ppc/machine.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target/ppc/machine.c b/target/ppc/machine.c index 68cbdffecd..3e010f3a07 100644 --- a/target/ppc/machine.c +++ b/target/ppc/machine.c @@ -621,7 +621,7 @@ static bool tlbemb_needed(void *opaque) } static const VMStateDescription vmstate_tlbemb = { - .name = "cpu/tlb6xx", + .name = "cpu/tlbemb", .version_id = 1, .minimum_version_id = 1, .needed = tlbemb_needed, -- Gitee From b1a14fd9b59803a17626903c5fb54f1aa2655d00 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Tue, 3 Sep 2024 17:22:10 +0200 Subject: [PATCH 498/939] target/hppa: Fix PSW V-bit packaging in cpu_hppa_get for hppa64 While adding hppa64 support, the psw_v variable got extended from 32 to 64 bits. So, when packaging the PSW-V bit from the psw_v variable for interrupt processing, check bit 31 instead the 63th (sign) bit. This fixes a hard to find Linux kernel boot issue where the loss of the PSW-V bit due to an ITLB interruption in the middle of a series of ds/addc instructions (from the divU milicode library) generated the wrong division result and thus triggered a Linux kernel crash. Link: https://lore.kernel.org/lkml/718b8afe-222f-4b3a-96d3-93af0e4ceff1@roeck-us.net/ Reported-by: Guenter Roeck Signed-off-by: Helge Deller Reviewed-by: Richard Henderson Tested-by: Guenter Roeck Fixes: 931adff31478 ("target/hppa: Update cpu_hppa_get/put_psw for hppa64") Cc: qemu-stable@nongnu.org # v8.2+ (cherry picked from commit ead5078cf1a5f11d16e3e8462154c859620bcc7e) Signed-off-by: zhujun2 --- target/hppa/cpu.h | 2 +- target/hppa/helper.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/target/hppa/cpu.h b/target/hppa/cpu.h index 8be45c69c9..ba100c21a2 100644 --- a/target/hppa/cpu.h +++ b/target/hppa/cpu.h @@ -188,7 +188,7 @@ typedef struct CPUArchState { target_ulong psw; /* All psw bits except the following: */ target_ulong psw_n; /* boolean */ - target_long psw_v; /* in most significant bit */ + target_long psw_v; /* in bit 31 */ /* Splitting the carry-borrow field into the MSB and "the rest", allows * for "the rest" to be deleted when it is unused, but the MSB is in use. 
diff --git a/target/hppa/helper.c b/target/hppa/helper.c index 859644c47a..9e35b65f29 100644 --- a/target/hppa/helper.c +++ b/target/hppa/helper.c @@ -53,7 +53,7 @@ target_ulong cpu_hppa_get_psw(CPUHPPAState *env) } psw |= env->psw_n * PSW_N; - psw |= (env->psw_v < 0) * PSW_V; + psw |= ((env->psw_v >> 31) & 1) * PSW_V; psw |= env->psw; return psw; -- Gitee From c73b18ef8f2dd15934d90f65ba825bef19d11f73 Mon Sep 17 00:00:00 2001 From: qihao_yewu Date: Thu, 7 Nov 2024 22:07:23 -0500 Subject: [PATCH 499/939] ppc/xive: Fix ESB length overflow on 32-bit hosts cheery-pick from 07f2770503e24889720028ddf9ef54788ddf3b6d The length of this region can be > 32-bits, which overflows size_t on 32-bit hosts. Change to uint64_t. Signed-off-by: Nicholas Piggin Signed-off-by: qihao_yewu --- hw/intc/spapr_xive_kvm.c | 4 ++-- hw/intc/xive.c | 2 +- include/hw/ppc/xive.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c index 5789062379..7a86197fc9 100644 --- a/hw/intc/spapr_xive_kvm.c +++ b/hw/intc/spapr_xive_kvm.c @@ -720,7 +720,7 @@ int kvmppc_xive_connect(SpaprInterruptController *intc, uint32_t nr_servers, { SpaprXive *xive = SPAPR_XIVE(intc); XiveSource *xsrc = &xive->source; - size_t esb_len = xive_source_esb_len(xsrc); + uint64_t esb_len = xive_source_esb_len(xsrc); size_t tima_len = 4ull << TM_SHIFT; CPUState *cs; int fd; @@ -824,7 +824,7 @@ void kvmppc_xive_disconnect(SpaprInterruptController *intc) { SpaprXive *xive = SPAPR_XIVE(intc); XiveSource *xsrc; - size_t esb_len; + uint64_t esb_len; assert(xive->fd != -1); diff --git a/hw/intc/xive.c b/hw/intc/xive.c index a3585593d8..0cfc172dd4 100644 --- a/hw/intc/xive.c +++ b/hw/intc/xive.c @@ -1238,7 +1238,7 @@ static void xive_source_reset(void *dev) static void xive_source_realize(DeviceState *dev, Error **errp) { XiveSource *xsrc = XIVE_SOURCE(dev); - size_t esb_len = xive_source_esb_len(xsrc); + uint64_t esb_len = xive_source_esb_len(xsrc); assert(xsrc->xive); diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h index f120874e0f..00023c0233 100644 --- a/include/hw/ppc/xive.h +++ b/include/hw/ppc/xive.h @@ -218,7 +218,7 @@ static inline bool xive_source_esb_has_2page(XiveSource *xsrc) xsrc->esb_shift == XIVE_ESB_4K_2PAGE; } -static inline size_t xive_source_esb_len(XiveSource *xsrc) +static inline uint64_t xive_source_esb_len(XiveSource *xsrc) { return (1ull << xsrc->esb_shift) * xsrc->nr_irqs; } -- Gitee From 91e07a78026caafa181134beeb8c5b79157718ad Mon Sep 17 00:00:00 2001 From: Mark Cave-Ayland Date: Wed, 6 Nov 2024 12:09:27 +0000 Subject: [PATCH 500/939] next-kbd: convert to use qemu_input_handler_register() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert the next-kbd device from the legacy UI qemu_add_kbd_event_handler() function to use qemu_input_handler_register(). Signed-off-by: Mark Cave-Ayland Reviewed-by: Thomas Huth Reviewed-by: Daniel P. 
Berrangé Message-ID: <20241106120928.242443-2-mark.cave-ayland@ilande.co.uk> [thuth: Removed the NEXTKBD_NO_KEY definition - replaced by 0 now] Signed-off-by: Thomas Huth Signed-off-by: Zhongrui Tang --- hw/m68k/next-kbd.c | 158 +++++++++++++++++++++++++++++---------------- 1 file changed, 103 insertions(+), 55 deletions(-) diff --git a/hw/m68k/next-kbd.c b/hw/m68k/next-kbd.c index 0c348c18cf..880ebe3602 100644 --- a/hw/m68k/next-kbd.c +++ b/hw/m68k/next-kbd.c @@ -68,7 +68,6 @@ struct NextKBDState { uint16_t shift; }; -static void queue_code(void *opaque, int code); /* lots of magic numbers here */ static uint32_t kbd_read_byte(void *opaque, hwaddr addr) @@ -166,68 +165,70 @@ static const MemoryRegionOps kbd_ops = { .endianness = DEVICE_NATIVE_ENDIAN, }; -static void nextkbd_event(void *opaque, int ch) -{ - /* - * Will want to set vars for caps/num lock - * if (ch & 0x80) -> key release - * there's also e0 escaped scancodes that might need to be handled - */ - queue_code(opaque, ch); -} - -static const unsigned char next_keycodes[128] = { - 0x00, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x50, 0x4F, - 0x4E, 0x1E, 0x1F, 0x20, 0x1D, 0x1C, 0x1B, 0x00, - 0x42, 0x43, 0x44, 0x45, 0x48, 0x47, 0x46, 0x06, - 0x07, 0x08, 0x00, 0x00, 0x2A, 0x00, 0x39, 0x3A, - 0x3B, 0x3C, 0x3D, 0x40, 0x3F, 0x3E, 0x2D, 0x2C, - 0x2B, 0x26, 0x00, 0x00, 0x31, 0x32, 0x33, 0x34, - 0x35, 0x37, 0x36, 0x2e, 0x2f, 0x30, 0x00, 0x00, - 0x00, 0x38, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +static const int qcode_to_nextkbd_keycode[] = { + [Q_KEY_CODE_ESC] = 0x49, + [Q_KEY_CODE_1] = 0x4a, + [Q_KEY_CODE_2] = 0x4b, + [Q_KEY_CODE_3] = 0x4c, + [Q_KEY_CODE_4] = 0x4d, + [Q_KEY_CODE_5] = 0x50, + [Q_KEY_CODE_6] = 0x4f, + [Q_KEY_CODE_7] = 0x4e, + [Q_KEY_CODE_8] = 0x1e, + [Q_KEY_CODE_9] = 0x1f, + [Q_KEY_CODE_0] = 0x20, + [Q_KEY_CODE_MINUS] = 0x1d, + [Q_KEY_CODE_EQUAL] = 0x1c, + [Q_KEY_CODE_BACKSPACE] = 0x1b, + + [Q_KEY_CODE_Q] = 0x42, + [Q_KEY_CODE_W] = 0x43, + [Q_KEY_CODE_E] = 0x44, + [Q_KEY_CODE_R] = 0x45, + [Q_KEY_CODE_T] = 0x48, + [Q_KEY_CODE_Y] = 0x47, + [Q_KEY_CODE_U] = 0x46, + [Q_KEY_CODE_I] = 0x06, + [Q_KEY_CODE_O] = 0x07, + [Q_KEY_CODE_P] = 0x08, + [Q_KEY_CODE_RET] = 0x2a, + [Q_KEY_CODE_A] = 0x39, + [Q_KEY_CODE_S] = 0x3a, + + [Q_KEY_CODE_D] = 0x3b, + [Q_KEY_CODE_F] = 0x3c, + [Q_KEY_CODE_G] = 0x3d, + [Q_KEY_CODE_H] = 0x40, + [Q_KEY_CODE_J] = 0x3f, + [Q_KEY_CODE_K] = 0x3e, + [Q_KEY_CODE_L] = 0x2d, + [Q_KEY_CODE_SEMICOLON] = 0x2c, + [Q_KEY_CODE_APOSTROPHE] = 0x2b, + [Q_KEY_CODE_GRAVE_ACCENT] = 0x26, + [Q_KEY_CODE_Z] = 0x31, + [Q_KEY_CODE_X] = 0x32, + [Q_KEY_CODE_C] = 0x33, + [Q_KEY_CODE_V] = 0x34, + + [Q_KEY_CODE_B] = 0x35, + [Q_KEY_CODE_N] = 0x37, + [Q_KEY_CODE_M] = 0x36, + [Q_KEY_CODE_COMMA] = 0x2e, + [Q_KEY_CODE_DOT] = 0x2f, + [Q_KEY_CODE_SLASH] = 0x30, + + [Q_KEY_CODE_SPC] = 0x38, }; -static void queue_code(void *opaque, int code) +static void nextkbd_put_keycode(NextKBDState *s, int keycode) { - NextKBDState *s = NEXTKBD(opaque); KBDQueue *q = &s->queue; - int key = code & KD_KEYMASK; - int release = code & 0x80; - static int ext; - - if (code == 0xE0) { - ext = 1; - } - - if (code == 0x2A || code == 0x1D || code == 0x36) { - if (code == 0x2A) { - s->shift = KD_LSHIFT; - } else if (code == 0x36) { - s->shift = KD_RSHIFT; - ext = 0; - } else if (code == 0x1D && !ext) { - s->shift = KD_LCOMM; - } else if (code == 0x1D && ext) { - ext = 0; - s->shift = 
KD_RCOMM; - } - return; - } else if (code == (0x2A | 0x80) || code == (0x1D | 0x80) || - code == (0x36 | 0x80)) { - s->shift = 0; - return; - } if (q->count >= KBD_QUEUE_SIZE) { return; } - q->data[q->wptr] = next_keycodes[key] | release; - + q->data[q->wptr] = keycode; if (++q->wptr == KBD_QUEUE_SIZE) { q->wptr = 0; } @@ -241,6 +242,53 @@ static void queue_code(void *opaque, int code) /* s->update_irq(s->update_arg, 1); */ } +static void nextkbd_event(DeviceState *dev, QemuConsole *src, InputEvent *evt) +{ + NextKBDState *s = NEXTKBD(dev); + int qcode, keycode; + bool key_down = evt->u.key.data->down; + + qcode = qemu_input_key_value_to_qcode(evt->u.key.data->key); + if (qcode >= ARRAY_SIZE(qcode_to_nextkbd_keycode)) { + return; + } + + /* Shift key currently has no keycode, so handle separately */ + if (qcode == Q_KEY_CODE_SHIFT) { + if (key_down) { + s->shift |= KD_LSHIFT; + } else { + s->shift &= ~KD_LSHIFT; + } + } + + if (qcode == Q_KEY_CODE_SHIFT_R) { + if (key_down) { + s->shift |= KD_RSHIFT; + } else { + s->shift &= ~KD_RSHIFT; + } + } + + keycode = qcode_to_nextkbd_keycode[qcode]; + if (!keycode) { + return; + } + + /* If key release event, create keyboard break code */ + if (!key_down) { + keycode |= 0x80; + } + + nextkbd_put_keycode(s, keycode); +} + +static const QemuInputHandler nextkbd_handler = { + .name = "QEMU NeXT Keyboard", + .mask = INPUT_EVENT_MASK_KEY, + .event = nextkbd_event, +}; + static void nextkbd_reset(DeviceState *dev) { NextKBDState *nks = NEXTKBD(dev); @@ -256,7 +304,7 @@ static void nextkbd_realize(DeviceState *dev, Error **errp) memory_region_init_io(&s->mr, OBJECT(dev), &kbd_ops, s, "next.kbd", 0x1000); sysbus_init_mmio(SYS_BUS_DEVICE(dev), &s->mr); - qemu_add_kbd_event_handler(nextkbd_event, s); + qemu_input_handler_register(dev, &nextkbd_handler); } static const VMStateDescription nextkbd_vmstate = { -- Gitee From 93e7987cb5a7b33c2d2e0a02b7f310955ca11851 Mon Sep 17 00:00:00 2001 From: Christian Schoenebeck Date: Tue, 5 Nov 2024 11:25:26 +0100 Subject: [PATCH 501/939] 9pfs: fix crash on 'Treaddir' request A bad (broken or malicious) 9p client (guest) could cause QEMU host to crash by sending a 9p 'Treaddir' request with a numeric file ID (FID) that was previously opened for a file instead of an expected directory: #0 0x0000762aff8f4919 in __GI___rewinddir (dirp=0xf) at ../sysdeps/unix/sysv/linux/rewinddir.c:29 #1 0x0000557b7625fb40 in do_readdir_many (pdu=0x557bb67d2eb0, fidp=0x557bb67955b0, entries=0x762afe9fff58, offset=0, maxsize=131072, dostat=) at ../hw/9pfs/codir.c:101 #2 v9fs_co_readdir_many (pdu=pdu@entry=0x557bb67d2eb0, fidp=fidp@entry=0x557bb67955b0, entries=entries@entry=0x762afe9fff58, offset=0, maxsize=131072, dostat=false) at ../hw/9pfs/codir.c:226 #3 0x0000557b7625c1f9 in v9fs_do_readdir (pdu=0x557bb67d2eb0, fidp=0x557bb67955b0, offset=, max_count=) at ../hw/9pfs/9p.c:2488 #4 v9fs_readdir (opaque=0x557bb67d2eb0) at ../hw/9pfs/9p.c:2602 That's because V9fsFidOpenState was declared as union type. So the same memory region is used for either an open POSIX file handle (int), or a POSIX DIR* pointer, etc., so 9p server incorrectly used the previously opened (valid) POSIX file handle (0xf) as DIR* pointer, eventually causing a crash in glibc's rewinddir() function. Root cause was therefore a missing check in 9p server's 'Treaddir' request handler, which must ensure that the client supplied FID was really opened as directory stream before trying to access the aforementioned union and its DIR* member. 
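The shape of the bug can be seen with a short standalone sketch (illustrative names only, not the actual QEMU structures): a union whose active member is an open file descriptor is read back as a directory stream pointer.

    #include <dirent.h>
    #include <stdio.h>

    enum fid_type { FID_NONE, FID_FILE, FID_DIR };

    struct fid_state {
        enum fid_type type;
        union {
            int fd;      /* valid only while type == FID_FILE */
            DIR *dir;    /* valid only while type == FID_DIR  */
        } fs;
    };

    static int read_dir(struct fid_state *f)
    {
        if (f->type != FID_DIR) {   /* the check this patch adds to v9fs_readdir() */
            return -1;              /* maps to -ENOTDIR in the real server */
        }
        rewinddir(f->fs.dir);
        return 0;
    }

    int main(void)
    {
        struct fid_state f = { .type = FID_FILE, .fs.fd = 0xf };

        /* Without the type check, f.fs.dir would be interpreted as (DIR *)0xf
         * and rewinddir() would dereference it, which is the crash shown in
         * the backtrace above. */
        printf("read_dir returned %d\n", read_dir(&f));
        return 0;
    }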
Cc: qemu-stable@nongnu.org Fixes: d62dbb51f7 ("virtio-9p: Add fidtype so that we can do type ...") Reported-by: Akihiro Suda Tested-by: Akihiro Suda Signed-off-by: Christian Schoenebeck Reviewed-by: Greg Kurz Message-Id: Signed-off-by: Zhongrui Tang --- hw/9pfs/9p.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c index af636cfb2d..9a291d1b51 100644 --- a/hw/9pfs/9p.c +++ b/hw/9pfs/9p.c @@ -2587,6 +2587,11 @@ static void coroutine_fn v9fs_readdir(void *opaque) retval = -EINVAL; goto out_nofid; } + if (fidp->fid_type != P9_FID_DIR) { + warn_report_once("9p: bad client: T_readdir on non-directory stream"); + retval = -ENOTDIR; + goto out; + } if (!fidp->fs.dir.stream) { retval = -EINVAL; goto out; -- Gitee From c4423b70160eb7ae91dac9f2cf61513758ee017d Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Tue, 29 Oct 2024 13:15:19 +0100 Subject: [PATCH 502/939] hw/nvme: fix handling of over-committed queues If a host chooses to use the SQHD "hint" in the CQE to know if there is room in the submission queue for additional commands, it may result in a situation where there are not enough internal resources (struct NvmeRequest) available to process the command. For a lack of a better term, the host may "over-commit" the device (i.e., it may have more inflight commands than the queue size). For example, assume a queue with N entries. The host submits N commands and all are picked up for processing, advancing the head and emptying the queue. Regardless of which of these N commands complete first, the SQHD field of that CQE will indicate to the host that the queue is empty, which allows the host to issue N commands again. However, if the device has not posted CQEs for all the previous commands yet, the device will have less than N resources available to process the commands, so queue processing is suspended. And here lies an 11 year latent bug. In the absense of any additional tail updates on the submission queue, we never schedule the processing bottom-half again unless we observe a head update on an associated full completion queue. This has been sufficient to handle N-to-1 SQ/CQ setups (in the absense of over-commit of course). Incidentially, that "kick all associated SQs" mechanism can now be killed since we now just schedule queue processing when we return a processing resource to a non-empty submission queue, which happens to cover both edge cases. However, we must retain kicking the CQ if it was previously full. So, apparently, no previous driver tested with hw/nvme has ever used SQHD (e.g., neither the Linux NVMe driver or SPDK uses it). But then OSv shows up with the driver that actually does. I salute you. 
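A rough standalone sketch of the flow control described above (illustrative only, not the hw/nvme data structures): the host derives its budget of new submissions purely from the SQHD value in a completion, so it may legally refill the whole queue while the device still holds every request resource.

    #include <stdio.h>

    /* free submission-queue slots as seen by the host; one slot stays unused
     * so that full and empty can be told apart */
    static unsigned sq_free_slots(unsigned head, unsigned tail, unsigned qsize)
    {
        return (head + qsize - tail - 1) % qsize;
    }

    int main(void)
    {
        unsigned qsize = 8;
        unsigned tail = 7;      /* host has queued 7 commands */
        unsigned head = 0;

        /* Device fetched everything; the first CQE reports SQHD == 7. */
        head = tail;

        printf("host may queue %u more commands\n",
               sq_free_slots(head, tail, qsize));
        /* Prints 7, even though the device may still be holding all 7
         * NvmeRequest resources; this is the over-commit the patch handles
         * by rescheduling the SQ bottom half when a resource is returned. */
        return 0;
    }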
Fixes: f3c507adcd7b ("NVMe: Initial commit for new storage interface") Cc: qemu-stable@nongnu.org Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2388 Reported-by: Waldemar Kozaczuk Reviewed-by: Keith Busch Signed-off-by: Klaus Jensen Signed-off-by: Zhongrui Tang --- hw/nvme/ctrl.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index 104aebc5ea..29445938d5 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -1516,9 +1516,16 @@ static void nvme_post_cqes(void *opaque) stl_le_p(&n->bar.csts, NVME_CSTS_FAILED); break; } + QTAILQ_REMOVE(&cq->req_list, req, entry); + nvme_inc_cq_tail(cq); nvme_sg_unmap(&req->sg); + + if (QTAILQ_EMPTY(&sq->req_list) && !nvme_sq_empty(sq)) { + qemu_bh_schedule(sq->bh); + } + QTAILQ_INSERT_TAIL(&sq->req_list, req, entry); } if (cq->tail != cq->head) { @@ -7575,7 +7582,6 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val) /* Completion queue doorbell write */ uint16_t new_head = val & 0xffff; - int start_sqs; NvmeCQueue *cq; qid = (addr - (0x1000 + (1 << 2))) >> 3; @@ -7626,18 +7632,15 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val) trace_pci_nvme_mmio_doorbell_cq(cq->cqid, new_head); - start_sqs = nvme_cq_full(cq) ? 1 : 0; + /* scheduled deferred cqe posting if queue was previously full */ + if (nvme_cq_full(cq)) { + qemu_bh_schedule(cq->bh); + } + cq->head = new_head; if (!qid && n->dbbuf_enabled) { stl_le_pci_dma(pci, cq->db_addr, cq->head, MEMTXATTRS_UNSPECIFIED); } - if (start_sqs) { - NvmeSQueue *sq; - QTAILQ_FOREACH(sq, &cq->sq_list, entry) { - qemu_bh_schedule(sq->bh); - } - qemu_bh_schedule(cq->bh); - } if (cq->tail == cq->head) { if (cq->irq_enabled) { -- Gitee From dffc0f55d93ececee55a8548d7dab227ee76b234 Mon Sep 17 00:00:00 2001 From: liupingwei Date: Thu, 24 Oct 2024 19:05:58 +0800 Subject: [PATCH 503/939] cvm : Add support for TEE-based national encryption acceleration. This commit enables the use of TEE for national encryption acceleration in cvm and speeds up OpenSSL encrption /decryption operations. Signed-off-by: liupingwei --- hw/arm/virt.c | 61 ++++++++++++++++++++++++++++++- include/hw/arm/virt.h | 1 + linux-headers/asm-arm64/kvm.h | 10 ++++++ qapi/qom.json | 1 + target/arm/kvm-tmm.c | 68 +++++++++++++++++++++++++++++++++-- target/arm/kvm_arm.h | 4 +++ 6 files changed, 142 insertions(+), 3 deletions(-) diff --git a/hw/arm/virt.c b/hw/arm/virt.c index e73a795d3d..248788db03 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -1967,6 +1967,10 @@ static void virt_set_memmap(VirtMachineState *vms, int pa_bits) "kvm-type", &error_abort); if (!strcmp(kvm_type, "cvm")) { + /* support kae vf device tree nodes */ + vms->memmap[VIRT_PCIE_MMIO] = (MemMapEntry) { 0x10000000, 0x2edf0000 }; + vms->memmap[VIRT_KAE_DEVICE] = (MemMapEntry) { 0x3edf0000, 0x00200000 }; + vms->memmap[VIRT_MEM].base = 3 * GiB; vms->memmap[VIRT_MEM].size = ms->ram_size; info_report("[qemu] fix VIRT_MEM range 0x%llx - 0x%llx\n", (unsigned long long)(vms->memmap[VIRT_MEM].base), @@ -2380,6 +2384,56 @@ out: return; } +static void fdt_add_hisi_sec_nodes(const VirtMachineState *vms, int dev_id) +{ + const MachineState *ms = MACHINE(vms); + hwaddr size = 0x10000; + + /* + * Calculate the base address for the sec device node. + * Each device group contains one sec device and one hpre device,spaced by 2 * size. 
+ */ + hwaddr base = vms->memmap[VIRT_KAE_DEVICE].base + dev_id * 2 * size; + char *nodename; + + tmm_set_sec_addr(base, dev_id); + + nodename = g_strdup_printf("/hisi-sec@%" PRIx64, base); + qemu_fdt_add_subnode(ms->fdt, nodename); + qemu_fdt_setprop_string(ms->fdt, nodename, "compatible", "hisilicon,hip07-sec-vf"); + qemu_fdt_setprop_sized_cells(ms->fdt, nodename, "reg", 2, base, 2, size); + g_free(nodename); +} + +static void fdt_add_hisi_hpre_nodes(const VirtMachineState *vms, int dev_id) +{ + const MachineState *ms = MACHINE(vms); + hwaddr size = 0x10000; + + /* + * Calculate the base address for the hpre device node. + * Each hpre device follows the corresponding sec device by an additional offset of size. + */ + hwaddr base = vms->memmap[VIRT_KAE_DEVICE].base + dev_id * 2 * size + size; + char *nodename; + + tmm_set_hpre_addr(base, dev_id); + + nodename = g_strdup_printf("/hisi-hpre@%" PRIx64, base); + qemu_fdt_add_subnode(ms->fdt, nodename); + qemu_fdt_setprop_string(ms->fdt, nodename, "compatible", "hisilicon,hip07-hpre-vf"); + qemu_fdt_setprop_sized_cells(ms->fdt, nodename, "reg", 2, base, 2, size); + g_free(nodename); +} + +static void fdt_add_all_hisi_nodes(const VirtMachineState *vms, int dev_id) +{ + for (int i = 0; i < dev_id; i++) { + fdt_add_hisi_sec_nodes(vms, i); + fdt_add_hisi_hpre_nodes(vms, i); + } +} + static void machvirt_init(MachineState *machine) { VirtMachineState *vms = VIRT_MACHINE(machine); @@ -2530,14 +2584,19 @@ static void machvirt_init(MachineState *machine) } } + create_fdt(vms); + if (virtcca_cvm_enabled()) { + int kae_num = tmm_get_kae_num(); + fdt_add_all_hisi_nodes(vms, kae_num); + int ret = kvm_arm_tmm_init(machine->cgs, &error_fatal); if (ret != 0) { error_report("fail to initialize TMM"); exit(1); } } - create_fdt(vms); + qemu_log("cpu init start\n"); cpu_class = object_class_by_name(machine->cpu_type); diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h index 27f5333772..76a0d3fa5b 100644 --- a/include/hw/arm/virt.h +++ b/include/hw/arm/virt.h @@ -66,6 +66,7 @@ enum { VIRT_FW_CFG, VIRT_PCIE, VIRT_PCIE_MMIO, + VIRT_KAE_DEVICE, VIRT_PCIE_PIO, VIRT_PCIE_ECAM, VIRT_PLATFORM_BUS, diff --git a/linux-headers/asm-arm64/kvm.h b/linux-headers/asm-arm64/kvm.h index 2b040b5d60..552fdcb18f 100644 --- a/linux-headers/asm-arm64/kvm.h +++ b/linux-headers/asm-arm64/kvm.h @@ -541,6 +541,9 @@ struct reg_mask_range { #define KVM_CAP_ARM_TMM_CFG_SVE 2 #define KVM_CAP_ARM_TMM_CFG_DBG 3 #define KVM_CAP_ARM_TMM_CFG_PMU 4 +#define KVM_CAP_ARM_TMM_CFG_KAE 5 + +#define KVM_ARM_TMM_MAX_KAE_VF_NUM 11 struct kvm_cap_arm_tmm_config_item { __u32 cfg; @@ -570,6 +573,13 @@ struct kvm_cap_arm_tmm_config_item { struct { __u32 num_pmu_cntrs; }; + + /* cfg == KVM_CAP_ARM_TMM_CFG_KAE */ + struct { + __u32 kae_vf_num; + __u64 sec_addr[KVM_ARM_TMM_MAX_KAE_VF_NUM]; + __u64 hpre_addr[KVM_ARM_TMM_MAX_KAE_VF_NUM]; + }; /* Fix the size of the union */ __u8 reserved[256]; }; diff --git a/qapi/qom.json b/qapi/qom.json index 213edd8db2..293d727a04 100644 --- a/qapi/qom.json +++ b/qapi/qom.json @@ -921,6 +921,7 @@ { 'struct': 'TmmGuestProperties', 'data': { '*sve-vector-length': 'uint32', '*num-pmu-counters': 'uint32', + '*kae': 'uint32', '*measurement-algo': 'TmmGuestMeasurementAlgo' } } ## diff --git a/target/arm/kvm-tmm.c b/target/arm/kvm-tmm.c index efe2ca0006..ea6bcc0f40 100644 --- a/target/arm/kvm-tmm.c +++ b/target/arm/kvm-tmm.c @@ -19,13 +19,20 @@ #include "sysemu/kvm.h" #include "sysemu/runstate.h" #include "hw/loader.h" +#include "linux-headers/asm-arm64/kvm.h" #define 
TYPE_TMM_GUEST "tmm-guest" OBJECT_DECLARE_SIMPLE_TYPE(TmmGuest, TMM_GUEST) #define TMM_PAGE_SIZE qemu_real_host_page_size() -#define TMM_MAX_PMU_CTRS 0x20 -#define TMM_MAX_CFG 5 +#define TMM_MAX_PMU_CTRS 0x20 +#define TMM_MAX_CFG 6 + +typedef struct { + uint32_t kae_vf_num; + hwaddr sec_addr[KVM_ARM_TMM_MAX_KAE_VF_NUM]; + hwaddr hpre_addr[KVM_ARM_TMM_MAX_KAE_VF_NUM]; +} KaeDeviceInfo; struct TmmGuest { ConfidentialGuestSupport parent_obj; @@ -33,6 +40,7 @@ struct TmmGuest { TmmGuestMeasurementAlgo measurement_algo; uint32_t sve_vl; uint32_t num_pmu_cntrs; + KaeDeviceInfo kae_device_info; }; typedef struct { @@ -92,6 +100,17 @@ static int tmm_configure_one(TmmGuest *guest, uint32_t cfg, Error **errp) args.num_pmu_cntrs = guest->num_pmu_cntrs; cfg_str = "PMU"; break; + case KVM_CAP_ARM_TMM_CFG_KAE: + if (!guest->kae_device_info.kae_vf_num) { + return 0; + } + args.kae_vf_num= guest->kae_device_info.kae_vf_num; + for (int i = 0; i < guest->kae_device_info.kae_vf_num; i++) { + args.sec_addr[i] = guest->kae_device_info.sec_addr[i]; + args.hpre_addr[i] = guest->kae_device_info.hpre_addr[i]; + } + cfg_str = "KAE"; + break; default: g_assert_not_reached(); } @@ -289,6 +308,47 @@ static void tmm_set_measurement_algo(Object *obj, int algo, Error **errp G_GNUC_ guest->measurement_algo = algo; } +static void tmm_get_kae_vf_num(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + TmmGuest *guest = TMM_GUEST(obj); + + visit_type_uint32(v, name, &guest->kae_device_info.kae_vf_num, errp); +} + +static void tmm_set_kae_vf_num(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + TmmGuest *guest = TMM_GUEST(obj); + uint32_t value; + + if (!visit_type_uint32(v, name, &value, errp)) { + return; + } + + if (value > KVM_ARM_TMM_MAX_KAE_VF_NUM) { + error_setg(errp, "invalid number of kae vfs"); + return; + } + + guest->kae_device_info.kae_vf_num = value; +} + +int tmm_get_kae_num(void) +{ + return tmm_guest->kae_device_info.kae_vf_num; +} + +void tmm_set_sec_addr(hwaddr base, int num) +{ + tmm_guest->kae_device_info.sec_addr[num] = base; +} + +void tmm_set_hpre_addr(hwaddr base, int num) +{ + tmm_guest->kae_device_info.hpre_addr[num] = base; +} + static void tmm_guest_class_init(ObjectClass *oc, void *data) { object_class_property_add_enum(oc, "measurement-algo", @@ -314,6 +374,10 @@ static void tmm_guest_class_init(ObjectClass *oc, void *data) NULL, NULL); object_class_property_set_description(oc, "num-pmu-counters", "Number of PMU counters"); + object_class_property_add(oc, "kae", "uint32", tmm_get_kae_vf_num, + tmm_set_kae_vf_num, NULL, NULL); + object_class_property_set_description(oc, "kae", + "Number of KAE virtual functions. 
0 disables KAE (the default)"); } static void tmm_guest_instance_init(Object *obj) diff --git a/target/arm/kvm_arm.h b/target/arm/kvm_arm.h index d6c7139f4a..31457a57f7 100644 --- a/target/arm/kvm_arm.h +++ b/target/arm/kvm_arm.h @@ -390,6 +390,10 @@ int kvm_arm_set_irq(int cpu, int irqtype, int irq, int level); void tmm_add_ram_region(hwaddr base1, hwaddr len1, hwaddr base2, hwaddr len2, bool populate); +int tmm_get_kae_num(void); +void tmm_set_sec_addr(hwaddr base, int num); +void tmm_set_hpre_addr(hwaddr base, int num); + int kvm_arm_tmm_init(ConfidentialGuestSupport *cgs, Error **errp); bool kvm_arm_tmm_enabled(void); -- Gitee From 0cf5a4c56d34542bcc2f646446bf54828a51a014 Mon Sep 17 00:00:00 2001 From: yangxiangkai Date: Tue, 12 Nov 2024 09:03:51 +0800 Subject: [PATCH 504/939] Add virtCCA Coda annotation Adjust the position of the security device Signed-off-by: yangxiangkai --- hw/arm/virt.c | 1 + linux-headers/linux/vfio.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/hw/arm/virt.c b/hw/arm/virt.c index e73a795d3d..a744393f6e 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -162,6 +162,7 @@ static const MemMapEntry base_memmap[] = { [VIRT_PVTIME] = { 0x090a0000, 0x00010000 }, [VIRT_SECURE_GPIO] = { 0x090b0000, 0x00001000 }, [VIRT_CPUHP_ACPI] = { 0x090c0000, ACPI_CPU_HOTPLUG_REG_LEN}, + /* In the virtCCA scenario, this space is used for MSI interrupt mapping */ [VIRT_MMIO] = { 0x0a000000, 0x00000200 }, [VIRT_CPUFREQ] = { 0x0b000000, 0x00010000 }, /* ...repeating for a total of NUM_VIRTIO_TRANSPORTS, each of that size */ diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h index c27a43d74b..5b1e2871af 100644 --- a/linux-headers/linux/vfio.h +++ b/linux-headers/linux/vfio.h @@ -225,7 +225,7 @@ struct vfio_device_info { #define VFIO_DEVICE_FLAGS_FSL_MC (1 << 6) /* vfio-fsl-mc device */ #define VFIO_DEVICE_FLAGS_CAPS (1 << 7) /* Info supports caps */ #define VFIO_DEVICE_FLAGS_CDX (1 << 8) /* vfio-cdx device */ -#define VFIO_DEVICE_FLAGS_SECURE (1 << 9) /* secure pci device */ +#define VFIO_DEVICE_FLAGS_SECURE (1 << 15) /* secure pci device */ __u32 num_regions; /* Max region index + 1 */ __u32 num_irqs; /* Max IRQ index + 1 */ __u32 cap_offset; /* Offset within info struct of first cap */ -- Gitee From 54648e0e5a45acf2e472430ee83bb8dfa057fb30 Mon Sep 17 00:00:00 2001 From: jiangxin Date: Tue, 24 Aug 2021 14:57:28 +0800 Subject: [PATCH 505/939] target/i386: csv: Add CSV3 context CSV/CSV2/CSV3 are the secure virtualization features on Hygon CPUs. The CSV and CSV2 are compatible with the AMD SEV and SEV-ES, respectively. From CSV3, we introduced more secure features to protect the guest, users can bit 6 of the guest policy to run a CSV3 guest. Add the context and the build option. 
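For reference, bit 6 of the guest policy corresponds to a mask of 0x40. A minimal sketch of the intended test follows (the real helper, csv3_enabled(), is added in the diff below; the policy value used here is only an example):

    #include <stdio.h>

    #define GUEST_POLICY_CSV3_BIT (1 << 6)   /* 0x40, as defined in csv.h */

    int main(void)
    {
        unsigned int policy = 0x40;          /* example: CSV3 bit set */

        printf("CSV3 requested: %s\n",
               (policy & GUEST_POLICY_CSV3_BIT) ? "yes" : "no");
        return 0;
    }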
Signed-off-by: Xin Jiang Signed-off-by: hanliyang --- target/i386/csv.c | 11 +++++++++++ target/i386/csv.h | 17 +++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/target/i386/csv.c b/target/i386/csv.c index 88fb05ac37..9a1de04db7 100644 --- a/target/i386/csv.c +++ b/target/i386/csv.c @@ -18,3 +18,14 @@ #include "csv.h" bool csv_kvm_cpu_reset_inhibit; + +Csv3GuestState csv3_guest = { 0 }; + +bool +csv3_enabled(void) +{ + if (!is_hygon_cpu()) + return false; + + return sev_es_enabled() && (csv3_guest.policy & GUEST_POLICY_CSV3_BIT); +} diff --git a/target/i386/csv.h b/target/i386/csv.h index 05e7fd8dc1..ea87c1ba27 100644 --- a/target/i386/csv.h +++ b/target/i386/csv.h @@ -14,6 +14,9 @@ #ifndef I386_CSV_H #define I386_CSV_H +#include "qapi/qapi-commands-misc-target.h" + +#define GUEST_POLICY_CSV3_BIT (1 << 6) #define GUEST_POLICY_REUSE_ASID (1 << 7) #ifdef CONFIG_CSV @@ -40,9 +43,12 @@ static bool __attribute__((unused)) is_hygon_cpu(void) return false; } +bool csv3_enabled(void); + #else #define is_hygon_cpu() (false) +#define csv3_enabled() (false) #endif @@ -66,4 +72,15 @@ int csv_load_queued_incoming_pages(QEMUFile *f); int csv_save_outgoing_cpu_state(QEMUFile *f, uint64_t *bytes_sent); int csv_load_incoming_cpu_state(QEMUFile *f); +/* CSV3 */ +struct Csv3GuestState { + uint32_t policy; + int sev_fd; + void *state; +}; + +typedef struct Csv3GuestState Csv3GuestState; + +extern struct Csv3GuestState csv3_guest; + #endif -- Gitee From 9a12c439cb9d1e59175be4b96adf0732dca39db3 Mon Sep 17 00:00:00 2001 From: Zhang Jiao Date: Tue, 12 Nov 2024 13:30:29 +0800 Subject: [PATCH 506/939] exec/memop: Remove unused memop_big_endian() helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cheery-pick from 5caa0e1b1bf8597ea7277391b0e17e8584fad18f Last use of memop_big_endian() was removed in commit 592134617c9 ("accel/tcg: Reorg system mode store helpers"). Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Thomas Huth Message-Id: <20241003234211.53644-3-philmd@linaro.org> Signed-off-by: Zhang Jiao --- include/exec/memop.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/include/exec/memop.h b/include/exec/memop.h index a86dc6743a..5b9064819c 100644 --- a/include/exec/memop.h +++ b/include/exec/memop.h @@ -164,10 +164,4 @@ static inline MemOp size_memop(unsigned size) return ctz32(size); } -/* Big endianness from MemOp. */ -static inline bool memop_big_endian(MemOp op) -{ - return (op & MO_BSWAP) == MO_BE; -} - #endif -- Gitee From 4fc36060bec2ac7de500068211b1282c38e3e073 Mon Sep 17 00:00:00 2001 From: Zhang Jiao Date: Tue, 12 Nov 2024 14:05:45 +0800 Subject: [PATCH 507/939] qemu/bswap: Undefine CPU_CONVERT() once done MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cheery-pick from 1d73353f236209e9b5987d7c6b30b2a32b739210 Better undefined macros once we are done with them, like we do few lines later with DO_STN_LDN_P(). 
Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Thomas Huth Message-Id: <20241003234211.53644-2-philmd@linaro.org> Signed-off-by: Zhang Jiao --- include/qemu/bswap.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/qemu/bswap.h b/include/qemu/bswap.h index 933a66ee87..49e4944457 100644 --- a/include/qemu/bswap.h +++ b/include/qemu/bswap.h @@ -138,6 +138,8 @@ CPU_CONVERT(le, 16, uint16_t) CPU_CONVERT(le, 32, uint32_t) CPU_CONVERT(le, 64, uint64_t) +#undef CPU_CONVERT + /* * Same as cpu_to_le{16,32,64}, except that gcc will figure the result is * a compile-time constant if you pass in a constant. So this can be -- Gitee From 322f39889ff60a6fda87d7d95a6f233efb558e8a Mon Sep 17 00:00:00 2001 From: Marco Palumbi Date: Thu, 1 Aug 2024 10:15:02 +0100 Subject: [PATCH 508/939] hw/arm/mps2-tz.c: fix RX/TX interrupts order The order of the RX and TX interrupts are swapped. This commit fixes the order as per the following documents: * https://developer.arm.com/documentation/dai0505/latest/ * https://developer.arm.com/documentation/dai0521/latest/ * https://developer.arm.com/documentation/dai0524/latest/ * https://developer.arm.com/documentation/dai0547/latest/ Cc: qemu-stable@nongnu.org Signed-off-by: Marco Palumbi Message-id: 20240730073123.72992-1-marco@palumbi.it Reviewed-by: Peter Maydell Signed-off-by: Peter Maydell (cherry picked from commit 5a558be93ad628e5bed6e0ee062870f49251725c) Signed-off-by: zhujun2 --- hw/arm/mps2-tz.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hw/arm/mps2-tz.c b/hw/arm/mps2-tz.c index 668db5ed61..9d9c263ef8 100644 --- a/hw/arm/mps2-tz.c +++ b/hw/arm/mps2-tz.c @@ -435,7 +435,7 @@ static MemoryRegion *make_uart(MPS2TZMachineState *mms, void *opaque, const char *name, hwaddr size, const int *irqs, const PPCExtraData *extradata) { - /* The irq[] array is tx, rx, combined, in that order */ + /* The irq[] array is rx, tx, combined, in that order */ MPS2TZMachineClass *mmc = MPS2TZ_MACHINE_GET_CLASS(mms); CMSDKAPBUART *uart = opaque; int i = uart - &mms->uart[0]; @@ -447,8 +447,8 @@ static MemoryRegion *make_uart(MPS2TZMachineState *mms, void *opaque, qdev_prop_set_uint32(DEVICE(uart), "pclk-frq", mmc->apb_periph_frq); sysbus_realize(SYS_BUS_DEVICE(uart), &error_fatal); s = SYS_BUS_DEVICE(uart); - sysbus_connect_irq(s, 0, get_sse_irq_in(mms, irqs[0])); - sysbus_connect_irq(s, 1, get_sse_irq_in(mms, irqs[1])); + sysbus_connect_irq(s, 0, get_sse_irq_in(mms, irqs[1])); + sysbus_connect_irq(s, 1, get_sse_irq_in(mms, irqs[0])); sysbus_connect_irq(s, 2, qdev_get_gpio_in(orgate_dev, i * 2)); sysbus_connect_irq(s, 3, qdev_get_gpio_in(orgate_dev, i * 2 + 1)); sysbus_connect_irq(s, 4, get_sse_irq_in(mms, irqs[2])); -- Gitee From 1b0d08faf1daaed39809ed1a3516eaa0f7d61534 Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Wed, 31 Jul 2024 18:00:19 +0100 Subject: [PATCH 509/939] hw/i386/amd_iommu: Don't leak memory in amdvi_update_iotlb() In amdvi_update_iotlb() we will only put a new entry in the hash table if to_cache.perm is not IOMMU_NONE. However we allocate the memory for the new AMDVIIOTLBEntry and for the hash table key regardless. This means that in the IOMMU_NONE case we will leak the memory we alloacted. Move the allocations into the if() to the point where we know we're going to add the item to the hash table. Cc: qemu-stable@nongnu.org Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2452 Signed-off-by: Peter Maydell Message-Id: <20240731170019.3590563-1-peter.maydell@linaro.org> Reviewed-by: Michael S. 
Tsirkin Signed-off-by: Michael S. Tsirkin (cherry picked from commit 9a45b0761628cc59267b3283a85d15294464ac31) Signed-off-by: zhujun2 --- hw/i386/amd_iommu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c index 4203144da9..12742b1433 100644 --- a/hw/i386/amd_iommu.c +++ b/hw/i386/amd_iommu.c @@ -346,12 +346,12 @@ static void amdvi_update_iotlb(AMDVIState *s, uint16_t devid, uint64_t gpa, IOMMUTLBEntry to_cache, uint16_t domid) { - AMDVIIOTLBEntry *entry = g_new(AMDVIIOTLBEntry, 1); - uint64_t *key = g_new(uint64_t, 1); - uint64_t gfn = gpa >> AMDVI_PAGE_SHIFT_4K; - /* don't cache erroneous translations */ if (to_cache.perm != IOMMU_NONE) { + AMDVIIOTLBEntry *entry = g_new(AMDVIIOTLBEntry, 1); + uint64_t *key = g_new(uint64_t, 1); + uint64_t gfn = gpa >> AMDVI_PAGE_SHIFT_4K; + trace_amdvi_cache_update(domid, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), gpa, to_cache.translated_addr); -- Gitee From e025c40fac7d6cc5b4752c392a9c66a074dcaa0b Mon Sep 17 00:00:00 2001 From: Zhang Jiao Date: Thu, 14 Nov 2024 14:24:58 +0800 Subject: [PATCH 510/939] hw/ppc/e500: Add missing device tree properties to i2c controller node MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cheery-pick from b5d65592d931d07d4f4bcb915d018ec9598058b4 When compiling a decompiled device tree blob created with dumpdtb, dtc complains with: /soc@e0000000/i2c@3000: incorrect #address-cells for I2C bus /soc@e0000000/i2c@3000: incorrect #size-cells for I2C bus Fix this by adding the missing device tree properties. Reviewed-by: Cédric Le Goater Signed-off-by: Bernhard Beschow Message-ID: <20241103133412.73536-6-shentey@gmail.com> Signed-off-by: Philippe Mathieu-Daudé Signed-off-by: Zhang Jiao --- hw/ppc/e500.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hw/ppc/e500.c b/hw/ppc/e500.c index 384226296b..8d394d749a 100644 --- a/hw/ppc/e500.c +++ b/hw/ppc/e500.c @@ -203,6 +203,8 @@ static void dt_i2c_create(void *fdt, const char *soc, const char *mpic, qemu_fdt_setprop_cells(fdt, i2c, "cell-index", 0); qemu_fdt_setprop_cells(fdt, i2c, "interrupts", irq0, 0x2); qemu_fdt_setprop_phandle(fdt, i2c, "interrupt-parent", mpic); + qemu_fdt_setprop_cell(fdt, i2c, "#size-cells", 0); + qemu_fdt_setprop_cell(fdt, i2c, "#address-cells", 1); qemu_fdt_setprop_string(fdt, "/aliases", alias, i2c); g_free(i2c); -- Gitee From 239e256d5510b9aaa3e099359dcda54970e2f08a Mon Sep 17 00:00:00 2001 From: Zhang Jiao Date: Thu, 14 Nov 2024 14:40:02 +0800 Subject: [PATCH 511/939] hw/ppc/e500: Remove unused "irqs" parameter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cheery-pick from 2a309354ac5decf78763c9de999bfb42c8612069 Reviewed-by: BALATON Zoltan Signed-off-by: Bernhard Beschow Message-ID: <20241103133412.73536-5-shentey@gmail.com> Signed-off-by: Philippe Mathieu-Daudé Signed-off-by: Zhang Jiao --- hw/ppc/e500.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/ppc/e500.c b/hw/ppc/e500.c index 384226296b..8ab1ccc969 100644 --- a/hw/ppc/e500.c +++ b/hw/ppc/e500.c @@ -832,7 +832,7 @@ static DeviceState *ppce500_init_mpic_qemu(PPCE500MachineState *pms, } static DeviceState *ppce500_init_mpic_kvm(const PPCE500MachineClass *pmc, - IrqLines *irqs, Error **errp) + Error **errp) { #ifdef CONFIG_KVM DeviceState *dev; @@ -872,7 +872,7 @@ static DeviceState *ppce500_init_mpic(PPCE500MachineState *pms, Error *err = NULL; if (kvm_kernel_irqchip_allowed()) { - dev = 
ppce500_init_mpic_kvm(pmc, irqs, &err); + dev = ppce500_init_mpic_kvm(pmc, &err); } if (kvm_kernel_irqchip_required() && !dev) { error_reportf_err(err, -- Gitee From c7fe47e4aab35c1817c4c53f0025a741a9e2ad57 Mon Sep 17 00:00:00 2001 From: Markus Armbruster Date: Fri, 28 Jun 2024 13:27:56 +0200 Subject: [PATCH 512/939] sphinx/qapidoc: Fix to generate doc for explicit, unboxed arguments When a command's arguments are specified as an explicit type T, generated documentation points to the members of T. Example: ## # @announce-self: # # Trigger generation of broadcast RARP frames to update network [...] ## { 'command': 'announce-self', 'boxed': true, 'data' : 'AnnounceParameters'} generates "announce-self" (Command) ------------------------- Trigger generation of broadcast RARP frames to update network [...] Arguments ~~~~~~~~~ The members of "AnnounceParameters" Except when the command takes its arguments unboxed , i.e. it doesn't have 'boxed': true, we generate *nothing*. A few commands have a reference in their doc comment to compensate, but most don't. Example: ## # @blockdev-snapshot-sync: # # Takes a synchronous snapshot of a block device. # # For the arguments, see the documentation of BlockdevSnapshotSync. [...] ## { 'command': 'blockdev-snapshot-sync', 'data': 'BlockdevSnapshotSync', 'allow-preconfig': true } generates "blockdev-snapshot-sync" (Command) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Takes a synchronous snapshot of a block device. For the arguments, see the documentation of BlockdevSnapshotSync. [...] Same for event data. Fix qapidoc.py to generate the reference regardless of boxing. Delete now redundant references in the doc comments. Fixes: 4078ee5469e5 (docs/sphinx: Add new qapi-doc Sphinx extension) Cc: qemu-stable@nongnu.org Signed-off-by: Markus Armbruster Message-ID: <20240628112756.794237-1-armbru@redhat.com> Reviewed-by: John Snow (cherry picked from commit e389929d19a543ea5b34d02553b355f9f1c03162) Signed-off-by: zhujun2 --- docs/sphinx/qapidoc.py | 12 +++++------- qapi/block-core.json | 7 ------- 2 files changed, 5 insertions(+), 14 deletions(-) diff --git a/docs/sphinx/qapidoc.py b/docs/sphinx/qapidoc.py index 658c288f8f..3d19853444 100644 --- a/docs/sphinx/qapidoc.py +++ b/docs/sphinx/qapidoc.py @@ -229,15 +229,15 @@ def _nodes_for_enum_values(self, doc): section += dlnode return [section] - def _nodes_for_arguments(self, doc, boxed_arg_type): + def _nodes_for_arguments(self, doc, arg_type): """Return list of doctree nodes for the arguments section""" - if boxed_arg_type: + if arg_type and not arg_type.is_implicit(): assert not doc.args section = self._make_section('Arguments') dlnode = nodes.definition_list() dlnode += self._make_dlitem( [nodes.Text('The members of '), - nodes.literal('', boxed_arg_type.name)], + nodes.literal('', arg_type.name)], None) section += dlnode return [section] @@ -341,8 +341,7 @@ def visit_command(self, name, info, ifcond, features, arg_type, allow_preconfig, coroutine): doc = self._cur_doc self._add_doc('Command', - self._nodes_for_arguments(doc, - arg_type if boxed else None) + self._nodes_for_arguments(doc, arg_type) + self._nodes_for_features(doc) + self._nodes_for_sections(doc) + self._nodes_for_if_section(ifcond)) @@ -350,8 +349,7 @@ def visit_command(self, name, info, ifcond, features, arg_type, def visit_event(self, name, info, ifcond, features, arg_type, boxed): doc = self._cur_doc self._add_doc('Event', - self._nodes_for_arguments(doc, - arg_type if boxed else None) + self._nodes_for_arguments(doc, arg_type) + 
self._nodes_for_features(doc) + self._nodes_for_sections(doc) + self._nodes_for_if_section(ifcond)) diff --git a/qapi/block-core.json b/qapi/block-core.json index ded6f0f6d2..0fa184698a 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -1662,8 +1662,6 @@ # # Takes a synchronous snapshot of a block device. # -# For the arguments, see the documentation of BlockdevSnapshotSync. -# # Returns: # - nothing on success # - If @device is not a valid block device, DeviceNotFound @@ -1693,8 +1691,6 @@ # device, the block device changes to using 'overlay' as its new # active image. # -# For the arguments, see the documentation of BlockdevSnapshot. -# # Features: # # @allow-write-only-overlay: If present, the check whether this @@ -6037,9 +6033,6 @@ # string, or a snapshot with name already exists, the operation will # fail. # -# For the arguments, see the documentation of -# BlockdevSnapshotInternal. -# # Returns: # - nothing on success # - If @device is not a valid block device, GenericError -- Gitee From b85c8374d8b78a6ec1c250bb7562423e6f5d89a0 Mon Sep 17 00:00:00 2001 From: Zhang Jiao Date: Thu, 14 Nov 2024 15:12:32 +0800 Subject: [PATCH 513/939] hw/ppc/e500: Prefer QOM cast MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cheery-pick from c620b4ee92ed3664a3d98e0fbb0b651e19fba5b6 Reviewed-by: BALATON Zoltan Signed-off-by: Bernhard Beschow Message-ID: <20241103133412.73536-4-shentey@gmail.com> Signed-off-by: Philippe Mathieu-Daudé Signed-off-by: Zhang Jiao --- hw/ppc/e500.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/ppc/e500.c b/hw/ppc/e500.c index 384226296b..df5a20d3ec 100644 --- a/hw/ppc/e500.c +++ b/hw/ppc/e500.c @@ -1024,7 +1024,7 @@ void ppce500_init(MachineState *machine) sysbus_connect_irq(s, 0, qdev_get_gpio_in(mpicdev, MPC8544_I2C_IRQ)); memory_region_add_subregion(ccsr_addr_space, MPC8544_I2C_REGS_OFFSET, sysbus_mmio_get_region(s, 0)); - i2c = (I2CBus *)qdev_get_child_bus(dev, "i2c"); + i2c = I2C_BUS(qdev_get_child_bus(dev, "i2c")); i2c_slave_create_simple(i2c, "ds1338", RTC_REGS_OFFSET); /* eSDHC */ @@ -1073,7 +1073,7 @@ void ppce500_init(MachineState *machine) memory_region_add_subregion(ccsr_addr_space, MPC8544_PCI_REGS_OFFSET, sysbus_mmio_get_region(s, 0)); - pci_bus = (PCIBus *)qdev_get_child_bus(dev, "pci.0"); + pci_bus = PCI_BUS(qdev_get_child_bus(dev, "pci.0")); if (!pci_bus) printf("couldn't create PCI controller!\n"); -- Gitee From 148e01eba8041bad93081a19a240034bb8138988 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Tue, 25 Jun 2024 11:35:26 -0700 Subject: [PATCH 514/939] target/arm: Fix FJCVTZS vs flush-to-zero Input denormals cause the Javascript inexact bit (output to Z) to be set. 
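The bit pattern used by the regression test added below, 0xfcff00, is a worked example of such an input: its exponent field is zero, so it encodes a subnormal double, which FPCR.FZ flushes to zero before the conversion and therefore makes the result inexact. A small host-side sketch (not the emulation code) shows the value:

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    int main(void)
    {
        uint64_t bits = 0xfcff00;   /* exponent bits all zero -> subnormal */
        double d;

        memcpy(&d, &bits, sizeof(d));
        printf("raw 0x%llx is %.17g\n", (unsigned long long)bits, d);
        /* Under flush-to-zero this input becomes 0.0, so converting it to the
         * integer 0 is not exact and FJCVTZS must leave Z clear, which is why
         * the helper now also looks at float_flag_input_denormal. */
        return 0;
    }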
Cc: qemu-stable@nongnu.org Fixes: 6c1f6f2733a ("target/arm: Implement ARMv8.3-JSConv") Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2375 Reviewed-by: Peter Maydell Signed-off-by: Richard Henderson Message-id: 20240625183536.1672454-4-richard.henderson@linaro.org [PMM: fixed hardcoded tab in test case] Signed-off-by: Peter Maydell (cherry picked from commit 7619129f0d4a14d918227c5c47ad7433662e9ccc) Signed-off-by: zhujun2 --- target/arm/vfp_helper.c | 18 +++++++++--------- tests/tcg/aarch64/Makefile.target | 3 ++- tests/tcg/aarch64/test-2375.c | 21 +++++++++++++++++++++ 3 files changed, 32 insertions(+), 10 deletions(-) create mode 100644 tests/tcg/aarch64/test-2375.c diff --git a/target/arm/vfp_helper.c b/target/arm/vfp_helper.c index 3e5e37abbe..ff59bc5522 100644 --- a/target/arm/vfp_helper.c +++ b/target/arm/vfp_helper.c @@ -1121,8 +1121,8 @@ const FloatRoundMode arm_rmode_to_sf_map[] = { uint64_t HELPER(fjcvtzs)(float64 value, void *vstatus) { float_status *status = vstatus; - uint32_t inexact, frac; - uint32_t e_old, e_new; + uint32_t frac, e_old, e_new; + bool inexact; e_old = get_float_exception_flags(status); set_float_exception_flags(0, status); @@ -1130,13 +1130,13 @@ uint64_t HELPER(fjcvtzs)(float64 value, void *vstatus) e_new = get_float_exception_flags(status); set_float_exception_flags(e_old | e_new, status); - if (value == float64_chs(float64_zero)) { - /* While not inexact for IEEE FP, -0.0 is inexact for JavaScript. */ - inexact = 1; - } else { - /* Normal inexact or overflow or NaN */ - inexact = e_new & (float_flag_inexact | float_flag_invalid); - } + /* Normal inexact, denormal with flush-to-zero, or overflow or NaN */ + inexact = e_new & (float_flag_inexact | + float_flag_input_denormal | + float_flag_invalid); + + /* While not inexact for IEEE FP, -0.0 is inexact for JavaScript. */ + inexact |= value == float64_chs(float64_zero); /* Pack the result and the env->ZF representation of Z together. */ return deposit64(frac, 32, 32, inexact); diff --git a/tests/tcg/aarch64/Makefile.target b/tests/tcg/aarch64/Makefile.target index cded1d01fc..6d593c6392 100644 --- a/tests/tcg/aarch64/Makefile.target +++ b/tests/tcg/aarch64/Makefile.target @@ -40,8 +40,9 @@ endif # Pauth Tests ifneq ($(CROSS_CC_HAS_ARMV8_3),) -AARCH64_TESTS += pauth-1 pauth-2 pauth-4 pauth-5 +AARCH64_TESTS += pauth-1 pauth-2 pauth-4 pauth-5 test-2375 pauth-%: CFLAGS += -march=armv8.3-a +test-2375: CFLAGS += -march=armv8.3-a run-pauth-1: QEMU_OPTS += -cpu max run-pauth-2: QEMU_OPTS += -cpu max # Choose a cpu with FEAT_Pauth but without FEAT_FPAC for pauth-[45]. 
diff --git a/tests/tcg/aarch64/test-2375.c b/tests/tcg/aarch64/test-2375.c new file mode 100644 index 0000000000..84c7e7de71 --- /dev/null +++ b/tests/tcg/aarch64/test-2375.c @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Copyright (c) 2024 Linaro Ltd */ +/* See https://gitlab.com/qemu-project/qemu/-/issues/2375 */ + +#include + +int main(void) +{ + int r, z; + + asm("msr fpcr, %2\n\t" + "fjcvtzs %w0, %d3\n\t" + "cset %1, eq" + : "=r"(r), "=r"(z) + : "r"(0x01000000L), /* FZ = 1 */ + "w"(0xfcff00L)); /* denormal */ + + assert(r == 0); + assert(z == 0); + return 0; +} -- Gitee From 6c76354fdfbebca55e080fea5ae6bfc8a3db2d91 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Mon, 17 Jun 2024 15:57:17 -0300 Subject: [PATCH 515/939] migration: Fix file migration with fdset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the "file:" migration support was added we missed the special case in the qemu_open_old implementation that allows for a particular file name format to be used to refer to a set of file descriptors that have been previously provided to QEMU via the add-fd QMP command. When using this fdset feature, we should not truncate the migration file because being given an fd means that the management layer is in control of the file and will likely already have some data written to it. This is further indicated by the presence of the 'offset' argument, which indicates the start of the region where QEMU is allowed to write. Fix the issue by replacing the O_TRUNC flag on open by an ftruncate call, which will take the offset into consideration. Fixes: 385f510df5 ("migration: file URI offset") Suggested-by: Daniel P. Berrangé Reviewed-by: Prasad Pandit Reviewed-by: Peter Xu Reviewed-by: Daniel P. Berrangé Signed-off-by: Fabiano Rosas (cherry picked from commit 6d3279655ac49b806265f08415165f471d33e032) Signed-off-by: Michael Tokarev Signed-off-by: zhujun2 --- migration/file.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/migration/file.c b/migration/file.c index 5d4975f43e..fb3f743e54 100644 --- a/migration/file.c +++ b/migration/file.c @@ -46,12 +46,19 @@ void file_start_outgoing_migration(MigrationState *s, trace_migration_file_outgoing(filename); - fioc = qio_channel_file_new_path(filename, O_CREAT | O_WRONLY | O_TRUNC, - 0600, errp); + fioc = qio_channel_file_new_path(filename, O_CREAT | O_WRONLY, 0600, errp); if (!fioc) { return; } + if (ftruncate(fioc->fd, offset)) { + error_setg_errno(errp, errno, + "failed to truncate migration file to offset %" PRIx64, + offset); + object_unref(OBJECT(fioc)); + return; + } + ioc = QIO_CHANNEL(fioc); if (offset && qio_channel_io_seek(ioc, offset, SEEK_SET, errp) < 0) { return; -- Gitee From 6477ff9d89317a6124f3a46215b1567306b6ebe4 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Wed, 19 Jun 2024 05:41:13 +0000 Subject: [PATCH 516/939] tcg/loongarch64: Fix tcg_out_movi vs some pcrel pointers Simplify the logic for two-part, 32-bit pc-relative addresses. Rather than assume all such fit in int32_t, do some arithmetic and assert a result, do some arithmetic first and then check to see if the pieces are in range. 
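As a worked example of the two-part case the hunk below implements, pcalau12i supplies the PC-page-relative high part and ori fills in the low 12 bits. The following standalone sketch mirrors the new range check (the addresses are made up):

    #include <stdio.h>
    #include <stdint.h>

    /* sign-extend the low 'len' bits, like tcg's sextreg(val, 0, len) */
    static int64_t sext(uint64_t val, int len)
    {
        uint64_t m = 1ULL << (len - 1);

        val &= (1ULL << len) - 1;
        return (int64_t)((val ^ m) - m);
    }

    int main(void)
    {
        int64_t val    = 0x12345678abcLL;   /* target value (made up) */
        int64_t src_rx = 0x12345000000LL;   /* address of the code (made up) */

        int64_t pc_offset = (val >> 12) - (src_rx >> 12);  /* pcalau12i immediate */
        int64_t val_lo    = val & 0xfff;                   /* ori immediate */

        if (pc_offset == sext(pc_offset, 20)) {
            printf("pcalau12i by %lld pages, then ori 0x%llx\n",
                   (long long)pc_offset, (long long)val_lo);
        } else {
            printf("out of range, fall back to the multi-instruction path\n");
        }
        return 0;
    }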
Cc: qemu-stable@nongnu.org Fixes: dacc51720db ("tcg/loongarch64: Implement tcg_out_mov and tcg_out_movi") Reviewed-by: Song Gao Reported-by: Song Gao Signed-off-by: Richard Henderson (cherry picked from commit 521d7fb3ebdf88112ed13556a93e3037742b9eb8) Signed-off-by: zhujun2 --- tcg/loongarch64/tcg-target.c.inc | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc index bab0a173a3..ad2690b90d 100644 --- a/tcg/loongarch64/tcg-target.c.inc +++ b/tcg/loongarch64/tcg-target.c.inc @@ -365,8 +365,7 @@ static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd, * back to the slow path. */ - intptr_t pc_offset; - tcg_target_long val_lo, val_hi, pc_hi, offset_hi; + intptr_t src_rx, pc_offset; tcg_target_long hi12, hi32, hi52; /* Value fits in signed i32. */ @@ -376,24 +375,23 @@ static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd, } /* PC-relative cases. */ - pc_offset = tcg_pcrel_diff(s, (void *)val); - if (pc_offset == sextreg(pc_offset, 0, 22) && (pc_offset & 3) == 0) { - /* Single pcaddu2i. */ - tcg_out_opc_pcaddu2i(s, rd, pc_offset >> 2); - return; + src_rx = (intptr_t)tcg_splitwx_to_rx(s->code_ptr); + if ((val & 3) == 0) { + pc_offset = val - src_rx; + if (pc_offset == sextreg(pc_offset, 0, 22)) { + /* Single pcaddu2i. */ + tcg_out_opc_pcaddu2i(s, rd, pc_offset >> 2); + return; + } } - if (pc_offset == (int32_t)pc_offset) { - /* Offset within 32 bits; load with pcalau12i + ori. */ - val_lo = sextreg(val, 0, 12); - val_hi = val >> 12; - pc_hi = (val - pc_offset) >> 12; - offset_hi = val_hi - pc_hi; - - tcg_debug_assert(offset_hi == sextreg(offset_hi, 0, 20)); - tcg_out_opc_pcalau12i(s, rd, offset_hi); + pc_offset = (val >> 12) - (src_rx >> 12); + if (pc_offset == sextreg(pc_offset, 0, 20)) { + /* Load with pcalau12i + ori. */ + tcg_target_long val_lo = val & 0xfff; + tcg_out_opc_pcalau12i(s, rd, pc_offset); if (val_lo != 0) { - tcg_out_opc_ori(s, rd, rd, val_lo & 0xfff); + tcg_out_opc_ori(s, rd, rd, val_lo); } return; } -- Gitee From 378d79fa6b9410af702776ffa93865219f273380 Mon Sep 17 00:00:00 2001 From: Anton Johansson Date: Wed, 12 Jun 2024 15:30:31 +0200 Subject: [PATCH 517/939] accel/tcg: Fix typo causing tb->page_addr[1] to not be recorded MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For TBs crossing page boundaries, the 2nd page will never be recorded/removed, as the index of the 2nd page is computed from the address of the 1st page. This is due to a typo, fix it. 
Cc: qemu-stable@nongnu.org Fixes: deba78709a ("accel/tcg: Always lock pages before translation") Signed-off-by: Anton Johansson Reviewed-by: Manos Pitsidianakis Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Alex Bennée Message-Id: <20240612133031.15298-1-anjo@rev.ng> Signed-off-by: Richard Henderson (cherry picked from commit 3b279f73fa37bec8d3ba04a15f5153d6491cffaf) Signed-off-by: zhujun2 --- accel/tcg/tb-maint.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/accel/tcg/tb-maint.c b/accel/tcg/tb-maint.c index 3d2a896220..eb37f9e8a8 100644 --- a/accel/tcg/tb-maint.c +++ b/accel/tcg/tb-maint.c @@ -712,7 +712,7 @@ static void tb_record(TranslationBlock *tb) tb_page_addr_t paddr0 = tb_page_addr0(tb); tb_page_addr_t paddr1 = tb_page_addr1(tb); tb_page_addr_t pindex0 = paddr0 >> TARGET_PAGE_BITS; - tb_page_addr_t pindex1 = paddr0 >> TARGET_PAGE_BITS; + tb_page_addr_t pindex1 = paddr1 >> TARGET_PAGE_BITS; assert(paddr0 != -1); if (unlikely(paddr1 != -1) && pindex0 != pindex1) { @@ -744,7 +744,7 @@ static void tb_remove(TranslationBlock *tb) tb_page_addr_t paddr0 = tb_page_addr0(tb); tb_page_addr_t paddr1 = tb_page_addr1(tb); tb_page_addr_t pindex0 = paddr0 >> TARGET_PAGE_BITS; - tb_page_addr_t pindex1 = paddr0 >> TARGET_PAGE_BITS; + tb_page_addr_t pindex1 = paddr1 >> TARGET_PAGE_BITS; assert(paddr0 != -1); if (unlikely(paddr1 != -1) && pindex0 != pindex1) { -- Gitee From 194c3cadc1879ff4c3d2fc6c5f962ad751c83d9c Mon Sep 17 00:00:00 2001 From: Huang Tao Date: Mon, 25 Mar 2024 10:16:54 +0800 Subject: [PATCH 518/939] target/riscv: Fix the element agnostic function problem In RVV and vcrypto instructions, the masked and tail elements are set to 1s using vext_set_elems_1s function if the vma/vta bit is set. It is the element agnostic policy. However, this function can't deal the big endian situation. This patch fixes the problem by adding handling of such case. Signed-off-by: Huang Tao Suggested-by: Richard Henderson Reviewed-by: LIU Zhiwei Cc: qemu-stable Message-ID: <20240325021654.6594-1-eric.huang@linux.alibaba.com> Signed-off-by: Alistair Francis (cherry picked from commit 75115d880c6d396f8a2d56aab8c12236d85a90e0) Signed-off-by: zhujun2 --- target/riscv/vector_internals.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/target/riscv/vector_internals.c b/target/riscv/vector_internals.c index 9cf5c17cde..be6eb040d2 100644 --- a/target/riscv/vector_internals.c +++ b/target/riscv/vector_internals.c @@ -29,6 +29,28 @@ void vext_set_elems_1s(void *base, uint32_t is_agnostic, uint32_t cnt, if (tot - cnt == 0) { return ; } + + if (HOST_BIG_ENDIAN) { + /* + * Deal the situation when the elements are insdie + * only one uint64 block including setting the + * masked-off element. + */ + if (((tot - 1) ^ cnt) < 8) { + memset(base + H1(tot - 1), -1, tot - cnt); + return; + } + /* + * Otherwise, at least cross two uint64_t blocks. + * Set first unaligned block. 
+ */ + if (cnt % 8 != 0) { + uint32_t j = ROUND_UP(cnt, 8); + memset(base + H1(j - 1), -1, j - cnt); + cnt = j; + } + /* Set other 64bit aligend blocks */ + } memset(base + cnt, -1, tot - cnt); } -- Gitee From 4dccc6603af2cd3deefb6ac94c3e7aec4b60485d Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Fri, 17 May 2024 21:50:14 -0500 Subject: [PATCH 519/939] qio: Inherit follow_coroutine_ctx across TLS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since qemu 8.2, the combination of NBD + TLS + iothread crashes on an assertion failure: qemu-kvm: ../io/channel.c:534: void qio_channel_restart_read(void *): Assertion `qemu_get_current_aio_context() == qemu_coroutine_get_aio_context(co)' failed. It turns out that when we removed AioContext locking, we did so by having NBD tell its qio channels that it wanted to opt in to qio_channel_set_follow_coroutine_ctx(); but while we opted in on the main channel, we did not opt in on the TLS wrapper channel. qemu-iotests has coverage of NBD+iothread and NBD+TLS, but apparently no coverage of NBD+TLS+iothread, or we would have noticed this regression sooner. (I'll add that in the next patch) But while we could manually opt in to the TLS channel in nbd/server.c (a one-line change), it is more generic if all qio channels that wrap other channels inherit the follow status, in the same way that they inherit feature bits. CC: Stefan Hajnoczi CC: Daniel P. Berrangé CC: qemu-stable@nongnu.org Fixes: https://issues.redhat.com/browse/RHEL-34786 Fixes: 06e0f098 ("io: follow coroutine AioContext in qio_channel_yield()", v8.2.0) Signed-off-by: Eric Blake Reviewed-by: Stefan Hajnoczi Reviewed-by: Daniel P. Berrangé Message-ID: <20240518025246.791593-5-eblake@redhat.com> (cherry picked from commit 199e84de1c903ba5aa1f7256310bbc4a20dd930b) Signed-off-by: zhujun2 --- io/channel-tls.c | 26 +++++++++++++++----------- io/channel-websock.c | 1 + 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/io/channel-tls.c b/io/channel-tls.c index 58fe1aceee..a8ad89c3d1 100644 --- a/io/channel-tls.c +++ b/io/channel-tls.c @@ -69,37 +69,40 @@ qio_channel_tls_new_server(QIOChannel *master, const char *aclname, Error **errp) { - QIOChannelTLS *ioc; + QIOChannelTLS *tioc; + QIOChannel *ioc; - ioc = QIO_CHANNEL_TLS(object_new(TYPE_QIO_CHANNEL_TLS)); + tioc = QIO_CHANNEL_TLS(object_new(TYPE_QIO_CHANNEL_TLS)); + ioc = QIO_CHANNEL(tioc); - ioc->master = master; + tioc->master = master; + ioc->follow_coroutine_ctx = master->follow_coroutine_ctx; if (qio_channel_has_feature(master, QIO_CHANNEL_FEATURE_SHUTDOWN)) { - qio_channel_set_feature(QIO_CHANNEL(ioc), QIO_CHANNEL_FEATURE_SHUTDOWN); + qio_channel_set_feature(ioc, QIO_CHANNEL_FEATURE_SHUTDOWN); } object_ref(OBJECT(master)); - ioc->session = qcrypto_tls_session_new( + tioc->session = qcrypto_tls_session_new( creds, NULL, aclname, QCRYPTO_TLS_CREDS_ENDPOINT_SERVER, errp); - if (!ioc->session) { + if (!tioc->session) { goto error; } qcrypto_tls_session_set_callbacks( - ioc->session, + tioc->session, qio_channel_tls_write_handler, qio_channel_tls_read_handler, - ioc); + tioc); - trace_qio_channel_tls_new_server(ioc, master, creds, aclname); - return ioc; + trace_qio_channel_tls_new_server(tioc, master, creds, aclname); + return tioc; error: - object_unref(OBJECT(ioc)); + object_unref(OBJECT(tioc)); return NULL; } @@ -116,6 +119,7 @@ qio_channel_tls_new_client(QIOChannel *master, ioc = QIO_CHANNEL(tioc); tioc->master = master; + ioc->follow_coroutine_ctx = master->follow_coroutine_ctx; if 
(qio_channel_has_feature(master, QIO_CHANNEL_FEATURE_SHUTDOWN)) { qio_channel_set_feature(ioc, QIO_CHANNEL_FEATURE_SHUTDOWN); } diff --git a/io/channel-websock.c b/io/channel-websock.c index a12acc27cf..de39f0d182 100644 --- a/io/channel-websock.c +++ b/io/channel-websock.c @@ -883,6 +883,7 @@ qio_channel_websock_new_server(QIOChannel *master) ioc = QIO_CHANNEL(wioc); wioc->master = master; + ioc->follow_coroutine_ctx = master->follow_coroutine_ctx; if (qio_channel_has_feature(master, QIO_CHANNEL_FEATURE_SHUTDOWN)) { qio_channel_set_feature(ioc, QIO_CHANNEL_FEATURE_SHUTDOWN); } -- Gitee From 20541823659dc78a6a7be427f8fc03ccc58c88d1 Mon Sep 17 00:00:00 2001 From: Andrey Shumilin Date: Thu, 23 May 2024 16:06:20 +0100 Subject: [PATCH 520/939] hw/intc/arm_gic: Fix handling of NS view of GICC_APR MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In gic_cpu_read() and gic_cpu_write(), we delegate the handling of reading and writing the Non-Secure view of the GICC_APR registers to functions gic_apr_ns_view() and gic_apr_write_ns_view(). Unfortunately we got the order of the arguments wrong, swapping the CPU number and the register number (which the compiler doesn't catch because they're both integers). Most guests probably didn't notice this bug because directly accessing the APR registers is typically something only done by firmware when it is doing state save for going into a sleep mode. Correct the mismatched call arguments. Found by Linux Verification Center (linuxtesting.org) with SVACE. Cc: qemu-stable@nongnu.org Fixes: 51fd06e0ee ("hw/intc/arm_gic: Fix handling of GICC_APR, GICC_NSAPR registers") Signed-off-by: Andrey Shumilin [PMM: Rewrote commit message] Signed-off-by: Peter Maydell Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Alex Bennée (cherry picked from commit daafa78b297291fea36fb4daeed526705fa7c035) Signed-off-by: zhujun2 --- hw/intc/arm_gic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/intc/arm_gic.c b/hw/intc/arm_gic.c index dfe7a0a729..f0582f7a49 100644 --- a/hw/intc/arm_gic.c +++ b/hw/intc/arm_gic.c @@ -1663,7 +1663,7 @@ static MemTxResult gic_cpu_read(GICState *s, int cpu, int offset, *data = s->h_apr[gic_get_vcpu_real_id(cpu)]; } else if (gic_cpu_ns_access(s, cpu, attrs)) { /* NS view of GICC_APR is the top half of GIC_NSAPR */ - *data = gic_apr_ns_view(s, regno, cpu); + *data = gic_apr_ns_view(s, cpu, regno); } else { *data = s->apr[regno][cpu]; } @@ -1751,7 +1751,7 @@ static MemTxResult gic_cpu_write(GICState *s, int cpu, int offset, s->h_apr[gic_get_vcpu_real_id(cpu)] = value; } else if (gic_cpu_ns_access(s, cpu, attrs)) { /* NS view of GICC_APR is the top half of GIC_NSAPR */ - gic_apr_write_ns_view(s, regno, cpu, value); + gic_apr_write_ns_view(s, cpu, regno, value); } else { s->apr[regno][cpu] = value; } -- Gitee From ab7c657e05f896600c310c74e7584fc345ff235c Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Thu, 23 May 2024 16:06:19 +0100 Subject: [PATCH 521/939] hvf: arm: Fix encodings for ID_AA64PFR1_EL1 and debug System registers We wrongly encoded ID_AA64PFR1_EL1 using {3,0,0,4,2} in hvf_sreg_match[] so we fail to get the expected ARMCPRegInfo from cp_regs hash table with the wrong key. Fix it with the correct encoding {3,0,0,4,1}. With that fixed, the Linux guest can properly detect FEAT_SSBS2 on my M1 HW. All DBG{B,W}{V,C}R_EL1 registers are also wrongly encoded with op0 == 14. 
It happens to work because HVF_SYSREG(CRn, CRm, 14, op1, op2) equals to HVF_SYSREG(CRn, CRm, 2, op1, op2), by definition. But we shouldn't rely on it. Cc: qemu-stable@nongnu.org Fixes: a1477da3ddeb ("hvf: Add Apple Silicon support") Signed-off-by: Zenghui Yu Reviewed-by: Alexander Graf Message-id: 20240503153453.54389-1-zenghui.yu@linux.dev Signed-off-by: Peter Maydell (cherry picked from commit 19ed42e8adc87a3c739f61608b66a046bb9237e2) Signed-off-by: zhujun2 --- target/arm/hvf/hvf.c | 160 +++++++++++++++++++++---------------------- 1 file changed, 80 insertions(+), 80 deletions(-) diff --git a/target/arm/hvf/hvf.c b/target/arm/hvf/hvf.c index b4e98a99e2..d7cc00a084 100644 --- a/target/arm/hvf/hvf.c +++ b/target/arm/hvf/hvf.c @@ -392,85 +392,85 @@ struct hvf_sreg_match { }; static struct hvf_sreg_match hvf_sreg_match[] = { - { HV_SYS_REG_DBGBVR0_EL1, HVF_SYSREG(0, 0, 14, 0, 4) }, - { HV_SYS_REG_DBGBCR0_EL1, HVF_SYSREG(0, 0, 14, 0, 5) }, - { HV_SYS_REG_DBGWVR0_EL1, HVF_SYSREG(0, 0, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR0_EL1, HVF_SYSREG(0, 0, 14, 0, 7) }, - - { HV_SYS_REG_DBGBVR1_EL1, HVF_SYSREG(0, 1, 14, 0, 4) }, - { HV_SYS_REG_DBGBCR1_EL1, HVF_SYSREG(0, 1, 14, 0, 5) }, - { HV_SYS_REG_DBGWVR1_EL1, HVF_SYSREG(0, 1, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR1_EL1, HVF_SYSREG(0, 1, 14, 0, 7) }, - - { HV_SYS_REG_DBGBVR2_EL1, HVF_SYSREG(0, 2, 14, 0, 4) }, - { HV_SYS_REG_DBGBCR2_EL1, HVF_SYSREG(0, 2, 14, 0, 5) }, - { HV_SYS_REG_DBGWVR2_EL1, HVF_SYSREG(0, 2, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR2_EL1, HVF_SYSREG(0, 2, 14, 0, 7) }, - - { HV_SYS_REG_DBGBVR3_EL1, HVF_SYSREG(0, 3, 14, 0, 4) }, - { HV_SYS_REG_DBGBCR3_EL1, HVF_SYSREG(0, 3, 14, 0, 5) }, - { HV_SYS_REG_DBGWVR3_EL1, HVF_SYSREG(0, 3, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR3_EL1, HVF_SYSREG(0, 3, 14, 0, 7) }, - - { HV_SYS_REG_DBGBVR4_EL1, HVF_SYSREG(0, 4, 14, 0, 4) }, - { HV_SYS_REG_DBGBCR4_EL1, HVF_SYSREG(0, 4, 14, 0, 5) }, - { HV_SYS_REG_DBGWVR4_EL1, HVF_SYSREG(0, 4, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR4_EL1, HVF_SYSREG(0, 4, 14, 0, 7) }, - - { HV_SYS_REG_DBGBVR5_EL1, HVF_SYSREG(0, 5, 14, 0, 4) }, - { HV_SYS_REG_DBGBCR5_EL1, HVF_SYSREG(0, 5, 14, 0, 5) }, - { HV_SYS_REG_DBGWVR5_EL1, HVF_SYSREG(0, 5, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR5_EL1, HVF_SYSREG(0, 5, 14, 0, 7) }, - - { HV_SYS_REG_DBGBVR6_EL1, HVF_SYSREG(0, 6, 14, 0, 4) }, - { HV_SYS_REG_DBGBCR6_EL1, HVF_SYSREG(0, 6, 14, 0, 5) }, - { HV_SYS_REG_DBGWVR6_EL1, HVF_SYSREG(0, 6, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR6_EL1, HVF_SYSREG(0, 6, 14, 0, 7) }, - - { HV_SYS_REG_DBGBVR7_EL1, HVF_SYSREG(0, 7, 14, 0, 4) }, - { HV_SYS_REG_DBGBCR7_EL1, HVF_SYSREG(0, 7, 14, 0, 5) }, - { HV_SYS_REG_DBGWVR7_EL1, HVF_SYSREG(0, 7, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR7_EL1, HVF_SYSREG(0, 7, 14, 0, 7) }, - - { HV_SYS_REG_DBGBVR8_EL1, HVF_SYSREG(0, 8, 14, 0, 4) }, - { HV_SYS_REG_DBGBCR8_EL1, HVF_SYSREG(0, 8, 14, 0, 5) }, - { HV_SYS_REG_DBGWVR8_EL1, HVF_SYSREG(0, 8, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR8_EL1, HVF_SYSREG(0, 8, 14, 0, 7) }, - - { HV_SYS_REG_DBGBVR9_EL1, HVF_SYSREG(0, 9, 14, 0, 4) }, - { HV_SYS_REG_DBGBCR9_EL1, HVF_SYSREG(0, 9, 14, 0, 5) }, - { HV_SYS_REG_DBGWVR9_EL1, HVF_SYSREG(0, 9, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR9_EL1, HVF_SYSREG(0, 9, 14, 0, 7) }, - - { HV_SYS_REG_DBGBVR10_EL1, HVF_SYSREG(0, 10, 14, 0, 4) }, - { HV_SYS_REG_DBGBCR10_EL1, HVF_SYSREG(0, 10, 14, 0, 5) }, - { HV_SYS_REG_DBGWVR10_EL1, HVF_SYSREG(0, 10, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR10_EL1, HVF_SYSREG(0, 10, 14, 0, 7) }, - - { HV_SYS_REG_DBGBVR11_EL1, HVF_SYSREG(0, 11, 14, 0, 4) }, - { HV_SYS_REG_DBGBCR11_EL1, HVF_SYSREG(0, 11, 14, 0, 5) }, - { 
HV_SYS_REG_DBGWVR11_EL1, HVF_SYSREG(0, 11, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR11_EL1, HVF_SYSREG(0, 11, 14, 0, 7) }, - - { HV_SYS_REG_DBGBVR12_EL1, HVF_SYSREG(0, 12, 14, 0, 4) }, - { HV_SYS_REG_DBGBCR12_EL1, HVF_SYSREG(0, 12, 14, 0, 5) }, - { HV_SYS_REG_DBGWVR12_EL1, HVF_SYSREG(0, 12, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR12_EL1, HVF_SYSREG(0, 12, 14, 0, 7) }, - - { HV_SYS_REG_DBGBVR13_EL1, HVF_SYSREG(0, 13, 14, 0, 4) }, - { HV_SYS_REG_DBGBCR13_EL1, HVF_SYSREG(0, 13, 14, 0, 5) }, - { HV_SYS_REG_DBGWVR13_EL1, HVF_SYSREG(0, 13, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR13_EL1, HVF_SYSREG(0, 13, 14, 0, 7) }, - - { HV_SYS_REG_DBGBVR14_EL1, HVF_SYSREG(0, 14, 14, 0, 4) }, - { HV_SYS_REG_DBGBCR14_EL1, HVF_SYSREG(0, 14, 14, 0, 5) }, - { HV_SYS_REG_DBGWVR14_EL1, HVF_SYSREG(0, 14, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR14_EL1, HVF_SYSREG(0, 14, 14, 0, 7) }, - - { HV_SYS_REG_DBGBVR15_EL1, HVF_SYSREG(0, 15, 14, 0, 4) }, - { HV_SYS_REG_DBGBCR15_EL1, HVF_SYSREG(0, 15, 14, 0, 5) }, - { HV_SYS_REG_DBGWVR15_EL1, HVF_SYSREG(0, 15, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR15_EL1, HVF_SYSREG(0, 15, 14, 0, 7) }, + { HV_SYS_REG_DBGBVR0_EL1, HVF_SYSREG(0, 0, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR0_EL1, HVF_SYSREG(0, 0, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR0_EL1, HVF_SYSREG(0, 0, 2, 0, 6) }, + { HV_SYS_REG_DBGWCR0_EL1, HVF_SYSREG(0, 0, 2, 0, 7) }, + + { HV_SYS_REG_DBGBVR1_EL1, HVF_SYSREG(0, 1, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR1_EL1, HVF_SYSREG(0, 1, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR1_EL1, HVF_SYSREG(0, 1, 2, 0, 6) }, + { HV_SYS_REG_DBGWCR1_EL1, HVF_SYSREG(0, 1, 2, 0, 7) }, + + { HV_SYS_REG_DBGBVR2_EL1, HVF_SYSREG(0, 2, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR2_EL1, HVF_SYSREG(0, 2, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR2_EL1, HVF_SYSREG(0, 2, 2, 0, 6) }, + { HV_SYS_REG_DBGWCR2_EL1, HVF_SYSREG(0, 2, 2, 0, 7) }, + + { HV_SYS_REG_DBGBVR3_EL1, HVF_SYSREG(0, 3, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR3_EL1, HVF_SYSREG(0, 3, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR3_EL1, HVF_SYSREG(0, 3, 2, 0, 6) }, + { HV_SYS_REG_DBGWCR3_EL1, HVF_SYSREG(0, 3, 2, 0, 7) }, + + { HV_SYS_REG_DBGBVR4_EL1, HVF_SYSREG(0, 4, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR4_EL1, HVF_SYSREG(0, 4, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR4_EL1, HVF_SYSREG(0, 4, 2, 0, 6) }, + { HV_SYS_REG_DBGWCR4_EL1, HVF_SYSREG(0, 4, 2, 0, 7) }, + + { HV_SYS_REG_DBGBVR5_EL1, HVF_SYSREG(0, 5, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR5_EL1, HVF_SYSREG(0, 5, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR5_EL1, HVF_SYSREG(0, 5, 2, 0, 6) }, + { HV_SYS_REG_DBGWCR5_EL1, HVF_SYSREG(0, 5, 2, 0, 7) }, + + { HV_SYS_REG_DBGBVR6_EL1, HVF_SYSREG(0, 6, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR6_EL1, HVF_SYSREG(0, 6, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR6_EL1, HVF_SYSREG(0, 6, 2, 0, 6) }, + { HV_SYS_REG_DBGWCR6_EL1, HVF_SYSREG(0, 6, 2, 0, 7) }, + + { HV_SYS_REG_DBGBVR7_EL1, HVF_SYSREG(0, 7, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR7_EL1, HVF_SYSREG(0, 7, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR7_EL1, HVF_SYSREG(0, 7, 2, 0, 6) }, + { HV_SYS_REG_DBGWCR7_EL1, HVF_SYSREG(0, 7, 2, 0, 7) }, + + { HV_SYS_REG_DBGBVR8_EL1, HVF_SYSREG(0, 8, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR8_EL1, HVF_SYSREG(0, 8, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR8_EL1, HVF_SYSREG(0, 8, 2, 0, 6) }, + { HV_SYS_REG_DBGWCR8_EL1, HVF_SYSREG(0, 8, 2, 0, 7) }, + + { HV_SYS_REG_DBGBVR9_EL1, HVF_SYSREG(0, 9, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR9_EL1, HVF_SYSREG(0, 9, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR9_EL1, HVF_SYSREG(0, 9, 2, 0, 6) }, + { HV_SYS_REG_DBGWCR9_EL1, HVF_SYSREG(0, 9, 2, 0, 7) }, + + { HV_SYS_REG_DBGBVR10_EL1, HVF_SYSREG(0, 10, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR10_EL1, HVF_SYSREG(0, 10, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR10_EL1, HVF_SYSREG(0, 10, 2, 0, 
6) }, + { HV_SYS_REG_DBGWCR10_EL1, HVF_SYSREG(0, 10, 2, 0, 7) }, + + { HV_SYS_REG_DBGBVR11_EL1, HVF_SYSREG(0, 11, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR11_EL1, HVF_SYSREG(0, 11, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR11_EL1, HVF_SYSREG(0, 11, 2, 0, 6) }, + { HV_SYS_REG_DBGWCR11_EL1, HVF_SYSREG(0, 11, 2, 0, 7) }, + + { HV_SYS_REG_DBGBVR12_EL1, HVF_SYSREG(0, 12, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR12_EL1, HVF_SYSREG(0, 12, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR12_EL1, HVF_SYSREG(0, 12, 2, 0, 6) }, + { HV_SYS_REG_DBGWCR12_EL1, HVF_SYSREG(0, 12, 2, 0, 7) }, + + { HV_SYS_REG_DBGBVR13_EL1, HVF_SYSREG(0, 13, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR13_EL1, HVF_SYSREG(0, 13, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR13_EL1, HVF_SYSREG(0, 13, 2, 0, 6) }, + { HV_SYS_REG_DBGWCR13_EL1, HVF_SYSREG(0, 13, 2, 0, 7) }, + + { HV_SYS_REG_DBGBVR14_EL1, HVF_SYSREG(0, 14, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR14_EL1, HVF_SYSREG(0, 14, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR14_EL1, HVF_SYSREG(0, 14, 2, 0, 6) }, + { HV_SYS_REG_DBGWCR14_EL1, HVF_SYSREG(0, 14, 2, 0, 7) }, + + { HV_SYS_REG_DBGBVR15_EL1, HVF_SYSREG(0, 15, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR15_EL1, HVF_SYSREG(0, 15, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR15_EL1, HVF_SYSREG(0, 15, 2, 0, 6) }, + { HV_SYS_REG_DBGWCR15_EL1, HVF_SYSREG(0, 15, 2, 0, 7) }, #ifdef SYNC_NO_RAW_REGS /* @@ -482,7 +482,7 @@ static struct hvf_sreg_match hvf_sreg_match[] = { { HV_SYS_REG_MPIDR_EL1, HVF_SYSREG(0, 0, 3, 0, 5) }, { HV_SYS_REG_ID_AA64PFR0_EL1, HVF_SYSREG(0, 4, 3, 0, 0) }, #endif - { HV_SYS_REG_ID_AA64PFR1_EL1, HVF_SYSREG(0, 4, 3, 0, 2) }, + { HV_SYS_REG_ID_AA64PFR1_EL1, HVF_SYSREG(0, 4, 3, 0, 1) }, { HV_SYS_REG_ID_AA64DFR0_EL1, HVF_SYSREG(0, 5, 3, 0, 0) }, { HV_SYS_REG_ID_AA64DFR1_EL1, HVF_SYSREG(0, 5, 3, 0, 1) }, { HV_SYS_REG_ID_AA64ISAR0_EL1, HVF_SYSREG(0, 6, 3, 0, 0) }, -- Gitee From 34fc72b12cc4887cb2b551b171f6a76c860b6997 Mon Sep 17 00:00:00 2001 From: Yuquan Wang Date: Sun, 7 Apr 2024 16:35:39 +0800 Subject: [PATCH 522/939] qemu-options: Fix CXL Fixed Memory Window interleave-granularity typo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the unit typo of interleave-granularity of CXL Fixed Memory Window in qemu-option.hx. Fixes: 03b39fcf64 ("hw/cxl: Make the CFMW a machine parameter.") Signed-off-by: Yuquan Wang wangyuquan1236@phytium.com.cn Message-ID: <20240407083539.1488172-2-wangyuquan1236@phytium.com.cn> [PMD: Reworded] Signed-off-by: Philippe Mathieu-Daudé (cherry picked from commit aa88f99c87c0e5d195d6d96190374650553ea61f) Signed-off-by: zhujun2 --- qemu-options.hx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/qemu-options.hx b/qemu-options.hx index 9829b1020a..4df4dcea21 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -149,14 +149,14 @@ SRST platform and configuration dependent. ``interleave-granularity=granularity`` sets the granularity of - interleave. Default 256KiB. Only 256KiB, 512KiB, 1024KiB, 2048KiB - 4096KiB, 8192KiB and 16384KiB granularities supported. + interleave. Default 256 (bytes). Only 256, 512, 1k, 2k, + 4k, 8k and 16k granularities supported. 
Example: :: - -machine cxl-fmw.0.targets.0=cxl.0,cxl-fmw.0.targets.1=cxl.1,cxl-fmw.0.size=128G,cxl-fmw.0.interleave-granularity=512k + -machine cxl-fmw.0.targets.0=cxl.0,cxl-fmw.0.targets.1=cxl.1,cxl-fmw.0.size=128G,cxl-fmw.0.interleave-granularity=512 ERST DEF("M", HAS_ARG, QEMU_OPTION_M, -- Gitee From a8a621a06d54b987502d277f33021547d00fd133 Mon Sep 17 00:00:00 2001 From: Keith Packard Date: Wed, 2 Aug 2023 20:52:31 -0700 Subject: [PATCH 523/939] target/m68k: Map FPU exceptions to FPSR register Add helpers for reading/writing the 68881 FPSR register so that changes in floating point exception state can be seen by the application. Call these helpers in pre_load/post_load hooks to synchronize exception state. Signed-off-by: Keith Packard Reviewed-by: Richard Henderson Message-Id: <20230803035231.429697-1-keithp@keithp.com> Signed-off-by: Richard Henderson (cherry picked from commit 5888357942da1fd5a50efb6e4a6af8b1a27a5af8) Signed-off-by: zhujun2 --- target/m68k/cpu.c | 12 +++++-- target/m68k/cpu.h | 3 +- target/m68k/fpu_helper.c | 72 ++++++++++++++++++++++++++++++++++++++++ target/m68k/helper.c | 4 +-- target/m68k/helper.h | 2 ++ target/m68k/translate.c | 4 +-- 6 files changed, 90 insertions(+), 7 deletions(-) diff --git a/target/m68k/cpu.c b/target/m68k/cpu.c index 11c7e0a790..d95deaafcd 100644 --- a/target/m68k/cpu.c +++ b/target/m68k/cpu.c @@ -396,12 +396,19 @@ static const VMStateDescription vmstate_freg = { } }; -static int fpu_post_load(void *opaque, int version) +static int fpu_pre_save(void *opaque) { M68kCPU *s = opaque; - cpu_m68k_restore_fp_status(&s->env); + s->env.fpsr = cpu_m68k_get_fpsr(&s->env); + return 0; +} + +static int fpu_post_load(void *opaque, int version) +{ + M68kCPU *s = opaque; + cpu_m68k_set_fpsr(&s->env, s->env.fpsr); return 0; } @@ -410,6 +417,7 @@ const VMStateDescription vmmstate_fpu = { .version_id = 1, .minimum_version_id = 1, .needed = fpu_needed, + .pre_save = fpu_pre_save, .post_load = fpu_post_load, .fields = (VMStateField[]) { VMSTATE_UINT32(env.fpcr, M68kCPU), diff --git a/target/m68k/cpu.h b/target/m68k/cpu.h index 6cfc696d2b..4d78da9d5f 100644 --- a/target/m68k/cpu.h +++ b/target/m68k/cpu.h @@ -199,7 +199,8 @@ void cpu_m68k_set_ccr(CPUM68KState *env, uint32_t); void cpu_m68k_set_sr(CPUM68KState *env, uint32_t); void cpu_m68k_restore_fp_status(CPUM68KState *env); void cpu_m68k_set_fpcr(CPUM68KState *env, uint32_t val); - +uint32_t cpu_m68k_get_fpsr(CPUM68KState *env); +void cpu_m68k_set_fpsr(CPUM68KState *env, uint32_t val); /* * Instead of computing the condition codes after each m68k instruction, diff --git a/target/m68k/fpu_helper.c b/target/m68k/fpu_helper.c index ab120b5f59..8314791f50 100644 --- a/target/m68k/fpu_helper.c +++ b/target/m68k/fpu_helper.c @@ -164,6 +164,78 @@ void HELPER(set_fpcr)(CPUM68KState *env, uint32_t val) cpu_m68k_set_fpcr(env, val); } +/* Convert host exception flags to cpu_m68k form. */ +static int cpu_m68k_exceptbits_from_host(int host_bits) +{ + int target_bits = 0; + + if (host_bits & float_flag_invalid) { + target_bits |= 0x80; + } + if (host_bits & float_flag_overflow) { + target_bits |= 0x40; + } + if (host_bits & (float_flag_underflow | float_flag_output_denormal)) { + target_bits |= 0x20; + } + if (host_bits & float_flag_divbyzero) { + target_bits |= 0x10; + } + if (host_bits & float_flag_inexact) { + target_bits |= 0x08; + } + return target_bits; +} + +/* Convert cpu_m68k exception flags to target form. 
*/ +static int cpu_m68k_exceptbits_to_host(int target_bits) +{ + int host_bits = 0; + + if (target_bits & 0x80) { + host_bits |= float_flag_invalid; + } + if (target_bits & 0x40) { + host_bits |= float_flag_overflow; + } + if (target_bits & 0x20) { + host_bits |= float_flag_underflow; + } + if (target_bits & 0x10) { + host_bits |= float_flag_divbyzero; + } + if (target_bits & 0x08) { + host_bits |= float_flag_inexact; + } + return host_bits; +} + +uint32_t cpu_m68k_get_fpsr(CPUM68KState *env) +{ + int host_flags = get_float_exception_flags(&env->fp_status); + int target_flags = cpu_m68k_exceptbits_from_host(host_flags); + int except = (env->fpsr & ~(0xf8)) | target_flags; + return except; +} + +uint32_t HELPER(get_fpsr)(CPUM68KState *env) +{ + return cpu_m68k_get_fpsr(env); +} + +void cpu_m68k_set_fpsr(CPUM68KState *env, uint32_t val) +{ + env->fpsr = val; + + int host_flags = cpu_m68k_exceptbits_to_host((int) env->fpsr); + set_float_exception_flags(host_flags, &env->fp_status); +} + +void HELPER(set_fpsr)(CPUM68KState *env, uint32_t val) +{ + cpu_m68k_set_fpsr(env, val); +} + #define PREC_BEGIN(prec) \ do { \ FloatX80RoundPrec old = \ diff --git a/target/m68k/helper.c b/target/m68k/helper.c index 0a1544cd68..beab4b96bc 100644 --- a/target/m68k/helper.c +++ b/target/m68k/helper.c @@ -118,7 +118,7 @@ static int m68k_fpu_gdb_get_reg(CPUM68KState *env, GByteArray *mem_buf, int n) case 8: /* fpcontrol */ return gdb_get_reg32(mem_buf, env->fpcr); case 9: /* fpstatus */ - return gdb_get_reg32(mem_buf, env->fpsr); + return gdb_get_reg32(mem_buf, cpu_m68k_get_fpsr(env)); case 10: /* fpiar, not implemented */ return gdb_get_reg32(mem_buf, 0); } @@ -137,7 +137,7 @@ static int m68k_fpu_gdb_set_reg(CPUM68KState *env, uint8_t *mem_buf, int n) cpu_m68k_set_fpcr(env, ldl_p(mem_buf)); return 4; case 9: /* fpstatus */ - env->fpsr = ldl_p(mem_buf); + cpu_m68k_set_fpsr(env, ldl_p(mem_buf)); return 4; case 10: /* fpiar, not implemented */ return 4; diff --git a/target/m68k/helper.h b/target/m68k/helper.h index 2bbe0dc032..95aa5e53bb 100644 --- a/target/m68k/helper.h +++ b/target/m68k/helper.h @@ -54,6 +54,8 @@ DEF_HELPER_4(fsdiv, void, env, fp, fp, fp) DEF_HELPER_4(fddiv, void, env, fp, fp, fp) DEF_HELPER_4(fsgldiv, void, env, fp, fp, fp) DEF_HELPER_FLAGS_3(fcmp, TCG_CALL_NO_RWG, void, env, fp, fp) +DEF_HELPER_2(set_fpsr, void, env, i32) +DEF_HELPER_1(get_fpsr, i32, env) DEF_HELPER_FLAGS_2(set_fpcr, TCG_CALL_NO_RWG, void, env, i32) DEF_HELPER_FLAGS_2(ftst, TCG_CALL_NO_RWG, void, env, fp) DEF_HELPER_3(fconst, void, env, fp, i32) diff --git a/target/m68k/translate.c b/target/m68k/translate.c index 4a0b0b2703..f8eeb70379 100644 --- a/target/m68k/translate.c +++ b/target/m68k/translate.c @@ -4686,7 +4686,7 @@ static void gen_load_fcr(DisasContext *s, TCGv res, int reg) tcg_gen_movi_i32(res, 0); break; case M68K_FPSR: - tcg_gen_ld_i32(res, tcg_env, offsetof(CPUM68KState, fpsr)); + gen_helper_get_fpsr(res, tcg_env); break; case M68K_FPCR: tcg_gen_ld_i32(res, tcg_env, offsetof(CPUM68KState, fpcr)); @@ -4700,7 +4700,7 @@ static void gen_store_fcr(DisasContext *s, TCGv val, int reg) case M68K_FPIAR: break; case M68K_FPSR: - tcg_gen_st_i32(val, tcg_env, offsetof(CPUM68KState, fpsr)); + gen_helper_set_fpsr(tcg_env, val); break; case M68K_FPCR: gen_helper_set_fpcr(tcg_env, val); -- Gitee From 254c67a88ab54fdfe1eb55d7efaf4386a9597cd0 Mon Sep 17 00:00:00 2001 From: tangzhongrui Date: Sat, 16 Nov 2024 17:38:50 +0800 Subject: [PATCH 524/939] migration: fix-possible-int-overflow stat64_add() takes uint64_t as 2nd argument, 
but both "p->next_packet_size" and "p->packet_len" are uint32_t. Thus, theyr sum may overflow uint32_t. Found by Linux Verification Center (linuxtesting.org) with SVACE. Signed-off-by: Dmitry Frolov Link: https://lore.kernel.org/r/20241113140509.325732-2-frolov@swemel.ru Signed-off-by: Peter Xu Signed-off-by: Zhongrui Tang --- migration/multifd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/migration/multifd.c b/migration/multifd.c index 7d373a245e..f3bf6888c0 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -735,7 +735,7 @@ static void *multifd_send_thread(void *opaque) } stat64_add(&mig_stats.multifd_bytes, - p->next_packet_size + p->packet_len); + (uint64_t)p->next_packet_size + p->packet_len); p->next_packet_size = 0; qemu_mutex_lock(&p->mutex); p->pending_job--; -- Gitee From d0b24cfdeb8bd64fa55154d79574352be33ecc51 Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Fri, 15 Nov 2024 17:25:15 +0000 Subject: [PATCH 525/939] tcg: Allow top bit of SIMD_DATA_BITS to be set in simd_desc() In simd_desc() we create a SIMD descriptor from various pieces including an arbitrary data value from the caller. We try to sanitize these to make sure everything will fit: the 'data' value needs to fit in the SIMD_DATA_BITS (== 22) sized field. However we do that sanitizing with: tcg_debug_assert(data == sextract32(data, 0, SIMD_DATA_BITS)); This works for the case where the data is supposed to be considered as a signed integer (which can then be returned via simd_data()). However, some callers want to treat the data value as unsigned. Specifically, for the Arm SVE operations, make_svemte_desc() assembles a data value as a collection of fields, and it needs to use all 22 bits. Currently if MTE is enabled then its MTEDESC SIZEM1 field may have the most significant bit set, and then it will trip this assertion. Loosen the assertion so that we only check that the data value will fit into the field in some way, either as a signed or as an unsigned value. This means we will fail to detect some kinds of bug in the callers, but we won't spuriously assert for intentional use of the data field as unsigned. Cc: qemu-stable@nongnu.org Fixes: db432672dc50e ("tcg: Add generic vector expanders") Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2601 Signed-off-by: Peter Maydell Message-ID: <20241115172515.1229393-1-peter.maydell@linaro.org> Reviewed-by: Richard Henderson Signed-off-by: Richard Henderson Signed-off-by: Zhongrui Tang --- tcg/tcg-op-gvec.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c index bb88943f79..733b44f105 100644 --- a/tcg/tcg-op-gvec.c +++ b/tcg/tcg-op-gvec.c @@ -88,7 +88,20 @@ uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data) uint32_t desc = 0; check_size_align(oprsz, maxsz, 0); - tcg_debug_assert(data == sextract32(data, 0, SIMD_DATA_BITS)); + + /* + * We want to check that 'data' will fit into SIMD_DATA_BITS. + * However, some callers want to treat the data as a signed + * value (which they can later get back with simd_data()) + * and some want to treat it as an unsigned value. + * So here we assert only that the data will fit into the + * field in at least one way. This means that some invalid + * values from the caller will not be detected, e.g. if the + * caller wants to handle the value as a signed integer but + * incorrectly passes us 1 << (SIMD_DATA_BITS - 1). 
+ */ + tcg_debug_assert(data == sextract32(data, 0, SIMD_DATA_BITS) || + data == extract32(data, 0, SIMD_DATA_BITS)); oprsz = (oprsz / 8) - 1; maxsz = (maxsz / 8) - 1; -- Gitee From ad5b05def5521a9cbbdd750c915fccaba391f53b Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Tue, 12 Nov 2024 11:32:01 -0800 Subject: [PATCH 526/939] linux-user: Honor elf alignment when placing images Most binaries don't actually depend on more than page alignment, but any binary can request it. Not honoring this was a bug. This became obvious when gdb reported Failed to read a valid object file image from memory when examining some vdso which are marked as needing more than page alignment. Signed-off-by: Richard Henderson Signed-off-by: Zhongrui Tang --- linux-user/elfload.c | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/linux-user/elfload.c b/linux-user/elfload.c index cf9e74468b..2a82468079 100644 --- a/linux-user/elfload.c +++ b/linux-user/elfload.c @@ -3263,7 +3263,8 @@ static void load_elf_image(const char *image_name, const ImageSource *src, char **pinterp_name) { g_autofree struct elf_phdr *phdr = NULL; - abi_ulong load_addr, load_bias, loaddr, hiaddr, error; + abi_ulong load_addr, load_bias, loaddr, hiaddr, error, align; + size_t reserve_size, align_size; int i, prot_exec; Error *err = NULL; @@ -3347,6 +3348,9 @@ static void load_elf_image(const char *image_name, const ImageSource *src, load_addr = loaddr; + align = pow2ceil(info->alignment); + info->alignment = align; + if (pinterp_name != NULL) { if (ehdr->e_type == ET_EXEC) { /* @@ -3355,8 +3359,6 @@ static void load_elf_image(const char *image_name, const ImageSource *src, */ probe_guest_base(image_name, loaddr, hiaddr); } else { - abi_ulong align; - /* * The binary is dynamic, but we still need to * select guest_base. In this case we pass a size. @@ -3374,10 +3376,7 @@ static void load_elf_image(const char *image_name, const ImageSource *src, * Since we do not have complete control over the guest * address space, we prefer the kernel to choose some address * rather than force the use of LOAD_ADDR via MAP_FIXED. - * But without MAP_FIXED we cannot guarantee alignment, - * only suggest it. */ - align = pow2ceil(info->alignment); if (align) { load_addr &= -align; } @@ -3401,13 +3400,35 @@ static void load_elf_image(const char *image_name, const ImageSource *src, * In both cases, we will overwrite pages in this range with mappings * from the executable. */ - load_addr = target_mmap(load_addr, (size_t)hiaddr - loaddr + 1, PROT_NONE, + reserve_size = (size_t)hiaddr - loaddr + 1; + align_size = reserve_size; + + if (ehdr->e_type != ET_EXEC && align > qemu_real_host_page_size()) { + align_size += align - 1; + } + + load_addr = target_mmap(load_addr, align_size, PROT_NONE, MAP_PRIVATE | MAP_ANON | MAP_NORESERVE | (ehdr->e_type == ET_EXEC ? 
MAP_FIXED_NOREPLACE : 0), -1, 0); if (load_addr == -1) { goto exit_mmap; } + + if (align_size != reserve_size) { + abi_ulong align_addr = ROUND_UP(load_addr, align); + abi_ulong align_end = align_addr + reserve_size; + abi_ulong load_end = load_addr + align_size; + + if (align_addr != load_addr) { + target_munmap(load_addr, align_addr - load_addr); + } + if (align_end != load_end) { + target_munmap(align_end, load_end - align_end); + } + load_addr = align_addr; + } + load_bias = load_addr - loaddr; if (elf_is_fdpic(ehdr)) { -- Gitee From b611bd7f3f4525c8373f2e504594414e1ed5b058 Mon Sep 17 00:00:00 2001 From: guping Date: Mon, 18 Nov 2024 02:50:17 +0000 Subject: [PATCH 527/939] accel/tcg: Fix user-only probe_access_internal plugin check cherry-pick from 2a339fee450638b512c5122281cb5ab49331cfb8 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The acc_flag check for write should have been against PAGE_WRITE_ORG, not PAGE_WRITE. But it is better to combine two acc_flag checks to a single check against access_type. This matches the system code in cputlb.c. Cc: qemu-stable@nongnu.org Resolves: #2647 Signed-off-by: default avatarRichard Henderson Message-Id: 20241111145002.144995-1-richard.henderson@linaro.org Reviewed-by: default avatarAlex Bennée Signed-off-by: guping --- accel/tcg/user-exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c index 68b252cb8e..e87848a5e2 100644 --- a/accel/tcg/user-exec.c +++ b/accel/tcg/user-exec.c @@ -794,7 +794,7 @@ static int probe_access_internal(CPUArchState *env, vaddr addr, if (guest_addr_valid_untagged(addr)) { int page_flags = page_get_flags(addr); if (page_flags & acc_flag) { - if ((acc_flag == PAGE_READ || acc_flag == PAGE_WRITE) + if (access_type != MMU_INST_FETCH && cpu_plugin_mem_cbs_enabled(env_cpu(env))) { return TLB_MMIO; } -- Gitee From 6d4db685ae8b4cbffab80c61c01ef56c57b67eb4 Mon Sep 17 00:00:00 2001 From: guping Date: Mon, 18 Nov 2024 03:09:59 +0000 Subject: [PATCH 528/939] linux-user: Tolerate CONFIG_LSM_MMAP_MIN_ADDR cherry-pick from fb7f3572b111ffb6c2dd2c7f6c5b4dc57dd8a3f5 Running qemu-i386 on a system running with SELinux in enforcing mode (more precisely: s390x trixie container on Fedora 40) fails with: qemu-i386: tests/tcg/i386-linux-user/sigreturn-sigmask: Unable to find a guest_base to satisfy all guest address mapping requirements 00000000-ffffffff The reason is that main() determines mmap_min_addr from /proc/sys/vm/mmap_min_addr, but SELinux additionally defines CONFIG_LSM_MMAP_MIN_ADDR, which is normally larger: 32K or 64K, but, in general, can be anything. There is no portable way to query its value: /boot/config, /proc/config and /proc/config.gz are distro- and environment-specific. Once the identity map fails, the magnitude of guest_base does not matter, so fix by starting the search from 1M or 1G. 
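For illustration only (not part of this patch, and the helper name is made up): a minimal sketch of how the DAC limit is read from /proc/sys/vm/mmap_min_addr. An LSM such as SELinux can enforce a larger CONFIG_LSM_MMAP_MIN_ADDR that is not reflected in this file, which is why the probe now starts from 1 MiB or 1 GiB rather than from this value.

    #include <stdio.h>

    /* Hypothetical helper: returns the DAC mmap_min_addr, or 0 on error.
     * An LSM (e.g. SELinux) may enforce a larger minimum that this file
     * does not reflect. */
    static unsigned long read_dac_mmap_min_addr(void)
    {
        unsigned long val = 0;
        FILE *f = fopen("/proc/sys/vm/mmap_min_addr", "r");

        if (f) {
            if (fscanf(f, "%lu", &val) != 1) {
                val = 0;
            }
            fclose(f);
        }
        return val;
    }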
Cc: qemu-stable@nongnu.org Resolves: #2598 Suggested-by: default avatarRichard Henderson Signed-off-by: default avatarIlya Leoshkevich Message-ID: <20241023002558.34589-1-iii@linux.ibm.com> Signed-off-by: default avatarRichard Henderson Signed-off-by: guping --- linux-user/elfload.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/linux-user/elfload.c b/linux-user/elfload.c index cf9e74468b..0df64c6442 100644 --- a/linux-user/elfload.c +++ b/linux-user/elfload.c @@ -2980,7 +2980,7 @@ static uintptr_t pgb_try_itree(const PGBAddrs *ga, uintptr_t base, static uintptr_t pgb_find_itree(const PGBAddrs *ga, IntervalTreeRoot *root, uintptr_t align, uintptr_t brk) { - uintptr_t last = mmap_min_addr; + uintptr_t last = sizeof(uintptr_t) == 4 ? MiB : GiB; uintptr_t base, skip; while (true) { -- Gitee From 1f6dde2350209e937a5676c6775d1500136caea2 Mon Sep 17 00:00:00 2001 From: gubin Date: Mon, 18 Nov 2024 13:48:37 +0800 Subject: [PATCH 529/939] acpi/tests/avocado/bits: wait for 200 seconds for SHUTDOWN event from bits VM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cheery-pick from 7ef4c41e91d59d72a3b8bc022a6cb3e81787a50a By default, the timeout to receive any specified event from the QEMU VM is 60 seconds set by the python avocado test framework. Please see event_wait() and events_wait() in python/qemu/machine/machine.py. If the matching event is not triggered within that interval, an asyncio.TimeoutError is generated. Since the timeout for the bits avocado test is 200 secs, we need to make event_wait() timeout of the same value as well so that an early timeout is not triggered by the avocado framework. CC: peter.maydell@linaro.org Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2077 Signed-off-by: Ani Sinha Reviewed-by: Daniel P. Berrangé Message-id: 20240117042556.3360190-1-anisinha@redhat.com Signed-off-by: Peter Maydell Signed-off-by: gubin --- tests/avocado/acpi-bits.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/avocado/acpi-bits.py b/tests/avocado/acpi-bits.py index 68b9e98d4e..efe4f52ee0 100644 --- a/tests/avocado/acpi-bits.py +++ b/tests/avocado/acpi-bits.py @@ -54,6 +54,8 @@ deps = ["xorriso", "mformat"] # dependent tools needed in the test setup/box. supported_platforms = ['x86_64'] # supported test platforms. +# default timeout of 120 secs is sometimes not enough for bits test. +BITS_TIMEOUT = 200 def which(tool): """ looks up the full path for @tool, returns None if not found @@ -133,7 +135,7 @@ class AcpiBitsTest(QemuBaseTest): #pylint: disable=too-many-instance-attributes """ # in slower systems the test can take as long as 3 minutes to complete. - timeout = 200 + timeout = BITS_TIMEOUT def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -400,7 +402,8 @@ def test_acpi_smbios_bits(self): # biosbits has been configured to run all the specified test suites # in batch mode and then automatically initiate a vm shutdown. - # Rely on avocado's unit test timeout. - self._vm.event_wait('SHUTDOWN') + # Set timeout to BITS_TIMEOUT for SHUTDOWN event from bits VM at par + # with the avocado test timeout. 
+ self._vm.event_wait('SHUTDOWN', timeout=BITS_TIMEOUT) self._vm.wait(timeout=None) self.parse_log() -- Gitee From b60350d9f495f568aa1380f02a13b51e9619a7de Mon Sep 17 00:00:00 2001 From: gubin Date: Mon, 18 Nov 2024 14:17:52 +0800 Subject: [PATCH 530/939] audio/audio.c: remove trailing newline in error_setg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cherry-pick from 09a36158c283f7448d1b00fdbb6634f05d27f922 error_setg() appends newline to the formatted message. Fixes: cb94ff5f80c5 ("audio: propagate Error * out of audio_init") Signed-off-by: Michael Tokarev Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: gubin --- audio/audio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/audio/audio.c b/audio/audio.c index 8d1e4ad922..7ac74f9e16 100644 --- a/audio/audio.c +++ b/audio/audio.c @@ -1744,7 +1744,7 @@ static AudioState *audio_init(Audiodev *dev, Error **errp) if (driver) { done = !audio_driver_init(s, driver, dev, errp); } else { - error_setg(errp, "Unknown audio driver `%s'\n", drvname); + error_setg(errp, "Unknown audio driver `%s'", drvname); } if (!done) { goto out; -- Gitee From d2ee29691b6d6b48ba8da179e97572f5a6684a9d Mon Sep 17 00:00:00 2001 From: gubin Date: Mon, 18 Nov 2024 14:47:25 +0800 Subject: [PATCH 531/939] Avoid unaligned fetch in ladr_match() cherry-pick from 6a5287ce80470bb8df95901d73ee779a64e70c3a There is no guarantee that the PCNetState is allocated such that csr[8] is allocated on an 8-byte boundary. Since not all hosts are capable of unaligned fetches the 16-bit elements need to be fetched individually to avoid a potential fault. Closes issue #2143 Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2143 Signed-off-by: Nick Briggs Reviewed-by: Peter Maydell Signed-off-by: Jason Wang Signed-off-by: gubin --- hw/net/pcnet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/net/pcnet.c b/hw/net/pcnet.c index a7e123e60d..7d574f487b 100644 --- a/hw/net/pcnet.c +++ b/hw/net/pcnet.c @@ -632,7 +632,7 @@ static inline int ladr_match(PCNetState *s, const uint8_t *buf, int size) { struct qemu_ether_header *hdr = (void *)buf; if ((*(hdr->ether_dhost)&0x01) && - ((uint64_t *)&s->csr[8])[0] != 0LL) { + (s->csr[8] | s->csr[9] | s->csr[10] | s->csr[11]) != 0) { uint8_t ladr[8] = { s->csr[8] & 0xff, s->csr[8] >> 8, s->csr[9] & 0xff, s->csr[9] >> 8, -- Gitee From c5b349f9ff0792cce72cdd1ade2521c568058a25 Mon Sep 17 00:00:00 2001 From: qihao_yewu Date: Mon, 18 Nov 2024 14:20:56 -0500 Subject: [PATCH 532/939] cpu: ensure we don't call start_exclusive from cpu_exec MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cheery-pick from 779f30a01af8566780cefc8639505b758950afb3 Reviewed-by: Richard Henderson Signed-off-by: Pierrick Bouvier Reviewed-by: Philippe Mathieu-Daudé Message-ID: <20241025175857.2554252-3-pierrick.bouvier@linaro.org> Signed-off-by: Richard Henderson Signed-off-by: qihao_yewu --- cpu-common.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpu-common.c b/cpu-common.c index 54e63b3f77..a949ad7ca3 100644 --- a/cpu-common.c +++ b/cpu-common.c @@ -234,6 +234,9 @@ void start_exclusive(void) CPUState *other_cpu; int running_cpus; + /* Ensure we are not running, or start_exclusive will be blocked. 
*/ + g_assert(!current_cpu->running); + if (current_cpu->exclusive_context_count) { current_cpu->exclusive_context_count++; return; -- Gitee From c006b5b78ffe7e6af76cde943a9fdd082473ba55 Mon Sep 17 00:00:00 2001 From: qihao_yewu Date: Mon, 18 Nov 2024 15:45:24 -0500 Subject: [PATCH 533/939] target/i386: Fix minor typo in NO_NESTED_DATA_BP feature bit cheery-pick from 9c882ad4dc96f658ff9f92b88b3749d0398e6fa2 Rename CPUID_8000_0021_EAX_No_NESTED_DATA_BP to CPUID_8000_0021_EAX_NO_NESTED_DATA_BP. No functional change intended. Signed-off-by: Babu Moger Link: https://lore.kernel.org/r/a6749acd125670d3930f4ca31736a91b1d965f2f.1729807947.git.babu.moger@amd.com Signed-off-by: Paolo Bonzini Signed-off-by: qihao_yewu --- target/i386/cpu.c | 2 +- target/i386/cpu.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index ca7e5337b0..c2dc929eaa 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -5063,7 +5063,7 @@ static const X86CPUDefinition builtin_x86_defs[] = { CPUID_8000_0008_EBX_STIBP_ALWAYS_ON | CPUID_8000_0008_EBX_AMD_SSBD | CPUID_8000_0008_EBX_AMD_PSFD, .features[FEAT_8000_0021_EAX] = - CPUID_8000_0021_EAX_No_NESTED_DATA_BP | + CPUID_8000_0021_EAX_NO_NESTED_DATA_BP | CPUID_8000_0021_EAX_LFENCE_ALWAYS_SERIALIZING | CPUID_8000_0021_EAX_NULL_SEL_CLR_BASE | CPUID_8000_0021_EAX_AUTO_IBRS, diff --git a/target/i386/cpu.h b/target/i386/cpu.h index 34f9615b98..6ca185cd9d 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -971,7 +971,7 @@ uint64_t x86_cpu_get_supported_feature_word(FeatureWord w, #define CPUID_8000_0008_EBX_AMD_PSFD (1U << 28) /* Processor ignores nested data breakpoints */ -#define CPUID_8000_0021_EAX_No_NESTED_DATA_BP (1U << 0) +#define CPUID_8000_0021_EAX_NO_NESTED_DATA_BP (1U << 0) /* LFENCE is always serializing */ #define CPUID_8000_0021_EAX_LFENCE_ALWAYS_SERIALIZING (1U << 2) /* Null Selector Clears Base */ -- Gitee From f0be5a2c99d2f893a27839cd5eb5fa74f3ff5564 Mon Sep 17 00:00:00 2001 From: qihao_yewu Date: Mon, 18 Nov 2024 21:03:55 -0500 Subject: [PATCH 534/939] hw/misc/mos6522: Fix bad class definition of the MOS6522 device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cheery-pick from c3d7c18b0d616cf7fb3c1f325503e1462307209d When compiling QEMU with --enable-cfi, the "q800" m68k machine currently crashes very early, when the q800_machine_init() function tries to wire the interrupts of the "via1" device. This happens because TYPE_MOS6522_Q800_VIA1 is supposed to be a proper SysBus device, but its parent (TYPE_MOS6522) has a mistake in its class definition where it is only derived from DeviceClass, and not from SysBusDeviceClass, so we end up in funny memory access issues here. Using the right class hierarchy for the MOS6522 device fixes the problem. 
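As a hedged illustration of the rule being fixed (sketch only, assuming the QEMU tree; the type and struct names are hypothetical, not taken from the patch): a QOM class structure must begin with the class struct of the type named in its .parent field, so a TYPE_SYS_BUS_DEVICE subclass embeds SysBusDeviceClass rather than DeviceClass.

    #include "qemu/osdep.h"
    #include "hw/sysbus.h"

    /* The first member must mirror the .parent declared below. */
    typedef struct MyDeviceClass {
        SysBusDeviceClass parent_class;   /* not DeviceClass */
        void (*port_write)(void *opaque); /* hypothetical hook */
    } MyDeviceClass;

    static const TypeInfo my_device_info = {
        .name       = "x-my-device",      /* hypothetical type name */
        .parent     = TYPE_SYS_BUS_DEVICE,
        .class_size = sizeof(MyDeviceClass),
    };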
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2675 Signed-off-by: Thomas Huth Fixes: 51f233ec92 ("misc: introduce new mos6522 VIA device") Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Mark Cave-Ayland Message-ID: <20241114104653.963812-1-thuth@redhat.com> Signed-off-by: Philippe Mathieu-Daudé Signed-off-by: qihao_yewu --- include/hw/misc/mos6522.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/hw/misc/mos6522.h b/include/hw/misc/mos6522.h index fba45668ab..920871a598 100644 --- a/include/hw/misc/mos6522.h +++ b/include/hw/misc/mos6522.h @@ -154,7 +154,7 @@ struct MOS6522State { OBJECT_DECLARE_TYPE(MOS6522State, MOS6522DeviceClass, MOS6522) struct MOS6522DeviceClass { - DeviceClass parent_class; + SysBusDeviceClass parent_class; ResettablePhases parent_phases; void (*portB_write)(MOS6522State *dev); -- Gitee From bdd1d8b5aea219c7ec1fb590430e3c8e99f43700 Mon Sep 17 00:00:00 2001 From: qihao_yewu Date: Mon, 18 Nov 2024 21:37:32 -0500 Subject: [PATCH 535/939] usb-hub: Fix handling port power control messages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cheery-pick from b2cc69997924b651c0c6f4037782e25f2e438715 The ClearPortFeature control message fails for PORT_POWER because there is no break; at the end of the case statement, causing it to fall through to the failure handler. Add the missing break; to solve the problem. Fixes: 1cc403eb21 ("usb-hub: emulate per port power switching") Signed-off-by: Guenter Roeck Reviewed-by: Philippe Mathieu-Daudé Message-ID: <20241112170152.217664-11-linux@roeck-us.net> Signed-off-by: Philippe Mathieu-Daudé Signed-off-by: qihao_yewu --- hw/usb/dev-hub.c | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/usb/dev-hub.c b/hw/usb/dev-hub.c index 5703e0e826..7b3cfa2c1b 100644 --- a/hw/usb/dev-hub.c +++ b/hw/usb/dev-hub.c @@ -479,6 +479,7 @@ static void usb_hub_handle_control(USBDevice *dev, USBPacket *p, usb_hub_port_clear(port, PORT_STAT_SUSPEND); port->wPortChange = 0; } + break; default: goto fail; } -- Gitee From 0d93daee2da62d0e86d99fd561d2a973c9634d1f Mon Sep 17 00:00:00 2001 From: qihao_yewu Date: Mon, 18 Nov 2024 22:32:53 -0500 Subject: [PATCH 536/939] target/riscv: Fix vcompress with rvv_ta_all_1s cheery-pick from c128d39edeff337220fc536a3e935bcba01ecb49 vcompress packs vl or less fields into vd, so the tail starts after the last packed field. 
This could be more clearly expressed in the ISA, but for now this thread helps to explain it: https://github.com/riscv/riscv-v-spec/issues/796 Signed-off-by: Anton Blanchard Reviewed-by: Daniel Henrique Barboza Reviewed-by: Alistair Francis Message-ID: <20241030043538.939712-1-antonb@tenstorrent.com> Signed-off-by: Alistair Francis Signed-off-by: qihao_yewu --- target/riscv/vector_helper.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c index c1c3a4d1ea..42ffd3a68a 100644 --- a/target/riscv/vector_helper.c +++ b/target/riscv/vector_helper.c @@ -5045,7 +5045,7 @@ void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2, \ } \ env->vstart = 0; \ /* set tail elements to 1s */ \ - vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \ + vext_set_elems_1s(vd, vta, num * esz, total_elems * esz); \ } /* Compress into vd elements of vs2 where vs1 is enabled */ -- Gitee From b6bfee023b15f25c1db077df7bfd2e9212cda762 Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Mon, 18 Mar 2024 16:53:36 +0100 Subject: [PATCH 537/939] target/i386: add guest-phys-bits cpu property commit 513ba32dccc659c80722b3a43233b26eaa50309a upstream. Allows to set guest-phys-bits (cpuid leaf 80000008, eax[23:16]) via -cpu $model,guest-phys-bits=$nr. Intel-SIG: commit 513ba32dccc6 target/i386: add guest-phys-bits cpu property Signed-off-by: Gerd Hoffmann Message-ID: <20240318155336.156197-3-kraxel@redhat.com> Reviewed-by: Zhao Liu Signed-off-by: Paolo Bonzini [jz: compatible property for 9.0 machines not included] Signed-off-by: Jason Zeng --- target/i386/cpu.c | 22 ++++++++++++++++++++++ target/i386/cpu.h | 8 ++++++++ 2 files changed, 30 insertions(+) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index ca7e5337b0..93f88b7bf8 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -6827,6 +6827,7 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count, if (env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_LM) { /* 64 bit processor */ *eax |= (cpu_x86_virtual_addr_width(env) << 8); + *eax |= (cpu->guest_phys_bits << 16); } *ebx = env->features[FEAT_8000_0008_EBX]; if (cs->nr_cores * cs->nr_threads > 1) { @@ -7603,6 +7604,14 @@ static void x86_cpu_realizefn(DeviceState *dev, Error **errp) goto out; } + if (cpu->guest_phys_bits == -1) { + /* + * If it was not set by the user, or by the accelerator via + * cpu_exec_realizefn, clear. + */ + cpu->guest_phys_bits = 0; + } + if (cpu->ucode_rev == 0) { /* * The default is the same as KVM's. Note that this check @@ -7653,6 +7662,14 @@ static void x86_cpu_realizefn(DeviceState *dev, Error **errp) if (cpu->phys_bits == 0) { cpu->phys_bits = TCG_PHYS_ADDR_BITS; } + if (cpu->guest_phys_bits && + (cpu->guest_phys_bits > cpu->phys_bits || + cpu->guest_phys_bits < 32)) { + error_setg(errp, "guest-phys-bits should be between 32 and %u " + " (but is %u)", + cpu->phys_bits, cpu->guest_phys_bits); + return; + } } else { /* For 32 bit systems don't use the user set value, but keep * phys_bits consistent with what we tell the guest. 
@@ -7661,6 +7678,10 @@ static void x86_cpu_realizefn(DeviceState *dev, Error **errp) error_setg(errp, "phys-bits is not user-configurable in 32 bit"); return; } + if (cpu->guest_phys_bits != 0) { + error_setg(errp, "guest-phys-bits is not user-configurable in 32 bit"); + return; + } if (env->features[FEAT_1_EDX] & (CPUID_PSE36 | CPUID_PAE)) { cpu->phys_bits = 36; @@ -8167,6 +8188,7 @@ static Property x86_cpu_properties[] = { DEFINE_PROP_BOOL("x-force-features", X86CPU, force_features, false), DEFINE_PROP_BOOL("kvm", X86CPU, expose_kvm, true), DEFINE_PROP_UINT32("phys-bits", X86CPU, phys_bits, 0), + DEFINE_PROP_UINT32("guest-phys-bits", X86CPU, guest_phys_bits, -1), DEFINE_PROP_BOOL("host-phys-bits", X86CPU, host_phys_bits, false), DEFINE_PROP_UINT8("host-phys-bits-limit", X86CPU, host_phys_bits_limit, 0), DEFINE_PROP_BOOL("fill-mtrr-mask", X86CPU, fill_mtrr_mask, false), diff --git a/target/i386/cpu.h b/target/i386/cpu.h index 34f9615b98..d6fdcc04ca 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -2029,6 +2029,14 @@ struct ArchCPU { /* Number of physical address bits supported */ uint32_t phys_bits; + /* + * Number of guest physical address bits available. Usually this is + * identical to host physical address bits. With NPT or EPT 4-level + * paging, guest physical address space might be restricted to 48 bits + * even if the host cpu supports more physical address bits. + */ + uint32_t guest_phys_bits; + /* in order to simplify APIC support, we leave this pointer to the user */ struct DeviceState *apic_state; -- Gitee From a2383a2a0537750794223f21156241b1b1e78d2e Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Mon, 18 Mar 2024 16:53:35 +0100 Subject: [PATCH 538/939] kvm: add support for guest physical bits commit 0d08c423688edcca857f88dab20f1fc56de2b281 upstream. Query kvm for supported guest physical address bits, in cpuid function 80000008, eax[23:16]. Usually this is identical to host physical address bits. With NPT or EPT being used this might be restricted to 48 (max 4-level paging address space size) even if the host cpu supports more physical address bits. When set pass this to the guest, using cpuid too. Guest firmware can use this to figure how big the usable guest physical address space is, so PCI bar mapping are actually reachable. 
Intel-SIG: commit 0d08c423688e kvm: add support for guest physical bits Signed-off-by: Gerd Hoffmann Reviewed-by: Xiaoyao Li Reviewed-by: Zhao Liu Message-ID: <20240318155336.156197-2-kraxel@redhat.com> Signed-off-by: Paolo Bonzini Signed-off-by: Jason Zeng --- target/i386/kvm/kvm-cpu.c | 50 ++++++++++++++++++++++++++++++++------- 1 file changed, 42 insertions(+), 8 deletions(-) diff --git a/target/i386/kvm/kvm-cpu.c b/target/i386/kvm/kvm-cpu.c index 9c791b7b05..f76972e47e 100644 --- a/target/i386/kvm/kvm-cpu.c +++ b/target/i386/kvm/kvm-cpu.c @@ -18,10 +18,32 @@ #include "kvm_i386.h" #include "hw/core/accel-cpu.h" +static void kvm_set_guest_phys_bits(CPUState *cs) +{ + X86CPU *cpu = X86_CPU(cs); + uint32_t eax, guest_phys_bits; + + eax = kvm_arch_get_supported_cpuid(cs->kvm_state, 0x80000008, 0, R_EAX); + guest_phys_bits = (eax >> 16) & 0xff; + if (!guest_phys_bits) { + return; + } + cpu->guest_phys_bits = guest_phys_bits; + if (cpu->guest_phys_bits > cpu->phys_bits) { + cpu->guest_phys_bits = cpu->phys_bits; + } + + if (cpu->host_phys_bits && cpu->host_phys_bits_limit && + cpu->guest_phys_bits > cpu->host_phys_bits_limit) { + cpu->guest_phys_bits = cpu->host_phys_bits_limit; + } +} + static bool kvm_cpu_realizefn(CPUState *cs, Error **errp) { X86CPU *cpu = X86_CPU(cs); CPUX86State *env = &cpu->env; + bool ret; /* * The realize order is important, since x86_cpu_realize() checks if @@ -32,13 +54,15 @@ static bool kvm_cpu_realizefn(CPUState *cs, Error **errp) * * realize order: * - * x86_cpu_realize(): - * -> x86_cpu_expand_features() - * -> cpu_exec_realizefn(): - * -> accel_cpu_common_realize() - * kvm_cpu_realizefn() -> host_cpu_realizefn() - * -> cpu_common_realizefn() - * -> check/update ucode_rev, phys_bits, mwait + * x86_cpu_realizefn(): + * x86_cpu_expand_features() + * cpu_exec_realizefn(): + * accel_cpu_common_realize() + * kvm_cpu_realizefn() + * host_cpu_realizefn() + * kvm_set_guest_phys_bits() + * check/update ucode_rev, phys_bits, guest_phys_bits, mwait + * cpu_common_realizefn() (via xcc->parent_realize) */ if (cpu->max_features) { if (enable_cpu_pm && kvm_has_waitpkg()) { @@ -50,7 +74,17 @@ static bool kvm_cpu_realizefn(CPUState *cs, Error **errp) MSR_IA32_UCODE_REV); } } - return host_cpu_realizefn(cs, errp); + ret = host_cpu_realizefn(cs, errp); + if (!ret) { + return ret; + } + + if ((env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_LM) && + cpu->guest_phys_bits == -1) { + kvm_set_guest_phys_bits(cs); + } + + return true; } static bool lmce_supported(void) -- Gitee From 6060f8cad07a3d2a49795fef19d585a9d205ecef Mon Sep 17 00:00:00 2001 From: Jia Qingtong Date: Tue, 24 Sep 2024 18:24:33 +0800 Subject: [PATCH 539/939] hw/arm/virt:Keep Guest L1 cache type consistent with KVM Linux KVM normalize the cache configuration and expose a fabricated CLIDR_EL1 value to guest, where L1 cache type could be unified or seperate instruction cache and data cache. Let's keep guest L1 cache type consistent with KVM by checking the guest visable CLIDR_EL1, which can avoid abnormal issue in guest when it's probing cache info conbined CLIDR_EL1 with ACPI PPTT and DT. 
Signed-off-by: Yanan Wang Signed-off-by: lishusen --- hw/acpi/aml-build.c | 165 ++--------------------------------- hw/arm/virt-acpi-build.c | 167 ++++++++++++++++++++++++++++++++++++ hw/arm/virt.c | 86 +++++++++++++++---- include/hw/acpi/aml-build.h | 54 ++---------- include/hw/arm/virt.h | 60 +++++++++++++ 5 files changed, 306 insertions(+), 226 deletions(-) diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c index bf9c59f544..0d4994bafe 100644 --- a/hw/acpi/aml-build.c +++ b/hw/acpi/aml-build.c @@ -47,7 +47,7 @@ static void build_prepend_byte(GArray *array, uint8_t val) g_array_prepend_val(array, val); } -static void build_append_byte(GArray *array, uint8_t val) +void build_append_byte(GArray *array, uint8_t val) { g_array_append_val(array, val); } @@ -1990,10 +1990,10 @@ void build_slit(GArray *table_data, BIOSLinker *linker, MachineState *ms, * ACPI spec, Revision 6.3 * 5.2.29.1 Processor hierarchy node structure (Type 0) */ -static void build_processor_hierarchy_node(GArray *tbl, uint32_t flags, - uint32_t parent, uint32_t id, - uint32_t *priv_rsrc, - uint32_t priv_num) +void build_processor_hierarchy_node(GArray *tbl, uint32_t flags, + uint32_t parent, uint32_t id, + uint32_t *priv_rsrc, + uint32_t priv_num) { int i; @@ -2016,161 +2016,6 @@ static void build_processor_hierarchy_node(GArray *tbl, uint32_t flags, } } -/* - * ACPI spec, Revision 6.3 - * 5.2.29.2 Cache Type Structure (Type 1) - */ -static void build_cache_hierarchy_node(GArray *tbl, uint32_t next_level, - uint32_t cache_type) -{ - build_append_byte(tbl, 1); - build_append_byte(tbl, 24); - build_append_int_noprefix(tbl, 0, 2); - build_append_int_noprefix(tbl, 127, 4); - build_append_int_noprefix(tbl, next_level, 4); - - switch (cache_type) { - case ARM_L1D_CACHE: /* L1 dcache info */ - build_append_int_noprefix(tbl, ARM_L1DCACHE_SIZE, 4); - build_append_int_noprefix(tbl, ARM_L1DCACHE_SETS, 4); - build_append_byte(tbl, ARM_L1DCACHE_ASSOCIATIVITY); - build_append_byte(tbl, ARM_L1DCACHE_ATTRIBUTES); - build_append_int_noprefix(tbl, ARM_L1DCACHE_LINE_SIZE, 2); - break; - case ARM_L1I_CACHE: /* L1 icache info */ - build_append_int_noprefix(tbl, ARM_L1ICACHE_SIZE, 4); - build_append_int_noprefix(tbl, ARM_L1ICACHE_SETS, 4); - build_append_byte(tbl, ARM_L1ICACHE_ASSOCIATIVITY); - build_append_byte(tbl, ARM_L1ICACHE_ATTRIBUTES); - build_append_int_noprefix(tbl, ARM_L1ICACHE_LINE_SIZE, 2); - break; - case ARM_L2_CACHE: /* L2 cache info */ - build_append_int_noprefix(tbl, ARM_L2CACHE_SIZE, 4); - build_append_int_noprefix(tbl, ARM_L2CACHE_SETS, 4); - build_append_byte(tbl, ARM_L2CACHE_ASSOCIATIVITY); - build_append_byte(tbl, ARM_L2CACHE_ATTRIBUTES); - build_append_int_noprefix(tbl, ARM_L2CACHE_LINE_SIZE, 2); - break; - case ARM_L3_CACHE: /* L3 cache info */ - build_append_int_noprefix(tbl, ARM_L3CACHE_SIZE, 4); - build_append_int_noprefix(tbl, ARM_L3CACHE_SETS, 4); - build_append_byte(tbl, ARM_L3CACHE_ASSOCIATIVITY); - build_append_byte(tbl, ARM_L3CACHE_ATTRIBUTES); - build_append_int_noprefix(tbl, ARM_L3CACHE_LINE_SIZE, 2); - break; - default: - build_append_int_noprefix(tbl, 0, 4); - build_append_int_noprefix(tbl, 0, 4); - build_append_byte(tbl, 0); - build_append_byte(tbl, 0); - build_append_int_noprefix(tbl, 0, 2); - } -} - -/* - * ACPI spec, Revision 6.3 - * 5.2.29 Processor Properties Topology Table (PPTT) - */ -void build_pptt_arm(GArray *table_data, BIOSLinker *linker, MachineState *ms, - const char *oem_id, const char *oem_table_id) -{ - MachineClass *mc = MACHINE_GET_CLASS(ms); - GQueue *list = g_queue_new(); - guint 
pptt_start = table_data->len; - guint parent_offset; - guint length, i; - int uid = 0; - int socket; - AcpiTable table = { .sig = "PPTT", .rev = 2, - .oem_id = oem_id, .oem_table_id = oem_table_id }; - - acpi_table_begin(&table, table_data); - - for (socket = 0; socket < ms->smp.sockets; socket++) { - uint32_t l3_cache_offset = table_data->len - pptt_start; - build_cache_hierarchy_node(table_data, 0, ARM_L3_CACHE); - - g_queue_push_tail(list, - GUINT_TO_POINTER(table_data->len - pptt_start)); - build_processor_hierarchy_node( - table_data, - /* - * Physical package - represents the boundary - * of a physical package - */ - (1 << 0), - 0, socket, &l3_cache_offset, 1); - } - - if (mc->smp_props.clusters_supported) { - length = g_queue_get_length(list); - for (i = 0; i < length; i++) { - int cluster; - - parent_offset = GPOINTER_TO_UINT(g_queue_pop_head(list)); - for (cluster = 0; cluster < ms->smp.clusters; cluster++) { - g_queue_push_tail(list, - GUINT_TO_POINTER(table_data->len - pptt_start)); - build_processor_hierarchy_node( - table_data, - (0 << 0), /* not a physical package */ - parent_offset, cluster, NULL, 0); - } - } - } - - length = g_queue_get_length(list); - for (i = 0; i < length; i++) { - int core; - - parent_offset = GPOINTER_TO_UINT(g_queue_pop_head(list)); - for (core = 0; core < ms->smp.cores; core++) { - uint32_t priv_rsrc[3] = {}; - priv_rsrc[0] = table_data->len - pptt_start; /* L2 cache offset */ - build_cache_hierarchy_node(table_data, 0, ARM_L2_CACHE); - - priv_rsrc[1] = table_data->len - pptt_start; /* L1 dcache offset */ - build_cache_hierarchy_node(table_data, priv_rsrc[0], ARM_L1D_CACHE); - - priv_rsrc[2] = table_data->len - pptt_start; /* L1 icache offset */ - build_cache_hierarchy_node(table_data, priv_rsrc[0], ARM_L1I_CACHE); - - if (ms->smp.threads > 1) { - g_queue_push_tail(list, - GUINT_TO_POINTER(table_data->len - pptt_start)); - build_processor_hierarchy_node( - table_data, - (0 << 0), /* not a physical package */ - parent_offset, core, priv_rsrc, 3); - } else { - build_processor_hierarchy_node( - table_data, - (1 << 1) | /* ACPI Processor ID valid */ - (1 << 3), /* Node is a Leaf */ - parent_offset, uid++, priv_rsrc, 3); - } - } - } - - length = g_queue_get_length(list); - for (i = 0; i < length; i++) { - int thread; - - parent_offset = GPOINTER_TO_UINT(g_queue_pop_head(list)); - for (thread = 0; thread < ms->smp.threads; thread++) { - build_processor_hierarchy_node( - table_data, - (1 << 1) | /* ACPI Processor ID valid */ - (1 << 2) | /* Processor is a Thread */ - (1 << 3), /* Node is a Leaf */ - parent_offset, uid++, NULL, 0); - } - } - - g_queue_free(list); - acpi_table_end(linker, &table); -} - /* * ACPI spec, Revision 6.3 * 5.2.29 Processor Properties Topology Table (PPTT) diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c index 179600d4fe..86984b7167 100644 --- a/hw/arm/virt-acpi-build.c +++ b/hw/arm/virt-acpi-build.c @@ -63,6 +63,173 @@ #define ACPI_BUILD_TABLE_SIZE 0x20000 +/* + * ACPI spec, Revision 6.3 + * 5.2.29.2 Cache Type Structure (Type 1) + */ +static void build_cache_hierarchy_node(GArray *tbl, uint32_t next_level, + uint32_t cache_type) +{ + build_append_byte(tbl, 1); + build_append_byte(tbl, 24); + build_append_int_noprefix(tbl, 0, 2); + build_append_int_noprefix(tbl, 127, 4); + build_append_int_noprefix(tbl, next_level, 4); + + switch (cache_type) { + case ARM_L1D_CACHE: /* L1 dcache info */ + build_append_int_noprefix(tbl, ARM_L1DCACHE_SIZE, 4); + build_append_int_noprefix(tbl, ARM_L1DCACHE_SETS, 4); + 
build_append_byte(tbl, ARM_L1DCACHE_ASSOCIATIVITY); + build_append_byte(tbl, ARM_L1DCACHE_ATTRIBUTES); + build_append_int_noprefix(tbl, ARM_L1DCACHE_LINE_SIZE, 2); + break; + case ARM_L1I_CACHE: /* L1 icache info */ + build_append_int_noprefix(tbl, ARM_L1ICACHE_SIZE, 4); + build_append_int_noprefix(tbl, ARM_L1ICACHE_SETS, 4); + build_append_byte(tbl, ARM_L1ICACHE_ASSOCIATIVITY); + build_append_byte(tbl, ARM_L1ICACHE_ATTRIBUTES); + build_append_int_noprefix(tbl, ARM_L1ICACHE_LINE_SIZE, 2); + break; + case ARM_L1_CACHE: /* L1 cache info */ + build_append_int_noprefix(tbl, ARM_L1CACHE_SIZE, 4); + build_append_int_noprefix(tbl, ARM_L1CACHE_SETS, 4); + build_append_byte(tbl, ARM_L1CACHE_ASSOCIATIVITY); + build_append_byte(tbl, ARM_L1CACHE_ATTRIBUTES); + build_append_int_noprefix(tbl, ARM_L1CACHE_LINE_SIZE, 2); + break; + case ARM_L2_CACHE: /* L2 cache info */ + build_append_int_noprefix(tbl, ARM_L2CACHE_SIZE, 4); + build_append_int_noprefix(tbl, ARM_L2CACHE_SETS, 4); + build_append_byte(tbl, ARM_L2CACHE_ASSOCIATIVITY); + build_append_byte(tbl, ARM_L2CACHE_ATTRIBUTES); + build_append_int_noprefix(tbl, ARM_L2CACHE_LINE_SIZE, 2); + break; + case ARM_L3_CACHE: /* L3 cache info */ + build_append_int_noprefix(tbl, ARM_L3CACHE_SIZE, 4); + build_append_int_noprefix(tbl, ARM_L3CACHE_SETS, 4); + build_append_byte(tbl, ARM_L3CACHE_ASSOCIATIVITY); + build_append_byte(tbl, ARM_L3CACHE_ATTRIBUTES); + build_append_int_noprefix(tbl, ARM_L3CACHE_LINE_SIZE, 2); + break; + default: + build_append_int_noprefix(tbl, 0, 4); + build_append_int_noprefix(tbl, 0, 4); + build_append_byte(tbl, 0); + build_append_byte(tbl, 0); + build_append_int_noprefix(tbl, 0, 2); + } +} + +/* + * ACPI spec, Revision 6.3 + * 5.2.29 Processor Properties Topology Table (PPTT) + */ +static void build_pptt_arm(GArray *table_data, BIOSLinker *linker, MachineState *ms, + const char *oem_id, const char *oem_table_id) +{ + MachineClass *mc = MACHINE_GET_CLASS(ms); + GQueue *list = g_queue_new(); + guint pptt_start = table_data->len; + guint parent_offset; + guint length, i; + int uid = 0; + int socket; + AcpiTable table = { .sig = "PPTT", .rev = 2, + .oem_id = oem_id, .oem_table_id = oem_table_id }; + bool unified_l1 = cpu_l1_cache_unified(0); + + acpi_table_begin(&table, table_data); + + for (socket = 0; socket < ms->smp.sockets; socket++) { + uint32_t l3_cache_offset = table_data->len - pptt_start; + build_cache_hierarchy_node(table_data, 0, ARM_L3_CACHE); + + g_queue_push_tail(list, + GUINT_TO_POINTER(table_data->len - pptt_start)); + build_processor_hierarchy_node( + table_data, + /* + * Physical package - represents the boundary + * of a physical package + */ + (1 << 0), + 0, socket, &l3_cache_offset, 1); + } + + if (mc->smp_props.clusters_supported) { + length = g_queue_get_length(list); + for (i = 0; i < length; i++) { + int cluster; + + parent_offset = GPOINTER_TO_UINT(g_queue_pop_head(list)); + for (cluster = 0; cluster < ms->smp.clusters; cluster++) { + g_queue_push_tail(list, + GUINT_TO_POINTER(table_data->len - pptt_start)); + build_processor_hierarchy_node( + table_data, + (0 << 0), /* not a physical package */ + parent_offset, cluster, NULL, 0); + } + } + } + + length = g_queue_get_length(list); + for (i = 0; i < length; i++) { + int core; + + parent_offset = GPOINTER_TO_UINT(g_queue_pop_head(list)); + for (core = 0; core < ms->smp.cores; core++) { + uint32_t priv_rsrc[3] = {}; + priv_rsrc[0] = table_data->len - pptt_start; /* L2 cache offset */ + build_cache_hierarchy_node(table_data, 0, ARM_L2_CACHE); + + if (unified_l1) { + 
priv_rsrc[1] = table_data->len - pptt_start; /* L1 cache offset */ + build_cache_hierarchy_node(table_data, priv_rsrc[0], ARM_L1_CACHE); + } else { + priv_rsrc[1] = table_data->len - pptt_start; /* L1 dcache offset */ + build_cache_hierarchy_node(table_data, priv_rsrc[0], ARM_L1D_CACHE); + priv_rsrc[2] = table_data->len - pptt_start; /* L1 icache offset */ + build_cache_hierarchy_node(table_data, priv_rsrc[0], ARM_L1I_CACHE); + } + + if (ms->smp.threads > 1) { + g_queue_push_tail(list, + GUINT_TO_POINTER(table_data->len - pptt_start)); + build_processor_hierarchy_node( + table_data, + (0 << 0), /* not a physical package */ + parent_offset, core, priv_rsrc, 3); + } else { + build_processor_hierarchy_node( + table_data, + (1 << 1) | /* ACPI Processor ID valid */ + (1 << 3), /* Node is a Leaf */ + parent_offset, uid++, priv_rsrc, 3); + } + } + } + + length = g_queue_get_length(list); + for (i = 0; i < length; i++) { + int thread; + + parent_offset = GPOINTER_TO_UINT(g_queue_pop_head(list)); + for (thread = 0; thread < ms->smp.threads; thread++) { + build_processor_hierarchy_node( + table_data, + (1 << 1) | /* ACPI Processor ID valid */ + (1 << 2) | /* Processor is a Thread */ + (1 << 3), /* Node is a Leaf */ + parent_offset, uid++, NULL, 0); + } + } + + g_queue_free(list); + acpi_table_end(linker, &table); +} + static void acpi_dsdt_add_psd(Aml *dev, int cpus) { Aml *pkg; diff --git a/hw/arm/virt.c b/hw/arm/virt.c index e31c289968..a9efcec85e 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -401,6 +401,39 @@ static void fdt_add_timer_nodes(const VirtMachineState *vms) INTID_TO_PPI(ARCH_TIMER_NS_EL2_IRQ), irqflags); } +/* + * In CLIDR_EL1 exposed to guest by the hypervisor, L1 cache type + * maybe unified or seperate ins and data. We need to read the + * guest visable CLIDR_EL1 and check L1 cache type. 
+ */ +bool cpu_l1_cache_unified(int cpu) +{ + bool unified = false; + uint64_t clidr; + ARMCPU *armcpu = ARM_CPU(qemu_get_cpu(cpu)); + CPUState *cs = CPU(armcpu); + int ret; + + if (kvm_enabled()) { + struct kvm_one_reg reg = { + .id = ARM64_REG_CLIDR_EL1, + .addr = (uintptr_t)&clidr + }; + + ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, ®); + if (ret) { + error_setg(&error_fatal, "Get vCPU clidr from KVM failed:%d", ret); + return unified; + } + + if (CLIDR_CTYPE(clidr, 1) == CTYPE_UNIFIED) { + unified = true; + } + } + + return unified; +} + static void fdt_add_l3cache_nodes(const VirtMachineState *vms) { int i; @@ -415,9 +448,10 @@ static void fdt_add_l3cache_nodes(const VirtMachineState *vms) qemu_fdt_setprop_string(ms->fdt, nodename, "compatible", "cache"); qemu_fdt_setprop_string(ms->fdt, nodename, "cache-unified", "true"); qemu_fdt_setprop_cell(ms->fdt, nodename, "cache-level", 3); - qemu_fdt_setprop_cell(ms->fdt, nodename, "cache-size", 0x2000000); - qemu_fdt_setprop_cell(ms->fdt, nodename, "cache-line-size", 128); - qemu_fdt_setprop_cell(ms->fdt, nodename, "cache-sets", 2048); + qemu_fdt_setprop_cell(ms->fdt, nodename, "cache-size", ARM_L3CACHE_SIZE); + qemu_fdt_setprop_cell(ms->fdt, nodename, "cache-line-size", + ARM_L3CACHE_LINE_SIZE); + qemu_fdt_setprop_cell(ms->fdt, nodename, "cache-sets", ARM_L3CACHE_SETS); qemu_fdt_setprop_cell(ms->fdt, nodename, "phandle", qemu_fdt_alloc_phandle(ms->fdt)); g_free(nodename); @@ -436,10 +470,12 @@ static void fdt_add_l2cache_nodes(const VirtMachineState *vms) char *nodename = g_strdup_printf("/cpus/l2-cache%d", cpu); qemu_fdt_add_subnode(ms->fdt, nodename); + qemu_fdt_setprop_string(ms->fdt, nodename, "cache-unified", "true"); qemu_fdt_setprop_string(ms->fdt, nodename, "compatible", "cache"); - qemu_fdt_setprop_cell(ms->fdt, nodename, "cache-size", 0x80000); - qemu_fdt_setprop_cell(ms->fdt, nodename, "cache-line-size", 64); - qemu_fdt_setprop_cell(ms->fdt, nodename, "cache-sets", 1024); + qemu_fdt_setprop_cell(ms->fdt, nodename, "cache-size", ARM_L2CACHE_SIZE); + qemu_fdt_setprop_cell(ms->fdt, nodename, "cache-line-size", + ARM_L2CACHE_LINE_SIZE); + qemu_fdt_setprop_cell(ms->fdt, nodename, "cache-sets", ARM_L2CACHE_SETS); qemu_fdt_setprop_phandle(ms->fdt, nodename, "next-level-cache", next_path); qemu_fdt_setprop_cell(ms->fdt, nodename, "phandle", @@ -453,18 +489,32 @@ static void fdt_add_l2cache_nodes(const VirtMachineState *vms) static void fdt_add_l1cache_prop(const VirtMachineState *vms, char *nodename, int cpu) { - const MachineState *ms = MACHINE(vms); - char *cachename = g_strdup_printf("/cpus/l2-cache%d", cpu); - - qemu_fdt_setprop_cell(ms->fdt, nodename, "d-cache-size", 0x10000); - qemu_fdt_setprop_cell(ms->fdt, nodename, "d-cache-line-size", 64); - qemu_fdt_setprop_cell(ms->fdt, nodename, "d-cache-sets", 256); - qemu_fdt_setprop_cell(ms->fdt, nodename, "i-cache-size", 0x10000); - qemu_fdt_setprop_cell(ms->fdt, nodename, "i-cache-line-size", 64); - qemu_fdt_setprop_cell(ms->fdt, nodename, "i-cache-sets", 256); - qemu_fdt_setprop_phandle(ms->fdt, nodename, "next-level-cache", - cachename); - g_free(cachename); + const MachineState *ms = MACHINE(vms); + char *next_path = g_strdup_printf("/cpus/l2-cache%d", cpu); + bool unified_l1 = cpu_l1_cache_unified(0); + + if (unified_l1) { + qemu_fdt_setprop_cell(ms->fdt, nodename, "cache-size", ARM_L1CACHE_SIZE); + qemu_fdt_setprop_cell(ms->fdt, nodename, "cache-line-size", + ARM_L1CACHE_LINE_SIZE); + qemu_fdt_setprop_cell(ms->fdt, nodename, "cache-sets", ARM_L1CACHE_SETS); + } else { + 
qemu_fdt_setprop_cell(ms->fdt, nodename, "d-cache-size", + ARM_L1DCACHE_SIZE); + qemu_fdt_setprop_cell(ms->fdt, nodename, "d-cache-line-size", + ARM_L1DCACHE_LINE_SIZE); + qemu_fdt_setprop_cell(ms->fdt, nodename, "d-cache-sets", + ARM_L1DCACHE_SETS); + qemu_fdt_setprop_cell(ms->fdt, nodename, "i-cache-size", + ARM_L1ICACHE_SIZE); + qemu_fdt_setprop_cell(ms->fdt, nodename, "i-cache-line-size", + ARM_L1ICACHE_LINE_SIZE); + qemu_fdt_setprop_cell(ms->fdt, nodename, "i-cache-sets", + ARM_L1ICACHE_SETS); + } + qemu_fdt_setprop_phandle(ms->fdt, nodename, "next-level-cache", next_path); + + g_free(next_path); } static void fdt_add_cpu_nodes(const VirtMachineState *vms) diff --git a/include/hw/acpi/aml-build.h b/include/hw/acpi/aml-build.h index 7281c281f6..91f9cbf4f1 100644 --- a/include/hw/acpi/aml-build.h +++ b/include/hw/acpi/aml-build.h @@ -221,51 +221,6 @@ struct AcpiBuildTables { BIOSLinker *linker; } AcpiBuildTables; -/* Definitions of the hardcoded cache info*/ - -typedef enum { - ARM_L1D_CACHE, - ARM_L1I_CACHE, - ARM_L2_CACHE, - ARM_L3_CACHE -} ArmCacheType; - -/* L1 data cache: */ -#define ARM_L1DCACHE_SIZE 65536 -#define ARM_L1DCACHE_SETS 256 -#define ARM_L1DCACHE_ASSOCIATIVITY 4 -#define ARM_L1DCACHE_ATTRIBUTES 2 -#define ARM_L1DCACHE_LINE_SIZE 64 - -/* L1 instruction cache: */ -#define ARM_L1ICACHE_SIZE 65536 -#define ARM_L1ICACHE_SETS 256 -#define ARM_L1ICACHE_ASSOCIATIVITY 4 -#define ARM_L1ICACHE_ATTRIBUTES 4 -#define ARM_L1ICACHE_LINE_SIZE 64 - -/* Level 2 unified cache: */ -#define ARM_L2CACHE_SIZE 524288 -#define ARM_L2CACHE_SETS 1024 -#define ARM_L2CACHE_ASSOCIATIVITY 8 -#define ARM_L2CACHE_ATTRIBUTES 10 -#define ARM_L2CACHE_LINE_SIZE 64 - -/* Level 3 unified cache: */ -#define ARM_L3CACHE_SIZE 33554432 -#define ARM_L3CACHE_SETS 2048 -#define ARM_L3CACHE_ASSOCIATIVITY 15 -#define ARM_L3CACHE_ATTRIBUTES 10 -#define ARM_L3CACHE_LINE_SIZE 128 - -struct offset_status { - uint32_t parent; - uint32_t l2_offset; - uint32_t l1d_offset; - uint32_t l1i_offset; -}; - - typedef struct CrsRangeEntry { uint64_t base; @@ -460,6 +415,7 @@ Aml *aml_sizeof(Aml *arg); Aml *aml_concatenate(Aml *source1, Aml *source2, Aml *target); Aml *aml_object_type(Aml *object); +void build_append_byte(GArray *array, uint8_t val); void build_append_int_noprefix(GArray *table, uint64_t value, int size); typedef struct AcpiTable { @@ -537,10 +493,12 @@ void build_srat_memory(GArray *table_data, uint64_t base, void build_slit(GArray *table_data, BIOSLinker *linker, MachineState *ms, const char *oem_id, const char *oem_table_id); -void build_pptt(GArray *table_data, BIOSLinker *linker, MachineState *ms, - const char *oem_id, const char *oem_table_id); +void build_processor_hierarchy_node(GArray *tbl, uint32_t flags, + uint32_t parent, uint32_t id, + uint32_t *priv_rsrc, + uint32_t priv_num); -void build_pptt_arm(GArray *table_data, BIOSLinker *linker, MachineState *ms, +void build_pptt(GArray *table_data, BIOSLinker *linker, MachineState *ms, const char *oem_id, const char *oem_table_id); void build_fadt(GArray *tbl, BIOSLinker *linker, const AcpiFadtData *f, diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h index 76a0d3fa5b..4b7dc61c24 100644 --- a/include/hw/arm/virt.h +++ b/include/hw/arm/virt.h @@ -47,6 +47,65 @@ /* See Linux kernel arch/arm64/include/asm/pvclock-abi.h */ #define PVTIME_SIZE_PER_CPU 64 +/* ARM CLIDR_EL1 related definitions */ +/* Ctypen, bits[3(n - 1) + 2 : 3(n - 1)], for n = 1 to 7 */ +#define CTYPE_NONE 0b000 +#define CTYPE_INS 0b001 +#define CTYPE_DATA 0b010 +#define CTYPE_INS_DATA 
0b011 +#define CTYPE_UNIFIED 0b100 + +#define ARM64_REG_CLIDR_EL1 ARM64_SYS_REG(3, 1, 0, 0, 1) + +#define CLIDR_CTYPE_SHIFT(level) (3 * (level - 1)) +#define CLIDR_CTYPE_MASK(level) (7 << CLIDR_CTYPE_SHIFT(level)) +#define CLIDR_CTYPE(clidr, level) \ + (((clidr) & CLIDR_CTYPE_MASK(level)) >> CLIDR_CTYPE_SHIFT(level)) + +/* L1 data cache */ +#define ARM_L1DCACHE_SIZE 65536 +#define ARM_L1DCACHE_SETS 256 +#define ARM_L1DCACHE_ASSOCIATIVITY 4 +#define ARM_L1DCACHE_ATTRIBUTES 2 +#define ARM_L1DCACHE_LINE_SIZE 64 + +/* L1 instruction cache */ +#define ARM_L1ICACHE_SIZE 65536 +#define ARM_L1ICACHE_SETS 256 +#define ARM_L1ICACHE_ASSOCIATIVITY 4 +#define ARM_L1ICACHE_ATTRIBUTES 4 +#define ARM_L1ICACHE_LINE_SIZE 64 + +/* L1 unified cache */ +#define ARM_L1CACHE_SIZE 131072 +#define ARM_L1CACHE_SETS 256 +#define ARM_L1CACHE_ASSOCIATIVITY 4 +#define ARM_L1CACHE_ATTRIBUTES 10 +#define ARM_L1CACHE_LINE_SIZE 128 + +/* L2 unified cache */ +#define ARM_L2CACHE_SIZE 524288 +#define ARM_L2CACHE_SETS 1024 +#define ARM_L2CACHE_ASSOCIATIVITY 8 +#define ARM_L2CACHE_ATTRIBUTES 10 +#define ARM_L2CACHE_LINE_SIZE 64 + +/* L3 unified cache */ +#define ARM_L3CACHE_SIZE 33554432 +#define ARM_L3CACHE_SETS 2048 +#define ARM_L3CACHE_ASSOCIATIVITY 15 +#define ARM_L3CACHE_ATTRIBUTES 10 +#define ARM_L3CACHE_LINE_SIZE 128 + +/* Definitions of the hardcoded cache info */ +typedef enum { + ARM_L1D_CACHE, + ARM_L1I_CACHE, + ARM_L1_CACHE, + ARM_L2_CACHE, + ARM_L3_CACHE +} ArmCacheType; + enum { VIRT_FLASH, VIRT_MEM, @@ -194,6 +253,7 @@ OBJECT_DECLARE_TYPE(VirtMachineState, VirtMachineClass, VIRT_MACHINE) void virt_acpi_setup(VirtMachineState *vms); bool virt_is_acpi_enabled(VirtMachineState *vms); +bool cpu_l1_cache_unified(int cpu); /* Return number of redistributors that fit in the specified region */ static uint32_t virt_redist_capacity(VirtMachineState *vms, int region) -- Gitee From 8ac5c38a54d407b363d6633eb01806b0e9aaa15e Mon Sep 17 00:00:00 2001 From: yinxiuxiu Date: Fri, 22 Nov 2024 14:45:09 +0800 Subject: [PATCH 540/939] Avoid taking address of out-of-bounds array index Signed-off-by: yinxiuxiu --- hw/intc/openpic.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/hw/intc/openpic.c b/hw/intc/openpic.c index 0f99b77a17..d74ec11af4 100644 --- a/hw/intc/openpic.c +++ b/hw/intc/openpic.c @@ -1031,13 +1031,14 @@ static void openpic_cpu_write_internal(void *opaque, hwaddr addr, s_IRQ = IRQ_get_next(opp, &dst->servicing); /* Check queued interrupts. */ n_IRQ = IRQ_get_next(opp, &dst->raised); - src = &opp->src[n_IRQ]; - if (n_IRQ != -1 && - (s_IRQ == -1 || - IVPR_PRIORITY(src->ivpr) > dst->servicing.priority)) { - DPRINTF("Raise OpenPIC INT output cpu %d irq %d", - idx, n_IRQ); - qemu_irq_raise(opp->dst[idx].irqs[OPENPIC_OUTPUT_INT]); + if (n_IRQ != -1) { + src = &opp->src[n_IRQ]; + if (s_IRQ == -1 || + IVPR_PRIORITY(src->ivpr) > dst->servicing.priority) { + DPRINTF("Raise OpenPIC INT output cpu %d irq %d", + idx, n_IRQ); + qemu_irq_raise(opp->dst[idx].irqs[OPENPIC_OUTPUT_INT]); + } } break; default: -- Gitee From 482808a35957c10d9eb4264492a8e11a2ba749c1 Mon Sep 17 00:00:00 2001 From: gubin Date: Fri, 22 Nov 2024 17:49:38 +0800 Subject: [PATCH 541/939] hw/audio/virtio-snd: Always use little endian audio format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cherry-pick from a276ec8e2632c9015d0f9b4e47194e4e91dfa8bb The VIRTIO Sound Device conforms with the Virtio spec v1.2, thus only use little endianness. 
Remove the suspicious target_words_bigendian() noticed during code review. Cc: qemu-stable@nongnu.org Fixes: eb9ad377bb ("virtio-sound: handle control messages and streams") Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Michael S. Tsirkin Message-Id: <20240422211830.25606-1-philmd@linaro.org> Signed-off-by: gubin --- hw/audio/virtio-snd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/audio/virtio-snd.c b/hw/audio/virtio-snd.c index 817fdcd910..9f7a69e408 100644 --- a/hw/audio/virtio-snd.c +++ b/hw/audio/virtio-snd.c @@ -377,7 +377,7 @@ static void virtio_snd_get_qemu_audsettings(audsettings *as, as->nchannels = MIN(AUDIO_MAX_CHANNELS, params->channels); as->fmt = virtio_snd_get_qemu_format(params->format); as->freq = virtio_snd_get_qemu_freq(params->rate); - as->endianness = target_words_bigendian() ? 1 : 0; + as->endianness = 0; /* Conforming to VIRTIO 1.0: always little endian. */ } /* -- Gitee From 7810c5462cc56c92f50ecf3878525c15000212f6 Mon Sep 17 00:00:00 2001 From: gubin Date: Fri, 22 Nov 2024 18:02:26 +0800 Subject: [PATCH 542/939] target/arm: Avoid shifts by -1 in tszimm_shr() and tszimm_shl() cherry-pick from 76916dfa89e8900639c1055c07a295c06628a0bc The function tszimm_esz() returns a shift amount, or possibly -1 in certain cases that correspond to unallocated encodings in the instruction set. We catch these later in the trans_ functions (generally with an "a-esz < 0" check), but before we do the decodetree-generated code will also call tszimm_shr() or tszimm_sl(), which will use the tszimm_esz() return value as a shift count without checking that it is not negative, which is undefined behaviour. Avoid the UB by checking the return value in tszimm_shr() and tszimm_shl(). Cc: qemu-stable@nongnu.org Resolves: Coverity CID 1547617, 1547694 Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20240722172957.1041231-4-peter.maydell@linaro.org Signed-off-by: gubin --- target/arm/tcg/translate-sve.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/target/arm/tcg/translate-sve.c b/target/arm/tcg/translate-sve.c index 296e7d1ce2..dd0c633897 100644 --- a/target/arm/tcg/translate-sve.c +++ b/target/arm/tcg/translate-sve.c @@ -50,13 +50,27 @@ static int tszimm_esz(DisasContext *s, int x) static int tszimm_shr(DisasContext *s, int x) { - return (16 << tszimm_esz(s, x)) - x; + /* + * We won't use the tszimm_shr() value if tszimm_esz() returns -1 (the + * trans function will check for esz < 0), so we can return any + * value we like from here in that case as long as we avoid UB. + */ + int esz = tszimm_esz(s, x); + if (esz < 0) { + return esz; + } + return (16 << esz) - x; } /* See e.g. LSL (immediate, predicated). */ static int tszimm_shl(DisasContext *s, int x) { - return x - (8 << tszimm_esz(s, x)); + /* As with tszimm_shr(), value will be unused if esz < 0 */ + int esz = tszimm_esz(s, x); + if (esz < 0) { + return esz; + } + return x - (8 << esz); } /* The SH bit is in bit 8. Extract the low 8 and shift. */ -- Gitee From 2f37362de1d971cc90c35405705bfa22a33f6cd8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=2E=20Neusch=C3=A4fer?= Date: Wed, 20 Nov 2024 14:20:24 -0600 Subject: [PATCH 543/939] linux-user: Print tid not pid with strace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This aligns with strace, and is very useful when tracing multi-threaded programs. The result is the same in single-threaded programs. Signed-off-by: J. 
Neuschäfer Message-Id: 20241024-strace-v1-1-56c4161431cd@gmx.net [rth: Use TaskState.ts_tid via get_task_state()] Signed-off-by: Richard Henderson Signed-off-by: Zhongrui Tang --- linux-user/strace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/linux-user/strace.c b/linux-user/strace.c index cf26e55264..ac9177ebe4 100644 --- a/linux-user/strace.c +++ b/linux-user/strace.c @@ -4176,7 +4176,7 @@ print_syscall(CPUArchState *cpu_env, int num, if (!f) { return; } - fprintf(f, "%d ", getpid()); + fprintf(f, "%d ", get_task_state(env_cpu(cpu_env))->ts_tid); for (i = 0; i < nsyscalls; i++) { if (scnames[i].nr == num) { -- Gitee From d0076c906a96019c0fe12be78e5ab21eaf15e69e Mon Sep 17 00:00:00 2001 From: qihao_yewu Date: Mon, 25 Nov 2024 04:48:16 -0500 Subject: [PATCH 544/939] hw/timer/exynos4210_mct: fix possible int overflow cheery-pick from c5d36da7ec62e4c72a72a437057fb6072cf0d6ab The product "icnto * s->tcntb" may overflow uint32_t. Found by Linux Verification Center (linuxtesting.org) with SVACE. Signed-off-by: Dmitry Frolov Message-id: 20241106083801.219578-2-frolov@swemel.ru Reviewed-by: Peter Maydell Signed-off-by: Peter Maydell Signed-off-by: qihao_yewu --- hw/timer/exynos4210_mct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/timer/exynos4210_mct.c b/hw/timer/exynos4210_mct.c index 446bbd2b96..6f47bfe2c2 100644 --- a/hw/timer/exynos4210_mct.c +++ b/hw/timer/exynos4210_mct.c @@ -815,7 +815,7 @@ static uint32_t exynos4210_ltick_cnt_get_cnto(struct tick_timer *s) /* Both are counting */ icnto = remain / s->tcntb; if (icnto) { - tcnto = remain % (icnto * s->tcntb); + tcnto = remain % ((uint64_t)icnto * s->tcntb); } else { tcnto = remain % s->tcntb; } -- Gitee From ea76b33ca7a8c2fd39f50b6d1bb6702ab0a4fc87 Mon Sep 17 00:00:00 2001 From: fangyi Date: Sat, 22 Jun 2024 07:02:48 +0000 Subject: [PATCH 545/939] vdpa: fix vdpa device migrate rollback wrong when suspend device failed. 1. set vdpa->suspended before call vhost_dev_suspend to make sure vdpa device will resume when suspend failed. 2. using state == RUN_STATE_FINISH_MIGRATE instead of ms->state == MIGRATION_STATUS_ACTIVE to judge vm in migration. As migrate_fd_cancel will change ms->state, which will result in some vdpa devices not being suspended. 
Signed-off-by: fangyi --- hw/virtio/vdpa-dev-mig.c | 81 ++++------------------------------------ 1 file changed, 7 insertions(+), 74 deletions(-) diff --git a/hw/virtio/vdpa-dev-mig.c b/hw/virtio/vdpa-dev-mig.c index 887c96a201..7de996c835 100644 --- a/hw/virtio/vdpa-dev-mig.c +++ b/hw/virtio/vdpa-dev-mig.c @@ -130,100 +130,33 @@ free: static int vhost_vdpa_device_suspend(VhostVdpaDevice *vdpa) { VirtIODevice *vdev = VIRTIO_DEVICE(vdpa); - BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); - VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); - int ret; if (!vdpa->started || vdpa->suspended) { return 0; } - if (!k->set_guest_notifiers) { - return -EFAULT; - } - - vdpa->started = false; vdpa->suspended = true; - ret = vhost_dev_suspend(&vdpa->dev, vdev, false); - if (ret) { - goto suspend_fail; - } - - ret = k->set_guest_notifiers(qbus->parent, vdpa->dev.nvqs, false); - if (ret < 0) { - error_report("vhost guest notifier cleanup failed: %d\n", ret); - goto set_guest_notifiers_fail; - } - - vhost_dev_disable_notifiers(&vdpa->dev, vdev); - return ret; - -set_guest_notifiers_fail: - ret = k->set_guest_notifiers(qbus->parent, vdpa->dev.nvqs, true); - if (ret) { - error_report("vhost guest notifier restore failed: %d\n", ret); - } - -suspend_fail: - vdpa->suspended = false; - vdpa->started = true; - return ret; + return vhost_dev_suspend(&vdpa->dev, vdev, false); } static int vhost_vdpa_device_resume(VhostVdpaDevice *vdpa) { VirtIODevice *vdev = VIRTIO_DEVICE(vdpa); - BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); - VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); - int i, ret; + MigrationIncomingState *mis = migration_incoming_get_current(); + int ret; - if (vdpa->started || !vdpa->suspended) { + if (!vdpa->started || + (!vdpa->suspended && mis->state != RUN_STATE_RESTORE_VM)) { return 0; } - if (!k->set_guest_notifiers) { - error_report("binding does not support guest notifiers\n"); - return -ENOSYS; - } - - ret = vhost_dev_enable_notifiers(&vdpa->dev, vdev); + ret = vhost_dev_resume(&vdpa->dev, vdev, false); if (ret < 0) { - error_report("Error enabling host notifiers: %d\n", ret); return ret; } - ret = k->set_guest_notifiers(qbus->parent, vdpa->dev.nvqs, true); - if (ret < 0) { - error_report("Error binding guest notifier: %d\n", ret); - goto err_host_notifiers; - } - - vdpa->dev.acked_features = vdev->guest_features; - - ret = vhost_dev_resume(&vdpa->dev, vdev, false); - if (ret < 0) { - error_report("Error starting vhost: %d\n", ret); - goto err_guest_notifiers; - } - vdpa->started = true; vdpa->suspended = false; - - /* - * guest_notifier_mask/pending not used yet, so just unmask - * everything here. virtio-pci will do the right thing by - * enabling/disabling irqfd. 
- */ - for (i = 0; i < vdpa->dev.nvqs; i++) { - vhost_virtqueue_mask(&vdpa->dev, vdev, i, false); - } - - return ret; - -err_guest_notifiers: - k->set_guest_notifiers(qbus->parent, vdpa->dev.nvqs, false); -err_host_notifiers: - vhost_dev_disable_notifiers(&vdpa->dev, vdev); return ret; } @@ -248,7 +181,7 @@ static void vdpa_dev_vmstate_change(void *opaque, bool running, RunState state) MigrationIncomingState *mis = migration_incoming_get_current(); if (!running) { - if (ms->state == MIGRATION_STATUS_ACTIVE || state == RUN_STATE_PAUSED) { + if (state == RUN_STATE_FINISH_MIGRATE || state == RUN_STATE_PAUSED) { ret = vhost_vdpa_device_suspend(vdpa); if (ret) { error_report("suspend vdpa device failed: %d\n", ret); -- Gitee From 8c65e8d7c923ade6f3c7fbef43000562d4733629 Mon Sep 17 00:00:00 2001 From: fangyi Date: Sat, 7 Sep 2024 07:11:07 +0000 Subject: [PATCH 546/939] vdpa: support resizing virtio-blk capacity online for kernel vdpa Signed-off-by: fangyi --- hw/virtio/vdpa-dev.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/hw/virtio/vdpa-dev.c b/hw/virtio/vdpa-dev.c index 91e71847b0..bf4b3ec3fd 100644 --- a/hw/virtio/vdpa-dev.c +++ b/hw/virtio/vdpa-dev.c @@ -31,6 +31,7 @@ #include "hw/virtio/vdpa-dev-mig.h" #include "migration/migration.h" #include "exec/address-spaces.h" +#include "standard-headers/linux/virtio_ids.h" static void vhost_vdpa_device_dummy_handle_output(VirtIODevice *vdev, VirtQueue *vq) @@ -201,7 +202,23 @@ static void vhost_vdpa_device_get_config(VirtIODevice *vdev, uint8_t *config) { VhostVdpaDevice *s = VHOST_VDPA_DEVICE(vdev); + uint8_t *new_config; + int ret; + + if (s->vdev_id != VIRTIO_ID_BLOCK) { + goto out; + } + new_config = g_malloc0(s->config_size); + ret = vhost_dev_get_config(&s->dev, new_config, s->config_size, NULL); + if (ret < 0) { + error_report("vhost-vdpa-device: get config failed(%d)\n", ret); + goto free; + } + memcpy(s->config, new_config, s->config_size); +free: + g_free(new_config); +out: memcpy(config, s->config, s->config_size); } -- Gitee From 05ee3017d156005e3d8d8fb19514d593858abd44 Mon Sep 17 00:00:00 2001 From: fangyi Date: Tue, 29 Oct 2024 19:51:41 +0800 Subject: [PATCH 547/939] Revert "vdpa: add vhost_vdpa_suspend" Use a new scheme instead for kernel vdpa, So revert it. This reverts commit 0bb302a9960a186fc488068d268dc373e6b70876. 
--- hw/virtio/trace-events | 1 - hw/virtio/vhost-vdpa.c | 26 -------------------------- 2 files changed, 27 deletions(-) diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events index 637cac4edf..de02bdc1d0 100644 --- a/hw/virtio/trace-events +++ b/hw/virtio/trace-events @@ -52,7 +52,6 @@ vhost_vdpa_set_vring_ready(void *dev, unsigned i, int r) "dev: %p, idx: %u, r: % vhost_vdpa_dump_config(void *dev, const char *line) "dev: %p %s" vhost_vdpa_set_config(void *dev, uint32_t offset, uint32_t size, uint32_t flags) "dev: %p offset: %"PRIu32" size: %"PRIu32" flags: 0x%"PRIx32 vhost_vdpa_get_config(void *dev, void *config, uint32_t config_len) "dev: %p config: %p config_len: %"PRIu32 -vhost_vdpa_suspend(void *dev) "dev: %p" vhost_vdpa_dev_start(void *dev, bool started) "dev: %p started: %d" vhost_vdpa_set_log_base(void *dev, uint64_t base, unsigned long long size, int refcnt, int fd, void *log) "dev: %p base: 0x%"PRIx64" size: %llu refcnt: %d fd: %d log: %p" vhost_vdpa_set_vring_addr(void *dev, unsigned int index, unsigned int flags, uint64_t desc_user_addr, uint64_t used_user_addr, uint64_t avail_user_addr, uint64_t log_guest_addr) "dev: %p index: %u flags: 0x%x desc_user_addr: 0x%"PRIx64" used_user_addr: 0x%"PRIx64" avail_user_addr: 0x%"PRIx64" log_guest_addr: 0x%"PRIx64 diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index d49826845f..130afb06dc 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -865,13 +865,11 @@ static int vhost_vdpa_get_device_id(struct vhost_dev *dev, static int vhost_vdpa_reset_device(struct vhost_dev *dev) { - struct vhost_vdpa *v = dev->opaque; int ret; uint8_t status = 0; ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status); trace_vhost_vdpa_reset_device(dev); - v->suspended = false; return ret; } @@ -1274,29 +1272,6 @@ static void vhost_vdpa_svqs_stop(struct vhost_dev *dev) } } -static void vhost_vdpa_suspend(struct vhost_dev *dev) -{ - struct vhost_vdpa *v = dev->opaque; - int r; - - if (!vhost_vdpa_first_dev(dev)) { - return; - } - - if (dev->backend_cap & BIT_ULL(VHOST_BACKEND_F_SUSPEND)) { - trace_vhost_vdpa_suspend(dev); - r = ioctl(v->device_fd, VHOST_VDPA_SUSPEND); - if (unlikely(r)) { - error_report("Cannot suspend: %s(%d)", g_strerror(errno), errno); - } else { - v->suspended = true; - return; - } - } - - vhost_vdpa_reset_device(dev); -} - static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started) { struct vhost_vdpa *v = dev->opaque; @@ -1310,7 +1285,6 @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started) return -1; } } else { - vhost_vdpa_suspend(dev); vhost_vdpa_svqs_stop(dev); vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs); } -- Gitee From e1f733fcbc4eb39333ad9527865c1590d74092ed Mon Sep 17 00:00:00 2001 From: fangyi Date: Tue, 29 Oct 2024 19:53:27 +0800 Subject: [PATCH 548/939] Revert "vdpa: add vhost_vdpa->suspended parameter" Use a new scheme instead for kernel vdpa, So revert it. This reverts commit b6662cb7e5376659c7abb56efe27dcf3898d4fe6. --- hw/virtio/vhost-vdpa.c | 8 -------- include/hw/virtio/vhost-vdpa.h | 2 -- 2 files changed, 10 deletions(-) diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index 130afb06dc..bb3320946d 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -1406,14 +1406,6 @@ static int vhost_vdpa_get_vring_base(struct vhost_dev *dev, return 0; } - if (!v->suspended) { - /* - * Cannot trust in value returned by device, let vhost recover used - * idx from guest. 
- */ - return -1; - } - ret = vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring); trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num); return ret; diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h index 5407d54fd7..ee255bc1bd 100644 --- a/include/hw/virtio/vhost-vdpa.h +++ b/include/hw/virtio/vhost-vdpa.h @@ -42,8 +42,6 @@ typedef struct vhost_vdpa { bool shadow_vqs_enabled; /* Vdpa must send shadow addresses as IOTLB key for data queues, not GPA */ bool shadow_data; - /* Device suspended successfully */ - bool suspended; /* IOVA mapping used by the Shadow Virtqueue */ VhostIOVATree *iova_tree; GPtrArray *shadow_vqs; -- Gitee From 4a79b3c07dca4f1e21e4dbb1e59bf437b2a814fa Mon Sep 17 00:00:00 2001 From: fangyi Date: Tue, 29 Oct 2024 19:58:14 +0800 Subject: [PATCH 549/939] Revert "vdpa: block migration if SVQ does not admit a feature" Use a new scheme instead for kernel vdpa, So revert it. This reverts commit 57ac831865e370012496fb581a38d261cb72c5d0. --- hw/virtio/vhost-vdpa.c | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index bb3320946d..69cf3b76e9 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -596,21 +596,6 @@ static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp) return 0; } - /* - * If dev->shadow_vqs_enabled at initialization that means the device has - * been started with x-svq=on, so don't block migration - */ - if (dev->migration_blocker == NULL && !v->shadow_vqs_enabled) { - /* We don't have dev->features yet */ - uint64_t features; - ret = vhost_vdpa_get_dev_features(dev, &features); - if (unlikely(ret)) { - error_setg_errno(errp, -ret, "Could not get device features"); - return ret; - } - vhost_svq_valid_features(features, &dev->migration_blocker); - } - /* * Similar to VFIO, we end up pinning all guest memory and have to * disable discarding of RAM. -- Gitee From 1c62372d7c9e1f71ef9563e88b7491a7272b2a7d Mon Sep 17 00:00:00 2001 From: fangyi Date: Tue, 29 Oct 2024 20:02:10 +0800 Subject: [PATCH 550/939] vdpa: remove memory listener unregister in vhost_vdpa_reset_status Remove memory listener unregister in vhost_vdpa_reset_status as we move the memory listener registration of vdpa from the start stage to the realize stage before. 
--- hw/virtio/vhost-vdpa.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index 69cf3b76e9..dcf1ef2c15 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -1292,8 +1292,6 @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started) static void vhost_vdpa_reset_status(struct vhost_dev *dev) { - struct vhost_vdpa *v = dev->opaque; - if (dev->vq_index + dev->nvqs != dev->vq_index_end) { return; } @@ -1301,7 +1299,6 @@ static void vhost_vdpa_reset_status(struct vhost_dev *dev) vhost_vdpa_reset_device(dev); vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE | VIRTIO_CONFIG_S_DRIVER); - memory_listener_unregister(&v->listener); } static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base, -- Gitee From 9e0b6c4df61aced66c5b3ee9ca93c6ac33868dc0 Mon Sep 17 00:00:00 2001 From: gubin Date: Thu, 28 Nov 2024 14:06:44 +0800 Subject: [PATCH 551/939] target/arm: Don't assert for 128-bit tile accesses when SVL is 128 cherry-pick from 56f1c0db928aae0b83fd91c89ddb226b137e2b21 For an instruction which accesses a 128-bit element tile when the SVL is also 128 (for example MOV z0.Q, p0/M, ZA0H.Q[w0,0]), we will assert in get_tile_rowcol(): qemu-system-aarch64: ../../tcg/tcg-op.c:926: tcg_gen_deposit_z_i32: Assertion `len > 0' failed. This happens because we calculate len = ctz32(streaming_vec_reg_size(s)) - esz;$ but if the SVL and the element size are the same len is 0, and the deposit operation asserts. In this case the ZA storage contains exactly one 128 bit element ZA tile, and the horizontal or vertical slice is just that tile. This means that regardless of the index value in the Ws register, we always access that tile. (In pseudocode terms, we calculate (index + offset) MOD 1, which is 0.) Special case the len == 0 case to avoid hitting the assertion in tcg_gen_deposit_z_i32(). Cc: qemu-stable@nongnu.org Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20240722172957.1041231-2-peter.maydell@linaro.org Signed-off-by: gubin --- target/arm/tcg/translate-sme.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/target/arm/tcg/translate-sme.c b/target/arm/tcg/translate-sme.c index 8f0dfc884e..1e89516736 100644 --- a/target/arm/tcg/translate-sme.c +++ b/target/arm/tcg/translate-sme.c @@ -49,7 +49,15 @@ static TCGv_ptr get_tile_rowcol(DisasContext *s, int esz, int rs, /* Prepare a power-of-two modulo via extraction of @len bits. */ len = ctz32(streaming_vec_reg_size(s)) - esz; - if (vertical) { + if (!len) { + /* + * SVL is 128 and the element size is 128. There is exactly + * one 128x128 tile in the ZA storage, and so we calculate + * (Rs + imm) MOD 1, which is always 0. We need to special case + * this because TCG doesn't allow deposit ops with len 0. 
+ */ + tcg_gen_movi_i32(tmp, 0); + } else if (vertical) { /* * Compute the byte offset of the index within the tile: * (index % (svl / size)) * size -- Gitee From 42a30e10bada5f034b0b2bfe8760482c972a4e61 Mon Sep 17 00:00:00 2001 From: gubin Date: Thu, 28 Nov 2024 14:14:21 +0800 Subject: [PATCH 552/939] target/arm: Don't get MDCR_EL2 in pmu_counter_enabled() before checking ARM_FEATURE_PMU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cherry-pick from ac1d88e9e7ca0bed83e91e07ce6d0597f10cc77d It doesn't make sense to read the value of MDCR_EL2 on a non-A-profile CPU, and in fact if you try to do it we will assert: (assertion=0x5555565a8c70 "!arm_feature(env, ARM_FEATURE_M)", file=0x5555565a6e5c "../../target/arm/helper.c", line=12600, function=0x5555565a9560 <__PRETTY_FUNCTION__.0> "arm_security_space_below_el3") at ./assert/assert.c:101 We might call pmu_counter_enabled() on an M-profile CPU (for example from the migration pre/post hooks in machine.c); this should always return false because these CPUs don't set ARM_FEATURE_PMU. Avoid the assertion by not calling arm_mdcr_el2_eff() before we have done the early return for "PMU not present". This fixes an assertion failure if you try to do a loadvm or savevm for an M-profile board. Cc: qemu-stable@nongnu.org Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2155 Signed-off-by: Peter Maydell Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Richard Henderson Message-id: 20240208153346.970021-1-peter.maydell@linaro.org Signed-off-by: gubin --- target/arm/helper.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/target/arm/helper.c b/target/arm/helper.c index 793aa89cc6..762eb086c5 100644 --- a/target/arm/helper.c +++ b/target/arm/helper.c @@ -1182,13 +1182,21 @@ static bool pmu_counter_enabled(CPUARMState *env, uint8_t counter) bool enabled, prohibited = false, filtered; bool secure = arm_is_secure(env); int el = arm_current_el(env); - uint64_t mdcr_el2 = arm_mdcr_el2_eff(env); - uint8_t hpmn = mdcr_el2 & MDCR_HPMN; + uint64_t mdcr_el2; + uint8_t hpmn; + /* + * We might be called for M-profile cores where MDCR_EL2 doesn't + * exist and arm_mdcr_el2_eff() will assert, so this early-exit check + * must be before we read that value. + */ if (!arm_feature(env, ARM_FEATURE_PMU)) { return false; } + mdcr_el2 = arm_mdcr_el2_eff(env); + hpmn = mdcr_el2 & MDCR_HPMN; + if (!arm_feature(env, ARM_FEATURE_EL2) || (counter < hpmn || counter == 31)) { e = env->cp15.c9_pmcr & PMCRE; -- Gitee From fe9725eed4d9be8e14d2c3865f1d7d5f24cbdd73 Mon Sep 17 00:00:00 2001 From: gubin Date: Thu, 28 Nov 2024 14:21:15 +0800 Subject: [PATCH 553/939] target/arm: Fix A64 scalar SQSHRN and SQRSHRN cherry-pick from 6fffc8378562c7fea6290c430b4f653f830a4c1a In commit 1b7bc9b5c8bf374dd we changed handle_vec_simd_sqshrn() so that instead of starting with a 0 value and depositing in each new element from the narrowing operation, it instead started with the raw result of the narrowing operation of the first element. This is fine in the vector case, because the deposit operations for the second and subsequent elements will always overwrite any higher bits that might have been in the first element's result value in tcg_rd. However in the scalar case we only go through this loop once. 
The effect is that for a signed narrowing operation, if the result is negative then we will now return a value where the bits above the first element are incorrectly 1 (because the narrowfn returns a sign-extended result, not one that is truncated to the element size). Fix this by using an extract operation to get exactly the correct bits of the output of the narrowfn for element 1, instead of a plain move. Cc: qemu-stable@nongnu.org Fixes: 1b7bc9b5c8bf374dd3 ("target/arm: Avoid tcg_const_ptr in handle_vec_simd_sqshrn") Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2089 Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20240123153416.877308-1-peter.maydell@linaro.org Signed-off-by: gubin --- target/arm/tcg/translate-a64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c index 5560a53630..a05182b57f 100644 --- a/target/arm/tcg/translate-a64.c +++ b/target/arm/tcg/translate-a64.c @@ -8221,7 +8221,7 @@ static void handle_vec_simd_sqshrn(DisasContext *s, bool is_scalar, bool is_q, narrowfn(tcg_rd_narrowed, tcg_env, tcg_rd); tcg_gen_extu_i32_i64(tcg_rd, tcg_rd_narrowed); if (i == 0) { - tcg_gen_mov_i64(tcg_final, tcg_rd); + tcg_gen_extract_i64(tcg_final, tcg_rd, 0, esize); } else { tcg_gen_deposit_i64(tcg_final, tcg_final, tcg_rd, esize * i, esize); } -- Gitee From 3031ddd4dd45a706def011a9d6afdacd2557d147 Mon Sep 17 00:00:00 2001 From: gubin Date: Thu, 28 Nov 2024 14:26:43 +0800 Subject: [PATCH 554/939] target/arm: fix exception syndrome for AArch32 bkpt insn MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cherry-pick from f670be1aad33e801779af580398895b9455747ee Debug exceptions that target AArch32 Hyp mode are reported differently than on AAarch64. Internally, Qemu uses the AArch64 syndromes. Therefore such exceptions need to be either converted to a prefetch abort (breakpoints, vector catch) or a data abort (watchpoints). 
Cc: qemu-stable@nongnu.org Signed-off-by: Jan Klötzke Reviewed-by: Richard Henderson Message-id: 20240127202758.3326381-1-jan.kloetzke@kernkonzept.com Signed-off-by: Peter Maydell Signed-off-by: gubin --- target/arm/helper.c | 18 ++++++++++++++++++ target/arm/syndrome.h | 8 ++++++++ 2 files changed, 26 insertions(+) diff --git a/target/arm/helper.c b/target/arm/helper.c index 793aa89cc6..35b8eaf15a 100644 --- a/target/arm/helper.c +++ b/target/arm/helper.c @@ -10848,6 +10848,24 @@ static void arm_cpu_do_interrupt_aarch32(CPUState *cs) } if (env->exception.target_el == 2) { + /* Debug exceptions are reported differently on AArch32 */ + switch (syn_get_ec(env->exception.syndrome)) { + case EC_BREAKPOINT: + case EC_BREAKPOINT_SAME_EL: + case EC_AA32_BKPT: + case EC_VECTORCATCH: + env->exception.syndrome = syn_insn_abort(arm_current_el(env) == 2, + 0, 0, 0x22); + break; + case EC_WATCHPOINT: + env->exception.syndrome = syn_set_ec(env->exception.syndrome, + EC_DATAABORT); + break; + case EC_WATCHPOINT_SAME_EL: + env->exception.syndrome = syn_set_ec(env->exception.syndrome, + EC_DATAABORT_SAME_EL); + break; + } arm_cpu_do_interrupt_aarch32_hyp(cs); return; } diff --git a/target/arm/syndrome.h b/target/arm/syndrome.h index 95454b5b3b..eccb759da6 100644 --- a/target/arm/syndrome.h +++ b/target/arm/syndrome.h @@ -25,6 +25,8 @@ #ifndef TARGET_ARM_SYNDROME_H #define TARGET_ARM_SYNDROME_H +#include "qemu/bitops.h" + /* Valid Syndrome Register EC field values */ enum arm_exception_class { EC_UNCATEGORIZED = 0x00, @@ -80,6 +82,7 @@ typedef enum { SME_ET_InactiveZA, } SMEExceptionType; +#define ARM_EL_EC_LENGTH 6 #define ARM_EL_EC_SHIFT 26 #define ARM_EL_IL_SHIFT 25 #define ARM_EL_ISV_SHIFT 24 @@ -91,6 +94,11 @@ static inline uint32_t syn_get_ec(uint32_t syn) return syn >> ARM_EL_EC_SHIFT; } +static inline uint32_t syn_set_ec(uint32_t syn, uint32_t ec) +{ + return deposit32(syn, ARM_EL_EC_SHIFT, ARM_EL_EC_LENGTH, ec); +} + /* * Utility functions for constructing various kinds of syndrome value. * Note that in general we follow the AArch64 syndrome values; in a -- Gitee From 1ad09007da426e9cd1585babcdd4de25ddfb2f8b Mon Sep 17 00:00:00 2001 From: gubin Date: Thu, 28 Nov 2024 14:39:05 +0800 Subject: [PATCH 555/939] target/arm: Fix incorrect aa64_tidcp1 feature check cherry-pick from ee0a2e3c9d2991a11c13ffadb15e4d0add43c257 A typo in the implementation of isar_feature_aa64_tidcp1() means we were checking the field in the wrong ID register, so we might have provided the feature on CPUs that don't have it and not provided it on CPUs that should have it. Correct this bug. 
Cc: qemu-stable@nongnu.org Fixes: 9cd0c0dec97be9 "target/arm: Implement FEAT_TIDCP1" Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2120 Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20240123160333.958841-1-peter.maydell@linaro.org Signed-off-by: gubin --- target/arm/cpu-features.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target/arm/cpu-features.h b/target/arm/cpu-features.h index 954d358268..165a497f7b 100644 --- a/target/arm/cpu-features.h +++ b/target/arm/cpu-features.h @@ -771,7 +771,7 @@ static inline bool isar_feature_aa64_hcx(const ARMISARegisters *id) static inline bool isar_feature_aa64_tidcp1(const ARMISARegisters *id) { - return FIELD_EX64(id->id_aa64mmfr2, ID_AA64MMFR1, TIDCP1) != 0; + return FIELD_EX64(id->id_aa64mmfr1, ID_AA64MMFR1, TIDCP1) != 0; } static inline bool isar_feature_aa64_hafs(const ARMISARegisters *id) -- Gitee From 17d589becc1a66934e55a4e2efffdd3876d56130 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= Date: Wed, 30 Oct 2024 10:09:30 +0000 Subject: [PATCH 556/939] crypto: perform runtime check for hash/hmac support in gcrypt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gcrypto has the ability to dynamically disable hash/hmac algorithms at runtime, so QEMU must perform a runtime check. Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Daniel P. Berrangé Signed-off-by: cheliequan --- crypto/hash-gcrypt.c | 2 +- crypto/hmac-gcrypt.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/crypto/hash-gcrypt.c b/crypto/hash-gcrypt.c index d3bdfe5633..2b6dbd97bb 100644 --- a/crypto/hash-gcrypt.c +++ b/crypto/hash-gcrypt.c @@ -42,7 +42,7 @@ gboolean qcrypto_hash_supports(QCryptoHashAlgorithm alg) { if (alg < G_N_ELEMENTS(qcrypto_hash_alg_map) && qcrypto_hash_alg_map[alg] != GCRY_MD_NONE) { - return true; + return gcry_md_test_algo(qcrypto_hash_alg_map[alg]) == 0; } return false; } diff --git a/crypto/hmac-gcrypt.c b/crypto/hmac-gcrypt.c index 888afb86ed..15926fccfa 100644 --- a/crypto/hmac-gcrypt.c +++ b/crypto/hmac-gcrypt.c @@ -40,7 +40,7 @@ bool qcrypto_hmac_supports(QCryptoHashAlgorithm alg) { if (alg < G_N_ELEMENTS(qcrypto_hmac_alg_map) && qcrypto_hmac_alg_map[alg] != GCRY_MAC_NONE) { - return true; + return gcry_mac_test_algo(qcrypto_hmac_alg_map[alg]) == 0; } return false; -- Gitee From ecca2052693cc2a91459ac418bface2f1e635c88 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 14 Nov 2024 13:53:18 +0100 Subject: [PATCH 557/939] hw/audio/hda: fix memory leak on audio setup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When SET_STREAM_FORMAT is called, the st->buft timer is overwritten, thus causing a memory leak. This was originally fixed in commit 816139ae6a5 ("hw/audio/hda: fix memory leak on audio setup", 2024-11-14) but that caused the audio to break in SPICE. Fortunately, a simpler fix is possible. The timer only needs to be reset, because the callback is always the same (st->output is set at realize time in hda_audio_init); call to timer_new_ns overkill. Replace it with timer_del and only initialize the timer once; for simplicity, do it even if use_timer is false. An even simpler fix would be to free the old time in hda_audio_setup(). However, it seems better to place the initialization of the timer close to that of st->ouput. 
Cc: qemu-stable@nongnu.org Signed-off-by: Paolo Bonzini Reviewed-by: Michael Tokarev Message-ID: <20241114125318.1707590-3-pbonzini@redhat.com> Signed-off-by: Philippe Mathieu-Daudé (cherry picked from commit 626b39006d2f9b1378a04cb88a2187bb852cb055) Signed-off-by: zhujun2 --- hw/audio/hda-codec.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/hw/audio/hda-codec.c b/hw/audio/hda-codec.c index 19f401cabe..ac908e56c6 100644 --- a/hw/audio/hda-codec.c +++ b/hw/audio/hda-codec.c @@ -487,8 +487,7 @@ static void hda_audio_setup(HDAAudioStream *st) if (st->output) { if (use_timer) { cb = hda_audio_output_cb; - st->buft = timer_new_ns(QEMU_CLOCK_VIRTUAL, - hda_audio_output_timer, st); + timer_del(st->buft); } else { cb = hda_audio_compat_output_cb; } @@ -497,8 +496,7 @@ static void hda_audio_setup(HDAAudioStream *st) } else { if (use_timer) { cb = hda_audio_input_cb; - st->buft = timer_new_ns(QEMU_CLOCK_VIRTUAL, - hda_audio_input_timer, st); + timer_del(st->buft); } else { cb = hda_audio_compat_input_cb; } @@ -726,8 +724,12 @@ static void hda_audio_init(HDACodecDevice *hda, st->gain_right = QEMU_HDA_AMP_STEPS; st->compat_bpos = sizeof(st->compat_buf); st->output = true; + st->buft = timer_new_ns(QEMU_CLOCK_VIRTUAL, + hda_audio_output_timer, st); } else { st->output = false; + st->buft = timer_new_ns(QEMU_CLOCK_VIRTUAL, + hda_audio_input_timer, st); } st->format = AC_FMT_TYPE_PCM | AC_FMT_BITS_16 | (1 << AC_FMT_CHAN_SHIFT); @@ -750,9 +752,7 @@ static void hda_audio_exit(HDACodecDevice *hda) if (st->node == NULL) { continue; } - if (a->use_timer) { - timer_free(st->buft); - } + timer_free(st->buft); if (st->output) { AUD_close_out(&a->card, st->voice.out); } else { -- Gitee From 84321dcfb4ec3d08984e7680c8efad80907bde84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Mon, 29 Jul 2024 15:44:13 +0100 Subject: [PATCH 558/939] contrib/plugins: add compat for g_memdup2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We were premature if bumping this because some of our builds are still on older glibs. Just copy the compat handler for now and we can remove it later. Fixes: ee293103b0 (plugins: update lockstep to use g_memdup2) Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2161 Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Alex Bennée Message-Id: <20240729144414.830369-14-alex.bennee@linaro.org> (cherry picked from commit 44e794896759236885f6d30d1f6b9b8b76355d52) Signed-off-by: zhujun2 --- contrib/plugins/lockstep.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/contrib/plugins/lockstep.c b/contrib/plugins/lockstep.c index 237543b43a..0c6f060183 100644 --- a/contrib/plugins/lockstep.c +++ b/contrib/plugins/lockstep.c @@ -100,6 +100,31 @@ static void plugin_exit(qemu_plugin_id_t id, void *p) plugin_cleanup(id); } +/* + * g_memdup has been deprecated in Glib since 2.68 and + * will complain about it if you try to use it. However until + * glib_req_ver for QEMU is bumped we make a copy of the glib-compat + * handler. 
+ */ +static inline gpointer g_memdup2_qemu(gconstpointer mem, gsize byte_size) +{ +#if GLIB_CHECK_VERSION(2, 68, 0) + return g_memdup2(mem, byte_size); +#else + gpointer new_mem; + + if (mem && byte_size != 0) { + new_mem = g_malloc(byte_size); + memcpy(new_mem, mem, byte_size); + } else { + new_mem = NULL; + } + + return new_mem; +#endif +} +#define g_memdup2(m, s) g_memdup2_qemu(m, s) + static void report_divergance(ExecState *us, ExecState *them) { DivergeState divrec = { log, 0 }; -- Gitee From ddb2cb652db80b24ba5ddf0b00dd3ba3f9224eba Mon Sep 17 00:00:00 2001 From: Pierrick Bouvier Date: Fri, 25 Oct 2024 10:58:56 -0700 Subject: [PATCH 559/939] target/i386: fix hang when using slow path for ptw_setl When instrumenting memory accesses for plugin, we force memory accesses to use the slow path for mmu [1]. This create a situation where we end up calling ptw_setl_slow. This was fixed recently in [2] but the issue still could appear out of plugins use case. Since this function gets called during a cpu_exec, start_exclusive then hangs. This exclusive section was introduced initially for security reasons [3]. I suspect this code path was never triggered, because ptw_setl_slow would always be called transitively from cpu_exec, resulting in a hang. [1] https://gitlab.com/qemu-project/qemu/-/commit/6d03226b42247b68ab2f0b3663e0f624335a4055 [2] https://gitlab.com/qemu-project/qemu/-/commit/115ade42d50144c15b74368d32dc734ea277d853 [2] https://gitlab.com/qemu-project/qemu/-/commit/9a96406787afcc9960fbe8791892c78311d6971f in 8.2.x series [3] https://gitlab.com/qemu-project/qemu/-/issues/279 Fixes: https://gitlab.com/qemu-project/qemu/-/issues/2566 Signed-off-by: Pierrick Bouvier Reviewed-by: Richard Henderson Message-ID: <20241025175857.2554252-2-pierrick.bouvier@linaro.org> Signed-off-by: Richard Henderson (cherry picked from commit 7ba055b49b74c4d2f4a338c5198485bdff373fb1) Signed-off-by: zhujun2 --- target/i386/tcg/sysemu/excp_helper.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/target/i386/tcg/sysemu/excp_helper.c b/target/i386/tcg/sysemu/excp_helper.c index 5b86f439ad..294dbc50e2 100644 --- a/target/i386/tcg/sysemu/excp_helper.c +++ b/target/i386/tcg/sysemu/excp_helper.c @@ -107,6 +107,10 @@ static bool ptw_setl_slow(const PTETranslate *in, uint32_t old, uint32_t new) { uint32_t cmp; + CPUState *cpu = env_cpu(in->env); + /* We are in cpu_exec, and start_exclusive can't be called directly.*/ + g_assert(cpu->running); + cpu_exec_end(cpu); /* Does x86 really perform a rmw cycle on mmio for ptw? */ start_exclusive(); cmp = cpu_ldl_mmuidx_ra(in->env, in->gaddr, in->ptw_idx, 0); @@ -114,6 +118,7 @@ static bool ptw_setl_slow(const PTETranslate *in, uint32_t old, uint32_t new) cpu_stl_mmuidx_ra(in->env, in->gaddr, new, in->ptw_idx, 0); } end_exclusive(); + cpu_exec_start(cpu); return cmp == old; } -- Gitee From 1475170931ea2979a150fe4c1d3fc6b649eb3a6e Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Tue, 12 Nov 2024 06:12:32 -0800 Subject: [PATCH 560/939] target/arm: Drop user-only special case in sve_stN_r This path is reachable with plugins enabled, and provoked with run-plugin-catch-syscalls-with-libinline.so. 
Cc: qemu-stable@nongnu.org Reviewed-by: Peter Maydell Signed-off-by: Richard Henderson Message-ID: <20241112141232.321354-1-richard.henderson@linaro.org> (cherry picked from commit f27550804688da43c6e0d87b2f9e143adbf76271) Signed-off-by: zhujun2 --- target/arm/tcg/sve_helper.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/target/arm/tcg/sve_helper.c b/target/arm/tcg/sve_helper.c index f006d152cc..ce8134320b 100644 --- a/target/arm/tcg/sve_helper.c +++ b/target/arm/tcg/sve_helper.c @@ -6306,9 +6306,6 @@ void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr, flags = info.page[0].flags | info.page[1].flags; if (unlikely(flags != 0)) { -#ifdef CONFIG_USER_ONLY - g_assert_not_reached(); -#else /* * At least one page includes MMIO. * Any bus operation can fail with cpu_transaction_failed, @@ -6339,7 +6336,6 @@ void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr, } while (reg_off & 63); } while (reg_off <= reg_last); return; -#endif } mem_off = info.mem_off_first[0]; -- Gitee From 72aa575da11b3a897eeaae926802c50dc8ff7a84 Mon Sep 17 00:00:00 2001 From: Hanna Czenczek Date: Tue, 15 Oct 2024 19:04:37 +0200 Subject: [PATCH 561/939] migration: Ensure vmstate_save() sets errp migration/savevm.c contains some calls to vmstate_save() that are followed by migrate_set_error() if the integer return value indicates an error. migrate_set_error() requires that the `Error *` object passed to it is set. Therefore, vmstate_save() is assumed to always set *errp on error. Right now, that assumption is not met: vmstate_save_state_v() (called internally by vmstate_save()) will not set *errp if vmstate_subsection_save() or vmsd->post_save() fail. Fix that by adding an *errp parameter to vmstate_subsection_save(), and by generating a generic error in case post_save() fails (as is already done for pre_save()). Without this patch, qemu will crash after vmstate_subsection_save() or post_save() have failed inside of a vmstate_save() call (unless migrate_set_error() then happen to discard the new error because s->error is already set). This happens e.g. when receiving the state from a virtio-fs back-end (virtiofsd) fails. 
Signed-off-by: Hanna Czenczek Link: https://lore.kernel.org/r/20241015170437.310358-1-hreitz@redhat.com Signed-off-by: Peter Xu (cherry picked from commit 37dfcba1a04989830c706f9cbc00450e5d3a7447) Signed-off-by: zhujun2 --- migration/vmstate.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/migration/vmstate.c b/migration/vmstate.c index b7723a4187..bd08e390c5 100644 --- a/migration/vmstate.c +++ b/migration/vmstate.c @@ -22,7 +22,8 @@ #include "trace.h" static int vmstate_subsection_save(QEMUFile *f, const VMStateDescription *vmsd, - void *opaque, JSONWriter *vmdesc); + void *opaque, JSONWriter *vmdesc, + Error **errp); static int vmstate_subsection_load(QEMUFile *f, const VMStateDescription *vmsd, void *opaque); @@ -440,12 +441,13 @@ int vmstate_save_state_v(QEMUFile *f, const VMStateDescription *vmsd, json_writer_end_array(vmdesc); } - ret = vmstate_subsection_save(f, vmsd, opaque, vmdesc); + ret = vmstate_subsection_save(f, vmsd, opaque, vmdesc, errp); if (vmsd->post_save) { int ps_ret = vmsd->post_save(opaque); - if (!ret) { + if (!ret && ps_ret) { ret = ps_ret; + error_setg(errp, "post-save failed: %s", vmsd->name); } } return ret; @@ -515,7 +517,8 @@ static int vmstate_subsection_load(QEMUFile *f, const VMStateDescription *vmsd, } static int vmstate_subsection_save(QEMUFile *f, const VMStateDescription *vmsd, - void *opaque, JSONWriter *vmdesc) + void *opaque, JSONWriter *vmdesc, + Error **errp) { const VMStateDescription **sub = vmsd->subsections; bool vmdesc_has_subsections = false; @@ -543,7 +546,7 @@ static int vmstate_subsection_save(QEMUFile *f, const VMStateDescription *vmsd, qemu_put_byte(f, len); qemu_put_buffer(f, (uint8_t *)vmsdsub->name, len); qemu_put_be32(f, vmsdsub->version_id); - ret = vmstate_save_state(f, vmsdsub, opaque, vmdesc); + ret = vmstate_save_state_with_err(f, vmsdsub, opaque, vmdesc, errp); if (ret) { return ret; } -- Gitee From b44fc9f3fc91363c55f6ba739f6c09222f979d88 Mon Sep 17 00:00:00 2001 From: Sergey Makarov Date: Wed, 18 Sep 2024 17:02:29 +0300 Subject: [PATCH 562/939] hw/intc: Don't clear pending bits on IRQ lowering According to PLIC specification (chapter 5), there is only one case, when interrupt is claimed. Fix PLIC controller to match this behavior. Signed-off-by: Sergey Makarov Reviewed-by: Alistair Francis Message-ID: <20240918140229.124329-3-s.makarov@syntacore.com> Signed-off-by: Alistair Francis (cherry picked from commit a84be2baa9eca8bc500f866ad943b8f63dc99adf) Signed-off-by: zhujun2 --- hw/intc/sifive_plic.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/hw/intc/sifive_plic.c b/hw/intc/sifive_plic.c index 5522ede2cf..e5de52bc44 100644 --- a/hw/intc/sifive_plic.c +++ b/hw/intc/sifive_plic.c @@ -349,8 +349,10 @@ static void sifive_plic_irq_request(void *opaque, int irq, int level) { SiFivePLICState *s = opaque; - sifive_plic_set_pending(s, irq, level > 0); - sifive_plic_update(s); + if (level > 0) { + sifive_plic_set_pending(s, irq, true); + sifive_plic_update(s); + } } static void sifive_plic_realize(DeviceState *dev, Error **errp) -- Gitee From e698238a5fa6e78fdffc8269d59884df69da3434 Mon Sep 17 00:00:00 2001 From: chenzheng Date: Thu, 5 Dec 2024 11:06:57 +0000 Subject: [PATCH 563/939] Reserve address for MSI mapping in the CVM scenario. 
Signed-off-by: yangxiangkai@huawei.com --- hw/arm/virt.c | 3 ++- include/hw/arm/virt.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/hw/arm/virt.c b/hw/arm/virt.c index a9efcec85e..8823f2ed1c 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -162,8 +162,9 @@ static const MemMapEntry base_memmap[] = { [VIRT_PVTIME] = { 0x090a0000, 0x00010000 }, [VIRT_SECURE_GPIO] = { 0x090b0000, 0x00001000 }, [VIRT_CPUHP_ACPI] = { 0x090c0000, ACPI_CPU_HOTPLUG_REG_LEN}, - /* In the virtCCA scenario, this space is used for MSI interrupt mapping */ [VIRT_MMIO] = { 0x0a000000, 0x00000200 }, + /* In the virtCCA scenario, this space is used for MSI interrupt mapping */ + [VIRT_CVM_MSI] = { 0x0a001000, 0x00fff000 }, [VIRT_CPUFREQ] = { 0x0b000000, 0x00010000 }, /* ...repeating for a total of NUM_VIRTIO_TRANSPORTS, each of that size */ [VIRT_PLATFORM_BUS] = { 0x0c000000, 0x02000000 }, diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h index 4b7dc61c24..345b2d5594 100644 --- a/include/hw/arm/virt.h +++ b/include/hw/arm/virt.h @@ -121,6 +121,7 @@ enum { VIRT_UART, VIRT_CPUFREQ, VIRT_MMIO, + VIRT_CVM_MSI, VIRT_RTC, VIRT_FW_CFG, VIRT_PCIE, -- Gitee From 4ce59de673b1b190cde76c458ac9e92a6413172d Mon Sep 17 00:00:00 2001 From: jiangxin Date: Wed, 25 Aug 2021 11:07:41 +0800 Subject: [PATCH 564/939] target/i386: csv: Add command to initialize CSV3 context When CSV3 is enabled, KVM_CSV3_INIT command is used to initialize the platform, which is implemented by reusing the SEV API framework and extending the functionality. The KVM_CSV3_INIT command should be performed earlier than any other command. Signed-off-by: Xin Jiang Signed-off-by: hanliyang --- linux-headers/linux/kvm.h | 11 +++++++++ target/i386/csv-sysemu-stub.c | 5 ++++ target/i386/csv.c | 45 +++++++++++++++++++++++++++++++++++ target/i386/csv.h | 4 ++++ target/i386/sev.c | 17 +++++++++++++ target/i386/sev.h | 7 ++++++ 6 files changed, 89 insertions(+) diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h index 8dc00808ec..90869068c8 100644 --- a/linux-headers/linux/kvm.h +++ b/linux-headers/linux/kvm.h @@ -2108,6 +2108,17 @@ struct kvm_csv_init { __u32 len; }; +/* CSV3 command */ +enum csv3_cmd_id { + KVM_CSV3_NR_MIN = 0xc0, + + KVM_CSV3_INIT = KVM_CSV3_NR_MIN, +}; + +struct kvm_csv3_init_data { + __u64 nodemask; +}; + #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) #define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) #define KVM_DEV_ASSIGN_MASK_INTX (1 << 2) diff --git a/target/i386/csv-sysemu-stub.c b/target/i386/csv-sysemu-stub.c index 5874e4cc1d..72f0f5c772 100644 --- a/target/i386/csv-sysemu-stub.c +++ b/target/i386/csv-sysemu-stub.c @@ -14,3 +14,8 @@ #include "qemu/osdep.h" #include "sev.h" #include "csv.h" + +int csv3_init(uint32_t policy, int fd, void *state, struct sev_ops *ops) +{ + return 0; +} diff --git a/target/i386/csv.c b/target/i386/csv.c index 9a1de04db7..fd3ea291ca 100644 --- a/target/i386/csv.c +++ b/target/i386/csv.c @@ -12,6 +12,13 @@ */ #include "qemu/osdep.h" +#include "qemu/error-report.h" + +#include + +#ifdef CONFIG_NUMA +#include +#endif #include "cpu.h" #include "sev.h" @@ -21,6 +28,44 @@ bool csv_kvm_cpu_reset_inhibit; Csv3GuestState csv3_guest = { 0 }; +int +csv3_init(uint32_t policy, int fd, void *state, struct sev_ops *ops) +{ + int fw_error; + int ret; + struct kvm_csv3_init_data data = { 0 }; + +#ifdef CONFIG_NUMA + int mode; + unsigned long nodemask; + + /* Set flags as 0 to retrieve the default NUMA policy. 
*/ + ret = get_mempolicy(&mode, &nodemask, sizeof(nodemask) * 8, NULL, 0); + if (ret == 0 && mode == MPOL_BIND) + data.nodemask = nodemask; +#endif + + if (!ops || !ops->sev_ioctl || !ops->fw_error_to_str) + return -1; + + csv3_guest.policy = policy; + if (csv3_enabled()) { + ret = ops->sev_ioctl(fd, KVM_CSV3_INIT, &data, &fw_error); + if (ret) { + csv3_guest.policy = 0; + error_report("%s: Fail to initialize ret=%d fw_error=%d '%s'", + __func__, ret, fw_error, ops->fw_error_to_str(fw_error)); + return -1; + } + + csv3_guest.sev_fd = fd; + csv3_guest.state = state; + csv3_guest.sev_ioctl = ops->sev_ioctl; + csv3_guest.fw_error_to_str = ops->fw_error_to_str; + } + return 0; +} + bool csv3_enabled(void) { diff --git a/target/i386/csv.h b/target/i386/csv.h index ea87c1ba27..4096e8658b 100644 --- a/target/i386/csv.h +++ b/target/i386/csv.h @@ -15,6 +15,7 @@ #define I386_CSV_H #include "qapi/qapi-commands-misc-target.h" +#include "sev.h" #define GUEST_POLICY_CSV3_BIT (1 << 6) #define GUEST_POLICY_REUSE_ASID (1 << 7) @@ -77,10 +78,13 @@ struct Csv3GuestState { uint32_t policy; int sev_fd; void *state; + int (*sev_ioctl)(int fd, int cmd, void *data, int *error); + const char *(*fw_error_to_str)(int code); }; typedef struct Csv3GuestState Csv3GuestState; extern struct Csv3GuestState csv3_guest; +extern int csv3_init(uint32_t policy, int fd, void *state, struct sev_ops *ops); #endif diff --git a/target/i386/sev.c b/target/i386/sev.c index af61ca5ba8..1c453b3148 100644 --- a/target/i386/sev.c +++ b/target/i386/sev.c @@ -1225,6 +1225,18 @@ int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp) goto err; } + /* Support CSV3 */ + if (!ret && cmd == KVM_SEV_ES_INIT) { + ret = csv3_init(sev_guest->policy, sev->sev_fd, (void *)&sev->state, &sev_ops); + if (ret) { + error_setg(errp, "%s: failed to init csv3 context", __func__); + goto err; + } + /* The CSV3 guest is not resettable */ + if (csv3_enabled()) + csv_kvm_cpu_reset_inhibit = true; + } + /* * The LAUNCH context is used for new guest, if its an incoming guest * then RECEIVE context will be created after the connection is established. @@ -2635,6 +2647,11 @@ bool sev_add_kernel_loader_hashes(SevKernelLoaderContext *ctx, Error **errp) return ret; } +struct sev_ops sev_ops = { + .sev_ioctl = sev_ioctl, + .fw_error_to_str = fw_error_to_str, +}; + static void sev_register_types(void) { diff --git a/target/i386/sev.h b/target/i386/sev.h index 0bfe3879ef..e91431e0f7 100644 --- a/target/i386/sev.h +++ b/target/i386/sev.h @@ -80,4 +80,11 @@ int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp); extern bool sev_kvm_has_msr_ghcb; +struct sev_ops { + int (*sev_ioctl)(int fd, int cmd, void *data, int *error); + const char *(*fw_error_to_str)(int code); +}; + +extern struct sev_ops sev_ops; + #endif -- Gitee From 53cba8da8fb18cc9a463ec1f57990e8558cd4008 Mon Sep 17 00:00:00 2001 From: jiangxin Date: Wed, 25 Aug 2021 09:59:16 +0800 Subject: [PATCH 565/939] target/i386: csv: Add command to load data to CSV3 guest memory The KVM_CSV3_LAUNCH_ENCRYPT_DATA command is used to load data to an encrypted guest memory in an isolated memory region that guest owns. 
Signed-off-by: Xin Jiang Signed-off-by: hanliyang --- linux-headers/linux/kvm.h | 7 ++++ target/i386/csv-sysemu-stub.c | 5 +++ target/i386/csv.c | 69 +++++++++++++++++++++++++++++++++++ target/i386/csv.h | 2 + target/i386/trace-events | 3 ++ 5 files changed, 86 insertions(+) diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h index 90869068c8..dd6d9c2e07 100644 --- a/linux-headers/linux/kvm.h +++ b/linux-headers/linux/kvm.h @@ -2113,6 +2113,13 @@ enum csv3_cmd_id { KVM_CSV3_NR_MIN = 0xc0, KVM_CSV3_INIT = KVM_CSV3_NR_MIN, + KVM_CSV3_LAUNCH_ENCRYPT_DATA, +}; + +struct kvm_csv3_launch_encrypt_data { + __u64 gpa; + __u64 uaddr; + __u32 len; }; struct kvm_csv3_init_data { diff --git a/target/i386/csv-sysemu-stub.c b/target/i386/csv-sysemu-stub.c index 72f0f5c772..b0ccbd2f18 100644 --- a/target/i386/csv-sysemu-stub.c +++ b/target/i386/csv-sysemu-stub.c @@ -19,3 +19,8 @@ int csv3_init(uint32_t policy, int fd, void *state, struct sev_ops *ops) { return 0; } + +int csv3_load_data(uint64_t gpa, uint8_t *ptr, uint64_t len, Error **errp) +{ + g_assert_not_reached(); +} diff --git a/target/i386/csv.c b/target/i386/csv.c index fd3ea291ca..2a596681b8 100644 --- a/target/i386/csv.c +++ b/target/i386/csv.c @@ -13,6 +13,7 @@ #include "qemu/osdep.h" #include "qemu/error-report.h" +#include "qapi/error.h" #include @@ -20,6 +21,7 @@ #include #endif +#include "trace.h" #include "cpu.h" #include "sev.h" #include "csv.h" @@ -74,3 +76,70 @@ csv3_enabled(void) return sev_es_enabled() && (csv3_guest.policy & GUEST_POLICY_CSV3_BIT); } + +static bool +csv3_check_state(SevState state) +{ + return *((SevState *)csv3_guest.state) == state; +} + +static int +csv3_ioctl(int cmd, void *data, int *error) +{ + if (csv3_guest.sev_ioctl) + return csv3_guest.sev_ioctl(csv3_guest.sev_fd, cmd, data, error); + else + return -1; +} + +static const char * +fw_error_to_str(int code) +{ + if (csv3_guest.fw_error_to_str) + return csv3_guest.fw_error_to_str(code); + else + return NULL; +} + +static int +csv3_launch_encrypt_data(uint64_t gpa, uint8_t *addr, uint64_t len) +{ + int ret, fw_error; + struct kvm_csv3_launch_encrypt_data update; + + if (!addr || !len) { + return 1; + } + + update.gpa = (__u64)gpa; + update.uaddr = (__u64)(unsigned long)addr; + update.len = len; + trace_kvm_csv3_launch_encrypt_data(gpa, addr, len); + ret = csv3_ioctl(KVM_CSV3_LAUNCH_ENCRYPT_DATA, &update, &fw_error); + if (ret) { + error_report("%s: CSV3 LAUNCH_ENCRYPT_DATA ret=%d fw_error=%d '%s'", + __func__, ret, fw_error, fw_error_to_str(fw_error)); + } + + return ret; +} + +int +csv3_load_data(uint64_t gpa, uint8_t *ptr, uint64_t len, Error **errp) +{ + int ret = 0; + + if (!csv3_enabled()) { + error_setg(errp, "%s: CSV3 is not enabled", __func__); + return -1; + } + + /* if CSV3 is in update state then load the data to secure memory */ + if (csv3_check_state(SEV_STATE_LAUNCH_UPDATE)) { + ret = csv3_launch_encrypt_data(gpa, ptr, len); + if (ret) + error_setg(errp, "%s: CSV3 fail to encrypt data", __func__); + } + + return ret; +} diff --git a/target/i386/csv.h b/target/i386/csv.h index 4096e8658b..27b66f7857 100644 --- a/target/i386/csv.h +++ b/target/i386/csv.h @@ -87,4 +87,6 @@ typedef struct Csv3GuestState Csv3GuestState; extern struct Csv3GuestState csv3_guest; extern int csv3_init(uint32_t policy, int fd, void *state, struct sev_ops *ops); +int csv3_load_data(uint64_t gpa, uint8_t *ptr, uint64_t len, Error **errp); + #endif diff --git a/target/i386/trace-events b/target/i386/trace-events index 87b765c73c..34c205ffda 100644 --- 
a/target/i386/trace-events +++ b/target/i386/trace-events @@ -19,3 +19,6 @@ kvm_sev_receive_update_data(void *src, void *dst, int len, void *hdr, int hdr_le kvm_sev_receive_finish(void) "" kvm_sev_send_update_vmsa(uint32_t cpu_id, uint32_t cpu_index, void *dst, int len) "cpu_id %d cpu_index %d trans %p len %d" kvm_sev_receive_update_vmsa(uint32_t cpu_id, uint32_t cpu_index, void *src, int len, void *hdr, int hdr_len) "cpu_id %d cpu_index %d trans %p len %d hdr %p hdr_len %d" + +# csv.c +kvm_csv3_launch_encrypt_data(uint64_t gpa, void *addr, uint64_t len) "gpa 0x%" PRIx64 "addr %p len 0x%" PRIx64 -- Gitee From 368bf2c044fcdd21f10545de103af7cd2a5986f9 Mon Sep 17 00:00:00 2001 From: jiangxin Date: Wed, 25 Aug 2021 12:25:05 +0800 Subject: [PATCH 566/939] target/i386: csv: Add command to load vmcb to CSV3 guest memory The KVM_CSV3_LAUNCH_ENCRYPT_VMCB command is used to load and encrypt the initial VMCB data to secure memory in an isolated region that guest owns. Signed-off-by: Xin Jiang Signed-off-by: hanliyang --- linux-headers/linux/kvm.h | 1 + target/i386/csv-sysemu-stub.c | 5 +++++ target/i386/csv.c | 21 +++++++++++++++++++++ target/i386/csv.h | 1 + target/i386/sev.c | 8 ++++++-- 5 files changed, 34 insertions(+), 2 deletions(-) diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h index dd6d9c2e07..8487d0889b 100644 --- a/linux-headers/linux/kvm.h +++ b/linux-headers/linux/kvm.h @@ -2114,6 +2114,7 @@ enum csv3_cmd_id { KVM_CSV3_INIT = KVM_CSV3_NR_MIN, KVM_CSV3_LAUNCH_ENCRYPT_DATA, + KVM_CSV3_LAUNCH_ENCRYPT_VMCB, }; struct kvm_csv3_launch_encrypt_data { diff --git a/target/i386/csv-sysemu-stub.c b/target/i386/csv-sysemu-stub.c index b0ccbd2f18..23d885f0f3 100644 --- a/target/i386/csv-sysemu-stub.c +++ b/target/i386/csv-sysemu-stub.c @@ -24,3 +24,8 @@ int csv3_load_data(uint64_t gpa, uint8_t *ptr, uint64_t len, Error **errp) { g_assert_not_reached(); } + +int csv3_launch_encrypt_vmcb(void) +{ + g_assert_not_reached(); +} diff --git a/target/i386/csv.c b/target/i386/csv.c index 2a596681b8..12282ba451 100644 --- a/target/i386/csv.c +++ b/target/i386/csv.c @@ -143,3 +143,24 @@ csv3_load_data(uint64_t gpa, uint8_t *ptr, uint64_t len, Error **errp) return ret; } + +int +csv3_launch_encrypt_vmcb(void) +{ + int ret, fw_error; + + if (!csv3_enabled()) { + error_report("%s: CSV3 is not enabled", __func__); + return -1; + } + + ret = csv3_ioctl(KVM_CSV3_LAUNCH_ENCRYPT_VMCB, NULL, &fw_error); + if (ret) { + error_report("%s: CSV3 LAUNCH_ENCRYPT_VMCB ret=%d fw_error=%d '%s'", + __func__, ret, fw_error, fw_error_to_str(fw_error)); + goto err; + } + +err: + return ret; +} diff --git a/target/i386/csv.h b/target/i386/csv.h index 27b66f7857..3caf216743 100644 --- a/target/i386/csv.h +++ b/target/i386/csv.h @@ -86,6 +86,7 @@ typedef struct Csv3GuestState Csv3GuestState; extern struct Csv3GuestState csv3_guest; extern int csv3_init(uint32_t policy, int fd, void *state, struct sev_ops *ops); +extern int csv3_launch_encrypt_vmcb(void); int csv3_load_data(uint64_t gpa, uint8_t *ptr, uint64_t len, Error **errp); diff --git a/target/i386/sev.c b/target/i386/sev.c index 1c453b3148..6ff8891678 100644 --- a/target/i386/sev.c +++ b/target/i386/sev.c @@ -880,8 +880,12 @@ sev_launch_get_measure(Notifier *notifier, void *unused) } if (sev_es_enabled()) { - /* measure all the VM save areas before getting launch_measure */ - ret = sev_launch_update_vmsa(sev); + if (csv3_enabled()) { + ret = csv3_launch_encrypt_vmcb(); + } else { + /* measure all the VM save areas before getting launch_measure */ + ret = 
sev_launch_update_vmsa(sev); + } if (ret) { exit(1); } -- Gitee From 120d0b9e5c92de91c69fb9fbea038b51c820013d Mon Sep 17 00:00:00 2001 From: jiangxin Date: Tue, 24 Aug 2021 17:31:28 +0800 Subject: [PATCH 567/939] target/i386: cpu: Populate CPUID 0x8000_001F when CSV3 is active On Hygon platform, bit 30 of EAX indicates whether this feature is supported in hardware. When CSV3 is active, CPUID 0x8000_001F provides information for it. Signed-off-by: Xin Jiang Signed-off-by: hanliyang --- target/i386/cpu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index ca7e5337b0..36f7ad6460 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -29,6 +29,7 @@ #include "hvf/hvf-i386.h" #include "kvm/kvm_i386.h" #include "sev.h" +#include "csv.h" #include "qapi/error.h" #include "qemu/error-report.h" #include "qapi/qapi-visit-machine.h" @@ -6943,6 +6944,7 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count, if (sev_enabled()) { *eax = 0x2; *eax |= sev_es_enabled() ? 0x8 : 0; + *eax |= csv3_enabled() ? 0x40000000 : 0; /* bit 30 for CSV3 */ *ebx = sev_get_cbit_position() & 0x3f; /* EBX[5:0] */ *ebx |= (sev_get_reduced_phys_bits() & 0x3f) << 6; /* EBX[11:6] */ } -- Gitee From a3e8267b93d1e77dc547fff6fb9af6f8d48a674f Mon Sep 17 00:00:00 2001 From: jiangxin Date: Wed, 25 Aug 2021 12:36:00 +0800 Subject: [PATCH 568/939] target/i386: csv: Do not register/unregister guest secure memory for CSV3 guest CSV3's guest memory is allocated by firmware in secure processor from dedicated memory reserved upon system boot up, consequently it is not necessary to add notifier to pin/unpin memory. Signed-off-by: Xin Jiang Signed-off-by: hanliyang --- target/i386/sev.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/target/i386/sev.c b/target/i386/sev.c index 6ff8891678..0012a5efb0 100644 --- a/target/i386/sev.c +++ b/target/i386/sev.c @@ -1262,7 +1262,10 @@ int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp) } } - ram_block_notifier_add(&sev_ram_notifier); + /* CSV3 guest do not need notifier to reg/unreg memory */ + if (!csv3_enabled()) { + ram_block_notifier_add(&sev_ram_notifier); + } qemu_add_machine_init_done_notifier(&sev_machine_done_notify); qemu_add_vm_change_state_handler(sev_vm_state_change, sev); migration_add_notifier(&sev_migration_state, sev_migration_state_notifier); -- Gitee From ed3c233cc00d4c30718fc64b3afc48a51b4eb438 Mon Sep 17 00:00:00 2001 From: jiangxin Date: Wed, 25 Aug 2021 14:29:40 +0800 Subject: [PATCH 569/939] target/i386: csv: Load initial image to private memory for CSV3 guest The initial image of CSV3 guest should be loaded into private memory before boot the guest. Add APIs to implement the image load. 
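In outline, the new CSV3 path below resolves the flash pointer back to its
guest-physical address and lets the firmware copy and encrypt the image into
private memory (sketch only; the real hunk also checks that the memory region
lookup succeeded before using it):

    /* Sketch of the x86_firmware_configure() change:
     * mr->addr + offset is the GPA backing the pflash image. */
    if (csv3_enabled()) {
        ram_addr_t offset = 0;
        MemoryRegion *mr = memory_region_from_host(ptr, &offset);

        csv3_load_data(mr->addr + offset, ptr, size, &error_fatal);
    } else {
        sev_encrypt_flash(ptr, size, &error_fatal);
    }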
Signed-off-by: Xin Jiang Signed-off-by: hanliyang --- hw/i386/pc_sysfw.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/hw/i386/pc_sysfw.c b/hw/i386/pc_sysfw.c index c8d9e71b88..2bbcbb8d35 100644 --- a/hw/i386/pc_sysfw.c +++ b/hw/i386/pc_sysfw.c @@ -37,6 +37,7 @@ #include "hw/block/flash.h" #include "sysemu/kvm.h" #include "sev.h" +#include "csv.h" #define FLASH_SECTOR_SIZE 4096 @@ -263,7 +264,18 @@ void x86_firmware_configure(void *ptr, int size) error_report("failed to locate and/or save reset vector"); exit(1); } + if (csv3_enabled()) { + ram_addr_t offset = 0; + MemoryRegion *mr; - sev_encrypt_flash(ptr, size, &error_fatal); + mr = memory_region_from_host(ptr, &offset); + if (!mr) { + error_report("failed to get memory region of flash"); + exit(1); + } + csv3_load_data(mr->addr + offset, ptr, size, &error_fatal); + } else { + sev_encrypt_flash(ptr, size, &error_fatal); + } } } -- Gitee From b791d13a0630e6640b3c39dc90671a2150734a24 Mon Sep 17 00:00:00 2001 From: Xin Jiang Date: Thu, 13 Jul 2023 09:35:10 +0800 Subject: [PATCH 570/939] vga: Force full update for CSV3 guest As CSV3's NPT(nested page table) is managed by firmware, VMM is hard to track the dirty pages of vga buffer. Although VMM could perform a command to firmware to update read/write attribute of vga buffer in NPT, it costs more time due to communication between VMM and firmware. So the simplest method is to fully update vga buffer always. Signed-off-by: Xin Jiang Signed-off-by: hanliyang --- accel/kvm/kvm-all.c | 1 + accel/stubs/kvm-stub.c | 2 ++ hw/display/vga.c | 7 +++++++ include/sysemu/kvm.h | 8 ++++++++ target/i386/csv.c | 3 +++ 5 files changed, 21 insertions(+) diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index 8077630825..8028caddf9 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -103,6 +103,7 @@ bool kvm_allowed; bool kvm_readonly_mem_allowed; bool kvm_vm_attributes_allowed; bool kvm_msi_use_devid; +bool kvm_csv3_allowed; bool kvm_has_guest_debug; static int kvm_sstep_flags; static bool kvm_immediate_exit; diff --git a/accel/stubs/kvm-stub.c b/accel/stubs/kvm-stub.c index ad39a434c4..b071afee45 100644 --- a/accel/stubs/kvm-stub.c +++ b/accel/stubs/kvm-stub.c @@ -27,6 +27,8 @@ bool kvm_msi_use_devid; bool virtcca_cvm_allowed; +bool kvm_csv3_allowed; + void kvm_flush_coalesced_mmio_buffer(void) { } diff --git a/hw/display/vga.c b/hw/display/vga.c index cb6b6ee2ca..3f1358676b 100644 --- a/hw/display/vga.c +++ b/hw/display/vga.c @@ -39,6 +39,8 @@ #include "migration/vmstate.h" #include "trace.h" +#include "sysemu/kvm.h" + //#define DEBUG_VGA_MEM //#define DEBUG_VGA_REG @@ -1790,6 +1792,11 @@ static void vga_update_display(void *opaque) s->cursor_blink_time = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL); full_update = 1; } + + /* Force to full update in CSV guest. 
*/ + if (kvm_csv3_enabled()) + full_update = 1; + switch(graphic_mode) { case GMODE_TEXT: vga_draw_text(s, full_update); diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index 31af5f0e24..fd8634cc8f 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -44,6 +44,7 @@ extern bool kvm_gsi_routing_allowed; extern bool kvm_gsi_direct_mapping; extern bool kvm_readonly_mem_allowed; extern bool kvm_msi_use_devid; +extern bool kvm_csv3_allowed; #define kvm_enabled() (kvm_allowed) #define virtcca_cvm_enabled() (virtcca_cvm_allowed) @@ -147,6 +148,12 @@ extern bool kvm_msi_use_devid; */ #define kvm_msi_devid_required() (kvm_msi_use_devid) +/** + * kvm_csv3_enabled: + * Returns: true if CSV3 feature is used for the VM. + */ +#define kvm_csv3_enabled() (kvm_csv3_allowed) + #else #define kvm_enabled() (0) @@ -163,6 +170,7 @@ extern bool kvm_msi_use_devid; #define kvm_gsi_direct_mapping() (false) #define kvm_readonly_mem_enabled() (false) #define kvm_msi_devid_required() (false) +#define kvm_csv3_enabled() (false) #endif /* CONFIG_KVM_IS_POSSIBLE */ diff --git a/target/i386/csv.c b/target/i386/csv.c index 12282ba451..65d87de003 100644 --- a/target/i386/csv.c +++ b/target/i386/csv.c @@ -14,6 +14,7 @@ #include "qemu/osdep.h" #include "qemu/error-report.h" #include "qapi/error.h" +#include "sysemu/kvm.h" #include @@ -60,6 +61,8 @@ csv3_init(uint32_t policy, int fd, void *state, struct sev_ops *ops) return -1; } + kvm_csv3_allowed = true; + csv3_guest.sev_fd = fd; csv3_guest.state = state; csv3_guest.sev_ioctl = ops->sev_ioctl; -- Gitee From 5631d7e167d87c4e2f9283cfac39f2f4107203cc Mon Sep 17 00:00:00 2001 From: liuyafei Date: Mon, 22 May 2023 20:37:40 +0800 Subject: [PATCH 571/939] vfio: Only map shared region for CSV3 virtual machine qemu vfio listener map/unmap all of the virtual machine's memory. It does not work for CSV3 virtual machine, as only shared memory should be accessed by device. 
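For illustration, this is how the new helpers are driven from the
shared<->private conversion handler in target/i386/kvm/kvm.c (changed at the
end of this patch); gpa_start/gpa_end stand for the converted guest-physical
range and enc is non-zero when the range becomes private again:

    /* Only shared memory stays DMA-mapped: the helpers replay the vfio
     * MemoryListener just for the affected range. */
    if (enc) {
        /* range turned private: drop its IOMMU mappings */
        csv3_shared_region_dma_unmap(gpa_start, gpa_end);
    } else {
        /* range turned shared: map it so the device can DMA into it */
        csv3_shared_region_dma_map(gpa_start, gpa_end);
    }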
Signed-off-by: liuyafei Signed-off-by: hanliyang --- hw/vfio/container.c | 46 +++++++++++- include/exec/memory.h | 11 +++ system/memory.c | 18 +++++ target/i386/csv-sysemu-stub.c | 10 +++ target/i386/csv.c | 134 ++++++++++++++++++++++++++++++++++ target/i386/csv.h | 12 +++ target/i386/kvm/kvm.c | 2 + 7 files changed, 230 insertions(+), 3 deletions(-) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 422235a221..77e61cfedd 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -30,6 +30,7 @@ #include "qemu/error-report.h" #include "qemu/range.h" #include "sysemu/reset.h" +#include "sysemu/kvm.h" #include "trace.h" #include "qapi/error.h" #include "migration/migration.h" @@ -534,6 +535,32 @@ static void vfio_free_container(VFIOContainer *container) g_free(container); } +static SharedRegionListener *g_shl; + +static void shared_memory_listener_register(MemoryListener *listener, + AddressSpace *as) +{ + SharedRegionListener *shl; + + shl = g_new0(SharedRegionListener, 1); + + shl->listener = listener; + shl->as = as; + + shared_region_register_listener(shl); + g_shl = shl; +} + +static void shared_memory_listener_unregister(void) +{ + SharedRegionListener *shl = g_shl; + + shared_region_unregister_listener(shl); + + g_free(shl); + g_shl = NULL; +} + static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, Error **errp) { @@ -681,7 +708,12 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, container->listener = vfio_memory_listener; - memory_listener_register(&container->listener, container->space->as); + if (kvm_csv3_enabled()) { + shared_memory_listener_register(&container->listener, + container->space->as); + } else { + memory_listener_register(&container->listener, container->space->as); + } if (container->error) { ret = -1; @@ -697,7 +729,11 @@ listener_release_exit: QLIST_REMOVE(group, container_next); QLIST_REMOVE(container, next); vfio_kvm_device_del_group(group); - memory_listener_unregister(&container->listener); + if (kvm_csv3_enabled()) { + shared_memory_listener_unregister(); + } else { + memory_listener_unregister(&container->listener); + } if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU || container->iommu_type == VFIO_SPAPR_TCE_IOMMU) { vfio_spapr_container_deinit(container); @@ -731,7 +767,11 @@ static void vfio_disconnect_container(VFIOGroup *group) * group. 
*/ if (QLIST_EMPTY(&container->group_list)) { - memory_listener_unregister(&container->listener); + if (kvm_csv3_enabled()) { + shared_memory_listener_unregister(); + } else { + memory_listener_unregister(&container->listener); + } if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU || container->iommu_type == VFIO_SPAPR_TCE_IOMMU) { vfio_spapr_container_deinit(container); diff --git a/include/exec/memory.h b/include/exec/memory.h index 73d274d8f3..542c9da918 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -775,6 +775,17 @@ bool memory_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, ram_addr_t *ram_addr, bool *read_only, bool *mr_has_discard_manager); +typedef struct SharedRegionListener SharedRegionListener; +struct SharedRegionListener { + MemoryListener *listener; + AddressSpace *as; + QTAILQ_ENTRY(SharedRegionListener) next; +}; + +void shared_region_register_listener(SharedRegionListener *shl); +void shared_region_unregister_listener(SharedRegionListener *shl); +void *shared_region_listeners_get(void); + typedef struct CoalescedMemoryRange CoalescedMemoryRange; typedef struct MemoryRegionIoeventfd MemoryRegionIoeventfd; diff --git a/system/memory.c b/system/memory.c index 1ae03074f3..9db07fd832 100644 --- a/system/memory.c +++ b/system/memory.c @@ -48,6 +48,9 @@ static QTAILQ_HEAD(, MemoryListener) memory_listeners static QTAILQ_HEAD(, AddressSpace) address_spaces = QTAILQ_HEAD_INITIALIZER(address_spaces); +static QTAILQ_HEAD(, SharedRegionListener) shared_region_listeners + = QTAILQ_HEAD_INITIALIZER(shared_region_listeners); + static GHashTable *flat_views; typedef struct AddrRange AddrRange; @@ -2226,6 +2229,21 @@ bool memory_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, return true; } +void shared_region_register_listener(SharedRegionListener *shl) +{ + QTAILQ_INSERT_TAIL(&shared_region_listeners, shl, next); +} + +void shared_region_unregister_listener(SharedRegionListener *shl) +{ + QTAILQ_REMOVE(&shared_region_listeners, shl, next); +} + +void *shared_region_listeners_get(void) +{ + return &shared_region_listeners; +} + void memory_region_set_log(MemoryRegion *mr, bool log, unsigned client) { uint8_t mask = 1 << client; diff --git a/target/i386/csv-sysemu-stub.c b/target/i386/csv-sysemu-stub.c index 23d885f0f3..db22c299a6 100644 --- a/target/i386/csv-sysemu-stub.c +++ b/target/i386/csv-sysemu-stub.c @@ -29,3 +29,13 @@ int csv3_launch_encrypt_vmcb(void) { g_assert_not_reached(); } + +int csv3_shared_region_dma_map(uint64_t start, uint64_t end) +{ + return 0; +} + +void csv3_shared_region_dma_unmap(uint64_t start, uint64_t end) +{ + +} diff --git a/target/i386/csv.c b/target/i386/csv.c index 65d87de003..e4706efa27 100644 --- a/target/i386/csv.c +++ b/target/i386/csv.c @@ -15,6 +15,7 @@ #include "qemu/error-report.h" #include "qapi/error.h" #include "sysemu/kvm.h" +#include "exec/address-spaces.h" #include @@ -67,6 +68,8 @@ csv3_init(uint32_t policy, int fd, void *state, struct sev_ops *ops) csv3_guest.state = state; csv3_guest.sev_ioctl = ops->sev_ioctl; csv3_guest.fw_error_to_str = ops->fw_error_to_str; + QTAILQ_INIT(&csv3_guest.dma_map_regions_list); + qemu_mutex_init(&csv3_guest.dma_map_regions_list_mutex); } return 0; } @@ -167,3 +170,134 @@ csv3_launch_encrypt_vmcb(void) err: return ret; } + +int csv3_shared_region_dma_map(uint64_t start, uint64_t end) +{ + MemoryRegionSection section; + AddressSpace *as; + QTAILQ_HEAD(, SharedRegionListener) *shared_region_listeners; + SharedRegionListener *shl; + MemoryListener *listener; + uint64_t size; + 
Csv3GuestState *s = &csv3_guest; + struct dma_map_region *region, *pos; + int ret = 0; + + if (!csv3_enabled()) + return 0; + + if (end <= start) + return 0; + + shared_region_listeners = shared_region_listeners_get(); + if (QTAILQ_EMPTY(shared_region_listeners)) + return 0; + + size = end - start; + + qemu_mutex_lock(&s->dma_map_regions_list_mutex); + QTAILQ_FOREACH(pos, &s->dma_map_regions_list, list) { + if (start >= (pos->start + pos->size)) { + continue; + } else if ((start + size) <= pos->start) { + break; + } else { + goto end; + } + } + QTAILQ_FOREACH(shl, shared_region_listeners, next) { + listener = shl->listener; + as = shl->as; + section = memory_region_find(as->root, start, size); + if (!section.mr) { + goto end; + } + + if (!memory_region_is_ram(section.mr)) { + memory_region_unref(section.mr); + goto end; + } + + if (listener->region_add) { + listener->region_add(listener, §ion); + } + memory_region_unref(section.mr); + } + + region = g_malloc0(sizeof(*region)); + if (!region) { + ret = -1; + goto end; + } + region->start = start; + region->size = size; + + if (pos) { + QTAILQ_INSERT_BEFORE(pos, region, list); + } else { + QTAILQ_INSERT_TAIL(&s->dma_map_regions_list, region, list); + } + +end: + qemu_mutex_unlock(&s->dma_map_regions_list_mutex); + return ret; +} + +void csv3_shared_region_dma_unmap(uint64_t start, uint64_t end) +{ + MemoryRegionSection section; + AddressSpace *as; + QTAILQ_HEAD(, SharedRegionListener) *shared_region_listeners; + SharedRegionListener *shl; + MemoryListener *listener; + uint64_t size; + Csv3GuestState *s = &csv3_guest; + struct dma_map_region *pos, *next_pos; + + if (!csv3_enabled()) + return; + + if (end <= start) + return; + + shared_region_listeners = shared_region_listeners_get(); + if (QTAILQ_EMPTY(shared_region_listeners)) + return; + + size = end - start; + + qemu_mutex_lock(&s->dma_map_regions_list_mutex); + QTAILQ_FOREACH_SAFE(pos, &s->dma_map_regions_list, list, next_pos) { + uint64_t l, r; + uint64_t curr_end = pos->start + pos->size; + + l = MAX(start, pos->start); + r = MIN(start + size, pos->start + pos->size); + if (l < r) { + if ((start <= pos->start) && (start + size >= pos->start + pos->size)) { + QTAILQ_FOREACH(shl, shared_region_listeners, next) { + listener = shl->listener; + as = shl->as; + section = memory_region_find(as->root, pos->start, pos->size); + if (!section.mr) { + goto end; + } + if (listener->region_del) { + listener->region_del(listener, §ion); + } + memory_region_unref(section.mr); + } + + QTAILQ_REMOVE(&s->dma_map_regions_list, pos, list); + g_free(pos); + } + break; + } + if ((start + size) <= curr_end) { + break; + } + } +end: + qemu_mutex_unlock(&s->dma_map_regions_list_mutex); + return; +} diff --git a/target/i386/csv.h b/target/i386/csv.h index 3caf216743..12733341b3 100644 --- a/target/i386/csv.h +++ b/target/i386/csv.h @@ -15,6 +15,8 @@ #define I386_CSV_H #include "qapi/qapi-commands-misc-target.h" +#include "qemu/thread.h" +#include "qemu/queue.h" #include "sev.h" #define GUEST_POLICY_CSV3_BIT (1 << 6) @@ -74,12 +76,19 @@ int csv_save_outgoing_cpu_state(QEMUFile *f, uint64_t *bytes_sent); int csv_load_incoming_cpu_state(QEMUFile *f); /* CSV3 */ +struct dma_map_region { + uint64_t start, size; + QTAILQ_ENTRY(dma_map_region) list; +}; + struct Csv3GuestState { uint32_t policy; int sev_fd; void *state; int (*sev_ioctl)(int fd, int cmd, void *data, int *error); const char *(*fw_error_to_str)(int code); + QTAILQ_HEAD(, dma_map_region) dma_map_regions_list; + QemuMutex dma_map_regions_list_mutex; }; 
typedef struct Csv3GuestState Csv3GuestState; @@ -90,4 +99,7 @@ extern int csv3_launch_encrypt_vmcb(void); int csv3_load_data(uint64_t gpa, uint8_t *ptr, uint64_t len, Error **errp); +int csv3_shared_region_dma_map(uint64_t start, uint64_t end); +void csv3_shared_region_dma_unmap(uint64_t start, uint64_t end); + #endif diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index 2866a6d0ec..925f4f8040 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -5026,8 +5026,10 @@ static int kvm_handle_exit_hypercall(X86CPU *cpu, struct kvm_run *run) if (enc) { sev_remove_shared_regions_list(gfn_start, gfn_end); + csv3_shared_region_dma_unmap(gpa, gfn_end << TARGET_PAGE_BITS); } else { sev_add_shared_regions_list(gfn_start, gfn_end); + csv3_shared_region_dma_map(gpa, gfn_end << TARGET_PAGE_BITS); } } return 0; -- Gitee From 454079664e1492eeb9b90d1d05598e84dc436f11 Mon Sep 17 00:00:00 2001 From: jiangxin Date: Fri, 17 Jun 2022 09:25:19 +0800 Subject: [PATCH 572/939] linux-headers: update kernel headers to include CSV3 migration cmds Four new migration commands are added to support CSV3 migration. KVM_CSV3_SEND_ENCRYPT_DATA/KVM_CSV3_RECEIVE_ENCRYPT_DATA cmds are used to migrate guest's pages. KVM_CSV3_SEND_ENCRYPT_CONTEXT/KVM_CSV3_RECEIVE_ENCRYPT_CONTEXT cmds are used to migration guest's runtime context. Signed-off-by: Xin Jiang Signed-off-by: hanliyang --- linux-headers/linux/kvm.h | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h index 8487d0889b..8543db844e 100644 --- a/linux-headers/linux/kvm.h +++ b/linux-headers/linux/kvm.h @@ -2115,6 +2115,12 @@ enum csv3_cmd_id { KVM_CSV3_INIT = KVM_CSV3_NR_MIN, KVM_CSV3_LAUNCH_ENCRYPT_DATA, KVM_CSV3_LAUNCH_ENCRYPT_VMCB, + KVM_CSV3_SEND_ENCRYPT_DATA, + KVM_CSV3_SEND_ENCRYPT_CONTEXT, + KVM_CSV3_RECEIVE_ENCRYPT_DATA, + KVM_CSV3_RECEIVE_ENCRYPT_CONTEXT, + + KVM_CSV3_NR_MAX, }; struct kvm_csv3_launch_encrypt_data { @@ -2127,6 +2133,38 @@ struct kvm_csv3_init_data { __u64 nodemask; }; +struct kvm_csv3_send_encrypt_data { + __u64 hdr_uaddr; + __u32 hdr_len; + __u64 guest_addr_data; + __u32 guest_addr_len; + __u64 trans_uaddr; + __u32 trans_len; +}; + +struct kvm_csv3_send_encrypt_context { + __u64 hdr_uaddr; + __u32 hdr_len; + __u64 trans_uaddr; + __u32 trans_len; +}; + +struct kvm_csv3_receive_encrypt_data { + __u64 hdr_uaddr; + __u32 hdr_len; + __u64 guest_addr_data; + __u32 guest_addr_len; + __u64 trans_uaddr; + __u32 trans_len; +}; + +struct kvm_csv3_receive_encrypt_context { + __u64 hdr_uaddr; + __u32 hdr_len; + __u64 trans_uaddr; + __u32 trans_len; +}; + #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) #define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) #define KVM_DEV_ASSIGN_MASK_INTX (1 << 2) -- Gitee From 13bd2629b78f528b0b4684a643f59d30b7274aa8 Mon Sep 17 00:00:00 2001 From: jiangxin Date: Fri, 17 Jun 2022 09:37:56 +0800 Subject: [PATCH 573/939] target/i386: csv: Add support to migrate the outgoing page for CSV3 guest The csv3_send_encrypt_data() provides the method to encrypt the guest's private pages during migration. The routine is similar to CSV2's. Usually, it starts with a SEND_START command to create the migration context. Then SEND_ENCRYPT_DATA command is performed to encrypt guest pages. After migration is completed, a SEND_FINISH command is performed to the firmware. 
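As an illustration of that flow, a condensed sketch of how the RAM migration
code batches one window of private pages (csv3_send_one_window() is a
hypothetical helper; csv3_queue_outgoing_page() and
csv3_save_queued_outgoing_pages() are the functions added by this patch):

    /* Sketch: queue a window of dirty private pages, then flush them with a
     * single SEND_ENCRYPT_DATA command. */
    static int csv3_send_one_window(QEMUFile *f, uint8_t **pages,
                                    uint64_t *offsets, int npages,
                                    uint64_t *bytes_sent)
    {
        int i;

        for (i = 0; i < npages; i++) {
            /* records the hva and gfn of each page; nothing is sent yet */
            if (csv3_queue_outgoing_page(pages[i], TARGET_PAGE_SIZE,
                                         offsets[i])) {
                return -1;
            }
        }

        /* The first flush also performs SEND_START to create the outgoing
         * context; then one KVM_CSV3_SEND_ENCRYPT_DATA ioctl encrypts the
         * whole batch before it is written to the migration stream. */
        return csv3_save_queued_outgoing_pages(f, bytes_sent);
    }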
Signed-off-by: Jiang Xin Signed-off-by: hanliyang --- migration/ram.c | 87 +++++++++++++++++++ target/i386/csv.c | 182 +++++++++++++++++++++++++++++++++++++++ target/i386/csv.h | 22 +++++ target/i386/sev.c | 14 ++- target/i386/sev.h | 1 + target/i386/trace-events | 1 + 6 files changed, 306 insertions(+), 1 deletion(-) diff --git a/migration/ram.c b/migration/ram.c index 1377b9eb37..1f9348fd06 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -2480,6 +2480,90 @@ ram_save_encrypted_pages_in_batch(RAMState *rs, PageSearchStatus *pss) } #endif +/** + * ram_save_csv3_pages - send the given csv3 VM pages to the stream + */ +static int ram_save_csv3_pages(RAMState *rs, PageSearchStatus *pss) +{ + bool page_dirty; + int ret; + int tmppages, pages = 0; + uint8_t *p; + uint32_t host_len = 0; + uint64_t bytes_xmit = 0; + RAMBlock *block = pss->block; + ram_addr_t offset = 0; + hwaddr paddr = RAM_ADDR_INVALID; + MachineState *ms = MACHINE(qdev_get_machine()); + ConfidentialGuestSupportClass *cgs_class = + (ConfidentialGuestSupportClass *) object_get_class(OBJECT(ms->cgs)); + struct ConfidentialGuestMemoryEncryptionOps *ops = + cgs_class->memory_encryption_ops; + + if (!kvm_csv3_enabled()) + return 0; + + do { + page_dirty = migration_bitmap_clear_dirty(rs, block, pss->page); + + /* Check the pages is dirty and if it is send it */ + if (page_dirty) { + ret = kvm_physical_memory_addr_from_host(kvm_state, + block->host + (pss->page << TARGET_PAGE_BITS), &paddr); + /* Process ROM or MMIO */ + if (paddr == RAM_ADDR_INVALID || + memory_region_is_rom(block->mr)) { + tmppages = migration_ops->ram_save_target_page(rs, pss); + } else { + /* Caculate the offset and host virtual address of the page */ + offset = pss->page << TARGET_PAGE_BITS; + p = block->host + offset; + + if (ops->queue_outgoing_page(p, TARGET_PAGE_SIZE, offset)) + return -1; + + tmppages = 1; + host_len += TARGET_PAGE_SIZE; + + stat64_add(&mig_stats.normal_pages, 1); + } + } else { + tmppages = 0; + } + + if (tmppages >= 0) { + pages += tmppages; + } else { + return tmppages; + } + + pss_find_next_dirty(pss); + } while (offset_in_ramblock(block, + ((ram_addr_t)pss->page) << TARGET_PAGE_BITS) && + host_len < CSV3_OUTGOING_PAGE_WINDOW_SIZE); + + /* Check if there are any queued pages */ + if (host_len != 0) { + /* Always set offset as 0 for csv3. 
*/ + ram_transferred_add(save_page_header(pss, pss->pss_channel, + block, 0 | RAM_SAVE_FLAG_ENCRYPTED_DATA)); + + qemu_put_be32(pss->pss_channel, RAM_SAVE_ENCRYPTED_PAGE); + ram_transferred_add(4); + /* Process the queued pages in batch */ + ret = ops->save_queued_outgoing_pages(pss->pss_channel, &bytes_xmit); + if (ret) { + return -1; + } + ram_transferred_add(bytes_xmit); + } + + /* The offset we leave with is the last one we looked at */ + pss->page--; + + return pages; +} + /** * ram_save_host_page: save a whole host page * @@ -2515,6 +2599,9 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss) return 0; } + if (kvm_csv3_enabled()) + return ram_save_csv3_pages(rs, pss); + #ifdef CONFIG_HYGON_CSV_MIG_ACCEL /* * If command_batch function is enabled and memory encryption is enabled diff --git a/target/i386/csv.c b/target/i386/csv.c index e4706efa27..22e709a95c 100644 --- a/target/i386/csv.c +++ b/target/i386/csv.c @@ -16,8 +16,13 @@ #include "qapi/error.h" #include "sysemu/kvm.h" #include "exec/address-spaces.h" +#include "migration/blocker.h" +#include "migration/qemu-file.h" +#include "migration/misc.h" +#include "monitor/monitor.h" #include +#include #ifdef CONFIG_NUMA #include @@ -30,6 +35,19 @@ bool csv_kvm_cpu_reset_inhibit; +struct ConfidentialGuestMemoryEncryptionOps csv3_memory_encryption_ops = { + .save_setup = sev_save_setup, + .save_outgoing_page = NULL, + .is_gfn_in_unshared_region = NULL, + .save_outgoing_shared_regions_list = sev_save_outgoing_shared_regions_list, + .load_incoming_shared_regions_list = sev_load_incoming_shared_regions_list, + .queue_outgoing_page = csv3_queue_outgoing_page, + .save_queued_outgoing_pages = csv3_save_queued_outgoing_pages, +}; + +#define CSV3_OUTGOING_PAGE_NUM \ + (CSV3_OUTGOING_PAGE_WINDOW_SIZE / TARGET_PAGE_SIZE) + Csv3GuestState csv3_guest = { 0 }; int @@ -70,6 +88,7 @@ csv3_init(uint32_t policy, int fd, void *state, struct sev_ops *ops) csv3_guest.fw_error_to_str = ops->fw_error_to_str; QTAILQ_INIT(&csv3_guest.dma_map_regions_list); qemu_mutex_init(&csv3_guest.dma_map_regions_list_mutex); + csv3_guest.sev_send_start = ops->sev_send_start; } return 0; } @@ -301,3 +320,166 @@ end: qemu_mutex_unlock(&s->dma_map_regions_list_mutex); return; } + +static inline hwaddr csv3_hva_to_gfn(uint8_t *ptr) +{ + ram_addr_t offset = RAM_ADDR_INVALID; + + kvm_physical_memory_addr_from_host(kvm_state, ptr, &offset); + + return offset >> TARGET_PAGE_BITS; +} + +static int +csv3_send_start(QEMUFile *f, uint64_t *bytes_sent) +{ + if (csv3_guest.sev_send_start) + return csv3_guest.sev_send_start(f, bytes_sent); + else + return -1; +} + +static int +csv3_send_get_packet_len(int *fw_err) +{ + int ret; + struct kvm_csv3_send_encrypt_data update = {0}; + + update.hdr_len = 0; + update.trans_len = 0; + ret = csv3_ioctl(KVM_CSV3_SEND_ENCRYPT_DATA, &update, fw_err); + if (*fw_err != SEV_RET_INVALID_LEN) { + error_report("%s: failed to get session length ret=%d fw_error=%d '%s'", + __func__, ret, *fw_err, fw_error_to_str(*fw_err)); + ret = 0; + goto err; + } + + if (update.hdr_len <= INT_MAX) + ret = update.hdr_len; + else + ret = 0; + +err: + return ret; +} + +static int +csv3_send_encrypt_data(Csv3GuestState *s, QEMUFile *f, + uint8_t *ptr, uint32_t size, uint64_t *bytes_sent) +{ + int ret, fw_error = 0; + guchar *trans; + uint32_t guest_addr_entry_num; + uint32_t i; + struct kvm_csv3_send_encrypt_data update = { }; + + /* + * If this is first call then query the packet header bytes and allocate + * the packet buffer. 
+ */ + if (!s->send_packet_hdr) { + s->send_packet_hdr_len = csv3_send_get_packet_len(&fw_error); + if (s->send_packet_hdr_len < 1) { + error_report("%s: SEND_UPDATE fw_error=%d '%s'", + __func__, fw_error, fw_error_to_str(fw_error)); + return 1; + } + + s->send_packet_hdr = g_new(gchar, s->send_packet_hdr_len); + } + + if (!s->guest_addr_len || !s->guest_addr_data) { + error_report("%s: invalid host address or size", __func__); + return 1; + } else { + guest_addr_entry_num = s->guest_addr_len / sizeof(struct guest_addr_entry); + } + + /* allocate transport buffer */ + trans = g_new(guchar, guest_addr_entry_num * TARGET_PAGE_SIZE); + + update.hdr_uaddr = (uintptr_t)s->send_packet_hdr; + update.hdr_len = s->send_packet_hdr_len; + update.guest_addr_data = (uintptr_t)s->guest_addr_data; + update.guest_addr_len = s->guest_addr_len; + update.trans_uaddr = (uintptr_t)trans; + update.trans_len = guest_addr_entry_num * TARGET_PAGE_SIZE; + + trace_kvm_csv3_send_encrypt_data(trans, update.trans_len); + + ret = csv3_ioctl(KVM_CSV3_SEND_ENCRYPT_DATA, &update, &fw_error); + if (ret) { + error_report("%s: SEND_ENCRYPT_DATA ret=%d fw_error=%d '%s'", + __func__, ret, fw_error, fw_error_to_str(fw_error)); + goto err; + } + + for (i = 0; i < guest_addr_entry_num; i++) { + if (s->guest_addr_data[i].share) + memcpy(trans + i * TARGET_PAGE_SIZE, (guchar *)s->guest_hva_data[i].hva, + TARGET_PAGE_SIZE); + } + + qemu_put_be32(f, update.hdr_len); + qemu_put_buffer(f, (uint8_t *)update.hdr_uaddr, update.hdr_len); + *bytes_sent += 4 + update.hdr_len; + + qemu_put_be32(f, update.guest_addr_len); + qemu_put_buffer(f, (uint8_t *)update.guest_addr_data, update.guest_addr_len); + *bytes_sent += 4 + update.guest_addr_len; + + qemu_put_be32(f, update.trans_len); + qemu_put_buffer(f, (uint8_t *)update.trans_uaddr, update.trans_len); + *bytes_sent += (4 + update.trans_len); + +err: + s->guest_addr_len = 0; + g_free(trans); + return ret; +} + +int +csv3_queue_outgoing_page(uint8_t *ptr, uint32_t sz, uint64_t addr) +{ + Csv3GuestState *s = &csv3_guest; + uint32_t i = 0; + + if (!s->guest_addr_data) { + s->guest_hva_data = g_new0(struct guest_hva_entry, CSV3_OUTGOING_PAGE_NUM); + s->guest_addr_data = g_new0(struct guest_addr_entry, CSV3_OUTGOING_PAGE_NUM); + s->guest_addr_len = 0; + } + + if (s->guest_addr_len >= sizeof(struct guest_addr_entry) * CSV3_OUTGOING_PAGE_NUM) { + error_report("Failed to queue outgoing page"); + return 1; + } + + i = s->guest_addr_len / sizeof(struct guest_addr_entry); + s->guest_hva_data[i].hva = (uintptr_t)ptr; + s->guest_addr_data[i].share = 0; + s->guest_addr_data[i].reserved = 0; + s->guest_addr_data[i].gfn = csv3_hva_to_gfn(ptr); + s->guest_addr_len += sizeof(struct guest_addr_entry); + + return 0; +} + +int +csv3_save_queued_outgoing_pages(QEMUFile *f, uint64_t *bytes_sent) +{ + Csv3GuestState *s = &csv3_guest; + + /* + * If this is a first buffer then create outgoing encryption context + * and write our PDH, policy and session data. 
+ */ + if (!csv3_check_state(SEV_STATE_SEND_UPDATE) && + csv3_send_start(f, bytes_sent)) { + error_report("Failed to create outgoing context"); + return 1; + } + + return csv3_send_encrypt_data(s, f, NULL, 0, bytes_sent); +} diff --git a/target/i386/csv.h b/target/i386/csv.h index 12733341b3..12c1b22659 100644 --- a/target/i386/csv.h +++ b/target/i386/csv.h @@ -81,6 +81,18 @@ struct dma_map_region { QTAILQ_ENTRY(dma_map_region) list; }; +#define CSV3_OUTGOING_PAGE_WINDOW_SIZE (512 * TARGET_PAGE_SIZE) + +struct guest_addr_entry { + uint64_t share: 1; + uint64_t reserved: 11; + uint64_t gfn: 52; +}; + +struct guest_hva_entry { + uint64_t hva; +}; + struct Csv3GuestState { uint32_t policy; int sev_fd; @@ -89,11 +101,19 @@ struct Csv3GuestState { const char *(*fw_error_to_str)(int code); QTAILQ_HEAD(, dma_map_region) dma_map_regions_list; QemuMutex dma_map_regions_list_mutex; + gchar *send_packet_hdr; + size_t send_packet_hdr_len; + struct guest_hva_entry *guest_hva_data; + struct guest_addr_entry *guest_addr_data; + size_t guest_addr_len; + + int (*sev_send_start)(QEMUFile *f, uint64_t *bytes_sent); }; typedef struct Csv3GuestState Csv3GuestState; extern struct Csv3GuestState csv3_guest; +extern struct ConfidentialGuestMemoryEncryptionOps csv3_memory_encryption_ops; extern int csv3_init(uint32_t policy, int fd, void *state, struct sev_ops *ops); extern int csv3_launch_encrypt_vmcb(void); @@ -101,5 +121,7 @@ int csv3_load_data(uint64_t gpa, uint8_t *ptr, uint64_t len, Error **errp); int csv3_shared_region_dma_map(uint64_t start, uint64_t end); void csv3_shared_region_dma_unmap(uint64_t start, uint64_t end); +int csv3_queue_outgoing_page(uint8_t *ptr, uint32_t sz, uint64_t addr); +int csv3_save_queued_outgoing_pages(QEMUFile *f, uint64_t *bytes_sent); #endif diff --git a/target/i386/sev.c b/target/i386/sev.c index 0012a5efb0..5a96b0b452 100644 --- a/target/i386/sev.c +++ b/target/i386/sev.c @@ -1270,7 +1270,11 @@ int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp) qemu_add_vm_change_state_handler(sev_vm_state_change, sev); migration_add_notifier(&sev_migration_state, sev_migration_state_notifier); - cgs_class->memory_encryption_ops = &sev_memory_encryption_ops; + if (csv3_enabled()) { + cgs_class->memory_encryption_ops = &csv3_memory_encryption_ops; + } else { + cgs_class->memory_encryption_ops = &sev_memory_encryption_ops; + } QTAILQ_INIT(&sev->shared_regions_list); /* Determine whether support MSR_AMD64_SEV_ES_GHCB */ @@ -2654,9 +2658,17 @@ bool sev_add_kernel_loader_hashes(SevKernelLoaderContext *ctx, Error **errp) return ret; } +static int _sev_send_start(QEMUFile *f, uint64_t *bytes_sent) +{ + SevGuestState *s = sev_guest; + + return sev_send_start(s, f, bytes_sent); +} + struct sev_ops sev_ops = { .sev_ioctl = sev_ioctl, .fw_error_to_str = fw_error_to_str, + .sev_send_start = _sev_send_start, }; static void diff --git a/target/i386/sev.h b/target/i386/sev.h index e91431e0f7..8ccef22a95 100644 --- a/target/i386/sev.h +++ b/target/i386/sev.h @@ -83,6 +83,7 @@ extern bool sev_kvm_has_msr_ghcb; struct sev_ops { int (*sev_ioctl)(int fd, int cmd, void *data, int *error); const char *(*fw_error_to_str)(int code); + int (*sev_send_start)(QEMUFile *f, uint64_t *bytes_sent); }; extern struct sev_ops sev_ops; diff --git a/target/i386/trace-events b/target/i386/trace-events index 34c205ffda..a4a58b12a1 100644 --- a/target/i386/trace-events +++ b/target/i386/trace-events @@ -22,3 +22,4 @@ kvm_sev_receive_update_vmsa(uint32_t cpu_id, uint32_t cpu_index, void *src, int # csv.c 
kvm_csv3_launch_encrypt_data(uint64_t gpa, void *addr, uint64_t len) "gpa 0x%" PRIx64 "addr %p len 0x%" PRIx64 +kvm_csv3_send_encrypt_data(void *dst, int len) "trans %p len %d" -- Gitee From 3434042340ca031b6d355cc79dd00e166bd2e2fd Mon Sep 17 00:00:00 2001 From: jiangxin Date: Fri, 17 Jun 2022 09:45:45 +0800 Subject: [PATCH 574/939] target/i386: csv: Add support to migrate the incoming page for CSV3 guest The csv3_receive_encrypt_data() provides the method to read incoming guest private pages from socket and load them into guest memory. The routine is similar to CSV2's. Usually, it starts with a RECEIVE START command to create the migration context. Then RECEIVE ENCRYPT DATA command is performed to let the firmware load incoming pages into guest memory. After migration is completed, a RECEIVE FINISH command is performed to the firmware. Signed-off-by: Jiang Xin Signed-off-by: hanliyang --- target/i386/csv.c | 87 ++++++++++++++++++++++++++++++++++++++++ target/i386/csv.h | 2 + target/i386/sev.c | 8 ++++ target/i386/sev.h | 1 + target/i386/trace-events | 1 + 5 files changed, 99 insertions(+) diff --git a/target/i386/csv.c b/target/i386/csv.c index 22e709a95c..ac080b3766 100644 --- a/target/i386/csv.c +++ b/target/i386/csv.c @@ -38,11 +38,14 @@ bool csv_kvm_cpu_reset_inhibit; struct ConfidentialGuestMemoryEncryptionOps csv3_memory_encryption_ops = { .save_setup = sev_save_setup, .save_outgoing_page = NULL, + .load_incoming_page = csv3_load_incoming_page, .is_gfn_in_unshared_region = NULL, .save_outgoing_shared_regions_list = sev_save_outgoing_shared_regions_list, .load_incoming_shared_regions_list = sev_load_incoming_shared_regions_list, .queue_outgoing_page = csv3_queue_outgoing_page, .save_queued_outgoing_pages = csv3_save_queued_outgoing_pages, + .queue_incoming_page = NULL, + .load_queued_incoming_pages = NULL, }; #define CSV3_OUTGOING_PAGE_NUM \ @@ -89,6 +92,7 @@ csv3_init(uint32_t policy, int fd, void *state, struct sev_ops *ops) QTAILQ_INIT(&csv3_guest.dma_map_regions_list); qemu_mutex_init(&csv3_guest.dma_map_regions_list_mutex); csv3_guest.sev_send_start = ops->sev_send_start; + csv3_guest.sev_receive_start = ops->sev_receive_start; } return 0; } @@ -483,3 +487,86 @@ csv3_save_queued_outgoing_pages(QEMUFile *f, uint64_t *bytes_sent) return csv3_send_encrypt_data(s, f, NULL, 0, bytes_sent); } + +static int +csv3_receive_start(QEMUFile *f) +{ + if (csv3_guest.sev_receive_start) + return csv3_guest.sev_receive_start(f); + else + return -1; +} + +static int csv3_receive_encrypt_data(QEMUFile *f, uint8_t *ptr) +{ + int ret = 1, fw_error = 0; + uint32_t i, guest_addr_entry_num; + gchar *hdr = NULL, *trans = NULL; + struct guest_addr_entry *guest_addr_data; + struct kvm_csv3_receive_encrypt_data update = {}; + void *hva = NULL; + MemoryRegion *mr = NULL; + + /* get packet header */ + update.hdr_len = qemu_get_be32(f); + + hdr = g_new(gchar, update.hdr_len); + qemu_get_buffer(f, (uint8_t *)hdr, update.hdr_len); + update.hdr_uaddr = (uintptr_t)hdr; + + /* get guest addr data */ + update.guest_addr_len = qemu_get_be32(f); + + guest_addr_data = (struct guest_addr_entry *)g_new(gchar, update.guest_addr_len); + qemu_get_buffer(f, (uint8_t *)guest_addr_data, update.guest_addr_len); + update.guest_addr_data = (uintptr_t)guest_addr_data; + + /* get transport buffer */ + update.trans_len = qemu_get_be32(f); + + trans = g_new(gchar, update.trans_len); + update.trans_uaddr = (uintptr_t)trans; + qemu_get_buffer(f, (uint8_t *)update.trans_uaddr, update.trans_len); + + /* update share memory. 
*/ + guest_addr_entry_num = update.guest_addr_len / sizeof(struct guest_addr_entry); + for (i = 0; i < guest_addr_entry_num; i++) { + if (guest_addr_data[i].share) { + hva = gpa2hva(&mr, + ((uint64_t)guest_addr_data[i].gfn << TARGET_PAGE_BITS), + TARGET_PAGE_SIZE, + NULL); + if (hva) + memcpy(hva, trans + i * TARGET_PAGE_SIZE, TARGET_PAGE_SIZE); + } + } + + trace_kvm_csv3_receive_encrypt_data(trans, update.trans_len, hdr, update.hdr_len); + + ret = csv3_ioctl(KVM_CSV3_RECEIVE_ENCRYPT_DATA, &update, &fw_error); + if (ret) { + error_report("Error RECEIVE_ENCRYPT_DATA ret=%d fw_error=%d '%s'", + ret, fw_error, fw_error_to_str(fw_error)); + goto err; + } + +err: + g_free(trans); + g_free(guest_addr_data); + g_free(hdr); + return ret; +} + +int csv3_load_incoming_page(QEMUFile *f, uint8_t *ptr) +{ + /* + * If this is first buffer and SEV is not in recieiving state then + * use RECEIVE_START command to create a encryption context. + */ + if (!csv3_check_state(SEV_STATE_RECEIVE_UPDATE) && + csv3_receive_start(f)) { + return 1; + } + + return csv3_receive_encrypt_data(f, ptr); +} diff --git a/target/i386/csv.h b/target/i386/csv.h index 12c1b22659..afcd59180c 100644 --- a/target/i386/csv.h +++ b/target/i386/csv.h @@ -108,6 +108,7 @@ struct Csv3GuestState { size_t guest_addr_len; int (*sev_send_start)(QEMUFile *f, uint64_t *bytes_sent); + int (*sev_receive_start)(QEMUFile *f); }; typedef struct Csv3GuestState Csv3GuestState; @@ -121,6 +122,7 @@ int csv3_load_data(uint64_t gpa, uint8_t *ptr, uint64_t len, Error **errp); int csv3_shared_region_dma_map(uint64_t start, uint64_t end); void csv3_shared_region_dma_unmap(uint64_t start, uint64_t end); +int csv3_load_incoming_page(QEMUFile *f, uint8_t *ptr); int csv3_queue_outgoing_page(uint8_t *ptr, uint32_t sz, uint64_t addr); int csv3_save_queued_outgoing_pages(QEMUFile *f, uint64_t *bytes_sent); diff --git a/target/i386/sev.c b/target/i386/sev.c index 5a96b0b452..5124bf3dee 100644 --- a/target/i386/sev.c +++ b/target/i386/sev.c @@ -2665,10 +2665,18 @@ static int _sev_send_start(QEMUFile *f, uint64_t *bytes_sent) return sev_send_start(s, f, bytes_sent); } +static int _sev_receive_start(QEMUFile *f) +{ + SevGuestState *s = sev_guest; + + return sev_receive_start(s, f); +} + struct sev_ops sev_ops = { .sev_ioctl = sev_ioctl, .fw_error_to_str = fw_error_to_str, .sev_send_start = _sev_send_start, + .sev_receive_start = _sev_receive_start, }; static void diff --git a/target/i386/sev.h b/target/i386/sev.h index 8ccef22a95..647b426b16 100644 --- a/target/i386/sev.h +++ b/target/i386/sev.h @@ -84,6 +84,7 @@ struct sev_ops { int (*sev_ioctl)(int fd, int cmd, void *data, int *error); const char *(*fw_error_to_str)(int code); int (*sev_send_start)(QEMUFile *f, uint64_t *bytes_sent); + int (*sev_receive_start)(QEMUFile *f); }; extern struct sev_ops sev_ops; diff --git a/target/i386/trace-events b/target/i386/trace-events index a4a58b12a1..b3cb9aaf71 100644 --- a/target/i386/trace-events +++ b/target/i386/trace-events @@ -23,3 +23,4 @@ kvm_sev_receive_update_vmsa(uint32_t cpu_id, uint32_t cpu_index, void *src, int # csv.c kvm_csv3_launch_encrypt_data(uint64_t gpa, void *addr, uint64_t len) "gpa 0x%" PRIx64 "addr %p len 0x%" PRIx64 kvm_csv3_send_encrypt_data(void *dst, int len) "trans %p len %d" +kvm_csv3_receive_encrypt_data(void *dst, int len, void *hdr, int hdr_len) "trans %p len %d hdr %p hdr_len %d" -- Gitee From 0ebf32463e858c5f9cbd98e3f2fe494d0fbea259 Mon Sep 17 00:00:00 2001 From: jiangxin Date: Fri, 17 Jun 2022 09:52:31 +0800 Subject: [PATCH 575/939] 
target/i386: csv: Add support to migrate the outgoing context for CSV3 guest CSV3 needs to migrate guest cpu's context pages. Prior to migration of the context, it should query transfer buffer length and header data length by SEND ENCRYPT CONTEXT command. New migration flag RAM_SAVE_ENCRYPTED_CSV3_CONTEXT is defined for CSV3. Signed-off-by: Jiang Xin Signed-off-by: hanliyang --- target/i386/csv.c | 81 ++++++++++++++++++++++++++++++++++++++++ target/i386/csv.h | 1 + target/i386/trace-events | 1 + 3 files changed, 83 insertions(+) diff --git a/target/i386/csv.c b/target/i386/csv.c index ac080b3766..cc90b57e5b 100644 --- a/target/i386/csv.c +++ b/target/i386/csv.c @@ -46,6 +46,7 @@ struct ConfidentialGuestMemoryEncryptionOps csv3_memory_encryption_ops = { .save_queued_outgoing_pages = csv3_save_queued_outgoing_pages, .queue_incoming_page = NULL, .load_queued_incoming_pages = NULL, + .save_outgoing_cpu_state = csv3_save_outgoing_context, }; #define CSV3_OUTGOING_PAGE_NUM \ @@ -570,3 +571,83 @@ int csv3_load_incoming_page(QEMUFile *f, uint8_t *ptr) return csv3_receive_encrypt_data(f, ptr); } + +static int +csv3_send_get_context_len(int *fw_err, int *context_len, int *hdr_len) +{ + int ret = 0; + struct kvm_csv3_send_encrypt_context update = { 0 }; + + ret = csv3_ioctl(KVM_CSV3_SEND_ENCRYPT_CONTEXT, &update, fw_err); + if (*fw_err != SEV_RET_INVALID_LEN) { + error_report("%s: failed to get context length ret=%d fw_error=%d '%s'", + __func__, ret, *fw_err, fw_error_to_str(*fw_err)); + ret = -1; + goto err; + } + + if (update.trans_len <= INT_MAX && update.hdr_len <= INT_MAX) { + *context_len = update.trans_len; + *hdr_len = update.hdr_len; + } + ret = 0; +err: + return ret; +} + +static int +csv3_send_encrypt_context(Csv3GuestState *s, QEMUFile *f, uint64_t *bytes_sent) +{ + int ret, fw_error = 0; + int context_len = 0; + int hdr_len = 0; + guchar *trans; + guchar *hdr; + struct kvm_csv3_send_encrypt_context update = { }; + + ret = csv3_send_get_context_len(&fw_error, &context_len, &hdr_len); + if (context_len < 1 || hdr_len < 1) { + error_report("%s: fail to get context length fw_error=%d '%s'", + __func__, fw_error, fw_error_to_str(fw_error)); + return 1; + } + + /* allocate transport buffer */ + trans = g_new(guchar, context_len); + hdr = g_new(guchar, hdr_len); + + update.hdr_uaddr = (uintptr_t)hdr; + update.hdr_len = hdr_len; + update.trans_uaddr = (uintptr_t)trans; + update.trans_len = context_len; + + trace_kvm_csv3_send_encrypt_context(trans, update.trans_len); + + ret = csv3_ioctl(KVM_CSV3_SEND_ENCRYPT_CONTEXT, &update, &fw_error); + if (ret) { + error_report("%s: SEND_ENCRYPT_CONTEXT ret=%d fw_error=%d '%s'", + __func__, ret, fw_error, fw_error_to_str(fw_error)); + goto err; + } + + qemu_put_be32(f, update.hdr_len); + qemu_put_buffer(f, (uint8_t *)update.hdr_uaddr, update.hdr_len); + *bytes_sent += 4 + update.hdr_len; + + qemu_put_be32(f, update.trans_len); + qemu_put_buffer(f, (uint8_t *)update.trans_uaddr, update.trans_len); + *bytes_sent += 4 + update.trans_len; + +err: + g_free(trans); + g_free(hdr); + return ret; +} + +int csv3_save_outgoing_context(QEMUFile *f, uint64_t *bytes_sent) +{ + Csv3GuestState *s = &csv3_guest; + + /* send csv3 context. 
*/ + return csv3_send_encrypt_context(s, f, bytes_sent); +} diff --git a/target/i386/csv.h b/target/i386/csv.h index afcd59180c..9f83a271fd 100644 --- a/target/i386/csv.h +++ b/target/i386/csv.h @@ -125,5 +125,6 @@ void csv3_shared_region_dma_unmap(uint64_t start, uint64_t end); int csv3_load_incoming_page(QEMUFile *f, uint8_t *ptr); int csv3_queue_outgoing_page(uint8_t *ptr, uint32_t sz, uint64_t addr); int csv3_save_queued_outgoing_pages(QEMUFile *f, uint64_t *bytes_sent); +int csv3_save_outgoing_context(QEMUFile *f, uint64_t *bytes_sent); #endif diff --git a/target/i386/trace-events b/target/i386/trace-events index b3cb9aaf71..043412c569 100644 --- a/target/i386/trace-events +++ b/target/i386/trace-events @@ -23,4 +23,5 @@ kvm_sev_receive_update_vmsa(uint32_t cpu_id, uint32_t cpu_index, void *src, int # csv.c kvm_csv3_launch_encrypt_data(uint64_t gpa, void *addr, uint64_t len) "gpa 0x%" PRIx64 "addr %p len 0x%" PRIx64 kvm_csv3_send_encrypt_data(void *dst, int len) "trans %p len %d" +kvm_csv3_send_encrypt_context(void *dst, int len) "trans %p len %d" kvm_csv3_receive_encrypt_data(void *dst, int len, void *hdr, int hdr_len) "trans %p len %d hdr %p hdr_len %d" -- Gitee From b31be8b06440deccdf00de2a7886d04fe87dc802 Mon Sep 17 00:00:00 2001 From: jiangxin Date: Fri, 17 Jun 2022 10:00:46 +0800 Subject: [PATCH 576/939] target/i386: csv: Add support to migrate the incoming context for CSV3 guest The csv3_load_incoming_context() provides the method to read incoming guest's context from socket. It loads them into guest private memory. This is the last step during migration and RECEIVE FINISH command is performed by then to complete the whole migration. Signed-off-by: Jiang Xin Signed-off-by: hanliyang --- target/i386/csv.c | 45 ++++++++++++++++++++++++++++++++++++++++ target/i386/csv.h | 1 + target/i386/trace-events | 1 + 3 files changed, 47 insertions(+) diff --git a/target/i386/csv.c b/target/i386/csv.c index cc90b57e5b..571beeb61f 100644 --- a/target/i386/csv.c +++ b/target/i386/csv.c @@ -47,6 +47,7 @@ struct ConfidentialGuestMemoryEncryptionOps csv3_memory_encryption_ops = { .queue_incoming_page = NULL, .load_queued_incoming_pages = NULL, .save_outgoing_cpu_state = csv3_save_outgoing_context, + .load_incoming_cpu_state = csv3_load_incoming_context, }; #define CSV3_OUTGOING_PAGE_NUM \ @@ -644,6 +645,42 @@ err: return ret; } +static int +csv3_receive_encrypt_context(Csv3GuestState *s, QEMUFile *f) +{ + int ret = 1, fw_error = 0; + gchar *hdr = NULL, *trans = NULL; + struct kvm_csv3_receive_encrypt_context update = {}; + + /* get packet header */ + update.hdr_len = qemu_get_be32(f); + + hdr = g_new(gchar, update.hdr_len); + qemu_get_buffer(f, (uint8_t *)hdr, update.hdr_len); + update.hdr_uaddr = (uintptr_t)hdr; + + /* get transport buffer */ + update.trans_len = qemu_get_be32(f); + + trans = g_new(gchar, update.trans_len); + update.trans_uaddr = (uintptr_t)trans; + qemu_get_buffer(f, (uint8_t *)update.trans_uaddr, update.trans_len); + + trace_kvm_csv3_receive_encrypt_context(trans, update.trans_len, hdr, update.hdr_len); + + ret = csv3_ioctl(KVM_CSV3_RECEIVE_ENCRYPT_CONTEXT, &update, &fw_error); + if (ret) { + error_report("Error RECEIVE_ENCRYPT_CONTEXT ret=%d fw_error=%d '%s'", + ret, fw_error, fw_error_to_str(fw_error)); + goto err; + } + +err: + g_free(trans); + g_free(hdr); + return ret; +} + int csv3_save_outgoing_context(QEMUFile *f, uint64_t *bytes_sent) { Csv3GuestState *s = &csv3_guest; @@ -651,3 +688,11 @@ int csv3_save_outgoing_context(QEMUFile *f, uint64_t *bytes_sent) /* send csv3 
context. */ return csv3_send_encrypt_context(s, f, bytes_sent); } + +int csv3_load_incoming_context(QEMUFile *f) +{ + Csv3GuestState *s = &csv3_guest; + + /* receive csv3 context. */ + return csv3_receive_encrypt_context(s, f); +} diff --git a/target/i386/csv.h b/target/i386/csv.h index 9f83a271fd..8621f0b6fd 100644 --- a/target/i386/csv.h +++ b/target/i386/csv.h @@ -123,6 +123,7 @@ int csv3_load_data(uint64_t gpa, uint8_t *ptr, uint64_t len, Error **errp); int csv3_shared_region_dma_map(uint64_t start, uint64_t end); void csv3_shared_region_dma_unmap(uint64_t start, uint64_t end); int csv3_load_incoming_page(QEMUFile *f, uint8_t *ptr); +int csv3_load_incoming_context(QEMUFile *f); int csv3_queue_outgoing_page(uint8_t *ptr, uint32_t sz, uint64_t addr); int csv3_save_queued_outgoing_pages(QEMUFile *f, uint64_t *bytes_sent); int csv3_save_outgoing_context(QEMUFile *f, uint64_t *bytes_sent); diff --git a/target/i386/trace-events b/target/i386/trace-events index 043412c569..ad3cfb9612 100644 --- a/target/i386/trace-events +++ b/target/i386/trace-events @@ -25,3 +25,4 @@ kvm_csv3_launch_encrypt_data(uint64_t gpa, void *addr, uint64_t len) "gpa 0x%" P kvm_csv3_send_encrypt_data(void *dst, int len) "trans %p len %d" kvm_csv3_send_encrypt_context(void *dst, int len) "trans %p len %d" kvm_csv3_receive_encrypt_data(void *dst, int len, void *hdr, int hdr_len) "trans %p len %d hdr %p hdr_len %d" +kvm_csv3_receive_encrypt_context(void *dst, int len, void *hdr, int hdr_len) "trans %p len %d hdr %p hdr_len %d" -- Gitee From 0826efefea34a6fb6e17502f3a293572f109a261 Mon Sep 17 00:00:00 2001 From: Xianglai Li Date: Thu, 5 Dec 2024 14:18:01 +0800 Subject: [PATCH 577/939] fix compile error on loongarch add cpu.h in loongarch_ipi.c Signed-off-by: Xianglai Li --- hw/intc/loongarch_ipi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/intc/loongarch_ipi.c b/hw/intc/loongarch_ipi.c index e228669aa5..630bcb14ea 100644 --- a/hw/intc/loongarch_ipi.c +++ b/hw/intc/loongarch_ipi.c @@ -15,6 +15,7 @@ #include "exec/address-spaces.h" #include "hw/loongarch/virt.h" #include "migration/vmstate.h" +#include "target/loongarch/cpu.h" #include "target/loongarch/internals.h" #include "trace.h" -- Gitee From 838c585cf6c899a0b48683a0b46ed01cc24d835c Mon Sep 17 00:00:00 2001 From: Susanooo Date: Fri, 25 Oct 2024 10:08:39 +0800 Subject: [PATCH 578/939] ui/vnc: don't return an empty SASL mechlist to the client MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SASL initialization phase may determine that there are no valid mechanisms available to use. This may be because the host OS admin forgot to install some packages, or it might be because the requested SSF level is incompatible with available mechanisms, or other unknown reasons. If we return an empty mechlist to the client, they're going to get a failure from the SASL library on their end and drop the connection. Thus there is no point even sending this back to the client, we can just drop the connection immediately. Reviewed-by: Marc-André Lureau Signed-off-by: Daniel P. 
Berrangé Signed-off-by: zhangchujun --- ui/vnc-auth-sasl.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ui/vnc-auth-sasl.c b/ui/vnc-auth-sasl.c index 47fdae5b21..e321c9decc 100644 --- a/ui/vnc-auth-sasl.c +++ b/ui/vnc-auth-sasl.c @@ -674,6 +674,13 @@ void start_auth_sasl(VncState *vs) } trace_vnc_auth_sasl_mech_list(vs, mechlist); + if (g_str_equal(mechlist, "")) { + trace_vnc_auth_fail(vs, vs->auth, "no available SASL mechanisms", ""); + sasl_dispose(&vs->sasl.conn); + vs->sasl.conn = NULL; + goto authabort; + } + vs->sasl.mechlist = g_strdup(mechlist); mechlistlen = strlen(mechlist); vnc_write_u32(vs, mechlistlen); -- Gitee From 9ab31c6abf095d8f7c986676cf6a70132a3441b7 Mon Sep 17 00:00:00 2001 From: Adttil <2429917001@qq.com> Date: Tue, 10 Dec 2024 00:33:28 +0800 Subject: [PATCH 579/939] vdpa-dev: Fix initialisation order to restore VDUSE compatibility MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit VDUSE requires that virtqueues are first enabled before the DRIVER_OK status flag is set; with the current API of the kernel module, it is impossible to enable the opposite order in our block export code because userspace is not notified when a virtqueue is enabled. This requirement also mathces the normal initialisation order as done by the generic vhost code in QEMU. However, commit 6c48254 accidentally changed the order for vdpa-dev and broke access to VDUSE devices with this. This changes vdpa-dev to use the normal order again and use the standard vhost callback .vhost_set_vring_enable for this. VDUSE devices can be used with vdpa-dev again after this fix. vhost_net intentionally avoided enabling the vrings for vdpa and does this manually later while it does enable them for other vhost backends. Reflect this in the vhost_net code and return early for vdpa, so that the behaviour doesn't change for this device. Cc: qemu-stable@nongnu.org Fixes: 6c48254 ('vdpa: move vhost_vdpa_set_vring_ready to the caller') Signed-off-by: Kevin Wolf Message-ID: <20240315155949.86066-1-kwolf@redhat.com> Reviewed-by: Eugenio Pérez Reviewed-by: Stefano Garzarella Signed-off-by: Kevin Wolf --- hw/net/vhost_net.c | 10 ++++++++++ hw/virtio/trace-events | 2 +- hw/virtio/vdpa-dev.c | 5 +---- hw/virtio/vhost-vdpa.c | 29 ++++++++++++++++++++++++++--- hw/virtio/vhost.c | 8 +++++++- 5 files changed, 45 insertions(+), 9 deletions(-) diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c index e48c373b14..a02d65d208 100644 --- a/hw/net/vhost_net.c +++ b/hw/net/vhost_net.c @@ -599,6 +599,16 @@ int vhost_set_vring_enable(NetClientState *nc, int enable) VHostNetState *net = get_vhost_net(nc); const VhostOps *vhost_ops = net->dev.vhost_ops; + /* + * vhost-vdpa network devices need to enable dataplane virtqueues after + * DRIVER_OK, so they can recover device state before starting dataplane. + * Because of that, we don't enable virtqueues here and leave it to + * net/vhost-vdpa.c. 
+ */ + if (nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA) { + return 0; + } + nc->vring_enable = enable; if (vhost_ops && vhost_ops->vhost_set_vring_enable) { diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events index 637cac4edf..f136815072 100644 --- a/hw/virtio/trace-events +++ b/hw/virtio/trace-events @@ -48,7 +48,7 @@ vhost_vdpa_set_features(void *dev, uint64_t features) "dev: %p features: 0x%"PRI vhost_vdpa_get_device_id(void *dev, uint32_t device_id) "dev: %p device_id %"PRIu32 vhost_vdpa_reset_device(void *dev) "dev: %p" vhost_vdpa_get_vq_index(void *dev, int idx, int vq_idx) "dev: %p idx: %d vq idx: %d" -vhost_vdpa_set_vring_ready(void *dev, unsigned i, int r) "dev: %p, idx: %u, r: %d" +vhost_vdpa_set_vring_enable_one(void *dev, unsigned i, int enable, int r) "dev: %p, idx: %u, enable: %u, r: %d" vhost_vdpa_dump_config(void *dev, const char *line) "dev: %p %s" vhost_vdpa_set_config(void *dev, uint32_t offset, uint32_t size, uint32_t flags) "dev: %p offset: %"PRIu32" size: %"PRIu32" flags: 0x%"PRIx32 vhost_vdpa_get_config(void *dev, void *config, uint32_t config_len) "dev: %p config: %p config_len: %"PRIu32 diff --git a/hw/virtio/vdpa-dev.c b/hw/virtio/vdpa-dev.c index 91e71847b0..7b2b19dfb8 100644 --- a/hw/virtio/vdpa-dev.c +++ b/hw/virtio/vdpa-dev.c @@ -259,14 +259,11 @@ static int vhost_vdpa_device_start(VirtIODevice *vdev, Error **errp) s->dev.acked_features = vdev->guest_features; - ret = vhost_dev_start(&s->dev, vdev, false); + ret = vhost_dev_start(&s->dev, vdev, true); if (ret < 0) { error_setg_errno(errp, -ret, "Error starting vhost"); goto err_guest_notifiers; } - for (i = 0; i < s->dev.nvqs; ++i) { - vhost_vdpa_set_vring_ready(&s->vdpa, i); - } s->started = true; /* diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index d49826845f..7e172eee49 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -883,12 +883,13 @@ static int vhost_vdpa_get_vq_index(struct vhost_dev *dev, int idx) return idx; } -int vhost_vdpa_set_vring_ready(struct vhost_vdpa *v, unsigned idx) +static int vhost_vdpa_set_vring_enable_one(struct vhost_vdpa *v, unsigned idx, + int enable) { struct vhost_dev *dev = v->dev; struct vhost_vring_state state = { .index = idx, - .num = 1, + .num = enable, }; hwaddr addr = virtio_queue_get_desc_addr(dev->vdev, idx); if (addr == 0) { @@ -897,10 +898,31 @@ int vhost_vdpa_set_vring_ready(struct vhost_vdpa *v, unsigned idx) int r = vhost_vdpa_call(dev, VHOST_VDPA_SET_VRING_ENABLE, &state); - trace_vhost_vdpa_set_vring_ready(dev, idx, r); + trace_vhost_vdpa_set_vring_enable_one(dev, idx, enable, r); return r; } +static int vhost_vdpa_set_vring_enable(struct vhost_dev *dev, int enable) +{ + struct vhost_vdpa *v = dev->opaque; + unsigned int i; + int ret; + + for (i = 0; i < dev->nvqs; ++i) { + ret = vhost_vdpa_set_vring_enable_one(v, i, enable); + if (ret < 0) { + return ret; + } + } + + return 0; +} + +int vhost_vdpa_set_vring_ready(struct vhost_vdpa *v, unsigned idx) +{ + return vhost_vdpa_set_vring_enable_one(v, idx, 1); +} + static int vhost_vdpa_set_config_call(struct vhost_dev *dev, int fd) { @@ -1584,6 +1606,7 @@ const VhostOps vdpa_ops = { .vhost_set_features = vhost_vdpa_set_features, .vhost_reset_device = vhost_vdpa_reset_device, .vhost_get_vq_index = vhost_vdpa_get_vq_index, + .vhost_set_vring_enable = vhost_vdpa_set_vring_enable, .vhost_get_config = vhost_vdpa_get_config, .vhost_set_config = vhost_vdpa_set_config, .vhost_requires_shm_log = NULL, diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index d073a6d5a5..d29075aa04 
100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -2063,7 +2063,13 @@ static int vhost_dev_set_vring_enable(struct vhost_dev *hdev, int enable) return hdev->vhost_ops->vhost_set_vring_enable(hdev, enable); } -/* Host notifiers must be enabled at this point. */ +/* + * Host notifiers must be enabled at this point. + * + * If @vrings is true, this function will enable all vrings before starting the + * device. If it is false, the vring initialization is left to be done by the + * caller. + */ int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings) { int i, r; -- Gitee From e52a2122cb1574723c7c8181ba751cc0ff37648e Mon Sep 17 00:00:00 2001 From: Zhang Jiao Date: Thu, 12 Dec 2024 09:46:18 +0800 Subject: [PATCH 580/939] target/riscv: Avoid bad shift in riscv_cpu_do_interrupt() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cheery-pick from 5311599cdc48337f2f27b1b51a80d46d75b05ed0 In riscv_cpu_do_interrupt() we use the 'cause' value we got out of cs->exception as a shift value. However this value can be larger than 31, which means that "1 << cause" is undefined behaviour, because we do the shift on an 'int' type. This causes the undefined behaviour sanitizer to complain on one of the check-tcg tests: $ UBSAN_OPTIONS=print_stacktrace=1:abort_on_error=1:halt_on_error=1 ./build/clang/qemu-system-riscv64 -M virt -semihosting -display none -device loader,file=build/clang/tests/tcg/riscv64-softmmu/issue1060 ../../target/riscv/cpu_helper.c:1805:38: runtime error: shift exponent 63 is too large for 32-bit type 'int' #0 0x55f2dc026703 in riscv_cpu_do_interrupt /mnt/nvmedisk/linaro/qemu-from-laptop/qemu/build/clang/../../target/riscv/cpu_helper.c:1805:38 #1 0x55f2dc3d170e in cpu_handle_exception /mnt/nvmedisk/linaro/qemu-from-laptop/qemu/build/clang/../../accel/tcg/cpu-exec.c:752:9 In this case cause is RISCV_EXCP_SEMIHOST, which is 0x3f. Use 1ULL instead to ensure that the shift is in range. Signed-off-by: Peter Maydell Fixes: 1697837ed9 ("target/riscv: Add M-mode virtual interrupt and IRQ filtering support.") Fixes: 40336d5b1d ("target/riscv: Add HS-mode virtual interrupt and IRQ filtering support.") Reviewed-by: Daniel Henrique Barboza Reviewed-by: Richard Henderson Reviewed-by: Alistair Francis Message-ID: <20241128103831.3452572-1-peter.maydell@linaro.org> Signed-off-by: Philippe Mathieu-Daudé Signed-off-by: Zhang Jiao --- target/riscv/cpu_helper.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/target/riscv/cpu_helper.c b/target/riscv/cpu_helper.c index e7e23b34f4..4d8f1248dd 100644 --- a/target/riscv/cpu_helper.c +++ b/target/riscv/cpu_helper.c @@ -1644,10 +1644,10 @@ void riscv_cpu_do_interrupt(CPUState *cs) bool async = !!(cs->exception_index & RISCV_EXCP_INT_FLAG); target_ulong cause = cs->exception_index & RISCV_EXCP_INT_MASK; uint64_t deleg = async ? 
env->mideleg : env->medeleg; - bool s_injected = env->mvip & (1 << cause) & env->mvien && - !(env->mip & (1 << cause)); - bool vs_injected = env->hvip & (1 << cause) & env->hvien && - !(env->mip & (1 << cause)); + bool s_injected = env->mvip & (1ULL << cause) & env->mvien && + !(env->mip & (1ULL << cause)); + bool vs_injected = env->hvip & (1ULL << cause) & env->hvien && + !(env->mip & (1ULL << cause)); target_ulong tval = 0; target_ulong tinst = 0; target_ulong htval = 0; -- Gitee From d1b98e84eeec0b94403fb716bef41080f6bee3b3 Mon Sep 17 00:00:00 2001 From: Zhang Jiao Date: Thu, 12 Dec 2024 10:31:47 +0800 Subject: [PATCH 581/939] hw/pci: Remove unused pci_irq_pulse() method MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cheery-pick from ef45f46f382a5e2c41c39c71fd3364cff4f41bf5 Last use of pci_irq_pulse() was removed 7 years ago in commit 5e9aa92eb1 ("hw/block: Fix pin-based interrupt behaviour of NVMe"). Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Thomas Huth Message-ID: <20241122103418.539-1-philmd@linaro.org> Signed-off-by: Thomas Huth Signed-off-by: Zhang Jiao --- include/hw/pci/pci.h | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h index 7cf7b5619a..cee0cf7460 100644 --- a/include/hw/pci/pci.h +++ b/include/hw/pci/pci.h @@ -632,16 +632,6 @@ static inline void pci_irq_deassert(PCIDevice *pci_dev) pci_set_irq(pci_dev, 0); } -/* - * FIXME: PCI does not work this way. - * All the callers to this method should be fixed. - */ -static inline void pci_irq_pulse(PCIDevice *pci_dev) -{ - pci_irq_assert(pci_dev); - pci_irq_deassert(pci_dev); -} - MSIMessage pci_get_msi_message(PCIDevice *dev, int vector); void pci_set_power(PCIDevice *pci_dev, bool state); -- Gitee From 885c1bf512582757f9d7e2e360701f72a9d6e95f Mon Sep 17 00:00:00 2001 From: Zhang Jiao Date: Thu, 12 Dec 2024 11:27:23 +0800 Subject: [PATCH 582/939] hvf: remove unused but set variable cheery-pick from 19d542cc0bce0b3641e80444374f9ffd8294a15b fixes associated warning when building on MacOS. 
Signed-off-by: Pierrick Bouvier Link: https://lore.kernel.org/r/20241023182922.1040964-1-pierrick.bouvier@linaro.org Signed-off-by: Paolo Bonzini Signed-off-by: Zhang Jiao --- target/i386/hvf/x86_task.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/target/i386/hvf/x86_task.c b/target/i386/hvf/x86_task.c index f09bfbdda5..cdea2ea69d 100644 --- a/target/i386/hvf/x86_task.c +++ b/target/i386/hvf/x86_task.c @@ -122,7 +122,6 @@ void vmx_handle_task_switch(CPUState *cpu, x68_segment_selector tss_sel, int rea load_regs(cpu); struct x86_segment_descriptor curr_tss_desc, next_tss_desc; - int ret; x68_segment_selector old_tss_sel = vmx_read_segment_selector(cpu, R_TR); uint64_t old_tss_base = vmx_read_segment_base(cpu, R_TR); uint32_t desc_limit; @@ -138,7 +137,7 @@ void vmx_handle_task_switch(CPUState *cpu, x68_segment_selector tss_sel, int rea if (reason == TSR_IDT_GATE && gate_valid) { int dpl; - ret = x86_read_call_gate(cpu, &task_gate_desc, gate); + x86_read_call_gate(cpu, &task_gate_desc, gate); dpl = task_gate_desc.dpl; x68_segment_selector cs = vmx_read_segment_selector(cpu, R_CS); @@ -167,11 +166,12 @@ void vmx_handle_task_switch(CPUState *cpu, x68_segment_selector tss_sel, int rea x86_write_segment_descriptor(cpu, &next_tss_desc, tss_sel); } - if (next_tss_desc.type & 8) - ret = task_switch_32(cpu, tss_sel, old_tss_sel, old_tss_base, &next_tss_desc); - else + if (next_tss_desc.type & 8) { + task_switch_32(cpu, tss_sel, old_tss_sel, old_tss_base, &next_tss_desc); + } else { //ret = task_switch_16(cpu, tss_sel, old_tss_sel, old_tss_base, &next_tss_desc); VM_PANIC("task_switch_16"); + } macvm_set_cr0(cpu->accel->fd, rvmcs(cpu->accel->fd, VMCS_GUEST_CR0) | CR0_TS_MASK); -- Gitee From e6b4460566522f1a9d608217bcb1534bf6709cab Mon Sep 17 00:00:00 2001 From: Zhang Jiao Date: Thu, 12 Dec 2024 12:16:01 +0800 Subject: [PATCH 583/939] hw/misc/nrf51_rng: Don't use BIT_MASK() when we mean BIT() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cheery-pick from a29a9776407e68c5560687e07828925bda710150 The BIT_MASK() macro from bitops.h provides the mask of a bit within a particular word of a multi-word bit array; it is intended to be used with its counterpart BIT_WORD() that gives the index of the word in the array. In nrf51_rng we are using it for cases where we have a bit number that we know is the index of a bit within a single word (in fact, it happens that all the bit numbers we pass to it are zero). This happens to give the right answer, but the macro that actually does the job we want here is BIT(). Use BIT() instead of BIT_MASK(). Signed-off-by: Peter Maydell Reviewed-by: Philippe Mathieu-Daudé Message-ID: <20241108135644.4007151-1-peter.maydell@linaro.org> Signed-off-by: Philippe Mathieu-Daudé Signed-off-by: Zhang Jiao --- hw/misc/nrf51_rng.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/hw/misc/nrf51_rng.c b/hw/misc/nrf51_rng.c index fc86e1b697..e911b3a3a3 100644 --- a/hw/misc/nrf51_rng.c +++ b/hw/misc/nrf51_rng.c @@ -107,25 +107,25 @@ static void rng_write(void *opaque, hwaddr offset, break; case NRF51_RNG_REG_SHORTS: s->shortcut_stop_on_valrdy = - (value & BIT_MASK(NRF51_RNG_REG_SHORTS_VALRDY_STOP)) ? 1 : 0; + (value & BIT(NRF51_RNG_REG_SHORTS_VALRDY_STOP)) ? 1 : 0; break; case NRF51_RNG_REG_INTEN: s->interrupt_enabled = - (value & BIT_MASK(NRF51_RNG_REG_INTEN_VALRDY)) ? 1 : 0; + (value & BIT(NRF51_RNG_REG_INTEN_VALRDY)) ? 
1 : 0; break; case NRF51_RNG_REG_INTENSET: - if (value & BIT_MASK(NRF51_RNG_REG_INTEN_VALRDY)) { + if (value & BIT(NRF51_RNG_REG_INTEN_VALRDY)) { s->interrupt_enabled = 1; } break; case NRF51_RNG_REG_INTENCLR: - if (value & BIT_MASK(NRF51_RNG_REG_INTEN_VALRDY)) { + if (value & BIT(NRF51_RNG_REG_INTEN_VALRDY)) { s->interrupt_enabled = 0; } break; case NRF51_RNG_REG_CONFIG: s->filter_enabled = - (value & BIT_MASK(NRF51_RNG_REG_CONFIG_DECEN)) ? 1 : 0; + (value & BIT(NRF51_RNG_REG_CONFIG_DECEN)) ? 1 : 0; break; default: -- Gitee From 0029172c2c57c18d6aef61070c2471f40de6bb45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= Date: Wed, 30 Oct 2024 10:08:12 +0000 Subject: [PATCH 584/939] crypto: fix error check on gcry_md_open MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Gcrypt does not return negative values on error, it returns non-zero values. This caused QEMU not to detect failure to open an unsupported hash, resulting in a later crash trying to use a NULL context. Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Daniel P. Berrangé Signed-off-by: cheliequan --- crypto/hash-gcrypt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crypto/hash-gcrypt.c b/crypto/hash-gcrypt.c index d3bdfe5633..bf5d7ff9ba 100644 --- a/crypto/hash-gcrypt.c +++ b/crypto/hash-gcrypt.c @@ -56,7 +56,7 @@ qcrypto_gcrypt_hash_bytesv(QCryptoHashAlgorithm alg, size_t *resultlen, Error **errp) { - int i, ret; + gcry_error_t ret; gcry_md_hd_t md; unsigned char *digest; @@ -69,7 +69,7 @@ qcrypto_gcrypt_hash_bytesv(QCryptoHashAlgorithm alg, ret = gcry_md_open(&md, qcrypto_hash_alg_map[alg], 0); - if (ret < 0) { + if (ret != 0) { error_setg(errp, "Unable to initialize hash algorithm: %s", gcry_strerror(ret)); -- Gitee From 0fc0686798aba89c4d4d94f7e0c8e513cfc473b1 Mon Sep 17 00:00:00 2001 From: lijunwei Date: Fri, 22 Nov 2024 17:09:17 +0800 Subject: [PATCH 585/939] Change vmstate_cpuhp_sts vmstateDescription version_id fix live migration failed error message: "qemu-kvm: Missing section footer for 0000:00:01.3/piix4_pm" change vmstate_cpuhp_sts vmstateDescription version_id Signed-off-by: lijunwei --- hw/acpi/cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/acpi/cpu.c b/hw/acpi/cpu.c index 292e1daca2..4ab27ac66e 100644 --- a/hw/acpi/cpu.c +++ b/hw/acpi/cpu.c @@ -316,7 +316,7 @@ void acpi_cpu_unplug_cb(CPUHotplugState *cpu_st, static const VMStateDescription vmstate_cpuhp_sts = { .name = "CPU hotplug device state", - .version_id = 1, + .version_id = 2, .minimum_version_id = 1, .fields = (VMStateField[]) { VMSTATE_BOOL(is_inserting, AcpiCpuStatus), -- Gitee From b6a6427bf45c249e8397bf758055ebb54622e8e2 Mon Sep 17 00:00:00 2001 From: gubin Date: Tue, 17 Dec 2024 14:32:17 +0800 Subject: [PATCH 586/939] target/arm: Fix nregs computation in do_{ld,st}_zpa cherry-pick from 64c6e7444dff64b42d11b836b9aec9acfbe8ecc2 The field is encoded as [0-3], which is convenient for indexing our array of function pointers, but the true value is [1-4]. Adjust before calling do_mem_zpa. Add an assert, and move the comment re passing ZT to the helper back next to the relevant code. 
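For illustration only (this sketch is not part of the patch and is not the real decoder), the mismatch is an off-by-one between a field stored as 0..3 so it can index a table directly and the true register count 1..4 that the memory helper needs:

    #include <stdio.h>

    /* Illustrative stand-in for the per-nreg function table. */
    static const char *const ldN_fn[4] = { "ld1", "ld2", "ld3", "ld4" };

    int main(void)
    {
        for (int field = 0; field < 4; field++) { /* value as encoded: 0..3 */
            int nreg = field + 1;                 /* true count for the helper: 1..4 */
            printf("encoded %d -> %s, %d register(s)\n",
                   field, ldN_fn[field], nreg);
        }
        return 0;
    }
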
Cc: qemu-stable@nongnu.org Fixes: 206adacfb8d ("target/arm: Add mte helpers for sve scalar + int loads") Signed-off-by: Richard Henderson Tested-by: Gustavo Romero Message-id: 20240207025210.8837-3-richard.henderson@linaro.org Reviewed-by: Peter Maydell Signed-off-by: Peter Maydell Signed-off-by: gubin --- target/arm/tcg/translate-sve.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/target/arm/tcg/translate-sve.c b/target/arm/tcg/translate-sve.c index dd0c633897..1d8e0d29bf 100644 --- a/target/arm/tcg/translate-sve.c +++ b/target/arm/tcg/translate-sve.c @@ -4459,11 +4459,7 @@ static void do_mem_zpa(DisasContext *s, int zt, int pg, TCGv_i64 addr, TCGv_ptr t_pg; int desc = 0; - /* - * For e.g. LD4, there are not enough arguments to pass all 4 - * registers as pointers, so encode the regno into the data field. - * For consistency, do this even for LD1. - */ + assert(mte_n >= 1 && mte_n <= 4); if (s->mte_active[0]) { int msz = dtype_msz(dtype); @@ -4477,6 +4473,11 @@ static void do_mem_zpa(DisasContext *s, int zt, int pg, TCGv_i64 addr, addr = clean_data_tbi(s, addr); } + /* + * For e.g. LD4, there are not enough arguments to pass all 4 + * registers as pointers, so encode the regno into the data field. + * For consistency, do this even for LD1. + */ desc = simd_desc(vsz, vsz, zt | desc); t_pg = tcg_temp_new_ptr(); @@ -4614,7 +4615,7 @@ static void do_ld_zpa(DisasContext *s, int zt, int pg, * accessible via the instruction encoding. */ assert(fn != NULL); - do_mem_zpa(s, zt, pg, addr, dtype, nreg, false, fn); + do_mem_zpa(s, zt, pg, addr, dtype, nreg + 1, false, fn); } static bool trans_LD_zprr(DisasContext *s, arg_rprr_load *a) @@ -5182,14 +5183,13 @@ static void do_st_zpa(DisasContext *s, int zt, int pg, TCGv_i64 addr, if (nreg == 0) { /* ST1 */ fn = fn_single[s->mte_active[0]][be][msz][esz]; - nreg = 1; } else { /* ST2, ST3, ST4 -- msz == esz, enforced by encoding */ assert(msz == esz); fn = fn_multiple[s->mte_active[0]][be][nreg - 1][msz]; } assert(fn != NULL); - do_mem_zpa(s, zt, pg, addr, msz_dtype(s, msz), nreg, true, fn); + do_mem_zpa(s, zt, pg, addr, msz_dtype(s, msz), nreg + 1, true, fn); } static bool trans_ST_zprr(DisasContext *s, arg_rprr_store *a) -- Gitee From b69c9f4b7b72c0634f2353135f83d8e59f3308dd Mon Sep 17 00:00:00 2001 From: gubin Date: Tue, 17 Dec 2024 14:42:31 +0800 Subject: [PATCH 587/939] target/arm: Fix SVE/SME gross MTE suppression checks cherry-pick from 855f94eca80c85a99f459e36684ea2f98f6a3243 The TBI and TCMA bits are located within mtedesc, not desc. Cc: qemu-stable@nongnu.org Reviewed-by: Peter Maydell Signed-off-by: Richard Henderson Tested-by: Gustavo Romero Message-id: 20240207025210.8837-7-richard.henderson@linaro.org Signed-off-by: Peter Maydell Signed-off-by: gubin --- target/arm/tcg/sme_helper.c | 8 ++++---- target/arm/tcg/sve_helper.c | 12 ++++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/target/arm/tcg/sme_helper.c b/target/arm/tcg/sme_helper.c index 1ee2690ceb..904bfdac43 100644 --- a/target/arm/tcg/sme_helper.c +++ b/target/arm/tcg/sme_helper.c @@ -573,8 +573,8 @@ void sme_ld1_mte(CPUARMState *env, void *za, uint64_t *vg, desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); /* Perform gross MTE suppression early. 
*/ - if (!tbi_check(desc, bit55) || - tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { + if (!tbi_check(mtedesc, bit55) || + tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) { mtedesc = 0; } @@ -750,8 +750,8 @@ void sme_st1_mte(CPUARMState *env, void *za, uint64_t *vg, target_ulong addr, desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); /* Perform gross MTE suppression early. */ - if (!tbi_check(desc, bit55) || - tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { + if (!tbi_check(mtedesc, bit55) || + tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) { mtedesc = 0; } diff --git a/target/arm/tcg/sve_helper.c b/target/arm/tcg/sve_helper.c index ce8134320b..9694201550 100644 --- a/target/arm/tcg/sve_helper.c +++ b/target/arm/tcg/sve_helper.c @@ -5800,8 +5800,8 @@ void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); /* Perform gross MTE suppression early. */ - if (!tbi_check(desc, bit55) || - tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { + if (!tbi_check(mtedesc, bit55) || + tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) { mtedesc = 0; } @@ -6156,8 +6156,8 @@ void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr, desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); /* Perform gross MTE suppression early. */ - if (!tbi_check(desc, bit55) || - tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { + if (!tbi_check(mtedesc, bit55) || + tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) { mtedesc = 0; } @@ -6406,8 +6406,8 @@ void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); /* Perform gross MTE suppression early. */ - if (!tbi_check(desc, bit55) || - tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { + if (!tbi_check(mtedesc, bit55) || + tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) { mtedesc = 0; } -- Gitee From 07dfcad1b3d9ecbf1afe65d3457a6dbcb31f1b94 Mon Sep 17 00:00:00 2001 From: gubin Date: Tue, 17 Dec 2024 14:47:59 +0800 Subject: [PATCH 588/939] target/arm: Fix UMOPA/UMOPS of 16-bit values cherry-pick from ea3f5a90f036734522e9af3bffd77e69e9f47355 The UMOPA/UMOPS instructions are supposed to multiply unsigned 8 or 16 bit elements and accumulate the products into a 64-bit element. In the Arm ARM pseudocode, this is done with the usual infinite-precision signed arithmetic. However our implementation doesn't quite get it right, because in the DEF_IMOP_64() macro we do: sum += (NTYPE)(n >> 0) * (MTYPE)(m >> 0); where NTYPE and MTYPE are uint16_t or int16_t. In the uint16_t case, the C usual arithmetic conversions mean the values are converted to "int" type and the multiply is done as a 32-bit multiply. This means that if the inputs are, for example, 0xffff and 0xffff then the result is 0xFFFE0001 as an int, which is then promoted to uint64_t for the accumulation into sum; this promotion incorrectly sign extends the multiply. Avoid the incorrect sign extension by casting to int64_t before the multiply, so we do the multiply as 64-bit signed arithmetic, which is a type large enough that the multiply can never overflow into the sign bit. (The equivalent 8-bit operations in DEF_IMOP_32() are fine, because the 8-bit multiplies can never overflow into the sign bit of a 32-bit integer.) 
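For illustration only (not part of the patch), the promotion behaviour can be reproduced in isolation; note that the overflowing signed 32-bit multiply is formally undefined behaviour in C, but in practice it produces the negative value that then sign-extends as described:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint16_t n = 0xffff, m = 0xffff;
        uint64_t sum = 0;

        /* n and m promote to int, so the product 0xFFFE0001 is a negative
         * int that sign-extends when accumulated into sum. */
        sum += n * m;
        printf("int multiply:   0x%016" PRIx64 "\n", sum); /* fffffffffffe0001 */

        /* Widening one operand keeps the multiply in 64-bit signed
         * arithmetic, where it cannot reach the sign bit. */
        sum = 0;
        sum += (int64_t)n * m;
        printf("int64 multiply: 0x%016" PRIx64 "\n", sum); /* 00000000fffe0001 */

        return 0;
    }
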
Cc: qemu-stable@nongnu.org Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2372 Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20240722172957.1041231-3-peter.maydell@linaro.org Signed-off-by: gubin --- target/arm/tcg/sme_helper.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/target/arm/tcg/sme_helper.c b/target/arm/tcg/sme_helper.c index 1ee2690ceb..e94b5335e1 100644 --- a/target/arm/tcg/sme_helper.c +++ b/target/arm/tcg/sme_helper.c @@ -1134,10 +1134,10 @@ static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \ uint64_t sum = 0; \ /* Apply P to N as a mask, making the inactive elements 0. */ \ n &= expand_pred_h(p); \ - sum += (NTYPE)(n >> 0) * (MTYPE)(m >> 0); \ - sum += (NTYPE)(n >> 16) * (MTYPE)(m >> 16); \ - sum += (NTYPE)(n >> 32) * (MTYPE)(m >> 32); \ - sum += (NTYPE)(n >> 48) * (MTYPE)(m >> 48); \ + sum += (int64_t)(NTYPE)(n >> 0) * (MTYPE)(m >> 0); \ + sum += (int64_t)(NTYPE)(n >> 16) * (MTYPE)(m >> 16); \ + sum += (int64_t)(NTYPE)(n >> 32) * (MTYPE)(m >> 32); \ + sum += (int64_t)(NTYPE)(n >> 48) * (MTYPE)(m >> 48); \ return neg ? a - sum : a + sum; \ } -- Gitee From cdf914a667f9d0f086329174c24f9623b00b8fb2 Mon Sep 17 00:00:00 2001 From: gubin Date: Tue, 17 Dec 2024 14:54:18 +0800 Subject: [PATCH 589/939] target/arm: Fix VCMLA Dd, Dn, Dm[idx] cherry-pick from 76bccf3cb9d9383da0128bbc6d1300cddbe3ae8f The inner loop, bounded by eltspersegment, must not be larger than the outer loop, bounded by elements. Cc: qemu-stable@nongnu.org Fixes: 18fc2405781 ("target/arm: Implement SVE fp complex multiply add (indexed)") Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2376 Reviewed-by: Peter Maydell Signed-off-by: Richard Henderson Message-id: 20240625183536.1672454-2-richard.henderson@linaro.org Signed-off-by: Peter Maydell Signed-off-by: gubin --- target/arm/tcg/vec_helper.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c index 11e874c05a..83b49ef009 100644 --- a/target/arm/tcg/vec_helper.c +++ b/target/arm/tcg/vec_helper.c @@ -850,7 +850,7 @@ void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va, intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2); uint32_t neg_real = flip ^ neg_imag; intptr_t elements = opr_sz / sizeof(float16); - intptr_t eltspersegment = 16 / sizeof(float16); + intptr_t eltspersegment = MIN(16 / sizeof(float16), elements); intptr_t i, j; /* Shift boolean to the sign bit so we can xor to negate. */ @@ -912,7 +912,7 @@ void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va, intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2); uint32_t neg_real = flip ^ neg_imag; intptr_t elements = opr_sz / sizeof(float32); - intptr_t eltspersegment = 16 / sizeof(float32); + intptr_t eltspersegment = MIN(16 / sizeof(float32), elements); intptr_t i, j; /* Shift boolean to the sign bit so we can xor to negate. 
*/ -- Gitee From 96b5acaa5dbff1e5bf8809fd818e6ff813e5a170 Mon Sep 17 00:00:00 2001 From: qihao_yewu Date: Tue, 31 Dec 2024 02:04:04 -0500 Subject: [PATCH 590/939] target/i386/cpu: Fix notes for CPU models cheery-pick from 93dcc9390e5ad0696ae7e9b7b3a5b08c2d1b6de6 Fixes: 644e3c5d812 ("missing vmx features for Skylake-Server and Cascadelake-Server") Signed-off-by: Han Han Reviewed-by: Chenyi Qiang Reviewed-by: Michael Tokarev Signed-off-by: Michael Tokarev Signed-off-by: qihao_yewu --- target/i386/cpu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 727beb6a65..1fa08265bc 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -3453,6 +3453,7 @@ static const X86CPUDefinition builtin_x86_defs[] = { }, { .version = 4, + .note = "IBRS, EPT switching, no TSX", .props = (PropValue[]) { { "vmx-eptp-switching", "on" }, { /* end of list */ } @@ -3587,7 +3588,7 @@ static const X86CPUDefinition builtin_x86_defs[] = { }, }, { .version = 4, - .note = "ARCH_CAPABILITIES, no TSX", + .note = "ARCH_CAPABILITIES, EPT switching, no TSX", .props = (PropValue[]) { { "vmx-eptp-switching", "on" }, { /* end of list */ } -- Gitee From 626103c76d0d8db8dee3f613b6e3159c8ddd5a57 Mon Sep 17 00:00:00 2001 From: gubin Date: Thu, 2 Jan 2025 10:25:00 +0800 Subject: [PATCH 591/939] target/arm: LDAPR should honour SCTLR_ELx.nAA cherry-pick from 25489b521b61b874c4c6583956db0012a3674e3a In commit c1a1f80518d360b when we added the FEAT_LSE2 relaxations to the alignment requirements for atomic and ordered loads and stores, we didn't quite get it right for LDAPR/LDAPRH/LDAPRB with no immediate offset. These instructions were handled in the old decoder as part of disas_ldst_atomic(), but unlike all the other insns that function decoded (LDADD, LDCLR, etc) these insns are "ordered", not "atomic", so they should be using check_ordered_align() rather than check_atomic_align(). Commit c1a1f80518d360b used check_atomic_align() regardless for everything in disas_ldst_atomic(). We then carried that incorrect check over in the decodetree conversion, where LDAPR/LDAPRH/LDAPRB are now handled by trans_LDAPR(). The effect is that when FEAT_LSE2 is implemented, these instructions don't honour the SCTLR_ELx.nAA bit and will generate alignment faults when they should not. (The LDAPR insns with an immediate offset were in disas_ldst_ldapr_stlr() and then in trans_LDAPR_i() and trans_STLR_i(), and have always used the correct check_ordered_align().) Use check_ordered_align() in trans_LDAPR(). 
Cc: qemu-stable@nongnu.org Fixes: c1a1f80518d360b ("target/arm: Relax ordered/atomic alignment checks for LSE2") Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20240709134504.3500007-3-peter.maydell@linaro.org Signed-off-by: gubin --- target/arm/tcg/translate-a64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c index a05182b57f..5beac07b60 100644 --- a/target/arm/tcg/translate-a64.c +++ b/target/arm/tcg/translate-a64.c @@ -3306,7 +3306,7 @@ static bool trans_LDAPR(DisasContext *s, arg_LDAPR *a) if (a->rn == 31) { gen_check_sp_alignment(s); } - mop = check_atomic_align(s, a->rn, a->sz); + mop = check_ordered_align(s, a->rn, 0, false, a->sz); clean_addr = gen_mte_check1(s, cpu_reg_sp(s, a->rn), false, a->rn != 31, mop); /* -- Gitee From 582f5bc85da2d1c6a61e5164dfc272dc96f846d5 Mon Sep 17 00:00:00 2001 From: gubin Date: Thu, 2 Jan 2025 10:30:33 +0800 Subject: [PATCH 592/939] target/arm: Reinstate "vfp" property on AArch32 CPUs cherry-pick from 185e3fdf8d106cb2f7d234d5e6453939c66db2a9 In commit 4315f7c614743 we restructured the logic for creating the VFP related properties to avoid testing the aa32_simd_r32 feature on AArch64 CPUs. However in the process we accidentally stopped exposing the "vfp" QOM property on AArch32 TCG CPUs. This mostly hasn't had any ill effects because not many people want to disable VFP, but it wasn't intentional. Reinstate the property. Cc: qemu-stable@nongnu.org Fixes: 4315f7c614743 ("target/arm: Restructure has_vfp_d32 test") Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2098 Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20240126193432.2210558-1-peter.maydell@linaro.org Signed-off-by: gubin --- target/arm/cpu.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/target/arm/cpu.c b/target/arm/cpu.c index 9dd61c10ea..09d391bd34 100644 --- a/target/arm/cpu.c +++ b/target/arm/cpu.c @@ -1635,6 +1635,10 @@ void arm_cpu_post_init(Object *obj) } } else if (cpu_isar_feature(aa32_vfp, cpu)) { cpu->has_vfp = true; + if (tcg_enabled() || qtest_enabled()) { + qdev_property_add_static(DEVICE(obj), + &arm_cpu_has_vfp_property); + } if (cpu_isar_feature(aa32_simd_r32, cpu)) { cpu->has_vfp_d32 = true; /* -- Gitee From 45e80d1d71f7f4b50b47ec61560a77edd80badc1 Mon Sep 17 00:00:00 2001 From: gubin Date: Thu, 2 Jan 2025 10:35:05 +0800 Subject: [PATCH 593/939] target/arm: take HSTR traps of cp15 accesses to EL2, not EL1 cherry-pick from fbe5ac5671a9cfcc7f4aee9a5fac7720eea08876 The HSTR_EL2 register allows the hypervisor to trap AArch32 EL1 and EL0 accesses to cp15 registers. We incorrectly implemented this so they trap to EL1 when we detect the need for a HSTR trap at code generation time. (The check in access_check_cp_reg() which we do at runtime to catch traps from EL0 is correctly routing them to EL2.) Use the correct target EL when generating the code to take the trap. 
Cc: qemu-stable@nongnu.org Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2226 Fixes: 049edada5e93df ("target/arm: Make HSTR_EL2 traps take priority over UNDEF-at-EL1") Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20240325133116.2075362-1-peter.maydell@linaro.org Signed-off-by: gubin --- target/arm/tcg/translate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target/arm/tcg/translate.c b/target/arm/tcg/translate.c index b3660173d1..e555e885a1 100644 --- a/target/arm/tcg/translate.c +++ b/target/arm/tcg/translate.c @@ -4584,7 +4584,7 @@ static void do_coproc_insn(DisasContext *s, int cpnum, int is64, tcg_gen_andi_i32(t, t, 1u << maskbit); tcg_gen_brcondi_i32(TCG_COND_EQ, t, 0, over.label); - gen_exception_insn(s, 0, EXCP_UDEF, syndrome); + gen_exception_insn_el(s, 0, EXCP_UDEF, syndrome, 2); /* * gen_exception_insn() will set is_jmp to DISAS_NORETURN, * but since we're conditionally branching over it, we want -- Gitee From 06da30c93dfd4cff013881582d25c3d04456376b Mon Sep 17 00:00:00 2001 From: gubin Date: Thu, 2 Jan 2025 10:40:17 +0800 Subject: [PATCH 594/939] target/arm: Use float_status copy in sme_fmopa_s MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cherry-pick from 31d93fedf41c24b0badb38cd9317590d1ef74e37 We made a copy above because the fp exception flags are not propagated back to the FPST register, but then failed to use the copy. Cc: qemu-stable@nongnu.org Fixes: 558e956c719 ("target/arm: Implement FMOPA, FMOPS (non-widening)") Signed-off-by: Daniyal Khan Signed-off-by: Richard Henderson Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Alex Bennée Message-id: 20240717060149.204788-2-richard.henderson@linaro.org [rth: Split from a larger patch] Signed-off-by: Richard Henderson Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Alex Bennée Signed-off-by: Peter Maydell Signed-off-by: gubin --- target/arm/tcg/sme_helper.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target/arm/tcg/sme_helper.c b/target/arm/tcg/sme_helper.c index 9a9b1a240c..ae4f39ed02 100644 --- a/target/arm/tcg/sme_helper.c +++ b/target/arm/tcg/sme_helper.c @@ -916,7 +916,7 @@ void HELPER(sme_fmopa_s)(void *vza, void *vzn, void *vzm, void *vpn, if (pb & 1) { uint32_t *a = vza_row + H1_4(col); uint32_t *m = vzm + H1_4(col); - *a = float32_muladd(n, *m, *a, 0, vst); + *a = float32_muladd(n, *m, *a, 0, &fpst); } col += 4; pb >>= 4; -- Gitee From a481451a811877640a57ccbef2b33b39567f2802 Mon Sep 17 00:00:00 2001 From: thomas Date: Fri, 12 Jul 2024 11:10:53 +0800 Subject: [PATCH 595/939] virtio-net: Fix network stall at the host side waiting for kick commit f937309fbdbb48c354220a3e7110c202ae4aa7fa upstream. Patch 06b12970174 ("virtio-net: fix network stall under load") added double-check to test whether the available buffer size can satisfy the request or not, in case the guest has added some buffers to the avail ring simultaneously after the first check. It will be lucky if the available buffer size becomes okay after the double-check, then the host can send the packet to the guest. If the buffer size still can't satisfy the request, even if the guest has added some buffers, viritio-net would stall at the host side forever. The patch enables notification and checks whether the guest has added some buffers since last check of available buffers when the available buffers are insufficient. If no buffer is added, return false, else recheck the available buffers in the loop. 
If the available buffers are sufficient, disable notification and return true. Changes: 1. Change the return type of virtqueue_get_avail_bytes() from void to int, it returns an opaque that represents the shadow_avail_idx of the virtqueue on success, else -1 on error. 2. Add a new API: virtio_queue_enable_notification_and_check(), it takes an opaque as input arg which is returned from virtqueue_get_avail_bytes(). It enables notification firstly, then checks whether the guest has added some buffers since last check of available buffers or not by virtio_queue_poll(), return ture if yes. The patch also reverts patch "06b12970174". The case below can reproduce the stall. Guest 0 +--------+ | iperf | ---------------> | server | Host | +--------+ +--------+ | ... | iperf |---- | client |---- Guest n +--------+ | +--------+ | | iperf | ---------------> | server | +--------+ Boot many guests from qemu with virtio network: qemu ... -netdev tap,id=net_x \ -device virtio-net-pci-non-transitional,\ iommu_platform=on,mac=xx:xx:xx:xx:xx:xx,netdev=net_x Each guest acts as iperf server with commands below: iperf3 -s -D -i 10 -p 8001 iperf3 -s -D -i 10 -p 8002 The host as iperf client: iperf3 -c guest_IP -p 8001 -i 30 -w 256k -P 20 -t 40000 iperf3 -c guest_IP -p 8002 -i 30 -w 256k -P 20 -t 40000 After some time, the host loses connection to the guest, the guest can send packet to the host, but can't receive packet from the host. It's more likely to happen if SWIOTLB is enabled in the guest, allocating and freeing bounce buffer takes some CPU ticks, copying from/to bounce buffer takes more CPU ticks, compared with that there is no bounce buffer in the guest. Once the rate of producing packets from the host approximates the rate of receiveing packets in the guest, the guest would loop in NAPI. receive packets --- | | v | free buf virtnet_poll | | v | add buf to avail ring --- | | need kick the host? | NAPI continues v receive packets --- | | v | free buf virtnet_poll | | v | add buf to avail ring --- | v ... ... On the other hand, the host fetches free buf from avail ring, if the buf in the avail ring is not enough, the host notifies the guest the event by writing the avail idx read from avail ring to the event idx of used ring, then the host goes to sleep, waiting for the kick signal from the guest. Once the guest finds the host is waiting for kick singal (in virtqueue_kick_prepare_split()), it kicks the host. The host may stall forever at the sequences below: Host Guest ------------ ----------- fetch buf, send packet receive packet --- ... ... | fetch buf, send packet add buf | ... add buf virtnet_poll buf not enough avail idx-> add buf | read avail idx add buf | add buf --- receive packet --- write event idx ... | wait for kick add buf virtnet_poll ... | --- no more packet, exit NAPI In the first loop of NAPI above, indicated in the range of virtnet_poll above, the host is sending packets while the guest is receiving packets and adding buffers. step 1: The buf is not enough, for example, a big packet needs 5 buf, but the available buf count is 3. The host read current avail idx. step 2: The guest adds some buf, then checks whether the host is waiting for kick signal, not at this time. The used ring is not empty, the guest continues the second loop of NAPI. step 3: The host writes the avail idx read from avail ring to used ring as event idx via virtio_queue_set_notification(q->rx_vq, 1). 
step 4: At the end of the second loop of NAPI, recheck whether kick is needed, as the event idx in the used ring written by the host is beyound the range of kick condition, the guest will not send kick signal to the host. Fixes: 06b12970174 ("virtio-net: fix network stall under load") Cc: qemu-stable@nongnu.org Signed-off-by: Wencheng Yang Reviewed-by: Michael S. Tsirkin Signed-off-by: Jason Wang --- hw/net/virtio-net.c | 28 ++++++++++------- hw/virtio/virtio.c | 64 +++++++++++++++++++++++++++++++++++--- include/hw/virtio/virtio.h | 19 +++++++++-- 3 files changed, 92 insertions(+), 19 deletions(-) diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index c9c83fe297..7184c9c526 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -1662,24 +1662,28 @@ static bool virtio_net_can_receive(NetClientState *nc) static int virtio_net_has_buffers(VirtIONetQueue *q, int bufsize) { + int opaque; + unsigned int in_bytes; VirtIONet *n = q->n; - if (virtio_queue_empty(q->rx_vq) || - (n->mergeable_rx_bufs && - !virtqueue_avail_bytes(q->rx_vq, bufsize, 0))) { - virtio_queue_set_notification(q->rx_vq, 1); - - /* To avoid a race condition where the guest has made some buffers - * available after the above check but before notification was - * enabled, check for available buffers again. - */ - if (virtio_queue_empty(q->rx_vq) || - (n->mergeable_rx_bufs && - !virtqueue_avail_bytes(q->rx_vq, bufsize, 0))) { + + while (virtio_queue_empty(q->rx_vq) || n->mergeable_rx_bufs) { + opaque = virtqueue_get_avail_bytes(q->rx_vq, &in_bytes, NULL, + bufsize, 0); + /* Buffer is enough, disable notifiaction */ + if (bufsize <= in_bytes) { + break; + } + + if (virtio_queue_enable_notification_and_check(q->rx_vq, opaque)) { + /* Guest has added some buffers, try again */ + continue; + } else { return 0; } } virtio_queue_set_notification(q->rx_vq, 0); + return 1; } diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 8c3b6b87aa..4f5b241fd3 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -743,6 +743,60 @@ int virtio_queue_empty(VirtQueue *vq) } } +static bool virtio_queue_split_poll(VirtQueue *vq, unsigned shadow_idx) +{ + if (unlikely(!vq->vring.avail)) { + return false; + } + + return (uint16_t)shadow_idx != vring_avail_idx(vq); +} + +static bool virtio_queue_packed_poll(VirtQueue *vq, unsigned shadow_idx) +{ + VRingPackedDesc desc; + VRingMemoryRegionCaches *caches; + + if (unlikely(!vq->vring.desc)) { + return false; + } + + caches = vring_get_region_caches(vq); + if (!caches) { + return false; + } + + vring_packed_desc_read(vq->vdev, &desc, &caches->desc, + shadow_idx, true); + + return is_desc_avail(desc.flags, vq->shadow_avail_wrap_counter); +} + +static bool virtio_queue_poll(VirtQueue *vq, unsigned shadow_idx) +{ + if (virtio_device_disabled(vq->vdev)) { + return false; + } + + if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED)) { + return virtio_queue_packed_poll(vq, shadow_idx); + } else { + return virtio_queue_split_poll(vq, shadow_idx); + } +} + +bool virtio_queue_enable_notification_and_check(VirtQueue *vq, + int opaque) +{ + virtio_queue_set_notification(vq, 1); + + if (opaque >= 0) { + return virtio_queue_poll(vq, (unsigned)opaque); + } else { + return false; + } +} + static void virtqueue_unmap_sg(VirtQueue *vq, const VirtQueueElement *elem, unsigned int len) { @@ -1322,9 +1376,9 @@ err: goto done; } -void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes, - unsigned int *out_bytes, - unsigned max_in_bytes, unsigned max_out_bytes) +int 
virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes, + unsigned int *out_bytes, unsigned max_in_bytes, + unsigned max_out_bytes) { uint16_t desc_size; VRingMemoryRegionCaches *caches; @@ -1357,7 +1411,7 @@ void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes, caches); } - return; + return (int)vq->shadow_avail_idx; err: if (in_bytes) { *in_bytes = 0; @@ -1365,6 +1419,8 @@ err: if (out_bytes) { *out_bytes = 0; } + + return -1; } int virtqueue_avail_bytes(VirtQueue *vq, unsigned int in_bytes, diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h index 60494aed62..78db2bde98 100644 --- a/include/hw/virtio/virtio.h +++ b/include/hw/virtio/virtio.h @@ -273,9 +273,13 @@ void qemu_put_virtqueue_element(VirtIODevice *vdev, QEMUFile *f, VirtQueueElement *elem); int virtqueue_avail_bytes(VirtQueue *vq, unsigned int in_bytes, unsigned int out_bytes); -void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes, - unsigned int *out_bytes, - unsigned max_in_bytes, unsigned max_out_bytes); +/** + * Return <0 on error or an opaque >=0 to pass to + * virtio_queue_enable_notification_and_check on success. + */ +int virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes, + unsigned int *out_bytes, unsigned max_in_bytes, + unsigned max_out_bytes); void virtio_notify_irqfd(VirtIODevice *vdev, VirtQueue *vq); void virtio_notify(VirtIODevice *vdev, VirtQueue *vq); @@ -309,6 +313,15 @@ int virtio_queue_ready(VirtQueue *vq); int virtio_queue_empty(VirtQueue *vq); +/** + * Enable notification and check whether guest has added some + * buffers since last call to virtqueue_get_avail_bytes. + * + * @opaque: value returned from virtqueue_get_avail_bytes + */ +bool virtio_queue_enable_notification_and_check(VirtQueue *vq, + int opaque); + /* Host binding interface. */ uint32_t virtio_config_readb(VirtIODevice *vdev, uint32_t addr); -- Gitee From f698e21192b07335197e8a20032cbb411715775a Mon Sep 17 00:00:00 2001 From: gubin Date: Sat, 11 Jan 2025 10:37:12 +0800 Subject: [PATCH 596/939] target/hexagon: don't look for static glib cherry-pick from fe68cc0923ebfa0c12e4176f61ec9b363a07a73a When cross compiling QEMU configured with --static, I've been getting configure errors like the following: Build-time dependency glib-2.0 found: NO ../target/hexagon/meson.build:303:15: ERROR: Dependency lookup for glib-2.0 with method 'pkgconfig' failed: Could not generate libs for glib-2.0: Package libpcre2-8 was not found in the pkg-config search path. Perhaps you should add the directory containing `libpcre2-8.pc' to the PKG_CONFIG_PATH environment variable Package 'libpcre2-8', required by 'glib-2.0', not found This happens because --static sets the prefer_static Meson option, but my build machine doesn't have a static libpcre2. I don't think it makes sense to insist that native dependencies are static, just because I want the non-native QEMU binaries to be static. 
Signed-off-by: Alyssa Ross Link: https://lore.kernel.org/r/20240805104921.4035256-1-hi@alyssa.is Signed-off-by: Paolo Bonzini Signed-off-by: gubin --- target/hexagon/meson.build | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target/hexagon/meson.build b/target/hexagon/meson.build index da8e608d00..436217f25a 100644 --- a/target/hexagon/meson.build +++ b/target/hexagon/meson.build @@ -188,7 +188,7 @@ if idef_parser_enabled and 'hexagon-linux-user' in target_dirs arguments: ['@INPUT@', '--defines=@OUTPUT1@', '--output=@OUTPUT0@'] ) - glib_dep = dependency('glib-2.0', native: true) + glib_dep = dependency('glib-2.0', native: true, static: false) idef_parser = executable( 'idef-parser', -- Gitee From a7209a19e2d730fed5f52fda44aaa24e8de8a81c Mon Sep 17 00:00:00 2001 From: gubin Date: Sat, 11 Jan 2025 10:46:10 +0800 Subject: [PATCH 597/939] target/riscv/vector_helper.c: set vstart = 0 in GEN_VEXT_VSLIDEUP_VX() cherry-pick from d3646e31ce6d1e02e46e6eabdbc2e637c0cbece7 The helper isn't setting env->vstart = 0 after its execution, as it is expected from every vector instruction that completes successfully. Signed-off-by: Daniel Henrique Barboza Reviewed-by: Richard Henderson Reviewed-by: Alistair Francis Reviewed-by: LIU Zhiwei Message-ID: <20240314175704.478276-2-dbarboza@ventanamicro.com> Signed-off-by: Alistair Francis Signed-off-by: gubin --- target/riscv/vector_helper.c | 1 + 1 file changed, 1 insertion(+) diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c index 42ffd3a68a..e69b68ba43 100644 --- a/target/riscv/vector_helper.c +++ b/target/riscv/vector_helper.c @@ -4770,6 +4770,7 @@ void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \ } \ *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset)); \ } \ + env->vstart = 0; \ /* set tail elements to 1s */ \ vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \ } -- Gitee From a820983749a2d3eebcc36b5a3ae34436fd52db45 Mon Sep 17 00:00:00 2001 From: gubin Date: Sat, 11 Jan 2025 10:54:33 +0800 Subject: [PATCH 598/939] target/riscv/vector_helper.c: optimize loops in ldst helpers cherry-pick from 0a11629c915f61df798919db51a18ffe4649cb65 Change the for loops in ldst helpers to do a single increment in the counter, and assign it env->vstart, to avoid re-reading from vstart every time. 
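For illustration only (a minimal sketch outside QEMU, not the patch itself), this loop shape keeps a persistent cursor, vstart in the vector helpers, in sync with the loop index through a single update per iteration, so an interrupted loop can resume where it stopped:

    #include <stdio.h>

    struct ctx {
        unsigned start;                 /* progress cursor, survives an interruption */
    };

    static void process(struct ctx *c, unsigned len)
    {
        for (unsigned i = c->start; i < len; c->start = ++i) {
            printf("element %u\n", i);  /* a real helper may fault/exit mid-loop */
        }
        c->start = 0;                   /* finished: reset, as the helpers do with vstart */
    }

    int main(void)
    {
        struct ctx c = { .start = 2 };
        process(&c, 5);                 /* picks up at element 2, prints 2, 3, 4 */
        return 0;
    }
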
Suggested-by: Richard Henderson Signed-off-by: Daniel Henrique Barboza Reviewed-by: Alistair Francis Reviewed-by: Richard Henderson Message-ID: <20240314175704.478276-11-dbarboza@ventanamicro.com> Signed-off-by: Alistair Francis Signed-off-by: gubin --- target/riscv/vector_helper.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c index 42ffd3a68a..b5acf81cc0 100644 --- a/target/riscv/vector_helper.c +++ b/target/riscv/vector_helper.c @@ -196,7 +196,7 @@ vext_ldst_stride(void *vd, void *v0, target_ulong base, uint32_t esz = 1 << log2_esz; uint32_t vma = vext_vma(desc); - for (i = env->vstart; i < env->vl; i++, env->vstart++) { + for (i = env->vstart; i < env->vl; env->vstart = ++i) { k = 0; while (k < nf) { if (!vm && !vext_elem_mask(v0, i)) { @@ -262,7 +262,7 @@ vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc, uint32_t esz = 1 << log2_esz; /* load bytes from guest memory */ - for (i = env->vstart; i < evl; i++, env->vstart++) { + for (i = env->vstart; i < evl; env->vstart = ++i) { k = 0; while (k < nf) { target_ulong addr = base + ((i * nf + k) << log2_esz); @@ -376,7 +376,7 @@ vext_ldst_index(void *vd, void *v0, target_ulong base, uint32_t vma = vext_vma(desc); /* load bytes from guest memory */ - for (i = env->vstart; i < env->vl; i++, env->vstart++) { + for (i = env->vstart; i < env->vl; env->vstart = ++i) { k = 0; while (k < nf) { if (!vm && !vext_elem_mask(v0, i)) { -- Gitee From 1c6b234766bae8c2b518cfd882e8907b831d8d03 Mon Sep 17 00:00:00 2001 From: gubin Date: Sat, 11 Jan 2025 11:10:29 +0800 Subject: [PATCH 599/939] target/riscv/vector_helper.c: fix 'vmvr_v' memcpy endianess cherry-pick from 768e7b329c0be22035da077fe76221dd0a47103b vmvr_v isn't handling the case where the host might be big endian and the bytes to be copied aren't sequential. Suggested-by: Richard Henderson Fixes: f714361ed7 ("target/riscv: rvv-1.0: implement vstart CSR") Signed-off-by: Daniel Henrique Barboza Reviewed-by: Alistair Francis Reviewed-by: LIU Zhiwei Reviewed-by: Richard Henderson Message-ID: <20240314175704.478276-4-dbarboza@ventanamicro.com> Signed-off-by: Alistair Francis Signed-off-by: gubin --- target/riscv/vector_helper.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c index 42ffd3a68a..351842f66a 100644 --- a/target/riscv/vector_helper.c +++ b/target/riscv/vector_helper.c @@ -5063,9 +5063,17 @@ void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc) uint32_t startb = env->vstart * sewb; uint32_t i = startb; + if (HOST_BIG_ENDIAN && i % 8 != 0) { + uint32_t j = ROUND_UP(i, 8); + memcpy((uint8_t *)vd + H1(j - 1), + (uint8_t *)vs2 + H1(j - 1), + j - i); + i = j; + } + memcpy((uint8_t *)vd + H1(i), (uint8_t *)vs2 + H1(i), - maxsz - startb); + maxsz - i); env->vstart = 0; } -- Gitee From 4ca8ac93bd2c328c80841540b3b5e297ff24d3c9 Mon Sep 17 00:00:00 2001 From: qihao_yewu Date: Wed, 5 Feb 2025 06:02:50 -0500 Subject: [PATCH 600/939] hw/usb/hcd-ehci: Fix debug printf format string MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cheery-pick from a40b5f32867294b7c855d2e4b98a4c2d32b3be28 The variable is uint64_t so needs %PRIu64 instead of %d. 
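For illustration only (not part of the patch), the format-string rule being applied: a uint64_t argument needs the PRIu64 macro from <inttypes.h>, while plain %d expects an int and is undefined for a 64-bit argument:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t skipped_uframes = 123456789012ULL;

        printf("WARNING - EHCI skipped %" PRIu64 " uframes\n", skipped_uframes);
        return 0;
    }
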
Fixes: 3ae7eb88c47 ("ehci: fix overflow in frame timer code") Signed-off-by: BALATON Zoltan Reviewed-by: Peter Maydell Reviewed-by: Philippe Mathieu-Daudé Message-ID: <20250124124713.64F8C4E6031@zero.eik.bme.hu> Signed-off-by: Philippe Mathieu-Daudé Signed-off-by: qihao_yewu --- hw/usb/hcd-ehci.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hw/usb/hcd-ehci.c b/hw/usb/hcd-ehci.c index 7b093acd98..fa8c7af5c8 100644 --- a/hw/usb/hcd-ehci.c +++ b/hw/usb/hcd-ehci.c @@ -2287,7 +2287,8 @@ static void ehci_work_bh(void *opaque) ehci_update_frindex(ehci, skipped_uframes); ehci->last_run_ns += UFRAME_TIMER_NS * skipped_uframes; uframes -= skipped_uframes; - DPRINTF("WARNING - EHCI skipped %d uframes\n", skipped_uframes); + DPRINTF("WARNING - EHCI skipped %"PRIu64" uframes\n", + skipped_uframes); } for (i = 0; i < uframes; i++) { -- Gitee From 44cf15f26215a07876d78d8ee63f0fb10ce2d1d4 Mon Sep 17 00:00:00 2001 From: qihao_yewu Date: Wed, 5 Feb 2025 07:07:13 -0500 Subject: [PATCH 601/939] parallels: fix ext_off assertion failure due to overflow cheery-pick from 58607752d173438994d28dea7e2c2587726663e6 This error was discovered by fuzzing qemu-img. When ph.ext_off has a sufficiently large value, the operation le64_to_cpu(ph.ext_off) << BDRV_SECTOR_BITS in parallels_read_format_extension() can cause an overflow in int64_t. This overflow triggers the assert(ext_off > 0) check in block/parallels-ext.c: parallels_read_format_extension(), leading to a crash. This commit adds a check to prevent overflow when shifting ph.ext_off by BDRV_SECTOR_BITS, ensuring that the value remains within a valid range. Reported-by: Leonid Reviakin Signed-off-by: Denis Rastyogin Reviewed-by: Denis V. Lunev Message-ID: <20241212104212.513947-2-gerben@altlinux.org> Signed-off-by: Stefan Hajnoczi Signed-off-by: qihao_yewu --- block/parallels.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/block/parallels.c b/block/parallels.c index 9205a0864f..8f2b58e1c9 100644 --- a/block/parallels.c +++ b/block/parallels.c @@ -1298,6 +1298,10 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags, error_setg(errp, "Catalog too large"); return -EFBIG; } + if (le64_to_cpu(ph.ext_off) >= (INT64_MAX >> BDRV_SECTOR_BITS)) { + error_setg(errp, "Invalid image: Too big offset"); + return -EFBIG; + } size = bat_entry_off(s->bat_size); s->header_size = ROUND_UP(size, bdrv_opt_mem_align(bs->file->bs)); -- Gitee From c5a859ec02af99574dfac2e5cfab9570345eb2e4 Mon Sep 17 00:00:00 2001 From: qihao_yewu Date: Wed, 5 Feb 2025 08:04:10 -0500 Subject: [PATCH 602/939] backends/cryptodev-vhost-user: Fix local_error leaks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cheery-pick from 78b0c15a563ac4be5afb0375602ca0a3adc6c442 Do not propagate error to the upper, directly output the error to avoid leaks. 
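For illustration only (the helper names below are made up; only the Error API calls are real), this is the leak pattern being fixed: an Error allocated with error_setg() into a local pointer must be either propagated to the caller or consumed, so setting it and simply returning leaks it, which is why the fix reports the message directly instead:

    #include "qemu/osdep.h"
    #include "qapi/error.h"
    #include "qemu/error-report.h"

    /* Leaky variant: local_err is allocated but never freed or propagated. */
    static int handle_op_leaky(uint32_t op_code)
    {
        Error *local_err = NULL;

        error_setg(&local_err, "Unsupported opcode: %" PRIu32, op_code);
        return -1;                  /* local_err leaks here */
    }

    /* Fixed variant, matching the patch: report the error directly. */
    static int handle_op_fixed(uint32_t op_code)
    {
        error_report("Unsupported opcode: %" PRIu32, op_code);
        return -1;
    }
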
Fixes: 2fda101de07 ("virtio-crypto: Support asynchronous mode") Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2714 Signed-off-by: Gabriel Barrantes Reviewed-by: zhenwei pi Message-Id: Signed-off-by: Philippe Mathieu-Daudé Signed-off-by: qihao_yewu --- backends/cryptodev-vhost-user.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/backends/cryptodev-vhost-user.c b/backends/cryptodev-vhost-user.c index c3283ba84a..b8e95ca8b4 100644 --- a/backends/cryptodev-vhost-user.c +++ b/backends/cryptodev-vhost-user.c @@ -281,8 +281,7 @@ static int cryptodev_vhost_user_create_session( break; default: - error_setg(&local_error, "Unsupported opcode :%" PRIu32 "", - sess_info->op_code); + error_report("Unsupported opcode :%" PRIu32 "", sess_info->op_code); return -VIRTIO_CRYPTO_NOTSUPP; } -- Gitee From da6ee14de85b4e619eedfbe3a6cac3f09d948589 Mon Sep 17 00:00:00 2001 From: nonce <2774337358@qq.com> Date: Thu, 23 Jan 2025 21:03:10 +0800 Subject: [PATCH 603/939] bakcend: VirtCCA:resolve hugepage memory waste issue in vhost-user scenario VirtCCA is based on SWIOTLB to implement virtio and will only allocate Bounce Buffer in the lower address range below 4GB. Therefore, the backend hugepages memory allocated above 4GB will not be used, resulting in significant waste. New address space and memory region are added to manage the backend hugepages memory corresponding to the GPA below 4GB, and there are shared with the vhostuser backend. Signed-off-by: nonce0_0 <2774337358@qq.com> --- backends/hostmem-file.c | 85 +++++++++++++++++++++++++++++++++++ hw/core/numa.c | 20 +++++++++ hw/virtio/vhost.c | 8 +++- include/exec/address-spaces.h | 3 ++ include/exec/cpu-common.h | 1 + include/exec/memory.h | 11 +++++ system/physmem.c | 17 +++++++ system/vl.c | 9 ++++ 8 files changed, 153 insertions(+), 1 deletion(-) diff --git a/backends/hostmem-file.c b/backends/hostmem-file.c index 361d4a8103..891fe4ac4a 100644 --- a/backends/hostmem-file.c +++ b/backends/hostmem-file.c @@ -20,9 +20,13 @@ #include "qom/object.h" #include "qapi/visitor.h" #include "qapi/qapi-visit-common.h" +#include "sysemu/kvm.h" +#include "exec/address-spaces.h" OBJECT_DECLARE_SIMPLE_TYPE(HostMemoryBackendFile, MEMORY_BACKEND_FILE) +bool virtcca_shared_hugepage_mapped = false; +uint64_t virtcca_cvm_ram_size = 0; struct HostMemoryBackendFile { HostMemoryBackend parent_obj; @@ -36,6 +40,83 @@ struct HostMemoryBackendFile { OnOffAuto rom; }; +/* Parse the path of the hugepages memory file used for memory sharing */ +static int virtcca_parse_share_mem_path(char *src, char *dst) +{ + int ret = 0; + char src_copy[PATH_MAX]; + char *token = NULL; + char *last_dir = NULL; + char *second_last_dir = NULL; + static const char delimiter[] = "/"; + + if (src == NULL || dst == NULL || + strlen(src) == 0 || strlen(src) > PATH_MAX - 1) { + error_report("Invalid input: NULL pointer or invalid string length."); + return -1; + } + + strcpy(src_copy, src); + token = strtok(src_copy, delimiter); + + /* Iterate over the path segments to find the second-to-last directory */ + while (token != NULL) { + second_last_dir = last_dir; + last_dir = token; + token = strtok(NULL, delimiter); + } + + /* Check if the second-to-last directory is found */ + if (second_last_dir == NULL) { + error_report("Invalid path: second-to-last directory not found."); + return -1; + } + + /* + * Construct the share memory path by appending the extracted domain name + * to the hugepages memory filesystem prefix + */ + ret = snprintf(dst, PATH_MAX, 
"/dev/hugepages/libvirt/qemu/%s", + second_last_dir); + + if (ret < 0 || ret >= PATH_MAX) { + error_report("Error: snprintf failed to construct the share mem path"); + return -1; + } + + return 0; +} + +/* + * Create a hugepage memory region in the virtcca scenario + * for sharing with process like vhost-user and others. + */ +static void +virtcca_shared_backend_memory_alloc(char *mem_path, uint32_t ram_flags, Error **errp) +{ + char dst[PATH_MAX]; + uint64_t size = virtcca_cvm_ram_size; + + if (virtcca_parse_share_mem_path(mem_path, dst)) { + error_report("parse virtcca share memory path failed"); + exit(1); + } + if (virtcca_cvm_ram_size >= VIRTCCA_SHARED_HUGEPAGE_MAX_SIZE) { + size = VIRTCCA_SHARED_HUGEPAGE_MAX_SIZE; + } + + virtcca_shared_hugepage = g_new(MemoryRegion, 1); + memory_region_init_ram_from_file(virtcca_shared_hugepage, NULL, + "virtcca_shared_hugepage", size, + VIRTCCA_SHARED_HUGEPAGE_ALIGN, + ram_flags, dst, 0, errp); + if (*errp) { + error_reportf_err(*errp, "cannot init RamBlock for virtcca_shared_hugepage: "); + exit(1); + } + virtcca_shared_hugepage_mapped = true; +} + static void file_backend_memory_alloc(HostMemoryBackend *backend, Error **errp) { @@ -90,6 +171,10 @@ file_backend_memory_alloc(HostMemoryBackend *backend, Error **errp) backend->size, fb->align, ram_flags, fb->mem_path, fb->offset, errp); g_free(name); + + if (virtcca_cvm_enabled() && backend->share && !virtcca_shared_hugepage_mapped) { + virtcca_shared_backend_memory_alloc(fb->mem_path, ram_flags, errp); + } #endif } diff --git a/hw/core/numa.c b/hw/core/numa.c index f08956ddb0..e7c48dab61 100644 --- a/hw/core/numa.c +++ b/hw/core/numa.c @@ -42,6 +42,8 @@ #include "qemu/option.h" #include "qemu/config-file.h" #include "qemu/cutils.h" +#include "exec/address-spaces.h" +#include "sysemu/kvm.h" QemuOptsList qemu_numa_opts = { .name = "numa", @@ -641,6 +643,21 @@ static void numa_init_memdev_container(MachineState *ms, MemoryRegion *ram) } } +/* + * Add virtcca_shared_hugepage as a sub-MR to the root MR of address space + * address_space_memory and address_space_virtcca_shared_memory. + */ +static void virtcca_shared_memory_configuration(MachineState *ms) +{ + MemoryRegion *alias_mr = g_new(MemoryRegion, 1); + + memory_region_add_subregion_overlap(ms->ram, 0, virtcca_shared_hugepage, 1); + memory_region_init_alias(alias_mr, NULL, "alias-mr", virtcca_shared_hugepage, + 0, int128_get64(virtcca_shared_hugepage->size)); + memory_region_add_subregion(address_space_virtcca_shared_memory.root, + VIRTCCA_GPA_START, alias_mr); +} + void numa_complete_configuration(MachineState *ms) { int i; @@ -711,6 +728,9 @@ void numa_complete_configuration(MachineState *ms) memory_region_init(ms->ram, OBJECT(ms), mc->default_ram_id, ms->ram_size); numa_init_memdev_container(ms, ms->ram); + if (virtcca_cvm_enabled() && virtcca_shared_hugepage->ram_block) { + virtcca_shared_memory_configuration(ms); + } } /* QEMU needs at least all unique node pair distances to build * the whole NUMA distance table. 
QEMU treats the distance table diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index d29075aa04..8b95558013 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -30,6 +30,7 @@ #include "sysemu/dma.h" #include "trace.h" #include "qapi/qapi-commands-migration.h" +#include "sysemu/kvm.h" /* enabled until disconnected backend stabilizes */ #define _VHOST_DEBUG 1 @@ -1616,7 +1617,12 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque, hdev->log_size = 0; hdev->log_enabled = false; hdev->started = false; - memory_listener_register(&hdev->memory_listener, &address_space_memory); + if (virtcca_cvm_enabled()) { + memory_listener_register(&hdev->memory_listener, + &address_space_virtcca_shared_memory); + } else { + memory_listener_register(&hdev->memory_listener, &address_space_memory); + } QLIST_INSERT_HEAD(&vhost_devices, hdev, entry); /* diff --git a/include/exec/address-spaces.h b/include/exec/address-spaces.h index 0d0aa61d68..4518b5da86 100644 --- a/include/exec/address-spaces.h +++ b/include/exec/address-spaces.h @@ -33,6 +33,9 @@ MemoryRegion *get_system_io(void); extern AddressSpace address_space_memory; extern AddressSpace address_space_io; +extern AddressSpace address_space_virtcca_shared_memory; + +extern MemoryRegion *virtcca_shared_hugepage; #endif diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h index c7fd30d5b9..d21d9990ad 100644 --- a/include/exec/cpu-common.h +++ b/include/exec/cpu-common.h @@ -28,6 +28,7 @@ typedef uint64_t vaddr; void cpu_exec_init_all(void); void cpu_exec_step_atomic(CPUState *cpu); +void virtcca_shared_memory_address_space_init(void); /* Using intptr_t ensures that qemu_*_page_mask is sign-extended even * when intptr_t is 32-bit and we are aligning a long long. diff --git a/include/exec/memory.h b/include/exec/memory.h index 542c9da918..33778f5c64 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -243,6 +243,17 @@ typedef struct IOMMUTLBEvent { /* RAM FD is opened read-only */ #define RAM_READONLY_FD (1 << 11) +/* The GPA range of the VirtCCA bounce buffer is from 1GB to 4GB. */ +#define VIRTCCA_SHARED_HUGEPAGE_MAX_SIZE 0xc0000000ULL + +/* The VirtCCA shared hugepage memory granularity is 1GB */ +#define VIRTCCA_SHARED_HUGEPAGE_ALIGN 0x40000000ULL + +/* The GPA starting address of the VirtCCA CVM is 1GB */ +#define VIRTCCA_GPA_START 0x40000000ULL + +extern uint64_t virtcca_cvm_ram_size; + static inline void iommu_notifier_init(IOMMUNotifier *n, IOMMUNotify fn, IOMMUNotifierFlag flags, hwaddr start, hwaddr end, diff --git a/system/physmem.c b/system/physmem.c index 250f315bc8..8f4be2d131 100644 --- a/system/physmem.c +++ b/system/physmem.c @@ -89,9 +89,17 @@ RAMList ram_list = { .blocks = QLIST_HEAD_INITIALIZER(ram_list.blocks) }; static MemoryRegion *system_memory; static MemoryRegion *system_io; +static MemoryRegion *virtcca_shared_memory; + +/* + * Serves as the sub-MR of the root MR (virtcca_shared_memory) + * and is associated with the RAMBlock. 
+ */ +MemoryRegion *virtcca_shared_hugepage; AddressSpace address_space_io; AddressSpace address_space_memory; +AddressSpace address_space_virtcca_shared_memory; static MemoryRegion io_mem_unassigned; @@ -2586,6 +2594,15 @@ static void memory_map_init(void) address_space_init(&address_space_io, system_io, "I/O"); } +void virtcca_shared_memory_address_space_init(void) +{ + virtcca_shared_memory = g_malloc(sizeof(*virtcca_shared_memory)); + memory_region_init(virtcca_shared_memory, NULL, + "virtcca_shared_memory", UINT64_MAX); + address_space_init(&address_space_virtcca_shared_memory, + virtcca_shared_memory, "virtcca_shared_memory"); +} + MemoryRegion *get_system_memory(void) { return system_memory; diff --git a/system/vl.c b/system/vl.c index a1e5e68773..7c10cd1337 100644 --- a/system/vl.c +++ b/system/vl.c @@ -3784,6 +3784,15 @@ void qemu_init(int argc, char **argv) configure_accelerators(argv[0]); phase_advance(PHASE_ACCEL_CREATED); + /* + * Must run after kvm_init completes, as virtcca_cvm_enabled() + * depends on initialization performed in kvm_init. + */ + if (virtcca_cvm_enabled()) { + virtcca_cvm_ram_size = current_machine->ram_size; + virtcca_shared_memory_address_space_init(); + } + /* * Beware, QOM objects created before this point miss global and * compat properties. -- Gitee From 10f5fa07068f54b23b01bf875259dc1a259d66b4 Mon Sep 17 00:00:00 2001 From: hanliyang Date: Fri, 2 Aug 2024 01:35:25 +0800 Subject: [PATCH 604/939] qapi/qom,target/i386: csv-guest: Introduce secret-header-file=str and secret-file=str options This feature only applied to Hygon CSV. User can utilize the hag to generate secret header file and secret file, and inject these data to guest encrypted secret area automatically. Signed-off-by: hanliyang --- qapi/qom.json | 9 ++++- qemu-options.hx | 8 +++- target/i386/sev.c | 100 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 115 insertions(+), 2 deletions(-) diff --git a/qapi/qom.json b/qapi/qom.json index 51d9daf55a..a74c7a91f9 100644 --- a/qapi/qom.json +++ b/qapi/qom.json @@ -869,6 +869,11 @@ # @user-id: the user id of the guest owner, only support on Hygon CPUs # (since 8.2) # +# @secret-header-file: the header file of guest owner's secret, only +# support on Hygon CPUs (since 8.2) +# @secret-file: the file guest owner's secret, only support on Hygon +# CPUs (since 8.2) +# # Since: 2.12 ## { 'struct': 'SevGuestProperties', @@ -880,7 +885,9 @@ '*cbitpos': 'uint32', 'reduced-phys-bits': 'uint32', '*kernel-hashes': 'bool', - '*user-id': 'str' } } + '*user-id': 'str', + '*secret-header-file': 'str', + '*secret-file': 'str' } } ## # @ThreadContextProperties: diff --git a/qemu-options.hx b/qemu-options.hx index 51ba9378b9..8516b73206 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -5637,7 +5637,7 @@ SRST -object secret,id=sec0,keyid=secmaster0,format=base64,\\ data=$SECRET,iv=$(user_id = g_strdup(value); } +static char * +sev_guest_get_secret_header_file(Object *obj, Error **errp) +{ + SevGuestState *s = SEV_GUEST(obj); + + return g_strdup(s->secret_header_file); +} + +static void +sev_guest_set_secret_header_file(Object *obj, const char *value, Error **errp) +{ + SevGuestState *s = SEV_GUEST(obj); + + s->secret_header_file = g_strdup(value); +} + +static char * +sev_guest_get_secret_file(Object *obj, Error **errp) +{ + SevGuestState *s = SEV_GUEST(obj); + + return g_strdup(s->secret_file); +} + +static void +sev_guest_set_secret_file(Object *obj, const char *value, Error **errp) +{ + SevGuestState *s = SEV_GUEST(obj); + + s->secret_file = 
g_strdup(value); +} + static char * sev_guest_get_sev_device(Object *obj, Error **errp) { @@ -448,6 +482,16 @@ sev_guest_class_init(ObjectClass *oc, void *data) sev_guest_set_user_id); object_class_property_set_description(oc, "user-id", "user id of the guest owner"); + object_class_property_add_str(oc, "secret-header-file", + sev_guest_get_secret_header_file, + sev_guest_set_secret_header_file); + object_class_property_set_description(oc, "secret-header-file", + "header file of the guest owner's secret"); + object_class_property_add_str(oc, "secret-file", + sev_guest_get_secret_file, + sev_guest_set_secret_file); + object_class_property_set_description(oc, "secret-file", + "file of the guest owner's secret"); } static void @@ -867,6 +911,9 @@ sev_launch_update_vmsa(SevGuestState *sev) return ret; } +static int +csv_load_launch_secret(const char *secret_header_file, const char *secret_file); + static void sev_launch_get_measure(Notifier *notifier, void *unused) { @@ -917,6 +964,15 @@ sev_launch_get_measure(Notifier *notifier, void *unused) /* encode the measurement value and emit the event */ sev->measurement = g_base64_encode(data, measurement.len); trace_kvm_sev_launch_measurement(sev->measurement); + + /* Hygon CSV will auto load guest owner's secret */ + if (is_hygon_cpu()) { + if (sev->secret_header_file && + strlen(sev->secret_header_file) && + sev->secret_file && + strlen(sev->secret_file)) + csv_load_launch_secret(sev->secret_header_file, sev->secret_file); + } } static char *sev_get_launch_measurement(void) @@ -2526,6 +2582,50 @@ int csv_load_incoming_cpu_state(QEMUFile *f) return ret; } +static int +csv_load_launch_secret(const char *secret_header_file, const char *secret_file) +{ + gsize secret_header_size, secret_size; + gchar *secret_header = NULL, *secret = NULL; + uint8_t *data; + struct sev_secret_area *area; + uint64_t gpa; + GError *error = NULL; + Error *local_err = NULL; + int ret = 0; + + if (!g_file_get_contents(secret_header_file, + &secret_header, + &secret_header_size, &error)) { + error_report("CSV: Failed to read '%s' (%s)", + secret_header_file, error->message); + g_error_free(error); + return -1; + } + + if (!g_file_get_contents(secret_file, &secret, &secret_size, &error)) { + error_report("CSV: Failed to read '%s' (%s)", secret_file, error->message); + g_error_free(error); + return -1; + } + + if (!pc_system_ovmf_table_find(SEV_SECRET_GUID, &data, NULL)) { + error_report("CSV: no secret area found in OVMF, gpa must be" + " specified."); + return -1; + } + area = (struct sev_secret_area *)data; + gpa = area->base; + + ret = sev_inject_launch_secret((char *)secret_header, + (char *)secret, gpa, &local_err); + + if (local_err) { + error_report_err(local_err); + } + return ret; +} + static const QemuUUID sev_hash_table_header_guid = { .data = UUID_LE(0x9438d606, 0x4f22, 0x4cc9, 0xb4, 0x79, 0xa7, 0x93, 0xd4, 0x11, 0xfd, 0x21) -- Gitee From 9eb75830e70638d12efa0ec15a2f8b55e7c905da Mon Sep 17 00:00:00 2001 From: hanliyang Date: Sat, 28 Sep 2024 14:46:28 +0800 Subject: [PATCH 605/939] target/i386: kvm: Support to get and enable extensions for Hygon CoCo guest To enable advanced Hygon CoCo features, we should detect these features during the initialization of VMs in the KVM accelerator. It is suggested to enable these features if they are detected, allowing the guest VM to run with additional functionalities. 
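For reference, later patches gate individual features on the mask that is
negotiated here; a minimal caller-side sketch (the specific bit is only an
example, taken from later patches in this series) is:

    /* Sketch: act on an extension only if KVM actually enabled it. */
    if (kvm_hygon_coco_ext_inuse & KVM_CAP_HYGON_COCO_EXT_CSV3_INJ_SECRET) {
        /* the CSV3-specific handling is available */
    } else {
        /* fall back to the pre-extension behaviour */
    }
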
Signed-off-by: hanliyang --- linux-headers/linux/kvm.h | 7 +++++++ target/i386/csv.c | 2 ++ target/i386/csv.h | 2 ++ target/i386/kvm/csv-stub.c | 2 ++ target/i386/kvm/kvm.c | 17 +++++++++++++++++ 5 files changed, 30 insertions(+) diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h index 05e499b45b..ab28e9af5e 100644 --- a/linux-headers/linux/kvm.h +++ b/linux-headers/linux/kvm.h @@ -1204,6 +1204,13 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_ARM_TMM 300 #define KVM_CAP_SEV_ES_GHCB 500 +#define KVM_CAP_HYGON_COCO_EXT 501 +/* support userspace to request firmware to build CSV3 guest's memory space */ +#define KVM_CAP_HYGON_COCO_EXT_CSV3_SET_PRIV_MEM (1 << 0) +/* support request to update CSV3 guest's memory region multiple times */ +#define KVM_CAP_HYGON_COCO_EXT_CSV3_MULT_LUP_DATA (1 << 1) +/* support request to inject secret to CSV3 guest */ +#define KVM_CAP_HYGON_COCO_EXT_CSV3_INJ_SECRET (1 << 2) #define KVM_CAP_ARM_VIRT_MSI_BYPASS 799 diff --git a/target/i386/csv.c b/target/i386/csv.c index 571beeb61f..4aed225763 100644 --- a/target/i386/csv.c +++ b/target/i386/csv.c @@ -34,6 +34,8 @@ #include "csv.h" bool csv_kvm_cpu_reset_inhibit; +uint32_t kvm_hygon_coco_ext; +uint32_t kvm_hygon_coco_ext_inuse; struct ConfidentialGuestMemoryEncryptionOps csv3_memory_encryption_ops = { .save_setup = sev_save_setup, diff --git a/target/i386/csv.h b/target/i386/csv.h index 8621f0b6fd..c1d4cec3e0 100644 --- a/target/i386/csv.h +++ b/target/i386/csv.h @@ -58,6 +58,8 @@ bool csv3_enabled(void); #define CSV_OUTGOING_PAGE_WINDOW_SIZE (4094 * TARGET_PAGE_SIZE) extern bool csv_kvm_cpu_reset_inhibit; +extern uint32_t kvm_hygon_coco_ext; +extern uint32_t kvm_hygon_coco_ext_inuse; typedef struct CsvBatchCmdList CsvBatchCmdList; typedef void (*CsvDestroyCmdNodeFn) (void *data); diff --git a/target/i386/kvm/csv-stub.c b/target/i386/kvm/csv-stub.c index 4d1376f268..8662d33206 100644 --- a/target/i386/kvm/csv-stub.c +++ b/target/i386/kvm/csv-stub.c @@ -15,3 +15,5 @@ #include "csv.h" bool csv_kvm_cpu_reset_inhibit; +uint32_t kvm_hygon_coco_ext; +uint32_t kvm_hygon_coco_ext_inuse; diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index 925f4f8040..12e920bbb4 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -2639,6 +2639,23 @@ int kvm_arch_init(MachineState *ms, KVMState *s) } } + if (is_hygon_cpu()) { + /* check and enable Hygon coco extensions */ + kvm_hygon_coco_ext = (uint32_t)kvm_vm_check_extension(s, + KVM_CAP_HYGON_COCO_EXT); + if (kvm_hygon_coco_ext) { + ret = kvm_vm_enable_cap(s, KVM_CAP_HYGON_COCO_EXT, 0, + (uint64_t)kvm_hygon_coco_ext); + if (ret == -EINVAL) { + error_report("kvm: Failed to enable KVM_CAP_HYGON_COCO_EXT cap: %s", + strerror(-ret)); + kvm_hygon_coco_ext_inuse = 0; + } else { + kvm_hygon_coco_ext_inuse = (uint32_t)ret; + } + } + } + ret = kvm_get_supported_msrs(s); if (ret < 0) { return ret; -- Gitee From ded4216fbfe740196a3ace80f5cb162b73f676b2 Mon Sep 17 00:00:00 2001 From: hanliyang Date: Sat, 28 Sep 2024 17:37:17 +0800 Subject: [PATCH 606/939] target/i386: csv: Request to set private memory of CSV3 guest if the extension is enabled If Qemu negotiates with Linux KVM to enable the KVM_CAP_HYGON_COCO_EXT_CSV3_SET_PRIV_MEM capability, then Qemu should explicitly request the issuance of the CSV3_CMD_SET_GUEST_PRIVATE_MEMORY command. 
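The resulting call pattern in the firmware setup path is roughly the
following (illustrative sketch of the hunk added below):

    /* Issue the CSV3 command only when the capability was negotiated. */
    if (kvm_csv3_should_set_priv_mem()) {
        csv3_set_guest_private_memory(&error_fatal);
    }
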
Signed-off-by: hanliyang --- hw/i386/pc_sysfw.c | 3 +++ include/sysemu/kvm.h | 9 +++++++++ linux-headers/linux/kvm.h | 2 ++ target/i386/csv-sysemu-stub.c | 5 +++++ target/i386/csv.c | 23 +++++++++++++++++++++++ target/i386/csv.h | 2 ++ target/i386/trace-events | 3 ++- 7 files changed, 46 insertions(+), 1 deletion(-) diff --git a/hw/i386/pc_sysfw.c b/hw/i386/pc_sysfw.c index 2bbcbb8d35..7c6a910250 100644 --- a/hw/i386/pc_sysfw.c +++ b/hw/i386/pc_sysfw.c @@ -268,6 +268,9 @@ void x86_firmware_configure(void *ptr, int size) ram_addr_t offset = 0; MemoryRegion *mr; + if (kvm_csv3_should_set_priv_mem()) + csv3_set_guest_private_memory(&error_fatal); + mr = memory_region_from_host(ptr, &offset); if (!mr) { error_report("failed to get memory region of flash"); diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index 438b4e9183..176aa53cbe 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -154,6 +154,14 @@ extern bool kvm_csv3_allowed; */ #define kvm_csv3_enabled() (kvm_csv3_allowed) +/** + * kvm_csv3_should_set_priv_mem: + * Returns: true if we should explicitly request + * KVM_CSV3_SET_GUEST_PRIVATE_MEMORY. + */ +#define kvm_csv3_should_set_priv_mem() \ + (kvm_hygon_coco_ext_inuse & KVM_CAP_HYGON_COCO_EXT_CSV3_SET_PRIV_MEM) + #else #define kvm_enabled() (0) @@ -171,6 +179,7 @@ extern bool kvm_csv3_allowed; #define kvm_readonly_mem_enabled() (false) #define kvm_msi_devid_required() (false) #define kvm_csv3_enabled() (false) +#define kvm_csv3_should_set_priv_mem() (false) #endif /* CONFIG_KVM_IS_POSSIBLE */ diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h index ab28e9af5e..84cec64b88 100644 --- a/linux-headers/linux/kvm.h +++ b/linux-headers/linux/kvm.h @@ -2133,6 +2133,8 @@ enum csv3_cmd_id { KVM_CSV3_RECEIVE_ENCRYPT_DATA, KVM_CSV3_RECEIVE_ENCRYPT_CONTEXT, + KVM_CSV3_SET_GUEST_PRIVATE_MEMORY = 0xc8, + KVM_CSV3_NR_MAX, }; diff --git a/target/i386/csv-sysemu-stub.c b/target/i386/csv-sysemu-stub.c index db22c299a6..e49755da5c 100644 --- a/target/i386/csv-sysemu-stub.c +++ b/target/i386/csv-sysemu-stub.c @@ -39,3 +39,8 @@ void csv3_shared_region_dma_unmap(uint64_t start, uint64_t end) { } + +int csv3_set_guest_private_memory(Error **errp) +{ + g_assert_not_reached(); +} diff --git a/target/i386/csv.c b/target/i386/csv.c index 4aed225763..d9b50040a3 100644 --- a/target/i386/csv.c +++ b/target/i386/csv.c @@ -698,3 +698,26 @@ int csv3_load_incoming_context(QEMUFile *f) /* receive csv3 context. 
*/ return csv3_receive_encrypt_context(s, f); } + +int csv3_set_guest_private_memory(Error **errp) +{ + int fw_error; + int ret = 0; + + if (!csv3_enabled()) { + error_setg(errp, "%s: CSV3 is not enabled", __func__); + return -1; + } + + /* if CSV3 is in update state then load the data to secure memory */ + if (csv3_check_state(SEV_STATE_LAUNCH_UPDATE)) { + trace_kvm_csv3_set_guest_private_memory(); + ret = csv3_ioctl(KVM_CSV3_SET_GUEST_PRIVATE_MEMORY, NULL, &fw_error); + if (ret) + error_setg(errp, "%s: CSV3 fail set private memory, ret=%d" + " fw_error=%d '%s'", + __func__, ret, fw_error, fw_error_to_str(fw_error)); + } + + return ret; +} diff --git a/target/i386/csv.h b/target/i386/csv.h index c1d4cec3e0..fb669279a8 100644 --- a/target/i386/csv.h +++ b/target/i386/csv.h @@ -130,4 +130,6 @@ int csv3_queue_outgoing_page(uint8_t *ptr, uint32_t sz, uint64_t addr); int csv3_save_queued_outgoing_pages(QEMUFile *f, uint64_t *bytes_sent); int csv3_save_outgoing_context(QEMUFile *f, uint64_t *bytes_sent); +int csv3_set_guest_private_memory(Error **errp); + #endif diff --git a/target/i386/trace-events b/target/i386/trace-events index ad3cfb9612..5d4a709a39 100644 --- a/target/i386/trace-events +++ b/target/i386/trace-events @@ -21,8 +21,9 @@ kvm_sev_send_update_vmsa(uint32_t cpu_id, uint32_t cpu_index, void *dst, int len kvm_sev_receive_update_vmsa(uint32_t cpu_id, uint32_t cpu_index, void *src, int len, void *hdr, int hdr_len) "cpu_id %d cpu_index %d trans %p len %d hdr %p hdr_len %d" # csv.c -kvm_csv3_launch_encrypt_data(uint64_t gpa, void *addr, uint64_t len) "gpa 0x%" PRIx64 "addr %p len 0x%" PRIx64 +kvm_csv3_launch_encrypt_data(uint64_t gpa, void *addr, uint64_t len) "gpa 0x%" PRIx64 " addr %p len 0x%" PRIx64 kvm_csv3_send_encrypt_data(void *dst, int len) "trans %p len %d" kvm_csv3_send_encrypt_context(void *dst, int len) "trans %p len %d" kvm_csv3_receive_encrypt_data(void *dst, int len, void *hdr, int hdr_len) "trans %p len %d hdr %p hdr_len %d" kvm_csv3_receive_encrypt_context(void *dst, int len, void *hdr, int hdr_len) "trans %p len %d hdr %p hdr_len %d" +kvm_csv3_set_guest_private_memory(void) "" -- Gitee From ca6d5f032ab4c93d78c90a83beefcfb05bf1ad79 Mon Sep 17 00:00:00 2001 From: hanliyang Date: Sat, 28 Sep 2024 17:55:13 +0800 Subject: [PATCH 607/939] target/i386: csv: Support load kernel hashes for CSV3 guest only if the extension is enabled The CSV3 guest can only update kernel hashes when the KVM_CAP_HYGON_COCO_EXT_CSV3_MULT_LUP_DATA capability is enabled. 
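In outline, the decision added to sev_add_kernel_loader_hashes() is
(sketch of the change below):

    if (csv3_enabled()) {
        if (kvm_hygon_coco_ext_inuse & KVM_CAP_HYGON_COCO_EXT_CSV3_MULT_LUP_DATA) {
            /* load the hashes page into CSV3 private memory */
        } else {
            /* extension missing: report kernel hashes as unsupported */
        }
    } else {
        /* plain SEV/CSV: encrypt the hashes page in place */
    }
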
Signed-off-by: hanliyang --- target/i386/sev.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/target/i386/sev.c b/target/i386/sev.c index 721eca2150..3a9c9ceec7 100644 --- a/target/i386/sev.c +++ b/target/i386/sev.c @@ -2748,7 +2748,17 @@ bool sev_add_kernel_loader_hashes(SevKernelLoaderContext *ctx, Error **errp) /* zero the excess data so the measurement can be reliably calculated */ memset(padded_ht->padding, 0, sizeof(padded_ht->padding)); - if (sev_encrypt_flash((uint8_t *)padded_ht, sizeof(*padded_ht), errp) < 0) { + if (csv3_enabled()) { + if (kvm_hygon_coco_ext_inuse & KVM_CAP_HYGON_COCO_EXT_CSV3_MULT_LUP_DATA) { + if (csv3_load_data(area->base, (uint8_t *)padded_ht, + sizeof(*padded_ht), errp) < 0) { + ret = false; + } + } else { + error_report("%s: CSV3 load kernel hashes unsupported!", __func__); + ret = false; + } + } else if (sev_encrypt_flash((uint8_t *)padded_ht, sizeof(*padded_ht), errp) < 0) { ret = false; } -- Gitee From b74c6b8971610ffc9c901a9b22c92b40084a74bf Mon Sep 17 00:00:00 2001 From: hanliyang Date: Sun, 29 Sep 2024 15:03:47 +0800 Subject: [PATCH 608/939] target/i386: csv: Support inject secret for CSV3 guest only if the extension is enabled The CSV3 guest can only inject secrets when the KVM_CAP_HYGON_COCO_EXT_CSV3_INJ_SECRET capability is enabled. Additionally, if the guest is a CSV3 guest, the guest_uaddr field of the KVM ioctl's input should be set to the value of the GPA. Signed-off-by: hanliyang --- target/i386/sev.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/target/i386/sev.c b/target/i386/sev.c index 3a9c9ceec7..b4b42fd716 100644 --- a/target/i386/sev.c +++ b/target/i386/sev.c @@ -1416,7 +1416,17 @@ int sev_inject_launch_secret(const char *packet_hdr, const char *secret, input.trans_uaddr = (uint64_t)(unsigned long)data; input.trans_len = data_sz; - input.guest_uaddr = (uint64_t)(unsigned long)hva; + /* For Hygon CSV3 guest, the guest_uaddr should be the gpa */ + if (csv3_enabled()) { + if (kvm_hygon_coco_ext_inuse & KVM_CAP_HYGON_COCO_EXT_CSV3_INJ_SECRET) { + input.guest_uaddr = gpa; + } else { + error_setg(errp, "CSV3 inject secret unsupported!"); + return 1; + } + } else { + input.guest_uaddr = (uint64_t)(unsigned long)hva; + } input.guest_len = data_sz; trace_kvm_sev_launch_secret(gpa, input.guest_uaddr, -- Gitee From 166ecdd78a0f5cf359c0cbb4f7a5c32beee12fd7 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:18 +0800 Subject: [PATCH 609/939] vfio: Introduce base object for VFIOContainer and targeted interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce a dumb VFIOContainerBase object and its targeted interface. This is willingly not a QOM object because we don't want it to be visible from the user interface. The VFIOContainerBase will be smoothly populated in subsequent patches as well as interfaces. No functional change intended. 
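A backend is expected to embed the base object and point its ops at a
backend-specific table; the legacy type1 backend does this in the
following patches, roughly as:

    typedef struct VFIOContainer {
        VFIOContainerBase bcontainer;  /* base object, ops set by the backend */
        int fd;
        /* ... type1-specific state ... */
    } VFIOContainer;

Generic code then only dispatches through bcontainer->ops.
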
Signed-off-by: Eric Auger Signed-off-by: Yi Liu Signed-off-by: Yi Sun Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- include/hw/vfio/vfio-common.h | 8 ++--- include/hw/vfio/vfio-container-base.h | 50 +++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 6 deletions(-) create mode 100644 include/hw/vfio/vfio-container-base.h diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index fd9828d50b..c89b5886f2 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -30,6 +30,7 @@ #include #endif #include "sysemu/sysemu.h" +#include "hw/vfio/vfio-container-base.h" #define VFIO_MSG_PREFIX "vfio %s: " @@ -89,6 +90,7 @@ typedef struct VFIODMARange { } VFIODMARange; typedef struct VFIOContainer { + VFIOContainerBase bcontainer; VFIOAddressSpace *space; int fd; /* /dev/vfio/vfio, empowered by the attached groups */ MemoryListener listener; @@ -211,12 +213,6 @@ typedef struct VFIODisplay { } dmabuf; } VFIODisplay; -typedef struct { - unsigned long *bitmap; - hwaddr size; - hwaddr pages; -} VFIOBitmap; - VFIOAddressSpace *vfio_get_address_space(AddressSpace *as); void vfio_put_address_space(VFIOAddressSpace *space); bool vfio_devices_all_running_and_saving(VFIOContainer *container); diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h new file mode 100644 index 0000000000..1d6daaea5d --- /dev/null +++ b/include/hw/vfio/vfio-container-base.h @@ -0,0 +1,50 @@ +/* + * VFIO BASE CONTAINER + * + * Copyright (C) 2023 Intel Corporation. + * Copyright Red Hat, Inc. 2023 + * + * Authors: Yi Liu + * Eric Auger + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef HW_VFIO_VFIO_CONTAINER_BASE_H +#define HW_VFIO_VFIO_CONTAINER_BASE_H + +#include "exec/memory.h" + +typedef struct VFIODevice VFIODevice; +typedef struct VFIOIOMMUOps VFIOIOMMUOps; + +typedef struct { + unsigned long *bitmap; + hwaddr size; + hwaddr pages; +} VFIOBitmap; + +/* + * This is the base object for vfio container backends + */ +typedef struct VFIOContainerBase { + const VFIOIOMMUOps *ops; +} VFIOContainerBase; + +struct VFIOIOMMUOps { + /* basic feature */ + int (*dma_map)(VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + void *vaddr, bool readonly); + int (*dma_unmap)(VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + IOMMUTLBEntry *iotlb); + int (*attach_device)(const char *name, VFIODevice *vbasedev, + AddressSpace *as, Error **errp); + void (*detach_device)(VFIODevice *vbasedev); + /* migration feature */ + int (*set_dirty_page_tracking)(VFIOContainerBase *bcontainer, bool start); + int (*query_dirty_bitmap)(VFIOContainerBase *bcontainer, VFIOBitmap *vbmap, + hwaddr iova, hwaddr size); +}; +#endif /* HW_VFIO_VFIO_CONTAINER_BASE_H */ -- Gitee From bda13dc55ae5e16174a4a611353f4bb8a590d510 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:19 +0800 Subject: [PATCH 610/939] vfio/container: Introduce a empty VFIOIOMMUOps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This empty VFIOIOMMUOps named vfio_legacy_ops will hold all general IOMMU ops of legacy container. 
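Subsequent patches fill the table in as the callbacks are converted,
e.g. (sketch):

    const VFIOIOMMUOps vfio_legacy_ops = {
        .dma_map   = vfio_legacy_dma_map,
        .dma_unmap = vfio_legacy_dma_unmap,
    };
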
Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/container.c | 5 +++++ include/hw/vfio/vfio-common.h | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 77e61cfedd..8d8ed13e93 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -565,6 +565,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, Error **errp) { VFIOContainer *container; + VFIOContainerBase *bcontainer; int ret, fd; VFIOAddressSpace *space; @@ -646,6 +647,8 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, QLIST_INIT(&container->giommu_list); QLIST_INIT(&container->vrdl_list); QLIST_INIT(&container->dma_list); + bcontainer = &container->bcontainer; + bcontainer->ops = &vfio_legacy_ops; ret = vfio_init_container(container, group->fd, errp); if (ret) { @@ -1046,3 +1049,5 @@ void vfio_detach_device(VFIODevice *vbasedev) vfio_put_base_device(vbasedev); vfio_put_group(group); } + +const VFIOIOMMUOps vfio_legacy_ops; diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index c89b5886f2..3a0a6ab6ee 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -268,7 +268,7 @@ typedef QLIST_HEAD(VFIOGroupList, VFIOGroup) VFIOGroupList; typedef QLIST_HEAD(VFIODeviceList, VFIODevice) VFIODeviceList; extern VFIOGroupList vfio_group_list; extern VFIODeviceList vfio_device_list; - +extern const VFIOIOMMUOps vfio_legacy_ops; extern const MemoryListener vfio_memory_listener; extern int vfio_kvm_device_fd; -- Gitee From 775cf7c2a0dc34d7163eeea1aab6bfc6cb28be9b Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Sat, 11 Jan 2025 10:52:20 +0800 Subject: [PATCH 611/939] vfio/container: Switch to dma_map|unmap API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No functional change intended. Signed-off-by: Eric Auger Signed-off-by: Yi Liu Signed-off-by: Yi Sun Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/common.c | 45 +++++++++++++++------------ hw/vfio/container-base.c | 32 +++++++++++++++++++ hw/vfio/container.c | 22 ++++++++----- hw/vfio/meson.build | 1 + hw/vfio/trace-events | 2 +- include/hw/vfio/vfio-common.h | 4 --- include/hw/vfio/vfio-container-base.h | 7 +++++ 7 files changed, 81 insertions(+), 32 deletions(-) create mode 100644 hw/vfio/container-base.c diff --git a/hw/vfio/common.c b/hw/vfio/common.c index e08b147b3d..ea63271167 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -292,7 +292,7 @@ static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) { VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); - VFIOContainer *container = giommu->container; + VFIOContainerBase *bcontainer = &giommu->container->bcontainer; hwaddr iova = iotlb->iova + giommu->iommu_offset; void *vaddr; int ret; @@ -322,21 +322,22 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) * of vaddr will always be there, even if the memory object is * destroyed and its backing memory munmap-ed. 
*/ - ret = vfio_dma_map(container, iova, - iotlb->addr_mask + 1, vaddr, - read_only); + ret = vfio_container_dma_map(bcontainer, iova, + iotlb->addr_mask + 1, vaddr, + read_only); if (ret) { - error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", " + error_report("vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx", %p) = %d (%s)", - container, iova, + bcontainer, iova, iotlb->addr_mask + 1, vaddr, ret, strerror(-ret)); } } else { - ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1, iotlb); + ret = vfio_container_dma_unmap(bcontainer, iova, + iotlb->addr_mask + 1, iotlb); if (ret) { - error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " + error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx") = %d (%s)", - container, iova, + bcontainer, iova, iotlb->addr_mask + 1, ret, strerror(-ret)); vfio_set_migration_error(ret); } @@ -355,9 +356,10 @@ static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl, int ret; /* Unmap with a single call. */ - ret = vfio_dma_unmap(vrdl->container, iova, size , NULL); + ret = vfio_container_dma_unmap(&vrdl->container->bcontainer, + iova, size , NULL); if (ret) { - error_report("%s: vfio_dma_unmap() failed: %s", __func__, + error_report("%s: vfio_container_dma_unmap() failed: %s", __func__, strerror(-ret)); } } @@ -385,8 +387,8 @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl, section->offset_within_address_space; vaddr = memory_region_get_ram_ptr(section->mr) + start; - ret = vfio_dma_map(vrdl->container, iova, next - start, - vaddr, section->readonly); + ret = vfio_container_dma_map(&vrdl->container->bcontainer, iova, + next - start, vaddr, section->readonly); if (ret) { /* Rollback */ vfio_ram_discard_notify_discard(rdl, section); @@ -684,10 +686,11 @@ static void vfio_listener_region_add(MemoryListener *listener, } } - ret = vfio_dma_map(container, iova, int128_get64(llsize), - vaddr, section->readonly); + ret = vfio_container_dma_map(&container->bcontainer, + iova, int128_get64(llsize), vaddr, + section->readonly); if (ret) { - error_setg(&err, "vfio_dma_map(%p, 0x%"HWADDR_PRIx", " + error_setg(&err, "vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx", %p) = %d (%s)", container, iova, int128_get64(llsize), vaddr, ret, strerror(-ret)); @@ -784,18 +787,20 @@ static void vfio_listener_region_del(MemoryListener *listener, if (int128_eq(llsize, int128_2_64())) { /* The unmap ioctl doesn't accept a full 64-bit span. 
*/ llsize = int128_rshift(llsize, 1); - ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL); + ret = vfio_container_dma_unmap(&container->bcontainer, iova, + int128_get64(llsize), NULL); if (ret) { - error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " + error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx") = %d (%s)", container, iova, int128_get64(llsize), ret, strerror(-ret)); } iova += int128_get64(llsize); } - ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL); + ret = vfio_container_dma_unmap(&container->bcontainer, iova, + int128_get64(llsize), NULL); if (ret) { - error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " + error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx") = %d (%s)", container, iova, int128_get64(llsize), ret, strerror(-ret)); diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c new file mode 100644 index 0000000000..55d3a35fa4 --- /dev/null +++ b/hw/vfio/container-base.c @@ -0,0 +1,32 @@ +/* + * VFIO BASE CONTAINER + * + * Copyright (C) 2023 Intel Corporation. + * Copyright Red Hat, Inc. 2023 + * + * Authors: Yi Liu + * Eric Auger + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "hw/vfio/vfio-container-base.h" + +int vfio_container_dma_map(VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + void *vaddr, bool readonly) +{ + g_assert(bcontainer->ops->dma_map); + return bcontainer->ops->dma_map(bcontainer, iova, size, vaddr, readonly); +} + +int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + IOMMUTLBEntry *iotlb) +{ + g_assert(bcontainer->ops->dma_unmap); + return bcontainer->ops->dma_unmap(bcontainer, iova, size, iotlb); +} diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 8d8ed13e93..40e378e888 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -140,9 +140,11 @@ void vfio_dma_range_init_dirty_bitmap(VFIODMARange *qrange) /* * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86 */ -int vfio_dma_unmap(VFIOContainer *container, hwaddr iova, - ram_addr_t size, IOMMUTLBEntry *iotlb) +static int vfio_legacy_dma_unmap(VFIOContainerBase *bcontainer, hwaddr iova, + ram_addr_t size, IOMMUTLBEntry *iotlb) { + VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); struct vfio_iommu_type1_dma_unmap unmap = { .argsz = sizeof(unmap), .flags = 0, @@ -193,7 +195,7 @@ int vfio_dma_unmap(VFIOContainer *container, hwaddr iova, */ if (errno == EINVAL && unmap.size && !(unmap.iova + unmap.size) && container->iommu_type == VFIO_TYPE1v2_IOMMU) { - trace_vfio_dma_unmap_overflow_workaround(); + trace_vfio_legacy_dma_unmap_overflow_workaround(); unmap.size -= 1ULL << ctz64(container->pgsizes); continue; } @@ -212,9 +214,11 @@ int vfio_dma_unmap(VFIOContainer *container, hwaddr iova, return 0; } -int vfio_dma_map(VFIOContainer *container, hwaddr iova, - ram_addr_t size, void *vaddr, bool readonly) +static int vfio_legacy_dma_map(VFIOContainerBase *bcontainer, hwaddr iova, + ram_addr_t size, void *vaddr, bool readonly) { + VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); struct vfio_iommu_type1_dma_map map = { .argsz = sizeof(map), .flags = VFIO_DMA_MAP_FLAG_READ, @@ -241,7 +245,8 @@ int vfio_dma_map(VFIOContainer *container, hwaddr iova, * the VGA ROM space. 
*/ if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 || - (errno == EBUSY && vfio_dma_unmap(container, iova, size, NULL) == 0 && + (errno == EBUSY && + vfio_legacy_dma_unmap(bcontainer, iova, size, NULL) == 0 && ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) { return 0; } @@ -1050,4 +1055,7 @@ void vfio_detach_device(VFIODevice *vbasedev) vfio_put_group(group); } -const VFIOIOMMUOps vfio_legacy_ops; +const VFIOIOMMUOps vfio_legacy_ops = { + .dma_map = vfio_legacy_dma_map, + .dma_unmap = vfio_legacy_dma_unmap, +}; diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build index b1db4c8605..32a6933280 100644 --- a/hw/vfio/meson.build +++ b/hw/vfio/meson.build @@ -2,6 +2,7 @@ vfio_ss = ss.source_set() vfio_ss.add(files( 'helpers.c', 'common.c', + 'container-base.c', 'container.c', 'spapr.c', 'migration.c', diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index 0eb2387cf2..9f7fedee98 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -116,7 +116,7 @@ vfio_region_unmap(const char *name, unsigned long offset, unsigned long end) "Re vfio_region_sparse_mmap_header(const char *name, int index, int nr_areas) "Device %s region %d: %d sparse mmap entries" vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned long end) "sparse entry %d [0x%lx - 0x%lx]" vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%08x" -vfio_dma_unmap_overflow_workaround(void) "" +vfio_legacy_dma_unmap_overflow_workaround(void) "" vfio_get_dirty_bitmap(int fd, uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start, uint64_t dirty_pages) "container fd=%d, iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64" dirty_pages=%"PRIu64 vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64 diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 3a0a6ab6ee..f94baf72db 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -221,10 +221,6 @@ bool vfio_devices_all_running_and_saving(VFIOContainer *container); VFIODMARange *vfio_lookup_match_range(VFIOContainer *container, hwaddr start_addr, hwaddr size); void vfio_dma_range_init_dirty_bitmap(VFIODMARange *qrange); -int vfio_dma_unmap(VFIOContainer *container, hwaddr iova, - ram_addr_t size, IOMMUTLBEntry *iotlb); -int vfio_dma_map(VFIOContainer *container, hwaddr iova, - ram_addr_t size, void *vaddr, bool readonly); int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start); int vfio_query_dirty_bitmap(VFIOContainer *container, VFIOBitmap *vbmap, hwaddr iova, hwaddr size); diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index 1d6daaea5d..56b033f59f 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -31,6 +31,13 @@ typedef struct VFIOContainerBase { const VFIOIOMMUOps *ops; } VFIOContainerBase; +int vfio_container_dma_map(VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + void *vaddr, bool readonly); +int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + IOMMUTLBEntry *iotlb); + struct VFIOIOMMUOps { /* basic feature */ int (*dma_map)(VFIOContainerBase *bcontainer, -- Gitee From ff4e67fa5ceb31f1dc686a661cbf37c1a81cd644 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:21 +0800 Subject: [PATCH 612/939] vfio/common: Introduce vfio_container_init/destroy helper MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds two helper functions vfio_container_init/destroy which will be used by both legacy and iommufd containers to do base container specific initialization and release. No functional change intended. Suggested-by: Cédric Le Goater Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/container-base.c | 9 +++++++++ hw/vfio/container.c | 4 +++- include/hw/vfio/vfio-container-base.h | 4 ++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c index 55d3a35fa4..e929435751 100644 --- a/hw/vfio/container-base.c +++ b/hw/vfio/container-base.c @@ -30,3 +30,12 @@ int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, g_assert(bcontainer->ops->dma_unmap); return bcontainer->ops->dma_unmap(bcontainer, iova, size, iotlb); } + +void vfio_container_init(VFIOContainerBase *bcontainer, const VFIOIOMMUOps *ops) +{ + bcontainer->ops = ops; +} + +void vfio_container_destroy(VFIOContainerBase *bcontainer) +{ +} diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 40e378e888..5a8c55056b 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -653,7 +653,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, QLIST_INIT(&container->vrdl_list); QLIST_INIT(&container->dma_list); bcontainer = &container->bcontainer; - bcontainer->ops = &vfio_legacy_ops; + vfio_container_init(bcontainer, &vfio_legacy_ops); ret = vfio_init_container(container, group->fd, errp); if (ret) { @@ -765,6 +765,7 @@ put_space_exit: static void vfio_disconnect_container(VFIOGroup *group) { VFIOContainer *container = group->container; + VFIOContainerBase *bcontainer = &container->bcontainer; QLIST_REMOVE(group, container_next); group->container = NULL; @@ -803,6 +804,7 @@ static void vfio_disconnect_container(VFIOGroup *group) QLIST_REMOVE(giommu, giommu_next); g_free(giommu); } + vfio_container_destroy(bcontainer); trace_vfio_disconnect_container(container->fd); close(container->fd); diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index 56b033f59f..577f52ccbc 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -38,6 +38,10 @@ int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, IOMMUTLBEntry *iotlb); +void vfio_container_init(VFIOContainerBase *bcontainer, + const VFIOIOMMUOps *ops); +void vfio_container_destroy(VFIOContainerBase *bcontainer); + struct VFIOIOMMUOps { /* basic feature */ int (*dma_map)(VFIOContainerBase *bcontainer, -- Gitee From 350f1a4d221849cc26a6d3950c128f951648c391 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Sat, 11 Jan 2025 10:52:22 +0800 Subject: [PATCH 613/939] vfio/common: Move giommu_list in base container MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the giommu_list field in the base container and store the base container in the VFIOGuestIOMMU. No functional change intended. 
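Code that still needs the legacy container derives it from the base
pointer now stored in the VFIOGuestIOMMU, e.g. (sketch of the pattern
used below in vfio_iommu_map_dirty_notify()):

    VFIOContainer *container =
        container_of(giommu->bcontainer, VFIOContainer, bcontainer);
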
Signed-off-by: Eric Auger Signed-off-by: Yi Liu Signed-off-by: Yi Sun Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/common.c | 17 +++++++++++------ hw/vfio/container-base.c | 9 +++++++++ hw/vfio/container.c | 8 -------- include/hw/vfio/vfio-common.h | 9 --------- include/hw/vfio/vfio-container-base.h | 9 +++++++++ 5 files changed, 29 insertions(+), 23 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index ea63271167..b8007b22c3 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -292,7 +292,7 @@ static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) { VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); - VFIOContainerBase *bcontainer = &giommu->container->bcontainer; + VFIOContainerBase *bcontainer = giommu->bcontainer; hwaddr iova = iotlb->iova + giommu->iommu_offset; void *vaddr; int ret; @@ -569,6 +569,7 @@ static void vfio_listener_region_add(MemoryListener *listener, MemoryRegionSection *section) { VFIOContainer *container = container_of(listener, VFIOContainer, listener); + VFIOContainerBase *bcontainer = &container->bcontainer; hwaddr iova, end; Int128 llend, llsize; void *vaddr; @@ -612,7 +613,7 @@ static void vfio_listener_region_add(MemoryListener *listener, giommu->iommu_mr = iommu_mr; giommu->iommu_offset = section->offset_within_address_space - section->offset_within_region; - giommu->container = container; + giommu->bcontainer = bcontainer; llend = int128_add(int128_make64(section->offset_within_region), section->size); llend = int128_sub(llend, int128_one()); @@ -647,7 +648,7 @@ static void vfio_listener_region_add(MemoryListener *listener, g_free(giommu); goto fail; } - QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next); + QLIST_INSERT_HEAD(&bcontainer->giommu_list, giommu, giommu_next); memory_region_iommu_replay(giommu->iommu_mr, &giommu->n); return; @@ -732,6 +733,7 @@ static void vfio_listener_region_del(MemoryListener *listener, MemoryRegionSection *section) { VFIOContainer *container = container_of(listener, VFIOContainer, listener); + VFIOContainerBase *bcontainer = &container->bcontainer; hwaddr iova, end; Int128 llend, llsize; int ret; @@ -744,7 +746,7 @@ static void vfio_listener_region_del(MemoryListener *listener, if (memory_region_is_iommu(section->mr)) { VFIOGuestIOMMU *giommu; - QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) { + QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) { if (MEMORY_REGION(giommu->iommu_mr) == section->mr && giommu->n.start == section->offset_within_region) { memory_region_unregister_iommu_notifier(section->mr, @@ -1211,7 +1213,9 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) vfio_giommu_dirty_notifier *gdn = container_of(n, vfio_giommu_dirty_notifier, n); VFIOGuestIOMMU *giommu = gdn->giommu; - VFIOContainer *container = giommu->container; + VFIOContainerBase *bcontainer = giommu->bcontainer; + VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); hwaddr iova = iotlb->iova + giommu->iommu_offset; ram_addr_t translated_addr; int ret = -EINVAL; @@ -1289,12 +1293,13 @@ static int vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainer *container, static int vfio_sync_dirty_bitmap(VFIOContainer *container, MemoryRegionSection *section) { + VFIOContainerBase *bcontainer = &container->bcontainer; ram_addr_t ram_addr; if 
(memory_region_is_iommu(section->mr)) { VFIOGuestIOMMU *giommu; - QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) { + QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) { if (MEMORY_REGION(giommu->iommu_mr) == section->mr && giommu->n.start == section->offset_within_region) { Int128 llend; diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c index e929435751..20bcb9669a 100644 --- a/hw/vfio/container-base.c +++ b/hw/vfio/container-base.c @@ -34,8 +34,17 @@ int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, void vfio_container_init(VFIOContainerBase *bcontainer, const VFIOIOMMUOps *ops) { bcontainer->ops = ops; + QLIST_INIT(&bcontainer->giommu_list); } void vfio_container_destroy(VFIOContainerBase *bcontainer) { + VFIOGuestIOMMU *giommu, *tmp; + + QLIST_FOREACH_SAFE(giommu, &bcontainer->giommu_list, giommu_next, tmp) { + memory_region_unregister_iommu_notifier( + MEMORY_REGION(giommu->iommu_mr), &giommu->n); + QLIST_REMOVE(giommu, giommu_next); + g_free(giommu); + } } diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 5a8c55056b..03791601d0 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -649,7 +649,6 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, container->dirty_pages_supported = false; container->dma_max_mappings = 0; container->iova_ranges = NULL; - QLIST_INIT(&container->giommu_list); QLIST_INIT(&container->vrdl_list); QLIST_INIT(&container->dma_list); bcontainer = &container->bcontainer; @@ -794,16 +793,9 @@ static void vfio_disconnect_container(VFIOGroup *group) if (QLIST_EMPTY(&container->group_list)) { VFIOAddressSpace *space = container->space; - VFIOGuestIOMMU *giommu, *tmp; QLIST_REMOVE(container, next); - QLIST_FOREACH_SAFE(giommu, &container->giommu_list, giommu_next, tmp) { - memory_region_unregister_iommu_notifier( - MEMORY_REGION(giommu->iommu_mr), &giommu->n); - QLIST_REMOVE(giommu, giommu_next); - g_free(giommu); - } vfio_container_destroy(bcontainer); trace_vfio_disconnect_container(container->fd); diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index f94baf72db..6f02952ff6 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -104,7 +104,6 @@ typedef struct VFIOContainer { uint64_t max_dirty_bitmap_size; unsigned long pgsizes; unsigned int dma_max_mappings; - QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; QLIST_HEAD(, VFIOGroup) group_list; QLIST_HEAD(, VFIORamDiscardListener) vrdl_list; @@ -114,14 +113,6 @@ typedef struct VFIOContainer { GList *iova_ranges; } VFIOContainer; -typedef struct VFIOGuestIOMMU { - VFIOContainer *container; - IOMMUMemoryRegion *iommu_mr; - hwaddr iommu_offset; - IOMMUNotifier n; - QLIST_ENTRY(VFIOGuestIOMMU) giommu_next; -} VFIOGuestIOMMU; - typedef struct VFIORamDiscardListener { VFIOContainer *container; MemoryRegion *mr; diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index 577f52ccbc..a11aec5755 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -29,8 +29,17 @@ typedef struct { */ typedef struct VFIOContainerBase { const VFIOIOMMUOps *ops; + QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; } VFIOContainerBase; +typedef struct VFIOGuestIOMMU { + VFIOContainerBase *bcontainer; + IOMMUMemoryRegion *iommu_mr; + hwaddr iommu_offset; + IOMMUNotifier n; + QLIST_ENTRY(VFIOGuestIOMMU) giommu_next; +} VFIOGuestIOMMU; + int vfio_container_dma_map(VFIOContainerBase *bcontainer, 
hwaddr iova, ram_addr_t size, void *vaddr, bool readonly); -- Gitee From 97979ab4d92d0006ffefb586675b6110e5b7a746 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Sat, 11 Jan 2025 10:52:23 +0800 Subject: [PATCH 614/939] vfio/container: Move space field to base container MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the space field to the base object. Also the VFIOAddressSpace now contains a list of base containers. No functional change intended. Modify hw/vfio/container.c: vfio_connect_container->shared_memory_listener_register in kvm_csv3_enabled during backporting. Signed-off-by: Eric Auger Signed-off-by: Yi Liu Signed-off-by: Yi Sun Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/ppc/spapr_pci_vfio.c | 10 +++++----- hw/vfio/common.c | 4 ++-- hw/vfio/container-base.c | 6 +++++- hw/vfio/container.c | 20 +++++++++----------- include/hw/vfio/vfio-common.h | 8 -------- include/hw/vfio/vfio-container-base.h | 9 +++++++++ 6 files changed, 30 insertions(+), 27 deletions(-) diff --git a/hw/ppc/spapr_pci_vfio.c b/hw/ppc/spapr_pci_vfio.c index f283f7e38d..d1d07bec46 100644 --- a/hw/ppc/spapr_pci_vfio.c +++ b/hw/ppc/spapr_pci_vfio.c @@ -84,27 +84,27 @@ static int vfio_eeh_container_op(VFIOContainer *container, uint32_t op) static VFIOContainer *vfio_eeh_as_container(AddressSpace *as) { VFIOAddressSpace *space = vfio_get_address_space(as); - VFIOContainer *container = NULL; + VFIOContainerBase *bcontainer = NULL; if (QLIST_EMPTY(&space->containers)) { /* No containers to act on */ goto out; } - container = QLIST_FIRST(&space->containers); + bcontainer = QLIST_FIRST(&space->containers); - if (QLIST_NEXT(container, next)) { + if (QLIST_NEXT(bcontainer, next)) { /* * We don't yet have logic to synchronize EEH state across * multiple containers */ - container = NULL; + bcontainer = NULL; goto out; } out: vfio_put_address_space(space); - return container; + return container_of(bcontainer, VFIOContainer, bcontainer); } static bool vfio_eeh_as_ok(AddressSpace *as) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index b8007b22c3..2f3f66991a 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -145,7 +145,7 @@ void vfio_unblock_multiple_devices_migration(void) bool vfio_viommu_preset(VFIODevice *vbasedev) { - return vbasedev->container->space->as != &address_space_memory; + return vbasedev->container->bcontainer.space->as != &address_space_memory; } static void vfio_set_migration_error(int err) @@ -922,7 +922,7 @@ static void vfio_dirty_tracking_init(VFIOContainer *container, dirty.container = container; memory_listener_register(&dirty.listener, - container->space->as); + container->bcontainer.space->as); *ranges = dirty.ranges; diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c index 20bcb9669a..3933391e0d 100644 --- a/hw/vfio/container-base.c +++ b/hw/vfio/container-base.c @@ -31,9 +31,11 @@ int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, return bcontainer->ops->dma_unmap(bcontainer, iova, size, iotlb); } -void vfio_container_init(VFIOContainerBase *bcontainer, const VFIOIOMMUOps *ops) +void vfio_container_init(VFIOContainerBase *bcontainer, VFIOAddressSpace *space, + const VFIOIOMMUOps *ops) { bcontainer->ops = ops; + bcontainer->space = space; QLIST_INIT(&bcontainer->giommu_list); } @@ -41,6 +43,8 @@ void vfio_container_destroy(VFIOContainerBase *bcontainer) { VFIOGuestIOMMU *giommu, *tmp; + QLIST_REMOVE(bcontainer, next); + QLIST_FOREACH_SAFE(giommu, 
&bcontainer->giommu_list, giommu_next, tmp) { memory_region_unregister_iommu_notifier( MEMORY_REGION(giommu->iommu_mr), &giommu->n); diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 03791601d0..b7ab0d7323 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -607,7 +607,8 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, * details once we know which type of IOMMU we are using. */ - QLIST_FOREACH(container, &space->containers, next) { + QLIST_FOREACH(bcontainer, &space->containers, next) { + container = container_of(bcontainer, VFIOContainer, bcontainer); if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) { ret = vfio_ram_block_discard_disable(container, true); if (ret) { @@ -643,7 +644,6 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, } container = g_malloc0(sizeof(*container)); - container->space = space; container->fd = fd; container->error = NULL; container->dirty_pages_supported = false; @@ -652,7 +652,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, QLIST_INIT(&container->vrdl_list); QLIST_INIT(&container->dma_list); bcontainer = &container->bcontainer; - vfio_container_init(bcontainer, &vfio_legacy_ops); + vfio_container_init(bcontainer, space, &vfio_legacy_ops); ret = vfio_init_container(container, group->fd, errp); if (ret) { @@ -708,7 +708,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, vfio_kvm_device_add_group(group); QLIST_INIT(&container->group_list); - QLIST_INSERT_HEAD(&space->containers, container, next); + QLIST_INSERT_HEAD(&space->containers, bcontainer, next); group->container = container; QLIST_INSERT_HEAD(&container->group_list, group, container_next); @@ -717,9 +717,9 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, if (kvm_csv3_enabled()) { shared_memory_listener_register(&container->listener, - container->space->as); + bcontainer->space->as); } else { - memory_listener_register(&container->listener, container->space->as); + memory_listener_register(&container->listener, bcontainer->space->as); } if (container->error) { @@ -734,7 +734,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, return 0; listener_release_exit: QLIST_REMOVE(group, container_next); - QLIST_REMOVE(container, next); + QLIST_REMOVE(bcontainer, next); vfio_kvm_device_del_group(group); if (kvm_csv3_enabled()) { shared_memory_listener_unregister(); @@ -792,9 +792,7 @@ static void vfio_disconnect_container(VFIOGroup *group) } if (QLIST_EMPTY(&container->group_list)) { - VFIOAddressSpace *space = container->space; - - QLIST_REMOVE(container, next); + VFIOAddressSpace *space = bcontainer->space; vfio_container_destroy(bcontainer); @@ -815,7 +813,7 @@ static VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp) QLIST_FOREACH(group, &vfio_group_list, next) { if (group->groupid == groupid) { /* Found it. Now is it already in the right context? 
*/ - if (group->container->space->as == as) { + if (group->container->bcontainer.space->as == as) { return group; } else { error_setg(errp, "group %d used in multiple address spaces", diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 6f02952ff6..31c9df4b03 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -73,12 +73,6 @@ typedef struct VFIOMigration { bool initial_data_sent; } VFIOMigration; -typedef struct VFIOAddressSpace { - AddressSpace *as; - QLIST_HEAD(, VFIOContainer) containers; - QLIST_ENTRY(VFIOAddressSpace) list; -} VFIOAddressSpace; - struct VFIOGroup; typedef struct VFIODMARange { @@ -91,7 +85,6 @@ typedef struct VFIODMARange { typedef struct VFIOContainer { VFIOContainerBase bcontainer; - VFIOAddressSpace *space; int fd; /* /dev/vfio/vfio, empowered by the attached groups */ MemoryListener listener; MemoryListener prereg_listener; @@ -108,7 +101,6 @@ typedef struct VFIOContainer { QLIST_HEAD(, VFIOGroup) group_list; QLIST_HEAD(, VFIORamDiscardListener) vrdl_list; QLIST_HEAD(, VFIODMARange) dma_list; - QLIST_ENTRY(VFIOContainer) next; QLIST_HEAD(, VFIODevice) device_list; GList *iova_ranges; } VFIOContainer; diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index a11aec5755..c7cc6ec9c5 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -24,12 +24,20 @@ typedef struct { hwaddr pages; } VFIOBitmap; +typedef struct VFIOAddressSpace { + AddressSpace *as; + QLIST_HEAD(, VFIOContainerBase) containers; + QLIST_ENTRY(VFIOAddressSpace) list; +} VFIOAddressSpace; + /* * This is the base object for vfio container backends */ typedef struct VFIOContainerBase { const VFIOIOMMUOps *ops; + VFIOAddressSpace *space; QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; + QLIST_ENTRY(VFIOContainerBase) next; } VFIOContainerBase; typedef struct VFIOGuestIOMMU { @@ -48,6 +56,7 @@ int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, IOMMUTLBEntry *iotlb); void vfio_container_init(VFIOContainerBase *bcontainer, + VFIOAddressSpace *space, const VFIOIOMMUOps *ops); void vfio_container_destroy(VFIOContainerBase *bcontainer); -- Gitee From c8c17aaddeee1e5002fc4bde7245719db75d4021 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Sat, 11 Jan 2025 10:52:24 +0800 Subject: [PATCH 615/939] vfio/container: Switch to IOMMU BE set_dirty_page_tracking/query_dirty_bitmap API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit dirty_pages_supported field is also moved to the base container No functional change intended. Modify vfio_listener_log_clear during backporting. 
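On the caller side the conversion is mechanical, roughly (sketch):

    /* before */
    ret = vfio_set_dirty_page_tracking(container, true);
    /* after */
    ret = vfio_container_set_dirty_page_tracking(&container->bcontainer, true);
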
Signed-off-by: Eric Auger Signed-off-by: Yi Liu Signed-off-by: Yi Sun Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/common.c | 14 +++++++++----- hw/vfio/container-base.c | 16 ++++++++++++++++ hw/vfio/container.c | 21 ++++++++++++++------- include/hw/vfio/vfio-common.h | 5 ----- include/hw/vfio/vfio-container-base.h | 6 ++++++ 5 files changed, 45 insertions(+), 17 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 2f3f66991a..3be6cecc63 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -1079,7 +1079,8 @@ static void vfio_listener_log_global_start(MemoryListener *listener) if (vfio_devices_all_device_dirty_tracking(container)) { ret = vfio_devices_dma_logging_start(container); } else { - ret = vfio_set_dirty_page_tracking(container, true); + ret = vfio_container_set_dirty_page_tracking(&container->bcontainer, + true); } if (ret) { @@ -1097,7 +1098,8 @@ static void vfio_listener_log_global_stop(MemoryListener *listener) if (vfio_devices_all_device_dirty_tracking(container)) { vfio_devices_dma_logging_stop(container); } else { - ret = vfio_set_dirty_page_tracking(container, false); + ret = vfio_container_set_dirty_page_tracking(&container->bcontainer, + false); } if (ret) { @@ -1166,7 +1168,8 @@ int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, VFIODMARange *qrange; int ret; - if (!container->dirty_pages_supported && !all_device_dirty_tracking) { + if (!container->bcontainer.dirty_pages_supported && + !all_device_dirty_tracking) { cpu_physical_memory_set_dirty_range(ram_addr, size, tcg_enabled() ? DIRTY_CLIENTS_ALL : DIRTY_CLIENTS_NOCODE); @@ -1187,7 +1190,8 @@ int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, if (all_device_dirty_tracking) { ret = vfio_devices_query_dirty_bitmap(container, &vbmap, iova, size); } else { - ret = vfio_query_dirty_bitmap(container, &vbmap, iova, size); + ret = vfio_container_query_dirty_bitmap(&container->bcontainer, &vbmap, + iova, size); } if (ret) { @@ -1480,7 +1484,7 @@ static void vfio_listener_log_clear(MemoryListener *listener, VFIOContainer *container = container_of(listener, VFIOContainer, listener); if (vfio_listener_skipped_section(section) || - !container->dirty_pages_supported) { + !container->bcontainer.dirty_pages_supported) { return; } diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c index 3933391e0d..5d654ae172 100644 --- a/hw/vfio/container-base.c +++ b/hw/vfio/container-base.c @@ -31,11 +31,27 @@ int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, return bcontainer->ops->dma_unmap(bcontainer, iova, size, iotlb); } +int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer, + bool start) +{ + g_assert(bcontainer->ops->set_dirty_page_tracking); + return bcontainer->ops->set_dirty_page_tracking(bcontainer, start); +} + +int vfio_container_query_dirty_bitmap(VFIOContainerBase *bcontainer, + VFIOBitmap *vbmap, + hwaddr iova, hwaddr size) +{ + g_assert(bcontainer->ops->query_dirty_bitmap); + return bcontainer->ops->query_dirty_bitmap(bcontainer, vbmap, iova, size); +} + void vfio_container_init(VFIOContainerBase *bcontainer, VFIOAddressSpace *space, const VFIOIOMMUOps *ops) { bcontainer->ops = ops; bcontainer->space = space; + bcontainer->dirty_pages_supported = false; QLIST_INIT(&bcontainer->giommu_list); } diff --git a/hw/vfio/container.c b/hw/vfio/container.c index b7ab0d7323..cf373e42ef 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -157,7 +157,7 @@ static 
int vfio_legacy_dma_unmap(VFIOContainerBase *bcontainer, hwaddr iova, if (iotlb && vfio_devices_all_running_and_mig_active(container)) { if (!vfio_devices_all_device_dirty_tracking(container) && - container->dirty_pages_supported) { + container->bcontainer.dirty_pages_supported) { return vfio_dma_unmap_bitmap(container, iova, size, iotlb); } @@ -255,14 +255,17 @@ static int vfio_legacy_dma_map(VFIOContainerBase *bcontainer, hwaddr iova, return -errno; } -int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start) +static int vfio_legacy_set_dirty_page_tracking(VFIOContainerBase *bcontainer, + bool start) { + VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); int ret; struct vfio_iommu_type1_dirty_bitmap dirty = { .argsz = sizeof(dirty), }; - if (!container->dirty_pages_supported) { + if (!bcontainer->dirty_pages_supported) { return 0; } @@ -282,9 +285,12 @@ int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start) return ret; } -int vfio_query_dirty_bitmap(VFIOContainer *container, VFIOBitmap *vbmap, - hwaddr iova, hwaddr size) +static int vfio_legacy_query_dirty_bitmap(VFIOContainerBase *bcontainer, + VFIOBitmap *vbmap, + hwaddr iova, hwaddr size) { + VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); struct vfio_iommu_type1_dirty_bitmap *dbitmap; struct vfio_iommu_type1_dirty_bitmap_get *range; int ret; @@ -528,7 +534,7 @@ static void vfio_get_iommu_info_migration(VFIOContainer *container, * qemu_real_host_page_size to mark those dirty. */ if (cap_mig->pgsize_bitmap & qemu_real_host_page_size()) { - container->dirty_pages_supported = true; + container->bcontainer.dirty_pages_supported = true; container->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size; container->dirty_pgsizes = cap_mig->pgsize_bitmap; } @@ -646,7 +652,6 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, container = g_malloc0(sizeof(*container)); container->fd = fd; container->error = NULL; - container->dirty_pages_supported = false; container->dma_max_mappings = 0; container->iova_ranges = NULL; QLIST_INIT(&container->vrdl_list); @@ -1050,4 +1055,6 @@ void vfio_detach_device(VFIODevice *vbasedev) const VFIOIOMMUOps vfio_legacy_ops = { .dma_map = vfio_legacy_dma_map, .dma_unmap = vfio_legacy_dma_unmap, + .set_dirty_page_tracking = vfio_legacy_set_dirty_page_tracking, + .query_dirty_bitmap = vfio_legacy_query_dirty_bitmap, }; diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 31c9df4b03..af0ef9042d 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -91,7 +91,6 @@ typedef struct VFIOContainer { unsigned iommu_type; Error *error; bool initialized; - bool dirty_pages_supported; bool dirty_log_manual_clear; uint64_t dirty_pgsizes; uint64_t max_dirty_bitmap_size; @@ -200,13 +199,9 @@ VFIOAddressSpace *vfio_get_address_space(AddressSpace *as); void vfio_put_address_space(VFIOAddressSpace *space); bool vfio_devices_all_running_and_saving(VFIOContainer *container); -/* container->fd */ VFIODMARange *vfio_lookup_match_range(VFIOContainer *container, hwaddr start_addr, hwaddr size); void vfio_dma_range_init_dirty_bitmap(VFIODMARange *qrange); -int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start); -int vfio_query_dirty_bitmap(VFIOContainer *container, VFIOBitmap *vbmap, - hwaddr iova, hwaddr size); /* SPAPR specific */ int vfio_container_add_section_window(VFIOContainer *container, diff --git a/include/hw/vfio/vfio-container-base.h 
b/include/hw/vfio/vfio-container-base.h index c7cc6ec9c5..f244f003d0 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -36,6 +36,7 @@ typedef struct VFIOAddressSpace { typedef struct VFIOContainerBase { const VFIOIOMMUOps *ops; VFIOAddressSpace *space; + bool dirty_pages_supported; QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; QLIST_ENTRY(VFIOContainerBase) next; } VFIOContainerBase; @@ -54,6 +55,11 @@ int vfio_container_dma_map(VFIOContainerBase *bcontainer, int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, IOMMUTLBEntry *iotlb); +int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer, + bool start); +int vfio_container_query_dirty_bitmap(VFIOContainerBase *bcontainer, + VFIOBitmap *vbmap, + hwaddr iova, hwaddr size); void vfio_container_init(VFIOContainerBase *bcontainer, VFIOAddressSpace *space, -- Gitee From 22244582a5ff77c0d93008e603a343c1e47ca85d Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:25 +0800 Subject: [PATCH 616/939] vfio/container: Move per container device list in base container MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit VFIO Device is also changed to point to base container instead of legacy container. No functional change intended. Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/common.c | 23 +++++++++++++++-------- hw/vfio/container.c | 12 ++++++------ include/hw/vfio/vfio-common.h | 3 +-- include/hw/vfio/vfio-container-base.h | 1 + 4 files changed, 23 insertions(+), 16 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 3be6cecc63..b952d1c811 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -145,7 +145,7 @@ void vfio_unblock_multiple_devices_migration(void) bool vfio_viommu_preset(VFIODevice *vbasedev) { - return vbasedev->container->bcontainer.space->as != &address_space_memory; + return vbasedev->bcontainer->space->as != &address_space_memory; } static void vfio_set_migration_error(int err) @@ -179,6 +179,7 @@ bool vfio_device_state_is_precopy(VFIODevice *vbasedev) static bool vfio_devices_all_dirty_tracking(VFIOContainer *container) { + VFIOContainerBase *bcontainer = &container->bcontainer; VFIODevice *vbasedev; MigrationState *ms = migrate_get_current(); @@ -187,7 +188,7 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainer *container) return false; } - QLIST_FOREACH(vbasedev, &container->device_list, container_next) { + QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { VFIOMigration *migration = vbasedev->migration; if (!migration) { @@ -205,9 +206,10 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainer *container) bool vfio_devices_all_device_dirty_tracking(VFIOContainer *container) { + VFIOContainerBase *bcontainer = &container->bcontainer; VFIODevice *vbasedev; - QLIST_FOREACH(vbasedev, &container->device_list, container_next) { + QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { if (!vbasedev->dirty_pages_supported) { return false; } @@ -222,13 +224,14 @@ bool vfio_devices_all_device_dirty_tracking(VFIOContainer *container) */ bool vfio_devices_all_running_and_mig_active(VFIOContainer *container) { + VFIOContainerBase *bcontainer = &container->bcontainer; VFIODevice *vbasedev; if (!migration_is_active(migrate_get_current())) { return false; } - QLIST_FOREACH(vbasedev, &container->device_list, container_next) { + 
QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { VFIOMigration *migration = vbasedev->migration; if (!migration) { @@ -833,12 +836,13 @@ static bool vfio_section_is_vfio_pci(MemoryRegionSection *section, VFIOContainer *container) { VFIOPCIDevice *pcidev; + VFIOContainerBase *bcontainer = &container->bcontainer; VFIODevice *vbasedev; Object *owner; owner = memory_region_owner(section->mr); - QLIST_FOREACH(vbasedev, &container->device_list, container_next) { + QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) { continue; } @@ -939,13 +943,14 @@ static void vfio_devices_dma_logging_stop(VFIOContainer *container) uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature), sizeof(uint64_t))] = {}; struct vfio_device_feature *feature = (struct vfio_device_feature *)buf; + VFIOContainerBase *bcontainer = &container->bcontainer; VFIODevice *vbasedev; feature->argsz = sizeof(buf); feature->flags = VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP; - QLIST_FOREACH(vbasedev, &container->device_list, container_next) { + QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { if (!vbasedev->dirty_tracking) { continue; } @@ -1036,6 +1041,7 @@ static int vfio_devices_dma_logging_start(VFIOContainer *container) { struct vfio_device_feature *feature; VFIODirtyRanges ranges; + VFIOContainerBase *bcontainer = &container->bcontainer; VFIODevice *vbasedev; int ret = 0; @@ -1046,7 +1052,7 @@ static int vfio_devices_dma_logging_start(VFIOContainer *container) return -errno; } - QLIST_FOREACH(vbasedev, &container->device_list, container_next) { + QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { if (vbasedev->dirty_tracking) { continue; } @@ -1139,10 +1145,11 @@ int vfio_devices_query_dirty_bitmap(VFIOContainer *container, VFIOBitmap *vbmap, hwaddr iova, hwaddr size) { + VFIOContainerBase *bcontainer = &container->bcontainer; VFIODevice *vbasedev; int ret; - QLIST_FOREACH(vbasedev, &container->device_list, container_next) { + QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { ret = vfio_device_dma_logging_report(vbasedev, iova, size, vbmap->bitmap); if (ret) { diff --git a/hw/vfio/container.c b/hw/vfio/container.c index cf373e42ef..74d236ddee 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -1001,7 +1001,7 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev, int groupid = vfio_device_groupid(vbasedev, errp); VFIODevice *vbasedev_iter; VFIOGroup *group; - VFIOContainer *container; + VFIOContainerBase *bcontainer; int ret; if (groupid < 0) { @@ -1028,9 +1028,9 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev, return ret; } - container = group->container; - vbasedev->container = container; - QLIST_INSERT_HEAD(&container->device_list, vbasedev, container_next); + bcontainer = &group->container->bcontainer; + vbasedev->bcontainer = bcontainer; + QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next); QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next); return ret; @@ -1040,13 +1040,13 @@ void vfio_detach_device(VFIODevice *vbasedev) { VFIOGroup *group = vbasedev->group; - if (!vbasedev->container) { + if (!vbasedev->bcontainer) { return; } QLIST_REMOVE(vbasedev, global_next); QLIST_REMOVE(vbasedev, container_next); - vbasedev->container = NULL; + vbasedev->bcontainer = NULL; trace_vfio_detach_device(vbasedev->name, group->groupid); vfio_put_base_device(vbasedev); vfio_put_group(group); diff --git 
a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index af0ef9042d..e27854228c 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -100,7 +100,6 @@ typedef struct VFIOContainer { QLIST_HEAD(, VFIOGroup) group_list; QLIST_HEAD(, VFIORamDiscardListener) vrdl_list; QLIST_HEAD(, VFIODMARange) dma_list; - QLIST_HEAD(, VFIODevice) device_list; GList *iova_ranges; } VFIOContainer; @@ -128,7 +127,7 @@ typedef struct VFIODevice { QLIST_ENTRY(VFIODevice) container_next; QLIST_ENTRY(VFIODevice) global_next; struct VFIOGroup *group; - VFIOContainer *container; + VFIOContainerBase *bcontainer; char *sysfsdev; char *name; DeviceState *dev; diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index f244f003d0..7090962496 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -39,6 +39,7 @@ typedef struct VFIOContainerBase { bool dirty_pages_supported; QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; QLIST_ENTRY(VFIOContainerBase) next; + QLIST_HEAD(, VFIODevice) device_list; } VFIOContainerBase; typedef struct VFIOGuestIOMMU { -- Gitee From 718cfbf181541fa4142aba10d5aee839e06b4d66 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Sat, 11 Jan 2025 10:52:26 +0800 Subject: [PATCH 617/939] vfio/container: Convert functions to base container MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the prospect to get rid of VFIOContainer refs in common.c lets convert misc functions to use the base container object instead: vfio_devices_all_dirty_tracking vfio_devices_all_device_dirty_tracking vfio_devices_all_running_and_mig_active vfio_devices_query_dirty_bitmap vfio_get_dirty_bitmap Modify vfio_get_dirty_bitmap/vfio_listener_log_clear during backporting. Signed-off-by: Eric Auger Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/common.c | 46 ++++++++++++++++------------------- hw/vfio/container.c | 6 ++--- hw/vfio/trace-events | 2 +- include/hw/vfio/vfio-common.h | 9 +++---- 4 files changed, 29 insertions(+), 34 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index b952d1c811..b663d0bcc0 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -177,9 +177,8 @@ bool vfio_device_state_is_precopy(VFIODevice *vbasedev) migration->device_state == VFIO_DEVICE_STATE_PRE_COPY_P2P; } -static bool vfio_devices_all_dirty_tracking(VFIOContainer *container) +static bool vfio_devices_all_dirty_tracking(VFIOContainerBase *bcontainer) { - VFIOContainerBase *bcontainer = &container->bcontainer; VFIODevice *vbasedev; MigrationState *ms = migrate_get_current(); @@ -204,9 +203,8 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainer *container) return true; } -bool vfio_devices_all_device_dirty_tracking(VFIOContainer *container) +bool vfio_devices_all_device_dirty_tracking(VFIOContainerBase *bcontainer) { - VFIOContainerBase *bcontainer = &container->bcontainer; VFIODevice *vbasedev; QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { @@ -222,9 +220,8 @@ bool vfio_devices_all_device_dirty_tracking(VFIOContainer *container) * Check if all VFIO devices are running and migration is active, which is * essentially equivalent to the migration being in pre-copy phase. 
*/ -bool vfio_devices_all_running_and_mig_active(VFIOContainer *container) +bool vfio_devices_all_running_and_mig_active(VFIOContainerBase *bcontainer) { - VFIOContainerBase *bcontainer = &container->bcontainer; VFIODevice *vbasedev; if (!migration_is_active(migrate_get_current())) { @@ -1082,7 +1079,7 @@ static void vfio_listener_log_global_start(MemoryListener *listener) VFIOContainer *container = container_of(listener, VFIOContainer, listener); int ret; - if (vfio_devices_all_device_dirty_tracking(container)) { + if (vfio_devices_all_device_dirty_tracking(&container->bcontainer)) { ret = vfio_devices_dma_logging_start(container); } else { ret = vfio_container_set_dirty_page_tracking(&container->bcontainer, @@ -1101,7 +1098,7 @@ static void vfio_listener_log_global_stop(MemoryListener *listener) VFIOContainer *container = container_of(listener, VFIOContainer, listener); int ret = 0; - if (vfio_devices_all_device_dirty_tracking(container)) { + if (vfio_devices_all_device_dirty_tracking(&container->bcontainer)) { vfio_devices_dma_logging_stop(container); } else { ret = vfio_container_set_dirty_page_tracking(&container->bcontainer, @@ -1141,11 +1138,10 @@ static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova, return 0; } -int vfio_devices_query_dirty_bitmap(VFIOContainer *container, +int vfio_devices_query_dirty_bitmap(VFIOContainerBase *bcontainer, VFIOBitmap *vbmap, hwaddr iova, hwaddr size) { - VFIOContainerBase *bcontainer = &container->bcontainer; VFIODevice *vbasedev; int ret; @@ -1165,18 +1161,19 @@ int vfio_devices_query_dirty_bitmap(VFIOContainer *container, return 0; } -int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, +int vfio_get_dirty_bitmap(VFIOContainerBase *bcontainer, uint64_t iova, uint64_t size, ram_addr_t ram_addr) { bool all_device_dirty_tracking = - vfio_devices_all_device_dirty_tracking(container); + vfio_devices_all_device_dirty_tracking(bcontainer); + VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); uint64_t dirty_pages; VFIOBitmap vbmap; VFIODMARange *qrange; int ret; - if (!container->bcontainer.dirty_pages_supported && - !all_device_dirty_tracking) { + if (!bcontainer->dirty_pages_supported && !all_device_dirty_tracking) { cpu_physical_memory_set_dirty_range(ram_addr, size, tcg_enabled() ? 
DIRTY_CLIENTS_ALL : DIRTY_CLIENTS_NOCODE); @@ -1195,10 +1192,9 @@ int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, vbmap.bitmap = qrange->bitmap; if (all_device_dirty_tracking) { - ret = vfio_devices_query_dirty_bitmap(container, &vbmap, iova, size); + ret = vfio_devices_query_dirty_bitmap(bcontainer, &vbmap, iova, size); } else { - ret = vfio_container_query_dirty_bitmap(&container->bcontainer, &vbmap, - iova, size); + ret = vfio_container_query_dirty_bitmap(bcontainer, &vbmap, iova, size); } if (ret) { @@ -1208,8 +1204,7 @@ int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, dirty_pages = cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap, ram_addr, vbmap.pages); - trace_vfio_get_dirty_bitmap(container->fd, iova, size, vbmap.size, - ram_addr, dirty_pages); + trace_vfio_get_dirty_bitmap(iova, size, vbmap.size, ram_addr, dirty_pages); out: return ret; } @@ -1241,8 +1236,8 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) rcu_read_lock(); if (vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL)) { - ret = vfio_get_dirty_bitmap(container, iova, iotlb->addr_mask + 1, - translated_addr); + ret = vfio_get_dirty_bitmap(&container->bcontainer, iova, + iotlb->addr_mask + 1, translated_addr); if (ret) { error_report("vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx") = %d (%s)", @@ -1271,7 +1266,8 @@ static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section, * Sync the whole mapped region (spanning multiple individual mappings) * in one go. */ - return vfio_get_dirty_bitmap(vrdl->container, iova, size, ram_addr); + return vfio_get_dirty_bitmap(&vrdl->container->bcontainer, iova, size, + ram_addr); } static int vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainer *container, @@ -1340,7 +1336,7 @@ static int vfio_sync_dirty_bitmap(VFIOContainer *container, ram_addr = memory_region_get_ram_addr(section->mr) + section->offset_within_region; - return vfio_get_dirty_bitmap(container, + return vfio_get_dirty_bitmap(&container->bcontainer, REAL_HOST_PAGE_ALIGN(section->offset_within_address_space), int128_get64(section->size), ram_addr); } @@ -1355,7 +1351,7 @@ static void vfio_listener_log_sync(MemoryListener *listener, return; } - if (vfio_devices_all_dirty_tracking(container)) { + if (vfio_devices_all_dirty_tracking(&container->bcontainer)) { ret = vfio_sync_dirty_bitmap(container, section); if (ret) { error_report("vfio: Failed to sync dirty bitmap, err: %d (%s)", ret, @@ -1495,7 +1491,7 @@ static void vfio_listener_log_clear(MemoryListener *listener, return; } - if (vfio_devices_all_dirty_tracking(container)) { + if (vfio_devices_all_dirty_tracking(&container->bcontainer)) { vfio_physical_log_clear(container, section); } } diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 74d236ddee..9a542368ab 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -155,8 +155,8 @@ static int vfio_legacy_dma_unmap(VFIOContainerBase *bcontainer, hwaddr iova, bool need_dirty_sync = false; int ret; - if (iotlb && vfio_devices_all_running_and_mig_active(container)) { - if (!vfio_devices_all_device_dirty_tracking(container) && + if (iotlb && vfio_devices_all_running_and_mig_active(bcontainer)) { + if (!vfio_devices_all_device_dirty_tracking(bcontainer) && container->bcontainer.dirty_pages_supported) { return vfio_dma_unmap_bitmap(container, iova, size, iotlb); } @@ -204,7 +204,7 @@ static int vfio_legacy_dma_unmap(VFIOContainerBase *bcontainer, hwaddr iova, } if (need_dirty_sync) { - ret = 
vfio_get_dirty_bitmap(container, iova, size, + ret = vfio_get_dirty_bitmap(bcontainer, iova, size, iotlb->translated_addr); if (ret) { return ret; diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index 9f7fedee98..08a1f9dfa4 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -117,7 +117,7 @@ vfio_region_sparse_mmap_header(const char *name, int index, int nr_areas) "Devic vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned long end) "sparse entry %d [0x%lx - 0x%lx]" vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%08x" vfio_legacy_dma_unmap_overflow_workaround(void) "" -vfio_get_dirty_bitmap(int fd, uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start, uint64_t dirty_pages) "container fd=%d, iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64" dirty_pages=%"PRIu64 +vfio_get_dirty_bitmap(uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start, uint64_t dirty_pages) "iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64" dirty_pages=%"PRIu64 vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64 # platform.c diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index e27854228c..0295ede7ba 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -196,7 +196,6 @@ typedef struct VFIODisplay { VFIOAddressSpace *vfio_get_address_space(AddressSpace *as); void vfio_put_address_space(VFIOAddressSpace *space); -bool vfio_devices_all_running_and_saving(VFIOContainer *container); VFIODMARange *vfio_lookup_match_range(VFIOContainer *container, hwaddr start_addr, hwaddr size); @@ -274,11 +273,11 @@ bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp); void vfio_migration_exit(VFIODevice *vbasedev); int vfio_bitmap_alloc(VFIOBitmap *vbmap, hwaddr size); -bool vfio_devices_all_running_and_mig_active(VFIOContainer *container); -bool vfio_devices_all_device_dirty_tracking(VFIOContainer *container); -int vfio_devices_query_dirty_bitmap(VFIOContainer *container, +bool vfio_devices_all_running_and_mig_active(VFIOContainerBase *bcontainer); +bool vfio_devices_all_device_dirty_tracking(VFIOContainerBase *bcontainer); +int vfio_devices_query_dirty_bitmap(VFIOContainerBase *bcontainer, VFIOBitmap *vbmap, hwaddr iova, hwaddr size); -int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, +int vfio_get_dirty_bitmap(VFIOContainerBase *bcontainer, uint64_t iova, uint64_t size, ram_addr_t ram_addr); #endif /* HW_VFIO_VFIO_COMMON_H */ -- Gitee From 961614f6c997caf632ce37ead96b301ec47b1847 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Sat, 11 Jan 2025 10:52:27 +0800 Subject: [PATCH 618/939] vfio/container: Move pgsizes and dma_max_mappings to base container MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No functional change intended. 
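As a rough standalone sketch of the layout this change builds on (simplified, hypothetical names; not part of the diff below): shared limits such as the supported page sizes and the maximum number of DMA mappings sit in an embedded base object, and code that only has the legacy container reaches them through the embedded member:

#include <stdio.h>

/* Stand-ins for VFIOContainerBase/VFIOContainer; only the two fields
 * being moved are mirrored here, everything else is simplified. */
typedef struct ContainerBase {
    unsigned long pgsizes;          /* supported IOMMU page sizes (bitmask) */
    unsigned int dma_max_mappings;  /* limit on concurrent DMA mappings */
} ContainerBase;

typedef struct LegacyContainer {
    ContainerBase bcontainer;       /* base object embedded in the backend */
    int fd;                         /* backend-specific state stays here */
} LegacyContainer;

/* Generic code needs only the base pointer. */
static unsigned long min_page_size(const ContainerBase *bcontainer)
{
    return 1UL << __builtin_ctzl(bcontainer->pgsizes);
}

int main(void)
{
    LegacyContainer c = {
        .bcontainer = { .pgsizes = 0x1000, .dma_max_mappings = 65535 },
        .fd = -1,
    };

    /* Backend code hands &c.bcontainer to the generic layer. */
    printf("min page size: 0x%lx, max mappings: %u\n",
           min_page_size(&c.bcontainer), c.bcontainer.dma_max_mappings);
    return 0;
}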
Signed-off-by: Eric Auger Signed-off-by: Yi Liu Signed-off-by: Yi Sun Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/common.c | 17 +++++++++-------- hw/vfio/container-base.c | 1 + hw/vfio/container.c | 11 +++++------ hw/vfio/spapr.c | 10 ++++++---- include/hw/vfio/vfio-common.h | 2 -- include/hw/vfio/vfio-container-base.h | 2 ++ 6 files changed, 23 insertions(+), 20 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index b663d0bcc0..fd6249c290 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -401,6 +401,7 @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl, static void vfio_register_ram_discard_listener(VFIOContainer *container, MemoryRegionSection *section) { + VFIOContainerBase *bcontainer = &container->bcontainer; RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); VFIORamDiscardListener *vrdl; @@ -419,8 +420,8 @@ static void vfio_register_ram_discard_listener(VFIOContainer *container, section->mr); g_assert(vrdl->granularity && is_power_of_2(vrdl->granularity)); - g_assert(container->pgsizes && - vrdl->granularity >= 1ULL << ctz64(container->pgsizes)); + g_assert(bcontainer->pgsizes && + vrdl->granularity >= 1ULL << ctz64(bcontainer->pgsizes)); ram_discard_listener_init(&vrdl->listener, vfio_ram_discard_notify_populate, @@ -441,7 +442,7 @@ static void vfio_register_ram_discard_listener(VFIOContainer *container, * number of sections in the address space we could have over time, * also consuming DMA mappings. */ - if (container->dma_max_mappings) { + if (bcontainer->dma_max_mappings) { unsigned int vrdl_count = 0, vrdl_mappings = 0, max_memslots = 512; #ifdef CONFIG_KVM @@ -462,11 +463,11 @@ static void vfio_register_ram_discard_listener(VFIOContainer *container, } if (vrdl_mappings + max_memslots - vrdl_count > - container->dma_max_mappings) { + bcontainer->dma_max_mappings) { warn_report("%s: possibly running out of DMA mappings. E.g., try" " increasing the 'block-size' of virtio-mem devies." 
" Maximum possible DMA mappings: %d, Maximum possible" - " memslots: %d", __func__, container->dma_max_mappings, + " memslots: %d", __func__, bcontainer->dma_max_mappings, max_memslots); } } @@ -626,7 +627,7 @@ static void vfio_listener_region_add(MemoryListener *listener, iommu_idx); ret = memory_region_iommu_set_page_size_mask(giommu->iommu_mr, - container->pgsizes, + bcontainer->pgsizes, &err); if (ret) { g_free(giommu); @@ -675,7 +676,7 @@ static void vfio_listener_region_add(MemoryListener *listener, llsize = int128_sub(llend, int128_make64(iova)); if (memory_region_is_ram_device(section->mr)) { - hwaddr pgmask = (1ULL << ctz64(container->pgsizes)) - 1; + hwaddr pgmask = (1ULL << ctz64(bcontainer->pgsizes)) - 1; if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) { trace_vfio_listener_region_add_no_dma_map( @@ -777,7 +778,7 @@ static void vfio_listener_region_del(MemoryListener *listener, if (memory_region_is_ram_device(section->mr)) { hwaddr pgmask; - pgmask = (1ULL << ctz64(container->pgsizes)) - 1; + pgmask = (1ULL << ctz64(bcontainer->pgsizes)) - 1; try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask)); } else if (memory_region_has_ram_discard_manager(section->mr)) { vfio_unregister_ram_discard_listener(container, section); diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c index 5d654ae172..dcce111349 100644 --- a/hw/vfio/container-base.c +++ b/hw/vfio/container-base.c @@ -52,6 +52,7 @@ void vfio_container_init(VFIOContainerBase *bcontainer, VFIOAddressSpace *space, bcontainer->ops = ops; bcontainer->space = space; bcontainer->dirty_pages_supported = false; + bcontainer->dma_max_mappings = 0; QLIST_INIT(&bcontainer->giommu_list); } diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 9a542368ab..116a9e1e73 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -196,7 +196,7 @@ static int vfio_legacy_dma_unmap(VFIOContainerBase *bcontainer, hwaddr iova, if (errno == EINVAL && unmap.size && !(unmap.iova + unmap.size) && container->iommu_type == VFIO_TYPE1v2_IOMMU) { trace_vfio_legacy_dma_unmap_overflow_workaround(); - unmap.size -= 1ULL << ctz64(container->pgsizes); + unmap.size -= 1ULL << ctz64(bcontainer->pgsizes); continue; } error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno)); @@ -652,7 +652,6 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, container = g_malloc0(sizeof(*container)); container->fd = fd; container->error = NULL; - container->dma_max_mappings = 0; container->iova_ranges = NULL; QLIST_INIT(&container->vrdl_list); QLIST_INIT(&container->dma_list); @@ -684,13 +683,13 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, } if (info->flags & VFIO_IOMMU_INFO_PGSIZES) { - container->pgsizes = info->iova_pgsizes; + bcontainer->pgsizes = info->iova_pgsizes; } else { - container->pgsizes = qemu_real_host_page_size(); + bcontainer->pgsizes = qemu_real_host_page_size(); } - if (!vfio_get_info_dma_avail(info, &container->dma_max_mappings)) { - container->dma_max_mappings = 65535; + if (!vfio_get_info_dma_avail(info, &bcontainer->dma_max_mappings)) { + bcontainer->dma_max_mappings = 65535; } vfio_get_info_iova_range(info, container); diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c index 83da2f7ec2..4f76bdd3ca 100644 --- a/hw/vfio/spapr.c +++ b/hw/vfio/spapr.c @@ -226,6 +226,7 @@ static int vfio_spapr_create_window(VFIOContainer *container, hwaddr *pgsize) { int ret = 0; + VFIOContainerBase *bcontainer = &container->bcontainer; IOMMUMemoryRegion *iommu_mr = 
IOMMU_MEMORY_REGION(section->mr); uint64_t pagesize = memory_region_iommu_get_min_page_size(iommu_mr), pgmask; unsigned entries, bits_total, bits_per_level, max_levels; @@ -239,13 +240,13 @@ static int vfio_spapr_create_window(VFIOContainer *container, if (pagesize > rampagesize) { pagesize = rampagesize; } - pgmask = container->pgsizes & (pagesize | (pagesize - 1)); + pgmask = bcontainer->pgsizes & (pagesize | (pagesize - 1)); pagesize = pgmask ? (1ULL << (63 - clz64(pgmask))) : 0; if (!pagesize) { error_report("Host doesn't support page size 0x%"PRIx64 ", the supported mask is 0x%lx", memory_region_iommu_get_min_page_size(iommu_mr), - container->pgsizes); + bcontainer->pgsizes); return -EINVAL; } @@ -421,6 +422,7 @@ void vfio_container_del_section_window(VFIOContainer *container, int vfio_spapr_container_init(VFIOContainer *container, Error **errp) { + VFIOContainerBase *bcontainer = &container->bcontainer; struct vfio_iommu_spapr_tce_info info; bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU; int ret, fd = container->fd; @@ -461,7 +463,7 @@ int vfio_spapr_container_init(VFIOContainer *container, Error **errp) } if (v2) { - container->pgsizes = info.ddw.pgsizes; + bcontainer->pgsizes = info.ddw.pgsizes; /* * There is a default window in just created container. * To make region_add/del simpler, we better remove this @@ -476,7 +478,7 @@ int vfio_spapr_container_init(VFIOContainer *container, Error **errp) } } else { /* The default table uses 4K pages */ - container->pgsizes = 0x1000; + bcontainer->pgsizes = 0x1000; vfio_host_win_add(container, info.dma32_window_start, info.dma32_window_start + info.dma32_window_size - 1, diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 0295ede7ba..3046287070 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -94,8 +94,6 @@ typedef struct VFIOContainer { bool dirty_log_manual_clear; uint64_t dirty_pgsizes; uint64_t max_dirty_bitmap_size; - unsigned long pgsizes; - unsigned int dma_max_mappings; QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; QLIST_HEAD(, VFIOGroup) group_list; QLIST_HEAD(, VFIORamDiscardListener) vrdl_list; diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index 7090962496..85ec7e1a56 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -36,6 +36,8 @@ typedef struct VFIOAddressSpace { typedef struct VFIOContainerBase { const VFIOIOMMUOps *ops; VFIOAddressSpace *space; + unsigned long pgsizes; + unsigned int dma_max_mappings; bool dirty_pages_supported; QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; QLIST_ENTRY(VFIOContainerBase) next; -- Gitee From d0234f18616cfe9a43287ba75e4788a10166a526 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:28 +0800 Subject: [PATCH 619/939] vfio/container: Move vrdl_list to base container MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No functional change intended. 
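The hunks below switch the RAM discard listener from the legacy container to the base one; where legacy-only state is still needed, the series recovers the legacy container from the embedded base with container_of(), as in the earlier hunks. A rough standalone sketch of that recovery step, with simplified stand-in names:

#include <stddef.h>
#include <stdio.h>

/* Simplified stand-in for the kernel/QEMU-style container_of() macro. */
#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

typedef struct ContainerBase {
    int shared_state;             /* fields shared by all backends */
} ContainerBase;

typedef struct LegacyContainer {
    ContainerBase bcontainer;     /* embedded base object */
    int fd;                       /* type1-only file descriptor */
} LegacyContainer;

/* A helper that receives only the base pointer but needs legacy state. */
static int legacy_only_operation(ContainerBase *bcontainer)
{
    LegacyContainer *container =
        container_of(bcontainer, LegacyContainer, bcontainer);

    return container->fd;
}

int main(void)
{
    LegacyContainer c = { .bcontainer = { .shared_state = 0 }, .fd = 42 };

    printf("fd recovered through the base pointer: %d\n",
           legacy_only_operation(&c.bcontainer));
    return 0;
}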
Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/common.c | 38 +++++++++++++-------------- hw/vfio/container-base.c | 1 + hw/vfio/container.c | 1 - include/hw/vfio/vfio-common.h | 11 -------- include/hw/vfio/vfio-container-base.h | 11 ++++++++ 5 files changed, 31 insertions(+), 31 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index fd6249c290..e9a19209ab 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -351,13 +351,13 @@ static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl, { VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener, listener); + VFIOContainerBase *bcontainer = vrdl->bcontainer; const hwaddr size = int128_get64(section->size); const hwaddr iova = section->offset_within_address_space; int ret; /* Unmap with a single call. */ - ret = vfio_container_dma_unmap(&vrdl->container->bcontainer, - iova, size , NULL); + ret = vfio_container_dma_unmap(bcontainer, iova, size , NULL); if (ret) { error_report("%s: vfio_container_dma_unmap() failed: %s", __func__, strerror(-ret)); @@ -369,6 +369,7 @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl, { VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener, listener); + VFIOContainerBase *bcontainer = vrdl->bcontainer; const hwaddr end = section->offset_within_region + int128_get64(section->size); hwaddr start, next, iova; @@ -387,8 +388,8 @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl, section->offset_within_address_space; vaddr = memory_region_get_ram_ptr(section->mr) + start; - ret = vfio_container_dma_map(&vrdl->container->bcontainer, iova, - next - start, vaddr, section->readonly); + ret = vfio_container_dma_map(bcontainer, iova, next - start, + vaddr, section->readonly); if (ret) { /* Rollback */ vfio_ram_discard_notify_discard(rdl, section); @@ -398,10 +399,9 @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl, return 0; } -static void vfio_register_ram_discard_listener(VFIOContainer *container, +static void vfio_register_ram_discard_listener(VFIOContainerBase *bcontainer, MemoryRegionSection *section) { - VFIOContainerBase *bcontainer = &container->bcontainer; RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); VFIORamDiscardListener *vrdl; @@ -412,7 +412,7 @@ static void vfio_register_ram_discard_listener(VFIOContainer *container, g_assert(QEMU_IS_ALIGNED(int128_get64(section->size), TARGET_PAGE_SIZE)); vrdl = g_new0(VFIORamDiscardListener, 1); - vrdl->container = container; + vrdl->bcontainer = bcontainer; vrdl->mr = section->mr; vrdl->offset_within_address_space = section->offset_within_address_space; vrdl->size = int128_get64(section->size); @@ -427,7 +427,7 @@ static void vfio_register_ram_discard_listener(VFIOContainer *container, vfio_ram_discard_notify_populate, vfio_ram_discard_notify_discard, true); ram_discard_manager_register_listener(rdm, &vrdl->listener, section); - QLIST_INSERT_HEAD(&container->vrdl_list, vrdl, next); + QLIST_INSERT_HEAD(&bcontainer->vrdl_list, vrdl, next); /* * Sanity-check if we have a theoretically problematic setup where we could @@ -451,7 +451,7 @@ static void vfio_register_ram_discard_listener(VFIOContainer *container, } #endif - QLIST_FOREACH(vrdl, &container->vrdl_list, next) { + QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) { hwaddr start, end; start = QEMU_ALIGN_DOWN(vrdl->offset_within_address_space, @@ -473,13 +473,13 @@ static void 
vfio_register_ram_discard_listener(VFIOContainer *container, } } -static void vfio_unregister_ram_discard_listener(VFIOContainer *container, +static void vfio_unregister_ram_discard_listener(VFIOContainerBase *bcontainer, MemoryRegionSection *section) { RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); VFIORamDiscardListener *vrdl = NULL; - QLIST_FOREACH(vrdl, &container->vrdl_list, next) { + QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) { if (vrdl->mr == section->mr && vrdl->offset_within_address_space == section->offset_within_address_space) { @@ -663,7 +663,7 @@ static void vfio_listener_region_add(MemoryListener *listener, * about changes. */ if (memory_region_has_ram_discard_manager(section->mr)) { - vfio_register_ram_discard_listener(container, section); + vfio_register_ram_discard_listener(bcontainer, section); return; } @@ -781,7 +781,7 @@ static void vfio_listener_region_del(MemoryListener *listener, pgmask = (1ULL << ctz64(bcontainer->pgsizes)) - 1; try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask)); } else if (memory_region_has_ram_discard_manager(section->mr)) { - vfio_unregister_ram_discard_listener(container, section); + vfio_unregister_ram_discard_listener(bcontainer, section); /* Unregistering will trigger an unmap. */ try_unmap = false; } @@ -1267,17 +1267,17 @@ static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section, * Sync the whole mapped region (spanning multiple individual mappings) * in one go. */ - return vfio_get_dirty_bitmap(&vrdl->container->bcontainer, iova, size, - ram_addr); + return vfio_get_dirty_bitmap(vrdl->bcontainer, iova, size, ram_addr); } -static int vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainer *container, - MemoryRegionSection *section) +static int +vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainerBase *bcontainer, + MemoryRegionSection *section) { RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); VFIORamDiscardListener *vrdl = NULL; - QLIST_FOREACH(vrdl, &container->vrdl_list, next) { + QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) { if (vrdl->mr == section->mr && vrdl->offset_within_address_space == section->offset_within_address_space) { @@ -1331,7 +1331,7 @@ static int vfio_sync_dirty_bitmap(VFIOContainer *container, } return 0; } else if (memory_region_has_ram_discard_manager(section->mr)) { - return vfio_sync_ram_discard_listener_dirty_bitmap(container, section); + return vfio_sync_ram_discard_listener_dirty_bitmap(bcontainer, section); } ram_addr = memory_region_get_ram_addr(section->mr) + diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c index dcce111349..584eee4ba1 100644 --- a/hw/vfio/container-base.c +++ b/hw/vfio/container-base.c @@ -54,6 +54,7 @@ void vfio_container_init(VFIOContainerBase *bcontainer, VFIOAddressSpace *space, bcontainer->dirty_pages_supported = false; bcontainer->dma_max_mappings = 0; QLIST_INIT(&bcontainer->giommu_list); + QLIST_INIT(&bcontainer->vrdl_list); } void vfio_container_destroy(VFIOContainerBase *bcontainer) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 116a9e1e73..023f220c93 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -653,7 +653,6 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, container->fd = fd; container->error = NULL; container->iova_ranges = NULL; - QLIST_INIT(&container->vrdl_list); QLIST_INIT(&container->dma_list); bcontainer = &container->bcontainer; vfio_container_init(bcontainer, space, &vfio_legacy_ops); diff 
--git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 3046287070..0174b767ca 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -96,21 +96,10 @@ typedef struct VFIOContainer { uint64_t max_dirty_bitmap_size; QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; QLIST_HEAD(, VFIOGroup) group_list; - QLIST_HEAD(, VFIORamDiscardListener) vrdl_list; QLIST_HEAD(, VFIODMARange) dma_list; GList *iova_ranges; } VFIOContainer; -typedef struct VFIORamDiscardListener { - VFIOContainer *container; - MemoryRegion *mr; - hwaddr offset_within_address_space; - hwaddr size; - uint64_t granularity; - RamDiscardListener listener; - QLIST_ENTRY(VFIORamDiscardListener) next; -} VFIORamDiscardListener; - typedef struct VFIOHostDMAWindow { hwaddr min_iova; hwaddr max_iova; diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index 85ec7e1a56..8e05b5ac5a 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -40,6 +40,7 @@ typedef struct VFIOContainerBase { unsigned int dma_max_mappings; bool dirty_pages_supported; QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; + QLIST_HEAD(, VFIORamDiscardListener) vrdl_list; QLIST_ENTRY(VFIOContainerBase) next; QLIST_HEAD(, VFIODevice) device_list; } VFIOContainerBase; @@ -52,6 +53,16 @@ typedef struct VFIOGuestIOMMU { QLIST_ENTRY(VFIOGuestIOMMU) giommu_next; } VFIOGuestIOMMU; +typedef struct VFIORamDiscardListener { + VFIOContainerBase *bcontainer; + MemoryRegion *mr; + hwaddr offset_within_address_space; + hwaddr size; + uint64_t granularity; + RamDiscardListener listener; + QLIST_ENTRY(VFIORamDiscardListener) next; +} VFIORamDiscardListener; + int vfio_container_dma_map(VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, void *vaddr, bool readonly); -- Gitee From 4515b719fb7a335ce76dd9168a9e4db24fca28df Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Sat, 11 Jan 2025 10:52:29 +0800 Subject: [PATCH 620/939] vfio/container: Move listener to base container MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move listener to base container. Also error and initialized fields are moved at the same time. No functional change intended. Modify vfio_physical_log_clear/vfio_connect_container during backporting. 
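With the MemoryListener now embedded in the base container, the listener callbacks in the hunks below derive the base object from the listener pointer instead of the legacy container. A minimal standalone sketch of that callback pattern, using simplified stand-in types:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

/* Trivial stand-in for MemoryListener: one callback is enough here. */
typedef struct Listener Listener;
struct Listener {
    void (*region_add)(Listener *l, const char *name);
};

typedef struct ContainerBase {
    Listener listener;   /* the listener now lives in the base object */
    int mappings;
} ContainerBase;

/* The callback only gets the listener and walks back to its owner. */
static void base_region_add(Listener *l, const char *name)
{
    ContainerBase *bcontainer = container_of(l, ContainerBase, listener);

    bcontainer->mappings++;
    printf("added %s, mappings now %d\n", name, bcontainer->mappings);
}

int main(void)
{
    ContainerBase b = {
        .listener = { .region_add = base_region_add },
        .mappings = 0,
    };

    b.listener.region_add(&b.listener, "ram0");
    b.listener.region_add(&b.listener, "ram1");
    return 0;
}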
Signed-off-by: Eric Auger Signed-off-by: Yi Liu Signed-off-by: Yi Sun Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/common.c | 119 +++++++++++++------------- hw/vfio/container-base.c | 1 + hw/vfio/container.c | 23 +++-- hw/vfio/spapr.c | 11 +-- include/hw/vfio/vfio-common.h | 3 - include/hw/vfio/vfio-container-base.h | 3 + 6 files changed, 82 insertions(+), 78 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index e9a19209ab..4647f4447d 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -541,7 +541,7 @@ static bool vfio_listener_valid_section(MemoryRegionSection *section, return true; } -static bool vfio_get_section_iova_range(VFIOContainer *container, +static bool vfio_get_section_iova_range(VFIOContainerBase *bcontainer, MemoryRegionSection *section, hwaddr *out_iova, hwaddr *out_end, Int128 *out_llend) @@ -569,8 +569,10 @@ static bool vfio_get_section_iova_range(VFIOContainer *container, static void vfio_listener_region_add(MemoryListener *listener, MemoryRegionSection *section) { - VFIOContainer *container = container_of(listener, VFIOContainer, listener); - VFIOContainerBase *bcontainer = &container->bcontainer; + VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, + listener); + VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); hwaddr iova, end; Int128 llend, llsize; void *vaddr; @@ -581,7 +583,8 @@ static void vfio_listener_region_add(MemoryListener *listener, return; } - if (!vfio_get_section_iova_range(container, section, &iova, &end, &llend)) { + if (!vfio_get_section_iova_range(bcontainer, section, &iova, &end, + &llend)) { if (memory_region_is_ram_device(section->mr)) { trace_vfio_listener_region_add_no_dma_map( memory_region_name(section->mr), @@ -688,13 +691,12 @@ static void vfio_listener_region_add(MemoryListener *listener, } } - ret = vfio_container_dma_map(&container->bcontainer, - iova, int128_get64(llsize), vaddr, - section->readonly); + ret = vfio_container_dma_map(bcontainer, iova, int128_get64(llsize), + vaddr, section->readonly); if (ret) { error_setg(&err, "vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx", %p) = %d (%s)", - container, iova, int128_get64(llsize), vaddr, ret, + bcontainer, iova, int128_get64(llsize), vaddr, ret, strerror(-ret)); if (memory_region_is_ram_device(section->mr)) { /* Allow unexpected mappings not to be fatal for RAM devices */ @@ -716,9 +718,9 @@ fail: * can gracefully fail. Runtime, there's not much we can do other * than throw a hardware error. 
*/ - if (!container->initialized) { - if (!container->error) { - error_propagate_prepend(&container->error, err, + if (!bcontainer->initialized) { + if (!bcontainer->error) { + error_propagate_prepend(&bcontainer->error, err, "Region %s: ", memory_region_name(section->mr)); } else { @@ -733,8 +735,10 @@ fail: static void vfio_listener_region_del(MemoryListener *listener, MemoryRegionSection *section) { - VFIOContainer *container = container_of(listener, VFIOContainer, listener); - VFIOContainerBase *bcontainer = &container->bcontainer; + VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, + listener); + VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); hwaddr iova, end; Int128 llend, llsize; int ret; @@ -767,7 +771,8 @@ static void vfio_listener_region_del(MemoryListener *listener, */ } - if (!vfio_get_section_iova_range(container, section, &iova, &end, &llend)) { + if (!vfio_get_section_iova_range(bcontainer, section, &iova, &end, + &llend)) { return; } @@ -790,22 +795,22 @@ static void vfio_listener_region_del(MemoryListener *listener, if (int128_eq(llsize, int128_2_64())) { /* The unmap ioctl doesn't accept a full 64-bit span. */ llsize = int128_rshift(llsize, 1); - ret = vfio_container_dma_unmap(&container->bcontainer, iova, + ret = vfio_container_dma_unmap(bcontainer, iova, int128_get64(llsize), NULL); if (ret) { error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx") = %d (%s)", - container, iova, int128_get64(llsize), ret, + bcontainer, iova, int128_get64(llsize), ret, strerror(-ret)); } iova += int128_get64(llsize); } - ret = vfio_container_dma_unmap(&container->bcontainer, iova, + ret = vfio_container_dma_unmap(bcontainer, iova, int128_get64(llsize), NULL); if (ret) { error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx") = %d (%s)", - container, iova, int128_get64(llsize), ret, + bcontainer, iova, int128_get64(llsize), ret, strerror(-ret)); } } @@ -825,16 +830,15 @@ typedef struct VFIODirtyRanges { } VFIODirtyRanges; typedef struct VFIODirtyRangesListener { - VFIOContainer *container; + VFIOContainerBase *bcontainer; VFIODirtyRanges ranges; MemoryListener listener; } VFIODirtyRangesListener; static bool vfio_section_is_vfio_pci(MemoryRegionSection *section, - VFIOContainer *container) + VFIOContainerBase *bcontainer) { VFIOPCIDevice *pcidev; - VFIOContainerBase *bcontainer = &container->bcontainer; VFIODevice *vbasedev; Object *owner; @@ -863,7 +867,7 @@ static void vfio_dirty_tracking_update(MemoryListener *listener, hwaddr iova, end, *min, *max; if (!vfio_listener_valid_section(section, "tracking_update") || - !vfio_get_section_iova_range(dirty->container, section, + !vfio_get_section_iova_range(dirty->bcontainer, section, &iova, &end, NULL)) { return; } @@ -887,7 +891,7 @@ static void vfio_dirty_tracking_update(MemoryListener *listener, * The alternative would be an IOVATree but that has a much bigger runtime * overhead and unnecessary complexity. 
*/ - if (vfio_section_is_vfio_pci(section, dirty->container) && + if (vfio_section_is_vfio_pci(section, dirty->bcontainer) && iova >= UINT32_MAX) { min = &range->minpci64; max = &range->maxpci64; @@ -911,7 +915,7 @@ static const MemoryListener vfio_dirty_tracking_listener = { .region_add = vfio_dirty_tracking_update, }; -static void vfio_dirty_tracking_init(VFIOContainer *container, +static void vfio_dirty_tracking_init(VFIOContainerBase *bcontainer, VFIODirtyRanges *ranges) { VFIODirtyRangesListener dirty; @@ -921,10 +925,10 @@ static void vfio_dirty_tracking_init(VFIOContainer *container, dirty.ranges.min64 = UINT64_MAX; dirty.ranges.minpci64 = UINT64_MAX; dirty.listener = vfio_dirty_tracking_listener; - dirty.container = container; + dirty.bcontainer = bcontainer; memory_listener_register(&dirty.listener, - container->bcontainer.space->as); + bcontainer->space->as); *ranges = dirty.ranges; @@ -936,12 +940,11 @@ static void vfio_dirty_tracking_init(VFIOContainer *container, memory_listener_unregister(&dirty.listener); } -static void vfio_devices_dma_logging_stop(VFIOContainer *container) +static void vfio_devices_dma_logging_stop(VFIOContainerBase *bcontainer) { uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature), sizeof(uint64_t))] = {}; struct vfio_device_feature *feature = (struct vfio_device_feature *)buf; - VFIOContainerBase *bcontainer = &container->bcontainer; VFIODevice *vbasedev; feature->argsz = sizeof(buf); @@ -962,7 +965,7 @@ static void vfio_devices_dma_logging_stop(VFIOContainer *container) } static struct vfio_device_feature * -vfio_device_feature_dma_logging_start_create(VFIOContainer *container, +vfio_device_feature_dma_logging_start_create(VFIOContainerBase *bcontainer, VFIODirtyRanges *tracking) { struct vfio_device_feature *feature; @@ -1035,16 +1038,15 @@ static void vfio_device_feature_dma_logging_start_destroy( g_free(feature); } -static int vfio_devices_dma_logging_start(VFIOContainer *container) +static int vfio_devices_dma_logging_start(VFIOContainerBase *bcontainer) { struct vfio_device_feature *feature; VFIODirtyRanges ranges; - VFIOContainerBase *bcontainer = &container->bcontainer; VFIODevice *vbasedev; int ret = 0; - vfio_dirty_tracking_init(container, &ranges); - feature = vfio_device_feature_dma_logging_start_create(container, + vfio_dirty_tracking_init(bcontainer, &ranges); + feature = vfio_device_feature_dma_logging_start_create(bcontainer, &ranges); if (!feature) { return -errno; @@ -1067,7 +1069,7 @@ static int vfio_devices_dma_logging_start(VFIOContainer *container) out: if (ret) { - vfio_devices_dma_logging_stop(container); + vfio_devices_dma_logging_stop(bcontainer); } vfio_device_feature_dma_logging_start_destroy(feature); @@ -1077,14 +1079,14 @@ out: static void vfio_listener_log_global_start(MemoryListener *listener) { - VFIOContainer *container = container_of(listener, VFIOContainer, listener); + VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, + listener); int ret; - if (vfio_devices_all_device_dirty_tracking(&container->bcontainer)) { - ret = vfio_devices_dma_logging_start(container); + if (vfio_devices_all_device_dirty_tracking(bcontainer)) { + ret = vfio_devices_dma_logging_start(bcontainer); } else { - ret = vfio_container_set_dirty_page_tracking(&container->bcontainer, - true); + ret = vfio_container_set_dirty_page_tracking(bcontainer, true); } if (ret) { @@ -1096,14 +1098,14 @@ static void vfio_listener_log_global_start(MemoryListener *listener) static void vfio_listener_log_global_stop(MemoryListener 
*listener) { - VFIOContainer *container = container_of(listener, VFIOContainer, listener); + VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, + listener); int ret = 0; - if (vfio_devices_all_device_dirty_tracking(&container->bcontainer)) { - vfio_devices_dma_logging_stop(container); + if (vfio_devices_all_device_dirty_tracking(bcontainer)) { + vfio_devices_dma_logging_stop(bcontainer); } else { - ret = vfio_container_set_dirty_page_tracking(&container->bcontainer, - false); + ret = vfio_container_set_dirty_page_tracking(bcontainer, false); } if (ret) { @@ -1221,8 +1223,6 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) vfio_giommu_dirty_notifier, n); VFIOGuestIOMMU *giommu = gdn->giommu; VFIOContainerBase *bcontainer = giommu->bcontainer; - VFIOContainer *container = container_of(bcontainer, VFIOContainer, - bcontainer); hwaddr iova = iotlb->iova + giommu->iommu_offset; ram_addr_t translated_addr; int ret = -EINVAL; @@ -1237,12 +1237,12 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) rcu_read_lock(); if (vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL)) { - ret = vfio_get_dirty_bitmap(&container->bcontainer, iova, - iotlb->addr_mask + 1, translated_addr); + ret = vfio_get_dirty_bitmap(bcontainer, iova, iotlb->addr_mask + 1, + translated_addr); if (ret) { error_report("vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx") = %d (%s)", - container, iova, iotlb->addr_mask + 1, ret, + bcontainer, iova, iotlb->addr_mask + 1, ret, strerror(-ret)); } } @@ -1298,10 +1298,9 @@ vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainerBase *bcontainer, &vrdl); } -static int vfio_sync_dirty_bitmap(VFIOContainer *container, +static int vfio_sync_dirty_bitmap(VFIOContainerBase *bcontainer, MemoryRegionSection *section) { - VFIOContainerBase *bcontainer = &container->bcontainer; ram_addr_t ram_addr; if (memory_region_is_iommu(section->mr)) { @@ -1337,7 +1336,7 @@ static int vfio_sync_dirty_bitmap(VFIOContainer *container, ram_addr = memory_region_get_ram_addr(section->mr) + section->offset_within_region; - return vfio_get_dirty_bitmap(&container->bcontainer, + return vfio_get_dirty_bitmap(bcontainer, REAL_HOST_PAGE_ALIGN(section->offset_within_address_space), int128_get64(section->size), ram_addr); } @@ -1345,15 +1344,16 @@ static int vfio_sync_dirty_bitmap(VFIOContainer *container, static void vfio_listener_log_sync(MemoryListener *listener, MemoryRegionSection *section) { - VFIOContainer *container = container_of(listener, VFIOContainer, listener); + VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, + listener); int ret; if (vfio_listener_skipped_section(section)) { return; } - if (vfio_devices_all_dirty_tracking(&container->bcontainer)) { - ret = vfio_sync_dirty_bitmap(container, section); + if (vfio_devices_all_dirty_tracking(bcontainer)) { + ret = vfio_sync_dirty_bitmap(bcontainer, section); if (ret) { error_report("vfio: Failed to sync dirty bitmap, err: %d (%s)", ret, strerror(-ret)); @@ -1485,14 +1485,17 @@ static int vfio_physical_log_clear(VFIOContainer *container, static void vfio_listener_log_clear(MemoryListener *listener, MemoryRegionSection *section) { - VFIOContainer *container = container_of(listener, VFIOContainer, listener); + VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, + listener); + VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); if (vfio_listener_skipped_section(section) || - 
!container->bcontainer.dirty_pages_supported) { + !bcontainer->dirty_pages_supported) { return; } - if (vfio_devices_all_dirty_tracking(&container->bcontainer)) { + if (vfio_devices_all_dirty_tracking(bcontainer)) { vfio_physical_log_clear(container, section); } } diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c index 584eee4ba1..7f508669f5 100644 --- a/hw/vfio/container-base.c +++ b/hw/vfio/container-base.c @@ -51,6 +51,7 @@ void vfio_container_init(VFIOContainerBase *bcontainer, VFIOAddressSpace *space, { bcontainer->ops = ops; bcontainer->space = space; + bcontainer->error = NULL; bcontainer->dirty_pages_supported = false; bcontainer->dma_max_mappings = 0; QLIST_INIT(&bcontainer->giommu_list); diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 023f220c93..50da1300dd 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -520,6 +520,7 @@ static void vfio_get_iommu_info_migration(VFIOContainer *container, { struct vfio_info_cap_header *hdr; struct vfio_iommu_type1_info_cap_migration *cap_mig; + VFIOContainerBase *bcontainer = &container->bcontainer; hdr = vfio_get_iommu_info_cap(info, VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION); if (!hdr) { @@ -534,7 +535,7 @@ static void vfio_get_iommu_info_migration(VFIOContainer *container, * qemu_real_host_page_size to mark those dirty. */ if (cap_mig->pgsize_bitmap & qemu_real_host_page_size()) { - container->bcontainer.dirty_pages_supported = true; + bcontainer->dirty_pages_supported = true; container->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size; container->dirty_pgsizes = cap_mig->pgsize_bitmap; } @@ -651,7 +652,6 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, container = g_malloc0(sizeof(*container)); container->fd = fd; - container->error = NULL; container->iova_ranges = NULL; QLIST_INIT(&container->dma_list); bcontainer = &container->bcontainer; @@ -716,23 +716,22 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, group->container = container; QLIST_INSERT_HEAD(&container->group_list, group, container_next); - container->listener = vfio_memory_listener; - if (kvm_csv3_enabled()) { - shared_memory_listener_register(&container->listener, + shared_memory_listener_register(&bcontainer->listener, bcontainer->space->as); - } else { - memory_listener_register(&container->listener, bcontainer->space->as); } - if (container->error) { + bcontainer->listener = vfio_memory_listener; + memory_listener_register(&bcontainer->listener, bcontainer->space->as); + + if (bcontainer->error) { ret = -1; - error_propagate_prepend(errp, container->error, + error_propagate_prepend(errp, bcontainer->error, "memory listener initialization failed: "); goto listener_release_exit; } - container->initialized = true; + bcontainer->initialized = true; return 0; listener_release_exit: @@ -742,7 +741,7 @@ listener_release_exit: if (kvm_csv3_enabled()) { shared_memory_listener_unregister(); } else { - memory_listener_unregister(&container->listener); + memory_listener_unregister(&bcontainer->listener); } if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU || container->iommu_type == VFIO_SPAPR_TCE_IOMMU) { @@ -781,7 +780,7 @@ static void vfio_disconnect_container(VFIOGroup *group) if (kvm_csv3_enabled()) { shared_memory_listener_unregister(); } else { - memory_listener_unregister(&container->listener); + memory_listener_unregister(&bcontainer->listener); } if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU || container->iommu_type == VFIO_SPAPR_TCE_IOMMU) { diff --git a/hw/vfio/spapr.c 
b/hw/vfio/spapr.c index 4f76bdd3ca..7a50975f25 100644 --- a/hw/vfio/spapr.c +++ b/hw/vfio/spapr.c @@ -46,6 +46,7 @@ static void vfio_prereg_listener_region_add(MemoryListener *listener, { VFIOContainer *container = container_of(listener, VFIOContainer, prereg_listener); + VFIOContainerBase *bcontainer = &container->bcontainer; const hwaddr gpa = section->offset_within_address_space; hwaddr end; int ret; @@ -88,9 +89,9 @@ static void vfio_prereg_listener_region_add(MemoryListener *listener, * can gracefully fail. Runtime, there's not much we can do other * than throw a hardware error. */ - if (!container->initialized) { - if (!container->error) { - error_setg_errno(&container->error, -ret, + if (!bcontainer->initialized) { + if (!bcontainer->error) { + error_setg_errno(&bcontainer->error, -ret, "Memory registering failed"); } } else { @@ -445,9 +446,9 @@ int vfio_spapr_container_init(VFIOContainer *container, Error **errp) memory_listener_register(&container->prereg_listener, &address_space_memory); - if (container->error) { + if (bcontainer->error) { ret = -1; - error_propagate_prepend(errp, container->error, + error_propagate_prepend(errp, bcontainer->error, "RAM memory listener initialization failed: "); goto listener_unregister_exit; } diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 0174b767ca..c23e7fb8ee 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -86,11 +86,8 @@ typedef struct VFIODMARange { typedef struct VFIOContainer { VFIOContainerBase bcontainer; int fd; /* /dev/vfio/vfio, empowered by the attached groups */ - MemoryListener listener; MemoryListener prereg_listener; unsigned iommu_type; - Error *error; - bool initialized; bool dirty_log_manual_clear; uint64_t dirty_pgsizes; uint64_t max_dirty_bitmap_size; diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index 8e05b5ac5a..95f8d319e0 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -36,6 +36,9 @@ typedef struct VFIOAddressSpace { typedef struct VFIOContainerBase { const VFIOIOMMUOps *ops; VFIOAddressSpace *space; + MemoryListener listener; + Error *error; + bool initialized; unsigned long pgsizes; unsigned int dma_max_mappings; bool dirty_pages_supported; -- Gitee From a59131a461adf9b626735886a53825e2a03f3272 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Sat, 11 Jan 2025 10:52:30 +0800 Subject: [PATCH 621/939] vfio/container: Move dirty_pgsizes and max_dirty_bitmap_size to base container MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No functional change intended. 
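As a standalone illustration (not taken from the patch itself): the move works because VFIOContainer embeds a VFIOContainerBase, so state hoisted into the base struct stays reachable from legacy code through the embedded member while generic code only sees the base type. The toy types below mirror only the field names used in this series.

    #include <inttypes.h>
    #include <stdio.h>

    /* toy stand-ins for VFIOContainerBase / VFIOContainer */
    typedef struct Base {
        uint64_t dirty_pgsizes;           /* moved into the base by this patch */
        uint64_t max_dirty_bitmap_size;   /* moved into the base by this patch */
    } Base;

    typedef struct Legacy {
        Base bcontainer;                  /* embedded base, as in VFIOContainer */
        int fd;
    } Legacy;

    int main(void)
    {
        Legacy c = { .bcontainer.max_dirty_bitmap_size = 256 * 1024 };

        /* legacy code now reads the limit through the embedded base */
        Base *b = &c.bcontainer;
        printf("limit: %" PRIu64 "\n", b->max_dirty_bitmap_size);
        return 0;
    }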
Signed-off-by: Eric Auger Signed-off-by: Yi Liu Signed-off-by: Yi Sun Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/container.c | 9 +++++---- include/hw/vfio/vfio-common.h | 2 -- include/hw/vfio/vfio-container-base.h | 2 ++ 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 50da1300dd..191597167a 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -66,6 +66,7 @@ static int vfio_dma_unmap_bitmap(VFIOContainer *container, hwaddr iova, ram_addr_t size, IOMMUTLBEntry *iotlb) { + VFIOContainerBase *bcontainer = &container->bcontainer; struct vfio_iommu_type1_dma_unmap *unmap; struct vfio_bitmap *bitmap; VFIOBitmap vbmap; @@ -93,7 +94,7 @@ static int vfio_dma_unmap_bitmap(VFIOContainer *container, bitmap->size = vbmap.size; bitmap->data = (__u64 *)vbmap.bitmap; - if (vbmap.size > container->max_dirty_bitmap_size) { + if (vbmap.size > bcontainer->max_dirty_bitmap_size) { error_report("UNMAP: Size of bitmap too big 0x%"PRIx64, vbmap.size); ret = -E2BIG; goto unmap_exit; @@ -157,7 +158,7 @@ static int vfio_legacy_dma_unmap(VFIOContainerBase *bcontainer, hwaddr iova, if (iotlb && vfio_devices_all_running_and_mig_active(bcontainer)) { if (!vfio_devices_all_device_dirty_tracking(bcontainer) && - container->bcontainer.dirty_pages_supported) { + bcontainer->dirty_pages_supported) { return vfio_dma_unmap_bitmap(container, iova, size, iotlb); } @@ -536,8 +537,8 @@ static void vfio_get_iommu_info_migration(VFIOContainer *container, */ if (cap_mig->pgsize_bitmap & qemu_real_host_page_size()) { bcontainer->dirty_pages_supported = true; - container->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size; - container->dirty_pgsizes = cap_mig->pgsize_bitmap; + bcontainer->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size; + bcontainer->dirty_pgsizes = cap_mig->pgsize_bitmap; } } diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index c23e7fb8ee..a8da41d27e 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -89,8 +89,6 @@ typedef struct VFIOContainer { MemoryListener prereg_listener; unsigned iommu_type; bool dirty_log_manual_clear; - uint64_t dirty_pgsizes; - uint64_t max_dirty_bitmap_size; QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; QLIST_HEAD(, VFIOGroup) group_list; QLIST_HEAD(, VFIODMARange) dma_list; diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index 95f8d319e0..80e4a993c5 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -39,6 +39,8 @@ typedef struct VFIOContainerBase { MemoryListener listener; Error *error; bool initialized; + uint64_t dirty_pgsizes; + uint64_t max_dirty_bitmap_size; unsigned long pgsizes; unsigned int dma_max_mappings; bool dirty_pages_supported; -- Gitee From 4aac9c99e4f90d400d511bb46809714eab1fbf5f Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:31 +0800 Subject: [PATCH 622/939] vfio/container: Move iova_ranges to base container MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Meanwhile remove the helper function vfio_free_container as it only calls g_free now. No functional change intended. 
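As a standalone illustration (not taken from the patch itself): once iova_ranges is owned by the base container, its cleanup belongs to vfio_container_destroy() (the g_list_free_full() added below), so the legacy code can release the outer struct with a plain g_free(). The sketch below shows that ownership split directly with GLib; the type names are toy stand-ins.

    #include <glib.h>

    typedef struct Base   { GList *iova_ranges; } Base;       /* generic state */
    typedef struct Legacy { Base bcontainer; int fd; } Legacy;

    static void base_destroy(Base *b)
    {
        /* base-level teardown now owns the IOVA range list */
        g_list_free_full(b->iova_ranges, g_free);
    }

    int main(void)
    {
        Legacy *c = g_malloc0(sizeof(*c));

        c->bcontainer.iova_ranges =
            g_list_append(c->bcontainer.iova_ranges, g_malloc0(16));

        base_destroy(&c->bcontainer);  /* what vfio_free_container() used to cover */
        g_free(c);                     /* nothing container-specific left to free */
        return 0;
    }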
Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/common.c | 5 +++-- hw/vfio/container-base.c | 3 +++ hw/vfio/container.c | 19 ++++++------------- include/hw/vfio/vfio-common.h | 1 - include/hw/vfio/vfio-container-base.h | 1 + 5 files changed, 13 insertions(+), 16 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 4647f4447d..9926454527 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -637,9 +637,10 @@ static void vfio_listener_region_add(MemoryListener *listener, goto fail; } - if (container->iova_ranges) { + if (bcontainer->iova_ranges) { ret = memory_region_iommu_set_iova_ranges(giommu->iommu_mr, - container->iova_ranges, &err); + bcontainer->iova_ranges, + &err); if (ret) { g_free(giommu); goto fail; diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c index 7f508669f5..0177f43741 100644 --- a/hw/vfio/container-base.c +++ b/hw/vfio/container-base.c @@ -54,6 +54,7 @@ void vfio_container_init(VFIOContainerBase *bcontainer, VFIOAddressSpace *space, bcontainer->error = NULL; bcontainer->dirty_pages_supported = false; bcontainer->dma_max_mappings = 0; + bcontainer->iova_ranges = NULL; QLIST_INIT(&bcontainer->giommu_list); QLIST_INIT(&bcontainer->vrdl_list); } @@ -70,4 +71,6 @@ void vfio_container_destroy(VFIOContainerBase *bcontainer) QLIST_REMOVE(giommu, giommu_next); g_free(giommu); } + + g_list_free_full(bcontainer->iova_ranges, g_free); } diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 191597167a..13d42aad0d 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -360,7 +360,7 @@ bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info, } static bool vfio_get_info_iova_range(struct vfio_iommu_type1_info *info, - VFIOContainer *container) + VFIOContainerBase *bcontainer) { struct vfio_info_cap_header *hdr; struct vfio_iommu_type1_info_cap_iova_range *cap; @@ -378,8 +378,8 @@ static bool vfio_get_info_iova_range(struct vfio_iommu_type1_info *info, range_set_bounds(range, cap->iova_ranges[i].start, cap->iova_ranges[i].end); - container->iova_ranges = - range_list_insert(container->iova_ranges, range); + bcontainer->iova_ranges = + range_list_insert(bcontainer->iova_ranges, range); } return true; @@ -542,12 +542,6 @@ static void vfio_get_iommu_info_migration(VFIOContainer *container, } } -static void vfio_free_container(VFIOContainer *container) -{ - g_list_free_full(container->iova_ranges, g_free); - g_free(container); -} - static SharedRegionListener *g_shl; static void shared_memory_listener_register(MemoryListener *listener, @@ -653,7 +647,6 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, container = g_malloc0(sizeof(*container)); container->fd = fd; - container->iova_ranges = NULL; QLIST_INIT(&container->dma_list); bcontainer = &container->bcontainer; vfio_container_init(bcontainer, space, &vfio_legacy_ops); @@ -692,7 +685,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, bcontainer->dma_max_mappings = 65535; } - vfio_get_info_iova_range(info, container); + vfio_get_info_iova_range(info, bcontainer); vfio_get_iommu_info_migration(container, info); g_free(info); @@ -753,7 +746,7 @@ enable_discards_exit: vfio_ram_block_discard_disable(container, false); free_container_exit: - vfio_free_container(container); + g_free(container); close_fd_exit: close(fd); @@ -801,7 +794,7 @@ static void vfio_disconnect_container(VFIOGroup *group) trace_vfio_disconnect_container(container->fd); close(container->fd); - 
vfio_free_container(container); + g_free(container); vfio_put_address_space(space); } diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index a8da41d27e..9a2e0ace72 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -92,7 +92,6 @@ typedef struct VFIOContainer { QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; QLIST_HEAD(, VFIOGroup) group_list; QLIST_HEAD(, VFIODMARange) dma_list; - GList *iova_ranges; } VFIOContainer; typedef struct VFIOHostDMAWindow { diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index 80e4a993c5..9658ffb526 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -48,6 +48,7 @@ typedef struct VFIOContainerBase { QLIST_HEAD(, VFIORamDiscardListener) vrdl_list; QLIST_ENTRY(VFIOContainerBase) next; QLIST_HEAD(, VFIODevice) device_list; + GList *iova_ranges; } VFIOContainerBase; typedef struct VFIOGuestIOMMU { -- Gitee From 1ba796aff9476e5850df910304eb3720a09feef2 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Sat, 11 Jan 2025 10:52:32 +0800 Subject: [PATCH 623/939] vfio/container: Implement attach/detach_device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No functional change intended. Signed-off-by: Eric Auger Signed-off-by: Yi Liu Signed-off-by: Yi Sun Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/common.c | 16 ++++++++++++++++ hw/vfio/container.c | 12 +++++------- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 9926454527..488aa43c9b 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -1644,3 +1644,19 @@ retry: return info; } + +int vfio_attach_device(char *name, VFIODevice *vbasedev, + AddressSpace *as, Error **errp) +{ + const VFIOIOMMUOps *ops = &vfio_legacy_ops; + + return ops->attach_device(name, vbasedev, as, errp); +} + +void vfio_detach_device(VFIODevice *vbasedev) +{ + if (!vbasedev->bcontainer) { + return; + } + vbasedev->bcontainer->ops->detach_device(vbasedev); +} diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 13d42aad0d..62af0f2bdd 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -986,8 +986,8 @@ static int vfio_device_groupid(VFIODevice *vbasedev, Error **errp) * @name and @vbasedev->name are likely to be different depending * on the type of the device, hence the need for passing @name */ -int vfio_attach_device(char *name, VFIODevice *vbasedev, - AddressSpace *as, Error **errp) +static int vfio_legacy_attach_device(const char *name, VFIODevice *vbasedev, + AddressSpace *as, Error **errp) { int groupid = vfio_device_groupid(vbasedev, errp); VFIODevice *vbasedev_iter; @@ -1027,14 +1027,10 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev, return ret; } -void vfio_detach_device(VFIODevice *vbasedev) +static void vfio_legacy_detach_device(VFIODevice *vbasedev) { VFIOGroup *group = vbasedev->group; - if (!vbasedev->bcontainer) { - return; - } - QLIST_REMOVE(vbasedev, global_next); QLIST_REMOVE(vbasedev, container_next); vbasedev->bcontainer = NULL; @@ -1046,6 +1042,8 @@ void vfio_detach_device(VFIODevice *vbasedev) const VFIOIOMMUOps vfio_legacy_ops = { .dma_map = vfio_legacy_dma_map, .dma_unmap = vfio_legacy_dma_unmap, + .attach_device = vfio_legacy_attach_device, + .detach_device = vfio_legacy_detach_device, .set_dirty_page_tracking = vfio_legacy_set_dirty_page_tracking, .query_dirty_bitmap = 
vfio_legacy_query_dirty_bitmap, }; -- Gitee From 4b0bff002d93d8785ccec8020667dc559bda4e9c Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:33 +0800 Subject: [PATCH 624/939] vfio/spapr: Introduce spapr backend and target interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce an empty spapr backend which will hold spapr specific content, currently only prereg_listener and hostwin_list. Also introduce two spapr specific callbacks add/del_window into VFIOIOMMUOps. Instantiate a spapr ops with a helper setup_spapr_ops and assign it to bcontainer->ops. Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/spapr.c | 14 ++++++++++++++ include/hw/vfio/vfio-container-base.h | 6 ++++++ 2 files changed, 20 insertions(+) diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c index 7a50975f25..e1a6b35563 100644 --- a/hw/vfio/spapr.c +++ b/hw/vfio/spapr.c @@ -24,6 +24,10 @@ #include "qapi/error.h" #include "trace.h" +typedef struct VFIOSpaprContainer { + VFIOContainer container; +} VFIOSpaprContainer; + static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section) { if (memory_region_is_iommu(section->mr)) { @@ -421,6 +425,14 @@ void vfio_container_del_section_window(VFIOContainer *container, } } +static VFIOIOMMUOps vfio_iommu_spapr_ops; + +static void setup_spapr_ops(VFIOContainerBase *bcontainer) +{ + vfio_iommu_spapr_ops = *bcontainer->ops; + bcontainer->ops = &vfio_iommu_spapr_ops; +} + int vfio_spapr_container_init(VFIOContainer *container, Error **errp) { VFIOContainerBase *bcontainer = &container->bcontainer; @@ -486,6 +498,8 @@ int vfio_spapr_container_init(VFIOContainer *container, Error **errp) 0x1000); } + setup_spapr_ops(bcontainer); + return 0; listener_unregister_exit: diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index 9658ffb526..f62a14ac73 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -101,5 +101,11 @@ struct VFIOIOMMUOps { int (*set_dirty_page_tracking)(VFIOContainerBase *bcontainer, bool start); int (*query_dirty_bitmap)(VFIOContainerBase *bcontainer, VFIOBitmap *vbmap, hwaddr iova, hwaddr size); + /* SPAPR specific */ + int (*add_window)(VFIOContainerBase *bcontainer, + MemoryRegionSection *section, + Error **errp); + void (*del_window)(VFIOContainerBase *bcontainer, + MemoryRegionSection *section); }; #endif /* HW_VFIO_VFIO_CONTAINER_BASE_H */ -- Gitee From 42d02193bbe543173aa16e463015c76fa2d38ec0 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:34 +0800 Subject: [PATCH 625/939] vfio/spapr: switch to spapr IOMMU BE add/del_section_window MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No functional change intended. 
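As a standalone illustration (not taken from the patch itself): the add_window/del_window hooks are optional, and the generic wrappers added to container-base.c below simply return when a backend leaves them NULL, so only the spapr backend needs to implement them. The sketch below shows that optional-callback dispatch with toy types; only the callback names mirror the series.

    #include <stdio.h>
    #include <stddef.h>

    typedef struct Ops {
        /* optional, spapr-specific hook; may be NULL */
        int (*add_window)(void *container, const char *section);
    } Ops;

    typedef struct Container { const Ops *ops; } Container;

    /* generic wrapper: succeed silently when the backend has no hook */
    static int container_add_section_window(Container *c, const char *section)
    {
        if (!c->ops->add_window) {
            return 0;
        }
        return c->ops->add_window(c, section);
    }

    static int spapr_add_window(void *c, const char *s)
    {
        printf("spapr: creating DMA window for %s\n", s);
        return 0;
    }

    static const Ops legacy_ops = { .add_window = NULL };
    static const Ops spapr_ops  = { .add_window = spapr_add_window };

    int main(void)
    {
        Container a = { &legacy_ops }, b = { &spapr_ops };
        container_add_section_window(&a, "ram");   /* no-op for the legacy backend */
        container_add_section_window(&b, "ram");   /* dispatches to the spapr hook */
        return 0;
    }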
Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/common.c | 8 ++------ hw/vfio/container-base.c | 21 +++++++++++++++++++++ hw/vfio/spapr.c | 19 ++++++++++++++----- include/hw/vfio/vfio-common.h | 5 ----- include/hw/vfio/vfio-container-base.h | 5 +++++ 5 files changed, 42 insertions(+), 16 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 488aa43c9b..679fee4321 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -571,8 +571,6 @@ static void vfio_listener_region_add(MemoryListener *listener, { VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, listener); - VFIOContainer *container = container_of(bcontainer, VFIOContainer, - bcontainer); hwaddr iova, end; Int128 llend, llsize; void *vaddr; @@ -595,7 +593,7 @@ static void vfio_listener_region_add(MemoryListener *listener, return; } - if (vfio_container_add_section_window(container, section, &err)) { + if (vfio_container_add_section_window(bcontainer, section, &err)) { goto fail; } @@ -738,8 +736,6 @@ static void vfio_listener_region_del(MemoryListener *listener, { VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, listener); - VFIOContainer *container = container_of(bcontainer, VFIOContainer, - bcontainer); hwaddr iova, end; Int128 llend, llsize; int ret; @@ -818,7 +814,7 @@ static void vfio_listener_region_del(MemoryListener *listener, memory_region_unref(section->mr); - vfio_container_del_section_window(container, section); + vfio_container_del_section_window(bcontainer, section); } typedef struct VFIODirtyRanges { diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c index 0177f43741..71f7274973 100644 --- a/hw/vfio/container-base.c +++ b/hw/vfio/container-base.c @@ -31,6 +31,27 @@ int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, return bcontainer->ops->dma_unmap(bcontainer, iova, size, iotlb); } +int vfio_container_add_section_window(VFIOContainerBase *bcontainer, + MemoryRegionSection *section, + Error **errp) +{ + if (!bcontainer->ops->add_window) { + return 0; + } + + return bcontainer->ops->add_window(bcontainer, section, errp); +} + +void vfio_container_del_section_window(VFIOContainerBase *bcontainer, + MemoryRegionSection *section) +{ + if (!bcontainer->ops->del_window) { + return; + } + + return bcontainer->ops->del_window(bcontainer, section); +} + int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer, bool start) { diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c index e1a6b35563..5be1911aad 100644 --- a/hw/vfio/spapr.c +++ b/hw/vfio/spapr.c @@ -319,10 +319,13 @@ static int vfio_spapr_create_window(VFIOContainer *container, return 0; } -int vfio_container_add_section_window(VFIOContainer *container, - MemoryRegionSection *section, - Error **errp) +static int +vfio_spapr_container_add_section_window(VFIOContainerBase *bcontainer, + MemoryRegionSection *section, + Error **errp) { + VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); VFIOHostDMAWindow *hostwin; hwaddr pgsize = 0; int ret; @@ -407,9 +410,13 @@ int vfio_container_add_section_window(VFIOContainer *container, return 0; } -void vfio_container_del_section_window(VFIOContainer *container, - MemoryRegionSection *section) +static void +vfio_spapr_container_del_section_window(VFIOContainerBase *bcontainer, + MemoryRegionSection *section) { + VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); + if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) { return; } @@ 
-430,6 +437,8 @@ static VFIOIOMMUOps vfio_iommu_spapr_ops; static void setup_spapr_ops(VFIOContainerBase *bcontainer) { vfio_iommu_spapr_ops = *bcontainer->ops; + vfio_iommu_spapr_ops.add_window = vfio_spapr_container_add_section_window; + vfio_iommu_spapr_ops.del_window = vfio_spapr_container_del_section_window; bcontainer->ops = &vfio_iommu_spapr_ops; } diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 9a2e0ace72..c6b1260911 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -183,11 +183,6 @@ VFIODMARange *vfio_lookup_match_range(VFIOContainer *container, void vfio_dma_range_init_dirty_bitmap(VFIODMARange *qrange); /* SPAPR specific */ -int vfio_container_add_section_window(VFIOContainer *container, - MemoryRegionSection *section, - Error **errp); -void vfio_container_del_section_window(VFIOContainer *container, - MemoryRegionSection *section); int vfio_spapr_container_init(VFIOContainer *container, Error **errp); void vfio_spapr_container_deinit(VFIOContainer *container); diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index f62a14ac73..4b6f017c6f 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -75,6 +75,11 @@ int vfio_container_dma_map(VFIOContainerBase *bcontainer, int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, IOMMUTLBEntry *iotlb); +int vfio_container_add_section_window(VFIOContainerBase *bcontainer, + MemoryRegionSection *section, + Error **errp); +void vfio_container_del_section_window(VFIOContainerBase *bcontainer, + MemoryRegionSection *section); int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer, bool start); int vfio_container_query_dirty_bitmap(VFIOContainerBase *bcontainer, -- Gitee From 8f27e17107a923a0739c17efe5dcd11f818364af Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:35 +0800 Subject: [PATCH 626/939] vfio/spapr: Move prereg_listener into spapr container MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No functional changes intended. 
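As a standalone illustration (not taken from the patch itself): after this move the prereg listener callbacks recover their state in two hops, container_of() from the embedded MemoryListener to VFIOSpaprContainer, then the embedded VFIOContainer/VFIOContainerBase from there. The sketch below reproduces that nested-embedding pattern with toy types and the usual offsetof-based container_of() definition.

    #include <stdio.h>
    #include <stddef.h>

    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    typedef struct Listener { void (*region_add)(struct Listener *l); } Listener;
    typedef struct Base   { int id; } Base;
    typedef struct Legacy { Base bcontainer; } Legacy;
    typedef struct Spapr  { Legacy container; Listener prereg_listener; } Spapr;

    static void prereg_region_add(Listener *l)
    {
        /* listener -> spapr container -> legacy container -> base */
        Spapr  *s = container_of(l, Spapr, prereg_listener);
        Legacy *c = &s->container;
        Base   *b = &c->bcontainer;

        printf("prereg add on container %d\n", b->id);
    }

    int main(void)
    {
        Spapr s = { .container.bcontainer.id = 7,
                    .prereg_listener = { prereg_region_add } };
        s.prereg_listener.region_add(&s.prereg_listener);
        return 0;
    }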
Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/spapr.c | 24 ++++++++++++++++-------- include/hw/vfio/vfio-common.h | 1 - 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c index 5be1911aad..68c3dd6c75 100644 --- a/hw/vfio/spapr.c +++ b/hw/vfio/spapr.c @@ -26,6 +26,7 @@ typedef struct VFIOSpaprContainer { VFIOContainer container; + MemoryListener prereg_listener; } VFIOSpaprContainer; static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section) @@ -48,8 +49,9 @@ static void *vfio_prereg_gpa_to_vaddr(MemoryRegionSection *section, hwaddr gpa) static void vfio_prereg_listener_region_add(MemoryListener *listener, MemoryRegionSection *section) { - VFIOContainer *container = container_of(listener, VFIOContainer, - prereg_listener); + VFIOSpaprContainer *scontainer = container_of(listener, VFIOSpaprContainer, + prereg_listener); + VFIOContainer *container = &scontainer->container; VFIOContainerBase *bcontainer = &container->bcontainer; const hwaddr gpa = section->offset_within_address_space; hwaddr end; @@ -107,8 +109,9 @@ static void vfio_prereg_listener_region_add(MemoryListener *listener, static void vfio_prereg_listener_region_del(MemoryListener *listener, MemoryRegionSection *section) { - VFIOContainer *container = container_of(listener, VFIOContainer, - prereg_listener); + VFIOSpaprContainer *scontainer = container_of(listener, VFIOSpaprContainer, + prereg_listener); + VFIOContainer *container = &scontainer->container; const hwaddr gpa = section->offset_within_address_space; hwaddr end; int ret; @@ -445,6 +448,8 @@ static void setup_spapr_ops(VFIOContainerBase *bcontainer) int vfio_spapr_container_init(VFIOContainer *container, Error **errp) { VFIOContainerBase *bcontainer = &container->bcontainer; + VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer, + container); struct vfio_iommu_spapr_tce_info info; bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU; int ret, fd = container->fd; @@ -463,9 +468,9 @@ int vfio_spapr_container_init(VFIOContainer *container, Error **errp) return -errno; } } else { - container->prereg_listener = vfio_prereg_listener; + scontainer->prereg_listener = vfio_prereg_listener; - memory_listener_register(&container->prereg_listener, + memory_listener_register(&scontainer->prereg_listener, &address_space_memory); if (bcontainer->error) { ret = -1; @@ -513,7 +518,7 @@ int vfio_spapr_container_init(VFIOContainer *container, Error **errp) listener_unregister_exit: if (v2) { - memory_listener_unregister(&container->prereg_listener); + memory_listener_unregister(&scontainer->prereg_listener); } return ret; } @@ -523,7 +528,10 @@ void vfio_spapr_container_deinit(VFIOContainer *container) VFIOHostDMAWindow *hostwin, *next; if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { - memory_listener_unregister(&container->prereg_listener); + VFIOSpaprContainer *scontainer = container_of(container, + VFIOSpaprContainer, + container); + memory_listener_unregister(&scontainer->prereg_listener); } QLIST_FOREACH_SAFE(hostwin, &container->hostwin_list, hostwin_next, next) { diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index c6b1260911..ba8abed75a 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -86,7 +86,6 @@ typedef struct VFIODMARange { typedef struct VFIOContainer { VFIOContainerBase bcontainer; int fd; /* /dev/vfio/vfio, empowered by the 
attached groups */ - MemoryListener prereg_listener; unsigned iommu_type; bool dirty_log_manual_clear; QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; -- Gitee From 13c57d5e888fe9d6bdf68469c8e76991a789c1e6 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:36 +0800 Subject: [PATCH 627/939] vfio/spapr: Move hostwin_list into spapr container MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No functional changes intended. Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/spapr.c | 36 +++++++++++++++++++---------------- include/hw/vfio/vfio-common.h | 1 - 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c index 68c3dd6c75..5c6426e697 100644 --- a/hw/vfio/spapr.c +++ b/hw/vfio/spapr.c @@ -27,6 +27,7 @@ typedef struct VFIOSpaprContainer { VFIOContainer container; MemoryListener prereg_listener; + QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; } VFIOSpaprContainer; static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section) @@ -154,12 +155,12 @@ static const MemoryListener vfio_prereg_listener = { .region_del = vfio_prereg_listener_region_del, }; -static void vfio_host_win_add(VFIOContainer *container, hwaddr min_iova, +static void vfio_host_win_add(VFIOSpaprContainer *scontainer, hwaddr min_iova, hwaddr max_iova, uint64_t iova_pgsizes) { VFIOHostDMAWindow *hostwin; - QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { + QLIST_FOREACH(hostwin, &scontainer->hostwin_list, hostwin_next) { if (ranges_overlap(hostwin->min_iova, hostwin->max_iova - hostwin->min_iova + 1, min_iova, @@ -173,15 +174,15 @@ static void vfio_host_win_add(VFIOContainer *container, hwaddr min_iova, hostwin->min_iova = min_iova; hostwin->max_iova = max_iova; hostwin->iova_pgsizes = iova_pgsizes; - QLIST_INSERT_HEAD(&container->hostwin_list, hostwin, hostwin_next); + QLIST_INSERT_HEAD(&scontainer->hostwin_list, hostwin, hostwin_next); } -static int vfio_host_win_del(VFIOContainer *container, +static int vfio_host_win_del(VFIOSpaprContainer *scontainer, hwaddr min_iova, hwaddr max_iova) { VFIOHostDMAWindow *hostwin; - QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { + QLIST_FOREACH(hostwin, &scontainer->hostwin_list, hostwin_next) { if (hostwin->min_iova == min_iova && hostwin->max_iova == max_iova) { QLIST_REMOVE(hostwin, hostwin_next); g_free(hostwin); @@ -192,7 +193,7 @@ static int vfio_host_win_del(VFIOContainer *container, return -1; } -static VFIOHostDMAWindow *vfio_find_hostwin(VFIOContainer *container, +static VFIOHostDMAWindow *vfio_find_hostwin(VFIOSpaprContainer *container, hwaddr iova, hwaddr end) { VFIOHostDMAWindow *hostwin; @@ -329,6 +330,8 @@ vfio_spapr_container_add_section_window(VFIOContainerBase *bcontainer, { VFIOContainer *container = container_of(bcontainer, VFIOContainer, bcontainer); + VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer, + container); VFIOHostDMAWindow *hostwin; hwaddr pgsize = 0; int ret; @@ -344,7 +347,7 @@ vfio_spapr_container_add_section_window(VFIOContainerBase *bcontainer, iova = section->offset_within_address_space; end = iova + int128_get64(section->size) - 1; - if (!vfio_find_hostwin(container, iova, end)) { + if (!vfio_find_hostwin(scontainer, iova, end)) { error_setg(errp, "Container %p can't map guest IOVA region" " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container, iova, end); @@ -358,7 +361,7 @@ 
vfio_spapr_container_add_section_window(VFIOContainerBase *bcontainer, } /* For now intersections are not allowed, we may relax this later */ - QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { + QLIST_FOREACH(hostwin, &scontainer->hostwin_list, hostwin_next) { if (ranges_overlap(hostwin->min_iova, hostwin->max_iova - hostwin->min_iova + 1, section->offset_within_address_space, @@ -380,7 +383,7 @@ vfio_spapr_container_add_section_window(VFIOContainerBase *bcontainer, return ret; } - vfio_host_win_add(container, section->offset_within_address_space, + vfio_host_win_add(scontainer, section->offset_within_address_space, section->offset_within_address_space + int128_get64(section->size) - 1, pgsize); #ifdef CONFIG_KVM @@ -419,6 +422,8 @@ vfio_spapr_container_del_section_window(VFIOContainerBase *bcontainer, { VFIOContainer *container = container_of(bcontainer, VFIOContainer, bcontainer); + VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer, + container); if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) { return; @@ -426,7 +431,7 @@ vfio_spapr_container_del_section_window(VFIOContainerBase *bcontainer, vfio_spapr_remove_window(container, section->offset_within_address_space); - if (vfio_host_win_del(container, + if (vfio_host_win_del(scontainer, section->offset_within_address_space, section->offset_within_address_space + int128_get64(section->size) - 1) < 0) { @@ -454,7 +459,7 @@ int vfio_spapr_container_init(VFIOContainer *container, Error **errp) bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU; int ret, fd = container->fd; - QLIST_INIT(&container->hostwin_list); + QLIST_INIT(&scontainer->hostwin_list); /* * The host kernel code implementing VFIO_IOMMU_DISABLE is called @@ -506,7 +511,7 @@ int vfio_spapr_container_init(VFIOContainer *container, Error **errp) } else { /* The default table uses 4K pages */ bcontainer->pgsizes = 0x1000; - vfio_host_win_add(container, info.dma32_window_start, + vfio_host_win_add(scontainer, info.dma32_window_start, info.dma32_window_start + info.dma32_window_size - 1, 0x1000); @@ -525,15 +530,14 @@ listener_unregister_exit: void vfio_spapr_container_deinit(VFIOContainer *container) { + VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer, + container); VFIOHostDMAWindow *hostwin, *next; if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { - VFIOSpaprContainer *scontainer = container_of(container, - VFIOSpaprContainer, - container); memory_listener_unregister(&scontainer->prereg_listener); } - QLIST_FOREACH_SAFE(hostwin, &container->hostwin_list, hostwin_next, + QLIST_FOREACH_SAFE(hostwin, &scontainer->hostwin_list, hostwin_next, next) { QLIST_REMOVE(hostwin, hostwin_next); g_free(hostwin); diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index ba8abed75a..9e22acbfb6 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -88,7 +88,6 @@ typedef struct VFIOContainer { int fd; /* /dev/vfio/vfio, empowered by the attached groups */ unsigned iommu_type; bool dirty_log_manual_clear; - QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; QLIST_HEAD(, VFIOGroup) group_list; QLIST_HEAD(, VFIODMARange) dma_list; } VFIOContainer; -- Gitee From 6cb41a55992571dd215fee86ed910bb4d6688bf8 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Sat, 11 Jan 2025 10:52:37 +0800 Subject: [PATCH 628/939] backends/iommufd: Introduce the iommufd object MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce an iommufd object 
which allows the interaction with the host /dev/iommu device. The /dev/iommu can have been already pre-opened outside of qemu, in which case the fd can be passed directly along with the iommufd object: This allows the iommufd object to be shared accross several subsystems (VFIO, VDPA, ...). For example, libvirt would open the /dev/iommu once. If no fd is passed along with the iommufd object, the /dev/iommu is opened by the qemu code. Suggested-by: Alex Williamson Signed-off-by: Eric Auger Signed-off-by: Yi Liu Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Tested-by: Eric Auger Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- MAINTAINERS | 8 ++ backends/Kconfig | 4 + backends/iommufd.c | 245 +++++++++++++++++++++++++++++++++++++++ backends/meson.build | 1 + backends/trace-events | 10 ++ include/sysemu/iommufd.h | 38 ++++++ qapi/qom.json | 19 +++ qemu-options.hx | 12 ++ 8 files changed, 337 insertions(+) create mode 100644 backends/iommufd.c create mode 100644 include/sysemu/iommufd.h diff --git a/MAINTAINERS b/MAINTAINERS index 695e0bd34f..a5a446914a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2167,6 +2167,14 @@ F: hw/vfio/ap.c F: docs/system/s390x/vfio-ap.rst L: qemu-s390x@nongnu.org +iommufd +M: Yi Liu +M: Eric Auger +M: Zhenzhong Duan +S: Supported +F: backends/iommufd.c +F: include/sysemu/iommufd.h + vhost M: Michael S. Tsirkin S: Supported diff --git a/backends/Kconfig b/backends/Kconfig index f35abc1609..2cb23f62fa 100644 --- a/backends/Kconfig +++ b/backends/Kconfig @@ -1 +1,5 @@ source tpm/Kconfig + +config IOMMUFD + bool + depends on VFIO diff --git a/backends/iommufd.c b/backends/iommufd.c new file mode 100644 index 0000000000..ba58a0eb0d --- /dev/null +++ b/backends/iommufd.c @@ -0,0 +1,245 @@ +/* + * iommufd container backend + * + * Copyright (C) 2023 Intel Corporation. + * Copyright Red Hat, Inc. 
2023 + * + * Authors: Yi Liu + * Eric Auger + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "sysemu/iommufd.h" +#include "qapi/error.h" +#include "qapi/qmp/qerror.h" +#include "qemu/module.h" +#include "qom/object_interfaces.h" +#include "qemu/error-report.h" +#include "monitor/monitor.h" +#include "trace.h" +#include +#include + +static void iommufd_backend_init(Object *obj) +{ + IOMMUFDBackend *be = IOMMUFD_BACKEND(obj); + + be->fd = -1; + be->users = 0; + be->owned = true; + qemu_mutex_init(&be->lock); +} + +static void iommufd_backend_finalize(Object *obj) +{ + IOMMUFDBackend *be = IOMMUFD_BACKEND(obj); + + if (be->owned) { + close(be->fd); + be->fd = -1; + } +} + +static void iommufd_backend_set_fd(Object *obj, const char *str, Error **errp) +{ + IOMMUFDBackend *be = IOMMUFD_BACKEND(obj); + int fd = -1; + + fd = monitor_fd_param(monitor_cur(), str, errp); + if (fd == -1) { + error_prepend(errp, "Could not parse remote object fd %s:", str); + return; + } + qemu_mutex_lock(&be->lock); + be->fd = fd; + be->owned = false; + qemu_mutex_unlock(&be->lock); + trace_iommu_backend_set_fd(be->fd); +} + +static bool iommufd_backend_can_be_deleted(UserCreatable *uc) +{ + IOMMUFDBackend *be = IOMMUFD_BACKEND(uc); + + return !be->users; +} + +static void iommufd_backend_class_init(ObjectClass *oc, void *data) +{ + UserCreatableClass *ucc = USER_CREATABLE_CLASS(oc); + + ucc->can_be_deleted = iommufd_backend_can_be_deleted; + + object_class_property_add_str(oc, "fd", NULL, iommufd_backend_set_fd); +} + +int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp) +{ + int fd, ret = 0; + + qemu_mutex_lock(&be->lock); + if (be->users == UINT32_MAX) { + error_setg(errp, "too many connections"); + ret = -E2BIG; + goto out; + } + if (be->owned && !be->users) { + fd = qemu_open_old("/dev/iommu", O_RDWR); + if (fd < 0) { + error_setg_errno(errp, errno, "/dev/iommu opening failed"); + ret = fd; + goto out; + } + be->fd = fd; + } + be->users++; +out: + trace_iommufd_backend_connect(be->fd, be->owned, + be->users, ret); + qemu_mutex_unlock(&be->lock); + return ret; +} + +void iommufd_backend_disconnect(IOMMUFDBackend *be) +{ + qemu_mutex_lock(&be->lock); + if (!be->users) { + goto out; + } + be->users--; + if (!be->users && be->owned) { + close(be->fd); + be->fd = -1; + } +out: + trace_iommufd_backend_disconnect(be->fd, be->users); + qemu_mutex_unlock(&be->lock); +} + +int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id, + Error **errp) +{ + int ret, fd = be->fd; + struct iommu_ioas_alloc alloc_data = { + .size = sizeof(alloc_data), + .flags = 0, + }; + + ret = ioctl(fd, IOMMU_IOAS_ALLOC, &alloc_data); + if (ret) { + error_setg_errno(errp, errno, "Failed to allocate ioas"); + return ret; + } + + *ioas_id = alloc_data.out_ioas_id; + trace_iommufd_backend_alloc_ioas(fd, *ioas_id, ret); + + return ret; +} + +void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id) +{ + int ret, fd = be->fd; + struct iommu_destroy des = { + .size = sizeof(des), + .id = id, + }; + + ret = ioctl(fd, IOMMU_DESTROY, &des); + trace_iommufd_backend_free_id(fd, id, ret); + if (ret) { + error_report("Failed to free id: %u %m", id); + } +} + +int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova, + ram_addr_t size, void *vaddr, bool readonly) +{ + int ret, fd = be->fd; + struct iommu_ioas_map map = { + .size = sizeof(map), + .flags = IOMMU_IOAS_MAP_READABLE | + IOMMU_IOAS_MAP_FIXED_IOVA, + .ioas_id = ioas_id, + .__reserved = 0, + .user_va = 
(uintptr_t)vaddr, + .iova = iova, + .length = size, + }; + + if (!readonly) { + map.flags |= IOMMU_IOAS_MAP_WRITEABLE; + } + + ret = ioctl(fd, IOMMU_IOAS_MAP, &map); + trace_iommufd_backend_map_dma(fd, ioas_id, iova, size, + vaddr, readonly, ret); + if (ret) { + ret = -errno; + + /* TODO: Not support mapping hardware PCI BAR region for now. */ + if (errno == EFAULT) { + warn_report("IOMMU_IOAS_MAP failed: %m, PCI BAR?"); + } else { + error_report("IOMMU_IOAS_MAP failed: %m"); + } + } + return ret; +} + +int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, + hwaddr iova, ram_addr_t size) +{ + int ret, fd = be->fd; + struct iommu_ioas_unmap unmap = { + .size = sizeof(unmap), + .ioas_id = ioas_id, + .iova = iova, + .length = size, + }; + + ret = ioctl(fd, IOMMU_IOAS_UNMAP, &unmap); + /* + * IOMMUFD takes mapping as some kind of object, unmapping + * nonexistent mapping is treated as deleting a nonexistent + * object and return ENOENT. This is different from legacy + * backend which allows it. vIOMMU may trigger a lot of + * redundant unmapping, to avoid flush the log, treat them + * as succeess for IOMMUFD just like legacy backend. + */ + if (ret && errno == ENOENT) { + trace_iommufd_backend_unmap_dma_non_exist(fd, ioas_id, iova, size, ret); + ret = 0; + } else { + trace_iommufd_backend_unmap_dma(fd, ioas_id, iova, size, ret); + } + + if (ret) { + ret = -errno; + error_report("IOMMU_IOAS_UNMAP failed: %m"); + } + return ret; +} + +static const TypeInfo iommufd_backend_info = { + .name = TYPE_IOMMUFD_BACKEND, + .parent = TYPE_OBJECT, + .instance_size = sizeof(IOMMUFDBackend), + .instance_init = iommufd_backend_init, + .instance_finalize = iommufd_backend_finalize, + .class_size = sizeof(IOMMUFDBackendClass), + .class_init = iommufd_backend_class_init, + .interfaces = (InterfaceInfo[]) { + { TYPE_USER_CREATABLE }, + { } + } +}; + +static void register_types(void) +{ + type_register_static(&iommufd_backend_info); +} + +type_init(register_types); diff --git a/backends/meson.build b/backends/meson.build index 914c7c4afb..9a5cea480d 100644 --- a/backends/meson.build +++ b/backends/meson.build @@ -20,6 +20,7 @@ if have_vhost_user system_ss.add(when: 'CONFIG_VIRTIO', if_true: files('vhost-user.c')) endif system_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('cryptodev-vhost.c')) +system_ss.add(when: 'CONFIG_IOMMUFD', if_true: files('iommufd.c')) if have_vhost_user_crypto system_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('cryptodev-vhost-user.c')) endif diff --git a/backends/trace-events b/backends/trace-events index 652eb76a57..d45c6e31a6 100644 --- a/backends/trace-events +++ b/backends/trace-events @@ -5,3 +5,13 @@ dbus_vmstate_pre_save(void) dbus_vmstate_post_load(int version_id) "version_id: %d" dbus_vmstate_loading(const char *id) "id: %s" dbus_vmstate_saving(const char *id) "id: %s" + +# iommufd.c +iommufd_backend_connect(int fd, bool owned, uint32_t users, int ret) "fd=%d owned=%d users=%d (%d)" +iommufd_backend_disconnect(int fd, uint32_t users) "fd=%d users=%d" +iommu_backend_set_fd(int fd) "pre-opened /dev/iommu fd=%d" +iommufd_backend_map_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, void *vaddr, bool readonly, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" addr=%p readonly=%d (%d)" +iommufd_backend_unmap_dma_non_exist(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " Unmap nonexistent mapping: iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)" +iommufd_backend_unmap_dma(int iommufd, uint32_t ioas, 
uint64_t iova, uint64_t size, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)" +iommufd_backend_alloc_ioas(int iommufd, uint32_t ioas, int ret) " iommufd=%d ioas=%d (%d)" +iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d id=%d (%d)" diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h new file mode 100644 index 0000000000..9c5524b0ed --- /dev/null +++ b/include/sysemu/iommufd.h @@ -0,0 +1,38 @@ +#ifndef SYSEMU_IOMMUFD_H +#define SYSEMU_IOMMUFD_H + +#include "qom/object.h" +#include "qemu/thread.h" +#include "exec/hwaddr.h" +#include "exec/cpu-common.h" + +#define TYPE_IOMMUFD_BACKEND "iommufd" +OBJECT_DECLARE_TYPE(IOMMUFDBackend, IOMMUFDBackendClass, IOMMUFD_BACKEND) + +struct IOMMUFDBackendClass { + ObjectClass parent_class; +}; + +struct IOMMUFDBackend { + Object parent; + + /*< protected >*/ + int fd; /* /dev/iommu file descriptor */ + bool owned; /* is the /dev/iommu opened internally */ + QemuMutex lock; + uint32_t users; + + /*< public >*/ +}; + +int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp); +void iommufd_backend_disconnect(IOMMUFDBackend *be); + +int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id, + Error **errp); +void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id); +int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova, + ram_addr_t size, void *vaddr, bool readonly); +int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, + hwaddr iova, ram_addr_t size); +#endif diff --git a/qapi/qom.json b/qapi/qom.json index a74c7a91f9..a5336e6b11 100644 --- a/qapi/qom.json +++ b/qapi/qom.json @@ -794,6 +794,23 @@ { 'struct': 'VfioUserServerProperties', 'data': { 'socket': 'SocketAddress', 'device': 'str' } } +## +# @IOMMUFDProperties: +# +# Properties for iommufd objects. +# +# @fd: file descriptor name previously passed via 'getfd' command, +# which represents a pre-opened /dev/iommu. This allows the +# iommufd object to be shared accross several subsystems +# (VFIO, VDPA, ...), and the file descriptor to be shared +# with other process, e.g. DPDK. (default: QEMU opens +# /dev/iommu by itself) +# +# Since: 9.0 +## +{ 'struct': 'IOMMUFDProperties', + 'data': { '*fd': 'str' } } + ## # @RngProperties: # @@ -969,6 +986,7 @@ 'input-barrier', { 'name': 'input-linux', 'if': 'CONFIG_LINUX' }, + 'iommufd', 'iothread', 'main-loop', { 'name': 'memory-backend-epc', @@ -1039,6 +1057,7 @@ 'input-barrier': 'InputBarrierProperties', 'input-linux': { 'type': 'InputLinuxProperties', 'if': 'CONFIG_LINUX' }, + 'iommufd': 'IOMMUFDProperties', 'iothread': 'IothreadProperties', 'main-loop': 'MainLoopProperties', 'memory-backend-epc': { 'type': 'MemoryBackendEpcProperties', diff --git a/qemu-options.hx b/qemu-options.hx index 8516b73206..7fe76c4b1d 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -5224,6 +5224,18 @@ SRST The ``share`` boolean option is on by default with memfd. + ``-object iommufd,id=id[,fd=fd]`` + Creates an iommufd backend which allows control of DMA mapping + through the ``/dev/iommu`` device. + + The ``id`` parameter is a unique ID which frontends (such as + vfio-pci of vdpa) will use to connect with the iommufd backend. + + The ``fd`` parameter is an optional pre-opened file descriptor + resulting from ``/dev/iommu`` opening. Usually the iommufd is shared + across all subsystems, bringing the benefit of centralized + reference counting. 
+ ``-object rng-builtin,id=id`` Creates a random number generator backend which obtains entropy from QEMU builtin functions. The ``id`` parameter is a unique ID -- Gitee From 90688ff9c5802965f24460ac79fe52b93d2adb1f Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Sat, 11 Jan 2025 10:52:38 +0800 Subject: [PATCH 629/939] util/char_dev: Add open_cdev() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit /dev/vfio/devices/vfioX may not exist. In that case it is still possible to open /dev/char/$major:$minor instead. Add helper function to abstract the cdev open. Suggested-by: Jason Gunthorpe Signed-off-by: Yi Liu Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Reviewed-by: Eric Auger Tested-by: Eric Auger Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- MAINTAINERS | 2 + include/qemu/chardev_open.h | 16 ++++++++ util/chardev_open.c | 81 +++++++++++++++++++++++++++++++++++++ util/meson.build | 1 + 4 files changed, 100 insertions(+) create mode 100644 include/qemu/chardev_open.h create mode 100644 util/chardev_open.c diff --git a/MAINTAINERS b/MAINTAINERS index a5a446914a..ca70bb4e64 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2174,6 +2174,8 @@ M: Zhenzhong Duan S: Supported F: backends/iommufd.c F: include/sysemu/iommufd.h +F: include/qemu/chardev_open.h +F: util/chardev_open.c vhost M: Michael S. Tsirkin diff --git a/include/qemu/chardev_open.h b/include/qemu/chardev_open.h new file mode 100644 index 0000000000..64e8fcfdcb --- /dev/null +++ b/include/qemu/chardev_open.h @@ -0,0 +1,16 @@ +/* + * QEMU Chardev Helper + * + * Copyright (C) 2023 Intel Corporation. + * + * Authors: Yi Liu + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#ifndef QEMU_CHARDEV_OPEN_H +#define QEMU_CHARDEV_OPEN_H + +int open_cdev(const char *devpath, dev_t cdev); +#endif diff --git a/util/chardev_open.c b/util/chardev_open.c new file mode 100644 index 0000000000..f776429788 --- /dev/null +++ b/util/chardev_open.c @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2019, Mellanox Technologies. All rights reserved. + * Copyright (C) 2023 Intel Corporation. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Authors: Yi Liu + * + * Copied from + * https://github.com/linux-rdma/rdma-core/blob/master/util/open_cdev.c + * + */ + +#include "qemu/osdep.h" +#include "qemu/chardev_open.h" + +static int open_cdev_internal(const char *path, dev_t cdev) +{ + struct stat st; + int fd; + + fd = qemu_open_old(path, O_RDWR); + if (fd == -1) { + return -1; + } + if (fstat(fd, &st) || !S_ISCHR(st.st_mode) || + (cdev != 0 && st.st_rdev != cdev)) { + close(fd); + return -1; + } + return fd; +} + +static int open_cdev_robust(dev_t cdev) +{ + g_autofree char *devpath = NULL; + + /* + * This assumes that udev is being used and is creating the /dev/char/ + * symlinks. + */ + devpath = g_strdup_printf("/dev/char/%u:%u", major(cdev), minor(cdev)); + return open_cdev_internal(devpath, cdev); +} + +int open_cdev(const char *devpath, dev_t cdev) +{ + int fd; + + fd = open_cdev_internal(devpath, cdev); + if (fd == -1 && cdev != 0) { + return open_cdev_robust(cdev); + } + return fd; +} diff --git a/util/meson.build b/util/meson.build index c2322ef6e7..174c133368 100644 --- a/util/meson.build +++ b/util/meson.build @@ -108,6 +108,7 @@ if have_block util_ss.add(files('filemonitor-stub.c')) endif util_ss.add(when: 'CONFIG_LINUX', if_true: files('vfio-helpers.c')) + util_ss.add(when: 'CONFIG_LINUX', if_true: files('chardev_open.c')) endif if cpu == 'aarch64' -- Gitee From bf4c408cd5d3daadbfd11136655e5bcb40dcbba0 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:39 +0800 Subject: [PATCH 630/939] vfio/common: return early if space isn't empty MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a trivial optimization. If there is active container in space, vfio_reset_handler will never be unregistered. So revert the check of space->containers and return early. Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Reviewed-by: Eric Auger Tested-by: Eric Auger Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/common.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 679fee4321..f6c2029aec 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -1608,10 +1608,13 @@ VFIOAddressSpace *vfio_get_address_space(AddressSpace *as) void vfio_put_address_space(VFIOAddressSpace *space) { - if (QLIST_EMPTY(&space->containers)) { - QLIST_REMOVE(space, list); - g_free(space); + if (!QLIST_EMPTY(&space->containers)) { + return; } + + QLIST_REMOVE(space, list); + g_free(space); + if (QLIST_EMPTY(&vfio_address_spaces)) { qemu_unregister_reset(vfio_reset_handler, NULL); } -- Gitee From 5c034b7ec5ca255551956744a386288a74ab172e Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Sat, 11 Jan 2025 10:52:40 +0800 Subject: [PATCH 631/939] vfio/iommufd: Implement the iommufd backend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The iommufd backend is implemented based on the new /dev/iommu user API. This backend obviously depends on CONFIG_IOMMUFD. So far, the iommufd backend doesn't support dirty page sync yet. 
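Stripped of the container and memory-listener plumbing, the attach path added below reduces to a short ioctl sequence on /dev/iommu and the VFIO cdev: allocate an IOAS, bind the device to the iommufd, attach it to the IOAS, then map memory. The compressed sketch below is an illustration only, not the backend's code: error handling is omitted, the cdev path is an example value, and it assumes a kernel recent enough to provide these UAPIs.

    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/iommufd.h>
    #include <linux/vfio.h>

    int main(void)
    {
        int iommufd = open("/dev/iommu", O_RDWR);
        int devfd = open("/dev/vfio/devices/vfio0", O_RDWR);   /* example cdev */

        struct iommu_ioas_alloc alloc = { .size = sizeof(alloc) };
        ioctl(iommufd, IOMMU_IOAS_ALLOC, &alloc);               /* -> out_ioas_id */

        struct vfio_device_bind_iommufd bind = {
            .argsz = sizeof(bind), .iommufd = iommufd,
        };
        ioctl(devfd, VFIO_DEVICE_BIND_IOMMUFD, &bind);          /* -> out_devid */

        struct vfio_device_attach_iommufd_pt attach = {
            .argsz = sizeof(attach), .pt_id = alloc.out_ioas_id,
        };
        ioctl(devfd, VFIO_DEVICE_ATTACH_IOMMUFD_PT, &attach);

        static char buf[4096] __attribute__((aligned(4096)));
        struct iommu_ioas_map map = {
            .size = sizeof(map),
            .flags = IOMMU_IOAS_MAP_READABLE | IOMMU_IOAS_MAP_WRITEABLE |
                     IOMMU_IOAS_MAP_FIXED_IOVA,
            .ioas_id = alloc.out_ioas_id,
            .user_va = (unsigned long)buf,
            .iova = 0x100000,
            .length = sizeof(buf),
        };
        ioctl(iommufd, IOMMU_IOAS_MAP, &map);

        printf("devid %u attached to ioas %u\n", bind.out_devid, alloc.out_ioas_id);
        close(devfd);
        close(iommufd);
        return 0;
    }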
Co-authored-by: Eric Auger Signed-off-by: Yi Liu Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Tested-by: Eric Auger Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/common.c | 6 + hw/vfio/iommufd.c | 422 ++++++++++++++++++++++++++++++++++ hw/vfio/meson.build | 3 + hw/vfio/trace-events | 10 + include/hw/vfio/vfio-common.h | 11 + 5 files changed, 452 insertions(+) create mode 100644 hw/vfio/iommufd.c diff --git a/hw/vfio/common.c b/hw/vfio/common.c index f6c2029aec..0e900c6746 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -19,6 +19,7 @@ */ #include "qemu/osdep.h" +#include CONFIG_DEVICES /* CONFIG_IOMMUFD */ #include #ifdef CONFIG_KVM #include @@ -1649,6 +1650,11 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev, { const VFIOIOMMUOps *ops = &vfio_legacy_ops; +#ifdef CONFIG_IOMMUFD + if (vbasedev->iommufd) { + ops = &vfio_iommufd_ops; + } +#endif return ops->attach_device(name, vbasedev, as, errp); } diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c new file mode 100644 index 0000000000..6d31aeac7b --- /dev/null +++ b/hw/vfio/iommufd.c @@ -0,0 +1,422 @@ +/* + * iommufd container backend + * + * Copyright (C) 2023 Intel Corporation. + * Copyright Red Hat, Inc. 2023 + * + * Authors: Yi Liu + * Eric Auger + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include +#include +#include + +#include "hw/vfio/vfio-common.h" +#include "qemu/error-report.h" +#include "trace.h" +#include "qapi/error.h" +#include "sysemu/iommufd.h" +#include "hw/qdev-core.h" +#include "sysemu/reset.h" +#include "qemu/cutils.h" +#include "qemu/chardev_open.h" + +static int iommufd_cdev_map(VFIOContainerBase *bcontainer, hwaddr iova, + ram_addr_t size, void *vaddr, bool readonly) +{ + VFIOIOMMUFDContainer *container = + container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); + + return iommufd_backend_map_dma(container->be, + container->ioas_id, + iova, size, vaddr, readonly); +} + +static int iommufd_cdev_unmap(VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + IOMMUTLBEntry *iotlb) +{ + VFIOIOMMUFDContainer *container = + container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); + + /* TODO: Handle dma_unmap_bitmap with iotlb args (migration) */ + return iommufd_backend_unmap_dma(container->be, + container->ioas_id, iova, size); +} + +static int iommufd_cdev_kvm_device_add(VFIODevice *vbasedev, Error **errp) +{ + return vfio_kvm_device_add_fd(vbasedev->fd, errp); +} + +static void iommufd_cdev_kvm_device_del(VFIODevice *vbasedev) +{ + Error *err = NULL; + + if (vfio_kvm_device_del_fd(vbasedev->fd, &err)) { + error_report_err(err); + } +} + +static int iommufd_cdev_connect_and_bind(VFIODevice *vbasedev, Error **errp) +{ + IOMMUFDBackend *iommufd = vbasedev->iommufd; + struct vfio_device_bind_iommufd bind = { + .argsz = sizeof(bind), + .flags = 0, + }; + int ret; + + ret = iommufd_backend_connect(iommufd, errp); + if (ret) { + return ret; + } + + /* + * Add device to kvm-vfio to be prepared for the tracking + * in KVM. Especially for some emulated devices, it requires + * to have kvm information in the device open. 
+ */ + ret = iommufd_cdev_kvm_device_add(vbasedev, errp); + if (ret) { + goto err_kvm_device_add; + } + + /* Bind device to iommufd */ + bind.iommufd = iommufd->fd; + ret = ioctl(vbasedev->fd, VFIO_DEVICE_BIND_IOMMUFD, &bind); + if (ret) { + error_setg_errno(errp, errno, "error bind device fd=%d to iommufd=%d", + vbasedev->fd, bind.iommufd); + goto err_bind; + } + + vbasedev->devid = bind.out_devid; + trace_iommufd_cdev_connect_and_bind(bind.iommufd, vbasedev->name, + vbasedev->fd, vbasedev->devid); + return ret; +err_bind: + iommufd_cdev_kvm_device_del(vbasedev); +err_kvm_device_add: + iommufd_backend_disconnect(iommufd); + return ret; +} + +static void iommufd_cdev_unbind_and_disconnect(VFIODevice *vbasedev) +{ + /* Unbind is automatically conducted when device fd is closed */ + iommufd_cdev_kvm_device_del(vbasedev); + iommufd_backend_disconnect(vbasedev->iommufd); +} + +static int iommufd_cdev_getfd(const char *sysfs_path, Error **errp) +{ + long int ret = -ENOTTY; + char *path, *vfio_dev_path = NULL, *vfio_path = NULL; + DIR *dir = NULL; + struct dirent *dent; + gchar *contents; + struct stat st; + gsize length; + int major, minor; + dev_t vfio_devt; + + path = g_strdup_printf("%s/vfio-dev", sysfs_path); + if (stat(path, &st) < 0) { + error_setg_errno(errp, errno, "no such host device"); + goto out_free_path; + } + + dir = opendir(path); + if (!dir) { + error_setg_errno(errp, errno, "couldn't open directory %s", path); + goto out_free_path; + } + + while ((dent = readdir(dir))) { + if (!strncmp(dent->d_name, "vfio", 4)) { + vfio_dev_path = g_strdup_printf("%s/%s/dev", path, dent->d_name); + break; + } + } + + if (!vfio_dev_path) { + error_setg(errp, "failed to find vfio-dev/vfioX/dev"); + goto out_close_dir; + } + + if (!g_file_get_contents(vfio_dev_path, &contents, &length, NULL)) { + error_setg(errp, "failed to load \"%s\"", vfio_dev_path); + goto out_free_dev_path; + } + + if (sscanf(contents, "%d:%d", &major, &minor) != 2) { + error_setg(errp, "failed to get major:minor for \"%s\"", vfio_dev_path); + goto out_free_dev_path; + } + g_free(contents); + vfio_devt = makedev(major, minor); + + vfio_path = g_strdup_printf("/dev/vfio/devices/%s", dent->d_name); + ret = open_cdev(vfio_path, vfio_devt); + if (ret < 0) { + error_setg(errp, "Failed to open %s", vfio_path); + } + + trace_iommufd_cdev_getfd(vfio_path, ret); + g_free(vfio_path); + +out_free_dev_path: + g_free(vfio_dev_path); +out_close_dir: + closedir(dir); +out_free_path: + if (*errp) { + error_prepend(errp, VFIO_MSG_PREFIX, path); + } + g_free(path); + + return ret; +} + +static int iommufd_cdev_attach_ioas_hwpt(VFIODevice *vbasedev, uint32_t id, + Error **errp) +{ + int ret, iommufd = vbasedev->iommufd->fd; + struct vfio_device_attach_iommufd_pt attach_data = { + .argsz = sizeof(attach_data), + .flags = 0, + .pt_id = id, + }; + + /* Attach device to an IOAS or hwpt within iommufd */ + ret = ioctl(vbasedev->fd, VFIO_DEVICE_ATTACH_IOMMUFD_PT, &attach_data); + if (ret) { + error_setg_errno(errp, errno, + "[iommufd=%d] error attach %s (%d) to id=%d", + iommufd, vbasedev->name, vbasedev->fd, id); + } else { + trace_iommufd_cdev_attach_ioas_hwpt(iommufd, vbasedev->name, + vbasedev->fd, id); + } + return ret; +} + +static int iommufd_cdev_detach_ioas_hwpt(VFIODevice *vbasedev, Error **errp) +{ + int ret, iommufd = vbasedev->iommufd->fd; + struct vfio_device_detach_iommufd_pt detach_data = { + .argsz = sizeof(detach_data), + .flags = 0, + }; + + ret = ioctl(vbasedev->fd, VFIO_DEVICE_DETACH_IOMMUFD_PT, &detach_data); + if (ret) { + 
error_setg_errno(errp, errno, "detach %s failed", vbasedev->name); + } else { + trace_iommufd_cdev_detach_ioas_hwpt(iommufd, vbasedev->name); + } + return ret; +} + +static int iommufd_cdev_attach_container(VFIODevice *vbasedev, + VFIOIOMMUFDContainer *container, + Error **errp) +{ + return iommufd_cdev_attach_ioas_hwpt(vbasedev, container->ioas_id, errp); +} + +static void iommufd_cdev_detach_container(VFIODevice *vbasedev, + VFIOIOMMUFDContainer *container) +{ + Error *err = NULL; + + if (iommufd_cdev_detach_ioas_hwpt(vbasedev, &err)) { + error_report_err(err); + } +} + +static void iommufd_cdev_container_destroy(VFIOIOMMUFDContainer *container) +{ + VFIOContainerBase *bcontainer = &container->bcontainer; + + if (!QLIST_EMPTY(&bcontainer->device_list)) { + return; + } + memory_listener_unregister(&bcontainer->listener); + vfio_container_destroy(bcontainer); + iommufd_backend_free_id(container->be, container->ioas_id); + g_free(container); +} + +static int iommufd_cdev_ram_block_discard_disable(bool state) +{ + /* + * We support coordinated discarding of RAM via the RamDiscardManager. + */ + return ram_block_uncoordinated_discard_disable(state); +} + +static int iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, + AddressSpace *as, Error **errp) +{ + VFIOContainerBase *bcontainer; + VFIOIOMMUFDContainer *container; + VFIOAddressSpace *space; + struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) }; + int ret, devfd; + uint32_t ioas_id; + Error *err = NULL; + + devfd = iommufd_cdev_getfd(vbasedev->sysfsdev, errp); + if (devfd < 0) { + return devfd; + } + vbasedev->fd = devfd; + + ret = iommufd_cdev_connect_and_bind(vbasedev, errp); + if (ret) { + goto err_connect_bind; + } + + space = vfio_get_address_space(as); + + /* try to attach to an existing container in this space */ + QLIST_FOREACH(bcontainer, &space->containers, next) { + container = container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); + if (bcontainer->ops != &vfio_iommufd_ops || + vbasedev->iommufd != container->be) { + continue; + } + if (iommufd_cdev_attach_container(vbasedev, container, &err)) { + const char *msg = error_get_pretty(err); + + trace_iommufd_cdev_fail_attach_existing_container(msg); + error_free(err); + err = NULL; + } else { + ret = iommufd_cdev_ram_block_discard_disable(true); + if (ret) { + error_setg(errp, + "Cannot set discarding of RAM broken (%d)", ret); + goto err_discard_disable; + } + goto found_container; + } + } + + /* Need to allocate a new dedicated container */ + ret = iommufd_backend_alloc_ioas(vbasedev->iommufd, &ioas_id, errp); + if (ret < 0) { + goto err_alloc_ioas; + } + + trace_iommufd_cdev_alloc_ioas(vbasedev->iommufd->fd, ioas_id); + + container = g_malloc0(sizeof(*container)); + container->be = vbasedev->iommufd; + container->ioas_id = ioas_id; + + bcontainer = &container->bcontainer; + vfio_container_init(bcontainer, space, &vfio_iommufd_ops); + QLIST_INSERT_HEAD(&space->containers, bcontainer, next); + + ret = iommufd_cdev_attach_container(vbasedev, container, errp); + if (ret) { + goto err_attach_container; + } + + ret = iommufd_cdev_ram_block_discard_disable(true); + if (ret) { + goto err_discard_disable; + } + + bcontainer->pgsizes = qemu_real_host_page_size(); + + bcontainer->listener = vfio_memory_listener; + memory_listener_register(&bcontainer->listener, bcontainer->space->as); + + if (bcontainer->error) { + ret = -1; + error_propagate_prepend(errp, bcontainer->error, + "memory listener initialization failed: "); + goto err_listener_register; + } + + 
bcontainer->initialized = true; + +found_container: + ret = ioctl(devfd, VFIO_DEVICE_GET_INFO, &dev_info); + if (ret) { + error_setg_errno(errp, errno, "error getting device info"); + goto err_listener_register; + } + + /* + * TODO: examine RAM_BLOCK_DISCARD stuff, should we do group level + * for discarding incompatibility check as well? + */ + if (vbasedev->ram_block_discard_allowed) { + iommufd_cdev_ram_block_discard_disable(false); + } + + vbasedev->group = 0; + vbasedev->num_irqs = dev_info.num_irqs; + vbasedev->num_regions = dev_info.num_regions; + vbasedev->flags = dev_info.flags; + vbasedev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET); + vbasedev->bcontainer = bcontainer; + QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next); + QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next); + + trace_iommufd_cdev_device_info(vbasedev->name, devfd, vbasedev->num_irqs, + vbasedev->num_regions, vbasedev->flags); + return 0; + +err_listener_register: + iommufd_cdev_ram_block_discard_disable(false); +err_discard_disable: + iommufd_cdev_detach_container(vbasedev, container); +err_attach_container: + iommufd_cdev_container_destroy(container); +err_alloc_ioas: + vfio_put_address_space(space); + iommufd_cdev_unbind_and_disconnect(vbasedev); +err_connect_bind: + close(vbasedev->fd); + return ret; +} + +static void iommufd_cdev_detach(VFIODevice *vbasedev) +{ + VFIOContainerBase *bcontainer = vbasedev->bcontainer; + VFIOAddressSpace *space = bcontainer->space; + VFIOIOMMUFDContainer *container = container_of(bcontainer, + VFIOIOMMUFDContainer, + bcontainer); + QLIST_REMOVE(vbasedev, global_next); + QLIST_REMOVE(vbasedev, container_next); + vbasedev->bcontainer = NULL; + + if (!vbasedev->ram_block_discard_allowed) { + iommufd_cdev_ram_block_discard_disable(false); + } + + iommufd_cdev_detach_container(vbasedev, container); + iommufd_cdev_container_destroy(container); + vfio_put_address_space(space); + + iommufd_cdev_unbind_and_disconnect(vbasedev); + close(vbasedev->fd); +} + +const VFIOIOMMUOps vfio_iommufd_ops = { + .dma_map = iommufd_cdev_map, + .dma_unmap = iommufd_cdev_unmap, + .attach_device = iommufd_cdev_attach, + .detach_device = iommufd_cdev_detach, +}; diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build index 32a6933280..bd5cc4ca79 100644 --- a/hw/vfio/meson.build +++ b/hw/vfio/meson.build @@ -7,6 +7,9 @@ vfio_ss.add(files( 'spapr.c', 'migration.c', )) +vfio_ss.add(when: 'CONFIG_IOMMUFD', if_true: files( + 'iommufd.c', +)) vfio_ss.add(when: 'CONFIG_VFIO_PCI', if_true: files( 'display.c', 'pci-quirks.c', diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index 08a1f9dfa4..3340c93af0 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -164,3 +164,13 @@ vfio_state_pending_estimate(const char *name, uint64_t precopy, uint64_t postcop vfio_state_pending_exact(const char *name, uint64_t precopy, uint64_t postcopy, uint64_t stopcopy_size, uint64_t precopy_init_size, uint64_t precopy_dirty_size) " (%s) precopy 0x%"PRIx64" postcopy 0x%"PRIx64" stopcopy size 0x%"PRIx64" precopy initial size 0x%"PRIx64" precopy dirty size 0x%"PRIx64 vfio_vmstate_change(const char *name, int running, const char *reason, const char *dev_state) " (%s) running %d reason %s device state %s" vfio_vmstate_change_prepare(const char *name, int running, const char *reason, const char *dev_state) " (%s) running %d reason %s device state %s" + +#iommufd.c + +iommufd_cdev_connect_and_bind(int iommufd, const char *name, int devfd, int devid) " [iommufd=%d] Successfully 
bound device %s (fd=%d): output devid=%d" +iommufd_cdev_getfd(const char *dev, int devfd) " %s (fd=%d)" +iommufd_cdev_attach_ioas_hwpt(int iommufd, const char *name, int devfd, int id) " [iommufd=%d] Successfully attached device %s (%d) to id=%d" +iommufd_cdev_detach_ioas_hwpt(int iommufd, const char *name) " [iommufd=%d] Successfully detached %s" +iommufd_cdev_fail_attach_existing_container(const char *msg) " %s" +iommufd_cdev_alloc_ioas(int iommufd, int ioas_id) " [iommufd=%d] new IOMMUFD container with ioasid=%d" +iommufd_cdev_device_info(char *name, int devfd, int num_irqs, int num_regions, int flags) " %s (%d) num_irqs=%d num_regions=%d flags=%d" diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 9e22acbfb6..9b9fd7b461 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -99,6 +99,14 @@ typedef struct VFIOHostDMAWindow { QLIST_ENTRY(VFIOHostDMAWindow) hostwin_next; } VFIOHostDMAWindow; +typedef struct IOMMUFDBackend IOMMUFDBackend; + +typedef struct VFIOIOMMUFDContainer { + VFIOContainerBase bcontainer; + IOMMUFDBackend *be; + uint32_t ioas_id; +} VFIOIOMMUFDContainer; + typedef struct VFIODeviceOps VFIODeviceOps; typedef struct VFIODevice { @@ -126,6 +134,8 @@ typedef struct VFIODevice { OnOffAuto pre_copy_dirty_page_tracking; bool dirty_pages_supported; bool dirty_tracking; + int devid; + IOMMUFDBackend *iommufd; } VFIODevice; struct VFIODeviceOps { @@ -215,6 +225,7 @@ typedef QLIST_HEAD(VFIODeviceList, VFIODevice) VFIODeviceList; extern VFIOGroupList vfio_group_list; extern VFIODeviceList vfio_device_list; extern const VFIOIOMMUOps vfio_legacy_ops; +extern const VFIOIOMMUOps vfio_iommufd_ops; extern const MemoryListener vfio_memory_listener; extern int vfio_kvm_device_fd; -- Gitee From cb2bd16a67cd45a0ad3318098120aee10a298f3b Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:41 +0800 Subject: [PATCH 632/939] vfio/iommufd: Relax assert check for iommufd backend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently iommufd doesn't support dirty page sync yet, but it will not block us doing live migration if VFIO migration is force enabled. So in this case we allow set_dirty_page_tracking to be NULL. Note we don't need same change for query_dirty_bitmap because when dirty page sync isn't supported, query_dirty_bitmap will never be called. 
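For readability, the net effect on the common helper, restated from the container-base.c hunk below, is that starting or stopping dirty-page tracking silently succeeds when the container advertises no dirty-page support, and the assertion on the backend hook is only reached otherwise:

    int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer,
                                                bool start)
    {
        if (!bcontainer->dirty_pages_supported) {
            return 0; /* no-op: force-enabled VFIO migration keeps working */
        }
        g_assert(bcontainer->ops->set_dirty_page_tracking);
        return bcontainer->ops->set_dirty_page_tracking(bcontainer, start);
    }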
Suggested-by: Cédric Le Goater Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Reviewed-by: Eric Auger Tested-by: Eric Auger Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/container-base.c | 4 ++++ hw/vfio/container.c | 4 ---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c index 71f7274973..eee2dcfe76 100644 --- a/hw/vfio/container-base.c +++ b/hw/vfio/container-base.c @@ -55,6 +55,10 @@ void vfio_container_del_section_window(VFIOContainerBase *bcontainer, int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer, bool start) { + if (!bcontainer->dirty_pages_supported) { + return 0; + } + g_assert(bcontainer->ops->set_dirty_page_tracking); return bcontainer->ops->set_dirty_page_tracking(bcontainer, start); } diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 62af0f2bdd..4936b8f27f 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -266,10 +266,6 @@ static int vfio_legacy_set_dirty_page_tracking(VFIOContainerBase *bcontainer, .argsz = sizeof(dirty), }; - if (!bcontainer->dirty_pages_supported) { - return 0; - } - if (start) { dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START; } else { -- Gitee From d6f0612a8760959f25c148ab50a1e7c394d4279a Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:42 +0800 Subject: [PATCH 633/939] vfio/iommufd: Add support for iova_ranges and pgsizes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some vIOMMU such as virtio-iommu use IOVA ranges from host side to setup reserved ranges for passthrough device, so that guest will not use an IOVA range beyond host support. Use an uAPI of IOMMUFD to get IOVA ranges of host side and pass to vIOMMU just like the legacy backend, if this fails, fallback to 64bit IOVA range. Also use out_iova_alignment returned from uAPI as pgsizes instead of qemu_real_host_page_size() as a fallback. 
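The query uses the usual probe-then-fill idiom for variable-sized iommufd ioctls; condensed from the iommufd.c hunk below, with error handling trimmed:

    struct iommu_ioas_iova_ranges *info = g_malloc0(sizeof(*info));

    info->size = sizeof(*info);
    info->ioas_id = ioas_id;
    /* First call carries no array: the kernel sets num_iovas and the
     * ioctl fails with EMSGSIZE. */
    ioctl(fd, IOMMU_IOAS_IOVA_RANGES, info);

    info = g_realloc(info, sizeof(*info) +
                     info->num_iovas * sizeof(struct iommu_iova_range));
    info->allowed_iovas = (uintptr_t)(info + 1);
    /* Second call fills the allowed ranges and out_iova_alignment. */
    ioctl(fd, IOMMU_IOAS_IOVA_RANGES, info);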
Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Reviewed-by: Eric Auger Tested-by: Eric Auger Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/iommufd.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 6d31aeac7b..01b448e840 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -261,6 +261,53 @@ static int iommufd_cdev_ram_block_discard_disable(bool state) return ram_block_uncoordinated_discard_disable(state); } +static int iommufd_cdev_get_info_iova_range(VFIOIOMMUFDContainer *container, + uint32_t ioas_id, Error **errp) +{ + VFIOContainerBase *bcontainer = &container->bcontainer; + struct iommu_ioas_iova_ranges *info; + struct iommu_iova_range *iova_ranges; + int ret, sz, fd = container->be->fd; + + info = g_malloc0(sizeof(*info)); + info->size = sizeof(*info); + info->ioas_id = ioas_id; + + ret = ioctl(fd, IOMMU_IOAS_IOVA_RANGES, info); + if (ret && errno != EMSGSIZE) { + goto error; + } + + sz = info->num_iovas * sizeof(struct iommu_iova_range); + info = g_realloc(info, sizeof(*info) + sz); + info->allowed_iovas = (uintptr_t)(info + 1); + + ret = ioctl(fd, IOMMU_IOAS_IOVA_RANGES, info); + if (ret) { + goto error; + } + + iova_ranges = (struct iommu_iova_range *)(uintptr_t)info->allowed_iovas; + + for (int i = 0; i < info->num_iovas; i++) { + Range *range = g_new(Range, 1); + + range_set_bounds(range, iova_ranges[i].start, iova_ranges[i].last); + bcontainer->iova_ranges = + range_list_insert(bcontainer->iova_ranges, range); + } + bcontainer->pgsizes = info->out_iova_alignment; + + g_free(info); + return 0; + +error: + ret = -errno; + g_free(info); + error_setg_errno(errp, errno, "Cannot get IOVA ranges"); + return ret; +} + static int iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, AddressSpace *as, Error **errp) { @@ -335,7 +382,14 @@ static int iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, goto err_discard_disable; } - bcontainer->pgsizes = qemu_real_host_page_size(); + ret = iommufd_cdev_get_info_iova_range(container, ioas_id, &err); + if (ret) { + error_append_hint(&err, + "Fallback to default 64bit IOVA range and 4K page size\n"); + warn_report_err(err); + err = NULL; + bcontainer->pgsizes = qemu_real_host_page_size(); + } bcontainer->listener = vfio_memory_listener; memory_listener_register(&bcontainer->listener, bcontainer->space->as); -- Gitee From 0b0701478649baccf3945051822f993619bce01e Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:43 +0800 Subject: [PATCH 634/939] vfio/pci: Extract out a helper vfio_pci_get_pci_hot_reset_info MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This helper will be used by both legacy and iommufd backends. No functional changes intended. 
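The intended call pattern for the extracted helper, as a fragment using the names added in the pci.c/pci.h hunks below (the caller owns the returned buffer and must free it):

    struct vfio_pci_hot_reset_info *info = NULL;

    if (!vfio_pci_get_pci_hot_reset_info(vdev, &info)) {
        /* inspect info->count entries in info->devices[] ... */
        g_free(info);
    }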
Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Reviewed-by: Eric Auger Tested-by: Eric Auger Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/pci.c | 54 +++++++++++++++++++++++++++++++++++---------------- hw/vfio/pci.h | 3 +++ 2 files changed, 40 insertions(+), 17 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index c62c02f7b6..eb55e8ae88 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -2445,22 +2445,13 @@ static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name) return (strcmp(tmp, name) == 0); } -static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) +int vfio_pci_get_pci_hot_reset_info(VFIOPCIDevice *vdev, + struct vfio_pci_hot_reset_info **info_p) { - VFIOGroup *group; struct vfio_pci_hot_reset_info *info; - struct vfio_pci_dependent_device *devices; - struct vfio_pci_hot_reset *reset; - int32_t *fds; - int ret, i, count; - bool multi = false; + int ret, count; - trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi"); - - if (!single) { - vfio_pci_pre_reset(vdev); - } - vdev->vbasedev.needs_reset = false; + assert(info_p && !*info_p); info = g_malloc0(sizeof(*info)); info->argsz = sizeof(*info); @@ -2468,24 +2459,53 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info); if (ret && errno != ENOSPC) { ret = -errno; + g_free(info); if (!vdev->has_pm_reset) { error_report("vfio: Cannot reset device %s, " "no available reset mechanism.", vdev->vbasedev.name); } - goto out_single; + return ret; } count = info->count; - info = g_realloc(info, sizeof(*info) + (count * sizeof(*devices))); - info->argsz = sizeof(*info) + (count * sizeof(*devices)); - devices = &info->devices[0]; + info = g_realloc(info, sizeof(*info) + (count * sizeof(info->devices[0]))); + info->argsz = sizeof(*info) + (count * sizeof(info->devices[0])); ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info); if (ret) { ret = -errno; + g_free(info); error_report("vfio: hot reset info failed: %m"); + return ret; + } + + *info_p = info; + return 0; +} + +static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) +{ + VFIOGroup *group; + struct vfio_pci_hot_reset_info *info = NULL; + struct vfio_pci_dependent_device *devices; + struct vfio_pci_hot_reset *reset; + int32_t *fds; + int ret, i, count; + bool multi = false; + + trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? 
"one" : "multi"); + + if (!single) { + vfio_pci_pre_reset(vdev); + } + vdev->vbasedev.needs_reset = false; + + ret = vfio_pci_get_pci_hot_reset_info(vdev, &info); + + if (ret) { goto out_single; } + devices = &info->devices[0]; trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name); diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index fba8737ab2..1006061afb 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -218,6 +218,9 @@ void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr); extern const PropertyInfo qdev_prop_nv_gpudirect_clique; +int vfio_pci_get_pci_hot_reset_info(VFIOPCIDevice *vdev, + struct vfio_pci_hot_reset_info **info_p); + int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp); int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev, -- Gitee From 32beb7b360416a5f04cebac227ffdf102448d518 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:44 +0800 Subject: [PATCH 635/939] vfio/pci: Introduce a vfio pci hot reset interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Legacy vfio pci and iommufd cdev have different process to hot reset vfio device, expand current code to abstract out pci_hot_reset callback for legacy vfio, this same interface will also be used by iommufd cdev vfio device. Rename vfio_pci_hot_reset to vfio_legacy_pci_hot_reset and move it into container.c. vfio_pci_[pre/post]_reset and vfio_pci_host_match are exported so they could be called in legacy and iommufd pci_hot_reset callback. Suggested-by: Cédric Le Goater Signed-off-by: Zhenzhong Duan Reviewed-by: Eric Auger Tested-by: Eric Auger Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/container.c | 170 ++++++++++++++++++++++++++ hw/vfio/pci.c | 168 +------------------------ hw/vfio/pci.h | 3 + include/hw/vfio/vfio-container-base.h | 3 + 4 files changed, 182 insertions(+), 162 deletions(-) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 4936b8f27f..e32e1b51e0 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -35,6 +35,7 @@ #include "qapi/error.h" #include "migration/migration.h" #include "sysemu/kvm.h" +#include "pci.h" VFIOGroupList vfio_group_list = QLIST_HEAD_INITIALIZER(vfio_group_list); @@ -1035,6 +1036,174 @@ static void vfio_legacy_detach_device(VFIODevice *vbasedev) vfio_put_group(group); } +static int vfio_legacy_pci_hot_reset(VFIODevice *vbasedev, bool single) +{ + VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); + VFIOGroup *group; + struct vfio_pci_hot_reset_info *info = NULL; + struct vfio_pci_dependent_device *devices; + struct vfio_pci_hot_reset *reset; + int32_t *fds; + int ret, i, count; + bool multi = false; + + trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? 
"one" : "multi"); + + if (!single) { + vfio_pci_pre_reset(vdev); + } + vdev->vbasedev.needs_reset = false; + + ret = vfio_pci_get_pci_hot_reset_info(vdev, &info); + + if (ret) { + goto out_single; + } + devices = &info->devices[0]; + + trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name); + + /* Verify that we have all the groups required */ + for (i = 0; i < info->count; i++) { + PCIHostDeviceAddress host; + VFIOPCIDevice *tmp; + VFIODevice *vbasedev_iter; + + host.domain = devices[i].segment; + host.bus = devices[i].bus; + host.slot = PCI_SLOT(devices[i].devfn); + host.function = PCI_FUNC(devices[i].devfn); + + trace_vfio_pci_hot_reset_dep_devices(host.domain, + host.bus, host.slot, host.function, devices[i].group_id); + + if (vfio_pci_host_match(&host, vdev->vbasedev.name)) { + continue; + } + + QLIST_FOREACH(group, &vfio_group_list, next) { + if (group->groupid == devices[i].group_id) { + break; + } + } + + if (!group) { + if (!vdev->has_pm_reset) { + error_report("vfio: Cannot reset device %s, " + "depends on group %d which is not owned.", + vdev->vbasedev.name, devices[i].group_id); + } + ret = -EPERM; + goto out; + } + + /* Prep dependent devices for reset and clear our marker. */ + QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { + if (!vbasedev_iter->dev->realized || + vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) { + continue; + } + tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev); + if (vfio_pci_host_match(&host, tmp->vbasedev.name)) { + if (single) { + ret = -EINVAL; + goto out_single; + } + vfio_pci_pre_reset(tmp); + tmp->vbasedev.needs_reset = false; + multi = true; + break; + } + } + } + + if (!single && !multi) { + ret = -EINVAL; + goto out_single; + } + + /* Determine how many group fds need to be passed */ + count = 0; + QLIST_FOREACH(group, &vfio_group_list, next) { + for (i = 0; i < info->count; i++) { + if (group->groupid == devices[i].group_id) { + count++; + break; + } + } + } + + reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds))); + reset->argsz = sizeof(*reset) + (count * sizeof(*fds)); + fds = &reset->group_fds[0]; + + /* Fill in group fds */ + QLIST_FOREACH(group, &vfio_group_list, next) { + for (i = 0; i < info->count; i++) { + if (group->groupid == devices[i].group_id) { + fds[reset->count++] = group->fd; + break; + } + } + } + + /* Bus reset! */ + ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset); + g_free(reset); + if (ret) { + ret = -errno; + } + + trace_vfio_pci_hot_reset_result(vdev->vbasedev.name, + ret ? 
strerror(errno) : "Success"); + +out: + /* Re-enable INTx on affected devices */ + for (i = 0; i < info->count; i++) { + PCIHostDeviceAddress host; + VFIOPCIDevice *tmp; + VFIODevice *vbasedev_iter; + + host.domain = devices[i].segment; + host.bus = devices[i].bus; + host.slot = PCI_SLOT(devices[i].devfn); + host.function = PCI_FUNC(devices[i].devfn); + + if (vfio_pci_host_match(&host, vdev->vbasedev.name)) { + continue; + } + + QLIST_FOREACH(group, &vfio_group_list, next) { + if (group->groupid == devices[i].group_id) { + break; + } + } + + if (!group) { + break; + } + + QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { + if (!vbasedev_iter->dev->realized || + vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) { + continue; + } + tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev); + if (vfio_pci_host_match(&host, tmp->vbasedev.name)) { + vfio_pci_post_reset(tmp); + break; + } + } + } +out_single: + if (!single) { + vfio_pci_post_reset(vdev); + } + g_free(info); + + return ret; +} + const VFIOIOMMUOps vfio_legacy_ops = { .dma_map = vfio_legacy_dma_map, .dma_unmap = vfio_legacy_dma_unmap, @@ -1042,4 +1211,5 @@ const VFIOIOMMUOps vfio_legacy_ops = { .detach_device = vfio_legacy_detach_device, .set_dirty_page_tracking = vfio_legacy_set_dirty_page_tracking, .query_dirty_bitmap = vfio_legacy_query_dirty_bitmap, + .pci_hot_reset = vfio_legacy_pci_hot_reset, }; diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index eb55e8ae88..d00c3472c7 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -2374,7 +2374,7 @@ static int vfio_add_capabilities(VFIOPCIDevice *vdev, Error **errp) return 0; } -static void vfio_pci_pre_reset(VFIOPCIDevice *vdev) +void vfio_pci_pre_reset(VFIOPCIDevice *vdev) { PCIDevice *pdev = &vdev->pdev; uint16_t cmd; @@ -2411,7 +2411,7 @@ static void vfio_pci_pre_reset(VFIOPCIDevice *vdev) vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2); } -static void vfio_pci_post_reset(VFIOPCIDevice *vdev) +void vfio_pci_post_reset(VFIOPCIDevice *vdev) { Error *err = NULL; int nr; @@ -2435,7 +2435,7 @@ static void vfio_pci_post_reset(VFIOPCIDevice *vdev) vfio_quirk_reset(vdev); } -static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name) +bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name) { char tmp[13]; @@ -2485,166 +2485,10 @@ int vfio_pci_get_pci_hot_reset_info(VFIOPCIDevice *vdev, static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) { - VFIOGroup *group; - struct vfio_pci_hot_reset_info *info = NULL; - struct vfio_pci_dependent_device *devices; - struct vfio_pci_hot_reset *reset; - int32_t *fds; - int ret, i, count; - bool multi = false; - - trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? 
"one" : "multi"); - - if (!single) { - vfio_pci_pre_reset(vdev); - } - vdev->vbasedev.needs_reset = false; - - ret = vfio_pci_get_pci_hot_reset_info(vdev, &info); - - if (ret) { - goto out_single; - } - devices = &info->devices[0]; - - trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name); - - /* Verify that we have all the groups required */ - for (i = 0; i < info->count; i++) { - PCIHostDeviceAddress host; - VFIOPCIDevice *tmp; - VFIODevice *vbasedev_iter; - - host.domain = devices[i].segment; - host.bus = devices[i].bus; - host.slot = PCI_SLOT(devices[i].devfn); - host.function = PCI_FUNC(devices[i].devfn); - - trace_vfio_pci_hot_reset_dep_devices(host.domain, - host.bus, host.slot, host.function, devices[i].group_id); - - if (vfio_pci_host_match(&host, vdev->vbasedev.name)) { - continue; - } - - QLIST_FOREACH(group, &vfio_group_list, next) { - if (group->groupid == devices[i].group_id) { - break; - } - } - - if (!group) { - if (!vdev->has_pm_reset) { - error_report("vfio: Cannot reset device %s, " - "depends on group %d which is not owned.", - vdev->vbasedev.name, devices[i].group_id); - } - ret = -EPERM; - goto out; - } - - /* Prep dependent devices for reset and clear our marker. */ - QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { - if (!vbasedev_iter->dev->realized || - vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) { - continue; - } - tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev); - if (vfio_pci_host_match(&host, tmp->vbasedev.name)) { - if (single) { - ret = -EINVAL; - goto out_single; - } - vfio_pci_pre_reset(tmp); - tmp->vbasedev.needs_reset = false; - multi = true; - break; - } - } - } - - if (!single && !multi) { - ret = -EINVAL; - goto out_single; - } - - /* Determine how many group fds need to be passed */ - count = 0; - QLIST_FOREACH(group, &vfio_group_list, next) { - for (i = 0; i < info->count; i++) { - if (group->groupid == devices[i].group_id) { - count++; - break; - } - } - } - - reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds))); - reset->argsz = sizeof(*reset) + (count * sizeof(*fds)); - fds = &reset->group_fds[0]; - - /* Fill in group fds */ - QLIST_FOREACH(group, &vfio_group_list, next) { - for (i = 0; i < info->count; i++) { - if (group->groupid == devices[i].group_id) { - fds[reset->count++] = group->fd; - break; - } - } - } - - /* Bus reset! */ - ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset); - g_free(reset); - - trace_vfio_pci_hot_reset_result(vdev->vbasedev.name, - ret ? 
strerror(errno) : "Success"); - -out: - /* Re-enable INTx on affected devices */ - for (i = 0; i < info->count; i++) { - PCIHostDeviceAddress host; - VFIOPCIDevice *tmp; - VFIODevice *vbasedev_iter; - - host.domain = devices[i].segment; - host.bus = devices[i].bus; - host.slot = PCI_SLOT(devices[i].devfn); - host.function = PCI_FUNC(devices[i].devfn); - - if (vfio_pci_host_match(&host, vdev->vbasedev.name)) { - continue; - } - - QLIST_FOREACH(group, &vfio_group_list, next) { - if (group->groupid == devices[i].group_id) { - break; - } - } - - if (!group) { - break; - } - - QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { - if (!vbasedev_iter->dev->realized || - vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) { - continue; - } - tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev); - if (vfio_pci_host_match(&host, tmp->vbasedev.name)) { - vfio_pci_post_reset(tmp); - break; - } - } - } -out_single: - if (!single) { - vfio_pci_post_reset(vdev); - } - g_free(info); + VFIODevice *vbasedev = &vdev->vbasedev; + const VFIOIOMMUOps *ops = vbasedev->bcontainer->ops; - return ret; + return ops->pci_hot_reset(vbasedev, single); } /* diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index 1006061afb..6e64a2654e 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -218,6 +218,9 @@ void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr); extern const PropertyInfo qdev_prop_nv_gpudirect_clique; +void vfio_pci_pre_reset(VFIOPCIDevice *vdev); +void vfio_pci_post_reset(VFIOPCIDevice *vdev); +bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name); int vfio_pci_get_pci_hot_reset_info(VFIOPCIDevice *vdev, struct vfio_pci_hot_reset_info **info_p); diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index 4b6f017c6f..45bb19c767 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -106,6 +106,9 @@ struct VFIOIOMMUOps { int (*set_dirty_page_tracking)(VFIOContainerBase *bcontainer, bool start); int (*query_dirty_bitmap)(VFIOContainerBase *bcontainer, VFIOBitmap *vbmap, hwaddr iova, hwaddr size); + /* PCI specific */ + int (*pci_hot_reset)(VFIODevice *vbasedev, bool single); + /* SPAPR specific */ int (*add_window)(VFIOContainerBase *bcontainer, MemoryRegionSection *section, -- Gitee From de17750e24d4e583e9f392bbe47e4bd1aa81d6bc Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:45 +0800 Subject: [PATCH 636/939] vfio/iommufd: Enable pci hot reset through iommufd cdev interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement the newly introduced pci_hot_reset callback named iommufd_cdev_pci_hot_reset to do iommufd specific check and reset operation. 
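The key difference from the legacy path is visible in the reset request itself: ownership is already proven by the iommufd bind and the per-device IDs in the hot-reset info, so no group file descriptors are appended. Restated from the iommufd.c hunk below:

    struct vfio_pci_hot_reset *reset = g_malloc0(sizeof(*reset));

    reset->argsz = sizeof(*reset);          /* zero-length group-fd array */
    if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset)) {
        /* errno holds the failure reason */
    }
    g_free(reset);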
Signed-off-by: Zhenzhong Duan Reviewed-by: Eric Auger Tested-by: Eric Auger Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/iommufd.c | 150 +++++++++++++++++++++++++++++++++++++++++++ hw/vfio/trace-events | 1 + 2 files changed, 151 insertions(+) diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 01b448e840..6e53e013ef 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -24,6 +24,7 @@ #include "sysemu/reset.h" #include "qemu/cutils.h" #include "qemu/chardev_open.h" +#include "pci.h" static int iommufd_cdev_map(VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, void *vaddr, bool readonly) @@ -468,9 +469,158 @@ static void iommufd_cdev_detach(VFIODevice *vbasedev) close(vbasedev->fd); } +static VFIODevice *iommufd_cdev_pci_find_by_devid(__u32 devid) +{ + VFIODevice *vbasedev_iter; + + QLIST_FOREACH(vbasedev_iter, &vfio_device_list, global_next) { + if (vbasedev_iter->bcontainer->ops != &vfio_iommufd_ops) { + continue; + } + if (devid == vbasedev_iter->devid) { + return vbasedev_iter; + } + } + return NULL; +} + +static VFIOPCIDevice * +iommufd_cdev_dep_get_realized_vpdev(struct vfio_pci_dependent_device *dep_dev, + VFIODevice *reset_dev) +{ + VFIODevice *vbasedev_tmp; + + if (dep_dev->devid == reset_dev->devid || + dep_dev->devid == VFIO_PCI_DEVID_OWNED) { + return NULL; + } + + vbasedev_tmp = iommufd_cdev_pci_find_by_devid(dep_dev->devid); + if (!vbasedev_tmp || !vbasedev_tmp->dev->realized || + vbasedev_tmp->type != VFIO_DEVICE_TYPE_PCI) { + return NULL; + } + + return container_of(vbasedev_tmp, VFIOPCIDevice, vbasedev); +} + +static int iommufd_cdev_pci_hot_reset(VFIODevice *vbasedev, bool single) +{ + VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); + struct vfio_pci_hot_reset_info *info = NULL; + struct vfio_pci_dependent_device *devices; + struct vfio_pci_hot_reset *reset; + int ret, i; + bool multi = false; + + trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi"); + + if (!single) { + vfio_pci_pre_reset(vdev); + } + vdev->vbasedev.needs_reset = false; + + ret = vfio_pci_get_pci_hot_reset_info(vdev, &info); + + if (ret) { + goto out_single; + } + + assert(info->flags & VFIO_PCI_HOT_RESET_FLAG_DEV_ID); + + devices = &info->devices[0]; + + if (!(info->flags & VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED)) { + if (!vdev->has_pm_reset) { + for (i = 0; i < info->count; i++) { + if (devices[i].devid == VFIO_PCI_DEVID_NOT_OWNED) { + error_report("vfio: Cannot reset device %s, " + "depends on device %04x:%02x:%02x.%x " + "which is not owned.", + vdev->vbasedev.name, devices[i].segment, + devices[i].bus, PCI_SLOT(devices[i].devfn), + PCI_FUNC(devices[i].devfn)); + } + } + } + ret = -EPERM; + goto out_single; + } + + trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name); + + for (i = 0; i < info->count; i++) { + VFIOPCIDevice *tmp; + + trace_iommufd_cdev_pci_hot_reset_dep_devices(devices[i].segment, + devices[i].bus, + PCI_SLOT(devices[i].devfn), + PCI_FUNC(devices[i].devfn), + devices[i].devid); + + /* + * If a VFIO cdev device is resettable, all the dependent devices + * are either bound to same iommufd or within same iommu_groups as + * one of the iommufd bound devices. 
+ */ + assert(devices[i].devid != VFIO_PCI_DEVID_NOT_OWNED); + + tmp = iommufd_cdev_dep_get_realized_vpdev(&devices[i], &vdev->vbasedev); + if (!tmp) { + continue; + } + + if (single) { + ret = -EINVAL; + goto out_single; + } + vfio_pci_pre_reset(tmp); + tmp->vbasedev.needs_reset = false; + multi = true; + } + + if (!single && !multi) { + ret = -EINVAL; + goto out_single; + } + + /* Use zero length array for hot reset with iommufd backend */ + reset = g_malloc0(sizeof(*reset)); + reset->argsz = sizeof(*reset); + + /* Bus reset! */ + ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset); + g_free(reset); + if (ret) { + ret = -errno; + } + + trace_vfio_pci_hot_reset_result(vdev->vbasedev.name, + ret ? strerror(errno) : "Success"); + + /* Re-enable INTx on affected devices */ + for (i = 0; i < info->count; i++) { + VFIOPCIDevice *tmp; + + tmp = iommufd_cdev_dep_get_realized_vpdev(&devices[i], &vdev->vbasedev); + if (!tmp) { + continue; + } + vfio_pci_post_reset(tmp); + } +out_single: + if (!single) { + vfio_pci_post_reset(vdev); + } + g_free(info); + + return ret; +} + const VFIOIOMMUOps vfio_iommufd_ops = { .dma_map = iommufd_cdev_map, .dma_unmap = iommufd_cdev_unmap, .attach_device = iommufd_cdev_attach, .detach_device = iommufd_cdev_detach, + .pci_hot_reset = iommufd_cdev_pci_hot_reset, }; diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index 3340c93af0..8fdde54456 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -174,3 +174,4 @@ iommufd_cdev_detach_ioas_hwpt(int iommufd, const char *name) " [iommufd=%d] Succ iommufd_cdev_fail_attach_existing_container(const char *msg) " %s" iommufd_cdev_alloc_ioas(int iommufd, int ioas_id) " [iommufd=%d] new IOMMUFD container with ioasid=%d" iommufd_cdev_device_info(char *name, int devfd, int num_irqs, int num_regions, int flags) " %s (%d) num_irqs=%d num_regions=%d flags=%d" +iommufd_cdev_pci_hot_reset_dep_devices(int domain, int bus, int slot, int function, int dev_id) "\t%04x:%02x:%02x.%x devid %d" -- Gitee From 6576af91f2621c24de4a8bbfa2c6681a16a5d043 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Sat, 11 Jan 2025 10:52:46 +0800 Subject: [PATCH 637/939] vfio/pci: Allow the selection of a given iommu backend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now we support two types of iommu backends, let's add the capability to select one of them. This depends on whether an iommufd object has been linked with the vfio-pci device: If the user wants to use the legacy backend, it shall not link the vfio-pci device with any iommufd object: -device vfio-pci,host=0000:02:00.0 This is called the legacy mode/backend. 
If the user wants to use the iommufd backend (/dev/iommu) it shall pass an iommufd object id in the vfio-pci device options: -object iommufd,id=iommufd0 -device vfio-pci,host=0000:02:00.0,iommufd=iommufd0 Suggested-by: Alex Williamson Signed-off-by: Eric Auger Signed-off-by: Yi Liu Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Tested-by: Eric Auger Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/pci.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index d00c3472c7..c5984b0598 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -19,6 +19,7 @@ */ #include "qemu/osdep.h" +#include CONFIG_DEVICES /* CONFIG_IOMMUFD */ #include #include @@ -42,6 +43,7 @@ #include "qapi/error.h" #include "migration/blocker.h" #include "migration/qemu-file.h" +#include "sysemu/iommufd.h" #define TYPE_VFIO_PCI_NOHOTPLUG "vfio-pci-nohotplug" @@ -3386,6 +3388,10 @@ static Property vfio_pci_dev_properties[] = { * DEFINE_PROP_STRING("vfiofd", VFIOPCIDevice, vfiofd_name), * DEFINE_PROP_STRING("vfiogroupfd, VFIOPCIDevice, vfiogroupfd_name), */ +#ifdef CONFIG_IOMMUFD + DEFINE_PROP_LINK("iommufd", VFIOPCIDevice, vbasedev.iommufd, + TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *), +#endif DEFINE_PROP_END_OF_LIST(), }; -- Gitee From 008d4e37fe67c7f81920efe862352c4b1f3cd1b0 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:47 +0800 Subject: [PATCH 638/939] vfio/pci: Make vfio cdev pre-openable by passing a file handle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This gives management tools like libvirt a chance to open the vfio cdev with privilege and pass FD to qemu. This way qemu never needs to have privilege to open a VFIO or iommu cdev node. Together with the earlier support of pre-opening /dev/iommu device, now we have full support of passing a vfio device to unprivileged qemu by management tool. This mode is no more considered for the legacy backend. So let's remove the "TODO" comment. Add helper functions vfio_device_set_fd() and vfio_device_get_name() to set fd and get device name, they will also be used by other vfio devices. There is no easy way to check if a device is mdev with FD passing, so fail the x-balloon-allowed check unconditionally in this case. There is also no easy way to get BDF as name with FD passing, so we fake a name by VFIO_FD[fd]. 
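For illustration only, a minimal sketch of the privileged management-tool side; the device node path is an assumption, and how the descriptor reaches QEMU (inherited across exec, or sent with SCM_RIGHTS and registered via the monitor) is outside this patch:

    #include <fcntl.h>
    #include <stdio.h>

    int main(void)
    {
        /* hypothetical cdev node; opening it requires privilege */
        int fd = open("/dev/vfio/devices/vfio0", O_RDWR);

        if (fd < 0) {
            perror("open vfio cdev");
            return 1;
        }
        /* An unprivileged QEMU can then consume it, e.g.:
         *   -object iommufd,id=iommufd0
         *   -device vfio-pci,iommufd=iommufd0,fd=<passed fd>
         */
        printf("fd to pass to QEMU: %d\n", fd);
        return 0;
    }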
Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Tested-by: Eric Auger Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/helpers.c | 43 +++++++++++++++++++++++++++++++++++ hw/vfio/iommufd.c | 12 ++++++---- hw/vfio/pci.c | 28 +++++++++++++---------- include/hw/vfio/vfio-common.h | 4 ++++ 4 files changed, 71 insertions(+), 16 deletions(-) diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c index 168847e7c5..3592c3d54e 100644 --- a/hw/vfio/helpers.c +++ b/hw/vfio/helpers.c @@ -27,6 +27,7 @@ #include "trace.h" #include "qapi/error.h" #include "qemu/error-report.h" +#include "monitor/monitor.h" /* * Common VFIO interrupt disable @@ -609,3 +610,45 @@ bool vfio_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type) return ret; } + +int vfio_device_get_name(VFIODevice *vbasedev, Error **errp) +{ + struct stat st; + + if (vbasedev->fd < 0) { + if (stat(vbasedev->sysfsdev, &st) < 0) { + error_setg_errno(errp, errno, "no such host device"); + error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->sysfsdev); + return -errno; + } + /* User may specify a name, e.g: VFIO platform device */ + if (!vbasedev->name) { + vbasedev->name = g_path_get_basename(vbasedev->sysfsdev); + } + } else { + if (!vbasedev->iommufd) { + error_setg(errp, "Use FD passing only with iommufd backend"); + return -EINVAL; + } + /* + * Give a name with fd so any function printing out vbasedev->name + * will not break. + */ + if (!vbasedev->name) { + vbasedev->name = g_strdup_printf("VFIO_FD%d", vbasedev->fd); + } + } + + return 0; +} + +void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp) +{ + int fd = monitor_fd_param(monitor_cur(), str, errp); + + if (fd < 0) { + error_prepend(errp, "Could not parse remote object fd %s:", str); + return; + } + vbasedev->fd = fd; +} diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 6e53e013ef..5accd26484 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -320,11 +320,15 @@ static int iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, uint32_t ioas_id; Error *err = NULL; - devfd = iommufd_cdev_getfd(vbasedev->sysfsdev, errp); - if (devfd < 0) { - return devfd; + if (vbasedev->fd < 0) { + devfd = iommufd_cdev_getfd(vbasedev->sysfsdev, errp); + if (devfd < 0) { + return devfd; + } + vbasedev->fd = devfd; + } else { + devfd = vbasedev->fd; } - vbasedev->fd = devfd; ret = iommufd_cdev_connect_and_bind(vbasedev, errp); if (ret) { diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index c5984b0598..445d58c8e5 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -2944,17 +2944,19 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) VFIODevice *vbasedev = &vdev->vbasedev; char *tmp, *subsys; Error *err = NULL; - struct stat st; int i, ret; bool is_mdev; char uuid[UUID_STR_LEN]; char *name; - if (!vbasedev->sysfsdev) { + if (vbasedev->fd < 0 && !vbasedev->sysfsdev) { if (!(~vdev->host.domain || ~vdev->host.bus || ~vdev->host.slot || ~vdev->host.function)) { error_setg(errp, "No provided host device"); error_append_hint(errp, "Use -device vfio-pci,host=DDDD:BB:DD.F " +#ifdef CONFIG_IOMMUFD + "or -device vfio-pci,fd=DEVICE_FD " +#endif "or -device vfio-pci,sysfsdev=PATH_TO_DEVICE\n"); return; } @@ -2964,13 +2966,9 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) vdev->host.slot, vdev->host.function); } - if (stat(vbasedev->sysfsdev, &st) < 0) { - error_setg_errno(errp, errno, "no such host device"); - error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->sysfsdev); + if (vfio_device_get_name(vbasedev, errp) 
< 0) { return; } - - vbasedev->name = g_path_get_basename(vbasedev->sysfsdev); vbasedev->ops = &vfio_pci_ops; vbasedev->type = VFIO_DEVICE_TYPE_PCI; vbasedev->dev = DEVICE(vdev); @@ -3330,6 +3328,7 @@ static void vfio_instance_init(Object *obj) vdev->host.bus = ~0U; vdev->host.slot = ~0U; vdev->host.function = ~0U; + vdev->vbasedev.fd = -1; vdev->nv_gpudirect_clique = 0xFF; @@ -3383,11 +3382,6 @@ static Property vfio_pci_dev_properties[] = { qdev_prop_nv_gpudirect_clique, uint8_t), DEFINE_PROP_OFF_AUTO_PCIBAR("x-msix-relocation", VFIOPCIDevice, msix_relo, OFF_AUTOPCIBAR_OFF), - /* - * TODO - support passed fds... is this necessary? - * DEFINE_PROP_STRING("vfiofd", VFIOPCIDevice, vfiofd_name), - * DEFINE_PROP_STRING("vfiogroupfd, VFIOPCIDevice, vfiogroupfd_name), - */ #ifdef CONFIG_IOMMUFD DEFINE_PROP_LINK("iommufd", VFIOPCIDevice, vbasedev.iommufd, TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *), @@ -3395,6 +3389,13 @@ static Property vfio_pci_dev_properties[] = { DEFINE_PROP_END_OF_LIST(), }; +#ifdef CONFIG_IOMMUFD +static void vfio_pci_set_fd(Object *obj, const char *str, Error **errp) +{ + vfio_device_set_fd(&VFIO_PCI(obj)->vbasedev, str, errp); +} +#endif + static void vfio_pci_dev_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); @@ -3402,6 +3403,9 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, void *data) dc->reset = vfio_pci_reset; device_class_set_props(dc, vfio_pci_dev_properties); +#ifdef CONFIG_IOMMUFD + object_class_property_add_str(klass, "fd", NULL, vfio_pci_set_fd); +#endif dc->desc = "VFIO-based PCI device assignment"; set_bit(DEVICE_CATEGORY_MISC, dc->categories); pdc->realize = vfio_realize; diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 9b9fd7b461..5f35f2900b 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -265,4 +265,8 @@ int vfio_devices_query_dirty_bitmap(VFIOContainerBase *bcontainer, hwaddr size); int vfio_get_dirty_bitmap(VFIOContainerBase *bcontainer, uint64_t iova, uint64_t size, ram_addr_t ram_addr); + +/* Returns 0 on success, or a negative errno. */ +int vfio_device_get_name(VFIODevice *vbasedev, Error **errp); +void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp); #endif /* HW_VFIO_VFIO_COMMON_H */ -- Gitee From 1bbc795190c3ad7c838dc57a6f7a38a779dfdd65 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:48 +0800 Subject: [PATCH 639/939] vfio/platform: Allow the selection of a given iommu backend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now we support two types of iommu backends, let's add the capability to select one of them. This depends on whether an iommufd object has been linked with the vfio-platform device: If the user wants to use the legacy backend, it shall not link the vfio-platform device with any iommufd object: -device vfio-platform,host=XXX This is called the legacy mode/backend. 
If the user wants to use the iommufd backend (/dev/iommu) it shall pass an iommufd object id in the vfio-platform device options: -object iommufd,id=iommufd0 -device vfio-platform,host=XXX,iommufd=iommufd0 Suggested-by: Alex Williamson Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Reviewed-by: Eric Auger Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/platform.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c index 8e3d4ac458..98ae4bc655 100644 --- a/hw/vfio/platform.c +++ b/hw/vfio/platform.c @@ -15,11 +15,13 @@ */ #include "qemu/osdep.h" +#include CONFIG_DEVICES /* CONFIG_IOMMUFD */ #include "qapi/error.h" #include #include #include "hw/vfio/vfio-platform.h" +#include "sysemu/iommufd.h" #include "migration/vmstate.h" #include "qemu/error-report.h" #include "qemu/lockable.h" @@ -649,6 +651,10 @@ static Property vfio_platform_dev_properties[] = { DEFINE_PROP_UINT32("mmap-timeout-ms", VFIOPlatformDevice, mmap_timeout, 1100), DEFINE_PROP_BOOL("x-irqfd", VFIOPlatformDevice, irqfd_allowed, true), +#ifdef CONFIG_IOMMUFD + DEFINE_PROP_LINK("iommufd", VFIOPlatformDevice, vbasedev.iommufd, + TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *), +#endif DEFINE_PROP_END_OF_LIST(), }; -- Gitee From 9a12f3f754fcebe86fe2346e62cd25d8a2d06a89 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:49 +0800 Subject: [PATCH 640/939] vfio/platform: Make vfio cdev pre-openable by passing a file handle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This gives management tools like libvirt a chance to open the vfio cdev with privilege and pass FD to qemu. This way qemu never needs to have privilege to open a VFIO or iommu cdev node. 
Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/platform.c | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c index 98ae4bc655..a97d9c6234 100644 --- a/hw/vfio/platform.c +++ b/hw/vfio/platform.c @@ -531,14 +531,13 @@ static VFIODeviceOps vfio_platform_ops = { */ static int vfio_base_device_init(VFIODevice *vbasedev, Error **errp) { - struct stat st; int ret; - /* @sysfsdev takes precedence over @host */ - if (vbasedev->sysfsdev) { + /* @fd takes precedence over @sysfsdev which takes precedence over @host */ + if (vbasedev->fd < 0 && vbasedev->sysfsdev) { g_free(vbasedev->name); vbasedev->name = g_path_get_basename(vbasedev->sysfsdev); - } else { + } else if (vbasedev->fd < 0) { if (!vbasedev->name || strchr(vbasedev->name, '/')) { error_setg(errp, "wrong host device name"); return -EINVAL; @@ -548,10 +547,9 @@ static int vfio_base_device_init(VFIODevice *vbasedev, Error **errp) vbasedev->name); } - if (stat(vbasedev->sysfsdev, &st) < 0) { - error_setg_errno(errp, errno, - "failed to get the sysfs host device file status"); - return -errno; + ret = vfio_device_get_name(vbasedev, errp); + if (ret) { + return ret; } ret = vfio_attach_device(vbasedev->name, vbasedev, @@ -658,6 +656,20 @@ static Property vfio_platform_dev_properties[] = { DEFINE_PROP_END_OF_LIST(), }; +static void vfio_platform_instance_init(Object *obj) +{ + VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(obj); + + vdev->vbasedev.fd = -1; +} + +#ifdef CONFIG_IOMMUFD +static void vfio_platform_set_fd(Object *obj, const char *str, Error **errp) +{ + vfio_device_set_fd(&VFIO_PLATFORM_DEVICE(obj)->vbasedev, str, errp); +} +#endif + static void vfio_platform_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); @@ -665,6 +677,9 @@ static void vfio_platform_class_init(ObjectClass *klass, void *data) dc->realize = vfio_platform_realize; device_class_set_props(dc, vfio_platform_dev_properties); +#ifdef CONFIG_IOMMUFD + object_class_property_add_str(klass, "fd", NULL, vfio_platform_set_fd); +#endif dc->vmsd = &vfio_platform_vmstate; dc->desc = "VFIO-based platform device assignment"; sbc->connect_irq_notifier = vfio_start_irqfd_injection; @@ -677,6 +692,7 @@ static const TypeInfo vfio_platform_dev_info = { .name = TYPE_VFIO_PLATFORM, .parent = TYPE_SYS_BUS_DEVICE, .instance_size = sizeof(VFIOPlatformDevice), + .instance_init = vfio_platform_instance_init, .class_init = vfio_platform_class_init, .class_size = sizeof(VFIOPlatformDeviceClass), }; -- Gitee From 6b9f02dbde780118d33abb998bc72ed246f50b6a Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:50 +0800 Subject: [PATCH 641/939] vfio/ap: Allow the selection of a given iommu backend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now we support two types of iommu backends, let's add the capability to select one of them. This depends on whether an iommufd object has been linked with the vfio-ap device: if the user wants to use the legacy backend, it shall not link the vfio-ap device with any iommufd object: -device vfio-ap,sysfsdev=/sys/bus/mdev/devices/XXX This is called the legacy mode/backend. 
If the user wants to use the iommufd backend (/dev/iommu) it shall pass an iommufd object id in the vfio-ap device options: -object iommufd,id=iommufd0 -device vfio-ap,sysfsdev=/sys/bus/mdev/devices/XXX,iommufd=iommufd0 Suggested-by: Alex Williamson Signed-off-by: Zhenzhong Duan Reviewed-by: Matthew Rosato Reviewed-by: Cédric Le Goater Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/ap.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c index bbf69ff55a..80629609ae 100644 --- a/hw/vfio/ap.c +++ b/hw/vfio/ap.c @@ -11,10 +11,12 @@ */ #include "qemu/osdep.h" +#include CONFIG_DEVICES /* CONFIG_IOMMUFD */ #include #include #include "qapi/error.h" #include "hw/vfio/vfio-common.h" +#include "sysemu/iommufd.h" #include "hw/s390x/ap-device.h" #include "qemu/error-report.h" #include "qemu/event_notifier.h" @@ -204,6 +206,10 @@ static void vfio_ap_unrealize(DeviceState *dev) static Property vfio_ap_properties[] = { DEFINE_PROP_STRING("sysfsdev", VFIOAPDevice, vdev.sysfsdev), +#ifdef CONFIG_IOMMUFD + DEFINE_PROP_LINK("iommufd", VFIOAPDevice, vdev.iommufd, + TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *), +#endif DEFINE_PROP_END_OF_LIST(), }; -- Gitee From e4e2a6414eabe80d0d9f57446626c91c55b40afa Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:51 +0800 Subject: [PATCH 642/939] vfio/ap: Make vfio cdev pre-openable by passing a file handle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This gives management tools like libvirt a chance to open the vfio cdev with privilege and pass FD to qemu. This way qemu never needs to have privilege to open a VFIO or iommu cdev node. Signed-off-by: Zhenzhong Duan Reviewed-by: Matthew Rosato Reviewed-by: Cédric Le Goater Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/ap.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c index 80629609ae..f180e4a32a 100644 --- a/hw/vfio/ap.c +++ b/hw/vfio/ap.c @@ -160,7 +160,10 @@ static void vfio_ap_realize(DeviceState *dev, Error **errp) VFIOAPDevice *vapdev = VFIO_AP_DEVICE(dev); VFIODevice *vbasedev = &vapdev->vdev; - vbasedev->name = g_path_get_basename(vbasedev->sysfsdev); + if (vfio_device_get_name(vbasedev, errp) < 0) { + return; + } + vbasedev->ops = &vfio_ap_ops; vbasedev->type = VFIO_DEVICE_TYPE_AP; vbasedev->dev = dev; @@ -230,11 +233,28 @@ static const VMStateDescription vfio_ap_vmstate = { .unmigratable = 1, }; +static void vfio_ap_instance_init(Object *obj) +{ + VFIOAPDevice *vapdev = VFIO_AP_DEVICE(obj); + + vapdev->vdev.fd = -1; +} + +#ifdef CONFIG_IOMMUFD +static void vfio_ap_set_fd(Object *obj, const char *str, Error **errp) +{ + vfio_device_set_fd(&VFIO_AP_DEVICE(obj)->vdev, str, errp); +} +#endif + static void vfio_ap_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); device_class_set_props(dc, vfio_ap_properties); +#ifdef CONFIG_IOMMUFD + object_class_property_add_str(klass, "fd", NULL, vfio_ap_set_fd); +#endif dc->vmsd = &vfio_ap_vmstate; dc->desc = "VFIO-based AP device assignment"; set_bit(DEVICE_CATEGORY_MISC, dc->categories); @@ -249,6 +269,7 @@ static const TypeInfo vfio_ap_info = { .name = TYPE_VFIO_AP_DEVICE, .parent = TYPE_AP_DEVICE, .instance_size = sizeof(VFIOAPDevice), + .instance_init = vfio_ap_instance_init, .class_init = vfio_ap_class_init, }; -- Gitee From 5e743a2f7791f4fb3eea40806ca69f6cce1258c2 Mon Sep 17 00:00:00 
2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:52 +0800 Subject: [PATCH 643/939] vfio/ccw: Allow the selection of a given iommu backend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now we support two types of iommu backends, let's add the capability to select one of them. This depends on whether an iommufd object has been linked with the vfio-ccw device: If the user wants to use the legacy backend, it shall not link the vfio-ccw device with any iommufd object: -device vfio-ccw,sysfsdev=/sys/bus/mdev/devices/XXX This is called the legacy mode/backend. If the user wants to use the iommufd backend (/dev/iommu) it shall pass an iommufd object id in the vfio-ccw device options: -object iommufd,id=iommufd0 -device vfio-ccw,sysfsdev=/sys/bus/mdev/devices/XXX,iommufd=iommufd0 Suggested-by: Alex Williamson Signed-off-by: Zhenzhong Duan Reviewed-by: Matthew Rosato Reviewed-by: Cédric Le Goater Reviewed-by: Eric Farman Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/ccw.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c index d857bb8d0f..d2d58bb677 100644 --- a/hw/vfio/ccw.c +++ b/hw/vfio/ccw.c @@ -15,12 +15,14 @@ */ #include "qemu/osdep.h" +#include CONFIG_DEVICES /* CONFIG_IOMMUFD */ #include #include #include #include "qapi/error.h" #include "hw/vfio/vfio-common.h" +#include "sysemu/iommufd.h" #include "hw/s390x/s390-ccw.h" #include "hw/s390x/vfio-ccw.h" #include "hw/qdev-properties.h" @@ -677,6 +679,10 @@ static void vfio_ccw_unrealize(DeviceState *dev) static Property vfio_ccw_properties[] = { DEFINE_PROP_STRING("sysfsdev", VFIOCCWDevice, vdev.sysfsdev), DEFINE_PROP_BOOL("force-orb-pfch", VFIOCCWDevice, force_orb_pfch, false), +#ifdef CONFIG_IOMMUFD + DEFINE_PROP_LINK("iommufd", VFIOCCWDevice, vdev.iommufd, + TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *), +#endif DEFINE_PROP_END_OF_LIST(), }; -- Gitee From 0f9545907220680ee7e85a823a0e19b216a8b7d9 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:53 +0800 Subject: [PATCH 644/939] vfio/ccw: Make vfio cdev pre-openable by passing a file handle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This gives management tools like libvirt a chance to open the vfio cdev with privilege and pass FD to qemu. This way qemu never needs to have privilege to open a VFIO or iommu cdev node. 
Signed-off-by: Zhenzhong Duan Reviewed-by: Matthew Rosato Reviewed-by: Cédric Le Goater Reviewed-by: Eric Farman Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/ccw.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c index d2d58bb677..2afdf17dbe 100644 --- a/hw/vfio/ccw.c +++ b/hw/vfio/ccw.c @@ -590,11 +590,12 @@ static void vfio_ccw_realize(DeviceState *dev, Error **errp) } } + if (vfio_device_get_name(vbasedev, errp) < 0) { + return; + } + vbasedev->ops = &vfio_ccw_ops; vbasedev->type = VFIO_DEVICE_TYPE_CCW; - vbasedev->name = g_strdup_printf("%x.%x.%04x", vcdev->cdev.hostid.cssid, - vcdev->cdev.hostid.ssid, - vcdev->cdev.hostid.devid); vbasedev->dev = dev; /* @@ -691,12 +692,29 @@ static const VMStateDescription vfio_ccw_vmstate = { .unmigratable = 1, }; +static void vfio_ccw_instance_init(Object *obj) +{ + VFIOCCWDevice *vcdev = VFIO_CCW(obj); + + vcdev->vdev.fd = -1; +} + +#ifdef CONFIG_IOMMUFD +static void vfio_ccw_set_fd(Object *obj, const char *str, Error **errp) +{ + vfio_device_set_fd(&VFIO_CCW(obj)->vdev, str, errp); +} +#endif + static void vfio_ccw_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); S390CCWDeviceClass *cdc = S390_CCW_DEVICE_CLASS(klass); device_class_set_props(dc, vfio_ccw_properties); +#ifdef CONFIG_IOMMUFD + object_class_property_add_str(klass, "fd", NULL, vfio_ccw_set_fd); +#endif dc->vmsd = &vfio_ccw_vmstate; dc->desc = "VFIO-based subchannel assignment"; set_bit(DEVICE_CATEGORY_MISC, dc->categories); @@ -714,6 +732,7 @@ static const TypeInfo vfio_ccw_info = { .name = TYPE_VFIO_CCW, .parent = TYPE_S390_CCW, .instance_size = sizeof(VFIOCCWDevice), + .instance_init = vfio_ccw_instance_init, .class_init = vfio_ccw_class_init, }; -- Gitee From f702d050b4309bb7e7ffc159a3c41c82fe34ba07 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:54 +0800 Subject: [PATCH 645/939] vfio: Make VFIOContainerBase poiner parameter const in VFIOIOMMUOps callbacks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some of the callbacks in VFIOIOMMUOps pass VFIOContainerBase poiner, those callbacks only need read access to the sub object of VFIOContainerBase. So make VFIOContainerBase, VFIOContainer and VFIOIOMMUFDContainer as const in these callbacks. Local functions called by those callbacks also need same changes to avoid build error. Modify vfio_lookup_match_range/vfio_legacy_dma_map during backporting. 
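The build error mentioned above is the usual const-correctness ripple effect; an illustrative fragment (not taken from the patch):

    static int helper(VFIOContainer *container);        /* old prototype */

    static int callback(const VFIOContainerBase *bcontainer)
    {
        const VFIOContainer *container =
            container_of(bcontainer, VFIOContainer, bcontainer);

        /* Passing 'const VFIOContainer *' to a plain 'VFIOContainer *'
         * parameter discards the qualifier; a -Werror build rejects it,
         * so helper() has to take a const pointer as well. */
        return helper(container);
    }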
Suggested-by: Cédric Le Goater Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Reviewed-by: Eric Auger Tested-by: Eric Auger Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/common.c | 9 +++---- hw/vfio/container-base.c | 2 +- hw/vfio/container.c | 34 ++++++++++++++------------- hw/vfio/iommufd.c | 8 +++---- include/hw/vfio/vfio-common.h | 14 ++++++----- include/hw/vfio/vfio-container-base.h | 12 ++++++---- 6 files changed, 43 insertions(+), 36 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 0e900c6746..d572ec5880 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -204,7 +204,7 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainerBase *bcontainer) return true; } -bool vfio_devices_all_device_dirty_tracking(VFIOContainerBase *bcontainer) +bool vfio_devices_all_device_dirty_tracking(const VFIOContainerBase *bcontainer) { VFIODevice *vbasedev; @@ -221,7 +221,8 @@ bool vfio_devices_all_device_dirty_tracking(VFIOContainerBase *bcontainer) * Check if all VFIO devices are running and migration is active, which is * essentially equivalent to the migration being in pre-copy phase. */ -bool vfio_devices_all_running_and_mig_active(VFIOContainerBase *bcontainer) +bool +vfio_devices_all_running_and_mig_active(const VFIOContainerBase *bcontainer) { VFIODevice *vbasedev; @@ -1139,7 +1140,7 @@ static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova, return 0; } -int vfio_devices_query_dirty_bitmap(VFIOContainerBase *bcontainer, +int vfio_devices_query_dirty_bitmap(const VFIOContainerBase *bcontainer, VFIOBitmap *vbmap, hwaddr iova, hwaddr size) { @@ -1162,7 +1163,7 @@ int vfio_devices_query_dirty_bitmap(VFIOContainerBase *bcontainer, return 0; } -int vfio_get_dirty_bitmap(VFIOContainerBase *bcontainer, uint64_t iova, +int vfio_get_dirty_bitmap(const VFIOContainerBase *bcontainer, uint64_t iova, uint64_t size, ram_addr_t ram_addr) { bool all_device_dirty_tracking = diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c index eee2dcfe76..1ffd25bbfa 100644 --- a/hw/vfio/container-base.c +++ b/hw/vfio/container-base.c @@ -63,7 +63,7 @@ int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer, return bcontainer->ops->set_dirty_page_tracking(bcontainer, start); } -int vfio_container_query_dirty_bitmap(VFIOContainerBase *bcontainer, +int vfio_container_query_dirty_bitmap(const VFIOContainerBase *bcontainer, VFIOBitmap *vbmap, hwaddr iova, hwaddr size) { diff --git a/hw/vfio/container.c b/hw/vfio/container.c index e32e1b51e0..67aeaa825b 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -63,11 +63,11 @@ static int vfio_ram_block_discard_disable(VFIOContainer *container, bool state) } } -static int vfio_dma_unmap_bitmap(VFIOContainer *container, +static int vfio_dma_unmap_bitmap(const VFIOContainer *container, hwaddr iova, ram_addr_t size, IOMMUTLBEntry *iotlb) { - VFIOContainerBase *bcontainer = &container->bcontainer; + const VFIOContainerBase *bcontainer = &container->bcontainer; struct vfio_iommu_type1_dma_unmap *unmap; struct vfio_bitmap *bitmap; VFIOBitmap vbmap; @@ -116,7 +116,7 @@ unmap_exit: return ret; } -VFIODMARange *vfio_lookup_match_range(VFIOContainer *container, +VFIODMARange *vfio_lookup_match_range(const VFIOContainer *container, hwaddr start_addr, hwaddr size) { VFIODMARange *qrange; @@ -142,11 +142,12 @@ void vfio_dma_range_init_dirty_bitmap(VFIODMARange *qrange) /* * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86 */ -static 
int vfio_legacy_dma_unmap(VFIOContainerBase *bcontainer, hwaddr iova, - ram_addr_t size, IOMMUTLBEntry *iotlb) +static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + IOMMUTLBEntry *iotlb) { - VFIOContainer *container = container_of(bcontainer, VFIOContainer, - bcontainer); + const VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); struct vfio_iommu_type1_dma_unmap unmap = { .argsz = sizeof(unmap), .flags = 0, @@ -216,11 +217,11 @@ static int vfio_legacy_dma_unmap(VFIOContainerBase *bcontainer, hwaddr iova, return 0; } -static int vfio_legacy_dma_map(VFIOContainerBase *bcontainer, hwaddr iova, +static int vfio_legacy_dma_map(const VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, void *vaddr, bool readonly) { VFIOContainer *container = container_of(bcontainer, VFIOContainer, - bcontainer); + bcontainer); struct vfio_iommu_type1_dma_map map = { .argsz = sizeof(map), .flags = VFIO_DMA_MAP_FLAG_READ, @@ -257,11 +258,12 @@ static int vfio_legacy_dma_map(VFIOContainerBase *bcontainer, hwaddr iova, return -errno; } -static int vfio_legacy_set_dirty_page_tracking(VFIOContainerBase *bcontainer, - bool start) +static int +vfio_legacy_set_dirty_page_tracking(const VFIOContainerBase *bcontainer, + bool start) { - VFIOContainer *container = container_of(bcontainer, VFIOContainer, - bcontainer); + const VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); int ret; struct vfio_iommu_type1_dirty_bitmap dirty = { .argsz = sizeof(dirty), @@ -283,12 +285,12 @@ static int vfio_legacy_set_dirty_page_tracking(VFIOContainerBase *bcontainer, return ret; } -static int vfio_legacy_query_dirty_bitmap(VFIOContainerBase *bcontainer, +static int vfio_legacy_query_dirty_bitmap(const VFIOContainerBase *bcontainer, VFIOBitmap *vbmap, hwaddr iova, hwaddr size) { - VFIOContainer *container = container_of(bcontainer, VFIOContainer, - bcontainer); + const VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); struct vfio_iommu_type1_dirty_bitmap *dbitmap; struct vfio_iommu_type1_dirty_bitmap_get *range; int ret; diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 5accd26484..87a561c545 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -26,10 +26,10 @@ #include "qemu/chardev_open.h" #include "pci.h" -static int iommufd_cdev_map(VFIOContainerBase *bcontainer, hwaddr iova, +static int iommufd_cdev_map(const VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, void *vaddr, bool readonly) { - VFIOIOMMUFDContainer *container = + const VFIOIOMMUFDContainer *container = container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); return iommufd_backend_map_dma(container->be, @@ -37,11 +37,11 @@ static int iommufd_cdev_map(VFIOContainerBase *bcontainer, hwaddr iova, iova, size, vaddr, readonly); } -static int iommufd_cdev_unmap(VFIOContainerBase *bcontainer, +static int iommufd_cdev_unmap(const VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, IOMMUTLBEntry *iotlb) { - VFIOIOMMUFDContainer *container = + const VFIOIOMMUFDContainer *container = container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); /* TODO: Handle dma_unmap_bitmap with iotlb args (migration) */ diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 5f35f2900b..37f01410d5 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -186,7 +186,7 @@ typedef struct VFIODisplay { VFIOAddressSpace *vfio_get_address_space(AddressSpace *as); void 
vfio_put_address_space(VFIOAddressSpace *space); -VFIODMARange *vfio_lookup_match_range(VFIOContainer *container, +VFIODMARange *vfio_lookup_match_range(const VFIOContainer *container, hwaddr start_addr, hwaddr size); void vfio_dma_range_init_dirty_bitmap(VFIODMARange *qrange); @@ -258,13 +258,15 @@ bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp); void vfio_migration_exit(VFIODevice *vbasedev); int vfio_bitmap_alloc(VFIOBitmap *vbmap, hwaddr size); -bool vfio_devices_all_running_and_mig_active(VFIOContainerBase *bcontainer); -bool vfio_devices_all_device_dirty_tracking(VFIOContainerBase *bcontainer); -int vfio_devices_query_dirty_bitmap(VFIOContainerBase *bcontainer, +bool +vfio_devices_all_running_and_mig_active(const VFIOContainerBase *bcontainer); +bool +vfio_devices_all_device_dirty_tracking(const VFIOContainerBase *bcontainer); +int vfio_devices_query_dirty_bitmap(const VFIOContainerBase *bcontainer, VFIOBitmap *vbmap, hwaddr iova, hwaddr size); -int vfio_get_dirty_bitmap(VFIOContainerBase *bcontainer, uint64_t iova, - uint64_t size, ram_addr_t ram_addr); +int vfio_get_dirty_bitmap(const VFIOContainerBase *bcontainer, uint64_t iova, + uint64_t size, ram_addr_t ram_addr); /* Returns 0 on success, or a negative errno. */ int vfio_device_get_name(VFIODevice *vbasedev, Error **errp); diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index 45bb19c767..2ae297ccda 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -82,7 +82,7 @@ void vfio_container_del_section_window(VFIOContainerBase *bcontainer, MemoryRegionSection *section); int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer, bool start); -int vfio_container_query_dirty_bitmap(VFIOContainerBase *bcontainer, +int vfio_container_query_dirty_bitmap(const VFIOContainerBase *bcontainer, VFIOBitmap *vbmap, hwaddr iova, hwaddr size); @@ -93,18 +93,20 @@ void vfio_container_destroy(VFIOContainerBase *bcontainer); struct VFIOIOMMUOps { /* basic feature */ - int (*dma_map)(VFIOContainerBase *bcontainer, + int (*dma_map)(const VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, void *vaddr, bool readonly); - int (*dma_unmap)(VFIOContainerBase *bcontainer, + int (*dma_unmap)(const VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, IOMMUTLBEntry *iotlb); int (*attach_device)(const char *name, VFIODevice *vbasedev, AddressSpace *as, Error **errp); void (*detach_device)(VFIODevice *vbasedev); /* migration feature */ - int (*set_dirty_page_tracking)(VFIOContainerBase *bcontainer, bool start); - int (*query_dirty_bitmap)(VFIOContainerBase *bcontainer, VFIOBitmap *vbmap, + int (*set_dirty_page_tracking)(const VFIOContainerBase *bcontainer, + bool start); + int (*query_dirty_bitmap)(const VFIOContainerBase *bcontainer, + VFIOBitmap *vbmap, hwaddr iova, hwaddr size); /* PCI specific */ int (*pci_hot_reset)(VFIODevice *vbasedev, bool single); -- Gitee From bcb031b40fe40d5b6347b2134fb039945b87e8a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Sat, 11 Jan 2025 10:52:55 +0800 Subject: [PATCH 646/939] hw/arm: Activate IOMMUFD for virt machines MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Cédric Le Goater Signed-off-by: Zhenzhong Duan Reviewed-by: Eric Auger Tested-by: Eric Auger Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/arm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git 
a/hw/arm/Kconfig b/hw/arm/Kconfig index c0a7d0bd58..4a0ea0628f 100644 --- a/hw/arm/Kconfig +++ b/hw/arm/Kconfig @@ -8,6 +8,7 @@ config ARM_VIRT imply TPM_TIS_SYSBUS imply TPM_TIS_I2C imply NVDIMM + imply IOMMUFD select ARM_GIC select ACPI select ARM_SMMUV3 -- Gitee From 3dfc0dd0b59925d1b73ca1a0db6d307ae597f76e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Sat, 11 Jan 2025 10:52:56 +0800 Subject: [PATCH 647/939] kconfig: Activate IOMMUFD for s390x machines MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Cédric Le Goater Signed-off-by: Zhenzhong Duan Reviewed-by: Matthew Rosato Reviewed-by: Eric Farman Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/s390x/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/s390x/Kconfig b/hw/s390x/Kconfig index 4c068d7960..26ad104485 100644 --- a/hw/s390x/Kconfig +++ b/hw/s390x/Kconfig @@ -6,6 +6,7 @@ config S390_CCW_VIRTIO imply VFIO_CCW imply WDT_DIAG288 imply PCIE_DEVICES + imply IOMMUFD select PCI_EXPRESS select S390_FLIC select S390_FLIC_KVM if KVM -- Gitee From 5405fa36c5f2784a9a6b19ee60d44b6cffb9f769 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Sat, 11 Jan 2025 10:52:57 +0800 Subject: [PATCH 648/939] hw/i386: Activate IOMMUFD for q35 machines MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Cédric Le Goater Signed-off-by: Zhenzhong Duan Reviewed-by: Eric Auger Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/i386/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/i386/Kconfig b/hw/i386/Kconfig index 682e324f1c..908f29e02b 100644 --- a/hw/i386/Kconfig +++ b/hw/i386/Kconfig @@ -105,6 +105,7 @@ config Q35 imply E1000E_PCI_EXPRESS imply VMPORT imply VMMOUSE + imply IOMMUFD select PC_PCI select PC_ACPI select PCI_EXPRESS_Q35 -- Gitee From 0781636a0c5652c25f81c06ba5fc289966021a33 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:58 +0800 Subject: [PATCH 649/939] vfio/pci: Move VFIODevice initializations in vfio_instance_init MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some of the VFIODevice initializations is in vfio_realize, move all of them in vfio_instance_init. No functional change intended. 
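For background (QOM behaviour, not something introduced by this patch):
instance_init runs when the object is created, before the user-supplied
properties are applied, whereas realize runs afterwards. That is why only
property-independent defaults can move; a minimal sketch of the split,
with hypothetical type names:

    static void example_instance_init(Object *obj)
    {
        ExampleVFIODevice *d = EXAMPLE_VFIO_DEVICE(obj);

        /* constant defaults, independent of -device properties */
        d->vbasedev.fd = -1;
    }

    static void example_realize(DeviceState *dev, Error **errp)
    {
        /* everything that needs the configured properties or the host
         * device (attaching, region setup, ...) stays here */
    }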
Suggested-by: Cédric Le Goater Signed-off-by: Zhenzhong Duan Reviewed-by: Philippe Mathieu-Daudé Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/pci.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 445d58c8e5..87405584d7 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -2969,9 +2969,6 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) if (vfio_device_get_name(vbasedev, errp) < 0) { return; } - vbasedev->ops = &vfio_pci_ops; - vbasedev->type = VFIO_DEVICE_TYPE_PCI; - vbasedev->dev = DEVICE(vdev); /* * Mediated devices *might* operate compatibly with discarding of RAM, but @@ -3320,6 +3317,7 @@ static void vfio_instance_init(Object *obj) { PCIDevice *pci_dev = PCI_DEVICE(obj); VFIOPCIDevice *vdev = VFIO_PCI(obj); + VFIODevice *vbasedev = &vdev->vbasedev; device_add_bootindex_property(obj, &vdev->bootindex, "bootindex", NULL, @@ -3328,7 +3326,11 @@ static void vfio_instance_init(Object *obj) vdev->host.bus = ~0U; vdev->host.slot = ~0U; vdev->host.function = ~0U; - vdev->vbasedev.fd = -1; + + vbasedev->type = VFIO_DEVICE_TYPE_PCI; + vbasedev->ops = &vfio_pci_ops; + vbasedev->dev = DEVICE(vdev); + vbasedev->fd = -1; vdev->nv_gpudirect_clique = 0xFF; -- Gitee From 594a30d0a9d0d569cf264ffd7b042aa39a404383 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:52:59 +0800 Subject: [PATCH 650/939] vfio/platform: Move VFIODevice initializations in vfio_platform_instance_init MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some of the VFIODevice initializations is in vfio_platform_realize, move all of them in vfio_platform_instance_init. No functional change intended. Suggested-by: Cédric Le Goater Signed-off-by: Zhenzhong Duan Reviewed-by: Philippe Mathieu-Daudé Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/platform.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c index a97d9c6234..506eb8193f 100644 --- a/hw/vfio/platform.c +++ b/hw/vfio/platform.c @@ -581,10 +581,6 @@ static void vfio_platform_realize(DeviceState *dev, Error **errp) VFIODevice *vbasedev = &vdev->vbasedev; int i, ret; - vbasedev->type = VFIO_DEVICE_TYPE_PLATFORM; - vbasedev->dev = dev; - vbasedev->ops = &vfio_platform_ops; - qemu_mutex_init(&vdev->intp_mutex); trace_vfio_platform_realize(vbasedev->sysfsdev ? @@ -659,8 +655,12 @@ static Property vfio_platform_dev_properties[] = { static void vfio_platform_instance_init(Object *obj) { VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(obj); + VFIODevice *vbasedev = &vdev->vbasedev; - vdev->vbasedev.fd = -1; + vbasedev->type = VFIO_DEVICE_TYPE_PLATFORM; + vbasedev->ops = &vfio_platform_ops; + vbasedev->dev = DEVICE(vdev); + vbasedev->fd = -1; } #ifdef CONFIG_IOMMUFD -- Gitee From 69da3907dc07bdb3cab4519922842820388bac4c Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:53:00 +0800 Subject: [PATCH 651/939] vfio/ap: Move VFIODevice initializations in vfio_ap_instance_init MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some of the VFIODevice initializations is in vfio_ap_realize, move all of them in vfio_ap_instance_init. No functional change intended. 
Suggested-by: Cédric Le Goater Signed-off-by: Zhenzhong Duan Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Eric Farman Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/ap.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c index f180e4a32a..95fe7cd98b 100644 --- a/hw/vfio/ap.c +++ b/hw/vfio/ap.c @@ -164,18 +164,6 @@ static void vfio_ap_realize(DeviceState *dev, Error **errp) return; } - vbasedev->ops = &vfio_ap_ops; - vbasedev->type = VFIO_DEVICE_TYPE_AP; - vbasedev->dev = dev; - - /* - * vfio-ap devices operate in a way compatible with discarding of - * memory in RAM blocks, as no pages are pinned in the host. - * This needs to be set before vfio_get_device() for vfio common to - * handle ram_block_discard_disable(). - */ - vapdev->vdev.ram_block_discard_allowed = true; - ret = vfio_attach_device(vbasedev->name, vbasedev, &address_space_memory, errp); if (ret) { @@ -236,8 +224,20 @@ static const VMStateDescription vfio_ap_vmstate = { static void vfio_ap_instance_init(Object *obj) { VFIOAPDevice *vapdev = VFIO_AP_DEVICE(obj); + VFIODevice *vbasedev = &vapdev->vdev; - vapdev->vdev.fd = -1; + vbasedev->type = VFIO_DEVICE_TYPE_AP; + vbasedev->ops = &vfio_ap_ops; + vbasedev->dev = DEVICE(vapdev); + vbasedev->fd = -1; + + /* + * vfio-ap devices operate in a way compatible with discarding of + * memory in RAM blocks, as no pages are pinned in the host. + * This needs to be set before vfio_get_device() for vfio common to + * handle ram_block_discard_disable(). + */ + vbasedev->ram_block_discard_allowed = true; } #ifdef CONFIG_IOMMUFD -- Gitee From 4d12d39e824a35014f753a25e5aa8ec0e275a38c Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:53:01 +0800 Subject: [PATCH 652/939] vfio/ccw: Move VFIODevice initializations in vfio_ccw_instance_init MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some of the VFIODevice initializations is in vfio_ccw_realize, move all of them in vfio_ccw_instance_init. No functional change intended. Suggested-by: Cédric Le Goater Signed-off-by: Zhenzhong Duan Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Eric Farman Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/ccw.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c index 2afdf17dbe..6305a4c1b8 100644 --- a/hw/vfio/ccw.c +++ b/hw/vfio/ccw.c @@ -594,20 +594,6 @@ static void vfio_ccw_realize(DeviceState *dev, Error **errp) return; } - vbasedev->ops = &vfio_ccw_ops; - vbasedev->type = VFIO_DEVICE_TYPE_CCW; - vbasedev->dev = dev; - - /* - * All vfio-ccw devices are believed to operate in a way compatible with - * discarding of memory in RAM blocks, ie. pages pinned in the host are - * in the current working set of the guest driver and therefore never - * overlap e.g., with pages available to the guest balloon driver. This - * needs to be set before vfio_get_device() for vfio common to handle - * ram_block_discard_disable(). 
- */ - vbasedev->ram_block_discard_allowed = true; - ret = vfio_attach_device(cdev->mdevid, vbasedev, &address_space_memory, errp); if (ret) { @@ -695,8 +681,22 @@ static const VMStateDescription vfio_ccw_vmstate = { static void vfio_ccw_instance_init(Object *obj) { VFIOCCWDevice *vcdev = VFIO_CCW(obj); + VFIODevice *vbasedev = &vcdev->vdev; + + vbasedev->type = VFIO_DEVICE_TYPE_CCW; + vbasedev->ops = &vfio_ccw_ops; + vbasedev->dev = DEVICE(vcdev); + vbasedev->fd = -1; - vcdev->vdev.fd = -1; + /* + * All vfio-ccw devices are believed to operate in a way compatible with + * discarding of memory in RAM blocks, ie. pages pinned in the host are + * in the current working set of the guest driver and therefore never + * overlap e.g., with pages available to the guest balloon driver. This + * needs to be set before vfio_get_device() for vfio common to handle + * ram_block_discard_disable(). + */ + vbasedev->ram_block_discard_allowed = true; } #ifdef CONFIG_IOMMUFD -- Gitee From 65c5381ba3ce5f062f0be9aa796e68b8a9d6bb3c Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:53:02 +0800 Subject: [PATCH 653/939] vfio: Introduce a helper function to initialize VFIODevice MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce a helper function to replace the common code to initialize VFIODevice in pci, platform, ap and ccw VFIO device. No functional change intended. Suggested-by: Cédric Le Goater Signed-off-by: Zhenzhong Duan Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- hw/vfio/ap.c | 8 ++------ hw/vfio/ccw.c | 8 ++------ hw/vfio/helpers.c | 11 +++++++++++ hw/vfio/pci.c | 6 ++---- hw/vfio/platform.c | 6 ++---- include/hw/vfio/vfio-common.h | 2 ++ 6 files changed, 21 insertions(+), 20 deletions(-) diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c index 95fe7cd98b..e157aa1ff7 100644 --- a/hw/vfio/ap.c +++ b/hw/vfio/ap.c @@ -226,18 +226,14 @@ static void vfio_ap_instance_init(Object *obj) VFIOAPDevice *vapdev = VFIO_AP_DEVICE(obj); VFIODevice *vbasedev = &vapdev->vdev; - vbasedev->type = VFIO_DEVICE_TYPE_AP; - vbasedev->ops = &vfio_ap_ops; - vbasedev->dev = DEVICE(vapdev); - vbasedev->fd = -1; - /* * vfio-ap devices operate in a way compatible with discarding of * memory in RAM blocks, as no pages are pinned in the host. * This needs to be set before vfio_get_device() for vfio common to * handle ram_block_discard_disable(). */ - vbasedev->ram_block_discard_allowed = true; + vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_AP, &vfio_ap_ops, + DEVICE(vapdev), true); } #ifdef CONFIG_IOMMUFD diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c index 6305a4c1b8..90e4a53437 100644 --- a/hw/vfio/ccw.c +++ b/hw/vfio/ccw.c @@ -683,11 +683,6 @@ static void vfio_ccw_instance_init(Object *obj) VFIOCCWDevice *vcdev = VFIO_CCW(obj); VFIODevice *vbasedev = &vcdev->vdev; - vbasedev->type = VFIO_DEVICE_TYPE_CCW; - vbasedev->ops = &vfio_ccw_ops; - vbasedev->dev = DEVICE(vcdev); - vbasedev->fd = -1; - /* * All vfio-ccw devices are believed to operate in a way compatible with * discarding of memory in RAM blocks, ie. pages pinned in the host are @@ -696,7 +691,8 @@ static void vfio_ccw_instance_init(Object *obj) * needs to be set before vfio_get_device() for vfio common to handle * ram_block_discard_disable(). 
*/ - vbasedev->ram_block_discard_allowed = true; + vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_CCW, &vfio_ccw_ops, + DEVICE(vcdev), true); } #ifdef CONFIG_IOMMUFD diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c index 3592c3d54e..6789870802 100644 --- a/hw/vfio/helpers.c +++ b/hw/vfio/helpers.c @@ -652,3 +652,14 @@ void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp) } vbasedev->fd = fd; } + +void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops, + DeviceState *dev, bool ram_discard) +{ + vbasedev->type = type; + vbasedev->ops = ops; + vbasedev->dev = dev; + vbasedev->fd = -1; + + vbasedev->ram_block_discard_allowed = ram_discard; +} diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 87405584d7..1874ec1aba 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -3327,10 +3327,8 @@ static void vfio_instance_init(Object *obj) vdev->host.slot = ~0U; vdev->host.function = ~0U; - vbasedev->type = VFIO_DEVICE_TYPE_PCI; - vbasedev->ops = &vfio_pci_ops; - vbasedev->dev = DEVICE(vdev); - vbasedev->fd = -1; + vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_PCI, &vfio_pci_ops, + DEVICE(vdev), false); vdev->nv_gpudirect_clique = 0xFF; diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c index 506eb8193f..a8d9b7da63 100644 --- a/hw/vfio/platform.c +++ b/hw/vfio/platform.c @@ -657,10 +657,8 @@ static void vfio_platform_instance_init(Object *obj) VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(obj); VFIODevice *vbasedev = &vdev->vbasedev; - vbasedev->type = VFIO_DEVICE_TYPE_PLATFORM; - vbasedev->ops = &vfio_platform_ops; - vbasedev->dev = DEVICE(vdev); - vbasedev->fd = -1; + vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_PLATFORM, &vfio_platform_ops, + DEVICE(vdev), false); } #ifdef CONFIG_IOMMUFD diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 37f01410d5..151b2ab65f 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -271,4 +271,6 @@ int vfio_get_dirty_bitmap(const VFIOContainerBase *bcontainer, uint64_t iova, /* Returns 0 on success, or a negative errno. */ int vfio_device_get_name(VFIODevice *vbasedev, Error **errp); void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp); +void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops, + DeviceState *dev, bool ram_discard); #endif /* HW_VFIO_VFIO_COMMON_H */ -- Gitee From fd1d6d64803a052adcab8c7993ca40cabc9c926d Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 11 Jan 2025 10:53:03 +0800 Subject: [PATCH 654/939] docs/devel: Add VFIO iommufd backend documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Suggested-by: Cédric Le Goater Signed-off-by: Eric Auger Signed-off-by: Yi Liu Signed-off-by: Zhenzhong Duan Tested-by: Nicolin Chen Signed-off-by: Cédric Le Goater Signed-off-by: Zhou Wang --- MAINTAINERS | 1 + docs/devel/index-internals.rst | 1 + docs/devel/vfio-iommufd.rst | 166 +++++++++++++++++++++++++++++++++ 3 files changed, 168 insertions(+) create mode 100644 docs/devel/vfio-iommufd.rst diff --git a/MAINTAINERS b/MAINTAINERS index ca70bb4e64..0ddb20a35f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2176,6 +2176,7 @@ F: backends/iommufd.c F: include/sysemu/iommufd.h F: include/qemu/chardev_open.h F: util/chardev_open.c +F: docs/devel/vfio-iommufd.rst vhost M: Michael S. 
Tsirkin diff --git a/docs/devel/index-internals.rst b/docs/devel/index-internals.rst index 6f81df92bc..3def4a138b 100644 --- a/docs/devel/index-internals.rst +++ b/docs/devel/index-internals.rst @@ -18,5 +18,6 @@ Details about QEMU's various subsystems including how to add features to them. s390-dasd-ipl tracing vfio-migration + vfio-iommufd writing-monitor-commands virtio-backends diff --git a/docs/devel/vfio-iommufd.rst b/docs/devel/vfio-iommufd.rst new file mode 100644 index 0000000000..3d1c11f175 --- /dev/null +++ b/docs/devel/vfio-iommufd.rst @@ -0,0 +1,166 @@ +=============================== +IOMMUFD BACKEND usage with VFIO +=============================== + +(Same meaning for backend/container/BE) + +With the introduction of iommufd, the Linux kernel provides a generic +interface for user space drivers to propagate their DMA mappings to kernel +for assigned devices. While the legacy kernel interface is group-centric, +the new iommufd interface is device-centric, relying on device fd and iommufd. + +To support both interfaces in the QEMU VFIO device, introduce a base container +to abstract the common part of VFIO legacy and iommufd container. So that the +generic VFIO code can use either container. + +The base container implements generic functions such as memory_listener and +address space management whereas the derived container implements callbacks +specific to either legacy or iommufd. Each container has its own way to setup +secure context and dma management interface. The below diagram shows how it +looks like with both containers. + +:: + + VFIO AddressSpace/Memory + +-------+ +----------+ +-----+ +-----+ + | pci | | platform | | ap | | ccw | + +---+---+ +----+-----+ +--+--+ +--+--+ +----------------------+ + | | | | | AddressSpace | + | | | | +------------+---------+ + +---V-----------V-----------V--------V----+ / + | VFIOAddressSpace | <------------+ + | | | MemoryListener + | VFIOContainerBase list | + +-------+----------------------------+----+ + | | + | | + +-------V------+ +--------V----------+ + | iommufd | | vfio legacy | + | container | | container | + +-------+------+ +--------+----------+ + | | + | /dev/iommu | /dev/vfio/vfio + | /dev/vfio/devices/vfioX | /dev/vfio/$group_id + Userspace | | + ============+============================+=========================== + Kernel | device fd | + +---------------+ | group/container fd + | (BIND_IOMMUFD | | (SET_CONTAINER/SET_IOMMU) + | ATTACH_IOAS) | | device fd + | | | + | +-------V------------V-----------------+ + iommufd | | vfio | + (map/unmap | +---------+--------------------+-------+ + ioas_copy) | | | map/unmap + | | | + +------V------+ +-----V------+ +------V--------+ + | iommfd core | | device | | vfio iommu | + +-------------+ +------------+ +---------------+ + +* Secure Context setup + + - iommufd BE: uses device fd and iommufd to setup secure context + (bind_iommufd, attach_ioas) + - vfio legacy BE: uses group fd and container fd to setup secure context + (set_container, set_iommu) + +* Device access + + - iommufd BE: device fd is opened through ``/dev/vfio/devices/vfioX`` + - vfio legacy BE: device fd is retrieved from group fd ioctl + +* DMA Mapping flow + + 1. VFIOAddressSpace receives MemoryRegion add/del via MemoryListener + 2. 
VFIO populates DMA map/unmap via the container BEs + * iommufd BE: uses iommufd + * vfio legacy BE: uses container fd + +Example configuration +===================== + +Step 1: configure the host device +--------------------------------- + +It's exactly same as the VFIO device with legacy VFIO container. + +Step 2: configure QEMU +---------------------- + +Interactions with the ``/dev/iommu`` are abstracted by a new iommufd +object (compiled in with the ``CONFIG_IOMMUFD`` option). + +Any QEMU device (e.g. VFIO device) wishing to use ``/dev/iommu`` must +be linked with an iommufd object. It gets a new optional property +named iommufd which allows to pass an iommufd object. Take ``vfio-pci`` +device for example: + +.. code-block:: bash + + -object iommufd,id=iommufd0 + -device vfio-pci,host=0000:02:00.0,iommufd=iommufd0 + +Note the ``/dev/iommu`` and VFIO cdev can be externally opened by a +management layer. In such a case the fd is passed, the fd supports a +string naming the fd or a number, for example: + +.. code-block:: bash + + -object iommufd,id=iommufd0,fd=22 + -device vfio-pci,iommufd=iommufd0,fd=23 + +If the ``fd`` property is not passed, the fd is opened by QEMU. + +If no ``iommufd`` object is passed to the ``vfio-pci`` device, iommufd +is not used and the user gets the behavior based on the legacy VFIO +container: + +.. code-block:: bash + + -device vfio-pci,host=0000:02:00.0 + +Supported platform +================== + +Supports x86, ARM and s390x currently. + +Caveats +======= + +Dirty page sync +--------------- + +Dirty page sync with iommufd backend is unsupported yet, live migration is +disabled by default. But it can be force enabled like below, low efficient +though. + +.. code-block:: bash + + -object iommufd,id=iommufd0 + -device vfio-pci,host=0000:02:00.0,iommufd=iommufd0,enable-migration=on + +P2P DMA +------- + +PCI p2p DMA is unsupported as IOMMUFD doesn't support mapping hardware PCI +BAR region yet. Below warning shows for assigned PCI device, it's not a bug. + +.. code-block:: none + + qemu-system-x86_64: warning: IOMMU_IOAS_MAP failed: Bad address, PCI BAR? + qemu-system-x86_64: vfio_container_dma_map(0x560cb6cb1620, 0xe000000021000, 0x3000, 0x7f32ed55c000) = -14 (Bad address) + +FD passing with mdev +-------------------- + +``vfio-pci`` device checks sysfsdev property to decide if backend is a mdev. +If FD passing is used, there is no way to know that and the mdev is treated +like a real PCI device. There is an error as below if user wants to enable +RAM discarding for mdev. + +.. code-block:: none + + qemu-system-x86_64: -device vfio-pci,iommufd=iommufd0,x-balloon-allowed=on,fd=9: vfio VFIO_FD9: x-balloon-allowed only potentially compatible with mdev devices + +``vfio-ap`` and ``vfio-ccw`` devices don't have same issue as their backend +devices are always mdev and RAM discarding is force enabled. -- Gitee From 1bb64d6e69c385af5817dc6f0c3bbd204783c237 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Tue, 19 Dec 2023 07:58:17 +0100 Subject: [PATCH 655/939] vfio/container: Introduce vfio_legacy_setup() for further cleanups MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This will help subsequent patches to unify the initialization of type1 and sPAPR IOMMU backends. 
Reviewed-by: Zhenzhong Duan Tested-by: Eric Farman Signed-off-by: Cédric Le Goater --- hw/vfio/container.c | 60 +++++++++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 24 deletions(-) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 67aeaa825b..27ce31c883 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -567,6 +567,35 @@ static void shared_memory_listener_unregister(void) g_shl = NULL; } +static int vfio_legacy_setup(VFIOContainerBase *bcontainer, Error **errp) +{ + VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); + g_autofree struct vfio_iommu_type1_info *info = NULL; + int ret; + + ret = vfio_get_iommu_info(container, &info); + if (ret) { + error_setg_errno(errp, -ret, "Failed to get VFIO IOMMU info"); + return ret; + } + + if (info->flags & VFIO_IOMMU_INFO_PGSIZES) { + bcontainer->pgsizes = info->iova_pgsizes; + } else { + bcontainer->pgsizes = qemu_real_host_page_size(); + } + + if (!vfio_get_info_dma_avail(info, &bcontainer->dma_max_mappings)) { + bcontainer->dma_max_mappings = 65535; + } + + vfio_get_info_iova_range(info, bcontainer); + + vfio_get_iommu_info_migration(container, info); + return 0; +} + static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, Error **errp) { @@ -665,31 +694,8 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, case VFIO_TYPE1v2_IOMMU: case VFIO_TYPE1_IOMMU: case VFIO_TYPE1v2_S_IOMMU: - { - struct vfio_iommu_type1_info *info; - - ret = vfio_get_iommu_info(container, &info); - if (ret) { - error_setg_errno(errp, -ret, "Failed to get VFIO IOMMU info"); - goto enable_discards_exit; - } - - if (info->flags & VFIO_IOMMU_INFO_PGSIZES) { - bcontainer->pgsizes = info->iova_pgsizes; - } else { - bcontainer->pgsizes = qemu_real_host_page_size(); - } - - if (!vfio_get_info_dma_avail(info, &bcontainer->dma_max_mappings)) { - bcontainer->dma_max_mappings = 65535; - } - - vfio_get_info_iova_range(info, bcontainer); - - vfio_get_iommu_info_migration(container, info); - g_free(info); + ret = vfio_legacy_setup(bcontainer, errp); break; - } case VFIO_SPAPR_TCE_v2_IOMMU: case VFIO_SPAPR_TCE_IOMMU: { @@ -699,6 +705,12 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, } break; } + default: + g_assert_not_reached(); + } + + if (ret) { + goto enable_discards_exit; } vfio_kvm_device_add_group(group); -- Gitee From 7a81c3919dda48b4e12b83ceb661896523cce6ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Tue, 19 Dec 2023 07:58:18 +0100 Subject: [PATCH 656/939] vfio/container: Initialize VFIOIOMMUOps under vfio_init_container() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vfio_init_container() already defines the IOMMU type of the container. Do the same for the VFIOIOMMUOps struct. This prepares ground for the following patches that will deduce the associated VFIOIOMMUOps struct from the IOMMU type. 
Reviewed-by: Zhenzhong Duan Tested-by: Eric Farman Signed-off-by: Cédric Le Goater --- hw/vfio/container.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 27ce31c883..dc805ceb12 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -430,7 +430,7 @@ static int vfio_get_iommu_type(VFIOContainer *container, } static int vfio_init_container(VFIOContainer *container, int group_fd, - Error **errp) + VFIOAddressSpace *space, Error **errp) { int iommu_type, dirty_log_manual_clear, ret; @@ -467,7 +467,7 @@ static int vfio_init_container(VFIOContainer *container, int group_fd, if (dirty_log_manual_clear) { container->dirty_log_manual_clear = dirty_log_manual_clear; } - + vfio_container_init(&container->bcontainer, space, &vfio_legacy_ops); return 0; } @@ -679,7 +679,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, bcontainer = &container->bcontainer; vfio_container_init(bcontainer, space, &vfio_legacy_ops); - ret = vfio_init_container(container, group->fd, errp); + ret = vfio_init_container(container, group->fd, space, errp); if (ret) { goto free_container_exit; } -- Gitee From 5f62836c64d5abdbdb0d8fb9f0d2fd0d87f47b0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Tue, 19 Dec 2023 07:58:19 +0100 Subject: [PATCH 657/939] vfio/container: Introduce a VFIOIOMMU QOM interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit VFIOContainerBase was not introduced as an abstract QOM object because it felt unnecessary to expose all the IOMMU backends to the QEMU machine and human interface. However, we can still abstract the IOMMU backend handlers using a QOM interface class. This provides more flexibility when referencing the various implementations. Simply transform the VFIOIOMMUOps struct in an InterfaceClass and do some initial name replacements. Next changes will start converting VFIOIOMMUOps. 
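A condensed sketch of how a backend is expected to plug into the new
interface; the type name and handler are hypothetical, but the
registration pattern is the one the following patches apply to the
legacy, sPAPR and iommufd backends:

    static int example_dma_map(const VFIOContainerBase *bcontainer, hwaddr iova,
                               ram_addr_t size, void *vaddr, bool readonly)
    {
        return 0; /* backend-specific mapping would go here */
    }

    static void example_backend_class_init(ObjectClass *klass, void *data)
    {
        VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass);

        vioc->dma_map = example_dma_map;
    }

    static const TypeInfo example_types[] = {
        {
            .name = TYPE_VFIO_IOMMU "-example",   /* hypothetical backend */
            .parent = TYPE_VFIO_IOMMU,
            .class_init = example_backend_class_init,
        },
    };

    DEFINE_TYPES(example_types)

Callers keep dispatching through bcontainer->ops, which now points to
such a class instead of a static struct.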
Reviewed-by: Zhenzhong Duan Tested-by: Eric Farman Signed-off-by: Cédric Le Goater --- hw/vfio/common.c | 2 +- hw/vfio/container-base.c | 12 +++++++++++- hw/vfio/pci.c | 2 +- include/hw/vfio/vfio-container-base.h | 23 +++++++++++++++++++---- 4 files changed, 32 insertions(+), 7 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index d572ec5880..abca6aa01a 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -1649,7 +1649,7 @@ retry: int vfio_attach_device(char *name, VFIODevice *vbasedev, AddressSpace *as, Error **errp) { - const VFIOIOMMUOps *ops = &vfio_legacy_ops; + const VFIOIOMMUClass *ops = &vfio_legacy_ops; #ifdef CONFIG_IOMMUFD if (vbasedev->iommufd) { diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c index 1ffd25bbfa..913ae49077 100644 --- a/hw/vfio/container-base.c +++ b/hw/vfio/container-base.c @@ -72,7 +72,7 @@ int vfio_container_query_dirty_bitmap(const VFIOContainerBase *bcontainer, } void vfio_container_init(VFIOContainerBase *bcontainer, VFIOAddressSpace *space, - const VFIOIOMMUOps *ops) + const VFIOIOMMUClass *ops) { bcontainer->ops = ops; bcontainer->space = space; @@ -99,3 +99,13 @@ void vfio_container_destroy(VFIOContainerBase *bcontainer) g_list_free_full(bcontainer->iova_ranges, g_free); } + +static const TypeInfo types[] = { + { + .name = TYPE_VFIO_IOMMU, + .parent = TYPE_INTERFACE, + .class_size = sizeof(VFIOIOMMUClass), + }, +}; + +DEFINE_TYPES(types) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 1874ec1aba..d84a9e73a6 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -2488,7 +2488,7 @@ int vfio_pci_get_pci_hot_reset_info(VFIOPCIDevice *vdev, static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) { VFIODevice *vbasedev = &vdev->vbasedev; - const VFIOIOMMUOps *ops = vbasedev->bcontainer->ops; + const VFIOIOMMUClass *ops = vbasedev->bcontainer->ops; return ops->pci_hot_reset(vbasedev, single); } diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index 2ae297ccda..ce8bf9e2e6 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -16,7 +16,8 @@ #include "exec/memory.h" typedef struct VFIODevice VFIODevice; -typedef struct VFIOIOMMUOps VFIOIOMMUOps; +typedef struct VFIOIOMMUClass VFIOIOMMUClass; +#define VFIOIOMMUOps VFIOIOMMUClass /* To remove */ typedef struct { unsigned long *bitmap; @@ -34,7 +35,7 @@ typedef struct VFIOAddressSpace { * This is the base object for vfio container backends */ typedef struct VFIOContainerBase { - const VFIOIOMMUOps *ops; + const VFIOIOMMUClass *ops; VFIOAddressSpace *space; MemoryListener listener; Error *error; @@ -88,10 +89,24 @@ int vfio_container_query_dirty_bitmap(const VFIOContainerBase *bcontainer, void vfio_container_init(VFIOContainerBase *bcontainer, VFIOAddressSpace *space, - const VFIOIOMMUOps *ops); + const VFIOIOMMUClass *ops); void vfio_container_destroy(VFIOContainerBase *bcontainer); -struct VFIOIOMMUOps { + +#define TYPE_VFIO_IOMMU "vfio-iommu" + +/* + * VFIOContainerBase is not an abstract QOM object because it felt + * unnecessary to expose all the IOMMU backends to the QEMU machine + * and human interface. However, we can still abstract the IOMMU + * backend handlers using a QOM interface class. This provides more + * flexibility when referencing the various implementations. 
+ */ +DECLARE_CLASS_CHECKERS(VFIOIOMMUClass, VFIO_IOMMU, TYPE_VFIO_IOMMU) + +struct VFIOIOMMUClass { + InterfaceClass parent_class; + /* basic feature */ int (*dma_map)(const VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, -- Gitee From 9f04d045ef1b2d206b002d20b792111b3ce86909 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Tue, 19 Dec 2023 07:58:20 +0100 Subject: [PATCH 658/939] vfio/container: Introduce a VFIOIOMMU legacy QOM interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert the legacy VFIOIOMMUOps struct to the new VFIOIOMMU QOM interface. The set of of operations for this backend can be referenced with a literal typename instead of a C struct. This will simplify support of multiple backends. Reviewed-by: Zhenzhong Duan Tested-by: Eric Farman Signed-off-by: Cédric Le Goater --- hw/vfio/common.c | 6 ++- hw/vfio/container.c | 59 ++++++++++++++++++++++----- include/hw/vfio/vfio-common.h | 1 - include/hw/vfio/vfio-container-base.h | 1 + 4 files changed, 55 insertions(+), 12 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index abca6aa01a..d98c3b7422 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -1649,13 +1649,17 @@ retry: int vfio_attach_device(char *name, VFIODevice *vbasedev, AddressSpace *as, Error **errp) { - const VFIOIOMMUClass *ops = &vfio_legacy_ops; + const VFIOIOMMUClass *ops = + VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY)); #ifdef CONFIG_IOMMUFD if (vbasedev->iommufd) { ops = &vfio_iommufd_ops; } #endif + + assert(ops); + return ops->attach_device(name, vbasedev, as, errp); } diff --git a/hw/vfio/container.c b/hw/vfio/container.c index dc805ceb12..6b8de8f471 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -429,10 +429,30 @@ static int vfio_get_iommu_type(VFIOContainer *container, return -EINVAL; } +/* + * vfio_get_iommu_ops - get a VFIOIOMMUClass associated with a type + */ +static const VFIOIOMMUClass *vfio_get_iommu_class(int iommu_type, Error **errp) +{ + ObjectClass *klass = NULL; + + switch (iommu_type) { + case VFIO_TYPE1v2_IOMMU: + case VFIO_TYPE1_IOMMU: + klass = object_class_by_name(TYPE_VFIO_IOMMU_LEGACY); + break; + default: + g_assert_not_reached(); + }; + + return VFIO_IOMMU_CLASS(klass); +} + static int vfio_init_container(VFIOContainer *container, int group_fd, VFIOAddressSpace *space, Error **errp) { int iommu_type, dirty_log_manual_clear, ret; + const VFIOIOMMUClass *vioc; iommu_type = vfio_get_iommu_type(container, errp); if (iommu_type < 0) { @@ -467,7 +487,14 @@ static int vfio_init_container(VFIOContainer *container, int group_fd, if (dirty_log_manual_clear) { container->dirty_log_manual_clear = dirty_log_manual_clear; } - vfio_container_init(&container->bcontainer, space, &vfio_legacy_ops); + + vioc = vfio_get_iommu_class(iommu_type, errp); + if (!vioc) { + error_setg(errp, "No available IOMMU models"); + return -EINVAL; + } + + vfio_container_init(&container->bcontainer, space, vioc); return 0; } @@ -677,7 +704,6 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, container->fd = fd; QLIST_INIT(&container->dma_list); bcontainer = &container->bcontainer; - vfio_container_init(bcontainer, space, &vfio_legacy_ops); ret = vfio_init_container(container, group->fd, space, errp); if (ret) { @@ -1218,12 +1244,25 @@ out_single: return ret; } -const VFIOIOMMUOps vfio_legacy_ops = { - .dma_map = vfio_legacy_dma_map, - .dma_unmap = vfio_legacy_dma_unmap, - .attach_device = 
vfio_legacy_attach_device, - .detach_device = vfio_legacy_detach_device, - .set_dirty_page_tracking = vfio_legacy_set_dirty_page_tracking, - .query_dirty_bitmap = vfio_legacy_query_dirty_bitmap, - .pci_hot_reset = vfio_legacy_pci_hot_reset, +static void vfio_iommu_legacy_class_init(ObjectClass *klass, void *data) +{ + VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass); + + vioc->dma_map = vfio_legacy_dma_map; + vioc->dma_unmap = vfio_legacy_dma_unmap; + vioc->attach_device = vfio_legacy_attach_device; + vioc->detach_device = vfio_legacy_detach_device; + vioc->set_dirty_page_tracking = vfio_legacy_set_dirty_page_tracking; + vioc->query_dirty_bitmap = vfio_legacy_query_dirty_bitmap; + vioc->pci_hot_reset = vfio_legacy_pci_hot_reset; }; + +static const TypeInfo types[] = { + { + .name = TYPE_VFIO_IOMMU_LEGACY, + .parent = TYPE_VFIO_IOMMU, + .class_init = vfio_iommu_legacy_class_init, + }, +}; + +DEFINE_TYPES(types) diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 151b2ab65f..f78a97006c 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -224,7 +224,6 @@ typedef QLIST_HEAD(VFIOGroupList, VFIOGroup) VFIOGroupList; typedef QLIST_HEAD(VFIODeviceList, VFIODevice) VFIODeviceList; extern VFIOGroupList vfio_group_list; extern VFIODeviceList vfio_device_list; -extern const VFIOIOMMUOps vfio_legacy_ops; extern const VFIOIOMMUOps vfio_iommufd_ops; extern const MemoryListener vfio_memory_listener; extern int vfio_kvm_device_fd; diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index ce8bf9e2e6..dce801378b 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -94,6 +94,7 @@ void vfio_container_destroy(VFIOContainerBase *bcontainer); #define TYPE_VFIO_IOMMU "vfio-iommu" +#define TYPE_VFIO_IOMMU_LEGACY TYPE_VFIO_IOMMU "-legacy" /* * VFIOContainerBase is not an abstract QOM object because it felt -- Gitee From b8e67d06ec3036cd3fd6d625c550e0c542e49d60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Tue, 19 Dec 2023 07:58:21 +0100 Subject: [PATCH 659/939] vfio/container: Intoduce a new VFIOIOMMUClass::setup handler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This will help in converting the sPAPR IOMMU backend to a QOM interface. 
Reviewed-by: Zhenzhong Duan Tested-by: Eric Farman Signed-off-by: Cédric Le Goater --- hw/vfio/container.c | 1 + include/hw/vfio/vfio-container-base.h | 1 + 2 files changed, 2 insertions(+) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 6b8de8f471..845239eff4 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -1248,6 +1248,7 @@ static void vfio_iommu_legacy_class_init(ObjectClass *klass, void *data) { VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass); + vioc->setup = vfio_legacy_setup; vioc->dma_map = vfio_legacy_dma_map; vioc->dma_unmap = vfio_legacy_dma_unmap; vioc->attach_device = vfio_legacy_attach_device; diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index dce801378b..614de90767 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -109,6 +109,7 @@ struct VFIOIOMMUClass { InterfaceClass parent_class; /* basic feature */ + int (*setup)(VFIOContainerBase *bcontainer, Error **errp); int (*dma_map)(const VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, void *vaddr, bool readonly); -- Gitee From 2692ea754863364731e5712ebf83208690179089 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Tue, 19 Dec 2023 07:58:22 +0100 Subject: [PATCH 660/939] vfio/spapr: Introduce a sPAPR VFIOIOMMU QOM interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move vfio_spapr_container_setup() to a VFIOIOMMUClass::setup handler and convert the sPAPR VFIOIOMMUOps struct to a QOM interface. The sPAPR QOM interface inherits from the legacy QOM interface because because both have the same basic needs. The sPAPR interface is then extended with the handlers specific to the sPAPR IOMMU. This allows reuse and provides better abstraction of the backends. It will be useful to avoid compiling the sPAPR IOMMU backend on targets not supporting it. 
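The reason inheritance is sufficient (QOM behaviour, not something this
patch adds): a child class is initialised as a copy of its parent's class
structure before its own class_init runs, so every handler the sPAPR
class_init leaves untouched keeps the legacy implementation. A sketch
with hypothetical names:

    static int example_add_window(VFIOContainerBase *bcontainer,
                                  MemoryRegionSection *section, Error **errp)
    {
        return 0; /* sPAPR-specific window handling would go here */
    }

    static void example_spapr_class_init(ObjectClass *klass, void *data)
    {
        VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass);

        /* dma_map, dma_unmap, attach_device, ... are inherited from the
         * legacy class; only the sPAPR-specific hooks are overridden */
        vioc->add_window = example_add_window;
    }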
Reviewed-by: Zhenzhong Duan Tested-by: Eric Farman Signed-off-by: Cédric Le Goater --- hw/vfio/container.c | 24 ++++++------------------ hw/vfio/spapr.c | 20 ++++++++++++++++++++ include/hw/vfio/vfio-container-base.h | 1 + 3 files changed, 27 insertions(+), 18 deletions(-) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 845239eff4..e245d5a082 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -441,6 +441,10 @@ static const VFIOIOMMUClass *vfio_get_iommu_class(int iommu_type, Error **errp) case VFIO_TYPE1_IOMMU: klass = object_class_by_name(TYPE_VFIO_IOMMU_LEGACY); break; + case VFIO_SPAPR_TCE_v2_IOMMU: + case VFIO_SPAPR_TCE_IOMMU: + klass = object_class_by_name(TYPE_VFIO_IOMMU_SPAPR); + break; default: g_assert_not_reached(); }; @@ -716,25 +720,9 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, goto free_container_exit; } - switch (container->iommu_type) { - case VFIO_TYPE1v2_IOMMU: - case VFIO_TYPE1_IOMMU: - case VFIO_TYPE1v2_S_IOMMU: - ret = vfio_legacy_setup(bcontainer, errp); - break; - case VFIO_SPAPR_TCE_v2_IOMMU: - case VFIO_SPAPR_TCE_IOMMU: - { - ret = vfio_spapr_container_init(container, errp); - if (ret) { - goto enable_discards_exit; - } - break; - } - default: - g_assert_not_reached(); - } + assert(bcontainer->ops->setup); + ret = bcontainer->ops->setup(bcontainer, errp); if (ret) { goto enable_discards_exit; } diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c index 5c6426e697..3694dfb874 100644 --- a/hw/vfio/spapr.c +++ b/hw/vfio/spapr.c @@ -543,3 +543,23 @@ void vfio_spapr_container_deinit(VFIOContainer *container) g_free(hostwin); } } + +static void vfio_iommu_spapr_class_init(ObjectClass *klass, void *data) +{ + VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass); + + vioc->add_window = vfio_spapr_container_add_section_window; + vioc->del_window = vfio_spapr_container_del_section_window; + //vioc->release = vfio_spapr_container_release; + //vioc->setup = vfio_spapr_container_setup; +}; + +static const TypeInfo types[] = { + { + .name = TYPE_VFIO_IOMMU_SPAPR, + .parent = TYPE_VFIO_IOMMU_LEGACY, + .class_init = vfio_iommu_spapr_class_init, + }, +}; + +DEFINE_TYPES(types) diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index 614de90767..1085109d0c 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -95,6 +95,7 @@ void vfio_container_destroy(VFIOContainerBase *bcontainer); #define TYPE_VFIO_IOMMU "vfio-iommu" #define TYPE_VFIO_IOMMU_LEGACY TYPE_VFIO_IOMMU "-legacy" +#define TYPE_VFIO_IOMMU_SPAPR TYPE_VFIO_IOMMU "-spapr" /* * VFIOContainerBase is not an abstract QOM object because it felt -- Gitee From 66f71e9acdaa0c1c31770f00a21ea32644ebaac9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Tue, 19 Dec 2023 07:58:23 +0100 Subject: [PATCH 661/939] vfio/iommufd: Introduce a VFIOIOMMU iommufd QOM interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As previously done for the sPAPR and legacy IOMMU backends, convert the VFIOIOMMUOps struct to a QOM interface. The set of of operations for this backend can be referenced with a literal typename instead of a C struct. 
Reviewed-by: Zhenzhong Duan Tested-by: Eric Farman Signed-off-by: Cédric Le Goater --- hw/vfio/common.c | 2 +- hw/vfio/iommufd.c | 35 ++++++++++++++++++++------- include/hw/vfio/vfio-common.h | 1 - include/hw/vfio/vfio-container-base.h | 2 +- 4 files changed, 28 insertions(+), 12 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index d98c3b7422..a8b7129fa5 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -1654,7 +1654,7 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev, #ifdef CONFIG_IOMMUFD if (vbasedev->iommufd) { - ops = &vfio_iommufd_ops; + ops = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD)); } #endif diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 87a561c545..d4c586e842 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -319,6 +319,8 @@ static int iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, int ret, devfd; uint32_t ioas_id; Error *err = NULL; + const VFIOIOMMUClass *iommufd_vioc = + VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD)); if (vbasedev->fd < 0) { devfd = iommufd_cdev_getfd(vbasedev->sysfsdev, errp); @@ -340,7 +342,7 @@ static int iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, /* try to attach to an existing container in this space */ QLIST_FOREACH(bcontainer, &space->containers, next) { container = container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); - if (bcontainer->ops != &vfio_iommufd_ops || + if (bcontainer->ops != iommufd_vioc || vbasedev->iommufd != container->be) { continue; } @@ -374,7 +376,7 @@ static int iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, container->ioas_id = ioas_id; bcontainer = &container->bcontainer; - vfio_container_init(bcontainer, space, &vfio_iommufd_ops); + vfio_container_init(bcontainer, space, iommufd_vioc); QLIST_INSERT_HEAD(&space->containers, bcontainer, next); ret = iommufd_cdev_attach_container(vbasedev, container, errp); @@ -476,9 +478,11 @@ static void iommufd_cdev_detach(VFIODevice *vbasedev) static VFIODevice *iommufd_cdev_pci_find_by_devid(__u32 devid) { VFIODevice *vbasedev_iter; + const VFIOIOMMUClass *iommufd_vioc = + VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD)); QLIST_FOREACH(vbasedev_iter, &vfio_device_list, global_next) { - if (vbasedev_iter->bcontainer->ops != &vfio_iommufd_ops) { + if (vbasedev_iter->bcontainer->ops != iommufd_vioc) { continue; } if (devid == vbasedev_iter->devid) { @@ -621,10 +625,23 @@ out_single: return ret; } -const VFIOIOMMUOps vfio_iommufd_ops = { - .dma_map = iommufd_cdev_map, - .dma_unmap = iommufd_cdev_unmap, - .attach_device = iommufd_cdev_attach, - .detach_device = iommufd_cdev_detach, - .pci_hot_reset = iommufd_cdev_pci_hot_reset, +static void vfio_iommu_iommufd_class_init(ObjectClass *klass, void *data) +{ + VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass); + + vioc->dma_map = iommufd_cdev_map; + vioc->dma_unmap = iommufd_cdev_unmap; + vioc->attach_device = iommufd_cdev_attach; + vioc->detach_device = iommufd_cdev_detach; + vioc->pci_hot_reset = iommufd_cdev_pci_hot_reset; }; + +static const TypeInfo types[] = { + { + .name = TYPE_VFIO_IOMMU_IOMMUFD, + .parent = TYPE_VFIO_IOMMU, + .class_init = vfio_iommu_iommufd_class_init, + }, +}; + +DEFINE_TYPES(types) diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index f78a97006c..f3966410c1 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -224,7 +224,6 @@ typedef QLIST_HEAD(VFIOGroupList, VFIOGroup) VFIOGroupList; typedef QLIST_HEAD(VFIODeviceList, 
VFIODevice) VFIODeviceList; extern VFIOGroupList vfio_group_list; extern VFIODeviceList vfio_device_list; -extern const VFIOIOMMUOps vfio_iommufd_ops; extern const MemoryListener vfio_memory_listener; extern int vfio_kvm_device_fd; diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index 1085109d0c..c12ce4dfcb 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -17,7 +17,6 @@ typedef struct VFIODevice VFIODevice; typedef struct VFIOIOMMUClass VFIOIOMMUClass; -#define VFIOIOMMUOps VFIOIOMMUClass /* To remove */ typedef struct { unsigned long *bitmap; @@ -96,6 +95,7 @@ void vfio_container_destroy(VFIOContainerBase *bcontainer); #define TYPE_VFIO_IOMMU "vfio-iommu" #define TYPE_VFIO_IOMMU_LEGACY TYPE_VFIO_IOMMU "-legacy" #define TYPE_VFIO_IOMMU_SPAPR TYPE_VFIO_IOMMU "-spapr" +#define TYPE_VFIO_IOMMU_IOMMUFD TYPE_VFIO_IOMMU "-iommufd" /* * VFIOContainerBase is not an abstract QOM object because it felt -- Gitee From 017272249cc362055dc5b31cdc16b2265df39e5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Tue, 19 Dec 2023 07:58:24 +0100 Subject: [PATCH 662/939] vfio/spapr: Only compile sPAPR IOMMU support when needed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit sPAPR IOMMU support is only needed for pseries machines. Compile out support when CONFIG_PSERIES is not set. This saves ~7K of text. Reviewed-by: Zhenzhong Duan Tested-by: Eric Farman Signed-off-by: Cédric Le Goater --- hw/vfio/meson.build | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build index bd5cc4ca79..bda2688983 100644 --- a/hw/vfio/meson.build +++ b/hw/vfio/meson.build @@ -4,9 +4,9 @@ vfio_ss.add(files( 'common.c', 'container-base.c', 'container.c', - 'spapr.c', 'migration.c', )) +vfio_ss.add(when: 'CONFIG_PSERIES', if_true: files('spapr.c')) vfio_ss.add(when: 'CONFIG_IOMMUFD', if_true: files( 'iommufd.c', )) -- Gitee From feed555b60bc36d3e704431148e302dae48b77a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Tue, 19 Dec 2023 07:58:16 +0100 Subject: [PATCH 663/939] vfio/spapr: Extend VFIOIOMMUOps with a release handler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows to abstract a bit more the sPAPR IOMMU support in the legacy IOMMU backend. 
Reviewed-by: Zhenzhong Duan Tested-by: Eric Farman Signed-off-by: Cédric Le Goater --- hw/vfio/container.c | 8 ++++++-- hw/vfio/spapr.c | 19 +++++++++++++++++++ include/hw/vfio/vfio-container-base.h | 1 + 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index e245d5a082..4c62f088b1 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -764,7 +764,9 @@ listener_release_exit: } if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU || container->iommu_type == VFIO_SPAPR_TCE_IOMMU) { - vfio_spapr_container_deinit(container); + if (bcontainer->ops->release) { + bcontainer->ops->release(bcontainer); + } } enable_discards_exit: @@ -803,7 +805,9 @@ static void vfio_disconnect_container(VFIOGroup *group) } if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU || container->iommu_type == VFIO_SPAPR_TCE_IOMMU) { - vfio_spapr_container_deinit(container); + if (bcontainer->ops->release) { + bcontainer->ops->release(bcontainer); + } } } diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c index 3694dfb874..697f80d11d 100644 --- a/hw/vfio/spapr.c +++ b/hw/vfio/spapr.c @@ -440,6 +440,24 @@ vfio_spapr_container_del_section_window(VFIOContainerBase *bcontainer, } } +static void vfio_spapr_container_release(VFIOContainerBase *bcontainer) +{ + VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); + VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer, + container); + VFIOHostDMAWindow *hostwin, *next; + + if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { + memory_listener_unregister(&scontainer->prereg_listener); + } + QLIST_FOREACH_SAFE(hostwin, &scontainer->hostwin_list, hostwin_next, + next) { + QLIST_REMOVE(hostwin, hostwin_next); + g_free(hostwin); + } +} + static VFIOIOMMUOps vfio_iommu_spapr_ops; static void setup_spapr_ops(VFIOContainerBase *bcontainer) @@ -447,6 +465,7 @@ static void setup_spapr_ops(VFIOContainerBase *bcontainer) vfio_iommu_spapr_ops = *bcontainer->ops; vfio_iommu_spapr_ops.add_window = vfio_spapr_container_add_section_window; vfio_iommu_spapr_ops.del_window = vfio_spapr_container_del_section_window; + vfio_iommu_spapr_ops.release = vfio_spapr_container_release; bcontainer->ops = &vfio_iommu_spapr_ops; } diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index c12ce4dfcb..b2813b0c11 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -135,5 +135,6 @@ struct VFIOIOMMUClass { Error **errp); void (*del_window)(VFIOContainerBase *bcontainer, MemoryRegionSection *section); + void (*release)(VFIOContainerBase *bcontainer); }; #endif /* HW_VFIO_VFIO_CONTAINER_BASE_H */ -- Gitee From 188948043652fbcdd4505fd9672e57bc61647159 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Tue, 19 Dec 2023 07:58:25 +0100 Subject: [PATCH 664/939] vfio/iommufd: Remove CONFIG_IOMMUFD usage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Availability of the IOMMUFD backend can now be fully determined at runtime and the ifdef check was a build time protection (for PPC not supporting it mostly). 
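With the QOM class lookup in place, backend selection in vfio_attach_device() becomes a plain runtime decision; a minimal sketch of the resulting code, matching the hunk below, is:

    const VFIOIOMMUClass *ops =
        VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY));

    if (vbasedev->iommufd) {
        ops = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD));
    }
    assert(ops);
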
Reviewed-by: Zhenzhong Duan Tested-by: Eric Farman Signed-off-by: Cédric Le Goater --- hw/vfio/common.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index a8b7129fa5..b5d02df0c2 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -19,7 +19,6 @@ */ #include "qemu/osdep.h" -#include CONFIG_DEVICES /* CONFIG_IOMMUFD */ #include #ifdef CONFIG_KVM #include @@ -1652,11 +1651,9 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev, const VFIOIOMMUClass *ops = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY)); -#ifdef CONFIG_IOMMUFD if (vbasedev->iommufd) { ops = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD)); } -#endif assert(ops); -- Gitee From 626698a1e9edff6a1032f496858555e1a4614fbe Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 5 Jun 2024 16:30:27 +0800 Subject: [PATCH 665/939] backends: Introduce HostIOMMUDevice abstract MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A HostIOMMUDevice is an abstraction for an assigned device that is protected by a physical IOMMU (aka host IOMMU). The userspace interaction with this physical IOMMU can be done either through the VFIO IOMMU type 1 legacy backend or the new iommufd backend. The assigned device can be a VFIO device or a VDPA device. The HostIOMMUDevice is needed to interact with the host IOMMU that protects the assigned device. It is especially useful when the device is also protected by a virtual IOMMU as this latter use the translation services of the physical IOMMU and is constrained by it. In that context the HostIOMMUDevice can be passed to the virtual IOMMU to collect physical IOMMU capabilities such as the supported address width. In the future, the virtual IOMMU will use the HostIOMMUDevice to program the guest page tables in the first translation stage of the physical IOMMU. Introduce .realize() to initialize HostIOMMUDevice further after instance init. Suggested-by: Cédric Le Goater Signed-off-by: Zhenzhong Duan Reviewed-by: Eric Auger Reviewed-by: Michael S. Tsirkin --- MAINTAINERS | 2 ++ backends/host_iommu_device.c | 33 +++++++++++++++++++ backends/meson.build | 1 + include/sysemu/host_iommu_device.h | 53 ++++++++++++++++++++++++++++++ 4 files changed, 89 insertions(+) create mode 100644 backends/host_iommu_device.c create mode 100644 include/sysemu/host_iommu_device.h diff --git a/MAINTAINERS b/MAINTAINERS index 0ddb20a35f..ada87bfa9e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2174,6 +2174,8 @@ M: Zhenzhong Duan S: Supported F: backends/iommufd.c F: include/sysemu/iommufd.h +F: backends/host_iommu_device.c +F: include/sysemu/host_iommu_device.h F: include/qemu/chardev_open.h F: util/chardev_open.c F: docs/devel/vfio-iommufd.rst diff --git a/backends/host_iommu_device.c b/backends/host_iommu_device.c new file mode 100644 index 0000000000..8f2dda1beb --- /dev/null +++ b/backends/host_iommu_device.c @@ -0,0 +1,33 @@ +/* + * Host IOMMU device abstract + * + * Copyright (C) 2024 Intel Corporation. + * + * Authors: Zhenzhong Duan + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. 
+ */ + +#include "qemu/osdep.h" +#include "sysemu/host_iommu_device.h" + +OBJECT_DEFINE_ABSTRACT_TYPE(HostIOMMUDevice, + host_iommu_device, + HOST_IOMMU_DEVICE, + OBJECT) + +static void host_iommu_device_class_init(ObjectClass *oc, void *data) +{ +} + +static void host_iommu_device_init(Object *obj) +{ +} + +static void host_iommu_device_finalize(Object *obj) +{ + HostIOMMUDevice *hiod = HOST_IOMMU_DEVICE(obj); + + g_free(hiod->name); +} diff --git a/backends/meson.build b/backends/meson.build index 9a5cea480d..68b5e34e04 100644 --- a/backends/meson.build +++ b/backends/meson.build @@ -13,6 +13,7 @@ system_ss.add([files( system_ss.add(when: 'CONFIG_POSIX', if_true: files('rng-random.c')) system_ss.add(when: 'CONFIG_POSIX', if_true: files('hostmem-file.c')) system_ss.add(when: 'CONFIG_LINUX', if_true: files('hostmem-memfd.c')) +system_ss.add(when: 'CONFIG_LINUX', if_true: files('host_iommu_device.c')) if keyutils.found() system_ss.add(keyutils, files('cryptodev-lkcf.c')) endif diff --git a/include/sysemu/host_iommu_device.h b/include/sysemu/host_iommu_device.h new file mode 100644 index 0000000000..db47a16189 --- /dev/null +++ b/include/sysemu/host_iommu_device.h @@ -0,0 +1,53 @@ +/* + * Host IOMMU device abstract declaration + * + * Copyright (C) 2024 Intel Corporation. + * + * Authors: Zhenzhong Duan + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#ifndef HOST_IOMMU_DEVICE_H +#define HOST_IOMMU_DEVICE_H + +#include "qom/object.h" +#include "qapi/error.h" + +#define TYPE_HOST_IOMMU_DEVICE "host-iommu-device" +OBJECT_DECLARE_TYPE(HostIOMMUDevice, HostIOMMUDeviceClass, HOST_IOMMU_DEVICE) + +struct HostIOMMUDevice { + Object parent_obj; + + char *name; +}; + +/** + * struct HostIOMMUDeviceClass - The base class for all host IOMMU devices. + * + * Different types of host devices (e.g., VFIO or VDPA device) or devices + * with different backend (e.g., VFIO legacy container or IOMMUFD backend) + * will have different implementations of the HostIOMMUDeviceClass. + */ +struct HostIOMMUDeviceClass { + ObjectClass parent_class; + + /** + * @realize: initialize host IOMMU device instance further. + * + * Mandatory callback. + * + * @hiod: pointer to a host IOMMU device instance. + * + * @opaque: pointer to agent device of this host IOMMU device, + * e.g., VFIO base device or VDPA device. + * + * @errp: pass an Error out when realize fails. + * + * Returns: true on success, false on failure. + */ + bool (*realize)(HostIOMMUDevice *hiod, void *opaque, Error **errp); +}; +#endif -- Gitee From ca210a4a8fe97dd56baa184671bb48bff9a54ecb Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 5 Jun 2024 16:30:28 +0800 Subject: [PATCH 666/939] backends/host_iommu_device: Introduce HostIOMMUDeviceCaps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit HostIOMMUDeviceCaps's elements map to the host IOMMU's capabilities. Different platform IOMMU can support different elements. Currently only two elements, type and aw_bits, type hints the host platform IOMMU type, i.e., INTEL vtd, ARM smmu, etc; aw_bits hints host IOMMU address width. Introduce .get_cap() handler to check if HOST_IOMMU_DEVICE_CAP_XXX is supported. Suggested-by: Cédric Le Goater Signed-off-by: Zhenzhong Duan Reviewed-by: Eric Auger Reviewed-by: Michael S. 
Tsirkin --- include/sysemu/host_iommu_device.h | 38 ++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/include/sysemu/host_iommu_device.h b/include/sysemu/host_iommu_device.h index db47a16189..a57873958b 100644 --- a/include/sysemu/host_iommu_device.h +++ b/include/sysemu/host_iommu_device.h @@ -15,6 +15,18 @@ #include "qom/object.h" #include "qapi/error.h" +/** + * struct HostIOMMUDeviceCaps - Define host IOMMU device capabilities. + * + * @type: host platform IOMMU type. + * + * @aw_bits: host IOMMU address width. 0xff if no limitation. + */ +typedef struct HostIOMMUDeviceCaps { + uint32_t type; + uint8_t aw_bits; +} HostIOMMUDeviceCaps; + #define TYPE_HOST_IOMMU_DEVICE "host-iommu-device" OBJECT_DECLARE_TYPE(HostIOMMUDevice, HostIOMMUDeviceClass, HOST_IOMMU_DEVICE) @@ -22,6 +34,7 @@ struct HostIOMMUDevice { Object parent_obj; char *name; + HostIOMMUDeviceCaps caps; }; /** @@ -49,5 +62,30 @@ struct HostIOMMUDeviceClass { * Returns: true on success, false on failure. */ bool (*realize)(HostIOMMUDevice *hiod, void *opaque, Error **errp); + /** + * @get_cap: check if a host IOMMU device capability is supported. + * + * Optional callback, if not implemented, hint not supporting query + * of @cap. + * + * @hiod: pointer to a host IOMMU device instance. + * + * @cap: capability to check. + * + * @errp: pass an Error out when fails to query capability. + * + * Returns: <0 on failure, 0 if a @cap is unsupported, or else + * 1 or some positive value for some special @cap, + * i.e., HOST_IOMMU_DEVICE_CAP_AW_BITS. + */ + int (*get_cap)(HostIOMMUDevice *hiod, int cap, Error **errp); }; + +/* + * Host IOMMU device capability list. + */ +#define HOST_IOMMU_DEVICE_CAP_IOMMU_TYPE 0 +#define HOST_IOMMU_DEVICE_CAP_AW_BITS 1 + +#define HOST_IOMMU_DEVICE_CAP_AW_BITS_MAX 64 #endif -- Gitee From c253a07d9fe1598c4dbbb1cefee457806c417885 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 5 Jun 2024 16:30:29 +0800 Subject: [PATCH 667/939] vfio/container: Introduce TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO represents a host IOMMU device under VFIO legacy container backend. It will have its own realize implementation. Suggested-by: Eric Auger Suggested-by: Cédric Le Goater Signed-off-by: Zhenzhong Duan Reviewed-by: Eric Auger Reviewed-by: Michael S. 
Tsirkin --- hw/vfio/container.c | 6 +++++- include/hw/vfio/vfio-common.h | 3 +++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 4c62f088b1..dcf49af2d0 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -1255,7 +1255,11 @@ static const TypeInfo types[] = { .name = TYPE_VFIO_IOMMU_LEGACY, .parent = TYPE_VFIO_IOMMU, .class_init = vfio_iommu_legacy_class_init, - }, + }, { + .name = TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO, + .parent = TYPE_HOST_IOMMU_DEVICE, + } + }; DEFINE_TYPES(types) diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index f3966410c1..0c807c2806 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -31,6 +31,7 @@ #endif #include "sysemu/sysemu.h" #include "hw/vfio/vfio-container-base.h" +#include "sysemu/host_iommu_device.h" #define VFIO_MSG_PREFIX "vfio %s: " @@ -75,6 +76,8 @@ typedef struct VFIOMigration { struct VFIOGroup; +#define TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO TYPE_HOST_IOMMU_DEVICE "-legacy-vfio" + typedef struct VFIODMARange { QLIST_ENTRY(VFIODMARange) next; hwaddr iova; -- Gitee From 50142057ec070a70f3f38ec272ec61cc3ae6e071 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 5 Jun 2024 16:30:30 +0800 Subject: [PATCH 668/939] backends/iommufd: Introduce TYPE_HOST_IOMMU_DEVICE_IOMMUFD[_VFIO] devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TYPE_HOST_IOMMU_DEVICE_IOMMUFD represents a host IOMMU device under iommufd backend. It is abstract, because it is going to be derived into VFIO or VDPA type'd device. It will have its own .get_cap() implementation. TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO is a sub-class of TYPE_HOST_IOMMU_DEVICE_IOMMUFD, represents a VFIO type'd host IOMMU device under iommufd backend. It will be created during VFIO device attaching and passed to vIOMMU. It will have its own .realize() implementation. Opportunistically, add missed header to include/sysemu/iommufd.h. Suggested-by: Cédric Le Goater Signed-off-by: Yi Liu Signed-off-by: Zhenzhong Duan Reviewed-by: Eric Auger Reviewed-by: Michael S. 
Tsirkin --- backends/iommufd.c | 36 +++++++++++++++++------------------ hw/vfio/iommufd.c | 5 ++++- include/hw/vfio/vfio-common.h | 3 +++ include/sysemu/iommufd.h | 16 ++++++++++++++++ 4 files changed, 41 insertions(+), 19 deletions(-) diff --git a/backends/iommufd.c b/backends/iommufd.c index ba58a0eb0d..a2b7f5c3c4 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -223,23 +223,23 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, return ret; } -static const TypeInfo iommufd_backend_info = { - .name = TYPE_IOMMUFD_BACKEND, - .parent = TYPE_OBJECT, - .instance_size = sizeof(IOMMUFDBackend), - .instance_init = iommufd_backend_init, - .instance_finalize = iommufd_backend_finalize, - .class_size = sizeof(IOMMUFDBackendClass), - .class_init = iommufd_backend_class_init, - .interfaces = (InterfaceInfo[]) { - { TYPE_USER_CREATABLE }, - { } +static const TypeInfo types[] = { + { + .name = TYPE_IOMMUFD_BACKEND, + .parent = TYPE_OBJECT, + .instance_size = sizeof(IOMMUFDBackend), + .instance_init = iommufd_backend_init, + .instance_finalize = iommufd_backend_finalize, + .class_size = sizeof(IOMMUFDBackendClass), + .class_init = iommufd_backend_class_init, + .interfaces = (InterfaceInfo[]) { + { TYPE_USER_CREATABLE }, + { } + } + }, { + .name = TYPE_HOST_IOMMU_DEVICE_IOMMUFD, + .parent = TYPE_HOST_IOMMU_DEVICE, + .abstract = true, } }; - -static void register_types(void) -{ - type_register_static(&iommufd_backend_info); -} - -type_init(register_types); +DEFINE_TYPES(types) diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index d4c586e842..7a4b818830 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -641,7 +641,10 @@ static const TypeInfo types[] = { .name = TYPE_VFIO_IOMMU_IOMMUFD, .parent = TYPE_VFIO_IOMMU, .class_init = vfio_iommu_iommufd_class_init, - }, + }, { + .name = TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO, + .parent = TYPE_HOST_IOMMU_DEVICE_IOMMUFD, + } }; DEFINE_TYPES(types) diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 0c807c2806..2cfc8521cd 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -32,6 +32,7 @@ #include "sysemu/sysemu.h" #include "hw/vfio/vfio-container-base.h" #include "sysemu/host_iommu_device.h" +#include "sysemu/iommufd.h" #define VFIO_MSG_PREFIX "vfio %s: " @@ -77,6 +78,8 @@ typedef struct VFIOMigration { struct VFIOGroup; #define TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO TYPE_HOST_IOMMU_DEVICE "-legacy-vfio" +#define TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO \ + TYPE_HOST_IOMMU_DEVICE_IOMMUFD "-vfio" typedef struct VFIODMARange { QLIST_ENTRY(VFIODMARange) next; diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h index 9c5524b0ed..1a75e82f42 100644 --- a/include/sysemu/iommufd.h +++ b/include/sysemu/iommufd.h @@ -1,3 +1,16 @@ +/* + * iommufd container backend declaration + * + * Copyright (C) 2024 Intel Corporation. + * Copyright Red Hat, Inc. 
2024 + * + * Authors: Yi Liu + * Eric Auger + * Zhenzhong Duan + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + #ifndef SYSEMU_IOMMUFD_H #define SYSEMU_IOMMUFD_H @@ -5,6 +18,7 @@ #include "qemu/thread.h" #include "exec/hwaddr.h" #include "exec/cpu-common.h" +#include "sysemu/host_iommu_device.h" #define TYPE_IOMMUFD_BACKEND "iommufd" OBJECT_DECLARE_TYPE(IOMMUFDBackend, IOMMUFDBackendClass, IOMMUFD_BACKEND) @@ -35,4 +49,6 @@ int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova, ram_addr_t size, void *vaddr, bool readonly); int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova, ram_addr_t size); + +#define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd" #endif -- Gitee From 30150b8727e9ec41f83c4dfcd93f04b766357469 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 5 Jun 2024 16:30:31 +0800 Subject: [PATCH 669/939] range: Introduce range_get_last_bit() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This helper get the highest 1 bit position of the upper bound. If the range is empty or upper bound is zero, -1 is returned. Suggested-by: Cédric Le Goater Signed-off-by: Zhenzhong Duan Reviewed-by: Eric Auger Reviewed-by: Michael S. Tsirkin --- include/qemu/range.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/include/qemu/range.h b/include/qemu/range.h index 205e1da76d..4ce694a398 100644 --- a/include/qemu/range.h +++ b/include/qemu/range.h @@ -20,6 +20,8 @@ #ifndef QEMU_RANGE_H #define QEMU_RANGE_H +#include "qemu/bitops.h" + /* * Operations on 64 bit address ranges. * Notes: @@ -217,6 +219,15 @@ static inline int ranges_overlap(uint64_t first1, uint64_t len1, return !(last2 < first1 || last1 < first2); } +/* Get highest non-zero bit position of a range */ +static inline int range_get_last_bit(Range *range) +{ + if (range_is_empty(range)) { + return -1; + } + return 63 - clz64(range->upb); +} + /* * Return -1 if @a < @b, 1 @a > @b, and 0 if they touch or overlap. * Both @a and @b must not be empty. -- Gitee From c66d22fa4ee9f6f38193256d7ce1494c32e10581 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 5 Jun 2024 16:30:32 +0800 Subject: [PATCH 670/939] vfio/container: Implement HostIOMMUDeviceClass::realize() handler The realize function populates the capabilities. For now only the aw_bits caps is computed for legacy backend. Introduce a helper function vfio_device_get_aw_bits() which calls range_get_last_bit() to get host aw_bits and package it in HostIOMMUDeviceCaps for query with .get_cap(). This helper will also be used by iommufd backend. Signed-off-by: Zhenzhong Duan Reviewed-by: Eric Auger Reviewed-by: Michael S. 
Tsirkin --- hw/vfio/container.c | 20 +++++++++++++++++++- hw/vfio/helpers.c | 17 +++++++++++++++++ include/hw/vfio/vfio-common.h | 1 + 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index dcf49af2d0..fbe2bc50d4 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -1250,6 +1250,24 @@ static void vfio_iommu_legacy_class_init(ObjectClass *klass, void *data) vioc->pci_hot_reset = vfio_legacy_pci_hot_reset; }; +static bool hiod_legacy_vfio_realize(HostIOMMUDevice *hiod, void *opaque, + Error **errp) +{ + VFIODevice *vdev = opaque; + + hiod->name = g_strdup(vdev->name); + hiod->caps.aw_bits = vfio_device_get_aw_bits(vdev); + + return true; +} + +static void hiod_legacy_vfio_class_init(ObjectClass *oc, void *data) +{ + HostIOMMUDeviceClass *hioc = HOST_IOMMU_DEVICE_CLASS(oc); + + hioc->realize = hiod_legacy_vfio_realize; +}; + static const TypeInfo types[] = { { .name = TYPE_VFIO_IOMMU_LEGACY, @@ -1258,8 +1276,8 @@ static const TypeInfo types[] = { }, { .name = TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO, .parent = TYPE_HOST_IOMMU_DEVICE, + .class_init = hiod_legacy_vfio_class_init, } - }; DEFINE_TYPES(types) diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c index 6789870802..35b8e42304 100644 --- a/hw/vfio/helpers.c +++ b/hw/vfio/helpers.c @@ -663,3 +663,20 @@ void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops, vbasedev->ram_block_discard_allowed = ram_discard; } + +int vfio_device_get_aw_bits(VFIODevice *vdev) +{ + /* + * iova_ranges is a sorted list. For old kernels that support + * VFIO but not support query of iova ranges, iova_ranges is NULL, + * in this case HOST_IOMMU_DEVICE_CAP_AW_BITS_MAX(64) is returned. + */ + GList *l = g_list_last(vdev->bcontainer->iova_ranges); + + if (l) { + Range *range = l->data; + return range_get_last_bit(range) + 1; + } + + return HOST_IOMMU_DEVICE_CAP_AW_BITS_MAX; +} diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 2cfc8521cd..376b8350b9 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -277,4 +277,5 @@ int vfio_device_get_name(VFIODevice *vbasedev, Error **errp); void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp); void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops, DeviceState *dev, bool ram_discard); +int vfio_device_get_aw_bits(VFIODevice *vdev); #endif /* HW_VFIO_VFIO_COMMON_H */ -- Gitee From ccd8baf4648e6fd6b69e65ee249609904edc92e1 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 5 Jun 2024 16:30:33 +0800 Subject: [PATCH 671/939] backends/iommufd: Introduce helper function iommufd_backend_get_device_info() Introduce a helper function iommufd_backend_get_device_info() to get host IOMMU related information through iommufd uAPI. Signed-off-by: Yi Liu Signed-off-by: Yi Sun Signed-off-by: Zhenzhong Duan Reviewed-by: Eric Auger Reviewed-by: Michael S. 
Tsirkin --- backends/iommufd.c | 22 ++++++++++++++++++++++ include/sysemu/iommufd.h | 3 +++ 2 files changed, 25 insertions(+) diff --git a/backends/iommufd.c b/backends/iommufd.c index a2b7f5c3c4..604a8f4e7d 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -223,6 +223,28 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, return ret; } +bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, + uint32_t *type, void *data, uint32_t len, + Error **errp) +{ + struct iommu_hw_info info = { + .size = sizeof(info), + .dev_id = devid, + .data_len = len, + .data_uptr = (uintptr_t)data, + }; + + if (ioctl(be->fd, IOMMU_GET_HW_INFO, &info)) { + error_setg_errno(errp, errno, "Failed to get hardware info"); + return false; + } + + g_assert(type); + *type = info.out_data_type; + + return true; +} + static const TypeInfo types[] = { { .name = TYPE_IOMMUFD_BACKEND, diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h index 1a75e82f42..dfade18e6d 100644 --- a/include/sysemu/iommufd.h +++ b/include/sysemu/iommufd.h @@ -49,6 +49,9 @@ int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova, ram_addr_t size, void *vaddr, bool readonly); int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova, ram_addr_t size); +bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, + uint32_t *type, void *data, uint32_t len, + Error **errp); #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd" #endif -- Gitee From c9f1b73eb36a84347c3720ce2a93f72ea47f5daa Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 5 Jun 2024 16:30:34 +0800 Subject: [PATCH 672/939] vfio/iommufd: Implement HostIOMMUDeviceClass::realize() handler It calls iommufd_backend_get_device_info() to get host IOMMU related information and translate it into HostIOMMUDeviceCaps for query with .get_cap(). For aw_bits, use the same way as legacy backend by calling vfio_device_get_aw_bits() which is common for different vendor IOMMU. Signed-off-by: Zhenzhong Duan Reviewed-by: Eric Auger Reviewed-by: Michael S. 
Tsirkin --- hw/vfio/iommufd.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 7a4b818830..2efdba5565 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -636,6 +636,35 @@ static void vfio_iommu_iommufd_class_init(ObjectClass *klass, void *data) vioc->pci_hot_reset = iommufd_cdev_pci_hot_reset; }; +static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, + Error **errp) +{ + VFIODevice *vdev = opaque; + HostIOMMUDeviceCaps *caps = &hiod->caps; + enum iommu_hw_info_type type; + union { + struct iommu_hw_info_vtd vtd; + } data; + + if (!iommufd_backend_get_device_info(vdev->iommufd, vdev->devid, + &type, &data, sizeof(data), errp)) { + return false; + } + + hiod->name = g_strdup(vdev->name); + caps->type = type; + caps->aw_bits = vfio_device_get_aw_bits(vdev); + + return true; +} + +static void hiod_iommufd_vfio_class_init(ObjectClass *oc, void *data) +{ + HostIOMMUDeviceClass *hiodc = HOST_IOMMU_DEVICE_CLASS(oc); + + hiodc->realize = hiod_iommufd_vfio_realize; +}; + static const TypeInfo types[] = { { .name = TYPE_VFIO_IOMMU_IOMMUFD, @@ -644,6 +673,7 @@ static const TypeInfo types[] = { }, { .name = TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO, .parent = TYPE_HOST_IOMMU_DEVICE_IOMMUFD, + .class_init = hiod_iommufd_vfio_class_init, } }; -- Gitee From b6830d3caff821b2472e369042c169935c906ef2 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 5 Jun 2024 16:30:35 +0800 Subject: [PATCH 673/939] vfio/container: Implement HostIOMMUDeviceClass::get_cap() handler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Suggested-by: Cédric Le Goater Signed-off-by: Zhenzhong Duan Reviewed-by: Eric Auger Reviewed-by: Michael S. Tsirkin --- hw/vfio/container.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index fbe2bc50d4..ed54ce6d0c 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -1261,11 +1261,26 @@ static bool hiod_legacy_vfio_realize(HostIOMMUDevice *hiod, void *opaque, return true; } +static int hiod_legacy_vfio_get_cap(HostIOMMUDevice *hiod, int cap, + Error **errp) +{ + HostIOMMUDeviceCaps *caps = &hiod->caps; + + switch (cap) { + case HOST_IOMMU_DEVICE_CAP_AW_BITS: + return caps->aw_bits; + default: + error_setg(errp, "%s: unsupported capability %x", hiod->name, cap); + return -EINVAL; + } +} + static void hiod_legacy_vfio_class_init(ObjectClass *oc, void *data) { HostIOMMUDeviceClass *hioc = HOST_IOMMU_DEVICE_CLASS(oc); hioc->realize = hiod_legacy_vfio_realize; + hioc->get_cap = hiod_legacy_vfio_get_cap; }; static const TypeInfo types[] = { -- Gitee From 2f1a2f4b320e70a85cef8392cd5f4b1e54afb9c9 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 5 Jun 2024 16:30:36 +0800 Subject: [PATCH 674/939] backends/iommufd: Implement HostIOMMUDeviceClass::get_cap() handler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Suggested-by: Cédric Le Goater Signed-off-by: Zhenzhong Duan Reviewed-by: Eric Auger Reviewed-by: Michael S. 
Tsirkin --- backends/iommufd.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/backends/iommufd.c b/backends/iommufd.c index 604a8f4e7d..7e805bd664 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -245,6 +245,28 @@ bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, return true; } +static int hiod_iommufd_get_cap(HostIOMMUDevice *hiod, int cap, Error **errp) +{ + HostIOMMUDeviceCaps *caps = &hiod->caps; + + switch (cap) { + case HOST_IOMMU_DEVICE_CAP_IOMMU_TYPE: + return caps->type; + case HOST_IOMMU_DEVICE_CAP_AW_BITS: + return caps->aw_bits; + default: + error_setg(errp, "%s: unsupported capability %x", hiod->name, cap); + return -EINVAL; + } +} + +static void hiod_iommufd_class_init(ObjectClass *oc, void *data) +{ + HostIOMMUDeviceClass *hioc = HOST_IOMMU_DEVICE_CLASS(oc); + + hioc->get_cap = hiod_iommufd_get_cap; +}; + static const TypeInfo types[] = { { .name = TYPE_IOMMUFD_BACKEND, @@ -261,6 +283,7 @@ static const TypeInfo types[] = { }, { .name = TYPE_HOST_IOMMU_DEVICE_IOMMUFD, .parent = TYPE_HOST_IOMMU_DEVICE, + .class_init = hiod_iommufd_class_init, .abstract = true, } }; -- Gitee From a152921f6d534f2a515b4e88304ad115fae8fa8f Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 5 Jun 2024 16:30:37 +0800 Subject: [PATCH 675/939] vfio: Create host IOMMU device instance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Create host IOMMU device instance in vfio_attach_device() and call .realize() to initialize it further. Introuduce attribute VFIOIOMMUClass::hiod_typename and initialize it based on VFIO backend type. It will facilitate HostIOMMUDevice creation in vfio_attach_device(). Suggested-by: Cédric Le Goater Signed-off-by: Zhenzhong Duan Reviewed-by: Eric Auger Reviewed-by: Michael S. 
Tsirkin --- hw/vfio/common.c | 18 +++++++++++++++++- hw/vfio/container.c | 2 ++ hw/vfio/iommufd.c | 2 ++ include/hw/vfio/vfio-common.h | 1 + include/hw/vfio/vfio-container-base.h | 3 +++ 5 files changed, 25 insertions(+), 1 deletion(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index b5d02df0c2..d5ff65f90a 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -1650,6 +1650,8 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev, { const VFIOIOMMUClass *ops = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY)); + HostIOMMUDevice *hiod = NULL; + int ret; if (vbasedev->iommufd) { ops = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD)); @@ -1657,7 +1659,20 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev, assert(ops); - return ops->attach_device(name, vbasedev, as, errp); + ret = ops->attach_device(name, vbasedev, as, errp); + if (ret) { + return ret; + } + + hiod = HOST_IOMMU_DEVICE(object_new(ops->hiod_typename)); + if (!HOST_IOMMU_DEVICE_GET_CLASS(hiod)->realize(hiod, vbasedev, errp)) { + object_unref(hiod); + ops->detach_device(vbasedev); + return -1; + } + vbasedev->hiod = hiod; + + return 0; } void vfio_detach_device(VFIODevice *vbasedev) @@ -1665,5 +1680,6 @@ void vfio_detach_device(VFIODevice *vbasedev) if (!vbasedev->bcontainer) { return; } + object_unref(vbasedev->hiod); vbasedev->bcontainer->ops->detach_device(vbasedev); } diff --git a/hw/vfio/container.c b/hw/vfio/container.c index ed54ce6d0c..10f7635425 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -1240,6 +1240,8 @@ static void vfio_iommu_legacy_class_init(ObjectClass *klass, void *data) { VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass); + vioc->hiod_typename = TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO; + vioc->setup = vfio_legacy_setup; vioc->dma_map = vfio_legacy_dma_map; vioc->dma_unmap = vfio_legacy_dma_unmap; diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 2efdba5565..7cbf0e44f1 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -629,6 +629,8 @@ static void vfio_iommu_iommufd_class_init(ObjectClass *klass, void *data) { VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass); + vioc->hiod_typename = TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO; + vioc->dma_map = iommufd_cdev_map; vioc->dma_unmap = iommufd_cdev_unmap; vioc->attach_device = iommufd_cdev_attach; diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 376b8350b9..d45d40c329 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -140,6 +140,7 @@ typedef struct VFIODevice { OnOffAuto pre_copy_dirty_page_tracking; bool dirty_pages_supported; bool dirty_tracking; + HostIOMMUDevice *hiod; int devid; IOMMUFDBackend *iommufd; } VFIODevice; diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index b2813b0c11..7a4c575115 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -109,6 +109,9 @@ DECLARE_CLASS_CHECKERS(VFIOIOMMUClass, VFIO_IOMMU, TYPE_VFIO_IOMMU) struct VFIOIOMMUClass { InterfaceClass parent_class; + /* Properties */ + const char *hiod_typename; + /* basic feature */ int (*setup)(VFIOContainerBase *bcontainer, Error **errp); int (*dma_map)(const VFIOContainerBase *bcontainer, -- Gitee From 03f9b12e33238587da36be24523911fd1b003324 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 5 Jun 2024 16:30:38 +0800 Subject: [PATCH 676/939] hw/pci: Introduce helper function pci_device_get_iommu_bus_devfn() Extract out pci_device_get_iommu_bus_devfn() from 
pci_device_iommu_address_space() to facilitate implementation of pci_device_[set|unset]_iommu_device() in following patch. No functional change intended. Signed-off-by: Yi Liu Signed-off-by: Yi Sun Signed-off-by: Nicolin Chen Signed-off-by: Zhenzhong Duan Reviewed-by: Eric Auger Reviewed-by: Michael S. Tsirkin --- hw/pci/pci.c | 48 +++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/hw/pci/pci.c b/hw/pci/pci.c index 7467a2a9de..0884fbb760 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -2681,11 +2681,27 @@ static void pci_device_class_base_init(ObjectClass *klass, void *data) } } -AddressSpace *pci_device_iommu_address_space(PCIDevice *dev) +/* + * Get IOMMU root bus, aliased bus and devfn of a PCI device + * + * IOMMU root bus is needed by all call sites to call into iommu_ops. + * For call sites which don't need aliased BDF, passing NULL to + * aliased_[bus|devfn] is allowed. + * + * @piommu_bus: return root #PCIBus backed by an IOMMU for the PCI device. + * + * @aliased_bus: return aliased #PCIBus of the PCI device, optional. + * + * @aliased_devfn: return aliased devfn of the PCI device, optional. + */ +static void pci_device_get_iommu_bus_devfn(PCIDevice *dev, + PCIBus **piommu_bus, + PCIBus **aliased_bus, + int *aliased_devfn) { PCIBus *bus = pci_get_bus(dev); PCIBus *iommu_bus = bus; - uint8_t devfn = dev->devfn; + int devfn = dev->devfn; while (iommu_bus && !iommu_bus->iommu_ops && iommu_bus->parent_dev) { PCIBus *parent_bus = pci_get_bus(iommu_bus->parent_dev); @@ -2726,7 +2742,33 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice *dev) iommu_bus = parent_bus; } - if (!pci_bus_bypass_iommu(bus) && iommu_bus->iommu_ops) { + + assert(0 <= devfn && devfn < PCI_DEVFN_MAX); + assert(iommu_bus); + + if (pci_bus_bypass_iommu(bus) || !iommu_bus->iommu_ops) { + iommu_bus = NULL; + } + + *piommu_bus = iommu_bus; + + if (aliased_bus) { + *aliased_bus = bus; + } + + if (aliased_devfn) { + *aliased_devfn = devfn; + } +} + +AddressSpace *pci_device_iommu_address_space(PCIDevice *dev) +{ + PCIBus *bus; + PCIBus *iommu_bus; + int devfn; + + pci_device_get_iommu_bus_devfn(dev, &iommu_bus, &bus, &devfn); + if (iommu_bus) { return iommu_bus->iommu_ops->get_address_space(bus, iommu_bus->iommu_opaque, devfn); } -- Gitee From 7bc73d38984460315df315d007789f87f4d11994 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 5 Jun 2024 16:30:39 +0800 Subject: [PATCH 677/939] hw/pci: Introduce pci_device_[set|unset]_iommu_device() pci_device_[set|unset]_iommu_device() call pci_device_get_iommu_bus_devfn() to get iommu_bus->iommu_ops and call [set|unset]_iommu_device callback to set/unset HostIOMMUDevice for a given PCI device. Signed-off-by: Yi Liu Signed-off-by: Yi Sun Signed-off-by: Nicolin Chen Signed-off-by: Zhenzhong Duan Reviewed-by: Eric Auger Reviewed-by: Michael S. 
Tsirkin --- hw/pci/pci.c | 27 +++++++++++++++++++++++++++ include/hw/pci/pci.h | 38 +++++++++++++++++++++++++++++++++++++- 2 files changed, 64 insertions(+), 1 deletion(-) diff --git a/hw/pci/pci.c b/hw/pci/pci.c index 0884fbb760..d6f627aa51 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -2775,6 +2775,33 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice *dev) return &address_space_memory; } +bool pci_device_set_iommu_device(PCIDevice *dev, HostIOMMUDevice *hiod, + Error **errp) +{ + PCIBus *iommu_bus; + + /* set_iommu_device requires device's direct BDF instead of aliased BDF */ + pci_device_get_iommu_bus_devfn(dev, &iommu_bus, NULL, NULL); + if (iommu_bus && iommu_bus->iommu_ops->set_iommu_device) { + return iommu_bus->iommu_ops->set_iommu_device(pci_get_bus(dev), + iommu_bus->iommu_opaque, + dev->devfn, hiod, errp); + } + return true; +} + +void pci_device_unset_iommu_device(PCIDevice *dev) +{ + PCIBus *iommu_bus; + + pci_device_get_iommu_bus_devfn(dev, &iommu_bus, NULL, NULL); + if (iommu_bus && iommu_bus->iommu_ops->unset_iommu_device) { + return iommu_bus->iommu_ops->unset_iommu_device(pci_get_bus(dev), + iommu_bus->iommu_opaque, + dev->devfn); + } +} + void pci_setup_iommu(PCIBus *bus, const PCIIOMMUOps *ops, void *opaque) { /* diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h index cee0cf7460..8d1af44249 100644 --- a/include/hw/pci/pci.h +++ b/include/hw/pci/pci.h @@ -3,6 +3,7 @@ #include "exec/memory.h" #include "sysemu/dma.h" +#include "sysemu/host_iommu_device.h" /* PCI includes legacy ISA access. */ #include "hw/isa/isa.h" @@ -384,10 +385,45 @@ typedef struct PCIIOMMUOps { * * @devfn: device and function number */ - AddressSpace * (*get_address_space)(PCIBus *bus, void *opaque, int devfn); + AddressSpace * (*get_address_space)(PCIBus *bus, void *opaque, int devfn); + /** + * @set_iommu_device: attach a HostIOMMUDevice to a vIOMMU + * + * Optional callback, if not implemented in vIOMMU, then vIOMMU can't + * retrieve host information from the associated HostIOMMUDevice. + * + * @bus: the #PCIBus of the PCI device. + * + * @opaque: the data passed to pci_setup_iommu(). + * + * @devfn: device and function number of the PCI device. + * + * @dev: the #HostIOMMUDevice to attach. + * + * @errp: pass an Error out only when return false + * + * Returns: true if HostIOMMUDevice is attached or else false with errp set. + */ + bool (*set_iommu_device)(PCIBus *bus, void *opaque, int devfn, + HostIOMMUDevice *dev, Error **errp); + /** + * @unset_iommu_device: detach a HostIOMMUDevice from a vIOMMU + * + * Optional callback. + * + * @bus: the #PCIBus of the PCI device. + * + * @opaque: the data passed to pci_setup_iommu(). + * + * @devfn: device and function number of the PCI device. + */ + void (*unset_iommu_device)(PCIBus *bus, void *opaque, int devfn); } PCIIOMMUOps; AddressSpace *pci_device_iommu_address_space(PCIDevice *dev); +bool pci_device_set_iommu_device(PCIDevice *dev, HostIOMMUDevice *hiod, + Error **errp); +void pci_device_unset_iommu_device(PCIDevice *dev); /** * pci_setup_iommu: Initialize specific IOMMU handlers for a PCIBus -- Gitee From dbbf6b33d9ce5f2785972f81919be143e81f866b Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 5 Jun 2024 16:30:40 +0800 Subject: [PATCH 678/939] vfio/pci: Pass HostIOMMUDevice to vIOMMU With HostIOMMUDevice passed, vIOMMU can check compatibility with host IOMMU, call into IOMMUFD specific methods, etc. 
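The wiring on the VFIO side, condensed from the vfio_realize() hunk below, is essentially:

    /* after BARs are registered, hand the HostIOMMUDevice to the vIOMMU */
    if (!pci_device_set_iommu_device(pdev, vbasedev->hiod, errp)) {
        error_prepend(errp, "Failed to set iommu_device: ");
        goto out_teardown;
    }

with pci_device_unset_iommu_device(pdev) called on the error and exit paths to drop the association again.
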
Originally-by: Yi Liu Signed-off-by: Nicolin Chen Signed-off-by: Yi Sun Signed-off-by: Zhenzhong Duan Reviewed-by: Eric Auger Reviewed-by: Michael S. Tsirkin --- hw/vfio/pci.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index d84a9e73a6..675a608b9c 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -3107,6 +3107,11 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) vfio_bars_register(vdev); + if (!pci_device_set_iommu_device(pdev, vbasedev->hiod, errp)) { + error_prepend(errp, "Failed to set iommu_device: "); + goto out_teardown; + } + ret = vfio_add_capabilities(vdev, errp); if (ret) { goto out_teardown; @@ -3128,7 +3133,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) error_setg(errp, "cannot support IGD OpRegion feature on hotplugged " "device"); - goto out_teardown; + goto out_unset_idev; } ret = vfio_get_dev_region_info(vbasedev, @@ -3137,13 +3142,13 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) if (ret) { error_setg_errno(errp, -ret, "does not support requested IGD OpRegion feature"); - goto out_teardown; + goto out_unset_idev; } ret = vfio_pci_igd_opregion_init(vdev, opregion, errp); g_free(opregion); if (ret) { - goto out_teardown; + goto out_unset_idev; } } @@ -3229,6 +3234,8 @@ out_deregister: if (vdev->intx.mmap_timer) { timer_free(vdev->intx.mmap_timer); } +out_unset_idev: + pci_device_unset_iommu_device(pdev); out_teardown: vfio_teardown_msi(vdev); vfio_bars_exit(vdev); @@ -3257,6 +3264,7 @@ static void vfio_instance_finalize(Object *obj) static void vfio_exitfn(PCIDevice *pdev) { VFIOPCIDevice *vdev = VFIO_PCI(pdev); + VFIODevice *vbasedev = &vdev->vbasedev; vfio_unregister_req_notifier(vdev); vfio_unregister_err_notifier(vdev); @@ -3271,7 +3279,8 @@ static void vfio_exitfn(PCIDevice *pdev) vfio_teardown_msi(vdev); vfio_pci_disable_rp_atomics(vdev); vfio_bars_exit(vdev); - vfio_migration_exit(&vdev->vbasedev); + vfio_migration_exit(vbasedev); + pci_device_unset_iommu_device(pdev); } static void vfio_pci_reset(DeviceState *dev) -- Gitee From a051e4349316d7065c9418de691787edae8e7f4e Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 5 Jun 2024 16:30:41 +0800 Subject: [PATCH 679/939] intel_iommu: Extract out vtd_cap_init() to initialize cap/ecap Extract cap/ecap initialization in vtd_cap_init() to make code cleaner. No functional change intended. Reviewed-by: Eric Auger Signed-off-by: Zhenzhong Duan Reviewed-by: Michael S. Tsirkin --- hw/i386/intel_iommu.c | 93 ++++++++++++++++++++++++------------------- 1 file changed, 51 insertions(+), 42 deletions(-) diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 3da56e439e..6716407b7a 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -3935,30 +3935,10 @@ static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n) return; } -/* Do the initialization. It will also be called when reset, so pay - * attention when adding new initialization stuff. 
- */ -static void vtd_init(IntelIOMMUState *s) +static void vtd_cap_init(IntelIOMMUState *s) { X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); - memset(s->csr, 0, DMAR_REG_SIZE); - memset(s->wmask, 0, DMAR_REG_SIZE); - memset(s->w1cmask, 0, DMAR_REG_SIZE); - memset(s->womask, 0, DMAR_REG_SIZE); - - s->root = 0; - s->root_scalable = false; - s->dmar_enabled = false; - s->intr_enabled = false; - s->iq_head = 0; - s->iq_tail = 0; - s->iq = 0; - s->iq_size = 0; - s->qi_enabled = false; - s->iq_last_desc_type = VTD_INV_DESC_NONE; - s->iq_dw = false; - s->next_frcd_reg = 0; s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS | VTD_CAP_MGAW(s->aw_bits); @@ -3975,27 +3955,6 @@ static void vtd_init(IntelIOMMUState *s) } s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO; - /* - * Rsvd field masks for spte - */ - vtd_spte_rsvd[0] = ~0ULL; - vtd_spte_rsvd[1] = VTD_SPTE_PAGE_L1_RSVD_MASK(s->aw_bits, - x86_iommu->dt_supported); - vtd_spte_rsvd[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(s->aw_bits); - vtd_spte_rsvd[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(s->aw_bits); - vtd_spte_rsvd[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(s->aw_bits); - - vtd_spte_rsvd_large[2] = VTD_SPTE_LPAGE_L2_RSVD_MASK(s->aw_bits, - x86_iommu->dt_supported); - vtd_spte_rsvd_large[3] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits, - x86_iommu->dt_supported); - - if (s->scalable_mode || s->snoop_control) { - vtd_spte_rsvd[1] &= ~VTD_SPTE_SNP; - vtd_spte_rsvd_large[2] &= ~VTD_SPTE_SNP; - vtd_spte_rsvd_large[3] &= ~VTD_SPTE_SNP; - } - if (x86_iommu_ir_supported(x86_iommu)) { s->ecap |= VTD_ECAP_IR | VTD_ECAP_MHMV; if (s->intr_eim == ON_OFF_AUTO_ON) { @@ -4028,6 +3987,56 @@ static void vtd_init(IntelIOMMUState *s) if (s->pasid) { s->ecap |= VTD_ECAP_PASID; } +} + +/* + * Do the initialization. It will also be called when reset, so pay + * attention when adding new initialization stuff. + */ +static void vtd_init(IntelIOMMUState *s) +{ + X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); + + memset(s->csr, 0, DMAR_REG_SIZE); + memset(s->wmask, 0, DMAR_REG_SIZE); + memset(s->w1cmask, 0, DMAR_REG_SIZE); + memset(s->womask, 0, DMAR_REG_SIZE); + + s->root = 0; + s->root_scalable = false; + s->dmar_enabled = false; + s->intr_enabled = false; + s->iq_head = 0; + s->iq_tail = 0; + s->iq = 0; + s->iq_size = 0; + s->qi_enabled = false; + s->iq_last_desc_type = VTD_INV_DESC_NONE; + s->iq_dw = false; + s->next_frcd_reg = 0; + + vtd_cap_init(s); + + /* + * Rsvd field masks for spte + */ + vtd_spte_rsvd[0] = ~0ULL; + vtd_spte_rsvd[1] = VTD_SPTE_PAGE_L1_RSVD_MASK(s->aw_bits, + x86_iommu->dt_supported); + vtd_spte_rsvd[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(s->aw_bits); + vtd_spte_rsvd[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(s->aw_bits); + vtd_spte_rsvd[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(s->aw_bits); + + vtd_spte_rsvd_large[2] = VTD_SPTE_LPAGE_L2_RSVD_MASK(s->aw_bits, + x86_iommu->dt_supported); + vtd_spte_rsvd_large[3] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits, + x86_iommu->dt_supported); + + if (s->scalable_mode || s->snoop_control) { + vtd_spte_rsvd[1] &= ~VTD_SPTE_SNP; + vtd_spte_rsvd_large[2] &= ~VTD_SPTE_SNP; + vtd_spte_rsvd_large[3] &= ~VTD_SPTE_SNP; + } vtd_reset_caches(s); -- Gitee From 5834bb1ccce592380a91a5cf127f90a031cd7cf2 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 5 Jun 2024 16:30:42 +0800 Subject: [PATCH 680/939] intel_iommu: Implement [set|unset]_iommu_device() callbacks Implement [set|unset]_iommu_device() callbacks in Intel vIOMMU. In set call, we take a reference of HostIOMMUDevice and store it in hash table indexed by PCI BDF. 
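Condensed from the intel_iommu hunk further below, the set path is roughly (sketch only, with argument checks trimmed):

    vtd_iommu_lock(s);
    if (g_hash_table_lookup(s->vtd_host_iommu_dev, &key)) {
        error_setg(errp, "Host IOMMU device already exist");
        vtd_iommu_unlock(s);
        return false;
    }
    new_key = g_malloc(sizeof(*new_key));
    new_key->bus = bus;
    new_key->devfn = devfn;

    object_ref(hiod);
    g_hash_table_insert(s->vtd_host_iommu_dev, new_key, hiod);
    vtd_iommu_unlock(s);
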
Note this BDF index is device's real BDF not the aliased one which is different from the index of VTDAddressSpace. There can be multiple assigned devices under same virtual iommu group and share same VTDAddressSpace, but each has its own HostIOMMUDevice. Signed-off-by: Yi Liu Signed-off-by: Yi Sun Signed-off-by: Zhenzhong Duan Reviewed-by: Eric Auger Reviewed-by: Michael S. Tsirkin --- hw/i386/intel_iommu.c | 81 +++++++++++++++++++++++++++++++++++ include/hw/i386/intel_iommu.h | 2 + 2 files changed, 83 insertions(+) diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 6716407b7a..bdc14f8438 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -61,6 +61,12 @@ struct vtd_as_key { uint32_t pasid; }; +/* bus/devfn is PCI device's real BDF not the aliased one */ +struct vtd_hiod_key { + PCIBus *bus; + uint8_t devfn; +}; + struct vtd_iotlb_key { uint64_t gfn; uint32_t pasid; @@ -250,6 +256,25 @@ static guint vtd_as_hash(gconstpointer v) return (guint)(value << 8 | key->devfn); } +/* Same implementation as vtd_as_hash() */ +static guint vtd_hiod_hash(gconstpointer v) +{ + return vtd_as_hash(v); +} + +static gboolean vtd_hiod_equal(gconstpointer v1, gconstpointer v2) +{ + const struct vtd_hiod_key *key1 = v1; + const struct vtd_hiod_key *key2 = v2; + + return (key1->bus == key2->bus) && (key1->devfn == key2->devfn); +} + +static void vtd_hiod_destroy(gpointer v) +{ + object_unref(v); +} + static gboolean vtd_hash_remove_by_domain(gpointer key, gpointer value, gpointer user_data) { @@ -3813,6 +3838,58 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, return vtd_dev_as; } +static bool vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn, + HostIOMMUDevice *hiod, Error **errp) +{ + IntelIOMMUState *s = opaque; + struct vtd_as_key key = { + .bus = bus, + .devfn = devfn, + }; + struct vtd_as_key *new_key; + + assert(hiod); + + vtd_iommu_lock(s); + + if (g_hash_table_lookup(s->vtd_host_iommu_dev, &key)) { + error_setg(errp, "Host IOMMU device already exist"); + vtd_iommu_unlock(s); + return false; + } + + new_key = g_malloc(sizeof(*new_key)); + new_key->bus = bus; + new_key->devfn = devfn; + + object_ref(hiod); + g_hash_table_insert(s->vtd_host_iommu_dev, new_key, hiod); + + vtd_iommu_unlock(s); + + return true; +} + +static void vtd_dev_unset_iommu_device(PCIBus *bus, void *opaque, int devfn) +{ + IntelIOMMUState *s = opaque; + struct vtd_as_key key = { + .bus = bus, + .devfn = devfn, + }; + + vtd_iommu_lock(s); + + if (!g_hash_table_lookup(s->vtd_host_iommu_dev, &key)) { + vtd_iommu_unlock(s); + return; + } + + g_hash_table_remove(s->vtd_host_iommu_dev, &key); + + vtd_iommu_unlock(s); +} + /* Unmap the whole range in the notifier's scope. */ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n) { @@ -4117,6 +4194,8 @@ static AddressSpace *vtd_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) static PCIIOMMUOps vtd_iommu_ops = { .get_address_space = vtd_host_dma_iommu, + .set_iommu_device = vtd_dev_set_iommu_device, + .unset_iommu_device = vtd_dev_unset_iommu_device, }; static bool vtd_decide_config(IntelIOMMUState *s, Error **errp) @@ -4240,6 +4319,8 @@ static void vtd_realize(DeviceState *dev, Error **errp) g_free, g_free); s->vtd_address_spaces = g_hash_table_new_full(vtd_as_hash, vtd_as_equal, g_free, g_free); + s->vtd_host_iommu_dev = g_hash_table_new_full(vtd_hiod_hash, vtd_hiod_equal, + g_free, vtd_hiod_destroy); vtd_init(s); pci_setup_iommu(bus, &vtd_iommu_ops, dev); /* Pseudo address space under root PCI bus. 
*/ diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h index 7fa0a695c8..1eb05c29fc 100644 --- a/include/hw/i386/intel_iommu.h +++ b/include/hw/i386/intel_iommu.h @@ -292,6 +292,8 @@ struct IntelIOMMUState { /* list of registered notifiers */ QLIST_HEAD(, VTDAddressSpace) vtd_as_with_notifiers; + GHashTable *vtd_host_iommu_dev; /* HostIOMMUDevice */ + /* interrupt remapping */ bool intr_enabled; /* Whether guest enabled IR */ dma_addr_t intr_root; /* Interrupt remapping table pointer */ -- Gitee From 4ef1b086272552378c09356b0e9fd2548a27a621 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 5 Jun 2024 16:30:43 +0800 Subject: [PATCH 681/939] intel_iommu: Check compatibility with host IOMMU capabilities If check fails, host device (either VFIO or VDPA device) is not compatible with current vIOMMU config and should not be passed to guest. Only aw_bits is checked for now, we don't care about other caps before scalable modern mode is introduced. Signed-off-by: Yi Liu Signed-off-by: Zhenzhong Duan Reviewed-by: Eric Auger Reviewed-by: Michael S. Tsirkin --- hw/i386/intel_iommu.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index bdc14f8438..60d86e0cb6 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -3838,6 +3838,30 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, return vtd_dev_as; } +static bool vtd_check_hiod(IntelIOMMUState *s, HostIOMMUDevice *hiod, + Error **errp) +{ + HostIOMMUDeviceClass *hiodc = HOST_IOMMU_DEVICE_GET_CLASS(hiod); + int ret; + + if (!hiodc->get_cap) { + error_setg(errp, ".get_cap() not implemented"); + return false; + } + + /* Common checks */ + ret = hiodc->get_cap(hiod, HOST_IOMMU_DEVICE_CAP_AW_BITS, errp); + if (ret < 0) { + return false; + } + if (s->aw_bits > ret) { + error_setg(errp, "aw-bits %d > host aw-bits %d", s->aw_bits, ret); + return false; + } + + return true; +} + static bool vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn, HostIOMMUDevice *hiod, Error **errp) { @@ -3858,6 +3882,11 @@ static bool vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn, return false; } + if (!vtd_check_hiod(s, hiod, errp)) { + vtd_iommu_unlock(s); + return false; + } + new_key = g_malloc(sizeof(*new_key)); new_key->bus = bus; new_key->devfn = devfn; -- Gitee From 92da638c3a97679ab4d9f497ae5c7bf652e7bf99 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 19 Jul 2024 13:04:49 +0100 Subject: [PATCH 682/939] vfio/pci: Extract mdev check into an helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation to skip initialization of the HostIOMMUDevice for mdev, extract the checks that validate if a device is an mdev into helpers. A vfio_device_is_mdev() is created, and subsystems consult VFIODevice::mdev to check if it's mdev or not. 
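For reference, the helper (shown in full in the hunk below) resolves the device's sysfs "subsystem" link with realpath() and compares it against "/sys/bus/mdev"; callers cache the answer once at realize time:

    vbasedev->mdev = vfio_device_is_mdev(vbasedev);
    trace_vfio_mdev(vbasedev->name, vbasedev->mdev);
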
Signed-off-by: Joao Martins Reviewed-by: Cédric Le Goater Reviewed-by: Zhenzhong Duan Reviewed-by: Eric Auger --- hw/vfio/helpers.c | 14 ++++++++++++++ hw/vfio/pci.c | 12 +++--------- include/hw/vfio/vfio-common.h | 2 ++ 3 files changed, 19 insertions(+), 9 deletions(-) diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c index 35b8e42304..37bc383c69 100644 --- a/hw/vfio/helpers.c +++ b/hw/vfio/helpers.c @@ -680,3 +680,17 @@ int vfio_device_get_aw_bits(VFIODevice *vdev) return HOST_IOMMU_DEVICE_CAP_AW_BITS_MAX; } + +bool vfio_device_is_mdev(VFIODevice *vbasedev) +{ + g_autofree char *subsys = NULL; + g_autofree char *tmp = NULL; + + if (!vbasedev->sysfsdev) { + return false; + } + + tmp = g_strdup_printf("%s/subsystem", vbasedev->sysfsdev); + subsys = realpath(tmp, NULL); + return subsys && (strcmp(subsys, "/sys/bus/mdev") == 0); +} diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 675a608b9c..de040e73ca 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -2942,10 +2942,8 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) { VFIOPCIDevice *vdev = VFIO_PCI(pdev); VFIODevice *vbasedev = &vdev->vbasedev; - char *tmp, *subsys; Error *err = NULL; int i, ret; - bool is_mdev; char uuid[UUID_STR_LEN]; char *name; @@ -2976,15 +2974,11 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) * stays in sync with the active working set of the guest driver. Prevent * the x-balloon-allowed option unless this is minimally an mdev device. */ - tmp = g_strdup_printf("%s/subsystem", vbasedev->sysfsdev); - subsys = realpath(tmp, NULL); - g_free(tmp); - is_mdev = subsys && (strcmp(subsys, "/sys/bus/mdev") == 0); - free(subsys); + vbasedev->mdev = vfio_device_is_mdev(vbasedev); - trace_vfio_mdev(vbasedev->name, is_mdev); + trace_vfio_mdev(vbasedev->name, vbasedev->mdev); - if (vbasedev->ram_block_discard_allowed && !is_mdev) { + if (vbasedev->ram_block_discard_allowed && !vbasedev->mdev) { error_setg(errp, "x-balloon-allowed only potentially compatible " "with mdev devices"); goto error; diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index d45d40c329..e49e5fabba 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -126,6 +126,7 @@ typedef struct VFIODevice { DeviceState *dev; int fd; int type; + bool mdev; bool reset_works; bool needs_reset; bool no_mmap; @@ -219,6 +220,7 @@ void vfio_region_exit(VFIORegion *region); void vfio_region_finalize(VFIORegion *region); void vfio_reset_handler(void *opaque); struct vfio_device_info *vfio_get_device_info(int fd); +bool vfio_device_is_mdev(VFIODevice *vbasedev); int vfio_attach_device(char *name, VFIODevice *vbasedev, AddressSpace *as, Error **errp); void vfio_detach_device(VFIODevice *vbasedev); -- Gitee From b2d58d5b474633514c3195d6948e1cd2a9c78d67 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 19 Jul 2024 13:04:50 +0100 Subject: [PATCH 683/939] vfio/iommufd: Don't initialize nor set a HOST_IOMMU_DEVICE with mdev mdevs aren't "physical" devices and when asking for backing IOMMU info, it fails the entire provisioning of the guest. Fix that by skipping HostIOMMUDevice initialization in the presence of mdevs, and skip setting an iommu device when it is known to be an mdev. 
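The guards, taken from the hunks below, are small; a sketch of the vfio_realize() side:

    if (!vbasedev->mdev &&
        !pci_device_set_iommu_device(pdev, vbasedev->hiod, errp)) {
        error_prepend(errp, "Failed to set iommu_device: ");
        goto out_teardown;
    }

and vfio_attach_device() returns early for mdevs before a HostIOMMUDevice is ever created, so vbasedev->hiod stays NULL for them.
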
Cc: Zhenzhong Duan Fixes: 930589520128 ("vfio/iommufd: Implement HostIOMMUDeviceClass::realize() handler") Signed-off-by: Joao Martins Reviewed-by: Eric Auger Reviewed-by: Zhenzhong Duan --- hw/vfio/common.c | 4 ++++ hw/vfio/pci.c | 7 +++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index d5ff65f90a..ceb1da0b94 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -1664,6 +1664,10 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev, return ret; } + if (vbasedev->mdev) { + return true; + } + hiod = HOST_IOMMU_DEVICE(object_new(ops->hiod_typename)); if (!HOST_IOMMU_DEVICE_GET_CLASS(hiod)->realize(hiod, vbasedev, errp)) { object_unref(hiod); diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index de040e73ca..19211f4368 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -3101,7 +3101,8 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) vfio_bars_register(vdev); - if (!pci_device_set_iommu_device(pdev, vbasedev->hiod, errp)) { + if (!vbasedev->mdev && + !pci_device_set_iommu_device(pdev, vbasedev->hiod, errp)) { error_prepend(errp, "Failed to set iommu_device: "); goto out_teardown; } @@ -3229,7 +3230,9 @@ out_deregister: timer_free(vdev->intx.mmap_timer); } out_unset_idev: - pci_device_unset_iommu_device(pdev); + if (!vbasedev->mdev) { + pci_device_unset_iommu_device(pdev); + } out_teardown: vfio_teardown_msi(vdev); vfio_bars_exit(vdev); -- Gitee From 7d53d0938921d0faa32e1fef4c7bcc45d21f9bfb Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 19 Jul 2024 13:04:51 +0100 Subject: [PATCH 684/939] backends/iommufd: Extend iommufd_backend_get_device_info() to fetch HW capabilities MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The helper will be able to fetch vendor agnostic IOMMU capabilities supported both by hardware and software. Right now it is only iommu dirty tracking. 
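After this change a caller receives the raw capability bits alongside the type-specific data; the updated call in hiod_iommufd_vfio_realize(), per the hunk below, looks roughly like:

    uint64_t hw_caps;

    if (!iommufd_backend_get_device_info(vdev->iommufd, vdev->devid,
                                         &type, &data, sizeof(data),
                                         &hw_caps, errp)) {
        return false;
    }
    /* hw_caps mirrors iommu_hw_info::out_capabilities; currently only the
     * dirty-tracking capability is of interest to QEMU. */
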
Signed-off-by: Joao Martins Reviewed-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Reviewed-by: Eric Auger --- backends/iommufd.c | 4 +++- hw/vfio/iommufd.c | 4 +++- include/sysemu/iommufd.h | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/backends/iommufd.c b/backends/iommufd.c index 7e805bd664..1ce2a24226 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -225,7 +225,7 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, uint32_t *type, void *data, uint32_t len, - Error **errp) + uint64_t *caps, Error **errp) { struct iommu_hw_info info = { .size = sizeof(info), @@ -241,6 +241,8 @@ bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, g_assert(type); *type = info.out_data_type; + g_assert(caps); + *caps = info.out_capabilities; return true; } diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 7cbf0e44f1..d5b923ca83 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -647,9 +647,11 @@ static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, union { struct iommu_hw_info_vtd vtd; } data; + uint64_t hw_caps; if (!iommufd_backend_get_device_info(vdev->iommufd, vdev->devid, - &type, &data, sizeof(data), errp)) { + &type, &data, sizeof(data), + &hw_caps, errp)) { return false; } diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h index dfade18e6d..a0a0143856 100644 --- a/include/sysemu/iommufd.h +++ b/include/sysemu/iommufd.h @@ -51,7 +51,7 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova, ram_addr_t size); bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, uint32_t *type, void *data, uint32_t len, - Error **errp); + uint64_t *caps, Error **errp); #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd" #endif -- Gitee From 56e5b9cf8e4041a023daca1ce439ca14619afa97 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 19 Jul 2024 13:04:52 +0100 Subject: [PATCH 685/939] vfio/iommufd: Return errno in iommufd_cdev_attach_ioas_hwpt() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation to implement auto domains have the attach function return the errno it got during domain attach instead of a bool. -EINVAL is tracked to track domain incompatibilities, and decide whether to create a new IOMMU domain. 
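The attach itself is a VFIO cdev ioctl, so preserving errno is cheap. A sketch of the pattern the commit describes (assuming the VFIO device cdev UAPI from <linux/vfio.h>; devfd is an open cdev fd and pt_id an IOAS or HWPT id):

#include <errno.h>
#include <linux/vfio.h>
#include <stdint.h>
#include <sys/ioctl.h>

/* Attach the device to an IOAS/HWPT and return -errno on failure, so the
 * caller can treat -EINVAL as "incompatible domain, try (or create) another"
 * while still failing hard on anything else. */
static int cdev_attach_pt(int devfd, uint32_t pt_id)
{
    struct vfio_device_attach_iommufd_pt attach = {
        .argsz = sizeof(attach),
        .pt_id = pt_id,
    };

    if (ioctl(devfd, VFIO_DEVICE_ATTACH_IOMMUFD_PT, &attach)) {
        return -errno;
    }
    return 0;
}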
Signed-off-by: Joao Martins Reviewed-by: Cédric Le Goater Reviewed-by: Eric Auger Reviewed-by: Zhenzhong Duan --- hw/vfio/iommufd.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index d5b923ca83..5e7788ed59 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -200,11 +200,12 @@ static int iommufd_cdev_attach_ioas_hwpt(VFIODevice *vbasedev, uint32_t id, error_setg_errno(errp, errno, "[iommufd=%d] error attach %s (%d) to id=%d", iommufd, vbasedev->name, vbasedev->fd, id); - } else { - trace_iommufd_cdev_attach_ioas_hwpt(iommufd, vbasedev->name, - vbasedev->fd, id); + return -errno; } - return ret; + + trace_iommufd_cdev_attach_ioas_hwpt(iommufd, vbasedev->name, + vbasedev->fd, id); + return 0; } static int iommufd_cdev_detach_ioas_hwpt(VFIODevice *vbasedev, Error **errp) -- Gitee From 44d573b10c45746e81d0d1786fe61d45160f2181 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Mon, 22 Jul 2024 15:07:12 +0800 Subject: [PATCH 686/939] vfio/ap: Don't initialize HOST_IOMMU_DEVICE with mdev mdevs aren't "physical" devices and when asking for backing IOMMU info, it fails the entire provisioning of the guest. Fix that by setting vbasedev->mdev true so skipping HostIOMMUDevice initialization in the presence of mdevs. Fixes: 930589520128 ("vfio/iommufd: Implement HostIOMMUDeviceClass::realize() handler") Signed-off-by: Zhenzhong Duan Reviewed-by: Joao Martins Reviewed-by: Eric Auger --- hw/vfio/ap.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c index e157aa1ff7..6b2bc32549 100644 --- a/hw/vfio/ap.c +++ b/hw/vfio/ap.c @@ -234,6 +234,9 @@ static void vfio_ap_instance_init(Object *obj) */ vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_AP, &vfio_ap_ops, DEVICE(vapdev), true); + + /* AP device is mdev type device */ + vbasedev->mdev = true; } #ifdef CONFIG_IOMMUFD -- Gitee From ffcda8cc141e14528fd73aea750be822575eedcc Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Mon, 22 Jul 2024 15:07:13 +0800 Subject: [PATCH 687/939] vfio/ccw: Don't initialize HOST_IOMMU_DEVICE with mdev mdevs aren't "physical" devices and when asking for backing IOMMU info, it fails the entire provisioning of the guest. Fix that by setting vbasedev->mdev true so skipping HostIOMMUDevice initialization in the presence of mdevs. Fixes: 930589520128 ("vfio/iommufd: Implement HostIOMMUDeviceClass::realize() handler") Signed-off-by: Zhenzhong Duan Reviewed-by: Joao Martins Acked-by: Eric Farman Reviewed-by: Eric Auger --- hw/vfio/ccw.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c index 90e4a53437..257e9723cf 100644 --- a/hw/vfio/ccw.c +++ b/hw/vfio/ccw.c @@ -683,6 +683,9 @@ static void vfio_ccw_instance_init(Object *obj) VFIOCCWDevice *vcdev = VFIO_CCW(obj); VFIODevice *vbasedev = &vcdev->vdev; + /* CCW device is mdev type device */ + vbasedev->mdev = true; + /* * All vfio-ccw devices are believed to operate in a way compatible with * discarding of memory in RAM blocks, ie. pages pinned in the host are -- Gitee From 630efd6ca2f0c9383223f0ea092abda1c7528f21 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Mon, 22 Jul 2024 22:13:18 +0100 Subject: [PATCH 688/939] vfio/iommufd: Introduce auto domain creation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There's generally two modes of operation for IOMMUFD: 1) The simple user API which intends to perform relatively simple things with IOMMUs e.g. DPDK. 
The process generally creates an IOAS and attaches to VFIO and mainly performs IOAS_MAP and UNMAP. 2) The native IOMMUFD API where you have fine grained control of the IOMMU domain and model it accordingly. This is where most new feature are being steered to. For dirty tracking 2) is required, as it needs to ensure that the stage-2/parent IOMMU domain will only attach devices that support dirty tracking (so far it is all homogeneous in x86, likely not the case for smmuv3). Such invariant on dirty tracking provides a useful guarantee to VMMs that will refuse incompatible device attachments for IOMMU domains. Dirty tracking insurance is enforced via HWPT_ALLOC, which is responsible for creating an IOMMU domain. This is contrast to the 'simple API' where the IOMMU domain is created by IOMMUFD automatically when it attaches to VFIO (usually referred as autodomains) but it has the needed handling for mdevs. To support dirty tracking with the advanced IOMMUFD API, it needs similar logic, where IOMMU domains are created and devices attached to compatible domains. Essentially mimicking kernel iommufd_device_auto_get_domain(). With mdevs given there's no IOMMU domain it falls back to IOAS attach. The auto domain logic allows different IOMMU domains to be created when DMA dirty tracking is not desired (and VF can provide it), and others where it is. Here it is not used in this way given how VFIODevice migration state is initialized after the device attachment. But such mixed mode of IOMMU dirty tracking + device dirty tracking is an improvement that can be added on. Keep the 'all of nothing' of type1 approach that we have been using so far between container vs device dirty tracking. Signed-off-by: Joao Martins Reviewed-by: Zhenzhong Duan [ clg: Added ERRP_GUARD() in iommufd_cdev_autodomains_get() ] Signed-off-by: Cédric Le Goater Reviewed-by: Eric Auger [Shameer: Changed ret for iommufd_cdev_autodomains_get() ] Signed-off-by: Shameer Kolothum --- backends/iommufd.c | 30 +++++++++++++ backends/trace-events | 1 + hw/vfio/iommufd.c | 85 +++++++++++++++++++++++++++++++++++ include/hw/vfio/vfio-common.h | 9 ++++ include/sysemu/iommufd.h | 5 +++ 5 files changed, 130 insertions(+) diff --git a/backends/iommufd.c b/backends/iommufd.c index 1ce2a24226..0d995d7563 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -223,6 +223,36 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, return ret; } +bool iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id, + uint32_t pt_id, uint32_t flags, + uint32_t data_type, uint32_t data_len, + void *data_ptr, uint32_t *out_hwpt, + Error **errp) +{ + int ret, fd = be->fd; + struct iommu_hwpt_alloc alloc_hwpt = { + .size = sizeof(struct iommu_hwpt_alloc), + .flags = flags, + .dev_id = dev_id, + .pt_id = pt_id, + .data_type = data_type, + .data_len = data_len, + .data_uptr = (uintptr_t)data_ptr, + }; + + ret = ioctl(fd, IOMMU_HWPT_ALLOC, &alloc_hwpt); + trace_iommufd_backend_alloc_hwpt(fd, dev_id, pt_id, flags, data_type, + data_len, (uintptr_t)data_ptr, + alloc_hwpt.out_hwpt_id, ret); + if (ret) { + error_setg_errno(errp, errno, "Failed to allocate hwpt"); + return false; + } + + *out_hwpt = alloc_hwpt.out_hwpt_id; + return true; +} + bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, uint32_t *type, void *data, uint32_t len, uint64_t *caps, Error **errp) diff --git a/backends/trace-events b/backends/trace-events index d45c6e31a6..e248bf039e 100644 --- a/backends/trace-events +++ b/backends/trace-events @@ -14,4 +14,5 
@@ iommufd_backend_map_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size iommufd_backend_unmap_dma_non_exist(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " Unmap nonexistent mapping: iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)" iommufd_backend_unmap_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)" iommufd_backend_alloc_ioas(int iommufd, uint32_t ioas, int ret) " iommufd=%d ioas=%d (%d)" +iommufd_backend_alloc_hwpt(int iommufd, uint32_t dev_id, uint32_t pt_id, uint32_t flags, uint32_t hwpt_type, uint32_t len, uint64_t data_ptr, uint32_t out_hwpt_id, int ret) " iommufd=%d dev_id=%u pt_id=%u flags=0x%x hwpt_type=%u len=%u data_ptr=0x%"PRIx64" out_hwpt=%u (%d)" iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d id=%d (%d)" diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 5e7788ed59..3b75cba26c 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -225,10 +225,89 @@ static int iommufd_cdev_detach_ioas_hwpt(VFIODevice *vbasedev, Error **errp) return ret; } +static int iommufd_cdev_autodomains_get(VFIODevice *vbasedev, + VFIOIOMMUFDContainer *container, + Error **errp) +{ + ERRP_GUARD(); + IOMMUFDBackend *iommufd = vbasedev->iommufd; + uint32_t flags = 0; + VFIOIOASHwpt *hwpt; + uint32_t hwpt_id; + int ret; + + /* Try to find a domain */ + QLIST_FOREACH(hwpt, &container->hwpt_list, next) { + ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, errp); + if (ret) { + /* -EINVAL means the domain is incompatible with the device. */ + if (ret == -EINVAL) { + /* + * It is an expected failure and it just means we will try + * another domain, or create one if no existing compatible + * domain is found. Hence why the error is discarded below. 
+ */ + error_free(*errp); + *errp = NULL; + continue; + } + + return ret; + } else { + vbasedev->hwpt = hwpt; + QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next); + return 0; + } + } + + if (!iommufd_backend_alloc_hwpt(iommufd, vbasedev->devid, + container->ioas_id, flags, + IOMMU_HWPT_DATA_NONE, 0, NULL, + &hwpt_id, errp)) { + return -EINVAL; + } + + hwpt = g_malloc0(sizeof(*hwpt)); + hwpt->hwpt_id = hwpt_id; + QLIST_INIT(&hwpt->device_list); + + ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, errp); + if (ret) { + iommufd_backend_free_id(container->be, hwpt->hwpt_id); + g_free(hwpt); + return ret; + } + + vbasedev->hwpt = hwpt; + QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next); + QLIST_INSERT_HEAD(&container->hwpt_list, hwpt, next); + return 0; +} + +static void iommufd_cdev_autodomains_put(VFIODevice *vbasedev, + VFIOIOMMUFDContainer *container) +{ + VFIOIOASHwpt *hwpt = vbasedev->hwpt; + + QLIST_REMOVE(vbasedev, hwpt_next); + vbasedev->hwpt = NULL; + + if (QLIST_EMPTY(&hwpt->device_list)) { + QLIST_REMOVE(hwpt, next); + iommufd_backend_free_id(container->be, hwpt->hwpt_id); + g_free(hwpt); + } +} + static int iommufd_cdev_attach_container(VFIODevice *vbasedev, VFIOIOMMUFDContainer *container, Error **errp) { + /* mdevs aren't physical devices and will fail with auto domains */ + if (!vbasedev->mdev) { + return iommufd_cdev_autodomains_get(vbasedev, container, errp); + } + return iommufd_cdev_attach_ioas_hwpt(vbasedev, container->ioas_id, errp); } @@ -240,6 +319,11 @@ static void iommufd_cdev_detach_container(VFIODevice *vbasedev, if (iommufd_cdev_detach_ioas_hwpt(vbasedev, &err)) { error_report_err(err); } + + if (vbasedev->hwpt) { + iommufd_cdev_autodomains_put(vbasedev, container); + } + } static void iommufd_cdev_container_destroy(VFIOIOMMUFDContainer *container) @@ -375,6 +459,7 @@ static int iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, container = g_malloc0(sizeof(*container)); container->be = vbasedev->iommufd; container->ioas_id = ioas_id; + QLIST_INIT(&container->hwpt_list); bcontainer = &container->bcontainer; vfio_container_init(bcontainer, space, iommufd_vioc); diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index e49e5fabba..2093ed2e91 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -107,10 +107,17 @@ typedef struct VFIOHostDMAWindow { typedef struct IOMMUFDBackend IOMMUFDBackend; +typedef struct VFIOIOASHwpt { + uint32_t hwpt_id; + QLIST_HEAD(, VFIODevice) device_list; + QLIST_ENTRY(VFIOIOASHwpt) next; +} VFIOIOASHwpt; + typedef struct VFIOIOMMUFDContainer { VFIOContainerBase bcontainer; IOMMUFDBackend *be; uint32_t ioas_id; + QLIST_HEAD(, VFIOIOASHwpt) hwpt_list; } VFIOIOMMUFDContainer; typedef struct VFIODeviceOps VFIODeviceOps; @@ -144,6 +151,8 @@ typedef struct VFIODevice { HostIOMMUDevice *hiod; int devid; IOMMUFDBackend *iommufd; + VFIOIOASHwpt *hwpt; + QLIST_ENTRY(VFIODevice) hwpt_next; } VFIODevice; struct VFIODeviceOps { diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h index a0a0143856..f6f01e4be8 100644 --- a/include/sysemu/iommufd.h +++ b/include/sysemu/iommufd.h @@ -52,6 +52,11 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, uint32_t *type, void *data, uint32_t len, uint64_t *caps, Error **errp); +bool iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id, + uint32_t pt_id, uint32_t flags, + uint32_t data_type, uint32_t data_len, + 
void *data_ptr, uint32_t *out_hwpt, + Error **errp); #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd" #endif -- Gitee From 35f33bf18826286c9e9fc739a893b9915c71f43c Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Fri, 14 Jun 2024 11:52:51 +0200 Subject: [PATCH 689/939] HostIOMMUDevice: Store the VFIO/VDPA agent Store the agent device (VFIO or VDPA) in the host IOMMU device. This will allow easy access to some of its resources. Signed-off-by: Eric Auger Reviewed-by: Zhenzhong Duan Reviewed-by: Michael S. Tsirkin --- hw/vfio/container.c | 1 + hw/vfio/iommufd.c | 2 ++ include/sysemu/host_iommu_device.h | 1 + 3 files changed, 4 insertions(+) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 10f7635425..8a5a112b6b 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -1259,6 +1259,7 @@ static bool hiod_legacy_vfio_realize(HostIOMMUDevice *hiod, void *opaque, hiod->name = g_strdup(vdev->name); hiod->caps.aw_bits = vfio_device_get_aw_bits(vdev); + hiod->agent = opaque; return true; } diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 3b75cba26c..7a069ca576 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -735,6 +735,8 @@ static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, } data; uint64_t hw_caps; + hiod->agent = opaque; + if (!iommufd_backend_get_device_info(vdev->iommufd, vdev->devid, &type, &data, sizeof(data), &hw_caps, errp)) { diff --git a/include/sysemu/host_iommu_device.h b/include/sysemu/host_iommu_device.h index a57873958b..3e5f058e7b 100644 --- a/include/sysemu/host_iommu_device.h +++ b/include/sysemu/host_iommu_device.h @@ -34,6 +34,7 @@ struct HostIOMMUDevice { Object parent_obj; char *name; + void *agent; /* pointer to agent device, ie. VFIO or VDPA device */ HostIOMMUDeviceCaps caps; }; -- Gitee From 7d3634d73af1f53549eba4b3d50bb8f9f49a5243 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Mon, 22 Jul 2024 22:13:19 +0100 Subject: [PATCH 690/939] vfio/{iommufd,container}: Remove caps::aw_bits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove caps::aw_bits which requires the bcontainer::iova_ranges being initialized after device is actually attached. Instead defer that to .get_cap() and call vfio_device_get_aw_bits() directly. This is in preparation for HostIOMMUDevice::realize() being called early during attach_device(). 
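The deferred computation is possible because the address width is fully determined by the container's usable IOVA ranges once they are known. As a rough illustration (hypothetical helper, not QEMU code; the no-range fallback of 64 bits is an assumption):

#include <stdint.h>

/* Address width = position of the highest bit set in the largest usable
 * IOVA, which is why .get_cap() can derive it on demand instead of caching
 * it in HostIOMMUDeviceCaps at realize() time. */
static uint8_t aw_bits_from_max_iova(uint64_t max_iova)
{
    return max_iova ? (uint8_t)(64 - __builtin_clzll(max_iova)) : 64;
}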
Suggested-by: Zhenzhong Duan Signed-off-by: Joao Martins Reviewed-by: Cédric Le Goater Reviewed-by: Eric Auger --- backends/iommufd.c | 3 ++- hw/vfio/container.c | 5 +---- hw/vfio/iommufd.c | 1 - include/sysemu/host_iommu_device.h | 3 --- 4 files changed, 3 insertions(+), 9 deletions(-) diff --git a/backends/iommufd.c b/backends/iommufd.c index 0d995d7563..4aebf54765 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -19,6 +19,7 @@ #include "qemu/error-report.h" #include "monitor/monitor.h" #include "trace.h" +#include "hw/vfio/vfio-common.h" #include #include @@ -285,7 +286,7 @@ static int hiod_iommufd_get_cap(HostIOMMUDevice *hiod, int cap, Error **errp) case HOST_IOMMU_DEVICE_CAP_IOMMU_TYPE: return caps->type; case HOST_IOMMU_DEVICE_CAP_AW_BITS: - return caps->aw_bits; + return vfio_device_get_aw_bits(hiod->agent); default: error_setg(errp, "%s: unsupported capability %x", hiod->name, cap); return -EINVAL; diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 8a5a112b6b..30a62348d3 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -1258,7 +1258,6 @@ static bool hiod_legacy_vfio_realize(HostIOMMUDevice *hiod, void *opaque, VFIODevice *vdev = opaque; hiod->name = g_strdup(vdev->name); - hiod->caps.aw_bits = vfio_device_get_aw_bits(vdev); hiod->agent = opaque; return true; @@ -1267,11 +1266,9 @@ static bool hiod_legacy_vfio_realize(HostIOMMUDevice *hiod, void *opaque, static int hiod_legacy_vfio_get_cap(HostIOMMUDevice *hiod, int cap, Error **errp) { - HostIOMMUDeviceCaps *caps = &hiod->caps; - switch (cap) { case HOST_IOMMU_DEVICE_CAP_AW_BITS: - return caps->aw_bits; + return vfio_device_get_aw_bits(hiod->agent); default: error_setg(errp, "%s: unsupported capability %x", hiod->name, cap); return -EINVAL; diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 7a069ca576..06e6a400be 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -745,7 +745,6 @@ static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, hiod->name = g_strdup(vdev->name); caps->type = type; - caps->aw_bits = vfio_device_get_aw_bits(vdev); return true; } diff --git a/include/sysemu/host_iommu_device.h b/include/sysemu/host_iommu_device.h index 3e5f058e7b..f586908945 100644 --- a/include/sysemu/host_iommu_device.h +++ b/include/sysemu/host_iommu_device.h @@ -19,12 +19,9 @@ * struct HostIOMMUDeviceCaps - Define host IOMMU device capabilities. * * @type: host platform IOMMU type. - * - * @aw_bits: host IOMMU address width. 0xff if no limitation. */ typedef struct HostIOMMUDeviceCaps { uint32_t type; - uint8_t aw_bits; } HostIOMMUDeviceCaps; #define TYPE_HOST_IOMMU_DEVICE "host-iommu-device" -- Gitee From 72660b98e799248338588fe97f191c544c073806 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Mon, 22 Jul 2024 22:13:20 +0100 Subject: [PATCH 691/939] vfio/iommufd: Add hw_caps field to HostIOMMUDeviceCaps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Store the value of @caps returned by iommufd_backend_get_device_info() in a new field HostIOMMUDeviceCaps::hw_caps. Right now the only value is whether device IOMMU supports dirty tracking (IOMMU_HW_CAP_DIRTY_TRACKING). This is in preparation for HostIOMMUDevice::realize() being called early during attach_device(). 
Signed-off-by: Joao Martins Reviewed-by: Cédric Le Goater Reviewed-by: Zhenzhong Duan Reviewed-by: Eric Auger --- hw/vfio/iommufd.c | 1 + include/sysemu/host_iommu_device.h | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 06e6a400be..d9088705de 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -745,6 +745,7 @@ static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, hiod->name = g_strdup(vdev->name); caps->type = type; + caps->hw_caps = hw_caps; return true; } diff --git a/include/sysemu/host_iommu_device.h b/include/sysemu/host_iommu_device.h index f586908945..e4d8300350 100644 --- a/include/sysemu/host_iommu_device.h +++ b/include/sysemu/host_iommu_device.h @@ -19,9 +19,13 @@ * struct HostIOMMUDeviceCaps - Define host IOMMU device capabilities. * * @type: host platform IOMMU type. + * + * @hw_caps: host platform IOMMU capabilities (e.g. on IOMMUFD this represents + * the @out_capabilities value returned from IOMMU_GET_HW_INFO ioctl) */ typedef struct HostIOMMUDeviceCaps { uint32_t type; + uint64_t hw_caps; } HostIOMMUDeviceCaps; #define TYPE_HOST_IOMMU_DEVICE "host-iommu-device" -- Gitee From 2276a3a175576a63da6abd5ccb309dd1cdbc4021 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Mon, 22 Jul 2024 22:13:21 +0100 Subject: [PATCH 692/939] vfio/{iommufd, container}: Invoke HostIOMMUDevice::realize() during attach_device() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the HostIOMMUDevice::realize() to be invoked during the attach of the device before we allocate IOMMUFD hardware pagetable objects (HWPT). This allows the use of the hw_caps obtained by IOMMU_GET_HW_INFO that essentially tell if the IOMMU behind the device supports dirty tracking. Note: The HostIOMMUDevice data from legacy backend is static and doesn't need any information from the (type1-iommu) backend to be initialized. In contrast however, the IOMMUFD HostIOMMUDevice data requires the iommufd FD to be connected and having a devid to be able to successfully GET_HW_INFO. This means vfio_device_hiod_realize() is called in different places within the backend .attach_device() implementation. 
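The ordering constraint exists because GET_HW_INFO needs a devid, and a devid only exists once the cdev has been bound to the iommufd. A sketch of that prerequisite step (assuming the VFIO cdev UAPI from <linux/vfio.h>; devfd and iommufd are already-open fds):

#include <errno.h>
#include <linux/vfio.h>
#include <stdint.h>
#include <sys/ioctl.h>

/* Bind the VFIO cdev to an iommufd; the returned out_devid is what
 * iommufd_backend_get_device_info() (and thus realize()) needs, so the
 * IOMMUFD backend can only realize the HostIOMMUDevice after this point. */
static int bind_cdev_to_iommufd(int devfd, int iommufd, uint32_t *out_devid)
{
    struct vfio_device_bind_iommufd bind = {
        .argsz = sizeof(bind),
        .iommufd = iommufd,
    };

    if (ioctl(devfd, VFIO_DEVICE_BIND_IOMMUFD, &bind)) {
        return -errno;
    }
    *out_devid = bind.out_devid;
    return 0;
}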
Suggested-by: Cédric Le Goater Signed-off-by: Joao Martins Reviewed-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater [ clg: Fixed error handling in iommufd_cdev_attach() ] Signed-off-by: Cédric Le Goater Reviewed-by: Eric Auger --- hw/vfio/common.c | 19 +++++++------------ hw/vfio/container.c | 4 ++++ hw/vfio/helpers.c | 11 +++++++++++ hw/vfio/iommufd.c | 11 +++++++++++ include/hw/vfio/vfio-common.h | 1 + 5 files changed, 34 insertions(+), 12 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index ceb1da0b94..65e1c9f810 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -1659,22 +1659,17 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev, assert(ops); - ret = ops->attach_device(name, vbasedev, as, errp); - if (ret) { - return ret; - } - - if (vbasedev->mdev) { - return true; + if (!vbasedev->mdev) { + hiod = HOST_IOMMU_DEVICE(object_new(ops->hiod_typename)); + vbasedev->hiod = hiod; } - hiod = HOST_IOMMU_DEVICE(object_new(ops->hiod_typename)); - if (!HOST_IOMMU_DEVICE_GET_CLASS(hiod)->realize(hiod, vbasedev, errp)) { + ret = ops->attach_device(name, vbasedev, as, errp); + if (ret) { object_unref(hiod); - ops->detach_device(vbasedev); - return -1; + vbasedev->hiod = NULL; + return ret; } - vbasedev->hiod = hiod; return 0; } diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 30a62348d3..64eacfd912 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -1030,6 +1030,10 @@ static int vfio_legacy_attach_device(const char *name, VFIODevice *vbasedev, trace_vfio_attach_device(vbasedev->name, groupid); + if (!vfio_device_hiod_realize(vbasedev, errp)) { + return false; + } + group = vfio_get_group(groupid, as, errp); if (!group) { return -ENOENT; diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c index 37bc383c69..1f3bfed917 100644 --- a/hw/vfio/helpers.c +++ b/hw/vfio/helpers.c @@ -694,3 +694,14 @@ bool vfio_device_is_mdev(VFIODevice *vbasedev) subsys = realpath(tmp, NULL); return subsys && (strcmp(subsys, "/sys/bus/mdev") == 0); } + +bool vfio_device_hiod_realize(VFIODevice *vbasedev, Error **errp) +{ + HostIOMMUDevice *hiod = vbasedev->hiod; + + if (!hiod) { + return true; + } + + return HOST_IOMMU_DEVICE_GET_CLASS(hiod)->realize(hiod, vbasedev, errp); +} diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index d9088705de..8fd6826826 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -424,6 +424,17 @@ static int iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, space = vfio_get_address_space(as); + /* + * The HostIOMMUDevice data from legacy backend is static and doesn't need + * any information from the (type1-iommu) backend to be initialized. In + * contrast however, the IOMMUFD HostIOMMUDevice data requires the iommufd + * FD to be connected and having a devid to be able to successfully call + * iommufd_backend_get_device_info(). 
+ */ + if (!vfio_device_hiod_realize(vbasedev, errp)) { + goto err_alloc_ioas; + } + /* try to attach to an existing container in this space */ QLIST_FOREACH(bcontainer, &space->containers, next) { container = container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 2093ed2e91..63da291456 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -230,6 +230,7 @@ void vfio_region_finalize(VFIORegion *region); void vfio_reset_handler(void *opaque); struct vfio_device_info *vfio_get_device_info(int fd); bool vfio_device_is_mdev(VFIODevice *vbasedev); +bool vfio_device_hiod_realize(VFIODevice *vbasedev, Error **errp); int vfio_attach_device(char *name, VFIODevice *vbasedev, AddressSpace *as, Error **errp); void vfio_detach_device(VFIODevice *vbasedev); -- Gitee From db8ef4524568c2379c25986db6e30cb0f6c0ec05 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Mon, 22 Jul 2024 22:13:22 +0100 Subject: [PATCH 693/939] vfio/iommufd: Probe and request hwpt dirty tracking capability MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation to using the dirty tracking UAPI, probe whether the IOMMU supports dirty tracking. This is done via the data stored in hiod::caps::hw_caps initialized from GET_HW_INFO. Qemu doesn't know if VF dirty tracking is supported when allocating hardware pagetable in iommufd_cdev_autodomains_get(). This is because VFIODevice migration state hasn't been initialized *yet* hence it can't pick between VF dirty tracking vs IOMMU dirty tracking. So, if IOMMU supports dirty tracking it always creates HWPTs with IOMMU_HWPT_ALLOC_DIRTY_TRACKING even if later on VFIOMigration decides to use VF dirty tracking instead. Signed-off-by: Joao Martins [ clg: - Fixed vbasedev->iommu_dirty_tracking assignment in iommufd_cdev_autodomains_get() - Added warning for heterogeneous dirty page tracking support in iommufd_cdev_autodomains_get() ] Signed-off-by: Cédric Le Goater Reviewed-by: Zhenzhong Duan --- hw/vfio/iommufd.c | 26 ++++++++++++++++++++++++++ include/hw/vfio/vfio-common.h | 2 ++ 2 files changed, 28 insertions(+) diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 8fd6826826..a9400d8107 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -114,6 +114,11 @@ static void iommufd_cdev_unbind_and_disconnect(VFIODevice *vbasedev) iommufd_backend_disconnect(vbasedev->iommufd); } +static bool iommufd_hwpt_dirty_tracking(VFIOIOASHwpt *hwpt) +{ + return hwpt && hwpt->hwpt_flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; +} + static int iommufd_cdev_getfd(const char *sysfs_path, Error **errp) { long int ret = -ENOTTY; @@ -256,10 +261,22 @@ static int iommufd_cdev_autodomains_get(VFIODevice *vbasedev, } else { vbasedev->hwpt = hwpt; QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next); + vbasedev->iommu_dirty_tracking = iommufd_hwpt_dirty_tracking(hwpt); return 0; } } + /* + * This is quite early and VFIO Migration state isn't yet fully + * initialized, thus rely only on IOMMU hardware capabilities as to + * whether IOMMU dirty tracking is going to be requested. Later + * vfio_migration_realize() may decide to use VF dirty tracking + * instead. 
+ */ + if (vbasedev->hiod->caps.hw_caps & IOMMU_HW_CAP_DIRTY_TRACKING) { + flags = IOMMU_HWPT_ALLOC_DIRTY_TRACKING; + } + if (!iommufd_backend_alloc_hwpt(iommufd, vbasedev->devid, container->ioas_id, flags, IOMMU_HWPT_DATA_NONE, 0, NULL, @@ -269,6 +286,7 @@ static int iommufd_cdev_autodomains_get(VFIODevice *vbasedev, hwpt = g_malloc0(sizeof(*hwpt)); hwpt->hwpt_id = hwpt_id; + hwpt->hwpt_flags = flags; QLIST_INIT(&hwpt->device_list); ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, errp); @@ -279,8 +297,16 @@ static int iommufd_cdev_autodomains_get(VFIODevice *vbasedev, } vbasedev->hwpt = hwpt; + vbasedev->iommu_dirty_tracking = iommufd_hwpt_dirty_tracking(hwpt); QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next); QLIST_INSERT_HEAD(&container->hwpt_list, hwpt, next); + container->bcontainer.dirty_pages_supported |= + vbasedev->iommu_dirty_tracking; + if (container->bcontainer.dirty_pages_supported && + !vbasedev->iommu_dirty_tracking) { + warn_report("IOMMU instance for device %s doesn't support dirty tracking", + vbasedev->name); + } return 0; } diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 63da291456..22a7386591 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -109,6 +109,7 @@ typedef struct IOMMUFDBackend IOMMUFDBackend; typedef struct VFIOIOASHwpt { uint32_t hwpt_id; + uint32_t hwpt_flags; QLIST_HEAD(, VFIODevice) device_list; QLIST_ENTRY(VFIOIOASHwpt) next; } VFIOIOASHwpt; @@ -148,6 +149,7 @@ typedef struct VFIODevice { OnOffAuto pre_copy_dirty_page_tracking; bool dirty_pages_supported; bool dirty_tracking; + bool iommu_dirty_tracking; HostIOMMUDevice *hiod; int devid; IOMMUFDBackend *iommufd; -- Gitee From 73b24be504fcd9b453a51e1f2fc8af64b092c586 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Mon, 22 Jul 2024 22:13:23 +0100 Subject: [PATCH 694/939] vfio/iommufd: Implement VFIOIOMMUClass::set_dirty_tracking support ioctl(iommufd, IOMMU_HWPT_SET_DIRTY_TRACKING, arg) is the UAPI that enables or disables dirty page tracking. The ioctl is used if the hwpt has been created with dirty tracking supported domain (stored in hwpt::flags) and it is called on the whole list of iommu domains. Signed-off-by: Joao Martins Reviewed-by: Zhenzhong Duan Reviewed-by: Eric Auger [Shameer: changed iommufd_set_dirty_page_tracking() declaration] Signed-off-by: Shameer Kolothum --- backends/iommufd.c | 23 +++++++++++++++++++++++ backends/trace-events | 1 + hw/vfio/iommufd.c | 34 ++++++++++++++++++++++++++++++++++ include/sysemu/iommufd.h | 2 ++ 4 files changed, 60 insertions(+) diff --git a/backends/iommufd.c b/backends/iommufd.c index 4aebf54765..785d3fbbad 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -254,6 +254,29 @@ bool iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id, return true; } +bool iommufd_backend_set_dirty_tracking(IOMMUFDBackend *be, + uint32_t hwpt_id, bool start, + Error **errp) +{ + int ret; + struct iommu_hwpt_set_dirty_tracking set_dirty = { + .size = sizeof(set_dirty), + .hwpt_id = hwpt_id, + .flags = start ? IOMMU_HWPT_DIRTY_TRACKING_ENABLE : 0, + }; + + ret = ioctl(be->fd, IOMMU_HWPT_SET_DIRTY_TRACKING, &set_dirty); + trace_iommufd_backend_set_dirty(be->fd, hwpt_id, start, ret ? 
errno : 0); + if (ret) { + error_setg_errno(errp, errno, + "IOMMU_HWPT_SET_DIRTY_TRACKING(hwpt_id %u) failed", + hwpt_id); + return false; + } + + return true; +} + bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, uint32_t *type, void *data, uint32_t len, uint64_t *caps, Error **errp) diff --git a/backends/trace-events b/backends/trace-events index e248bf039e..fe3297ca15 100644 --- a/backends/trace-events +++ b/backends/trace-events @@ -16,3 +16,4 @@ iommufd_backend_unmap_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t si iommufd_backend_alloc_ioas(int iommufd, uint32_t ioas, int ret) " iommufd=%d ioas=%d (%d)" iommufd_backend_alloc_hwpt(int iommufd, uint32_t dev_id, uint32_t pt_id, uint32_t flags, uint32_t hwpt_type, uint32_t len, uint64_t data_ptr, uint32_t out_hwpt_id, int ret) " iommufd=%d dev_id=%u pt_id=%u flags=0x%x hwpt_type=%u len=%u data_ptr=0x%"PRIx64" out_hwpt=%u (%d)" iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d id=%d (%d)" +iommufd_backend_set_dirty(int iommufd, uint32_t hwpt_id, bool start, int ret) " iommufd=%d hwpt=%u enable=%d (%d)" diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index a9400d8107..11e1392527 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -119,6 +119,39 @@ static bool iommufd_hwpt_dirty_tracking(VFIOIOASHwpt *hwpt) return hwpt && hwpt->hwpt_flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; } +static int iommufd_set_dirty_page_tracking(const VFIOContainerBase *bcontainer, + bool start) +{ + const VFIOIOMMUFDContainer *container = + container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); + VFIOIOASHwpt *hwpt; + + QLIST_FOREACH(hwpt, &container->hwpt_list, next) { + if (!iommufd_hwpt_dirty_tracking(hwpt)) { + continue; + } + + if (!iommufd_backend_set_dirty_tracking(container->be, + hwpt->hwpt_id, start, NULL)) { + error_report("Failed to set dirty tracking hwpt_id %u errno: %d", + hwpt->hwpt_id, errno); + goto err; + } + } + + return 0; + +err: + QLIST_FOREACH(hwpt, &container->hwpt_list, next) { + if (!iommufd_hwpt_dirty_tracking(hwpt)) { + continue; + } + iommufd_backend_set_dirty_tracking(container->be, + hwpt->hwpt_id, !start, NULL); + } + return -EINVAL; +} + static int iommufd_cdev_getfd(const char *sysfs_path, Error **errp) { long int ret = -ENOTTY; @@ -759,6 +792,7 @@ static void vfio_iommu_iommufd_class_init(ObjectClass *klass, void *data) vioc->attach_device = iommufd_cdev_attach; vioc->detach_device = iommufd_cdev_detach; vioc->pci_hot_reset = iommufd_cdev_pci_hot_reset; + vioc->set_dirty_page_tracking = iommufd_set_dirty_page_tracking; }; static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h index f6f01e4be8..4f1dbe827c 100644 --- a/include/sysemu/iommufd.h +++ b/include/sysemu/iommufd.h @@ -57,6 +57,8 @@ bool iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id, uint32_t data_type, uint32_t data_len, void *data_ptr, uint32_t *out_hwpt, Error **errp); +bool iommufd_backend_set_dirty_tracking(IOMMUFDBackend *be, uint32_t hwpt_id, + bool start, Error **errp); #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd" #endif -- Gitee From d09cb3d1907e3afbae9b3ea345c9973e207614bf Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Mon, 22 Jul 2024 22:13:24 +0100 Subject: [PATCH 695/939] vfio/iommufd: Implement VFIOIOMMUClass::query_dirty_bitmap support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ioctl(iommufd, 
IOMMU_HWPT_GET_DIRTY_BITMAP, arg) is the UAPI that fetches the bitmap that tells what was dirty in an IOVA range. A single bitmap is allocated and used across all the hwpts sharing an IOAS which is then used in log_sync() to set Qemu global bitmaps. Signed-off-by: Joao Martins Reviewed-by: Cédric Le Goater Reviewed-by: Eric Auger Reviewed-by: Zhenzhong Duan [Shameer: changed iommufd_query_dirty_bitmap() declaration] Signed-off-by: Shameer Kolothum --- backends/iommufd.c | 29 +++++++++++++++++++++++++++++ backends/trace-events | 1 + hw/vfio/iommufd.c | 32 ++++++++++++++++++++++++++++++++ include/sysemu/iommufd.h | 4 ++++ 4 files changed, 66 insertions(+) diff --git a/backends/iommufd.c b/backends/iommufd.c index 785d3fbbad..c1260766f0 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -277,6 +277,35 @@ bool iommufd_backend_set_dirty_tracking(IOMMUFDBackend *be, return true; } +bool iommufd_backend_get_dirty_bitmap(IOMMUFDBackend *be, + uint32_t hwpt_id, + uint64_t iova, ram_addr_t size, + uint64_t page_size, uint64_t *data, + Error **errp) +{ + int ret; + struct iommu_hwpt_get_dirty_bitmap get_dirty_bitmap = { + .size = sizeof(get_dirty_bitmap), + .hwpt_id = hwpt_id, + .iova = iova, + .length = size, + .page_size = page_size, + .data = (uintptr_t)data, + }; + + ret = ioctl(be->fd, IOMMU_HWPT_GET_DIRTY_BITMAP, &get_dirty_bitmap); + trace_iommufd_backend_get_dirty_bitmap(be->fd, hwpt_id, iova, size, + page_size, ret ? errno : 0); + if (ret) { + error_setg_errno(errp, errno, + "IOMMU_HWPT_GET_DIRTY_BITMAP (iova: 0x%"HWADDR_PRIx + " size: 0x"RAM_ADDR_FMT") failed", iova, size); + return false; + } + + return true; +} + bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, uint32_t *type, void *data, uint32_t len, uint64_t *caps, Error **errp) diff --git a/backends/trace-events b/backends/trace-events index fe3297ca15..b02433710a 100644 --- a/backends/trace-events +++ b/backends/trace-events @@ -17,3 +17,4 @@ iommufd_backend_alloc_ioas(int iommufd, uint32_t ioas, int ret) " iommufd=%d ioa iommufd_backend_alloc_hwpt(int iommufd, uint32_t dev_id, uint32_t pt_id, uint32_t flags, uint32_t hwpt_type, uint32_t len, uint64_t data_ptr, uint32_t out_hwpt_id, int ret) " iommufd=%d dev_id=%u pt_id=%u flags=0x%x hwpt_type=%u len=%u data_ptr=0x%"PRIx64" out_hwpt=%u (%d)" iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d id=%d (%d)" iommufd_backend_set_dirty(int iommufd, uint32_t hwpt_id, bool start, int ret) " iommufd=%d hwpt=%u enable=%d (%d)" +iommufd_backend_get_dirty_bitmap(int iommufd, uint32_t hwpt_id, uint64_t iova, uint64_t size, uint64_t page_size, int ret) " iommufd=%d hwpt=%u iova=0x%"PRIx64" size=0x%"PRIx64" page_size=0x%"PRIx64" (%d)" diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 11e1392527..3d4f902ae5 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -25,6 +25,7 @@ #include "qemu/cutils.h" #include "qemu/chardev_open.h" #include "pci.h" +#include "exec/ram_addr.h" static int iommufd_cdev_map(const VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, void *vaddr, bool readonly) @@ -152,6 +153,36 @@ err: return -EINVAL; } +static int iommufd_query_dirty_bitmap(const VFIOContainerBase *bcontainer, + VFIOBitmap *vbmap, hwaddr iova, + hwaddr size) +{ + VFIOIOMMUFDContainer *container = container_of(bcontainer, + VFIOIOMMUFDContainer, + bcontainer); + unsigned long page_size = qemu_real_host_page_size(); + VFIOIOASHwpt *hwpt; + + QLIST_FOREACH(hwpt, &container->hwpt_list, next) { + if (!iommufd_hwpt_dirty_tracking(hwpt)) { + 
continue; + } + + if (!iommufd_backend_get_dirty_bitmap(container->be, hwpt->hwpt_id, + iova, size, page_size, + (uint64_t *)vbmap->bitmap, + NULL)) { + error_report("Failed to get dirty bitmap for iova: 0x%"PRIx64 + " size: 0x%"PRIx64" err: %d", (uint64_t)iova, + (uint64_t)size, errno); + + return -EINVAL; + } + } + + return 0; +} + static int iommufd_cdev_getfd(const char *sysfs_path, Error **errp) { long int ret = -ENOTTY; @@ -793,6 +824,7 @@ static void vfio_iommu_iommufd_class_init(ObjectClass *klass, void *data) vioc->detach_device = iommufd_cdev_detach; vioc->pci_hot_reset = iommufd_cdev_pci_hot_reset; vioc->set_dirty_page_tracking = iommufd_set_dirty_page_tracking; + vioc->query_dirty_bitmap = iommufd_query_dirty_bitmap; }; static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h index 4f1dbe827c..3b28c8a81c 100644 --- a/include/sysemu/iommufd.h +++ b/include/sysemu/iommufd.h @@ -59,6 +59,10 @@ bool iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id, Error **errp); bool iommufd_backend_set_dirty_tracking(IOMMUFDBackend *be, uint32_t hwpt_id, bool start, Error **errp); +bool iommufd_backend_get_dirty_bitmap(IOMMUFDBackend *be, uint32_t hwpt_id, + uint64_t iova, ram_addr_t size, + uint64_t page_size, uint64_t *data, + Error **errp); #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd" #endif -- Gitee From 6eab0b4a0c79d53250da601da25e2813177d44fe Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Mon, 22 Jul 2024 22:13:25 +0100 Subject: [PATCH 696/939] vfio/migration: Don't block migration device dirty tracking is unsupported MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit By default VFIO migration is set to auto, which will support live migration if the migration capability is set *and* also dirty page tracking is supported. For testing purposes one can force enable without dirty page tracking via enable-migration=on, but that option is generally left for testing purposes. So starting with IOMMU dirty tracking it can use to accommodate the lack of VF dirty page tracking allowing us to minimize the VF requirements for migration and thus enabling migration by default for those too. While at it change the error messages to mention IOMMU dirty tracking as well. 
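Distilled, the relaxed blocker rule is a simple predicate over the two tracking capabilities and the enable-migration policy (illustrative helper only; names are hypothetical):

#include <stdbool.h>

/* Migration is blocked in the default (auto) policy only when neither the
 * VF nor the IOMMU behind it can track dirty pages; with an explicit
 * enable-migration=on the same situation merely warns. */
static bool migration_blocked(bool vf_dirty_tracking,
                              bool iommu_dirty_tracking,
                              bool enable_migration_is_auto)
{
    if (vf_dirty_tracking || iommu_dirty_tracking) {
        return false;
    }
    return enable_migration_is_auto;
}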
Signed-off-by: Joao Martins Reviewed-by: Zhenzhong Duan Reviewed-by: Eric Auger [ clg: - spelling in commit log ] Signed-off-by: Cédric Le Goater --- hw/vfio/migration.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index 28d422b39f..db128204af 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -945,16 +945,16 @@ bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp) return !vfio_block_migration(vbasedev, err, errp); } - if (!vbasedev->dirty_pages_supported) { + if (!vbasedev->dirty_pages_supported && !vbasedev->iommu_dirty_tracking) { if (vbasedev->enable_migration == ON_OFF_AUTO_AUTO) { error_setg(&err, - "%s: VFIO device doesn't support device dirty tracking", - vbasedev->name); + "%s: VFIO device doesn't support device and " + "IOMMU dirty tracking", vbasedev->name); goto add_blocker; } - warn_report("%s: VFIO device doesn't support device dirty tracking", - vbasedev->name); + warn_report("%s: VFIO device doesn't support device and " + "IOMMU dirty tracking", vbasedev->name); } ret = vfio_block_multiple_devices_migration(vbasedev, errp); -- Gitee From b0fe5a6794c5403f4ab9859ec2ced338246690bd Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Mon, 22 Jul 2024 22:13:26 +0100 Subject: [PATCH 697/939] vfio/common: Allow disabling device dirty page tracking The property 'x-pre-copy-dirty-page-tracking' allows disabling the whole tracking of VF pre-copy phase of dirty page tracking, though it means that it will only be used at the start of the switchover phase. Add an option that disables the VF dirty page tracking, and fall back into container-based dirty page tracking. This also allows to use IOMMU dirty tracking even on VFs with their own dirty tracker scheme. 
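For example, forcing the fallback onto container/IOMMU dirty tracking might look like the following command-line fragment (illustrative only: the BDF is made up and the iommufd object id is arbitrary; the x-device-dirty-page-tracking property is the one added by this patch):

    -object iommufd,id=iommufd0 \
    -device vfio-pci,host=0000:3b:00.1,iommufd=iommufd0,x-device-dirty-page-tracking=off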
Signed-off-by: Joao Martins Reviewed-by: Zhenzhong Duan --- hw/vfio/common.c | 3 +++ hw/vfio/migration.c | 4 +++- hw/vfio/pci.c | 3 +++ include/hw/vfio/vfio-common.h | 1 + 4 files changed, 10 insertions(+), 1 deletion(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 65e1c9f810..a8bc1c6055 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -208,6 +208,9 @@ bool vfio_devices_all_device_dirty_tracking(const VFIOContainerBase *bcontainer) VFIODevice *vbasedev; QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { + if (vbasedev->device_dirty_page_tracking == ON_OFF_AUTO_OFF) { + return false; + } if (!vbasedev->dirty_pages_supported) { return false; } diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index db128204af..3924beb289 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -945,7 +945,9 @@ bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp) return !vfio_block_migration(vbasedev, err, errp); } - if (!vbasedev->dirty_pages_supported && !vbasedev->iommu_dirty_tracking) { + if ((!vbasedev->dirty_pages_supported || + vbasedev->device_dirty_page_tracking == ON_OFF_AUTO_OFF) && + !vbasedev->iommu_dirty_tracking) { if (vbasedev->enable_migration == ON_OFF_AUTO_AUTO) { error_setg(&err, "%s: VFIO device doesn't support device and " diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 19211f4368..f585f285f4 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -3350,6 +3350,9 @@ static Property vfio_pci_dev_properties[] = { DEFINE_PROP_ON_OFF_AUTO("x-pre-copy-dirty-page-tracking", VFIOPCIDevice, vbasedev.pre_copy_dirty_page_tracking, ON_OFF_AUTO_ON), + DEFINE_PROP_ON_OFF_AUTO("x-device-dirty-page-tracking", VFIOPCIDevice, + vbasedev.device_dirty_page_tracking, + ON_OFF_AUTO_ON), DEFINE_PROP_ON_OFF_AUTO("display", VFIOPCIDevice, display, ON_OFF_AUTO_OFF), DEFINE_PROP_UINT32("xres", VFIOPCIDevice, display_xres, 0), diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 22a7386591..abae8655c4 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -147,6 +147,7 @@ typedef struct VFIODevice { VFIOMigration *migration; Error *migration_blocker; OnOffAuto pre_copy_dirty_page_tracking; + OnOffAuto device_dirty_page_tracking; bool dirty_pages_supported; bool dirty_tracking; bool iommu_dirty_tracking; -- Gitee From ac715e361fdb6d92169b3b3f5964405c816a13ac Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Tue, 14 Jan 2025 10:29:24 +0000 Subject: [PATCH 698/939] Update iommufd.h header for vSVA This is based on Linaro UADK branch: https://github.com/Linaro/linux-kernel-uadk/tree/6.12-wip-10.26 Signed-off-by: Shameer Kolothum --- linux-headers/linux/iommufd.h | 394 ++++++++++++++++++++++++++++++++-- 1 file changed, 371 insertions(+), 23 deletions(-) diff --git a/linux-headers/linux/iommufd.h b/linux-headers/linux/iommufd.h index 806d98d09c..41559c6064 100644 --- a/linux-headers/linux/iommufd.h +++ b/linux-headers/linux/iommufd.h @@ -37,18 +37,22 @@ enum { IOMMUFD_CMD_BASE = 0x80, IOMMUFD_CMD_DESTROY = IOMMUFD_CMD_BASE, - IOMMUFD_CMD_IOAS_ALLOC, - IOMMUFD_CMD_IOAS_ALLOW_IOVAS, - IOMMUFD_CMD_IOAS_COPY, - IOMMUFD_CMD_IOAS_IOVA_RANGES, - IOMMUFD_CMD_IOAS_MAP, - IOMMUFD_CMD_IOAS_UNMAP, - IOMMUFD_CMD_OPTION, - IOMMUFD_CMD_VFIO_IOAS, - IOMMUFD_CMD_HWPT_ALLOC, - IOMMUFD_CMD_GET_HW_INFO, - IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING, - IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP, + IOMMUFD_CMD_IOAS_ALLOC = 0x81, + IOMMUFD_CMD_IOAS_ALLOW_IOVAS = 0x82, + IOMMUFD_CMD_IOAS_COPY = 0x83, + IOMMUFD_CMD_IOAS_IOVA_RANGES = 0x84, + 
IOMMUFD_CMD_IOAS_MAP = 0x85, + IOMMUFD_CMD_IOAS_UNMAP = 0x86, + IOMMUFD_CMD_OPTION = 0x87, + IOMMUFD_CMD_VFIO_IOAS = 0x88, + IOMMUFD_CMD_HWPT_ALLOC = 0x89, + IOMMUFD_CMD_GET_HW_INFO = 0x8a, + IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING = 0x8b, + IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP = 0x8c, + IOMMUFD_CMD_HWPT_INVALIDATE = 0x8d, + IOMMUFD_CMD_FAULT_QUEUE_ALLOC = 0x8e, + IOMMUFD_CMD_VIOMMU_ALLOC = 0x8f, + IOMMUFD_CMD_VDEVICE_ALLOC = 0x90, }; /** @@ -355,10 +359,13 @@ struct iommu_vfio_ioas { * the parent HWPT in a nesting configuration. * @IOMMU_HWPT_ALLOC_DIRTY_TRACKING: Dirty tracking support for device IOMMU is * enforced on device attachment + * @IOMMU_HWPT_FAULT_ID_VALID: The fault_id field of hwpt allocation data is + * valid. */ enum iommufd_hwpt_alloc_flags { IOMMU_HWPT_ALLOC_NEST_PARENT = 1 << 0, IOMMU_HWPT_ALLOC_DIRTY_TRACKING = 1 << 1, + IOMMU_HWPT_FAULT_ID_VALID = 1 << 2, }; /** @@ -389,14 +396,34 @@ struct iommu_hwpt_vtd_s1 { __u32 __reserved; }; +/** + * struct iommu_hwpt_arm_smmuv3 - ARM SMMUv3 Context Descriptor Table info + * (IOMMU_HWPT_DATA_ARM_SMMUV3) + * + * @ste: The first two double words of the user space Stream Table Entry for + * a user stage-1 Context Descriptor Table. Must be little-endian. + * Allowed fields: (Refer to "5.2 Stream Table Entry" in SMMUv3 HW Spec) + * - word-0: V, Cfg, S1Fmt, S1ContextPtr, S1CDMax + * - word-1: EATS, S1DSS, S1CIR, S1COR, S1CSH, S1STALLD + * + * -EIO will be returned if @ste is not legal or contains any non-allowed field. + * Cfg can be used to select a S1, Bypass or Abort configuration. A Bypass + * nested domain will translate the same as the nesting parent. + */ +struct iommu_hwpt_arm_smmuv3 { + __aligned_le64 ste[2]; +}; + /** * enum iommu_hwpt_data_type - IOMMU HWPT Data Type * @IOMMU_HWPT_DATA_NONE: no data * @IOMMU_HWPT_DATA_VTD_S1: Intel VT-d stage-1 page table + * @IOMMU_HWPT_DATA_ARM_SMMUV3: ARM SMMUv3 Context Descriptor Table */ enum iommu_hwpt_data_type { - IOMMU_HWPT_DATA_NONE, - IOMMU_HWPT_DATA_VTD_S1, + IOMMU_HWPT_DATA_NONE = 0, + IOMMU_HWPT_DATA_VTD_S1 = 1, + IOMMU_HWPT_DATA_ARM_SMMUV3 = 2, }; /** @@ -404,12 +431,15 @@ enum iommu_hwpt_data_type { * @size: sizeof(struct iommu_hwpt_alloc) * @flags: Combination of enum iommufd_hwpt_alloc_flags * @dev_id: The device to allocate this HWPT for - * @pt_id: The IOAS or HWPT to connect this HWPT to + * @pt_id: The IOAS or HWPT or vIOMMU to connect this HWPT to * @out_hwpt_id: The ID of the new HWPT * @__reserved: Must be 0 * @data_type: One of enum iommu_hwpt_data_type * @data_len: Length of the type specific data * @data_uptr: User pointer to the type specific data + * @fault_id: The ID of IOMMUFD_FAULT object. Valid only if flags field of + * IOMMU_HWPT_FAULT_ID_VALID is set. + * @__reserved2: Padding to 64-bit alignment. Must be 0. * * Explicitly allocate a hardware page table object. This is the same object * type that is returned by iommufd_device_attach() and represents the @@ -420,11 +450,13 @@ enum iommu_hwpt_data_type { * IOMMU_HWPT_DATA_NONE. The HWPT can be allocated as a parent HWPT for a * nesting configuration by passing IOMMU_HWPT_ALLOC_NEST_PARENT via @flags. * - * A user-managed nested HWPT will be created from a given parent HWPT via - * @pt_id, in which the parent HWPT must be allocated previously via the - * same ioctl from a given IOAS (@pt_id). In this case, the @data_type - * must be set to a pre-defined type corresponding to an I/O page table - * type supported by the underlying IOMMU hardware. 
+ * A user-managed nested HWPT will be created from a given vIOMMU (wrapping a + * parent HWPT) or a parent HWPT via @pt_id, in which the parent HWPT must be + * allocated previously via the same ioctl from a given IOAS (@pt_id). In this + * case, the @data_type must be set to a pre-defined type corresponding to an + * I/O page table type supported by the underlying IOMMU hardware. The device + * via @dev_id and the vIOMMU via @pt_id must be associated to the same IOMMU + * instance. * * If the @data_type is set to IOMMU_HWPT_DATA_NONE, @data_len and * @data_uptr should be zero. Otherwise, both @data_len and @data_uptr @@ -440,6 +472,8 @@ struct iommu_hwpt_alloc { __u32 data_type; __u32 data_len; __aligned_u64 data_uptr; + __u32 fault_id; + __u32 __reserved2; }; #define IOMMU_HWPT_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_ALLOC) @@ -474,15 +508,50 @@ struct iommu_hw_info_vtd { __aligned_u64 ecap_reg; }; +/** + * struct iommu_hw_info_arm_smmuv3 - ARM SMMUv3 hardware information + * (IOMMU_HW_INFO_TYPE_ARM_SMMUV3) + * + * @flags: Must be set to 0 + * @__reserved: Must be 0 + * @idr: Implemented features for ARM SMMU Non-secure programming interface + * @iidr: Information about the implementation and implementer of ARM SMMU, + * and architecture version supported + * @aidr: ARM SMMU architecture version + * + * For the details of @idr, @iidr and @aidr, please refer to the chapters + * from 6.3.1 to 6.3.6 in the SMMUv3 Spec. + * + * User space should read the underlying ARM SMMUv3 hardware information for + * the list of supported features. + * + * Note that these values reflect the raw HW capability, without any insight if + * any required kernel driver support is present. Bits may be set indicating the + * HW has functionality that is lacking kernel software support, such as BTM. If + * a VMM is using this information to construct emulated copies of these + * registers it should only forward bits that it knows it can support. + * + * In future, presence of required kernel support will be indicated in flags. + */ +struct iommu_hw_info_arm_smmuv3 { + __u32 flags; + __u32 __reserved; + __u32 idr[6]; + __u32 iidr; + __u32 aidr; +}; + /** * enum iommu_hw_info_type - IOMMU Hardware Info Types * @IOMMU_HW_INFO_TYPE_NONE: Used by the drivers that do not report hardware * info * @IOMMU_HW_INFO_TYPE_INTEL_VTD: Intel VT-d iommu info type + * @IOMMU_HW_INFO_TYPE_ARM_SMMUV3: ARM SMMUv3 iommu info type */ enum iommu_hw_info_type { - IOMMU_HW_INFO_TYPE_NONE, - IOMMU_HW_INFO_TYPE_INTEL_VTD, + IOMMU_HW_INFO_TYPE_NONE = 0, + IOMMU_HW_INFO_TYPE_INTEL_VTD = 1, + IOMMU_HW_INFO_TYPE_ARM_SMMUV3 = 2, }; /** @@ -494,9 +563,17 @@ enum iommu_hw_info_type { * IOMMU_HWPT_GET_DIRTY_BITMAP * IOMMU_HWPT_SET_DIRTY_TRACKING * + * @IOMMU_HW_CAP_PASID_EXEC: Execute Permission Supported, user ignores it + * when the struct iommu_hw_info::out_max_pasid_log2 + * is zero. + * @IOMMU_HW_CAP_PASID_PRIV: Privileged Mode Supported, user ignores it + * when the struct iommu_hw_info::out_max_pasid_log2 + * is zero. */ enum iommufd_hw_capabilities { IOMMU_HW_CAP_DIRTY_TRACKING = 1 << 0, + IOMMU_HW_CAP_PCI_PASID_EXEC = 1 << 1, + IOMMU_HW_CAP_PCI_PASID_PRIV = 1 << 2, }; /** @@ -512,6 +589,9 @@ enum iommufd_hw_capabilities { * iommu_hw_info_type. * @out_capabilities: Output the generic iommu capability info type as defined * in the enum iommu_hw_capabilities. + * @out_max_pasid_log2: Output the width of PASIDs. 0 means no PASID support. + * PCI devices turn to out_capabilities to check if the + * specific capabilities is supported or not. 
* @__reserved: Must be 0 * * Query an iommu type specific hardware information data from an iommu behind @@ -535,7 +615,8 @@ struct iommu_hw_info { __u32 data_len; __aligned_u64 data_uptr; __u32 out_data_type; - __u32 __reserved; + __u8 out_max_pasid_log2; + __u8 __reserved[3]; __aligned_u64 out_capabilities; }; #define IOMMU_GET_HW_INFO _IO(IOMMUFD_TYPE, IOMMUFD_CMD_GET_HW_INFO) @@ -613,4 +694,271 @@ struct iommu_hwpt_get_dirty_bitmap { #define IOMMU_HWPT_GET_DIRTY_BITMAP _IO(IOMMUFD_TYPE, \ IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP) +/** + * enum iommu_hwpt_invalidate_data_type - IOMMU HWPT Cache Invalidation + * Data Type + * @IOMMU_HWPT_INVALIDATE_DATA_VTD_S1: Invalidation data for VTD_S1 + * @IOMMU_VIOMMU_INVALIDATE_DATA_ARM_SMMUV3: Invalidation data for ARM SMMUv3 + */ +enum iommu_hwpt_invalidate_data_type { + IOMMU_HWPT_INVALIDATE_DATA_VTD_S1 = 0, + IOMMU_VIOMMU_INVALIDATE_DATA_ARM_SMMUV3 = 1, +}; + +/** + * enum iommu_hwpt_vtd_s1_invalidate_flags - Flags for Intel VT-d + * stage-1 cache invalidation + * @IOMMU_VTD_INV_FLAGS_LEAF: Indicates whether the invalidation applies + * to all-levels page structure cache or just + * the leaf PTE cache. + */ +enum iommu_hwpt_vtd_s1_invalidate_flags { + IOMMU_VTD_INV_FLAGS_LEAF = 1 << 0, +}; + +/** + * struct iommu_hwpt_vtd_s1_invalidate - Intel VT-d cache invalidation + * (IOMMU_HWPT_INVALIDATE_DATA_VTD_S1) + * @addr: The start address of the range to be invalidated. It needs to + * be 4KB aligned. + * @npages: Number of contiguous 4K pages to be invalidated. + * @flags: Combination of enum iommu_hwpt_vtd_s1_invalidate_flags + * @__reserved: Must be 0 + * + * The Intel VT-d specific invalidation data for user-managed stage-1 cache + * invalidation in nested translation. Userspace uses this structure to + * tell the impacted cache scope after modifying the stage-1 page table. + * + * Invalidating all the caches related to the page table by setting @addr + * to be 0 and @npages to be U64_MAX. + * + * The device TLB will be invalidated automatically if ATS is enabled. + */ +struct iommu_hwpt_vtd_s1_invalidate { + __aligned_u64 addr; + __aligned_u64 npages; + __u32 flags; + __u32 __reserved; +}; + +/** + * struct iommu_viommu_arm_smmuv3_invalidate - ARM SMMUv3 cahce invalidation + * (IOMMU_VIOMMU_INVALIDATE_DATA_ARM_SMMUV3) + * @cmd: 128-bit cache invalidation command that runs in SMMU CMDQ. + * Must be little-endian. + * + * Supported command list only when passing in a vIOMMU via @hwpt_id: + * CMDQ_OP_TLBI_NSNH_ALL + * CMDQ_OP_TLBI_NH_VA + * CMDQ_OP_TLBI_NH_VAA + * CMDQ_OP_TLBI_NH_ALL + * CMDQ_OP_TLBI_NH_ASID + * CMDQ_OP_ATC_INV + * CMDQ_OP_CFGI_CD + * CMDQ_OP_CFGI_CD_ALL + * + * -EIO will be returned if the command is not supported. + */ +struct iommu_viommu_arm_smmuv3_invalidate { + __aligned_le64 cmd[2]; +}; + +/** + * struct iommu_hwpt_invalidate - ioctl(IOMMU_HWPT_INVALIDATE) + * @size: sizeof(struct iommu_hwpt_invalidate) + * @hwpt_id: ID of a nested HWPT or a vIOMMU, for cache invalidation + * @data_uptr: User pointer to an array of driver-specific cache invalidation + * data. + * @data_type: One of enum iommu_hwpt_invalidate_data_type, defining the data + * type of all the entries in the invalidation request array. It + * should be a type supported by the hwpt pointed by @hwpt_id. + * @entry_len: Length (in bytes) of a request entry in the request array + * @entry_num: Input the number of cache invalidation requests in the array. + * Output the number of requests successfully handled by kernel. + * @__reserved: Must be 0. 
+ * + * Invalidate iommu cache for user-managed page table or vIOMMU. Modifications + * on a user-managed page table should be followed by this operation, if a HWPT + * is passed in via @hwpt_id. Other caches, such as device cache or descriptor + * cache can be flushed if a vIOMMU is passed in via the @hwpt_id field. + * + * Each ioctl can support one or more cache invalidation requests in the array + * that has a total size of @entry_len * @entry_num. + * + * An empty invalidation request array by setting @entry_num==0 is allowed, and + * @entry_len and @data_uptr would be ignored in this case. This can be used to + * check if the given @data_type is supported or not by kernel. + */ +struct iommu_hwpt_invalidate { + __u32 size; + __u32 hwpt_id; + __aligned_u64 data_uptr; + __u32 data_type; + __u32 entry_len; + __u32 entry_num; + __u32 __reserved; +}; +#define IOMMU_HWPT_INVALIDATE _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_INVALIDATE) + +/** + * enum iommu_hwpt_pgfault_flags - flags for struct iommu_hwpt_pgfault + * @IOMMU_PGFAULT_FLAGS_PASID_VALID: The pasid field of the fault data is + * valid. + * @IOMMU_PGFAULT_FLAGS_LAST_PAGE: It's the last fault of a fault group. + */ +enum iommu_hwpt_pgfault_flags { + IOMMU_PGFAULT_FLAGS_PASID_VALID = (1 << 0), + IOMMU_PGFAULT_FLAGS_LAST_PAGE = (1 << 1), +}; + +/** + * enum iommu_hwpt_pgfault_perm - perm bits for struct iommu_hwpt_pgfault + * @IOMMU_PGFAULT_PERM_READ: request for read permission + * @IOMMU_PGFAULT_PERM_WRITE: request for write permission + * @IOMMU_PGFAULT_PERM_EXEC: (PCIE 10.4.1) request with a PASID that has the + * Execute Requested bit set in PASID TLP Prefix. + * @IOMMU_PGFAULT_PERM_PRIV: (PCIE 10.4.1) request with a PASID that has the + * Privileged Mode Requested bit set in PASID TLP + * Prefix. + */ +enum iommu_hwpt_pgfault_perm { + IOMMU_PGFAULT_PERM_READ = (1 << 0), + IOMMU_PGFAULT_PERM_WRITE = (1 << 1), + IOMMU_PGFAULT_PERM_EXEC = (1 << 2), + IOMMU_PGFAULT_PERM_PRIV = (1 << 3), +}; + +/** + * struct iommu_hwpt_pgfault - iommu page fault data + * @flags: Combination of enum iommu_hwpt_pgfault_flags + * @dev_id: id of the originated device + * @pasid: Process Address Space ID + * @grpid: Page Request Group Index + * @perm: Combination of enum iommu_hwpt_pgfault_perm + * @addr: Fault address + * @length: a hint of how much data the requestor is expecting to fetch. For + * example, if the PRI initiator knows it is going to do a 10MB + * transfer, it could fill in 10MB and the OS could pre-fault in + * 10MB of IOVA. It's default to 0 if there's no such hint. + * @cookie: kernel-managed cookie identifying a group of fault messages. The + * cookie number encoded in the last page fault of the group should + * be echoed back in the response message. + */ +struct iommu_hwpt_pgfault { + __u32 flags; + __u32 dev_id; + __u32 pasid; + __u32 grpid; + __u32 perm; + __u64 addr; + __u32 length; + __u32 cookie; +}; + +/** + * enum iommufd_page_response_code - Return status of fault handlers + * @IOMMUFD_PAGE_RESP_SUCCESS: Fault has been handled and the page tables + * populated, retry the access. This is the + * "Success" defined in PCI 10.4.2.1. + * @IOMMUFD_PAGE_RESP_INVALID: Could not handle this fault, don't retry the + * access. This is the "Invalid Request" in PCI + * 10.4.2.1. + */ +enum iommufd_page_response_code { + IOMMUFD_PAGE_RESP_SUCCESS = 0, + IOMMUFD_PAGE_RESP_INVALID = 1, +}; + +/** + * struct iommu_hwpt_page_response - IOMMU page fault response + * @cookie: The kernel-managed cookie reported in the fault message. 
+ * @code: One of response code in enum iommufd_page_response_code. + */ +struct iommu_hwpt_page_response { + __u32 cookie; + __u32 code; +}; + +/** + * struct iommu_fault_alloc - ioctl(IOMMU_FAULT_QUEUE_ALLOC) + * @size: sizeof(struct iommu_fault_alloc) + * @flags: Must be 0 + * @out_fault_id: The ID of the new FAULT + * @out_fault_fd: The fd of the new FAULT + * + * Explicitly allocate a fault handling object. + */ +struct iommu_fault_alloc { + __u32 size; + __u32 flags; + __u32 out_fault_id; + __u32 out_fault_fd; +}; +#define IOMMU_FAULT_QUEUE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_FAULT_QUEUE_ALLOC) + +/** + * enum iommu_viommu_type - Virtual IOMMU Type + * @IOMMU_VIOMMU_TYPE_DEFAULT: Reserved for future use + * @IOMMU_VIOMMU_TYPE_ARM_SMMUV3: ARM SMMUv3 driver specific type + */ +enum iommu_viommu_type { + IOMMU_VIOMMU_TYPE_DEFAULT = 0, + IOMMU_VIOMMU_TYPE_ARM_SMMUV3 = 1, +}; + +/** + * struct iommu_viommu_alloc - ioctl(IOMMU_VIOMMU_ALLOC) + * @size: sizeof(struct iommu_viommu_alloc) + * @flags: Must be 0 + * @type: Type of the virtual IOMMU. Must be defined in enum iommu_viommu_type + * @dev_id: The device's physical IOMMU will be used to back the virtual IOMMU + * @hwpt_id: ID of a nesting parent HWPT to associate to + * @out_viommu_id: Output virtual IOMMU ID for the allocated object + * + * Allocate a virtual IOMMU object, representing the underlying physical IOMMU's + * virtualization support that is a security-isolated slice of the real IOMMU HW + * that is unique to a specific VM. Operations global to the IOMMU are connected + * to the vIOMMU, such as: + * - Security namespace for guest owned ID, e.g. guest-controlled cache tags + * - Access to a sharable nesting parent pagetable across physical IOMMUs + * - Non-affiliated event reporting (e.g. an invalidation queue error) + * - Virtualization of various platforms IDs, e.g. RIDs and others + * - Delivery of paravirtualized invalidation + * - Direct assigned invalidation queues + * - Direct assigned interrupts + */ +struct iommu_viommu_alloc { + __u32 size; + __u32 flags; + __u32 type; + __u32 dev_id; + __u32 hwpt_id; + __u32 out_viommu_id; +}; +#define IOMMU_VIOMMU_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VIOMMU_ALLOC) + +/** + * struct iommu_vdevice_alloc - ioctl(IOMMU_VDEVICE_ALLOC) + * @size: sizeof(struct iommu_vdevice_alloc) + * @viommu_id: vIOMMU ID to associate with the virtual device + * @dev_id: The pyhsical device to allocate a virtual instance on the vIOMMU + * @__reserved: Must be 0 + * @virt_id: Virtual device ID per vIOMMU, e.g. vSID of ARM SMMUv3, vDeviceID + * of AMD IOMMU, and vID of a nested Intel VT-d to a Context Table. + * @out_vdevice_id: Output virtual instance ID for the allocated object + * @__reserved2: Must be 0 + * + * Allocate a virtual device instance (for a physical device) against a vIOMMU. + * This instance holds the device's information (related to its vIOMMU) in a VM. 
+ */ +struct iommu_vdevice_alloc { + __u32 size; + __u32 viommu_id; + __u32 dev_id; + __u32 __reserved; + __aligned_u64 virt_id; + __u32 out_vdevice_id; + __u32 __reserved2; +}; +#define IOMMU_VDEVICE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VDEVICE_ALLOC) #endif -- Gitee From cedca4d3635cde049151b5818df2cb66c2b1531f Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Fri, 3 Nov 2023 16:54:01 +0800 Subject: [PATCH 699/939] backends/iommufd: Add helpers for invalidating user-managed HWPT Signed-off-by: Nicolin Chen Signed-off-by: Zhenzhong Duan --- backends/iommufd.c | 30 ++++++++++++++++++++++++++++++ backends/trace-events | 1 + include/sysemu/iommufd.h | 3 +++ 3 files changed, 34 insertions(+) diff --git a/backends/iommufd.c b/backends/iommufd.c index c1260766f0..cf24370385 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -330,6 +330,36 @@ bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, return true; } +int iommufd_backend_invalidate_cache(IOMMUFDBackend *be, uint32_t hwpt_id, + uint32_t data_type, uint32_t entry_len, + uint32_t *entry_num, void *data_ptr) +{ + int ret, fd = be->fd; + struct iommu_hwpt_invalidate cache = { + .size = sizeof(cache), + .hwpt_id = hwpt_id, + .data_type = data_type, + .entry_len = entry_len, + .entry_num = *entry_num, + .data_uptr = (uintptr_t)data_ptr, + }; + + ret = ioctl(fd, IOMMU_HWPT_INVALIDATE, &cache); + + trace_iommufd_backend_invalidate_cache(fd, hwpt_id, data_type, entry_len, + *entry_num, cache.entry_num, + (uintptr_t)data_ptr, ret); + if (ret) { + *entry_num = cache.entry_num; + error_report("IOMMU_HWPT_INVALIDATE failed: %s", strerror(errno)); + ret = -errno; + } else { + g_assert(*entry_num == cache.entry_num); + } + + return ret; +} + static int hiod_iommufd_get_cap(HostIOMMUDevice *hiod, int cap, Error **errp) { HostIOMMUDeviceCaps *caps = &hiod->caps; diff --git a/backends/trace-events b/backends/trace-events index b02433710a..ef0ff98921 100644 --- a/backends/trace-events +++ b/backends/trace-events @@ -18,3 +18,4 @@ iommufd_backend_alloc_hwpt(int iommufd, uint32_t dev_id, uint32_t pt_id, uint32_ iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d id=%d (%d)" iommufd_backend_set_dirty(int iommufd, uint32_t hwpt_id, bool start, int ret) " iommufd=%d hwpt=%u enable=%d (%d)" iommufd_backend_get_dirty_bitmap(int iommufd, uint32_t hwpt_id, uint64_t iova, uint64_t size, uint64_t page_size, int ret) " iommufd=%d hwpt=%u iova=0x%"PRIx64" size=0x%"PRIx64" page_size=0x%"PRIx64" (%d)" +iommufd_backend_invalidate_cache(int iommufd, uint32_t hwpt_id, uint32_t data_type, uint32_t entry_len, uint32_t entry_num, uint32_t done_num, uint64_t data_ptr, int ret) " iommufd=%d hwpt_id=%u data_type=%u entry_len=%u entry_num=%u done_num=%u data_ptr=0x%"PRIx64" (%d)" diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h index 3b28c8a81c..f6596f6338 100644 --- a/include/sysemu/iommufd.h +++ b/include/sysemu/iommufd.h @@ -63,6 +63,9 @@ bool iommufd_backend_get_dirty_bitmap(IOMMUFDBackend *be, uint32_t hwpt_id, uint64_t iova, ram_addr_t size, uint64_t page_size, uint64_t *data, Error **errp); +int iommufd_backend_invalidate_cache(IOMMUFDBackend *be, uint32_t hwpt_id, + uint32_t data_type, uint32_t entry_len, + uint32_t *entry_num, void *data_ptr); #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd" #endif -- Gitee From 0e0956cb785f868dfe48201fcdead71dbdd234b0 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Mon, 15 Jan 2024 15:05:19 +0800 Subject: [PATCH 700/939] vfio/iommufd: Add 
properties and handlers to TYPE_HOST_IOMMU_DEVICE_IOMMUFD New added properties include IOMMUFD handle and devid, ioas. IOMMUFD handle and devid are used to allocate/free ioas, hwpt. ioas is used to re-attach IOMMUFD backed device to its default ioas id, i.e., when vIOMMU is disabled by guest. These properties are initialized in .realize() handler. New added handlers include [at|de]tach_hwpt. They are used to attaching/detaching hwpt. VFIO and VDPA have different way to attach and detach, so implementation will be in sub-class instead of HostIOMMUDeviceIOMMUFD. Add two wrappers host_iommu_device_iommufd_[at|de]tach_hwpt to wrap the two handlers. This is a prerequisite patch for following ones. Signed-off-by: Zhenzhong Duan --- backends/iommufd.c | 22 ++++++++++++++++++ include/sysemu/iommufd.h | 50 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+) diff --git a/backends/iommufd.c b/backends/iommufd.c index cf24370385..c10aa9b011 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -360,6 +360,26 @@ int iommufd_backend_invalidate_cache(IOMMUFDBackend *be, uint32_t hwpt_id, return ret; } +bool host_iommu_device_iommufd_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + uint32_t hwpt_id, Error **errp) +{ + HostIOMMUDeviceIOMMUFDClass *idevc = + HOST_IOMMU_DEVICE_IOMMUFD_GET_CLASS(idev); + + g_assert(idevc->attach_hwpt); + return idevc->attach_hwpt(idev, hwpt_id, errp); +} + +bool host_iommu_device_iommufd_detach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + Error **errp) +{ + HostIOMMUDeviceIOMMUFDClass *idevc = + HOST_IOMMU_DEVICE_IOMMUFD_GET_CLASS(idev); + + g_assert(idevc->detach_hwpt); + return idevc->detach_hwpt(idev, errp); +} + static int hiod_iommufd_get_cap(HostIOMMUDevice *hiod, int cap, Error **errp) { HostIOMMUDeviceCaps *caps = &hiod->caps; @@ -398,6 +418,8 @@ static const TypeInfo types[] = { }, { .name = TYPE_HOST_IOMMU_DEVICE_IOMMUFD, .parent = TYPE_HOST_IOMMU_DEVICE, + .instance_size = sizeof(HostIOMMUDeviceIOMMUFD), + .class_size = sizeof(HostIOMMUDeviceIOMMUFDClass), .class_init = hiod_iommufd_class_init, .abstract = true, } diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h index f6596f6338..3dc6934144 100644 --- a/include/sysemu/iommufd.h +++ b/include/sysemu/iommufd.h @@ -68,4 +68,54 @@ int iommufd_backend_invalidate_cache(IOMMUFDBackend *be, uint32_t hwpt_id, uint32_t *entry_num, void *data_ptr); #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd" +OBJECT_DECLARE_TYPE(HostIOMMUDeviceIOMMUFD, HostIOMMUDeviceIOMMUFDClass, + HOST_IOMMU_DEVICE_IOMMUFD) + +/* Abstract of host IOMMU device with iommufd backend */ +struct HostIOMMUDeviceIOMMUFD { + HostIOMMUDevice parent_obj; + + IOMMUFDBackend *iommufd; + uint32_t devid; + uint32_t ioas_id; +}; + +struct HostIOMMUDeviceIOMMUFDClass { + HostIOMMUDeviceClass parent_class; + + /** + * @attach_hwpt: attach host IOMMU device to IOMMUFD hardware page table. + * VFIO and VDPA device can have different implementation. + * + * Mandatory callback. + * + * @idev: host IOMMU device backed by IOMMUFD backend. + * + * @hwpt_id: ID of IOMMUFD hardware page table. + * + * @errp: pass an Error out when attachment fails. + * + * Returns: true on success, false on failure. + */ + bool (*attach_hwpt)(HostIOMMUDeviceIOMMUFD *idev, uint32_t hwpt_id, + Error **errp); + /** + * @detach_hwpt: detach host IOMMU device from IOMMUFD hardware page table. + * VFIO and VDPA device can have different implementation. + * + * Mandatory callback. + * + * @idev: host IOMMU device backed by IOMMUFD backend. 
+ * + * @errp: pass an Error out when attachment fails. + * + * Returns: true on success, false on failure. + */ + bool (*detach_hwpt)(HostIOMMUDeviceIOMMUFD *idev, Error **errp); +}; + +bool host_iommu_device_iommufd_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + uint32_t hwpt_id, Error **errp); +bool host_iommu_device_iommufd_detach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + Error **errp); #endif -- Gitee From 53a82c6a5a22bb41e9bd3f754479baf4ce0845bf Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Mon, 5 Aug 2024 09:29:00 +0800 Subject: [PATCH 701/939] HostIOMMUDevice: Introduce realize_late callback Previously we have a realize() callback which is called before attachment. But there are still some elements e.g., ioas not ready before attachment. So we need a realize_late() callback to further initialize them. Currently, this callback is only useful for iommufd backend. For legacy backend nothing needs to be initialized after attachment. Signed-off-by: Zhenzhong Duan --- hw/vfio/common.c | 18 +++++++++++++++--- include/sysemu/host_iommu_device.h | 17 +++++++++++++++++ 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index a8bc1c6055..0be63c5fbc 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -1654,6 +1654,7 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev, const VFIOIOMMUClass *ops = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY)); HostIOMMUDevice *hiod = NULL; + HostIOMMUDeviceClass *hiod_ops = NULL; int ret; if (vbasedev->iommufd) { @@ -1664,17 +1665,28 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev, if (!vbasedev->mdev) { hiod = HOST_IOMMU_DEVICE(object_new(ops->hiod_typename)); + hiod_ops = HOST_IOMMU_DEVICE_GET_CLASS(hiod); vbasedev->hiod = hiod; } ret = ops->attach_device(name, vbasedev, as, errp); if (ret) { - object_unref(hiod); - vbasedev->hiod = NULL; - return ret; + goto err_attach; + } + + if (hiod_ops && hiod_ops->realize_late && + !hiod_ops->realize_late(hiod, vbasedev, errp)) { + ops->detach_device(vbasedev); + ret = -EINVAL; + goto err_attach; } return 0; + +err_attach: + object_unref(hiod); + vbasedev->hiod = NULL; + return ret; } void vfio_detach_device(VFIODevice *vbasedev) diff --git a/include/sysemu/host_iommu_device.h b/include/sysemu/host_iommu_device.h index e4d8300350..84131f5495 100644 --- a/include/sysemu/host_iommu_device.h +++ b/include/sysemu/host_iommu_device.h @@ -64,6 +64,23 @@ struct HostIOMMUDeviceClass { * Returns: true on success, false on failure. */ bool (*realize)(HostIOMMUDevice *hiod, void *opaque, Error **errp); + /** + * @realize_late: initialize host IOMMU device instance after attachment, + * some elements e.g., ioas are ready only after attachment. + * This callback initialize them. + * + * Optional callback. + * + * @hiod: pointer to a host IOMMU device instance. + * + * @opaque: pointer to agent device of this host IOMMU device, + * e.g., VFIO base device or VDPA device. + * + * @errp: pass an Error out when realize fails. + * + * Returns: true on success, false on failure. + */ + bool (*realize_late)(HostIOMMUDevice *hiod, void *opaque, Error **errp); /** * @get_cap: check if a host IOMMU device capability is supported. * -- Gitee From b727a28ce2cf062473ca011dd69697e0b7826a25 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Mon, 5 Aug 2024 09:29:00 +0800 Subject: [PATCH 702/939] vfio/iommufd: Implement HostIOMMUDeviceClass::realize_late() handler There are three iommufd related elements iommufd handle, devid and ioas_id. 
ioas_id is ready only after VFIO device attachment. Device id and iommufd handle are ready before attachment, but they are all iommufd related elements, initialize them together with ioas_id. Signed-off-by: Zhenzhong Duan --- hw/vfio/iommufd.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 3d4f902ae5..47a8823146 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -827,6 +827,22 @@ static void vfio_iommu_iommufd_class_init(ObjectClass *klass, void *data) vioc->query_dirty_bitmap = iommufd_query_dirty_bitmap; }; +static bool hiod_iommufd_vfio_realize_late(HostIOMMUDevice *hiod, void *opaque, + Error **errp) +{ + VFIODevice *vdev = opaque; + VFIOIOMMUFDContainer *container = container_of(vdev->bcontainer, + VFIOIOMMUFDContainer, + bcontainer); + HostIOMMUDeviceIOMMUFD *idev = HOST_IOMMU_DEVICE_IOMMUFD(hiod); + + idev->iommufd = vdev->iommufd; + idev->devid = vdev->devid; + idev->ioas_id = container->ioas_id; + + return true; +} + static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, Error **errp) { @@ -858,6 +874,7 @@ static void hiod_iommufd_vfio_class_init(ObjectClass *oc, void *data) HostIOMMUDeviceClass *hiodc = HOST_IOMMU_DEVICE_CLASS(oc); hiodc->realize = hiod_iommufd_vfio_realize; + hiodc->realize_late = hiod_iommufd_vfio_realize_late; }; static const TypeInfo types[] = { -- Gitee From aea706f6a71ddbcc9bd342ece14991f8f8261224 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Thu, 11 Jan 2024 17:26:50 +0800 Subject: [PATCH 703/939] vfio/iommufd: Implement [at|de]tach_hwpt handlers Implement [at|de]tach_hwpt handlers in VFIO subsystem. vIOMMU utilizes them to attach to or detach from hwpt on host side. To achieve that, a new property vdev is add to TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO which is initialized in .realize() handler. 
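A minimal sketch of how a vIOMMU-side caller is expected to drive these hooks through the host_iommu_device_iommufd_[at|de]tach_hwpt wrappers. The nested HWPT ID is a placeholder here, and falling back to the default IOAS on failure mirrors the cleanup done elsewhere in this series; this is illustration, not part of the patch:

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "sysemu/iommufd.h"

/*
 * Sketch only: move a device onto a freshly allocated nested HWPT and,
 * on failure, re-attach it to its default IOAS so it is never left
 * detached. nested_hwpt_id is a placeholder that would come from
 * iommufd_backend_alloc_hwpt() in a real caller.
 */
static bool example_switch_to_nested_hwpt(HostIOMMUDeviceIOMMUFD *idev,
                                          uint32_t nested_hwpt_id,
                                          Error **errp)
{
    /* Dispatches to the VFIO (or VDPA) attach_hwpt class hook. */
    if (!host_iommu_device_iommufd_attach_hwpt(idev, nested_hwpt_id, errp)) {
        /* Fall back to the default IOAS, mirroring the series' cleanup. */
        host_iommu_device_iommufd_attach_hwpt(idev, idev->ioas_id, NULL);
        return false;
    }
    return true;
}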
Signed-off-by: Yi Liu Signed-off-by: Zhenzhong Duan [Shameer: Changed ret for host_iommu_device_iommufd_vfio_detach_hwpt()] Signed-off-by: Shameer Kolothum --- hw/vfio/iommufd.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 47a8823146..528023b95b 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -827,6 +827,24 @@ static void vfio_iommu_iommufd_class_init(ObjectClass *klass, void *data) vioc->query_dirty_bitmap = iommufd_query_dirty_bitmap; }; +static bool +host_iommu_device_iommufd_vfio_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + uint32_t hwpt_id, Error **errp) +{ + VFIODevice *vbasedev = HOST_IOMMU_DEVICE(idev)->agent; + + return !iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt_id, errp); +} + +static bool +host_iommu_device_iommufd_vfio_detach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + Error **errp) +{ + VFIODevice *vbasedev = HOST_IOMMU_DEVICE(idev)->agent; + + return !iommufd_cdev_detach_ioas_hwpt(vbasedev, errp); +} + static bool hiod_iommufd_vfio_realize_late(HostIOMMUDevice *hiod, void *opaque, Error **errp) { @@ -872,9 +890,13 @@ static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, static void hiod_iommufd_vfio_class_init(ObjectClass *oc, void *data) { HostIOMMUDeviceClass *hiodc = HOST_IOMMU_DEVICE_CLASS(oc); + HostIOMMUDeviceIOMMUFDClass *idevc = HOST_IOMMU_DEVICE_IOMMUFD_CLASS(oc); hiodc->realize = hiod_iommufd_vfio_realize; hiodc->realize_late = hiod_iommufd_vfio_realize_late; + + idevc->attach_hwpt = host_iommu_device_iommufd_vfio_attach_hwpt; + idevc->detach_hwpt = host_iommu_device_iommufd_vfio_detach_hwpt; }; static const TypeInfo types[] = { -- Gitee From 207259b8f08e87b4a741a8b7884e699c95641a2e Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Sat, 13 Apr 2024 00:15:17 +0000 Subject: [PATCH 704/939] backends/iommufd: Introduce iommufd_backend_alloc_viommu Add a helper to allocate a viommu object. 
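At the uapi level the helper is a thin wrapper around IOMMU_VIOMMU_ALLOC; a minimal userspace-style sketch using the struct iommu_viommu_alloc layout added earlier in this series (the iommufd fd, dev_id and nesting-parent hwpt_id are assumed to exist already):

#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/iommufd.h>

/*
 * Userspace-style sketch of what the helper issues. Requires kernel
 * headers that carry the IOMMU_VIOMMU_ALLOC additions from this series.
 */
static int example_viommu_alloc(int iommufd, uint32_t dev_id,
                                uint32_t parent_hwpt_id,
                                uint32_t *out_viommu_id)
{
    struct iommu_viommu_alloc cmd = {
        .size = sizeof(cmd),
        .type = IOMMU_VIOMMU_TYPE_ARM_SMMUV3,
        .dev_id = dev_id,          /* device backing the vIOMMU */
        .hwpt_id = parent_hwpt_id, /* stage-2 nesting parent HWPT */
    };

    if (ioctl(iommufd, IOMMU_VIOMMU_ALLOC, &cmd)) {
        perror("IOMMU_VIOMMU_ALLOC");
        return -1;
    }

    *out_viommu_id = cmd.out_viommu_id;
    return 0;
}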
Signed-off-by: Nicolin Chen --- backends/iommufd.c | 35 +++++++++++++++++++++++++++++++++++ backends/trace-events | 1 + include/sysemu/iommufd.h | 10 ++++++++++ 3 files changed, 46 insertions(+) diff --git a/backends/iommufd.c b/backends/iommufd.c index c10aa9b011..82368a3918 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -360,6 +360,41 @@ int iommufd_backend_invalidate_cache(IOMMUFDBackend *be, uint32_t hwpt_id, return ret; } +struct IOMMUFDViommu *iommufd_backend_alloc_viommu(IOMMUFDBackend *be, + uint32_t dev_id, + uint32_t viommu_type, + uint32_t hwpt_id) +{ + int ret, fd = be->fd; + struct IOMMUFDViommu *viommu = g_malloc(sizeof(*viommu)); + struct iommu_viommu_alloc alloc_viommu = { + .size = sizeof(alloc_viommu), + .type = viommu_type, + .dev_id = dev_id, + .hwpt_id = hwpt_id, + }; + + if (!viommu) { + error_report("failed to allocate viommu object"); + return NULL; + } + + ret = ioctl(fd, IOMMU_VIOMMU_ALLOC, &alloc_viommu); + + trace_iommufd_backend_alloc_viommu(fd, viommu_type, dev_id, hwpt_id, + alloc_viommu.out_viommu_id, ret); + if (ret) { + error_report("IOMMU_VIOMMU_ALLOC failed: %s", strerror(errno)); + g_free(viommu); + return NULL; + } + + viommu->viommu_id = alloc_viommu.out_viommu_id; + viommu->s2_hwpt_id = hwpt_id; + viommu->iommufd = be; + return viommu; +} + bool host_iommu_device_iommufd_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev, uint32_t hwpt_id, Error **errp) { diff --git a/backends/trace-events b/backends/trace-events index ef0ff98921..c24cd378df 100644 --- a/backends/trace-events +++ b/backends/trace-events @@ -19,3 +19,4 @@ iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d id=%d (% iommufd_backend_set_dirty(int iommufd, uint32_t hwpt_id, bool start, int ret) " iommufd=%d hwpt=%u enable=%d (%d)" iommufd_backend_get_dirty_bitmap(int iommufd, uint32_t hwpt_id, uint64_t iova, uint64_t size, uint64_t page_size, int ret) " iommufd=%d hwpt=%u iova=0x%"PRIx64" size=0x%"PRIx64" page_size=0x%"PRIx64" (%d)" iommufd_backend_invalidate_cache(int iommufd, uint32_t hwpt_id, uint32_t data_type, uint32_t entry_len, uint32_t entry_num, uint32_t done_num, uint64_t data_ptr, int ret) " iommufd=%d hwpt_id=%u data_type=%u entry_len=%u entry_num=%u done_num=%u data_ptr=0x%"PRIx64" (%d)" +iommufd_backend_alloc_viommu(int iommufd, uint32_t type, uint32_t dev_id, uint32_t hwpt_id, uint32_t viommu_id, int ret) " iommufd=%d type=%u dev_id=%u hwpt_id=%u viommu_id=%u (%d)" diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h index 3dc6934144..05a08c49c2 100644 --- a/include/sysemu/iommufd.h +++ b/include/sysemu/iommufd.h @@ -39,6 +39,12 @@ struct IOMMUFDBackend { /*< public >*/ }; +typedef struct IOMMUFDViommu { + IOMMUFDBackend *iommufd; + uint32_t s2_hwpt_id; + uint32_t viommu_id; +} IOMMUFDViommu; + int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp); void iommufd_backend_disconnect(IOMMUFDBackend *be); @@ -66,6 +72,10 @@ bool iommufd_backend_get_dirty_bitmap(IOMMUFDBackend *be, uint32_t hwpt_id, int iommufd_backend_invalidate_cache(IOMMUFDBackend *be, uint32_t hwpt_id, uint32_t data_type, uint32_t entry_len, uint32_t *entry_num, void *data_ptr); +struct IOMMUFDViommu *iommufd_backend_alloc_viommu(IOMMUFDBackend *be, + uint32_t dev_id, + uint32_t viommu_type, + uint32_t hwpt_id); #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd" OBJECT_DECLARE_TYPE(HostIOMMUDeviceIOMMUFD, HostIOMMUDeviceIOMMUFDClass, -- Gitee From 005b8f4b6cef11982abcc2c071cbe40b69fb22e7 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Sat, 13 
Apr 2024 00:21:22 +0000 Subject: [PATCH 705/939] backends/iommufd: Introduce iommufd_vdev_alloc Add a helper to allocate an iommufd device's virtual device (in the user space) per a viommu instance. Signed-off-by: Nicolin Chen --- backends/iommufd.c | 31 +++++++++++++++++++++++++++++++ backends/trace-events | 1 + include/sysemu/iommufd.h | 11 +++++++++++ 3 files changed, 43 insertions(+) diff --git a/backends/iommufd.c b/backends/iommufd.c index 82368a3918..af3376d0bf 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -395,6 +395,37 @@ struct IOMMUFDViommu *iommufd_backend_alloc_viommu(IOMMUFDBackend *be, return viommu; } +struct IOMMUFDVdev *iommufd_backend_alloc_vdev(HostIOMMUDeviceIOMMUFD *idev, + IOMMUFDViommu *viommu, + uint64_t virt_id) +{ + int ret, fd = viommu->iommufd->fd; + struct IOMMUFDVdev *vdev = g_malloc(sizeof(*vdev)); + struct iommu_vdevice_alloc alloc_vdev = { + .size = sizeof(alloc_vdev), + .viommu_id = viommu->viommu_id, + .dev_id = idev->devid, + .virt_id = virt_id, + }; + + ret = ioctl(fd, IOMMU_VDEVICE_ALLOC, &alloc_vdev); + + trace_iommufd_backend_alloc_vdev(fd, idev->devid, viommu->viommu_id, virt_id, + alloc_vdev.out_vdevice_id, ret); + + if (ret) { + error_report("IOMMU_VDEVICE_ALLOC failed: %s", strerror(errno)); + g_free(vdev); + return NULL; + } + + vdev->idev = idev; + vdev->viommu = viommu; + vdev->virt_id = virt_id; + vdev->vdev_id = alloc_vdev.out_vdevice_id; + return vdev; +} + bool host_iommu_device_iommufd_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev, uint32_t hwpt_id, Error **errp) { diff --git a/backends/trace-events b/backends/trace-events index c24cd378df..e150a37e9a 100644 --- a/backends/trace-events +++ b/backends/trace-events @@ -20,3 +20,4 @@ iommufd_backend_set_dirty(int iommufd, uint32_t hwpt_id, bool start, int ret) " iommufd_backend_get_dirty_bitmap(int iommufd, uint32_t hwpt_id, uint64_t iova, uint64_t size, uint64_t page_size, int ret) " iommufd=%d hwpt=%u iova=0x%"PRIx64" size=0x%"PRIx64" page_size=0x%"PRIx64" (%d)" iommufd_backend_invalidate_cache(int iommufd, uint32_t hwpt_id, uint32_t data_type, uint32_t entry_len, uint32_t entry_num, uint32_t done_num, uint64_t data_ptr, int ret) " iommufd=%d hwpt_id=%u data_type=%u entry_len=%u entry_num=%u done_num=%u data_ptr=0x%"PRIx64" (%d)" iommufd_backend_alloc_viommu(int iommufd, uint32_t type, uint32_t dev_id, uint32_t hwpt_id, uint32_t viommu_id, int ret) " iommufd=%d type=%u dev_id=%u hwpt_id=%u viommu_id=%u (%d)" +iommufd_backend_alloc_vdev(int iommufd, uint32_t dev_id, uint32_t viommu_id, uint64_t virt_id, uint32_t vdev_id, int ret) " iommufd=%d dev_id=%u viommu_id=%u virt_id=0x%"PRIx64" vdev_id=%u (%d)" diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h index 05a08c49c2..0284e95460 100644 --- a/include/sysemu/iommufd.h +++ b/include/sysemu/iommufd.h @@ -128,4 +128,15 @@ bool host_iommu_device_iommufd_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev, uint32_t hwpt_id, Error **errp); bool host_iommu_device_iommufd_detach_hwpt(HostIOMMUDeviceIOMMUFD *idev, Error **errp); + +typedef struct IOMMUFDVdev { + HostIOMMUDeviceIOMMUFD *idev; + IOMMUFDViommu *viommu; + uint32_t vdev_id; + uint64_t virt_id; +} IOMMUFDVdev; + +struct IOMMUFDVdev *iommufd_backend_alloc_vdev(HostIOMMUDeviceIOMMUFD *idev, + IOMMUFDViommu *viommu, + uint64_t virt_id); #endif -- Gitee From 2be28f75e4ed2a0a35549dd1a545e0655e63973d Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 12 Apr 2024 23:27:54 +0000 Subject: [PATCH 706/939] backends/iommufd: Introduce iommufd_viommu_invalidate_cache Similar to 
iommufd_backend_invalidate_cache for iotlb invalidation via IOMMU_HWPT_INVALIDATE ioctl, add a new helper for viommu specific cache invalidation via IOMMU_VIOMMU_INVALIDATE ioctl. Signed-off-by: Nicolin Chen --- backends/iommufd.c | 31 +++++++++++++++++++++++++++++++ backends/trace-events | 1 + include/sysemu/iommufd.h | 3 +++ 3 files changed, 35 insertions(+) diff --git a/backends/iommufd.c b/backends/iommufd.c index af3376d0bf..ee6f5bcf65 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -426,6 +426,37 @@ struct IOMMUFDVdev *iommufd_backend_alloc_vdev(HostIOMMUDeviceIOMMUFD *idev, return vdev; } +int iommufd_viommu_invalidate_cache(IOMMUFDBackend *be, uint32_t viommu_id, + uint32_t data_type, uint32_t entry_len, + uint32_t *entry_num, void *data_ptr) +{ + int ret, fd = be->fd; + struct iommu_hwpt_invalidate cache = { + .size = sizeof(cache), + .hwpt_id = viommu_id, + .data_type = data_type, + .entry_len = entry_len, + .entry_num = *entry_num, + .data_uptr = (uint64_t)data_ptr, + }; + + ret = ioctl(fd, IOMMU_HWPT_INVALIDATE, &cache); + + trace_iommufd_viommu_invalidate_cache(fd, viommu_id, data_type, + entry_len, *entry_num, + cache.entry_num, + (uint64_t)data_ptr, ret); + if (ret) { + *entry_num = cache.entry_num; + error_report("IOMMU_VIOMMU_INVALIDATE failed: %s", strerror(errno)); + ret = -errno; + } else { + g_assert(*entry_num == cache.entry_num); + } + + return ret; +} + bool host_iommu_device_iommufd_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev, uint32_t hwpt_id, Error **errp) { diff --git a/backends/trace-events b/backends/trace-events index e150a37e9a..f8592a2711 100644 --- a/backends/trace-events +++ b/backends/trace-events @@ -21,3 +21,4 @@ iommufd_backend_get_dirty_bitmap(int iommufd, uint32_t hwpt_id, uint64_t iova, u iommufd_backend_invalidate_cache(int iommufd, uint32_t hwpt_id, uint32_t data_type, uint32_t entry_len, uint32_t entry_num, uint32_t done_num, uint64_t data_ptr, int ret) " iommufd=%d hwpt_id=%u data_type=%u entry_len=%u entry_num=%u done_num=%u data_ptr=0x%"PRIx64" (%d)" iommufd_backend_alloc_viommu(int iommufd, uint32_t type, uint32_t dev_id, uint32_t hwpt_id, uint32_t viommu_id, int ret) " iommufd=%d type=%u dev_id=%u hwpt_id=%u viommu_id=%u (%d)" iommufd_backend_alloc_vdev(int iommufd, uint32_t dev_id, uint32_t viommu_id, uint64_t virt_id, uint32_t vdev_id, int ret) " iommufd=%d dev_id=%u viommu_id=%u virt_id=0x%"PRIx64" vdev_id=%u (%d)" +iommufd_viommu_invalidate_cache(int iommufd, uint32_t viommu_id, uint32_t data_type, uint32_t entry_len, uint32_t entry_num, uint32_t done_num, uint64_t data_ptr, int ret) " iommufd=%d viommu_id=%u data_type=%u entry_len=%u entry_num=%u done_num=%u data_ptr=0x%"PRIx64" (%d)" diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h index 0284e95460..0f2c826036 100644 --- a/include/sysemu/iommufd.h +++ b/include/sysemu/iommufd.h @@ -76,6 +76,9 @@ struct IOMMUFDViommu *iommufd_backend_alloc_viommu(IOMMUFDBackend *be, uint32_t dev_id, uint32_t viommu_type, uint32_t hwpt_id); +int iommufd_viommu_invalidate_cache(IOMMUFDBackend *be, uint32_t viommu_id, + uint32_t data_type, uint32_t entry_len, + uint32_t *entry_num, void *data_ptr); #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd" OBJECT_DECLARE_TYPE(HostIOMMUDeviceIOMMUFD, HostIOMMUDeviceIOMMUFDClass, -- Gitee From d589010512005bfc698f30417911e4b14478c81b Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 22 Jun 2022 01:30:39 -0700 Subject: [PATCH 707/939] hw/arm/smmu-common: Add a nested flag to SMMUState Add a nested flag in the 
SMMUState, passed in from device property. Signed-off-by: Nicolin Chen --- hw/arm/smmu-common.c | 1 + hw/arm/smmuv3.c | 5 +++++ include/hw/arm/smmu-common.h | 4 ++++ 3 files changed, 10 insertions(+) diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c index 9a8ac45431..c5f3e02065 100644 --- a/hw/arm/smmu-common.c +++ b/hw/arm/smmu-common.c @@ -683,6 +683,7 @@ static Property smmu_dev_properties[] = { DEFINE_PROP_UINT8("bus_num", SMMUState, bus_num, 0), DEFINE_PROP_LINK("primary-bus", SMMUState, primary_bus, TYPE_PCI_BUS, PCIBus *), + DEFINE_PROP_BOOL("nested", SMMUState, nested, false), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c index c3871ae067..64ca4c5542 100644 --- a/hw/arm/smmuv3.c +++ b/hw/arm/smmuv3.c @@ -1746,6 +1746,11 @@ static void smmu_realize(DeviceState *d, Error **errp) SysBusDevice *dev = SYS_BUS_DEVICE(d); Error *local_err = NULL; + if (s->stage && strcmp("1", s->stage)) { + /* Only support nested with an stage1 only vSMMU */ + sys->nested = false; + } + c->parent_realize(d, &local_err); if (local_err) { error_propagate(errp, local_err); diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h index fd8d772da1..eae5d4d05b 100644 --- a/include/hw/arm/smmu-common.h +++ b/include/hw/arm/smmu-common.h @@ -22,6 +22,7 @@ #include "hw/sysbus.h" #include "hw/pci/pci.h" #include "qom/object.h" +#include "sysemu/iommufd.h" #define SMMU_PCI_BUS_MAX 256 #define SMMU_PCI_DEVFN_MAX 256 @@ -136,6 +137,9 @@ struct SMMUState { const char *mrtypename; MemoryRegion iomem; + /* Nested SMMU */ + bool nested; + GHashTable *smmu_pcibus_by_busptr; GHashTable *configs; /* cache for configuration data */ GHashTable *iotlb; -- Gitee From 6c330f39cc08e4c641a3567e2b6ad0ebcadf5165 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 21 Jun 2024 21:22:04 +0000 Subject: [PATCH 708/939] hw/arm/smmu-common: Bypass emulated IOTLB for a nested SMMU If a vSMMU is configured as a nested one, HW IOTLB will be used and all cache invalidation should be done to the HW IOTLB too, v.s. the emulated iotlb. In this case, an iommu notifier isn't registered, as the devices behind a nested SMMU would stay in the system address space for stage-2 mappings. However, the KVM code still requests an iommu address space to translate an MSI doorbell gIOVA via get_msi_address_space() and translate(). Since a nested SMMU doesn't register an iommu notifier to flush emulated iotlb, bypass the emulated IOTLB and always walk through the guest-level IO page table. Note that regular nested SMMU could still register an iommu notifier for IOTLB invalidation, since QEMU traps the invalidation commands. But this would result in invalidation inefficiency since each invlaidation would be doubled for both HW IOTLB and the emulated IOTLB. Also, with NVIDIA's CMDQV feature on its Grace SoC, invalidation commands are issued to the CMDQ HW direclty, without any trapping. So, there is no way to maintain the emulated IOTLB. Meanwhile, the stage-1 translation request from KVM is only activated in case of an MSI table update, which does not happen that often to impact performance if walking through the guest RAM every time. 
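For context, configuring a vSMMU "as a nested one" means setting the new "nested" property before realize. A hypothetical, abbreviated board-side wiring is sketched below: only the "nested" property name comes from this series, the "stage" property name is inferred from the smmu_realize() check above, and the actual creation path a machine type uses is an assumption:

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "hw/pci/pci.h"
#include "hw/sysbus.h"
#include "hw/qdev-properties.h"

/*
 * Hypothetical board-side wiring, abbreviated: IRQ and MMIO mapping are
 * omitted. Property names other than "nested" are assumptions.
 */
static void example_create_nested_smmu(PCIBus *pci_bus)
{
    DeviceState *dev = qdev_new("arm-smmuv3");

    object_property_set_link(OBJECT(dev), "primary-bus", OBJECT(pci_bus),
                             &error_abort);
    qdev_prop_set_string(dev, "stage", "1"); /* nested needs a stage-1-only vSMMU */
    qdev_prop_set_bit(dev, "nested", true);
    sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal);
}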
Signed-off-by: Nicolin Chen --- hw/arm/smmu-common.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c index c5f3e02065..016418a48c 100644 --- a/hw/arm/smmu-common.c +++ b/hw/arm/smmu-common.c @@ -75,6 +75,16 @@ SMMUTLBEntry *smmu_iotlb_lookup(SMMUState *bs, SMMUTransCfg *cfg, uint8_t level = 4 - (inputsize - 4) / stride; SMMUTLBEntry *entry = NULL; + /* + * Stage-1 translation with a nested SMMU in general uses HW IOTLB. However, + * KVM still requests for an iommu address space for an MSI fixup by looking + * up stage-1 page table. Make sure we don't go through the emulated pathway + * so that the emulated iotlb will not need any invalidation. + */ + if (bs->nested) { + return NULL; + } + while (level <= 3) { uint64_t subpage_size = 1ULL << level_shift(level, tt->granule_sz); uint64_t mask = subpage_size - 1; @@ -110,6 +120,16 @@ void smmu_iotlb_insert(SMMUState *bs, SMMUTransCfg *cfg, SMMUTLBEntry *new) SMMUIOTLBKey *key = g_new0(SMMUIOTLBKey, 1); uint8_t tg = (new->granule - 10) / 2; + /* + * Stage-1 translation with a nested SMMU in general uses HW IOTLB. However, + * KVM still requests for an iommu address space for an MSI fixup by looking + * up stage-1 page table. Make sure we don't go through the emulated pathway + * so that the emulated iotlb will not need any invalidation. + */ + if (bs->nested) { + return; + } + if (g_hash_table_size(bs->iotlb) >= SMMU_IOTLB_MAX_SIZE) { smmu_iotlb_inv_all(bs); } -- Gitee From 2fea4f93632679afcb15f0c35b3d9abeede37778 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 10 Apr 2024 16:37:25 +0000 Subject: [PATCH 709/939] hw/arm/smmu-common: Extract smmu_get_sbus and smmu_get_sdev helpers Add two helpers to get sbus and sdev respectively. These will be used by the following patch adding set/unset_iommu_device ops. 
Signed-off-by: Nicolin Chen --- hw/arm/smmu-common.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c index 016418a48c..03d9ff58d4 100644 --- a/hw/arm/smmu-common.c +++ b/hw/arm/smmu-common.c @@ -589,12 +589,9 @@ SMMUPciBus *smmu_find_smmu_pcibus(SMMUState *s, uint8_t bus_num) return NULL; } -static AddressSpace *smmu_find_add_as(PCIBus *bus, void *opaque, int devfn) +static SMMUPciBus *smmu_get_sbus(SMMUState *s, PCIBus *bus) { - SMMUState *s = opaque; SMMUPciBus *sbus = g_hash_table_lookup(s->smmu_pcibus_by_busptr, bus); - SMMUDevice *sdev; - static unsigned int index; if (!sbus) { sbus = g_malloc0(sizeof(SMMUPciBus) + @@ -603,7 +600,15 @@ static AddressSpace *smmu_find_add_as(PCIBus *bus, void *opaque, int devfn) g_hash_table_insert(s->smmu_pcibus_by_busptr, bus, sbus); } - sdev = sbus->pbdev[devfn]; + return sbus; +} + +static SMMUDevice *smmu_get_sdev(SMMUState *s, SMMUPciBus *sbus, + PCIBus *bus, int devfn) +{ + SMMUDevice *sdev = sbus->pbdev[devfn]; + static unsigned int index; + if (!sdev) { char *name = g_strdup_printf("%s-%d-%d", s->mrtypename, devfn, index++); @@ -622,6 +627,15 @@ static AddressSpace *smmu_find_add_as(PCIBus *bus, void *opaque, int devfn) g_free(name); } + return sdev; +} + +static AddressSpace *smmu_find_add_as(PCIBus *bus, void *opaque, int devfn) +{ + SMMUState *s = opaque; + SMMUPciBus *sbus = smmu_get_sbus(s, bus); + SMMUDevice *sdev = smmu_get_sdev(s, sbus, bus, devfn); + return &sdev->as; } -- Gitee From 539e12641dc2db30a6fea7a0f061e163bc245d79 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 22 Jun 2022 02:16:52 -0700 Subject: [PATCH 710/939] hw/arm/smmu-common: Add set/unset_iommu_device callback Implement a set_iommu_device callback: - Find an existing S2 hwpt to test attach() or allocate a new one (Devices behind the same physical SMMU should share an S2 HWPT.) - Attach the device to the S2 hwpt and add it to its device list And add an unset_iommu_device doing the opposite cleanup routine. 
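The handler below seeds the vIOMMU with two fixed vSTEs, word 0 being 0x1 for the abort pagetable and 0x9 for the bypass pagetable. A small background sketch decoding those constants, based on the SMMUv3 STE layout (V in bit 0, Config in bits [3:1]); this is explanatory material, not part of the patch:

#include <assert.h>
#include <stdint.h>

/*
 * Background sketch: decode word 0 of the two fixed vSTEs used below.
 * Per the SMMUv3 architecture, STE[0] bit 0 is V (valid) and bits [3:1]
 * are Config, where 0b000 reports abort and 0b100 bypasses both stages.
 */
#define STE_W0_VALID        (1ULL << 0)
#define STE_W0_CONFIG(w0)   (((w0) >> 1) & 0x7)

int main(void)
{
    uint64_t abort_w0  = 0x1ULL; /* V=1, Config=0b000 -> abort */
    uint64_t bypass_w0 = 0x9ULL; /* V=1, Config=0b100 -> S1+S2 bypass */

    assert((abort_w0 & STE_W0_VALID) && STE_W0_CONFIG(abort_w0) == 0x0);
    assert((bypass_w0 & STE_W0_VALID) && STE_W0_CONFIG(bypass_w0) == 0x4);
    return 0;
}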
Signed-off-by: Nicolin Chen --- hw/arm/smmu-common.c | 177 +++++++++++++++++++++++++++++++++++ hw/arm/trace-events | 2 + include/hw/arm/smmu-common.h | 21 +++++ 3 files changed, 200 insertions(+) diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c index 03d9ff58d4..038ae857d8 100644 --- a/hw/arm/smmu-common.c +++ b/hw/arm/smmu-common.c @@ -20,6 +20,7 @@ #include "trace.h" #include "exec/target_page.h" #include "hw/core/cpu.h" +#include "hw/pci/pci_device.h" #include "hw/qdev-properties.h" #include "qapi/error.h" #include "qemu/jhash.h" @@ -639,8 +640,184 @@ static AddressSpace *smmu_find_add_as(PCIBus *bus, void *opaque, int devfn) return &sdev->as; } +static bool smmu_dev_attach_viommu(SMMUDevice *sdev, + HostIOMMUDeviceIOMMUFD *idev, Error **errp) +{ + struct iommu_hwpt_arm_smmuv3 bypass_data = { + .ste = { 0x9ULL, 0x0ULL }, //0x1ULL << (108 - 64) }, + }; + struct iommu_hwpt_arm_smmuv3 abort_data = { + .ste = { 0x1ULL, 0x0ULL }, + }; + SMMUState *s = sdev->smmu; + SMMUS2Hwpt *s2_hwpt; + SMMUViommu *viommu; + uint32_t s2_hwpt_id; + + if (s->viommu) { + return host_iommu_device_iommufd_attach_hwpt( + idev, s->viommu->s2_hwpt->hwpt_id, errp); + } + + if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid, idev->ioas_id, + IOMMU_HWPT_ALLOC_NEST_PARENT, + IOMMU_HWPT_DATA_NONE, 0, NULL, + &s2_hwpt_id, errp)) { + error_setg(errp, "failed to allocate an S2 hwpt"); + return false; + } + + /* Attach to S2 for MSI cookie */ + if (!host_iommu_device_iommufd_attach_hwpt(idev, s2_hwpt_id, errp)) { + error_setg(errp, "failed to attach stage-2 HW pagetable"); + goto free_s2_hwpt; + } + + viommu = g_new0(SMMUViommu, 1); + + viommu->core = iommufd_backend_alloc_viommu(idev->iommufd, idev->devid, + IOMMU_VIOMMU_TYPE_ARM_SMMUV3, + s2_hwpt_id); + if (!viommu->core) { + error_setg(errp, "failed to allocate a viommu"); + goto free_viommu; + } + + if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid, + viommu->core->viommu_id, 0, + IOMMU_HWPT_DATA_ARM_SMMUV3, + sizeof(abort_data), &abort_data, + &viommu->abort_hwpt_id, errp)) { + error_setg(errp, "failed to allocate an abort pagetable"); + goto free_viommu_core; + } + + if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid, + viommu->core->viommu_id, 0, + IOMMU_HWPT_DATA_ARM_SMMUV3, + sizeof(bypass_data), &bypass_data, + &viommu->bypass_hwpt_id, errp)) { + error_setg(errp, "failed to allocate a bypass pagetable"); + goto free_abort_hwpt; + } + + if (!host_iommu_device_iommufd_attach_hwpt( + idev, viommu->bypass_hwpt_id, errp)) { + error_setg(errp, "failed to attach the bypass pagetable"); + goto free_bypass_hwpt; + } + + s2_hwpt = g_new0(SMMUS2Hwpt, 1); + s2_hwpt->iommufd = idev->iommufd; + s2_hwpt->hwpt_id = s2_hwpt_id; + s2_hwpt->ioas_id = idev->ioas_id; + + viommu->iommufd = idev->iommufd; + viommu->s2_hwpt = s2_hwpt; + + s->viommu = viommu; + return true; + +free_bypass_hwpt: + iommufd_backend_free_id(idev->iommufd, viommu->bypass_hwpt_id); +free_abort_hwpt: + iommufd_backend_free_id(idev->iommufd, viommu->abort_hwpt_id); +free_viommu_core: + iommufd_backend_free_id(idev->iommufd, viommu->core->viommu_id); + g_free(viommu->core); +free_viommu: + g_free(viommu); + host_iommu_device_iommufd_attach_hwpt(idev, sdev->idev->ioas_id, errp); +free_s2_hwpt: + iommufd_backend_free_id(idev->iommufd, s2_hwpt_id); + return false; +} + +static bool smmu_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn, + HostIOMMUDevice *hiod, Error **errp) +{ + HostIOMMUDeviceIOMMUFD *idev = HOST_IOMMU_DEVICE_IOMMUFD(hiod); + SMMUState *s = opaque; + 
SMMUPciBus *sbus = smmu_get_sbus(s, bus); + SMMUDevice *sdev = smmu_get_sdev(s, sbus, bus, devfn); + + if (!s->nested) { + return true; + } + + if (sdev->idev) { + if (sdev->idev != idev) { + return false;//-EEXIST; + } else { + return true; + } + } + + if (!idev) { + return true; + } + + if (!smmu_dev_attach_viommu(sdev, idev, errp)) { + error_report("Unable to attach viommu"); + return false; + } + + sdev->idev = idev; + sdev->viommu = s->viommu; + QLIST_INSERT_HEAD(&s->viommu->device_list, sdev, next); + trace_smmu_set_iommu_device(devfn, smmu_get_sid(sdev)); + + return true; +} + +static void smmu_dev_unset_iommu_device(PCIBus *bus, void *opaque, int devfn) +{ + SMMUDevice *sdev; + SMMUViommu *viommu; + SMMUState *s = opaque; + SMMUPciBus *sbus = g_hash_table_lookup(s->smmu_pcibus_by_busptr, bus); + + if (!s->nested) { + return; + } + + if (!sbus) { + return; + } + + sdev = sbus->pbdev[devfn]; + if (!sdev) { + return; + } + + if (!host_iommu_device_iommufd_attach_hwpt(sdev->idev, + sdev->idev->ioas_id, NULL)) { + error_report("Unable to attach dev to the default HW pagetable"); + } + + viommu = sdev->viommu; + + sdev->idev = NULL; + sdev->viommu = NULL; + QLIST_REMOVE(sdev, next); + trace_smmu_unset_iommu_device(devfn, smmu_get_sid(sdev)); + + if (QLIST_EMPTY(&viommu->device_list)) { + iommufd_backend_free_id(viommu->iommufd, viommu->bypass_hwpt_id); + iommufd_backend_free_id(viommu->iommufd, viommu->abort_hwpt_id); + iommufd_backend_free_id(viommu->iommufd, viommu->core->viommu_id); + g_free(viommu->core); + iommufd_backend_free_id(viommu->iommufd, viommu->s2_hwpt->hwpt_id); + g_free(viommu->s2_hwpt); + g_free(viommu); + s->viommu = NULL; + } +} + static const PCIIOMMUOps smmu_ops = { .get_address_space = smmu_find_add_as, + .set_iommu_device = smmu_dev_set_iommu_device, + .unset_iommu_device = smmu_dev_unset_iommu_device, }; IOMMUMemoryRegion *smmu_iommu_mr(SMMUState *s, uint32_t sid) diff --git a/hw/arm/trace-events b/hw/arm/trace-events index cdc1ea06a8..58e0636e95 100644 --- a/hw/arm/trace-events +++ b/hw/arm/trace-events @@ -5,6 +5,8 @@ virt_acpi_setup(void) "No fw cfg or ACPI disabled. Bailing out." 
# smmu-common.c smmu_add_mr(const char *name) "%s" +smmu_set_iommu_device(int devfn, uint32_t sid) "devfn=%d (sid=%d)" +smmu_unset_iommu_device(int devfn, uint32_t sid) "devfn=%d (sid=%d)" smmu_ptw_level(int stage, int level, uint64_t iova, size_t subpage_size, uint64_t baseaddr, uint32_t offset, uint64_t pte) "stage=%d level=%d iova=0x%"PRIx64" subpage_sz=0x%zx baseaddr=0x%"PRIx64" offset=%d => pte=0x%"PRIx64 smmu_ptw_invalid_pte(int stage, int level, uint64_t baseaddr, uint64_t pteaddr, uint32_t offset, uint64_t pte) "stage=%d level=%d base@=0x%"PRIx64" pte@=0x%"PRIx64" offset=%d pte=0x%"PRIx64 smmu_ptw_page_pte(int stage, int level, uint64_t iova, uint64_t baseaddr, uint64_t pteaddr, uint64_t pte, uint64_t address) "stage=%d level=%d iova=0x%"PRIx64" base@=0x%"PRIx64" pte@=0x%"PRIx64" pte=0x%"PRIx64" page address = 0x%"PRIx64 diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h index eae5d4d05b..3bfb68cef6 100644 --- a/include/hw/arm/smmu-common.h +++ b/include/hw/arm/smmu-common.h @@ -23,6 +23,7 @@ #include "hw/pci/pci.h" #include "qom/object.h" #include "sysemu/iommufd.h" +#include #define SMMU_PCI_BUS_MAX 256 #define SMMU_PCI_DEVFN_MAX 256 @@ -107,11 +108,30 @@ typedef struct SMMUTransCfg { struct SMMUS2Cfg s2cfg; } SMMUTransCfg; +typedef struct SMMUS2Hwpt { + IOMMUFDBackend *iommufd; + uint32_t hwpt_id; + uint32_t ioas_id; +} SMMUS2Hwpt; + +typedef struct SMMUViommu { + void *smmu; + IOMMUFDBackend *iommufd; + IOMMUFDViommu *core; + SMMUS2Hwpt *s2_hwpt; + uint32_t bypass_hwpt_id; + uint32_t abort_hwpt_id; + QLIST_HEAD(, SMMUDevice) device_list; + QLIST_ENTRY(SMMUViommu) next; +} SMMUViommu; + typedef struct SMMUDevice { void *smmu; PCIBus *bus; int devfn; IOMMUMemoryRegion iommu; + HostIOMMUDeviceIOMMUFD *idev; + SMMUViommu *viommu; AddressSpace as; uint32_t cfg_cache_hits; uint32_t cfg_cache_misses; @@ -139,6 +159,7 @@ struct SMMUState { /* Nested SMMU */ bool nested; + SMMUViommu *viommu; GHashTable *smmu_pcibus_by_busptr; GHashTable *configs; /* cache for configuration data */ -- Gitee From a2735cd15160a62065a0a0b39af405c7b0f3fae8 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 22 Jun 2022 14:41:27 -0700 Subject: [PATCH 711/939] hw/arm/smmu-common: Add iommufd helpers Add a set of helper functions for IOMMUFD and new "struct SMMUS1Hwpt" to store the nested hwpt information. 
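As a usage sketch, the guest's stage-1 STE words are expected to be fed into these helpers roughly as follows. The caller shown here is illustrative only (the CFGI_STE plumbing that would invoke it is assumed to land later in the series), and the little-endian conversion of the STE words required by the uapi is glossed over:

#include "qemu/osdep.h"
#include <linux/iommufd.h>
#include "hw/arm/smmu-common.h"

/*
 * Illustrative caller only: push the guest's stage-1 STE words into a
 * nested HWPT, or fall back to the abort HWPT when the vSTE is invalid.
 * In a real caller ste_w0/ste_w1 come from the guest Stream Table and
 * must be little-endian, as the uapi expects.
 */
static int example_sync_vste(SMMUDevice *sdev, uint64_t ste_w0,
                             uint64_t ste_w1, bool valid)
{
    struct iommu_hwpt_arm_smmuv3 nested_data = {
        .ste = { ste_w0, ste_w1 },
    };

    if (!valid) {
        smmu_dev_uninstall_nested_ste(sdev, true /* abort */);
        return 0;
    }

    return smmu_dev_install_nested_ste(sdev, IOMMU_HWPT_DATA_ARM_SMMUV3,
                                       sizeof(nested_data), &nested_data);
}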
Signed-off-by: Nicolin Chen --- hw/arm/smmu-common.c | 108 +++++++++++++++++++++++++++++++++++ include/hw/arm/smmu-common.h | 20 +++++++ 2 files changed, 128 insertions(+) diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c index 038ae857d8..a79eb34277 100644 --- a/hw/arm/smmu-common.c +++ b/hw/arm/smmu-common.c @@ -838,6 +838,114 @@ IOMMUMemoryRegion *smmu_iommu_mr(SMMUState *s, uint32_t sid) return NULL; } +/* IOMMUFD helpers */ +int smmu_dev_get_info(SMMUDevice *sdev, uint32_t *data_type, + uint32_t data_len, void *data) +{ + uint64_t caps; + + if (!sdev || !sdev->idev) { + return -ENOENT; + } + + return !iommufd_backend_get_device_info(sdev->idev->iommufd, + sdev->idev->devid, data_type, data, + data_len, &caps, NULL); +} + +void smmu_dev_uninstall_nested_ste(SMMUDevice *sdev, bool abort) +{ + HostIOMMUDeviceIOMMUFD *idev = sdev->idev; + SMMUS1Hwpt *s1_hwpt = sdev->s1_hwpt; + uint32_t hwpt_id; + + if (!s1_hwpt || !sdev->viommu) { + return; + } + + if (abort) { + hwpt_id = sdev->viommu->abort_hwpt_id; + } else { + hwpt_id = sdev->viommu->bypass_hwpt_id; + } + + if (!host_iommu_device_iommufd_attach_hwpt(idev, hwpt_id, NULL)) { + return; + } + + iommufd_backend_free_id(idev->iommufd, s1_hwpt->hwpt_id); + sdev->s1_hwpt = NULL; + g_free(s1_hwpt); +} + +int smmu_dev_install_nested_ste(SMMUDevice *sdev, uint32_t data_type, + uint32_t data_len, void *data) +{ + SMMUViommu *viommu = sdev->viommu; + SMMUS1Hwpt *s1_hwpt = sdev->s1_hwpt; + HostIOMMUDeviceIOMMUFD *idev = sdev->idev; + + if (!idev || !viommu) { + return -ENOENT; + } + + if (s1_hwpt) { + smmu_dev_uninstall_nested_ste(sdev, false); + } + + s1_hwpt = g_new0(SMMUS1Hwpt, 1); + if (!s1_hwpt) { + return -ENOMEM; + } + + s1_hwpt->smmu = sdev->smmu; + s1_hwpt->viommu = viommu; + s1_hwpt->iommufd = idev->iommufd; + + if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid, + viommu->core->viommu_id, 0, data_type, + data_len, data, &s1_hwpt->hwpt_id, NULL)) { + goto free; + } + + if (!host_iommu_device_iommufd_attach_hwpt(idev, s1_hwpt->hwpt_id, NULL)) { + goto free_hwpt; + } + + sdev->s1_hwpt = s1_hwpt; + + return 0; +free_hwpt: + iommufd_backend_free_id(idev->iommufd, s1_hwpt->hwpt_id); +free: + sdev->s1_hwpt = NULL; + g_free(s1_hwpt); + + return -EINVAL; +} + +int smmu_hwpt_invalidate_cache(SMMUS1Hwpt *s1_hwpt, uint32_t type, uint32_t len, + uint32_t *num, void *reqs) +{ + if (!s1_hwpt) { + return -ENOENT; + } + + return iommufd_backend_invalidate_cache(s1_hwpt->iommufd, s1_hwpt->hwpt_id, + type, len, num, reqs); +} + +int smmu_viommu_invalidate_cache(IOMMUFDViommu *viommu, uint32_t type, + uint32_t len, uint32_t *num, void *reqs) +{ + if (!viommu) { + return -ENOENT; + } + + return iommufd_viommu_invalidate_cache(viommu->iommufd, viommu->viommu_id, + type, len, num, reqs); +} + /* Unmap all notifiers attached to @mr */ static void smmu_inv_notifiers_mr(IOMMUMemoryRegion *mr) { diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h index 3bfb68cef6..66dc7206ea 100644 --- a/include/hw/arm/smmu-common.h +++ b/include/hw/arm/smmu-common.h @@ -125,6 +125,15 @@ typedef struct SMMUViommu { QLIST_ENTRY(SMMUViommu) next; } SMMUViommu; +typedef struct SMMUS1Hwpt { + void *smmu; + IOMMUFDBackend *iommufd; + SMMUViommu *viommu; + uint32_t hwpt_id; + QLIST_HEAD(, SMMUDevice) device_list; + QLIST_ENTRY(SMMUViommu) next; +} SMMUS1Hwpt; + typedef struct SMMUDevice { void *smmu; PCIBus *bus; @@ -132,6 +141,7 @@ typedef struct SMMUDevice { IOMMUMemoryRegion iommu; HostIOMMUDeviceIOMMUFD *idev; SMMUViommu *viommu; + SMMUS1Hwpt 
*s1_hwpt; AddressSpace as; uint32_t cfg_cache_hits; uint32_t cfg_cache_misses; @@ -225,4 +235,14 @@ void smmu_iotlb_inv_iova(SMMUState *s, int asid, int vmid, dma_addr_t iova, /* Unmap the range of all the notifiers registered to any IOMMU mr */ void smmu_inv_notifiers_all(SMMUState *s); +/* IOMMUFD helpers */ +int smmu_dev_get_info(SMMUDevice *sdev, uint32_t *data_type, + uint32_t data_len, void *data); +void smmu_dev_uninstall_nested_ste(SMMUDevice *sdev, bool abort); +int smmu_dev_install_nested_ste(SMMUDevice *sdev, uint32_t data_type, + uint32_t data_len, void *data); +int smmu_hwpt_invalidate_cache(SMMUS1Hwpt *s1_hwpt, uint32_t type, uint32_t len, + uint32_t *num, void *reqs); +int smmu_viommu_invalidate_cache(IOMMUFDViommu *viommu, uint32_t type, + uint32_t len, uint32_t *num, void *reqs); #endif /* HW_ARM_SMMU_COMMON_H */ -- Gitee From 3c6c29612d5ca0ff07bcb8a45735a3877c8fadd4 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Thu, 7 Dec 2023 20:04:47 +0000 Subject: [PATCH 712/939] hw/arm/smmu-common: Return sysmem if stage-1 is bypassed When nested translation is enabled, there are 2-stage translation occuring to two different address spaces: stage-1 in the iommu as, while stage-2 in the system as. If a device attached to the vSMMU doesn't enable stage-1 translation, e.g. vSTE sets to Config=Bypass, the system as should be returned, so QEMU can set up system memory mappings onto the stage-2 page table. Signed-off-by: Nicolin Chen --- hw/arm/smmu-common.c | 18 +++++++++++++++++- include/hw/arm/smmu-common.h | 3 +++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c index a79eb34277..cc41bf3de8 100644 --- a/hw/arm/smmu-common.c +++ b/hw/arm/smmu-common.c @@ -622,6 +622,9 @@ static SMMUDevice *smmu_get_sdev(SMMUState *s, SMMUPciBus *sbus, memory_region_init_iommu(&sdev->iommu, sizeof(sdev->iommu), s->mrtypename, OBJECT(s), name, UINT64_MAX); + if (s->nested) { + address_space_init(&sdev->as_sysmem, &s->root, name); + } address_space_init(&sdev->as, MEMORY_REGION(&sdev->iommu), name); trace_smmu_add_mr(name); @@ -637,7 +640,12 @@ static AddressSpace *smmu_find_add_as(PCIBus *bus, void *opaque, int devfn) SMMUPciBus *sbus = smmu_get_sbus(s, bus); SMMUDevice *sdev = smmu_get_sdev(s, sbus, bus, devfn); - return &sdev->as; + /* Return the system as if the device uses stage-2 only */ + if (s->nested && !sdev->s1_hwpt) { + return &sdev->as_sysmem; + } else { + return &sdev->as; + } } static bool smmu_dev_attach_viommu(SMMUDevice *sdev, @@ -983,6 +991,14 @@ static void smmu_base_realize(DeviceState *dev, Error **errp) g_free, g_free); s->smmu_pcibus_by_busptr = g_hash_table_new(NULL, NULL); + if (s->nested) { + memory_region_init(&s->root, OBJECT(s), "root", UINT64_MAX); + memory_region_init_alias(&s->sysmem, OBJECT(s), + "smmu-sysmem", get_system_memory(), 0, + memory_region_size(get_system_memory())); + memory_region_add_subregion(&s->root, 0, &s->sysmem); + } + if (s->primary_bus) { pci_setup_iommu(s->primary_bus, &smmu_ops, s); } else { diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h index 66dc7206ea..37dfeed026 100644 --- a/include/hw/arm/smmu-common.h +++ b/include/hw/arm/smmu-common.h @@ -143,6 +143,7 @@ typedef struct SMMUDevice { SMMUViommu *viommu; SMMUS1Hwpt *s1_hwpt; AddressSpace as; + AddressSpace as_sysmem; uint32_t cfg_cache_hits; uint32_t cfg_cache_misses; QLIST_ENTRY(SMMUDevice) next; @@ -165,7 +166,9 @@ struct SMMUState { /* */ SysBusDevice dev; const char *mrtypename; + MemoryRegion root; 
MemoryRegion iomem; + MemoryRegion sysmem; /* Nested SMMU */ bool nested; -- Gitee From 9f3b8c283d4c1014ff292faddb78bbbfd7ec22d3 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 9 Apr 2024 01:49:26 +0000 Subject: [PATCH 713/939] hw/arm/smmuv3: Ignore IOMMU_NOTIFIER_MAP for nested-smmuv3 If a device's MemmoryRegion type is iommu, vfio core registers a listener, passing the IOMMU_NOTIFIER_IOTLB_EVENTS flag (bundle of IOMMU_NOTIFIER_MAP and IOMMU_NOTIFIER_UNMAP). On the other hand, nested SMMUv3 does not use a map notifier. And it would only insert an IOTLB entry for MSI doorbell page mapping, which can simply be done by the mr->translate call. Ignore the IOMMU_NOTIFIER_MAP flag and drop the error out. Signed-off-by: Nicolin Chen --- hw/arm/smmuv3.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c index 64ca4c5542..db111220c7 100644 --- a/hw/arm/smmuv3.c +++ b/hw/arm/smmuv3.c @@ -1881,12 +1881,9 @@ static int smmuv3_notify_flag_changed(IOMMUMemoryRegion *iommu, return -EINVAL; } - if (new & IOMMU_NOTIFIER_MAP) { - error_setg(errp, - "device %02x.%02x.%x requires iommu MAP notifier which is " - "not currently supported", pci_bus_num(sdev->bus), - PCI_SLOT(sdev->devfn), PCI_FUNC(sdev->devfn)); - return -EINVAL; + /* nested-smmuv3 does not need IOMMU_NOTIFIER_MAP. Ignore it. */ + if (s->nested) { + new &= ~IOMMU_NOTIFIER_MAP; } if (old == IOMMU_NOTIFIER_NONE) { -- Gitee From 03964c037862a594b4eb7d2e3754acd32c01c80b Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Thu, 22 Sep 2022 14:06:07 -0700 Subject: [PATCH 714/939] hw/arm/smmuv3: Read host SMMU device info Read the underlying SMMU device info and set corresponding IDR bits. Signed-off-by: Nicolin Chen --- hw/arm/smmuv3.c | 77 ++++++++++++++++++++++++++++++++++++ hw/arm/trace-events | 1 + include/hw/arm/smmu-common.h | 1 + 3 files changed, 79 insertions(+) diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c index db111220c7..4208325ab3 100644 --- a/hw/arm/smmuv3.c +++ b/hw/arm/smmuv3.c @@ -254,6 +254,80 @@ void smmuv3_record_event(SMMUv3State *s, SMMUEventInfo *info) info->recorded = true; } +static void smmuv3_nested_init_regs(SMMUv3State *s) +{ + SMMUState *bs = ARM_SMMU(s); + SMMUDevice *sdev; + uint32_t data_type; + uint32_t val; + int ret; + + if (!bs->nested || !bs->viommu) { + return; + } + + sdev = QLIST_FIRST(&bs->viommu->device_list); + if (!sdev) { + return; + } + + if (sdev->info.idr[0]) { + error_report("reusing the previous hw_info"); + goto out; + } + + ret = smmu_dev_get_info(sdev, &data_type, sizeof(sdev->info), &sdev->info); + if (ret) { + error_report("failed to get SMMU device info"); + return; + } + + if (data_type != IOMMU_HW_INFO_TYPE_ARM_SMMUV3) { + error_report( "Wrong data type (%d)!", data_type); + return; + } + +out: + trace_smmuv3_get_device_info(sdev->info.idr[0], sdev->info.idr[1], + sdev->info.idr[3], sdev->info.idr[5]); + + val = FIELD_EX32(sdev->info.idr[0], IDR0, BTM); + s->idr[0] = FIELD_DP32(s->idr[0], IDR0, BTM, val); + val = FIELD_EX32(sdev->info.idr[0], IDR0, ATS); + s->idr[0] = FIELD_DP32(s->idr[0], IDR0, ATS, val); + val = FIELD_EX32(sdev->info.idr[0], IDR0, ASID16); + s->idr[0] = FIELD_DP32(s->idr[0], IDR0, ASID16, val); + val = FIELD_EX32(sdev->info.idr[0], IDR0, TERM_MODEL); + s->idr[0] = FIELD_DP32(s->idr[0], IDR0, TERM_MODEL, val); + val = FIELD_EX32(sdev->info.idr[0], IDR0, STALL_MODEL); + s->idr[0] = FIELD_DP32(s->idr[0], IDR0, STALL_MODEL, val); + val = FIELD_EX32(sdev->info.idr[0], IDR0, STLEVEL); + s->idr[0] = 
FIELD_DP32(s->idr[0], IDR0, STLEVEL, val); + + val = FIELD_EX32(sdev->info.idr[1], IDR1, SIDSIZE); + s->idr[1] = FIELD_DP32(s->idr[1], IDR1, SIDSIZE, val); + val = FIELD_EX32(sdev->info.idr[1], IDR1, SSIDSIZE); + s->idr[1] = FIELD_DP32(s->idr[1], IDR1, SSIDSIZE, val); + + val = FIELD_EX32(sdev->info.idr[3], IDR3, HAD); + s->idr[3] = FIELD_DP32(s->idr[3], IDR3, HAD, val); + val = FIELD_EX32(sdev->info.idr[3], IDR3, RIL); + s->idr[3] = FIELD_DP32(s->idr[3], IDR3, RIL, val); + val = FIELD_EX32(sdev->info.idr[3], IDR3, BBML); + s->idr[3] = FIELD_DP32(s->idr[3], IDR3, BBML, val); + + val = FIELD_EX32(sdev->info.idr[5], IDR5, GRAN4K); + s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN4K, val); + val = FIELD_EX32(sdev->info.idr[5], IDR5, GRAN16K); + s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN16K, val); + val = FIELD_EX32(sdev->info.idr[5], IDR5, GRAN64K); + s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN64K, val); + val = FIELD_EX32(sdev->info.idr[5], IDR5, OAS); + s->idr[5] = FIELD_DP32(s->idr[5], IDR5, OAS, val); + + /* FIXME check iidr and aidr registrs too */ +} + static void smmuv3_init_regs(SMMUv3State *s) { /* Based on sys property, the stages supported in smmu will be advertised.*/ @@ -292,6 +366,9 @@ static void smmuv3_init_regs(SMMUv3State *s) s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN16K, 1); s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN64K, 1); + /* Override IDR fields with HW caps */ + smmuv3_nested_init_regs(s); + s->cmdq.base = deposit64(s->cmdq.base, 0, 5, SMMU_CMDQS); s->cmdq.prod = 0; s->cmdq.cons = 0; diff --git a/hw/arm/trace-events b/hw/arm/trace-events index 58e0636e95..1e3d86382d 100644 --- a/hw/arm/trace-events +++ b/hw/arm/trace-events @@ -55,5 +55,6 @@ smmuv3_cmdq_tlbi_s12_vmid(uint16_t vmid) "vmid=%d" smmuv3_config_cache_inv(uint32_t sid) "Config cache INV for sid=0x%x" smmuv3_notify_flag_add(const char *iommu) "ADD SMMUNotifier node for iommu mr=%s" smmuv3_notify_flag_del(const char *iommu) "DEL SMMUNotifier node for iommu mr=%s" +smmuv3_get_device_info(uint32_t idr0, uint32_t idr1, uint32_t idr3, uint32_t idr5) "idr0=0x%x idr1=0x%x idr3=0x%x idr5=0x%x" smmuv3_inv_notifiers_iova(const char *name, uint16_t asid, uint16_t vmid, uint64_t iova, uint8_t tg, uint64_t num_pages) "iommu mr=%s asid=%d vmid=%d iova=0x%"PRIx64" tg=%d num_pages=0x%"PRIx64 diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h index 37dfeed026..d120c352cf 100644 --- a/include/hw/arm/smmu-common.h +++ b/include/hw/arm/smmu-common.h @@ -146,6 +146,7 @@ typedef struct SMMUDevice { AddressSpace as_sysmem; uint32_t cfg_cache_hits; uint32_t cfg_cache_misses; + struct iommu_hw_info_arm_smmuv3 info; QLIST_ENTRY(SMMUDevice) next; } SMMUDevice; -- Gitee From fac9784bbedb50dc964feb9cf70b6f37472fcf60 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 21 Apr 2023 22:10:44 -0700 Subject: [PATCH 715/939] hw/arm/smmuv3: Check idr registers for STE_S1CDMAX and STE_S1STALLD With nested translation, the underlying HW could support those two fields. Allow them according to the updated idr registers after the hw_info ioctl. 
Signed-off-by: Nicolin Chen --- hw/arm/smmuv3.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c index 4208325ab3..253d297eec 100644 --- a/hw/arm/smmuv3.c +++ b/hw/arm/smmuv3.c @@ -622,13 +622,14 @@ static int decode_ste(SMMUv3State *s, SMMUTransCfg *cfg, } } - if (STE_S1CDMAX(ste) != 0) { + if (!FIELD_EX32(s->idr[1], IDR1, SSIDSIZE) && STE_S1CDMAX(ste) != 0) { qemu_log_mask(LOG_UNIMP, "SMMUv3 does not support multiple context descriptors yet\n"); goto bad_ste; } - if (STE_S1STALLD(ste)) { + /* STALL_MODEL being 0b01 means "stall is not supported" */ + if ((FIELD_EX32(s->idr[0], IDR0, STALL_MODEL) & 0x1) && STE_S1STALLD(ste)) { qemu_log_mask(LOG_UNIMP, "SMMUv3 S1 stalling fault model not allowed yet\n"); goto bad_ste; -- Gitee From 752d98d93459c87817be5e02c39257e0fa5934f8 Mon Sep 17 00:00:00 2001 From: qihao_yewu Date: Fri, 7 Mar 2025 21:07:11 -0500 Subject: [PATCH 716/939] qga: Don't daemonize before channel is initialized MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cheery-pick from c6f5dd7ac8ef62dcdec4cdeda1467c658161afff If the agent is set to daemonize but for whatever reason fails to init the channel, the error message is lost. Worse, the agent daemonizes needlessly and returns success. For instance: # qemu-ga -m virtio-serial \ -p /dev/nonexistent_device \ -f /run/qemu-ga.pid \ -t /run \ -d # echo $? 0 This makes it needlessly hard for init scripts to detect a failure in qemu-ga startup. Though, they shouldn't pass '-d' in the first place. Let's open the channel first and only after that become a daemon. Related bug: https://bugs.gentoo.org/810628 Signed-off-by: Michal Privoznik Reviewed-by: Ján Tomko Reviewed-by: Konstantin Kostiuk Message-ID: <7a42b0cbda5c7e01cf76bc1b29a1210cd018fa78.1736261360.git.mprivozn@redhat.com> Signed-off-by: Konstantin Kostiuk Signed-off-by: qihao_yewu --- qga/main.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/qga/main.c b/qga/main.c index c4dcbb86be..8d341ffdf1 100644 --- a/qga/main.c +++ b/qga/main.c @@ -1407,7 +1407,6 @@ static GAState *initialize_agent(GAConfig *config, int socket_activation) if (config->daemonize) { /* delay opening/locking of pidfile till filesystems are unfrozen */ s->deferred_options.pid_filepath = config->pid_filepath; - become_daemon(NULL); } if (config->log_filepath) { /* delay opening the log file till filesystems are unfrozen */ @@ -1416,9 +1415,6 @@ static GAState *initialize_agent(GAConfig *config, int socket_activation) ga_disable_logging(s); qmp_for_each_command(&ga_commands, ga_disable_not_allowed_freeze, NULL); } else { - if (config->daemonize) { - become_daemon(config->pid_filepath); - } if (config->log_filepath) { FILE *log_file = ga_open_logfile(config->log_filepath); if (!log_file) { @@ -1482,6 +1478,20 @@ static GAState *initialize_agent(GAConfig *config, int socket_activation) } #endif + if (!channel_init(s, s->config->method, s->config->channel_path, + s->socket_activation ? FIRST_SOCKET_ACTIVATION_FD : -1)) { + g_critical("failed to initialize guest agent channel"); + return NULL; + } + + if (config->daemonize) { + if (ga_is_frozen(s)) { + become_daemon(NULL); + } else { + become_daemon(config->pid_filepath); + } + } + ga_state = s; return s; failed: @@ -1516,8 +1526,9 @@ static void cleanup_agent(GAState *s) static int run_agent_once(GAState *s) { - if (!channel_init(s, s->config->method, s->config->channel_path, - s->socket_activation ? 
FIRST_SOCKET_ACTIVATION_FD : -1)) { + if (!s->channel && + channel_init(s, s->config->method, s->config->channel_path, + s->socket_activation ? FIRST_SOCKET_ACTIVATION_FD : -1)) { g_critical("failed to initialize guest agent channel"); return EXIT_FAILURE; } @@ -1526,6 +1537,7 @@ static int run_agent_once(GAState *s) if (s->channel) { ga_channel_free(s->channel); + s->channel = NULL; } return EXIT_SUCCESS; -- Gitee From 3a14516128cf936906e5f519bf7808b9a977a757 Mon Sep 17 00:00:00 2001 From: qihao_yewu Date: Fri, 7 Mar 2025 21:57:29 -0500 Subject: [PATCH 717/939] qga: Add log to guest-fsfreeze-thaw command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cheery-pick from ad1e6843632555c771dda6a9425930fa25b71fb3 Reviewed-by: Daniel P. Berrangé Message-ID: <20241216154552.213961-2-kkostiuk@redhat.com> Signed-off-by: Konstantin Kostiuk Signed-off-by: qihao_yewu --- qga/commands-posix.c | 1 + qga/commands-win32.c | 3 +++ 2 files changed, 4 insertions(+) diff --git a/qga/commands-posix.c b/qga/commands-posix.c index 6169bbf7a0..f0d8e9e9c5 100644 --- a/qga/commands-posix.c +++ b/qga/commands-posix.c @@ -759,6 +759,7 @@ int64_t qmp_guest_fsfreeze_thaw(Error **errp) ret = qmp_guest_fsfreeze_do_thaw(errp); if (ret >= 0) { ga_unset_frozen(ga_state); + slog("guest-fsthaw called"); execute_fsfreeze_hook(FSFREEZE_HOOK_THAW, errp); } else { ret = 0; diff --git a/qga/commands-win32.c b/qga/commands-win32.c index 697c65507c..656d1459f1 100644 --- a/qga/commands-win32.c +++ b/qga/commands-win32.c @@ -1275,6 +1275,9 @@ int64_t qmp_guest_fsfreeze_thaw(Error **errp) qga_vss_fsfreeze(&i, false, NULL, errp); ga_unset_frozen(ga_state); + + slog("guest-fsthaw called"); + return i; } -- Gitee From 9eacd1a6df6861b76663e98133adb15059bf65cc Mon Sep 17 00:00:00 2001 From: gongchangsui Date: Mon, 17 Mar 2025 02:40:50 -0400 Subject: [PATCH 718/939] arm: VirtCCA: CVM support UEFI boot 1. Add UEFI boot support for Confidential VMs. 2. Modify the base memory address of Confidential VMs from 3GB to 1GB. 3. Disable pflash boot support for Confidential VMs; use the`-bios`option to specify`QEMU_EFI.fd`during launch. 
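For illustration, a UEFI-booted cVM is then expected to be launched along these lines (an assumed example invocation, not part of this patch; only the -bios usage is mandated here, and the remaining VirtCCA/cVM options depend on the deployment):

  # qemu-system-aarch64 -machine virt -cpu host -enable-kvm \
      -m 2G \
      -bios /path/to/QEMU_EFI.fd \
      ...   (plus the usual VirtCCA cVM options)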
Signed-off-by: gongchangsui --- hw/arm/boot.c | 38 ++++++++++++++++++++++++++++++++++++-- hw/arm/virt.c | 33 ++++++++++++++++++++++++++++++++- include/hw/arm/boot.h | 3 +++ 3 files changed, 71 insertions(+), 3 deletions(-) diff --git a/hw/arm/boot.c b/hw/arm/boot.c index 42110b0f18..6b2f46af4d 100644 --- a/hw/arm/boot.c +++ b/hw/arm/boot.c @@ -43,6 +43,9 @@ #define BOOTLOADER_MAX_SIZE (4 * KiB) +#define UEFI_MAX_SIZE 0x8000000 +#define UEFI_LOADER_START 0x0 +#define DTB_MAX 0x200000 AddressSpace *arm_boot_address_space(ARMCPU *cpu, const struct arm_boot_info *info) { @@ -1155,7 +1158,31 @@ static void arm_setup_direct_kernel_boot(ARMCPU *cpu, } } -static void arm_setup_firmware_boot(ARMCPU *cpu, struct arm_boot_info *info) +static void arm_setup_confidential_firmware_boot(ARMCPU *cpu, + struct arm_boot_info *info, + const char *firmware_filename) +{ + ssize_t fw_size; + const char *fname; + AddressSpace *as = arm_boot_address_space(cpu, info); + + fname = qemu_find_file(QEMU_FILE_TYPE_BIOS, firmware_filename); + if (!fname) { + error_report("Could not find firmware image '%s'", firmware_filename); + exit(EXIT_FAILURE); + } + + fw_size = load_image_targphys_as(firmware_filename, + info->firmware_base, + info->firmware_max_size, as); + + if (fw_size <= 0) { + error_report("could not load firmware '%s'", firmware_filename); + exit(EXIT_FAILURE); + } +} + +static void arm_setup_firmware_boot(ARMCPU *cpu, struct arm_boot_info *info, const char *firmware_filename) { /* Set up for booting firmware (which might load a kernel via fw_cfg) */ @@ -1166,6 +1193,8 @@ static void arm_setup_firmware_boot(ARMCPU *cpu, struct arm_boot_info *info) * DTB to the base of RAM for the bootloader to pick up. */ info->dtb_start = info->loader_start; + if (info->confidential) + tmm_add_ram_region(UEFI_LOADER_START, UEFI_MAX_SIZE, info->dtb_start, DTB_MAX , true); } if (info->kernel_filename) { @@ -1206,6 +1235,11 @@ static void arm_setup_firmware_boot(ARMCPU *cpu, struct arm_boot_info *info) } } + if (info->confidential) { + arm_setup_confidential_firmware_boot(cpu, info, firmware_filename); + kvm_load_user_data(UEFI_LOADER_START, UEFI_MAX_SIZE, info->loader_start, info->loader_start + DTB_MAX, info->ram_size, + (struct kvm_numa_info *)info->numa_info); + } /* * We will start from address 0 (typically a boot ROM image) in the * same way as hardware. Leave env->boot_info NULL, so that @@ -1282,7 +1316,7 @@ void arm_load_kernel(ARMCPU *cpu, MachineState *ms, struct arm_boot_info *info) /* Load the kernel. 
*/ if (!info->kernel_filename || info->firmware_loaded) { - arm_setup_firmware_boot(cpu, info); + arm_setup_firmware_boot(cpu, info, ms->firmware); } else { arm_setup_direct_kernel_boot(cpu, info); } diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 8823f2ed1c..6ffb26e7e6 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -1398,6 +1398,9 @@ static void virt_flash_map1(PFlashCFI01 *flash, qdev_prop_set_uint32(dev, "num-blocks", size / VIRT_FLASH_SECTOR_SIZE); sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal); + if (virtcca_cvm_enabled()) { + return; + } memory_region_add_subregion(sysmem, base, sysbus_mmio_get_region(SYS_BUS_DEVICE(dev), 0)); @@ -1433,6 +1436,10 @@ static void virt_flash_fdt(VirtMachineState *vms, MachineState *ms = MACHINE(vms); char *nodename; + if (virtcca_cvm_enabled()) { + return; + } + if (sysmem == secure_sysmem) { /* Report both flash devices as a single node in the DT */ nodename = g_strdup_printf("/flash@%" PRIx64, flashbase); @@ -1468,6 +1475,23 @@ static void virt_flash_fdt(VirtMachineState *vms, } } +static bool virt_confidential_firmware_init(VirtMachineState *vms, + MemoryRegion *sysmem) +{ + MemoryRegion *fw_ram; + hwaddr fw_base = vms->memmap[VIRT_FLASH].base; + hwaddr fw_size = vms->memmap[VIRT_FLASH].size; + + if (!MACHINE(vms)->firmware) { + return false; + } + + fw_ram = g_new(MemoryRegion, 1); + memory_region_init_ram(fw_ram, NULL, "fw_ram", fw_size, NULL); + memory_region_add_subregion(sysmem, fw_base, fw_ram); + return true; +} + static bool virt_firmware_init(VirtMachineState *vms, MemoryRegion *sysmem, MemoryRegion *secure_sysmem) @@ -1486,6 +1510,10 @@ static bool virt_firmware_init(VirtMachineState *vms, pflash_blk0 = pflash_cfi01_get_blk(vms->flash[0]); + if (virtcca_cvm_enabled()) { + return virt_confidential_firmware_init(vms, sysmem); + } + bios_name = MACHINE(vms)->firmware; if (bios_name) { char *fname; @@ -2023,7 +2051,7 @@ static void virt_set_memmap(VirtMachineState *vms, int pa_bits) vms->memmap[VIRT_PCIE_MMIO] = (MemMapEntry) { 0x10000000, 0x2edf0000 }; vms->memmap[VIRT_KAE_DEVICE] = (MemMapEntry) { 0x3edf0000, 0x00200000 }; - vms->memmap[VIRT_MEM].base = 3 * GiB; + vms->memmap[VIRT_MEM].base = 1 * GiB; vms->memmap[VIRT_MEM].size = ms->ram_size; info_report("[qemu] fix VIRT_MEM range 0x%llx - 0x%llx\n", (unsigned long long)(vms->memmap[VIRT_MEM].base), (unsigned long long)(vms->memmap[VIRT_MEM].base + ms->ram_size)); @@ -2822,6 +2850,9 @@ static void machvirt_init(MachineState *machine) vms->bootinfo.get_dtb = machvirt_dtb; vms->bootinfo.skip_dtb_autoload = true; vms->bootinfo.firmware_loaded = firmware_loaded; + vms->bootinfo.firmware_base = vms->memmap[VIRT_FLASH].base; + vms->bootinfo.firmware_max_size = vms->memmap[VIRT_FLASH].size; + vms->bootinfo.confidential = virtcca_cvm_enabled(); vms->bootinfo.psci_conduit = vms->psci_conduit; arm_load_kernel(ARM_CPU(first_cpu), machine, &vms->bootinfo); diff --git a/include/hw/arm/boot.h b/include/hw/arm/boot.h index 4491b1f85b..06ca1d90b2 100644 --- a/include/hw/arm/boot.h +++ b/include/hw/arm/boot.h @@ -133,6 +133,9 @@ struct arm_boot_info { bool secure_board_setup; arm_endianness endianness; + hwaddr firmware_base; + hwaddr firmware_max_size; + bool confidential; }; /** -- Gitee From 5bffeb311c969a0e05106e4bf54282431c5ba907 Mon Sep 17 00:00:00 2001 From: gongchangsui Date: Mon, 17 Mar 2025 02:42:43 -0400 Subject: [PATCH 719/939] arm: VirtCCA: qemu uefi boot support kae This commit introduces modifications to enable KAE functionality during UEFI boot in cVMs. 
Additionally,the ACPI feature must be configured in cVM. Signed-off-by: gongchangsui --- hw/arm/virt-acpi-build.c | 58 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c index 076781423b..f78331d69f 100644 --- a/hw/arm/virt-acpi-build.c +++ b/hw/arm/virt-acpi-build.c @@ -58,6 +58,7 @@ #include "migration/vmstate.h" #include "hw/acpi/ghes.h" #include "hw/acpi/viot.h" +#include "kvm_arm.h" #define ARM_SPI_BASE 32 @@ -405,6 +406,54 @@ static void acpi_dsdt_add_virtio(Aml *scope, } } +static void acpi_dsdt_add_hisi_sec(Aml *scope, + const MemMapEntry *virtio_mmio_memmap, + int dev_id) +{ + hwaddr size = 0x10000; + + /* + * Calculate the base address for the sec device node. + * Each device group contains one sec device and one hpre device,spaced by 2 * size. + */ + hwaddr base = virtio_mmio_memmap->base + dev_id * 2 * size; + + Aml *dev = aml_device("SE%02u", dev_id); + aml_append(dev, aml_name_decl("_HID", aml_string("SEC07"))); + aml_append(dev, aml_name_decl("_UID", aml_int(dev_id))); + aml_append(dev, aml_name_decl("_CCA", aml_int(1))); + + Aml *crs = aml_resource_template(); + + aml_append(crs, aml_memory32_fixed(base, size, AML_READ_WRITE)); + aml_append(dev, aml_name_decl("_CRS", crs)); + aml_append(scope, dev); +} + +static void acpi_dsdt_add_hisi_hpre(Aml *scope, + const MemMapEntry *virtio_mmio_memmap, + int dev_id) +{ + hwaddr size = 0x10000; + + /* + * Calculate the base address for the hpre device node. + * Each hpre device follows the corresponding sec device by an additional offset of size. + */ + hwaddr base = virtio_mmio_memmap->base + dev_id * 2 * size + size; + + Aml *dev = aml_device("HP%02u", dev_id); + aml_append(dev, aml_name_decl("_HID", aml_string("HPRE07"))); + aml_append(dev, aml_name_decl("_UID", aml_int(dev_id))); + aml_append(dev, aml_name_decl("_CCA", aml_int(1))); + + Aml *crs = aml_resource_template(); + + aml_append(crs, aml_memory32_fixed(base, size, AML_READ_WRITE)); + aml_append(dev, aml_name_decl("_CRS", crs)); + aml_append(scope, dev); +} + static void acpi_dsdt_add_pci(Aml *scope, const MemMapEntry *memmap, uint32_t irq, VirtMachineState *vms) { @@ -1201,6 +1250,15 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) acpi_dsdt_add_virtio(scope, &memmap[VIRT_MMIO], (irqmap[VIRT_MMIO] + ARM_SPI_BASE), NUM_VIRTIO_TRANSPORTS); acpi_dsdt_add_pci(scope, memmap, irqmap[VIRT_PCIE] + ARM_SPI_BASE, vms); + + if (virtcca_cvm_enabled()) { + int kae_num = tmm_get_kae_num(); + for (int i = 0; i < kae_num; i++) { + acpi_dsdt_add_hisi_sec(scope, &memmap[VIRT_KAE_DEVICE], i); + acpi_dsdt_add_hisi_hpre(scope, &memmap[VIRT_KAE_DEVICE], i); + } + } + if (vms->acpi_dev) { build_ged_aml(scope, "\\_SB."GED_DEVICE, HOTPLUG_HANDLER(vms->acpi_dev), -- Gitee From 5ed17a43a4cc7fc76397d6d8cad8246063b5b2f3 Mon Sep 17 00:00:00 2001 From: gongchangsui Date: Mon, 17 Mar 2025 02:43:55 -0400 Subject: [PATCH 720/939] arm: VirtCCA: Compatibility with older versions of TMM and the kernel Since the base memory address of Confidential VMs in QEMU was changed from 3GB to 1GB, corresponding adjustments are required in both the TMM and kernel components. To maintain backward compatibility, the following modifications were implemented: 1. **TMM Versioning**: The TMM version number was incremented to reflect the update 2. **Kernel Interface**: A new interface was exposed in the kernel to retrieve the TMM version number. 3. 
**QEMU Compatibility Logic**: During initialization, QEMU checks the TMM version via the kernel interface. If the TMM version is **< 2.1** (legacy), QEMU sets the Confidential VM's base memory address to **3GB**. For TMM versions **2.1 and above** (updated), the address is configured to **1GB** to align with the new memory layout. This approach ensures seamless backward compatibility while transitioning to the revised memory addressing scheme. Signed-off-by: gongchangsui --- accel/kvm/kvm-all.c | 3 +-- hw/arm/boot.c | 9 +++++++++ hw/arm/virt.c | 9 +++++++-- linux-headers/asm-arm64/kvm.h | 2 ++ linux-headers/linux/kvm.h | 3 +++ 5 files changed, 22 insertions(+), 4 deletions(-) diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index a8e29f148e..38a48cc031 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -2390,6 +2390,7 @@ static int kvm_init(MachineState *ms) qemu_mutex_init(&kml_slots_lock); s = KVM_STATE(ms->accelerator); + kvm_state = s; /* * On systems where the kernel can support different base page @@ -2609,8 +2610,6 @@ static int kvm_init(MachineState *ms) #endif } - kvm_state = s; - ret = kvm_arch_init(ms, s); if (ret < 0) { goto err; diff --git a/hw/arm/boot.c b/hw/arm/boot.c index 6b2f46af4d..ca9f69fd3d 100644 --- a/hw/arm/boot.c +++ b/hw/arm/boot.c @@ -1162,6 +1162,15 @@ static void arm_setup_confidential_firmware_boot(ARMCPU *cpu, struct arm_boot_info *info, const char *firmware_filename) { + uint64_t tmi_version = 0; + if (kvm_ioctl(kvm_state, KVM_GET_TMI_VERSION, &tmi_version) < 0) { + error_report("please check the kernel version!"); + exit(EXIT_FAILURE); + } + if (tmi_version < MIN_TMI_VERSION_FOR_UEFI_BOOTED_CVM) { + error_report("please check the tmi version!"); + exit(EXIT_FAILURE); + } ssize_t fw_size; const char *fname; AddressSpace *as = arm_boot_address_space(cpu, info); diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 6ffb26e7e6..39dfec0877 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -2050,8 +2050,13 @@ static void virt_set_memmap(VirtMachineState *vms, int pa_bits) /* support kae vf device tree nodes */ vms->memmap[VIRT_PCIE_MMIO] = (MemMapEntry) { 0x10000000, 0x2edf0000 }; vms->memmap[VIRT_KAE_DEVICE] = (MemMapEntry) { 0x3edf0000, 0x00200000 }; - - vms->memmap[VIRT_MEM].base = 1 * GiB; + uint64_t tmi_version = 0; + if (kvm_ioctl(kvm_state, KVM_GET_TMI_VERSION, &tmi_version) < 0) { + warn_report("can not get tmi version"); + } + if (tmi_version < MIN_TMI_VERSION_FOR_UEFI_BOOTED_CVM) { + vms->memmap[VIRT_MEM].base = 3 * GiB; + } vms->memmap[VIRT_MEM].size = ms->ram_size; info_report("[qemu] fix VIRT_MEM range 0x%llx - 0x%llx\n", (unsigned long long)(vms->memmap[VIRT_MEM].base), (unsigned long long)(vms->memmap[VIRT_MEM].base + ms->ram_size)); diff --git a/linux-headers/asm-arm64/kvm.h b/linux-headers/asm-arm64/kvm.h index 552fdcb18f..d69a71cbec 100644 --- a/linux-headers/asm-arm64/kvm.h +++ b/linux-headers/asm-arm64/kvm.h @@ -597,4 +597,6 @@ struct kvm_cap_arm_tmm_populate_region_args { #endif +#define MIN_TMI_VERSION_FOR_UEFI_BOOTED_CVM 0x20001 + #endif /* __ARM_KVM_H__ */ diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h index 84cec64b88..7a08f9b1e9 100644 --- a/linux-headers/linux/kvm.h +++ b/linux-headers/linux/kvm.h @@ -2422,4 +2422,7 @@ struct kvm_s390_zpci_op { /* flags for kvm_s390_zpci_op->u.reg_aen.flags */ #define KVM_S390_ZPCIOP_REGAEN_HOST (1 << 0) +/* get tmi version */ +#define KVM_GET_TMI_VERSION _IOR(KVMIO, 0xd2, uint64_t) + #endif /* __LINUX_KVM_H */ -- Gitee From 0119389040e4d78c6238875b812827d4f07b5f0f Mon Sep 17 00:00:00 
2001 From: gongchangsui Date: Mon, 17 Mar 2025 02:51:16 -0400 Subject: [PATCH 721/939] arm: VirtCCA: qemu CoDA support UEFI boot 1. Expose PCIe MMIO region from QEMU memory map. 2. Refactor struct kvm_user_data: data_start and data_size represent the address base and size of the MMIO region in UEFI boot mode, and the address base and size of the DTB in direct boot mode. Signed-off-by: gongchangsui --- accel/kvm/kvm-all.c | 8 ++++---- hw/arm/boot.c | 10 ++++++---- hw/arm/virt.c | 6 ++++++ linux-headers/linux/kvm.h | 12 +++++++++--- target/arm/kvm_arm.h | 2 ++ 5 files changed, 27 insertions(+), 11 deletions(-) diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index 38a48cc031..57c6718b77 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -3527,7 +3527,7 @@ int kvm_get_one_reg(CPUState *cs, uint64_t id, void *target) return r; } -int kvm_load_user_data(hwaddr loader_start, hwaddr image_end, hwaddr initrd_start, hwaddr dtb_end, hwaddr ram_size, +int kvm_load_user_data(hwaddr loader_start, hwaddr dtb_info, hwaddr data_start, hwaddr data_size, hwaddr ram_size, struct kvm_numa_info *numa_info) { KVMState *state = kvm_state; @@ -3535,9 +3535,9 @@ int kvm_load_user_data(hwaddr loader_start, hwaddr image_end, hwaddr initrd_star int ret; data.loader_start = loader_start; - data.image_end = image_end; - data.initrd_start = initrd_start; - data.dtb_end = dtb_end; + data.dtb_info = dtb_info; + data.data_start = data_start; + data.data_size = data_size; data.ram_size = ram_size; memcpy(&data.numa_info, numa_info, sizeof(struct kvm_numa_info)); diff --git a/hw/arm/boot.c b/hw/arm/boot.c index ca9f69fd3d..a3e0dbb68c 100644 --- a/hw/arm/boot.c +++ b/hw/arm/boot.c @@ -1149,10 +1149,10 @@ static void arm_setup_direct_kernel_boot(ARMCPU *cpu, if (kvm_enabled() && virtcca_cvm_enabled()) { if (info->dtb_limit == 0) { - info->dtb_limit = info->dtb_start + 0x200000; + info->dtb_limit = info->dtb_start + DTB_MAX; } - kvm_load_user_data(info->loader_start, image_high_addr, info->initrd_start, - info->dtb_limit, info->ram_size, (struct kvm_numa_info *)info->numa_info); + kvm_load_user_data(info->loader_start, 0x1, info->dtb_start, + info->dtb_limit - info->dtb_start, info->ram_size, (struct kvm_numa_info *)info->numa_info); tmm_add_ram_region(info->loader_start, image_high_addr - info->loader_start, info->initrd_start, info->dtb_limit - info->initrd_start, true); } @@ -1193,6 +1193,7 @@ static void arm_setup_firmware_boot(ARMCPU *cpu, struct arm_boot_info *info, con { + hwaddr mmio_start, mmio_size; /* Set up for booting firmware (which might load a kernel via fw_cfg) */ if (have_dtb(info)) { @@ -1246,7 +1247,8 @@ static void arm_setup_firmware_boot(ARMCPU *cpu, struct arm_boot_info *info, con if (info->confidential) { arm_setup_confidential_firmware_boot(cpu, info, firmware_filename); - kvm_load_user_data(UEFI_LOADER_START, UEFI_MAX_SIZE, info->loader_start, info->loader_start + DTB_MAX, info->ram_size, + virtcca_kvm_get_mmio_addr(&mmio_start, &mmio_size); + kvm_load_user_data(info->loader_start, DTB_MAX, mmio_start, mmio_size, info->ram_size, (struct kvm_numa_info *)info->numa_info); } /* diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 39dfec0877..6c5611826c 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -176,6 +176,12 @@ static const MemMapEntry base_memmap[] = { [VIRT_MEM] = { GiB, LEGACY_RAMLIMIT_BYTES }, }; +void virtcca_kvm_get_mmio_addr(hwaddr *mmio_start, hwaddr *mmio_size) +{ + 
*mmio_start = base_memmap[VIRT_PCIE_MMIO].base; + *mmio_size = base_memmap[VIRT_PCIE_MMIO].size; +} + /* * Highmem IO Regions: This memory map is floating, located after the RAM. * Each MemMapEntry base (GPA) will be dynamically computed, depending on the diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h index 7a08f9b1e9..c9ec7f862a 100644 --- a/linux-headers/linux/kvm.h +++ b/linux-headers/linux/kvm.h @@ -1510,9 +1510,15 @@ struct kvm_numa_info { struct kvm_user_data { __u64 loader_start; - __u64 image_end; - __u64 initrd_start; - __u64 dtb_end; + /* + * When the lowest bit of dtb_info is 0, the value of dtb_info represents the size of the DTB, + * and data_start and data_size represent the address base and size of the MMIO. + * When the lowest bit of dtb_info is 1, data_start and data_size represent the address base + * and size of the DTB. + */ + __u64 dtb_info; + __u64 data_start; + __u64 data_size; __u64 ram_size; struct kvm_numa_info numa_info; }; diff --git a/target/arm/kvm_arm.h b/target/arm/kvm_arm.h index 31457a57f7..62fbb713f4 100644 --- a/target/arm/kvm_arm.h +++ b/target/arm/kvm_arm.h @@ -73,6 +73,8 @@ int kvm_arm_vcpu_finalize(CPUState *cs, int feature); void kvm_arm_register_device(MemoryRegion *mr, uint64_t devid, uint64_t group, uint64_t attr, int dev_fd, uint64_t addr_ormask); +void virtcca_kvm_get_mmio_addr(hwaddr *mmio_start, hwaddr *mmio_size); + /** * kvm_arm_init_cpreg_list: * @cpu: ARMCPU -- Gitee From 458d90e226d5833661f9257f6af57c14f9b9bdfe Mon Sep 17 00:00:00 2001 From: gongchangsui Date: Mon, 17 Mar 2025 02:52:21 -0400 Subject: [PATCH 722/939] BUGFIX: Enforce isolation for virtcca_shared_hugepage Add memory isolation enforcement when virtcca hugepage is disabled. Signed-off-by: gongchangsui --- hw/core/numa.c | 3 ++- hw/virtio/vhost.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/hw/core/numa.c b/hw/core/numa.c index e7c48dab61..c691578ef5 100644 --- a/hw/core/numa.c +++ b/hw/core/numa.c @@ -728,7 +728,8 @@ void numa_complete_configuration(MachineState *ms) memory_region_init(ms->ram, OBJECT(ms), mc->default_ram_id, ms->ram_size); numa_init_memdev_container(ms, ms->ram); - if (virtcca_cvm_enabled() && virtcca_shared_hugepage->ram_block) { + if (virtcca_cvm_enabled() && virtcca_shared_hugepage && + virtcca_shared_hugepage->ram_block) { virtcca_shared_memory_configuration(ms); } } diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index 8b95558013..4bf0b03977 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -1617,7 +1617,7 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque, hdev->log_size = 0; hdev->log_enabled = false; hdev->started = false; - if (virtcca_cvm_enabled()) { + if (virtcca_cvm_enabled() && virtcca_shared_hugepage && virtcca_shared_hugepage->ram_block) { memory_listener_register(&hdev->memory_listener, &address_space_virtcca_shared_memory); } else { -- Gitee From bc08940ad3c75da49e05c596f79e9e0164573709 Mon Sep 17 00:00:00 2001 From: gongchangsui Date: Mon, 17 Mar 2025 02:56:40 -0400 Subject: [PATCH 723/939] backends: VirtCCA: cvm_gpa_start supports both 1GB and 3GB For TMM versions 2.1 and above, `cvm_gpa_start` is 1GB, while for versions prior to 2.1, `cvm_gpa_start` is 3GB. Shared huge page memory supports both `cvm_gpa_start` values. 
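For readability, the resulting bounce-buffer layout can be summarized as follows (a worked summary of the sizing rule in the hunk below, not additional code); the shared region is mapped at cvm_gpa_start and never extends past the 4GB address limit:

  /* TMM < 2.1 : cvm_gpa_start = 3GB -> shared size = 4GB - 3GB = 1GB
   * TMM >= 2.1: cvm_gpa_start = 1GB -> shared size = 4GB - 1GB = 3GB,
   *             or ram_size when the cVM has less than 3GB of RAM
   */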
Signed-off-by: gongchangsui --- backends/hostmem-file.c | 17 ++++++++++++++--- hw/arm/virt.c | 1 + hw/core/numa.c | 2 +- include/exec/memory.h | 11 +++++++---- 4 files changed, 23 insertions(+), 8 deletions(-) diff --git a/backends/hostmem-file.c b/backends/hostmem-file.c index 891fe4ac4a..ce63a372a3 100644 --- a/backends/hostmem-file.c +++ b/backends/hostmem-file.c @@ -27,6 +27,7 @@ OBJECT_DECLARE_SIMPLE_TYPE(HostMemoryBackendFile, MEMORY_BACKEND_FILE) bool virtcca_shared_hugepage_mapped = false; uint64_t virtcca_cvm_ram_size = 0; +uint64_t virtcca_cvm_gpa_start = 0; struct HostMemoryBackendFile { HostMemoryBackend parent_obj; @@ -101,8 +102,16 @@ virtcca_shared_backend_memory_alloc(char *mem_path, uint32_t ram_flags, Error ** error_report("parse virtcca share memory path failed"); exit(1); } - if (virtcca_cvm_ram_size >= VIRTCCA_SHARED_HUGEPAGE_MAX_SIZE) { - size = VIRTCCA_SHARED_HUGEPAGE_MAX_SIZE; + + /* + * 1) CVM_GPA_START = 3GB --> fix size = 1GB + * 2) CVM_GPA_START = 1GB && ram_size >= 3GB --> size = 3GB + * 3) CVM_GPA_START = 1GB && ram_size < 3GB --> size = ram_size + */ + if (virtcca_cvm_gpa_start != DEFAULT_VM_GPA_START) { + size = VIRTCCA_SHARED_HUGEPAGE_ADDR_LIMIT - virtcca_cvm_gpa_start; + } else if (virtcca_cvm_ram_size >= VIRTCCA_SHARED_HUGEPAGE_ADDR_LIMIT - DEFAULT_VM_GPA_START) { + size = VIRTCCA_SHARED_HUGEPAGE_ADDR_LIMIT - DEFAULT_VM_GPA_START; } virtcca_shared_hugepage = g_new(MemoryRegion, 1); @@ -172,7 +181,9 @@ file_backend_memory_alloc(HostMemoryBackend *backend, Error **errp) fb->mem_path, fb->offset, errp); g_free(name); - if (virtcca_cvm_enabled() && backend->share && !virtcca_shared_hugepage_mapped) { + if (virtcca_cvm_enabled() && backend->share && + (strcmp(fb->mem_path, "/dev/shm") != 0) && + !virtcca_shared_hugepage_mapped) { virtcca_shared_backend_memory_alloc(fb->mem_path, ram_flags, errp); } #endif diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 6c5611826c..3c31d3667e 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -2063,6 +2063,7 @@ static void virt_set_memmap(VirtMachineState *vms, int pa_bits) if (tmi_version < MIN_TMI_VERSION_FOR_UEFI_BOOTED_CVM) { vms->memmap[VIRT_MEM].base = 3 * GiB; } + virtcca_cvm_gpa_start = vms->memmap[VIRT_MEM].base; vms->memmap[VIRT_MEM].size = ms->ram_size; info_report("[qemu] fix VIRT_MEM range 0x%llx - 0x%llx\n", (unsigned long long)(vms->memmap[VIRT_MEM].base), (unsigned long long)(vms->memmap[VIRT_MEM].base + ms->ram_size)); diff --git a/hw/core/numa.c b/hw/core/numa.c index c691578ef5..98d896e687 100644 --- a/hw/core/numa.c +++ b/hw/core/numa.c @@ -655,7 +655,7 @@ static void virtcca_shared_memory_configuration(MachineState *ms) memory_region_init_alias(alias_mr, NULL, "alias-mr", virtcca_shared_hugepage, 0, int128_get64(virtcca_shared_hugepage->size)); memory_region_add_subregion(address_space_virtcca_shared_memory.root, - VIRTCCA_GPA_START, alias_mr); + virtcca_cvm_gpa_start, alias_mr); } void numa_complete_configuration(MachineState *ms) diff --git a/include/exec/memory.h b/include/exec/memory.h index 33778f5c64..c14dc69d27 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -243,14 +243,17 @@ typedef struct IOMMUTLBEvent { /* RAM FD is opened read-only */ #define RAM_READONLY_FD (1 << 11) -/* The GPA range of the VirtCCA bounce buffer is from 1GB to 4GB. */ -#define VIRTCCA_SHARED_HUGEPAGE_MAX_SIZE 0xc0000000ULL +/* The address limit of the VirtCCA bounce buffer is 4GB. 
*/ +#define VIRTCCA_SHARED_HUGEPAGE_ADDR_LIMIT 0x100000000ULL /* The VirtCCA shared hugepage memory granularity is 1GB */ #define VIRTCCA_SHARED_HUGEPAGE_ALIGN 0x40000000ULL -/* The GPA starting address of the VirtCCA CVM is 1GB */ -#define VIRTCCA_GPA_START 0x40000000ULL +/* The default GPA starting address of VM is 1GB */ +#define DEFAULT_VM_GPA_START 0x40000000ULL + +/* The GPA starting address of the VirtCCA CVM is 1GB or 3GB */ +extern uint64_t virtcca_cvm_gpa_start; extern uint64_t virtcca_cvm_ram_size; -- Gitee From cfb01b2fe4a99ed030dacdc49064a152a472dc2d Mon Sep 17 00:00:00 2001 From: Chao Gao Date: Thu, 19 Sep 2024 13:10:11 +0800 Subject: [PATCH 724/939] target/i386: Add more features enumerated by CPUID.7.2.EDX commit 10eaf9c0fb7060f45807becbb2742a9de9bc3632 upstream Following 5 bits in CPUID.7.2.EDX are supported by KVM. Add their supports in QEMU. Each of them indicates certain bits of IA32_SPEC_CTRL are supported. Those bits can control CPU speculation behavior which can be used to defend against side-channel attacks. bit0: intel-psfd if 1, indicates bit 7 of the IA32_SPEC_CTRL MSR is supported. Bit 7 of this MSR disables Fast Store Forwarding Predictor without disabling Speculative Store Bypass bit1: ipred-ctrl If 1, indicates bits 3 and 4 of the IA32_SPEC_CTRL MSR are supported. Bit 3 of this MSR enables IPRED_DIS control for CPL3. Bit 4 of this MSR enables IPRED_DIS control for CPL0/1/2 bit2: rrsba-ctrl If 1, indicates bits 5 and 6 of the IA32_SPEC_CTRL MSR are supported. Bit 5 of this MSR disables RRSBA behavior for CPL3. Bit 6 of this MSR disables RRSBA behavior for CPL0/1/2 bit3: ddpd-u If 1, indicates bit 8 of the IA32_SPEC_CTRL MSR is supported. Bit 8 of this MSR disables Data Dependent Prefetcher. bit4: bhi-ctrl if 1, indicates bit 10 of the IA32_SPEC_CTRL MSR is supported. Bit 10 of this MSR enables BHI_DIS_S behavior. Intel-SIG: 10eaf9c0fb70 target/i386: Add more features enumerated by CPUID.7.2.EDX Signed-off-by: Chao Gao Link: https://lore.kernel.org/r/20240919051011.118309-1-chao.gao@intel.com Signed-off-by: Paolo Bonzini Signed-off-by: Jason Zeng --- target/i386/cpu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 1fa08265bc..f3df62127c 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -1000,8 +1000,8 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { [FEAT_7_2_EDX] = { .type = CPUID_FEATURE_WORD, .feat_names = { - NULL, NULL, NULL, NULL, - NULL, "mcdt-no", NULL, NULL, + "intel-psfd", "ipred-ctrl", "rrsba-ctrl", "ddpd-u", + "bhi-ctrl", "mcdt-no", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, -- Gitee From bce44f92530fed18cac1e51f81217a6addf992bd Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 8 May 2024 11:10:54 +0200 Subject: [PATCH 725/939] target/i386: fix feature dependency for WAITPKG commit fe01af5d47d4cf7fdf90c54d43f784e5068c8d72 upstream. The VMX feature bit depends on general availability of WAITPKG, not the other way round. 
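The direction matters because of how the feature_dependencies[] table is consumed: roughly (a simplified sketch of the dependency pass in target/i386/cpu.c, not code added by this patch), a missing ".from" feature forces the ".to" feature off, never the reverse:

  if (!(env->features[d->from.index] & d->from.mask)) {
      env->features[d->to.index] &= ~d->to.mask;
  }

Hence WAITPKG belongs on the ".from" side and the "enable user wait and pause" VMX secondary control on the ".to" side, as done below.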
Intel-SIG: commit fe01af5d47d4 target/i386: fix feature dependency for WAITPKG Fixes: 33cc88261c3 ("target/i386: add support for VMX_SECONDARY_EXEC_ENABLE_USER_WAIT_PAUSE", 2023-08-28) Cc: qemu-stable@nongnu.org Reviewed-by: Zhao Liu Signed-off-by: Paolo Bonzini Signed-off-by: Jason Zeng --- target/i386/cpu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index f3df62127c..860934b39f 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -1550,8 +1550,8 @@ static FeatureDep feature_dependencies[] = { .to = { FEAT_SVM, ~0ull }, }, { - .from = { FEAT_VMX_SECONDARY_CTLS, VMX_SECONDARY_EXEC_ENABLE_USER_WAIT_PAUSE }, - .to = { FEAT_7_0_ECX, CPUID_7_0_ECX_WAITPKG }, + .from = { FEAT_7_0_ECX, CPUID_7_0_ECX_WAITPKG }, + .to = { FEAT_VMX_SECONDARY_CTLS, VMX_SECONDARY_EXEC_ENABLE_USER_WAIT_PAUSE }, }, }; -- Gitee From 110184b14d17c13e046e9c4ebed6c3cec29b31d0 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Wed, 8 Nov 2023 23:20:07 -0800 Subject: [PATCH 726/939] target/i386: add support for FRED in CPUID enumeration commit c1acad9f72d14daf918563eb77d2b31c39fbd06a upstream. FRED, i.e., the Intel flexible return and event delivery architecture, defines simple new transitions that change privilege level (ring transitions). The new transitions defined by the FRED architecture are FRED event delivery and, for returning from events, two FRED return instructions. FRED event delivery can effect a transition from ring 3 to ring 0, but it is used also to deliver events incident to ring 0. One FRED instruction (ERETU) effects a return from ring 0 to ring 3, while the other (ERETS) returns while remaining in ring 0. Collectively, FRED event delivery and the FRED return instructions are FRED transitions. In addition to these transitions, the FRED architecture defines a new instruction (LKGS) for managing the state of the GS segment register. The LKGS instruction can be used by 64-bit operating systems that do not use the new FRED transitions. WRMSRNS is an instruction that behaves exactly like WRMSR, with the only difference being that it is not a serializing instruction by default. Under certain conditions, WRMSRNS may replace WRMSR to improve performance. FRED uses it to switch RSP0 in a faster manner. Search for the latest FRED spec in most search engines with this search pattern: site:intel.com FRED (flexible return and event delivery) specification The CPUID feature flag CPUID.(EAX=7,ECX=1):EAX[17] enumerates FRED, and the CPUID feature flag CPUID.(EAX=7,ECX=1):EAX[18] enumerates LKGS, and the CPUID feature flag CPUID.(EAX=7,ECX=1):EAX[19] enumerates WRMSRNS. Add CPUID definitions for FRED/LKGS/WRMSRNS, and expose them to KVM guests. Because FRED relies on LKGS and WRMSRNS, add that to feature dependency map. Intel-SIG: commit c1acad9f72d1 target/i386: add support for FRED in CPUID enumeration Tested-by: Shan Kang Signed-off-by: Xin Li Message-ID: <20231109072012.8078-2-xin3.li@intel.com> [Fix order of dependencies, add dependencies from LM to FRED. 
- Paolo] Signed-off-by: Paolo Bonzini Signed-off-by: Jason Zeng --- target/i386/cpu.c | 14 +++++++++++++- target/i386/cpu.h | 6 ++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 860934b39f..47f00392be 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -966,7 +966,7 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { "avx-vnni", "avx512-bf16", NULL, "cmpccxadd", NULL, NULL, "fzrm", "fsrs", "fsrc", NULL, NULL, NULL, - NULL, NULL, NULL, NULL, + NULL, "fred", "lkgs", "wrmsrns", NULL, "amx-fp16", NULL, "avx-ifma", NULL, NULL, "lam", NULL, NULL, NULL, NULL, NULL, @@ -1553,6 +1553,18 @@ static FeatureDep feature_dependencies[] = { .from = { FEAT_7_0_ECX, CPUID_7_0_ECX_WAITPKG }, .to = { FEAT_VMX_SECONDARY_CTLS, VMX_SECONDARY_EXEC_ENABLE_USER_WAIT_PAUSE }, }, + { + .from = { FEAT_8000_0001_EDX, CPUID_EXT2_LM }, + .to = { FEAT_7_1_EAX, CPUID_7_1_EAX_FRED }, + }, + { + .from = { FEAT_7_1_EAX, CPUID_7_1_EAX_LKGS }, + .to = { FEAT_7_1_EAX, CPUID_7_1_EAX_FRED }, + }, + { + .from = { FEAT_7_1_EAX, CPUID_7_1_EAX_WRMSRNS }, + .to = { FEAT_7_1_EAX, CPUID_7_1_EAX_FRED }, + }, }; typedef struct X86RegisterInfo32 { diff --git a/target/i386/cpu.h b/target/i386/cpu.h index 21fb769cce..f392626f98 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -941,6 +941,12 @@ uint64_t x86_cpu_get_supported_feature_word(FeatureWord w, #define CPUID_7_1_EDX_AMX_COMPLEX (1U << 8) /* PREFETCHIT0/1 Instructions */ #define CPUID_7_1_EDX_PREFETCHITI (1U << 14) +/* Flexible return and event delivery (FRED) */ +#define CPUID_7_1_EAX_FRED (1U << 17) +/* Load into IA32_KERNEL_GS_BASE (LKGS) */ +#define CPUID_7_1_EAX_LKGS (1U << 18) +/* Non-Serializing Write to Model Specific Register (WRMSRNS) */ +#define CPUID_7_1_EAX_WRMSRNS (1U << 19) /* Do not exhibit MXCSR Configuration Dependent Timing (MCDT) behavior */ #define CPUID_7_2_EDX_MCDT_NO (1U << 5) -- Gitee From 1a2ee56c173984212ba7b9970aa36e307094d460 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Wed, 8 Nov 2023 23:20:08 -0800 Subject: [PATCH 727/939] target/i386: mark CR4.FRED not reserved commit f88ddc40c6d8b591a357108feec52cea13796d2d upstream. The CR4.FRED bit, i.e., CR4[32], is no longer a reserved bit when FRED is exposed to guests, otherwise it is still a reserved bit. 
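The guest-visible effect, expressed against the cr4_reserved_bits() helper this patch extends (illustration only, not additional code):

  /* FRED enumerated (CPUID.(EAX=7,ECX=1):EAX[17] set):
   *     bit 32 is dropped from cr4_reserved_bits(env), so CR4.FRED may be set.
   * FRED not enumerated:
   *     bit 32 stays in the reserved set and CR4 values with it set keep being rejected.
   */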
Intel-SIG: commit f88ddc40c6d8 target/i386: mark CR4.FRED not reserved Tested-by: Shan Kang Signed-off-by: Xin Li Reviewed-by: Zhao Liu Message-ID: <20231109072012.8078-3-xin3.li@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Jason Zeng --- target/i386/cpu.h | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/target/i386/cpu.h b/target/i386/cpu.h index f392626f98..418daeab04 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -264,6 +264,18 @@ typedef enum X86Seg { #define CR4_PKS_MASK (1U << 24) #define CR4_LAM_SUP_MASK (1U << 28) +#ifdef TARGET_X86_64 +#define CR4_FRED_MASK (1ULL << 32) +#else +#define CR4_FRED_MASK 0 +#endif + +#ifdef TARGET_X86_64 +#define CR4_FRED_MASK (1ULL << 32) +#else +#define CR4_FRED_MASK 0 +#endif + #define CR4_RESERVED_MASK \ (~(target_ulong)(CR4_VME_MASK | CR4_PVI_MASK | CR4_TSD_MASK \ | CR4_DE_MASK | CR4_PSE_MASK | CR4_PAE_MASK \ @@ -272,7 +284,7 @@ typedef enum X86Seg { | CR4_LA57_MASK \ | CR4_FSGSBASE_MASK | CR4_PCIDE_MASK | CR4_OSXSAVE_MASK \ | CR4_SMEP_MASK | CR4_SMAP_MASK | CR4_PKE_MASK | CR4_PKS_MASK \ - | CR4_LAM_SUP_MASK)) + | CR4_LAM_SUP_MASK | CR4_FRED_MASK)) #define DR6_BD (1 << 13) #define DR6_BS (1 << 14) @@ -2551,6 +2563,9 @@ static inline uint64_t cr4_reserved_bits(CPUX86State *env) if (!(env->features[FEAT_7_1_EAX] & CPUID_7_1_EAX_LAM)) { reserved_bits |= CR4_LAM_SUP_MASK; } + if (!(env->features[FEAT_7_1_EAX] & CPUID_7_1_EAX_FRED)) { + reserved_bits |= CR4_FRED_MASK; + } return reserved_bits; } -- Gitee From 3aa85bc2d9265305dde99cde12d716ffa9bcef4b Mon Sep 17 00:00:00 2001 From: Xin Li Date: Wed, 8 Nov 2023 23:20:10 -0800 Subject: [PATCH 728/939] vmxcap: add support for VMX FRED controls commit 2e641870170e28df28c5d9914e76ea7cab141516 upstream. Report secondary vm-exit controls and the VMX controls used to save/load FRED MSRs. Intel-SIG: commit 2e641870170e vmxcap: add support for VMX FRED controls Tested-by: Shan Kang Signed-off-by: Xin Li Message-ID: <20231109072012.8078-5-xin3.li@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Jason Zeng --- scripts/kvm/vmxcap | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/scripts/kvm/vmxcap b/scripts/kvm/vmxcap index 3fb4d5b342..44898d73c2 100755 --- a/scripts/kvm/vmxcap +++ b/scripts/kvm/vmxcap @@ -24,6 +24,7 @@ MSR_IA32_VMX_TRUE_EXIT_CTLS = 0x48F MSR_IA32_VMX_TRUE_ENTRY_CTLS = 0x490 MSR_IA32_VMX_VMFUNC = 0x491 MSR_IA32_VMX_PROCBASED_CTLS3 = 0x492 +MSR_IA32_VMX_EXIT_CTLS2 = 0x493 class msr(object): def __init__(self): @@ -219,11 +220,21 @@ controls = [ 23: 'Clear IA32_BNDCFGS', 24: 'Conceal VM exits from PT', 25: 'Clear IA32_RTIT_CTL', + 31: 'Activate secondary VM-exit controls', }, cap_msr = MSR_IA32_VMX_EXIT_CTLS, true_cap_msr = MSR_IA32_VMX_TRUE_EXIT_CTLS, ), + Allowed1Control( + name = 'secondary VM-Exit controls', + bits = { + 0: 'Save IA32 FRED MSRs', + 1: 'Load IA32 FRED MSRs', + }, + cap_msr = MSR_IA32_VMX_EXIT_CTLS2, + ), + Control( name = 'VM-Entry controls', bits = { @@ -237,6 +248,7 @@ controls = [ 16: 'Load IA32_BNDCFGS', 17: 'Conceal VM entries from PT', 18: 'Load IA32_RTIT_CTL', + 23: 'Load IA32 FRED MSRs', }, cap_msr = MSR_IA32_VMX_ENTRY_CTLS, true_cap_msr = MSR_IA32_VMX_TRUE_ENTRY_CTLS, -- Gitee From 5f828613ba69ce640512a900f630515d980208dd Mon Sep 17 00:00:00 2001 From: Xin Li Date: Wed, 8 Nov 2023 23:20:11 -0800 Subject: [PATCH 729/939] target/i386: enumerate VMX nested-exception support commit ef202d64c3020f3df03c39d3ad688732d81aaae8 upstream. 
Allow VMX nested-exception support to be exposed in KVM guests, thus nested KVM guests can enumerate it. Intel-SIG: commit ef202d64c302 target/i386: enumerate VMX nested-exception support Tested-by: Shan Kang Signed-off-by: Xin Li Message-ID: <20231109072012.8078-6-xin3.li@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Jason Zeng --- scripts/kvm/vmxcap | 1 + target/i386/cpu.c | 1 + target/i386/cpu.h | 1 + 3 files changed, 3 insertions(+) diff --git a/scripts/kvm/vmxcap b/scripts/kvm/vmxcap index 44898d73c2..508be19c75 100755 --- a/scripts/kvm/vmxcap +++ b/scripts/kvm/vmxcap @@ -117,6 +117,7 @@ controls = [ 54: 'INS/OUTS instruction information', 55: 'IA32_VMX_TRUE_*_CTLS support', 56: 'Skip checks on event error code', + 58: 'VMX nested exception support', }, msr = MSR_IA32_VMX_BASIC, ), diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 47f00392be..00e636e61c 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -1344,6 +1344,7 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { [54] = "vmx-ins-outs", [55] = "vmx-true-ctls", [56] = "vmx-any-errcode", + [58] = "vmx-nested-exception", }, .msr = { .index = MSR_IA32_VMX_BASIC, diff --git a/target/i386/cpu.h b/target/i386/cpu.h index 418daeab04..b03237c305 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -1065,6 +1065,7 @@ uint64_t x86_cpu_get_supported_feature_word(FeatureWord w, #define MSR_VMX_BASIC_INS_OUTS (1ULL << 54) #define MSR_VMX_BASIC_TRUE_CTLS (1ULL << 55) #define MSR_VMX_BASIC_ANY_ERRCODE (1ULL << 56) +#define MSR_VMX_BASIC_NESTED_EXCEPTION (1ULL << 58) #define MSR_VMX_MISC_PREEMPTION_TIMER_SHIFT_MASK 0x1Full #define MSR_VMX_MISC_STORE_LMA (1ULL << 5) -- Gitee From c3e47749fba4418d80bf4314335118452912b29c Mon Sep 17 00:00:00 2001 From: Xin Li Date: Wed, 8 Nov 2023 23:20:12 -0800 Subject: [PATCH 730/939] target/i386: Add get/set/migrate support for FRED MSRs commit 4ebd98eb3ade5957a842da1420bda012eeeaab9c upstream. FRED CPU states are managed in 9 new FRED MSRs, in addtion to a few existing CPU registers and MSRs, e.g., CR4.FRED and MSR_IA32_PL0_SSP. Save/restore/migrate FRED MSRs if FRED is exposed to the guest. 
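A note on migration compatibility (summarizing the machine.c hunk below, not extra code):

  /* The new state travels in a vmstate subsection ("cpu/fred") whose
   * .needed hook (intel_fred_msrs_needed) checks FEAT_7_1_EAX for
   * CPUID_7_1_EAX_FRED, so the migration stream is unchanged for
   * guests that do not expose FRED. */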
Intel-SIG: commit 4ebd98eb3ade target/i386: Add get/set/migrate support for FRED MSRs Tested-by: Shan Kang Signed-off-by: Xin Li Message-ID: <20231109072012.8078-7-xin3.li@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Jason Zeng --- target/i386/cpu.h | 22 +++++++++++++++++++ target/i386/kvm/kvm.c | 49 +++++++++++++++++++++++++++++++++++++++++++ target/i386/machine.c | 28 +++++++++++++++++++++++++ 3 files changed, 99 insertions(+) diff --git a/target/i386/cpu.h b/target/i386/cpu.h index b03237c305..1b9d922651 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -539,6 +539,17 @@ typedef enum X86Seg { #define MSR_IA32_XFD 0x000001c4 #define MSR_IA32_XFD_ERR 0x000001c5 +/* FRED MSRs */ +#define MSR_IA32_FRED_RSP0 0x000001cc /* Stack level 0 regular stack pointer */ +#define MSR_IA32_FRED_RSP1 0x000001cd /* Stack level 1 regular stack pointer */ +#define MSR_IA32_FRED_RSP2 0x000001ce /* Stack level 2 regular stack pointer */ +#define MSR_IA32_FRED_RSP3 0x000001cf /* Stack level 3 regular stack pointer */ +#define MSR_IA32_FRED_STKLVLS 0x000001d0 /* FRED exception stack levels */ +#define MSR_IA32_FRED_SSP1 0x000001d1 /* Stack level 1 shadow stack pointer in ring 0 */ +#define MSR_IA32_FRED_SSP2 0x000001d2 /* Stack level 2 shadow stack pointer in ring 0 */ +#define MSR_IA32_FRED_SSP3 0x000001d3 /* Stack level 3 shadow stack pointer in ring 0 */ +#define MSR_IA32_FRED_CONFIG 0x000001d4 /* FRED Entrypoint and interrupt stack level */ + #define MSR_IA32_BNDCFGS 0x00000d90 #define MSR_IA32_XSS 0x00000da0 #define MSR_IA32_UMWAIT_CONTROL 0xe1 @@ -1698,6 +1709,17 @@ typedef struct CPUArchState { target_ulong cstar; target_ulong fmask; target_ulong kernelgsbase; + + /* FRED MSRs */ + uint64_t fred_rsp0; + uint64_t fred_rsp1; + uint64_t fred_rsp2; + uint64_t fred_rsp3; + uint64_t fred_stklvls; + uint64_t fred_ssp1; + uint64_t fred_ssp2; + uint64_t fred_ssp3; + uint64_t fred_config; #endif uint64_t tsc_adjust; diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index 12e920bbb4..5f3497e122 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -3391,6 +3391,17 @@ static int kvm_put_msrs(X86CPU *cpu, int level) kvm_msr_entry_add(cpu, MSR_KERNELGSBASE, env->kernelgsbase); kvm_msr_entry_add(cpu, MSR_FMASK, env->fmask); kvm_msr_entry_add(cpu, MSR_LSTAR, env->lstar); + if (env->features[FEAT_7_1_EAX] & CPUID_7_1_EAX_FRED) { + kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP0, env->fred_rsp0); + kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP1, env->fred_rsp1); + kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP2, env->fred_rsp2); + kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP3, env->fred_rsp3); + kvm_msr_entry_add(cpu, MSR_IA32_FRED_STKLVLS, env->fred_stklvls); + kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP1, env->fred_ssp1); + kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP2, env->fred_ssp2); + kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP3, env->fred_ssp3); + kvm_msr_entry_add(cpu, MSR_IA32_FRED_CONFIG, env->fred_config); + } } #endif @@ -3867,6 +3878,17 @@ static int kvm_get_msrs(X86CPU *cpu) kvm_msr_entry_add(cpu, MSR_KERNELGSBASE, 0); kvm_msr_entry_add(cpu, MSR_FMASK, 0); kvm_msr_entry_add(cpu, MSR_LSTAR, 0); + if (env->features[FEAT_7_1_EAX] & CPUID_7_1_EAX_FRED) { + kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP0, 0); + kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP1, 0); + kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP2, 0); + kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP3, 0); + kvm_msr_entry_add(cpu, MSR_IA32_FRED_STKLVLS, 0); + kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP1, 0); + kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP2, 0); + 
kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP3, 0); + kvm_msr_entry_add(cpu, MSR_IA32_FRED_CONFIG, 0); + } } #endif kvm_msr_entry_add(cpu, MSR_KVM_SYSTEM_TIME, 0); @@ -4092,6 +4114,33 @@ static int kvm_get_msrs(X86CPU *cpu) case MSR_LSTAR: env->lstar = msrs[i].data; break; + case MSR_IA32_FRED_RSP0: + env->fred_rsp0 = msrs[i].data; + break; + case MSR_IA32_FRED_RSP1: + env->fred_rsp1 = msrs[i].data; + break; + case MSR_IA32_FRED_RSP2: + env->fred_rsp2 = msrs[i].data; + break; + case MSR_IA32_FRED_RSP3: + env->fred_rsp3 = msrs[i].data; + break; + case MSR_IA32_FRED_STKLVLS: + env->fred_stklvls = msrs[i].data; + break; + case MSR_IA32_FRED_SSP1: + env->fred_ssp1 = msrs[i].data; + break; + case MSR_IA32_FRED_SSP2: + env->fred_ssp2 = msrs[i].data; + break; + case MSR_IA32_FRED_SSP3: + env->fred_ssp3 = msrs[i].data; + break; + case MSR_IA32_FRED_CONFIG: + env->fred_config = msrs[i].data; + break; #endif case MSR_IA32_TSC: env->tsc = msrs[i].data; diff --git a/target/i386/machine.c b/target/i386/machine.c index 9a1cb8f3b8..7cbfbc0efb 100644 --- a/target/i386/machine.c +++ b/target/i386/machine.c @@ -1544,6 +1544,33 @@ static const VMStateDescription vmstate_msr_xfd = { }; #ifdef TARGET_X86_64 +static bool intel_fred_msrs_needed(void *opaque) +{ + X86CPU *cpu = opaque; + CPUX86State *env = &cpu->env; + + return !!(env->features[FEAT_7_1_EAX] & CPUID_7_1_EAX_FRED); +} + +static const VMStateDescription vmstate_msr_fred = { + .name = "cpu/fred", + .version_id = 1, + .minimum_version_id = 1, + .needed = intel_fred_msrs_needed, + .fields = (VMStateField[]) { + VMSTATE_UINT64(env.fred_rsp0, X86CPU), + VMSTATE_UINT64(env.fred_rsp1, X86CPU), + VMSTATE_UINT64(env.fred_rsp2, X86CPU), + VMSTATE_UINT64(env.fred_rsp3, X86CPU), + VMSTATE_UINT64(env.fred_stklvls, X86CPU), + VMSTATE_UINT64(env.fred_ssp1, X86CPU), + VMSTATE_UINT64(env.fred_ssp2, X86CPU), + VMSTATE_UINT64(env.fred_ssp3, X86CPU), + VMSTATE_UINT64(env.fred_config, X86CPU), + VMSTATE_END_OF_LIST() + } + }; + static bool amx_xtile_needed(void *opaque) { X86CPU *cpu = opaque; @@ -1768,6 +1795,7 @@ const VMStateDescription vmstate_x86_cpu = { &vmstate_pdptrs, &vmstate_msr_xfd, #ifdef TARGET_X86_64 + &vmstate_msr_fred, &vmstate_amx_xtile, #endif &vmstate_arch_lbr, -- Gitee From 1eacc509e9158b9e87f05fc9844142c0022b2d64 Mon Sep 17 00:00:00 2001 From: "Xin Li (Intel)" Date: Wed, 7 Aug 2024 01:18:10 -0700 Subject: [PATCH 731/939] target/i386: Delete duplicated macro definition CR4_FRED_MASK commit a23bc6539890d8b27458cf56bc4ed0e0d3c2de3e upstream. Macro CR4_FRED_MASK is defined twice, delete one. 
Intel-SIG: commit a23bc6539890 target/i386: Delete duplicated macro definition CR4_FRED_MASK Signed-off-by: Xin Li (Intel) Link: https://lore.kernel.org/r/20240807081813.735158-2-xin@zytor.com Signed-off-by: Paolo Bonzini Signed-off-by: Jason Zeng --- target/i386/cpu.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/target/i386/cpu.h b/target/i386/cpu.h index 1b9d922651..f022749c86 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -270,12 +270,6 @@ typedef enum X86Seg { #define CR4_FRED_MASK 0 #endif -#ifdef TARGET_X86_64 -#define CR4_FRED_MASK (1ULL << 32) -#else -#define CR4_FRED_MASK 0 -#endif - #define CR4_RESERVED_MASK \ (~(target_ulong)(CR4_VME_MASK | CR4_PVI_MASK | CR4_TSD_MASK \ | CR4_DE_MASK | CR4_PSE_MASK | CR4_PAE_MASK \ -- Gitee From 4dea92e8570650776ed8caa0fedf0a90920f5e97 Mon Sep 17 00:00:00 2001 From: "Xin Li (Intel)" Date: Wed, 7 Aug 2024 01:18:11 -0700 Subject: [PATCH 732/939] target/i386: Add VMX control bits for nested FRED support commit 7c6ec5bc5fea92a4ddea3f0189e3a7e7588e1d19 upstream. Add definitions of 1) VM-exit activate secondary controls bit 2) VM-entry load FRED bit which are required to enable nested FRED. Intel-SIG: commit 7c6ec5bc5fea target/i386: Add VMX control bits for nested FRED support Reviewed-by: Zhao Liu Signed-off-by: Xin Li (Intel) Link: https://lore.kernel.org/r/20240807081813.735158-3-xin@zytor.com Signed-off-by: Paolo Bonzini Signed-off-by: Jason Zeng --- target/i386/cpu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 00e636e61c..f80570f4da 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -1271,7 +1271,7 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { "vmx-exit-save-efer", "vmx-exit-load-efer", "vmx-exit-save-preemption-timer", "vmx-exit-clear-bndcfgs", NULL, "vmx-exit-clear-rtit-ctl", NULL, NULL, - NULL, "vmx-exit-load-pkrs", NULL, NULL, + NULL, "vmx-exit-load-pkrs", NULL, "vmx-exit-secondary-ctls", }, .msr = { .index = MSR_IA32_VMX_TRUE_EXIT_CTLS, @@ -1286,7 +1286,7 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { NULL, "vmx-entry-ia32e-mode", NULL, NULL, NULL, "vmx-entry-load-perf-global-ctrl", "vmx-entry-load-pat", "vmx-entry-load-efer", "vmx-entry-load-bndcfgs", NULL, "vmx-entry-load-rtit-ctl", NULL, - NULL, NULL, "vmx-entry-load-pkrs", NULL, + NULL, NULL, "vmx-entry-load-pkrs", "vmx-entry-load-fred", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, }, -- Gitee From 513d33050869a337262fdba0a2d064e7ce9fdb22 Mon Sep 17 00:00:00 2001 From: Lei Wang Date: Wed, 7 Aug 2024 01:18:12 -0700 Subject: [PATCH 733/939] target/i386: Raise the highest index value used for any VMCS encoding commit ab891454ebe82f7e359be721007652556f9f8356 upstream. Because the index value of the VMCS field encoding of FRED injected-event data (one of the newly added VMCS fields for FRED transitions), 0x52, is larger than any existing index value, raise the highest index value used for any VMCS encoding to 0x52. Because the index value of the VMCS field encoding of Secondary VM-exit controls, 0x44, is larger than any existing index value, raise the highest index value used for any VMCS encoding to 0x44. 
Intel-SIG: commit ab891454ebe8 target/i386: Raise the highest index value used for any VMCS encoding Co-developed-by: Xin Li Signed-off-by: Xin Li Signed-off-by: Lei Wang Signed-off-by: Xin Li (Intel) Link: https://lore.kernel.org/r/20240807081813.735158-4-xin@zytor.com Signed-off-by: Paolo Bonzini Signed-off-by: Jason Zeng --- target/i386/cpu.h | 1 + target/i386/kvm/kvm.c | 9 ++++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/target/i386/cpu.h b/target/i386/cpu.h index f022749c86..fb6721f182 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -1166,6 +1166,7 @@ uint64_t x86_cpu_get_supported_feature_word(FeatureWord w, #define VMX_VM_EXIT_PT_CONCEAL_PIP 0x01000000 #define VMX_VM_EXIT_CLEAR_IA32_RTIT_CTL 0x02000000 #define VMX_VM_EXIT_LOAD_IA32_PKRS 0x20000000 +#define VMX_VM_EXIT_ACTIVATE_SECONDARY_CONTROLS 0x80000000 #define VMX_VM_ENTRY_LOAD_DEBUG_CONTROLS 0x00000004 #define VMX_VM_ENTRY_IA32E_MODE 0x00000200 diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index 5f3497e122..ce96ed9158 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -3254,7 +3254,14 @@ static void kvm_msr_entry_add_vmx(X86CPU *cpu, FeatureWordArray f) kvm_msr_entry_add(cpu, MSR_IA32_VMX_CR4_FIXED0, CR4_VMXE_MASK); - if (f[FEAT_VMX_SECONDARY_CTLS] & VMX_SECONDARY_EXEC_TSC_SCALING) { + if (f[FEAT_7_1_EAX] & CPUID_7_1_EAX_FRED) { + /* FRED injected-event data (0x2052). */ + kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMCS_ENUM, 0x52); + } else if (f[FEAT_VMX_EXIT_CTLS] & + VMX_VM_EXIT_ACTIVATE_SECONDARY_CONTROLS) { + /* Secondary VM-exit controls (0x2044). */ + kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMCS_ENUM, 0x44); + } else if (f[FEAT_VMX_SECONDARY_CTLS] & VMX_SECONDARY_EXEC_TSC_SCALING) { /* TSC multiplier (0x2032). */ kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMCS_ENUM, 0x32); } else { -- Gitee From bd6fec2cb2bb811aa73a2a6e6da45c76ecded49c Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 27 Jun 2024 01:12:42 +0200 Subject: [PATCH 734/939] target/i386: pass X86CPU to x86_cpu_get_supported_feature_word commit 8dee38483274bd0fcf3f74dea024d719b958200d upstream. This allows modifying the bits in "-cpu max"/"-cpu host" depending on the guest CPU vendor (which, at least by default, is the host vendor in the case of KVM). For example, machine check architecture differs between Intel and AMD, and bits from AMD should be dropped when configuring the guest for an Intel model. Intel-SIG: commit 8dee38483274 target/i386: pass X86CPU to x86_cpu_get_supported_feature_word Cc: Xiaoyao Li Cc: John Allen Signed-off-by: Paolo Bonzini Signed-off-by: Jason Zeng --- target/i386/cpu.c | 11 +++++------ target/i386/cpu.h | 3 +-- target/i386/kvm/kvm-cpu.c | 2 +- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index f80570f4da..dfc0f7fd2d 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -5959,8 +5959,7 @@ CpuDefinitionInfoList *qmp_query_cpu_definitions(Error **errp) #endif /* !CONFIG_USER_ONLY */ -uint64_t x86_cpu_get_supported_feature_word(FeatureWord w, - bool migratable_only) +uint64_t x86_cpu_get_supported_feature_word(X86CPU *cpu, FeatureWord w) { FeatureWordInfo *wi = &feature_word_info[w]; uint64_t r = 0; @@ -6002,7 +6001,7 @@ uint64_t x86_cpu_get_supported_feature_word(FeatureWord w, r &= ~unavail; } #endif - if (migratable_only) { + if (cpu && cpu->migratable) { r &= x86_cpu_get_migratable_flags(w); } return r; @@ -7324,7 +7323,7 @@ void x86_cpu_expand_features(X86CPU *cpu, Error **errp) * by the user. 
*/ env->features[w] |= - x86_cpu_get_supported_feature_word(w, cpu->migratable) & + x86_cpu_get_supported_feature_word(cpu, w) & ~env->user_features[w] & ~feature_word_info[w].no_autoenable_flags; } @@ -7450,7 +7449,7 @@ static void x86_cpu_filter_features(X86CPU *cpu, bool verbose) for (w = 0; w < FEATURE_WORDS; w++) { uint64_t host_feat = - x86_cpu_get_supported_feature_word(w, false); + x86_cpu_get_supported_feature_word(NULL, w); uint64_t requested_features = env->features[w]; uint64_t unavailable_features = requested_features & ~host_feat; mark_unavailable_features(cpu, w, unavailable_features, prefix); @@ -7566,7 +7565,7 @@ static void x86_cpu_realizefn(DeviceState *dev, Error **errp) env->features[FEAT_PERF_CAPABILITIES] & PERF_CAP_LBR_FMT; if (requested_lbr_fmt && kvm_enabled()) { uint64_t host_perf_cap = - x86_cpu_get_supported_feature_word(FEAT_PERF_CAPABILITIES, false); + x86_cpu_get_supported_feature_word(NULL, FEAT_PERF_CAPABILITIES); unsigned host_lbr_fmt = host_perf_cap & PERF_CAP_LBR_FMT; if (!cpu->enable_pmu) { diff --git a/target/i386/cpu.h b/target/i386/cpu.h index fb6721f182..b90182582f 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -655,8 +655,7 @@ typedef enum FeatureWord { } FeatureWord; typedef uint64_t FeatureWordArray[FEATURE_WORDS]; -uint64_t x86_cpu_get_supported_feature_word(FeatureWord w, - bool migratable_only); +uint64_t x86_cpu_get_supported_feature_word(X86CPU *cpu, FeatureWord w); /* cpuid_features bits */ #define CPUID_FP87 (1U << 0) diff --git a/target/i386/kvm/kvm-cpu.c b/target/i386/kvm/kvm-cpu.c index f76972e47e..a3bc8d8f83 100644 --- a/target/i386/kvm/kvm-cpu.c +++ b/target/i386/kvm/kvm-cpu.c @@ -137,7 +137,7 @@ static void kvm_cpu_xsave_init(void) if (!esa->size) { continue; } - if ((x86_cpu_get_supported_feature_word(esa->feature, false) & esa->bits) + if ((x86_cpu_get_supported_feature_word(NULL, esa->feature) & esa->bits) != esa->bits) { continue; } -- Gitee From 0d5ac4f36208eadbb922f552ba1b762f5bd0c3a6 Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Wed, 24 Jan 2024 21:40:15 -0500 Subject: [PATCH 735/939] i386/cpuid: Remove subleaf constraint on CPUID leaf 1F commit a3b5376521a0de898440e8d0942b54e628f0949f upstream. No such constraint that subleaf index needs to be less than 64. Intel-SIG: commit a3b5376521a0 i386/cpuid: Remove subleaf constraint on CPUID leaf 1F Signed-off-by: Xiaoyao Li Reviewed-by:Yang Weijiang Message-ID: <20240125024016.2521244-3-xiaoyao.li@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Jason Zeng --- target/i386/kvm/kvm.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index ce96ed9158..850104f6b5 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -1928,10 +1928,6 @@ int kvm_arch_init_vcpu(CPUState *cs) break; } - if (i == 0x1f && j == 64) { - break; - } - c->function = i; c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX; c->index = j; -- Gitee From e0b51ea0f229ea9c6788fa0da252e8100e30241e Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Wed, 14 Aug 2024 03:54:23 -0400 Subject: [PATCH 736/939] target/i386: Don't construct a all-zero entry for CPUID[0xD 0x3f] commit 00c8a933d95add3ce4afebbe491ca0fa398a9007 upstream. Currently, QEMU always constructs a all-zero CPUID entry for CPUID[0xD 0x3f]. It's meaningless to construct such a leaf as the end of leaf 0xD. Rework the logic of how subleaves of 0xD are constructed to get rid of such all-zero value of subleaf 0x3f. 
Intel-SIG: commit 00c8a933d95a target/i386: Don't construct a all-zero entry for CPUID[0xD 0x3f] Signed-off-by: Xiaoyao Li Link: https://lore.kernel.org/r/20240814075431.339209-2-xiaoyao.li@intel.com Signed-off-by: Paolo Bonzini Signed-off-by: Jason Zeng --- target/i386/kvm/kvm.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index 850104f6b5..5057dfbd75 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -1924,10 +1924,6 @@ int kvm_arch_init_vcpu(CPUState *cs) case 0xb: case 0xd: for (j = 0; ; j++) { - if (i == 0xd && j == 64) { - break; - } - c->function = i; c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX; c->index = j; @@ -1943,7 +1939,12 @@ int kvm_arch_init_vcpu(CPUState *cs) break; } if (i == 0xd && c->eax == 0) { - continue; + if (j < 63) { + continue; + } else { + cpuid_i--; + break; + } } if (cpuid_i == KVM_MAX_CPUID_ENTRIES) { fprintf(stderr, "cpuid_data is full, no space for " -- Gitee From 8c61e09f435ff3a965867b0496f01682d679182f Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Wed, 14 Aug 2024 03:54:24 -0400 Subject: [PATCH 737/939] target/i386: Enable fdp-excptn-only and zero-fcs-fds commit 7dddc3bb875e7141ab25931d0f30a1c319bc8457 upstream. - CPUID.(EAX=07H,ECX=0H):EBX[bit 6]: x87 FPU Data Pointer updated only on x87 exceptions if 1. - CPUID.(EAX=07H,ECX=0H):EBX[bit 13]: Deprecates FPU CS and FPU DS values if 1. i.e., X87 FCS and FDS are always zero. Define names for them so that they can be exposed to guest with -cpu host. Also define the bit field MACROs so that named cpu models can add it as well in the future. Intel-SIG: commit 7dddc3bb875e target/i386: Enable fdp-excptn-only and zero-fcs-fds Signed-off-by: Xiaoyao Li Link: https://lore.kernel.org/r/20240814075431.339209-3-xiaoyao.li@intel.com Signed-off-by: Paolo Bonzini Signed-off-by: Jason Zeng --- target/i386/cpu.c | 4 ++-- target/i386/cpu.h | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index dfc0f7fd2d..d0aa2fb5ff 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -906,9 +906,9 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { .type = CPUID_FEATURE_WORD, .feat_names = { "fsgsbase", "tsc-adjust", "sgx", "bmi1", - "hle", "avx2", NULL, "smep", + "hle", "avx2", "fdp-excptn-only", "smep", "bmi2", "erms", "invpcid", "rtm", - NULL, NULL, "mpx", NULL, + NULL, "zero-fcs-fds", "mpx", NULL, "avx512f", "avx512dq", "rdseed", "adx", "smap", "avx512ifma", "pcommit", "clflushopt", "clwb", "intel-pt", "avx512pf", "avx512er", diff --git a/target/i386/cpu.h b/target/i386/cpu.h index b90182582f..b883e5e1d6 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -809,6 +809,8 @@ uint64_t x86_cpu_get_supported_feature_word(X86CPU *cpu, FeatureWord w); #define CPUID_7_0_EBX_HLE (1U << 4) /* Intel Advanced Vector Extensions 2 */ #define CPUID_7_0_EBX_AVX2 (1U << 5) +/* FPU data pointer updated only on x87 exceptions */ +#define CPUID_7_0_EBX_FDP_EXCPTN_ONLY (1u << 6) /* Supervisor-mode Execution Prevention */ #define CPUID_7_0_EBX_SMEP (1U << 7) /* 2nd Group of Advanced Bit Manipulation Extensions */ @@ -819,6 +821,8 @@ uint64_t x86_cpu_get_supported_feature_word(X86CPU *cpu, FeatureWord w); #define CPUID_7_0_EBX_INVPCID (1U << 10) /* Restricted Transactional Memory */ #define CPUID_7_0_EBX_RTM (1U << 11) +/* Zero out FPU CS and FPU DS */ +#define CPUID_7_0_EBX_ZERO_FCS_FDS (1U << 13) /* Memory Protection Extension */ #define CPUID_7_0_EBX_MPX (1U << 14) /* AVX-512 Foundation */ 
-- Gitee From afcdb893e4c702f4e009a98da71408cf54a53cc4 Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Wed, 14 Aug 2024 03:54:27 -0400 Subject: [PATCH 738/939] target/i386: Construct CPUID 2 as stateful iff times > 1 commit 5ab639141b6d916a6f4041d4ec46f2f1a1e4a365 upstream. When times == 1, the CPUID leaf 2 is not stateful. Intel-SIG: commit 5ab639141b6d target/i386: Construct CPUID 2 as stateful iff times > 1 Signed-off-by: Xiaoyao Li Link: https://lore.kernel.org/r/20240814075431.339209-6-xiaoyao.li@intel.com Signed-off-by: Paolo Bonzini Signed-off-by: Jason Zeng --- target/i386/kvm/kvm.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index 5057dfbd75..a867512822 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -1896,10 +1896,12 @@ int kvm_arch_init_vcpu(CPUState *cs) int times; c->function = i; - c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC | - KVM_CPUID_FLAG_STATE_READ_NEXT; cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx); times = c->eax & 0xff; + if (times > 1) { + c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC | + KVM_CPUID_FLAG_STATE_READ_NEXT; + } for (j = 1; j < times; ++j) { if (cpuid_i == KVM_MAX_CPUID_ENTRIES) { -- Gitee From 07a671dc3e3baedb650b307c36d69bef869c2480 Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Wed, 14 Aug 2024 03:54:31 -0400 Subject: [PATCH 739/939] target/i386: Make invtsc migratable when user sets tsc-khz explicitly commit 87c88db3143e91076d167a62dd7febf49afca8a2 upstream. When user sets tsc-frequency explicitly, the invtsc feature is actually migratable because the tsc-frequency is supposed to be fixed during the migration. See commit d99569d9d856 ("kvm: Allow invtsc migration if tsc-khz is set explicitly") for referrence. Intel-SIG: commit 87c88db3143e target/i386: Make invtsc migratable when user sets tsc-khz explicitly Signed-off-by: Xiaoyao Li Link: https://lore.kernel.org/r/20240814075431.339209-10-xiaoyao.li@intel.com Signed-off-by: Paolo Bonzini Signed-off-by: Jason Zeng --- target/i386/cpu.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index d0aa2fb5ff..20358ffa91 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -1685,9 +1685,10 @@ static inline uint64_t x86_cpu_xsave_xss_components(X86CPU *cpu) * Returns the set of feature flags that are supported and migratable by * QEMU, for a given FeatureWord. */ -static uint64_t x86_cpu_get_migratable_flags(FeatureWord w) +static uint64_t x86_cpu_get_migratable_flags(X86CPU *cpu, FeatureWord w) { FeatureWordInfo *wi = &feature_word_info[w]; + CPUX86State *env = &cpu->env; uint64_t r = 0; int i; @@ -1701,6 +1702,12 @@ static uint64_t x86_cpu_get_migratable_flags(FeatureWord w) r |= f; } } + + /* when tsc-khz is set explicitly, invtsc is migratable */ + if ((w == FEAT_8000_0007_EDX) && env->user_tsc_khz) { + r |= CPUID_APM_INVTSC; + } + return r; } @@ -6002,7 +6009,7 @@ uint64_t x86_cpu_get_supported_feature_word(X86CPU *cpu, FeatureWord w) } #endif if (cpu && cpu->migratable) { - r &= x86_cpu_get_migratable_flags(w); + r &= x86_cpu_get_migratable_flags(cpu, w); } return r; } -- Gitee From c8eba92f97b68fad3f84dde2fb6fd4409738e626 Mon Sep 17 00:00:00 2001 From: lihuhua Date: Sat, 22 Mar 2025 12:01:26 +0800 Subject: [PATCH 740/939] virtcca: add kvm isolation when get tmi version. 
--- hw/arm/boot.c | 7 ++++++- hw/arm/virt.c | 6 +++++- linux-headers/asm-arm64/kvm.h | 2 -- linux-headers/linux/kvm.h | 1 + 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/hw/arm/boot.c b/hw/arm/boot.c index a3e0dbb68c..9a33601d35 100644 --- a/hw/arm/boot.c +++ b/hw/arm/boot.c @@ -1163,7 +1163,12 @@ static void arm_setup_confidential_firmware_boot(ARMCPU *cpu, const char *firmware_filename) { uint64_t tmi_version = 0; - if (kvm_ioctl(kvm_state, KVM_GET_TMI_VERSION, &tmi_version) < 0) { + int ret = -1; + + if (kvm_enabled()) { + ret = kvm_ioctl(kvm_state, KVM_GET_TMI_VERSION, &tmi_version); + } + if (ret < 0) { error_report("please check the kernel version!"); exit(EXIT_FAILURE); } diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 3c31d3667e..fed2f8c4d7 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -2057,7 +2057,11 @@ static void virt_set_memmap(VirtMachineState *vms, int pa_bits) vms->memmap[VIRT_PCIE_MMIO] = (MemMapEntry) { 0x10000000, 0x2edf0000 }; vms->memmap[VIRT_KAE_DEVICE] = (MemMapEntry) { 0x3edf0000, 0x00200000 }; uint64_t tmi_version = 0; - if (kvm_ioctl(kvm_state, KVM_GET_TMI_VERSION, &tmi_version) < 0) { + int ret = -1; + if (kvm_enabled()) { + ret = kvm_ioctl(kvm_state, KVM_GET_TMI_VERSION, &tmi_version); + } + if (ret < 0) { warn_report("can not get tmi version"); } if (tmi_version < MIN_TMI_VERSION_FOR_UEFI_BOOTED_CVM) { diff --git a/linux-headers/asm-arm64/kvm.h b/linux-headers/asm-arm64/kvm.h index d69a71cbec..552fdcb18f 100644 --- a/linux-headers/asm-arm64/kvm.h +++ b/linux-headers/asm-arm64/kvm.h @@ -597,6 +597,4 @@ struct kvm_cap_arm_tmm_populate_region_args { #endif -#define MIN_TMI_VERSION_FOR_UEFI_BOOTED_CVM 0x20001 - #endif /* __ARM_KVM_H__ */ diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h index c9ec7f862a..b94c5fd90f 100644 --- a/linux-headers/linux/kvm.h +++ b/linux-headers/linux/kvm.h @@ -2430,5 +2430,6 @@ struct kvm_s390_zpci_op { /* get tmi version */ #define KVM_GET_TMI_VERSION _IOR(KVMIO, 0xd2, uint64_t) +#define MIN_TMI_VERSION_FOR_UEFI_BOOTED_CVM 0x20001 #endif /* __LINUX_KVM_H */ -- Gitee From 29080940b37ce7486a46ab5534383321319fe2c5 Mon Sep 17 00:00:00 2001 From: gubin Date: Sat, 22 Mar 2025 15:10:32 +0800 Subject: [PATCH 741/939] backends/cryptodev: Do not abort for invalid session ID MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cherry-pick from eaf2bd29538d039df80bb4b1584de33a61312bc6 Instead of aborting when a session ID is invalid, return VIRTIO_CRYPTO_INVSESS ("Invalid session id"). Reproduced using: $ cat << EOF | qemu-system-i386 -display none \ -machine q35,accel=qtest -m 512M -nodefaults \ -object cryptodev-backend-builtin,id=cryptodev0 \ -device virtio-crypto-pci,id=crypto0,cryptodev=cryptodev0 \ -qtest stdio outl 0xcf8 0x80000804 outw 0xcfc 0x06 outl 0xcf8 0x80000820 outl 0xcfc 0xe0008000 write 0x10800e 0x1 0x01 write 0xe0008016 0x1 0x01 write 0xe0008020 0x4 0x00801000 write 0xe0008028 0x4 0x00c01000 write 0xe000801c 0x1 0x01 write 0x110000 0x1 0x05 write 0x110001 0x1 0x04 write 0x108002 0x1 0x11 write 0x108008 0x1 0x48 write 0x10800c 0x1 0x01 write 0x108018 0x1 0x10 write 0x10801c 0x1 0x02 write 0x10c002 0x1 0x01 write 0xe000b005 0x1 0x00 EOF Assertion failed: (session_id < MAX_NUM_SESSIONS && builtin->sessions[session_id]), function cryptodev_builtin_close_session, file cryptodev-builtin.c, line 430. 
Cc: qemu-stable@nongnu.org Reported-by: Zheyu Ma Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2274 Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: zhenwei pi Message-Id: <20240409094757.9127-1-philmd@linaro.org> Signed-off-by: gubin --- backends/cryptodev-builtin.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/backends/cryptodev-builtin.c b/backends/cryptodev-builtin.c index 0822f198d9..940104ee55 100644 --- a/backends/cryptodev-builtin.c +++ b/backends/cryptodev-builtin.c @@ -428,7 +428,9 @@ static int cryptodev_builtin_close_session( CRYPTODEV_BACKEND_BUILTIN(backend); CryptoDevBackendBuiltinSession *session; - assert(session_id < MAX_NUM_SESSIONS && builtin->sessions[session_id]); + if (session_id >= MAX_NUM_SESSIONS || !builtin->sessions[session_id]) { + return -VIRTIO_CRYPTO_INVSESS; + } session = builtin->sessions[session_id]; if (session->cipher) { -- Gitee From 690812903469db798ebae012248b9231d5ce9f11 Mon Sep 17 00:00:00 2001 From: gubin Date: Sat, 22 Mar 2025 15:15:08 +0800 Subject: [PATCH 742/939] backends/cryptodev: Do not ignore throttle/backends Errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cherry-pick from 484aecf2d3a75251b63481be2a0c3aef635002af Both cryptodev_backend_set_throttle() and CryptoDevBackendClass::init() can set their Error** argument. Do not ignore them, return early on failure. Without that, running into another failure trips error_setv()'s assertion. Use the ERRP_GUARD() macro as suggested in commit ae7c80a7bd ("error: New macro ERRP_GUARD()"). Cc: qemu-stable@nongnu.org Fixes: e7a775fd9f ("cryptodev: Account statistics") Fixes: 2580b452ff ("cryptodev: support QoS") Reviewed-by: zhenwei pi Reviewed-by: Gonglei Reviewed-by: Markus Armbruster Signed-off-by: Philippe Mathieu-Daudé Message-Id: <20231120150418.93443-1-philmd@linaro.org> Signed-off-by: gubin --- backends/cryptodev.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/backends/cryptodev.c b/backends/cryptodev.c index e5006bd215..fff89fd62a 100644 --- a/backends/cryptodev.c +++ b/backends/cryptodev.c @@ -398,6 +398,7 @@ static void cryptodev_backend_set_ops(Object *obj, Visitor *v, static void cryptodev_backend_complete(UserCreatable *uc, Error **errp) { + ERRP_GUARD(); CryptoDevBackend *backend = CRYPTODEV_BACKEND(uc); CryptoDevBackendClass *bc = CRYPTODEV_BACKEND_GET_CLASS(uc); uint32_t services; @@ -406,11 +407,20 @@ cryptodev_backend_complete(UserCreatable *uc, Error **errp) QTAILQ_INIT(&backend->opinfos); value = backend->tc.buckets[THROTTLE_OPS_TOTAL].avg; cryptodev_backend_set_throttle(backend, THROTTLE_OPS_TOTAL, value, errp); + if (*errp) { + return; + } value = backend->tc.buckets[THROTTLE_BPS_TOTAL].avg; cryptodev_backend_set_throttle(backend, THROTTLE_BPS_TOTAL, value, errp); + if (*errp) { + return; + } if (bc->init) { bc->init(backend, errp); + if (*errp) { + return; + } } services = backend->conf.crypto_services; -- Gitee From 43fdaaa492ea10ab0e90ec4cc68ec45aed1d415c Mon Sep 17 00:00:00 2001 From: gubin Date: Sat, 22 Mar 2025 15:20:27 +0800 Subject: [PATCH 743/939] hw/nvme: fix invalid check on mcl cherry-pick from 8c78015a55d84c016da6d5e41b6b5f618ecb25ab The number of logical blocks within a source range is converted into a 1s based number at the time of parsing. However, when verifying the copy length we add one again, causing the check against MCL to fail in error. 
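As a worked example (the numbers are illustrative, not taken from a report): a source range whose NLB field is 15 describes 16 logical blocks, since NVMe encodes the count as a 0's-based value and the parse helper already converts it:

    nlb = 15 + 1;          /* nvme_copy_source_range_parse() returns 16 */
    copy_len += nlb + 1;   /* old check: 17, one more than is copied    */
    copy_len += nlb;       /* fixed:     16, what MCL should limit      */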
Cc: qemu-stable@nongnu.org Fixes: 381ab99d8587 ("hw/nvme: check maximum copy length (MCL) for COPY") Reviewed-by: Minwoo Im Signed-off-by: Klaus Jensen Signed-off-by: gubin --- hw/nvme/ctrl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index 29445938d5..407004b2f7 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -2863,7 +2863,7 @@ static inline uint16_t nvme_check_copy_mcl(NvmeNamespace *ns, uint32_t nlb; nvme_copy_source_range_parse(iocb->ranges, idx, iocb->format, NULL, &nlb, NULL, NULL, NULL); - copy_len += nlb + 1; + copy_len += nlb; } if (copy_len > ns->id_ns.mcl) { -- Gitee From 6de964bac51139ef24f43bde56933cd8eafaf317 Mon Sep 17 00:00:00 2001 From: gubin Date: Sat, 22 Mar 2025 15:25:39 +0800 Subject: [PATCH 744/939] hw/nvme: fix invalid endian conversion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cherry-pick from d2b5bb860e6c17442ad95cc275feb07c1665be5c numcntl is one byte and so is max_vfs. Using cpu_to_le16 on big endian hosts results in numcntl being set to 0. Fix by dropping the endian conversion. Fixes: 99f48ae7ae ("hw/nvme: Add support for Secondary Controller List") Reported-by: Kevin Wolf Signed-off-by: Klaus Jensen Reviewed-by: Minwoo Im Message-ID: <20240222-fix-sriov-numcntl-v1-1-d60bea5e72d0@samsung.com> Signed-off-by: Philippe Mathieu-Daudé Signed-off-by: gubin --- hw/nvme/ctrl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index 29445938d5..9410344844 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -7928,7 +7928,7 @@ static void nvme_init_state(NvmeCtrl *n) n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1); QTAILQ_INIT(&n->aer_queue); - list->numcntl = cpu_to_le16(max_vfs); + list->numcntl = max_vfs; for (i = 0; i < max_vfs; i++) { sctrl = &list->sec[i]; sctrl->pcid = cpu_to_le16(n->cntlid); -- Gitee From 17835e803d0cfa308cd00f070c7e21b27f3d036e Mon Sep 17 00:00:00 2001 From: gubin Date: Sat, 22 Mar 2025 15:38:09 +0800 Subject: [PATCH 745/939] net: fix build when libbpf is disabled, but libxdp is enabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cherry-pick from 1f37280b37dbf85f36748f359a9f8802c8fe7ccd The net/af-xdp.c code is enabled when the libxdp library is present, however, it also has direct API calls to bpf_xdp_query_id & bpf_xdp_detach which are provided by the libbpf library. As a result if building with --disable-libbpf, but libxdp gets auto-detected, we'll fail to link QEMU /usr/bin/ld: libcommon.a.p/net_af-xdp.c.o: undefined reference to symbol 'bpf_xdp_query_id@@LIBBPF_0.7.0' There are two bugs here * Since we have direct libbpf API calls, when building net/af-xdp.c, we must tell meson that libbpf is a dependancy, so that we directly link to it, rather than relying on indirect linkage. * When must skip probing for libxdp at all, when libbpf is not found, raising an error if --enable-libxdp was given explicitly. Fixes: cb039ef3d9e3112da01e1ecd9b136ac9809ef733 Signed-off-by: Daniel P. 
Berrangé Signed-off-by: Jason Wang Signed-off-by: gubin --- meson.build | 10 ++++++++-- net/meson.build | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/meson.build b/meson.build index 4078f2aced..aea6a33ca3 100644 --- a/meson.build +++ b/meson.build @@ -1972,8 +1972,14 @@ endif # libxdp libxdp = not_found if not get_option('af_xdp').auto() or have_system - libxdp = dependency('libxdp', required: get_option('af_xdp'), - version: '>=1.4.0', method: 'pkg-config') + if libbpf.found() + libxdp = dependency('libxdp', required: get_option('af_xdp'), + version: '>=1.4.0', method: 'pkg-config') + else + if get_option('af_xdp').enabled() + error('libxdp requested, but libbpf is not available') + endif + endif endif # libdw diff --git a/net/meson.build b/net/meson.build index ce99bd4447..7264479242 100644 --- a/net/meson.build +++ b/net/meson.build @@ -37,7 +37,7 @@ if have_netmap system_ss.add(files('netmap.c')) endif -system_ss.add(when: libxdp, if_true: files('af-xdp.c')) +system_ss.add(when: [libxdp, libbpf], if_true: files('af-xdp.c')) if have_vhost_net_user system_ss.add(when: 'CONFIG_VIRTIO_NET', if_true: files('vhost-user.c'), if_false: files('vhost-user-stub.c')) -- Gitee From 13b84313c9f7ca4823abdbad92baf091c337861e Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 21 Apr 2023 15:13:53 -0700 Subject: [PATCH 746/939] hw/arm/smmuv3: Add smmu_dev_install_nested_ste() for CFGI_STE Call smmu_dev_install_nested_ste and eventually down to IOMMU_HWPT_ALLOC ioctl for a nested HWPT allocation. Signed-off-by: Nicolin Chen --- hw/arm/smmu-common.c | 9 ++++ hw/arm/smmuv3-internal.h | 1 + hw/arm/smmuv3.c | 97 +++++++++++++++++++++++++++++++++++- hw/arm/trace-events | 1 + include/hw/arm/smmu-common.h | 14 ++++++ 5 files changed, 120 insertions(+), 2 deletions(-) diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c index cc41bf3de8..9e9af8f5c7 100644 --- a/hw/arm/smmu-common.c +++ b/hw/arm/smmu-common.c @@ -780,6 +780,7 @@ static bool smmu_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn, static void smmu_dev_unset_iommu_device(PCIBus *bus, void *opaque, int devfn) { + SMMUVdev *vdev; SMMUDevice *sdev; SMMUViommu *viommu; SMMUState *s = opaque; @@ -803,13 +804,21 @@ static void smmu_dev_unset_iommu_device(PCIBus *bus, void *opaque, int devfn) error_report("Unable to attach dev to the default HW pagetable"); } + vdev = sdev->vdev; viommu = sdev->viommu; sdev->idev = NULL; sdev->viommu = NULL; + sdev->vdev = NULL; QLIST_REMOVE(sdev, next); trace_smmu_unset_iommu_device(devfn, smmu_get_sid(sdev)); + if (vdev) { + iommufd_backend_free_id(viommu->iommufd, vdev->core->vdev_id); + g_free(vdev->core); + g_free(vdev); + } + if (QLIST_EMPTY(&viommu->device_list)) { iommufd_backend_free_id(viommu->iommufd, viommu->bypass_hwpt_id); iommufd_backend_free_id(viommu->iommufd, viommu->abort_hwpt_id); diff --git a/hw/arm/smmuv3-internal.h b/hw/arm/smmuv3-internal.h index 6076025ad6..163459d450 100644 --- a/hw/arm/smmuv3-internal.h +++ b/hw/arm/smmuv3-internal.h @@ -552,6 +552,7 @@ typedef struct CD { #define STE_S1FMT(x) extract32((x)->word[0], 4 , 2) #define STE_S1CDMAX(x) extract32((x)->word[1], 27, 5) +#define STE_S1DSS(x) extract32((x)->word[2], 0, 2) #define STE_S1STALLD(x) extract32((x)->word[2], 27, 1) #define STE_EATS(x) extract32((x)->word[2], 28, 2) #define STE_STRW(x) extract32((x)->word[2], 30, 2) diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c index 253d297eec..540831ab8e 100644 --- a/hw/arm/smmuv3.c +++ b/hw/arm/smmuv3.c @@ -563,6 +563,27 @@ bad_ste: return -EINVAL; } 
+static void decode_ste_config(SMMUTransCfg *cfg, uint32_t config) +{ + + if (STE_CFG_ABORT(config)) { + cfg->aborted = true; + return; + } + if (STE_CFG_BYPASS(config)) { + cfg->bypassed = true; + return; + } + + if (STE_CFG_S1_ENABLED(config)) { + cfg->stage = SMMU_STAGE_1; + } + + if (STE_CFG_S2_ENABLED(config)) { + cfg->stage |= SMMU_STAGE_2; + } +} + /* Returns < 0 in case of invalid STE, 0 otherwise */ static int decode_ste(SMMUv3State *s, SMMUTransCfg *cfg, STE *ste, SMMUEventInfo *event) @@ -579,12 +600,19 @@ static int decode_ste(SMMUv3State *s, SMMUTransCfg *cfg, config = STE_CONFIG(ste); - if (STE_CFG_ABORT(config)) { + decode_ste_config(cfg, config); + + /* S1DSS.Terminate is same as Config.abort for default stream */ + if (STE_CFG_S1_ENABLED(config) && STE_S1DSS(ste) == 0) { cfg->aborted = true; + } + + if (cfg->aborted || cfg->bypassed) { return 0; } - if (STE_CFG_BYPASS(config)) { + /* S1DSS.Bypass is same as Config.bypass for default stream */ + if (STE_CFG_S1_ENABLED(config) && STE_S1DSS(ste) == 0x1) { cfg->bypassed = true; return 0; } @@ -1231,6 +1259,68 @@ static void smmuv3_range_inval(SMMUState *s, Cmd *cmd) } } +static void smmuv3_install_nested_ste(SMMUDevice *sdev, int sid) +{ +#ifdef __linux__ + SMMUEventInfo event = {.type = SMMU_EVT_NONE, .sid = sid, + .inval_ste_allowed = true}; + struct iommu_hwpt_arm_smmuv3 nested_data = {}; + SMMUv3State *s = sdev->smmu; + SMMUState *bs = &s->smmu_state; + uint32_t config; + STE ste; + int ret; + + if (!sdev->viommu || !bs->nested) { + return; + } + + if (!sdev->vdev && sdev->idev && sdev->viommu) { + SMMUVdev *vdev = g_new0(SMMUVdev, 1); + vdev->core = iommufd_backend_alloc_vdev(sdev->idev, sdev->viommu->core, + sid); + if (!vdev->core) { + error_report("failed to allocate a vDEVICE"); + g_free(vdev); + return; + } + sdev->vdev = vdev; + } + + ret = smmu_find_ste(sdev->smmu, sid, &ste, &event); + if (ret) { + /* + * For a 2-level Stream Table, the level-2 table might not be ready + * until the device gets inserted to the stream table. Ignore this. 
+ */ + return; + } + + config = STE_CONFIG(&ste); + if (!STE_VALID(&ste) || !STE_CFG_S1_ENABLED(config)) { + smmu_dev_uninstall_nested_ste(sdev, STE_CFG_ABORT(config)); + smmuv3_flush_config(sdev); + return; + } + + nested_data.ste[0] = (uint64_t)ste.word[0] | (uint64_t)ste.word[1] << 32; + nested_data.ste[1] = (uint64_t)ste.word[2] | (uint64_t)ste.word[3] << 32; + /* V | CONFIG | S1FMT | S1CTXPTR | S1CDMAX */ + nested_data.ste[0] &= 0xf80fffffffffffffULL; + /* S1DSS | S1CIR | S1COR | S1CSH | S1STALLD | EATS */ + nested_data.ste[1] &= 0x380000ffULL; + + ret = smmu_dev_install_nested_ste(sdev, IOMMU_HWPT_DATA_ARM_SMMUV3, + sizeof(nested_data), &nested_data); + if (ret) { + error_report("Unable to install nested STE=%16LX:%16LX, ret=%d", + nested_data.ste[1], nested_data.ste[0], ret); + } + + trace_smmuv3_install_nested_ste(sid, nested_data.ste[1], nested_data.ste[0]); +#endif +} + static gboolean smmuv3_invalidate_ste(gpointer key, gpointer value, gpointer user_data) { @@ -1241,6 +1331,8 @@ smmuv3_invalidate_ste(gpointer key, gpointer value, gpointer user_data) if (sid < sid_range->start || sid > sid_range->end) { return false; } + smmuv3_flush_config(sdev); + smmuv3_install_nested_ste(sdev, sid); trace_smmuv3_config_cache_inv(sid); return true; } @@ -1310,6 +1402,7 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) trace_smmuv3_cmdq_cfgi_ste(sid); sdev = container_of(mr, SMMUDevice, iommu); smmuv3_flush_config(sdev); + smmuv3_install_nested_ste(sdev, sid); break; } diff --git a/hw/arm/trace-events b/hw/arm/trace-events index 1e3d86382d..490da6349c 100644 --- a/hw/arm/trace-events +++ b/hw/arm/trace-events @@ -57,4 +57,5 @@ smmuv3_notify_flag_add(const char *iommu) "ADD SMMUNotifier node for iommu mr=%s smmuv3_notify_flag_del(const char *iommu) "DEL SMMUNotifier node for iommu mr=%s" smmuv3_get_device_info(uint32_t idr0, uint32_t idr1, uint32_t idr3, uint32_t idr5) "idr0=0x%x idr1=0x%x idr3=0x%x idr5=0x%x" smmuv3_inv_notifiers_iova(const char *name, uint16_t asid, uint16_t vmid, uint64_t iova, uint8_t tg, uint64_t num_pages) "iommu mr=%s asid=%d vmid=%d iova=0x%"PRIx64" tg=%d num_pages=0x%"PRIx64 +smmuv3_install_nested_ste(uint32_t sid, uint64_t ste_1, uint64_t ste_0) "sid=%d ste=%"PRIx64":%"PRIx64 diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h index d120c352cf..955ca716a5 100644 --- a/include/hw/arm/smmu-common.h +++ b/include/hw/arm/smmu-common.h @@ -51,6 +51,13 @@ typedef enum { SMMU_PTW_ERR_PERMISSION, /* Permission fault */ } SMMUPTWEventType; +/* SMMU Stage */ +typedef enum { + SMMU_STAGE_1 = 1, + SMMU_STAGE_2, + SMMU_NESTED, +} SMMUStage; + typedef struct SMMUPTWEventInfo { int stage; SMMUPTWEventType type; @@ -125,6 +132,12 @@ typedef struct SMMUViommu { QLIST_ENTRY(SMMUViommu) next; } SMMUViommu; +typedef struct SMMUVdev { + SMMUViommu *vsmmu; + IOMMUFDVdev *core; + uint32_t sid; +}SMMUVdev; + typedef struct SMMUS1Hwpt { void *smmu; IOMMUFDBackend *iommufd; @@ -141,6 +154,7 @@ typedef struct SMMUDevice { IOMMUMemoryRegion iommu; HostIOMMUDeviceIOMMUFD *idev; SMMUViommu *viommu; + SMMUVdev *vdev; SMMUS1Hwpt *s1_hwpt; AddressSpace as; AddressSpace as_sysmem; -- Gitee From 707bd8198642549595f11ef34c80094fbf7d2de1 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Mon, 29 Apr 2024 21:26:41 +0000 Subject: [PATCH 747/939] hw/arm/smmuv3: Add missing STE invalidation Multitple STEs can be invalidated in a range via SMMU_CMD_CFGI_STE_RANGE or SMMU_CMD_CFGI_ALL command. Add the missing STE invalidation in this pathway. 
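For context, SMMU_CMD_CFGI_ALL is encoded as a CFGI_STE_RANGE whose Range field is 31, so both commands reach the same mask computation already present in smmuv3_cmdq_consume() (visible in the context lines of the hunk below):

    mask = (1ULL << (range + 1)) - 1;     /* range == 31 covers every SID */
    sid_range.start = sid & ~mask;
    sid_range.end   = sid_range.start + mask;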
Signed-off-by: Nicolin Chen --- hw/arm/smmu-internal.h | 1 + hw/arm/smmuv3.c | 28 +++++++++++++++++++++++++--- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/hw/arm/smmu-internal.h b/hw/arm/smmu-internal.h index 843bebb185..5a81dd1b82 100644 --- a/hw/arm/smmu-internal.h +++ b/hw/arm/smmu-internal.h @@ -142,6 +142,7 @@ typedef struct SMMUIOTLBPageInvInfo { } SMMUIOTLBPageInvInfo; typedef struct SMMUSIDRange { + SMMUState *state; uint32_t start; uint32_t end; } SMMUSIDRange; diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c index 540831ab8e..9d44bb19bc 100644 --- a/hw/arm/smmuv3.c +++ b/hw/arm/smmuv3.c @@ -1322,11 +1322,9 @@ static void smmuv3_install_nested_ste(SMMUDevice *sdev, int sid) } static gboolean -smmuv3_invalidate_ste(gpointer key, gpointer value, gpointer user_data) +_smmuv3_invalidate_ste(SMMUDevice *sdev, SMMUSIDRange *sid_range) { - SMMUDevice *sdev = (SMMUDevice *)key; uint32_t sid = smmu_get_sid(sdev); - SMMUSIDRange *sid_range = (SMMUSIDRange *)user_data; if (sid < sid_range->start || sid > sid_range->end) { return false; @@ -1337,6 +1335,28 @@ smmuv3_invalidate_ste(gpointer key, gpointer value, gpointer user_data) return true; } +static gboolean +smmuv3_invalidate_ste(gpointer key, gpointer value, gpointer user_data) +{ + return _smmuv3_invalidate_ste((SMMUDevice *)key, (SMMUSIDRange *)user_data); +} + +static void smmuv3_invalidate_nested_ste(SMMUSIDRange *sid_range) +{ + SMMUState *bs = sid_range->state; + SMMUDevice *sdev; + + if (!bs->viommu) { + return; + } + + QLIST_FOREACH(sdev, &bs->viommu->device_list, next) { + if (smmu_get_sid(sdev)) { + _smmuv3_invalidate_ste(sdev, sid_range); + } + } +} + static int smmuv3_cmdq_consume(SMMUv3State *s) { SMMUState *bs = ARM_SMMU(s); @@ -1418,12 +1438,14 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) } mask = (1ULL << (range + 1)) - 1; + sid_range.state = bs; sid_range.start = sid & ~mask; sid_range.end = sid_range.start + mask; trace_smmuv3_cmdq_cfgi_ste_range(sid_range.start, sid_range.end); g_hash_table_foreach_remove(bs->configs, smmuv3_invalidate_ste, &sid_range); + smmuv3_invalidate_nested_ste(&sid_range); break; } case SMMU_CMD_CFGI_CD: -- Gitee From d8d7f775b602a84c37b8aced11e00cb5b0521c4e Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 18 Jun 2024 17:22:18 -0700 Subject: [PATCH 748/939] hw/arm/smmu-common: Replace smmu_iommu_mr with smmu_find_sdev The caller of smmu_iommu_mr wants to get sdev for smmuv3_flush_config(). Do it directly instead of bridging with an iommu mr pointer. 
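The caller-side change is intentionally mechanical; condensed from the hunks below, with no API added beyond smmu_find_sdev():

    /* before: bridge through the IOMMU memory region */
    IOMMUMemoryRegion *mr = smmu_iommu_mr(bs, sid);
    SMMUDevice *sdev = container_of(mr, SMMUDevice, iommu);

    /* after: look up the device directly */
    SMMUDevice *sdev = smmu_find_sdev(bs, sid);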
Signed-off-by: Nicolin Chen Message-id: 20240619002218.926674-1-nicolinc@nvidia.com Reviewed-by: Peter Maydell Signed-off-by: Peter Maydell --- hw/arm/smmu-common.c | 8 ++------ hw/arm/smmuv3.c | 12 ++++-------- include/hw/arm/smmu-common.h | 4 ++-- 3 files changed, 8 insertions(+), 16 deletions(-) diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c index 9e9af8f5c7..d0bc620606 100644 --- a/hw/arm/smmu-common.c +++ b/hw/arm/smmu-common.c @@ -837,20 +837,16 @@ static const PCIIOMMUOps smmu_ops = { .unset_iommu_device = smmu_dev_unset_iommu_device, }; -IOMMUMemoryRegion *smmu_iommu_mr(SMMUState *s, uint32_t sid) +SMMUDevice *smmu_find_sdev(SMMUState *s, uint32_t sid) { uint8_t bus_n, devfn; SMMUPciBus *smmu_bus; - SMMUDevice *smmu; bus_n = PCI_BUS_NUM(sid); smmu_bus = smmu_find_smmu_pcibus(s, bus_n); if (smmu_bus) { devfn = SMMU_PCI_DEVFN(sid); - smmu = smmu_bus->pbdev[devfn]; - if (smmu) { - return &smmu->iommu; - } + return smmu_bus->pbdev[devfn]; } return NULL; } diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c index 9d44bb19bc..b2ffe2d40b 100644 --- a/hw/arm/smmuv3.c +++ b/hw/arm/smmuv3.c @@ -1407,20 +1407,18 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) case SMMU_CMD_CFGI_STE: { uint32_t sid = CMD_SID(&cmd); - IOMMUMemoryRegion *mr = smmu_iommu_mr(bs, sid); - SMMUDevice *sdev; + SMMUDevice *sdev = smmu_find_sdev(bs, sid); if (CMD_SSEC(&cmd)) { cmd_error = SMMU_CERROR_ILL; break; } - if (!mr) { + if (!sdev) { break; } trace_smmuv3_cmdq_cfgi_ste(sid); - sdev = container_of(mr, SMMUDevice, iommu); smmuv3_flush_config(sdev); smmuv3_install_nested_ste(sdev, sid); @@ -1452,20 +1450,18 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) case SMMU_CMD_CFGI_CD_ALL: { uint32_t sid = CMD_SID(&cmd); - IOMMUMemoryRegion *mr = smmu_iommu_mr(bs, sid); - SMMUDevice *sdev; + SMMUDevice *sdev = smmu_find_sdev(bs, sid); if (CMD_SSEC(&cmd)) { cmd_error = SMMU_CERROR_ILL; break; } - if (!mr) { + if (!sdev) { break; } trace_smmuv3_cmdq_cfgi_cd(sid); - sdev = container_of(mr, SMMUDevice, iommu); smmuv3_flush_config(sdev); break; } diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h index 955ca716a5..e30539a8d4 100644 --- a/include/hw/arm/smmu-common.h +++ b/include/hw/arm/smmu-common.h @@ -234,8 +234,8 @@ int smmu_ptw(SMMUTransCfg *cfg, dma_addr_t iova, IOMMUAccessFlags perm, */ SMMUTransTableInfo *select_tt(SMMUTransCfg *cfg, dma_addr_t iova); -/* Return the iommu mr associated to @sid, or NULL if none */ -IOMMUMemoryRegion *smmu_iommu_mr(SMMUState *s, uint32_t sid); +/* Return the SMMUDevice associated to @sid, or NULL if none */ +SMMUDevice *smmu_find_sdev(SMMUState *s, uint32_t sid); #define SMMU_IOTLB_MAX_SIZE 256 -- Gitee From b331acc42fa54ca93496c32d92cdf5397927bff1 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 21 Apr 2023 15:18:56 -0700 Subject: [PATCH 749/939] hw/arm/smmuv3: Forward cache invalidate commands via iommufd Inroduce an SMMUCommandBatch and some helpers to batch the commands. Rewind the q->cons accordingly when it fails to execute a batch/command. Currently separate TLBI commands and device cache commands to avoid some errata on certain version of SMMUs. Later it should check IIDR register to detect if underlying SMMU hw has such an erratum. 
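The rewind contract described above is easiest to see at the final flush point, quoted from the hunk below: if the kernel executes only part of a batch, CONS is set to the consumer index recorded for the first command that did not run, and CMDQ_ERR is raised as real hardware would do:

    if (!cmd_error && batch.ncmds && bs->viommu) {
        if (smmuv3_issue_cmd_batch(bs, &batch)) {
            q->cons = batch.cons[batch.ncmds];  /* first unexecuted command */
            cmd_error = SMMU_CERROR_ILL;
        }
    }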
Signed-off-by: Nicolin Chen --- hw/arm/smmuv3-internal.h | 13 +++++ hw/arm/smmuv3.c | 113 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 125 insertions(+), 1 deletion(-) diff --git a/hw/arm/smmuv3-internal.h b/hw/arm/smmuv3-internal.h index 163459d450..a411fd4048 100644 --- a/hw/arm/smmuv3-internal.h +++ b/hw/arm/smmuv3-internal.h @@ -226,6 +226,19 @@ static inline bool smmuv3_gerror_irq_enabled(SMMUv3State *s) #define Q_CONS_WRAP(q) (((q)->cons & WRAP_MASK(q)) >> (q)->log2size) #define Q_PROD_WRAP(q) (((q)->prod & WRAP_MASK(q)) >> (q)->log2size) +#define Q_IDX(llq, p) ((p) & ((1 << (llq)->max_n_shift) - 1)) + +static inline int smmuv3_q_ncmds(SMMUQueue *q) +{ + uint32_t prod = Q_PROD(q); + uint32_t cons = Q_CONS(q); + + if (Q_PROD_WRAP(q) == Q_CONS_WRAP(q)) + return prod - cons; + else + return WRAP_MASK(q) - cons + prod; +} + static inline bool smmuv3_q_full(SMMUQueue *q) { return ((q->cons ^ q->prod) & WRAP_INDEX_MASK(q)) == WRAP_MASK(q); diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c index b2ffe2d40b..b860c8385f 100644 --- a/hw/arm/smmuv3.c +++ b/hw/arm/smmuv3.c @@ -1357,16 +1357,85 @@ static void smmuv3_invalidate_nested_ste(SMMUSIDRange *sid_range) } } +/** + * SMMUCommandBatch - batch of commands to issue for nested SMMU invalidation + * @cmds: Pointer to list of commands + * @cons: Pointer to list of CONS corresponding to the commands + * @ncmds: Total ncmds in the batch + * @dev_cache: Issue to a device cache + */ +typedef struct SMMUCommandBatch { + Cmd *cmds; + uint32_t *cons; + uint32_t ncmds; + bool dev_cache; +} SMMUCommandBatch; + +/* Update batch->ncmds to the number of execute cmds */ +static int smmuv3_issue_cmd_batch(SMMUState *bs, SMMUCommandBatch *batch) +{ + uint32_t total = batch->ncmds; + int ret; + + ret = smmu_viommu_invalidate_cache(bs->viommu->core, + IOMMU_VIOMMU_INVALIDATE_DATA_ARM_SMMUV3, + sizeof(Cmd), &batch->ncmds, batch->cmds); + if (total != batch->ncmds) { + error_report("%s failed: ret=%d, total=%d, done=%d", + __func__, ret, total, batch->ncmds); + return ret; + } + + batch->ncmds = 0; + batch->dev_cache = false; + return ret; +} + +static int smmuv3_batch_cmds(SMMUState *bs, SMMUCommandBatch *batch, + Cmd *cmd, uint32_t *cons, bool dev_cache) +{ + int ret; + + if (!bs->nested || !bs->viommu) { + return 0; + } + + /* + * Currently separate dev_cache and hwpt for safety, which might not be + * necessary if underlying HW SMMU does not have the errata. + * + * TODO check IIDR register values read from hw_info. + */ + if (batch->ncmds && (dev_cache != batch->dev_cache)) { + ret = smmuv3_issue_cmd_batch(bs, batch); + if (ret) { + *cons = batch->cons[batch->ncmds]; + return ret; + } + } + batch->dev_cache = dev_cache; + batch->cmds[batch->ncmds] = *cmd; + batch->cons[batch->ncmds++] = *cons; + return 0; +} + static int smmuv3_cmdq_consume(SMMUv3State *s) { SMMUState *bs = ARM_SMMU(s); SMMUCmdError cmd_error = SMMU_CERROR_NONE; SMMUQueue *q = &s->cmdq; SMMUCommandType type = 0; + SMMUCommandBatch batch = {}; + uint32_t ncmds = 0; if (!smmuv3_cmdq_enabled(s)) { return 0; } + + ncmds = smmuv3_q_ncmds(q); + batch.cmds = g_new0(Cmd, ncmds); + batch.cons = g_new0(uint32_t, ncmds); + /* * some commands depend on register values, typically CR0. 
In case those * register values change while handling the command, spec says it @@ -1463,6 +1532,13 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) trace_smmuv3_cmdq_cfgi_cd(sid); smmuv3_flush_config(sdev); + + if (sdev->s1_hwpt) { + if (smmuv3_batch_cmds(sdev->smmu, &batch, &cmd, &q->cons, true)) { + cmd_error = SMMU_CERROR_ILL; + break; + } + } break; } case SMMU_CMD_TLBI_NH_ASID: @@ -1477,6 +1553,10 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) trace_smmuv3_cmdq_tlbi_nh_asid(asid); smmu_inv_notifiers_all(&s->smmu_state); smmu_iotlb_inv_asid(bs, asid); + if (smmuv3_batch_cmds(bs, &batch, &cmd, &q->cons, false)) { + cmd_error = SMMU_CERROR_ILL; + break; + } break; } case SMMU_CMD_TLBI_NH_ALL: @@ -1489,6 +1569,11 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) trace_smmuv3_cmdq_tlbi_nh(); smmu_inv_notifiers_all(&s->smmu_state); smmu_iotlb_inv_all(bs); + + if (smmuv3_batch_cmds(bs, &batch, &cmd, &q->cons, false)) { + cmd_error = SMMU_CERROR_ILL; + break; + } break; case SMMU_CMD_TLBI_NH_VAA: case SMMU_CMD_TLBI_NH_VA: @@ -1497,7 +1582,24 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) break; } smmuv3_range_inval(bs, &cmd); + + if (smmuv3_batch_cmds(bs, &batch, &cmd, &q->cons, false)) { + cmd_error = SMMU_CERROR_ILL; + break; + } break; + case SMMU_CMD_ATC_INV: + { + SMMUDevice *sdev = smmu_find_sdev(bs, CMD_SID(&cmd)); + + if (sdev->s1_hwpt) { + if (smmuv3_batch_cmds(sdev->smmu, &batch, &cmd, &q->cons, true)) { + cmd_error = SMMU_CERROR_ILL; + break; + } + } + break; + } case SMMU_CMD_TLBI_S12_VMALL: { uint16_t vmid = CMD_VMID(&cmd); @@ -1529,7 +1631,6 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) case SMMU_CMD_TLBI_EL2_ASID: case SMMU_CMD_TLBI_EL2_VA: case SMMU_CMD_TLBI_EL2_VAA: - case SMMU_CMD_ATC_INV: case SMMU_CMD_PRI_RESP: case SMMU_CMD_RESUME: case SMMU_CMD_STALL_TERM: @@ -1554,12 +1655,22 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) */ queue_cons_incr(q); } + qemu_mutex_lock(&s->mutex); + if (!cmd_error && batch.ncmds && bs->viommu) { + if (smmuv3_issue_cmd_batch(bs, &batch)) { + q->cons = batch.cons[batch.ncmds]; + cmd_error = SMMU_CERROR_ILL; + } + } + qemu_mutex_unlock(&s->mutex); if (cmd_error) { trace_smmuv3_cmdq_consume_error(smmu_cmd_string(type), cmd_error); smmu_write_cmdq_err(s, cmd_error); smmuv3_trigger_irq(s, SMMU_IRQ_GERROR, R_GERROR_CMDQ_ERR_MASK); } + g_free(batch.cmds); + g_free(batch.cons); trace_smmuv3_cmdq_consume_out(Q_PROD(q), Q_CONS(q), Q_PROD_WRAP(q), Q_CONS_WRAP(q)); -- Gitee From ea23e4215b332446d4964769d004f7a11caba00b Mon Sep 17 00:00:00 2001 From: caijian Date: Mon, 31 Mar 2025 15:02:37 +0800 Subject: [PATCH 750/939] tests/qtest: Allow DSDT acpi tables to change List all DSDT files and allow them to change. 
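For context, tests/qtest/bios-tables-test.c compares the ACPI tables built at runtime against the binaries under tests/data/acpi/ and tolerates a mismatch only for files named in this header; the usual flow is to list the affected tables first, make the generator change, then refresh the expected binaries (the in-tree rebuild-expected-aml.sh helper) and empty the list again, which the later "tests/data/acpi: Update DSDT acpi tables" patch in this series does. The two states of the header, condensed:

    /* while the generator change is pending (this patch): */
    "tests/data/acpi/virt/DSDT",
    /* ...other affected tables... */

    /* once the expected binaries are refreshed (two patches later): */
    /* List of comma-separated changed AML files to ignore */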
Signed-of-by: caijian --- tests/qtest/bios-tables-test-allowed-diff.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/qtest/bios-tables-test-allowed-diff.h b/tests/qtest/bios-tables-test-allowed-diff.h index dfb8523c8b..e4a94bb8bd 100644 --- a/tests/qtest/bios-tables-test-allowed-diff.h +++ b/tests/qtest/bios-tables-test-allowed-diff.h @@ -1 +1,7 @@ /* List of comma-separated changed AML files to ignore */ +"tests/data/acpi/microvm/DSDT.pcie", +"tests/data/acpi/virt/DSDT", +"tests/data/acpi/virt/DSDT.acpihmatvirt", +"tests/data/acpi/virt/DSDT.memhp", +"tests/data/acpi/virt/DSDT.pxb", +"tests/data/acpi/virt/DSDT.topology", -- Gitee From 237fdc8ddb0598234aace9c88ac4c8387119a12a Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Thu, 7 Jul 2022 11:55:25 -0400 Subject: [PATCH 751/939] acpi/gpex: Fix PCI Express Slot Information function 0 returned value At the moment we do not support other function than function 0. So according to ACPI spec "_DSM (Device Specific Method)" description, bit 0 should rather be 0, meaning no other function is supported than function 0. Signed-off-by: Eric Auger --- hw/pci-host/gpex-acpi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/pci-host/gpex-acpi.c b/hw/pci-host/gpex-acpi.c index 1092dc3b70..ac5d229757 100644 --- a/hw/pci-host/gpex-acpi.c +++ b/hw/pci-host/gpex-acpi.c @@ -113,7 +113,7 @@ static void acpi_dsdt_add_pci_osc(Aml *dev) UUID = aml_touuid("E5C937D0-3553-4D7A-9117-EA4D19C3434D"); ifctx = aml_if(aml_equal(aml_arg(0), UUID)); ifctx1 = aml_if(aml_equal(aml_arg(2), aml_int(0))); - uint8_t byte_list[1] = {1}; + uint8_t byte_list[1] = {0}; buf = aml_buffer(1, byte_list); aml_append(ifctx1, aml_return(buf)); aml_append(ifctx, ifctx1); -- Gitee From 4a065d0fbbe159dfbc073e4480434d6889b7c5a4 Mon Sep 17 00:00:00 2001 From: caijian Date: Mon, 31 Mar 2025 15:03:02 +0800 Subject: [PATCH 752/939] tests/data/acpi: Update DSDT acpi tables - * Disassembly of tests/data/acpi/virt/DSDT, Fri Mar 28 16:43:04 2025 + * Disassembly of /tmp/aml-1KF432, Fri Mar 28 16:43:04 2025 * * Original Table Header: * Signature "DSDT" * Length 0x000016B6 (5814) * Revision 0x02 - * Checksum 0x46 + * Checksum 0x47 * OEM ID "BOCHS " * OEM Table ID "BXPC " * OEM Revision 0x00000001 (1) * Compiler ID "BXPC" * Compiler Version 0x00000001 (1) */ DefinitionBlock ("", "DSDT", 2, "BOCHS ", "BXPC ", 0x00000001) @@ -2090,33 +2090,33 @@ } Else { CDW1 |= 0x04 Return (Arg3) } } Method (_DSM, 4, NotSerialized) // _DSM: Device-Specific Method { If ((Arg0 == ToUUID ("e5c937d0-3553-4d7a-9117-ea4d19c3434d") /* Device Labeling Interface */)) { If ((Arg2 == Zero)) { Return (Buffer (One) { - 0x01 // . + 0x00 // . 
}) } } Return (Buffer (One) { 0x00 }) } Signed-off-by: caijian --- tests/data/acpi/microvm/DSDT.pcie | Bin 3023 -> 3023 bytes tests/data/acpi/virt/DSDT | Bin 5814 -> 5814 bytes tests/data/acpi/virt/DSDT.acpihmatvirt | Bin 7323 -> 7323 bytes tests/data/acpi/virt/DSDT.memhp | Bin 7175 -> 7175 bytes tests/data/acpi/virt/DSDT.pxb | Bin 8297 -> 8297 bytes tests/data/acpi/virt/DSDT.topology | Bin 9335 -> 9335 bytes tests/qtest/bios-tables-test-allowed-diff.h | 6 ------ 7 files changed, 6 deletions(-) diff --git a/tests/data/acpi/microvm/DSDT.pcie b/tests/data/acpi/microvm/DSDT.pcie index 765f14ef3d1e54d3cadccbf0a880f8adb73b3f1f..af4c3eb38866d5a32928a73b01ea49a946073aa6 100644 GIT binary patch delta 25 gcmX>veqNl*CDveqNl*CD@!X7zle4%}0cSu5P5=M^ diff --git a/tests/data/acpi/virt/DSDT b/tests/data/acpi/virt/DSDT index d49ead54fa2d8fcd6c0f25ba74e748d90fec3551..404bc5ac2188d885e28b6c80d499193865cf1c9c 100644 GIT binary patch delta 25 hcmdm{yG@tNCD!0RUsV2lD^` delta 25 hcmdm{yG@tNCD!0RUsP2lD^` diff --git a/tests/data/acpi/virt/DSDT.acpihmatvirt b/tests/data/acpi/virt/DSDT.acpihmatvirt index c753f34bb050d146f4dc3195ec850ea26b3141ba..5f9c0b2d3cdc55949c32d564c92309aa54529d8d 100644 GIT binary patch delta 25 hcmbPjIop!UCD^Da?eHUL|C2DbnJ diff --git a/tests/data/acpi/virt/DSDT.pxb b/tests/data/acpi/virt/DSDT.pxb index 1e91767c3045bb8569fd7d5dfa991348ed625944..ccb43ab242521cdfc80f6d6b170d2e0818186632 100644 GIT binary patch delta 32 ncmaFq@X~?HCD Date: Tue, 5 Oct 2021 10:53:12 +0200 Subject: [PATCH 753/939] hw/pci-host/gpex: [needs kernel fix] Allow to generate preserve boot config DSM #5 Add a 'preserve_config' field in struct GPEXConfig and if set, generate the DSM #5 for preserving PCI boot configurations. The DSM presence is needed to expose RMRs. At the moment the DSM generation is not yet enabled. Signed-off-by: Eric Auger --- hw/pci-host/gpex-acpi.c | 35 +++++++++++++++++++++++++++++++---- include/hw/pci-host/gpex.h | 1 + 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/hw/pci-host/gpex-acpi.c b/hw/pci-host/gpex-acpi.c index ac5d229757..ce424fc9da 100644 --- a/hw/pci-host/gpex-acpi.c +++ b/hw/pci-host/gpex-acpi.c @@ -49,9 +49,10 @@ static void acpi_dsdt_add_pci_route_table(Aml *dev, uint32_t irq) } } -static void acpi_dsdt_add_pci_osc(Aml *dev) +static void acpi_dsdt_add_pci_osc(Aml *dev, bool preserve_config) { Aml *method, *UUID, *ifctx, *ifctx1, *elsectx, *buf; + uint8_t byte_list[1] = {0}; /* Declare an _OSC (OS Control Handoff) method */ aml_append(dev, aml_name_decl("SUPP", aml_int(0))); @@ -113,10 +114,24 @@ static void acpi_dsdt_add_pci_osc(Aml *dev) UUID = aml_touuid("E5C937D0-3553-4D7A-9117-EA4D19C3434D"); ifctx = aml_if(aml_equal(aml_arg(0), UUID)); ifctx1 = aml_if(aml_equal(aml_arg(2), aml_int(0))); - uint8_t byte_list[1] = {0}; + if (preserve_config) { + /* support for functions other than function 0 and function 5 */ + byte_list[0] = 0x21; + } buf = aml_buffer(1, byte_list); aml_append(ifctx1, aml_return(buf)); aml_append(ifctx, ifctx1); + + if (preserve_config) { + Aml *ifctx2 = aml_if(aml_equal(aml_arg(2), aml_int(5))); + /* + * 0 - The operating system must not ignore the PCI configuration that + * firmware has done at boot time. 
+ */ + aml_append(ifctx2, aml_return(aml_int(0))); + aml_append(ifctx, ifctx2); + } + aml_append(method, ifctx); byte_list[0] = 0; @@ -174,6 +189,12 @@ void acpi_dsdt_add_gpex(Aml *scope, struct GPEXConfig *cfg) aml_append(dev, aml_name_decl("_PXM", aml_int(numa_node))); } + if (cfg->preserve_config) { + method = aml_method("_DSM", 5, AML_SERIALIZED); + aml_append(method, aml_return(aml_int(0))); + aml_append(dev, method); + } + acpi_dsdt_add_pci_route_table(dev, cfg->irq); /* @@ -188,7 +209,7 @@ void acpi_dsdt_add_gpex(Aml *scope, struct GPEXConfig *cfg) if (is_cxl) { build_cxl_osc_method(dev); } else { - acpi_dsdt_add_pci_osc(dev); + acpi_dsdt_add_pci_osc(dev, cfg->preserve_config); } aml_append(scope, dev); @@ -205,6 +226,12 @@ void acpi_dsdt_add_gpex(Aml *scope, struct GPEXConfig *cfg) aml_append(dev, aml_name_decl("_STR", aml_unicode("PCIe 0 Device"))); aml_append(dev, aml_name_decl("_CCA", aml_int(1))); + if (cfg->preserve_config) { + method = aml_method("_DSM", 5, AML_SERIALIZED); + aml_append(method, aml_return(aml_int(0))); + aml_append(dev, method); + } + acpi_dsdt_add_pci_route_table(dev, cfg->irq); method = aml_method("_CBA", 0, AML_NOTSERIALIZED); @@ -263,7 +290,7 @@ void acpi_dsdt_add_gpex(Aml *scope, struct GPEXConfig *cfg) } aml_append(dev, aml_name_decl("_CRS", rbuf)); - acpi_dsdt_add_pci_osc(dev); + acpi_dsdt_add_pci_osc(dev, cfg->preserve_config); Aml *dev_res0 = aml_device("%s", "RES0"); aml_append(dev_res0, aml_name_decl("_HID", aml_string("PNP0C02"))); diff --git a/include/hw/pci-host/gpex.h b/include/hw/pci-host/gpex.h index b0240bd768..65475f7f9d 100644 --- a/include/hw/pci-host/gpex.h +++ b/include/hw/pci-host/gpex.h @@ -64,6 +64,7 @@ struct GPEXConfig { MemMapEntry pio; int irq; PCIBus *bus; + bool preserve_config; }; int gpex_set_irq_num(GPEXHost *s, int index, int gsi); -- Gitee From a6c7b16107b506f85e6643604c923291e41f70d1 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 19 Jun 2024 04:42:33 +0000 Subject: [PATCH 754/939] hw/arm/virt: Add an SMMU_IO_LEN macro A following patch will add a new MMIO region for nested SMMU instances. This macro will be repeatedly used to set offsets and MMIO sizes in both virt and virt-acpi-build. Signed-off-by: Nicolin Chen Signed-off-by: Shameer Kolothum --- hw/arm/virt.c | 2 +- include/hw/arm/virt.h | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 8823f2ed1c..08c40c314b 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -155,7 +155,7 @@ static const MemMapEntry base_memmap[] = { [VIRT_FW_CFG] = { 0x09020000, 0x00000018 }, [VIRT_GPIO] = { 0x09030000, 0x00001000 }, [VIRT_SECURE_UART] = { 0x09040000, 0x00001000 }, - [VIRT_SMMU] = { 0x09050000, 0x00020000 }, + [VIRT_SMMU] = { 0x09050000, SMMU_IO_LEN }, [VIRT_PCDIMM_ACPI] = { 0x09070000, MEMORY_HOTPLUG_IO_LEN }, [VIRT_ACPI_GED] = { 0x09080000, ACPI_GED_EVT_SEL_LEN }, [VIRT_NVDIMM_ACPI] = { 0x09090000, NVDIMM_ACPI_IO_LEN}, diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h index 345b2d5594..e6a449becd 100644 --- a/include/hw/arm/virt.h +++ b/include/hw/arm/virt.h @@ -106,6 +106,9 @@ typedef enum { ARM_L3_CACHE } ArmCacheType; +/* MMIO region size for SMMUv3 */ +#define SMMU_IO_LEN 0x20000 + enum { VIRT_FLASH, VIRT_MEM, -- Gitee From 9895192512af4b52aff88432618a474e69b44bdd Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Wed, 6 Nov 2024 14:47:27 +0000 Subject: [PATCH 755/939] hw/arm/smmuv3: Add initial support for SMMUv3 Nested device Based on SMMUv3 as a parent device, add a user-creatable smmuv3-nested device. 
Subsequent patches will add support to specify a PCI bus for this device. Currently only supported for "virt", so hook up the sybus mem & irq for that as well. No FDT support is added for now. Signed-off-by: Shameer Kolothum --- hw/arm/smmuv3.c | 34 ++++++++++++++++++++++++++++++++++ hw/arm/virt.c | 31 +++++++++++++++++++++++++++++-- hw/core/sysbus-fdt.c | 1 + include/hw/arm/smmuv3.h | 15 +++++++++++++++ include/hw/arm/virt.h | 6 ++++++ 5 files changed, 85 insertions(+), 2 deletions(-) diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c index b860c8385f..3010471cdc 100644 --- a/hw/arm/smmuv3.c +++ b/hw/arm/smmuv3.c @@ -2069,6 +2069,19 @@ static void smmu_realize(DeviceState *d, Error **errp) smmu_init_irq(s, dev); } +static void smmu_nested_realize(DeviceState *d, Error **errp) +{ + SMMUv3NestedState *s_nested = ARM_SMMUV3_NESTED(d); + SMMUv3NestedClass *c = ARM_SMMUV3_NESTED_GET_CLASS(s_nested); + Error *local_err = NULL; + + c->parent_realize(d, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } +} + static const VMStateDescription vmstate_smmuv3_queue = { .name = "smmuv3_queue", .version_id = 1, @@ -2167,6 +2180,18 @@ static void smmuv3_class_init(ObjectClass *klass, void *data) device_class_set_props(dc, smmuv3_properties); } +static void smmuv3_nested_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + SMMUv3NestedClass *c = ARM_SMMUV3_NESTED_CLASS(klass); + + dc->vmsd = &vmstate_smmuv3; + device_class_set_parent_realize(dc, smmu_nested_realize, + &c->parent_realize); + dc->user_creatable = true; + dc->hotpluggable = false; +} + static int smmuv3_notify_flag_changed(IOMMUMemoryRegion *iommu, IOMMUNotifierFlag old, IOMMUNotifierFlag new, @@ -2205,6 +2230,14 @@ static void smmuv3_iommu_memory_region_class_init(ObjectClass *klass, imrc->notify_flag_changed = smmuv3_notify_flag_changed; } +static const TypeInfo smmuv3_nested_type_info = { + .name = TYPE_ARM_SMMUV3_NESTED, + .parent = TYPE_ARM_SMMUV3, + .instance_size = sizeof(SMMUv3NestedState), + .class_size = sizeof(SMMUv3NestedClass), + .class_init = smmuv3_nested_class_init, +}; + static const TypeInfo smmuv3_type_info = { .name = TYPE_ARM_SMMUV3, .parent = TYPE_ARM_SMMU, @@ -2223,6 +2256,7 @@ static const TypeInfo smmuv3_iommu_memory_region_info = { static void smmuv3_register_types(void) { type_register(&smmuv3_type_info); + type_register(&smmuv3_nested_type_info); type_register(&smmuv3_iommu_memory_region_info); } diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 08c40c314b..a55f297af2 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -166,6 +166,7 @@ static const MemMapEntry base_memmap[] = { /* In the virtCCA scenario, this space is used for MSI interrupt mapping */ [VIRT_CVM_MSI] = { 0x0a001000, 0x00fff000 }, [VIRT_CPUFREQ] = { 0x0b000000, 0x00010000 }, + [VIRT_SMMU_NESTED] = { 0x0b010000, 0x00ff0000}, /* ...repeating for a total of NUM_VIRTIO_TRANSPORTS, each of that size */ [VIRT_PLATFORM_BUS] = { 0x0c000000, 0x02000000 }, [VIRT_SECURE_MEM] = { 0x0e000000, 0x01000000 }, @@ -211,6 +212,7 @@ static const int a15irqmap[] = { [VIRT_GIC_V2M] = 48, /* ...to 48 + NUM_GICV2M_SPIS - 1 */ [VIRT_SMMU] = 74, /* ...to 74 + NUM_SMMU_IRQS - 1 */ [VIRT_PLATFORM_BUS] = 112, /* ...to 112 + PLATFORM_BUS_NUM_IRQS -1 */ + [VIRT_SMMU_NESTED] = 200, }; static const char *valid_cpus[] = { @@ -3613,10 +3615,34 @@ static void virt_machine_device_plug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { VirtMachineState *vms = VIRT_MACHINE(hotplug_dev); + MachineClass *mc = 
MACHINE_GET_CLASS(vms); - if (vms->platform_bus_dev) { - MachineClass *mc = MACHINE_GET_CLASS(vms); + /* For smmuv3-nested devices we need to set the mem & irq */ + if (device_is_dynamic_sysbus(mc, dev) && + object_dynamic_cast(OBJECT(dev), TYPE_ARM_SMMUV3_NESTED)) { + hwaddr base = vms->memmap[VIRT_SMMU_NESTED].base; + int irq = vms->irqmap[VIRT_SMMU_NESTED]; + + if (vms->smmu_nested_count >= MAX_SMMU_NESTED) { + error_setg(errp, "smmuv3-nested max count reached!"); + return; + } + + base += (vms->smmu_nested_count * SMMU_IO_LEN); + irq += (vms->smmu_nested_count * NUM_SMMU_IRQS); + sysbus_mmio_map(SYS_BUS_DEVICE(dev), 0, base); + for (int i = 0; i < 4; i++) { + sysbus_connect_irq(SYS_BUS_DEVICE(dev), i, + qdev_get_gpio_in(vms->gic, irq + i)); + } + if (vms->iommu != VIRT_IOMMU_SMMUV3_NESTED) { + vms->iommu = VIRT_IOMMU_SMMUV3_NESTED; + } + vms->smmu_nested_count++; + } + + if (vms->platform_bus_dev) { if (device_is_dynamic_sysbus(mc, dev)) { platform_bus_link_device(PLATFORM_BUS_DEVICE(vms->platform_bus_dev), SYS_BUS_DEVICE(dev)); @@ -3789,6 +3815,7 @@ static void virt_machine_class_init(ObjectClass *oc, void *data) machine_class_allow_dynamic_sysbus_dev(mc, TYPE_VFIO_AMD_XGBE); machine_class_allow_dynamic_sysbus_dev(mc, TYPE_RAMFB_DEVICE); machine_class_allow_dynamic_sysbus_dev(mc, TYPE_VFIO_PLATFORM); + machine_class_allow_dynamic_sysbus_dev(mc, TYPE_ARM_SMMUV3_NESTED); #ifdef CONFIG_TPM machine_class_allow_dynamic_sysbus_dev(mc, TYPE_TPM_TIS_SYSBUS); #endif diff --git a/hw/core/sysbus-fdt.c b/hw/core/sysbus-fdt.c index eebcd28f9a..0f0d0b3e58 100644 --- a/hw/core/sysbus-fdt.c +++ b/hw/core/sysbus-fdt.c @@ -489,6 +489,7 @@ static const BindingEntry bindings[] = { #ifdef CONFIG_LINUX TYPE_BINDING(TYPE_VFIO_CALXEDA_XGMAC, add_calxeda_midway_xgmac_fdt_node), TYPE_BINDING(TYPE_VFIO_AMD_XGBE, add_amd_xgbe_fdt_node), + TYPE_BINDING("arm-smmuv3-nested", no_fdt_node), VFIO_PLATFORM_BINDING("amd,xgbe-seattle-v1a", add_amd_xgbe_fdt_node), #endif #ifdef CONFIG_TPM diff --git a/include/hw/arm/smmuv3.h b/include/hw/arm/smmuv3.h index d183a62766..87e628be7a 100644 --- a/include/hw/arm/smmuv3.h +++ b/include/hw/arm/smmuv3.h @@ -84,6 +84,21 @@ struct SMMUv3Class { #define TYPE_ARM_SMMUV3 "arm-smmuv3" OBJECT_DECLARE_TYPE(SMMUv3State, SMMUv3Class, ARM_SMMUV3) +#define TYPE_ARM_SMMUV3_NESTED "arm-smmuv3-nested" +OBJECT_DECLARE_TYPE(SMMUv3NestedState, SMMUv3NestedClass, ARM_SMMUV3_NESTED) + +struct SMMUv3NestedState { + SMMUv3State smmuv3_state; +}; + +struct SMMUv3NestedClass { + /*< private >*/ + SMMUv3Class smmuv3_class; + /*< public >*/ + + DeviceRealize parent_realize; +}; + #define STAGE1_SUPPORTED(s) FIELD_EX32(s->idr[0], IDR0, S1P) #define STAGE2_SUPPORTED(s) FIELD_EX32(s->idr[0], IDR0, S2P) diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h index e6a449becd..cd41e28202 100644 --- a/include/hw/arm/virt.h +++ b/include/hw/arm/virt.h @@ -109,6 +109,9 @@ typedef enum { /* MMIO region size for SMMUv3 */ #define SMMU_IO_LEN 0x20000 +/* Max supported nested SMMUv3 */ +#define MAX_SMMU_NESTED 64 + enum { VIRT_FLASH, VIRT_MEM, @@ -121,6 +124,7 @@ enum { VIRT_GIC_ITS, VIRT_GIC_REDIST, VIRT_SMMU, + VIRT_SMMU_NESTED, VIRT_UART, VIRT_CPUFREQ, VIRT_MMIO, @@ -155,6 +159,7 @@ enum { typedef enum VirtIOMMUType { VIRT_IOMMU_NONE, VIRT_IOMMU_SMMUV3, + VIRT_IOMMU_SMMUV3_NESTED, VIRT_IOMMU_VIRTIO, } VirtIOMMUType; @@ -222,6 +227,7 @@ struct VirtMachineState { bool mte; bool dtb_randomness; bool pmu; + int smmu_nested_count; OnOffAuto acpi; VirtGICType gic_version; VirtIOMMUType iommu; -- Gitee From 
afca50145f52601d912a805b65bd4530e9278388 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Wed, 6 Nov 2024 15:53:45 +0000 Subject: [PATCH 756/939] hw/arm/smmuv3: Associate a pci bus with a SMMUv3 Nested device Subsequent patches will add IORT modifications to get this working. Signed-off-by: Shameer Kolothum --- hw/arm/smmuv3.c | 27 +++++++++++++++++++++++++++ include/hw/arm/smmuv3.h | 2 ++ 2 files changed, 29 insertions(+) diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c index 3010471cdc..66e4e1b57d 100644 --- a/hw/arm/smmuv3.c +++ b/hw/arm/smmuv3.c @@ -24,6 +24,7 @@ #include "hw/qdev-properties.h" #include "hw/qdev-core.h" #include "hw/pci/pci.h" +#include "hw/pci/pci_bridge.h" #include "cpu.h" #include "trace.h" #include "qemu/log.h" @@ -2069,12 +2070,32 @@ static void smmu_realize(DeviceState *d, Error **errp) smmu_init_irq(s, dev); } +static int smmuv3_nested_pci_host_bridge(Object *obj, void *opaque) +{ + DeviceState *d = opaque; + SMMUv3NestedState *s_nested = ARM_SMMUV3_NESTED(d); + + if (object_dynamic_cast(obj, TYPE_PCI_HOST_BRIDGE)) { + PCIBus *bus = PCI_HOST_BRIDGE(obj)->bus; + if (s_nested->pci_bus && !strcmp(bus->qbus.name, s_nested->pci_bus)) { + object_property_set_link(OBJECT(d), "primary-bus", OBJECT(bus), + &error_abort); + } + } + return 0; +} + static void smmu_nested_realize(DeviceState *d, Error **errp) { SMMUv3NestedState *s_nested = ARM_SMMUV3_NESTED(d); SMMUv3NestedClass *c = ARM_SMMUV3_NESTED_GET_CLASS(s_nested); + SysBusDevice *dev = SYS_BUS_DEVICE(d); Error *local_err = NULL; + object_child_foreach_recursive(object_get_root(), + smmuv3_nested_pci_host_bridge, d); + object_property_set_bool(OBJECT(dev), "nested", true, &error_abort); + c->parent_realize(d, &local_err); if (local_err) { error_propagate(errp, local_err); @@ -2161,6 +2182,11 @@ static Property smmuv3_properties[] = { DEFINE_PROP_END_OF_LIST() }; +static Property smmuv3_nested_properties[] = { + DEFINE_PROP_STRING("pci-bus", SMMUv3NestedState, pci_bus), + DEFINE_PROP_END_OF_LIST() +}; + static void smmuv3_instance_init(Object *obj) { /* Nothing much to do here as of now */ @@ -2188,6 +2214,7 @@ static void smmuv3_nested_class_init(ObjectClass *klass, void *data) dc->vmsd = &vmstate_smmuv3; device_class_set_parent_realize(dc, smmu_nested_realize, &c->parent_realize); + device_class_set_props(dc, smmuv3_nested_properties); dc->user_creatable = true; dc->hotpluggable = false; } diff --git a/include/hw/arm/smmuv3.h b/include/hw/arm/smmuv3.h index 87e628be7a..96513fce56 100644 --- a/include/hw/arm/smmuv3.h +++ b/include/hw/arm/smmuv3.h @@ -89,6 +89,8 @@ OBJECT_DECLARE_TYPE(SMMUv3NestedState, SMMUv3NestedClass, ARM_SMMUV3_NESTED) struct SMMUv3NestedState { SMMUv3State smmuv3_state; + + char *pci_bus; }; struct SMMUv3NestedClass { -- Gitee From a7ffb5856940a1515ef84a4d4644b7c7c07afb8f Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 6 Nov 2024 19:22:13 +0000 Subject: [PATCH 757/939] hw/arm/virt-acpi-build: Build IORT with multiple SMMU nodes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that we can have multiple user-creatable smmuv3-nested devices, each associated with different pci buses, update IORT ID mappings accordingly. 
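For illustration, a minimal sketch (not part of this series) of how the i-th user-created nested SMMUv3 instance is placed inside the VIRT_SMMU_NESTED window; the helper name is hypothetical, while the symbols (VirtMachineState, VIRT_SMMU_NESTED, SMMU_IO_LEN, NUM_SMMU_IRQS, ARM_SPI_BASE) come from the patches themselves. It restates the arithmetic that the virt plug handler above and the IORT builder below each perform inline:

    static inline void smmu_nested_instance_resources(VirtMachineState *vms,
                                                      int i, hwaddr *base,
                                                      int *spi)
    {
        /* Each instance owns SMMU_IO_LEN (0x20000) of MMIO inside VIRT_SMMU_NESTED */
        *base = vms->memmap[VIRT_SMMU_NESTED].base + i * SMMU_IO_LEN;
        /*
         * ...and NUM_SMMU_IRQS consecutive SPIs starting at irqmap[VIRT_SMMU_NESTED];
         * the IORT SMMUv3 node reports them as GSIVs, i.e. with ARM_SPI_BASE added.
         */
        *spi = vms->irqmap[VIRT_SMMU_NESTED] + i * NUM_SMMU_IRQS;
    }

Assuming NUM_SMMU_IRQS is 4, as the plug handler's IRQ loop suggests, a second instance (i = 1) would sit at 0x0b030000 and use SPIs 204 to 207.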
Signed-off-by: Nicolin Chen Signed-off-by: Shameer Kolothum --- hw/arm/virt-acpi-build.c | 43 ++++++++++++++++++++++++++++------------ include/hw/arm/virt.h | 6 ++++++ 2 files changed, 36 insertions(+), 13 deletions(-) diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c index 076781423b..1d7839e4a0 100644 --- a/hw/arm/virt-acpi-build.c +++ b/hw/arm/virt-acpi-build.c @@ -555,8 +555,10 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) { int i, nb_nodes, rc_mapping_count; const uint32_t iort_node_offset = IORT_NODE_OFFSET; - size_t node_size, smmu_offset = 0; + size_t node_size, *smmu_offset; AcpiIortIdMapping *idmap; + hwaddr base; + int irq, num_smmus = 0; uint32_t id = 0; GArray *smmu_idmaps = g_array_new(false, true, sizeof(AcpiIortIdMapping)); GArray *its_idmaps = g_array_new(false, true, sizeof(AcpiIortIdMapping)); @@ -566,7 +568,21 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) /* Table 2 The IORT */ acpi_table_begin(&table, table_data); - if (vms->iommu == VIRT_IOMMU_SMMUV3) { + if (vms->smmu_nested_count) { + irq = vms->irqmap[VIRT_SMMU_NESTED] + ARM_SPI_BASE; + base = vms->memmap[VIRT_SMMU_NESTED].base; + num_smmus = vms->smmu_nested_count; + } else if (virt_has_smmuv3(vms)) { + irq = vms->irqmap[VIRT_SMMU] + ARM_SPI_BASE; + base = vms->memmap[VIRT_SMMU].base; + num_smmus = 1; + } + + smmu_offset = g_new0(size_t, num_smmus); + nb_nodes = 2; /* RC, ITS */ + nb_nodes += num_smmus; /* SMMU nodes */ + + if (virt_has_smmuv3(vms)) { AcpiIortIdMapping next_range = {0}; object_child_foreach_recursive(object_get_root(), @@ -588,18 +604,19 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) } next_range.input_base = idmap->input_base + idmap->id_count; + if (vms->iommu == VIRT_IOMMU_SMMUV3_NESTED) { + nb_nodes++; /* RMR node per SMMU */ + } } /* Append the last RC -> ITS ID mapping */ - if (next_range.input_base < 0xFFFF) { - next_range.id_count = 0xFFFF - next_range.input_base; + if (next_range.input_base < 0x10000) { + next_range.id_count = 0x10000 - next_range.input_base; g_array_append_val(its_idmaps, next_range); } - nb_nodes = 3; /* RC, ITS, SMMUv3 */ rc_mapping_count = smmu_idmaps->len + its_idmaps->len; } else { - nb_nodes = 2; /* RC, ITS */ rc_mapping_count = 1; } /* Number of IORT Nodes */ @@ -621,10 +638,9 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) /* GIC ITS Identifier Array */ build_append_int_noprefix(table_data, 0 /* MADT translation_id */, 4); - if (vms->iommu == VIRT_IOMMU_SMMUV3) { - int irq = vms->irqmap[VIRT_SMMU] + ARM_SPI_BASE; + for (i = 0; i < num_smmus; i++) { + smmu_offset[i] = table_data->len - table.table_offset; - smmu_offset = table_data->len - table.table_offset; /* Table 9 SMMUv3 Format */ build_append_int_noprefix(table_data, 4 /* SMMUv3 */, 1); /* Type */ node_size = SMMU_V3_ENTRY_SIZE + ID_MAPPING_ENTRY_SIZE; @@ -635,7 +651,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) /* Reference to ID Array */ build_append_int_noprefix(table_data, SMMU_V3_ENTRY_SIZE, 4); /* Base address */ - build_append_int_noprefix(table_data, vms->memmap[VIRT_SMMU].base, 8); + build_append_int_noprefix(table_data, base + (i * SMMU_IO_LEN), 8); /* Flags */ build_append_int_noprefix(table_data, 1 /* COHACC Override */, 4); build_append_int_noprefix(table_data, 0, 4); /* Reserved */ @@ -646,12 +662,13 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) build_append_int_noprefix(table_data, irq + 1, 4); /* 
PRI */ build_append_int_noprefix(table_data, irq + 3, 4); /* GERR */ build_append_int_noprefix(table_data, irq + 2, 4); /* Sync */ + irq += NUM_SMMU_IRQS; build_append_int_noprefix(table_data, 0, 4); /* Proximity domain */ /* DeviceID mapping index (ignored since interrupts are GSIV based) */ build_append_int_noprefix(table_data, 0, 4); /* output IORT node is the ITS group node (the first node) */ - build_iort_id_mapping(table_data, 0, 0xFFFF, IORT_NODE_OFFSET); + build_iort_id_mapping(table_data, 0, 0x10000, IORT_NODE_OFFSET); } /* Table 17 Root Complex Node */ @@ -684,7 +701,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) build_append_int_noprefix(table_data, 0, 3); /* Reserved */ /* Output Reference */ - if (vms->iommu == VIRT_IOMMU_SMMUV3) { + if (virt_has_smmuv3(vms)) { AcpiIortIdMapping *range; /* translated RIDs connect to SMMUv3 node: RC -> SMMUv3 -> ITS */ @@ -692,7 +709,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) range = &g_array_index(smmu_idmaps, AcpiIortIdMapping, i); /* output IORT node is the smmuv3 node */ build_iort_id_mapping(table_data, range->input_base, - range->id_count, smmu_offset); + range->id_count, smmu_offset[i]); } /* bypassed RIDs connect to ITS group node directly: RC -> ITS */ diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h index cd41e28202..bc3c8b70da 100644 --- a/include/hw/arm/virt.h +++ b/include/hw/arm/virt.h @@ -295,4 +295,10 @@ static inline int virt_gicv3_redist_region_count(VirtMachineState *vms) vms->highmem_redists) ? 2 : 1; } +static inline bool virt_has_smmuv3(const VirtMachineState *vms) +{ + return vms->iommu == VIRT_IOMMU_SMMUV3 || + vms->iommu == VIRT_IOMMU_SMMUV3_NESTED; +} + #endif /* QEMU_ARM_VIRT_H */ -- Gitee From ca17fd9b9e608e0a6e8a948ccf46fa020c12f510 Mon Sep 17 00:00:00 2001 From: caijian Date: Mon, 31 Mar 2025 15:06:13 +0800 Subject: [PATCH 758/939] tests/qtest: Allow IORT acpi table to change List changed IORT file and allow it to change. Signed-off-by: caijian --- tests/qtest/bios-tables-test-allowed-diff.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/qtest/bios-tables-test-allowed-diff.h b/tests/qtest/bios-tables-test-allowed-diff.h index dfb8523c8b..9a5a923d6b 100644 --- a/tests/qtest/bios-tables-test-allowed-diff.h +++ b/tests/qtest/bios-tables-test-allowed-diff.h @@ -1 +1,2 @@ /* List of comma-separated changed AML files to ignore */ +"tests/data/acpi/virt/IORT", -- Gitee From 1746ba1aee671b9552540e36a629988b00846a82 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Tue, 5 Oct 2021 10:53:13 +0200 Subject: [PATCH 759/939] hw/arm/virt-acpi-build: Add IORT RMR regions to handle MSI nested binding To handle SMMUv3 nested stage support it is practical to expose the guest with reserved memory regions (RMRs) covering the IOVAs used by the host kernel to map physical MSI doorbells. Those IOVAs belong to [0x8000000, 0x8100000] matching MSI_IOVA_BASE and MSI_IOVA_LENGTH definitions in kernel arm-smmu-v3 driver. This is the window used to allocate IOVAs matching physical MSI doorbells. With those RMRs, the guest is forced to use a flat mapping for this range. Hence the assigned device is programmed with one IOVA from this range. Stage 1, owned by the guest has a flat mapping for this IOVA. Stage2, owned by the VMM then enforces a mapping from this IOVA to the physical MSI doorbell. The creation of those RMR nodes only is relevant if nested stage SMMU is in use, along with VFIO. As VFIO devices can be hotplugged, all RMRs need to be created in advance. 
Hence the patch introduces a new arm virt "nested-smmuv3" iommu type. ARM DEN 0049E.b IORT specification also mandates that when RMRs are present, the OS must preserve PCIe configuration performed by the boot FW. So along with the RMR IORT nodes, a _DSM function #5, as defined by PCI FIRMWARE SPECIFICATION EVISION 3.3, chapter 4.6.5 is added to PCIe host bridge and PCIe expander bridge objects. Signed-off-by: Eric Auger Suggested-by: Jean-Philippe Brucker Signed-off-by: Nicolin Chen Signed-off-by: Shameer Kolothum --- hw/arm/virt-acpi-build.c | 71 +++++++++++++++++++++++++++++++++++----- 1 file changed, 63 insertions(+), 8 deletions(-) diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c index 1d7839e4a0..ad0f79e03d 100644 --- a/hw/arm/virt-acpi-build.c +++ b/hw/arm/virt-acpi-build.c @@ -417,6 +417,14 @@ static void acpi_dsdt_add_pci(Aml *scope, const MemMapEntry *memmap, .bus = vms->bus, }; + /* + * Nested SMMU requires RMRs for MSI 1-1 mapping, which + * require _DSM for PreservingPCI Boot Configurations + */ + if (vms->iommu == VIRT_IOMMU_SMMUV3_NESTED) { + cfg.preserve_config = true; + } + if (vms->highmem_mmio) { cfg.mmio64 = memmap[VIRT_HIGH_PCIE_MMIO]; } @@ -495,7 +503,7 @@ static void acpi_dsdt_add_tpm(Aml *scope, VirtMachineState *vms) #define IORT_NODE_OFFSET 48 static void build_iort_id_mapping(GArray *table_data, uint32_t input_base, - uint32_t id_count, uint32_t out_ref) + uint32_t id_count, uint32_t out_ref, uint32_t flags) { /* Table 4 ID mapping format */ build_append_int_noprefix(table_data, input_base, 4); /* Input base */ @@ -503,7 +511,7 @@ static void build_iort_id_mapping(GArray *table_data, uint32_t input_base, build_append_int_noprefix(table_data, input_base, 4); /* Output base */ build_append_int_noprefix(table_data, out_ref, 4); /* Output Reference */ /* Flags */ - build_append_int_noprefix(table_data, 0 /* Single mapping (disabled) */, 4); + build_append_int_noprefix(table_data, flags, 4); /* Flags */ } struct AcpiIortIdMapping { @@ -545,6 +553,50 @@ static int iort_idmap_compare(gconstpointer a, gconstpointer b) return idmap_a->input_base - idmap_b->input_base; } +static void +build_iort_rmr_nodes(GArray *table_data, GArray *smmu_idmaps, + size_t *smmu_offset, uint32_t *id) +{ + AcpiIortIdMapping *range; + int i; + + for (i = 0; i < smmu_idmaps->len; i++) { + range = &g_array_index(smmu_idmaps, AcpiIortIdMapping, i); + int bdf = range->input_base; + + /* Table 18 Reserved Memory Range Node */ + + build_append_int_noprefix(table_data, 6 /* RMR */, 1); /* Type */ + /* Length */ + build_append_int_noprefix(table_data, 28 + ID_MAPPING_ENTRY_SIZE + 20, 2); + build_append_int_noprefix(table_data, 3, 1); /* Revision */ + build_append_int_noprefix(table_data, *id, 4); /* Identifier */ + /* Number of ID mappings */ + build_append_int_noprefix(table_data, 1, 4); + /* Reference to ID Array */ + build_append_int_noprefix(table_data, 28, 4); + + /* RMR specific data */ + + /* Flags */ + build_append_int_noprefix(table_data, 0 /* Disallow remapping */, 4); + /* Number of Memory Range Descriptors */ + build_append_int_noprefix(table_data, 1 , 4); + /* Reference to Memory Range Descriptors */ + build_append_int_noprefix(table_data, 28 + ID_MAPPING_ENTRY_SIZE, 4); + build_iort_id_mapping(table_data, bdf, range->id_count, smmu_offset[i], 1); + + /* Table 19 Memory Range Descriptor */ + + /* Physical Range offset */ + build_append_int_noprefix(table_data, 0x8000000, 8); + /* Physical Range length */ + build_append_int_noprefix(table_data, 0x100000, 8); + 
build_append_int_noprefix(table_data, 0, 4); /* Reserved */ + *id += 1; + } +} + /* * Input Output Remapping Table (IORT) * Conforms to "IO Remapping Table System Software on ARM Platforms", @@ -554,7 +606,6 @@ static void build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) { int i, nb_nodes, rc_mapping_count; - const uint32_t iort_node_offset = IORT_NODE_OFFSET; size_t node_size, *smmu_offset; AcpiIortIdMapping *idmap; hwaddr base; @@ -563,7 +614,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) GArray *smmu_idmaps = g_array_new(false, true, sizeof(AcpiIortIdMapping)); GArray *its_idmaps = g_array_new(false, true, sizeof(AcpiIortIdMapping)); - AcpiTable table = { .sig = "IORT", .rev = 3, .oem_id = vms->oem_id, + AcpiTable table = { .sig = "IORT", .rev = 5, .oem_id = vms->oem_id, .oem_table_id = vms->oem_table_id }; /* Table 2 The IORT */ acpi_table_begin(&table, table_data); @@ -668,7 +719,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) build_append_int_noprefix(table_data, 0, 4); /* output IORT node is the ITS group node (the first node) */ - build_iort_id_mapping(table_data, 0, 0x10000, IORT_NODE_OFFSET); + build_iort_id_mapping(table_data, 0, 0x10000, IORT_NODE_OFFSET, 0); } /* Table 17 Root Complex Node */ @@ -709,7 +760,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) range = &g_array_index(smmu_idmaps, AcpiIortIdMapping, i); /* output IORT node is the smmuv3 node */ build_iort_id_mapping(table_data, range->input_base, - range->id_count, smmu_offset[i]); + range->id_count, smmu_offset[i], 0); } /* bypassed RIDs connect to ITS group node directly: RC -> ITS */ @@ -717,11 +768,15 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) range = &g_array_index(its_idmaps, AcpiIortIdMapping, i); /* output IORT node is the ITS group node (the first node) */ build_iort_id_mapping(table_data, range->input_base, - range->id_count, iort_node_offset); + range->id_count, IORT_NODE_OFFSET, 0); } } else { /* output IORT node is the ITS group node (the first node) */ - build_iort_id_mapping(table_data, 0, 0xFFFF, IORT_NODE_OFFSET); + build_iort_id_mapping(table_data, 0, 0x10000, IORT_NODE_OFFSET, 0); + } + + if (vms->iommu == VIRT_IOMMU_SMMUV3_NESTED) { + build_iort_rmr_nodes(table_data, smmu_idmaps, smmu_offset, &id); } acpi_table_end(linker, &table); -- Gitee From bf12438e93f2d55aac6245f6a9f77f51b6fd2d8a Mon Sep 17 00:00:00 2001 From: caijian Date: Mon, 31 Mar 2025 15:06:24 +0800 Subject: [PATCH 760/939] tests/data/acpi/virt: Update IORT acpi table - * Disassembly of tests/data/acpi/virt/IORT, Fri Mar 28 18:05:37 2025 + * Disassembly of /tmp/aml-9R3932, Fri Mar 28 18:05:37 2025 * * ACPI Data Table [IORT] * * Format: [HexOffset DecimalOffset ByteLength] FieldName : FieldValue */ [000h 0000 4] Signature : "IORT" [IO Remapping Table] [004h 0004 4] Table Length : 00000080 -[008h 0008 1] Revision : 03 -[009h 0009 1] Checksum : B3 +[008h 0008 1] Revision : 05 +[009h 0009 1] Checksum : AE [00Ah 0010 6] Oem ID : "BOCHS " [010h 0016 8] Oem Table ID : "BXPC " [018h 0024 4] Oem Revision : 00000001 [01Ch 0028 4] Asl Compiler ID : "BXPC" [020h 0032 4] Asl Compiler Revision : 00000001 @@ -45,32 +45,32 @@ [058h 0088 4] Cache Coherency : 00000001 [05Ch 0092 1] Hints (decoded below) : 00 Transient : 0 Write Allocate : 0 Read Allocate : 0 Override : 0 [05Dh 0093 2] Reserved : 0000 [05Fh 0095 1] Memory Flags (decoded below) : 03 Coherency : 1 Device Attribute : 1 [060h 0096 4] ATS 
Attribute : 00000000 [064h 0100 4] PCI Segment Number : 00000000 [068h 0104 1] Memory Size Limit : 40 [069h 0105 3] Reserved : 000000 [06Ch 0108 4] Input base : 00000000 -[070h 0112 4] ID Count : 0000FFFF +[070h 0112 4] ID Count : 00010000 [074h 0116 4] Output Base : 00000000 [078h 0120 4] Output Reference : 00000030 [07Ch 0124 4] Flags (decoded below) : 00000000 Single Mapping : 0 Raw Table Data: Length 128 (0x80) - 0000: 49 4F 52 54 80 00 00 00 03 B3 42 4F 43 48 53 20 // IORT......BOCHS + 0000: 49 4F 52 54 80 00 00 00 05 AE 42 4F 43 48 53 20 // IORT......BOCHS 0010: 42 58 50 43 20 20 20 20 01 00 00 00 42 58 50 43 // BXPC ....BXPC 0020: 01 00 00 00 02 00 00 00 30 00 00 00 00 00 00 00 // ........0....... 0030: 00 18 00 01 00 00 00 00 00 00 00 00 00 00 00 00 // ................ 0040: 01 00 00 00 00 00 00 00 02 38 00 03 01 00 00 00 // .........8...... 0050: 01 00 00 00 24 00 00 00 01 00 00 00 00 00 00 03 // ....$........... 0060: 00 00 00 00 00 00 00 00 40 00 00 00 00 00 00 00 // ........@....... - 0070: FF FF 00 00 00 00 00 00 30 00 00 00 00 00 00 00 // ........0....... + 0070: 00 00 01 00 00 00 00 00 30 00 00 00 00 00 00 00 // ........0....... Signed-off-by: caijian --- tests/data/acpi/virt/IORT | Bin 128 -> 128 bytes tests/qtest/bios-tables-test-allowed-diff.h | 1 - 2 files changed, 1 deletion(-) diff --git a/tests/data/acpi/virt/IORT b/tests/data/acpi/virt/IORT index 7efd0ce8a6b3928efa7e1373f688ab4c5f50543b..9f0958b3df5aa6b9c3092885f79a20d82da8f011 100644 GIT binary patch delta 35 lcmZo*Y+&T_^bZPYU|?WiT{n>{O@M)c5y)m>FaVPb3;=k51i%0Q delta 35 jcmZo*Y+&T_^bZPYU|?Wi-aL^jP2m53AQK1-AQS@tmyia) diff --git a/tests/qtest/bios-tables-test-allowed-diff.h b/tests/qtest/bios-tables-test-allowed-diff.h index 9a5a923d6b..dfb8523c8b 100644 --- a/tests/qtest/bios-tables-test-allowed-diff.h +++ b/tests/qtest/bios-tables-test-allowed-diff.h @@ -1,2 +1 @@ /* List of comma-separated changed AML files to ignore */ -"tests/data/acpi/virt/IORT", -- Gitee From 8414bc02f988ecca7dda5325227ff5ffbe45150c Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Wed, 15 Jan 2025 10:02:58 +0000 Subject: [PATCH 761/939] iommufd.h: Updated to openeuler olk-6.6 kernel Signed-off-by: Shameer Kolothum --- linux-headers/linux/iommufd.h | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/linux-headers/linux/iommufd.h b/linux-headers/linux/iommufd.h index 41559c6064..3e57fee01c 100644 --- a/linux-headers/linux/iommufd.h +++ b/linux-headers/linux/iommufd.h @@ -51,8 +51,8 @@ enum { IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP = 0x8c, IOMMUFD_CMD_HWPT_INVALIDATE = 0x8d, IOMMUFD_CMD_FAULT_QUEUE_ALLOC = 0x8e, - IOMMUFD_CMD_VIOMMU_ALLOC = 0x8f, - IOMMUFD_CMD_VDEVICE_ALLOC = 0x90, + IOMMUFD_CMD_VIOMMU_ALLOC = 0x90, + IOMMUFD_CMD_VDEVICE_ALLOC = 0x91, }; /** @@ -397,18 +397,20 @@ struct iommu_hwpt_vtd_s1 { }; /** - * struct iommu_hwpt_arm_smmuv3 - ARM SMMUv3 Context Descriptor Table info + * struct iommu_hwpt_arm_smmuv3 - ARM SMMUv3 nested STE * (IOMMU_HWPT_DATA_ARM_SMMUV3) * * @ste: The first two double words of the user space Stream Table Entry for - * a user stage-1 Context Descriptor Table. Must be little-endian. + * the translation. Must be little-endian. * Allowed fields: (Refer to "5.2 Stream Table Entry" in SMMUv3 HW Spec) * - word-0: V, Cfg, S1Fmt, S1ContextPtr, S1CDMax * - word-1: EATS, S1DSS, S1CIR, S1COR, S1CSH, S1STALLD * * -EIO will be returned if @ste is not legal or contains any non-allowed field. * Cfg can be used to select a S1, Bypass or Abort configuration. 
A Bypass - * nested domain will translate the same as the nesting parent. + * nested domain will translate the same as the nesting parent. The S1 will + * install a Context Descriptor Table pointing at userspace memory translated + * by the nesting parent. */ struct iommu_hwpt_arm_smmuv3 { __aligned_le64 ste[2]; @@ -920,8 +922,8 @@ enum iommu_viommu_type { * that is unique to a specific VM. Operations global to the IOMMU are connected * to the vIOMMU, such as: * - Security namespace for guest owned ID, e.g. guest-controlled cache tags + * - Non-device-affiliated event reporting, e.g. invalidation queue errors * - Access to a sharable nesting parent pagetable across physical IOMMUs - * - Non-affiliated event reporting (e.g. an invalidation queue error) * - Virtualization of various platforms IDs, e.g. RIDs and others * - Delivery of paravirtualized invalidation * - Direct assigned invalidation queues @@ -941,12 +943,10 @@ struct iommu_viommu_alloc { * struct iommu_vdevice_alloc - ioctl(IOMMU_VDEVICE_ALLOC) * @size: sizeof(struct iommu_vdevice_alloc) * @viommu_id: vIOMMU ID to associate with the virtual device - * @dev_id: The pyhsical device to allocate a virtual instance on the vIOMMU - * @__reserved: Must be 0 + * @dev_id: The physical device to allocate a virtual instance on the vIOMMU + * @out_vdevice_id: Object handle for the vDevice. Pass to IOMMU_DESTORY * @virt_id: Virtual device ID per vIOMMU, e.g. vSID of ARM SMMUv3, vDeviceID - * of AMD IOMMU, and vID of a nested Intel VT-d to a Context Table. - * @out_vdevice_id: Output virtual instance ID for the allocated object - * @__reserved2: Must be 0 + * of AMD IOMMU, and vRID of a nested Intel VT-d to a Context Table * * Allocate a virtual device instance (for a physical device) against a vIOMMU. * This instance holds the device's information (related to its vIOMMU) in a VM. @@ -955,10 +955,8 @@ struct iommu_vdevice_alloc { __u32 size; __u32 viommu_id; __u32 dev_id; - __u32 __reserved; - __aligned_u64 virt_id; __u32 out_vdevice_id; - __u32 __reserved2; + __aligned_u64 virt_id; }; #define IOMMU_VDEVICE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VDEVICE_ALLOC) #endif -- Gitee From c8267f88b2af37779a597aac00aeaf06adc80ccc Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Mon, 11 Dec 2023 14:42:01 +0000 Subject: [PATCH 762/939] hw/arm/smmuv3: Enable sva/stall IDR features Emulate features that will enable the stall and sva feature in Guest. 
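For context, a minimal sketch (not part of this patch) of what the guest-visible ID register fields below amount to, read back with the existing FIELD_EX32 helpers; the function name is illustrative, and the interpretations follow the comments in the diff: STALL_MODEL 0 selects the stall model, a non-zero SSIDSIZE advertises substream IDs needed for SVA, and OAS 5 encodes a 48-bit output address size.

    static void smmuv3_log_stall_sva_caps(SMMUv3State *s)
    {
        /* 0 means stall is supported; the previous value 1 meant "no stall" */
        bool stall_supported = FIELD_EX32(s->idr[0], IDR0, STALL_MODEL) == 0;
        /* SSIDSIZE > 0 exposes substream IDs (PASIDs), a prerequisite for SVA */
        uint32_t ssidsize = FIELD_EX32(s->idr[1], IDR1, SSIDSIZE);
        /* OAS moves from 4 (44-bit) to 5 (48-bit output addresses) */
        uint32_t oas = FIELD_EX32(s->idr[5], IDR5, OAS);

        qemu_log("SMMUv3 IDR: stall=%d ssidsize=%u oas=%u\n",
                 stall_supported, ssidsize, oas);
    }

The removal of the CD_S() check in decode_cd() below is then what lets a stall-enabled Context Descriptor through once the guest turns these features on.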
Signed-off-by: Shameer Kolothum --- hw/arm/smmuv3-internal.h | 3 ++- hw/arm/smmuv3.c | 8 +++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/hw/arm/smmuv3-internal.h b/hw/arm/smmuv3-internal.h index a411fd4048..cfc04c563e 100644 --- a/hw/arm/smmuv3-internal.h +++ b/hw/arm/smmuv3-internal.h @@ -74,6 +74,7 @@ REG32(IDR1, 0x4) FIELD(IDR1, ECMDQ, 31, 1) #define SMMU_IDR1_SIDSIZE 16 +#define SMMU_IDR1_SSIDSIZE 16 #define SMMU_CMDQS 19 #define SMMU_EVENTQS 19 @@ -104,7 +105,7 @@ REG32(IDR5, 0x14) FIELD(IDR5, VAX, 10, 2); FIELD(IDR5, STALL_MAX, 16, 16); -#define SMMU_IDR5_OAS 4 +#define SMMU_IDR5_OAS 5 REG32(IIDR, 0x18) REG32(AIDR, 0x1c) diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c index 66e4e1b57d..8d8dcccd48 100644 --- a/hw/arm/smmuv3.c +++ b/hw/arm/smmuv3.c @@ -343,13 +343,14 @@ static void smmuv3_init_regs(SMMUv3State *s) s->idr[0] = FIELD_DP32(s->idr[0], IDR0, ASID16, 1); /* 16-bit ASID */ s->idr[0] = FIELD_DP32(s->idr[0], IDR0, VMID16, 1); /* 16-bit VMID */ s->idr[0] = FIELD_DP32(s->idr[0], IDR0, TTENDIAN, 2); /* little endian */ - s->idr[0] = FIELD_DP32(s->idr[0], IDR0, STALL_MODEL, 1); /* No stall */ + s->idr[0] = FIELD_DP32(s->idr[0], IDR0, STALL_MODEL, 0); /* stall */ /* terminated transaction will always be aborted/error returned */ s->idr[0] = FIELD_DP32(s->idr[0], IDR0, TERM_MODEL, 1); /* 2-level stream table supported */ s->idr[0] = FIELD_DP32(s->idr[0], IDR0, STLEVEL, 1); s->idr[1] = FIELD_DP32(s->idr[1], IDR1, SIDSIZE, SMMU_IDR1_SIDSIZE); + s->idr[1] = FIELD_DP32(s->idr[1], IDR1, SSIDSIZE, SMMU_IDR1_SSIDSIZE); s->idr[1] = FIELD_DP32(s->idr[1], IDR1, EVENTQS, SMMU_EVENTQS); s->idr[1] = FIELD_DP32(s->idr[1], IDR1, CMDQS, SMMU_CMDQS); @@ -361,7 +362,7 @@ static void smmuv3_init_regs(SMMUv3State *s) s->idr[3] = FIELD_DP32(s->idr[3], IDR3, RIL, 1); s->idr[3] = FIELD_DP32(s->idr[3], IDR3, BBML, 2); - s->idr[5] = FIELD_DP32(s->idr[5], IDR5, OAS, SMMU_IDR5_OAS); /* 44 bits */ + s->idr[5] = FIELD_DP32(s->idr[5], IDR5, OAS, SMMU_IDR5_OAS); /* 48 bits */ /* 4K, 16K and 64K granule support */ s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN4K, 1); s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN16K, 1); @@ -776,9 +777,6 @@ static int decode_cd(SMMUTransCfg *cfg, CD *cd, SMMUEventInfo *event) if (!CD_A(cd)) { goto bad_cd; /* SMMU_IDR0.TERM_MODEL == 1 */ } - if (CD_S(cd)) { - goto bad_cd; /* !STE_SECURE && SMMU_IDR0.STALL_MODEL == 1 */ - } if (CD_HA(cd) || CD_HD(cd)) { goto bad_cd; /* HTTU = 0 */ } -- Gitee From cdd5c088ff46ebf423c926fe4c0b12e345ae0db0 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Thu, 23 Feb 2023 12:12:48 +0000 Subject: [PATCH 763/939] =?UTF-8?q?kvm:=20Translate=20MSI=20doorbell=20add?= =?UTF-8?q?ress=C2=A0only=20if=20it=20is=20valid?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Guest might have already set the MSI doorbell address to invalid and if we try to translate the address again, Guest reports, [ 26.784082] arm-smmu-v3 arm-smmu-v3.0.auto: event 0x10 received: [ 26.784088] arm-smmu-v3 arm-smmu-v3.0.auto: 0x0000001000000010 [ 26.784090] arm-smmu-v3 arm-smmu-v3.0.auto: 0x0000000000000000 [ 26.784092] arm-smmu-v3 arm-smmu-v3.0.auto: 0x0000000000000000 [ 26.784094] arm-smmu-v3 arm-smmu-v3.0.auto: 0x0000000000000000 [ 26.788082] arm-smmu-v3 arm-smmu-v3.0.auto: event 0x10 received: [ 26.788085] arm-smmu-v3 arm-smmu-v3.0.auto: 0x0000001000000010 [ 26.788087] arm-smmu-v3 arm-smmu-v3.0.auto: 0x0000000000000000 .... eg: rmmod hisi_zip.ko. 
The sequence seems to be, - Write 0 to MSI Message Address register - Disable MSI Hence check for address validity before we try to do the translation. Note: The fix is placed in generic code and hopefully is not a problem for other architectures. Signed-off-by: Shameer Kolothum --- accel/kvm/kvm-all.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index a8e29f148e..6fa97d2cbf 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -2074,7 +2074,8 @@ int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg, kroute.flags = KVM_MSI_VALID_DEVID; kroute.u.msi.devid = pci_requester_id(dev); } - if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) { + if (msg.address && + kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) { return -EINVAL; } -- Gitee From ebfa7213e32faafd5532d6f5b3cb873018b671ae Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Thu, 10 Oct 2024 06:19:31 +0000 Subject: [PATCH 764/939] smmuv3: Add support for page fault handling Handle page fault from host and send response back. Signed-off-by: Shameer Kolothum --- backends/iommufd.c | 20 +++- hw/arm/smmu-common.c | 39 ++++++-- hw/arm/smmuv3.c | 188 ++++++++++++++++++++++++++++++++++- hw/vfio/iommufd.c | 2 +- include/hw/arm/smmu-common.h | 24 ++++- include/sysemu/iommufd.h | 2 +- 6 files changed, 263 insertions(+), 12 deletions(-) diff --git a/backends/iommufd.c b/backends/iommufd.c index ee6f5bcf65..e9ce82297b 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -228,7 +228,7 @@ bool iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id, uint32_t pt_id, uint32_t flags, uint32_t data_type, uint32_t data_len, void *data_ptr, uint32_t *out_hwpt, - Error **errp) + uint32_t *out_fault_fd, Error **errp) { int ret, fd = be->fd; struct iommu_hwpt_alloc alloc_hwpt = { @@ -241,6 +241,24 @@ bool iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id, .data_uptr = (uintptr_t)data_ptr, }; + if (flags & IOMMU_HWPT_FAULT_ID_VALID) { + + struct iommu_fault_alloc cmd = { + .size = sizeof(cmd), + }; + + ret = ioctl(fd, IOMMU_FAULT_QUEUE_ALLOC, &cmd); + if (ret) { + ret = -errno; + error_report("IOMMU_FAULT_ALLOC failed: %m"); + } else { + alloc_hwpt.fault_id = cmd.out_fault_id; + if (out_fault_fd) { + *out_fault_fd = cmd.out_fault_fd; + } + } + } + ret = ioctl(fd, IOMMU_HWPT_ALLOC, &alloc_hwpt); trace_iommufd_backend_alloc_hwpt(fd, dev_id, pt_id, flags, data_type, data_len, (uintptr_t)data_ptr, diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c index d0bc620606..c382fa16e5 100644 --- a/hw/arm/smmu-common.c +++ b/hw/arm/smmu-common.c @@ -670,7 +670,7 @@ static bool smmu_dev_attach_viommu(SMMUDevice *sdev, if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid, idev->ioas_id, IOMMU_HWPT_ALLOC_NEST_PARENT, IOMMU_HWPT_DATA_NONE, 0, NULL, - &s2_hwpt_id, errp)) { + &s2_hwpt_id, NULL, errp)) { error_setg(errp, "failed to allocate an S2 hwpt"); return false; } @@ -695,7 +695,7 @@ static bool smmu_dev_attach_viommu(SMMUDevice *sdev, viommu->core->viommu_id, 0, IOMMU_HWPT_DATA_ARM_SMMUV3, sizeof(abort_data), &abort_data, - &viommu->abort_hwpt_id, errp)) { + &viommu->abort_hwpt_id, NULL, errp)) { error_setg(errp, "failed to allocate an abort pagetable"); goto free_viommu_core; } @@ -704,7 +704,7 @@ static bool smmu_dev_attach_viommu(SMMUDevice *sdev, viommu->core->viommu_id, 0, IOMMU_HWPT_DATA_ARM_SMMUV3, sizeof(bypass_data), &bypass_data, - &viommu->bypass_hwpt_id, errp)) { + &viommu->bypass_hwpt_id, NULL, errp)) 
{ error_setg(errp, "failed to allocate a bypass pagetable"); goto free_abort_hwpt; } @@ -882,6 +882,25 @@ void smmu_dev_uninstall_nested_ste(SMMUDevice *sdev, bool abort) hwpt_id = sdev->viommu->bypass_hwpt_id; } + /* ToDo: May be better to move the below to smmuv3. */ + if (s1_hwpt->out_fault_fd) { + struct io_uring *ring = &s1_hwpt->fault_ring; + struct io_uring_sqe *sqe; + struct __kernel_timespec ts = {.tv_sec = 0, .tv_nsec = 1}; + + s1_hwpt->exiting = true; + /* Send out a timeout sqe for the read handler to exit */ + sqe = io_uring_get_sqe(ring); + io_uring_prep_timeout(sqe, &ts, 0, 0); + io_uring_submit(ring); + + qemu_cond_signal(&s1_hwpt->fault_cond); + qemu_thread_join(&s1_hwpt->read_fault_thread); + qemu_thread_join(&s1_hwpt->write_fault_thread); + qemu_mutex_destroy(&s1_hwpt->fault_mutex); + io_uring_queue_exit(&s1_hwpt->fault_ring); + } + if (!host_iommu_device_iommufd_attach_hwpt(idev, hwpt_id, NULL)) { return; } @@ -892,11 +911,13 @@ void smmu_dev_uninstall_nested_ste(SMMUDevice *sdev, bool abort) } int smmu_dev_install_nested_ste(SMMUDevice *sdev, uint32_t data_type, - uint32_t data_len, void *data) + uint32_t data_len, void *data, + bool req_fault_fd) { SMMUViommu *viommu = sdev->viommu; SMMUS1Hwpt *s1_hwpt = sdev->s1_hwpt; HostIOMMUDeviceIOMMUFD *idev = sdev->idev; + uint32_t flags = 0; if (!idev || !viommu) { return -ENOENT; @@ -912,12 +933,18 @@ int smmu_dev_install_nested_ste(SMMUDevice *sdev, uint32_t data_type, } s1_hwpt->smmu = sdev->smmu; + s1_hwpt->sdev = sdev; s1_hwpt->viommu = viommu; s1_hwpt->iommufd = idev->iommufd; + if (req_fault_fd) { + flags |= IOMMU_HWPT_FAULT_ID_VALID; + } + if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid, - viommu->core->viommu_id, 0, data_type, - data_len, data, &s1_hwpt->hwpt_id, NULL)) { + viommu->core->viommu_id, flags, data_type, + data_len, data, &s1_hwpt->hwpt_id, + &s1_hwpt->out_fault_fd, NULL)) { goto free; } diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c index 8d8dcccd48..30c0ae4c3b 100644 --- a/hw/arm/smmuv3.c +++ b/hw/arm/smmuv3.c @@ -34,6 +34,9 @@ #include "hw/arm/smmuv3.h" #include "smmuv3-internal.h" #include "smmu-internal.h" +#ifdef CONFIG_LINUX_IO_URING +#include +#endif #define PTW_RECORD_FAULT(cfg) (((cfg)->stage == 1) ? 
(cfg)->record_faults : \ (cfg)->s2cfg.record_faults) @@ -1258,6 +1261,165 @@ static void smmuv3_range_inval(SMMUState *s, Cmd *cmd) } } +static void smmuv3_report_iommu_fault(SMMUS1Hwpt *hwpt, + struct iommu_hwpt_pgfault *fault) +{ + PendFaultEntry *pend; + SMMUDevice *sdev = hwpt->sdev; + SMMUv3State *s3 = sdev->smmu; + uint32_t sid = smmu_get_sid(sdev); + SMMUEventInfo info = {0}; + + info.sid = sid; + info.type = SMMU_EVT_F_TRANSLATION; + info.u.f_translation.addr = fault->addr; + info.u.f_translation.stall = true; + info.u.f_translation.ssid = fault->pasid; + info.u.f_translation.stag = fault->grpid; + + if (fault->flags | IOMMU_PGFAULT_FLAGS_PASID_VALID) { + info.u.f_translation.ssv = true; + } + if (fault->perm & IOMMU_PGFAULT_PERM_READ) { + info.u.f_translation.rnw = true; + } + if (fault->perm & IOMMU_PGFAULT_PERM_PRIV) { + info.u.f_translation.pnu = true; + } + if (fault->perm & IOMMU_PGFAULT_PERM_EXEC) { + info.u.f_translation.ind = true; + } + + pend = g_new0(PendFaultEntry, 1); + memcpy(&pend->fault, fault, sizeof(*fault)); + qemu_mutex_lock(&hwpt->fault_mutex); + QTAILQ_INSERT_TAIL(&hwpt->pendfault, pend, entry); + qemu_mutex_unlock(&hwpt->fault_mutex); + smmuv3_record_event(s3, &info); + return; +} + +static void smmuv3_notify_stall_resume(SMMUState *bs, uint32_t sid, + uint32_t stag, uint32_t code) +{ + SMMUDevice *sdev = smmu_find_sdev(bs, sid); + PageRespEntry *msg; + PendFaultEntry *pend, *tmp; + SMMUS1Hwpt *hwpt; + bool found = false; + + if (!sdev) { + return; + } + + hwpt = sdev->s1_hwpt; + msg = g_new0(PageRespEntry, 1); + + /* Kernel expects addr and pasid info for page response */ + qemu_mutex_lock(&hwpt->fault_mutex); + QTAILQ_FOREACH_SAFE(pend, &hwpt->pendfault, entry, tmp) { + if (pend->fault.grpid == stag) { + QTAILQ_REMOVE(&hwpt->pendfault, pend, entry); + msg->resp.cookie = pend->fault.cookie; + msg->resp.code = code; + QTAILQ_INSERT_TAIL(&hwpt->pageresp, msg, entry); + qemu_cond_signal(&hwpt->fault_cond); + + g_free(pend); + found = true; + break; + } + } + + qemu_mutex_unlock(&hwpt->fault_mutex); + if (!found) { + warn_report("No matching fault for resume(stag 0x%x), drop!", stag); + return; + } +} + +static void *write_fault_handler(void *opaque) +{ + SMMUS1Hwpt *hwpt = opaque; + PageRespEntry *msg, *tmp; + struct iommu_hwpt_page_response *resp; + int ret; + + resp = g_new0(struct iommu_hwpt_page_response, 1); + while (!hwpt->exiting) { + /* Check we have any pending responses */ + qemu_mutex_lock(&hwpt->fault_mutex); + qemu_cond_wait(&hwpt->fault_cond, &hwpt->fault_mutex); + QTAILQ_FOREACH_SAFE(msg, &hwpt->pageresp, entry, tmp) { + QTAILQ_REMOVE(&hwpt->pageresp, msg, entry); + memcpy(resp, &msg->resp, sizeof(*resp)); + g_free(msg); + + ret = write(hwpt->out_fault_fd, resp, sizeof(*resp)); + if (ret != sizeof(*resp)) { + warn_report("Write resp[cookie 0x%x] fail %d", + resp->cookie, ret); + } + } + qemu_mutex_unlock(&hwpt->fault_mutex); + } + g_free(resp); + return NULL; +} + +static void *read_fault_handler(void *opaque) +{ + SMMUS1Hwpt *hwpt = opaque; + struct io_uring_sqe *sqe; + struct io_uring_cqe *cqe; + struct iommu_hwpt_pgfault *fault; + struct io_uring *ring = &hwpt->fault_ring; + void *data; + int ret; + + fault = g_new0(struct iommu_hwpt_pgfault, 1); + while (!hwpt->exiting) { + sqe = io_uring_get_sqe(ring); + io_uring_prep_read(sqe, hwpt->out_fault_fd, fault, + sizeof(*fault), 0); + io_uring_sqe_set_data(sqe, fault); + io_uring_submit(ring); + + ret = io_uring_wait_cqe(ring, &cqe); + if (ret == 0) { + if (cqe->res == sizeof(*fault)) { + data = 
io_uring_cqe_get_data(cqe); + smmuv3_report_iommu_fault(hwpt, data); + } + } else { + warn_report("Read fault[hwpt_id 0x%x] failed %d", + hwpt->hwpt_id, ret); + } + io_uring_cqe_seen(ring, cqe); + } + g_free(fault); + return NULL; +} + +static void create_fault_handlers(SMMUS1Hwpt *hwpt) +{ + if (!hwpt->out_fault_fd) { + warn_report("No fault fd for hwpt id: %d", hwpt->hwpt_id); + return; + } + + io_uring_queue_init(1024, &hwpt->fault_ring, 0); + qemu_mutex_init(&hwpt->fault_mutex); + qemu_cond_init(&hwpt->fault_cond); + QTAILQ_INIT(&hwpt->pageresp); + QTAILQ_INIT(&hwpt->pendfault); + qemu_thread_create(&hwpt->read_fault_thread, "io fault read", + read_fault_handler, + hwpt, QEMU_THREAD_JOINABLE); + qemu_thread_create(&hwpt->write_fault_thread, "io fault write", + write_fault_handler, + hwpt, QEMU_THREAD_JOINABLE); +} static void smmuv3_install_nested_ste(SMMUDevice *sdev, int sid) { #ifdef __linux__ @@ -1266,6 +1428,7 @@ static void smmuv3_install_nested_ste(SMMUDevice *sdev, int sid) struct iommu_hwpt_arm_smmuv3 nested_data = {}; SMMUv3State *s = sdev->smmu; SMMUState *bs = &s->smmu_state; + bool req_fault_fd = false; uint32_t config; STE ste; int ret; @@ -1309,13 +1472,22 @@ static void smmuv3_install_nested_ste(SMMUDevice *sdev, int sid) /* S1DSS | S1CIR | S1COR | S1CSH | S1STALLD | EATS */ nested_data.ste[1] &= 0x380000ffULL; + if (STE_S1CDMAX(&ste)) { + req_fault_fd = true; + } + ret = smmu_dev_install_nested_ste(sdev, IOMMU_HWPT_DATA_ARM_SMMUV3, - sizeof(nested_data), &nested_data); + sizeof(nested_data), &nested_data, + req_fault_fd); if (ret) { error_report("Unable to install nested STE=%16LX:%16LX, ret=%d", nested_data.ste[1], nested_data.ste[0], ret); } + if (req_fault_fd) { + create_fault_handlers(sdev->s1_hwpt); + } + trace_smmuv3_install_nested_ste(sid, nested_data.ste[1], nested_data.ste[0]); #endif } @@ -1631,10 +1803,22 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) case SMMU_CMD_TLBI_EL2_VA: case SMMU_CMD_TLBI_EL2_VAA: case SMMU_CMD_PRI_RESP: - case SMMU_CMD_RESUME: case SMMU_CMD_STALL_TERM: trace_smmuv3_unhandled_cmd(type); break; + case SMMU_CMD_RESUME: + { + uint32_t sid = CMD_SID(&cmd); + uint16_t stag = CMD_RESUME_STAG(&cmd); + uint8_t action = CMD_RESUME_AC(&cmd); + uint32_t code = IOMMUFD_PAGE_RESP_INVALID; + + if (action) { + code = IOMMUFD_PAGE_RESP_SUCCESS; + } + smmuv3_notify_stall_resume(bs, sid, stag, code); + break; + } default: cmd_error = SMMU_CERROR_ILL; break; diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 528023b95b..c0eb87c78c 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -344,7 +344,7 @@ static int iommufd_cdev_autodomains_get(VFIODevice *vbasedev, if (!iommufd_backend_alloc_hwpt(iommufd, vbasedev->devid, container->ioas_id, flags, IOMMU_HWPT_DATA_NONE, 0, NULL, - &hwpt_id, errp)) { + &hwpt_id, NULL, errp)) { return -EINVAL; } diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h index e30539a8d4..087a11efc7 100644 --- a/include/hw/arm/smmu-common.h +++ b/include/hw/arm/smmu-common.h @@ -138,13 +138,34 @@ typedef struct SMMUVdev { uint32_t sid; }SMMUVdev; +typedef struct PendFaultEntry { + struct iommu_hwpt_pgfault fault; + QTAILQ_ENTRY(PendFaultEntry) entry; +} PendFaultEntry; + +typedef struct PageRespEntry { + struct iommu_hwpt_page_response resp; + QTAILQ_ENTRY(PageRespEntry) entry; +} PageRespEntry; + typedef struct SMMUS1Hwpt { + void *sdev; void *smmu; IOMMUFDBackend *iommufd; SMMUViommu *viommu; uint32_t hwpt_id; + uint32_t out_fault_fd; QLIST_HEAD(, SMMUDevice) device_list; QLIST_ENTRY(SMMUViommu) 
next; + /* fault handling */ + struct io_uring fault_ring; + QemuThread read_fault_thread; + QemuThread write_fault_thread; + QemuMutex fault_mutex; + QemuCond fault_cond; + QTAILQ_HEAD(, PageRespEntry) pageresp; + QTAILQ_HEAD(, PendFaultEntry) pendfault; + bool exiting; } SMMUS1Hwpt; typedef struct SMMUDevice { @@ -258,7 +279,8 @@ int smmu_dev_get_info(SMMUDevice *sdev, uint32_t *data_type, uint32_t data_len, void *data); void smmu_dev_uninstall_nested_ste(SMMUDevice *sdev, bool abort); int smmu_dev_install_nested_ste(SMMUDevice *sdev, uint32_t data_type, - uint32_t data_len, void *data); + uint32_t data_len, void *data, + bool req_fault_fd); int smmu_hwpt_invalidate_cache(SMMUS1Hwpt *s1_hwpt, uint32_t type, uint32_t len, uint32_t *num, void *reqs); int smmu_viommu_invalidate_cache(IOMMUFDViommu *viommu, uint32_t type, diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h index 0f2c826036..b279184974 100644 --- a/include/sysemu/iommufd.h +++ b/include/sysemu/iommufd.h @@ -62,7 +62,7 @@ bool iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id, uint32_t pt_id, uint32_t flags, uint32_t data_type, uint32_t data_len, void *data_ptr, uint32_t *out_hwpt, - Error **errp); + uint32_t *out_fault_fd, Error **errp); bool iommufd_backend_set_dirty_tracking(IOMMUFDBackend *be, uint32_t hwpt_id, bool start, Error **errp); bool iommufd_backend_get_dirty_bitmap(IOMMUFDBackend *be, uint32_t hwpt_id, -- Gitee From 494e0ace6c120af00b27a0cc1d4a478073654e35 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Thu, 12 Sep 2024 00:33:13 -0700 Subject: [PATCH 765/939] pci: Get pasid capability from vIOMMU Signed-off-by: Yi Liu --- hw/pci/pci.c | 13 +++++++++++++ include/hw/pci/pci.h | 13 +++++++++++++ 2 files changed, 26 insertions(+) diff --git a/hw/pci/pci.c b/hw/pci/pci.c index d6f627aa51..447ef2b163 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -2802,6 +2802,19 @@ void pci_device_unset_iommu_device(PCIDevice *dev) } } +bool pci_device_get_pasid_cap(PCIDevice *dev) +{ + PCIBus *iommu_bus; + + pci_device_get_iommu_bus_devfn(dev, &iommu_bus, NULL, NULL); + if (iommu_bus && iommu_bus->iommu_ops->get_pasid_cap) { + return iommu_bus->iommu_ops->get_pasid_cap(pci_get_bus(dev), + iommu_bus->iommu_opaque, + dev->devfn); + } + return false; +} + void pci_setup_iommu(PCIBus *bus, const PCIIOMMUOps *ops, void *opaque) { /* diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h index 8d1af44249..0dfe274c33 100644 --- a/include/hw/pci/pci.h +++ b/include/hw/pci/pci.h @@ -418,12 +418,25 @@ typedef struct PCIIOMMUOps { * @devfn: device and function number of the PCI device. */ void (*unset_iommu_device)(PCIBus *bus, void *opaque, int devfn); + /** + * @get_pasid_cap: get pasid capability from vIOMMU + * + * Optional callback. + * + * @bus: the #PCIBus of the PCI device. + * + * @opaque: the data passed to pci_setup_iommu(). + * + * @devfn: device and function number of the PCI device. 
+ */ + bool (*get_pasid_cap)(PCIBus *bus, void *opaque, int devfn); } PCIIOMMUOps; AddressSpace *pci_device_iommu_address_space(PCIDevice *dev); bool pci_device_set_iommu_device(PCIDevice *dev, HostIOMMUDevice *hiod, Error **errp); void pci_device_unset_iommu_device(PCIDevice *dev); +bool pci_device_get_pasid_cap(PCIDevice *dev); /** * pci_setup_iommu: Initialize specific IOMMU handlers for a PCIBus -- Gitee From 0978556247d968ffc83beff3b2611c93fd9b6b13 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Thu, 12 Sep 2024 00:17:31 -0700 Subject: [PATCH 766/939] backend/iommufd: Report PASID capability Signed-off-by: Yi Liu --- backends/iommufd.c | 4 +++- hw/arm/smmu-common.c | 4 ++-- hw/arm/smmuv3.c | 4 +++- hw/vfio/iommufd.c | 4 +++- include/hw/arm/smmu-common.h | 2 +- include/sysemu/host_iommu_device.h | 1 + include/sysemu/iommufd.h | 3 ++- 7 files changed, 15 insertions(+), 7 deletions(-) diff --git a/backends/iommufd.c b/backends/iommufd.c index e9ce82297b..4f5df63331 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -326,7 +326,8 @@ bool iommufd_backend_get_dirty_bitmap(IOMMUFDBackend *be, bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, uint32_t *type, void *data, uint32_t len, - uint64_t *caps, Error **errp) + uint64_t *caps, uint8_t *max_pasid_log2, + Error **errp) { struct iommu_hw_info info = { .size = sizeof(info), @@ -344,6 +345,7 @@ bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, *type = info.out_data_type; g_assert(caps); *caps = info.out_capabilities; + *max_pasid_log2 = info.out_max_pasid_log2; return true; } diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c index c382fa16e5..e7028bd4ec 100644 --- a/hw/arm/smmu-common.c +++ b/hw/arm/smmu-common.c @@ -853,7 +853,7 @@ SMMUDevice *smmu_find_sdev(SMMUState *s, uint32_t sid) /* IOMMUFD helpers */ int smmu_dev_get_info(SMMUDevice *sdev, uint32_t *data_type, - uint32_t data_len, void *data) + uint32_t data_len, uint8_t *pasid, void *data) { uint64_t caps; @@ -863,7 +863,7 @@ int smmu_dev_get_info(SMMUDevice *sdev, uint32_t *data_type, return !iommufd_backend_get_device_info(sdev->idev->iommufd, sdev->idev->devid, data_type, data, - data_len, &caps, NULL); + data_len, &caps, pasid, NULL); } void smmu_dev_uninstall_nested_ste(SMMUDevice *sdev, bool abort) diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c index 30c0ae4c3b..0ca0e96fcc 100644 --- a/hw/arm/smmuv3.c +++ b/hw/arm/smmuv3.c @@ -264,6 +264,7 @@ static void smmuv3_nested_init_regs(SMMUv3State *s) SMMUDevice *sdev; uint32_t data_type; uint32_t val; + uint8_t pasid; int ret; if (!bs->nested || !bs->viommu) { @@ -280,7 +281,8 @@ static void smmuv3_nested_init_regs(SMMUv3State *s) goto out; } - ret = smmu_dev_get_info(sdev, &data_type, sizeof(sdev->info), &sdev->info); + ret = smmu_dev_get_info(sdev, &data_type, sizeof(sdev->info), &pasid, + &sdev->info); if (ret) { error_report("failed to get SMMU device info"); return; diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index c0eb87c78c..a108beda29 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -871,18 +871,20 @@ static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, struct iommu_hw_info_vtd vtd; } data; uint64_t hw_caps; + uint8_t pasids; hiod->agent = opaque; if (!iommufd_backend_get_device_info(vdev->iommufd, vdev->devid, &type, &data, sizeof(data), - &hw_caps, errp)) { + &hw_caps, &pasids, errp)) { return false; } hiod->name = g_strdup(vdev->name); caps->type = type; caps->hw_caps = hw_caps; + caps->max_pasid_log2 = pasids; return true; } 
diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h index 087a11efc7..8ae33c3753 100644 --- a/include/hw/arm/smmu-common.h +++ b/include/hw/arm/smmu-common.h @@ -276,7 +276,7 @@ void smmu_inv_notifiers_all(SMMUState *s); /* IOMMUFD helpers */ int smmu_dev_get_info(SMMUDevice *sdev, uint32_t *data_type, - uint32_t data_len, void *data); + uint32_t data_len, uint8_t *pasid, void *data); void smmu_dev_uninstall_nested_ste(SMMUDevice *sdev, bool abort); int smmu_dev_install_nested_ste(SMMUDevice *sdev, uint32_t data_type, uint32_t data_len, void *data, diff --git a/include/sysemu/host_iommu_device.h b/include/sysemu/host_iommu_device.h index 84131f5495..22c76a37a7 100644 --- a/include/sysemu/host_iommu_device.h +++ b/include/sysemu/host_iommu_device.h @@ -26,6 +26,7 @@ typedef struct HostIOMMUDeviceCaps { uint32_t type; uint64_t hw_caps; + uint8_t max_pasid_log2; } HostIOMMUDeviceCaps; #define TYPE_HOST_IOMMU_DEVICE "host-iommu-device" diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h index b279184974..29afaa429d 100644 --- a/include/sysemu/iommufd.h +++ b/include/sysemu/iommufd.h @@ -57,7 +57,8 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova, ram_addr_t size); bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, uint32_t *type, void *data, uint32_t len, - uint64_t *caps, Error **errp); + uint64_t *caps, uint8_t *max_pasid_log2, + Error **errp); bool iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id, uint32_t pt_id, uint32_t flags, uint32_t data_type, uint32_t data_len, -- Gitee From da7cdc41aa3813f6bb1c87ced178f60185dac692 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Thu, 12 Sep 2024 01:38:46 -0700 Subject: [PATCH 767/939] vfio: Synthesize vPASID capability to VM If user wants to expose PASID capability in vIOMMU, then VFIO would also report the PASID cap for this device if the underlying hardware supports it as well. As a start, this chooses to put the vPASID cap in the last 8 bytes of the vconfig space. This is a choice in the good hope of no conflict with any existing cap or hidden registers. For the devices that has hidden registers, user should figure out a proper offset for the vPASID cap. This may require an option for user to config it. Here we leave it as a future extension. There are more discussions on the mechanism of finding the proper offset. 
https://lore.kernel.org/kvm/BN9PR11MB5276318969A212AD0649C7BE8CBE2@BN9PR11MB5276.namprd11.prod.outlook.com/ Signed-off-by: Yi Liu --- hw/pci/pcie.c | 12 ++++++++++++ hw/vfio/pci.c | 28 ++++++++++++++++++++++++++++ include/hw/pci/pcie.h | 4 ++++ 3 files changed, 44 insertions(+) diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c index 04fbd794a8..a5b4e54bd7 100644 --- a/hw/pci/pcie.c +++ b/hw/pci/pcie.c @@ -1123,3 +1123,15 @@ void pcie_acs_reset(PCIDevice *dev) pci_set_word(dev->config + dev->exp.acs_cap + PCI_ACS_CTRL, 0); } } + +void pcie_pasid_init(PCIDevice *dev, uint16_t offset, uint16_t caps) +{ + pcie_add_capability(dev, PCI_EXT_CAP_ID_PASID, 1, + offset, PCI_EXT_CAP_PASID_SIZEOF); + + dev->exp.pasid_cap = offset; + + pci_set_word(dev->config + offset + PCI_PASID_CAP, caps); + + pci_set_word(dev->wmask + dev->exp.pasid_cap + PCI_PASID_CTRL, 0x7); +} diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index f585f285f4..293deb8737 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -21,6 +21,7 @@ #include "qemu/osdep.h" #include CONFIG_DEVICES /* CONFIG_IOMMUFD */ #include +#include #include #include "hw/hw.h" @@ -2348,6 +2349,33 @@ static void vfio_add_ext_cap(VFIOPCIDevice *vdev) } + { + HostIOMMUDeviceCaps *caps = &vdev->vbasedev.hiod->caps; + + /* + * TODO: Add option for enabling pasid at a safe offset, this adds the + * pasid capability in the end of the PCIE config space. + */ + if (caps->max_pasid_log2 && pci_device_get_pasid_cap(&vdev->pdev)) { + uint16_t pasid_caps = (caps->max_pasid_log2 << 8) & PCI_PASID_CAP_WIDTH; + + if (caps->hw_caps & IOMMU_HW_CAP_PCI_PASID_EXEC) { + pasid_caps |= PCI_PASID_CAP_EXEC; + } + + if (caps->hw_caps & IOMMU_HW_CAP_PCI_PASID_PRIV) { + pasid_caps |= PCI_PASID_CAP_PRIV; + } + + pcie_pasid_init(pdev, + PCIE_CONFIG_SPACE_SIZE - PCI_EXT_CAP_PASID_SIZEOF, + pasid_caps); + + /* PASID capability is fully emulated by QEMU */ + memset(vdev->emulated_config_bits + pdev->exp.pasid_cap, 0xff, 8); + } + } + /* Cleanup chain head ID if necessary */ if (pci_get_word(pdev->config + PCI_CONFIG_SPACE_SIZE) == 0xFFFF) { pci_set_word(pdev->config + PCI_CONFIG_SPACE_SIZE, 0); diff --git a/include/hw/pci/pcie.h b/include/hw/pci/pcie.h index 11f5a91bbb..41ee27f023 100644 --- a/include/hw/pci/pcie.h +++ b/include/hw/pci/pcie.h @@ -79,6 +79,9 @@ struct PCIExpressDevice { uint16_t sriov_cap; PCIESriovPF sriov_pf; PCIESriovVF sriov_vf; + + /* Offset of PASID capability in config space */ + uint16_t pasid_cap; }; #define COMPAT_PROP_PCP "power_controller_present" @@ -147,4 +150,5 @@ void pcie_cap_slot_unplug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp); void pcie_cap_slot_unplug_request_cb(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp); +void pcie_pasid_init(PCIDevice *dev, uint16_t offset, uint16_t caps); #endif /* QEMU_PCIE_H */ -- Gitee From d4d0d15716a3f4c89ca9532e6b598b14db76ae0c Mon Sep 17 00:00:00 2001 From: Zhangfei Gao Date: Sat, 26 Oct 2024 08:40:11 +0000 Subject: [PATCH 768/939] smmuv3: realize get_pasid_cap and set ssidsize with pasid Signed-off-by: Zhangfei Gao --- hw/arm/smmu-common.c | 9 +++++++++ hw/arm/smmuv3.c | 3 +-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c index e7028bd4ec..3a257a5b0e 100644 --- a/hw/arm/smmu-common.c +++ b/hw/arm/smmu-common.c @@ -831,10 +831,19 @@ static void smmu_dev_unset_iommu_device(PCIBus *bus, void *opaque, int devfn) } } +static bool smmu_dev_get_pasid_cap(PCIBus *bus, + void *opaque, int devfn) +{ + assert(0 <= devfn && devfn < PCI_DEVFN_MAX); + + 
return true; +} + static const PCIIOMMUOps smmu_ops = { .get_address_space = smmu_find_add_as, .set_iommu_device = smmu_dev_set_iommu_device, .unset_iommu_device = smmu_dev_unset_iommu_device, + .get_pasid_cap = smmu_dev_get_pasid_cap, }; SMMUDevice *smmu_find_sdev(SMMUState *s, uint32_t sid) diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c index 0ca0e96fcc..6964ab000d 100644 --- a/hw/arm/smmuv3.c +++ b/hw/arm/smmuv3.c @@ -312,8 +312,7 @@ out: val = FIELD_EX32(sdev->info.idr[1], IDR1, SIDSIZE); s->idr[1] = FIELD_DP32(s->idr[1], IDR1, SIDSIZE, val); - val = FIELD_EX32(sdev->info.idr[1], IDR1, SSIDSIZE); - s->idr[1] = FIELD_DP32(s->idr[1], IDR1, SSIDSIZE, val); + s->idr[1] = FIELD_DP32(s->idr[1], IDR1, SSIDSIZE, pasid); val = FIELD_EX32(sdev->info.idr[3], IDR3, HAD); s->idr[3] = FIELD_DP32(s->idr[3], IDR3, HAD, val); -- Gitee From 58f66c2581b3c4a45a02717330f1b2188424889b Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Wed, 15 Jan 2025 16:11:21 +0000 Subject: [PATCH 769/939] smmu-common: Return sysmem address space only for vfio-pci This will enable pcie-root-port hotplug event irq to work. Discussion Link: https://lore.kernel.org/qemu-devel/74114c0db34b420a90e9fe5bd991767e@huawei.com/ Signed-off-by: Shameer Kolothum --- hw/arm/smmu-common.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c index 3a257a5b0e..6c4b82757f 100644 --- a/hw/arm/smmu-common.c +++ b/hw/arm/smmu-common.c @@ -639,9 +639,16 @@ static AddressSpace *smmu_find_add_as(PCIBus *bus, void *opaque, int devfn) SMMUState *s = opaque; SMMUPciBus *sbus = smmu_get_sbus(s, bus); SMMUDevice *sdev = smmu_get_sdev(s, sbus, bus, devfn); + bool is_vfio = false; + PCIDevice *pdev; + + pdev = pci_find_device(bus, pci_bus_num(bus), devfn); + if (object_dynamic_cast(OBJECT(pdev), "vfio-pci")) { + is_vfio = true; + } /* Return the system as if the device uses stage-2 only */ - if (s->nested && !sdev->s1_hwpt) { + if (s->nested && !sdev->s1_hwpt && is_vfio) { return &sdev->as_sysmem; } else { return &sdev->as; -- Gitee From 2697e7418c1e0d87c82feca33800e3a093546a90 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Thu, 16 Jan 2025 15:20:18 +0000 Subject: [PATCH 770/939] smmuv3: Change arm-smmuv3-nested name to arm-smmuv3-accel This is based on feedback received for RFC v1. 
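As a usage illustration only (not taken from this series; the exact type string is implied by the subject rather than shown in this hunk), a guest would presumably instantiate the renamed device once per PCIe expander bridge along these lines:

    -device pxb-pcie,id=pcie.1,bus_nr=32,bus=pcie.0 \
    -device arm-smmuv3-accel,pci-bus=pcie.1

where the pci-bus property, carried over from the earlier pci-bus patch and now held in SMMUv3AccelState, names the bridge whose devices this instance should translate.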
Signed-off-by: Shameer Kolothum --- hw/arm/smmuv3.c | 38 +++++++++++++++++++------------------- hw/arm/virt-acpi-build.c | 16 ++++++++-------- hw/arm/virt.c | 24 ++++++++++++------------ hw/core/sysbus-fdt.c | 2 +- include/hw/arm/smmuv3.h | 8 ++++---- include/hw/arm/virt.h | 10 +++++----- 6 files changed, 49 insertions(+), 49 deletions(-) diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c index 6964ab000d..ecdad6bda4 100644 --- a/hw/arm/smmuv3.c +++ b/hw/arm/smmuv3.c @@ -2253,14 +2253,14 @@ static void smmu_realize(DeviceState *d, Error **errp) smmu_init_irq(s, dev); } -static int smmuv3_nested_pci_host_bridge(Object *obj, void *opaque) +static int smmuv3_accel_pci_host_bridge(Object *obj, void *opaque) { DeviceState *d = opaque; - SMMUv3NestedState *s_nested = ARM_SMMUV3_NESTED(d); + SMMUv3AccelState *s_accel = ARM_SMMUV3_ACCEL(d); if (object_dynamic_cast(obj, TYPE_PCI_HOST_BRIDGE)) { PCIBus *bus = PCI_HOST_BRIDGE(obj)->bus; - if (s_nested->pci_bus && !strcmp(bus->qbus.name, s_nested->pci_bus)) { + if (s_accel->pci_bus && !strcmp(bus->qbus.name, s_accel->pci_bus)) { object_property_set_link(OBJECT(d), "primary-bus", OBJECT(bus), &error_abort); } @@ -2268,15 +2268,15 @@ static int smmuv3_nested_pci_host_bridge(Object *obj, void *opaque) return 0; } -static void smmu_nested_realize(DeviceState *d, Error **errp) +static void smmu_accel_realize(DeviceState *d, Error **errp) { - SMMUv3NestedState *s_nested = ARM_SMMUV3_NESTED(d); - SMMUv3NestedClass *c = ARM_SMMUV3_NESTED_GET_CLASS(s_nested); + SMMUv3AccelState *s_nested = ARM_SMMUV3_ACCEL(d); + SMMUv3AccelClass *c = ARM_SMMUV3_ACCEL_GET_CLASS(s_nested); SysBusDevice *dev = SYS_BUS_DEVICE(d); Error *local_err = NULL; object_child_foreach_recursive(object_get_root(), - smmuv3_nested_pci_host_bridge, d); + smmuv3_accel_pci_host_bridge, d); object_property_set_bool(OBJECT(dev), "nested", true, &error_abort); c->parent_realize(d, &local_err); @@ -2365,8 +2365,8 @@ static Property smmuv3_properties[] = { DEFINE_PROP_END_OF_LIST() }; -static Property smmuv3_nested_properties[] = { - DEFINE_PROP_STRING("pci-bus", SMMUv3NestedState, pci_bus), +static Property smmuv3_accel_properties[] = { + DEFINE_PROP_STRING("pci-bus", SMMUv3AccelState, pci_bus), DEFINE_PROP_END_OF_LIST() }; @@ -2389,15 +2389,15 @@ static void smmuv3_class_init(ObjectClass *klass, void *data) device_class_set_props(dc, smmuv3_properties); } -static void smmuv3_nested_class_init(ObjectClass *klass, void *data) +static void smmuv3_accel_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); - SMMUv3NestedClass *c = ARM_SMMUV3_NESTED_CLASS(klass); + SMMUv3AccelClass *c = ARM_SMMUV3_ACCEL_CLASS(klass); dc->vmsd = &vmstate_smmuv3; - device_class_set_parent_realize(dc, smmu_nested_realize, + device_class_set_parent_realize(dc, smmu_accel_realize, &c->parent_realize); - device_class_set_props(dc, smmuv3_nested_properties); + device_class_set_props(dc, smmuv3_accel_properties); dc->user_creatable = true; dc->hotpluggable = false; } @@ -2440,12 +2440,12 @@ static void smmuv3_iommu_memory_region_class_init(ObjectClass *klass, imrc->notify_flag_changed = smmuv3_notify_flag_changed; } -static const TypeInfo smmuv3_nested_type_info = { - .name = TYPE_ARM_SMMUV3_NESTED, +static const TypeInfo smmuv3_accel_type_info = { + .name = TYPE_ARM_SMMUV3_ACCEL, .parent = TYPE_ARM_SMMUV3, - .instance_size = sizeof(SMMUv3NestedState), - .class_size = sizeof(SMMUv3NestedClass), - .class_init = smmuv3_nested_class_init, + .instance_size = sizeof(SMMUv3AccelState), + .class_size = 
sizeof(SMMUv3AccelClass), + .class_init = smmuv3_accel_class_init, }; static const TypeInfo smmuv3_type_info = { @@ -2466,7 +2466,7 @@ static const TypeInfo smmuv3_iommu_memory_region_info = { static void smmuv3_register_types(void) { type_register(&smmuv3_type_info); - type_register(&smmuv3_nested_type_info); + type_register(&smmuv3_accel_type_info); type_register(&smmuv3_iommu_memory_region_info); } diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c index ad0f79e03d..db635120f9 100644 --- a/hw/arm/virt-acpi-build.c +++ b/hw/arm/virt-acpi-build.c @@ -418,10 +418,10 @@ static void acpi_dsdt_add_pci(Aml *scope, const MemMapEntry *memmap, }; /* - * Nested SMMU requires RMRs for MSI 1-1 mapping, which + * Accel SMMU requires RMRs for MSI 1-1 mapping, which * require _DSM for PreservingPCI Boot Configurations */ - if (vms->iommu == VIRT_IOMMU_SMMUV3_NESTED) { + if (vms->iommu == VIRT_IOMMU_SMMUV3_ACCEL) { cfg.preserve_config = true; } @@ -619,10 +619,10 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) /* Table 2 The IORT */ acpi_table_begin(&table, table_data); - if (vms->smmu_nested_count) { - irq = vms->irqmap[VIRT_SMMU_NESTED] + ARM_SPI_BASE; - base = vms->memmap[VIRT_SMMU_NESTED].base; - num_smmus = vms->smmu_nested_count; + if (vms->smmu_accel_count) { + irq = vms->irqmap[VIRT_SMMU_ACCEL] + ARM_SPI_BASE; + base = vms->memmap[VIRT_SMMU_ACCEL].base; + num_smmus = vms->smmu_accel_count; } else if (virt_has_smmuv3(vms)) { irq = vms->irqmap[VIRT_SMMU] + ARM_SPI_BASE; base = vms->memmap[VIRT_SMMU].base; @@ -655,7 +655,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) } next_range.input_base = idmap->input_base + idmap->id_count; - if (vms->iommu == VIRT_IOMMU_SMMUV3_NESTED) { + if (vms->iommu == VIRT_IOMMU_SMMUV3_ACCEL) { nb_nodes++; /* RMR node per SMMU */ } } @@ -775,7 +775,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) build_iort_id_mapping(table_data, 0, 0x10000, IORT_NODE_OFFSET, 0); } - if (vms->iommu == VIRT_IOMMU_SMMUV3_NESTED) { + if (vms->iommu == VIRT_IOMMU_SMMUV3_ACCEL) { build_iort_rmr_nodes(table_data, smmu_idmaps, smmu_offset, &id); } diff --git a/hw/arm/virt.c b/hw/arm/virt.c index a55f297af2..57d00acd48 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -166,7 +166,7 @@ static const MemMapEntry base_memmap[] = { /* In the virtCCA scenario, this space is used for MSI interrupt mapping */ [VIRT_CVM_MSI] = { 0x0a001000, 0x00fff000 }, [VIRT_CPUFREQ] = { 0x0b000000, 0x00010000 }, - [VIRT_SMMU_NESTED] = { 0x0b010000, 0x00ff0000}, + [VIRT_SMMU_ACCEL] = { 0x0b010000, 0x00ff0000}, /* ...repeating for a total of NUM_VIRTIO_TRANSPORTS, each of that size */ [VIRT_PLATFORM_BUS] = { 0x0c000000, 0x02000000 }, [VIRT_SECURE_MEM] = { 0x0e000000, 0x01000000 }, @@ -212,7 +212,7 @@ static const int a15irqmap[] = { [VIRT_GIC_V2M] = 48, /* ...to 48 + NUM_GICV2M_SPIS - 1 */ [VIRT_SMMU] = 74, /* ...to 74 + NUM_SMMU_IRQS - 1 */ [VIRT_PLATFORM_BUS] = 112, /* ...to 112 + PLATFORM_BUS_NUM_IRQS -1 */ - [VIRT_SMMU_NESTED] = 200, + [VIRT_SMMU_ACCEL] = 200, }; static const char *valid_cpus[] = { @@ -3619,27 +3619,27 @@ static void virt_machine_device_plug_cb(HotplugHandler *hotplug_dev, /* For smmuv3-nested devices we need to set the mem & irq */ if (device_is_dynamic_sysbus(mc, dev) && - object_dynamic_cast(OBJECT(dev), TYPE_ARM_SMMUV3_NESTED)) { - hwaddr base = vms->memmap[VIRT_SMMU_NESTED].base; - int irq = vms->irqmap[VIRT_SMMU_NESTED]; + object_dynamic_cast(OBJECT(dev), TYPE_ARM_SMMUV3_ACCEL)) { + hwaddr 
base = vms->memmap[VIRT_SMMU_ACCEL].base; + int irq = vms->irqmap[VIRT_SMMU_ACCEL]; - if (vms->smmu_nested_count >= MAX_SMMU_NESTED) { + if (vms->smmu_accel_count >= MAX_SMMU_ACCEL) { error_setg(errp, "smmuv3-nested max count reached!"); return; } - base += (vms->smmu_nested_count * SMMU_IO_LEN); - irq += (vms->smmu_nested_count * NUM_SMMU_IRQS); + base += (vms->smmu_accel_count * SMMU_IO_LEN); + irq += (vms->smmu_accel_count * NUM_SMMU_IRQS); sysbus_mmio_map(SYS_BUS_DEVICE(dev), 0, base); for (int i = 0; i < 4; i++) { sysbus_connect_irq(SYS_BUS_DEVICE(dev), i, qdev_get_gpio_in(vms->gic, irq + i)); } - if (vms->iommu != VIRT_IOMMU_SMMUV3_NESTED) { - vms->iommu = VIRT_IOMMU_SMMUV3_NESTED; + if (vms->iommu != VIRT_IOMMU_SMMUV3_ACCEL) { + vms->iommu = VIRT_IOMMU_SMMUV3_ACCEL; } - vms->smmu_nested_count++; + vms->smmu_accel_count++; } if (vms->platform_bus_dev) { @@ -3815,7 +3815,7 @@ static void virt_machine_class_init(ObjectClass *oc, void *data) machine_class_allow_dynamic_sysbus_dev(mc, TYPE_VFIO_AMD_XGBE); machine_class_allow_dynamic_sysbus_dev(mc, TYPE_RAMFB_DEVICE); machine_class_allow_dynamic_sysbus_dev(mc, TYPE_VFIO_PLATFORM); - machine_class_allow_dynamic_sysbus_dev(mc, TYPE_ARM_SMMUV3_NESTED); + machine_class_allow_dynamic_sysbus_dev(mc, TYPE_ARM_SMMUV3_ACCEL); #ifdef CONFIG_TPM machine_class_allow_dynamic_sysbus_dev(mc, TYPE_TPM_TIS_SYSBUS); #endif diff --git a/hw/core/sysbus-fdt.c b/hw/core/sysbus-fdt.c index 0f0d0b3e58..58f4dc614c 100644 --- a/hw/core/sysbus-fdt.c +++ b/hw/core/sysbus-fdt.c @@ -489,7 +489,7 @@ static const BindingEntry bindings[] = { #ifdef CONFIG_LINUX TYPE_BINDING(TYPE_VFIO_CALXEDA_XGMAC, add_calxeda_midway_xgmac_fdt_node), TYPE_BINDING(TYPE_VFIO_AMD_XGBE, add_amd_xgbe_fdt_node), - TYPE_BINDING("arm-smmuv3-nested", no_fdt_node), + TYPE_BINDING("arm-smmuv3-accel", no_fdt_node), VFIO_PLATFORM_BINDING("amd,xgbe-seattle-v1a", add_amd_xgbe_fdt_node), #endif #ifdef CONFIG_TPM diff --git a/include/hw/arm/smmuv3.h b/include/hw/arm/smmuv3.h index 96513fce56..79b6fcd8e7 100644 --- a/include/hw/arm/smmuv3.h +++ b/include/hw/arm/smmuv3.h @@ -84,16 +84,16 @@ struct SMMUv3Class { #define TYPE_ARM_SMMUV3 "arm-smmuv3" OBJECT_DECLARE_TYPE(SMMUv3State, SMMUv3Class, ARM_SMMUV3) -#define TYPE_ARM_SMMUV3_NESTED "arm-smmuv3-nested" -OBJECT_DECLARE_TYPE(SMMUv3NestedState, SMMUv3NestedClass, ARM_SMMUV3_NESTED) +#define TYPE_ARM_SMMUV3_ACCEL "arm-smmuv3-accel" +OBJECT_DECLARE_TYPE(SMMUv3AccelState, SMMUv3AccelClass, ARM_SMMUV3_ACCEL) -struct SMMUv3NestedState { +struct SMMUv3AccelState { SMMUv3State smmuv3_state; char *pci_bus; }; -struct SMMUv3NestedClass { +struct SMMUv3AccelClass { /*< private >*/ SMMUv3Class smmuv3_class; /*< public >*/ diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h index bc3c8b70da..3e2759d225 100644 --- a/include/hw/arm/virt.h +++ b/include/hw/arm/virt.h @@ -110,7 +110,7 @@ typedef enum { #define SMMU_IO_LEN 0x20000 /* Max supported nested SMMUv3 */ -#define MAX_SMMU_NESTED 64 +#define MAX_SMMU_ACCEL 64 enum { VIRT_FLASH, @@ -124,7 +124,7 @@ enum { VIRT_GIC_ITS, VIRT_GIC_REDIST, VIRT_SMMU, - VIRT_SMMU_NESTED, + VIRT_SMMU_ACCEL, VIRT_UART, VIRT_CPUFREQ, VIRT_MMIO, @@ -159,7 +159,7 @@ enum { typedef enum VirtIOMMUType { VIRT_IOMMU_NONE, VIRT_IOMMU_SMMUV3, - VIRT_IOMMU_SMMUV3_NESTED, + VIRT_IOMMU_SMMUV3_ACCEL, VIRT_IOMMU_VIRTIO, } VirtIOMMUType; @@ -227,7 +227,7 @@ struct VirtMachineState { bool mte; bool dtb_randomness; bool pmu; - int smmu_nested_count; + int smmu_accel_count; OnOffAuto acpi; VirtGICType gic_version; VirtIOMMUType iommu; @@ -298,7 
+298,7 @@ static inline int virt_gicv3_redist_region_count(VirtMachineState *vms) static inline bool virt_has_smmuv3(const VirtMachineState *vms) { return vms->iommu == VIRT_IOMMU_SMMUV3 || - vms->iommu == VIRT_IOMMU_SMMUV3_NESTED; + vms->iommu == VIRT_IOMMU_SMMUV3_ACCEL; } #endif /* QEMU_ARM_VIRT_H */ -- Gitee From 5e83bdd94533c91d69c7154d967f3bdd2fa86054 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Thu, 16 Jan 2025 15:29:49 +0000 Subject: [PATCH 771/939] smmuv3: Use default bus for arm-smmuv3-accel This is based on feedback on RFC v1. Signed-off-by: Shameer Kolothum --- hw/arm/smmuv3.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c index ecdad6bda4..c0fcdd7574 100644 --- a/hw/arm/smmuv3.c +++ b/hw/arm/smmuv3.c @@ -2256,11 +2256,10 @@ static void smmu_realize(DeviceState *d, Error **errp) static int smmuv3_accel_pci_host_bridge(Object *obj, void *opaque) { DeviceState *d = opaque; - SMMUv3AccelState *s_accel = ARM_SMMUV3_ACCEL(d); if (object_dynamic_cast(obj, TYPE_PCI_HOST_BRIDGE)) { PCIBus *bus = PCI_HOST_BRIDGE(obj)->bus; - if (s_accel->pci_bus && !strcmp(bus->qbus.name, s_accel->pci_bus)) { + if (d->parent_bus && !strcmp(bus->qbus.name, d->parent_bus->name)) { object_property_set_link(OBJECT(d), "primary-bus", OBJECT(bus), &error_abort); } @@ -2365,11 +2364,6 @@ static Property smmuv3_properties[] = { DEFINE_PROP_END_OF_LIST() }; -static Property smmuv3_accel_properties[] = { - DEFINE_PROP_STRING("pci-bus", SMMUv3AccelState, pci_bus), - DEFINE_PROP_END_OF_LIST() -}; - static void smmuv3_instance_init(Object *obj) { /* Nothing much to do here as of now */ @@ -2397,9 +2391,9 @@ static void smmuv3_accel_class_init(ObjectClass *klass, void *data) dc->vmsd = &vmstate_smmuv3; device_class_set_parent_realize(dc, smmu_accel_realize, &c->parent_realize); - device_class_set_props(dc, smmuv3_accel_properties); dc->user_creatable = true; dc->hotpluggable = false; + dc->bus_type = TYPE_PCIE_BUS; } static int smmuv3_notify_flag_changed(IOMMUMemoryRegion *iommu, -- Gitee From b1087bb8a4edbacc7240c0fcab63bc1cf2624627 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Tue, 21 Jan 2025 14:42:45 +0000 Subject: [PATCH 772/939] gpex-acpi: Remove duplicate DSM #5 It looks like acpi_dsdt_add_pci_osc() already builds the _DSM for virt/gpex case, and we don't need to add duplicate DSM methods for _DSM #5 case. And the acpi_dsdt_add_pci_osc() already adds _DSM #5 when preserve_config is true. This is to get rid of the ACPI related error messages during boot: ACPI BIOS Error (bug): Failure creating named object [\_SB.PC08._DSM], AE_ALREADY_EXISTS ACPI BIOS Error (bug): \_SB.PC08.PCI0._DSM: Excess arguments - ASL declared 5, ACPI requires 4 ToDo: Only sanity tested. 
Signed-off-by: Shameer Kolothum --- hw/pci-host/gpex-acpi.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/hw/pci-host/gpex-acpi.c b/hw/pci-host/gpex-acpi.c index ce424fc9da..162f6221ab 100644 --- a/hw/pci-host/gpex-acpi.c +++ b/hw/pci-host/gpex-acpi.c @@ -189,12 +189,6 @@ void acpi_dsdt_add_gpex(Aml *scope, struct GPEXConfig *cfg) aml_append(dev, aml_name_decl("_PXM", aml_int(numa_node))); } - if (cfg->preserve_config) { - method = aml_method("_DSM", 5, AML_SERIALIZED); - aml_append(method, aml_return(aml_int(0))); - aml_append(dev, method); - } - acpi_dsdt_add_pci_route_table(dev, cfg->irq); /* @@ -226,12 +220,6 @@ void acpi_dsdt_add_gpex(Aml *scope, struct GPEXConfig *cfg) aml_append(dev, aml_name_decl("_STR", aml_unicode("PCIe 0 Device"))); aml_append(dev, aml_name_decl("_CCA", aml_int(1))); - if (cfg->preserve_config) { - method = aml_method("_DSM", 5, AML_SERIALIZED); - aml_append(method, aml_return(aml_int(0))); - aml_append(dev, method); - } - acpi_dsdt_add_pci_route_table(dev, cfg->irq); method = aml_method("_CBA", 0, AML_NOTSERIALIZED); -- Gitee From c1f1346eea8da6552e085aa13630bbf5227db00f Mon Sep 17 00:00:00 2001 From: qihao_yewu Date: Mon, 7 Apr 2025 12:54:10 -0400 Subject: [PATCH 773/939] hw/pci-host/designware: Fix ATU_UPPER_TARGET register access MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cherry-pick from 04e99f9eb7920b0f0fcce65686c3bedf5e32a1f9 Fix copy/paste error writing to the ATU_UPPER_TARGET register; we want to update the upper 32 bits. Cc: qemu-stable@nongnu.org Reported-by: Joey Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2861 Fixes: d64e5eabc4c ("pci: Add support for Designware IP block") Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Gustavo Romero Message-Id: <20250331152041.74533-2-philmd@linaro.org> Signed-off-by: qihao_yewu --- hw/pci-host/designware.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/pci-host/designware.c b/hw/pci-host/designware.c index f477f97847..004142709c 100644 --- a/hw/pci-host/designware.c +++ b/hw/pci-host/designware.c @@ -360,7 +360,7 @@ static void designware_pcie_root_config_write(PCIDevice *d, uint32_t address, case DESIGNWARE_PCIE_ATU_UPPER_TARGET: viewport->target &= 0x00000000FFFFFFFFULL; - viewport->target |= val; + viewport->target |= (uint64_t)val << 32; break; case DESIGNWARE_PCIE_ATU_LIMIT: -- Gitee From 068fef175047c18f60900dacd54c7a436114c164 Mon Sep 17 00:00:00 2001 From: qihao_yewu Date: Mon, 7 Apr 2025 13:18:47 -0400 Subject: [PATCH 774/939] hw/ufs: free irq on exit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cherry-pick from c458f9474d6574505ce9144ab1a90b951e69c1bd Fix a memory leak bug in ufs_init_pci() due to u->irq not being freed in ufs_exit().
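For reference, this follows the usual qemu_irq ownership rule: an IRQ object allocated with qemu_allocate_irq() when the device is set up must be released with a matching qemu_free_irq() in the exit path. A minimal sketch of the pairing (the handler name is illustrative only, not taken from this patch):

    /* init/realize: the device allocates its IRQ object */
    u->irq = qemu_allocate_irq(ufs_irq_handler, u, 0);

    /* exit: release it again, which is what the hunk below adds */
    qemu_free_irq(u->irq);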
Signed-off-by: Zheng Huang Reviewed-by: Philippe Mathieu-Daudé Message-ID: <43ceb427-87aa-44ee-9007-dbaecc499bba@gmail.com> Signed-off-by: Philippe Mathieu-Daudé Signed-off-by: qihao_yewu --- hw/ufs/ufs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hw/ufs/ufs.c b/hw/ufs/ufs.c index 068895b27b..f57d33e771 100644 --- a/hw/ufs/ufs.c +++ b/hw/ufs/ufs.c @@ -25,6 +25,7 @@ #include "qapi/error.h" #include "migration/vmstate.h" #include "scsi/constants.h" +#include "hw/irq.h" #include "trace.h" #include "ufs.h" @@ -1286,6 +1287,8 @@ static void ufs_exit(PCIDevice *pci_dev) { UfsHc *u = UFS(pci_dev); + qemu_free_irq(u->irq); + qemu_bh_delete(u->doorbell_bh); qemu_bh_delete(u->complete_bh); -- Gitee From 3746a434596b9bc20994c869c79fb9db24227418 Mon Sep 17 00:00:00 2001 From: qihao_yewu Date: Mon, 7 Apr 2025 13:56:18 -0400 Subject: [PATCH 775/939] hw/sd/sdhci: free irq on exit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cherry-pick from 1c2d03bb0889b7a9a677d53126fb035190683af4 Fix a memory leak bug in sdhci_pci_realize() due to s->irq not being freed in sdhci_pci_exit(). Signed-off-by: Zheng Huang Reviewed-by: Philippe Mathieu-Daudé Message-ID: <09ddf42b-a6db-42d5-954b-148d09d8d6cc@gmail.com> [PMD: Moved qemu_free_irq() call before sdhci_common_unrealize()] Signed-off-by: Philippe Mathieu-Daudé Signed-off-by: qihao_yewu --- hw/sd/sdhci-pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hw/sd/sdhci-pci.c b/hw/sd/sdhci-pci.c index 9b7bee8b3f..c1eb67cf29 100644 --- a/hw/sd/sdhci-pci.c +++ b/hw/sd/sdhci-pci.c @@ -18,6 +18,7 @@ #include "qemu/osdep.h" #include "qapi/error.h" #include "qemu/module.h" +#include "hw/irq.h" #include "hw/qdev-properties.h" #include "hw/sd/sdhci.h" #include "sdhci-internal.h" @@ -49,6 +50,7 @@ static void sdhci_pci_exit(PCIDevice *dev) { SDHCIState *s = PCI_SDHCI(dev); + qemu_free_irq(s->irq); sdhci_common_unrealize(s); sdhci_uninitfn(s); } -- Gitee From 52cc8f5a9ba854268a58402d351d2fd43dddb1b4 Mon Sep 17 00:00:00 2001 From: qihao_yewu Date: Mon, 7 Apr 2025 17:54:20 -0400 Subject: [PATCH 776/939] target/s390x: Fix a typo in s390_cpu_class_init() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cherry-pick from 6a93b1c7b4cfa4f5e3c0b8a17177ce14aaa2346c Replace the comma at the end of the line by a semicolon.
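Worth noting why the original line compiled and even worked: the trailing comma fuses the two adjacent statements into one expression through C's comma operator, so both assignments were still evaluated and the defect is one of robustness and style rather than behavior. Illustrative only:

    /* these two lines are equivalent in C; the first silently relies on the comma operator */
    cc->class_by_name = s390_cpu_class_by_name, cc->has_work = s390_cpu_has_work;
    cc->class_by_name = s390_cpu_class_by_name; cc->has_work = s390_cpu_has_work;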
Fixes: 41868f846d2 ("s390x/cpumodel: "host" and "qemu" as CPU subclasses") Reviewed-by: Richard Henderson Reviewed-by: Thomas Huth Signed-off-by: Philippe Mathieu-Daudé Message-ID: <20250324165356.39540-1-philmd@linaro.org> Signed-off-by: Thomas Huth Signed-off-by: qihao_yewu --- target/s390x/cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c index 6acfa1c91b..5e64f24cc2 100644 --- a/target/s390x/cpu.c +++ b/target/s390x/cpu.c @@ -350,7 +350,7 @@ static void s390_cpu_class_init(ObjectClass *oc, void *data) device_class_set_parent_reset(dc, s390_cpu_reset_full, &scc->parent_reset); scc->reset = s390_cpu_reset; - cc->class_by_name = s390_cpu_class_by_name, + cc->class_by_name = s390_cpu_class_by_name; cc->has_work = s390_cpu_has_work; cc->dump_state = s390_cpu_dump_state; cc->query_cpu_fast = s390_query_cpu_fast; -- Gitee From 7e1bd6e7e109c6228bc4c40ea6f2af2d7f281fca Mon Sep 17 00:00:00 2001 From: qihao_yewu Date: Tue, 8 Apr 2025 05:59:29 -0400 Subject: [PATCH 777/939] hw/misc/aspeed_hace: Fix buffer overflow in has_padding function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cherry-pick from 78877b2e06464f49f777e086845e094ea7bc82ef The maximum padding size is either 64 or 128 bytes and should always be smaller than "req_len". If "padding_size" exceeds "req_len", then "req_len - padding_size" underflows due to "uint32_t" data type, leading to a large incorrect value (e.g., `0xFFXXXXXX`). This causes an out-of-bounds memory access, potentially leading to a buffer overflow. Added a check to ensure "padding_size" does not exceed "req_len" before computing "pad_offset". This prevents "req_len - padding_size" from underflowing and avoids accessing invalid memory. Signed-off-by: Jamin Lin Reviewed-by: Cédric Le Goater Fixes: 5cd7d8564a8b563da724b9e6264c967f0a091afa ("aspeed/hace: Support AST2600 HACE ") Link: https://lore.kernel.org/qemu-devel/20250321092623.2097234-3-jamin_lin@aspeedtech.com Signed-off-by: Cédric Le Goater Signed-off-by: qihao_yewu --- hw/misc/aspeed_hace.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/hw/misc/aspeed_hace.c b/hw/misc/aspeed_hace.c index b07506ec04..8706e3d376 100644 --- a/hw/misc/aspeed_hace.c +++ b/hw/misc/aspeed_hace.c @@ -123,6 +123,11 @@ static bool has_padding(AspeedHACEState *s, struct iovec *iov, if (*total_msg_len <= s->total_req_len) { uint32_t padding_size = s->total_req_len - *total_msg_len; uint8_t *padding = iov->iov_base; + + if (padding_size > req_len) { + return false; + } + *pad_offset = req_len - padding_size; if (padding[*pad_offset] == 0x80) { return true; -- Gitee From 5eb0bb1f8ce9835b368e78d414ff6136c77ef94b Mon Sep 17 00:00:00 2001 From: qihao_yewu Date: Tue, 8 Apr 2025 06:51:26 -0400 Subject: [PATCH 778/939] hw/xen: Fix xen_bus_realize() error handling cherry-pick from de7b18083bfed4e1a01bb40b4ad050c47d2011fa The Error ** argument must be NULL, &error_abort, &error_fatal, or a pointer to a variable containing NULL. Passing an argument of the latter kind twice without clearing it in between is wrong: if the first call sets an error, it no longer points to NULL for the second call. xen_bus_realize() is wrong that way: it passes &local_err to xs_node_watch() in a loop. If this fails in more than one iteration, it can trip error_setv()'s assertion. Fix by clearing @local_err.
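The rule being restored, as a minimal sketch rather than the literal xen-bus code: a local Error pointer that is consumed inside a loop must be reset to NULL before the next iteration, because the reporting helper frees the object it was given:

    Error *local_err = NULL;

    for (i = 0; i < n; i++) {
        xs_node_watch(/* node, key, ... */ &local_err);
        if (local_err) {
            error_reportf_err(local_err, "failed to set up watch: ");
            local_err = NULL;   /* error_reportf_err() freed it; reset before reusing &local_err */
        }
    }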
Fixes: c4583c8c394e (xen-bus: reduce scope of backend watch) Signed-off-by: Markus Armbruster Message-ID: <20250314143500.2449658-2-armbru@redhat.com> Reviewed-by: Stefano Stabellini Signed-off-by: qihao_yewu --- hw/xen/xen-bus.c | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/xen/xen-bus.c b/hw/xen/xen-bus.c index 4973e7d9c9..c10b089914 100644 --- a/hw/xen/xen-bus.c +++ b/hw/xen/xen-bus.c @@ -352,6 +352,7 @@ static void xen_bus_realize(BusState *bus, Error **errp) error_reportf_err(local_err, "failed to set up '%s' enumeration watch: ", type[i]); + local_err = NULL; } g_free(node); -- Gitee From ca3f4fd234ea4b8f02a415b99b449e71d028c076 Mon Sep 17 00:00:00 2001 From: qihao_yewu Date: Tue, 8 Apr 2025 07:27:47 -0400 Subject: [PATCH 779/939] cryptodev: Fix error handling in cryptodev_lkcf_execute_task() cherry-pick from 1c89dfefc4c33295126208225f202f39b5a234c3 When cryptodev_lkcf_set_op_desc() fails, we report an error, but continue anyway. This is wrong. We then pass a non-null @local_error to various functions, which could easily fail error_setv()'s assertion on failure. Fail the function instead. When qcrypto_akcipher_new() fails, we fail the function without reporting the error. This leaks the Error object. Add the missing error reporting. This also frees the Error object. Signed-off-by: Markus Armbruster Message-ID: <20250312101131.1615777-1-armbru@redhat.com> Reviewed-by: zhenwei pi Signed-off-by: qihao_yewu --- backends/cryptodev-lkcf.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/backends/cryptodev-lkcf.c b/backends/cryptodev-lkcf.c index 45aba1ff67..45b287a953 100644 --- a/backends/cryptodev-lkcf.c +++ b/backends/cryptodev-lkcf.c @@ -330,6 +330,8 @@ static void cryptodev_lkcf_execute_task(CryptoDevLKCFTask *task) cryptodev_lkcf_set_op_desc(&session->akcipher_opts, op_desc, sizeof(op_desc), &local_error) != 0) { error_report_err(local_error); + status = -VIRTIO_CRYPTO_ERR; + goto out; } else { key_id = add_key(KCTL_KEY_TYPE_PKEY, "lkcf-backend-priv-key", p8info, p8info_len, KCTL_KEY_RING); @@ -346,6 +348,7 @@ static void cryptodev_lkcf_execute_task(CryptoDevLKCFTask *task) session->key, session->keylen, &local_error); if (!akcipher) { + error_report_err(local_error); status = -VIRTIO_CRYPTO_ERR; goto out; } -- Gitee From c0717e82e34f96af456309b3786a6808e8e324e4 Mon Sep 17 00:00:00 2001 From: huangyan Date: Wed, 16 Apr 2025 00:43:27 +0800 Subject: [PATCH 780/939] Revert "linux-user: Print tid not pid with strace" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 2f37362de1d971cc90c35405705bfa22a33f6cd8. * This change is incomplete: "get_task_state" lacks an implementation. * Moreover, it requires all calls to the "getpid" function to be changed to use "get_task_state", which would cause too much disruption, and it has not been applied in upstream 8.2.0.
--- linux-user/strace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/linux-user/strace.c b/linux-user/strace.c index ac9177ebe4..cf26e55264 100644 --- a/linux-user/strace.c +++ b/linux-user/strace.c @@ -4176,7 +4176,7 @@ print_syscall(CPUArchState *cpu_env, int num, if (!f) { return; } - fprintf(f, "%d ", get_task_state(env_cpu(cpu_env))->ts_tid); + fprintf(f, "%d ", getpid()); for (i = 0; i < nsyscalls; i++) { if (scnames[i].nr == num) { -- Gitee From 655073e4e179e601e35a444f585d8e2049df97f5 Mon Sep 17 00:00:00 2001 From: Xianglai Li Date: Wed, 5 Feb 2025 19:56:54 +0800 Subject: [PATCH 781/939] target/loongarch: fix vcpu reset command word issue When the KVM_REG_LOONGARCH_VCPU_RESET command word is sent to the kernel through the kvm_set_one_reg interface, the parameter source needs to be a legal address, otherwise the kernel will return an error and the command word will fail to be sent. Signed-off-by: Xianglai Li --- target/loongarch/cpu.c | 2 +- target/loongarch/kvm/kvm.c | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c index ee764f0bc7..570ce8be3b 100644 --- a/target/loongarch/cpu.c +++ b/target/loongarch/cpu.c @@ -638,8 +638,8 @@ static void loongarch_cpu_realizefn(DeviceState *dev, Error **errp) loongarch_cpu_register_gdb_regs_for_features(cs); - cpu_reset(cs); qemu_init_vcpu(cs); + cpu_reset(cs); lacc->parent_realize(dev, errp); } diff --git a/target/loongarch/kvm/kvm.c b/target/loongarch/kvm/kvm.c index 0acdd5c4c1..277210ca04 100644 --- a/target/loongarch/kvm/kvm.c +++ b/target/loongarch/kvm/kvm.c @@ -590,9 +590,16 @@ static int kvm_loongarch_get_lbt(CPUState *cs) void kvm_arch_reset_vcpu(CPUState *cs) { CPULoongArchState *env = cpu_env(cs); + int ret = 0; + uint64_t unused = 0; env->mp_state = KVM_MP_STATE_RUNNABLE; - kvm_set_one_reg(cs, KVM_REG_LOONGARCH_VCPU_RESET, 0); + ret = kvm_set_one_reg(cs, KVM_REG_LOONGARCH_VCPU_RESET, &unused); + if (ret) { + error_report("Failed to set KVM_REG_LOONGARCH_VCPU_RESET: %s", + strerror(errno)); + exit(EXIT_FAILURE); + } } static int kvm_loongarch_get_mpstate(CPUState *cs) -- Gitee From 2a51f062a46c2e3fbd96a1d75f9d53cab449f4ac Mon Sep 17 00:00:00 2001 From: Xianglai Li Date: Fri, 21 Mar 2025 20:40:37 +0800 Subject: [PATCH 782/939] target/loongarch: Fix the cpu unplug resource leak When the cpu is created, qemu_add_vm_change_state_handler is called in the kvm_arch_init_vcpu function to create the VMChangeStateEntry resource. However, the resource is not released when the cpu is destroyed. This results in a qemu process segment error when the virtual machine restarts after the cpu is unplugged. This patch solves the problem by adding the corresponding resource release process to the kvm_arch_destroy_vcpu function. 
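In outline, the fix pairs the registration made at vcpu creation with an unregistration at vcpu teardown, keeping the returned entry in the CPU object (a sketch of the shape; the exact code is in the hunks below):

    /* kvm_arch_init_vcpu(): remember the handler entry */
    cpu->vmsentry = qemu_add_vm_change_state_handler(kvm_loongarch_vm_stage_change, cs);

    /* kvm_arch_destroy_vcpu(): drop it so a later VM state change cannot reach a freed vcpu */
    qemu_del_vm_change_state_handler(cpu->vmsentry);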
Signed-off-by: Xianglai Li --- target/loongarch/cpu.c | 2 +- target/loongarch/cpu.h | 1 + target/loongarch/kvm/kvm.c | 5 ++++- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c index 570ce8be3b..561566f3a0 100644 --- a/target/loongarch/cpu.c +++ b/target/loongarch/cpu.c @@ -573,7 +573,7 @@ static void loongarch_cpu_reset_hold(Object *obj) env->CSR_ECFG = FIELD_DP64(env->CSR_ECFG, CSR_ECFG, VS, 0); env->CSR_ECFG = FIELD_DP64(env->CSR_ECFG, CSR_ECFG, LIE, 0); - env->CSR_ESTAT = env->CSR_ESTAT & (~MAKE_64BIT_MASK(0, 2)); + env->CSR_ESTAT = 0; env->CSR_RVACFG = FIELD_DP64(env->CSR_RVACFG, CSR_RVACFG, RBITS, 0); env->CSR_CPUID = cs->cpu_index; env->CSR_TCFG = FIELD_DP64(env->CSR_TCFG, CSR_TCFG, EN, 0); diff --git a/target/loongarch/cpu.h b/target/loongarch/cpu.h index 9af622aba5..6cc717c5ea 100644 --- a/target/loongarch/cpu.h +++ b/target/loongarch/cpu.h @@ -427,6 +427,7 @@ struct ArchCPU { const char *dtb_compatible; /* used by KVM_REG_LOONGARCH_COUNTER ioctl to access guest time counters */ uint64_t kvm_state_counter; + VMChangeStateEntry *vmsentry; }; /** diff --git a/target/loongarch/kvm/kvm.c b/target/loongarch/kvm/kvm.c index 277210ca04..f6e008a517 100644 --- a/target/loongarch/kvm/kvm.c +++ b/target/loongarch/kvm/kvm.c @@ -905,9 +905,10 @@ int kvm_arch_init_vcpu(CPUState *cs) uint64_t val; int ret; Error *local_err = NULL; + LoongArchCPU *cpu = LOONGARCH_CPU(cs); ret = 0; - qemu_add_vm_change_state_handler(kvm_loongarch_vm_stage_change, cs); + cpu->vmsentry = qemu_add_vm_change_state_handler(kvm_loongarch_vm_stage_change, cs); if (!kvm_get_one_reg(cs, KVM_REG_LOONGARCH_DEBUG_INST, &val)) { brk_insn = val; @@ -928,6 +929,8 @@ int kvm_arch_init_vcpu(CPUState *cs) int kvm_arch_destroy_vcpu(CPUState *cs) { + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + qemu_del_vm_change_state_handler(cpu->vmsentry); return 0; } -- Gitee From 79a6baa688a19242512a753ab240a2238bb7ed7e Mon Sep 17 00:00:00 2001 From: Tao Su Date: Tue, 21 Jan 2025 10:06:47 +0800 Subject: [PATCH 783/939] target/i386: Introduce SierraForest-v2 model commit c597ff5339a9918b00d9f4160126db0ac2a423cc upstream. Update SierraForest CPU model to add LAM, 4 bits indicating certain bits of IA32_SPEC_CTR are supported(intel-psfd, ipred-ctrl, rrsba-ctrl, bhi-ctrl) and the missing features(ss, tsc-adjust, cldemote, movdiri, movdir64b) Also add GDS-NO and RFDS-NO to indicate the related vulnerabilities are mitigated in stepping 3. Intel-SIG: commit c597ff5339a9 target/i386: Introduce SierraForest-v2 model. 
Add SRF CPU model support Tested-by: Xuelian Guo Signed-off-by: Tao Su Reviewed-by: Zhao Liu Link: https://lore.kernel.org/r/20250121020650.1899618-2-tao1.su@linux.intel.com Signed-off-by: Paolo Bonzini [ Quanxian Wang: amend commit log ] Signed-off-by: Quanxian Wang --- target/i386/cpu.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 20358ffa91..bad30581ce 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -4315,6 +4315,25 @@ static const X86CPUDefinition builtin_x86_defs[] = { .model_id = "Intel Xeon Processor (SierraForest)", .versions = (X86CPUVersionDefinition[]) { { .version = 1 }, + { + .version = 2, + .props = (PropValue[]) { + { "ss", "on" }, + { "tsc-adjust", "on" }, + { "cldemote", "on" }, + { "movdiri", "on" }, + { "movdir64b", "on" }, + { "gds-no", "on" }, + { "rfds-no", "on" }, + { "lam", "on" }, + { "intel-psfd", "on"}, + { "ipred-ctrl", "on"}, + { "rrsba-ctrl", "on"}, + { "bhi-ctrl", "on"}, + { "stepping", "3" }, + { /* end of list */ } + } + }, { /* end of list */ }, }, }, -- Gitee From bd65b82f94b07c90f856a34cb10d535b5301d9d9 Mon Sep 17 00:00:00 2001 From: Tao Su Date: Tue, 21 Jan 2025 10:06:48 +0800 Subject: [PATCH 784/939] target/i386: Export BHI_NO bit to guests commit b611931d4f70b9a3e49e39c405c63b3b5e9c0df1 upstream. Branch History Injection (BHI) is a CPU side-channel vulnerability, where an attacker may manipulate branch history before transitioning from user to supervisor mode or from VMX non-root/guest to root mode. CPUs that set BHI_NO bit in MSR IA32_ARCH_CAPABILITIES to indicate no additional mitigation is required to prevent BHI. Make BHI_NO bit available to guests. Intel-SIG: commit b611931d4f70 target/i386: Export BHI_NO bit to guests. Tested-by: Xuelian Guo Signed-off-by: Tao Su Reviewed-by: Zhao Liu Link: https://lore.kernel.org/r/20250121020650.1899618-3-tao1.su@linux.intel.com Signed-off-by: Paolo Bonzini [ Quanxian Wang: amend commit log ] Signed-off-by: Quanxian Wang --- target/i386/cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index bad30581ce..b5231432e7 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -1157,7 +1157,7 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { "taa-no", NULL, NULL, NULL, NULL, "sbdr-ssdp-no", "fbsdp-no", "psdp-no", NULL, "fb-clear", NULL, NULL, - NULL, NULL, NULL, NULL, + "bhi-no", NULL, NULL, NULL, "pbrsb-no", NULL, "gds-no", "rfds-no", "rfds-clear", NULL, NULL, NULL, }, -- Gitee From 2753607e8768002debb4608dacafe1309420a4dd Mon Sep 17 00:00:00 2001 From: Tao Su Date: Tue, 21 Jan 2025 10:06:50 +0800 Subject: [PATCH 785/939] docs: Add GNR, SRF and CWF CPU models commit 0a6dec6d11e5e392dcd6299548bf1514f1201707 upstream. Update GraniteRapids, SierraForest and ClearwaterForest CPU models in section "Preferred CPU models for Intel x86 hosts". Also introduce bhi-no, gds-no and rfds-no in doc. Intel-SIG: commit 0a6dec6d11e5 docs: Add GNR, SRF and CWF CPU models. 
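As a usage illustration (not part of the patch), the documented models are selected like any other named or versioned CPU model, and the MSR-based flags can also be enabled explicitly when the host is known not to be affected, e.g.:

    -cpu SierraForest-v2
    -cpu GraniteRapids,bhi-no=on,gds-no=on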
Suggested-by: Zhao Liu Signed-off-by: Tao Su Reviewed-by: Zhao Liu Link: https://lore.kernel.org/r/20250121020650.1899618-5-tao1.su@linux.intel.com Signed-off-by: Paolo Bonzini [ Quanxian Wang: amend commit log ] Signed-off-by: Quanxian Wang --- docs/system/cpu-models-x86.rst.inc | 50 +++++++++++++++++++++++++++--- 1 file changed, 46 insertions(+), 4 deletions(-) diff --git a/docs/system/cpu-models-x86.rst.inc b/docs/system/cpu-models-x86.rst.inc index 7f6368f999..37fe1d0ac8 100644 --- a/docs/system/cpu-models-x86.rst.inc +++ b/docs/system/cpu-models-x86.rst.inc @@ -71,6 +71,16 @@ mixture of host CPU models between machines, if live migration compatibility is required, use the newest CPU model that is compatible across all desired hosts. +``ClearwaterForest`` + Intel Xeon Processor (ClearwaterForest, 2025) + +``SierraForest``, ``SierraForest-v2`` + Intel Xeon Processor (SierraForest, 2024), SierraForest-v2 mitigates + the GDS and RFDS vulnerabilities with stepping 3. + +``GraniteRapids``, ``GraniteRapids-v2`` + Intel Xeon Processor (GraniteRapids, 2024) + ``Cascadelake-Server``, ``Cascadelake-Server-noTSX`` Intel Xeon Processor (Cascade Lake, 2019), with "stepping" levels 6 or 7 only. (The Cascade Lake Xeon processor with *stepping 5 is @@ -181,7 +191,7 @@ features are included if using "Host passthrough" or "Host model". CVE-2018-12127, [MSBDS] CVE-2018-12126). This is an MSR (Model-Specific Register) feature rather than a CPUID feature, - so it will not appear in the Linux ``/proc/cpuinfo`` in the host or + therefore it will not appear in the Linux ``/proc/cpuinfo`` in the host or guest. Instead, the host kernel uses it to populate the MDS vulnerability file in ``sysfs``. @@ -189,10 +199,10 @@ features are included if using "Host passthrough" or "Host model". affected} in the ``/sys/devices/system/cpu/vulnerabilities/mds`` file. ``taa-no`` - Recommended to inform that the guest that the host is ``not`` + Recommended to inform the guest that the host is ``not`` vulnerable to CVE-2019-11135, TSX Asynchronous Abort (TAA). - This too is an MSR feature, so it does not show up in the Linux + This is also an MSR feature, therefore it does not show up in the Linux ``/proc/cpuinfo`` in the host or guest. It should only be enabled for VMs if the host reports ``Not affected`` @@ -214,7 +224,7 @@ features are included if using "Host passthrough" or "Host model". By disabling TSX, KVM-based guests can avoid paying the price of mitigating TSX-based attacks. - Note that ``tsx-ctrl`` too is an MSR feature, so it does not show + Note that ``tsx-ctrl`` is also an MSR feature, therefore it does not show up in the Linux ``/proc/cpuinfo`` in the host or guest. To validate that Intel TSX is indeed disabled for the guest, there are @@ -223,6 +233,38 @@ features are included if using "Host passthrough" or "Host model". ``/sys/devices/system/cpu/vulnerabilities/tsx_async_abort`` file in the guest should report ``Mitigation: TSX disabled``. +``bhi-no`` + Recommended to inform the guest that the host is ``not`` + vulnerable to CVE-2022-0001, Branch History Injection (BHI). + + This is also an MSR feature, therefore it does not show up in the Linux + ``/proc/cpuinfo`` in the host or guest. + + It should only be enabled for VMs if the host reports + ``BHI: Not affected`` in the + ``/sys/devices/system/cpu/vulnerabilities/spectre_v2`` file. + +``gds-no`` + Recommended to inform the guest that the host is ``not`` + vulnerable to CVE-2022-40982, Gather Data Sampling (GDS). 
+ + This is also an MSR feature, therefore it does not show up in the Linux + ``/proc/cpuinfo`` in the host or guest. + + It should only be enabled for VMs if the host reports ``Not affected`` + in the ``/sys/devices/system/cpu/vulnerabilities/gather_data_sampling`` + file. + +``rfds-no`` + Recommended to inform the guest that the host is ``not`` + vulnerable to CVE-2023-28746, Register File Data Sampling (RFDS). + + This is also an MSR feature, therefore it does not show up in the Linux + ``/proc/cpuinfo`` in the host or guest. + + It should only be enabled for VMs if the host reports ``Not affected`` + in the ``/sys/devices/system/cpu/vulnerabilities/reg_file_data_sampling`` + file. Preferred CPU models for AMD x86 hosts ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -- Gitee From 87871b854241cc52f967805e005bdd66a923c555 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 3 Jul 2024 13:42:49 +0200 Subject: [PATCH 786/939] target/i386: add sha512, sm3, sm4 feature bits commit 78be258c0eeba3d5613c37888889e84f2ba9bd94 upstream. SHA512, SM3, SM4 (CPUID[EAX=7,ECX=1).EAX bits 0 to 2) is supported by Clearwater Forest processor, add it to QEMU as it does not need any specific enablement. See https://lore.kernel.org/kvm/20241105054825.870939-1-tao1.su@linux.intel.com/ for reference. Intel-SIG: commit 78be258c0eeb target/i386: add sha512, sm3, sm4 feature bits. Reviewed-by: Tao Su Signed-off-by: Paolo Bonzini [ Quanxian Wang: amend commit log ] Signed-off-by: Quanxian Wang --- target/i386/cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index b5231432e7..6ed4e84b5c 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -962,7 +962,7 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { [FEAT_7_1_EAX] = { .type = CPUID_FEATURE_WORD, .feat_names = { - NULL, NULL, NULL, NULL, + "sha512", "sm3", "sm4", NULL, "avx-vnni", "avx512-bf16", NULL, "cmpccxadd", NULL, NULL, "fzrm", "fsrs", "fsrc", NULL, NULL, NULL, -- Gitee From e6464174c2261e809764ed63f8a064913a108446 Mon Sep 17 00:00:00 2001 From: Tao Su Date: Tue, 21 Jan 2025 10:06:49 +0800 Subject: [PATCH 787/939] target/i386: Add new CPU model ClearwaterForest commit 56e84d898f17606b5d88778726466540af96b234 upstream. According to table 1-2 in Intel Architecture Instruction Set Extensions and Future Features (rev 056) [1], ClearwaterForest has the following new features which have already been virtualized: - AVX-VNNI-INT16 CPUID.(EAX=7,ECX=1):EDX[bit 10] - SHA512 CPUID.(EAX=7,ECX=1):EAX[bit 0] - SM3 CPUID.(EAX=7,ECX=1):EAX[bit 1] - SM4 CPUID.(EAX=7,ECX=1):EAX[bit 2] Add above features to new CPU model ClearwaterForest. Comparing with SierraForest, ClearwaterForest bare-metal contains all features of SierraForest-v2 CPU model and adds: - PREFETCHI CPUID.(EAX=7,ECX=1):EDX[bit 14] - DDPD_U CPUID.(EAX=7,ECX=2):EDX[bit 3] - BHI_NO IA32_ARCH_CAPABILITIES[bit 20] Add above and all features of SierraForest-v2 CPU model to new CPU model ClearwaterForest. [1] https://cdrdv2.intel.com/v1/dl/getContent/671368 Intel-SIG: commit 56e84d898f17 target/i386: Add new CPU model ClearwaterForest. 
Tested-by: Xuelian Guo Signed-off-by: Tao Su Reviewed-by: Zhao Liu Link: https://lore.kernel.org/r/20250121020650.1899618-4-tao1.su@linux.intel.com Signed-off-by: Paolo Bonzini [ Quanxian Wang: amend commit log ] Signed-off-by: Quanxian Wang --- target/i386/cpu.c | 135 ++++++++++++++++++++++++++++++++++++++++++++++ target/i386/cpu.h | 35 +++++++++--- 2 files changed, 164 insertions(+), 6 deletions(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 6ed4e84b5c..f79d0c9abf 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -4337,6 +4337,141 @@ static const X86CPUDefinition builtin_x86_defs[] = { { /* end of list */ }, }, }, + { + .name = "ClearwaterForest", + .level = 0x23, + .xlevel = 0x80000008, + .vendor = CPUID_VENDOR_INTEL, + .family = 6, + .model = 221, + .stepping = 0, + /* + * please keep the ascending order so that we can have a clear view of + * bit position of each feature. + */ + .features[FEAT_1_EDX] = + CPUID_FP87 | CPUID_VME | CPUID_DE | CPUID_PSE | CPUID_TSC | + CPUID_MSR | CPUID_PAE | CPUID_MCE | CPUID_CX8 | CPUID_APIC | + CPUID_SEP | CPUID_MTRR | CPUID_PGE | CPUID_MCA | CPUID_CMOV | + CPUID_PAT | CPUID_PSE36 | CPUID_CLFLUSH | CPUID_MMX | CPUID_FXSR | + CPUID_SSE | CPUID_SSE2 | CPUID_SS, + .features[FEAT_1_ECX] = + CPUID_EXT_SSE3 | CPUID_EXT_PCLMULQDQ | CPUID_EXT_SSSE3 | + CPUID_EXT_FMA | CPUID_EXT_CX16 | CPUID_EXT_PCID | CPUID_EXT_SSE41 | + CPUID_EXT_SSE42 | CPUID_EXT_X2APIC | CPUID_EXT_MOVBE | + CPUID_EXT_POPCNT | CPUID_EXT_TSC_DEADLINE_TIMER | CPUID_EXT_AES | + CPUID_EXT_XSAVE | CPUID_EXT_AVX | CPUID_EXT_F16C | CPUID_EXT_RDRAND, + .features[FEAT_8000_0001_EDX] = + CPUID_EXT2_SYSCALL | CPUID_EXT2_NX | CPUID_EXT2_PDPE1GB | + CPUID_EXT2_RDTSCP | CPUID_EXT2_LM, + .features[FEAT_8000_0001_ECX] = + CPUID_EXT3_LAHF_LM | CPUID_EXT3_ABM | CPUID_EXT3_3DNOWPREFETCH, + .features[FEAT_8000_0008_EBX] = + CPUID_8000_0008_EBX_WBNOINVD, + .features[FEAT_7_0_EBX] = + CPUID_7_0_EBX_FSGSBASE | CPUID_7_0_EBX_TSC_ADJUST | + CPUID_7_0_EBX_BMI1 | CPUID_7_0_EBX_AVX2 | CPUID_7_0_EBX_SMEP | + CPUID_7_0_EBX_BMI2 | CPUID_7_0_EBX_ERMS | CPUID_7_0_EBX_INVPCID | + CPUID_7_0_EBX_RDSEED | CPUID_7_0_EBX_ADX | CPUID_7_0_EBX_SMAP | + CPUID_7_0_EBX_CLFLUSHOPT | CPUID_7_0_EBX_CLWB | + CPUID_7_0_EBX_SHA_NI, + .features[FEAT_7_0_ECX] = + CPUID_7_0_ECX_UMIP | CPUID_7_0_ECX_PKU | CPUID_7_0_ECX_GFNI | + CPUID_7_0_ECX_VAES | CPUID_7_0_ECX_VPCLMULQDQ | + CPUID_7_0_ECX_RDPID | CPUID_7_0_ECX_BUS_LOCK_DETECT | + CPUID_7_0_ECX_CLDEMOTE | CPUID_7_0_ECX_MOVDIRI | + CPUID_7_0_ECX_MOVDIR64B, + .features[FEAT_7_0_EDX] = + CPUID_7_0_EDX_FSRM | CPUID_7_0_EDX_SERIALIZE | + CPUID_7_0_EDX_SPEC_CTRL | CPUID_7_0_EDX_ARCH_CAPABILITIES | + CPUID_7_0_EDX_SPEC_CTRL_SSBD, + .features[FEAT_ARCH_CAPABILITIES] = + MSR_ARCH_CAP_RDCL_NO | MSR_ARCH_CAP_IBRS_ALL | + MSR_ARCH_CAP_SKIP_L1DFL_VMENTRY | MSR_ARCH_CAP_MDS_NO | + MSR_ARCH_CAP_PSCHANGE_MC_NO | MSR_ARCH_CAP_SBDR_SSDP_NO | + MSR_ARCH_CAP_FBSDP_NO | MSR_ARCH_CAP_PSDP_NO | + MSR_ARCH_CAP_BHI_NO | MSR_ARCH_CAP_PBRSB_NO | + MSR_ARCH_CAP_GDS_NO | MSR_ARCH_CAP_RFDS_NO, + .features[FEAT_XSAVE] = + CPUID_XSAVE_XSAVEOPT | CPUID_XSAVE_XSAVEC | + CPUID_XSAVE_XGETBV1 | CPUID_XSAVE_XSAVES, + .features[FEAT_6_EAX] = + CPUID_6_EAX_ARAT, + .features[FEAT_7_1_EAX] = + CPUID_7_1_EAX_SHA512 | CPUID_7_1_EAX_SM3 | CPUID_7_1_EAX_SM4 | + CPUID_7_1_EAX_AVX_VNNI | CPUID_7_1_EAX_CMPCCXADD | + CPUID_7_1_EAX_FSRS | CPUID_7_1_EAX_AVX_IFMA | + CPUID_7_1_EAX_LAM, + .features[FEAT_7_1_EDX] = + CPUID_7_1_EDX_AVX_VNNI_INT8 | CPUID_7_1_EDX_AVX_NE_CONVERT | + CPUID_7_1_EDX_AVX_VNNI_INT16 | 
CPUID_7_1_EDX_PREFETCHITI, + .features[FEAT_7_2_EDX] = + CPUID_7_2_EDX_PSFD | CPUID_7_2_EDX_IPRED_CTRL | + CPUID_7_2_EDX_RRSBA_CTRL | CPUID_7_2_EDX_DDPD_U | + CPUID_7_2_EDX_BHI_CTRL | CPUID_7_2_EDX_MCDT_NO, + .features[FEAT_VMX_BASIC] = + MSR_VMX_BASIC_INS_OUTS | MSR_VMX_BASIC_TRUE_CTLS, + .features[FEAT_VMX_ENTRY_CTLS] = + VMX_VM_ENTRY_LOAD_DEBUG_CONTROLS | VMX_VM_ENTRY_IA32E_MODE | + VMX_VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | + VMX_VM_ENTRY_LOAD_IA32_PAT | VMX_VM_ENTRY_LOAD_IA32_EFER, + .features[FEAT_VMX_EPT_VPID_CAPS] = + MSR_VMX_EPT_EXECONLY | MSR_VMX_EPT_PAGE_WALK_LENGTH_4 | + MSR_VMX_EPT_WB | MSR_VMX_EPT_2MB | MSR_VMX_EPT_1GB | + MSR_VMX_EPT_INVEPT | MSR_VMX_EPT_AD_BITS | + MSR_VMX_EPT_INVEPT_SINGLE_CONTEXT | MSR_VMX_EPT_INVEPT_ALL_CONTEXT | + MSR_VMX_EPT_INVVPID | MSR_VMX_EPT_INVVPID_SINGLE_ADDR | + MSR_VMX_EPT_INVVPID_SINGLE_CONTEXT | + MSR_VMX_EPT_INVVPID_ALL_CONTEXT | + MSR_VMX_EPT_INVVPID_SINGLE_CONTEXT_NOGLOBALS, + .features[FEAT_VMX_EXIT_CTLS] = + VMX_VM_EXIT_SAVE_DEBUG_CONTROLS | + VMX_VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | + VMX_VM_EXIT_ACK_INTR_ON_EXIT | VMX_VM_EXIT_SAVE_IA32_PAT | + VMX_VM_EXIT_LOAD_IA32_PAT | VMX_VM_EXIT_SAVE_IA32_EFER | + VMX_VM_EXIT_LOAD_IA32_EFER | VMX_VM_EXIT_SAVE_VMX_PREEMPTION_TIMER, + .features[FEAT_VMX_MISC] = + MSR_VMX_MISC_STORE_LMA | MSR_VMX_MISC_ACTIVITY_HLT | + MSR_VMX_MISC_VMWRITE_VMEXIT, + .features[FEAT_VMX_PINBASED_CTLS] = + VMX_PIN_BASED_EXT_INTR_MASK | VMX_PIN_BASED_NMI_EXITING | + VMX_PIN_BASED_VIRTUAL_NMIS | VMX_PIN_BASED_VMX_PREEMPTION_TIMER | + VMX_PIN_BASED_POSTED_INTR, + .features[FEAT_VMX_PROCBASED_CTLS] = + VMX_CPU_BASED_VIRTUAL_INTR_PENDING | + VMX_CPU_BASED_USE_TSC_OFFSETING | VMX_CPU_BASED_HLT_EXITING | + VMX_CPU_BASED_INVLPG_EXITING | VMX_CPU_BASED_MWAIT_EXITING | + VMX_CPU_BASED_RDPMC_EXITING | VMX_CPU_BASED_RDTSC_EXITING | + VMX_CPU_BASED_CR3_LOAD_EXITING | VMX_CPU_BASED_CR3_STORE_EXITING | + VMX_CPU_BASED_CR8_LOAD_EXITING | VMX_CPU_BASED_CR8_STORE_EXITING | + VMX_CPU_BASED_TPR_SHADOW | VMX_CPU_BASED_VIRTUAL_NMI_PENDING | + VMX_CPU_BASED_MOV_DR_EXITING | VMX_CPU_BASED_UNCOND_IO_EXITING | + VMX_CPU_BASED_USE_IO_BITMAPS | VMX_CPU_BASED_MONITOR_TRAP_FLAG | + VMX_CPU_BASED_USE_MSR_BITMAPS | VMX_CPU_BASED_MONITOR_EXITING | + VMX_CPU_BASED_PAUSE_EXITING | + VMX_CPU_BASED_ACTIVATE_SECONDARY_CONTROLS, + .features[FEAT_VMX_SECONDARY_CTLS] = + VMX_SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + VMX_SECONDARY_EXEC_ENABLE_EPT | VMX_SECONDARY_EXEC_DESC | + VMX_SECONDARY_EXEC_RDTSCP | + VMX_SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | + VMX_SECONDARY_EXEC_ENABLE_VPID | VMX_SECONDARY_EXEC_WBINVD_EXITING | + VMX_SECONDARY_EXEC_UNRESTRICTED_GUEST | + VMX_SECONDARY_EXEC_APIC_REGISTER_VIRT | + VMX_SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | + VMX_SECONDARY_EXEC_RDRAND_EXITING | + VMX_SECONDARY_EXEC_ENABLE_INVPCID | + VMX_SECONDARY_EXEC_ENABLE_VMFUNC | VMX_SECONDARY_EXEC_SHADOW_VMCS | + VMX_SECONDARY_EXEC_RDSEED_EXITING | VMX_SECONDARY_EXEC_ENABLE_PML | + VMX_SECONDARY_EXEC_XSAVES, + .features[FEAT_VMX_VMFUNC] = + MSR_VMX_VMFUNC_EPT_SWITCHING, + .model_id = "Intel Xeon Processor (ClearwaterForest)", + .versions = (X86CPUVersionDefinition[]) { + { .version = 1 }, + { /* end of list */ }, + }, + }, { .name = "Denverton", .level = 21, diff --git a/target/i386/cpu.h b/target/i386/cpu.h index b883e5e1d6..4424e58d1b 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -801,6 +801,8 @@ uint64_t x86_cpu_get_supported_feature_word(X86CPU *cpu, FeatureWord w); /* Support RDFSBASE/RDGSBASE/WRFSBASE/WRGSBASE */ #define CPUID_7_0_EBX_FSGSBASE (1U << 0) +/* Support 
TSC adjust MSR */ +#define CPUID_7_0_EBX_TSC_ADJUST (1U << 1) /* Support SGX */ #define CPUID_7_0_EBX_SGX (1U << 2) /* 1st Group of Advanced Bit Manipulation Extensions */ @@ -934,6 +936,12 @@ uint64_t x86_cpu_get_supported_feature_word(X86CPU *cpu, FeatureWord w); /* Speculative Store Bypass Disable */ #define CPUID_7_0_EDX_SPEC_CTRL_SSBD (1U << 31) +/* SHA512 Instruction */ +#define CPUID_7_1_EAX_SHA512 (1U << 0) +/* SM3 Instruction */ +#define CPUID_7_1_EAX_SM3 (1U << 1) +/* SM4 Instruction */ +#define CPUID_7_1_EAX_SM4 (1U << 2) /* AVX VNNI Instruction */ #define CPUID_7_1_EAX_AVX_VNNI (1U << 4) /* AVX512 BFloat16 Instruction */ @@ -946,6 +954,12 @@ uint64_t x86_cpu_get_supported_feature_word(X86CPU *cpu, FeatureWord w); #define CPUID_7_1_EAX_FSRS (1U << 11) /* Fast Short REP CMPS/SCAS */ #define CPUID_7_1_EAX_FSRC (1U << 12) +/* Flexible return and event delivery (FRED) */ +#define CPUID_7_1_EAX_FRED (1U << 17) +/* Load into IA32_KERNEL_GS_BASE (LKGS) */ +#define CPUID_7_1_EAX_LKGS (1U << 18) +/* Non-Serializing Write to Model Specific Register (WRMSRNS) */ +#define CPUID_7_1_EAX_WRMSRNS (1U << 19) /* Support Tile Computational Operations on FP16 Numbers */ #define CPUID_7_1_EAX_AMX_FP16 (1U << 21) /* Support for VPMADD52[H,L]UQ */ @@ -957,17 +971,23 @@ uint64_t x86_cpu_get_supported_feature_word(X86CPU *cpu, FeatureWord w); #define CPUID_7_1_EDX_AVX_VNNI_INT8 (1U << 4) /* AVX NE CONVERT Instructions */ #define CPUID_7_1_EDX_AVX_NE_CONVERT (1U << 5) +/* AVX-VNNI-INT16 Instructions */ +#define CPUID_7_1_EDX_AVX_VNNI_INT16 (1U << 10) /* AMX COMPLEX Instructions */ #define CPUID_7_1_EDX_AMX_COMPLEX (1U << 8) /* PREFETCHIT0/1 Instructions */ #define CPUID_7_1_EDX_PREFETCHITI (1U << 14) -/* Flexible return and event delivery (FRED) */ -#define CPUID_7_1_EAX_FRED (1U << 17) -/* Load into IA32_KERNEL_GS_BASE (LKGS) */ -#define CPUID_7_1_EAX_LKGS (1U << 18) -/* Non-Serializing Write to Model Specific Register (WRMSRNS) */ -#define CPUID_7_1_EAX_WRMSRNS (1U << 19) +/* Indicate bit 7 of the IA32_SPEC_CTRL MSR is supported */ +#define CPUID_7_2_EDX_PSFD (1U << 0) +/* Indicate bits 3 and 4 of the IA32_SPEC_CTRL MSR are supported */ +#define CPUID_7_2_EDX_IPRED_CTRL (1U << 1) +/* Indicate bits 5 and 6 of the IA32_SPEC_CTRL MSR are supported */ +#define CPUID_7_2_EDX_RRSBA_CTRL (1U << 2) +/* Indicate bit 8 of the IA32_SPEC_CTRL MSR is supported */ +#define CPUID_7_2_EDX_DDPD_U (1U << 3) +/* Indicate bit 10 of the IA32_SPEC_CTRL MSR is supported */ +#define CPUID_7_2_EDX_BHI_CTRL (1U << 4) /* Do not exhibit MXCSR Configuration Dependent Timing (MCDT) behavior */ #define CPUID_7_2_EDX_MCDT_NO (1U << 5) @@ -1061,7 +1081,10 @@ uint64_t x86_cpu_get_supported_feature_word(X86CPU *cpu, FeatureWord w); #define MSR_ARCH_CAP_FBSDP_NO (1U << 14) #define MSR_ARCH_CAP_PSDP_NO (1U << 15) #define MSR_ARCH_CAP_FB_CLEAR (1U << 17) +#define MSR_ARCH_CAP_BHI_NO (1U << 20) #define MSR_ARCH_CAP_PBRSB_NO (1U << 24) +#define MSR_ARCH_CAP_GDS_NO (1U << 26) +#define MSR_ARCH_CAP_RFDS_NO (1U << 27) #define MSR_CORE_CAP_SPLIT_LOCK_DETECT (1U << 5) -- Gitee From e549f32b1a88cb9ffdc4fc88fa818854a918498e Mon Sep 17 00:00:00 2001 From: eillon Date: Mon, 14 Apr 2025 22:33:21 +0800 Subject: [PATCH 788/939] hw/arm/virt: support the HDBSS feature We use QEMU to enable or disable the HDBSS feature during live migration. 
We can use the migration-parameter to control the size of the HDBSS buffer, such as: migrate_set_parameter hdbss-buffer-size 3 info migrate_parameters Signed-off-by: eillon --- linux-headers/linux/kvm.h | 2 ++ migration/migration-hmp-cmds.c | 9 +++++++++ migration/migration.h | 7 +++++++ migration/options.c | 21 +++++++++++++++++++++ migration/options.h | 1 + migration/ram.c | 28 ++++++++++++++++++++++++++++ qapi/migration.json | 17 ++++++++++++++--- 7 files changed, 82 insertions(+), 3 deletions(-) diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h index b94c5fd90f..57d6e12744 100644 --- a/linux-headers/linux/kvm.h +++ b/linux-headers/linux/kvm.h @@ -1212,6 +1212,8 @@ struct kvm_ppc_resize_hpt { /* support request to inject secret to CSV3 guest */ #define KVM_CAP_HYGON_COCO_EXT_CSV3_INJ_SECRET (1 << 2) +#define KVM_CAP_ARM_HW_DIRTY_STATE_TRACK 502 + #define KVM_CAP_ARM_VIRT_MSI_BYPASS 799 #define KVM_EXIT_HYPERCALL_VALID_MASK (1 << KVM_HC_MAP_GPA_RANGE) diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c index aac5e7a73a..9857e2c97f 100644 --- a/migration/migration-hmp-cmds.c +++ b/migration/migration-hmp-cmds.c @@ -409,6 +409,11 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict) monitor_printf(mon, "%s: %s\n", MigrationParameter_str(MIGRATION_PARAMETER_SEV_AMD_CERT), params->sev_amd_cert); + + assert(params->has_hdbss_buffer_size); + monitor_printf(mon, "%s: %u\n", + MigrationParameter_str(MIGRATION_PARAMETER_HDBSS_BUFFER_SIZE), + params->hdbss_buffer_size); } qapi_free_MigrationParameters(params); @@ -725,6 +730,10 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) p->sev_amd_cert->type = QTYPE_QSTRING; visit_type_str(v, param, &p->sev_amd_cert->u.s, &err); break; + case MIGRATION_PARAMETER_HDBSS_BUFFER_SIZE: + p->has_hdbss_buffer_size = true; + visit_type_uint8(v, param, &p->hdbss_buffer_size, &err); + break; default: assert(0); } diff --git a/migration/migration.h b/migration/migration.h index eeddb7c0bd..4a95f00157 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -48,6 +48,13 @@ struct PostcopyBlocktimeContext; */ #define CLEAR_BITMAP_SHIFT_MAX 31 +/* + * The default HDBSS size. The value ranges [0, 9]. + * Set to 0 to disable the HDBSS feature. 
+ */ +#define DEFAULT_HDBSS_BUFFER_SIZE 0 +#define MAX_HDBSS_BUFFER_SIZE 9 + /* This is an abstraction of a "temp huge page" for postcopy's purpose */ typedef struct { /* diff --git a/migration/options.c b/migration/options.c index 71e71ea801..71645c8721 100644 --- a/migration/options.c +++ b/migration/options.c @@ -186,6 +186,9 @@ Property migration_properties[] = { DEFINE_PROP_STRING("sev-pdh", MigrationState, parameters.sev_pdh), DEFINE_PROP_STRING("sev-plat-cert", MigrationState, parameters.sev_plat_cert), DEFINE_PROP_STRING("sev-amd-cert", MigrationState, parameters.sev_amd_cert), + DEFINE_PROP_UINT8("hdbss-buffer-size", MigrationState, + parameters.hdbss_buffer_size, + DEFAULT_HDBSS_BUFFER_SIZE), /* Migration capabilities */ DEFINE_PROP_MIG_CAP("x-xbzrle", MIGRATION_CAPABILITY_XBZRLE), @@ -853,6 +856,13 @@ MigMode migrate_mode(void) return s->parameters.mode; } +int migrate_hdbss_buffer_size(void) +{ + MigrationState *s = migrate_get_current(); + + return s->parameters.hdbss_buffer_size; +} + int migrate_multifd_channels(void) { MigrationState *s = migrate_get_current(); @@ -1032,6 +1042,8 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp) params->vcpu_dirty_limit = s->parameters.vcpu_dirty_limit; params->has_mode = true; params->mode = s->parameters.mode; + params->has_hdbss_buffer_size = true; + params->hdbss_buffer_size = s->parameters.hdbss_buffer_size; return params; } @@ -1069,6 +1081,7 @@ void migrate_params_init(MigrationParameters *params) params->has_x_vcpu_dirty_limit_period = true; params->has_vcpu_dirty_limit = true; params->has_mode = true; + params->has_hdbss_buffer_size = true; params->sev_pdh = g_strdup(""); params->sev_plat_cert = g_strdup(""); @@ -1415,6 +1428,10 @@ static void migrate_params_test_apply(MigrateSetParameters *params, assert(params->sev_amd_cert->type == QTYPE_QSTRING); dest->sev_amd_cert = params->sev_amd_cert->u.s; } + + if (params->has_hdbss_buffer_size) { + dest->hdbss_buffer_size = params->hdbss_buffer_size; + } } static void migrate_params_apply(MigrateSetParameters *params, Error **errp) @@ -1579,6 +1596,10 @@ static void migrate_params_apply(MigrateSetParameters *params, Error **errp) assert(params->sev_amd_cert->type == QTYPE_QSTRING); s->parameters.sev_amd_cert = g_strdup(params->sev_amd_cert->u.s); } + + if (params->has_hdbss_buffer_size) { + s->parameters.hdbss_buffer_size = params->hdbss_buffer_size; + } } void qmp_migrate_set_parameters(MigrateSetParameters *params, Error **errp) diff --git a/migration/options.h b/migration/options.h index 9aca5e41ad..987fc81a18 100644 --- a/migration/options.h +++ b/migration/options.h @@ -85,6 +85,7 @@ uint64_t migrate_max_bandwidth(void); uint64_t migrate_avail_switchover_bandwidth(void); uint64_t migrate_max_postcopy_bandwidth(void); MigMode migrate_mode(void); +int migrate_hdbss_buffer_size(void); int migrate_multifd_channels(void); MultiFDCompression migrate_multifd_compression(void); int migrate_multifd_zlib_level(void); diff --git a/migration/ram.c b/migration/ram.c index 1f9348fd06..f1ff38cf39 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -39,6 +39,7 @@ #include "migration-stats.h" #include "migration/register.h" #include "migration/misc.h" +#include "migration/options.h" #include "qemu-file.h" #include "postcopy-ram.h" #include "page_cache.h" @@ -2790,6 +2791,31 @@ static void xbzrle_cleanup(void) XBZRLE_cache_unlock(); } +static void kvm_update_hdbss_cap(bool enable) +{ + KVMState *s = kvm_state; + int size, ret; + + if (s == NULL || !kvm_check_extension(s, 
KVM_CAP_ARM_HW_DIRTY_STATE_TRACK)) { + return; + } + + size = migrate_hdbss_buffer_size(); + if (size < 0 || size > MAX_HDBSS_BUFFER_SIZE) { + fprintf(stderr, "Invalid hdbss buffer size: %d\n", size); + return; + } + + ret = kvm_vm_enable_cap(s, KVM_CAP_ARM_HW_DIRTY_STATE_TRACK, 0, + enable ? size : 0); + if (ret) { + fprintf(stderr, "Could not %s KVM_CAP_ARM_HW_DIRTY_STATE_TRACK: %d\n", + enable ? "enable" : "disable", ret); + } + + return; +} + static void ram_save_cleanup(void *opaque) { RAMState **rsp = opaque; @@ -2806,6 +2832,7 @@ static void ram_save_cleanup(void *opaque) * memory_global_dirty_log_stop will assert that * memory_global_dirty_log_start/stop used in pairs */ + kvm_update_hdbss_cap(false); memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION); } } @@ -3209,6 +3236,7 @@ static void ram_init_bitmaps(RAMState *rs) ram_list_init_bitmaps(); /* We don't use dirty log with background snapshots */ if (!migrate_background_snapshot()) { + kvm_update_hdbss_cap(true); memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION); migration_bitmap_sync_precopy(rs, false); } diff --git a/qapi/migration.json b/qapi/migration.json index 3aed216c3b..f672da5c0d 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -902,6 +902,9 @@ # @sev-amd-cert: AMD certificate chain which include ASK and OCA encoded in # base64, or vendor cert filename for hygon (Since 4.2) # +# @hdbss-buffer-size: Size of the HDBSS(Hardware Dirty state tracking Structure). +# Defaults to 0. (Since 8.6) +# # Features: # # @deprecated: Member @block-incremental is deprecated. Use @@ -937,7 +940,7 @@ { 'name': 'x-vcpu-dirty-limit-period', 'features': ['unstable'] }, 'vcpu-dirty-limit', 'mode', - 'sev-pdh', 'sev-plat-cert', 'sev-amd-cert'] } + 'sev-pdh', 'sev-plat-cert', 'sev-amd-cert', 'hdbss-buffer-size'] } ## # @MigrateSetParameters: @@ -1106,6 +1109,9 @@ # @sev-amd-cert: AMD certificate chain which include ASK and OCA encoded in # base64, or vendor cert filename for hygon (Since 4.2) # +# @hdbss-buffer-size: Size of the HDBSS(Hardware Dirty state tracking Structure). +# Defaults to 0. (Since 8.6) +# # Features: # # @deprecated: Member @block-incremental is deprecated. Use @@ -1165,7 +1171,8 @@ '*mode': 'MigMode', '*sev-pdh': 'StrOrNull', '*sev-plat-cert': 'StrOrNull', - '*sev-amd-cert' : 'StrOrNull' } } + '*sev-amd-cert' : 'StrOrNull', + '*hdbss-buffer-size': 'uint8'} } ## @@ -1355,6 +1362,9 @@ # @sev-amd-cert: AMD certificate chain which include ASK and OCA encoded in # base64, or vendor cert filename for hygon (Since 4.2) # +# @hdbss-buffer-size: Size of the HDBSS(Hardware Dirty state tracking Structure). +# Defaults to 0. (Since 8.6) +# # Features: # # @deprecated: Member @block-incremental is deprecated. Use @@ -1410,7 +1420,8 @@ '*mode': 'MigMode', '*sev-pdh': 'str', '*sev-plat-cert': 'str', - '*sev-amd-cert' : 'str'} } + '*sev-amd-cert' : 'str', + '*hdbss-buffer-size': 'uint8'} } ## # @query-migrate-parameters: -- Gitee From 33aa02dc05bed8316b1c64131e8269f404287598 Mon Sep 17 00:00:00 2001 From: Xiang Chen Date: Tue, 15 Apr 2025 20:10:50 +0800 Subject: [PATCH 789/939] target/arm: Change arm_cpu_mp_affinity when enabled IPIV feature virt inclusion category: feature bugzilla: https://gitee.com/openeuler/qemu/issues/IC1EV7 --------------------------------------------------------------- Before IPIV feature, it gets mpidr from vcpu id, but after the feature, we need to know whether IPIV is enabled. 
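A worked example of the resulting affinity layout, derived from the hunk below (a cluster size of 16 is assumed):

    /* vcpu index 20, IPIV mode enabled:  Aff2 = 20 / 16 = 1, Aff1 = 20 % 16 = 4, Aff0 = 0 */
    /* vcpu index 20, IPIV mode disabled: Aff1 = 20 / 16 = 1, Aff0 = 20 % 16 = 4 */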
Signed-off-by: Xiang Chen --- linux-headers/linux/kvm.h | 2 ++ target/arm/cpu.c | 22 +++++++++++++++++++--- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h index b94c5fd90f..a9d407eace 100644 --- a/linux-headers/linux/kvm.h +++ b/linux-headers/linux/kvm.h @@ -1205,6 +1205,8 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_SEV_ES_GHCB 500 #define KVM_CAP_HYGON_COCO_EXT 501 + +#define KVM_CAP_ARM_IPIV_MODE 503 /* support userspace to request firmware to build CSV3 guest's memory space */ #define KVM_CAP_HYGON_COCO_EXT_CSV3_SET_PRIV_MEM (1 << 0) /* support request to update CSV3 guest's memory region multiple times */ diff --git a/target/arm/cpu.c b/target/arm/cpu.c index 09d391bd34..b0f70de018 100644 --- a/target/arm/cpu.c +++ b/target/arm/cpu.c @@ -1324,9 +1324,25 @@ static void arm_cpu_dump_state(CPUState *cs, FILE *f, int flags) uint64_t arm_cpu_mp_affinity(int idx, uint8_t clustersz) { - uint32_t Aff1 = idx / clustersz; - uint32_t Aff0 = idx % clustersz; - return (Aff1 << ARM_AFF1_SHIFT) | Aff0; + uint64_t Aff0 = 0, Aff1 = 0, Aff2 = 0, Aff3 = 0; + int mode; + + if (!kvm_enabled()) { + Aff1 = idx / clustersz; + Aff0 = idx % clustersz; + return (Aff1 << ARM_AFF1_SHIFT) | Aff0; + } + + mode = kvm_check_extension(kvm_state, KVM_CAP_ARM_IPIV_MODE); + if (mode) { + Aff1 = idx % 16; + Aff2 = idx / 16; + } else { + Aff1 = idx / clustersz; + Aff0 = idx % clustersz; + } + return (Aff3 << ARM_AFF3_SHIFT) | (Aff2 << ARM_AFF2_SHIFT) | + (Aff1 << ARM_AFF1_SHIFT) | Aff0; } static void arm_cpu_initfn(Object *obj) -- Gitee From b93ac4e4fd07e36b95ce211faefd0c7912b6f62a Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Tue, 3 Dec 2024 13:18:06 +0000 Subject: [PATCH 790/939] fw_cfg: Don't set callback_opaque NULL in fw_cfg_modify_bytes_read() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On arm/virt platform, Chen Xiang reported a Guest crash while attempting the below steps, 1. Launch the Guest with nvdimm=on 2. Hot-add a NVDIMM dev 3. Reboot 4. Guest boots fine. 5. Reboot again. 6. Guest boot fails. QEMU_EFI reports the below error: ProcessCmdAddPointer: invalid pointer value in "etc/acpi/tables" OnRootBridgesConnected: InstallAcpiTables: Protocol Error Debugging shows that on first reboot(after hot adding NVDIMM), Qemu updates the etc/table-loader len, qemu_ram_resize()   fw_cfg_modify_file()      fw_cfg_modify_bytes_read() And in fw_cfg_modify_bytes_read() we set the "callback_opaque" for the key entry to NULL. Because of this, on the second reboot, virt_acpi_build_update() is called with a NULL "build_state" and returns without updating the ACPI tables. This seems to be upsetting the firmware. To fix this, don't change the callback_opaque in fw_cfg_modify_bytes_read(). 
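Why the cleared opaque matters, as a rough sketch of the fw_cfg select path (abridged, not the exact code): the select callback registered for the ACPI blobs is invoked with callback_opaque, so once that field is NULLed the update runs with a NULL build_state:

    FWCfgEntry *e = &s->entries[arch][key];
    if (e->select_cb) {
        e->select_cb(e->callback_opaque);   /* virt_acpi_build_update(build_state), or (NULL) once cleared */
    }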
Fixes: bdbb5b1706d165 ("fw_cfg: add fw_cfg_machine_reset function") Reported-by: chenxiang Acked-by: Igor Mammedov Acked-by: Gerd Hoffmann Signed-off-by: Shameer Kolothum Message-ID: <20241203131806.37548-1-shameerali.kolothum.thodi@huawei.com> Signed-off-by: Philippe Mathieu-Daudé --- hw/nvram/fw_cfg.c | 1 - 1 file changed, 1 deletion(-) diff --git a/hw/nvram/fw_cfg.c b/hw/nvram/fw_cfg.c index 4e4524673a..d32079ebdf 100644 --- a/hw/nvram/fw_cfg.c +++ b/hw/nvram/fw_cfg.c @@ -729,7 +729,6 @@ static void *fw_cfg_modify_bytes_read(FWCfgState *s, uint16_t key, ptr = s->entries[arch][key].data; s->entries[arch][key].data = data; s->entries[arch][key].len = len; - s->entries[arch][key].callback_opaque = NULL; s->entries[arch][key].allow_write = false; return ptr; -- Gitee From 257ffabb9c06b476a3a42bf679db6fbc61c19459 Mon Sep 17 00:00:00 2001 From: Adttil <2429917001@qq.com> Date: Fri, 25 Apr 2025 09:41:59 +0800 Subject: [PATCH 791/939] vdpa:Fix dirty page bitmap synchronization not done after suspend for vdpa devices Change the flag for vdpa device to determine whether to perform log_sync from dev->start to dev->log, and do not release dev->log after vdpa device suspend, and release it uniformly by vhost_dev_stop. Signed-off-by: Adttil <2429917001@qq.com> --- hw/virtio/vhost.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index d29075aa04..bec6e63fc7 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -252,7 +252,7 @@ static void vhost_log_sync(MemoryListener *listener, memory_listener); MigrationState *ms = migrate_get_current(); - if (!dev->log_enabled || !dev->started) { + if (!dev->log_enabled || !dev->log) { return; } @@ -2624,7 +2624,6 @@ int vhost_dev_suspend(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings) memory_listener_unregister(&hdev->iommu_listener); } vhost_stop_config_intr(hdev); - vhost_log_put(hdev, true); hdev->started = false; vdev->vhost_started = false; hdev->vdev = NULL; -- Gitee From e2bc395c5db34111faf2adcecdb385e5a4e8d23d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Fri, 22 Dec 2023 08:55:23 +0100 Subject: [PATCH 792/939] backends/iommufd: Remove check on number of backend users MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit QOM already has a ref count on objects and it will assert much earlier, when INT_MAX is reached. Reviewed-by: Eric Auger Reviewed-by: Zhenzhong Duan Signed-off-by: Cédric Le Goater --- backends/iommufd.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/backends/iommufd.c b/backends/iommufd.c index 4f5df63331..f17a846aab 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -81,11 +81,6 @@ int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp) int fd, ret = 0; qemu_mutex_lock(&be->lock); - if (be->users == UINT32_MAX) { - error_setg(errp, "too many connections"); - ret = -E2BIG; - goto out; - } if (be->owned && !be->users) { fd = qemu_open_old("/dev/iommu", O_RDWR); if (fd < 0) { -- Gitee From 1e6734af14b3223a7d7e304262c96051ddf8637f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 21 Dec 2023 16:58:41 +0100 Subject: [PATCH 793/939] backends/iommufd: Remove mutex MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Coverity reports a concurrent data access violation because be->users is being accessed in iommufd_backend_can_be_deleted() without holding the mutex. 
However, these routines are called from the QEMU main thread when a device is created. In this case, the code paths should be protected by the BQL lock and it should be safe to drop the IOMMUFD backend mutex. Simply remove it. Fixes: CID 1531550 Fixes: CID 1531549 Reviewed-by: Eric Auger Reviewed-by: Zhenzhong Duan Signed-off-by: Cédric Le Goater --- backends/iommufd.c | 7 ------- include/sysemu/iommufd.h | 2 -- 2 files changed, 9 deletions(-) diff --git a/backends/iommufd.c b/backends/iommufd.c index f17a846aab..3cbf11fc8b 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -30,7 +30,6 @@ static void iommufd_backend_init(Object *obj) be->fd = -1; be->users = 0; be->owned = true; - qemu_mutex_init(&be->lock); } static void iommufd_backend_finalize(Object *obj) @@ -53,10 +52,8 @@ static void iommufd_backend_set_fd(Object *obj, const char *str, Error **errp) error_prepend(errp, "Could not parse remote object fd %s:", str); return; } - qemu_mutex_lock(&be->lock); be->fd = fd; be->owned = false; - qemu_mutex_unlock(&be->lock); trace_iommu_backend_set_fd(be->fd); } @@ -80,7 +77,6 @@ int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp) { int fd, ret = 0; - qemu_mutex_lock(&be->lock); if (be->owned && !be->users) { fd = qemu_open_old("/dev/iommu", O_RDWR); if (fd < 0) { @@ -94,13 +90,11 @@ int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp) out: trace_iommufd_backend_connect(be->fd, be->owned, be->users, ret); - qemu_mutex_unlock(&be->lock); return ret; } void iommufd_backend_disconnect(IOMMUFDBackend *be) { - qemu_mutex_lock(&be->lock); if (!be->users) { goto out; } @@ -111,7 +105,6 @@ void iommufd_backend_disconnect(IOMMUFDBackend *be) } out: trace_iommufd_backend_disconnect(be->fd, be->users); - qemu_mutex_unlock(&be->lock); } int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id, diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h index 29afaa429d..908c94d811 100644 --- a/include/sysemu/iommufd.h +++ b/include/sysemu/iommufd.h @@ -15,7 +15,6 @@ #define SYSEMU_IOMMUFD_H #include "qom/object.h" -#include "qemu/thread.h" #include "exec/hwaddr.h" #include "exec/cpu-common.h" #include "sysemu/host_iommu_device.h" @@ -33,7 +32,6 @@ struct IOMMUFDBackend { /*< protected >*/ int fd; /* /dev/iommu file descriptor */ bool owned; /* is the /dev/iommu opened internally */ - QemuMutex lock; uint32_t users; /*< public >*/ -- Gitee From 88006385c8e58b2aa612bf5aa184263f0d4245de Mon Sep 17 00:00:00 2001 From: Zhao Liu Date: Mon, 11 Mar 2024 11:37:55 +0800 Subject: [PATCH 794/939] backends/iommufd: Fix missing ERRP_GUARD() for error_prepend() As the comment in qapi/error, passing @errp to error_prepend() requires ERRP_GUARD(): * = Why, when and how to use ERRP_GUARD() = * * Without ERRP_GUARD(), use of the @errp parameter is restricted: ... * - It should not be passed to error_prepend(), error_vprepend() or * error_append_hint(), because that doesn't work with &error_fatal. * ERRP_GUARD() lifts these restrictions. * * To use ERRP_GUARD(), add it right at the beginning of the function. * @errp can then be used without worrying about the argument being * NULL or &error_fatal. ERRP_GUARD() could avoid the case when @errp is &error_fatal, the user can't see this additional information, because exit() happens in error_setg earlier than information is added [1]. The iommufd_backend_set_fd() passes @errp to error_prepend(), to avoid the above issue, add missing ERRP_GUARD() at the beginning of this function. 
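For illustration, the resulting pattern looks like this (a generic example, not
code from this patch; do_frob() and frobnicate() are made-up names):

    void do_frob(Object *obj, Error **errp)
    {
        ERRP_GUARD();   /* *errp now points at a local error object */

        if (!frobnicate(obj, errp)) {
            /* Safe even if the caller passed &error_fatal: error_setg()
             * inside frobnicate() filled the guarded local error instead
             * of exiting, so the prepended context still reaches the user
             * when the guard propagates the error on return. */
            error_prepend(errp, "frobnication failed: ");
            return;
        }
    }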
[1]: Issue description in the commit message of commit ae7c80a7bd73 ("error: New macro ERRP_GUARD()"). Cc: Yi Liu Cc: Eric Auger Cc: Zhenzhong Duan Signed-off-by: Zhao Liu Reviewed-by: Zhenzhong Duan Reviewed-by: Eric Auger Message-ID: <20240311033822.3142585-3-zhao1.liu@linux.intel.com> Signed-off-by: Thomas Huth --- backends/iommufd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/backends/iommufd.c b/backends/iommufd.c index 3cbf11fc8b..f061b6869a 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -44,6 +44,7 @@ static void iommufd_backend_finalize(Object *obj) static void iommufd_backend_set_fd(Object *obj, const char *str, Error **errp) { + ERRP_GUARD(); IOMMUFDBackend *be = IOMMUFD_BACKEND(obj); int fd = -1; -- Gitee From c9a107b1f73bddb4c9844c12444e3802e5f576b4 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Tue, 7 May 2024 14:42:52 +0800 Subject: [PATCH 795/939] backends/iommufd: Make iommufd_backend_*() return bool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is to follow the coding standand to return bool if 'Error **' is used to pass error. The changed functions include: iommufd_backend_connect iommufd_backend_alloc_ioas By this chance, simplify the functions a bit by avoiding duplicate recordings, e.g., log through either error interface or trace, not both. Suggested-by: Cédric Le Goater Signed-off-by: Zhenzhong Duan Reviewed-by: Cédric Le Goater Signed-off-by: Cédric Le Goater --- backends/iommufd.c | 29 +++++++++++++---------------- backends/trace-events | 4 ++-- include/sysemu/iommufd.h | 6 +++--- 3 files changed, 18 insertions(+), 21 deletions(-) diff --git a/backends/iommufd.c b/backends/iommufd.c index f061b6869a..fad580fdcb 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -74,24 +74,22 @@ static void iommufd_backend_class_init(ObjectClass *oc, void *data) object_class_property_add_str(oc, "fd", NULL, iommufd_backend_set_fd); } -int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp) +bool iommufd_backend_connect(IOMMUFDBackend *be, Error **errp) { - int fd, ret = 0; + int fd; if (be->owned && !be->users) { fd = qemu_open_old("/dev/iommu", O_RDWR); if (fd < 0) { error_setg_errno(errp, errno, "/dev/iommu opening failed"); - ret = fd; - goto out; + return false; } be->fd = fd; } be->users++; -out: - trace_iommufd_backend_connect(be->fd, be->owned, - be->users, ret); - return ret; + + trace_iommufd_backend_connect(be->fd, be->owned, be->users); + return true; } void iommufd_backend_disconnect(IOMMUFDBackend *be) @@ -108,25 +106,24 @@ out: trace_iommufd_backend_disconnect(be->fd, be->users); } -int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id, - Error **errp) +bool iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id, + Error **errp) { - int ret, fd = be->fd; + int fd = be->fd; struct iommu_ioas_alloc alloc_data = { .size = sizeof(alloc_data), .flags = 0, }; - ret = ioctl(fd, IOMMU_IOAS_ALLOC, &alloc_data); - if (ret) { + if (ioctl(fd, IOMMU_IOAS_ALLOC, &alloc_data)) { error_setg_errno(errp, errno, "Failed to allocate ioas"); - return ret; + return false; } *ioas_id = alloc_data.out_ioas_id; - trace_iommufd_backend_alloc_ioas(fd, *ioas_id, ret); + trace_iommufd_backend_alloc_ioas(fd, *ioas_id); - return ret; + return true; } void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id) diff --git a/backends/trace-events b/backends/trace-events index f8592a2711..8fe77149b2 100644 --- a/backends/trace-events +++ b/backends/trace-events @@ -7,13 +7,13 @@ 
dbus_vmstate_loading(const char *id) "id: %s" dbus_vmstate_saving(const char *id) "id: %s" # iommufd.c -iommufd_backend_connect(int fd, bool owned, uint32_t users, int ret) "fd=%d owned=%d users=%d (%d)" +iommufd_backend_connect(int fd, bool owned, uint32_t users) "fd=%d owned=%d users=%d" iommufd_backend_disconnect(int fd, uint32_t users) "fd=%d users=%d" iommu_backend_set_fd(int fd) "pre-opened /dev/iommu fd=%d" iommufd_backend_map_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, void *vaddr, bool readonly, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" addr=%p readonly=%d (%d)" iommufd_backend_unmap_dma_non_exist(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " Unmap nonexistent mapping: iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)" iommufd_backend_unmap_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)" -iommufd_backend_alloc_ioas(int iommufd, uint32_t ioas, int ret) " iommufd=%d ioas=%d (%d)" +iommufd_backend_alloc_ioas(int iommufd, uint32_t ioas) " iommufd=%d ioas=%d" iommufd_backend_alloc_hwpt(int iommufd, uint32_t dev_id, uint32_t pt_id, uint32_t flags, uint32_t hwpt_type, uint32_t len, uint64_t data_ptr, uint32_t out_hwpt_id, int ret) " iommufd=%d dev_id=%u pt_id=%u flags=0x%x hwpt_type=%u len=%u data_ptr=0x%"PRIx64" out_hwpt=%u (%d)" iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d id=%d (%d)" iommufd_backend_set_dirty(int iommufd, uint32_t hwpt_id, bool start, int ret) " iommufd=%d hwpt=%u enable=%d (%d)" diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h index 908c94d811..0531a4ad98 100644 --- a/include/sysemu/iommufd.h +++ b/include/sysemu/iommufd.h @@ -43,11 +43,11 @@ typedef struct IOMMUFDViommu { uint32_t viommu_id; } IOMMUFDViommu; -int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp); +bool iommufd_backend_connect(IOMMUFDBackend *be, Error **errp); void iommufd_backend_disconnect(IOMMUFDBackend *be); -int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id, - Error **errp); +bool iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id, + Error **errp); void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id); int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova, ram_addr_t size, void *vaddr, bool readonly); -- Gitee From 959b91b9b45b3ec649c6de0e268a4dcd603ce8af Mon Sep 17 00:00:00 2001 From: Zhao Liu Date: Mon, 15 Jul 2024 16:21:54 +0800 Subject: [PATCH 796/939] backends/iommufd: Get rid of qemu_open_old() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For qemu_open_old(), osdep.h said: > Don't introduce new usage of this function, prefer the following > qemu_open/qemu_create that take an "Error **errp". So replace qemu_open_old() with qemu_open(). 
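The net effect on the call site, side by side (this simply mirrors the iommufd
hunk below):

    /* before: every caller rebuilds the error from errno by hand */
    fd = qemu_open_old("/dev/iommu", O_RDWR);
    if (fd < 0) {
        error_setg_errno(errp, errno, "/dev/iommu opening failed");
        return false;
    }

    /* after: qemu_open() takes errp and reports the failure itself */
    fd = qemu_open("/dev/iommu", O_RDWR, errp);
    if (fd < 0) {
        return false;
    }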
Cc: Yi Liu Cc: Eric Auger Cc: Zhenzhong Duan Signed-off-by: Zhao Liu Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Yi Liu Reviewed-by: Michael Tokarev Signed-off-by: Michael Tokarev --- backends/iommufd.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/backends/iommufd.c b/backends/iommufd.c index fad580fdcb..62df6e41f0 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -79,9 +79,8 @@ bool iommufd_backend_connect(IOMMUFDBackend *be, Error **errp) int fd; if (be->owned && !be->users) { - fd = qemu_open_old("/dev/iommu", O_RDWR); + fd = qemu_open("/dev/iommu", O_RDWR, errp); if (fd < 0) { - error_setg_errno(errp, errno, "/dev/iommu opening failed"); return false; } be->fd = fd; -- Gitee From 08a4aa240587fed26c17271bf9af87f0a5997f4a Mon Sep 17 00:00:00 2001 From: libai Date: Wed, 26 Mar 2025 18:59:33 +0800 Subject: [PATCH 797/939] Kconfig/iommufd/VDPA: Update IOMMUFD module configuration dependencies The vDPA module can also use IOMMUFD like the VFIO module. Therefore, adjust Kconfig to remove the dependency of IOMMUFD on VFIO and add a reverse dependency on IOMMUFD for vDPA Signed-off-by: libai --- Kconfig.host | 1 + backends/Kconfig | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/Kconfig.host b/Kconfig.host index f496475f8e..faf58d9af5 100644 --- a/Kconfig.host +++ b/Kconfig.host @@ -28,6 +28,7 @@ config VHOST_USER config VHOST_VDPA bool + select IOMMUFD config VHOST_KERNEL bool diff --git a/backends/Kconfig b/backends/Kconfig index 2cb23f62fa..8d0be5a263 100644 --- a/backends/Kconfig +++ b/backends/Kconfig @@ -2,4 +2,3 @@ source tpm/Kconfig config IOMMUFD bool - depends on VFIO -- Gitee From 184e5195a815d57701cd9358f4b0537025729833 Mon Sep 17 00:00:00 2001 From: libai Date: Wed, 26 Mar 2025 20:44:40 +0800 Subject: [PATCH 798/939] vdpa/iommufd:support associating iommufd backend for vDPA devices The following parameters can associate the iommufd object with the vdpa device: -object iommufd,id=iommufd1 -device '{ "driver":"vhost-vdpa-device-pci", "id":"vhostdev0", "vhostdev":"/dev/vhost-vdpa-1", "iommufd":"iommufd1", }' Signed-off-by: libai --- hw/virtio/vdpa-dev.c | 1 + include/hw/virtio/vdpa-dev.h | 2 ++ 2 files changed, 3 insertions(+) diff --git a/hw/virtio/vdpa-dev.c b/hw/virtio/vdpa-dev.c index bd787cf39c..9ce7ed7eae 100644 --- a/hw/virtio/vdpa-dev.c +++ b/hw/virtio/vdpa-dev.c @@ -356,6 +356,7 @@ static void vhost_vdpa_device_set_status(VirtIODevice *vdev, uint8_t status) static Property vhost_vdpa_device_properties[] = { DEFINE_PROP_STRING("vhostdev", VhostVdpaDevice, vhostdev), DEFINE_PROP_UINT16("queue-size", VhostVdpaDevice, queue_size, 0), + DEFINE_PROP_LINK("iommufd", VhostVdpaDevice, iommufd, TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *), DEFINE_PROP_END_OF_LIST(), }; diff --git a/include/hw/virtio/vdpa-dev.h b/include/hw/virtio/vdpa-dev.h index 60e9c3f3fe..accdb7fa28 100644 --- a/include/hw/virtio/vdpa-dev.h +++ b/include/hw/virtio/vdpa-dev.h @@ -18,6 +18,7 @@ #include "hw/virtio/vhost.h" #include "hw/virtio/vhost-vdpa.h" #include "qom/object.h" +#include "sysemu/iommufd.h" #define TYPE_VHOST_VDPA_DEVICE "vhost-vdpa-device" @@ -41,6 +42,7 @@ struct VhostVdpaDevice { int (*post_init)(VhostVdpaDevice *v, Error **errp); VMChangeStateEntry *vmstate; Notifier migration_state; + IOMMUFDBackend *iommufd; }; #endif -- Gitee From 16670675cbf7fc4db147a698ba7787d2e2fa675b Mon Sep 17 00:00:00 2001 From: Xianglai Li Date: Wed, 26 Mar 2025 17:02:37 +0800 Subject: [PATCH 799/939] hw/loongarch/boot: Adjust the loading position of the initrd When only 
the -kernel parameter is used to load the elf kernel, the initrd is loaded in the ram. If the initrd size is too large, the loading fails, resulting in a VM startup failure. This patch first loads initrd near the kernel. When the nearby memory space of the kernel is insufficient, it tries to load it to the starting position of high memory. If there is still not enough, qemu will report an error and ask the user to increase the memory space for the virtual machine to boot. Signed-off-by: Xianglai Li --- hw/loongarch/boot.c | 53 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 44 insertions(+), 9 deletions(-) diff --git a/hw/loongarch/boot.c b/hw/loongarch/boot.c index 53dcefbb55..39c4a6d8c6 100644 --- a/hw/loongarch/boot.c +++ b/hw/loongarch/boot.c @@ -171,6 +171,48 @@ static uint64_t cpu_loongarch_virt_to_phys(void *opaque, uint64_t addr) return addr & MAKE_64BIT_MASK(0, TARGET_PHYS_ADDR_SPACE_BITS); } +static void find_initrd_loadoffset(struct loongarch_boot_info *info, + uint64_t kernel_high, ssize_t kernel_size) +{ + hwaddr base, size, gap, low_end; + ram_addr_t initrd_end, initrd_start; + + base = VIRT_LOWMEM_BASE; + gap = VIRT_LOWMEM_SIZE; + initrd_start = ROUND_UP(kernel_high + 4 * kernel_size, 64 * KiB); + initrd_end = initrd_start + initrd_size; + + size = info->ram_size; + low_end = base + MIN(size, gap); + if (initrd_end <= low_end) { + initrd_offset = initrd_start; + return; + } + + if (size <= gap) { + error_report("The low memory too small for initial ram disk '%s'," + "You need to expand the memory space", + info->initrd_filename); + exit(1); + } + + /* + * Try to load initrd in the high memory + */ + size -= gap; + base = VIRT_HIGHMEM_BASE; + initrd_start = ROUND_UP(base, 64 * KiB); + if (initrd_size <= size) { + initrd_offset = initrd_start; + return; + } + + error_report("The high memory too small for initial ram disk '%s'," + "You need to expand the memory space", + info->initrd_filename); + exit(1); +} + static int64_t load_kernel_info(struct loongarch_boot_info *info) { uint64_t kernel_entry, kernel_low, kernel_high; @@ -192,16 +234,9 @@ static int64_t load_kernel_info(struct loongarch_boot_info *info) if (info->initrd_filename) { initrd_size = get_image_size(info->initrd_filename); if (initrd_size > 0) { - initrd_offset = ROUND_UP(kernel_high + 4 * kernel_size, 64 * KiB); - - if (initrd_offset + initrd_size > info->ram_size) { - error_report("memory too small for initial ram disk '%s'", - info->initrd_filename); - exit(1); - } - + find_initrd_loadoffset(info, kernel_high, kernel_size); initrd_size = load_image_targphys(info->initrd_filename, initrd_offset, - info->ram_size - initrd_offset); + initrd_size); } if (initrd_size == (target_ulong)-1) { -- Gitee From 4044284b230182cbaeb401bdb1b65dcbd11c7550 Mon Sep 17 00:00:00 2001 From: Xianglai Li Date: Mon, 7 Apr 2025 18:59:42 +0800 Subject: [PATCH 800/939] hw/rtc: Fixed loongson rtc emulation errors The expire time is sent to the timer only when the expire Time is greater than 0 or greater than now. Otherwise, the timer will trigger interruption continuously. Timer interrupts are sent using pulse functions. 
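Put differently, the hunks below only arm a timer whose deadline is still in the
future, and deliver the interrupt as an edge (a condensed view of the change, not
additional code):

    expire_time = now + (qemu_timedate_diff(&tm) - s->offset_toy) * 1000;
    if (expire_time > now) {                 /* skip matches that already elapsed */
        timer_mod(s->toy_timer[num], expire_time);
    }

    /* on expiry, pulse instead of raise, matching the edge-triggered
     * (FDT_IRQ_FLAGS_EDGE_LO_HI) flag now advertised in the device tree */
    qemu_irq_pulse(s->irq);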
Signed-off-by: Xianglai Li --- hw/loongarch/virt.c | 9 +++++++-- hw/rtc/ls7a_rtc.c | 22 +++++++++++++--------- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index 0c24e632bb..ce026a4c3c 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -51,6 +51,11 @@ #include "qemu/error-report.h" #include "qemu/guest-random.h" +#define FDT_IRQ_FLAGS_EDGE_LO_HI 1 +#define FDT_IRQ_FLAGS_EDGE_HI_LO 2 +#define FDT_IRQ_FLAGS_LEVEL_HI 4 +#define FDT_IRQ_FLAGS_LEVEL_LO 8 + static bool virt_is_veiointc_enabled(LoongArchVirtMachineState *lvms) { if (lvms->veiointc == ON_OFF_AUTO_OFF) { @@ -275,7 +280,7 @@ static void fdt_add_rtc_node(LoongArchVirtMachineState *lvms, "loongson,ls7a-rtc"); qemu_fdt_setprop_sized_cells(ms->fdt, nodename, "reg", 2, base, 2, size); qemu_fdt_setprop_cells(ms->fdt, nodename, "interrupts", - VIRT_RTC_IRQ - VIRT_GSI_BASE , 0x4); + VIRT_RTC_IRQ - VIRT_GSI_BASE , FDT_IRQ_FLAGS_EDGE_LO_HI); qemu_fdt_setprop_cell(ms->fdt, nodename, "interrupt-parent", *pch_pic_phandle); g_free(nodename); @@ -334,7 +339,7 @@ static void fdt_add_uart_node(LoongArchVirtMachineState *lvms, qemu_fdt_setprop_cell(ms->fdt, nodename, "clock-frequency", 100000000); if (chosen) qemu_fdt_setprop_string(ms->fdt, "/chosen", "stdout-path", nodename); - qemu_fdt_setprop_cells(ms->fdt, nodename, "interrupts", irq, 0x4); + qemu_fdt_setprop_cells(ms->fdt, nodename, "interrupts", irq, FDT_IRQ_FLAGS_LEVEL_HI); qemu_fdt_setprop_cell(ms->fdt, nodename, "interrupt-parent", *pch_pic_phandle); g_free(nodename); diff --git a/hw/rtc/ls7a_rtc.c b/hw/rtc/ls7a_rtc.c index 1f9e38a735..be9546c850 100644 --- a/hw/rtc/ls7a_rtc.c +++ b/hw/rtc/ls7a_rtc.c @@ -145,20 +145,22 @@ static void toymatch_write(LS7ARtcState *s, uint64_t val, int num) now = qemu_clock_get_ms(rtc_clock); toymatch_val_to_time(s, val, &tm); expire_time = now + (qemu_timedate_diff(&tm) - s->offset_toy) * 1000; - timer_mod(s->toy_timer[num], expire_time); + if (expire_time > now) + timer_mod(s->toy_timer[num], expire_time); } } static void rtcmatch_write(LS7ARtcState *s, uint64_t val, int num) { - uint64_t expire_ns; + int64_t expire_ns; /* it do not support write when toy disabled */ if (rtc_enabled(s)) { s->rtcmatch[num] = val; /* calculate expire time */ expire_ns = ticks_to_ns(val) - ticks_to_ns(s->offset_rtc); - timer_mod_ns(s->rtc_timer[num], expire_ns); + if (expire_ns > 0) + timer_mod_ns(s->rtc_timer[num], expire_ns); } } @@ -185,7 +187,7 @@ static void ls7a_rtc_stop(LS7ARtcState *s) static void ls7a_toy_start(LS7ARtcState *s) { int i; - uint64_t expire_time, now; + int64_t expire_time, now; struct tm tm = {}; now = qemu_clock_get_ms(rtc_clock); @@ -194,19 +196,21 @@ static void ls7a_toy_start(LS7ARtcState *s) for (i = 0; i < TIMER_NUMS; i++) { toymatch_val_to_time(s, s->toymatch[i], &tm); expire_time = now + (qemu_timedate_diff(&tm) - s->offset_toy) * 1000; - timer_mod(s->toy_timer[i], expire_time); + if (expire_time > now) + timer_mod(s->toy_timer[i], expire_time); } } static void ls7a_rtc_start(LS7ARtcState *s) { int i; - uint64_t expire_time; + int64_t expire_time; /* recalculate expire time and enable timer */ for (i = 0; i < TIMER_NUMS; i++) { expire_time = ticks_to_ns(s->rtcmatch[i]) - ticks_to_ns(s->offset_rtc); - timer_mod_ns(s->rtc_timer[i], expire_time); + if (expire_time > 0) + timer_mod_ns(s->rtc_timer[i], expire_time); } } @@ -370,7 +374,7 @@ static void toy_timer_cb(void *opaque) LS7ARtcState *s = opaque; if (toy_enabled(s)) { - qemu_irq_raise(s->irq); + qemu_irq_pulse(s->irq); } } 
@@ -379,7 +383,7 @@ static void rtc_timer_cb(void *opaque) LS7ARtcState *s = opaque; if (rtc_enabled(s)) { - qemu_irq_raise(s->irq); + qemu_irq_pulse(s->irq); } } -- Gitee From d6f75f9e532a4a4b6bb4610049f4fa7f26160733 Mon Sep 17 00:00:00 2001 From: Xianglai Li Date: Thu, 20 Feb 2025 19:24:18 +0800 Subject: [PATCH 801/939] hw/intc: Add extioi ability of 256 vcpu interrupt routing Add the feature field for the CPU-encoded interrupt route to extioi and the corresponding mechanism for backup recovery. Signed-off-by: Xianglai Li --- hw/intc/loongarch_extioi_kvm.c | 65 ++++++++++++++++++++++++++++-- hw/loongarch/virt.c | 2 + include/hw/intc/loongarch_extioi.h | 4 ++ linux-headers/asm-loongarch/kvm.h | 10 +++++ 4 files changed, 77 insertions(+), 4 deletions(-) diff --git a/hw/intc/loongarch_extioi_kvm.c b/hw/intc/loongarch_extioi_kvm.c index f5bbc33255..2e7c764b7c 100644 --- a/hw/intc/loongarch_extioi_kvm.c +++ b/hw/intc/loongarch_extioi_kvm.c @@ -18,8 +18,32 @@ static void kvm_extioi_access_regs(int fd, uint64_t addr, void *val, int is_write) { - kvm_device_access(fd, KVM_DEV_LOONGARCH_EXTIOI_GRP_REGS, - addr, val, is_write, &error_abort); + kvm_device_access(fd, KVM_DEV_LOONGARCH_EXTIOI_GRP_REGS, + addr, val, is_write, &error_abort); +} + +static void kvm_extioi_access_sw_status(int fd, uint64_t addr, + void *val, bool is_write) +{ + kvm_device_access(fd, KVM_DEV_LOONGARCH_EXTIOI_GRP_SW_STATUS, + addr, val, is_write, &error_abort); +} + +static void kvm_extioi_save_load_sw_status(void *opaque, bool is_write) +{ + KVMLoongArchExtIOI *s = (KVMLoongArchExtIOI *)opaque; + KVMLoongArchExtIOIClass *class = KVM_LOONGARCH_EXTIOI_GET_CLASS(s); + int fd = class->dev_fd; + int addr; + + addr = KVM_DEV_LOONGARCH_EXTIOI_SW_STATUS_NUM_CPU; + kvm_extioi_access_sw_status(fd, addr, (void *)&s->num_cpu, is_write); + + addr = KVM_DEV_LOONGARCH_EXTIOI_SW_STATUS_FEATURE; + kvm_extioi_access_sw_status(fd, addr, (void *)&s->features, is_write); + + addr = KVM_DEV_LOONGARCH_EXTIOI_SW_STATUS_STATE; + kvm_extioi_access_sw_status(fd, addr, (void *)&s->status, is_write); } static int kvm_loongarch_extioi_pre_save(void *opaque) @@ -41,6 +65,8 @@ static int kvm_loongarch_extioi_pre_save(void *opaque) kvm_extioi_access_regs(fd, EXTIOI_COREISR_START, (void *)s->coreisr, false); + kvm_extioi_save_load_sw_status(opaque, false); + return 0; } @@ -61,12 +87,19 @@ static int kvm_loongarch_extioi_post_load(void *opaque, int version_id) (void *)s->sw_coremap, true); kvm_extioi_access_regs(fd, EXTIOI_COREISR_START, (void *)s->coreisr, true); + kvm_extioi_save_load_sw_status(opaque, true); + + kvm_device_access(fd, KVM_DEV_LOONGARCH_EXTIOI_GRP_CTRL, + KVM_DEV_LOONGARCH_EXTIOI_CTRL_LOAD_FINISHED, + NULL, true, &error_abort); + return 0; } static void kvm_loongarch_extioi_realize(DeviceState *dev, Error **errp) { KVMLoongArchExtIOIClass *extioi_class = KVM_LOONGARCH_EXTIOI_GET_CLASS(dev); + KVMLoongArchExtIOI *s = KVM_LOONGARCH_EXTIOI(dev); struct kvm_create_device cd = {0}; Error *err = NULL; int ret,i; @@ -77,6 +110,10 @@ static void kvm_loongarch_extioi_realize(DeviceState *dev, Error **errp) return; } + if (s->features & BIT(EXTIOI_HAS_VIRT_EXTENSION)) { + s->features |= EXTIOI_VIRT_HAS_FEATURES; + } + if (!extioi_class->is_created) { cd.type = KVM_DEV_TYPE_LA_EXTIOI; ret = kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd); @@ -87,6 +124,15 @@ static void kvm_loongarch_extioi_realize(DeviceState *dev, Error **errp) } extioi_class->is_created = true; extioi_class->dev_fd = cd.fd; + + kvm_device_access(cd.fd, 
KVM_DEV_LOONGARCH_EXTIOI_GRP_CTRL, + KVM_DEV_LOONGARCH_EXTIOI_CTRL_INIT_NUM_CPU, + &s->num_cpu, true, NULL); + + kvm_device_access(cd.fd, KVM_DEV_LOONGARCH_EXTIOI_GRP_CTRL, + KVM_DEV_LOONGARCH_EXTIOI_CTRL_INIT_FEATURE, + &s->features, true, NULL); + fprintf(stdout, "Create LoongArch extioi irqchip in KVM done!\n"); } @@ -102,8 +148,8 @@ static void kvm_loongarch_extioi_realize(DeviceState *dev, Error **errp) static const VMStateDescription vmstate_kvm_extioi_core = { .name = "kvm-extioi-single", - .version_id = 1, - .minimum_version_id = 1, + .version_id = 2, + .minimum_version_id = 2, .pre_save = kvm_loongarch_extioi_pre_save, .post_load = kvm_loongarch_extioi_post_load, .fields = (VMStateField[]) { @@ -119,10 +165,20 @@ static const VMStateDescription vmstate_kvm_extioi_core = { EXTIOI_IRQS_IPMAP_SIZE / 4), VMSTATE_UINT32_ARRAY(coremap, KVMLoongArchExtIOI, EXTIOI_IRQS / 4), VMSTATE_UINT8_ARRAY(sw_coremap, KVMLoongArchExtIOI, EXTIOI_IRQS), + VMSTATE_UINT32(num_cpu, KVMLoongArchExtIOI), + VMSTATE_UINT32(features, KVMLoongArchExtIOI), + VMSTATE_UINT32(status, KVMLoongArchExtIOI), VMSTATE_END_OF_LIST() } }; +static Property extioi_properties[] = { + DEFINE_PROP_UINT32("num-cpu", KVMLoongArchExtIOI, num_cpu, 1), + DEFINE_PROP_BIT("has-virtualization-extension", KVMLoongArchExtIOI, + features, EXTIOI_HAS_VIRT_EXTENSION, 0), + DEFINE_PROP_END_OF_LIST(), +}; + static void kvm_loongarch_extioi_class_init(ObjectClass *oc, void *data) { DeviceClass *dc = DEVICE_CLASS(oc); @@ -131,6 +187,7 @@ static void kvm_loongarch_extioi_class_init(ObjectClass *oc, void *data) extioi_class->parent_realize = dc->realize; dc->realize = kvm_loongarch_extioi_realize; extioi_class->is_created = false; + device_class_set_props(dc, extioi_properties); dc->vmsd = &vmstate_kvm_extioi_core; } diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index ce026a4c3c..233297d78f 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -874,6 +874,8 @@ static void virt_irq_init(LoongArchVirtMachineState *lvms) /* Create EXTIOI device */ if (kvm_enabled() && kvm_irqchip_in_kernel()) { extioi = qdev_new(TYPE_KVM_LOONGARCH_EXTIOI); + qdev_prop_set_uint32(extioi, "num-cpu", ms->smp.max_cpus); + qdev_prop_set_bit(extioi, "has-virtualization-extension", true); sysbus_realize_and_unref(SYS_BUS_DEVICE(extioi), &error_fatal); } else { extioi = qdev_new(TYPE_LOONGARCH_EXTIOI); diff --git a/include/hw/intc/loongarch_extioi.h b/include/hw/intc/loongarch_extioi.h index 9966cd98d3..92b38d5c38 100644 --- a/include/hw/intc/loongarch_extioi.h +++ b/include/hw/intc/loongarch_extioi.h @@ -94,6 +94,10 @@ struct LoongArchExtIOI { struct KVMLoongArchExtIOI { SysBusDevice parent_obj; + uint32_t num_cpu; + uint32_t features; + uint32_t status; + /* hardware state */ uint32_t nodetype[EXTIOI_IRQS_NODETYPE_COUNT / 2]; uint32_t bounce[EXTIOI_IRQS_GROUP_COUNT]; diff --git a/linux-headers/asm-loongarch/kvm.h b/linux-headers/asm-loongarch/kvm.h index 13c1280662..34abd65939 100644 --- a/linux-headers/asm-loongarch/kvm.h +++ b/linux-headers/asm-loongarch/kvm.h @@ -141,6 +141,16 @@ struct kvm_iocsr_entry { #define KVM_DEV_LOONGARCH_EXTIOI_GRP_REGS 0x40000003 +#define KVM_DEV_LOONGARCH_EXTIOI_GRP_SW_STATUS 0x40000006 +#define KVM_DEV_LOONGARCH_EXTIOI_SW_STATUS_NUM_CPU 0x0 +#define KVM_DEV_LOONGARCH_EXTIOI_SW_STATUS_FEATURE 0x1 +#define KVM_DEV_LOONGARCH_EXTIOI_SW_STATUS_STATE 0x2 + +#define KVM_DEV_LOONGARCH_EXTIOI_GRP_CTRL 0x40000007 +#define KVM_DEV_LOONGARCH_EXTIOI_CTRL_INIT_NUM_CPU 0x0 +#define KVM_DEV_LOONGARCH_EXTIOI_CTRL_INIT_FEATURE 0x1 +#define 
KVM_DEV_LOONGARCH_EXTIOI_CTRL_LOAD_FINISHED 0x3 + #define KVM_DEV_LOONGARCH_PCH_PIC_GRP_CTRL 0x40000004 #define KVM_DEV_LOONGARCH_PCH_PIC_CTRL_INIT 0 -- Gitee From 9cdd7c19a08c773f1f8a2d314bb94d61bd08fd77 Mon Sep 17 00:00:00 2001 From: libai Date: Thu, 27 Mar 2025 16:51:03 +0800 Subject: [PATCH 802/939] vdpa/iommufd:Introduce vdpa-iommufd module The purpose of the vdpa-iommufd module is to share the DMA mapping of multiple vdpa through the kernel iommufd interface. The VDPA devices can share the same DMA mapping by associating with the same IOMMUFD backend. This can avoid VDPA devices from repeatedly establishing DMA mappings, reduce the time required for hot plugging and unplugging VDPA devices, and minimize duplicate IOMMU TLB. The vDPA devices that need to be isolated can also be divided into different groups by associating them with different iommufds. Each iommufd backend is associated with a VDPAIOMMUFDContainer to establish contact with multiple vDPA devices. To improve availability, even if vDPA devices encounter problems when sharing page tables, they can still complete DMA mapping by applying for a separate HWPT. Signed-off-by: libai --- hw/virtio/meson.build | 2 +- hw/virtio/vdpa-dev-iommufd.c | 294 +++++++++++++++++++++++++++ hw/virtio/vdpa-dev.c | 17 ++ include/hw/virtio/vdpa-dev-iommufd.h | 40 ++++ include/hw/virtio/vdpa-dev.h | 2 + linux-headers/linux/vhost.h | 28 +++ 6 files changed, 382 insertions(+), 1 deletion(-) create mode 100644 hw/virtio/vdpa-dev-iommufd.c create mode 100644 include/hw/virtio/vdpa-dev-iommufd.h diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build index 596651d113..67291563d3 100644 --- a/hw/virtio/meson.build +++ b/hw/virtio/meson.build @@ -5,7 +5,7 @@ system_virtio_ss.add(when: 'CONFIG_VIRTIO_MMIO', if_true: files('virtio-mmio.c') system_virtio_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('virtio-crypto.c')) system_virtio_ss.add(when: 'CONFIG_VHOST_VSOCK_COMMON', if_true: files('vhost-vsock-common.c')) system_virtio_ss.add(when: 'CONFIG_VIRTIO_IOMMU', if_true: files('virtio-iommu.c')) -system_virtio_ss.add(when: 'CONFIG_VHOST_VDPA_DEV', if_true: files('vdpa-dev.c', 'vdpa-dev-mig.c')) +system_virtio_ss.add(when: 'CONFIG_VHOST_VDPA_DEV', if_true: files('vdpa-dev.c', 'vdpa-dev-mig.c', 'vdpa-dev-iommufd.c')) specific_virtio_ss = ss.source_set() specific_virtio_ss.add(files('virtio.c')) diff --git a/hw/virtio/vdpa-dev-iommufd.c b/hw/virtio/vdpa-dev-iommufd.c new file mode 100644 index 0000000000..d72f56d52f --- /dev/null +++ b/hw/virtio/vdpa-dev-iommufd.c @@ -0,0 +1,294 @@ +/* + * vhost vdpa device iommufd backend + * + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All Rights Reserved. 
+ */ + +#include "qemu/osdep.h" +#include "qemu/log.h" +#include +#include +#include "qapi/error.h" +#include "hw/virtio/vdpa-dev-iommufd.h" + +static QLIST_HEAD(, VDPAIOMMUFDContainer) vdpa_container_list = + QLIST_HEAD_INITIALIZER(vdpa_container_list); + +static int vhost_vdpa_container_connect_iommufd(VDPAIOMMUFDContainer *container) +{ + IOMMUFDBackend *iommufd = container->iommufd; + uint32_t ioas_id; + Error *err = NULL; + + if (!iommufd) { + return -1; + } + + if (!iommufd_backend_connect(iommufd, &err)) { + error_report_err(err); + return -1; + } + + if (!iommufd_backend_alloc_ioas(iommufd, &ioas_id, &err)) { + error_report_err(err); + iommufd_backend_disconnect(iommufd); + return -1; + } + container->ioas_id = ioas_id; + return 0; +} + +static void vhost_vdpa_container_disconnect_iommufd(VDPAIOMMUFDContainer *container) +{ + IOMMUFDBackend *iommufd = container->iommufd; + uint32_t ioas_id = container->ioas_id; + + if (!iommufd) { + return; + } + + iommufd_backend_free_id(iommufd, ioas_id); + iommufd_backend_disconnect(iommufd); +} + +static IOMMUFDHWPT *vhost_vdpa_find_hwpt(VDPAIOMMUFDContainer *container, + VhostVdpaDevice *vdev) +{ + IOMMUFDHWPT *hwpt = NULL; + VhostVdpaDevice *tmp = NULL; + + QLIST_FOREACH(hwpt, &container->hwpt_list, next) { + QLIST_FOREACH(tmp, &hwpt->device_list, next) { + if (tmp == vdev) { + return hwpt; + } + } + } + + return NULL; +} + +static VDPAIOMMUFDContainer *vhost_vdpa_find_container(VhostVdpaDevice *vdev) +{ + VDPAIOMMUFDContainer *container = NULL; + + QLIST_FOREACH(container, &vdpa_container_list, next) { + if (container->iommufd == vdev->iommufd) { + return container; + } + } + + return NULL; +} + +static VDPAIOMMUFDContainer *vhost_vdpa_create_container(VhostVdpaDevice *vdev) +{ + VDPAIOMMUFDContainer *container = NULL; + + container = g_new0(VDPAIOMMUFDContainer, 1); + container->iommufd = vdev->iommufd; + QLIST_INIT(&container->hwpt_list); + + QLIST_INSERT_HEAD(&vdpa_container_list, container, next); + + return container; +} + +static void vhost_vdpa_destroy_container(VDPAIOMMUFDContainer *container) +{ + if (!container) { + return; + } + + container->iommufd = NULL; + QLIST_SAFE_REMOVE(container, next); + g_free(container); +} + +static void vhost_vdpa_device_unbind_iommufd(VhostVdpaDevice *vdev) +{ + int ret; + ret = ioctl(vdev->vhostfd, VHOST_VDPA_UNBIND_IOMMUFD, 0); + if (ret) { + qemu_log("vhost vdpa device unbind iommufd failed: %d, devid: %d\n", + ret, vdev->iommufd_devid); + } +} + +static int vhost_vdpa_device_bind_iommufd(VhostVdpaDevice *vdev) +{ + IOMMUFDBackend *iommufd = vdev->iommufd; + struct vdpa_dev_bind_iommufd bind = { + .iommufd = iommufd->fd, + .out_devid = -1, + }; + int ret; + + /* iommufd auto unbind when vdev->vhostfd close */ + ret = ioctl(vdev->vhostfd, VHOST_VDPA_BIND_IOMMUFD, &bind); + if (ret) { + qemu_log("vhost vdpa device bind iommufd failed: %d\n", ret); + return ret; + } + vdev->iommufd_devid = bind.out_devid; + return 0; +} + +static int vhost_vdpa_container_attach_device(VDPAIOMMUFDContainer *container, VhostVdpaDevice *vdev) +{ + IOMMUFDBackend *iommufd = NULL; + IOMMUFDHWPT *hwpt = NULL; + Error *err = NULL; + uint32_t pt_id; + int ret; + + if (!container || !container->iommufd || container->iommufd != vdev->iommufd) { + return -1; + } + + iommufd = container->iommufd; + + /* try to find an available hwpt */ + QLIST_FOREACH(hwpt, &container->hwpt_list, next) { + pt_id = hwpt->hwpt_id; + ret = ioctl(vdev->vhostfd, VHOST_VDPA_ATTACH_IOMMUFD_PT, &pt_id); + if (ret == 0) { + 
QLIST_INSERT_HEAD(&hwpt->device_list, vdev, next); + return 0; + } + } + + /* available hwpt not found in the container, create a new one */ + hwpt = g_new0(IOMMUFDHWPT, 1); + QLIST_INIT(&hwpt->device_list); + + if (!iommufd_backend_alloc_hwpt(iommufd, vdev->iommufd_devid, + container->ioas_id, 0, 0, 0, NULL, + &pt_id, NULL, &err)) { + error_report_err(err); + ret = -1; + goto free_mem; + } + + hwpt->hwpt_id = pt_id; + + ret = ioctl(vdev->vhostfd, VHOST_VDPA_ATTACH_IOMMUFD_PT, &pt_id); + if (ret) { + qemu_log("vhost vdpa device attach iommufd pt failed: %d\n", ret); + goto free_hwpt; + } + + QLIST_INSERT_HEAD(&hwpt->device_list, vdev, next); + QLIST_INSERT_HEAD(&container->hwpt_list, hwpt, next); + + return 0; + +free_hwpt: + iommufd_backend_free_id(iommufd, hwpt->hwpt_id); +free_mem: + g_free(hwpt); + return ret; +} + +static void vhost_vdpa_container_detach_device(VDPAIOMMUFDContainer *container, VhostVdpaDevice *vdev) +{ + IOMMUFDBackend *iommufd = vdev->iommufd; + IOMMUFDHWPT *hwpt = NULL; + + /* find the hwpt using by this device */ + hwpt = vhost_vdpa_find_hwpt(container, vdev); + if (!hwpt) { + return; + } + + ioctl(vdev->vhostfd, VHOST_VDPA_DETACH_IOMMUFD_PT, &hwpt->hwpt_id); + + QLIST_SAFE_REMOVE(vdev, next); + + /* No device using this hwpt, free it */ + if (QLIST_EMPTY(&hwpt->device_list)) { + iommufd_backend_free_id(iommufd, hwpt->hwpt_id); + QLIST_SAFE_REMOVE(hwpt, next); + g_free(hwpt); + } +} + +int vhost_vdpa_attach_container(VhostVdpaDevice *vdev) +{ + VDPAIOMMUFDContainer *container = NULL; + IOMMUFDBackend *iommufd = vdev->iommufd; + bool new_container = false; + int ret = 0; + + if (!iommufd) { + return 0; + } + + container = vhost_vdpa_find_container(vdev); + if (!container) { + container = vhost_vdpa_create_container(vdev); + if (!container) { + qemu_log("vdpa create container failed\n"); + return -1; + } + ret = vhost_vdpa_container_connect_iommufd(container); + if (ret) { + qemu_log("vdpa container connect iommufd failed\n"); + goto destroy; + } + new_container = true; + } + + ret = vhost_vdpa_device_bind_iommufd(vdev); + if (ret) { + qemu_log("vdpa device bind iommufd failed\n"); + goto disconnect; + } + + ret = vhost_vdpa_container_attach_device(container, vdev); + if (ret) { + qemu_log("vdpa container attach device failed\n"); + goto unbind; + } + + return 0; + +unbind: + vhost_vdpa_device_unbind_iommufd(vdev); +disconnect: + if (!new_container) { + return ret; + } + vhost_vdpa_container_disconnect_iommufd(container); +destroy: + vhost_vdpa_destroy_container(container); + + return ret; +} + +void vhost_vdpa_detach_container(VhostVdpaDevice *vdev) +{ + VDPAIOMMUFDContainer *container = NULL; + IOMMUFDBackend *iommufd = vdev->iommufd; + + if (!iommufd) { + return; + } + + container = vhost_vdpa_find_container(vdev); + if (!container) { + return; + } + + vhost_vdpa_container_detach_device(container, vdev); + + vhost_vdpa_device_unbind_iommufd(vdev); + + if (!QLIST_EMPTY(&container->hwpt_list)) { + return; + } + /* No HWPT in this container, destroy it */ + vhost_vdpa_container_disconnect_iommufd(container); + + vhost_vdpa_destroy_container(container); +} \ No newline at end of file diff --git a/hw/virtio/vdpa-dev.c b/hw/virtio/vdpa-dev.c index 9ce7ed7eae..a6bd695724 100644 --- a/hw/virtio/vdpa-dev.c +++ b/hw/virtio/vdpa-dev.c @@ -32,6 +32,7 @@ #include "migration/migration.h" #include "exec/address-spaces.h" #include "standard-headers/linux/virtio_ids.h" +#include "hw/virtio/vdpa-dev-iommufd.h" static void vhost_vdpa_device_dummy_handle_output(VirtIODevice *vdev, 
VirtQueue *vq) @@ -127,6 +128,16 @@ static void vhost_vdpa_device_realize(DeviceState *dev, Error **errp) goto free_vqs; } + /* If the vdpa device is associated with an iommufd, attach device to container */ + if (v->iommufd) { + ret = vhost_vdpa_attach_container(v); + if (ret < 0) { + error_setg(errp, "vhost vdpa device attach container failed: %s", + strerror(-ret)); + goto free_vqs; + } + } + memory_listener_register(&v->vdpa.listener, &address_space_memory); v->config_size = vhost_vdpa_device_get_u32(v->vhostfd, VHOST_VDPA_GET_CONFIG_SIZE, @@ -168,6 +179,9 @@ free_config: vhost_cleanup: memory_listener_unregister(&v->vdpa.listener); vhost_dev_cleanup(&v->dev); + if (v->iommufd) { + vhost_vdpa_detach_container(v); + } free_vqs: g_free(vqs); out: @@ -194,6 +208,9 @@ static void vhost_vdpa_device_unrealize(DeviceState *dev) g_free(s->dev.vqs); memory_listener_unregister(&s->vdpa.listener); vhost_dev_cleanup(&s->dev); + if (s->iommufd) { + vhost_vdpa_detach_container(s); + } qemu_close(s->vhostfd); s->vhostfd = -1; } diff --git a/include/hw/virtio/vdpa-dev-iommufd.h b/include/hw/virtio/vdpa-dev-iommufd.h new file mode 100644 index 0000000000..dc14d9dd15 --- /dev/null +++ b/include/hw/virtio/vdpa-dev-iommufd.h @@ -0,0 +1,40 @@ +/* + * vhost vDPA device support iommufd header + * + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All Rights Reserved. + */ + +#ifndef _VHOST_VDPA_IOMMUFD_H +#define _VHOST_VDPA_IOMMUFD_H + +#include "hw/virtio/vdpa-dev.h" + +/* + * A HW pagetable is called an iommu_domain inside the kernel. + * This user object allows directly creating an inspecting the + * domains. Domains that have kernel owned page tables will be + * associated with an iommufd_ioas that provides the IOVA to + * PFN map. + */ +typedef struct IOMMUFDHWPT { + uint32_t hwpt_id; + QLIST_HEAD(, VhostVdpaDevice) device_list; + QLIST_ENTRY(IOMMUFDHWPT) next; +} IOMMUFDHWPT; + +typedef struct VDPAIOMMUFDContainer { + struct IOMMUFDBackend *iommufd; + uint32_t ioas_id; + QLIST_HEAD(, IOMMUFDHWPT) hwpt_list; + QLIST_ENTRY(VDPAIOMMUFDContainer) next; +} VDPAIOMMUFDContainer; + +struct vdpa_dev_bind_iommufd { + __s32 iommufd; + __u32 out_devid; +}; + +int vhost_vdpa_attach_container(VhostVdpaDevice *vdev); +void vhost_vdpa_detach_container(VhostVdpaDevice *vdev); + +#endif /* _VHOST_VDPA_IOMMUFD_H */ diff --git a/include/hw/virtio/vdpa-dev.h b/include/hw/virtio/vdpa-dev.h index accdb7fa28..872e630546 100644 --- a/include/hw/virtio/vdpa-dev.h +++ b/include/hw/virtio/vdpa-dev.h @@ -43,6 +43,8 @@ struct VhostVdpaDevice { VMChangeStateEntry *vmstate; Notifier migration_state; IOMMUFDBackend *iommufd; + uint32_t iommufd_devid; + QLIST_ENTRY(VhostVdpaDevice) next; }; #endif diff --git a/linux-headers/linux/vhost.h b/linux-headers/linux/vhost.h index a08e980a1e..f5c05abe8b 100644 --- a/linux-headers/linux/vhost.h +++ b/linux-headers/linux/vhost.h @@ -232,6 +232,34 @@ #define VHOST_VDPA_GET_VRING_DESC_GROUP _IOWR(VHOST_VIRTIO, 0x7F, \ struct vhost_vring_state) +/* Bind a vDPA device to the specified iommufd + * + * After the return of this ioctl, the vDPA device is binded to the specified + * iommufd, and the device id is also returned. + */ +#define VHOST_VDPA_BIND_IOMMUFD _IO(VHOST_VIRTIO, 0x90) + +/* Unbind a vDPA device from the specified iommufd + * + * After the return of this ioctl, the vDPA device is unbinded from the specified + * iommufd. 
+ */ +#define VHOST_VDPA_UNBIND_IOMMUFD _IO(VHOST_VIRTIO, 0x91) + +/* Associate the vDPA device with an address space within the bound iommufd + * + * After the return of this ioctl, the vDPA device is attached to the bound + * iommufd. + */ +#define VHOST_VDPA_ATTACH_IOMMUFD_PT _IO(VHOST_VIRTIO, 0x92) + +/* Detach the vDPA device from an address space within the bound iommufd. + * + * After the return of this ioctl, the vDPA device is detached from the address + * space within the bound iommufd. + */ +#define VHOST_VDPA_DETACH_IOMMUFD_PT _IO(VHOST_VIRTIO, 0x93) + /* set and get device buffer */ #define VHOST_GET_DEV_BUFFER _IOR(VHOST_VIRTIO, 0xb0, struct vhost_vdpa_config) #define VHOST_SET_DEV_BUFFER _IOW(VHOST_VIRTIO, 0xb1, struct vhost_vdpa_config) -- Gitee From b88b03c84aa695b96a91329e2d01fffad551c34d Mon Sep 17 00:00:00 2001 From: libai Date: Thu, 27 Mar 2025 19:24:53 +0800 Subject: [PATCH 803/939] vdpa/iommufd:Implement DMA mapping through the iommufd interface Change the owner of memorylistener from the independent vDPA device to VDPAIOMMUFDContainer Signed-off-by: libai --- hw/virtio/vdpa-dev-iommufd.c | 137 +++++++++++++++++++++++++++ hw/virtio/vdpa-dev.c | 4 +- hw/virtio/vhost-vdpa.c | 13 +-- include/hw/virtio/vdpa-dev-iommufd.h | 1 + include/hw/virtio/vhost-vdpa.h | 7 ++ 5 files changed, 154 insertions(+), 8 deletions(-) diff --git a/hw/virtio/vdpa-dev-iommufd.c b/hw/virtio/vdpa-dev-iommufd.c index d72f56d52f..668c6a1cb1 100644 --- a/hw/virtio/vdpa-dev-iommufd.c +++ b/hw/virtio/vdpa-dev-iommufd.c @@ -9,11 +9,124 @@ #include #include #include "qapi/error.h" +#include "exec/target_page.h" +#include "exec/address-spaces.h" #include "hw/virtio/vdpa-dev-iommufd.h" static QLIST_HEAD(, VDPAIOMMUFDContainer) vdpa_container_list = QLIST_HEAD_INITIALIZER(vdpa_container_list); +static int vhost_vdpa_iommufd_container_dma_map(VDPAIOMMUFDContainer *container, hwaddr iova, + hwaddr size, void *vaddr, bool readonly) +{ + return iommufd_backend_map_dma(container->iommufd, container->ioas_id, iova, size, vaddr, readonly); + +} +static int vhost_vdpa_iommufd_container_dma_unmap(VDPAIOMMUFDContainer *container, + hwaddr iova, hwaddr size) +{ + return iommufd_backend_unmap_dma(container->iommufd, container->ioas_id, iova, size); +} + +static void vhost_vdpa_iommufd_container_region_add(MemoryListener *listener, + MemoryRegionSection *section) +{ + VDPAIOMMUFDContainer *container = container_of(listener, VDPAIOMMUFDContainer, listener); + hwaddr iova; + Int128 llend, llsize; + void *vaddr; + int page_size = qemu_target_page_size(); + int page_mask = -page_size; + int ret; + + if (vhost_vdpa_listener_skipped_section(section, 0, ULLONG_MAX, page_mask)) { + return; + } + + if (unlikely((section->offset_within_address_space & ~page_mask) != + (section->offset_within_region & ~page_mask))) { + return; + } + + iova = ROUND_UP(section->offset_within_address_space, page_size); + llend = vhost_vdpa_section_end(section, page_mask); + if (int128_ge(int128_make64(iova), llend)) { + return; + } + + memory_region_ref(section->mr); + vaddr = memory_region_get_ram_ptr(section->mr) + + section->offset_within_region + + (iova - section->offset_within_address_space); + + llsize = int128_sub(llend, int128_make64(iova)); + + ret = vhost_vdpa_iommufd_container_dma_map(container, iova, int128_get64(llsize), + vaddr, section->readonly); + if (ret) { + qemu_log("vhost vdpa iommufd container dma map failed: %d\n", ret); + } +} + +static void vhost_vdpa_iommufd_container_region_del(MemoryListener *listener, + 
MemoryRegionSection *section) +{ + VDPAIOMMUFDContainer *container = container_of(listener, VDPAIOMMUFDContainer, listener); + hwaddr iova; + Int128 llend, llsize; + int page_size = qemu_target_page_size(); + int page_mask = -page_size; + int ret; + + if (vhost_vdpa_listener_skipped_section(section, 0, ULLONG_MAX, page_mask)) { + return; + } + + if (unlikely((section->offset_within_address_space & ~page_mask) != + (section->offset_within_region & ~page_mask))) { + return; + } + + iova = ROUND_UP(section->offset_within_address_space, page_size); + llend = vhost_vdpa_section_end(section, page_mask); + + if (int128_ge(int128_make64(iova), llend)) { + return; + } + + llsize = int128_sub(llend, int128_make64(iova)); + /* + * The unmap ioctl doesn't accept a full 64-bit. need to check it + */ + if (int128_eq(llsize, int128_2_64())) { + llsize = int128_rshift(llsize, 1); + ret = vhost_vdpa_iommufd_container_dma_unmap(container, iova, int128_get64(llsize)); + + if (ret) { + qemu_log("vhost vdpa iommufd container unmap failed(0x%" HWADDR_PRIx ", " + "0x%" HWADDR_PRIx ") = %d (%m)", iova, int128_get64(llsize), ret); + } + iova += int128_get64(llsize); + } + ret = vhost_vdpa_iommufd_container_dma_unmap(container, iova, int128_get64(llsize)); + + if (ret) { + qemu_log("vhost vdpa iommufd container unmap failed(0x%" HWADDR_PRIx ", " + "0x%" HWADDR_PRIx ") = %d (%m)", iova, int128_get64(llsize), ret); + } + + memory_region_unref(section->mr); +} + +/* + * IOTLB API used by vhost vdpa iommufd container + */ +const MemoryListener vhost_vdpa_iommufd_container_listener = { + .name = "vhost-vdpa-iommufd-container", + .region_add = vhost_vdpa_iommufd_container_region_add, + .region_del = vhost_vdpa_iommufd_container_region_del, +}; + static int vhost_vdpa_container_connect_iommufd(VDPAIOMMUFDContainer *container) { IOMMUFDBackend *iommufd = container->iommufd; @@ -87,6 +200,7 @@ static VDPAIOMMUFDContainer *vhost_vdpa_create_container(VhostVdpaDevice *vdev) container = g_new0(VDPAIOMMUFDContainer, 1); container->iommufd = vdev->iommufd; + container->listener = vhost_vdpa_iommufd_container_listener; QLIST_INIT(&container->hwpt_list); QLIST_INSERT_HEAD(&vdpa_container_list, container, next); @@ -213,11 +327,27 @@ static void vhost_vdpa_container_detach_device(VDPAIOMMUFDContainer *container, } } +static int vhost_vdpa_container_get_dev_count(VDPAIOMMUFDContainer *container) +{ + IOMMUFDHWPT *hwpt; + VhostVdpaDevice *dev; + int dev_count = 0; + + QLIST_FOREACH(hwpt, &container->hwpt_list, next) { + QLIST_FOREACH(dev, &hwpt->device_list, next) { + dev_count++; + } + } + + return dev_count; +} + int vhost_vdpa_attach_container(VhostVdpaDevice *vdev) { VDPAIOMMUFDContainer *container = NULL; IOMMUFDBackend *iommufd = vdev->iommufd; bool new_container = false; + int dev_count = 0; int ret = 0; if (!iommufd) { @@ -251,6 +381,12 @@ int vhost_vdpa_attach_container(VhostVdpaDevice *vdev) goto unbind; } + /* register the container memory listener when attaching the first device */ + dev_count = vhost_vdpa_container_get_dev_count(container); + if (dev_count == 1) { + memory_listener_register(&container->listener, &address_space_memory); + } + return 0; unbind: @@ -288,6 +424,7 @@ void vhost_vdpa_detach_container(VhostVdpaDevice *vdev) return; } /* No HWPT in this container, destroy it */ + memory_listener_unregister(&container->listener); vhost_vdpa_container_disconnect_iommufd(container); vhost_vdpa_destroy_container(container); diff --git a/hw/virtio/vdpa-dev.c b/hw/virtio/vdpa-dev.c index a6bd695724..b256ad540c 100644 
--- a/hw/virtio/vdpa-dev.c +++ b/hw/virtio/vdpa-dev.c @@ -136,9 +136,9 @@ static void vhost_vdpa_device_realize(DeviceState *dev, Error **errp) strerror(-ret)); goto free_vqs; } + } else { + memory_listener_register(&v->vdpa.listener, &address_space_memory); } - - memory_listener_register(&v->vdpa.listener, &address_space_memory); v->config_size = vhost_vdpa_device_get_u32(v->vhostfd, VHOST_VDPA_GET_CONFIG_SIZE, errp); diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index 4a8fc37851..b5fb89b98e 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -26,13 +26,14 @@ #include "qemu/main-loop.h" #include "trace.h" #include "qapi/error.h" +#include "hw/virtio/vdpa-dev-iommufd.h" /* * Return one past the end of the end of section. Be careful with uint64_t * conversions! */ -static Int128 vhost_vdpa_section_end(const MemoryRegionSection *section, - int page_mask) +Int128 vhost_vdpa_section_end(const MemoryRegionSection *section, + int page_mask) { Int128 llend = int128_make64(section->offset_within_address_space); llend = int128_add(llend, section->size); @@ -41,10 +42,10 @@ static Int128 vhost_vdpa_section_end(const MemoryRegionSection *section, return llend; } -static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section, - uint64_t iova_min, - uint64_t iova_max, - int page_mask) +bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section, + uint64_t iova_min, + uint64_t iova_max, + int page_mask) { Int128 llend; diff --git a/include/hw/virtio/vdpa-dev-iommufd.h b/include/hw/virtio/vdpa-dev-iommufd.h index dc14d9dd15..8e56647690 100644 --- a/include/hw/virtio/vdpa-dev-iommufd.h +++ b/include/hw/virtio/vdpa-dev-iommufd.h @@ -23,6 +23,7 @@ typedef struct IOMMUFDHWPT { } IOMMUFDHWPT; typedef struct VDPAIOMMUFDContainer { + MemoryListener listener; struct IOMMUFDBackend *iommufd; uint32_t ioas_id; QLIST_HEAD(, IOMMUFDHWPT) hwpt_list; diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h index ee255bc1bd..e32effc6e1 100644 --- a/include/hw/virtio/vhost-vdpa.h +++ b/include/hw/virtio/vhost-vdpa.h @@ -57,6 +57,13 @@ typedef struct vhost_vdpa { int vhost_vdpa_get_iova_range(int fd, struct vhost_vdpa_iova_range *iova_range); int vhost_vdpa_set_vring_ready(struct vhost_vdpa *v, unsigned idx); +Int128 vhost_vdpa_section_end(const MemoryRegionSection *section, + int page_mask); +bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section, + uint64_t iova_min, + uint64_t iova_max, + int page_mask); + int vhost_vdpa_dma_map(struct vhost_vdpa *v, uint32_t asid, hwaddr iova, hwaddr size, void *vaddr, bool readonly); int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, uint32_t asid, hwaddr iova, -- Gitee From ee97f42ea46a2527d19a3e87f33994d350959a90 Mon Sep 17 00:00:00 2001 From: eastmoutain <14304864+eastmoutain@user.noreply.gitee.com> Date: Mon, 20 May 2024 21:12:23 +0800 Subject: [PATCH 804/939] target/i386: csv: Release CSV3 shared pages after unmapping DMA The shared pages are created for Device DMA access, release them once DMA mapping is removed. 
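Sketch of the ordering this relies on in the KVM hypercall exit path (it mirrors
the kvm.c hunk below): the shared DMA mapping is torn down first, then the
now-unused shared pages are unpinned and dropped.

    if (enc) {   /* region flips back to encrypted/private */
        sev_remove_shared_regions_list(gfn_start, gfn_end);
        csv3_shared_region_dma_unmap(gpa, gfn_end << TARGET_PAGE_BITS);
        /* new: unpin via KVM_CSV3_HANDLE_MEMORY, then MADV_DONTNEED the
         * host mapping so the pages are really given back */
        csv3_shared_region_release(gpa, npages);
    }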
Signed-off-by: yangwencheng Signed-off-by: hanliyang --- linux-headers/linux/kvm.h | 9 +++++++++ target/i386/csv-sysemu-stub.c | 5 +++++ target/i386/csv.c | 34 ++++++++++++++++++++++++++++++++++ target/i386/csv.h | 1 + target/i386/kvm/kvm.c | 1 + 5 files changed, 50 insertions(+) diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h index 44a326fddc..a19683f1e9 100644 --- a/linux-headers/linux/kvm.h +++ b/linux-headers/linux/kvm.h @@ -2142,6 +2142,7 @@ enum csv3_cmd_id { KVM_CSV3_SEND_ENCRYPT_CONTEXT, KVM_CSV3_RECEIVE_ENCRYPT_DATA, KVM_CSV3_RECEIVE_ENCRYPT_CONTEXT, + KVM_CSV3_HANDLE_MEMORY, KVM_CSV3_SET_GUEST_PRIVATE_MEMORY = 0xc8, @@ -2190,6 +2191,14 @@ struct kvm_csv3_receive_encrypt_context { __u32 trans_len; }; +#define KVM_CSV3_RELEASE_SHARED_MEMORY (0x0001) + +struct kvm_csv3_handle_memory { + __u64 gpa; + __u32 num_pages; + __u32 opcode; +}; + #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) #define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) #define KVM_DEV_ASSIGN_MASK_INTX (1 << 2) diff --git a/target/i386/csv-sysemu-stub.c b/target/i386/csv-sysemu-stub.c index e49755da5c..735cce0e4b 100644 --- a/target/i386/csv-sysemu-stub.c +++ b/target/i386/csv-sysemu-stub.c @@ -40,6 +40,11 @@ void csv3_shared_region_dma_unmap(uint64_t start, uint64_t end) } +void csv3_shared_region_release(uint64_t gpa, uint32_t num_pages) +{ + +} + int csv3_set_guest_private_memory(Error **errp) { g_assert_not_reached(); diff --git a/target/i386/csv.c b/target/i386/csv.c index d9b50040a3..b229f7c317 100644 --- a/target/i386/csv.c +++ b/target/i386/csv.c @@ -270,6 +270,40 @@ end: return ret; } +void csv3_shared_region_release(uint64_t gpa, uint32_t num_pages) +{ + struct kvm_csv3_handle_memory mem = { 0 }; + MemoryRegion *mr = NULL; + void *hva; + int ret; + + if (!csv3_enabled()) + return; + + if (!gpa || !num_pages) + return; + + mem.gpa = (__u64)gpa; + mem.num_pages = (__u32)num_pages; + mem.opcode = (__u32)KVM_CSV3_RELEASE_SHARED_MEMORY; + + /* unpin the pages */ + ret = csv3_ioctl(KVM_CSV3_HANDLE_MEMORY, &mem, NULL); + if (ret <= 0) { + if (ret < 0) + error_report("%s: CSV3 unpin failed ret %d", __func__, ret); + return; + } + + /* drop the pages */ + hva = gpa2hva(&mr, gpa, num_pages << TARGET_PAGE_BITS, NULL); + if (hva) { + ret = madvise(hva, num_pages << TARGET_PAGE_BITS, MADV_DONTNEED); + if (ret) + error_report("%s: madvise failed %d", __func__, ret); + } +} + void csv3_shared_region_dma_unmap(uint64_t start, uint64_t end) { MemoryRegionSection section; diff --git a/target/i386/csv.h b/target/i386/csv.h index fb669279a8..70f9933d3b 100644 --- a/target/i386/csv.h +++ b/target/i386/csv.h @@ -124,6 +124,7 @@ int csv3_load_data(uint64_t gpa, uint8_t *ptr, uint64_t len, Error **errp); int csv3_shared_region_dma_map(uint64_t start, uint64_t end); void csv3_shared_region_dma_unmap(uint64_t start, uint64_t end); +void csv3_shared_region_release(uint64_t gpa, uint32_t num_pages); int csv3_load_incoming_page(QEMUFile *f, uint8_t *ptr); int csv3_load_incoming_context(QEMUFile *f); int csv3_queue_outgoing_page(uint8_t *ptr, uint32_t sz, uint64_t addr); diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index a867512822..2df3ff99c3 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -5099,6 +5099,7 @@ static int kvm_handle_exit_hypercall(X86CPU *cpu, struct kvm_run *run) if (enc) { sev_remove_shared_regions_list(gfn_start, gfn_end); csv3_shared_region_dma_unmap(gpa, gfn_end << TARGET_PAGE_BITS); + csv3_shared_region_release(gpa, npages); } else { sev_add_shared_regions_list(gfn_start, 
gfn_end); csv3_shared_region_dma_map(gpa, gfn_end << TARGET_PAGE_BITS); -- Gitee From 122a0daf78f540bb3595432acc33a749cc6ca5a4 Mon Sep 17 00:00:00 2001 From: Avihai Horon Date: Sun, 31 Dec 2023 11:30:10 +0200 Subject: [PATCH 805/939] migration/multifd: Fix error message in multifd_recv_initial_packet() commit c77b40859a5201f01b44dc475258405e289c431f upstream. In multifd_recv_initial_packet(), if MultiFDInit_t->id is greater than the configured number of multifd channels, an irrelevant error message about multifd version is printed. Change the error message to a relevant one about the channel id. Signed-off-by: Avihai Horon Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20231231093016.14204-6-avihaih@nvidia.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index f3bf6888c0..055b2688ad 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -229,8 +229,8 @@ static int multifd_recv_initial_packet(QIOChannel *c, Error **errp) } if (msg.id > migrate_multifd_channels()) { - error_setg(errp, "multifd: received channel version %u " - "expected %u", msg.version, MULTIFD_VERSION); + error_setg(errp, "multifd: received channel id %u is greater than " + "number of channels %u", msg.id, migrate_multifd_channels()); return -1; } -- Gitee From 9ec8c17e34afec47c8085a870e8dcfff36a9d3c7 Mon Sep 17 00:00:00 2001 From: Avihai Horon Date: Sun, 31 Dec 2023 11:30:11 +0200 Subject: [PATCH 806/939] migration/multifd: Simplify multifd_channel_connect() if else statement MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit a4395f5d3c06472ed70d9ef9f79878f95575be9e upstream. The else branch in multifd_channel_connect() is redundant because when the if branch is taken the function returns. Simplify the code by removing the else branch. Signed-off-by: Avihai Horon Reviewed-by: Philippe Mathieu-Daudé Link: https://lore.kernel.org/r/20231231093016.14204-7-avihaih@nvidia.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 055b2688ad..06585f0141 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -851,14 +851,13 @@ static bool multifd_channel_connect(MultiFDSendParams *p, * so we mustn't call multifd_send_thread until then */ return multifd_tls_channel_connect(p, ioc, errp); - - } else { - migration_ioc_register_yank(ioc); - p->registered_yank = true; - p->c = ioc; - qemu_thread_create(&p->thread, p->name, multifd_send_thread, p, - QEMU_THREAD_JOINABLE); } + + migration_ioc_register_yank(ioc); + p->registered_yank = true; + p->c = ioc; + qemu_thread_create(&p->thread, p->name, multifd_send_thread, p, + QEMU_THREAD_JOINABLE); return true; } -- Gitee From 313207b5d51f530b45f106addcf489845f32b449 Mon Sep 17 00:00:00 2001 From: Avihai Horon Date: Sun, 31 Dec 2023 11:30:12 +0200 Subject: [PATCH 807/939] migration/multifd: Fix leaking of Error in TLS error flow commit 6ae208ce9656114e428b1a75ac62a6761ed3216c upstream. If there is an error in multifd TLS handshake task, multifd_tls_outgoing_handshake() retrieves the error with qio_task_propagate_error() but never frees it. Fix it by freeing the obtained Error. In addition, the error is not reported at all, so report it with migrate_set_error(). 
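The ownership rule behind the fix: qio_task_propagate_error() hands the Error object to the caller, while migrate_set_error() only stores a copy, so the caller remains responsible for releasing its reference. A minimal sketch of that pattern, using a hypothetical completion callback name rather than the function touched by this patch:

    /* Hypothetical QIOTask completion callback for a failed handshake. */
    static void handshake_done(QIOTask *task, gpointer opaque)
    {
        Error *err = NULL;

        if (qio_task_propagate_error(task, &err)) {
            /* migrate_set_error() copies 'err' into the migration state, */
            migrate_set_error(migrate_get_current(), err);
            /* so the local reference must still be freed here. */
            error_free(err);
        }
    }
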
Fixes: 29647140157a ("migration/tls: add support for multifd tls-handshake") Signed-off-by: Avihai Horon Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20231231093016.14204-8-avihaih@nvidia.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/migration/multifd.c b/migration/multifd.c index 06585f0141..8221ebe4b6 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -791,6 +791,7 @@ static void multifd_tls_outgoing_handshake(QIOTask *task, trace_multifd_tls_outgoing_handshake_error(ioc, error_get_pretty(err)); + migrate_set_error(migrate_get_current(), err); /* * Error happen, mark multifd_send_thread status as 'quit' although it * is not created, and then tell who pay attention to me. @@ -798,6 +799,7 @@ static void multifd_tls_outgoing_handshake(QIOTask *task, p->quit = true; qemu_sem_post(&multifd_send_state->channels_ready); qemu_sem_post(&p->sem_sync); + error_free(err); } static void *multifd_tls_handshake_thread(void *opaque) -- Gitee From 1698ab2f40ef2bde3e7ee3175a5b5656589ce27d Mon Sep 17 00:00:00 2001 From: Avihai Horon Date: Sun, 31 Dec 2023 11:30:13 +0200 Subject: [PATCH 808/939] migration/multifd: Remove error_setg() in migration_ioc_process_incoming() commit 1d3886f837d8e972366a8b58ba8afb0e5efbeed7 upstream. If multifd_load_setup() fails in migration_ioc_process_incoming(), error_setg() is called with errp. This will lead to an assert because in that case errp already contains an error. Fix it by removing the redundant error_setg(). Fixes: 6720c2b32725 ("migration: check magic value for deciding the mapping of channels") Signed-off-by: Avihai Horon Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20231231093016.14204-9-avihaih@nvidia.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/migration.c | 1 - 1 file changed, 1 deletion(-) diff --git a/migration/migration.c b/migration/migration.c index dce22c2da5..5829565f9c 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -848,7 +848,6 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp) } if (multifd_load_setup(errp) != 0) { - error_setg(errp, "Failed to setup multifd channels"); return; } -- Gitee From 3a81455a093f3b06fd76d4964d0073c78ddbcc49 Mon Sep 17 00:00:00 2001 From: Avihai Horon Date: Sun, 31 Dec 2023 11:30:14 +0200 Subject: [PATCH 809/939] migration: Fix migration_channel_read_peek() error path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 4f8cf323e80c17f7d4b5604f1699591326df6262 upstream. migration_channel_read_peek() calls qio_channel_readv_full() and handles both cases of return value == 0 and return value < 0 the same way, by calling error_setg() with errp. However, if return value < 0, errp is already set, so calling error_setg() with errp will lead to an assert. Fix it by handling these cases separately, calling error_setg() with errp only in return value == 0 case. 
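The convention being applied is the standard Error API rule: when a callee that takes an Error **errp parameter fails, it has already filled *errp, and calling error_setg() on the same pointer again trips an assertion. A hedged caller-side sketch, with hypothetical names:

    /* Hypothetical helper: do_read() returns <0 on error (and sets *errp),
     * 0 when nothing was read, and >0 on success. */
    static int read_exact(Reader *r, void *buf, size_t len, Error **errp)
    {
        ssize_t n = do_read(r, buf, len, errp);

        if (n < 0) {
            /* *errp was already set by do_read(); just report failure. */
            return -1;
        }
        if (n == 0) {
            /* Only in this branch is it safe (and needed) to set the error. */
            error_setg(errp, "unexpected end of stream");
            return -1;
        }
        return 0;
    }
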
Fixes: 6720c2b32725 ("migration: check magic value for deciding the mapping of channels") Signed-off-by: Avihai Horon Reviewed-by: Fabiano Rosas Reviewed-by: Philippe Mathieu-Daudé Link: https://lore.kernel.org/r/20231231093016.14204-10-avihaih@nvidia.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/channel.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/migration/channel.c b/migration/channel.c index ca3319a309..f9de064f3b 100644 --- a/migration/channel.c +++ b/migration/channel.c @@ -117,9 +117,12 @@ int migration_channel_read_peek(QIOChannel *ioc, len = qio_channel_readv_full(ioc, &iov, 1, NULL, NULL, QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp); - if (len <= 0 && len != QIO_CHANNEL_ERR_BLOCK) { - error_setg(errp, - "Failed to peek at channel"); + if (len < 0 && len != QIO_CHANNEL_ERR_BLOCK) { + return -1; + } + + if (len == 0) { + error_setg(errp, "Failed to peek at channel"); return -1; } -- Gitee From c707a4d1339d572942b79a1b6440cbe487ab2b81 Mon Sep 17 00:00:00 2001 From: Avihai Horon Date: Sun, 31 Dec 2023 11:30:16 +0200 Subject: [PATCH 810/939] migration/multifd: Remove unnecessary usage of local Error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 3fc58efa938338a82e4d5c0c031e7e9c98e9544f upstream. According to Error API, usage of ERRP_GUARD() or a local Error instead of errp is needed if errp is passed to void functions, where it is later dereferenced to see if an error occurred. There are several places in multifd.c that use local Error although it is not needed. Change these places to use errp directly. Signed-off-by: Avihai Horon Reviewed-by: Philippe Mathieu-Daudé Link: https://lore.kernel.org/r/20231231093016.14204-12-avihaih@nvidia.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 8221ebe4b6..f5991bc746 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -955,12 +955,10 @@ int multifd_save_setup(Error **errp) for (i = 0; i < thread_count; i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; - Error *local_err = NULL; int ret; - ret = multifd_send_state->ops->send_setup(p, &local_err); + ret = multifd_send_state->ops->send_setup(p, errp); if (ret) { - error_propagate(errp, local_err); return ret; } } @@ -1199,12 +1197,10 @@ int multifd_load_setup(Error **errp) for (i = 0; i < thread_count; i++) { MultiFDRecvParams *p = &multifd_recv_state->params[i]; - Error *local_err = NULL; int ret; - ret = multifd_recv_state->ops->recv_setup(p, &local_err); + ret = multifd_recv_state->ops->recv_setup(p, errp); if (ret) { - error_propagate(errp, local_err); return ret; } } -- Gitee From d6e061a269348d6d559be65a816cc0404501d86a Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 4 Jan 2024 11:21:38 -0300 Subject: [PATCH 811/939] migration/multifd: Remove MultiFDPages_t::packet_num commit dca1bc7f24d2fa227f0b787f85f3cc67006e67bf upstream. This was introduced by commit 34c55a94b1 ("migration: Create multipage support") and never used. 
Signed-off-by: Fabiano Rosas Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20240104142144.9680-2-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 1 - migration/multifd.h | 2 -- 2 files changed, 3 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index f5991bc746..3ea204cac8 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -251,7 +251,6 @@ static void multifd_pages_clear(MultiFDPages_t *pages) { pages->num = 0; pages->allocated = 0; - pages->packet_num = 0; pages->block = NULL; g_free(pages->offset); pages->offset = NULL; diff --git a/migration/multifd.h b/migration/multifd.h index a835643b48..b0ff610c37 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -58,8 +58,6 @@ typedef struct { uint32_t num; /* number of allocated pages */ uint32_t allocated; - /* global number of generated multifd packets */ - uint64_t packet_num; /* offset of each page */ ram_addr_t *offset; RAMBlock *block; -- Gitee From d7823b26d0d983402a16b3568543bac7bb5c7f34 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 4 Jan 2024 11:21:39 -0300 Subject: [PATCH 812/939] migration/multifd: Remove QEMUFile from where it is not needed commit 9346fa1870784c70618bfd5a9e1f1da89de0c5ec upstream. Signed-off-by: Fabiano Rosas Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20240104142144.9680-3-farosas@suse.de Signed-off-by: Peter Xu Conflicts: migration/ram.c [jz: resolve context conflict due to BQL name, qemu_mutex_lock_iothread() hasn't renamed to bql_lock() yet] Signed-off-by: Jason Zeng --- migration/multifd.c | 12 ++++++------ migration/multifd.h | 4 ++-- migration/ram.c | 15 +++++++-------- 3 files changed, 15 insertions(+), 16 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 3ea204cac8..3e5aaaa1d4 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -391,7 +391,7 @@ struct { * false. 
*/ -static int multifd_send_pages(QEMUFile *f) +static int multifd_send_pages(void) { int i; static int next_channel; @@ -437,7 +437,7 @@ static int multifd_send_pages(QEMUFile *f) return 1; } -int multifd_queue_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset) +int multifd_queue_page(RAMBlock *block, ram_addr_t offset) { MultiFDPages_t *pages = multifd_send_state->pages; bool changed = false; @@ -457,12 +457,12 @@ int multifd_queue_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset) changed = true; } - if (multifd_send_pages(f) < 0) { + if (multifd_send_pages() < 0) { return -1; } if (changed) { - return multifd_queue_page(f, block, offset); + return multifd_queue_page(block, offset); } return 1; @@ -584,7 +584,7 @@ static int multifd_zero_copy_flush(QIOChannel *c) return ret; } -int multifd_send_sync_main(QEMUFile *f) +int multifd_send_sync_main(void) { int i; bool flush_zero_copy; @@ -593,7 +593,7 @@ int multifd_send_sync_main(QEMUFile *f) return 0; } if (multifd_send_state->pages->num) { - if (multifd_send_pages(f) < 0) { + if (multifd_send_pages() < 0) { error_report("%s: multifd_send_pages fail", __func__); return -1; } diff --git a/migration/multifd.h b/migration/multifd.h index b0ff610c37..35d11f103c 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -21,8 +21,8 @@ void multifd_load_shutdown(void); bool multifd_recv_all_channels_created(void); void multifd_recv_new_channel(QIOChannel *ioc, Error **errp); void multifd_recv_sync_main(void); -int multifd_send_sync_main(QEMUFile *f); -int multifd_queue_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset); +int multifd_send_sync_main(void); +int multifd_queue_page(RAMBlock *block, ram_addr_t offset); /* Multifd Compression flags */ #define MULTIFD_FLAG_SYNC (1 << 0) diff --git a/migration/ram.c b/migration/ram.c index f1ff38cf39..67fa9c83d6 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -1387,10 +1387,9 @@ static int ram_save_page(RAMState *rs, PageSearchStatus *pss) return pages; } -static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block, - ram_addr_t offset) +static int ram_save_multifd_page(RAMBlock *block, ram_addr_t offset) { - if (multifd_queue_page(file, block, offset) < 0) { + if (multifd_queue_page(block, offset) < 0) { return -1; } stat64_add(&mig_stats.normal_pages, 1); @@ -1473,7 +1472,7 @@ static int find_dirty_block(RAMState *rs, PageSearchStatus *pss) if (migrate_multifd() && !migrate_multifd_flush_after_each_section()) { QEMUFile *f = rs->pss[RAM_CHANNEL_PRECOPY].pss_channel; - int ret = multifd_send_sync_main(f); + int ret = multifd_send_sync_main(); if (ret < 0) { return ret; } @@ -2265,7 +2264,7 @@ static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss) * still see partially copied pages which is data corruption. 
*/ if (migrate_multifd() && !migration_in_postcopy()) { - return ram_save_multifd_page(pss->pss_channel, block, offset); + return ram_save_multifd_page(block, offset); } return ram_save_page(rs, pss); @@ -3434,7 +3433,7 @@ static int ram_save_setup(QEMUFile *f, void *opaque) migration_ops->ram_save_target_page = ram_save_target_page_legacy; qemu_mutex_unlock_iothread(); - ret = multifd_send_sync_main(f); + ret = multifd_send_sync_main(); qemu_mutex_lock_iothread(); if (ret < 0) { return ret; @@ -3558,7 +3557,7 @@ out: if (ret >= 0 && migration_is_setup_or_active(migrate_get_current()->state)) { if (migrate_multifd() && migrate_multifd_flush_after_each_section()) { - ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel); + ret = multifd_send_sync_main(); if (ret < 0) { return ret; } @@ -3654,7 +3653,7 @@ static int ram_save_complete(QEMUFile *f, void *opaque) } } - ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel); + ret = multifd_send_sync_main(); if (ret < 0) { return ret; } -- Gitee From 61e0a1ad97ca72ea4396d142bdfd7481b9380d6c Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 4 Jan 2024 11:21:40 -0300 Subject: [PATCH 813/939] migration/multifd: Change multifd_pages_init argument commit 6074f81625800743e4c374aecf7dd30774aaf6e0 upstream. The 'size' argument is actually the number of pages that fit in a multifd packet. Change it to uint32_t and rename. Signed-off-by: Fabiano Rosas Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20240104142144.9680-4-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 3e5aaaa1d4..ef7d4520c4 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -237,12 +237,12 @@ static int multifd_recv_initial_packet(QIOChannel *c, Error **errp) return msg.id; } -static MultiFDPages_t *multifd_pages_init(size_t size) +static MultiFDPages_t *multifd_pages_init(uint32_t n) { MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1); - pages->allocated = size; - pages->offset = g_new0(ram_addr_t, size); + pages->allocated = n; + pages->offset = g_new0(ram_addr_t, n); return pages; } -- Gitee From 8235f51444f1147a36733474278476d7de83d545 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 4 Jan 2024 11:21:41 -0300 Subject: [PATCH 814/939] migration: Report error in incoming migration commit e3b8ad5c13714cca5e3fc1445472171fbcd469bc upstream. We're not currently reporting the errors set with migrate_set_error() when incoming migration fails. 
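The hunk that follows reads the stored error under the migration state's error_mutex, using WITH_QEMU_LOCK_GUARD() so the mutex is held only for the enclosed block. As a rough sketch of the same locking pattern around a hypothetical structure (not the code added by this patch):

    /* Hypothetical state object protected by a QemuMutex. */
    typedef struct {
        QemuMutex lock;
        Error *last_error;
    } LoadState;

    static void report_pending_error(LoadState *ls)
    {
        WITH_QEMU_LOCK_GUARD(&ls->lock) {
            if (ls->last_error) {
                /* error_report_err() prints and then frees the Error. */
                error_report_err(ls->last_error);
                ls->last_error = NULL;
            }
        }
    }
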
Signed-off-by: Fabiano Rosas Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20240104142144.9680-5-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/migration.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/migration/migration.c b/migration/migration.c index 5829565f9c..2c5258d0b0 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -698,6 +698,13 @@ process_incoming_migration_co(void *opaque) } if (ret < 0) { + MigrationState *s = migrate_get_current(); + + if (migrate_has_error(s)) { + WITH_QEMU_LOCK_GUARD(&s->error_mutex) { + error_report_err(s->error); + } + } error_report("load of migration failed: %s", strerror(-ret)); goto fail; } -- Gitee From 20c8c77ba5e362b1bfada691b2242648d3626d5d Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 4 Jan 2024 11:21:42 -0300 Subject: [PATCH 815/939] tests/qtest/migration: Print migration incoming errors commit 679a7382a389875c0f7835a1a409ebf4859f8410 upstream. We're currently just asserting when incoming migration fails. Let's print the error message from QMP as well. Signed-off-by: Fabiano Rosas Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20240104142144.9680-6-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- tests/qtest/migration-helpers.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/qtest/migration-helpers.c b/tests/qtest/migration-helpers.c index 24fb7b3525..f1106128a9 100644 --- a/tests/qtest/migration-helpers.c +++ b/tests/qtest/migration-helpers.c @@ -118,6 +118,12 @@ void migrate_incoming_qmp(QTestState *to, const char *uri, const char *fmt, ...) rsp = qtest_qmp(to, "{ 'execute': 'migrate-incoming', 'arguments': %p}", args); + + if (!qdict_haskey(rsp, "return")) { + g_autoptr(GString) s = qobject_to_json_pretty(QOBJECT(rsp), true); + g_test_message("%s", s->str); + } + g_assert(qdict_haskey(rsp, "return")); qobject_unref(rsp); -- Gitee From d78a7031877a343563200e875c4ef2d71522f1d0 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 4 Jan 2024 11:21:43 -0300 Subject: [PATCH 816/939] tests/qtest/migration: Add a wrapper to print test names commit e33b6712dba206547a313a6f2608b0fd967ee558 upstream. Our usage of gtest results in us losing the very basic functionality of "knowing which test failed". The issue is that gtest only prints test names ("paths" in gtest parlance) once the test has finished, but we use asserts in the tests and crash gtest itself before it can print anything. We also use a final abort when the result of g_test_run is not 0. Depending on how the test failed/broke we can see the function that trigged the abort, which may be representative of the test, but it could also just be some generic function. We have been relying on the primitive method of looking at the name of the previous successful test and then looking at the code to figure out which test should have come next. Add a wrapper to the test registration that does the job of printing the test name before running. 
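Outside of QEMU's qtest helpers, the same idea can be demonstrated with plain GLib: register each test through a small wrapper that logs the test path before invoking the real function, so even a test that aborts mid-way has already identified itself. A self-contained sketch under that assumption (the names are illustrative, not the ones this patch adds):

    #include <glib.h>

    typedef struct {
        char *name;
        void (*func)(void);
    } NamedTest;

    static void named_test_run(gconstpointer data)
    {
        const NamedTest *t = data;

        /* Print the path before running, so a crash still names the test. */
        g_test_message("Running %s", t->name);
        t->func();
    }

    static void named_test_free(gpointer data)
    {
        NamedTest *t = data;

        g_free(t->name);
        g_free(t);
    }

    static void named_test_add(const char *path, void (*fn)(void))
    {
        NamedTest *t = g_new0(NamedTest, 1);

        t->name = g_strdup(path);
        t->func = fn;
        g_test_add_data_func_full(path, t, named_test_run, named_test_free);
    }

    static void test_example(void)
    {
        g_assert_cmpint(1 + 1, ==, 2);
    }

    int main(int argc, char **argv)
    {
        g_test_init(&argc, &argv, NULL);
        named_test_add("/demo/example", test_example);
        return g_test_run();
    }
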
Signed-off-by: Fabiano Rosas Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20240104142144.9680-7-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- tests/qtest/migration-helpers.c | 32 ++++++++++++++++++++++++++++++++ tests/qtest/migration-helpers.h | 1 + 2 files changed, 33 insertions(+) diff --git a/tests/qtest/migration-helpers.c b/tests/qtest/migration-helpers.c index f1106128a9..164e09c299 100644 --- a/tests/qtest/migration-helpers.c +++ b/tests/qtest/migration-helpers.c @@ -298,3 +298,35 @@ char *resolve_machine_version(const char *alias, const char *var1, return find_common_machine_version(machine_name, var1, var2); } + +typedef struct { + char *name; + void (*func)(void); +} MigrationTest; + +static void migration_test_destroy(gpointer data) +{ + MigrationTest *test = (MigrationTest *)data; + + g_free(test->name); + g_free(test); +} + +static void migration_test_wrapper(const void *data) +{ + MigrationTest *test = (MigrationTest *)data; + + g_test_message("Running /%s%s", qtest_get_arch(), test->name); + test->func(); +} + +void migration_test_add(const char *path, void (*fn)(void)) +{ + MigrationTest *test = g_new0(MigrationTest, 1); + + test->func = fn; + test->name = g_strdup(path); + + qtest_add_data_func_full(path, test, migration_test_wrapper, + migration_test_destroy); +} diff --git a/tests/qtest/migration-helpers.h b/tests/qtest/migration-helpers.h index e31dc85cc7..0d9a02edc7 100644 --- a/tests/qtest/migration-helpers.h +++ b/tests/qtest/migration-helpers.h @@ -47,4 +47,5 @@ char *find_common_machine_version(const char *mtype, const char *var1, const char *var2); char *resolve_machine_version(const char *alias, const char *var1, const char *var2); +void migration_test_add(const char *path, void (*fn)(void)); #endif /* MIGRATION_HELPERS_H */ -- Gitee From a26a1ea993f48dbccd0fee3812b7535531b1cc14 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 4 Jan 2024 11:21:44 -0300 Subject: [PATCH 817/939] tests/qtest/migration: Use the new migration_test_add commit 6f0771de903bb7623dc85bbf9f94f641979daaaa upstream. Replace the tests registration with the new function that prints tests names. 
Signed-off-by: Fabiano Rosas Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20240104142144.9680-8-farosas@suse.de Signed-off-by: Peter Xu Conflicts: tests/qtest/migration-test.c [jz: resolve context conflicts due to live-suspend which is not backported] Signed-off-by: Jason Zeng --- tests/qtest/migration-test.c | 202 ++++++++++++++++++----------------- 1 file changed, 104 insertions(+), 98 deletions(-) diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c index 0fbaa6a90f..470b06bbb4 100644 --- a/tests/qtest/migration-test.c +++ b/tests/qtest/migration-test.c @@ -3339,62 +3339,64 @@ int main(int argc, char **argv) module_call_init(MODULE_INIT_QOM); if (has_uffd) { - qtest_add_func("/migration/postcopy/plain", test_postcopy); - qtest_add_func("/migration/postcopy/recovery/plain", - test_postcopy_recovery); - qtest_add_func("/migration/postcopy/preempt/plain", test_postcopy_preempt); - qtest_add_func("/migration/postcopy/preempt/recovery/plain", - test_postcopy_preempt_recovery); + migration_test_add("/migration/postcopy/plain", test_postcopy); + migration_test_add("/migration/postcopy/recovery/plain", + test_postcopy_recovery); + migration_test_add("/migration/postcopy/preempt/plain", + test_postcopy_preempt); + migration_test_add("/migration/postcopy/preempt/recovery/plain", + test_postcopy_preempt_recovery); if (getenv("QEMU_TEST_FLAKY_TESTS")) { - qtest_add_func("/migration/postcopy/compress/plain", - test_postcopy_compress); - qtest_add_func("/migration/postcopy/recovery/compress/plain", - test_postcopy_recovery_compress); + migration_test_add("/migration/postcopy/compress/plain", + test_postcopy_compress); + migration_test_add("/migration/postcopy/recovery/compress/plain", + test_postcopy_recovery_compress); } #ifndef _WIN32 - qtest_add_func("/migration/postcopy/recovery/double-failures", - test_postcopy_recovery_double_fail); + migration_test_add("/migration/postcopy/recovery/double-failures", + test_postcopy_recovery_double_fail); #endif /* _WIN32 */ - } - qtest_add_func("/migration/bad_dest", test_baddest); + migration_test_add("/migration/bad_dest", test_baddest); #ifndef _WIN32 if (!g_str_equal(arch, "s390x")) { - qtest_add_func("/migration/analyze-script", test_analyze_script); + migration_test_add("/migration/analyze-script", test_analyze_script); } #endif - qtest_add_func("/migration/precopy/unix/plain", test_precopy_unix_plain); - qtest_add_func("/migration/precopy/unix/xbzrle", test_precopy_unix_xbzrle); + migration_test_add("/migration/precopy/unix/plain", + test_precopy_unix_plain); + migration_test_add("/migration/precopy/unix/xbzrle", + test_precopy_unix_xbzrle); /* * Compression fails from time to time. * Put test here but don't enable it until everything is fixed. 
*/ if (getenv("QEMU_TEST_FLAKY_TESTS")) { - qtest_add_func("/migration/precopy/unix/compress/wait", - test_precopy_unix_compress); - qtest_add_func("/migration/precopy/unix/compress/nowait", - test_precopy_unix_compress_nowait); + migration_test_add("/migration/precopy/unix/compress/wait", + test_precopy_unix_compress); + migration_test_add("/migration/precopy/unix/compress/nowait", + test_precopy_unix_compress_nowait); } - qtest_add_func("/migration/precopy/file", - test_precopy_file); - qtest_add_func("/migration/precopy/file/offset", - test_precopy_file_offset); - qtest_add_func("/migration/precopy/file/offset/bad", - test_precopy_file_offset_bad); + migration_test_add("/migration/precopy/file", + test_precopy_file); + migration_test_add("/migration/precopy/file/offset", + test_precopy_file_offset); + migration_test_add("/migration/precopy/file/offset/bad", + test_precopy_file_offset_bad); /* * Our CI system has problems with shared memory. * Don't run this test until we find a workaround. */ if (getenv("QEMU_TEST_FLAKY_TESTS")) { - qtest_add_func("/migration/mode/reboot", test_mode_reboot); + migration_test_add("/migration/mode/reboot", test_mode_reboot); } #ifdef CONFIG_GNUTLS - qtest_add_func("/migration/precopy/unix/tls/psk", - test_precopy_unix_tls_psk); + migration_test_add("/migration/precopy/unix/tls/psk", + test_precopy_unix_tls_psk); if (has_uffd) { /* @@ -3402,110 +3404,114 @@ int main(int argc, char **argv) * channels are tested under precopy. Here what we want to test is the * general postcopy path that has TLS channel enabled. */ - qtest_add_func("/migration/postcopy/tls/psk", test_postcopy_tls_psk); - qtest_add_func("/migration/postcopy/recovery/tls/psk", - test_postcopy_recovery_tls_psk); - qtest_add_func("/migration/postcopy/preempt/tls/psk", - test_postcopy_preempt_tls_psk); - qtest_add_func("/migration/postcopy/preempt/recovery/tls/psk", - test_postcopy_preempt_all); + migration_test_add("/migration/postcopy/tls/psk", + test_postcopy_tls_psk); + migration_test_add("/migration/postcopy/recovery/tls/psk", + test_postcopy_recovery_tls_psk); + migration_test_add("/migration/postcopy/preempt/tls/psk", + test_postcopy_preempt_tls_psk); + migration_test_add("/migration/postcopy/preempt/recovery/tls/psk", + test_postcopy_preempt_all); } #ifdef CONFIG_TASN1 - qtest_add_func("/migration/precopy/unix/tls/x509/default-host", - test_precopy_unix_tls_x509_default_host); - qtest_add_func("/migration/precopy/unix/tls/x509/override-host", - test_precopy_unix_tls_x509_override_host); + migration_test_add("/migration/precopy/unix/tls/x509/default-host", + test_precopy_unix_tls_x509_default_host); + migration_test_add("/migration/precopy/unix/tls/x509/override-host", + test_precopy_unix_tls_x509_override_host); #endif /* CONFIG_TASN1 */ #endif /* CONFIG_GNUTLS */ - qtest_add_func("/migration/precopy/tcp/plain", test_precopy_tcp_plain); + migration_test_add("/migration/precopy/tcp/plain", test_precopy_tcp_plain); - qtest_add_func("/migration/precopy/tcp/plain/switchover-ack", - test_precopy_tcp_switchover_ack); + migration_test_add("/migration/precopy/tcp/plain/switchover-ack", + test_precopy_tcp_switchover_ack); #ifdef CONFIG_GNUTLS - qtest_add_func("/migration/precopy/tcp/tls/psk/match", - test_precopy_tcp_tls_psk_match); - qtest_add_func("/migration/precopy/tcp/tls/psk/mismatch", - test_precopy_tcp_tls_psk_mismatch); + migration_test_add("/migration/precopy/tcp/tls/psk/match", + test_precopy_tcp_tls_psk_match); + migration_test_add("/migration/precopy/tcp/tls/psk/mismatch", + 
test_precopy_tcp_tls_psk_mismatch); #ifdef CONFIG_TASN1 - qtest_add_func("/migration/precopy/tcp/tls/x509/default-host", - test_precopy_tcp_tls_x509_default_host); - qtest_add_func("/migration/precopy/tcp/tls/x509/override-host", - test_precopy_tcp_tls_x509_override_host); - qtest_add_func("/migration/precopy/tcp/tls/x509/mismatch-host", - test_precopy_tcp_tls_x509_mismatch_host); - qtest_add_func("/migration/precopy/tcp/tls/x509/friendly-client", - test_precopy_tcp_tls_x509_friendly_client); - qtest_add_func("/migration/precopy/tcp/tls/x509/hostile-client", - test_precopy_tcp_tls_x509_hostile_client); - qtest_add_func("/migration/precopy/tcp/tls/x509/allow-anon-client", - test_precopy_tcp_tls_x509_allow_anon_client); - qtest_add_func("/migration/precopy/tcp/tls/x509/reject-anon-client", - test_precopy_tcp_tls_x509_reject_anon_client); + migration_test_add("/migration/precopy/tcp/tls/x509/default-host", + test_precopy_tcp_tls_x509_default_host); + migration_test_add("/migration/precopy/tcp/tls/x509/override-host", + test_precopy_tcp_tls_x509_override_host); + migration_test_add("/migration/precopy/tcp/tls/x509/mismatch-host", + test_precopy_tcp_tls_x509_mismatch_host); + migration_test_add("/migration/precopy/tcp/tls/x509/friendly-client", + test_precopy_tcp_tls_x509_friendly_client); + migration_test_add("/migration/precopy/tcp/tls/x509/hostile-client", + test_precopy_tcp_tls_x509_hostile_client); + migration_test_add("/migration/precopy/tcp/tls/x509/allow-anon-client", + test_precopy_tcp_tls_x509_allow_anon_client); + migration_test_add("/migration/precopy/tcp/tls/x509/reject-anon-client", + test_precopy_tcp_tls_x509_reject_anon_client); #endif /* CONFIG_TASN1 */ #endif /* CONFIG_GNUTLS */ - /* qtest_add_func("/migration/ignore_shared", test_ignore_shared); */ + /* migration_test_add("/migration/ignore_shared", test_ignore_shared); */ #ifndef _WIN32 - qtest_add_func("/migration/fd_proto", test_migrate_fd_proto); + migration_test_add("/migration/fd_proto", test_migrate_fd_proto); #endif - qtest_add_func("/migration/validate_uuid", test_validate_uuid); - qtest_add_func("/migration/validate_uuid_error", test_validate_uuid_error); - qtest_add_func("/migration/validate_uuid_src_not_set", - test_validate_uuid_src_not_set); - qtest_add_func("/migration/validate_uuid_dst_not_set", - test_validate_uuid_dst_not_set); + migration_test_add("/migration/validate_uuid", test_validate_uuid); + migration_test_add("/migration/validate_uuid_error", + test_validate_uuid_error); + migration_test_add("/migration/validate_uuid_src_not_set", + test_validate_uuid_src_not_set); + migration_test_add("/migration/validate_uuid_dst_not_set", + test_validate_uuid_dst_not_set); /* * See explanation why this test is slow on function definition */ if (g_test_slow()) { - qtest_add_func("/migration/auto_converge", test_migrate_auto_converge); + migration_test_add("/migration/auto_converge", + test_migrate_auto_converge); if (g_str_equal(arch, "x86_64") && has_kvm && kvm_dirty_ring_supported()) { - qtest_add_func("/migration/dirty_limit", test_migrate_dirty_limit); + migration_test_add("/migration/dirty_limit", + test_migrate_dirty_limit); } } - qtest_add_func("/migration/multifd/tcp/plain/none", - test_multifd_tcp_none); + migration_test_add("/migration/multifd/tcp/plain/none", + test_multifd_tcp_none); /* * This test is flaky and sometimes fails in CI and otherwise: * don't run unless user opts in via environment variable. 
*/ if (getenv("QEMU_TEST_FLAKY_TESTS")) { - qtest_add_func("/migration/multifd/tcp/plain/cancel", - test_multifd_tcp_cancel); + migration_test_add("/migration/multifd/tcp/plain/cancel", + test_multifd_tcp_cancel); } - qtest_add_func("/migration/multifd/tcp/plain/zlib", - test_multifd_tcp_zlib); + migration_test_add("/migration/multifd/tcp/plain/zlib", + test_multifd_tcp_zlib); #ifdef CONFIG_ZSTD - qtest_add_func("/migration/multifd/tcp/plain/zstd", - test_multifd_tcp_zstd); + migration_test_add("/migration/multifd/tcp/plain/zstd", + test_multifd_tcp_zstd); #endif #ifdef CONFIG_GNUTLS - qtest_add_func("/migration/multifd/tcp/tls/psk/match", - test_multifd_tcp_tls_psk_match); - qtest_add_func("/migration/multifd/tcp/tls/psk/mismatch", - test_multifd_tcp_tls_psk_mismatch); + migration_test_add("/migration/multifd/tcp/tls/psk/match", + test_multifd_tcp_tls_psk_match); + migration_test_add("/migration/multifd/tcp/tls/psk/mismatch", + test_multifd_tcp_tls_psk_mismatch); #ifdef CONFIG_TASN1 - qtest_add_func("/migration/multifd/tcp/tls/x509/default-host", - test_multifd_tcp_tls_x509_default_host); - qtest_add_func("/migration/multifd/tcp/tls/x509/override-host", - test_multifd_tcp_tls_x509_override_host); - qtest_add_func("/migration/multifd/tcp/tls/x509/mismatch-host", - test_multifd_tcp_tls_x509_mismatch_host); - qtest_add_func("/migration/multifd/tcp/tls/x509/allow-anon-client", - test_multifd_tcp_tls_x509_allow_anon_client); - qtest_add_func("/migration/multifd/tcp/tls/x509/reject-anon-client", - test_multifd_tcp_tls_x509_reject_anon_client); + migration_test_add("/migration/multifd/tcp/tls/x509/default-host", + test_multifd_tcp_tls_x509_default_host); + migration_test_add("/migration/multifd/tcp/tls/x509/override-host", + test_multifd_tcp_tls_x509_override_host); + migration_test_add("/migration/multifd/tcp/tls/x509/mismatch-host", + test_multifd_tcp_tls_x509_mismatch_host); + migration_test_add("/migration/multifd/tcp/tls/x509/allow-anon-client", + test_multifd_tcp_tls_x509_allow_anon_client); + migration_test_add("/migration/multifd/tcp/tls/x509/reject-anon-client", + test_multifd_tcp_tls_x509_reject_anon_client); #endif /* CONFIG_TASN1 */ #endif /* CONFIG_GNUTLS */ if (g_str_equal(arch, "x86_64") && has_kvm && kvm_dirty_ring_supported()) { - qtest_add_func("/migration/dirty_ring", - test_precopy_unix_dirty_ring); - qtest_add_func("/migration/vcpu_dirty_limit", - test_vcpu_dirty_limit); + migration_test_add("/migration/dirty_ring", + test_precopy_unix_dirty_ring); + migration_test_add("/migration/vcpu_dirty_limit", + test_vcpu_dirty_limit); } ret = g_test_run(); -- Gitee From eea4f476c2c35e4153637d5efe25ce308c2aaa55 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Wed, 11 Oct 2023 15:46:04 -0300 Subject: [PATCH 818/939] tests/qtest: Re-enable multifd cancel test commit 75b1f88cd2dd5eeb1fd817a2f3a291c2670f9c50 upstream. We've found the source of flakiness in this test, so re-enable it. 
Reviewed-by: Juan Quintela Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20230606144551.24367-4-farosas@suse.de [peterx: rebase to 2a61a6964c, to use migration_test_add()] Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- tests/qtest/migration-test.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c index 470b06bbb4..13888be898 100644 --- a/tests/qtest/migration-test.c +++ b/tests/qtest/migration-test.c @@ -3474,14 +3474,8 @@ int main(int argc, char **argv) } migration_test_add("/migration/multifd/tcp/plain/none", test_multifd_tcp_none); - /* - * This test is flaky and sometimes fails in CI and otherwise: - * don't run unless user opts in via environment variable. - */ - if (getenv("QEMU_TEST_FLAKY_TESTS")) { - migration_test_add("/migration/multifd/tcp/plain/cancel", - test_multifd_tcp_cancel); - } + migration_test_add("/migration/multifd/tcp/plain/cancel", + test_multifd_tcp_cancel); migration_test_add("/migration/multifd/tcp/plain/zlib", test_multifd_tcp_zlib); #ifdef CONFIG_ZSTD -- Gitee From 830cfda7df1e63448c916492ce6be497511d6fb7 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 9 Jan 2024 14:46:19 +0800 Subject: [PATCH 819/939] docs/migration: Create migration/ directory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 8cb2f8b172e74a7279fabb5d5c20aee32b5b98cd upstream. Migration documentation is growing into a single file too large. Create a sub-directory for it for a split. We also already have separate vfio/virtio documentations, move it all over into the directory. Note that the virtio one is still not yet converted to rST. That is a job for later. Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Alex Williamson Cc: Cédric Le Goater Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/r/20240109064628.595453-2-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- docs/devel/index-internals.rst | 2 +- docs/devel/{migration.rst => migration/main.rst} | 0 docs/devel/{vfio-migration.rst => migration/vfio.rst} | 0 docs/devel/{virtio-migration.txt => migration/virtio.txt} | 0 4 files changed, 1 insertion(+), 1 deletion(-) rename docs/devel/{migration.rst => migration/main.rst} (100%) rename docs/devel/{vfio-migration.rst => migration/vfio.rst} (100%) rename docs/devel/{virtio-migration.txt => migration/virtio.txt} (100%) diff --git a/docs/devel/index-internals.rst b/docs/devel/index-internals.rst index 3def4a138b..a41d62c1eb 100644 --- a/docs/devel/index-internals.rst +++ b/docs/devel/index-internals.rst @@ -11,7 +11,7 @@ Details about QEMU's various subsystems including how to add features to them. 
block-coroutine-wrapper clocks ebpf_rss - migration + migration/main multi-process reset s390-cpu-topology diff --git a/docs/devel/migration.rst b/docs/devel/migration/main.rst similarity index 100% rename from docs/devel/migration.rst rename to docs/devel/migration/main.rst diff --git a/docs/devel/vfio-migration.rst b/docs/devel/migration/vfio.rst similarity index 100% rename from docs/devel/vfio-migration.rst rename to docs/devel/migration/vfio.rst diff --git a/docs/devel/virtio-migration.txt b/docs/devel/migration/virtio.txt similarity index 100% rename from docs/devel/virtio-migration.txt rename to docs/devel/migration/virtio.txt -- Gitee From d91782d895b71e416f66bc7e42797d50699839bb Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 9 Jan 2024 14:46:20 +0800 Subject: [PATCH 820/939] docs/migration: Create index page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit f6bbac985e6df492f2c6be94fb893ada75ffdefa upstream. Create an index page for migration module. Move VFIO migration there too. A trivial touch-up on the title to use lower case there. Since then we'll have "migration" as the top title, make the main doc file renamed to "migration framework". Cc: Alex Williamson Cc: Cédric Le Goater Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/r/20240109064628.595453-3-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- docs/devel/index-internals.rst | 3 +-- docs/devel/migration/index.rst | 11 +++++++++++ docs/devel/migration/main.rst | 6 +++--- docs/devel/migration/vfio.rst | 2 +- 4 files changed, 16 insertions(+), 6 deletions(-) create mode 100644 docs/devel/migration/index.rst diff --git a/docs/devel/index-internals.rst b/docs/devel/index-internals.rst index a41d62c1eb..5636e9cf1d 100644 --- a/docs/devel/index-internals.rst +++ b/docs/devel/index-internals.rst @@ -11,13 +11,12 @@ Details about QEMU's various subsystems including how to add features to them. block-coroutine-wrapper clocks ebpf_rss - migration/main + migration/index multi-process reset s390-cpu-topology s390-dasd-ipl tracing - vfio-migration vfio-iommufd writing-monitor-commands virtio-backends diff --git a/docs/devel/migration/index.rst b/docs/devel/migration/index.rst new file mode 100644 index 0000000000..02cfdcc969 --- /dev/null +++ b/docs/devel/migration/index.rst @@ -0,0 +1,11 @@ +Migration +========= + +This is the main entry for QEMU migration documentations. It explains how +QEMU live migration works. + +.. toctree:: + :maxdepth: 2 + + main + vfio diff --git a/docs/devel/migration/main.rst b/docs/devel/migration/main.rst index ec55089b25..82cdb420bf 100644 --- a/docs/devel/migration/main.rst +++ b/docs/devel/migration/main.rst @@ -1,6 +1,6 @@ -========= -Migration -========= +=================== +Migration framework +=================== QEMU has code to load/save the state of the guest that it is running. These are two complementary operations. 
Saving the state just does diff --git a/docs/devel/migration/vfio.rst b/docs/devel/migration/vfio.rst index 605fe60e96..c49482eab6 100644 --- a/docs/devel/migration/vfio.rst +++ b/docs/devel/migration/vfio.rst @@ -1,5 +1,5 @@ ===================== -VFIO device Migration +VFIO device migration ===================== Migration of virtual machine involves saving the state for each device that -- Gitee From 689a0e1d7e3fea78bc90ded9b17ccbf66b5e91ad Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 9 Jan 2024 14:46:21 +0800 Subject: [PATCH 821/939] docs/migration: Convert virtio.txt into rST MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 4d7a691bcfeb5580e3f7457e1f1c2fbd64572161 upstream. Convert the plain old .txt into .rst, add it into migration/index.rst. Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/r/20240109064628.595453-4-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- docs/devel/migration/index.rst | 1 + .../migration/{virtio.txt => virtio.rst} | 139 +++++++++--------- 2 files changed, 74 insertions(+), 66 deletions(-) rename docs/devel/migration/{virtio.txt => virtio.rst} (37%) diff --git a/docs/devel/migration/index.rst b/docs/devel/migration/index.rst index 02cfdcc969..2cb701c77c 100644 --- a/docs/devel/migration/index.rst +++ b/docs/devel/migration/index.rst @@ -9,3 +9,4 @@ QEMU live migration works. main vfio + virtio diff --git a/docs/devel/migration/virtio.txt b/docs/devel/migration/virtio.rst similarity index 37% rename from docs/devel/migration/virtio.txt rename to docs/devel/migration/virtio.rst index 98a6b0ffb5..611a18b821 100644 --- a/docs/devel/migration/virtio.txt +++ b/docs/devel/migration/virtio.rst @@ -1,5 +1,6 @@ -Virtio devices and migration -============================ +======================= +Virtio device migration +======================= Copyright 2015 IBM Corp. @@ -8,91 +9,97 @@ the COPYING file in the top-level directory. Saving and restoring the state of virtio devices is a bit of a twisty maze, for several reasons: + - state is distributed between several parts: + - virtio core, for common fields like features, number of queues, ... + - virtio transport (pci, ccw, ...), for the different proxy devices and transport specific state (msix vectors, indicators, ...) + - virtio device (net, blk, ...), for the different device types and their state (mac address, request queue, ...) + - most fields are saved via the stream interface; subsequently, subsections have been added to make cross-version migration possible This file attempts to document the current procedure and point out some caveats. 
- Save state procedure ==================== -virtio core virtio transport virtio device ------------ ---------------- ------------- - - save() function registered - via VMState wrapper on - device class -virtio_save() <---------- - ------> save_config() - - save proxy device - - save transport-specific - device fields -- save common device - fields -- save common virtqueue - fields - ------> save_queue() - - save transport-specific - virtqueue fields - ------> save_device() - - save device-specific - fields -- save subsections - - device endianness, - if changed from - default endianness - - 64 bit features, if - any high feature bit - is set - - virtio-1 virtqueue - fields, if VERSION_1 - is set +:: + virtio core virtio transport virtio device + ----------- ---------------- ------------- + + save() function registered + via VMState wrapper on + device class + virtio_save() <---------- + ------> save_config() + - save proxy device + - save transport-specific + device fields + - save common device + fields + - save common virtqueue + fields + ------> save_queue() + - save transport-specific + virtqueue fields + ------> save_device() + - save device-specific + fields + - save subsections + - device endianness, + if changed from + default endianness + - 64 bit features, if + any high feature bit + is set + - virtio-1 virtqueue + fields, if VERSION_1 + is set Load state procedure ==================== -virtio core virtio transport virtio device ------------ ---------------- ------------- - - load() function registered - via VMState wrapper on - device class -virtio_load() <---------- - ------> load_config() - - load proxy device - - load transport-specific - device fields -- load common device - fields -- load common virtqueue - fields - ------> load_queue() - - load transport-specific - virtqueue fields -- notify guest - ------> load_device() - - load device-specific - fields -- load subsections - - device endianness - - 64 bit features - - virtio-1 virtqueue - fields -- sanitize endianness -- sanitize features -- virtqueue index sanity - check - - feature-dependent setup +:: + virtio core virtio transport virtio device + ----------- ---------------- ------------- + + load() function registered + via VMState wrapper on + device class + virtio_load() <---------- + ------> load_config() + - load proxy device + - load transport-specific + device fields + - load common device + fields + - load common virtqueue + fields + ------> load_queue() + - load transport-specific + virtqueue fields + - notify guest + ------> load_device() + - load device-specific + fields + - load subsections + - device endianness + - 64 bit features + - virtio-1 virtqueue + fields + - sanitize endianness + - sanitize features + - virtqueue index sanity + check + - feature-dependent setup Implications of this setup ========================== -- Gitee From ed43780ea13b581be42a154890bdcc8e58919dd9 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 9 Jan 2024 14:46:22 +0800 Subject: [PATCH 822/939] docs/migration: Split "Backwards compatibility" separately MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 6cc6a7b98b88f1a7d1d5ed99db0d373a46606aac upstream. Split the section from main.rst into a separate file. Reference it in the index.rst. 
Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/r/20240109064628.595453-5-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- docs/devel/migration/compatibility.rst | 517 ++++++++++++++++++++++++ docs/devel/migration/index.rst | 1 + docs/devel/migration/main.rst | 519 ------------------------- 3 files changed, 518 insertions(+), 519 deletions(-) create mode 100644 docs/devel/migration/compatibility.rst diff --git a/docs/devel/migration/compatibility.rst b/docs/devel/migration/compatibility.rst new file mode 100644 index 0000000000..5a5417ef06 --- /dev/null +++ b/docs/devel/migration/compatibility.rst @@ -0,0 +1,517 @@ +Backwards compatibility +======================= + +How backwards compatibility works +--------------------------------- + +When we do migration, we have two QEMU processes: the source and the +target. There are two cases, they are the same version or they are +different versions. The easy case is when they are the same version. +The difficult one is when they are different versions. + +There are two things that are different, but they have very similar +names and sometimes get confused: + +- QEMU version +- machine type version + +Let's start with a practical example, we start with: + +- qemu-system-x86_64 (v5.2), from now on qemu-5.2. +- qemu-system-x86_64 (v5.1), from now on qemu-5.1. + +Related to this are the "latest" machine types defined on each of +them: + +- pc-q35-5.2 (newer one in qemu-5.2) from now on pc-5.2 +- pc-q35-5.1 (newer one in qemu-5.1) from now on pc-5.1 + +First of all, migration is only supposed to work if you use the same +machine type in both source and destination. The QEMU hardware +configuration needs to be the same also on source and destination. +Most aspects of the backend configuration can be changed at will, +except for a few cases where the backend features influence frontend +device feature exposure. But that is not relevant for this section. + +I am going to list the number of combinations that we can have. Let's +start with the trivial ones, QEMU is the same on source and +destination: + +1 - qemu-5.2 -M pc-5.2 -> migrates to -> qemu-5.2 -M pc-5.2 + + This is the latest QEMU with the latest machine type. + This have to work, and if it doesn't work it is a bug. + +2 - qemu-5.1 -M pc-5.1 -> migrates to -> qemu-5.1 -M pc-5.1 + + Exactly the same case than the previous one, but for 5.1. + Nothing to see here either. + +This are the easiest ones, we will not talk more about them in this +section. + +Now we start with the more interesting cases. Consider the case where +we have the same QEMU version in both sides (qemu-5.2) but we are using +the latest machine type for that version (pc-5.2) but one of an older +QEMU version, in this case pc-5.1. + +3 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1 + + It needs to use the definition of pc-5.1 and the devices as they + were configured on 5.1, but this should be easy in the sense that + both sides are the same QEMU and both sides have exactly the same + idea of what the pc-5.1 machine is. + +4 - qemu-5.1 -M pc-5.2 -> migrates to -> qemu-5.1 -M pc-5.2 + + This combination is not possible as the qemu-5.1 doesn't understand + pc-5.2 machine type. So nothing to worry here. + +Now it comes the interesting ones, when both QEMU processes are +different. Notice also that the machine type needs to be pc-5.1, +because we have the limitation than qemu-5.1 doesn't know pc-5.2. 
So +the possible cases are: + +5 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.1 -M pc-5.1 + + This migration is known as newer to older. We need to make sure + when we are developing 5.2 we need to take care about not to break + migration to qemu-5.1. Notice that we can't make updates to + qemu-5.1 to understand whatever qemu-5.2 decides to change, so it is + in qemu-5.2 side to make the relevant changes. + +6 - qemu-5.1 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1 + + This migration is known as older to newer. We need to make sure + than we are able to receive migrations from qemu-5.1. The problem is + similar to the previous one. + +If qemu-5.1 and qemu-5.2 were the same, there will not be any +compatibility problems. But the reason that we create qemu-5.2 is to +get new features, devices, defaults, etc. + +If we get a device that has a new feature, or change a default value, +we have a problem when we try to migrate between different QEMU +versions. + +So we need a way to tell qemu-5.2 that when we are using machine type +pc-5.1, it needs to **not** use the feature, to be able to migrate to +real qemu-5.1. + +And the equivalent part when migrating from qemu-5.1 to qemu-5.2. +qemu-5.2 has to expect that it is not going to get data for the new +feature, because qemu-5.1 doesn't know about it. + +How do we tell QEMU about these device feature changes? In +hw/core/machine.c:hw_compat_X_Y arrays. + +If we change a default value, we need to put back the old value on +that array. And the device, during initialization needs to look at +that array to see what value it needs to get for that feature. And +what are we going to put in that array, the value of a property. + +To create a property for a device, we need to use one of the +DEFINE_PROP_*() macros. See include/hw/qdev-properties.h to find the +macros that exist. With it, we set the default value for that +property, and that is what it is going to get in the latest released +version. But if we want a different value for a previous version, we +can change that in the hw_compat_X_Y arrays. + +hw_compat_X_Y is an array of registers that have the format: + +- name_device +- name_property +- value + +Let's see a practical example. + +In qemu-5.2 virtio-blk-device got multi queue support. This is a +change that is not backward compatible. In qemu-5.1 it has one +queue. In qemu-5.2 it has the same number of queues as the number of +cpus in the system. + +When we are doing migration, if we migrate from a device that has 4 +queues to a device that have only one queue, we don't know where to +put the extra information for the other 3 queues, and we fail +migration. + +Similar problem when we migrate from qemu-5.1 that has only one queue +to qemu-5.2, we only sent information for one queue, but destination +has 4, and we have 3 queues that are not properly initialized and +anything can happen. + +So, how can we address this problem. Easy, just convince qemu-5.2 +that when it is running pc-5.1, it needs to set the number of queues +for virtio-blk-devices to 1. + +That way we fix the cases 5 and 6. + +5 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.1 -M pc-5.1 + + qemu-5.2 -M pc-5.1 sets number of queues to be 1. + qemu-5.1 -M pc-5.1 expects number of queues to be 1. + + correct. migration works. + +6 - qemu-5.1 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1 + + qemu-5.1 -M pc-5.1 sets number of queues to be 1. + qemu-5.2 -M pc-5.1 expects number of queues to be 1. + + correct. migration works. + +And now the other interesting case, case 3. 
In this case we have: + +3 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1 + + Here we have the same QEMU in both sides. So it doesn't matter a + lot if we have set the number of queues to 1 or not, because + they are the same. + + WRONG! + + Think what happens if we do one of this double migrations: + + A -> migrates -> B -> migrates -> C + + where: + + A: qemu-5.1 -M pc-5.1 + B: qemu-5.2 -M pc-5.1 + C: qemu-5.2 -M pc-5.1 + + migration A -> B is case 6, so number of queues needs to be 1. + + migration B -> C is case 3, so we don't care. But actually we + care because we haven't started the guest in qemu-5.2, it came + migrated from qemu-5.1. So to be in the safe place, we need to + always use number of queues 1 when we are using pc-5.1. + +Now, how was this done in reality? The following commit shows how it +was done:: + + commit 9445e1e15e66c19e42bea942ba810db28052cd05 + Author: Stefan Hajnoczi + Date: Tue Aug 18 15:33:47 2020 +0100 + + virtio-blk-pci: default num_queues to -smp N + +The relevant parts for migration are:: + + @@ -1281,7 +1284,8 @@ static Property virtio_blk_properties[] = { + #endif + DEFINE_PROP_BIT("request-merging", VirtIOBlock, conf.request_merging, 0, + true), + - DEFINE_PROP_UINT16("num-queues", VirtIOBlock, conf.num_queues, 1), + + DEFINE_PROP_UINT16("num-queues", VirtIOBlock, conf.num_queues, + + VIRTIO_BLK_AUTO_NUM_QUEUES), + DEFINE_PROP_UINT16("queue-size", VirtIOBlock, conf.queue_size, 256), + +It changes the default value of num_queues. But it fishes it for old +machine types to have the right value:: + + @@ -31,6 +31,7 @@ + GlobalProperty hw_compat_5_1[] = { + ... + + { "virtio-blk-device", "num-queues", "1"}, + ... + }; + +A device with different features on both sides +---------------------------------------------- + +Let's assume that we are using the same QEMU binary on both sides, +just to make the things easier. But we have a device that has +different features on both sides of the migration. That can be +because the devices are different, because the kernel driver of both +devices have different features, whatever. + +How can we get this to work with migration. The way to do that is +"theoretically" easy. You have to get the features that the device +has in the source of the migration. The features that the device has +on the target of the migration, you get the intersection of the +features of both sides, and that is the way that you should launch +QEMU. + +Notice that this is not completely related to QEMU. The most +important thing here is that this should be handled by the managing +application that launches QEMU. If QEMU is configured correctly, the +migration will succeed. + +That said, actually doing it is complicated. Almost all devices are +bad at being able to be launched with only some features enabled. +With one big exception: cpus. + +You can read the documentation for QEMU x86 cpu models here: + +https://qemu-project.gitlab.io/qemu/system/qemu-cpu-models.html + +See when they talk about migration they recommend that one chooses the +newest cpu model that is supported for all cpus. + +Let's say that we have: + +Host A: + +Device X has the feature Y + +Host B: + +Device X has not the feature Y + +If we try to migrate without any care from host A to host B, it will +fail because when migration tries to load the feature Y on +destination, it will find that the hardware is not there. 
+ +Doing this would be the equivalent of doing with cpus: + +Host A: + +$ qemu-system-x86_64 -cpu host + +Host B: + +$ qemu-system-x86_64 -cpu host + +When both hosts have different cpu features this is guaranteed to +fail. Especially if Host B has less features than host A. If host A +has less features than host B, sometimes it works. Important word of +last sentence is "sometimes". + +So, forgetting about cpu models and continuing with the -cpu host +example, let's see that the differences of the cpus is that Host A and +B have the following features: + +Features: 'pcid' 'stibp' 'taa-no' +Host A: X X +Host B: X + +And we want to migrate between them, the way configure both QEMU cpu +will be: + +Host A: + +$ qemu-system-x86_64 -cpu host,pcid=off,stibp=off + +Host B: + +$ qemu-system-x86_64 -cpu host,taa-no=off + +And you would be able to migrate between them. It is responsibility +of the management application or of the user to make sure that the +configuration is correct. QEMU doesn't know how to look at this kind +of features in general. + +Notice that we don't recommend to use -cpu host for migration. It is +used in this example because it makes the example simpler. + +Other devices have worse control about individual features. If they +want to be able to migrate between hosts that show different features, +the device needs a way to configure which ones it is going to use. + +In this section we have considered that we are using the same QEMU +binary in both sides of the migration. If we use different QEMU +versions process, then we need to have into account all other +differences and the examples become even more complicated. + +How to mitigate when we have a backward compatibility error +----------------------------------------------------------- + +We broke migration for old machine types continuously during +development. But as soon as we find that there is a problem, we fix +it. The problem is what happens when we detect after we have done a +release that something has gone wrong. + +Let see how it worked with one example. + +After the release of qemu-8.0 we found a problem when doing migration +of the machine type pc-7.2. + +- $ qemu-7.2 -M pc-7.2 -> qemu-7.2 -M pc-7.2 + + This migration works + +- $ qemu-8.0 -M pc-7.2 -> qemu-8.0 -M pc-7.2 + + This migration works + +- $ qemu-8.0 -M pc-7.2 -> qemu-7.2 -M pc-7.2 + + This migration fails + +- $ qemu-7.2 -M pc-7.2 -> qemu-8.0 -M pc-7.2 + + This migration fails + +So clearly something fails when migration between qemu-7.2 and +qemu-8.0 with machine type pc-7.2. The error messages, and git bisect +pointed to this commit. + +In qemu-8.0 we got this commit:: + + commit 010746ae1db7f52700cb2e2c46eb94f299cfa0d2 + Author: Jonathan Cameron + Date: Thu Mar 2 13:37:02 2023 +0000 + + hw/pci/aer: Implement PCI_ERR_UNCOR_MASK register + + +The relevant bits of the commit for our example are this ones:: + + --- a/hw/pci/pcie_aer.c + +++ b/hw/pci/pcie_aer.c + @@ -112,6 +112,10 @@ int pcie_aer_init(PCIDevice *dev, + + pci_set_long(dev->w1cmask + offset + PCI_ERR_UNCOR_STATUS, + PCI_ERR_UNC_SUPPORTED); + + pci_set_long(dev->config + offset + PCI_ERR_UNCOR_MASK, + + PCI_ERR_UNC_MASK_DEFAULT); + + pci_set_long(dev->wmask + offset + PCI_ERR_UNCOR_MASK, + + PCI_ERR_UNC_SUPPORTED); + + pci_set_long(dev->config + offset + PCI_ERR_UNCOR_SEVER, + PCI_ERR_UNC_SEVERITY_DEFAULT); + +The patch changes how we configure PCI space for AER. But QEMU fails +when the PCI space configuration is different between source and +destination. 
+
+The following commit shows how this got fixed::
+
+  commit 5ed3dabe57dd9f4c007404345e5f5bf0e347317f
+  Author: Leonardo Bras
+  Date:   Tue May 2 21:27:02 2023 -0300
+
+  hw/pci: Disable PCI_ERR_UNCOR_MASK register for machine type < 8.0
+
+  [...]
+
+The relevant parts of the fix in QEMU are as follows:
+
+First, we create a new property for the device to be able to configure
+the old behaviour or the new behaviour::
+
+  diff --git a/hw/pci/pci.c b/hw/pci/pci.c
+  index 8a87ccc8b0..5153ad63d6 100644
+  --- a/hw/pci/pci.c
+  +++ b/hw/pci/pci.c
+  @@ -79,6 +79,8 @@ static Property pci_props[] = {
+       DEFINE_PROP_STRING("failover_pair_id", PCIDevice,
+                          failover_pair_id),
+       DEFINE_PROP_UINT32("acpi-index", PCIDevice, acpi_index, 0),
+  +    DEFINE_PROP_BIT("x-pcie-err-unc-mask", PCIDevice, cap_present,
+  +                    QEMU_PCIE_ERR_UNC_MASK_BITNR, true),
+       DEFINE_PROP_END_OF_LIST()
+   };
+
+Notice that we enable the feature for new machine types.
+
+Now we see how the fix is done. This is going to depend on what kind
+of breakage happens, but in this case it is quite simple::
+
+  diff --git a/hw/pci/pcie_aer.c b/hw/pci/pcie_aer.c
+  index 103667c368..374d593ead 100644
+  --- a/hw/pci/pcie_aer.c
+  +++ b/hw/pci/pcie_aer.c
+  @@ -112,10 +112,13 @@ int pcie_aer_init(PCIDevice *dev, uint8_t cap_ver,
+  uint16_t offset,
+
+       pci_set_long(dev->w1cmask + offset + PCI_ERR_UNCOR_STATUS,
+                    PCI_ERR_UNC_SUPPORTED);
+  -    pci_set_long(dev->config + offset + PCI_ERR_UNCOR_MASK,
+  -                 PCI_ERR_UNC_MASK_DEFAULT);
+  -    pci_set_long(dev->wmask + offset + PCI_ERR_UNCOR_MASK,
+  -                 PCI_ERR_UNC_SUPPORTED);
+  +
+  +    if (dev->cap_present & QEMU_PCIE_ERR_UNC_MASK) {
+  +        pci_set_long(dev->config + offset + PCI_ERR_UNCOR_MASK,
+  +                     PCI_ERR_UNC_MASK_DEFAULT);
+  +        pci_set_long(dev->wmask + offset + PCI_ERR_UNCOR_MASK,
+  +                     PCI_ERR_UNC_SUPPORTED);
+  +    }
+
+       pci_set_long(dev->config + offset + PCI_ERR_UNCOR_SEVER,
+                    PCI_ERR_UNC_SEVERITY_DEFAULT);
+
+I.e. if the property bit is enabled, we configure it as we did for
+qemu-8.0. If the property bit is not set, we configure it as it was
+in 7.2.
+
+And now, all that is missing is to disable the feature for old
+machine types::
+
+  diff --git a/hw/core/machine.c b/hw/core/machine.c
+  index 47a34841a5..07f763eb2e 100644
+  --- a/hw/core/machine.c
+  +++ b/hw/core/machine.c
+  @@ -48,6 +48,7 @@ GlobalProperty hw_compat_7_2[] = {
+       { "e1000e", "migrate-timadj", "off" },
+       { "virtio-mem", "x-early-migration", "false" },
+       { "migration", "x-preempt-pre-7-2", "true" },
+  +    { TYPE_PCI_DEVICE, "x-pcie-err-unc-mask", "off" },
+   };
+   const size_t hw_compat_7_2_len = G_N_ELEMENTS(hw_compat_7_2);
+
+And now, when qemu-8.0.1 is released with this fix, all combinations
+are going to work as expected.
+
+- $ qemu-7.2 -M pc-7.2 -> qemu-7.2 -M pc-7.2 (works)
+- $ qemu-8.0.1 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 (works)
+- $ qemu-8.0.1 -M pc-7.2 -> qemu-7.2 -M pc-7.2 (works)
+- $ qemu-7.2 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 (works)
+
+So normality has been restored and everything is ok, no?
+
+Not really, now our matrix is much bigger. We started with the easy
+cases, migration from the same version to the same version always
+works:
+
+- $ qemu-7.2 -M pc-7.2 -> qemu-7.2 -M pc-7.2
+- $ qemu-8.0 -M pc-7.2 -> qemu-8.0 -M pc-7.2
+- $ qemu-8.0.1 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2
+
+Now the interesting ones, when the QEMU versions are different. For
+the first set, they fail and we can do nothing: both versions are
+released and we can't change anything.
+
+- $ qemu-7.2 -M pc-7.2 -> qemu-8.0 -M pc-7.2
+- $ qemu-8.0 -M pc-7.2 -> qemu-7.2 -M pc-7.2
+
+These two are the ones that work. The whole point of making the
+change in the qemu-8.0.1 release was to fix this issue:
+
+- $ qemu-7.2 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2
+- $ qemu-8.0.1 -M pc-7.2 -> qemu-7.2 -M pc-7.2
+
+But now we find that qemu-8.0 can migrate to neither qemu-7.2 nor
+qemu-8.0.1.
+
+- $ qemu-8.0 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2
+- $ qemu-8.0.1 -M pc-7.2 -> qemu-8.0 -M pc-7.2
+
+So, if we start a pc-7.2 machine in qemu-8.0 we can't migrate it to
+anything except to qemu-8.0.
+
+Can we do better?
+
+Yep. If we know that we are going to do this migration:
+
+- $ qemu-8.0 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2
+
+We can launch the appropriate devices with::
+
+  --device...,x-pcie-err-unc-mask=on
+
+And now we can receive a migration from 8.0. And from now on, we can
+do that migration to newer QEMU versions if we remember to enable
+that property for pc-7.2. Notice that we need to remember this; it
+is not enough to know that the source of the migration is qemu-8.0.
+Think of this example:
+
+$ qemu-8.0 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 -> qemu-8.2 -M pc-7.2
+
+In the second migration, the source is not qemu-8.0, but we still
+have that "problem" and have that property enabled. Notice that we
+need to keep this mark/property until the machine has been rebooted.
+And it can't be a normal reboot (which doesn't reload QEMU); we need
+the machine to be powered off and powered on again on a fixed QEMU.
+From then on we can use the proper real machine.
diff --git a/docs/devel/migration/index.rst b/docs/devel/migration/index.rst
index 2cb701c77c..7fc02b9520 100644
--- a/docs/devel/migration/index.rst
+++ b/docs/devel/migration/index.rst
@@ -8,5 +8,6 @@ QEMU live migration works.
    :maxdepth: 2
 
    main
+   compatibility
    vfio
    virtio
diff --git a/docs/devel/migration/main.rst b/docs/devel/migration/main.rst
index 82cdb420bf..04194414af 100644
--- a/docs/devel/migration/main.rst
+++ b/docs/devel/migration/main.rst
@@ -993,522 +993,3 @@ In some cases it may be best to tie specific firmware versions to specific
 versioned machine types to cut down on the combinations that will need
 support. This is also useful when newer versions of firmware outgrow
 the padding.
-
-
-Backwards compatibility
-=======================
-
-How backwards compatibility works
----------------------------------
-
-When we do migration, we have two QEMU processes: the source and the
-target. There are two cases, they are the same version or they are
-different versions. The easy case is when they are the same version.
-The difficult one is when they are different versions.
-
-There are two things that are different, but they have very similar
-names and sometimes get confused:
-
-- QEMU version
-- machine type version
-
-Let's start with a practical example, we start with:
-
-- qemu-system-x86_64 (v5.2), from now on qemu-5.2.
-- qemu-system-x86_64 (v5.1), from now on qemu-5.1.
-
-Related to this are the "latest" machine types defined on each of
-them:
-
-- pc-q35-5.2 (newer one in qemu-5.2) from now on pc-5.2
-- pc-q35-5.1 (newer one in qemu-5.1) from now on pc-5.1
-
-First of all, migration is only supposed to work if you use the same
-machine type in both source and destination. The QEMU hardware
-configuration needs to be the same also on source and destination.
-Most aspects of the backend configuration can be changed at will,
-except for a few cases where the backend features influence frontend
-device feature exposure.
But that is not relevant for this section. - -I am going to list the number of combinations that we can have. Let's -start with the trivial ones, QEMU is the same on source and -destination: - -1 - qemu-5.2 -M pc-5.2 -> migrates to -> qemu-5.2 -M pc-5.2 - - This is the latest QEMU with the latest machine type. - This have to work, and if it doesn't work it is a bug. - -2 - qemu-5.1 -M pc-5.1 -> migrates to -> qemu-5.1 -M pc-5.1 - - Exactly the same case than the previous one, but for 5.1. - Nothing to see here either. - -This are the easiest ones, we will not talk more about them in this -section. - -Now we start with the more interesting cases. Consider the case where -we have the same QEMU version in both sides (qemu-5.2) but we are using -the latest machine type for that version (pc-5.2) but one of an older -QEMU version, in this case pc-5.1. - -3 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1 - - It needs to use the definition of pc-5.1 and the devices as they - were configured on 5.1, but this should be easy in the sense that - both sides are the same QEMU and both sides have exactly the same - idea of what the pc-5.1 machine is. - -4 - qemu-5.1 -M pc-5.2 -> migrates to -> qemu-5.1 -M pc-5.2 - - This combination is not possible as the qemu-5.1 doesn't understand - pc-5.2 machine type. So nothing to worry here. - -Now it comes the interesting ones, when both QEMU processes are -different. Notice also that the machine type needs to be pc-5.1, -because we have the limitation than qemu-5.1 doesn't know pc-5.2. So -the possible cases are: - -5 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.1 -M pc-5.1 - - This migration is known as newer to older. We need to make sure - when we are developing 5.2 we need to take care about not to break - migration to qemu-5.1. Notice that we can't make updates to - qemu-5.1 to understand whatever qemu-5.2 decides to change, so it is - in qemu-5.2 side to make the relevant changes. - -6 - qemu-5.1 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1 - - This migration is known as older to newer. We need to make sure - than we are able to receive migrations from qemu-5.1. The problem is - similar to the previous one. - -If qemu-5.1 and qemu-5.2 were the same, there will not be any -compatibility problems. But the reason that we create qemu-5.2 is to -get new features, devices, defaults, etc. - -If we get a device that has a new feature, or change a default value, -we have a problem when we try to migrate between different QEMU -versions. - -So we need a way to tell qemu-5.2 that when we are using machine type -pc-5.1, it needs to **not** use the feature, to be able to migrate to -real qemu-5.1. - -And the equivalent part when migrating from qemu-5.1 to qemu-5.2. -qemu-5.2 has to expect that it is not going to get data for the new -feature, because qemu-5.1 doesn't know about it. - -How do we tell QEMU about these device feature changes? In -hw/core/machine.c:hw_compat_X_Y arrays. - -If we change a default value, we need to put back the old value on -that array. And the device, during initialization needs to look at -that array to see what value it needs to get for that feature. And -what are we going to put in that array, the value of a property. - -To create a property for a device, we need to use one of the -DEFINE_PROP_*() macros. See include/hw/qdev-properties.h to find the -macros that exist. With it, we set the default value for that -property, and that is what it is going to get in the latest released -version. 
But if we want a different value for a previous version, we -can change that in the hw_compat_X_Y arrays. - -hw_compat_X_Y is an array of registers that have the format: - -- name_device -- name_property -- value - -Let's see a practical example. - -In qemu-5.2 virtio-blk-device got multi queue support. This is a -change that is not backward compatible. In qemu-5.1 it has one -queue. In qemu-5.2 it has the same number of queues as the number of -cpus in the system. - -When we are doing migration, if we migrate from a device that has 4 -queues to a device that have only one queue, we don't know where to -put the extra information for the other 3 queues, and we fail -migration. - -Similar problem when we migrate from qemu-5.1 that has only one queue -to qemu-5.2, we only sent information for one queue, but destination -has 4, and we have 3 queues that are not properly initialized and -anything can happen. - -So, how can we address this problem. Easy, just convince qemu-5.2 -that when it is running pc-5.1, it needs to set the number of queues -for virtio-blk-devices to 1. - -That way we fix the cases 5 and 6. - -5 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.1 -M pc-5.1 - - qemu-5.2 -M pc-5.1 sets number of queues to be 1. - qemu-5.1 -M pc-5.1 expects number of queues to be 1. - - correct. migration works. - -6 - qemu-5.1 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1 - - qemu-5.1 -M pc-5.1 sets number of queues to be 1. - qemu-5.2 -M pc-5.1 expects number of queues to be 1. - - correct. migration works. - -And now the other interesting case, case 3. In this case we have: - -3 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1 - - Here we have the same QEMU in both sides. So it doesn't matter a - lot if we have set the number of queues to 1 or not, because - they are the same. - - WRONG! - - Think what happens if we do one of this double migrations: - - A -> migrates -> B -> migrates -> C - - where: - - A: qemu-5.1 -M pc-5.1 - B: qemu-5.2 -M pc-5.1 - C: qemu-5.2 -M pc-5.1 - - migration A -> B is case 6, so number of queues needs to be 1. - - migration B -> C is case 3, so we don't care. But actually we - care because we haven't started the guest in qemu-5.2, it came - migrated from qemu-5.1. So to be in the safe place, we need to - always use number of queues 1 when we are using pc-5.1. - -Now, how was this done in reality? The following commit shows how it -was done:: - - commit 9445e1e15e66c19e42bea942ba810db28052cd05 - Author: Stefan Hajnoczi - Date: Tue Aug 18 15:33:47 2020 +0100 - - virtio-blk-pci: default num_queues to -smp N - -The relevant parts for migration are:: - - @@ -1281,7 +1284,8 @@ static Property virtio_blk_properties[] = { - #endif - DEFINE_PROP_BIT("request-merging", VirtIOBlock, conf.request_merging, 0, - true), - - DEFINE_PROP_UINT16("num-queues", VirtIOBlock, conf.num_queues, 1), - + DEFINE_PROP_UINT16("num-queues", VirtIOBlock, conf.num_queues, - + VIRTIO_BLK_AUTO_NUM_QUEUES), - DEFINE_PROP_UINT16("queue-size", VirtIOBlock, conf.queue_size, 256), - -It changes the default value of num_queues. But it fishes it for old -machine types to have the right value:: - - @@ -31,6 +31,7 @@ - GlobalProperty hw_compat_5_1[] = { - ... - + { "virtio-blk-device", "num-queues", "1"}, - ... - }; - -A device with different features on both sides ----------------------------------------------- - -Let's assume that we are using the same QEMU binary on both sides, -just to make the things easier. But we have a device that has -different features on both sides of the migration. 
That can be -because the devices are different, because the kernel driver of both -devices have different features, whatever. - -How can we get this to work with migration. The way to do that is -"theoretically" easy. You have to get the features that the device -has in the source of the migration. The features that the device has -on the target of the migration, you get the intersection of the -features of both sides, and that is the way that you should launch -QEMU. - -Notice that this is not completely related to QEMU. The most -important thing here is that this should be handled by the managing -application that launches QEMU. If QEMU is configured correctly, the -migration will succeed. - -That said, actually doing it is complicated. Almost all devices are -bad at being able to be launched with only some features enabled. -With one big exception: cpus. - -You can read the documentation for QEMU x86 cpu models here: - -https://qemu-project.gitlab.io/qemu/system/qemu-cpu-models.html - -See when they talk about migration they recommend that one chooses the -newest cpu model that is supported for all cpus. - -Let's say that we have: - -Host A: - -Device X has the feature Y - -Host B: - -Device X has not the feature Y - -If we try to migrate without any care from host A to host B, it will -fail because when migration tries to load the feature Y on -destination, it will find that the hardware is not there. - -Doing this would be the equivalent of doing with cpus: - -Host A: - -$ qemu-system-x86_64 -cpu host - -Host B: - -$ qemu-system-x86_64 -cpu host - -When both hosts have different cpu features this is guaranteed to -fail. Especially if Host B has less features than host A. If host A -has less features than host B, sometimes it works. Important word of -last sentence is "sometimes". - -So, forgetting about cpu models and continuing with the -cpu host -example, let's see that the differences of the cpus is that Host A and -B have the following features: - -Features: 'pcid' 'stibp' 'taa-no' -Host A: X X -Host B: X - -And we want to migrate between them, the way configure both QEMU cpu -will be: - -Host A: - -$ qemu-system-x86_64 -cpu host,pcid=off,stibp=off - -Host B: - -$ qemu-system-x86_64 -cpu host,taa-no=off - -And you would be able to migrate between them. It is responsibility -of the management application or of the user to make sure that the -configuration is correct. QEMU doesn't know how to look at this kind -of features in general. - -Notice that we don't recommend to use -cpu host for migration. It is -used in this example because it makes the example simpler. - -Other devices have worse control about individual features. If they -want to be able to migrate between hosts that show different features, -the device needs a way to configure which ones it is going to use. - -In this section we have considered that we are using the same QEMU -binary in both sides of the migration. If we use different QEMU -versions process, then we need to have into account all other -differences and the examples become even more complicated. - -How to mitigate when we have a backward compatibility error ------------------------------------------------------------ - -We broke migration for old machine types continuously during -development. But as soon as we find that there is a problem, we fix -it. The problem is what happens when we detect after we have done a -release that something has gone wrong. - -Let see how it worked with one example. 
- -After the release of qemu-8.0 we found a problem when doing migration -of the machine type pc-7.2. - -- $ qemu-7.2 -M pc-7.2 -> qemu-7.2 -M pc-7.2 - - This migration works - -- $ qemu-8.0 -M pc-7.2 -> qemu-8.0 -M pc-7.2 - - This migration works - -- $ qemu-8.0 -M pc-7.2 -> qemu-7.2 -M pc-7.2 - - This migration fails - -- $ qemu-7.2 -M pc-7.2 -> qemu-8.0 -M pc-7.2 - - This migration fails - -So clearly something fails when migration between qemu-7.2 and -qemu-8.0 with machine type pc-7.2. The error messages, and git bisect -pointed to this commit. - -In qemu-8.0 we got this commit:: - - commit 010746ae1db7f52700cb2e2c46eb94f299cfa0d2 - Author: Jonathan Cameron - Date: Thu Mar 2 13:37:02 2023 +0000 - - hw/pci/aer: Implement PCI_ERR_UNCOR_MASK register - - -The relevant bits of the commit for our example are this ones:: - - --- a/hw/pci/pcie_aer.c - +++ b/hw/pci/pcie_aer.c - @@ -112,6 +112,10 @@ int pcie_aer_init(PCIDevice *dev, - - pci_set_long(dev->w1cmask + offset + PCI_ERR_UNCOR_STATUS, - PCI_ERR_UNC_SUPPORTED); - + pci_set_long(dev->config + offset + PCI_ERR_UNCOR_MASK, - + PCI_ERR_UNC_MASK_DEFAULT); - + pci_set_long(dev->wmask + offset + PCI_ERR_UNCOR_MASK, - + PCI_ERR_UNC_SUPPORTED); - - pci_set_long(dev->config + offset + PCI_ERR_UNCOR_SEVER, - PCI_ERR_UNC_SEVERITY_DEFAULT); - -The patch changes how we configure PCI space for AER. But QEMU fails -when the PCI space configuration is different between source and -destination. - -The following commit shows how this got fixed:: - - commit 5ed3dabe57dd9f4c007404345e5f5bf0e347317f - Author: Leonardo Bras - Date: Tue May 2 21:27:02 2023 -0300 - - hw/pci: Disable PCI_ERR_UNCOR_MASK register for machine type < 8.0 - - [...] - -The relevant parts of the fix in QEMU are as follow: - -First, we create a new property for the device to be able to configure -the old behaviour or the new behaviour:: - - diff --git a/hw/pci/pci.c b/hw/pci/pci.c - index 8a87ccc8b0..5153ad63d6 100644 - --- a/hw/pci/pci.c - +++ b/hw/pci/pci.c - @@ -79,6 +79,8 @@ static Property pci_props[] = { - DEFINE_PROP_STRING("failover_pair_id", PCIDevice, - failover_pair_id), - DEFINE_PROP_UINT32("acpi-index", PCIDevice, acpi_index, 0), - + DEFINE_PROP_BIT("x-pcie-err-unc-mask", PCIDevice, cap_present, - + QEMU_PCIE_ERR_UNC_MASK_BITNR, true), - DEFINE_PROP_END_OF_LIST() - }; - -Notice that we enable the feature for new machine types. - -Now we see how the fix is done. This is going to depend on what kind -of breakage happens, but in this case it is quite simple:: - - diff --git a/hw/pci/pcie_aer.c b/hw/pci/pcie_aer.c - index 103667c368..374d593ead 100644 - --- a/hw/pci/pcie_aer.c - +++ b/hw/pci/pcie_aer.c - @@ -112,10 +112,13 @@ int pcie_aer_init(PCIDevice *dev, uint8_t cap_ver, - uint16_t offset, - - pci_set_long(dev->w1cmask + offset + PCI_ERR_UNCOR_STATUS, - PCI_ERR_UNC_SUPPORTED); - - pci_set_long(dev->config + offset + PCI_ERR_UNCOR_MASK, - - PCI_ERR_UNC_MASK_DEFAULT); - - pci_set_long(dev->wmask + offset + PCI_ERR_UNCOR_MASK, - - PCI_ERR_UNC_SUPPORTED); - + - + if (dev->cap_present & QEMU_PCIE_ERR_UNC_MASK) { - + pci_set_long(dev->config + offset + PCI_ERR_UNCOR_MASK, - + PCI_ERR_UNC_MASK_DEFAULT); - + pci_set_long(dev->wmask + offset + PCI_ERR_UNCOR_MASK, - + PCI_ERR_UNC_SUPPORTED); - + } - - pci_set_long(dev->config + offset + PCI_ERR_UNCOR_SEVER, - PCI_ERR_UNC_SEVERITY_DEFAULT); - -I.e. If the property bit is enabled, we configure it as we did for -qemu-8.0. If the property bit is not set, we configure it as it was in 7.2. 
- -And now, everything that is missing is disabling the feature for old -machine types:: - - diff --git a/hw/core/machine.c b/hw/core/machine.c - index 47a34841a5..07f763eb2e 100644 - --- a/hw/core/machine.c - +++ b/hw/core/machine.c - @@ -48,6 +48,7 @@ GlobalProperty hw_compat_7_2[] = { - { "e1000e", "migrate-timadj", "off" }, - { "virtio-mem", "x-early-migration", "false" }, - { "migration", "x-preempt-pre-7-2", "true" }, - + { TYPE_PCI_DEVICE, "x-pcie-err-unc-mask", "off" }, - }; - const size_t hw_compat_7_2_len = G_N_ELEMENTS(hw_compat_7_2); - -And now, when qemu-8.0.1 is released with this fix, all combinations -are going to work as supposed. - -- $ qemu-7.2 -M pc-7.2 -> qemu-7.2 -M pc-7.2 (works) -- $ qemu-8.0.1 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 (works) -- $ qemu-8.0.1 -M pc-7.2 -> qemu-7.2 -M pc-7.2 (works) -- $ qemu-7.2 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 (works) - -So the normality has been restored and everything is ok, no? - -Not really, now our matrix is much bigger. We started with the easy -cases, migration from the same version to the same version always -works: - -- $ qemu-7.2 -M pc-7.2 -> qemu-7.2 -M pc-7.2 -- $ qemu-8.0 -M pc-7.2 -> qemu-8.0 -M pc-7.2 -- $ qemu-8.0.1 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 - -Now the interesting ones. When the QEMU processes versions are -different. For the 1st set, their fail and we can do nothing, both -versions are released and we can't change anything. - -- $ qemu-7.2 -M pc-7.2 -> qemu-8.0 -M pc-7.2 -- $ qemu-8.0 -M pc-7.2 -> qemu-7.2 -M pc-7.2 - -This two are the ones that work. The whole point of making the -change in qemu-8.0.1 release was to fix this issue: - -- $ qemu-7.2 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 -- $ qemu-8.0.1 -M pc-7.2 -> qemu-7.2 -M pc-7.2 - -But now we found that qemu-8.0 neither can migrate to qemu-7.2 not -qemu-8.0.1. - -- $ qemu-8.0 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 -- $ qemu-8.0.1 -M pc-7.2 -> qemu-8.0 -M pc-7.2 - -So, if we start a pc-7.2 machine in qemu-8.0 we can't migrate it to -anything except to qemu-8.0. - -Can we do better? - -Yeap. If we know that we are going to do this migration: - -- $ qemu-8.0 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 - -We can launch the appropriate devices with:: - - --device...,x-pci-e-err-unc-mask=on - -And now we can receive a migration from 8.0. And from now on, we can -do that migration to new machine types if we remember to enable that -property for pc-7.2. Notice that we need to remember, it is not -enough to know that the source of the migration is qemu-8.0. Think of -this example: - -$ qemu-8.0 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 -> qemu-8.2 -M pc-7.2 - -In the second migration, the source is not qemu-8.0, but we still have -that "problem" and have that property enabled. Notice that we need to -continue having this mark/property until we have this machine -rebooted. But it is not a normal reboot (that don't reload QEMU) we -need the machine to poweroff/poweron on a fixed QEMU. And from now -on we can use the proper real machine. -- Gitee From 4d6c041c7c43372921b96446d9731a4797468555 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 9 Jan 2024 14:46:23 +0800 Subject: [PATCH 823/939] docs/migration: Split "Debugging" and "Firmware" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 774ad6b53b9449223115ffa8851eb93de92b0ce7 upstream. Move the two sections into a separate file called "best-practices.rst". Add the entry into index. 
Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/r/20240109064628.595453-6-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- docs/devel/migration/best-practices.rst | 48 +++++++++++++++++++++++++ docs/devel/migration/index.rst | 1 + docs/devel/migration/main.rst | 44 ----------------------- 3 files changed, 49 insertions(+), 44 deletions(-) create mode 100644 docs/devel/migration/best-practices.rst diff --git a/docs/devel/migration/best-practices.rst b/docs/devel/migration/best-practices.rst new file mode 100644 index 0000000000..d7c34a3014 --- /dev/null +++ b/docs/devel/migration/best-practices.rst @@ -0,0 +1,48 @@ +============== +Best practices +============== + +Debugging +========= + +The migration stream can be analyzed thanks to ``scripts/analyze-migration.py``. + +Example usage: + +.. code-block:: shell + + $ qemu-system-x86_64 -display none -monitor stdio + (qemu) migrate "exec:cat > mig" + (qemu) q + $ ./scripts/analyze-migration.py -f mig + { + "ram (3)": { + "section sizes": { + "pc.ram": "0x0000000008000000", + ... + +See also ``analyze-migration.py -h`` help for more options. + +Firmware +======== + +Migration migrates the copies of RAM and ROM, and thus when running +on the destination it includes the firmware from the source. Even after +resetting a VM, the old firmware is used. Only once QEMU has been restarted +is the new firmware in use. + +- Changes in firmware size can cause changes in the required RAMBlock size + to hold the firmware and thus migration can fail. In practice it's best + to pad firmware images to convenient powers of 2 with plenty of space + for growth. + +- Care should be taken with device emulation code so that newer + emulation code can work with older firmware to allow forward migration. + +- Care should be taken with newer firmware so that backward migration + to older systems with older device emulation code will work. + +In some cases it may be best to tie specific firmware versions to specific +versioned machine types to cut down on the combinations that will need +support. This is also useful when newer versions of firmware outgrow +the padding. diff --git a/docs/devel/migration/index.rst b/docs/devel/migration/index.rst index 7fc02b9520..9a8fd1ead7 100644 --- a/docs/devel/migration/index.rst +++ b/docs/devel/migration/index.rst @@ -11,3 +11,4 @@ QEMU live migration works. compatibility vfio virtio + best-practices diff --git a/docs/devel/migration/main.rst b/docs/devel/migration/main.rst index 04194414af..7ca3b4dd3f 100644 --- a/docs/devel/migration/main.rst +++ b/docs/devel/migration/main.rst @@ -52,27 +52,6 @@ All these migration protocols use the same infrastructure to save/restore state devices. This infrastructure is shared with the savevm/loadvm functionality. -Debugging -========= - -The migration stream can be analyzed thanks to ``scripts/analyze-migration.py``. - -Example usage: - -.. code-block:: shell - - $ qemu-system-x86_64 -display none -monitor stdio - (qemu) migrate "exec:cat > mig" - (qemu) q - $ ./scripts/analyze-migration.py -f mig - { - "ram (3)": { - "section sizes": { - "pc.ram": "0x0000000008000000", - ... - -See also ``analyze-migration.py -h`` help for more options. - Common infrastructure ===================== @@ -970,26 +949,3 @@ the background migration channel. Anyone who cares about latencies of page faults during a postcopy migration should enable this feature. By default, it's not enabled. 
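
Enabling it follows the same monitor-command pattern as the other
postcopy capabilities; assuming the current capability names
("postcopy-ram", "postcopy-preempt"), a minimal sketch, issued on both
source and destination before the migration is started, would be::

  (qemu) migrate_set_capability postcopy-ram on
  (qemu) migrate_set_capability postcopy-preempt on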
-Firmware -======== - -Migration migrates the copies of RAM and ROM, and thus when running -on the destination it includes the firmware from the source. Even after -resetting a VM, the old firmware is used. Only once QEMU has been restarted -is the new firmware in use. - -- Changes in firmware size can cause changes in the required RAMBlock size - to hold the firmware and thus migration can fail. In practice it's best - to pad firmware images to convenient powers of 2 with plenty of space - for growth. - -- Care should be taken with device emulation code so that newer - emulation code can work with older firmware to allow forward migration. - -- Care should be taken with newer firmware so that backward migration - to older systems with older device emulation code will work. - -In some cases it may be best to tie specific firmware versions to specific -versioned machine types to cut down on the combinations that will need -support. This is also useful when newer versions of firmware outgrow -the padding. -- Gitee From f335519e759500adc05157fc0399335a3646461d Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 9 Jan 2024 14:46:24 +0800 Subject: [PATCH 824/939] docs/migration: Split "Postcopy" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit bfb4c7cd99f1c39dedf33381954d03b9f8f244ec upstream. Split postcopy into a separate file. Introduce a head page "features.rst" to keep all the features on top of migration framework. Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/r/20240109064628.595453-7-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- docs/devel/migration/features.rst | 9 + docs/devel/migration/index.rst | 1 + docs/devel/migration/main.rst | 305 ------------------------------ docs/devel/migration/postcopy.rst | 304 +++++++++++++++++++++++++++++ 4 files changed, 314 insertions(+), 305 deletions(-) create mode 100644 docs/devel/migration/features.rst create mode 100644 docs/devel/migration/postcopy.rst diff --git a/docs/devel/migration/features.rst b/docs/devel/migration/features.rst new file mode 100644 index 0000000000..0054e0c900 --- /dev/null +++ b/docs/devel/migration/features.rst @@ -0,0 +1,9 @@ +Migration features +================== + +Migration has plenty of features to support different use cases. + +.. toctree:: + :maxdepth: 2 + + postcopy diff --git a/docs/devel/migration/index.rst b/docs/devel/migration/index.rst index 9a8fd1ead7..21ad58b189 100644 --- a/docs/devel/migration/index.rst +++ b/docs/devel/migration/index.rst @@ -8,6 +8,7 @@ QEMU live migration works. :maxdepth: 2 main + features compatibility vfio virtio diff --git a/docs/devel/migration/main.rst b/docs/devel/migration/main.rst index 7ca3b4dd3f..1e98e9e40c 100644 --- a/docs/devel/migration/main.rst +++ b/docs/devel/migration/main.rst @@ -644,308 +644,3 @@ algorithm will restrict virtual CPUs as needed to keep their dirty page rate inside the limit. This leads to more steady reading performance during live migration and can aid in improving large guest responsiveness. -Postcopy -======== - -'Postcopy' migration is a way to deal with migrations that refuse to converge -(or take too long to converge) its plus side is that there is an upper bound on -the amount of migration traffic and time it takes, the down side is that during -the postcopy phase, a failure of *either* side causes the guest to be lost. 
- -In postcopy the destination CPUs are started before all the memory has been -transferred, and accesses to pages that are yet to be transferred cause -a fault that's translated by QEMU into a request to the source QEMU. - -Postcopy can be combined with precopy (i.e. normal migration) so that if precopy -doesn't finish in a given time the switch is made to postcopy. - -Enabling postcopy ------------------ - -To enable postcopy, issue this command on the monitor (both source and -destination) prior to the start of migration: - -``migrate_set_capability postcopy-ram on`` - -The normal commands are then used to start a migration, which is still -started in precopy mode. Issuing: - -``migrate_start_postcopy`` - -will now cause the transition from precopy to postcopy. -It can be issued immediately after migration is started or any -time later on. Issuing it after the end of a migration is harmless. - -Blocktime is a postcopy live migration metric, intended to show how -long the vCPU was in state of interruptible sleep due to pagefault. -That metric is calculated both for all vCPUs as overlapped value, and -separately for each vCPU. These values are calculated on destination -side. To enable postcopy blocktime calculation, enter following -command on destination monitor: - -``migrate_set_capability postcopy-blocktime on`` - -Postcopy blocktime can be retrieved by query-migrate qmp command. -postcopy-blocktime value of qmp command will show overlapped blocking -time for all vCPU, postcopy-vcpu-blocktime will show list of blocking -time per vCPU. - -.. note:: - During the postcopy phase, the bandwidth limits set using - ``migrate_set_parameter`` is ignored (to avoid delaying requested pages that - the destination is waiting for). - -Postcopy device transfer ------------------------- - -Loading of device data may cause the device emulation to access guest RAM -that may trigger faults that have to be resolved by the source, as such -the migration stream has to be able to respond with page data *during* the -device load, and hence the device data has to be read from the stream completely -before the device load begins to free the stream up. This is achieved by -'packaging' the device data into a blob that's read in one go. - -Source behaviour ----------------- - -Until postcopy is entered the migration stream is identical to normal -precopy, except for the addition of a 'postcopy advise' command at -the beginning, to tell the destination that postcopy might happen. -When postcopy starts the source sends the page discard data and then -forms the 'package' containing: - - - Command: 'postcopy listen' - - The device state - - A series of sections, identical to the precopy streams device state stream - containing everything except postcopiable devices (i.e. RAM) - - Command: 'postcopy run' - -The 'package' is sent as the data part of a Command: ``CMD_PACKAGED``, and the -contents are formatted in the same way as the main migration stream. - -During postcopy the source scans the list of dirty pages and sends them -to the destination without being requested (in much the same way as precopy), -however when a page request is received from the destination, the dirty page -scanning restarts from the requested location. This causes requested pages -to be sent quickly, and also causes pages directly after the requested page -to be sent quickly in the hope that those pages are likely to be used -by the destination soon. 
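
Before moving on to the destination side: the commands shown in
"Enabling postcopy" above are HMP ones; management applications
normally drive the same source-side flow over QMP. A sketch, with an
illustrative TCP URI (the capability also has to be set on the
destination, as described above)::

  { "execute": "migrate-set-capabilities",
    "arguments": { "capabilities": [
        { "capability": "postcopy-ram", "state": true } ] } }

  { "execute": "migrate",
    "arguments": { "uri": "tcp:dst.example.com:4444" } }

  { "execute": "migrate-start-postcopy" }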
- -Destination behaviour ---------------------- - -Initially the destination looks the same as precopy, with a single thread -reading the migration stream; the 'postcopy advise' and 'discard' commands -are processed to change the way RAM is managed, but don't affect the stream -processing. - -:: - - ------------------------------------------------------------------------------ - 1 2 3 4 5 6 7 - main -----DISCARD-CMD_PACKAGED ( LISTEN DEVICE DEVICE DEVICE RUN ) - thread | | - | (page request) - | \___ - v \ - listen thread: --- page -- page -- page -- page -- page -- - - a b c - ------------------------------------------------------------------------------ - -- On receipt of ``CMD_PACKAGED`` (1) - - All the data associated with the package - the ( ... ) section in the diagram - - is read into memory, and the main thread recurses into qemu_loadvm_state_main - to process the contents of the package (2) which contains commands (3,6) and - devices (4...) - -- On receipt of 'postcopy listen' - 3 -(i.e. the 1st command in the package) - - a new thread (a) is started that takes over servicing the migration stream, - while the main thread carries on loading the package. It loads normal - background page data (b) but if during a device load a fault happens (5) - the returned page (c) is loaded by the listen thread allowing the main - threads device load to carry on. - -- The last thing in the ``CMD_PACKAGED`` is a 'RUN' command (6) - - letting the destination CPUs start running. At the end of the - ``CMD_PACKAGED`` (7) the main thread returns to normal running behaviour and - is no longer used by migration, while the listen thread carries on servicing - page data until the end of migration. - -Postcopy Recovery ------------------ - -Comparing to precopy, postcopy is special on error handlings. When any -error happens (in this case, mostly network errors), QEMU cannot easily -fail a migration because VM data resides in both source and destination -QEMU instances. On the other hand, when issue happens QEMU on both sides -will go into a paused state. It'll need a recovery phase to continue a -paused postcopy migration. - -The recovery phase normally contains a few steps: - - - When network issue occurs, both QEMU will go into PAUSED state - - - When the network is recovered (or a new network is provided), the admin - can setup the new channel for migration using QMP command - 'migrate-recover' on destination node, preparing for a resume. - - - On source host, the admin can continue the interrupted postcopy - migration using QMP command 'migrate' with resume=true flag set. - - - After the connection is re-established, QEMU will continue the postcopy - migration on both sides. - -During a paused postcopy migration, the VM can logically still continue -running, and it will not be impacted from any page access to pages that -were already migrated to destination VM before the interruption happens. -However, if any of the missing pages got accessed on destination VM, the VM -thread will be halted waiting for the page to be migrated, it means it can -be halted until the recovery is complete. - -The impact of accessing missing pages can be relevant to different -configurations of the guest. For example, when with async page fault -enabled, logically the guest can proactively schedule out the threads -accessing missing pages. 
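
The recovery steps above map onto concrete QMP commands; a sketch of
what the admin (or the management application) would issue once a new
channel is available (the URIs are only examples)::

  # destination QEMU, currently paused: listen on a fresh channel
  { "execute": "migrate-recover",
    "arguments": { "uri": "tcp:0.0.0.0:4445" } }

  # source QEMU: resume the interrupted postcopy migration
  { "execute": "migrate",
    "arguments": { "uri": "tcp:dst.example.com:4445", "resume": true } }

After this the postcopy migration continues on both sides from where
it was interrupted, as described above.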
- -Postcopy states ---------------- - -Postcopy moves through a series of states (see postcopy_state) from -ADVISE->DISCARD->LISTEN->RUNNING->END - - - Advise - - Set at the start of migration if postcopy is enabled, even - if it hasn't had the start command; here the destination - checks that its OS has the support needed for postcopy, and performs - setup to ensure the RAM mappings are suitable for later postcopy. - The destination will fail early in migration at this point if the - required OS support is not present. - (Triggered by reception of POSTCOPY_ADVISE command) - - - Discard - - Entered on receipt of the first 'discard' command; prior to - the first Discard being performed, hugepages are switched off - (using madvise) to ensure that no new huge pages are created - during the postcopy phase, and to cause any huge pages that - have discards on them to be broken. - - - Listen - - The first command in the package, POSTCOPY_LISTEN, switches - the destination state to Listen, and starts a new thread - (the 'listen thread') which takes over the job of receiving - pages off the migration stream, while the main thread carries - on processing the blob. With this thread able to process page - reception, the destination now 'sensitises' the RAM to detect - any access to missing pages (on Linux using the 'userfault' - system). - - - Running - - POSTCOPY_RUN causes the destination to synchronise all - state and start the CPUs and IO devices running. The main - thread now finishes processing the migration package and - now carries on as it would for normal precopy migration - (although it can't do the cleanup it would do as it - finishes a normal migration). - - - Paused - - Postcopy can run into a paused state (normally on both sides when - happens), where all threads will be temporarily halted mostly due to - network errors. When reaching paused state, migration will make sure - the qemu binary on both sides maintain the data without corrupting - the VM. To continue the migration, the admin needs to fix the - migration channel using the QMP command 'migrate-recover' on the - destination node, then resume the migration using QMP command 'migrate' - again on source node, with resume=true flag set. - - - End - - The listen thread can now quit, and perform the cleanup of migration - state, the migration is now complete. - -Source side page map --------------------- - -The 'migration bitmap' in postcopy is basically the same as in the precopy, -where each of the bit to indicate that page is 'dirty' - i.e. needs -sending. During the precopy phase this is updated as the CPU dirties -pages, however during postcopy the CPUs are stopped and nothing should -dirty anything any more. Instead, dirty bits are cleared when the relevant -pages are sent during postcopy. - -Postcopy with hugepages ------------------------ - -Postcopy now works with hugetlbfs backed memory: - - a) The linux kernel on the destination must support userfault on hugepages. - b) The huge-page configuration on the source and destination VMs must be - identical; i.e. RAMBlocks on both sides must use the same page size. - c) Note that ``-mem-path /dev/hugepages`` will fall back to allocating normal - RAM if it doesn't have enough hugepages, triggering (b) to fail. - Using ``-mem-prealloc`` enforces the allocation using hugepages. 
- d) Care should be taken with the size of hugepage used; postcopy with 2MB - hugepages works well, however 1GB hugepages are likely to be problematic - since it takes ~1 second to transfer a 1GB hugepage across a 10Gbps link, - and until the full page is transferred the destination thread is blocked. - -Postcopy with shared memory ---------------------------- - -Postcopy migration with shared memory needs explicit support from the other -processes that share memory and from QEMU. There are restrictions on the type of -memory that userfault can support shared. - -The Linux kernel userfault support works on ``/dev/shm`` memory and on ``hugetlbfs`` -(although the kernel doesn't provide an equivalent to ``madvise(MADV_DONTNEED)`` -for hugetlbfs which may be a problem in some configurations). - -The vhost-user code in QEMU supports clients that have Postcopy support, -and the ``vhost-user-bridge`` (in ``tests/``) and the DPDK package have changes -to support postcopy. - -The client needs to open a userfaultfd and register the areas -of memory that it maps with userfault. The client must then pass the -userfaultfd back to QEMU together with a mapping table that allows -fault addresses in the clients address space to be converted back to -RAMBlock/offsets. The client's userfaultfd is added to the postcopy -fault-thread and page requests are made on behalf of the client by QEMU. -QEMU performs 'wake' operations on the client's userfaultfd to allow it -to continue after a page has arrived. - -.. note:: - There are two future improvements that would be nice: - a) Some way to make QEMU ignorant of the addresses in the clients - address space - b) Avoiding the need for QEMU to perform ufd-wake calls after the - pages have arrived - -Retro-fitting postcopy to existing clients is possible: - a) A mechanism is needed for the registration with userfault as above, - and the registration needs to be coordinated with the phases of - postcopy. In vhost-user extra messages are added to the existing - control channel. - b) Any thread that can block due to guest memory accesses must be - identified and the implication understood; for example if the - guest memory access is made while holding a lock then all other - threads waiting for that lock will also be blocked. - -Postcopy Preemption Mode ------------------------- - -Postcopy preempt is a new capability introduced in 8.0 QEMU release, it -allows urgent pages (those got page fault requested from destination QEMU -explicitly) to be sent in a separate preempt channel, rather than queued in -the background migration channel. Anyone who cares about latencies of page -faults during a postcopy migration should enable this feature. By default, -it's not enabled. - diff --git a/docs/devel/migration/postcopy.rst b/docs/devel/migration/postcopy.rst new file mode 100644 index 0000000000..d60eec06ab --- /dev/null +++ b/docs/devel/migration/postcopy.rst @@ -0,0 +1,304 @@ +Postcopy +======== + +'Postcopy' migration is a way to deal with migrations that refuse to converge +(or take too long to converge) its plus side is that there is an upper bound on +the amount of migration traffic and time it takes, the down side is that during +the postcopy phase, a failure of *either* side causes the guest to be lost. + +In postcopy the destination CPUs are started before all the memory has been +transferred, and accesses to pages that are yet to be transferred cause +a fault that's translated by QEMU into a request to the source QEMU. + +Postcopy can be combined with precopy (i.e. 
normal migration) so that if precopy +doesn't finish in a given time the switch is made to postcopy. + +Enabling postcopy +----------------- + +To enable postcopy, issue this command on the monitor (both source and +destination) prior to the start of migration: + +``migrate_set_capability postcopy-ram on`` + +The normal commands are then used to start a migration, which is still +started in precopy mode. Issuing: + +``migrate_start_postcopy`` + +will now cause the transition from precopy to postcopy. +It can be issued immediately after migration is started or any +time later on. Issuing it after the end of a migration is harmless. + +Blocktime is a postcopy live migration metric, intended to show how +long the vCPU was in state of interruptible sleep due to pagefault. +That metric is calculated both for all vCPUs as overlapped value, and +separately for each vCPU. These values are calculated on destination +side. To enable postcopy blocktime calculation, enter following +command on destination monitor: + +``migrate_set_capability postcopy-blocktime on`` + +Postcopy blocktime can be retrieved by query-migrate qmp command. +postcopy-blocktime value of qmp command will show overlapped blocking +time for all vCPU, postcopy-vcpu-blocktime will show list of blocking +time per vCPU. + +.. note:: + During the postcopy phase, the bandwidth limits set using + ``migrate_set_parameter`` is ignored (to avoid delaying requested pages that + the destination is waiting for). + +Postcopy device transfer +------------------------ + +Loading of device data may cause the device emulation to access guest RAM +that may trigger faults that have to be resolved by the source, as such +the migration stream has to be able to respond with page data *during* the +device load, and hence the device data has to be read from the stream completely +before the device load begins to free the stream up. This is achieved by +'packaging' the device data into a blob that's read in one go. + +Source behaviour +---------------- + +Until postcopy is entered the migration stream is identical to normal +precopy, except for the addition of a 'postcopy advise' command at +the beginning, to tell the destination that postcopy might happen. +When postcopy starts the source sends the page discard data and then +forms the 'package' containing: + + - Command: 'postcopy listen' + - The device state + + A series of sections, identical to the precopy streams device state stream + containing everything except postcopiable devices (i.e. RAM) + - Command: 'postcopy run' + +The 'package' is sent as the data part of a Command: ``CMD_PACKAGED``, and the +contents are formatted in the same way as the main migration stream. + +During postcopy the source scans the list of dirty pages and sends them +to the destination without being requested (in much the same way as precopy), +however when a page request is received from the destination, the dirty page +scanning restarts from the requested location. This causes requested pages +to be sent quickly, and also causes pages directly after the requested page +to be sent quickly in the hope that those pages are likely to be used +by the destination soon. + +Destination behaviour +--------------------- + +Initially the destination looks the same as precopy, with a single thread +reading the migration stream; the 'postcopy advise' and 'discard' commands +are processed to change the way RAM is managed, but don't affect the stream +processing. 
+ +:: + + ------------------------------------------------------------------------------ + 1 2 3 4 5 6 7 + main -----DISCARD-CMD_PACKAGED ( LISTEN DEVICE DEVICE DEVICE RUN ) + thread | | + | (page request) + | \___ + v \ + listen thread: --- page -- page -- page -- page -- page -- + + a b c + ------------------------------------------------------------------------------ + +- On receipt of ``CMD_PACKAGED`` (1) + + All the data associated with the package - the ( ... ) section in the diagram - + is read into memory, and the main thread recurses into qemu_loadvm_state_main + to process the contents of the package (2) which contains commands (3,6) and + devices (4...) + +- On receipt of 'postcopy listen' - 3 -(i.e. the 1st command in the package) + + a new thread (a) is started that takes over servicing the migration stream, + while the main thread carries on loading the package. It loads normal + background page data (b) but if during a device load a fault happens (5) + the returned page (c) is loaded by the listen thread allowing the main + threads device load to carry on. + +- The last thing in the ``CMD_PACKAGED`` is a 'RUN' command (6) + + letting the destination CPUs start running. At the end of the + ``CMD_PACKAGED`` (7) the main thread returns to normal running behaviour and + is no longer used by migration, while the listen thread carries on servicing + page data until the end of migration. + +Postcopy Recovery +----------------- + +Comparing to precopy, postcopy is special on error handlings. When any +error happens (in this case, mostly network errors), QEMU cannot easily +fail a migration because VM data resides in both source and destination +QEMU instances. On the other hand, when issue happens QEMU on both sides +will go into a paused state. It'll need a recovery phase to continue a +paused postcopy migration. + +The recovery phase normally contains a few steps: + + - When network issue occurs, both QEMU will go into PAUSED state + + - When the network is recovered (or a new network is provided), the admin + can setup the new channel for migration using QMP command + 'migrate-recover' on destination node, preparing for a resume. + + - On source host, the admin can continue the interrupted postcopy + migration using QMP command 'migrate' with resume=true flag set. + + - After the connection is re-established, QEMU will continue the postcopy + migration on both sides. + +During a paused postcopy migration, the VM can logically still continue +running, and it will not be impacted from any page access to pages that +were already migrated to destination VM before the interruption happens. +However, if any of the missing pages got accessed on destination VM, the VM +thread will be halted waiting for the page to be migrated, it means it can +be halted until the recovery is complete. + +The impact of accessing missing pages can be relevant to different +configurations of the guest. For example, when with async page fault +enabled, logically the guest can proactively schedule out the threads +accessing missing pages. + +Postcopy states +--------------- + +Postcopy moves through a series of states (see postcopy_state) from +ADVISE->DISCARD->LISTEN->RUNNING->END + + - Advise + + Set at the start of migration if postcopy is enabled, even + if it hasn't had the start command; here the destination + checks that its OS has the support needed for postcopy, and performs + setup to ensure the RAM mappings are suitable for later postcopy. 
+ The destination will fail early in migration at this point if the + required OS support is not present. + (Triggered by reception of POSTCOPY_ADVISE command) + + - Discard + + Entered on receipt of the first 'discard' command; prior to + the first Discard being performed, hugepages are switched off + (using madvise) to ensure that no new huge pages are created + during the postcopy phase, and to cause any huge pages that + have discards on them to be broken. + + - Listen + + The first command in the package, POSTCOPY_LISTEN, switches + the destination state to Listen, and starts a new thread + (the 'listen thread') which takes over the job of receiving + pages off the migration stream, while the main thread carries + on processing the blob. With this thread able to process page + reception, the destination now 'sensitises' the RAM to detect + any access to missing pages (on Linux using the 'userfault' + system). + + - Running + + POSTCOPY_RUN causes the destination to synchronise all + state and start the CPUs and IO devices running. The main + thread now finishes processing the migration package and + now carries on as it would for normal precopy migration + (although it can't do the cleanup it would do as it + finishes a normal migration). + + - Paused + + Postcopy can run into a paused state (normally on both sides when + happens), where all threads will be temporarily halted mostly due to + network errors. When reaching paused state, migration will make sure + the qemu binary on both sides maintain the data without corrupting + the VM. To continue the migration, the admin needs to fix the + migration channel using the QMP command 'migrate-recover' on the + destination node, then resume the migration using QMP command 'migrate' + again on source node, with resume=true flag set. + + - End + + The listen thread can now quit, and perform the cleanup of migration + state, the migration is now complete. + +Source side page map +-------------------- + +The 'migration bitmap' in postcopy is basically the same as in the precopy, +where each of the bit to indicate that page is 'dirty' - i.e. needs +sending. During the precopy phase this is updated as the CPU dirties +pages, however during postcopy the CPUs are stopped and nothing should +dirty anything any more. Instead, dirty bits are cleared when the relevant +pages are sent during postcopy. + +Postcopy with hugepages +----------------------- + +Postcopy now works with hugetlbfs backed memory: + + a) The linux kernel on the destination must support userfault on hugepages. + b) The huge-page configuration on the source and destination VMs must be + identical; i.e. RAMBlocks on both sides must use the same page size. + c) Note that ``-mem-path /dev/hugepages`` will fall back to allocating normal + RAM if it doesn't have enough hugepages, triggering (b) to fail. + Using ``-mem-prealloc`` enforces the allocation using hugepages. + d) Care should be taken with the size of hugepage used; postcopy with 2MB + hugepages works well, however 1GB hugepages are likely to be problematic + since it takes ~1 second to transfer a 1GB hugepage across a 10Gbps link, + and until the full page is transferred the destination thread is blocked. + +Postcopy with shared memory +--------------------------- + +Postcopy migration with shared memory needs explicit support from the other +processes that share memory and from QEMU. There are restrictions on the type of +memory that userfault can support shared. 
+ +The Linux kernel userfault support works on ``/dev/shm`` memory and on ``hugetlbfs`` +(although the kernel doesn't provide an equivalent to ``madvise(MADV_DONTNEED)`` +for hugetlbfs which may be a problem in some configurations). + +The vhost-user code in QEMU supports clients that have Postcopy support, +and the ``vhost-user-bridge`` (in ``tests/``) and the DPDK package have changes +to support postcopy. + +The client needs to open a userfaultfd and register the areas +of memory that it maps with userfault. The client must then pass the +userfaultfd back to QEMU together with a mapping table that allows +fault addresses in the clients address space to be converted back to +RAMBlock/offsets. The client's userfaultfd is added to the postcopy +fault-thread and page requests are made on behalf of the client by QEMU. +QEMU performs 'wake' operations on the client's userfaultfd to allow it +to continue after a page has arrived. + +.. note:: + There are two future improvements that would be nice: + a) Some way to make QEMU ignorant of the addresses in the clients + address space + b) Avoiding the need for QEMU to perform ufd-wake calls after the + pages have arrived + +Retro-fitting postcopy to existing clients is possible: + a) A mechanism is needed for the registration with userfault as above, + and the registration needs to be coordinated with the phases of + postcopy. In vhost-user extra messages are added to the existing + control channel. + b) Any thread that can block due to guest memory accesses must be + identified and the implication understood; for example if the + guest memory access is made while holding a lock then all other + threads waiting for that lock will also be blocked. + +Postcopy Preemption Mode +------------------------ + +Postcopy preempt is a new capability introduced in 8.0 QEMU release, it +allows urgent pages (those got page fault requested from destination QEMU +explicitly) to be sent in a separate preempt channel, rather than queued in +the background migration channel. Anyone who cares about latencies of page +faults during a postcopy migration should enable this feature. By default, +it's not enabled. -- Gitee From 10545ddb8797505ac298960171afaebc327c926c Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 9 Jan 2024 14:46:25 +0800 Subject: [PATCH 825/939] docs/migration: Split "dirty limit" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 4c6f8a79ae539eeb1f86af6522e4000edde3638b upstream. Split that into a separate file, put under "features". Cc: Yong Huang Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/r/20240109064628.595453-8-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- docs/devel/migration/dirty-limit.rst | 71 ++++++++++++++++++++++++++++ docs/devel/migration/features.rst | 1 + docs/devel/migration/main.rst | 71 ---------------------------- 3 files changed, 72 insertions(+), 71 deletions(-) create mode 100644 docs/devel/migration/dirty-limit.rst diff --git a/docs/devel/migration/dirty-limit.rst b/docs/devel/migration/dirty-limit.rst new file mode 100644 index 0000000000..8f32329d5f --- /dev/null +++ b/docs/devel/migration/dirty-limit.rst @@ -0,0 +1,71 @@ +Dirty limit +=========== + +The dirty limit, short for dirty page rate upper limit, is a new capability +introduced in the 8.1 QEMU release that uses a new algorithm based on the KVM +dirty ring to throttle down the guest during live migration. 
+ +The algorithm framework is as follows: + +:: + + ------------------------------------------------------------------------------ + main --------------> throttle thread ------------> PREPARE(1) <-------- + thread \ | | + \ | | + \ V | + -\ CALCULATE(2) | + \ | | + \ | | + \ V | + \ SET PENALTY(3) ----- + -\ | + \ | + \ V + -> virtual CPU thread -------> ACCEPT PENALTY(4) + ------------------------------------------------------------------------------ + +When the qmp command qmp_set_vcpu_dirty_limit is called for the first time, +the QEMU main thread starts the throttle thread. The throttle thread, once +launched, executes the loop, which consists of three steps: + + - PREPARE (1) + + The entire work of PREPARE (1) is preparation for the second stage, + CALCULATE(2), as the name implies. It involves preparing the dirty + page rate value and the corresponding upper limit of the VM: + The dirty page rate is calculated via the KVM dirty ring mechanism, + which tells QEMU how many dirty pages a virtual CPU has had since the + last KVM_EXIT_DIRTY_RING_FULL exception; The dirty page rate upper + limit is specified by caller, therefore fetch it directly. + + - CALCULATE (2) + + Calculate a suitable sleep period for each virtual CPU, which will be + used to determine the penalty for the target virtual CPU. The + computation must be done carefully in order to reduce the dirty page + rate progressively down to the upper limit without oscillation. To + achieve this, two strategies are provided: the first is to add or + subtract sleep time based on the ratio of the current dirty page rate + to the limit, which is used when the current dirty page rate is far + from the limit; the second is to add or subtract a fixed time when + the current dirty page rate is close to the limit. + + - SET PENALTY (3) + + Set the sleep time for each virtual CPU that should be penalized based + on the results of the calculation supplied by step CALCULATE (2). + +After completing the three above stages, the throttle thread loops back +to step PREPARE (1) until the dirty limit is reached. + +On the other hand, each virtual CPU thread reads the sleep duration and +sleeps in the path of the KVM_EXIT_DIRTY_RING_FULL exception handler, that +is ACCEPT PENALTY (4). Virtual CPUs tied with writing processes will +obviously exit to the path and get penalized, whereas virtual CPUs involved +with read processes will not. + +In summary, thanks to the KVM dirty ring technology, the dirty limit +algorithm will restrict virtual CPUs as needed to keep their dirty page +rate inside the limit. This leads to more steady reading performance during +live migration and can aid in improving large guest responsiveness. diff --git a/docs/devel/migration/features.rst b/docs/devel/migration/features.rst index 0054e0c900..e257d0d100 100644 --- a/docs/devel/migration/features.rst +++ b/docs/devel/migration/features.rst @@ -7,3 +7,4 @@ Migration has plenty of features to support different use cases. :maxdepth: 2 postcopy + dirty-limit diff --git a/docs/devel/migration/main.rst b/docs/devel/migration/main.rst index 1e98e9e40c..396c7c51ca 100644 --- a/docs/devel/migration/main.rst +++ b/docs/devel/migration/main.rst @@ -573,74 +573,3 @@ path. 
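[Editor's illustration] To make the three throttle-thread steps above slightly more concrete,
here is a compressed C sketch of one loop iteration. The structure, names and step sizes are
invented for illustration and do not match QEMU's actual dirty-limit code::

    #include <stdint.h>

    struct vcpu_throttle {
        uint64_t dirty_rate; /* pages/s, measured via the KVM dirty ring  */
        uint64_t limit;      /* upper limit requested by the caller       */
        int64_t  sleep_us;   /* penalty consumed in ACCEPT PENALTY (4)    */
    };

    static void throttle_iteration(struct vcpu_throttle *v, int nr_vcpus)
    {
        for (int i = 0; i < nr_vcpus; i++) {
            /* PREPARE (1): fetch the current rate and the configured limit */
            uint64_t rate = v[i].dirty_rate, limit = v[i].limit;

            /* CALCULATE (2): coarse adjustment while far from the limit,
             * small fixed steps near it, to avoid oscillation (simplified) */
            if (rate > limit * 2) {
                v[i].sleep_us += v[i].sleep_us / 2 + 1000;
            } else if (rate > limit) {
                v[i].sleep_us += 100;
            } else if (v[i].sleep_us > 0) {
                v[i].sleep_us -= 100;
            }

            /* SET PENALTY (3): the vCPU thread reads sleep_us and sleeps in
             * its KVM_EXIT_DIRTY_RING_FULL handler (ACCEPT PENALTY (4)). */
        }
    }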
Return path - opened by main thread, written by main thread AND postcopy thread (protected by rp_mutex) -Dirty limit -===================== -The dirty limit, short for dirty page rate upper limit, is a new capability -introduced in the 8.1 QEMU release that uses a new algorithm based on the KVM -dirty ring to throttle down the guest during live migration. - -The algorithm framework is as follows: - -:: - - ------------------------------------------------------------------------------ - main --------------> throttle thread ------------> PREPARE(1) <-------- - thread \ | | - \ | | - \ V | - -\ CALCULATE(2) | - \ | | - \ | | - \ V | - \ SET PENALTY(3) ----- - -\ | - \ | - \ V - -> virtual CPU thread -------> ACCEPT PENALTY(4) - ------------------------------------------------------------------------------ - -When the qmp command qmp_set_vcpu_dirty_limit is called for the first time, -the QEMU main thread starts the throttle thread. The throttle thread, once -launched, executes the loop, which consists of three steps: - - - PREPARE (1) - - The entire work of PREPARE (1) is preparation for the second stage, - CALCULATE(2), as the name implies. It involves preparing the dirty - page rate value and the corresponding upper limit of the VM: - The dirty page rate is calculated via the KVM dirty ring mechanism, - which tells QEMU how many dirty pages a virtual CPU has had since the - last KVM_EXIT_DIRTY_RING_FULL exception; The dirty page rate upper - limit is specified by caller, therefore fetch it directly. - - - CALCULATE (2) - - Calculate a suitable sleep period for each virtual CPU, which will be - used to determine the penalty for the target virtual CPU. The - computation must be done carefully in order to reduce the dirty page - rate progressively down to the upper limit without oscillation. To - achieve this, two strategies are provided: the first is to add or - subtract sleep time based on the ratio of the current dirty page rate - to the limit, which is used when the current dirty page rate is far - from the limit; the second is to add or subtract a fixed time when - the current dirty page rate is close to the limit. - - - SET PENALTY (3) - - Set the sleep time for each virtual CPU that should be penalized based - on the results of the calculation supplied by step CALCULATE (2). - -After completing the three above stages, the throttle thread loops back -to step PREPARE (1) until the dirty limit is reached. - -On the other hand, each virtual CPU thread reads the sleep duration and -sleeps in the path of the KVM_EXIT_DIRTY_RING_FULL exception handler, that -is ACCEPT PENALTY (4). Virtual CPUs tied with writing processes will -obviously exit to the path and get penalized, whereas virtual CPUs involved -with read processes will not. - -In summary, thanks to the KVM dirty ring technology, the dirty limit -algorithm will restrict virtual CPUs as needed to keep their dirty page -rate inside the limit. This leads to more steady reading performance during -live migration and can aid in improving large guest responsiveness. - -- Gitee From b15ee6a2f82aa810cfed0401d0843f33f5761d48 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 9 Jan 2024 14:46:26 +0800 Subject: [PATCH 826/939] docs/migration: Organize "Postcopy" page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 21b17cd011c959c3fd3fdad994389410a02df901 upstream. Reorganize the page, moving things around, and add a few headlines ("Postcopy internals", "Postcopy features") to cover sub-areas. 
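[Editor's illustration] For reference, the "hugepages are switched off (using madvise)" step in
the Discard phase of the postcopy state machine above boils down, for THP-backed anonymous
memory, to a hint like the one below. This is illustrative only; the function name is invented
and QEMU's real code additionally treats hugetlbfs-backed RAMBlocks separately::

    #include <stddef.h>
    #include <sys/mman.h>

    /*
     * Ask the kernel not to back this range with transparent huge pages
     * any more.  Huge pages that are already present get broken up later,
     * when parts of them are discarded (MADV_DONTNEED) or faulted in page
     * by page during postcopy.
     */
    static int prepare_for_discard(void *host_addr, size_t block_len)
    {
        return madvise(host_addr, block_len, MADV_NOHUGEPAGE);
    }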
Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/r/20240109064628.595453-9-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- docs/devel/migration/postcopy.rst | 159 ++++++++++++++++-------------- 1 file changed, 84 insertions(+), 75 deletions(-) diff --git a/docs/devel/migration/postcopy.rst b/docs/devel/migration/postcopy.rst index d60eec06ab..6c51e96d79 100644 --- a/docs/devel/migration/postcopy.rst +++ b/docs/devel/migration/postcopy.rst @@ -1,6 +1,9 @@ +======== Postcopy ======== +.. contents:: + 'Postcopy' migration is a way to deal with migrations that refuse to converge (or take too long to converge) its plus side is that there is an upper bound on the amount of migration traffic and time it takes, the down side is that during @@ -14,7 +17,7 @@ Postcopy can be combined with precopy (i.e. normal migration) so that if precopy doesn't finish in a given time the switch is made to postcopy. Enabling postcopy ------------------ +================= To enable postcopy, issue this command on the monitor (both source and destination) prior to the start of migration: @@ -49,8 +52,71 @@ time per vCPU. ``migrate_set_parameter`` is ignored (to avoid delaying requested pages that the destination is waiting for). -Postcopy device transfer ------------------------- +Postcopy internals +================== + +State machine +------------- + +Postcopy moves through a series of states (see postcopy_state) from +ADVISE->DISCARD->LISTEN->RUNNING->END + + - Advise + + Set at the start of migration if postcopy is enabled, even + if it hasn't had the start command; here the destination + checks that its OS has the support needed for postcopy, and performs + setup to ensure the RAM mappings are suitable for later postcopy. + The destination will fail early in migration at this point if the + required OS support is not present. + (Triggered by reception of POSTCOPY_ADVISE command) + + - Discard + + Entered on receipt of the first 'discard' command; prior to + the first Discard being performed, hugepages are switched off + (using madvise) to ensure that no new huge pages are created + during the postcopy phase, and to cause any huge pages that + have discards on them to be broken. + + - Listen + + The first command in the package, POSTCOPY_LISTEN, switches + the destination state to Listen, and starts a new thread + (the 'listen thread') which takes over the job of receiving + pages off the migration stream, while the main thread carries + on processing the blob. With this thread able to process page + reception, the destination now 'sensitises' the RAM to detect + any access to missing pages (on Linux using the 'userfault' + system). + + - Running + + POSTCOPY_RUN causes the destination to synchronise all + state and start the CPUs and IO devices running. The main + thread now finishes processing the migration package and + now carries on as it would for normal precopy migration + (although it can't do the cleanup it would do as it + finishes a normal migration). + + - Paused + + Postcopy can run into a paused state (normally on both sides when + happens), where all threads will be temporarily halted mostly due to + network errors. When reaching paused state, migration will make sure + the qemu binary on both sides maintain the data without corrupting + the VM. 
To continue the migration, the admin needs to fix the + migration channel using the QMP command 'migrate-recover' on the + destination node, then resume the migration using QMP command 'migrate' + again on source node, with resume=true flag set. + + - End + + The listen thread can now quit, and perform the cleanup of migration + state, the migration is now complete. + +Device transfer +--------------- Loading of device data may cause the device emulation to access guest RAM that may trigger faults that have to be resolved by the source, as such @@ -130,7 +196,20 @@ processing. is no longer used by migration, while the listen thread carries on servicing page data until the end of migration. -Postcopy Recovery +Source side page bitmap +----------------------- + +The 'migration bitmap' in postcopy is basically the same as in the precopy, +where each of the bit to indicate that page is 'dirty' - i.e. needs +sending. During the precopy phase this is updated as the CPU dirties +pages, however during postcopy the CPUs are stopped and nothing should +dirty anything any more. Instead, dirty bits are cleared when the relevant +pages are sent during postcopy. + +Postcopy features +================= + +Postcopy recovery ----------------- Comparing to precopy, postcopy is special on error handlings. When any @@ -166,76 +245,6 @@ configurations of the guest. For example, when with async page fault enabled, logically the guest can proactively schedule out the threads accessing missing pages. -Postcopy states ---------------- - -Postcopy moves through a series of states (see postcopy_state) from -ADVISE->DISCARD->LISTEN->RUNNING->END - - - Advise - - Set at the start of migration if postcopy is enabled, even - if it hasn't had the start command; here the destination - checks that its OS has the support needed for postcopy, and performs - setup to ensure the RAM mappings are suitable for later postcopy. - The destination will fail early in migration at this point if the - required OS support is not present. - (Triggered by reception of POSTCOPY_ADVISE command) - - - Discard - - Entered on receipt of the first 'discard' command; prior to - the first Discard being performed, hugepages are switched off - (using madvise) to ensure that no new huge pages are created - during the postcopy phase, and to cause any huge pages that - have discards on them to be broken. - - - Listen - - The first command in the package, POSTCOPY_LISTEN, switches - the destination state to Listen, and starts a new thread - (the 'listen thread') which takes over the job of receiving - pages off the migration stream, while the main thread carries - on processing the blob. With this thread able to process page - reception, the destination now 'sensitises' the RAM to detect - any access to missing pages (on Linux using the 'userfault' - system). - - - Running - - POSTCOPY_RUN causes the destination to synchronise all - state and start the CPUs and IO devices running. The main - thread now finishes processing the migration package and - now carries on as it would for normal precopy migration - (although it can't do the cleanup it would do as it - finishes a normal migration). - - - Paused - - Postcopy can run into a paused state (normally on both sides when - happens), where all threads will be temporarily halted mostly due to - network errors. When reaching paused state, migration will make sure - the qemu binary on both sides maintain the data without corrupting - the VM. 
To continue the migration, the admin needs to fix the - migration channel using the QMP command 'migrate-recover' on the - destination node, then resume the migration using QMP command 'migrate' - again on source node, with resume=true flag set. - - - End - - The listen thread can now quit, and perform the cleanup of migration - state, the migration is now complete. - -Source side page map --------------------- - -The 'migration bitmap' in postcopy is basically the same as in the precopy, -where each of the bit to indicate that page is 'dirty' - i.e. needs -sending. During the precopy phase this is updated as the CPU dirties -pages, however during postcopy the CPUs are stopped and nothing should -dirty anything any more. Instead, dirty bits are cleared when the relevant -pages are sent during postcopy. - Postcopy with hugepages ----------------------- @@ -293,7 +302,7 @@ Retro-fitting postcopy to existing clients is possible: guest memory access is made while holding a lock then all other threads waiting for that lock will also be blocked. -Postcopy Preemption Mode +Postcopy preemption mode ------------------------ Postcopy preempt is a new capability introduced in 8.0 QEMU release, it -- Gitee From e9614f86ff43d0417ddaa3eab8be67c565e561b9 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 9 Jan 2024 14:46:27 +0800 Subject: [PATCH 827/939] docs/migration: Further move vfio to be feature of migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 66fd3b1a7ab02f7d8c84f92eba23e3ddc955204d upstream. Move it one layer down, so taking VFIO-migration as a feature for migration. Cc: Alex Williamson Cc: Cédric Le Goater Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/r/20240109064628.595453-10-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- docs/devel/migration/features.rst | 1 + docs/devel/migration/index.rst | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/devel/migration/features.rst b/docs/devel/migration/features.rst index e257d0d100..dea016f707 100644 --- a/docs/devel/migration/features.rst +++ b/docs/devel/migration/features.rst @@ -8,3 +8,4 @@ Migration has plenty of features to support different use cases. postcopy dirty-limit + vfio diff --git a/docs/devel/migration/index.rst b/docs/devel/migration/index.rst index 21ad58b189..b1357309e1 100644 --- a/docs/devel/migration/index.rst +++ b/docs/devel/migration/index.rst @@ -10,6 +10,5 @@ QEMU live migration works. main features compatibility - vfio virtio best-practices -- Gitee From a8d5d9425ddec134a9e9c164a80b0bf1ba29381b Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 9 Jan 2024 14:46:28 +0800 Subject: [PATCH 828/939] docs/migration: Further move virtio to be feature of migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit eb9f6daae49c06bb91e9660908587cc55265e43a upstream. Move it one layer down, so taking Virtio-migration as a feature for migration. Cc: "Michael S. 
Tsirkin" Cc: Jason Wang Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/r/20240109064628.595453-11-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- docs/devel/migration/features.rst | 1 + docs/devel/migration/index.rst | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/devel/migration/features.rst b/docs/devel/migration/features.rst index dea016f707..a9acaf618e 100644 --- a/docs/devel/migration/features.rst +++ b/docs/devel/migration/features.rst @@ -9,3 +9,4 @@ Migration has plenty of features to support different use cases. postcopy dirty-limit vfio + virtio diff --git a/docs/devel/migration/index.rst b/docs/devel/migration/index.rst index b1357309e1..2aa294d631 100644 --- a/docs/devel/migration/index.rst +++ b/docs/devel/migration/index.rst @@ -10,5 +10,4 @@ QEMU live migration works. main features compatibility - virtio best-practices -- Gitee From c454cdf2eef413af1c5ca04524e15dffdfc90a58 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:35 +0800 Subject: [PATCH 829/939] migration/multifd: Drop stale comment for multifd zero copy commit 8888a552bf7af200e36ff123772547dfb4f133c4 upstream. We've already done that with multifd_flush_after_each_section, for multifd in general. Drop the stale "TODO-like" comment. Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-2-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index ef7d4520c4..07e7e78029 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -599,17 +599,6 @@ int multifd_send_sync_main(void) } } - /* - * When using zero-copy, it's necessary to flush the pages before any of - * the pages can be sent again, so we'll make sure the new version of the - * pages will always arrive _later_ than the old pages. - * - * Currently we achieve this by flushing the zero-page requested writes - * per ram iteration, but in the future we could potentially optimize it - * to be less frequent, e.g. only after we finished one whole scanning of - * all the dirty bitmaps. - */ - flush_zero_copy = migrate_zero_copy_send(); for (i = 0; i < migrate_multifd_channels(); i++) { -- Gitee From fa8d23b539d417e69cc0a02f13ca66ef2b506d8e Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:36 +0800 Subject: [PATCH 830/939] migration/multifd: multifd_send_kick_main() commit 48c0f5d56fd2ff0a0cda23301637b742c690f59a upstream. When a multifd sender thread hit errors, it always needs to kick the main thread by kicking all the semaphores that it can be waiting upon. Provide a helper for it and deduplicate the code. Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-3-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 07e7e78029..d2da6178b0 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -373,6 +373,18 @@ struct { MultiFDMethods *ops; } *multifd_send_state; +/* + * The migration thread can wait on either of the two semaphores. This + * function can be used to kick the main thread out of waiting on either of + * them. Should mostly only be called when something wrong happened with + * the current multifd send thread. 
+ */ +static void multifd_send_kick_main(MultiFDSendParams *p) +{ + qemu_sem_post(&p->sem_sync); + qemu_sem_post(&multifd_send_state->channels_ready); +} + /* * How we use multifd_send_state->pages and channel->pages? * @@ -743,8 +755,7 @@ out: assert(local_err); trace_multifd_send_error(p->id); multifd_send_terminate_threads(local_err); - qemu_sem_post(&p->sem_sync); - qemu_sem_post(&multifd_send_state->channels_ready); + multifd_send_kick_main(p); error_free(local_err); } @@ -785,8 +796,7 @@ static void multifd_tls_outgoing_handshake(QIOTask *task, * is not created, and then tell who pay attention to me. */ p->quit = true; - qemu_sem_post(&multifd_send_state->channels_ready); - qemu_sem_post(&p->sem_sync); + multifd_send_kick_main(p); error_free(err); } @@ -856,8 +866,7 @@ static void multifd_new_send_channel_cleanup(MultiFDSendParams *p, { migrate_set_error(migrate_get_current(), err); /* Error happen, we need to tell who pay attention to me */ - qemu_sem_post(&multifd_send_state->channels_ready); - qemu_sem_post(&p->sem_sync); + multifd_send_kick_main(p); /* * Although multifd_send_thread is not created, but main migration * thread need to judge whether it is running, so we need to mark -- Gitee From 046f864bba4035328269599e7d0e9de1b7a93932 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:37 +0800 Subject: [PATCH 831/939] migration/multifd: Drop MultiFDSendParams.quit, cleanup error paths commit 15f3f21d598148895c33b6fc41e29777cf6ad992 upstream. Multifd send side has two fields to indicate error quits: - MultiFDSendParams.quit - &multifd_send_state->exiting Merge them into the global one. The replacement is done by changing all p->quit checks into the global var check. The global check doesn't need any lock. A few more things done on top of this altogether: - multifd_send_terminate_threads() Moving the xchg() of &multifd_send_state->exiting upper, so as to cover the tracepoint, migrate_set_error() and migrate_set_state(). - multifd_send_sync_main() In the 2nd loop, add one more check over the global var to make sure we don't keep the looping if QEMU already decided to quit. - multifd_tls_outgoing_handshake() Use multifd_send_terminate_threads() to set the error state. That has a benefit of updating MigrationState.error to that error too, so we can persist that 1st error we hit in that specific channel. - multifd_new_send_channel_async() Take similar approach like above, drop the migrate_set_error() because multifd_send_terminate_threads() already covers that. Unwrap the helper multifd_new_send_channel_cleanup() along the way; not really needed. Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-4-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 85 ++++++++++++++++++--------------------------- migration/multifd.h | 2 -- 2 files changed, 33 insertions(+), 54 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index d2da6178b0..ea756b6eb8 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -373,6 +373,11 @@ struct { MultiFDMethods *ops; } *multifd_send_state; +static bool multifd_send_should_exit(void) +{ + return qatomic_read(&multifd_send_state->exiting); +} + /* * The migration thread can wait on either of the two semaphores. 
This * function can be used to kick the main thread out of waiting on either of @@ -410,7 +415,7 @@ static int multifd_send_pages(void) MultiFDSendParams *p = NULL; /* make happy gcc */ MultiFDPages_t *pages = multifd_send_state->pages; - if (qatomic_read(&multifd_send_state->exiting)) { + if (multifd_send_should_exit()) { return -1; } @@ -422,14 +427,11 @@ static int multifd_send_pages(void) */ next_channel %= migrate_multifd_channels(); for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) { - p = &multifd_send_state->params[i]; - - qemu_mutex_lock(&p->mutex); - if (p->quit) { - error_report("%s: channel %d has already quit!", __func__, i); - qemu_mutex_unlock(&p->mutex); + if (multifd_send_should_exit()) { return -1; } + p = &multifd_send_state->params[i]; + qemu_mutex_lock(&p->mutex); if (!p->pending_job) { p->pending_job++; next_channel = (i + 1) % migrate_multifd_channels(); @@ -484,6 +486,16 @@ static void multifd_send_terminate_threads(Error *err) { int i; + /* + * We don't want to exit each threads twice. Depending on where + * we get the error, or if there are two independent errors in two + * threads at the same time, we can end calling this function + * twice. + */ + if (qatomic_xchg(&multifd_send_state->exiting, 1)) { + return; + } + trace_multifd_send_terminate_threads(err != NULL); if (err) { @@ -498,26 +510,13 @@ static void multifd_send_terminate_threads(Error *err) } } - /* - * We don't want to exit each threads twice. Depending on where - * we get the error, or if there are two independent errors in two - * threads at the same time, we can end calling this function - * twice. - */ - if (qatomic_xchg(&multifd_send_state->exiting, 1)) { - return; - } - for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; - qemu_mutex_lock(&p->mutex); - p->quit = true; qemu_sem_post(&p->sem); if (p->c) { qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL); } - qemu_mutex_unlock(&p->mutex); } } @@ -616,16 +615,13 @@ int multifd_send_sync_main(void) for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; - trace_multifd_send_sync_main_signal(p->id); - - qemu_mutex_lock(&p->mutex); - - if (p->quit) { - error_report("%s: channel %d has already quit", __func__, i); - qemu_mutex_unlock(&p->mutex); + if (multifd_send_should_exit()) { return -1; } + trace_multifd_send_sync_main_signal(p->id); + + qemu_mutex_lock(&p->mutex); p->packet_num = multifd_send_state->packet_num++; p->flags |= MULTIFD_FLAG_SYNC; p->pending_job++; @@ -635,6 +631,10 @@ int multifd_send_sync_main(void) for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; + if (multifd_send_should_exit()) { + return -1; + } + qemu_sem_wait(&multifd_send_state->channels_ready); trace_multifd_send_sync_main_wait(p->id); qemu_sem_wait(&p->sem_sync); @@ -675,7 +675,7 @@ static void *multifd_send_thread(void *opaque) qemu_sem_post(&multifd_send_state->channels_ready); qemu_sem_wait(&p->sem); - if (qatomic_read(&multifd_send_state->exiting)) { + if (multifd_send_should_exit()) { break; } qemu_mutex_lock(&p->mutex); @@ -790,12 +790,7 @@ static void multifd_tls_outgoing_handshake(QIOTask *task, trace_multifd_tls_outgoing_handshake_error(ioc, error_get_pretty(err)); - migrate_set_error(migrate_get_current(), err); - /* - * Error happen, mark multifd_send_thread status as 'quit' although it - * is not created, and then tell who pay attention to me. 
- */ - p->quit = true; + multifd_send_terminate_threads(err); multifd_send_kick_main(p); error_free(err); } @@ -861,22 +856,6 @@ static bool multifd_channel_connect(MultiFDSendParams *p, return true; } -static void multifd_new_send_channel_cleanup(MultiFDSendParams *p, - QIOChannel *ioc, Error *err) -{ - migrate_set_error(migrate_get_current(), err); - /* Error happen, we need to tell who pay attention to me */ - multifd_send_kick_main(p); - /* - * Although multifd_send_thread is not created, but main migration - * thread need to judge whether it is running, so we need to mark - * its status. - */ - p->quit = true; - object_unref(OBJECT(ioc)); - error_free(err); -} - static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque) { MultiFDSendParams *p = opaque; @@ -893,7 +872,10 @@ static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque) } trace_multifd_new_send_channel_async_error(p->id, local_err); - multifd_new_send_channel_cleanup(p, ioc, local_err); + multifd_send_terminate_threads(local_err); + multifd_send_kick_main(p); + object_unref(OBJECT(ioc)); + error_free(local_err); } static void multifd_new_send_channel_create(gpointer opaque) @@ -925,7 +907,6 @@ int multifd_save_setup(Error **errp) qemu_mutex_init(&p->mutex); qemu_sem_init(&p->sem, 0); qemu_sem_init(&p->sem_sync, 0); - p->quit = false; p->pending_job = 0; p->id = i; p->pages = multifd_pages_init(page_count); diff --git a/migration/multifd.h b/migration/multifd.h index 35d11f103c..7c040cb85a 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -95,8 +95,6 @@ typedef struct { QemuMutex mutex; /* is this channel thread running */ bool running; - /* should this thread finish */ - bool quit; /* multifd flags for each packet */ uint32_t flags; /* global number of generated multifd packets */ -- Gitee From 9ce63dcad32efdb9e31db0db495bf4a3e1a96595 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:38 +0800 Subject: [PATCH 832/939] migration/multifd: Postpone reset of MultiFDPages_t commit 836eca47f62f9f6d5b8e9b6fedfc3539775c4e2e upstream. Now we reset MultiFDPages_t object in the multifd sender thread in the middle of the sending job. That's not necessary, because the "*pages" struct will not be reused anyway until pending_job is cleared. Move that to the end after the job is completed, provide a helper to reset a "*pages" object. Use that same helper when free the object too. This prepares us to keep using p->pages in the follow up patches, where we may drop p->normal[]. Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-5-peterx@redhat.com Signed-off-by: Peter Xu Conflicts: migration/multifd.c [jz: openEuler backported 254c67a88ab5 ("migration: fix-possible-int-overflow") which causes simple context conflict when cherry-pick this commit] Signed-off-by: Jason Zeng --- migration/multifd.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index ea756b6eb8..fff119237a 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -173,6 +173,17 @@ void multifd_register_ops(int method, MultiFDMethods *ops) multifd_ops[method] = ops; } +/* Reset a MultiFDPages_t* object for the next use */ +static void multifd_pages_reset(MultiFDPages_t *pages) +{ + /* + * We don't need to touch offset[] array, because it will be + * overwritten later when reused. 
+ */ + pages->num = 0; + pages->block = NULL; +} + static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp) { MultiFDInit_t msg = {}; @@ -249,9 +260,8 @@ static MultiFDPages_t *multifd_pages_init(uint32_t n) static void multifd_pages_clear(MultiFDPages_t *pages) { - pages->num = 0; + multifd_pages_reset(pages); pages->allocated = 0; - pages->block = NULL; g_free(pages->offset); pages->offset = NULL; g_free(pages); @@ -708,8 +718,6 @@ static void *multifd_send_thread(void *opaque) p->flags = 0; p->num_packets++; p->total_normal_pages += p->normal_num; - p->pages->num = 0; - p->pages->block = NULL; qemu_mutex_unlock(&p->mutex); trace_multifd_send(p->id, packet_num, p->normal_num, flags, @@ -736,6 +744,8 @@ static void *multifd_send_thread(void *opaque) stat64_add(&mig_stats.multifd_bytes, (uint64_t)p->next_packet_size + p->packet_len); + + multifd_pages_reset(p->pages); p->next_packet_size = 0; qemu_mutex_lock(&p->mutex); p->pending_job--; -- Gitee From 383f4cb78af723cf650841dc31862f9b0b612f4b Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:39 +0800 Subject: [PATCH 833/939] migration/multifd: Drop MultiFDSendParams.normal[] array commit efd8c5439db7eaf00f35adc0fcc4f01d916e8619 upstream. This array is redundant when p->pages exists. Now we extended the life of p->pages to the whole period where pending_job is set, it should be safe to always use p->pages->offset[] rather than p->normal[]. Drop the array. Alongside, the normal_num is also redundant, which is the same to p->pages->num. This doesn't apply to recv side, because there's no extra buffering on recv side, so p->normal[] array is still needed. Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-6-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd-zlib.c | 7 ++++--- migration/multifd-zstd.c | 7 ++++--- migration/multifd.c | 33 +++++++++++++-------------------- migration/multifd.h | 4 ---- 4 files changed, 21 insertions(+), 30 deletions(-) diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c index 37ce48621e..100809abc1 100644 --- a/migration/multifd-zlib.c +++ b/migration/multifd-zlib.c @@ -116,17 +116,18 @@ static void zlib_send_cleanup(MultiFDSendParams *p, Error **errp) */ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) { + MultiFDPages_t *pages = p->pages; struct zlib_data *z = p->data; z_stream *zs = &z->zs; uint32_t out_size = 0; int ret; uint32_t i; - for (i = 0; i < p->normal_num; i++) { + for (i = 0; i < pages->num; i++) { uint32_t available = z->zbuff_len - out_size; int flush = Z_NO_FLUSH; - if (i == p->normal_num - 1) { + if (i == pages->num - 1) { flush = Z_SYNC_FLUSH; } @@ -135,7 +136,7 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) * with compression. zlib does not guarantee that this is safe, * therefore copy the page before calling deflate(). 
*/ - memcpy(z->buf, p->pages->block->host + p->normal[i], p->page_size); + memcpy(z->buf, p->pages->block->host + pages->offset[i], p->page_size); zs->avail_in = p->page_size; zs->next_in = z->buf; diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c index b471daadcd..2023edd8cc 100644 --- a/migration/multifd-zstd.c +++ b/migration/multifd-zstd.c @@ -113,6 +113,7 @@ static void zstd_send_cleanup(MultiFDSendParams *p, Error **errp) */ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) { + MultiFDPages_t *pages = p->pages; struct zstd_data *z = p->data; int ret; uint32_t i; @@ -121,13 +122,13 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) z->out.size = z->zbuff_len; z->out.pos = 0; - for (i = 0; i < p->normal_num; i++) { + for (i = 0; i < pages->num; i++) { ZSTD_EndDirective flush = ZSTD_e_continue; - if (i == p->normal_num - 1) { + if (i == pages->num - 1) { flush = ZSTD_e_flush; } - z->in.src = p->pages->block->host + p->normal[i]; + z->in.src = p->pages->block->host + pages->offset[i]; z->in.size = p->page_size; z->in.pos = 0; diff --git a/migration/multifd.c b/migration/multifd.c index fff119237a..bfafe94e1e 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -91,13 +91,13 @@ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp) { MultiFDPages_t *pages = p->pages; - for (int i = 0; i < p->normal_num; i++) { - p->iov[p->iovs_num].iov_base = pages->block->host + p->normal[i]; + for (int i = 0; i < pages->num; i++) { + p->iov[p->iovs_num].iov_base = pages->block->host + pages->offset[i]; p->iov[p->iovs_num].iov_len = p->page_size; p->iovs_num++; } - p->next_packet_size = p->normal_num * p->page_size; + p->next_packet_size = pages->num * p->page_size; p->flags |= MULTIFD_FLAG_NOCOMP; return 0; } @@ -270,21 +270,22 @@ static void multifd_pages_clear(MultiFDPages_t *pages) static void multifd_send_fill_packet(MultiFDSendParams *p) { MultiFDPacket_t *packet = p->packet; + MultiFDPages_t *pages = p->pages; int i; packet->flags = cpu_to_be32(p->flags); packet->pages_alloc = cpu_to_be32(p->pages->allocated); - packet->normal_pages = cpu_to_be32(p->normal_num); + packet->normal_pages = cpu_to_be32(pages->num); packet->next_packet_size = cpu_to_be32(p->next_packet_size); packet->packet_num = cpu_to_be64(p->packet_num); - if (p->pages->block) { - strncpy(packet->ramblock, p->pages->block->idstr, 256); + if (pages->block) { + strncpy(packet->ramblock, pages->block->idstr, 256); } - for (i = 0; i < p->normal_num; i++) { + for (i = 0; i < pages->num; i++) { /* there are architectures where ram_addr_t is 32 bit */ - uint64_t temp = p->normal[i]; + uint64_t temp = pages->offset[i]; packet->offset[i] = cpu_to_be64(temp); } @@ -571,8 +572,6 @@ void multifd_save_cleanup(void) p->packet = NULL; g_free(p->iov); p->iov = NULL; - g_free(p->normal); - p->normal = NULL; multifd_send_state->ops->send_cleanup(p, &local_err); if (local_err) { migrate_set_error(migrate_get_current(), local_err); @@ -692,8 +691,8 @@ static void *multifd_send_thread(void *opaque) if (p->pending_job) { uint64_t packet_num = p->packet_num; + MultiFDPages_t *pages = p->pages; uint32_t flags; - p->normal_num = 0; if (use_zero_copy_send) { p->iovs_num = 0; @@ -701,12 +700,7 @@ static void *multifd_send_thread(void *opaque) p->iovs_num = 1; } - for (int i = 0; i < p->pages->num; i++) { - p->normal[p->normal_num] = p->pages->offset[i]; - p->normal_num++; - } - - if (p->normal_num) { + if (pages->num) { ret = multifd_send_state->ops->send_prepare(p, &local_err); if (ret != 0) 
{ qemu_mutex_unlock(&p->mutex); @@ -717,10 +711,10 @@ static void *multifd_send_thread(void *opaque) flags = p->flags; p->flags = 0; p->num_packets++; - p->total_normal_pages += p->normal_num; + p->total_normal_pages += pages->num; qemu_mutex_unlock(&p->mutex); - trace_multifd_send(p->id, packet_num, p->normal_num, flags, + trace_multifd_send(p->id, packet_num, pages->num, flags, p->next_packet_size); if (use_zero_copy_send) { @@ -928,7 +922,6 @@ int multifd_save_setup(Error **errp) p->name = g_strdup_printf("multifdsend_%d", i); /* We need one extra place for the packet header */ p->iov = g_new0(struct iovec, page_count + 1); - p->normal = g_new0(ram_addr_t, page_count); p->page_size = qemu_target_page_size(); p->page_count = page_count; diff --git a/migration/multifd.h b/migration/multifd.h index 7c040cb85a..3920bdbcf1 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -122,10 +122,6 @@ typedef struct { struct iovec *iov; /* number of iovs used */ uint32_t iovs_num; - /* Pages that are not zero */ - ram_addr_t *normal; - /* num of non zero pages */ - uint32_t normal_num; /* used for compression methods */ void *data; } MultiFDSendParams; -- Gitee From 40021e3b91b10672849477f4d76712ff3e78f738 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:40 +0800 Subject: [PATCH 834/939] migration/multifd: Separate SYNC request with normal jobs commit f5f48a7891cf6664a920ba52f6f4dea1646049a4 upstream. Multifd provide a threaded model for processing jobs. On sender side, there can be two kinds of job: (1) a list of pages to send, or (2) a sync request. The sync request is a very special kind of job. It never contains a page array, but only a multifd packet telling the dest side to synchronize with sent pages. Before this patch, both requests use the pending_job field, no matter what the request is, it will boost pending_job, while multifd sender thread will decrement it after it finishes one job. However this should be racy, because SYNC is special in that it needs to set p->flags with MULTIFD_FLAG_SYNC, showing that this is a sync request. Consider a sequence of operations where: - migration thread enqueue a job to send some pages, pending_job++ (0->1) - [...before the selected multifd sender thread wakes up...] - migration thread enqueue another job to sync, pending_job++ (1->2), setup p->flags=MULTIFD_FLAG_SYNC - multifd sender thread wakes up, found pending_job==2 - send the 1st packet with MULTIFD_FLAG_SYNC and list of pages - send the 2nd packet with flags==0 and no pages This is not expected, because MULTIFD_FLAG_SYNC should hopefully be done after all the pages are received. Meanwhile, the 2nd packet will be completely useless, which contains zero information. I didn't verify above, but I think this issue is still benign in that at least on the recv side we always receive pages before handling MULTIFD_FLAG_SYNC. However that's not always guaranteed and just tricky. One other reason I want to separate it is using p->flags to communicate between the two threads is also not clearly defined, it's very hard to read and understand why accessing p->flags is always safe; see the current impl of multifd_send_thread() where we tried to cache only p->flags. It doesn't need to be that complicated. This patch introduces pending_sync, a separate flag just to show that the requester needs a sync. Alongside, we remove the tricky caching of p->flags now because after this patch p->flags should only be used by multifd sender thread now, which will be crystal clear. 
So it is always thread safe to access p->flags. With that, we can also safely convert the pending_job into a boolean, because we don't support >1 pending jobs anyway. Always use atomic ops to access both flags to make sure no cache effect. When at it, drop the initial setting of "pending_job = 0" because it's always allocated using g_new0(). Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-7-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 39 +++++++++++++++++++++++++-------------- migration/multifd.h | 13 +++++++++++-- 2 files changed, 36 insertions(+), 16 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index bfafe94e1e..dd90c09b26 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -443,8 +443,8 @@ static int multifd_send_pages(void) } p = &multifd_send_state->params[i]; qemu_mutex_lock(&p->mutex); - if (!p->pending_job) { - p->pending_job++; + if (qatomic_read(&p->pending_job) == false) { + qatomic_set(&p->pending_job, true); next_channel = (i + 1) % migrate_multifd_channels(); break; } @@ -632,8 +632,12 @@ int multifd_send_sync_main(void) qemu_mutex_lock(&p->mutex); p->packet_num = multifd_send_state->packet_num++; - p->flags |= MULTIFD_FLAG_SYNC; - p->pending_job++; + /* + * We should be the only user so far, so not possible to be set by + * others concurrently. + */ + assert(qatomic_read(&p->pending_sync) == false); + qatomic_set(&p->pending_sync, true); qemu_mutex_unlock(&p->mutex); qemu_sem_post(&p->sem); } @@ -689,10 +693,9 @@ static void *multifd_send_thread(void *opaque) } qemu_mutex_lock(&p->mutex); - if (p->pending_job) { + if (qatomic_read(&p->pending_job)) { uint64_t packet_num = p->packet_num; MultiFDPages_t *pages = p->pages; - uint32_t flags; if (use_zero_copy_send) { p->iovs_num = 0; @@ -708,13 +711,11 @@ static void *multifd_send_thread(void *opaque) } } multifd_send_fill_packet(p); - flags = p->flags; - p->flags = 0; p->num_packets++; p->total_normal_pages += pages->num; qemu_mutex_unlock(&p->mutex); - trace_multifd_send(p->id, packet_num, pages->num, flags, + trace_multifd_send(p->id, packet_num, pages->num, p->flags, p->next_packet_size); if (use_zero_copy_send) { @@ -742,12 +743,23 @@ static void *multifd_send_thread(void *opaque) multifd_pages_reset(p->pages); p->next_packet_size = 0; qemu_mutex_lock(&p->mutex); - p->pending_job--; + qatomic_set(&p->pending_job, false); qemu_mutex_unlock(&p->mutex); - - if (flags & MULTIFD_FLAG_SYNC) { - qemu_sem_post(&p->sem_sync); + } else if (qatomic_read(&p->pending_sync)) { + p->flags = MULTIFD_FLAG_SYNC; + multifd_send_fill_packet(p); + ret = qio_channel_write_all(p->c, (void *)p->packet, + p->packet_len, &local_err); + if (ret != 0) { + qemu_mutex_unlock(&p->mutex); + break; } + /* p->next_packet_size will always be zero for a SYNC packet */ + stat64_add(&mig_stats.multifd_bytes, p->packet_len); + p->flags = 0; + qatomic_set(&p->pending_sync, false); + qemu_mutex_unlock(&p->mutex); + qemu_sem_post(&p->sem_sync); } else { qemu_mutex_unlock(&p->mutex); /* sometimes there are spurious wakeups */ @@ -911,7 +923,6 @@ int multifd_save_setup(Error **errp) qemu_mutex_init(&p->mutex); qemu_sem_init(&p->sem, 0); qemu_sem_init(&p->sem_sync, 0); - p->pending_job = 0; p->id = i; p->pages = multifd_pages_init(page_count); p->packet_len = sizeof(MultiFDPacket_t) diff --git a/migration/multifd.h b/migration/multifd.h index 3920bdbcf1..08f26ef3fe 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -99,8 +99,17 @@ typedef 
struct { uint32_t flags; /* global number of generated multifd packets */ uint64_t packet_num; - /* thread has work to do */ - int pending_job; + /* + * The sender thread has work to do if either of below boolean is set. + * + * @pending_job: a job is pending + * @pending_sync: a sync request is pending + * + * For both of these fields, they're only set by the requesters, and + * cleared by the multifd sender threads. + */ + bool pending_job; + bool pending_sync; /* array of pages to sent. * The owner of 'pages' depends of 'pending_job' value: * pending_job == 0 -> migration_thread can use it. -- Gitee From 9e616674520aa0272393eda94a4ad7301969b73c Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:41 +0800 Subject: [PATCH 835/939] migration/multifd: Simplify locking in sender thread commit e3cce9af10b06c51434ced4e1a6686f1ce43e124 upstream. The sender thread will yield the p->mutex before IO starts, trying to not block the requester thread. This may be unnecessary lock optimizations, because the requester can already read pending_job safely even without the lock, because the requester is currently the only one who can assign a task. Drop that lock complication on both sides: (1) in the sender thread, always take the mutex until job done (2) in the requester thread, check pending_job clear lockless Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-8-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index dd90c09b26..cef4a88237 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -430,7 +430,9 @@ static int multifd_send_pages(void) return -1; } + /* We wait here, until at least one channel is ready */ qemu_sem_wait(&multifd_send_state->channels_ready); + /* * next_channel can remain from a previous migration that was * using more channels, so ensure it doesn't overflow if the @@ -442,17 +444,26 @@ static int multifd_send_pages(void) return -1; } p = &multifd_send_state->params[i]; - qemu_mutex_lock(&p->mutex); + /* + * Lockless read to p->pending_job is safe, because only multifd + * sender thread can clear it. + */ if (qatomic_read(&p->pending_job) == false) { - qatomic_set(&p->pending_job, true); next_channel = (i + 1) % migrate_multifd_channels(); break; } - qemu_mutex_unlock(&p->mutex); } + + qemu_mutex_lock(&p->mutex); assert(!p->pages->num); assert(!p->pages->block); - + /* + * Double check on pending_job==false with the lock. In the future if + * we can have >1 requester thread, we can replace this with a "goto + * retry", but that is for later. 
+ */ + assert(qatomic_read(&p->pending_job) == false); + qatomic_set(&p->pending_job, true); p->packet_num = multifd_send_state->packet_num++; multifd_send_state->pages = p->pages; p->pages = pages; @@ -713,8 +724,6 @@ static void *multifd_send_thread(void *opaque) multifd_send_fill_packet(p); p->num_packets++; p->total_normal_pages += pages->num; - qemu_mutex_unlock(&p->mutex); - trace_multifd_send(p->id, packet_num, pages->num, p->flags, p->next_packet_size); @@ -734,6 +743,7 @@ static void *multifd_send_thread(void *opaque) ret = qio_channel_writev_full_all(p->c, p->iov, p->iovs_num, NULL, 0, p->write_flags, &local_err); if (ret != 0) { + qemu_mutex_unlock(&p->mutex); break; } @@ -742,7 +752,6 @@ static void *multifd_send_thread(void *opaque) multifd_pages_reset(p->pages); p->next_packet_size = 0; - qemu_mutex_lock(&p->mutex); qatomic_set(&p->pending_job, false); qemu_mutex_unlock(&p->mutex); } else if (qatomic_read(&p->pending_sync)) { -- Gitee From b24853b2f5524d988406732fc22c3fe9253de104 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:42 +0800 Subject: [PATCH 836/939] migration/multifd: Drop pages->num check in sender thread commit 83c560fb4249ee5698652249e0c1730c3d611a9b upstream. Now with a split SYNC handler, we always have pages->num set for pending_job==true. Assert it instead. Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-9-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index cef4a88237..a67917b113 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -714,13 +714,14 @@ static void *multifd_send_thread(void *opaque) p->iovs_num = 1; } - if (pages->num) { - ret = multifd_send_state->ops->send_prepare(p, &local_err); - if (ret != 0) { - qemu_mutex_unlock(&p->mutex); - break; - } + assert(pages->num); + + ret = multifd_send_state->ops->send_prepare(p, &local_err); + if (ret != 0) { + qemu_mutex_unlock(&p->mutex); + break; } + multifd_send_fill_packet(p); p->num_packets++; p->total_normal_pages += pages->num; -- Gitee From a10ddd65e951c65119135eb847c93ab8db980638 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:43 +0800 Subject: [PATCH 837/939] migration/multifd: Rename p->num_packets and clean it up commit 05b7ec1890158471afb8537a6817a7e0d0a6c938 upstream. This field, no matter whether on src or dest, is only used for debugging purpose. They can even be removed already, unless it still more or less provide some accounting on "how many packets are sent/recved for this thread". The other more important one is called packet_num, which is embeded in the multifd packet headers (MultiFDPacket_t). So let's keep them for now, but make them much easier to understand, by doing below: - Rename both of them to packets_sent / packets_recved, the old name (num_packets) are waaay too confusing when we already have MultiFDPacket_t.packets_num. - Avoid worrying on the "initial packet": we know we will send it, that's good enough. The accounting won't matter a great deal to start with 0 or with 1. - Move them to where we send/recv the packets. They're: - multifd_send_fill_packet() for senders. - multifd_recv_unfill_packet() for receivers. 
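[Editor's illustration] Taken together, the preceding multifd patches converge on a simple
ownership protocol: the requester (migration thread) picks a channel whose pending_job flag is
clear, hands it work and kicks its semaphore; only that channel's sender thread ever clears the
flag again. A stripped-down sketch with invented names, using GCC atomic builtins in place of
QEMU's qatomic helpers, and omitting the channels_ready wait that guarantees at least one idle
channel::

    #include <semaphore.h>
    #include <stdbool.h>

    typedef struct {
        bool  pending_job;  /* set by the requester, cleared only by the
                               owning sender thread                       */
        sem_t sem;          /* wakes the sender thread                    */
    } Channel;

    /* Requester side: a lockless scan is safe because only the sender
     * thread ever clears pending_job. */
    static Channel *pick_idle_channel(Channel *ch, int n, int *next)
    {
        for (int i = *next; ; i = (i + 1) % n) {
            if (!__atomic_load_n(&ch[i].pending_job, __ATOMIC_ACQUIRE)) {
                *next = (i + 1) % n;
                return &ch[i];
            }
        }
    }

    static void post_job(Channel *c)
    {
        __atomic_store_n(&c->pending_job, true, __ATOMIC_RELEASE);
        sem_post(&c->sem);  /* sender thread sends, then clears the flag */
    }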
Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-10-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 13 +++++-------- migration/multifd.h | 6 +++--- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index a67917b113..f79badb546 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -289,6 +289,8 @@ static void multifd_send_fill_packet(MultiFDSendParams *p) packet->offset[i] = cpu_to_be64(temp); } + + p->packets_sent++; } static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) @@ -336,6 +338,7 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) p->next_packet_size = be32_to_cpu(packet->next_packet_size); p->packet_num = be64_to_cpu(packet->packet_num); + p->packets_recved++; if (p->normal_num == 0) { return 0; @@ -692,8 +695,6 @@ static void *multifd_send_thread(void *opaque) ret = -1; goto out; } - /* initial packet */ - p->num_packets = 1; while (true) { qemu_sem_post(&multifd_send_state->channels_ready); @@ -723,7 +724,6 @@ static void *multifd_send_thread(void *opaque) } multifd_send_fill_packet(p); - p->num_packets++; p->total_normal_pages += pages->num; trace_multifd_send(p->id, packet_num, pages->num, p->flags, p->next_packet_size); @@ -791,7 +791,7 @@ out: rcu_unregister_thread(); migration_threads_remove(thread); - trace_multifd_send_thread_end(p->id, p->num_packets, p->total_normal_pages); + trace_multifd_send_thread_end(p->id, p->packets_sent, p->total_normal_pages); return NULL; } @@ -1128,7 +1128,6 @@ static void *multifd_recv_thread(void *opaque) p->flags &= ~MULTIFD_FLAG_SYNC; trace_multifd_recv(p->id, p->packet_num, p->normal_num, flags, p->next_packet_size); - p->num_packets++; p->total_normal_pages += p->normal_num; qemu_mutex_unlock(&p->mutex); @@ -1154,7 +1153,7 @@ static void *multifd_recv_thread(void *opaque) qemu_mutex_unlock(&p->mutex); rcu_unregister_thread(); - trace_multifd_recv_thread_end(p->id, p->num_packets, p->total_normal_pages); + trace_multifd_recv_thread_end(p->id, p->packets_recved, p->total_normal_pages); return NULL; } @@ -1256,8 +1255,6 @@ void multifd_recv_new_channel(QIOChannel *ioc, Error **errp) } p->c = ioc; object_ref(OBJECT(ioc)); - /* initial packet */ - p->num_packets = 1; p->running = true; qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p, diff --git a/migration/multifd.h b/migration/multifd.h index 08f26ef3fe..2e4ad0dc56 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -124,7 +124,7 @@ typedef struct { /* size of the next packet that contains pages */ uint32_t next_packet_size; /* packets sent through this channel */ - uint64_t num_packets; + uint64_t packets_sent; /* non zero pages sent through this channel */ uint64_t total_normal_pages; /* buffers to send */ @@ -174,8 +174,8 @@ typedef struct { MultiFDPacket_t *packet; /* size of the next packet that contains pages */ uint32_t next_packet_size; - /* packets sent through this channel */ - uint64_t num_packets; + /* packets received through this channel */ + uint64_t packets_recved; /* ramblock */ RAMBlock *block; /* ramblock host address */ -- Gitee From 2316c555d9893f3e637260367477edcf40592679 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:44 +0800 Subject: [PATCH 838/939] migration/multifd: Move total_normal_pages accounting commit db7e1cc5103137743394a939045a17fa2b30a0dc upstream. 
Just like the previous patch, move the accounting for total_normal_pages on both src/dst sides into the packet fill/unfill procedures. Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-11-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index f79badb546..510bfdcac8 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -291,6 +291,7 @@ static void multifd_send_fill_packet(MultiFDSendParams *p) } p->packets_sent++; + p->total_normal_pages += pages->num; } static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) @@ -339,6 +340,7 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) p->next_packet_size = be32_to_cpu(packet->next_packet_size); p->packet_num = be64_to_cpu(packet->packet_num); p->packets_recved++; + p->total_normal_pages += p->normal_num; if (p->normal_num == 0) { return 0; @@ -724,7 +726,6 @@ static void *multifd_send_thread(void *opaque) } multifd_send_fill_packet(p); - p->total_normal_pages += pages->num; trace_multifd_send(p->id, packet_num, pages->num, p->flags, p->next_packet_size); @@ -1128,7 +1129,6 @@ static void *multifd_recv_thread(void *opaque) p->flags &= ~MULTIFD_FLAG_SYNC; trace_multifd_recv(p->id, p->packet_num, p->normal_num, flags, p->next_packet_size); - p->total_normal_pages += p->normal_num; qemu_mutex_unlock(&p->mutex); if (p->normal_num) { -- Gitee From 8a1deb6f19abbd8824a9b3e04abc77f5f72f37f6 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:45 +0800 Subject: [PATCH 839/939] migration/multifd: Move trace_multifd_send|recv() commit 8a9ef1738037e1d1132f9e1bd3e2f1102bde719f upstream. Move them into fill/unfill of packets. With that, we can further cleanup the send/recv thread procedure, and remove one more temp var. 
Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-12-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 510bfdcac8..f545faaa52 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -292,6 +292,9 @@ static void multifd_send_fill_packet(MultiFDSendParams *p) p->packets_sent++; p->total_normal_pages += pages->num; + + trace_multifd_send(p->id, p->packet_num, pages->num, p->flags, + p->next_packet_size); } static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) @@ -342,6 +345,9 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) p->packets_recved++; p->total_normal_pages += p->normal_num; + trace_multifd_recv(p->id, p->packet_num, p->normal_num, p->flags, + p->next_packet_size); + if (p->normal_num == 0) { return 0; } @@ -708,7 +714,6 @@ static void *multifd_send_thread(void *opaque) qemu_mutex_lock(&p->mutex); if (qatomic_read(&p->pending_job)) { - uint64_t packet_num = p->packet_num; MultiFDPages_t *pages = p->pages; if (use_zero_copy_send) { @@ -726,8 +731,6 @@ static void *multifd_send_thread(void *opaque) } multifd_send_fill_packet(p); - trace_multifd_send(p->id, packet_num, pages->num, p->flags, - p->next_packet_size); if (use_zero_copy_send) { /* Send header first, without zerocopy */ @@ -1127,8 +1130,6 @@ static void *multifd_recv_thread(void *opaque) flags = p->flags; /* recv methods don't know how to handle the SYNC flag */ p->flags &= ~MULTIFD_FLAG_SYNC; - trace_multifd_recv(p->id, p->packet_num, p->normal_num, flags, - p->next_packet_size); qemu_mutex_unlock(&p->mutex); if (p->normal_num) { -- Gitee From fb749030a3151fff95a84f478ec5bcc1b5e0d07c Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:46 +0800 Subject: [PATCH 840/939] migration/multifd: multifd_send_prepare_header() commit 452b205702335ddd45554aaf0eb37baf50bdfa00 upstream. Introduce a helper multifd_send_prepare_header() to setup the header packet for multifd sender. It's fine to setup the IOV[0] _before_ send_prepare() because the packet buffer is already ready, even if the content is to be filled in. With this helper, we can already slightly clean up the zero copy path. Note that I explicitly put it into multifd.h, because I want it inlined directly into multifd*.c where necessary later. Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-13-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 16 ++++++++-------- migration/multifd.h | 8 ++++++++ 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index f545faaa52..a42e152268 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -716,10 +716,14 @@ static void *multifd_send_thread(void *opaque) if (qatomic_read(&p->pending_job)) { MultiFDPages_t *pages = p->pages; - if (use_zero_copy_send) { - p->iovs_num = 0; - } else { - p->iovs_num = 1; + p->iovs_num = 0; + + if (!use_zero_copy_send) { + /* + * Only !zerocopy needs the header in IOV; zerocopy will + * send it separately. 
+ */ + multifd_send_prepare_header(p); } assert(pages->num); @@ -739,10 +743,6 @@ static void *multifd_send_thread(void *opaque) if (ret != 0) { break; } - } else { - /* Send header using the same writev call */ - p->iov[0].iov_len = p->packet_len; - p->iov[0].iov_base = p->packet; } ret = qio_channel_writev_full_all(p->c, p->iov, p->iovs_num, NULL, diff --git a/migration/multifd.h b/migration/multifd.h index 2e4ad0dc56..4ec005f53f 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -209,5 +209,13 @@ typedef struct { void multifd_register_ops(int method, MultiFDMethods *ops); +static inline void multifd_send_prepare_header(MultiFDSendParams *p) +{ + p->iov[0].iov_len = p->packet_len; + p->iov[0].iov_base = p->packet; + p->iovs_num++; +} + + #endif -- Gitee From 1dfecda79660d2b68cd56a7e44ef76ac847f54d1 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:47 +0800 Subject: [PATCH 841/939] migration/multifd: Move header prepare/fill into send_prepare() commit 25a1f8787597f6906b151b2f73ae6cc92a31de57 upstream. This patch redefines the interfacing of ->send_prepare(). It further simplifies multifd_send_thread() especially on zero copy. Now with the new interface, we require the hook to do all the work for preparing the IOVs to send. After it's completed, the IOVs should be ready to be dumped into the specific multifd QIOChannel later. So now the API looks like: p->pages -----------> send_prepare() -------------> IOVs This also prepares for the case where the input can be extended to even not any p->pages. But that's for later. This patch will achieve similar goal of what Fabiano used to propose here: https://lore.kernel.org/r/20240126221943.26628-1-farosas@suse.de However the send() interface may not be necessary. I'm boldly attaching a "Co-developed-by" for Fabiano. 
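As a rough standalone sketch of the reshaped contract (plain C, hypothetical names, not the QEMU types): the hook now owns the whole IOV preparation, header included unless the header is sent separately.

    #include <stdio.h>
    #include <sys/uio.h>

    #define EX_PAGE_SIZE 4096
    #define EX_MAX_PAGES 4

    /* stand-in for MultiFDSendParams; every field here is illustrative */
    struct example_params {
        char header[64];                      /* models the packet header */
        char pages[EX_MAX_PAGES][EX_PAGE_SIZE];
        unsigned npages;
        struct iovec iov[EX_MAX_PAGES + 1];
        unsigned iovs_num;
        int zero_copy;                        /* header sent separately if set */
    };

    static void example_send_prepare(struct example_params *p)
    {
        p->iovs_num = 0;
        if (!p->zero_copy) {
            /* header travels in the same writev as the pages */
            p->iov[p->iovs_num].iov_base = p->header;
            p->iov[p->iovs_num].iov_len = sizeof(p->header);
            p->iovs_num++;
        }
        for (unsigned i = 0; i < p->npages; i++) {
            p->iov[p->iovs_num].iov_base = p->pages[i];
            p->iov[p->iovs_num].iov_len = EX_PAGE_SIZE;
            p->iovs_num++;
        }
        /* fill the packet header last, once the sizes are known */
        snprintf(p->header, sizeof(p->header), "pages=%u", p->npages);
    }

    int main(void)
    {
        struct example_params p = { .npages = 2 };

        example_send_prepare(&p);
        printf("iovs ready to send: %u\n", p.iovs_num);  /* header + 2 pages */
        return 0;
    }

After the hook returns, the caller only needs to dump p->iov into the channel, which is what the simplified multifd_send_thread() in the diff below ends up doing.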
Co-developed-by: Fabiano Rosas Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-14-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd-zlib.c | 4 +++ migration/multifd-zstd.c | 4 +++ migration/multifd.c | 61 ++++++++++++++++++---------------------- migration/multifd.h | 1 + 4 files changed, 37 insertions(+), 33 deletions(-) diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c index 100809abc1..012e3bdea1 100644 --- a/migration/multifd-zlib.c +++ b/migration/multifd-zlib.c @@ -123,6 +123,8 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) int ret; uint32_t i; + multifd_send_prepare_header(p); + for (i = 0; i < pages->num; i++) { uint32_t available = z->zbuff_len - out_size; int flush = Z_NO_FLUSH; @@ -172,6 +174,8 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) p->next_packet_size = out_size; p->flags |= MULTIFD_FLAG_ZLIB; + multifd_send_fill_packet(p); + return 0; } diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c index 2023edd8cc..dc8fe43e94 100644 --- a/migration/multifd-zstd.c +++ b/migration/multifd-zstd.c @@ -118,6 +118,8 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) int ret; uint32_t i; + multifd_send_prepare_header(p); + z->out.dst = z->zbuff; z->out.size = z->zbuff_len; z->out.pos = 0; @@ -161,6 +163,8 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) p->next_packet_size = z->out.pos; p->flags |= MULTIFD_FLAG_ZSTD; + multifd_send_fill_packet(p); + return 0; } diff --git a/migration/multifd.c b/migration/multifd.c index a42e152268..d4528cf9d1 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -51,15 +51,15 @@ typedef struct { /** * nocomp_send_setup: setup send side * - * For no compression this function does nothing. - * - * Returns 0 for success or -1 for error - * * @p: Params for the channel that we are using * @errp: pointer to an error */ static int nocomp_send_setup(MultiFDSendParams *p, Error **errp) { + if (migrate_zero_copy_send()) { + p->write_flags |= QIO_CHANNEL_WRITE_FLAG_ZERO_COPY; + } + return 0; } @@ -89,7 +89,17 @@ static void nocomp_send_cleanup(MultiFDSendParams *p, Error **errp) */ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp) { + bool use_zero_copy_send = migrate_zero_copy_send(); MultiFDPages_t *pages = p->pages; + int ret; + + if (!use_zero_copy_send) { + /* + * Only !zerocopy needs the header in IOV; zerocopy will + * send it separately. 
+ */ + multifd_send_prepare_header(p); + } for (int i = 0; i < pages->num; i++) { p->iov[p->iovs_num].iov_base = pages->block->host + pages->offset[i]; @@ -99,6 +109,18 @@ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp) p->next_packet_size = pages->num * p->page_size; p->flags |= MULTIFD_FLAG_NOCOMP; + + multifd_send_fill_packet(p); + + if (use_zero_copy_send) { + /* Send header first, without zerocopy */ + ret = qio_channel_write_all(p->c, (void *)p->packet, + p->packet_len, errp); + if (ret != 0) { + return -1; + } + } + return 0; } @@ -267,7 +289,7 @@ static void multifd_pages_clear(MultiFDPages_t *pages) g_free(pages); } -static void multifd_send_fill_packet(MultiFDSendParams *p) +void multifd_send_fill_packet(MultiFDSendParams *p) { MultiFDPacket_t *packet = p->packet; MultiFDPages_t *pages = p->pages; @@ -689,7 +711,6 @@ static void *multifd_send_thread(void *opaque) MigrationThread *thread = NULL; Error *local_err = NULL; int ret = 0; - bool use_zero_copy_send = migrate_zero_copy_send(); thread = migration_threads_add(p->name, qemu_get_thread_id()); @@ -717,15 +738,6 @@ static void *multifd_send_thread(void *opaque) MultiFDPages_t *pages = p->pages; p->iovs_num = 0; - - if (!use_zero_copy_send) { - /* - * Only !zerocopy needs the header in IOV; zerocopy will - * send it separately. - */ - multifd_send_prepare_header(p); - } - assert(pages->num); ret = multifd_send_state->ops->send_prepare(p, &local_err); @@ -734,17 +746,6 @@ static void *multifd_send_thread(void *opaque) break; } - multifd_send_fill_packet(p); - - if (use_zero_copy_send) { - /* Send header first, without zerocopy */ - ret = qio_channel_write_all(p->c, (void *)p->packet, - p->packet_len, &local_err); - if (ret != 0) { - break; - } - } - ret = qio_channel_writev_full_all(p->c, p->iov, p->iovs_num, NULL, 0, p->write_flags, &local_err); if (ret != 0) { @@ -949,13 +950,7 @@ int multifd_save_setup(Error **errp) p->iov = g_new0(struct iovec, page_count + 1); p->page_size = qemu_target_page_size(); p->page_count = page_count; - - if (migrate_zero_copy_send()) { - p->write_flags = QIO_CHANNEL_WRITE_FLAG_ZERO_COPY; - } else { - p->write_flags = 0; - } - + p->write_flags = 0; multifd_new_send_channel_create(p); } diff --git a/migration/multifd.h b/migration/multifd.h index 4ec005f53f..34a2ecb9f4 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -208,6 +208,7 @@ typedef struct { } MultiFDMethods; void multifd_register_ops(int method, MultiFDMethods *ops); +void multifd_send_fill_packet(MultiFDSendParams *p); static inline void multifd_send_prepare_header(MultiFDSendParams *p) { -- Gitee From d4f46c41e0dd921563614ad48e7099eeac06d285 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:48 +0800 Subject: [PATCH 842/939] migration/multifd: Forbid spurious wakeups commit 859ebaf346e8b5dece6cf255c604fe953d8ec9ab upstream. Now multifd's logic is designed to have no spurious wakeup. I still remember a talk to Juan and he seems to agree we should drop it now, and if my memory was right it was there because multifd used to hit that when still debugging. Let's drop it and see what can explode; as long as it's not reaching soft-freeze. 
Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-15-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index d4528cf9d1..3b7984cf99 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -760,7 +760,9 @@ static void *multifd_send_thread(void *opaque) p->next_packet_size = 0; qatomic_set(&p->pending_job, false); qemu_mutex_unlock(&p->mutex); - } else if (qatomic_read(&p->pending_sync)) { + } else { + /* If not a normal job, must be a sync request */ + assert(qatomic_read(&p->pending_sync)); p->flags = MULTIFD_FLAG_SYNC; multifd_send_fill_packet(p); ret = qio_channel_write_all(p->c, (void *)p->packet, @@ -775,9 +777,6 @@ static void *multifd_send_thread(void *opaque) qatomic_set(&p->pending_sync, false); qemu_mutex_unlock(&p->mutex); qemu_sem_post(&p->sem_sync); - } else { - qemu_mutex_unlock(&p->mutex); - /* sometimes there are spurious wakeups */ } } -- Gitee From e033a771a9d35a86b7864652abf61165bcdcaf55 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:49 +0800 Subject: [PATCH 843/939] migration/multifd: Split multifd_send_terminate_threads() commit 3ab4441d97af59ea09ee015d68c4770704b2b34f upstream. Split multifd_send_terminate_threads() into two functions: - multifd_send_set_error(): used when an error happened on the sender side, set error and quit state only - multifd_send_terminate_threads(): used only by the main thread to kick all multifd send threads out of sleep, for the last recycling. Use multifd_send_set_error() in the three old call sites where only the error will be set. Use multifd_send_terminate_threads() in the last one where the main thread will kick the multifd threads at last in multifd_save_cleanup(). Both helpers will need to set quitting=1. Suggested-by: Fabiano Rosas Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-16-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 27 ++++++++++++++++++--------- migration/trace-events | 2 +- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 3b7984cf99..59ccc42c05 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -537,10 +537,9 @@ int multifd_queue_page(RAMBlock *block, ram_addr_t offset) return 1; } -static void multifd_send_terminate_threads(Error *err) +/* Multifd send side hit an error; remember it and prepare to quit */ +static void multifd_send_set_error(Error *err) { - int i; - /* * We don't want to exit each threads twice. Depending on where * we get the error, or if there are two independent errors in two @@ -551,8 +550,6 @@ static void multifd_send_terminate_threads(Error *err) return; } - trace_multifd_send_terminate_threads(err != NULL); - if (err) { MigrationState *s = migrate_get_current(); migrate_set_error(s, err); @@ -564,7 +561,19 @@ static void multifd_send_terminate_threads(Error *err) MIGRATION_STATUS_FAILED); } } +} + +static void multifd_send_terminate_threads(void) +{ + int i; + + trace_multifd_send_terminate_threads(); + /* + * Tell everyone we're quitting. No xchg() needed here; we simply + * always set it. 
+ */ + qatomic_set(&multifd_send_state->exiting, 1); for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; @@ -587,7 +596,7 @@ void multifd_save_cleanup(void) if (!migrate_multifd()) { return; } - multifd_send_terminate_threads(NULL); + multifd_send_terminate_threads(); for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; @@ -784,7 +793,7 @@ out: if (ret) { assert(local_err); trace_multifd_send_error(p->id); - multifd_send_terminate_threads(local_err); + multifd_send_set_error(local_err); multifd_send_kick_main(p); error_free(local_err); } @@ -820,7 +829,7 @@ static void multifd_tls_outgoing_handshake(QIOTask *task, trace_multifd_tls_outgoing_handshake_error(ioc, error_get_pretty(err)); - multifd_send_terminate_threads(err); + multifd_send_set_error(err); multifd_send_kick_main(p); error_free(err); } @@ -902,7 +911,7 @@ static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque) } trace_multifd_new_send_channel_async_error(p->id, local_err); - multifd_send_terminate_threads(local_err); + multifd_send_set_error(local_err); multifd_send_kick_main(p); object_unref(OBJECT(ioc)); error_free(local_err); diff --git a/migration/trace-events b/migration/trace-events index de4a743c8a..298ad2b0dd 100644 --- a/migration/trace-events +++ b/migration/trace-events @@ -141,7 +141,7 @@ multifd_send_error(uint8_t id) "channel %u" multifd_send_sync_main(long packet_num) "packet num %ld" multifd_send_sync_main_signal(uint8_t id) "channel %u" multifd_send_sync_main_wait(uint8_t id) "channel %u" -multifd_send_terminate_threads(bool error) "error %d" +multifd_send_terminate_threads(void) "" multifd_send_thread_end(uint8_t id, uint64_t packets, uint64_t normal_pages) "channel %u packets %" PRIu64 " normal pages %" PRIu64 multifd_send_thread_start(uint8_t id) "%u" multifd_tls_outgoing_handshake_start(void *ioc, void *tioc, const char *hostname) "ioc=%p tioc=%p hostname=%s" -- Gitee From d95c440bb62e6eb30b3777e10d94fbc72b7f65a4 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:50 +0800 Subject: [PATCH 844/939] migration/multifd: Change retval of multifd_queue_page() commit d6556d174a6b9fc443f2320193f18e71eb67052a upstream. Using int is an overkill when there're only two options. Change it to a boolean. 
Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-17-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 9 +++++---- migration/multifd.h | 2 +- migration/ram.c | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 59ccc42c05..c48c031009 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -506,7 +506,8 @@ static int multifd_send_pages(void) return 1; } -int multifd_queue_page(RAMBlock *block, ram_addr_t offset) +/* Returns true if enqueue successful, false otherwise */ +bool multifd_queue_page(RAMBlock *block, ram_addr_t offset) { MultiFDPages_t *pages = multifd_send_state->pages; bool changed = false; @@ -520,21 +521,21 @@ int multifd_queue_page(RAMBlock *block, ram_addr_t offset) pages->num++; if (pages->num < pages->allocated) { - return 1; + return true; } } else { changed = true; } if (multifd_send_pages() < 0) { - return -1; + return false; } if (changed) { return multifd_queue_page(block, offset); } - return 1; + return true; } /* Multifd send side hit an error; remember it and prepare to quit */ diff --git a/migration/multifd.h b/migration/multifd.h index 34a2ecb9f4..a320c53a6f 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -22,7 +22,7 @@ bool multifd_recv_all_channels_created(void); void multifd_recv_new_channel(QIOChannel *ioc, Error **errp); void multifd_recv_sync_main(void); int multifd_send_sync_main(void); -int multifd_queue_page(RAMBlock *block, ram_addr_t offset); +bool multifd_queue_page(RAMBlock *block, ram_addr_t offset); /* Multifd Compression flags */ #define MULTIFD_FLAG_SYNC (1 << 0) diff --git a/migration/ram.c b/migration/ram.c index 67fa9c83d6..9630b654c2 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -1389,7 +1389,7 @@ static int ram_save_page(RAMState *rs, PageSearchStatus *pss) static int ram_save_multifd_page(RAMBlock *block, ram_addr_t offset) { - if (multifd_queue_page(block, offset) < 0) { + if (!multifd_queue_page(block, offset)) { return -1; } stat64_add(&mig_stats.normal_pages, 1); -- Gitee From c91e89ee776b145b265f56fc9539514b36988e84 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:51 +0800 Subject: [PATCH 845/939] migration/multifd: Change retval of multifd_send_pages() commit 3b40964a863d69121733c8b9794a02347ed0000b upstream. Using int is an overkill when there're only two options. Change it to a boolean. Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-18-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index c48c031009..dabfc3ec0d 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -450,9 +450,10 @@ static void multifd_send_kick_main(MultiFDSendParams *p) * thread is using the channel mutex when changing it, and the channel * have to had finish with its own, otherwise pending_job can't be * false. + * + * Returns true if succeed, false otherwise. 
*/ - -static int multifd_send_pages(void) +static bool multifd_send_pages(void) { int i; static int next_channel; @@ -460,7 +461,7 @@ static int multifd_send_pages(void) MultiFDPages_t *pages = multifd_send_state->pages; if (multifd_send_should_exit()) { - return -1; + return false; } /* We wait here, until at least one channel is ready */ @@ -474,7 +475,7 @@ static int multifd_send_pages(void) next_channel %= migrate_multifd_channels(); for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) { if (multifd_send_should_exit()) { - return -1; + return false; } p = &multifd_send_state->params[i]; /* @@ -503,7 +504,7 @@ static int multifd_send_pages(void) qemu_mutex_unlock(&p->mutex); qemu_sem_post(&p->sem); - return 1; + return true; } /* Returns true if enqueue successful, false otherwise */ @@ -527,7 +528,7 @@ bool multifd_queue_page(RAMBlock *block, ram_addr_t offset) changed = true; } - if (multifd_send_pages() < 0) { + if (!multifd_send_pages()) { return false; } @@ -667,7 +668,7 @@ int multifd_send_sync_main(void) return 0; } if (multifd_send_state->pages->num) { - if (multifd_send_pages() < 0) { + if (!multifd_send_pages()) { error_report("%s: multifd_send_pages fail", __func__); return -1; } -- Gitee From 68733215eef6342b28386fd6711f3ab82a7dc66a Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:52 +0800 Subject: [PATCH 846/939] migration/multifd: Rewrite multifd_queue_page() commit f88f86c4ee3fe673b34873e27af2de0a16fe01fd upstream. The current multifd_queue_page() is not easy to read and follow. It is not good with a few reasons: - No helper at all to show what exactly does a condition mean; in short, readability is low. - Rely on pages->ramblock being cleared to detect an empty queue. It's slightly an overload of the ramblock pointer, per Fabiano [1], which I also agree. - Contains a self recursion, even if not necessary.. Rewrite this function. We add some comments to make it even clearer on what it does. [1] https://lore.kernel.org/r/87wmrpjzew.fsf@suse.de Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-19-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 56 ++++++++++++++++++++++++++++++--------------- 1 file changed, 37 insertions(+), 19 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index dabfc3ec0d..f92e6776f0 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -507,35 +507,53 @@ static bool multifd_send_pages(void) return true; } +static inline bool multifd_queue_empty(MultiFDPages_t *pages) +{ + return pages->num == 0; +} + +static inline bool multifd_queue_full(MultiFDPages_t *pages) +{ + return pages->num == pages->allocated; +} + +static inline void multifd_enqueue(MultiFDPages_t *pages, ram_addr_t offset) +{ + pages->offset[pages->num++] = offset; +} + /* Returns true if enqueue successful, false otherwise */ bool multifd_queue_page(RAMBlock *block, ram_addr_t offset) { - MultiFDPages_t *pages = multifd_send_state->pages; - bool changed = false; + MultiFDPages_t *pages; + +retry: + pages = multifd_send_state->pages; - if (!pages->block) { + /* If the queue is empty, we can already enqueue now */ + if (multifd_queue_empty(pages)) { pages->block = block; + multifd_enqueue(pages, offset); + return true; } - if (pages->block == block) { - pages->offset[pages->num] = offset; - pages->num++; - - if (pages->num < pages->allocated) { - return true; + /* + * Not empty, meanwhile we need a flush. 
It can because of either: + * + * (1) The page is not on the same ramblock of previous ones, or, + * (2) The queue is full. + * + * After flush, always retry. + */ + if (pages->block != block || multifd_queue_full(pages)) { + if (!multifd_send_pages()) { + return false; } - } else { - changed = true; - } - - if (!multifd_send_pages()) { - return false; - } - - if (changed) { - return multifd_queue_page(block, offset); + goto retry; } + /* Not empty, and we still have space, do it! */ + multifd_enqueue(pages, offset); return true; } -- Gitee From bdcbbe9df0dcc74f21948ba459cc350da77446af Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:53 +0800 Subject: [PATCH 847/939] migration/multifd: Cleanup multifd_save_cleanup() commit 12808db3b8c22d26c9bc3da6f41756890ce882e4 upstream. Shrink the function by moving relevant works into helpers: move the thread join()s into multifd_send_terminate_threads(), then create two more helpers to cover channel/state cleanups. Add a TODO entry for the thread terminate process because p->running is still buggy. We need to fix it at some point but not yet covered. Suggested-by: Fabiano Rosas Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-20-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 91 +++++++++++++++++++++++++++++---------------- 1 file changed, 59 insertions(+), 32 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index f92e6776f0..83c6ccd0f2 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -594,6 +594,11 @@ static void multifd_send_terminate_threads(void) * always set it. */ qatomic_set(&multifd_send_state->exiting, 1); + + /* + * Firstly, kick all threads out; no matter whether they are just idle, + * or blocked in an IO system call. + */ for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; @@ -602,6 +607,21 @@ static void multifd_send_terminate_threads(void) qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL); } } + + /* + * Finally recycle all the threads. + * + * TODO: p->running is still buggy, e.g. we can reach here without the + * corresponding multifd_new_send_channel_async() get invoked yet, + * then a new thread can even be created after this function returns. 
+ */ + for (i = 0; i < migrate_multifd_channels(); i++) { + MultiFDSendParams *p = &multifd_send_state->params[i]; + + if (p->running) { + qemu_thread_join(&p->thread); + } + } } static int multifd_send_channel_destroy(QIOChannel *send) @@ -609,6 +629,41 @@ static int multifd_send_channel_destroy(QIOChannel *send) return socket_send_channel_destroy(send); } +static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) +{ + if (p->registered_yank) { + migration_ioc_unregister_yank(p->c); + } + multifd_send_channel_destroy(p->c); + p->c = NULL; + qemu_mutex_destroy(&p->mutex); + qemu_sem_destroy(&p->sem); + qemu_sem_destroy(&p->sem_sync); + g_free(p->name); + p->name = NULL; + multifd_pages_clear(p->pages); + p->pages = NULL; + p->packet_len = 0; + g_free(p->packet); + p->packet = NULL; + g_free(p->iov); + p->iov = NULL; + multifd_send_state->ops->send_cleanup(p, errp); + + return *errp == NULL; +} + +static void multifd_send_cleanup_state(void) +{ + qemu_sem_destroy(&multifd_send_state->channels_ready); + g_free(multifd_send_state->params); + multifd_send_state->params = NULL; + multifd_pages_clear(multifd_send_state->pages); + multifd_send_state->pages = NULL; + g_free(multifd_send_state); + multifd_send_state = NULL; +} + void multifd_save_cleanup(void) { int i; @@ -616,48 +671,20 @@ void multifd_save_cleanup(void) if (!migrate_multifd()) { return; } + multifd_send_terminate_threads(); - for (i = 0; i < migrate_multifd_channels(); i++) { - MultiFDSendParams *p = &multifd_send_state->params[i]; - if (p->running) { - qemu_thread_join(&p->thread); - } - } for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; Error *local_err = NULL; - if (p->registered_yank) { - migration_ioc_unregister_yank(p->c); - } - multifd_send_channel_destroy(p->c); - p->c = NULL; - qemu_mutex_destroy(&p->mutex); - qemu_sem_destroy(&p->sem); - qemu_sem_destroy(&p->sem_sync); - g_free(p->name); - p->name = NULL; - multifd_pages_clear(p->pages); - p->pages = NULL; - p->packet_len = 0; - g_free(p->packet); - p->packet = NULL; - g_free(p->iov); - p->iov = NULL; - multifd_send_state->ops->send_cleanup(p, &local_err); - if (local_err) { + if (!multifd_send_cleanup_channel(p, &local_err)) { migrate_set_error(migrate_get_current(), local_err); error_free(local_err); } } - qemu_sem_destroy(&multifd_send_state->channels_ready); - g_free(multifd_send_state->params); - multifd_send_state->params = NULL; - multifd_pages_clear(multifd_send_state->pages); - multifd_send_state->pages = NULL; - g_free(multifd_send_state); - multifd_send_state = NULL; + + multifd_send_cleanup_state(); } static int multifd_zero_copy_flush(QIOChannel *c) -- Gitee From d7240e133b0eebb08d42de278fbefbc89061143b Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:54 +0800 Subject: [PATCH 848/939] migration/multifd: Cleanup multifd_load_cleanup() commit 5e6ea8a1d64e72e648b5a5277f08ec7fb09c3b8e upstream. Use similar logic to cleanup the recv side. Note that multifd_recv_terminate_threads() may need some similar rework like the sender side, but let's leave that for later. 
Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-21-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 52 ++++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 83c6ccd0f2..048ff66760 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -1074,6 +1074,34 @@ void multifd_load_shutdown(void) } } +static void multifd_recv_cleanup_channel(MultiFDRecvParams *p) +{ + migration_ioc_unregister_yank(p->c); + object_unref(OBJECT(p->c)); + p->c = NULL; + qemu_mutex_destroy(&p->mutex); + qemu_sem_destroy(&p->sem_sync); + g_free(p->name); + p->name = NULL; + p->packet_len = 0; + g_free(p->packet); + p->packet = NULL; + g_free(p->iov); + p->iov = NULL; + g_free(p->normal); + p->normal = NULL; + multifd_recv_state->ops->recv_cleanup(p); +} + +static void multifd_recv_cleanup_state(void) +{ + qemu_sem_destroy(&multifd_recv_state->sem_sync); + g_free(multifd_recv_state->params); + multifd_recv_state->params = NULL; + g_free(multifd_recv_state); + multifd_recv_state = NULL; +} + void multifd_load_cleanup(void) { int i; @@ -1096,29 +1124,9 @@ void multifd_load_cleanup(void) qemu_thread_join(&p->thread); } for (i = 0; i < migrate_multifd_channels(); i++) { - MultiFDRecvParams *p = &multifd_recv_state->params[i]; - - migration_ioc_unregister_yank(p->c); - object_unref(OBJECT(p->c)); - p->c = NULL; - qemu_mutex_destroy(&p->mutex); - qemu_sem_destroy(&p->sem_sync); - g_free(p->name); - p->name = NULL; - p->packet_len = 0; - g_free(p->packet); - p->packet = NULL; - g_free(p->iov); - p->iov = NULL; - g_free(p->normal); - p->normal = NULL; - multifd_recv_state->ops->recv_cleanup(p); + multifd_recv_cleanup_channel(&multifd_recv_state->params[i]); } - qemu_sem_destroy(&multifd_recv_state->sem_sync); - g_free(multifd_recv_state->params); - multifd_recv_state->params = NULL; - g_free(multifd_recv_state); - multifd_recv_state = NULL; + multifd_recv_cleanup_state(); } void multifd_recv_sync_main(void) -- Gitee From f78f9157a90c7bef026f87fd38f6ce5b785f6cb7 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:55 +0800 Subject: [PATCH 849/939] migration/multifd: Stick with send/recv on function names commit cde85c37ca54e4a2dbee8653181938499887f6be upstream. Most of the multifd code uses send/recv to represent the two sides, but some rare cases use save/load. Since send/recv is the majority, replacing the save/load use cases to use send/recv globally. Now we reach a consensus on the naming. 
Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-22-peterx@redhat.com Signed-off-by: Peter Xu [jz: upstream renamed qemu_mutex_lock_iothread() to qpl_lock(), while openEuler not yet, resolve context conflict due to this] Signed-off-by: Jason Zeng --- migration/migration.c | 12 ++++++------ migration/multifd.c | 10 +++++----- migration/multifd.h | 10 +++++----- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index 2c5258d0b0..f428839dd6 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -269,7 +269,7 @@ void migration_incoming_state_destroy(void) { struct MigrationIncomingState *mis = migration_incoming_get_current(); - multifd_load_cleanup(); + multifd_recv_cleanup(); compress_threads_load_cleanup(); if (mis->to_src_file) { @@ -622,7 +622,7 @@ static void process_incoming_migration_bh(void *opaque) trace_vmstate_downtime_checkpoint("dst-precopy-bh-announced"); - multifd_load_shutdown(); + multifd_recv_shutdown(); dirty_bitmap_mig_before_vm_start(); @@ -721,7 +721,7 @@ fail: MIGRATION_STATUS_FAILED); qemu_fclose(mis->from_src_file); - multifd_load_cleanup(); + multifd_recv_cleanup(); compress_threads_load_cleanup(); exit(EXIT_FAILURE); @@ -854,7 +854,7 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp) default_channel = !mis->from_src_file; } - if (multifd_load_setup(errp) != 0) { + if (multifd_recv_setup(errp) != 0) { return; } @@ -1306,7 +1306,7 @@ static void migrate_fd_cleanup(MigrationState *s) } qemu_mutex_lock_iothread(); - multifd_save_cleanup(); + multifd_send_shutdown(); qemu_mutex_lock(&s->qemu_file_lock); tmp = s->to_dst_file; s->to_dst_file = NULL; @@ -3638,7 +3638,7 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) return; } - if (multifd_save_setup(&local_err) != 0) { + if (multifd_send_setup(&local_err) != 0) { migrate_set_error(s, local_err); error_report_err(local_err); migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, diff --git a/migration/multifd.c b/migration/multifd.c index 048ff66760..723b1d0b35 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -664,7 +664,7 @@ static void multifd_send_cleanup_state(void) multifd_send_state = NULL; } -void multifd_save_cleanup(void) +void multifd_send_shutdown(void) { int i; @@ -969,7 +969,7 @@ static void multifd_new_send_channel_create(gpointer opaque) socket_send_channel_create(multifd_new_send_channel_async, opaque); } -int multifd_save_setup(Error **errp) +int multifd_send_setup(Error **errp) { int thread_count; uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size(); @@ -1067,7 +1067,7 @@ static void multifd_recv_terminate_threads(Error *err) } } -void multifd_load_shutdown(void) +void multifd_recv_shutdown(void) { if (migrate_multifd()) { multifd_recv_terminate_threads(NULL); @@ -1102,7 +1102,7 @@ static void multifd_recv_cleanup_state(void) multifd_recv_state = NULL; } -void multifd_load_cleanup(void) +void multifd_recv_cleanup(void) { int i; @@ -1217,7 +1217,7 @@ static void *multifd_recv_thread(void *opaque) return NULL; } -int multifd_load_setup(Error **errp) +int multifd_recv_setup(Error **errp) { int thread_count; uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size(); diff --git a/migration/multifd.h b/migration/multifd.h index a320c53a6f..9b40a53cb6 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -13,11 +13,11 @@ #ifndef QEMU_MIGRATION_MULTIFD_H #define QEMU_MIGRATION_MULTIFD_H -int multifd_save_setup(Error **errp); -void 
multifd_save_cleanup(void); -int multifd_load_setup(Error **errp); -void multifd_load_cleanup(void); -void multifd_load_shutdown(void); +int multifd_send_setup(Error **errp); +void multifd_send_shutdown(void); +int multifd_recv_setup(Error **errp); +void multifd_recv_cleanup(void); +void multifd_recv_shutdown(void); bool multifd_recv_all_channels_created(void); void multifd_recv_new_channel(QIOChannel *ioc, Error **errp); void multifd_recv_sync_main(void); -- Gitee From cafe218b15706cf78c3790eaa08497c09d78c7b4 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:56 +0800 Subject: [PATCH 850/939] migration/multifd: Fix MultiFDSendParams.packet_num race commit 98ea497d8b8a5076be7b6ceb0dcc4a475373eb76 upstream. As reported correctly by Fabiano [1] (while per Fabiano, it sourced back to Elena's initial report in Oct 2023), MultiFDSendParams.packet_num is buggy to be assigned and stored. Consider two consequent operations of: (1) queue a job into multifd send thread X, then (2) queue another sync request to the same send thread X. Then the MultiFDSendParams.packet_num will be assigned twice, and the first assignment can get lost already. To avoid that, we move the packet_num assignment from p->packet_num into where the thread will fill in the packet. Use atomic operations to protect the field, making sure there's no race. Note that atomic fetch_add() may not be good for scaling purposes, however multifd should be fine as number of threads should normally not go beyond 16 threads. Let's leave that concern for later but fix the issue first. There's also a trick on how to make it always work even on 32 bit hosts for uint64_t packet number. Switching to uintptr_t as of now to simply the case. It will cause packet number to overflow easier on 32 bit, but that shouldn't be a major concern for now as 32 bit systems is not the major audience for any performance concerns like what multifd wants to address. We also need to move multifd_send_state definition upper, so that multifd_send_fill_packet() can reference it. [1] https://lore.kernel.org/r/87o7d1jlu5.fsf@suse.de Reported-by: Elena Ufimtseva Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-23-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 56 +++++++++++++++++++++++++++------------------ migration/multifd.h | 2 -- 2 files changed, 34 insertions(+), 24 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 723b1d0b35..c52c18046a 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -46,6 +46,35 @@ typedef struct { uint64_t unused2[4]; /* Reserved for future use */ } __attribute__((packed)) MultiFDInit_t; +struct { + MultiFDSendParams *params; + /* array of pages to sent */ + MultiFDPages_t *pages; + /* + * Global number of generated multifd packets. + * + * Note that we used 'uintptr_t' because it'll naturally support atomic + * operations on both 32bit / 64 bits hosts. It means on 32bit systems + * multifd will overflow the packet_num easier, but that should be + * fine. + * + * Another option is to use QEMU's Stat64 then it'll be 64 bits on all + * hosts, however so far it does not support atomic fetch_add() yet. + * Make it easy for now. + */ + uintptr_t packet_num; + /* send channels ready */ + QemuSemaphore channels_ready; + /* + * Have we already run terminate threads. There is a race when it + * happens that we got one error while we are exiting. + * We will use atomic operations. Only valid values are 0 and 1. 
+ */ + int exiting; + /* multifd ops */ + MultiFDMethods *ops; +} *multifd_send_state; + /* Multifd without compression */ /** @@ -293,13 +322,16 @@ void multifd_send_fill_packet(MultiFDSendParams *p) { MultiFDPacket_t *packet = p->packet; MultiFDPages_t *pages = p->pages; + uint64_t packet_num; int i; packet->flags = cpu_to_be32(p->flags); packet->pages_alloc = cpu_to_be32(p->pages->allocated); packet->normal_pages = cpu_to_be32(pages->num); packet->next_packet_size = cpu_to_be32(p->next_packet_size); - packet->packet_num = cpu_to_be64(p->packet_num); + + packet_num = qatomic_fetch_inc(&multifd_send_state->packet_num); + packet->packet_num = cpu_to_be64(packet_num); if (pages->block) { strncpy(packet->ramblock, pages->block->idstr, 256); @@ -315,7 +347,7 @@ void multifd_send_fill_packet(MultiFDSendParams *p) p->packets_sent++; p->total_normal_pages += pages->num; - trace_multifd_send(p->id, p->packet_num, pages->num, p->flags, + trace_multifd_send(p->id, packet_num, pages->num, p->flags, p->next_packet_size); } @@ -399,24 +431,6 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) return 0; } -struct { - MultiFDSendParams *params; - /* array of pages to sent */ - MultiFDPages_t *pages; - /* global number of generated multifd packets */ - uint64_t packet_num; - /* send channels ready */ - QemuSemaphore channels_ready; - /* - * Have we already run terminate threads. There is a race when it - * happens that we got one error while we are exiting. - * We will use atomic operations. Only valid values are 0 and 1. - */ - int exiting; - /* multifd ops */ - MultiFDMethods *ops; -} *multifd_send_state; - static bool multifd_send_should_exit(void) { return qatomic_read(&multifd_send_state->exiting); @@ -498,7 +512,6 @@ static bool multifd_send_pages(void) */ assert(qatomic_read(&p->pending_job) == false); qatomic_set(&p->pending_job, true); - p->packet_num = multifd_send_state->packet_num++; multifd_send_state->pages = p->pages; p->pages = pages; qemu_mutex_unlock(&p->mutex); @@ -731,7 +744,6 @@ int multifd_send_sync_main(void) trace_multifd_send_sync_main_signal(p->id); qemu_mutex_lock(&p->mutex); - p->packet_num = multifd_send_state->packet_num++; /* * We should be the only user so far, so not possible to be set by * others concurrently. diff --git a/migration/multifd.h b/migration/multifd.h index 9b40a53cb6..98876ff94a 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -97,8 +97,6 @@ typedef struct { bool running; /* multifd flags for each packet */ uint32_t flags; - /* global number of generated multifd packets */ - uint64_t packet_num; /* * The sender thread has work to do if either of below boolean is set. * -- Gitee From 2beae052ba502782de62ca4ccf7a1cdb6e830150 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:57 +0800 Subject: [PATCH 851/939] migration/multifd: Optimize sender side to be lockless commit 488c84acb465c21b716c3fd14de27ab5ce388c85 upstream. When reviewing my attempt to refactor send_prepare(), Fabiano suggested we try out with dropping the mutex in multifd code [1]. I thought about that before but I never tried to change the code. Now maybe it's time to give it a stab. This only optimizes the sender side. The trick here is multifd has a clear provider/consumer model, that the migration main thread publishes requests (either pending_job/pending_sync), while the multifd sender threads are consumers. Here we don't have a lot of complicated data sharing, and the jobs can logically be submitted lockless. 
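As a minimal standalone sketch of that publish/consume pairing, using plain C11 atomics and a pthread in place of QEMU's qatomic_* helpers and sender thread (all names below are illustrative):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static int job_data;                 /* stands in for p->pages */
    static atomic_bool pending_job;      /* stands in for p->pending_job */

    static void *consumer(void *arg)
    {
        for (;;) {
            /* load-acquire pairs with the producer's store-release below */
            if (atomic_load_explicit(&pending_job, memory_order_acquire)) {
                int job = job_data;
                printf("consumed job %d\n", job);
                /* store-release publishes "slot is free again" */
                atomic_store_explicit(&pending_job, false,
                                      memory_order_release);
                if (job == 3) {
                    return NULL;
                }
            }
        }
    }

    int main(void)
    {
        pthread_t th;

        pthread_create(&th, NULL, consumer, NULL);
        for (int i = 1; i <= 3; i++) {
            /* wait for a free slot; acquire pairs with the consumer's release
             * (the real code sleeps on a semaphore instead of spinning) */
            while (atomic_load_explicit(&pending_job, memory_order_acquire)) {
            }
            job_data = i;                /* set the job up first ... */
            /* ... then store-release the flag so the consumer sees job_data */
            atomic_store_explicit(&pending_job, true, memory_order_release);
        }
        pthread_join(th, NULL);
        return 0;
    }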
Arm the code with atomic weapons. Two things worth mentioning: - For multifd_send_pages(): we can use qatomic_load_acquire() when trying to find a free channel, but that's expensive if we attach one ACQUIRE per channel. Instead, keep the qatomic_read() on reading the pending_job flag as we do already, meanwhile use one smp_mb_acquire() after the loop to guarantee the memory ordering. - For pending_sync: it doesn't have any extra data required since now p->flags are never touched, it should be safe to not use memory barrier. That's different from pending_job. Provide rich comments for all the lockless operations to state how they are paired. With that, we can remove the mutex. [1] https://lore.kernel.org/r/87o7d1jlu5.fsf@suse.de Suggested-by: Fabiano Rosas Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-24-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 51 +++++++++++++++++++++++---------------------- migration/multifd.h | 2 -- 2 files changed, 26 insertions(+), 27 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index c52c18046a..c0d8f438bc 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -502,19 +502,19 @@ static bool multifd_send_pages(void) } } - qemu_mutex_lock(&p->mutex); - assert(!p->pages->num); - assert(!p->pages->block); /* - * Double check on pending_job==false with the lock. In the future if - * we can have >1 requester thread, we can replace this with a "goto - * retry", but that is for later. + * Make sure we read p->pending_job before all the rest. Pairs with + * qatomic_store_release() in multifd_send_thread(). */ - assert(qatomic_read(&p->pending_job) == false); - qatomic_set(&p->pending_job, true); + smp_mb_acquire(); + assert(!p->pages->num); multifd_send_state->pages = p->pages; p->pages = pages; - qemu_mutex_unlock(&p->mutex); + /* + * Making sure p->pages is setup before marking pending_job=true. Pairs + * with the qatomic_load_acquire() in multifd_send_thread(). + */ + qatomic_store_release(&p->pending_job, true); qemu_sem_post(&p->sem); return true; @@ -649,7 +649,6 @@ static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) } multifd_send_channel_destroy(p->c); p->c = NULL; - qemu_mutex_destroy(&p->mutex); qemu_sem_destroy(&p->sem); qemu_sem_destroy(&p->sem_sync); g_free(p->name); @@ -743,14 +742,12 @@ int multifd_send_sync_main(void) trace_multifd_send_sync_main_signal(p->id); - qemu_mutex_lock(&p->mutex); /* * We should be the only user so far, so not possible to be set by * others concurrently. */ assert(qatomic_read(&p->pending_sync) == false); qatomic_set(&p->pending_sync, true); - qemu_mutex_unlock(&p->mutex); qemu_sem_post(&p->sem); } for (i = 0; i < migrate_multifd_channels(); i++) { @@ -800,9 +797,12 @@ static void *multifd_send_thread(void *opaque) if (multifd_send_should_exit()) { break; } - qemu_mutex_lock(&p->mutex); - if (qatomic_read(&p->pending_job)) { + /* + * Read pending_job flag before p->pages. Pairs with the + * qatomic_store_release() in multifd_send_pages(). 
+ */ + if (qatomic_load_acquire(&p->pending_job)) { MultiFDPages_t *pages = p->pages; p->iovs_num = 0; @@ -810,14 +810,12 @@ static void *multifd_send_thread(void *opaque) ret = multifd_send_state->ops->send_prepare(p, &local_err); if (ret != 0) { - qemu_mutex_unlock(&p->mutex); break; } ret = qio_channel_writev_full_all(p->c, p->iov, p->iovs_num, NULL, 0, p->write_flags, &local_err); if (ret != 0) { - qemu_mutex_unlock(&p->mutex); break; } @@ -826,24 +824,31 @@ static void *multifd_send_thread(void *opaque) multifd_pages_reset(p->pages); p->next_packet_size = 0; - qatomic_set(&p->pending_job, false); - qemu_mutex_unlock(&p->mutex); + + /* + * Making sure p->pages is published before saying "we're + * free". Pairs with the smp_mb_acquire() in + * multifd_send_pages(). + */ + qatomic_store_release(&p->pending_job, false); } else { - /* If not a normal job, must be a sync request */ + /* + * If not a normal job, must be a sync request. Note that + * pending_sync is a standalone flag (unlike pending_job), so + * it doesn't require explicit memory barriers. + */ assert(qatomic_read(&p->pending_sync)); p->flags = MULTIFD_FLAG_SYNC; multifd_send_fill_packet(p); ret = qio_channel_write_all(p->c, (void *)p->packet, p->packet_len, &local_err); if (ret != 0) { - qemu_mutex_unlock(&p->mutex); break; } /* p->next_packet_size will always be zero for a SYNC packet */ stat64_add(&mig_stats.multifd_bytes, p->packet_len); p->flags = 0; qatomic_set(&p->pending_sync, false); - qemu_mutex_unlock(&p->mutex); qemu_sem_post(&p->sem_sync); } } @@ -857,10 +862,7 @@ out: error_free(local_err); } - qemu_mutex_lock(&p->mutex); p->running = false; - qemu_mutex_unlock(&p->mutex); - rcu_unregister_thread(); migration_threads_remove(thread); trace_multifd_send_thread_end(p->id, p->packets_sent, p->total_normal_pages); @@ -1002,7 +1004,6 @@ int multifd_send_setup(Error **errp) for (i = 0; i < thread_count; i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; - qemu_mutex_init(&p->mutex); qemu_sem_init(&p->sem, 0); qemu_sem_init(&p->sem_sync, 0); p->id = i; diff --git a/migration/multifd.h b/migration/multifd.h index 98876ff94a..78a2317263 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -91,8 +91,6 @@ typedef struct { /* syncs main thread and channels */ QemuSemaphore sem_sync; - /* this mutex protects the following parameters */ - QemuMutex mutex; /* is this channel thread running */ bool running; /* multifd flags for each packet */ -- Gitee From d5a21de3aa2a13ab8bfb4d9d815ae60e04e08f94 Mon Sep 17 00:00:00 2001 From: Avihai Horon Date: Thu, 25 Jan 2024 18:25:12 +0200 Subject: [PATCH 852/939] migration: Fix logic of channels and transport compatibility check commit 3205bebd4fc6dd501fb8b10c93ddce9da18e09db upstream. The commit in the fixes line mistakenly modified the channels and transport compatibility check logic so it now checks multi-channel support only for socket transport type. Thus, running multifd migration using a transport other than socket that is incompatible with multi-channels (such as "exec") would lead to a segmentation fault instead of an error message. For example: (qemu) migrate_set_capability multifd on (qemu) migrate -d "exec:cat > /tmp/vm_state" Segmentation fault (core dumped) Fix it by checking multi-channel compatibility for all transport types. 
Cc: qemu-stable Fixes: d95533e1cdcc ("migration: modify migration_channels_and_uri_compatible() for new QAPI syntax") Signed-off-by: Avihai Horon Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20240125162528.7552-2-avihaih@nvidia.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/migration.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index f428839dd6..0e8255180d 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -127,11 +127,17 @@ static bool migration_needs_multiple_sockets(void) return migrate_multifd() || migrate_postcopy_preempt(); } -static bool transport_supports_multi_channels(SocketAddress *saddr) +static bool transport_supports_multi_channels(MigrationAddress *addr) { - return saddr->type == SOCKET_ADDRESS_TYPE_INET || - saddr->type == SOCKET_ADDRESS_TYPE_UNIX || - saddr->type == SOCKET_ADDRESS_TYPE_VSOCK; + if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET) { + SocketAddress *saddr = &addr->u.socket; + + return saddr->type == SOCKET_ADDRESS_TYPE_INET || + saddr->type == SOCKET_ADDRESS_TYPE_UNIX || + saddr->type == SOCKET_ADDRESS_TYPE_VSOCK; + } + + return false; } static bool @@ -139,8 +145,7 @@ migration_channels_and_transport_compatible(MigrationAddress *addr, Error **errp) { if (migration_needs_multiple_sockets() && - (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET) && - !transport_supports_multi_channels(&addr->u.socket)) { + !transport_supports_multi_channels(addr)) { error_setg(errp, "Migration requires multi-channel URIs (e.g. tcp)"); return false; } -- Gitee From 234d32c5cef7114f2554f18c8ad73fb294fb4542 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Tue, 6 Feb 2024 18:51:13 -0300 Subject: [PATCH 853/939] migration/multifd: Join the TLS thread commit e1921f10d9afe651f4887284e85f6789b37e67d3 upstream. We're currently leaking the resources of the TLS thread by not joining it and also overwriting the p->thread pointer altogether. 
Fixes: a1af605bd5 ("migration/multifd: fix hangup with TLS-Multifd due to blocking handshake") Cc: qemu-stable Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240206215118.6171-2-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 8 +++++++- migration/multifd.h | 2 ++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/migration/multifd.c b/migration/multifd.c index c0d8f438bc..459e7889e8 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -631,6 +631,10 @@ static void multifd_send_terminate_threads(void) for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; + if (p->tls_thread_created) { + qemu_thread_join(&p->tls_thread); + } + if (p->running) { qemu_thread_join(&p->thread); } @@ -925,7 +929,9 @@ static bool multifd_tls_channel_connect(MultiFDSendParams *p, trace_multifd_tls_outgoing_handshake_start(ioc, tioc, hostname); qio_channel_set_name(QIO_CHANNEL(tioc), "multifd-tls-outgoing"); p->c = QIO_CHANNEL(tioc); - qemu_thread_create(&p->thread, "multifd-tls-handshake-worker", + + p->tls_thread_created = true; + qemu_thread_create(&p->tls_thread, "multifd-tls-handshake-worker", multifd_tls_handshake_thread, p, QEMU_THREAD_JOINABLE); return true; diff --git a/migration/multifd.h b/migration/multifd.h index 78a2317263..720c9d50db 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -73,6 +73,8 @@ typedef struct { char *name; /* channel thread id */ QemuThread thread; + QemuThread tls_thread; + bool tls_thread_created; /* communication channel */ QIOChannel *c; /* is the yank function registered */ -- Gitee From 9fb44da2534bcf1802c5f7ce36944b0940821728 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Tue, 6 Feb 2024 18:51:14 -0300 Subject: [PATCH 854/939] migration/multifd: Remove p->running commit a2a63c4abd52f4e3ff4046dcb67fe44ebf0bb8de upstream. We currently only need p->running to avoid calling qemu_thread_join() on a non existent thread if the thread has never been created. However, there are at least two bugs in this logic: 1) On the sending side, p->running is set too early and qemu_thread_create() can be skipped due to an error during TLS handshake, leaving the flag set and leading to a crash when multifd_send_cleanup() calls qemu_thread_join(). 2) During exit, the multifd thread clears the flag while holding the channel lock. The counterpart at multifd_send_cleanup() reads the flag outside of the lock and might free the mutex while the multifd thread still has it locked. Fix the first issue by setting the flag right before creating the thread. Rename it from p->running to p->thread_created to clarify its usage. Fix the second issue by not clearing the flag at the multifd thread exit. We don't have any use for that. Note that these bugs are straight-forward logic issues and not race conditions. There is still a gap for races to affect this code due to multifd_send_cleanup() being allowed to run concurrently with the thread creation loop. This issue is solved in the next patches. 
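A standalone sketch of the resulting pattern (illustrative names, not the QEMU structures): the flag only records that a thread really exists, and cleanup joins only when it is set, so an early error path can no longer leave a stale flag behind.

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct channel {
        pthread_t thread;
        bool thread_created;
    };

    static void *worker(void *arg)
    {
        puts("worker ran");
        return NULL;
    }

    static bool channel_connect(struct channel *c, bool fail_early)
    {
        if (fail_early) {
            /* e.g. a failed TLS handshake: bail out before thread creation */
            return false;
        }
        if (pthread_create(&c->thread, NULL, worker, c) != 0) {
            return false;
        }
        c->thread_created = true;        /* set only once the thread exists */
        return true;
    }

    static void channel_cleanup(struct channel *c)
    {
        if (c->thread_created) {         /* never join a non-existent thread */
            pthread_join(c->thread, NULL);
        }
    }

    int main(void)
    {
        struct channel ok = { 0 }, failed = { 0 };

        channel_connect(&ok, false);
        channel_connect(&failed, true);  /* creation skipped on purpose */
        channel_cleanup(&ok);
        channel_cleanup(&failed);        /* safe: flag was never set */
        return 0;
    }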
Cc: qemu-stable Fixes: 29647140157a ("migration/tls: add support for multifd tls-handshake") Reported-by: Avihai Horon Reported-by: chenyuhui5@huawei.com Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240206215118.6171-3-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 27 ++++++++++++--------------- migration/multifd.h | 7 ++----- 2 files changed, 14 insertions(+), 20 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 459e7889e8..59dcb6c9a2 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -635,7 +635,7 @@ static void multifd_send_terminate_threads(void) qemu_thread_join(&p->tls_thread); } - if (p->running) { + if (p->thread_created) { qemu_thread_join(&p->thread); } } @@ -866,7 +866,6 @@ out: error_free(local_err); } - p->running = false; rcu_unregister_thread(); migration_threads_remove(thread); trace_multifd_send_thread_end(p->id, p->packets_sent, p->total_normal_pages); @@ -957,6 +956,8 @@ static bool multifd_channel_connect(MultiFDSendParams *p, migration_ioc_register_yank(ioc); p->registered_yank = true; p->c = ioc; + + p->thread_created = true; qemu_thread_create(&p->thread, p->name, multifd_send_thread, p, QEMU_THREAD_JOINABLE); return true; @@ -971,7 +972,6 @@ static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque) trace_multifd_new_send_channel_async(p->id); if (!qio_task_propagate_error(task, &local_err)) { qio_channel_set_delay(ioc, false); - p->running = true; if (multifd_channel_connect(p, ioc, &local_err)) { return; } @@ -1132,15 +1132,15 @@ void multifd_recv_cleanup(void) for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDRecvParams *p = &multifd_recv_state->params[i]; - if (p->running) { - /* - * multifd_recv_thread may hung at MULTIFD_FLAG_SYNC handle code, - * however try to wakeup it without harm in cleanup phase. - */ - qemu_sem_post(&p->sem_sync); - } + /* + * multifd_recv_thread may hung at MULTIFD_FLAG_SYNC handle code, + * however try to wakeup it without harm in cleanup phase. 
+ */ + qemu_sem_post(&p->sem_sync); - qemu_thread_join(&p->thread); + if (p->thread_created) { + qemu_thread_join(&p->thread); + } } for (i = 0; i < migrate_multifd_channels(); i++) { multifd_recv_cleanup_channel(&multifd_recv_state->params[i]); @@ -1226,9 +1226,6 @@ static void *multifd_recv_thread(void *opaque) multifd_recv_terminate_threads(local_err); error_free(local_err); } - qemu_mutex_lock(&p->mutex); - p->running = false; - qemu_mutex_unlock(&p->mutex); rcu_unregister_thread(); trace_multifd_recv_thread_end(p->id, p->packets_recved, p->total_normal_pages); @@ -1334,7 +1331,7 @@ void multifd_recv_new_channel(QIOChannel *ioc, Error **errp) p->c = ioc; object_ref(OBJECT(ioc)); - p->running = true; + p->thread_created = true; qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p, QEMU_THREAD_JOINABLE); qatomic_inc(&multifd_recv_state->count); diff --git a/migration/multifd.h b/migration/multifd.h index 720c9d50db..7881980ee6 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -73,6 +73,7 @@ typedef struct { char *name; /* channel thread id */ QemuThread thread; + bool thread_created; QemuThread tls_thread; bool tls_thread_created; /* communication channel */ @@ -93,8 +94,6 @@ typedef struct { /* syncs main thread and channels */ QemuSemaphore sem_sync; - /* is this channel thread running */ - bool running; /* multifd flags for each packet */ uint32_t flags; /* @@ -143,6 +142,7 @@ typedef struct { char *name; /* channel thread id */ QemuThread thread; + bool thread_created; /* communication channel */ QIOChannel *c; /* packet allocated len */ @@ -157,8 +157,6 @@ typedef struct { /* this mutex protects the following parameters */ QemuMutex mutex; - /* is this channel thread running */ - bool running; /* should this thread finish */ bool quit; /* multifd flags for each packet */ @@ -217,4 +215,3 @@ static inline void multifd_send_prepare_header(MultiFDSendParams *p) #endif - -- Gitee From d9e7bf53856956e6417a2dd0b5636fb61fb1c365 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Tue, 6 Feb 2024 18:51:15 -0300 Subject: [PATCH 855/939] migration/multifd: Move multifd_send_setup error handling in to the function commit bd8b0a8f82d8fc17aa285ab963ba75675c2fbe7a upstream. Hide the error handling inside multifd_send_setup to make it cleaner for the next patch to move the function around. 
Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240206215118.6171-4-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/migration.c | 6 +----- migration/multifd.c | 24 +++++++++++++++++------- migration/multifd.h | 2 +- 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index 0e8255180d..66417b40a2 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -3643,11 +3643,7 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) return; } - if (multifd_send_setup(&local_err) != 0) { - migrate_set_error(s, local_err); - error_report_err(local_err); - migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, - MIGRATION_STATUS_FAILED); + if (!multifd_send_setup()) { migrate_fd_cleanup(s); return; } diff --git a/migration/multifd.c b/migration/multifd.c index 59dcb6c9a2..1299248fea 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -989,14 +989,16 @@ static void multifd_new_send_channel_create(gpointer opaque) socket_send_channel_create(multifd_new_send_channel_async, opaque); } -int multifd_send_setup(Error **errp) +bool multifd_send_setup(void) { - int thread_count; + MigrationState *s = migrate_get_current(); + Error *local_err = NULL; + int thread_count, ret = 0; uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size(); uint8_t i; if (!migrate_multifd()) { - return 0; + return true; } thread_count = migrate_multifd_channels(); @@ -1030,14 +1032,22 @@ int multifd_send_setup(Error **errp) for (i = 0; i < thread_count; i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; - int ret; - ret = multifd_send_state->ops->send_setup(p, errp); + ret = multifd_send_state->ops->send_setup(p, &local_err); if (ret) { - return ret; + break; } } - return 0; + + if (ret) { + migrate_set_error(s, local_err); + error_report_err(local_err); + migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, + MIGRATION_STATUS_FAILED); + return false; + } + + return true; } struct { diff --git a/migration/multifd.h b/migration/multifd.h index 7881980ee6..8a1cad0996 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -13,7 +13,7 @@ #ifndef QEMU_MIGRATION_MULTIFD_H #define QEMU_MIGRATION_MULTIFD_H -int multifd_send_setup(Error **errp); +bool multifd_send_setup(void); void multifd_send_shutdown(void); int multifd_recv_setup(Error **errp); void multifd_recv_cleanup(void); -- Gitee From 4ab5ed68480ec55bff220496342000187b76c451 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Tue, 6 Feb 2024 18:51:16 -0300 Subject: [PATCH 856/939] migration/multifd: Move multifd_send_setup into migration thread commit dd904bc13f2af0c605c3fe72f118ea4e27a6610c upstream. We currently have an unfavorable situation around multifd channels creation and the migration thread execution. We create the multifd channels with qio_channel_socket_connect_async -> qio_task_run_in_thread, but only connect them at the multifd_new_send_channel_async callback, called from qio_task_complete, which is registered as a glib event. So at multifd_send_setup() we create the channels, but they will only be actually usable after the whole multifd_send_setup() calling stack returns back to the main loop. Which means that the migration thread is already up and running without any possibility for the multifd channels to be ready on time. We currently rely on the channels-ready semaphore blocking multifd_send_sync_main() until channels start to come up and release it. 
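A loose standalone model of that gating, with a POSIX semaphore and plain threads standing in for QEMU's QemuSemaphore and channel threads (the real semaphore is posted per ready channel and per job, so this is only the general shape):

    #include <pthread.h>
    #include <semaphore.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    #define NCHANNELS 2

    static sem_t channels_ready;         /* models the channels_ready semaphore */

    static void *channel_setup(void *arg)
    {
        long id = (long)(intptr_t)arg;

        usleep(1000 * (id + 1));         /* the async connect finishes later */
        printf("channel %ld is up\n", id);
        sem_post(&channels_ready);       /* one post per channel coming up */
        return NULL;
    }

    int main(void)
    {
        pthread_t th[NCHANNELS];

        sem_init(&channels_ready, 0, 0);
        for (long i = 0; i < NCHANNELS; i++) {
            pthread_create(&th[i], NULL, channel_setup, (void *)(intptr_t)i);
        }

        /* the "migration thread" side: block until channels come up */
        for (int i = 0; i < NCHANNELS; i++) {
            sem_wait(&channels_ready);
        }
        puts("channels ready, safe to start sending");

        for (int i = 0; i < NCHANNELS; i++) {
            pthread_join(th[i], NULL);
        }
        sem_destroy(&channels_ready);
        return 0;
    }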
However there have been bugs recently found when a channel's creation fails and multifd_send_cleanup() is allowed to run while other channels are still being created. Let's start to organize this situation by moving the multifd_send_setup() call into the migration thread. That way we unblock the main-loop to dispatch the completion callbacks and actually have a chance of getting the multifd channels ready for when the migration thread needs them. The next patches will deal with the synchronization aspects. Note that this takes multifd_send_setup() out of the BQL. Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240206215118.6171-5-farosas@suse.de Signed-off-by: Peter Xu Conflicts: migration/migration.c [jz: upstream renamed qemu_mutex_lock_iothread() to bql_lock(), while openEuler not yet. Resolve context conflict due to this] Signed-off-by: Jason Zeng --- migration/migration.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index 66417b40a2..59c0bbee67 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -3319,6 +3319,10 @@ static void *migration_thread(void *opaque) object_ref(OBJECT(s)); update_iteration_initial_status(s); + if (!multifd_send_setup()) { + goto out; + } + qemu_mutex_lock_iothread(); qemu_savevm_state_header(s->to_dst_file); qemu_mutex_unlock_iothread(); @@ -3390,6 +3394,7 @@ static void *migration_thread(void *opaque) urgent = migration_rate_limit(); } +out: trace_migration_thread_after_loop(); migration_iteration_finish(s); object_unref(OBJECT(s)); @@ -3643,11 +3648,6 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) return; } - if (!multifd_send_setup()) { - migrate_fd_cleanup(s); - return; - } - if (migrate_background_snapshot()) { qemu_thread_create(&s->thread, "bg_snapshot", bg_migration_thread, s, QEMU_THREAD_JOINABLE); -- Gitee From 7b385b0d528dfe3490bb3c8f58937bde1685f0f1 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Tue, 6 Feb 2024 18:51:17 -0300 Subject: [PATCH 857/939] migration/multifd: Unify multifd and TLS connection paths commit 2576ae488ef9aa692486157df7d8b410919cd219 upstream. During multifd channel creation (multifd_send_new_channel_async) when TLS is enabled, the multifd_channel_connect function is called twice, once to create the TLS handshake thread and another time after the asynchrounous TLS handshake has finished. This creates a slightly confusing call stack where multifd_channel_connect() is called more times than the number of channels. It also splits error handling between the two callers of multifd_channel_connect() causing some code duplication. Lastly, it gets in the way of having a single point to determine whether all channel creation tasks have been initiated. Refactor the code to move the reentrancy one level up at the multifd_new_send_channel_async() level, de-duplicating the error handling and allowing for the next patch to introduce a synchronization point common to all the multifd channel creation, regardless of TLS. Note that the previous code would never fail once p->c had been set. This patch changes this assumption, which affects refcounting, so add comments around object_unref to explain the situation. 
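A rough sketch of the unified entry point after this change (condensed from the diff
below, with error handling trimmed):

    /* multifd_new_send_channel_async(), simplified: */
    if (migrate_channel_requires_tls_upgrade(ioc)) {
        /* spawns the handshake thread; this same callback runs a second
         * time once the TLS handshake has completed */
        ret = multifd_tls_channel_connect(p, ioc, &local_err);
    } else {
        /* plain socket: start the multifd send thread directly */
        ret = multifd_channel_connect(p, ioc, &local_err);
    }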
Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240206215118.6171-6-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 83 ++++++++++++++++++++++----------------------- 1 file changed, 40 insertions(+), 43 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 1299248fea..85d1e7c347 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -873,30 +873,7 @@ out: return NULL; } -static bool multifd_channel_connect(MultiFDSendParams *p, - QIOChannel *ioc, - Error **errp); - -static void multifd_tls_outgoing_handshake(QIOTask *task, - gpointer opaque) -{ - MultiFDSendParams *p = opaque; - QIOChannel *ioc = QIO_CHANNEL(qio_task_get_source(task)); - Error *err = NULL; - - if (!qio_task_propagate_error(task, &err)) { - trace_multifd_tls_outgoing_handshake_complete(ioc); - if (multifd_channel_connect(p, ioc, &err)) { - return; - } - } - - trace_multifd_tls_outgoing_handshake_error(ioc, error_get_pretty(err)); - - multifd_send_set_error(err); - multifd_send_kick_main(p); - error_free(err); -} +static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque); static void *multifd_tls_handshake_thread(void *opaque) { @@ -904,7 +881,7 @@ static void *multifd_tls_handshake_thread(void *opaque) QIOChannelTLS *tioc = QIO_CHANNEL_TLS(p->c); qio_channel_tls_handshake(tioc, - multifd_tls_outgoing_handshake, + multifd_new_send_channel_async, p, NULL, NULL); @@ -924,6 +901,10 @@ static bool multifd_tls_channel_connect(MultiFDSendParams *p, return false; } + /* + * Ownership of the socket channel now transfers to the newly + * created TLS channel, which has already taken a reference. + */ object_unref(OBJECT(ioc)); trace_multifd_tls_outgoing_handshake_start(ioc, tioc, hostname); qio_channel_set_name(QIO_CHANNEL(tioc), "multifd-tls-outgoing"); @@ -940,18 +921,7 @@ static bool multifd_channel_connect(MultiFDSendParams *p, QIOChannel *ioc, Error **errp) { - trace_multifd_set_outgoing_channel( - ioc, object_get_typename(OBJECT(ioc)), - migrate_get_current()->hostname); - - if (migrate_channel_requires_tls_upgrade(ioc)) { - /* - * tls_channel_connect will call back to this - * function after the TLS handshake, - * so we mustn't call multifd_send_thread until then - */ - return multifd_tls_channel_connect(p, ioc, errp); - } + qio_channel_set_delay(ioc, false); migration_ioc_register_yank(ioc); p->registered_yank = true; @@ -963,24 +933,51 @@ static bool multifd_channel_connect(MultiFDSendParams *p, return true; } +/* + * When TLS is enabled this function is called once to establish the + * TLS connection and a second time after the TLS handshake to create + * the multifd channel. Without TLS it goes straight into the channel + * creation. 
+ */ static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque) { MultiFDSendParams *p = opaque; QIOChannel *ioc = QIO_CHANNEL(qio_task_get_source(task)); Error *local_err = NULL; + bool ret; trace_multifd_new_send_channel_async(p->id); - if (!qio_task_propagate_error(task, &local_err)) { - qio_channel_set_delay(ioc, false); - if (multifd_channel_connect(p, ioc, &local_err)) { - return; - } + + if (qio_task_propagate_error(task, &local_err)) { + ret = false; + goto out; + } + + trace_multifd_set_outgoing_channel(ioc, object_get_typename(OBJECT(ioc)), + migrate_get_current()->hostname); + + if (migrate_channel_requires_tls_upgrade(ioc)) { + ret = multifd_tls_channel_connect(p, ioc, &local_err); + } else { + ret = multifd_channel_connect(p, ioc, &local_err); } + if (ret) { + return; + } + +out: trace_multifd_new_send_channel_async_error(p->id, local_err); multifd_send_set_error(local_err); multifd_send_kick_main(p); - object_unref(OBJECT(ioc)); + if (!p->c) { + /* + * If no channel has been created, drop the initial + * reference. Otherwise cleanup happens at + * multifd_send_channel_destroy() + */ + object_unref(OBJECT(ioc)); + } error_free(local_err); } -- Gitee From 5236178dc96f2e9b24aa95bc01d700428a95d023 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Tue, 6 Feb 2024 18:51:18 -0300 Subject: [PATCH 858/939] migration/multifd: Add a synchronization point for channel creation commit 93fa9dc2e0522c54b813dee0898a5feb98b624c9 upstream. It is possible that one of the multifd channels fails to be created at multifd_new_send_channel_async() while the rest of the channel creation tasks are still in flight. This could lead to multifd_save_cleanup() executing the qemu_thread_join() loop too early and not waiting for the threads which haven't been created yet, leading to the freeing of resources that the newly created threads will try to access and crash. Add a synchronization point after which there will be no attempts at thread creation and therefore calling multifd_save_cleanup() past that point will ensure it properly waits for the threads. A note about performance: Prior to this patch, if a channel took too long to be established, other channels could finish connecting first and already start taking load. Now we're bounded by the slowest-connecting channel. Reported-by: Avihai Horon Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240206215118.6171-7-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 85d1e7c347..bd240649f7 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -63,6 +63,11 @@ struct { * Make it easy for now. */ uintptr_t packet_num; + /* + * Synchronization point past which no more channels will be + * created. + */ + QemuSemaphore channels_created; /* send channels ready */ QemuSemaphore channels_ready; /* @@ -623,10 +628,6 @@ static void multifd_send_terminate_threads(void) /* * Finally recycle all the threads. - * - * TODO: p->running is still buggy, e.g. we can reach here without the - * corresponding multifd_new_send_channel_async() get invoked yet, - * then a new thread can even be created after this function returns. 
*/ for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; @@ -671,6 +672,7 @@ static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) static void multifd_send_cleanup_state(void) { + qemu_sem_destroy(&multifd_send_state->channels_created); qemu_sem_destroy(&multifd_send_state->channels_ready); g_free(multifd_send_state->params); multifd_send_state->params = NULL; @@ -958,18 +960,26 @@ static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque) if (migrate_channel_requires_tls_upgrade(ioc)) { ret = multifd_tls_channel_connect(p, ioc, &local_err); + if (ret) { + return; + } } else { ret = multifd_channel_connect(p, ioc, &local_err); } +out: + /* + * Here we're not interested whether creation succeeded, only that + * it happened at all. + */ + qemu_sem_post(&multifd_send_state->channels_created); + if (ret) { return; } -out: trace_multifd_new_send_channel_async_error(p->id, local_err); multifd_send_set_error(local_err); - multifd_send_kick_main(p); if (!p->c) { /* * If no channel has been created, drop the initial @@ -1002,6 +1012,7 @@ bool multifd_send_setup(void) multifd_send_state = g_malloc0(sizeof(*multifd_send_state)); multifd_send_state->params = g_new0(MultiFDSendParams, thread_count); multifd_send_state->pages = multifd_pages_init(page_count); + qemu_sem_init(&multifd_send_state->channels_created, 0); qemu_sem_init(&multifd_send_state->channels_ready, 0); qatomic_set(&multifd_send_state->exiting, 0); multifd_send_state->ops = multifd_ops[migrate_multifd_compression()]; @@ -1027,6 +1038,15 @@ bool multifd_send_setup(void) multifd_new_send_channel_create(p); } + /* + * Wait until channel creation has started for all channels. The + * creation can still fail, but no more channels will be created + * past this point. + */ + for (i = 0; i < thread_count; i++) { + qemu_sem_wait(&multifd_send_state->channels_created); + } + for (i = 0; i < thread_count; i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; -- Gitee From eacc8d435828d31478498fe266487906941be6cb Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Tue, 20 Feb 2024 19:41:08 -0300 Subject: [PATCH 859/939] migration/multifd: Remove p->quit from recv side commit 11dd7be57524d400652cecf8740a016b3d66b53d upstream. Like we did on the sending side, replace the p->quit per-channel flag with a global atomic 'exiting' flag. 
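Sketch of the replacement (condensed from the diff below): termination flips one global
flag exactly once, and the recv threads poll that flag instead of a per-channel boolean:

    static bool multifd_recv_should_exit(void)
    {
        return qatomic_read(&multifd_recv_state->exiting);
    }

    /* in multifd_recv_terminate_threads(): only the first caller proceeds */
    if (qatomic_xchg(&multifd_recv_state->exiting, 1)) {
        return;
    }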
Signed-off-by: Fabiano Rosas Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20240220224138.24759-5-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 41 ++++++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index bd240649f7..126c18406f 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -80,6 +80,19 @@ struct { MultiFDMethods *ops; } *multifd_send_state; +struct { + MultiFDRecvParams *params; + /* number of created threads */ + int count; + /* syncs main thread and channels */ + QemuSemaphore sem_sync; + /* global number of generated multifd packets */ + uint64_t packet_num; + int exiting; + /* multifd ops */ + MultiFDMethods *ops; +} *multifd_recv_state; + /* Multifd without compression */ /** @@ -441,6 +454,11 @@ static bool multifd_send_should_exit(void) return qatomic_read(&multifd_send_state->exiting); } +static bool multifd_recv_should_exit(void) +{ + return qatomic_read(&multifd_recv_state->exiting); +} + /* * The migration thread can wait on either of the two semaphores. This * function can be used to kick the main thread out of waiting on either of @@ -1067,24 +1085,16 @@ bool multifd_send_setup(void) return true; } -struct { - MultiFDRecvParams *params; - /* number of created threads */ - int count; - /* syncs main thread and channels */ - QemuSemaphore sem_sync; - /* global number of generated multifd packets */ - uint64_t packet_num; - /* multifd ops */ - MultiFDMethods *ops; -} *multifd_recv_state; - static void multifd_recv_terminate_threads(Error *err) { int i; trace_multifd_recv_terminate_threads(err != NULL); + if (qatomic_xchg(&multifd_recv_state->exiting, 1)) { + return; + } + if (err) { MigrationState *s = migrate_get_current(); migrate_set_error(s, err); @@ -1098,8 +1108,6 @@ static void multifd_recv_terminate_threads(Error *err) for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDRecvParams *p = &multifd_recv_state->params[i]; - qemu_mutex_lock(&p->mutex); - p->quit = true; /* * We could arrive here for two reasons: * - normal quit, i.e. everything went fine, just finished @@ -1109,7 +1117,6 @@ static void multifd_recv_terminate_threads(Error *err) if (p->c) { qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL); } - qemu_mutex_unlock(&p->mutex); } } @@ -1214,7 +1221,7 @@ static void *multifd_recv_thread(void *opaque) while (true) { uint32_t flags; - if (p->quit) { + if (multifd_recv_should_exit()) { break; } @@ -1278,6 +1285,7 @@ int multifd_recv_setup(Error **errp) multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state)); multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count); qatomic_set(&multifd_recv_state->count, 0); + qatomic_set(&multifd_recv_state->exiting, 0); qemu_sem_init(&multifd_recv_state->sem_sync, 0); multifd_recv_state->ops = multifd_ops[migrate_multifd_compression()]; @@ -1286,7 +1294,6 @@ int multifd_recv_setup(Error **errp) qemu_mutex_init(&p->mutex); qemu_sem_init(&p->sem_sync, 0); - p->quit = false; p->id = i; p->packet_len = sizeof(MultiFDPacket_t) + sizeof(uint64_t) * page_count; -- Gitee From 7a9435d5db4a525b841078b125ba4843339c82fa Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Tue, 20 Feb 2024 19:41:09 -0300 Subject: [PATCH 860/939] migration/multifd: Release recv sem_sync earlier commit d13f0026c7a625a5a34a5dea4095a4d9cfa04652 upstream. 
Now that multifd_recv_terminate_threads() is called only once, release the recv side sem_sync earlier like we do for the send side. Signed-off-by: Fabiano Rosas Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20240220224138.24759-6-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 126c18406f..bbd421004f 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -1108,6 +1108,12 @@ static void multifd_recv_terminate_threads(Error *err) for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDRecvParams *p = &multifd_recv_state->params[i]; + /* + * multifd_recv_thread may hung at MULTIFD_FLAG_SYNC handle code, + * however try to wakeup it without harm in cleanup phase. + */ + qemu_sem_post(&p->sem_sync); + /* * We could arrive here for two reasons: * - normal quit, i.e. everything went fine, just finished @@ -1166,12 +1172,6 @@ void multifd_recv_cleanup(void) for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDRecvParams *p = &multifd_recv_state->params[i]; - /* - * multifd_recv_thread may hung at MULTIFD_FLAG_SYNC handle code, - * however try to wakeup it without harm in cleanup phase. - */ - qemu_sem_post(&p->sem_sync); - if (p->thread_created) { qemu_thread_join(&p->thread); } -- Gitee From f1ee974ab81330ae1048f0cf5ee2ccaeb16e26d1 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 22 Feb 2024 17:52:57 +0800 Subject: [PATCH 861/939] migration/multifd: Cleanup TLS iochannel referencing commit 9221e3c6a237da90ac296adfeb6e99ea9babfc20 upstream. Commit a1af605bd5 ("migration/multifd: fix hangup with TLS-Multifd due to blocking handshake") introduced a thread for TLS channels, which will resolve the issue on blocking the main thread. However in the same commit p->c is slightly abused just to be able to pass over the pointer "p" into the thread. That's the major reason we'll need to conditionally free the io channel in the fault paths. To clean it up, using a separate structure to pass over both "p" and "tioc" in the tls handshake thread. Then we can make it a rule that p->c will never be set until the channel is completely setup. With that, we can drop the tricky conditional unref of the io channel in the error path. 
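The helper structure introduced below carries both pointers into the handshake thread,
so p->c can keep the "only set once the channel is fully established" rule (condensed
from the diff):

    typedef struct {
        MultiFDSendParams *p;
        QIOChannelTLS *tioc;
    } MultiFDTLSThreadArgs;

    /* multifd_tls_handshake_thread() now takes the args struct, schedules
     * the handshake and frees it, instead of digging the TLS channel out
     * of p->c. */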
Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240222095301.171137-2-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index bbd421004f..ad8fa6a317 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -895,16 +895,22 @@ out: static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque); +typedef struct { + MultiFDSendParams *p; + QIOChannelTLS *tioc; +} MultiFDTLSThreadArgs; + static void *multifd_tls_handshake_thread(void *opaque) { - MultiFDSendParams *p = opaque; - QIOChannelTLS *tioc = QIO_CHANNEL_TLS(p->c); + MultiFDTLSThreadArgs *args = opaque; - qio_channel_tls_handshake(tioc, + qio_channel_tls_handshake(args->tioc, multifd_new_send_channel_async, - p, + args->p, NULL, NULL); + g_free(args); + return NULL; } @@ -914,6 +920,7 @@ static bool multifd_tls_channel_connect(MultiFDSendParams *p, { MigrationState *s = migrate_get_current(); const char *hostname = s->hostname; + MultiFDTLSThreadArgs *args; QIOChannelTLS *tioc; tioc = migration_tls_client_create(ioc, hostname, errp); @@ -928,11 +935,14 @@ static bool multifd_tls_channel_connect(MultiFDSendParams *p, object_unref(OBJECT(ioc)); trace_multifd_tls_outgoing_handshake_start(ioc, tioc, hostname); qio_channel_set_name(QIO_CHANNEL(tioc), "multifd-tls-outgoing"); - p->c = QIO_CHANNEL(tioc); + + args = g_new0(MultiFDTLSThreadArgs, 1); + args->tioc = tioc; + args->p = p; p->tls_thread_created = true; qemu_thread_create(&p->tls_thread, "multifd-tls-handshake-worker", - multifd_tls_handshake_thread, p, + multifd_tls_handshake_thread, args, QEMU_THREAD_JOINABLE); return true; } @@ -945,6 +955,7 @@ static bool multifd_channel_connect(MultiFDSendParams *p, migration_ioc_register_yank(ioc); p->registered_yank = true; + /* Setup p->c only if the channel is completely setup */ p->c = ioc; p->thread_created = true; @@ -998,14 +1009,12 @@ out: trace_multifd_new_send_channel_async_error(p->id, local_err); multifd_send_set_error(local_err); - if (!p->c) { - /* - * If no channel has been created, drop the initial - * reference. Otherwise cleanup happens at - * multifd_send_channel_destroy() - */ - object_unref(OBJECT(ioc)); - } + /* + * For error cases (TLS or non-TLS), IO channel is always freed here + * rather than when cleanup multifd: since p->c is not set, multifd + * cleanup code doesn't even know its existence. + */ + object_unref(OBJECT(ioc)); error_free(local_err); } -- Gitee From 103fe08122ba65282660932a5e342a282a4b3e1c Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 22 Feb 2024 17:52:58 +0800 Subject: [PATCH 862/939] migration/multifd: Drop registered_yank commit 0518b5d8d30d3a4d0ea4f45d61527bcdc43044d2 upstream. With a clear definition of p->c protocol, where we only set it up if the channel is fully established (TLS or non-TLS), registered_yank boolean will have equal meaning of "p->c != NULL". Drop registered_yank by checking p->c instead. 
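Cleanup then keys off the channel pointer itself (condensed from the diff below):

    /* multifd_send_cleanup_channel(), simplified: */
    if (p->c) {
        migration_ioc_unregister_yank(p->c);
        multifd_send_channel_destroy(p->c);
        p->c = NULL;
    }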
Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240222095301.171137-3-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 7 +++---- migration/multifd.h | 2 -- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index ad8fa6a317..3e85bc544a 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -667,11 +667,11 @@ static int multifd_send_channel_destroy(QIOChannel *send) static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) { - if (p->registered_yank) { + if (p->c) { migration_ioc_unregister_yank(p->c); + multifd_send_channel_destroy(p->c); + p->c = NULL; } - multifd_send_channel_destroy(p->c); - p->c = NULL; qemu_sem_destroy(&p->sem); qemu_sem_destroy(&p->sem_sync); g_free(p->name); @@ -954,7 +954,6 @@ static bool multifd_channel_connect(MultiFDSendParams *p, qio_channel_set_delay(ioc, false); migration_ioc_register_yank(ioc); - p->registered_yank = true; /* Setup p->c only if the channel is completely setup */ p->c = ioc; diff --git a/migration/multifd.h b/migration/multifd.h index 8a1cad0996..b3fe27ae93 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -78,8 +78,6 @@ typedef struct { bool tls_thread_created; /* communication channel */ QIOChannel *c; - /* is the yank function registered */ - bool registered_yank; /* packet allocated len */ uint32_t packet_len; /* guest page size */ -- Gitee From 797304d0151652a684f0df388036c2032dcc3979 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 22 Feb 2024 17:52:59 +0800 Subject: [PATCH 863/939] migration/multifd: Make multifd_channel_connect() return void commit 770de49c00fa9eb262473f282c92979b47b7fd22 upstream. It never fails, drop the retval and also the Error**. Suggested-by: Avihai Horon Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240222095301.171137-4-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 3e85bc544a..a7289289a4 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -947,9 +947,7 @@ static bool multifd_tls_channel_connect(MultiFDSendParams *p, return true; } -static bool multifd_channel_connect(MultiFDSendParams *p, - QIOChannel *ioc, - Error **errp) +static void multifd_channel_connect(MultiFDSendParams *p, QIOChannel *ioc) { qio_channel_set_delay(ioc, false); @@ -960,7 +958,6 @@ static bool multifd_channel_connect(MultiFDSendParams *p, p->thread_created = true; qemu_thread_create(&p->thread, p->name, multifd_send_thread, p, QEMU_THREAD_JOINABLE); - return true; } /* @@ -992,7 +989,8 @@ static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque) return; } } else { - ret = multifd_channel_connect(p, ioc, &local_err); + multifd_channel_connect(p, ioc); + ret = true; } out: -- Gitee From 28700ce624e7972fc971d7524c5aa8de868d253d Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 22 Feb 2024 17:53:00 +0800 Subject: [PATCH 864/939] migration/multifd: Cleanup outgoing_args in state destroy commit 72b90b96872acc5d00f9c16dfc196543349361da upstream. outgoing_args is a global cache of socket address to be reused in multifd. Freeing the cache in per-channel destructor is more or less a hack. Move it to multifd_send_cleanup_state() so it only get checked once. Use a small helper to do so because it's internal of socket.c. 
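Sketch of the new helper (condensed from the diff below), now called exactly once from
multifd_send_cleanup_state() instead of from every channel destructor:

    void socket_cleanup_outgoing_migration(void)
    {
        if (outgoing_args.saddr) {
            qapi_free_SocketAddress(outgoing_args.saddr);
            outgoing_args.saddr = NULL;
        }
    }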
Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240222095301.171137-5-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 1 + migration/socket.c | 12 ++++++++---- migration/socket.h | 2 ++ 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index a7289289a4..aa7b7e224e 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -690,6 +690,7 @@ static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) static void multifd_send_cleanup_state(void) { + socket_cleanup_outgoing_migration(); qemu_sem_destroy(&multifd_send_state->channels_created); qemu_sem_destroy(&multifd_send_state->channels_ready); g_free(multifd_send_state->params); diff --git a/migration/socket.c b/migration/socket.c index 98e3ea1514..3184c7c3c1 100644 --- a/migration/socket.c +++ b/migration/socket.c @@ -64,10 +64,6 @@ int socket_send_channel_destroy(QIOChannel *send) { /* Remove channel */ object_unref(OBJECT(send)); - if (outgoing_args.saddr) { - qapi_free_SocketAddress(outgoing_args.saddr); - outgoing_args.saddr = NULL; - } return 0; } @@ -137,6 +133,14 @@ void socket_start_outgoing_migration(MigrationState *s, NULL); } +void socket_cleanup_outgoing_migration(void) +{ + if (outgoing_args.saddr) { + qapi_free_SocketAddress(outgoing_args.saddr); + outgoing_args.saddr = NULL; + } +} + static void socket_accept_incoming_migration(QIONetListener *listener, QIOChannelSocket *cioc, gpointer opaque) diff --git a/migration/socket.h b/migration/socket.h index 5e4c33b8ea..5f52eddd4c 100644 --- a/migration/socket.h +++ b/migration/socket.h @@ -29,4 +29,6 @@ void socket_start_incoming_migration(SocketAddress *saddr, Error **errp); void socket_start_outgoing_migration(MigrationState *s, SocketAddress *saddr, Error **errp); +void socket_cleanup_outgoing_migration(void); + #endif -- Gitee From 0700d5acc4e51e949cc6d34a9bbb504a2803a127 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 22 Feb 2024 17:53:01 +0800 Subject: [PATCH 865/939] migration/multifd: Drop unnecessary helper to destroy IOC commit c9a7e83c9d64fd5ebc759186789e1b753c919d32 upstream. Both socket_send_channel_destroy() and multifd_send_channel_destroy() are unnecessary wrappers to destroy an IOC, as the only thing to do is to release the final IOC reference. We have plenty of code that destroys an IOC using direct unref() already; keep that style. 
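The effect, condensed from the diff below, is that the wrapper call collapses into a
direct unref at the cleanup site:

    /* before: wrapper that only hid an object_unref() */
    multifd_send_channel_destroy(p->c);
    /* after: drop the final IOC reference directly */
    object_unref(OBJECT(p->c));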
Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240222095301.171137-6-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 7 +------ migration/socket.c | 7 ------- migration/socket.h | 1 - 3 files changed, 1 insertion(+), 14 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index aa7b7e224e..9e3955cb8c 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -660,16 +660,11 @@ static void multifd_send_terminate_threads(void) } } -static int multifd_send_channel_destroy(QIOChannel *send) -{ - return socket_send_channel_destroy(send); -} - static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) { if (p->c) { migration_ioc_unregister_yank(p->c); - multifd_send_channel_destroy(p->c); + object_unref(OBJECT(p->c)); p->c = NULL; } qemu_sem_destroy(&p->sem); diff --git a/migration/socket.c b/migration/socket.c index 3184c7c3c1..9ab89b1e08 100644 --- a/migration/socket.c +++ b/migration/socket.c @@ -60,13 +60,6 @@ QIOChannel *socket_send_channel_create_sync(Error **errp) return QIO_CHANNEL(sioc); } -int socket_send_channel_destroy(QIOChannel *send) -{ - /* Remove channel */ - object_unref(OBJECT(send)); - return 0; -} - struct SocketConnectData { MigrationState *s; char *hostname; diff --git a/migration/socket.h b/migration/socket.h index 5f52eddd4c..46c233ecd2 100644 --- a/migration/socket.h +++ b/migration/socket.h @@ -23,7 +23,6 @@ void socket_send_channel_create(QIOTaskFunc f, void *data); QIOChannel *socket_send_channel_create_sync(Error **errp); -int socket_send_channel_destroy(QIOChannel *send); void socket_start_incoming_migration(SocketAddress *saddr, Error **errp); -- Gitee From c17b6d51225501c92cfe6b086ea9217659d67bd1 Mon Sep 17 00:00:00 2001 From: Bryan Zhang Date: Fri, 1 Mar 2024 03:59:00 +0000 Subject: [PATCH 866/939] migration: Properly apply migration compression level parameters commit b4014a2bf57ce08e2f6458cd82e9f968facf25c8 upstream. Some glue code was missing, so that using `qmp_migrate_set_parameters` to set `multifd-zstd-level` or `multifd-zlib-level` did not work. This commit adds the glue code to fix that. 
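The missing glue follows the pattern already used for the other parameters; a minimal
sketch (mirroring the hunks below) of the apply path, with migrate_params_test_apply()
getting the equivalent lines:

    if (params->has_multifd_zlib_level) {
        s->parameters.multifd_zlib_level = params->multifd_zlib_level;
    }
    if (params->has_multifd_zstd_level) {
        s->parameters.multifd_zstd_level = params->multifd_zstd_level;
    }

With this in place, setting either level through migrate-set-parameters takes effect
instead of being silently dropped.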
Signed-off-by: Bryan Zhang Link: https://lore.kernel.org/r/20240301035901.4006936-2-bryan.zhang@bytedance.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/options.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/migration/options.c b/migration/options.c index 71645c8721..52ddbac35f 100644 --- a/migration/options.c +++ b/migration/options.c @@ -1377,6 +1377,12 @@ static void migrate_params_test_apply(MigrateSetParameters *params, if (params->has_multifd_compression) { dest->multifd_compression = params->multifd_compression; } + if (params->has_multifd_zlib_level) { + dest->multifd_zlib_level = params->multifd_zlib_level; + } + if (params->has_multifd_zstd_level) { + dest->multifd_zstd_level = params->multifd_zstd_level; + } if (params->has_xbzrle_cache_size) { dest->xbzrle_cache_size = params->xbzrle_cache_size; } @@ -1533,6 +1539,12 @@ static void migrate_params_apply(MigrateSetParameters *params, Error **errp) if (params->has_multifd_compression) { s->parameters.multifd_compression = params->multifd_compression; } + if (params->has_multifd_zlib_level) { + s->parameters.multifd_zlib_level = params->multifd_zlib_level; + } + if (params->has_multifd_zstd_level) { + s->parameters.multifd_zstd_level = params->multifd_zstd_level; + } if (params->has_xbzrle_cache_size) { s->parameters.xbzrle_cache_size = params->xbzrle_cache_size; xbzrle_cache_resize(params->xbzrle_cache_size, errp); -- Gitee From 51191c9239aee8a25428fef53fe99589e1aca711 Mon Sep 17 00:00:00 2001 From: Bryan Zhang Date: Fri, 1 Mar 2024 03:59:01 +0000 Subject: [PATCH 867/939] tests/migration: Set compression level in migration tests commit 2b571432314ab42da742fbb578f4174166ecd7f5 upstream. Adds calls to set compression level for `zstd` and `zlib` migration tests, just to make sure that the calls work. Signed-off-by: Bryan Zhang Link: https://lore.kernel.org/r/20240301035901.4006936-3-bryan.zhang@bytedance.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- tests/qtest/migration-test.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c index 13888be898..0ac5e7ddc9 100644 --- a/tests/qtest/migration-test.c +++ b/tests/qtest/migration-test.c @@ -2560,6 +2560,13 @@ static void * test_migrate_precopy_tcp_multifd_zlib_start(QTestState *from, QTestState *to) { + /* + * Overloading this test to also check that set_parameter does not error. + * This is also done in the tests for the other compression methods. + */ + migrate_set_parameter_int(from, "multifd-zlib-level", 2); + migrate_set_parameter_int(to, "multifd-zlib-level", 2); + return test_migrate_precopy_tcp_multifd_start_common(from, to, "zlib"); } @@ -2568,6 +2575,9 @@ static void * test_migrate_precopy_tcp_multifd_zstd_start(QTestState *from, QTestState *to) { + migrate_set_parameter_int(from, "multifd-zstd-level", 2); + migrate_set_parameter_int(to, "multifd-zstd-level", 2); + return test_migrate_precopy_tcp_multifd_start_common(from, to, "zstd"); } #endif /* CONFIG_ZSTD */ -- Gitee From dc7717ee9311c374ad199c5baf4ecde8ac082248 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 29 Feb 2024 12:29:55 -0300 Subject: [PATCH 868/939] migration/multifd: Cleanup multifd_recv_sync_main commit 4aac6b1e9bd48677c4f24518fe86ffd34c677d5a upstream. Some minor cleanups and documentation for multifd_recv_sync_main. Use thread_count as done in other parts of the code. Remove p->id from the multifd_recv_state sync, since that is global and not tied to a channel. 
Add documentation for the sync steps. Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240229153017.2221-2-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 17 +++++++++++++---- migration/trace-events | 2 +- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 9e3955cb8c..429aad232b 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -1186,18 +1186,27 @@ void multifd_recv_cleanup(void) void multifd_recv_sync_main(void) { + int thread_count = migrate_multifd_channels(); int i; if (!migrate_multifd()) { return; } - for (i = 0; i < migrate_multifd_channels(); i++) { - MultiFDRecvParams *p = &multifd_recv_state->params[i]; - trace_multifd_recv_sync_main_wait(p->id); + /* + * Initiate the synchronization by waiting for all channels. + * For socket-based migration this means each channel has received + * the SYNC packet on the stream. + */ + for (i = 0; i < thread_count; i++) { + trace_multifd_recv_sync_main_wait(i); qemu_sem_wait(&multifd_recv_state->sem_sync); } - for (i = 0; i < migrate_multifd_channels(); i++) { + + /* + * Sync done. Release the channels for the next iteration. + */ + for (i = 0; i < thread_count; i++) { MultiFDRecvParams *p = &multifd_recv_state->params[i]; WITH_QEMU_LOCK_GUARD(&p->mutex) { diff --git a/migration/trace-events b/migration/trace-events index 298ad2b0dd..bf1a069632 100644 --- a/migration/trace-events +++ b/migration/trace-events @@ -132,7 +132,7 @@ multifd_recv(uint8_t id, uint64_t packet_num, uint32_t used, uint32_t flags, uin multifd_recv_new_channel(uint8_t id) "channel %u" multifd_recv_sync_main(long packet_num) "packet num %ld" multifd_recv_sync_main_signal(uint8_t id) "channel %u" -multifd_recv_sync_main_wait(uint8_t id) "channel %u" +multifd_recv_sync_main_wait(uint8_t id) "iter %u" multifd_recv_terminate_threads(bool error) "error %d" multifd_recv_thread_end(uint8_t id, uint64_t packets, uint64_t pages) "channel %u packets %" PRIu64 " pages %" PRIu64 multifd_recv_thread_start(uint8_t id) "%u" -- Gitee From 68a8a9da612d2d2dec5ad1b7b9ad5d7db603e05d Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 29 Feb 2024 12:30:06 -0300 Subject: [PATCH 869/939] migration/multifd: Rename MultiFDSend|RecvParams::data to compress_data commit 402dd7ac1c3be44f306c903cdfd2583ffec5e2fd upstream. Use a more specific name for the compression data so we can use the generic for the multifd core code. 
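A one-line illustration of how the compression backends read the renamed field after
this patch (taken from the hunks below):

    struct zlib_data *z = p->compress_data;   /* was: p->data */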
Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240229153017.2221-13-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd-zlib.c | 20 ++++++++++---------- migration/multifd-zstd.c | 20 ++++++++++---------- migration/multifd.h | 4 ++-- 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c index 012e3bdea1..2a8f5fc9a6 100644 --- a/migration/multifd-zlib.c +++ b/migration/multifd-zlib.c @@ -69,7 +69,7 @@ static int zlib_send_setup(MultiFDSendParams *p, Error **errp) err_msg = "out of memory for buf"; goto err_free_zbuff; } - p->data = z; + p->compress_data = z; return 0; err_free_zbuff: @@ -92,15 +92,15 @@ err_free_z: */ static void zlib_send_cleanup(MultiFDSendParams *p, Error **errp) { - struct zlib_data *z = p->data; + struct zlib_data *z = p->compress_data; deflateEnd(&z->zs); g_free(z->zbuff); z->zbuff = NULL; g_free(z->buf); z->buf = NULL; - g_free(p->data); - p->data = NULL; + g_free(p->compress_data); + p->compress_data = NULL; } /** @@ -117,7 +117,7 @@ static void zlib_send_cleanup(MultiFDSendParams *p, Error **errp) static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) { MultiFDPages_t *pages = p->pages; - struct zlib_data *z = p->data; + struct zlib_data *z = p->compress_data; z_stream *zs = &z->zs; uint32_t out_size = 0; int ret; @@ -194,7 +194,7 @@ static int zlib_recv_setup(MultiFDRecvParams *p, Error **errp) struct zlib_data *z = g_new0(struct zlib_data, 1); z_stream *zs = &z->zs; - p->data = z; + p->compress_data = z; zs->zalloc = Z_NULL; zs->zfree = Z_NULL; zs->opaque = Z_NULL; @@ -224,13 +224,13 @@ static int zlib_recv_setup(MultiFDRecvParams *p, Error **errp) */ static void zlib_recv_cleanup(MultiFDRecvParams *p) { - struct zlib_data *z = p->data; + struct zlib_data *z = p->compress_data; inflateEnd(&z->zs); g_free(z->zbuff); z->zbuff = NULL; - g_free(p->data); - p->data = NULL; + g_free(p->compress_data); + p->compress_data = NULL; } /** @@ -246,7 +246,7 @@ static void zlib_recv_cleanup(MultiFDRecvParams *p) */ static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp) { - struct zlib_data *z = p->data; + struct zlib_data *z = p->compress_data; z_stream *zs = &z->zs; uint32_t in_size = p->next_packet_size; /* we measure the change of total_out */ diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c index dc8fe43e94..593cf290ad 100644 --- a/migration/multifd-zstd.c +++ b/migration/multifd-zstd.c @@ -52,7 +52,7 @@ static int zstd_send_setup(MultiFDSendParams *p, Error **errp) struct zstd_data *z = g_new0(struct zstd_data, 1); int res; - p->data = z; + p->compress_data = z; z->zcs = ZSTD_createCStream(); if (!z->zcs) { g_free(z); @@ -90,14 +90,14 @@ static int zstd_send_setup(MultiFDSendParams *p, Error **errp) */ static void zstd_send_cleanup(MultiFDSendParams *p, Error **errp) { - struct zstd_data *z = p->data; + struct zstd_data *z = p->compress_data; ZSTD_freeCStream(z->zcs); z->zcs = NULL; g_free(z->zbuff); z->zbuff = NULL; - g_free(p->data); - p->data = NULL; + g_free(p->compress_data); + p->compress_data = NULL; } /** @@ -114,7 +114,7 @@ static void zstd_send_cleanup(MultiFDSendParams *p, Error **errp) static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) { MultiFDPages_t *pages = p->pages; - struct zstd_data *z = p->data; + struct zstd_data *z = p->compress_data; int ret; uint32_t i; @@ -183,7 +183,7 @@ static int zstd_recv_setup(MultiFDRecvParams *p, Error **errp) struct zstd_data *z = 
g_new0(struct zstd_data, 1); int ret; - p->data = z; + p->compress_data = z; z->zds = ZSTD_createDStream(); if (!z->zds) { g_free(z); @@ -221,14 +221,14 @@ static int zstd_recv_setup(MultiFDRecvParams *p, Error **errp) */ static void zstd_recv_cleanup(MultiFDRecvParams *p) { - struct zstd_data *z = p->data; + struct zstd_data *z = p->compress_data; ZSTD_freeDStream(z->zds); z->zds = NULL; g_free(z->zbuff); z->zbuff = NULL; - g_free(p->data); - p->data = NULL; + g_free(p->compress_data); + p->compress_data = NULL; } /** @@ -248,7 +248,7 @@ static int zstd_recv_pages(MultiFDRecvParams *p, Error **errp) uint32_t out_size = 0; uint32_t expected_size = p->normal_num * p->page_size; uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; - struct zstd_data *z = p->data; + struct zstd_data *z = p->compress_data; int ret; int i; diff --git a/migration/multifd.h b/migration/multifd.h index b3fe27ae93..adccd3532f 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -127,7 +127,7 @@ typedef struct { /* number of iovs used */ uint32_t iovs_num; /* used for compression methods */ - void *data; + void *compress_data; } MultiFDSendParams; typedef struct { @@ -183,7 +183,7 @@ typedef struct { /* num of non zero pages */ uint32_t normal_num; /* used for de-compression methods */ - void *data; + void *compress_data; } MultiFDRecvParams; typedef struct { -- Gitee From deca5474782611e8bacf0c3110897ddd204084e9 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 29 Feb 2024 12:30:07 -0300 Subject: [PATCH 870/939] migration/multifd: Decouple recv method from pages commit 9db191251381c75e57201f7b07330ca982a55d1e upstream. Next patches will abstract the type of data being received by the channels, so do some cleanup now to remove references to pages and dependency on 'normal_num'. Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240229153017.2221-14-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd-zlib.c | 6 +++--- migration/multifd-zstd.c | 6 +++--- migration/multifd.c | 13 ++++++++----- migration/multifd.h | 4 ++-- 4 files changed, 16 insertions(+), 13 deletions(-) diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c index 2a8f5fc9a6..6120faad65 100644 --- a/migration/multifd-zlib.c +++ b/migration/multifd-zlib.c @@ -234,7 +234,7 @@ static void zlib_recv_cleanup(MultiFDRecvParams *p) } /** - * zlib_recv_pages: read the data from the channel into actual pages + * zlib_recv: read the data from the channel into actual pages * * Read the compressed buffer, and uncompress it into the actual * pages. 
@@ -244,7 +244,7 @@ static void zlib_recv_cleanup(MultiFDRecvParams *p) * @p: Params for the channel that we are using * @errp: pointer to an error */ -static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp) +static int zlib_recv(MultiFDRecvParams *p, Error **errp) { struct zlib_data *z = p->compress_data; z_stream *zs = &z->zs; @@ -319,7 +319,7 @@ static MultiFDMethods multifd_zlib_ops = { .send_prepare = zlib_send_prepare, .recv_setup = zlib_recv_setup, .recv_cleanup = zlib_recv_cleanup, - .recv_pages = zlib_recv_pages + .recv = zlib_recv }; static void multifd_zlib_register(void) diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c index 593cf290ad..cac236833d 100644 --- a/migration/multifd-zstd.c +++ b/migration/multifd-zstd.c @@ -232,7 +232,7 @@ static void zstd_recv_cleanup(MultiFDRecvParams *p) } /** - * zstd_recv_pages: read the data from the channel into actual pages + * zstd_recv: read the data from the channel into actual pages * * Read the compressed buffer, and uncompress it into the actual * pages. @@ -242,7 +242,7 @@ static void zstd_recv_cleanup(MultiFDRecvParams *p) * @p: Params for the channel that we are using * @errp: pointer to an error */ -static int zstd_recv_pages(MultiFDRecvParams *p, Error **errp) +static int zstd_recv(MultiFDRecvParams *p, Error **errp) { uint32_t in_size = p->next_packet_size; uint32_t out_size = 0; @@ -310,7 +310,7 @@ static MultiFDMethods multifd_zstd_ops = { .send_prepare = zstd_send_prepare, .recv_setup = zstd_recv_setup, .recv_cleanup = zstd_recv_cleanup, - .recv_pages = zstd_recv_pages + .recv = zstd_recv }; static void multifd_zstd_register(void) diff --git a/migration/multifd.c b/migration/multifd.c index 429aad232b..d5039af833 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -198,7 +198,7 @@ static void nocomp_recv_cleanup(MultiFDRecvParams *p) } /** - * nocomp_recv_pages: read the data from the channel into actual pages + * nocomp_recv: read the data from the channel * * For no compression we just need to read things into the correct place. 
* @@ -207,7 +207,7 @@ static void nocomp_recv_cleanup(MultiFDRecvParams *p) * @p: Params for the channel that we are using * @errp: pointer to an error */ -static int nocomp_recv_pages(MultiFDRecvParams *p, Error **errp) +static int nocomp_recv(MultiFDRecvParams *p, Error **errp) { uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; @@ -229,7 +229,7 @@ static MultiFDMethods multifd_nocomp_ops = { .send_prepare = nocomp_send_prepare, .recv_setup = nocomp_recv_setup, .recv_cleanup = nocomp_recv_cleanup, - .recv_pages = nocomp_recv_pages + .recv = nocomp_recv }; static MultiFDMethods *multifd_ops[MULTIFD_COMPRESSION__MAX] = { @@ -1231,6 +1231,8 @@ static void *multifd_recv_thread(void *opaque) while (true) { uint32_t flags; + bool has_data = false; + p->normal_num = 0; if (multifd_recv_should_exit()) { break; @@ -1252,10 +1254,11 @@ static void *multifd_recv_thread(void *opaque) flags = p->flags; /* recv methods don't know how to handle the SYNC flag */ p->flags &= ~MULTIFD_FLAG_SYNC; + has_data = !!p->normal_num; qemu_mutex_unlock(&p->mutex); - if (p->normal_num) { - ret = multifd_recv_state->ops->recv_pages(p, &local_err); + if (has_data) { + ret = multifd_recv_state->ops->recv(p, &local_err); if (ret != 0) { break; } diff --git a/migration/multifd.h b/migration/multifd.h index adccd3532f..6a54377cc1 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -197,8 +197,8 @@ typedef struct { int (*recv_setup)(MultiFDRecvParams *p, Error **errp); /* Cleanup for receiving side */ void (*recv_cleanup)(MultiFDRecvParams *p); - /* Read all pages */ - int (*recv_pages)(MultiFDRecvParams *p, Error **errp); + /* Read all data */ + int (*recv)(MultiFDRecvParams *p, Error **errp); } MultiFDMethods; void multifd_register_ops(int method, MultiFDMethods *ops); -- Gitee From 48942069691dced68ba3ad74014ce0fb8850df46 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 29 Feb 2024 12:30:08 -0300 Subject: [PATCH 871/939] migration/multifd: Allow multifd without packets commit 06833d83f8978139395da0f1d6a9fad81b9dd024 upstream. For the upcoming support to the new 'mapped-ram' migration stream format, we cannot use multifd packets because each write into the ramblock section in the migration file is expected to contain only the guest pages. They are written at their respective offsets relative to the ramblock section header. There is no space for the packet information and the expected gains from the new approach come partly from being able to write the pages sequentially without extraneous data in between. The new format also simply doesn't need the packets and all necessary information can be taken from the standard migration headers with some (future) changes to multifd code. Use the presence of the mapped-ram capability to decide whether to send packets. This only moves code under multifd_use_packets(), it has no effect for now as mapped-ram cannot yet be enabled with multifd. 
Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240229153017.2221-15-farosas@suse.de Signed-off-by: Peter Xu [jz: make multifd_use_packet to always return true, since mapped-ram is not backported] Signed-off-by: Jason Zeng --- migration/multifd.c | 175 +++++++++++++++++++++++++++++--------------- 1 file changed, 114 insertions(+), 61 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index d5039af833..cac5f2743c 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -93,6 +93,11 @@ struct { MultiFDMethods *ops; } *multifd_recv_state; +static bool multifd_use_packets(void) +{ + return true; +} + /* Multifd without compression */ /** @@ -123,6 +128,19 @@ static void nocomp_send_cleanup(MultiFDSendParams *p, Error **errp) return; } +static void multifd_send_prepare_iovs(MultiFDSendParams *p) +{ + MultiFDPages_t *pages = p->pages; + + for (int i = 0; i < pages->num; i++) { + p->iov[p->iovs_num].iov_base = pages->block->host + pages->offset[i]; + p->iov[p->iovs_num].iov_len = p->page_size; + p->iovs_num++; + } + + p->next_packet_size = pages->num * p->page_size; +} + /** * nocomp_send_prepare: prepare date to be able to send * @@ -137,9 +155,13 @@ static void nocomp_send_cleanup(MultiFDSendParams *p, Error **errp) static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp) { bool use_zero_copy_send = migrate_zero_copy_send(); - MultiFDPages_t *pages = p->pages; int ret; + if (!multifd_use_packets()) { + multifd_send_prepare_iovs(p); + return 0; + } + if (!use_zero_copy_send) { /* * Only !zerocopy needs the header in IOV; zerocopy will @@ -148,13 +170,7 @@ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp) multifd_send_prepare_header(p); } - for (int i = 0; i < pages->num; i++) { - p->iov[p->iovs_num].iov_base = pages->block->host + pages->offset[i]; - p->iov[p->iovs_num].iov_len = p->page_size; - p->iovs_num++; - } - - p->next_packet_size = pages->num * p->page_size; + multifd_send_prepare_iovs(p); p->flags |= MULTIFD_FLAG_NOCOMP; multifd_send_fill_packet(p); @@ -209,7 +225,13 @@ static void nocomp_recv_cleanup(MultiFDRecvParams *p) */ static int nocomp_recv(MultiFDRecvParams *p, Error **errp) { - uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; + uint32_t flags; + + if (!multifd_use_packets()) { + return 0; + } + + flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; if (flags != MULTIFD_FLAG_NOCOMP) { error_setg(errp, "multifd %u: flags received %x flags expected %x", @@ -796,6 +818,7 @@ static void *multifd_send_thread(void *opaque) MigrationThread *thread = NULL; Error *local_err = NULL; int ret = 0; + bool use_packets = multifd_use_packets(); thread = migration_threads_add(p->name, qemu_get_thread_id()); @@ -805,9 +828,11 @@ static void *multifd_send_thread(void *opaque) trace_multifd_send_thread_start(p->id); rcu_register_thread(); - if (multifd_send_initial_packet(p, &local_err) < 0) { - ret = -1; - goto out; + if (use_packets) { + if (multifd_send_initial_packet(p, &local_err) < 0) { + ret = -1; + goto out; + } } while (true) { @@ -858,16 +883,20 @@ static void *multifd_send_thread(void *opaque) * it doesn't require explicit memory barriers. 
*/ assert(qatomic_read(&p->pending_sync)); - p->flags = MULTIFD_FLAG_SYNC; - multifd_send_fill_packet(p); - ret = qio_channel_write_all(p->c, (void *)p->packet, - p->packet_len, &local_err); - if (ret != 0) { - break; + + if (use_packets) { + p->flags = MULTIFD_FLAG_SYNC; + multifd_send_fill_packet(p); + ret = qio_channel_write_all(p->c, (void *)p->packet, + p->packet_len, &local_err); + if (ret != 0) { + break; + } + /* p->next_packet_size will always be zero for a SYNC packet */ + stat64_add(&mig_stats.multifd_bytes, p->packet_len); + p->flags = 0; } - /* p->next_packet_size will always be zero for a SYNC packet */ - stat64_add(&mig_stats.multifd_bytes, p->packet_len); - p->flags = 0; + qatomic_set(&p->pending_sync, false); qemu_sem_post(&p->sem_sync); } @@ -1022,6 +1051,7 @@ bool multifd_send_setup(void) Error *local_err = NULL; int thread_count, ret = 0; uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size(); + bool use_packets = multifd_use_packets(); uint8_t i; if (!migrate_multifd()) { @@ -1044,14 +1074,20 @@ bool multifd_send_setup(void) qemu_sem_init(&p->sem_sync, 0); p->id = i; p->pages = multifd_pages_init(page_count); - p->packet_len = sizeof(MultiFDPacket_t) - + sizeof(uint64_t) * page_count; - p->packet = g_malloc0(p->packet_len); - p->packet->magic = cpu_to_be32(MULTIFD_MAGIC); - p->packet->version = cpu_to_be32(MULTIFD_VERSION); + + if (use_packets) { + p->packet_len = sizeof(MultiFDPacket_t) + + sizeof(uint64_t) * page_count; + p->packet = g_malloc0(p->packet_len); + p->packet->magic = cpu_to_be32(MULTIFD_MAGIC); + p->packet->version = cpu_to_be32(MULTIFD_VERSION); + + /* We need one extra place for the packet header */ + p->iov = g_new0(struct iovec, page_count + 1); + } else { + p->iov = g_new0(struct iovec, page_count); + } p->name = g_strdup_printf("multifdsend_%d", i); - /* We need one extra place for the packet header */ - p->iov = g_new0(struct iovec, page_count + 1); p->page_size = qemu_target_page_size(); p->page_count = page_count; p->write_flags = 0; @@ -1114,7 +1150,9 @@ static void multifd_recv_terminate_threads(Error *err) * multifd_recv_thread may hung at MULTIFD_FLAG_SYNC handle code, * however try to wakeup it without harm in cleanup phase. 
*/ - qemu_sem_post(&p->sem_sync); + if (multifd_use_packets()) { + qemu_sem_post(&p->sem_sync); + } /* * We could arrive here for two reasons: @@ -1189,7 +1227,7 @@ void multifd_recv_sync_main(void) int thread_count = migrate_multifd_channels(); int i; - if (!migrate_multifd()) { + if (!migrate_multifd() || !multifd_use_packets()) { return; } @@ -1224,13 +1262,14 @@ static void *multifd_recv_thread(void *opaque) { MultiFDRecvParams *p = opaque; Error *local_err = NULL; + bool use_packets = multifd_use_packets(); int ret; trace_multifd_recv_thread_start(p->id); rcu_register_thread(); while (true) { - uint32_t flags; + uint32_t flags = 0; bool has_data = false; p->normal_num = 0; @@ -1238,25 +1277,27 @@ static void *multifd_recv_thread(void *opaque) break; } - ret = qio_channel_read_all_eof(p->c, (void *)p->packet, - p->packet_len, &local_err); - if (ret == 0 || ret == -1) { /* 0: EOF -1: Error */ - break; - } + if (use_packets) { + ret = qio_channel_read_all_eof(p->c, (void *)p->packet, + p->packet_len, &local_err); + if (ret == 0 || ret == -1) { /* 0: EOF -1: Error */ + break; + } - qemu_mutex_lock(&p->mutex); - ret = multifd_recv_unfill_packet(p, &local_err); - if (ret) { + qemu_mutex_lock(&p->mutex); + ret = multifd_recv_unfill_packet(p, &local_err); + if (ret) { + qemu_mutex_unlock(&p->mutex); + break; + } + + flags = p->flags; + /* recv methods don't know how to handle the SYNC flag */ + p->flags &= ~MULTIFD_FLAG_SYNC; + has_data = !!p->normal_num; qemu_mutex_unlock(&p->mutex); - break; } - flags = p->flags; - /* recv methods don't know how to handle the SYNC flag */ - p->flags &= ~MULTIFD_FLAG_SYNC; - has_data = !!p->normal_num; - qemu_mutex_unlock(&p->mutex); - if (has_data) { ret = multifd_recv_state->ops->recv(p, &local_err); if (ret != 0) { @@ -1264,9 +1305,11 @@ static void *multifd_recv_thread(void *opaque) } } - if (flags & MULTIFD_FLAG_SYNC) { - qemu_sem_post(&multifd_recv_state->sem_sync); - qemu_sem_wait(&p->sem_sync); + if (use_packets) { + if (flags & MULTIFD_FLAG_SYNC) { + qemu_sem_post(&multifd_recv_state->sem_sync); + qemu_sem_wait(&p->sem_sync); + } } } @@ -1285,6 +1328,7 @@ int multifd_recv_setup(Error **errp) { int thread_count; uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size(); + bool use_packets = multifd_use_packets(); uint8_t i; /* @@ -1309,9 +1353,12 @@ int multifd_recv_setup(Error **errp) qemu_mutex_init(&p->mutex); qemu_sem_init(&p->sem_sync, 0); p->id = i; - p->packet_len = sizeof(MultiFDPacket_t) - + sizeof(uint64_t) * page_count; - p->packet = g_malloc0(p->packet_len); + + if (use_packets) { + p->packet_len = sizeof(MultiFDPacket_t) + + sizeof(uint64_t) * page_count; + p->packet = g_malloc0(p->packet_len); + } p->name = g_strdup_printf("multifdrecv_%d", i); p->iov = g_new0(struct iovec, page_count); p->normal = g_new0(ram_addr_t, page_count); @@ -1355,18 +1402,24 @@ void multifd_recv_new_channel(QIOChannel *ioc, Error **errp) { MultiFDRecvParams *p; Error *local_err = NULL; + bool use_packets = multifd_use_packets(); int id; - id = multifd_recv_initial_packet(ioc, &local_err); - if (id < 0) { - multifd_recv_terminate_threads(local_err); - error_propagate_prepend(errp, local_err, - "failed to receive packet" - " via multifd channel %d: ", - qatomic_read(&multifd_recv_state->count)); - return; + if (use_packets) { + id = multifd_recv_initial_packet(ioc, &local_err); + if (id < 0) { + multifd_recv_terminate_threads(local_err); + error_propagate_prepend(errp, local_err, + "failed to receive packet" + " via multifd channel %d: ", + 
qatomic_read(&multifd_recv_state->count)); + return; + } + trace_multifd_recv_new_channel(id); + } else { + /* next patch gives this a meaningful value */ + id = 0; } - trace_multifd_recv_new_channel(id); p = &multifd_recv_state->params[id]; if (p->c != NULL) { -- Gitee From 6bb380a1f7c37b5dda17f95519ec118990f332a8 Mon Sep 17 00:00:00 2001 From: Hao Xiang Date: Mon, 11 Mar 2024 18:00:11 +0000 Subject: [PATCH 872/939] migration/multifd: Add new migration option zero-page-detection. commit 5fdbb1dfccfd59661c95cae760b8e276c5b8e65c upstream. This new parameter controls where the zero page checking is running. 1. If this parameter is set to 'legacy', zero page checking is done in the migration main thread. 2. If this parameter is set to 'none', zero page checking is disabled. Signed-off-by: Hao Xiang Reviewed-by: Peter Xu Acked-by: Markus Armbruster Link: https://lore.kernel.org/r/20240311180015.3359271-4-hao.xiang@linux.dev Signed-off-by: Peter Xu Conflicts: hw/core/qdev-properties-system.c include/hw/qdev-properties-system.h migration/options.c qapi/migration.json [jz: resolve simple context conflicts] Signed-off-by: Jason Zeng --- hw/core/qdev-properties-system.c | 10 ++++++++++ include/hw/qdev-properties-system.h | 4 ++++ migration/migration-hmp-cmds.c | 9 +++++++++ migration/options.c | 21 +++++++++++++++++++++ migration/options.h | 1 + migration/ram.c | 4 ++++ qapi/migration.json | 28 +++++++++++++++++++++++++++- 7 files changed, 76 insertions(+), 1 deletion(-) diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-system.c index c581d46f2e..cad1e04150 100644 --- a/hw/core/qdev-properties-system.c +++ b/hw/core/qdev-properties-system.c @@ -732,6 +732,16 @@ const PropertyInfo qdev_prop_mig_mode = { .set_default_value = qdev_propinfo_set_default_value_enum, }; +const PropertyInfo qdev_prop_zero_page_detection = { + .name = "ZeroPageDetection", + .description = "zero_page_detection values, " + "none,legacy", + .enum_table = &ZeroPageDetection_lookup, + .get = qdev_propinfo_get_enum, + .set = qdev_propinfo_set_enum, + .set_default_value = qdev_propinfo_set_default_value_enum, +}; + /* --- Reserved Region --- */ /* diff --git a/include/hw/qdev-properties-system.h b/include/hw/qdev-properties-system.h index 7cf27e51b9..63dcf69978 100644 --- a/include/hw/qdev-properties-system.h +++ b/include/hw/qdev-properties-system.h @@ -8,6 +8,7 @@ extern const PropertyInfo qdev_prop_macaddr; extern const PropertyInfo qdev_prop_reserved_region; extern const PropertyInfo qdev_prop_multifd_compression; extern const PropertyInfo qdev_prop_mig_mode; +extern const PropertyInfo qdev_prop_zero_page_detection; extern const PropertyInfo qdev_prop_losttickpolicy; extern const PropertyInfo qdev_prop_blockdev_on_error; extern const PropertyInfo qdev_prop_blockdev_retry_interval; @@ -48,6 +49,9 @@ extern const PropertyInfo qdev_prop_cpus390entitlement; #define DEFINE_PROP_MIG_MODE(_n, _s, _f, _d) \ DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_mig_mode, \ MigMode) +#define DEFINE_PROP_ZERO_PAGE_DETECTION(_n, _s, _f, _d) \ + DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_zero_page_detection, \ + ZeroPageDetection) #define DEFINE_PROP_LOSTTICKPOLICY(_n, _s, _f, _d) \ DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_losttickpolicy, \ LostTickPolicy) diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c index 9857e2c97f..91e51eb7af 100644 --- a/migration/migration-hmp-cmds.c +++ b/migration/migration-hmp-cmds.c @@ -348,6 +348,11 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict 
*qdict) monitor_printf(mon, "%s: %s\n", MigrationParameter_str(MIGRATION_PARAMETER_MULTIFD_COMPRESSION), MultiFDCompression_str(params->multifd_compression)); + assert(params->has_zero_page_detection); + monitor_printf(mon, "%s: %s\n", + MigrationParameter_str(MIGRATION_PARAMETER_ZERO_PAGE_DETECTION), + qapi_enum_lookup(&ZeroPageDetection_lookup, + params->zero_page_detection)); monitor_printf(mon, "%s: %" PRIu64 " bytes\n", MigrationParameter_str(MIGRATION_PARAMETER_XBZRLE_CACHE_SIZE), params->xbzrle_cache_size); @@ -668,6 +673,10 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) p->has_multifd_zstd_level = true; visit_type_uint8(v, param, &p->multifd_zstd_level, &err); break; + case MIGRATION_PARAMETER_ZERO_PAGE_DETECTION: + p->has_zero_page_detection = true; + visit_type_ZeroPageDetection(v, param, &p->zero_page_detection, &err); + break; case MIGRATION_PARAMETER_XBZRLE_CACHE_SIZE: p->has_xbzrle_cache_size = true; if (!visit_type_size(v, param, &cache_size, &err)) { diff --git a/migration/options.c b/migration/options.c index 52ddbac35f..e752163114 100644 --- a/migration/options.c +++ b/migration/options.c @@ -183,6 +183,9 @@ Property migration_properties[] = { DEFINE_PROP_MIG_MODE("mode", MigrationState, parameters.mode, MIG_MODE_NORMAL), + DEFINE_PROP_ZERO_PAGE_DETECTION("zero-page-detection", MigrationState, + parameters.zero_page_detection, + ZERO_PAGE_DETECTION_LEGACY), DEFINE_PROP_STRING("sev-pdh", MigrationState, parameters.sev_pdh), DEFINE_PROP_STRING("sev-plat-cert", MigrationState, parameters.sev_plat_cert), DEFINE_PROP_STRING("sev-amd-cert", MigrationState, parameters.sev_amd_cert), @@ -927,6 +930,13 @@ uint64_t migrate_xbzrle_cache_size(void) return s->parameters.xbzrle_cache_size; } +ZeroPageDetection migrate_zero_page_detection(void) +{ + MigrationState *s = migrate_get_current(); + + return s->parameters.zero_page_detection; +} + /* parameter setters */ void migrate_set_block_incremental(bool value) @@ -1042,6 +1052,8 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp) params->vcpu_dirty_limit = s->parameters.vcpu_dirty_limit; params->has_mode = true; params->mode = s->parameters.mode; + params->has_zero_page_detection = true; + params->zero_page_detection = s->parameters.zero_page_detection; params->has_hdbss_buffer_size = true; params->hdbss_buffer_size = s->parameters.hdbss_buffer_size; @@ -1081,6 +1093,7 @@ void migrate_params_init(MigrationParameters *params) params->has_x_vcpu_dirty_limit_period = true; params->has_vcpu_dirty_limit = true; params->has_mode = true; + params->has_zero_page_detection = true; params->has_hdbss_buffer_size = true; params->sev_pdh = g_strdup(""); @@ -1422,6 +1435,10 @@ static void migrate_params_test_apply(MigrateSetParameters *params, dest->mode = params->mode; } + if (params->has_zero_page_detection) { + dest->zero_page_detection = params->zero_page_detection; + } + if (params->sev_pdh) { assert(params->sev_pdh->type == QTYPE_QSTRING); dest->sev_pdh = params->sev_pdh->u.s; @@ -1593,6 +1610,10 @@ static void migrate_params_apply(MigrateSetParameters *params, Error **errp) s->parameters.mode = params->mode; } + if (params->has_zero_page_detection) { + s->parameters.zero_page_detection = params->zero_page_detection; + } + if (params->sev_pdh) { g_free(s->parameters.sev_pdh); assert(params->sev_pdh->type == QTYPE_QSTRING); diff --git a/migration/options.h b/migration/options.h index 987fc81a18..dbd52d7acd 100644 --- a/migration/options.h +++ b/migration/options.h @@ -95,6 +95,7 @@ const char 
*migrate_tls_authz(void); const char *migrate_tls_creds(void); const char *migrate_tls_hostname(void); uint64_t migrate_xbzrle_cache_size(void); +ZeroPageDetection migrate_zero_page_detection(void); /* parameters setters */ diff --git a/migration/ram.c b/migration/ram.c index 9630b654c2..7d0f1120df 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -1141,6 +1141,10 @@ static int save_zero_page(RAMState *rs, PageSearchStatus *pss, QEMUFile *file = pss->pss_channel; int len = 0; + if (migrate_zero_page_detection() == ZERO_PAGE_DETECTION_NONE) { + return 0; + } + if (!buffer_is_zero(p, TARGET_PAGE_SIZE)) { return 0; } diff --git a/qapi/migration.json b/qapi/migration.json index f672da5c0d..ff247a50ce 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -653,6 +653,18 @@ { 'enum': 'MigMode', 'data': [ 'normal', 'cpr-reboot' ] } +## +# @ZeroPageDetection: +# +# @none: Do not perform zero page checking. +# +# @legacy: Perform zero page checking in main migration thread. +# +# Since: 9.0 +## +{ 'enum': 'ZeroPageDetection', + 'data': [ 'none', 'legacy' ] } + ## # @BitmapMigrationBitmapAliasTransform: # @@ -891,6 +903,10 @@ # @mode: Migration mode. See description in @MigMode. Default is 'normal'. # (Since 8.2) # +# @zero-page-detection: Whether and how to detect zero pages. +# See description in @ZeroPageDetection. Default is 'legacy'. +# (since 9.0) +# # @sev-pdh: The target host platform diffie-hellman key encoded in base64, or # pdh filename for hygon # (Since 4.2) @@ -940,6 +956,7 @@ { 'name': 'x-vcpu-dirty-limit-period', 'features': ['unstable'] }, 'vcpu-dirty-limit', 'mode', + 'zero-page-detection', 'sev-pdh', 'sev-plat-cert', 'sev-amd-cert', 'hdbss-buffer-size'] } ## @@ -1098,6 +1115,10 @@ # @mode: Migration mode. See description in @MigMode. Default is 'normal'. # (Since 8.2) # +# @zero-page-detection: Whether and how to detect zero pages. +# See description in @ZeroPageDetection. Default is 'legacy'. +# (since 9.0) +# # @sev-pdh: The target host platform diffie-hellman key encoded in base64, or # pdh filename for hygon # (Since 4.2) @@ -1169,12 +1190,12 @@ 'features': [ 'unstable' ] }, '*vcpu-dirty-limit': 'uint64', '*mode': 'MigMode', + '*zero-page-detection': 'ZeroPageDetection', '*sev-pdh': 'StrOrNull', '*sev-plat-cert': 'StrOrNull', '*sev-amd-cert' : 'StrOrNull', '*hdbss-buffer-size': 'uint8'} } - ## # @migrate-set-parameters: # @@ -1351,6 +1372,10 @@ # @mode: Migration mode. See description in @MigMode. Default is 'normal'. # (Since 8.2) # +# @zero-page-detection: Whether and how to detect zero pages. +# See description in @ZeroPageDetection. Default is 'legacy'. +# (since 9.0) +# # @sev-pdh: The target host platform diffie-hellman key encoded in base64, or # pdh filename for hygon # (Since 4.2) @@ -1418,6 +1443,7 @@ 'features': [ 'unstable' ] }, '*vcpu-dirty-limit': 'uint64', '*mode': 'MigMode', + '*zero-page-detection': 'ZeroPageDetection', '*sev-pdh': 'str', '*sev-plat-cert': 'str', '*sev-amd-cert' : 'str', -- Gitee From 68f37655bf414e74c623164c9c20bc7884ee5bb8 Mon Sep 17 00:00:00 2001 From: Hao Xiang Date: Mon, 11 Mar 2024 18:00:12 +0000 Subject: [PATCH 873/939] migration/multifd: Implement zero page transmission on the multifd thread. commit 303e6f54f9657be76ee060006ee2d4cacff263a0 upstream. 1. Add zero_pages field in MultiFDPacket_t. 2. Implements the zero page detection and handling on the multifd threads for non-compression, zlib and zstd compression backends. 3. Added a new value 'multifd' in ZeroPageDetection enumeration. 4. 
Adds zero page counters and updates multifd send/receive tracing format to track the newly added counters. Signed-off-by: Hao Xiang Acked-by: Markus Armbruster Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240311180015.3359271-5-hao.xiang@linux.dev Signed-off-by: Peter Xu Conflicts: migration/meson.build migration/multifd.c [jz: there is no multifd_set_file_bitmap() because we didn't backport mapped-ram, so abandon changes in multifd_set_file_bitmap()] Signed-off-by: Jason Zeng --- hw/core/qdev-properties-system.c | 2 +- migration/meson.build | 1 + migration/multifd-zero-page.c | 87 ++++++++++++++++++++++++++++++++ migration/multifd-zlib.c | 21 ++++++-- migration/multifd-zstd.c | 20 ++++++-- migration/multifd.c | 83 +++++++++++++++++++++++++----- migration/multifd.h | 23 ++++++++- migration/ram.c | 1 - migration/trace-events | 8 +-- qapi/migration.json | 7 ++- 10 files changed, 222 insertions(+), 31 deletions(-) create mode 100644 migration/multifd-zero-page.c diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-system.c index cad1e04150..b3b9238b65 100644 --- a/hw/core/qdev-properties-system.c +++ b/hw/core/qdev-properties-system.c @@ -735,7 +735,7 @@ const PropertyInfo qdev_prop_mig_mode = { const PropertyInfo qdev_prop_zero_page_detection = { .name = "ZeroPageDetection", .description = "zero_page_detection values, " - "none,legacy", + "none,legacy,multifd", .enum_table = &ZeroPageDetection_lookup, .get = qdev_propinfo_get_enum, .set = qdev_propinfo_set_enum, diff --git a/migration/meson.build b/migration/meson.build index d9b46ef0df..d619ebf238 100644 --- a/migration/meson.build +++ b/migration/meson.build @@ -22,6 +22,7 @@ system_ss.add(files( 'migration.c', 'multifd.c', 'multifd-zlib.c', + 'multifd-zero-page.c', 'options.c', 'postcopy-ram.c', 'savevm.c', diff --git a/migration/multifd-zero-page.c b/migration/multifd-zero-page.c new file mode 100644 index 0000000000..1ba38be636 --- /dev/null +++ b/migration/multifd-zero-page.c @@ -0,0 +1,87 @@ +/* + * Multifd zero page detection implementation. + * + * Copyright (c) 2024 Bytedance Inc + * + * Authors: + * Hao Xiang + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qemu/cutils.h" +#include "exec/ramblock.h" +#include "migration.h" +#include "multifd.h" +#include "options.h" +#include "ram.h" + +static bool multifd_zero_page_enabled(void) +{ + return migrate_zero_page_detection() == ZERO_PAGE_DETECTION_MULTIFD; +} + +static void swap_page_offset(ram_addr_t *pages_offset, int a, int b) +{ + ram_addr_t temp; + + if (a == b) { + return; + } + + temp = pages_offset[a]; + pages_offset[a] = pages_offset[b]; + pages_offset[b] = temp; +} + +/** + * multifd_send_zero_page_detect: Perform zero page detection on all pages. + * + * Sorts normal pages before zero pages in p->pages->offset and updates + * p->pages->normal_num. + * + * @param p A pointer to the send params. + */ +void multifd_send_zero_page_detect(MultiFDSendParams *p) +{ + MultiFDPages_t *pages = p->pages; + RAMBlock *rb = pages->block; + int i = 0; + int j = pages->num - 1; + + if (!multifd_zero_page_enabled()) { + pages->normal_num = pages->num; + return; + } + + /* + * Sort the page offset array by moving all normal pages to + * the left and all zero pages to the right of the array. 
+ */ + while (i <= j) { + uint64_t offset = pages->offset[i]; + + if (!buffer_is_zero(rb->host + offset, p->page_size)) { + i++; + continue; + } + + swap_page_offset(pages->offset, i, j); + ram_release_page(rb->idstr, offset); + j--; + } + + pages->normal_num = i; +} + +void multifd_recv_zero_page_process(MultiFDRecvParams *p) +{ + for (int i = 0; i < p->zero_num; i++) { + void *page = p->host + p->zero[i]; + if (!buffer_is_zero(page, p->page_size)) { + memset(page, 0, p->page_size); + } + } +} diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c index 6120faad65..83c0374380 100644 --- a/migration/multifd-zlib.c +++ b/migration/multifd-zlib.c @@ -123,13 +123,15 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) int ret; uint32_t i; - multifd_send_prepare_header(p); + if (!multifd_send_prepare_common(p)) { + goto out; + } - for (i = 0; i < pages->num; i++) { + for (i = 0; i < pages->normal_num; i++) { uint32_t available = z->zbuff_len - out_size; int flush = Z_NO_FLUSH; - if (i == pages->num - 1) { + if (i == pages->normal_num - 1) { flush = Z_SYNC_FLUSH; } @@ -172,10 +174,10 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) p->iov[p->iovs_num].iov_len = out_size; p->iovs_num++; p->next_packet_size = out_size; - p->flags |= MULTIFD_FLAG_ZLIB; +out: + p->flags |= MULTIFD_FLAG_ZLIB; multifd_send_fill_packet(p); - return 0; } @@ -261,6 +263,14 @@ static int zlib_recv(MultiFDRecvParams *p, Error **errp) p->id, flags, MULTIFD_FLAG_ZLIB); return -1; } + + multifd_recv_zero_page_process(p); + + if (!p->normal_num) { + assert(in_size == 0); + return 0; + } + ret = qio_channel_read_all(p->c, (void *)z->zbuff, in_size, errp); if (ret != 0) { @@ -310,6 +320,7 @@ static int zlib_recv(MultiFDRecvParams *p, Error **errp) p->id, out_size, expected_size); return -1; } + return 0; } diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c index cac236833d..02112255ad 100644 --- a/migration/multifd-zstd.c +++ b/migration/multifd-zstd.c @@ -118,16 +118,18 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) int ret; uint32_t i; - multifd_send_prepare_header(p); + if (!multifd_send_prepare_common(p)) { + goto out; + } z->out.dst = z->zbuff; z->out.size = z->zbuff_len; z->out.pos = 0; - for (i = 0; i < pages->num; i++) { + for (i = 0; i < pages->normal_num; i++) { ZSTD_EndDirective flush = ZSTD_e_continue; - if (i == pages->num - 1) { + if (i == pages->normal_num - 1) { flush = ZSTD_e_flush; } z->in.src = p->pages->block->host + pages->offset[i]; @@ -161,10 +163,10 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) p->iov[p->iovs_num].iov_len = z->out.pos; p->iovs_num++; p->next_packet_size = z->out.pos; - p->flags |= MULTIFD_FLAG_ZSTD; +out: + p->flags |= MULTIFD_FLAG_ZSTD; multifd_send_fill_packet(p); - return 0; } @@ -257,6 +259,14 @@ static int zstd_recv(MultiFDRecvParams *p, Error **errp) p->id, flags, MULTIFD_FLAG_ZSTD); return -1; } + + multifd_recv_zero_page_process(p); + + if (!p->normal_num) { + assert(in_size == 0); + return 0; + } + ret = qio_channel_read_all(p->c, (void *)z->zbuff, in_size, errp); if (ret != 0) { diff --git a/migration/multifd.c b/migration/multifd.c index cac5f2743c..6c01179858 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -11,6 +11,7 @@ */ #include "qemu/osdep.h" +#include "qemu/cutils.h" #include "qemu/rcu.h" #include "exec/target_page.h" #include "sysemu/sysemu.h" @@ -132,13 +133,13 @@ static void multifd_send_prepare_iovs(MultiFDSendParams *p) { MultiFDPages_t *pages = 
p->pages; - for (int i = 0; i < pages->num; i++) { + for (int i = 0; i < pages->normal_num; i++) { p->iov[p->iovs_num].iov_base = pages->block->host + pages->offset[i]; p->iov[p->iovs_num].iov_len = p->page_size; p->iovs_num++; } - p->next_packet_size = pages->num * p->page_size; + p->next_packet_size = pages->normal_num * p->page_size; } /** @@ -157,6 +158,8 @@ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp) bool use_zero_copy_send = migrate_zero_copy_send(); int ret; + multifd_send_zero_page_detect(p); + if (!multifd_use_packets()) { multifd_send_prepare_iovs(p); return 0; @@ -238,6 +241,13 @@ static int nocomp_recv(MultiFDRecvParams *p, Error **errp) p->id, flags, MULTIFD_FLAG_NOCOMP); return -1; } + + multifd_recv_zero_page_process(p); + + if (!p->normal_num) { + return 0; + } + for (int i = 0; i < p->normal_num; i++) { p->iov[i].iov_base = p->host + p->normal[i]; p->iov[i].iov_len = p->page_size; @@ -272,6 +282,7 @@ static void multifd_pages_reset(MultiFDPages_t *pages) * overwritten later when reused. */ pages->num = 0; + pages->normal_num = 0; pages->block = NULL; } @@ -363,11 +374,13 @@ void multifd_send_fill_packet(MultiFDSendParams *p) MultiFDPacket_t *packet = p->packet; MultiFDPages_t *pages = p->pages; uint64_t packet_num; + uint32_t zero_num = pages->num - pages->normal_num; int i; packet->flags = cpu_to_be32(p->flags); packet->pages_alloc = cpu_to_be32(p->pages->allocated); - packet->normal_pages = cpu_to_be32(pages->num); + packet->normal_pages = cpu_to_be32(pages->normal_num); + packet->zero_pages = cpu_to_be32(zero_num); packet->next_packet_size = cpu_to_be32(p->next_packet_size); packet_num = qatomic_fetch_inc(&multifd_send_state->packet_num); @@ -385,10 +398,11 @@ void multifd_send_fill_packet(MultiFDSendParams *p) } p->packets_sent++; - p->total_normal_pages += pages->num; + p->total_normal_pages += pages->normal_num; + p->total_zero_pages += zero_num; - trace_multifd_send(p->id, packet_num, pages->num, p->flags, - p->next_packet_size); + trace_multifd_send(p->id, packet_num, pages->normal_num, zero_num, + p->flags, p->next_packet_size); } static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) @@ -429,20 +443,29 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) p->normal_num = be32_to_cpu(packet->normal_pages); if (p->normal_num > packet->pages_alloc) { error_setg(errp, "multifd: received packet " - "with %u pages and expected maximum pages are %u", + "with %u normal pages and expected maximum pages are %u", p->normal_num, packet->pages_alloc) ; return -1; } + p->zero_num = be32_to_cpu(packet->zero_pages); + if (p->zero_num > packet->pages_alloc - p->normal_num) { + error_setg(errp, "multifd: received packet " + "with %u zero pages and expected maximum zero pages are %u", + p->zero_num, packet->pages_alloc - p->normal_num) ; + return -1; + } + p->next_packet_size = be32_to_cpu(packet->next_packet_size); p->packet_num = be64_to_cpu(packet->packet_num); p->packets_recved++; p->total_normal_pages += p->normal_num; + p->total_zero_pages += p->zero_num; - trace_multifd_recv(p->id, p->packet_num, p->normal_num, p->flags, - p->next_packet_size); + trace_multifd_recv(p->id, p->packet_num, p->normal_num, p->zero_num, + p->flags, p->next_packet_size); - if (p->normal_num == 0) { + if (p->normal_num == 0 && p->zero_num == 0) { return 0; } @@ -468,6 +491,18 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) p->normal[i] = offset; } + for (i = 0; i < p->zero_num; i++) { + uint64_t offset = 
be64_to_cpu(packet->offset[p->normal_num + i]); + + if (offset > (p->block->used_length - p->page_size)) { + error_setg(errp, "multifd: offset too long %" PRIu64 + " (max " RAM_ADDR_FMT ")", + offset, p->block->used_length); + return -1; + } + p->zero[i] = offset; + } + return 0; } @@ -866,6 +901,8 @@ static void *multifd_send_thread(void *opaque) stat64_add(&mig_stats.multifd_bytes, (uint64_t)p->next_packet_size + p->packet_len); + stat64_add(&mig_stats.normal_pages, pages->normal_num); + stat64_add(&mig_stats.zero_pages, pages->num - pages->normal_num); multifd_pages_reset(p->pages); p->next_packet_size = 0; @@ -913,7 +950,8 @@ out: rcu_unregister_thread(); migration_threads_remove(thread); - trace_multifd_send_thread_end(p->id, p->packets_sent, p->total_normal_pages); + trace_multifd_send_thread_end(p->id, p->packets_sent, p->total_normal_pages, + p->total_zero_pages); return NULL; } @@ -1189,6 +1227,8 @@ static void multifd_recv_cleanup_channel(MultiFDRecvParams *p) p->iov = NULL; g_free(p->normal); p->normal = NULL; + g_free(p->zero); + p->zero = NULL; multifd_recv_state->ops->recv_cleanup(p); } @@ -1294,7 +1334,7 @@ static void *multifd_recv_thread(void *opaque) flags = p->flags; /* recv methods don't know how to handle the SYNC flag */ p->flags &= ~MULTIFD_FLAG_SYNC; - has_data = !!p->normal_num; + has_data = p->normal_num || p->zero_num; qemu_mutex_unlock(&p->mutex); } @@ -1319,7 +1359,9 @@ static void *multifd_recv_thread(void *opaque) } rcu_unregister_thread(); - trace_multifd_recv_thread_end(p->id, p->packets_recved, p->total_normal_pages); + trace_multifd_recv_thread_end(p->id, p->packets_recved, + p->total_normal_pages, + p->total_zero_pages); return NULL; } @@ -1362,6 +1404,7 @@ int multifd_recv_setup(Error **errp) p->name = g_strdup_printf("multifdrecv_%d", i); p->iov = g_new0(struct iovec, page_count); p->normal = g_new0(ram_addr_t, page_count); + p->zero = g_new0(ram_addr_t, page_count); p->page_count = page_count; p->page_size = qemu_target_page_size(); } @@ -1437,3 +1480,17 @@ void multifd_recv_new_channel(QIOChannel *ioc, Error **errp) QEMU_THREAD_JOINABLE); qatomic_inc(&multifd_recv_state->count); } + +bool multifd_send_prepare_common(MultiFDSendParams *p) +{ + multifd_send_zero_page_detect(p); + + if (!p->pages->normal_num) { + p->next_packet_size = 0; + return false; + } + + multifd_send_prepare_header(p); + + return true; +} diff --git a/migration/multifd.h b/migration/multifd.h index 6a54377cc1..d99603c6a4 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -48,14 +48,24 @@ typedef struct { /* size of the next packet that contains pages */ uint32_t next_packet_size; uint64_t packet_num; - uint64_t unused[4]; /* Reserved for future use */ + /* zero pages */ + uint32_t zero_pages; + uint32_t unused32[1]; /* Reserved for future use */ + uint64_t unused64[3]; /* Reserved for future use */ char ramblock[256]; + /* + * This array contains the pointers to: + * - normal pages (initial normal_pages entries) + * - zero pages (following zero_pages entries) + */ uint64_t offset[]; } __attribute__((packed)) MultiFDPacket_t; typedef struct { /* number of used pages */ uint32_t num; + /* number of normal pages */ + uint32_t normal_num; /* number of allocated pages */ uint32_t allocated; /* offset of each page */ @@ -122,6 +132,8 @@ typedef struct { uint64_t packets_sent; /* non zero pages sent through this channel */ uint64_t total_normal_pages; + /* zero pages sent through this channel */ + uint64_t total_zero_pages; /* buffers to send */ struct iovec *iov; /* number 
of iovs used */ @@ -176,12 +188,18 @@ typedef struct { uint8_t *host; /* non zero pages recv through this channel */ uint64_t total_normal_pages; + /* zero pages recv through this channel */ + uint64_t total_zero_pages; /* buffers to recv */ struct iovec *iov; /* Pages that are not zero */ ram_addr_t *normal; /* num of non zero pages */ uint32_t normal_num; + /* Pages that are zero */ + ram_addr_t *zero; + /* num of zero pages */ + uint32_t zero_num; /* used for de-compression methods */ void *compress_data; } MultiFDRecvParams; @@ -203,6 +221,9 @@ typedef struct { void multifd_register_ops(int method, MultiFDMethods *ops); void multifd_send_fill_packet(MultiFDSendParams *p); +bool multifd_send_prepare_common(MultiFDSendParams *p); +void multifd_send_zero_page_detect(MultiFDSendParams *p); +void multifd_recv_zero_page_process(MultiFDRecvParams *p); static inline void multifd_send_prepare_header(MultiFDSendParams *p) { diff --git a/migration/ram.c b/migration/ram.c index 7d0f1120df..bae5853996 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -1396,7 +1396,6 @@ static int ram_save_multifd_page(RAMBlock *block, ram_addr_t offset) if (!multifd_queue_page(block, offset)) { return -1; } - stat64_add(&mig_stats.normal_pages, 1); return 1; } diff --git a/migration/trace-events b/migration/trace-events index bf1a069632..f0e1cb80c7 100644 --- a/migration/trace-events +++ b/migration/trace-events @@ -128,21 +128,21 @@ postcopy_preempt_reset_channel(void) "" # multifd.c multifd_new_send_channel_async(uint8_t id) "channel %u" multifd_new_send_channel_async_error(uint8_t id, void *err) "channel=%u err=%p" -multifd_recv(uint8_t id, uint64_t packet_num, uint32_t used, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " pages %u flags 0x%x next packet size %u" +multifd_recv(uint8_t id, uint64_t packet_num, uint32_t normal, uint32_t zero, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " normal pages %u zero pages %u flags 0x%x next packet size %u" multifd_recv_new_channel(uint8_t id) "channel %u" multifd_recv_sync_main(long packet_num) "packet num %ld" multifd_recv_sync_main_signal(uint8_t id) "channel %u" multifd_recv_sync_main_wait(uint8_t id) "iter %u" multifd_recv_terminate_threads(bool error) "error %d" -multifd_recv_thread_end(uint8_t id, uint64_t packets, uint64_t pages) "channel %u packets %" PRIu64 " pages %" PRIu64 +multifd_recv_thread_end(uint8_t id, uint64_t packets, uint64_t normal_pages, uint64_t zero_pages) "channel %u packets %" PRIu64 " normal pages %" PRIu64 " zero pages %" PRIu64 multifd_recv_thread_start(uint8_t id) "%u" -multifd_send(uint8_t id, uint64_t packet_num, uint32_t normal, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " normal pages %u flags 0x%x next packet size %u" +multifd_send(uint8_t id, uint64_t packet_num, uint32_t normal_pages, uint32_t zero_pages, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " normal pages %u zero pages %u flags 0x%x next packet size %u" multifd_send_error(uint8_t id) "channel %u" multifd_send_sync_main(long packet_num) "packet num %ld" multifd_send_sync_main_signal(uint8_t id) "channel %u" multifd_send_sync_main_wait(uint8_t id) "channel %u" multifd_send_terminate_threads(void) "" -multifd_send_thread_end(uint8_t id, uint64_t packets, uint64_t normal_pages) "channel %u packets %" PRIu64 " normal pages %" PRIu64 +multifd_send_thread_end(uint8_t id, uint64_t packets, uint64_t normal_pages, uint64_t zero_pages) "channel %u packets %" 
PRIu64 " normal pages %" PRIu64 " zero pages %" PRIu64 multifd_send_thread_start(uint8_t id) "%u" multifd_tls_outgoing_handshake_start(void *ioc, void *tioc, const char *hostname) "ioc=%p tioc=%p hostname=%s" multifd_tls_outgoing_handshake_error(void *ioc, const char *err) "ioc=%p err=%s" diff --git a/qapi/migration.json b/qapi/migration.json index ff247a50ce..fc3178b1dc 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -660,10 +660,15 @@ # # @legacy: Perform zero page checking in main migration thread. # +# @multifd: Perform zero page checking in multifd sender thread if +# multifd migration is enabled, else in the main migration +# thread as for @legacy. +# # Since: 9.0 +# ## { 'enum': 'ZeroPageDetection', - 'data': [ 'none', 'legacy' ] } + 'data': [ 'none', 'legacy', 'multifd' ] } ## # @BitmapMigrationBitmapAliasTransform: -- Gitee From 5107700317e5cba24822f71615a001a8a62fea07 Mon Sep 17 00:00:00 2001 From: Hao Xiang Date: Mon, 11 Mar 2024 18:00:13 +0000 Subject: [PATCH 874/939] migration/multifd: Implement ram_save_target_page_multifd to handle multifd version of MigrationOps::ram_save_target_page. commit 9ae90f73e623c8b8c7ec1fccd8ca493805df8fbd upstream. 1. Add a dedicated handler for MigrationOps::ram_save_target_page in multifd live migration. 2. Refactor ram_save_target_page_legacy so that the legacy and multifd handlers don't have internal functions calling into each other. Signed-off-by: Hao Xiang Reviewed-by: Fabiano Rosas Message-Id: <20240226195654.934709-4-hao.xiang@bytedance.com> Link: https://lore.kernel.org/r/20240311180015.3359271-6-hao.xiang@linux.dev Signed-off-by: Peter Xu [jz: resolve context conflict due to BQL name] Signed-off-by: Jason Zeng --- migration/ram.c | 38 +++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/migration/ram.c b/migration/ram.c index bae5853996..fe2e4c6164 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -2233,7 +2233,6 @@ static bool encrypted_test_list(RAMState *rs, RAMBlock *block, */ static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss) { - RAMBlock *block = pss->block; ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; int res; @@ -2260,17 +2259,33 @@ static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss) return 1; } + return ram_save_page(rs, pss); +} + +/** + * ram_save_target_page_multifd: send one target page to multifd workers + * + * Returns 1 if the page was queued, -1 otherwise. + * + * @rs: current RAM state + * @pss: data about the page we want to send + */ +static int ram_save_target_page_multifd(RAMState *rs, PageSearchStatus *pss) +{ + RAMBlock *block = pss->block; + ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; + /* - * Do not use multifd in postcopy as one whole host page should be - * placed. Meanwhile postcopy requires atomic update of pages, so even - * if host page size == guest page size the dest guest during run may - * still see partially copied pages which is data corruption. + * While using multifd live migration, we still need to handle zero + * page checking on the migration main thread. 
*/ - if (migrate_multifd() && !migration_in_postcopy()) { - return ram_save_multifd_page(block, offset); + if (migrate_zero_page_detection() == ZERO_PAGE_DETECTION_LEGACY) { + if (save_zero_page(rs, pss, offset)) { + return 1; + } } - return ram_save_page(rs, pss); + return ram_save_multifd_page(block, offset); } /* Should be called before sending a host page */ @@ -3433,7 +3448,12 @@ static int ram_save_setup(QEMUFile *f, void *opaque) } migration_ops = g_malloc0(sizeof(MigrationOps)); - migration_ops->ram_save_target_page = ram_save_target_page_legacy; + + if (migrate_multifd()) { + migration_ops->ram_save_target_page = ram_save_target_page_multifd; + } else { + migration_ops->ram_save_target_page = ram_save_target_page_legacy; + } qemu_mutex_unlock_iothread(); ret = multifd_send_sync_main(); -- Gitee From 23a7d46a9bf6b0d692155eca9be0b7607db5d861 Mon Sep 17 00:00:00 2001 From: Jinqian Yang Date: Fri, 16 May 2025 18:20:10 +0800 Subject: [PATCH 875/939] Revert "target/arm: Change arm_cpu_mp_affinity when enabled IPIV feature" virt inclusion category: feature bugzilla: https://gitee.com/openeuler/qemu/issues/IC1EV7 ------------------------------------------------------------------------ This reverts commit 33aa02dc05bed8316b1c64131e8269f404287598. OpenEuler kernel OLK-6.6 add the SMCCC interface so that the guest OS can control the enabling of IPIV. When IPIV is enabled, the guest OS uses multiple unicast to implement multicast. So do not need to modify the MPIDR. Signed-off-by: Jinqian Yang --- linux-headers/linux/kvm.h | 2 -- target/arm/cpu.c | 22 +++------------------- 2 files changed, 3 insertions(+), 21 deletions(-) diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h index a19683f1e9..b711c04506 100644 --- a/linux-headers/linux/kvm.h +++ b/linux-headers/linux/kvm.h @@ -1205,8 +1205,6 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_SEV_ES_GHCB 500 #define KVM_CAP_HYGON_COCO_EXT 501 - -#define KVM_CAP_ARM_IPIV_MODE 503 /* support userspace to request firmware to build CSV3 guest's memory space */ #define KVM_CAP_HYGON_COCO_EXT_CSV3_SET_PRIV_MEM (1 << 0) /* support request to update CSV3 guest's memory region multiple times */ diff --git a/target/arm/cpu.c b/target/arm/cpu.c index b0f70de018..09d391bd34 100644 --- a/target/arm/cpu.c +++ b/target/arm/cpu.c @@ -1324,25 +1324,9 @@ static void arm_cpu_dump_state(CPUState *cs, FILE *f, int flags) uint64_t arm_cpu_mp_affinity(int idx, uint8_t clustersz) { - uint64_t Aff0 = 0, Aff1 = 0, Aff2 = 0, Aff3 = 0; - int mode; - - if (!kvm_enabled()) { - Aff1 = idx / clustersz; - Aff0 = idx % clustersz; - return (Aff1 << ARM_AFF1_SHIFT) | Aff0; - } - - mode = kvm_check_extension(kvm_state, KVM_CAP_ARM_IPIV_MODE); - if (mode) { - Aff1 = idx % 16; - Aff2 = idx / 16; - } else { - Aff1 = idx / clustersz; - Aff0 = idx % clustersz; - } - return (Aff3 << ARM_AFF3_SHIFT) | (Aff2 << ARM_AFF2_SHIFT) | - (Aff1 << ARM_AFF1_SHIFT) | Aff0; + uint32_t Aff1 = idx / clustersz; + uint32_t Aff0 = idx % clustersz; + return (Aff1 << ARM_AFF1_SHIFT) | Aff0; } static void arm_cpu_initfn(Object *obj) -- Gitee From 57c611db900ca4373f3a34d3d87d57bb4f0bba00 Mon Sep 17 00:00:00 2001 From: Yuan Liu Date: Mon, 1 Apr 2024 23:41:10 +0800 Subject: [PATCH 876/939] migration/multifd: solve zero page causing multiple page faults commit 5ef7e26bdb7eda10d6d5e1b77121be9945e5e550 upstream. Implemented recvbitmap tracking of received pages in multifd. If the zero page appears for the first time in the recvbitmap, this page is not checked and set. 
If the zero page has already appeared in the recvbitmap, there is no need to check the data but directly set the data to 0, because it is unlikely that the zero page will be migrated multiple times. Signed-off-by: Yuan Liu Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240401154110.2028453-2-yuan1.liu@intel.com [peterx: touch up the comment, as the bitmap is used outside postcopy now] Signed-off-by: Peter Xu Conflicts: include/exec/ramblock.h [jz: resolve context conflict due to mapped-ram which was not backported] Signed-off-by: Jason Zeng --- include/exec/ramblock.h | 2 +- migration/multifd-zero-page.c | 4 +++- migration/multifd-zlib.c | 1 + migration/multifd-zstd.c | 1 + migration/multifd.c | 1 + migration/ram.c | 4 ++++ migration/ram.h | 1 + 7 files changed, 12 insertions(+), 2 deletions(-) diff --git a/include/exec/ramblock.h b/include/exec/ramblock.h index 69c6a53902..8f9579ed70 100644 --- a/include/exec/ramblock.h +++ b/include/exec/ramblock.h @@ -44,7 +44,7 @@ struct RAMBlock { size_t page_size; /* dirty bitmap used during migration */ unsigned long *bmap; - /* bitmap of already received pages in postcopy */ + /* Bitmap of already received pages. Only used on destination side. */ unsigned long *receivedmap; /* diff --git a/migration/multifd-zero-page.c b/migration/multifd-zero-page.c index 1ba38be636..e1b8370f88 100644 --- a/migration/multifd-zero-page.c +++ b/migration/multifd-zero-page.c @@ -80,8 +80,10 @@ void multifd_recv_zero_page_process(MultiFDRecvParams *p) { for (int i = 0; i < p->zero_num; i++) { void *page = p->host + p->zero[i]; - if (!buffer_is_zero(page, p->page_size)) { + if (ramblock_recv_bitmap_test_byte_offset(p->block, p->zero[i])) { memset(page, 0, p->page_size); + } else { + ramblock_recv_bitmap_set_offset(p->block, p->zero[i]); } } } diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c index 83c0374380..b210725f6e 100644 --- a/migration/multifd-zlib.c +++ b/migration/multifd-zlib.c @@ -284,6 +284,7 @@ static int zlib_recv(MultiFDRecvParams *p, Error **errp) int flush = Z_NO_FLUSH; unsigned long start = zs->total_out; + ramblock_recv_bitmap_set_offset(p->block, p->normal[i]); if (i == p->normal_num - 1) { flush = Z_SYNC_FLUSH; } diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c index 02112255ad..256858df0a 100644 --- a/migration/multifd-zstd.c +++ b/migration/multifd-zstd.c @@ -278,6 +278,7 @@ static int zstd_recv(MultiFDRecvParams *p, Error **errp) z->in.pos = 0; for (i = 0; i < p->normal_num; i++) { + ramblock_recv_bitmap_set_offset(p->block, p->normal[i]); z->out.dst = p->host + p->normal[i]; z->out.size = p->page_size; z->out.pos = 0; diff --git a/migration/multifd.c b/migration/multifd.c index 6c01179858..4394952fbb 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -251,6 +251,7 @@ static int nocomp_recv(MultiFDRecvParams *p, Error **errp) for (int i = 0; i < p->normal_num; i++) { p->iov[i].iov_base = p->host + p->normal[i]; p->iov[i].iov_len = p->page_size; + ramblock_recv_bitmap_set_offset(p->block, p->normal[i]); } return qio_channel_readv_all(p->c, p->iov, p->normal_num, errp); } diff --git a/migration/ram.c b/migration/ram.c index fe2e4c6164..6acf518a34 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -275,6 +275,10 @@ void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr, nr); } +void ramblock_recv_bitmap_set_offset(RAMBlock *rb, uint64_t byte_offset) +{ + set_bit_atomic(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap); +} #define RAMBLOCK_RECV_BITMAP_ENDING 
(0x0123456789abcdefULL) /* diff --git a/migration/ram.h b/migration/ram.h index 9b937a446b..cd263df026 100644 --- a/migration/ram.h +++ b/migration/ram.h @@ -69,6 +69,7 @@ int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr); bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset); void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr); void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr, size_t nr); +void ramblock_recv_bitmap_set_offset(RAMBlock *rb, uint64_t byte_offset); int64_t ramblock_recv_bitmap_send(QEMUFile *file, const char *block_name); bool ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *rb, Error **errp); -- Gitee From 4c4e9830f3bee7313f3ac49fe4887f040fd85f7a Mon Sep 17 00:00:00 2001 From: Yuan Liu Date: Mon, 10 Jun 2024 18:21:04 +0800 Subject: [PATCH 877/939] docs/migration: add qpl compression feature commit 0d40b3d76ced77c1c82c77a636af703fabdb407c upstream. add Intel Query Processing Library (QPL) compression method introduction Signed-off-by: Yuan Liu Reviewed-by: Nanhai Zou Reviewed-by: Fabiano Rosas Acked-by: Peter Xu Signed-off-by: Fabiano Rosas Conflicts: docs/devel/migration/features.rst [jz: resolve simple context conflict] Signed-off-by: Jason Zeng --- docs/devel/migration/features.rst | 1 + docs/devel/migration/qpl-compression.rst | 260 +++++++++++++++++++++++ 2 files changed, 261 insertions(+) create mode 100644 docs/devel/migration/qpl-compression.rst diff --git a/docs/devel/migration/features.rst b/docs/devel/migration/features.rst index a9acaf618e..9819393c12 100644 --- a/docs/devel/migration/features.rst +++ b/docs/devel/migration/features.rst @@ -10,3 +10,4 @@ Migration has plenty of features to support different use cases. dirty-limit vfio virtio + qpl-compression diff --git a/docs/devel/migration/qpl-compression.rst b/docs/devel/migration/qpl-compression.rst new file mode 100644 index 0000000000..990992d786 --- /dev/null +++ b/docs/devel/migration/qpl-compression.rst @@ -0,0 +1,260 @@ +=============== +QPL Compression +=============== +The Intel Query Processing Library (Intel ``QPL``) is an open-source library to +provide compression and decompression features and it is based on deflate +compression algorithm (RFC 1951). + +The ``QPL`` compression relies on Intel In-Memory Analytics Accelerator(``IAA``) +and Shared Virtual Memory(``SVM``) technology, they are new features supported +from Intel 4th Gen Intel Xeon Scalable processors, codenamed Sapphire Rapids +processor(``SPR``). + +For more ``QPL`` introduction, please refer to `QPL Introduction +`_ + +QPL Compression Framework +========================= + +:: + + +----------------+ +------------------+ + | MultiFD Thread | |accel-config tool | + +-------+--------+ +--------+---------+ + | | + | | + |compress/decompress | + +-------+--------+ | Setup IAA + | QPL library | | Resources + +-------+---+----+ | + | | | + | +-------------+-------+ + | Open IAA | + | Devices +-----+-----+ + | |idxd driver| + | +-----+-----+ + | | + | | + | +-----+-----+ + +-----------+IAA Devices| + Submit jobs +-----------+ + via enqcmd + + +QPL Build And Installation +-------------------------- + +.. code-block:: shell + + $git clone --recursive https://github.com/intel/qpl.git qpl + $mkdir qpl/build + $cd qpl/build + $cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr -DQPL_LIBRARY_TYPE=SHARED .. + $sudo cmake --build . 
--target install + +For more details about ``QPL`` installation, please refer to `QPL Installation +`_ + +IAA Device Management +--------------------- + +The number of ``IAA`` devices will vary depending on the Xeon product model. +On a ``SPR`` server, there can be a maximum of 8 ``IAA`` devices, with up to +4 devices per socket. + +By default, all ``IAA`` devices are disabled and need to be configured and +enabled by users manually. + +Check the number of devices through the following command + +.. code-block:: shell + + #lspci -d 8086:0cfe + 6a:02.0 System peripheral: Intel Corporation Device 0cfe + 6f:02.0 System peripheral: Intel Corporation Device 0cfe + 74:02.0 System peripheral: Intel Corporation Device 0cfe + 79:02.0 System peripheral: Intel Corporation Device 0cfe + e7:02.0 System peripheral: Intel Corporation Device 0cfe + ec:02.0 System peripheral: Intel Corporation Device 0cfe + f1:02.0 System peripheral: Intel Corporation Device 0cfe + f6:02.0 System peripheral: Intel Corporation Device 0cfe + +IAA Device Configuration And Enabling +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The ``accel-config`` tool is used to enable ``IAA`` devices and configure +``IAA`` hardware resources(work queues and engines). One ``IAA`` device +has 8 work queues and 8 processing engines, multiple engines can be assigned +to a work queue via ``group`` attribute. + +For ``accel-config`` installation, please refer to `accel-config installation +`_ + +One example of configuring and enabling an ``IAA`` device. + +.. code-block:: shell + + #accel-config config-engine iax1/engine1.0 -g 0 + #accel-config config-engine iax1/engine1.1 -g 0 + #accel-config config-engine iax1/engine1.2 -g 0 + #accel-config config-engine iax1/engine1.3 -g 0 + #accel-config config-engine iax1/engine1.4 -g 0 + #accel-config config-engine iax1/engine1.5 -g 0 + #accel-config config-engine iax1/engine1.6 -g 0 + #accel-config config-engine iax1/engine1.7 -g 0 + #accel-config config-wq iax1/wq1.0 -g 0 -s 128 -p 10 -b 1 -t 128 -m shared -y user -n app1 -d user + #accel-config enable-device iax1 + #accel-config enable-wq iax1/wq1.0 + +.. note:: + IAX is an early name for IAA + +- The ``IAA`` device index is 1, use ``ls -lh /sys/bus/dsa/devices/iax*`` + command to query the ``IAA`` device index. + +- 8 engines and 1 work queue are configured in group 0, so all compression jobs + submitted to this work queue can be processed by all engines at the same time. + +- Set work queue attributes including the work mode, work queue size and so on. + +- Enable the ``IAA1`` device and work queue 1.0 + +.. note:: + + Set work queue mode to shared mode, since ``QPL`` library only supports + shared mode + +For more detailed configuration, please refer to `IAA Configuration Samples +`_ + +IAA Unit Test +^^^^^^^^^^^^^ + +- Enabling ``IAA`` devices for Xeon platform, please refer to `IAA User Guide + `_ + +- ``IAA`` device driver is Intel Data Accelerator Driver (idxd), it is + recommended that the minimum version of Linux kernel is 5.18. + +- Add ``"intel_iommu=on,sm_on"`` parameter to kernel command line + for ``SVM`` feature enabling. + +Here is an easy way to verify ``IAA`` device driver and ``SVM`` with `iaa_test +`_ + +.. 
code-block:: shell + + #./test/iaa_test + [ info] alloc wq 0 shared size 128 addr 0x7f26cebe5000 batch sz 0xfffffffe xfer sz 0x80000000 + [ info] test noop: tflags 0x1 num_desc 1 + [ info] preparing descriptor for noop + [ info] Submitted all noop jobs + [ info] verifying task result for 0x16f7e20 + [ info] test with op 0 passed + + +IAA Resources Allocation For Migration +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +There is no ``IAA`` resource configuration parameters for migration and +``accel-config`` tool configuration cannot directly specify the ``IAA`` +resources used for migration. + +The multifd migration with ``QPL`` compression method will use all work +queues that are enabled and shared mode. + +.. note:: + + Accessing IAA resources requires ``sudo`` command or ``root`` privileges + by default. Administrators can modify the IAA device node ownership + so that QEMU can use IAA with specified user permissions. + + For example + + #chown -R qemu /dev/iax + +Shared Virtual Memory(SVM) Introduction +======================================= + +An ability for an accelerator I/O device to operate in the same virtual +memory space of applications on host processors. It also implies the +ability to operate from pageable memory, avoiding functional requirements +to pin memory for DMA operations. + +When using ``SVM`` technology, users do not need to reserve memory for the +``IAA`` device and perform pin memory operation. The ``IAA`` device can +directly access data using the virtual address of the process. + +For more ``SVM`` technology, please refer to +`Shared Virtual Addressing (SVA) with ENQCMD +`_ + + +How To Use QPL Compression In Migration +======================================= + +1 - Installation of ``QPL`` library and ``accel-config`` library if using IAA + +2 - Configure and enable ``IAA`` devices and work queues via ``accel-config`` + +3 - Build ``QEMU`` with ``--enable-qpl`` parameter + + E.g. configure --target-list=x86_64-softmmu --enable-kvm ``--enable-qpl`` + +4 - Enable ``QPL`` compression during migration + + Set ``migrate_set_parameter multifd-compression qpl`` when migrating, the + ``QPL`` compression does not support configuring the compression level, it + only supports one compression level. + +The Difference Between QPL And ZLIB +=================================== + +Although both ``QPL`` and ``ZLIB`` are based on the deflate compression +algorithm, and ``QPL`` can support the header and tail of ``ZLIB``, ``QPL`` +is still not fully compatible with the ``ZLIB`` compression in the migration. + +``QPL`` only supports 4K history buffer, and ``ZLIB`` is 32K by default. +``ZLIB`` compresses data that ``QPL`` may not decompress correctly and +vice versa. + +``QPL`` does not support the ``Z_SYNC_FLUSH`` operation in ``ZLIB`` streaming +compression, current ``ZLIB`` implementation uses ``Z_SYNC_FLUSH``, so each +``multifd`` thread has a ``ZLIB`` streaming context, and all page compression +and decompression are based on this stream. ``QPL`` cannot decompress such data +and vice versa. + +The introduction for ``Z_SYNC_FLUSH``, please refer to `Zlib Manual +`_ + +The Best Practices +================== +When user enables the IAA device for ``QPL`` compression, it is recommended +to add ``-mem-prealloc`` parameter to the destination boot parameters. This +parameter can avoid the occurrence of I/O page fault and reduce the overhead +of IAA compression and decompression. + +The example of booting with ``-mem-prealloc`` parameter + +.. 
code-block:: shell + + $qemu-system-x86_64 --enable-kvm -cpu host --mem-prealloc ... + + +An example about I/O page fault measurement of destination without +``-mem-prealloc``, the ``svm_prq`` indicates the number of I/O page fault +occurrences and processing time. + +.. code-block:: shell + + #echo 1 > /sys/kernel/debug/iommu/intel/dmar_perf_latency + #echo 2 > /sys/kernel/debug/iommu/intel/dmar_perf_latency + #echo 3 > /sys/kernel/debug/iommu/intel/dmar_perf_latency + #echo 4 > /sys/kernel/debug/iommu/intel/dmar_perf_latency + #cat /sys/kernel/debug/iommu/intel/dmar_perf_latency + IOMMU: dmar18 Register Base Address: c87fc000 + <0.1us 0.1us-1us 1us-10us 10us-100us 100us-1ms 1ms-10ms >=10ms min(us) max(us) average(us) + inv_iotlb 0 286 123 0 0 0 0 0 1 0 + inv_devtlb 0 276 133 0 0 0 0 0 2 0 + inv_iec 0 0 0 0 0 0 0 0 0 0 + svm_prq 0 0 25206 364 395 0 0 1 556 9 -- Gitee From 4e0ebb941ba15c31e7d19d44189bf47fee3181c9 Mon Sep 17 00:00:00 2001 From: Yuan Liu Date: Mon, 10 Jun 2024 18:21:05 +0800 Subject: [PATCH 878/939] migration/multifd: put IOV initialization into compression method commit d9d3e4f243214f742425d9d8360f0794bb05c999 upstream. Different compression methods may require different numbers of IOVs. Based on streaming compression of zlib and zstd, all pages will be compressed to a data block, so two IOVs are needed for packet header and compressed data block. Signed-off-by: Yuan Liu Reviewed-by: Nanhai Zou Reviewed-by: Fabiano Rosas Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Signed-off-by: Jason Zeng --- migration/multifd-zlib.c | 7 +++++++ migration/multifd-zstd.c | 8 +++++++- migration/multifd.c | 22 ++++++++++++---------- 3 files changed, 26 insertions(+), 11 deletions(-) diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c index b210725f6e..2df4983780 100644 --- a/migration/multifd-zlib.c +++ b/migration/multifd-zlib.c @@ -70,6 +70,10 @@ static int zlib_send_setup(MultiFDSendParams *p, Error **errp) goto err_free_zbuff; } p->compress_data = z; + + /* Needs 2 IOVs, one for packet header and one for compressed data */ + p->iov = g_new0(struct iovec, 2); + return 0; err_free_zbuff: @@ -101,6 +105,9 @@ static void zlib_send_cleanup(MultiFDSendParams *p, Error **errp) z->buf = NULL; g_free(p->compress_data); p->compress_data = NULL; + + g_free(p->iov); + p->iov = NULL; } /** diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c index 256858df0a..ca17b7e310 100644 --- a/migration/multifd-zstd.c +++ b/migration/multifd-zstd.c @@ -52,7 +52,6 @@ static int zstd_send_setup(MultiFDSendParams *p, Error **errp) struct zstd_data *z = g_new0(struct zstd_data, 1); int res; - p->compress_data = z; z->zcs = ZSTD_createCStream(); if (!z->zcs) { g_free(z); @@ -77,6 +76,10 @@ static int zstd_send_setup(MultiFDSendParams *p, Error **errp) error_setg(errp, "multifd %u: out of memory for zbuff", p->id); return -1; } + p->compress_data = z; + + /* Needs 2 IOVs, one for packet header and one for compressed data */ + p->iov = g_new0(struct iovec, 2); return 0; } @@ -98,6 +101,9 @@ static void zstd_send_cleanup(MultiFDSendParams *p, Error **errp) z->zbuff = NULL; g_free(p->compress_data); p->compress_data = NULL; + + g_free(p->iov); + p->iov = NULL; } /** diff --git a/migration/multifd.c b/migration/multifd.c index 4394952fbb..0fcecc3759 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -113,6 +113,13 @@ static int nocomp_send_setup(MultiFDSendParams *p, Error **errp) p->write_flags |= QIO_CHANNEL_WRITE_FLAG_ZERO_COPY; } + if (multifd_use_packets()) { + /* We need 
one extra place for the packet header */ + p->iov = g_new0(struct iovec, p->page_count + 1); + } else { + p->iov = g_new0(struct iovec, p->page_count); + } + return 0; } @@ -126,6 +133,8 @@ static int nocomp_send_setup(MultiFDSendParams *p, Error **errp) */ static void nocomp_send_cleanup(MultiFDSendParams *p, Error **errp) { + g_free(p->iov); + p->iov = NULL; return; } @@ -202,6 +211,7 @@ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp) */ static int nocomp_recv_setup(MultiFDRecvParams *p, Error **errp) { + p->iov = g_new0(struct iovec, p->page_count); return 0; } @@ -214,6 +224,8 @@ static int nocomp_recv_setup(MultiFDRecvParams *p, Error **errp) */ static void nocomp_recv_cleanup(MultiFDRecvParams *p) { + g_free(p->iov); + p->iov = NULL; } /** @@ -734,8 +746,6 @@ static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) p->packet_len = 0; g_free(p->packet); p->packet = NULL; - g_free(p->iov); - p->iov = NULL; multifd_send_state->ops->send_cleanup(p, errp); return *errp == NULL; @@ -1120,11 +1130,6 @@ bool multifd_send_setup(void) p->packet = g_malloc0(p->packet_len); p->packet->magic = cpu_to_be32(MULTIFD_MAGIC); p->packet->version = cpu_to_be32(MULTIFD_VERSION); - - /* We need one extra place for the packet header */ - p->iov = g_new0(struct iovec, page_count + 1); - } else { - p->iov = g_new0(struct iovec, page_count); } p->name = g_strdup_printf("multifdsend_%d", i); p->page_size = qemu_target_page_size(); @@ -1224,8 +1229,6 @@ static void multifd_recv_cleanup_channel(MultiFDRecvParams *p) p->packet_len = 0; g_free(p->packet); p->packet = NULL; - g_free(p->iov); - p->iov = NULL; g_free(p->normal); p->normal = NULL; g_free(p->zero); @@ -1403,7 +1406,6 @@ int multifd_recv_setup(Error **errp) p->packet = g_malloc0(p->packet_len); } p->name = g_strdup_printf("multifdrecv_%d", i); - p->iov = g_new0(struct iovec, page_count); p->normal = g_new0(ram_addr_t, page_count); p->zero = g_new0(ram_addr_t, page_count); p->page_count = page_count; -- Gitee From e75b4a4c735e07431d02dd85002f8175cfbd5db3 Mon Sep 17 00:00:00 2001 From: Yuan Liu Date: Mon, 10 Jun 2024 18:21:06 +0800 Subject: [PATCH 879/939] configure: add --enable-qpl build option commit b844a2c7cc7f7c7756a27d372e64f6688d67c4eb upstream. add --enable-qpl and --disable-qpl options to enable and disable the QPL compression method for multifd migration. The Query Processing Library (QPL) is an open-source library that supports data compression and decompression features. It is based on the deflate compression algorithm and use Intel In-Memory Analytics Accelerator(IAA) hardware for compression and decompression acceleration. 
For more live migration with IAA, please refer to the document docs/devel/migration/qpl-compression.rst Signed-off-by: Yuan Liu Reviewed-by: Nanhai Zou Reviewed-by: Fabiano Rosas Signed-off-by: Fabiano Rosas Signed-off-by: Jason Zeng --- meson.build | 8 ++++++++ meson_options.txt | 2 ++ scripts/meson-buildoptions.sh | 3 +++ 3 files changed, 13 insertions(+) diff --git a/meson.build b/meson.build index aea6a33ca3..888af7e099 100644 --- a/meson.build +++ b/meson.build @@ -1043,6 +1043,12 @@ if not get_option('zstd').auto() or have_block required: get_option('zstd'), method: 'pkg-config') endif +qpl = not_found +if not get_option('qpl').auto() or have_system + qpl = dependency('qpl', version: '>=1.5.0', + required: get_option('qpl'), + method: 'pkg-config') +endif virgl = not_found have_vhost_user_gpu = have_tools and targetos == 'linux' and pixman.found() @@ -2281,6 +2287,7 @@ config_host_data.set('CONFIG_MALLOC_TRIM', has_malloc_trim) config_host_data.set('CONFIG_STATX', has_statx) config_host_data.set('CONFIG_STATX_MNT_ID', has_statx_mnt_id) config_host_data.set('CONFIG_ZSTD', zstd.found()) +config_host_data.set('CONFIG_QPL', qpl.found()) config_host_data.set('CONFIG_FUSE', fuse.found()) config_host_data.set('CONFIG_FUSE_LSEEK', fuse_lseek.found()) config_host_data.set('CONFIG_SPICE_PROTOCOL', spice_protocol.found()) @@ -4455,6 +4462,7 @@ summary_info += {'snappy support': snappy} summary_info += {'bzip2 support': libbzip2} summary_info += {'lzfse support': liblzfse} summary_info += {'zstd support': zstd} +summary_info += {'Query Processing Library support': qpl} summary_info += {'NUMA host support': numa} summary_info += {'capstone': capstone} summary_info += {'libpmem support': libpmem} diff --git a/meson_options.txt b/meson_options.txt index cf9706c411..82f73d51ce 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -259,6 +259,8 @@ option('xkbcommon', type : 'feature', value : 'auto', description: 'xkbcommon support') option('zstd', type : 'feature', value : 'auto', description: 'zstd compression support') +option('qpl', type : 'feature', value : 'auto', + description: 'Query Processing Library support') option('fuse', type: 'feature', value: 'auto', description: 'FUSE block device export') option('fuse_lseek', type : 'feature', value : 'auto', diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh index 680fa3f581..784f74fde9 100644 --- a/scripts/meson-buildoptions.sh +++ b/scripts/meson-buildoptions.sh @@ -222,6 +222,7 @@ meson_options_help() { printf "%s\n" ' Xen PCI passthrough support' printf "%s\n" ' xkbcommon xkbcommon support' printf "%s\n" ' zstd zstd compression support' + printf "%s\n" ' qpl Query Processing Library support' } _meson_option_parse() { case $1 in @@ -562,6 +563,8 @@ _meson_option_parse() { --disable-xkbcommon) printf "%s" -Dxkbcommon=disabled ;; --enable-zstd) printf "%s" -Dzstd=enabled ;; --disable-zstd) printf "%s" -Dzstd=disabled ;; + --enable-qpl) printf "%s" -Dqpl=enabled ;; + --disable-qpl) printf "%s" -Dqpl=disabled ;; *) return 1 ;; esac } -- Gitee From 0f0f9c2c5a658a77c1d99e1d1ec166b8259ec307 Mon Sep 17 00:00:00 2001 From: Yuan Liu Date: Mon, 10 Jun 2024 18:21:07 +0800 Subject: [PATCH 880/939] migration/multifd: add qpl compression method commit 354cac2859e48ec5f7ee72a2a071da6c60a462d0 upstream. 
add the Query Processing Library (QPL) compression method Introduce the qpl as a new multifd migration compression method, it can use In-Memory Analytics Accelerator(IAA) to accelerate compression and decompression, which can not only reduce network bandwidth requirement but also reduce host compression and decompression CPU overhead. How to enable qpl compression during migration: migrate_set_parameter multifd-compression qpl There is no qpl compression level parameter added since it only supports level one, users do not need to specify the qpl compression level. Signed-off-by: Yuan Liu Reviewed-by: Nanhai Zou Reviewed-by: Peter Xu Reviewed-by: Fabiano Rosas [fixed docs spacing in migration.json] Signed-off-by: Fabiano Rosas Signed-off-by: Jason Zeng --- hw/core/qdev-properties-system.c | 2 +- migration/meson.build | 1 + migration/multifd-qpl.c | 20 ++++++++++++++++++++ migration/multifd.h | 1 + qapi/migration.json | 8 +++++++- 5 files changed, 30 insertions(+), 2 deletions(-) create mode 100644 migration/multifd-qpl.c diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-system.c index b3b9238b65..6ee9744e00 100644 --- a/hw/core/qdev-properties-system.c +++ b/hw/core/qdev-properties-system.c @@ -711,7 +711,7 @@ const PropertyInfo qdev_prop_fdc_drive_type = { const PropertyInfo qdev_prop_multifd_compression = { .name = "MultiFDCompression", .description = "multifd_compression values, " - "none/zlib/zstd", + "none/zlib/zstd/qpl", .enum_table = &MultiFDCompression_lookup, .get = qdev_propinfo_get_enum, .set = qdev_propinfo_set_enum, diff --git a/migration/meson.build b/migration/meson.build index d619ebf238..6652f68d32 100644 --- a/migration/meson.build +++ b/migration/meson.build @@ -40,6 +40,7 @@ if get_option('live_block_migration').allowed() system_ss.add(files('block.c')) endif system_ss.add(when: zstd, if_true: files('multifd-zstd.c')) +system_ss.add(when: qpl, if_true: files('multifd-qpl.c')) specific_ss.add(when: 'CONFIG_SYSTEM_ONLY', if_true: files('ram.c', diff --git a/migration/multifd-qpl.c b/migration/multifd-qpl.c new file mode 100644 index 0000000000..056a68a060 --- /dev/null +++ b/migration/multifd-qpl.c @@ -0,0 +1,20 @@ +/* + * Multifd qpl compression accelerator implementation + * + * Copyright (c) 2023 Intel Corporation + * + * Authors: + * Yuan Liu + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ +#include "qemu/osdep.h" +#include "qemu/module.h" + +static void multifd_qpl_register(void) +{ + /* noop */ +} + +migration_init(multifd_qpl_register); diff --git a/migration/multifd.h b/migration/multifd.h index d99603c6a4..11f05dd6d5 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -33,6 +33,7 @@ bool multifd_queue_page(RAMBlock *block, ram_addr_t offset); #define MULTIFD_FLAG_NOCOMP (0 << 1) #define MULTIFD_FLAG_ZLIB (1 << 1) #define MULTIFD_FLAG_ZSTD (2 << 1) +#define MULTIFD_FLAG_QPL (4 << 1) /* This value needs to be a multiple of qemu_target_page_size() */ #define MULTIFD_PACKET_SIZE (512 * 1024) diff --git a/qapi/migration.json b/qapi/migration.json index fc3178b1dc..f8f3f6f272 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -625,11 +625,17 @@ # # @zstd: use zstd compression method. # +# @qpl: use qpl compression method. Query Processing Library(qpl) is +# based on the deflate compression algorithm and use the Intel +# In-Memory Analytics Accelerator(IAA) accelerated compression +# and decompression. 
(Since 9.1) +# # Since: 5.0 ## { 'enum': 'MultiFDCompression', 'data': [ 'none', 'zlib', - { 'name': 'zstd', 'if': 'CONFIG_ZSTD' } ] } + { 'name': 'zstd', 'if': 'CONFIG_ZSTD' }, + { 'name': 'qpl', 'if': 'CONFIG_QPL' } ] } ## # @MigMode: -- Gitee From 8b069af63b1dc70ffdcc2662289164b3fd6e29f3 Mon Sep 17 00:00:00 2001 From: Jason Zeng Date: Wed, 2 Apr 2025 18:09:21 +0800 Subject: [PATCH 881/939] migration/multifd: include ram.h in multifd.h Header file ram.h was included by multifd.h when mapped-ram was introduced in upstream code. This inclusion is needed by qpl when multifd-qpl.c includes multifd.h. Add this inclusion here since we don't backport mapped-ram Signed-off-by: Jason Zeng --- migration/multifd.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/migration/multifd.h b/migration/multifd.h index 11f05dd6d5..41965df7a9 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -13,6 +13,8 @@ #ifndef QEMU_MIGRATION_MULTIFD_H #define QEMU_MIGRATION_MULTIFD_H +#include "ram.h" + bool multifd_send_setup(void); void multifd_send_shutdown(void); int multifd_recv_setup(Error **errp); -- Gitee From 41fed938d3474ab517e689feeb8abf5e2876d2df Mon Sep 17 00:00:00 2001 From: Yuan Liu Date: Mon, 10 Jun 2024 18:21:08 +0800 Subject: [PATCH 882/939] migration/multifd: implement initialization of qpl compression commit 34e104b897da6e144a5f34e7c5eebf8a4c4d9d59 upstream. during initialization, a software job is allocated to each channel for software path fallabck when the IAA hardware is unavailable or the hardware job submission fails. If the IAA hardware is available, multiple hardware jobs are allocated for batch processing. Signed-off-by: Yuan Liu Reviewed-by: Nanhai Zou Reviewed-by: Fabiano Rosas Signed-off-by: Fabiano Rosas Signed-off-by: Jason Zeng --- migration/multifd-qpl.c | 328 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 327 insertions(+), 1 deletion(-) diff --git a/migration/multifd-qpl.c b/migration/multifd-qpl.c index 056a68a060..6791a204d5 100644 --- a/migration/multifd-qpl.c +++ b/migration/multifd-qpl.c @@ -9,12 +9,338 @@ * This work is licensed under the terms of the GNU GPL, version 2 or later. * See the COPYING file in the top-level directory. */ + #include "qemu/osdep.h" #include "qemu/module.h" +#include "qapi/error.h" +#include "multifd.h" +#include "qpl/qpl.h" + +typedef struct { + /* the QPL hardware path job */ + qpl_job *job; + /* indicates if fallback to software path is required */ + bool fallback_sw_path; + /* output data from the software path */ + uint8_t *sw_output; + /* output data length from the software path */ + uint32_t sw_output_len; +} QplHwJob; + +typedef struct { + /* array of hardware jobs, the number of jobs equals the number pages */ + QplHwJob *hw_jobs; + /* the QPL software job for the slow path and software fallback */ + qpl_job *sw_job; + /* the number of pages that the QPL needs to process at one time */ + uint32_t page_num; + /* array of compressed page buffers */ + uint8_t *zbuf; + /* array of compressed page lengths */ + uint32_t *zlen; + /* the status of the hardware device */ + bool hw_avail; +} QplData; + +/** + * check_hw_avail: check if IAA hardware is available + * + * If the IAA hardware does not exist or is unavailable, + * the QPL hardware job initialization will fail. + * + * Returns true if IAA hardware is available, otherwise false. 
+ * + * @job_size: indicates the hardware job size if hardware is available + */ +static bool check_hw_avail(uint32_t *job_size) +{ + qpl_path_t path = qpl_path_hardware; + uint32_t size = 0; + qpl_job *job; + + if (qpl_get_job_size(path, &size) != QPL_STS_OK) { + return false; + } + assert(size > 0); + job = g_malloc0(size); + if (qpl_init_job(path, job) != QPL_STS_OK) { + g_free(job); + return false; + } + g_free(job); + *job_size = size; + return true; +} + +/** + * multifd_qpl_free_sw_job: clean up software job + * + * Free the software job resources. + * + * @qpl: pointer to the QplData structure + */ +static void multifd_qpl_free_sw_job(QplData *qpl) +{ + assert(qpl); + if (qpl->sw_job) { + qpl_fini_job(qpl->sw_job); + g_free(qpl->sw_job); + qpl->sw_job = NULL; + } +} + +/** + * multifd_qpl_free_jobs: clean up hardware jobs + * + * Free all hardware job resources. + * + * @qpl: pointer to the QplData structure + */ +static void multifd_qpl_free_hw_job(QplData *qpl) +{ + assert(qpl); + if (qpl->hw_jobs) { + for (int i = 0; i < qpl->page_num; i++) { + qpl_fini_job(qpl->hw_jobs[i].job); + g_free(qpl->hw_jobs[i].job); + qpl->hw_jobs[i].job = NULL; + } + g_free(qpl->hw_jobs); + qpl->hw_jobs = NULL; + } +} + +/** + * multifd_qpl_init_sw_job: initialize a software job + * + * Use the QPL software path to initialize a job + * + * @qpl: pointer to the QplData structure + * @errp: pointer to an error + */ +static int multifd_qpl_init_sw_job(QplData *qpl, Error **errp) +{ + qpl_path_t path = qpl_path_software; + uint32_t size = 0; + qpl_job *job = NULL; + qpl_status status; + + status = qpl_get_job_size(path, &size); + if (status != QPL_STS_OK) { + error_setg(errp, "qpl_get_job_size failed with error %d", status); + return -1; + } + job = g_malloc0(size); + status = qpl_init_job(path, job); + if (status != QPL_STS_OK) { + error_setg(errp, "qpl_init_job failed with error %d", status); + g_free(job); + return -1; + } + qpl->sw_job = job; + return 0; +} + +/** + * multifd_qpl_init_jobs: initialize hardware jobs + * + * Use the QPL hardware path to initialize jobs + * + * @qpl: pointer to the QplData structure + * @size: the size of QPL hardware path job + * @errp: pointer to an error + */ +static void multifd_qpl_init_hw_job(QplData *qpl, uint32_t size, Error **errp) +{ + qpl_path_t path = qpl_path_hardware; + qpl_job *job = NULL; + qpl_status status; + + qpl->hw_jobs = g_new0(QplHwJob, qpl->page_num); + for (int i = 0; i < qpl->page_num; i++) { + job = g_malloc0(size); + status = qpl_init_job(path, job); + /* the job initialization should succeed after check_hw_avail */ + assert(status == QPL_STS_OK); + qpl->hw_jobs[i].job = job; + } +} + +/** + * multifd_qpl_init: initialize QplData structure + * + * Allocate and initialize a QplData structure + * + * Returns a QplData pointer on success or NULL on error + * + * @num: the number of pages + * @size: the page size + * @errp: pointer to an error + */ +static QplData *multifd_qpl_init(uint32_t num, uint32_t size, Error **errp) +{ + uint32_t job_size = 0; + QplData *qpl; + + qpl = g_new0(QplData, 1); + qpl->page_num = num; + if (multifd_qpl_init_sw_job(qpl, errp) != 0) { + g_free(qpl); + return NULL; + } + qpl->hw_avail = check_hw_avail(&job_size); + if (qpl->hw_avail) { + multifd_qpl_init_hw_job(qpl, job_size, errp); + } + qpl->zbuf = g_malloc0(size * num); + qpl->zlen = g_new0(uint32_t, num); + return qpl; +} + +/** + * multifd_qpl_deinit: clean up QplData structure + * + * Free jobs, buffers and the QplData structure + * + * @qpl: pointer to the 
QplData structure + */ +static void multifd_qpl_deinit(QplData *qpl) +{ + if (qpl) { + multifd_qpl_free_sw_job(qpl); + multifd_qpl_free_hw_job(qpl); + g_free(qpl->zbuf); + g_free(qpl->zlen); + g_free(qpl); + } +} + +/** + * multifd_qpl_send_setup: set up send side + * + * Set up the channel with QPL compression. + * + * Returns 0 on success or -1 on error + * + * @p: Params for the channel being used + * @errp: pointer to an error + */ +static int multifd_qpl_send_setup(MultiFDSendParams *p, Error **errp) +{ + QplData *qpl; + + qpl = multifd_qpl_init(p->page_count, p->page_size, errp); + if (!qpl) { + return -1; + } + p->compress_data = qpl; + + /* + * the page will be compressed independently and sent using an IOV. The + * additional two IOVs are used to store packet header and compressed data + * length + */ + p->iov = g_new0(struct iovec, p->page_count + 2); + return 0; +} + +/** + * multifd_qpl_send_cleanup: clean up send side + * + * Close the channel and free memory. + * + * @p: Params for the channel being used + * @errp: pointer to an error + */ +static void multifd_qpl_send_cleanup(MultiFDSendParams *p, Error **errp) +{ + multifd_qpl_deinit(p->compress_data); + p->compress_data = NULL; + g_free(p->iov); + p->iov = NULL; +} + +/** + * multifd_qpl_send_prepare: prepare data to be able to send + * + * Create a compressed buffer with all the pages that we are going to + * send. + * + * Returns 0 on success or -1 on error + * + * @p: Params for the channel being used + * @errp: pointer to an error + */ +static int multifd_qpl_send_prepare(MultiFDSendParams *p, Error **errp) +{ + /* Implement in next patch */ + return -1; +} + +/** + * multifd_qpl_recv_setup: set up receive side + * + * Create the compressed channel and buffer. + * + * Returns 0 on success or -1 on error + * + * @p: Params for the channel being used + * @errp: pointer to an error + */ +static int multifd_qpl_recv_setup(MultiFDRecvParams *p, Error **errp) +{ + QplData *qpl; + + qpl = multifd_qpl_init(p->page_count, p->page_size, errp); + if (!qpl) { + return -1; + } + p->compress_data = qpl; + return 0; +} + +/** + * multifd_qpl_recv_cleanup: set up receive side + * + * Close the channel and free memory. + * + * @p: Params for the channel being used + */ +static void multifd_qpl_recv_cleanup(MultiFDRecvParams *p) +{ + multifd_qpl_deinit(p->compress_data); + p->compress_data = NULL; +} + +/** + * multifd_qpl_recv: read the data from the channel into actual pages + * + * Read the compressed buffer, and uncompress it into the actual + * pages. + * + * Returns 0 on success or -1 on error + * + * @p: Params for the channel being used + * @errp: pointer to an error + */ +static int multifd_qpl_recv(MultiFDRecvParams *p, Error **errp) +{ + /* Implement in next patch */ + return -1; +} + +static MultiFDMethods multifd_qpl_ops = { + .send_setup = multifd_qpl_send_setup, + .send_cleanup = multifd_qpl_send_cleanup, + .send_prepare = multifd_qpl_send_prepare, + .recv_setup = multifd_qpl_recv_setup, + .recv_cleanup = multifd_qpl_recv_cleanup, + .recv = multifd_qpl_recv, +}; static void multifd_qpl_register(void) { - /* noop */ + multifd_register_ops(MULTIFD_COMPRESSION_QPL, &multifd_qpl_ops); } migration_init(multifd_qpl_register); -- Gitee From 9c0666808448c393ffff4b44e3e5bb0f62e48a8f Mon Sep 17 00:00:00 2001 From: Yuan Liu Date: Mon, 10 Jun 2024 18:21:09 +0800 Subject: [PATCH 883/939] migration/multifd: implement qpl compression and decompression commit f6fe9fea995249ecc2cd72975d803fbf4d512c02 upstream. 
QPL compression and decompression will use IAA hardware path if the IAA hardware is available. Otherwise the QPL library software path is used. The hardware path will automatically fall back to QPL software path if the IAA queues are busy. In some scenarios, this may happen frequently, such as configuring 4 channels but only one IAA device is available. In the case of insufficient IAA hardware resources, retry and fallback can help optimize performance: 1. Retry + SW fallback: total time: 14649 ms downtime: 25 ms throughput: 17666.57 mbps pages-per-second: 1509647 2. No fallback, always wait for work queues to become available total time: 18381 ms downtime: 25 ms throughput: 13698.65 mbps pages-per-second: 859607 If both the hardware and software paths fail, the uncompressed page is sent directly. Signed-off-by: Yuan Liu Reviewed-by: Nanhai Zou Reviewed-by: Fabiano Rosas Signed-off-by: Fabiano Rosas Signed-off-by: Jason Zeng --- migration/multifd-qpl.c | 424 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 420 insertions(+), 4 deletions(-) diff --git a/migration/multifd-qpl.c b/migration/multifd-qpl.c index 6791a204d5..9265098ee7 100644 --- a/migration/multifd-qpl.c +++ b/migration/multifd-qpl.c @@ -13,9 +13,14 @@ #include "qemu/osdep.h" #include "qemu/module.h" #include "qapi/error.h" +#include "qapi/qapi-types-migration.h" +#include "exec/ramblock.h" #include "multifd.h" #include "qpl/qpl.h" +/* Maximum number of retries to resubmit a job if IAA work queues are full */ +#define MAX_SUBMIT_RETRY_NUM (3) + typedef struct { /* the QPL hardware path job */ qpl_job *job; @@ -260,6 +265,225 @@ static void multifd_qpl_send_cleanup(MultiFDSendParams *p, Error **errp) p->iov = NULL; } +/** + * multifd_qpl_prepare_job: prepare the job + * + * Set the QPL job parameters and properties. + * + * @job: pointer to the qpl_job structure + * @is_compression: indicates compression and decompression + * @input: pointer to the input data buffer + * @input_len: the length of the input data + * @output: pointer to the output data buffer + * @output_len: the length of the output data + */ +static void multifd_qpl_prepare_job(qpl_job *job, bool is_compression, + uint8_t *input, uint32_t input_len, + uint8_t *output, uint32_t output_len) +{ + job->op = is_compression ? qpl_op_compress : qpl_op_decompress; + job->next_in_ptr = input; + job->next_out_ptr = output; + job->available_in = input_len; + job->available_out = output_len; + job->flags = QPL_FLAG_FIRST | QPL_FLAG_LAST | QPL_FLAG_OMIT_VERIFY; + /* only supports compression level 1 */ + job->level = 1; +} + +/** + * multifd_qpl_prepare_comp_job: prepare the compression job + * + * Set the compression job parameters and properties. + * + * @job: pointer to the qpl_job structure + * @input: pointer to the input data buffer + * @output: pointer to the output data buffer + * @size: the page size + */ +static void multifd_qpl_prepare_comp_job(qpl_job *job, uint8_t *input, + uint8_t *output, uint32_t size) +{ + /* + * Set output length to less than the page size to force the job to + * fail in case it compresses to a larger size. We'll send that page + * without compression and skip the decompression operation on the + * destination. + */ + multifd_qpl_prepare_job(job, true, input, size, output, size - 1); +} + +/** + * multifd_qpl_prepare_decomp_job: prepare the decompression job + * + * Set the decompression job parameters and properties. 
+ * + * @job: pointer to the qpl_job structure + * @input: pointer to the input data buffer + * @len: the length of the input data + * @output: pointer to the output data buffer + * @size: the page size + */ +static void multifd_qpl_prepare_decomp_job(qpl_job *job, uint8_t *input, + uint32_t len, uint8_t *output, + uint32_t size) +{ + multifd_qpl_prepare_job(job, false, input, len, output, size); +} + +/** + * multifd_qpl_fill_iov: fill in the IOV + * + * Fill in the QPL packet IOV + * + * @p: Params for the channel being used + * @data: pointer to the IOV data + * @len: The length of the IOV data + */ +static void multifd_qpl_fill_iov(MultiFDSendParams *p, uint8_t *data, + uint32_t len) +{ + p->iov[p->iovs_num].iov_base = data; + p->iov[p->iovs_num].iov_len = len; + p->iovs_num++; + p->next_packet_size += len; +} + +/** + * multifd_qpl_fill_packet: fill the compressed page into the QPL packet + * + * Fill the compressed page length and IOV into the QPL packet + * + * @idx: The index of the compressed length array + * @p: Params for the channel being used + * @data: pointer to the compressed page buffer + * @len: The length of the compressed page + */ +static void multifd_qpl_fill_packet(uint32_t idx, MultiFDSendParams *p, + uint8_t *data, uint32_t len) +{ + QplData *qpl = p->compress_data; + + qpl->zlen[idx] = cpu_to_be32(len); + multifd_qpl_fill_iov(p, data, len); +} + +/** + * multifd_qpl_submit_job: submit a job to the hardware + * + * Submit a QPL hardware job to the IAA device + * + * Returns true if the job is submitted successfully, otherwise false. + * + * @job: pointer to the qpl_job structure + */ +static bool multifd_qpl_submit_job(qpl_job *job) +{ + qpl_status status; + uint32_t num = 0; + +retry: + status = qpl_submit_job(job); + if (status == QPL_STS_QUEUES_ARE_BUSY_ERR) { + if (num < MAX_SUBMIT_RETRY_NUM) { + num++; + goto retry; + } + } + return (status == QPL_STS_OK); +} + +/** + * multifd_qpl_compress_pages_slow_path: compress pages using slow path + * + * Compress the pages using software. If compression fails, the uncompressed + * page will be sent. + * + * @p: Params for the channel being used + */ +static void multifd_qpl_compress_pages_slow_path(MultiFDSendParams *p) +{ + QplData *qpl = p->compress_data; + uint32_t size = p->page_size; + qpl_job *job = qpl->sw_job; + uint8_t *zbuf = qpl->zbuf; + uint8_t *buf; + + for (int i = 0; i < p->pages->normal_num; i++) { + buf = p->pages->block->host + p->pages->offset[i]; + multifd_qpl_prepare_comp_job(job, buf, zbuf, size); + if (qpl_execute_job(job) == QPL_STS_OK) { + multifd_qpl_fill_packet(i, p, zbuf, job->total_out); + } else { + /* send the uncompressed page */ + multifd_qpl_fill_packet(i, p, buf, size); + } + zbuf += size; + } +} + +/** + * multifd_qpl_compress_pages: compress pages + * + * Submit the pages to the IAA hardware for compression. If hardware + * compression fails, it falls back to software compression. If software + * compression also fails, the uncompressed page is sent. 
+ * + * @p: Params for the channel being used + */ +static void multifd_qpl_compress_pages(MultiFDSendParams *p) +{ + QplData *qpl = p->compress_data; + MultiFDPages_t *pages = p->pages; + uint32_t size = p->page_size; + QplHwJob *hw_job; + uint8_t *buf; + uint8_t *zbuf; + + for (int i = 0; i < pages->normal_num; i++) { + buf = pages->block->host + pages->offset[i]; + zbuf = qpl->zbuf + (size * i); + hw_job = &qpl->hw_jobs[i]; + multifd_qpl_prepare_comp_job(hw_job->job, buf, zbuf, size); + if (multifd_qpl_submit_job(hw_job->job)) { + hw_job->fallback_sw_path = false; + } else { + /* + * The IAA work queue is full, any immediate subsequent job + * submission is likely to fail, sending the page via the QPL + * software path at this point gives us a better chance of + * finding the queue open for the next pages. + */ + hw_job->fallback_sw_path = true; + multifd_qpl_prepare_comp_job(qpl->sw_job, buf, zbuf, size); + if (qpl_execute_job(qpl->sw_job) == QPL_STS_OK) { + hw_job->sw_output = zbuf; + hw_job->sw_output_len = qpl->sw_job->total_out; + } else { + hw_job->sw_output = buf; + hw_job->sw_output_len = size; + } + } + } + + for (int i = 0; i < pages->normal_num; i++) { + buf = pages->block->host + pages->offset[i]; + zbuf = qpl->zbuf + (size * i); + hw_job = &qpl->hw_jobs[i]; + if (hw_job->fallback_sw_path) { + multifd_qpl_fill_packet(i, p, hw_job->sw_output, + hw_job->sw_output_len); + continue; + } + if (qpl_wait_job(hw_job->job) == QPL_STS_OK) { + multifd_qpl_fill_packet(i, p, zbuf, hw_job->job->total_out); + } else { + /* send the uncompressed page */ + multifd_qpl_fill_packet(i, p, buf, size); + } + } +} + /** * multifd_qpl_send_prepare: prepare data to be able to send * @@ -273,8 +497,26 @@ static void multifd_qpl_send_cleanup(MultiFDSendParams *p, Error **errp) */ static int multifd_qpl_send_prepare(MultiFDSendParams *p, Error **errp) { - /* Implement in next patch */ - return -1; + QplData *qpl = p->compress_data; + uint32_t len = 0; + + if (!multifd_send_prepare_common(p)) { + goto out; + } + + /* The first IOV is used to store the compressed page lengths */ + len = p->pages->normal_num * sizeof(uint32_t); + multifd_qpl_fill_iov(p, (uint8_t *) qpl->zlen, len); + if (qpl->hw_avail) { + multifd_qpl_compress_pages(p); + } else { + multifd_qpl_compress_pages_slow_path(p); + } + +out: + p->flags |= MULTIFD_FLAG_QPL; + multifd_send_fill_packet(p); + return 0; } /** @@ -312,6 +554,140 @@ static void multifd_qpl_recv_cleanup(MultiFDRecvParams *p) p->compress_data = NULL; } +/** + * multifd_qpl_process_and_check_job: process and check a QPL job + * + * Process the job and check whether the job output length is the + * same as the specified length + * + * Returns true if the job execution succeeded and the output length + * is equal to the specified length, otherwise false. + * + * @job: pointer to the qpl_job structure + * @is_hardware: indicates whether the job is a hardware job + * @len: Specified output length + * @errp: pointer to an error + */ +static bool multifd_qpl_process_and_check_job(qpl_job *job, bool is_hardware, + uint32_t len, Error **errp) +{ + qpl_status status; + + status = (is_hardware ? 
qpl_wait_job(job) : qpl_execute_job(job)); + if (status != QPL_STS_OK) { + error_setg(errp, "qpl job failed with error %d", status); + return false; + } + if (job->total_out != len) { + error_setg(errp, "qpl decompressed len %u, expected len %u", + job->total_out, len); + return false; + } + return true; +} + +/** + * multifd_qpl_decompress_pages_slow_path: decompress pages using slow path + * + * Decompress the pages using software + * + * Returns 0 on success or -1 on error + * + * @p: Params for the channel being used + * @errp: pointer to an error + */ +static int multifd_qpl_decompress_pages_slow_path(MultiFDRecvParams *p, + Error **errp) +{ + QplData *qpl = p->compress_data; + uint32_t size = p->page_size; + qpl_job *job = qpl->sw_job; + uint8_t *zbuf = qpl->zbuf; + uint8_t *addr; + uint32_t len; + + for (int i = 0; i < p->normal_num; i++) { + len = qpl->zlen[i]; + addr = p->host + p->normal[i]; + /* the page is uncompressed, load it */ + if (len == size) { + memcpy(addr, zbuf, size); + zbuf += size; + continue; + } + multifd_qpl_prepare_decomp_job(job, zbuf, len, addr, size); + if (!multifd_qpl_process_and_check_job(job, false, size, errp)) { + return -1; + } + zbuf += len; + } + return 0; +} + +/** + * multifd_qpl_decompress_pages: decompress pages + * + * Decompress the pages using the IAA hardware. If hardware + * decompression fails, it falls back to software decompression. + * + * Returns 0 on success or -1 on error + * + * @p: Params for the channel being used + * @errp: pointer to an error + */ +static int multifd_qpl_decompress_pages(MultiFDRecvParams *p, Error **errp) +{ + QplData *qpl = p->compress_data; + uint32_t size = p->page_size; + uint8_t *zbuf = qpl->zbuf; + uint8_t *addr; + uint32_t len; + qpl_job *job; + + for (int i = 0; i < p->normal_num; i++) { + addr = p->host + p->normal[i]; + len = qpl->zlen[i]; + /* the page is uncompressed if received length equals the page size */ + if (len == size) { + memcpy(addr, zbuf, size); + zbuf += size; + continue; + } + + job = qpl->hw_jobs[i].job; + multifd_qpl_prepare_decomp_job(job, zbuf, len, addr, size); + if (multifd_qpl_submit_job(job)) { + qpl->hw_jobs[i].fallback_sw_path = false; + } else { + /* + * The IAA work queue is full, any immediate subsequent job + * submission is likely to fail, sending the page via the QPL + * software path at this point gives us a better chance of + * finding the queue open for the next pages. 
+ */ + qpl->hw_jobs[i].fallback_sw_path = true; + job = qpl->sw_job; + multifd_qpl_prepare_decomp_job(job, zbuf, len, addr, size); + if (!multifd_qpl_process_and_check_job(job, false, size, errp)) { + return -1; + } + } + zbuf += len; + } + + for (int i = 0; i < p->normal_num; i++) { + /* ignore pages that have already been processed */ + if (qpl->zlen[i] == size || qpl->hw_jobs[i].fallback_sw_path) { + continue; + } + + job = qpl->hw_jobs[i].job; + if (!multifd_qpl_process_and_check_job(job, true, size, errp)) { + return -1; + } + } + return 0; +} /** * multifd_qpl_recv: read the data from the channel into actual pages * @@ -325,8 +701,48 @@ static void multifd_qpl_recv_cleanup(MultiFDRecvParams *p) */ static int multifd_qpl_recv(MultiFDRecvParams *p, Error **errp) { - /* Implement in next patch */ - return -1; + QplData *qpl = p->compress_data; + uint32_t in_size = p->next_packet_size; + uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; + uint32_t len = 0; + uint32_t zbuf_len = 0; + int ret; + + if (flags != MULTIFD_FLAG_QPL) { + error_setg(errp, "multifd %u: flags received %x flags expected %x", + p->id, flags, MULTIFD_FLAG_QPL); + return -1; + } + multifd_recv_zero_page_process(p); + if (!p->normal_num) { + assert(in_size == 0); + return 0; + } + + /* read compressed page lengths */ + len = p->normal_num * sizeof(uint32_t); + assert(len < in_size); + ret = qio_channel_read_all(p->c, (void *) qpl->zlen, len, errp); + if (ret != 0) { + return ret; + } + for (int i = 0; i < p->normal_num; i++) { + qpl->zlen[i] = be32_to_cpu(qpl->zlen[i]); + assert(qpl->zlen[i] <= p->page_size); + zbuf_len += qpl->zlen[i]; + } + + /* read compressed pages */ + assert(in_size == len + zbuf_len); + ret = qio_channel_read_all(p->c, (void *) qpl->zbuf, zbuf_len, errp); + if (ret != 0) { + return ret; + } + + if (qpl->hw_avail) { + return multifd_qpl_decompress_pages(p, errp); + } + return multifd_qpl_decompress_pages_slow_path(p, errp); } static MultiFDMethods multifd_qpl_ops = { -- Gitee From 3b4704d5856f383244b0c2a1e6c180cdcc672eb0 Mon Sep 17 00:00:00 2001 From: Yuan Liu Date: Mon, 10 Jun 2024 18:21:10 +0800 Subject: [PATCH 884/939] tests/migration-test: add qpl compression test commit 08b82d207d138173ddd334c91b387213508a6e13 upstream. add qpl to compression method test for multifd migration the qpl compression supports software path and hardware path(IAA device), and the hardware path is used first by default. If the hardware path is unavailable, it will automatically fallback to the software path for testing. 
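On an x86_64 build, the new case can be exercised on its own with something
like the following (the qtest environment variable and the gtest path
selection are the usual qtest conventions, assumed here rather than introduced
by this patch):

    QTEST_QEMU_BINARY=./qemu-system-x86_64 \
        ./tests/qtest/migration-test -p /x86_64/migration/multifd/tcp/plain/qpl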
Signed-off-by: Yuan Liu Reviewed-by: Nanhai Zou Reviewed-by: Peter Xu Reviewed-by: Fabiano Rosas Signed-off-by: Fabiano Rosas Conflicts: tests/qtest/migration-test.c [jz: resolve simple context conflict] Signed-off-by: Jason Zeng --- tests/qtest/migration-test.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c index 0ac5e7ddc9..16cb7993b3 100644 --- a/tests/qtest/migration-test.c +++ b/tests/qtest/migration-test.c @@ -2582,6 +2582,15 @@ test_migrate_precopy_tcp_multifd_zstd_start(QTestState *from, } #endif /* CONFIG_ZSTD */ +#ifdef CONFIG_QPL +static void * +test_migrate_precopy_tcp_multifd_qpl_start(QTestState *from, + QTestState *to) +{ + return test_migrate_precopy_tcp_multifd_start_common(from, to, "qpl"); +} +#endif /* CONFIG_QPL */ + static void test_multifd_tcp_none(void) { MigrateCommon args = { @@ -2617,6 +2626,17 @@ static void test_multifd_tcp_zstd(void) } #endif +#ifdef CONFIG_QPL +static void test_multifd_tcp_qpl(void) +{ + MigrateCommon args = { + .listen_uri = "defer", + .start_hook = test_migrate_precopy_tcp_multifd_qpl_start, + }; + test_precopy_common(&args); +} +#endif + #ifdef CONFIG_GNUTLS static void * test_migrate_multifd_tcp_tls_psk_start_match(QTestState *from, @@ -3492,6 +3512,10 @@ int main(int argc, char **argv) migration_test_add("/migration/multifd/tcp/plain/zstd", test_multifd_tcp_zstd); #endif +#ifdef CONFIG_QPL + migration_test_add("/migration/multifd/tcp/plain/qpl", + test_multifd_tcp_qpl); +#endif #ifdef CONFIG_GNUTLS migration_test_add("/migration/multifd/tcp/tls/psk/match", test_multifd_tcp_tls_psk_match); -- Gitee From 2d8e0ef9947bdb82ce70acd7d0605795bf775153 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Fri, 7 Jun 2024 14:53:04 +0100 Subject: [PATCH 885/939] docs/migration: add uadk compression feature commit 3ae9bd97829213808298ae6d35ea26f8def15dc1 upstream. Document UADK(User Space Accelerator Development Kit) library details and how to use that for migration. Signed-off-by: Shameer Kolothum Reviewed-by: Zhangfei Gao [s/Qemu/QEMU in docs] Signed-off-by: Fabiano Rosas Signed-off-by: Jason Zeng --- docs/devel/migration/features.rst | 1 + docs/devel/migration/uadk-compression.rst | 144 ++++++++++++++++++++++ 2 files changed, 145 insertions(+) create mode 100644 docs/devel/migration/uadk-compression.rst diff --git a/docs/devel/migration/features.rst b/docs/devel/migration/features.rst index 9819393c12..0c9cb3dd6c 100644 --- a/docs/devel/migration/features.rst +++ b/docs/devel/migration/features.rst @@ -11,3 +11,4 @@ Migration has plenty of features to support different use cases. vfio virtio qpl-compression + uadk-compression diff --git a/docs/devel/migration/uadk-compression.rst b/docs/devel/migration/uadk-compression.rst new file mode 100644 index 0000000000..3f73345dd5 --- /dev/null +++ b/docs/devel/migration/uadk-compression.rst @@ -0,0 +1,144 @@ +========================================================= +User Space Accelerator Development Kit (UADK) Compression +========================================================= +UADK is a general-purpose user space accelerator framework that uses shared +virtual addressing (SVA) to provide a unified programming interface for +hardware acceleration of cryptographic and compression algorithms. + +UADK includes Unified/User-space-access-intended Accelerator Framework (UACCE), +which enables hardware accelerators from different vendors that support SVA to +adapt to UADK. 
+ +Currently, HiSilicon Kunpeng hardware accelerators have been registered with +UACCE. Through the UADK framework, users can run cryptographic and compression +algorithms using hardware accelerators instead of CPUs, freeing up CPU +computing power and improving computing performance. + +https://github.com/Linaro/uadk/tree/master/docs + +UADK Framework +============== +UADK consists of UACCE, vendors' drivers, and an algorithm layer. UADK requires +the hardware accelerator to support SVA, and the operating system to support +IOMMU and SVA. Hardware accelerators from different vendors are registered as +different character devices with UACCE by using kernel-mode drivers of the +vendors. A user can access the hardware accelerators by performing user-mode +operations on the character devices. + +:: + + +----------------------------------+ + | apps | + +----+------------------------+----+ + | | + | | + +-------+--------+ +-------+-------+ + | scheduler | | alg libraries | + +-------+--------+ +-------+-------+ + | | + | | + | | + | +--------+------+ + | | vendor drivers| + | +-+-------------+ + | | + | | + +--+------------------+--+ + | libwd | + User +----+-------------+-----+ + -------------------------------------------------- + Kernel +--+-----+ +------+ + | uacce | | smmu | + +---+----+ +------+ + | + +---+------------------+ + | vendor kernel driver | + +----------------------+ + -------------------------------------------------- + +----------------------+ + | HW Accelerators | + +----------------------+ + +UADK Installation +----------------- +Build UADK +^^^^^^^^^^ + +.. code-block:: shell + + git clone https://github.com/Linaro/uadk.git + cd uadk + mkdir build + ./autogen.sh + ./configure --prefix=$PWD/build + make + make install + +Without --prefix, UADK will be installed to /usr/local/lib by default. +If get error:"cannot find -lnuma", please install the libnuma-dev + +Run pkg-config libwd to ensure env is setup correctly +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +* export PKG_CONFIG_PATH=$PWD/build/lib/pkgconfig +* pkg-config libwd --cflags --libs + -I/usr/local/include -L/usr/local/lib -lwd + +* export PKG_CONFIG_PATH is required on demand. + Not required if UADK is installed to /usr/local/lib + +UADK Host Kernel Requirements +----------------------------- +User needs to make sure that ``UACCE`` is already supported in Linux kernel. +The kernel version should be at least v5.9 with SVA (Shared Virtual +Addressing) enabled. + +Kernel Configuration +^^^^^^^^^^^^^^^^^^^^ + +``UACCE`` could be built as module or built-in. + +Here's an example to enable UACCE with hardware accelerator in HiSilicon +Kunpeng platform. + +* CONFIG_IOMMU_SVA_LIB=y +* CONFIG_ARM_SMMU=y +* CONFIG_ARM_SMMU_V3=y +* CONFIG_ARM_SMMU_V3_SVA=y +* CONFIG_PCI_PASID=y +* CONFIG_UACCE=y +* CONFIG_CRYPTO_DEV_HISI_QM=y +* CONFIG_CRYPTO_DEV_HISI_ZIP=y + +Make sure all these above kernel configurations are selected. + +Accelerator dev node permissions +-------------------------------- +Harware accelerators(eg: HiSilicon Kunpeng Zip accelerator) gets registered to +UADK and char devices are created in dev directory. In order to access resources +on hardware accelerator devices, write permission should be provided to user. + +.. code-block:: shell + + $ sudo chmod 777 /dev/hisi_zip-* + +How To Use UADK Compression In QEMU Migration +--------------------------------------------- +* Make sure UADK is installed as above +* Build ``QEMU`` with ``--enable-uadk`` parameter + + E.g. 
configure --target-list=aarch64-softmmu --enable-kvm ``--enable-uadk`` + +* Enable ``UADK`` compression during migration + + Set ``migrate_set_parameter multifd-compression uadk`` + +Since UADK uses Shared Virtual Addressing(SVA) and device access virtual memory +directly it is possible that SMMUv3 may enounter page faults while walking the +IO page tables. This may impact the performance. In order to mitigate this, +please make sure to specify ``-mem-prealloc`` parameter to the destination VM +boot parameters. + +Though both UADK and ZLIB are based on the deflate compression algorithm, UADK +is not fully compatible with ZLIB. Hence, please make sure to use ``uadk`` on +both source and destination during migration. -- Gitee From 49db5292ea971c00a7e29eb6d20be24012c553bf Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Fri, 7 Jun 2024 14:53:05 +0100 Subject: [PATCH 886/939] configure: Add uadk option commit cfc589a89b31930d9d658f4b0b6c4e6f33280e10 upstream. Add --enable-uadk and --disable-uadk options to enable and disable UADK compression accelerator. This is for using UADK based hardware accelerators for live migration. Reviewed-by: Fabiano Rosas Signed-off-by: Shameer Kolothum Reviewed-by: Zhangfei Gao Signed-off-by: Fabiano Rosas Signed-off-by: Jason Zeng --- meson.build | 14 ++++++++++++++ meson_options.txt | 2 ++ scripts/meson-buildoptions.sh | 3 +++ 3 files changed, 19 insertions(+) diff --git a/meson.build b/meson.build index 888af7e099..e3599b9a09 100644 --- a/meson.build +++ b/meson.build @@ -1049,6 +1049,18 @@ if not get_option('qpl').auto() or have_system required: get_option('qpl'), method: 'pkg-config') endif +uadk = not_found +if not get_option('uadk').auto() or have_system + libwd = dependency('libwd', version: '>=2.6', + required: get_option('uadk'), + method: 'pkg-config') + libwd_comp = dependency('libwd_comp', version: '>=2.6', + required: get_option('uadk'), + method: 'pkg-config') + if libwd.found() and libwd_comp.found() + uadk = declare_dependency(dependencies: [libwd, libwd_comp]) + endif +endif virgl = not_found have_vhost_user_gpu = have_tools and targetos == 'linux' and pixman.found() @@ -2288,6 +2300,7 @@ config_host_data.set('CONFIG_STATX', has_statx) config_host_data.set('CONFIG_STATX_MNT_ID', has_statx_mnt_id) config_host_data.set('CONFIG_ZSTD', zstd.found()) config_host_data.set('CONFIG_QPL', qpl.found()) +config_host_data.set('CONFIG_UADK', uadk.found()) config_host_data.set('CONFIG_FUSE', fuse.found()) config_host_data.set('CONFIG_FUSE_LSEEK', fuse_lseek.found()) config_host_data.set('CONFIG_SPICE_PROTOCOL', spice_protocol.found()) @@ -4463,6 +4476,7 @@ summary_info += {'bzip2 support': libbzip2} summary_info += {'lzfse support': liblzfse} summary_info += {'zstd support': zstd} summary_info += {'Query Processing Library support': qpl} +summary_info += {'UADK Library support': uadk} summary_info += {'NUMA host support': numa} summary_info += {'capstone': capstone} summary_info += {'libpmem support': libpmem} diff --git a/meson_options.txt b/meson_options.txt index 82f73d51ce..709678fa18 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -261,6 +261,8 @@ option('zstd', type : 'feature', value : 'auto', description: 'zstd compression support') option('qpl', type : 'feature', value : 'auto', description: 'Query Processing Library support') +option('uadk', type : 'feature', value : 'auto', + description: 'UADK Library support') option('fuse', type: 'feature', value: 'auto', description: 'FUSE block device export') option('fuse_lseek', type : 
'feature', value : 'auto', diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh index 784f74fde9..833b996818 100644 --- a/scripts/meson-buildoptions.sh +++ b/scripts/meson-buildoptions.sh @@ -223,6 +223,7 @@ meson_options_help() { printf "%s\n" ' xkbcommon xkbcommon support' printf "%s\n" ' zstd zstd compression support' printf "%s\n" ' qpl Query Processing Library support' + printf "%s\n" ' uadk UADK Library support' } _meson_option_parse() { case $1 in @@ -565,6 +566,8 @@ _meson_option_parse() { --disable-zstd) printf "%s" -Dzstd=disabled ;; --enable-qpl) printf "%s" -Dqpl=enabled ;; --disable-qpl) printf "%s" -Dqpl=disabled ;; + --enable-uadk) printf "%s" -Duadk=enabled ;; + --disable-uadk) printf "%s" -Duadk=disabled ;; *) return 1 ;; esac } -- Gitee From cf49f952f849aecd772144cee5285b746bfae228 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Fri, 7 Jun 2024 14:53:06 +0100 Subject: [PATCH 887/939] migration/multifd: add uadk compression framework commit f3d8bb759d13a2e33389f00fa338d0761309029a upstream. Adds the skeleton to support uadk compression method. Complete functionality will be added in subsequent patches. Acked-by: Markus Armbruster Reviewed-by: Fabiano Rosas Signed-off-by: Shameer Kolothum Reviewed-by: Zhangfei Gao Signed-off-by: Fabiano Rosas Signed-off-by: Jason Zeng --- hw/core/qdev-properties-system.c | 2 +- migration/meson.build | 1 + migration/multifd-uadk.c | 20 ++++++++++++++++++++ migration/multifd.h | 5 +++-- qapi/migration.json | 5 ++++- 5 files changed, 29 insertions(+), 4 deletions(-) create mode 100644 migration/multifd-uadk.c diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-system.c index 6ee9744e00..650c42eaf8 100644 --- a/hw/core/qdev-properties-system.c +++ b/hw/core/qdev-properties-system.c @@ -711,7 +711,7 @@ const PropertyInfo qdev_prop_fdc_drive_type = { const PropertyInfo qdev_prop_multifd_compression = { .name = "MultiFDCompression", .description = "multifd_compression values, " - "none/zlib/zstd/qpl", + "none/zlib/zstd/qpl/uadk", .enum_table = &MultiFDCompression_lookup, .get = qdev_propinfo_get_enum, .set = qdev_propinfo_set_enum, diff --git a/migration/meson.build b/migration/meson.build index 6652f68d32..264d04657f 100644 --- a/migration/meson.build +++ b/migration/meson.build @@ -41,6 +41,7 @@ if get_option('live_block_migration').allowed() endif system_ss.add(when: zstd, if_true: files('multifd-zstd.c')) system_ss.add(when: qpl, if_true: files('multifd-qpl.c')) +system_ss.add(when: uadk, if_true: files('multifd-uadk.c')) specific_ss.add(when: 'CONFIG_SYSTEM_ONLY', if_true: files('ram.c', diff --git a/migration/multifd-uadk.c b/migration/multifd-uadk.c new file mode 100644 index 0000000000..c2bb07535b --- /dev/null +++ b/migration/multifd-uadk.c @@ -0,0 +1,20 @@ +/* + * Multifd UADK compression accelerator implementation + * + * Copyright (c) 2024 Huawei Technologies R & D (UK) Ltd + * + * Authors: + * Shameer Kolothum + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ */ + +#include "qemu/osdep.h" +#include "qemu/module.h" + +static void multifd_uadk_register(void) +{ + /* noop for now */ +} +migration_init(multifd_uadk_register); diff --git a/migration/multifd.h b/migration/multifd.h index 41965df7a9..ace4ba050d 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -29,13 +29,14 @@ bool multifd_queue_page(RAMBlock *block, ram_addr_t offset); /* Multifd Compression flags */ #define MULTIFD_FLAG_SYNC (1 << 0) -/* We reserve 3 bits for compression methods */ -#define MULTIFD_FLAG_COMPRESSION_MASK (7 << 1) +/* We reserve 4 bits for compression methods */ +#define MULTIFD_FLAG_COMPRESSION_MASK (0xf << 1) /* we need to be compatible. Before compression value was 0 */ #define MULTIFD_FLAG_NOCOMP (0 << 1) #define MULTIFD_FLAG_ZLIB (1 << 1) #define MULTIFD_FLAG_ZSTD (2 << 1) #define MULTIFD_FLAG_QPL (4 << 1) +#define MULTIFD_FLAG_UADK (8 << 1) /* This value needs to be a multiple of qemu_target_page_size() */ #define MULTIFD_PACKET_SIZE (512 * 1024) diff --git a/qapi/migration.json b/qapi/migration.json index f8f3f6f272..f1a17c511b 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -630,12 +630,15 @@ # In-Memory Analytics Accelerator(IAA) accelerated compression # and decompression. (Since 9.1) # +# @uadk: use UADK library compression method. (Since 9.1) +# # Since: 5.0 ## { 'enum': 'MultiFDCompression', 'data': [ 'none', 'zlib', { 'name': 'zstd', 'if': 'CONFIG_ZSTD' }, - { 'name': 'qpl', 'if': 'CONFIG_QPL' } ] } + { 'name': 'qpl', 'if': 'CONFIG_QPL' }, + { 'name': 'uadk', 'if': 'CONFIG_UADK' } ] } ## # @MigMode: -- Gitee From f6ef2126594a919c5f921dfedf79631167efbc40 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Fri, 7 Jun 2024 14:53:07 +0100 Subject: [PATCH 888/939] migration/multifd: Add UADK initialization commit 819dd20636d51d5dc9d42aa28edb3dd9c1b8b863 upstream. Initialize UADK session and allocate buffers required. The actual compression/decompression will only be done in a subsequent patch. 
Signed-off-by: Shameer Kolothum Reviewed-by: Fabiano Rosas Reviewed-by: Zhangfei Gao Signed-off-by: Fabiano Rosas Signed-off-by: Jason Zeng --- migration/multifd-uadk.c | 209 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 208 insertions(+), 1 deletion(-) diff --git a/migration/multifd-uadk.c b/migration/multifd-uadk.c index c2bb07535b..535411a405 100644 --- a/migration/multifd-uadk.c +++ b/migration/multifd-uadk.c @@ -12,9 +12,216 @@ #include "qemu/osdep.h" #include "qemu/module.h" +#include "qapi/error.h" +#include "migration.h" +#include "multifd.h" +#include "options.h" +#include "uadk/wd_comp.h" +#include "uadk/wd_sched.h" + +struct wd_data { + handle_t handle; + uint8_t *buf; + uint32_t *buf_hdr; +}; + +static bool uadk_hw_init(void) +{ + char alg[] = "zlib"; + int ret; + + ret = wd_comp_init2(alg, SCHED_POLICY_RR, TASK_HW); + if (ret && ret != -WD_EEXIST) { + return false; + } else { + return true; + } +} + +static struct wd_data *multifd_uadk_init_sess(uint32_t count, + uint32_t page_size, + bool compress, Error **errp) +{ + struct wd_comp_sess_setup ss = {0}; + struct sched_params param = {0}; + uint32_t size = count * page_size; + struct wd_data *wd; + + if (!uadk_hw_init()) { + error_setg(errp, "multifd: UADK hardware not available"); + return NULL; + } + + wd = g_new0(struct wd_data, 1); + ss.alg_type = WD_ZLIB; + if (compress) { + ss.op_type = WD_DIR_COMPRESS; + /* Add an additional page for handling output > input */ + size += page_size; + } else { + ss.op_type = WD_DIR_DECOMPRESS; + } + + /* We use default level 1 compression and 4K window size */ + param.type = ss.op_type; + ss.sched_param = ¶m; + + wd->handle = wd_comp_alloc_sess(&ss); + if (!wd->handle) { + error_setg(errp, "multifd: failed wd_comp_alloc_sess"); + goto out; + } + + wd->buf = g_try_malloc(size); + if (!wd->buf) { + error_setg(errp, "multifd: out of mem for uadk buf"); + goto out_free_sess; + } + wd->buf_hdr = g_new0(uint32_t, count); + return wd; + +out_free_sess: + wd_comp_free_sess(wd->handle); +out: + wd_comp_uninit2(); + g_free(wd); + return NULL; +} + +static void multifd_uadk_uninit_sess(struct wd_data *wd) +{ + wd_comp_free_sess(wd->handle); + wd_comp_uninit2(); + g_free(wd->buf); + g_free(wd->buf_hdr); + g_free(wd); +} + +/** + * multifd_uadk_send_setup: setup send side + * + * Returns 0 for success or -1 for error + * + * @p: Params for the channel that we are using + * @errp: pointer to an error + */ +static int multifd_uadk_send_setup(MultiFDSendParams *p, Error **errp) +{ + struct wd_data *wd; + + wd = multifd_uadk_init_sess(p->page_count, p->page_size, true, errp); + if (!wd) { + return -1; + } + + p->compress_data = wd; + assert(p->iov == NULL); + /* + * Each page will be compressed independently and sent using an IOV. The + * additional two IOVs are used to store packet header and compressed data + * length + */ + + p->iov = g_new0(struct iovec, p->page_count + 2); + return 0; +} + +/** + * multifd_uadk_send_cleanup: cleanup send side + * + * Close the channel and return memory. + * + * @p: Params for the channel that we are using + * @errp: pointer to an error + */ +static void multifd_uadk_send_cleanup(MultiFDSendParams *p, Error **errp) +{ + struct wd_data *wd = p->compress_data; + + multifd_uadk_uninit_sess(wd); + p->compress_data = NULL; +} + +/** + * multifd_uadk_send_prepare: prepare data to be able to send + * + * Create a compressed buffer with all the pages that we are going to + * send. 
+ * + * Returns 0 for success or -1 for error + * + * @p: Params for the channel that we are using + * @errp: pointer to an error + */ +static int multifd_uadk_send_prepare(MultiFDSendParams *p, Error **errp) +{ + return -1; +} + +/** + * multifd_uadk_recv_setup: setup receive side + * + * Create the compressed channel and buffer. + * + * Returns 0 for success or -1 for error + * + * @p: Params for the channel that we are using + * @errp: pointer to an error + */ +static int multifd_uadk_recv_setup(MultiFDRecvParams *p, Error **errp) +{ + struct wd_data *wd; + + wd = multifd_uadk_init_sess(p->page_count, p->page_size, false, errp); + if (!wd) { + return -1; + } + p->compress_data = wd; + return 0; +} + +/** + * multifd_uadk_recv_cleanup: cleanup receive side + * + * Close the channel and return memory. + * + * @p: Params for the channel that we are using + */ +static void multifd_uadk_recv_cleanup(MultiFDRecvParams *p) +{ + struct wd_data *wd = p->compress_data; + + multifd_uadk_uninit_sess(wd); + p->compress_data = NULL; +} + +/** + * multifd_uadk_recv: read the data from the channel into actual pages + * + * Read the compressed buffer, and uncompress it into the actual + * pages. + * + * Returns 0 for success or -1 for error + * + * @p: Params for the channel that we are using + * @errp: pointer to an error + */ +static int multifd_uadk_recv(MultiFDRecvParams *p, Error **errp) +{ + return -1; +} + +static MultiFDMethods multifd_uadk_ops = { + .send_setup = multifd_uadk_send_setup, + .send_cleanup = multifd_uadk_send_cleanup, + .send_prepare = multifd_uadk_send_prepare, + .recv_setup = multifd_uadk_recv_setup, + .recv_cleanup = multifd_uadk_recv_cleanup, + .recv = multifd_uadk_recv, +}; static void multifd_uadk_register(void) { - /* noop for now */ + multifd_register_ops(MULTIFD_COMPRESSION_UADK, &multifd_uadk_ops); } migration_init(multifd_uadk_register); -- Gitee From 7b83023e2ecc2debc243cd34032cbf143538f26c Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Fri, 7 Jun 2024 14:53:08 +0100 Subject: [PATCH 889/939] migration/multifd: Add UADK based compression and decompression commit 3c49191a0d011d941b347fda8fdadd88c988e753 upstream. Uses UADK wd_do_comp_sync() API to (de)compress a normal page using hardware accelerator. 
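With this in place, uadk is selected like the other multifd compression
methods, on both source and destination (assuming QEMU was configured with
--enable-uadk, as the uadk-compression document earlier in this series
describes):

    migrate_set_parameter multifd-compression uadk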
Reviewed-by: Fabiano Rosas Signed-off-by: Shameer Kolothum Reviewed-by: Zhangfei Gao Signed-off-by: Fabiano Rosas Signed-off-by: Jason Zeng --- migration/multifd-uadk.c | 132 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 130 insertions(+), 2 deletions(-) diff --git a/migration/multifd-uadk.c b/migration/multifd-uadk.c index 535411a405..70bba92eaa 100644 --- a/migration/multifd-uadk.c +++ b/migration/multifd-uadk.c @@ -13,6 +13,7 @@ #include "qemu/osdep.h" #include "qemu/module.h" #include "qapi/error.h" +#include "exec/ramblock.h" #include "migration.h" #include "multifd.h" #include "options.h" @@ -142,6 +143,15 @@ static void multifd_uadk_send_cleanup(MultiFDSendParams *p, Error **errp) p->compress_data = NULL; } +static inline void prepare_next_iov(MultiFDSendParams *p, void *base, + uint32_t len) +{ + p->iov[p->iovs_num].iov_base = (uint8_t *)base; + p->iov[p->iovs_num].iov_len = len; + p->next_packet_size += len; + p->iovs_num++; +} + /** * multifd_uadk_send_prepare: prepare data to be able to send * @@ -155,7 +165,56 @@ static void multifd_uadk_send_cleanup(MultiFDSendParams *p, Error **errp) */ static int multifd_uadk_send_prepare(MultiFDSendParams *p, Error **errp) { - return -1; + struct wd_data *uadk_data = p->compress_data; + uint32_t hdr_size; + uint8_t *buf = uadk_data->buf; + int ret = 0; + + if (!multifd_send_prepare_common(p)) { + goto out; + } + + hdr_size = p->pages->normal_num * sizeof(uint32_t); + /* prepare the header that stores the lengths of all compressed data */ + prepare_next_iov(p, uadk_data->buf_hdr, hdr_size); + + for (int i = 0; i < p->pages->normal_num; i++) { + struct wd_comp_req creq = { + .op_type = WD_DIR_COMPRESS, + .src = p->pages->block->host + p->pages->offset[i], + .src_len = p->page_size, + .dst = buf, + /* Set dst_len to double the src in case compressed out >= page_size */ + .dst_len = p->page_size * 2, + }; + + ret = wd_do_comp_sync(uadk_data->handle, &creq); + if (ret || creq.status) { + error_setg(errp, "multifd %u: failed compression, ret %d status %d", + p->id, ret, creq.status); + return -1; + } + if (creq.dst_len < p->page_size) { + uadk_data->buf_hdr[i] = cpu_to_be32(creq.dst_len); + prepare_next_iov(p, buf, creq.dst_len); + buf += creq.dst_len; + } else { + /* + * Send raw data if compressed out >= page_size. We might be better + * off sending raw data if output is slightly less than page_size + * as well because at the receive end we can skip the decompression. + * But it is tricky to find the right number here. 
+ */ + uadk_data->buf_hdr[i] = cpu_to_be32(p->page_size); + prepare_next_iov(p, p->pages->block->host + p->pages->offset[i], + p->page_size); + buf += p->page_size; + } + } +out: + p->flags |= MULTIFD_FLAG_UADK; + multifd_send_fill_packet(p); + return 0; } /** @@ -208,7 +267,76 @@ static void multifd_uadk_recv_cleanup(MultiFDRecvParams *p) */ static int multifd_uadk_recv(MultiFDRecvParams *p, Error **errp) { - return -1; + struct wd_data *uadk_data = p->compress_data; + uint32_t in_size = p->next_packet_size; + uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; + uint32_t hdr_len = p->normal_num * sizeof(uint32_t); + uint32_t data_len = 0; + uint8_t *buf = uadk_data->buf; + int ret = 0; + + if (flags != MULTIFD_FLAG_UADK) { + error_setg(errp, "multifd %u: flags received %x flags expected %x", + p->id, flags, MULTIFD_FLAG_ZLIB); + return -1; + } + + multifd_recv_zero_page_process(p); + if (!p->normal_num) { + assert(in_size == 0); + return 0; + } + + /* read compressed data lengths */ + assert(hdr_len < in_size); + ret = qio_channel_read_all(p->c, (void *) uadk_data->buf_hdr, + hdr_len, errp); + if (ret != 0) { + return ret; + } + + for (int i = 0; i < p->normal_num; i++) { + uadk_data->buf_hdr[i] = be32_to_cpu(uadk_data->buf_hdr[i]); + data_len += uadk_data->buf_hdr[i]; + assert(uadk_data->buf_hdr[i] <= p->page_size); + } + + /* read compressed data */ + assert(in_size == hdr_len + data_len); + ret = qio_channel_read_all(p->c, (void *)buf, data_len, errp); + if (ret != 0) { + return ret; + } + + for (int i = 0; i < p->normal_num; i++) { + struct wd_comp_req creq = { + .op_type = WD_DIR_DECOMPRESS, + .src = buf, + .src_len = uadk_data->buf_hdr[i], + .dst = p->host + p->normal[i], + .dst_len = p->page_size, + }; + + if (uadk_data->buf_hdr[i] == p->page_size) { + memcpy(p->host + p->normal[i], buf, p->page_size); + buf += p->page_size; + continue; + } + + ret = wd_do_comp_sync(uadk_data->handle, &creq); + if (ret || creq.status) { + error_setg(errp, "multifd %u: failed decompression, ret %d status %d", + p->id, ret, creq.status); + return -1; + } + if (creq.dst_len != p->page_size) { + error_setg(errp, "multifd %u: decompressed length error", p->id); + return -1; + } + buf += uadk_data->buf_hdr[i]; + } + + return 0; } static MultiFDMethods multifd_uadk_ops = { -- Gitee From 56d75b83e20501cbd35326823d3450ccede2823a Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Fri, 7 Jun 2024 14:53:09 +0100 Subject: [PATCH 890/939] migration/multifd: Switch to no compression when no hardware support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit c1dfd12168e1be0a940e97f85044098e18d18178 upstream. Send raw packets over if UADK hardware support is not available. This is to satisfy  Qemu qtest CI which may run on platforms that don't have UADK hardware support. Subsequent patch will add support for uadk migration qtest. 
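A rough way to check which mode a given host will end up in is to look for
registered UACCE devices, for example (the sysfs class name is assumed from
the UACCE framework referenced in the uadk-compression document, not taken
from this patch):

    ls /sys/class/uacce/ 2>/dev/null | grep -q . \
        || echo "no UACCE device found, uadk multifd will use no compression"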
Reviewed-by: Fabiano Rosas Signed-off-by: Shameer Kolothum Reviewed-by: Zhangfei Gao Signed-off-by: Fabiano Rosas Signed-off-by: Jason Zeng --- migration/multifd-uadk.c | 92 +++++++++++++++++++++++----------------- 1 file changed, 53 insertions(+), 39 deletions(-) diff --git a/migration/multifd-uadk.c b/migration/multifd-uadk.c index 70bba92eaa..d12353fb21 100644 --- a/migration/multifd-uadk.c +++ b/migration/multifd-uadk.c @@ -17,6 +17,7 @@ #include "migration.h" #include "multifd.h" #include "options.h" +#include "qemu/error-report.h" #include "uadk/wd_comp.h" #include "uadk/wd_sched.h" @@ -48,29 +49,29 @@ static struct wd_data *multifd_uadk_init_sess(uint32_t count, uint32_t size = count * page_size; struct wd_data *wd; - if (!uadk_hw_init()) { - error_setg(errp, "multifd: UADK hardware not available"); - return NULL; - } - wd = g_new0(struct wd_data, 1); - ss.alg_type = WD_ZLIB; - if (compress) { - ss.op_type = WD_DIR_COMPRESS; - /* Add an additional page for handling output > input */ - size += page_size; - } else { - ss.op_type = WD_DIR_DECOMPRESS; - } - - /* We use default level 1 compression and 4K window size */ - param.type = ss.op_type; - ss.sched_param = ¶m; - wd->handle = wd_comp_alloc_sess(&ss); - if (!wd->handle) { - error_setg(errp, "multifd: failed wd_comp_alloc_sess"); - goto out; + if (uadk_hw_init()) { + ss.alg_type = WD_ZLIB; + if (compress) { + ss.op_type = WD_DIR_COMPRESS; + /* Add an additional page for handling output > input */ + size += page_size; + } else { + ss.op_type = WD_DIR_DECOMPRESS; + } + /* We use default level 1 compression and 4K window size */ + param.type = ss.op_type; + ss.sched_param = ¶m; + + wd->handle = wd_comp_alloc_sess(&ss); + if (!wd->handle) { + error_setg(errp, "multifd: failed wd_comp_alloc_sess"); + goto out; + } + } else { + /* For CI test use */ + warn_report_once("UADK hardware not available. Switch to no compression mode"); } wd->buf = g_try_malloc(size); @@ -82,7 +83,9 @@ static struct wd_data *multifd_uadk_init_sess(uint32_t count, return wd; out_free_sess: - wd_comp_free_sess(wd->handle); + if (wd->handle) { + wd_comp_free_sess(wd->handle); + } out: wd_comp_uninit2(); g_free(wd); @@ -91,7 +94,9 @@ out: static void multifd_uadk_uninit_sess(struct wd_data *wd) { - wd_comp_free_sess(wd->handle); + if (wd->handle) { + wd_comp_free_sess(wd->handle); + } wd_comp_uninit2(); g_free(wd->buf); g_free(wd->buf_hdr); @@ -188,23 +193,26 @@ static int multifd_uadk_send_prepare(MultiFDSendParams *p, Error **errp) .dst_len = p->page_size * 2, }; - ret = wd_do_comp_sync(uadk_data->handle, &creq); - if (ret || creq.status) { - error_setg(errp, "multifd %u: failed compression, ret %d status %d", - p->id, ret, creq.status); - return -1; + if (uadk_data->handle) { + ret = wd_do_comp_sync(uadk_data->handle, &creq); + if (ret || creq.status) { + error_setg(errp, "multifd %u: failed compression, ret %d status %d", + p->id, ret, creq.status); + return -1; + } + if (creq.dst_len < p->page_size) { + uadk_data->buf_hdr[i] = cpu_to_be32(creq.dst_len); + prepare_next_iov(p, buf, creq.dst_len); + buf += creq.dst_len; + } } - if (creq.dst_len < p->page_size) { - uadk_data->buf_hdr[i] = cpu_to_be32(creq.dst_len); - prepare_next_iov(p, buf, creq.dst_len); - buf += creq.dst_len; - } else { - /* - * Send raw data if compressed out >= page_size. We might be better - * off sending raw data if output is slightly less than page_size - * as well because at the receive end we can skip the decompression. - * But it is tricky to find the right number here. 
- */ + /* + * Send raw data if no UADK hardware or if compressed out >= page_size. + * We might be better off sending raw data if output is slightly less + * than page_size as well because at the receive end we can skip the + * decompression. But it is tricky to find the right number here. + */ + if (!uadk_data->handle || creq.dst_len >= p->page_size) { uadk_data->buf_hdr[i] = cpu_to_be32(p->page_size); prepare_next_iov(p, p->pages->block->host + p->pages->offset[i], p->page_size); @@ -323,6 +331,12 @@ static int multifd_uadk_recv(MultiFDRecvParams *p, Error **errp) continue; } + if (unlikely(!uadk_data->handle)) { + error_setg(errp, "multifd %u: UADK HW not available for decompression", + p->id); + return -1; + } + ret = wd_do_comp_sync(uadk_data->handle, &creq); if (ret || creq.status) { error_setg(errp, "multifd %u: failed decompression, ret %d status %d", -- Gitee From 76db600f67d72fdb24d794954c85a902968f71ea Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Fri, 7 Jun 2024 14:53:10 +0100 Subject: [PATCH 891/939] tests/migration-test: add uadk compression test commit c519caa825f5eba6e204bed5a464df167a5421d0 upstream. Reviewed-by: Fabiano Rosas Signed-off-by: Shameer Kolothum Signed-off-by: Fabiano Rosas Signed-off-by: Jason Zeng --- tests/qtest/migration-test.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c index 16cb7993b3..7ecf4ce9a5 100644 --- a/tests/qtest/migration-test.c +++ b/tests/qtest/migration-test.c @@ -2590,6 +2590,14 @@ test_migrate_precopy_tcp_multifd_qpl_start(QTestState *from, return test_migrate_precopy_tcp_multifd_start_common(from, to, "qpl"); } #endif /* CONFIG_QPL */ +#ifdef CONFIG_UADK +static void * +test_migrate_precopy_tcp_multifd_uadk_start(QTestState *from, + QTestState *to) +{ + return test_migrate_precopy_tcp_multifd_start_common(from, to, "uadk"); +} +#endif /* CONFIG_UADK */ static void test_multifd_tcp_none(void) { @@ -2637,6 +2645,17 @@ static void test_multifd_tcp_qpl(void) } #endif +#ifdef CONFIG_UADK +static void test_multifd_tcp_uadk(void) +{ + MigrateCommon args = { + .listen_uri = "defer", + .start_hook = test_migrate_precopy_tcp_multifd_uadk_start, + }; + test_precopy_common(&args); +} +#endif + #ifdef CONFIG_GNUTLS static void * test_migrate_multifd_tcp_tls_psk_start_match(QTestState *from, @@ -3516,6 +3535,10 @@ int main(int argc, char **argv) migration_test_add("/migration/multifd/tcp/plain/qpl", test_multifd_tcp_qpl); #endif +#ifdef CONFIG_UADK + migration_test_add("/migration/multifd/tcp/plain/uadk", + test_multifd_tcp_uadk); +#endif #ifdef CONFIG_GNUTLS migration_test_add("/migration/multifd/tcp/tls/psk/match", test_multifd_tcp_tls_psk_match); -- Gitee From 85507465a9de3d745204ad86c4cd4a6a7b5004b1 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Wed, 28 Aug 2024 11:56:48 -0300 Subject: [PATCH 892/939] migration/multifd: Fix p->iov leak in multifd-uadk.c commit 405e352d28c24991cacfdebccf67d56c4795cf6e upstream. The send_cleanup() hook should free the p->iov that was allocated at send_setup(). This was missed because the UADK code is conditional on the presence of the accelerator, so it's not tested by default. 
Fixes: 819dd20636 ("migration/multifd: Add UADK initialization") Reported-by: Peter Xu Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Signed-off-by: Jason Zeng --- migration/multifd-uadk.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/migration/multifd-uadk.c b/migration/multifd-uadk.c index d12353fb21..9a582fc919 100644 --- a/migration/multifd-uadk.c +++ b/migration/multifd-uadk.c @@ -146,6 +146,8 @@ static void multifd_uadk_send_cleanup(MultiFDSendParams *p, Error **errp) multifd_uadk_uninit_sess(wd); p->compress_data = NULL; + g_free(p->iov); + p->iov = NULL; } static inline void prepare_next_iov(MultiFDSendParams *p, void *base, -- Gitee From 5fa111eb3e3d73a0500d33d0b81638c579476845 Mon Sep 17 00:00:00 2001 From: Yuan Liu Date: Fri, 30 Aug 2024 16:27:18 -0700 Subject: [PATCH 893/939] docs/migration: add qatzip compression feature commit 85da4cbe6e5eb6ba6f31c8b30ee4582625546da7 upstream. add Intel QATzip compression method introduction Reviewed-by: Nanhai Zou Reviewed-by: Peter Xu Reviewed-by: Fabiano Rosas Signed-off-by: Yuan Liu Signed-off-by: Yichen Wang Link: https://lore.kernel.org/r/20240830232722.58272-2-yichen.wang@bytedance.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- docs/devel/migration/features.rst | 1 + docs/devel/migration/qatzip-compression.rst | 165 ++++++++++++++++++++ 2 files changed, 166 insertions(+) create mode 100644 docs/devel/migration/qatzip-compression.rst diff --git a/docs/devel/migration/features.rst b/docs/devel/migration/features.rst index 0c9cb3dd6c..7c5ce9e79d 100644 --- a/docs/devel/migration/features.rst +++ b/docs/devel/migration/features.rst @@ -12,3 +12,4 @@ Migration has plenty of features to support different use cases. virtio qpl-compression uadk-compression + qatzip-compression diff --git a/docs/devel/migration/qatzip-compression.rst b/docs/devel/migration/qatzip-compression.rst new file mode 100644 index 0000000000..862b383164 --- /dev/null +++ b/docs/devel/migration/qatzip-compression.rst @@ -0,0 +1,165 @@ +================== +QATzip Compression +================== +In scenarios with limited network bandwidth, the ``QATzip`` solution can help +users save a lot of host CPU resources by accelerating compression and +decompression through the Intel QuickAssist Technology(``QAT``) hardware. + + +The following test was conducted using 8 multifd channels and 10Gbps network +bandwidth. The results show that, compared to zstd, ``QATzip`` significantly +saves CPU resources on the sender and reduces migration time. Compared to the +uncompressed solution, ``QATzip`` greatly improves the dirty page processing +capability, indicated by the Pages per Second metric, and also reduces the +total migration time. + +:: + + VM Configuration: 16 vCPU and 64G memory + VM Workload: all vCPUs are idle and 54G memory is filled with Silesia data. 
+ QAT Devices: 4 + |-----------|--------|---------|----------|----------|------|------| + |8 Channels |Total |down |throughput|pages per | send | recv | + | |time(ms)|time(ms) |(mbps) |second | cpu %| cpu% | + |-----------|--------|---------|----------|----------|------|------| + |qatzip | 16630| 28| 10467| 2940235| 160| 360| + |-----------|--------|---------|----------|----------|------|------| + |zstd | 20165| 24| 8579| 2391465| 810| 340| + |-----------|--------|---------|----------|----------|------|------| + |none | 46063| 40| 10848| 330240| 45| 85| + |-----------|--------|---------|----------|----------|------|------| + + +QATzip Compression Framework +============================ + +``QATzip`` is a user space library which builds on top of the Intel QuickAssist +Technology to provide extended accelerated compression and decompression +services. + +For more ``QATzip`` introduction, please refer to `QATzip Introduction +`_ + +:: + + +----------------+ + | MultiFd Thread | + +-------+--------+ + | + | compress/decompress + +-------+--------+ + | QATzip library | + +-------+--------+ + | + +-------+--------+ + | QAT library | + +-------+--------+ + | user space + --------+--------------------- + | kernel space + +------+-------+ + | QAT Driver | + +------+-------+ + | + +------+-------+ + | QAT Devices | + +--------------+ + + +QATzip Installation +------------------- + +The ``QATzip`` installation package has been integrated into some Linux +distributions and can be installed directly. For example, the Ubuntu Server +24.04 LTS system can be installed using below command + +.. code-block:: shell + + #apt search qatzip + libqatzip-dev/noble 1.2.0-0ubuntu3 amd64 + Intel QuickAssist user space library development files + + libqatzip3/noble 1.2.0-0ubuntu3 amd64 + Intel QuickAssist user space library + + qatzip/noble,now 1.2.0-0ubuntu3 amd64 [installed] + Compression user-space tool for Intel QuickAssist Technology + + #sudo apt install libqatzip-dev libqatzip3 qatzip + +If your system does not support the ``QATzip`` installation package, you can +use the source code to build and install, please refer to `QATzip source code installation +`_ + +QAT Hardware Deployment +----------------------- + +``QAT`` supports physical functions(PFs) and virtual functions(VFs) for +deployment, and users can configure ``QAT`` resources for migration according +to actual needs. For more details about ``QAT`` deployment, please refer to +`Intel QuickAssist Technology Documentation +`_ + +For more ``QAT`` hardware introduction, please refer to `intel-quick-assist-technology-overview +`_ + +How To Use QATzip Compression +============================= + +1 - Install ``QATzip`` library + +2 - Build ``QEMU`` with ``--enable-qatzip`` parameter + + E.g. configure --target-list=x86_64-softmmu --enable-kvm ``--enable-qatzip`` + +3 - Set ``migrate_set_parameter multifd-compression qatzip`` + +4 - Set ``migrate_set_parameter multifd-qatzip-level comp_level``, the default +comp_level value is 1, and it supports levels from 1 to 9 + +QAT Memory Requirements +======================= + +The user needs to reserve system memory for the QAT memory management to +allocate DMA memory. The size of the reserved system memory depends on the +number of devices used for migration and the number of multifd channels. + +Because memory usage depends on QAT configuration, please refer to `QAT Memory +Driver Queries +`_ +for memory usage calculation. + +.. 
list-table:: An example of a PF used for migration + :header-rows: 1 + + * - Number of channels + - Sender memory usage + - Receiver memory usage + * - 2 + - 10M + - 10M + * - 4 + - 12M + - 14M + * - 8 + - 16M + - 20M + +How To Choose Between QATzip and QPL +==================================== +Starting from 4th Gen Intel Xeon Scalable processors, codenamed Sapphire Rapids +processor(``SPR``), multiple built-in accelerators are supported including +``QAT`` and ``IAA``. The former can accelerate ``QATzip`` and the latter is +used to accelerate ``QPL``. + +Here are some suggestions: + +1 - If the live migration scenario is limited by network bandwidth and ``QAT`` +hardware resources exceed ``IAA``, use the ``QATzip`` method, which can save a +lot of host CPU resources for compression. + +2 - If the system cannot support shared virtual memory (SVM) technology, use +the ``QATzip`` method because ``QPL`` performance is not good without SVM +support. + +3 - For other scenarios, use the ``QPL`` method first. -- Gitee From ca73720f8e625f143a27acf7c1aedb1b426c1ee1 Mon Sep 17 00:00:00 2001 From: Bryan Zhang Date: Fri, 30 Aug 2024 16:27:19 -0700 Subject: [PATCH 894/939] meson: Introduce 'qatzip' feature to the build system commit e28ed313c268aeb4e0cefb66dcd215c30e4443fe upstream. Add a 'qatzip' feature, which is automatically disabled, and which depends on the QATzip library if enabled. Reviewed-by: Fabiano Rosas Signed-off-by: Bryan Zhang Signed-off-by: Hao Xiang Signed-off-by: Yichen Wang Link: https://lore.kernel.org/r/20240830232722.58272-3-yichen.wang@bytedance.com Signed-off-by: Peter Xu Conflicts: scripts/meson-buildoptions.sh [jz: resolve simple context conflicts] Signed-off-by: Jason Zeng --- meson.build | 10 ++++++++++ meson_options.txt | 2 ++ scripts/meson-buildoptions.sh | 3 +++ 3 files changed, 15 insertions(+) diff --git a/meson.build b/meson.build index e3599b9a09..d221f5cad5 100644 --- a/meson.build +++ b/meson.build @@ -1061,6 +1061,14 @@ if not get_option('uadk').auto() or have_system uadk = declare_dependency(dependencies: [libwd, libwd_comp]) endif endif + +qatzip = not_found +if not get_option('qatzip').auto() or have_system + qatzip = dependency('qatzip', version: '>=1.1.2', + required: get_option('qatzip'), + method: 'pkg-config') +endif + virgl = not_found have_vhost_user_gpu = have_tools and targetos == 'linux' and pixman.found() @@ -2301,6 +2309,7 @@ config_host_data.set('CONFIG_STATX_MNT_ID', has_statx_mnt_id) config_host_data.set('CONFIG_ZSTD', zstd.found()) config_host_data.set('CONFIG_QPL', qpl.found()) config_host_data.set('CONFIG_UADK', uadk.found()) +config_host_data.set('CONFIG_QATZIP', qatzip.found()) config_host_data.set('CONFIG_FUSE', fuse.found()) config_host_data.set('CONFIG_FUSE_LSEEK', fuse_lseek.found()) config_host_data.set('CONFIG_SPICE_PROTOCOL', spice_protocol.found()) @@ -4477,6 +4486,7 @@ summary_info += {'lzfse support': liblzfse} summary_info += {'zstd support': zstd} summary_info += {'Query Processing Library support': qpl} summary_info += {'UADK Library support': uadk} +summary_info += {'qatzip support': qatzip} summary_info += {'NUMA host support': numa} summary_info += {'capstone': capstone} summary_info += {'libpmem support': libpmem} diff --git a/meson_options.txt b/meson_options.txt index 709678fa18..61996300d5 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -263,6 +263,8 @@ option('qpl', type : 'feature', value : 'auto', description: 'Query Processing Library support') option('uadk', type : 'feature', value : 'auto', description: 
'UADK Library support') +option('qatzip', type: 'feature', value: 'auto', + description: 'QATzip compression support') option('fuse', type: 'feature', value: 'auto', description: 'FUSE block device export') option('fuse_lseek', type : 'feature', value : 'auto', diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh index 833b996818..8604fe8ffa 100644 --- a/scripts/meson-buildoptions.sh +++ b/scripts/meson-buildoptions.sh @@ -163,6 +163,7 @@ meson_options_help() { printf "%s\n" ' pixman pixman support' printf "%s\n" ' plugins TCG plugins via shared library loading' printf "%s\n" ' png PNG support with libpng' + printf "%s\n" ' qatzip QATzip compression support' printf "%s\n" ' pvrdma Enable PVRDMA support' printf "%s\n" ' qcow1 qcow1 image format support' printf "%s\n" ' qed qed image format support' @@ -430,6 +431,8 @@ _meson_option_parse() { --enable-png) printf "%s" -Dpng=enabled ;; --disable-png) printf "%s" -Dpng=disabled ;; --prefix=*) quote_sh "-Dprefix=$2" ;; + --enable-qatzip) printf "%s" -Dqatzip=enabled ;; + --disable-qatzip) printf "%s" -Dqatzip=disabled ;; --enable-pvrdma) printf "%s" -Dpvrdma=enabled ;; --disable-pvrdma) printf "%s" -Dpvrdma=disabled ;; --enable-qcow1) printf "%s" -Dqcow1=enabled ;; -- Gitee From cb3f1e1a84a3776d5382013cb9fcfe08c8ea9b3e Mon Sep 17 00:00:00 2001 From: Bryan Zhang Date: Fri, 30 Aug 2024 16:27:20 -0700 Subject: [PATCH 895/939] migration: Add migration parameters for QATzip commit 86c6eb1f39cbb7eb0467c114469e98ef699fb515 upstream. Adds support for migration parameters to control QATzip compression level. Acked-by: Markus Armbruster Signed-off-by: Bryan Zhang Signed-off-by: Hao Xiang Signed-off-by: Yichen Wang Reviewed-by: Fabiano Rosas Reviewed-by: Prasad Pandit Link: https://lore.kernel.org/r/20240830232722.58272-4-yichen.wang@bytedance.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/migration-hmp-cmds.c | 4 ++++ migration/options.c | 34 ++++++++++++++++++++++++++++++++++ migration/options.h | 1 + qapi/migration.json | 18 ++++++++++++++++++ 4 files changed, 57 insertions(+) diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c index 91e51eb7af..d6d5f373a1 100644 --- a/migration/migration-hmp-cmds.c +++ b/migration/migration-hmp-cmds.c @@ -669,6 +669,10 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) p->has_multifd_zlib_level = true; visit_type_uint8(v, param, &p->multifd_zlib_level, &err); break; + case MIGRATION_PARAMETER_MULTIFD_QATZIP_LEVEL: + p->has_multifd_qatzip_level = true; + visit_type_uint8(v, param, &p->multifd_qatzip_level, &err); + break; case MIGRATION_PARAMETER_MULTIFD_ZSTD_LEVEL: p->has_multifd_zstd_level = true; visit_type_uint8(v, param, &p->multifd_zstd_level, &err); diff --git a/migration/options.c b/migration/options.c index e752163114..6ba7ff65a3 100644 --- a/migration/options.c +++ b/migration/options.c @@ -63,6 +63,13 @@ #define DEFAULT_MIGRATE_MULTIFD_COMPRESSION MULTIFD_COMPRESSION_NONE /* 0: means nocompress, 1: best speed, ... 9: best compress ratio */ #define DEFAULT_MIGRATE_MULTIFD_ZLIB_LEVEL 1 +/* + * 1: best speed, ... 9: best compress ratio + * There is some nuance here. Refer to QATzip documentation to understand + * the mapping of QATzip levels to standard deflate levels. + */ +#define DEFAULT_MIGRATE_MULTIFD_QATZIP_LEVEL 1 + /* 0: means nocompress, 1: best speed, ... 
20: best compress ratio */ #define DEFAULT_MIGRATE_MULTIFD_ZSTD_LEVEL 1 @@ -147,6 +154,9 @@ Property migration_properties[] = { DEFINE_PROP_UINT8("multifd-zlib-level", MigrationState, parameters.multifd_zlib_level, DEFAULT_MIGRATE_MULTIFD_ZLIB_LEVEL), + DEFINE_PROP_UINT8("multifd-qatzip-level", MigrationState, + parameters.multifd_qatzip_level, + DEFAULT_MIGRATE_MULTIFD_QATZIP_LEVEL), DEFINE_PROP_UINT8("multifd-zstd-level", MigrationState, parameters.multifd_zstd_level, DEFAULT_MIGRATE_MULTIFD_ZSTD_LEVEL), @@ -888,6 +898,13 @@ int migrate_multifd_zlib_level(void) return s->parameters.multifd_zlib_level; } +int migrate_multifd_qatzip_level(void) +{ + MigrationState *s = migrate_get_current(); + + return s->parameters.multifd_qatzip_level; +} + int migrate_multifd_zstd_level(void) { MigrationState *s = migrate_get_current(); @@ -1019,6 +1036,8 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp) params->multifd_compression = s->parameters.multifd_compression; params->has_multifd_zlib_level = true; params->multifd_zlib_level = s->parameters.multifd_zlib_level; + params->has_multifd_qatzip_level = true; + params->multifd_qatzip_level = s->parameters.multifd_qatzip_level; params->has_multifd_zstd_level = true; params->multifd_zstd_level = s->parameters.multifd_zstd_level; params->has_xbzrle_cache_size = true; @@ -1082,6 +1101,7 @@ void migrate_params_init(MigrationParameters *params) params->has_multifd_channels = true; params->has_multifd_compression = true; params->has_multifd_zlib_level = true; + params->has_multifd_qatzip_level = true; params->has_multifd_zstd_level = true; params->has_xbzrle_cache_size = true; params->has_max_postcopy_bandwidth = true; @@ -1221,6 +1241,14 @@ bool migrate_params_check(MigrationParameters *params, Error **errp) return false; } + if (params->has_multifd_qatzip_level && + ((params->multifd_qatzip_level > 9) || + (params->multifd_qatzip_level < 1))) { + error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "multifd_qatzip_level", + "a value between 1 and 9"); + return false; + } + if (params->has_multifd_zstd_level && (params->multifd_zstd_level > 20)) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "multifd_zstd_level", @@ -1390,6 +1418,9 @@ static void migrate_params_test_apply(MigrateSetParameters *params, if (params->has_multifd_compression) { dest->multifd_compression = params->multifd_compression; } + if (params->has_multifd_qatzip_level) { + dest->multifd_qatzip_level = params->multifd_qatzip_level; + } if (params->has_multifd_zlib_level) { dest->multifd_zlib_level = params->multifd_zlib_level; } @@ -1556,6 +1587,9 @@ static void migrate_params_apply(MigrateSetParameters *params, Error **errp) if (params->has_multifd_compression) { s->parameters.multifd_compression = params->multifd_compression; } + if (params->has_multifd_qatzip_level) { + s->parameters.multifd_qatzip_level = params->multifd_qatzip_level; + } if (params->has_multifd_zlib_level) { s->parameters.multifd_zlib_level = params->multifd_zlib_level; } diff --git a/migration/options.h b/migration/options.h index dbd52d7acd..6b2a893217 100644 --- a/migration/options.h +++ b/migration/options.h @@ -89,6 +89,7 @@ int migrate_hdbss_buffer_size(void); int migrate_multifd_channels(void); MultiFDCompression migrate_multifd_compression(void); int migrate_multifd_zlib_level(void); +int migrate_multifd_qatzip_level(void); int migrate_multifd_zstd_level(void); uint8_t migrate_throttle_trigger_threshold(void); const char *migrate_tls_authz(void); diff --git a/qapi/migration.json 
b/qapi/migration.json index f1a17c511b..255f5b50a6 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -885,6 +885,11 @@ # speed, and 9 means best compression ratio which will consume # more CPU. Defaults to 1. (Since 5.0) # +# @multifd-qatzip-level: Set the compression level to be used in live +# migration. The level is an integer between 1 and 9, where 1 means +# the best compression speed, and 9 means the best compression +# ratio which will consume more CPU. Defaults to 1. (Since 9.2) +# # @multifd-zstd-level: Set the compression level to be used in live # migration, the compression level is an integer between 0 and 20, # where 0 means no compression, 1 means the best compression @@ -966,6 +971,7 @@ 'xbzrle-cache-size', 'max-postcopy-bandwidth', 'max-cpu-throttle', 'multifd-compression', 'multifd-zlib-level', 'multifd-zstd-level', + 'multifd-qatzip-level', 'block-bitmap-mapping', { 'name': 'x-vcpu-dirty-limit-period', 'features': ['unstable'] }, 'vcpu-dirty-limit', @@ -1097,6 +1103,11 @@ # speed, and 9 means best compression ratio which will consume # more CPU. Defaults to 1. (Since 5.0) # +# @multifd-qatzip-level: Set the compression level to be used in live +# migration. The level is an integer between 1 and 9, where 1 means +# the best compression speed, and 9 means the best compression +# ratio which will consume more CPU. Defaults to 1. (Since 9.2) +# # @multifd-zstd-level: Set the compression level to be used in live # migration, the compression level is an integer between 0 and 20, # where 0 means no compression, 1 means the best compression @@ -1198,6 +1209,7 @@ '*max-cpu-throttle': 'uint8', '*multifd-compression': 'MultiFDCompression', '*multifd-zlib-level': 'uint8', + '*multifd-qatzip-level': 'uint8', '*multifd-zstd-level': 'uint8', '*block-bitmap-mapping': [ 'BitmapMigrationNodeAlias' ], '*x-vcpu-dirty-limit-period': { 'type': 'uint64', @@ -1354,6 +1366,11 @@ # speed, and 9 means best compression ratio which will consume # more CPU. Defaults to 1. (Since 5.0) # +# @multifd-qatzip-level: Set the compression level to be used in live +# migration. The level is an integer between 1 and 9, where 1 means +# the best compression speed, and 9 means the best compression +# ratio which will consume more CPU. Defaults to 1. (Since 9.2) +# # @multifd-zstd-level: Set the compression level to be used in live # migration, the compression level is an integer between 0 and 20, # where 0 means no compression, 1 means the best compression @@ -1451,6 +1468,7 @@ '*max-cpu-throttle': 'uint8', '*multifd-compression': 'MultiFDCompression', '*multifd-zlib-level': 'uint8', + '*multifd-qatzip-level': 'uint8', '*multifd-zstd-level': 'uint8', '*block-bitmap-mapping': [ 'BitmapMigrationNodeAlias' ], '*x-vcpu-dirty-limit-period': { 'type': 'uint64', -- Gitee From d5ad8ffdf67cb6a76d5b4bf7145488abaa53c2ae Mon Sep 17 00:00:00 2001 From: Bryan Zhang Date: Fri, 30 Aug 2024 16:27:21 -0700 Subject: [PATCH 896/939] migration: Introduce 'qatzip' compression method commit 80484f945989988091c5cd729c3e8bde6c14907a upstream. Adds support for 'qatzip' as an option for the multifd compression method parameter, and implements using QAT for 'qatzip' compression and decompression. 
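On the wire, every multifd packet carries its compression method in the packet flags, and this patch widens the compression mask from 4 to 5 bits so the new value fits. Condensed from the multifd.h and qatzip_recv() hunks below, the receiver-side check looks like:

    uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK;  /* mask is now 0x1f << 1 */

    if (flags != MULTIFD_FLAG_QATZIP) {                         /* QATZIP flag is 16 << 1 */
        error_setg(errp, "multifd %u: flags received %x flags expected %x",
                   p->id, flags, MULTIFD_FLAG_QATZIP);
        return -1;
    }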
Acked-by: Markus Armbruster Reviewed-by: Fabiano Rosas Reviewed-by: Prasad Pandit Signed-off-by: Bryan Zhang Signed-off-by: Hao Xiang Signed-off-by: Yichen Wang Link: https://lore.kernel.org/r/20240830232722.58272-5-yichen.wang@bytedance.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- hw/core/qdev-properties-system.c | 2 +- migration/meson.build | 1 + migration/multifd-qatzip.c | 394 +++++++++++++++++++++++++++++++ migration/multifd.h | 5 +- qapi/migration.json | 3 + 5 files changed, 402 insertions(+), 3 deletions(-) create mode 100644 migration/multifd-qatzip.c diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-system.c index 650c42eaf8..9cc2e38aba 100644 --- a/hw/core/qdev-properties-system.c +++ b/hw/core/qdev-properties-system.c @@ -711,7 +711,7 @@ const PropertyInfo qdev_prop_fdc_drive_type = { const PropertyInfo qdev_prop_multifd_compression = { .name = "MultiFDCompression", .description = "multifd_compression values, " - "none/zlib/zstd/qpl/uadk", + "none/zlib/zstd/qpl/uadk/qatzip", .enum_table = &MultiFDCompression_lookup, .get = qdev_propinfo_get_enum, .set = qdev_propinfo_set_enum, diff --git a/migration/meson.build b/migration/meson.build index 264d04657f..aba2581705 100644 --- a/migration/meson.build +++ b/migration/meson.build @@ -42,6 +42,7 @@ endif system_ss.add(when: zstd, if_true: files('multifd-zstd.c')) system_ss.add(when: qpl, if_true: files('multifd-qpl.c')) system_ss.add(when: uadk, if_true: files('multifd-uadk.c')) +system_ss.add(when: qatzip, if_true: files('multifd-qatzip.c')) specific_ss.add(when: 'CONFIG_SYSTEM_ONLY', if_true: files('ram.c', diff --git a/migration/multifd-qatzip.c b/migration/multifd-qatzip.c new file mode 100644 index 0000000000..3c787ed879 --- /dev/null +++ b/migration/multifd-qatzip.c @@ -0,0 +1,394 @@ +/* + * Multifd QATzip compression implementation + * + * Copyright (c) Bytedance + * + * Authors: + * Bryan Zhang + * Hao Xiang + * Yichen Wang + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "exec/ramblock.h" +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "qapi/qapi-types-migration.h" +#include "options.h" +#include "multifd.h" +#include + +typedef struct { + /* + * Unique session for use with QATzip API + */ + QzSession_T sess; + + /* + * For compression: Buffer for pages to compress + * For decompression: Buffer for data to decompress + */ + uint8_t *in_buf; + uint32_t in_len; + + /* + * For compression: Output buffer of compressed data + * For decompression: Output buffer of decompressed data + */ + uint8_t *out_buf; + uint32_t out_len; +} QatzipData; + +/** + * qatzip_send_setup: Set up QATzip session and private buffers. + * + * @param p Multifd channel params + * @param errp Pointer to error, which will be set in case of error + * @return 0 on success, -1 on error (and *errp will be set) + */ +static int qatzip_send_setup(MultiFDSendParams *p, Error **errp) +{ + QatzipData *q; + QzSessionParamsDeflate_T params; + const char *err_msg; + int ret; + + q = g_new0(QatzipData, 1); + p->compress_data = q; + /* We need one extra place for the packet header */ + p->iov = g_new0(struct iovec, 2); + + /* + * Initialize QAT device with software fallback by default. This allows + * QATzip to use CPU path when QAT hardware reaches maximum throughput. 
+ */ + ret = qzInit(&q->sess, true); + if (ret != QZ_OK && ret != QZ_DUPLICATE) { + err_msg = "qzInit failed"; + goto err; + } + + ret = qzGetDefaultsDeflate(¶ms); + if (ret != QZ_OK) { + err_msg = "qzGetDefaultsDeflate failed"; + goto err; + } + + /* Make sure to use configured QATzip compression level. */ + params.common_params.comp_lvl = migrate_multifd_qatzip_level(); + ret = qzSetupSessionDeflate(&q->sess, ¶ms); + if (ret != QZ_OK && ret != QZ_DUPLICATE) { + err_msg = "qzSetupSessionDeflate failed"; + goto err; + } + + if (MULTIFD_PACKET_SIZE > UINT32_MAX) { + err_msg = "packet size too large for QAT"; + goto err; + } + + q->in_len = MULTIFD_PACKET_SIZE; + /* + * PINNED_MEM is an enum from qatzip headers, which means to use + * kzalloc_node() to allocate memory for QAT DMA purposes. When QAT device + * is not available or software fallback is used, the malloc flag needs to + * be set as COMMON_MEM. + */ + q->in_buf = qzMalloc(q->in_len, 0, PINNED_MEM); + if (!q->in_buf) { + q->in_buf = qzMalloc(q->in_len, 0, COMMON_MEM); + if (!q->in_buf) { + err_msg = "qzMalloc failed"; + goto err; + } + } + + q->out_len = qzMaxCompressedLength(MULTIFD_PACKET_SIZE, &q->sess); + q->out_buf = qzMalloc(q->out_len, 0, PINNED_MEM); + if (!q->out_buf) { + q->out_buf = qzMalloc(q->out_len, 0, COMMON_MEM); + if (!q->out_buf) { + err_msg = "qzMalloc failed"; + goto err; + } + } + + return 0; + +err: + error_setg(errp, "multifd %u: [sender] %s", p->id, err_msg); + return -1; +} + +/** + * qatzip_send_cleanup: Tear down QATzip session and release private buffers. + * + * @param p Multifd channel params + * @param errp Pointer to error, which will be set in case of error + * @return None + */ +static void qatzip_send_cleanup(MultiFDSendParams *p, Error **errp) +{ + QatzipData *q = p->compress_data; + + if (q) { + if (q->in_buf) { + qzFree(q->in_buf); + } + if (q->out_buf) { + qzFree(q->out_buf); + } + (void)qzTeardownSession(&q->sess); + (void)qzClose(&q->sess); + g_free(q); + } + + g_free(p->iov); + p->iov = NULL; + p->compress_data = NULL; +} + +/** + * qatzip_send_prepare: Compress pages and update IO channel info. + * + * @param p Multifd channel params + * @param errp Pointer to error, which will be set in case of error + * @return 0 on success, -1 on error (and *errp will be set) + */ +static int qatzip_send_prepare(MultiFDSendParams *p, Error **errp) +{ + MultiFDPages_t *pages = p->pages; + QatzipData *q = p->compress_data; + int ret; + unsigned int in_len, out_len; + + if (!multifd_send_prepare_common(p)) { + goto out; + } + + /* + * Unlike other multifd compression implementations, we use a non-streaming + * API and place all the data into one buffer, rather than sending each + * page to the compression API at a time. Based on initial benchmarks, the + * non-streaming API outperforms the streaming API. Plus, the logic in QEMU + * is friendly to using the non-streaming API anyway. If either of these + * statements becomes no longer true, we can revisit adding a streaming + * implementation. 
+ */ + for (int i = 0; i < pages->normal_num; i++) { + memcpy(q->in_buf + (i * p->page_size), + pages->block->host + pages->offset[i], + p->page_size); + } + + in_len = pages->normal_num * p->page_size; + if (in_len > q->in_len) { + error_setg(errp, "multifd %u: unexpectedly large input", p->id); + return -1; + } + out_len = q->out_len; + + ret = qzCompress(&q->sess, q->in_buf, &in_len, q->out_buf, &out_len, 1); + if (ret != QZ_OK) { + error_setg(errp, "multifd %u: QATzip returned %d instead of QZ_OK", + p->id, ret); + return -1; + } + if (in_len != pages->normal_num * p->page_size) { + error_setg(errp, "multifd %u: QATzip failed to compress all input", + p->id); + return -1; + } + + p->iov[p->iovs_num].iov_base = q->out_buf; + p->iov[p->iovs_num].iov_len = out_len; + p->iovs_num++; + p->next_packet_size = out_len; + +out: + p->flags |= MULTIFD_FLAG_QATZIP; + multifd_send_fill_packet(p); + return 0; +} + +/** + * qatzip_recv_setup: Set up QATzip session and allocate private buffers. + * + * @param p Multifd channel params + * @param errp Pointer to error, which will be set in case of error + * @return 0 on success, -1 on error (and *errp will be set) + */ +static int qatzip_recv_setup(MultiFDRecvParams *p, Error **errp) +{ + QatzipData *q; + QzSessionParamsDeflate_T params; + const char *err_msg; + int ret; + + q = g_new0(QatzipData, 1); + p->compress_data = q; + + /* + * Initialize QAT device with software fallback by default. This allows + * QATzip to use CPU path when QAT hardware reaches maximum throughput. + */ + ret = qzInit(&q->sess, true); + if (ret != QZ_OK && ret != QZ_DUPLICATE) { + err_msg = "qzInit failed"; + goto err; + } + + ret = qzGetDefaultsDeflate(¶ms); + if (ret != QZ_OK) { + err_msg = "qzGetDefaultsDeflate failed"; + goto err; + } + + ret = qzSetupSessionDeflate(&q->sess, ¶ms); + if (ret != QZ_OK && ret != QZ_DUPLICATE) { + err_msg = "qzSetupSessionDeflate failed"; + goto err; + } + + /* + * Reserve extra spaces for the incoming packets. Current implementation + * doesn't send uncompressed pages in case the compression gets too big. + */ + q->in_len = MULTIFD_PACKET_SIZE * 2; + /* + * PINNED_MEM is an enum from qatzip headers, which means to use + * kzalloc_node() to allocate memory for QAT DMA purposes. When QAT device + * is not available or software fallback is used, the malloc flag needs to + * be set as COMMON_MEM. + */ + q->in_buf = qzMalloc(q->in_len, 0, PINNED_MEM); + if (!q->in_buf) { + q->in_buf = qzMalloc(q->in_len, 0, COMMON_MEM); + if (!q->in_buf) { + err_msg = "qzMalloc failed"; + goto err; + } + } + + q->out_len = MULTIFD_PACKET_SIZE; + q->out_buf = qzMalloc(q->out_len, 0, PINNED_MEM); + if (!q->out_buf) { + q->out_buf = qzMalloc(q->out_len, 0, COMMON_MEM); + if (!q->out_buf) { + err_msg = "qzMalloc failed"; + goto err; + } + } + + return 0; + +err: + error_setg(errp, "multifd %u: [receiver] %s", p->id, err_msg); + return -1; +} + +/** + * qatzip_recv_cleanup: Tear down QATzip session and release private buffers. + * + * @param p Multifd channel params + * @return None + */ +static void qatzip_recv_cleanup(MultiFDRecvParams *p) +{ + QatzipData *q = p->compress_data; + + if (q) { + if (q->in_buf) { + qzFree(q->in_buf); + } + if (q->out_buf) { + qzFree(q->out_buf); + } + (void)qzTeardownSession(&q->sess); + (void)qzClose(&q->sess); + g_free(q); + } + p->compress_data = NULL; +} + + +/** + * qatzip_recv: Decompress pages and copy them to the appropriate + * locations. 
+ * + * @param p Multifd channel params + * @param errp Pointer to error, which will be set in case of error + * @return 0 on success, -1 on error (and *errp will be set) + */ +static int qatzip_recv(MultiFDRecvParams *p, Error **errp) +{ + QatzipData *q = p->compress_data; + int ret; + unsigned int in_len, out_len; + uint32_t in_size = p->next_packet_size; + uint32_t expected_size = p->normal_num * p->page_size; + uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; + + if (in_size > q->in_len) { + error_setg(errp, "multifd %u: received unexpectedly large packet", + p->id); + return -1; + } + + if (flags != MULTIFD_FLAG_QATZIP) { + error_setg(errp, "multifd %u: flags received %x flags expected %x", + p->id, flags, MULTIFD_FLAG_QATZIP); + return -1; + } + + multifd_recv_zero_page_process(p); + if (!p->normal_num) { + assert(in_size == 0); + return 0; + } + + ret = qio_channel_read_all(p->c, (void *)q->in_buf, in_size, errp); + if (ret != 0) { + return ret; + } + + in_len = in_size; + out_len = q->out_len; + ret = qzDecompress(&q->sess, q->in_buf, &in_len, q->out_buf, &out_len); + if (ret != QZ_OK) { + error_setg(errp, "multifd %u: qzDecompress failed", p->id); + return -1; + } + if (out_len != expected_size) { + error_setg(errp, "multifd %u: packet size received %u size expected %u", + p->id, out_len, expected_size); + return -1; + } + + /* Copy each page to its appropriate location. */ + for (int i = 0; i < p->normal_num; i++) { + memcpy(p->host + p->normal[i], + q->out_buf + p->page_size * i, + p->page_size); + } + return 0; +} + +static MultiFDMethods multifd_qatzip_ops = { + .send_setup = qatzip_send_setup, + .send_cleanup = qatzip_send_cleanup, + .send_prepare = qatzip_send_prepare, + .recv_setup = qatzip_recv_setup, + .recv_cleanup = qatzip_recv_cleanup, + .recv = qatzip_recv +}; + +static void multifd_qatzip_register(void) +{ + multifd_register_ops(MULTIFD_COMPRESSION_QATZIP, &multifd_qatzip_ops); +} + +migration_init(multifd_qatzip_register); diff --git a/migration/multifd.h b/migration/multifd.h index ace4ba050d..57c1334788 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -29,14 +29,15 @@ bool multifd_queue_page(RAMBlock *block, ram_addr_t offset); /* Multifd Compression flags */ #define MULTIFD_FLAG_SYNC (1 << 0) -/* We reserve 4 bits for compression methods */ -#define MULTIFD_FLAG_COMPRESSION_MASK (0xf << 1) +/* We reserve 5 bits for compression methods */ +#define MULTIFD_FLAG_COMPRESSION_MASK (0x1f << 1) /* we need to be compatible. Before compression value was 0 */ #define MULTIFD_FLAG_NOCOMP (0 << 1) #define MULTIFD_FLAG_ZLIB (1 << 1) #define MULTIFD_FLAG_ZSTD (2 << 1) #define MULTIFD_FLAG_QPL (4 << 1) #define MULTIFD_FLAG_UADK (8 << 1) +#define MULTIFD_FLAG_QATZIP (16 << 1) /* This value needs to be a multiple of qemu_target_page_size() */ #define MULTIFD_PACKET_SIZE (512 * 1024) diff --git a/qapi/migration.json b/qapi/migration.json index 255f5b50a6..37e1d4857e 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -625,6 +625,8 @@ # # @zstd: use zstd compression method. # +# @qatzip: use qatzip compression method. (Since 9.2) +# # @qpl: use qpl compression method. 
Query Processing Library(qpl) is # based on the deflate compression algorithm and use the Intel # In-Memory Analytics Accelerator(IAA) accelerated compression @@ -637,6 +639,7 @@ { 'enum': 'MultiFDCompression', 'data': [ 'none', 'zlib', { 'name': 'zstd', 'if': 'CONFIG_ZSTD' }, + { 'name': 'qatzip', 'if': 'CONFIG_QATZIP'}, { 'name': 'qpl', 'if': 'CONFIG_QPL' }, { 'name': 'uadk', 'if': 'CONFIG_UADK' } ] } -- Gitee From 049442961f30f504475a7cb4b4c02043a7fb3c04 Mon Sep 17 00:00:00 2001 From: Bryan Zhang Date: Fri, 30 Aug 2024 16:27:22 -0700 Subject: [PATCH 897/939] tests/migration: Add integration test for 'qatzip' compression method commit afe166d4e8bc33bc448cd573b55d0ac094187d48 upstream. Adds an integration test for 'qatzip'. Reviewed-by: Fabiano Rosas Signed-off-by: Bryan Zhang Signed-off-by: Hao Xiang Signed-off-by: Yichen Wang Link: https://lore.kernel.org/r/20240830232722.58272-6-yichen.wang@bytedance.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- tests/qtest/migration-test.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c index 7ecf4ce9a5..3385ca1f15 100644 --- a/tests/qtest/migration-test.c +++ b/tests/qtest/migration-test.c @@ -2582,6 +2582,18 @@ test_migrate_precopy_tcp_multifd_zstd_start(QTestState *from, } #endif /* CONFIG_ZSTD */ +#ifdef CONFIG_QATZIP +static void * +test_migrate_precopy_tcp_multifd_qatzip_start(QTestState *from, + QTestState *to) +{ + migrate_set_parameter_int(from, "multifd-qatzip-level", 2); + migrate_set_parameter_int(to, "multifd-qatzip-level", 2); + + return test_migrate_precopy_tcp_multifd_start_common(from, to, "qatzip"); +} +#endif + #ifdef CONFIG_QPL static void * test_migrate_precopy_tcp_multifd_qpl_start(QTestState *from, @@ -2634,6 +2646,17 @@ static void test_multifd_tcp_zstd(void) } #endif +#ifdef CONFIG_QATZIP +static void test_multifd_tcp_qatzip(void) +{ + MigrateCommon args = { + .listen_uri = "defer", + .start_hook = test_migrate_precopy_tcp_multifd_qatzip_start, + }; + test_precopy_common(&args); +} +#endif + #ifdef CONFIG_QPL static void test_multifd_tcp_qpl(void) { @@ -3531,6 +3554,10 @@ int main(int argc, char **argv) migration_test_add("/migration/multifd/tcp/plain/zstd", test_multifd_tcp_zstd); #endif +#ifdef CONFIG_QATZIP + migration_test_add("/migration/multifd/tcp/plain/qatzip", + test_multifd_tcp_qatzip); +#endif #ifdef CONFIG_QPL migration_test_add("/migration/multifd/tcp/plain/qpl", test_multifd_tcp_qpl); -- Gitee From 75ab1fea57e8925efd8a3bef827d0c0f0cdd1fa2 Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Tue, 10 Sep 2024 07:41:38 +0200 Subject: [PATCH 898/939] migration/multifd: Fix loop conditions in multifd_zstd_send_prepare and multifd_zstd_recv commit cb0ed522a51a7d4b1fde535972d4aeeb82447928 upstream. GitHub's CodeQL reports four critical errors which are fixed by this commit: Unsigned difference expression compared to zero An expression (u - v > 0) with unsigned values u, v is only false if u == v, so all changed expressions did not work as expected. 
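The pitfall in isolation (plain C semantics, not code from this series): with unsigned operands the subtraction wraps instead of going negative, so the difference test degenerates into an inequality test.

    size_t pos = 10, size = 4;      /* pos has overrun size */

    if (size - pos > 0) {
        /* taken: 4 - 10 wraps to a huge unsigned value, never a negative one */
    }
    if (size > pos) {
        /* not taken: the direct comparison the fix switches to */
    }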
Signed-off-by: Stefan Weil Link: https://lore.kernel.org/r/20240910054138.1458555-1-sw@weilnetz.de [peterx: Fix mangled email for author] Signed-off-by: Peter Xu Conflicts: migration/multifd-zstd.c [jz: resolve context conflict due to p->page which not renamed to page yet] Signed-off-by: Jason Zeng --- migration/multifd-zstd.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c index ca17b7e310..46ee68b6ce 100644 --- a/migration/multifd-zstd.c +++ b/migration/multifd-zstd.c @@ -152,9 +152,9 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) */ do { ret = ZSTD_compressStream2(z->zcs, &z->out, &z->in, flush); - } while (ret > 0 && (z->in.size - z->in.pos > 0) - && (z->out.size - z->out.pos > 0)); - if (ret > 0 && (z->in.size - z->in.pos > 0)) { + } while (ret > 0 && (z->in.size > z->in.pos) + && (z->out.size > z->out.pos)); + if (ret > 0 && (z->in.size > z->in.pos)) { error_setg(errp, "multifd %u: compressStream buffer too small", p->id); return -1; @@ -299,7 +299,7 @@ static int zstd_recv(MultiFDRecvParams *p, Error **errp) */ do { ret = ZSTD_decompressStream(z->zds, &z->out, &z->in); - } while (ret > 0 && (z->in.size - z->in.pos > 0) + } while (ret > 0 && (z->in.size > z->in.pos) && (z->out.pos < p->page_size)); if (ret > 0 && (z->out.pos < p->page_size)) { error_setg(errp, "multifd %u: decompressStream buffer too small", -- Gitee From a15e40dc17b96c431ad4c71377a3a66e57a00dab Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Tue, 17 Sep 2024 15:58:02 -0300 Subject: [PATCH 899/939] migration/multifd: Fix rb->receivedmap cleanup race commit 4ce56229087860805877075ddb29dd44578365a9 upstream. Fix a segmentation fault in multifd when rb->receivedmap is cleared too early. After commit 5ef7e26bdb ("migration/multifd: solve zero page causing multiple page faults"), multifd started using the rb->receivedmap bitmap, which belongs to ram.c and is initialized and *freed* from the ram SaveVMHandlers. Multifd threads are live until migration_incoming_state_destroy(), which is called after qemu_loadvm_state_cleanup(), leading to a crash when accessing rb->receivedmap. process_incoming_migration_co() ... qemu_loadvm_state() multifd_nocomp_recv() qemu_loadvm_state_cleanup() ramblock_recv_bitmap_set_offset() rb->receivedmap = NULL set_bit_atomic(..., rb->receivedmap) ... migration_incoming_state_destroy() multifd_recv_cleanup() multifd_recv_terminate_threads(NULL) Move the loadvm cleanup into migration_incoming_state_destroy(), after multifd_recv_cleanup() to ensure multifd threads have already exited when rb->receivedmap is cleared. Adjust the postcopy listen thread comment to indicate that we still want to skip the cpu synchronization. 
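Condensed from the migration.c hunk below, the teardown order after the fix is:

    multifd_recv_cleanup();           /* multifd recv threads have exited after this */
    compress_threads_load_cleanup();
    /*
     * RAM state cleanup needs to happen after multifd cleanup, because
     * multifd threads can use some of its states (receivedmap).
     */
    qemu_loadvm_state_cleanup();      /* rb->receivedmap is freed only now */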
CC: qemu-stable@nongnu.org Fixes: 5ef7e26bdb ("migration/multifd: solve zero page causing multiple page faults") Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240917185802.15619-3-farosas@suse.de [peterx: added comment in migration_incoming_state_destroy()] Signed-off-by: Peter Xu Conflicts: migration/migration.c [jz: resolve context conflict due to non-multifd compression which is already deleted in upstream while still in openEuler] Signed-off-by: Jason Zeng --- migration/migration.c | 5 +++++ migration/savevm.c | 6 ++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index 59c0bbee67..107e106b73 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -276,6 +276,11 @@ void migration_incoming_state_destroy(void) multifd_recv_cleanup(); compress_threads_load_cleanup(); + /* + * RAM state cleanup needs to happen after multifd cleanup, because + * multifd threads can use some of its states (receivedmap). + */ + qemu_loadvm_state_cleanup(); if (mis->to_src_file) { /* Tell source that we are done */ diff --git a/migration/savevm.c b/migration/savevm.c index cc65da605e..29389068df 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -2959,7 +2959,10 @@ int qemu_loadvm_state(QEMUFile *f) trace_qemu_loadvm_state_post_main(ret); if (mis->have_listen_thread) { - /* Listen thread still going, can't clean up yet */ + /* + * Postcopy listen thread still going, don't synchronize the + * cpus yet. + */ return ret; } @@ -3002,7 +3005,6 @@ int qemu_loadvm_state(QEMUFile *f) } } - qemu_loadvm_state_cleanup(); cpu_synchronize_all_post_init(); return ret; -- Gitee From 82b23ca67d0a5d77cb0266b89f76b9c8c4bffb3d Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 19 Sep 2024 12:06:11 -0300 Subject: [PATCH 900/939] migration/multifd: Ensure packet->ramblock is null-terminated commit 68e0fca625912c7c63a8bfbc784f53d4fefa1a13 upstream. Coverity points out that the current usage of strncpy to write the ramblock name allows the field to not have an ending '\0' in case idstr is already not null-terminated (e.g. if it's larger than 256 bytes). This is currently harmless because the packet->ramblock field is never touched again on the source side. The destination side reads only up to the field's size from the stream and forces the last byte to be 0. We're still open to a programming error in the future in case this field is ever passed into a function that expects a null-terminated string. Change from strncpy to QEMU's pstrcpy, which puts a '\0' at the end of the string and doesn't fill the extra space with zeros. (there's no spillage between iterations of fill_packet because after commit 87bb9e953e ("migration/multifd: Isolate ram pages packet data") the packet is always zeroed before filling) Resolves: Coverity CID 1560071 Reported-by: Peter Maydell Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240919150611.17074-1-farosas@suse.de Signed-off-by: Peter Xu Conflicts: migration/multifd-nocomp.c [jz: upstream has split nocomp code into multifd-nocomp.c, while openEuler hasn't yet. 
The function that needs to be fixed is still named multifd_send_fill_packet in multifd.c, so we fix it in multifd.c] Signed-off-by: Jason Zeng --- migration/multifd.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/migration/multifd.c b/migration/multifd.c index 0fcecc3759..3761a803ed 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -16,6 +16,7 @@ #include "exec/target_page.h" #include "sysemu/sysemu.h" #include "exec/ramblock.h" +#include "qemu/cutils.h" #include "qemu/error-report.h" #include "qapi/error.h" #include "qapi/qapi-events-migration.h" @@ -400,7 +401,8 @@ void multifd_send_fill_packet(MultiFDSendParams *p) packet->packet_num = cpu_to_be64(packet_num); if (pages->block) { - strncpy(packet->ramblock, pages->block->idstr, 256); + pstrcpy(packet->ramblock, sizeof(packet->ramblock), + pages->block->idstr); } for (i = 0; i < pages->num; i++) { -- Gitee From c927bd2c10ee92131eba56ab8d2c26dd9dedfe50 Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Tue, 29 Oct 2024 15:58:15 +0100 Subject: [PATCH 901/939] migration/multifd: Zero p->flags before starting filling a packet commit 00b4b216534d84ace7b0583cec70a3aaf256cb25 upstream. This way there aren't stale flags there. p->flags can't contain SYNC to be sent at the next RAM packet since syncs are now handled separately in multifd_send_thread. Reviewed-by: Fabiano Rosas Reviewed-by: Peter Xu Signed-off-by: Maciej S. Szmigiero Link: https://lore.kernel.org/r/1c96b6cdb797e6f035eb1a4ad9bfc24f4c7f5df8.1730203967.git.maciej.szmigiero@oracle.com Signed-off-by: Peter Xu Conflicts: migration/multifd.c [jz: resolve simple context conflict] Signed-off-by: Jason Zeng --- migration/multifd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/migration/multifd.c b/migration/multifd.c index 3761a803ed..36581a5631 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -898,6 +898,7 @@ static void *multifd_send_thread(void *opaque) if (qatomic_load_acquire(&p->pending_job)) { MultiFDPages_t *pages = p->pages; + p->flags = 0; p->iovs_num = 0; assert(pages->num); @@ -944,7 +945,6 @@ static void *multifd_send_thread(void *opaque) } /* p->next_packet_size will always be zero for a SYNC packet */ stat64_add(&mig_stats.multifd_bytes, p->packet_len); - p->flags = 0; } qatomic_set(&p->pending_sync, false); -- Gitee From 123e52e1dc6629fec922dad4f7c97e23a82ec157 Mon Sep 17 00:00:00 2001 From: Yuan Liu Date: Wed, 18 Dec 2024 17:14:11 +0800 Subject: [PATCH 902/939] multifd: bugfix for migration using compression methods commit cdc3970f8597ebdc1a4c2090cfb4d11e297329ed upstream. When compression is enabled on the migration channel and the pages processed are all zero pages, these pages will not be sent and updated on the target side, resulting in incorrect memory data on the source and target sides. The root cause is that all compression methods call multifd_send_prepare_common to determine whether to compress dirty pages, but multifd_send_prepare_common does not update the IOV of MultiFDPacket_t when all dirty pages are zero pages. The solution is to always update the IOV of MultiFDPacket_t regardless of whether the dirty pages are all zero pages. 
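Condensed from the multifd.c hunk below, the fixed helper fills the packet header before zero-page detection, so a packet whose pages all turn out to be zero pages still goes out with valid header metadata:

    bool multifd_send_prepare_common(MultiFDSendParams *p)
    {
        multifd_send_prepare_header(p);     /* moved ahead of the detection */
        multifd_send_zero_page_detect(p);

        if (!p->pages->normal_num) {
            /* nothing left to compress; only the packet metadata is sent */
            return false;
        }

        return true;
    }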
Intel-SIG: commit cdc3970f8597 multifd: bugfix for migration using compression methods Fixes: 303e6f54f9 ("migration/multifd: Implement zero page transmission on the multifd thread.") Cc: qemu-stable@nongnu.org #9.0+ Signed-off-by: Yuan Liu Reviewed-by: Jason Zeng Reviewed-by: Peter Xu Message-Id: <20241218091413.140396-2-yuan1.liu@intel.com> Signed-off-by: Fabiano Rosas Conflicts: migration/multifd-nocomp.c [jz: upstream has split nocomp code into multifd-nocomp.c, while openEuler hasn't yet. The function that needs to be fixed is still in multifd.c, so we fix it in multifd.c] Signed-off-by: Jason Zeng --- migration/multifd.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 36581a5631..4c310deb61 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -1488,6 +1488,7 @@ void multifd_recv_new_channel(QIOChannel *ioc, Error **errp) bool multifd_send_prepare_common(MultiFDSendParams *p) { + multifd_send_prepare_header(p); multifd_send_zero_page_detect(p); if (!p->pages->normal_num) { @@ -1495,7 +1496,5 @@ bool multifd_send_prepare_common(MultiFDSendParams *p) return false; } - multifd_send_prepare_header(p); - return true; } -- Gitee From 1b0fb2f08c76bc727e52ff763ed5bb7ee1bda820 Mon Sep 17 00:00:00 2001 From: Yuan Liu Date: Wed, 18 Dec 2024 17:14:12 +0800 Subject: [PATCH 903/939] multifd: bugfix for incorrect migration data with QPL compression commit 2588a5f99b0c3493b4690e3ff01ed36f80e830cc upstream. When QPL compression is enabled on the migration channel and the same dirty page changes from a normal page to a zero page in the iterative memory copy, the dirty page will not be updated to a zero page again on the target side, resulting in incorrect memory data on the source and target sides. The root cause is that the target side does not record the normal pages to the receivedmap. The solution is to add ramblock_recv_bitmap_set_offset in target side to record the normal pages. Intel-SIG: commit 2588a5f99b0c multifd: bugfix for incorrect migration data with QPL compression Signed-off-by: Yuan Liu Reviewed-by: Jason Zeng Reviewed-by: Peter Xu Message-Id: <20241218091413.140396-3-yuan1.liu@intel.com> Signed-off-by: Fabiano Rosas Signed-off-by: Jason Zeng --- migration/multifd-qpl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/migration/multifd-qpl.c b/migration/multifd-qpl.c index 9265098ee7..fea60e3937 100644 --- a/migration/multifd-qpl.c +++ b/migration/multifd-qpl.c @@ -730,6 +730,7 @@ static int multifd_qpl_recv(MultiFDRecvParams *p, Error **errp) qpl->zlen[i] = be32_to_cpu(qpl->zlen[i]); assert(qpl->zlen[i] <= p->page_size); zbuf_len += qpl->zlen[i]; + ramblock_recv_bitmap_set_offset(p->block, p->normal[i]); } /* read compressed pages */ -- Gitee From 7541385c82f3c85fc8727080bb224dd8761fe719 Mon Sep 17 00:00:00 2001 From: Yuan Liu Date: Wed, 18 Dec 2024 17:14:13 +0800 Subject: [PATCH 904/939] multifd: bugfix for incorrect migration data with qatzip compression commit a523bc52166c80d8a04d46584f9f3868bd53ef69 upstream. When QPL compression is enabled on the migration channel and the same dirty page changes from a normal page to a zero page in the iterative memory copy, the dirty page will not be updated to a zero page again on the target side, resulting in incorrect memory data on the source and target sides. The root cause is that the target side does not record the normal pages to the receivedmap. The solution is to add ramblock_recv_bitmap_set_offset in target side to record the normal pages. 
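In sketch form, the bookkeeping the fix adds on the destination (the real call site is the per-page loop inside multifd_qpl_recv(), shown in the hunk below):

    for (int i = 0; i < p->normal_num; i++) {
        /* ...page i is decompressed or copied into p->host + p->normal[i]... */
        ramblock_recv_bitmap_set_offset(p->block, p->normal[i]);  /* mark page received */
    }

Without that record, a page that later arrives as a zero page is assumed to have never been written and is not re-zeroed, which is exactly the corruption described above.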
Intel-SIG: commit a523bc52166c multifd: bugfix for incorrect migration data with qatzip compression Signed-off-by: Yuan Liu Reviewed-by: Jason Zeng Reviewed-by: Peter Xu Message-Id: <20241218091413.140396-4-yuan1.liu@intel.com> Signed-off-by: Fabiano Rosas Conflicts: migration/multifd-qatzip.c [jz: resolve context conflict] Signed-off-by: Jason Zeng --- migration/multifd-qatzip.c | 1 + 1 file changed, 1 insertion(+) diff --git a/migration/multifd-qatzip.c b/migration/multifd-qatzip.c index 3c787ed879..88b6fb44ad 100644 --- a/migration/multifd-qatzip.c +++ b/migration/multifd-qatzip.c @@ -373,6 +373,7 @@ static int qatzip_recv(MultiFDRecvParams *p, Error **errp) memcpy(p->host + p->normal[i], q->out_buf + p->page_size * i, p->page_size); + ramblock_recv_bitmap_set_offset(p->block, p->normal[i]); } return 0; } -- Gitee From e8587f657fd33f223227a167e94ed69db729e2ac Mon Sep 17 00:00:00 2001 From: eillon Date: Sun, 25 May 2025 22:22:58 +0800 Subject: [PATCH 905/939] hw/arm/virt: only support the HDBSS feature in aarch64 Only support the HDBSS feature in aarch64 architecture as it depends on the kvm. --- migration/ram.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/migration/ram.c b/migration/ram.c index 6acf518a34..a8308eb005 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -2812,6 +2812,7 @@ static void xbzrle_cleanup(void) XBZRLE_cache_unlock(); } +#ifdef TARGET_AARCH64 static void kvm_update_hdbss_cap(bool enable) { KVMState *s = kvm_state; @@ -2836,6 +2837,7 @@ static void kvm_update_hdbss_cap(bool enable) return; } +#endif static void ram_save_cleanup(void *opaque) { @@ -2853,7 +2855,9 @@ static void ram_save_cleanup(void *opaque) * memory_global_dirty_log_stop will assert that * memory_global_dirty_log_start/stop used in pairs */ +#ifdef TARGET_AARCH64 kvm_update_hdbss_cap(false); +#endif memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION); } } @@ -3257,7 +3261,9 @@ static void ram_init_bitmaps(RAMState *rs) ram_list_init_bitmaps(); /* We don't use dirty log with background snapshots */ if (!migrate_background_snapshot()) { +#ifdef TARGET_AARCH64 kvm_update_hdbss_cap(true); +#endif memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION); migration_bitmap_sync_precopy(rs, false); } -- Gitee From c709e84c483af5466b9bf1d289a70813942da7e0 Mon Sep 17 00:00:00 2001 From: libai Date: Tue, 1 Apr 2025 15:33:54 +0800 Subject: [PATCH 906/939] virtio-pci:Batch processing of IRQFD mapping for multi queue Virtio devices The virtio device with multiple queues currently calls ioctl every time it establishes an irqfd route for vq. Since the devices will not actually run until all queue irqfds are completed, these irqfd routes can be temporarily stored and submitted to kvm through ioctl at once to reduce the number of ioctl attempts and optimize the startup speed of virtio devices. 
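The batching pattern in sketch form (condensed; the real code wraps this in virtio_pci_begin_route_changes()/virtio_pci_commit_route_changes() around the per-queue vector setup, and the vector lookup here is illustrative):

    KVMRouteChange c = kvm_irqchip_begin_route_changes(kvm_state);

    for (int queue_no = 0; queue_no < nvqs; queue_no++) {
        int vector = virtio_queue_vector(vdev, queue_no);
        /* the route is only queued here; no per-queue ioctl is issued */
        kvm_irqchip_add_msi_route(&c, vector, &proxy->pci_dev);
    }

    kvm_irqchip_commit_route_changes(&c);   /* one routing-table update covers all queues */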
Signed-off-by: libai --- hw/virtio/virtio-pci.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index 06b125ec62..7cd15f70e3 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -49,6 +49,18 @@ * configuration space */ #define VIRTIO_PCI_CONFIG_SIZE(dev) VIRTIO_PCI_CONFIG_OFF(msix_enabled(dev)) +static KVMRouteChange virtio_pci_route_change; + +static inline void virtio_pci_begin_route_changes(void) +{ + virtio_pci_route_change = kvm_irqchip_begin_route_changes(kvm_state); +} + +static inline void virtio_pci_commit_route_changes(void) +{ + kvm_irqchip_commit_route_changes(&virtio_pci_route_change); +} + static void virtio_pci_bus_new(VirtioBusState *bus, size_t bus_size, VirtIOPCIProxy *dev); static void virtio_pci_reset(DeviceState *qdev); @@ -815,12 +827,10 @@ static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy, int ret; if (irqfd->users == 0) { - KVMRouteChange c = kvm_irqchip_begin_route_changes(kvm_state); - ret = kvm_irqchip_add_msi_route(&c, vector, &proxy->pci_dev); + ret = kvm_irqchip_add_msi_route(&virtio_pci_route_change, vector, &proxy->pci_dev); if (ret < 0) { return ret; } - kvm_irqchip_commit_route_changes(&c); irqfd->virq = ret; } irqfd->users++; @@ -950,12 +960,14 @@ static int kvm_virtio_pci_vector_vq_use(VirtIOPCIProxy *proxy, int nvqs) } #endif + virtio_pci_begin_route_changes(); for (queue_no = 0; queue_no < nvqs; queue_no++) { if (!virtio_queue_get_num(vdev, queue_no)) { return -1; } ret = kvm_virtio_pci_vector_use_one(proxy, queue_no); } + virtio_pci_commit_route_changes(); #ifdef __aarch64__ if (!strcmp(vdev->name, "virtio-net") && ret != 0) { -- Gitee From 66749037256732f369c387c136e14f727a51951f Mon Sep 17 00:00:00 2001 From: libai Date: Tue, 1 Apr 2025 17:09:38 +0800 Subject: [PATCH 907/939] kvm/msi: Mark whether there is an IRQ route table update through changes This patch prevents unnecessary updates to the IRQ route without modification Signed-off-by: libai --- accel/kvm/kvm-all.c | 11 ++++++----- accel/stubs/kvm-stub.c | 2 +- hw/intc/ioapic.c | 5 +++-- hw/misc/ivshmem.c | 6 ++++-- hw/vfio/pci.c | 5 +++-- hw/virtio/virtio-pci.c | 5 +++-- include/sysemu/kvm.h | 2 +- target/i386/kvm/kvm.c | 6 ++++-- 8 files changed, 25 insertions(+), 17 deletions(-) diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index 825ecb99a8..aa41b42efc 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -1902,10 +1902,11 @@ static void kvm_add_routing_entry(KVMState *s, set_gsi(s, entry->gsi); } -static int kvm_update_routing_entry(KVMState *s, +static int kvm_update_routing_entry(KVMRouteChange *c, struct kvm_irq_routing_entry *new_entry) { struct kvm_irq_routing_entry *entry; + KVMState *s = c->s; int n; for (n = 0; n < s->irq_routes->nr; n++) { @@ -1919,7 +1920,7 @@ static int kvm_update_routing_entry(KVMState *s, } *entry = *new_entry; - + c->changes++; return 0; } @@ -2051,7 +2052,7 @@ int kvm_irqchip_add_msi_route(KVMRouteChange *c, int vector, PCIDevice *dev) return virq; } -int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg, +int kvm_irqchip_update_msi_route(KVMRouteChange *c, int virq, MSIMessage msg, PCIDevice *dev) { struct kvm_irq_routing_entry kroute = {}; @@ -2081,7 +2082,7 @@ int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg, trace_kvm_irqchip_update_msi_route(virq); - return kvm_update_routing_entry(s, &kroute); + return kvm_update_routing_entry(c, &kroute); } static int kvm_irqchip_assign_irqfd(KVMState *s, 
EventNotifier *event, @@ -2223,7 +2224,7 @@ static int kvm_irqchip_assign_irqfd(KVMState *s, EventNotifier *event, abort(); } -int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg) +int kvm_irqchip_update_msi_route(KVMRouteChange *c, int virq, MSIMessage msg) { return -ENOSYS; } diff --git a/accel/stubs/kvm-stub.c b/accel/stubs/kvm-stub.c index b071afee45..1fffdc0ea2 100644 --- a/accel/stubs/kvm-stub.c +++ b/accel/stubs/kvm-stub.c @@ -65,7 +65,7 @@ void kvm_irqchip_release_virq(KVMState *s, int virq) { } -int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg, +int kvm_irqchip_update_msi_route(KVMRouteChange *c, int virq, MSIMessage msg, PCIDevice *dev) { return -ENOSYS; diff --git a/hw/intc/ioapic.c b/hw/intc/ioapic.c index 716ffc8bbb..0b43aec8fa 100644 --- a/hw/intc/ioapic.c +++ b/hw/intc/ioapic.c @@ -195,6 +195,7 @@ static void ioapic_update_kvm_routes(IOAPICCommonState *s) int i; if (kvm_irqchip_is_split()) { + KVMRouteChange c = kvm_irqchip_begin_route_changes(kvm_state); for (i = 0; i < IOAPIC_NUM_PINS; i++) { MSIMessage msg; struct ioapic_entry_info info; @@ -202,10 +203,10 @@ static void ioapic_update_kvm_routes(IOAPICCommonState *s) if (!info.masked) { msg.address = info.addr; msg.data = info.data; - kvm_irqchip_update_msi_route(kvm_state, i, msg, NULL); + kvm_irqchip_update_msi_route(&c, i, msg, NULL); } } - kvm_irqchip_commit_routes(kvm_state); + kvm_irqchip_commit_route_changes(&c); } #endif } diff --git a/hw/misc/ivshmem.c b/hw/misc/ivshmem.c index ad9a3c546e..f66491a7a7 100644 --- a/hw/misc/ivshmem.c +++ b/hw/misc/ivshmem.c @@ -278,6 +278,7 @@ static int ivshmem_vector_unmask(PCIDevice *dev, unsigned vector, IVShmemState *s = IVSHMEM_COMMON(dev); EventNotifier *n = &s->peers[s->vm_id].eventfds[vector]; MSIVector *v = &s->msi_vectors[vector]; + KVMRouteChange c; int ret; IVSHMEM_DPRINTF("vector unmask %p %d\n", dev, vector); @@ -287,11 +288,12 @@ static int ivshmem_vector_unmask(PCIDevice *dev, unsigned vector, } assert(!v->unmasked); - ret = kvm_irqchip_update_msi_route(kvm_state, v->virq, msg, dev); + c = kvm_irqchip_begin_route_changes(kvm_state); + ret = kvm_irqchip_update_msi_route(&c, v->virq, msg, dev); if (ret < 0) { return ret; } - kvm_irqchip_commit_routes(kvm_state); + kvm_irqchip_commit_route_changes(&c); ret = kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, n, NULL, v->virq); if (ret < 0) { diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 293deb8737..ce958848b6 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -507,8 +507,9 @@ static void vfio_remove_kvm_msi_virq(VFIOMSIVector *vector) static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg, PCIDevice *pdev) { - kvm_irqchip_update_msi_route(kvm_state, vector->virq, msg, pdev); - kvm_irqchip_commit_routes(kvm_state); + KVMRouteChange c = kvm_irqchip_begin_route_changes(kvm_state); + kvm_irqchip_update_msi_route(&c, vector->virq, msg, pdev); + kvm_irqchip_commit_route_changes(&c); } static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index 7cd15f70e3..a677fa0736 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -1044,12 +1044,13 @@ static int virtio_pci_one_vector_unmask(VirtIOPCIProxy *proxy, if (proxy->vector_irqfd) { irqfd = &proxy->vector_irqfd[vector]; if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) { - ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg, + KVMRouteChange c = kvm_irqchip_begin_route_changes(kvm_state); + ret = 
kvm_irqchip_update_msi_route(&c, irqfd->virq, msg, &proxy->pci_dev); if (ret < 0) { return ret; } - kvm_irqchip_commit_routes(kvm_state); + kvm_irqchip_commit_route_changes(&c); } } diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index 176aa53cbe..16cccc881e 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -501,7 +501,7 @@ void kvm_init_cpu_signals(CPUState *cpu); * @return: virq (>=0) when success, errno (<0) when failed. */ int kvm_irqchip_add_msi_route(KVMRouteChange *c, int vector, PCIDevice *dev); -int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg, +int kvm_irqchip_update_msi_route(KVMRouteChange *c, int virq, MSIMessage msg, PCIDevice *dev); void kvm_irqchip_commit_routes(KVMState *s); diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index 2df3ff99c3..3a88e65635 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -5700,9 +5700,11 @@ void kvm_update_msi_routes_all(void *private, bool global, { int cnt = 0, vector; MSIRouteEntry *entry; + KVMRouteChange c; MSIMessage msg; PCIDevice *dev; + c = kvm_irqchip_begin_route_changes(kvm_state); /* TODO: explicit route update */ QLIST_FOREACH(entry, &msi_route_list, list) { cnt++; @@ -5719,9 +5721,9 @@ void kvm_update_msi_routes_all(void *private, bool global, */ continue; } - kvm_irqchip_update_msi_route(kvm_state, entry->virq, msg, dev); + kvm_irqchip_update_msi_route(&c, entry->virq, msg, dev); } - kvm_irqchip_commit_routes(kvm_state); + kvm_irqchip_commit_route_changes(&c); trace_kvm_x86_update_msi_routes(cnt); } -- Gitee From d6e6ef58847bf34db9535649bf33e2a72b59495e Mon Sep 17 00:00:00 2001 From: libai Date: Tue, 1 Apr 2025 17:28:02 +0800 Subject: [PATCH 908/939] virtio/irqfd: Batch processing of irqfd related operations during virtio device startup This patch adds batch processing for unmask operations Signed-off-by: libai --- hw/virtio/virtio-pci.c | 97 ++++++++++++++++++++++++++++++++++---- include/hw/virtio/virtio.h | 1 + 2 files changed, 89 insertions(+), 9 deletions(-) diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index a677fa0736..558471307a 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -51,14 +51,86 @@ static KVMRouteChange virtio_pci_route_change; -static inline void virtio_pci_begin_route_changes(void) +static int kvm_virtio_pci_irqfd_use(VirtIOPCIProxy *proxy, + EventNotifier *n, + unsigned int vector); + +static inline void virtio_pci_begin_route_changes(VirtIODevice *vdev) +{ + if (!vdev->defer_kvm_irq_routing) { + virtio_pci_route_change = kvm_irqchip_begin_route_changes(kvm_state); + } +} + +static inline void virtio_pci_commit_route_changes(VirtIODevice *vdev) { + if (!vdev->defer_kvm_irq_routing) { + kvm_irqchip_commit_route_changes(&virtio_pci_route_change); + } +} + +static void virtio_pci_prepare_kvm_msi_virq_batch(VirtIOPCIProxy *proxy) +{ + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + + if (vdev->defer_kvm_irq_routing) { + qemu_log("invaild defer kvm irq routing state: %d\n", vdev->defer_kvm_irq_routing); + return; + } virtio_pci_route_change = kvm_irqchip_begin_route_changes(kvm_state); + vdev->defer_kvm_irq_routing = true; } -static inline void virtio_pci_commit_route_changes(void) +static void virtio_pci_commit_kvm_msi_virq_batch(VirtIOPCIProxy *proxy) { + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev); + EventNotifier *n; + VirtQueue *vq; + int vector, index, ret; + + if (!vdev->defer_kvm_irq_routing) { + qemu_log("invaild defer kvm 
irq routing state: %d\n", vdev->defer_kvm_irq_routing); + return; + } + vdev->defer_kvm_irq_routing = false; kvm_irqchip_commit_route_changes(&virtio_pci_route_change); + + if (vdev->use_guest_notifier_mask && k->guest_notifier_mask) { + return; + } + + for (vector = 0; vector < proxy->pci_dev.msix_entries_nr; vector++) { + if (msix_is_masked(&proxy->pci_dev, vector)) { + continue; + } + + if (vector == vdev->config_vector) { + n = virtio_config_get_guest_notifier(vdev); + ret = kvm_virtio_pci_irqfd_use(proxy, n, vector); + if (ret) { + qemu_log("config irqfd use failed: %d\n", ret); + } + continue; + } + + vq = virtio_vector_first_queue(vdev, vector); + + while (vq) { + index = virtio_get_queue_index(vq); + if (!virtio_queue_get_num(vdev, index)) { + break; + } + if (index < proxy->nvqs_with_notifiers) { + n = virtio_queue_get_guest_notifier(vq); + ret = kvm_virtio_pci_irqfd_use(proxy, n, vector); + if (ret < 0) { + qemu_log("Error: irqfd use failed: %d\n", ret); + } + } + vq = virtio_vector_next_queue(vq); + } + } } static void virtio_pci_bus_new(VirtioBusState *bus, size_t bus_size, @@ -959,15 +1031,17 @@ static int kvm_virtio_pci_vector_vq_use(VirtIOPCIProxy *proxy, int nvqs) kvm_create_shadow_device(&proxy->pci_dev); } #endif - - virtio_pci_begin_route_changes(); for (queue_no = 0; queue_no < nvqs; queue_no++) { if (!virtio_queue_get_num(vdev, queue_no)) { return -1; } + } + + virtio_pci_begin_route_changes(vdev); + for (queue_no = 0; queue_no < nvqs; queue_no++) { ret = kvm_virtio_pci_vector_use_one(proxy, queue_no); } - virtio_pci_commit_route_changes(); + virtio_pci_commit_route_changes(vdev); #ifdef __aarch64__ if (!strcmp(vdev->name, "virtio-net") && ret != 0) { @@ -1044,13 +1118,13 @@ static int virtio_pci_one_vector_unmask(VirtIOPCIProxy *proxy, if (proxy->vector_irqfd) { irqfd = &proxy->vector_irqfd[vector]; if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) { - KVMRouteChange c = kvm_irqchip_begin_route_changes(kvm_state); - ret = kvm_irqchip_update_msi_route(&c, irqfd->virq, msg, + virtio_pci_begin_route_changes(vdev); + ret = kvm_irqchip_update_msi_route(&virtio_pci_route_change, irqfd->virq, msg, &proxy->pci_dev); if (ret < 0) { return ret; } - kvm_irqchip_commit_route_changes(&c); + virtio_pci_commit_route_changes(vdev); } } @@ -1065,7 +1139,9 @@ static int virtio_pci_one_vector_unmask(VirtIOPCIProxy *proxy, event_notifier_set(n); } } else { - ret = kvm_virtio_pci_irqfd_use(proxy, n, vector); + if (!vdev->defer_kvm_irq_routing) { + ret = kvm_virtio_pci_irqfd_use(proxy, n, vector); + } } return ret; } @@ -1322,6 +1398,8 @@ static int virtio_pci_set_guest_notifiers(DeviceState *d, int nvqs, bool assign) if ((with_irqfd || (vdev->use_guest_notifier_mask && k->guest_notifier_mask)) && assign) { + + virtio_pci_prepare_kvm_msi_virq_batch(proxy); if (with_irqfd) { proxy->vector_irqfd = g_malloc0(sizeof(*proxy->vector_irqfd) * @@ -1339,6 +1417,7 @@ static int virtio_pci_set_guest_notifiers(DeviceState *d, int nvqs, bool assign) r = msix_set_vector_notifiers(&proxy->pci_dev, virtio_pci_vector_unmask, virtio_pci_vector_mask, virtio_pci_vector_poll); + virtio_pci_commit_kvm_msi_virq_batch(proxy); if (r < 0) { goto notifiers_error; } diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h index 78db2bde98..672f7445dd 100644 --- a/include/hw/virtio/virtio.h +++ b/include/hw/virtio/virtio.h @@ -147,6 +147,7 @@ struct VirtIODevice bool use_started; bool started; bool start_on_kick; /* when virtio 1.0 feature has not been negotiated */ + bool 
defer_kvm_irq_routing; bool disable_legacy_check; bool vhost_started; VMChangeStateEntry *vmstate; -- Gitee From 0e3d3b9a3cd54340b2d9991918a172ed38670bcd Mon Sep 17 00:00:00 2001 From: libai Date: Wed, 2 Apr 2025 20:14:10 +0800 Subject: [PATCH 909/939] migration:Extand the fdtable in the incoming phase of migration Perform the fdtable extension in advance to avoid time consumption caused by triggering the fdtable extension during the migration downtime. Signed-off-by: libai --- migration/migration.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/migration/migration.c b/migration/migration.c index dce22c2da5..9a433e615b 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -68,6 +68,8 @@ #include "sysemu/dirtylimit.h" #include "qemu/sockets.h" +#define DEFAULT_FD_MAX 4096 + static NotifierList migration_state_notifiers = NOTIFIER_LIST_INITIALIZER(migration_state_notifiers); @@ -1712,6 +1714,31 @@ void migrate_del_blocker(Error **reasonp) } } +/* + * Kernel will expand the fatable allocated to the qemu process when + * the number of fds held by qemu process exceeds a power of 2 (starting from 64). + * Each expansion introduces tens of ms of latency due to RCU synchronization. + * The expansion is completed during qemu process initialization to avoid + * triggering this action during the migration downtime phase. + */ +static void qemu_pre_extend_fdtable(void) +{ + int buffer[DEFAULT_FD_MAX] = {0}; + int i; + + /* expand fdtable */ + for (i = 0; i < DEFAULT_FD_MAX; i++) { + buffer[i] = qemu_dup(STDIN_FILENO); + } + + /* close tmp fd */ + for (i = 0; i < DEFAULT_FD_MAX; i++) { + if (buffer[i] > 0) { + (void)qemu_close(buffer[i]); + } + } +} + void qmp_migrate_incoming(const char *uri, bool has_channels, MigrationChannelList *channels, Error **errp) { @@ -1731,6 +1758,8 @@ void qmp_migrate_incoming(const char *uri, bool has_channels, return; } + qemu_pre_extend_fdtable(); + qemu_start_incoming_migration(uri, has_channels, channels, &local_err); if (local_err) { -- Gitee From 0cc093ba0d25536162685a0bd45b80f97d91cf15 Mon Sep 17 00:00:00 2001 From: libai Date: Wed, 9 Apr 2025 11:06:52 +0800 Subject: [PATCH 910/939] migration/memory:Optimize unnecessary memory region updates during live migration During the startup phase of the destination VM for live migration, there is no need to update the memory region in real time. Instead, just force commit once before each device load state. Signed-off-by: libai --- include/exec/memory.h | 5 +++++ migration/savevm.c | 7 ++++++ migration/vmstate.c | 8 +++++++ system/memory.c | 45 ++++++++++++++++++++++----------------- tests/unit/test-vmstate.c | 6 ++++++ 5 files changed, 51 insertions(+), 20 deletions(-) diff --git a/include/exec/memory.h b/include/exec/memory.h index c14dc69d27..924bdbd481 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -2567,6 +2567,11 @@ void memory_region_transaction_begin(void); */ void memory_region_transaction_commit(void); +/** + * memory_region_commit: Force commit memory region immediately. 
+ */ +void memory_region_commit(void); + /** * memory_listener_register: register callbacks to be called when memory * sections are mapped or unmapped into an address diff --git a/migration/savevm.c b/migration/savevm.c index cc65da605e..030a4bf7d2 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -2857,6 +2857,10 @@ int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis) uint8_t section_type; int ret = 0; + if (qemu_mutex_iothread_locked()) { + memory_region_transaction_begin(); + } + retry: while (true) { section_type = qemu_get_byte(f); @@ -2900,6 +2904,9 @@ retry: } out: + if (qemu_mutex_iothread_locked()) { + memory_region_transaction_commit(); + } if (ret < 0) { qemu_file_set_error(f, ret); diff --git a/migration/vmstate.c b/migration/vmstate.c index bd08e390c5..e621d8ddb7 100644 --- a/migration/vmstate.c +++ b/migration/vmstate.c @@ -20,6 +20,7 @@ #include "qemu/bitops.h" #include "qemu/error-report.h" #include "trace.h" +#include "exec/memory.h" static int vmstate_subsection_save(QEMUFile *f, const VMStateDescription *vmsd, void *opaque, JSONWriter *vmdesc, @@ -184,6 +185,13 @@ int vmstate_load_state(QEMUFile *f, const VMStateDescription *vmsd, return ret; } if (vmsd->post_load) { + /** + * We call memory_transaction_begin in qemu_loadvm_state_main, + * so address space will not be updated during vm state loading. + * But some dev need to use address space here, force commit + * memory region transaction before call post_load. + */ + memory_region_commit(); ret = vmsd->post_load(opaque, version_id); } trace_vmstate_load_state_end(vmsd->name, "end", ret); diff --git a/system/memory.c b/system/memory.c index 9db07fd832..fd76eb7048 100644 --- a/system/memory.c +++ b/system/memory.c @@ -1117,34 +1117,39 @@ void memory_region_transaction_begin(void) ++memory_region_transaction_depth; } -void memory_region_transaction_commit(void) +void memory_region_commit(void) { AddressSpace *as; + if (memory_region_update_pending) { + flatviews_reset(); + + MEMORY_LISTENER_CALL_GLOBAL(begin, Forward); + + QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) { + address_space_set_flatview(as); + address_space_update_ioeventfds(as); + } + memory_region_update_pending = false; + ioeventfd_update_pending = false; + MEMORY_LISTENER_CALL_GLOBAL(commit, Forward); + } else if (ioeventfd_update_pending) { + QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) { + address_space_update_ioeventfds(as); + } + ioeventfd_update_pending = false; + } +} + +void memory_region_transaction_commit(void) +{ assert(memory_region_transaction_depth); assert(qemu_mutex_iothread_locked()); --memory_region_transaction_depth; if (!memory_region_transaction_depth) { - if (memory_region_update_pending) { - flatviews_reset(); - - MEMORY_LISTENER_CALL_GLOBAL(begin, Forward); - - QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) { - address_space_set_flatview(as); - address_space_update_ioeventfds(as); - } - memory_region_update_pending = false; - ioeventfd_update_pending = false; - MEMORY_LISTENER_CALL_GLOBAL(commit, Forward); - } else if (ioeventfd_update_pending) { - QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) { - address_space_update_ioeventfds(as); - } - ioeventfd_update_pending = false; - } - } + memory_region_commit(); + } } static void memory_region_destructor_none(MemoryRegion *mr) diff --git a/tests/unit/test-vmstate.c b/tests/unit/test-vmstate.c index 0b7d5ecd68..22c586eee0 100644 --- a/tests/unit/test-vmstate.c +++ b/tests/unit/test-vmstate.c @@ -31,6 +31,7 @@ #include 
"../migration/savevm.h" #include "qemu/module.h" #include "io/channel-file.h" +#include "exec/memory.h" static int temp_fd; @@ -1479,6 +1480,11 @@ static void test_tmp_struct(void) g_assert_cmpint(obj.f, ==, 8); /* From the child->parent */ } +/* stub for ut */ +void memory_region_commit(void) +{ +} + int main(int argc, char **argv) { g_autofree char *temp_file = g_strdup_printf("%s/vmst.test.XXXXXX", -- Gitee From d43019e644fb93c64e9016c5d618d8e20a60270d Mon Sep 17 00:00:00 2001 From: libai Date: Wed, 9 Apr 2025 14:22:19 +0800 Subject: [PATCH 911/939] memory/eventfd:Introduce ioeventfd batch processing to reduce the time required to update ioeventfd Setting ioeventfd triggers kernel RCU synchronization, which is time-consuming. Change it to temporarily store the modification of ioeventfds, and submit it for effect after setting is complete. Signed-off-by: libai --- accel/kvm/kvm-all.c | 32 ++++++++++++++++++++++++++++++++ include/exec/memory.h | 21 +++++++++++++++++++++ linux-headers/linux/kvm.h | 6 ++++++ system/memory.c | 2 ++ 4 files changed, 61 insertions(+) diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index aa41b42efc..f96afb1230 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -1737,6 +1737,36 @@ static void kvm_io_ioeventfd_add(MemoryListener *listener, } } +static int kvm_ioeventfd_batch(bool start) +{ + int ret; + struct kvm_ioeventfd iofd = { + .flags = start ? + KVM_IOEVENTFD_FLAG_BATCH_BEGIN : KVM_IOEVENTFD_FLAG_BATCH_END, + }; + + if (!kvm_enabled()) { + return -ENOSYS; + } + + ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd); + if (ret < 0) { + return -errno; + } + + return 0; +} + +static void kvm_ioeventfd_begin(MemoryListener *listener) +{ + kvm_ioeventfd_batch(true); +} + +static void kvm_ioeventfd_end(MemoryListener *listener) +{ + kvm_ioeventfd_batch(false); +} + static void kvm_io_ioeventfd_del(MemoryListener *listener, MemoryRegionSection *section, bool match_data, uint64_t data, @@ -2631,6 +2661,8 @@ static int kvm_init(MachineState *ms) s->memory_listener.listener.eventfd_del = kvm_mem_ioeventfd_del; s->memory_listener.listener.coalesced_io_add = kvm_coalesce_mmio_region; s->memory_listener.listener.coalesced_io_del = kvm_uncoalesce_mmio_region; + s->memory_listener.listener.eventfd_begin = kvm_ioeventfd_begin; + s->memory_listener.listener.eventfd_end = kvm_ioeventfd_end; kvm_memory_listener_register(s, &s->memory_listener, &address_space_memory, 0, "kvm-memory"); diff --git a/include/exec/memory.h b/include/exec/memory.h index 924bdbd481..69021ba491 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -1079,6 +1079,27 @@ struct MemoryListener { void (*eventfd_del)(MemoryListener *listener, MemoryRegionSection *section, bool match_data, uint64_t data, EventNotifier *e); + /** + * @eventfd_begin: + * + * Called during an address space begin to update ioeventfd, + * notify kvm that ioeventfd will be update in batches. + * + * @listener: The #MemoryListener. + */ + void (*eventfd_begin)(MemoryListener *listener); + + /** + * @eventfd_end: + * + * Called during an address space update ioeventfd end, + * notify kvm that all ioeventfd modifications have been submitted + * and batch processing can be started. + * + * @listener: The #MemoryListener. 
+ */ + void (*eventfd_end)(MemoryListener *listener); + /** * @coalesced_io_add: * diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h index a19683f1e9..0714651440 100644 --- a/linux-headers/linux/kvm.h +++ b/linux-headers/linux/kvm.h @@ -819,6 +819,8 @@ enum { kvm_ioeventfd_flag_nr_deassign, kvm_ioeventfd_flag_nr_virtio_ccw_notify, kvm_ioeventfd_flag_nr_fast_mmio, + kvm_ioeventfd_flag_nr_batch_begin, + kvm_ioeventfd_flag_nr_batch_end, kvm_ioeventfd_flag_nr_max, }; @@ -827,6 +829,10 @@ enum { #define KVM_IOEVENTFD_FLAG_DEASSIGN (1 << kvm_ioeventfd_flag_nr_deassign) #define KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY \ (1 << kvm_ioeventfd_flag_nr_virtio_ccw_notify) +#define KVM_IOEVENTFD_FLAG_BATCH_BEGIN \ + (1<< kvm_ioeventfd_flag_nr_batch_begin) +#define KVM_IOEVENTFD_FLAG_BATCH_END \ + (1 << kvm_ioeventfd_flag_nr_batch_end) #define KVM_IOEVENTFD_VALID_FLAG_MASK ((1 << kvm_ioeventfd_flag_nr_max) - 1) diff --git a/system/memory.c b/system/memory.c index fd76eb7048..08d34262c3 100644 --- a/system/memory.c +++ b/system/memory.c @@ -1134,10 +1134,12 @@ void memory_region_commit(void) ioeventfd_update_pending = false; MEMORY_LISTENER_CALL_GLOBAL(commit, Forward); } else if (ioeventfd_update_pending) { + MEMORY_LISTENER_CALL_GLOBAL(eventfd_begin, Forward); QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) { address_space_update_ioeventfds(as); } ioeventfd_update_pending = false; + MEMORY_LISTENER_CALL_GLOBAL(eventfd_end, Forward); } } -- Gitee From 3b09c85198f4970be18ba8597d545d5dc73a0ba1 Mon Sep 17 00:00:00 2001 From: libai Date: Thu, 10 Apr 2025 16:13:49 +0800 Subject: [PATCH 912/939] memory:Optimize flatview ioeventfd processing When updating memory regions, do not repeat updates for the same memory region to optimize the memory region update process Signed-off-by: libai --- include/exec/memory.h | 2 ++ system/memory.c | 26 +++++++++++++++++++++++++- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/include/exec/memory.h b/include/exec/memory.h index 69021ba491..fe27f323b2 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -1201,6 +1201,8 @@ struct FlatView { unsigned nr_allocated; struct AddressSpaceDispatch *dispatch; MemoryRegion *root; + #define FLATVIEW_FLAG_LAST_PROCESSED (1 << 0) + unsigned flags; }; static inline FlatView *address_space_to_flatview(AddressSpace *as) diff --git a/system/memory.c b/system/memory.c index 08d34262c3..7858aa1878 100644 --- a/system/memory.c +++ b/system/memory.c @@ -856,6 +856,13 @@ static void address_space_update_ioeventfds(AddressSpace *as) return; } + view = address_space_get_flatview(as); + if (view->flags & FLATVIEW_FLAG_LAST_PROCESSED) { + flatview_unref(view); + return; + } + view->flags |= FLATVIEW_FLAG_LAST_PROCESSED; + /* * It is likely that the number of ioeventfds hasn't changed much, so use * the previous size as the starting value, with some headroom to avoid @@ -864,7 +871,6 @@ static void address_space_update_ioeventfds(AddressSpace *as) ioeventfd_max = QEMU_ALIGN_UP(as->ioeventfd_nb, 4); ioeventfds = g_new(MemoryRegionIoeventfd, ioeventfd_max); - view = address_space_get_flatview(as); FOR_EACH_FLAT_RANGE(fr, view) { for (i = 0; i < fr->mr->ioeventfd_nb; ++i) { tmp = addrrange_shift(fr->mr->ioeventfds[i].addr, @@ -1111,6 +1117,17 @@ static void address_space_update_topology(AddressSpace *as) address_space_set_flatview(as); } +static void address_space_update_view(AddressSpace *as) +{ + FlatView *view; + + view = address_space_get_flatview(as); + if (view->flags & FLATVIEW_FLAG_LAST_PROCESSED) { + 
view->flags &= ~FLATVIEW_FLAG_LAST_PROCESSED; + } + flatview_unref(view); +} + void memory_region_transaction_begin(void) { qemu_flush_coalesced_mmio_buffer(); @@ -1132,6 +1149,9 @@ void memory_region_commit(void) } memory_region_update_pending = false; ioeventfd_update_pending = false; + QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) { + address_space_update_view(as); + } MEMORY_LISTENER_CALL_GLOBAL(commit, Forward); } else if (ioeventfd_update_pending) { MEMORY_LISTENER_CALL_GLOBAL(eventfd_begin, Forward); @@ -1139,6 +1159,9 @@ void memory_region_commit(void) address_space_update_ioeventfds(as); } ioeventfd_update_pending = false; + QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) { + address_space_update_view(as); + } MEMORY_LISTENER_CALL_GLOBAL(eventfd_end, Forward); } } @@ -3149,6 +3172,7 @@ void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name) as->name = g_strdup(name ? name : "anonymous"); address_space_update_topology(as); address_space_update_ioeventfds(as); + address_space_update_view(as); } static void do_address_space_destroy(AddressSpace *as) -- Gitee From c18333142111a3bd55429594436f25765d41077a Mon Sep 17 00:00:00 2001 From: Adttil <2429917001@qq.com> Date: Wed, 9 Apr 2025 22:57:50 +0800 Subject: [PATCH 913/939] vdpa/iommufd: All vdpa devices perform only one log_sync each time. For all vdpa devices, since they share the same dirty page bytemap, only one synchronization is needed each time the dirty page bytemap is synchronized. Signed-off-by: Adttil <2429917001@qq.com> Signed-off-by: libai --- hw/virtio/vdpa-dev-iommufd.c | 52 ++++++++++++++++++++++++++++++++++++ hw/virtio/vdpa-dev.c | 1 + hw/virtio/vhost.c | 16 +++++------ include/exec/memory.h | 10 +++++++ include/hw/virtio/vhost.h | 15 +++++++++++ system/memory.c | 6 +++++ 6 files changed, 92 insertions(+), 8 deletions(-) diff --git a/hw/virtio/vdpa-dev-iommufd.c b/hw/virtio/vdpa-dev-iommufd.c index 668c6a1cb1..2b0498f9dc 100644 --- a/hw/virtio/vdpa-dev-iommufd.c +++ b/hw/virtio/vdpa-dev-iommufd.c @@ -12,6 +12,9 @@ #include "exec/target_page.h" #include "exec/address-spaces.h" #include "hw/virtio/vdpa-dev-iommufd.h" +#include "migration/migration.h" +#include "qapi/qapi-commands-migration.h" +#include "hw/virtio/vhost.h" static QLIST_HEAD(, VDPAIOMMUFDContainer) vdpa_container_list = QLIST_HEAD_INITIALIZER(vdpa_container_list); @@ -118,6 +121,51 @@ static void vhost_vdpa_iommufd_container_region_del(MemoryListener *listener, memory_region_unref(section->mr); } +static void vhost_vdpa_iommufd_container_log_sync(MemoryListener *listener, + MemoryRegionSection *section) +{ + VDPAIOMMUFDContainer *container = container_of(listener, VDPAIOMMUFDContainer, listener); + IOMMUFDHWPT *hwpt; + VhostVdpaDevice *vdev; + MigrationState *ms = migrate_get_current(); + + QLIST_FOREACH(hwpt, &container->hwpt_list, next) { + QLIST_FOREACH(vdev, &hwpt->device_list, next) { + if (!vdev->dev.log_enabled || !vdev->dev.log) { + continue; + } + + /** + * For the vhost-vdpa device, log_sync is performed on the entire VM, + * that is, this sync is for the entire flatview. + * Therefore, the first MemoryRegionSection of flatview needs to be + * synchronized. The rest of the mrs do not need to be synchronized. 
+ */ + if (is_first_section(section)) { + int r = vdev->dev.vhost_ops->vhost_log_sync(&vdev->dev); + if (r < 0) { + qemu_log("Failed to sync dirty log: %d\n", r); + if (migration_is_running(ms->state)) { + qmp_migrate_cancel(NULL); + } + return; + } + } + + /** + * Dirty maps are merged separately by MRS, so each MRS needs to be iterated. + */ + if (vhost_bytemap_log_support(&vdev->dev)) { + vhost_sync_dirty_bytemap(&vdev->dev, section); + } else { + vhost_sync_dirty_bitmap(&vdev->dev, section, 0x0, ~0x0ULL); + } + return; + } + } +} + + /* * IOTLB API used by vhost vdpa iommufd container */ @@ -125,6 +173,7 @@ const MemoryListener vhost_vdpa_iommufd_container_listener = { .name = "vhost-vdpa-iommufd-container", .region_add = vhost_vdpa_iommufd_container_region_add, .region_del = vhost_vdpa_iommufd_container_region_del, + .log_sync = vhost_vdpa_iommufd_container_log_sync, }; static int vhost_vdpa_container_connect_iommufd(VDPAIOMMUFDContainer *container) @@ -268,6 +317,7 @@ static int vhost_vdpa_container_attach_device(VDPAIOMMUFDContainer *container, V ret = ioctl(vdev->vhostfd, VHOST_VDPA_ATTACH_IOMMUFD_PT, &pt_id); if (ret == 0) { QLIST_INSERT_HEAD(&hwpt->device_list, vdev, next); + vdev->dev.has_container = true; return 0; } } @@ -293,6 +343,7 @@ static int vhost_vdpa_container_attach_device(VDPAIOMMUFDContainer *container, V } QLIST_INSERT_HEAD(&hwpt->device_list, vdev, next); + vdev->dev.has_container = true; QLIST_INSERT_HEAD(&container->hwpt_list, hwpt, next); return 0; @@ -318,6 +369,7 @@ static void vhost_vdpa_container_detach_device(VDPAIOMMUFDContainer *container, ioctl(vdev->vhostfd, VHOST_VDPA_DETACH_IOMMUFD_PT, &hwpt->hwpt_id); QLIST_SAFE_REMOVE(vdev, next); + vdev->dev.has_container = false; /* No device using this hwpt, free it */ if (QLIST_EMPTY(&hwpt->device_list)) { diff --git a/hw/virtio/vdpa-dev.c b/hw/virtio/vdpa-dev.c index b256ad540c..7ce8547419 100644 --- a/hw/virtio/vdpa-dev.c +++ b/hw/virtio/vdpa-dev.c @@ -111,6 +111,7 @@ static void vhost_vdpa_device_realize(DeviceState *dev, Error **errp) v->dev.vq_index = 0; v->dev.vq_index_end = v->dev.nvqs; v->dev.backend_features = 0; + v->dev.has_container = false; v->started = false; ret = vhost_vdpa_get_iova_range(v->vhostfd, &iova_range); diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index ed2f41e47a..58207e472b 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -46,7 +46,7 @@ do { } while (0) #endif -static inline bool vhost_bytemap_log_support(struct vhost_dev *dev) +bool vhost_bytemap_log_support(struct vhost_dev *dev) { return (dev->backend_cap & BIT_ULL(VHOST_BACKEND_F_BYTEMAPLOG)); } @@ -159,10 +159,10 @@ bool vhost_dev_has_iommu(struct vhost_dev *dev) } } -static int vhost_sync_dirty_bitmap(struct vhost_dev *dev, - MemoryRegionSection *section, - hwaddr first, - hwaddr last) +int vhost_sync_dirty_bitmap(struct vhost_dev *dev, + MemoryRegionSection *section, + hwaddr first, + hwaddr last) { int i; hwaddr start_addr; @@ -239,8 +239,8 @@ static int vhost_sync_dirty_bitmap(struct vhost_dev *dev, return 0; } -static int vhost_sync_dirty_bytemap(struct vhost_dev *dev, - MemoryRegionSection *section) +int vhost_sync_dirty_bytemap(struct vhost_dev *dev, + MemoryRegionSection *section) { unsigned long *bytemap = dev->log->log; return memory_section_set_dirty_bytemap(section, bytemap); @@ -253,7 +253,7 @@ static void vhost_log_sync(MemoryListener *listener, memory_listener); MigrationState *ms = migrate_get_current(); - if (!dev->log_enabled || !dev->log) { + if (!dev->log_enabled || !dev->log || 
dev->has_container) { return; } diff --git a/include/exec/memory.h b/include/exec/memory.h index c14dc69d27..e58ca3d368 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -2085,6 +2085,16 @@ void memory_region_set_log(MemoryRegion *mr, bool log, unsigned client); void memory_region_set_dirty(MemoryRegion *mr, hwaddr addr, hwaddr size); +/** + * is_first_section: Determine whether a MemoryRegionSection is the first section + * + * Determine whether a MemoryRegionSection is the first section + * of its corresponding parent MemoryRegion. + * + * @section: MemoryRegionSection + */ +bool is_first_section(MemoryRegionSection *section); + /** * memory_region_clear_dirty_bitmap - clear dirty bitmap for memory range * diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h index 9ca5819deb..598ae13757 100644 --- a/include/hw/virtio/vhost.h +++ b/include/hw/virtio/vhost.h @@ -133,6 +133,7 @@ struct vhost_dev { QLIST_HEAD(, vhost_iommu) iommu_list; IOMMUNotifier n; const VhostDevConfigOps *config_ops; + bool has_container; }; extern const VhostOps kernel_ops; @@ -206,6 +207,14 @@ static inline bool vhost_dev_is_started(struct vhost_dev *hdev) return hdev->started; } +/** + * vhost_bytemap_log_support() - check if the vhost device supports dirty bytemap + * @dev: common vhost_dev structure + * + * Return: true if the vhost device supports dirty bytemap, false otherwise. + */ +bool vhost_bytemap_log_support(struct vhost_dev *dev); + /** * vhost_dev_start() - start the vhost device * @hdev: common vhost_dev structure @@ -343,6 +352,12 @@ int vhost_dev_get_inflight(struct vhost_dev *dev, uint16_t queue_size, struct vhost_inflight *inflight); bool used_memslots_is_exceeded(void); bool vhost_dev_has_iommu(struct vhost_dev *dev); +int vhost_sync_dirty_bitmap(struct vhost_dev *dev, + MemoryRegionSection *section, + hwaddr first, + hwaddr last); +int vhost_sync_dirty_bytemap(struct vhost_dev *dev, + MemoryRegionSection *section); #ifdef CONFIG_VHOST int vhost_reset_device(struct vhost_dev *hdev); diff --git a/system/memory.c b/system/memory.c index 9db07fd832..dff55f7388 100644 --- a/system/memory.c +++ b/system/memory.c @@ -2271,6 +2271,12 @@ void memory_region_set_dirty(MemoryRegion *mr, hwaddr addr, memory_region_get_dirty_log_mask(mr)); } +bool is_first_section(MemoryRegionSection *section) +{ + return section->fv->ranges->addr.start == section->offset_within_address_space && + section->fv->ranges->addr.size == section->size; +} + /* * If memory region `mr' is NULL, do global sync. Otherwise, sync * dirty bitmap for the specified memory region. 
-- Gitee From d13e44fe048159d48891887169f756ac974d07fb Mon Sep 17 00:00:00 2001 From: Jason Zeng Date: Mon, 26 May 2025 16:49:00 +0800 Subject: [PATCH 914/939] hw/arm/virt: decouple migrate_hdbss_buffer_size() with kvm_update_hdbss_cap() So that we can move kvm_update_hdbss_cap() to accel/kvm/kvm-all.c Signed-of-by: Jason Zeng --- migration/ram.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/migration/ram.c b/migration/ram.c index a8308eb005..ee57da62f6 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -2813,7 +2813,7 @@ static void xbzrle_cleanup(void) } #ifdef TARGET_AARCH64 -static void kvm_update_hdbss_cap(bool enable) +static void kvm_update_hdbss_cap(bool enable, int hdbss_buffer_size) { KVMState *s = kvm_state; int size, ret; @@ -2822,7 +2822,7 @@ static void kvm_update_hdbss_cap(bool enable) return; } - size = migrate_hdbss_buffer_size(); + size = hdbss_buffer_size; if (size < 0 || size > MAX_HDBSS_BUFFER_SIZE) { fprintf(stderr, "Invalid hdbss buffer size: %d\n", size); return; @@ -2856,7 +2856,7 @@ static void ram_save_cleanup(void *opaque) * memory_global_dirty_log_start/stop used in pairs */ #ifdef TARGET_AARCH64 - kvm_update_hdbss_cap(false); + kvm_update_hdbss_cap(false, 0); #endif memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION); } @@ -3262,7 +3262,7 @@ static void ram_init_bitmaps(RAMState *rs) /* We don't use dirty log with background snapshots */ if (!migrate_background_snapshot()) { #ifdef TARGET_AARCH64 - kvm_update_hdbss_cap(true); + kvm_update_hdbss_cap(true, migrate_hdbss_buffer_size()); #endif memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION); migration_bitmap_sync_precopy(rs, false); -- Gitee From ff64aed3c87427dfa65fa85aef93b44372aefe7d Mon Sep 17 00:00:00 2001 From: Jason Zeng Date: Mon, 26 May 2025 16:59:20 +0800 Subject: [PATCH 915/939] hw/arm/virt: HDBSS: fix arm-softmmu build on x86 platform Move kvm_update_hdbss_cap() to accel/kvm/kvm-stub.c, check kvm_enabled() and add stub function Fixes: e549f32b1a88 ("hw/arm/virt: support the HDBSS feature") Signed-off-by: Jason Zeng --- accel/kvm/kvm-all.c | 25 +++++++++++++++++++++++++ accel/stubs/kvm-stub.c | 5 +++++ include/sysemu/kvm.h | 8 ++++++++ migration/migration.h | 7 ------- migration/ram.c | 35 ++++++----------------------------- 5 files changed, 44 insertions(+), 36 deletions(-) diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index f96afb1230..7d175d3262 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -3251,6 +3251,31 @@ bool kvm_arm_supports_user_irq(void) return kvm_check_extension(kvm_state, KVM_CAP_ARM_USER_IRQ); } +void kvm_update_hdbss_cap(bool enable, int hdbss_buffer_size) +{ + KVMState *s = kvm_state; + int size, ret; + + if (s == NULL || !kvm_check_extension(s, KVM_CAP_ARM_HW_DIRTY_STATE_TRACK)) { + return; + } + + size = hdbss_buffer_size; + if (size < 0 || size > MAX_HDBSS_BUFFER_SIZE) { + fprintf(stderr, "Invalid hdbss buffer size: %d\n", size); + return; + } + + ret = kvm_vm_enable_cap(s, KVM_CAP_ARM_HW_DIRTY_STATE_TRACK, 0, + enable ? size : 0); + if (ret) { + fprintf(stderr, "Could not %s KVM_CAP_ARM_HW_DIRTY_STATE_TRACK: %d\n", + enable ? 
"enable" : "disable", ret); + } + + return; +} + #ifdef KVM_CAP_SET_GUEST_DEBUG struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *cpu, vaddr pc) { diff --git a/accel/stubs/kvm-stub.c b/accel/stubs/kvm-stub.c index 1fffdc0ea2..2625175b99 100644 --- a/accel/stubs/kvm-stub.c +++ b/accel/stubs/kvm-stub.c @@ -119,6 +119,11 @@ bool kvm_arm_supports_user_irq(void) return false; } +void kvm_update_hdbss_cap(bool enable, int hdbss_buffer_size) +{ + g_assert_not_reached(); +} + bool kvm_dirty_ring_enabled(void) { return false; diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index 16cccc881e..098257e72f 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -229,6 +229,14 @@ int kvm_has_gsi_routing(void); */ bool kvm_arm_supports_user_irq(void); +/* + * The default HDBSS size. The value ranges [0, 9]. + * Set to 0 to disable the HDBSS feature. + */ +#define DEFAULT_HDBSS_BUFFER_SIZE 0 +#define MAX_HDBSS_BUFFER_SIZE 9 + +void kvm_update_hdbss_cap(bool enable, int hdbss_buffer_size); int kvm_on_sigbus_vcpu(CPUState *cpu, int code, void *addr); int kvm_on_sigbus(int code, void *addr); diff --git a/migration/migration.h b/migration/migration.h index 4a95f00157..eeddb7c0bd 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -48,13 +48,6 @@ struct PostcopyBlocktimeContext; */ #define CLEAR_BITMAP_SHIFT_MAX 31 -/* - * The default HDBSS size. The value ranges [0, 9]. - * Set to 0 to disable the HDBSS feature. - */ -#define DEFAULT_HDBSS_BUFFER_SIZE 0 -#define MAX_HDBSS_BUFFER_SIZE 9 - /* This is an abstraction of a "temp huge page" for postcopy's purpose */ typedef struct { /* diff --git a/migration/ram.c b/migration/ram.c index ee57da62f6..91bec89a6e 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -2812,33 +2812,6 @@ static void xbzrle_cleanup(void) XBZRLE_cache_unlock(); } -#ifdef TARGET_AARCH64 -static void kvm_update_hdbss_cap(bool enable, int hdbss_buffer_size) -{ - KVMState *s = kvm_state; - int size, ret; - - if (s == NULL || !kvm_check_extension(s, KVM_CAP_ARM_HW_DIRTY_STATE_TRACK)) { - return; - } - - size = hdbss_buffer_size; - if (size < 0 || size > MAX_HDBSS_BUFFER_SIZE) { - fprintf(stderr, "Invalid hdbss buffer size: %d\n", size); - return; - } - - ret = kvm_vm_enable_cap(s, KVM_CAP_ARM_HW_DIRTY_STATE_TRACK, 0, - enable ? size : 0); - if (ret) { - fprintf(stderr, "Could not %s KVM_CAP_ARM_HW_DIRTY_STATE_TRACK: %d\n", - enable ? "enable" : "disable", ret); - } - - return; -} -#endif - static void ram_save_cleanup(void *opaque) { RAMState **rsp = opaque; @@ -2856,7 +2829,9 @@ static void ram_save_cleanup(void *opaque) * memory_global_dirty_log_start/stop used in pairs */ #ifdef TARGET_AARCH64 - kvm_update_hdbss_cap(false, 0); + if (kvm_enabled()) { + kvm_update_hdbss_cap(false, 0); + } #endif memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION); } @@ -3262,7 +3237,9 @@ static void ram_init_bitmaps(RAMState *rs) /* We don't use dirty log with background snapshots */ if (!migrate_background_snapshot()) { #ifdef TARGET_AARCH64 - kvm_update_hdbss_cap(true, migrate_hdbss_buffer_size()); + if (kvm_enabled()) { + kvm_update_hdbss_cap(true, migrate_hdbss_buffer_size()); + } #endif memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION); migration_bitmap_sync_precopy(rs, false); -- Gitee From 3f2e953c7faf3043396a649d4891d3d95441e70f Mon Sep 17 00:00:00 2001 From: Jason Zeng Date: Mon, 26 May 2025 17:06:57 +0800 Subject: [PATCH 916/939] arm: VirtCCA: fix arm-softmmu build on x86 platform Add stub function for kvm_load_user_data(). 
Fixes: 9eacd1a6df68 ("arm: VirtCCA: CVM support UEFI boot") Signed-off-by: Jason Zeng --- accel/stubs/kvm-stub.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/accel/stubs/kvm-stub.c b/accel/stubs/kvm-stub.c index 2625175b99..e68f3433ad 100644 --- a/accel/stubs/kvm-stub.c +++ b/accel/stubs/kvm-stub.c @@ -133,3 +133,9 @@ uint32_t kvm_dirty_ring_size(void) { return 0; } + +int kvm_load_user_data(hwaddr loader_start, hwaddr image_end, hwaddr initrd_start, hwaddr dtb_end, hwaddr ram_size, + struct kvm_numa_info *numa_info) +{ + return -ENOSYS; +} -- Gitee From e97171b8b362b0122754a936053c9793a6ad2f57 Mon Sep 17 00:00:00 2001 From: Jason Zeng Date: Mon, 26 May 2025 17:08:49 +0800 Subject: [PATCH 917/939] arm: cvm: fix arm-softmmu build on x86 platform Add stub function for tmm_set_sec_addr() and tmm_set_hpre_addr() Fixes: dffc0f55d93e ("cvm : Add support for TEE-based national encryption acceleration.") Signed-off-by: Jason Zeng --- target/arm/kvm_arm.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/target/arm/kvm_arm.h b/target/arm/kvm_arm.h index 62fbb713f4..76137289df 100644 --- a/target/arm/kvm_arm.h +++ b/target/arm/kvm_arm.h @@ -497,6 +497,16 @@ static inline void tmm_add_ram_region(hwaddr base1, hwaddr len1, hwaddr base2, { g_assert_not_reached(); } + +static inline void tmm_set_sec_addr(hwaddr base, int num) +{ + g_assert_not_reached(); +} + +static inline void tmm_set_hpre_addr(hwaddr base, int num) +{ + g_assert_not_reached(); +} #endif /** -- Gitee From 4db69439ab84a108795f7dc1ea218aa746f1d2be Mon Sep 17 00:00:00 2001 From: Jinqian Yang Date: Fri, 16 May 2025 18:20:17 +0800 Subject: [PATCH 918/939] target/arm: support the IPIV feature virt inclusion category: feature bugzilla: https://gitee.com/openeuler/qemu/issues/IC1EV7 ------------------------------------------------------------------------ QEMU uses ioctl to enable IPIV. Signed-off-by: Jinqian Yang --- linux-headers/linux/kvm.h | 1 + target/arm/kvm.c | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h index b711c04506..92fc1fbb85 100644 --- a/linux-headers/linux/kvm.h +++ b/linux-headers/linux/kvm.h @@ -1214,6 +1214,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_ARM_HW_DIRTY_STATE_TRACK 502 +#define KVM_CAP_ARM_HISI_IPIV 798 #define KVM_CAP_ARM_VIRT_MSI_BYPASS 799 #define KVM_EXIT_HYPERCALL_VALID_MASK (1 << KVM_HC_MAP_GPA_RANGE) diff --git a/target/arm/kvm.c b/target/arm/kvm.c index ee5ba68305..ab31515a2a 100644 --- a/target/arm/kvm.c +++ b/target/arm/kvm.c @@ -257,6 +257,22 @@ int kvm_arch_get_default_type(MachineState *ms) return fixed_ipa ? 0 : size; } +static void kvm_update_ipiv_cap(KVMState *s) +{ + int ret; + + if (!kvm_check_extension(s, KVM_CAP_ARM_HISI_IPIV)) { + return; + } + + ret = kvm_vm_enable_cap(s, KVM_CAP_ARM_HISI_IPIV, 0); + if (ret) { + fprintf(stderr, "Could not enable KVM_CAP_ARM_HISI_IPIV: %d\n", ret); + } + + return; +} + int kvm_arch_init(MachineState *ms, KVMState *s) { MachineClass *mc = MACHINE_GET_CLASS(ms); @@ -330,6 +346,7 @@ int kvm_arch_init(MachineState *ms, KVMState *s) } kvm_arm_init_debug(s); + kvm_update_ipiv_cap(s); return ret; } -- Gitee From 5b9ece5e96c40f56e7c84bf15d4a5a7d1205bc25 Mon Sep 17 00:00:00 2001 From: Xianglai Li Date: Mon, 26 May 2025 16:58:25 +0800 Subject: [PATCH 919/939] sync header file from upstream The local interrupt controller simulation header file is inconsistent with the upstream header file. 
To ensure uapi compatibility, the upstream interrupt controller simulation header file is now synchronized. Signed-off-by: Xianglai Li --- hw/intc/loongarch_extioi_kvm.c | 2 +- hw/intc/loongarch_ipi_kvm.c | 2 +- hw/intc/loongarch_pch_pic_kvm.c | 2 +- linux-headers/asm-loongarch/kvm.h | 15 ++++++--------- linux-headers/linux/kvm.h | 13 +++++++------ target/loongarch/kvm/kvm.c | 4 ---- 6 files changed, 16 insertions(+), 22 deletions(-) diff --git a/hw/intc/loongarch_extioi_kvm.c b/hw/intc/loongarch_extioi_kvm.c index 2e7c764b7c..94af4378e4 100644 --- a/hw/intc/loongarch_extioi_kvm.c +++ b/hw/intc/loongarch_extioi_kvm.c @@ -115,7 +115,7 @@ static void kvm_loongarch_extioi_realize(DeviceState *dev, Error **errp) } if (!extioi_class->is_created) { - cd.type = KVM_DEV_TYPE_LA_EXTIOI; + cd.type = KVM_DEV_TYPE_LOONGARCH_EIOINTC; ret = kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd); if (ret < 0) { error_setg_errno(errp, errno, diff --git a/hw/intc/loongarch_ipi_kvm.c b/hw/intc/loongarch_ipi_kvm.c index fd308eb0c0..57fc05db77 100644 --- a/hw/intc/loongarch_ipi_kvm.c +++ b/hw/intc/loongarch_ipi_kvm.c @@ -128,7 +128,7 @@ static void kvm_loongarch_ipi_realize(DeviceState *dev, Error **errp) } if (!ipi_class->is_created) { - cd.type = KVM_DEV_TYPE_LA_IPI; + cd.type = KVM_DEV_TYPE_LOONGARCH_IPI; ret = kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd); if (ret < 0) { error_setg_errno(errp, errno, "Creating the KVM device failed"); diff --git a/hw/intc/loongarch_pch_pic_kvm.c b/hw/intc/loongarch_pch_pic_kvm.c index 8f66d9a01f..e9cef02f9a 100644 --- a/hw/intc/loongarch_pch_pic_kvm.c +++ b/hw/intc/loongarch_pch_pic_kvm.c @@ -113,7 +113,7 @@ static void kvm_loongarch_pch_pic_realize(DeviceState *dev, Error **errp) } if (!pch_pic_class->is_created) { - cd.type = KVM_DEV_TYPE_LA_PCH_PIC; + cd.type = KVM_DEV_TYPE_LOONGARCH_PCHPIC; ret = kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd); if (ret < 0) { error_setg_errno(errp, errno, diff --git a/linux-headers/asm-loongarch/kvm.h b/linux-headers/asm-loongarch/kvm.h index 34abd65939..d22b19e134 100644 --- a/linux-headers/asm-loongarch/kvm.h +++ b/linux-headers/asm-loongarch/kvm.h @@ -135,25 +135,22 @@ struct kvm_iocsr_entry { #define KVM_IRQCHIP_NUM_PINS 64 #define KVM_MAX_CORES 256 -#define KVM_LOONGARCH_VM_HAVE_IRQCHIP 0x40000001 +#define KVM_DEV_LOONGARCH_IPI_GRP_REGS 0x40000001 -#define KVM_DEV_LOONGARCH_IPI_GRP_REGS 0x40000002 +#define KVM_DEV_LOONGARCH_EXTIOI_GRP_REGS 0x40000002 -#define KVM_DEV_LOONGARCH_EXTIOI_GRP_REGS 0x40000003 - -#define KVM_DEV_LOONGARCH_EXTIOI_GRP_SW_STATUS 0x40000006 +#define KVM_DEV_LOONGARCH_EXTIOI_GRP_SW_STATUS 0x40000003 #define KVM_DEV_LOONGARCH_EXTIOI_SW_STATUS_NUM_CPU 0x0 #define KVM_DEV_LOONGARCH_EXTIOI_SW_STATUS_FEATURE 0x1 #define KVM_DEV_LOONGARCH_EXTIOI_SW_STATUS_STATE 0x2 -#define KVM_DEV_LOONGARCH_EXTIOI_GRP_CTRL 0x40000007 +#define KVM_DEV_LOONGARCH_EXTIOI_GRP_CTRL 0x40000004 #define KVM_DEV_LOONGARCH_EXTIOI_CTRL_INIT_NUM_CPU 0x0 #define KVM_DEV_LOONGARCH_EXTIOI_CTRL_INIT_FEATURE 0x1 #define KVM_DEV_LOONGARCH_EXTIOI_CTRL_LOAD_FINISHED 0x3 -#define KVM_DEV_LOONGARCH_PCH_PIC_GRP_CTRL 0x40000004 -#define KVM_DEV_LOONGARCH_PCH_PIC_CTRL_INIT 0 - #define KVM_DEV_LOONGARCH_PCH_PIC_GRP_REGS 0x40000005 +#define KVM_DEV_LOONGARCH_PCH_PIC_GRP_CTRL 0x40000006 +#define KVM_DEV_LOONGARCH_PCH_PIC_CTRL_INIT 0 #endif /* __UAPI_ASM_LOONGARCH_KVM_H */ diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h index 0714651440..413663e332 100644 --- a/linux-headers/linux/kvm.h +++ b/linux-headers/linux/kvm.h @@ -1487,12 +1487,13 @@ 
enum kvm_device_type { #define KVM_DEV_TYPE_ARM_PV_TIME KVM_DEV_TYPE_ARM_PV_TIME KVM_DEV_TYPE_RISCV_AIA, #define KVM_DEV_TYPE_RISCV_AIA KVM_DEV_TYPE_RISCV_AIA - KVM_DEV_TYPE_LA_PCH_PIC = 0x100, -#define KVM_DEV_TYPE_LA_PCH_PIC KVM_DEV_TYPE_LA_PCH_PIC - KVM_DEV_TYPE_LA_IPI, -#define KVM_DEV_TYPE_LA_IPI KVM_DEV_TYPE_LA_IPI - KVM_DEV_TYPE_LA_EXTIOI, -#define KVM_DEV_TYPE_LA_EXTIOI KVM_DEV_TYPE_LA_EXTIOI + KVM_DEV_TYPE_LOONGARCH_IPI, +#define KVM_DEV_TYPE_LOONGARCH_IPI KVM_DEV_TYPE_LOONGARCH_IPI + KVM_DEV_TYPE_LOONGARCH_EIOINTC, +#define KVM_DEV_TYPE_LOONGARCH_EIOINTC KVM_DEV_TYPE_LOONGARCH_EIOINTC + KVM_DEV_TYPE_LOONGARCH_PCHPIC, +#define KVM_DEV_TYPE_LOONGARCH_PCHPIC KVM_DEV_TYPE_LOONGARCH_PCHPIC + KVM_DEV_TYPE_MAX, }; diff --git a/target/loongarch/kvm/kvm.c b/target/loongarch/kvm/kvm.c index f6e008a517..f724e77a1b 100644 --- a/target/loongarch/kvm/kvm.c +++ b/target/loongarch/kvm/kvm.c @@ -973,10 +973,6 @@ int kvm_arch_get_default_type(MachineState *ms) int kvm_arch_init(MachineState *ms, KVMState *s) { cap_has_mp_state = kvm_check_extension(s, KVM_CAP_MP_STATE); - if(!kvm_vm_check_attr(kvm_state, KVM_LOONGARCH_VM_HAVE_IRQCHIP, KVM_LOONGARCH_VM_HAVE_IRQCHIP)) { - s->kernel_irqchip_allowed = false; - } - return 0; } -- Gitee From 8bbff5547eb88886ee13fa8eb95658318c457298 Mon Sep 17 00:00:00 2001 From: yechao-w Date: Fri, 30 May 2025 09:07:29 +0800 Subject: [PATCH 920/939] smbios: add processor-family option commit b5831d79671cea3f7bd42cffab93fe6eab8c3db0 upstream For RISC-V the SMBIOS standard requires specific values of the processor family value depending on the bitness of the CPU. Add a processor-family option for SMBIOS table 4. The value of processor-family may exceed 255 and therefore must be provided in the Processor Family 2 field. Set the Processor Family field to 0xFE which signals that the Processor Family 2 is used. 
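For example (illustrative values only, not part of the change itself), a family value
above 255 can now be passed on the command line and is reported through the
Processor Family 2 field, while the Processor Family field itself reads 0xFE:

    -smbios type=4,processor-family=513,manufacturer=QEMU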
Signed-off-by: Heinrich Schuchardt Reviewed-by: Alistair Francis Reviewed-by: Andrew Jones Message-ID: <20240123184229.10415-2-heinrich.schuchardt@canonical.com> Signed-off-by: Alistair Francis --- hw/smbios/smbios.c | 13 +++++++++++-- qemu-options.hx | 4 ++-- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/hw/smbios/smbios.c b/hw/smbios/smbios.c index 2a90601ac5..647bc6d603 100644 --- a/hw/smbios/smbios.c +++ b/hw/smbios/smbios.c @@ -102,6 +102,7 @@ static struct { #define DEFAULT_CPU_SPEED 2000 static struct { + uint16_t processor_family; const char *sock_pfx, *manufacturer, *version, *serial, *asset, *part; uint64_t max_speed; uint64_t current_speed; @@ -110,6 +111,7 @@ static struct { .max_speed = DEFAULT_CPU_SPEED, .current_speed = DEFAULT_CPU_SPEED, .processor_id = 0, + .processor_family = 0x01, /* Other */ }; struct type8_instance { @@ -337,6 +339,10 @@ static const QemuOptDesc qemu_smbios_type4_opts[] = { .name = "part", .type = QEMU_OPT_STRING, .help = "part number", + }, { + .name = "processor-family", + .type = QEMU_OPT_NUMBER, + .help = "processor family", }, { .name = "processor-id", .type = QEMU_OPT_NUMBER, @@ -726,7 +732,7 @@ static void smbios_build_type_4_table(MachineState *ms, unsigned instance) snprintf(sock_str, sizeof(sock_str), "%s%2x", type4.sock_pfx, instance); SMBIOS_TABLE_SET_STR(4, socket_designation_str, sock_str); t->processor_type = 0x03; /* CPU */ - t->processor_family = 0x01; /* Other */ + t->processor_family = 0xfe; /* use Processor Family 2 field */ SMBIOS_TABLE_SET_STR(4, processor_manufacturer_str, type4.manufacturer); if (type4.processor_id == 0) { t->processor_id[0] = cpu_to_le32(smbios_cpuid_version); @@ -758,7 +764,7 @@ static void smbios_build_type_4_table(MachineState *ms, unsigned instance) t->thread_count = (threads_per_socket > 255) ? 
0xFF : threads_per_socket; t->processor_characteristics = cpu_to_le16(0x02); /* Unknown */ - t->processor_family2 = cpu_to_le16(0x01); /* Other */ + t->processor_family2 = cpu_to_le16(type4.processor_family); if (tbl_len == SMBIOS_TYPE_4_LEN_V30) { t->core_count2 = t->core_enabled2 = cpu_to_le16(cores_per_socket); @@ -1402,6 +1408,9 @@ void smbios_entry_add(QemuOpts *opts, Error **errp) return; } save_opt(&type4.sock_pfx, opts, "sock_pfx"); + type4.processor_family = qemu_opt_get_number(opts, + "processor-family", + 0x01 /* Other */); save_opt(&type4.manufacturer, opts, "manufacturer"); save_opt(&type4.version, opts, "version"); save_opt(&type4.serial, opts, "serial"); diff --git a/qemu-options.hx b/qemu-options.hx index 7fe76c4b1d..cbaa2e5367 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -2679,7 +2679,7 @@ DEF("smbios", HAS_ARG, QEMU_OPTION_smbios, " specify SMBIOS type 3 fields\n" "-smbios type=4[,sock_pfx=str][,manufacturer=str][,version=str][,serial=str]\n" " [,asset=str][,part=str][,max-speed=%d][,current-speed=%d]\n" - " [,processor-id=%d]\n" + " [,processor-family=%d,processor-id=%d]\n" " specify SMBIOS type 4 fields\n" "-smbios type=8[,external_reference=str][,internal_reference=str][,connector_type=%d][,port_type=%d]\n" " specify SMBIOS type 8 fields\n" @@ -2707,7 +2707,7 @@ SRST ``-smbios type=3[,manufacturer=str][,version=str][,serial=str][,asset=str][,sku=str]`` Specify SMBIOS type 3 fields -``-smbios type=4[,sock_pfx=str][,manufacturer=str][,version=str][,serial=str][,asset=str][,part=str][,processor-id=%d]`` +``-smbios type=4[,sock_pfx=str][,manufacturer=str][,version=str][,serial=str][,asset=str][,part=str][,processor-family=%d][,processor-id=%d]`` Specify SMBIOS type 4 fields ``-smbios type=11[,value=str][,path=filename]`` -- Gitee From 4e1255411ea509a014d860f3cab1b5425b6556c8 Mon Sep 17 00:00:00 2001 From: yechao-w Date: Fri, 30 May 2025 09:08:40 +0800 Subject: [PATCH 921/939] smbios: function to set default processor family commit 6f3b727bcc867688034ef1489a58e958142973b1 upstream Provide a function to set the default processor family. 
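As a usage sketch (taken from the RISC-V virt machine wired up in the following
patch), a board model calls the helper after smbios_set_defaults(); the default
only takes effect while the field still holds its initial "Other" value, i.e.
when the user has not set processor-family explicitly:

    smbios_set_defaults("QEMU", product, mc->name, false,
                        true, SMBIOS_ENTRY_POINT_TYPE_64);
    if (riscv_is_32bit(&s->soc[0])) {
        smbios_set_default_processor_family(0x200);   /* RISC-V RV32 */
    } else {
        smbios_set_default_processor_family(0x201);   /* RISC-V RV64 */
    }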
Signed-off-by: Heinrich Schuchardt Reviewed-by: Andrew Jones Message-ID: <20240123184229.10415-3-heinrich.schuchardt@canonical.com> Signed-off-by: Alistair Francis --- hw/smbios/smbios.c | 7 +++++++ include/hw/firmware/smbios.h | 1 + 2 files changed, 8 insertions(+) diff --git a/hw/smbios/smbios.c b/hw/smbios/smbios.c index 647bc6d603..c0c5a81e66 100644 --- a/hw/smbios/smbios.c +++ b/hw/smbios/smbios.c @@ -989,6 +989,13 @@ void smbios_set_cpuid(uint32_t version, uint32_t features) field = value; \ } +void smbios_set_default_processor_family(uint16_t processor_family) +{ + if (type4.processor_family <= 0x01) { + type4.processor_family = processor_family; + } +} + void smbios_set_defaults(const char *manufacturer, const char *product, const char *version, bool legacy_mode, bool uuid_encoded, SmbiosEntryPointType ep_type) diff --git a/include/hw/firmware/smbios.h b/include/hw/firmware/smbios.h index 7f3259a630..6e514982d4 100644 --- a/include/hw/firmware/smbios.h +++ b/include/hw/firmware/smbios.h @@ -295,6 +295,7 @@ void smbios_set_cpuid(uint32_t version, uint32_t features); void smbios_set_defaults(const char *manufacturer, const char *product, const char *version, bool legacy_mode, bool uuid_encoded, SmbiosEntryPointType ep_type); +void smbios_set_default_processor_family(uint16_t processor_family); uint8_t *smbios_get_table_legacy(MachineState *ms, size_t *length); void smbios_get_tables(MachineState *ms, const struct smbios_phys_mem_area *mem_array, -- Gitee From 5f27d198462966c13dc60e32be48978ecc987698 Mon Sep 17 00:00:00 2001 From: yechao-w Date: Fri, 30 May 2025 09:12:20 +0800 Subject: [PATCH 922/939] target/riscv: SMBIOS support for RISC-V virt machine commit ecf286478475d11ae4cdef7e52d9c8e1672f2868 upstream Generate SMBIOS tables for the RISC-V mach-virt. Add CONFIG_SMBIOS=y to the RISC-V default config. Set the default processor family in the type 4 table. The implementation is based on the corresponding ARM and Loongson code. 
With the patch the following firmware tables are provided: etc/smbios/smbios-anchor etc/smbios/smbios-tables Signed-off-by: Heinrich Schuchardt Reviewed-by: Andrew Jones Message-ID: <20240123184229.10415-4-heinrich.schuchardt@canonical.com> Signed-off-by: Alistair Francis --- hw/riscv/Kconfig | 1 + hw/riscv/virt.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/hw/riscv/Kconfig b/hw/riscv/Kconfig index b6a5eb4452..1e11ac9432 100644 --- a/hw/riscv/Kconfig +++ b/hw/riscv/Kconfig @@ -41,6 +41,7 @@ config RISCV_VIRT select RISCV_IMSIC select SIFIVE_PLIC select SIFIVE_TEST + select SMBIOS select VIRTIO_MMIO select FW_CFG_DMA select PLATFORM_BUS diff --git a/hw/riscv/virt.c b/hw/riscv/virt.c index d2eac24156..9b29ed1108 100644 --- a/hw/riscv/virt.c +++ b/hw/riscv/virt.c @@ -36,6 +36,7 @@ #include "hw/riscv/boot.h" #include "hw/riscv/numa.h" #include "kvm/kvm_riscv.h" +#include "hw/firmware/smbios.h" #include "hw/intc/riscv_aclint.h" #include "hw/intc/riscv_aplic.h" #include "hw/intc/riscv_imsic.h" @@ -1249,6 +1250,45 @@ static void create_platform_bus(RISCVVirtState *s, DeviceState *irqchip) sysbus_mmio_get_region(sysbus, 0)); } +static void virt_build_smbios(RISCVVirtState *s) +{ + MachineClass *mc = MACHINE_GET_CLASS(s); + MachineState *ms = MACHINE(s); + uint8_t *smbios_tables, *smbios_anchor; + size_t smbios_tables_len, smbios_anchor_len; + struct smbios_phys_mem_area mem_array; + const char *product = "QEMU Virtual Machine"; + + if (kvm_enabled()) { + product = "KVM Virtual Machine"; + } + + smbios_set_defaults("QEMU", product, mc->name, false, + true, SMBIOS_ENTRY_POINT_TYPE_64); + + if (riscv_is_32bit(&s->soc[0])) { + smbios_set_default_processor_family(0x200); + } else { + smbios_set_default_processor_family(0x201); + } + + /* build the array of physical mem area from base_memmap */ + mem_array.address = s->memmap[VIRT_DRAM].base; + mem_array.length = ms->ram_size; + + smbios_get_tables(ms, &mem_array, 1, + &smbios_tables, &smbios_tables_len, + &smbios_anchor, &smbios_anchor_len, + &error_fatal); + + if (smbios_anchor) { + fw_cfg_add_file(s->fw_cfg, "etc/smbios/smbios-tables", + smbios_tables, smbios_tables_len); + fw_cfg_add_file(s->fw_cfg, "etc/smbios/smbios-anchor", + smbios_anchor, smbios_anchor_len); + } +} + static void virt_machine_done(Notifier *notifier, void *data) { RISCVVirtState *s = container_of(notifier, RISCVVirtState, @@ -1337,6 +1377,8 @@ static void virt_machine_done(Notifier *notifier, void *data) riscv_setup_direct_kernel(kernel_entry, fdt_load_addr); } + virt_build_smbios(s); + if (virt_is_acpi_enabled(s)) { virt_acpi_setup(s); } -- Gitee From 987e286cc7614c5ff3cc9096798675d7da70a5ea Mon Sep 17 00:00:00 2001 From: yechao-w Date: Fri, 30 May 2025 09:13:08 +0800 Subject: [PATCH 923/939] qemu-options: enable -smbios option on RISC-V commit e2ff0dec156eff4e109c678654df1225d384fd14 upstream With SMBIOS support added for RISC-V we also should enable the command line option. 
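For example (an illustrative invocation, not part of the change itself), RISC-V
guests can now override SMBIOS strings from the command line:

    qemu-system-riscv64 -M virt -smbios type=1,manufacturer=QEMU,product=riscv-virt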
Signed-off-by: Heinrich Schuchardt Reviewed-by: Daniel Henrique Barboza Acked-by: Alistair Francis Reviewed-by: Andrew Jones Message-ID: <20240123184229.10415-5-heinrich.schuchardt@canonical.com> Signed-off-by: Alistair Francis --- qemu-options.hx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qemu-options.hx b/qemu-options.hx index cbaa2e5367..55765fb34c 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -2690,7 +2690,7 @@ DEF("smbios", HAS_ARG, QEMU_OPTION_smbios, " specify SMBIOS type 17 fields\n" "-smbios type=41[,designation=str][,kind=str][,instance=%d][,pcidev=str]\n" " specify SMBIOS type 41 fields\n", - QEMU_ARCH_I386 | QEMU_ARCH_ARM | QEMU_ARCH_LOONGARCH) + QEMU_ARCH_I386 | QEMU_ARCH_ARM | QEMU_ARCH_LOONGARCH | QEMU_ARCH_RISCV) SRST ``-smbios file=binary`` Load SMBIOS entry from binary file. -- Gitee From a06fe21504564a75d2cfdd3b133b67719edc78ec Mon Sep 17 00:00:00 2001 From: panhengchang Date: Thu, 5 Jun 2025 10:05:11 +0800 Subject: [PATCH 924/939] qapi/misc-target: Add Virtcca capability struct and query command. Introduce a new QAPI struct "VirtccaCapility" to represent the VIRTCCA feature capability with a boolean "enabled" filed. Add "query-virtcca-capabilties" command to retrieve this capability information, which targeting HISI AARCH64 platforms. Signed-off-by: panghengchang --- qapi/misc-target.json | 29 +++++++++++++++++++++++++++++ target/arm/kvm-tmm.c | 33 +++++++++++++++++++++++++++++++++ tests/qtest/qmp-cmd-test.c | 1 + 3 files changed, 63 insertions(+) diff --git a/qapi/misc-target.json b/qapi/misc-target.json index 88291453ba..76ed52b65b 100644 --- a/qapi/misc-target.json +++ b/qapi/misc-target.json @@ -487,3 +487,32 @@ { 'command': 'xen-event-inject', 'data': { 'port': 'uint32' }, 'if': 'TARGET_I386' } + +## +# @VirtccaCapability: +# +# The struct describes capability for VIRTCCA feature. +# +# Since: 8.2.0 +## +{ 'struct': 'VirtccaCapability', + 'data': { 'enabled': 'bool' }, + 'if': 'TARGET_AARCH64' } + +## +# @query-virtcca-capabilities: +# +# This command is used to get the VIRTCCA capabilities, and is supported +# on HISI AARCH64 platforms only. +# +# Returns: VirtccaCapability objects. 
+# +# Since: 8.2.0 +# +# Example: +# +# -> { "execute": "query-virtcca-capabilities" } +# <- { "return": { "enabled": true } } +## +{ 'command': 'query-virtcca-capabilities', 'returns': 'VirtccaCapability', + 'if': 'TARGET_AARCH64' } \ No newline at end of file diff --git a/target/arm/kvm-tmm.c b/target/arm/kvm-tmm.c index ea6bcc0f40..d18ac10896 100644 --- a/target/arm/kvm-tmm.c +++ b/target/arm/kvm-tmm.c @@ -15,11 +15,13 @@ #include "kvm_arm.h" #include "migration/blocker.h" #include "qapi/error.h" +#include "qapi/qapi-commands-misc-target.h" #include "qom/object_interfaces.h" #include "sysemu/kvm.h" #include "sysemu/runstate.h" #include "hw/loader.h" #include "linux-headers/asm-arm64/kvm.h" +#include #define TYPE_TMM_GUEST "tmm-guest" OBJECT_DECLARE_SIMPLE_TYPE(TmmGuest, TMM_GUEST) @@ -27,6 +29,7 @@ OBJECT_DECLARE_SIMPLE_TYPE(TmmGuest, TMM_GUEST) #define TMM_PAGE_SIZE qemu_real_host_page_size() #define TMM_MAX_PMU_CTRS 0x20 #define TMM_MAX_CFG 6 +#define TMM_MEMORY_INFO_SYSFS "/sys/kernel/tmm/memory_info" typedef struct { uint32_t kae_vf_num; @@ -406,3 +409,33 @@ static void tmm_register_types(void) type_register_static(&tmm_guest_info); } type_init(tmm_register_types); + +static VirtccaCapability *virtcca_get_capabilities(Error **errp) +{ + VirtccaCapability *cap = NULL; + uint64_t tmi_version = 0; + int rc = 0; + + if (kvm_ioctl(kvm_state, KVM_GET_TMI_VERSION, &tmi_version) < 0) { + error_setg(errp, "VIRTCCA is not enabled in KVM"); + return NULL; + } + + rc = access(TMM_MEMORY_INFO_SYSFS, R_OK); + if (rc < 0) { + error_setg_errno(errp, errno, "VIRTCCA: Failed to read %s", + TMM_MEMORY_INFO_SYSFS); + return NULL; + } + + cap = g_new0(VirtccaCapability, 1); + + cap->enabled = true; + + return cap; +} + +VirtccaCapability *qmp_query_virtcca_capabilities(Error **errp) +{ + return virtcca_get_capabilities(errp); +} \ No newline at end of file diff --git a/tests/qtest/qmp-cmd-test.c b/tests/qtest/qmp-cmd-test.c index 2c15f60958..df1f93ea6a 100644 --- a/tests/qtest/qmp-cmd-test.c +++ b/tests/qtest/qmp-cmd-test.c @@ -110,6 +110,7 @@ static bool query_is_ignored(const char *cmd) "query-sev-capabilities", "query-sgx", "query-sgx-capabilities", + "query-virtcca-capabilities", /* Success depends on enabling dirty page rate limit */ "query-vcpu-dirty-limit", NULL -- Gitee From f80776f3dfd1d05ef3328d5be9fe42df095f4bc1 Mon Sep 17 00:00:00 2001 From: yxk Date: Mon, 21 Apr 2025 04:00:46 -0400 Subject: [PATCH 925/939] Fix error in virtCCA CoDA scenario. Add 'iommu_type' VFIO_TYPE1v2_S_IOMMU in vfio_get_iommu_class to avoid error happens in virtCCA CoDA scenario. Signed-off-by: yxk --- hw/vfio/container.c | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 64eacfd912..539cf34b20 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -439,6 +439,7 @@ static const VFIOIOMMUClass *vfio_get_iommu_class(int iommu_type, Error **errp) switch (iommu_type) { case VFIO_TYPE1v2_IOMMU: case VFIO_TYPE1_IOMMU: + case VFIO_TYPE1v2_S_IOMMU: klass = object_class_by_name(TYPE_VFIO_IOMMU_LEGACY); break; case VFIO_SPAPR_TCE_v2_IOMMU: -- Gitee From 25c0fad8f9a2ac10f184d346f87da03506314ed6 Mon Sep 17 00:00:00 2001 From: Zhou Wang Date: Fri, 13 Jun 2025 11:26:54 +0800 Subject: [PATCH 926/939] Revert "backends/iommufd: Make iommufd_backend_*() return bool" Revert "backends/iommufd: Make iommufd_backend_*() return bool" and fix the way of vdpa codes use related iommufd APIs. 
Signed-off-by: Zhou Wang Signed-off-by: Jian Cai --- backends/iommufd.c | 29 ++++++++++++++++------------- backends/trace-events | 4 ++-- hw/virtio/vdpa-dev-iommufd.c | 6 +++--- include/sysemu/iommufd.h | 6 +++--- 4 files changed, 24 insertions(+), 21 deletions(-) diff --git a/backends/iommufd.c b/backends/iommufd.c index 62df6e41f0..4446efaa32 100644 --- a/backends/iommufd.c +++ b/backends/iommufd.c @@ -74,21 +74,23 @@ static void iommufd_backend_class_init(ObjectClass *oc, void *data) object_class_property_add_str(oc, "fd", NULL, iommufd_backend_set_fd); } -bool iommufd_backend_connect(IOMMUFDBackend *be, Error **errp) +int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp) { - int fd; + int fd, ret = 0; if (be->owned && !be->users) { fd = qemu_open("/dev/iommu", O_RDWR, errp); if (fd < 0) { - return false; + ret = fd; + goto out; } be->fd = fd; } be->users++; - - trace_iommufd_backend_connect(be->fd, be->owned, be->users); - return true; +out: + trace_iommufd_backend_connect(be->fd, be->owned, + be->users, ret); + return ret; } void iommufd_backend_disconnect(IOMMUFDBackend *be) @@ -105,24 +107,25 @@ out: trace_iommufd_backend_disconnect(be->fd, be->users); } -bool iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id, - Error **errp) +int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id, + Error **errp) { - int fd = be->fd; + int ret, fd = be->fd; struct iommu_ioas_alloc alloc_data = { .size = sizeof(alloc_data), .flags = 0, }; - if (ioctl(fd, IOMMU_IOAS_ALLOC, &alloc_data)) { + ret = ioctl(fd, IOMMU_IOAS_ALLOC, &alloc_data); + if (ret) { error_setg_errno(errp, errno, "Failed to allocate ioas"); - return false; + return ret; } *ioas_id = alloc_data.out_ioas_id; - trace_iommufd_backend_alloc_ioas(fd, *ioas_id); + trace_iommufd_backend_alloc_ioas(fd, *ioas_id, ret); - return true; + return ret; } void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id) diff --git a/backends/trace-events b/backends/trace-events index 8fe77149b2..f8592a2711 100644 --- a/backends/trace-events +++ b/backends/trace-events @@ -7,13 +7,13 @@ dbus_vmstate_loading(const char *id) "id: %s" dbus_vmstate_saving(const char *id) "id: %s" # iommufd.c -iommufd_backend_connect(int fd, bool owned, uint32_t users) "fd=%d owned=%d users=%d" +iommufd_backend_connect(int fd, bool owned, uint32_t users, int ret) "fd=%d owned=%d users=%d (%d)" iommufd_backend_disconnect(int fd, uint32_t users) "fd=%d users=%d" iommu_backend_set_fd(int fd) "pre-opened /dev/iommu fd=%d" iommufd_backend_map_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, void *vaddr, bool readonly, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" addr=%p readonly=%d (%d)" iommufd_backend_unmap_dma_non_exist(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " Unmap nonexistent mapping: iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)" iommufd_backend_unmap_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)" -iommufd_backend_alloc_ioas(int iommufd, uint32_t ioas) " iommufd=%d ioas=%d" +iommufd_backend_alloc_ioas(int iommufd, uint32_t ioas, int ret) " iommufd=%d ioas=%d (%d)" iommufd_backend_alloc_hwpt(int iommufd, uint32_t dev_id, uint32_t pt_id, uint32_t flags, uint32_t hwpt_type, uint32_t len, uint64_t data_ptr, uint32_t out_hwpt_id, int ret) " iommufd=%d dev_id=%u pt_id=%u flags=0x%x hwpt_type=%u len=%u data_ptr=0x%"PRIx64" out_hwpt=%u (%d)" iommufd_backend_free_id(int iommufd, uint32_t id, 
int ret) " iommufd=%d id=%d (%d)" iommufd_backend_set_dirty(int iommufd, uint32_t hwpt_id, bool start, int ret) " iommufd=%d hwpt=%u enable=%d (%d)" diff --git a/hw/virtio/vdpa-dev-iommufd.c b/hw/virtio/vdpa-dev-iommufd.c index 2b0498f9dc..f5718bae99 100644 --- a/hw/virtio/vdpa-dev-iommufd.c +++ b/hw/virtio/vdpa-dev-iommufd.c @@ -186,12 +186,12 @@ static int vhost_vdpa_container_connect_iommufd(VDPAIOMMUFDContainer *container) return -1; } - if (!iommufd_backend_connect(iommufd, &err)) { + if (iommufd_backend_connect(iommufd, &err)) { error_report_err(err); return -1; } - if (!iommufd_backend_alloc_ioas(iommufd, &ioas_id, &err)) { + if (iommufd_backend_alloc_ioas(iommufd, &ioas_id, &err)) { error_report_err(err); iommufd_backend_disconnect(iommufd); return -1; @@ -480,4 +480,4 @@ void vhost_vdpa_detach_container(VhostVdpaDevice *vdev) vhost_vdpa_container_disconnect_iommufd(container); vhost_vdpa_destroy_container(container); -} \ No newline at end of file +} diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h index 0531a4ad98..908c94d811 100644 --- a/include/sysemu/iommufd.h +++ b/include/sysemu/iommufd.h @@ -43,11 +43,11 @@ typedef struct IOMMUFDViommu { uint32_t viommu_id; } IOMMUFDViommu; -bool iommufd_backend_connect(IOMMUFDBackend *be, Error **errp); +int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp); void iommufd_backend_disconnect(IOMMUFDBackend *be); -bool iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id, - Error **errp); +int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id, + Error **errp); void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id); int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova, ram_addr_t size, void *vaddr, bool readonly); -- Gitee From ab6aa0196a12fa15db9c94212ccea39164417cc8 Mon Sep 17 00:00:00 2001 From: dinglimin Date: Sat, 14 Jun 2025 15:43:39 +0800 Subject: [PATCH 927/939] backends/tpm: Avoid using g_alloca() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cherry picked from commit 0ff9cd9a6af54ccaa293e252aa356fb150788099 tpm_emulator_ctrlcmd() is not in hot path. Use the heap instead of the stack, removing the g_alloca() call. 
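The pattern is small enough to show in isolation. The sketch below uses an illustrative helper name and layout, not the real tpm_emulator code; it only demonstrates the g_autofree-on-heap replacement for g_alloca() and builds against plain glib (pkg-config glib-2.0), with nothing QEMU-specific required:

    #include <glib.h>
    #include <string.h>

    /* Build a [4-byte command][payload] control buffer on the heap.
     * Illustrative only; the real code lives in tpm_emulator_ctrlcmd(). */
    static void build_ctrl_buffer(guint32 cmd_no_be, const void *msg, gsize msg_len)
    {
        gsize n = sizeof(cmd_no_be) + msg_len;
        /* g_autofree releases the buffer when it leaves scope, so the heap
         * allocation stays as convenient as g_alloca() without growing the
         * stack by a caller-controlled amount. */
        g_autofree guint8 *buf = g_malloc(n);

        memcpy(buf, &cmd_no_be, sizeof(cmd_no_be));
        memcpy(buf + sizeof(cmd_no_be), msg, msg_len);
        /* ... hand buf to the control channel here ... */
    }

    int main(void)
    {
        const char payload[] = "hello";
        build_ctrl_buffer(GUINT32_TO_BE(1), payload, sizeof(payload));
        return 0;
    }
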
Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Pierrick Bouvier Reviewed-by: Thomas Huth Reviewed-by: Stefan Berger Reviewed-by: Stefan Hajnoczi Message-Id: <20250605193540.59874-3-philmd@linaro.org> Signed-off-by: dinglimin --- backends/tpm/tpm_emulator.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backends/tpm/tpm_emulator.c b/backends/tpm/tpm_emulator.c index f7f1b4ad7a..0d07df216e 100644 --- a/backends/tpm/tpm_emulator.c +++ b/backends/tpm/tpm_emulator.c @@ -128,10 +128,10 @@ static int tpm_emulator_ctrlcmd(TPMEmulator *tpm, unsigned long cmd, void *msg, CharBackend *dev = &tpm->ctrl_chr; uint32_t cmd_no = cpu_to_be32(cmd); ssize_t n = sizeof(uint32_t) + msg_len_in; - uint8_t *buf = NULL; WITH_QEMU_LOCK_GUARD(&tpm->mutex) { - buf = g_alloca(n); + g_autofree uint8_t *buf = g_malloc(n); + memcpy(buf, &cmd_no, sizeof(cmd_no)); memcpy(buf + sizeof(cmd_no), msg, msg_len_in); -- Gitee From 693b6555bb16c82ec8fefa50263b0e8fcdc54cdc Mon Sep 17 00:00:00 2001 From: dinglimin Date: Sat, 14 Jun 2025 15:59:16 +0800 Subject: [PATCH 928/939] tests/unit/test-char: Avoid using g_alloca() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit tests/unit/test-char: Avoid using g_alloca() Do not use g_alloca(), simply allocate the CharBackend structure on the stack. Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Pierrick Bouvier Reviewed-by: Stefan Hajnoczi Signed-off-by: dinglimin --- tests/unit/test-char.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/unit/test-char.c b/tests/unit/test-char.c index 649fdf64e1..0cb2633190 100644 --- a/tests/unit/test-char.c +++ b/tests/unit/test-char.c @@ -574,7 +574,7 @@ static void char_udp_test_internal(Chardev *reuse_chr, int sock) struct sockaddr_in other; SocketIdleData d = { 0, }; Chardev *chr; - CharBackend *be; + CharBackend stack_be, *be = &stack_be; socklen_t alen = sizeof(other); int ret; char buf[10]; @@ -590,7 +590,6 @@ static void char_udp_test_internal(Chardev *reuse_chr, int sock) chr = qemu_chr_new("client", tmp, NULL); g_assert_nonnull(chr); - be = g_alloca(sizeof(CharBackend)); qemu_chr_fe_init(be, chr, &error_abort); } -- Gitee From 62cedbd18455e0b800c9ab0a47eef599c5309eaa Mon Sep 17 00:00:00 2001 From: dinglimin Date: Sat, 14 Jun 2025 16:40:39 +0800 Subject: [PATCH 929/939] virtio processes indirect descriptors even if the respected feature VIRTIO_RING_F_INDIRECT_DESC was not negotiated. If qemu is used with reduced set of features to emulate the hardware device that does not support indirect descriptors, the will probably trigger problematic flows on the hardware setup but do not reveal the mistake on qemu. Add LOG_GUEST_ERROR for such case. This will issue logs with '-d guest_errors' in the command line Signed-off-by: Yuri Benditovich Message-Id: <20250515063237.808293-1-yuri.benditovich@daynix.com> Signed-off-by: Yuri Benditovich Signed-off-by: dinglimin --- hw/virtio/virtio.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 4f5b241fd3..f57b6c955e 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -204,6 +204,15 @@ static const char *virtio_id_to_name(uint16_t device_id) return name; } +static void virtio_check_indirect_feature(VirtIODevice *vdev) +{ + if (!virtio_vdev_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC)) { + qemu_log_mask(LOG_GUEST_ERROR, + "Device %s: indirect_desc was not negotiated!\n", + vdev->name); + } +} + /* Called within call_rcu(). 
*/ static void virtio_free_region_cache(VRingMemoryRegionCaches *caches) { @@ -1614,6 +1623,8 @@ static void *virtqueue_split_pop(VirtQueue *vq, size_t sz) goto done; } + virtio_check_indirect_feature(vdev); + /* loop over the indirect descriptor table */ len = address_space_cache_init(&indirect_desc_cache, vdev->dma_as, desc.addr, desc.len, false); @@ -1744,6 +1755,8 @@ static void *virtqueue_packed_pop(VirtQueue *vq, size_t sz) goto done; } + virtio_check_indirect_feature(vdev); + /* loop over the indirect descriptor table */ len = address_space_cache_init(&indirect_desc_cache, vdev->dma_as, desc.addr, desc.len, false); -- Gitee From 3e4513fcbbb00aff1d8147cee3b93c2bbf3a68fb Mon Sep 17 00:00:00 2001 From: dinglimin Date: Sat, 14 Jun 2025 17:09:25 +0800 Subject: [PATCH 930/939] hw/audio/cs4231a: fix assertion error in isa_bus_get_irq This patch fixes an assertion error in isa_bus_get_irq() in /hw/isa/isa-bus.c by adding a constraint to the irq property. Patch v1 misused ISA_NUM_IRQS, pls ignore that. Signed-off-by: Zheng Huang Link: https://lore.kernel.org/r/6d228069-e38f-4c46-813f-edcccc5c47e4@gmail.com Signed-off-by: Paolo Bonzini Signed-off-by: dinglimin --- hw/audio/cs4231a.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/hw/audio/cs4231a.c b/hw/audio/cs4231a.c index 3aa105748d..88dfd0bb7f 100644 --- a/hw/audio/cs4231a.c +++ b/hw/audio/cs4231a.c @@ -682,6 +682,11 @@ static void cs4231a_realizefn (DeviceState *dev, Error **errp) return; } + if (s->irq >= ISA_NUM_IRQS) { + error_setg(errp, "Invalid IRQ %d (max %d)", s->irq, ISA_NUM_IRQS - 1); + return; + } + s->pic = isa_bus_get_irq(bus, s->irq); k = ISADMA_GET_CLASS(s->isa_dma); k->register_channel(s->isa_dma, s->dma, cs_dma_read, s); -- Gitee From e4c28afade86b8533b46bc87a56a8a0f32ab191a Mon Sep 17 00:00:00 2001 From: Jia Qingtong Date: Mon, 16 Jun 2025 17:24:13 +0800 Subject: [PATCH 931/939] hw/virtio/virtio-pci:Support shadow device for virtio-net/blk/scsi devices Currently we only support shadow device for "virtio-net", now let's extend this feature to support "virtio-blk" and "virtio-scsi" devices. 
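The check itself is a plain device-name whitelist, as the diff below shows. The stand-alone sketch here strips away the VirtIODevice type and the aarch64-only compilation so the gating can be tried in isolation; the device names are the same as in the patch, everything else is simplified:

    #include <stdio.h>
    #include <string.h>

    /* Illustrative stand-in for shadow_device_supported(); the real helper
     * takes a VirtIODevice and is only compiled on aarch64. */
    static int shadow_supported_name(const char *name)
    {
        return !strcmp(name, "virtio-net") ||
               !strcmp(name, "virtio-blk") ||
               !strcmp(name, "virtio-scsi");
    }

    int main(void)
    {
        const char *devs[] = { "virtio-net", "virtio-blk", "virtio-scsi", "virtio-gpu" };
        for (size_t i = 0; i < sizeof(devs) / sizeof(devs[0]); i++) {
            printf("%-12s shadow device: %s\n", devs[i],
                   shadow_supported_name(devs[i]) ? "yes" : "no");
        }
        return 0;
    }
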
Signed-off-by: Yanan Wang Signed-off-by: Jia Qingtong --- hw/virtio/virtio-pci.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index 558471307a..13220c258d 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -1020,6 +1020,15 @@ int __attribute__((weak)) kvm_delete_shadow_device(PCIDevice *dev) } #endif +#ifdef __aarch64__ +static bool shadow_device_supported(VirtIODevice *vdev) +{ + return !strcmp(vdev->name, "virtio-net") || + !strcmp(vdev->name, "virtio-blk") || + !strcmp(vdev->name, "virtio-scsi"); +} +#endif + static int kvm_virtio_pci_vector_vq_use(VirtIOPCIProxy *proxy, int nvqs) { int queue_no; @@ -1027,7 +1036,7 @@ static int kvm_virtio_pci_vector_vq_use(VirtIOPCIProxy *proxy, int nvqs) VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); #ifdef __aarch64__ - if (!strcmp(vdev->name, "virtio-net")) { + if (shadow_device_supported(vdev)) { kvm_create_shadow_device(&proxy->pci_dev); } #endif @@ -1044,7 +1053,7 @@ static int kvm_virtio_pci_vector_vq_use(VirtIOPCIProxy *proxy, int nvqs) virtio_pci_commit_route_changes(vdev); #ifdef __aarch64__ - if (!strcmp(vdev->name, "virtio-net") && ret != 0) { + if (shadow_device_supported(vdev) && ret != 0) { kvm_delete_shadow_device(&proxy->pci_dev); } #endif @@ -1093,7 +1102,7 @@ static void kvm_virtio_pci_vector_vq_release(VirtIOPCIProxy *proxy, int nvqs) } #ifdef __aarch64__ - if (!strcmp(vdev->name, "virtio-net")) { + if (shadow_device_supported(vdev)) { kvm_delete_shadow_device(&proxy->pci_dev); } #endif -- Gitee From 851559a1442a824559f273380c7ad1fa06f559e8 Mon Sep 17 00:00:00 2001 From: yechao-w Date: Thu, 19 Jun 2025 10:27:13 +0800 Subject: [PATCH 932/939] qemu-options.hx: correct formatting -smbios type=4 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 68baeaafa562e360188fb3be8a9451db1c5bd862 upstream processor-family and processor-id can be assigned independently. Add missing brackets. Fixes: b5831d79671c ("smbios: add processor-family option") Signed-off-by: Heinrich Schuchardt Reviewed-by: Thomas Huth Reviewed-by: Philippe Mathieu-Daudé Message-ID: <20240729204816.11905-1-heinrich.schuchardt@canonical.com> Signed-off-by: Philippe Mathieu-Daudé --- qemu-options.hx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qemu-options.hx b/qemu-options.hx index 55765fb34c..b09d692d5b 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -2679,7 +2679,7 @@ DEF("smbios", HAS_ARG, QEMU_OPTION_smbios, " specify SMBIOS type 3 fields\n" "-smbios type=4[,sock_pfx=str][,manufacturer=str][,version=str][,serial=str]\n" " [,asset=str][,part=str][,max-speed=%d][,current-speed=%d]\n" - " [,processor-family=%d,processor-id=%d]\n" + " [,processor-family=%d][,processor-id=%d]\n" " specify SMBIOS type 4 fields\n" "-smbios type=8[,external_reference=str][,internal_reference=str][,connector_type=%d][,port_type=%d]\n" " specify SMBIOS type 8 fields\n" -- Gitee From 555841f0b5d38681d5bec899cba9fc67d92d2a3a Mon Sep 17 00:00:00 2001 From: panhengchang Date: Mon, 23 Jun 2025 18:48:09 +0800 Subject: [PATCH 933/939] qapi/misc-target: Add KVM option to isolate virtcca detection interface. Add 'CONFIG_KVM' to isolate "VirtccaCapability" and "query-virtcca-capabilities". 
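Both this patch and the following one use the usual QEMU conditional-build pattern: the QAPI entities are only generated when TARGET_AARCH64 and CONFIG_KVM are both set, and C helpers that exist only in KVM builds get !CONFIG_KVM stubs that must never be reached. A minimal sketch of that stub pattern follows; the CONFIG_KVM body is a placeholder rather than QEMU's implementation, and the stub mirrors the one added for tmm_get_kae_num() in the following patch:

    #include <glib.h>

    #ifdef CONFIG_KVM
    static inline int tmm_get_kae_num(void) { return 0; /* placeholder */ }
    #else
    /* Callers are themselves KVM-only, so reaching the stub is a bug. */
    static inline int tmm_get_kae_num(void) { g_assert_not_reached(); }
    #endif

    int main(void)
    {
    #ifdef CONFIG_KVM
        return tmm_get_kae_num();
    #else
        return 0; /* the stub only exists to keep non-KVM builds compiling */
    #endif
    }
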
Signed-off-by: panghengchang --- qapi/misc-target.json | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/qapi/misc-target.json b/qapi/misc-target.json index 76ed52b65b..3df0062f3d 100644 --- a/qapi/misc-target.json +++ b/qapi/misc-target.json @@ -497,7 +497,8 @@ ## { 'struct': 'VirtccaCapability', 'data': { 'enabled': 'bool' }, - 'if': 'TARGET_AARCH64' } + 'if': { 'all': ['TARGET_AARCH64' , 'CONFIG_KVM'] } +} ## # @query-virtcca-capabilities: @@ -515,4 +516,5 @@ # <- { "return": { "enabled": true } } ## { 'command': 'query-virtcca-capabilities', 'returns': 'VirtccaCapability', - 'if': 'TARGET_AARCH64' } \ No newline at end of file + 'if': { 'all': ['TARGET_AARCH64' , 'CONFIG_KVM'] } +} \ No newline at end of file -- Gitee From 69f44f27b30970cf19c0d5507a11852facace775 Mon Sep 17 00:00:00 2001 From: panhengchang Date: Tue, 24 Jun 2025 09:38:35 +0800 Subject: [PATCH 934/939] Add stub function for 'tmm_get_kae_num' if 'CONFIG_KVM' is not set. Signed-off-by: panghengchang --- target/arm/kvm_arm.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/target/arm/kvm_arm.h b/target/arm/kvm_arm.h index 76137289df..a29d4548f4 100644 --- a/target/arm/kvm_arm.h +++ b/target/arm/kvm_arm.h @@ -507,6 +507,11 @@ static inline void tmm_set_hpre_addr(hwaddr base, int num) { g_assert_not_reached(); } + +static inline int tmm_get_kae_num(void) +{ + g_assert_not_reached(); +} #endif /** -- Gitee From ede25e9b7c5cc8ce1c668f306bfbe5c90564570b Mon Sep 17 00:00:00 2001 From: gubin Date: Wed, 25 Jun 2025 17:13:10 +0800 Subject: [PATCH 935/939] block/blkio: Make s->mem_region_alignment be 64 bits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cherry-pick from 615eaeab3d318ba239d54141a4251746782f65c1 With GCC 14 the code failed to compile on i686 (and was wrong for any version of GCC): ../block/blkio.c: In function ‘blkio_file_open’: ../block/blkio.c:857:28: error: passing argument 3 of ‘blkio_get_uint64’ from incompatible pointer type [-Wincompatible-pointer-types] 857 | &s->mem_region_alignment); | ^~~~~~~~~~~~~~~~~~~~~~~~ | | | size_t * {aka unsigned int *} In file included from ../block/blkio.c:12: /usr/include/blkio.h:49:67: note: expected ‘uint64_t *’ {aka ‘long long unsigned int *’} but argument is of type ‘size_t *’ {aka ‘unsigned int *’} 49 | int blkio_get_uint64(struct blkio *b, const char *name, uint64_t *value); | ~~~~~~~~~~^~~~~ Signed-off-by: Richard W.M. Jones Message-id: 20240130122006.2977938-1-rjones@redhat.com Signed-off-by: Stefan Hajnoczi Signed-off-by: gubin --- block/blkio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blkio.c b/block/blkio.c index 027c16ceb6..52ac94527f 100644 --- a/block/blkio.c +++ b/block/blkio.c @@ -68,7 +68,7 @@ typedef struct { CoQueue bounce_available; /* The value of the "mem-region-alignment" property */ - size_t mem_region_alignment; + uint64_t mem_region_alignment; /* Can we skip adding/deleting blkio_mem_regions? */ bool needs_mem_regions; -- Gitee From a01e9f722d8e187493cda6acf645012793bc95fe Mon Sep 17 00:00:00 2001 From: gubin Date: Wed, 25 Jun 2025 17:18:04 +0800 Subject: [PATCH 936/939] block/io: accept NULL qiov in bdrv_pad_request cherry-pick from 3f934817c82c2f1bf1c238f8d1065a3be10a3c9e Some operations, e.g. block-stream, perform reads while discarding the results (only copy-on-read matters). In this case, they will pass NULL as the target QEMUIOVector, which will however trip bdrv_pad_request, since it wants to extend its passed vector. 
In particular, this is the case for the blk_co_preadv() call in stream_populate(). If there is no qiov, no operation can be done with it, but the bytes and offset still need to be updated, so the subsequent aligned read will actually be aligned and not run into an assertion failure. Originally-by: Stefan Reiter Signed-off-by: Thomas Lamprecht Signed-off-by: Fiona Ebner Message-ID: <20240322095009.346989-2-f.ebner@proxmox.com> Reviewed-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf Signed-off-by: gubin Signed-off-by: gubin --- block/io.c | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/block/io.c b/block/io.c index a280a5a4c9..27d6a1a04b 100644 --- a/block/io.c +++ b/block/io.c @@ -1756,22 +1756,29 @@ static int bdrv_pad_request(BlockDriverState *bs, return 0; } - sliced_iov = qemu_iovec_slice(*qiov, *qiov_offset, *bytes, - &sliced_head, &sliced_tail, - &sliced_niov); - - /* Guaranteed by bdrv_check_request32() */ - assert(*bytes <= SIZE_MAX); - ret = bdrv_create_padded_qiov(bs, pad, sliced_iov, sliced_niov, - sliced_head, *bytes); - if (ret < 0) { - bdrv_padding_finalize(pad); - return ret; + /* + * For prefetching in stream_populate(), no qiov is passed along, because + * only copy-on-read matters. + */ + if (qiov && *qiov) { + sliced_iov = qemu_iovec_slice(*qiov, *qiov_offset, *bytes, + &sliced_head, &sliced_tail, + &sliced_niov); + + /* Guaranteed by bdrv_check_request32() */ + assert(*bytes <= SIZE_MAX); + ret = bdrv_create_padded_qiov(bs, pad, sliced_iov, sliced_niov, + sliced_head, *bytes); + if (ret < 0) { + bdrv_padding_finalize(pad); + return ret; + } + *qiov = &pad->local_qiov; + *qiov_offset = 0; } + *bytes += pad->head + pad->tail; *offset -= pad->head; - *qiov = &pad->local_qiov; - *qiov_offset = 0; if (padded) { *padded = true; } -- Gitee From 19ef3764888b212a63603ac46e88b4cfd99dd7b2 Mon Sep 17 00:00:00 2001 From: gubin Date: Wed, 25 Jun 2025 17:24:49 +0800 Subject: [PATCH 937/939] target/arm: Adjust and validate mtedesc sizem1 cherry-pick from b12a7671b6099a26ce5d5ab09701f151e21c112c When we added SVE_MTEDESC_SHIFT, we effectively limited the maximum size of MTEDESC. Adjust SIZEM1 to consume the remaining bits (32 - 10 - 5 - 12 == 5). Assert that the data to be stored fits within the field (expecting 8 * 4 - 1 == 31, exact fit). 
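The arithmetic is worth spelling out once. The constants below are taken from this commit message rather than re-derived from the QEMU headers, and the asserts restate its two claims: 5 bits remain for SIZEM1, and the largest stored size is an exact fit.

    #include <assert.h>
    #include <stdio.h>

    int main(void)
    {
        /* 32-bit descriptor, minus the 10 + 5 bits consumed by the
         * simd/SVE packing, minus the SIZEM1 field offset of 12. */
        int sizem1_bits = 32 - 10 - 5 - 12;

        /* Largest stored value: up to 4 elements of up to 8 bytes each,
         * recorded as size - 1. */
        int max_sizem1 = (4 << 3) - 1;                /* 8 * 4 - 1 == 31 */

        assert(sizem1_bits == 5);
        assert(max_sizem1 == (1 << sizem1_bits) - 1); /* exact fit */
        printf("SIZEM1: %d bits wide, max value %d\n", sizem1_bits, max_sizem1);
        return 0;
    }
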
Cc: qemu-stable@nongnu.org Reviewed-by: Peter Maydell Signed-off-by: Richard Henderson Tested-by: Gustavo Romero Message-id: 20240207025210.8837-4-richard.henderson@linaro.org Signed-off-by: Peter Maydell Signed-off-by: gubin --- target/arm/internals.h | 2 +- target/arm/tcg/translate-sve.c | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/target/arm/internals.h b/target/arm/internals.h index 20b9c1da38..ed9bfb29c8 100644 --- a/target/arm/internals.h +++ b/target/arm/internals.h @@ -1265,7 +1265,7 @@ FIELD(MTEDESC, TBI, 4, 2) FIELD(MTEDESC, TCMA, 6, 2) FIELD(MTEDESC, WRITE, 8, 1) FIELD(MTEDESC, ALIGN, 9, 3) -FIELD(MTEDESC, SIZEM1, 12, SIMD_DATA_BITS - 12) /* size - 1 */ +FIELD(MTEDESC, SIZEM1, 12, SIMD_DATA_BITS - SVE_MTEDESC_SHIFT - 12) /* size - 1 */ bool mte_probe(CPUARMState *env, uint32_t desc, uint64_t ptr); uint64_t mte_check(CPUARMState *env, uint32_t desc, uint64_t ptr, uintptr_t ra); diff --git a/target/arm/tcg/translate-sve.c b/target/arm/tcg/translate-sve.c index 1d8e0d29bf..1b722ae75d 100644 --- a/target/arm/tcg/translate-sve.c +++ b/target/arm/tcg/translate-sve.c @@ -4457,17 +4457,18 @@ static void do_mem_zpa(DisasContext *s, int zt, int pg, TCGv_i64 addr, { unsigned vsz = vec_full_reg_size(s); TCGv_ptr t_pg; + uint32_t sizem1; int desc = 0; assert(mte_n >= 1 && mte_n <= 4); + sizem1 = (mte_n << dtype_msz(dtype)) - 1; + assert(sizem1 <= R_MTEDESC_SIZEM1_MASK >> R_MTEDESC_SIZEM1_SHIFT); if (s->mte_active[0]) { - int msz = dtype_msz(dtype); - desc = FIELD_DP32(desc, MTEDESC, MIDX, get_mem_index(s)); desc = FIELD_DP32(desc, MTEDESC, TBI, s->tbid); desc = FIELD_DP32(desc, MTEDESC, TCMA, s->tcma); desc = FIELD_DP32(desc, MTEDESC, WRITE, is_write); - desc = FIELD_DP32(desc, MTEDESC, SIZEM1, (mte_n << msz) - 1); + desc = FIELD_DP32(desc, MTEDESC, SIZEM1, sizem1); desc <<= SVE_MTEDESC_SHIFT; } else { addr = clean_data_tbi(s, addr); -- Gitee From 512f4f585ee6d51c2ee909ec9451b3137d5941df Mon Sep 17 00:00:00 2001 From: lijunwei Date: Tue, 1 Jul 2025 17:17:54 +0800 Subject: [PATCH 938/939] add support for Phytium 2000+ and Phytium S2500 --- hw/arm/virt.c | 2 ++ target/arm/cpu64.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/hw/arm/virt.c b/hw/arm/virt.c index a43f18020c..2957b844d2 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -238,6 +238,8 @@ static const char *valid_cpus[] = { ARM_CPU_TYPE_NAME("cortex-a53"), ARM_CPU_TYPE_NAME("cortex-a57"), ARM_CPU_TYPE_NAME("Kunpeng-920"), + ARM_CPU_TYPE_NAME("FT-2000+"), + ARM_CPU_TYPE_NAME("Tengyun-S2500"), ARM_CPU_TYPE_NAME("host"), ARM_CPU_TYPE_NAME("max"), }; diff --git a/target/arm/cpu64.c b/target/arm/cpu64.c index 6eca55ac29..4a1d2daeb6 100644 --- a/target/arm/cpu64.c +++ b/target/arm/cpu64.c @@ -799,6 +799,34 @@ static void aarch64_host_initfn(Object *obj) #endif } +static void aarch64_max_ft2000plus_initfn(Object *obj) +{ + ARMCPU *cpu = ARM_CPU(obj); + + if (kvm_enabled()) { + kvm_arm_set_cpu_features_from_host(cpu); + } else { + aarch64_a72_initfn(obj); + cpu->midr = 0x70186622; + } + aarch64_add_sve_properties(obj); + aarch64_add_pauth_properties(obj); +} + +static void aarch64_max_tengyun_s2500_initfn(Object *obj) +{ + ARMCPU *cpu = ARM_CPU(obj); + + if (kvm_enabled()) { + kvm_arm_set_cpu_features_from_host(cpu); + } else { + aarch64_a72_initfn(obj); + cpu->midr = 0x70186632; + } + aarch64_add_sve_properties(obj); + aarch64_add_pauth_properties(obj); +} + static void aarch64_max_initfn(Object *obj) { if (kvm_enabled() || hvf_enabled()) { @@ -821,6 +849,8 @@ static const 
ARMCPUInfo aarch64_cpus[] = { { .name = "cortex-a57", .initfn = aarch64_a57_initfn }, { .name = "cortex-a53", .initfn = aarch64_a53_initfn }, { .name = "Kunpeng-920", .initfn = aarch64_kunpeng_920_initfn}, + { .name = "FT-2000+", .initfn = aarch64_max_ft2000plus_initfn }, + { .name = "Tengyun-S2500", .initfn = aarch64_max_tengyun_s2500_initfn }, { .name = "max", .initfn = aarch64_max_initfn }, #if defined(CONFIG_KVM) || defined(CONFIG_HVF) { .name = "host", .initfn = aarch64_host_initfn }, -- Gitee From 4bfb3158b99bbcdba9466f628cb0989a3d36fd2b Mon Sep 17 00:00:00 2001 From: lijunwei Date: Tue, 1 Jul 2025 17:50:10 +0800 Subject: [PATCH 939/939] virtio-net: Fix num_buffers for version 1 --- hw/net/virtio-net.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index 7184c9c526..25044385dc 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -1996,7 +1996,9 @@ static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf, sg, elem->in_num, offsetof(typeof(mhdr), num_buffers), sizeof(mhdr.num_buffers)); - } + }else { + mhdr.num_buffers = cpu_to_le16(1); + } receive_header(n, sg, elem->in_num, buf, size); if (n->rss_data.populate_hash) { -- Gitee
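Patch 939 has no commit body, so a short note on what it encodes: with a modern (VERSION_1) virtio-net header the num_buffers field is always present, and when VIRTIO_NET_F_MRG_RXBUF is not negotiated the device is still expected to write 1 there rather than leave the field unset, which is what the added else branch does. A toy model of that decision follows; the struct is a simplified stand-in, not QEMU's virtio_net_hdr_mrg_rxbuf, and endianness handling is omitted:

    #include <stdint.h>
    #include <stdio.h>

    struct toy_virtio_net_hdr_v1 {
        uint8_t  flags, gso_type;
        uint16_t hdr_len, gso_size, csum_start, csum_offset;
        uint16_t num_buffers;   /* always present in the v1 header */
    };

    static void fill_num_buffers(struct toy_virtio_net_hdr_v1 *h,
                                 int mrg_rxbuf, uint16_t buffers_used)
    {
        /* With mergeable RX buffers the real count is reported; without
         * them the field still exists and must simply be 1. */
        h->num_buffers = mrg_rxbuf ? buffers_used : 1;
    }

    int main(void)
    {
        struct toy_virtio_net_hdr_v1 h = {0};
        fill_num_buffers(&h, 0, 3);
        printf("num_buffers without MRG_RXBUF: %u\n", (unsigned)h.num_buffers);
        return 0;
    }
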