diff --git a/.gitlab-ci.d/buildtest.yml b/.gitlab-ci.d/buildtest.yml index 91663946de460c40f126184b34a3d5cea2a57daa..983c3c132e80561104ee56eb1695e9c15c893306 100644 --- a/.gitlab-ci.d/buildtest.yml +++ b/.gitlab-ci.d/buildtest.yml @@ -579,6 +579,9 @@ build-tci: - make check-tcg # Check our reduced build configurations +# requires libfdt: aarch64, arm, i386, loongarch64, microblaze, microblazeel, +# mips64el, or1k, ppc, ppc64, riscv32, riscv64, rx, x86_64 +# does not build without boards: i386, s390x, sh4, sh4eb, x86_64 build-without-defaults: extends: .native_build_job_template needs: diff --git a/.travis.yml b/.travis.yml index 76859d48da5a1bff564111b44d73cd1d9f8695f9..597d151b802d7b8e31998e986fab8aef00baef3c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -35,7 +35,7 @@ env: - TEST_BUILD_CMD="" - TEST_CMD="make check V=1" # This is broadly a list of "mainline" system targets which have support across the major distros - - MAIN_SOFTMMU_TARGETS="aarch64-softmmu,mips64-softmmu,ppc64-softmmu,riscv64-softmmu,s390x-softmmu,x86_64-softmmu" + - MAIN_SYSTEM_TARGETS="aarch64-softmmu,mips64-softmmu,ppc64-softmmu,riscv64-softmmu,s390x-softmmu,x86_64-softmmu" - CCACHE_SLOPPINESS="include_file_ctime,include_file_mtime" - CCACHE_MAXSIZE=1G - G_MESSAGES_DEBUG=error @@ -114,7 +114,7 @@ jobs: env: - TEST_CMD="make check check-tcg V=1" - CONFIG="--disable-containers --enable-fdt=system - --target-list=${MAIN_SOFTMMU_TARGETS} --cxx=/bin/false" + --target-list=${MAIN_SYSTEM_TARGETS} --cxx=/bin/false" - UNRELIABLE=true - name: "[ppc64] GCC check-tcg" @@ -185,7 +185,7 @@ jobs: env: - TEST_CMD="make check check-tcg V=1" - CONFIG="--disable-containers --enable-fdt=system - --target-list=${MAIN_SOFTMMU_TARGETS},s390x-linux-user" + --target-list=${MAIN_SYSTEM_TARGETS},s390x-linux-user" - UNRELIABLE=true script: - BUILD_RC=0 && make -j${JOBS} || BUILD_RC=$? @@ -226,7 +226,7 @@ jobs: - genisoimage env: - CONFIG="--disable-containers --enable-fdt=system --audio-drv-list=sdl - --disable-user --target-list-exclude=${MAIN_SOFTMMU_TARGETS}" + --disable-user --target-list-exclude=${MAIN_SYSTEM_TARGETS}" - name: "[s390x] GCC (user)" arch: s390x diff --git a/Kconfig.host b/Kconfig.host index f496475f8e2ef89533c1e5f97e42fc8f3e74ac5e..faf58d9af513fa8f18882bf8ab0b04bd6fc67b57 100644 --- a/Kconfig.host +++ b/Kconfig.host @@ -28,6 +28,7 @@ config VHOST_USER config VHOST_VDPA bool + select IOMMUFD config VHOST_KERNEL bool diff --git a/MAINTAINERS b/MAINTAINERS index 695e0bd34fbba253d77570e5b3ef8dabe7a174b3..ada87bfa9e063798cf366992aacc09bfdcfd0230 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2167,6 +2167,19 @@ F: hw/vfio/ap.c F: docs/system/s390x/vfio-ap.rst L: qemu-s390x@nongnu.org +iommufd +M: Yi Liu +M: Eric Auger +M: Zhenzhong Duan +S: Supported +F: backends/iommufd.c +F: include/sysemu/iommufd.h +F: backends/host_iommu_device.c +F: include/sysemu/host_iommu_device.h +F: include/qemu/chardev_open.h +F: util/chardev_open.c +F: docs/devel/vfio-iommufd.rst + vhost M: Michael S. 
Tsirkin S: Supported diff --git a/accel/kvm/kvm-accel-ops.c b/accel/kvm/kvm-accel-ops.c index 6195150a0b4d2f310614d4efa18d409dd0c28044..54f19028b828d8e329160ec26359c618f1f0b317 100644 --- a/accel/kvm/kvm-accel-ops.c +++ b/accel/kvm/kvm-accel-ops.c @@ -112,6 +112,9 @@ static void kvm_accel_ops_class_init(ObjectClass *oc, void *data) ops->remove_breakpoint = kvm_remove_breakpoint; ops->remove_all_breakpoints = kvm_remove_all_breakpoints; #endif + + ops->control_pre_system_reset = kvm_cpus_control_pre_system_reset; + ops->control_post_system_reset = kvm_cpus_control_post_system_reset; } static const TypeInfo kvm_accel_ops_type = { diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index e39a810a4e92333a251711a223eb57b05dc2033d..7d175d3262fe0b78097b526d11660053013af932 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -52,6 +52,8 @@ #include "hw/boards.h" #include "sysemu/stats.h" +#include "sysemu/kvm.h" + /* This check must be after config-host.h is included */ #ifdef CONFIG_EVENTFD #include @@ -86,6 +88,9 @@ struct KVMParkedVcpu { }; KVMState *kvm_state; + +bool virtcca_cvm_allowed = false; + bool kvm_kernel_irqchip; bool kvm_split_irqchip; bool kvm_async_interrupts_allowed; @@ -98,6 +103,7 @@ bool kvm_allowed; bool kvm_readonly_mem_allowed; bool kvm_vm_attributes_allowed; bool kvm_msi_use_devid; +bool kvm_csv3_allowed; bool kvm_has_guest_debug; static int kvm_sstep_flags; static bool kvm_immediate_exit; @@ -324,14 +330,72 @@ err: return ret; } +void kvm_park_vcpu(CPUState *cpu) +{ + struct KVMParkedVcpu *vcpu; + + trace_kvm_park_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu)); + + vcpu = g_malloc0(sizeof(*vcpu)); + vcpu->vcpu_id = kvm_arch_vcpu_id(cpu); + vcpu->kvm_fd = cpu->kvm_fd; + QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node); +} + +int kvm_unpark_vcpu(KVMState *s, unsigned long vcpu_id) +{ + struct KVMParkedVcpu *cpu; + int kvm_fd = -ENOENT; + + QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) { + if (cpu->vcpu_id == vcpu_id) { + QLIST_REMOVE(cpu, node); + kvm_fd = cpu->kvm_fd; + g_free(cpu); + break; + } + } + + trace_kvm_unpark_vcpu(vcpu_id, kvm_fd > 0 ? 
"unparked" : "!found parked"); + + return kvm_fd; +} + +int kvm_create_vcpu(CPUState *cpu) +{ + unsigned long vcpu_id = kvm_arch_vcpu_id(cpu); + KVMState *s = kvm_state; + int kvm_fd; + + /* check if the KVM vCPU already exist but is parked */ + kvm_fd = kvm_unpark_vcpu(s, vcpu_id); + if (kvm_fd < 0) { + /* vCPU not parked: create a new KVM vCPU */ + kvm_fd = kvm_vm_ioctl(s, KVM_CREATE_VCPU, vcpu_id); + if (kvm_fd < 0) { + error_report("KVM_CREATE_VCPU IOCTL failed for vCPU %lu", vcpu_id); + return kvm_fd; + } + } + + cpu->kvm_fd = kvm_fd; + cpu->kvm_state = s; + cpu->vcpu_dirty = true; + cpu->dirty_pages = 0; + cpu->throttle_us_per_full = 0; + + trace_kvm_create_vcpu(cpu->cpu_index, vcpu_id, kvm_fd); + + return 0; +} + static int do_kvm_destroy_vcpu(CPUState *cpu) { KVMState *s = kvm_state; long mmap_size; - struct KVMParkedVcpu *vcpu = NULL; int ret = 0; - DPRINTF("kvm_destroy_vcpu\n"); + trace_kvm_destroy_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu)); ret = kvm_arch_destroy_vcpu(cpu); if (ret < 0) { @@ -357,10 +421,7 @@ static int do_kvm_destroy_vcpu(CPUState *cpu) } } - vcpu = g_malloc0(sizeof(*vcpu)); - vcpu->vcpu_id = kvm_arch_vcpu_id(cpu); - vcpu->kvm_fd = cpu->kvm_fd; - QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node); + kvm_park_vcpu(cpu); err: return ret; } @@ -373,24 +434,6 @@ void kvm_destroy_vcpu(CPUState *cpu) } } -static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id) -{ - struct KVMParkedVcpu *cpu; - - QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) { - if (cpu->vcpu_id == vcpu_id) { - int kvm_fd; - - QLIST_REMOVE(cpu, node); - kvm_fd = cpu->kvm_fd; - g_free(cpu); - return kvm_fd; - } - } - - return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id); -} - int kvm_init_vcpu(CPUState *cpu, Error **errp) { KVMState *s = kvm_state; @@ -399,19 +442,14 @@ int kvm_init_vcpu(CPUState *cpu, Error **errp) trace_kvm_init_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu)); - ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu)); + ret = kvm_create_vcpu(cpu); if (ret < 0) { - error_setg_errno(errp, -ret, "kvm_init_vcpu: kvm_get_vcpu failed (%lu)", + error_setg_errno(errp, -ret, + "kvm_init_vcpu: kvm_create_vcpu failed (%lu)", kvm_arch_vcpu_id(cpu)); goto err; } - cpu->kvm_fd = ret; - cpu->kvm_state = s; - cpu->vcpu_dirty = true; - cpu->dirty_pages = 0; - cpu->throttle_us_per_full = 0; - mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0); if (mmap_size < 0) { ret = mmap_size; @@ -1699,6 +1737,36 @@ static void kvm_io_ioeventfd_add(MemoryListener *listener, } } +static int kvm_ioeventfd_batch(bool start) +{ + int ret; + struct kvm_ioeventfd iofd = { + .flags = start ? 
+ KVM_IOEVENTFD_FLAG_BATCH_BEGIN : KVM_IOEVENTFD_FLAG_BATCH_END, + }; + + if (!kvm_enabled()) { + return -ENOSYS; + } + + ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd); + if (ret < 0) { + return -errno; + } + + return 0; +} + +static void kvm_ioeventfd_begin(MemoryListener *listener) +{ + kvm_ioeventfd_batch(true); +} + +static void kvm_ioeventfd_end(MemoryListener *listener) +{ + kvm_ioeventfd_batch(false); +} + static void kvm_io_ioeventfd_del(MemoryListener *listener, MemoryRegionSection *section, bool match_data, uint64_t data, @@ -1834,7 +1902,10 @@ void kvm_irqchip_commit_routes(KVMState *s) s->irq_routes->flags = 0; trace_kvm_irqchip_commit_routes(); ret = kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes); - assert(ret == 0); + if (ret < 0) { + error_report("Set GSI routing failed: %m"); + abort(); + } } static void kvm_add_routing_entry(KVMState *s, @@ -1861,10 +1932,11 @@ static void kvm_add_routing_entry(KVMState *s, set_gsi(s, entry->gsi); } -static int kvm_update_routing_entry(KVMState *s, +static int kvm_update_routing_entry(KVMRouteChange *c, struct kvm_irq_routing_entry *new_entry) { struct kvm_irq_routing_entry *entry; + KVMState *s = c->s; int n; for (n = 0; n < s->irq_routes->nr; n++) { @@ -1878,7 +1950,7 @@ static int kvm_update_routing_entry(KVMState *s, } *entry = *new_entry; - + c->changes++; return 0; } @@ -2010,7 +2082,7 @@ int kvm_irqchip_add_msi_route(KVMRouteChange *c, int vector, PCIDevice *dev) return virq; } -int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg, +int kvm_irqchip_update_msi_route(KVMRouteChange *c, int virq, MSIMessage msg, PCIDevice *dev) { struct kvm_irq_routing_entry kroute = {}; @@ -2033,13 +2105,14 @@ int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg, kroute.flags = KVM_MSI_VALID_DEVID; kroute.u.msi.devid = pci_requester_id(dev); } - if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) { + if (msg.address && + kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) { return -EINVAL; } trace_kvm_irqchip_update_msi_route(virq); - return kvm_update_routing_entry(s, &kroute); + return kvm_update_routing_entry(c, &kroute); } static int kvm_irqchip_assign_irqfd(KVMState *s, EventNotifier *event, @@ -2181,7 +2254,7 @@ static int kvm_irqchip_assign_irqfd(KVMState *s, EventNotifier *event, abort(); } -int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg) +int kvm_irqchip_update_msi_route(KVMRouteChange *c, int virq, MSIMessage msg) { return -ENOSYS; } @@ -2308,7 +2381,7 @@ bool kvm_vcpu_id_is_valid(int vcpu_id) bool kvm_dirty_ring_enabled(void) { - return kvm_state->kvm_dirty_ring_size ? 
true : false; + return kvm_state && kvm_state->kvm_dirty_ring_size; } static void query_stats_cb(StatsResultList **result, StatsTarget target, @@ -2320,6 +2393,11 @@ uint32_t kvm_dirty_ring_size(void) return kvm_state->kvm_dirty_ring_size; } +static inline bool kvm_is_virtcca_cvm_type(int type) +{ + return type & VIRTCCA_CVM_TYPE; +} + static int kvm_init(MachineState *ms) { MachineClass *mc = MACHINE_GET_CLASS(ms); @@ -2344,6 +2422,7 @@ static int kvm_init(MachineState *ms) qemu_mutex_init(&kml_slots_lock); s = KVM_STATE(ms->accelerator); + kvm_state = s; /* * On systems where the kernel can support different base page @@ -2412,6 +2491,10 @@ static int kvm_init(MachineState *ms) goto err; } + if (kvm_is_virtcca_cvm_type(type)) { + virtcca_cvm_allowed = true; + } + do { ret = kvm_ioctl(s, KVM_CREATE_VM, type); } while (ret == -EINTR); @@ -2559,8 +2642,6 @@ static int kvm_init(MachineState *ms) #endif } - kvm_state = s; - ret = kvm_arch_init(ms, s); if (ret < 0) { goto err; @@ -2580,6 +2661,8 @@ static int kvm_init(MachineState *ms) s->memory_listener.listener.eventfd_del = kvm_mem_ioeventfd_del; s->memory_listener.listener.coalesced_io_add = kvm_coalesce_mmio_region; s->memory_listener.listener.coalesced_io_del = kvm_uncoalesce_mmio_region; + s->memory_listener.listener.eventfd_begin = kvm_ioeventfd_begin; + s->memory_listener.listener.eventfd_end = kvm_ioeventfd_end; kvm_memory_listener_register(s, &s->memory_listener, &address_space_memory, 0, "kvm-memory"); @@ -2761,6 +2844,16 @@ void kvm_cpu_synchronize_pre_loadvm(CPUState *cpu) run_on_cpu(cpu, do_kvm_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL); } +void kvm_cpus_control_pre_system_reset(void) +{ + kvm_vm_ioctl(kvm_state, KVM_CONTROL_VCPU_PRE_SYSTEM_RESET, NULL); +} + +void kvm_cpus_control_post_system_reset(void) +{ + kvm_vm_ioctl(kvm_state, KVM_CONTROL_VCPU_POST_SYSTEM_RESET, NULL); +} + #ifdef KVM_HAVE_MCE_INJECTION static __thread void *pending_sigbus_addr; static __thread int pending_sigbus_code; @@ -2993,7 +3086,7 @@ int kvm_cpu_exec(CPUState *cpu) if (ret < 0) { cpu_dump_state(cpu, stderr, CPU_DUMP_CODE); - vm_stop(RUN_STATE_INTERNAL_ERROR); + qemu_system_guest_panicked(cpu_get_crash_info(cpu)); } qatomic_set(&cpu->exit_request, 0); @@ -3158,6 +3251,31 @@ bool kvm_arm_supports_user_irq(void) return kvm_check_extension(kvm_state, KVM_CAP_ARM_USER_IRQ); } +void kvm_update_hdbss_cap(bool enable, int hdbss_buffer_size) +{ + KVMState *s = kvm_state; + int size, ret; + + if (s == NULL || !kvm_check_extension(s, KVM_CAP_ARM_HW_DIRTY_STATE_TRACK)) { + return; + } + + size = hdbss_buffer_size; + if (size < 0 || size > MAX_HDBSS_BUFFER_SIZE) { + fprintf(stderr, "Invalid hdbss buffer size: %d\n", size); + return; + } + + ret = kvm_vm_enable_cap(s, KVM_CAP_ARM_HW_DIRTY_STATE_TRACK, 0, + enable ? size : 0); + if (ret) { + fprintf(stderr, "Could not %s KVM_CAP_ARM_HW_DIRTY_STATE_TRACK: %d\n", + enable ? 
"enable" : "disable", ret); + } + + return; +} + #ifdef KVM_CAP_SET_GUEST_DEBUG struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *cpu, vaddr pc) { @@ -3468,6 +3586,28 @@ int kvm_get_one_reg(CPUState *cs, uint64_t id, void *target) return r; } +int kvm_load_user_data(hwaddr loader_start, hwaddr dtb_info, hwaddr data_start, hwaddr data_size, hwaddr ram_size, + struct kvm_numa_info *numa_info) +{ + KVMState *state = kvm_state; + struct kvm_user_data data; + int ret; + + data.loader_start = loader_start; + data.dtb_info = dtb_info; + data.data_start = data_start; + data.data_size = data_size; + data.ram_size = ram_size; + memcpy(&data.numa_info, numa_info, sizeof(struct kvm_numa_info)); + + ret = kvm_vm_ioctl(state, KVM_LOAD_USER_DATA, &data); + if (ret < 0) { + error_report("%s: KVM_LOAD_USER_DATA failed!\n", __func__); + } + + return ret; +} + static bool kvm_accel_has_memory(MachineState *ms, AddressSpace *as, hwaddr start_addr, hwaddr size) { @@ -3568,6 +3708,11 @@ bool kvm_kernel_irqchip_split(void) return kvm_state->kernel_irqchip_split == ON_OFF_AUTO_ON; } +bool kvm_smccc_filter_enabled(void) +{ + return kvm_state->kvm_smccc_filter_enabled; +} + static void kvm_get_dirty_ring_size(Object *obj, Visitor *v, const char *name, void *opaque, Error **errp) @@ -3613,6 +3758,7 @@ static void kvm_accel_instance_init(Object *obj) /* KVM dirty ring is by default off */ s->kvm_dirty_ring_size = 0; s->kvm_dirty_ring_with_bitmap = false; + s->kvm_smccc_filter_enabled = false; s->kvm_eager_split_size = 0; s->notify_vmexit = NOTIFY_VMEXIT_OPTION_RUN; s->notify_window = 0; diff --git a/accel/kvm/kvm-cpus.h b/accel/kvm/kvm-cpus.h index ca40add32c3dd90f74ed692c88796323e6b08584..27b9d0d9dbfc83bb5060e48795ae6b858030c061 100644 --- a/accel/kvm/kvm-cpus.h +++ b/accel/kvm/kvm-cpus.h @@ -23,4 +23,7 @@ int kvm_insert_breakpoint(CPUState *cpu, int type, vaddr addr, vaddr len); int kvm_remove_breakpoint(CPUState *cpu, int type, vaddr addr, vaddr len); void kvm_remove_all_breakpoints(CPUState *cpu); +void kvm_cpus_control_pre_system_reset(void); +void kvm_cpus_control_post_system_reset(void); + #endif /* KVM_CPUS_H */ diff --git a/accel/kvm/trace-events b/accel/kvm/trace-events index 399aaeb0ec757cf50ba44943d4ab4d30e02c1a0d..9c880fdcf4f67b985dfb94cea39428a291c9d7e3 100644 --- a/accel/kvm/trace-events +++ b/accel/kvm/trace-events @@ -9,6 +9,10 @@ kvm_device_ioctl(int fd, int type, void *arg) "dev fd %d, type 0x%x, arg %p" kvm_failed_reg_get(uint64_t id, const char *msg) "Warning: Unable to retrieve ONEREG %" PRIu64 " from KVM: %s" kvm_failed_reg_set(uint64_t id, const char *msg) "Warning: Unable to set ONEREG %" PRIu64 " to KVM: %s" kvm_init_vcpu(int cpu_index, unsigned long arch_cpu_id) "index: %d id: %lu" +kvm_create_vcpu(int cpu_index, unsigned long arch_cpu_id, int kvm_fd) "index: %d, id: %lu, kvm fd: %d" +kvm_destroy_vcpu(int cpu_index, unsigned long arch_cpu_id) "index: %d id: %lu" +kvm_park_vcpu(int cpu_index, unsigned long arch_cpu_id) "index: %d id: %lu" +kvm_unpark_vcpu(unsigned long arch_cpu_id, const char *msg) "id: %lu %s" kvm_irqchip_commit_routes(void) "" kvm_irqchip_add_msi_route(char *name, int vector, int virq) "dev %s vector %d virq %d" kvm_irqchip_update_msi_route(int virq) "Updating MSI route virq=%d" @@ -26,3 +30,10 @@ kvm_dirty_ring_reap(uint64_t count, int64_t t) "reaped %"PRIu64" pages (took %"P kvm_dirty_ring_reaper_kick(const char *reason) "%s" kvm_dirty_ring_flush(int finished) "%d" +kvm_failed_get_vcpu_mmap_size(void) "" +kvm_cpu_exec(void) "" +kvm_interrupt_exit_request(void) 
"" +kvm_io_window_exit(void) "" +kvm_run_exit_system_event(int cpu_index, uint32_t event_type) "cpu_index %d, system_even_type %"PRIu32 +kvm_convert_memory(uint64_t start, uint64_t size, const char *msg) "start 0x%" PRIx64 " size 0x%" PRIx64 " %s" +kvm_memory_fault(uint64_t start, uint64_t size, uint64_t flags) "start 0x%" PRIx64 " size 0x%" PRIx64 " flags 0x%" PRIx64 diff --git a/accel/stubs/kvm-stub.c b/accel/stubs/kvm-stub.c index 1b37d9a302cccce5f7636bff492cdd2401996d48..e68f3433ad2c816b449eb015be3134f0124d59a5 100644 --- a/accel/stubs/kvm-stub.c +++ b/accel/stubs/kvm-stub.c @@ -25,6 +25,10 @@ bool kvm_allowed; bool kvm_readonly_mem_allowed; bool kvm_msi_use_devid; +bool virtcca_cvm_allowed; + +bool kvm_csv3_allowed; + void kvm_flush_coalesced_mmio_buffer(void) { } @@ -61,7 +65,7 @@ void kvm_irqchip_release_virq(KVMState *s, int virq) { } -int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg, +int kvm_irqchip_update_msi_route(KVMRouteChange *c, int virq, MSIMessage msg, PCIDevice *dev) { return -ENOSYS; @@ -115,6 +119,11 @@ bool kvm_arm_supports_user_irq(void) return false; } +void kvm_update_hdbss_cap(bool enable, int hdbss_buffer_size) +{ + g_assert_not_reached(); +} + bool kvm_dirty_ring_enabled(void) { return false; @@ -124,3 +133,9 @@ uint32_t kvm_dirty_ring_size(void) { return 0; } + +int kvm_load_user_data(hwaddr loader_start, hwaddr image_end, hwaddr initrd_start, hwaddr dtb_end, hwaddr ram_size, + struct kvm_numa_info *numa_info) +{ + return -ENOSYS; +} diff --git a/accel/tcg/tb-maint.c b/accel/tcg/tb-maint.c index 3d2a896220cc4ad9646af55f1cf850273062e6db..eb37f9e8a89c20b77bb86a350754562049cb1bee 100644 --- a/accel/tcg/tb-maint.c +++ b/accel/tcg/tb-maint.c @@ -712,7 +712,7 @@ static void tb_record(TranslationBlock *tb) tb_page_addr_t paddr0 = tb_page_addr0(tb); tb_page_addr_t paddr1 = tb_page_addr1(tb); tb_page_addr_t pindex0 = paddr0 >> TARGET_PAGE_BITS; - tb_page_addr_t pindex1 = paddr0 >> TARGET_PAGE_BITS; + tb_page_addr_t pindex1 = paddr1 >> TARGET_PAGE_BITS; assert(paddr0 != -1); if (unlikely(paddr1 != -1) && pindex0 != pindex1) { @@ -744,7 +744,7 @@ static void tb_remove(TranslationBlock *tb) tb_page_addr_t paddr0 = tb_page_addr0(tb); tb_page_addr_t paddr1 = tb_page_addr1(tb); tb_page_addr_t pindex0 = paddr0 >> TARGET_PAGE_BITS; - tb_page_addr_t pindex1 = paddr0 >> TARGET_PAGE_BITS; + tb_page_addr_t pindex1 = paddr1 >> TARGET_PAGE_BITS; assert(paddr0 != -1); if (unlikely(paddr1 != -1) && pindex0 != pindex1) { diff --git a/accel/tcg/tcg-accel-ops-mttcg.c b/accel/tcg/tcg-accel-ops-mttcg.c index fac80095bbd09944870a29312de0a906c63ca3c0..73866990ced13943d8f7d71aff71ae71ec855f04 100644 --- a/accel/tcg/tcg-accel-ops-mttcg.c +++ b/accel/tcg/tcg-accel-ops-mttcg.c @@ -122,6 +122,7 @@ static void *mttcg_cpu_thread_fn(void *arg) qemu_mutex_unlock_iothread(); rcu_remove_force_rcu_notifier(&force_rcu.notifier); rcu_unregister_thread(); + tcg_unregister_thread(); return NULL; } diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c index 68b252cb8e8d55a17ce0fe4c1db44fa89012747c..e87848a5e25abac2d6cb870b2516683fa9bda364 100644 --- a/accel/tcg/user-exec.c +++ b/accel/tcg/user-exec.c @@ -794,7 +794,7 @@ static int probe_access_internal(CPUArchState *env, vaddr addr, if (guest_addr_valid_untagged(addr)) { int page_flags = page_get_flags(addr); if (page_flags & acc_flag) { - if ((acc_flag == PAGE_READ || acc_flag == PAGE_WRITE) + if (access_type != MMU_INST_FETCH && cpu_plugin_mem_cbs_enabled(env_cpu(env))) { return TLB_MMIO; } diff --git a/audio/audio.c 
b/audio/audio.c index 8d1e4ad92271b39893c4e6454cce4d66d36458fa..7ac74f9e16e0fc5b6d49aae3569152c3c0ad753c 100644 --- a/audio/audio.c +++ b/audio/audio.c @@ -1744,7 +1744,7 @@ static AudioState *audio_init(Audiodev *dev, Error **errp) if (driver) { done = !audio_driver_init(s, driver, dev, errp); } else { - error_setg(errp, "Unknown audio driver `%s'\n", drvname); + error_setg(errp, "Unknown audio driver `%s'", drvname); } if (!done) { goto out; diff --git a/audio/pwaudio.c b/audio/pwaudio.c index 3ce5f6507b47270c1283cabf5b47ad81932d6e5e..5d1c7126d33e8e2a362ec80f9c930a854603f9c2 100644 --- a/audio/pwaudio.c +++ b/audio/pwaudio.c @@ -770,13 +770,15 @@ qpw_audio_init(Audiodev *dev, Error **errp) pw->core = pw_context_connect(pw->context, NULL, 0); if (pw->core == NULL) { pw_thread_loop_unlock(pw->thread_loop); - goto fail_error; + error_setg_errno(errp, errno, "Failed to connect to PipeWire instance"); + goto fail; } if (pw_core_add_listener(pw->core, &pw->core_listener, &core_events, pw) < 0) { pw_thread_loop_unlock(pw->thread_loop); - goto fail_error; + error_setg(errp, "Failed to add PipeWire listener"); + goto fail; } if (wait_resync(pw) < 0) { pw_thread_loop_unlock(pw->thread_loop); @@ -786,8 +788,6 @@ qpw_audio_init(Audiodev *dev, Error **errp) return g_steal_pointer(&pw); -fail_error: - error_setg(errp, "Failed to initialize PW context"); fail: if (pw->thread_loop) { pw_thread_loop_stop(pw->thread_loop); diff --git a/backends/Kconfig b/backends/Kconfig index f35abc16092808b1fe5b033a346908e2d66bff0b..8d0be5a263ff1ca6231a766990b1b89068d06a2e 100644 --- a/backends/Kconfig +++ b/backends/Kconfig @@ -1 +1,4 @@ source tpm/Kconfig + +config IOMMUFD + bool diff --git a/backends/cryptodev-builtin.c b/backends/cryptodev-builtin.c index 39d04552807fc15fce884c2925ed57af17877bef..940104ee55222337b577ee23944a63c3c25d46bf 100644 --- a/backends/cryptodev-builtin.c +++ b/backends/cryptodev-builtin.c @@ -23,6 +23,7 @@ #include "qemu/osdep.h" #include "sysemu/cryptodev.h" +#include "qemu/error-report.h" #include "qapi/error.h" #include "standard-headers/linux/virtio_crypto.h" #include "crypto/cipher.h" @@ -396,8 +397,8 @@ static int cryptodev_builtin_create_session( case VIRTIO_CRYPTO_HASH_CREATE_SESSION: case VIRTIO_CRYPTO_MAC_CREATE_SESSION: default: - error_setg(&local_error, "Unsupported opcode :%" PRIu32 "", - sess_info->op_code); + error_report("Unsupported opcode :%" PRIu32 "", + sess_info->op_code); return -VIRTIO_CRYPTO_NOTSUPP; } @@ -427,7 +428,9 @@ static int cryptodev_builtin_close_session( CRYPTODEV_BACKEND_BUILTIN(backend); CryptoDevBackendBuiltinSession *session; - assert(session_id < MAX_NUM_SESSIONS && builtin->sessions[session_id]); + if (session_id >= MAX_NUM_SESSIONS || !builtin->sessions[session_id]) { + return -VIRTIO_CRYPTO_INVSESS; + } session = builtin->sessions[session_id]; if (session->cipher) { @@ -552,8 +555,8 @@ static int cryptodev_builtin_operation( if (op_info->session_id >= MAX_NUM_SESSIONS || builtin->sessions[op_info->session_id] == NULL) { - error_setg(&local_error, "Cannot find a valid session id: %" PRIu64 "", - op_info->session_id); + error_report("Cannot find a valid session id: %" PRIu64 "", + op_info->session_id); return -VIRTIO_CRYPTO_INVSESS; } diff --git a/backends/cryptodev-lkcf.c b/backends/cryptodev-lkcf.c index 45aba1ff67b895a9c22658e7fd9577cfa5bb6264..45b287a953542936b7b81fa0b91d80e0b4fc03cc 100644 --- a/backends/cryptodev-lkcf.c +++ b/backends/cryptodev-lkcf.c @@ -330,6 +330,8 @@ static void cryptodev_lkcf_execute_task(CryptoDevLKCFTask *task) 
cryptodev_lkcf_set_op_desc(&session->akcipher_opts, op_desc, sizeof(op_desc), &local_error) != 0) { error_report_err(local_error); + status = -VIRTIO_CRYPTO_ERR; + goto out; } else { key_id = add_key(KCTL_KEY_TYPE_PKEY, "lkcf-backend-priv-key", p8info, p8info_len, KCTL_KEY_RING); @@ -346,6 +348,7 @@ static void cryptodev_lkcf_execute_task(CryptoDevLKCFTask *task) session->key, session->keylen, &local_error); if (!akcipher) { + error_report_err(local_error); status = -VIRTIO_CRYPTO_ERR; goto out; } diff --git a/backends/cryptodev-vhost-user.c b/backends/cryptodev-vhost-user.c index c3283ba84a55e53e3f57d22093c011c588d1d85b..b8e95ca8b42c53f5353ffa0b237dc8fbf57011aa 100644 --- a/backends/cryptodev-vhost-user.c +++ b/backends/cryptodev-vhost-user.c @@ -281,8 +281,7 @@ static int cryptodev_vhost_user_create_session( break; default: - error_setg(&local_error, "Unsupported opcode :%" PRIu32 "", - sess_info->op_code); + error_report("Unsupported opcode :%" PRIu32 "", sess_info->op_code); return -VIRTIO_CRYPTO_NOTSUPP; } diff --git a/backends/cryptodev.c b/backends/cryptodev.c index e5006bd215c8172c4bdba7e4aaa93361a7e04124..fff89fd62ae1354458bea517a000cca615068b74 100644 --- a/backends/cryptodev.c +++ b/backends/cryptodev.c @@ -398,6 +398,7 @@ static void cryptodev_backend_set_ops(Object *obj, Visitor *v, static void cryptodev_backend_complete(UserCreatable *uc, Error **errp) { + ERRP_GUARD(); CryptoDevBackend *backend = CRYPTODEV_BACKEND(uc); CryptoDevBackendClass *bc = CRYPTODEV_BACKEND_GET_CLASS(uc); uint32_t services; @@ -406,11 +407,20 @@ cryptodev_backend_complete(UserCreatable *uc, Error **errp) QTAILQ_INIT(&backend->opinfos); value = backend->tc.buckets[THROTTLE_OPS_TOTAL].avg; cryptodev_backend_set_throttle(backend, THROTTLE_OPS_TOTAL, value, errp); + if (*errp) { + return; + } value = backend->tc.buckets[THROTTLE_BPS_TOTAL].avg; cryptodev_backend_set_throttle(backend, THROTTLE_BPS_TOTAL, value, errp); + if (*errp) { + return; + } if (bc->init) { bc->init(backend, errp); + if (*errp) { + return; + } } services = backend->conf.crypto_services; diff --git a/backends/host_iommu_device.c b/backends/host_iommu_device.c new file mode 100644 index 0000000000000000000000000000000000000000..8f2dda1beb9bbea27a61c17d439aeb19ec26cc90 --- /dev/null +++ b/backends/host_iommu_device.c @@ -0,0 +1,33 @@ +/* + * Host IOMMU device abstract + * + * Copyright (C) 2024 Intel Corporation. + * + * Authors: Zhenzhong Duan + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. 
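The new file continuing below defines HostIOMMUDevice as an abstract QOM type; its finalize hook only frees hiod->name, and concrete backends are expected to subclass it. As a hedged sketch, not part of the patch, of how a concrete type would hang off that abstract parent (the "mock" names are hypothetical, and it assumes sysemu/host_iommu_device.h exports TYPE_HOST_IOMMU_DEVICE):

```c
/* Hedged sketch: registering a concrete subclass of the abstract type. */
#include "qemu/osdep.h"
#include "qom/object.h"
#include "sysemu/host_iommu_device.h"

#define TYPE_MOCK_HOST_IOMMU_DEVICE "mock-host-iommu-device"
OBJECT_DECLARE_SIMPLE_TYPE(MockHostIOMMUDevice, MOCK_HOST_IOMMU_DEVICE)

struct MockHostIOMMUDevice {
    HostIOMMUDevice parent_obj;     /* the abstract parent defined below */
};

static const TypeInfo mock_host_iommu_device_info = {
    .name          = TYPE_MOCK_HOST_IOMMU_DEVICE,
    .parent        = TYPE_HOST_IOMMU_DEVICE,
    .instance_size = sizeof(MockHostIOMMUDevice),
};

static void mock_register_types(void)
{
    type_register_static(&mock_host_iommu_device_info);
}
type_init(mock_register_types)
```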
+ */
+
+#include "qemu/osdep.h"
+#include "sysemu/host_iommu_device.h"
+
+OBJECT_DEFINE_ABSTRACT_TYPE(HostIOMMUDevice,
+                            host_iommu_device,
+                            HOST_IOMMU_DEVICE,
+                            OBJECT)
+
+static void host_iommu_device_class_init(ObjectClass *oc, void *data)
+{
+}
+
+static void host_iommu_device_init(Object *obj)
+{
+}
+
+static void host_iommu_device_finalize(Object *obj)
+{
+    HostIOMMUDevice *hiod = HOST_IOMMU_DEVICE(obj);
+
+    g_free(hiod->name);
+}
diff --git a/backends/hostmem-file.c b/backends/hostmem-file.c
index 361d4a8103ef82cba0994b24bc989a65798741ef..ce63a372a3fce4733c1adb02f7940766ba1ba2d5 100644
--- a/backends/hostmem-file.c
+++ b/backends/hostmem-file.c
@@ -20,9 +20,14 @@
 #include "qom/object.h"
 #include "qapi/visitor.h"
 #include "qapi/qapi-visit-common.h"
+#include "sysemu/kvm.h"
+#include "exec/address-spaces.h"
 
 OBJECT_DECLARE_SIMPLE_TYPE(HostMemoryBackendFile, MEMORY_BACKEND_FILE)
 
+bool virtcca_shared_hugepage_mapped = false;
+uint64_t virtcca_cvm_ram_size = 0;
+uint64_t virtcca_cvm_gpa_start = 0;
 
 struct HostMemoryBackendFile {
     HostMemoryBackend parent_obj;
@@ -36,6 +41,91 @@ struct HostMemoryBackendFile {
     OnOffAuto rom;
 };
 
+/* Parse the path of the hugepages memory file used for memory sharing */
+static int virtcca_parse_share_mem_path(char *src, char *dst)
+{
+    int ret = 0;
+    char src_copy[PATH_MAX];
+    char *token = NULL;
+    char *last_dir = NULL;
+    char *second_last_dir = NULL;
+    static const char delimiter[] = "/";
+
+    if (src == NULL || dst == NULL ||
+        strlen(src) == 0 || strlen(src) > PATH_MAX - 1) {
+        error_report("Invalid input: NULL pointer or invalid string length.");
+        return -1;
+    }
+
+    strcpy(src_copy, src);
+    token = strtok(src_copy, delimiter);
+
+    /* Iterate over the path segments to find the second-to-last directory */
+    while (token != NULL) {
+        second_last_dir = last_dir;
+        last_dir = token;
+        token = strtok(NULL, delimiter);
+    }
+
+    /* Check if the second-to-last directory is found */
+    if (second_last_dir == NULL) {
+        error_report("Invalid path: second-to-last directory not found.");
+        return -1;
+    }
+
+    /*
+     * Construct the share memory path by appending the extracted domain name
+     * to the hugepages memory filesystem prefix
+     */
+    ret = snprintf(dst, PATH_MAX, "/dev/hugepages/libvirt/qemu/%s",
+                   second_last_dir);
+
+    if (ret < 0 || ret >= PATH_MAX) {
+        error_report("Error: snprintf failed to construct the share mem path");
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Create a hugepage memory region in the virtcca scenario
+ * for sharing with processes like vhost-user and others.
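virtcca_parse_share_mem_path() above walks the "/"-separated segments with strtok() and keeps the last two, so the second-to-last segment (the libvirt domain directory) is what survives the walk. A standalone sketch of the same technique; the sample path and main() are illustrative only:

```c
/* Standalone demo of keeping the second-to-last path component. */
#include <stdio.h>
#include <string.h>

int main(void)
{
    char path[] = "/dev/hugepages/libvirt/qemu/5-guest01/ram"; /* sample */
    char *last = NULL, *second_last = NULL;

    /* strtok() writes into path[], so the buffer must be mutable */
    for (char *tok = strtok(path, "/"); tok; tok = strtok(NULL, "/")) {
        second_last = last;
        last = tok;
    }

    /* prints "5-guest01" for the sample path */
    printf("domain: %s\n", second_last ? second_last : "(none)");
    return 0;
}
```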
+ */ +static void +virtcca_shared_backend_memory_alloc(char *mem_path, uint32_t ram_flags, Error **errp) +{ + char dst[PATH_MAX]; + uint64_t size = virtcca_cvm_ram_size; + + if (virtcca_parse_share_mem_path(mem_path, dst)) { + error_report("parse virtcca share memory path failed"); + exit(1); + } + + /* + * 1) CVM_GPA_START = 3GB --> fix size = 1GB + * 2) CVM_GPA_START = 1GB && ram_size >= 3GB --> size = 3GB + * 3) CVM_GPA_START = 1GB && ram_size < 3GB --> size = ram_size + */ + if (virtcca_cvm_gpa_start != DEFAULT_VM_GPA_START) { + size = VIRTCCA_SHARED_HUGEPAGE_ADDR_LIMIT - virtcca_cvm_gpa_start; + } else if (virtcca_cvm_ram_size >= VIRTCCA_SHARED_HUGEPAGE_ADDR_LIMIT - DEFAULT_VM_GPA_START) { + size = VIRTCCA_SHARED_HUGEPAGE_ADDR_LIMIT - DEFAULT_VM_GPA_START; + } + + virtcca_shared_hugepage = g_new(MemoryRegion, 1); + memory_region_init_ram_from_file(virtcca_shared_hugepage, NULL, + "virtcca_shared_hugepage", size, + VIRTCCA_SHARED_HUGEPAGE_ALIGN, + ram_flags, dst, 0, errp); + if (*errp) { + error_reportf_err(*errp, "cannot init RamBlock for virtcca_shared_hugepage: "); + exit(1); + } + virtcca_shared_hugepage_mapped = true; +} + static void file_backend_memory_alloc(HostMemoryBackend *backend, Error **errp) { @@ -90,6 +180,12 @@ file_backend_memory_alloc(HostMemoryBackend *backend, Error **errp) backend->size, fb->align, ram_flags, fb->mem_path, fb->offset, errp); g_free(name); + + if (virtcca_cvm_enabled() && backend->share && + (strcmp(fb->mem_path, "/dev/shm") != 0) && + !virtcca_shared_hugepage_mapped) { + virtcca_shared_backend_memory_alloc(fb->mem_path, ram_flags, errp); + } #endif } diff --git a/backends/iommufd.c b/backends/iommufd.c new file mode 100644 index 0000000000000000000000000000000000000000..4446efaa32a2644ffc49fbd1632d1ef8dd7ed3d4 --- /dev/null +++ b/backends/iommufd.c @@ -0,0 +1,532 @@ +/* + * iommufd container backend + * + * Copyright (C) 2023 Intel Corporation. + * Copyright Red Hat, Inc. 
2023 + * + * Authors: Yi Liu + * Eric Auger + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "sysemu/iommufd.h" +#include "qapi/error.h" +#include "qapi/qmp/qerror.h" +#include "qemu/module.h" +#include "qom/object_interfaces.h" +#include "qemu/error-report.h" +#include "monitor/monitor.h" +#include "trace.h" +#include "hw/vfio/vfio-common.h" +#include +#include + +static void iommufd_backend_init(Object *obj) +{ + IOMMUFDBackend *be = IOMMUFD_BACKEND(obj); + + be->fd = -1; + be->users = 0; + be->owned = true; +} + +static void iommufd_backend_finalize(Object *obj) +{ + IOMMUFDBackend *be = IOMMUFD_BACKEND(obj); + + if (be->owned) { + close(be->fd); + be->fd = -1; + } +} + +static void iommufd_backend_set_fd(Object *obj, const char *str, Error **errp) +{ + ERRP_GUARD(); + IOMMUFDBackend *be = IOMMUFD_BACKEND(obj); + int fd = -1; + + fd = monitor_fd_param(monitor_cur(), str, errp); + if (fd == -1) { + error_prepend(errp, "Could not parse remote object fd %s:", str); + return; + } + be->fd = fd; + be->owned = false; + trace_iommu_backend_set_fd(be->fd); +} + +static bool iommufd_backend_can_be_deleted(UserCreatable *uc) +{ + IOMMUFDBackend *be = IOMMUFD_BACKEND(uc); + + return !be->users; +} + +static void iommufd_backend_class_init(ObjectClass *oc, void *data) +{ + UserCreatableClass *ucc = USER_CREATABLE_CLASS(oc); + + ucc->can_be_deleted = iommufd_backend_can_be_deleted; + + object_class_property_add_str(oc, "fd", NULL, iommufd_backend_set_fd); +} + +int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp) +{ + int fd, ret = 0; + + if (be->owned && !be->users) { + fd = qemu_open("/dev/iommu", O_RDWR, errp); + if (fd < 0) { + ret = fd; + goto out; + } + be->fd = fd; + } + be->users++; +out: + trace_iommufd_backend_connect(be->fd, be->owned, + be->users, ret); + return ret; +} + +void iommufd_backend_disconnect(IOMMUFDBackend *be) +{ + if (!be->users) { + goto out; + } + be->users--; + if (!be->users && be->owned) { + close(be->fd); + be->fd = -1; + } +out: + trace_iommufd_backend_disconnect(be->fd, be->users); +} + +int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id, + Error **errp) +{ + int ret, fd = be->fd; + struct iommu_ioas_alloc alloc_data = { + .size = sizeof(alloc_data), + .flags = 0, + }; + + ret = ioctl(fd, IOMMU_IOAS_ALLOC, &alloc_data); + if (ret) { + error_setg_errno(errp, errno, "Failed to allocate ioas"); + return ret; + } + + *ioas_id = alloc_data.out_ioas_id; + trace_iommufd_backend_alloc_ioas(fd, *ioas_id, ret); + + return ret; +} + +void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id) +{ + int ret, fd = be->fd; + struct iommu_destroy des = { + .size = sizeof(des), + .id = id, + }; + + ret = ioctl(fd, IOMMU_DESTROY, &des); + trace_iommufd_backend_free_id(fd, id, ret); + if (ret) { + error_report("Failed to free id: %u %m", id); + } +} + +int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova, + ram_addr_t size, void *vaddr, bool readonly) +{ + int ret, fd = be->fd; + struct iommu_ioas_map map = { + .size = sizeof(map), + .flags = IOMMU_IOAS_MAP_READABLE | + IOMMU_IOAS_MAP_FIXED_IOVA, + .ioas_id = ioas_id, + .__reserved = 0, + .user_va = (uintptr_t)vaddr, + .iova = iova, + .length = size, + }; + + if (!readonly) { + map.flags |= IOMMU_IOAS_MAP_WRITEABLE; + } + + ret = ioctl(fd, IOMMU_IOAS_MAP, &map); + trace_iommufd_backend_map_dma(fd, ioas_id, iova, size, + vaddr, readonly, ret); + if (ret) { + ret = -errno; + + /* TODO: Not support mapping hardware PCI BAR region for now. 
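The wrappers in this new backend sit directly on the /dev/iommu character device. A hedged, self-contained sketch of the raw uapi sequence they wrap (open the device, allocate an IOAS, map one anonymous page at a fixed IOVA); it assumes a kernel with iommufd enabled and trims most error handling:

```c
/* Hedged sketch, not from the patch: the raw iommufd uapi sequence. */
#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/iommufd.h>

int main(void)
{
    int fd = open("/dev/iommu", O_RDWR); /* what iommufd_backend_connect() opens */
    if (fd < 0) {
        perror("open /dev/iommu");
        return 1;
    }

    struct iommu_ioas_alloc alloc = { .size = sizeof(alloc) };
    if (ioctl(fd, IOMMU_IOAS_ALLOC, &alloc)) { /* iommufd_backend_alloc_ioas() */
        perror("IOMMU_IOAS_ALLOC");
        return 1;
    }

    void *buf = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (buf == MAP_FAILED) {
        perror("mmap");
        return 1;
    }

    struct iommu_ioas_map map = {   /* what iommufd_backend_map_dma() fills */
        .size = sizeof(map),
        .flags = IOMMU_IOAS_MAP_READABLE | IOMMU_IOAS_MAP_WRITEABLE |
                 IOMMU_IOAS_MAP_FIXED_IOVA,
        .ioas_id = alloc.out_ioas_id,
        .user_va = (uintptr_t)buf,
        .iova = 0x10000,
        .length = 4096,
    };
    if (ioctl(fd, IOMMU_IOAS_MAP, &map)) {
        perror("IOMMU_IOAS_MAP");
    }

    close(fd);
    return 0;
}
```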
*/
+        if (errno == EFAULT) {
+            warn_report("IOMMU_IOAS_MAP failed: %m, PCI BAR?");
+        } else {
+            error_report("IOMMU_IOAS_MAP failed: %m");
+        }
+    }
+    return ret;
+}
+
+int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
+                              hwaddr iova, ram_addr_t size)
+{
+    int ret, fd = be->fd;
+    struct iommu_ioas_unmap unmap = {
+        .size = sizeof(unmap),
+        .ioas_id = ioas_id,
+        .iova = iova,
+        .length = size,
+    };
+
+    ret = ioctl(fd, IOMMU_IOAS_UNMAP, &unmap);
+    /*
+     * IOMMUFD takes a mapping as a kind of object; unmapping a
+     * nonexistent mapping is treated as deleting a nonexistent
+     * object and returns ENOENT. This is different from the legacy
+     * backend, which allows it. A vIOMMU may trigger a lot of
+     * redundant unmapping; to avoid flooding the log, treat it
+     * as success for IOMMUFD, just like the legacy backend.
+     */
+    if (ret && errno == ENOENT) {
+        trace_iommufd_backend_unmap_dma_non_exist(fd, ioas_id, iova, size, ret);
+        ret = 0;
+    } else {
+        trace_iommufd_backend_unmap_dma(fd, ioas_id, iova, size, ret);
+    }
+
+    if (ret) {
+        ret = -errno;
+        error_report("IOMMU_IOAS_UNMAP failed: %m");
+    }
+    return ret;
+}
+
+bool iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id,
+                                uint32_t pt_id, uint32_t flags,
+                                uint32_t data_type, uint32_t data_len,
+                                void *data_ptr, uint32_t *out_hwpt,
+                                uint32_t *out_fault_fd, Error **errp)
+{
+    int ret, fd = be->fd;
+    struct iommu_hwpt_alloc alloc_hwpt = {
+        .size = sizeof(struct iommu_hwpt_alloc),
+        .flags = flags,
+        .dev_id = dev_id,
+        .pt_id = pt_id,
+        .data_type = data_type,
+        .data_len = data_len,
+        .data_uptr = (uintptr_t)data_ptr,
+    };
+
+    if (flags & IOMMU_HWPT_FAULT_ID_VALID) {
+
+        struct iommu_fault_alloc cmd = {
+            .size = sizeof(cmd),
+        };
+
+        ret = ioctl(fd, IOMMU_FAULT_QUEUE_ALLOC, &cmd);
+        if (ret) {
+            ret = -errno;
+            error_report("IOMMU_FAULT_ALLOC failed: %m");
+        } else {
+            alloc_hwpt.fault_id = cmd.out_fault_id;
+            if (out_fault_fd) {
+                *out_fault_fd = cmd.out_fault_fd;
+            }
+        }
+    }
+
+    ret = ioctl(fd, IOMMU_HWPT_ALLOC, &alloc_hwpt);
+    trace_iommufd_backend_alloc_hwpt(fd, dev_id, pt_id, flags, data_type,
+                                     data_len, (uintptr_t)data_ptr,
+                                     alloc_hwpt.out_hwpt_id, ret);
+    if (ret) {
+        error_setg_errno(errp, errno, "Failed to allocate hwpt");
+        return false;
+    }
+
+    *out_hwpt = alloc_hwpt.out_hwpt_id;
+    return true;
+}
+
+bool iommufd_backend_set_dirty_tracking(IOMMUFDBackend *be,
+                                        uint32_t hwpt_id, bool start,
+                                        Error **errp)
+{
+    int ret;
+    struct iommu_hwpt_set_dirty_tracking set_dirty = {
+        .size = sizeof(set_dirty),
+        .hwpt_id = hwpt_id,
+        .flags = start ? IOMMU_HWPT_DIRTY_TRACKING_ENABLE : 0,
+    };
+
+    ret = ioctl(be->fd, IOMMU_HWPT_SET_DIRTY_TRACKING, &set_dirty);
+    trace_iommufd_backend_set_dirty(be->fd, hwpt_id, start, ret ? errno : 0);
+    if (ret) {
+        error_setg_errno(errp, errno,
+                         "IOMMU_HWPT_SET_DIRTY_TRACKING(hwpt_id %u) failed",
+                         hwpt_id);
+        return false;
+    }
+
+    return true;
+}
+
+bool iommufd_backend_get_dirty_bitmap(IOMMUFDBackend *be,
+                                      uint32_t hwpt_id,
+                                      uint64_t iova, ram_addr_t size,
+                                      uint64_t page_size, uint64_t *data,
+                                      Error **errp)
+{
+    int ret;
+    struct iommu_hwpt_get_dirty_bitmap get_dirty_bitmap = {
+        .size = sizeof(get_dirty_bitmap),
+        .hwpt_id = hwpt_id,
+        .iova = iova,
+        .length = size,
+        .page_size = page_size,
+        .data = (uintptr_t)data,
+    };
+
+    ret = ioctl(be->fd, IOMMU_HWPT_GET_DIRTY_BITMAP, &get_dirty_bitmap);
+    trace_iommufd_backend_get_dirty_bitmap(be->fd, hwpt_id, iova, size,
+                                           page_size, ret ?
errno : 0); + if (ret) { + error_setg_errno(errp, errno, + "IOMMU_HWPT_GET_DIRTY_BITMAP (iova: 0x%"HWADDR_PRIx + " size: 0x"RAM_ADDR_FMT") failed", iova, size); + return false; + } + + return true; +} + +bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, + uint32_t *type, void *data, uint32_t len, + uint64_t *caps, uint8_t *max_pasid_log2, + Error **errp) +{ + struct iommu_hw_info info = { + .size = sizeof(info), + .dev_id = devid, + .data_len = len, + .data_uptr = (uintptr_t)data, + }; + + if (ioctl(be->fd, IOMMU_GET_HW_INFO, &info)) { + error_setg_errno(errp, errno, "Failed to get hardware info"); + return false; + } + + g_assert(type); + *type = info.out_data_type; + g_assert(caps); + *caps = info.out_capabilities; + *max_pasid_log2 = info.out_max_pasid_log2; + + return true; +} + +int iommufd_backend_invalidate_cache(IOMMUFDBackend *be, uint32_t hwpt_id, + uint32_t data_type, uint32_t entry_len, + uint32_t *entry_num, void *data_ptr) +{ + int ret, fd = be->fd; + struct iommu_hwpt_invalidate cache = { + .size = sizeof(cache), + .hwpt_id = hwpt_id, + .data_type = data_type, + .entry_len = entry_len, + .entry_num = *entry_num, + .data_uptr = (uintptr_t)data_ptr, + }; + + ret = ioctl(fd, IOMMU_HWPT_INVALIDATE, &cache); + + trace_iommufd_backend_invalidate_cache(fd, hwpt_id, data_type, entry_len, + *entry_num, cache.entry_num, + (uintptr_t)data_ptr, ret); + if (ret) { + *entry_num = cache.entry_num; + error_report("IOMMU_HWPT_INVALIDATE failed: %s", strerror(errno)); + ret = -errno; + } else { + g_assert(*entry_num == cache.entry_num); + } + + return ret; +} + +struct IOMMUFDViommu *iommufd_backend_alloc_viommu(IOMMUFDBackend *be, + uint32_t dev_id, + uint32_t viommu_type, + uint32_t hwpt_id) +{ + int ret, fd = be->fd; + struct IOMMUFDViommu *viommu = g_malloc(sizeof(*viommu)); + struct iommu_viommu_alloc alloc_viommu = { + .size = sizeof(alloc_viommu), + .type = viommu_type, + .dev_id = dev_id, + .hwpt_id = hwpt_id, + }; + + if (!viommu) { + error_report("failed to allocate viommu object"); + return NULL; + } + + ret = ioctl(fd, IOMMU_VIOMMU_ALLOC, &alloc_viommu); + + trace_iommufd_backend_alloc_viommu(fd, viommu_type, dev_id, hwpt_id, + alloc_viommu.out_viommu_id, ret); + if (ret) { + error_report("IOMMU_VIOMMU_ALLOC failed: %s", strerror(errno)); + g_free(viommu); + return NULL; + } + + viommu->viommu_id = alloc_viommu.out_viommu_id; + viommu->s2_hwpt_id = hwpt_id; + viommu->iommufd = be; + return viommu; +} + +struct IOMMUFDVdev *iommufd_backend_alloc_vdev(HostIOMMUDeviceIOMMUFD *idev, + IOMMUFDViommu *viommu, + uint64_t virt_id) +{ + int ret, fd = viommu->iommufd->fd; + struct IOMMUFDVdev *vdev = g_malloc(sizeof(*vdev)); + struct iommu_vdevice_alloc alloc_vdev = { + .size = sizeof(alloc_vdev), + .viommu_id = viommu->viommu_id, + .dev_id = idev->devid, + .virt_id = virt_id, + }; + + ret = ioctl(fd, IOMMU_VDEVICE_ALLOC, &alloc_vdev); + + trace_iommufd_backend_alloc_vdev(fd, idev->devid, viommu->viommu_id, virt_id, + alloc_vdev.out_vdevice_id, ret); + + if (ret) { + error_report("IOMMU_VDEVICE_ALLOC failed: %s", strerror(errno)); + g_free(vdev); + return NULL; + } + + vdev->idev = idev; + vdev->viommu = viommu; + vdev->virt_id = virt_id; + vdev->vdev_id = alloc_vdev.out_vdevice_id; + return vdev; +} + +int iommufd_viommu_invalidate_cache(IOMMUFDBackend *be, uint32_t viommu_id, + uint32_t data_type, uint32_t entry_len, + uint32_t *entry_num, void *data_ptr) +{ + int ret, fd = be->fd; + struct iommu_hwpt_invalidate cache = { + .size = sizeof(cache), + .hwpt_id = 
viommu_id, + .data_type = data_type, + .entry_len = entry_len, + .entry_num = *entry_num, + .data_uptr = (uint64_t)data_ptr, + }; + + ret = ioctl(fd, IOMMU_HWPT_INVALIDATE, &cache); + + trace_iommufd_viommu_invalidate_cache(fd, viommu_id, data_type, + entry_len, *entry_num, + cache.entry_num, + (uint64_t)data_ptr, ret); + if (ret) { + *entry_num = cache.entry_num; + error_report("IOMMU_VIOMMU_INVALIDATE failed: %s", strerror(errno)); + ret = -errno; + } else { + g_assert(*entry_num == cache.entry_num); + } + + return ret; +} + +bool host_iommu_device_iommufd_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + uint32_t hwpt_id, Error **errp) +{ + HostIOMMUDeviceIOMMUFDClass *idevc = + HOST_IOMMU_DEVICE_IOMMUFD_GET_CLASS(idev); + + g_assert(idevc->attach_hwpt); + return idevc->attach_hwpt(idev, hwpt_id, errp); +} + +bool host_iommu_device_iommufd_detach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + Error **errp) +{ + HostIOMMUDeviceIOMMUFDClass *idevc = + HOST_IOMMU_DEVICE_IOMMUFD_GET_CLASS(idev); + + g_assert(idevc->detach_hwpt); + return idevc->detach_hwpt(idev, errp); +} + +static int hiod_iommufd_get_cap(HostIOMMUDevice *hiod, int cap, Error **errp) +{ + HostIOMMUDeviceCaps *caps = &hiod->caps; + + switch (cap) { + case HOST_IOMMU_DEVICE_CAP_IOMMU_TYPE: + return caps->type; + case HOST_IOMMU_DEVICE_CAP_AW_BITS: + return vfio_device_get_aw_bits(hiod->agent); + default: + error_setg(errp, "%s: unsupported capability %x", hiod->name, cap); + return -EINVAL; + } +} + +static void hiod_iommufd_class_init(ObjectClass *oc, void *data) +{ + HostIOMMUDeviceClass *hioc = HOST_IOMMU_DEVICE_CLASS(oc); + + hioc->get_cap = hiod_iommufd_get_cap; +}; + +static const TypeInfo types[] = { + { + .name = TYPE_IOMMUFD_BACKEND, + .parent = TYPE_OBJECT, + .instance_size = sizeof(IOMMUFDBackend), + .instance_init = iommufd_backend_init, + .instance_finalize = iommufd_backend_finalize, + .class_size = sizeof(IOMMUFDBackendClass), + .class_init = iommufd_backend_class_init, + .interfaces = (InterfaceInfo[]) { + { TYPE_USER_CREATABLE }, + { } + } + }, { + .name = TYPE_HOST_IOMMU_DEVICE_IOMMUFD, + .parent = TYPE_HOST_IOMMU_DEVICE, + .instance_size = sizeof(HostIOMMUDeviceIOMMUFD), + .class_size = sizeof(HostIOMMUDeviceIOMMUFDClass), + .class_init = hiod_iommufd_class_init, + .abstract = true, + } +}; +DEFINE_TYPES(types) diff --git a/backends/meson.build b/backends/meson.build index 914c7c4afb905cfe710ad23dd1ee42907f6d1679..68b5e34e04959e52472fdf6dba7a703540e4fa4c 100644 --- a/backends/meson.build +++ b/backends/meson.build @@ -13,6 +13,7 @@ system_ss.add([files( system_ss.add(when: 'CONFIG_POSIX', if_true: files('rng-random.c')) system_ss.add(when: 'CONFIG_POSIX', if_true: files('hostmem-file.c')) system_ss.add(when: 'CONFIG_LINUX', if_true: files('hostmem-memfd.c')) +system_ss.add(when: 'CONFIG_LINUX', if_true: files('host_iommu_device.c')) if keyutils.found() system_ss.add(keyutils, files('cryptodev-lkcf.c')) endif @@ -20,6 +21,7 @@ if have_vhost_user system_ss.add(when: 'CONFIG_VIRTIO', if_true: files('vhost-user.c')) endif system_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('cryptodev-vhost.c')) +system_ss.add(when: 'CONFIG_IOMMUFD', if_true: files('iommufd.c')) if have_vhost_user_crypto system_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('cryptodev-vhost-user.c')) endif diff --git a/backends/tpm/tpm_emulator.c b/backends/tpm/tpm_emulator.c index f7f1b4ad7a80588c747883fe212d1207687728ed..0d07df216e47a3ced2a35b030b5c0911394608fc 100644 --- a/backends/tpm/tpm_emulator.c +++ b/backends/tpm/tpm_emulator.c @@ -128,10 
+128,10 @@ static int tpm_emulator_ctrlcmd(TPMEmulator *tpm, unsigned long cmd, void *msg, CharBackend *dev = &tpm->ctrl_chr; uint32_t cmd_no = cpu_to_be32(cmd); ssize_t n = sizeof(uint32_t) + msg_len_in; - uint8_t *buf = NULL; WITH_QEMU_LOCK_GUARD(&tpm->mutex) { - buf = g_alloca(n); + g_autofree uint8_t *buf = g_malloc(n); + memcpy(buf, &cmd_no, sizeof(cmd_no)); memcpy(buf + sizeof(cmd_no), msg, msg_len_in); diff --git a/backends/trace-events b/backends/trace-events index 652eb76a5723e2053fe97338c481309c58284d6a..f8592a27111234a6a59cf7fc0fcc253268876fb1 100644 --- a/backends/trace-events +++ b/backends/trace-events @@ -5,3 +5,20 @@ dbus_vmstate_pre_save(void) dbus_vmstate_post_load(int version_id) "version_id: %d" dbus_vmstate_loading(const char *id) "id: %s" dbus_vmstate_saving(const char *id) "id: %s" + +# iommufd.c +iommufd_backend_connect(int fd, bool owned, uint32_t users, int ret) "fd=%d owned=%d users=%d (%d)" +iommufd_backend_disconnect(int fd, uint32_t users) "fd=%d users=%d" +iommu_backend_set_fd(int fd) "pre-opened /dev/iommu fd=%d" +iommufd_backend_map_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, void *vaddr, bool readonly, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" addr=%p readonly=%d (%d)" +iommufd_backend_unmap_dma_non_exist(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " Unmap nonexistent mapping: iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)" +iommufd_backend_unmap_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)" +iommufd_backend_alloc_ioas(int iommufd, uint32_t ioas, int ret) " iommufd=%d ioas=%d (%d)" +iommufd_backend_alloc_hwpt(int iommufd, uint32_t dev_id, uint32_t pt_id, uint32_t flags, uint32_t hwpt_type, uint32_t len, uint64_t data_ptr, uint32_t out_hwpt_id, int ret) " iommufd=%d dev_id=%u pt_id=%u flags=0x%x hwpt_type=%u len=%u data_ptr=0x%"PRIx64" out_hwpt=%u (%d)" +iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d id=%d (%d)" +iommufd_backend_set_dirty(int iommufd, uint32_t hwpt_id, bool start, int ret) " iommufd=%d hwpt=%u enable=%d (%d)" +iommufd_backend_get_dirty_bitmap(int iommufd, uint32_t hwpt_id, uint64_t iova, uint64_t size, uint64_t page_size, int ret) " iommufd=%d hwpt=%u iova=0x%"PRIx64" size=0x%"PRIx64" page_size=0x%"PRIx64" (%d)" +iommufd_backend_invalidate_cache(int iommufd, uint32_t hwpt_id, uint32_t data_type, uint32_t entry_len, uint32_t entry_num, uint32_t done_num, uint64_t data_ptr, int ret) " iommufd=%d hwpt_id=%u data_type=%u entry_len=%u entry_num=%u done_num=%u data_ptr=0x%"PRIx64" (%d)" +iommufd_backend_alloc_viommu(int iommufd, uint32_t type, uint32_t dev_id, uint32_t hwpt_id, uint32_t viommu_id, int ret) " iommufd=%d type=%u dev_id=%u hwpt_id=%u viommu_id=%u (%d)" +iommufd_backend_alloc_vdev(int iommufd, uint32_t dev_id, uint32_t viommu_id, uint64_t virt_id, uint32_t vdev_id, int ret) " iommufd=%d dev_id=%u viommu_id=%u virt_id=0x%"PRIx64" vdev_id=%u (%d)" +iommufd_viommu_invalidate_cache(int iommufd, uint32_t viommu_id, uint32_t data_type, uint32_t entry_len, uint32_t entry_num, uint32_t done_num, uint64_t data_ptr, int ret) " iommufd=%d viommu_id=%u data_type=%u entry_len=%u entry_num=%u done_num=%u data_ptr=0x%"PRIx64" (%d)" diff --git a/block.c b/block.c index bfb0861ec61d2965e1ebd1d3d05976ab25a76c7f..6a2abfabcbc0e0cc4daacf7429e0e9129325eef2 100644 --- a/block.c +++ b/block.c @@ -68,6 +68,9 @@ #define NOT_DONE 0x7fffffff /* used while emulated sync operation 
in progress */ +#define DEFAULT_BIOS_BOOT_LOADER_DIR "/usr/share/edk2" +#define DEFAULT_NVRAM_TEMPLATE_DIR "/var/lib/libvirt/qemu/nvram" + /* Protected by BQL */ static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states = QTAILQ_HEAD_INITIALIZER(graph_bdrv_states); @@ -86,6 +89,7 @@ static BlockDriverState *bdrv_open_inherit(const char *filename, BlockDriverState *parent, const BdrvChildClass *child_class, BdrvChildRole child_role, + bool parse_filename, Error **errp); static bool bdrv_recurse_has_child(BlockDriverState *bs, @@ -2047,7 +2051,8 @@ static void parse_json_protocol(QDict *options, const char **pfilename, * block driver has been specified explicitly. */ static int bdrv_fill_options(QDict **options, const char *filename, - int *flags, Error **errp) + int *flags, bool allow_parse_filename, + Error **errp) { const char *drvname; bool protocol = *flags & BDRV_O_PROTOCOL; @@ -2089,7 +2094,7 @@ static int bdrv_fill_options(QDict **options, const char *filename, if (protocol && filename) { if (!qdict_haskey(*options, "filename")) { qdict_put_str(*options, "filename", filename); - parse_filename = true; + parse_filename = allow_parse_filename; } else { error_setg(errp, "Can't specify 'file' and 'filename' options at " "the same time"); @@ -3675,7 +3680,8 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options, } backing_hd = bdrv_open_inherit(backing_filename, reference, options, 0, bs, - &child_of_bds, bdrv_backing_role(bs), errp); + &child_of_bds, bdrv_backing_role(bs), true, + errp); if (!backing_hd) { bs->open_flags |= BDRV_O_NO_BACKING; error_prepend(errp, "Could not open backing file: "); @@ -3712,7 +3718,8 @@ free_exit: static BlockDriverState * bdrv_open_child_bs(const char *filename, QDict *options, const char *bdref_key, BlockDriverState *parent, const BdrvChildClass *child_class, - BdrvChildRole child_role, bool allow_none, Error **errp) + BdrvChildRole child_role, bool allow_none, + bool parse_filename, Error **errp) { BlockDriverState *bs = NULL; QDict *image_options; @@ -3743,7 +3750,8 @@ bdrv_open_child_bs(const char *filename, QDict *options, const char *bdref_key, } bs = bdrv_open_inherit(filename, reference, image_options, 0, - parent, child_class, child_role, errp); + parent, child_class, child_role, parse_filename, + errp); if (!bs) { goto done; } @@ -3753,6 +3761,37 @@ done: return bs; } +static BdrvChild *bdrv_open_child_common(const char *filename, + QDict *options, const char *bdref_key, + BlockDriverState *parent, + const BdrvChildClass *child_class, + BdrvChildRole child_role, + bool allow_none, bool parse_filename, + Error **errp) +{ + BlockDriverState *bs; + BdrvChild *child; + AioContext *ctx; + + GLOBAL_STATE_CODE(); + + bs = bdrv_open_child_bs(filename, options, bdref_key, parent, child_class, + child_role, allow_none, parse_filename, errp); + if (bs == NULL) { + return NULL; + } + + bdrv_graph_wrlock(NULL); + ctx = bdrv_get_aio_context(bs); + aio_context_acquire(ctx); + child = bdrv_attach_child(parent, bs, bdref_key, child_class, child_role, + errp); + aio_context_release(ctx); + bdrv_graph_wrunlock(NULL); + + return child; +} + /* * Opens a disk image whose options are given as BlockdevRef in another block * device's options. 
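A BlockdevRef in a parent's options can be either a reference to an existing node by name or an inline dictionary of child options; bdrv_open_child_bs() accepts both forms. A hedged sketch of the two shapes, assuming QEMU's QDict API (the node name and path are made up):

```c
/* Hedged sketch: two ways a parent's options can carry the "file" child. */
#include "qemu/osdep.h"
#include "qapi/qmp/qdict.h"

static QDict *make_parent_options(bool by_reference)
{
    QDict *opts = qdict_new();

    if (by_reference) {
        /* "file" names an existing node; no filename parsing happens */
        qdict_put_str(opts, "file", "my-protocol-node");
    } else {
        /* inline child options under the flattened "file." prefix */
        qdict_put_str(opts, "file.driver", "file");
        qdict_put_str(opts, "file.filename", "/var/lib/images/disk.raw");
    }
    return opts;
}
```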
@@ -3778,31 +3817,15 @@ BdrvChild *bdrv_open_child(const char *filename, BdrvChildRole child_role, bool allow_none, Error **errp) { - BlockDriverState *bs; - BdrvChild *child; - AioContext *ctx; - - GLOBAL_STATE_CODE(); - - bs = bdrv_open_child_bs(filename, options, bdref_key, parent, child_class, - child_role, allow_none, errp); - if (bs == NULL) { - return NULL; - } - - bdrv_graph_wrlock(NULL); - ctx = bdrv_get_aio_context(bs); - aio_context_acquire(ctx); - child = bdrv_attach_child(parent, bs, bdref_key, child_class, child_role, - errp); - aio_context_release(ctx); - bdrv_graph_wrunlock(NULL); - - return child; + return bdrv_open_child_common(filename, options, bdref_key, parent, + child_class, child_role, allow_none, false, + errp); } /* - * Wrapper on bdrv_open_child() for most popular case: open primary child of bs. + * This does mostly the same as bdrv_open_child(), but for opening the primary + * child of a node. A notable difference from bdrv_open_child() is that it + * enables filename parsing for protocol names (including json:). * * The caller must hold the lock of the main AioContext and no other AioContext. * @parent can move to a different AioContext in this function. Callers must @@ -3819,8 +3842,8 @@ int bdrv_open_file_child(const char *filename, role = parent->drv->is_filter ? (BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY) : BDRV_CHILD_IMAGE; - if (!bdrv_open_child(filename, options, bdref_key, parent, - &child_of_bds, role, false, errp)) + if (!bdrv_open_child_common(filename, options, bdref_key, parent, + &child_of_bds, role, false, true, errp)) { return -EINVAL; } @@ -3865,7 +3888,8 @@ BlockDriverState *bdrv_open_blockdev_ref(BlockdevRef *ref, Error **errp) } - bs = bdrv_open_inherit(NULL, reference, qdict, 0, NULL, NULL, 0, errp); + bs = bdrv_open_inherit(NULL, reference, qdict, 0, NULL, NULL, 0, false, + errp); obj = NULL; qobject_unref(obj); visit_free(v); @@ -3962,7 +3986,7 @@ static BlockDriverState * no_coroutine_fn bdrv_open_inherit(const char *filename, const char *reference, QDict *options, int flags, BlockDriverState *parent, const BdrvChildClass *child_class, BdrvChildRole child_role, - Error **errp) + bool parse_filename, Error **errp) { int ret; BlockBackend *file = NULL; @@ -4011,9 +4035,11 @@ bdrv_open_inherit(const char *filename, const char *reference, QDict *options, } /* json: syntax counts as explicit options, as if in the QDict */ - parse_json_protocol(options, &filename, &local_err); - if (local_err) { - goto fail; + if (parse_filename) { + parse_json_protocol(options, &filename, &local_err); + if (local_err) { + goto fail; + } } bs->explicit_options = qdict_clone_shallow(options); @@ -4038,7 +4064,8 @@ bdrv_open_inherit(const char *filename, const char *reference, QDict *options, parent->open_flags, parent->options); } - ret = bdrv_fill_options(&options, filename, &flags, &local_err); + ret = bdrv_fill_options(&options, filename, &flags, parse_filename, + &local_err); if (ret < 0) { goto fail; } @@ -4107,7 +4134,7 @@ bdrv_open_inherit(const char *filename, const char *reference, QDict *options, file_bs = bdrv_open_child_bs(filename, options, "file", bs, &child_of_bds, BDRV_CHILD_IMAGE, - true, &local_err); + true, true, &local_err); if (local_err) { goto fail; } @@ -4270,7 +4297,7 @@ BlockDriverState *bdrv_open(const char *filename, const char *reference, GLOBAL_STATE_CODE(); return bdrv_open_inherit(filename, reference, options, flags, NULL, - NULL, 0, errp); + NULL, 0, true, errp); } /* Return true if the NULL-terminated @list contains @str */ @@ 
-7017,7 +7044,13 @@ int coroutine_fn bdrv_co_invalidate_cache(BlockDriverState *bs, Error **errp)
     assert(!(bs->open_flags & BDRV_O_INACTIVE));
     assert_bdrv_graph_readable();
 
-    if (bs->drv->bdrv_co_invalidate_cache) {
+    /*
+     * The BIOS bootloader and the NVRAM template do not need to drop their
+     * cache on migration; skip this step for them to avoid increasing
+     * downtime.
+     */
+    if (bs->drv->bdrv_co_invalidate_cache &&
+        !strstr(bs->filename, DEFAULT_BIOS_BOOT_LOADER_DIR) &&
+        !strstr(bs->filename, DEFAULT_NVRAM_TEMPLATE_DIR)) {
         bs->drv->bdrv_co_invalidate_cache(bs, &local_err);
         if (local_err) {
             error_propagate(errp, local_err);
@@ -7298,6 +7331,22 @@ bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
                    bdrv_get_device_or_node_name(bs));
         return true;
     }
+
+    /*
+     * When migration puts a BDRV_O_INACTIVE flag on driver's open_flags,
+     * we fake a blocker that doesn't exist. From now on, block jobs
+     * will not be permitted.
+     */
+    if ((op == BLOCK_OP_TYPE_RESIZE || op == BLOCK_OP_TYPE_COMMIT_SOURCE ||
+         op == BLOCK_OP_TYPE_MIRROR_SOURCE || op == BLOCK_OP_TYPE_MIRROR_TARGET) &&
+        (bs->open_flags & BDRV_O_INACTIVE)) {
+        if (errp) {
+            error_setg(errp, "block device is in use by migration with"
+                       " a driver BDRV_O_INACTIVE flag set");
+        }
+        return true;
+    }
+
     return false;
 }
diff --git a/block/blkio.c b/block/blkio.c
index 0a0a6c0f5fde6ae874982671dbe9842d2d8c70e0..52ac94527fb4c788424345d1823d2ad50f88fb6d 100644
--- a/block/blkio.c
+++ b/block/blkio.c
@@ -68,7 +68,7 @@ typedef struct {
     CoQueue bounce_available;
 
     /* The value of the "mem-region-alignment" property */
-    size_t mem_region_alignment;
+    uint64_t mem_region_alignment;
 
     /* Can we skip adding/deleting blkio_mem_regions? */
     bool needs_mem_regions;
@@ -89,6 +89,9 @@ static int blkio_resize_bounce_pool(BDRVBlkioState *s, int64_t bytes)
     /* Pad size to reduce frequency of resize calls */
     bytes += 128 * 1024;
 
+    /* Align the pool size to avoid blkio_alloc_mem_region() failure */
+    bytes = QEMU_ALIGN_UP(bytes, s->mem_region_alignment);
+
     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
         int ret;
 
@@ -896,8 +899,10 @@ static int blkio_file_open(BlockDriverState *bs, QDict *options, int flags,
     }
 
     bs->supported_write_flags = BDRV_REQ_FUA | BDRV_REQ_REGISTERED_BUF;
-    bs->supported_zero_flags = BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP |
-                               BDRV_REQ_NO_FALLBACK;
+    bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK;
+#ifdef CONFIG_BLKIO_WRITE_ZEROS_FUA
+    bs->supported_zero_flags |= BDRV_REQ_FUA;
+#endif
 
     qemu_mutex_init(&s->blkio_lock);
     qemu_co_mutex_init(&s->bounce_lock);
diff --git a/block/block-backend.c b/block/block-backend.c
index ec21148806982ae42c5a58526b31069f3c481a71..bfbbb18af1c3faf8b1282007089a70827fbe9068 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -92,6 +92,15 @@ struct BlockBackend {
      * Accessed with atomic ops.
      */
     unsigned int in_flight;
+
+    /* Timer for retry on errors. */
+    QEMUTimer *retry_timer;
+    /* Interval in ms to trigger next retry. */
+    int64_t retry_interval;
+    /* Start time of the first error. Used to check timeout. */
+    int64_t retry_start_time;
+    /* Retry timeout. 0 represents infinite retry.
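The retry fields added below feed blk_error_retry_timeout(): the first failure latches a start timestamp, later failures compare elapsed wall-clock time against the timeout, and a timeout of 0 means retry forever. A standalone sketch of that bookkeeping, with now_ms() as a hypothetical stand-in for qemu_clock_get_ms(QEMU_CLOCK_REALTIME):

```c
/* Standalone sketch of the retry-timeout bookkeeping. */
#include <stdbool.h>
#include <stdint.h>
#include <time.h>

static int64_t now_ms(void)
{
    struct timespec ts;
    clock_gettime(CLOCK_REALTIME, &ts);
    return ts.tv_sec * 1000LL + ts.tv_nsec / 1000000;
}

static int64_t retry_start_time;     /* 0 until the first error occurs */
static int64_t retry_timeout = 5000; /* ms; 0 would mean retry forever */

static bool retry_timed_out(void)
{
    if (!retry_timeout) {
        return false;                /* infinite retries */
    }
    if (!retry_start_time) {
        retry_start_time = now_ms(); /* first error starts the clock */
        return false;
    }
    return now_ms() > retry_start_time + retry_timeout;
}
```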
*/ + int64_t retry_timeout; }; typedef struct BlockBackendAIOCB { @@ -368,6 +377,11 @@ BlockBackend *blk_new(AioContext *ctx, uint64_t perm, uint64_t shared_perm) blk->on_read_error = BLOCKDEV_ON_ERROR_REPORT; blk->on_write_error = BLOCKDEV_ON_ERROR_ENOSPC; + blk->retry_timer = NULL; + blk->retry_interval = BLOCK_BACKEND_DEFAULT_RETRY_INTERVAL; + blk->retry_start_time = 0; + blk->retry_timeout = 0; + block_acct_init(&blk->stats); qemu_mutex_init(&blk->queued_requests_lock); @@ -508,6 +522,10 @@ static void blk_delete(BlockBackend *blk) QTAILQ_REMOVE(&block_backends, blk, link); drive_info_del(blk->legacy_dinfo); block_acct_cleanup(&blk->stats); + if (blk->retry_timer) { + timer_del(blk->retry_timer); + timer_free(blk->retry_timer); + } g_free(blk); } @@ -1102,6 +1120,14 @@ void blk_set_dev_ops(BlockBackend *blk, const BlockDevOps *ops, blk->dev_ops = ops; blk->dev_opaque = opaque; + if ((blk->on_read_error == BLOCKDEV_ON_ERROR_RETRY || + blk->on_write_error == BLOCKDEV_ON_ERROR_RETRY) && + ops->retry_request_cb) { + blk->retry_timer = aio_timer_new(blk->ctx, QEMU_CLOCK_REALTIME, + SCALE_MS, ops->retry_request_cb, + opaque); + } + /* Are we currently quiesced? Should we enforce this right now? */ if (qatomic_read(&blk->quiesce_counter) && ops && ops->drained_begin) { ops->drained_begin(opaque); @@ -2120,6 +2146,39 @@ void blk_drain_all(void) bdrv_drain_all_end(); } +void blk_set_on_error_retry_interval(BlockBackend *blk, int64_t interval) +{ + blk->retry_interval = interval; +} + +void blk_set_on_error_retry_timeout(BlockBackend *blk, int64_t timeout) +{ + blk->retry_timeout = timeout; +} + +static bool blk_error_retry_timeout(BlockBackend *blk) +{ + /* No timeout set, infinite retries. */ + if (!blk->retry_timeout) { + return false; + } + + /* The first time an error occurs. */ + if (!blk->retry_start_time) { + blk->retry_start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); + return false; + } + + return qemu_clock_get_ms(QEMU_CLOCK_REALTIME) > (blk->retry_start_time + + blk->retry_timeout); +} + +void blk_error_retry_reset_timeout(BlockBackend *blk) +{ + if (blk->retry_timer && blk->retry_start_time) + blk->retry_start_time = 0; +} + void blk_set_on_error(BlockBackend *blk, BlockdevOnError on_read_error, BlockdevOnError on_write_error) { @@ -2150,6 +2209,9 @@ BlockErrorAction blk_get_error_action(BlockBackend *blk, bool is_read, return BLOCK_ERROR_ACTION_REPORT; case BLOCKDEV_ON_ERROR_IGNORE: return BLOCK_ERROR_ACTION_IGNORE; + case BLOCKDEV_ON_ERROR_RETRY: + return (blk->retry_timer && !blk_error_retry_timeout(blk)) ? 
+ BLOCK_ERROR_ACTION_RETRY : BLOCK_ERROR_ACTION_REPORT; case BLOCKDEV_ON_ERROR_AUTO: default: abort(); @@ -2198,6 +2260,12 @@ void blk_error_action(BlockBackend *blk, BlockErrorAction action, qemu_system_vmstop_request_prepare(); send_qmp_error_event(blk, action, is_read, error); qemu_system_vmstop_request(RUN_STATE_IO_ERROR); + } else if (action == BLOCK_ERROR_ACTION_RETRY) { + if (!blk->quiesce_counter) { + timer_mod(blk->retry_timer, qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + + blk->retry_interval); + send_qmp_error_event(blk, action, is_read, error); + } } else { send_qmp_error_event(blk, action, is_read, error); } diff --git a/block/file-posix.c b/block/file-posix.c index b862406c719307d448908db87905cd4eef50ee34..787f613d5234c2a8a2f6569fc7bb98c64a42fa37 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -128,6 +128,10 @@ #define FTYPE_CD 1 #define MAX_BLOCKSIZE 4096 +#define DEFAULT_BUFFER_SIZE 65536 +#define BUFFER_ALIGN_SIZE 65536 +#define MIN_BUFFER_SIZE 65536 +#define MAX_BUFFER_SIZE 16777216 /* Posix file locking bytes. Libvirt takes byte 0, we start from higher bytes, * leaving a few more bytes for its future use. */ @@ -158,6 +162,7 @@ typedef struct BDRVRawState { bool has_discard:1; bool has_write_zeroes:1; + bool discard_zeroes:1; bool use_linux_aio:1; bool use_linux_io_uring:1; int page_cache_inconsistent; /* errno from fdatasync failure */ @@ -202,6 +207,8 @@ typedef struct RawPosixAIOData { off_t aio_offset; uint64_t aio_nbytes; + size_t buffer_size; + union { struct { struct iovec *iov; @@ -765,6 +772,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, ret = -EINVAL; goto fail; } else { + s->discard_zeroes = true; s->has_fallocate = true; } } else { @@ -790,12 +798,19 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, #endif if (S_ISBLK(st.st_mode)) { +#ifdef BLKDISCARDZEROES + unsigned int arg; + if (ioctl(s->fd, BLKDISCARDZEROES, &arg) == 0 && arg) { + s->discard_zeroes = true; + } +#endif #ifdef __linux__ /* On Linux 3.10, BLKDISCARD leaves stale data in the page cache. Do * not rely on the contents of discarded blocks unless using O_DIRECT. * Same for BLKZEROOUT. */ if (!(bs->open_flags & BDRV_O_NOCACHE)) { + s->discard_zeroes = false; s->has_write_zeroes = false; } #endif @@ -813,7 +828,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, #endif s->needs_alignment = raw_needs_alignment(bs); - bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK; + bs->supported_zero_flags = s->discard_zeroes ? 
(BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) : 0; if (S_ISREG(st.st_mode)) { /* When extending regular files, we get zeros from the OS */ bs->supported_truncate_flags = BDRV_REQ_ZERO_WRITE; @@ -1408,7 +1423,7 @@ static void raw_refresh_zoned_limits(BlockDriverState *bs, struct stat *st, Error **errp) { BDRVRawState *s = bs->opaque; - BlockZoneModel zoned; + BlockZoneModel zoned = BLK_Z_NONE; int ret; ret = get_sysfs_zoned_model(st, &zoned); @@ -2621,7 +2636,8 @@ static void raw_close(BlockDriverState *bs) */ static int coroutine_fn raw_regular_truncate(BlockDriverState *bs, int fd, int64_t offset, - PreallocMode prealloc, Error **errp) + PreallocMode prealloc, size_t buffer_size, + Error **errp) { RawPosixAIOData acb; @@ -2630,6 +2646,7 @@ raw_regular_truncate(BlockDriverState *bs, int fd, int64_t offset, .aio_fildes = fd, .aio_type = QEMU_AIO_TRUNCATE, .aio_offset = offset, + .buffer_size = buffer_size, .truncate = { .prealloc = prealloc, .errp = errp, @@ -2655,7 +2672,8 @@ static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset, if (S_ISREG(st.st_mode)) { /* Always resizes to the exact @offset */ - return raw_regular_truncate(bs, s->fd, offset, prealloc, errp); + return raw_regular_truncate(bs, s->fd, offset, prealloc, + DEFAULT_BUFFER_SIZE, errp); } if (prealloc != PREALLOC_MODE_OFF) { @@ -2873,6 +2891,8 @@ raw_co_create(BlockdevCreateOptions *options, Error **errp) int fd; uint64_t perm, shared; int result = 0; + int flags = O_RDWR | O_BINARY; + size_t buffer_size = DEFAULT_BUFFER_SIZE; /* Validate options and set default values */ assert(options->driver == BLOCKDEV_DRIVER_FILE); @@ -2892,9 +2912,19 @@ raw_co_create(BlockdevCreateOptions *options, Error **errp) error_setg(errp, "Extent size hint is too large"); goto out; } + if (!file_opts->cache) { + file_opts->cache = g_strdup("writeback"); + } + if (file_opts->preallocation == PREALLOC_MODE_FULL && + !strcmp(file_opts->cache, "none")) { + flags |= O_DIRECT; + } + if (file_opts->has_buffersize) { + buffer_size = file_opts->buffersize; + } /* Create file */ - fd = qemu_create(file_opts->filename, O_RDWR | O_BINARY, 0644, errp); + fd = qemu_create(file_opts->filename, flags, 0644, errp); if (fd < 0) { result = -errno; goto out; @@ -2929,7 +2959,8 @@ raw_co_create(BlockdevCreateOptions *options, Error **errp) } /* Clear the file by truncating it to 0 */ - result = raw_regular_truncate(NULL, fd, 0, PREALLOC_MODE_OFF, errp); + result = raw_regular_truncate(NULL, fd, 0, PREALLOC_MODE_OFF, + buffer_size, errp); if (result < 0) { goto out_unlock; } @@ -2973,7 +3004,8 @@ raw_co_create(BlockdevCreateOptions *options, Error **errp) /* Resize and potentially preallocate the file to the desired * final size */ result = raw_regular_truncate(NULL, fd, file_opts->size, - file_opts->preallocation, errp); + file_opts->preallocation, + buffer_size, errp); if (result < 0) { goto out_unlock; } @@ -2994,6 +3026,8 @@ out_close: error_setg_errno(errp, -result, "Could not close the new file"); } out: + g_free(file_opts->cache); + file_opts->cache = NULL; return result; } @@ -3009,6 +3043,8 @@ raw_co_create_opts(BlockDriver *drv, const char *filename, PreallocMode prealloc; char *buf = NULL; Error *local_err = NULL; + size_t buffersize = DEFAULT_BUFFER_SIZE; + char *cache = NULL; /* Skip file: protocol prefix */ strstart(filename, "file:", &filename); @@ -3031,6 +3067,21 @@ raw_co_create_opts(BlockDriver *drv, const char *filename, return -EINVAL; } + buffersize = qemu_opt_get_size_del(opts, BLOCK_OPT_BUFFER_SIZE, + DEFAULT_BUFFER_SIZE); + 
if (buffersize < MIN_BUFFER_SIZE || buffersize > MAX_BUFFER_SIZE) { + error_setg_errno(errp, EINVAL, "Buffer size must be between %d " + "and %d", MIN_BUFFER_SIZE, MAX_BUFFER_SIZE); + return -EINVAL; + } + + cache = qemu_opt_get_del(opts, BLOCK_OPT_CACHE); + if (!cache) { + cache = g_strdup("writeback"); + } + + buffersize = ROUND_UP(buffersize, BUFFER_ALIGN_SIZE); + options = (BlockdevCreateOptions) { .driver = BLOCKDEV_DRIVER_FILE, .u.file = { @@ -3042,6 +3093,9 @@ raw_co_create_opts(BlockDriver *drv, const char *filename, .nocow = nocow, .has_extent_size_hint = has_extent_size_hint, .extent_size_hint = extent_size_hint, + .has_buffersize = true, + .buffersize = buffersize, + .cache = cache, }, }; return raw_co_create(&options, errp); @@ -3732,6 +3786,16 @@ static QemuOptsList raw_create_opts = { .type = QEMU_OPT_SIZE, .help = "Extent size hint for the image file, 0 to disable" }, + { + .name = BLOCK_OPT_CACHE, + .type = QEMU_OPT_STRING, + .help = "Cache mode (allowed values: writeback, none)" + }, + { + .name = BLOCK_OPT_BUFFER_SIZE, + .type = QEMU_OPT_SIZE, + .help = "write buffer size" + }, { /* end of list */ } } }; diff --git a/block/io.c b/block/io.c index 7e62fabbf57cc9865492c62b0181ad9e8396769e..27d6a1a04bcc3a75cb3cd423f0375883f6305771 100644 --- a/block/io.c +++ b/block/io.c @@ -1756,22 +1756,29 @@ static int bdrv_pad_request(BlockDriverState *bs, return 0; } - sliced_iov = qemu_iovec_slice(*qiov, *qiov_offset, *bytes, - &sliced_head, &sliced_tail, - &sliced_niov); - - /* Guaranteed by bdrv_check_request32() */ - assert(*bytes <= SIZE_MAX); - ret = bdrv_create_padded_qiov(bs, pad, sliced_iov, sliced_niov, - sliced_head, *bytes); - if (ret < 0) { - bdrv_padding_finalize(pad); - return ret; + /* + * For prefetching in stream_populate(), no qiov is passed along, because + * only copy-on-read matters. + */ + if (qiov && *qiov) { + sliced_iov = qemu_iovec_slice(*qiov, *qiov_offset, *bytes, + &sliced_head, &sliced_tail, + &sliced_niov); + + /* Guaranteed by bdrv_check_request32() */ + assert(*bytes <= SIZE_MAX); + ret = bdrv_create_padded_qiov(bs, pad, sliced_iov, sliced_niov, + sliced_head, *bytes); + if (ret < 0) { + bdrv_padding_finalize(pad); + return ret; + } + *qiov = &pad->local_qiov; + *qiov_offset = 0; } + *bytes += pad->head + pad->tail; *offset -= pad->head; - *qiov = &pad->local_qiov; - *qiov_offset = 0; if (padded) { *padded = true; } @@ -1885,6 +1892,11 @@ bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int64_t bytes, return -EINVAL; } + /* If opened with discard=off we should never unmap. */ + if (!(bs->open_flags & BDRV_O_UNMAP)) { + flags &= ~BDRV_REQ_MAY_UNMAP; + } + /* Invalidate the cached block-status data range if this write overlaps */ bdrv_bsc_invalidate_range(bs, offset, bytes); @@ -2338,10 +2350,6 @@ int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset, trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags); assert_bdrv_graph_readable(); - if (!(child->bs->open_flags & BDRV_O_UNMAP)) { - flags &= ~BDRV_REQ_MAY_UNMAP; - } - return bdrv_co_pwritev(child, offset, bytes, NULL, BDRV_REQ_ZERO_WRITE | flags); } diff --git a/block/mirror.c b/block/mirror.c index cd9d3ad4a8065ac18528f1ac07a25093d7b496cb..20b3e8e5d825a1410b72834bbd312ae3af337263 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -1774,7 +1774,7 @@ static BlockJob *mirror_start_job( * reads on the top, while disabling it in the intermediate nodes, and make * the backing chain writable. 
*/ mirror_top_bs = bdrv_new_open_driver(&bdrv_mirror_top, filter_node_name, - BDRV_O_RDWR, errp); + BDRV_O_RDWR | BDRV_O_NOCACHE, errp); if (mirror_top_bs == NULL) { return NULL; } diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c index c729cbf1eb8ab228f35ae6b6e87f2a6809fdab3b..78a6975852d01e4131f33e7930afe908107ef16a 100644 --- a/block/monitor/block-hmp-cmds.c +++ b/block/monitor/block-hmp-cmds.c @@ -415,7 +415,8 @@ void hmp_nbd_server_start(Monitor *mon, const QDict *qdict) goto exit; } - nbd_server_start(addr, NULL, NULL, 0, &local_err); + nbd_server_start(addr, NULL, NULL, NBD_DEFAULT_MAX_CONNECTIONS, + &local_err); qapi_free_SocketAddress(addr); if (local_err != NULL) { goto exit; diff --git a/block/parallels.c b/block/parallels.c index 9205a0864fa6ba96319eb58b3da54d11e99dcadb..8f2b58e1c9978c71a4b185d4757d636b42593cb1 100644 --- a/block/parallels.c +++ b/block/parallels.c @@ -1298,6 +1298,10 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags, error_setg(errp, "Catalog too large"); return -EFBIG; } + if (le64_to_cpu(ph.ext_off) >= (INT64_MAX >> BDRV_SECTOR_BITS)) { + error_setg(errp, "Invalid image: Too big offset"); + return -EFBIG; + } size = bat_entry_off(s->bat_size); s->header_size = ROUND_UP(size, bdrv_opt_mem_align(bs->file->bs)); diff --git a/block/qcow2.c b/block/qcow2.c index 13e032bd5e2e07565bfd29cfd481560437bcd609..7af7c0bee4cfd93987d02b6b3cb3294c86a0b1b1 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -1636,7 +1636,22 @@ qcow2_do_open(BlockDriverState *bs, QDict *options, int flags, goto fail; } - if (open_data_file) { + if (open_data_file && (flags & BDRV_O_NO_IO)) { + /* + * Don't open the data file for 'qemu-img info' so that it can be used + * to verify that an untrusted qcow2 image doesn't refer to external + * files. + * + * Note: This still makes has_data_file() return true. 
+ */ + if (s->incompatible_features & QCOW2_INCOMPAT_DATA_FILE) { + s->data_file = NULL; + } else { + s->data_file = bs->file; + } + qdict_extract_subqdict(options, NULL, "data-file."); + qdict_del(options, "data-file"); + } else if (open_data_file) { /* Open external data file */ bdrv_graph_co_rdunlock(); s->data_file = bdrv_co_open_child(NULL, options, "data-file", bs, diff --git a/block/raw-format.c b/block/raw-format.c index 1111dffd54f7da49bffa43d6177208b96e929aba..8195ed87cc76bd03a0e633ca797105db25f9c485 100644 --- a/block/raw-format.c +++ b/block/raw-format.c @@ -111,7 +111,7 @@ raw_apply_options(BlockDriverState *bs, BDRVRawState *s, uint64_t offset, if (offset > real_size) { error_setg(errp, "Offset (%" PRIu64 ") cannot be greater than " "size of the containing file (%" PRId64 ")", - s->offset, real_size); + offset, real_size); return -EINVAL; } @@ -119,7 +119,7 @@ raw_apply_options(BlockDriverState *bs, BDRVRawState *s, uint64_t offset, error_setg(errp, "The sum of offset (%" PRIu64 ") and size " "(%" PRIu64 ") has to be smaller or equal to the " " actual size of the containing file (%" PRId64 ")", - s->offset, s->size, real_size); + offset, size, real_size); return -EINVAL; } diff --git a/block/vvfat.c b/block/vvfat.c index 9d050ba3aea08eeac3d284e7d1e41545ff5b9f9f..9010f3f33f7dfe3b29610d9b26e6a2b488a12213 100644 --- a/block/vvfat.c +++ b/block/vvfat.c @@ -2525,7 +2525,7 @@ commit_one_file(BDRVVVFATState* s, int dir_index, uint32_t offset) return -1; } - for (i = s->cluster_size; i < offset; i += s->cluster_size) + for (i = 0; i < offset; i += s->cluster_size) c = modified_fat_get(s, c); fd = qemu_open_old(mapping->path, O_RDWR | O_CREAT | O_BINARY, 0666); diff --git a/blockdev-nbd.c b/blockdev-nbd.c index 213012435f4616f7c84bc436c10dab158994a368..b36f41b7c5ae731957fd878204db4c16afe7dce5 100644 --- a/blockdev-nbd.c +++ b/blockdev-nbd.c @@ -21,12 +21,18 @@ #include "io/channel-socket.h" #include "io/net-listener.h" +typedef struct NBDConn { + QIOChannelSocket *cioc; + QLIST_ENTRY(NBDConn) next; +} NBDConn; + typedef struct NBDServerData { QIONetListener *listener; QCryptoTLSCreds *tlscreds; char *tlsauthz; uint32_t max_connections; uint32_t connections; + QLIST_HEAD(, NBDConn) conns; } NBDServerData; static NBDServerData *nbd_server; @@ -51,6 +57,14 @@ int nbd_server_max_connections(void) static void nbd_blockdev_client_closed(NBDClient *client, bool ignored) { + NBDConn *conn = nbd_client_owner(client); + + assert(qemu_in_main_thread() && nbd_server); + + object_unref(OBJECT(conn->cioc)); + QLIST_REMOVE(conn, next); + g_free(conn); + nbd_client_put(client); assert(nbd_server->connections > 0); nbd_server->connections--; @@ -60,31 +74,56 @@ static void nbd_blockdev_client_closed(NBDClient *client, bool ignored) static void nbd_accept(QIONetListener *listener, QIOChannelSocket *cioc, gpointer opaque) { + NBDConn *conn = g_new0(NBDConn, 1); + + assert(qemu_in_main_thread() && nbd_server); nbd_server->connections++; + object_ref(OBJECT(cioc)); + conn->cioc = cioc; + QLIST_INSERT_HEAD(&nbd_server->conns, conn, next); nbd_update_server_watch(nbd_server); qio_channel_set_name(QIO_CHANNEL(cioc), "nbd-server"); - nbd_client_new(cioc, nbd_server->tlscreds, nbd_server->tlsauthz, - nbd_blockdev_client_closed); + /* TODO - expose handshake timeout as QMP option */ + nbd_client_new(cioc, NBD_DEFAULT_HANDSHAKE_MAX_SECS, + nbd_server->tlscreds, nbd_server->tlsauthz, + nbd_blockdev_client_closed, conn); } static void nbd_update_server_watch(NBDServerData *s) { - if (!s->max_connections || 
s->connections < s->max_connections) { - qio_net_listener_set_client_func(s->listener, nbd_accept, NULL, NULL); - } else { - qio_net_listener_set_client_func(s->listener, NULL, NULL, NULL); + if (s->listener) { + if (!s->max_connections || s->connections < s->max_connections) { + qio_net_listener_set_client_func(s->listener, nbd_accept, NULL, + NULL); + } else { + qio_net_listener_set_client_func(s->listener, NULL, NULL, NULL); + } } } static void nbd_server_free(NBDServerData *server) { + NBDConn *conn, *tmp; + if (!server) { return; } + /* + * Forcefully close the listener socket, and any clients that have + * not yet disconnected on their own. + */ qio_net_listener_disconnect(server->listener); object_unref(OBJECT(server->listener)); + server->listener = NULL; + QLIST_FOREACH_SAFE(conn, &server->conns, next, tmp) { + qio_channel_shutdown(QIO_CHANNEL(conn->cioc), QIO_CHANNEL_SHUTDOWN_BOTH, + NULL); + } + + AIO_WAIT_WHILE_UNLOCKED(NULL, server->connections > 0); + if (server->tlscreds) { object_unref(OBJECT(server->tlscreds)); } @@ -168,6 +207,10 @@ void nbd_server_start(SocketAddress *addr, const char *tls_creds, void nbd_server_start_options(NbdServerOptions *arg, Error **errp) { + if (!arg->has_max_connections) { + arg->max_connections = NBD_DEFAULT_MAX_CONNECTIONS; + } + nbd_server_start(arg->addr, arg->tls_creds, arg->tls_authz, arg->max_connections, errp); } @@ -180,6 +223,10 @@ void qmp_nbd_server_start(SocketAddressLegacy *addr, { SocketAddress *addr_flat = socket_address_flatten(addr); + if (!has_max_connections) { + max_connections = NBD_DEFAULT_MAX_CONNECTIONS; + } + nbd_server_start(addr_flat, tls_creds, tls_authz, max_connections, errp); qapi_free_SocketAddress(addr_flat); } diff --git a/blockdev.c b/blockdev.c index c91f49e7b62c04a3bd8155826294692a4ebfa5fc..d2fe5c361c6302f57eb67459f5d21c68743bc3fb 100644 --- a/blockdev.c +++ b/blockdev.c @@ -326,6 +326,8 @@ static int parse_block_error_action(const char *buf, bool is_read, Error **errp) return BLOCKDEV_ON_ERROR_STOP; } else if (!strcmp(buf, "report")) { return BLOCKDEV_ON_ERROR_REPORT; + } else if (!strcmp(buf, "retry")) { + return BLOCKDEV_ON_ERROR_RETRY; } else { error_setg(errp, "'%s' invalid %s error action", buf, is_read ? 
"read" : "write"); @@ -482,6 +484,7 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts, const char *buf; int bdrv_flags = 0; int on_read_error, on_write_error; + int64_t retry_interval, retry_timeout; OnOffAuto account_invalid, account_failed; bool writethrough, read_only; BlockBackend *blk; @@ -493,6 +496,7 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts, QDict *interval_dict = NULL; QList *interval_list = NULL; const char *id; + const char *cache; BlockdevDetectZeroesOptions detect_zeroes = BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF; const char *throttling_group = NULL; @@ -556,7 +560,7 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts, qdict_put_str(bs_opts, "driver", buf); } - on_write_error = BLOCKDEV_ON_ERROR_ENOSPC; + on_write_error = BLOCKDEV_ON_ERROR_REPORT; if ((buf = qemu_opt_get(opts, "werror")) != NULL) { on_write_error = parse_block_error_action(buf, 0, &error); if (error) { @@ -574,12 +578,31 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts, } } + retry_interval = qemu_opt_get_number(opts, "retry_interval", + BLOCK_BACKEND_DEFAULT_RETRY_INTERVAL); + retry_timeout = qemu_opt_get_number(opts, "retry_timeout", 0); + if (snapshot) { bdrv_flags |= BDRV_O_SNAPSHOT; } read_only = qemu_opt_get_bool(opts, BDRV_OPT_READ_ONLY, false); + if ((!file || !*file) && qdict_size(bs_opts) == 2) { + cache = qdict_get_try_str(bs_opts, BDRV_OPT_CACHE_NO_FLUSH); + if (cache && !strcmp(cache, "on")) { + bdrv_flags |= BDRV_O_NO_FLUSH; + } + + cache = qdict_get_try_str(bs_opts, BDRV_OPT_CACHE_DIRECT); + if (cache && !strcmp(cache, "on")) { + bdrv_flags |= BDRV_O_NOCACHE; + } + + qdict_del(bs_opts, BDRV_OPT_CACHE_NO_FLUSH); + qdict_del(bs_opts, BDRV_OPT_CACHE_DIRECT); + } + /* init */ if ((!file || !*file) && !qdict_size(bs_opts)) { BlockBackendRootState *blk_rs; @@ -637,6 +660,11 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts, blk_set_enable_write_cache(blk, !writethrough); blk_set_on_error(blk, on_read_error, on_write_error); + if (on_read_error == BLOCKDEV_ON_ERROR_RETRY || + on_write_error == BLOCKDEV_ON_ERROR_RETRY) { + blk_set_on_error_retry_interval(blk, retry_interval); + blk_set_on_error_retry_timeout(blk, retry_timeout); + } if (!monitor_add_blk(blk, id, errp)) { blk_unref(blk); @@ -771,6 +799,14 @@ QemuOptsList qemu_legacy_drive_opts = { .name = "werror", .type = QEMU_OPT_STRING, .help = "write error action", + },{ + .name = "retry_interval", + .type = QEMU_OPT_NUMBER, + .help = "interval for retry action in millisecond", + },{ + .name = "retry_timeout", + .type = QEMU_OPT_NUMBER, + .help = "timeout for retry action in millisecond", },{ .name = "copy-on-read", .type = QEMU_OPT_BOOL, @@ -793,6 +829,7 @@ DriveInfo *drive_new(QemuOpts *all_opts, BlockInterfaceType block_default_type, BlockInterfaceType type; int max_devs, bus_id, unit_id, index; const char *werror, *rerror; + int64_t retry_interval, retry_timeout; bool read_only = false; bool copy_on_read; const char *filename; @@ -1011,6 +1048,29 @@ DriveInfo *drive_new(QemuOpts *all_opts, BlockInterfaceType block_default_type, qdict_put_str(bs_opts, "rerror", rerror); } + if (qemu_opt_find(legacy_opts, "retry_interval")) { + if ((werror == NULL || strcmp(werror, "retry")) && + (rerror == NULL || strcmp(rerror, "retry"))) { + error_setg(errp, "retry_interval is only supported " + "by werror/rerror=retry"); + goto fail; + } + retry_interval = qemu_opt_get_number(legacy_opts, "retry_interval", + BLOCK_BACKEND_DEFAULT_RETRY_INTERVAL); + 
qdict_put_int(bs_opts, "retry_interval", retry_interval); + } + + if (qemu_opt_find(legacy_opts, "retry_timeout")) { + if ((werror == NULL || strcmp(werror, "retry")) && + (rerror == NULL || strcmp(rerror, "retry"))) { + error_setg(errp, "retry_timeout is only supported " + "by werror/rerror=retry"); + goto fail; + } + retry_timeout = qemu_opt_get_number(legacy_opts, "retry_timeout", 0); + qdict_put_int(bs_opts, "retry_timeout", retry_timeout); + } + /* Actual block device init: Functionality shared with blockdev-add */ blk = blockdev_init(filename, bs_opts, errp); bs_opts = NULL; @@ -3792,6 +3852,14 @@ QemuOptsList qemu_common_drive_opts = { .name = "werror", .type = QEMU_OPT_STRING, .help = "write error action", + },{ + .name = "retry_interval", + .type = QEMU_OPT_NUMBER, + .help = "interval for retry action in milliseconds", + },{ + .name = "retry_timeout", + .type = QEMU_OPT_NUMBER, + .help = "timeout for retry action in milliseconds", },{ .name = BDRV_OPT_READ_ONLY, .type = QEMU_OPT_BOOL, diff --git a/chardev/char-socket.c b/chardev/char-socket.c index 73947da188d68d78fdb3ca78f4045db3519aab7a..0c9ab069aee3a88b7c7dbc353b60f32140c1aa43 100644 --- a/chardev/char-socket.c +++ b/chardev/char-socket.c @@ -337,6 +337,28 @@ static GSource *tcp_chr_add_watch(Chardev *chr, GIOCondition cond) return qio_channel_create_watch(s->ioc, cond); } +static void tcp_chr_set_reconnect_time(Chardev *chr, + int64_t reconnect_time) +{ + SocketChardev *s = SOCKET_CHARDEV(chr); + s->reconnect_time = reconnect_time; +} + +void qemu_chr_set_reconnect_time(Chardev *chr, int64_t reconnect_time) +{ + ChardevClass *cc = CHARDEV_GET_CLASS(chr); + SocketChardev *s = SOCKET_CHARDEV(chr); + + /* if the socket chardev is in listen mode, don't set the reconnect time */ + if (s->is_listen) { + return; + } + + if (cc->chr_set_reconnect_time) { + cc->chr_set_reconnect_time(chr, reconnect_time); + } +} + static void remove_hup_source(SocketChardev *s) { if (s->hup_source != NULL) { @@ -492,9 +514,9 @@ static gboolean tcp_chr_read(QIOChannel *chan, GIOCondition cond, void *opaque) s->max_size <= 0) { return TRUE; } - len = sizeof(buf); - if (len > s->max_size) { - len = s->max_size; + len = tcp_chr_read_poll(opaque); + if (len > sizeof(buf)) { + len = sizeof(buf); } size = tcp_chr_recv(chr, (void *)buf, len); if (size == 0 || (size == -1 && errno != EAGAIN)) { @@ -537,7 +559,7 @@ static int tcp_chr_sync_read(Chardev *chr, const uint8_t *buf, int len) if (s->state != TCP_CHARDEV_STATE_DISCONNECTED) { qio_channel_set_blocking(s->ioc, false, NULL); } - if (size == 0) { + if (size == 0 && chr->chr_for_flag != CHR_FOR_VHOST_USER) { /* connection closed */ tcp_chr_disconnect(chr); } @@ -1543,6 +1565,7 @@ static void char_socket_class_init(ObjectClass *oc, void *data) cc->set_msgfds = tcp_set_msgfds; cc->chr_add_client = tcp_chr_add_client; cc->chr_add_watch = tcp_chr_add_watch; + cc->chr_set_reconnect_time = tcp_chr_set_reconnect_time; cc->chr_update_read_handler = tcp_chr_update_read_handler; object_class_property_add(oc, "addr", "SocketAddress", diff --git a/chardev/char-stdio.c b/chardev/char-stdio.c index 3c648678ab14c7bbbd802a0400e3d5625f81207b..b960ddd4e4ca4da5dd6e4d2b2554dc83c532dd6f 100644 --- a/chardev/char-stdio.c +++ b/chardev/char-stdio.c @@ -41,6 +41,7 @@ /* init terminal so that we can grab keys */ static struct termios oldtty; static int old_fd0_flags; +static int old_fd1_flags; static bool stdio_in_use; static bool stdio_allow_signal; static bool stdio_echo_state; @@ -50,6 +51,8 @@ static void term_exit(void) { if (stdio_in_use) { tcsetattr(0,
TCSANOW, &oldtty); fcntl(0, F_SETFL, old_fd0_flags); + fcntl(1, F_SETFL, old_fd1_flags); + stdio_in_use = false; } } @@ -102,6 +105,7 @@ static void qemu_chr_open_stdio(Chardev *chr, stdio_in_use = true; old_fd0_flags = fcntl(0, F_GETFL); + old_fd1_flags = fcntl(1, F_GETFL); tcgetattr(0, &oldtty); if (!g_unix_set_fd_nonblocking(0, true, NULL)) { error_setg_errno(errp, errno, "Failed to set FD nonblocking"); diff --git a/configs/devices/aarch64-softmmu/default.mak b/configs/devices/aarch64-softmmu/default.mak index f82a04c27d1ba66a1199d0d249e48b69828f8bb6..8d66d0f1af7ce1b857e051b19227b508b4a20f16 100644 --- a/configs/devices/aarch64-softmmu/default.mak +++ b/configs/devices/aarch64-softmmu/default.mak @@ -8,3 +8,4 @@ include ../arm-softmmu/default.mak # CONFIG_XLNX_ZYNQMP_ARM=n # CONFIG_XLNX_VERSAL=n # CONFIG_SBSA_REF=n +# CONFIG_CPUFREQ=n diff --git a/configs/devices/i386-softmmu/default.mak b/configs/devices/i386-softmmu/default.mak index 598c6646dfc0f941385b4d07939739bf9ae53e5c..e948e54e4e902a26f257b9a89ccb66bc77703e21 100644 --- a/configs/devices/i386-softmmu/default.mak +++ b/configs/devices/i386-softmmu/default.mak @@ -23,6 +23,8 @@ #CONFIG_TPM_TIS_ISA=n #CONFIG_VTD=n #CONFIG_SGX=n +#CONFIG_CSV=n +#CONFIG_HYGON_CSV_MIG_ACCEL=n # Boards: # diff --git a/configs/devices/loongarch64-softmmu/default.mak b/configs/devices/loongarch64-softmmu/default.mak index 928bc117ef781a5f64512b972b459751d08717ce..ffe705836fde47b47f82db9baa8d1b8073eec4df 100644 --- a/configs/devices/loongarch64-softmmu/default.mak +++ b/configs/devices/loongarch64-softmmu/default.mak @@ -1,3 +1,7 @@ # Default configuration for loongarch64-softmmu -CONFIG_LOONGARCH_VIRT=y +# Uncomment the following lines to disable these optional devices: +# CONFIG_PCI_DEVICES=n + +# Boards are selected by default, uncomment to keep out of the build. +# CONFIG_LOONGARCH_VIRT=n diff --git a/configs/targets/loongarch64-softmmu.mak b/configs/targets/loongarch64-softmmu.mak index f23780fdd89945b82078b1680273bfa93d9f2ff1..0034c336200fb0629775e713982da70309464a54 100644 --- a/configs/targets/loongarch64-softmmu.mak +++ b/configs/targets/loongarch64-softmmu.mak @@ -1,5 +1,6 @@ TARGET_ARCH=loongarch64 TARGET_BASE_ARCH=loongarch +TARGET_KVM_HAVE_GUEST_DEBUG=y TARGET_SUPPORTS_MTTCG=y TARGET_XML_FILES= gdb-xml/loongarch-base32.xml gdb-xml/loongarch-base64.xml gdb-xml/loongarch-fpu.xml TARGET_NEED_FDT=y diff --git a/configure b/configure index bdda912f3626000ed0006276a36919bcf30821c1..6036de83a4abf7516e65c36d71c47ebd93779977 100755 --- a/configure +++ b/configure @@ -445,6 +445,7 @@ case "$cpu" in loongarch*) cpu=loongarch64 host_arch=loongarch64 + linux_arch=loongarch ;; mips64*) diff --git a/contrib/plugins/lockstep.c b/contrib/plugins/lockstep.c index 237543b43a76a1e657ddaef150314fc295366764..0c6f060183ac8cdb5d50db4488ea3cebd17ae9f7 100644 --- a/contrib/plugins/lockstep.c +++ b/contrib/plugins/lockstep.c @@ -100,6 +100,31 @@ static void plugin_exit(qemu_plugin_id_t id, void *p) plugin_cleanup(id); } +/* + * g_memdup has been deprecated in Glib since 2.68 and + * will complain about it if you try to use it. However until + * glib_req_ver for QEMU is bumped we make a copy of the glib-compat + * handler. 
+ */ +static inline gpointer g_memdup2_qemu(gconstpointer mem, gsize byte_size) +{ +#if GLIB_CHECK_VERSION(2, 68, 0) + return g_memdup2(mem, byte_size); +#else + gpointer new_mem; + + if (mem && byte_size != 0) { + new_mem = g_malloc(byte_size); + memcpy(new_mem, mem, byte_size); + } else { + new_mem = NULL; + } + + return new_mem; +#endif +} +#define g_memdup2(m, s) g_memdup2_qemu(m, s) + static void report_divergance(ExecState *us, ExecState *them) { DivergeState divrec = { log, 0 }; diff --git a/contrib/vhost-user-gpu/virgl.c b/contrib/vhost-user-gpu/virgl.c index d1ccdf7d066853a4bd60be0f9b7a4730c7ca5c1c..51da0e3667f9ef8ffa280f02f8a1c8f4c8c84354 100644 --- a/contrib/vhost-user-gpu/virgl.c +++ b/contrib/vhost-user-gpu/virgl.c @@ -327,7 +327,7 @@ virgl_get_resource_info_modifiers(uint32_t resource_id, #ifdef VIRGL_RENDERER_RESOURCE_INFO_EXT_VERSION struct virgl_renderer_resource_info_ext info_ext; ret = virgl_renderer_resource_get_info_ext(resource_id, &info_ext); - if (ret < 0) { + if (ret) { return ret; } @@ -335,7 +335,7 @@ virgl_get_resource_info_modifiers(uint32_t resource_id, *modifiers = info_ext.modifiers; #else ret = virgl_renderer_resource_get_info(resource_id, info); - if (ret < 0) { + if (ret) { return ret; } @@ -372,7 +372,7 @@ virgl_cmd_set_scanout(VuGpu *g, uint64_t modifiers = 0; ret = virgl_get_resource_info_modifiers(ss.resource_id, &info, &modifiers); - if (ret == -1) { + if (ret) { g_critical("%s: illegal resource specified %d\n", __func__, ss.resource_id); cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID; diff --git a/cpu-common.c b/cpu-common.c index c81fd72d16d5c9e5f6f8af2d10b6f8f20e9cd3a6..a949ad7ca3cf5a526efed41c089d0646a5b10faa 100644 --- a/cpu-common.c +++ b/cpu-common.c @@ -24,6 +24,7 @@ #include "sysemu/cpus.h" #include "qemu/lockable.h" #include "trace/trace-root.h" +#include "hw/boards.h" QemuMutex qemu_cpu_list_lock; static QemuCond exclusive_cond; @@ -107,6 +108,46 @@ void cpu_list_remove(CPUState *cpu) cpu_list_generation_id++; } +CPUState *qemu_get_possible_cpu(int index) +{ + MachineState *ms = MACHINE(qdev_get_machine()); + const CPUArchIdList *possible_cpus = ms->possible_cpus; + + if (possible_cpus == NULL) { + return qemu_get_cpu(index); + } + + assert((index >= 0) && (index < possible_cpus->len)); + + return CPU(possible_cpus->cpus[index].cpu); +} + +bool qemu_present_cpu(CPUState *cpu) +{ + return cpu; +} + +bool qemu_enabled_cpu(CPUState *cpu) +{ + return cpu && !cpu->disabled; +} + +bool qemu_persistent_cpu(CPUState *cpu) +{ + /* cpu state can be faked to the guest via acpi */ + return cpu->acpi_persistent; +} + +uint64_t qemu_get_cpu_archid(int cpu_index) +{ + MachineState *ms = MACHINE(qdev_get_machine()); + const CPUArchIdList *possible_cpus = ms->possible_cpus; + + assert((cpu_index >= 0) && (cpu_index < possible_cpus->len)); + + return possible_cpus->cpus[cpu_index].arch_id; +} + CPUState *qemu_get_cpu(int index) { CPUState *cpu; @@ -193,6 +234,9 @@ void start_exclusive(void) CPUState *other_cpu; int running_cpus; + /* Ensure we are not running, or start_exclusive will be blocked. 
*/ + g_assert(!current_cpu->running); + if (current_cpu->exclusive_context_count) { current_cpu->exclusive_context_count++; return; diff --git a/crypto/block-luks.c b/crypto/block-luks.c index fb01ec38bbf00e29cadae5deeee1e4304d9a1e85..f0813d69b4d13b584d48d2565d85136f525da7bb 100644 --- a/crypto/block-luks.c +++ b/crypto/block-luks.c @@ -95,12 +95,23 @@ qcrypto_block_luks_cipher_size_map_twofish[] = { { 0, 0 }, }; +#ifdef CONFIG_CRYPTO_SM4 +static const QCryptoBlockLUKSCipherSizeMap +qcrypto_block_luks_cipher_size_map_sm4[] = { + { 16, QCRYPTO_CIPHER_ALG_SM4}, + { 0, 0 }, +}; +#endif + static const QCryptoBlockLUKSCipherNameMap qcrypto_block_luks_cipher_name_map[] = { { "aes", qcrypto_block_luks_cipher_size_map_aes }, { "cast5", qcrypto_block_luks_cipher_size_map_cast5 }, { "serpent", qcrypto_block_luks_cipher_size_map_serpent }, { "twofish", qcrypto_block_luks_cipher_size_map_twofish }, +#ifdef CONFIG_CRYPTO_SM4 + { "sm4", qcrypto_block_luks_cipher_size_map_sm4}, +#endif }; QEMU_BUILD_BUG_ON(sizeof(struct QCryptoBlockLUKSKeySlot) != 48); diff --git a/crypto/cipher-gcrypt.c.inc b/crypto/cipher-gcrypt.c.inc index a6a0117717f5bc1061e91612981ee26eaa8a1d3b..6b82280f9000295a1003ef4a164a32a913f8e32a 100644 --- a/crypto/cipher-gcrypt.c.inc +++ b/crypto/cipher-gcrypt.c.inc @@ -20,6 +20,56 @@ #include +static int qcrypto_cipher_alg_to_gcry_alg(QCryptoCipherAlgorithm alg) +{ + switch (alg) { + case QCRYPTO_CIPHER_ALG_DES: + return GCRY_CIPHER_DES; + case QCRYPTO_CIPHER_ALG_3DES: + return GCRY_CIPHER_3DES; + case QCRYPTO_CIPHER_ALG_AES_128: + return GCRY_CIPHER_AES128; + case QCRYPTO_CIPHER_ALG_AES_192: + return GCRY_CIPHER_AES192; + case QCRYPTO_CIPHER_ALG_AES_256: + return GCRY_CIPHER_AES256; + case QCRYPTO_CIPHER_ALG_CAST5_128: + return GCRY_CIPHER_CAST5; + case QCRYPTO_CIPHER_ALG_SERPENT_128: + return GCRY_CIPHER_SERPENT128; + case QCRYPTO_CIPHER_ALG_SERPENT_192: + return GCRY_CIPHER_SERPENT192; + case QCRYPTO_CIPHER_ALG_SERPENT_256: + return GCRY_CIPHER_SERPENT256; + case QCRYPTO_CIPHER_ALG_TWOFISH_128: + return GCRY_CIPHER_TWOFISH128; + case QCRYPTO_CIPHER_ALG_TWOFISH_256: + return GCRY_CIPHER_TWOFISH; +#ifdef CONFIG_CRYPTO_SM4 + case QCRYPTO_CIPHER_ALG_SM4: + return GCRY_CIPHER_SM4; +#endif + default: + return GCRY_CIPHER_NONE; + } +} + +static int qcrypto_cipher_mode_to_gcry_mode(QCryptoCipherMode mode) +{ + switch (mode) { + case QCRYPTO_CIPHER_MODE_ECB: + return GCRY_CIPHER_MODE_ECB; + case QCRYPTO_CIPHER_MODE_XTS: + return GCRY_CIPHER_MODE_XTS; + case QCRYPTO_CIPHER_MODE_CBC: + return GCRY_CIPHER_MODE_CBC; + case QCRYPTO_CIPHER_MODE_CTR: + return GCRY_CIPHER_MODE_CTR; + default: + return GCRY_CIPHER_MODE_NONE; + } +} + bool qcrypto_cipher_supports(QCryptoCipherAlgorithm alg, QCryptoCipherMode mode) { @@ -35,6 +85,9 @@ bool qcrypto_cipher_supports(QCryptoCipherAlgorithm alg, case QCRYPTO_CIPHER_ALG_SERPENT_256: case QCRYPTO_CIPHER_ALG_TWOFISH_128: case QCRYPTO_CIPHER_ALG_TWOFISH_256: +#ifdef CONFIG_CRYPTO_SM4 + case QCRYPTO_CIPHER_ALG_SM4: +#endif break; default: return false; @@ -185,67 +238,26 @@ static QCryptoCipher *qcrypto_cipher_ctx_new(QCryptoCipherAlgorithm alg, return NULL; } - switch (alg) { - case QCRYPTO_CIPHER_ALG_DES: - gcryalg = GCRY_CIPHER_DES; - break; - case QCRYPTO_CIPHER_ALG_3DES: - gcryalg = GCRY_CIPHER_3DES; - break; - case QCRYPTO_CIPHER_ALG_AES_128: - gcryalg = GCRY_CIPHER_AES128; - break; - case QCRYPTO_CIPHER_ALG_AES_192: - gcryalg = GCRY_CIPHER_AES192; - break; - case QCRYPTO_CIPHER_ALG_AES_256: - gcryalg = GCRY_CIPHER_AES256; - break; - case 
QCRYPTO_CIPHER_ALG_CAST5_128: - gcryalg = GCRY_CIPHER_CAST5; - break; - case QCRYPTO_CIPHER_ALG_SERPENT_128: - gcryalg = GCRY_CIPHER_SERPENT128; - break; - case QCRYPTO_CIPHER_ALG_SERPENT_192: - gcryalg = GCRY_CIPHER_SERPENT192; - break; - case QCRYPTO_CIPHER_ALG_SERPENT_256: - gcryalg = GCRY_CIPHER_SERPENT256; - break; - case QCRYPTO_CIPHER_ALG_TWOFISH_128: - gcryalg = GCRY_CIPHER_TWOFISH128; - break; - case QCRYPTO_CIPHER_ALG_TWOFISH_256: - gcryalg = GCRY_CIPHER_TWOFISH; - break; - default: + gcryalg = qcrypto_cipher_alg_to_gcry_alg(alg); + if (gcryalg == GCRY_CIPHER_NONE) { error_setg(errp, "Unsupported cipher algorithm %s", QCryptoCipherAlgorithm_str(alg)); return NULL; } - drv = &qcrypto_gcrypt_driver; - switch (mode) { - case QCRYPTO_CIPHER_MODE_ECB: - gcrymode = GCRY_CIPHER_MODE_ECB; - break; - case QCRYPTO_CIPHER_MODE_XTS: - gcrymode = GCRY_CIPHER_MODE_XTS; - break; - case QCRYPTO_CIPHER_MODE_CBC: - gcrymode = GCRY_CIPHER_MODE_CBC; - break; - case QCRYPTO_CIPHER_MODE_CTR: - drv = &qcrypto_gcrypt_ctr_driver; - gcrymode = GCRY_CIPHER_MODE_CTR; - break; - default: + gcrymode = qcrypto_cipher_mode_to_gcry_mode(mode); + if (gcrymode == GCRY_CIPHER_MODE_NONE) { error_setg(errp, "Unsupported cipher mode %s", QCryptoCipherMode_str(mode)); return NULL; } + if (mode == QCRYPTO_CIPHER_MODE_CTR) { + drv = &qcrypto_gcrypt_ctr_driver; + } else { + drv = &qcrypto_gcrypt_driver; + } + ctx = g_new0(QCryptoCipherGcrypt, 1); ctx->base.driver = drv; diff --git a/crypto/cipher-nettle.c.inc b/crypto/cipher-nettle.c.inc index 24cc61f87bfc4ae7ab8ff523a4105a67bace67a1..2654b439c18912bd65aba6b7da3431ece73d432c 100644 --- a/crypto/cipher-nettle.c.inc +++ b/crypto/cipher-nettle.c.inc @@ -33,6 +33,9 @@ #ifndef CONFIG_QEMU_PRIVATE_XTS #include #endif +#ifdef CONFIG_CRYPTO_SM4 +#include +#endif static inline bool qcrypto_length_check(size_t len, size_t blocksize, Error **errp) @@ -426,6 +429,30 @@ DEFINE_ECB_CBC_CTR_XTS(qcrypto_nettle_twofish, QCryptoNettleTwofish, TWOFISH_BLOCK_SIZE, twofish_encrypt_native, twofish_decrypt_native) +#ifdef CONFIG_CRYPTO_SM4 +typedef struct QCryptoNettleSm4 { + QCryptoCipher base; + struct sm4_ctx key[2]; +} QCryptoNettleSm4; + +static void sm4_encrypt_native(void *ctx, size_t length, + uint8_t *dst, const uint8_t *src) +{ + struct sm4_ctx *keys = ctx; + sm4_crypt(&keys[0], length, dst, src); +} + +static void sm4_decrypt_native(void *ctx, size_t length, + uint8_t *dst, const uint8_t *src) +{ + struct sm4_ctx *keys = ctx; + sm4_crypt(&keys[1], length, dst, src); +} + +DEFINE_ECB(qcrypto_nettle_sm4, + QCryptoNettleSm4, SM4_BLOCK_SIZE, + sm4_encrypt_native, sm4_decrypt_native) +#endif bool qcrypto_cipher_supports(QCryptoCipherAlgorithm alg, QCryptoCipherMode mode) @@ -443,6 +470,9 @@ bool qcrypto_cipher_supports(QCryptoCipherAlgorithm alg, case QCRYPTO_CIPHER_ALG_TWOFISH_128: case QCRYPTO_CIPHER_ALG_TWOFISH_192: case QCRYPTO_CIPHER_ALG_TWOFISH_256: +#ifdef CONFIG_CRYPTO_SM4 + case QCRYPTO_CIPHER_ALG_SM4: +#endif break; default: return false; @@ -495,8 +525,10 @@ static QCryptoCipher *qcrypto_cipher_ctx_new(QCryptoCipherAlgorithm alg, case QCRYPTO_CIPHER_MODE_CTR: drv = &qcrypto_nettle_des_driver_ctr; break; - default: + case QCRYPTO_CIPHER_MODE_XTS: goto bad_cipher_mode; + default: + g_assert_not_reached(); } ctx = g_new0(QCryptoNettleDES, 1); @@ -521,8 +553,10 @@ static QCryptoCipher *qcrypto_cipher_ctx_new(QCryptoCipherAlgorithm alg, case QCRYPTO_CIPHER_MODE_CTR: drv = &qcrypto_nettle_des3_driver_ctr; break; - default: + case QCRYPTO_CIPHER_MODE_XTS: goto bad_cipher_mode; + 
default: + g_assert_not_reached(); } ctx = g_new0(QCryptoNettleDES3, 1); @@ -633,8 +667,10 @@ static QCryptoCipher *qcrypto_cipher_ctx_new(QCryptoCipherAlgorithm alg, case QCRYPTO_CIPHER_MODE_CTR: drv = &qcrypto_nettle_cast128_driver_ctr; break; - default: + case QCRYPTO_CIPHER_MODE_XTS: goto bad_cipher_mode; + default: + g_assert_not_reached(); } ctx = g_new0(QCryptoNettleCAST128, 1); @@ -701,6 +737,32 @@ static QCryptoCipher *qcrypto_cipher_ctx_new(QCryptoCipherAlgorithm alg, return &ctx->base; } +#ifdef CONFIG_CRYPTO_SM4 + case QCRYPTO_CIPHER_ALG_SM4: + { + QCryptoNettleSm4 *ctx; + const QCryptoCipherDriver *drv; + + switch (mode) { + case QCRYPTO_CIPHER_MODE_ECB: + drv = &qcrypto_nettle_sm4_driver_ecb; + break; + case QCRYPTO_CIPHER_MODE_CBC: + case QCRYPTO_CIPHER_MODE_CTR: + case QCRYPTO_CIPHER_MODE_XTS: + goto bad_cipher_mode; + default: + g_assert_not_reached(); + } + + ctx = g_new0(QCryptoNettleSm4, 1); + ctx->base.driver = drv; + sm4_set_encrypt_key(&ctx->key[0], key); + sm4_set_decrypt_key(&ctx->key[1], key); + + return &ctx->base; + } +#endif default: error_setg(errp, "Unsupported cipher algorithm %s", diff --git a/crypto/cipher.c b/crypto/cipher.c index 74b09a5b261bae00c01afa788dbca99017f4110e..5f512768ea3b05852960a65fadcf38b3ca6265d1 100644 --- a/crypto/cipher.c +++ b/crypto/cipher.c @@ -38,6 +38,9 @@ static const size_t alg_key_len[QCRYPTO_CIPHER_ALG__MAX] = { [QCRYPTO_CIPHER_ALG_TWOFISH_128] = 16, [QCRYPTO_CIPHER_ALG_TWOFISH_192] = 24, [QCRYPTO_CIPHER_ALG_TWOFISH_256] = 32, +#ifdef CONFIG_CRYPTO_SM4 + [QCRYPTO_CIPHER_ALG_SM4] = 16, +#endif }; static const size_t alg_block_len[QCRYPTO_CIPHER_ALG__MAX] = { @@ -53,6 +56,9 @@ static const size_t alg_block_len[QCRYPTO_CIPHER_ALG__MAX] = { [QCRYPTO_CIPHER_ALG_TWOFISH_128] = 16, [QCRYPTO_CIPHER_ALG_TWOFISH_192] = 16, [QCRYPTO_CIPHER_ALG_TWOFISH_256] = 16, +#ifdef CONFIG_CRYPTO_SM4 + [QCRYPTO_CIPHER_ALG_SM4] = 16, +#endif }; static const bool mode_need_iv[QCRYPTO_CIPHER_MODE__MAX] = { diff --git a/crypto/hash-gcrypt.c b/crypto/hash-gcrypt.c index 829e48258d326c3d8a9c7a5d2ef85a06a0dabfb2..8b305bd29aef78568ea5d09f6fca5e7af3bf3d87 100644 --- a/crypto/hash-gcrypt.c +++ b/crypto/hash-gcrypt.c @@ -33,13 +33,16 @@ static int qcrypto_hash_alg_map[QCRYPTO_HASH_ALG__MAX] = { [QCRYPTO_HASH_ALG_SHA384] = GCRY_MD_SHA384, [QCRYPTO_HASH_ALG_SHA512] = GCRY_MD_SHA512, [QCRYPTO_HASH_ALG_RIPEMD160] = GCRY_MD_RMD160, +#ifdef CONFIG_CRYPTO_SM3 + [QCRYPTO_HASH_ALG_SM3] = GCRY_MD_SM3, +#endif }; gboolean qcrypto_hash_supports(QCryptoHashAlgorithm alg) { if (alg < G_N_ELEMENTS(qcrypto_hash_alg_map) && qcrypto_hash_alg_map[alg] != GCRY_MD_NONE) { - return true; + return gcry_md_test_algo(qcrypto_hash_alg_map[alg]) == 0; } return false; } @@ -53,7 +56,7 @@ qcrypto_gcrypt_hash_bytesv(QCryptoHashAlgorithm alg, size_t *resultlen, Error **errp) { - int i, ret; + gcry_error_t ret; gcry_md_hd_t md; unsigned char *digest; @@ -66,7 +69,7 @@ qcrypto_gcrypt_hash_bytesv(QCryptoHashAlgorithm alg, ret = gcry_md_open(&md, qcrypto_hash_alg_map[alg], 0); - if (ret < 0) { + if (ret != 0) { error_setg(errp, "Unable to initialize hash algorithm: %s", gcry_strerror(ret)); diff --git a/crypto/hash-nettle.c b/crypto/hash-nettle.c index 1ca1a4106283f2bc3f11d362433aac69d9737c81..0c2f8ce86c1d7ff5d6eedee967dc9fa269ccfa9b 100644 --- a/crypto/hash-nettle.c +++ b/crypto/hash-nettle.c @@ -25,6 +25,9 @@ #include #include #include +#ifdef CONFIG_CRYPTO_SM3 +#include +#endif typedef void (*qcrypto_nettle_init)(void *ctx); typedef void (*qcrypto_nettle_write)(void *ctx, @@ -42,6 +45,9 @@ 
union qcrypto_hash_ctx { struct sha384_ctx sha384; struct sha512_ctx sha512; struct ripemd160_ctx ripemd160; +#ifdef CONFIG_CRYPTO_SM3 + struct sm3_ctx sm3; +#endif }; struct qcrypto_hash_alg { @@ -92,6 +98,14 @@ struct qcrypto_hash_alg { .result = (qcrypto_nettle_result)ripemd160_digest, .len = RIPEMD160_DIGEST_SIZE, }, +#ifdef CONFIG_CRYPTO_SM3 + [QCRYPTO_HASH_ALG_SM3] = { + .init = (qcrypto_nettle_init)sm3_init, + .write = (qcrypto_nettle_write)sm3_update, + .result = (qcrypto_nettle_result)sm3_digest, + .len = SM3_DIGEST_SIZE, + }, +#endif }; gboolean qcrypto_hash_supports(QCryptoHashAlgorithm alg) diff --git a/crypto/hash.c b/crypto/hash.c index b0f8228bdcb4a0b39e194065570065881ac51396..8f1502ce686ea6417d709f5d13d00009ab6253d7 100644 --- a/crypto/hash.c +++ b/crypto/hash.c @@ -30,6 +30,9 @@ static size_t qcrypto_hash_alg_size[QCRYPTO_HASH_ALG__MAX] = { [QCRYPTO_HASH_ALG_SHA384] = 48, [QCRYPTO_HASH_ALG_SHA512] = 64, [QCRYPTO_HASH_ALG_RIPEMD160] = 20, +#ifdef CONFIG_CRYPTO_SM3 + [QCRYPTO_HASH_ALG_SM3] = 32, +#endif }; size_t qcrypto_hash_digest_len(QCryptoHashAlgorithm alg) diff --git a/crypto/hmac-gcrypt.c b/crypto/hmac-gcrypt.c index 0c6f9797114c26f56301da6d9a97de98014046bd..15926fccfafd10199f7dfd8ef4e0923a09908be3 100644 --- a/crypto/hmac-gcrypt.c +++ b/crypto/hmac-gcrypt.c @@ -26,6 +26,9 @@ static int qcrypto_hmac_alg_map[QCRYPTO_HASH_ALG__MAX] = { [QCRYPTO_HASH_ALG_SHA384] = GCRY_MAC_HMAC_SHA384, [QCRYPTO_HASH_ALG_SHA512] = GCRY_MAC_HMAC_SHA512, [QCRYPTO_HASH_ALG_RIPEMD160] = GCRY_MAC_HMAC_RMD160, +#ifdef CONFIG_CRYPTO_SM3 + [QCRYPTO_HASH_ALG_SM3] = GCRY_MAC_HMAC_SM3, +#endif }; typedef struct QCryptoHmacGcrypt QCryptoHmacGcrypt; @@ -37,7 +40,7 @@ bool qcrypto_hmac_supports(QCryptoHashAlgorithm alg) { if (alg < G_N_ELEMENTS(qcrypto_hmac_alg_map) && qcrypto_hmac_alg_map[alg] != GCRY_MAC_NONE) { - return true; + return gcry_mac_test_algo(qcrypto_hmac_alg_map[alg]) == 0; } return false; diff --git a/crypto/hmac-nettle.c b/crypto/hmac-nettle.c index 1ad6c4f253069223b6a479bca25b25e858030918..3f103792abe481b25c2e6abdf3de455e0a0a3a68 100644 --- a/crypto/hmac-nettle.c +++ b/crypto/hmac-nettle.c @@ -38,6 +38,9 @@ struct QCryptoHmacNettle { struct hmac_sha256_ctx sha256_ctx; /* equals hmac_sha224_ctx */ struct hmac_sha512_ctx sha512_ctx; /* equals hmac_sha384_ctx */ struct hmac_ripemd160_ctx ripemd160_ctx; +#ifdef CONFIG_CRYPTO_SM3 + struct hmac_sm3_ctx sm3_ctx; +#endif } u; }; @@ -89,6 +92,14 @@ struct qcrypto_nettle_hmac_alg { .digest = (qcrypto_nettle_hmac_digest)hmac_ripemd160_digest, .len = RIPEMD160_DIGEST_SIZE, }, +#ifdef CONFIG_CRYPTO_SM3 + [QCRYPTO_HASH_ALG_SM3] = { + .setkey = (qcrypto_nettle_hmac_setkey)hmac_sm3_set_key, + .update = (qcrypto_nettle_hmac_update)hmac_sm3_update, + .digest = (qcrypto_nettle_hmac_digest)hmac_sm3_digest, + .len = SM3_DIGEST_SIZE, + }, +#endif }; bool qcrypto_hmac_supports(QCryptoHashAlgorithm alg) diff --git a/crypto/init.c b/crypto/init.c index fb7f1bff1053526a8b7c1f097ada5951f907ab8a..674d237fa9166c80f43bf4467df243f0b748581c 100644 --- a/crypto/init.c +++ b/crypto/init.c @@ -34,14 +34,11 @@ #include "crypto/random.h" -/* #define DEBUG_GNUTLS */ -#ifdef DEBUG_GNUTLS -static void qcrypto_gnutls_log(int level, const char *str) -{ - fprintf(stderr, "%d: %s", level, str); -} -#endif +/* + * To debug GNUTLS see env vars listed in + * https://gnutls.org/manual/html_node/Debugging-and-auditing.html + */ int qcrypto_init(Error **errp) { #ifdef CONFIG_GNUTLS @@ -53,10 +50,6 @@ int qcrypto_init(Error **errp) gnutls_strerror(ret)); return -1; } -#ifdef 
DEBUG_GNUTLS - gnutls_global_set_log_level(10); - gnutls_global_set_log_function(qcrypto_gnutls_log); -#endif #endif #ifdef CONFIG_GCRYPT diff --git a/crypto/pbkdf-gcrypt.c b/crypto/pbkdf-gcrypt.c index a8d8e64f4d46bbb65f8d0ef71cbe20c08bca31ed..09b38d0d6e1cc8c6640372309afac2450767ed54 100644 --- a/crypto/pbkdf-gcrypt.c +++ b/crypto/pbkdf-gcrypt.c @@ -33,6 +33,9 @@ bool qcrypto_pbkdf2_supports(QCryptoHashAlgorithm hash) case QCRYPTO_HASH_ALG_SHA384: case QCRYPTO_HASH_ALG_SHA512: case QCRYPTO_HASH_ALG_RIPEMD160: +#ifdef CONFIG_CRYPTO_SM3 + case QCRYPTO_HASH_ALG_SM3: +#endif return true; default: return false; @@ -54,6 +57,9 @@ int qcrypto_pbkdf2(QCryptoHashAlgorithm hash, [QCRYPTO_HASH_ALG_SHA384] = GCRY_MD_SHA384, [QCRYPTO_HASH_ALG_SHA512] = GCRY_MD_SHA512, [QCRYPTO_HASH_ALG_RIPEMD160] = GCRY_MD_RMD160, +#ifdef CONFIG_CRYPTO_SM3 + [QCRYPTO_HASH_ALG_SM3] = GCRY_MD_SM3, +#endif }; int ret; diff --git a/crypto/pbkdf-nettle.c b/crypto/pbkdf-nettle.c index d6293c25a13b98880be6110e577f4d35a7af711c..5fea570bd3313b56c4440e939bae58610ae175cb 100644 --- a/crypto/pbkdf-nettle.c +++ b/crypto/pbkdf-nettle.c @@ -34,6 +34,9 @@ bool qcrypto_pbkdf2_supports(QCryptoHashAlgorithm hash) case QCRYPTO_HASH_ALG_SHA384: case QCRYPTO_HASH_ALG_SHA512: case QCRYPTO_HASH_ALG_RIPEMD160: +#ifdef CONFIG_CRYPTO_SM3 + case QCRYPTO_HASH_ALG_SM3: +#endif return true; default: return false; @@ -55,6 +58,9 @@ int qcrypto_pbkdf2(QCryptoHashAlgorithm hash, struct hmac_sha384_ctx sha384; struct hmac_sha512_ctx sha512; struct hmac_ripemd160_ctx ripemd160; +#ifdef CONFIG_CRYPTO_SM3 + struct hmac_sm3_ctx sm3; +#endif } ctx; if (iterations > UINT_MAX) { @@ -106,6 +112,13 @@ int qcrypto_pbkdf2(QCryptoHashAlgorithm hash, PBKDF2(&ctx.ripemd160, hmac_ripemd160_update, hmac_ripemd160_digest, RIPEMD160_DIGEST_SIZE, iterations, nsalt, salt, nout, out); break; +#ifdef CONFIG_CRYPTO_SM3 + case QCRYPTO_HASH_ALG_SM3: + hmac_sm3_set_key(&ctx.sm3, nkey, key); + PBKDF2(&ctx.sm3, hmac_sm3_update, hmac_sm3_digest, + SM3_DIGEST_SIZE, iterations, nsalt, salt, nout, out); + break; +#endif default: error_setg_errno(errp, ENOSYS, diff --git a/crypto/pbkdf.c b/crypto/pbkdf.c index 8d198c152cf2ee06ea340d6b9adea4cbfd001136..d1c06ef3ed3bbe4f4509f624e6ba9dc59e71b72e 100644 --- a/crypto/pbkdf.c +++ b/crypto/pbkdf.c @@ -19,6 +19,7 @@ */ #include "qemu/osdep.h" +#include "qemu/thread.h" #include "qapi/error.h" #include "crypto/pbkdf.h" #ifndef _WIN32 @@ -85,12 +86,28 @@ static int qcrypto_pbkdf2_get_thread_cpu(unsigned long long *val_ms, #endif } -uint64_t qcrypto_pbkdf2_count_iters(QCryptoHashAlgorithm hash, - const uint8_t *key, size_t nkey, - const uint8_t *salt, size_t nsalt, - size_t nout, - Error **errp) +typedef struct CountItersData { + QCryptoHashAlgorithm hash; + const uint8_t *key; + size_t nkey; + const uint8_t *salt; + size_t nsalt; + size_t nout; + uint64_t iterations; + Error **errp; +} CountItersData; + +static void *threaded_qcrypto_pbkdf2_count_iters(void *data) { + CountItersData *iters_data = (CountItersData *) data; + QCryptoHashAlgorithm hash = iters_data->hash; + const uint8_t *key = iters_data->key; + size_t nkey = iters_data->nkey; + const uint8_t *salt = iters_data->salt; + size_t nsalt = iters_data->nsalt; + size_t nout = iters_data->nout; + Error **errp = iters_data->errp; + uint64_t ret = -1; g_autofree uint8_t *out = g_new(uint8_t, nout); uint64_t iterations = (1 << 15); @@ -114,7 +131,10 @@ uint64_t qcrypto_pbkdf2_count_iters(QCryptoHashAlgorithm hash, delta_ms = end_ms - start_ms; - if (delta_ms > 500) { + if (delta_ms == 0) { /* 
sanity check */ + error_setg(errp, "Unable to get accurate CPU usage"); + goto cleanup; + } else if (delta_ms > 500) { break; } else if (delta_ms < 100) { iterations = iterations * 10; @@ -129,5 +149,24 @@ uint64_t qcrypto_pbkdf2_count_iters(QCryptoHashAlgorithm hash, cleanup: memset(out, 0, nout); - return ret; + iters_data->iterations = ret; + return NULL; +} + +uint64_t qcrypto_pbkdf2_count_iters(QCryptoHashAlgorithm hash, + const uint8_t *key, size_t nkey, + const uint8_t *salt, size_t nsalt, + size_t nout, + Error **errp) +{ + CountItersData data = { + hash, key, nkey, salt, nsalt, nout, 0, errp + }; + QemuThread thread; + + qemu_thread_create(&thread, "pbkdf2", threaded_qcrypto_pbkdf2_count_iters, + &data, QEMU_THREAD_JOINABLE); + qemu_thread_join(&thread); + + return data.iterations; } diff --git a/crypto/tlscredspsk.c b/crypto/tlscredspsk.c index 546cad1c5a49d25d20de84ca4613681e612c5810..0d6b71a37cf273b8d2ce3e3e447220572f0d5e17 100644 --- a/crypto/tlscredspsk.c +++ b/crypto/tlscredspsk.c @@ -243,6 +243,7 @@ qcrypto_tls_creds_psk_finalize(Object *obj) QCryptoTLSCredsPSK *creds = QCRYPTO_TLS_CREDS_PSK(obj); qcrypto_tls_creds_psk_unload(creds); + g_free(creds->username); } static void diff --git a/docs/devel/index-internals.rst b/docs/devel/index-internals.rst index 6f81df92bcaba790477aff1ccb51048409331950..5636e9cf1d78bfb859cd8282778e777ac6fe18ea 100644 --- a/docs/devel/index-internals.rst +++ b/docs/devel/index-internals.rst @@ -11,12 +11,12 @@ Details about QEMU's various subsystems including how to add features to them. block-coroutine-wrapper clocks ebpf_rss - migration + migration/index multi-process reset s390-cpu-topology s390-dasd-ipl tracing - vfio-migration + vfio-iommufd writing-monitor-commands virtio-backends diff --git a/docs/devel/migration/best-practices.rst b/docs/devel/migration/best-practices.rst new file mode 100644 index 0000000000000000000000000000000000000000..d7c34a30149d68065cb547637e878777863586dd --- /dev/null +++ b/docs/devel/migration/best-practices.rst @@ -0,0 +1,48 @@ +============== +Best practices +============== + +Debugging +========= + +The migration stream can be analyzed thanks to ``scripts/analyze-migration.py``. + +Example usage: + +.. code-block:: shell + + $ qemu-system-x86_64 -display none -monitor stdio + (qemu) migrate "exec:cat > mig" + (qemu) q + $ ./scripts/analyze-migration.py -f mig + { + "ram (3)": { + "section sizes": { + "pc.ram": "0x0000000008000000", + ... + +See also ``analyze-migration.py -h`` help for more options. + +Firmware +======== + +Migration migrates the copies of RAM and ROM, and thus when running +on the destination it includes the firmware from the source. Even after +resetting a VM, the old firmware is used. Only once QEMU has been restarted +is the new firmware in use. + +- Changes in firmware size can cause changes in the required RAMBlock size + to hold the firmware and thus migration can fail. In practice it's best + to pad firmware images to convenient powers of 2 with plenty of space + for growth. + +- Care should be taken with device emulation code so that newer + emulation code can work with older firmware to allow forward migration. + +- Care should be taken with newer firmware so that backward migration + to older systems with older device emulation code will work. + +In some cases it may be best to tie specific firmware versions to specific +versioned machine types to cut down on the combinations that will need +support. This is also useful when newer versions of firmware outgrow +the padding. 
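+ +As a minimal illustration of the padding advice above (the file name and the 4 MiB target are only examples, not a recommendation for any specific firmware), a build step could zero-pad the image to the chosen power of 2 with coreutils: + +.. code-block:: shell + + $ stat -c %s firmware.bin + 3653632 + $ truncate -s 4M firmware.bin # extends and zero-pads to 4 MiB + $ stat -c %s firmware.bin + 4194304 + +The RAMBlock holding the image then keeps a stable size across firmware rebuilds, as long as the image does not outgrow the padding.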
diff --git a/docs/devel/migration/compatibility.rst b/docs/devel/migration/compatibility.rst new file mode 100644 index 0000000000000000000000000000000000000000..5a5417ef069ef38f2039eb94dc5fcc6370e69af3 --- /dev/null +++ b/docs/devel/migration/compatibility.rst @@ -0,0 +1,517 @@ +Backwards compatibility +======================= + +How backwards compatibility works +--------------------------------- + +When we do migration, we have two QEMU processes: the source and the +target. There are two cases: either they are the same version or they are +different versions. The easy case is when they are the same version. +The difficult one is when they are different versions. + +There are two things that are different, but they have very similar +names and sometimes get confused: + +- QEMU version +- machine type version + +Let's start with a practical example: + +- qemu-system-x86_64 (v5.2), from now on qemu-5.2. +- qemu-system-x86_64 (v5.1), from now on qemu-5.1. + +Related to this are the "latest" machine types defined on each of +them: + +- pc-q35-5.2 (the newest one in qemu-5.2), from now on pc-5.2 +- pc-q35-5.1 (the newest one in qemu-5.1), from now on pc-5.1 + +First of all, migration is only supposed to work if you use the same +machine type on both source and destination. The QEMU hardware +configuration also needs to be the same on source and destination. +Most aspects of the backend configuration can be changed at will, +except for a few cases where the backend features influence frontend +device feature exposure. But that is not relevant for this section. + +I am going to list the combinations that we can have. Let's +start with the trivial ones, where QEMU is the same on source and +destination: + +1 - qemu-5.2 -M pc-5.2 -> migrates to -> qemu-5.2 -M pc-5.2 + + This is the latest QEMU with the latest machine type. + This has to work, and if it doesn't work it is a bug. + +2 - qemu-5.1 -M pc-5.1 -> migrates to -> qemu-5.1 -M pc-5.1 + + Exactly the same case as the previous one, but for 5.1. + Nothing to see here either. + +These are the easiest ones; we will not talk about them further in this +section. + +Now we start with the more interesting cases. Consider the case where +we have the same QEMU version on both sides (qemu-5.2), but instead of +the latest machine type for that version (pc-5.2) we are using one from an older +QEMU version, in this case pc-5.1. + +3 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1 + + It needs to use the definition of pc-5.1 and the devices as they + were configured on 5.1, but this should be easy in the sense that + both sides are the same QEMU and both sides have exactly the same + idea of what the pc-5.1 machine is. + +4 - qemu-5.1 -M pc-5.2 -> migrates to -> qemu-5.1 -M pc-5.2 + + This combination is not possible, as qemu-5.1 doesn't understand the + pc-5.2 machine type. So nothing to worry about here. + +Now come the interesting cases, when the two QEMU processes are +different versions. Notice also that the machine type needs to be pc-5.1, +because of the limitation that qemu-5.1 doesn't know pc-5.2. So +the possible cases are: + +5 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.1 -M pc-5.1 + + This migration is known as newer to older. While developing 5.2 + we need to take care not to break + migration to qemu-5.1. Notice that we can't make updates to + qemu-5.1 to understand whatever qemu-5.2 decides to change, so it is + up to the qemu-5.2 side to make the relevant changes.
+I am going to list the combinations that we can have. Let's start
+with the trivial ones, where QEMU is the same on source and
+destination:
+
+1 - qemu-5.2 -M pc-5.2  -> migrates to -> qemu-5.2 -M pc-5.2
+
+    This is the latest QEMU with the latest machine type.
+    This has to work, and if it doesn't it is a bug.
+
+2 - qemu-5.1 -M pc-5.1 -> migrates to -> qemu-5.1 -M pc-5.1
+
+    Exactly the same case as the previous one, but for 5.1.
+    Nothing to see here either.
+
+These are the easiest ones; we will not talk more about them in this
+section.
+
+Now we start with the more interesting cases. Consider the case where
+we have the same QEMU version on both sides (qemu-5.2), but instead of
+the latest machine type for that version (pc-5.2) we are using one of
+an older QEMU version, in this case pc-5.1.
+
+3 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1
+
+    It needs to use the definition of pc-5.1 and the devices as they
+    were configured on 5.1, but this should be easy in the sense that
+    both sides are the same QEMU and both sides have exactly the same
+    idea of what the pc-5.1 machine is.
+
+4 - qemu-5.1 -M pc-5.2 -> migrates to -> qemu-5.1 -M pc-5.2
+
+    This combination is not possible, as qemu-5.1 doesn't understand
+    the pc-5.2 machine type. So nothing to worry about here.
+
+Now come the interesting ones, when the two QEMU processes are
+different. Notice also that the machine type needs to be pc-5.1,
+because of the limitation that qemu-5.1 doesn't know pc-5.2. So the
+possible cases are:
+
+5 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.1 -M pc-5.1
+
+    This migration is known as newer to older. While developing 5.2
+    we need to take care not to break migration to qemu-5.1. Notice
+    that we can't make updates to qemu-5.1 to understand whatever
+    qemu-5.2 decides to change, so it is on the qemu-5.2 side to make
+    the relevant changes.
+
+6 - qemu-5.1 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1
+
+    This migration is known as older to newer. We need to make sure
+    that we are able to receive migrations from qemu-5.1. The problem
+    is similar to the previous one.
+
+If qemu-5.1 and qemu-5.2 were the same, there would not be any
+compatibility problems. But the reason that we create qemu-5.2 is to
+get new features, devices, defaults, etc.
+
+If we get a device that has a new feature, or change a default value,
+we have a problem when we try to migrate between different QEMU
+versions.
+
+So we need a way to tell qemu-5.2 that when we are using machine type
+pc-5.1, it needs to **not** use the feature, to be able to migrate to
+a real qemu-5.1.
+
+The equivalent applies when migrating from qemu-5.1 to qemu-5.2:
+qemu-5.2 has to expect that it is not going to get data for the new
+feature, because qemu-5.1 doesn't know about it.
+
+How do we tell QEMU about these device feature changes? In
+hw/core/machine.c:hw_compat_X_Y arrays.
+
+If we change a default value, we need to put the old value back in
+that array, and the device, during initialization, needs to look at
+that array to see what value it should use for that feature. What we
+put in that array is the value of a property.
+
+To create a property for a device, we need to use one of the
+DEFINE_PROP_*() macros. See include/hw/qdev-properties.h to find the
+macros that exist. With them, we set the default value for that
+property, and that is what it is going to be in the latest released
+version. But if we want a different value for a previous version, we
+can change that in the hw_compat_X_Y arrays.
+
+hw_compat_X_Y is an array of records with the format:
+
+- name_device
+- name_property
+- value
+
+Let's see a practical example.
+
+In qemu-5.2 virtio-blk-device got multi-queue support. This is a
+change that is not backward compatible. In qemu-5.1 it has one queue.
+In qemu-5.2 it has the same number of queues as the number of cpus in
+the system.
+
+When we are doing migration, if we migrate from a device that has 4
+queues to a device that has only one queue, we don't know where to
+put the extra information for the other 3 queues, and we fail
+migration.
+
+A similar problem occurs when we migrate from qemu-5.1, which has only
+one queue, to qemu-5.2: we only sent information for one queue, but
+the destination has 4, so there are 3 queues that are not properly
+initialized and anything can happen.
+
+So, how can we address this problem? Easy: just convince qemu-5.2
+that when it is running pc-5.1, it needs to set the number of queues
+for virtio-blk-devices to 1.
+
+That way we fix cases 5 and 6.
+
+5 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.1 -M pc-5.1
+
+    qemu-5.2 -M pc-5.1 sets the number of queues to be 1.
+    qemu-5.1 -M pc-5.1 expects the number of queues to be 1.
+
+    Correct. Migration works.
+
+6 - qemu-5.1 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1
+
+    qemu-5.1 -M pc-5.1 sets the number of queues to be 1.
+    qemu-5.2 -M pc-5.1 expects the number of queues to be 1.
+
+    Correct. Migration works.
+
+And now the other interesting case, case 3. In this case we have:
+
+3 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1
+
+    Here we have the same QEMU on both sides. So it doesn't matter a
+    lot if we have set the number of queues to 1 or not, because they
+    are the same.
+
+    WRONG!
+
+    Think about what happens if we do one of these double migrations:
+
+    A -> migrates -> B -> migrates -> C
+
+    where:
+
+    A: qemu-5.1 -M pc-5.1
+    B: qemu-5.2 -M pc-5.1
+    C: qemu-5.2 -M pc-5.1
+
+    migration A -> B is case 6, so the number of queues needs to be 1.
+
+    migration B -> C is case 3, so we don't care. But actually we do
+    care, because the guest was not started in qemu-5.2; it came
+    migrated from qemu-5.1. So to be on the safe side, we need to
+    always use one queue when we are using pc-5.1.
+
+Now, how was this done in reality? The following commit shows how it
+was done::
+
+    commit 9445e1e15e66c19e42bea942ba810db28052cd05
+    Author: Stefan Hajnoczi
+    Date:   Tue Aug 18 15:33:47 2020 +0100
+
+    virtio-blk-pci: default num_queues to -smp N
+
+The relevant parts for migration are::
+
+    @@ -1281,7 +1284,8 @@ static Property virtio_blk_properties[] = {
+     #endif
+         DEFINE_PROP_BIT("request-merging", VirtIOBlock, conf.request_merging, 0,
+                         true),
+    -    DEFINE_PROP_UINT16("num-queues", VirtIOBlock, conf.num_queues, 1),
+    +    DEFINE_PROP_UINT16("num-queues", VirtIOBlock, conf.num_queues,
+    +                       VIRTIO_BLK_AUTO_NUM_QUEUES),
+         DEFINE_PROP_UINT16("queue-size", VirtIOBlock, conf.queue_size, 256),
+
+It changes the default value of num_queues. But it fixes it up for old
+machine types so that they keep the right value::
+
+    @@ -31,6 +31,7 @@
+     GlobalProperty hw_compat_5_1[] = {
+         ...
+    +    { "virtio-blk-device", "num-queues", "1"},
+         ...
+     };
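+A single entry of that array has the same effect as setting the
+corresponding global property by hand on the command line, which can
+be handy for experiments (a hedged sketch; normally this is the
+machine type's job, not the user's):
+
+.. code-block:: shell
+
+    # force the qemu-5.1 value of the property, as hw_compat_5_1 does
+    $ qemu-system-x86_64 -M pc-q35-5.2 \
+        -global virtio-blk-device.num-queues=1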
+
+A device with different features on both sides
+----------------------------------------------
+
+Let's assume that we are using the same QEMU binary on both sides,
+just to make things easier. But we have a device that has different
+features on both sides of the migration. That can be because the
+devices are different, because the kernel drivers of the two devices
+have different features, or whatever else.
+
+How can we get this to work with migration? The way to do that is
+"theoretically" easy: you take the features that the device has on
+the source of the migration and the features that the device has on
+the target of the migration, you compute the intersection of the
+features of both sides, and that is the configuration with which you
+should launch QEMU.
+
+Notice that this is not completely a QEMU problem. The most important
+thing here is that this should be handled by the managing application
+that launches QEMU. If QEMU is configured correctly, the migration
+will succeed.
+
+That said, actually doing it is complicated. Almost all devices are
+bad at being launched with only some features enabled. With one big
+exception: cpus.
+
+You can read the documentation for QEMU x86 cpu models here:
+
+https://qemu-project.gitlab.io/qemu/system/qemu-cpu-models.html
+
+When they talk about migration, they recommend choosing the newest
+cpu model that is supported by all the hosts involved.
+
+Let's say that we have:
+
+Host A:
+
+Device X has feature Y
+
+Host B:
+
+Device X does not have feature Y
+
+If we try to migrate without any care from host A to host B, it will
+fail, because when migration tries to load feature Y on the
+destination, it will find that the hardware is not there.
+
+Doing this would be the equivalent of doing the following with cpus:
+
+Host A:
+
+$ qemu-system-x86_64 -cpu host
+
+Host B:
+
+$ qemu-system-x86_64 -cpu host
+
+When the two hosts have different cpu features this is guaranteed to
+fail, especially if Host B has fewer features than host A. If host A
+has fewer features than host B, it sometimes works. The important
+word in that sentence is "sometimes".
+
+So, forgetting about cpu models and continuing with the -cpu host
+example, let's say that the difference between the cpus is that Hosts
+A and B have the following features:
+
+Features:   'pcid'  'stibp'  'taa-no'
+Host A:        X        X
+Host B:                          X
+
+If we want to migrate between them, the way to configure both QEMU
+cpus will be:
+
+Host A:
+
+$ qemu-system-x86_64 -cpu host,pcid=off,stibp=off
+
+Host B:
+
+$ qemu-system-x86_64 -cpu host,taa-no=off
+
+And you would be able to migrate between them. It is the
+responsibility of the management application or of the user to make
+sure that the configuration is correct. QEMU doesn't know how to
+check this kind of feature in general.
+
+Notice that we don't recommend using -cpu host for migration; it is
+used in this example because it makes the example simpler.
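+In practice, rather than trimming ``-cpu host`` feature by feature,
+the usual approach is to pick a named cpu model that both hosts can
+provide (a hedged sketch; ``Haswell`` is only an illustrative choice):
+
+.. code-block:: shell
+
+    # the same named model on both hosts
+    $ qemu-system-x86_64 -cpu Haswell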
+
+Other devices have worse control over individual features. If they
+want to be able to migrate between hosts that show different features,
+the device needs a way to configure which ones it is going to use.
+
+In this section we have assumed that we are using the same QEMU binary
+on both sides of the migration. If we use different QEMU versions,
+then we need to take into account all the other differences, and the
+examples become even more complicated.
+
+How to mitigate when we have a backward compatibility error
+-----------------------------------------------------------
+
+We break migration for old machine types continuously during
+development. But as soon as we find that there is a problem, we fix
+it. The problem is what happens when, after we have done a release,
+we detect that something has gone wrong.
+
+Let's see how it worked with one example.
+
+After the release of qemu-8.0 we found a problem when doing migration
+of the machine type pc-7.2.
+
+- $ qemu-7.2 -M pc-7.2  ->  qemu-7.2 -M pc-7.2
+
+   This migration works
+
+- $ qemu-8.0 -M pc-7.2  ->  qemu-8.0 -M pc-7.2
+
+   This migration works
+
+- $ qemu-8.0 -M pc-7.2  ->  qemu-7.2 -M pc-7.2
+
+   This migration fails
+
+- $ qemu-7.2 -M pc-7.2  ->  qemu-8.0 -M pc-7.2
+
+   This migration fails
+
+So clearly something fails when migrating between qemu-7.2 and
+qemu-8.0 with machine type pc-7.2. The error messages and git bisect
+pointed to this commit.
+
+In qemu-8.0 we got this commit::
+
+    commit 010746ae1db7f52700cb2e2c46eb94f299cfa0d2
+    Author: Jonathan Cameron
+    Date:   Thu Mar 2 13:37:02 2023 +0000
+
+    hw/pci/aer: Implement PCI_ERR_UNCOR_MASK register
+
+The relevant bits of the commit for our example are these::
+
+    --- a/hw/pci/pcie_aer.c
+    +++ b/hw/pci/pcie_aer.c
+    @@ -112,6 +112,10 @@ int pcie_aer_init(PCIDevice *dev,
+
+         pci_set_long(dev->w1cmask + offset + PCI_ERR_UNCOR_STATUS,
+                      PCI_ERR_UNC_SUPPORTED);
+    +    pci_set_long(dev->config + offset + PCI_ERR_UNCOR_MASK,
+    +                 PCI_ERR_UNC_MASK_DEFAULT);
+    +    pci_set_long(dev->wmask + offset + PCI_ERR_UNCOR_MASK,
+    +                 PCI_ERR_UNC_SUPPORTED);
+
+         pci_set_long(dev->config + offset + PCI_ERR_UNCOR_SEVER,
+                      PCI_ERR_UNC_SEVERITY_DEFAULT);
+
+The patch changes how we configure the PCI space for AER. But QEMU
+fails when the PCI space configuration is different between source
+and destination.
+
+The following commit shows how this got fixed::
+
+    commit 5ed3dabe57dd9f4c007404345e5f5bf0e347317f
+    Author: Leonardo Bras
+    Date:   Tue May 2 21:27:02 2023 -0300
+
+    hw/pci: Disable PCI_ERR_UNCOR_MASK register for machine type < 8.0
+
+    [...]
+
+The relevant parts of the fix in QEMU are as follows:
+
+First, we create a new property for the device to be able to configure
+the old behaviour or the new behaviour::
+
+    diff --git a/hw/pci/pci.c b/hw/pci/pci.c
+    index 8a87ccc8b0..5153ad63d6 100644
+    --- a/hw/pci/pci.c
+    +++ b/hw/pci/pci.c
+    @@ -79,6 +79,8 @@ static Property pci_props[] = {
+         DEFINE_PROP_STRING("failover_pair_id", PCIDevice,
+                            failover_pair_id),
+         DEFINE_PROP_UINT32("acpi-index", PCIDevice, acpi_index, 0),
+    +    DEFINE_PROP_BIT("x-pcie-err-unc-mask", PCIDevice, cap_present,
+    +                    QEMU_PCIE_ERR_UNC_MASK_BITNR, true),
+         DEFINE_PROP_END_OF_LIST()
+     };
+
+Notice that we enable the feature for new machine types.
+
+Now we see how the fix is done. This is going to depend on what kind
+of breakage happens, but in this case it is quite simple::
+
+    diff --git a/hw/pci/pcie_aer.c b/hw/pci/pcie_aer.c
+    index 103667c368..374d593ead 100644
+    --- a/hw/pci/pcie_aer.c
+    +++ b/hw/pci/pcie_aer.c
+    @@ -112,10 +112,13 @@ int pcie_aer_init(PCIDevice *dev, uint8_t cap_ver,
+                           uint16_t offset,
+
+         pci_set_long(dev->w1cmask + offset + PCI_ERR_UNCOR_STATUS,
+                      PCI_ERR_UNC_SUPPORTED);
+    -    pci_set_long(dev->config + offset + PCI_ERR_UNCOR_MASK,
+    -                 PCI_ERR_UNC_MASK_DEFAULT);
+    -    pci_set_long(dev->wmask + offset + PCI_ERR_UNCOR_MASK,
+    -                 PCI_ERR_UNC_SUPPORTED);
+    +
+    +    if (dev->cap_present & QEMU_PCIE_ERR_UNC_MASK) {
+    +        pci_set_long(dev->config + offset + PCI_ERR_UNCOR_MASK,
+    +                     PCI_ERR_UNC_MASK_DEFAULT);
+    +        pci_set_long(dev->wmask + offset + PCI_ERR_UNCOR_MASK,
+    +                     PCI_ERR_UNC_SUPPORTED);
+    +    }
+
+         pci_set_long(dev->config + offset + PCI_ERR_UNCOR_SEVER,
+                      PCI_ERR_UNC_SEVERITY_DEFAULT);
+
+I.e. if the property bit is enabled, we configure it as we did for
+qemu-8.0; if the property bit is not set, we configure it as it was
+in 7.2.
+
+And now, all that is missing is disabling the feature for old machine
+types::
+
+    diff --git a/hw/core/machine.c b/hw/core/machine.c
+    index 47a34841a5..07f763eb2e 100644
+    --- a/hw/core/machine.c
+    +++ b/hw/core/machine.c
+    @@ -48,6 +48,7 @@ GlobalProperty hw_compat_7_2[] = {
+         { "e1000e", "migrate-timadj", "off" },
+         { "virtio-mem", "x-early-migration", "false" },
+         { "migration", "x-preempt-pre-7-2", "true" },
+    +    { TYPE_PCI_DEVICE, "x-pcie-err-unc-mask", "off" },
+     };
+     const size_t hw_compat_7_2_len = G_N_ELEMENTS(hw_compat_7_2);
+
+And now, when qemu-8.0.1 is released with this fix, all combinations
+are going to work as intended.
+
+- $ qemu-7.2 -M pc-7.2   ->  qemu-7.2 -M pc-7.2 (works)
+- $ qemu-8.0.1 -M pc-7.2 ->  qemu-8.0.1 -M pc-7.2 (works)
+- $ qemu-8.0.1 -M pc-7.2 ->  qemu-7.2 -M pc-7.2 (works)
+- $ qemu-7.2 -M pc-7.2   ->  qemu-8.0.1 -M pc-7.2 (works)
+
+So normality has been restored and everything is ok, no?
+
+Not really; now our matrix is much bigger. We started with the easy
+cases, migration from the same version to the same version always
+works:
+
+- $ qemu-7.2 -M pc-7.2   ->  qemu-7.2 -M pc-7.2
+- $ qemu-8.0 -M pc-7.2   ->  qemu-8.0 -M pc-7.2
+- $ qemu-8.0.1 -M pc-7.2 ->  qemu-8.0.1 -M pc-7.2
+
+Now the interesting ones, when the QEMU process versions are
+different. The first pair fail, and we can do nothing about it: both
+versions are released and we can't change anything.
+
+- $ qemu-7.2 -M pc-7.2 ->  qemu-8.0 -M pc-7.2
+- $ qemu-8.0 -M pc-7.2 ->  qemu-7.2 -M pc-7.2
+
+These two are the ones that work.
+The whole point of making the change in the qemu-8.0.1 release was to
+fix this pair:
+
+- $ qemu-7.2 -M pc-7.2   ->  qemu-8.0.1 -M pc-7.2
+- $ qemu-8.0.1 -M pc-7.2 ->  qemu-7.2 -M pc-7.2
+
+But now we find that qemu-8.0 can migrate neither to qemu-7.2 nor to
+qemu-8.0.1.
+
+- $ qemu-8.0 -M pc-7.2   ->  qemu-8.0.1 -M pc-7.2
+- $ qemu-8.0.1 -M pc-7.2 ->  qemu-8.0 -M pc-7.2
+
+So, if we start a pc-7.2 machine in qemu-8.0 we can't migrate it to
+anything except another qemu-8.0.
+
+Can we do better?
+
+Yes. If we know that we are going to do this migration:
+
+- $ qemu-8.0 -M pc-7.2 ->  qemu-8.0.1 -M pc-7.2
+
+we can launch the appropriate devices with::
+
+    --device...,x-pcie-err-unc-mask=on
+
+And now we can receive a migration from 8.0. And from then on, we can
+do that migration to new machine types if we remember to enable that
+property for pc-7.2. Notice that we really do need to remember; it is
+not enough to know that the source of the migration is qemu-8.0.
+Think of this example:
+
+$ qemu-8.0 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 -> qemu-8.2 -M pc-7.2
+
+In the second migration the source is not qemu-8.0, but we still have
+that "problem", so we keep that property enabled. Notice that we need
+to keep this mark/property until the machine has been rebooted. And a
+normal reboot (which doesn't reload QEMU) is not enough: we need the
+machine to power off and power on again on a fixed QEMU. From then on
+we can use the proper real machine.
diff --git a/docs/devel/migration/dirty-limit.rst b/docs/devel/migration/dirty-limit.rst
new file mode 100644
index 0000000000000000000000000000000000000000..8f32329d5fda2e239d378c0bc4662fd8cf6d7d23
--- /dev/null
+++ b/docs/devel/migration/dirty-limit.rst
@@ -0,0 +1,71 @@
+Dirty limit
+===========
+
+The dirty limit, short for dirty page rate upper limit, is a new
+capability introduced in the 8.1 QEMU release that uses a new
+algorithm based on the KVM dirty ring to throttle down the guest
+during live migration.
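+A quick way to see the feature in action is to set a per-vCPU dirty
+page rate limit via QMP (a hedged sketch using scripts/qmp/qmp-shell;
+the 200 MB/s value is an arbitrary example):
+
+.. code-block:: shell
+
+    (QEMU) set-vcpu-dirty-limit dirty-rate=200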
+
+The algorithm framework is as follows:
+
+::
+
+  ------------------------------------------------------------------------------
+  main   --------------> throttle thread ------------> PREPARE(1) <--------
+  thread  \                                                |              |
+           \                                               |              |
+            \                                              V              |
+             -\                                        CALCULATE(2)       |
+               \                                           |              |
+                \                                          |              |
+                 \                                         V              |
+                  \                                    SET PENALTY(3) -----
+                   -\                                      |
+                     \                                     |
+                      \                                    V
+                       -> virtual CPU thread -------> ACCEPT PENALTY(4)
+  ------------------------------------------------------------------------------
+
+When the qmp command qmp_set_vcpu_dirty_limit is called for the first
+time, the QEMU main thread starts the throttle thread. The throttle
+thread, once launched, executes the loop, which consists of three
+steps:
+
+  - PREPARE (1)
+
+     The entire work of PREPARE (1) is preparation for the second
+     stage, CALCULATE(2), as the name implies. It involves preparing
+     the dirty page rate value and the corresponding upper limit of
+     the VM: the dirty page rate is calculated via the KVM dirty ring
+     mechanism, which tells QEMU how many dirty pages a virtual CPU
+     has had since the last KVM_EXIT_DIRTY_RING_FULL exception; the
+     dirty page rate upper limit is specified by the caller, so it is
+     fetched directly.
+
+  - CALCULATE (2)
+
+     Calculate a suitable sleep period for each virtual CPU, which
+     will be used to determine the penalty for the target virtual CPU.
+     The computation must be done carefully in order to reduce the
+     dirty page rate progressively down to the upper limit without
+     oscillation. To achieve this, two strategies are provided: the
+     first is to add or subtract sleep time based on the ratio of the
+     current dirty page rate to the limit, which is used when the
+     current dirty page rate is far from the limit; the second is to
+     add or subtract a fixed time when the current dirty page rate is
+     close to the limit.
+
+  - SET PENALTY (3)
+
+     Set the sleep time for each virtual CPU that should be penalized
+     based on the results of the calculation supplied by step
+     CALCULATE (2).
+
+After completing the above three stages, the throttle thread loops
+back to step PREPARE (1) until the dirty limit is reached.
+
+On the other hand, each virtual CPU thread reads the sleep duration
+and sleeps in the path of the KVM_EXIT_DIRTY_RING_FULL exception
+handler; that is ACCEPT PENALTY (4). Virtual CPUs running write-heavy
+processes will naturally take this exit path and get penalized,
+whereas virtual CPUs running read-mostly processes will not.
+
+In summary, thanks to the KVM dirty ring technology, the dirty limit
+algorithm will restrict virtual CPUs as needed to keep their dirty
+page rate inside the limit. This leads to more steady read
+performance during live migration and can help improve the
+responsiveness of large guests.
diff --git a/docs/devel/migration/features.rst b/docs/devel/migration/features.rst
new file mode 100644
index 0000000000000000000000000000000000000000..7c5ce9e79d4eafe18c9c01b83ddccd5fcc28abbf
--- /dev/null
+++ b/docs/devel/migration/features.rst
@@ -0,0 +1,15 @@
+Migration features
+==================
+
+Migration has plenty of features to support different use cases.
+
+.. toctree::
+   :maxdepth: 2
+
+   postcopy
+   dirty-limit
+   vfio
+   virtio
+   qpl-compression
+   uadk-compression
+   qatzip-compression
diff --git a/docs/devel/migration/index.rst b/docs/devel/migration/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2aa294d6314bd4cb18623e0caf9adc6d93fd030e
--- /dev/null
+++ b/docs/devel/migration/index.rst
@@ -0,0 +1,13 @@
+Migration
+=========
+
+This is the main entry point for the QEMU migration documentation. It
+explains how QEMU live migration works.
+
+.. toctree::
+   :maxdepth: 2
+
+   main
+   features
+   compatibility
+   best-practices
diff --git a/docs/devel/migration.rst b/docs/devel/migration/main.rst
similarity index 38%
rename from docs/devel/migration.rst
rename to docs/devel/migration/main.rst
index ec55089b2530a0296f858b1f078b557d12e154b7..396c7c51ca8552e7d542201bd21da54d1e96439b 100644
--- a/docs/devel/migration.rst
+++ b/docs/devel/migration/main.rst
@@ -1,6 +1,6 @@
-=========
-Migration
-=========
+===================
+Migration framework
+===================
 
 QEMU has code to load/save the state of the guest that it is running.
 These are two complementary operations. Saving the state just does
@@ -52,27 +52,6 @@ All these migration protocols use the same infrastructure to
 save/restore state devices. This infrastructure is shared with the
 savevm/loadvm functionality.
 
-Debugging
-=========
-
-The migration stream can be analyzed thanks to ``scripts/analyze-migration.py``.
-
-Example usage:
-
-.. code-block:: shell
-
-  $ qemu-system-x86_64 -display none -monitor stdio
-  (qemu) migrate "exec:cat > mig"
-  (qemu) q
-  $ ./scripts/analyze-migration.py -f mig
-  {
-  "ram (3)": {
-      "section sizes": {
-          "pc.ram": "0x0000000008000000",
-  ...
-
-See also ``analyze-migration.py -h`` help for more options.
-
 Common infrastructure
 =====================
 
@@ -594,921 +573,3 @@ path.
Return path - opened by main thread, written by main thread AND postcopy thread (protected by rp_mutex) -Dirty limit -===================== -The dirty limit, short for dirty page rate upper limit, is a new capability -introduced in the 8.1 QEMU release that uses a new algorithm based on the KVM -dirty ring to throttle down the guest during live migration. - -The algorithm framework is as follows: - -:: - - ------------------------------------------------------------------------------ - main --------------> throttle thread ------------> PREPARE(1) <-------- - thread \ | | - \ | | - \ V | - -\ CALCULATE(2) | - \ | | - \ | | - \ V | - \ SET PENALTY(3) ----- - -\ | - \ | - \ V - -> virtual CPU thread -------> ACCEPT PENALTY(4) - ------------------------------------------------------------------------------ - -When the qmp command qmp_set_vcpu_dirty_limit is called for the first time, -the QEMU main thread starts the throttle thread. The throttle thread, once -launched, executes the loop, which consists of three steps: - - - PREPARE (1) - - The entire work of PREPARE (1) is preparation for the second stage, - CALCULATE(2), as the name implies. It involves preparing the dirty - page rate value and the corresponding upper limit of the VM: - The dirty page rate is calculated via the KVM dirty ring mechanism, - which tells QEMU how many dirty pages a virtual CPU has had since the - last KVM_EXIT_DIRTY_RING_FULL exception; The dirty page rate upper - limit is specified by caller, therefore fetch it directly. - - - CALCULATE (2) - - Calculate a suitable sleep period for each virtual CPU, which will be - used to determine the penalty for the target virtual CPU. The - computation must be done carefully in order to reduce the dirty page - rate progressively down to the upper limit without oscillation. To - achieve this, two strategies are provided: the first is to add or - subtract sleep time based on the ratio of the current dirty page rate - to the limit, which is used when the current dirty page rate is far - from the limit; the second is to add or subtract a fixed time when - the current dirty page rate is close to the limit. - - - SET PENALTY (3) - - Set the sleep time for each virtual CPU that should be penalized based - on the results of the calculation supplied by step CALCULATE (2). - -After completing the three above stages, the throttle thread loops back -to step PREPARE (1) until the dirty limit is reached. - -On the other hand, each virtual CPU thread reads the sleep duration and -sleeps in the path of the KVM_EXIT_DIRTY_RING_FULL exception handler, that -is ACCEPT PENALTY (4). Virtual CPUs tied with writing processes will -obviously exit to the path and get penalized, whereas virtual CPUs involved -with read processes will not. - -In summary, thanks to the KVM dirty ring technology, the dirty limit -algorithm will restrict virtual CPUs as needed to keep their dirty page -rate inside the limit. This leads to more steady reading performance during -live migration and can aid in improving large guest responsiveness. - -Postcopy -======== - -'Postcopy' migration is a way to deal with migrations that refuse to converge -(or take too long to converge) its plus side is that there is an upper bound on -the amount of migration traffic and time it takes, the down side is that during -the postcopy phase, a failure of *either* side causes the guest to be lost. 
- -In postcopy the destination CPUs are started before all the memory has been -transferred, and accesses to pages that are yet to be transferred cause -a fault that's translated by QEMU into a request to the source QEMU. - -Postcopy can be combined with precopy (i.e. normal migration) so that if precopy -doesn't finish in a given time the switch is made to postcopy. - -Enabling postcopy ------------------ - -To enable postcopy, issue this command on the monitor (both source and -destination) prior to the start of migration: - -``migrate_set_capability postcopy-ram on`` - -The normal commands are then used to start a migration, which is still -started in precopy mode. Issuing: - -``migrate_start_postcopy`` - -will now cause the transition from precopy to postcopy. -It can be issued immediately after migration is started or any -time later on. Issuing it after the end of a migration is harmless. - -Blocktime is a postcopy live migration metric, intended to show how -long the vCPU was in state of interruptible sleep due to pagefault. -That metric is calculated both for all vCPUs as overlapped value, and -separately for each vCPU. These values are calculated on destination -side. To enable postcopy blocktime calculation, enter following -command on destination monitor: - -``migrate_set_capability postcopy-blocktime on`` - -Postcopy blocktime can be retrieved by query-migrate qmp command. -postcopy-blocktime value of qmp command will show overlapped blocking -time for all vCPU, postcopy-vcpu-blocktime will show list of blocking -time per vCPU. - -.. note:: - During the postcopy phase, the bandwidth limits set using - ``migrate_set_parameter`` is ignored (to avoid delaying requested pages that - the destination is waiting for). - -Postcopy device transfer ------------------------- - -Loading of device data may cause the device emulation to access guest RAM -that may trigger faults that have to be resolved by the source, as such -the migration stream has to be able to respond with page data *during* the -device load, and hence the device data has to be read from the stream completely -before the device load begins to free the stream up. This is achieved by -'packaging' the device data into a blob that's read in one go. - -Source behaviour ----------------- - -Until postcopy is entered the migration stream is identical to normal -precopy, except for the addition of a 'postcopy advise' command at -the beginning, to tell the destination that postcopy might happen. -When postcopy starts the source sends the page discard data and then -forms the 'package' containing: - - - Command: 'postcopy listen' - - The device state - - A series of sections, identical to the precopy streams device state stream - containing everything except postcopiable devices (i.e. RAM) - - Command: 'postcopy run' - -The 'package' is sent as the data part of a Command: ``CMD_PACKAGED``, and the -contents are formatted in the same way as the main migration stream. - -During postcopy the source scans the list of dirty pages and sends them -to the destination without being requested (in much the same way as precopy), -however when a page request is received from the destination, the dirty page -scanning restarts from the requested location. This causes requested pages -to be sent quickly, and also causes pages directly after the requested page -to be sent quickly in the hope that those pages are likely to be used -by the destination soon. 
- -Destination behaviour ---------------------- - -Initially the destination looks the same as precopy, with a single thread -reading the migration stream; the 'postcopy advise' and 'discard' commands -are processed to change the way RAM is managed, but don't affect the stream -processing. - -:: - - ------------------------------------------------------------------------------ - 1 2 3 4 5 6 7 - main -----DISCARD-CMD_PACKAGED ( LISTEN DEVICE DEVICE DEVICE RUN ) - thread | | - | (page request) - | \___ - v \ - listen thread: --- page -- page -- page -- page -- page -- - - a b c - ------------------------------------------------------------------------------ - -- On receipt of ``CMD_PACKAGED`` (1) - - All the data associated with the package - the ( ... ) section in the diagram - - is read into memory, and the main thread recurses into qemu_loadvm_state_main - to process the contents of the package (2) which contains commands (3,6) and - devices (4...) - -- On receipt of 'postcopy listen' - 3 -(i.e. the 1st command in the package) - - a new thread (a) is started that takes over servicing the migration stream, - while the main thread carries on loading the package. It loads normal - background page data (b) but if during a device load a fault happens (5) - the returned page (c) is loaded by the listen thread allowing the main - threads device load to carry on. - -- The last thing in the ``CMD_PACKAGED`` is a 'RUN' command (6) - - letting the destination CPUs start running. At the end of the - ``CMD_PACKAGED`` (7) the main thread returns to normal running behaviour and - is no longer used by migration, while the listen thread carries on servicing - page data until the end of migration. - -Postcopy Recovery ------------------ - -Comparing to precopy, postcopy is special on error handlings. When any -error happens (in this case, mostly network errors), QEMU cannot easily -fail a migration because VM data resides in both source and destination -QEMU instances. On the other hand, when issue happens QEMU on both sides -will go into a paused state. It'll need a recovery phase to continue a -paused postcopy migration. - -The recovery phase normally contains a few steps: - - - When network issue occurs, both QEMU will go into PAUSED state - - - When the network is recovered (or a new network is provided), the admin - can setup the new channel for migration using QMP command - 'migrate-recover' on destination node, preparing for a resume. - - - On source host, the admin can continue the interrupted postcopy - migration using QMP command 'migrate' with resume=true flag set. - - - After the connection is re-established, QEMU will continue the postcopy - migration on both sides. - -During a paused postcopy migration, the VM can logically still continue -running, and it will not be impacted from any page access to pages that -were already migrated to destination VM before the interruption happens. -However, if any of the missing pages got accessed on destination VM, the VM -thread will be halted waiting for the page to be migrated, it means it can -be halted until the recovery is complete. - -The impact of accessing missing pages can be relevant to different -configurations of the guest. For example, when with async page fault -enabled, logically the guest can proactively schedule out the threads -accessing missing pages. 
- -Postcopy states ---------------- - -Postcopy moves through a series of states (see postcopy_state) from -ADVISE->DISCARD->LISTEN->RUNNING->END - - - Advise - - Set at the start of migration if postcopy is enabled, even - if it hasn't had the start command; here the destination - checks that its OS has the support needed for postcopy, and performs - setup to ensure the RAM mappings are suitable for later postcopy. - The destination will fail early in migration at this point if the - required OS support is not present. - (Triggered by reception of POSTCOPY_ADVISE command) - - - Discard - - Entered on receipt of the first 'discard' command; prior to - the first Discard being performed, hugepages are switched off - (using madvise) to ensure that no new huge pages are created - during the postcopy phase, and to cause any huge pages that - have discards on them to be broken. - - - Listen - - The first command in the package, POSTCOPY_LISTEN, switches - the destination state to Listen, and starts a new thread - (the 'listen thread') which takes over the job of receiving - pages off the migration stream, while the main thread carries - on processing the blob. With this thread able to process page - reception, the destination now 'sensitises' the RAM to detect - any access to missing pages (on Linux using the 'userfault' - system). - - - Running - - POSTCOPY_RUN causes the destination to synchronise all - state and start the CPUs and IO devices running. The main - thread now finishes processing the migration package and - now carries on as it would for normal precopy migration - (although it can't do the cleanup it would do as it - finishes a normal migration). - - - Paused - - Postcopy can run into a paused state (normally on both sides when - happens), where all threads will be temporarily halted mostly due to - network errors. When reaching paused state, migration will make sure - the qemu binary on both sides maintain the data without corrupting - the VM. To continue the migration, the admin needs to fix the - migration channel using the QMP command 'migrate-recover' on the - destination node, then resume the migration using QMP command 'migrate' - again on source node, with resume=true flag set. - - - End - - The listen thread can now quit, and perform the cleanup of migration - state, the migration is now complete. - -Source side page map --------------------- - -The 'migration bitmap' in postcopy is basically the same as in the precopy, -where each of the bit to indicate that page is 'dirty' - i.e. needs -sending. During the precopy phase this is updated as the CPU dirties -pages, however during postcopy the CPUs are stopped and nothing should -dirty anything any more. Instead, dirty bits are cleared when the relevant -pages are sent during postcopy. - -Postcopy with hugepages ------------------------ - -Postcopy now works with hugetlbfs backed memory: - - a) The linux kernel on the destination must support userfault on hugepages. - b) The huge-page configuration on the source and destination VMs must be - identical; i.e. RAMBlocks on both sides must use the same page size. - c) Note that ``-mem-path /dev/hugepages`` will fall back to allocating normal - RAM if it doesn't have enough hugepages, triggering (b) to fail. - Using ``-mem-prealloc`` enforces the allocation using hugepages. 
- d) Care should be taken with the size of hugepage used; postcopy with 2MB - hugepages works well, however 1GB hugepages are likely to be problematic - since it takes ~1 second to transfer a 1GB hugepage across a 10Gbps link, - and until the full page is transferred the destination thread is blocked. - -Postcopy with shared memory ---------------------------- - -Postcopy migration with shared memory needs explicit support from the other -processes that share memory and from QEMU. There are restrictions on the type of -memory that userfault can support shared. - -The Linux kernel userfault support works on ``/dev/shm`` memory and on ``hugetlbfs`` -(although the kernel doesn't provide an equivalent to ``madvise(MADV_DONTNEED)`` -for hugetlbfs which may be a problem in some configurations). - -The vhost-user code in QEMU supports clients that have Postcopy support, -and the ``vhost-user-bridge`` (in ``tests/``) and the DPDK package have changes -to support postcopy. - -The client needs to open a userfaultfd and register the areas -of memory that it maps with userfault. The client must then pass the -userfaultfd back to QEMU together with a mapping table that allows -fault addresses in the clients address space to be converted back to -RAMBlock/offsets. The client's userfaultfd is added to the postcopy -fault-thread and page requests are made on behalf of the client by QEMU. -QEMU performs 'wake' operations on the client's userfaultfd to allow it -to continue after a page has arrived. - -.. note:: - There are two future improvements that would be nice: - a) Some way to make QEMU ignorant of the addresses in the clients - address space - b) Avoiding the need for QEMU to perform ufd-wake calls after the - pages have arrived - -Retro-fitting postcopy to existing clients is possible: - a) A mechanism is needed for the registration with userfault as above, - and the registration needs to be coordinated with the phases of - postcopy. In vhost-user extra messages are added to the existing - control channel. - b) Any thread that can block due to guest memory accesses must be - identified and the implication understood; for example if the - guest memory access is made while holding a lock then all other - threads waiting for that lock will also be blocked. - -Postcopy Preemption Mode ------------------------- - -Postcopy preempt is a new capability introduced in 8.0 QEMU release, it -allows urgent pages (those got page fault requested from destination QEMU -explicitly) to be sent in a separate preempt channel, rather than queued in -the background migration channel. Anyone who cares about latencies of page -faults during a postcopy migration should enable this feature. By default, -it's not enabled. - -Firmware -======== - -Migration migrates the copies of RAM and ROM, and thus when running -on the destination it includes the firmware from the source. Even after -resetting a VM, the old firmware is used. Only once QEMU has been restarted -is the new firmware in use. - -- Changes in firmware size can cause changes in the required RAMBlock size - to hold the firmware and thus migration can fail. In practice it's best - to pad firmware images to convenient powers of 2 with plenty of space - for growth. - -- Care should be taken with device emulation code so that newer - emulation code can work with older firmware to allow forward migration. - -- Care should be taken with newer firmware so that backward migration - to older systems with older device emulation code will work. 
- -In some cases it may be best to tie specific firmware versions to specific -versioned machine types to cut down on the combinations that will need -support. This is also useful when newer versions of firmware outgrow -the padding. - - -Backwards compatibility -======================= - -How backwards compatibility works ---------------------------------- - -When we do migration, we have two QEMU processes: the source and the -target. There are two cases, they are the same version or they are -different versions. The easy case is when they are the same version. -The difficult one is when they are different versions. - -There are two things that are different, but they have very similar -names and sometimes get confused: - -- QEMU version -- machine type version - -Let's start with a practical example, we start with: - -- qemu-system-x86_64 (v5.2), from now on qemu-5.2. -- qemu-system-x86_64 (v5.1), from now on qemu-5.1. - -Related to this are the "latest" machine types defined on each of -them: - -- pc-q35-5.2 (newer one in qemu-5.2) from now on pc-5.2 -- pc-q35-5.1 (newer one in qemu-5.1) from now on pc-5.1 - -First of all, migration is only supposed to work if you use the same -machine type in both source and destination. The QEMU hardware -configuration needs to be the same also on source and destination. -Most aspects of the backend configuration can be changed at will, -except for a few cases where the backend features influence frontend -device feature exposure. But that is not relevant for this section. - -I am going to list the number of combinations that we can have. Let's -start with the trivial ones, QEMU is the same on source and -destination: - -1 - qemu-5.2 -M pc-5.2 -> migrates to -> qemu-5.2 -M pc-5.2 - - This is the latest QEMU with the latest machine type. - This have to work, and if it doesn't work it is a bug. - -2 - qemu-5.1 -M pc-5.1 -> migrates to -> qemu-5.1 -M pc-5.1 - - Exactly the same case than the previous one, but for 5.1. - Nothing to see here either. - -This are the easiest ones, we will not talk more about them in this -section. - -Now we start with the more interesting cases. Consider the case where -we have the same QEMU version in both sides (qemu-5.2) but we are using -the latest machine type for that version (pc-5.2) but one of an older -QEMU version, in this case pc-5.1. - -3 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1 - - It needs to use the definition of pc-5.1 and the devices as they - were configured on 5.1, but this should be easy in the sense that - both sides are the same QEMU and both sides have exactly the same - idea of what the pc-5.1 machine is. - -4 - qemu-5.1 -M pc-5.2 -> migrates to -> qemu-5.1 -M pc-5.2 - - This combination is not possible as the qemu-5.1 doesn't understand - pc-5.2 machine type. So nothing to worry here. - -Now it comes the interesting ones, when both QEMU processes are -different. Notice also that the machine type needs to be pc-5.1, -because we have the limitation than qemu-5.1 doesn't know pc-5.2. So -the possible cases are: - -5 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.1 -M pc-5.1 - - This migration is known as newer to older. We need to make sure - when we are developing 5.2 we need to take care about not to break - migration to qemu-5.1. Notice that we can't make updates to - qemu-5.1 to understand whatever qemu-5.2 decides to change, so it is - in qemu-5.2 side to make the relevant changes. 
- -6 - qemu-5.1 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1 - - This migration is known as older to newer. We need to make sure - than we are able to receive migrations from qemu-5.1. The problem is - similar to the previous one. - -If qemu-5.1 and qemu-5.2 were the same, there will not be any -compatibility problems. But the reason that we create qemu-5.2 is to -get new features, devices, defaults, etc. - -If we get a device that has a new feature, or change a default value, -we have a problem when we try to migrate between different QEMU -versions. - -So we need a way to tell qemu-5.2 that when we are using machine type -pc-5.1, it needs to **not** use the feature, to be able to migrate to -real qemu-5.1. - -And the equivalent part when migrating from qemu-5.1 to qemu-5.2. -qemu-5.2 has to expect that it is not going to get data for the new -feature, because qemu-5.1 doesn't know about it. - -How do we tell QEMU about these device feature changes? In -hw/core/machine.c:hw_compat_X_Y arrays. - -If we change a default value, we need to put back the old value on -that array. And the device, during initialization needs to look at -that array to see what value it needs to get for that feature. And -what are we going to put in that array, the value of a property. - -To create a property for a device, we need to use one of the -DEFINE_PROP_*() macros. See include/hw/qdev-properties.h to find the -macros that exist. With it, we set the default value for that -property, and that is what it is going to get in the latest released -version. But if we want a different value for a previous version, we -can change that in the hw_compat_X_Y arrays. - -hw_compat_X_Y is an array of registers that have the format: - -- name_device -- name_property -- value - -Let's see a practical example. - -In qemu-5.2 virtio-blk-device got multi queue support. This is a -change that is not backward compatible. In qemu-5.1 it has one -queue. In qemu-5.2 it has the same number of queues as the number of -cpus in the system. - -When we are doing migration, if we migrate from a device that has 4 -queues to a device that have only one queue, we don't know where to -put the extra information for the other 3 queues, and we fail -migration. - -Similar problem when we migrate from qemu-5.1 that has only one queue -to qemu-5.2, we only sent information for one queue, but destination -has 4, and we have 3 queues that are not properly initialized and -anything can happen. - -So, how can we address this problem. Easy, just convince qemu-5.2 -that when it is running pc-5.1, it needs to set the number of queues -for virtio-blk-devices to 1. - -That way we fix the cases 5 and 6. - -5 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.1 -M pc-5.1 - - qemu-5.2 -M pc-5.1 sets number of queues to be 1. - qemu-5.1 -M pc-5.1 expects number of queues to be 1. - - correct. migration works. - -6 - qemu-5.1 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1 - - qemu-5.1 -M pc-5.1 sets number of queues to be 1. - qemu-5.2 -M pc-5.1 expects number of queues to be 1. - - correct. migration works. - -And now the other interesting case, case 3. In this case we have: - -3 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1 - - Here we have the same QEMU in both sides. So it doesn't matter a - lot if we have set the number of queues to 1 or not, because - they are the same. - - WRONG! 
- - Think what happens if we do one of this double migrations: - - A -> migrates -> B -> migrates -> C - - where: - - A: qemu-5.1 -M pc-5.1 - B: qemu-5.2 -M pc-5.1 - C: qemu-5.2 -M pc-5.1 - - migration A -> B is case 6, so number of queues needs to be 1. - - migration B -> C is case 3, so we don't care. But actually we - care because we haven't started the guest in qemu-5.2, it came - migrated from qemu-5.1. So to be in the safe place, we need to - always use number of queues 1 when we are using pc-5.1. - -Now, how was this done in reality? The following commit shows how it -was done:: - - commit 9445e1e15e66c19e42bea942ba810db28052cd05 - Author: Stefan Hajnoczi - Date: Tue Aug 18 15:33:47 2020 +0100 - - virtio-blk-pci: default num_queues to -smp N - -The relevant parts for migration are:: - - @@ -1281,7 +1284,8 @@ static Property virtio_blk_properties[] = { - #endif - DEFINE_PROP_BIT("request-merging", VirtIOBlock, conf.request_merging, 0, - true), - - DEFINE_PROP_UINT16("num-queues", VirtIOBlock, conf.num_queues, 1), - + DEFINE_PROP_UINT16("num-queues", VirtIOBlock, conf.num_queues, - + VIRTIO_BLK_AUTO_NUM_QUEUES), - DEFINE_PROP_UINT16("queue-size", VirtIOBlock, conf.queue_size, 256), - -It changes the default value of num_queues. But it fishes it for old -machine types to have the right value:: - - @@ -31,6 +31,7 @@ - GlobalProperty hw_compat_5_1[] = { - ... - + { "virtio-blk-device", "num-queues", "1"}, - ... - }; - -A device with different features on both sides ----------------------------------------------- - -Let's assume that we are using the same QEMU binary on both sides, -just to make the things easier. But we have a device that has -different features on both sides of the migration. That can be -because the devices are different, because the kernel driver of both -devices have different features, whatever. - -How can we get this to work with migration. The way to do that is -"theoretically" easy. You have to get the features that the device -has in the source of the migration. The features that the device has -on the target of the migration, you get the intersection of the -features of both sides, and that is the way that you should launch -QEMU. - -Notice that this is not completely related to QEMU. The most -important thing here is that this should be handled by the managing -application that launches QEMU. If QEMU is configured correctly, the -migration will succeed. - -That said, actually doing it is complicated. Almost all devices are -bad at being able to be launched with only some features enabled. -With one big exception: cpus. - -You can read the documentation for QEMU x86 cpu models here: - -https://qemu-project.gitlab.io/qemu/system/qemu-cpu-models.html - -See when they talk about migration they recommend that one chooses the -newest cpu model that is supported for all cpus. - -Let's say that we have: - -Host A: - -Device X has the feature Y - -Host B: - -Device X has not the feature Y - -If we try to migrate without any care from host A to host B, it will -fail because when migration tries to load the feature Y on -destination, it will find that the hardware is not there. - -Doing this would be the equivalent of doing with cpus: - -Host A: - -$ qemu-system-x86_64 -cpu host - -Host B: - -$ qemu-system-x86_64 -cpu host - -When both hosts have different cpu features this is guaranteed to -fail. Especially if Host B has less features than host A. If host A -has less features than host B, sometimes it works. Important word of -last sentence is "sometimes". 
- -So, forgetting about cpu models and continuing with the -cpu host -example, let's see that the differences of the cpus is that Host A and -B have the following features: - -Features: 'pcid' 'stibp' 'taa-no' -Host A: X X -Host B: X - -And we want to migrate between them, the way configure both QEMU cpu -will be: - -Host A: - -$ qemu-system-x86_64 -cpu host,pcid=off,stibp=off - -Host B: - -$ qemu-system-x86_64 -cpu host,taa-no=off - -And you would be able to migrate between them. It is responsibility -of the management application or of the user to make sure that the -configuration is correct. QEMU doesn't know how to look at this kind -of features in general. - -Notice that we don't recommend to use -cpu host for migration. It is -used in this example because it makes the example simpler. - -Other devices have worse control about individual features. If they -want to be able to migrate between hosts that show different features, -the device needs a way to configure which ones it is going to use. - -In this section we have considered that we are using the same QEMU -binary in both sides of the migration. If we use different QEMU -versions process, then we need to have into account all other -differences and the examples become even more complicated. - -How to mitigate when we have a backward compatibility error ------------------------------------------------------------ - -We broke migration for old machine types continuously during -development. But as soon as we find that there is a problem, we fix -it. The problem is what happens when we detect after we have done a -release that something has gone wrong. - -Let see how it worked with one example. - -After the release of qemu-8.0 we found a problem when doing migration -of the machine type pc-7.2. - -- $ qemu-7.2 -M pc-7.2 -> qemu-7.2 -M pc-7.2 - - This migration works - -- $ qemu-8.0 -M pc-7.2 -> qemu-8.0 -M pc-7.2 - - This migration works - -- $ qemu-8.0 -M pc-7.2 -> qemu-7.2 -M pc-7.2 - - This migration fails - -- $ qemu-7.2 -M pc-7.2 -> qemu-8.0 -M pc-7.2 - - This migration fails - -So clearly something fails when migration between qemu-7.2 and -qemu-8.0 with machine type pc-7.2. The error messages, and git bisect -pointed to this commit. - -In qemu-8.0 we got this commit:: - - commit 010746ae1db7f52700cb2e2c46eb94f299cfa0d2 - Author: Jonathan Cameron - Date: Thu Mar 2 13:37:02 2023 +0000 - - hw/pci/aer: Implement PCI_ERR_UNCOR_MASK register - - -The relevant bits of the commit for our example are this ones:: - - --- a/hw/pci/pcie_aer.c - +++ b/hw/pci/pcie_aer.c - @@ -112,6 +112,10 @@ int pcie_aer_init(PCIDevice *dev, - - pci_set_long(dev->w1cmask + offset + PCI_ERR_UNCOR_STATUS, - PCI_ERR_UNC_SUPPORTED); - + pci_set_long(dev->config + offset + PCI_ERR_UNCOR_MASK, - + PCI_ERR_UNC_MASK_DEFAULT); - + pci_set_long(dev->wmask + offset + PCI_ERR_UNCOR_MASK, - + PCI_ERR_UNC_SUPPORTED); - - pci_set_long(dev->config + offset + PCI_ERR_UNCOR_SEVER, - PCI_ERR_UNC_SEVERITY_DEFAULT); - -The patch changes how we configure PCI space for AER. But QEMU fails -when the PCI space configuration is different between source and -destination. - -The following commit shows how this got fixed:: - - commit 5ed3dabe57dd9f4c007404345e5f5bf0e347317f - Author: Leonardo Bras - Date: Tue May 2 21:27:02 2023 -0300 - - hw/pci: Disable PCI_ERR_UNCOR_MASK register for machine type < 8.0 - - [...] 
- -The relevant parts of the fix in QEMU are as follow: - -First, we create a new property for the device to be able to configure -the old behaviour or the new behaviour:: - - diff --git a/hw/pci/pci.c b/hw/pci/pci.c - index 8a87ccc8b0..5153ad63d6 100644 - --- a/hw/pci/pci.c - +++ b/hw/pci/pci.c - @@ -79,6 +79,8 @@ static Property pci_props[] = { - DEFINE_PROP_STRING("failover_pair_id", PCIDevice, - failover_pair_id), - DEFINE_PROP_UINT32("acpi-index", PCIDevice, acpi_index, 0), - + DEFINE_PROP_BIT("x-pcie-err-unc-mask", PCIDevice, cap_present, - + QEMU_PCIE_ERR_UNC_MASK_BITNR, true), - DEFINE_PROP_END_OF_LIST() - }; - -Notice that we enable the feature for new machine types. - -Now we see how the fix is done. This is going to depend on what kind -of breakage happens, but in this case it is quite simple:: - - diff --git a/hw/pci/pcie_aer.c b/hw/pci/pcie_aer.c - index 103667c368..374d593ead 100644 - --- a/hw/pci/pcie_aer.c - +++ b/hw/pci/pcie_aer.c - @@ -112,10 +112,13 @@ int pcie_aer_init(PCIDevice *dev, uint8_t cap_ver, - uint16_t offset, - - pci_set_long(dev->w1cmask + offset + PCI_ERR_UNCOR_STATUS, - PCI_ERR_UNC_SUPPORTED); - - pci_set_long(dev->config + offset + PCI_ERR_UNCOR_MASK, - - PCI_ERR_UNC_MASK_DEFAULT); - - pci_set_long(dev->wmask + offset + PCI_ERR_UNCOR_MASK, - - PCI_ERR_UNC_SUPPORTED); - + - + if (dev->cap_present & QEMU_PCIE_ERR_UNC_MASK) { - + pci_set_long(dev->config + offset + PCI_ERR_UNCOR_MASK, - + PCI_ERR_UNC_MASK_DEFAULT); - + pci_set_long(dev->wmask + offset + PCI_ERR_UNCOR_MASK, - + PCI_ERR_UNC_SUPPORTED); - + } - - pci_set_long(dev->config + offset + PCI_ERR_UNCOR_SEVER, - PCI_ERR_UNC_SEVERITY_DEFAULT); - -I.e. If the property bit is enabled, we configure it as we did for -qemu-8.0. If the property bit is not set, we configure it as it was in 7.2. - -And now, everything that is missing is disabling the feature for old -machine types:: - - diff --git a/hw/core/machine.c b/hw/core/machine.c - index 47a34841a5..07f763eb2e 100644 - --- a/hw/core/machine.c - +++ b/hw/core/machine.c - @@ -48,6 +48,7 @@ GlobalProperty hw_compat_7_2[] = { - { "e1000e", "migrate-timadj", "off" }, - { "virtio-mem", "x-early-migration", "false" }, - { "migration", "x-preempt-pre-7-2", "true" }, - + { TYPE_PCI_DEVICE, "x-pcie-err-unc-mask", "off" }, - }; - const size_t hw_compat_7_2_len = G_N_ELEMENTS(hw_compat_7_2); - -And now, when qemu-8.0.1 is released with this fix, all combinations -are going to work as supposed. - -- $ qemu-7.2 -M pc-7.2 -> qemu-7.2 -M pc-7.2 (works) -- $ qemu-8.0.1 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 (works) -- $ qemu-8.0.1 -M pc-7.2 -> qemu-7.2 -M pc-7.2 (works) -- $ qemu-7.2 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 (works) - -So the normality has been restored and everything is ok, no? - -Not really, now our matrix is much bigger. We started with the easy -cases, migration from the same version to the same version always -works: - -- $ qemu-7.2 -M pc-7.2 -> qemu-7.2 -M pc-7.2 -- $ qemu-8.0 -M pc-7.2 -> qemu-8.0 -M pc-7.2 -- $ qemu-8.0.1 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 - -Now the interesting ones. When the QEMU processes versions are -different. For the 1st set, their fail and we can do nothing, both -versions are released and we can't change anything. - -- $ qemu-7.2 -M pc-7.2 -> qemu-8.0 -M pc-7.2 -- $ qemu-8.0 -M pc-7.2 -> qemu-7.2 -M pc-7.2 - -This two are the ones that work. 
The whole point of making the
-change in qemu-8.0.1 release was to fix this issue:
-
-- $ qemu-7.2 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2
-- $ qemu-8.0.1 -M pc-7.2 -> qemu-7.2 -M pc-7.2
-
-But now we found that qemu-8.0 neither can migrate to qemu-7.2 not
-qemu-8.0.1.
-
-- $ qemu-8.0 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2
-- $ qemu-8.0.1 -M pc-7.2 -> qemu-8.0 -M pc-7.2
-
-So, if we start a pc-7.2 machine in qemu-8.0 we can't migrate it to
-anything except to qemu-8.0.
-
-Can we do better?
-
-Yeap. If we know that we are going to do this migration:
-
-- $ qemu-8.0 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2
-
-We can launch the appropriate devices with::
-
-    --device...,x-pci-e-err-unc-mask=on
-
-And now we can receive a migration from 8.0. And from now on, we can
-do that migration to new machine types if we remember to enable that
-property for pc-7.2. Notice that we need to remember, it is not
-enough to know that the source of the migration is qemu-8.0. Think of
-this example:
-
-$ qemu-8.0 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 -> qemu-8.2 -M pc-7.2
-
-In the second migration, the source is not qemu-8.0, but we still have
-that "problem" and have that property enabled. Notice that we need to
-continue having this mark/property until we have this machine
-rebooted. But it is not a normal reboot (that don't reload QEMU) we
-need the machine to poweroff/poweron on a fixed QEMU. And from now
-on we can use the proper real machine.
diff --git a/docs/devel/migration/postcopy.rst b/docs/devel/migration/postcopy.rst
new file mode 100644
index 0000000000000000000000000000000000000000..6c51e96d798418ecb048a71bba8b0847d971bcc7
--- /dev/null
+++ b/docs/devel/migration/postcopy.rst
@@ -0,0 +1,313 @@
+========
+Postcopy
+========
+
+.. contents::
+
+'Postcopy' migration is a way to deal with migrations that refuse to
+converge (or take too long to converge). Its plus side is that there
+is an upper bound on the amount of migration traffic and the time it
+takes; the downside is that during the postcopy phase, a failure of
+*either* side causes the guest to be lost.
+
+In postcopy the destination CPUs are started before all the memory has
+been transferred, and accesses to pages that are yet to be transferred
+cause a fault that's translated by QEMU into a request to the source
+QEMU.
+
+Postcopy can be combined with precopy (i.e. normal migration) so that
+if precopy doesn't finish in a given time the switch is made to
+postcopy.
+
+Enabling postcopy
+=================
+
+To enable postcopy, issue this command on the monitor (both source and
+destination) prior to the start of migration:
+
+``migrate_set_capability postcopy-ram on``
+
+The normal commands are then used to start a migration, which is still
+started in precopy mode. Issuing:
+
+``migrate_start_postcopy``
+
+will now cause the transition from precopy to postcopy.
+It can be issued immediately after migration is started or any
+time later on. Issuing it after the end of a migration is harmless.
+
+Blocktime is a postcopy live migration metric, intended to show how
+long a vCPU was in a state of interruptible sleep due to a page fault.
+That metric is calculated both for all vCPUs as an overlapped value,
+and separately for each vCPU. These values are calculated on the
+destination side. To enable postcopy blocktime calculation, enter the
+following command on the destination monitor:
+
+``migrate_set_capability postcopy-blocktime on``
+
+Postcopy blocktime can be retrieved with the query-migrate QMP
+command: the postcopy-blocktime value will show the overlapped
+blocking time for all vCPUs, and postcopy-vcpu-blocktime will show the
+list of blocking times per vCPU.
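+For example (a hedged sketch via scripts/qmp/qmp-shell; the numbers
+are made up), the fields appear in the query-migrate output roughly
+like this:
+
+.. code-block:: shell
+
+    (QEMU) query-migrate
+    {..., "postcopy-blocktime": 3134,
+          "postcopy-vcpu-blocktime": [3131, 3140], ...}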
+The ``postcopy-blocktime`` value in the reply is the overlapped blocking
+time across all vCPUs, while ``postcopy-vcpu-blocktime`` is the list of
+blocking times per vCPU.
+
+.. note::
+  During the postcopy phase, the bandwidth limits set using
+  ``migrate_set_parameter`` are ignored (to avoid delaying requested pages that
+  the destination is waiting for).
+
+Postcopy internals
+==================
+
+State machine
+-------------
+
+Postcopy moves through a series of states (see postcopy_state) from
+ADVISE->DISCARD->LISTEN->RUNNING->END
+
+ - Advise
+
+   Set at the start of migration if postcopy is enabled, even
+   if it hasn't had the start command; here the destination
+   checks that its OS has the support needed for postcopy, and performs
+   setup to ensure the RAM mappings are suitable for later postcopy.
+   The destination will fail early in migration at this point if the
+   required OS support is not present.
+   (Triggered by reception of POSTCOPY_ADVISE command)
+
+ - Discard
+
+   Entered on receipt of the first 'discard' command; prior to
+   the first Discard being performed, hugepages are switched off
+   (using madvise) to ensure that no new huge pages are created
+   during the postcopy phase, and to cause any huge pages that
+   have discards on them to be broken.
+
+ - Listen
+
+   The first command in the package, POSTCOPY_LISTEN, switches
+   the destination state to Listen, and starts a new thread
+   (the 'listen thread') which takes over the job of receiving
+   pages off the migration stream, while the main thread carries
+   on processing the blob.  With this thread able to process page
+   reception, the destination now 'sensitises' the RAM to detect
+   any access to missing pages (on Linux using the 'userfault'
+   system).
+
+ - Running
+
+   POSTCOPY_RUN causes the destination to synchronise all
+   state and start the CPUs and IO devices running.  The main
+   thread now finishes processing the migration package and
+   carries on as it would for normal precopy migration
+   (although it can't do the cleanup it would do as it
+   finishes a normal migration).
+
+ - Paused
+
+   Postcopy can run into a paused state (normally on both sides at the
+   same time when it happens), where all threads are temporarily halted,
+   mostly due to network errors.  When the paused state is reached,
+   migration makes sure the QEMU binaries on both sides keep the data
+   intact, without corrupting the VM.  To continue the migration, the
+   admin needs to fix the migration channel using the QMP command
+   'migrate-recover' on the destination node, then resume the migration
+   using the QMP command 'migrate' again on the source node, with the
+   resume=true flag set.
+
+ - End
+
+   The listen thread can now quit, and perform the cleanup of migration
+   state; the migration is now complete.
+
+Device transfer
+---------------
+
+Loading of device data may cause the device emulation to access guest RAM,
+which may trigger faults that have to be resolved by the source.  As such,
+the migration stream has to be able to respond with page data *during* the
+device load; hence the device data has to be read from the stream completely
+before the device load begins, to free the stream up.  This is achieved by
+'packaging' the device data into a blob that's read in one go.
+
+Source behaviour
+----------------
+
+Until postcopy is entered the migration stream is identical to normal
+precopy, except for the addition of a 'postcopy advise' command at
+the beginning, to tell the destination that postcopy might happen.
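+
+The switch itself is driven from the management layer; for reference, a
+sketch of the QMP equivalents of the HMP commands shown earlier (the
+destination URI is illustrative):
+
+.. code-block:: none
+
+   -> { "execute": "migrate-set-capabilities", "arguments":
+        { "capabilities": [ { "capability": "postcopy-ram", "state": true } ] } }
+   <- { "return": {} }
+   -> { "execute": "migrate", "arguments": { "uri": "tcp:destination.host:4444" } }
+   <- { "return": {} }
+   -> { "execute": "migrate-start-postcopy" }
+   <- { "return": {} }
+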
+When postcopy starts, the source sends the page discard data and then
+forms the 'package' containing:
+
+   - Command: 'postcopy listen'
+   - The device state
+
+     A series of sections, identical to the precopy stream's device state
+     stream, containing everything except postcopiable devices (i.e. RAM)
+   - Command: 'postcopy run'
+
+The 'package' is sent as the data part of a Command: ``CMD_PACKAGED``, and the
+contents are formatted in the same way as the main migration stream.
+
+During postcopy the source scans the list of dirty pages and sends them
+to the destination without being requested (in much the same way as precopy);
+however, when a page request is received from the destination, the dirty page
+scanning restarts from the requested location.  This causes requested pages
+to be sent quickly, and also causes pages directly after the requested page
+to be sent quickly, in the hope that those pages are likely to be used
+by the destination soon.
+
+Destination behaviour
+---------------------
+
+Initially the destination looks the same as precopy, with a single thread
+reading the migration stream; the 'postcopy advise' and 'discard' commands
+are processed to change the way RAM is managed, but don't affect the stream
+processing.
+
+::
+
+  ------------------------------------------------------------------------------
+                          1      2   3     4 5                      6   7
+  main -----DISCARD-CMD_PACKAGED ( LISTEN  DEVICE     DEVICE DEVICE RUN )
+  thread                             |       |
+                                     |     (page request)
+                                     |        \___
+                                     v            \
+  listen thread:                     --- page -- page -- page -- page -- page --
+
+                                     a   b        c
+  ------------------------------------------------------------------------------
+
+- On receipt of ``CMD_PACKAGED`` (1)
+
+  All the data associated with the package - the ( ... ) section in the
+  diagram - is read into memory, and the main thread recurses into
+  qemu_loadvm_state_main to process the contents of the package (2),
+  which contains commands (3,6) and devices (4...)
+
+- On receipt of 'postcopy listen' - 3 - (i.e. the 1st command in the package)
+
+  a new thread (a) is started that takes over servicing the migration stream,
+  while the main thread carries on loading the package.  It loads normal
+  background page data (b) but if during a device load a fault happens (5)
+  the returned page (c) is loaded by the listen thread, allowing the main
+  thread's device load to carry on.
+
+- The last thing in the ``CMD_PACKAGED`` is a 'RUN' command (6)
+
+  letting the destination CPUs start running.  At the end of the
+  ``CMD_PACKAGED`` (7) the main thread returns to normal running behaviour and
+  is no longer used by migration, while the listen thread carries on servicing
+  page data until the end of migration.
+
+Source side page bitmap
+-----------------------
+
+The 'migration bitmap' in postcopy is basically the same as in precopy:
+each bit indicates that a page is 'dirty' - i.e. needs sending.  During
+the precopy phase this is updated as the CPU dirties pages; however,
+during postcopy the CPUs are stopped and nothing should dirty anything
+any more.  Instead, dirty bits are cleared when the relevant pages are
+sent during postcopy.
+
+Postcopy features
+=================
+
+Postcopy recovery
+-----------------
+
+Compared to precopy, postcopy is special in its error handling.  When any
+error happens (in this case, mostly network errors), QEMU cannot easily
+fail the migration, because VM data resides in both the source and the
+destination QEMU instances.  Instead, when an issue happens, QEMU on both
+sides will go into a paused state.
+A recovery phase is then needed to continue the paused postcopy migration.
+
+The recovery phase normally contains a few steps:
+
+  - When a network issue occurs, both QEMU instances go into the PAUSED state
+
+  - When the network is recovered (or a new network is provided), the admin
+    can set up the new channel for migration using the QMP command
+    'migrate-recover' on the destination node, preparing for a resume.
+
+  - On the source host, the admin can continue the interrupted postcopy
+    migration using the QMP command 'migrate' with the resume=true flag set.
+
+  - After the connection is re-established, QEMU will continue the postcopy
+    migration on both sides.
+
+During a paused postcopy migration, the VM can logically still continue
+running, and it will not be impacted by any access to pages that were
+already migrated to the destination VM before the interruption happened.
+However, if any of the missing pages get accessed on the destination VM,
+the VM thread will be halted waiting for the page to be migrated, which
+means it can be halted until the recovery is complete.
+
+The impact of accessing missing pages can vary with the configuration of
+the guest.  For example, with async page fault enabled, the guest can
+logically schedule out the threads accessing missing pages proactively.
+
+Postcopy with hugepages
+-----------------------
+
+Postcopy now works with hugetlbfs backed memory:
+
+  a) The Linux kernel on the destination must support userfault on hugepages.
+  b) The huge-page configuration on the source and destination VMs must be
+     identical; i.e. RAMBlocks on both sides must use the same page size.
+  c) Note that ``-mem-path /dev/hugepages`` will fall back to allocating normal
+     RAM if it doesn't have enough hugepages, triggering (b) to fail.
+     Using ``-mem-prealloc`` enforces the allocation using hugepages.
+  d) Care should be taken with the size of hugepage used; postcopy with 2MB
+     hugepages works well, however 1GB hugepages are likely to be problematic
+     since it takes ~1 second to transfer a 1GB hugepage across a 10Gbps link,
+     and until the full page is transferred the destination thread is blocked.
+
+Postcopy with shared memory
+---------------------------
+
+Postcopy migration with shared memory needs explicit support from the other
+processes that share the memory and from QEMU.  There are restrictions on
+the types of memory that userfault can support when shared.
+
+The Linux kernel userfault support works on ``/dev/shm`` memory and on ``hugetlbfs``
+(although the kernel doesn't provide an equivalent to ``madvise(MADV_DONTNEED)``
+for hugetlbfs, which may be a problem in some configurations).
+
+The vhost-user code in QEMU supports clients that have postcopy support,
+and the ``vhost-user-bridge`` (in ``tests/``) and the DPDK package have changes
+to support postcopy.
+
+The client needs to open a userfaultfd and register the areas
+of memory that it maps with userfault.  The client must then pass the
+userfaultfd back to QEMU together with a mapping table that allows
+fault addresses in the client's address space to be converted back to
+RAMBlock/offsets.  The client's userfaultfd is added to the postcopy
+fault-thread, and page requests are made on behalf of the client by QEMU.
+QEMU performs 'wake' operations on the client's userfaultfd to allow it
+to continue after a page has arrived.
+
+.. note::
+  There are two future improvements that would be nice:
+    a) Some way to make QEMU ignorant of the addresses in the client's
+       address space
+    b) Avoiding the need for QEMU to perform ufd-wake calls after the
+       pages have arrived
+
+Retro-fitting postcopy to existing clients is possible:
+  a) A mechanism is needed for the registration with userfault as above,
+     and the registration needs to be coordinated with the phases of
+     postcopy.  In vhost-user extra messages are added to the existing
+     control channel.
+  b) Any thread that can block due to guest memory accesses must be
+     identified and the implication understood; for example if the
+     guest memory access is made while holding a lock then all other
+     threads waiting for that lock will also be blocked.
+
+Postcopy preemption mode
+------------------------
+
+Postcopy preempt is a capability introduced in the QEMU 8.0 release.  It
+allows urgent pages (those whose page faults were explicitly requested by
+the destination QEMU) to be sent in a separate preempt channel, rather than
+queued in the background migration channel.  Anyone who cares about
+latencies of page faults during a postcopy migration should enable this
+feature.  By default, it's not enabled.
diff --git a/docs/devel/migration/qatzip-compression.rst b/docs/devel/migration/qatzip-compression.rst
new file mode 100644
index 0000000000000000000000000000000000000000..862b3831647b3e8766c308ae1067c1baa8aa8f3e
--- /dev/null
+++ b/docs/devel/migration/qatzip-compression.rst
@@ -0,0 +1,165 @@
+==================
+QATzip Compression
+==================
+In scenarios with limited network bandwidth, the ``QATzip`` solution can help
+users save a lot of host CPU resources by accelerating compression and
+decompression through the Intel QuickAssist Technology (``QAT``) hardware.
+
+
+The following test was conducted using 8 multifd channels and 10Gbps network
+bandwidth.  The results show that, compared to zstd, ``QATzip`` significantly
+saves CPU resources on the sender and reduces migration time.  Compared to the
+uncompressed solution, ``QATzip`` greatly improves the dirty page processing
+capability, indicated by the Pages per Second metric, and also reduces the
+total migration time.
+
+::
+
+  VM Configuration: 16 vCPU and 64G memory
+  VM Workload: all vCPUs are idle and 54G memory is filled with Silesia data.
+  QAT Devices: 4
+  |-----------|--------|---------|----------|----------|------|------|
+  |8 Channels |Total   |down     |throughput|pages per | send | recv |
+  |           |time(ms)|time(ms) |(mbps)    |second    | cpu %| cpu% |
+  |-----------|--------|---------|----------|----------|------|------|
+  |qatzip     |   16630|       28|     10467|   2940235|   160|   360|
+  |-----------|--------|---------|----------|----------|------|------|
+  |zstd       |   20165|       24|      8579|   2391465|   810|   340|
+  |-----------|--------|---------|----------|----------|------|------|
+  |none       |   46063|       40|     10848|    330240|    45|    85|
+  |-----------|--------|---------|----------|----------|------|------|
+
+
+QATzip Compression Framework
+============================
+
+``QATzip`` is a user space library which builds on top of the Intel QuickAssist
+Technology to provide extended accelerated compression and decompression
+services.
+ +For more ``QATzip`` introduction, please refer to `QATzip Introduction +`_ + +:: + + +----------------+ + | MultiFd Thread | + +-------+--------+ + | + | compress/decompress + +-------+--------+ + | QATzip library | + +-------+--------+ + | + +-------+--------+ + | QAT library | + +-------+--------+ + | user space + --------+--------------------- + | kernel space + +------+-------+ + | QAT Driver | + +------+-------+ + | + +------+-------+ + | QAT Devices | + +--------------+ + + +QATzip Installation +------------------- + +The ``QATzip`` installation package has been integrated into some Linux +distributions and can be installed directly. For example, the Ubuntu Server +24.04 LTS system can be installed using below command + +.. code-block:: shell + + #apt search qatzip + libqatzip-dev/noble 1.2.0-0ubuntu3 amd64 + Intel QuickAssist user space library development files + + libqatzip3/noble 1.2.0-0ubuntu3 amd64 + Intel QuickAssist user space library + + qatzip/noble,now 1.2.0-0ubuntu3 amd64 [installed] + Compression user-space tool for Intel QuickAssist Technology + + #sudo apt install libqatzip-dev libqatzip3 qatzip + +If your system does not support the ``QATzip`` installation package, you can +use the source code to build and install, please refer to `QATzip source code installation +`_ + +QAT Hardware Deployment +----------------------- + +``QAT`` supports physical functions(PFs) and virtual functions(VFs) for +deployment, and users can configure ``QAT`` resources for migration according +to actual needs. For more details about ``QAT`` deployment, please refer to +`Intel QuickAssist Technology Documentation +`_ + +For more ``QAT`` hardware introduction, please refer to `intel-quick-assist-technology-overview +`_ + +How To Use QATzip Compression +============================= + +1 - Install ``QATzip`` library + +2 - Build ``QEMU`` with ``--enable-qatzip`` parameter + + E.g. configure --target-list=x86_64-softmmu --enable-kvm ``--enable-qatzip`` + +3 - Set ``migrate_set_parameter multifd-compression qatzip`` + +4 - Set ``migrate_set_parameter multifd-qatzip-level comp_level``, the default +comp_level value is 1, and it supports levels from 1 to 9 + +QAT Memory Requirements +======================= + +The user needs to reserve system memory for the QAT memory management to +allocate DMA memory. The size of the reserved system memory depends on the +number of devices used for migration and the number of multifd channels. + +Because memory usage depends on QAT configuration, please refer to `QAT Memory +Driver Queries +`_ +for memory usage calculation. + +.. list-table:: An example of a PF used for migration + :header-rows: 1 + + * - Number of channels + - Sender memory usage + - Receiver memory usage + * - 2 + - 10M + - 10M + * - 4 + - 12M + - 14M + * - 8 + - 16M + - 20M + +How To Choose Between QATzip and QPL +==================================== +Starting from 4th Gen Intel Xeon Scalable processors, codenamed Sapphire Rapids +processor(``SPR``), multiple built-in accelerators are supported including +``QAT`` and ``IAA``. The former can accelerate ``QATzip`` and the latter is +used to accelerate ``QPL``. + +Here are some suggestions: + +1 - If the live migration scenario is limited by network bandwidth and ``QAT`` +hardware resources exceed ``IAA``, use the ``QATzip`` method, which can save a +lot of host CPU resources for compression. 
+ +2 - If the system cannot support shared virtual memory (SVM) technology, use +the ``QATzip`` method because ``QPL`` performance is not good without SVM +support. + +3 - For other scenarios, use the ``QPL`` method first. diff --git a/docs/devel/migration/qpl-compression.rst b/docs/devel/migration/qpl-compression.rst new file mode 100644 index 0000000000000000000000000000000000000000..990992d78655b1d1ce7bb07f5fc46cc8c6b3dd81 --- /dev/null +++ b/docs/devel/migration/qpl-compression.rst @@ -0,0 +1,260 @@ +=============== +QPL Compression +=============== +The Intel Query Processing Library (Intel ``QPL``) is an open-source library to +provide compression and decompression features and it is based on deflate +compression algorithm (RFC 1951). + +The ``QPL`` compression relies on Intel In-Memory Analytics Accelerator(``IAA``) +and Shared Virtual Memory(``SVM``) technology, they are new features supported +from Intel 4th Gen Intel Xeon Scalable processors, codenamed Sapphire Rapids +processor(``SPR``). + +For more ``QPL`` introduction, please refer to `QPL Introduction +`_ + +QPL Compression Framework +========================= + +:: + + +----------------+ +------------------+ + | MultiFD Thread | |accel-config tool | + +-------+--------+ +--------+---------+ + | | + | | + |compress/decompress | + +-------+--------+ | Setup IAA + | QPL library | | Resources + +-------+---+----+ | + | | | + | +-------------+-------+ + | Open IAA | + | Devices +-----+-----+ + | |idxd driver| + | +-----+-----+ + | | + | | + | +-----+-----+ + +-----------+IAA Devices| + Submit jobs +-----------+ + via enqcmd + + +QPL Build And Installation +-------------------------- + +.. code-block:: shell + + $git clone --recursive https://github.com/intel/qpl.git qpl + $mkdir qpl/build + $cd qpl/build + $cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr -DQPL_LIBRARY_TYPE=SHARED .. + $sudo cmake --build . --target install + +For more details about ``QPL`` installation, please refer to `QPL Installation +`_ + +IAA Device Management +--------------------- + +The number of ``IAA`` devices will vary depending on the Xeon product model. +On a ``SPR`` server, there can be a maximum of 8 ``IAA`` devices, with up to +4 devices per socket. + +By default, all ``IAA`` devices are disabled and need to be configured and +enabled by users manually. + +Check the number of devices through the following command + +.. code-block:: shell + + #lspci -d 8086:0cfe + 6a:02.0 System peripheral: Intel Corporation Device 0cfe + 6f:02.0 System peripheral: Intel Corporation Device 0cfe + 74:02.0 System peripheral: Intel Corporation Device 0cfe + 79:02.0 System peripheral: Intel Corporation Device 0cfe + e7:02.0 System peripheral: Intel Corporation Device 0cfe + ec:02.0 System peripheral: Intel Corporation Device 0cfe + f1:02.0 System peripheral: Intel Corporation Device 0cfe + f6:02.0 System peripheral: Intel Corporation Device 0cfe + +IAA Device Configuration And Enabling +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The ``accel-config`` tool is used to enable ``IAA`` devices and configure +``IAA`` hardware resources(work queues and engines). One ``IAA`` device +has 8 work queues and 8 processing engines, multiple engines can be assigned +to a work queue via ``group`` attribute. + +For ``accel-config`` installation, please refer to `accel-config installation +`_ + +One example of configuring and enabling an ``IAA`` device. + +.. 
code-block:: shell
+
+   #accel-config config-engine iax1/engine1.0 -g 0
+   #accel-config config-engine iax1/engine1.1 -g 0
+   #accel-config config-engine iax1/engine1.2 -g 0
+   #accel-config config-engine iax1/engine1.3 -g 0
+   #accel-config config-engine iax1/engine1.4 -g 0
+   #accel-config config-engine iax1/engine1.5 -g 0
+   #accel-config config-engine iax1/engine1.6 -g 0
+   #accel-config config-engine iax1/engine1.7 -g 0
+   #accel-config config-wq iax1/wq1.0 -g 0 -s 128 -p 10 -b 1 -t 128 -m shared -y user -n app1 -d user
+   #accel-config enable-device iax1
+   #accel-config enable-wq iax1/wq1.0
+
+.. note::
+   IAX is an early name for IAA
+
+- The ``IAA`` device index is 1; use the ``ls -lh /sys/bus/dsa/devices/iax*``
+  command to query the ``IAA`` device index.
+
+- 8 engines and 1 work queue are configured in group 0, so all compression jobs
+  submitted to this work queue can be processed by all engines at the same time.
+
+- Set work queue attributes including the work mode, work queue size and so on.
+
+- Enable the ``IAA1`` device and work queue 1.0
+
+.. note::
+
+   Set the work queue mode to shared mode, since the ``QPL`` library only
+   supports shared mode.
+
+For more detailed configuration, please refer to `IAA Configuration Samples
+`_
+
+IAA Unit Test
+^^^^^^^^^^^^^
+
+- For enabling ``IAA`` devices on the Xeon platform, please refer to the
+  `IAA User Guide `_
+
+- The ``IAA`` device driver is the Intel Data Accelerator Driver (idxd); it is
+  recommended that the minimum version of the Linux kernel is 5.18.
+
+- Add the ``"intel_iommu=on,sm_on"`` parameter to the kernel command line
+  to enable the ``SVM`` feature.
+
+Here is an easy way to verify the ``IAA`` device driver and ``SVM`` with `iaa_test
+`_
+
+.. code-block:: shell
+
+   #./test/iaa_test
+   [ info] alloc wq 0 shared size 128 addr 0x7f26cebe5000 batch sz 0xfffffffe xfer sz 0x80000000
+   [ info] test noop: tflags 0x1 num_desc 1
+   [ info] preparing descriptor for noop
+   [ info] Submitted all noop jobs
+   [ info] verifying task result for 0x16f7e20
+   [ info] test with op 0 passed
+
+
+IAA Resources Allocation For Migration
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+There are no ``IAA`` resource configuration parameters for migration, and
+the ``accel-config`` tool configuration cannot directly specify the ``IAA``
+resources used for migration.
+
+The multifd migration with the ``QPL`` compression method will use all work
+queues that are enabled and in shared mode.
+
+.. note::
+
+   Accessing IAA resources requires the ``sudo`` command or ``root`` privileges
+   by default.  Administrators can modify the IAA device node ownership
+   so that QEMU can use IAA with specified user permissions.
+
+   For example
+
+   #chown -R qemu /dev/iax
+
+Shared Virtual Memory (SVM) Introduction
+========================================
+
+``SVM`` is the ability for an accelerator I/O device to operate in the same
+virtual memory space as applications on host processors.  It also implies
+the ability to operate from pageable memory, avoiding functional
+requirements to pin memory for DMA operations.
+
+When using ``SVM`` technology, users do not need to reserve memory for the
+``IAA`` device or perform memory pinning operations.  The ``IAA`` device can
+directly access data using the virtual address of the process.
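+
+A quick sanity check of these prerequisites (the sysfs path below is an
+assumption based on common idxd driver layouts and may differ between
+kernel versions):
+
+.. code-block:: shell
+
+   #grep -o "intel_iommu=[^ ]*" /proc/cmdline
+   intel_iommu=on,sm_on
+
+   #cat /sys/bus/dsa/devices/wq1.0/mode
+   shared
+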
+ +For more ``SVM`` technology, please refer to +`Shared Virtual Addressing (SVA) with ENQCMD +`_ + + +How To Use QPL Compression In Migration +======================================= + +1 - Installation of ``QPL`` library and ``accel-config`` library if using IAA + +2 - Configure and enable ``IAA`` devices and work queues via ``accel-config`` + +3 - Build ``QEMU`` with ``--enable-qpl`` parameter + + E.g. configure --target-list=x86_64-softmmu --enable-kvm ``--enable-qpl`` + +4 - Enable ``QPL`` compression during migration + + Set ``migrate_set_parameter multifd-compression qpl`` when migrating, the + ``QPL`` compression does not support configuring the compression level, it + only supports one compression level. + +The Difference Between QPL And ZLIB +=================================== + +Although both ``QPL`` and ``ZLIB`` are based on the deflate compression +algorithm, and ``QPL`` can support the header and tail of ``ZLIB``, ``QPL`` +is still not fully compatible with the ``ZLIB`` compression in the migration. + +``QPL`` only supports 4K history buffer, and ``ZLIB`` is 32K by default. +``ZLIB`` compresses data that ``QPL`` may not decompress correctly and +vice versa. + +``QPL`` does not support the ``Z_SYNC_FLUSH`` operation in ``ZLIB`` streaming +compression, current ``ZLIB`` implementation uses ``Z_SYNC_FLUSH``, so each +``multifd`` thread has a ``ZLIB`` streaming context, and all page compression +and decompression are based on this stream. ``QPL`` cannot decompress such data +and vice versa. + +The introduction for ``Z_SYNC_FLUSH``, please refer to `Zlib Manual +`_ + +The Best Practices +================== +When user enables the IAA device for ``QPL`` compression, it is recommended +to add ``-mem-prealloc`` parameter to the destination boot parameters. This +parameter can avoid the occurrence of I/O page fault and reduce the overhead +of IAA compression and decompression. + +The example of booting with ``-mem-prealloc`` parameter + +.. code-block:: shell + + $qemu-system-x86_64 --enable-kvm -cpu host --mem-prealloc ... + + +An example about I/O page fault measurement of destination without +``-mem-prealloc``, the ``svm_prq`` indicates the number of I/O page fault +occurrences and processing time. + +.. code-block:: shell + + #echo 1 > /sys/kernel/debug/iommu/intel/dmar_perf_latency + #echo 2 > /sys/kernel/debug/iommu/intel/dmar_perf_latency + #echo 3 > /sys/kernel/debug/iommu/intel/dmar_perf_latency + #echo 4 > /sys/kernel/debug/iommu/intel/dmar_perf_latency + #cat /sys/kernel/debug/iommu/intel/dmar_perf_latency + IOMMU: dmar18 Register Base Address: c87fc000 + <0.1us 0.1us-1us 1us-10us 10us-100us 100us-1ms 1ms-10ms >=10ms min(us) max(us) average(us) + inv_iotlb 0 286 123 0 0 0 0 0 1 0 + inv_devtlb 0 276 133 0 0 0 0 0 2 0 + inv_iec 0 0 0 0 0 0 0 0 0 0 + svm_prq 0 0 25206 364 395 0 0 1 556 9 diff --git a/docs/devel/migration/uadk-compression.rst b/docs/devel/migration/uadk-compression.rst new file mode 100644 index 0000000000000000000000000000000000000000..3f73345dd531826b8246c563ee8a860a8cda61de --- /dev/null +++ b/docs/devel/migration/uadk-compression.rst @@ -0,0 +1,144 @@ +========================================================= +User Space Accelerator Development Kit (UADK) Compression +========================================================= +UADK is a general-purpose user space accelerator framework that uses shared +virtual addressing (SVA) to provide a unified programming interface for +hardware acceleration of cryptographic and compression algorithms. 
+
+UADK includes the Unified/User-space-access-intended Accelerator Framework
+(UACCE), which enables hardware accelerators from different vendors that
+support SVA to adapt to UADK.
+
+Currently, HiSilicon Kunpeng hardware accelerators have been registered with
+UACCE.  Through the UADK framework, users can run cryptographic and compression
+algorithms using hardware accelerators instead of CPUs, freeing up CPU
+computing power and improving computing performance.
+
+https://github.com/Linaro/uadk/tree/master/docs
+
+UADK Framework
+==============
+UADK consists of UACCE, vendors' drivers, and an algorithm layer.  UADK requires
+the hardware accelerator to support SVA, and the operating system to support
+IOMMU and SVA.  Hardware accelerators from different vendors are registered as
+different character devices with UACCE by using kernel-mode drivers of the
+vendors.  A user can access the hardware accelerators by performing user-mode
+operations on the character devices.
+
+::
+
+          +----------------------------------+
+          |                apps              |
+          +----+------------------------+----+
+               |                        |
+               |                        |
+       +-------+--------+       +-------+-------+
+       |   scheduler    |       | alg libraries |
+       +-------+--------+       +-------+-------+
+               |                        |
+               |                        |
+               |                        |
+               |                +-------+-------+
+               |                | vendor drivers|
+               |                +-+-------------+
+               |                  |
+               |                  |
+            +--+------------------+--+
+            |         libwd          |
+    User    +----+-------------+-----+
+    --------------------------------------------------
+    Kernel      +--+-----+   +------+
+                | uacce  |   | smmu |
+                +---+----+   +------+
+                    |
+                +---+------------------+
+                | vendor kernel driver |
+                +----------------------+
+    --------------------------------------------------
+                 +----------------------+
+                 |   HW Accelerators    |
+                 +----------------------+
+
+UADK Installation
+-----------------
+Build UADK
+^^^^^^^^^^
+
+.. code-block:: shell
+
+   git clone https://github.com/Linaro/uadk.git
+   cd uadk
+   mkdir build
+   ./autogen.sh
+   ./configure --prefix=$PWD/build
+   make
+   make install
+
+Without --prefix, UADK will be installed to /usr/local/lib by default.
+If you get the error "cannot find -lnuma", please install the libnuma-dev
+package.
+
+Run pkg-config libwd to ensure the environment is set up correctly
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+* export PKG_CONFIG_PATH=$PWD/build/lib/pkgconfig
+* pkg-config libwd --cflags --libs
+  -I/usr/local/include -L/usr/local/lib -lwd
+
+* export PKG_CONFIG_PATH is required on demand.
+  Not required if UADK is installed to /usr/local/lib
+
+UADK Host Kernel Requirements
+-----------------------------
+Users need to make sure that ``UACCE`` is already supported in the Linux
+kernel.  The kernel version should be at least v5.9 with SVA (Shared Virtual
+Addressing) enabled.
+
+Kernel Configuration
+^^^^^^^^^^^^^^^^^^^^
+
+``UACCE`` could be built as a module or built-in.
+
+Here's an example of enabling UACCE with a hardware accelerator on the
+HiSilicon Kunpeng platform.
+
+* CONFIG_IOMMU_SVA_LIB=y
+* CONFIG_ARM_SMMU=y
+* CONFIG_ARM_SMMU_V3=y
+* CONFIG_ARM_SMMU_V3_SVA=y
+* CONFIG_PCI_PASID=y
+* CONFIG_UACCE=y
+* CONFIG_CRYPTO_DEV_HISI_QM=y
+* CONFIG_CRYPTO_DEV_HISI_ZIP=y
+
+Make sure all of the above kernel configurations are selected.
+
+Accelerator dev node permissions
+--------------------------------
+Hardware accelerators (e.g. the HiSilicon Kunpeng Zip accelerator) get
+registered to UADK, and char devices are created in the dev directory.  In
+order to access resources on hardware accelerator devices, write permission
+should be provided to the user.
+
+.. code-block:: shell
+
+   $ sudo chmod 777 /dev/hisi_zip-*
+
+How To Use UADK Compression In QEMU Migration
+---------------------------------------------
+* Make sure UADK is installed as above
+* Build ``QEMU`` with the ``--enable-uadk`` parameter
+
+  E.g. configure --target-list=aarch64-softmmu --enable-kvm ``--enable-uadk``
+
+* Enable ``UADK`` compression during migration
+
+  Set ``migrate_set_parameter multifd-compression uadk``
+
+Since UADK uses Shared Virtual Addressing (SVA) and devices access virtual
+memory directly, it is possible that SMMUv3 may encounter page faults while
+walking the IO page tables.  This may impact the performance.  In order to
+mitigate this, please make sure to add the ``-mem-prealloc`` parameter to
+the destination VM boot parameters.
+
+Though both UADK and ZLIB are based on the deflate compression algorithm, UADK
+is not fully compatible with ZLIB.  Hence, please make sure to use ``uadk`` on
+both the source and destination during migration.
diff --git a/docs/devel/vfio-migration.rst b/docs/devel/migration/vfio.rst
similarity index 99%
rename from docs/devel/vfio-migration.rst
rename to docs/devel/migration/vfio.rst
index 605fe60e9695a50813b1294bb970ce2c39d2ba07..c49482eab66d8e831ea1c2c791fc895b51893e4d 100644
--- a/docs/devel/vfio-migration.rst
+++ b/docs/devel/migration/vfio.rst
@@ -1,5 +1,5 @@
 =====================
-VFIO device Migration
+VFIO device migration
 =====================
 
 Migration of virtual machine involves saving the state for each device that
diff --git a/docs/devel/virtio-migration.txt b/docs/devel/migration/virtio.rst
similarity index 37%
rename from docs/devel/virtio-migration.txt
rename to docs/devel/migration/virtio.rst
index 98a6b0ffb5731034be51158bba0ebdd04ddb07bd..611a18b821519e51fc5ddc84b1b5049a8219387a 100644
--- a/docs/devel/virtio-migration.txt
+++ b/docs/devel/migration/virtio.rst
@@ -1,5 +1,6 @@
-Virtio devices and migration
-============================
+=======================
+Virtio device migration
+=======================
 
 Copyright 2015 IBM Corp.
 
@@ -8,91 +9,97 @@ the COPYING file in the top-level directory.
 
 Saving and restoring the state of virtio devices is a bit of a twisty
 maze, for several reasons:
+
 - state is distributed between several parts:
+
   - virtio core, for common fields like features, number of queues, ...
+
   - virtio transport (pci, ccw, ...), for the different proxy devices and
     transport specific state (msix vectors, indicators, ...)
+
   - virtio device (net, blk, ...), for the different device types and their
     state (mac address, request queue, ...)
+
 - most fields are saved via the stream interface; subsequently, subsections
   have been added to make cross-version migration possible
 
 This file attempts to document the current procedure and point out some
 caveats.
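+
+As a concrete anchor for the two walkthroughs below, this is roughly how a
+virtio device class hooks into the flow (a minimal sketch; "virtio-foo" is
+an illustrative name, while ``VMSTATE_VIRTIO_DEVICE`` is the real wrapper
+that ends up calling virtio_save()/virtio_load()):
+
+.. code-block:: c
+
+   /* Illustrative device; the wrapper delegates all the work to the
+    * virtio core, which then calls back into transport and device. */
+   static const VMStateDescription vmstate_virtio_foo = {
+       .name = "virtio-foo-device",
+       .version_id = 1,
+       .minimum_version_id = 1,
+       .fields = (const VMStateField[]) {
+           VMSTATE_VIRTIO_DEVICE,
+           VMSTATE_END_OF_LIST()
+       },
+   };
+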
- Save state procedure ==================== -virtio core virtio transport virtio device ------------ ---------------- ------------- - - save() function registered - via VMState wrapper on - device class -virtio_save() <---------- - ------> save_config() - - save proxy device - - save transport-specific - device fields -- save common device - fields -- save common virtqueue - fields - ------> save_queue() - - save transport-specific - virtqueue fields - ------> save_device() - - save device-specific - fields -- save subsections - - device endianness, - if changed from - default endianness - - 64 bit features, if - any high feature bit - is set - - virtio-1 virtqueue - fields, if VERSION_1 - is set +:: + virtio core virtio transport virtio device + ----------- ---------------- ------------- + + save() function registered + via VMState wrapper on + device class + virtio_save() <---------- + ------> save_config() + - save proxy device + - save transport-specific + device fields + - save common device + fields + - save common virtqueue + fields + ------> save_queue() + - save transport-specific + virtqueue fields + ------> save_device() + - save device-specific + fields + - save subsections + - device endianness, + if changed from + default endianness + - 64 bit features, if + any high feature bit + is set + - virtio-1 virtqueue + fields, if VERSION_1 + is set Load state procedure ==================== -virtio core virtio transport virtio device ------------ ---------------- ------------- - - load() function registered - via VMState wrapper on - device class -virtio_load() <---------- - ------> load_config() - - load proxy device - - load transport-specific - device fields -- load common device - fields -- load common virtqueue - fields - ------> load_queue() - - load transport-specific - virtqueue fields -- notify guest - ------> load_device() - - load device-specific - fields -- load subsections - - device endianness - - 64 bit features - - virtio-1 virtqueue - fields -- sanitize endianness -- sanitize features -- virtqueue index sanity - check - - feature-dependent setup +:: + virtio core virtio transport virtio device + ----------- ---------------- ------------- + + load() function registered + via VMState wrapper on + device class + virtio_load() <---------- + ------> load_config() + - load proxy device + - load transport-specific + device fields + - load common device + fields + - load common virtqueue + fields + ------> load_queue() + - load transport-specific + virtqueue fields + - notify guest + ------> load_device() + - load device-specific + fields + - load subsections + - device endianness + - 64 bit features + - virtio-1 virtqueue + fields + - sanitize endianness + - sanitize features + - virtqueue index sanity + check + - feature-dependent setup Implications of this setup ========================== diff --git a/docs/devel/vfio-iommufd.rst b/docs/devel/vfio-iommufd.rst new file mode 100644 index 0000000000000000000000000000000000000000..3d1c11f175e5968e9f1519da70c9a0a6ced03995 --- /dev/null +++ b/docs/devel/vfio-iommufd.rst @@ -0,0 +1,166 @@ +=============================== +IOMMUFD BACKEND usage with VFIO +=============================== + +(Same meaning for backend/container/BE) + +With the introduction of iommufd, the Linux kernel provides a generic +interface for user space drivers to propagate their DMA mappings to kernel +for assigned devices. While the legacy kernel interface is group-centric, +the new iommufd interface is device-centric, relying on device fd and iommufd. 
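+
+Concretely, the two interfaces are reached through different device nodes
+(both sets also appear in the diagram below):
+
+.. code-block:: none
+
+   # legacy interface: group-centric
+   /dev/vfio/vfio            <- container
+   /dev/vfio/$group_id       <- group
+
+   # iommufd interface: device-centric
+   /dev/iommu                <- iommufd
+   /dev/vfio/devices/vfioX   <- device cdev
+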
+
+To support both interfaces in the QEMU VFIO device, a base container is
+introduced to abstract the common parts of the VFIO legacy and iommufd
+containers, so that the generic VFIO code can use either container.
+
+The base container implements generic functions such as memory_listener and
+address space management, whereas the derived container implements callbacks
+specific to either legacy or iommufd.  Each container has its own way to set
+up the secure context and the DMA management interface.  The diagram below
+shows how this looks with both containers.
+
+::
+
+                      VFIO                       AddressSpace/Memory
+    +-------+  +----------+  +-----+  +-----+
+    |  pci  |  | platform |  |  ap |  | ccw |
+    +---+---+  +----+-----+  +--+--+  +--+--+     +----------------------+
+        |           |           |        |        |   AddressSpace       |
+        |           |           |        |        +------------+---------+
+    +---V-----------V-----------V--------V----+               /
+    |           VFIOAddressSpace              | <------------+
+    |                  |                      |  MemoryListener
+    |        VFIOContainerBase list           |
+    +-------+----------------------------+----+
+            |                            |
+            |                            |
+    +-------V------+            +--------V----------+
+    |   iommufd    |            |    vfio legacy    |
+    |  container   |            |     container     |
+    +-------+------+            +--------+----------+
+            |                            |
+            | /dev/iommu                 | /dev/vfio/vfio
+            | /dev/vfio/devices/vfioX    | /dev/vfio/$group_id
+ Userspace  |                            |
+ ===========+============================+===========================
+ Kernel     |  device fd                 |
+            +---------------+            | group/container fd
+            | (BIND_IOMMUFD |            | (SET_CONTAINER/SET_IOMMU)
+            |  ATTACH_IOAS) |            | device fd
+            |               |            |
+            |       +-------V------------V-----------------+
+    iommufd |       |               vfio                   |
+(map/unmap  |       +---------+--------------------+-------+
+ ioas_copy) |                 |                    | map/unmap
+            |                 |                    |
+     +------V------+    +-----V------+      +------V--------+
+     |iommufd core |    |   device   |      |  vfio iommu   |
+     +-------------+    +------------+      +---------------+
+
+* Secure Context setup
+
+  - iommufd BE: uses device fd and iommufd to setup secure context
+    (bind_iommufd, attach_ioas)
+  - vfio legacy BE: uses group fd and container fd to setup secure context
+    (set_container, set_iommu)
+
+* Device access
+
+  - iommufd BE: device fd is opened through ``/dev/vfio/devices/vfioX``
+  - vfio legacy BE: device fd is retrieved from group fd ioctl
+
+* DMA Mapping flow
+
+  1. VFIOAddressSpace receives MemoryRegion add/del via MemoryListener
+  2. VFIO populates DMA map/unmap via the container BEs
+
+     * iommufd BE: uses iommufd
+     * vfio legacy BE: uses container fd
+
+Example configuration
+=====================
+
+Step 1: configure the host device
+---------------------------------
+
+It's exactly the same as for the VFIO device with the legacy VFIO container.
+
+Step 2: configure QEMU
+----------------------
+
+Interactions with ``/dev/iommu`` are abstracted by a new iommufd
+object (compiled in with the ``CONFIG_IOMMUFD`` option).
+
+Any QEMU device (e.g. VFIO device) wishing to use ``/dev/iommu`` must
+be linked with an iommufd object.  It gets a new optional property
+named iommufd which allows the user to pass an iommufd object.  Take the
+``vfio-pci`` device for example:
+
+.. code-block:: bash
+
+    -object iommufd,id=iommufd0
+    -device vfio-pci,host=0000:02:00.0,iommufd=iommufd0
+
+Note that ``/dev/iommu`` and the VFIO cdev can be opened externally by a
+management layer.  In that case the fds are passed in; the ``fd`` property
+accepts either a string naming the fd or a number, for example:
+
+.. code-block:: bash
+
+    -object iommufd,id=iommufd0,fd=22
+    -device vfio-pci,iommufd=iommufd0,fd=23
+
+If the ``fd`` property is not passed, the fd is opened by QEMU.
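+
+For instance, a wrapper script could create the fds before launching QEMU (a
+sketch only; the fd numbers are arbitrary but must match the properties, the
+device node name is illustrative, and a real management layer would typically
+hand fds over a QMP channel instead):
+
+.. code-block:: bash
+
+    # open the iommufd and the VFIO device cdev on fixed fd numbers,
+    # which the child QEMU process inherits
+    exec 22<>/dev/iommu
+    exec 23<>/dev/vfio/devices/vfio0
+    qemu-system-x86_64 \
+        -object iommufd,id=iommufd0,fd=22 \
+        -device vfio-pci,iommufd=iommufd0,fd=23 \
+        ...
+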
+
+If no ``iommufd`` object is passed to the ``vfio-pci`` device, iommufd
+is not used and the user gets the behavior based on the legacy VFIO
+container:
+
+.. code-block:: bash
+
+    -device vfio-pci,host=0000:02:00.0
+
+Supported platform
+==================
+
+Supports x86, ARM and s390x currently.
+
+Caveats
+=======
+
+Dirty page sync
+---------------
+
+Dirty page sync is not yet supported with the iommufd backend, so live
+migration is disabled by default.  It can be force-enabled as below, though
+it is inefficient.
+
+.. code-block:: bash
+
+    -object iommufd,id=iommufd0
+    -device vfio-pci,host=0000:02:00.0,iommufd=iommufd0,enable-migration=on
+
+P2P DMA
+-------
+
+PCI p2p DMA is unsupported as IOMMUFD doesn't support mapping hardware PCI
+BAR regions yet.  The warning below shows up for an assigned PCI device; it
+is not a bug.
+
+.. code-block:: none
+
+    qemu-system-x86_64: warning: IOMMU_IOAS_MAP failed: Bad address, PCI BAR?
+    qemu-system-x86_64: vfio_container_dma_map(0x560cb6cb1620, 0xe000000021000, 0x3000, 0x7f32ed55c000) = -14 (Bad address)
+
+FD passing with mdev
+--------------------
+
+The ``vfio-pci`` device checks the sysfsdev property to decide if the backend
+is an mdev.  If FD passing is used, there is no way to know that, and the mdev
+is treated like a real PCI device.  There is an error as below if the user
+wants to enable RAM discarding for an mdev.
+
+.. code-block:: none
+
+    qemu-system-x86_64: -device vfio-pci,iommufd=iommufd0,x-balloon-allowed=on,fd=9: vfio VFIO_FD9: x-balloon-allowed only potentially compatible with mdev devices
+
+``vfio-ap`` and ``vfio-ccw`` devices don't have the same issue, as their
+backend devices are always mdev and RAM discarding is force enabled.
diff --git a/docs/multi-thread-compression.txt b/docs/multi-thread-compression.txt
index 95b1556f67b8a256176819296e1b152a99003096..450e5de469b33ef605e5b9ddffb500971290ce8b 100644
--- a/docs/multi-thread-compression.txt
+++ b/docs/multi-thread-compression.txt
@@ -33,14 +33,15 @@ thread compression can be used to accelerate the compression process.
 
 The decompression speed of Zlib is at least 4 times as quick as
 compression, if the source and destination CPU have equal speed,
-keeping the compression thread count 4 times the decompression
-thread count can avoid resource waste.
+and you choose Zlib as the compression method, keeping the compression
+thread count 4 times the decompression thread count can avoid resource waste.
 
 Compression level can be used to control the compression speed and the
-compression ratio. High compression ratio will take more time, level 0
-stands for no compression, level 1 stands for the best compression
-speed, and level 9 stands for the best compression ratio. Users can
-select a level number between 0 and 9.
+compression ratio. A high compression ratio will take more time; level 1
+stands for the best compression speed, and a higher level means a higher
+compression ratio. For Zlib, users can select a level number between 0 and 9,
+where level 0 stands for no compression. For Zstd, users can select a
+level number between 1 and 22.
 
 
 When to use the multiple thread compression in live migration
@@ -116,16 +117,19 @@ to support the multiple thread compression migration:
 
 2. Activate compression on the source:
     {qemu} migrate_set_capability compress on
 
-3. Set the compression thread count on source:
+3. Set the compression method:
+    {qemu} migrate_set_parameter compress_method zstd
+
+4. Set the compression thread count on source:
     {qemu} migrate_set_parameter compress-threads 12
 
-4.
Set the compression level on the source: +5. Set the compression level on the source: {qemu} migrate_set_parameter compress-level 1 -5. Set the decompression thread count on destination: +6. Set the decompression thread count on destination: {qemu} migrate_set_parameter decompress-threads 3 -6. Start outgoing migration: +7. Start outgoing migration: {qemu} migrate -d tcp:destination.host:4444 {qemu} info migrate Capabilities: ... compress: on @@ -136,6 +140,7 @@ The following are the default settings: compress-threads: 8 decompress-threads: 2 compress-level: 1 (which means best speed) + compress_method: zlib So, only the first two steps are required to use the multiple thread compression in migration. You can do more if the default @@ -143,7 +148,7 @@ settings are not appropriate. TODO ==== -Some faster (de)compression method such as LZ4 and Quicklz can help -to reduce the CPU consumption when doing (de)compression. If using -these faster (de)compression method, less (de)compression threads +Comparing to Zlib, Some faster (de)compression method such as LZ4 +and Quicklz can help to reduce the CPU consumption when doing (de)compression. +If using these faster (de)compression method, less (de)compression threads are needed when doing the migration. diff --git a/docs/specs/acpi_hw_reduced_hotplug.rst b/docs/specs/acpi_hw_reduced_hotplug.rst index 0bd3f9399fee04bc7829f6f5936a398720dc7eeb..3acd6fcd8b8fd84492391f702643b29af935b2c7 100644 --- a/docs/specs/acpi_hw_reduced_hotplug.rst +++ b/docs/specs/acpi_hw_reduced_hotplug.rst @@ -64,7 +64,8 @@ GED IO interface (4 byte access) 0: Memory hotplug event 1: System power down event 2: NVDIMM hotplug event - 3-31: Reserved + 3: CPU hotplug event + 4-31: Reserved **write_access:** diff --git a/docs/sphinx/depfile.py b/docs/sphinx/depfile.py index afdcbcec6e76b5be3071046cec6006c795fb4ac2..e74be6af98bcd57355f7f85b268d2f65d6af1ac3 100644 --- a/docs/sphinx/depfile.py +++ b/docs/sphinx/depfile.py @@ -19,7 +19,7 @@ def get_infiles(env): for x in env.found_docs: - yield env.doc2path(x) + yield str(env.doc2path(x)) yield from ((os.path.join(env.srcdir, dep) for dep in env.dependencies[x])) for mod in sys.modules.values(): diff --git a/docs/sphinx/qapidoc.py b/docs/sphinx/qapidoc.py index 658c288f8fe9b674e8c7230053658f44934145e5..3d19853444247613e59447033dca89a27ac38efe 100644 --- a/docs/sphinx/qapidoc.py +++ b/docs/sphinx/qapidoc.py @@ -229,15 +229,15 @@ def _nodes_for_enum_values(self, doc): section += dlnode return [section] - def _nodes_for_arguments(self, doc, boxed_arg_type): + def _nodes_for_arguments(self, doc, arg_type): """Return list of doctree nodes for the arguments section""" - if boxed_arg_type: + if arg_type and not arg_type.is_implicit(): assert not doc.args section = self._make_section('Arguments') dlnode = nodes.definition_list() dlnode += self._make_dlitem( [nodes.Text('The members of '), - nodes.literal('', boxed_arg_type.name)], + nodes.literal('', arg_type.name)], None) section += dlnode return [section] @@ -341,8 +341,7 @@ def visit_command(self, name, info, ifcond, features, arg_type, allow_preconfig, coroutine): doc = self._cur_doc self._add_doc('Command', - self._nodes_for_arguments(doc, - arg_type if boxed else None) + self._nodes_for_arguments(doc, arg_type) + self._nodes_for_features(doc) + self._nodes_for_sections(doc) + self._nodes_for_if_section(ifcond)) @@ -350,8 +349,7 @@ def visit_command(self, name, info, ifcond, features, arg_type, def visit_event(self, name, info, ifcond, features, arg_type, boxed): doc = self._cur_doc 
self._add_doc('Event', - self._nodes_for_arguments(doc, - arg_type if boxed else None) + self._nodes_for_arguments(doc, arg_type) + self._nodes_for_features(doc) + self._nodes_for_sections(doc) + self._nodes_for_if_section(ifcond)) diff --git a/docs/system/cpu-models-x86.rst.inc b/docs/system/cpu-models-x86.rst.inc index 7f6368f999b10dc943c0c084e4b3253834674c46..37fe1d0ac8197c70d92ea7f62c3b5ad3a17987b3 100644 --- a/docs/system/cpu-models-x86.rst.inc +++ b/docs/system/cpu-models-x86.rst.inc @@ -71,6 +71,16 @@ mixture of host CPU models between machines, if live migration compatibility is required, use the newest CPU model that is compatible across all desired hosts. +``ClearwaterForest`` + Intel Xeon Processor (ClearwaterForest, 2025) + +``SierraForest``, ``SierraForest-v2`` + Intel Xeon Processor (SierraForest, 2024), SierraForest-v2 mitigates + the GDS and RFDS vulnerabilities with stepping 3. + +``GraniteRapids``, ``GraniteRapids-v2`` + Intel Xeon Processor (GraniteRapids, 2024) + ``Cascadelake-Server``, ``Cascadelake-Server-noTSX`` Intel Xeon Processor (Cascade Lake, 2019), with "stepping" levels 6 or 7 only. (The Cascade Lake Xeon processor with *stepping 5 is @@ -181,7 +191,7 @@ features are included if using "Host passthrough" or "Host model". CVE-2018-12127, [MSBDS] CVE-2018-12126). This is an MSR (Model-Specific Register) feature rather than a CPUID feature, - so it will not appear in the Linux ``/proc/cpuinfo`` in the host or + therefore it will not appear in the Linux ``/proc/cpuinfo`` in the host or guest. Instead, the host kernel uses it to populate the MDS vulnerability file in ``sysfs``. @@ -189,10 +199,10 @@ features are included if using "Host passthrough" or "Host model". affected} in the ``/sys/devices/system/cpu/vulnerabilities/mds`` file. ``taa-no`` - Recommended to inform that the guest that the host is ``not`` + Recommended to inform the guest that the host is ``not`` vulnerable to CVE-2019-11135, TSX Asynchronous Abort (TAA). - This too is an MSR feature, so it does not show up in the Linux + This is also an MSR feature, therefore it does not show up in the Linux ``/proc/cpuinfo`` in the host or guest. It should only be enabled for VMs if the host reports ``Not affected`` @@ -214,7 +224,7 @@ features are included if using "Host passthrough" or "Host model". By disabling TSX, KVM-based guests can avoid paying the price of mitigating TSX-based attacks. - Note that ``tsx-ctrl`` too is an MSR feature, so it does not show + Note that ``tsx-ctrl`` is also an MSR feature, therefore it does not show up in the Linux ``/proc/cpuinfo`` in the host or guest. To validate that Intel TSX is indeed disabled for the guest, there are @@ -223,6 +233,38 @@ features are included if using "Host passthrough" or "Host model". ``/sys/devices/system/cpu/vulnerabilities/tsx_async_abort`` file in the guest should report ``Mitigation: TSX disabled``. +``bhi-no`` + Recommended to inform the guest that the host is ``not`` + vulnerable to CVE-2022-0001, Branch History Injection (BHI). + + This is also an MSR feature, therefore it does not show up in the Linux + ``/proc/cpuinfo`` in the host or guest. + + It should only be enabled for VMs if the host reports + ``BHI: Not affected`` in the + ``/sys/devices/system/cpu/vulnerabilities/spectre_v2`` file. + +``gds-no`` + Recommended to inform the guest that the host is ``not`` + vulnerable to CVE-2022-40982, Gather Data Sampling (GDS). + + This is also an MSR feature, therefore it does not show up in the Linux + ``/proc/cpuinfo`` in the host or guest. 
+
+  It should only be enabled for VMs if the host reports ``Not affected``
+  in the ``/sys/devices/system/cpu/vulnerabilities/gather_data_sampling``
+  file.
+
+``rfds-no``
+  Recommended to inform the guest that the host is ``not``
+  vulnerable to CVE-2023-28746, Register File Data Sampling (RFDS).
+
+  This is also an MSR feature, therefore it does not show up in the Linux
+  ``/proc/cpuinfo`` in the host or guest.
+
+  It should only be enabled for VMs if the host reports ``Not affected``
+  in the ``/sys/devices/system/cpu/vulnerabilities/reg_file_data_sampling``
+  file.
 
 Preferred CPU models for AMD x86 hosts
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/docs/system/device-emulation.rst b/docs/system/device-emulation.rst
index d1f3277cb026281d34e6ab9257eed30c5e640f89..e1b2d18fb1066b9391138aa07ae56be16f721137 100644
--- a/docs/system/device-emulation.rst
+++ b/docs/system/device-emulation.rst
@@ -98,3 +98,4 @@ Emulated Devices
    devices/canokey.rst
    devices/usb-u2f.rst
    devices/igb.rst
+   devices/vhost-vdpa-generic-device.rst
diff --git a/docs/system/devices/vhost-vdpa-generic-device.rst b/docs/system/devices/vhost-vdpa-generic-device.rst
new file mode 100644
index 0000000000000000000000000000000000000000..25fbcac60e64b7e7fcc50bc5105e623bfbe2fcc1
--- /dev/null
+++ b/docs/system/devices/vhost-vdpa-generic-device.rst
@@ -0,0 +1,46 @@
+
+=========================
+vhost-vDPA generic device
+=========================
+
+This document explains the usage of the vhost-vDPA generic device.
+
+Description
+-----------
+
+A vDPA (virtio data path acceleration) device is a device that uses a
+datapath which complies with the virtio specifications, together with a
+vendor-specific control path.
+
+QEMU provides two types of vhost-vDPA devices to enable a vDPA device.  One
+is type-sensitive, which means QEMU needs to know the actual device type
+(e.g. net, blk, scsi); the other is called the "vhost-vDPA generic device",
+which is type-insensitive.
+
+The vhost-vDPA generic device builds on the vhost-vdpa subsystem and virtio
+subsystem.  It is quite small, but it can support any type of virtio device.
+
+Examples
+--------
+
+Prepare the vhost-vDPA backends first:
+
+::
+
+  host# ls -l /dev/vhost-vdpa-*
+  crw------- 1 root root 236, 0 Nov 2 00:49 /dev/vhost-vdpa-0
+
+Start QEMU with a virtio-mmio bus:
+
+::
+
+  host# qemu-system \
+      -M microvm -m 512 -smp 2 -kernel ... -initrd ... \
+      -device vhost-vdpa-device,vhostdev=/dev/vhost-vdpa-0 \
+      ...
+
+Start QEMU with a virtio-pci bus:
+
+::
+
+  host# qemu-system \
+      -M pc -m 512 -smp 2 \
+      -device vhost-vdpa-device-pci,vhostdev=/dev/vhost-vdpa-0 \
+      ...
diff --git a/docs/system/i386/amd-memory-encryption.rst b/docs/system/i386/amd-memory-encryption.rst
index e9bc142bc1308dfcae7b6cf375e43bc5dd15f7e2..b7e3f46ff689f1588de12756441afb0974811a2c 100644
--- a/docs/system/i386/amd-memory-encryption.rst
+++ b/docs/system/i386/amd-memory-encryption.rst
@@ -177,7 +177,45 @@ TODO
 
 Live Migration
 ---------------
 
-TODO
+AMD SEV encrypts the memory of VMs, and because a different key is used
+in each VM, the hypervisor is unable to simply copy the ciphertext from
+one VM to another to migrate the VM.  Instead, the AMD SEV Key
+Management API provides a set of functions which the hypervisor can use
+to package a guest page for migration, while maintaining the confidentiality
+provided by AMD SEV.
+
+SEV guest VMs have the concept of private and shared memory.  The private
+memory is encrypted with the guest-specific key, while shared memory may
+be encrypted with the hypervisor key.
+The migration APIs provided by the SEV API spec should be used for
+migrating the private pages.  The KVM_GET_PAGE_ENC_BITMAP ioctl can be
+used to get the guest page encryption bitmap.  The bitmap can be used
+to check if the given guest page is private or shared.
+
+Before initiating the migration, we need to know the target machine's public
+Diffie-Hellman key (PDH) and certificate chain.  It can be retrieved
+with the 'query-sev-capabilities' QMP command or using the sev-tool.  The
+migrate-set-parameter command can be used to pass the target machine's PDH and
+certificate chain.
+
+During the migration flow, SEND_START is called on the source hypervisor
+to create an outgoing encryption context.  The SEV guest policy dictates whether
+the certificate passed through the migrate-sev-set-info command will be
+validated.  SEND_UPDATE_DATA is called to encrypt the guest private pages.
+After migration is completed, SEND_FINISH is called to destroy the encryption
+context and make the VM non-runnable, to protect it against cloning.
+
+On the target machine, RECEIVE_START is called first to create an
+incoming encryption context.  RECEIVE_UPDATE_DATA is called to copy
+the received encrypted pages into guest memory.  After migration has
+completed, RECEIVE_FINISH is called to make the VM runnable.
+
+For more information about the migration, see the SEV API Appendix A
+Usage flow (Live migration section).
+
+NOTE:
+To protect against memory cloning, the SEV APIs are designed to make the
+VM unrunnable in case of migration failure.
 
 References
 ----------
diff --git a/docs/system/loongarch/virt.rst b/docs/system/loongarch/virt.rst
index c37268b404d4d2580439abd2e31b6e3492f765de..aa4719d4bdffd6636984cf67641b9fc5bde4f179 100644
--- a/docs/system/loongarch/virt.rst
+++ b/docs/system/loongarch/virt.rst
@@ -28,6 +28,37 @@ The ``qemu-system-loongarch64`` provides emulation for virt
 machine. You can specify the machine type ``virt`` and cpu type ``la464``.
 
+CPU Topology
+------------
+
+The ``LA464`` type CPUs have the concept of Socket, Core and Thread.
+
+For example:
+
+``-smp 1,maxcpus=M,sockets=S,cores=C,threads=T``
+
+The above parameters indicate that the machine has a maximum of ``M`` vCPUs and
+``S`` sockets, each socket has ``C`` cores, each core has ``T`` threads,
+and each thread corresponds to a vCPU.
+
+Then ``M`` ``S`` ``C`` ``T`` have the following relationship:
+
+``M = S * C * T``
+
+In the CPU topology relationship, when we know the ``socket_id`` ``core_id``
+and ``thread_id`` of a CPU, we can calculate its ``arch_id``:
+
+``arch_id = (socket_id * C * T) + (core_id * T) + thread_id``
+
+(As a consistency check: with ``sockets=2,cores=4,threads=2``, the vCPU with
+``socket_id`` 1, ``core_id`` 1 and ``thread_id`` 1 has
+``arch_id`` 1*4*2 + 1*2 + 1 = 11.)
+
+Similarly, when we know the ``arch_id`` of a CPU,
+we can also get its ``socket_id`` ``core_id`` and ``thread_id``:
+
+``socket_id = arch_id / (C * T)``
+
+``core_id = (arch_id / T) % C``
+
+``thread_id = arch_id % T``
+
 Boot options
 ------------
diff --git a/docs/tools/qemu-img.rst b/docs/tools/qemu-img.rst
index 4459c065f1942b7061a3e8b54352df2004978094..3653adb963ee17f5165b46b053935e3c42e1445e 100644
--- a/docs/tools/qemu-img.rst
+++ b/docs/tools/qemu-img.rst
@@ -406,7 +406,7 @@ Command description:
 
   Compare exits with ``0`` in case the images are equal and with ``1``
  in case the images differ. Other exit codes mean an error occurred during
  execution and standard error output should contain an error message.
- The following table sumarizes all exit codes of the compare subcommand: + The following table summarizes all exit codes of the compare subcommand: 0 Images are identical (or requested help was printed) diff --git a/docs/tools/virtfs-proxy-helper.rst b/docs/tools/virtfs-proxy-helper.rst index bd310ebb07b5fc0ae9ceeeb5418cf4edfdd2192e..175b4809260093f85b45904e399b6c9bfb72e829 100644 --- a/docs/tools/virtfs-proxy-helper.rst +++ b/docs/tools/virtfs-proxy-helper.rst @@ -55,7 +55,7 @@ The following options are supported: .. option:: -f, --fd SOCKET_ID Use given file descriptor as socket descriptor for communicating with - qemu proxy fs drier. Usually a helper like libvirt will create + qemu proxy fs driver. Usually a helper like libvirt will create socketpair and pass one of the fds as parameter to this option. .. option:: -s, --socket SOCKET_FILE diff --git a/dump/dump.c b/dump/dump.c index 481905076493c7d723e01f84a08e76ea0e5550e3..787059ac2c8118311124ccbce3f1839219827df8 100644 --- a/dump/dump.c +++ b/dump/dump.c @@ -20,6 +20,7 @@ #include "sysemu/dump.h" #include "sysemu/runstate.h" #include "sysemu/cpus.h" +#include "sysemu/kvm.h" #include "qapi/error.h" #include "qapi/qapi-commands-dump.h" #include "qapi/qapi-events-dump.h" @@ -2065,6 +2066,12 @@ void qmp_dump_guest_memory(bool paging, const char *protocol, Error **errp) { ERRP_GUARD(); + + if (virtcca_cvm_enabled()) { + error_setg(errp, "The dump-guest-memory command is temporarily unsupported in cvm."); + return; + } + const char *p; int fd; DumpState *s; diff --git a/gdbstub/gdbstub.c b/gdbstub/gdbstub.c index 46d752bbc2cd31694af5502aa69d233c3a2548ed..31c3dae52540f0f803a75d9f191c59a4dfe69852 100644 --- a/gdbstub/gdbstub.c +++ b/gdbstub/gdbstub.c @@ -582,6 +582,19 @@ void gdb_register_coprocessor(CPUState *cpu, } } +void gdb_unregister_coprocessor_all(CPUState *cpu) +{ + /* + * Safe to nuke everything. 
GDBRegisterState::xml is static const char so + * it won't be freed + */ + g_array_free(cpu->gdb_regs, true); + + cpu->gdb_regs = NULL; + cpu->gdb_num_regs = 0; + cpu->gdb_num_g_regs = 0; +} + static void gdb_process_breakpoint_remove_all(GDBProcess *p) { CPUState *cpu = gdb_get_first_cpu_in_process(p); diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c index af636cfb2d30564b0c9d7c5c7c993b108c2bd2d0..9a291d1b51dece98fbc5774ae12e72f26f21b4d6 100644 --- a/hw/9pfs/9p.c +++ b/hw/9pfs/9p.c @@ -2587,6 +2587,11 @@ static void coroutine_fn v9fs_readdir(void *opaque) retval = -EINVAL; goto out_nofid; } + if (fidp->fid_type != P9_FID_DIR) { + warn_report_once("9p: bad client: T_readdir on non-directory stream"); + retval = -ENOTDIR; + goto out; + } if (!fidp->fs.dir.stream) { retval = -EINVAL; goto out; diff --git a/hw/acpi/acpi-cpu-hotplug-stub.c b/hw/acpi/acpi-cpu-hotplug-stub.c index 3fc4b14c260fb55ffff585860df8ac652516302a..c6c61bb9cd33c1486093af9787a8d0fedc006053 100644 --- a/hw/acpi/acpi-cpu-hotplug-stub.c +++ b/hw/acpi/acpi-cpu-hotplug-stub.c @@ -19,6 +19,12 @@ void legacy_acpi_cpu_hotplug_init(MemoryRegion *parent, Object *owner, return; } +void cpu_hotplug_hw_init(MemoryRegion *as, Object *owner, + CPUHotplugState *state, hwaddr base_addr) +{ + return; +} + void acpi_cpu_ospm_status(CPUHotplugState *cpu_st, ACPIOSTInfoList ***list) { return; diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c index af66bde0f55d5db5c123958f0f15afd5bbc047b8..3fb996c034c395f91208e97f6984b25069ff7134 100644 --- a/hw/acpi/aml-build.c +++ b/hw/acpi/aml-build.c @@ -47,7 +47,7 @@ static void build_prepend_byte(GArray *array, uint8_t val) g_array_prepend_val(array, val); } -static void build_append_byte(GArray *array, uint8_t val) +void build_append_byte(GArray *array, uint8_t val) { g_array_append_val(array, val); } @@ -1554,6 +1554,28 @@ Aml *aml_sleep(uint64_t msec) return var; } +/* ACPI 5.0b: 6.4.3.7 Generic Register Descriptor */ +Aml *aml_generic_register(AmlRegionSpace rs, uint8_t reg_width, + uint8_t reg_offset, AmlAccessType type, uint64_t addr) +{ + int i; + Aml *var = aml_alloc(); + build_append_byte(var->buf, 0x82); /* Generic Register Descriptor */ + build_append_byte(var->buf, 0x0C); /* Length, bits[7:0] value = 0x0C */ + build_append_byte(var->buf, 0); /* Length, bits[15:8] value = 0 */ + build_append_byte(var->buf, rs); /* Address Space ID */ + build_append_byte(var->buf, reg_width); /* Register Bit Width */ + build_append_byte(var->buf, reg_offset); /* Register Bit Offset */ + build_append_byte(var->buf, type); /* Access Size */ + + /* Register address */ + for (i = 0; i < 8; i++) { + build_append_byte(var->buf, extract64(addr, i * 8, 8)); + } + + return var; +} + static uint8_t Hex2Byte(const char *src) { int hi, lo; @@ -1968,10 +1990,10 @@ void build_slit(GArray *table_data, BIOSLinker *linker, MachineState *ms, * ACPI spec, Revision 6.3 * 5.2.29.1 Processor hierarchy node structure (Type 0) */ -static void build_processor_hierarchy_node(GArray *tbl, uint32_t flags, - uint32_t parent, uint32_t id, - uint32_t *priv_rsrc, - uint32_t priv_num) +void build_processor_hierarchy_node(GArray *tbl, uint32_t flags, + uint32_t parent, uint32_t id, + uint32_t *priv_rsrc, + uint32_t priv_num) { int i; @@ -1994,6 +2016,59 @@ static void build_processor_hierarchy_node(GArray *tbl, uint32_t flags, } } +void build_spcr(GArray *table_data, BIOSLinker *linker, + const AcpiSpcrData *f, const uint8_t rev, + const char *oem_id, const char *oem_table_id) +{ + AcpiTable table = { .sig = "SPCR", .rev = rev, .oem_id = oem_id, + 
.oem_table_id = oem_table_id };
+
+    acpi_table_begin(&table, table_data);
+    /* Interface type */
+    build_append_int_noprefix(table_data, f->interface_type, 1);
+    /* Reserved */
+    build_append_int_noprefix(table_data, 0, 3);
+    /* Base Address */
+    build_append_gas(table_data, f->base_addr.id, f->base_addr.width,
+                     f->base_addr.offset, f->base_addr.size,
+                     f->base_addr.addr);
+    /* Interrupt type */
+    build_append_int_noprefix(table_data, f->interrupt_type, 1);
+    /* IRQ */
+    build_append_int_noprefix(table_data, f->pc_interrupt, 1);
+    /* Global System Interrupt */
+    build_append_int_noprefix(table_data, f->interrupt, 4);
+    /* Baud Rate */
+    build_append_int_noprefix(table_data, f->baud_rate, 1);
+    /* Parity */
+    build_append_int_noprefix(table_data, f->parity, 1);
+    /* Stop Bits */
+    build_append_int_noprefix(table_data, f->stop_bits, 1);
+    /* Flow Control */
+    build_append_int_noprefix(table_data, f->flow_control, 1);
+    /* Language */
+    build_append_int_noprefix(table_data, f->language, 1);
+    /* Terminal Type */
+    build_append_int_noprefix(table_data, f->terminal_type, 1);
+    /* PCI Device ID */
+    build_append_int_noprefix(table_data, f->pci_device_id, 2);
+    /* PCI Vendor ID */
+    build_append_int_noprefix(table_data, f->pci_vendor_id, 2);
+    /* PCI Bus Number */
+    build_append_int_noprefix(table_data, f->pci_bus, 1);
+    /* PCI Device Number */
+    build_append_int_noprefix(table_data, f->pci_device, 1);
+    /* PCI Function Number */
+    build_append_int_noprefix(table_data, f->pci_function, 1);
+    /* PCI Flags */
+    build_append_int_noprefix(table_data, f->pci_flags, 4);
+    /* PCI Segment */
+    build_append_int_noprefix(table_data, f->pci_segment, 1);
+    /* Reserved */
+    build_append_int_noprefix(table_data, 0, 4);
+
+    acpi_table_end(linker, &table);
+}
 /*
  * ACPI spec, Revision 6.3
  * 5.2.29 Processor Properties Topology Table (PPTT)
diff --git a/hw/acpi/core.c b/hw/acpi/core.c
index ec5e127d17226ee9a7e51860eb64b897ba3f7d72..b6241f70e9eeebe11907b8e1f97acc4d79741947 100644
--- a/hw/acpi/core.c
+++ b/hw/acpi/core.c
@@ -24,6 +24,7 @@
 #include "hw/acpi/acpi.h"
 #include "hw/nvram/fw_cfg.h"
 #include "qemu/config-file.h"
+#include "qemu/log.h"
 #include "qapi/error.h"
 #include "qapi/opts-visitor.h"
 #include "qapi/qapi-events-run-state.h"
@@ -588,13 +589,16 @@ static void acpi_pm_cnt_write(void *opaque, hwaddr addr, uint64_t val,
             uint16_t sus_typ = (val >> 10) & 7;
             switch (sus_typ) {
             case 0: /* soft power off */
+                qemu_log("VM will be soft powered off\n");
                 qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
                 break;
             case 1:
+                qemu_log("VM will enter the suspend state\n");
                 qemu_system_suspend_request();
                 break;
             default:
                 if (sus_typ == ar->pm1.cnt.s4_val) { /* S4 request */
+                    qemu_log("VM will enter the S4 state\n");
                     qapi_event_send_suspend_disk();
                     qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
                 }
diff --git a/hw/acpi/cpu.c b/hw/acpi/cpu.c
index 011d2c6c2ddf94f1b3f9af99436f99c11655deea..9a3b417504776daaa3ffc1d62c4fac065ad1fba6 100644
--- a/hw/acpi/cpu.c
+++ b/hw/acpi/cpu.c
@@ -1,13 +1,13 @@
 #include "qemu/osdep.h"
 #include "migration/vmstate.h"
 #include "hw/acpi/cpu.h"
+#include "hw/acpi/cpu_hotplug.h"
 #include "hw/core/cpu.h"
 #include "qapi/error.h"
 #include "qapi/qapi-events-acpi.h"
 #include "trace.h"
 #include "sysemu/numa.h"
 
-#define ACPI_CPU_HOTPLUG_REG_LEN 12
 #define ACPI_CPU_SELECTOR_OFFSET_WR 0
 #define ACPI_CPU_FLAGS_OFFSET_RW 4
 #define ACPI_CPU_CMD_OFFSET_WR 5
@@ -64,10 +64,11 @@ static uint64_t cpu_hotplug_rd(void *opaque, hwaddr addr, unsigned size)
 
     cdev = &cpu_st->devs[cpu_st->selector];
     switch (addr) {
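+    /*
+     * Bit layout of the flags register as read back by the guest through
+     * ACPI_CPU_FLAGS_OFFSET_RW: bit 0 - enabled, bit 1 - insert event
+     * pending, bit 2 - remove event pending, bit 4 - firmware-assisted
+     * eject requested, bit 5 - present; bit 3 is currently unused.
+     */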
case ACPI_CPU_FLAGS_OFFSET_RW: /* pack and return is_* fields */ - val |= cdev->cpu ? 1 : 0; + val |= cdev->is_enabled ? 1 : 0; val |= cdev->is_inserting ? 2 : 0; val |= cdev->is_removing ? 4 : 0; val |= cdev->fw_remove ? 16 : 0; + val |= cdev->is_present ? 32 : 0; trace_cpuhp_acpi_read_flags(cpu_st->selector, val); break; case ACPI_CPU_CMD_DATA_OFFSET_RW: @@ -226,7 +227,20 @@ void cpu_hotplug_hw_init(MemoryRegion *as, Object *owner, state->dev_count = id_list->len; state->devs = g_new0(typeof(*state->devs), state->dev_count); for (i = 0; i < id_list->len; i++) { - state->devs[i].cpu = CPU(id_list->cpus[i].cpu); + struct CPUState *cpu = CPU(id_list->cpus[i].cpu); + if (qemu_present_cpu(cpu)) { + state->devs[i].is_present = true; + } else { + state->devs[i].is_present = false; + } + + if (qemu_enabled_cpu(cpu)) { + state->devs[i].cpu = cpu; + state->devs[i].is_enabled = true; + } else { + state->devs[i].is_enabled = false; + } + state->devs[i].arch_id = id_list->cpus[i].arch_id; } memory_region_init_io(&state->ctrl_reg, owner, &cpu_hotplug_ops, state, @@ -259,6 +273,8 @@ void acpi_cpu_plug_cb(HotplugHandler *hotplug_dev, } cdev->cpu = CPU(dev); + cdev->is_present = true; + cdev->is_enabled = true; if (dev->hotplugged) { cdev->is_inserting = true; acpi_send_event(DEVICE(hotplug_dev), ACPI_CPU_HOTPLUG_STATUS); @@ -290,16 +306,23 @@ void acpi_cpu_unplug_cb(CPUHotplugState *cpu_st, return; } + cdev->is_enabled = false; + if (!qemu_persistent_cpu(CPU(dev))) { + cdev->is_present = false; + } + cdev->cpu = NULL; } static const VMStateDescription vmstate_cpuhp_sts = { .name = "CPU hotplug device state", - .version_id = 1, + .version_id = 2, .minimum_version_id = 1, .fields = (VMStateField[]) { VMSTATE_BOOL(is_inserting, AcpiCpuStatus), VMSTATE_BOOL(is_removing, AcpiCpuStatus), + VMSTATE_BOOL(is_present, AcpiCpuStatus), + VMSTATE_BOOL(is_enabled, AcpiCpuStatus), VMSTATE_UINT32(ost_event, AcpiCpuStatus), VMSTATE_UINT32(ost_status, AcpiCpuStatus), VMSTATE_END_OF_LIST() @@ -337,11 +360,15 @@ const VMStateDescription vmstate_cpu_hotplug = { #define CPU_REMOVE_EVENT "CRMV" #define CPU_EJECT_EVENT "CEJ0" #define CPU_FW_EJECT_EVENT "CEJF" +#define CPU_PRESENT "CPRS" void build_cpus_aml(Aml *table, MachineState *machine, CPUHotplugFeatures opts, - build_madt_cpu_fn build_madt_cpu, hwaddr io_base, + build_madt_cpu_fn build_madt_cpu, + build_cpu_cppc_fn build_cpu_cppc, + hwaddr base_addr, const char *res_root, - const char *event_handler_method) + const char *event_handler_method, + AmlRegionSpace rs) { Aml *ifctx; Aml *field; @@ -365,14 +392,22 @@ void build_cpus_aml(Aml *table, MachineState *machine, CPUHotplugFeatures opts, aml_name_decl("_UID", aml_string("CPU Hotplug resources"))); aml_append(cpu_ctrl_dev, aml_mutex(CPU_LOCK, 0)); + assert((rs == AML_SYSTEM_IO) || (rs == AML_SYSTEM_MEMORY)); + crs = aml_resource_template(); - aml_append(crs, aml_io(AML_DECODE16, io_base, io_base, 1, + if (rs == AML_SYSTEM_IO) { + aml_append(crs, aml_io(AML_DECODE16, base_addr, base_addr, 1, ACPI_CPU_HOTPLUG_REG_LEN)); + } else if (rs == AML_SYSTEM_MEMORY) { + aml_append(crs, aml_memory32_fixed(base_addr, + ACPI_CPU_HOTPLUG_REG_LEN, AML_READ_WRITE)); + } + aml_append(cpu_ctrl_dev, aml_name_decl("_CRS", crs)); /* declare CPU hotplug MMIO region with related access fields */ aml_append(cpu_ctrl_dev, - aml_operation_region("PRST", AML_SYSTEM_IO, aml_int(io_base), + aml_operation_region("PRST", rs, aml_int(base_addr), ACPI_CPU_HOTPLUG_REG_LEN)); field = aml_field("PRST", AML_BYTE_ACC, AML_NOLOCK, @@ -388,7 +423,9 @@ void 
build_cpus_aml(Aml *table, MachineState *machine, CPUHotplugFeatures opts, aml_append(field, aml_named_field(CPU_EJECT_EVENT, 1)); /* tell firmware to do device eject, write only */ aml_append(field, aml_named_field(CPU_FW_EJECT_EVENT, 1)); - aml_append(field, aml_reserved_field(3)); + /* 1 if present, read only */ + aml_append(field, aml_named_field(CPU_PRESENT, 1)); + aml_append(field, aml_reserved_field(2)); aml_append(field, aml_named_field(CPU_COMMAND, 8)); aml_append(cpu_ctrl_dev, field); @@ -418,6 +455,7 @@ void build_cpus_aml(Aml *table, MachineState *machine, CPUHotplugFeatures opts, Aml *ctrl_lock = aml_name("%s.%s", cphp_res_path, CPU_LOCK); Aml *cpu_selector = aml_name("%s.%s", cphp_res_path, CPU_SELECTOR); Aml *is_enabled = aml_name("%s.%s", cphp_res_path, CPU_ENABLED); + Aml *is_present = aml_name("%s.%s", cphp_res_path, CPU_PRESENT); Aml *cpu_cmd = aml_name("%s.%s", cphp_res_path, CPU_COMMAND); Aml *cpu_data = aml_name("%s.%s", cphp_res_path, CPU_DATA); Aml *ins_evt = aml_name("%s.%s", cphp_res_path, CPU_INSERT_EVENT); @@ -446,13 +484,26 @@ void build_cpus_aml(Aml *table, MachineState *machine, CPUHotplugFeatures opts, { Aml *idx = aml_arg(0); Aml *sta = aml_local(0); + Aml *ifctx2; + Aml *else_ctx; aml_append(method, aml_acquire(ctrl_lock, 0xFFFF)); aml_append(method, aml_store(idx, cpu_selector)); aml_append(method, aml_store(zero, sta)); - ifctx = aml_if(aml_equal(is_enabled, one)); + ifctx = aml_if(aml_equal(is_present, one)); { - aml_append(ifctx, aml_store(aml_int(0xF), sta)); + ifctx2 = aml_if(aml_equal(is_enabled, one)); + { + /* cpu is present and enabled */ + aml_append(ifctx2, aml_store(aml_int(0xF), sta)); + } + aml_append(ifctx, ifctx2); + else_ctx = aml_else(); + { + /* cpu is present but disabled */ + aml_append(else_ctx, aml_store(aml_int(0xD), sta)); + } + aml_append(ifctx, else_ctx); } aml_append(method, ifctx); aml_append(method, aml_release(ctrl_lock)); @@ -658,14 +709,20 @@ void build_cpus_aml(Aml *table, MachineState *machine, CPUHotplugFeatures opts, aml_append(dev, aml_name_decl("_UID", uid)); } + if (build_cpu_cppc) { + build_cpu_cppc(i, arch_ids->len, dev); + } + method = aml_method("_STA", 0, AML_SERIALIZED); aml_append(method, aml_return(aml_call1(CPU_STS_METHOD, uid))); aml_append(dev, method); /* build _MAT object */ - build_madt_cpu(i, arch_ids, madt_buf, true); /* set enabled flag */ - aml_append(dev, aml_name_decl("_MAT", - aml_buffer(madt_buf->len, (uint8_t *)madt_buf->data))); + if (build_madt_cpu) { + build_madt_cpu(i, arch_ids, madt_buf, true); /* set enabled flag */ + aml_append(dev, aml_name_decl("_MAT", + aml_buffer(madt_buf->len, (uint8_t *)madt_buf->data))); + } g_array_free(madt_buf, true); if (CPU(arch_ids->cpus[i].cpu) != first_cpu) { @@ -696,9 +753,11 @@ void build_cpus_aml(Aml *table, MachineState *machine, CPUHotplugFeatures opts, aml_append(sb_scope, cpus_dev); aml_append(table, sb_scope); - method = aml_method(event_handler_method, 0, AML_NOTSERIALIZED); - aml_append(method, aml_call0("\\_SB.CPUS." CPU_SCAN_METHOD)); - aml_append(table, method); + if (event_handler_method) { + method = aml_method(event_handler_method, 0, AML_NOTSERIALIZED); + aml_append(method, aml_call0("\\_SB.CPUS." 
CPU_SCAN_METHOD));
+        aml_append(table, method);
+    }
 
     g_free(cphp_res_path);
 }
diff --git a/hw/acpi/cpufreq.c b/hw/acpi/cpufreq.c
new file mode 100644
index 0000000000000000000000000000000000000000..a76f7b8fa2c86e790f6e66d73093e388e9bd8a21
--- /dev/null
+++ b/hw/acpi/cpufreq.c
@@ -0,0 +1,280 @@
+/*
+ * ACPI CPPC register device
+ *
+ * Support for showing CPU frequency in guest OS.
+ *
+ * Copyright (c) 2019 HUAWEI TECHNOLOGIES CO.,LTD.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "hw/sysbus.h"
+#include "chardev/char.h"
+#include "qemu/log.h"
+#include "trace.h"
+#include "qemu/option.h"
+#include "sysemu/sysemu.h"
+#include "hw/acpi/acpi-defs.h"
+#include "qemu/cutils.h"
+#include "qemu/error-report.h"
+#include "hw/boards.h"
+
+#define TYPE_CPUFREQ "cpufreq"
+#define CPUFREQ(obj) OBJECT_CHECK(CpuhzState, (obj), TYPE_CPUFREQ)
+#define NOMINAL_FREQ_FILE "/sys/devices/system/cpu/cpu0/acpi_cppc/nominal_freq"
+#define CPU_MAX_FREQ_FILE "/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq"
+#define HZ_MAX_LENGTH 1024
+#define MAX_SUPPORT_SPACE 0x10000
+
+/*
+ * Since Hi1616 will not support CPPC, we simply use its nominal frequency as
+ * the default.
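+ * Note that, despite the "Hz" naming used in this file, the performance
+ * and frequency values handled below are all expressed in MHz, so
+ * DEFAULT_HZ means 2400 MHz.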
+ */
+#define DEFAULT_HZ 2400
+
+int cppc_regs_offset[CPPC_REG_COUNT] = {
+    [HIGHEST_PERF] = 0,
+    [NOMINAL_PERF] = 4,
+    [LOW_NON_LINEAR_PERF] = 8,
+    [LOWEST_PERF] = 12,
+    [GUARANTEED_PERF] = 16,
+    [DESIRED_PERF] = 20,
+    [MIN_PERF] = -1,
+    [MAX_PERF] = -1,
+    [PERF_REDUC_TOLERANCE] = -1,
+    [TIME_WINDOW] = -1,
+    [CTR_WRAP_TIME] = -1,
+    [REFERENCE_CTR] = 24,
+    [DELIVERED_CTR] = 32,
+    [PERF_LIMITED] = 40,
+    [ENABLE] = -1,
+    [AUTO_SEL_ENABLE] = -1,
+    [AUTO_ACT_WINDOW] = -1,
+    [ENERGY_PERF] = -1,
+    [REFERENCE_PERF] = -1,
+    [LOWEST_FREQ] = 44,
+    [NOMINAL_FREQ] = 48,
+};
+
+typedef struct CpuhzState {
+    SysBusDevice parent_obj;
+
+    MemoryRegion iomem;
+    uint32_t HighestPerformance;
+    uint32_t NominalPerformance;
+    uint32_t LowestNonlinearPerformance;
+    uint32_t LowestPerformance;
+    uint32_t GuaranteedPerformance;
+    uint32_t DesiredPerformance;
+    uint64_t ReferencePerformanceCounter;
+    uint64_t DeliveredPerformanceCounter;
+    uint32_t PerformanceLimited;
+    uint32_t LowestFreq;
+    uint32_t NominalFreq;
+    uint32_t num_cpu;
+    uint32_t reg_size;
+} CpuhzState;
+
+static uint64_t cpufreq_read(void *opaque, hwaddr offset, unsigned size)
+{
+    CpuhzState *s = (CpuhzState *)opaque;
+    uint64_t r;
+    uint64_t n;
+
+    if (offset >= s->num_cpu * CPPC_REG_PER_CPU_STRIDE) {
+        warn_report("cpufreq_read: offset 0x%lx out of range", offset);
+        return 0;
+    }
+
+    n = offset % CPPC_REG_PER_CPU_STRIDE;
+    switch (n) {
+    case 0:
+        r = s->HighestPerformance;
+        break;
+    case 4:
+        r = s->NominalPerformance;
+        break;
+    case 8:
+        r = s->LowestNonlinearPerformance;
+        break;
+    case 12:
+        r = s->LowestPerformance;
+        break;
+    case 16:
+        r = s->GuaranteedPerformance;
+        break;
+    case 20:
+        r = s->DesiredPerformance;
+        break;
+    /*
+     * We don't have real performance counters, and they are hard to
+     * emulate, so always report a counter value of 1 so that Linux uses
+     * the DesiredPerformance value directly.
+     */
+    case 24:
+        r = s->ReferencePerformanceCounter;
+        break;
+    /*
+     * The guest may still access these 64-bit registers with 32-bit reads;
+     * handle the upper halves as well to avoid spurious warnings.
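+     * For example, a 32-bit read of the upper half of REFERENCE_CTR lands
+     * at per-CPU offset 28 (REFERENCE_CTR itself is at 24); DELIVERED_CTR
+     * likewise splits into offsets 32 and 36.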
+ */ + case 28: + r = s->ReferencePerformanceCounter >> 32; + break; + case 32: + r = s->DeliveredPerformanceCounter; + break; + case 36: + r = s->DeliveredPerformanceCounter >> 32; + break; + + case 40: + r = s->PerformanceLimited; + break; + case 44: + r = s->LowestFreq; + break; + case 48: + r = s->NominalFreq; + break; + default: + error_printf("cpufreq_read: Bad offset 0x%lx\n", offset); + r = 0; + break; + } + return r; +} + +static void cpufreq_write(void *opaque, hwaddr offset, + uint64_t value, unsigned size) +{ + CpuhzState *s = CPUFREQ(opaque); + uint64_t n; + + if (offset >= s->num_cpu * CPPC_REG_PER_CPU_STRIDE) { + error_printf("cpufreq_write: offset 0x%lx out of range", offset); + return; + } + + n = offset % CPPC_REG_PER_CPU_STRIDE; + + switch (n) { + case 20: + break; + default: + error_printf("cpufreq_write: Bad offset 0x%lx\n", offset); + } +} + +static uint32_t CPPC_Read(const char *hostpath) +{ + int fd; + char buffer[HZ_MAX_LENGTH] = { 0 }; + uint64_t hz; + int len; + const char *endptr = NULL; + int ret; + + fd = qemu_open_old(hostpath, O_RDONLY); + if (fd < 0) { + return 0; + } + + len = read(fd, buffer, HZ_MAX_LENGTH); + qemu_close(fd); + if (len <= 0) { + return 0; + } + ret = qemu_strtoul(buffer, &endptr, 0, &hz); + if (ret < 0) { + return 0; + } + return (uint32_t)hz; +} + +static const MemoryRegionOps cpufreq_ops = { + .read = cpufreq_read, + .write = cpufreq_write, + .endianness = DEVICE_NATIVE_ENDIAN, +}; + +static void hz_init(CpuhzState *s) +{ + uint32_t hz; + + hz = CPPC_Read(NOMINAL_FREQ_FILE); + if (hz == 0) { + hz = CPPC_Read(CPU_MAX_FREQ_FILE); + if (hz == 0) { + hz = DEFAULT_HZ; + } else { + /* Value in CpuMaxFrequency is in KHz unit; convert to MHz */ + hz = hz / 1000; + } + } + + s->HighestPerformance = hz; + s->NominalPerformance = hz; + s->LowestNonlinearPerformance = hz; + s->LowestPerformance = hz; + s->GuaranteedPerformance = hz; + s->DesiredPerformance = hz; + s->ReferencePerformanceCounter = 1; + s->DeliveredPerformanceCounter = 1; + s->PerformanceLimited = 0; + s->LowestFreq = hz; + s->NominalFreq = hz; +} + +static void cpufreq_init(Object *obj) +{ + SysBusDevice *sbd = SYS_BUS_DEVICE(obj); + CpuhzState *s = CPUFREQ(obj); + + MachineState *ms = MACHINE(qdev_get_machine()); + s->num_cpu = ms->smp.max_cpus; + + s->reg_size = s->num_cpu * CPPC_REG_PER_CPU_STRIDE; + if (s->reg_size > MAX_SUPPORT_SPACE) { + error_report("Required space 0x%x excesses the max support 0x%x", + s->reg_size, MAX_SUPPORT_SPACE); + goto err_end; + } + + memory_region_init_io(&s->iomem, OBJECT(s), &cpufreq_ops, s, "cpufreq", + s->reg_size); + sysbus_init_mmio(sbd, &s->iomem); + hz_init(s); + return; + +err_end: + /* Set desired perf register offset to -1 to indicate no support for CPPC */ + cppc_regs_offset[DESIRED_PERF] = -1; +} + +static const TypeInfo cpufreq_arm_info = { + .name = TYPE_CPUFREQ, + .parent = TYPE_SYS_BUS_DEVICE, + .instance_size = sizeof(CpuhzState), + .instance_init = cpufreq_init, +}; + +static void cpufreq_register_types(void) +{ + type_register_static(&cpufreq_arm_info); +} + +type_init(cpufreq_register_types) diff --git a/hw/acpi/generic_event_device.c b/hw/acpi/generic_event_device.c index a3d31631fe0dba5b797baecbf85432c4a8e7e8e7..755653dc265c7c2242a68ee144052c7b74b47b22 100644 --- a/hw/acpi/generic_event_device.c +++ b/hw/acpi/generic_event_device.c @@ -12,6 +12,7 @@ #include "qemu/osdep.h" #include "qapi/error.h" #include "hw/acpi/acpi.h" +#include "hw/acpi/cpu.h" #include "hw/acpi/generic_event_device.h" #include "hw/irq.h" #include 
"hw/mem/pc-dimm.h" @@ -25,6 +26,7 @@ static const uint32_t ged_supported_events[] = { ACPI_GED_MEM_HOTPLUG_EVT, ACPI_GED_PWR_DOWN_EVT, ACPI_GED_NVDIMM_HOTPLUG_EVT, + ACPI_GED_CPU_HOTPLUG_EVT, }; /* @@ -107,6 +109,10 @@ void build_ged_aml(Aml *table, const char *name, HotplugHandler *hotplug_dev, aml_append(if_ctx, aml_call0(MEMORY_DEVICES_CONTAINER "." MEMORY_SLOT_SCAN_METHOD)); break; + case ACPI_GED_CPU_HOTPLUG_EVT: + aml_append(if_ctx, aml_call0(ACPI_CPU_CONTAINER "." + ACPI_CPU_SCAN_METHOD)); + break; case ACPI_GED_PWR_DOWN_EVT: aml_append(if_ctx, aml_notify(aml_name(ACPI_POWER_BUTTON_DEVICE), @@ -197,9 +203,9 @@ static void ged_regs_write(void *opaque, hwaddr addr, uint64_t data, switch (addr) { case ACPI_GED_REG_SLEEP_CTL: - slp_typ = (data >> 2) & 0x07; - slp_en = (data >> 5) & 0x01; - if (slp_en && slp_typ == 5) { + slp_typ = (data >> ACPI_GED_SLP_TYP_POS) & ACPI_GED_SLP_TYP_MASK; + slp_en = !!(data & ACPI_GED_SLP_EN); + if (slp_en && slp_typ == ACPI_GED_SLP_TYP_S5) { qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN); } return; @@ -234,6 +240,8 @@ static void acpi_ged_device_plug_cb(HotplugHandler *hotplug_dev, } else { acpi_memory_plug_cb(hotplug_dev, &s->memhp_state, dev, errp); } + } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { + acpi_cpu_plug_cb(hotplug_dev, &s->cpuhp_state, dev, errp); } else { error_setg(errp, "virt: device plug request for unsupported device" " type: %s", object_get_typename(OBJECT(dev))); @@ -248,6 +256,8 @@ static void acpi_ged_unplug_request_cb(HotplugHandler *hotplug_dev, if ((object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM) && !(object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM)))) { acpi_memory_unplug_request_cb(hotplug_dev, &s->memhp_state, dev, errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { + acpi_cpu_unplug_request_cb(hotplug_dev, &s->cpuhp_state, dev, errp); } else { error_setg(errp, "acpi: device unplug request for unsupported device" " type: %s", object_get_typename(OBJECT(dev))); @@ -261,6 +271,8 @@ static void acpi_ged_unplug_cb(HotplugHandler *hotplug_dev, if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) { acpi_memory_unplug_cb(&s->memhp_state, dev, errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { + acpi_cpu_unplug_cb(&s->cpuhp_state, dev, errp); } else { error_setg(errp, "acpi: device unplug for unsupported device" " type: %s", object_get_typename(OBJECT(dev))); @@ -272,6 +284,7 @@ static void acpi_ged_ospm_status(AcpiDeviceIf *adev, ACPIOSTInfoList ***list) AcpiGedState *s = ACPI_GED(adev); acpi_memory_ospm_status(&s->memhp_state, list); + acpi_cpu_ospm_status(&s->cpuhp_state, list); } static void acpi_ged_send_event(AcpiDeviceIf *adev, AcpiEventStatusBits ev) @@ -286,6 +299,8 @@ static void acpi_ged_send_event(AcpiDeviceIf *adev, AcpiEventStatusBits ev) sel = ACPI_GED_PWR_DOWN_EVT; } else if (ev & ACPI_NVDIMM_HOTPLUG_STATUS) { sel = ACPI_GED_NVDIMM_HOTPLUG_EVT; + } else if (ev & ACPI_CPU_HOTPLUG_STATUS) { + sel = ACPI_GED_CPU_HOTPLUG_EVT; } else { /* Unknown event. Return without generating interrupt. */ warn_report("GED: Unsupported event %d. 
No irq injected", ev); @@ -318,6 +333,16 @@ static const VMStateDescription vmstate_memhp_state = { } }; +static const VMStateDescription vmstate_cpuhp_state = { + .name = "acpi-ged/cpuhp", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_CPU_HOTPLUG(cpuhp_state, AcpiGedState), + VMSTATE_END_OF_LIST() + } +}; + static const VMStateDescription vmstate_ged_state = { .name = "acpi-ged-state", .version_id = 1, @@ -366,17 +391,55 @@ static const VMStateDescription vmstate_acpi_ged = { }, .subsections = (const VMStateDescription * []) { &vmstate_memhp_state, + &vmstate_cpuhp_state, &vmstate_ghes_state, NULL } }; +static void acpi_ged_realize(DeviceState *dev, Error **errp) +{ + SysBusDevice *sbd = SYS_BUS_DEVICE(dev); + AcpiGedState *s = ACPI_GED(dev); + uint32_t ged_events; + int i; + + ged_events = ctpop32(s->ged_event_bitmap); + + for (i = 0; i < ARRAY_SIZE(ged_supported_events) && ged_events; i++) { + uint32_t event = s->ged_event_bitmap & ged_supported_events[i]; + + if (!event) { + continue; + } + + switch (event) { + case ACPI_GED_CPU_HOTPLUG_EVT: + /* initialize CPU Hotplug related regions */ + memory_region_init(&s->container_cpuhp, OBJECT(dev), + "cpuhp container", + ACPI_CPU_HOTPLUG_REG_LEN); + sysbus_init_mmio(sbd, &s->container_cpuhp); + cpu_hotplug_hw_init(&s->container_cpuhp, OBJECT(dev), + &s->cpuhp_state, 0); + break; + } + ged_events--; + } + + if (ged_events) { + error_report("Unsupported events specified"); + abort(); + } +} + static void acpi_ged_initfn(Object *obj) { DeviceState *dev = DEVICE(obj); AcpiGedState *s = ACPI_GED(dev); SysBusDevice *sbd = SYS_BUS_DEVICE(obj); GEDState *ged_st = &s->ged_state; + MachineClass *mc; memory_region_init_io(&ged_st->evt, obj, &ged_evt_ops, ged_st, TYPE_ACPI_GED, ACPI_GED_EVT_SEL_LEN); @@ -400,6 +463,15 @@ static void acpi_ged_initfn(Object *obj) memory_region_init_io(&ged_st->regs, obj, &ged_regs_ops, ged_st, TYPE_ACPI_GED "-regs", ACPI_GED_REG_COUNT); sysbus_init_mmio(sbd, &ged_st->regs); + + mc = MACHINE_GET_CLASS(qdev_get_machine()); + if (mc->possible_cpu_arch_ids) { + memory_region_init(&s->container_cpuhp, OBJECT(dev), "cpuhp container", + ACPI_CPU_HOTPLUG_REG_LEN); + sysbus_init_mmio(SYS_BUS_DEVICE(dev), &s->container_cpuhp); + cpu_hotplug_hw_init(&s->container_cpuhp, OBJECT(dev), + &s->cpuhp_state, 0); + } } static void acpi_ged_class_init(ObjectClass *class, void *data) @@ -411,6 +483,7 @@ static void acpi_ged_class_init(ObjectClass *class, void *data) dc->desc = "ACPI Generic Event Device"; device_class_set_props(dc, acpi_ged_properties); dc->vmsd = &vmstate_acpi_ged; + dc->realize = acpi_ged_realize; hc->plug = acpi_ged_device_plug_cb; hc->unplug_request = acpi_ged_unplug_request_cb; diff --git a/hw/acpi/meson.build b/hw/acpi/meson.build index fc1b952379a88a74f336e6ae50614ee3c25eb2d1..d36b10ea3c7e9128c23319aed19178c3e2c71ec3 100644 --- a/hw/acpi/meson.build +++ b/hw/acpi/meson.build @@ -27,6 +27,7 @@ acpi_ss.add(when: 'CONFIG_ACPI_ICH9', if_true: files('ich9.c', 'ich9_tco.c')) acpi_ss.add(when: 'CONFIG_ACPI_ERST', if_true: files('erst.c')) acpi_ss.add(when: 'CONFIG_IPMI', if_true: files('ipmi.c'), if_false: files('ipmi-stub.c')) acpi_ss.add(when: 'CONFIG_PC', if_false: files('acpi-x86-stub.c')) +acpi_ss.add(when: 'CONFIG_CPUFREQ', if_true: files('cpufreq.c')) if have_tpm acpi_ss.add(files('tpm.c')) endif diff --git a/hw/arm/Kconfig b/hw/arm/Kconfig index 3ada335a24374d4d543bc14c19b0a0b0bd1505b3..4a0ea0628f99ccc49ede3ffadc7e2aeab6f45478 100644 --- a/hw/arm/Kconfig +++ b/hw/arm/Kconfig @@ 
-8,6 +8,7 @@ config ARM_VIRT
     imply TPM_TIS_SYSBUS
     imply TPM_TIS_I2C
     imply NVDIMM
+    imply IOMMUFD
     select ARM_GIC
     select ACPI
     select ARM_SMMUV3
@@ -29,6 +30,7 @@ config ARM_VIRT
     select ACPI_HW_REDUCED
     select ACPI_APEI
     select ACPI_VIOT
+    select ACPI_CPU_HOTPLUG
     select VIRTIO_MEM_SUPPORTED
     select ACPI_CXL
     select ACPI_HMAT
diff --git a/hw/arm/boot.c b/hw/arm/boot.c
index 84ea6a807a4eddd998cb48934a773ede5146fdf1..9a33601d358c1c3610c7c9a6ae974e66027a3ad4 100644
--- a/hw/arm/boot.c
+++ b/hw/arm/boot.c
@@ -11,6 +11,7 @@
 #include "qemu/datadir.h"
 #include "qemu/error-report.h"
 #include "qapi/error.h"
+#include "qemu/log.h"
 #include <libfdt.h>
 #include "hw/arm/boot.h"
 #include "hw/arm/linux-boot-if.h"
@@ -26,6 +27,7 @@
 #include "qemu/config-file.h"
 #include "qemu/option.h"
 #include "qemu/units.h"
+#include "kvm_arm.h"
 
 /* Kernel boot protocol is specified in the kernel docs
  * Documentation/arm/Booting and Documentation/arm64/booting.txt
@@ -41,6 +43,9 @@
 
 #define BOOTLOADER_MAX_SIZE (4 * KiB)
 
+#define UEFI_MAX_SIZE 0x8000000
+#define UEFI_LOADER_START 0x0
+#define DTB_MAX 0x200000
 
 AddressSpace *arm_boot_address_space(ARMCPU *cpu,
                                      const struct arm_boot_info *info)
 {
@@ -682,7 +687,7 @@ fail:
     return -1;
 }
 
-static void do_cpu_reset(void *opaque)
+void do_cpu_reset(void *opaque)
 {
     ARMCPU *cpu = opaque;
     CPUState *cs = CPU(cpu);
@@ -1141,10 +1146,59 @@ static void arm_setup_direct_kernel_boot(ARMCPU *cpu,
     for (cs = first_cpu; cs; cs = CPU_NEXT(cs)) {
         ARM_CPU(cs)->env.boot_info = info;
     }
+
+    if (kvm_enabled() && virtcca_cvm_enabled()) {
+        if (info->dtb_limit == 0) {
+            info->dtb_limit = info->dtb_start + DTB_MAX;
+        }
+        kvm_load_user_data(info->loader_start, 0x1, info->dtb_start,
+                           info->dtb_limit - info->dtb_start, info->ram_size,
+                           (struct kvm_numa_info *)info->numa_info);
+        tmm_add_ram_region(info->loader_start,
+                           image_high_addr - info->loader_start,
+                           info->initrd_start,
+                           info->dtb_limit - info->initrd_start, true);
+    }
+}
+
+static void arm_setup_confidential_firmware_boot(ARMCPU *cpu,
+                                                 struct arm_boot_info *info,
+                                                 const char *firmware_filename)
+{
+    uint64_t tmi_version = 0;
+    int ret = -1;
+
+    if (kvm_enabled()) {
+        ret = kvm_ioctl(kvm_state, KVM_GET_TMI_VERSION, &tmi_version);
+    }
+    if (ret < 0) {
+        error_report("please check the kernel version!");
+        exit(EXIT_FAILURE);
+    }
+    if (tmi_version < MIN_TMI_VERSION_FOR_UEFI_BOOTED_CVM) {
+        error_report("please check the tmi version!");
+        exit(EXIT_FAILURE);
+    }
+
+    ssize_t fw_size;
+    const char *fname;
+    AddressSpace *as = arm_boot_address_space(cpu, info);
+
+    fname = qemu_find_file(QEMU_FILE_TYPE_BIOS, firmware_filename);
+    if (!fname) {
+        error_report("Could not find firmware image '%s'", firmware_filename);
+        exit(EXIT_FAILURE);
+    }
+
+    fw_size = load_image_targphys_as(firmware_filename,
+                                     info->firmware_base,
+                                     info->firmware_max_size, as);
+
+    if (fw_size <= 0) {
+        error_report("could not load firmware '%s'", firmware_filename);
+        exit(EXIT_FAILURE);
+    }
 }
 
-static void arm_setup_firmware_boot(ARMCPU *cpu, struct arm_boot_info *info)
+static void arm_setup_firmware_boot(ARMCPU *cpu, struct arm_boot_info *info,
+                                    const char *firmware_filename)
 {
+    hwaddr mmio_start, mmio_size;
 
     /* Set up for booting firmware (which might load a kernel via fw_cfg) */
 
     if (have_dtb(info)) {
@@ -1154,6 +1208,8 @@ static void arm_setup_firmware_boot(ARMCPU *cpu, struct arm_boot_info *info)
          * DTB to the base of RAM for the bootloader to pick up.
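+         * For a confidential (virtCCA) guest, the DTB window must also be
+         * registered with the TMM, which is what the tmm_add_ram_region()
+         * call added below does.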
*/ info->dtb_start = info->loader_start; + if (info->confidential) + tmm_add_ram_region(UEFI_LOADER_START, UEFI_MAX_SIZE, info->dtb_start, DTB_MAX , true); } if (info->kernel_filename) { @@ -1194,6 +1250,12 @@ static void arm_setup_firmware_boot(ARMCPU *cpu, struct arm_boot_info *info) } } + if (info->confidential) { + arm_setup_confidential_firmware_boot(cpu, info, firmware_filename); + virtcca_kvm_get_mmio_addr(&mmio_start, &mmio_size); + kvm_load_user_data(info->loader_start, DTB_MAX, mmio_start, mmio_size, info->ram_size, + (struct kvm_numa_info *)info->numa_info); + } /* * We will start from address 0 (typically a boot ROM image) in the * same way as hardware. Leave env->boot_info NULL, so that @@ -1226,19 +1288,60 @@ void arm_load_kernel(ARMCPU *cpu, MachineState *ms, struct arm_boot_info *info) * doesn't support secure. */ assert(!(info->secure_board_setup && kvm_enabled())); + + qemu_log("load the kernel\n"); + info->kernel_filename = ms->kernel_filename; info->kernel_cmdline = ms->kernel_cmdline; info->initrd_filename = ms->initrd_filename; info->dtb_filename = ms->dtb; info->dtb_limit = 0; + if (kvm_enabled() && virtcca_cvm_enabled()) { + info->ram_size = ms->ram_size; + info->numa_info = g_malloc(sizeof(struct kvm_numa_info)); + struct kvm_numa_info *numa_info = (struct kvm_numa_info *) info->numa_info; + if (ms->numa_state != NULL && ms->numa_state->num_nodes > 0) { + numa_info->numa_cnt = ms->numa_state->num_nodes; + uint64_t mem_base = info->loader_start; + for (int64_t i = 0; i < ms->numa_state->num_nodes && i < MAX_NUMA_NODE; i++) { + uint64_t mem_len = ms->numa_state->nodes[i].node_mem; + numa_info->numa_nodes[i].numa_id = i; + numa_info->numa_nodes[i].ipa_start = mem_base; + numa_info->numa_nodes[i].ipa_size = mem_len; + memcpy(numa_info->numa_nodes[i].host_numa_nodes, ms->numa_state->nodes[i].node_memdev->host_nodes, + MAX_NODES / BITS_PER_LONG * sizeof(uint64_t)); + mem_base += mem_len; + } + } else { + numa_info->numa_cnt = 1; + numa_info->numa_nodes[0].numa_id = 0; + numa_info->numa_nodes[0].ipa_start = info->loader_start; + numa_info->numa_nodes[0].ipa_size = info->ram_size; + memset(numa_info->numa_nodes[0].host_numa_nodes, 0, MAX_NODES / BITS_PER_LONG * sizeof(uint64_t)); + } + + for (int cpu_idx = ms->smp.cpus - 1; cpu_idx >= 0; cpu_idx--) { + ARMCPU *armcpu = ARM_CPU(qemu_get_cpu(cpu_idx)); + CPUState *local_cs = CPU(armcpu); + uint64_t node_id = 0; + if (ms->possible_cpus->cpus[local_cs->cpu_index].props.has_node_id) + node_id = ms->possible_cpus->cpus[local_cs->cpu_index].props.node_id; + bitmap_set((unsigned long *)numa_info->numa_nodes[node_id].cpu_id, cpu_idx, 1); + } + } /* Load the kernel. */ if (!info->kernel_filename || info->firmware_loaded) { - arm_setup_firmware_boot(cpu, info); + arm_setup_firmware_boot(cpu, info, ms->firmware); } else { arm_setup_direct_kernel_boot(cpu, info); } + if (kvm_enabled() && virtcca_cvm_enabled()) { + g_free(info->numa_info); + info->numa_info = NULL; + } + /* * Disable the PSCI conduit if it is set up to target the same * or a lower EL than the one we're going to start the guest code in. 
diff --git a/hw/arm/mps2-tz.c b/hw/arm/mps2-tz.c index 668db5ed61962e4bbc2c0a3945e8ba704e16b88d..9d9c263ef8244c2634bf06266efd7d7ff953f1c6 100644 --- a/hw/arm/mps2-tz.c +++ b/hw/arm/mps2-tz.c @@ -435,7 +435,7 @@ static MemoryRegion *make_uart(MPS2TZMachineState *mms, void *opaque, const char *name, hwaddr size, const int *irqs, const PPCExtraData *extradata) { - /* The irq[] array is tx, rx, combined, in that order */ + /* The irq[] array is rx, tx, combined, in that order */ MPS2TZMachineClass *mmc = MPS2TZ_MACHINE_GET_CLASS(mms); CMSDKAPBUART *uart = opaque; int i = uart - &mms->uart[0]; @@ -447,8 +447,8 @@ static MemoryRegion *make_uart(MPS2TZMachineState *mms, void *opaque, qdev_prop_set_uint32(DEVICE(uart), "pclk-frq", mmc->apb_periph_frq); sysbus_realize(SYS_BUS_DEVICE(uart), &error_fatal); s = SYS_BUS_DEVICE(uart); - sysbus_connect_irq(s, 0, get_sse_irq_in(mms, irqs[0])); - sysbus_connect_irq(s, 1, get_sse_irq_in(mms, irqs[1])); + sysbus_connect_irq(s, 0, get_sse_irq_in(mms, irqs[1])); + sysbus_connect_irq(s, 1, get_sse_irq_in(mms, irqs[0])); sysbus_connect_irq(s, 2, qdev_get_gpio_in(orgate_dev, i * 2)); sysbus_connect_irq(s, 3, qdev_get_gpio_in(orgate_dev, i * 2 + 1)); sysbus_connect_irq(s, 4, get_sse_irq_in(mms, irqs[2])); diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c index 9a8ac45431abb80fc5e9f60104248cb356509088..6c4b82757fef510cb6a497da737fa91fa94ebcfe 100644 --- a/hw/arm/smmu-common.c +++ b/hw/arm/smmu-common.c @@ -20,6 +20,7 @@ #include "trace.h" #include "exec/target_page.h" #include "hw/core/cpu.h" +#include "hw/pci/pci_device.h" #include "hw/qdev-properties.h" #include "qapi/error.h" #include "qemu/jhash.h" @@ -75,6 +76,16 @@ SMMUTLBEntry *smmu_iotlb_lookup(SMMUState *bs, SMMUTransCfg *cfg, uint8_t level = 4 - (inputsize - 4) / stride; SMMUTLBEntry *entry = NULL; + /* + * Stage-1 translation with a nested SMMU in general uses HW IOTLB. However, + * KVM still requests for an iommu address space for an MSI fixup by looking + * up stage-1 page table. Make sure we don't go through the emulated pathway + * so that the emulated iotlb will not need any invalidation. + */ + if (bs->nested) { + return NULL; + } + while (level <= 3) { uint64_t subpage_size = 1ULL << level_shift(level, tt->granule_sz); uint64_t mask = subpage_size - 1; @@ -110,6 +121,16 @@ void smmu_iotlb_insert(SMMUState *bs, SMMUTransCfg *cfg, SMMUTLBEntry *new) SMMUIOTLBKey *key = g_new0(SMMUIOTLBKey, 1); uint8_t tg = (new->granule - 10) / 2; + /* + * Stage-1 translation with a nested SMMU in general uses HW IOTLB. However, + * KVM still requests for an iommu address space for an MSI fixup by looking + * up stage-1 page table. Make sure we don't go through the emulated pathway + * so that the emulated iotlb will not need any invalidation. 
+ */ + if (bs->nested) { + return; + } + if (g_hash_table_size(bs->iotlb) >= SMMU_IOTLB_MAX_SIZE) { smmu_iotlb_inv_all(bs); } @@ -569,12 +590,9 @@ SMMUPciBus *smmu_find_smmu_pcibus(SMMUState *s, uint8_t bus_num) return NULL; } -static AddressSpace *smmu_find_add_as(PCIBus *bus, void *opaque, int devfn) +static SMMUPciBus *smmu_get_sbus(SMMUState *s, PCIBus *bus) { - SMMUState *s = opaque; SMMUPciBus *sbus = g_hash_table_lookup(s->smmu_pcibus_by_busptr, bus); - SMMUDevice *sdev; - static unsigned int index; if (!sbus) { sbus = g_malloc0(sizeof(SMMUPciBus) + @@ -583,7 +601,15 @@ static AddressSpace *smmu_find_add_as(PCIBus *bus, void *opaque, int devfn) g_hash_table_insert(s->smmu_pcibus_by_busptr, bus, sbus); } - sdev = sbus->pbdev[devfn]; + return sbus; +} + +static SMMUDevice *smmu_get_sdev(SMMUState *s, SMMUPciBus *sbus, + PCIBus *bus, int devfn) +{ + SMMUDevice *sdev = sbus->pbdev[devfn]; + static unsigned int index; + if (!sdev) { char *name = g_strdup_printf("%s-%d-%d", s->mrtypename, devfn, index++); @@ -596,37 +622,386 @@ static AddressSpace *smmu_find_add_as(PCIBus *bus, void *opaque, int devfn) memory_region_init_iommu(&sdev->iommu, sizeof(sdev->iommu), s->mrtypename, OBJECT(s), name, UINT64_MAX); + if (s->nested) { + address_space_init(&sdev->as_sysmem, &s->root, name); + } address_space_init(&sdev->as, MEMORY_REGION(&sdev->iommu), name); trace_smmu_add_mr(name); g_free(name); } - return &sdev->as; + return sdev; +} + +static AddressSpace *smmu_find_add_as(PCIBus *bus, void *opaque, int devfn) +{ + SMMUState *s = opaque; + SMMUPciBus *sbus = smmu_get_sbus(s, bus); + SMMUDevice *sdev = smmu_get_sdev(s, sbus, bus, devfn); + bool is_vfio = false; + PCIDevice *pdev; + + pdev = pci_find_device(bus, pci_bus_num(bus), devfn); + if (object_dynamic_cast(OBJECT(pdev), "vfio-pci")) { + is_vfio = true; + } + + /* Return the system as if the device uses stage-2 only */ + if (s->nested && !sdev->s1_hwpt && is_vfio) { + return &sdev->as_sysmem; + } else { + return &sdev->as; + } +} + +static bool smmu_dev_attach_viommu(SMMUDevice *sdev, + HostIOMMUDeviceIOMMUFD *idev, Error **errp) +{ + struct iommu_hwpt_arm_smmuv3 bypass_data = { + .ste = { 0x9ULL, 0x0ULL }, //0x1ULL << (108 - 64) }, + }; + struct iommu_hwpt_arm_smmuv3 abort_data = { + .ste = { 0x1ULL, 0x0ULL }, + }; + SMMUState *s = sdev->smmu; + SMMUS2Hwpt *s2_hwpt; + SMMUViommu *viommu; + uint32_t s2_hwpt_id; + + if (s->viommu) { + return host_iommu_device_iommufd_attach_hwpt( + idev, s->viommu->s2_hwpt->hwpt_id, errp); + } + + if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid, idev->ioas_id, + IOMMU_HWPT_ALLOC_NEST_PARENT, + IOMMU_HWPT_DATA_NONE, 0, NULL, + &s2_hwpt_id, NULL, errp)) { + error_setg(errp, "failed to allocate an S2 hwpt"); + return false; + } + + /* Attach to S2 for MSI cookie */ + if (!host_iommu_device_iommufd_attach_hwpt(idev, s2_hwpt_id, errp)) { + error_setg(errp, "failed to attach stage-2 HW pagetable"); + goto free_s2_hwpt; + } + + viommu = g_new0(SMMUViommu, 1); + + viommu->core = iommufd_backend_alloc_viommu(idev->iommufd, idev->devid, + IOMMU_VIOMMU_TYPE_ARM_SMMUV3, + s2_hwpt_id); + if (!viommu->core) { + error_setg(errp, "failed to allocate a viommu"); + goto free_viommu; + } + + if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid, + viommu->core->viommu_id, 0, + IOMMU_HWPT_DATA_ARM_SMMUV3, + sizeof(abort_data), &abort_data, + &viommu->abort_hwpt_id, NULL, errp)) { + error_setg(errp, "failed to allocate an abort pagetable"); + goto free_viommu_core; + } + + if 
(!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid, + viommu->core->viommu_id, 0, + IOMMU_HWPT_DATA_ARM_SMMUV3, + sizeof(bypass_data), &bypass_data, + &viommu->bypass_hwpt_id, NULL, errp)) { + error_setg(errp, "failed to allocate a bypass pagetable"); + goto free_abort_hwpt; + } + + if (!host_iommu_device_iommufd_attach_hwpt( + idev, viommu->bypass_hwpt_id, errp)) { + error_setg(errp, "failed to attach the bypass pagetable"); + goto free_bypass_hwpt; + } + + s2_hwpt = g_new0(SMMUS2Hwpt, 1); + s2_hwpt->iommufd = idev->iommufd; + s2_hwpt->hwpt_id = s2_hwpt_id; + s2_hwpt->ioas_id = idev->ioas_id; + + viommu->iommufd = idev->iommufd; + viommu->s2_hwpt = s2_hwpt; + + s->viommu = viommu; + return true; + +free_bypass_hwpt: + iommufd_backend_free_id(idev->iommufd, viommu->bypass_hwpt_id); +free_abort_hwpt: + iommufd_backend_free_id(idev->iommufd, viommu->abort_hwpt_id); +free_viommu_core: + iommufd_backend_free_id(idev->iommufd, viommu->core->viommu_id); + g_free(viommu->core); +free_viommu: + g_free(viommu); + host_iommu_device_iommufd_attach_hwpt(idev, sdev->idev->ioas_id, errp); +free_s2_hwpt: + iommufd_backend_free_id(idev->iommufd, s2_hwpt_id); + return false; +} + +static bool smmu_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn, + HostIOMMUDevice *hiod, Error **errp) +{ + HostIOMMUDeviceIOMMUFD *idev = HOST_IOMMU_DEVICE_IOMMUFD(hiod); + SMMUState *s = opaque; + SMMUPciBus *sbus = smmu_get_sbus(s, bus); + SMMUDevice *sdev = smmu_get_sdev(s, sbus, bus, devfn); + + if (!s->nested) { + return true; + } + + if (sdev->idev) { + if (sdev->idev != idev) { + return false;//-EEXIST; + } else { + return true; + } + } + + if (!idev) { + return true; + } + + if (!smmu_dev_attach_viommu(sdev, idev, errp)) { + error_report("Unable to attach viommu"); + return false; + } + + sdev->idev = idev; + sdev->viommu = s->viommu; + QLIST_INSERT_HEAD(&s->viommu->device_list, sdev, next); + trace_smmu_set_iommu_device(devfn, smmu_get_sid(sdev)); + + return true; +} + +static void smmu_dev_unset_iommu_device(PCIBus *bus, void *opaque, int devfn) +{ + SMMUVdev *vdev; + SMMUDevice *sdev; + SMMUViommu *viommu; + SMMUState *s = opaque; + SMMUPciBus *sbus = g_hash_table_lookup(s->smmu_pcibus_by_busptr, bus); + + if (!s->nested) { + return; + } + + if (!sbus) { + return; + } + + sdev = sbus->pbdev[devfn]; + if (!sdev) { + return; + } + + if (!host_iommu_device_iommufd_attach_hwpt(sdev->idev, + sdev->idev->ioas_id, NULL)) { + error_report("Unable to attach dev to the default HW pagetable"); + } + + vdev = sdev->vdev; + viommu = sdev->viommu; + + sdev->idev = NULL; + sdev->viommu = NULL; + sdev->vdev = NULL; + QLIST_REMOVE(sdev, next); + trace_smmu_unset_iommu_device(devfn, smmu_get_sid(sdev)); + + if (vdev) { + iommufd_backend_free_id(viommu->iommufd, vdev->core->vdev_id); + g_free(vdev->core); + g_free(vdev); + } + + if (QLIST_EMPTY(&viommu->device_list)) { + iommufd_backend_free_id(viommu->iommufd, viommu->bypass_hwpt_id); + iommufd_backend_free_id(viommu->iommufd, viommu->abort_hwpt_id); + iommufd_backend_free_id(viommu->iommufd, viommu->core->viommu_id); + g_free(viommu->core); + iommufd_backend_free_id(viommu->iommufd, viommu->s2_hwpt->hwpt_id); + g_free(viommu->s2_hwpt); + g_free(viommu); + s->viommu = NULL; + } +} + +static bool smmu_dev_get_pasid_cap(PCIBus *bus, + void *opaque, int devfn) +{ + assert(0 <= devfn && devfn < PCI_DEVFN_MAX); + + return true; } static const PCIIOMMUOps smmu_ops = { .get_address_space = smmu_find_add_as, + .set_iommu_device = smmu_dev_set_iommu_device, + 
.unset_iommu_device = smmu_dev_unset_iommu_device, + .get_pasid_cap = smmu_dev_get_pasid_cap, }; -IOMMUMemoryRegion *smmu_iommu_mr(SMMUState *s, uint32_t sid) +SMMUDevice *smmu_find_sdev(SMMUState *s, uint32_t sid) { uint8_t bus_n, devfn; SMMUPciBus *smmu_bus; - SMMUDevice *smmu; bus_n = PCI_BUS_NUM(sid); smmu_bus = smmu_find_smmu_pcibus(s, bus_n); if (smmu_bus) { devfn = SMMU_PCI_DEVFN(sid); - smmu = smmu_bus->pbdev[devfn]; - if (smmu) { - return &smmu->iommu; - } + return smmu_bus->pbdev[devfn]; } return NULL; } +/* IOMMUFD helpers */ +int smmu_dev_get_info(SMMUDevice *sdev, uint32_t *data_type, + uint32_t data_len, uint8_t *pasid, void *data) +{ + uint64_t caps; + + if (!sdev || !sdev->idev) { + return -ENOENT; + } + + return !iommufd_backend_get_device_info(sdev->idev->iommufd, + sdev->idev->devid, data_type, data, + data_len, &caps, pasid, NULL); +} + +void smmu_dev_uninstall_nested_ste(SMMUDevice *sdev, bool abort) +{ + HostIOMMUDeviceIOMMUFD *idev = sdev->idev; + SMMUS1Hwpt *s1_hwpt = sdev->s1_hwpt; + uint32_t hwpt_id; + + if (!s1_hwpt || !sdev->viommu) { + return; + } + + if (abort) { + hwpt_id = sdev->viommu->abort_hwpt_id; + } else { + hwpt_id = sdev->viommu->bypass_hwpt_id; + } + + /* ToDo: May be better to move the below to smmuv3. */ + if (s1_hwpt->out_fault_fd) { + struct io_uring *ring = &s1_hwpt->fault_ring; + struct io_uring_sqe *sqe; + struct __kernel_timespec ts = {.tv_sec = 0, .tv_nsec = 1}; + + s1_hwpt->exiting = true; + /* Send out a timeout sqe for the read handler to exit */ + sqe = io_uring_get_sqe(ring); + io_uring_prep_timeout(sqe, &ts, 0, 0); + io_uring_submit(ring); + + qemu_cond_signal(&s1_hwpt->fault_cond); + qemu_thread_join(&s1_hwpt->read_fault_thread); + qemu_thread_join(&s1_hwpt->write_fault_thread); + qemu_mutex_destroy(&s1_hwpt->fault_mutex); + io_uring_queue_exit(&s1_hwpt->fault_ring); + } + + if (!host_iommu_device_iommufd_attach_hwpt(idev, hwpt_id, NULL)) { + return; + } + + iommufd_backend_free_id(idev->iommufd, s1_hwpt->hwpt_id); + sdev->s1_hwpt = NULL; + g_free(s1_hwpt); +} + +int smmu_dev_install_nested_ste(SMMUDevice *sdev, uint32_t data_type, + uint32_t data_len, void *data, + bool req_fault_fd) +{ + SMMUViommu *viommu = sdev->viommu; + SMMUS1Hwpt *s1_hwpt = sdev->s1_hwpt; + HostIOMMUDeviceIOMMUFD *idev = sdev->idev; + uint32_t flags = 0; + + if (!idev || !viommu) { + return -ENOENT; + } + + if (s1_hwpt) { + smmu_dev_uninstall_nested_ste(sdev, false); + } + + s1_hwpt = g_new0(SMMUS1Hwpt, 1); + if (!s1_hwpt) { + return -ENOMEM; + } + + s1_hwpt->smmu = sdev->smmu; + s1_hwpt->sdev = sdev; + s1_hwpt->viommu = viommu; + s1_hwpt->iommufd = idev->iommufd; + + if (req_fault_fd) { + flags |= IOMMU_HWPT_FAULT_ID_VALID; + } + + if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid, + viommu->core->viommu_id, flags, data_type, + data_len, data, &s1_hwpt->hwpt_id, + &s1_hwpt->out_fault_fd, NULL)) { + goto free; + } + + if (!host_iommu_device_iommufd_attach_hwpt(idev, s1_hwpt->hwpt_id, NULL)) { + goto free_hwpt; + } + + sdev->s1_hwpt = s1_hwpt; + + return 0; +free_hwpt: + iommufd_backend_free_id(idev->iommufd, s1_hwpt->hwpt_id); +free: + sdev->s1_hwpt = NULL; + g_free(s1_hwpt); + + return -EINVAL; +} + +int smmu_hwpt_invalidate_cache(SMMUS1Hwpt *s1_hwpt, uint32_t type, uint32_t len, + uint32_t *num, void *reqs) +{ + if (!s1_hwpt) { + return -ENOENT; + } + + return iommufd_backend_invalidate_cache(s1_hwpt->iommufd, s1_hwpt->hwpt_id, + type, len, num, reqs); +} + +int smmu_viommu_invalidate_cache(IOMMUFDViommu *viommu, uint32_t type, + uint32_t len, 
uint32_t *num, void *reqs)
+{
+    if (!viommu) {
+        return -ENOENT;
+    }
+
+    return iommufd_viommu_invalidate_cache(viommu->iommufd, viommu->viommu_id,
+                                           type, len, num, reqs);
+}
+
 /* Unmap all notifiers attached to @mr */
 static void smmu_inv_notifiers_mr(IOMMUMemoryRegion *mr)
 {
@@ -664,6 +1039,14 @@ static void smmu_base_realize(DeviceState *dev, Error **errp)
                                             g_free, g_free);
     s->smmu_pcibus_by_busptr = g_hash_table_new(NULL, NULL);
 
+    if (s->nested) {
+        memory_region_init(&s->root, OBJECT(s), "root", UINT64_MAX);
+        memory_region_init_alias(&s->sysmem, OBJECT(s),
+                                 "smmu-sysmem", get_system_memory(), 0,
+                                 memory_region_size(get_system_memory()));
+        memory_region_add_subregion(&s->root, 0, &s->sysmem);
+    }
+
     if (s->primary_bus) {
         pci_setup_iommu(s->primary_bus, &smmu_ops, s);
     } else {
@@ -683,6 +1066,7 @@ static Property smmu_dev_properties[] = {
     DEFINE_PROP_UINT8("bus_num", SMMUState, bus_num, 0),
     DEFINE_PROP_LINK("primary-bus", SMMUState, primary_bus,
                      TYPE_PCI_BUS, PCIBus *),
+    DEFINE_PROP_BOOL("nested", SMMUState, nested, false),
     DEFINE_PROP_END_OF_LIST(),
 };
diff --git a/hw/arm/smmu-internal.h b/hw/arm/smmu-internal.h
index 843bebb185d77f44681033db54dd0c5196c7459d..5a81dd1b82af4814710f5aa9fde2907062749940 100644
--- a/hw/arm/smmu-internal.h
+++ b/hw/arm/smmu-internal.h
@@ -142,6 +142,7 @@ typedef struct SMMUIOTLBPageInvInfo {
 } SMMUIOTLBPageInvInfo;
 
 typedef struct SMMUSIDRange {
+    SMMUState *state;
     uint32_t start;
     uint32_t end;
 } SMMUSIDRange;
diff --git a/hw/arm/smmuv3-internal.h b/hw/arm/smmuv3-internal.h
index 6076025ad6a0f9031c5c03f06dc13a7ad1b6cd7b..cfc04c563e0229f6bc3f1ee33f3d942827172551 100644
--- a/hw/arm/smmuv3-internal.h
+++ b/hw/arm/smmuv3-internal.h
@@ -74,6 +74,7 @@ REG32(IDR1, 0x4)
     FIELD(IDR1, ECMDQ, 31, 1)
 
 #define SMMU_IDR1_SIDSIZE 16
+#define SMMU_IDR1_SSIDSIZE 16
 #define SMMU_CMDQS   19
 #define SMMU_EVENTQS 19
 
@@ -104,7 +105,7 @@ REG32(IDR5, 0x14)
      FIELD(IDR5, VAX, 10, 2);
      FIELD(IDR5, STALL_MAX, 16, 16);
 
-#define SMMU_IDR5_OAS 4
+#define SMMU_IDR5_OAS 5
 
 REG32(IIDR, 0x18)
 REG32(AIDR, 0x1c)
@@ -226,6 +227,19 @@ static inline bool smmuv3_gerror_irq_enabled(SMMUv3State *s)
 #define Q_CONS_WRAP(q) (((q)->cons & WRAP_MASK(q)) >> (q)->log2size)
 #define Q_PROD_WRAP(q) (((q)->prod & WRAP_MASK(q)) >> (q)->log2size)
 
+#define Q_IDX(llq, p) ((p) & ((1 << (llq)->max_n_shift) - 1))
+
+static inline int smmuv3_q_ncmds(SMMUQueue *q)
+{
+    uint32_t prod = Q_PROD(q);
+    uint32_t cons = Q_CONS(q);
+
+    if (Q_PROD_WRAP(q) == Q_CONS_WRAP(q)) {
+        return prod - cons;
+    } else {
+        return WRAP_MASK(q) - cons + prod;
+    }
+}
+
 static inline bool smmuv3_q_full(SMMUQueue *q)
 {
     return ((q->cons ^ q->prod) & WRAP_INDEX_MASK(q)) == WRAP_MASK(q);
@@ -552,6 +566,7 @@ typedef struct CD {
 
 #define STE_S1FMT(x)       extract32((x)->word[0], 4 , 2)
 #define STE_S1CDMAX(x)     extract32((x)->word[1], 27, 5)
+#define STE_S1DSS(x)       extract32((x)->word[2], 0, 2)
 #define STE_S1STALLD(x)    extract32((x)->word[2], 27, 1)
 #define STE_EATS(x)        extract32((x)->word[2], 28, 2)
 #define STE_STRW(x)        extract32((x)->word[2], 30, 2)
diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
index c3871ae067ffb74a315025a8f54513e6e81b2a11..c0fcdd75746e1df006365a9091cfd1bc4bc6a320 100644
--- a/hw/arm/smmuv3.c
+++ b/hw/arm/smmuv3.c
@@ -24,6 +24,7 @@
 #include "hw/qdev-properties.h"
 #include "hw/qdev-core.h"
 #include "hw/pci/pci.h"
+#include "hw/pci/pci_bridge.h"
 #include "cpu.h"
 #include "trace.h"
 #include "qemu/log.h"
@@ -33,6 +34,9 @@
 #include "hw/arm/smmuv3.h"
 #include "smmuv3-internal.h"
 #include "smmu-internal.h"
+#ifdef CONFIG_LINUX_IO_URING
+#include <liburing.h>
+#endif
 
 #define
PTW_RECORD_FAULT(cfg) (((cfg)->stage == 1) ? (cfg)->record_faults : \ (cfg)->s2cfg.record_faults) @@ -254,6 +258,81 @@ void smmuv3_record_event(SMMUv3State *s, SMMUEventInfo *info) info->recorded = true; } +static void smmuv3_nested_init_regs(SMMUv3State *s) +{ + SMMUState *bs = ARM_SMMU(s); + SMMUDevice *sdev; + uint32_t data_type; + uint32_t val; + uint8_t pasid; + int ret; + + if (!bs->nested || !bs->viommu) { + return; + } + + sdev = QLIST_FIRST(&bs->viommu->device_list); + if (!sdev) { + return; + } + + if (sdev->info.idr[0]) { + error_report("reusing the previous hw_info"); + goto out; + } + + ret = smmu_dev_get_info(sdev, &data_type, sizeof(sdev->info), &pasid, + &sdev->info); + if (ret) { + error_report("failed to get SMMU device info"); + return; + } + + if (data_type != IOMMU_HW_INFO_TYPE_ARM_SMMUV3) { + error_report( "Wrong data type (%d)!", data_type); + return; + } + +out: + trace_smmuv3_get_device_info(sdev->info.idr[0], sdev->info.idr[1], + sdev->info.idr[3], sdev->info.idr[5]); + + val = FIELD_EX32(sdev->info.idr[0], IDR0, BTM); + s->idr[0] = FIELD_DP32(s->idr[0], IDR0, BTM, val); + val = FIELD_EX32(sdev->info.idr[0], IDR0, ATS); + s->idr[0] = FIELD_DP32(s->idr[0], IDR0, ATS, val); + val = FIELD_EX32(sdev->info.idr[0], IDR0, ASID16); + s->idr[0] = FIELD_DP32(s->idr[0], IDR0, ASID16, val); + val = FIELD_EX32(sdev->info.idr[0], IDR0, TERM_MODEL); + s->idr[0] = FIELD_DP32(s->idr[0], IDR0, TERM_MODEL, val); + val = FIELD_EX32(sdev->info.idr[0], IDR0, STALL_MODEL); + s->idr[0] = FIELD_DP32(s->idr[0], IDR0, STALL_MODEL, val); + val = FIELD_EX32(sdev->info.idr[0], IDR0, STLEVEL); + s->idr[0] = FIELD_DP32(s->idr[0], IDR0, STLEVEL, val); + + val = FIELD_EX32(sdev->info.idr[1], IDR1, SIDSIZE); + s->idr[1] = FIELD_DP32(s->idr[1], IDR1, SIDSIZE, val); + s->idr[1] = FIELD_DP32(s->idr[1], IDR1, SSIDSIZE, pasid); + + val = FIELD_EX32(sdev->info.idr[3], IDR3, HAD); + s->idr[3] = FIELD_DP32(s->idr[3], IDR3, HAD, val); + val = FIELD_EX32(sdev->info.idr[3], IDR3, RIL); + s->idr[3] = FIELD_DP32(s->idr[3], IDR3, RIL, val); + val = FIELD_EX32(sdev->info.idr[3], IDR3, BBML); + s->idr[3] = FIELD_DP32(s->idr[3], IDR3, BBML, val); + + val = FIELD_EX32(sdev->info.idr[5], IDR5, GRAN4K); + s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN4K, val); + val = FIELD_EX32(sdev->info.idr[5], IDR5, GRAN16K); + s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN16K, val); + val = FIELD_EX32(sdev->info.idr[5], IDR5, GRAN64K); + s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN64K, val); + val = FIELD_EX32(sdev->info.idr[5], IDR5, OAS); + s->idr[5] = FIELD_DP32(s->idr[5], IDR5, OAS, val); + + /* FIXME check iidr and aidr registrs too */ +} + static void smmuv3_init_regs(SMMUv3State *s) { /* Based on sys property, the stages supported in smmu will be advertised.*/ @@ -268,13 +347,14 @@ static void smmuv3_init_regs(SMMUv3State *s) s->idr[0] = FIELD_DP32(s->idr[0], IDR0, ASID16, 1); /* 16-bit ASID */ s->idr[0] = FIELD_DP32(s->idr[0], IDR0, VMID16, 1); /* 16-bit VMID */ s->idr[0] = FIELD_DP32(s->idr[0], IDR0, TTENDIAN, 2); /* little endian */ - s->idr[0] = FIELD_DP32(s->idr[0], IDR0, STALL_MODEL, 1); /* No stall */ + s->idr[0] = FIELD_DP32(s->idr[0], IDR0, STALL_MODEL, 0); /* stall */ /* terminated transaction will always be aborted/error returned */ s->idr[0] = FIELD_DP32(s->idr[0], IDR0, TERM_MODEL, 1); /* 2-level stream table supported */ s->idr[0] = FIELD_DP32(s->idr[0], IDR0, STLEVEL, 1); s->idr[1] = FIELD_DP32(s->idr[1], IDR1, SIDSIZE, SMMU_IDR1_SIDSIZE); + s->idr[1] = FIELD_DP32(s->idr[1], IDR1, SSIDSIZE, SMMU_IDR1_SSIDSIZE); 
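+    /*
+     * The SSIDSIZE advertised above exposes 16-bit substream IDs (PASIDs)
+     * to the guest; decode_ste() below relies on it to accept STEs with a
+     * non-zero S1CDMAX, i.e. multiple context descriptors per stream.
+     */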
 s->idr[1] = FIELD_DP32(s->idr[1], IDR1, EVENTQS, SMMU_EVENTQS);
     s->idr[1] = FIELD_DP32(s->idr[1], IDR1, CMDQS,   SMMU_CMDQS);
 
@@ -286,12 +366,15 @@ static void smmuv3_init_regs(SMMUv3State *s)
     s->idr[3] = FIELD_DP32(s->idr[3], IDR3, RIL, 1);
     s->idr[3] = FIELD_DP32(s->idr[3], IDR3, BBML, 2);
 
-    s->idr[5] = FIELD_DP32(s->idr[5], IDR5, OAS, SMMU_IDR5_OAS); /* 44 bits */
+    s->idr[5] = FIELD_DP32(s->idr[5], IDR5, OAS, SMMU_IDR5_OAS); /* 48 bits */
     /* 4K, 16K and 64K granule support */
     s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN4K, 1);
     s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN16K, 1);
     s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN64K, 1);
 
+    /* Override IDR fields with HW caps */
+    smmuv3_nested_init_regs(s);
+
     s->cmdq.base = deposit64(s->cmdq.base, 0, 5, SMMU_CMDQS);
     s->cmdq.prod = 0;
     s->cmdq.cons = 0;
@@ -486,6 +569,27 @@ bad_ste:
     return -EINVAL;
 }
 
+static void decode_ste_config(SMMUTransCfg *cfg, uint32_t config)
+{
+    if (STE_CFG_ABORT(config)) {
+        cfg->aborted = true;
+        return;
+    }
+    if (STE_CFG_BYPASS(config)) {
+        cfg->bypassed = true;
+        return;
+    }
+
+    if (STE_CFG_S1_ENABLED(config)) {
+        cfg->stage = SMMU_STAGE_1;
+    }
+
+    if (STE_CFG_S2_ENABLED(config)) {
+        cfg->stage |= SMMU_STAGE_2;
+    }
+}
+
 /* Returns < 0 in case of invalid STE, 0 otherwise */
 static int decode_ste(SMMUv3State *s, SMMUTransCfg *cfg,
                       STE *ste, SMMUEventInfo *event)
@@ -502,12 +606,19 @@ static int decode_ste(SMMUv3State *s, SMMUTransCfg *cfg,
 
     config = STE_CONFIG(ste);
 
-    if (STE_CFG_ABORT(config)) {
+    decode_ste_config(cfg, config);
+
+    /* S1DSS.Terminate is the same as Config.abort for the default stream */
+    if (STE_CFG_S1_ENABLED(config) && STE_S1DSS(ste) == 0) {
         cfg->aborted = true;
+    }
+
+    if (cfg->aborted || cfg->bypassed) {
         return 0;
     }
 
-    if (STE_CFG_BYPASS(config)) {
+    /* S1DSS.Bypass is the same as Config.bypass for the default stream */
+    if (STE_CFG_S1_ENABLED(config) && STE_S1DSS(ste) == 0x1) {
         cfg->bypassed = true;
         return 0;
     }
@@ -545,13 +656,14 @@ static int decode_ste(SMMUv3State *s, SMMUTransCfg *cfg,
         }
     }
 
-    if (STE_S1CDMAX(ste) != 0) {
+    if (!FIELD_EX32(s->idr[1], IDR1, SSIDSIZE) && STE_S1CDMAX(ste) != 0) {
         qemu_log_mask(LOG_UNIMP,
                       "SMMUv3 does not support multiple context descriptors yet\n");
         goto bad_ste;
     }
 
-    if (STE_S1STALLD(ste)) {
+    /* STALL_MODEL being 0b01 means "stall is not supported" */
+    if ((FIELD_EX32(s->idr[0], IDR0, STALL_MODEL) & 0x1) && STE_S1STALLD(ste)) {
         qemu_log_mask(LOG_UNIMP,
                       "SMMUv3 S1 stalling fault model not allowed yet\n");
         goto bad_ste;
@@ -669,9 +781,6 @@ static int decode_cd(SMMUTransCfg *cfg, CD *cd, SMMUEventInfo *event)
     if (!CD_A(cd)) {
         goto bad_cd; /* SMMU_IDR0.TERM_MODEL == 1 */
     }
-    if (CD_S(cd)) {
-        goto bad_cd; /* !STE_SECURE && SMMU_IDR0.STALL_MODEL == 1 */
-    }
     if (CD_HA(cd) || CD_HD(cd)) {
         goto bad_cd; /* HTTU = 0 */
     }
@@ -1153,30 +1262,352 @@ static void smmuv3_range_inval(SMMUState *s, Cmd *cmd)
     }
 }
 
+static void smmuv3_report_iommu_fault(SMMUS1Hwpt *hwpt,
+                                      struct iommu_hwpt_pgfault *fault)
+{
+    PendFaultEntry *pend;
+    SMMUDevice *sdev = hwpt->sdev;
+    SMMUv3State *s3 = sdev->smmu;
+    uint32_t sid = smmu_get_sid(sdev);
+    SMMUEventInfo info = {0};
+
+    info.sid = sid;
+    info.type = SMMU_EVT_F_TRANSLATION;
+    info.u.f_translation.addr = fault->addr;
+    info.u.f_translation.stall = true;
+    info.u.f_translation.ssid = fault->pasid;
+    info.u.f_translation.stag = fault->grpid;
+
+    if (fault->flags & IOMMU_PGFAULT_FLAGS_PASID_VALID) {
+        info.u.f_translation.ssv = true;
+    }
+    if (fault->perm & IOMMU_PGFAULT_PERM_READ) {
+        info.u.f_translation.rnw = true;
+    }
+    if (fault->perm &
IOMMU_PGFAULT_PERM_PRIV) { + info.u.f_translation.pnu = true; + } + if (fault->perm & IOMMU_PGFAULT_PERM_EXEC) { + info.u.f_translation.ind = true; + } + + pend = g_new0(PendFaultEntry, 1); + memcpy(&pend->fault, fault, sizeof(*fault)); + qemu_mutex_lock(&hwpt->fault_mutex); + QTAILQ_INSERT_TAIL(&hwpt->pendfault, pend, entry); + qemu_mutex_unlock(&hwpt->fault_mutex); + smmuv3_record_event(s3, &info); +} + +static void smmuv3_notify_stall_resume(SMMUState *bs, uint32_t sid, + uint32_t stag, uint32_t code) +{ + SMMUDevice *sdev = smmu_find_sdev(bs, sid); + PageRespEntry *msg; + PendFaultEntry *pend, *tmp; + SMMUS1Hwpt *hwpt; + bool found = false; + + if (!sdev) { + return; + } + + hwpt = sdev->s1_hwpt; + msg = g_new0(PageRespEntry, 1); + + /* Kernel expects addr and pasid info for page response */ + qemu_mutex_lock(&hwpt->fault_mutex); + QTAILQ_FOREACH_SAFE(pend, &hwpt->pendfault, entry, tmp) { + if (pend->fault.grpid == stag) { + QTAILQ_REMOVE(&hwpt->pendfault, pend, entry); + msg->resp.cookie = pend->fault.cookie; + msg->resp.code = code; + QTAILQ_INSERT_TAIL(&hwpt->pageresp, msg, entry); + qemu_cond_signal(&hwpt->fault_cond); + + g_free(pend); + found = true; + break; + } + } + + qemu_mutex_unlock(&hwpt->fault_mutex); + if (!found) { + warn_report("No matching fault for resume(stag 0x%x), dropping it", stag); + g_free(msg); + } +} + +static void *write_fault_handler(void *opaque) +{ + SMMUS1Hwpt *hwpt = opaque; + PageRespEntry *msg, *tmp; + struct iommu_hwpt_page_response *resp; + int ret; + + resp = g_new0(struct iommu_hwpt_page_response, 1); + while (!hwpt->exiting) { + /* Check whether we have any pending responses */ + qemu_mutex_lock(&hwpt->fault_mutex); + qemu_cond_wait(&hwpt->fault_cond, &hwpt->fault_mutex); + QTAILQ_FOREACH_SAFE(msg, &hwpt->pageresp, entry, tmp) { + QTAILQ_REMOVE(&hwpt->pageresp, msg, entry); + memcpy(resp, &msg->resp, sizeof(*resp)); + g_free(msg); + + ret = write(hwpt->out_fault_fd, resp, sizeof(*resp)); + if (ret != sizeof(*resp)) { + warn_report("Write resp[cookie 0x%x] failed: %d", + resp->cookie, ret); + } + } + qemu_mutex_unlock(&hwpt->fault_mutex); + } + g_free(resp); + return NULL; +} + +static void *read_fault_handler(void *opaque) +{ + SMMUS1Hwpt *hwpt = opaque; + struct io_uring_sqe *sqe; + struct io_uring_cqe *cqe; + struct iommu_hwpt_pgfault *fault; + struct io_uring *ring = &hwpt->fault_ring; + void *data; + int ret; + + fault = g_new0(struct iommu_hwpt_pgfault, 1); + while (!hwpt->exiting) { + sqe = io_uring_get_sqe(ring); + io_uring_prep_read(sqe, hwpt->out_fault_fd, fault, + sizeof(*fault), 0); + io_uring_sqe_set_data(sqe, fault); + io_uring_submit(ring); + + ret = io_uring_wait_cqe(ring, &cqe); + if (ret == 0) { + if (cqe->res == sizeof(*fault)) { + data = io_uring_cqe_get_data(cqe); + smmuv3_report_iommu_fault(hwpt, data); + } + io_uring_cqe_seen(ring, cqe); + } else { + warn_report("Read fault[hwpt_id 0x%x] failed: %d", + hwpt->hwpt_id, ret); + } + } + g_free(fault); + return NULL; +} + +static void create_fault_handlers(SMMUS1Hwpt *hwpt) +{ + if (!hwpt->out_fault_fd) { + warn_report("No fault fd for hwpt id: %d", hwpt->hwpt_id); + return; + } + + io_uring_queue_init(1024, &hwpt->fault_ring, 0); + qemu_mutex_init(&hwpt->fault_mutex); + qemu_cond_init(&hwpt->fault_cond); + QTAILQ_INIT(&hwpt->pageresp); + QTAILQ_INIT(&hwpt->pendfault); + qemu_thread_create(&hwpt->read_fault_thread, "io fault read", + read_fault_handler, + hwpt, QEMU_THREAD_JOINABLE); + qemu_thread_create(&hwpt->write_fault_thread, "io fault write", + write_fault_handler, + hwpt,
QEMU_THREAD_JOINABLE); +} + +static void smmuv3_install_nested_ste(SMMUDevice *sdev, int sid) +{ +#ifdef __linux__ + SMMUEventInfo event = {.type = SMMU_EVT_NONE, .sid = sid, + .inval_ste_allowed = true}; + struct iommu_hwpt_arm_smmuv3 nested_data = {}; + SMMUv3State *s = sdev->smmu; + SMMUState *bs = &s->smmu_state; + bool req_fault_fd = false; + uint32_t config; + STE ste; + int ret; + + if (!sdev->viommu || !bs->nested) { + return; + } + + if (!sdev->vdev && sdev->idev && sdev->viommu) { + SMMUVdev *vdev = g_new0(SMMUVdev, 1); + vdev->core = iommufd_backend_alloc_vdev(sdev->idev, sdev->viommu->core, + sid); + if (!vdev->core) { + error_report("failed to allocate a vDEVICE"); + g_free(vdev); + return; + } + sdev->vdev = vdev; + } + + ret = smmu_find_ste(sdev->smmu, sid, &ste, &event); + if (ret) { + /* + * For a 2-level Stream Table, the level-2 table might not be ready + * until the device gets inserted into the stream table. Ignore this. + */ + return; + } + + config = STE_CONFIG(&ste); + if (!STE_VALID(&ste) || !STE_CFG_S1_ENABLED(config)) { + smmu_dev_uninstall_nested_ste(sdev, STE_CFG_ABORT(config)); + smmuv3_flush_config(sdev); + return; + } + + nested_data.ste[0] = (uint64_t)ste.word[0] | (uint64_t)ste.word[1] << 32; + nested_data.ste[1] = (uint64_t)ste.word[2] | (uint64_t)ste.word[3] << 32; + /* V | CONFIG | S1FMT | S1CTXPTR | S1CDMAX */ + nested_data.ste[0] &= 0xf80fffffffffffffULL; + /* S1DSS | S1CIR | S1COR | S1CSH | S1STALLD | EATS */ + nested_data.ste[1] &= 0x380000ffULL; + + if (STE_S1CDMAX(&ste)) { + req_fault_fd = true; + } + + ret = smmu_dev_install_nested_ste(sdev, IOMMU_HWPT_DATA_ARM_SMMUV3, + sizeof(nested_data), &nested_data, + req_fault_fd); + if (ret) { + error_report("Unable to install nested STE=0x%016"PRIx64":0x%016"PRIx64", ret=%d", + nested_data.ste[1], nested_data.ste[0], ret); + } + + if (req_fault_fd) { + create_fault_handlers(sdev->s1_hwpt); + } + + trace_smmuv3_install_nested_ste(sid, nested_data.ste[1], nested_data.ste[0]); +#endif +} + static gboolean -smmuv3_invalidate_ste(gpointer key, gpointer value, gpointer user_data) +_smmuv3_invalidate_ste(SMMUDevice *sdev, SMMUSIDRange *sid_range) { - SMMUDevice *sdev = (SMMUDevice *)key; uint32_t sid = smmu_get_sid(sdev); - SMMUSIDRange *sid_range = (SMMUSIDRange *)user_data; if (sid < sid_range->start || sid > sid_range->end) { return false; } + smmuv3_flush_config(sdev); + smmuv3_install_nested_ste(sdev, sid); trace_smmuv3_config_cache_inv(sid); return true; } +static gboolean +smmuv3_invalidate_ste(gpointer key, gpointer value, gpointer user_data) +{ + return _smmuv3_invalidate_ste((SMMUDevice *)key, (SMMUSIDRange *)user_data); +} + +static void smmuv3_invalidate_nested_ste(SMMUSIDRange *sid_range) +{ + SMMUState *bs = sid_range->state; + SMMUDevice *sdev; + + if (!bs->viommu) { + return; + } + + QLIST_FOREACH(sdev, &bs->viommu->device_list, next) { + if (smmu_get_sid(sdev)) { + _smmuv3_invalidate_ste(sdev, sid_range); + } + } +} + +/** + * SMMUCommandBatch - batch of commands to issue for nested SMMU invalidation + * @cmds: Pointer to list of commands + * @cons: Pointer to list of CONS corresponding to the commands + * @ncmds: Total number of commands in the batch + * @dev_cache: Issue to a device cache + */ +typedef struct SMMUCommandBatch { + Cmd *cmds; + uint32_t *cons; + uint32_t ncmds; + bool dev_cache; +} SMMUCommandBatch; + +/* Update batch->ncmds to the number of executed cmds */ +static int smmuv3_issue_cmd_batch(SMMUState *bs, SMMUCommandBatch *batch) +{ + uint32_t total = batch->ncmds; + int ret; + + ret =
smmu_viommu_invalidate_cache(bs->viommu->core, + IOMMU_VIOMMU_INVALIDATE_DATA_ARM_SMMUV3, + sizeof(Cmd), &batch->ncmds, batch->cmds); + if (total != batch->ncmds) { + error_report("%s failed: ret=%d, total=%d, done=%d", + __func__, ret, total, batch->ncmds); + return ret; + } + + batch->ncmds = 0; + batch->dev_cache = false; + return ret; +} + +static int smmuv3_batch_cmds(SMMUState *bs, SMMUCommandBatch *batch, + Cmd *cmd, uint32_t *cons, bool dev_cache) +{ + int ret; + + if (!bs->nested || !bs->viommu) { + return 0; + } + + /* + * Currently separate dev_cache and hwpt for safety, which might not be + * necessary if underlying HW SMMU does not have the errata. + * + * TODO check IIDR register values read from hw_info. + */ + if (batch->ncmds && (dev_cache != batch->dev_cache)) { + ret = smmuv3_issue_cmd_batch(bs, batch); + if (ret) { + *cons = batch->cons[batch->ncmds]; + return ret; + } + } + batch->dev_cache = dev_cache; + batch->cmds[batch->ncmds] = *cmd; + batch->cons[batch->ncmds++] = *cons; + return 0; +} + static int smmuv3_cmdq_consume(SMMUv3State *s) { SMMUState *bs = ARM_SMMU(s); SMMUCmdError cmd_error = SMMU_CERROR_NONE; SMMUQueue *q = &s->cmdq; SMMUCommandType type = 0; + SMMUCommandBatch batch = {}; + uint32_t ncmds = 0; if (!smmuv3_cmdq_enabled(s)) { return 0; } + + ncmds = smmuv3_q_ncmds(q); + batch.cmds = g_new0(Cmd, ncmds); + batch.cons = g_new0(uint32_t, ncmds); + /* * some commands depend on register values, typically CR0. In case those * register values change while handling the command, spec says it @@ -1217,21 +1648,20 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) case SMMU_CMD_CFGI_STE: { uint32_t sid = CMD_SID(&cmd); - IOMMUMemoryRegion *mr = smmu_iommu_mr(bs, sid); - SMMUDevice *sdev; + SMMUDevice *sdev = smmu_find_sdev(bs, sid); if (CMD_SSEC(&cmd)) { cmd_error = SMMU_CERROR_ILL; break; } - if (!mr) { + if (!sdev) { break; } trace_smmuv3_cmdq_cfgi_ste(sid); - sdev = container_of(mr, SMMUDevice, iommu); smmuv3_flush_config(sdev); + smmuv3_install_nested_ste(sdev, sid); break; } @@ -1247,33 +1677,40 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) } mask = (1ULL << (range + 1)) - 1; + sid_range.state = bs; sid_range.start = sid & ~mask; sid_range.end = sid_range.start + mask; trace_smmuv3_cmdq_cfgi_ste_range(sid_range.start, sid_range.end); g_hash_table_foreach_remove(bs->configs, smmuv3_invalidate_ste, &sid_range); + smmuv3_invalidate_nested_ste(&sid_range); break; } case SMMU_CMD_CFGI_CD: case SMMU_CMD_CFGI_CD_ALL: { uint32_t sid = CMD_SID(&cmd); - IOMMUMemoryRegion *mr = smmu_iommu_mr(bs, sid); - SMMUDevice *sdev; + SMMUDevice *sdev = smmu_find_sdev(bs, sid); if (CMD_SSEC(&cmd)) { cmd_error = SMMU_CERROR_ILL; break; } - if (!mr) { + if (!sdev) { break; } trace_smmuv3_cmdq_cfgi_cd(sid); - sdev = container_of(mr, SMMUDevice, iommu); smmuv3_flush_config(sdev); + + if (sdev->s1_hwpt) { + if (smmuv3_batch_cmds(sdev->smmu, &batch, &cmd, &q->cons, true)) { + cmd_error = SMMU_CERROR_ILL; + break; + } + } break; } case SMMU_CMD_TLBI_NH_ASID: @@ -1288,6 +1725,10 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) trace_smmuv3_cmdq_tlbi_nh_asid(asid); smmu_inv_notifiers_all(&s->smmu_state); smmu_iotlb_inv_asid(bs, asid); + if (smmuv3_batch_cmds(bs, &batch, &cmd, &q->cons, false)) { + cmd_error = SMMU_CERROR_ILL; + break; + } break; } case SMMU_CMD_TLBI_NH_ALL: @@ -1300,6 +1741,11 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) trace_smmuv3_cmdq_tlbi_nh(); smmu_inv_notifiers_all(&s->smmu_state); smmu_iotlb_inv_all(bs); + + if (smmuv3_batch_cmds(bs, &batch, &cmd, 
&q->cons, false)) { + cmd_error = SMMU_CERROR_ILL; + break; + } break; case SMMU_CMD_TLBI_NH_VAA: case SMMU_CMD_TLBI_NH_VA: @@ -1308,7 +1754,24 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) break; } smmuv3_range_inval(bs, &cmd); + + if (smmuv3_batch_cmds(bs, &batch, &cmd, &q->cons, false)) { + cmd_error = SMMU_CERROR_ILL; + break; + } break; + case SMMU_CMD_ATC_INV: + { + SMMUDevice *sdev = smmu_find_sdev(bs, CMD_SID(&cmd)); + + if (sdev && sdev->s1_hwpt) { + if (smmuv3_batch_cmds(sdev->smmu, &batch, &cmd, &q->cons, true)) { + cmd_error = SMMU_CERROR_ILL; + break; + } + } + break; + } case SMMU_CMD_TLBI_S12_VMALL: { uint16_t vmid = CMD_VMID(&cmd); @@ -1340,12 +1803,23 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) case SMMU_CMD_TLBI_EL2_ASID: case SMMU_CMD_TLBI_EL2_VA: case SMMU_CMD_TLBI_EL2_VAA: - case SMMU_CMD_ATC_INV: case SMMU_CMD_PRI_RESP: - case SMMU_CMD_RESUME: case SMMU_CMD_STALL_TERM: trace_smmuv3_unhandled_cmd(type); break; + case SMMU_CMD_RESUME: + { + uint32_t sid = CMD_SID(&cmd); + uint16_t stag = CMD_RESUME_STAG(&cmd); + uint8_t action = CMD_RESUME_AC(&cmd); + uint32_t code = IOMMUFD_PAGE_RESP_INVALID; + + if (action) { + code = IOMMUFD_PAGE_RESP_SUCCESS; + } + smmuv3_notify_stall_resume(bs, sid, stag, code); + break; + } default: cmd_error = SMMU_CERROR_ILL; break; @@ -1365,12 +1839,22 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) */ queue_cons_incr(q); } + qemu_mutex_lock(&s->mutex); + if (!cmd_error && batch.ncmds && bs->viommu) { + if (smmuv3_issue_cmd_batch(bs, &batch)) { + q->cons = batch.cons[batch.ncmds]; + cmd_error = SMMU_CERROR_ILL; + } + } + qemu_mutex_unlock(&s->mutex); if (cmd_error) { trace_smmuv3_cmdq_consume_error(smmu_cmd_string(type), cmd_error); smmu_write_cmdq_err(s, cmd_error); smmuv3_trigger_irq(s, SMMU_IRQ_GERROR, R_GERROR_CMDQ_ERR_MASK); } + g_free(batch.cmds); + g_free(batch.cons); trace_smmuv3_cmdq_consume_out(Q_PROD(q), Q_CONS(q), Q_PROD_WRAP(q), Q_CONS_WRAP(q)); @@ -1746,6 +2230,11 @@ static void smmu_realize(DeviceState *d, Error **errp) SysBusDevice *dev = SYS_BUS_DEVICE(d); Error *local_err = NULL; + if (s->stage && strcmp("1", s->stage)) { + /* Only support nested mode with a stage-1-only vSMMU */ + sys->nested = false; + } + c->parent_realize(d, &local_err); if (local_err) { error_propagate(errp, local_err); @@ -1764,6 +2253,38 @@ static void smmu_realize(DeviceState *d, Error **errp) smmu_init_irq(s, dev); } +static int smmuv3_accel_pci_host_bridge(Object *obj, void *opaque) +{ + DeviceState *d = opaque; + + if (object_dynamic_cast(obj, TYPE_PCI_HOST_BRIDGE)) { + PCIBus *bus = PCI_HOST_BRIDGE(obj)->bus; + if (d->parent_bus && !strcmp(bus->qbus.name, d->parent_bus->name)) { + object_property_set_link(OBJECT(d), "primary-bus", OBJECT(bus), + &error_abort); + } + } + return 0; +} + +static void smmu_accel_realize(DeviceState *d, Error **errp) +{ + SMMUv3AccelState *s_nested = ARM_SMMUV3_ACCEL(d); + SMMUv3AccelClass *c = ARM_SMMUV3_ACCEL_GET_CLASS(s_nested); + SysBusDevice *dev = SYS_BUS_DEVICE(d); + Error *local_err = NULL; + + object_child_foreach_recursive(object_get_root(), + smmuv3_accel_pci_host_bridge, d); + object_property_set_bool(OBJECT(dev), "nested", true, &error_abort); + + c->parent_realize(d, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } +} + static const VMStateDescription vmstate_smmuv3_queue = { .name = "smmuv3_queue", .version_id = 1, @@ -1862,6 +2383,19 @@ static void smmuv3_class_init(ObjectClass *klass, void *data) device_class_set_props(dc, smmuv3_properties); } +static void
smmuv3_accel_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + SMMUv3AccelClass *c = ARM_SMMUV3_ACCEL_CLASS(klass); + + dc->vmsd = &vmstate_smmuv3; + device_class_set_parent_realize(dc, smmu_accel_realize, + &c->parent_realize); + dc->user_creatable = true; + dc->hotpluggable = false; + dc->bus_type = TYPE_PCIE_BUS; +} + static int smmuv3_notify_flag_changed(IOMMUMemoryRegion *iommu, IOMMUNotifierFlag old, IOMMUNotifierFlag new, @@ -1876,12 +2410,9 @@ static int smmuv3_notify_flag_changed(IOMMUMemoryRegion *iommu, return -EINVAL; } - if (new & IOMMU_NOTIFIER_MAP) { - error_setg(errp, - "device %02x.%02x.%x requires iommu MAP notifier which is " - "not currently supported", pci_bus_num(sdev->bus), - PCI_SLOT(sdev->devfn), PCI_FUNC(sdev->devfn)); - return -EINVAL; + /* nested-smmuv3 does not need IOMMU_NOTIFIER_MAP. Ignore it. */ + if (s->nested) { + new &= ~IOMMU_NOTIFIER_MAP; } if (old == IOMMU_NOTIFIER_NONE) { @@ -1903,6 +2434,14 @@ static void smmuv3_iommu_memory_region_class_init(ObjectClass *klass, imrc->notify_flag_changed = smmuv3_notify_flag_changed; } +static const TypeInfo smmuv3_accel_type_info = { + .name = TYPE_ARM_SMMUV3_ACCEL, + .parent = TYPE_ARM_SMMUV3, + .instance_size = sizeof(SMMUv3AccelState), + .class_size = sizeof(SMMUv3AccelClass), + .class_init = smmuv3_accel_class_init, +}; + static const TypeInfo smmuv3_type_info = { .name = TYPE_ARM_SMMUV3, .parent = TYPE_ARM_SMMU, @@ -1921,6 +2460,7 @@ static const TypeInfo smmuv3_iommu_memory_region_info = { static void smmuv3_register_types(void) { type_register(&smmuv3_type_info); + type_register(&smmuv3_accel_type_info); type_register(&smmuv3_iommu_memory_region_info); } diff --git a/hw/arm/trace-events b/hw/arm/trace-events index cdc1ea06a81c560246fb94cabbeb4e0ab26ed3c4..490da6349cf49193bed9e49d1687eb5fb9de9561 100644 --- a/hw/arm/trace-events +++ b/hw/arm/trace-events @@ -5,6 +5,8 @@ virt_acpi_setup(void) "No fw cfg or ACPI disabled. Bailing out." 
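An aside on the QOM idiom smmuv3_accel_class_init() relies on above: device_class_set_parent_realize() stashes the parent class's realize hook in the subclass so the child can run its own setup and then chain up, exactly as smmu_accel_realize() does. A minimal, self-contained sketch of that pattern follows; the MyAccel* names and cast macros are hypothetical stand-ins, not part of this patch:

typedef struct MyAccelClass {
    SysBusDeviceClass parent_class;
    DeviceRealize parent_realize;   /* parent's realize, saved at class-init time */
} MyAccelClass;

static void my_accel_realize(DeviceState *dev, Error **errp)
{
    MyAccelClass *c = MY_ACCEL_GET_CLASS(dev);   /* hypothetical cast macro */

    /* subclass-specific setup would go here, then chain up: */
    c->parent_realize(dev, errp);
}

static void my_accel_class_init(ObjectClass *klass, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(klass);
    MyAccelClass *c = MY_ACCEL_CLASS(klass);     /* hypothetical cast macro */

    /* Save the parent realize hook and install our override in its place. */
    device_class_set_parent_realize(dc, my_accel_realize, &c->parent_realize);
}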
# smmu-common.c smmu_add_mr(const char *name) "%s" +smmu_set_iommu_device(int devfn, uint32_t sid) "devfn=%d (sid=%d)" +smmu_unset_iommu_device(int devfn, uint32_t sid) "devfn=%d (sid=%d)" smmu_ptw_level(int stage, int level, uint64_t iova, size_t subpage_size, uint64_t baseaddr, uint32_t offset, uint64_t pte) "stage=%d level=%d iova=0x%"PRIx64" subpage_sz=0x%zx baseaddr=0x%"PRIx64" offset=%d => pte=0x%"PRIx64 smmu_ptw_invalid_pte(int stage, int level, uint64_t baseaddr, uint64_t pteaddr, uint32_t offset, uint64_t pte) "stage=%d level=%d base@=0x%"PRIx64" pte@=0x%"PRIx64" offset=%d pte=0x%"PRIx64 smmu_ptw_page_pte(int stage, int level, uint64_t iova, uint64_t baseaddr, uint64_t pteaddr, uint64_t pte, uint64_t address) "stage=%d level=%d iova=0x%"PRIx64" base@=0x%"PRIx64" pte@=0x%"PRIx64" pte=0x%"PRIx64" page address = 0x%"PRIx64 @@ -53,5 +55,7 @@ smmuv3_cmdq_tlbi_s12_vmid(uint16_t vmid) "vmid=%d" smmuv3_config_cache_inv(uint32_t sid) "Config cache INV for sid=0x%x" smmuv3_notify_flag_add(const char *iommu) "ADD SMMUNotifier node for iommu mr=%s" smmuv3_notify_flag_del(const char *iommu) "DEL SMMUNotifier node for iommu mr=%s" +smmuv3_get_device_info(uint32_t idr0, uint32_t idr1, uint32_t idr3, uint32_t idr5) "idr0=0x%x idr1=0x%x idr3=0x%x idr5=0x%x" smmuv3_inv_notifiers_iova(const char *name, uint16_t asid, uint16_t vmid, uint64_t iova, uint8_t tg, uint64_t num_pages) "iommu mr=%s asid=%d vmid=%d iova=0x%"PRIx64" tg=%d num_pages=0x%"PRIx64 +smmuv3_install_nested_ste(uint32_t sid, uint64_t ste_1, uint64_t ste_0) "sid=%d ste=%"PRIx64":%"PRIx64 diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c index 8bc35a483c9233f4b718bd3a66c5ec807d048b1b..5e949671a1f67dabe82d8a94b40bfd6f868ddda9 100644 --- a/hw/arm/virt-acpi-build.c +++ b/hw/arm/virt-acpi-build.c @@ -58,11 +58,255 @@ #include "migration/vmstate.h" #include "hw/acpi/ghes.h" #include "hw/acpi/viot.h" +#include "kvm_arm.h" #define ARM_SPI_BASE 32 #define ACPI_BUILD_TABLE_SIZE 0x20000 +/* + * ACPI spec, Revision 6.3 + * 5.2.29.2 Cache Type Structure (Type 1) + */ +static void build_cache_hierarchy_node(GArray *tbl, uint32_t next_level, + uint32_t cache_type) +{ + build_append_byte(tbl, 1); + build_append_byte(tbl, 24); + build_append_int_noprefix(tbl, 0, 2); + build_append_int_noprefix(tbl, 127, 4); + build_append_int_noprefix(tbl, next_level, 4); + + switch (cache_type) { + case ARM_L1D_CACHE: /* L1 dcache info */ + build_append_int_noprefix(tbl, ARM_L1DCACHE_SIZE, 4); + build_append_int_noprefix(tbl, ARM_L1DCACHE_SETS, 4); + build_append_byte(tbl, ARM_L1DCACHE_ASSOCIATIVITY); + build_append_byte(tbl, ARM_L1DCACHE_ATTRIBUTES); + build_append_int_noprefix(tbl, ARM_L1DCACHE_LINE_SIZE, 2); + break; + case ARM_L1I_CACHE: /* L1 icache info */ + build_append_int_noprefix(tbl, ARM_L1ICACHE_SIZE, 4); + build_append_int_noprefix(tbl, ARM_L1ICACHE_SETS, 4); + build_append_byte(tbl, ARM_L1ICACHE_ASSOCIATIVITY); + build_append_byte(tbl, ARM_L1ICACHE_ATTRIBUTES); + build_append_int_noprefix(tbl, ARM_L1ICACHE_LINE_SIZE, 2); + break; + case ARM_L1_CACHE: /* L1 cache info */ + build_append_int_noprefix(tbl, ARM_L1CACHE_SIZE, 4); + build_append_int_noprefix(tbl, ARM_L1CACHE_SETS, 4); + build_append_byte(tbl, ARM_L1CACHE_ASSOCIATIVITY); + build_append_byte(tbl, ARM_L1CACHE_ATTRIBUTES); + build_append_int_noprefix(tbl, ARM_L1CACHE_LINE_SIZE, 2); + break; + case ARM_L2_CACHE: /* L2 cache info */ + build_append_int_noprefix(tbl, ARM_L2CACHE_SIZE, 4); + build_append_int_noprefix(tbl, ARM_L2CACHE_SETS, 4); + build_append_byte(tbl, 
ARM_L2CACHE_ASSOCIATIVITY); + build_append_byte(tbl, ARM_L2CACHE_ATTRIBUTES); + build_append_int_noprefix(tbl, ARM_L2CACHE_LINE_SIZE, 2); + break; + case ARM_L3_CACHE: /* L3 cache info */ + build_append_int_noprefix(tbl, ARM_L3CACHE_SIZE, 4); + build_append_int_noprefix(tbl, ARM_L3CACHE_SETS, 4); + build_append_byte(tbl, ARM_L3CACHE_ASSOCIATIVITY); + build_append_byte(tbl, ARM_L3CACHE_ATTRIBUTES); + build_append_int_noprefix(tbl, ARM_L3CACHE_LINE_SIZE, 2); + break; + default: + build_append_int_noprefix(tbl, 0, 4); + build_append_int_noprefix(tbl, 0, 4); + build_append_byte(tbl, 0); + build_append_byte(tbl, 0); + build_append_int_noprefix(tbl, 0, 2); + } +} + +/* + * ACPI spec, Revision 6.3 + * 5.2.29 Processor Properties Topology Table (PPTT) + */ +static void build_pptt_arm(GArray *table_data, BIOSLinker *linker, MachineState *ms, + const char *oem_id, const char *oem_table_id) +{ + MachineClass *mc = MACHINE_GET_CLASS(ms); + GQueue *list = g_queue_new(); + guint pptt_start = table_data->len; + guint parent_offset; + guint length, i; + int uid = 0; + int socket; + AcpiTable table = { .sig = "PPTT", .rev = 2, + .oem_id = oem_id, .oem_table_id = oem_table_id }; + bool unified_l1 = cpu_l1_cache_unified(0); + + acpi_table_begin(&table, table_data); + + for (socket = 0; socket < ms->smp.sockets; socket++) { + uint32_t l3_cache_offset = table_data->len - pptt_start; + build_cache_hierarchy_node(table_data, 0, ARM_L3_CACHE); + + g_queue_push_tail(list, + GUINT_TO_POINTER(table_data->len - pptt_start)); + build_processor_hierarchy_node( + table_data, + /* + * Physical package - represents the boundary + * of a physical package + */ + (1 << 0), + 0, socket, &l3_cache_offset, 1); + } + + if (mc->smp_props.clusters_supported) { + length = g_queue_get_length(list); + for (i = 0; i < length; i++) { + int cluster; + + parent_offset = GPOINTER_TO_UINT(g_queue_pop_head(list)); + for (cluster = 0; cluster < ms->smp.clusters; cluster++) { + g_queue_push_tail(list, + GUINT_TO_POINTER(table_data->len - pptt_start)); + build_processor_hierarchy_node( + table_data, + (0 << 0), /* not a physical package */ + parent_offset, cluster, NULL, 0); + } + } + } + + length = g_queue_get_length(list); + for (i = 0; i < length; i++) { + int core; + + parent_offset = GPOINTER_TO_UINT(g_queue_pop_head(list)); + for (core = 0; core < ms->smp.cores; core++) { + uint32_t priv_rsrc[3] = {}; + priv_rsrc[0] = table_data->len - pptt_start; /* L2 cache offset */ + build_cache_hierarchy_node(table_data, 0, ARM_L2_CACHE); + + if (unified_l1) { + priv_rsrc[1] = table_data->len - pptt_start; /* L1 cache offset */ + build_cache_hierarchy_node(table_data, priv_rsrc[0], ARM_L1_CACHE); + } else { + priv_rsrc[1] = table_data->len - pptt_start; /* L1 dcache offset */ + build_cache_hierarchy_node(table_data, priv_rsrc[0], ARM_L1D_CACHE); + priv_rsrc[2] = table_data->len - pptt_start; /* L1 icache offset */ + build_cache_hierarchy_node(table_data, priv_rsrc[0], ARM_L1I_CACHE); + } + + if (ms->smp.threads > 1) { + g_queue_push_tail(list, + GUINT_TO_POINTER(table_data->len - pptt_start)); + build_processor_hierarchy_node( + table_data, + (0 << 0), /* not a physical package */ + parent_offset, core, priv_rsrc, 3); + } else { + build_processor_hierarchy_node( + table_data, + (1 << 1) | /* ACPI Processor ID valid */ + (1 << 3), /* Node is a Leaf */ + parent_offset, uid++, priv_rsrc, 3); + } + } + } + + length = g_queue_get_length(list); + for (i = 0; i < length; i++) { + int thread; + + parent_offset = GPOINTER_TO_UINT(g_queue_pop_head(list)); + 
for (thread = 0; thread < ms->smp.threads; thread++) { + build_processor_hierarchy_node( + table_data, + (1 << 1) | /* ACPI Processor ID valid */ + (1 << 2) | /* Processor is a Thread */ + (1 << 3), /* Node is a Leaf */ + parent_offset, uid++, NULL, 0); + } + } + + g_queue_free(list); + acpi_table_end(linker, &table); +} + +static void acpi_dsdt_add_psd(Aml *dev, int cpus) +{ + Aml *pkg; + Aml *sub; + + sub = aml_package(5); + aml_append(sub, aml_int(5)); + aml_append(sub, aml_int(0)); + /* Assume all vCPUs belong to the same domain */ + aml_append(sub, aml_int(0)); + /* SW_ANY: OSPM coordinated, may be initiated on any processor */ + aml_append(sub, aml_int(0xFD)); + aml_append(sub, aml_int(cpus)); + + pkg = aml_package(1); + aml_append(pkg, sub); + + aml_append(dev, aml_name_decl("_PSD", pkg)); +} + +static void acpi_dsdt_add_cppc(Aml *dev, uint64_t cpu_base, int *regs_offset) +{ + Aml *cpc; + int i; + + /* Use version 3 of CPPC table from ACPI 6.3 */ + cpc = aml_package(23); + aml_append(cpc, aml_int(23)); + aml_append(cpc, aml_int(3)); + + for (i = 0; i < CPPC_REG_COUNT; i++) { + Aml *res; + uint8_t reg_width; + uint8_t acc_type; + uint64_t addr; + + if (regs_offset[i] == -1) { + reg_width = 0; + acc_type = AML_ANY_ACC; + addr = 0; + } else { + addr = cpu_base + regs_offset[i]; + if (i == REFERENCE_CTR || i == DELIVERED_CTR) { + reg_width = 64; + acc_type = AML_QWORD_ACC; + } else { + reg_width = 32; + acc_type = AML_DWORD_ACC; + } + } + + res = aml_resource_template(); + aml_append(res, aml_generic_register(AML_SYSTEM_MEMORY, reg_width, 0, + acc_type, addr)); + aml_append(cpc, res); + } + + aml_append(dev, aml_name_decl("_CPC", cpc)); +} + +static void virt_acpi_dsdt_cpu_cppc(int ncpu, int num_cpu, Aml *dev) +{ + VirtMachineState *vms = VIRT_MACHINE(qdev_get_machine()); + const MemMapEntry *cppc_memmap = &vms->memmap[VIRT_CPUFREQ]; + + /* + * Append _CPC and _PSD to expose CPU frequency information. + * CPPC availability is checked via the DESIRED_PERF register. + */ + if (cppc_regs_offset[DESIRED_PERF] != -1) { + acpi_dsdt_add_cppc(dev, + cppc_memmap->base + ncpu * CPPC_REG_PER_CPU_STRIDE, + cppc_regs_offset); + acpi_dsdt_add_psd(dev, num_cpu); + } +} + static void acpi_dsdt_add_cpus(Aml *scope, VirtMachineState *vms) { MachineState *ms = MACHINE(vms); @@ -72,6 +316,9 @@ static void acpi_dsdt_add_cpus(Aml *scope, VirtMachineState *vms) Aml *dev = aml_device("C%.03X", i); aml_append(dev, aml_name_decl("_HID", aml_string("ACPI0007"))); aml_append(dev, aml_name_decl("_UID", aml_int(i))); + + virt_acpi_dsdt_cpu_cppc(i, ms->smp.cpus, dev); + aml_append(scope, dev); } } @@ -159,6 +406,54 @@ static void acpi_dsdt_add_virtio(Aml *scope, } } +static void acpi_dsdt_add_hisi_sec(Aml *scope, + const MemMapEntry *virtio_mmio_memmap, + int dev_id) +{ + hwaddr size = 0x10000; + + /* + * Calculate the base address for the sec device node. + * Each device group contains one sec device and one hpre device, spaced by 2 * size.
+ */ + hwaddr base = virtio_mmio_memmap->base + dev_id * 2 * size; + + Aml *dev = aml_device("SE%02u", dev_id); + aml_append(dev, aml_name_decl("_HID", aml_string("SEC07"))); + aml_append(dev, aml_name_decl("_UID", aml_int(dev_id))); + aml_append(dev, aml_name_decl("_CCA", aml_int(1))); + + Aml *crs = aml_resource_template(); + + aml_append(crs, aml_memory32_fixed(base, size, AML_READ_WRITE)); + aml_append(dev, aml_name_decl("_CRS", crs)); + aml_append(scope, dev); +} + +static void acpi_dsdt_add_hisi_hpre(Aml *scope, + const MemMapEntry *virtio_mmio_memmap, + int dev_id) +{ + hwaddr size = 0x10000; + + /* + * Calculate the base address for the hpre device node. + * Each hpre device follows the corresponding sec device by an additional offset of size. + */ + hwaddr base = virtio_mmio_memmap->base + dev_id * 2 * size + size; + + Aml *dev = aml_device("HP%02u", dev_id); + aml_append(dev, aml_name_decl("_HID", aml_string("HPRE07"))); + aml_append(dev, aml_name_decl("_UID", aml_int(dev_id))); + aml_append(dev, aml_name_decl("_CCA", aml_int(1))); + + Aml *crs = aml_resource_template(); + + aml_append(crs, aml_memory32_fixed(base, size, AML_READ_WRITE)); + aml_append(dev, aml_name_decl("_CRS", crs)); + aml_append(scope, dev); +} + static void acpi_dsdt_add_pci(Aml *scope, const MemMapEntry *memmap, uint32_t irq, VirtMachineState *vms) { @@ -171,6 +466,14 @@ static void acpi_dsdt_add_pci(Aml *scope, const MemMapEntry *memmap, .bus = vms->bus, }; + /* + * Accel SMMU requires RMRs for MSI 1-1 mapping, which + * require _DSM for PreservingPCI Boot Configurations + */ + if (vms->iommu == VIRT_IOMMU_SMMUV3_ACCEL) { + cfg.preserve_config = true; + } + if (vms->highmem_mmio) { cfg.mmio64 = memmap[VIRT_HIGH_PCIE_MMIO]; } @@ -249,7 +552,7 @@ static void acpi_dsdt_add_tpm(Aml *scope, VirtMachineState *vms) #define IORT_NODE_OFFSET 48 static void build_iort_id_mapping(GArray *table_data, uint32_t input_base, - uint32_t id_count, uint32_t out_ref) + uint32_t id_count, uint32_t out_ref, uint32_t flags) { /* Table 4 ID mapping format */ build_append_int_noprefix(table_data, input_base, 4); /* Input base */ @@ -257,7 +560,7 @@ static void build_iort_id_mapping(GArray *table_data, uint32_t input_base, build_append_int_noprefix(table_data, input_base, 4); /* Output base */ build_append_int_noprefix(table_data, out_ref, 4); /* Output Reference */ /* Flags */ - build_append_int_noprefix(table_data, 0 /* Single mapping (disabled) */, 4); + build_append_int_noprefix(table_data, flags, 4); /* Flags */ } struct AcpiIortIdMapping { @@ -299,6 +602,50 @@ static int iort_idmap_compare(gconstpointer a, gconstpointer b) return idmap_a->input_base - idmap_b->input_base; } +static void +build_iort_rmr_nodes(GArray *table_data, GArray *smmu_idmaps, + size_t *smmu_offset, uint32_t *id) +{ + AcpiIortIdMapping *range; + int i; + + for (i = 0; i < smmu_idmaps->len; i++) { + range = &g_array_index(smmu_idmaps, AcpiIortIdMapping, i); + int bdf = range->input_base; + + /* Table 18 Reserved Memory Range Node */ + + build_append_int_noprefix(table_data, 6 /* RMR */, 1); /* Type */ + /* Length */ + build_append_int_noprefix(table_data, 28 + ID_MAPPING_ENTRY_SIZE + 20, 2); + build_append_int_noprefix(table_data, 3, 1); /* Revision */ + build_append_int_noprefix(table_data, *id, 4); /* Identifier */ + /* Number of ID mappings */ + build_append_int_noprefix(table_data, 1, 4); + /* Reference to ID Array */ + build_append_int_noprefix(table_data, 28, 4); + + /* RMR specific data */ + + /* Flags */ + build_append_int_noprefix(table_data, 0 /* 
Disallow remapping */, 4); + /* Number of Memory Range Descriptors */ + build_append_int_noprefix(table_data, 1 , 4); + /* Reference to Memory Range Descriptors */ + build_append_int_noprefix(table_data, 28 + ID_MAPPING_ENTRY_SIZE, 4); + build_iort_id_mapping(table_data, bdf, range->id_count, smmu_offset[i], 1); + + /* Table 19 Memory Range Descriptor */ + + /* Physical Range offset */ + build_append_int_noprefix(table_data, 0x8000000, 8); + /* Physical Range length */ + build_append_int_noprefix(table_data, 0x100000, 8); + build_append_int_noprefix(table_data, 0, 4); /* Reserved */ + *id += 1; + } +} + /* * Input Output Remapping Table (IORT) * Conforms to "IO Remapping Table System Software on ARM Platforms", @@ -308,19 +655,34 @@ static void build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) { int i, nb_nodes, rc_mapping_count; - const uint32_t iort_node_offset = IORT_NODE_OFFSET; - size_t node_size, smmu_offset = 0; + size_t node_size, *smmu_offset; AcpiIortIdMapping *idmap; + hwaddr base; + int irq, num_smmus = 0; uint32_t id = 0; GArray *smmu_idmaps = g_array_new(false, true, sizeof(AcpiIortIdMapping)); GArray *its_idmaps = g_array_new(false, true, sizeof(AcpiIortIdMapping)); - AcpiTable table = { .sig = "IORT", .rev = 3, .oem_id = vms->oem_id, + AcpiTable table = { .sig = "IORT", .rev = 5, .oem_id = vms->oem_id, .oem_table_id = vms->oem_table_id }; /* Table 2 The IORT */ acpi_table_begin(&table, table_data); - if (vms->iommu == VIRT_IOMMU_SMMUV3) { + if (vms->smmu_accel_count) { + irq = vms->irqmap[VIRT_SMMU_ACCEL] + ARM_SPI_BASE; + base = vms->memmap[VIRT_SMMU_ACCEL].base; + num_smmus = vms->smmu_accel_count; + } else if (virt_has_smmuv3(vms)) { + irq = vms->irqmap[VIRT_SMMU] + ARM_SPI_BASE; + base = vms->memmap[VIRT_SMMU].base; + num_smmus = 1; + } + + smmu_offset = g_new0(size_t, num_smmus); + nb_nodes = 2; /* RC, ITS */ + nb_nodes += num_smmus; /* SMMU nodes */ + + if (virt_has_smmuv3(vms)) { AcpiIortIdMapping next_range = {0}; object_child_foreach_recursive(object_get_root(), @@ -342,18 +704,19 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) } next_range.input_base = idmap->input_base + idmap->id_count; + if (vms->iommu == VIRT_IOMMU_SMMUV3_ACCEL) { + nb_nodes++; /* RMR node per SMMU */ + } } /* Append the last RC -> ITS ID mapping */ - if (next_range.input_base < 0xFFFF) { - next_range.id_count = 0xFFFF - next_range.input_base; + if (next_range.input_base < 0x10000) { + next_range.id_count = 0x10000 - next_range.input_base; g_array_append_val(its_idmaps, next_range); } - nb_nodes = 3; /* RC, ITS, SMMUv3 */ rc_mapping_count = smmu_idmaps->len + its_idmaps->len; } else { - nb_nodes = 2; /* RC, ITS */ rc_mapping_count = 1; } /* Number of IORT Nodes */ @@ -375,10 +738,9 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) /* GIC ITS Identifier Array */ build_append_int_noprefix(table_data, 0 /* MADT translation_id */, 4); - if (vms->iommu == VIRT_IOMMU_SMMUV3) { - int irq = vms->irqmap[VIRT_SMMU] + ARM_SPI_BASE; + for (i = 0; i < num_smmus; i++) { + smmu_offset[i] = table_data->len - table.table_offset; - smmu_offset = table_data->len - table.table_offset; /* Table 9 SMMUv3 Format */ build_append_int_noprefix(table_data, 4 /* SMMUv3 */, 1); /* Type */ node_size = SMMU_V3_ENTRY_SIZE + ID_MAPPING_ENTRY_SIZE; @@ -389,7 +751,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) /* Reference to ID Array */ build_append_int_noprefix(table_data, SMMU_V3_ENTRY_SIZE, 4); /* Base address */ 
- build_append_int_noprefix(table_data, vms->memmap[VIRT_SMMU].base, 8); + build_append_int_noprefix(table_data, base + (i * SMMU_IO_LEN), 8); /* Flags */ build_append_int_noprefix(table_data, 1 /* COHACC Override */, 4); build_append_int_noprefix(table_data, 0, 4); /* Reserved */ @@ -400,12 +762,13 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) build_append_int_noprefix(table_data, irq + 1, 4); /* PRI */ build_append_int_noprefix(table_data, irq + 3, 4); /* GERR */ build_append_int_noprefix(table_data, irq + 2, 4); /* Sync */ + irq += NUM_SMMU_IRQS; build_append_int_noprefix(table_data, 0, 4); /* Proximity domain */ /* DeviceID mapping index (ignored since interrupts are GSIV based) */ build_append_int_noprefix(table_data, 0, 4); /* output IORT node is the ITS group node (the first node) */ - build_iort_id_mapping(table_data, 0, 0xFFFF, IORT_NODE_OFFSET); + build_iort_id_mapping(table_data, 0, 0x10000, IORT_NODE_OFFSET, 0); } /* Table 17 Root Complex Node */ @@ -438,7 +801,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) build_append_int_noprefix(table_data, 0, 3); /* Reserved */ /* Output Reference */ - if (vms->iommu == VIRT_IOMMU_SMMUV3) { + if (virt_has_smmuv3(vms)) { AcpiIortIdMapping *range; /* translated RIDs connect to SMMUv3 node: RC -> SMMUv3 -> ITS */ @@ -446,7 +809,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) range = &g_array_index(smmu_idmaps, AcpiIortIdMapping, i); /* output IORT node is the smmuv3 node */ build_iort_id_mapping(table_data, range->input_base, - range->id_count, smmu_offset); + range->id_count, smmu_offset[i], 0); } /* bypassed RIDs connect to ITS group node directly: RC -> ITS */ @@ -454,11 +817,15 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) range = &g_array_index(its_idmaps, AcpiIortIdMapping, i); /* output IORT node is the ITS group node (the first node) */ build_iort_id_mapping(table_data, range->input_base, - range->id_count, iort_node_offset); + range->id_count, IORT_NODE_OFFSET, 0); } } else { /* output IORT node is the ITS group node (the first node) */ - build_iort_id_mapping(table_data, 0, 0xFFFF, IORT_NODE_OFFSET); + build_iort_id_mapping(table_data, 0, 0x10000, IORT_NODE_OFFSET, 0); + } + + if (vms->iommu == VIRT_IOMMU_SMMUV3_ACCEL) { + build_iort_rmr_nodes(table_data, smmu_idmaps, smmu_offset, &id); } acpi_table_end(linker, &table); @@ -471,48 +838,34 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) * Rev: 1.07 */ static void -build_spcr(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) +spcr_setup(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) { - AcpiTable table = { .sig = "SPCR", .rev = 2, .oem_id = vms->oem_id, - .oem_table_id = vms->oem_table_id }; - - acpi_table_begin(&table, table_data); - - /* Interface Type */ - build_append_int_noprefix(table_data, 3, 1); /* ARM PL011 UART */ - build_append_int_noprefix(table_data, 0, 3); /* Reserved */ - /* Base Address */ - build_append_gas(table_data, AML_AS_SYSTEM_MEMORY, 32, 0, 3, - vms->memmap[VIRT_UART].base); - /* Interrupt Type */ - build_append_int_noprefix(table_data, - (1 << 3) /* Bit[3] ARMH GIC interrupt */, 1); - build_append_int_noprefix(table_data, 0, 1); /* IRQ */ - /* Global System Interrupt */ - build_append_int_noprefix(table_data, - vms->irqmap[VIRT_UART] + ARM_SPI_BASE, 4); - build_append_int_noprefix(table_data, 3 /* 9600 */, 1); /* Baud Rate */ - build_append_int_noprefix(table_data, 0 /* No Parity 
*/, 1); /* Parity */ - /* Stop Bits */ - build_append_int_noprefix(table_data, 1 /* 1 Stop bit */, 1); - /* Flow Control */ - build_append_int_noprefix(table_data, - (1 << 1) /* RTS/CTS hardware flow control */, 1); - /* Terminal Type */ - build_append_int_noprefix(table_data, 0 /* VT100 */, 1); - build_append_int_noprefix(table_data, 0, 1); /* Language */ - /* PCI Device ID */ - build_append_int_noprefix(table_data, 0xffff /* not a PCI device*/, 2); - /* PCI Vendor ID */ - build_append_int_noprefix(table_data, 0xffff /* not a PCI device*/, 2); - build_append_int_noprefix(table_data, 0, 1); /* PCI Bus Number */ - build_append_int_noprefix(table_data, 0, 1); /* PCI Device Number */ - build_append_int_noprefix(table_data, 0, 1); /* PCI Function Number */ - build_append_int_noprefix(table_data, 0, 4); /* PCI Flags */ - build_append_int_noprefix(table_data, 0, 1); /* PCI Segment */ - build_append_int_noprefix(table_data, 0, 4); /* Reserved */ + AcpiSpcrData serial = { + .interface_type = 3, /* ARM PL011 UART */ + .base_addr.id = AML_AS_SYSTEM_MEMORY, + .base_addr.width = 32, + .base_addr.offset = 0, + .base_addr.size = 3, + .base_addr.addr = vms->memmap[VIRT_UART].base, + .interrupt_type = (1 << 3), /* Bit[3] ARMH GIC interrupt */ + .pc_interrupt = 0, /* IRQ */ + .interrupt = (vms->irqmap[VIRT_UART] + ARM_SPI_BASE), + .baud_rate = 3, /* 9600 */ + .parity = 0, /* No Parity */ + .stop_bits = 1, /* 1 Stop bit */ + .flow_control = 1 << 1, /* RTS/CTS hardware flow control */ + .terminal_type = 0, /* VT100 */ + .language = 0, /* Language */ + .pci_device_id = 0xffff, /* not a PCI device */ + .pci_vendor_id = 0xffff, /* not a PCI device */ + .pci_bus = 0, + .pci_device = 0, + .pci_function = 0, + .pci_flags = 0, + .pci_segment = 0, + }; - acpi_table_end(linker, &table); + build_spcr(table_data, linker, &serial, 2, vms->oem_id, vms->oem_table_id); } /* @@ -700,14 +1053,50 @@ static void build_append_gicr(GArray *table_data, uint64_t base, uint32_t size) build_append_int_noprefix(table_data, size, 4); /* Discovery Range Length */ } +static uint32_t virt_acpi_get_gicc_flags(CPUState *cpu, VirtMachineState *vms) +{ + /* Without CPU hotplug support, CPUs can only exist in the 'enabled' state */ + if (!vms->cpu_hotplug_enabled) { + return 1; + } + + /* + * The ARM GIC CPU Interface can be 'online-capable' or 'enabled' at boot. + * We MUST set the 'online-capable' bit for all hotpluggable CPUs. + * Change Link: https://bugzilla.tianocore.org/show_bug.cgi?id=3706 + * + * UEFI ACPI Specification 6.5 + * Section: 5.2.12.14. GIC CPU Interface (GICC) Structure + * Table: 5.37 GICC CPU Interface Flags + * Link: https://uefi.org/specs/ACPI/6.5 + * + * Cold-booted CPUs, except for the first/boot CPU, SHOULD be allowed to be + * hot(un)plugged as well, but for that to happen they MUST have the + * 'online-capable' bit set. That, however, creates a compatibility problem + * with legacy OSes, which might ignore the 'online-capable' bit during boot + * and hence fail to detect some CPUs. To fix this, the MADT GIC CPU + * interface flags should be allowed to have both the 'online-capable' and + * 'Enabled' bits set together, which requires a change to the UEFI ACPI + * standard. Until that happens, expose all cold-booted CPUs as 'enabled' + * only. + */ + return cpu && cpu->cold_booted ?
1 : (1 << 3); +} + static void build_madt(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) { int i; VirtMachineClass *vmc = VIRT_MACHINE_GET_CLASS(vms); + MachineState *ms = MACHINE(vms); const MemMapEntry *memmap = vms->memmap; AcpiTable table = { .sig = "APIC", .rev = 4, .oem_id = vms->oem_id, .oem_table_id = vms->oem_table_id }; + unsigned int max_cpus = ms->smp.max_cpus; + + if (!vms->cpu_hotplug_enabled) { + max_cpus = ms->smp.cpus; + } acpi_table_begin(&table, table_data); /* Local Interrupt Controller Address */ @@ -726,12 +1115,13 @@ build_madt(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) build_append_int_noprefix(table_data, vms->gic_version, 1); build_append_int_noprefix(table_data, 0, 3); /* Reserved */ - for (i = 0; i < MACHINE(vms)->smp.cpus; i++) { - ARMCPU *armcpu = ARM_CPU(qemu_get_cpu(i)); + for (i = 0; i < max_cpus; i++) { + CPUState *cpu = qemu_get_possible_cpu(i); uint64_t physical_base_address = 0, gich = 0, gicv = 0; uint32_t vgic_interrupt = vms->virt ? ARCH_GIC_MAINT_IRQ : 0; - uint32_t pmu_interrupt = arm_feature(&armcpu->env, ARM_FEATURE_PMU) ? - VIRTUAL_PMU_IRQ : 0; + uint32_t pmu_interrupt = vms->pmu ? VIRTUAL_PMU_IRQ : 0; + uint32_t flags = virt_acpi_get_gicc_flags(cpu, vms); + uint64_t mpidr = qemu_get_cpu_archid(i); if (vms->gic_version == VIRT_GIC_VERSION_2) { physical_base_address = memmap[VIRT_GIC_CPU].base; @@ -746,7 +1136,7 @@ build_madt(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) build_append_int_noprefix(table_data, i, 4); /* GIC ID */ build_append_int_noprefix(table_data, i, 4); /* ACPI Processor UID */ /* Flags */ - build_append_int_noprefix(table_data, 1, 4); /* Enabled */ + build_append_int_noprefix(table_data, flags, 4); /* Parking Protocol Version */ build_append_int_noprefix(table_data, 0, 4); /* Performance Interrupt GSIV */ @@ -760,7 +1150,7 @@ build_madt(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) build_append_int_noprefix(table_data, vgic_interrupt, 4); build_append_int_noprefix(table_data, 0, 8); /* GICR Base Address */ /* MPIDR */ - build_append_int_noprefix(table_data, armcpu->mp_affinity, 8); + build_append_int_noprefix(table_data, mpidr, 8); /* Processor Power Efficiency Class */ build_append_int_noprefix(table_data, 0, 1); /* Reserved */ @@ -837,6 +1227,55 @@ static void build_fadt_rev6(GArray *table_data, BIOSLinker *linker, build_fadt(table_data, linker, &fadt, vms->oem_id, vms->oem_table_id); } +static void build_virt_osc_method(Aml *scope, VirtMachineState *vms) +{ + Aml *if_uuid, *else_uuid, *if_rev, *if_caps_masked, *method; + Aml *a_cdw1 = aml_name("CDW1"); + Aml *a_cdw2 = aml_local(0); + + method = aml_method("_OSC", 4, AML_NOTSERIALIZED); + aml_append(method, aml_create_dword_field(aml_arg(3), aml_int(0), "CDW1")); + + /* match UUID */ + if_uuid = aml_if(aml_equal( + aml_arg(0), aml_touuid("0811B06E-4A27-44F9-8D60-3CBBC22E7B48"))); + + aml_append(if_uuid, aml_create_dword_field(aml_arg(3), aml_int(4), "CDW2")); + aml_append(if_uuid, aml_store(aml_name("CDW2"), a_cdw2)); + + /* check unknown revision in arg(1) */ + if_rev = aml_if(aml_lnot(aml_equal(aml_arg(1), aml_int(1)))); + /* set revision error bits, DWORD1 Bit[3] */ + aml_append(if_rev, aml_or(a_cdw1, aml_int(0x08), a_cdw1)); + aml_append(if_uuid, if_rev); + + /* + * check support for the vCPU hotplug type(=enabled) platform-wide capability + * in DWORD2 as specified in the ACPI Specification ECR below: + * # https://bugzilla.tianocore.org/show_bug.cgi?id=4481 + */ + if (vms->acpi_dev) { +
aml_append(if_uuid, aml_and(a_cdw2, aml_int(0x800000), a_cdw2)); + /* check if OSPM specified hotplug capability bits were masked */ + if_caps_masked = aml_if(aml_lnot(aml_equal(aml_name("CDW2"), a_cdw2))); + aml_append(if_caps_masked, aml_or(a_cdw1, aml_int(0x10), a_cdw1)); + aml_append(if_uuid, if_caps_masked); + } + aml_append(if_uuid, aml_store(a_cdw2, aml_name("CDW2"))); + + aml_append(method, if_uuid); + else_uuid = aml_else(); + + /* set unrecognized UUID error bits, DWORD1 Bit[2] */ + aml_append(else_uuid, aml_or(a_cdw1, aml_int(4), a_cdw1)); + aml_append(method, else_uuid); + + aml_append(method, aml_return(aml_arg(3))); + aml_append(scope, method); + + return; +} + /* DSDT */ static void build_dsdt(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) @@ -858,7 +1297,22 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) * the RTC ACPI device at all when using UEFI. */ scope = aml_scope("\\_SB"); - acpi_dsdt_add_cpus(scope, vms); + + if (vms->cpu_hotplug_enabled) { + CPUHotplugFeatures opts = { + .acpi_1_compatible = false, + .has_legacy_cphp = false + }; + + build_cpus_aml(scope, ms, opts, NULL, virt_acpi_dsdt_cpu_cppc, + memmap[VIRT_CPUHP_ACPI].base, + "\\_SB", NULL, AML_SYSTEM_MEMORY); + } else { + acpi_dsdt_add_cpus(scope, vms); + } + + build_virt_osc_method(scope, vms); + acpi_dsdt_add_uart(scope, &memmap[VIRT_UART], (irqmap[VIRT_UART] + ARM_SPI_BASE)); if (vmc->acpi_expose_flash) { @@ -868,6 +1322,15 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) acpi_dsdt_add_virtio(scope, &memmap[VIRT_MMIO], (irqmap[VIRT_MMIO] + ARM_SPI_BASE), NUM_VIRTIO_TRANSPORTS); acpi_dsdt_add_pci(scope, memmap, irqmap[VIRT_PCIE] + ARM_SPI_BASE, vms); + + if (virtcca_cvm_enabled()) { + int kae_num = tmm_get_kae_num(); + for (int i = 0; i < kae_num; i++) { + acpi_dsdt_add_hisi_sec(scope, &memmap[VIRT_KAE_DEVICE], i); + acpi_dsdt_add_hisi_hpre(scope, &memmap[VIRT_KAE_DEVICE], i); + } + } + if (vms->acpi_dev) { build_ged_aml(scope, "\\_SB."GED_DEVICE, HOTPLUG_HANDLER(vms->acpi_dev), @@ -951,7 +1414,7 @@ void virt_acpi_build(VirtMachineState *vms, AcpiBuildTables *tables) if (!vmc->no_cpu_topology) { acpi_add_table(table_offsets, tables_blob); - build_pptt(tables_blob, tables->linker, ms, + build_pptt_arm(tables_blob, tables->linker, ms, vms->oem_id, vms->oem_table_id); } @@ -969,7 +1432,7 @@ void virt_acpi_build(VirtMachineState *vms, AcpiBuildTables *tables) } acpi_add_table(table_offsets, tables_blob); - build_spcr(tables_blob, tables->linker, vms); + spcr_setup(tables_blob, tables->linker, vms); acpi_add_table(table_offsets, tables_blob); build_dbg2(tables_blob, tables->linker, vms); diff --git a/hw/arm/virt.c b/hw/arm/virt.c index be2856c018aa14c6ffda283cd7f99fb0d8e7a803..2957b844d28e5307cbb6be8cde001d08c3ffbeae 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -32,6 +32,7 @@ #include "qemu/datadir.h" #include "qemu/units.h" #include "qemu/option.h" +#include "qemu/log.h" #include "monitor/qdev.h" #include "hw/sysbus.h" #include "hw/arm/boot.h" @@ -45,6 +46,8 @@ #include "sysemu/device_tree.h" #include "sysemu/numa.h" #include "sysemu/runstate.h" +#include "sysemu/reset.h" +#include "sysemu/sysemu.h" #include "sysemu/tpm.h" #include "sysemu/tcg.h" #include "sysemu/kvm.h" @@ -77,10 +80,12 @@ #include "hw/mem/pc-dimm.h" #include "hw/mem/nvdimm.h" #include "hw/acpi/generic_event_device.h" +#include "hw/acpi/cpu_hotplug.h" #include "hw/virtio/virtio-md-pci.h" #include "hw/virtio/virtio-iommu.h" #include "hw/char/pl011.h" #include 
"qemu/guest-random.h" +#include "qapi/qmp/qdict.h" #define DEFINE_VIRT_MACHINE_LATEST(major, minor, latest) \ static void virt_##major##_##minor##_class_init(ObjectClass *oc, \ @@ -150,13 +155,18 @@ static const MemMapEntry base_memmap[] = { [VIRT_FW_CFG] = { 0x09020000, 0x00000018 }, [VIRT_GPIO] = { 0x09030000, 0x00001000 }, [VIRT_SECURE_UART] = { 0x09040000, 0x00001000 }, - [VIRT_SMMU] = { 0x09050000, 0x00020000 }, + [VIRT_SMMU] = { 0x09050000, SMMU_IO_LEN }, [VIRT_PCDIMM_ACPI] = { 0x09070000, MEMORY_HOTPLUG_IO_LEN }, [VIRT_ACPI_GED] = { 0x09080000, ACPI_GED_EVT_SEL_LEN }, [VIRT_NVDIMM_ACPI] = { 0x09090000, NVDIMM_ACPI_IO_LEN}, [VIRT_PVTIME] = { 0x090a0000, 0x00010000 }, [VIRT_SECURE_GPIO] = { 0x090b0000, 0x00001000 }, + [VIRT_CPUHP_ACPI] = { 0x090c0000, ACPI_CPU_HOTPLUG_REG_LEN}, [VIRT_MMIO] = { 0x0a000000, 0x00000200 }, + /* In the virtCCA scenario, this space is used for MSI interrupt mapping */ + [VIRT_CVM_MSI] = { 0x0a001000, 0x00fff000 }, + [VIRT_CPUFREQ] = { 0x0b000000, 0x00010000 }, + [VIRT_SMMU_ACCEL] = { 0x0b010000, 0x00ff0000}, /* ...repeating for a total of NUM_VIRTIO_TRANSPORTS, each of that size */ [VIRT_PLATFORM_BUS] = { 0x0c000000, 0x02000000 }, [VIRT_SECURE_MEM] = { 0x0e000000, 0x01000000 }, @@ -167,6 +177,12 @@ static const MemMapEntry base_memmap[] = { [VIRT_MEM] = { GiB, LEGACY_RAMLIMIT_BYTES }, }; +void virtcca_kvm_get_mmio_addr(hwaddr *mmio_start, hwaddr *mmio_size) +{ + *mmio_start = base_memmap[VIRT_PCIE_MMIO].base; + *mmio_size = base_memmap[VIRT_PCIE_MMIO].size; +} + /* * Highmem IO Regions: This memory map is floating, located after the RAM. * Each MemMapEntry base (GPA) will be dynamically computed, depending on the @@ -202,6 +218,7 @@ static const int a15irqmap[] = { [VIRT_GIC_V2M] = 48, /* ...to 48 + NUM_GICV2M_SPIS - 1 */ [VIRT_SMMU] = 74, /* ...to 74 + NUM_SMMU_IRQS - 1 */ [VIRT_PLATFORM_BUS] = 112, /* ...to 112 + PLATFORM_BUS_NUM_IRQS -1 */ + [VIRT_SMMU_ACCEL] = 200, }; static const char *valid_cpus[] = { @@ -220,10 +237,19 @@ static const char *valid_cpus[] = { #endif ARM_CPU_TYPE_NAME("cortex-a53"), ARM_CPU_TYPE_NAME("cortex-a57"), + ARM_CPU_TYPE_NAME("Kunpeng-920"), + ARM_CPU_TYPE_NAME("FT-2000+"), + ARM_CPU_TYPE_NAME("Tengyun-S2500"), ARM_CPU_TYPE_NAME("host"), ARM_CPU_TYPE_NAME("max"), }; +static CPUArchId *virt_find_cpu_slot(MachineState *ms, int vcpuid); +static int virt_get_socket_id(const MachineState *ms, int cpu_index); +static int virt_get_cluster_id(const MachineState *ms, int cpu_index); +static int virt_get_core_id(const MachineState *ms, int cpu_index); +static int virt_get_thread_id(const MachineState *ms, int cpu_index); + static bool cpu_type_valid(const char *cpu) { int i; @@ -271,8 +297,16 @@ static void create_fdt(VirtMachineState *vms) /* /chosen must exist for load_dtb to fill in necessary properties later */ qemu_fdt_add_subnode(fdt, "/chosen"); + + g_autofree char *kvm_type = NULL; + if (object_property_find(OBJECT(current_machine), "kvm-type")) { + kvm_type = object_property_get_str(OBJECT(current_machine), + "kvm-type", &error_abort); + } if (vms->dtb_randomness) { - create_randomness(ms, "/chosen"); + if (!(kvm_type && !strcmp(kvm_type, "cvm"))) { + create_randomness(ms, "/chosen"); + } } if (vms->secure) { @@ -378,6 +412,122 @@ static void fdt_add_timer_nodes(const VirtMachineState *vms) INTID_TO_PPI(ARCH_TIMER_NS_EL2_IRQ), irqflags); } +/* + * In CLIDR_EL1 exposed to guest by the hypervisor, L1 cache type + * maybe unified or seperate ins and data. We need to read the + * guest visable CLIDR_EL1 and check L1 cache type. 
+ */ +bool cpu_l1_cache_unified(int cpu) +{ + bool unified = false; + uint64_t clidr; + ARMCPU *armcpu = ARM_CPU(qemu_get_cpu(cpu)); + CPUState *cs = CPU(armcpu); + int ret; + + if (kvm_enabled()) { + struct kvm_one_reg reg = { + .id = ARM64_REG_CLIDR_EL1, + .addr = (uintptr_t)&clidr + }; + + ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, ®); + if (ret) { + error_setg(&error_fatal, "Get vCPU clidr from KVM failed:%d", ret); + return unified; + } + + if (CLIDR_CTYPE(clidr, 1) == CTYPE_UNIFIED) { + unified = true; + } + } + + return unified; +} + +static void fdt_add_l3cache_nodes(const VirtMachineState *vms) +{ + int i; + const MachineState *ms = MACHINE(vms); + int cpus_per_socket = ms->smp.clusters * ms->smp.cores * ms->smp.threads; + int sockets = (ms->smp.cpus + cpus_per_socket - 1) / cpus_per_socket; + + for (i = 0; i < sockets; i++) { + char *nodename = g_strdup_printf("/cpus/l3-cache%d", i); + + qemu_fdt_add_subnode(ms->fdt, nodename); + qemu_fdt_setprop_string(ms->fdt, nodename, "compatible", "cache"); + qemu_fdt_setprop_string(ms->fdt, nodename, "cache-unified", "true"); + qemu_fdt_setprop_cell(ms->fdt, nodename, "cache-level", 3); + qemu_fdt_setprop_cell(ms->fdt, nodename, "cache-size", ARM_L3CACHE_SIZE); + qemu_fdt_setprop_cell(ms->fdt, nodename, "cache-line-size", + ARM_L3CACHE_LINE_SIZE); + qemu_fdt_setprop_cell(ms->fdt, nodename, "cache-sets", ARM_L3CACHE_SETS); + qemu_fdt_setprop_cell(ms->fdt, nodename, "phandle", + qemu_fdt_alloc_phandle(ms->fdt)); + g_free(nodename); + } +} + +static void fdt_add_l2cache_nodes(const VirtMachineState *vms) +{ + const MachineState *ms = MACHINE(vms); + int cpus_per_socket = ms->smp.clusters * ms->smp.cores * ms->smp.threads; + int cpu; + + for (cpu = 0; cpu < ms->smp.cpus; cpu++) { + char *next_path = g_strdup_printf("/cpus/l3-cache%d", + cpu / cpus_per_socket); + char *nodename = g_strdup_printf("/cpus/l2-cache%d", cpu); + + qemu_fdt_add_subnode(ms->fdt, nodename); + qemu_fdt_setprop_string(ms->fdt, nodename, "cache-unified", "true"); + qemu_fdt_setprop_string(ms->fdt, nodename, "compatible", "cache"); + qemu_fdt_setprop_cell(ms->fdt, nodename, "cache-size", ARM_L2CACHE_SIZE); + qemu_fdt_setprop_cell(ms->fdt, nodename, "cache-line-size", + ARM_L2CACHE_LINE_SIZE); + qemu_fdt_setprop_cell(ms->fdt, nodename, "cache-sets", ARM_L2CACHE_SETS); + qemu_fdt_setprop_phandle(ms->fdt, nodename, "next-level-cache", + next_path); + qemu_fdt_setprop_cell(ms->fdt, nodename, "phandle", + qemu_fdt_alloc_phandle(ms->fdt)); + + g_free(next_path); + g_free(nodename); + } +} + +static void fdt_add_l1cache_prop(const VirtMachineState *vms, + char *nodename, int cpu) +{ + const MachineState *ms = MACHINE(vms); + char *next_path = g_strdup_printf("/cpus/l2-cache%d", cpu); + bool unified_l1 = cpu_l1_cache_unified(0); + + if (unified_l1) { + qemu_fdt_setprop_cell(ms->fdt, nodename, "cache-size", ARM_L1CACHE_SIZE); + qemu_fdt_setprop_cell(ms->fdt, nodename, "cache-line-size", + ARM_L1CACHE_LINE_SIZE); + qemu_fdt_setprop_cell(ms->fdt, nodename, "cache-sets", ARM_L1CACHE_SETS); + } else { + qemu_fdt_setprop_cell(ms->fdt, nodename, "d-cache-size", + ARM_L1DCACHE_SIZE); + qemu_fdt_setprop_cell(ms->fdt, nodename, "d-cache-line-size", + ARM_L1DCACHE_LINE_SIZE); + qemu_fdt_setprop_cell(ms->fdt, nodename, "d-cache-sets", + ARM_L1DCACHE_SETS); + qemu_fdt_setprop_cell(ms->fdt, nodename, "i-cache-size", + ARM_L1ICACHE_SIZE); + qemu_fdt_setprop_cell(ms->fdt, nodename, "i-cache-line-size", + ARM_L1ICACHE_LINE_SIZE); + qemu_fdt_setprop_cell(ms->fdt, nodename, "i-cache-sets", + 
ARM_L1ICACHE_SETS); + } + qemu_fdt_setprop_phandle(ms->fdt, nodename, "next-level-cache", next_path); + + g_free(next_path); +} + static void fdt_add_cpu_nodes(const VirtMachineState *vms) { int cpu; @@ -412,6 +562,11 @@ static void fdt_add_cpu_nodes(const VirtMachineState *vms) qemu_fdt_setprop_cell(ms->fdt, "/cpus", "#address-cells", addr_cells); qemu_fdt_setprop_cell(ms->fdt, "/cpus", "#size-cells", 0x0); + if (!vmc->no_cpu_topology) { + fdt_add_l3cache_nodes(vms); + fdt_add_l2cache_nodes(vms); + } + for (cpu = smp_cpus - 1; cpu >= 0; cpu--) { char *nodename = g_strdup_printf("/cpus/cpu@%d", cpu); ARMCPU *armcpu = ARM_CPU(qemu_get_cpu(cpu)); @@ -441,6 +596,7 @@ static void fdt_add_cpu_nodes(const VirtMachineState *vms) } if (!vmc->no_cpu_topology) { + fdt_add_l1cache_prop(vms, nodename, cpu); qemu_fdt_setprop_cell(ms->fdt, nodename, "phandle", qemu_fdt_alloc_phandle(ms->fdt)); } @@ -644,7 +800,7 @@ static inline DeviceState *create_acpi_ged(VirtMachineState *vms) DeviceState *dev; MachineState *ms = MACHINE(vms); int irq = vms->irqmap[VIRT_ACPI_GED]; - uint32_t event = ACPI_GED_PWR_DOWN_EVT; + uint32_t event = ACPI_GED_PWR_DOWN_EVT | ACPI_GED_CPU_HOTPLUG_EVT; if (ms->ram_slots) { event |= ACPI_GED_MEM_HOTPLUG_EVT; @@ -660,11 +816,22 @@ static inline DeviceState *create_acpi_ged(VirtMachineState *vms) sysbus_mmio_map(SYS_BUS_DEVICE(dev), 0, vms->memmap[VIRT_ACPI_GED].base); sysbus_mmio_map(SYS_BUS_DEVICE(dev), 1, vms->memmap[VIRT_PCDIMM_ACPI].base); + sysbus_mmio_map(SYS_BUS_DEVICE(dev), 3, vms->memmap[VIRT_CPUHP_ACPI].base); sysbus_connect_irq(SYS_BUS_DEVICE(dev), 0, qdev_get_gpio_in(vms->gic, irq)); return dev; } +static void virt_add_gic_cpuhp_notifier(VirtMachineState *vms) +{ + MachineClass *mc = MACHINE_GET_CLASS(vms); + + if (mc->has_hotpluggable_cpus && vms->gic_version >= VIRT_GIC_VERSION_3) { + Notifier *cpuhp_notifier = gicv3_cpuhp_notifier(vms->gic); + notifier_list_add(&vms->cpuhp_notifiers, cpuhp_notifier); + } +} + static void create_its(VirtMachineState *vms) { const char *itsclass = its_class_name(); @@ -713,6 +880,107 @@ static void create_v2m(VirtMachineState *vms) vms->msi_controller = VIRT_MSI_CTRL_GICV2M; } +/* + * Mapping from the output timer irq lines from the CPU to the GIC PPI inputs + * we use for the virt board. + */ +const int timer_irq[] = { + [GTIMER_PHYS] = ARCH_TIMER_NS_EL1_IRQ, + [GTIMER_VIRT] = ARCH_TIMER_VIRT_IRQ, + [GTIMER_HYP] = ARCH_TIMER_NS_EL2_IRQ, + [GTIMER_SEC] = ARCH_TIMER_S_EL1_IRQ, +}; + +static void unwire_gic_cpu_irqs(VirtMachineState *vms, CPUState *cs) +{ + MachineState *ms = MACHINE(vms); + unsigned int max_cpus = ms->smp.max_cpus; + DeviceState *cpudev = DEVICE(cs); + DeviceState *gicdev = vms->gic; + int cpu = CPU(cs)->cpu_index; + int type = vms->gic_version; + int irq; + + if (!vms->cpu_hotplug_enabled) { + max_cpus = ms->smp.cpus; + } + + for (irq = 0; irq < ARRAY_SIZE(timer_irq); irq++) { + qdev_disconnect_gpio_out_named(cpudev, NULL, irq); + } + + if (type != VIRT_GIC_VERSION_2) { + qdev_disconnect_gpio_out_named(cpudev, "gicv3-maintenance-interrupt", + 0); + } else if (vms->virt) { + qdev_disconnect_gpio_out_named(gicdev, SYSBUS_DEVICE_GPIO_IRQ, + cpu + 4 * max_cpus); + } + + /* + * RFC: Question: This currently does not take care of notifying the + * devices which might be sitting on the system bus. Do we need a + * sysbus_disconnect_irq() which also does the job of notification besides + * disconnection?
+ */ + qdev_disconnect_gpio_out_named(cpudev, "pmu-interrupt", 0); + qdev_disconnect_gpio_out_named(gicdev, SYSBUS_DEVICE_GPIO_IRQ, cpu); + qdev_disconnect_gpio_out_named(gicdev, + SYSBUS_DEVICE_GPIO_IRQ, cpu + max_cpus); + qdev_disconnect_gpio_out_named(gicdev, SYSBUS_DEVICE_GPIO_IRQ, + cpu + 2 * max_cpus); + qdev_disconnect_gpio_out_named(gicdev, SYSBUS_DEVICE_GPIO_IRQ, + cpu + 3 * max_cpus); +} + +static void wire_gic_cpu_irqs(VirtMachineState *vms, CPUState *cs) +{ + MachineState *ms = MACHINE(vms); + unsigned int max_cpus = ms->smp.max_cpus; + DeviceState *cpudev = DEVICE(cs); + DeviceState *gicdev = vms->gic; + int cpu = CPU(cs)->cpu_index; + int type = vms->gic_version; + SysBusDevice *gicbusdev; + int intidbase; + int irq; + + if (!vms->cpu_hotplug_enabled) { + max_cpus = ms->smp.cpus; + } + + intidbase = NUM_IRQS + cpu * GIC_INTERNAL; + + for (irq = 0; irq < ARRAY_SIZE(timer_irq); irq++) { + qdev_connect_gpio_out(cpudev, irq, + qdev_get_gpio_in(gicdev, + intidbase + timer_irq[irq])); + } + + gicbusdev = SYS_BUS_DEVICE(gicdev); + if (type != VIRT_GIC_VERSION_2) { + qemu_irq qirq = qdev_get_gpio_in(gicdev, + intidbase + ARCH_GIC_MAINT_IRQ); + qdev_connect_gpio_out_named(cpudev, "gicv3-maintenance-interrupt", + 0, qirq); + } else if (vms->virt) { + qemu_irq qirq = qdev_get_gpio_in(gicdev, + intidbase + ARCH_GIC_MAINT_IRQ); + sysbus_connect_irq(gicbusdev, cpu + 4 * max_cpus, qirq); + } + + qdev_connect_gpio_out_named(cpudev, "pmu-interrupt", 0, + qdev_get_gpio_in(gicdev, + intidbase + VIRTUAL_PMU_IRQ)); + sysbus_connect_irq(gicbusdev, cpu, qdev_get_gpio_in(cpudev, ARM_CPU_IRQ)); + sysbus_connect_irq(gicbusdev, cpu + max_cpus, + qdev_get_gpio_in(cpudev, ARM_CPU_FIQ)); + sysbus_connect_irq(gicbusdev, cpu + 2 * max_cpus, + qdev_get_gpio_in(cpudev, ARM_CPU_VIRQ)); + sysbus_connect_irq(gicbusdev, cpu + 3 * max_cpus, + qdev_get_gpio_in(cpudev, ARM_CPU_VFIQ)); +} + static void create_gic(VirtMachineState *vms, MemoryRegion *mem) { MachineState *ms = MACHINE(vms); @@ -721,9 +989,14 @@ static void create_gic(VirtMachineState *vms, MemoryRegion *mem) const char *gictype; int i; unsigned int smp_cpus = ms->smp.cpus; + unsigned int max_cpus = ms->smp.max_cpus; uint32_t nb_redist_regions = 0; int revision; + if (!vms->cpu_hotplug_enabled) { + max_cpus = ms->smp.cpus; + } + if (vms->gic_version == VIRT_GIC_VERSION_2) { gictype = gic_class_name(); } else { @@ -745,7 +1018,7 @@ static void create_gic(VirtMachineState *vms, MemoryRegion *mem) } vms->gic = qdev_new(gictype); qdev_prop_set_uint32(vms->gic, "revision", revision); - qdev_prop_set_uint32(vms->gic, "num-cpu", smp_cpus); + qdev_prop_set_uint32(vms->gic, "num-cpu", max_cpus); /* Note that the num-irq property counts both internal and external * interrupts; there are always 32 of the former (mandated by GIC spec). 
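 * For a worked example (this board's usual NUM_IRQS of 256 is assumed):
 * the GIC device is then created with num-irq = 256 + 32 = 288, and
 * wire_gic_cpu_irqs() above maps each per-CPU PPI at
 * intidbase + timer_irq[irq], i.e. inside CPU n's 32-interrupt internal
 * window starting at NUM_IRQS + n * GIC_INTERNAL.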
*/ @@ -757,7 +1030,7 @@ static void create_gic(VirtMachineState *vms, MemoryRegion *mem) if (vms->gic_version != VIRT_GIC_VERSION_2) { QList *redist_region_count; uint32_t redist0_capacity = virt_redist_capacity(vms, VIRT_GIC_REDIST); - uint32_t redist0_count = MIN(smp_cpus, redist0_capacity); + uint32_t redist0_count = MIN(max_cpus, redist0_capacity); nb_redist_regions = virt_gicv3_redist_region_count(vms); @@ -768,7 +1041,7 @@ static void create_gic(VirtMachineState *vms, MemoryRegion *mem) virt_redist_capacity(vms, VIRT_HIGH_GIC_REDIST2); qlist_append_int(redist_region_count, - MIN(smp_cpus - redist0_count, redist1_capacity)); + MIN(max_cpus - redist0_count, redist1_capacity)); } qdev_prop_set_array(vms->gic, "redist-region-count", redist_region_count); @@ -808,46 +1081,7 @@ static void create_gic(VirtMachineState *vms, MemoryRegion *mem) * and the GIC's IRQ/FIQ/VIRQ/VFIQ interrupt outputs to the CPU's inputs. */ for (i = 0; i < smp_cpus; i++) { - DeviceState *cpudev = DEVICE(qemu_get_cpu(i)); - int intidbase = NUM_IRQS + i * GIC_INTERNAL; - /* Mapping from the output timer irq lines from the CPU to the - * GIC PPI inputs we use for the virt board. - */ - const int timer_irq[] = { - [GTIMER_PHYS] = ARCH_TIMER_NS_EL1_IRQ, - [GTIMER_VIRT] = ARCH_TIMER_VIRT_IRQ, - [GTIMER_HYP] = ARCH_TIMER_NS_EL2_IRQ, - [GTIMER_SEC] = ARCH_TIMER_S_EL1_IRQ, - }; - - for (unsigned irq = 0; irq < ARRAY_SIZE(timer_irq); irq++) { - qdev_connect_gpio_out(cpudev, irq, - qdev_get_gpio_in(vms->gic, - intidbase + timer_irq[irq])); - } - - if (vms->gic_version != VIRT_GIC_VERSION_2) { - qemu_irq irq = qdev_get_gpio_in(vms->gic, - intidbase + ARCH_GIC_MAINT_IRQ); - qdev_connect_gpio_out_named(cpudev, "gicv3-maintenance-interrupt", - 0, irq); - } else if (vms->virt) { - qemu_irq irq = qdev_get_gpio_in(vms->gic, - intidbase + ARCH_GIC_MAINT_IRQ); - sysbus_connect_irq(gicbusdev, i + 4 * smp_cpus, irq); - } - - qdev_connect_gpio_out_named(cpudev, "pmu-interrupt", 0, - qdev_get_gpio_in(vms->gic, intidbase - + VIRTUAL_PMU_IRQ)); - - sysbus_connect_irq(gicbusdev, i, qdev_get_gpio_in(cpudev, ARM_CPU_IRQ)); - sysbus_connect_irq(gicbusdev, i + smp_cpus, - qdev_get_gpio_in(cpudev, ARM_CPU_FIQ)); - sysbus_connect_irq(gicbusdev, i + 2 * smp_cpus, - qdev_get_gpio_in(cpudev, ARM_CPU_VIRQ)); - sysbus_connect_irq(gicbusdev, i + 3 * smp_cpus, - qdev_get_gpio_in(cpudev, ARM_CPU_VFIQ)); + wire_gic_cpu_irqs(vms, qemu_get_cpu(i)); } fdt_add_gic_node(vms); @@ -857,6 +1091,9 @@ static void create_gic(VirtMachineState *vms, MemoryRegion *mem) } else if (vms->gic_version == VIRT_GIC_VERSION_2) { create_v2m(vms); } + + /* add GIC CPU hot(un)plug update notifier */ + virt_add_gic_cpuhp_notifier(vms); } static void create_uart(const VirtMachineState *vms, int uart, @@ -907,6 +1144,16 @@ static void create_uart(const VirtMachineState *vms, int uart, g_free(nodename); } +static void create_cpufreq(const VirtMachineState *vms, MemoryRegion *mem) +{ + hwaddr base = vms->memmap[VIRT_CPUFREQ].base; + DeviceState *dev = qdev_new("cpufreq"); + SysBusDevice *s = SYS_BUS_DEVICE(dev); + + sysbus_realize_and_unref(s, &error_fatal); + memory_region_add_subregion(mem, base, sysbus_mmio_get_region(s, 0)); +} + static void create_rtc(const VirtMachineState *vms) { char *nodename; @@ -936,6 +1183,7 @@ static void virt_powerdown_req(Notifier *n, void *opaque) { VirtMachineState *s = container_of(n, VirtMachineState, powerdown_notifier); + qemu_log("send powerdown to vm.\n"); if (s->acpi_dev) { acpi_send_event(s->acpi_dev, ACPI_POWER_DOWN_STATUS); } else { @@ 
-1160,6 +1408,9 @@ static void virt_flash_map1(PFlashCFI01 *flash, qdev_prop_set_uint32(dev, "num-blocks", size / VIRT_FLASH_SECTOR_SIZE); sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal); + if (virtcca_cvm_enabled()) { + return; + } memory_region_add_subregion(sysmem, base, sysbus_mmio_get_region(SYS_BUS_DEVICE(dev), 0)); @@ -1195,6 +1446,10 @@ static void virt_flash_fdt(VirtMachineState *vms, MachineState *ms = MACHINE(vms); char *nodename; + if (virtcca_cvm_enabled()) { + return; + } + if (sysmem == secure_sysmem) { /* Report both flash devices as a single node in the DT */ nodename = g_strdup_printf("/flash@%" PRIx64, flashbase); @@ -1230,6 +1485,23 @@ static void virt_flash_fdt(VirtMachineState *vms, } } +static bool virt_confidential_firmware_init(VirtMachineState *vms, + MemoryRegion *sysmem) +{ + MemoryRegion *fw_ram; + hwaddr fw_base = vms->memmap[VIRT_FLASH].base; + hwaddr fw_size = vms->memmap[VIRT_FLASH].size; + + if (!MACHINE(vms)->firmware) { + return false; + } + + fw_ram = g_new(MemoryRegion, 1); + memory_region_init_ram(fw_ram, NULL, "fw_ram", fw_size, NULL); + memory_region_add_subregion(sysmem, fw_base, fw_ram); + return true; +} + static bool virt_firmware_init(VirtMachineState *vms, MemoryRegion *sysmem, MemoryRegion *secure_sysmem) @@ -1248,6 +1520,10 @@ static bool virt_firmware_init(VirtMachineState *vms, pflash_blk0 = pflash_cfi01_get_blk(vms->flash[0]); + if (virtcca_cvm_enabled()) { + return virt_confidential_firmware_init(vms, sysmem); + } + bios_name = MACHINE(vms)->firmware; if (bios_name) { char *fname; @@ -1289,7 +1565,7 @@ static FWCfgState *create_fw_cfg(const VirtMachineState *vms, AddressSpace *as) char *nodename; fw_cfg = fw_cfg_init_mem_wide(base + 8, base, 8, base + 16, as); - fw_cfg_add_i16(fw_cfg, FW_CFG_NB_CPUS, (uint16_t)ms->smp.cpus); + fw_cfg_add_i16(fw_cfg, FW_CFG_NB_CPUS, vms->boot_cpus); nodename = g_strdup_printf("/fw-cfg@%" PRIx64, base); qemu_fdt_add_subnode(ms->fdt, nodename); @@ -1775,6 +2051,33 @@ static void virt_set_memmap(VirtMachineState *vms, int pa_bits) vms->memmap[i] = base_memmap[i]; } + /* fix VIRT_MEM range */ + if (object_property_find(OBJECT(current_machine), "kvm-type")) { + g_autofree char *kvm_type = object_property_get_str(OBJECT(current_machine), + "kvm-type", &error_abort); + + if (!strcmp(kvm_type, "cvm")) { + /* support kae vf device tree nodes */ + vms->memmap[VIRT_PCIE_MMIO] = (MemMapEntry) { 0x10000000, 0x2edf0000 }; + vms->memmap[VIRT_KAE_DEVICE] = (MemMapEntry) { 0x3edf0000, 0x00200000 }; + uint64_t tmi_version = 0; + int ret = -1; + if (kvm_enabled()) { + ret = kvm_ioctl(kvm_state, KVM_GET_TMI_VERSION, &tmi_version); + } + if (ret < 0) { + warn_report("cannot get TMI version"); + } + if (tmi_version < MIN_TMI_VERSION_FOR_UEFI_BOOTED_CVM) { + vms->memmap[VIRT_MEM].base = 3 * GiB; + } + virtcca_cvm_gpa_start = vms->memmap[VIRT_MEM].base; + vms->memmap[VIRT_MEM].size = ms->ram_size; + info_report("[qemu] fix VIRT_MEM range 0x%llx - 0x%llx", (unsigned long long)(vms->memmap[VIRT_MEM].base), + (unsigned long long)(vms->memmap[VIRT_MEM].base + ms->ram_size)); + } + } + if (ms->ram_slots > ACPI_MAX_RAM_SLOTS) { error_report("unsupported number of memory slots: %"PRIu64, ms->ram_slots); @@ -1962,12 +2265,15 @@ static void finalize_gic_version(VirtMachineState *vms) */ static void virt_cpu_post_init(VirtMachineState *vms, MemoryRegion *sysmem) { + CPUArchIdList *possible_cpus = vms->parent.possible_cpus; int max_cpus = MACHINE(vms)->smp.max_cpus; - bool aarch64, pmu, steal_time; + MachineState *ms = 
MACHINE(vms); + bool aarch64, steal_time; CPUState *cpu; + int n; aarch64 = object_property_get_bool(OBJECT(first_cpu), "aarch64", NULL); - pmu = object_property_get_bool(OBJECT(first_cpu), "pmu", NULL); + vms->pmu = object_property_get_bool(OBJECT(first_cpu), "pmu", NULL); steal_time = object_property_get_bool(OBJECT(first_cpu), "kvm-steal-time", NULL); @@ -1994,8 +2300,13 @@ static void virt_cpu_post_init(VirtMachineState *vms, MemoryRegion *sysmem) memory_region_add_subregion(sysmem, pvtime_reg_base, pvtime); } - CPU_FOREACH(cpu) { - if (pmu) { + for (n = 0; n < possible_cpus->len; n++) { + cpu = qemu_get_possible_cpu(n); + if (!qemu_present_cpu(cpu)) { + continue; + } + + if (vms->pmu) { assert(arm_feature(&ARM_CPU(cpu)->env, ARM_FEATURE_PMU)); if (kvm_irqchip_in_kernel()) { kvm_arm_pmu_set_irq(cpu, VIRTUAL_PMU_IRQ); @@ -2020,6 +2331,207 @@ static void virt_cpu_post_init(VirtMachineState *vms, MemoryRegion *sysmem) } } } + + if (kvm_enabled() || tcg_enabled()) { + for (n = 0; n < possible_cpus->len; n++) { + cpu = qemu_get_possible_cpu(n); + if (!qemu_present_cpu(cpu)) { + continue; + } + + /* + * Now, GIC has been sized with possible CPUs and we don't require + * disabled vCPU objects to be represented in the QOM. Release the + * disabled ARMCPU objects earlier used during init for pre-sizing. + * + * We fake to the guest through ACPI the presence (_STA.PRES=1) + * of these non-existent vCPUs at VMM/qemu and present these as + * disabled vCPUs (_STA.ENA=0) so that they can't be used. These vCPUs + * can be later added to the guest through hotplug exchanges when + * ARMCPU objects are created back again using the 'device_add' QMP + * command. + */ + /* + * RFC: Question: Another approach could have been to keep them forever + * and release them only once when qemu exits as part of finalize, or + * when a new vCPU is hotplugged. In the latter case, the old object + * could be released when a new one is created for the same vCPU? 
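+ *
+ * For illustration only (the CPU type and IDs here are hypothetical): a
+ * vCPU released below would later be re-created through QMP along the
+ * lines of
+ *   device_add host-arm-cpu,id=cpu12,socket-id=1,cluster-id=0,core-id=2,thread-id=0
+ * which builds a fresh ARMCPU object and enters virt_cpu_pre_plug().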
+ */ + if (!qemu_enabled_cpu(cpu)) { + CPUArchId *cpu_slot; + cpu_slot = virt_find_cpu_slot(ms, cpu->cpu_index); + cpu_slot->cpu = NULL; + object_unref(OBJECT(cpu)); + } + } + } +} + +static void virt_cpu_set_properties(Object *cpuobj, const CPUArchId *cpu_slot, + Error **errp) +{ + MachineState *ms = MACHINE(qdev_get_machine()); + VirtMachineState *vms = VIRT_MACHINE(ms); + Error *local_err = NULL; + VirtMachineClass *vmc; + + vmc = VIRT_MACHINE_GET_CLASS(ms); + + /* now, set the cpu object property values */ + numa_cpu_pre_plug(cpu_slot, DEVICE(cpuobj), &local_err); + if (local_err) { + goto out; + } + + object_property_set_int(cpuobj, "mp-affinity", cpu_slot->arch_id, NULL); + + if (!vms->secure) { + object_property_set_bool(cpuobj, "has_el3", false, NULL); + } + + if (!vms->virt && object_property_find(cpuobj, "has_el2")) { + object_property_set_bool(cpuobj, "has_el2", false, NULL); + } + + if (vmc->kvm_no_adjvtime && + object_property_find(cpuobj, "kvm-no-adjvtime")) { + object_property_set_bool(cpuobj, "kvm-no-adjvtime", true, NULL); + } + + if (vmc->no_kvm_steal_time && + object_property_find(cpuobj, "kvm-steal-time")) { + object_property_set_bool(cpuobj, "kvm-steal-time", false, NULL); + } + + if (vmc->no_pmu && object_property_find(cpuobj, "pmu")) { + object_property_set_bool(cpuobj, "pmu", false, NULL); + } + + if (vmc->no_tcg_lpa2 && object_property_find(cpuobj, "lpa2")) { + object_property_set_bool(cpuobj, "lpa2", false, NULL); + } + + if (object_property_find(cpuobj, "reset-cbar")) { + object_property_set_int(cpuobj, "reset-cbar", + vms->memmap[VIRT_CPUPERIPHS].base, + &local_err); + if (local_err) { + goto out; + } + } + + /* link already initialized {secure,tag}-memory regions to this cpu */ + object_property_set_link(cpuobj, "memory", OBJECT(vms->sysmem), &local_err); + if (local_err) { + goto out; + } + + if (vms->secure) { + object_property_set_link(cpuobj, "secure-memory", + OBJECT(vms->secure_sysmem), &local_err); + if (local_err) { + goto out; + } + } + + if (vms->mte) { + if (!object_property_find(cpuobj, "tag-memory")) { + error_setg(&local_err, "MTE requested, but not supported " + "by the guest CPU"); + if (local_err) { + goto out; + } + } + + object_property_set_link(cpuobj, "tag-memory", OBJECT(vms->tag_sysmem), + &local_err); + if (local_err) { + goto out; + } + + if (vms->secure) { + object_property_set_link(cpuobj, "secure-tag-memory", + OBJECT(vms->secure_tag_sysmem), + &local_err); + if (local_err) { + goto out; + } + } + } + + /* + * RFC: Question: this must only be called for the hotplugged cpus. For the + * cold-booted secondary cpus this is taken care of in arm_load_kernel() + * in boot.c. Perhaps we should remove that code now? + */ + if (vms->psci_conduit != QEMU_PSCI_CONDUIT_DISABLED) { + object_property_set_int(cpuobj, "psci-conduit", vms->psci_conduit, + &local_err); + if (local_err) { + goto out; + } + + /* Secondary CPUs start in PSCI powered-down state */ + if (CPU(cpuobj)->cpu_index > 0) { + object_property_set_bool(cpuobj, "start-powered-off", true, NULL); + } + } + +out: + if (local_err) { + error_propagate(errp, local_err); + } + return; +} + +static void fdt_add_hisi_sec_nodes(const VirtMachineState *vms, int dev_id) +{ + const MachineState *ms = MACHINE(vms); + hwaddr size = 0x10000; + + /* + * Calculate the base address for the sec device node. + * Each device group contains one sec device and one hpre device, spaced by 2 * size. + */
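+ /*
+  * A worked example under the cvm memmap values set in virt_set_memmap()
+  * above (VIRT_KAE_DEVICE.base = 0x3edf0000, size = 0x10000): dev_id 0
+  * places sec at 0x3edf0000 and hpre at 0x3ee00000; dev_id 1 places sec
+  * at 0x3ee10000 and hpre at 0x3ee20000.
+  */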
+ */ + hwaddr base = vms->memmap[VIRT_KAE_DEVICE].base + dev_id * 2 * size; + char *nodename; + + tmm_set_sec_addr(base, dev_id); + + nodename = g_strdup_printf("/hisi-sec@%" PRIx64, base); + qemu_fdt_add_subnode(ms->fdt, nodename); + qemu_fdt_setprop_string(ms->fdt, nodename, "compatible", "hisilicon,hip07-sec-vf"); + qemu_fdt_setprop_sized_cells(ms->fdt, nodename, "reg", 2, base, 2, size); + g_free(nodename); +} + +static void fdt_add_hisi_hpre_nodes(const VirtMachineState *vms, int dev_id) +{ + const MachineState *ms = MACHINE(vms); + hwaddr size = 0x10000; + + /* + * Calculate the base address for the hpre device node. + * Each hpre device follows the corresponding sec device by an additional offset of size. + */ + hwaddr base = vms->memmap[VIRT_KAE_DEVICE].base + dev_id * 2 * size + size; + char *nodename; + + tmm_set_hpre_addr(base, dev_id); + + nodename = g_strdup_printf("/hisi-hpre@%" PRIx64, base); + qemu_fdt_add_subnode(ms->fdt, nodename); + qemu_fdt_setprop_string(ms->fdt, nodename, "compatible", "hisilicon,hip07-hpre-vf"); + qemu_fdt_setprop_sized_cells(ms->fdt, nodename, "reg", 2, base, 2, size); + g_free(nodename); +} + +static void fdt_add_all_hisi_nodes(const VirtMachineState *vms, int dev_id) +{ + for (int i = 0; i < dev_id; i++) { + fdt_add_hisi_sec_nodes(vms, i); + fdt_add_hisi_hpre_nodes(vms, i); + } } static void machvirt_init(MachineState *machine) @@ -2028,22 +2540,25 @@ static void machvirt_init(MachineState *machine) VirtMachineClass *vmc = VIRT_MACHINE_GET_CLASS(machine); MachineClass *mc = MACHINE_GET_CLASS(machine); const CPUArchIdList *possible_cpus; - MemoryRegion *sysmem = get_system_memory(); + MemoryRegion *secure_tag_sysmem = NULL; MemoryRegion *secure_sysmem = NULL; MemoryRegion *tag_sysmem = NULL; - MemoryRegion *secure_tag_sysmem = NULL; + MemoryRegion *sysmem; int n, virt_max_cpus; bool firmware_loaded; bool aarch64 = true; bool has_ged = !vmc->no_ged; unsigned int smp_cpus = machine->smp.cpus; unsigned int max_cpus = machine->smp.max_cpus; + ObjectClass *cpu_class; if (!cpu_type_valid(machine->cpu_type)) { error_report("mach-virt: CPU type %s not supported", machine->cpu_type); exit(1); } + finalize_gic_version(vms); + possible_cpus = mc->possible_cpu_arch_ids(machine); /* @@ -2070,10 +2585,7 @@ static void machvirt_init(MachineState *machine) virt_set_memmap(vms, pa_bits); } - /* We can probe only here because during property set - * KVM is not available yet - */ - finalize_gic_version(vms); + sysmem = vms->sysmem = get_system_memory(); if (vms->secure) { /* @@ -2082,7 +2594,7 @@ static void machvirt_init(MachineState *machine) * containing the system memory at low priority; any secure-only * devices go in at higher priority and take precedence. 
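 * (Concretely: memory_region_add_subregion_overlap() below inserts sysmem
 * at priority -1, so a secure-only device mapped later at the default
 * priority 0 shadows that range for accesses made through the secure
 * address space.)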
*/ - secure_sysmem = g_new(MemoryRegion, 1); + secure_sysmem = vms->secure_sysmem = g_new(MemoryRegion, 1); memory_region_init(secure_sysmem, OBJECT(machine), "secure-memory", UINT64_MAX); memory_region_add_subregion_overlap(secure_sysmem, 0, sysmem, -1); @@ -2103,7 +2615,7 @@ */ if (vms->secure && firmware_loaded) { vms->psci_conduit = QEMU_PSCI_CONDUIT_DISABLED; - } else if (vms->virt) { + } else if (vms->virt || virtcca_cvm_enabled()) { vms->psci_conduit = QEMU_PSCI_CONDUIT_SMC; } else { vms->psci_conduit = QEMU_PSCI_CONDUIT_HVC; @@ -2155,107 +2667,129 @@ exit(1); } - create_fdt(vms); + if (vms->mte) { + /* Create the memory region only once, but link to all cpus later */ + tag_sysmem = vms->tag_sysmem = g_new(MemoryRegion, 1); + memory_region_init(tag_sysmem, OBJECT(machine), + "tag-memory", UINT64_MAX / 32); - assert(possible_cpus->len == max_cpus); - for (n = 0; n < possible_cpus->len; n++) { - Object *cpuobj; - CPUState *cs; + if (vms->secure) { + secure_tag_sysmem = vms->secure_tag_sysmem = g_new(MemoryRegion, 1); + memory_region_init(secure_tag_sysmem, OBJECT(machine), + "secure-tag-memory", UINT64_MAX / 32); - if (n >= smp_cpus) { - break; + /* As with ram, secure-tag takes precedence over tag. */ + memory_region_add_subregion_overlap(secure_tag_sysmem, 0, + tag_sysmem, -1); } + } - cpuobj = object_new(possible_cpus->cpus[n].type); - object_property_set_int(cpuobj, "mp-affinity", - possible_cpus->cpus[n].arch_id, NULL); - - cs = CPU(cpuobj); - cs->cpu_index = n; - - numa_cpu_pre_plug(&possible_cpus->cpus[cs->cpu_index], DEVICE(cpuobj), - &error_fatal); + create_fdt(vms); - aarch64 &= object_property_get_bool(cpuobj, "aarch64", NULL); + if (virtcca_cvm_enabled()) { + int kae_num = tmm_get_kae_num(); + fdt_add_all_hisi_nodes(vms, kae_num); - if (!vms->secure) { - object_property_set_bool(cpuobj, "has_el3", false, NULL); + int ret = kvm_arm_tmm_init(machine->cgs, &error_fatal); + if (ret != 0) { + error_report("failed to initialize TMM"); + exit(1); } + } - if (!vms->virt && object_property_find(cpuobj, "has_el2")) { - object_property_set_bool(cpuobj, "has_el2", false, NULL); - } + qemu_log("cpu init start\n"); - if (vmc->kvm_no_adjvtime && - object_property_find(cpuobj, "kvm-no-adjvtime")) { - object_property_set_bool(cpuobj, "kvm-no-adjvtime", true, NULL); - } + cpu_class = object_class_by_name(machine->cpu_type); + has_ged = has_ged && firmware_loaded && + virt_is_acpi_enabled(vms) && + !!object_class_dynamic_cast(cpu_class, TYPE_AARCH64_CPU); - if (vmc->no_kvm_steal_time && - object_property_find(cpuobj, "kvm-steal-time")) { - object_property_set_bool(cpuobj, "kvm-steal-time", false, NULL); - } + if (tcg_enabled() || hvf_enabled() || qtest_enabled() || + (kvm_enabled() && !kvm_smccc_filter_enabled()) || + (vms->gic_version < VIRT_GIC_VERSION_3) || !has_ged) { + vms->cpu_hotplug_enabled = false; + } else { + vms->cpu_hotplug_enabled = true; + } - if (vmc->no_pmu && object_property_find(cpuobj, "pmu")) { - object_property_set_bool(cpuobj, "pmu", false, NULL); + if (!vms->cpu_hotplug_enabled) { + if (machine->smp.max_cpus > smp_cpus) { + warn_report("cpu hotplug feature has been disabled"); } + } - if (vmc->no_tcg_lpa2 && object_property_find(cpuobj, "lpa2")) { - object_property_set_bool(cpuobj, "lpa2", false, NULL); - } + notifier_list_init(&vms->cpuhp_notifiers); + assert(possible_cpus->len == max_cpus); + for (n = 0; n < possible_cpus->len; n++) { + Object *cpuobj; + CPUState *cs; - 
if (object_property_find(cpuobj, "reset-cbar")) { - object_property_set_int(cpuobj, "reset-cbar", - vms->memmap[VIRT_CPUPERIPHS].base, - &error_abort); + if (!vms->cpu_hotplug_enabled && n >= smp_cpus) { + break; } - object_property_set_link(cpuobj, "memory", OBJECT(sysmem), - &error_abort); - if (vms->secure) { - object_property_set_link(cpuobj, "secure-memory", - OBJECT(secure_sysmem), &error_abort); - } - - if (vms->mte) { - /* Create the memory region only once, but link to all cpus. */ - if (!tag_sysmem) { - /* - * The property exists only if MemTag is supported. - * If it is, we must allocate the ram to back that up. - */ - if (!object_property_find(cpuobj, "tag-memory")) { - error_report("MTE requested, but not supported " - "by the guest CPU"); - exit(1); - } + cpuobj = object_new(possible_cpus->cpus[n].type); + cs = CPU(cpuobj); - tag_sysmem = g_new(MemoryRegion, 1); - memory_region_init(tag_sysmem, OBJECT(machine), - "tag-memory", UINT64_MAX / 32); + aarch64 &= object_property_get_bool(cpuobj, "aarch64", NULL); + object_property_set_int(cpuobj, "socket-id", + virt_get_socket_id(machine, n), NULL); + object_property_set_int(cpuobj, "cluster-id", + virt_get_cluster_id(machine, n), NULL); + object_property_set_int(cpuobj, "core-id", + virt_get_core_id(machine, n), NULL); + object_property_set_int(cpuobj, "thread-id", + virt_get_thread_id(machine, n), NULL); + + if (n < smp_cpus) { + qdev_realize(DEVICE(cpuobj), NULL, &error_fatal); + object_unref(cpuobj); + } else { + CPUArchId *cpu_slot; - if (vms->secure) { - secure_tag_sysmem = g_new(MemoryRegion, 1); - memory_region_init(secure_tag_sysmem, OBJECT(machine), - "secure-tag-memory", UINT64_MAX / 32); + /* handling for vcpus which are yet to be hot-plugged */ + cs->cpu_index = n; + cpu_slot = virt_find_cpu_slot(machine, cs->cpu_index); - /* As with ram, secure-tag takes precedence over tag. */ - memory_region_add_subregion_overlap(secure_tag_sysmem, 0, - tag_sysmem, -1); - } - } + /* + * ARM host vCPU features need to be fixed at boot time. But as + * per the current approach this CPU object will be destroyed during + * cpu_post_init(). During hotplug of vCPUs these properties are + * initialized again. + */ + virt_cpu_set_properties(cpuobj, cpu_slot, &error_fatal); - object_property_set_link(cpuobj, "tag-memory", OBJECT(tag_sysmem), - &error_abort); - if (vms->secure) { - object_property_set_link(cpuobj, "secure-tag-memory", - OBJECT(secure_tag_sysmem), - &error_abort); + /* + * For KVM, we shall be pre-creating the now disabled/un-plugged + * possible host vcpus and park them until they are actually + * hot-plugged. This is required to pre-size the host + * GICC and GICR with all possible vcpus for this VM. + */ + if (kvm_enabled()) { + kvm_arm_create_host_vcpu(ARM_CPU(cs)); } + /* + * Add disabled vCPU to CPU slot during the init phase of the virt + * machine + * 1. We need this ARMCPU object during the GIC init. This object + * will facilitate pre-realizing the GIC. Any info like + * mp-affinity (required to derive gicr_type) etc. could still be + * fetched while preserving QOM abstraction akin to realized + * vCPUs. + * 2. Now, after initialization of the virt machine is complete we + * could use two approaches to deal with this ARMCPU object: + * (i) re-use this ARMCPU object during hotplug of this vCPU. + * OR + * (ii) defer releasing this ARMCPU object until after the GIC + * has been initialized, or until the pre-plug phase when a + * vCPU is + * hotplugged. 
+ * + * We will use the (ii) approach and release the ARMCPU objects + * after the GIC and the machine have been fully initialized, during + * the machine_init_done() phase. + */ + cpu_slot->cpu = OBJECT(cs); } - - qdev_realize(DEVICE(cpuobj), NULL, &error_fatal); - object_unref(cpuobj); } fdt_add_timer_nodes(vms); fdt_add_cpu_nodes(vms); @@ -2267,12 +2801,18 @@ create_gic(vms, sysmem); + if (has_ged) { + vms->acpi_dev = create_acpi_ged(vms); + } + virt_cpu_post_init(vms, sysmem); fdt_add_pmu_nodes(vms); create_uart(vms, VIRT_UART, sysmem, serial_hd(0)); + create_cpufreq(vms, sysmem); + if (vms->secure) { create_secure_ram(vms, secure_sysmem, secure_tag_sysmem); create_uart(vms, VIRT_SECURE_UART, secure_sysmem, serial_hd(1)); @@ -2289,9 +2829,7 @@ create_pcie(vms); - if (has_ged && aarch64 && firmware_loaded && virt_is_acpi_enabled(vms)) { - vms->acpi_dev = create_acpi_ged(vms); - } else { + if (!has_ged) { create_gpio_devices(vms, VIRT_GPIO, sysmem); } @@ -2332,6 +2870,9 @@ vms->bootinfo.get_dtb = machvirt_dtb; vms->bootinfo.skip_dtb_autoload = true; vms->bootinfo.firmware_loaded = firmware_loaded; + vms->bootinfo.firmware_base = vms->memmap[VIRT_FLASH].base; + vms->bootinfo.firmware_max_size = vms->memmap[VIRT_FLASH].size; + vms->bootinfo.confidential = virtcca_cvm_enabled(); vms->bootinfo.psci_conduit = vms->psci_conduit; arm_load_kernel(ARM_CPU(first_cpu), machine, &vms->bootinfo); @@ -2661,6 +3202,54 @@ return socket_id % ms->numa_state->num_nodes; } +static int virt_get_socket_id(const MachineState *ms, int cpu_index) +{ + assert(cpu_index >= 0 && cpu_index < ms->possible_cpus->len); + + return ms->possible_cpus->cpus[cpu_index].props.socket_id; +} + +static int virt_get_cluster_id(const MachineState *ms, int cpu_index) +{ + assert(cpu_index >= 0 && cpu_index < ms->possible_cpus->len); + + return ms->possible_cpus->cpus[cpu_index].props.cluster_id; +} + +static int virt_get_core_id(const MachineState *ms, int cpu_index) +{ + assert(cpu_index >= 0 && cpu_index < ms->possible_cpus->len); + + return ms->possible_cpus->cpus[cpu_index].props.core_id; +} + +static int virt_get_thread_id(const MachineState *ms, int cpu_index) +{ + assert(cpu_index >= 0 && cpu_index < ms->possible_cpus->len); + + return ms->possible_cpus->cpus[cpu_index].props.thread_id; +} + +static int +virt_get_cpu_id_from_cpu_topo(const MachineState *ms, DeviceState *dev) +{ + int cpu_id, sock_vcpu_num, clus_vcpu_num, core_vcpu_num; + ARMCPU *cpu = ARM_CPU(dev); + + /* calculate total logical cpus across socket/cluster/core */ + sock_vcpu_num = cpu->socket_id * (ms->smp.threads * ms->smp.cores * + ms->smp.clusters); + clus_vcpu_num = cpu->cluster_id * (ms->smp.threads * ms->smp.cores); + core_vcpu_num = cpu->core_id * ms->smp.threads; + + /* get vcpu-id (logical cpu index) for this vcpu from this topology */ + cpu_id = (sock_vcpu_num + clus_vcpu_num + core_vcpu_num) + cpu->thread_id; + + assert(cpu_id >= 0 && cpu_id < ms->possible_cpus->len); + + return cpu_id; +} + static const CPUArchIdList *virt_possible_cpu_arch_ids(MachineState *ms) { int n; @@ -2678,6 +3267,7 @@ static const CPUArchIdList *virt_possible_cpu_arch_ids(MachineState *ms) ms->possible_cpus->len = max_cpus; for (n = 0; n < ms->possible_cpus->len; n++) { ms->possible_cpus->cpus[n].type = ms->cpu_type; + ms->possible_cpus->cpus[n].vcpus_count = 1; 
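/*
 * Worked example of the topology<->index mapping these per-index props
 * encode (smp values assumed): with sockets=2,clusters=1,cores=2,threads=2,
 * virt_get_cpu_id_from_cpu_topo() folds (socket 1, cluster 0, core 1,
 * thread 0) back to 1*(2*2*1) + 0*(2*2) + 1*2 + 0 = index 6, the inverse
 * of virt_get_socket_id() and friends above.
 */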
ms->possible_cpus->cpus[n].arch_id = virt_cpu_mp_affinity(vms, n); @@ -2698,6 +3288,50 @@ static const CPUArchIdList *virt_possible_cpu_arch_ids(MachineState *ms) return ms->possible_cpus; } +static CPUArchId *virt_find_cpu_slot(MachineState *ms, int vcpuid) +{ + VirtMachineState *vms = VIRT_MACHINE(ms); + CPUArchId *found_cpu; + uint64_t mp_affinity; + + assert(vcpuid >= 0 && vcpuid < ms->possible_cpus->len); + + /* + * RFC: Question: + * TBD: Should mp-affinity be treated as MPIDR? + */ + mp_affinity = virt_cpu_mp_affinity(vms, vcpuid); + found_cpu = &ms->possible_cpus->cpus[vcpuid]; + + assert(found_cpu->arch_id == mp_affinity); + + /* + * RFC: Question: + * Slot-id is the index where a vCPU with a certain arch-id (= mpidr/mp-affinity) + * is plugged. For Host KVM, the MPIDR for a vCPU is derived using vcpu-id. + * As I understand, MPIDR and vcpu-id are properties of the vCPU, but slot-id is + * more related to the machine? Current code assumes slot-id and vcpu-id are the + * same, i.e. the meaning of slot is a bit vague. + * + * Q1: Is there any requirement to clearly represent slot and dissociate it + * from vcpu-id? + * Q2: Should we make MPIDR within host KVM user configurable? + *
 +     *           +----+----+----+----+----+----+----+----+
 +     * MPIDR     |||  Res   |   Aff2   |   Aff1   |  Aff0  |
 +     *           +----+----+----+----+----+----+----+----+
 +     *                 \         \         \        |    |
 +     *                  \  8bit   \  8bit   \       |4bit|
 +     *                   \<------->\<------->\      |<-->|
 +     *                    \         \         \     |    |
 +     *           +----+----+----+----+----+----+----+----+
 +     * VCPU-ID   |  Byte4   |  Byte2   |  Byte1   | Byte0 |
 +     *           +----+----+----+----+----+----+----+----+
 +     */ + + return found_cpu; +} + static void virt_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { @@ -2741,6 +3375,244 @@ static void virt_memory_plug(HotplugHandler *hotplug_dev, dev, &error_abort); } +static void virt_update_gic(VirtMachineState *vms, CPUState *cs) +{ + GICv3CPUHotplugInfo gic_info = { .gic = vms->gic, .cpu = cs }; + + /* notify gic to stitch GICC to this new cpu */ + notifier_list_notify(&vms->cpuhp_notifiers, &gic_info); +} + +static void virt_cpu_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, + Error **errp) +{ + VirtMachineState *vms = VIRT_MACHINE(hotplug_dev); + MachineState *ms = MACHINE(hotplug_dev); + ARMCPU *cpu = ARM_CPU(dev); + CPUState *cs = CPU(dev); + CPUArchId *cpu_slot; + + /* sanity check the cpu */ + if (!object_dynamic_cast(OBJECT(cpu), ms->cpu_type)) { + error_setg(errp, "Invalid CPU type, expected cpu type: '%s'", + ms->cpu_type); + return; + } + + if ((cpu->thread_id < 0) || (cpu->thread_id >= ms->smp.threads)) { + error_setg(errp, "Invalid thread-id %u specified, correct range 0:%u", + cpu->thread_id, ms->smp.threads - 1); + return; + } + + if ((cpu->core_id < 0) || (cpu->core_id >= ms->smp.cores)) { + error_setg(errp, "Invalid core-id %d specified, correct range 0:%u", + cpu->core_id, ms->smp.cores - 1); + return; + } + + if ((cpu->cluster_id < 0) || (cpu->cluster_id >= ms->smp.clusters)) { + error_setg(errp, "Invalid cluster-id %u specified, correct range 0:%u", + cpu->cluster_id, ms->smp.clusters - 1); + return; + } + + if ((cpu->socket_id < 0) || (cpu->socket_id >= ms->smp.sockets)) { + error_setg(errp, "Invalid socket-id %u specified, correct range 0:%u", + cpu->socket_id, ms->smp.sockets - 1); + return; + } + + cs->cpu_index = virt_get_cpu_id_from_cpu_topo(ms, dev); + + /* Except for cold-booted vCPUs, this should check presence of ACPI GED */ + if (cs->cpu_index >= ms->smp.cpus && !vms->acpi_dev) { + error_setg(errp, "GED ACPI device does not exist"); + return; + } + + if (cs->cpu_index >= 
ms->smp.cpus && !vms->cpu_hotplug_enabled) { + error_setg(errp, "CPU [cold|hot]plug not supported on this machine"); + return; + } + + cpu_slot = virt_find_cpu_slot(ms, cs->cpu_index); + if (qemu_present_cpu(CPU(cpu_slot->cpu))) { + error_setg(errp, "cpu(id%d=%d:%d:%d:%d) with arch-id %" PRIu64 " exists", + cs->cpu_index, cpu->socket_id, cpu->cluster_id, cpu->core_id, + cpu->thread_id, cpu_slot->arch_id); + return; + } + virt_cpu_set_properties(OBJECT(cs), cpu_slot, errp); + + /* + * Fix the GIC for this new vCPU being plugged. The QOM CPU object for the + * new vCPU needs to be updated in the corresponding QOM GICv3CPUState object. + * We also need to re-wire the IRQs for this new CPU object. This update + * is limited to the QOM only and does not affect KVM. The latter has + * already been pre-sized with all possible CPUs at VM init time. This is a + * workaround for the constraints posed by the ARM architecture w.r.t. + * supporting CPU hotplug, for which no specification exists. + * This patch-up is required both for {cold,hot}-plugged vCPUs. Cold-inited + * vCPUs have their GIC state initialized during machvirt_init(). + */ + if (vms->acpi_dev) { + virt_update_gic(vms, cs); + wire_gic_cpu_irqs(vms, cs); + } + + /* + * To give the guest a persistent view of vCPU presence, ACPI might need + * to fake the presence of the vCPUs but keep them disabled. + * This shall be used during the init of ACPI Hotplug state and hot-unplug + */ + cs->acpi_persistent = true; + + if (!dev->hotplugged) { + cs->cold_booted = true; + } +#ifdef CONFIG_KVM + if (cs->cpu_index >= ms->smp.cpus) { + cpu->kvm_sve_finalized = true; + } +#endif +} + +static void virt_cpu_plug(HotplugHandler *hotplug_dev, DeviceState *dev, + Error **errp) +{ + VirtMachineState *vms = VIRT_MACHINE(hotplug_dev); + MachineState *ms = MACHINE(hotplug_dev); + CPUState *cs = CPU(dev); + Error *local_err = NULL; + CPUArchId *cpu_slot; + + /* insert the cold/hot-plugged vcpu in the slot */ + cpu_slot = virt_find_cpu_slot(ms, cs->cpu_index); + cpu_slot->cpu = OBJECT(dev); + + /* + * Update the ACPI Hotplug state both for vCPUs being {hot,cold}-plugged. + * vCPUs can be cold-plugged using the '-device' option. For vCPUs being + * hot-plugged, the guest is also notified. + */ + if (vms->acpi_dev) { + HotplugHandlerClass *hhc; + /* update acpi hotplug state and send cpu hotplug event to guest */ + hhc = HOTPLUG_HANDLER_GET_CLASS(vms->acpi_dev); + hhc->plug(HOTPLUG_HANDLER(vms->acpi_dev), dev, &local_err); + if (local_err) { + goto fail; + } + /* register this cpu for reset & update F/W info for the next boot */ + qemu_register_reset(do_cpu_reset, ARM_CPU(cs)); + } + + vms->boot_cpus++; + if (vms->fw_cfg) { + fw_cfg_modify_i16(vms->fw_cfg, FW_CFG_NB_CPUS, vms->boot_cpus); + } + + cs->disabled = false; + return; +fail: + error_propagate(errp, local_err); +} + +static void virt_cpu_unplug_request(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + VirtMachineState *vms = VIRT_MACHINE(hotplug_dev); + HotplugHandlerClass *hhc; + ARMCPU *cpu = ARM_CPU(dev); + CPUState *cs = CPU(dev); + Error *local_err = NULL; + + if (!vms->acpi_dev || !dev->realized) { + error_setg(errp, "GED does not exist or device is not realized"); + return; + } + + if (!vms->cpu_hotplug_enabled) { + error_setg(errp, "CPU hot(un)plug not supported on this machine"); + return; + } + + /* + * UEFI ACPI standard change is required to make both 'enabled' and the + * 'online-capable' bit co-exist instead of being mutually exclusive. 
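+ * (Background, stated as an assumption about the ACPI spec rather than
+ * about this code: in the MADT GICC flags, Enabled is bit 0 and Online
+ * Capable is bit 3, and the current wording allows only one of the two
+ * to be set for a given CPU.)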
+ * check virt_acpi_get_gicc_flags() for more details. + * + * Disable the unplugging of cold-booted vCPUs as a temporary mitigation. + */ + if (cs->cold_booted) { + error_setg(errp, "Hot-unplug of cold-booted CPU not supported!"); + return; + } + + if (cs->cpu_index == first_cpu->cpu_index) { + error_setg(errp, "Boot CPU(id%d=%d:%d:%d:%d) hot-unplug not supported", + first_cpu->cpu_index, cpu->socket_id, cpu->cluster_id, + cpu->core_id, cpu->thread_id); + return; + } + + /* request cpu hot-unplug from guest */ + hhc = HOTPLUG_HANDLER_GET_CLASS(vms->acpi_dev); + hhc->unplug_request(HOTPLUG_HANDLER(vms->acpi_dev), dev, &local_err); + if (local_err) { + goto fail; + } + + return; +fail: + error_propagate(errp, local_err); +} + +static void virt_cpu_unplug(HotplugHandler *hotplug_dev, DeviceState *dev, + Error **errp) +{ + VirtMachineState *vms = VIRT_MACHINE(hotplug_dev); + MachineState *ms = MACHINE(hotplug_dev); + HotplugHandlerClass *hhc; + CPUState *cs = CPU(dev); + Error *local_err = NULL; + CPUArchId *cpu_slot; + + if (!vms->acpi_dev || !dev->realized) { + error_setg(errp, "GED does not exist or device is not realized"); + return; + } + + cpu_slot = virt_find_cpu_slot(ms, cs->cpu_index); + + /* update the acpi cpu hotplug state for cpu hot-unplug */ + hhc = HOTPLUG_HANDLER_GET_CLASS(vms->acpi_dev); + hhc->unplug(HOTPLUG_HANDLER(vms->acpi_dev), dev, &local_err); + if (local_err) { + goto fail; + } + + unwire_gic_cpu_irqs(vms, cs); + virt_update_gic(vms, cs); + + qemu_unregister_reset(do_cpu_reset, ARM_CPU(cs)); + vms->boot_cpus--; + if (vms->fw_cfg) { + fw_cfg_modify_i16(vms->fw_cfg, FW_CFG_NB_CPUS, vms->boot_cpus); + } + + qobject_unref(dev->opts); + dev->opts = NULL; + + cpu_slot->cpu = NULL; + cs->disabled = true; + + return; +fail: + error_propagate(errp, local_err); +} + static void virt_machine_device_pre_plug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { @@ -2783,6 +3655,8 @@ static void virt_machine_device_pre_plug_cb(HotplugHandler *hotplug_dev, qlist_append_str(reserved_regions, resv_prop_str); qdev_prop_set_array(dev, "reserved-regions", reserved_regions); g_free(resv_prop_str); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { + virt_cpu_pre_plug(hotplug_dev, dev, errp); } } @@ -2790,10 +3664,34 @@ static void virt_machine_device_plug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { VirtMachineState *vms = VIRT_MACHINE(hotplug_dev); + MachineClass *mc = MACHINE_GET_CLASS(vms); - if (vms->platform_bus_dev) { - MachineClass *mc = MACHINE_GET_CLASS(vms); + /* For smmuv3-nested devices we need to set the mem & irq */ + if (device_is_dynamic_sysbus(mc, dev) && + object_dynamic_cast(OBJECT(dev), TYPE_ARM_SMMUV3_ACCEL)) { + hwaddr base = vms->memmap[VIRT_SMMU_ACCEL].base; + int irq = vms->irqmap[VIRT_SMMU_ACCEL]; + + if (vms->smmu_accel_count >= MAX_SMMU_ACCEL) { + error_setg(errp, "smmuv3-nested max count reached!"); + return; + } + + base += (vms->smmu_accel_count * SMMU_IO_LEN); + irq += (vms->smmu_accel_count * NUM_SMMU_IRQS); + + sysbus_mmio_map(SYS_BUS_DEVICE(dev), 0, base); + for (int i = 0; i < 4; i++) { + sysbus_connect_irq(SYS_BUS_DEVICE(dev), i, + qdev_get_gpio_in(vms->gic, irq + i)); + } + if (vms->iommu != VIRT_IOMMU_SMMUV3_ACCEL) { + vms->iommu = VIRT_IOMMU_SMMUV3_ACCEL; + } + vms->smmu_accel_count++; + } + if (vms->platform_bus_dev) { if (device_is_dynamic_sysbus(mc, dev)) { platform_bus_link_device(PLATFORM_BUS_DEVICE(vms->platform_bus_dev), SYS_BUS_DEVICE(dev)); @@ -2804,6 +3702,8 @@ static void 
virt_machine_device_plug_cb(HotplugHandler *hotplug_dev, virt_memory_plug(hotplug_dev, dev, errp); } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_PCI)) { virtio_md_pci_plug(VIRTIO_MD_PCI(dev), MACHINE(hotplug_dev), errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { + virt_cpu_plug(hotplug_dev, dev, errp); } if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI)) { @@ -2861,6 +3761,8 @@ static void virt_machine_device_unplug_request_cb(HotplugHandler *hotplug_dev, } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_PCI)) { virtio_md_pci_unplug_request(VIRTIO_MD_PCI(dev), MACHINE(hotplug_dev), errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { + virt_cpu_unplug_request(hotplug_dev, dev, errp); } else { error_setg(errp, "device unplug request for unsupported device" " type: %s", object_get_typename(OBJECT(dev))); @@ -2874,6 +3776,8 @@ static void virt_machine_device_unplug_cb(HotplugHandler *hotplug_dev, virt_dimm_unplug(hotplug_dev, dev, errp); } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_PCI)) { virtio_md_pci_unplug(VIRTIO_MD_PCI(dev), MACHINE(hotplug_dev), errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { + virt_cpu_unplug(hotplug_dev, dev, errp); } else { error_setg(errp, "virt: device unplug for unsupported device" " type: %s", object_get_typename(OBJECT(dev))); @@ -2888,7 +3792,8 @@ static HotplugHandler *virt_machine_get_hotplug_handler(MachineState *machine, if (device_is_dynamic_sysbus(mc, dev) || object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM) || object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_PCI) || - object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI)) { + object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI) || + object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { return HOTPLUG_HANDLER(machine); } return NULL; @@ -2901,6 +3806,15 @@ static HotplugHandler *virt_machine_get_hotplug_handler(MachineState *machine, static int virt_kvm_type(MachineState *ms, const char *type_str) { VirtMachineState *vms = VIRT_MACHINE(ms); + int virtcca_cvm_type = 0; + if (object_property_find(OBJECT(current_machine), "kvm-type")) { + g_autofree char *kvm_type = object_property_get_str(OBJECT(current_machine), + "kvm-type", &error_abort); + + if (!strcmp(kvm_type, "cvm")) { + virtcca_cvm_type = VIRTCCA_CVM_TYPE; + } + } int max_vm_pa_size, requested_pa_size; bool fixed_ipa; @@ -2930,7 +3844,9 @@ static int virt_kvm_type(MachineState *ms, const char *type_str) * the implicit legacy 40b IPA setting, in which case the kvm_type * must be 0. */ - return fixed_ipa ? 0 : requested_pa_size; + return strcmp(type_str, "cvm") == 0 ? + ((fixed_ipa ? 0 : requested_pa_size) | virtcca_cvm_type) : + (fixed_ipa ? 
0 : requested_pa_size); } static void virt_machine_class_init(ObjectClass *oc, void *data) @@ -2948,6 +3864,7 @@ static void virt_machine_class_init(ObjectClass *oc, void *data) machine_class_allow_dynamic_sysbus_dev(mc, TYPE_VFIO_AMD_XGBE); machine_class_allow_dynamic_sysbus_dev(mc, TYPE_RAMFB_DEVICE); machine_class_allow_dynamic_sysbus_dev(mc, TYPE_VFIO_PLATFORM); + machine_class_allow_dynamic_sysbus_dev(mc, TYPE_ARM_SMMUV3_ACCEL); #ifdef CONFIG_TPM machine_class_allow_dynamic_sysbus_dev(mc, TYPE_TPM_TIS_SYSBUS); #endif @@ -2965,6 +3882,7 @@ static void virt_machine_class_init(ObjectClass *oc, void *data) #endif mc->get_default_cpu_node_id = virt_get_default_cpu_node_id; mc->kvm_type = virt_kvm_type; + mc->has_hotpluggable_cpus = true; assert(!mc->get_hotplug_handler); mc->get_hotplug_handler = virt_machine_get_hotplug_handler; hc->pre_plug = virt_machine_device_pre_plug_cb; @@ -3101,6 +4019,19 @@ static void virt_machine_class_init(ObjectClass *oc, void *data) } +static char *virt_get_kvm_type(Object *obj, Error **errp G_GNUC_UNUSED) +{ + VirtMachineState *vms = VIRT_MACHINE(obj); + return g_strdup(vms->kvm_type); +} + +static void virt_set_kvm_type(Object *obj, const char *value, Error **errp G_GNUC_UNUSED) +{ + VirtMachineState *vms = VIRT_MACHINE(obj); + g_free(vms->kvm_type); + vms->kvm_type = g_strdup(value); +} + static void virt_instance_init(Object *obj) { VirtMachineState *vms = VIRT_MACHINE(obj); @@ -3115,6 +4046,9 @@ static void virt_instance_init(Object *obj) /* EL2 is also disabled by default, for similar reasons */ vms->virt = false; + /* CPU hotplug is enabled by default */ + vms->cpu_hotplug_enabled = true; + /* High memory is enabled by default */ vms->highmem = true; vms->highmem_compact = !vmc->no_highmem_compact; @@ -3158,6 +4092,9 @@ static void virt_instance_init(Object *obj) vms->oem_id = g_strndup(ACPI_BUILD_APPNAME6, 6); vms->oem_table_id = g_strndup(ACPI_BUILD_APPNAME8, 8); + + object_property_add_str(obj, "kvm-type", virt_get_kvm_type, virt_set_kvm_type); + object_property_set_description(obj, "kvm-type", "CVM or Normal VM"); } static const TypeInfo virt_machine_info = { diff --git a/hw/audio/cs4231a.c b/hw/audio/cs4231a.c index 3aa105748d36c1b8efa3c1bd2bc81a353365b7d5..88dfd0bb7fb31d71061c239140c747a5a133f336 100644 --- a/hw/audio/cs4231a.c +++ b/hw/audio/cs4231a.c @@ -682,6 +682,11 @@ static void cs4231a_realizefn (DeviceState *dev, Error **errp) return; } + if (s->irq >= ISA_NUM_IRQS) { + error_setg(errp, "Invalid IRQ %d (max %d)", s->irq, ISA_NUM_IRQS - 1); + return; + } + s->pic = isa_bus_get_irq(bus, s->irq); k = ISADMA_GET_CLASS(s->isa_dma); k->register_channel(s->isa_dma, s->dma, cs_dma_read, s); diff --git a/hw/audio/hda-codec.c b/hw/audio/hda-codec.c index 0bc20d49f6cf43ea37141d4543c2d79d8e2bd68a..ac908e56c6bbc6b17a23ae6f6fac499fc7af213b 100644 --- a/hw/audio/hda-codec.c +++ b/hw/audio/hda-codec.c @@ -487,8 +487,7 @@ static void hda_audio_setup(HDAAudioStream *st) if (st->output) { if (use_timer) { cb = hda_audio_output_cb; - st->buft = timer_new_ns(QEMU_CLOCK_VIRTUAL, - hda_audio_output_timer, st); + timer_del(st->buft); } else { cb = hda_audio_compat_output_cb; } @@ -497,8 +496,7 @@ static void hda_audio_setup(HDAAudioStream *st) } else { if (use_timer) { cb = hda_audio_input_cb; - st->buft = timer_new_ns(QEMU_CLOCK_VIRTUAL, - hda_audio_input_timer, st); + timer_del(st->buft); } else { cb = hda_audio_compat_input_cb; } @@ -726,8 +724,12 @@ static void hda_audio_init(HDACodecDevice *hda, st->gain_right = QEMU_HDA_AMP_STEPS; st->compat_bpos = 
sizeof(st->compat_buf); st->output = true; + st->buft = timer_new_ns(QEMU_CLOCK_VIRTUAL, + hda_audio_output_timer, st); } else { st->output = false; + st->buft = timer_new_ns(QEMU_CLOCK_VIRTUAL, + hda_audio_input_timer, st); } st->format = AC_FMT_TYPE_PCM | AC_FMT_BITS_16 | (1 << AC_FMT_CHAN_SHIFT); @@ -750,9 +752,7 @@ static void hda_audio_exit(HDACodecDevice *hda) if (st->node == NULL) { continue; } - if (a->use_timer) { - timer_del(st->buft); - } + timer_free(st->buft); if (st->output) { AUD_close_out(&a->card, st->voice.out); } else { diff --git a/hw/audio/trace-events b/hw/audio/trace-events index b1870ff224b3c1677d2ec976e981b3495ec3eb9d..b8ef5727678f6dcf8c4d1341f38ea519b946a3f5 100644 --- a/hw/audio/trace-events +++ b/hw/audio/trace-events @@ -41,7 +41,6 @@ asc_update_irq(int irq, int a, int b) "set IRQ to %d (A: 0x%x B: 0x%x)" #virtio-snd.c virtio_snd_get_config(void *vdev, uint32_t jacks, uint32_t streams, uint32_t chmaps) "snd %p: get_config jacks=%"PRIu32" streams=%"PRIu32" chmaps=%"PRIu32"" -virtio_snd_set_config(void *vdev, uint32_t jacks, uint32_t new_jacks, uint32_t streams, uint32_t new_streams, uint32_t chmaps, uint32_t new_chmaps) "snd %p: set_config jacks from %"PRIu32"->%"PRIu32", streams from %"PRIu32"->%"PRIu32", chmaps from %"PRIu32"->%"PRIu32 virtio_snd_get_features(void *vdev, uint64_t features) "snd %p: get_features 0x%"PRIx64 virtio_snd_vm_state_running(void) "vm state running" virtio_snd_vm_state_stopped(void) "vm state stopped" diff --git a/hw/audio/virtio-snd.c b/hw/audio/virtio-snd.c index 137fa77a01ce85803962f3027743353b991f179e..9f7a69e408a3cee3782e1762862006114386b972 100644 --- a/hw/audio/virtio-snd.c +++ b/hw/audio/virtio-snd.c @@ -107,29 +107,6 @@ virtio_snd_get_config(VirtIODevice *vdev, uint8_t *config) } -static void -virtio_snd_set_config(VirtIODevice *vdev, const uint8_t *config) -{ - VirtIOSound *s = VIRTIO_SND(vdev); - const virtio_snd_config *sndconfig = - (const virtio_snd_config *)config; - - - trace_virtio_snd_set_config(vdev, - s->snd_conf.jacks, - sndconfig->jacks, - s->snd_conf.streams, - sndconfig->streams, - s->snd_conf.chmaps, - sndconfig->chmaps); - - memcpy(&s->snd_conf, sndconfig, sizeof(virtio_snd_config)); - le32_to_cpus(&s->snd_conf.jacks); - le32_to_cpus(&s->snd_conf.streams); - le32_to_cpus(&s->snd_conf.chmaps); - -} - static void virtio_snd_pcm_buffer_free(VirtIOSoundPCMBuffer *buffer) { @@ -400,7 +377,7 @@ static void virtio_snd_get_qemu_audsettings(audsettings *as, as->nchannels = MIN(AUDIO_MAX_CHANNELS, params->channels); as->fmt = virtio_snd_get_qemu_format(params->format); as->freq = virtio_snd_get_qemu_freq(params->rate); - as->endianness = target_words_bigendian() ? 1 : 0; + as->endianness = 0; /* Conforming to VIRTIO 1.0: always little endian. 
*/ } /* @@ -1274,7 +1251,7 @@ static void virtio_snd_pcm_in_cb(void *data, int available) { VirtIOSoundPCMStream *stream = data; VirtIOSoundPCMBuffer *buffer; - size_t size; + size_t size, max_size; WITH_QEMU_LOCK_GUARD(&stream->queue_mutex) { while (!QSIMPLEQ_EMPTY(&stream->queue)) { @@ -1288,7 +1265,12 @@ static void virtio_snd_pcm_in_cb(void *data, int available) continue; } + max_size = iov_size(buffer->elem->in_sg, buffer->elem->in_num); for (;;) { + if (buffer->size >= max_size) { + return_rx_buffer(stream, buffer); + break; + } size = AUD_read(stream->voice.in, buffer->data + buffer->size, MIN(available, (stream->params.period_bytes - @@ -1399,7 +1381,6 @@ static void virtio_snd_class_init(ObjectClass *klass, void *data) vdc->realize = virtio_snd_realize; vdc->unrealize = virtio_snd_unrealize; vdc->get_config = virtio_snd_get_config; - vdc->set_config = virtio_snd_set_config; vdc->get_features = get_features; vdc->reset = virtio_snd_reset; vdc->legacy_features = 0; diff --git a/hw/block/block.c b/hw/block/block.c index 9f52ee6e728e34f8779a30e2afe7628b29e2dde9..6bece8770917382585ab0447e4bc5ed8df042c30 100644 --- a/hw/block/block.c +++ b/hw/block/block.c @@ -239,6 +239,16 @@ bool blkconf_apply_backend_options(BlockConf *conf, bool readonly, blk_set_enable_write_cache(blk, wce); blk_set_on_error(blk, rerror, werror); + if (rerror == BLOCKDEV_ON_ERROR_RETRY || + werror == BLOCKDEV_ON_ERROR_RETRY) { + if (conf->retry_interval >= 0) { + blk_set_on_error_retry_interval(blk, conf->retry_interval); + } + if (conf->retry_timeout >= 0) { + blk_set_on_error_retry_timeout(blk, conf->retry_timeout); + } + } + block_acct_setup(blk_get_stats(blk), conf->account_invalid, conf->account_failed); return true; diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c index a1f8e155227591fb428f065d11658f133e348a6b..beedc0cf5f859e3a1aae094ab1ad5b5bac1a3f0d 100644 --- a/hw/block/virtio-blk.c +++ b/hw/block/virtio-blk.c @@ -90,6 +90,10 @@ static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error, block_acct_failed(blk_get_stats(s->blk), &req->acct); } virtio_blk_free_request(req); + } else if (action == BLOCK_ERROR_ACTION_RETRY) { + req->mr_next = NULL; + req->next = s->rq; + s->rq = req; } blk_error_action(s->blk, action, is_read, error); @@ -131,6 +135,7 @@ static void virtio_blk_rw_complete(void *opaque, int ret) } } + blk_error_retry_reset_timeout(s->blk); virtio_blk_req_complete(req, VIRTIO_BLK_S_OK); block_acct_done(blk_get_stats(s->blk), &req->acct); virtio_blk_free_request(req); @@ -150,6 +155,7 @@ static void virtio_blk_flush_complete(void *opaque, int ret) } } + blk_error_retry_reset_timeout(s->blk); virtio_blk_req_complete(req, VIRTIO_BLK_S_OK); block_acct_done(blk_get_stats(s->blk), &req->acct); virtio_blk_free_request(req); @@ -172,6 +178,7 @@ static void virtio_blk_discard_write_zeroes_complete(void *opaque, int ret) } } + blk_error_retry_reset_timeout(s->blk); virtio_blk_req_complete(req, VIRTIO_BLK_S_OK); if (is_write_zeroes) { block_acct_done(blk_get_stats(s->blk), &req->acct); @@ -783,7 +790,8 @@ static void virtio_blk_handle_zone_report(VirtIOBlockReq *req, sizeof(struct virtio_blk_zone_report) + sizeof(struct virtio_blk_zone_descriptor)) { virtio_error(vdev, "in buffer too small for zone report"); - return; + err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD; + goto out; } /* start byte offset of the zone report */ @@ -852,7 +860,7 @@ static int virtio_blk_handle_zone_mgmt(VirtIOBlockReq *req, BlockZoneOp op) } else { if (bs->bl.zone_size > capacity - offset) { /* The zoned device 
allows the last smaller zone. */ - len = capacity - bs->bl.zone_size * (bs->bl.nr_zones - 1); + len = capacity - bs->bl.zone_size * (bs->bl.nr_zones - 1ull); } else { len = bs->bl.zone_size; } @@ -1183,12 +1191,12 @@ static void virtio_blk_dma_restart_bh(void *opaque) { VirtIOBlock *s = opaque; - VirtIOBlockReq *req = s->rq; + VirtIOBlockReq *req; MultiReqBuffer mrb = {}; - s->rq = NULL; - aio_context_acquire(blk_get_aio_context(s->conf.conf.blk)); + req = s->rq; + s->rq = NULL; while (req) { VirtIOBlockReq *next = req->next; if (virtio_blk_handle_request(req, &mrb)) { @@ -1541,10 +1549,44 @@ static void virtio_blk_drained_end(void *opaque) } } +static void virtio_blk_retry_request(void *opaque) +{ + VirtIOBlock *s = VIRTIO_BLK(opaque); + + VirtIOBlockReq *req; + MultiReqBuffer mrb = {}; + + aio_context_acquire(blk_get_aio_context(s->conf.conf.blk)); + req = s->rq; + s->rq = NULL; + while (req) { + VirtIOBlockReq *next = req->next; + if (virtio_blk_handle_request(req, &mrb)) { + /* Device is now broken and won't do any processing until it gets + * reset. Already queued requests will be lost: let's purge them. + */ + while (req) { + next = req->next; + virtqueue_detach_element(req->vq, &req->elem, 0); + virtio_blk_free_request(req); + req = next; + } + break; + } + req = next; + } + + if (mrb.num_reqs) { + virtio_blk_submit_multireq(s, &mrb); + } + aio_context_release(blk_get_aio_context(s->conf.conf.blk)); +} + static const BlockDevOps virtio_block_ops = { .resize_cb = virtio_blk_resize, .drained_begin = virtio_blk_drained_begin, .drained_end = virtio_blk_drained_end, + .retry_request_cb = virtio_blk_retry_request, }; static void virtio_blk_device_realize(DeviceState *dev, Error **errp) diff --git a/hw/char/Kconfig b/hw/char/Kconfig index 6b6cf2fc1dff949b1f454308391d74953624a207..335a60c2c14c5e248aed1377daf6f2e4991bd96c 100644 --- a/hw/char/Kconfig +++ b/hw/char/Kconfig @@ -71,3 +71,7 @@ config GOLDFISH_TTY config SHAKTI_UART bool + +config CPUFREQ + bool + default y diff --git a/hw/char/pl011.c b/hw/char/pl011.c index 58edeb9ddb6b39ead1d47d7bb74a1f5f04f4fd18..bc65d778d23f8ae9c2d52d9244858e052ebc3401 100644 --- a/hw/char/pl011.c +++ b/hw/char/pl011.c @@ -314,6 +314,10 @@ static void pl011_write(void *opaque, hwaddr offset, case 17: /* UARTICR */ s->int_level &= ~value; pl011_update(s); + if (!s->int_enabled && !s->int_level) { + s->read_count = 0; + s->read_pos = 0; + } break; case 18: /* UARTDMACR */ s->dmacr = value; diff --git a/hw/char/virtio-serial-bus.c b/hw/char/virtio-serial-bus.c index dd619f0731e8ef7f0f8e8ff647131a998918bc4b..096214b11b183e6ed420d46d5a293388778769c3 100644 --- a/hw/char/virtio-serial-bus.c +++ b/hw/char/virtio-serial-bus.c @@ -257,6 +257,8 @@ static size_t send_control_event(VirtIOSerial *vser, uint32_t port_id, virtio_stw_p(vdev, &cpkt.value, value); trace_virtio_serial_send_control_event(port_id, event, value); + qemu_log("virtio serial port %d sending control message" + " event = %d, value = %d\n", port_id, event, value); return send_control_msg(vser, &cpkt, sizeof(cpkt)); } @@ -364,6 +366,9 @@ static void handle_control_message(VirtIOSerial *vser, void *buf, size_t len) cpkt.value = virtio_lduw_p(vdev, &gcpkt->value); trace_virtio_serial_handle_control_message(cpkt.event, cpkt.value); + qemu_log("virtio serial port '%u' handling control message" + " event = %d, value = %d\n", + virtio_ldl_p(vdev, &gcpkt->id), cpkt.event, cpkt.value); if (cpkt.event == VIRTIO_CONSOLE_DEVICE_READY) { if (!cpkt.value) { @@ -985,8 +990,7 @@ static void 
virtser_port_device_realize(DeviceState *dev, Error **errp) return; } - port->bh = qemu_bh_new_guarded(flush_queued_data_bh, port, - &dev->mem_reentrancy_guard); + port->bh = virtio_bh_new_guarded(dev, flush_queued_data_bh, port); port->elem = NULL; } diff --git a/hw/core/cpu-common.c b/hw/core/cpu-common.c index 82dae51a550b52b836fe9802784e6d50fb71c337..e36ca2c207093a41ff4554dc41941fe424edad25 100644 --- a/hw/core/cpu-common.c +++ b/hw/core/cpu-common.c @@ -262,6 +262,10 @@ static void cpu_common_finalize(Object *obj) { CPUState *cpu = CPU(obj); + /* If cleanup didn't happen in context to gdb_unregister_coprocessor_all */ + if (cpu->gdb_regs) { + g_array_free(cpu->gdb_regs, TRUE); + } qemu_lockcnt_destroy(&cpu->in_ioctl_lock); qemu_mutex_destroy(&cpu->work_mutex); } diff --git a/hw/core/gpio.c b/hw/core/gpio.c index 80d07a6ec99ade02cbf69ed4b2f691e04eef03d7..abb164d5c0e7d10c02ea0002e125d3a35bca5da6 100644 --- a/hw/core/gpio.c +++ b/hw/core/gpio.c @@ -143,7 +143,7 @@ qemu_irq qdev_get_gpio_out_connector(DeviceState *dev, const char *name, int n) /* disconnect a GPIO output, returning the disconnected input (if any) */ -static qemu_irq qdev_disconnect_gpio_out_named(DeviceState *dev, +qemu_irq qdev_disconnect_gpio_out_named(DeviceState *dev, const char *name, int n) { char *propname = g_strdup_printf("%s[%d]", diff --git a/hw/core/irq.c b/hw/core/irq.c index 3f14e2dda74709f35014a37d5997f0c646b4f240..df9b5dac9be4fffb545286aaf2ab3d41da1b42df 100644 --- a/hw/core/irq.c +++ b/hw/core/irq.c @@ -110,7 +110,10 @@ void qemu_irq_intercept_in(qemu_irq *gpio_in, qemu_irq_handler handler, int n) int i; qemu_irq *old_irqs = qemu_allocate_irqs(NULL, NULL, n); for (i = 0; i < n; i++) { - *old_irqs[i] = *gpio_in[i]; + old_irqs[i]->handler = gpio_in[i]->handler; + old_irqs[i]->opaque = gpio_in[i]->opaque; + old_irqs[i]->n = gpio_in[i]->n; + gpio_in[i]->handler = handler; gpio_in[i]->opaque = &old_irqs[i]; } diff --git a/hw/core/machine-qmp-cmds.c b/hw/core/machine-qmp-cmds.c index 3860a50c3b7bad1f496f2de980624e824e58dec7..f1389ef644a91c7f2eab7571372f18d8cc5194ed 100644 --- a/hw/core/machine-qmp-cmds.c +++ b/hw/core/machine-qmp-cmds.c @@ -8,6 +8,7 @@ */ #include "qemu/osdep.h" +#include "sysemu/rtc.h" #include "hw/acpi/vmgenid.h" #include "hw/boards.h" #include "hw/intc/intc.h" @@ -373,6 +374,11 @@ HumanReadableText *qmp_x_query_irq(Error **errp) return human_readable_text_from_str(buf); } +int64_t qmp_query_rtc_date_diff(Error **errp) +{ + return get_rtc_date_diff(); +} + GuidInfo *qmp_query_vm_generation_id(Error **errp) { GuidInfo *info; diff --git a/hw/core/machine.c b/hw/core/machine.c index 0c1739814124ca1045a383a1f3b9e791221a0bd6..965682619bdf23d58c33cab7c965ee3225c23ca8 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -160,6 +160,8 @@ const size_t hw_compat_4_0_len = G_N_ELEMENTS(hw_compat_4_0); GlobalProperty hw_compat_3_1[] = { { "pcie-root-port", "x-speed", "2_5" }, { "pcie-root-port", "x-width", "1" }, + { "pcie-root-port", "fast-plug", "0" }, + { "pcie-root-port", "fast-unplug", "0" }, { "memory-backend-file", "x-use-canonical-path-for-ramblock-id", "true" }, { "memory-backend-memfd", "x-use-canonical-path-for-ramblock-id", "true" }, { "tpm-crb", "ppi", "false" }, diff --git a/hw/core/numa.c b/hw/core/numa.c index f08956ddb0ff794833213b1bbe98c7702871743d..98d896e68704465a7785d90ee67f0318060d41ac 100644 --- a/hw/core/numa.c +++ b/hw/core/numa.c @@ -42,6 +42,8 @@ #include "qemu/option.h" #include "qemu/config-file.h" #include "qemu/cutils.h" +#include "exec/address-spaces.h" +#include 
"sysemu/kvm.h" QemuOptsList qemu_numa_opts = { .name = "numa", @@ -641,6 +643,21 @@ static void numa_init_memdev_container(MachineState *ms, MemoryRegion *ram) } } +/* + * Add virtcca_shared_hugepage as a sub-MR to the root MR of address space + * address_space_memory and address_space_virtcca_shared_memory. + */ +static void virtcca_shared_memory_configuration(MachineState *ms) +{ + MemoryRegion *alias_mr = g_new(MemoryRegion, 1); + + memory_region_add_subregion_overlap(ms->ram, 0, virtcca_shared_hugepage, 1); + memory_region_init_alias(alias_mr, NULL, "alias-mr", virtcca_shared_hugepage, + 0, int128_get64(virtcca_shared_hugepage->size)); + memory_region_add_subregion(address_space_virtcca_shared_memory.root, + virtcca_cvm_gpa_start, alias_mr); +} + void numa_complete_configuration(MachineState *ms) { int i; @@ -711,6 +728,10 @@ void numa_complete_configuration(MachineState *ms) memory_region_init(ms->ram, OBJECT(ms), mc->default_ram_id, ms->ram_size); numa_init_memdev_container(ms, ms->ram); + if (virtcca_cvm_enabled() && virtcca_shared_hugepage && + virtcca_shared_hugepage->ram_block) { + virtcca_shared_memory_configuration(ms); + } } /* QEMU needs at least all unique node pair distances to build * the whole NUMA distance table. QEMU treats the distance table diff --git a/hw/core/platform-bus.c b/hw/core/platform-bus.c index b8487b26b674e46e445749118abc7b2cc6918f0f..dc58bf505aa2f5c0460eed9b41d9ee41d92b28f1 100644 --- a/hw/core/platform-bus.c +++ b/hw/core/platform-bus.c @@ -145,9 +145,12 @@ static void platform_bus_map_mmio(PlatformBusDevice *pbus, SysBusDevice *sbdev, * the target device's memory region */ for (off = 0; off < pbus->mmio_size; off += alignment) { - if (!memory_region_find(&pbus->mmio, off, size).mr) { + MemoryRegion *mr = memory_region_find(&pbus->mmio, off, size).mr; + if (!mr) { found_region = true; break; + } else { + memory_region_unref(mr); } } diff --git a/hw/core/ptimer.c b/hw/core/ptimer.c index e03165febf149dcdb25b1a4bb5c39a0a8fcd5869..7177ecfab0dbbb2e6ea2f4f60d371b00cff9a9af 100644 --- a/hw/core/ptimer.c +++ b/hw/core/ptimer.c @@ -83,7 +83,7 @@ static void ptimer_reload(ptimer_state *s, int delta_adjust) delta = s->delta = s->limit; } - if (s->period == 0) { + if (s->period == 0 && s->period_frac == 0) { if (!qtest_enabled()) { fprintf(stderr, "Timer with period zero, disabling\n"); } @@ -309,7 +309,7 @@ void ptimer_run(ptimer_state *s, int oneshot) assert(s->in_transaction); - if (was_disabled && s->period == 0) { + if (was_disabled && s->period == 0 && s->period_frac == 0) { if (!qtest_enabled()) { fprintf(stderr, "Timer with period zero, disabling\n"); } diff --git a/hw/core/qdev-prop-internal.h b/hw/core/qdev-prop-internal.h index d7b77844fe0b2fa14ee8db6728b4ce9fb005e4e5..68b1b9d10c86ab4195c245e7708442de7675e179 100644 --- a/hw/core/qdev-prop-internal.h +++ b/hw/core/qdev-prop-internal.h @@ -22,6 +22,8 @@ void qdev_propinfo_set_default_value_uint(ObjectProperty *op, void qdev_propinfo_get_int32(Object *obj, Visitor *v, const char *name, void *opaque, Error **errp); +void qdev_propinfo_get_int64(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp); void qdev_propinfo_get_size32(Object *obj, Visitor *v, const char *name, void *opaque, Error **errp); diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-system.c index 1473ab3d5e956796fd445ad0d109a9619bfd93b8..9cc2e38aba27e0cf5d33a599ccc9e21c2f4245dc 100644 --- a/hw/core/qdev-properties-system.c +++ b/hw/core/qdev-properties-system.c @@ -635,6 +635,51 @@ const PropertyInfo 
qdev_prop_blockdev_on_error = { .set_default_value = qdev_propinfo_set_default_value_enum, }; +static void set_retry_time(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + DeviceState *dev = DEVICE(obj); + Property *prop = opaque; + int64_t value, *ptr = object_field_prop_ptr(obj, prop); + Error *local_err = NULL; + + if (dev->realized) { + qdev_prop_set_after_realize(dev, name, errp); + return; + } + + visit_type_int64(v, name, &value, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + + /* value should not be negative */ + if (value < 0) { + error_setg(errp, QERR_PROPERTY_VALUE_OUT_OF_RANGE, + dev->id ? : "", name, value, INT64_C(0), INT64_MAX); + return; + } + + *ptr = value; +} + +const PropertyInfo qdev_prop_blockdev_retry_interval = { + .name = "BlockdevRetryInterval", + .description = "Interval for retry error handling policy", + .get = qdev_propinfo_get_int64, + .set = set_retry_time, + .set_default_value = qdev_propinfo_set_default_value_int, +}; + +const PropertyInfo qdev_prop_blockdev_retry_timeout = { + .name = "BlockdevRetryTimeout", + .description = "Timeout for retry error handling policy", + .get = qdev_propinfo_get_int64, + .set = set_retry_time, + .set_default_value = qdev_propinfo_set_default_value_int, +}; + /* --- BIOS CHS translation */ QEMU_BUILD_BUG_ON(sizeof(BiosAtaTranslation) != sizeof(int)); @@ -666,7 +711,7 @@ const PropertyInfo qdev_prop_fdc_drive_type = { const PropertyInfo qdev_prop_multifd_compression = { .name = "MultiFDCompression", .description = "multifd_compression values, " - "none/zlib/zstd", + "none/zlib/zstd/qpl/uadk/qatzip", .enum_table = &MultiFDCompression_lookup, .get = qdev_propinfo_get_enum, .set = qdev_propinfo_set_enum, @@ -687,6 +732,16 @@ const PropertyInfo qdev_prop_mig_mode = { .set_default_value = qdev_propinfo_set_default_value_enum, }; +const PropertyInfo qdev_prop_zero_page_detection = { + .name = "ZeroPageDetection", + .description = "zero_page_detection values, " + "none,legacy,multifd", + .enum_table = &ZeroPageDetection_lookup, + .get = qdev_propinfo_get_enum, + .set = qdev_propinfo_set_enum, + .set_default_value = qdev_propinfo_set_default_value_enum, +}; + /* --- Reserved Region --- */ /* @@ -1157,6 +1212,17 @@ const PropertyInfo qdev_prop_uuid = { .set_default_value = set_default_uuid_auto, }; +/* --- CompressMethod --- */ +const PropertyInfo qdev_prop_compress_method = { + .name = "CompressMethod", + .description = "multi-thread compression method, " + "zlib/zstd", + .enum_table = &CompressMethod_lookup, + .get = qdev_propinfo_get_enum, + .set = qdev_propinfo_set_enum, + .set_default_value = qdev_propinfo_set_default_value_enum, +}; + /* --- s390 cpu entitlement policy --- */ QEMU_BUILD_BUG_ON(sizeof(CpuS390Entitlement) != sizeof(int)); diff --git a/hw/core/qdev-properties.c b/hw/core/qdev-properties.c index 840006e953c077c4f00bca8c90b1ede16fd3daf5..19b7450b4db19459f9bb1e00986e0a38afd254a1 100644 --- a/hw/core/qdev-properties.c +++ b/hw/core/qdev-properties.c @@ -398,7 +398,7 @@ static void set_uint64(Object *obj, Visitor *v, const char *name, visit_type_uint64(v, name, ptr, errp); } -static void get_int64(Object *obj, Visitor *v, const char *name, +void qdev_propinfo_get_int64(Object *obj, Visitor *v, const char *name, void *opaque, Error **errp) { Property *prop = opaque; @@ -425,7 +425,7 @@ const PropertyInfo qdev_prop_uint64 = { const PropertyInfo qdev_prop_int64 = { .name = "int64", - .get = get_int64, + .get = qdev_propinfo_get_int64, .set = set_int64,
.set_default_value = qdev_propinfo_set_default_value_int, }; diff --git a/hw/core/reset.c b/hw/core/reset.c index d3263b613e661eeb7605d2e63ad2321c7838df46..fa63bfedb7083188a5e798dd35534b8635b61738 100644 --- a/hw/core/reset.c +++ b/hw/core/reset.c @@ -25,6 +25,7 @@ #include "qemu/osdep.h" #include "qemu/queue.h" +#include "qemu/log.h" #include "sysemu/reset.h" /* reset/shutdown handler */ @@ -75,6 +76,7 @@ void qemu_devices_reset(ShutdownCause reason) { QEMUResetEntry *re, *nre; + qemu_log("reset all devices\n"); /* reset all devices */ QTAILQ_FOREACH_SAFE(re, &reset_handlers, entry, nre) { if (reason == SHUTDOWN_CAUSE_SNAPSHOT_LOAD && diff --git a/hw/core/sysbus-fdt.c b/hw/core/sysbus-fdt.c index eebcd28f9a377178022016399dcc3ce4cc12854b..58f4dc614c2ccedf475d7a92e4eff8982b232732 100644 --- a/hw/core/sysbus-fdt.c +++ b/hw/core/sysbus-fdt.c @@ -489,6 +489,7 @@ static const BindingEntry bindings[] = { #ifdef CONFIG_LINUX TYPE_BINDING(TYPE_VFIO_CALXEDA_XGMAC, add_calxeda_midway_xgmac_fdt_node), TYPE_BINDING(TYPE_VFIO_AMD_XGBE, add_amd_xgbe_fdt_node), + TYPE_BINDING("arm-smmuv3-accel", no_fdt_node), VFIO_PLATFORM_BINDING("amd,xgbe-seattle-v1a", add_amd_xgbe_fdt_node), #endif #ifdef CONFIG_TPM diff --git a/hw/cxl/cxl-host.c b/hw/cxl/cxl-host.c index 2aa776c79c7465198328046d32072c58110c7fc7..c5f5fcfd64d00d400e8d3640afb1bbe0de7bb49b 100644 --- a/hw/cxl/cxl-host.c +++ b/hw/cxl/cxl-host.c @@ -26,6 +26,7 @@ static void cxl_fixed_memory_window_config(CXLState *cxl_state, CXLFixedMemoryWindowOptions *object, Error **errp) { + ERRP_GUARD(); g_autofree CXLFixedWindow *fw = g_malloc0(sizeof(*fw)); strList *target; int i; diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c index 6eff56fb1b345d1fd440f18c6b39541186157ac2..68d6457becde027f73f3bc9c5e2dd2a9b0740a1d 100644 --- a/hw/cxl/cxl-mailbox-utils.c +++ b/hw/cxl/cxl-mailbox-utils.c @@ -505,6 +505,9 @@ static CXLRetCode cmd_get_physical_port_state(const struct cxl_cmd *cmd, in = (struct cxl_fmapi_get_phys_port_state_req_pl *)payload_in; out = (struct cxl_fmapi_get_phys_port_state_resp_pl *)payload_out; + if (len_in < sizeof(*in)) { + return CXL_MBOX_INVALID_PAYLOAD_LENGTH; + } /* Check if what was requested can fit */ if (sizeof(*out) + sizeof(*out->ports) * in->num_ports > cci->payload_max) { return CXL_MBOX_INVALID_INPUT; @@ -897,8 +900,8 @@ static CXLRetCode cmd_ccls_set_lsa(const struct cxl_cmd *cmd, const size_t hdr_len = offsetof(struct set_lsa_pl, data); *len_out = 0; - if (!len_in) { - return CXL_MBOX_SUCCESS; + if (len_in < hdr_len) { + return CXL_MBOX_INVALID_PAYLOAD_LENGTH; } if (set_lsa_payload->offset + len_in > cvc->get_lsa_size(ct3d) + hdr_len) { diff --git a/hw/display/bcm2835_fb.c b/hw/display/bcm2835_fb.c index a05277674f2c64634e1489fe895545d50e8ec2bb..c45da149d99f25ec3ced04e0bbfd96a2cec7f11a 100644 --- a/hw/display/bcm2835_fb.c +++ b/hw/display/bcm2835_fb.c @@ -145,7 +145,7 @@ static bool fb_use_offsets(BCM2835FBConfig *config) * viewport size is larger than the physical screen. (It doesn't * prevent the guest setting this silly viewport setting, though...) 
*/ - return config->xres_virtual > config->xres && + return config->xres_virtual > config->xres || config->yres_virtual > config->yres; } diff --git a/hw/display/macfb.c b/hw/display/macfb.c index d61541ccb5d5d3d13c65539bef4f237f690fc540..170da35757aded66339c41a195a857396d49d719 100644 --- a/hw/display/macfb.c +++ b/hw/display/macfb.c @@ -714,6 +714,7 @@ static void macfb_nubus_set_irq(void *opaque, int n, int level) static void macfb_nubus_realize(DeviceState *dev, Error **errp) { + ERRP_GUARD(); NubusDevice *nd = NUBUS_DEVICE(dev); MacfbNubusState *s = NUBUS_MACFB(dev); MacfbNubusDeviceClass *ndc = NUBUS_MACFB_GET_CLASS(dev); diff --git a/hw/display/vga.c b/hw/display/vga.c index 37557c3442aa8b709e74930502ed3a573da3d222..3f1358676bd99d59917e45e6b3e20c14ff814a3b 100644 --- a/hw/display/vga.c +++ b/hw/display/vga.c @@ -39,6 +39,8 @@ #include "migration/vmstate.h" #include "trace.h" +#include "sysemu/kvm.h" + //#define DEBUG_VGA_MEM //#define DEBUG_VGA_REG @@ -1748,6 +1750,13 @@ static void vga_draw_blank(VGACommonState *s, int full_update) if (s->last_scr_width <= 0 || s->last_scr_height <= 0) return; + if (is_buffer_shared(surface)) { + /* unshare buffer, otherwise the blanking corrupts vga vram */ + surface = qemu_create_displaysurface(s->last_scr_width, + s->last_scr_height); + dpy_gfx_replace_surface(s->con, surface); + } + w = s->last_scr_width * surface_bytes_per_pixel(surface); d = surface_data(surface); for(i = 0; i < s->last_scr_height; i++) { @@ -1783,6 +1792,11 @@ static void vga_update_display(void *opaque) s->cursor_blink_time = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL); full_update = 1; } + + /* Force to full update in CSV guest. */ + if (kvm_csv3_enabled()) + full_update = 1; + switch(graphic_mode) { case GMODE_TEXT: vga_draw_text(s, full_update); diff --git a/hw/display/vhost-user-gpu.c b/hw/display/vhost-user-gpu.c index 709c8a02a1468dfb26355bf363e717f1b410c183..373f04a7b43e4f6c7387847a2e70732dda1396d1 100644 --- a/hw/display/vhost-user-gpu.c +++ b/hw/display/vhost-user-gpu.c @@ -385,7 +385,7 @@ vhost_user_gpu_chr_read(void *opaque) } msg->request = request; - msg->flags = size; + msg->flags = flags; msg->size = size; if (request == VHOST_USER_GPU_CURSOR_UPDATE || diff --git a/hw/display/virtio-gpu-virgl.c b/hw/display/virtio-gpu-virgl.c index 8bb7a2c21fe7878731cbe9289206ac4f1fc80535..9f34d0e6619cbb9a66a7398722ab134fbf490e5b 100644 --- a/hw/display/virtio-gpu-virgl.c +++ b/hw/display/virtio-gpu-virgl.c @@ -181,7 +181,7 @@ static void virgl_cmd_set_scanout(VirtIOGPU *g, memset(&info, 0, sizeof(info)); ret = virgl_renderer_resource_get_info(ss.resource_id, &info); #endif - if (ret == -1) { + if (ret) { qemu_log_mask(LOG_GUEST_ERROR, "%s: illegal resource specified %d\n", __func__, ss.resource_id); diff --git a/hw/display/virtio-gpu.c b/hw/display/virtio-gpu.c index b016d3bac85c27ccbfb64241d6d9e5f319957401..a7146388221f7dacfa50d2eea65beb3fa541cadb 100644 --- a/hw/display/virtio-gpu.c +++ b/hw/display/virtio-gpu.c @@ -679,10 +679,6 @@ static void virtio_gpu_do_set_scanout(VirtIOGPU *g, /* realloc the surface ptr */ scanout->ds = qemu_create_displaysurface_pixman(rect); - if (!scanout->ds) { - *error = VIRTIO_GPU_RESP_ERR_UNSPEC; - return; - } #ifdef WIN32 qemu_displaysurface_win32_set_handle(scanout->ds, res->handle, fb->offset); #endif @@ -1418,9 +1414,6 @@ static int virtio_gpu_post_load(void *opaque, int version_id) return -EINVAL; } scanout->ds = qemu_create_displaysurface_pixman(res->image); - if (!scanout->ds) { - return -EINVAL; - } #ifdef WIN32 
qemu_displaysurface_win32_set_handle(scanout->ds, res->handle, 0); #endif @@ -1463,10 +1456,8 @@ void virtio_gpu_device_realize(DeviceState *qdev, Error **errp) g->ctrl_vq = virtio_get_queue(vdev, 0); g->cursor_vq = virtio_get_queue(vdev, 1); - g->ctrl_bh = qemu_bh_new_guarded(virtio_gpu_ctrl_bh, g, - &qdev->mem_reentrancy_guard); - g->cursor_bh = qemu_bh_new_guarded(virtio_gpu_cursor_bh, g, - &qdev->mem_reentrancy_guard); + g->ctrl_bh = virtio_bh_new_guarded(qdev, virtio_gpu_ctrl_bh, g); + g->cursor_bh = virtio_bh_new_guarded(qdev, virtio_gpu_cursor_bh, g); g->reset_bh = qemu_bh_new(virtio_gpu_reset_bh, g); qemu_cond_init(&g->reset_cond); QTAILQ_INIT(&g->reslist); diff --git a/hw/gpio/aspeed_gpio.c b/hw/gpio/aspeed_gpio.c index 1e267dd48203c23d3266124ef0e14aa1bd275c64..0fc3d4c05fcf4766f50e230971388b06bec4f8f3 100644 --- a/hw/gpio/aspeed_gpio.c +++ b/hw/gpio/aspeed_gpio.c @@ -281,7 +281,7 @@ static void aspeed_gpio_update(AspeedGPIOState *s, GPIOSets *regs, diff &= mode_mask; if (diff) { for (gpio = 0; gpio < ASPEED_GPIOS_PER_SET; gpio++) { - uint32_t mask = 1 << gpio; + uint32_t mask = 1U << gpio; /* If the gpio needs to be updated... */ if (!(diff & mask)) { diff --git a/hw/i2c/smbus_slave.c b/hw/i2c/smbus_slave.c index 2ef2c7c5f69495464bc5965ae9b251db6425ad70..b35516a404c510768de3a61d967bd5a632aa57c8 100644 --- a/hw/i2c/smbus_slave.c +++ b/hw/i2c/smbus_slave.c @@ -25,11 +25,15 @@ #define DPRINTF(fmt, ...) \ do { printf("smbus(%02x): " fmt , dev->i2c.address, ## __VA_ARGS__); } while (0) #define BADF(fmt, ...) \ -do { fprintf(stderr, "smbus: error: " fmt , ## __VA_ARGS__); exit(1);} while (0) +do { g_autofree char *qom_path = object_get_canonical_path(OBJECT(dev)); \ + fprintf(stderr, "%s: smbus: error: " fmt , qom_path, ## __VA_ARGS__); \ + exit(1); } while (0) #else #define DPRINTF(fmt, ...) do {} while(0) #define BADF(fmt, ...) \ -do { fprintf(stderr, "smbus: error: " fmt , ## __VA_ARGS__);} while (0) +do { g_autofree char *qom_path = object_get_canonical_path(OBJECT(dev)); \ + fprintf(stderr, "%s: smbus: error: " fmt , qom_path, ## __VA_ARGS__); \ + } while (0) #endif enum { diff --git a/hw/i386/Kconfig b/hw/i386/Kconfig index 55850791df4148f5535eb06b76e09dabf75d84f1..908f29e02bc6a8ff84e3fe147d4da9dcbf739f41 100644 --- a/hw/i386/Kconfig +++ b/hw/i386/Kconfig @@ -10,6 +10,15 @@ config SGX bool depends on KVM +config CSV + bool + select HYGON_CSV_MIG_ACCEL + depends on SEV + +config HYGON_CSV_MIG_ACCEL + bool + depends on CSV + config PC bool imply APPLESMC @@ -26,6 +35,7 @@ config PC imply QXL imply SEV imply SGX + imply CSV imply TEST_DEVICES imply TPM_CRB imply TPM_TIS_ISA @@ -95,6 +105,7 @@ config Q35 imply E1000E_PCI_EXPRESS imply VMPORT imply VMMOUSE + imply IOMMUFD select PC_PCI select PC_ACPI select PCI_EXPRESS_Q35 diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index 80db183b786afbe1fc44b04e45ac0eaa08f7f3d2..e10799ecc6b2ad9c1f63195126f48434a33ee425 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -1545,8 +1545,9 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, .smi_path = pm->smi_on_cpuhp ? 
"\\_SB.PCI0.SMI0.SMIC" : NULL, .fw_unplugs_cpu = pm->smi_on_cpu_unplug, }; - build_cpus_aml(dsdt, machine, opts, pc_madt_cpu_entry, - pm->cpu_hp_io_base, "\\_SB.PCI0", "\\_GPE._E02"); + build_cpus_aml(dsdt, machine, opts, pc_madt_cpu_entry, NULL, + pm->cpu_hp_io_base, "\\_SB.PCI0", "\\_GPE._E02", + AML_SYSTEM_IO); } if (pcms->memhp_io_base && nr_mem) { diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c index 4203144da9865efa61afc0df2fae12f8fd432728..12742b1433f82ffc2c482518eaa8671829dff851 100644 --- a/hw/i386/amd_iommu.c +++ b/hw/i386/amd_iommu.c @@ -346,12 +346,12 @@ static void amdvi_update_iotlb(AMDVIState *s, uint16_t devid, uint64_t gpa, IOMMUTLBEntry to_cache, uint16_t domid) { - AMDVIIOTLBEntry *entry = g_new(AMDVIIOTLBEntry, 1); - uint64_t *key = g_new(uint64_t, 1); - uint64_t gfn = gpa >> AMDVI_PAGE_SHIFT_4K; - /* don't cache erroneous translations */ if (to_cache.perm != IOMMU_NONE) { + AMDVIIOTLBEntry *entry = g_new(AMDVIIOTLBEntry, 1); + uint64_t *key = g_new(uint64_t, 1); + uint64_t gfn = gpa >> AMDVI_PAGE_SHIFT_4K; + trace_amdvi_cache_update(domid, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), gpa, to_cache.translated_addr); diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 5085a6fee3f1d4543b5947c1bc99cbc06982239d..60d86e0cb692c56f43f82feee9c3c2916fa83580 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -61,6 +61,12 @@ struct vtd_as_key { uint32_t pasid; }; +/* bus/devfn is PCI device's real BDF not the aliased one */ +struct vtd_hiod_key { + PCIBus *bus; + uint8_t devfn; +}; + struct vtd_iotlb_key { uint64_t gfn; uint32_t pasid; @@ -250,6 +256,25 @@ static guint vtd_as_hash(gconstpointer v) return (guint)(value << 8 | key->devfn); } +/* Same implementation as vtd_as_hash() */ +static guint vtd_hiod_hash(gconstpointer v) +{ + return vtd_as_hash(v); +} + +static gboolean vtd_hiod_equal(gconstpointer v1, gconstpointer v2) +{ + const struct vtd_hiod_key *key1 = v1; + const struct vtd_hiod_key *key2 = v2; + + return (key1->bus == key2->bus) && (key1->devfn == key2->devfn); +} + +static void vtd_hiod_destroy(gpointer v) +{ + object_unref(v); +} + static gboolean vtd_hash_remove_by_domain(gpointer key, gpointer value, gpointer user_data) { @@ -2813,6 +2838,7 @@ static void vtd_handle_iqt_write(IntelIOMMUState *s) if (s->iq_dw && (val & VTD_IQT_QT_256_RSV_BIT)) { error_report_once("%s: RSV bit is set: val=0x%"PRIx64, __func__, val); + vtd_handle_inv_queue_error(s); return; } s->iq_tail = VTD_IQT_QT(s->iq_dw, val); @@ -3812,6 +3838,87 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, return vtd_dev_as; } +static bool vtd_check_hiod(IntelIOMMUState *s, HostIOMMUDevice *hiod, + Error **errp) +{ + HostIOMMUDeviceClass *hiodc = HOST_IOMMU_DEVICE_GET_CLASS(hiod); + int ret; + + if (!hiodc->get_cap) { + error_setg(errp, ".get_cap() not implemented"); + return false; + } + + /* Common checks */ + ret = hiodc->get_cap(hiod, HOST_IOMMU_DEVICE_CAP_AW_BITS, errp); + if (ret < 0) { + return false; + } + if (s->aw_bits > ret) { + error_setg(errp, "aw-bits %d > host aw-bits %d", s->aw_bits, ret); + return false; + } + + return true; +} + +static bool vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn, + HostIOMMUDevice *hiod, Error **errp) +{ + IntelIOMMUState *s = opaque; + struct vtd_as_key key = { + .bus = bus, + .devfn = devfn, + }; + struct vtd_as_key *new_key; + + assert(hiod); + + vtd_iommu_lock(s); + + if (g_hash_table_lookup(s->vtd_host_iommu_dev, &key)) { + error_setg(errp, "Host IOMMU device already exist"); + 
vtd_iommu_unlock(s); + return false; + } + + if (!vtd_check_hiod(s, hiod, errp)) { + vtd_iommu_unlock(s); + return false; + } + + new_key = g_malloc(sizeof(*new_key)); + new_key->bus = bus; + new_key->devfn = devfn; + + object_ref(hiod); + g_hash_table_insert(s->vtd_host_iommu_dev, new_key, hiod); + + vtd_iommu_unlock(s); + + return true; +} + +static void vtd_dev_unset_iommu_device(PCIBus *bus, void *opaque, int devfn) +{ + IntelIOMMUState *s = opaque; + struct vtd_as_key key = { + .bus = bus, + .devfn = devfn, + }; + + vtd_iommu_lock(s); + + if (!g_hash_table_lookup(s->vtd_host_iommu_dev, &key)) { + vtd_iommu_unlock(s); + return; + } + + g_hash_table_remove(s->vtd_host_iommu_dev, &key); + + vtd_iommu_unlock(s); +} + /* Unmap the whole range in the notifier's scope. */ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n) { @@ -3934,30 +4041,10 @@ static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n) return; } -/* Do the initialization. It will also be called when reset, so pay - * attention when adding new initialization stuff. - */ -static void vtd_init(IntelIOMMUState *s) +static void vtd_cap_init(IntelIOMMUState *s) { X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); - memset(s->csr, 0, DMAR_REG_SIZE); - memset(s->wmask, 0, DMAR_REG_SIZE); - memset(s->w1cmask, 0, DMAR_REG_SIZE); - memset(s->womask, 0, DMAR_REG_SIZE); - - s->root = 0; - s->root_scalable = false; - s->dmar_enabled = false; - s->intr_enabled = false; - s->iq_head = 0; - s->iq_tail = 0; - s->iq = 0; - s->iq_size = 0; - s->qi_enabled = false; - s->iq_last_desc_type = VTD_INV_DESC_NONE; - s->iq_dw = false; - s->next_frcd_reg = 0; s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS | VTD_CAP_MGAW(s->aw_bits); @@ -3974,27 +4061,6 @@ static void vtd_init(IntelIOMMUState *s) } s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO; - /* - * Rsvd field masks for spte - */ - vtd_spte_rsvd[0] = ~0ULL; - vtd_spte_rsvd[1] = VTD_SPTE_PAGE_L1_RSVD_MASK(s->aw_bits, - x86_iommu->dt_supported); - vtd_spte_rsvd[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(s->aw_bits); - vtd_spte_rsvd[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(s->aw_bits); - vtd_spte_rsvd[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(s->aw_bits); - - vtd_spte_rsvd_large[2] = VTD_SPTE_LPAGE_L2_RSVD_MASK(s->aw_bits, - x86_iommu->dt_supported); - vtd_spte_rsvd_large[3] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits, - x86_iommu->dt_supported); - - if (s->scalable_mode || s->snoop_control) { - vtd_spte_rsvd[1] &= ~VTD_SPTE_SNP; - vtd_spte_rsvd_large[2] &= ~VTD_SPTE_SNP; - vtd_spte_rsvd_large[3] &= ~VTD_SPTE_SNP; - } - if (x86_iommu_ir_supported(x86_iommu)) { s->ecap |= VTD_ECAP_IR | VTD_ECAP_MHMV; if (s->intr_eim == ON_OFF_AUTO_ON) { @@ -4027,6 +4093,56 @@ static void vtd_init(IntelIOMMUState *s) if (s->pasid) { s->ecap |= VTD_ECAP_PASID; } +} + +/* + * Do the initialization. It will also be called when reset, so pay + * attention when adding new initialization stuff. 
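+ * The capability and extended capability registers themselves are + * computed in vtd_cap_init() below.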
+ */ +static void vtd_init(IntelIOMMUState *s) +{ + X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); + + memset(s->csr, 0, DMAR_REG_SIZE); + memset(s->wmask, 0, DMAR_REG_SIZE); + memset(s->w1cmask, 0, DMAR_REG_SIZE); + memset(s->womask, 0, DMAR_REG_SIZE); + + s->root = 0; + s->root_scalable = false; + s->dmar_enabled = false; + s->intr_enabled = false; + s->iq_head = 0; + s->iq_tail = 0; + s->iq = 0; + s->iq_size = 0; + s->qi_enabled = false; + s->iq_last_desc_type = VTD_INV_DESC_NONE; + s->iq_dw = false; + s->next_frcd_reg = 0; + + vtd_cap_init(s); + + /* + * Rsvd field masks for spte + */ + vtd_spte_rsvd[0] = ~0ULL; + vtd_spte_rsvd[1] = VTD_SPTE_PAGE_L1_RSVD_MASK(s->aw_bits, + x86_iommu->dt_supported); + vtd_spte_rsvd[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(s->aw_bits); + vtd_spte_rsvd[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(s->aw_bits); + vtd_spte_rsvd[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(s->aw_bits); + + vtd_spte_rsvd_large[2] = VTD_SPTE_LPAGE_L2_RSVD_MASK(s->aw_bits, + x86_iommu->dt_supported); + vtd_spte_rsvd_large[3] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits, + x86_iommu->dt_supported); + + if (s->scalable_mode || s->snoop_control) { + vtd_spte_rsvd[1] &= ~VTD_SPTE_SNP; + vtd_spte_rsvd_large[2] &= ~VTD_SPTE_SNP; + vtd_spte_rsvd_large[3] &= ~VTD_SPTE_SNP; + } vtd_reset_caches(s); @@ -4107,6 +4223,8 @@ static AddressSpace *vtd_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) static PCIIOMMUOps vtd_iommu_ops = { .get_address_space = vtd_host_dma_iommu, + .set_iommu_device = vtd_dev_set_iommu_device, + .unset_iommu_device = vtd_dev_unset_iommu_device, }; static bool vtd_decide_config(IntelIOMMUState *s, Error **errp) @@ -4230,6 +4348,8 @@ static void vtd_realize(DeviceState *dev, Error **errp) g_free, g_free); s->vtd_address_spaces = g_hash_table_new_full(vtd_as_hash, vtd_as_equal, g_free, g_free); + s->vtd_host_iommu_dev = g_hash_table_new_full(vtd_hiod_hash, vtd_hiod_equal, + g_free, vtd_hiod_destroy); vtd_init(s); pci_setup_iommu(bus, &vtd_iommu_ops, dev); /* Pseudo address space under root PCI bus. 
*/ diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 29b9964733ed118f330753c04146bebf4580cddf..204e34db86c53361f217b3b9174d015e3c5aeb08 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -743,6 +743,111 @@ void xen_load_linux(PCMachineState *pcms) x86ms->fw_cfg = fw_cfg; } +static int try_create_2MB_page(uint32_t page_num) +{ + char nr_hp_num_s[256] = {0}; + char free_hp_num_s[256] = {0}; + const char *nr_hugepages_dir = "/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages"; + const char *free_hugepages_dir = "/sys/kernel/mm/hugepages/hugepages-2048kB/free_hugepages"; + int nr_hp_num = -1, free_hp_num = -1, ret = -1; + int nr_fd = qemu_open_old(nr_hugepages_dir, O_RDWR); + int free_fd = qemu_open_old(free_hugepages_dir, O_RDONLY); + + if (nr_fd < 0 || free_fd < 0) { + error_report("%s: qemu_open failed: %s", __func__, strerror(errno)); + goto end; + } + + if (read(nr_fd, nr_hp_num_s, sizeof(nr_hp_num_s) - 1) < 0) + goto end; + if (read(free_fd, free_hp_num_s, sizeof(free_hp_num_s) - 1) < 0) + goto end; + + nr_hp_num = atoi(nr_hp_num_s); + free_hp_num = atoi(free_hp_num_s); + if (nr_hp_num < 0 || free_hp_num < 0) + goto end; + + if (page_num <= free_hp_num) { + ret = 0; + goto end; + } + + nr_hp_num += (page_num - free_hp_num); + snprintf(nr_hp_num_s, sizeof(nr_hp_num_s), "%d", nr_hp_num); + if (write(nr_fd, nr_hp_num_s, strlen(nr_hp_num_s)) < 0) + goto end; + + ret = 0; +end: + if (nr_fd >= 0) + close(nr_fd); + if (free_fd >= 0) + close(free_fd); + return ret; +} + +#define HUGEPAGE_NUM_MAX 128 +#define HUGEPAGE_SIZE (1024*1024*2) +static void mem2_init(MachineState *ms, MemoryRegion *system_memory) +{ + MemoryRegion *mem2_mr; + char mr_name[128] = {0}; + void *ram = NULL; + int ret = 0, lock_fd; + const char *lock_file = "/sys/kernel/mm/hugepages/hugepages-2048kB/nr_overcommit_hugepages"; + uint32_t page_num = ms->ram2_size / HUGEPAGE_SIZE, i; + + if (HUGEPAGE_NUM_MAX < page_num) { + error_report("\"-mem2 'size='\" needs to be less than %dM", + (HUGEPAGE_SIZE * HUGEPAGE_NUM_MAX) / (1024 * 1024)); + exit(EXIT_FAILURE); + } + + /* Request hugepages from the OS and use them; this needs to be synchronized */ + lock_fd = qemu_open_old(lock_file, O_WRONLY); + if (lock_fd < 0) { + error_report("%s: open %s failed: %s", __func__, lock_file, strerror(errno)); + exit(EXIT_FAILURE); + } + + while (qemu_lock_fd(lock_fd, 0, 0, true)) { + if (errno != EACCES && errno != EAGAIN) { + error_report("qemu_lock_fd failed: %s", strerror(errno)); + exit(EXIT_FAILURE); + } + } + + /* Try to create the hugepages. + * If there are enough free hugepages, then do nothing.
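+ * Otherwise, raise nr_hugepages by the shortfall so the mappings below succeed.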
+ */ + ret = try_create_2MB_page(page_num); + if (ret) { + error_report("%s: Failed to allocate hugepages", __func__); + goto unlock; + } + + for (i = 0; i < page_num; ++i) { + mem2_mr = g_malloc(sizeof(*mem2_mr)); + ram = mmap(NULL, HUGEPAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE | MAP_HUGETLB, -1, 0); + if (ram == MAP_FAILED) { + error_report("%s: mmap failed: %s", __func__, strerror(errno)); + goto unlock; + } + + snprintf(mr_name, sizeof(mr_name), "mem2-%u", i); + memory_region_init_ram_ptr(mem2_mr, NULL, mr_name, HUGEPAGE_SIZE, ram); + memory_region_add_subregion(system_memory, ms->ram2_base + (i * HUGEPAGE_SIZE), mem2_mr); + } + + ret = 0; +unlock: + qemu_unlock_fd(lock_fd, 0, 0); + if (ret) + exit(EXIT_FAILURE); +} + #define PC_ROM_MIN_VGA 0xc0000 #define PC_ROM_MIN_OPTION 0xc8000 #define PC_ROM_MAX 0xe0000 @@ -965,6 +1070,22 @@ void pc_memory_init(PCMachineState *pcms, E820_RAM); } + if (machine->ram2_size && machine->ram2_base) { + if (0x100000000ULL + x86ms->above_4g_mem_size > machine->ram2_base) { + error_report("\"-mem2 'base'\" needs to be greater than 0x%llx", + 0x100000000ULL + x86ms->above_4g_mem_size); + exit(EXIT_FAILURE); + } + if (machine->ram2_base & (HUGEPAGE_SIZE - 1) || + machine->ram2_size & (HUGEPAGE_SIZE - 1)) { + error_report("\"-mem2 'base|size'\" needs to be aligned to 0x%x", HUGEPAGE_SIZE); + exit(EXIT_FAILURE); + } + + mem2_init(machine, system_memory); + e820_add_entry(machine->ram2_base, machine->ram2_size, E820_RAM); + } + if (pcms->sgx_epc.size != 0) { e820_add_entry(pcms->sgx_epc.base, pcms->sgx_epc.size, E820_RESERVED); } diff --git a/hw/i386/pc_sysfw.c b/hw/i386/pc_sysfw.c index c8d9e71b889b673bc64d89f3d01a240007610552..7c6a9102501b9146372ac0674cf24e85511ef4b4 100644 --- a/hw/i386/pc_sysfw.c +++ b/hw/i386/pc_sysfw.c @@ -37,6 +37,7 @@ #include "hw/block/flash.h" #include "sysemu/kvm.h" #include "sev.h" +#include "csv.h" #define FLASH_SECTOR_SIZE 4096 @@ -263,7 +264,21 @@ void x86_firmware_configure(void *ptr, int size) error_report("failed to locate and/or save reset vector"); exit(1); } + if (csv3_enabled()) { + ram_addr_t offset = 0; + MemoryRegion *mr; - sev_encrypt_flash(ptr, size, &error_fatal); + if (kvm_csv3_should_set_priv_mem()) + csv3_set_guest_private_memory(&error_fatal); + + mr = memory_region_from_host(ptr, &offset); + if (!mr) { + error_report("failed to get memory region of flash"); + exit(1); + } + csv3_load_data(mr->addr + offset, ptr, size, &error_fatal); + } else { + sev_encrypt_flash(ptr, size, &error_fatal); + } } } diff --git a/hw/ide/ahci.c b/hw/ide/ahci.c index afdc44b8e056dc4e6f999d699b47d65e896f951c..8062e1743c1d6d2cf3b7643f81afffcd5c0fcfcc 100644 --- a/hw/ide/ahci.c +++ b/hw/ide/ahci.c @@ -1519,8 +1519,10 @@ static void ahci_commit_buf(const IDEDMA *dma, uint32_t tx_bytes) { AHCIDevice *ad = DO_UPCAST(AHCIDevice, dma, dma); - tx_bytes += le32_to_cpu(ad->cur_cmd->status); - ad->cur_cmd->status = cpu_to_le32(tx_bytes); + if (ad->cur_cmd) { + tx_bytes += le32_to_cpu(ad->cur_cmd->status); + ad->cur_cmd->status = cpu_to_le32(tx_bytes); + } } static int ahci_dma_rw_buf(const IDEDMA *dma, bool is_write) diff --git a/hw/ide/macio.c b/hw/ide/macio.c index dca1cc9efc1d435a098d28395e652b04561374ec..3d895c07f459faab1d1a2bd1f19b5d51bbc9cf00 100644 --- a/hw/ide/macio.c +++ b/hw/ide/macio.c @@ -119,9 +119,6 @@ static void pmac_ide_atapi_transfer_cb(void *opaque, int ret) return; done: - dma_memory_unmap(&address_space_memory, io->dma_mem, io->dma_len, - io->dir, io->dma_len); - if (ret < 0) {
block_acct_failed(blk_get_stats(s->blk), &s->acct); } else { @@ -202,9 +199,6 @@ static void pmac_ide_transfer_cb(void *opaque, int ret) return; done: - dma_memory_unmap(&address_space_memory, io->dma_mem, io->dma_len, - io->dir, io->dma_len); - if (s->dma_cmd == IDE_DMA_READ || s->dma_cmd == IDE_DMA_WRITE) { if (ret < 0) { block_acct_failed(blk_get_stats(s->blk), &s->acct); } diff --git a/hw/input/ps2.c b/hw/input/ps2.c index c8fd23cf3600f415fe68e9259ed57ebaa09a8863..b647561069023d65349143df7a6e5abd33ae3968 100644 --- a/hw/input/ps2.c +++ b/hw/input/ps2.c @@ -167,7 +167,7 @@ void ps2_queue_noirq(PS2State *s, int b) } q->data[q->wptr] = b; - if (++q->wptr == PS2_BUFFER_SIZE) { + if (++q->wptr >= PS2_BUFFER_SIZE) { q->wptr = 0; } q->count++; @@ -557,7 +557,7 @@ uint32_t ps2_read_data(PS2State *s) val = q->data[index]; } else { val = q->data[q->rptr]; - if (++q->rptr == PS2_BUFFER_SIZE) { + if (++q->rptr >= PS2_BUFFER_SIZE) { q->rptr = 0; } q->count--; diff --git a/hw/intc/Kconfig b/hw/intc/Kconfig index 97d550b06b77d16d08410183af798c50a1535bb0..91c7aa668e3bf908c3f86d242dd0403066eb8dc4 100644 --- a/hw/intc/Kconfig +++ b/hw/intc/Kconfig @@ -93,10 +93,16 @@ config NIOS2_VIC config LOONGARCH_IPI bool +config LOONGARCH_IPI_KVM + bool + config LOONGARCH_PCH_PIC bool select UNIMP +config LOONGARCH_PCH_PIC_KVM + bool + config LOONGARCH_PCH_MSI select MSI_NONBROKEN bool @@ -104,3 +110,6 @@ config LOONGARCH_EXTIOI bool + +config LOONGARCH_EXTIOI_KVM + bool diff --git a/hw/intc/arm_gic.c b/hw/intc/arm_gic.c index 074cf50af25da0321eecf1e42c0d4d9fc86e2a04..f0582f7a49729e81c1f99e9e6393d33c16260ce2 100644 --- a/hw/intc/arm_gic.c +++ b/hw/intc/arm_gic.c @@ -1263,9 +1263,14 @@ static void gic_dist_writeb(void *opaque, hwaddr offset, trace_gic_enable_irq(irq + i); } GIC_DIST_SET_ENABLED(irq + i, cm); - /* If a raised level triggered IRQ enabled then mark - is as pending. */ - if (GIC_DIST_TEST_LEVEL(irq + i, mask) + /* + * If a raised level-triggered IRQ is enabled then mark + * it as pending on 11MPCore. For other GIC revisions we + * handle the "level triggered and line asserted" check + * at the other end in gic_test_pending().
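+ * (There, gic_test_pending() ORs the line level into the pending + * state for level-triggered interrupts.)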
+ */ + if (s->revision == REV_11MPCORE + && GIC_DIST_TEST_LEVEL(irq + i, mask) && !GIC_DIST_TEST_EDGE_TRIGGER(irq + i)) { DPRINTF("Set %d pending mask %x\n", irq + i, mask); GIC_DIST_SET_PENDING(irq + i, mask); @@ -1658,7 +1663,7 @@ static MemTxResult gic_cpu_read(GICState *s, int cpu, int offset, *data = s->h_apr[gic_get_vcpu_real_id(cpu)]; } else if (gic_cpu_ns_access(s, cpu, attrs)) { /* NS view of GICC_APR is the top half of GIC_NSAPR */ - *data = gic_apr_ns_view(s, regno, cpu); + *data = gic_apr_ns_view(s, cpu, regno); } else { *data = s->apr[regno][cpu]; } @@ -1746,7 +1751,7 @@ static MemTxResult gic_cpu_write(GICState *s, int cpu, int offset, s->h_apr[gic_get_vcpu_real_id(cpu)] = value; } else if (gic_cpu_ns_access(s, cpu, attrs)) { /* NS view of GICC_APR is the top half of GIC_NSAPR */ - gic_apr_write_ns_view(s, regno, cpu, value); + gic_apr_write_ns_view(s, cpu, regno, value); } else { s->apr[regno][cpu] = value; } diff --git a/hw/intc/arm_gicv3.c b/hw/intc/arm_gicv3.c index 0b8f79a122765f31c28cbd51464b1e407b5ac90a..e1c7c8c4bc804598426c2645b5e2175a604a02f4 100644 --- a/hw/intc/arm_gicv3.c +++ b/hw/intc/arm_gicv3.c @@ -410,6 +410,7 @@ static void arm_gicv3_class_init(ObjectClass *klass, void *data) ARMGICv3Class *agc = ARM_GICV3_CLASS(klass); agcc->post_load = arm_gicv3_post_load; + agcc->init_cpu_reginfo = gicv3_init_cpu_reginfo; device_class_set_parent_realize(dc, arm_gic_realize, &agc->parent_realize); } diff --git a/hw/intc/arm_gicv3_common.c b/hw/intc/arm_gicv3_common.c index 2ebf880eada850e7880aa478ec0b44bafa5fe9ef..5667d9f40b453ce9cd4956260bea37e0b860205f 100644 --- a/hw/intc/arm_gicv3_common.c +++ b/hw/intc/arm_gicv3_common.c @@ -25,6 +25,7 @@ #include "qapi/error.h" #include "qemu/module.h" #include "qemu/error-report.h" +#include "hw/boards.h" #include "hw/core/cpu.h" #include "hw/intc/arm_gicv3_common.h" #include "hw/qdev-properties.h" @@ -33,7 +34,6 @@ #include "hw/arm/linux-boot-if.h" #include "sysemu/kvm.h" - static void gicv3_gicd_no_migration_shift_bug_post_load(GICv3State *cs) { if (cs->gicd_no_migration_shift_bug) { @@ -322,6 +322,61 @@ void gicv3_init_irqs_and_mmio(GICv3State *s, qemu_irq_handler handler, } } +static int arm_gicv3_get_proc_num(GICv3State *s, CPUState *cpu) +{ + uint64_t mp_affinity; + uint64_t gicr_typer; + uint64_t cpu_affid; + int i; + + mp_affinity = object_property_get_uint(OBJECT(cpu), "mp-affinity", NULL); + /* match the cpu mp-affinity to get the gic cpuif number */ + for (i = 0; i < s->num_cpu; i++) { + gicr_typer = s->cpu[i].gicr_typer; + cpu_affid = (gicr_typer >> 32) & 0xFFFFFF; + if (cpu_affid == mp_affinity) { + return i; + } + } + + return -1; +} + +static void arm_gicv3_cpu_update_notifier(Notifier *notifier, void * data) +{ + GICv3CPUHotplugInfo *gic_info = (GICv3CPUHotplugInfo *)data; + CPUState *cpu = gic_info->cpu; + ARMGICv3CommonClass *c; + int gic_cpuif_num; + GICv3State *s; + + s = ARM_GICV3_COMMON(gic_info->gic); + c = ARM_GICV3_COMMON_GET_CLASS(s); + + /* this shall get us mapped gicv3 cpuif corresponding to mpidr */ + gic_cpuif_num = arm_gicv3_get_proc_num(s, cpu); + if (gic_cpuif_num < 0) { + error_report("Failed to associate cpu %d with any GIC cpuif", + cpu->cpu_index); + abort(); + } + + /* check if update is for vcpu hot-unplug */ + if (qemu_enabled_cpu(cpu)) { + s->cpu[gic_cpuif_num].cpu = NULL; + return; + } + + /* re-stitch the gic cpuif to this new cpu */ + gicv3_set_gicv3state(cpu, &s->cpu[gic_cpuif_num]); + gicv3_set_cpustate(&s->cpu[gic_cpuif_num], cpu); + + /* initialize the registers info for this newly 
added cpu */ + if (c->init_cpu_reginfo) { + c->init_cpu_reginfo(cpu); + } +} + static void arm_gicv3_common_realize(DeviceState *dev, Error **errp) { GICv3State *s = ARM_GICV3_COMMON(dev); @@ -392,10 +447,13 @@ static void arm_gicv3_common_realize(DeviceState *dev, Error **errp) s->cpu = g_new0(GICv3CPUState, s->num_cpu); for (i = 0; i < s->num_cpu; i++) { - CPUState *cpu = qemu_get_cpu(i); + CPUState *cpu = qemu_get_possible_cpu(i) ? : qemu_get_cpu(i); uint64_t cpu_affid; - s->cpu[i].cpu = cpu; + if (qemu_enabled_cpu(cpu)) { + s->cpu[i].cpu = cpu; + } + s->cpu[i].gic = s; /* Store GICv3CPUState in CPUARMState gicv3state pointer */ gicv3_set_gicv3state(cpu, &s->cpu[i]); @@ -441,13 +499,20 @@ static void arm_gicv3_common_realize(DeviceState *dev, Error **errp) s->cpu[cpuidx - 1].gicr_typer |= GICR_TYPER_LAST; } + s->cpu_update_notifier.notify = arm_gicv3_cpu_update_notifier; + s->itslist = g_ptr_array_new(); } static void arm_gicv3_finalize(Object *obj) { GICv3State *s = ARM_GICV3_COMMON(obj); + Object *ms = qdev_get_machine(); + MachineClass *mc = MACHINE_GET_CLASS(ms); + if (mc->has_hotpluggable_cpus) { + notifier_remove(&s->cpu_update_notifier); + } g_free(s->redist_region_count); } diff --git a/hw/intc/arm_gicv3_cpuif.c b/hw/intc/arm_gicv3_cpuif.c index ab1a00508e6f178c6059c8644492a4b455febd87..a0135100744115fe3326dc965c662708f43799ad 100644 --- a/hw/intc/arm_gicv3_cpuif.c +++ b/hw/intc/arm_gicv3_cpuif.c @@ -934,6 +934,10 @@ void gicv3_cpuif_update(GICv3CPUState *cs) ARMCPU *cpu = ARM_CPU(cs->cpu); CPUARMState *env = &cpu->env; + if (!qemu_enabled_cpu(cs->cpu)) { + return; + } + g_assert(qemu_mutex_iothread_locked()); trace_gicv3_cpuif_update(gicv3_redist_affid(cs), cs->hppi.irq, @@ -1826,6 +1830,10 @@ static void icc_generate_sgi(CPUARMState *env, GICv3CPUState *cs, for (i = 0; i < s->num_cpu; i++) { GICv3CPUState *ocs = &s->cpu[i]; + if (!qemu_enabled_cpu(ocs->cpu)) { + continue; + } + if (irm) { /* IRM == 1 : route to all CPUs except self */ if (cs == ocs) { @@ -2774,6 +2782,127 @@ static const ARMCPRegInfo gicv3_cpuif_ich_apxr23_reginfo[] = { }, }; +void gicv3_init_cpu_reginfo(CPUState *cs) +{ + ARMCPU *cpu = ARM_CPU(cs); + GICv3CPUState *gcs = icc_cs_from_env(&cpu->env); + + /* + * If the CPU doesn't define a GICv3 configuration, probably because + * in real hardware it doesn't have one, then we use default values + * matching the one used by most Arm CPUs. This applies to: + * cpu->gic_num_lrs + * cpu->gic_vpribits + * cpu->gic_vprebits + * cpu->gic_pribits + */ + + /* + * Note that we can't just use the GICv3CPUState as an opaque pointer + * in define_arm_cp_regs_with_opaque(), because when we're called back + * it might be with code translated by CPU 0 but run by CPU 1, in + * which case we'd get the wrong value. + * So instead we define the regs with no ri->opaque info, and + * get back to the GICv3CPUState from the CPUARMState. + */ + define_arm_cp_regs(cpu, gicv3_cpuif_reginfo); + + /* + * The CPU implementation specifies the number of supported + * bits of physical priority. For backwards compatibility + * of migration, we have a compat property that forces use + * of 8 priority bits regardless of what the CPU really has. + */ + if (gcs->gic->force_8bit_prio) { + gcs->pribits = 8; + } else { + gcs->pribits = cpu->gic_pribits ?: 5; + } + + /* + * The GICv3 has separate ID register fields for virtual priority + * and preemption bit values, but only a single ID register field + * for the physical priority bits. 
The preemption bit count is + * always the same as the priority bit count, except that 8 bits + * of priority means 7 preemption bits. We precalculate the + * preemption bits because it simplifies the code and makes the + * parallels between the virtual and physical bits of the GIC + * a bit clearer. + */ + gcs->prebits = gcs->pribits; + if (gcs->prebits == 8) { + gcs->prebits--; + } + /* + * Check that CPU code defining pribits didn't violate + * architectural constraints our implementation relies on. + */ + g_assert(gcs->pribits >= 4 && gcs->pribits <= 8); + + /* + * gicv3_cpuif_reginfo[] defines ICC_AP*R0_EL1; add definitions + * for ICC_AP*R{1,2,3}_EL1 if the prebits value requires them. + */ + if (gcs->prebits >= 6) { + define_arm_cp_regs(cpu, gicv3_cpuif_icc_apxr1_reginfo); + } + if (gcs->prebits == 7) { + define_arm_cp_regs(cpu, gicv3_cpuif_icc_apxr23_reginfo); + } + + if (arm_feature(&cpu->env, ARM_FEATURE_EL2)) { + int j; + + gcs->num_list_regs = cpu->gic_num_lrs ?: 4; + gcs->vpribits = cpu->gic_vpribits ?: 5; + gcs->vprebits = cpu->gic_vprebits ?: 5; + + /* + * Check against architectural constraints: getting these + * wrong would be a bug in the CPU code defining these, + * and the implementation relies on them holding. + */ + g_assert(gcs->vprebits <= gcs->vpribits); + g_assert(gcs->vprebits >= 5 && gcs->vprebits <= 7); + g_assert(gcs->vpribits >= 5 && gcs->vpribits <= 8); + + define_arm_cp_regs(cpu, gicv3_cpuif_hcr_reginfo); + + for (j = 0; j < gcs->num_list_regs; j++) { + /* + * Note that the AArch64 LRs are 64-bit; the AArch32 LRs + * are split into two cp15 regs, LR (the low part, with the + * same encoding as the AArch64 LR) and LRC (the high part). + */ + ARMCPRegInfo lr_regset[] = { + { .name = "ICH_LRn_EL2", .state = ARM_CP_STATE_BOTH, + .opc0 = 3, .opc1 = 4, .crn = 12, + .crm = 12 + (j >> 3), .opc2 = j & 7, + .type = ARM_CP_IO | ARM_CP_NO_RAW, + .access = PL2_RW, + .readfn = ich_lr_read, + .writefn = ich_lr_write, + }, + { .name = "ICH_LRCn_EL2", .state = ARM_CP_STATE_AA32, + .cp = 15, .opc1 = 4, .crn = 12, + .crm = 14 + (j >> 3), .opc2 = j & 7, + .type = ARM_CP_IO | ARM_CP_NO_RAW, + .access = PL2_RW, + .readfn = ich_lr_read, + .writefn = ich_lr_write, + }, + }; + define_arm_cp_regs(cpu, lr_regset); + } + if (gcs->vprebits >= 6) { + define_arm_cp_regs(cpu, gicv3_cpuif_ich_apxr1_reginfo); + } + if (gcs->vprebits == 7) { + define_arm_cp_regs(cpu, gicv3_cpuif_ich_apxr23_reginfo); + } + } +} + static void gicv3_cpuif_el_change_hook(ARMCPU *cpu, void *opaque) { GICv3CPUState *cs = opaque; @@ -2796,131 +2925,23 @@ void gicv3_init_cpuif(GICv3State *s) for (i = 0; i < s->num_cpu; i++) { ARMCPU *cpu = ARM_CPU(qemu_get_cpu(i)); - GICv3CPUState *cs = &s->cpu[i]; - - /* - * If the CPU doesn't define a GICv3 configuration, probably because - * in real hardware it doesn't have one, then we use default values - * matching the one used by most Arm CPUs. This applies to: - * cpu->gic_num_lrs - * cpu->gic_vpribits - * cpu->gic_vprebits - * cpu->gic_pribits - */ - - /* Note that we can't just use the GICv3CPUState as an opaque pointer - * in define_arm_cp_regs_with_opaque(), because when we're called back - * it might be with code translated by CPU 0 but run by CPU 1, in - * which case we'd get the wrong value. - * So instead we define the regs with no ri->opaque info, and - * get back to the GICv3CPUState from the CPUARMState. - * - * These CP regs callbacks can be called from either TCG or HVF code. 
- */ - define_arm_cp_regs(cpu, gicv3_cpuif_reginfo); - - /* - * The CPU implementation specifies the number of supported - * bits of physical priority. For backwards compatibility - * of migration, we have a compat property that forces use - * of 8 priority bits regardless of what the CPU really has. - */ - if (s->force_8bit_prio) { - cs->pribits = 8; - } else { - cs->pribits = cpu->gic_pribits ?: 5; - } - - /* - * The GICv3 has separate ID register fields for virtual priority - * and preemption bit values, but only a single ID register field - * for the physical priority bits. The preemption bit count is - * always the same as the priority bit count, except that 8 bits - * of priority means 7 preemption bits. We precalculate the - * preemption bits because it simplifies the code and makes the - * parallels between the virtual and physical bits of the GIC - * a bit clearer. - */ - cs->prebits = cs->pribits; - if (cs->prebits == 8) { - cs->prebits--; - } - /* - * Check that CPU code defining pribits didn't violate - * architectural constraints our implementation relies on. - */ - g_assert(cs->pribits >= 4 && cs->pribits <= 8); - - /* - * gicv3_cpuif_reginfo[] defines ICC_AP*R0_EL1; add definitions - * for ICC_AP*R{1,2,3}_EL1 if the prebits value requires them. - */ - if (cs->prebits >= 6) { - define_arm_cp_regs(cpu, gicv3_cpuif_icc_apxr1_reginfo); - } - if (cs->prebits == 7) { - define_arm_cp_regs(cpu, gicv3_cpuif_icc_apxr23_reginfo); - } - - if (arm_feature(&cpu->env, ARM_FEATURE_EL2)) { - int j; - cs->num_list_regs = cpu->gic_num_lrs ?: 4; - cs->vpribits = cpu->gic_vpribits ?: 5; - cs->vprebits = cpu->gic_vprebits ?: 5; - - /* Check against architectural constraints: getting these - * wrong would be a bug in the CPU code defining these, - * and the implementation relies on them holding. - */ - g_assert(cs->vprebits <= cs->vpribits); - g_assert(cs->vprebits >= 5 && cs->vprebits <= 7); - g_assert(cs->vpribits >= 5 && cs->vpribits <= 8); - - define_arm_cp_regs(cpu, gicv3_cpuif_hcr_reginfo); - - for (j = 0; j < cs->num_list_regs; j++) { - /* Note that the AArch64 LRs are 64-bit; the AArch32 LRs - * are split into two cp15 regs, LR (the low part, with the - * same encoding as the AArch64 LR) and LRC (the high part). + if (qemu_enabled_cpu(CPU(cpu))) { + GICv3CPUState *cs = icc_cs_from_env(&cpu->env); + gicv3_init_cpu_reginfo(CPU(cpu)); + if (tcg_enabled() || qtest_enabled()) { + /* + * We can only trap EL changes with TCG. However the GIC + * interrupt state only changes on EL changes involving EL2 or + * EL3, so for the non-TCG case this is OK, as EL2 and EL3 can't + * exist. 
*/ - ARMCPRegInfo lr_regset[] = { - { .name = "ICH_LRn_EL2", .state = ARM_CP_STATE_BOTH, - .opc0 = 3, .opc1 = 4, .crn = 12, - .crm = 12 + (j >> 3), .opc2 = j & 7, - .type = ARM_CP_IO | ARM_CP_NO_RAW, - .access = PL2_RW, - .readfn = ich_lr_read, - .writefn = ich_lr_write, - }, - { .name = "ICH_LRCn_EL2", .state = ARM_CP_STATE_AA32, - .cp = 15, .opc1 = 4, .crn = 12, - .crm = 14 + (j >> 3), .opc2 = j & 7, - .type = ARM_CP_IO | ARM_CP_NO_RAW, - .access = PL2_RW, - .readfn = ich_lr_read, - .writefn = ich_lr_write, - }, - }; - define_arm_cp_regs(cpu, lr_regset); - } - if (cs->vprebits >= 6) { - define_arm_cp_regs(cpu, gicv3_cpuif_ich_apxr1_reginfo); - } - if (cs->vprebits == 7) { - define_arm_cp_regs(cpu, gicv3_cpuif_ich_apxr23_reginfo); + arm_register_el_change_hook(cpu, gicv3_cpuif_el_change_hook, + cs); + } else { + assert(!arm_feature(&cpu->env, ARM_FEATURE_EL2)); + assert(!arm_feature(&cpu->env, ARM_FEATURE_EL3)); } } - if (tcg_enabled() || qtest_enabled()) { - /* - * We can only trap EL changes with TCG. However the GIC interrupt - * state only changes on EL changes involving EL2 or EL3, so for - * the non-TCG case this is OK, as EL2 and EL3 can't exist. - */ - arm_register_el_change_hook(cpu, gicv3_cpuif_el_change_hook, cs); - } else { - assert(!arm_feature(&cpu->env, ARM_FEATURE_EL2)); - assert(!arm_feature(&cpu->env, ARM_FEATURE_EL3)); - } } } diff --git a/hw/intc/arm_gicv3_cpuif_common.c b/hw/intc/arm_gicv3_cpuif_common.c index ff1239f65db946da62754436fbf67fbd50ce378c..381cf2754b7e4095d547aefc716c7d2937d49369 100644 --- a/hw/intc/arm_gicv3_cpuif_common.c +++ b/hw/intc/arm_gicv3_cpuif_common.c @@ -20,3 +20,8 @@ void gicv3_set_gicv3state(CPUState *cpu, GICv3CPUState *s) env->gicv3state = (void *)s; }; + +void gicv3_set_cpustate(GICv3CPUState *s, CPUState *cpu) +{ + s->cpu = cpu; +} diff --git a/hw/intc/arm_gicv3_kvm.c b/hw/intc/arm_gicv3_kvm.c index 77eb37e13173f8dc0e71614ac0c511fe4f23f97b..dd2a60fa20d2341568a9fcdfd9da3114b9ed73cc 100644 --- a/hw/intc/arm_gicv3_kvm.c +++ b/hw/intc/arm_gicv3_kvm.c @@ -24,6 +24,7 @@ #include "hw/intc/arm_gicv3_common.h" #include "qemu/error-report.h" #include "qemu/module.h" +#include "sysemu/cpus.h" #include "sysemu/kvm.h" #include "sysemu/runstate.h" #include "kvm_arm.h" @@ -458,6 +459,18 @@ static void kvm_arm_gicv3_put(GICv3State *s) GICv3CPUState *c = &s->cpu[ncpu]; int num_pri_bits; + /* + * To support hotplug of vcpus we need to make sure all gic cpuif/GICC + * are initialized at machvirt init time. Once the init is done we + * release the ARMCPU object for disabled vcpus, but this path can also + * be hit later, during reset of the GICC, i.e. after init has happened; + * in all of these cases we want to make sure we don't access the GICC + * for the disabled vCPUs. + */ + if (!qemu_enabled_cpu(c->cpu)) { + continue; + } + kvm_gicc_access(s, ICC_SRE_EL1, ncpu, &c->icc_sre_el1, true); kvm_gicc_access(s, ICC_CTLR_EL1, ncpu, &c->icc_ctlr_el1[GICV3_NS], true); @@ -616,6 +629,11 @@ static void kvm_arm_gicv3_get(GICv3State *s) GICv3CPUState *c = &s->cpu[ncpu]; int num_pri_bits; + /* don't access GICC for the disabled vCPUs. */ + if (!qemu_enabled_cpu(c->cpu)) { + continue; + } + kvm_gicc_access(s, ICC_SRE_EL1, ncpu, &c->icc_sre_el1, false); kvm_gicc_access(s, ICC_CTLR_EL1, ncpu, &c->icc_ctlr_el1[GICV3_NS], false); @@ -695,10 +713,19 @@ static void arm_gicv3_icc_reset(CPUARMState *env, const ARMCPRegInfo *ri) return; } + /* + * This shall be called even when a vcpu is being hotplugged or onlined and + * other vcpus might be running.
Host kernel KVM code to handle device + * access of IOCTLs KVM_{GET|SET}_DEVICE_ATTR might fail due to inability to + * grab vcpu locks for all the vcpus. Hence, we need to pause all vcpus to + * facilitate locking within host. + */ + pause_all_vcpus(); /* Initialize to actual HW supported configuration */ kvm_device_access(s->dev_fd, KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS, KVM_VGIC_ATTR(ICC_CTLR_EL1, c->gicr_typer), &c->icc_ctlr_el1[GICV3_NS], false, &error_abort); + resume_all_vcpus(); c->icc_ctlr_el1[GICV3_S] = c->icc_ctlr_el1[GICV3_NS]; } @@ -777,6 +804,10 @@ static void vm_change_state_handler(void *opaque, bool running, } } +static void kvm_gicv3_init_cpu_reginfo(CPUState *cs) +{ + define_arm_cp_regs(ARM_CPU(cs), gicv3_cpuif_reginfo); +} static void kvm_arm_gicv3_realize(DeviceState *dev, Error **errp) { @@ -808,9 +839,10 @@ static void kvm_arm_gicv3_realize(DeviceState *dev, Error **errp) gicv3_init_irqs_and_mmio(s, kvm_arm_gicv3_set_irq, NULL); for (i = 0; i < s->num_cpu; i++) { - ARMCPU *cpu = ARM_CPU(qemu_get_cpu(i)); - - define_arm_cp_regs(cpu, gicv3_cpuif_reginfo); + CPUState *cs = qemu_get_cpu(i); + if (qemu_enabled_cpu(cs)) { + kvm_gicv3_init_cpu_reginfo(cs); + } } /* Try to create the device via the device control API */ @@ -897,6 +929,7 @@ static void kvm_arm_gicv3_class_init(ObjectClass *klass, void *data) agcc->pre_save = kvm_arm_gicv3_get; agcc->post_load = kvm_arm_gicv3_put; + agcc->init_cpu_reginfo = kvm_gicv3_init_cpu_reginfo; device_class_set_parent_realize(dc, kvm_arm_gicv3_realize, &kgc->parent_realize); resettable_class_set_parent_phases(rc, NULL, kvm_arm_gicv3_reset_hold, NULL, diff --git a/hw/intc/gicv3_internal.h b/hw/intc/gicv3_internal.h index 29d5cdc1b6984365320c4c1b22eb16358950d01f..0bed0f6e2af2a566452094ba9028fcc1cb859d1c 100644 --- a/hw/intc/gicv3_internal.h +++ b/hw/intc/gicv3_internal.h @@ -709,6 +709,7 @@ void gicv3_redist_vinvall(GICv3CPUState *cs, uint64_t vptaddr); void gicv3_redist_send_sgi(GICv3CPUState *cs, int grp, int irq, bool ns); void gicv3_init_cpuif(GICv3State *s); +void gicv3_init_cpu_reginfo(CPUState *cs); /** * gicv3_cpuif_update: @@ -848,5 +849,6 @@ static inline void gicv3_cache_all_target_cpustates(GICv3State *s) } void gicv3_set_gicv3state(CPUState *cpu, GICv3CPUState *s); +void gicv3_set_cpustate(GICv3CPUState *s, CPUState *cpu); #endif /* QEMU_ARM_GICV3_INTERNAL_H */ diff --git a/hw/intc/ioapic.c b/hw/intc/ioapic.c index 716ffc8bbbda1db4cfba8ecbdbce04ffc8455564..0b43aec8fa97504f7c8dc0b9450ffa1a0b7388d8 100644 --- a/hw/intc/ioapic.c +++ b/hw/intc/ioapic.c @@ -195,6 +195,7 @@ static void ioapic_update_kvm_routes(IOAPICCommonState *s) int i; if (kvm_irqchip_is_split()) { + KVMRouteChange c = kvm_irqchip_begin_route_changes(kvm_state); for (i = 0; i < IOAPIC_NUM_PINS; i++) { MSIMessage msg; struct ioapic_entry_info info; @@ -202,10 +203,10 @@ static void ioapic_update_kvm_routes(IOAPICCommonState *s) if (!info.masked) { msg.address = info.addr; msg.data = info.data; - kvm_irqchip_update_msi_route(kvm_state, i, msg, NULL); + kvm_irqchip_update_msi_route(&c, i, msg, NULL); } } - kvm_irqchip_commit_routes(kvm_state); + kvm_irqchip_commit_route_changes(&c); } #endif } diff --git a/hw/intc/loongarch_extioi.c b/hw/intc/loongarch_extioi.c index 24fb3af8cc31fceda1169b9e26f549ac6b299a46..fa23e247ca331434e6dd653cca4399dda5b1fd36 100644 --- a/hw/intc/loongarch_extioi.c +++ b/hw/intc/loongarch_extioi.c @@ -8,6 +8,7 @@ #include "qemu/osdep.h" #include "qemu/module.h" #include "qemu/log.h" +#include "qapi/error.h" #include "hw/irq.h" #include 
"hw/sysbus.h" #include "hw/loongarch/virt.h" @@ -32,23 +33,23 @@ static void extioi_update_irq(LoongArchExtIOI *s, int irq, int level) if (((s->enable[irq_index]) & irq_mask) == 0) { return; } - s->coreisr[cpu][irq_index] |= irq_mask; - found = find_first_bit(s->sw_isr[cpu][ipnum], EXTIOI_IRQS); - set_bit(irq, s->sw_isr[cpu][ipnum]); + s->cpu[cpu].coreisr[irq_index] |= irq_mask; + found = find_first_bit(s->cpu[cpu].sw_isr[ipnum], EXTIOI_IRQS); + set_bit(irq, s->cpu[cpu].sw_isr[ipnum]); if (found < EXTIOI_IRQS) { /* other irq is handling, need not update parent irq level */ return; } } else { - s->coreisr[cpu][irq_index] &= ~irq_mask; - clear_bit(irq, s->sw_isr[cpu][ipnum]); - found = find_first_bit(s->sw_isr[cpu][ipnum], EXTIOI_IRQS); + s->cpu[cpu].coreisr[irq_index] &= ~irq_mask; + clear_bit(irq, s->cpu[cpu].sw_isr[ipnum]); + found = find_first_bit(s->cpu[cpu].sw_isr[ipnum], EXTIOI_IRQS); if (found < EXTIOI_IRQS) { /* other irq is handling, need not update parent irq level */ return; } } - qemu_set_irq(s->parent_irq[cpu][ipnum], level); + qemu_set_irq(s->cpu[cpu].parent_irq[ipnum], level); } static void extioi_setirq(void *opaque, int irq, int level) @@ -96,7 +97,7 @@ static MemTxResult extioi_readw(void *opaque, hwaddr addr, uint64_t *data, index = (offset - EXTIOI_COREISR_START) >> 2; /* using attrs to get current cpu index */ cpu = attrs.requester_id; - *data = s->coreisr[cpu][index]; + *data = s->cpu[cpu].coreisr[index]; break; case EXTIOI_COREMAP_START ... EXTIOI_COREMAP_END - 1: index = (offset - EXTIOI_COREMAP_START) >> 2; @@ -129,12 +130,68 @@ static inline void extioi_enable_irq(LoongArchExtIOI *s, int index,\ } } +static inline void extioi_update_sw_coremap(LoongArchExtIOI *s, int irq, + uint64_t val, bool notify) +{ + int i, cpu; + + /* + * loongarch only support little endian, + * so we paresd the value with little endian. + */ + val = cpu_to_le64(val); + + for (i = 0; i < 4; i++) { + cpu = val & 0xff; + if (!(s->status & BIT(EXTIOI_ENABLE_CPU_ENCODE))) { + cpu = ctz32(cpu); + cpu = (cpu >= 4) ? 0 : cpu; + } + val = val >> 8; + + if (s->sw_coremap[irq + i] == cpu) { + continue; + } + + if (notify && test_bit(irq + i, (unsigned long *)s->isr)) { + /* + * lower irq at old cpu and raise irq at new cpu + */ + extioi_update_irq(s, irq + i, 0); + s->sw_coremap[irq + i] = cpu; + extioi_update_irq(s, irq + i, 1); + } else { + s->sw_coremap[irq + i] = cpu; + } + } +} + +static inline void extioi_update_sw_ipmap(LoongArchExtIOI *s, int index, + uint64_t val) +{ + int i; + uint8_t ipnum; + + /* + * loongarch only support little endian, + * so we paresd the value with little endian. + */ + val = cpu_to_le64(val); + for (i = 0; i < 4; i++) { + ipnum = val & 0xff; + ipnum = ctz32(ipnum); + ipnum = (ipnum >= 4) ? 0 : ipnum; + s->sw_ipmap[index * 4 + i] = ipnum; + val = val >> 8; + } +} + static MemTxResult extioi_writew(void *opaque, hwaddr addr, uint64_t val, unsigned size, MemTxAttrs attrs) { LoongArchExtIOI *s = LOONGARCH_EXTIOI(opaque); - int i, cpu, index, old_data, irq; + int cpu, index, old_data, irq; uint32_t offset; trace_loongarch_extioi_writew(addr, val); @@ -152,20 +209,7 @@ static MemTxResult extioi_writew(void *opaque, hwaddr addr, */ index = (offset - EXTIOI_IPMAP_START) >> 2; s->ipmap[index] = val; - /* - * loongarch only support little endian, - * so we paresd the value with little endian. - */ - val = cpu_to_le64(val); - for (i = 0; i < 4; i++) { - uint8_t ipnum; - ipnum = val & 0xff; - ipnum = ctz32(ipnum); - ipnum = (ipnum >= 4) ? 
0 : ipnum; - s->sw_ipmap[index * 4 + i] = ipnum; - val = val >> 8; - } - + extioi_update_sw_ipmap(s, index, val); break; case EXTIOI_ENABLE_START ... EXTIOI_ENABLE_END - 1: index = (offset - EXTIOI_ENABLE_START) >> 2; @@ -189,8 +233,8 @@ static MemTxResult extioi_writew(void *opaque, hwaddr addr, index = (offset - EXTIOI_COREISR_START) >> 2; /* using attrs to get current cpu index */ cpu = attrs.requester_id; - old_data = s->coreisr[cpu][index]; - s->coreisr[cpu][index] = old_data & ~val; + old_data = s->cpu[cpu].coreisr[index]; + s->cpu[cpu].coreisr[index] = old_data & ~val; /* write 1 to clear interrupt */ old_data &= val; irq = ctz32(old_data); @@ -204,33 +248,8 @@ static MemTxResult extioi_writew(void *opaque, hwaddr addr, irq = offset - EXTIOI_COREMAP_START; index = irq / 4; s->coremap[index] = val; - /* - * loongarch only support little endian, - * so we paresd the value with little endian. - */ - val = cpu_to_le64(val); - for (i = 0; i < 4; i++) { - cpu = val & 0xff; - cpu = ctz32(cpu); - cpu = (cpu >= 4) ? 0 : cpu; - val = val >> 8; - - if (s->sw_coremap[irq + i] == cpu) { - continue; - } - - if (test_bit(irq, (unsigned long *)s->isr)) { - /* - * lower irq at old cpu and raise irq at new cpu - */ - extioi_update_irq(s, irq + i, 0); - s->sw_coremap[irq + i] = cpu; - extioi_update_irq(s, irq + i, 1); - } else { - s->sw_coremap[irq + i] = cpu; - } - } + extioi_update_sw_coremap(s, irq, val, true); break; default: break; @@ -248,65 +267,192 @@ static const MemoryRegionOps extioi_ops = { .endianness = DEVICE_LITTLE_ENDIAN, }; -static const VMStateDescription vmstate_loongarch_extioi = { - .name = TYPE_LOONGARCH_EXTIOI, - .version_id = 1, - .minimum_version_id = 1, - .fields = (VMStateField[]) { - VMSTATE_UINT32_ARRAY(bounce, LoongArchExtIOI, EXTIOI_IRQS_GROUP_COUNT), - VMSTATE_UINT32_2DARRAY(coreisr, LoongArchExtIOI, EXTIOI_CPUS, - EXTIOI_IRQS_GROUP_COUNT), - VMSTATE_UINT32_ARRAY(nodetype, LoongArchExtIOI, - EXTIOI_IRQS_NODETYPE_COUNT / 2), - VMSTATE_UINT32_ARRAY(enable, LoongArchExtIOI, EXTIOI_IRQS / 32), - VMSTATE_UINT32_ARRAY(isr, LoongArchExtIOI, EXTIOI_IRQS / 32), - VMSTATE_UINT32_ARRAY(ipmap, LoongArchExtIOI, EXTIOI_IRQS_IPMAP_SIZE / 4), - VMSTATE_UINT32_ARRAY(coremap, LoongArchExtIOI, EXTIOI_IRQS / 4), - VMSTATE_UINT8_ARRAY(sw_ipmap, LoongArchExtIOI, EXTIOI_IRQS_IPMAP_SIZE), - VMSTATE_UINT8_ARRAY(sw_coremap, LoongArchExtIOI, EXTIOI_IRQS), +static MemTxResult extioi_virt_readw(void *opaque, hwaddr addr, uint64_t *data, + unsigned size, MemTxAttrs attrs) +{ + LoongArchExtIOI *s = LOONGARCH_EXTIOI(opaque); - VMSTATE_END_OF_LIST() + switch (addr) { + case EXTIOI_VIRT_FEATURES: + *data = s->features; + break; + case EXTIOI_VIRT_CONFIG: + *data = s->status; + break; + default: + break; + } + + return MEMTX_OK; +} + +static MemTxResult extioi_virt_writew(void *opaque, hwaddr addr, + uint64_t val, unsigned size, + MemTxAttrs attrs) +{ + LoongArchExtIOI *s = LOONGARCH_EXTIOI(opaque); + + switch (addr) { + case EXTIOI_VIRT_FEATURES: + return MEMTX_ACCESS_ERROR; + + case EXTIOI_VIRT_CONFIG: + /* + * extioi features can only be set while the extioi is disabled + */ + if ((s->status & BIT(EXTIOI_ENABLE)) && val) { + return MEMTX_ACCESS_ERROR; + } + + s->status = val & s->features; + break; + default: + break; } + return MEMTX_OK; +} + +static const MemoryRegionOps extioi_virt_ops = { + .read_with_attrs = extioi_virt_readw, + .write_with_attrs = extioi_virt_writew, + .impl.min_access_size = 4, + .impl.max_access_size = 4, + .valid.min_access_size = 4, + .valid.max_access_size = 8, + .endianness =
DEVICE_LITTLE_ENDIAN, }; -static void loongarch_extioi_instance_init(Object *obj) +static void loongarch_extioi_realize(DeviceState *dev, Error **errp) { - SysBusDevice *dev = SYS_BUS_DEVICE(obj); - LoongArchExtIOI *s = LOONGARCH_EXTIOI(obj); - int i, cpu, pin; + LoongArchExtIOI *s = LOONGARCH_EXTIOI(dev); + SysBusDevice *sbd = SYS_BUS_DEVICE(dev); + int i, pin; + + if (s->num_cpu == 0) { + error_setg(errp, "num-cpu must be at least 1"); + return; + } for (i = 0; i < EXTIOI_IRQS; i++) { - sysbus_init_irq(dev, &s->irq[i]); + sysbus_init_irq(sbd, &s->irq[i]); } - qdev_init_gpio_in(DEVICE(obj), extioi_setirq, EXTIOI_IRQS); + qdev_init_gpio_in(dev, extioi_setirq, EXTIOI_IRQS); + memory_region_init_io(&s->extioi_system_mem, OBJECT(s), &extioi_ops, + s, "extioi_system_mem", 0x900); + sysbus_init_mmio(sbd, &s->extioi_system_mem); + + if (s->features & BIT(EXTIOI_HAS_VIRT_EXTENSION)) { + memory_region_init_io(&s->virt_extend, OBJECT(s), &extioi_virt_ops, + s, "extioi_virt", EXTIOI_VIRT_SIZE); + sysbus_init_mmio(sbd, &s->virt_extend); + s->features |= EXTIOI_VIRT_HAS_FEATURES; + } else { + s->status |= BIT(EXTIOI_ENABLE); + } - for (cpu = 0; cpu < EXTIOI_CPUS; cpu++) { - memory_region_init_io(&s->extioi_iocsr_mem[cpu], OBJECT(s), &extioi_ops, - s, "extioi_iocsr", 0x900); - sysbus_init_mmio(dev, &s->extioi_iocsr_mem[cpu]); + s->cpu = g_new0(ExtIOICore, s->num_cpu); + if (s->cpu == NULL) { + error_setg(errp, "Memory allocation for ExtIOICore failed"); + return; + } + + for (i = 0; i < s->num_cpu; i++) { for (pin = 0; pin < LS3A_INTC_IP; pin++) { - qdev_init_gpio_out(DEVICE(obj), &s->parent_irq[cpu][pin], 1); + qdev_init_gpio_out(dev, &s->cpu[i].parent_irq[pin], 1); } } - memory_region_init_io(&s->extioi_system_mem, OBJECT(s), &extioi_ops, - s, "extioi_system_mem", 0x900); - sysbus_init_mmio(dev, &s->extioi_system_mem); } +static void loongarch_extioi_finalize(Object *obj) +{ + LoongArchExtIOI *s = LOONGARCH_EXTIOI(obj); + + g_free(s->cpu); +} + +static void loongarch_extioi_reset(DeviceState *d) +{ + LoongArchExtIOI *s = LOONGARCH_EXTIOI(d); + + /* use legacy interrupt routing method by default */ + if (s->features & BIT(EXTIOI_HAS_VIRT_EXTENSION)) { + s->status = 0; + } +} + +static int vmstate_extioi_post_load(void *opaque, int version_id) +{ + LoongArchExtIOI *s = LOONGARCH_EXTIOI(opaque); + int i, start_irq; + + for (i = 0; i < (EXTIOI_IRQS / 4); i++) { + start_irq = i * 4; + extioi_update_sw_coremap(s, start_irq, s->coremap[i], false); + } + + for (i = 0; i < (EXTIOI_IRQS_IPMAP_SIZE / 4); i++) { + extioi_update_sw_ipmap(s, i, s->ipmap[i]); + } + + return 0; +} + +static const VMStateDescription vmstate_extioi_core = { + .name = "extioi-core", + .version_id = 1, + .minimum_version_id = 1, + .fields = (const VMStateField[]) { + VMSTATE_UINT32_ARRAY(coreisr, ExtIOICore, EXTIOI_IRQS_GROUP_COUNT), + VMSTATE_END_OF_LIST() + } +}; + +static const VMStateDescription vmstate_loongarch_extioi = { + .name = TYPE_LOONGARCH_EXTIOI, + .version_id = 2, + .minimum_version_id = 2, + .post_load = vmstate_extioi_post_load, + .fields = (const VMStateField[]) { + VMSTATE_UINT32_ARRAY(bounce, LoongArchExtIOI, EXTIOI_IRQS_GROUP_COUNT), + VMSTATE_UINT32_ARRAY(nodetype, LoongArchExtIOI, + EXTIOI_IRQS_NODETYPE_COUNT / 2), + VMSTATE_UINT32_ARRAY(enable, LoongArchExtIOI, EXTIOI_IRQS / 32), + VMSTATE_UINT32_ARRAY(isr, LoongArchExtIOI, EXTIOI_IRQS / 32), + VMSTATE_UINT32_ARRAY(ipmap, LoongArchExtIOI, EXTIOI_IRQS_IPMAP_SIZE / 4), + VMSTATE_UINT32_ARRAY(coremap, LoongArchExtIOI, EXTIOI_IRQS / 4), + +
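/*
 * Editor's sketch: how the variable-length per-vcpu state above is
 * migrated. VMSTATE_STRUCT_VARRAY_POINTER_UINT32 (migration/vmstate.h)
 * walks a heap-allocated array whose element count is taken from
 * another field of the device. The Demo* names are illustrative, not
 * part of the patch.
 */
typedef struct DemoCore {
    uint32_t coreisr[EXTIOI_IRQS_GROUP_COUNT];
} DemoCore;

typedef struct DemoDev {
    uint32_t num_cpu;   /* set from the "num-cpu" qdev property */
    DemoCore *cpu;      /* g_new0(DemoCore, num_cpu) at realize time */
} DemoDev;

static const VMStateDescription vmstate_demo_core = {
    .name = "demo-core",
    .version_id = 1,
    .minimum_version_id = 1,
    .fields = (const VMStateField[]) {
        VMSTATE_UINT32_ARRAY(coreisr, DemoCore, EXTIOI_IRQS_GROUP_COUNT),
        VMSTATE_END_OF_LIST()
    }
};

static const VMStateDescription vmstate_demo_dev = {
    .name = "demo-dev",
    .version_id = 1,
    .minimum_version_id = 1,
    .fields = (const VMStateField[]) {
        /* num_cpu must match on the source and the destination */
        VMSTATE_STRUCT_VARRAY_POINTER_UINT32(cpu, DemoDev, num_cpu,
                                             vmstate_demo_core, DemoCore),
        VMSTATE_END_OF_LIST()
    }
};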
VMSTATE_STRUCT_VARRAY_POINTER_UINT32(cpu, LoongArchExtIOI, num_cpu, + vmstate_extioi_core, ExtIOICore), + VMSTATE_UINT32(features, LoongArchExtIOI), + VMSTATE_UINT32(status, LoongArchExtIOI), + VMSTATE_END_OF_LIST() + } +}; + +static Property extioi_properties[] = { + DEFINE_PROP_UINT32("num-cpu", LoongArchExtIOI, num_cpu, 1), + DEFINE_PROP_BIT("has-virtualization-extension", LoongArchExtIOI, features, + EXTIOI_HAS_VIRT_EXTENSION, 0), + DEFINE_PROP_END_OF_LIST(), +}; + static void loongarch_extioi_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); + dc->realize = loongarch_extioi_realize; + dc->reset = loongarch_extioi_reset; + device_class_set_props(dc, extioi_properties); dc->vmsd = &vmstate_loongarch_extioi; } static const TypeInfo loongarch_extioi_info = { .name = TYPE_LOONGARCH_EXTIOI, .parent = TYPE_SYS_BUS_DEVICE, - .instance_init = loongarch_extioi_instance_init, .instance_size = sizeof(struct LoongArchExtIOI), .class_init = loongarch_extioi_class_init, + .instance_finalize = loongarch_extioi_finalize, }; static void loongarch_extioi_register_types(void) diff --git a/hw/intc/loongarch_extioi_kvm.c b/hw/intc/loongarch_extioi_kvm.c new file mode 100644 index 0000000000000000000000000000000000000000..94af4378e466ff72602985c4dc7b92da76fdee70 --- /dev/null +++ b/hw/intc/loongarch_extioi_kvm.c @@ -0,0 +1,207 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * LoongArch kvm extioi interrupt support + * + * Copyright (C) 2024 Loongson Technology Corporation Limited + */ + +#include "qemu/osdep.h" +#include "hw/qdev-properties.h" +#include "qemu/typedefs.h" +#include "hw/intc/loongarch_extioi.h" +#include "hw/sysbus.h" +#include "linux/kvm.h" +#include "migration/vmstate.h" +#include "qapi/error.h" +#include "sysemu/kvm.h" + +static void kvm_extioi_access_regs(int fd, uint64_t addr, + void *val, int is_write) +{ + kvm_device_access(fd, KVM_DEV_LOONGARCH_EXTIOI_GRP_REGS, + addr, val, is_write, &error_abort); +} + +static void kvm_extioi_access_sw_status(int fd, uint64_t addr, + void *val, bool is_write) +{ + kvm_device_access(fd, KVM_DEV_LOONGARCH_EXTIOI_GRP_SW_STATUS, + addr, val, is_write, &error_abort); +} + +static void kvm_extioi_save_load_sw_status(void *opaque, bool is_write) +{ + KVMLoongArchExtIOI *s = (KVMLoongArchExtIOI *)opaque; + KVMLoongArchExtIOIClass *class = KVM_LOONGARCH_EXTIOI_GET_CLASS(s); + int fd = class->dev_fd; + int addr; + + addr = KVM_DEV_LOONGARCH_EXTIOI_SW_STATUS_NUM_CPU; + kvm_extioi_access_sw_status(fd, addr, (void *)&s->num_cpu, is_write); + + addr = KVM_DEV_LOONGARCH_EXTIOI_SW_STATUS_FEATURE; + kvm_extioi_access_sw_status(fd, addr, (void *)&s->features, is_write); + + addr = KVM_DEV_LOONGARCH_EXTIOI_SW_STATUS_STATE; + kvm_extioi_access_sw_status(fd, addr, (void *)&s->status, is_write); +} + +static int kvm_loongarch_extioi_pre_save(void *opaque) +{ + KVMLoongArchExtIOI *s = (KVMLoongArchExtIOI *)opaque; + KVMLoongArchExtIOIClass *class = KVM_LOONGARCH_EXTIOI_GET_CLASS(s); + int fd = class->dev_fd; + + kvm_extioi_access_regs(fd, EXTIOI_NODETYPE_START, + (void *)s->nodetype, false); + kvm_extioi_access_regs(fd, EXTIOI_IPMAP_START, (void *)s->ipmap, false); + kvm_extioi_access_regs(fd, EXTIOI_ENABLE_START, (void *)s->enable, false); + kvm_extioi_access_regs(fd, EXTIOI_BOUNCE_START, (void *)s->bounce, false); + kvm_extioi_access_regs(fd, EXTIOI_ISR_START, (void *)s->isr, false); + kvm_extioi_access_regs(fd, EXTIOI_COREMAP_START, + (void *)s->coremap, false); + kvm_extioi_access_regs(fd, EXTIOI_SW_COREMAP_FLAG, + (void 
*)s->sw_coremap, false); + kvm_extioi_access_regs(fd, EXTIOI_COREISR_START, + (void *)s->coreisr, false); + + kvm_extioi_save_load_sw_status(opaque, false); + + return 0; +} + +static int kvm_loongarch_extioi_post_load(void *opaque, int version_id) +{ + KVMLoongArchExtIOI *s = (KVMLoongArchExtIOI *)opaque; + KVMLoongArchExtIOIClass *class = KVM_LOONGARCH_EXTIOI_GET_CLASS(s); + int fd = class->dev_fd; + + kvm_extioi_access_regs(fd, EXTIOI_NODETYPE_START, + (void *)s->nodetype, true); + kvm_extioi_access_regs(fd, EXTIOI_IPMAP_START, (void *)s->ipmap, true); + kvm_extioi_access_regs(fd, EXTIOI_ENABLE_START, (void *)s->enable, true); + kvm_extioi_access_regs(fd, EXTIOI_BOUNCE_START, (void *)s->bounce, true); + kvm_extioi_access_regs(fd, EXTIOI_ISR_START, (void *)s->isr, true); + kvm_extioi_access_regs(fd, EXTIOI_COREMAP_START, (void *)s->coremap, true); + kvm_extioi_access_regs(fd, EXTIOI_SW_COREMAP_FLAG, + (void *)s->sw_coremap, true); + kvm_extioi_access_regs(fd, EXTIOI_COREISR_START, (void *)s->coreisr, true); + + kvm_extioi_save_load_sw_status(opaque, true); + + kvm_device_access(fd, KVM_DEV_LOONGARCH_EXTIOI_GRP_CTRL, + KVM_DEV_LOONGARCH_EXTIOI_CTRL_LOAD_FINISHED, + NULL, true, &error_abort); + + return 0; +} + +static void kvm_loongarch_extioi_realize(DeviceState *dev, Error **errp) +{ + KVMLoongArchExtIOIClass *extioi_class = KVM_LOONGARCH_EXTIOI_GET_CLASS(dev); + KVMLoongArchExtIOI *s = KVM_LOONGARCH_EXTIOI(dev); + struct kvm_create_device cd = {0}; + Error *err = NULL; + int ret,i; + + extioi_class->parent_realize(dev, &err); + if (err) { + error_propagate(errp, err); + return; + } + + if (s->features & BIT(EXTIOI_HAS_VIRT_EXTENSION)) { + s->features |= EXTIOI_VIRT_HAS_FEATURES; + } + + if (!extioi_class->is_created) { + cd.type = KVM_DEV_TYPE_LOONGARCH_EIOINTC; + ret = kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd); + if (ret < 0) { + error_setg_errno(errp, errno, + "Creating the KVM extioi device failed"); + return; + } + extioi_class->is_created = true; + extioi_class->dev_fd = cd.fd; + + kvm_device_access(cd.fd, KVM_DEV_LOONGARCH_EXTIOI_GRP_CTRL, + KVM_DEV_LOONGARCH_EXTIOI_CTRL_INIT_NUM_CPU, + &s->num_cpu, true, NULL); + + kvm_device_access(cd.fd, KVM_DEV_LOONGARCH_EXTIOI_GRP_CTRL, + KVM_DEV_LOONGARCH_EXTIOI_CTRL_INIT_FEATURE, + &s->features, true, NULL); + + fprintf(stdout, "Create LoongArch extioi irqchip in KVM done!\n"); + } + + kvm_async_interrupts_allowed = true; + kvm_msi_via_irqfd_allowed = kvm_irqfds_enabled(); + if (kvm_has_gsi_routing()) { + for (i = 0; i < 64; ++i) { + kvm_irqchip_add_irq_route(kvm_state, i, 0, i); + } + kvm_gsi_routing_allowed = true; + } +} + +static const VMStateDescription vmstate_kvm_extioi_core = { + .name = "kvm-extioi-single", + .version_id = 2, + .minimum_version_id = 2, + .pre_save = kvm_loongarch_extioi_pre_save, + .post_load = kvm_loongarch_extioi_post_load, + .fields = (VMStateField[]) { + VMSTATE_UINT32_ARRAY(nodetype, KVMLoongArchExtIOI, + EXTIOI_IRQS_NODETYPE_COUNT / 2), + VMSTATE_UINT32_ARRAY(bounce, KVMLoongArchExtIOI, + EXTIOI_IRQS_GROUP_COUNT), + VMSTATE_UINT32_ARRAY(isr, KVMLoongArchExtIOI, EXTIOI_IRQS / 32), + VMSTATE_UINT32_2DARRAY(coreisr, KVMLoongArchExtIOI, EXTIOI_CPUS, + EXTIOI_IRQS_GROUP_COUNT), + VMSTATE_UINT32_ARRAY(enable, KVMLoongArchExtIOI, EXTIOI_IRQS / 32), + VMSTATE_UINT32_ARRAY(ipmap, KVMLoongArchExtIOI, + EXTIOI_IRQS_IPMAP_SIZE / 4), + VMSTATE_UINT32_ARRAY(coremap, KVMLoongArchExtIOI, EXTIOI_IRQS / 4), + VMSTATE_UINT8_ARRAY(sw_coremap, KVMLoongArchExtIOI, EXTIOI_IRQS), + VMSTATE_UINT32(num_cpu, KVMLoongArchExtIOI), 
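/*
 * Editor's sketch: what kvm_extioi_access_regs()/kvm_device_access()
 * reduce to at the kernel ABI level. KVM_{GET|SET}_DEVICE_ATTR take a
 * struct kvm_device_attr (linux/kvm.h) on the device fd returned by
 * KVM_CREATE_DEVICE. Illustrative helper; error handling is reduced
 * to the raw ioctl return value.
 */
static int demo_device_attr(int dev_fd, uint32_t group, uint64_t attr,
                            void *val, bool write)
{
    struct kvm_device_attr da = {
        .group = group,          /* e.g. KVM_DEV_LOONGARCH_EXTIOI_GRP_REGS */
        .attr  = attr,           /* register offset within that group */
        .addr  = (uint64_t)(uintptr_t)val, /* userspace buffer */
    };

    return ioctl(dev_fd, write ? KVM_SET_DEVICE_ATTR : KVM_GET_DEVICE_ATTR,
                 &da);
}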
+ VMSTATE_UINT32(features, KVMLoongArchExtIOI), + VMSTATE_UINT32(status, KVMLoongArchExtIOI), + VMSTATE_END_OF_LIST() + } +}; + +static Property extioi_properties[] = { + DEFINE_PROP_UINT32("num-cpu", KVMLoongArchExtIOI, num_cpu, 1), + DEFINE_PROP_BIT("has-virtualization-extension", KVMLoongArchExtIOI, + features, EXTIOI_HAS_VIRT_EXTENSION, 0), + DEFINE_PROP_END_OF_LIST(), +}; + +static void kvm_loongarch_extioi_class_init(ObjectClass *oc, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(oc); + KVMLoongArchExtIOIClass *extioi_class = KVM_LOONGARCH_EXTIOI_CLASS(oc); + + extioi_class->parent_realize = dc->realize; + dc->realize = kvm_loongarch_extioi_realize; + extioi_class->is_created = false; + device_class_set_props(dc, extioi_properties); + dc->vmsd = &vmstate_kvm_extioi_core; +} + +static const TypeInfo kvm_loongarch_extioi_info = { + .name = TYPE_KVM_LOONGARCH_EXTIOI, + .parent = TYPE_SYS_BUS_DEVICE, + .instance_size = sizeof(KVMLoongArchExtIOI), + .class_size = sizeof(KVMLoongArchExtIOIClass), + .class_init = kvm_loongarch_extioi_class_init, +}; + +static void kvm_loongarch_extioi_register_types(void) +{ + type_register_static(&kvm_loongarch_extioi_info); +} + +type_init(kvm_loongarch_extioi_register_types) diff --git a/hw/intc/loongarch_ipi.c b/hw/intc/loongarch_ipi.c index 67858b521c6a4ce68c1bc26b7ac6fba9e083db06..630bcb14ea7fc8b1b7c78838f5afd98e613db509 100644 --- a/hw/intc/loongarch_ipi.c +++ b/hw/intc/loongarch_ipi.c @@ -9,22 +9,26 @@ #include "hw/sysbus.h" #include "hw/intc/loongarch_ipi.h" #include "hw/irq.h" +#include "hw/qdev-properties.h" #include "qapi/error.h" #include "qemu/log.h" #include "exec/address-spaces.h" #include "hw/loongarch/virt.h" #include "migration/vmstate.h" +#include "target/loongarch/cpu.h" #include "target/loongarch/internals.h" #include "trace.h" -static void loongarch_ipi_writel(void *, hwaddr, uint64_t, unsigned); - -static uint64_t loongarch_ipi_readl(void *opaque, hwaddr addr, unsigned size) +static MemTxResult loongarch_ipi_readl(void *opaque, hwaddr addr, + uint64_t *data, + unsigned size, MemTxAttrs attrs) { - IPICore *s = opaque; + IPICore *s; + LoongArchIPI *ipi = opaque; uint64_t ret = 0; int index = 0; + s = &ipi->cpu[attrs.requester_id]; addr &= 0xff; switch (addr) { case CORE_STATUS_OFF: @@ -49,10 +53,12 @@ static uint64_t loongarch_ipi_readl(void *opaque, hwaddr addr, unsigned size) } trace_loongarch_ipi_read(size, (uint64_t)addr, ret); - return ret; + *data = ret; + return MEMTX_OK; } -static void send_ipi_data(CPULoongArchState *env, uint64_t val, hwaddr addr) +static void send_ipi_data(CPULoongArchState *env, uint64_t val, hwaddr addr, + MemTxAttrs attrs) { int i, mask = 0, data = 0; @@ -61,8 +67,8 @@ static void send_ipi_data(CPULoongArchState *env, uint64_t val, hwaddr addr) * if the mask is 0, we need not to do anything. 
*/ if ((val >> 27) & 0xf) { - data = address_space_ldl(&env->address_space_iocsr, addr, - MEMTXATTRS_UNSPECIFIED, NULL); + data = address_space_ldl(env->address_space_iocsr, addr, + attrs, NULL); for (i = 0; i < 4; i++) { /* get mask for byte writing */ if (val & (0x1 << (27 + i))) { @@ -73,8 +79,8 @@ static void send_ipi_data(CPULoongArchState *env, uint64_t val, hwaddr addr) data &= mask; data |= (val >> 32) & ~mask; - address_space_stl(&env->address_space_iocsr, addr, - data, MEMTXATTRS_UNSPECIFIED, NULL); + address_space_stl(env->address_space_iocsr, addr, + data, attrs, NULL); } static int archid_cmp(const void *a, const void *b) @@ -103,80 +109,72 @@ static CPUState *ipi_getcpu(int arch_id) CPUArchId *archid; archid = find_cpu_by_archid(machine, arch_id); - return CPU(archid->cpu); -} - -static void ipi_send(uint64_t val) -{ - uint32_t cpuid; - uint8_t vector; - CPUState *cs; - LoongArchCPU *cpu; - LoongArchIPI *s; - - cpuid = extract32(val, 16, 10); - if (cpuid >= LOONGARCH_MAX_CPUS) { - trace_loongarch_ipi_unsupported_cpuid("IOCSR_IPI_SEND", cpuid); - return; + if (archid) { + return CPU(archid->cpu); } - /* IPI status vector */ - vector = extract8(val, 0, 5); - - cs = ipi_getcpu(cpuid); - cpu = LOONGARCH_CPU(cs); - s = LOONGARCH_IPI(cpu->env.ipistate); - loongarch_ipi_writel(&s->ipi_core, CORE_SET_OFF, BIT(vector), 4); + return NULL; } -static void mail_send(uint64_t val) +static MemTxResult mail_send(uint64_t val, MemTxAttrs attrs) { uint32_t cpuid; hwaddr addr; - CPULoongArchState *env; CPUState *cs; - LoongArchCPU *cpu; cpuid = extract32(val, 16, 10); if (cpuid >= LOONGARCH_MAX_CPUS) { trace_loongarch_ipi_unsupported_cpuid("IOCSR_MAIL_SEND", cpuid); - return; + return MEMTX_DECODE_ERROR; } - addr = 0x1020 + (val & 0x1c); cs = ipi_getcpu(cpuid); - cpu = LOONGARCH_CPU(cs); - env = &cpu->env; - send_ipi_data(env, val, addr); + if (cs == NULL) { + return MEMTX_DECODE_ERROR; + } + + /* override requester_id */ + addr = SMP_IPI_MAILBOX + CORE_BUF_20 + (val & 0x1c); + attrs.requester_id = cs->cpu_index; + send_ipi_data(&LOONGARCH_CPU(cs)->env, val, addr, attrs); + return MEMTX_OK; } -static void any_send(uint64_t val) +static MemTxResult any_send(uint64_t val, MemTxAttrs attrs) { uint32_t cpuid; hwaddr addr; - CPULoongArchState *env; CPUState *cs; - LoongArchCPU *cpu; cpuid = extract32(val, 16, 10); if (cpuid >= LOONGARCH_MAX_CPUS) { trace_loongarch_ipi_unsupported_cpuid("IOCSR_ANY_SEND", cpuid); - return; + return MEMTX_DECODE_ERROR; } - addr = val & 0xffff; cs = ipi_getcpu(cpuid); - cpu = LOONGARCH_CPU(cs); - env = &cpu->env; - send_ipi_data(env, val, addr); + if (cs == NULL) { + return MEMTX_DECODE_ERROR; + } + + /* override requester_id */ + addr = val & 0xffff; + attrs.requester_id = cs->cpu_index; + send_ipi_data(&LOONGARCH_CPU(cs)->env, val, addr, attrs); + return MEMTX_OK; } -static void loongarch_ipi_writel(void *opaque, hwaddr addr, uint64_t val, - unsigned size) +static MemTxResult loongarch_ipi_writel(void *opaque, hwaddr addr, uint64_t val, + unsigned size, MemTxAttrs attrs) { - IPICore *s = opaque; + LoongArchIPI *ipi = opaque; + IPICore *s; int index = 0; + uint32_t cpuid; + uint8_t vector; + CPUState *cs; + s = &ipi->cpu[attrs.requester_id]; addr &= 0xff; trace_loongarch_ipi_write(size, (uint64_t)addr, val); switch (addr) { @@ -203,17 +201,34 @@ static void loongarch_ipi_writel(void *opaque, hwaddr addr, uint64_t val, s->buf[index] = val; break; case IOCSR_IPI_SEND: - ipi_send(val); + cpuid = extract32(val, 16, 10); + if (cpuid >= LOONGARCH_MAX_CPUS) { + 
trace_loongarch_ipi_unsupported_cpuid("IOCSR_IPI_SEND", cpuid); + return MEMTX_DECODE_ERROR; + } + + /* IPI status vector */ + vector = extract8(val, 0, 5); + cs = ipi_getcpu(cpuid); + if (cs == NULL) { + return MEMTX_DECODE_ERROR; + } + + /* override requester_id */ + attrs.requester_id = cs->cpu_index; + loongarch_ipi_writel(ipi, CORE_SET_OFF, BIT(vector), 4, attrs); break; default: qemu_log_mask(LOG_UNIMP, "invalid write: %x", (uint32_t)addr); break; } + + return MEMTX_OK; } static const MemoryRegionOps loongarch_ipi_ops = { - .read = loongarch_ipi_readl, - .write = loongarch_ipi_writel, + .read_with_attrs = loongarch_ipi_readl, + .write_with_attrs = loongarch_ipi_writel, .impl.min_access_size = 4, .impl.max_access_size = 4, .valid.min_access_size = 4, @@ -222,24 +237,28 @@ static const MemoryRegionOps loongarch_ipi_ops = { }; /* mail send and any send only support writeq */ -static void loongarch_ipi_writeq(void *opaque, hwaddr addr, uint64_t val, - unsigned size) +static MemTxResult loongarch_ipi_writeq(void *opaque, hwaddr addr, uint64_t val, + unsigned size, MemTxAttrs attrs) { + MemTxResult ret = MEMTX_OK; + addr &= 0xfff; switch (addr) { case MAIL_SEND_OFFSET: - mail_send(val); + ret = mail_send(val, attrs); break; case ANY_SEND_OFFSET: - any_send(val); + ret = any_send(val, attrs); break; default: break; } + + return ret; } static const MemoryRegionOps loongarch_ipi64_ops = { - .write = loongarch_ipi_writeq, + .write_with_attrs = loongarch_ipi_writeq, .impl.min_access_size = 8, .impl.max_access_size = 8, .valid.min_access_size = 8, @@ -247,23 +266,39 @@ static const MemoryRegionOps loongarch_ipi64_ops = { .endianness = DEVICE_LITTLE_ENDIAN, }; -static void loongarch_ipi_init(Object *obj) +static void loongarch_ipi_realize(DeviceState *dev, Error **errp) { - LoongArchIPI *s = LOONGARCH_IPI(obj); - SysBusDevice *sbd = SYS_BUS_DEVICE(obj); + LoongArchIPI *s = LOONGARCH_IPI(dev); + SysBusDevice *sbd = SYS_BUS_DEVICE(dev); + int i; + + if (s->num_cpu == 0) { + error_setg(errp, "num-cpu must be at least 1"); + return; + } - memory_region_init_io(&s->ipi_iocsr_mem, obj, &loongarch_ipi_ops, - &s->ipi_core, "loongarch_ipi_iocsr", 0x48); + memory_region_init_io(&s->ipi_iocsr_mem, OBJECT(dev), &loongarch_ipi_ops, + s, "loongarch_ipi_iocsr", 0x48); /* loongarch_ipi_iocsr performs re-entrant IO through ipi_send */ s->ipi_iocsr_mem.disable_reentrancy_guard = true; sysbus_init_mmio(sbd, &s->ipi_iocsr_mem); - memory_region_init_io(&s->ipi64_iocsr_mem, obj, &loongarch_ipi64_ops, - &s->ipi_core, "loongarch_ipi64_iocsr", 0x118); + memory_region_init_io(&s->ipi64_iocsr_mem, OBJECT(dev), + &loongarch_ipi64_ops, + s, "loongarch_ipi64_iocsr", 0x118); sysbus_init_mmio(sbd, &s->ipi64_iocsr_mem); - qdev_init_gpio_out(DEVICE(obj), &s->ipi_core.irq, 1); + + s->cpu = g_new0(IPICore, s->num_cpu); + if (s->cpu == NULL) { + error_setg(errp, "Memory allocation for IPICore failed"); + return; + } + + for (i = 0; i < s->num_cpu; i++) { + qdev_init_gpio_out(dev, &s->cpu[i].irq, 1); + } } static const VMStateDescription vmstate_ipi_core = { @@ -282,27 +317,42 @@ static const VMStateDescription vmstate_ipi_core = { static const VMStateDescription vmstate_loongarch_ipi = { .name = TYPE_LOONGARCH_IPI, - .version_id = 1, - .minimum_version_id = 1, - .fields = (VMStateField[]) { - VMSTATE_STRUCT(ipi_core, LoongArchIPI, 0, vmstate_ipi_core, IPICore), + .version_id = 2, + .minimum_version_id = 2, + .fields = (const VMStateField[]) { + VMSTATE_STRUCT_VARRAY_POINTER_UINT32(cpu, LoongArchIPI, num_cpu, + vmstate_ipi_core,
IPICore), VMSTATE_END_OF_LIST() } }; +static Property ipi_properties[] = { + DEFINE_PROP_UINT32("num-cpu", LoongArchIPI, num_cpu, 1), + DEFINE_PROP_END_OF_LIST(), +}; + static void loongarch_ipi_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); + dc->realize = loongarch_ipi_realize; + device_class_set_props(dc, ipi_properties); dc->vmsd = &vmstate_loongarch_ipi; } +static void loongarch_ipi_finalize(Object *obj) +{ + LoongArchIPI *s = LOONGARCH_IPI(obj); + + g_free(s->cpu); +} + static const TypeInfo loongarch_ipi_info = { .name = TYPE_LOONGARCH_IPI, .parent = TYPE_SYS_BUS_DEVICE, .instance_size = sizeof(LoongArchIPI), - .instance_init = loongarch_ipi_init, .class_init = loongarch_ipi_class_init, + .instance_finalize = loongarch_ipi_finalize, }; static void loongarch_ipi_register_types(void) diff --git a/hw/intc/loongarch_ipi_kvm.c b/hw/intc/loongarch_ipi_kvm.c new file mode 100644 index 0000000000000000000000000000000000000000..57fc05db778ddbfbcb02845801351525cadcddaf --- /dev/null +++ b/hw/intc/loongarch_ipi_kvm.c @@ -0,0 +1,207 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * LoongArch kvm ipi interrupt support + * + * Copyright (C) 2024 Loongson Technology Corporation Limited + */ + +#include "qemu/osdep.h" +#include "hw/qdev-properties.h" +#include "qemu/typedefs.h" +#include "hw/intc/loongarch_ipi.h" +#include "hw/sysbus.h" +#include "linux/kvm.h" +#include "migration/vmstate.h" +#include "qapi/error.h" +#include "sysemu/kvm.h" + +#define IPI_DEV_FD_UNDEF -1 + +static void kvm_ipi_access_regs(int fd, uint64_t addr, + uint32_t *val, int is_write) +{ + kvm_device_access(fd, KVM_DEV_LOONGARCH_IPI_GRP_REGS, + addr, val, is_write, &error_abort); +} + +static int kvm_loongarch_ipi_pre_save(void *opaque) +{ + KVMLoongArchIPI *ipi = (KVMLoongArchIPI *)opaque; + KVMLoongArchIPIClass *ipi_class = KVM_LOONGARCH_IPI_GET_CLASS(ipi); + IPICore *cpu; + uint64_t attr; + int cpu_id = 0; + int fd = ipi_class->dev_fd; + + for (cpu_id = 0; cpu_id < ipi->num_cpu; cpu_id++) { + cpu = &ipi->cpu[cpu_id]; + attr = (cpu_id << 16) | CORE_STATUS_OFF; + kvm_ipi_access_regs(fd, attr, &cpu->status, false); + + attr = (cpu_id << 16) | CORE_EN_OFF; + kvm_ipi_access_regs(fd, attr, &cpu->en, false); + + attr = (cpu_id << 16) | CORE_SET_OFF; + kvm_ipi_access_regs(fd, attr, &cpu->set, false); + + attr = (cpu_id << 16) | CORE_CLEAR_OFF; + kvm_ipi_access_regs(fd, attr, &cpu->clear, false); + + attr = (cpu_id << 16) | CORE_BUF_20; + kvm_ipi_access_regs(fd, attr, &cpu->buf[0], false); + + attr = (cpu_id << 16) | CORE_BUF_28; + kvm_ipi_access_regs(fd, attr, &cpu->buf[2], false); + + attr = (cpu_id << 16) | CORE_BUF_30; + kvm_ipi_access_regs(fd, attr, &cpu->buf[4], false); + + attr = (cpu_id << 16) | CORE_BUF_38; + kvm_ipi_access_regs(fd, attr, &cpu->buf[6], false); + } + + return 0; +} + +static int kvm_loongarch_ipi_post_load(void *opaque, int version_id) +{ + KVMLoongArchIPI *ipi = (KVMLoongArchIPI *)opaque; + KVMLoongArchIPIClass *ipi_class = KVM_LOONGARCH_IPI_GET_CLASS(ipi); + IPICore *cpu; + uint64_t attr; + int cpu_id = 0; + int fd = ipi_class->dev_fd; + + for (cpu_id = 0; cpu_id < ipi->num_cpu; cpu_id++) { + cpu = &ipi->cpu[cpu_id]; + attr = (cpu_id << 16) | CORE_STATUS_OFF; + kvm_ipi_access_regs(fd, attr, &cpu->status, true); + + attr = (cpu_id << 16) | CORE_EN_OFF; + kvm_ipi_access_regs(fd, attr, &cpu->en, true); + + attr = (cpu_id << 16) | CORE_SET_OFF; + kvm_ipi_access_regs(fd, attr, &cpu->set, true); + + attr = (cpu_id << 16) | CORE_CLEAR_OFF; + kvm_ipi_access_regs(fd, 
attr, &cpu->clear, true); + + attr = (cpu_id << 16) | CORE_BUF_20; + kvm_ipi_access_regs(fd, attr, &cpu->buf[0], true); + + attr = (cpu_id << 16) | CORE_BUF_28; + kvm_ipi_access_regs(fd, attr, &cpu->buf[2], true); + + attr = (cpu_id << 16) | CORE_BUF_30; + kvm_ipi_access_regs(fd, attr, &cpu->buf[4], true); + + attr = (cpu_id << 16) | CORE_BUF_38; + kvm_ipi_access_regs(fd, attr, &cpu->buf[6], true); + } + + return 0; +} + +static void kvm_loongarch_ipi_realize(DeviceState *dev, Error **errp) +{ + KVMLoongArchIPI *ipi = KVM_LOONGARCH_IPI(dev); + KVMLoongArchIPIClass *ipi_class = KVM_LOONGARCH_IPI_GET_CLASS(dev); + struct kvm_create_device cd = {0}; + Error *err = NULL; + int ret; + + if (ipi->num_cpu == 0) { + error_setg(errp, "num-cpu must be at least 1"); + return; + } + + ipi_class->parent_realize(dev, &err); + if (err) { + error_propagate(errp, err); + return; + } + + ipi->cpu = g_new0(IPICore, ipi->num_cpu); + if (ipi->cpu == NULL) { + error_setg(errp, "Memory allocation for IPICore failed"); + return; + } + + if (!ipi_class->is_created) { + cd.type = KVM_DEV_TYPE_LOONGARCH_IPI; + ret = kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd); + if (ret < 0) { + error_setg_errno(errp, errno, "Creating the KVM device failed"); + return; + } + ipi_class->is_created = true; + ipi_class->dev_fd = cd.fd; + fprintf(stdout, "Create LoongArch IPI irqchip in KVM done!\n"); + } + + assert(ipi_class->dev_fd != IPI_DEV_FD_UNDEF); +} + +static Property kvm_loongarch_ipi_properties[] = { + DEFINE_PROP_UINT32("num-cpu", KVMLoongArchIPI, num_cpu, 1), + DEFINE_PROP_END_OF_LIST() +}; + +static const VMStateDescription vmstate_kvm_ipi_core = { + .name = "kvm-ipi-single", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_UINT32(status, IPICore), + VMSTATE_UINT32(en, IPICore), + VMSTATE_UINT32(set, IPICore), + VMSTATE_UINT32(clear, IPICore), + VMSTATE_UINT32_ARRAY(buf, IPICore, 8), + VMSTATE_END_OF_LIST() + } +}; + +static const VMStateDescription vmstate_kvm_loongarch_ipi = { + .name = TYPE_KVM_LOONGARCH_IPI, + .version_id = 1, + .minimum_version_id = 1, + .pre_save = kvm_loongarch_ipi_pre_save, + .post_load = kvm_loongarch_ipi_post_load, + .fields = (VMStateField[]) { + VMSTATE_STRUCT_VARRAY_POINTER_UINT32(cpu, KVMLoongArchIPI, num_cpu, + vmstate_kvm_ipi_core, IPICore), + + VMSTATE_END_OF_LIST() + } +}; + +static void kvm_loongarch_ipi_class_init(ObjectClass *oc, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(oc); + KVMLoongArchIPIClass *ipi_class = KVM_LOONGARCH_IPI_CLASS(oc); + + ipi_class->parent_realize = dc->realize; + dc->realize = kvm_loongarch_ipi_realize; + + ipi_class->is_created = false; + ipi_class->dev_fd = IPI_DEV_FD_UNDEF; + + device_class_set_props(dc, kvm_loongarch_ipi_properties); + + dc->vmsd = &vmstate_kvm_loongarch_ipi; +} + +static const TypeInfo kvm_loongarch_ipi_info = { + .name = TYPE_KVM_LOONGARCH_IPI, + .parent = TYPE_SYS_BUS_DEVICE, + .instance_size = sizeof(KVMLoongArchIPI), + .class_size = sizeof(KVMLoongArchIPIClass), + .class_init = kvm_loongarch_ipi_class_init, +}; + +static void kvm_loongarch_ipi_register_types(void) +{ + type_register_static(&kvm_loongarch_ipi_info); +} + +type_init(kvm_loongarch_ipi_register_types) diff --git a/hw/intc/loongarch_pch_msi.c b/hw/intc/loongarch_pch_msi.c index ecf3ed0267e4fb79e0614fcdc941b7e121e2e644..901c2c21bef33d6d46ff5347e755d9d716cb3d8b 100644 --- a/hw/intc/loongarch_pch_msi.c +++ b/hw/intc/loongarch_pch_msi.c @@ -14,6 +14,8 @@ #include "hw/misc/unimp.h" #include "migration/vmstate.h" #include "trace.h"
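/*
 * Editor's sketch: the attribute encoding used by the IPI
 * pre_save/post_load loops above. The vcpu index lives in the upper
 * half of the attribute and the register offset in the lower half;
 * the helper name is illustrative, the 16-bit shift matches the
 * (cpu_id << 16) in the loops.
 */
static inline uint64_t demo_ipi_reg_attr(int cpu_id, uint32_t reg_off)
{
    return ((uint64_t)cpu_id << 16) | reg_off;  /* e.g. CORE_STATUS_OFF */
}

/* One iteration of the save loop, rewritten with the helper:
 *   kvm_ipi_access_regs(fd, demo_ipi_reg_attr(cpu_id, CORE_EN_OFF),
 *                       &cpu->en, false);
 */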
+#include "sysemu/kvm.h" +#include "hw/loongarch/virt.h" static uint64_t loongarch_msi_mem_read(void *opaque, hwaddr addr, unsigned size) { @@ -26,14 +28,24 @@ static void loongarch_msi_mem_write(void *opaque, hwaddr addr, LoongArchPCHMSI *s = (LoongArchPCHMSI *)opaque; int irq_num; - /* - * vector number is irq number from upper extioi intc - * need subtract irq base to get msi vector offset - */ - irq_num = (val & 0xff) - s->irq_base; - trace_loongarch_msi_set_irq(irq_num); - assert(irq_num < s->irq_num); - qemu_set_irq(s->pch_msi_irq[irq_num], 1); + MSIMessage msg = { + .address = addr, + .data = val, + }; + + if (kvm_enabled() && kvm_irqchip_in_kernel()) { + kvm_irqchip_send_msi(kvm_state, msg); + } else { + /* + * the vector number is the irq number from the upstream extioi intc; + * subtract the irq base to get the msi vector offset + */ + irq_num = (val & 0xff) - s->irq_base; + trace_loongarch_msi_set_irq(irq_num); + assert(irq_num < s->irq_num); + + qemu_set_irq(s->pch_msi_irq[irq_num], 1); + } } static const MemoryRegionOps loongarch_pch_msi_ops = { @@ -46,7 +58,16 @@ static void pch_msi_irq_handler(void *opaque, int irq, int level) { LoongArchPCHMSI *s = LOONGARCH_PCH_MSI(opaque); - qemu_set_irq(s->pch_msi_irq[irq], level); + MSIMessage msg = { + .address = 0, + .data = irq, + }; + + if (kvm_enabled() && kvm_irqchip_in_kernel()) { + kvm_irqchip_send_msi(kvm_state, msg); + } else { + qemu_set_irq(s->pch_msi_irq[irq], level); + } } static void loongarch_pch_msi_realize(DeviceState *dev, Error **errp) diff --git a/hw/intc/loongarch_pch_pic.c b/hw/intc/loongarch_pch_pic.c index 6aa4cadfa4afd936d61a96e600b31947a62bb9b9..beb4ac188d8f97912856194bdfa00dfc90b00214 100644 --- a/hw/intc/loongarch_pch_pic.c +++ b/hw/intc/loongarch_pch_pic.c @@ -16,19 +16,28 @@ #include "migration/vmstate.h" #include "trace.h" #include "qapi/error.h" +#include "sysemu/kvm.h" static void pch_pic_update_irq(LoongArchPCHPIC *s, uint64_t mask, int level) { uint64_t val; int irq; + int kvm_irq; if (level) { val = mask & s->intirr & ~s->int_mask; if (val) { irq = ctz64(val); s->intisr |= MAKE_64BIT_MASK(irq, 1); - qemu_set_irq(s->parent_irq[s->htmsi_vector[irq]], 1); - } + if (kvm_enabled() && kvm_irqchip_in_kernel()) { + kvm_irq = ( + KVM_LOONGARCH_IRQ_TYPE_IOAPIC << KVM_LOONGARCH_IRQ_TYPE_SHIFT) + | (0 << KVM_LOONGARCH_IRQ_VCPU_SHIFT) | s->htmsi_vector[irq]; + kvm_set_irq(kvm_state, kvm_irq, !!level); + } else { + qemu_set_irq(s->parent_irq[s->htmsi_vector[irq]], 1); + } + } } else { /* * intirr means requested pending irq @@ -38,8 +47,15 @@ static void pch_pic_update_irq(LoongArchPCHPIC *s, uint64_t mask, int level) if (val) { irq = ctz64(val); s->intisr &= ~MAKE_64BIT_MASK(irq, 1); - qemu_set_irq(s->parent_irq[s->htmsi_vector[irq]], 0); - } + if (kvm_enabled() && kvm_irqchip_in_kernel()) { + kvm_irq = ( + KVM_LOONGARCH_IRQ_TYPE_IOAPIC << KVM_LOONGARCH_IRQ_TYPE_SHIFT) + | (0 << KVM_LOONGARCH_IRQ_VCPU_SHIFT) | s->htmsi_vector[irq]; + kvm_set_irq(kvm_state, kvm_irq, !!level); + } else { + qemu_set_irq(s->parent_irq[s->htmsi_vector[irq]], 0); + } + } } } diff --git a/hw/intc/loongarch_pch_pic_kvm.c b/hw/intc/loongarch_pch_pic_kvm.c new file mode 100644 index 0000000000000000000000000000000000000000..e9cef02f9aacb9e9dbed5b173587e573fd012bcb --- /dev/null +++ b/hw/intc/loongarch_pch_pic_kvm.c @@ -0,0 +1,189 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * LoongArch kvm pch pic interrupt support + * + * Copyright (C) 2024 Loongson Technology Corporation Limited + */ + +#include "qemu/osdep.h" +#include "hw/qdev-properties.h"
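/*
 * Editor's sketch of the MSI dispatch the pch_msi write path above
 * implements: with an in-kernel irqchip the message goes straight to
 * KVM, otherwise it is decoded back into a wired irq towards the
 * userspace extioi model. Assumes the patch's LoongArchPCHMSI fields;
 * the helper name is illustrative.
 */
static void demo_deliver_msi(LoongArchPCHMSI *s, hwaddr addr, uint64_t val)
{
    MSIMessage msg = { .address = addr, .data = val };

    if (kvm_enabled() && kvm_irqchip_in_kernel()) {
        /* In-kernel irqchip: inject the MSI directly through KVM. */
        kvm_irqchip_send_msi(kvm_state, msg);
    } else {
        /* Userspace model: vector number minus irq base selects the pin. */
        qemu_set_irq(s->pch_msi_irq[(val & 0xff) - s->irq_base], 1);
    }
}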
+#include "qemu/typedefs.h" +#include "hw/intc/loongarch_pch_pic.h" +#include "hw/sysbus.h" +#include "linux/kvm.h" +#include "migration/vmstate.h" +#include "qapi/error.h" +#include "sysemu/kvm.h" +#include "hw/loongarch/virt.h" +#include "hw/pci-host/ls7a.h" +#include "qemu/error-report.h" + +static void kvm_pch_pic_access_regs(int fd, uint64_t addr, + void *val, int is_write) +{ + kvm_device_access(fd, KVM_DEV_LOONGARCH_PCH_PIC_GRP_REGS, + addr, val, is_write, &error_abort); +} + +static int kvm_loongarch_pch_pic_pre_save(void *opaque) +{ + KVMLoongArchPCHPIC *s = (KVMLoongArchPCHPIC *)opaque; + KVMLoongArchPCHPICClass *class = KVM_LOONGARCH_PCH_PIC_GET_CLASS(s); + int fd = class->dev_fd; + + kvm_pch_pic_access_regs(fd, PCH_PIC_MASK_START, + (void *)&s->int_mask, false); + kvm_pch_pic_access_regs(fd, PCH_PIC_HTMSI_EN_START, + (void *)&s->htmsi_en, false); + kvm_pch_pic_access_regs(fd, PCH_PIC_EDGE_START, + (void *)&s->intedge, false); + kvm_pch_pic_access_regs(fd, PCH_PIC_AUTO_CTRL0_START, + (void *)&s->auto_crtl0, false); + kvm_pch_pic_access_regs(fd, PCH_PIC_AUTO_CTRL1_START, + (void *)&s->auto_crtl1, false); + kvm_pch_pic_access_regs(fd, PCH_PIC_ROUTE_ENTRY_START, + (void *)s->route_entry, false); + kvm_pch_pic_access_regs(fd, PCH_PIC_HTMSI_VEC_START, + (void *)s->htmsi_vector, false); + kvm_pch_pic_access_regs(fd, PCH_PIC_INT_IRR_START, + (void *)&s->intirr, false); + kvm_pch_pic_access_regs(fd, PCH_PIC_INT_ISR_START, + (void *)&s->intisr, false); + kvm_pch_pic_access_regs(fd, PCH_PIC_POLARITY_START, + (void *)&s->int_polarity, false); + + return 0; +} + +static int kvm_loongarch_pch_pic_post_load(void *opaque, int version_id) +{ + KVMLoongArchPCHPIC *s = (KVMLoongArchPCHPIC *)opaque; + KVMLoongArchPCHPICClass *class = KVM_LOONGARCH_PCH_PIC_GET_CLASS(s); + int fd = class->dev_fd; + + kvm_pch_pic_access_regs(fd, PCH_PIC_MASK_START, + (void *)&s->int_mask, true); + kvm_pch_pic_access_regs(fd, PCH_PIC_HTMSI_EN_START, + (void *)&s->htmsi_en, true); + kvm_pch_pic_access_regs(fd, PCH_PIC_EDGE_START, + (void *)&s->intedge, true); + kvm_pch_pic_access_regs(fd, PCH_PIC_AUTO_CTRL0_START, + (void *)&s->auto_crtl0, true); + kvm_pch_pic_access_regs(fd, PCH_PIC_AUTO_CTRL1_START, + (void *)&s->auto_crtl1, true); + kvm_pch_pic_access_regs(fd, PCH_PIC_ROUTE_ENTRY_START, + (void *)s->route_entry, true); + kvm_pch_pic_access_regs(fd, PCH_PIC_HTMSI_VEC_START, + (void *)s->htmsi_vector, true); + kvm_pch_pic_access_regs(fd, PCH_PIC_INT_IRR_START, + (void *)&s->intirr, true); + kvm_pch_pic_access_regs(fd, PCH_PIC_INT_ISR_START, + (void *)&s->intisr, true); + kvm_pch_pic_access_regs(fd, PCH_PIC_POLARITY_START, + (void *)&s->int_polarity, true); + + return 0; +} + +static void kvm_pch_pic_handler(void *opaque, int irq, int level) +{ + int kvm_irq; + + if (kvm_enabled()) { + kvm_irq = \ + (KVM_LOONGARCH_IRQ_TYPE_IOAPIC << KVM_LOONGARCH_IRQ_TYPE_SHIFT) + | (0 << KVM_LOONGARCH_IRQ_VCPU_SHIFT) | irq; + kvm_set_irq(kvm_state, kvm_irq, !!level); + } +} + +static void kvm_loongarch_pch_pic_realize(DeviceState *dev, Error **errp) +{ + KVMLoongArchPCHPICClass *pch_pic_class = + KVM_LOONGARCH_PCH_PIC_GET_CLASS(dev); + struct kvm_create_device cd = {0}; + uint64_t pch_pic_base = VIRT_PCH_REG_BASE; + Error *err = NULL; + int ret; + + pch_pic_class->parent_realize(dev, &err); + if (err) { + error_propagate(errp, err); + return; + } + + if (!pch_pic_class->is_created) { + cd.type = KVM_DEV_TYPE_LOONGARCH_PCHPIC; + ret = kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd); + if (ret < 0) { + error_setg_errno(errp, errno, + 
"Creating the KVM pch pic device failed"); + return; + } + pch_pic_class->is_created = true; + pch_pic_class->dev_fd = cd.fd; + fprintf(stdout, "Create LoongArch pch pic irqchip in KVM done!\n"); + + ret = kvm_device_access(cd.fd, KVM_DEV_LOONGARCH_PCH_PIC_GRP_CTRL, + KVM_DEV_LOONGARCH_PCH_PIC_CTRL_INIT, + &pch_pic_base, true, NULL); + if (ret < 0) { + error_report( + "KVM EXTIOI: failed to set the base address of EXTIOI"); + exit(1); + } + + qdev_init_gpio_in(dev, kvm_pch_pic_handler, VIRT_PCH_PIC_IRQ_NUM); + } +} + +static const VMStateDescription vmstate_kvm_loongarch_pch_pic = { + .name = TYPE_LOONGARCH_PCH_PIC, + .version_id = 1, + .minimum_version_id = 1, + .pre_save = kvm_loongarch_pch_pic_pre_save, + .post_load = kvm_loongarch_pch_pic_post_load, + .fields = (const VMStateField[]) { + VMSTATE_UINT64(int_mask, KVMLoongArchPCHPIC), + VMSTATE_UINT64(htmsi_en, KVMLoongArchPCHPIC), + VMSTATE_UINT64(intedge, KVMLoongArchPCHPIC), + VMSTATE_UINT64(intclr, KVMLoongArchPCHPIC), + VMSTATE_UINT64(auto_crtl0, KVMLoongArchPCHPIC), + VMSTATE_UINT64(auto_crtl1, KVMLoongArchPCHPIC), + VMSTATE_UINT8_ARRAY(route_entry, KVMLoongArchPCHPIC, 64), + VMSTATE_UINT8_ARRAY(htmsi_vector, KVMLoongArchPCHPIC, 64), + VMSTATE_UINT64(last_intirr, KVMLoongArchPCHPIC), + VMSTATE_UINT64(intirr, KVMLoongArchPCHPIC), + VMSTATE_UINT64(intisr, KVMLoongArchPCHPIC), + VMSTATE_UINT64(int_polarity, KVMLoongArchPCHPIC), + VMSTATE_END_OF_LIST() + } +}; + + +static void kvm_loongarch_pch_pic_class_init(ObjectClass *oc, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(oc); + KVMLoongArchPCHPICClass *pch_pic_class = KVM_LOONGARCH_PCH_PIC_CLASS(oc); + + pch_pic_class->parent_realize = dc->realize; + dc->realize = kvm_loongarch_pch_pic_realize; + pch_pic_class->is_created = false; + dc->vmsd = &vmstate_kvm_loongarch_pch_pic; + +} + +static const TypeInfo kvm_loongarch_pch_pic_info = { + .name = TYPE_KVM_LOONGARCH_PCH_PIC, + .parent = TYPE_SYS_BUS_DEVICE, + .instance_size = sizeof(KVMLoongArchPCHPIC), + .class_size = sizeof(KVMLoongArchPCHPICClass), + .class_init = kvm_loongarch_pch_pic_class_init, +}; + +static void kvm_loongarch_pch_pic_register_types(void) +{ + type_register_static(&kvm_loongarch_pch_pic_info); +} + +type_init(kvm_loongarch_pch_pic_register_types) diff --git a/hw/intc/meson.build b/hw/intc/meson.build index ed355941d174193633f1601d45208eddf8f4963d..49b450131556911967c3b7881041f48c1aca95b9 100644 --- a/hw/intc/meson.build +++ b/hw/intc/meson.build @@ -70,6 +70,9 @@ specific_ss.add(when: ['CONFIG_KVM', 'CONFIG_XIVE'], specific_ss.add(when: 'CONFIG_M68K_IRQC', if_true: files('m68k_irqc.c')) specific_ss.add(when: 'CONFIG_NIOS2_VIC', if_true: files('nios2_vic.c')) specific_ss.add(when: 'CONFIG_LOONGARCH_IPI', if_true: files('loongarch_ipi.c')) +specific_ss.add(when: 'CONFIG_LOONGARCH_IPI_KVM', if_true: files('loongarch_ipi_kvm.c')) specific_ss.add(when: 'CONFIG_LOONGARCH_PCH_PIC', if_true: files('loongarch_pch_pic.c')) specific_ss.add(when: 'CONFIG_LOONGARCH_PCH_MSI', if_true: files('loongarch_pch_msi.c')) specific_ss.add(when: 'CONFIG_LOONGARCH_EXTIOI', if_true: files('loongarch_extioi.c')) +specific_ss.add(when: 'CONFIG_LOONGARCH_EXTIOI_KVM', if_true: files('loongarch_extioi_kvm.c')) +specific_ss.add(when: 'CONFIG_LOONGARCH_PCH_PIC_KVM', if_true: files('loongarch_pch_pic_kvm.c')) diff --git a/hw/intc/openpic.c b/hw/intc/openpic.c index a6f91d4bcdf7d154ef16507a6da8b1d18dd6504a..d74ec11af4adb3fb1f5855e9cc85465477a3223c 100644 --- a/hw/intc/openpic.c +++ b/hw/intc/openpic.c @@ -41,7 +41,6 @@ #include "hw/pci/msi.h" 
#include "qapi/error.h" #include "qemu/bitops.h" -#include "qapi/qmp/qerror.h" #include "qemu/module.h" #include "qemu/timer.h" #include "qemu/error-report.h" @@ -1032,13 +1031,14 @@ static void openpic_cpu_write_internal(void *opaque, hwaddr addr, s_IRQ = IRQ_get_next(opp, &dst->servicing); /* Check queued interrupts. */ n_IRQ = IRQ_get_next(opp, &dst->raised); - src = &opp->src[n_IRQ]; - if (n_IRQ != -1 && - (s_IRQ == -1 || - IVPR_PRIORITY(src->ivpr) > dst->servicing.priority)) { - DPRINTF("Raise OpenPIC INT output cpu %d irq %d", - idx, n_IRQ); - qemu_irq_raise(opp->dst[idx].irqs[OPENPIC_OUTPUT_INT]); + if (n_IRQ != -1) { + src = &opp->src[n_IRQ]; + if (s_IRQ == -1 || + IVPR_PRIORITY(src->ivpr) > dst->servicing.priority) { + DPRINTF("Raise OpenPIC INT output cpu %d irq %d", + idx, n_IRQ); + qemu_irq_raise(opp->dst[idx].irqs[OPENPIC_OUTPUT_INT]); + } } break; default: @@ -1535,9 +1535,7 @@ static void openpic_realize(DeviceState *dev, Error **errp) }; if (opp->nb_cpus > MAX_CPU) { - error_setg(errp, QERR_PROPERTY_VALUE_OUT_OF_RANGE, - TYPE_OPENPIC, "nb_cpus", (uint64_t)opp->nb_cpus, - (uint64_t)0, (uint64_t)MAX_CPU); + error_setg(errp, "property 'nb_cpus' can be at most %d", MAX_CPU); return; } diff --git a/hw/intc/riscv_aplic.c b/hw/intc/riscv_aplic.c index c677b5cfbb54d3fdd2480063ba0f5edc0c67d379..2fdf85444e556c9ec30f9726cc5afe1394019c78 100644 --- a/hw/intc/riscv_aplic.c +++ b/hw/intc/riscv_aplic.c @@ -974,16 +974,16 @@ DeviceState *riscv_aplic_create(hwaddr addr, hwaddr size, qdev_prop_set_bit(dev, "msimode", msimode); qdev_prop_set_bit(dev, "mmode", mmode); + if (parent) { + riscv_aplic_add_child(parent, dev); + } + sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal); if (!is_kvm_aia(msimode)) { sysbus_mmio_map(SYS_BUS_DEVICE(dev), 0, addr); } - if (parent) { - riscv_aplic_add_child(parent, dev); - } - if (!msimode) { for (i = 0; i < num_harts; i++) { CPUState *cpu = cpu_by_arch_id(hartid_base + i); diff --git a/hw/intc/sifive_plic.c b/hw/intc/sifive_plic.c index 5522ede2cf856f78be9107d2eee7e993960a76eb..e5de52bc44c99df0baa7b5cd7e6973c2f4832133 100644 --- a/hw/intc/sifive_plic.c +++ b/hw/intc/sifive_plic.c @@ -349,8 +349,10 @@ static void sifive_plic_irq_request(void *opaque, int irq, int level) { SiFivePLICState *s = opaque; - sifive_plic_set_pending(s, irq, level > 0); - sifive_plic_update(s); + if (level > 0) { + sifive_plic_set_pending(s, irq, true); + sifive_plic_update(s); + } } static void sifive_plic_realize(DeviceState *dev, Error **errp) diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c index 5789062379c7a616f4f907136d5d0bea3a653e48..7a86197fc95a01841cab5968ba327475e4c12a24 100644 --- a/hw/intc/spapr_xive_kvm.c +++ b/hw/intc/spapr_xive_kvm.c @@ -720,7 +720,7 @@ int kvmppc_xive_connect(SpaprInterruptController *intc, uint32_t nr_servers, { SpaprXive *xive = SPAPR_XIVE(intc); XiveSource *xsrc = &xive->source; - size_t esb_len = xive_source_esb_len(xsrc); + uint64_t esb_len = xive_source_esb_len(xsrc); size_t tima_len = 4ull << TM_SHIFT; CPUState *cs; int fd; @@ -824,7 +824,7 @@ void kvmppc_xive_disconnect(SpaprInterruptController *intc) { SpaprXive *xive = SPAPR_XIVE(intc); XiveSource *xsrc; - size_t esb_len; + uint64_t esb_len; assert(xive->fd != -1); diff --git a/hw/intc/xive.c b/hw/intc/xive.c index a3585593d8f495f9699eebfb050f83f67c1bd389..0cfc172dd4c7f3ba7f24982c1f337229f48fb3d8 100644 --- a/hw/intc/xive.c +++ b/hw/intc/xive.c @@ -1238,7 +1238,7 @@ static void xive_source_reset(void *dev) static void xive_source_realize(DeviceState *dev, Error 
**errp) { XiveSource *xsrc = XIVE_SOURCE(dev); - size_t esb_len = xive_source_esb_len(xsrc); + uint64_t esb_len = xive_source_esb_len(xsrc); assert(xsrc->xive); diff --git a/hw/isa/vt82c686.c b/hw/isa/vt82c686.c index 9c2333a277d9ea253b50e1fc09b00c481913b03e..0334431219c88316fab11b983a921d5575863488 100644 --- a/hw/isa/vt82c686.c +++ b/hw/isa/vt82c686.c @@ -613,7 +613,7 @@ void via_isa_set_irq(PCIDevice *d, int pin, int level) ViaISAState *s = VIA_ISA(pci_get_function_0(d)); uint8_t irq = d->config[PCI_INTERRUPT_LINE], max_irq = 15; int f = PCI_FUNC(d->devfn); - uint16_t mask = BIT(f); + uint16_t mask; switch (f) { case 0: /* PIRQ/PINT inputs */ @@ -628,6 +628,7 @@ void via_isa_set_irq(PCIDevice *d, int pin, int level) } /* Keep track of the state of all sources */ + mask = BIT(f); if (level) { s->irq_state[0] |= mask; } else { diff --git a/hw/loongarch/Kconfig b/hw/loongarch/Kconfig index 5727efed6d8442bee0217d2dcd0f2e6c1cd8c52e..16c854c0d595f54aa4a9bcf194f19cbbbccd9331 100644 --- a/hw/loongarch/Kconfig +++ b/hw/loongarch/Kconfig @@ -1,10 +1,12 @@ config LOONGARCH_VIRT bool + default y + depends on LOONGARCH64 select PCI select PCI_EXPRESS_GENERIC_BRIDGE - imply VIRTIO_VGA imply PCI_DEVICES imply NVDIMM + imply TPM_TIS_SYSBUS select SERIAL select VIRTIO_PCI select PLATFORM_BUS @@ -12,8 +14,12 @@ config LOONGARCH_VIRT select LOONGARCH_PCH_PIC select LOONGARCH_PCH_MSI select LOONGARCH_EXTIOI + select LOONGARCH_IPI_KVM if KVM + select LOONGARCH_PCH_PIC_KVM if KVM + select LOONGARCH_EXTIOI_KVM if KVM select LS7A_RTC select SMBIOS + select ACPI_CPU_HOTPLUG select ACPI_PCI select ACPI_HW_REDUCED select FW_CFG_DMA diff --git a/hw/loongarch/acpi-build.c b/hw/loongarch/acpi-build.c index ae292fc5432670aed9b823f421428c859d9929f3..a54c5e0e70622e0e1212505df2cb6b47deef82d9 100644 --- a/hw/loongarch/acpi-build.c +++ b/hw/loongarch/acpi-build.c @@ -31,6 +31,7 @@ #include "hw/acpi/generic_event_device.h" #include "hw/pci-host/gpex.h" +#include "sysemu/sysemu.h" #include "sysemu/tpm.h" #include "hw/platform-bus.h" #include "hw/acpi/aml-build.h" @@ -46,6 +47,22 @@ #define ACPI_BUILD_DPRINTF(fmt, ...) #endif +static void virt_madt_cpu_entry(int uid, + const CPUArchIdList *apic_ids, + GArray *entry, bool force_enabled) +{ + uint32_t flags, apic_id = apic_ids->cpus[uid].arch_id; + + flags = apic_ids->cpus[uid].cpu || force_enabled ? 
1 /* Enabled */ : 0; + + /* Rev 1.0b, Table 5-13 Processor Local APIC Structure */ + build_append_int_noprefix(entry, 0, 1); /* Type */ + build_append_int_noprefix(entry, 8, 1); /* Length */ + build_append_int_noprefix(entry, uid, 1); /* ACPI Processor ID */ + build_append_int_noprefix(entry, apic_id, 1); /* APIC ID */ + build_append_int_noprefix(entry, flags, 4); /* Flags */ +} + /* build FADT */ static void init_common_fadt_data(AcpiFadtData *data) { @@ -105,14 +122,15 @@ build_facs(GArray *table_data) /* build MADT */ static void -build_madt(GArray *table_data, BIOSLinker *linker, LoongArchMachineState *lams) +build_madt(GArray *table_data, BIOSLinker *linker, + LoongArchVirtMachineState *lvms) { - MachineState *ms = MACHINE(lams); + MachineState *ms = MACHINE(lvms); MachineClass *mc = MACHINE_GET_CLASS(ms); const CPUArchIdList *arch_ids = mc->possible_cpu_arch_ids(ms); int i, arch_id; - AcpiTable table = { .sig = "APIC", .rev = 1, .oem_id = lams->oem_id, - .oem_table_id = lams->oem_table_id }; + AcpiTable table = { .sig = "APIC", .rev = 1, .oem_id = lvms->oem_id, + .oem_table_id = lvms->oem_table_id }; acpi_table_begin(&table, table_data); @@ -121,15 +139,17 @@ build_madt(GArray *table_data, BIOSLinker *linker, LoongArchMachineState *lams) build_append_int_noprefix(table_data, 1 /* PCAT_COMPAT */, 4); /* Flags */ for (i = 0; i < arch_ids->len; i++) { + uint32_t flags; + /* Processor Core Interrupt Controller Structure */ arch_id = arch_ids->cpus[i].arch_id; - + flags = arch_ids->cpus[i].cpu ? 1 : 0; build_append_int_noprefix(table_data, 17, 1); /* Type */ build_append_int_noprefix(table_data, 15, 1); /* Length */ build_append_int_noprefix(table_data, 1, 1); /* Version */ build_append_int_noprefix(table_data, i, 4); /* ACPI Processor ID */ build_append_int_noprefix(table_data, arch_id, 4); /* Core ID */ - build_append_int_noprefix(table_data, 1, 4); /* Flags */ + build_append_int_noprefix(table_data, flags, 4); /* Flags */ } /* Extend I/O Interrupt Controller Structure */ @@ -165,13 +185,14 @@ static void build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine) { int i, arch_id, node_id; - uint64_t mem_len, mem_base; - int nb_numa_nodes = machine->numa_state->num_nodes; - LoongArchMachineState *lams = LOONGARCH_MACHINE(machine); - MachineClass *mc = MACHINE_GET_CLASS(lams); + hwaddr len, base, gap; + NodeInfo *numa_info; + int nodes, nb_numa_nodes = machine->numa_state->num_nodes; + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(machine); + MachineClass *mc = MACHINE_GET_CLASS(lvms); const CPUArchIdList *arch_ids = mc->possible_cpu_arch_ids(machine); - AcpiTable table = { .sig = "SRAT", .rev = 1, .oem_id = lams->oem_id, - .oem_table_id = lams->oem_table_id }; + AcpiTable table = { .sig = "SRAT", .rev = 1, .oem_id = lvms->oem_id, + .oem_table_id = lvms->oem_table_id }; acpi_table_begin(&table, table_data); build_append_int_noprefix(table_data, 1, 4); /* Reserved */ @@ -195,41 +216,88 @@ build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine) build_append_int_noprefix(table_data, 0, 4); /* Reserved */ } - /* Node0 */ - build_srat_memory(table_data, VIRT_LOWMEM_BASE, VIRT_LOWMEM_SIZE, - 0, MEM_AFFINITY_ENABLED); - mem_base = VIRT_HIGHMEM_BASE; - if (!nb_numa_nodes) { - mem_len = machine->ram_size - VIRT_LOWMEM_SIZE; - } else { - mem_len = machine->numa_state->nodes[0].node_mem - VIRT_LOWMEM_SIZE; + base = VIRT_LOWMEM_BASE; + gap = VIRT_LOWMEM_SIZE; + numa_info = machine->numa_state->nodes; + nodes = nb_numa_nodes; + if (!nodes) { + nodes = 1; } 
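/*
 * Editor's sketch of the SRAT split performed by the loop below: each
 * node's RAM first fills what is left of the low window starting at
 * VIRT_LOWMEM_BASE, and any remainder continues at VIRT_HIGHMEM_BASE.
 * One node's first-pass decision, with gap = bytes left in the current
 * window; the helper name is illustrative.
 */
static void demo_split_node_ram(uint64_t len, uint64_t gap,
                                uint64_t *low_part, uint64_t *high_part)
{
    if (len >= gap) {
        *low_part = gap;          /* fills the rest of the low window */
        *high_part = len - gap;   /* spills into the high window */
    } else {
        *low_part = len;          /* node fits below the window limit */
        *high_part = 0;
    }
}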
- if (mem_len) - build_srat_memory(table_data, mem_base, mem_len, 0, MEM_AFFINITY_ENABLED); - - /* Node1 - Nodemax */ - if (nb_numa_nodes) { - mem_base += mem_len; - for (i = 1; i < nb_numa_nodes; ++i) { - if (machine->numa_state->nodes[i].node_mem > 0) { - build_srat_memory(table_data, mem_base, - machine->numa_state->nodes[i].node_mem, i, - MEM_AFFINITY_ENABLED); - mem_base += machine->numa_state->nodes[i].node_mem; - } + + for (i = 0; i < nodes; i++) { + if (nb_numa_nodes) { + len = numa_info[i].node_mem; + } else { + len = machine->ram_size; + } + + /* + * memory for the node is split into two parts + * lowram: [base, +gap) + * highram: [VIRT_HIGHMEM_BASE, +(len - gap)) + */ + if (len >= gap) { + build_srat_memory(table_data, base, gap, i, MEM_AFFINITY_ENABLED); + len -= gap; + base = VIRT_HIGHMEM_BASE; + gap = machine->ram_size - VIRT_LOWMEM_SIZE; + } + + if (len) { + build_srat_memory(table_data, base, len, i, MEM_AFFINITY_ENABLED); + base += len; + gap -= len; + } } if (machine->device_memory) { build_srat_memory(table_data, machine->device_memory->base, memory_region_size(&machine->device_memory->mr), - nb_numa_nodes - 1, + nodes - 1, MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED); } acpi_table_end(linker, &table); } +/* + * Serial Port Console Redirection Table (SPCR) + * https://learn.microsoft.com/en-us/windows-hardware/drivers/serports/serial-port-console-redirection-table + */ +static void +spcr_setup(GArray *table_data, BIOSLinker *linker, MachineState *machine) +{ + LoongArchVirtMachineState *lvms; + AcpiSpcrData serial = { + .interface_type = 0, /* 16550 compatible */ + .base_addr.id = AML_AS_SYSTEM_MEMORY, + .base_addr.width = 32, + .base_addr.offset = 0, + .base_addr.size = 1, + .base_addr.addr = VIRT_UART_BASE, + .interrupt_type = 0, /* Interrupt not supported */ + .pc_interrupt = 0, + .interrupt = VIRT_UART_IRQ, + .baud_rate = 7, /* 115200 */ + .parity = 0, + .stop_bits = 1, + .flow_control = 0, + .terminal_type = 3, /* ANSI */ + .language = 0, /* Language */ + .pci_device_id = 0xffff, /* not a PCI device */ + .pci_vendor_id = 0xffff, /* not a PCI device */ + .pci_bus = 0, + .pci_device = 0, + .pci_function = 0, + .pci_flags = 0, + .pci_segment = 0, + }; + + lvms = LOONGARCH_VIRT_MACHINE(machine); + build_spcr(table_data, linker, &serial, 2, lvms->oem_id, + lvms->oem_table_id); +} + typedef struct AcpiBuildState { /* Copy of table in RAM (for patching).
*/ @@ -241,23 +309,27 @@ struct AcpiBuildState { MemoryRegion *linker_mr; } AcpiBuildState; -static void build_uart_device_aml(Aml *table) +static void build_uart_device_aml(Aml *table, int index) { Aml *dev; Aml *crs; Aml *pkg0, *pkg1, *pkg2; - uint32_t uart_irq = VIRT_UART_IRQ; - - Aml *scope = aml_scope("_SB"); - dev = aml_device("COMA"); + Aml *scope; + uint32_t uart_irq; + uint64_t base; + + uart_irq = VIRT_UART_IRQ + index; + base = VIRT_UART_BASE + index * VIRT_UART_SIZE; + scope = aml_scope("_SB"); + dev = aml_device("COM%d", index); aml_append(dev, aml_name_decl("_HID", aml_string("PNP0501"))); - aml_append(dev, aml_name_decl("_UID", aml_int(0))); + aml_append(dev, aml_name_decl("_UID", aml_int(index))); aml_append(dev, aml_name_decl("_CCA", aml_int(1))); crs = aml_resource_template(); aml_append(crs, aml_qword_memory(AML_POS_DECODE, AML_MIN_FIXED, AML_MAX_FIXED, AML_NON_CACHEABLE, AML_READ_WRITE, - 0, VIRT_UART_BASE, VIRT_UART_BASE + VIRT_UART_SIZE - 1, + 0, base, base + VIRT_UART_SIZE - 1, 0, VIRT_UART_SIZE)); aml_append(crs, aml_interrupt(AML_CONSUMER, AML_LEVEL, AML_ACTIVE_HIGH, AML_SHARED, &uart_irq, 1)); @@ -279,23 +351,36 @@ static void build_la_ged_aml(Aml *dsdt, MachineState *machine) { uint32_t event; - LoongArchMachineState *lams = LOONGARCH_MACHINE(machine); + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(machine); + CPUHotplugFeatures opts; build_ged_aml(dsdt, "\\_SB."GED_DEVICE, - HOTPLUG_HANDLER(lams->acpi_ged), + HOTPLUG_HANDLER(lvms->acpi_ged), VIRT_SCI_IRQ, AML_SYSTEM_MEMORY, VIRT_GED_EVT_ADDR); - event = object_property_get_uint(OBJECT(lams->acpi_ged), + event = object_property_get_uint(OBJECT(lvms->acpi_ged), "ged-event", &error_abort); if (event & ACPI_GED_MEM_HOTPLUG_EVT) { build_memory_hotplug_aml(dsdt, machine->ram_slots, "\\_SB", NULL, AML_SYSTEM_MEMORY, VIRT_GED_MEM_ADDR); } + + if (event & ACPI_GED_CPU_HOTPLUG_EVT) { + opts.acpi_1_compatible = false; + opts.has_legacy_cphp = false; + opts.fw_unplugs_cpu = false; + opts.smi_path = NULL; + + build_cpus_aml(dsdt, machine, opts, virt_madt_cpu_entry, NULL, + VIRT_GED_CPUHP_ADDR, "\\_SB", + NULL, AML_SYSTEM_MEMORY); + } + acpi_dsdt_add_power_button(dsdt); } -static void build_pci_device_aml(Aml *scope, LoongArchMachineState *lams) +static void build_pci_device_aml(Aml *scope, LoongArchVirtMachineState *lvms) { struct GPEXConfig cfg = { .mmio64.base = VIRT_PCI_MEM_BASE, @@ -305,31 +390,54 @@ static void build_pci_device_aml(Aml *scope, LoongArchMachineState *lams) .ecam.base = VIRT_PCI_CFG_BASE, .ecam.size = VIRT_PCI_CFG_SIZE, .irq = VIRT_GSI_BASE + VIRT_DEVICE_IRQS, - .bus = lams->pci_bus, + .bus = lvms->pci_bus, }; acpi_dsdt_add_gpex(scope, &cfg); } -static void build_flash_aml(Aml *scope, LoongArchMachineState *lams) +static void build_flash_aml(Aml *scope, LoongArchVirtMachineState *lvms) { Aml *dev, *crs; + MemoryRegion *flash_mem; - hwaddr flash_base = VIRT_FLASH_BASE; - hwaddr flash_size = VIRT_FLASH_SIZE; + hwaddr flash0_base; + hwaddr flash0_size; + + hwaddr flash1_base; + hwaddr flash1_size; + + flash_mem = pflash_cfi01_get_memory(lvms->flash[0]); + flash0_base = flash_mem->addr; + flash0_size = memory_region_size(flash_mem); + + flash_mem = pflash_cfi01_get_memory(lvms->flash[1]); + flash1_base = flash_mem->addr; + flash1_size = memory_region_size(flash_mem); dev = aml_device("FLS0"); aml_append(dev, aml_name_decl("_HID", aml_string("LNRO0015"))); aml_append(dev, aml_name_decl("_UID", aml_int(0))); crs = aml_resource_template(); - aml_append(crs, aml_memory32_fixed(flash_base, flash_size, 
AML_READ_WRITE)); + aml_append(crs, aml_memory32_fixed(flash0_base, flash0_size, + AML_READ_WRITE)); + aml_append(dev, aml_name_decl("_CRS", crs)); + aml_append(scope, dev); + + dev = aml_device("FLS1"); + aml_append(dev, aml_name_decl("_HID", aml_string("LNRO0015"))); + aml_append(dev, aml_name_decl("_UID", aml_int(1))); + + crs = aml_resource_template(); + aml_append(crs, aml_memory32_fixed(flash1_base, flash1_size, + AML_READ_WRITE)); aml_append(dev, aml_name_decl("_CRS", crs)); aml_append(scope, dev); } #ifdef CONFIG_TPM -static void acpi_dsdt_add_tpm(Aml *scope, LoongArchMachineState *vms) +static void acpi_dsdt_add_tpm(Aml *scope, LoongArchVirtMachineState *vms) { PlatformBusDevice *pbus = PLATFORM_BUS_DEVICE(vms->platform_bus_dev); hwaddr pbus_base = VIRT_PLATFORM_BUS_BASEADDRESS; @@ -367,19 +475,21 @@ static void acpi_dsdt_add_tpm(Aml *scope, LoongArchMachineState *vms) static void build_dsdt(GArray *table_data, BIOSLinker *linker, MachineState *machine) { + int i; Aml *dsdt, *scope, *pkg; - LoongArchMachineState *lams = LOONGARCH_MACHINE(machine); - AcpiTable table = { .sig = "DSDT", .rev = 1, .oem_id = lams->oem_id, - .oem_table_id = lams->oem_table_id }; + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(machine); + AcpiTable table = { .sig = "DSDT", .rev = 1, .oem_id = lvms->oem_id, + .oem_table_id = lvms->oem_table_id }; acpi_table_begin(&table, table_data); dsdt = init_aml_allocator(); - build_uart_device_aml(dsdt); - build_pci_device_aml(dsdt, lams); + for (i = 0; i < VIRT_UART_COUNT; i++) + build_uart_device_aml(dsdt, i); + build_pci_device_aml(dsdt, lvms); build_la_ged_aml(dsdt, machine); - build_flash_aml(dsdt, lams); + build_flash_aml(dsdt, lvms); #ifdef CONFIG_TPM - acpi_dsdt_add_tpm(dsdt, lams); + acpi_dsdt_add_tpm(dsdt, lvms); #endif /* System State Package */ scope = aml_scope("\\"); @@ -398,7 +508,7 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, MachineState *machine) static void acpi_build(AcpiBuildTables *tables, MachineState *machine) { - LoongArchMachineState *lams = LOONGARCH_MACHINE(machine); + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(machine); GArray *table_offsets; AcpiFadtData fadt_data; unsigned facs, rsdt, dsdt; @@ -432,28 +542,30 @@ static void acpi_build(AcpiBuildTables *tables, MachineState *machine) fadt_data.dsdt_tbl_offset = &dsdt; fadt_data.xdsdt_tbl_offset = &dsdt; build_fadt(tables_blob, tables->linker, &fadt_data, - lams->oem_id, lams->oem_table_id); + lvms->oem_id, lvms->oem_table_id); acpi_add_table(table_offsets, tables_blob); - build_madt(tables_blob, tables->linker, lams); + build_madt(tables_blob, tables->linker, lvms); acpi_add_table(table_offsets, tables_blob); build_pptt(tables_blob, tables->linker, machine, - lams->oem_id, lams->oem_table_id); + lvms->oem_id, lvms->oem_table_id); acpi_add_table(table_offsets, tables_blob); build_srat(tables_blob, tables->linker, machine); + acpi_add_table(table_offsets, tables_blob); + spcr_setup(tables_blob, tables->linker, machine); if (machine->numa_state->num_nodes) { if (machine->numa_state->have_numa_distance) { acpi_add_table(table_offsets, tables_blob); - build_slit(tables_blob, tables->linker, machine, lams->oem_id, - lams->oem_table_id); + build_slit(tables_blob, tables->linker, machine, lvms->oem_id, + lvms->oem_table_id); } if (machine->numa_state->hmat_enabled) { acpi_add_table(table_offsets, tables_blob); build_hmat(tables_blob, tables->linker, machine->numa_state, - lams->oem_id, lams->oem_table_id); + lvms->oem_id, lvms->oem_table_id); } } @@ -463,8 +575,8 
@@ static void acpi_build(AcpiBuildTables *tables, MachineState *machine) .base = cpu_to_le64(VIRT_PCI_CFG_BASE), .size = cpu_to_le64(VIRT_PCI_CFG_SIZE), }; - build_mcfg(tables_blob, tables->linker, &mcfg, lams->oem_id, - lams->oem_table_id); + build_mcfg(tables_blob, tables->linker, &mcfg, lvms->oem_id, + lvms->oem_table_id); } #ifdef CONFIG_TPM @@ -472,8 +584,8 @@ static void acpi_build(AcpiBuildTables *tables, MachineState *machine) if (tpm_get_version(tpm_find()) == TPM_VERSION_2_0) { acpi_add_table(table_offsets, tables_blob); build_tpm2(tables_blob, tables->linker, - tables->tcpalog, lams->oem_id, - lams->oem_table_id); + tables->tcpalog, lvms->oem_id, + lvms->oem_table_id); } #endif /* Add tables supplied by user (if any) */ @@ -487,13 +599,13 @@ static void acpi_build(AcpiBuildTables *tables, MachineState *machine) /* RSDT is pointed to by RSDP */ rsdt = tables_blob->len; build_rsdt(tables_blob, tables->linker, table_offsets, - lams->oem_id, lams->oem_table_id); + lvms->oem_id, lvms->oem_table_id); /* RSDP is in FSEG memory, so allocate it separately */ { AcpiRsdpData rsdp_data = { .revision = 0, - .oem_id = lams->oem_id, + .oem_id = lvms->oem_id, .xsdt_tbl_offset = NULL, .rsdt_tbl_offset = &rsdt, }; @@ -570,17 +682,25 @@ static const VMStateDescription vmstate_acpi_build = { }, }; -void loongarch_acpi_setup(LoongArchMachineState *lams) +static bool loongarch_is_acpi_enabled(LoongArchVirtMachineState *lvms) +{ + if (lvms->acpi == ON_OFF_AUTO_OFF) { + return false; + } + return true; +} + +void loongarch_acpi_setup(LoongArchVirtMachineState *lvms) { AcpiBuildTables tables; AcpiBuildState *build_state; - if (!lams->fw_cfg) { + if (!lvms->fw_cfg) { ACPI_BUILD_DPRINTF("No fw cfg. Bailing out.\n"); return; } - if (!loongarch_is_acpi_enabled(lams)) { + if (!loongarch_is_acpi_enabled(lvms)) { ACPI_BUILD_DPRINTF("ACPI disabled. Bailing out.\n"); return; } @@ -588,7 +708,7 @@ void loongarch_acpi_setup(LoongArchMachineState *lams) build_state = g_malloc0(sizeof *build_state); acpi_build_tables_init(&tables); - acpi_build(&tables, MACHINE(lams)); + acpi_build(&tables, MACHINE(lvms)); /* Now expose it all to Guest */ build_state->table_mr = acpi_add_rom_blob(acpi_build_update, @@ -604,6 +724,9 @@ void loongarch_acpi_setup(LoongArchMachineState *lams) build_state, tables.rsdp, ACPI_BUILD_RSDP_FILE); + fw_cfg_add_file(lvms->fw_cfg, ACPI_BUILD_TPMLOG_FILE, tables.tcpalog->data, + acpi_data_len(tables.tcpalog)); + qemu_register_reset(acpi_build_reset, build_state); acpi_build_reset(build_state); vmstate_register(NULL, 0, &vmstate_acpi_build, build_state); diff --git a/hw/loongarch/boot.c b/hw/loongarch/boot.c new file mode 100644 index 0000000000000000000000000000000000000000..39c4a6d8c6db76d84a1c1e03b71433115f0f0c51 --- /dev/null +++ b/hw/loongarch/boot.c @@ -0,0 +1,373 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * LoongArch boot helper functions. + * + * Copyright (c) 2023 Loongson Technology Corporation Limited + */ + +#include "qemu/osdep.h" +#include "qemu/units.h" +#include "target/loongarch/cpu.h" +#include "hw/loongarch/virt.h" +#include "hw/loader.h" +#include "elf.h" +#include "qemu/error-report.h" +#include "sysemu/reset.h" +#include "sysemu/qtest.h" + +struct memmap_entry *memmap_table; +unsigned memmap_entries; + +ram_addr_t initrd_offset; +uint64_t initrd_size; + +static const unsigned int slave_boot_code[] = { + /* Configure reset ebase. */ + 0x0400302c, /* csrwr $t0, LOONGARCH_CSR_EENTRY */ + + /* Disable interrupt. 
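+     * (clear CRMD.IE, bit 2, i.e. the 0x4 immediate below, so the pending
+     * IPI can wake the core from 'idle' without vectoring while it waits
+     * for its entry point)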
+     */
+    0x0380100c,  /* ori        $t0, $zero, 0x4                */
+    0x04000180,  /* csrxchg    $zero, $t0, LOONGARCH_CSR_CRMD */
+
+    /* Clear mailbox. */
+    0x1400002d,  /* lu12i.w    $t1, 1(0x1)            */
+    0x038081ad,  /* ori        $t1, $t1, CORE_BUF_20  */
+    0x06481da0,  /* iocsrwr.d  $zero, $t1             */
+
+    /* Enable IPI interrupt. */
+    0x1400002c,  /* lu12i.w    $t0, 1(0x1)                  */
+    0x0400118c,  /* csrxchg    $t0, $t0, LOONGARCH_CSR_ECFG */
+    0x02fffc0c,  /* addi.d     $t0, $r0, -1(0xfff)          */
+    0x1400002d,  /* lu12i.w    $t1, 1(0x1)                  */
+    0x038011ad,  /* ori        $t1, $t1, CORE_EN_OFF        */
+    0x064819ac,  /* iocsrwr.w  $t0, $t1                     */
+    0x1400002d,  /* lu12i.w    $t1, 1(0x1)                  */
+    0x038081ad,  /* ori        $t1, $t1, CORE_BUF_20        */
+
+    /* Wait for wakeup <.L11>: */
+    0x06488000,  /* idle       0x0                             */
+    0x03400000,  /* andi       $zero, $zero, 0x0               */
+    0x064809ac,  /* iocsrrd.w  $t0, $t1                        */
+    0x43fff59f,  /* beqz       $t0, -12(0x7ffff4) # 48 <.L11>  */
+
+    /* Read and clear IPI interrupt. */
+    0x1400002d,  /* lu12i.w    $t1, 1(0x1)              */
+    0x064809ac,  /* iocsrrd.w  $t0, $t1                 */
+    0x1400002d,  /* lu12i.w    $t1, 1(0x1)              */
+    0x038031ad,  /* ori        $t1, $t1, CORE_CLEAR_OFF */
+    0x064819ac,  /* iocsrwr.w  $t0, $t1                 */
+
+    /* Disable IPI interrupt. */
+    0x1400002c,  /* lu12i.w    $t0, 1(0x1)                    */
+    0x04001180,  /* csrxchg    $zero, $t0, LOONGARCH_CSR_ECFG */
+
+    /* Read mail buf and jump to specified entry. */
+    0x1400002d,  /* lu12i.w    $t1, 1(0x1)           */
+    0x038081ad,  /* ori        $t1, $t1, CORE_BUF_20 */
+    0x06480dac,  /* iocsrrd.d  $t0, $t1              */
+    0x00150181,  /* move       $ra, $t0              */
+    0x4c000020,  /* jirl       $zero, $ra, 0         */
+};
+
+static inline void *guidcpy(void *dst, const void *src)
+{
+    return memcpy(dst, src, sizeof(efi_guid_t));
+}
+
+static void init_efi_boot_memmap(struct efi_system_table *systab,
+                                 void *p, void *start)
+{
+    unsigned i;
+    struct efi_boot_memmap *boot_memmap = p;
+    efi_guid_t tbl_guid = LINUX_EFI_BOOT_MEMMAP_GUID;
+
+    /* efi_configuration_table 1 */
+    guidcpy(&systab->tables[0].guid, &tbl_guid);
+    systab->tables[0].table = (struct efi_configuration_table *)(p - start);
+    systab->nr_tables = 1;
+
+    boot_memmap->desc_size = sizeof(efi_memory_desc_t);
+    boot_memmap->desc_ver = 1;
+    boot_memmap->map_size = 0;
+
+    efi_memory_desc_t *map = p + sizeof(struct efi_boot_memmap);
+    for (i = 0; i < memmap_entries; i++) {
+        map = (void *)boot_memmap + sizeof(*map);
+        map[i].type = memmap_table[i].type;
+        map[i].phys_addr = ROUND_UP(memmap_table[i].address, 64 * KiB);
+        map[i].num_pages = ROUND_DOWN(memmap_table[i].address +
+                       memmap_table[i].length - map[i].phys_addr, 64 * KiB);
+        p += sizeof(efi_memory_desc_t);
+    }
+}
+
+static void init_efi_initrd_table(struct efi_system_table *systab,
+                                  void *p, void *start)
+{
+    efi_guid_t tbl_guid = LINUX_EFI_INITRD_MEDIA_GUID;
+    struct efi_initrd *initrd_table = p;
+
+    /* efi_configuration_table 2 */
+    guidcpy(&systab->tables[1].guid, &tbl_guid);
+    systab->tables[1].table = (struct efi_configuration_table *)(p - start);
+    systab->nr_tables = 2;
+
+    initrd_table->base = initrd_offset;
+    initrd_table->size = initrd_size;
+}
+
+static void init_efi_fdt_table(struct efi_system_table *systab)
+{
+    efi_guid_t tbl_guid = DEVICE_TREE_GUID;
+
+    /* efi_configuration_table 3 */
+    guidcpy(&systab->tables[2].guid, &tbl_guid);
+    systab->tables[2].table = (void *)FDT_BASE;
+    systab->nr_tables = 3;
+}
+
+static void init_systab(struct loongarch_boot_info *info, void *p, void *start)
+{
+    void *bp_tables_start;
+    struct efi_system_table *systab = p;
+
+    info->a2 = p - start;
+
+    systab->hdr.signature = EFI_SYSTEM_TABLE_SIGNATURE;
+    systab->hdr.revision = EFI_SPECIFICATION_VERSION;
+    systab->hdr.headersize = sizeof(struct efi_system_table);
+    systab->fw_revision = FW_VERSION << 16 | FW_PATCHLEVEL << 8;
+    systab->runtime = 0;
+    systab->boottime = 0;
+    systab->nr_tables = 0;
+
+    p += ROUND_UP(sizeof(struct efi_system_table), 64 * KiB);
+
+    systab->tables = p;
+    bp_tables_start = p;
+
+    init_efi_boot_memmap(systab, p, start);
+    p += ROUND_UP(sizeof(struct efi_boot_memmap) +
+                  sizeof(efi_memory_desc_t) * memmap_entries, 64 * KiB);
+    init_efi_initrd_table(systab, p, start);
+    p += ROUND_UP(sizeof(struct efi_initrd), 64 * KiB);
+    init_efi_fdt_table(systab);
+
+    systab->tables = (struct efi_configuration_table *)(bp_tables_start - start);
+}
+
+static void init_cmdline(struct loongarch_boot_info *info, void *p, void *start)
+{
+    hwaddr cmdline_addr = p - start;
+
+    info->a0 = 1;
+    info->a1 = cmdline_addr;
+
+    g_strlcpy(p, info->kernel_cmdline, COMMAND_LINE_SIZE);
+}
+
+static uint64_t cpu_loongarch_virt_to_phys(void *opaque, uint64_t addr)
+{
+    return addr & MAKE_64BIT_MASK(0, TARGET_PHYS_ADDR_SPACE_BITS);
+}
+
+static void find_initrd_loadoffset(struct loongarch_boot_info *info,
+                                   uint64_t kernel_high, ssize_t kernel_size)
+{
+    hwaddr base, size, gap, low_end;
+    ram_addr_t initrd_end, initrd_start;
+
+    base = VIRT_LOWMEM_BASE;
+    gap = VIRT_LOWMEM_SIZE;
+    initrd_start = ROUND_UP(kernel_high + 4 * kernel_size, 64 * KiB);
+    initrd_end = initrd_start + initrd_size;
+
+    size = info->ram_size;
+    low_end = base + MIN(size, gap);
+    if (initrd_end <= low_end) {
+        initrd_offset = initrd_start;
+        return;
+    }
+
+    if (size <= gap) {
+        error_report("the low memory is too small for initial ram disk '%s', "
+                     "you need to expand the memory space",
+                     info->initrd_filename);
+        exit(1);
+    }
+
+    /*
+     * Try to load the initrd in the high memory.
+     */
+    size -= gap;
+    base = VIRT_HIGHMEM_BASE;
+    initrd_start = ROUND_UP(base, 64 * KiB);
+    if (initrd_size <= size) {
+        initrd_offset = initrd_start;
+        return;
+    }
+
+    error_report("the high memory is too small for initial ram disk '%s', "
+                 "you need to expand the memory space",
+                 info->initrd_filename);
+    exit(1);
+}
+
+static int64_t load_kernel_info(struct loongarch_boot_info *info)
+{
+    uint64_t kernel_entry, kernel_low, kernel_high;
+    ssize_t kernel_size;
+
+    kernel_size = load_elf(info->kernel_filename, NULL,
+                           cpu_loongarch_virt_to_phys, NULL,
+                           &kernel_entry, &kernel_low,
+                           &kernel_high, NULL, 0,
+                           EM_LOONGARCH, 1, 0);
+
+    if (kernel_size < 0) {
+        error_report("could not load kernel '%s': %s",
+                     info->kernel_filename,
+                     load_elf_strerror(kernel_size));
+        exit(1);
+    }
+
+    if (info->initrd_filename) {
+        initrd_size = get_image_size(info->initrd_filename);
+        if (initrd_size > 0) {
+            find_initrd_loadoffset(info, kernel_high, kernel_size);
+            initrd_size = load_image_targphys(info->initrd_filename,
+                                              initrd_offset, initrd_size);
+        }
+
+        if (initrd_size == (target_ulong)-1) {
+            error_report("could not load initial ram disk '%s'",
+                         info->initrd_filename);
+            exit(1);
+        }
+    } else {
+        initrd_size = 0;
+    }
+
+    return kernel_entry;
+}
+
+void reset_load_elf(void *opaque)
+{
+    LoongArchCPU *cpu = opaque;
+    CPULoongArchState *env = &cpu->env;
+
+    cpu_reset(CPU(cpu));
+    if (env->load_elf) {
+        if (cpu == LOONGARCH_CPU(first_cpu)) {
+            env->gpr[4] = env->boot_info->a0;
+            env->gpr[5] = env->boot_info->a1;
+            env->gpr[6] = env->boot_info->a2;
+        }
+        cpu_set_pc(CPU(cpu), env->elf_address);
+    }
+}
+
+static void fw_cfg_add_kernel_info(struct loongarch_boot_info *info,
+                                   FWCfgState *fw_cfg)
+{
+    /*
+     * Expose the kernel, the command line, and the initrd in fw_cfg.
+     * We don't process them here at all; it's all left to the
+     * firmware.
+     */
+    load_image_to_fw_cfg(fw_cfg,
+                         FW_CFG_KERNEL_SIZE, FW_CFG_KERNEL_DATA,
+                         info->kernel_filename,
+                         false);
+
+    if (info->initrd_filename) {
+        load_image_to_fw_cfg(fw_cfg,
+                             FW_CFG_INITRD_SIZE, FW_CFG_INITRD_DATA,
+                             info->initrd_filename, false);
+    }
+
+    if (info->kernel_cmdline) {
+        fw_cfg_add_i32(fw_cfg, FW_CFG_CMDLINE_SIZE,
+                       strlen(info->kernel_cmdline) + 1);
+        fw_cfg_add_string(fw_cfg, FW_CFG_CMDLINE_DATA,
+                          info->kernel_cmdline);
+    }
+}
+
+static void loongarch_firmware_boot(LoongArchVirtMachineState *lvms,
+                                    struct loongarch_boot_info *info)
+{
+    fw_cfg_add_kernel_info(info, lvms->fw_cfg);
+}
+
+static void init_boot_rom(struct loongarch_boot_info *info, void *p)
+{
+    void *start = p;
+
+    init_cmdline(info, p, start);
+    p += COMMAND_LINE_SIZE;
+
+    init_systab(info, p, start);
+}
+
+static void loongarch_direct_kernel_boot(struct loongarch_boot_info *info)
+{
+    void *p, *bp;
+    int64_t kernel_addr = VIRT_FLASH0_BASE;
+    LoongArchCPU *lacpu;
+    CPUState *cs;
+
+    if (info->kernel_filename) {
+        kernel_addr = load_kernel_info(info);
+    } else {
+        if (!qtest_enabled()) {
+            warn_report("No kernel provided, booting from flash drive.");
+        }
+    }
+
+    /* Load cmdline and system tables at [0 - 1 MiB] */
+    p = g_malloc0(1 * MiB);
+    bp = p;
+    init_boot_rom(info, p);
+    rom_add_blob_fixed_as("boot_info", bp, 1 * MiB, 0, &address_space_memory);
+
+    /* Load slave boot code at pflash0. */
+    void *boot_code = g_malloc0(VIRT_FLASH0_SIZE);
+    memcpy(boot_code, &slave_boot_code, sizeof(slave_boot_code));
+    rom_add_blob_fixed("boot_code", boot_code, VIRT_FLASH0_SIZE,
+                       VIRT_FLASH0_BASE);
+
+    CPU_FOREACH(cs) {
+        lacpu = LOONGARCH_CPU(cs);
+        lacpu->env.load_elf = true;
+        if (cs == first_cpu) {
+            lacpu->env.elf_address = kernel_addr;
+        } else {
+            lacpu->env.elf_address = VIRT_FLASH0_BASE;
+        }
+        lacpu->env.boot_info = info;
+    }
+
+    g_free(boot_code);
+    g_free(bp);
+}
+
+void loongarch_load_kernel(MachineState *ms, struct loongarch_boot_info *info)
+{
+    LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(ms);
+    int i;
+
+    /* register reset function */
+    for (i = 0; i < ms->smp.cpus; i++) {
+        qemu_register_reset(reset_load_elf, LOONGARCH_CPU(qemu_get_cpu(i)));
+    }
+
+    info->kernel_filename = ms->kernel_filename;
+    info->kernel_cmdline = ms->kernel_cmdline;
+    info->initrd_filename = ms->initrd_filename;
+
+    if (lvms->bios_loaded) {
+        loongarch_firmware_boot(lvms, info);
+    } else {
+        loongarch_direct_kernel_boot(info);
+    }
+}
diff --git a/hw/loongarch/fw_cfg.c b/hw/loongarch/fw_cfg.c
index f15a17416c4814100ac9d45ff23b0abacc856eed..35aeb2decbd1ed87da6689672b134b6684321d62 100644
--- a/hw/loongarch/fw_cfg.c
+++ b/hw/loongarch/fw_cfg.c
@@ -17,7 +17,7 @@ static void fw_cfg_boot_set(void *opaque, const char *boot_device,
     fw_cfg_modify_i16(opaque, FW_CFG_BOOT_DEVICE, boot_device[0]);
 }
 
-FWCfgState *loongarch_fw_cfg_init(ram_addr_t ram_size, MachineState *ms)
+FWCfgState *virt_fw_cfg_init(ram_addr_t ram_size, MachineState *ms)
 {
     FWCfgState *fw_cfg;
     int max_cpus = ms->smp.max_cpus;
diff --git a/hw/loongarch/fw_cfg.h b/hw/loongarch/fw_cfg.h
index 7c0de4db4ad44a582fddd1cf3e44e140b2a0d8b3..27ee68286ed115f5e3f99b27c8d3ad2ec1d1550f 100644
--- a/hw/loongarch/fw_cfg.h
+++ b/hw/loongarch/fw_cfg.h
@@ -11,5 +11,5 @@
 #include "hw/boards.h"
 #include "hw/nvram/fw_cfg.h"
 
-FWCfgState *loongarch_fw_cfg_init(ram_addr_t ram_size, MachineState *ms);
+FWCfgState *virt_fw_cfg_init(ram_addr_t ram_size, MachineState *ms);
 #endif
diff --git a/hw/loongarch/meson.build b/hw/loongarch/meson.build
index c0421502abab9a8ca9c3751b37969d9e6a8caf3d..d306d82c2ee0e6a671f8180c6554a7185d47c98e 100644
--- a/hw/loongarch/meson.build
+++ b/hw/loongarch/meson.build
@@ -1,6 +1,7 @@
 loongarch_ss = ss.source_set()
 loongarch_ss.add(files(
     'fw_cfg.c',
+    'boot.c',
 ))
 loongarch_ss.add(when: 'CONFIG_LOONGARCH_VIRT', if_true: [files('virt.c'), fdt])
 loongarch_ss.add(when: 'CONFIG_ACPI', if_true: files('acpi-build.c'))
diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c
index 4b7dc67a2d79a136438afa789d98916878d8a9e1..233297d78f397a5201a7b2a90d92315bd835a1ce 100644
--- a/hw/loongarch/virt.c
+++ b/hw/loongarch/virt.c
@@ -10,11 +10,13 @@
 #include "qapi/error.h"
 #include "hw/boards.h"
 #include "hw/char/serial.h"
+#include "sysemu/kvm.h"
 #include "sysemu/sysemu.h"
 #include "sysemu/qtest.h"
 #include "sysemu/runstate.h"
 #include "sysemu/reset.h"
 #include "sysemu/rtc.h"
+#include "sysemu/tcg.h"
 #include "hw/loongarch/virt.h"
 #include "exec/address-spaces.h"
 #include "hw/irq.h"
@@ -44,17 +47,43 @@
 #include "sysemu/tpm.h"
 #include "sysemu/block-backend.h"
 #include "hw/block/flash.h"
+#include "hw/virtio/virtio-iommu.h"
 #include "qemu/error-report.h"
+#include "qemu/guest-random.h"
+
+#define FDT_IRQ_FLAGS_EDGE_LO_HI 1
+#define FDT_IRQ_FLAGS_EDGE_HI_LO 2
+#define FDT_IRQ_FLAGS_LEVEL_HI   4
+#define FDT_IRQ_FLAGS_LEVEL_LO   8
 
-struct loaderparams {
-    uint64_t ram_size;
-    const char *kernel_filename;
-    const char *kernel_cmdline;
-    const char *initrd_filename;
-};
+static bool virt_is_veiointc_enabled(LoongArchVirtMachineState *lvms)
+{
+    if (lvms->veiointc == ON_OFF_AUTO_OFF) {
+        return false;
+    }
+    return true;
+}
 
-static void virt_flash_create(LoongArchMachineState *lams)
+static void virt_get_veiointc(Object *obj, Visitor *v, const char *name,
+                              void *opaque, Error **errp)
+{
+    LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(obj);
+    OnOffAuto veiointc = lvms->veiointc;
+
+    visit_type_OnOffAuto(v, name, &veiointc, errp);
+}
+
+static void virt_set_veiointc(Object *obj, Visitor *v, const char *name,
+                              void *opaque, Error **errp)
+{
+    LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(obj);
+
+    visit_type_OnOffAuto(v, name, &lvms->veiointc, errp);
+}
+
+static PFlashCFI01 *virt_flash_create1(LoongArchVirtMachineState *lvms,
+                                       const char *name,
+                                       const char *alias_prop_name)
 {
     DeviceState *dev = qdev_new(TYPE_PFLASH_CFI01);
 
@@ -66,84 +95,262 @@ static void virt_flash_create(LoongArchMachineState *lams)
     qdev_prop_set_uint16(dev, "id1", 0x18);
     qdev_prop_set_uint16(dev, "id2", 0x00);
     qdev_prop_set_uint16(dev, "id3", 0x00);
-    qdev_prop_set_string(dev, "name", "virt.flash");
-    object_property_add_child(OBJECT(lams), "virt.flash", OBJECT(dev));
-    object_property_add_alias(OBJECT(lams), "pflash",
+    qdev_prop_set_string(dev, "name", name);
+    object_property_add_child(OBJECT(lvms), name, OBJECT(dev));
+    object_property_add_alias(OBJECT(lvms), alias_prop_name,
                               OBJECT(dev), "drive");
+    return PFLASH_CFI01(dev);
+}
 
-    lams->flash = PFLASH_CFI01(dev);
+static void virt_flash_create(LoongArchVirtMachineState *lvms)
+{
+    lvms->flash[0] = virt_flash_create1(lvms, "virt.flash0", "pflash0");
+    lvms->flash[1] = virt_flash_create1(lvms, "virt.flash1", "pflash1");
 }
 
-static void virt_flash_map(LoongArchMachineState *lams,
-                           MemoryRegion *sysmem)
+static void virt_flash_map1(PFlashCFI01 *flash,
+                            hwaddr base, hwaddr size,
+                            MemoryRegion *sysmem)
 {
-    PFlashCFI01 *flash = lams->flash;
     DeviceState *dev = DEVICE(flash);
-    hwaddr base = VIRT_FLASH_BASE;
-    hwaddr size =
VIRT_FLASH_SIZE; + BlockBackend *blk; + hwaddr real_size = size; - assert(QEMU_IS_ALIGNED(size, VIRT_FLASH_SECTOR_SIZE)); - assert(size / VIRT_FLASH_SECTOR_SIZE <= UINT32_MAX); + blk = pflash_cfi01_get_blk(flash); + if (blk) { + real_size = blk_getlength(blk); + assert(real_size && real_size <= size); + } + + assert(QEMU_IS_ALIGNED(real_size, VIRT_FLASH_SECTOR_SIZE)); + assert(real_size / VIRT_FLASH_SECTOR_SIZE <= UINT32_MAX); - qdev_prop_set_uint32(dev, "num-blocks", size / VIRT_FLASH_SECTOR_SIZE); + qdev_prop_set_uint32(dev, "num-blocks", real_size / VIRT_FLASH_SECTOR_SIZE); sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal); memory_region_add_subregion(sysmem, base, sysbus_mmio_get_region(SYS_BUS_DEVICE(dev), 0)); +} + +static void virt_flash_map(LoongArchVirtMachineState *lvms, + MemoryRegion *sysmem) +{ + PFlashCFI01 *flash0 = lvms->flash[0]; + PFlashCFI01 *flash1 = lvms->flash[1]; + + virt_flash_map1(flash0, VIRT_FLASH0_BASE, VIRT_FLASH0_SIZE, sysmem); + virt_flash_map1(flash1, VIRT_FLASH1_BASE, VIRT_FLASH1_SIZE, sysmem); +} + +static void fdt_add_cpuic_node(LoongArchVirtMachineState *lvms, + uint32_t *cpuintc_phandle) +{ + MachineState *ms = MACHINE(lvms); + char *nodename; + + *cpuintc_phandle = qemu_fdt_alloc_phandle(ms->fdt); + nodename = g_strdup_printf("/cpuic"); + qemu_fdt_add_subnode(ms->fdt, nodename); + qemu_fdt_setprop_cell(ms->fdt, nodename, "phandle", *cpuintc_phandle); + qemu_fdt_setprop_string(ms->fdt, nodename, "compatible", + "loongson,cpu-interrupt-controller"); + qemu_fdt_setprop(ms->fdt, nodename, "interrupt-controller", NULL, 0); + qemu_fdt_setprop_cell(ms->fdt, nodename, "#interrupt-cells", 1); + g_free(nodename); +} + +static void fdt_add_eiointc_node(LoongArchVirtMachineState *lvms, + uint32_t *cpuintc_phandle, + uint32_t *eiointc_phandle) +{ + MachineState *ms = MACHINE(lvms); + char *nodename; + hwaddr extioi_base = APIC_BASE; + hwaddr extioi_size = EXTIOI_SIZE; + *eiointc_phandle = qemu_fdt_alloc_phandle(ms->fdt); + nodename = g_strdup_printf("/eiointc@%" PRIx64, extioi_base); + qemu_fdt_add_subnode(ms->fdt, nodename); + qemu_fdt_setprop_cell(ms->fdt, nodename, "phandle", *eiointc_phandle); + qemu_fdt_setprop_string(ms->fdt, nodename, "compatible", + "loongson,ls2k2000-eiointc"); + qemu_fdt_setprop(ms->fdt, nodename, "interrupt-controller", NULL, 0); + qemu_fdt_setprop_cell(ms->fdt, nodename, "#interrupt-cells", 1); + qemu_fdt_setprop_cell(ms->fdt, nodename, "interrupt-parent", + *cpuintc_phandle); + qemu_fdt_setprop_cell(ms->fdt, nodename, "interrupts", 3); + qemu_fdt_setprop_cells(ms->fdt, nodename, "reg", 0x0, + extioi_base, 0x0, extioi_size); + g_free(nodename); } -static void fdt_add_flash_node(LoongArchMachineState *lams) +static void fdt_add_pch_pic_node(LoongArchVirtMachineState *lvms, + uint32_t *eiointc_phandle, + uint32_t *pch_pic_phandle) { - MachineState *ms = MACHINE(lams); + MachineState *ms = MACHINE(lvms); char *nodename; + hwaddr pch_pic_base = VIRT_PCH_REG_BASE; + hwaddr pch_pic_size = VIRT_PCH_REG_SIZE; - hwaddr flash_base = VIRT_FLASH_BASE; - hwaddr flash_size = VIRT_FLASH_SIZE; + *pch_pic_phandle = qemu_fdt_alloc_phandle(ms->fdt); + nodename = g_strdup_printf("/platic@%" PRIx64, pch_pic_base); + qemu_fdt_add_subnode(ms->fdt, nodename); + qemu_fdt_setprop_cell(ms->fdt, nodename, "phandle", *pch_pic_phandle); + qemu_fdt_setprop_string(ms->fdt, nodename, "compatible", + "loongson,pch-pic-1.0"); + qemu_fdt_setprop_cells(ms->fdt, nodename, "reg", 0, + pch_pic_base, 0, pch_pic_size); + qemu_fdt_setprop(ms->fdt, nodename, 
"interrupt-controller", NULL, 0); + qemu_fdt_setprop_cell(ms->fdt, nodename, "#interrupt-cells", 2); + qemu_fdt_setprop_cell(ms->fdt, nodename, "interrupt-parent", + *eiointc_phandle); + qemu_fdt_setprop_cell(ms->fdt, nodename, "loongson,pic-base-vec", 0); + g_free(nodename); +} - nodename = g_strdup_printf("/flash@%" PRIx64, flash_base); +static void fdt_add_pch_msi_node(LoongArchVirtMachineState *lvms, + uint32_t *eiointc_phandle, + uint32_t *pch_msi_phandle) +{ + MachineState *ms = MACHINE(lvms); + char *nodename; + hwaddr pch_msi_base = VIRT_PCH_MSI_ADDR_LOW; + hwaddr pch_msi_size = VIRT_PCH_MSI_SIZE; + + *pch_msi_phandle = qemu_fdt_alloc_phandle(ms->fdt); + nodename = g_strdup_printf("/msi@%" PRIx64, pch_msi_base); + qemu_fdt_add_subnode(ms->fdt, nodename); + qemu_fdt_setprop_cell(ms->fdt, nodename, "phandle", *pch_msi_phandle); + qemu_fdt_setprop_string(ms->fdt, nodename, "compatible", + "loongson,pch-msi-1.0"); + qemu_fdt_setprop_cells(ms->fdt, nodename, "reg", + 0, pch_msi_base, + 0, pch_msi_size); + qemu_fdt_setprop(ms->fdt, nodename, "interrupt-controller", NULL, 0); + qemu_fdt_setprop_cell(ms->fdt, nodename, "interrupt-parent", + *eiointc_phandle); + qemu_fdt_setprop_cell(ms->fdt, nodename, "loongson,msi-base-vec", + VIRT_PCH_PIC_IRQ_NUM); + qemu_fdt_setprop_cell(ms->fdt, nodename, "loongson,msi-num-vecs", + EXTIOI_IRQS - VIRT_PCH_PIC_IRQ_NUM); + g_free(nodename); +} + +static void fdt_add_flash_node(LoongArchVirtMachineState *lvms) +{ + MachineState *ms = MACHINE(lvms); + char *nodename; + MemoryRegion *flash_mem; + + hwaddr flash0_base; + hwaddr flash0_size; + + hwaddr flash1_base; + hwaddr flash1_size; + + flash_mem = pflash_cfi01_get_memory(lvms->flash[0]); + flash0_base = flash_mem->addr; + flash0_size = memory_region_size(flash_mem); + + flash_mem = pflash_cfi01_get_memory(lvms->flash[1]); + flash1_base = flash_mem->addr; + flash1_size = memory_region_size(flash_mem); + + nodename = g_strdup_printf("/flash@%" PRIx64, flash0_base); qemu_fdt_add_subnode(ms->fdt, nodename); qemu_fdt_setprop_string(ms->fdt, nodename, "compatible", "cfi-flash"); qemu_fdt_setprop_sized_cells(ms->fdt, nodename, "reg", - 2, flash_base, 2, flash_size); + 2, flash0_base, 2, flash0_size, + 2, flash1_base, 2, flash1_size); qemu_fdt_setprop_cell(ms->fdt, nodename, "bank-width", 4); g_free(nodename); } -static void fdt_add_rtc_node(LoongArchMachineState *lams) +static void fdt_add_rtc_node(LoongArchVirtMachineState *lvms, + uint32_t *pch_pic_phandle) { char *nodename; hwaddr base = VIRT_RTC_REG_BASE; hwaddr size = VIRT_RTC_LEN; - MachineState *ms = MACHINE(lams); + MachineState *ms = MACHINE(lvms); nodename = g_strdup_printf("/rtc@%" PRIx64, base); qemu_fdt_add_subnode(ms->fdt, nodename); - qemu_fdt_setprop_string(ms->fdt, nodename, "compatible", "loongson,ls7a-rtc"); + qemu_fdt_setprop_string(ms->fdt, nodename, "compatible", + "loongson,ls7a-rtc"); qemu_fdt_setprop_sized_cells(ms->fdt, nodename, "reg", 2, base, 2, size); + qemu_fdt_setprop_cells(ms->fdt, nodename, "interrupts", + VIRT_RTC_IRQ - VIRT_GSI_BASE , FDT_IRQ_FLAGS_EDGE_LO_HI); + qemu_fdt_setprop_cell(ms->fdt, nodename, "interrupt-parent", + *pch_pic_phandle); g_free(nodename); } -static void fdt_add_uart_node(LoongArchMachineState *lams) +static void fdt_add_ged_reset(LoongArchVirtMachineState *lvms) +{ + char *name; + uint32_t ged_handle; + MachineState *ms = MACHINE(lvms); + hwaddr base = VIRT_GED_REG_ADDR; + hwaddr size = ACPI_GED_REG_COUNT; + + ged_handle = qemu_fdt_alloc_phandle(ms->fdt); + name = g_strdup_printf("/ged@%" PRIx64, base); 
+ qemu_fdt_add_subnode(ms->fdt, name); + qemu_fdt_setprop_string(ms->fdt, name, "compatible", "syscon"); + qemu_fdt_setprop_cells(ms->fdt, name, "reg", 0x0, base, 0x0, size); + /* 8 bit registers */ + qemu_fdt_setprop_cell(ms->fdt, name, "reg-shift", 0); + qemu_fdt_setprop_cell(ms->fdt, name, "reg-io-width", 1); + qemu_fdt_setprop_cell(ms->fdt, name, "phandle", ged_handle); + ged_handle = qemu_fdt_get_phandle(ms->fdt, name); + g_free(name); + + name = g_strdup_printf("/reboot"); + qemu_fdt_add_subnode(ms->fdt, name); + qemu_fdt_setprop_string(ms->fdt, name, "compatible", "syscon-reboot"); + qemu_fdt_setprop_cell(ms->fdt, name, "regmap", ged_handle); + qemu_fdt_setprop_cell(ms->fdt, name, "offset", ACPI_GED_REG_RESET); + qemu_fdt_setprop_cell(ms->fdt, name, "value", ACPI_GED_RESET_VALUE); + g_free(name); + + name = g_strdup_printf("/poweroff"); + qemu_fdt_add_subnode(ms->fdt, name); + qemu_fdt_setprop_string(ms->fdt, name, "compatible", "syscon-poweroff"); + qemu_fdt_setprop_cell(ms->fdt, name, "regmap", ged_handle); + qemu_fdt_setprop_cell(ms->fdt, name, "offset", ACPI_GED_REG_SLEEP_CTL); + qemu_fdt_setprop_cell(ms->fdt, name, "value", ACPI_GED_SLP_EN | + (ACPI_GED_SLP_TYP_S5 << ACPI_GED_SLP_TYP_POS)); + g_free(name); +} + +static void fdt_add_uart_node(LoongArchVirtMachineState *lvms, + uint32_t *pch_pic_phandle, hwaddr base, + int irq, bool chosen) { char *nodename; - hwaddr base = VIRT_UART_BASE; hwaddr size = VIRT_UART_SIZE; - MachineState *ms = MACHINE(lams); + MachineState *ms = MACHINE(lvms); nodename = g_strdup_printf("/serial@%" PRIx64, base); qemu_fdt_add_subnode(ms->fdt, nodename); qemu_fdt_setprop_string(ms->fdt, nodename, "compatible", "ns16550a"); qemu_fdt_setprop_cells(ms->fdt, nodename, "reg", 0x0, base, 0x0, size); qemu_fdt_setprop_cell(ms->fdt, nodename, "clock-frequency", 100000000); - qemu_fdt_setprop_string(ms->fdt, "/chosen", "stdout-path", nodename); + if (chosen) + qemu_fdt_setprop_string(ms->fdt, "/chosen", "stdout-path", nodename); + qemu_fdt_setprop_cells(ms->fdt, nodename, "interrupts", irq, FDT_IRQ_FLAGS_LEVEL_HI); + qemu_fdt_setprop_cell(ms->fdt, nodename, "interrupt-parent", + *pch_pic_phandle); g_free(nodename); } -static void create_fdt(LoongArchMachineState *lams) +static void create_fdt(LoongArchVirtMachineState *lvms) { - MachineState *ms = MACHINE(lams); + MachineState *ms = MACHINE(lvms); + uint8_t rng_seed[32]; - ms->fdt = create_device_tree(&lams->fdt_size); + ms->fdt = create_device_tree(&lvms->fdt_size); if (!ms->fdt) { error_report("create_device_tree() failed"); exit(1); @@ -155,12 +362,16 @@ static void create_fdt(LoongArchMachineState *lams) qemu_fdt_setprop_cell(ms->fdt, "/", "#address-cells", 0x2); qemu_fdt_setprop_cell(ms->fdt, "/", "#size-cells", 0x2); qemu_fdt_add_subnode(ms->fdt, "/chosen"); + + /* Pass seed to RNG */ + qemu_guest_getrandom_nofail(rng_seed, sizeof(rng_seed)); + qemu_fdt_setprop(ms->fdt, "/chosen", "rng-seed", rng_seed, sizeof(rng_seed)); } -static void fdt_add_cpu_nodes(const LoongArchMachineState *lams) +static void fdt_add_cpu_nodes(const LoongArchVirtMachineState *lvms) { int num; - const MachineState *ms = MACHINE(lams); + const MachineState *ms = MACHINE(lvms); int smp_cpus = ms->smp.cpus; qemu_fdt_add_subnode(ms->fdt, "/cpus"); @@ -214,11 +425,11 @@ static void fdt_add_cpu_nodes(const LoongArchMachineState *lams) } } -static void fdt_add_fw_cfg_node(const LoongArchMachineState *lams) +static void fdt_add_fw_cfg_node(const LoongArchVirtMachineState *lvms) { char *nodename; hwaddr base = VIRT_FWCFG_BASE; - const 
MachineState *ms = MACHINE(lams);
+    const MachineState *ms = MACHINE(lvms);
 
     nodename = g_strdup_printf("/fw_cfg@%" PRIx64, base);
     qemu_fdt_add_subnode(ms->fdt, nodename);
@@ -230,7 +441,62 @@ static void fdt_add_fw_cfg_node(const LoongArchMachineState *lams)
     g_free(nodename);
 }
 
-static void fdt_add_pcie_node(const LoongArchMachineState *lams)
+static void fdt_add_pcie_irq_map_node(const LoongArchVirtMachineState *lvms,
+                                      char *nodename,
+                                      uint32_t *pch_pic_phandle)
+{
+    int pin, dev;
+    uint32_t irq_map_stride = 0;
+    uint32_t full_irq_map[GPEX_NUM_IRQS * GPEX_NUM_IRQS * 10] = {};
+    uint32_t *irq_map = full_irq_map;
+    const MachineState *ms = MACHINE(lvms);
+
+    /*
+     * This code creates a standard swizzle of interrupts such that
+     * each device's first interrupt is based on its PCI_SLOT number
+     * (see pci_swizzle_map_irq_fn()); e.g. INTA# of the device in
+     * slot 1 is routed to irq 16 + ((0 + 1) % GPEX_NUM_IRQS) = 17.
+     *
+     * We only need one entry per interrupt in the table (not one per
+     * possible slot) since the interrupt-map-mask allows the table
+     * to wrap to any number of devices.
+     */
+    for (dev = 0; dev < GPEX_NUM_IRQS; dev++) {
+        int devfn = dev * 0x8;
+
+        for (pin = 0; pin < GPEX_NUM_IRQS; pin++) {
+            int irq_nr = 16 + ((pin + PCI_SLOT(devfn)) % GPEX_NUM_IRQS);
+            int i = 0;
+
+            /* Fill PCI address cells */
+            irq_map[i] = cpu_to_be32(devfn << 8);
+            i += 3;
+
+            /* Fill PCI Interrupt cells */
+            irq_map[i] = cpu_to_be32(pin + 1);
+            i += 1;
+
+            /* Fill interrupt controller phandle and cells */
+            irq_map[i++] = cpu_to_be32(*pch_pic_phandle);
+            irq_map[i++] = cpu_to_be32(irq_nr);
+
+            if (!irq_map_stride) {
+                irq_map_stride = i;
+            }
+            irq_map += irq_map_stride;
+        }
+    }
+
+    qemu_fdt_setprop(ms->fdt, nodename, "interrupt-map", full_irq_map,
+                     GPEX_NUM_IRQS * GPEX_NUM_IRQS *
+                     irq_map_stride * sizeof(uint32_t));
+    qemu_fdt_setprop_cells(ms->fdt, nodename, "interrupt-map-mask",
+                           0x1800, 0, 0, 0x7);
+}
+
+static void fdt_add_pcie_node(const LoongArchVirtMachineState *lvms,
+                              uint32_t *pch_pic_phandle,
+                              uint32_t *pch_msi_phandle)
 {
     char *nodename;
     hwaddr base_mmio = VIRT_PCI_MEM_BASE;
@@ -241,7 +507,7 @@ static void fdt_add_pcie_node(const LoongArchMachineState *lams)
     hwaddr size_pcie = VIRT_PCI_CFG_SIZE;
     hwaddr base = base_pcie;
 
-    const MachineState *ms = MACHINE(lams);
+    const MachineState *ms = MACHINE(lvms);
 
     nodename = g_strdup_printf("/pcie@%" PRIx64, base);
     qemu_fdt_add_subnode(ms->fdt, nodename);
@@ -261,34 +527,11 @@ static void fdt_add_pcie_node(const LoongArchMachineState *lams)
                                  2, base_pio, 2, size_pio,
                                  1, FDT_PCI_RANGE_MMIO, 2, base_mmio,
                                  2, base_mmio, 2, size_mmio);
-    g_free(nodename);
-}
-
-static void fdt_add_irqchip_node(LoongArchMachineState *lams)
-{
-    MachineState *ms = MACHINE(lams);
-    char *nodename;
-    uint32_t irqchip_phandle;
-
-    irqchip_phandle = qemu_fdt_alloc_phandle(ms->fdt);
-    qemu_fdt_setprop_cell(ms->fdt, "/", "interrupt-parent", irqchip_phandle);
+    qemu_fdt_setprop_cells(ms->fdt, nodename, "msi-map",
+                           0, *pch_msi_phandle, 0, 0x10000);
 
-    nodename = g_strdup_printf("/intc@%lx", VIRT_IOAPIC_REG_BASE);
-    qemu_fdt_add_subnode(ms->fdt, nodename);
-    qemu_fdt_setprop_cell(ms->fdt, nodename, "#interrupt-cells", 3);
-    qemu_fdt_setprop(ms->fdt, nodename, "interrupt-controller", NULL, 0);
-    qemu_fdt_setprop_cell(ms->fdt, nodename, "#address-cells", 0x2);
-    qemu_fdt_setprop_cell(ms->fdt, nodename, "#size-cells", 0x2);
-    qemu_fdt_setprop(ms->fdt, nodename, "ranges", NULL, 0);
-
-    qemu_fdt_setprop_string(ms->fdt, nodename, "compatible",
-                            "loongarch,ls7a");
+    fdt_add_pcie_irq_map_node(lvms, nodename, pch_pic_phandle);
 
-    qemu_fdt_setprop_sized_cells(ms->fdt, nodename, "reg",
-                                 2, VIRT_IOAPIC_REG_BASE,
-                                 2, PCH_PIC_ROUTE_ENTRY_OFFSET);
-
-    qemu_fdt_setprop_cell(ms->fdt, nodename, "phandle", irqchip_phandle);
     g_free(nodename);
 }
 
@@ -298,7 +541,8 @@ static void fdt_add_memory_node(MachineState *ms,
     char *nodename = g_strdup_printf("/memory@%" PRIx64, base);
 
     qemu_fdt_add_subnode(ms->fdt, nodename);
-    qemu_fdt_setprop_cells(ms->fdt, nodename, "reg", 2, base, 2, size);
+    qemu_fdt_setprop_cells(ms->fdt, nodename, "reg", base >> 32, base,
+                           size >> 32, size);
     qemu_fdt_setprop_string(ms->fdt, nodename, "device_type", "memory");
 
     if (ms->numa_state && ms->numa_state->num_nodes) {
@@ -308,15 +552,57 @@ static void fdt_add_memory_node(MachineState *ms,
     g_free(nodename);
 }
 
-static void virt_build_smbios(LoongArchMachineState *lams)
+static void fdt_add_memory_nodes(MachineState *ms)
+{
+    hwaddr base, size, ram_size, gap;
+    int i, nb_numa_nodes, nodes;
+    NodeInfo *numa_info;
+
+    ram_size = ms->ram_size;
+    base = VIRT_LOWMEM_BASE;
+    gap = VIRT_LOWMEM_SIZE;
+    nodes = nb_numa_nodes = ms->numa_state->num_nodes;
+    numa_info = ms->numa_state->nodes;
+    if (!nodes) {
+        nodes = 1;
+    }
+
+    for (i = 0; i < nodes; i++) {
+        if (nb_numa_nodes) {
+            size = numa_info[i].node_mem;
+        } else {
+            size = ram_size;
+        }
+
+        /*
+         * The memory of each node is split into two parts:
+         *   lowram:  [base, +gap)
+         *   highram: [VIRT_HIGHMEM_BASE, +(size - gap))
+         */
+        if (size >= gap) {
+            fdt_add_memory_node(ms, base, gap, i);
+            size -= gap;
+            base = VIRT_HIGHMEM_BASE;
+            gap = ram_size - VIRT_LOWMEM_SIZE;
+        }
+
+        if (size) {
+            fdt_add_memory_node(ms, base, size, i);
+            base += size;
+            gap -= size;
+        }
+    }
+}
+
+static void virt_build_smbios(LoongArchVirtMachineState *lvms)
 {
-    MachineState *ms = MACHINE(lams);
-    MachineClass *mc = MACHINE_GET_CLASS(lams);
+    MachineState *ms = MACHINE(lvms);
+    MachineClass *mc = MACHINE_GET_CLASS(lvms);
     uint8_t *smbios_tables, *smbios_anchor;
     size_t smbios_tables_len, smbios_anchor_len;
     const char *product = "QEMU Virtual Machine";
 
-    if (!lams->fw_cfg) {
+    if (!lvms->fw_cfg) {
         return;
     }
 
@@ -327,39 +613,29 @@ static void virt_build_smbios(LoongArchMachineState *lams)
                         &smbios_anchor, &smbios_anchor_len, &error_fatal);
 
     if (smbios_anchor) {
-        fw_cfg_add_file(lams->fw_cfg, "etc/smbios/smbios-tables",
+        fw_cfg_add_file(lvms->fw_cfg, "etc/smbios/smbios-tables",
                         smbios_tables, smbios_tables_len);
-        fw_cfg_add_file(lams->fw_cfg, "etc/smbios/smbios-anchor",
+        fw_cfg_add_file(lvms->fw_cfg, "etc/smbios/smbios-anchor",
                         smbios_anchor, smbios_anchor_len);
     }
 }
 
-static void virt_machine_done(Notifier *notifier, void *data)
+static void virt_done(Notifier *notifier, void *data)
 {
-    LoongArchMachineState *lams = container_of(notifier,
-                                        LoongArchMachineState, machine_done);
-    virt_build_smbios(lams);
-    loongarch_acpi_setup(lams);
+    LoongArchVirtMachineState *lvms = container_of(notifier,
+                                      LoongArchVirtMachineState, machine_done);
+    virt_build_smbios(lvms);
+    loongarch_acpi_setup(lvms);
 }
 
 static void virt_powerdown_req(Notifier *notifier, void *opaque)
 {
-    LoongArchMachineState *s = container_of(notifier,
-                                   LoongArchMachineState, powerdown_notifier);
+    LoongArchVirtMachineState *s;
 
+    s = container_of(notifier, LoongArchVirtMachineState, powerdown_notifier);
     acpi_send_event(s->acpi_ged, ACPI_POWER_DOWN_STATUS);
 }
 
-struct memmap_entry {
-    uint64_t address;
-    uint64_t length;
-    uint32_t type;
-    uint32_t reserved;
-};
-
-static struct memmap_entry *memmap_table;
-static unsigned memmap_entries;
-
 static void memmap_add_entry(uint64_t address, uint64_t length, uint32_t type)
 {
     /* Ensure there
are no duplicate entries. */ @@ -376,40 +652,22 @@ static void memmap_add_entry(uint64_t address, uint64_t length, uint32_t type) memmap_entries++; } -static uint64_t cpu_loongarch_virt_to_phys(void *opaque, uint64_t addr) -{ - return addr & MAKE_64BIT_MASK(0, TARGET_PHYS_ADDR_SPACE_BITS); -} - -static int64_t load_kernel_info(const struct loaderparams *loaderparams) -{ - uint64_t kernel_entry, kernel_low, kernel_high; - ssize_t kernel_size; - - kernel_size = load_elf(loaderparams->kernel_filename, NULL, - cpu_loongarch_virt_to_phys, NULL, - &kernel_entry, &kernel_low, - &kernel_high, NULL, 0, - EM_LOONGARCH, 1, 0); - - if (kernel_size < 0) { - error_report("could not load kernel '%s': %s", - loaderparams->kernel_filename, - load_elf_strerror(kernel_size)); - exit(1); - } - return kernel_entry; -} - -static DeviceState *create_acpi_ged(DeviceState *pch_pic, LoongArchMachineState *lams) +static DeviceState *create_acpi_ged(DeviceState *pch_pic, + LoongArchVirtMachineState *lvms) { DeviceState *dev; - MachineState *ms = MACHINE(lams); + MachineState *ms = MACHINE(lvms); + MachineClass *mc = MACHINE_GET_CLASS(lvms); uint32_t event = ACPI_GED_PWR_DOWN_EVT; if (ms->ram_slots) { event |= ACPI_GED_MEM_HOTPLUG_EVT; } + + if (mc->has_hotpluggable_cpus) { + event |= ACPI_GED_CPU_HOTPLUG_EVT; + } + dev = qdev_new(TYPE_ACPI_GED); qdev_prop_set_uint32(dev, "ged-event", event); sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal); @@ -421,6 +679,10 @@ static DeviceState *create_acpi_ged(DeviceState *pch_pic, LoongArchMachineState /* ged regs used for reset and power down */ sysbus_mmio_map(SYS_BUS_DEVICE(dev), 2, VIRT_GED_REG_ADDR); + if (mc->has_hotpluggable_cpus) { + sysbus_mmio_map(SYS_BUS_DEVICE(dev), 3, VIRT_GED_CPUHP_ADDR); + } + sysbus_connect_irq(SYS_BUS_DEVICE(dev), 0, qdev_get_gpio_in(pch_pic, VIRT_SCI_IRQ - VIRT_GSI_BASE)); return dev; @@ -451,9 +713,12 @@ static DeviceState *create_platform_bus(DeviceState *pch_pic) return dev; } -static void loongarch_devices_init(DeviceState *pch_pic, LoongArchMachineState *lams) +static void virt_devices_init(DeviceState *pch_pic, + LoongArchVirtMachineState *lvms, + uint32_t *pch_pic_phandle, + uint32_t *pch_msi_phandle) { - MachineClass *mc = MACHINE_GET_CLASS(lams); + MachineClass *mc = MACHINE_GET_CLASS(lvms); DeviceState *gpex_dev; SysBusDevice *d; PCIBus *pci_bus; @@ -465,7 +730,7 @@ static void loongarch_devices_init(DeviceState *pch_pic, LoongArchMachineState * d = SYS_BUS_DEVICE(gpex_dev); sysbus_realize_and_unref(d, &error_fatal); pci_bus = PCI_HOST_BRIDGE(gpex_dev)->bus; - lams->pci_bus = pci_bus; + lvms->pci_bus = pci_bus; /* Map only part size_ecam bytes of ECAM space */ ecam_alias = g_new0(MemoryRegion, 1); @@ -497,11 +762,21 @@ static void loongarch_devices_init(DeviceState *pch_pic, LoongArchMachineState * gpex_set_irq_num(GPEX_HOST(gpex_dev), i, 16 + i); } - serial_mm_init(get_system_memory(), VIRT_UART_BASE, 0, - qdev_get_gpio_in(pch_pic, - VIRT_UART_IRQ - VIRT_GSI_BASE), - 115200, serial_hd(0), DEVICE_LITTLE_ENDIAN); - fdt_add_uart_node(lams); + /* Add pcie node */ + fdt_add_pcie_node(lvms, pch_pic_phandle, pch_msi_phandle); + + /* + * Create uart fdt node in reverse order so that they appear + * in the finished device tree lowest address first + */ + for (i = VIRT_UART_COUNT; i --> 0;) { + hwaddr base = VIRT_UART_BASE + i * VIRT_UART_SIZE; + int irq = VIRT_UART_IRQ + i - VIRT_GSI_BASE; + serial_mm_init(get_system_memory(), base, 0, + qdev_get_gpio_in(pch_pic, irq), + 115200, serial_hd(i), DEVICE_LITTLE_ENDIAN); + 
fdt_add_uart_node(lvms, pch_pic_phandle, base, irq, i == 0); + } /* Network init */ for (i = 0; i < nb_nics; i++) { @@ -516,17 +791,18 @@ static void loongarch_devices_init(DeviceState *pch_pic, LoongArchMachineState * sysbus_create_simple("ls7a_rtc", VIRT_RTC_REG_BASE, qdev_get_gpio_in(pch_pic, VIRT_RTC_IRQ - VIRT_GSI_BASE)); - fdt_add_rtc_node(lams); + fdt_add_rtc_node(lvms, pch_pic_phandle); + fdt_add_ged_reset(lvms); /* acpi ged */ - lams->acpi_ged = create_acpi_ged(pch_pic, lams); + lvms->acpi_ged = create_acpi_ged(pch_pic, lvms); /* platform bus */ - lams->platform_bus_dev = create_platform_bus(pch_pic); + lvms->platform_bus_dev = create_platform_bus(pch_pic); } -static void loongarch_irq_init(LoongArchMachineState *lams) +static void virt_irq_init(LoongArchVirtMachineState *lvms) { - MachineState *ms = MACHINE(lams); + MachineState *ms = MACHINE(lvms); DeviceState *pch_pic, *pch_msi, *cpudev; DeviceState *ipi, *extioi; SysBusDevice *d; @@ -534,9 +810,7 @@ static void loongarch_irq_init(LoongArchMachineState *lams) CPULoongArchState *env; CPUState *cpu_state; int cpu, pin, i, start, num; - - extioi = qdev_new(TYPE_LOONGARCH_EXTIOI); - sysbus_realize_and_unref(SYS_BUS_DEVICE(extioi), &error_fatal); + uint32_t cpuintc_phandle, eiointc_phandle, pch_pic_phandle, pch_msi_phandle; /* * The connection of interrupts: @@ -559,93 +833,160 @@ static void loongarch_irq_init(LoongArchMachineState *lams) * | UARTs | | Devices | | Devices | * +--------+ +---------+ +---------+ */ - for (cpu = 0; cpu < ms->smp.cpus; cpu++) { - cpu_state = qemu_get_cpu(cpu); - cpudev = DEVICE(cpu_state); - lacpu = LOONGARCH_CPU(cpu_state); - env = &(lacpu->env); + if (kvm_enabled() && kvm_irqchip_in_kernel()) { + ipi = qdev_new(TYPE_KVM_LOONGARCH_IPI); + qdev_prop_set_int32(ipi, "num-cpu", ms->smp.max_cpus); + sysbus_realize_and_unref(SYS_BUS_DEVICE(ipi), &error_fatal); + } else { ipi = qdev_new(TYPE_LOONGARCH_IPI); + qdev_prop_set_uint32(ipi, "num-cpu", ms->smp.max_cpus); sysbus_realize_and_unref(SYS_BUS_DEVICE(ipi), &error_fatal); - /* connect ipi irq to cpu irq */ - qdev_connect_gpio_out(ipi, 0, qdev_get_gpio_in(cpudev, IRQ_IPI)); /* IPI iocsr memory region */ - memory_region_add_subregion(&env->system_iocsr, SMP_IPI_MAILBOX, - sysbus_mmio_get_region(SYS_BUS_DEVICE(ipi), - 0)); - memory_region_add_subregion(&env->system_iocsr, MAIL_SEND_ADDR, - sysbus_mmio_get_region(SYS_BUS_DEVICE(ipi), - 1)); - /* - * extioi iocsr memory region - * only one extioi is added on loongarch virt machine - * external device interrupt can only be routed to cpu 0-3 - */ - if (cpu < EXTIOI_CPUS) - memory_region_add_subregion(&env->system_iocsr, APIC_BASE, - sysbus_mmio_get_region(SYS_BUS_DEVICE(extioi), - cpu)); + memory_region_add_subregion(&lvms->system_iocsr, SMP_IPI_MAILBOX, + sysbus_mmio_get_region(SYS_BUS_DEVICE(ipi), 0)); + memory_region_add_subregion(&lvms->system_iocsr, MAIL_SEND_ADDR, + sysbus_mmio_get_region(SYS_BUS_DEVICE(ipi), 1)); + for (cpu = 0; cpu < ms->smp.cpus; cpu++) { + cpu_state = qemu_get_cpu(cpu); + cpudev = DEVICE(cpu_state); + + /* connect ipi irq to cpu irq */ + qdev_connect_gpio_out(ipi, cpu, qdev_get_gpio_in(cpudev, IRQ_IPI)); + } + } + + /* Add cpu interrupt-controller */ + fdt_add_cpuic_node(lvms, &cpuintc_phandle); + + for (cpu = 0; cpu < ms->smp.cpus; cpu++) { + cpu_state = qemu_get_cpu(cpu); + cpudev = DEVICE(cpu_state); + lacpu = LOONGARCH_CPU(cpu_state); + env = &(lacpu->env); + env->address_space_iocsr = &lvms->as_iocsr; env->ipistate = ipi; } - /* - * connect ext irq to the cpu irq - * cpu_pin[9:2] 
<= intc_pin[7:0] - */ - for (cpu = 0; cpu < MIN(ms->smp.cpus, EXTIOI_CPUS); cpu++) { - cpudev = DEVICE(qemu_get_cpu(cpu)); - for (pin = 0; pin < LS3A_INTC_IP; pin++) { - qdev_connect_gpio_out(extioi, (cpu * 8 + pin), - qdev_get_gpio_in(cpudev, pin + 2)); + lvms->ipi = ipi; + + /* Create EXTIOI device */ + if (kvm_enabled() && kvm_irqchip_in_kernel()) { + extioi = qdev_new(TYPE_KVM_LOONGARCH_EXTIOI); + qdev_prop_set_uint32(extioi, "num-cpu", ms->smp.max_cpus); + qdev_prop_set_bit(extioi, "has-virtualization-extension", true); + sysbus_realize_and_unref(SYS_BUS_DEVICE(extioi), &error_fatal); + } else { + extioi = qdev_new(TYPE_LOONGARCH_EXTIOI); + qdev_prop_set_uint32(extioi, "num-cpu", ms->smp.max_cpus); + if (virt_is_veiointc_enabled(lvms)) { + qdev_prop_set_bit(extioi, "has-virtualization-extension", true); } + sysbus_realize_and_unref(SYS_BUS_DEVICE(extioi), &error_fatal); + memory_region_add_subregion(&lvms->system_iocsr, APIC_BASE, + sysbus_mmio_get_region(SYS_BUS_DEVICE(extioi), 0)); + if (virt_is_veiointc_enabled(lvms)) { + memory_region_add_subregion(&lvms->system_iocsr, EXTIOI_VIRT_BASE, + sysbus_mmio_get_region(SYS_BUS_DEVICE(extioi), 1)); + } + /* + * connect ext irq to the cpu irq + * cpu_pin[9:2] <= intc_pin[7:0] + */ + for (cpu = 0; cpu < ms->smp.cpus; cpu++) { + cpudev = DEVICE(qemu_get_cpu(cpu)); + for (pin = 0; pin < LS3A_INTC_IP; pin++) { + qdev_connect_gpio_out(extioi, (cpu * 8 + pin), + qdev_get_gpio_in(cpudev, pin + 2)); + } + } } - pch_pic = qdev_new(TYPE_LOONGARCH_PCH_PIC); - num = VIRT_PCH_PIC_IRQ_NUM; - qdev_prop_set_uint32(pch_pic, "pch_pic_irq_num", num); - d = SYS_BUS_DEVICE(pch_pic); - sysbus_realize_and_unref(d, &error_fatal); - memory_region_add_subregion(get_system_memory(), VIRT_IOAPIC_REG_BASE, - sysbus_mmio_get_region(d, 0)); - memory_region_add_subregion(get_system_memory(), + lvms->extioi = extioi; + + /* Add Extend I/O Interrupt Controller node */ + fdt_add_eiointc_node(lvms, &cpuintc_phandle, &eiointc_phandle); + + /* Add PCH PIC node */ + fdt_add_pch_pic_node(lvms, &eiointc_phandle, &pch_pic_phandle); + + /* Add PCH MSI node */ + fdt_add_pch_msi_node(lvms, &eiointc_phandle, &pch_msi_phandle); + + if (kvm_enabled() && kvm_irqchip_in_kernel()) { + pch_pic = qdev_new(TYPE_KVM_LOONGARCH_PCH_PIC); + sysbus_realize_and_unref(SYS_BUS_DEVICE(pch_pic), &error_fatal); + } else { + pch_pic = qdev_new(TYPE_LOONGARCH_PCH_PIC); + num = VIRT_PCH_PIC_IRQ_NUM; + qdev_prop_set_uint32(pch_pic, "pch_pic_irq_num", num); + d = SYS_BUS_DEVICE(pch_pic); + sysbus_realize_and_unref(d, &error_fatal); + memory_region_add_subregion(get_system_memory(), VIRT_IOAPIC_REG_BASE, + sysbus_mmio_get_region(d, 0)); + memory_region_add_subregion(get_system_memory(), VIRT_IOAPIC_REG_BASE + PCH_PIC_ROUTE_ENTRY_OFFSET, sysbus_mmio_get_region(d, 1)); - memory_region_add_subregion(get_system_memory(), - VIRT_IOAPIC_REG_BASE + PCH_PIC_INT_STATUS_LO, - sysbus_mmio_get_region(d, 2)); - - /* Connect pch_pic irqs to extioi */ - for (i = 0; i < num; i++) { - qdev_connect_gpio_out(DEVICE(d), i, qdev_get_gpio_in(extioi, i)); + memory_region_add_subregion(get_system_memory(), + VIRT_IOAPIC_REG_BASE + PCH_PIC_INT_STATUS_LO, + sysbus_mmio_get_region(d, 2)); + /* Connect pch_pic irqs to extioi */ + for (i = 0; i < num; i++) { + qdev_connect_gpio_out(DEVICE(d), i, qdev_get_gpio_in(extioi, i)); + } } pch_msi = qdev_new(TYPE_LOONGARCH_PCH_MSI); + num = VIRT_PCH_PIC_IRQ_NUM; start = num; num = EXTIOI_IRQS - start; qdev_prop_set_uint32(pch_msi, "msi_irq_base", start); qdev_prop_set_uint32(pch_msi, 
"msi_irq_num", num); d = SYS_BUS_DEVICE(pch_msi); sysbus_realize_and_unref(d, &error_fatal); - sysbus_mmio_map(d, 0, VIRT_PCH_MSI_ADDR_LOW); - for (i = 0; i < num; i++) { + + if (!(kvm_enabled() && kvm_irqchip_in_kernel())) { /* Connect pch_msi irqs to extioi */ - qdev_connect_gpio_out(DEVICE(d), i, - qdev_get_gpio_in(extioi, i + start)); + for (i = 0; i < num; i++) { + qdev_connect_gpio_out(DEVICE(d), i, + qdev_get_gpio_in(extioi, i + start)); + } } - loongarch_devices_init(pch_pic, lams); + sysbus_mmio_map(d, 0, VIRT_PCH_MSI_ADDR_LOW); + virt_devices_init(pch_pic, lvms, &pch_pic_phandle, &pch_msi_phandle); } -static void loongarch_firmware_init(LoongArchMachineState *lams) +static void virt_firmware_init(LoongArchVirtMachineState *lvms) { - char *filename = MACHINE(lams)->firmware; + char *filename = MACHINE(lvms)->firmware; char *bios_name = NULL; - int bios_size; + int bios_size, i; + BlockBackend *pflash_blk0; + MemoryRegion *mr; - lams->bios_loaded = false; + lvms->bios_loaded = false; - virt_flash_map(lams, get_system_memory()); + /* Map legacy -drive if=pflash to machine properties */ + for (i = 0; i < ARRAY_SIZE(lvms->flash); i++) { + pflash_cfi01_legacy_drive(lvms->flash[i], + drive_get(IF_PFLASH, 0, i)); + } + + virt_flash_map(lvms, get_system_memory()); + + pflash_blk0 = pflash_cfi01_get_blk(lvms->flash[0]); + + if (pflash_blk0) { + if (filename) { + error_report("cannot use both '-bios' and '-drive if=pflash'" + "options at once"); + exit(1); + } + lvms->bios_loaded = true; + return; + } if (filename) { bios_name = qemu_find_file(QEMU_FILE_TYPE_BIOS, filename); @@ -654,166 +995,251 @@ static void loongarch_firmware_init(LoongArchMachineState *lams) exit(1); } - bios_size = load_image_targphys(bios_name, VIRT_BIOS_BASE, VIRT_BIOS_SIZE); + mr = sysbus_mmio_get_region(SYS_BUS_DEVICE(lvms->flash[0]), 0); + bios_size = load_image_mr(bios_name, mr); if (bios_size < 0) { error_report("Could not load ROM image '%s'", bios_name); exit(1); } - g_free(bios_name); + lvms->bios_loaded = true; + } +} + + +static MemTxResult virt_iocsr_misc_write(void *opaque, hwaddr addr, uint64_t val, + unsigned size, MemTxAttrs attrs) +{ + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(opaque); + uint64_t features; - memory_region_init_ram(&lams->bios, NULL, "loongarch.bios", - VIRT_BIOS_SIZE, &error_fatal); - memory_region_set_readonly(&lams->bios, true); - memory_region_add_subregion(get_system_memory(), VIRT_BIOS_BASE, &lams->bios); - lams->bios_loaded = true; + switch (addr) { + case MISC_FUNC_REG: + if (!virt_is_veiointc_enabled(lvms)) { + return MEMTX_OK; + } + + features = address_space_ldl(&lvms->as_iocsr, + EXTIOI_VIRT_BASE + EXTIOI_VIRT_CONFIG, + attrs, NULL); + if (val & BIT_ULL(IOCSRM_EXTIOI_EN)) { + features |= BIT(EXTIOI_ENABLE); + } + if (val & BIT_ULL(IOCSRM_EXTIOI_INT_ENCODE)) { + features |= BIT(EXTIOI_ENABLE_INT_ENCODE); + } + + address_space_stl(&lvms->as_iocsr, + EXTIOI_VIRT_BASE + EXTIOI_VIRT_CONFIG, + features, attrs, NULL); } + return MEMTX_OK; } -static void reset_load_elf(void *opaque) +static MemTxResult virt_iocsr_misc_read(void *opaque, hwaddr addr, + uint64_t *data, + unsigned size, MemTxAttrs attrs) { - LoongArchCPU *cpu = opaque; - CPULoongArchState *env = &cpu->env; + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(opaque); + uint64_t ret = 0; + int features; + + switch (addr) { + case VERSION_REG: + ret = 0x11ULL; + break; + case FEATURE_REG: + ret = BIT(IOCSRF_MSI) | BIT(IOCSRF_EXTIOI) | BIT(IOCSRF_CSRIPI); + if (kvm_enabled()) { + ret |= 
BIT(IOCSRF_VM); + } + break; + case VENDOR_REG: + ret = 0x6e6f73676e6f6f4cULL; /* "Loongson" */ + break; + case CPUNAME_REG: + ret = 0x303030354133ULL; /* "3A5000" */ + break; + case MISC_FUNC_REG: + if (!virt_is_veiointc_enabled(lvms)) { + ret |= BIT_ULL(IOCSRM_EXTIOI_EN); + break; + } + + features = address_space_ldl(&lvms->as_iocsr, + EXTIOI_VIRT_BASE + EXTIOI_VIRT_CONFIG, + attrs, NULL); + if (features & BIT(EXTIOI_ENABLE)) { + ret |= BIT_ULL(IOCSRM_EXTIOI_EN); + } - cpu_reset(CPU(cpu)); - if (env->load_elf) { - cpu_set_pc(CPU(cpu), env->elf_address); + if (features & BIT(EXTIOI_ENABLE_INT_ENCODE)) { + ret |= BIT_ULL(IOCSRM_EXTIOI_INT_ENCODE); + } + break; + default: + g_assert_not_reached(); } + + *data = ret; + return MEMTX_OK; } -static void fw_cfg_add_kernel_info(const struct loaderparams *loaderparams, - FWCfgState *fw_cfg) +static const MemoryRegionOps virt_iocsr_misc_ops = { + .read_with_attrs = virt_iocsr_misc_read, + .write_with_attrs = virt_iocsr_misc_write, + .endianness = DEVICE_LITTLE_ENDIAN, + .valid = { + .min_access_size = 4, + .max_access_size = 8, + }, + .impl = { + .min_access_size = 8, + .max_access_size = 8, + }, +}; + +static void fw_cfg_add_memory(MachineState *ms) { - /* - * Expose the kernel, the command line, and the initrd in fw_cfg. - * We don't process them here at all, it's all left to the - * firmware. - */ - load_image_to_fw_cfg(fw_cfg, - FW_CFG_KERNEL_SIZE, FW_CFG_KERNEL_DATA, - loaderparams->kernel_filename, - false); - - if (loaderparams->initrd_filename) { - load_image_to_fw_cfg(fw_cfg, - FW_CFG_INITRD_SIZE, FW_CFG_INITRD_DATA, - loaderparams->initrd_filename, false); + hwaddr base, size, ram_size, gap; + int nb_numa_nodes, nodes; + NodeInfo *numa_info; + + ram_size = ms->ram_size; + base = VIRT_LOWMEM_BASE; + gap = VIRT_LOWMEM_SIZE; + nodes = nb_numa_nodes = ms->numa_state->num_nodes; + numa_info = ms->numa_state->nodes; + if (!nodes) { + nodes = 1; } - if (loaderparams->kernel_cmdline) { - fw_cfg_add_i32(fw_cfg, FW_CFG_CMDLINE_SIZE, - strlen(loaderparams->kernel_cmdline) + 1); - fw_cfg_add_string(fw_cfg, FW_CFG_CMDLINE_DATA, - loaderparams->kernel_cmdline); + /* add fw_cfg memory map of node0 */ + if (nb_numa_nodes) { + size = numa_info[0].node_mem; + } else { + size = ram_size; } -} -static void loongarch_firmware_boot(LoongArchMachineState *lams, - const struct loaderparams *loaderparams) -{ - fw_cfg_add_kernel_info(loaderparams, lams->fw_cfg); -} + if (size >= gap) { + memmap_add_entry(base, gap, 1); + size -= gap; + base = VIRT_HIGHMEM_BASE; + } -static void loongarch_direct_kernel_boot(LoongArchMachineState *lams, - const struct loaderparams *loaderparams) -{ - MachineState *machine = MACHINE(lams); - int64_t kernel_addr = 0; - LoongArchCPU *lacpu; - int i; + if (size) { + memmap_add_entry(base, size, 1); + base += size; + } - kernel_addr = load_kernel_info(loaderparams); - if (!machine->firmware) { - for (i = 0; i < machine->smp.cpus; i++) { - lacpu = LOONGARCH_CPU(qemu_get_cpu(i)); - lacpu->env.load_elf = true; - lacpu->env.elf_address = kernel_addr; - } + if (nodes < 2) { + return; } + + /* add fw_cfg memory map of other nodes */ + if (numa_info[0].node_mem < gap && ram_size > gap) { + /* + * memory map for the maining nodes splited into two part + * lowram: [base, +(gap - numa_info[0].node_mem)) + * highram: [VIRT_HIGHMEM_BASE, +(ram_size - gap)) + */ + memmap_add_entry(base, gap - numa_info[0].node_mem, 1); + size = ram_size - gap; + base = VIRT_HIGHMEM_BASE; + } else { + size = ram_size - numa_info[0].node_mem; + } + + if (size) + 
memmap_add_entry(base, size, 1); } -static void loongarch_init(MachineState *machine) +static void virt_init(MachineState *machine) { LoongArchCPU *lacpu; const char *cpu_model = machine->cpu_type; - ram_addr_t offset = 0; - ram_addr_t ram_size = machine->ram_size; - uint64_t highram_size = 0, phyAddr = 0; MemoryRegion *address_space_mem = get_system_memory(); - LoongArchMachineState *lams = LOONGARCH_MACHINE(machine); - int nb_numa_nodes = machine->numa_state->num_nodes; - NodeInfo *numa_info = machine->numa_state->nodes; + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(machine); int i; - hwaddr fdt_base; - const CPUArchIdList *possible_cpus; + hwaddr base, size, ram_size = machine->ram_size; MachineClass *mc = MACHINE_GET_CLASS(machine); - CPUState *cpu; - char *ramName = NULL; - struct loaderparams loaderparams = { }; if (!cpu_model) { cpu_model = LOONGARCH_CPU_TYPE_NAME("la464"); } - if (ram_size < 1 * GiB) { - error_report("ram_size must be greater than 1G."); - exit(1); - } - create_fdt(lams); + create_fdt(lvms); + + /* Create IOCSR space */ + memory_region_init_io(&lvms->system_iocsr, OBJECT(machine), NULL, + machine, "iocsr", UINT64_MAX); + address_space_init(&lvms->as_iocsr, &lvms->system_iocsr, "IOCSR"); + memory_region_init_io(&lvms->iocsr_mem, OBJECT(machine), + &virt_iocsr_misc_ops, + machine, "iocsr_misc", 0x428); + memory_region_add_subregion(&lvms->system_iocsr, 0, &lvms->iocsr_mem); + /* Init CPUs */ + mc->possible_cpu_arch_ids(machine); + for (i = 0; i < machine->smp.cpus; i++) { + Object *cpuobj; + cpuobj = object_new(machine->cpu_type); + lacpu = LOONGARCH_CPU(cpuobj); - possible_cpus = mc->possible_cpu_arch_ids(machine); - for (i = 0; i < possible_cpus->len; i++) { - cpu = cpu_create(machine->cpu_type); - cpu->cpu_index = i; - machine->possible_cpus->cpus[i].cpu = OBJECT(cpu); - lacpu = LOONGARCH_CPU(cpu); lacpu->phy_id = machine->possible_cpus->cpus[i].arch_id; + object_property_set_int(cpuobj, "socket-id", + machine->possible_cpus->cpus[i].props.socket_id, + NULL); + object_property_set_int(cpuobj, "core-id", + machine->possible_cpus->cpus[i].props.core_id, + NULL); + object_property_set_int(cpuobj, "thread-id", + machine->possible_cpus->cpus[i].props.thread_id, + NULL); + /* + * The CPU in place at the time of machine startup will also enter + * the CPU hot-plug process when it is created, but at this time, + * the GED device has not been created, resulting in exit in the CPU + * hot-plug process, which can avoid the incumbent CPU repeatedly + * applying for resources. + * + * The interrupt resource of the in-place CPU will be requested at + * the current function call loongarch_irq_init(). + * + * The interrupt resource of the subsequently inserted CPU will be + * requested in the CPU hot-plug process. 
+ */ + qdev_realize(DEVICE(cpuobj), NULL, &error_fatal); + object_unref(cpuobj); } - fdt_add_cpu_nodes(lams); + + fdt_add_cpu_nodes(lvms); + fdt_add_memory_nodes(machine); + fw_cfg_add_memory(machine); /* Node0 memory */ - memmap_add_entry(VIRT_LOWMEM_BASE, VIRT_LOWMEM_SIZE, 1); - fdt_add_memory_node(machine, VIRT_LOWMEM_BASE, VIRT_LOWMEM_SIZE, 0); - memory_region_init_alias(&lams->lowmem, NULL, "loongarch.node0.lowram", - machine->ram, offset, VIRT_LOWMEM_SIZE); - memory_region_add_subregion(address_space_mem, phyAddr, &lams->lowmem); - - offset += VIRT_LOWMEM_SIZE; - if (nb_numa_nodes > 0) { - assert(numa_info[0].node_mem > VIRT_LOWMEM_SIZE); - highram_size = numa_info[0].node_mem - VIRT_LOWMEM_SIZE; - } else { - highram_size = ram_size - VIRT_LOWMEM_SIZE; + size = ram_size; + base = VIRT_LOWMEM_BASE; + if (size > VIRT_LOWMEM_SIZE) { + size = VIRT_LOWMEM_SIZE; } - phyAddr = VIRT_HIGHMEM_BASE; - memmap_add_entry(phyAddr, highram_size, 1); - fdt_add_memory_node(machine, phyAddr, highram_size, 0); - memory_region_init_alias(&lams->highmem, NULL, "loongarch.node0.highram", - machine->ram, offset, highram_size); - memory_region_add_subregion(address_space_mem, phyAddr, &lams->highmem); - - /* Node1 - Nodemax memory */ - offset += highram_size; - phyAddr += highram_size; - - for (i = 1; i < nb_numa_nodes; i++) { - MemoryRegion *nodemem = g_new(MemoryRegion, 1); - ramName = g_strdup_printf("loongarch.node%d.ram", i); - memory_region_init_alias(nodemem, NULL, ramName, machine->ram, - offset, numa_info[i].node_mem); - memory_region_add_subregion(address_space_mem, phyAddr, nodemem); - memmap_add_entry(phyAddr, numa_info[i].node_mem, 1); - fdt_add_memory_node(machine, phyAddr, numa_info[i].node_mem, i); - offset += numa_info[i].node_mem; - phyAddr += numa_info[i].node_mem; + + memory_region_init_alias(&lvms->lowmem, NULL, "loongarch.lowram", + machine->ram, base, size); + memory_region_add_subregion(address_space_mem, base, &lvms->lowmem); + base += size; + if (ram_size - size) { + base = VIRT_HIGHMEM_BASE; + memory_region_init_alias(&lvms->highmem, NULL, "loongarch.highram", + machine->ram, VIRT_LOWMEM_BASE + size, ram_size - size); + memory_region_add_subregion(address_space_mem, base, &lvms->highmem); + base += ram_size - size; } /* initialize device memory address space */ if (machine->ram_size < machine->maxram_size) { ram_addr_t device_mem_size = machine->maxram_size - machine->ram_size; - hwaddr device_mem_base; if (machine->ram_slots > ACPI_MAX_RAM_SLOTS) { error_report("unsupported amount of memory slots: %"PRIu64, @@ -827,55 +1253,35 @@ static void loongarch_init(MachineState *machine) "%d bytes", TARGET_PAGE_SIZE); exit(EXIT_FAILURE); } - /* device memory base is the top of high memory address. */ - device_mem_base = ROUND_UP(VIRT_HIGHMEM_BASE + highram_size, 1 * GiB); - machine_memory_devices_init(machine, device_mem_base, device_mem_size); + machine_memory_devices_init(machine, base, device_mem_size); } /* load the BIOS image. 
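 * The firmware, if any, is mapped by virt_firmware_init() below; with a
 * direct kernel boot and no firmware, loongarch_load_kernel() at the end
 * of this function loads the kernel instead.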
*/ - loongarch_firmware_init(lams); + virt_firmware_init(lvms); /* fw_cfg init */ - lams->fw_cfg = loongarch_fw_cfg_init(ram_size, machine); - rom_set_fw(lams->fw_cfg); - if (lams->fw_cfg != NULL) { - fw_cfg_add_file(lams->fw_cfg, "etc/memmap", + lvms->fw_cfg = virt_fw_cfg_init(ram_size, machine); + rom_set_fw(lvms->fw_cfg); + if (lvms->fw_cfg != NULL) { + fw_cfg_add_file(lvms->fw_cfg, "etc/memmap", memmap_table, sizeof(struct memmap_entry) * (memmap_entries)); } - fdt_add_fw_cfg_node(lams); - loaderparams.ram_size = ram_size; - loaderparams.kernel_filename = machine->kernel_filename; - loaderparams.kernel_cmdline = machine->kernel_cmdline; - loaderparams.initrd_filename = machine->initrd_filename; - /* load the kernel. */ - if (loaderparams.kernel_filename) { - if (lams->bios_loaded) { - loongarch_firmware_boot(lams, &loaderparams); - } else { - loongarch_direct_kernel_boot(lams, &loaderparams); - } - } - fdt_add_flash_node(lams); - /* register reset function */ - for (i = 0; i < machine->smp.cpus; i++) { - lacpu = LOONGARCH_CPU(qemu_get_cpu(i)); - qemu_register_reset(reset_load_elf, lacpu); - } + fdt_add_fw_cfg_node(lvms); + fdt_add_flash_node(lvms); + /* Initialize the IO interrupt subsystem */ - loongarch_irq_init(lams); - fdt_add_irqchip_node(lams); - platform_bus_add_all_fdt_nodes(machine->fdt, "/intc", + virt_irq_init(lvms); + platform_bus_add_all_fdt_nodes(machine->fdt, "/platic", VIRT_PLATFORM_BUS_BASEADDRESS, VIRT_PLATFORM_BUS_SIZE, VIRT_PLATFORM_BUS_IRQ); - lams->machine_done.notify = virt_machine_done; - qemu_add_machine_init_done_notifier(&lams->machine_done); + lvms->machine_done.notify = virt_done; + qemu_add_machine_init_done_notifier(&lvms->machine_done); /* connect powerdown request */ - lams->powerdown_notifier.notify = virt_powerdown_req; - qemu_register_powerdown_notifier(&lams->powerdown_notifier); + lvms->powerdown_notifier.notify = virt_powerdown_req; + qemu_register_powerdown_notifier(&lvms->powerdown_notifier); - fdt_add_pcie_node(lams); /* * Since lowmem region starts from 0 and Linux kernel legacy start address * at 2 MiB, FDT base address is located at 1 MiB to avoid NULL pointer @@ -883,44 +1289,241 @@ static void loongarch_init(MachineState *machine) * Put the FDT into the memory map as a ROM image: this will ensure * the FDT is copied again upon reset, even if addr points into RAM. 
*/ - fdt_base = 1 * MiB; - qemu_fdt_dumpdtb(machine->fdt, lams->fdt_size); - rom_add_blob_fixed("fdt", machine->fdt, lams->fdt_size, fdt_base); + qemu_fdt_dumpdtb(machine->fdt, lvms->fdt_size); + rom_add_blob_fixed_as("fdt", machine->fdt, lvms->fdt_size, FDT_BASE, + &address_space_memory); + qemu_register_reset_nosnapshotload(qemu_fdt_randomize_seeds, + rom_ptr_for_as(&address_space_memory, FDT_BASE, lvms->fdt_size)); + + lvms->bootinfo.ram_size = ram_size; + loongarch_load_kernel(machine, &lvms->bootinfo); } -bool loongarch_is_acpi_enabled(LoongArchMachineState *lams) +static void virt_get_acpi(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) { - if (lams->acpi == ON_OFF_AUTO_OFF) { - return false; - } - return true; + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(obj); + OnOffAuto acpi = lvms->acpi; + + visit_type_OnOffAuto(v, name, &acpi, errp); } -static void loongarch_get_acpi(Object *obj, Visitor *v, const char *name, +static void virt_set_acpi(Object *obj, Visitor *v, const char *name, void *opaque, Error **errp) { - LoongArchMachineState *lams = LOONGARCH_MACHINE(obj); - OnOffAuto acpi = lams->acpi; + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(obj); - visit_type_OnOffAuto(v, name, &acpi, errp); + visit_type_OnOffAuto(v, name, &lvms->acpi, errp); } -static void loongarch_set_acpi(Object *obj, Visitor *v, const char *name, - void *opaque, Error **errp) +static void virt_initfn(Object *obj) { - LoongArchMachineState *lams = LOONGARCH_MACHINE(obj); + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(obj); - visit_type_OnOffAuto(v, name, &lams->acpi, errp); + if (tcg_enabled()) { + lvms->veiointc = ON_OFF_AUTO_OFF; + } + lvms->acpi = ON_OFF_AUTO_AUTO; + lvms->oem_id = g_strndup(ACPI_BUILD_APPNAME6, 6); + lvms->oem_table_id = g_strndup(ACPI_BUILD_APPNAME8, 8); + virt_flash_create(lvms); } -static void loongarch_machine_initfn(Object *obj) +static int virt_get_arch_id_from_topo(MachineState *ms, LoongArchCPUTopo *topo) { - LoongArchMachineState *lams = LOONGARCH_MACHINE(obj); + int arch_id, sock_vcpu_num, core_vcpu_num; + + /* + * calculate total logical cpus across socket/core/thread. + * For more information on how to calculate the arch_id, + * you can refer to the CPU Topology chapter of the + * docs/system/loongarch/virt.rst document. 
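+ *
+ * In short, with S sockets, C cores per socket and T threads per core:
+ *
+ *   arch_id = socket_id * (C * T) + core_id * T + thread_id
+ *
+ * e.g. with -smp sockets=2,cores=2,threads=2 the vCPU at
+ * socket 1 / core 0 / thread 1 gets arch_id 1 * 4 + 0 * 2 + 1 = 5;
+ * virt_get_cpu_topo_from_index() below inverts the mapping:
+ * 5 / 4 = 1, (5 / 2) % 2 = 0, 5 % 2 = 1.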
+ */ + sock_vcpu_num = topo->socket_id * (ms->smp.threads * ms->smp.cores); + core_vcpu_num = topo->core_id * ms->smp.threads; + + /* get the vcpu-id (logical cpu index) for this vcpu from its topology */ + arch_id = (sock_vcpu_num + core_vcpu_num) + topo->thread_id; + + assert(arch_id >= 0 && arch_id < ms->possible_cpus->len); - lams->acpi = ON_OFF_AUTO_AUTO; - lams->oem_id = g_strndup(ACPI_BUILD_APPNAME6, 6); - lams->oem_table_id = g_strndup(ACPI_BUILD_APPNAME8, 8); - virt_flash_create(lams); + return arch_id; +} + +/* find cpu slot in machine->possible_cpus by arch_id */ +static CPUArchId *virt_find_cpu_slot(MachineState *ms, int arch_id, int *index) +{ + int n; + for (n = 0; n < ms->possible_cpus->len; n++) { + if (ms->possible_cpus->cpus[n].arch_id == arch_id) { + if (index) { + *index = n; + } + return &ms->possible_cpus->cpus[n]; + } + } + + return NULL; +} + +static void virt_cpu_pre_plug(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + MachineState *ms = MACHINE(OBJECT(hotplug_dev)); + MachineClass *mc = MACHINE_GET_CLASS(hotplug_dev); + LoongArchCPU *cpu = LOONGARCH_CPU(dev); + CPUState *cs = CPU(dev); + CPUArchId *cpu_slot; + Error *local_err = NULL; + LoongArchCPUTopo topo; + int arch_id, index; + + if (dev->hotplugged && !mc->has_hotpluggable_cpus) { + error_setg(&local_err, "CPU hotplug not supported for this machine"); + goto out; + } + + /* sanity check the cpu */ + if (!object_dynamic_cast(OBJECT(cpu), ms->cpu_type)) { + error_setg(&local_err, "Invalid CPU type, expected cpu type: '%s'", + ms->cpu_type); + goto out; + } + + if ((cpu->thread_id < 0) || (cpu->thread_id >= ms->smp.threads)) { + error_setg(&local_err, + "Invalid thread-id %u specified, must be in range 0:%u", + cpu->thread_id, ms->smp.threads - 1); + goto out; + } + + if ((cpu->core_id < 0) || (cpu->core_id >= ms->smp.cores)) { + error_setg(&local_err, + "Invalid core-id %u specified, must be in range 0:%u", + cpu->core_id, ms->smp.cores - 1); + goto out; + } + + if ((cpu->socket_id < 0) || (cpu->socket_id >= ms->smp.sockets)) { + error_setg(&local_err, + "Invalid socket-id %u specified, must be in range 0:%u", + cpu->socket_id, ms->smp.sockets - 1); + goto out; + } + + topo.socket_id = cpu->socket_id; + topo.core_id = cpu->core_id; + topo.thread_id = cpu->thread_id; + arch_id = virt_get_arch_id_from_topo(ms, &topo); + cpu_slot = virt_find_cpu_slot(ms, arch_id, &index); + if (CPU(cpu_slot->cpu)) { + error_setg(&local_err, + "cpu(id%d=%d:%d:%d) with arch-id %" PRIu64 " already exists", + cs->cpu_index, cpu->socket_id, cpu->core_id, + cpu->thread_id, cpu_slot->arch_id); + goto out; + } + cpu->phy_id = arch_id; + /* + * Update cpu_index so that it can be used directly as an index into + * the possible_cpus array (see virt_cpu_index_to_props()). + */ + cs->cpu_index = index; + numa_cpu_pre_plug(cpu_slot, dev, &local_err); + return; + +out: + error_propagate(errp, local_err); +} + +static void virt_cpu_unplug_request(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(hotplug_dev); + Error *local_err = NULL; + HotplugHandlerClass *hhc; + LoongArchCPU *cpu = LOONGARCH_CPU(dev); + CPUState *cs = CPU(dev); + + if (!lvms->acpi_ged) { + error_setg(&local_err, "CPU hot unplug not supported without ACPI"); + error_propagate(errp, local_err); + return; + } + + if (cs->cpu_index == 0) { + error_setg(&local_err, + "hot-unplug of boot cpu(id%d=%d:%d:%d) not supported", + cs->cpu_index, cpu->socket_id, + cpu->core_id, cpu->thread_id); +
error_propagate(errp, local_err); + return; + } + + hhc = HOTPLUG_HANDLER_GET_CLASS(lvms->acpi_ged); + hhc->unplug_request(HOTPLUG_HANDLER(lvms->acpi_ged), dev, &local_err); +} + +static void virt_cpu_unplug(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + CPUArchId *cpu_slot; + HotplugHandlerClass *hhc; + Error *local_err = NULL; + LoongArchCPU *cpu = LOONGARCH_CPU(dev); + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(hotplug_dev); + + hhc = HOTPLUG_HANDLER_GET_CLASS(lvms->acpi_ged); + hhc->unplug(HOTPLUG_HANDLER(lvms->acpi_ged), dev, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + + cpu_slot = virt_find_cpu_slot(MACHINE(lvms), cpu->phy_id, NULL); + cpu_slot->cpu = NULL; + return; +} + +static void virt_cpu_plug(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + CPUArchId *cpu_slot; + HotplugHandlerClass *hhc; + Error *local_err = NULL; + LoongArchCPU *cpu = LOONGARCH_CPU(dev); + CPUState *cs = CPU(cpu); + CPULoongArchState *env; + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(hotplug_dev); + int pin; + + if (lvms->acpi_ged) { + env = &(cpu->env); + env->address_space_iocsr = &lvms->as_iocsr; + + qemu_register_reset(reset_load_elf, LOONGARCH_CPU(qemu_get_cpu(cs->cpu_index))); + env->ipistate = lvms->ipi; + if (!(kvm_enabled() && kvm_irqchip_in_kernel())) { + /* connect the ipi irq to the cpu irq; the logical cpu index is used here */ + qdev_connect_gpio_out(lvms->ipi, cs->cpu_index, + qdev_get_gpio_in(dev, IRQ_IPI)); + + for (pin = 0; pin < LS3A_INTC_IP; pin++) { + qdev_connect_gpio_out(lvms->extioi, (cs->cpu_index * 8 + pin), + qdev_get_gpio_in(dev, pin + 2)); + } + } + hhc = HOTPLUG_HANDLER_GET_CLASS(lvms->acpi_ged); + hhc->plug(HOTPLUG_HANDLER(lvms->acpi_ged), dev, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + } + + cpu_slot = virt_find_cpu_slot(MACHINE(lvms), cpu->phy_id, NULL); + cpu_slot->cpu = OBJECT(dev); + return; } static bool memhp_type_supported(DeviceState *dev) @@ -936,92 +1539,112 @@ static void virt_mem_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, pc_dimm_pre_plug(PC_DIMM(dev), MACHINE(hotplug_dev), NULL, errp); } -static void virt_machine_device_pre_plug(HotplugHandler *hotplug_dev, +static void virt_device_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { if (memhp_type_supported(dev)) { virt_mem_pre_plug(hotplug_dev, dev, errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_LOONGARCH_CPU)) { + virt_cpu_pre_plug(hotplug_dev, dev, errp); } } static void virt_mem_unplug_request(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { - LoongArchMachineState *lams = LOONGARCH_MACHINE(hotplug_dev); + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(hotplug_dev); /* the ACPI GED device always exists */ - hotplug_handler_unplug_request(HOTPLUG_HANDLER(lams->acpi_ged), dev, + hotplug_handler_unplug_request(HOTPLUG_HANDLER(lvms->acpi_ged), dev, errp); } -static void virt_machine_device_unplug_request(HotplugHandler *hotplug_dev, +static void virt_device_unplug_request(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { if (memhp_type_supported(dev)) { virt_mem_unplug_request(hotplug_dev, dev, errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_LOONGARCH_CPU)) { + virt_cpu_unplug_request(hotplug_dev, dev, errp); } } static void virt_mem_unplug(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { - LoongArchMachineState *lams = LOONGARCH_MACHINE(hotplug_dev); + LoongArchVirtMachineState *lvms =
LOONGARCH_VIRT_MACHINE(hotplug_dev); - hotplug_handler_unplug(HOTPLUG_HANDLER(lams->acpi_ged), dev, errp); - pc_dimm_unplug(PC_DIMM(dev), MACHINE(lams)); + hotplug_handler_unplug(HOTPLUG_HANDLER(lvms->acpi_ged), dev, errp); + pc_dimm_unplug(PC_DIMM(dev), MACHINE(lvms)); qdev_unrealize(dev); } -static void virt_machine_device_unplug(HotplugHandler *hotplug_dev, +static void virt_device_unplug(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { if (memhp_type_supported(dev)) { virt_mem_unplug(hotplug_dev, dev, errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_LOONGARCH_CPU)) { + virt_cpu_unplug(hotplug_dev, dev, errp); } } static void virt_mem_plug(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { - LoongArchMachineState *lams = LOONGARCH_MACHINE(hotplug_dev); + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(hotplug_dev); - pc_dimm_plug(PC_DIMM(dev), MACHINE(lams)); - hotplug_handler_plug(HOTPLUG_HANDLER(lams->acpi_ged), + pc_dimm_plug(PC_DIMM(dev), MACHINE(lvms)); + hotplug_handler_plug(HOTPLUG_HANDLER(lvms->acpi_ged), dev, &error_abort); } -static void loongarch_machine_device_plug_cb(HotplugHandler *hotplug_dev, +static void virt_device_plug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { - LoongArchMachineState *lams = LOONGARCH_MACHINE(hotplug_dev); - MachineClass *mc = MACHINE_GET_CLASS(lams); + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(hotplug_dev); + MachineClass *mc = MACHINE_GET_CLASS(lvms); + PlatformBusDevice *pbus; if (device_is_dynamic_sysbus(mc, dev)) { - if (lams->platform_bus_dev) { - platform_bus_link_device(PLATFORM_BUS_DEVICE(lams->platform_bus_dev), - SYS_BUS_DEVICE(dev)); + if (lvms->platform_bus_dev) { + pbus = PLATFORM_BUS_DEVICE(lvms->platform_bus_dev); + platform_bus_link_device(pbus, SYS_BUS_DEVICE(dev)); } } else if (memhp_type_supported(dev)) { virt_mem_plug(hotplug_dev, dev, errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_LOONGARCH_CPU)) { + virt_cpu_plug(hotplug_dev, dev, errp); } } -static HotplugHandler *virt_machine_get_hotplug_handler(MachineState *machine, - DeviceState *dev) +static HotplugHandler *virt_get_hotplug_handler(MachineState *machine, + DeviceState *dev) { MachineClass *mc = MACHINE_GET_CLASS(machine); if (device_is_dynamic_sysbus(mc, dev) || + object_dynamic_cast(OBJECT(dev), TYPE_LOONGARCH_CPU) || + object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI) || memhp_type_supported(dev)) { return HOTPLUG_HANDLER(machine); } return NULL; } +static void virt_get_cpu_topo_from_index(MachineState *ms, + LoongArchCPUTopo *topo, int index) +{ + topo->socket_id = index / (ms->smp.cores * ms->smp.threads); + topo->core_id = index / ms->smp.threads % ms->smp.cores; + topo->thread_id = index % ms->smp.threads; +} + static const CPUArchIdList *virt_possible_cpu_arch_ids(MachineState *ms) { int n; unsigned int max_cpus = ms->smp.max_cpus; + LoongArchCPUTopo topo; if (ms->possible_cpus) { assert(ms->possible_cpus->len == max_cpus); @@ -1032,23 +1655,24 @@ static const CPUArchIdList *virt_possible_cpu_arch_ids(MachineState *ms) sizeof(CPUArchId) * max_cpus); ms->possible_cpus->len = max_cpus; for (n = 0; n < ms->possible_cpus->len; n++) { + ms->possible_cpus->cpus[n].vcpus_count = ms->smp.threads; ms->possible_cpus->cpus[n].type = ms->cpu_type; - ms->possible_cpus->cpus[n].arch_id = n; + virt_get_cpu_topo_from_index(ms, &topo, n); ms->possible_cpus->cpus[n].props.has_socket_id = true; - ms->possible_cpus->cpus[n].props.socket_id = - n / (ms->smp.cores * ms->smp.threads); + 
ms->possible_cpus->cpus[n].props.socket_id = topo.socket_id; ms->possible_cpus->cpus[n].props.has_core_id = true; - ms->possible_cpus->cpus[n].props.core_id = - n / ms->smp.threads % ms->smp.cores; + ms->possible_cpus->cpus[n].props.core_id = topo.core_id; ms->possible_cpus->cpus[n].props.has_thread_id = true; - ms->possible_cpus->cpus[n].props.thread_id = n % ms->smp.threads; + ms->possible_cpus->cpus[n].props.thread_id = topo.thread_id; + ms->possible_cpus->cpus[n].arch_id = + virt_get_arch_id_from_topo(ms, &topo); } return ms->possible_cpus; } -static CpuInstanceProperties -virt_cpu_index_to_props(MachineState *ms, unsigned cpu_index) +static CpuInstanceProperties virt_cpu_index_to_props(MachineState *ms, + unsigned cpu_index) { MachineClass *mc = MACHINE_GET_CLASS(ms); const CPUArchIdList *possible_cpus = mc->possible_cpu_arch_ids(ms); @@ -1059,27 +1683,25 @@ virt_cpu_index_to_props(MachineState *ms, unsigned cpu_index) static int64_t virt_get_default_cpu_node_id(const MachineState *ms, int idx) { - int64_t nidx = 0; + int64_t socket_id; if (ms->numa_state->num_nodes) { - nidx = idx / (ms->smp.cpus / ms->numa_state->num_nodes); - if (ms->numa_state->num_nodes <= nidx) { - nidx = ms->numa_state->num_nodes - 1; - } + socket_id = ms->possible_cpus->cpus[idx].props.socket_id; + return socket_id % ms->numa_state->num_nodes; + } else { + return 0; } - return nidx; } -static void loongarch_class_init(ObjectClass *oc, void *data) +static void virt_class_init(ObjectClass *oc, void *data) { MachineClass *mc = MACHINE_CLASS(oc); HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(oc); - mc->desc = "Loongson-3A5000 LS7A1000 machine"; - mc->init = loongarch_init; - mc->default_ram_size = 1 * GiB; + mc->init = virt_init; mc->default_cpu_type = LOONGARCH_CPU_TYPE_NAME("la464"); mc->default_ram_id = "loongarch.ram"; + mc->desc = "QEMU LoongArch Virtual Machine"; mc->max_cpus = LOONGARCH_MAX_CPUS; mc->is_default = 1; mc->default_kernel_irqchip_split = false; @@ -1092,31 +1714,36 @@ static void loongarch_class_init(ObjectClass *oc, void *data) mc->numa_mem_supported = true; mc->auto_enable_numa_with_memhp = true; mc->auto_enable_numa_with_memdev = true; - mc->get_hotplug_handler = virt_machine_get_hotplug_handler; + mc->has_hotpluggable_cpus = true; + mc->get_hotplug_handler = virt_get_hotplug_handler; mc->default_nic = "virtio-net-pci"; - hc->plug = loongarch_machine_device_plug_cb; - hc->pre_plug = virt_machine_device_pre_plug; - hc->unplug_request = virt_machine_device_unplug_request; - hc->unplug = virt_machine_device_unplug; + hc->plug = virt_device_plug_cb; + hc->pre_plug = virt_device_pre_plug; + hc->unplug_request = virt_device_unplug_request; + hc->unplug = virt_device_unplug; object_class_property_add(oc, "acpi", "OnOffAuto", - loongarch_get_acpi, loongarch_set_acpi, + virt_get_acpi, virt_set_acpi, NULL, NULL); object_class_property_set_description(oc, "acpi", "Enable ACPI"); + object_class_property_add(oc, "v-eiointc", "OnOffAuto", + virt_get_veiointc, virt_set_veiointc, NULL, NULL); + object_class_property_set_description(oc, "v-eiointc", + "Enable Virt Extend I/O Interrupt Controller"); machine_class_allow_dynamic_sysbus_dev(mc, TYPE_RAMFB_DEVICE); #ifdef CONFIG_TPM machine_class_allow_dynamic_sysbus_dev(mc, TYPE_TPM_TIS_SYSBUS); #endif } -static const TypeInfo loongarch_machine_types[] = { +static const TypeInfo virt_machine_types[] = { { - .name = TYPE_LOONGARCH_MACHINE, + .name = TYPE_LOONGARCH_VIRT_MACHINE, .parent = TYPE_MACHINE, - .instance_size = sizeof(LoongArchMachineState), - 
.class_init = loongarch_class_init, - .instance_init = loongarch_machine_initfn, + .instance_size = sizeof(LoongArchVirtMachineState), + .class_init = virt_class_init, + .instance_init = virt_initfn, .interfaces = (InterfaceInfo[]) { { TYPE_HOTPLUG_HANDLER }, { } @@ -1124,4 +1751,4 @@ static const TypeInfo loongarch_machine_types[] = { } }; -DEFINE_TYPES(loongarch_machine_types) +DEFINE_TYPES(virt_machine_types) diff --git a/hw/m68k/next-kbd.c b/hw/m68k/next-kbd.c index 0c348c18cf29bfc07cf7f29059fe34caccf52abc..880ebe36021c286c8b3d67114c83edb96b4746b5 100644 --- a/hw/m68k/next-kbd.c +++ b/hw/m68k/next-kbd.c @@ -68,7 +68,6 @@ struct NextKBDState { uint16_t shift; }; -static void queue_code(void *opaque, int code); /* lots of magic numbers here */ static uint32_t kbd_read_byte(void *opaque, hwaddr addr) @@ -166,68 +165,70 @@ static const MemoryRegionOps kbd_ops = { .endianness = DEVICE_NATIVE_ENDIAN, }; -static void nextkbd_event(void *opaque, int ch) -{ - /* - * Will want to set vars for caps/num lock - * if (ch & 0x80) -> key release - * there's also e0 escaped scancodes that might need to be handled - */ - queue_code(opaque, ch); -} - -static const unsigned char next_keycodes[128] = { - 0x00, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x50, 0x4F, - 0x4E, 0x1E, 0x1F, 0x20, 0x1D, 0x1C, 0x1B, 0x00, - 0x42, 0x43, 0x44, 0x45, 0x48, 0x47, 0x46, 0x06, - 0x07, 0x08, 0x00, 0x00, 0x2A, 0x00, 0x39, 0x3A, - 0x3B, 0x3C, 0x3D, 0x40, 0x3F, 0x3E, 0x2D, 0x2C, - 0x2B, 0x26, 0x00, 0x00, 0x31, 0x32, 0x33, 0x34, - 0x35, 0x37, 0x36, 0x2e, 0x2f, 0x30, 0x00, 0x00, - 0x00, 0x38, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +static const int qcode_to_nextkbd_keycode[] = { + [Q_KEY_CODE_ESC] = 0x49, + [Q_KEY_CODE_1] = 0x4a, + [Q_KEY_CODE_2] = 0x4b, + [Q_KEY_CODE_3] = 0x4c, + [Q_KEY_CODE_4] = 0x4d, + [Q_KEY_CODE_5] = 0x50, + [Q_KEY_CODE_6] = 0x4f, + [Q_KEY_CODE_7] = 0x4e, + [Q_KEY_CODE_8] = 0x1e, + [Q_KEY_CODE_9] = 0x1f, + [Q_KEY_CODE_0] = 0x20, + [Q_KEY_CODE_MINUS] = 0x1d, + [Q_KEY_CODE_EQUAL] = 0x1c, + [Q_KEY_CODE_BACKSPACE] = 0x1b, + + [Q_KEY_CODE_Q] = 0x42, + [Q_KEY_CODE_W] = 0x43, + [Q_KEY_CODE_E] = 0x44, + [Q_KEY_CODE_R] = 0x45, + [Q_KEY_CODE_T] = 0x48, + [Q_KEY_CODE_Y] = 0x47, + [Q_KEY_CODE_U] = 0x46, + [Q_KEY_CODE_I] = 0x06, + [Q_KEY_CODE_O] = 0x07, + [Q_KEY_CODE_P] = 0x08, + [Q_KEY_CODE_RET] = 0x2a, + [Q_KEY_CODE_A] = 0x39, + [Q_KEY_CODE_S] = 0x3a, + + [Q_KEY_CODE_D] = 0x3b, + [Q_KEY_CODE_F] = 0x3c, + [Q_KEY_CODE_G] = 0x3d, + [Q_KEY_CODE_H] = 0x40, + [Q_KEY_CODE_J] = 0x3f, + [Q_KEY_CODE_K] = 0x3e, + [Q_KEY_CODE_L] = 0x2d, + [Q_KEY_CODE_SEMICOLON] = 0x2c, + [Q_KEY_CODE_APOSTROPHE] = 0x2b, + [Q_KEY_CODE_GRAVE_ACCENT] = 0x26, + [Q_KEY_CODE_Z] = 0x31, + [Q_KEY_CODE_X] = 0x32, + [Q_KEY_CODE_C] = 0x33, + [Q_KEY_CODE_V] = 0x34, + + [Q_KEY_CODE_B] = 0x35, + [Q_KEY_CODE_N] = 0x37, + [Q_KEY_CODE_M] = 0x36, + [Q_KEY_CODE_COMMA] = 0x2e, + [Q_KEY_CODE_DOT] = 0x2f, + [Q_KEY_CODE_SLASH] = 0x30, + + [Q_KEY_CODE_SPC] = 0x38, }; -static void queue_code(void *opaque, int code) +static void nextkbd_put_keycode(NextKBDState *s, int keycode) { - NextKBDState *s = NEXTKBD(opaque); KBDQueue *q = &s->queue; - int key = code & KD_KEYMASK; - int release = code & 0x80; - static int ext; - - if (code == 0xE0) { - ext = 1; - } - - if (code == 0x2A || code == 0x1D || code == 0x36) { - if (code == 0x2A) { - s->shift = KD_LSHIFT; - } else if (code == 0x36) { - 
s->shift = KD_RSHIFT; - ext = 0; - } else if (code == 0x1D && !ext) { - s->shift = KD_LCOMM; - } else if (code == 0x1D && ext) { - ext = 0; - s->shift = KD_RCOMM; - } - return; - } else if (code == (0x2A | 0x80) || code == (0x1D | 0x80) || - code == (0x36 | 0x80)) { - s->shift = 0; - return; - } if (q->count >= KBD_QUEUE_SIZE) { return; } - q->data[q->wptr] = next_keycodes[key] | release; - + q->data[q->wptr] = keycode; if (++q->wptr == KBD_QUEUE_SIZE) { q->wptr = 0; } @@ -241,6 +242,53 @@ static void queue_code(void *opaque, int code) /* s->update_irq(s->update_arg, 1); */ } +static void nextkbd_event(DeviceState *dev, QemuConsole *src, InputEvent *evt) +{ + NextKBDState *s = NEXTKBD(dev); + int qcode, keycode; + bool key_down = evt->u.key.data->down; + + qcode = qemu_input_key_value_to_qcode(evt->u.key.data->key); + if (qcode >= ARRAY_SIZE(qcode_to_nextkbd_keycode)) { + return; + } + + /* Shift key currently has no keycode, so handle separately */ + if (qcode == Q_KEY_CODE_SHIFT) { + if (key_down) { + s->shift |= KD_LSHIFT; + } else { + s->shift &= ~KD_LSHIFT; + } + } + + if (qcode == Q_KEY_CODE_SHIFT_R) { + if (key_down) { + s->shift |= KD_RSHIFT; + } else { + s->shift &= ~KD_RSHIFT; + } + } + + keycode = qcode_to_nextkbd_keycode[qcode]; + if (!keycode) { + return; + } + + /* If key release event, create keyboard break code */ + if (!key_down) { + keycode |= 0x80; + } + + nextkbd_put_keycode(s, keycode); +} + +static const QemuInputHandler nextkbd_handler = { + .name = "QEMU NeXT Keyboard", + .mask = INPUT_EVENT_MASK_KEY, + .event = nextkbd_event, +}; + static void nextkbd_reset(DeviceState *dev) { NextKBDState *nks = NEXTKBD(dev); @@ -256,7 +304,7 @@ static void nextkbd_realize(DeviceState *dev, Error **errp) memory_region_init_io(&s->mr, OBJECT(dev), &kbd_ops, s, "next.kbd", 0x1000); sysbus_init_mmio(SYS_BUS_DEVICE(dev), &s->mr); - qemu_add_kbd_event_handler(nextkbd_event, s); + qemu_input_handler_register(dev, &nextkbd_handler); } static const VMStateDescription nextkbd_vmstate = { diff --git a/hw/misc/Kconfig b/hw/misc/Kconfig index cc8a8c1418ff007bd6eaaa73f098415beef51735..2ea5c68eb5da574e4d77170bc1d54b88c65fdf86 100644 --- a/hw/misc/Kconfig +++ b/hw/misc/Kconfig @@ -200,4 +200,8 @@ config IOSB config XLNX_VERSAL_TRNG bool +config PSP_DEV + bool + default y + source macio/Kconfig diff --git a/hw/misc/aspeed_hace.c b/hw/misc/aspeed_hace.c index b07506ec04efe3dfe3faef16a5fc23e58dd40802..8706e3d37644f645a3e8bca83309e06f4a6278eb 100644 --- a/hw/misc/aspeed_hace.c +++ b/hw/misc/aspeed_hace.c @@ -123,6 +123,11 @@ static bool has_padding(AspeedHACEState *s, struct iovec *iov, if (*total_msg_len <= s->total_req_len) { uint32_t padding_size = s->total_req_len - *total_msg_len; uint8_t *padding = iov->iov_base; + + if (padding_size > req_len) { + return false; + } + *pad_offset = req_len - padding_size; if (padding[*pad_offset] == 0x80) { return true; diff --git a/hw/misc/bcm2835_property.c b/hw/misc/bcm2835_property.c index ff55a4e2cd2f94aeedf65cba047cc7caac3bb5ae..12a1bc558ab6fa52f6816f570fd9d199c4ffa28e 100644 --- a/hw/misc/bcm2835_property.c +++ b/hw/misc/bcm2835_property.c @@ -28,8 +28,6 @@ static void bcm2835_property_mbox_push(BCM2835PropertyState *s, uint32_t value) uint32_t tot_len; size_t resplen; uint32_t tmp; - int n; - uint32_t offset, length, color; /* * Copy the current state of the framebuffer config; we will update @@ -264,18 +262,25 @@ static void bcm2835_property_mbox_push(BCM2835PropertyState *s, uint32_t value) resplen = 16; break; case RPI_FWREQ_FRAMEBUFFER_SET_PALETTE: 
- offset = ldl_le_phys(&s->dma_as, value + 12); - length = ldl_le_phys(&s->dma_as, value + 16); - n = 0; - while (n < length - offset) { - color = ldl_le_phys(&s->dma_as, value + 20 + (n << 2)); - stl_le_phys(&s->dma_as, - s->fbdev->vcram_base + ((offset + n) << 2), color); - n++; + { + uint32_t offset = ldl_le_phys(&s->dma_as, value + 12); + uint32_t length = ldl_le_phys(&s->dma_as, value + 16); + int resp; + + if (offset > 255 || length < 1 || length > 256) { + resp = 1; /* invalid request */ + } else { + for (uint32_t e = 0; e < length; e++) { + uint32_t color = ldl_le_phys(&s->dma_as, value + 20 + (e << 2)); + stl_le_phys(&s->dma_as, + s->fbdev->vcram_base + ((offset + e) << 2), color); + } + resp = 0; } - stl_le_phys(&s->dma_as, value + 12, 0); + stl_le_phys(&s->dma_as, value + 12, resp); resplen = 4; break; + } case RPI_FWREQ_FRAMEBUFFER_GET_NUM_DISPLAYS: stl_le_phys(&s->dma_as, value + 12, 1); resplen = 4; diff --git a/hw/misc/edu.c b/hw/misc/edu.c index a1f8bc77e7701eb90abab3faef46164503e39560..e64a246d3febf9c096c27cb299b83d6fa1a0477d 100644 --- a/hw/misc/edu.c +++ b/hw/misc/edu.c @@ -115,7 +115,7 @@ static void edu_check_range(uint64_t addr, uint64_t size1, uint64_t start, uint64_t end2 = start + size2; if (within(addr, start, end2) && - end1 > addr && within(end1, start, end2)) { + end1 > addr && end1 <= end2) { return; } diff --git a/hw/misc/ivshmem.c b/hw/misc/ivshmem.c index 044788802982426261b7e2e5750ad12d12d6d35b..f66491a7a7a68fac494fc7707e0e5ab22d160f17 100644 --- a/hw/misc/ivshmem.c +++ b/hw/misc/ivshmem.c @@ -278,6 +278,7 @@ static int ivshmem_vector_unmask(PCIDevice *dev, unsigned vector, IVShmemState *s = IVSHMEM_COMMON(dev); EventNotifier *n = &s->peers[s->vm_id].eventfds[vector]; MSIVector *v = &s->msi_vectors[vector]; + KVMRouteChange c; int ret; IVSHMEM_DPRINTF("vector unmask %p %d\n", dev, vector); @@ -287,11 +288,12 @@ static int ivshmem_vector_unmask(PCIDevice *dev, unsigned vector, } assert(!v->unmasked); - ret = kvm_irqchip_update_msi_route(kvm_state, v->virq, msg, dev); + c = kvm_irqchip_begin_route_changes(kvm_state); + ret = kvm_irqchip_update_msi_route(&c, v->virq, msg, dev); if (ret < 0) { return ret; } - kvm_irqchip_commit_routes(kvm_state); + kvm_irqchip_commit_route_changes(&c); ret = kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, n, NULL, v->virq); if (ret < 0) { @@ -400,6 +402,7 @@ static void close_peer_eventfds(IVShmemState *s, int posn) } g_free(s->peers[posn].eventfds); + s->peers[posn].eventfds = NULL; s->peers[posn].nb_eventfds = 0; } @@ -533,6 +536,10 @@ static void process_msg_connect(IVShmemState *s, uint16_t posn, int fd, close(fd); return; } + if (peer->eventfds == NULL) { + peer->eventfds = g_new0(EventNotifier, s->vectors); + peer->nb_eventfds = 0; + } vector = peer->nb_eventfds++; IVSHMEM_DPRINTF("eventfds[%d][%d] = %d\n", posn, vector, fd); diff --git a/hw/misc/meson.build b/hw/misc/meson.build index 36c20d5637f70cf0db17173a4cb388a2b65bb8f9..28cba0ac2859ce90fccd4f3f4b34adb8cacf8bdd 100644 --- a/hw/misc/meson.build +++ b/hw/misc/meson.build @@ -9,6 +9,7 @@ system_ss.add(when: 'CONFIG_UNIMP', if_true: files('unimp.c')) system_ss.add(when: 'CONFIG_EMPTY_SLOT', if_true: files('empty_slot.c')) system_ss.add(when: 'CONFIG_LED', if_true: files('led.c')) system_ss.add(when: 'CONFIG_PVPANIC_COMMON', if_true: files('pvpanic.c')) +system_ss.add(when: 'CONFIG_PSP_DEV', if_true: files('psp.c')) # ARM devices system_ss.add(when: 'CONFIG_PL310', if_true: files('arm_l2x0.c')) diff --git a/hw/misc/nrf51_rng.c b/hw/misc/nrf51_rng.c index 
fc86e1b697905d2ae2fb6c5c69a54181a53b06f0..e911b3a3a30114309f1869e4f8e33774c1f072c1 100644 --- a/hw/misc/nrf51_rng.c +++ b/hw/misc/nrf51_rng.c @@ -107,25 +107,25 @@ static void rng_write(void *opaque, hwaddr offset, break; case NRF51_RNG_REG_SHORTS: s->shortcut_stop_on_valrdy = - (value & BIT_MASK(NRF51_RNG_REG_SHORTS_VALRDY_STOP)) ? 1 : 0; + (value & BIT(NRF51_RNG_REG_SHORTS_VALRDY_STOP)) ? 1 : 0; break; case NRF51_RNG_REG_INTEN: s->interrupt_enabled = - (value & BIT_MASK(NRF51_RNG_REG_INTEN_VALRDY)) ? 1 : 0; + (value & BIT(NRF51_RNG_REG_INTEN_VALRDY)) ? 1 : 0; break; case NRF51_RNG_REG_INTENSET: - if (value & BIT_MASK(NRF51_RNG_REG_INTEN_VALRDY)) { + if (value & BIT(NRF51_RNG_REG_INTEN_VALRDY)) { s->interrupt_enabled = 1; } break; case NRF51_RNG_REG_INTENCLR: - if (value & BIT_MASK(NRF51_RNG_REG_INTEN_VALRDY)) { + if (value & BIT(NRF51_RNG_REG_INTEN_VALRDY)) { s->interrupt_enabled = 0; } break; case NRF51_RNG_REG_CONFIG: s->filter_enabled = - (value & BIT_MASK(NRF51_RNG_REG_CONFIG_DECEN)) ? 1 : 0; + (value & BIT(NRF51_RNG_REG_CONFIG_DECEN)) ? 1 : 0; break; default: diff --git a/hw/misc/psp.c b/hw/misc/psp.c new file mode 100644 index 0000000000000000000000000000000000000000..03e8663027c689a36a1d9055ffd718efd2ec6e78 --- /dev/null +++ b/hw/misc/psp.c @@ -0,0 +1,307 @@ +/* + * Hygon PSP device emulation + * + * Copyright 2024 HYGON Corp. + * + * This work is licensed under the terms of the GNU GPL, version 2 or (at + * your option) any later version. See the COPYING file in the top-level + * directory. + */ + +#include "qemu/osdep.h" +#include "qemu/compiler.h" +#include "qemu/error-report.h" +#include "qapi/error.h" +#include "migration/vmstate.h" +#include "hw/qdev-properties.h" +#include "sysemu/runstate.h" +#include "exec/memory.h" +#include "exec/address-spaces.h" +#include "exec/ramblock.h" +#include "hw/i386/e820_memory_layout.h" +#include <sys/ioctl.h> + +#define TYPE_PSP_DEV "psp" +OBJECT_DECLARE_SIMPLE_TYPE(PSPDevState, PSP_DEV) + +struct PSPDevState { + /* Private */ + DeviceState pdev; + + /* Public */ + Notifier shutdown_notifier; + int dev_fd; + bool enabled; + + /** + * The vid identifies a virtual machine in QEMU. + * When a virtual machine accesses a TKM key, + * the TKM module selects the key space based on the vid.
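+ *
+ * Hypothetical usage sketch (the "vid" property is defined near the
+ * bottom of this file; all other command-line details elided):
+ *
+ *   qemu-system-x86_64 ... -device psp,vid=1
+ *
+ * Giving each VM a distinct vid keeps their TKM keys in separate
+ * key spaces.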
+ */ + uint32_t vid; + /* number of pinned hugepages */ + int hp_num; +}; + +#define PSP_DEV_PATH "/dev/hygon_psp_config" +#define HYGON_PSP_IOC_TYPE 'H' +#define PSP_IOC_MUTEX_ENABLE _IOWR(HYGON_PSP_IOC_TYPE, 1, NULL) +#define PSP_IOC_MUTEX_DISABLE _IOWR(HYGON_PSP_IOC_TYPE, 2, NULL) +#define PSP_IOC_VPSP_OPT _IOWR(HYGON_PSP_IOC_TYPE, 3, NULL) +#define PSP_IOC_PIN_USER_PAGE _IOWR(HYGON_PSP_IOC_TYPE, 4, NULL) +#define PSP_IOC_UNPIN_USER_PAGE _IOWR(HYGON_PSP_IOC_TYPE, 5, NULL) + +enum VPSP_DEV_CTRL_OPCODE { + VPSP_OP_VID_ADD, + VPSP_OP_VID_DEL, + VPSP_OP_SET_DEFAULT_VID_PERMISSION, + VPSP_OP_GET_DEFAULT_VID_PERMISSION, + VPSP_OP_SET_GPA, +}; + +struct psp_dev_ctrl { + unsigned char op; + unsigned char resv[3]; + union { + unsigned int vid; + /* set or check the permissions for the default VID */ + unsigned int def_vid_perm; + struct { + uint64_t gpa_start; + uint64_t gpa_end; + } gpa; + unsigned char reserved[128]; + } __attribute__ ((packed)) data; +}; + +static MemoryRegion *find_memory_region_by_name(MemoryRegion *root, const char *name) { + MemoryRegion *subregion; + MemoryRegion *result; + + if (strcmp(root->name, name) == 0) { + return root; + } + + QTAILQ_FOREACH(subregion, &root->subregions, subregions_link) { + result = find_memory_region_by_name(subregion, name); + if (result) { + return result; + } + } + + return NULL; +} + +static int pin_user_hugepage(int fd, uint64_t vaddr) +{ + int ret; + + ret = ioctl(fd, PSP_IOC_PIN_USER_PAGE, vaddr); + /* EINVAL (22): some older kernels don't support this ioctl command */ + if (ret != 0 && errno == EINVAL) { + ret = 0; + } + return ret; +} + +static int unpin_user_hugepage(int fd, uint64_t vaddr) +{ + int ret; + + ret = ioctl(fd, PSP_IOC_UNPIN_USER_PAGE, vaddr); + /* EINVAL (22): some older kernels don't support this ioctl command */ + if (ret != 0 && errno == EINVAL) { + ret = 0; + } + return ret; +} + +static int pin_psp_user_hugepages(struct PSPDevState *state, MemoryRegion *root) +{ + int ret = 0; + char mr_name[128] = {0}; + int i, pinned_num; + MemoryRegion *find_mr = NULL; + + for (i = 0; i < state->hp_num; ++i) { + sprintf(mr_name, "mem2-%d", i); + find_mr = find_memory_region_by_name(root, mr_name); + if (!find_mr) { + error_report("failed to find memory region by name %s.", mr_name); + ret = -ENOMEM; + goto end; + } + + ret = pin_user_hugepage(state->dev_fd, (uint64_t)find_mr->ram_block->host); + if (ret) { + error_report("failed to pin user hugepage, ret: %d.", ret); + goto end; + } + } +end: + if (ret) { + pinned_num = i; + for (i = 0; i < pinned_num; ++i) { + sprintf(mr_name, "mem2-%d", i); + find_mr = find_memory_region_by_name(root, mr_name); + if (!find_mr) { + continue; + } + unpin_user_hugepage(state->dev_fd, (uint64_t)find_mr->ram_block->host); + } + + } + return ret; +} + +static int unpin_psp_user_hugepages(struct PSPDevState *state, MemoryRegion *root) +{ + int ret = 0; + char mr_name[128] = {0}; + int i; + MemoryRegion *find_mr = NULL; + + for (i = 0; i < state->hp_num; ++i) { + sprintf(mr_name, "mem2-%d", i); + find_mr = find_memory_region_by_name(root, mr_name); + if (!find_mr) { + continue; + } + + ret = unpin_user_hugepage(state->dev_fd, (uint64_t)find_mr->ram_block->host); + if (ret) { + error_report("failed to unpin user hugepage, ret: %d.", ret); + goto end; + } + } +end: + return ret; +} + +static void psp_dev_destroy(PSPDevState *state) +{ + struct psp_dev_ctrl ctrl = { 0 }; + if (state && state->dev_fd) { + if (state->enabled) { + ctrl.op = VPSP_OP_VID_DEL; + if (ioctl(state->dev_fd, PSP_IOC_VPSP_OPT,
&ctrl) < 0) { + error_report("VPSP_OP_VID_DEL: %d", -errno); + } + + /* Unpin hugepage memory */ + if (unpin_psp_user_hugepages(state, get_system_memory())) { + error_report("unpin_psp_user_hugepages failed"); + } else { + state->enabled = false; + } + } + qemu_close(state->dev_fd); + state->dev_fd = 0; + } +} + +/* + * The guest OS shuts down through the 'shutdown' and 'powerdown' events. + * Since 'powerdown' also triggers 'shutdown' in the end, only the + * 'shutdown' event needs to be handled here. + * + * Nothing is done when the guest triggers a 'reboot' or 'reset' event. + */ +static void psp_dev_shutdown_notify(Notifier *notifier, void *data) +{ + PSPDevState *state = container_of(notifier, PSPDevState, shutdown_notifier); + psp_dev_destroy(state); +} + +static void psp_dev_realize(DeviceState *dev, Error **errp) +{ + int i; + char mr_name[128] = {0}; + struct psp_dev_ctrl ctrl = { 0 }; + PSPDevState *state = PSP_DEV(dev); + MemoryRegion *root_mr = get_system_memory(); + MemoryRegion *find_mr = NULL; + uint64_t ram2_start = 0, ram2_end = 0; + + state->dev_fd = qemu_open_old(PSP_DEV_PATH, O_RDWR); + if (state->dev_fd < 0) { + error_setg(errp, "failed to open %s, errno %d.", PSP_DEV_PATH, errno); + goto end; + } + + ctrl.op = VPSP_OP_VID_ADD; + ctrl.data.vid = state->vid; + if (ioctl(state->dev_fd, PSP_IOC_VPSP_OPT, &ctrl) < 0) { + error_setg(errp, "psp_dev_realize VPSP_OP_VID_ADD vid %d, return %d", ctrl.data.vid, -errno); + goto end; + } + + for (i = 0; ; ++i) { + sprintf(mr_name, "mem2-%d", i); + find_mr = find_memory_region_by_name(root_mr, mr_name); + if (!find_mr) { + break; + } + + if (!ram2_start) { + ram2_start = find_mr->addr; + } + ram2_end = find_mr->addr + find_mr->size - 1; + } + + state->hp_num = i; + + if (ram2_start != ram2_end) { + ctrl.op = VPSP_OP_SET_GPA; + ctrl.data.gpa.gpa_start = ram2_start; + ctrl.data.gpa.gpa_end = ram2_end; + if (ioctl(state->dev_fd, PSP_IOC_VPSP_OPT, &ctrl) < 0) { + error_setg(errp, "psp_dev_realize VPSP_OP_SET_GPA (start 0x%" PRIx64 ", end 0x%" PRIx64 "), return %d", + ram2_start, ram2_end, -errno); + goto del_vid; + } + + /* Pin hugepage memory */ + if (pin_psp_user_hugepages(state, root_mr)) { + error_setg(errp, "pin_psp_user_hugepages failed."); + goto del_vid; + } + } + + state->enabled = true; + state->shutdown_notifier.notify = psp_dev_shutdown_notify; + qemu_register_shutdown_notifier(&state->shutdown_notifier); + + return; +del_vid: + ctrl.op = VPSP_OP_VID_DEL; + ioctl(state->dev_fd, PSP_IOC_VPSP_OPT, &ctrl); +end: + return; +} + +static Property psp_dev_properties[] = { + DEFINE_PROP_UINT32("vid", PSPDevState, vid, 0), + DEFINE_PROP_END_OF_LIST(), +}; + +static void psp_dev_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + + dc->desc = "PSP Device"; + dc->realize = psp_dev_realize; + set_bit(DEVICE_CATEGORY_MISC, dc->categories); + device_class_set_props(dc, psp_dev_properties); +} + +static const TypeInfo psp_dev_info = { + .name = TYPE_PSP_DEV, + .parent = TYPE_DEVICE, + .instance_size = sizeof(PSPDevState), + .class_init = psp_dev_class_init, +}; + +static void psp_dev_register_types(void) +{ + type_register_static(&psp_dev_info); +} + +type_init(psp_dev_register_types) diff --git a/hw/net/can/can_sja1000.c b/hw/net/can/can_sja1000.c index 73201f91395af81dea302332c7122ff4c3b3d84c..575df7d2f8dc3d5b7143509abdf9740b407fda63 100644 --- a/hw/net/can/can_sja1000.c +++ b/hw/net/can/can_sja1000.c @@ -108,7 +108,7 @@ void can_sja_single_filter(struct qemu_can_filter *filter, } filter->can_mask = (uint32_t)amr[0] <<
3; - filter->can_mask |= (uint32_t)amr[1] << 5; + filter->can_mask |= (uint32_t)amr[1] >> 5; filter->can_mask = ~filter->can_mask & QEMU_CAN_SFF_MASK; if (!(amr[1] & 0x10)) { filter->can_mask |= QEMU_CAN_RTR_FLAG; diff --git a/hw/net/eepro100.c b/hw/net/eepro100.c index 69e1c4bb8918fa506cf9f420edb257a0f5424127..f6204ec0590005eb8c6bfe221805fd61be4ae6ec 100644 --- a/hw/net/eepro100.c +++ b/hw/net/eepro100.c @@ -279,6 +279,9 @@ typedef struct { /* Quasi static device properties (no need to save them). */ uint16_t stats_size; bool has_extended_tcb_support; + + /* Flag to avoid recursions. */ + bool busy; } EEPRO100State; /* Word indices in EEPROM. */ @@ -844,6 +847,14 @@ static void action_command(EEPRO100State *s) Therefore we limit the number of iterations. */ unsigned max_loop_count = 16; + if (s->busy) { + /* Prevent recursions. */ + logout("recursion in %s:%u\n", __FILE__, __LINE__); + return; + } + + s->busy = true; + for (;;) { bool bit_el; bool bit_s; @@ -940,6 +951,7 @@ static void action_command(EEPRO100State *s) } TRACE(OTHER, logout("CU list empty\n")); /* List is empty. Now CU is idle or suspended. */ + s->busy = false; } static void eepro100_cu_command(EEPRO100State * s, uint8_t val) diff --git a/hw/net/net_tx_pkt.c b/hw/net/net_tx_pkt.c index 2e5f58b3c9cc643ddf9aed2cedc2e247b6429570..d40d508a11bf8af3e769410b0cad0abb0c6b2b8f 100644 --- a/hw/net/net_tx_pkt.c +++ b/hw/net/net_tx_pkt.c @@ -141,6 +141,10 @@ bool net_tx_pkt_update_sctp_checksum(struct NetTxPkt *pkt) uint32_t csum = 0; struct iovec *pl_start_frag = pkt->vec + NET_TX_PKT_PL_START_FRAG; + if (iov_size(pl_start_frag, pkt->payload_frags) < 8 + sizeof(csum)) { + return false; + } + if (iov_from_buf(pl_start_frag, pkt->payload_frags, 8, &csum, sizeof(csum)) < sizeof(csum)) { return false; } diff --git a/hw/net/pcnet.c b/hw/net/pcnet.c index a7e123e60dbab97ebfafabb05fbbb99576af9cc9..7d574f487b8b6c00a561d7207a4db77e2dab71d3 100644 --- a/hw/net/pcnet.c +++ b/hw/net/pcnet.c @@ -632,7 +632,7 @@ static inline int ladr_match(PCNetState *s, const uint8_t *buf, int size) { struct qemu_ether_header *hdr = (void *)buf; if ((*(hdr->ether_dhost)&0x01) && - ((uint64_t *)&s->csr[8])[0] != 0LL) { + (s->csr[8] | s->csr[9] | s->csr[10] | s->csr[11]) != 0) { uint8_t ladr[8] = { s->csr[8] & 0xff, s->csr[8] >> 8, s->csr[9] & 0xff, s->csr[9] >> 8, diff --git a/hw/net/rocker/rocker_of_dpa.c b/hw/net/rocker/rocker_of_dpa.c index 5e16056be661e54ff0da2ff8bdcf116055fe1e45..c25438ccccc3e18d8b18c9fb3148f7b1b8aa276b 100644 --- a/hw/net/rocker/rocker_of_dpa.c +++ b/hw/net/rocker/rocker_of_dpa.c @@ -2070,6 +2070,7 @@ static int of_dpa_cmd_add_l2_flood(OfDpa *of_dpa, OfDpaGroup *group, err_out: group->l2_flood.group_count = 0; g_free(group->l2_flood.group_ids); + group->l2_flood.group_ids = NULL; g_free(tlvs); return err; diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c index e8e1661646219094995538af85e792f2e593269a..a02d65d208f8b8646c2beee91beef8b18109c77e 100644 --- a/hw/net/vhost_net.c +++ b/hw/net/vhost_net.c @@ -167,9 +167,26 @@ static int vhost_net_get_fd(NetClientState *backend) } } +static uint64_t vhost_get_mask_features(const int *feature_bits, uint64_t features) +{ + const int *bit = feature_bits; + uint64_t out_features = 0; + + while (*bit != VHOST_INVALID_FEATURE_BIT) { + uint64_t bit_mask = (1ULL << *bit); + if (features & bit_mask) { + out_features |= bit_mask; + } + bit++; + } + return out_features; +} + struct vhost_net *vhost_net_init(VhostNetOptions *options) { int r; + VirtIONet *n; + VirtIODevice *vdev; bool backend_kernel = 
options->backend_type == VHOST_BACKEND_TYPE_KERNEL; struct vhost_net *net = g_new0(struct vhost_net, 1); uint64_t features = 0; @@ -195,7 +212,46 @@ struct vhost_net *vhost_net_init(VhostNetOptions *options) net->backend = r; net->dev.protocol_features = 0; } else { - net->dev.backend_features = 0; + /* Handle OVS restarting while the VM is starting up. + * Normal situation: + * 1. The VM starts. + * 2. vhost_net_init() completes; dev.acked_features is 0x40000000. + * 3. The guest virtio-net module loads. QEMU calls + * virtio_net_set_features(), setting dev.acked_features to + * 0x40408000. + * 4. The features (0x40408000) are set on OVS's vhostuser port. + * 5. OVS restarts. + * 6. vhost_user_stop() saves net->dev.acked_features (0x40408000) + * into VhostUserState's acked_features (0x40408000). + * 7. The restart completes. + * 8. vhost_net_init() calls vhost_user_get_acked_features() to fetch + * the saved features and stores them in net->dev.acked_features. + * Abnormal situation: + * 1. The VM starts. + * 2. vhost_net_init() completes; dev.acked_features is 0x40000000. + * 3. OVS restarts. + * 4. vhost_user_stop() saves net->dev.acked_features (0x40000000) + * into VhostUserState's acked_features (0x40000000). + * 5. The guest virtio-net module loads. QEMU calls + * virtio_net_set_features(), setting dev.acked_features to + * 0x40408000. + * 6. The restart completes. + * 7. vhost_net_init() calls vhost_user_get_acked_features(), fetches + * the saved features (0x40000000), and stores them in + * net->dev.acked_features (0x40000000). + * 8. The features (0x40000000) are set on OVS's vhostuser port. + * + * In the abnormal situation QEMU hands the wrong features to OVS's + * vhostuser port, and the VM's network goes down. Since only the + * guest features were lost from acked_features there, set + * acked_features from the VM's guest features here, exactly as + * loading the guest virtio-net module would. + */ + if (options->net_backend->peer) { + n = qemu_get_nic_opaque(options->net_backend->peer); + vdev = VIRTIO_DEVICE(n); + net->dev.backend_features = vhost_get_mask_features(vhost_net_get_feature_bits(net), + vdev->guest_features); + } else { + net->dev.backend_features = 0; + } net->dev.protocol_features = 0; net->backend = -1; @@ -403,7 +459,9 @@ int vhost_net_start(VirtIODevice *dev, NetClientState *ncs, peer = qemu_get_peer(ncs, n->max_queue_pairs); } - if (peer->vring_enable) { + /* OVS needs to restore all vring states */ + if (peer->vring_enable || + ncs[i].peer->info->type == NET_CLIENT_DRIVER_VHOST_USER) { /* restore vring enable state */ r = vhost_set_vring_enable(peer, peer->vring_enable); @@ -541,6 +599,16 @@ int vhost_set_vring_enable(NetClientState *nc, int enable) VHostNetState *net = get_vhost_net(nc); const VhostOps *vhost_ops = net->dev.vhost_ops; + /* + * vhost-vdpa network devices need to enable dataplane virtqueues after + * DRIVER_OK, so they can recover device state before starting dataplane. + * Because of that, we don't enable virtqueues here and leave it to + * net/vhost-vdpa.c.
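+ *
+ * Rough ordering sketch implied by the above (the details live in
+ * net/vhost-vdpa.c rather than here):
+ *   1. the virtqueues are set up and the guest sets DRIVER_OK
+ *   2. device state is restored (e.g. through the control virtqueue)
+ *   3. only then are the dataplane virtqueues enabled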
+ */ + if (nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA) { + return 0; + } + nc->vring_enable = enable; if (vhost_ops && vhost_ops->vhost_set_vring_enable) { diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index 80c56f0cfcf1675a1d1815489d0cfb94b525736e..25044385dc00008c71331789aa94f4ce0194895f 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -50,12 +50,11 @@ #define VIRTIO_NET_VM_VERSION 11 /* previously fixed value */ -#define VIRTIO_NET_RX_QUEUE_DEFAULT_SIZE 256 -#define VIRTIO_NET_TX_QUEUE_DEFAULT_SIZE 256 +#define VIRTIO_NET_VHOST_USER_DEFAULT_SIZE 2048 /* for now, only allow larger queue_pairs; with virtio-1, guest can downsize */ -#define VIRTIO_NET_RX_QUEUE_MIN_SIZE VIRTIO_NET_RX_QUEUE_DEFAULT_SIZE -#define VIRTIO_NET_TX_QUEUE_MIN_SIZE VIRTIO_NET_TX_QUEUE_DEFAULT_SIZE +#define VIRTIO_NET_RX_QUEUE_MIN_SIZE 256 +#define VIRTIO_NET_TX_QUEUE_MIN_SIZE 256 #define VIRTIO_NET_IP4_ADDR_SIZE 8 /* ipv4 saddr + daddr */ @@ -674,6 +673,11 @@ static void virtio_net_set_mrg_rx_bufs(VirtIONet *n, int mergeable_rx_bufs, n->mergeable_rx_bufs = mergeable_rx_bufs; + /* + * Note: when extending the vnet header, please make sure to + * change the vnet header copying logic in virtio_net_flush_tx() + * as well. + */ if (version_1) { n->guest_hdr_len = hash_report ? sizeof(struct virtio_net_hdr_v1_hash) : @@ -696,6 +700,28 @@ static void virtio_net_set_mrg_rx_bufs(VirtIONet *n, int mergeable_rx_bufs, } } +static void virtio_net_set_default_queue_size(VirtIONet *n) +{ + NetClientState *peer = n->nic_conf.peers.ncs[0]; + + /* Default value is 0 if not set */ + if (n->net_conf.rx_queue_size == 0) { + if (peer && peer->info->type == NET_CLIENT_DRIVER_VHOST_USER) { + n->net_conf.rx_queue_size = VIRTIO_NET_VHOST_USER_DEFAULT_SIZE; + } else { + n->net_conf.rx_queue_size = VIRTIO_NET_VQ_MAX_SIZE; + } + } + + if (n->net_conf.tx_queue_size == 0) { + if (peer && peer->info->type == NET_CLIENT_DRIVER_VHOST_USER) { + n->net_conf.tx_queue_size = VIRTIO_NET_VHOST_USER_DEFAULT_SIZE; + } else { + n->net_conf.tx_queue_size = VIRTIO_NET_VQ_MAX_SIZE; + } + } +} + static int virtio_net_max_tx_queue_size(VirtIONet *n) { NetClientState *peer = n->nic_conf.peers.ncs[0]; @@ -705,15 +731,16 @@ static int virtio_net_max_tx_queue_size(VirtIONet *n) * size. 
*/ if (!peer) { - return VIRTIO_NET_TX_QUEUE_DEFAULT_SIZE; + return VIRTIO_NET_VQ_MAX_SIZE; } switch(peer->info->type) { case NET_CLIENT_DRIVER_VHOST_USER: + return VIRTIO_NET_VQ_MAX_SIZE; case NET_CLIENT_DRIVER_VHOST_VDPA: - return VIRTQUEUE_MAX_SIZE; + return VIRTIO_NET_VQ_MAX_SIZE; default: - return VIRTIO_NET_TX_QUEUE_DEFAULT_SIZE; + return VIRTIO_NET_VQ_MAX_SIZE; }; } @@ -1373,17 +1400,17 @@ static uint16_t virtio_net_handle_rss(VirtIONet *n, n->rss_data.hash_types = virtio_ldl_p(vdev, &cfg.hash_types); n->rss_data.indirections_len = virtio_lduw_p(vdev, &cfg.indirection_table_mask); - n->rss_data.indirections_len++; if (!do_rss) { - n->rss_data.indirections_len = 1; + n->rss_data.indirections_len = 0; } - if (!is_power_of_2(n->rss_data.indirections_len)) { - err_msg = "Invalid size of indirection table"; + if (n->rss_data.indirections_len >= VIRTIO_NET_RSS_MAX_TABLE_LEN) { + err_msg = "Too large indirection table"; err_value = n->rss_data.indirections_len; goto error; } - if (n->rss_data.indirections_len > VIRTIO_NET_RSS_MAX_TABLE_LEN) { - err_msg = "Too large indirection table"; + n->rss_data.indirections_len++; + if (!is_power_of_2(n->rss_data.indirections_len)) { + err_msg = "Invalid size of indirection table"; err_value = n->rss_data.indirections_len; goto error; } @@ -1635,24 +1662,28 @@ static bool virtio_net_can_receive(NetClientState *nc) static int virtio_net_has_buffers(VirtIONetQueue *q, int bufsize) { + int opaque; + unsigned int in_bytes; VirtIONet *n = q->n; - if (virtio_queue_empty(q->rx_vq) || - (n->mergeable_rx_bufs && - !virtqueue_avail_bytes(q->rx_vq, bufsize, 0))) { - virtio_queue_set_notification(q->rx_vq, 1); - - /* To avoid a race condition where the guest has made some buffers - * available after the above check but before notification was - * enabled, check for available buffers again. 
- */ - if (virtio_queue_empty(q->rx_vq) || - (n->mergeable_rx_bufs && - !virtqueue_avail_bytes(q->rx_vq, bufsize, 0))) { + + while (virtio_queue_empty(q->rx_vq) || n->mergeable_rx_bufs) { + opaque = virtqueue_get_avail_bytes(q->rx_vq, &in_bytes, NULL, + bufsize, 0); + /* Buffers are sufficient, disable notification */ + if (bufsize <= in_bytes) { + break; + } + + if (virtio_queue_enable_notification_and_check(q->rx_vq, opaque)) { + /* Guest has added some buffers, try again */ + continue; + } else { return 0; } } virtio_queue_set_notification(q->rx_vq, 0); + return 1; } @@ -1904,7 +1935,8 @@ static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf, if (!no_rss && n->rss_data.enabled && n->rss_data.enabled_software_rss) { int index = virtio_net_process_rss(nc, buf, size); if (index >= 0) { - NetClientState *nc2 = qemu_get_subqueue(n->nic, index); + NetClientState *nc2 = + qemu_get_subqueue(n->nic, index % n->curr_queue_pairs); return virtio_net_receive_rcu(nc2, buf, size, true); } } @@ -1964,7 +1996,9 @@ static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf, sg, elem->in_num, offsetof(typeof(mhdr), num_buffers), sizeof(mhdr.num_buffers)); - } + } else { + mhdr.num_buffers = cpu_to_le16(1); + } receive_header(n, sg, elem->in_num, buf, size); if (n->rss_data.populate_hash) { @@ -2114,7 +2148,7 @@ static void virtio_net_rsc_purge(void *opq) chain->stat.timer++; if (!QTAILQ_EMPTY(&chain->buffers)) { timer_mod(chain->drain_timer, - qemu_clock_get_ns(QEMU_CLOCK_HOST) + chain->n->rsc_timeout); + qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + chain->n->rsc_timeout); } } @@ -2350,7 +2384,7 @@ static size_t virtio_net_rsc_do_coalesce(VirtioNetRscChain *chain, chain->stat.empty_cache++; virtio_net_rsc_cache_buf(chain, nc, buf, size); timer_mod(chain->drain_timer, - qemu_clock_get_ns(QEMU_CLOCK_HOST) + chain->n->rsc_timeout); + qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + chain->n->rsc_timeout); return size; } @@ -2588,7 +2622,7 @@ static VirtioNetRscChain *virtio_net_rsc_lookup_chain(VirtIONet *n, chain->max_payload = VIRTIO_NET_MAX_IP6_PAYLOAD; chain->gso_type = VIRTIO_NET_HDR_GSO_TCPV6; } - chain->drain_timer = timer_new_ns(QEMU_CLOCK_HOST, + chain->drain_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, virtio_net_rsc_purge, chain); memset(&chain->stat, 0, sizeof(chain->stat)); @@ -2693,7 +2727,7 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q) ssize_t ret; unsigned int out_num; struct iovec sg[VIRTQUEUE_MAX_SIZE], sg2[VIRTQUEUE_MAX_SIZE + 1], *out_sg; - struct virtio_net_hdr_mrg_rxbuf mhdr; + struct virtio_net_hdr_v1_hash vhdr; elem = virtqueue_pop(q->tx_vq, sizeof(VirtQueueElement)); if (!elem) { @@ -2704,22 +2738,18 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q) out_sg = elem->out_sg; if (out_num < 1) { virtio_error(vdev, "virtio-net header not in first element"); - virtqueue_detach_element(q->tx_vq, elem, 0); - g_free(elem); - return -EINVAL; + goto detach; } if (n->has_vnet_hdr) { - if (iov_to_buf(out_sg, out_num, 0, &mhdr, n->guest_hdr_len) < + if (iov_to_buf(out_sg, out_num, 0, &vhdr, n->guest_hdr_len) < n->guest_hdr_len) { virtio_error(vdev, "virtio-net header incorrect"); - virtqueue_detach_element(q->tx_vq, elem, 0); - g_free(elem); - return -EINVAL; + goto detach; } if (n->needs_vnet_hdr_swap) { - virtio_net_hdr_swap(vdev, (void *) &mhdr); - sg2[0].iov_base = &mhdr; + virtio_net_hdr_swap(vdev, (void *) &vhdr); + sg2[0].iov_base = &vhdr; sg2[0].iov_len = n->guest_hdr_len; out_num = iov_copy(&sg2[1], ARRAY_SIZE(sg2) - 1, out_sg, out_num, @@ -2746,6 +2776,11
@@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q) n->guest_hdr_len, -1); out_num = sg_num; out_sg = sg; + + if (out_num < 1) { + virtio_error(vdev, "virtio-net nothing to send"); + goto detach; + } } ret = qemu_sendv_packet_async(qemu_get_subqueue(n->nic, queue_index), @@ -2766,6 +2801,11 @@ drop: } } return num_packets; + +detach: + virtqueue_detach_element(q->tx_vq, elem, 0); + g_free(elem); + return -EINVAL; } static void virtio_net_tx_timer(void *opaque); @@ -2804,6 +2844,10 @@ static void virtio_net_handle_tx_bh(VirtIODevice *vdev, VirtQueue *vq) VirtIONet *n = VIRTIO_NET(vdev); VirtIONetQueue *q = &n->vqs[vq2q(virtio_get_queue_index(vq))]; + if (unlikely(n->vhost_started)) { + return; + } + if (unlikely((n->status & VIRTIO_NET_S_LINK_UP) == 0)) { virtio_net_drop_tx_queue_data(vdev, vq); return; @@ -3632,18 +3676,20 @@ static void virtio_net_device_realize(DeviceState *dev, Error **errp) virtio_net_set_config_size(n, n->host_features); virtio_init(vdev, VIRTIO_ID_NET, n->config_size); + virtio_net_set_default_queue_size(n); + /* * We set a lower limit on RX queue size to what it always was. * Guests that want a smaller ring can always resize it without * help from us (using virtio 1 and up). */ if (n->net_conf.rx_queue_size < VIRTIO_NET_RX_QUEUE_MIN_SIZE || - n->net_conf.rx_queue_size > VIRTQUEUE_MAX_SIZE || + n->net_conf.rx_queue_size > VIRTIO_NET_VQ_MAX_SIZE || !is_power_of_2(n->net_conf.rx_queue_size)) { error_setg(errp, "Invalid rx_queue_size (= %" PRIu16 "), " "must be a power of 2 between %d and %d.", n->net_conf.rx_queue_size, VIRTIO_NET_RX_QUEUE_MIN_SIZE, - VIRTQUEUE_MAX_SIZE); + VIRTIO_NET_VQ_MAX_SIZE); virtio_cleanup(vdev); return; } @@ -3933,10 +3979,8 @@ static Property virtio_net_properties[] = { TX_TIMER_INTERVAL), DEFINE_PROP_INT32("x-txburst", VirtIONet, net_conf.txburst, TX_BURST), DEFINE_PROP_STRING("tx", VirtIONet, net_conf.tx), - DEFINE_PROP_UINT16("rx_queue_size", VirtIONet, net_conf.rx_queue_size, - VIRTIO_NET_RX_QUEUE_DEFAULT_SIZE), - DEFINE_PROP_UINT16("tx_queue_size", VirtIONet, net_conf.tx_queue_size, - VIRTIO_NET_TX_QUEUE_DEFAULT_SIZE), + DEFINE_PROP_UINT16("rx_queue_size", VirtIONet, net_conf.rx_queue_size, 0), + DEFINE_PROP_UINT16("tx_queue_size", VirtIONet, net_conf.tx_queue_size, 0), DEFINE_PROP_UINT16("host_mtu", VirtIONet, net_conf.mtu, 0), DEFINE_PROP_BOOL("x-mtu-bypass-backend", VirtIONet, mtu_bypass_backend, true), @@ -3952,6 +3996,46 @@ static Property virtio_net_properties[] = { DEFINE_PROP_END_OF_LIST(), }; +static void virtio_net_print_features(uint64_t features) +{ + Property *props = virtio_net_properties; + int feature_cnt = 0; + + if (!features) { + return; + } + printf("virtio_net_feature: "); + + for (; features && props->name; props++) { + /* Except for 'csum', a bitnr of 0 is just the default and not a real feature bit. */ + if (props->bitnr == 0 && strcmp(props->name, "csum")) { + continue; + } + + /* Feature bits beyond 63 are not supported. */ + if (props->bitnr > 63) { + continue; + } + + if (virtio_has_feature(features, props->bitnr)) { + virtio_clear_feature(&features, props->bitnr); + if (feature_cnt != 0) { + printf(", "); + } + printf("%s", props->name); + feature_cnt++; + } + } + + if (features) { + if (feature_cnt != 0) { + printf(", "); + } + printf("unknown bits 0x%"
PRIx64, features); + } + printf("\n"); +} + static void virtio_net_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); @@ -3966,6 +4050,7 @@ static void virtio_net_class_init(ObjectClass *klass, void *data) vdc->set_config = virtio_net_set_config; vdc->get_features = virtio_net_get_features; vdc->set_features = virtio_net_set_features; + vdc->print_features = virtio_net_print_features; vdc->bad_features = virtio_net_bad_features; vdc->reset = virtio_net_reset; vdc->queue_reset = virtio_net_queue_reset; diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index f026245d1e9eb3a14d39f651c4a9d35c9d2cd201..6fc2a64b0e5a85470ee29def89bb959f563cdb7d 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -1516,9 +1516,16 @@ static void nvme_post_cqes(void *opaque) stl_le_p(&n->bar.csts, NVME_CSTS_FAILED); break; } + QTAILQ_REMOVE(&cq->req_list, req, entry); + nvme_inc_cq_tail(cq); nvme_sg_unmap(&req->sg); + + if (QTAILQ_EMPTY(&sq->req_list) && !nvme_sq_empty(sq)) { + qemu_bh_schedule(sq->bh); + } + QTAILQ_INSERT_TAIL(&sq->req_list, req, entry); } if (cq->tail != cq->head) { @@ -2592,6 +2599,7 @@ next: done: iocb->aiocb = NULL; iocb->common.cb(iocb->common.opaque, iocb->ret); + g_free(iocb->range); qemu_aio_unref(iocb); } @@ -2855,7 +2863,7 @@ static inline uint16_t nvme_check_copy_mcl(NvmeNamespace *ns, uint32_t nlb; nvme_copy_source_range_parse(iocb->ranges, idx, iocb->format, NULL, &nlb, NULL, NULL, NULL); - copy_len += nlb + 1; + copy_len += nlb; } if (copy_len > ns->id_ns.mcl) { @@ -4301,7 +4309,7 @@ static uint16_t nvme_io_mgmt_recv_ruhs(NvmeCtrl *n, NvmeRequest *req, nruhsd = ns->fdp.nphs * endgrp->fdp.nrg; trans_len = sizeof(NvmeRuhStatus) + nruhsd * sizeof(NvmeRuhStatusDescr); - buf = g_malloc(trans_len); + buf = g_malloc0(trans_len); trans_len = MIN(trans_len, len); @@ -4352,7 +4360,7 @@ static uint16_t nvme_io_mgmt_send_ruh_update(NvmeCtrl *n, NvmeRequest *req) NvmeNamespace *ns = req->ns; uint32_t cdw10 = le32_to_cpu(cmd->cdw10); uint16_t ret = NVME_SUCCESS; - uint32_t npid = (cdw10 >> 1) + 1; + uint32_t npid = (cdw10 >> 16) + 1; unsigned int i = 0; g_autofree uint16_t *pids = NULL; uint32_t maxnpid; @@ -5882,7 +5890,7 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req) uint32_t dw10 = le32_to_cpu(cmd->cdw10); uint32_t dw11 = le32_to_cpu(cmd->cdw11); uint32_t nsid = le32_to_cpu(cmd->nsid); - uint32_t result; + uint32_t result = 0; uint8_t fid = NVME_GETSETFEAT_FID(dw10); NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10); uint16_t iv; @@ -7574,7 +7582,6 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val) /* Completion queue doorbell write */ uint16_t new_head = val & 0xffff; - int start_sqs; NvmeCQueue *cq; qid = (addr - (0x1000 + (1 << 2))) >> 3; @@ -7625,18 +7632,15 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val) trace_pci_nvme_mmio_doorbell_cq(cq->cqid, new_head); - start_sqs = nvme_cq_full(cq) ? 
1 : 0; + /* schedule deferred CQE posting if the queue was previously full */ + if (nvme_cq_full(cq)) { + qemu_bh_schedule(cq->bh); + } + cq->head = new_head; if (!qid && n->dbbuf_enabled) { stl_le_pci_dma(pci, cq->db_addr, cq->head, MEMTXATTRS_UNSPECIFIED); } - if (start_sqs) { - NvmeSQueue *sq; - QTAILQ_FOREACH(sq, &cq->sq_list, entry) { - qemu_bh_schedule(sq->bh); - } - qemu_bh_schedule(cq->bh); - } if (cq->tail == cq->head) { if (cq->irq_enabled) { @@ -7924,7 +7928,7 @@ static void nvme_init_state(NvmeCtrl *n) n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1); QTAILQ_INIT(&n->aer_queue); - list->numcntl = cpu_to_le16(max_vfs); + list->numcntl = max_vfs; for (i = 0; i < max_vfs; i++) { sctrl = &list->sec[i]; sctrl->pcid = cpu_to_le16(n->cntlid); @@ -8466,36 +8470,26 @@ static void nvme_pci_reset(DeviceState *qdev) nvme_ctrl_reset(n, NVME_RESET_FUNCTION); } -static void nvme_sriov_pre_write_ctrl(PCIDevice *dev, uint32_t address, - uint32_t val, int len) +static void nvme_sriov_post_write_config(PCIDevice *dev, uint16_t old_num_vfs) { NvmeCtrl *n = NVME(dev); NvmeSecCtrlEntry *sctrl; - uint16_t sriov_cap = dev->exp.sriov_cap; - uint32_t off = address - sriov_cap; - int i, num_vfs; - - if (!sriov_cap) { - return; - } + int i; - if (range_covers_byte(off, len, PCI_SRIOV_CTRL)) { - if (!(val & PCI_SRIOV_CTRL_VFE)) { - num_vfs = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF); - for (i = 0; i < num_vfs; i++) { - sctrl = &n->sec_ctrl_list.sec[i]; - nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false); - } - } + for (i = pcie_sriov_num_vfs(dev); i < old_num_vfs; i++) { + sctrl = &n->sec_ctrl_list.sec[i]; + nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false); } } static void nvme_pci_write_config(PCIDevice *dev, uint32_t address, uint32_t val, int len) { - nvme_sriov_pre_write_ctrl(dev, address, val, len); + uint16_t old_num_vfs = pcie_sriov_num_vfs(dev); + pci_default_write_config(dev, address, val, len); pcie_cap_flr_write_config(dev, address, val, len); + nvme_sriov_post_write_config(dev, old_num_vfs); } static const VMStateDescription nvme_vmstate = { diff --git a/hw/nvram/fw_cfg.c b/hw/nvram/fw_cfg.c index 4e4524673a39add1b830723d4e2e6d10a86ab582..d32079ebdfab493a7eb61e9cbedc6bd2d3dbfd0b 100644 --- a/hw/nvram/fw_cfg.c +++ b/hw/nvram/fw_cfg.c @@ -729,7 +729,6 @@ static void *fw_cfg_modify_bytes_read(FWCfgState *s, uint16_t key, ptr = s->entries[arch][key].data; s->entries[arch][key].data = data; s->entries[arch][key].len = len; - s->entries[arch][key].callback_opaque = NULL; s->entries[arch][key].allow_write = false; return ptr; diff --git a/hw/pci-bridge/Kconfig b/hw/pci-bridge/Kconfig index 67077366cc7daa00b04a140627e30e79fe1fe789..449ec9864357c77ab870dd2e7f91e68ae1739cca 100644 --- a/hw/pci-bridge/Kconfig +++ b/hw/pci-bridge/Kconfig @@ -1,3 +1,8 @@ +config PCI_BRIDGE + bool + default y if PCI_DEVICES + depends on PCI + config PCIE_PORT bool default y if PCI_DEVICES diff --git a/hw/pci-bridge/gen_pcie_root_port.c b/hw/pci-bridge/gen_pcie_root_port.c index 1ce4e7bebae7ef6719d9ec932e132ba04b69fe73..1e1ab5bb19fe093daa9533f7963f032696038a2c 100644 --- a/hw/pci-bridge/gen_pcie_root_port.c +++ b/hw/pci-bridge/gen_pcie_root_port.c @@ -145,6 +145,8 @@ static Property gen_rp_props[] = { speed, PCIE_LINK_SPEED_16), DEFINE_PROP_PCIE_LINK_WIDTH("x-width", PCIESlot, width, PCIE_LINK_WIDTH_32), + DEFINE_PROP_UINT8("fast-plug", PCIESlot, fast_plug, 0), + DEFINE_PROP_UINT8("fast-unplug", PCIESlot, fast_unplug, 0), DEFINE_PROP_END_OF_LIST() }; diff --git a/hw/pci-bridge/meson.build
b/hw/pci-bridge/meson.build index 6d5ad9f37b226c143d51ff60d87f76de59c78801..a8b88e909998cb71c9e9119784cce551bc17b01b 100644 --- a/hw/pci-bridge/meson.build +++ b/hw/pci-bridge/meson.build @@ -1,5 +1,5 @@ pci_ss = ss.source_set() -pci_ss.add(files('pci_bridge_dev.c')) +pci_ss.add(when: 'CONFIG_PCI_BRIDGE', if_true: files('pci_bridge_dev.c')) pci_ss.add(when: 'CONFIG_I82801B11', if_true: files('i82801b11.c')) pci_ss.add(when: 'CONFIG_IOH3420', if_true: files('ioh3420.c')) pci_ss.add(when: 'CONFIG_PCIE_PORT', if_true: files('pcie_root_port.c', 'gen_pcie_root_port.c')) diff --git a/hw/pci-host/designware.c b/hw/pci-host/designware.c index f477f97847d753341f341281ed2abed2ce242574..004142709cd6287d7f03133c6d2987fa8b923689 100644 --- a/hw/pci-host/designware.c +++ b/hw/pci-host/designware.c @@ -360,7 +360,7 @@ static void designware_pcie_root_config_write(PCIDevice *d, uint32_t address, case DESIGNWARE_PCIE_ATU_UPPER_TARGET: viewport->target &= 0x00000000FFFFFFFFULL; - viewport->target |= val; + viewport->target |= (uint64_t)val << 32; break; case DESIGNWARE_PCIE_ATU_LIMIT: diff --git a/hw/pci-host/gpex-acpi.c b/hw/pci-host/gpex-acpi.c index 1092dc3b7082bc8cedbcb1d697b729ead355fd54..162f6221abde87690d160fc0167fd65287bbd957 100644 --- a/hw/pci-host/gpex-acpi.c +++ b/hw/pci-host/gpex-acpi.c @@ -49,9 +49,10 @@ static void acpi_dsdt_add_pci_route_table(Aml *dev, uint32_t irq) } } -static void acpi_dsdt_add_pci_osc(Aml *dev) +static void acpi_dsdt_add_pci_osc(Aml *dev, bool preserve_config) { Aml *method, *UUID, *ifctx, *ifctx1, *elsectx, *buf; + uint8_t byte_list[1] = {0}; /* Declare an _OSC (OS Control Handoff) method */ aml_append(dev, aml_name_decl("SUPP", aml_int(0))); @@ -113,10 +114,24 @@ static void acpi_dsdt_add_pci_osc(Aml *dev) UUID = aml_touuid("E5C937D0-3553-4D7A-9117-EA4D19C3434D"); ifctx = aml_if(aml_equal(aml_arg(0), UUID)); ifctx1 = aml_if(aml_equal(aml_arg(2), aml_int(0))); - uint8_t byte_list[1] = {1}; + if (preserve_config) { + /* advertise support for DSM function 5 in addition to function 0 */ + byte_list[0] = 0x21; + } buf = aml_buffer(1, byte_list); aml_append(ifctx1, aml_return(buf)); aml_append(ifctx, ifctx1); + + if (preserve_config) { + Aml *ifctx2 = aml_if(aml_equal(aml_arg(2), aml_int(5))); + /* + * 0 - The operating system must not ignore the PCI configuration that + * firmware has done at boot time.
+ */ + aml_append(ifctx2, aml_return(aml_int(0))); + aml_append(ifctx, ifctx2); + } + aml_append(method, ifctx); byte_list[0] = 0; @@ -188,7 +203,7 @@ void acpi_dsdt_add_gpex(Aml *scope, struct GPEXConfig *cfg) if (is_cxl) { build_cxl_osc_method(dev); } else { - acpi_dsdt_add_pci_osc(dev); + acpi_dsdt_add_pci_osc(dev, cfg->preserve_config); } aml_append(scope, dev); @@ -263,7 +278,7 @@ void acpi_dsdt_add_gpex(Aml *scope, struct GPEXConfig *cfg) } aml_append(dev, aml_name_decl("_CRS", rbuf)); - acpi_dsdt_add_pci_osc(dev); + acpi_dsdt_add_pci_osc(dev, cfg->preserve_config); Aml *dev_res0 = aml_device("%s", "RES0"); aml_append(dev_res0, aml_name_decl("_HID", aml_string("PNP0C02"))); diff --git a/hw/pci/pci.c b/hw/pci/pci.c index c49417abb2df4dba40dcc52e61ee94c36300d328..447ef2b16364c0de83a859845c29748d0092c3dc 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -85,6 +85,8 @@ static Property pci_props[] = { QEMU_PCIE_ERR_UNC_MASK_BITNR, true), DEFINE_PROP_BIT("x-pcie-ari-nextfn-1", PCIDevice, cap_present, QEMU_PCIE_ARI_NEXTFN_1_BITNR, false), + DEFINE_PROP_SIZE32("x-max-bounce-buffer-size", PCIDevice, + max_bounce_buffer_size, DEFAULT_MAX_BOUNCE_BUFFER_SIZE), DEFINE_PROP_END_OF_LIST() }; @@ -1201,6 +1203,8 @@ static PCIDevice *do_pci_register_device(PCIDevice *pci_dev, "bus master container", UINT64_MAX); address_space_init(&pci_dev->bus_master_as, &pci_dev->bus_master_container_region, pci_dev->name); + pci_dev->bus_master_as.max_bounce_buffer_size = + pci_dev->max_bounce_buffer_size; if (phase_check(PHASE_MACHINE_READY)) { pci_init_bus_master(pci_dev); @@ -2411,6 +2415,7 @@ static void pci_add_option_rom(PCIDevice *pdev, bool is_default_rom, snprintf(name, sizeof(name), "%s.rom", vmsd ? vmsd->name : object_get_typename(OBJECT(pdev))); + qemu_log("add rom file: %s\n", name); pdev->has_rom = true; memory_region_init_rom(&pdev->rom, OBJECT(pdev), name, pdev->romsize, &error_fatal); @@ -2657,6 +2662,10 @@ static void pci_device_class_init(ObjectClass *klass, void *data) k->unrealize = pci_qdev_unrealize; k->bus_type = TYPE_PCI_BUS; device_class_set_props(k, pci_props); + object_class_property_set_description( + klass, "x-max-bounce-buffer-size", + "Maximum buffer size allocated for bounce buffers used for mapped " + "access to indirect DMA memory"); } static void pci_device_class_base_init(ObjectClass *klass, void *data) @@ -2672,11 +2681,27 @@ static void pci_device_class_base_init(ObjectClass *klass, void *data) } } -AddressSpace *pci_device_iommu_address_space(PCIDevice *dev) +/* + * Get IOMMU root bus, aliased bus and devfn of a PCI device + * + * IOMMU root bus is needed by all call sites to call into iommu_ops. + * For call sites which don't need aliased BDF, passing NULL to + * aliased_[bus|devfn] is allowed. + * + * @piommu_bus: return root #PCIBus backed by an IOMMU for the PCI device. + * + * @aliased_bus: return aliased #PCIBus of the PCI device, optional. + * + * @aliased_devfn: return aliased devfn of the PCI device, optional. 
+ */ +static void pci_device_get_iommu_bus_devfn(PCIDevice *dev, + PCIBus **piommu_bus, + PCIBus **aliased_bus, + int *aliased_devfn) { PCIBus *bus = pci_get_bus(dev); PCIBus *iommu_bus = bus; - uint8_t devfn = dev->devfn; + int devfn = dev->devfn; while (iommu_bus && !iommu_bus->iommu_ops && iommu_bus->parent_dev) { PCIBus *parent_bus = pci_get_bus(iommu_bus->parent_dev); @@ -2717,13 +2742,79 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice *dev) iommu_bus = parent_bus; } - if (!pci_bus_bypass_iommu(bus) && iommu_bus->iommu_ops) { + + assert(0 <= devfn && devfn < PCI_DEVFN_MAX); + assert(iommu_bus); + + if (pci_bus_bypass_iommu(bus) || !iommu_bus->iommu_ops) { + iommu_bus = NULL; + } + + *piommu_bus = iommu_bus; + + if (aliased_bus) { + *aliased_bus = bus; + } + + if (aliased_devfn) { + *aliased_devfn = devfn; + } +} + +AddressSpace *pci_device_iommu_address_space(PCIDevice *dev) +{ + PCIBus *bus; + PCIBus *iommu_bus; + int devfn; + + pci_device_get_iommu_bus_devfn(dev, &iommu_bus, &bus, &devfn); + if (iommu_bus) { return iommu_bus->iommu_ops->get_address_space(bus, iommu_bus->iommu_opaque, devfn); } return &address_space_memory; } +bool pci_device_set_iommu_device(PCIDevice *dev, HostIOMMUDevice *hiod, + Error **errp) +{ + PCIBus *iommu_bus; + + /* set_iommu_device requires the device's direct BDF, not the aliased BDF */ + pci_device_get_iommu_bus_devfn(dev, &iommu_bus, NULL, NULL); + if (iommu_bus && iommu_bus->iommu_ops->set_iommu_device) { + return iommu_bus->iommu_ops->set_iommu_device(pci_get_bus(dev), + iommu_bus->iommu_opaque, + dev->devfn, hiod, errp); + } + return true; +} + +void pci_device_unset_iommu_device(PCIDevice *dev) +{ + PCIBus *iommu_bus; + + pci_device_get_iommu_bus_devfn(dev, &iommu_bus, NULL, NULL); + if (iommu_bus && iommu_bus->iommu_ops->unset_iommu_device) { + iommu_bus->iommu_ops->unset_iommu_device(pci_get_bus(dev), + iommu_bus->iommu_opaque, + dev->devfn); + } +} + +bool pci_device_get_pasid_cap(PCIDevice *dev) +{ + PCIBus *iommu_bus; + + pci_device_get_iommu_bus_devfn(dev, &iommu_bus, NULL, NULL); + if (iommu_bus && iommu_bus->iommu_ops->get_pasid_cap) { + return iommu_bus->iommu_ops->get_pasid_cap(pci_get_bus(dev), + iommu_bus->iommu_opaque, + dev->devfn); + } + return false; +} + void pci_setup_iommu(PCIBus *bus, const PCIIOMMUOps *ops, void *opaque) { /* diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c index 6db0cf69cd8a83ae2d439b3e194f4a8cb70ee244..a5b4e54bd775e02ee493a4fca2aeeaee3df9d575 100644 --- a/hw/pci/pcie.c +++ b/hw/pci/pcie.c @@ -97,13 +97,6 @@ static void pcie_cap_fill_slot_lnk(PCIDevice *dev) return; } - /* Clear and fill LNKCAP from what was configured above */ - pci_long_test_and_clear_mask(exp_cap + PCI_EXP_LNKCAP, - PCI_EXP_LNKCAP_MLW | PCI_EXP_LNKCAP_SLS); - pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP, - QEMU_PCI_EXP_LNKCAP_MLW(s->width) | - QEMU_PCI_EXP_LNKCAP_MLS(s->speed)); - /* * Link bandwidth notification is required for all root ports and * downstream ports supporting links wider than x1 or multiple link @@ -111,6 +104,12 @@ */ if (s->width > QEMU_PCI_EXP_LNK_X1 || s->speed > QEMU_PCI_EXP_LNK_2_5GT) { + /* Clear and fill LNKCAP from what was configured above */ + pci_long_test_and_clear_mask(exp_cap + PCI_EXP_LNKCAP, + PCI_EXP_LNKCAP_MLW | PCI_EXP_LNKCAP_SLS); + pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP, + QEMU_PCI_EXP_LNKCAP_MLW(s->width) | + QEMU_PCI_EXP_LNKCAP_MLS(s->speed)); pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP, PCI_EXP_LNKCAP_LBNC); }
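The pci_device_get_iommu_bus_devfn() helper introduced above centralizes one idea: climb from the device's bus toward the root until a bus that owns iommu_ops is found, updating the (bus, devfn) alias as bridges are crossed, so that the address-space lookup, set_iommu_device and get_pasid_cap can all share it. A compilable sketch of that walk follows; the types are trimmed stand-ins rather than the real PCIBus/PCIDevice, and the bridge-aliasing rules are heavily simplified (the real loop distinguishes PCIe ports from conventional bridges):

#include <stdio.h>

typedef struct BusSketch BusSketch;

typedef struct DevSketch {
    BusSketch *bus;
    int devfn;
} DevSketch;

struct BusSketch {
    const void *iommu_ops;   /* non-NULL when this bus owns an IOMMU */
    DevSketch *parent_dev;   /* bridge hosting this bus, NULL at the root */
};

/* Climb toward the root until a bus with iommu_ops is found; requests
 * crossing a bridge are seen by the IOMMU with the bridge's own devfn,
 * so the alias is updated at each hop. *piommu_bus is NULL on failure. */
static void get_iommu_bus_devfn(DevSketch *dev, BusSketch **piommu_bus,
                                BusSketch **aliased_bus, int *aliased_devfn)
{
    BusSketch *bus = dev->bus;
    BusSketch *iommu_bus = bus;
    int devfn = dev->devfn;

    while (iommu_bus && !iommu_bus->iommu_ops && iommu_bus->parent_dev) {
        devfn = iommu_bus->parent_dev->devfn;   /* alias to the bridge */
        bus = iommu_bus->parent_dev->bus;
        iommu_bus = bus;
    }

    *piommu_bus = (iommu_bus && iommu_bus->iommu_ops) ? iommu_bus : NULL;
    if (aliased_bus) {
        *aliased_bus = bus;
    }
    if (aliased_devfn) {
        *aliased_devfn = devfn;
    }
}

int main(void)
{
    BusSketch root = { .iommu_ops = "ops", .parent_dev = NULL };
    DevSketch bridge = { .bus = &root, .devfn = 8 };
    BusSketch secondary = { .iommu_ops = NULL, .parent_dev = &bridge };
    DevSketch dev = { .bus = &secondary, .devfn = 3 };
    BusSketch *iommu_bus, *abus;
    int adevfn;

    get_iommu_bus_devfn(&dev, &iommu_bus, &abus, &adevfn);
    printf("iommu bus: %s, aliased devfn: %d\n",
           iommu_bus ? "found" : "none", adevfn);   /* found, 8 */
    return 0;
}

The optional aliased_bus/aliased_devfn out-parameters mirror the patch's design choice: callers that only need the IOMMU root bus pass NULL and skip the alias bookkeeping.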
@@ -556,6 +555,7 @@ void pcie_cap_slot_unplug_request_cb(HotplugHandler *hotplug_dev, uint8_t *exp_cap = hotplug_pdev->config + hotplug_pdev->exp.exp_cap; uint32_t sltcap = pci_get_word(exp_cap + PCI_EXP_SLTCAP); uint16_t sltctl = pci_get_word(exp_cap + PCI_EXP_SLTCTL); + PCIESlot *s = PCIE_SLOT(hotplug_pdev); /* Check if hot-unplug is disabled on the slot */ if ((sltcap & PCI_EXP_SLTCAP_HPC) == 0) { @@ -601,7 +601,17 @@ void pcie_cap_slot_unplug_request_cb(HotplugHandler *hotplug_dev, return; } - pcie_cap_slot_push_attention_button(hotplug_pdev); + if ((pci_dev->cap_present & QEMU_PCIE_LNKSTA_DLLLA) && s->fast_plug) { + pci_word_test_and_clear_mask(pci_dev->config + pci_dev->exp.exp_cap + PCI_EXP_LNKSTA, + PCI_EXP_LNKSTA_DLLLA); + } + + if (s->fast_unplug) { + pcie_cap_slot_event(hotplug_pdev, + PCI_EXP_HP_EV_PDC | PCI_EXP_HP_EV_ABP); + } else { + pcie_cap_slot_push_attention_button(hotplug_pdev); + } } /* pci express slot for pci express root/downstream port @@ -1113,3 +1123,15 @@ void pcie_acs_reset(PCIDevice *dev) pci_set_word(dev->config + dev->exp.acs_cap + PCI_ACS_CTRL, 0); } } + +void pcie_pasid_init(PCIDevice *dev, uint16_t offset, uint16_t caps) +{ + pcie_add_capability(dev, PCI_EXT_CAP_ID_PASID, 1, + offset, PCI_EXT_CAP_PASID_SIZEOF); + + dev->exp.pasid_cap = offset; + + pci_set_word(dev->config + offset + PCI_PASID_CAP, caps); + + pci_set_word(dev->wmask + dev->exp.pasid_cap + PCI_PASID_CTRL, 0x7); +} diff --git a/hw/pci/pcie_sriov.c b/hw/pci/pcie_sriov.c index a1fe65f5d801dffae113f3231aadf934913bbfc9..da209b7f47fd38d33927281d79af2f4696b54ae3 100644 --- a/hw/pci/pcie_sriov.c +++ b/hw/pci/pcie_sriov.c @@ -176,6 +176,9 @@ static void register_vfs(PCIDevice *dev) assert(sriov_cap > 0); num_vfs = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF); + if (num_vfs > pci_get_word(dev->config + sriov_cap + PCI_SRIOV_TOTAL_VF)) { + return; + } dev->exp.sriov_pf.vf = g_new(PCIDevice *, num_vfs); diff --git a/hw/ppc/e500.c b/hw/ppc/e500.c index 384226296bfba2ffdcfe740296780ac65b8f854f..7722e52fa40b3fd3a95486afcf3adc930487d0b5 100644 --- a/hw/ppc/e500.c +++ b/hw/ppc/e500.c @@ -203,6 +203,8 @@ static void dt_i2c_create(void *fdt, const char *soc, const char *mpic, qemu_fdt_setprop_cells(fdt, i2c, "cell-index", 0); qemu_fdt_setprop_cells(fdt, i2c, "interrupts", irq0, 0x2); qemu_fdt_setprop_phandle(fdt, i2c, "interrupt-parent", mpic); + qemu_fdt_setprop_cell(fdt, i2c, "#size-cells", 0); + qemu_fdt_setprop_cell(fdt, i2c, "#address-cells", 1); qemu_fdt_setprop_string(fdt, "/aliases", alias, i2c); g_free(i2c); @@ -832,7 +834,7 @@ static DeviceState *ppce500_init_mpic_qemu(PPCE500MachineState *pms, } static DeviceState *ppce500_init_mpic_kvm(const PPCE500MachineClass *pmc, - IrqLines *irqs, Error **errp) + Error **errp) { #ifdef CONFIG_KVM DeviceState *dev; @@ -872,7 +874,7 @@ static DeviceState *ppce500_init_mpic(PPCE500MachineState *pms, Error *err = NULL; if (kvm_kernel_irqchip_allowed()) { - dev = ppce500_init_mpic_kvm(pmc, irqs, &err); + dev = ppce500_init_mpic_kvm(pmc, &err); } if (kvm_kernel_irqchip_required() && !dev) { error_reportf_err(err, @@ -1024,7 +1026,7 @@ void ppce500_init(MachineState *machine) sysbus_connect_irq(s, 0, qdev_get_gpio_in(mpicdev, MPC8544_I2C_IRQ)); memory_region_add_subregion(ccsr_addr_space, MPC8544_I2C_REGS_OFFSET, sysbus_mmio_get_region(s, 0)); - i2c = (I2CBus *)qdev_get_child_bus(dev, "i2c"); + i2c = I2C_BUS(qdev_get_child_bus(dev, "i2c")); i2c_slave_create_simple(i2c, "ds1338", RTC_REGS_OFFSET); /* eSDHC */ @@ -1073,7 +1075,7 @@ void 
ppce500_init(MachineState *machine) memory_region_add_subregion(ccsr_addr_space, MPC8544_PCI_REGS_OFFSET, sysbus_mmio_get_region(s, 0)); - pci_bus = (PCIBus *)qdev_get_child_bus(dev, "pci.0"); + pci_bus = PCI_BUS(qdev_get_child_bus(dev, "pci.0")); if (!pci_bus) printf("couldn't create PCI controller!\n"); diff --git a/hw/ppc/pnv_i2c.c b/hw/ppc/pnv_i2c.c index 656a48eebe51597e9fee89f4ccbceb8bd3b14804..0ac6aa5c06f055d1850cb550c87856772326155e 100644 --- a/hw/ppc/pnv_i2c.c +++ b/hw/ppc/pnv_i2c.c @@ -673,6 +673,9 @@ static void pnv_i2c_class_init(ObjectClass *klass, void *data) xscomc->dt_xscom = pnv_i2c_dt_xscom; + /* Reason: This device is part of the CPU and cannot be used separately */ + dc->user_creatable = false; + dc->desc = "PowerNV I2C"; dc->realize = pnv_i2c_realize; device_class_set_props(dc, pnv_i2c_properties); diff --git a/hw/ppc/spapr_pci_vfio.c b/hw/ppc/spapr_pci_vfio.c index f283f7e38d619fd65985146b5e1572c221f95c53..d1d07bec4644da4ae6a99d3357d6d17ff66264de 100644 --- a/hw/ppc/spapr_pci_vfio.c +++ b/hw/ppc/spapr_pci_vfio.c @@ -84,27 +84,27 @@ static int vfio_eeh_container_op(VFIOContainer *container, uint32_t op) static VFIOContainer *vfio_eeh_as_container(AddressSpace *as) { VFIOAddressSpace *space = vfio_get_address_space(as); - VFIOContainer *container = NULL; + VFIOContainerBase *bcontainer = NULL; if (QLIST_EMPTY(&space->containers)) { /* No containers to act on */ goto out; } - container = QLIST_FIRST(&space->containers); + bcontainer = QLIST_FIRST(&space->containers); - if (QLIST_NEXT(container, next)) { + if (QLIST_NEXT(bcontainer, next)) { /* * We don't yet have logic to synchronize EEH state across * multiple containers */ - container = NULL; + bcontainer = NULL; goto out; } out: vfio_put_address_space(space); - return container; + return container_of(bcontainer, VFIOContainer, bcontainer); } static bool vfio_eeh_as_ok(AddressSpace *as) diff --git a/hw/ppc/vof.c b/hw/ppc/vof.c index e3b430a81f4f195e6aaf967bc45cc80b42f3c26c..b5b6514d79fcd71afece90da18e83ed35d37d6b9 100644 --- a/hw/ppc/vof.c +++ b/hw/ppc/vof.c @@ -646,7 +646,7 @@ static void vof_dt_memory_available(void *fdt, GArray *claimed, uint64_t base) mem0_reg = fdt_getprop(fdt, offset, "reg", &proplen); g_assert(mem0_reg && proplen == sizeof(uint32_t) * (ac + sc)); if (sc == 2) { - mem0_end = be64_to_cpu(*(uint64_t *)(mem0_reg + sizeof(uint32_t) * ac)); + mem0_end = ldq_be_p(mem0_reg + sizeof(uint32_t) * ac); } else { mem0_end = be32_to_cpu(*(uint32_t *)(mem0_reg + sizeof(uint32_t) * ac)); } diff --git a/hw/remote/vfio-user-obj.c b/hw/remote/vfio-user-obj.c index 8b10c32a3c6eee5446ed3a0ee54433b8e577355e..8b708422fe5291511b94d6c62f2bfa47562396df 100644 --- a/hw/remote/vfio-user-obj.c +++ b/hw/remote/vfio-user-obj.c @@ -281,7 +281,7 @@ static ssize_t vfu_object_cfg_access(vfu_ctx_t *vfu_ctx, char * const buf, while (bytes > 0) { len = (bytes > pci_access_width) ? 
pci_access_width : bytes; if (is_write) { - memcpy(&val, ptr, len); + val = ldn_le_p(ptr, len); pci_host_config_write_common(o->pci_dev, offset, pci_config_size(o->pci_dev), val, len); @@ -289,7 +289,7 @@ static ssize_t vfu_object_cfg_access(vfu_ctx_t *vfu_ctx, char * const buf, } else { val = pci_host_config_read_common(o->pci_dev, offset, pci_config_size(o->pci_dev), len); - memcpy(ptr, &val, len); + stn_le_p(ptr, len, val); trace_vfu_cfg_read(offset, val); } offset += len; diff --git a/hw/riscv/Kconfig b/hw/riscv/Kconfig index b6a5eb4452e44fd95ac2a78118295fe4f3475d66..1e11ac9432274f26c8febea25fba09bb597aebeb 100644 --- a/hw/riscv/Kconfig +++ b/hw/riscv/Kconfig @@ -41,6 +41,7 @@ config RISCV_VIRT select RISCV_IMSIC select SIFIVE_PLIC select SIFIVE_TEST + select SMBIOS select VIRTIO_MMIO select FW_CFG_DMA select PLATFORM_BUS diff --git a/hw/riscv/virt.c b/hw/riscv/virt.c index d2eac2415619c704349a6bdad23a2fb2cf27ffd2..9b29ed1108f6ceefac3bc0ac48430783f6e8154b 100644 --- a/hw/riscv/virt.c +++ b/hw/riscv/virt.c @@ -36,6 +36,7 @@ #include "hw/riscv/boot.h" #include "hw/riscv/numa.h" #include "kvm/kvm_riscv.h" +#include "hw/firmware/smbios.h" #include "hw/intc/riscv_aclint.h" #include "hw/intc/riscv_aplic.h" #include "hw/intc/riscv_imsic.h" @@ -1249,6 +1250,45 @@ static void create_platform_bus(RISCVVirtState *s, DeviceState *irqchip) sysbus_mmio_get_region(sysbus, 0)); } +static void virt_build_smbios(RISCVVirtState *s) +{ + MachineClass *mc = MACHINE_GET_CLASS(s); + MachineState *ms = MACHINE(s); + uint8_t *smbios_tables, *smbios_anchor; + size_t smbios_tables_len, smbios_anchor_len; + struct smbios_phys_mem_area mem_array; + const char *product = "QEMU Virtual Machine"; + + if (kvm_enabled()) { + product = "KVM Virtual Machine"; + } + + smbios_set_defaults("QEMU", product, mc->name, false, + true, SMBIOS_ENTRY_POINT_TYPE_64); + + if (riscv_is_32bit(&s->soc[0])) { + smbios_set_default_processor_family(0x200); + } else { + smbios_set_default_processor_family(0x201); + } + + /* build the array of physical mem area from base_memmap */ + mem_array.address = s->memmap[VIRT_DRAM].base; + mem_array.length = ms->ram_size; + + smbios_get_tables(ms, &mem_array, 1, + &smbios_tables, &smbios_tables_len, + &smbios_anchor, &smbios_anchor_len, + &error_fatal); + + if (smbios_anchor) { + fw_cfg_add_file(s->fw_cfg, "etc/smbios/smbios-tables", + smbios_tables, smbios_tables_len); + fw_cfg_add_file(s->fw_cfg, "etc/smbios/smbios-anchor", + smbios_anchor, smbios_anchor_len); + } +} + static void virt_machine_done(Notifier *notifier, void *data) { RISCVVirtState *s = container_of(notifier, RISCVVirtState, @@ -1337,6 +1377,8 @@ static void virt_machine_done(Notifier *notifier, void *data) riscv_setup_direct_kernel(kernel_entry, fdt_load_addr); } + virt_build_smbios(s); + if (virt_is_acpi_enabled(s)) { virt_acpi_setup(s); } diff --git a/hw/rtc/ls7a_rtc.c b/hw/rtc/ls7a_rtc.c index 1f9e38a735bd75886908e7009f14a26ca1e6261d..be9546c8505d7af32603b39a9df63c295530e66d 100644 --- a/hw/rtc/ls7a_rtc.c +++ b/hw/rtc/ls7a_rtc.c @@ -145,20 +145,22 @@ static void toymatch_write(LS7ARtcState *s, uint64_t val, int num) now = qemu_clock_get_ms(rtc_clock); toymatch_val_to_time(s, val, &tm); expire_time = now + (qemu_timedate_diff(&tm) - s->offset_toy) * 1000; - timer_mod(s->toy_timer[num], expire_time); + if (expire_time > now) + timer_mod(s->toy_timer[num], expire_time); } } static void rtcmatch_write(LS7ARtcState *s, uint64_t val, int num) { - uint64_t expire_ns; + int64_t expire_ns; /* it do not support write when toy disabled */ 
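The ls7a_rtc guards in this hunk and the ones that follow all apply the same rule: compute the would-be deadline first, then arm the timer only if the deadline still lies in the future, since modding a timer with an already-elapsed deadline makes it fire immediately and raise a spurious alarm interrupt (this is also why the expiry variables are widened to signed int64_t, so a stale match value compares as past rather than wrapping). A stand-alone sketch of the pattern; the clock and timer here are illustrative stand-ins, not the QEMU API:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static int64_t now_ms = 1000;   /* stand-in for qemu_clock_get_ms() */

static void timer_arm(const char *name, int64_t deadline_ms)
{
    /* stand-in for timer_mod(): schedules a callback at deadline_ms */
    printf("%s armed for t=%lld ms\n", name, (long long)deadline_ms);
}

/* Arm only when the deadline lies ahead; a stale deadline would fire
 * the callback at once and inject a bogus alarm into the guest. */
static bool arm_if_future(const char *name, int64_t deadline_ms)
{
    if (deadline_ms <= now_ms) {
        return false;   /* match value already in the past: skip */
    }
    timer_arm(name, deadline_ms);
    return true;
}

int main(void)
{
    arm_if_future("toy_timer", 900);    /* skipped: 900 <= 1000 */
    arm_if_future("toy_timer", 1500);   /* armed */
    return 0;
}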
if (rtc_enabled(s)) { s->rtcmatch[num] = val; /* calculate expire time */ expire_ns = ticks_to_ns(val) - ticks_to_ns(s->offset_rtc); - timer_mod_ns(s->rtc_timer[num], expire_ns); + if (expire_ns > 0) + timer_mod_ns(s->rtc_timer[num], expire_ns); } } @@ -185,7 +187,7 @@ static void ls7a_rtc_stop(LS7ARtcState *s) static void ls7a_toy_start(LS7ARtcState *s) { int i; - uint64_t expire_time, now; + int64_t expire_time, now; struct tm tm = {}; now = qemu_clock_get_ms(rtc_clock); @@ -194,19 +196,21 @@ static void ls7a_toy_start(LS7ARtcState *s) for (i = 0; i < TIMER_NUMS; i++) { toymatch_val_to_time(s, s->toymatch[i], &tm); expire_time = now + (qemu_timedate_diff(&tm) - s->offset_toy) * 1000; - timer_mod(s->toy_timer[i], expire_time); + if (expire_time > now) + timer_mod(s->toy_timer[i], expire_time); } } static void ls7a_rtc_start(LS7ARtcState *s) { int i; - uint64_t expire_time; + int64_t expire_time; /* recalculate expire time and enable timer */ for (i = 0; i < TIMER_NUMS; i++) { expire_time = ticks_to_ns(s->rtcmatch[i]) - ticks_to_ns(s->offset_rtc); - timer_mod_ns(s->rtc_timer[i], expire_time); + if (expire_time > 0) + timer_mod_ns(s->rtc_timer[i], expire_time); } } @@ -370,7 +374,7 @@ static void toy_timer_cb(void *opaque) LS7ARtcState *s = opaque; if (toy_enabled(s)) { - qemu_irq_raise(s->irq); + qemu_irq_pulse(s->irq); } } @@ -379,7 +383,7 @@ static void rtc_timer_cb(void *opaque) LS7ARtcState *s = opaque; if (rtc_enabled(s)) { - qemu_irq_raise(s->irq); + qemu_irq_pulse(s->irq); } } diff --git a/hw/rtc/mc146818rtc.c b/hw/rtc/mc146818rtc.c index 2d391a83969e92529125a9f3f7ab846d7f84cabf..e61c76d0605ed9e5e1cafaf816bc14a4955a6d23 100644 --- a/hw/rtc/mc146818rtc.c +++ b/hw/rtc/mc146818rtc.c @@ -606,7 +606,8 @@ static void rtc_set_time(MC146818RtcState *s) s->base_rtc = mktimegm(&tm); s->last_update = qemu_clock_get_ns(rtc_clock); - qapi_event_send_rtc_change(qemu_timedate_diff(&tm), qom_path); + set_rtc_date_diff(qemu_timedate_diff(&tm)); + qapi_event_send_rtc_change(get_rtc_date_diff(), qom_path); } static void rtc_set_cmos(MC146818RtcState *s, const struct tm *tm) diff --git a/hw/rtc/pl031.c b/hw/rtc/pl031.c index b01d0e75d1af40334289df94795fa5e5e3c513e6..57e9a3561663bce6e8002f26bbb2fc6ef0978735 100644 --- a/hw/rtc/pl031.c +++ b/hw/rtc/pl031.c @@ -63,6 +63,15 @@ static uint32_t pl031_get_count(PL031State *s) return s->tick_offset + now / NANOSECONDS_PER_SECOND; } +static void pl031_get_date(Object *obj, struct tm *current_tm, Error **errp) +{ + PL031State *s = PL031(obj); + time_t ti = pl031_get_count(s); + + /* Changed to UTC time */ + gmtime_r(&ti, current_tm); +} + static void pl031_set_alarm(PL031State *s) { uint32_t ticks; @@ -144,7 +153,8 @@ static void pl031_write(void * opaque, hwaddr offset, s->tick_offset += value - pl031_get_count(s); qemu_get_timedate(&tm, s->tick_offset); - qapi_event_send_rtc_change(qemu_timedate_diff(&tm), qom_path); + set_rtc_date_diff(qemu_timedate_diff(&tm)); + qapi_event_send_rtc_change(get_rtc_date_diff(), qom_path); pl031_set_alarm(s); break; @@ -201,6 +211,20 @@ static void pl031_init(Object *obj) qemu_clock_get_ns(rtc_clock) / NANOSECONDS_PER_SECOND; s->timer = timer_new_ns(rtc_clock, pl031_interrupt, s); + object_property_add_tm(OBJECT(s), "date", pl031_get_date); +} + +static void pl031_realize(DeviceState *d, Error **errp) +{ + object_property_add_alias(qdev_get_machine(), "rtc-time", + OBJECT(d), "date"); +} + +static void pl031_unrealize(DeviceState *d) +{ + if (object_property_find(qdev_get_machine(), "rtc-time")) { + 
object_property_del(qdev_get_machine(), "rtc-time"); + } } static void pl031_finalize(Object *obj) @@ -337,6 +361,8 @@ static void pl031_class_init(ObjectClass *klass, void *data) DeviceClass *dc = DEVICE_CLASS(klass); dc->vmsd = &vmstate_pl031; + dc->realize = pl031_realize; + dc->unrealize = pl031_unrealize; device_class_set_props(dc, pl031_properties); } diff --git a/hw/s390x/Kconfig b/hw/s390x/Kconfig index 4c068d7960b973474b84d3cef9d4a3bf31ca4f9f..26ad1044858ce9ba3e78c65ffb9f295c31d7ab4c 100644 --- a/hw/s390x/Kconfig +++ b/hw/s390x/Kconfig @@ -6,6 +6,7 @@ config S390_CCW_VIRTIO imply VFIO_CCW imply WDT_DIAG288 imply PCIE_DEVICES + imply IOMMUFD select PCI_EXPRESS select S390_FLIC select S390_FLIC_KVM if KVM diff --git a/hw/s390x/sclp.c b/hw/s390x/sclp.c index 893e71a41b2ddd88a007bd7d395895fc4d444540..69bf04e23ab01db6c0a5c37321d62069e1872ccb 100644 --- a/hw/s390x/sclp.c +++ b/hw/s390x/sclp.c @@ -21,13 +21,14 @@ #include "hw/s390x/s390-pci-bus.h" #include "hw/s390x/ipl.h" #include "hw/s390x/cpu-topology.h" +#include "hw/s390x/s390-virtio-ccw.h" -static inline SCLPDevice *get_sclp_device(void) +static SCLPDevice *get_sclp_device(void) { static SCLPDevice *sclp; if (!sclp) { - sclp = SCLP(object_resolve_path_type("", TYPE_SCLP, NULL)); + sclp = S390_CCW_MACHINE(qdev_get_machine())->sclp; } return sclp; } diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c index fc4b77fdb02707fa51585b511ed016ffb7315bdc..7b60ac11f550ba80fcdd78b5381c479b5b3e88b7 100644 --- a/hw/scsi/scsi-bus.c +++ b/hw/scsi/scsi-bus.c @@ -144,14 +144,10 @@ void scsi_bus_init_named(SCSIBus *bus, size_t bus_size, DeviceState *host, qbus_set_bus_hotplug_handler(BUS(bus)); } -static void scsi_dma_restart_bh(void *opaque) +void scsi_retry_requests(SCSIDevice *s) { - SCSIDevice *s = opaque; SCSIRequest *req, *next; - qemu_bh_delete(s->bh); - s->bh = NULL; - aio_context_acquire(blk_get_aio_context(s->conf.blk)); QTAILQ_FOREACH_SAFE(req, &s->requests, next, next) { scsi_req_ref(req); @@ -171,6 +167,17 @@ static void scsi_dma_restart_bh(void *opaque) scsi_req_unref(req); } aio_context_release(blk_get_aio_context(s->conf.blk)); +} + +static void scsi_dma_restart_bh(void *opaque) +{ + SCSIDevice *s = opaque; + + qemu_bh_delete(s->bh); + s->bh = NULL; + + scsi_retry_requests(s); + /* Drop the reference that was acquired in scsi_dma_restart_cb */ object_unref(OBJECT(s)); } diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c index 6691f5edb841b744a25a090d9c5f2d294a18ec5d..282bdc1ee0372cdd26a9def1b5bd8aa5a7b63d72 100644 --- a/hw/scsi/scsi-disk.c +++ b/hw/scsi/scsi-disk.c @@ -249,12 +249,16 @@ static bool scsi_handle_rw_error(SCSIDiskReq *r, int ret, bool acct_failed) scsi_req_retry(&r->req); return true; + case BLOCK_ERROR_ACTION_RETRY: + scsi_req_retry(&r->req); + return true; + default: g_assert_not_reached(); } } -static bool scsi_disk_req_check_error(SCSIDiskReq *r, int ret, bool acct_failed) +static bool scsi_disk_req_handle_error(SCSIDiskReq *r, int ret, bool acct_failed) { if (r->req.io_canceled) { scsi_req_cancel_complete(&r->req); @@ -268,6 +272,18 @@ static bool scsi_disk_req_check_error(SCSIDiskReq *r, int ret, bool acct_failed) return false; } +static bool scsi_disk_req_check_error(SCSIDiskReq *r, int ret, bool acct_failed) +{ + SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev); + + if (r->req.io_canceled || ret < 0) { + return scsi_disk_req_handle_error(r, ret, acct_failed); + } + + blk_error_retry_reset_timeout(s->qdev.conf.blk); + return false; +} + static void scsi_aio_complete(void *opaque, int ret) { 
SCSIDiskReq *r = (SCSIDiskReq *)opaque; @@ -416,7 +432,7 @@ static void scsi_do_read(SCSIDiskReq *r, int ret) SCSIDiskClass *sdc = (SCSIDiskClass *) object_get_class(OBJECT(s)); assert (r->req.aiocb == NULL); - if (scsi_disk_req_check_error(r, ret, false)) { + if (scsi_disk_req_handle_error(r, ret, false)) { goto done; } @@ -457,6 +473,9 @@ static void scsi_do_read_cb(void *opaque, int ret) block_acct_failed(blk_get_stats(s->qdev.conf.blk), &r->acct); } else { block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct); + if (!r->req.io_canceled) { + blk_error_retry_reset_timeout(s->qdev.conf.blk); + } } scsi_do_read(opaque, ret); aio_context_release(blk_get_aio_context(s->qdev.conf.blk)); @@ -2021,7 +2040,10 @@ static int32_t scsi_disk_emulate_command(SCSIRequest *req, uint8_t *buf) memset(outbuf, 0, r->buflen); switch (req->cmd.buf[0]) { case TEST_UNIT_READY: - assert(blk_is_available(s->qdev.conf.blk)); + if (!blk_is_available(s->qdev.conf.blk)) { + scsi_check_condition(r, SENSE_CODE(NO_MEDIUM)); + return 0; + } break; case INQUIRY: buflen = scsi_disk_emulate_inquiry(req, outbuf); @@ -2391,6 +2413,13 @@ static void scsi_disk_resize_cb(void *opaque) } } +static void scsi_disk_retry_request(void *opaque) +{ + SCSIDiskState *s = opaque; + + scsi_retry_requests(&s->qdev); +} + static void scsi_cd_change_media_cb(void *opaque, bool load, Error **errp) { SCSIDiskState *s = opaque; @@ -2440,12 +2469,14 @@ static const BlockDevOps scsi_disk_removable_block_ops = { .is_medium_locked = scsi_cd_is_medium_locked, .is_tray_open = scsi_cd_is_tray_open, .resize_cb = scsi_disk_resize_cb, + .retry_request_cb = scsi_disk_retry_request, }; static const BlockDevOps scsi_disk_block_ops = { .drained_begin = scsi_disk_drained_begin, .drained_end = scsi_disk_drained_end, .resize_cb = scsi_disk_resize_cb, + .retry_request_cb = scsi_disk_retry_request, }; static void scsi_disk_unit_attention_reported(SCSIDevice *dev) @@ -3241,9 +3272,7 @@ static const TypeInfo scsi_cd_info = { #ifdef __linux__ static Property scsi_block_properties[] = { - DEFINE_BLOCK_ERROR_PROPERTIES(SCSIDiskState, qdev.conf), - DEFINE_PROP_DRIVE("drive", SCSIDiskState, qdev.conf.blk), - DEFINE_PROP_BOOL("share-rw", SCSIDiskState, qdev.conf.share_rw, false), + DEFINE_SCSI_DISK_PROPERTIES(), DEFINE_PROP_UINT16("rotation_rate", SCSIDiskState, rotation_rate, 0), DEFINE_PROP_UINT64("max_unmap_size", SCSIDiskState, max_unmap_size, DEFAULT_MAX_UNMAP_SIZE), diff --git a/hw/scsi/scsi-generic.c b/hw/scsi/scsi-generic.c index 2417f0ad84799059a626d41099cd537bf4bdb836..12fdd8e748734ca84bb33d4234a11f7cef8a1498 100644 --- a/hw/scsi/scsi-generic.c +++ b/hw/scsi/scsi-generic.c @@ -192,6 +192,10 @@ static int scsi_handle_inquiry_reply(SCSIGenericReq *r, SCSIDevice *s, int len) (r->req.cmd.buf[1] & 0x01)) { page = r->req.cmd.buf[2]; if (page == 0xb0 && r->buflen >= 8) { + if (s->blocksize == 0) { + qemu_log("device blocksize is 0!\n"); + abort(); + } uint8_t buf[16] = {}; uint8_t buf_used = MIN(r->buflen, 16); uint64_t max_transfer = calculate_max_transfer(s); @@ -326,11 +330,23 @@ static void scsi_read_complete(void * opaque, int ret) /* Snoop READ CAPACITY output to set the blocksize. */ if (r->req.cmd.buf[0] == READ_CAPACITY_10 && (ldl_be_p(&r->buf[0]) != 0xffffffffU || s->max_lba == 0)) { - s->blocksize = ldl_be_p(&r->buf[4]); + int new_blocksize = ldl_be_p(&r->buf[4]); + if (s->blocksize != new_blocksize) { + qemu_log("device id=%s type=%d: blocksize %d change to %d\n", + s->qdev.id ? 
s->qdev.id : "null", s->type, + s->blocksize, new_blocksize); + } + s->blocksize = new_blocksize; s->max_lba = ldl_be_p(&r->buf[0]) & 0xffffffffULL; } else if (r->req.cmd.buf[0] == SERVICE_ACTION_IN_16 && (r->req.cmd.buf[1] & 31) == SAI_READ_CAPACITY_16) { - s->blocksize = ldl_be_p(&r->buf[8]); + int new_blocksize = ldl_be_p(&r->buf[8]); + if (s->blocksize != new_blocksize) { + qemu_log("device id=%s type=%d: blocksize %d change to %d\n", + s->qdev.id ? s->qdev.id : "null", s->type, + s->blocksize, new_blocksize); + } + s->blocksize = new_blocksize; s->max_lba = ldq_be_p(&r->buf[0]); } @@ -766,7 +782,6 @@ static void scsi_generic_realize(SCSIDevice *s, Error **errp) /* Only used by scsi-block, but initialize it nevertheless to be clean. */ s->default_scsi_version = -1; - s->io_timeout = DEFAULT_IO_TIMEOUT; scsi_generic_read_device_inquiry(s); } diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c index 9c751bf29699697e40a24bc5149583f9380b7cee..bc7feb404a3a32dbaf8b365e5960d4843c002aaa 100644 --- a/hw/scsi/virtio-scsi.c +++ b/hw/scsi/virtio-scsi.c @@ -781,7 +781,7 @@ static int virtio_scsi_handle_cmd_req_prepare(VirtIOSCSI *s, VirtIOSCSIReq *req) req->req.cmd.tag, req->req.cmd.cdb[0]); d = virtio_scsi_device_get(s, req->req.cmd.lun); - if (!d) { + if (!d || !d->qdev.realized) { req->resp.cmd.response = VIRTIO_SCSI_S_BAD_TARGET; virtio_scsi_complete_cmd_req(req); return -ENOENT; diff --git a/hw/sd/sdhci-pci.c b/hw/sd/sdhci-pci.c index 9b7bee8b3fbf21a31646e37c28f288b494d06734..c1eb67cf29257446a398b28c8ad43f1f367da77e 100644 --- a/hw/sd/sdhci-pci.c +++ b/hw/sd/sdhci-pci.c @@ -18,6 +18,7 @@ #include "qemu/osdep.h" #include "qapi/error.h" #include "qemu/module.h" +#include "hw/irq.h" #include "hw/qdev-properties.h" #include "hw/sd/sdhci.h" #include "sdhci-internal.h" @@ -49,6 +50,7 @@ static void sdhci_pci_exit(PCIDevice *dev) { SDHCIState *s = PCI_SDHCI(dev); + qemu_free_irq(s->irq); sdhci_common_unrealize(s); sdhci_uninitfn(s); } diff --git a/hw/sd/sdhci.c b/hw/sd/sdhci.c index 40473b0db099a6289181ffe2795e19ccbe532ae9..e95ea34895dbf47a312ad16b7be68af41c84ae19 100644 --- a/hw/sd/sdhci.c +++ b/hw/sd/sdhci.c @@ -473,6 +473,7 @@ static uint32_t sdhci_read_dataport(SDHCIState *s, unsigned size) } for (i = 0; i < size; i++) { + assert(s->data_count < s->buf_maxsz); value |= s->fifo_buffer[s->data_count] << i * 8; s->data_count++; /* check if we've read all valid data (blksize bytes) from buffer */ @@ -561,6 +562,7 @@ static void sdhci_write_dataport(SDHCIState *s, uint32_t value, unsigned size) } for (i = 0; i < size; i++) { + assert(s->data_count < s->buf_maxsz); s->fifo_buffer[s->data_count] = value & 0xFF; s->data_count++; value >>= 8; @@ -1208,6 +1210,12 @@ sdhci_write(void *opaque, hwaddr offset, uint64_t val, unsigned size) if (!(s->capareg & R_SDHC_CAPAB_SDMA_MASK)) { value &= ~SDHC_TRNS_DMA; } + + /* TRNMOD writes are inhibited while Command Inhibit (DAT) is true */ + if (s->prnsts & SDHC_DATA_INHIBIT) { + mask |= 0xffff; + } + MASKED_WRITE(s->trnmod, mask, value & SDHC_TRNMOD_MASK); MASKED_WRITE(s->cmdreg, mask >> 16, value >> 16); diff --git a/hw/smbios/smbios.c b/hw/smbios/smbios.c index 2a90601ac5d93ca1cf2e07d5a722b614939ab71d..c0c5a81e664bbfa70be6e6b965a3d6ef6530186c 100644 --- a/hw/smbios/smbios.c +++ b/hw/smbios/smbios.c @@ -102,6 +102,7 @@ static struct { #define DEFAULT_CPU_SPEED 2000 static struct { + uint16_t processor_family; const char *sock_pfx, *manufacturer, *version, *serial, *asset, *part; uint64_t max_speed; uint64_t current_speed; @@ -110,6 +111,7 @@ static struct { 
.max_speed = DEFAULT_CPU_SPEED, .current_speed = DEFAULT_CPU_SPEED, .processor_id = 0, + .processor_family = 0x01, /* Other */ }; struct type8_instance { @@ -337,6 +339,10 @@ static const QemuOptDesc qemu_smbios_type4_opts[] = { .name = "part", .type = QEMU_OPT_STRING, .help = "part number", + }, { + .name = "processor-family", + .type = QEMU_OPT_NUMBER, + .help = "processor family", }, { .name = "processor-id", .type = QEMU_OPT_NUMBER, @@ -726,7 +732,7 @@ static void smbios_build_type_4_table(MachineState *ms, unsigned instance) snprintf(sock_str, sizeof(sock_str), "%s%2x", type4.sock_pfx, instance); SMBIOS_TABLE_SET_STR(4, socket_designation_str, sock_str); t->processor_type = 0x03; /* CPU */ - t->processor_family = 0x01; /* Other */ + t->processor_family = 0xfe; /* use Processor Family 2 field */ SMBIOS_TABLE_SET_STR(4, processor_manufacturer_str, type4.manufacturer); if (type4.processor_id == 0) { t->processor_id[0] = cpu_to_le32(smbios_cpuid_version); @@ -758,7 +764,7 @@ static void smbios_build_type_4_table(MachineState *ms, unsigned instance) t->thread_count = (threads_per_socket > 255) ? 0xFF : threads_per_socket; t->processor_characteristics = cpu_to_le16(0x02); /* Unknown */ - t->processor_family2 = cpu_to_le16(0x01); /* Other */ + t->processor_family2 = cpu_to_le16(type4.processor_family); if (tbl_len == SMBIOS_TYPE_4_LEN_V30) { t->core_count2 = t->core_enabled2 = cpu_to_le16(cores_per_socket); @@ -983,6 +989,13 @@ void smbios_set_cpuid(uint32_t version, uint32_t features) field = value; \ } +void smbios_set_default_processor_family(uint16_t processor_family) +{ + if (type4.processor_family <= 0x01) { + type4.processor_family = processor_family; + } +} + void smbios_set_defaults(const char *manufacturer, const char *product, const char *version, bool legacy_mode, bool uuid_encoded, SmbiosEntryPointType ep_type) @@ -1402,6 +1415,9 @@ void smbios_entry_add(QemuOpts *opts, Error **errp) return; } save_opt(&type4.sock_pfx, opts, "sock_pfx"); + type4.processor_family = qemu_opt_get_number(opts, + "processor-family", + 0x01 /* Other */); save_opt(&type4.manufacturer, opts, "manufacturer"); save_opt(&type4.version, opts, "version"); save_opt(&type4.serial, opts, "serial"); diff --git a/hw/ssi/aspeed_smc.c b/hw/ssi/aspeed_smc.c index 2a4001b774a2f5c66a6c794763e99276b2bb1d94..8af919a970401566080a8567786faf2a821ef6e1 100644 --- a/hw/ssi/aspeed_smc.c +++ b/hw/ssi/aspeed_smc.c @@ -764,8 +764,7 @@ static uint8_t aspeed_smc_hclk_divisor(uint8_t hclk_mask) } } - aspeed_smc_error("invalid HCLK mask %x", hclk_mask); - return 0; + g_assert_not_reached(); } /* diff --git a/hw/timer/exynos4210_mct.c b/hw/timer/exynos4210_mct.c index 446bbd2b96cf3c65d8c15b1ddb13c39f2937065f..6f47bfe2c29b8d874e17a95756886dc21e5f23d6 100644 --- a/hw/timer/exynos4210_mct.c +++ b/hw/timer/exynos4210_mct.c @@ -815,7 +815,7 @@ static uint32_t exynos4210_ltick_cnt_get_cnto(struct tick_timer *s) /* Both are counting */ icnto = remain / s->tcntb; if (icnto) { - tcnto = remain % (icnto * s->tcntb); + tcnto = remain % ((uint64_t)icnto * s->tcntb); } else { tcnto = remain % s->tcntb; } diff --git a/hw/ufs/ufs.c b/hw/ufs/ufs.c index eccdb852a0a7ecefe3b7cd8c5a2919572a326272..f57d33e77169fb197e3afbd58110fd28ea8fe3e5 100644 --- a/hw/ufs/ufs.c +++ b/hw/ufs/ufs.c @@ -25,6 +25,7 @@ #include "qapi/error.h" #include "migration/vmstate.h" #include "scsi/constants.h" +#include "hw/irq.h" #include "trace.h" #include "ufs.h" @@ -126,6 +127,10 @@ static MemTxResult ufs_dma_read_req_upiu(UfsRequest *req) copy_size = sizeof(UtpUpiuHeader) + 
UFS_TRANSACTION_SPECIFIC_FIELD_SIZE + data_segment_length; + if (copy_size > sizeof(req->req_upiu)) { + copy_size = sizeof(req->req_upiu); + } + ret = ufs_addr_read(u, req_upiu_base_addr, &req->req_upiu, copy_size); if (ret) { trace_ufs_err_dma_read_req_upiu(req->slot, req_upiu_base_addr); @@ -225,6 +230,10 @@ static MemTxResult ufs_dma_write_rsp_upiu(UfsRequest *req) copy_size = rsp_upiu_byte_len; } + if (copy_size > sizeof(req->rsp_upiu)) { + copy_size = sizeof(req->rsp_upiu); + } + ret = ufs_addr_write(u, rsp_upiu_base_addr, &req->rsp_upiu, copy_size); if (ret) { trace_ufs_err_dma_write_rsp_upiu(req->slot, rsp_upiu_base_addr); @@ -447,6 +456,14 @@ void ufs_build_upiu_header(UfsRequest *req, uint8_t trans_type, uint8_t flags, req->rsp_upiu.header.data_segment_length = cpu_to_be16(data_segment_length); } +void ufs_build_query_response(UfsRequest *req) +{ + req->rsp_upiu.qr.opcode = req->req_upiu.qr.opcode; + req->rsp_upiu.qr.idn = req->req_upiu.qr.idn; + req->rsp_upiu.qr.index = req->req_upiu.qr.index; + req->rsp_upiu.qr.selector = req->req_upiu.qr.selector; +} + static UfsReqResult ufs_exec_scsi_cmd(UfsRequest *req) { UfsHc *u = req->hc; @@ -923,10 +940,6 @@ static QueryRespCode ufs_read_desc(UfsRequest *req) if (length > req->rsp_upiu.qr.data[0]) { length = req->rsp_upiu.qr.data[0]; } - req->rsp_upiu.qr.opcode = req->req_upiu.qr.opcode; - req->rsp_upiu.qr.idn = req->req_upiu.qr.idn; - req->rsp_upiu.qr.index = req->req_upiu.qr.index; - req->rsp_upiu.qr.selector = req->req_upiu.qr.selector; req->rsp_upiu.qr.length = cpu_to_be16(length); return status; @@ -1007,6 +1020,7 @@ static UfsReqResult ufs_exec_query_cmd(UfsRequest *req) data_segment_length = be16_to_cpu(req->rsp_upiu.qr.length); ufs_build_upiu_header(req, UFS_UPIU_TRANSACTION_QUERY_RSP, 0, status, 0, data_segment_length); + ufs_build_query_response(req); if (status != UFS_QUERY_RESULT_SUCCESS) { return UFS_REQUEST_FAIL; @@ -1273,6 +1287,8 @@ static void ufs_exit(PCIDevice *pci_dev) { UfsHc *u = UFS(pci_dev); + qemu_free_irq(u->irq); + qemu_bh_delete(u->doorbell_bh); qemu_bh_delete(u->complete_bh); diff --git a/hw/ufs/ufs.h b/hw/ufs/ufs.h index 8fda94f4efa9e4d2d8a5b8041528195eb74f8378..8a74b4c2ab7280f8128fc0263902a24708c0a1e6 100644 --- a/hw/ufs/ufs.h +++ b/hw/ufs/ufs.h @@ -132,6 +132,7 @@ static inline bool is_wlun(uint8_t lun) void ufs_build_upiu_header(UfsRequest *req, uint8_t trans_type, uint8_t flags, uint8_t response, uint8_t scsi_status, uint16_t data_segment_length); +void ufs_build_query_response(UfsRequest *req); void ufs_complete_req(UfsRequest *req, UfsReqResult req_result); void ufs_init_wlu(UfsLu *wlu, uint8_t wlun); #endif /* HW_UFS_UFS_H */ diff --git a/hw/usb/bus.c b/hw/usb/bus.c index 92d6ed5626141f8622c55bedb9dc23326b3338c8..20cd9b6e6fd9e70689bfe756ff44a2ff8a013fca 100644 --- a/hw/usb/bus.c +++ b/hw/usb/bus.c @@ -536,6 +536,10 @@ void usb_check_attach(USBDevice *dev, Error **errp) bus->qbus.name, port->path, portspeed); return; } + + qemu_log("attach usb device \"%s\" (%s speed) to VM bus \"%s\", " + "port \"%s\" (%s speed)\n", dev->product_desc, devspeed, + bus->qbus.name, port->path, portspeed); } void usb_device_attach(USBDevice *dev, Error **errp) @@ -564,6 +568,8 @@ int usb_device_detach(USBDevice *dev) usb_detach(port); dev->attached = false; + qemu_log("detach usb device \"%s\" from VM bus \"%s\", port \"%s\"\n", + dev->product_desc, bus->qbus.name, port->path); return 0; } diff --git a/hw/usb/core.c b/hw/usb/core.c index 975f76250a1a34b79252975b13724368577240f4..51b36126cab25b5c9854f157b09ce14c233b2e51 
100644 --- a/hw/usb/core.c +++ b/hw/usb/core.c @@ -87,7 +87,7 @@ void usb_device_reset(USBDevice *dev) return; } usb_device_handle_reset(dev); - dev->remote_wakeup = 0; + dev->remote_wakeup &= ~USB_DEVICE_REMOTE_WAKEUP; dev->addr = 0; dev->state = USB_STATE_DEFAULT; } @@ -105,7 +105,8 @@ void usb_wakeup(USBEndpoint *ep, unsigned int stream) */ return; } - if (dev->remote_wakeup && dev->port && dev->port->ops->wakeup) { + if ((dev->remote_wakeup & USB_DEVICE_REMOTE_WAKEUP) + && dev->port && dev->port->ops->wakeup) { dev->port->ops->wakeup(dev->port); } if (bus->ops->wakeup_endpoint) { diff --git a/hw/usb/desc.c b/hw/usb/desc.c index f2bdc05a95394c6c5f942dd2aad67dc124ede069..333f73fff150f0a5fd1d015eac1bb1943286f6a8 100644 --- a/hw/usb/desc.c +++ b/hw/usb/desc.c @@ -752,7 +752,7 @@ int usb_desc_handle_control(USBDevice *dev, USBPacket *p, if (config->bmAttributes & USB_CFG_ATT_SELFPOWER) { data[0] |= 1 << USB_DEVICE_SELF_POWERED; } - if (dev->remote_wakeup) { + if (dev->remote_wakeup & USB_DEVICE_REMOTE_WAKEUP) { data[0] |= 1 << USB_DEVICE_REMOTE_WAKEUP; } data[1] = 0x00; @@ -762,14 +762,15 @@ int usb_desc_handle_control(USBDevice *dev, USBPacket *p, } case DeviceOutRequest | USB_REQ_CLEAR_FEATURE: if (value == USB_DEVICE_REMOTE_WAKEUP) { - dev->remote_wakeup = 0; + dev->remote_wakeup &= ~USB_DEVICE_REMOTE_WAKEUP; ret = 0; } trace_usb_clear_device_feature(dev->addr, value, ret); break; case DeviceOutRequest | USB_REQ_SET_FEATURE: + dev->remote_wakeup |= USB_DEVICE_REMOTE_WAKEUP_IS_SUPPORTED; if (value == USB_DEVICE_REMOTE_WAKEUP) { - dev->remote_wakeup = 1; + dev->remote_wakeup |= USB_DEVICE_REMOTE_WAKEUP; ret = 0; } trace_usb_set_device_feature(dev->addr, value, ret); diff --git a/hw/usb/dev-hid.c b/hw/usb/dev-hid.c index bdd6d1ffafe432b75a5321305c5d0193e3e27bb1..cc68d1ce9eec463610429d5802ae5c23a6b74a5d 100644 --- a/hw/usb/dev-hid.c +++ b/hw/usb/dev-hid.c @@ -745,7 +745,7 @@ static int usb_ptr_post_load(void *opaque, int version_id) { USBHIDState *s = opaque; - if (s->dev.remote_wakeup) { + if (s->dev.remote_wakeup & USB_DEVICE_REMOTE_WAKEUP) { hid_pointer_activate(&s->hid); } return 0; diff --git a/hw/usb/dev-hub.c b/hw/usb/dev-hub.c index 5703e0e826ec6484c5b7e92b248656fd382414eb..7b3cfa2c1b5de9de73230c9348f98f6a3cd1f8a6 100644 --- a/hw/usb/dev-hub.c +++ b/hw/usb/dev-hub.c @@ -479,6 +479,7 @@ static void usb_hub_handle_control(USBDevice *dev, USBPacket *p, usb_hub_port_clear(port, PORT_STAT_SUSPEND); port->wPortChange = 0; } + break; default: goto fail; } diff --git a/hw/usb/hcd-ehci.c b/hw/usb/hcd-ehci.c index 19b4534c20c10d78d5e3d1fdca7aefc168e9265a..fa8c7af5c81d00cc783fa9d8b9c66df1efb688a2 100644 --- a/hw/usb/hcd-ehci.c +++ b/hw/usb/hcd-ehci.c @@ -1086,8 +1086,9 @@ static void ehci_opreg_write(void *ptr, hwaddr addr, case CONFIGFLAG: val &= 0x1; if (val) { - for(i = 0; i < NB_PORTS; i++) + for (i = 0; i < NB_PORTS; i++) { handle_port_owner_write(s, i, 0); + } } break; @@ -2286,7 +2287,8 @@ static void ehci_work_bh(void *opaque) ehci_update_frindex(ehci, skipped_uframes); ehci->last_run_ns += UFRAME_TIMER_NS * skipped_uframes; uframes -= skipped_uframes; - DPRINTF("WARNING - EHCI skipped %d uframes\n", skipped_uframes); + DPRINTF("WARNING - EHCI skipped %"PRIu64" uframes\n", + skipped_uframes); } for (i = 0; i < uframes; i++) { diff --git a/hw/usb/hcd-uhci.c b/hw/usb/hcd-uhci.c index 77baaa7a6b1099927c420decfc956e6f74429349..a92581ff5f883632d7a167a8e183ba3d1d5943d4 100644 --- a/hw/usb/hcd-uhci.c +++ b/hw/usb/hcd-uhci.c @@ -44,6 +44,8 @@ #include "hcd-uhci.h" #define FRAME_TIMER_FREQ 1000 
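The UHCI changes beginning here replace the fixed FRAME_TIMER_FREQ period with one derived at runtime (NANOSECONDS_PER_SECOND / uhci_frame_timer_freq), letting the controller idle at a lazy 10 Hz and jump back to the nominal 1000 Hz when an attached device needs it. Whenever the frequency flips, the pending deadline is rebased by subtracting the old period and adding the new one, as the uhci_frame_timer() hunk below does. A small stand-alone sketch of that arithmetic (names are illustrative):

#include <stdint.h>
#include <stdio.h>

#define NS_PER_SECOND 1000000000LL

/* Rebase a pending deadline after a tick-frequency change so the next
 * tick lands one new-period after the frame that was just processed. */
static int64_t rebase_deadline(int64_t expire_ns, int64_t old_freq_hz,
                               int64_t new_freq_hz)
{
    int64_t old_period = NS_PER_SECOND / old_freq_hz;
    int64_t new_period = NS_PER_SECOND / new_freq_hz;

    return expire_ns - old_period + new_period;
}

int main(void)
{
    /* switching from the lazy 10 Hz rate to the nominal 1000 Hz rate */
    int64_t expire = 5 * NS_PER_SECOND + NS_PER_SECOND / 10;

    printf("rebased deadline: %lld ns\n",
           (long long)rebase_deadline(expire, 10, 1000));
    return 0;
}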
+#define FRAME_TIMER_FREQ_LAZY 10 +#define USB_DEVICE_NEED_NORMAL_FREQ "QEMU USB Tablet" #define FRAME_MAX_LOOPS 256 @@ -109,6 +111,22 @@ static void uhci_async_cancel(UHCIAsync *async); static void uhci_queue_fill(UHCIQueue *q, UHCI_TD *td); static void uhci_resume(void *opaque); +static int64_t uhci_frame_timer_freq = FRAME_TIMER_FREQ_LAZY; + +static void uhci_set_frame_freq(int freq) +{ + if (freq <= 0) { + return; + } + + uhci_frame_timer_freq = freq; +} + +static qemu_usb_controller qemu_uhci = { + .name = "uhci", + .qemu_set_freq = uhci_set_frame_freq, +}; + static inline int32_t uhci_queue_token(UHCI_TD *td) { if ((td->token & (0xf << 15)) == 0) { @@ -351,7 +369,7 @@ static int uhci_post_load(void *opaque, int version_id) if (version_id < 2) { s->expire_time = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + - (NANOSECONDS_PER_SECOND / FRAME_TIMER_FREQ); + (NANOSECONDS_PER_SECOND / uhci_frame_timer_freq); } return 0; } @@ -392,8 +410,29 @@ static void uhci_port_write(void *opaque, hwaddr addr, if ((val & UHCI_CMD_RS) && !(s->cmd & UHCI_CMD_RS)) { /* start frame processing */ trace_usb_uhci_schedule_start(); - s->expire_time = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + - (NANOSECONDS_PER_SECOND / FRAME_TIMER_FREQ); + + /* + * If the frame_timer frequency is too low, a guest OS (e.g. Win2012) can + * blue-screen after some vCPUs are hot-plugged. + * If the USB device supports remote wakeup, the UHCI controller enters + * global suspend when there is no input for several seconds, and QEMU + * then deletes the frame_timer; with the timer gone there is no + * performance cost to the VM, so the frequency can safely be raised to + * 1000 here and takes effect the next time the frame_timer is armed. + * Apart from this path, there are two other ways the frequency changes: + * 1) VNC connect/disconnect; 2) USB device attach/detach.
+ */ + if ((uhci_frame_timer_freq != FRAME_TIMER_FREQ) + && (s->ports[0].port.dev) + && (!memcmp(s->ports[0].port.dev->product_desc, + USB_DEVICE_NEED_NORMAL_FREQ, strlen(USB_DEVICE_NEED_NORMAL_FREQ))) + && (s->ports[0].port.dev->remote_wakeup & USB_DEVICE_REMOTE_WAKEUP_IS_SUPPORTED)) { + qemu_log("turn up the frequency of the UHCI controller to %d\n", FRAME_TIMER_FREQ); + uhci_frame_timer_freq = FRAME_TIMER_FREQ; + } + + s->frame_time = NANOSECONDS_PER_SECOND / FRAME_TIMER_FREQ; + s->expire_time = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + s->frame_time; timer_mod(s->frame_timer, s->expire_time); s->status &= ~UHCI_STS_HCHALTED; } else if (!(val & UHCI_CMD_RS)) { @@ -457,8 +496,9 @@ static void uhci_port_write(void *opaque, hwaddr addr, int n; n = (addr >> 1) & 7; - if (n >= NB_PORTS) + if (n >= NB_PORTS) { return; + } port = &s->ports[n]; dev = port->port.dev; if (dev && dev->attached) { @@ -513,8 +553,9 @@ static uint64_t uhci_port_read(void *opaque, hwaddr addr, unsigned size) UHCIPort *port; int n; n = (addr >> 1) & 7; - if (n >= NB_PORTS) + if (n >= NB_PORTS) { goto read_default; + } port = &s->ports[n]; val = port->ctrl; } @@ -1081,7 +1122,6 @@ static void uhci_frame_timer(void *opaque) UHCIState *s = opaque; uint64_t t_now, t_last_run; int i, frames; - const uint64_t frame_t = NANOSECONDS_PER_SECOND / FRAME_TIMER_FREQ; s->completions_only = false; qemu_bh_cancel(s->bh); @@ -1097,14 +1137,14 @@ static void uhci_frame_timer(void *opaque) } /* We still store expire_time in our state, for migration */ - t_last_run = s->expire_time - frame_t; + t_last_run = s->expire_time - s->frame_time; t_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); /* Process up to MAX_FRAMES_PER_TICK frames */ - frames = (t_now - t_last_run) / frame_t; + frames = (t_now - t_last_run) / s->frame_time; if (frames > s->maxframes) { int skipped = frames - s->maxframes; - s->expire_time += skipped * frame_t; + s->expire_time += skipped * s->frame_time; s->frnum = (s->frnum + skipped) & 0x7ff; frames -= skipped; } @@ -1121,7 +1161,7 @@ /* The spec says frnum is the frame currently being processed, and * the guest must look at frnum - 1 on interrupt, so inc frnum now */ s->frnum = (s->frnum + 1) & 0x7ff; - s->expire_time += frame_t; + s->expire_time += s->frame_time; } /* Complete the previous frame(s) */ @@ -1132,7 +1172,12 @@ } s->pending_int_mask = 0; - timer_mod(s->frame_timer, t_now + frame_t); + /* expire_time was computed from the previous frame_time; rebase it onto + * the new frame_time, which equals + * NANOSECONDS_PER_SECOND / uhci_frame_timer_freq */ + s->expire_time -= s->frame_time - NANOSECONDS_PER_SECOND / uhci_frame_timer_freq; + s->frame_time = NANOSECONDS_PER_SECOND / uhci_frame_timer_freq; + timer_mod(s->frame_timer, t_now + s->frame_time); } static const MemoryRegionOps uhci_ioport_ops = { @@ -1193,8 +1238,10 @@ void usb_uhci_common_realize(PCIDevice *dev, Error **errp) s->bh = qemu_bh_new_guarded(uhci_bh, s, &DEVICE(dev)->mem_reentrancy_guard); s->frame_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, uhci_frame_timer, s); s->num_ports_vmstate = NB_PORTS; + s->frame_time = NANOSECONDS_PER_SECOND / uhci_frame_timer_freq; QTAILQ_INIT(&s->queues); + qemu_register_usb_controller(&qemu_uhci, QEMU_USB_CONTROLLER_UHCI); memory_region_init_io(&s->io_bar, OBJECT(s), &uhci_ioport_ops, s, "uhci", 0x20); diff --git a/hw/usb/hcd-uhci.h index
69f8b40c49c2febf063a0f3ee3e5fd677e3f7a9a..091871991133c3299db55e05089175c1dc7cecdc 100644 --- a/hw/usb/hcd-uhci.h +++ b/hw/usb/hcd-uhci.h @@ -50,6 +50,7 @@ typedef struct UHCIState { uint16_t status; uint16_t intr; /* interrupt enable register */ uint16_t frnum; /* frame number */ + uint64_t frame_time; /* frame time in ns */ uint32_t fl_base_addr; /* frame list base address */ uint8_t sof_timing; uint8_t status2; /* bit 0 and 1 are used to generate UHCI_STS_USBINT */ diff --git a/hw/usb/host-libusb.c b/hw/usb/host-libusb.c index d7060a42d57e072303e509fde25eae91318847e6..11a246ac7223db529cda44320f6118f62c9048b9 100644 --- a/hw/usb/host-libusb.c +++ b/hw/usb/host-libusb.c @@ -945,6 +945,30 @@ static void usb_host_ep_update(USBHostDevice *s) libusb_free_config_descriptor(conf); } +static unsigned int usb_get_controller_type(int speed) +{ + unsigned int type = MAX_USB_CONTROLLER_TYPES; + + switch (speed) { + case USB_SPEED_SUPER: + type = QEMU_USB_CONTROLLER_XHCI; + break; + case USB_SPEED_HIGH: + type = QEMU_USB_CONTROLLER_EHCI; + break; + case USB_SPEED_FULL: + type = QEMU_USB_CONTROLLER_UHCI; + break; + case USB_SPEED_LOW: + type = QEMU_USB_CONTROLLER_OHCI; + break; + default: + break; + } + + return type; +} + static int usb_host_open(USBHostDevice *s, libusb_device *dev, int hostfd) { USBDevice *udev = USB_DEVICE(s); @@ -968,6 +992,8 @@ static int usb_host_open(USBHostDevice *s, libusb_device *dev, int hostfd) rc = libusb_open(dev, &s->dh); if (rc != 0) { + qemu_log("libusb open usb device bus %d, device %d failed\n", + bus_num, addr); goto fail; } } else { @@ -995,6 +1021,7 @@ static int usb_host_open(USBHostDevice *s, libusb_device *dev, int hostfd) libusb_get_device_descriptor(dev, &s->ddesc); usb_host_get_port(s->dev, s->port, sizeof(s->port)); + qemu_log("open a host usb device on bus %d, device %d\n", bus_num, addr); usb_ep_init(udev); usb_host_ep_update(s); @@ -1054,6 +1081,12 @@ static int usb_host_open(USBHostDevice *s, libusb_device *dev, int hostfd) } trace_usb_host_open_success(bus_num, addr); + + /* change ehci frame time freq when USB passthrough */ + qemu_log("usb host speed is %d\n", udev->speed); + qemu_timer_set_mode(QEMU_TIMER_USB_NORMAL_MODE, + usb_get_controller_type(udev->speed)); + return 0; fail: @@ -1116,6 +1149,8 @@ static int usb_host_close(USBHostDevice *s) usb_device_detach(udev); } + qemu_log("begin to reset the usb device, bus : %d, device : %d\n", + s->bus_num, s->addr); usb_host_release_interfaces(s); libusb_reset_device(s->dh); usb_host_attach_kernel(s); @@ -1129,6 +1164,8 @@ static int usb_host_close(USBHostDevice *s) } usb_host_auto_check(NULL); + qemu_timer_set_mode(QEMU_TIMER_USB_LAZY_MODE, + usb_get_controller_type(udev->speed)); return 0; } diff --git a/hw/vfio/Kconfig b/hw/vfio/Kconfig index 7cdba0560aa821c88d3420b36f86020575834202..5f0d3c2d2bfae4acbc76b91bca73ec02d0184e55 100644 --- a/hw/vfio/Kconfig +++ b/hw/vfio/Kconfig @@ -41,3 +41,9 @@ config VFIO_IGD bool default y if PC_PCI depends on VFIO_PCI + +config VFIO_HCT + bool + default y + select VFIO + depends on LINUX && PCI diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c index bbf69ff55ae8a922d0ab6d6f966a9a2283cdd2a3..6b2bc325493fbb3a33fc48a11b3869686f8dfaaf 100644 --- a/hw/vfio/ap.c +++ b/hw/vfio/ap.c @@ -11,10 +11,12 @@ */ #include "qemu/osdep.h" +#include CONFIG_DEVICES /* CONFIG_IOMMUFD */ #include #include #include "qapi/error.h" #include "hw/vfio/vfio-common.h" +#include "sysemu/iommufd.h" #include "hw/s390x/ap-device.h" #include "qemu/error-report.h" #include "qemu/event_notifier.h" @@ -158,18 
+160,9 @@ static void vfio_ap_realize(DeviceState *dev, Error **errp) VFIOAPDevice *vapdev = VFIO_AP_DEVICE(dev); VFIODevice *vbasedev = &vapdev->vdev; - vbasedev->name = g_path_get_basename(vbasedev->sysfsdev); - vbasedev->ops = &vfio_ap_ops; - vbasedev->type = VFIO_DEVICE_TYPE_AP; - vbasedev->dev = dev; - - /* - * vfio-ap devices operate in a way compatible with discarding of - * memory in RAM blocks, as no pages are pinned in the host. - * This needs to be set before vfio_get_device() for vfio common to - * handle ram_block_discard_disable(). - */ - vapdev->vdev.ram_block_discard_allowed = true; + if (vfio_device_get_name(vbasedev, errp) < 0) { + return; + } ret = vfio_attach_device(vbasedev->name, vbasedev, &address_space_memory, errp); @@ -204,6 +197,10 @@ static void vfio_ap_unrealize(DeviceState *dev) static Property vfio_ap_properties[] = { DEFINE_PROP_STRING("sysfsdev", VFIOAPDevice, vdev.sysfsdev), +#ifdef CONFIG_IOMMUFD + DEFINE_PROP_LINK("iommufd", VFIOAPDevice, vdev.iommufd, + TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *), +#endif DEFINE_PROP_END_OF_LIST(), }; @@ -224,11 +221,39 @@ static const VMStateDescription vfio_ap_vmstate = { .unmigratable = 1, }; +static void vfio_ap_instance_init(Object *obj) +{ + VFIOAPDevice *vapdev = VFIO_AP_DEVICE(obj); + VFIODevice *vbasedev = &vapdev->vdev; + + /* + * vfio-ap devices operate in a way compatible with discarding of + * memory in RAM blocks, as no pages are pinned in the host. + * This needs to be set before vfio_get_device() for vfio common to + * handle ram_block_discard_disable(). + */ + vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_AP, &vfio_ap_ops, + DEVICE(vapdev), true); + + /* AP device is mdev type device */ + vbasedev->mdev = true; +} + +#ifdef CONFIG_IOMMUFD +static void vfio_ap_set_fd(Object *obj, const char *str, Error **errp) +{ + vfio_device_set_fd(&VFIO_AP_DEVICE(obj)->vdev, str, errp); +} +#endif + static void vfio_ap_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); device_class_set_props(dc, vfio_ap_properties); +#ifdef CONFIG_IOMMUFD + object_class_property_add_str(klass, "fd", NULL, vfio_ap_set_fd); +#endif dc->vmsd = &vfio_ap_vmstate; dc->desc = "VFIO-based AP device assignment"; set_bit(DEVICE_CATEGORY_MISC, dc->categories); @@ -243,6 +268,7 @@ static const TypeInfo vfio_ap_info = { .name = TYPE_VFIO_AP_DEVICE, .parent = TYPE_AP_DEVICE, .instance_size = sizeof(VFIOAPDevice), + .instance_init = vfio_ap_instance_init, .class_init = vfio_ap_class_init, }; diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c index d857bb8d0fe4ff71b3cf635d6da43eccbe593d5f..257e9723cf58ab852d039178c43b4f2456765aeb 100644 --- a/hw/vfio/ccw.c +++ b/hw/vfio/ccw.c @@ -15,12 +15,14 @@ */ #include "qemu/osdep.h" +#include CONFIG_DEVICES /* CONFIG_IOMMUFD */ #include #include #include #include "qapi/error.h" #include "hw/vfio/vfio-common.h" +#include "sysemu/iommufd.h" #include "hw/s390x/s390-ccw.h" #include "hw/s390x/vfio-ccw.h" #include "hw/qdev-properties.h" @@ -588,22 +590,9 @@ static void vfio_ccw_realize(DeviceState *dev, Error **errp) } } - vbasedev->ops = &vfio_ccw_ops; - vbasedev->type = VFIO_DEVICE_TYPE_CCW; - vbasedev->name = g_strdup_printf("%x.%x.%04x", vcdev->cdev.hostid.cssid, - vcdev->cdev.hostid.ssid, - vcdev->cdev.hostid.devid); - vbasedev->dev = dev; - - /* - * All vfio-ccw devices are believed to operate in a way compatible with - * discarding of memory in RAM blocks, ie. 
pages pinned in the host are - * in the current working set of the guest driver and therefore never - * overlap e.g., with pages available to the guest balloon driver. This - * needs to be set before vfio_get_device() for vfio common to handle - * ram_block_discard_disable(). - */ - vbasedev->ram_block_discard_allowed = true; + if (vfio_device_get_name(vbasedev, errp) < 0) { + return; + } ret = vfio_attach_device(cdev->mdevid, vbasedev, &address_space_memory, errp); @@ -677,6 +666,10 @@ static void vfio_ccw_unrealize(DeviceState *dev) static Property vfio_ccw_properties[] = { DEFINE_PROP_STRING("sysfsdev", VFIOCCWDevice, vdev.sysfsdev), DEFINE_PROP_BOOL("force-orb-pfch", VFIOCCWDevice, force_orb_pfch, false), +#ifdef CONFIG_IOMMUFD + DEFINE_PROP_LINK("iommufd", VFIOCCWDevice, vdev.iommufd, + TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *), +#endif DEFINE_PROP_END_OF_LIST(), }; @@ -685,12 +678,42 @@ static const VMStateDescription vfio_ccw_vmstate = { .unmigratable = 1, }; +static void vfio_ccw_instance_init(Object *obj) +{ + VFIOCCWDevice *vcdev = VFIO_CCW(obj); + VFIODevice *vbasedev = &vcdev->vdev; + + /* CCW device is mdev type device */ + vbasedev->mdev = true; + + /* + * All vfio-ccw devices are believed to operate in a way compatible with + * discarding of memory in RAM blocks, ie. pages pinned in the host are + * in the current working set of the guest driver and therefore never + * overlap e.g., with pages available to the guest balloon driver. This + * needs to be set before vfio_get_device() for vfio common to handle + * ram_block_discard_disable(). + */ + vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_CCW, &vfio_ccw_ops, + DEVICE(vcdev), true); +} + +#ifdef CONFIG_IOMMUFD +static void vfio_ccw_set_fd(Object *obj, const char *str, Error **errp) +{ + vfio_device_set_fd(&VFIO_CCW(obj)->vdev, str, errp); +} +#endif + static void vfio_ccw_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); S390CCWDeviceClass *cdc = S390_CCW_DEVICE_CLASS(klass); device_class_set_props(dc, vfio_ccw_properties); +#ifdef CONFIG_IOMMUFD + object_class_property_add_str(klass, "fd", NULL, vfio_ccw_set_fd); +#endif dc->vmsd = &vfio_ccw_vmstate; dc->desc = "VFIO-based subchannel assignment"; set_bit(DEVICE_CATEGORY_MISC, dc->categories); @@ -708,6 +731,7 @@ static const TypeInfo vfio_ccw_info = { .name = TYPE_VFIO_CCW, .parent = TYPE_S390_CCW, .instance_size = sizeof(VFIOCCWDevice), + .instance_init = vfio_ccw_instance_init, .class_init = vfio_ccw_class_init, }; diff --git a/hw/vfio/common.c b/hw/vfio/common.c index e70fdf5e0cacf756fb8dcce2bfd906c65ae71684..0be63c5fbc574b040ed797cb6411243807f1e537 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -145,7 +145,7 @@ void vfio_unblock_multiple_devices_migration(void) bool vfio_viommu_preset(VFIODevice *vbasedev) { - return vbasedev->container->space->as != &address_space_memory; + return vbasedev->bcontainer->space->as != &address_space_memory; } static void vfio_set_migration_error(int err) @@ -177,7 +177,7 @@ bool vfio_device_state_is_precopy(VFIODevice *vbasedev) migration->device_state == VFIO_DEVICE_STATE_PRE_COPY_P2P; } -static bool vfio_devices_all_dirty_tracking(VFIOContainer *container) +static bool vfio_devices_all_dirty_tracking(VFIOContainerBase *bcontainer) { VFIODevice *vbasedev; MigrationState *ms = migrate_get_current(); @@ -187,7 +187,7 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainer *container) return false; } - QLIST_FOREACH(vbasedev, &container->device_list, container_next) { + QLIST_FOREACH(vbasedev, 
&bcontainer->device_list, container_next) { VFIOMigration *migration = vbasedev->migration; if (!migration) { @@ -203,11 +203,14 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainer *container) return true; } -bool vfio_devices_all_device_dirty_tracking(VFIOContainer *container) +bool vfio_devices_all_device_dirty_tracking(const VFIOContainerBase *bcontainer) { VFIODevice *vbasedev; - QLIST_FOREACH(vbasedev, &container->device_list, container_next) { + QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { + if (vbasedev->device_dirty_page_tracking == ON_OFF_AUTO_OFF) { + return false; + } if (!vbasedev->dirty_pages_supported) { return false; } @@ -220,7 +223,8 @@ bool vfio_devices_all_device_dirty_tracking(VFIOContainer *container) * Check if all VFIO devices are running and migration is active, which is * essentially equivalent to the migration being in pre-copy phase. */ -bool vfio_devices_all_running_and_mig_active(VFIOContainer *container) +bool +vfio_devices_all_running_and_mig_active(const VFIOContainerBase *bcontainer) { VFIODevice *vbasedev; @@ -228,7 +232,7 @@ bool vfio_devices_all_running_and_mig_active(VFIOContainer *container) return false; } - QLIST_FOREACH(vbasedev, &container->device_list, container_next) { + QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { VFIOMigration *migration = vbasedev->migration; if (!migration) { @@ -292,7 +296,7 @@ static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) { VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); - VFIOContainer *container = giommu->container; + VFIOContainerBase *bcontainer = giommu->bcontainer; hwaddr iova = iotlb->iova + giommu->iommu_offset; void *vaddr; int ret; @@ -322,21 +326,22 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) * of vaddr will always be there, even if the memory object is * destroyed and its backing memory munmap-ed. */ - ret = vfio_dma_map(container, iova, - iotlb->addr_mask + 1, vaddr, - read_only); + ret = vfio_container_dma_map(bcontainer, iova, + iotlb->addr_mask + 1, vaddr, + read_only); if (ret) { - error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", " + error_report("vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx", %p) = %d (%s)", - container, iova, + bcontainer, iova, iotlb->addr_mask + 1, vaddr, ret, strerror(-ret)); } } else { - ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1, iotlb); + ret = vfio_container_dma_unmap(bcontainer, iova, + iotlb->addr_mask + 1, iotlb); if (ret) { - error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " + error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx") = %d (%s)", - container, iova, + bcontainer, iova, iotlb->addr_mask + 1, ret, strerror(-ret)); vfio_set_migration_error(ret); } @@ -350,14 +355,15 @@ static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl, { VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener, listener); + VFIOContainerBase *bcontainer = vrdl->bcontainer; const hwaddr size = int128_get64(section->size); const hwaddr iova = section->offset_within_address_space; int ret; /* Unmap with a single call. 
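 * For example, a 1 GiB virtio-mem section is torn down here by a single vfio_container_dma_unmap(bcontainer, iova, size, NULL) spanning the whole section, rather than one unmap per discarded block.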
*/ - ret = vfio_dma_unmap(vrdl->container, iova, size , NULL); + ret = vfio_container_dma_unmap(bcontainer, iova, size , NULL); if (ret) { - error_report("%s: vfio_dma_unmap() failed: %s", __func__, + error_report("%s: vfio_container_dma_unmap() failed: %s", __func__, strerror(-ret)); } } @@ -367,6 +373,7 @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl, { VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener, listener); + VFIOContainerBase *bcontainer = vrdl->bcontainer; const hwaddr end = section->offset_within_region + int128_get64(section->size); hwaddr start, next, iova; @@ -385,8 +392,8 @@ section->offset_within_address_space; vaddr = memory_region_get_ram_ptr(section->mr) + start; - ret = vfio_dma_map(vrdl->container, iova, next - start, - vaddr, section->readonly); + ret = vfio_container_dma_map(bcontainer, iova, next - start, + vaddr, section->readonly); if (ret) { /* Rollback */ vfio_ram_discard_notify_discard(rdl, section); @@ -396,7 +403,7 @@ return 0; } -static void vfio_register_ram_discard_listener(VFIOContainer *container, +static void vfio_register_ram_discard_listener(VFIOContainerBase *bcontainer, MemoryRegionSection *section) { RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); @@ -409,7 +416,7 @@ static void vfio_register_ram_discard_listener(VFIOContainer *container, g_assert(QEMU_IS_ALIGNED(int128_get64(section->size), TARGET_PAGE_SIZE)); vrdl = g_new0(VFIORamDiscardListener, 1); - vrdl->container = container; + vrdl->bcontainer = bcontainer; vrdl->mr = section->mr; vrdl->offset_within_address_space = section->offset_within_address_space; vrdl->size = int128_get64(section->size); @@ -417,14 +424,14 @@ section->mr); g_assert(vrdl->granularity && is_power_of_2(vrdl->granularity)); - g_assert(container->pgsizes && - vrdl->granularity >= 1ULL << ctz64(container->pgsizes)); + g_assert(bcontainer->pgsizes && + vrdl->granularity >= 1ULL << ctz64(bcontainer->pgsizes)); ram_discard_listener_init(&vrdl->listener, vfio_ram_discard_notify_populate, vfio_ram_discard_notify_discard, true); ram_discard_manager_register_listener(rdm, &vrdl->listener, section); - QLIST_INSERT_HEAD(&container->vrdl_list, vrdl, next); + QLIST_INSERT_HEAD(&bcontainer->vrdl_list, vrdl, next); /* * Sanity-check if we have a theoretically problematic setup where we could @@ -439,7 +446,7 @@ static void vfio_register_ram_discard_listener(VFIOContainer *container, * number of sections in the address space we could have over time, * also consuming DMA mappings. */ - if (container->dma_max_mappings) { + if (bcontainer->dma_max_mappings) { unsigned int vrdl_count = 0, vrdl_mappings = 0, max_memslots = 512; #ifdef CONFIG_KVM @@ -448,7 +455,7 @@ } #endif - QLIST_FOREACH(vrdl, &container->vrdl_list, next) { + QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) { hwaddr start, end; start = QEMU_ALIGN_DOWN(vrdl->offset_within_address_space, @@ -460,23 +467,23 @@ } if (vrdl_mappings + max_memslots - vrdl_count > - container->dma_max_mappings) { + bcontainer->dma_max_mappings) { warn_report("%s: possibly running out of DMA mappings. E.g., try" " increasing the 'block-size' of virtio-mem devices."
" Maximum possible DMA mappings: %d, Maximum possible" - " memslots: %d", __func__, container->dma_max_mappings, + " memslots: %d", __func__, bcontainer->dma_max_mappings, max_memslots); } } } -static void vfio_unregister_ram_discard_listener(VFIOContainer *container, +static void vfio_unregister_ram_discard_listener(VFIOContainerBase *bcontainer, MemoryRegionSection *section) { RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); VFIORamDiscardListener *vrdl = NULL; - QLIST_FOREACH(vrdl, &container->vrdl_list, next) { + QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) { if (vrdl->mr == section->mr && vrdl->offset_within_address_space == section->offset_within_address_space) { @@ -538,7 +545,7 @@ static bool vfio_listener_valid_section(MemoryRegionSection *section, return true; } -static bool vfio_get_section_iova_range(VFIOContainer *container, +static bool vfio_get_section_iova_range(VFIOContainerBase *bcontainer, MemoryRegionSection *section, hwaddr *out_iova, hwaddr *out_end, Int128 *out_llend) @@ -566,7 +573,8 @@ static bool vfio_get_section_iova_range(VFIOContainer *container, static void vfio_listener_region_add(MemoryListener *listener, MemoryRegionSection *section) { - VFIOContainer *container = container_of(listener, VFIOContainer, listener); + VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, + listener); hwaddr iova, end; Int128 llend, llsize; void *vaddr; @@ -577,7 +585,8 @@ static void vfio_listener_region_add(MemoryListener *listener, return; } - if (!vfio_get_section_iova_range(container, section, &iova, &end, &llend)) { + if (!vfio_get_section_iova_range(bcontainer, section, &iova, &end, + &llend)) { if (memory_region_is_ram_device(section->mr)) { trace_vfio_listener_region_add_no_dma_map( memory_region_name(section->mr), @@ -588,7 +597,7 @@ static void vfio_listener_region_add(MemoryListener *listener, return; } - if (vfio_container_add_section_window(container, section, &err)) { + if (vfio_container_add_section_window(bcontainer, section, &err)) { goto fail; } @@ -610,7 +619,7 @@ static void vfio_listener_region_add(MemoryListener *listener, giommu->iommu_mr = iommu_mr; giommu->iommu_offset = section->offset_within_address_space - section->offset_within_region; - giommu->container = container; + giommu->bcontainer = bcontainer; llend = int128_add(int128_make64(section->offset_within_region), section->size); llend = int128_sub(llend, int128_one()); @@ -623,16 +632,17 @@ static void vfio_listener_region_add(MemoryListener *listener, iommu_idx); ret = memory_region_iommu_set_page_size_mask(giommu->iommu_mr, - container->pgsizes, + bcontainer->pgsizes, &err); if (ret) { g_free(giommu); goto fail; } - if (container->iova_ranges) { + if (bcontainer->iova_ranges) { ret = memory_region_iommu_set_iova_ranges(giommu->iommu_mr, - container->iova_ranges, &err); + bcontainer->iova_ranges, + &err); if (ret) { g_free(giommu); goto fail; @@ -645,7 +655,7 @@ static void vfio_listener_region_add(MemoryListener *listener, g_free(giommu); goto fail; } - QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next); + QLIST_INSERT_HEAD(&bcontainer->giommu_list, giommu, giommu_next); memory_region_iommu_replay(giommu->iommu_mr, &giommu->n); return; @@ -659,7 +669,7 @@ static void vfio_listener_region_add(MemoryListener *listener, * about changes. 
*/ if (memory_region_has_ram_discard_manager(section->mr)) { - vfio_register_ram_discard_listener(container, section); + vfio_register_ram_discard_listener(bcontainer, section); return; } @@ -672,7 +682,7 @@ static void vfio_listener_region_add(MemoryListener *listener, llsize = int128_sub(llend, int128_make64(iova)); if (memory_region_is_ram_device(section->mr)) { - hwaddr pgmask = (1ULL << ctz64(container->pgsizes)) - 1; + hwaddr pgmask = (1ULL << ctz64(bcontainer->pgsizes)) - 1; if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) { trace_vfio_listener_region_add_no_dma_map( @@ -684,12 +694,12 @@ static void vfio_listener_region_add(MemoryListener *listener, } } - ret = vfio_dma_map(container, iova, int128_get64(llsize), - vaddr, section->readonly); + ret = vfio_container_dma_map(bcontainer, iova, int128_get64(llsize), + vaddr, section->readonly); if (ret) { - error_setg(&err, "vfio_dma_map(%p, 0x%"HWADDR_PRIx", " + error_setg(&err, "vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx", %p) = %d (%s)", - container, iova, int128_get64(llsize), vaddr, ret, + bcontainer, iova, int128_get64(llsize), vaddr, ret, strerror(-ret)); if (memory_region_is_ram_device(section->mr)) { /* Allow unexpected mappings not to be fatal for RAM devices */ @@ -711,9 +721,9 @@ fail: * can gracefully fail. Runtime, there's not much we can do other * than throw a hardware error. */ - if (!container->initialized) { - if (!container->error) { - error_propagate_prepend(&container->error, err, + if (!bcontainer->initialized) { + if (!bcontainer->error) { + error_propagate_prepend(&bcontainer->error, err, "Region %s: ", memory_region_name(section->mr)); } else { @@ -728,7 +738,8 @@ fail: static void vfio_listener_region_del(MemoryListener *listener, MemoryRegionSection *section) { - VFIOContainer *container = container_of(listener, VFIOContainer, listener); + VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, + listener); hwaddr iova, end; Int128 llend, llsize; int ret; @@ -741,7 +752,7 @@ static void vfio_listener_region_del(MemoryListener *listener, if (memory_region_is_iommu(section->mr)) { VFIOGuestIOMMU *giommu; - QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) { + QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) { if (MEMORY_REGION(giommu->iommu_mr) == section->mr && giommu->n.start == section->offset_within_region) { memory_region_unregister_iommu_notifier(section->mr, @@ -761,7 +772,8 @@ static void vfio_listener_region_del(MemoryListener *listener, */ } - if (!vfio_get_section_iova_range(container, section, &iova, &end, &llend)) { + if (!vfio_get_section_iova_range(bcontainer, section, &iova, &end, + &llend)) { return; } @@ -772,10 +784,10 @@ static void vfio_listener_region_del(MemoryListener *listener, if (memory_region_is_ram_device(section->mr)) { hwaddr pgmask; - pgmask = (1ULL << ctz64(container->pgsizes)) - 1; + pgmask = (1ULL << ctz64(bcontainer->pgsizes)) - 1; try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask)); } else if (memory_region_has_ram_discard_manager(section->mr)) { - vfio_unregister_ram_discard_listener(container, section); + vfio_unregister_ram_discard_listener(bcontainer, section); /* Unregistering will trigger an unmap. */ try_unmap = false; } @@ -784,27 +796,29 @@ static void vfio_listener_region_del(MemoryListener *listener, if (int128_eq(llsize, int128_2_64())) { /* The unmap ioctl doesn't accept a full 64-bit span. 
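 * For example, when llsize is exactly 2^64 it is split below into two 2^63-byte halves, each removed with its own vfio_container_dma_unmap() call.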
*/ llsize = int128_rshift(llsize, 1); - ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL); + ret = vfio_container_dma_unmap(bcontainer, iova, + int128_get64(llsize), NULL); if (ret) { - error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " + error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx") = %d (%s)", - container, iova, int128_get64(llsize), ret, + bcontainer, iova, int128_get64(llsize), ret, strerror(-ret)); } iova += int128_get64(llsize); } - ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL); + ret = vfio_container_dma_unmap(bcontainer, iova, + int128_get64(llsize), NULL); if (ret) { - error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " + error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx") = %d (%s)", - container, iova, int128_get64(llsize), ret, + bcontainer, iova, int128_get64(llsize), ret, strerror(-ret)); } } memory_region_unref(section->mr); - vfio_container_del_section_window(container, section); + vfio_container_del_section_window(bcontainer, section); } typedef struct VFIODirtyRanges { @@ -817,13 +831,13 @@ typedef struct VFIODirtyRanges { } VFIODirtyRanges; typedef struct VFIODirtyRangesListener { - VFIOContainer *container; + VFIOContainerBase *bcontainer; VFIODirtyRanges ranges; MemoryListener listener; } VFIODirtyRangesListener; static bool vfio_section_is_vfio_pci(MemoryRegionSection *section, - VFIOContainer *container) + VFIOContainerBase *bcontainer) { VFIOPCIDevice *pcidev; VFIODevice *vbasedev; @@ -831,7 +845,7 @@ static bool vfio_section_is_vfio_pci(MemoryRegionSection *section, owner = memory_region_owner(section->mr); - QLIST_FOREACH(vbasedev, &container->device_list, container_next) { + QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) { continue; } @@ -854,7 +868,7 @@ static void vfio_dirty_tracking_update(MemoryListener *listener, hwaddr iova, end, *min, *max; if (!vfio_listener_valid_section(section, "tracking_update") || - !vfio_get_section_iova_range(dirty->container, section, + !vfio_get_section_iova_range(dirty->bcontainer, section, &iova, &end, NULL)) { return; } @@ -878,7 +892,7 @@ static void vfio_dirty_tracking_update(MemoryListener *listener, * The alternative would be an IOVATree but that has a much bigger runtime * overhead and unnecessary complexity. 
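 * For instance (addresses assumed): a vfio-pci region mapped at an IOVA at or above 4 GiB is steered into minpci64/maxpci64 by the check below, while ordinary RAM sections land in the 32-bit or 64-bit bucket (chosen, in the unchanged remainder of this function, by whether the range end fits below 4 GiB), so a high PCI hole does not stretch the plain 64-bit range.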
*/ - if (vfio_section_is_vfio_pci(section, dirty->container) && + if (vfio_section_is_vfio_pci(section, dirty->bcontainer) && iova >= UINT32_MAX) { min = &range->minpci64; max = &range->maxpci64; @@ -902,7 +916,7 @@ static const MemoryListener vfio_dirty_tracking_listener = { .region_add = vfio_dirty_tracking_update, }; -static void vfio_dirty_tracking_init(VFIOContainer *container, +static void vfio_dirty_tracking_init(VFIOContainerBase *bcontainer, VFIODirtyRanges *ranges) { VFIODirtyRangesListener dirty; @@ -912,10 +926,10 @@ static void vfio_dirty_tracking_init(VFIOContainer *container, dirty.ranges.min64 = UINT64_MAX; dirty.ranges.minpci64 = UINT64_MAX; dirty.listener = vfio_dirty_tracking_listener; - dirty.container = container; + dirty.bcontainer = bcontainer; memory_listener_register(&dirty.listener, - container->space->as); + bcontainer->space->as); *ranges = dirty.ranges; @@ -927,7 +941,7 @@ static void vfio_dirty_tracking_init(VFIOContainer *container, memory_listener_unregister(&dirty.listener); } -static void vfio_devices_dma_logging_stop(VFIOContainer *container) +static void vfio_devices_dma_logging_stop(VFIOContainerBase *bcontainer) { uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature), sizeof(uint64_t))] = {}; @@ -938,7 +952,7 @@ static void vfio_devices_dma_logging_stop(VFIOContainer *container) feature->flags = VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP; - QLIST_FOREACH(vbasedev, &container->device_list, container_next) { + QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { if (!vbasedev->dirty_tracking) { continue; } @@ -952,7 +966,7 @@ static void vfio_devices_dma_logging_stop(VFIOContainer *container) } static struct vfio_device_feature * -vfio_device_feature_dma_logging_start_create(VFIOContainer *container, +vfio_device_feature_dma_logging_start_create(VFIOContainerBase *bcontainer, VFIODirtyRanges *tracking) { struct vfio_device_feature *feature; @@ -1025,21 +1039,21 @@ static void vfio_device_feature_dma_logging_start_destroy( g_free(feature); } -static int vfio_devices_dma_logging_start(VFIOContainer *container) +static int vfio_devices_dma_logging_start(VFIOContainerBase *bcontainer) { struct vfio_device_feature *feature; VFIODirtyRanges ranges; VFIODevice *vbasedev; int ret = 0; - vfio_dirty_tracking_init(container, &ranges); - feature = vfio_device_feature_dma_logging_start_create(container, + vfio_dirty_tracking_init(bcontainer, &ranges); + feature = vfio_device_feature_dma_logging_start_create(bcontainer, &ranges); if (!feature) { return -errno; } - QLIST_FOREACH(vbasedev, &container->device_list, container_next) { + QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { if (vbasedev->dirty_tracking) { continue; } @@ -1056,7 +1070,7 @@ static int vfio_devices_dma_logging_start(VFIOContainer *container) out: if (ret) { - vfio_devices_dma_logging_stop(container); + vfio_devices_dma_logging_stop(bcontainer); } vfio_device_feature_dma_logging_start_destroy(feature); @@ -1066,13 +1080,14 @@ out: static void vfio_listener_log_global_start(MemoryListener *listener) { - VFIOContainer *container = container_of(listener, VFIOContainer, listener); + VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, + listener); int ret; - if (vfio_devices_all_device_dirty_tracking(container)) { - ret = vfio_devices_dma_logging_start(container); + if (vfio_devices_all_device_dirty_tracking(bcontainer)) { + ret = vfio_devices_dma_logging_start(bcontainer); } else { - ret = 
vfio_set_dirty_page_tracking(container, true); + ret = vfio_container_set_dirty_page_tracking(bcontainer, true); } if (ret) { @@ -1084,13 +1099,14 @@ static void vfio_listener_log_global_start(MemoryListener *listener) static void vfio_listener_log_global_stop(MemoryListener *listener) { - VFIOContainer *container = container_of(listener, VFIOContainer, listener); + VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, + listener); int ret = 0; - if (vfio_devices_all_device_dirty_tracking(container)) { - vfio_devices_dma_logging_stop(container); + if (vfio_devices_all_device_dirty_tracking(bcontainer)) { + vfio_devices_dma_logging_stop(bcontainer); } else { - ret = vfio_set_dirty_page_tracking(container, false); + ret = vfio_container_set_dirty_page_tracking(bcontainer, false); } if (ret) { @@ -1126,14 +1142,14 @@ static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova, return 0; } -int vfio_devices_query_dirty_bitmap(VFIOContainer *container, +int vfio_devices_query_dirty_bitmap(const VFIOContainerBase *bcontainer, VFIOBitmap *vbmap, hwaddr iova, hwaddr size) { VFIODevice *vbasedev; int ret; - QLIST_FOREACH(vbasedev, &container->device_list, container_next) { + QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { ret = vfio_device_dma_logging_report(vbasedev, iova, size, vbmap->bitmap); if (ret) { @@ -1149,31 +1165,40 @@ int vfio_devices_query_dirty_bitmap(VFIOContainer *container, return 0; } -int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, +int vfio_get_dirty_bitmap(const VFIOContainerBase *bcontainer, uint64_t iova, uint64_t size, ram_addr_t ram_addr) { bool all_device_dirty_tracking = - vfio_devices_all_device_dirty_tracking(container); + vfio_devices_all_device_dirty_tracking(bcontainer); + VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); uint64_t dirty_pages; VFIOBitmap vbmap; + VFIODMARange *qrange; int ret; - if (!container->dirty_pages_supported && !all_device_dirty_tracking) { + if (!bcontainer->dirty_pages_supported && !all_device_dirty_tracking) { cpu_physical_memory_set_dirty_range(ram_addr, size, tcg_enabled() ? 
DIRTY_CLIENTS_ALL : DIRTY_CLIENTS_NOCODE); return 0; } + qrange = vfio_lookup_match_range(container, iova, size); + /* the same as vfio_dma_unmap() */ + assert(qrange); + ret = vfio_bitmap_alloc(&vbmap, size); if (ret) { return ret; } + g_free(vbmap.bitmap); + vbmap.bitmap = qrange->bitmap; if (all_device_dirty_tracking) { - ret = vfio_devices_query_dirty_bitmap(container, &vbmap, iova, size); + ret = vfio_devices_query_dirty_bitmap(bcontainer, &vbmap, iova, size); } else { - ret = vfio_query_dirty_bitmap(container, &vbmap, iova, size); + ret = vfio_container_query_dirty_bitmap(bcontainer, &vbmap, iova, size); } if (ret) { @@ -1183,11 +1208,8 @@ int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, dirty_pages = cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap, ram_addr, vbmap.pages); - trace_vfio_get_dirty_bitmap(container->fd, iova, size, vbmap.size, - ram_addr, dirty_pages); + trace_vfio_get_dirty_bitmap(iova, size, vbmap.size, ram_addr, dirty_pages); out: - g_free(vbmap.bitmap); - return ret; } @@ -1201,7 +1223,7 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) vfio_giommu_dirty_notifier *gdn = container_of(n, vfio_giommu_dirty_notifier, n); VFIOGuestIOMMU *giommu = gdn->giommu; - VFIOContainer *container = giommu->container; + VFIOContainerBase *bcontainer = giommu->bcontainer; hwaddr iova = iotlb->iova + giommu->iommu_offset; ram_addr_t translated_addr; int ret = -EINVAL; @@ -1216,12 +1238,12 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) rcu_read_lock(); if (vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL)) { - ret = vfio_get_dirty_bitmap(container, iova, iotlb->addr_mask + 1, + ret = vfio_get_dirty_bitmap(bcontainer, iova, iotlb->addr_mask + 1, translated_addr); if (ret) { error_report("vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx") = %d (%s)", - container, iova, iotlb->addr_mask + 1, ret, + bcontainer, iova, iotlb->addr_mask + 1, ret, strerror(-ret)); } } @@ -1246,16 +1268,17 @@ static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section, * Sync the whole mapped region (spanning multiple individual mappings) * in one go. 
*/ - return vfio_get_dirty_bitmap(vrdl->container, iova, size, ram_addr); + return vfio_get_dirty_bitmap(vrdl->bcontainer, iova, size, ram_addr); } -static int vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainer *container, - MemoryRegionSection *section) +static int +vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainerBase *bcontainer, + MemoryRegionSection *section) { RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); VFIORamDiscardListener *vrdl = NULL; - QLIST_FOREACH(vrdl, &container->vrdl_list, next) { + QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) { if (vrdl->mr == section->mr && vrdl->offset_within_address_space == section->offset_within_address_space) { @@ -1276,7 +1299,7 @@ &vrdl); } -static int vfio_sync_dirty_bitmap(VFIOContainer *container, +static int vfio_sync_dirty_bitmap(VFIOContainerBase *bcontainer, MemoryRegionSection *section) { ram_addr_t ram_addr; @@ -1284,7 +1307,7 @@ if (memory_region_is_iommu(section->mr)) { VFIOGuestIOMMU *giommu; - QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) { + QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) { if (MEMORY_REGION(giommu->iommu_mr) == section->mr && giommu->n.start == section->offset_within_region) { Int128 llend; @@ -1308,13 +1331,13 @@ } return 0; } else if (memory_region_has_ram_discard_manager(section->mr)) { - return vfio_sync_ram_discard_listener_dirty_bitmap(container, section); + return vfio_sync_ram_discard_listener_dirty_bitmap(bcontainer, section); } ram_addr = memory_region_get_ram_addr(section->mr) + section->offset_within_region; - return vfio_get_dirty_bitmap(container, + return vfio_get_dirty_bitmap(bcontainer, REAL_HOST_PAGE_ALIGN(section->offset_within_address_space), int128_get64(section->size), ram_addr); } @@ -1322,15 +1345,16 @@ static void vfio_listener_log_sync(MemoryListener *listener, MemoryRegionSection *section) { - VFIOContainer *container = container_of(listener, VFIOContainer, listener); + VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, + listener); int ret; if (vfio_listener_skipped_section(section)) { return; } - if (vfio_devices_all_dirty_tracking(container)) { - ret = vfio_sync_dirty_bitmap(container, section); + if (vfio_devices_all_dirty_tracking(bcontainer)) { + ret = vfio_sync_dirty_bitmap(bcontainer, section); if (ret) { error_report("vfio: Failed to sync dirty bitmap, err: %d (%s)", ret, strerror(-ret)); @@ -1339,6 +1363,144 @@ static void vfio_listener_log_sync(MemoryListener *listener, } } +/* + * It is unclear whether the CLEAR_BITMAP ioctl imposes any alignment + * requirement, so mirror the KVM side and align {start, size} to 64 pages. + * + * If no alignment is in fact required, this code could be simplified considerably.
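+ * As a worked example, assuming 4 KiB host pages: VFIO_CLEAR_LOG_ALIGN is 4096 << 6 = 256 KiB, so a clear request at start 0x41000 rounds down to bmap_start 0x40000 with start_delta 0x1000, and size 0x2000 yields bmap_npages = DIV_ROUND_UP(0x3000, 0x40000) << 6 = 64 pages.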
+ */ +#define VFIO_CLEAR_LOG_SHIFT 6 +#define VFIO_CLEAR_LOG_ALIGN (qemu_real_host_page_size() << VFIO_CLEAR_LOG_SHIFT) +#define VFIO_CLEAR_LOG_MASK (-VFIO_CLEAR_LOG_ALIGN) + +static int vfio_log_clear_one_range(VFIOContainer *container, VFIODMARange *qrange, + uint64_t start, uint64_t size) +{ + struct vfio_iommu_type1_dirty_bitmap *dbitmap; + struct vfio_iommu_type1_dirty_bitmap_get *range; + + dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range)); + + dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range); + dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP; + range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data; + + /* + * Now let's deal with the actual bitmap, which is almost the same + * as the kvm side. + */ + uint64_t end, bmap_start, start_delta, bmap_npages; + unsigned long *bmap_clear = NULL, psize = qemu_real_host_page_size(); + int ret; + + bmap_start = start & VFIO_CLEAR_LOG_MASK; + start_delta = start - bmap_start; + bmap_start /= psize; + + bmap_npages = DIV_ROUND_UP(size + start_delta, VFIO_CLEAR_LOG_ALIGN) + << VFIO_CLEAR_LOG_SHIFT; + end = qrange->size / psize; + if (bmap_npages > end - bmap_start) { + bmap_npages = end - bmap_start; + } + start_delta /= psize; + + if (start_delta) { + bmap_clear = bitmap_new(bmap_npages); + bitmap_copy_with_src_offset(bmap_clear, qrange->bitmap, + bmap_start, start_delta + size / psize); + bitmap_clear(bmap_clear, 0, start_delta); + range->bitmap.data = (__u64 *)bmap_clear; + } else { + range->bitmap.data = (__u64 *)(qrange->bitmap + BIT_WORD(bmap_start)); + } + + range->iova = qrange->iova + bmap_start * psize; + range->size = bmap_npages * psize; + range->bitmap.size = ROUND_UP(bmap_npages, sizeof(__u64) * BITS_PER_BYTE) / + BITS_PER_BYTE; + range->bitmap.pgsize = qemu_real_host_page_size(); + + ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap); + if (ret) { + error_report("Failed to clear dirty log for iova: 0x%"PRIx64 + " size: 0x%"PRIx64" err: %d", (uint64_t)range->iova, + (uint64_t)range->size, errno); + goto err_out; + } + + bitmap_clear(qrange->bitmap, bmap_start + start_delta, size / psize); +err_out: + g_free(bmap_clear); + g_free(dbitmap); + return ret; +} + +static int vfio_physical_log_clear(VFIOContainer *container, + MemoryRegionSection *section) +{ + uint64_t start, size, offset, count; + VFIODMARange *qrange; + int ret = 0; + + if (!container->dirty_log_manual_clear) { + /* No need to do explicit clear */ + return ret; + } + + start = section->offset_within_address_space; + size = int128_get64(section->size); + + if (!size) { + return ret; + } + + QLIST_FOREACH(qrange, &container->dma_list, next) { + /* + * Discard ranges that do not overlap the section (e.g., the + * Memory BAR regions of the device) + */ + if (qrange->iova > start + size - 1 || + start > qrange->iova + qrange->size - 1) { + continue; + } + + if (start >= qrange->iova) { + /* The range starts before section or is aligned to it. */ + offset = start - qrange->iova; + count = MIN(qrange->size - offset, size); + } else { + /* The range starts after the section.
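 * For example (values assumed), a section covering [0x1000, 0x4000) against a qrange at iova 0x2000 of size 0x4000 takes this branch: offset = 0 and count = MIN(0x4000, 0x3000 - 0x1000) = 0x2000 bytes.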
*/ + offset = 0; + count = MIN(qrange->size, size - (qrange->iova - start)); + } + ret = vfio_log_clear_one_range(container, qrange, offset, count); + if (ret < 0) { + break; + } + } + + return ret; +} + +static void vfio_listener_log_clear(MemoryListener *listener, + MemoryRegionSection *section) +{ + VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, + listener); + VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); + + if (vfio_listener_skipped_section(section) || + !bcontainer->dirty_pages_supported) { + return; + } + + if (vfio_devices_all_dirty_tracking(bcontainer)) { + vfio_physical_log_clear(container, section); + } +} + const MemoryListener vfio_memory_listener = { .name = "vfio", .region_add = vfio_listener_region_add, @@ -1346,6 +1508,7 @@ const MemoryListener vfio_memory_listener = { .log_global_start = vfio_listener_log_global_start, .log_global_stop = vfio_listener_log_global_stop, .log_sync = vfio_listener_log_sync, + .log_clear = vfio_listener_log_clear, }; void vfio_reset_handler(void *opaque) @@ -1449,10 +1612,13 @@ VFIOAddressSpace *vfio_get_address_space(AddressSpace *as) void vfio_put_address_space(VFIOAddressSpace *space) { - if (QLIST_EMPTY(&space->containers)) { - QLIST_REMOVE(space, list); - g_free(space); + if (!QLIST_EMPTY(&space->containers)) { + return; } + + QLIST_REMOVE(space, list); + g_free(space); + if (QLIST_EMPTY(&vfio_address_spaces)) { qemu_unregister_reset(vfio_reset_handler, NULL); } @@ -1481,3 +1647,53 @@ retry: return info; } + +int vfio_attach_device(char *name, VFIODevice *vbasedev, + AddressSpace *as, Error **errp) +{ + const VFIOIOMMUClass *ops = + VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY)); + HostIOMMUDevice *hiod = NULL; + HostIOMMUDeviceClass *hiod_ops = NULL; + int ret; + + if (vbasedev->iommufd) { + ops = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD)); + } + + assert(ops); + + if (!vbasedev->mdev) { + hiod = HOST_IOMMU_DEVICE(object_new(ops->hiod_typename)); + hiod_ops = HOST_IOMMU_DEVICE_GET_CLASS(hiod); + vbasedev->hiod = hiod; + } + + ret = ops->attach_device(name, vbasedev, as, errp); + if (ret) { + goto err_attach; + } + + if (hiod_ops && hiod_ops->realize_late && + !hiod_ops->realize_late(hiod, vbasedev, errp)) { + ops->detach_device(vbasedev); + ret = -EINVAL; + goto err_attach; + } + + return 0; + +err_attach: + object_unref(hiod); + vbasedev->hiod = NULL; + return ret; +} + +void vfio_detach_device(VFIODevice *vbasedev) +{ + if (!vbasedev->bcontainer) { + return; + } + object_unref(vbasedev->hiod); + vbasedev->bcontainer->ops->detach_device(vbasedev); +} diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c new file mode 100644 index 0000000000000000000000000000000000000000..913ae49077c4f09b7b27517c1231cfbe4befb7fb --- /dev/null +++ b/hw/vfio/container-base.c @@ -0,0 +1,111 @@ +/* + * VFIO BASE CONTAINER + * + * Copyright (C) 2023 Intel Corporation. + * Copyright Red Hat, Inc. 
2023 + * + * Authors: Yi Liu + * Eric Auger + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "hw/vfio/vfio-container-base.h" + +int vfio_container_dma_map(VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + void *vaddr, bool readonly) +{ + g_assert(bcontainer->ops->dma_map); + return bcontainer->ops->dma_map(bcontainer, iova, size, vaddr, readonly); +} + +int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + IOMMUTLBEntry *iotlb) +{ + g_assert(bcontainer->ops->dma_unmap); + return bcontainer->ops->dma_unmap(bcontainer, iova, size, iotlb); +} + +int vfio_container_add_section_window(VFIOContainerBase *bcontainer, + MemoryRegionSection *section, + Error **errp) +{ + if (!bcontainer->ops->add_window) { + return 0; + } + + return bcontainer->ops->add_window(bcontainer, section, errp); +} + +void vfio_container_del_section_window(VFIOContainerBase *bcontainer, + MemoryRegionSection *section) +{ + if (!bcontainer->ops->del_window) { + return; + } + + return bcontainer->ops->del_window(bcontainer, section); +} + +int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer, + bool start) +{ + if (!bcontainer->dirty_pages_supported) { + return 0; + } + + g_assert(bcontainer->ops->set_dirty_page_tracking); + return bcontainer->ops->set_dirty_page_tracking(bcontainer, start); +} + +int vfio_container_query_dirty_bitmap(const VFIOContainerBase *bcontainer, + VFIOBitmap *vbmap, + hwaddr iova, hwaddr size) +{ + g_assert(bcontainer->ops->query_dirty_bitmap); + return bcontainer->ops->query_dirty_bitmap(bcontainer, vbmap, iova, size); +} + +void vfio_container_init(VFIOContainerBase *bcontainer, VFIOAddressSpace *space, + const VFIOIOMMUClass *ops) +{ + bcontainer->ops = ops; + bcontainer->space = space; + bcontainer->error = NULL; + bcontainer->dirty_pages_supported = false; + bcontainer->dma_max_mappings = 0; + bcontainer->iova_ranges = NULL; + QLIST_INIT(&bcontainer->giommu_list); + QLIST_INIT(&bcontainer->vrdl_list); +} + +void vfio_container_destroy(VFIOContainerBase *bcontainer) +{ + VFIOGuestIOMMU *giommu, *tmp; + + QLIST_REMOVE(bcontainer, next); + + QLIST_FOREACH_SAFE(giommu, &bcontainer->giommu_list, giommu_next, tmp) { + memory_region_unregister_iommu_notifier( + MEMORY_REGION(giommu->iommu_mr), &giommu->n); + QLIST_REMOVE(giommu, giommu_next); + g_free(giommu); + } + + g_list_free_full(bcontainer->iova_ranges, g_free); +} + +static const TypeInfo types[] = { + { + .name = TYPE_VFIO_IOMMU, + .parent = TYPE_INTERFACE, + .class_size = sizeof(VFIOIOMMUClass), + }, +}; + +DEFINE_TYPES(types) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 242010036af33faa325a34008af40c2cc67a02ea..539cf34b208544b18deee120f8b16747c8ba1010 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -30,9 +30,12 @@ #include "qemu/error-report.h" #include "qemu/range.h" #include "sysemu/reset.h" +#include "sysemu/kvm.h" #include "trace.h" #include "qapi/error.h" #include "migration/migration.h" +#include "sysemu/kvm.h" +#include "pci.h" VFIOGroupList vfio_group_list = QLIST_HEAD_INITIALIZER(vfio_group_list); @@ -60,10 +63,11 @@ static int vfio_ram_block_discard_disable(VFIOContainer *container, bool state) } } -static int vfio_dma_unmap_bitmap(VFIOContainer *container, +static int vfio_dma_unmap_bitmap(const VFIOContainer *container, hwaddr iova, ram_addr_t size, IOMMUTLBEntry *iotlb) { + const VFIOContainerBase *bcontainer = 
&container->bcontainer; struct vfio_iommu_type1_dma_unmap *unmap; struct vfio_bitmap *bitmap; VFIOBitmap vbmap; @@ -91,7 +95,7 @@ static int vfio_dma_unmap_bitmap(VFIOContainer *container, bitmap->size = vbmap.size; bitmap->data = (__u64 *)vbmap.bitmap; - if (vbmap.size > container->max_dirty_bitmap_size) { + if (vbmap.size > bcontainer->max_dirty_bitmap_size) { error_report("UNMAP: Size of bitmap too big 0x%"PRIx64, vbmap.size); ret = -E2BIG; goto unmap_exit; @@ -112,30 +116,73 @@ unmap_exit: return ret; } +VFIODMARange *vfio_lookup_match_range(const VFIOContainer *container, + hwaddr start_addr, hwaddr size) +{ + VFIODMARange *qrange; + + QLIST_FOREACH(qrange, &container->dma_list, next) { + if (qrange->iova == start_addr && qrange->size == size) { + return qrange; + } + } + return NULL; +} + +void vfio_dma_range_init_dirty_bitmap(VFIODMARange *qrange) +{ + uint64_t pages, size; + + pages = REAL_HOST_PAGE_ALIGN(qrange->size) / qemu_real_host_page_size(); + size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) / BITS_PER_BYTE; + + qrange->bitmap = g_malloc0(size); +} + /* * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86 */ -int vfio_dma_unmap(VFIOContainer *container, hwaddr iova, - ram_addr_t size, IOMMUTLBEntry *iotlb) +static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + IOMMUTLBEntry *iotlb) { + const VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); struct vfio_iommu_type1_dma_unmap unmap = { .argsz = sizeof(unmap), .flags = 0, .iova = iova, .size = size, }; + VFIODMARange *qrange; bool need_dirty_sync = false; int ret; - if (iotlb && vfio_devices_all_running_and_mig_active(container)) { - if (!vfio_devices_all_device_dirty_tracking(container) && - container->dirty_pages_supported) { + if (iotlb && vfio_devices_all_running_and_mig_active(bcontainer)) { + if (!vfio_devices_all_device_dirty_tracking(bcontainer) && + bcontainer->dirty_pages_supported) { return vfio_dma_unmap_bitmap(container, iova, size, iotlb); } need_dirty_sync = true; } + /* + * unregister the DMA range + * + * It seems that the memory layer will give us the same section as the one + * used in region_add(). Otherwise it'll be complicated to manipulate the + * bitmap across region_{add,del}. Is there any guarantee? + * + * But there is really not such a restriction on the kernel interface + * (VFIO_IOMMU_DIRTY_PAGES_FLAG_{UN}MAP_DMA, etc). 
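+ * For example, a mapping created by vfio_legacy_dma_map(bcontainer, 0x100000, 0x200000, vaddr, readonly) (values assumed) must be undone by an unmap with the identical (iova, size) pair: vfio_lookup_match_range() only matches exact bounds, which the assert() below relies on.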
+ */ + qrange = vfio_lookup_match_range(container, iova, size); + assert(qrange); + g_free(qrange->bitmap); + QLIST_REMOVE(qrange, next); + g_free(qrange); + while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) { /* * The type1 backend has an off-by-one bug in the kernel (71a7d3d78e3c @@ -151,8 +198,8 @@ int vfio_dma_unmap(VFIOContainer *container, hwaddr iova, */ if (errno == EINVAL && unmap.size && !(unmap.iova + unmap.size) && container->iommu_type == VFIO_TYPE1v2_IOMMU) { - trace_vfio_dma_unmap_overflow_workaround(); - unmap.size -= 1ULL << ctz64(container->pgsizes); + trace_vfio_legacy_dma_unmap_overflow_workaround(); + unmap.size -= 1ULL << ctz64(bcontainer->pgsizes); continue; } error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno)); @@ -160,7 +207,7 @@ int vfio_dma_unmap(VFIOContainer *container, hwaddr iova, } if (need_dirty_sync) { - ret = vfio_get_dirty_bitmap(container, iova, size, + ret = vfio_get_dirty_bitmap(bcontainer, iova, size, iotlb->translated_addr); if (ret) { return ret; @@ -170,9 +217,11 @@ int vfio_dma_unmap(VFIOContainer *container, hwaddr iova, return 0; } -int vfio_dma_map(VFIOContainer *container, hwaddr iova, - ram_addr_t size, void *vaddr, bool readonly) +static int vfio_legacy_dma_map(const VFIOContainerBase *bcontainer, hwaddr iova, + ram_addr_t size, void *vaddr, bool readonly) { + VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); struct vfio_iommu_type1_dma_map map = { .argsz = sizeof(map), .flags = VFIO_DMA_MAP_FLAG_READ, @@ -180,6 +229,14 @@ int vfio_dma_map(VFIOContainer *container, hwaddr iova, .iova = iova, .size = size, }; + VFIODMARange *qrange; + + qrange = g_malloc0(sizeof(*qrange)); + qrange->iova = iova; + qrange->size = size; + QLIST_INSERT_HEAD(&container->dma_list, qrange, next); + /* XXX allocate the dirty bitmap on demand */ + vfio_dma_range_init_dirty_bitmap(qrange); if (!readonly) { map.flags |= VFIO_DMA_MAP_FLAG_WRITE; @@ -191,7 +248,8 @@ int vfio_dma_map(VFIOContainer *container, hwaddr iova, * the VGA ROM space. 
*/ if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 || - (errno == EBUSY && vfio_dma_unmap(container, iova, size, NULL) == 0 && + (errno == EBUSY && + vfio_legacy_dma_unmap(bcontainer, iova, size, NULL) == 0 && ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) { return 0; } @@ -200,17 +258,17 @@ return -errno; } -int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start) +static int +vfio_legacy_set_dirty_page_tracking(const VFIOContainerBase *bcontainer, + bool start) { + const VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); int ret; struct vfio_iommu_type1_dirty_bitmap dirty = { .argsz = sizeof(dirty), }; - if (!container->dirty_pages_supported) { - return 0; - } - if (start) { dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START; } else { @@ -227,9 +285,12 @@ return ret; } -int vfio_query_dirty_bitmap(VFIOContainer *container, VFIOBitmap *vbmap, - hwaddr iova, hwaddr size) +static int vfio_legacy_query_dirty_bitmap(const VFIOContainerBase *bcontainer, + VFIOBitmap *vbmap, + hwaddr iova, hwaddr size) { + const VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); struct vfio_iommu_type1_dirty_bitmap *dbitmap; struct vfio_iommu_type1_dirty_bitmap_get *range; int ret; @@ -237,7 +298,9 @@ dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range)); dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range); - dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP; + dbitmap->flags = container->dirty_log_manual_clear ? + VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR : + VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP; range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data; range->iova = iova; range->size = size; @@ -296,7 +359,7 @@ bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info, } static bool vfio_get_info_iova_range(struct vfio_iommu_type1_info *info, - VFIOContainer *container) + VFIOContainerBase *bcontainer) { struct vfio_info_cap_header *hdr; struct vfio_iommu_type1_info_cap_iova_range *cap; @@ -314,8 +377,8 @@ range_set_bounds(range, cap->iova_ranges[i].start, cap->iova_ranges[i].end); - container->iova_ranges = - range_list_insert(container->iova_ranges, range); + bcontainer->iova_ranges = + range_list_insert(bcontainer->iova_ranges, range); } return true; @@ -349,6 +412,14 @@ static int vfio_get_iommu_type(VFIOContainer *container, VFIO_SPAPR_TCE_v2_IOMMU, VFIO_SPAPR_TCE_IOMMU }; int i; + if (virtcca_cvm_enabled()) { + if (ioctl(container->fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_S_IOMMU)) { + return VFIO_TYPE1v2_S_IOMMU; + } else { + return -errno; + } + } + for (i = 0; i < ARRAY_SIZE(iommu_types); i++) { if (ioctl(container->fd, VFIO_CHECK_EXTENSION, iommu_types[i])) { return iommu_types[i]; @@ -358,10 +429,35 @@ return -EINVAL; } +/* + * vfio_get_iommu_class - get the VFIOIOMMUClass associated with an IOMMU type + */ +static const VFIOIOMMUClass *vfio_get_iommu_class(int iommu_type, Error **errp) +{ + ObjectClass *klass = NULL; + + switch (iommu_type) { + case VFIO_TYPE1v2_IOMMU: + case VFIO_TYPE1_IOMMU: + case VFIO_TYPE1v2_S_IOMMU: + klass = object_class_by_name(TYPE_VFIO_IOMMU_LEGACY); + break; + case VFIO_SPAPR_TCE_v2_IOMMU: + case VFIO_SPAPR_TCE_IOMMU: + klass =
object_class_by_name(TYPE_VFIO_IOMMU_SPAPR); + break; + default: + g_assert_not_reached(); + }; + + return VFIO_IOMMU_CLASS(klass); +} + static int vfio_init_container(VFIOContainer *container, int group_fd, - Error **errp) + VFIOAddressSpace *space, Error **errp) { - int iommu_type, ret; + int iommu_type, dirty_log_manual_clear, ret; + const VFIOIOMMUClass *vioc; iommu_type = vfio_get_iommu_type(container, errp); if (iommu_type < 0) { @@ -390,6 +486,20 @@ static int vfio_init_container(VFIOContainer *container, int group_fd, } container->iommu_type = iommu_type; + + dirty_log_manual_clear = ioctl(container->fd, VFIO_CHECK_EXTENSION, + VFIO_DIRTY_LOG_MANUAL_CLEAR); + if (dirty_log_manual_clear) { + container->dirty_log_manual_clear = dirty_log_manual_clear; + } + + vioc = vfio_get_iommu_class(iommu_type, errp); + if (!vioc) { + error_setg(errp, "No available IOMMU models"); + return -EINVAL; + } + + vfio_container_init(&container->bcontainer, space, vioc); return 0; } @@ -442,6 +552,7 @@ static void vfio_get_iommu_info_migration(VFIOContainer *container, { struct vfio_info_cap_header *hdr; struct vfio_iommu_type1_info_cap_migration *cap_mig; + VFIOContainerBase *bcontainer = &container->bcontainer; hdr = vfio_get_iommu_info_cap(info, VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION); if (!hdr) { @@ -456,22 +567,72 @@ static void vfio_get_iommu_info_migration(VFIOContainer *container, * qemu_real_host_page_size to mark those dirty. */ if (cap_mig->pgsize_bitmap & qemu_real_host_page_size()) { - container->dirty_pages_supported = true; - container->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size; - container->dirty_pgsizes = cap_mig->pgsize_bitmap; + bcontainer->dirty_pages_supported = true; + bcontainer->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size; + bcontainer->dirty_pgsizes = cap_mig->pgsize_bitmap; } } -static void vfio_free_container(VFIOContainer *container) +static SharedRegionListener *g_shl; + +static void shared_memory_listener_register(MemoryListener *listener, + AddressSpace *as) { - g_list_free_full(container->iova_ranges, g_free); - g_free(container); + SharedRegionListener *shl; + + shl = g_new0(SharedRegionListener, 1); + + shl->listener = listener; + shl->as = as; + + shared_region_register_listener(shl); + g_shl = shl; +} + +static void shared_memory_listener_unregister(void) +{ + SharedRegionListener *shl = g_shl; + + shared_region_unregister_listener(shl); + + g_free(shl); + g_shl = NULL; +} + +static int vfio_legacy_setup(VFIOContainerBase *bcontainer, Error **errp) +{ + VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); + g_autofree struct vfio_iommu_type1_info *info = NULL; + int ret; + + ret = vfio_get_iommu_info(container, &info); + if (ret) { + error_setg_errno(errp, -ret, "Failed to get VFIO IOMMU info"); + return ret; + } + + if (info->flags & VFIO_IOMMU_INFO_PGSIZES) { + bcontainer->pgsizes = info->iova_pgsizes; + } else { + bcontainer->pgsizes = qemu_real_host_page_size(); + } + + if (!vfio_get_info_dma_avail(info, &bcontainer->dma_max_mappings)) { + bcontainer->dma_max_mappings = 65535; + } + + vfio_get_info_iova_range(info, bcontainer); + + vfio_get_iommu_info_migration(container, info); + return 0; } static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, Error **errp) { VFIOContainer *container; + VFIOContainerBase *bcontainer; int ret, fd; VFIOAddressSpace *space; @@ -508,7 +669,8 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, * details once we know which type of IOMMU we are using. 
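 * With this series the selected type is also mapped to a VFIOIOMMUClass by vfio_get_iommu_class() above: the type1 variants (including the new VFIO_TYPE1v2_S_IOMMU) resolve to TYPE_VFIO_IOMMU_LEGACY and the sPAPR TCE types to TYPE_VFIO_IOMMU_SPAPR, after which the container's setup/dma_map/dma_unmap callbacks are dispatched through VFIOContainerBase (see container-base.c).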
*/ - QLIST_FOREACH(container, &space->containers, next) { + QLIST_FOREACH(bcontainer, &space->containers, next) { + container = container_of(bcontainer, VFIOContainer, bcontainer); if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) { ret = vfio_ram_block_discard_disable(container, true); if (ret) { @@ -544,16 +706,11 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, } container = g_malloc0(sizeof(*container)); - container->space = space; container->fd = fd; - container->error = NULL; - container->dirty_pages_supported = false; - container->dma_max_mappings = 0; - container->iova_ranges = NULL; - QLIST_INIT(&container->giommu_list); - QLIST_INIT(&container->vrdl_list); - - ret = vfio_init_container(container, group->fd, errp); + QLIST_INIT(&container->dma_list); + bcontainer = &container->bcontainer; + + ret = vfio_init_container(container, group->fd, space, errp); if (ret) { goto free_container_exit; } @@ -564,82 +721,60 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, goto free_container_exit; } - switch (container->iommu_type) { - case VFIO_TYPE1v2_IOMMU: - case VFIO_TYPE1_IOMMU: - { - struct vfio_iommu_type1_info *info; + assert(bcontainer->ops->setup); - ret = vfio_get_iommu_info(container, &info); - if (ret) { - error_setg_errno(errp, -ret, "Failed to get VFIO IOMMU info"); - goto enable_discards_exit; - } - - if (info->flags & VFIO_IOMMU_INFO_PGSIZES) { - container->pgsizes = info->iova_pgsizes; - } else { - container->pgsizes = qemu_real_host_page_size(); - } - - if (!vfio_get_info_dma_avail(info, &container->dma_max_mappings)) { - container->dma_max_mappings = 65535; - } - - vfio_get_info_iova_range(info, container); - - vfio_get_iommu_info_migration(container, info); - g_free(info); - break; - } - case VFIO_SPAPR_TCE_v2_IOMMU: - case VFIO_SPAPR_TCE_IOMMU: - { - ret = vfio_spapr_container_init(container, errp); - if (ret) { - goto enable_discards_exit; - } - break; - } + ret = bcontainer->ops->setup(bcontainer, errp); + if (ret) { + goto enable_discards_exit; } vfio_kvm_device_add_group(group); QLIST_INIT(&container->group_list); - QLIST_INSERT_HEAD(&space->containers, container, next); + QLIST_INSERT_HEAD(&space->containers, bcontainer, next); group->container = container; QLIST_INSERT_HEAD(&container->group_list, group, container_next); - container->listener = vfio_memory_listener; + if (kvm_csv3_enabled()) { + shared_memory_listener_register(&bcontainer->listener, + bcontainer->space->as); + } - memory_listener_register(&container->listener, container->space->as); + bcontainer->listener = vfio_memory_listener; + memory_listener_register(&bcontainer->listener, bcontainer->space->as); - if (container->error) { + if (bcontainer->error) { ret = -1; - error_propagate_prepend(errp, container->error, + error_propagate_prepend(errp, bcontainer->error, "memory listener initialization failed: "); goto listener_release_exit; } - container->initialized = true; + bcontainer->initialized = true; return 0; listener_release_exit: QLIST_REMOVE(group, container_next); - QLIST_REMOVE(container, next); + QLIST_REMOVE(bcontainer, next); vfio_kvm_device_del_group(group); - memory_listener_unregister(&container->listener); + if (kvm_csv3_enabled()) { + shared_memory_listener_unregister(); + } else { + memory_listener_unregister(&bcontainer->listener); + } if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU || container->iommu_type == VFIO_SPAPR_TCE_IOMMU) { - vfio_spapr_container_deinit(container); + if (bcontainer->ops->release) { + 
bcontainer->ops->release(bcontainer); + } } enable_discards_exit: vfio_ram_block_discard_disable(container, false); free_container_exit: - vfio_free_container(container); + g_free(container); close_fd_exit: close(fd); @@ -653,6 +788,7 @@ put_space_exit: static void vfio_disconnect_container(VFIOGroup *group) { VFIOContainer *container = group->container; + VFIOContainerBase *bcontainer = &container->bcontainer; QLIST_REMOVE(group, container_next); group->container = NULL; @@ -663,10 +799,16 @@ static void vfio_disconnect_container(VFIOGroup *group) * group. */ if (QLIST_EMPTY(&container->group_list)) { - memory_listener_unregister(&container->listener); + if (kvm_csv3_enabled()) { + shared_memory_listener_unregister(); + } else { + memory_listener_unregister(&bcontainer->listener); + } if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU || container->iommu_type == VFIO_SPAPR_TCE_IOMMU) { - vfio_spapr_container_deinit(container); + if (bcontainer->ops->release) { + bcontainer->ops->release(bcontainer); + } } } @@ -676,21 +818,13 @@ } if (QLIST_EMPTY(&container->group_list)) { - VFIOAddressSpace *space = container->space; - VFIOGuestIOMMU *giommu, *tmp; + VFIOAddressSpace *space = bcontainer->space; - QLIST_REMOVE(container, next); - - QLIST_FOREACH_SAFE(giommu, &container->giommu_list, giommu_next, tmp) { - memory_region_unregister_iommu_notifier( - MEMORY_REGION(giommu->iommu_mr), &giommu->n); - QLIST_REMOVE(giommu, giommu_next); - g_free(giommu); - } + vfio_container_destroy(bcontainer); trace_vfio_disconnect_container(container->fd); close(container->fd); - vfio_free_container(container); + g_free(container); vfio_put_address_space(space); } @@ -705,7 +839,7 @@ static VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp) QLIST_FOREACH(group, &vfio_group_list, next) { if (group->groupid == groupid) { /* Found it. Now is it already in the right context? */ - if (group->container->space->as == as) { + if (group->container->bcontainer.space->as == as) { return group; } else { error_setg(errp, "group %d used in multiple address spaces", @@ -799,6 +933,11 @@ static int vfio_get_device(VFIOGroup *group, const char *name, return -1; } + if (!virtcca_cvm_enabled() && (info->flags & VFIO_DEVICE_FLAGS_SECURE)) { + error_setg(errp, "A normal VM cannot use a confidential device"); + return -1; + } + /* * Set discarding of RAM as not broken for this group if the driver knows * the device operates compatibly with discarding. 
Setting must be @@ -877,13 +1016,13 @@ static int vfio_device_groupid(VFIODevice *vbasedev, Error **errp) * @name and @vbasedev->name are likely to be different depending * on the type of the device, hence the need for passing @name */ -int vfio_attach_device(char *name, VFIODevice *vbasedev, - AddressSpace *as, Error **errp) +static int vfio_legacy_attach_device(const char *name, VFIODevice *vbasedev, + AddressSpace *as, Error **errp) { int groupid = vfio_device_groupid(vbasedev, errp); VFIODevice *vbasedev_iter; VFIOGroup *group; - VFIOContainer *container; + VFIOContainerBase *bcontainer; int ret; if (groupid < 0) { @@ -892,6 +1031,10 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev, trace_vfio_attach_device(vbasedev->name, groupid); + if (!vfio_device_hiod_realize(vbasedev, errp)) { + return -EINVAL; + } + group = vfio_get_group(groupid, as, errp); if (!group) { return -ENOENT; } @@ -910,26 +1053,251 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev, return ret; } - container = group->container; - vbasedev->container = container; - QLIST_INSERT_HEAD(&container->device_list, vbasedev, container_next); + bcontainer = &group->container->bcontainer; + vbasedev->bcontainer = bcontainer; + QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next); QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next); return ret; } -void vfio_detach_device(VFIODevice *vbasedev) +static void vfio_legacy_detach_device(VFIODevice *vbasedev) { VFIOGroup *group = vbasedev->group; - if (!vbasedev->container) { - return; - } - QLIST_REMOVE(vbasedev, global_next); QLIST_REMOVE(vbasedev, container_next); - vbasedev->container = NULL; + vbasedev->bcontainer = NULL; trace_vfio_detach_device(vbasedev->name, group->groupid); vfio_put_base_device(vbasedev); vfio_put_group(group); } + +static int vfio_legacy_pci_hot_reset(VFIODevice *vbasedev, bool single) +{ + VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); + VFIOGroup *group; + struct vfio_pci_hot_reset_info *info = NULL; + struct vfio_pci_dependent_device *devices; + struct vfio_pci_hot_reset *reset; + int32_t *fds; + int ret, i, count; + bool multi = false; + + trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi"); + + if (!single) { + vfio_pci_pre_reset(vdev); + } + vdev->vbasedev.needs_reset = false; + + ret = vfio_pci_get_pci_hot_reset_info(vdev, &info); + + if (ret) { + goto out_single; + } + devices = &info->devices[0]; + + trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name); + + /* Verify that we have all the groups required */ + for (i = 0; i < info->count; i++) { + PCIHostDeviceAddress host; + VFIOPCIDevice *tmp; + VFIODevice *vbasedev_iter; + + host.domain = devices[i].segment; + host.bus = devices[i].bus; + host.slot = PCI_SLOT(devices[i].devfn); + host.function = PCI_FUNC(devices[i].devfn); + + trace_vfio_pci_hot_reset_dep_devices(host.domain, + host.bus, host.slot, host.function, devices[i].group_id); + + if (vfio_pci_host_match(&host, vdev->vbasedev.name)) { + continue; + } + + QLIST_FOREACH(group, &vfio_group_list, next) { + if (group->groupid == devices[i].group_id) { + break; + } + } + + if (!group) { + if (!vdev->has_pm_reset) { + error_report("vfio: Cannot reset device %s, " + "depends on group %d which is not owned.", + vdev->vbasedev.name, devices[i].group_id); + } + ret = -EPERM; + goto out; + } + + /* Prep dependent devices for reset and clear our marker. 
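The group and device bookkeeping in these functions is all QLIST-based. QEMU's QLIST_* macros are its variants of the BSD <sys/queue.h> LIST_* macros, so the access pattern can be sketched with the portable equivalents (stand-in struct, not the real VFIOGroup):

#include <sys/queue.h>   /* QEMU's QLIST_* mirror these BSD LIST_* macros */
#include <stdio.h>

struct dev {
    int groupid;
    LIST_ENTRY(dev) next;      /* like QLIST_ENTRY() */
};

int main(void)
{
    LIST_HEAD(, dev) list = LIST_HEAD_INITIALIZER(list);
    struct dev a = { .groupid = 1 }, b = { .groupid = 2 }, *it;

    LIST_INSERT_HEAD(&list, &a, next);
    LIST_INSERT_HEAD(&list, &b, next);   /* head insertion: b now first */

    LIST_FOREACH(it, &list, next) {      /* analogous to QLIST_FOREACH */
        printf("group %d\n", it->groupid);
    }
    return 0;
}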
*/ + QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { + if (!vbasedev_iter->dev->realized || + vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) { + continue; + } + tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev); + if (vfio_pci_host_match(&host, tmp->vbasedev.name)) { + if (single) { + ret = -EINVAL; + goto out_single; + } + vfio_pci_pre_reset(tmp); + tmp->vbasedev.needs_reset = false; + multi = true; + break; + } + } + } + + if (!single && !multi) { + ret = -EINVAL; + goto out_single; + } + + /* Determine how many group fds need to be passed */ + count = 0; + QLIST_FOREACH(group, &vfio_group_list, next) { + for (i = 0; i < info->count; i++) { + if (group->groupid == devices[i].group_id) { + count++; + break; + } + } + } + + reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds))); + reset->argsz = sizeof(*reset) + (count * sizeof(*fds)); + fds = &reset->group_fds[0]; + + /* Fill in group fds */ + QLIST_FOREACH(group, &vfio_group_list, next) { + for (i = 0; i < info->count; i++) { + if (group->groupid == devices[i].group_id) { + fds[reset->count++] = group->fd; + break; + } + } + } + + /* Bus reset! */ + ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset); + g_free(reset); + if (ret) { + ret = -errno; + } + + trace_vfio_pci_hot_reset_result(vdev->vbasedev.name, + ret ? strerror(errno) : "Success"); + +out: + /* Re-enable INTx on affected devices */ + for (i = 0; i < info->count; i++) { + PCIHostDeviceAddress host; + VFIOPCIDevice *tmp; + VFIODevice *vbasedev_iter; + + host.domain = devices[i].segment; + host.bus = devices[i].bus; + host.slot = PCI_SLOT(devices[i].devfn); + host.function = PCI_FUNC(devices[i].devfn); + + if (vfio_pci_host_match(&host, vdev->vbasedev.name)) { + continue; + } + + QLIST_FOREACH(group, &vfio_group_list, next) { + if (group->groupid == devices[i].group_id) { + break; + } + } + + if (!group) { + break; + } + + QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { + if (!vbasedev_iter->dev->realized || + vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) { + continue; + } + tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev); + if (vfio_pci_host_match(&host, tmp->vbasedev.name)) { + vfio_pci_post_reset(tmp); + break; + } + } + } +out_single: + if (!single) { + vfio_pci_post_reset(vdev); + } + g_free(info); + + return ret; +} + +static void vfio_iommu_legacy_class_init(ObjectClass *klass, void *data) +{ + VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass); + + vioc->hiod_typename = TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO; + + vioc->setup = vfio_legacy_setup; + vioc->dma_map = vfio_legacy_dma_map; + vioc->dma_unmap = vfio_legacy_dma_unmap; + vioc->attach_device = vfio_legacy_attach_device; + vioc->detach_device = vfio_legacy_detach_device; + vioc->set_dirty_page_tracking = vfio_legacy_set_dirty_page_tracking; + vioc->query_dirty_bitmap = vfio_legacy_query_dirty_bitmap; + vioc->pci_hot_reset = vfio_legacy_pci_hot_reset; +}; + +static bool hiod_legacy_vfio_realize(HostIOMMUDevice *hiod, void *opaque, + Error **errp) +{ + VFIODevice *vdev = opaque; + + hiod->name = g_strdup(vdev->name); + hiod->agent = opaque; + + return true; +} + +static int hiod_legacy_vfio_get_cap(HostIOMMUDevice *hiod, int cap, + Error **errp) +{ + switch (cap) { + case HOST_IOMMU_DEVICE_CAP_AW_BITS: + return vfio_device_get_aw_bits(hiod->agent); + default: + error_setg(errp, "%s: unsupported capability %x", hiod->name, cap); + return -EINVAL; + } +} + +static void hiod_legacy_vfio_class_init(ObjectClass *oc, void *data) +{ + HostIOMMUDeviceClass *hioc = 
HOST_IOMMU_DEVICE_CLASS(oc); + + hioc->realize = hiod_legacy_vfio_realize; + hioc->get_cap = hiod_legacy_vfio_get_cap; +}; + +static const TypeInfo types[] = { + { + .name = TYPE_VFIO_IOMMU_LEGACY, + .parent = TYPE_VFIO_IOMMU, + .class_init = vfio_iommu_legacy_class_init, + }, { + .name = TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO, + .parent = TYPE_HOST_IOMMU_DEVICE, + .class_init = hiod_legacy_vfio_class_init, + } +}; + +DEFINE_TYPES(types) diff --git a/hw/vfio/hct.c b/hw/vfio/hct.c new file mode 100644 index 0000000000000000000000000000000000000000..7fd397718201629cd054f9c03682ca6b49e4c598 --- /dev/null +++ b/hw/vfio/hct.c @@ -0,0 +1,551 @@ +/* + * VFIO-based mediated CCP (HCT) assignment support + * + * Copyright 2023 HYGON Corp. + * + * This work is licensed under the terms of the GNU GPL, version 2 or (at + * your option) any later version. See the COPYING file in the top-level + * directory. + */ + +#include <linux/vfio.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <fcntl.h> +#include <unistd.h> + +#include "qemu/osdep.h" +#include "qemu/queue.h" +#include "qemu/main-loop.h" +#include "qemu/log.h" +#include "trace.h" +#include "hw/pci/pci.h" +#include "hw/vfio/pci.h" +#include "qemu/range.h" +#include "sysemu/kvm.h" +#include "hw/pci/msi.h" +#include "qemu/error-report.h" +#include "qapi/error.h" +#include "hw/qdev-properties.h" + +#define MAX_CCP_CNT 48 +#define PAGE_SIZE 4096 +#define HCT_SHARED_MEMORY_SIZE (PAGE_SIZE * MAX_CCP_CNT) +#define CCP_INDEX_BYTES 4 +#define PATH_MAX 4096 +#define TYPE_HCT_DEV "hct" +#define PCI_HCT_DEV(obj) OBJECT_CHECK(HCTDevState, (obj), TYPE_HCT_DEV) +#define HCT_MMIO_SIZE (1 << 20) +#define HCT_MAX_PASID (1 << 8) + +#define PCI_VENDOR_ID_HYGON_CCP 0x1d94 +#define PCI_DEVICE_ID_HYGON_CCP 0x1468 + +#define HCT_SHARE_DEV "/dev/hct_share" + +#define HCT_VERSION_STRING "0.5" +#define DEF_VERSION_STRING "0.1" +#define VERSION_SIZE 16 + +#define HCT_SHARE_IOC_TYPE 'C' +#define HCT_SHARE_OP_TYPE 0x01 +#define HCT_SHARE_OP _IOWR(HCT_SHARE_IOC_TYPE, \ + HCT_SHARE_OP_TYPE, \ + struct hct_dev_ctrl) +#define HCT_SHARE_OP_DMA_MAP 0x01 +#define HCT_SHARE_OP_GET_ID 0x03 +#define HCT_SHARE_OP_GET_PASID 0x04 +#define HCT_SHARE_OP_DMA_UNMAP 0x05 +#define HCT_SHARE_OP_GET_VERSION 0x06 + +/* BARS */ +#define HCT_REG_BAR_IDX 2 +#define HCT_SHARED_BAR_IDX 3 +#define HCT_PASID_BAR_IDX 4 + +#define PASID_OFFSET 40 + +static volatile struct hct_data { + int init; + int hct_fd; + unsigned long pasid; + uint8_t *pasid_memory; + uint8_t *hct_shared_memory; + uint8_t ccp_index[MAX_CCP_CNT]; + uint8_t ccp_cnt; +} hct_data; + +typedef struct SharedDevice { + PCIDevice dev; + int shared_memory_offset; +} SharedDevice; + +typedef struct HctDevState { + SharedDevice sdev; + VFIODevice vdev; + MemoryRegion mmio; + MemoryRegion shared; + MemoryRegion pasid; + void *maps[PCI_NUM_REGIONS]; +} HCTDevState; + +struct hct_dev_ctrl { + unsigned char op; + unsigned char rsvd[3]; + union { + unsigned char version[VERSION_SIZE]; + struct { + unsigned long vaddr; + unsigned long iova; + unsigned long size; + }; + unsigned int id; + }; +}; + +static int pasid_get_and_init(HCTDevState *state) +{ + struct hct_dev_ctrl ctrl; + int ret; + + ctrl.op = HCT_SHARE_OP_GET_PASID; + ctrl.id = -1; + ret = ioctl(hct_data.hct_fd, HCT_SHARE_OP, &ctrl); + if (ret < 0) { + ret = -errno; + error_report("GET_PASID failed: %d", -errno); + goto out; + } + + *hct_data.pasid_memory = ctrl.id; + hct_data.pasid = ctrl.id; + +out: + return ret; +} + +static const MemoryRegionOps hct_mmio_ops = { + .endianness = DEVICE_NATIVE_ENDIAN, + .valid = + { + .min_access_size = 4, + 
.max_access_size = 4, + }, +}; + +static void vfio_hct_detach_device(HCTDevState *state) +{ + vfio_detach_device(&state->vdev); + + g_free(state->vdev.name); + state->vdev.name = NULL; +} + +static void vfio_hct_exit(PCIDevice *dev) +{ + HCTDevState *state = PCI_HCT_DEV(dev); + + vfio_hct_detach_device(state); + + if (hct_data.hct_fd) { + qemu_close(hct_data.hct_fd); + hct_data.hct_fd = 0; + } +} + +static Property vfio_hct_properties[] = { + DEFINE_PROP_STRING("sysfsdev", HCTDevState, vdev.sysfsdev), + DEFINE_PROP_END_OF_LIST(), +}; + +static void vfio_ccp_compute_needs_reset(VFIODevice *vdev) +{ + vdev->needs_reset = false; +} + +struct VFIODeviceOps vfio_ccp_ops = { + .vfio_compute_needs_reset = vfio_ccp_compute_needs_reset, +}; + +/* create BAR2, BAR3 and BAR4 space for the virtual machine. */ +static int vfio_hct_region_mmap(HCTDevState *state) +{ + int ret; + int i; + struct vfio_region_info *info; + + for (i = 0; i < PCI_ROM_SLOT; i++) { + ret = vfio_get_region_info(&state->vdev, i, &info); + if (ret) + goto out; + + if (info->size) { + state->maps[i] = mmap(NULL, info->size, PROT_READ | PROT_WRITE, + MAP_SHARED, state->vdev.fd, info->offset); + if (state->maps[i] == MAP_FAILED) { + ret = -errno; + g_free(info); + error_report("vfio mmap failed"); + goto out; + } + } + g_free(info); + } + + memory_region_init_io(&state->mmio, OBJECT(state), &hct_mmio_ops, state, + "hct mmio", HCT_MMIO_SIZE); + memory_region_init_ram_device_ptr(&state->mmio, OBJECT(state), "hct mmio", + HCT_MMIO_SIZE, + state->maps[HCT_REG_BAR_IDX]); + + memory_region_init_io(&state->shared, OBJECT(state), &hct_mmio_ops, state, + "hct shared memory", PAGE_SIZE); + memory_region_init_ram_device_ptr( + &state->shared, OBJECT(state), "hct shared memory", PAGE_SIZE, + (void *)hct_data.hct_shared_memory + + state->sdev.shared_memory_offset * PAGE_SIZE); + + memory_region_init_io(&state->pasid, OBJECT(state), &hct_mmio_ops, state, + "hct pasid", PAGE_SIZE); + memory_region_init_ram_device_ptr(&state->pasid, OBJECT(state), "hct pasid", + PAGE_SIZE, hct_data.pasid_memory); + + pci_register_bar(&state->sdev.dev, HCT_REG_BAR_IDX, + PCI_BASE_ADDRESS_SPACE_MEMORY, &state->mmio); + pci_register_bar(&state->sdev.dev, HCT_SHARED_BAR_IDX, + PCI_BASE_ADDRESS_SPACE_MEMORY, &state->shared); + pci_register_bar(&state->sdev.dev, HCT_PASID_BAR_IDX, + PCI_BASE_ADDRESS_SPACE_MEMORY, &state->pasid); +out: + return ret; +} + +static int hct_check_duplicated_index(int index) +{ + int cnt; + for (cnt = 0; cnt < hct_data.ccp_cnt; cnt++) { + if (hct_data.ccp_index[cnt] == index) { + error_report("multiple mdevs must not be mapped to the same CCP " + "in one virtual machine"); + return -1; + } + } + + hct_data.ccp_index[hct_data.ccp_cnt++] = index; + return 0; +} + +static int hct_get_ccp_index(HCTDevState *state) +{ + char path[PATH_MAX] = {0}; + char buf[CCP_INDEX_BYTES] = {0}; + int fd; + int ret; + int ccp_index; + + snprintf(path, PATH_MAX, "%s/vendor/id", state->vdev.sysfsdev); + fd = qemu_open_old(path, O_RDONLY); + if (fd < 0) { + error_report("failed to open %s", path); + return -errno; + } + + ret = read(fd, buf, sizeof(buf)); + if (ret < 0) { + ret = -errno; + error_report("failed to read %s", path); + goto out; + } + + if (sscanf(buf, "%d", &ccp_index) != 1) { + ret = -EINVAL; + error_report("failed to parse CCP index from \"%s\"", buf); + goto out; + } + + if (!hct_check_duplicated_index(ccp_index)) { + state->sdev.shared_memory_offset = ccp_index; + } else { + ret = -1; + } + +out: + qemu_close(fd); + return ret; +} + +static int hct_api_version_check(void) +{ + struct hct_dev_ctrl ctrl; + int ret; + + ctrl.op = HCT_SHARE_OP_GET_VERSION; + memcpy(ctrl.version, DEF_VERSION_STRING, sizeof(DEF_VERSION_STRING)); + ret = ioctl(hct_data.hct_fd, HCT_SHARE_OP, &ctrl); + if (ret < 0) { + error_report("ret %d, errno %d: failed to get the hct.ko version", ret, + errno); + return -1; + } else if (memcmp(ctrl.version, HCT_VERSION_STRING, + sizeof(HCT_VERSION_STRING)) < 0) { + error_report("the hct.ko version is %s; please upgrade to version %s " + "or higher", + ctrl.version, HCT_VERSION_STRING); + return -1; + } + + return 0; +} + +static int hct_shared_memory_init(void) +{ + int ret = 0; + + hct_data.hct_shared_memory = + mmap(NULL, HCT_SHARED_MEMORY_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, + hct_data.hct_fd, 0); + if (hct_data.hct_shared_memory == MAP_FAILED) { + ret = -errno; + error_report("failed to map the hct shared memory"); + goto out; + } + +out: + return ret; +} + +static void hct_listener_region_add(MemoryListener *listener, + MemoryRegionSection *section) +{ + struct hct_dev_ctrl ctrl; + hwaddr iova; + Int128 llend, llsize; + void *vaddr; + int ret; + + iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space); + llend = int128_make64(section->offset_within_address_space); + llend = int128_add(llend, section->size); + llend = int128_add(llend, int128_exts64(qemu_real_host_page_mask())); + + if (int128_ge(int128_make64(iova), llend)) { + return; + } + + if (!section->mr->ram) { + return; + } + + vaddr = memory_region_get_ram_ptr(section->mr) + + section->offset_within_region + + (iova - section->offset_within_address_space); + llsize = int128_sub(llend, int128_make64(iova)); + + ctrl.op = HCT_SHARE_OP_DMA_MAP; + ctrl.iova = iova | (hct_data.pasid << PASID_OFFSET); + ctrl.vaddr = (uint64_t)vaddr; + ctrl.size = int128_get64(llsize); + ret = ioctl(hct_data.hct_fd, HCT_SHARE_OP, &ctrl); + if (ret < 0) + error_report("VFIO_MAP_DMA: %d, iova=0x%" HWADDR_PRIx, -errno, iova); +} + +static void hct_listener_region_del(MemoryListener *listener, + MemoryRegionSection *section) +{ + struct hct_dev_ctrl ctrl; + hwaddr iova; + Int128 llend, llsize; + int ret; + + iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space); + llend = int128_make64(section->offset_within_address_space); + llend = int128_add(llend, section->size); + llend = int128_add(llend, int128_exts64(qemu_real_host_page_mask())); + + if (int128_ge(int128_make64(iova), llend)) { + return; + } + + if (!section->mr->ram) { + return; + } + + llsize = int128_sub(llend, int128_make64(iova)); + + ctrl.op = HCT_SHARE_OP_DMA_UNMAP; + ctrl.iova = iova | (hct_data.pasid << PASID_OFFSET); + ctrl.size = int128_get64(llsize); + ret = ioctl(hct_data.hct_fd, HCT_SHARE_OP, &ctrl); + if (ret < 0) + error_report("VFIO_UNMAP_DMA: %d", -errno); +} + +static MemoryListener hct_memory_listener = { + .region_add = hct_listener_region_add, + .region_del = hct_listener_region_del, +}; + +static void hct_data_uninit(HCTDevState *state) +{ + if (hct_data.hct_fd) { + qemu_close(hct_data.hct_fd); + hct_data.hct_fd = 0; + } + + if (hct_data.pasid) { + hct_data.pasid = 0; + } + + if (hct_data.pasid_memory) { + munmap(hct_data.pasid_memory, PAGE_SIZE); + hct_data.pasid_memory = NULL; + } + + if (hct_data.hct_shared_memory) { + munmap((void *)hct_data.hct_shared_memory, HCT_SHARED_MEMORY_SIZE); + hct_data.hct_shared_memory = NULL; + } + + memory_listener_unregister(&hct_memory_listener); +} + +static int hct_data_init(HCTDevState *state) +{ + int ret; + + if (hct_data.init == 0) { + hct_data.hct_fd = qemu_open_old(HCT_SHARE_DEV, O_RDWR); + if (hct_data.hct_fd < 0) { + ret = -errno; + error_report("failed to open %s, errno %d", HCT_SHARE_DEV, -ret); + goto out; + } + + /* The hct.ko version must not be lower than HCT_VERSION_STRING. */ + ret = hct_api_version_check(); + if (ret) + goto out; + + /* assign a page to the virtual BAR3 of each CCP. */ + ret = hct_shared_memory_init(); + if (ret) + goto out; + + hct_data.pasid_memory = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (hct_data.pasid_memory == MAP_FAILED) { + ret = -errno; + goto unmap_shared_memory_exit; + } + + /* assign a unique pasid to each virtual machine. */ + ret = pasid_get_and_init(state); + if (ret < 0) + goto unmap_pasid_memory_exit; + + /* perform DMA_MAP and DMA_UNMAP operations on all memories of the + * virtual machine. */ + memory_listener_register(&hct_memory_listener, &address_space_memory); + + hct_data.init = 1; + } + + return hct_get_ccp_index(state); + +unmap_pasid_memory_exit: + munmap(hct_data.pasid_memory, PAGE_SIZE); + +unmap_shared_memory_exit: + munmap((void *)hct_data.hct_shared_memory, HCT_SHARED_MEMORY_SIZE); + +out: + return ret; +} + +/* When device is loaded */ +static void vfio_hct_realize(PCIDevice *pci_dev, Error **errp) +{ + int ret; + char *mdevid; + HCTDevState *state = PCI_HCT_DEV(pci_dev); + + /* derive the mdev device name from the sysfsdev path */ + mdevid = g_path_get_basename(state->vdev.sysfsdev); + state->vdev.name = mdevid; + + ret = hct_data_init(state); + if (ret < 0) { + error_setg(errp, "hct data init failed"); + g_free(state->vdev.name); + state->vdev.name = NULL; + goto out; + } + + ret = vfio_attach_device(state->vdev.name, &state->vdev, + pci_device_iommu_address_space(pci_dev), errp); + + if (ret) { + /* errp has already been set by vfio_attach_device() */ + g_free(state->vdev.name); + state->vdev.name = NULL; + goto data_uninit_out; + } + + state->vdev.ops = &vfio_ccp_ops; + state->vdev.dev = &state->sdev.dev.qdev; + + ret = vfio_hct_region_mmap(state); + if (ret < 0) { + error_setg(errp, "region mmap failed, name = %s", state->vdev.name); + g_free(state->vdev.name); + state->vdev.name = NULL; + goto detach_device_out; + } + + return; + +detach_device_out: + vfio_hct_detach_device(state); + +data_uninit_out: + hct_data_uninit(state); + +out: + return; +} + +static void hct_dev_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass); + + dc->desc = "HCT Device"; + device_class_set_props(dc, vfio_hct_properties); + + pdc->realize = vfio_hct_realize; + pdc->exit = vfio_hct_exit; + pdc->vendor_id = PCI_VENDOR_ID_HYGON_CCP; + pdc->device_id = PCI_DEVICE_ID_HYGON_CCP; + pdc->class_id = PCI_CLASS_CRYPT_OTHER; + set_bit(DEVICE_CATEGORY_MISC, dc->categories); + + return; +} + +static const TypeInfo pci_hct_info = { + .name = TYPE_HCT_DEV, + .parent = TYPE_PCI_DEVICE, + .instance_size = sizeof(HCTDevState), + .class_init = hct_dev_class_init, + .interfaces = + (InterfaceInfo[]){ + {INTERFACE_CONVENTIONAL_PCI_DEVICE}, + {}, + }, +}; + +static void hct_register_types(void) +{ + type_register_static(&pci_hct_info); +} + +type_init(hct_register_types); diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c index 168847e7c51ef35afbea276745c1aa7e6cd94ce0..1f3bfed917a966a41f78422c8f03c3ace2dc9e3e 100644 --- a/hw/vfio/helpers.c +++ b/hw/vfio/helpers.c @@ -27,6 +27,7 @@ #include "trace.h" #include "qapi/error.h" #include "qemu/error-report.h" +#include "monitor/monitor.h" /* * Common VFIO interrupt disable */ @@ -609,3 
+610,98 @@ bool vfio_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type) return ret; } + +int vfio_device_get_name(VFIODevice *vbasedev, Error **errp) +{ + struct stat st; + + if (vbasedev->fd < 0) { + if (stat(vbasedev->sysfsdev, &st) < 0) { + error_setg_errno(errp, errno, "no such host device"); + error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->sysfsdev); + return -errno; + } + /* User may specify a name, e.g: VFIO platform device */ + if (!vbasedev->name) { + vbasedev->name = g_path_get_basename(vbasedev->sysfsdev); + } + } else { + if (!vbasedev->iommufd) { + error_setg(errp, "Use FD passing only with iommufd backend"); + return -EINVAL; + } + /* + * Give a name with fd so any function printing out vbasedev->name + * will not break. + */ + if (!vbasedev->name) { + vbasedev->name = g_strdup_printf("VFIO_FD%d", vbasedev->fd); + } + } + + return 0; +} + +void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp) +{ + int fd = monitor_fd_param(monitor_cur(), str, errp); + + if (fd < 0) { + error_prepend(errp, "Could not parse remote object fd %s:", str); + return; + } + vbasedev->fd = fd; +} + +void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops, + DeviceState *dev, bool ram_discard) +{ + vbasedev->type = type; + vbasedev->ops = ops; + vbasedev->dev = dev; + vbasedev->fd = -1; + + vbasedev->ram_block_discard_allowed = ram_discard; +} + +int vfio_device_get_aw_bits(VFIODevice *vdev) +{ + /* + * iova_ranges is a sorted list. For old kernels that support + * VFIO but not support query of iova ranges, iova_ranges is NULL, + * in this case HOST_IOMMU_DEVICE_CAP_AW_BITS_MAX(64) is returned. + */ + GList *l = g_list_last(vdev->bcontainer->iova_ranges); + + if (l) { + Range *range = l->data; + return range_get_last_bit(range) + 1; + } + + return HOST_IOMMU_DEVICE_CAP_AW_BITS_MAX; +} + +bool vfio_device_is_mdev(VFIODevice *vbasedev) +{ + g_autofree char *subsys = NULL; + g_autofree char *tmp = NULL; + + if (!vbasedev->sysfsdev) { + return false; + } + + tmp = g_strdup_printf("%s/subsystem", vbasedev->sysfsdev); + subsys = realpath(tmp, NULL); + return subsys && (strcmp(subsys, "/sys/bus/mdev") == 0); +} + +bool vfio_device_hiod_realize(VFIODevice *vbasedev, Error **errp) +{ + HostIOMMUDevice *hiod = vbasedev->hiod; + + if (!hiod) { + return true; + } + + return HOST_IOMMU_DEVICE_GET_CLASS(hiod)->realize(hiod, vbasedev, errp); +} diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c new file mode 100644 index 0000000000000000000000000000000000000000..a108beda29e248b4188ba2c74db915f54865e854 --- /dev/null +++ b/hw/vfio/iommufd.c @@ -0,0 +1,916 @@ +/* + * iommufd container backend + * + * Copyright (C) 2023 Intel Corporation. + * Copyright Red Hat, Inc. 
2023 + * + * Authors: Yi Liu <yi.l.liu@intel.com> + * Eric Auger <eric.auger@redhat.com> + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include <sys/ioctl.h> +#include <linux/vfio.h> +#include <linux/iommufd.h> + +#include "hw/vfio/vfio-common.h" +#include "qemu/error-report.h" +#include "trace.h" +#include "qapi/error.h" +#include "sysemu/iommufd.h" +#include "hw/qdev-core.h" +#include "sysemu/reset.h" +#include "qemu/cutils.h" +#include "qemu/chardev_open.h" +#include "pci.h" +#include "exec/ram_addr.h" + +static int iommufd_cdev_map(const VFIOContainerBase *bcontainer, hwaddr iova, + ram_addr_t size, void *vaddr, bool readonly) +{ + const VFIOIOMMUFDContainer *container = + container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); + + return iommufd_backend_map_dma(container->be, + container->ioas_id, + iova, size, vaddr, readonly); +} + +static int iommufd_cdev_unmap(const VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + IOMMUTLBEntry *iotlb) +{ + const VFIOIOMMUFDContainer *container = + container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); + + /* TODO: Handle dma_unmap_bitmap with iotlb args (migration) */ + return iommufd_backend_unmap_dma(container->be, + container->ioas_id, iova, size); + } + +static int iommufd_cdev_kvm_device_add(VFIODevice *vbasedev, Error **errp) +{ + return vfio_kvm_device_add_fd(vbasedev->fd, errp); +} + +static void iommufd_cdev_kvm_device_del(VFIODevice *vbasedev) +{ + Error *err = NULL; + + if (vfio_kvm_device_del_fd(vbasedev->fd, &err)) { + error_report_err(err); + } +} + +static int iommufd_cdev_connect_and_bind(VFIODevice *vbasedev, Error **errp) +{ + IOMMUFDBackend *iommufd = vbasedev->iommufd; + struct vfio_device_bind_iommufd bind = { + .argsz = sizeof(bind), + .flags = 0, + }; + int ret; + + ret = iommufd_backend_connect(iommufd, errp); + if (ret) { + return ret; + } + + /* + * Add device to kvm-vfio to be prepared for the tracking + * in KVM. Especially for some emulated devices, it requires + * to have kvm information in the device open. 
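One convention worth calling out here: every VFIO and iommufd ioctl argument struct begins with an argsz field that userspace fills with sizeof(), as the vfio_device_bind_iommufd initializer above does, which is how the ABI stays extensible across kernel versions. A stand-in illustration of the check (fake struct and handler, not a real kernel ABI):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Stand-in for a VFIO/iommufd ioctl payload. */
struct fake_bind {
    uint32_t argsz;      /* userspace: sizeof(); kernel: minimum-size check */
    uint32_t flags;
    int32_t  iommufd;
    uint32_t out_devid;  /* written back by the "kernel" */
};

static int fake_kernel_handler(struct fake_bind *arg)
{
    if (arg->argsz < sizeof(*arg)) {
        return -1;       /* struct too short for this kernel: reject */
    }
    arg->out_devid = 42;
    return 0;
}

int main(void)
{
    struct fake_bind bind;

    memset(&bind, 0, sizeof(bind));
    bind.argsz = sizeof(bind);
    bind.iommufd = 3;    /* fd from /dev/iommu, illustrative */

    if (fake_kernel_handler(&bind) == 0) {
        printf("bound, devid=%u\n", bind.out_devid);
    }
    return 0;
}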
+ */ + ret = iommufd_cdev_kvm_device_add(vbasedev, errp); + if (ret) { + goto err_kvm_device_add; + } + + /* Bind device to iommufd */ + bind.iommufd = iommufd->fd; + ret = ioctl(vbasedev->fd, VFIO_DEVICE_BIND_IOMMUFD, &bind); + if (ret) { + error_setg_errno(errp, errno, "error bind device fd=%d to iommufd=%d", + vbasedev->fd, bind.iommufd); + goto err_bind; + } + + vbasedev->devid = bind.out_devid; + trace_iommufd_cdev_connect_and_bind(bind.iommufd, vbasedev->name, + vbasedev->fd, vbasedev->devid); + return ret; +err_bind: + iommufd_cdev_kvm_device_del(vbasedev); +err_kvm_device_add: + iommufd_backend_disconnect(iommufd); + return ret; +} + +static void iommufd_cdev_unbind_and_disconnect(VFIODevice *vbasedev) +{ + /* Unbind is automatically conducted when device fd is closed */ + iommufd_cdev_kvm_device_del(vbasedev); + iommufd_backend_disconnect(vbasedev->iommufd); +} + +static bool iommufd_hwpt_dirty_tracking(VFIOIOASHwpt *hwpt) +{ + return hwpt && hwpt->hwpt_flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; +} + +static int iommufd_set_dirty_page_tracking(const VFIOContainerBase *bcontainer, + bool start) +{ + const VFIOIOMMUFDContainer *container = + container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); + VFIOIOASHwpt *hwpt; + + QLIST_FOREACH(hwpt, &container->hwpt_list, next) { + if (!iommufd_hwpt_dirty_tracking(hwpt)) { + continue; + } + + if (!iommufd_backend_set_dirty_tracking(container->be, + hwpt->hwpt_id, start, NULL)) { + error_report("Failed to set dirty tracking hwpt_id %u errno: %d", + hwpt->hwpt_id, errno); + goto err; + } + } + + return 0; + +err: + QLIST_FOREACH(hwpt, &container->hwpt_list, next) { + if (!iommufd_hwpt_dirty_tracking(hwpt)) { + continue; + } + iommufd_backend_set_dirty_tracking(container->be, + hwpt->hwpt_id, !start, NULL); + } + return -EINVAL; +} + +static int iommufd_query_dirty_bitmap(const VFIOContainerBase *bcontainer, + VFIOBitmap *vbmap, hwaddr iova, + hwaddr size) +{ + VFIOIOMMUFDContainer *container = container_of(bcontainer, + VFIOIOMMUFDContainer, + bcontainer); + unsigned long page_size = qemu_real_host_page_size(); + VFIOIOASHwpt *hwpt; + + QLIST_FOREACH(hwpt, &container->hwpt_list, next) { + if (!iommufd_hwpt_dirty_tracking(hwpt)) { + continue; + } + + if (!iommufd_backend_get_dirty_bitmap(container->be, hwpt->hwpt_id, + iova, size, page_size, + (uint64_t *)vbmap->bitmap, + NULL)) { + error_report("Failed to get dirty bitmap for iova: 0x%"PRIx64 + " size: 0x%"PRIx64" err: %d", (uint64_t)iova, + (uint64_t)size, errno); + + return -EINVAL; + } + } + + return 0; +} + +static int iommufd_cdev_getfd(const char *sysfs_path, Error **errp) +{ + long int ret = -ENOTTY; + char *path, *vfio_dev_path = NULL, *vfio_path = NULL; + DIR *dir = NULL; + struct dirent *dent; + gchar *contents; + struct stat st; + gsize length; + int major, minor; + dev_t vfio_devt; + + path = g_strdup_printf("%s/vfio-dev", sysfs_path); + if (stat(path, &st) < 0) { + error_setg_errno(errp, errno, "no such host device"); + goto out_free_path; + } + + dir = opendir(path); + if (!dir) { + error_setg_errno(errp, errno, "couldn't open directory %s", path); + goto out_free_path; + } + + while ((dent = readdir(dir))) { + if (!strncmp(dent->d_name, "vfio", 4)) { + vfio_dev_path = g_strdup_printf("%s/%s/dev", path, dent->d_name); + break; + } + } + + if (!vfio_dev_path) { + error_setg(errp, "failed to find vfio-dev/vfioX/dev"); + goto out_close_dir; + } + + if (!g_file_get_contents(vfio_dev_path, &contents, &length, NULL)) { + error_setg(errp, "failed to load \"%s\"", vfio_dev_path); + 
goto out_free_dev_path; + } + + if (sscanf(contents, "%d:%d", &major, &minor) != 2) { + error_setg(errp, "failed to get major:minor for \"%s\"", vfio_dev_path); + goto out_free_dev_path; + } + g_free(contents); + vfio_devt = makedev(major, minor); + + vfio_path = g_strdup_printf("/dev/vfio/devices/%s", dent->d_name); + ret = open_cdev(vfio_path, vfio_devt); + if (ret < 0) { + error_setg(errp, "Failed to open %s", vfio_path); + } + + trace_iommufd_cdev_getfd(vfio_path, ret); + g_free(vfio_path); + +out_free_dev_path: + g_free(vfio_dev_path); +out_close_dir: + closedir(dir); +out_free_path: + if (*errp) { + error_prepend(errp, VFIO_MSG_PREFIX, path); + } + g_free(path); + + return ret; +} + +static int iommufd_cdev_attach_ioas_hwpt(VFIODevice *vbasedev, uint32_t id, + Error **errp) +{ + int ret, iommufd = vbasedev->iommufd->fd; + struct vfio_device_attach_iommufd_pt attach_data = { + .argsz = sizeof(attach_data), + .flags = 0, + .pt_id = id, + }; + + /* Attach device to an IOAS or hwpt within iommufd */ + ret = ioctl(vbasedev->fd, VFIO_DEVICE_ATTACH_IOMMUFD_PT, &attach_data); + if (ret) { + error_setg_errno(errp, errno, + "[iommufd=%d] error attach %s (%d) to id=%d", + iommufd, vbasedev->name, vbasedev->fd, id); + return -errno; + } + + trace_iommufd_cdev_attach_ioas_hwpt(iommufd, vbasedev->name, + vbasedev->fd, id); + return 0; +} + +static int iommufd_cdev_detach_ioas_hwpt(VFIODevice *vbasedev, Error **errp) +{ + int ret, iommufd = vbasedev->iommufd->fd; + struct vfio_device_detach_iommufd_pt detach_data = { + .argsz = sizeof(detach_data), + .flags = 0, + }; + + ret = ioctl(vbasedev->fd, VFIO_DEVICE_DETACH_IOMMUFD_PT, &detach_data); + if (ret) { + error_setg_errno(errp, errno, "detach %s failed", vbasedev->name); + } else { + trace_iommufd_cdev_detach_ioas_hwpt(iommufd, vbasedev->name); + } + return ret; +} + +static int iommufd_cdev_autodomains_get(VFIODevice *vbasedev, + VFIOIOMMUFDContainer *container, + Error **errp) +{ + ERRP_GUARD(); + IOMMUFDBackend *iommufd = vbasedev->iommufd; + uint32_t flags = 0; + VFIOIOASHwpt *hwpt; + uint32_t hwpt_id; + int ret; + + /* Try to find a domain */ + QLIST_FOREACH(hwpt, &container->hwpt_list, next) { + ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, errp); + if (ret) { + /* -EINVAL means the domain is incompatible with the device. */ + if (ret == -EINVAL) { + /* + * It is an expected failure and it just means we will try + * another domain, or create one if no existing compatible + * domain is found. Hence why the error is discarded below. + */ + error_free(*errp); + *errp = NULL; + continue; + } + + return ret; + } else { + vbasedev->hwpt = hwpt; + QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next); + vbasedev->iommu_dirty_tracking = iommufd_hwpt_dirty_tracking(hwpt); + return 0; + } + } + + /* + * This is quite early and VFIO Migration state isn't yet fully + * initialized, thus rely only on IOMMU hardware capabilities as to + * whether IOMMU dirty tracking is going to be requested. Later + * vfio_migration_realize() may decide to use VF dirty tracking + * instead. 
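iommufd_cdev_getfd() above recovers the character device from sysfs: it reads the vfio-dev/vfioX/dev attribute, which holds a "major:minor" pair, rebuilds the dev_t with makedev(), and lets open_cdev() validate that /dev/vfio/devices/vfioX really has that dev_t. The parsing step in isolation (sample string rather than a sysfs read):

#include <stdio.h>
#include <sys/sysmacros.h>   /* makedev() */
#include <sys/types.h>

int main(void)
{
    const char *contents = "511:0";  /* sample "dev" attribute contents */
    int maj, min;
    dev_t devt;

    if (sscanf(contents, "%d:%d", &maj, &min) != 2) {
        fprintf(stderr, "malformed dev attribute\n");
        return 1;
    }
    devt = makedev(maj, min);
    printf("character device %d:%d -> dev_t %#llx\n",
           maj, min, (unsigned long long)devt);
    return 0;
}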
+ */ + if (vbasedev->hiod->caps.hw_caps & IOMMU_HW_CAP_DIRTY_TRACKING) { + flags = IOMMU_HWPT_ALLOC_DIRTY_TRACKING; + } + + if (!iommufd_backend_alloc_hwpt(iommufd, vbasedev->devid, + container->ioas_id, flags, + IOMMU_HWPT_DATA_NONE, 0, NULL, + &hwpt_id, NULL, errp)) { + return -EINVAL; + } + + hwpt = g_malloc0(sizeof(*hwpt)); + hwpt->hwpt_id = hwpt_id; + hwpt->hwpt_flags = flags; + QLIST_INIT(&hwpt->device_list); + + ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, errp); + if (ret) { + iommufd_backend_free_id(container->be, hwpt->hwpt_id); + g_free(hwpt); + return ret; + } + + vbasedev->hwpt = hwpt; + vbasedev->iommu_dirty_tracking = iommufd_hwpt_dirty_tracking(hwpt); + QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next); + QLIST_INSERT_HEAD(&container->hwpt_list, hwpt, next); + container->bcontainer.dirty_pages_supported |= + vbasedev->iommu_dirty_tracking; + if (container->bcontainer.dirty_pages_supported && + !vbasedev->iommu_dirty_tracking) { + warn_report("IOMMU instance for device %s doesn't support dirty tracking", + vbasedev->name); + } + return 0; +} + +static void iommufd_cdev_autodomains_put(VFIODevice *vbasedev, + VFIOIOMMUFDContainer *container) +{ + VFIOIOASHwpt *hwpt = vbasedev->hwpt; + + QLIST_REMOVE(vbasedev, hwpt_next); + vbasedev->hwpt = NULL; + + if (QLIST_EMPTY(&hwpt->device_list)) { + QLIST_REMOVE(hwpt, next); + iommufd_backend_free_id(container->be, hwpt->hwpt_id); + g_free(hwpt); + } +} + +static int iommufd_cdev_attach_container(VFIODevice *vbasedev, + VFIOIOMMUFDContainer *container, + Error **errp) +{ + /* mdevs aren't physical devices and will fail with auto domains */ + if (!vbasedev->mdev) { + return iommufd_cdev_autodomains_get(vbasedev, container, errp); + } + + return iommufd_cdev_attach_ioas_hwpt(vbasedev, container->ioas_id, errp); +} + +static void iommufd_cdev_detach_container(VFIODevice *vbasedev, + VFIOIOMMUFDContainer *container) +{ + Error *err = NULL; + + if (iommufd_cdev_detach_ioas_hwpt(vbasedev, &err)) { + error_report_err(err); + } + + if (vbasedev->hwpt) { + iommufd_cdev_autodomains_put(vbasedev, container); + } + +} + +static void iommufd_cdev_container_destroy(VFIOIOMMUFDContainer *container) +{ + VFIOContainerBase *bcontainer = &container->bcontainer; + + if (!QLIST_EMPTY(&bcontainer->device_list)) { + return; + } + memory_listener_unregister(&bcontainer->listener); + vfio_container_destroy(bcontainer); + iommufd_backend_free_id(container->be, container->ioas_id); + g_free(container); +} + +static int iommufd_cdev_ram_block_discard_disable(bool state) +{ + /* + * We support coordinated discarding of RAM via the RamDiscardManager. 
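The hwpt reuse logic in iommufd_cdev_autodomains_get() above treats -EINVAL from the attach as "incompatible domain, try the next one" and any other error as fatal, falling back to allocating a fresh hwpt when nothing matches. The control flow in miniature (fake attach(), illustrative constants):

#include <errno.h>
#include <stdio.h>

#define NDOMAINS 3

static int attach(int domain_id)
{
    return domain_id == 2 ? 0 : -EINVAL;  /* only domain 2 is compatible */
}

int main(void)
{
    int id, attached = -1;

    for (id = 0; id < NDOMAINS; id++) {
        int ret = attach(id);
        if (ret == -EINVAL) {
            continue;             /* expected: probe the next domain */
        }
        if (ret) {
            return 1;             /* real failure: give up */
        }
        attached = id;
        break;
    }
    if (attached < 0) {
        attached = NDOMAINS;      /* no match: allocate a fresh hwpt */
    }
    printf("using domain %d\n", attached);
    return 0;
}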
+ */ + return ram_block_uncoordinated_discard_disable(state); +} + +static int iommufd_cdev_get_info_iova_range(VFIOIOMMUFDContainer *container, + uint32_t ioas_id, Error **errp) +{ + VFIOContainerBase *bcontainer = &container->bcontainer; + struct iommu_ioas_iova_ranges *info; + struct iommu_iova_range *iova_ranges; + int ret, sz, fd = container->be->fd; + + info = g_malloc0(sizeof(*info)); + info->size = sizeof(*info); + info->ioas_id = ioas_id; + + ret = ioctl(fd, IOMMU_IOAS_IOVA_RANGES, info); + if (ret && errno != EMSGSIZE) { + goto error; + } + + sz = info->num_iovas * sizeof(struct iommu_iova_range); + info = g_realloc(info, sizeof(*info) + sz); + info->allowed_iovas = (uintptr_t)(info + 1); + + ret = ioctl(fd, IOMMU_IOAS_IOVA_RANGES, info); + if (ret) { + goto error; + } + + iova_ranges = (struct iommu_iova_range *)(uintptr_t)info->allowed_iovas; + + for (int i = 0; i < info->num_iovas; i++) { + Range *range = g_new(Range, 1); + + range_set_bounds(range, iova_ranges[i].start, iova_ranges[i].last); + bcontainer->iova_ranges = + range_list_insert(bcontainer->iova_ranges, range); + } + bcontainer->pgsizes = info->out_iova_alignment; + + g_free(info); + return 0; + +error: + ret = -errno; + g_free(info); + error_setg_errno(errp, errno, "Cannot get IOVA ranges"); + return ret; +} + +static int iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, + AddressSpace *as, Error **errp) +{ + VFIOContainerBase *bcontainer; + VFIOIOMMUFDContainer *container; + VFIOAddressSpace *space; + struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) }; + int ret, devfd; + uint32_t ioas_id; + Error *err = NULL; + const VFIOIOMMUClass *iommufd_vioc = + VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD)); + + if (vbasedev->fd < 0) { + devfd = iommufd_cdev_getfd(vbasedev->sysfsdev, errp); + if (devfd < 0) { + return devfd; + } + vbasedev->fd = devfd; + } else { + devfd = vbasedev->fd; + } + + ret = iommufd_cdev_connect_and_bind(vbasedev, errp); + if (ret) { + goto err_connect_bind; + } + + space = vfio_get_address_space(as); + + /* + * The HostIOMMUDevice data from legacy backend is static and doesn't need + * any information from the (type1-iommu) backend to be initialized. In + * contrast however, the IOMMUFD HostIOMMUDevice data requires the iommufd + * FD to be connected and having a devid to be able to successfully call + * iommufd_backend_get_device_info(). 
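iommufd_cdev_get_info_iova_range() above uses the classic two-call sizing idiom: the first IOMMU_IOAS_IOVA_RANGES call may fail with EMSGSIZE but still reports num_iovas, the buffer is regrown, and the ioctl is repeated. The shape of the idiom with a fake query function standing in for the ioctl:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct query {
    size_t capacity;   /* what the caller allocated */
    size_t needed;     /* filled in by the "kernel" */
    int *out;
};

static int fake_query(struct query *q)
{
    q->needed = 4;                 /* kernel always reports the real count */
    if (q->capacity < q->needed) {
        errno = EMSGSIZE;
        return -1;
    }
    for (size_t i = 0; i < q->needed; i++) {
        q->out[i] = (int)i;
    }
    return 0;
}

int main(void)
{
    struct query q = { 0 };

    if (fake_query(&q) && errno != EMSGSIZE) {
        return 1;                  /* anything but EMSGSIZE is fatal */
    }
    q.out = calloc(q.needed, sizeof(*q.out));
    q.capacity = q.needed;
    if (fake_query(&q)) {          /* second call with a big-enough buffer */
        return 1;
    }
    printf("got %zu ranges\n", q.needed);
    free(q.out);
    return 0;
}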
+ */ + if (!vfio_device_hiod_realize(vbasedev, errp)) { + goto err_alloc_ioas; + } + + /* try to attach to an existing container in this space */ + QLIST_FOREACH(bcontainer, &space->containers, next) { + container = container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); + if (bcontainer->ops != iommufd_vioc || + vbasedev->iommufd != container->be) { + continue; + } + if (iommufd_cdev_attach_container(vbasedev, container, &err)) { + const char *msg = error_get_pretty(err); + + trace_iommufd_cdev_fail_attach_existing_container(msg); + error_free(err); + err = NULL; + } else { + ret = iommufd_cdev_ram_block_discard_disable(true); + if (ret) { + error_setg(errp, + "Cannot set discarding of RAM broken (%d)", ret); + goto err_discard_disable; + } + goto found_container; + } + } + + /* Need to allocate a new dedicated container */ + ret = iommufd_backend_alloc_ioas(vbasedev->iommufd, &ioas_id, errp); + if (ret < 0) { + goto err_alloc_ioas; + } + + trace_iommufd_cdev_alloc_ioas(vbasedev->iommufd->fd, ioas_id); + + container = g_malloc0(sizeof(*container)); + container->be = vbasedev->iommufd; + container->ioas_id = ioas_id; + QLIST_INIT(&container->hwpt_list); + + bcontainer = &container->bcontainer; + vfio_container_init(bcontainer, space, iommufd_vioc); + QLIST_INSERT_HEAD(&space->containers, bcontainer, next); + + ret = iommufd_cdev_attach_container(vbasedev, container, errp); + if (ret) { + goto err_attach_container; + } + + ret = iommufd_cdev_ram_block_discard_disable(true); + if (ret) { + goto err_discard_disable; + } + + ret = iommufd_cdev_get_info_iova_range(container, ioas_id, &err); + if (ret) { + error_append_hint(&err, + "Fallback to default 64bit IOVA range and 4K page size\n"); + warn_report_err(err); + err = NULL; + bcontainer->pgsizes = qemu_real_host_page_size(); + } + + bcontainer->listener = vfio_memory_listener; + memory_listener_register(&bcontainer->listener, bcontainer->space->as); + + if (bcontainer->error) { + ret = -1; + error_propagate_prepend(errp, bcontainer->error, + "memory listener initialization failed: "); + goto err_listener_register; + } + + bcontainer->initialized = true; + +found_container: + ret = ioctl(devfd, VFIO_DEVICE_GET_INFO, &dev_info); + if (ret) { + error_setg_errno(errp, errno, "error getting device info"); + goto err_listener_register; + } + + /* + * TODO: examine RAM_BLOCK_DISCARD stuff, should we do group level + * for discarding incompatibility check as well? 
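The attach path above unwinds failures with the kernel-style goto ladder visible in its err_* labels: each label undoes only the steps that had already succeeded, in reverse order. Reduced to a runnable skeleton (stand-in step()/undo() helpers):

#include <stdio.h>

static int step(const char *name, int fail)
{
    printf("do %s\n", name);
    return fail;
}

static void undo(const char *name)
{
    printf("undo %s\n", name);
}

int main(void)
{
    int ret;

    if ((ret = step("connect", 0)))  goto err_connect;
    if ((ret = step("attach", 0)))   goto err_attach;
    if ((ret = step("listener", 1))) goto err_listener;  /* simulate failure */
    return 0;

err_listener:
    undo("attach");      /* only the steps that succeeded are rolled back */
err_attach:
    undo("connect");
err_connect:
    return ret;
}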
+ */ + if (vbasedev->ram_block_discard_allowed) { + iommufd_cdev_ram_block_discard_disable(false); + } + + vbasedev->group = 0; + vbasedev->num_irqs = dev_info.num_irqs; + vbasedev->num_regions = dev_info.num_regions; + vbasedev->flags = dev_info.flags; + vbasedev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET); + vbasedev->bcontainer = bcontainer; + QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next); + QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next); + + trace_iommufd_cdev_device_info(vbasedev->name, devfd, vbasedev->num_irqs, + vbasedev->num_regions, vbasedev->flags); + return 0; + +err_listener_register: + iommufd_cdev_ram_block_discard_disable(false); +err_discard_disable: + iommufd_cdev_detach_container(vbasedev, container); +err_attach_container: + iommufd_cdev_container_destroy(container); +err_alloc_ioas: + vfio_put_address_space(space); + iommufd_cdev_unbind_and_disconnect(vbasedev); +err_connect_bind: + close(vbasedev->fd); + return ret; +} + +static void iommufd_cdev_detach(VFIODevice *vbasedev) +{ + VFIOContainerBase *bcontainer = vbasedev->bcontainer; + VFIOAddressSpace *space = bcontainer->space; + VFIOIOMMUFDContainer *container = container_of(bcontainer, + VFIOIOMMUFDContainer, + bcontainer); + QLIST_REMOVE(vbasedev, global_next); + QLIST_REMOVE(vbasedev, container_next); + vbasedev->bcontainer = NULL; + + if (!vbasedev->ram_block_discard_allowed) { + iommufd_cdev_ram_block_discard_disable(false); + } + + iommufd_cdev_detach_container(vbasedev, container); + iommufd_cdev_container_destroy(container); + vfio_put_address_space(space); + + iommufd_cdev_unbind_and_disconnect(vbasedev); + close(vbasedev->fd); +} + +static VFIODevice *iommufd_cdev_pci_find_by_devid(__u32 devid) +{ + VFIODevice *vbasedev_iter; + const VFIOIOMMUClass *iommufd_vioc = + VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD)); + + QLIST_FOREACH(vbasedev_iter, &vfio_device_list, global_next) { + if (vbasedev_iter->bcontainer->ops != iommufd_vioc) { + continue; + } + if (devid == vbasedev_iter->devid) { + return vbasedev_iter; + } + } + return NULL; +} + +static VFIOPCIDevice * +iommufd_cdev_dep_get_realized_vpdev(struct vfio_pci_dependent_device *dep_dev, + VFIODevice *reset_dev) +{ + VFIODevice *vbasedev_tmp; + + if (dep_dev->devid == reset_dev->devid || + dep_dev->devid == VFIO_PCI_DEVID_OWNED) { + return NULL; + } + + vbasedev_tmp = iommufd_cdev_pci_find_by_devid(dep_dev->devid); + if (!vbasedev_tmp || !vbasedev_tmp->dev->realized || + vbasedev_tmp->type != VFIO_DEVICE_TYPE_PCI) { + return NULL; + } + + return container_of(vbasedev_tmp, VFIOPCIDevice, vbasedev); +} + +static int iommufd_cdev_pci_hot_reset(VFIODevice *vbasedev, bool single) +{ + VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); + struct vfio_pci_hot_reset_info *info = NULL; + struct vfio_pci_dependent_device *devices; + struct vfio_pci_hot_reset *reset; + int ret, i; + bool multi = false; + + trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? 
"one" : "multi"); + + if (!single) { + vfio_pci_pre_reset(vdev); + } + vdev->vbasedev.needs_reset = false; + + ret = vfio_pci_get_pci_hot_reset_info(vdev, &info); + + if (ret) { + goto out_single; + } + + assert(info->flags & VFIO_PCI_HOT_RESET_FLAG_DEV_ID); + + devices = &info->devices[0]; + + if (!(info->flags & VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED)) { + if (!vdev->has_pm_reset) { + for (i = 0; i < info->count; i++) { + if (devices[i].devid == VFIO_PCI_DEVID_NOT_OWNED) { + error_report("vfio: Cannot reset device %s, " + "depends on device %04x:%02x:%02x.%x " + "which is not owned.", + vdev->vbasedev.name, devices[i].segment, + devices[i].bus, PCI_SLOT(devices[i].devfn), + PCI_FUNC(devices[i].devfn)); + } + } + } + ret = -EPERM; + goto out_single; + } + + trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name); + + for (i = 0; i < info->count; i++) { + VFIOPCIDevice *tmp; + + trace_iommufd_cdev_pci_hot_reset_dep_devices(devices[i].segment, + devices[i].bus, + PCI_SLOT(devices[i].devfn), + PCI_FUNC(devices[i].devfn), + devices[i].devid); + + /* + * If a VFIO cdev device is resettable, all the dependent devices + * are either bound to same iommufd or within same iommu_groups as + * one of the iommufd bound devices. + */ + assert(devices[i].devid != VFIO_PCI_DEVID_NOT_OWNED); + + tmp = iommufd_cdev_dep_get_realized_vpdev(&devices[i], &vdev->vbasedev); + if (!tmp) { + continue; + } + + if (single) { + ret = -EINVAL; + goto out_single; + } + vfio_pci_pre_reset(tmp); + tmp->vbasedev.needs_reset = false; + multi = true; + } + + if (!single && !multi) { + ret = -EINVAL; + goto out_single; + } + + /* Use zero length array for hot reset with iommufd backend */ + reset = g_malloc0(sizeof(*reset)); + reset->argsz = sizeof(*reset); + + /* Bus reset! */ + ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset); + g_free(reset); + if (ret) { + ret = -errno; + } + + trace_vfio_pci_hot_reset_result(vdev->vbasedev.name, + ret ? 
strerror(errno) : "Success"); + + /* Re-enable INTx on affected devices */ + for (i = 0; i < info->count; i++) { + VFIOPCIDevice *tmp; + + tmp = iommufd_cdev_dep_get_realized_vpdev(&devices[i], &vdev->vbasedev); + if (!tmp) { + continue; + } + vfio_pci_post_reset(tmp); + } +out_single: + if (!single) { + vfio_pci_post_reset(vdev); + } + g_free(info); + + return ret; +} + +static void vfio_iommu_iommufd_class_init(ObjectClass *klass, void *data) +{ + VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass); + + vioc->hiod_typename = TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO; + + vioc->dma_map = iommufd_cdev_map; + vioc->dma_unmap = iommufd_cdev_unmap; + vioc->attach_device = iommufd_cdev_attach; + vioc->detach_device = iommufd_cdev_detach; + vioc->pci_hot_reset = iommufd_cdev_pci_hot_reset; + vioc->set_dirty_page_tracking = iommufd_set_dirty_page_tracking; + vioc->query_dirty_bitmap = iommufd_query_dirty_bitmap; +}; + +static bool +host_iommu_device_iommufd_vfio_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + uint32_t hwpt_id, Error **errp) +{ + VFIODevice *vbasedev = HOST_IOMMU_DEVICE(idev)->agent; + + return !iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt_id, errp); +} + +static bool +host_iommu_device_iommufd_vfio_detach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + Error **errp) +{ + VFIODevice *vbasedev = HOST_IOMMU_DEVICE(idev)->agent; + + return !iommufd_cdev_detach_ioas_hwpt(vbasedev, errp); +} + +static bool hiod_iommufd_vfio_realize_late(HostIOMMUDevice *hiod, void *opaque, + Error **errp) +{ + VFIODevice *vdev = opaque; + VFIOIOMMUFDContainer *container = container_of(vdev->bcontainer, + VFIOIOMMUFDContainer, + bcontainer); + HostIOMMUDeviceIOMMUFD *idev = HOST_IOMMU_DEVICE_IOMMUFD(hiod); + + idev->iommufd = vdev->iommufd; + idev->devid = vdev->devid; + idev->ioas_id = container->ioas_id; + + return true; +} + +static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, + Error **errp) +{ + VFIODevice *vdev = opaque; + HostIOMMUDeviceCaps *caps = &hiod->caps; + enum iommu_hw_info_type type; + union { + struct iommu_hw_info_vtd vtd; + } data; + uint64_t hw_caps; + uint8_t pasids; + + hiod->agent = opaque; + + if (!iommufd_backend_get_device_info(vdev->iommufd, vdev->devid, + &type, &data, sizeof(data), + &hw_caps, &pasids, errp)) { + return false; + } + + hiod->name = g_strdup(vdev->name); + caps->type = type; + caps->hw_caps = hw_caps; + caps->max_pasid_log2 = pasids; + + return true; +} + +static void hiod_iommufd_vfio_class_init(ObjectClass *oc, void *data) +{ + HostIOMMUDeviceClass *hiodc = HOST_IOMMU_DEVICE_CLASS(oc); + HostIOMMUDeviceIOMMUFDClass *idevc = HOST_IOMMU_DEVICE_IOMMUFD_CLASS(oc); + + hiodc->realize = hiod_iommufd_vfio_realize; + hiodc->realize_late = hiod_iommufd_vfio_realize_late; + + idevc->attach_hwpt = host_iommu_device_iommufd_vfio_attach_hwpt; + idevc->detach_hwpt = host_iommu_device_iommufd_vfio_detach_hwpt; +}; + +static const TypeInfo types[] = { + { + .name = TYPE_VFIO_IOMMU_IOMMUFD, + .parent = TYPE_VFIO_IOMMU, + .class_init = vfio_iommu_iommufd_class_init, + }, { + .name = TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO, + .parent = TYPE_HOST_IOMMU_DEVICE_IOMMUFD, + .class_init = hiod_iommufd_vfio_class_init, + } +}; + +DEFINE_TYPES(types) diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build index 2a6912c94027d6213144f910d64625a469cc2b1f..bda2688983e9e7013dcd12f6696253e835ded8ef 100644 --- a/hw/vfio/meson.build +++ b/hw/vfio/meson.build @@ -2,10 +2,14 @@ vfio_ss = ss.source_set() vfio_ss.add(files( 'helpers.c', 'common.c', + 'container-base.c', 'container.c', - 'spapr.c', 
'migration.c', )) +vfio_ss.add(when: 'CONFIG_PSERIES', if_true: files('spapr.c')) +vfio_ss.add(when: 'CONFIG_IOMMUFD', if_true: files( + 'iommufd.c', +)) vfio_ss.add(when: 'CONFIG_VFIO_PCI', if_true: files( 'display.c', 'pci-quirks.c', @@ -17,5 +21,6 @@ vfio_ss.add(when: 'CONFIG_VFIO_XGMAC', if_true: files('calxeda-xgmac.c')) vfio_ss.add(when: 'CONFIG_VFIO_AMD_XGBE', if_true: files('amd-xgbe.c')) vfio_ss.add(when: 'CONFIG_VFIO_AP', if_true: files('ap.c')) vfio_ss.add(when: 'CONFIG_VFIO_IGD', if_true: files('igd.c')) +vfio_ss.add(when: 'CONFIG_VFIO_HCT', if_true: files('hct.c')) specific_ss.add_all(when: 'CONFIG_VFIO', if_true: vfio_ss) diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index 28d422b39f9f70e94a2f396b0fb064c5de17dc28..3924beb2893fa642add56a07f79fc89b63fb1209 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -945,16 +945,18 @@ bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp) return !vfio_block_migration(vbasedev, err, errp); } - if (!vbasedev->dirty_pages_supported) { + if ((!vbasedev->dirty_pages_supported || + vbasedev->device_dirty_page_tracking == ON_OFF_AUTO_OFF) && + !vbasedev->iommu_dirty_tracking) { if (vbasedev->enable_migration == ON_OFF_AUTO_AUTO) { error_setg(&err, - "%s: VFIO device doesn't support device dirty tracking", - vbasedev->name); + "%s: VFIO device doesn't support device and " + "IOMMU dirty tracking", vbasedev->name); goto add_blocker; } - warn_report("%s: VFIO device doesn't support device dirty tracking", - vbasedev->name); + warn_report("%s: VFIO device doesn't support device and " + "IOMMU dirty tracking", vbasedev->name); } ret = vfio_block_multiple_devices_migration(vbasedev, errp); diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c index 84b1a7b9485c5c1758179ffe7ce6126d1f240f5d..a71ebe26b427e6ca2ceb383b5d5adfe3460cfb91 100644 --- a/hw/vfio/pci-quirks.c +++ b/hw/vfio/pci-quirks.c @@ -1209,6 +1209,190 @@ int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev, return 0; } +#define PCI_VENDOR_ID_HUAWEI 0x19e5 +#define PCI_DEVICE_ID_ASCEND910 0xd801 +#define PCI_DEVICE_ID_ASCEND710 0xd500 +#define PCI_DEVICE_ID_ASCEND310 0xd100 +#define PCI_SUB_DEVICE_ID_ASCEND710_1P_MIN 0x100 +#define PCI_SUB_DEVICE_ID_ASCEND710_1P_MAX 0x10f +#define PCI_SUB_DEVICE_ID_ASCEND710_2P_MIN 0x110 +#define PCI_SUB_DEVICE_ID_ASCEND710_2P_MAX 0x11f +#define ASCEND910_XLOADER_SIZE 4 +#define ASCEND910_XLOADER_OFFSET 0x80400 +#define ASCEND710_2P_BASE (128 * 1024 * 1024) +#define ASCEND710_1P_DEVNUM 1 +#define ASCEND710_2P_DEVNUM 2 +#define ASCEND710_XLOADER_SIZE 4 +#define ASCEND710_XLOADER_OFFSET 0x100430 +#define ASCEND310_XLOADER_SIZE 4 +#define ASCEND310_XLOADER_OFFSET 0x400 + +typedef struct VFIOAscendBarQuirk { + struct VFIOPCIDevice *vdev; + pcibus_t offset; + uint8_t bar; + MemoryRegion *mem; +} VFIOAscendBarQuirk; + +static uint64_t vfio_ascend_quirk_read(void *opaque, + hwaddr addr, unsigned size) +{ + VFIOAscendBarQuirk *quirk = opaque; + VFIOPCIDevice *vdev = quirk->vdev; + + qemu_log("read RO region! addr=0x%" HWADDR_PRIx ", size=%d\n", + addr + quirk->offset, size); + + return vfio_region_read(&vdev->bars[quirk->bar].region, + addr + quirk->offset, size); +} + +static void vfio_ascend_quirk_write(void *opaque, hwaddr addr, + uint64_t data, unsigned size) +{ + VFIOAscendBarQuirk *quirk = opaque; + + qemu_log("modifying RO region is not allowed! 
addr=0x%" + HWADDR_PRIx ", data=0x%" PRIx64 ", size=%d\n", + addr + quirk->offset, data, size); +} + +static const MemoryRegionOps vfio_ascend_intercept_regs_quirk = { + .read = vfio_ascend_quirk_read, + .write = vfio_ascend_quirk_write, + .endianness = DEVICE_LITTLE_ENDIAN, +}; + +static void vfio_probe_ascend910_bar0_quirk(VFIOPCIDevice *vdev, int nr) +{ + VFIOQuirk *quirk; + VFIOAscendBarQuirk *bar0_quirk; + + if (vdev->vendor_id != PCI_VENDOR_ID_HUAWEI || nr != 0 || + vdev->device_id != PCI_DEVICE_ID_ASCEND910) { + return; + } + + quirk = g_malloc0(sizeof(*quirk)); + quirk->nr_mem = 1; + quirk->mem = g_new0(MemoryRegion, quirk->nr_mem); + bar0_quirk = quirk->data = g_new0(typeof(*bar0_quirk), quirk->nr_mem); + bar0_quirk[0].vdev = vdev; + bar0_quirk[0].offset = ASCEND910_XLOADER_OFFSET; + bar0_quirk[0].bar = nr; + + /* + * intercept w/r to the xloader-updating register, + * so the vm can't enable xloader-updating + */ + memory_region_init_io(&quirk->mem[0], OBJECT(vdev), + &vfio_ascend_intercept_regs_quirk, + &bar0_quirk[0], + "vfio-ascend910-bar0-intercept-regs-quirk", + ASCEND910_XLOADER_SIZE); + memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, + bar0_quirk[0].offset, + &quirk->mem[0], 1); + QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); +} + +static void vfio_probe_ascend710_bar2_quirk(VFIOPCIDevice *vdev, int nr) +{ + VFIOQuirk *quirk; + VFIOAscendBarQuirk *bar2_quirk; + int sub_device_id; + int devnum = 0; + + if (vdev->vendor_id != PCI_VENDOR_ID_HUAWEI || nr != 2 || + vdev->device_id != PCI_DEVICE_ID_ASCEND710) { + return; + } + + sub_device_id = pci_get_word(vdev->pdev.config + PCI_SUBSYSTEM_ID); + if (sub_device_id >= PCI_SUB_DEVICE_ID_ASCEND710_1P_MIN && + sub_device_id <= PCI_SUB_DEVICE_ID_ASCEND710_1P_MAX) { + devnum = ASCEND710_1P_DEVNUM; + } else if (sub_device_id >= PCI_SUB_DEVICE_ID_ASCEND710_2P_MIN && + sub_device_id <= PCI_SUB_DEVICE_ID_ASCEND710_2P_MAX) { + devnum = ASCEND710_2P_DEVNUM; + } + + if (devnum != ASCEND710_1P_DEVNUM && devnum != ASCEND710_2P_DEVNUM) { + return; + } + + quirk = g_malloc0(sizeof(*quirk)); + quirk->nr_mem = devnum; + quirk->mem = g_new0(MemoryRegion, quirk->nr_mem); + bar2_quirk = quirk->data = g_new0(typeof(*bar2_quirk), quirk->nr_mem); + bar2_quirk[0].vdev = vdev; + bar2_quirk[0].offset = ASCEND710_XLOADER_OFFSET; + bar2_quirk[0].bar = nr; + + /* + * intercept w/r to the xloader-updating register, + * so the vm can't enable xloader-updating + */ + memory_region_init_io(&quirk->mem[0], OBJECT(vdev), + &vfio_ascend_intercept_regs_quirk, + &bar2_quirk[0], + "vfio-ascend710-bar2-1p-intercept-regs-quirk", + ASCEND710_XLOADER_SIZE); + memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, + bar2_quirk[0].offset, + &quirk->mem[0], 1); + + if (devnum == ASCEND710_2P_DEVNUM) { + bar2_quirk[1].vdev = vdev; + bar2_quirk[1].offset = (ASCEND710_2P_BASE + ASCEND710_XLOADER_OFFSET); + bar2_quirk[1].bar = nr; + + memory_region_init_io(&quirk->mem[1], OBJECT(vdev), + &vfio_ascend_intercept_regs_quirk, + &bar2_quirk[1], + "vfio-ascend710-bar2-2p-intercept-regs-quirk", + ASCEND710_XLOADER_SIZE); + memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, + bar2_quirk[1].offset, + &quirk->mem[1], 1); + } + + QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); +} + +static void vfio_probe_ascend310_bar4_quirk(VFIOPCIDevice *vdev, int nr) +{ + VFIOQuirk *quirk; + VFIOAscendBarQuirk *bar4_quirk; + + if (vdev->vendor_id != PCI_VENDOR_ID_HUAWEI || nr != 4 || + vdev->device_id != PCI_DEVICE_ID_ASCEND310) { + return; + } + + 
quirk = g_malloc0(sizeof(*quirk)); + quirk->nr_mem = 1; + quirk->mem = g_new0(MemoryRegion, quirk->nr_mem); + bar4_quirk = quirk->data = g_new0(typeof(*bar4_quirk), quirk->nr_mem); + bar4_quirk[0].vdev = vdev; + bar4_quirk[0].offset = ASCEND310_XLOADER_OFFSET; + bar4_quirk[0].bar = nr; + + /* + * intercept w/r to the xloader-updating register, + * so the vm can't enable xloader-updating + */ + memory_region_init_io(&quirk->mem[0], OBJECT(vdev), + &vfio_ascend_intercept_regs_quirk, + &bar4_quirk[0], + "vfio-ascend310-bar4-intercept-regs-quirk", + ASCEND310_XLOADER_SIZE); + memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, + bar4_quirk[0].offset, + &quirk->mem[0], 1); + QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); +} + /* * Common quirk probe entry points. */ @@ -1261,6 +1445,9 @@ void vfio_bar_quirk_setup(VFIOPCIDevice *vdev, int nr) #ifdef CONFIG_VFIO_IGD vfio_probe_igd_bar4_quirk(vdev, nr); #endif + vfio_probe_ascend910_bar0_quirk(vdev, nr); + vfio_probe_ascend710_bar2_quirk(vdev, nr); + vfio_probe_ascend310_bar4_quirk(vdev, nr); } void vfio_bar_quirk_exit(VFIOPCIDevice *vdev, int nr) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index c62c02f7b692c98bba1b931ebb1a4254a7f56061..ce958848b61d0d871a08ce28dc9bc5009870d1a2 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -19,7 +19,9 @@ */ #include "qemu/osdep.h" +#include CONFIG_DEVICES /* CONFIG_IOMMUFD */ #include +#include #include #include "hw/hw.h" @@ -42,6 +44,7 @@ #include "qapi/error.h" #include "migration/blocker.h" #include "migration/qemu-file.h" +#include "sysemu/iommufd.h" #define TYPE_VFIO_PCI_NOHOTPLUG "vfio-pci-nohotplug" @@ -504,8 +507,9 @@ static void vfio_remove_kvm_msi_virq(VFIOMSIVector *vector) static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg, PCIDevice *pdev) { - kvm_irqchip_update_msi_route(kvm_state, vector->virq, msg, pdev); - kvm_irqchip_commit_routes(kvm_state); + KVMRouteChange c = kvm_irqchip_begin_route_changes(kvm_state); + kvm_irqchip_update_msi_route(&c, vector->virq, msg, pdev); + kvm_irqchip_commit_route_changes(&c); } static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, @@ -2346,6 +2350,33 @@ static void vfio_add_ext_cap(VFIOPCIDevice *vdev) } + { + HostIOMMUDeviceCaps *caps = &vdev->vbasedev.hiod->caps; + + /* + * TODO: Add option for enabling pasid at a safe offset, this adds the + * pasid capability in the end of the PCIE config space. 
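+ * + * The layout assumed here follows the PCIe PASID extended capability: + * Max PASID Width sits in bits 12:8 of the capability register (hence + * the max_pasid_log2 << 8 below), and the Exec/Privileged Mode bits + * are only advertised when the host IOMMU reports the matching + * IOMMU_HW_CAP_PCI_PASID_* flags.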
+ */ + if (caps->max_pasid_log2 && pci_device_get_pasid_cap(&vdev->pdev)) { + uint16_t pasid_caps = (caps->max_pasid_log2 << 8) & PCI_PASID_CAP_WIDTH; + + if (caps->hw_caps & IOMMU_HW_CAP_PCI_PASID_EXEC) { + pasid_caps |= PCI_PASID_CAP_EXEC; + } + + if (caps->hw_caps & IOMMU_HW_CAP_PCI_PASID_PRIV) { + pasid_caps |= PCI_PASID_CAP_PRIV; + } + + pcie_pasid_init(pdev, + PCIE_CONFIG_SPACE_SIZE - PCI_EXT_CAP_PASID_SIZEOF, + pasid_caps); + + /* PASID capability is fully emulated by QEMU */ + memset(vdev->emulated_config_bits + pdev->exp.pasid_cap, 0xff, 8); + } + } + /* Cleanup chain head ID if necessary */ if (pci_get_word(pdev->config + PCI_CONFIG_SPACE_SIZE) == 0xFFFF) { pci_set_word(pdev->config + PCI_CONFIG_SPACE_SIZE, 0); @@ -2374,7 +2405,7 @@ static int vfio_add_capabilities(VFIOPCIDevice *vdev, Error **errp) return 0; } -static void vfio_pci_pre_reset(VFIOPCIDevice *vdev) +void vfio_pci_pre_reset(VFIOPCIDevice *vdev) { PCIDevice *pdev = &vdev->pdev; uint16_t cmd; @@ -2411,7 +2442,7 @@ static void vfio_pci_pre_reset(VFIOPCIDevice *vdev) vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2); } -static void vfio_pci_post_reset(VFIOPCIDevice *vdev) +void vfio_pci_post_reset(VFIOPCIDevice *vdev) { Error *err = NULL; int nr; @@ -2435,7 +2466,7 @@ static void vfio_pci_post_reset(VFIOPCIDevice *vdev) vfio_quirk_reset(vdev); } -static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name) +bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name) { char tmp[13]; @@ -2445,22 +2476,13 @@ static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name) return (strcmp(tmp, name) == 0); } -static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) +int vfio_pci_get_pci_hot_reset_info(VFIOPCIDevice *vdev, + struct vfio_pci_hot_reset_info **info_p) { - VFIOGroup *group; struct vfio_pci_hot_reset_info *info; - struct vfio_pci_dependent_device *devices; - struct vfio_pci_hot_reset *reset; - int32_t *fds; - int ret, i, count; - bool multi = false; + int ret, count; - trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? 
"one" : "multi"); - - if (!single) { - vfio_pci_pre_reset(vdev); - } - vdev->vbasedev.needs_reset = false; + assert(info_p && !*info_p); info = g_malloc0(sizeof(*info)); info->argsz = sizeof(*info); @@ -2468,163 +2490,36 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info); if (ret && errno != ENOSPC) { ret = -errno; + g_free(info); if (!vdev->has_pm_reset) { error_report("vfio: Cannot reset device %s, " "no available reset mechanism.", vdev->vbasedev.name); } - goto out_single; + return ret; } count = info->count; - info = g_realloc(info, sizeof(*info) + (count * sizeof(*devices))); - info->argsz = sizeof(*info) + (count * sizeof(*devices)); - devices = &info->devices[0]; + info = g_realloc(info, sizeof(*info) + (count * sizeof(info->devices[0]))); + info->argsz = sizeof(*info) + (count * sizeof(info->devices[0])); ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info); if (ret) { ret = -errno; + g_free(info); error_report("vfio: hot reset info failed: %m"); - goto out_single; - } - - trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name); - - /* Verify that we have all the groups required */ - for (i = 0; i < info->count; i++) { - PCIHostDeviceAddress host; - VFIOPCIDevice *tmp; - VFIODevice *vbasedev_iter; - - host.domain = devices[i].segment; - host.bus = devices[i].bus; - host.slot = PCI_SLOT(devices[i].devfn); - host.function = PCI_FUNC(devices[i].devfn); - - trace_vfio_pci_hot_reset_dep_devices(host.domain, - host.bus, host.slot, host.function, devices[i].group_id); - - if (vfio_pci_host_match(&host, vdev->vbasedev.name)) { - continue; - } - - QLIST_FOREACH(group, &vfio_group_list, next) { - if (group->groupid == devices[i].group_id) { - break; - } - } - - if (!group) { - if (!vdev->has_pm_reset) { - error_report("vfio: Cannot reset device %s, " - "depends on group %d which is not owned.", - vdev->vbasedev.name, devices[i].group_id); - } - ret = -EPERM; - goto out; - } - - /* Prep dependent devices for reset and clear our marker. */ - QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { - if (!vbasedev_iter->dev->realized || - vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) { - continue; - } - tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev); - if (vfio_pci_host_match(&host, tmp->vbasedev.name)) { - if (single) { - ret = -EINVAL; - goto out_single; - } - vfio_pci_pre_reset(tmp); - tmp->vbasedev.needs_reset = false; - multi = true; - break; - } - } - } - - if (!single && !multi) { - ret = -EINVAL; - goto out_single; - } - - /* Determine how many group fds need to be passed */ - count = 0; - QLIST_FOREACH(group, &vfio_group_list, next) { - for (i = 0; i < info->count; i++) { - if (group->groupid == devices[i].group_id) { - count++; - break; - } - } - } - - reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds))); - reset->argsz = sizeof(*reset) + (count * sizeof(*fds)); - fds = &reset->group_fds[0]; - - /* Fill in group fds */ - QLIST_FOREACH(group, &vfio_group_list, next) { - for (i = 0; i < info->count; i++) { - if (group->groupid == devices[i].group_id) { - fds[reset->count++] = group->fd; - break; - } - } + return ret; } - /* Bus reset! */ - ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset); - g_free(reset); - - trace_vfio_pci_hot_reset_result(vdev->vbasedev.name, - ret ? 
strerror(errno) : "Success"); - -out: - /* Re-enable INTx on affected devices */ - for (i = 0; i < info->count; i++) { - PCIHostDeviceAddress host; - VFIOPCIDevice *tmp; - VFIODevice *vbasedev_iter; - - host.domain = devices[i].segment; - host.bus = devices[i].bus; - host.slot = PCI_SLOT(devices[i].devfn); - host.function = PCI_FUNC(devices[i].devfn); - - if (vfio_pci_host_match(&host, vdev->vbasedev.name)) { - continue; - } - - QLIST_FOREACH(group, &vfio_group_list, next) { - if (group->groupid == devices[i].group_id) { - break; - } - } - - if (!group) { - break; - } + *info_p = info; + return 0; +} - QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { - if (!vbasedev_iter->dev->realized || - vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) { - continue; - } - tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev); - if (vfio_pci_host_match(&host, tmp->vbasedev.name)) { - vfio_pci_post_reset(tmp); - break; - } - } - } -out_single: - if (!single) { - vfio_pci_post_reset(vdev); - } - g_free(info); +static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) +{ + VFIODevice *vbasedev = &vdev->vbasedev; + const VFIOIOMMUClass *ops = vbasedev->bcontainer->ops; - return ret; + return ops->pci_hot_reset(vbasedev, single); } /* @@ -3076,19 +2971,19 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) { VFIOPCIDevice *vdev = VFIO_PCI(pdev); VFIODevice *vbasedev = &vdev->vbasedev; - char *tmp, *subsys; Error *err = NULL; - struct stat st; int i, ret; - bool is_mdev; char uuid[UUID_STR_LEN]; char *name; - if (!vbasedev->sysfsdev) { + if (vbasedev->fd < 0 && !vbasedev->sysfsdev) { if (!(~vdev->host.domain || ~vdev->host.bus || ~vdev->host.slot || ~vdev->host.function)) { error_setg(errp, "No provided host device"); error_append_hint(errp, "Use -device vfio-pci,host=DDDD:BB:DD.F " +#ifdef CONFIG_IOMMUFD + "or -device vfio-pci,fd=DEVICE_FD " +#endif "or -device vfio-pci,sysfsdev=PATH_TO_DEVICE\n"); return; } @@ -3098,32 +2993,21 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) vdev->host.slot, vdev->host.function); } - if (stat(vbasedev->sysfsdev, &st) < 0) { - error_setg_errno(errp, errno, "no such host device"); - error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->sysfsdev); + if (vfio_device_get_name(vbasedev, errp) < 0) { return; } - vbasedev->name = g_path_get_basename(vbasedev->sysfsdev); - vbasedev->ops = &vfio_pci_ops; - vbasedev->type = VFIO_DEVICE_TYPE_PCI; - vbasedev->dev = DEVICE(vdev); - /* * Mediated devices *might* operate compatibly with discarding of RAM, but * we cannot know for certain, it depends on whether the mdev vendor driver * stays in sync with the active working set of the guest driver. Prevent * the x-balloon-allowed option unless this is minimally an mdev device. 
*/ - tmp = g_strdup_printf("%s/subsystem", vbasedev->sysfsdev); - subsys = realpath(tmp, NULL); - g_free(tmp); - is_mdev = subsys && (strcmp(subsys, "/sys/bus/mdev") == 0); - free(subsys); + vbasedev->mdev = vfio_device_is_mdev(vbasedev); - trace_vfio_mdev(vbasedev->name, is_mdev); + trace_vfio_mdev(vbasedev->name, vbasedev->mdev); - if (vbasedev->ram_block_discard_allowed && !is_mdev) { + if (vbasedev->ram_block_discard_allowed && !vbasedev->mdev) { error_setg(errp, "x-balloon-allowed only potentially compatible " "with mdev devices"); goto error; @@ -3246,6 +3130,12 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) vfio_bars_register(vdev); + if (!vbasedev->mdev && + !pci_device_set_iommu_device(pdev, vbasedev->hiod, errp)) { + error_prepend(errp, "Failed to set iommu_device: "); + goto out_teardown; + } + ret = vfio_add_capabilities(vdev, errp); if (ret) { goto out_teardown; @@ -3267,7 +3157,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) error_setg(errp, "cannot support IGD OpRegion feature on hotplugged " "device"); - goto out_teardown; + goto out_unset_idev; } ret = vfio_get_dev_region_info(vbasedev, @@ -3276,13 +3166,13 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) if (ret) { error_setg_errno(errp, -ret, "does not support requested IGD OpRegion feature"); - goto out_teardown; + goto out_unset_idev; } ret = vfio_pci_igd_opregion_init(vdev, opregion, errp); g_free(opregion); if (ret) { - goto out_teardown; + goto out_unset_idev; } } @@ -3368,6 +3258,10 @@ out_deregister: if (vdev->intx.mmap_timer) { timer_free(vdev->intx.mmap_timer); } +out_unset_idev: + if (!vbasedev->mdev) { + pci_device_unset_iommu_device(pdev); + } out_teardown: vfio_teardown_msi(vdev); vfio_bars_exit(vdev); @@ -3396,6 +3290,7 @@ static void vfio_instance_finalize(Object *obj) static void vfio_exitfn(PCIDevice *pdev) { VFIOPCIDevice *vdev = VFIO_PCI(pdev); + VFIODevice *vbasedev = &vdev->vbasedev; vfio_unregister_req_notifier(vdev); vfio_unregister_err_notifier(vdev); @@ -3410,7 +3305,8 @@ static void vfio_exitfn(PCIDevice *pdev) vfio_teardown_msi(vdev); vfio_pci_disable_rp_atomics(vdev); vfio_bars_exit(vdev); - vfio_migration_exit(&vdev->vbasedev); + vfio_migration_exit(vbasedev); + pci_device_unset_iommu_device(pdev); } static void vfio_pci_reset(DeviceState *dev) @@ -3456,6 +3352,7 @@ static void vfio_instance_init(Object *obj) { PCIDevice *pci_dev = PCI_DEVICE(obj); VFIOPCIDevice *vdev = VFIO_PCI(obj); + VFIODevice *vbasedev = &vdev->vbasedev; device_add_bootindex_property(obj, &vdev->bootindex, "bootindex", NULL, @@ -3465,6 +3362,9 @@ static void vfio_instance_init(Object *obj) vdev->host.slot = ~0U; vdev->host.function = ~0U; + vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_PCI, &vfio_pci_ops, + DEVICE(vdev), false); + vdev->nv_gpudirect_clique = 0xFF; /* QEMU_PCI_CAP_EXPRESS initialization does not depend on QEMU command @@ -3479,6 +3379,9 @@ static Property vfio_pci_dev_properties[] = { DEFINE_PROP_ON_OFF_AUTO("x-pre-copy-dirty-page-tracking", VFIOPCIDevice, vbasedev.pre_copy_dirty_page_tracking, ON_OFF_AUTO_ON), + DEFINE_PROP_ON_OFF_AUTO("x-device-dirty-page-tracking", VFIOPCIDevice, + vbasedev.device_dirty_page_tracking, + ON_OFF_AUTO_ON), DEFINE_PROP_ON_OFF_AUTO("display", VFIOPCIDevice, display, ON_OFF_AUTO_OFF), DEFINE_PROP_UINT32("xres", VFIOPCIDevice, display_xres, 0), @@ -3517,14 +3420,20 @@ static Property vfio_pci_dev_properties[] = { qdev_prop_nv_gpudirect_clique, uint8_t), DEFINE_PROP_OFF_AUTO_PCIBAR("x-msix-relocation", VFIOPCIDevice, msix_relo, 
OFF_AUTOPCIBAR_OFF), - /* - * TODO - support passed fds... is this necessary? - * DEFINE_PROP_STRING("vfiofd", VFIOPCIDevice, vfiofd_name), - * DEFINE_PROP_STRING("vfiogroupfd, VFIOPCIDevice, vfiogroupfd_name), - */ +#ifdef CONFIG_IOMMUFD + DEFINE_PROP_LINK("iommufd", VFIOPCIDevice, vbasedev.iommufd, + TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *), +#endif DEFINE_PROP_END_OF_LIST(), }; +#ifdef CONFIG_IOMMUFD +static void vfio_pci_set_fd(Object *obj, const char *str, Error **errp) +{ + vfio_device_set_fd(&VFIO_PCI(obj)->vbasedev, str, errp); +} +#endif + static void vfio_pci_dev_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); @@ -3532,6 +3441,9 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, void *data) dc->reset = vfio_pci_reset; device_class_set_props(dc, vfio_pci_dev_properties); +#ifdef CONFIG_IOMMUFD + object_class_property_add_str(klass, "fd", NULL, vfio_pci_set_fd); +#endif dc->desc = "VFIO-based PCI device assignment"; set_bit(DEVICE_CATEGORY_MISC, dc->categories); pdc->realize = vfio_realize; diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index fba8737ab2cb23118c0819f600379773d718ed18..6e64a2654e690af11b72710530a41135b726e96f 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -218,6 +218,12 @@ void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr); extern const PropertyInfo qdev_prop_nv_gpudirect_clique; +void vfio_pci_pre_reset(VFIOPCIDevice *vdev); +void vfio_pci_post_reset(VFIOPCIDevice *vdev); +bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name); +int vfio_pci_get_pci_hot_reset_info(VFIOPCIDevice *vdev, + struct vfio_pci_hot_reset_info **info_p); + int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp); int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev, diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c index 8e3d4ac45824ec69afb523f8f0e668327122cd02..a8d9b7da633e0717421acbe9a951334b074b6607 100644 --- a/hw/vfio/platform.c +++ b/hw/vfio/platform.c @@ -15,11 +15,13 @@ */ #include "qemu/osdep.h" +#include CONFIG_DEVICES /* CONFIG_IOMMUFD */ #include "qapi/error.h" #include #include #include "hw/vfio/vfio-platform.h" +#include "sysemu/iommufd.h" #include "migration/vmstate.h" #include "qemu/error-report.h" #include "qemu/lockable.h" @@ -529,14 +531,13 @@ static VFIODeviceOps vfio_platform_ops = { */ static int vfio_base_device_init(VFIODevice *vbasedev, Error **errp) { - struct stat st; int ret; - /* @sysfsdev takes precedence over @host */ - if (vbasedev->sysfsdev) { + /* @fd takes precedence over @sysfsdev which takes precedence over @host */ + if (vbasedev->fd < 0 && vbasedev->sysfsdev) { g_free(vbasedev->name); vbasedev->name = g_path_get_basename(vbasedev->sysfsdev); - } else { + } else if (vbasedev->fd < 0) { if (!vbasedev->name || strchr(vbasedev->name, '/')) { error_setg(errp, "wrong host device name"); return -EINVAL; @@ -546,10 +547,9 @@ static int vfio_base_device_init(VFIODevice *vbasedev, Error **errp) vbasedev->name); } - if (stat(vbasedev->sysfsdev, &st) < 0) { - error_setg_errno(errp, errno, - "failed to get the sysfs host device file status"); - return -errno; + ret = vfio_device_get_name(vbasedev, errp); + if (ret) { + return ret; } ret = vfio_attach_device(vbasedev->name, vbasedev, @@ -581,10 +581,6 @@ static void vfio_platform_realize(DeviceState *dev, Error **errp) VFIODevice *vbasedev = &vdev->vbasedev; int i, ret; - vbasedev->type = VFIO_DEVICE_TYPE_PLATFORM; - vbasedev->dev = dev; - vbasedev->ops = &vfio_platform_ops; - qemu_mutex_init(&vdev->intp_mutex); 
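+ /* + * With CONFIG_IOMMUFD the device can also be created from a pre-opened + * cdev file descriptor instead of a sysfsdev path, e.g. (illustrative + * command line, with fd=23 handed over by a management application): + * + * -object iommufd,id=iommufd0 + * -device vfio-platform,iommufd=iommufd0,fd=23 + */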
trace_vfio_platform_realize(vbasedev->sysfsdev ? @@ -649,9 +645,29 @@ static Property vfio_platform_dev_properties[] = { DEFINE_PROP_UINT32("mmap-timeout-ms", VFIOPlatformDevice, mmap_timeout, 1100), DEFINE_PROP_BOOL("x-irqfd", VFIOPlatformDevice, irqfd_allowed, true), +#ifdef CONFIG_IOMMUFD + DEFINE_PROP_LINK("iommufd", VFIOPlatformDevice, vbasedev.iommufd, + TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *), +#endif DEFINE_PROP_END_OF_LIST(), }; +static void vfio_platform_instance_init(Object *obj) +{ + VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(obj); + VFIODevice *vbasedev = &vdev->vbasedev; + + vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_PLATFORM, &vfio_platform_ops, + DEVICE(vdev), false); +} + +#ifdef CONFIG_IOMMUFD +static void vfio_platform_set_fd(Object *obj, const char *str, Error **errp) +{ + vfio_device_set_fd(&VFIO_PLATFORM_DEVICE(obj)->vbasedev, str, errp); +} +#endif + static void vfio_platform_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); @@ -659,6 +675,9 @@ static void vfio_platform_class_init(ObjectClass *klass, void *data) dc->realize = vfio_platform_realize; device_class_set_props(dc, vfio_platform_dev_properties); +#ifdef CONFIG_IOMMUFD + object_class_property_add_str(klass, "fd", NULL, vfio_platform_set_fd); +#endif dc->vmsd = &vfio_platform_vmstate; dc->desc = "VFIO-based platform device assignment"; sbc->connect_irq_notifier = vfio_start_irqfd_injection; @@ -671,6 +690,7 @@ static const TypeInfo vfio_platform_dev_info = { .name = TYPE_VFIO_PLATFORM, .parent = TYPE_SYS_BUS_DEVICE, .instance_size = sizeof(VFIOPlatformDevice), + .instance_init = vfio_platform_instance_init, .class_init = vfio_platform_class_init, .class_size = sizeof(VFIOPlatformDeviceClass), }; diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c index 83da2f7ec213dab2acca9b96a1d07a22a49d22c4..697f80d11df7a301cf3d0a6e274a723308e3ea0d 100644 --- a/hw/vfio/spapr.c +++ b/hw/vfio/spapr.c @@ -24,6 +24,12 @@ #include "qapi/error.h" #include "trace.h" +typedef struct VFIOSpaprContainer { + VFIOContainer container; + MemoryListener prereg_listener; + QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; +} VFIOSpaprContainer; + static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section) { if (memory_region_is_iommu(section->mr)) { @@ -44,8 +50,10 @@ static void *vfio_prereg_gpa_to_vaddr(MemoryRegionSection *section, hwaddr gpa) static void vfio_prereg_listener_region_add(MemoryListener *listener, MemoryRegionSection *section) { - VFIOContainer *container = container_of(listener, VFIOContainer, - prereg_listener); + VFIOSpaprContainer *scontainer = container_of(listener, VFIOSpaprContainer, + prereg_listener); + VFIOContainer *container = &scontainer->container; + VFIOContainerBase *bcontainer = &container->bcontainer; const hwaddr gpa = section->offset_within_address_space; hwaddr end; int ret; @@ -88,9 +96,9 @@ static void vfio_prereg_listener_region_add(MemoryListener *listener, * can gracefully fail. Runtime, there's not much we can do other * than throw a hardware error. 
*/ - if (!container->initialized) { - if (!container->error) { - error_setg_errno(&container->error, -ret, + if (!bcontainer->initialized) { + if (!bcontainer->error) { + error_setg_errno(&bcontainer->error, -ret, "Memory registering failed"); } } else { @@ -102,8 +110,9 @@ static void vfio_prereg_listener_region_add(MemoryListener *listener, static void vfio_prereg_listener_region_del(MemoryListener *listener, MemoryRegionSection *section) { - VFIOContainer *container = container_of(listener, VFIOContainer, - prereg_listener); + VFIOSpaprContainer *scontainer = container_of(listener, VFIOSpaprContainer, + prereg_listener); + VFIOContainer *container = &scontainer->container; const hwaddr gpa = section->offset_within_address_space; hwaddr end; int ret; @@ -146,12 +155,12 @@ static const MemoryListener vfio_prereg_listener = { .region_del = vfio_prereg_listener_region_del, }; -static void vfio_host_win_add(VFIOContainer *container, hwaddr min_iova, +static void vfio_host_win_add(VFIOSpaprContainer *scontainer, hwaddr min_iova, hwaddr max_iova, uint64_t iova_pgsizes) { VFIOHostDMAWindow *hostwin; - QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { + QLIST_FOREACH(hostwin, &scontainer->hostwin_list, hostwin_next) { if (ranges_overlap(hostwin->min_iova, hostwin->max_iova - hostwin->min_iova + 1, min_iova, @@ -165,15 +174,15 @@ static void vfio_host_win_add(VFIOContainer *container, hwaddr min_iova, hostwin->min_iova = min_iova; hostwin->max_iova = max_iova; hostwin->iova_pgsizes = iova_pgsizes; - QLIST_INSERT_HEAD(&container->hostwin_list, hostwin, hostwin_next); + QLIST_INSERT_HEAD(&scontainer->hostwin_list, hostwin, hostwin_next); } -static int vfio_host_win_del(VFIOContainer *container, +static int vfio_host_win_del(VFIOSpaprContainer *scontainer, hwaddr min_iova, hwaddr max_iova) { VFIOHostDMAWindow *hostwin; - QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { + QLIST_FOREACH(hostwin, &scontainer->hostwin_list, hostwin_next) { if (hostwin->min_iova == min_iova && hostwin->max_iova == max_iova) { QLIST_REMOVE(hostwin, hostwin_next); g_free(hostwin); @@ -184,7 +193,7 @@ static int vfio_host_win_del(VFIOContainer *container, return -1; } -static VFIOHostDMAWindow *vfio_find_hostwin(VFIOContainer *container, +static VFIOHostDMAWindow *vfio_find_hostwin(VFIOSpaprContainer *container, hwaddr iova, hwaddr end) { VFIOHostDMAWindow *hostwin; @@ -226,6 +235,7 @@ static int vfio_spapr_create_window(VFIOContainer *container, hwaddr *pgsize) { int ret = 0; + VFIOContainerBase *bcontainer = &container->bcontainer; IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr); uint64_t pagesize = memory_region_iommu_get_min_page_size(iommu_mr), pgmask; unsigned entries, bits_total, bits_per_level, max_levels; @@ -239,13 +249,13 @@ static int vfio_spapr_create_window(VFIOContainer *container, if (pagesize > rampagesize) { pagesize = rampagesize; } - pgmask = container->pgsizes & (pagesize | (pagesize - 1)); + pgmask = bcontainer->pgsizes & (pagesize | (pagesize - 1)); pagesize = pgmask ? 
(1ULL << (63 - clz64(pgmask))) : 0; if (!pagesize) { error_report("Host doesn't support page size 0x%"PRIx64 ", the supported mask is 0x%lx", memory_region_iommu_get_min_page_size(iommu_mr), - container->pgsizes); + bcontainer->pgsizes); return -EINVAL; } @@ -313,10 +323,15 @@ static int vfio_spapr_create_window(VFIOContainer *container, return 0; } -int vfio_container_add_section_window(VFIOContainer *container, - MemoryRegionSection *section, - Error **errp) +static int +vfio_spapr_container_add_section_window(VFIOContainerBase *bcontainer, + MemoryRegionSection *section, + Error **errp) { + VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); + VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer, + container); VFIOHostDMAWindow *hostwin; hwaddr pgsize = 0; int ret; @@ -332,7 +347,7 @@ int vfio_container_add_section_window(VFIOContainer *container, iova = section->offset_within_address_space; end = iova + int128_get64(section->size) - 1; - if (!vfio_find_hostwin(container, iova, end)) { + if (!vfio_find_hostwin(scontainer, iova, end)) { error_setg(errp, "Container %p can't map guest IOVA region" " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container, iova, end); @@ -346,7 +361,7 @@ int vfio_container_add_section_window(VFIOContainer *container, } /* For now intersections are not allowed, we may relax this later */ - QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { + QLIST_FOREACH(hostwin, &scontainer->hostwin_list, hostwin_next) { if (ranges_overlap(hostwin->min_iova, hostwin->max_iova - hostwin->min_iova + 1, section->offset_within_address_space, @@ -368,7 +383,7 @@ int vfio_container_add_section_window(VFIOContainer *container, return ret; } - vfio_host_win_add(container, section->offset_within_address_space, + vfio_host_win_add(scontainer, section->offset_within_address_space, section->offset_within_address_space + int128_get64(section->size) - 1, pgsize); #ifdef CONFIG_KVM @@ -401,16 +416,22 @@ int vfio_container_add_section_window(VFIOContainer *container, return 0; } -void vfio_container_del_section_window(VFIOContainer *container, - MemoryRegionSection *section) +static void +vfio_spapr_container_del_section_window(VFIOContainerBase *bcontainer, + MemoryRegionSection *section) { + VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); + VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer, + container); + if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) { return; } vfio_spapr_remove_window(container, section->offset_within_address_space); - if (vfio_host_win_del(container, + if (vfio_host_win_del(scontainer, section->offset_within_address_space, section->offset_within_address_space + int128_get64(section->size) - 1) < 0) { @@ -419,13 +440,45 @@ void vfio_container_del_section_window(VFIOContainer *container, } } +static void vfio_spapr_container_release(VFIOContainerBase *bcontainer) +{ + VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); + VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer, + container); + VFIOHostDMAWindow *hostwin, *next; + + if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { + memory_listener_unregister(&scontainer->prereg_listener); + } + QLIST_FOREACH_SAFE(hostwin, &scontainer->hostwin_list, hostwin_next, + next) { + QLIST_REMOVE(hostwin, hostwin_next); + g_free(hostwin); + } +} + +static VFIOIOMMUOps vfio_iommu_spapr_ops; + +static void setup_spapr_ops(VFIOContainerBase *bcontainer) 
+{ + vfio_iommu_spapr_ops = *bcontainer->ops; + vfio_iommu_spapr_ops.add_window = vfio_spapr_container_add_section_window; + vfio_iommu_spapr_ops.del_window = vfio_spapr_container_del_section_window; + vfio_iommu_spapr_ops.release = vfio_spapr_container_release; + bcontainer->ops = &vfio_iommu_spapr_ops; +} + int vfio_spapr_container_init(VFIOContainer *container, Error **errp) { + VFIOContainerBase *bcontainer = &container->bcontainer; + VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer, + container); struct vfio_iommu_spapr_tce_info info; bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU; int ret, fd = container->fd; - QLIST_INIT(&container->hostwin_list); + QLIST_INIT(&scontainer->hostwin_list); /* * The host kernel code implementing VFIO_IOMMU_DISABLE is called @@ -439,13 +492,13 @@ int vfio_spapr_container_init(VFIOContainer *container, Error **errp) return -errno; } } else { - container->prereg_listener = vfio_prereg_listener; + scontainer->prereg_listener = vfio_prereg_listener; - memory_listener_register(&container->prereg_listener, + memory_listener_register(&scontainer->prereg_listener, &address_space_memory); - if (container->error) { + if (bcontainer->error) { ret = -1; - error_propagate_prepend(errp, container->error, + error_propagate_prepend(errp, bcontainer->error, "RAM memory listener initialization failed: "); goto listener_unregister_exit; } @@ -461,7 +514,7 @@ int vfio_spapr_container_init(VFIOContainer *container, Error **errp) } if (v2) { - container->pgsizes = info.ddw.pgsizes; + bcontainer->pgsizes = info.ddw.pgsizes; /* * There is a default window in just created container. * To make region_add/del simpler, we better remove this @@ -476,32 +529,56 @@ int vfio_spapr_container_init(VFIOContainer *container, Error **errp) } } else { /* The default table uses 4K pages */ - container->pgsizes = 0x1000; - vfio_host_win_add(container, info.dma32_window_start, + bcontainer->pgsizes = 0x1000; + vfio_host_win_add(scontainer, info.dma32_window_start, info.dma32_window_start + info.dma32_window_size - 1, 0x1000); } + setup_spapr_ops(bcontainer); + return 0; listener_unregister_exit: if (v2) { - memory_listener_unregister(&container->prereg_listener); + memory_listener_unregister(&scontainer->prereg_listener); } return ret; } void vfio_spapr_container_deinit(VFIOContainer *container) { + VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer, + container); VFIOHostDMAWindow *hostwin, *next; if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { - memory_listener_unregister(&container->prereg_listener); + memory_listener_unregister(&scontainer->prereg_listener); } - QLIST_FOREACH_SAFE(hostwin, &container->hostwin_list, hostwin_next, + QLIST_FOREACH_SAFE(hostwin, &scontainer->hostwin_list, hostwin_next, next) { QLIST_REMOVE(hostwin, hostwin_next); g_free(hostwin); } } + +static void vfio_iommu_spapr_class_init(ObjectClass *klass, void *data) +{ + VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass); + + vioc->add_window = vfio_spapr_container_add_section_window; + vioc->del_window = vfio_spapr_container_del_section_window; + /* vioc->release = vfio_spapr_container_release; */ + /* vioc->setup = vfio_spapr_container_setup; */ +} + +static const TypeInfo types[] = { + { + .name = TYPE_VFIO_IOMMU_SPAPR, + .parent = TYPE_VFIO_IOMMU_LEGACY, + .class_init = vfio_iommu_spapr_class_init, + }, +}; + +DEFINE_TYPES(types) diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index
0eb2387cf24c920b0904ec4012b0fcd3f2e8b3cf..8fdde5445697789edeb4c6383566c1b417cc1595 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -116,8 +116,8 @@ vfio_region_unmap(const char *name, unsigned long offset, unsigned long end) "Re vfio_region_sparse_mmap_header(const char *name, int index, int nr_areas) "Device %s region %d: %d sparse mmap entries" vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned long end) "sparse entry %d [0x%lx - 0x%lx]" vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%08x" -vfio_dma_unmap_overflow_workaround(void) "" -vfio_get_dirty_bitmap(int fd, uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start, uint64_t dirty_pages) "container fd=%d, iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64" dirty_pages=%"PRIu64 +vfio_legacy_dma_unmap_overflow_workaround(void) "" +vfio_get_dirty_bitmap(uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start, uint64_t dirty_pages) "iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64" dirty_pages=%"PRIu64 vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64 # platform.c @@ -164,3 +164,14 @@ vfio_state_pending_estimate(const char *name, uint64_t precopy, uint64_t postcop vfio_state_pending_exact(const char *name, uint64_t precopy, uint64_t postcopy, uint64_t stopcopy_size, uint64_t precopy_init_size, uint64_t precopy_dirty_size) " (%s) precopy 0x%"PRIx64" postcopy 0x%"PRIx64" stopcopy size 0x%"PRIx64" precopy initial size 0x%"PRIx64" precopy dirty size 0x%"PRIx64 vfio_vmstate_change(const char *name, int running, const char *reason, const char *dev_state) " (%s) running %d reason %s device state %s" vfio_vmstate_change_prepare(const char *name, int running, const char *reason, const char *dev_state) " (%s) running %d reason %s device state %s" + +#iommufd.c + +iommufd_cdev_connect_and_bind(int iommufd, const char *name, int devfd, int devid) " [iommufd=%d] Successfully bound device %s (fd=%d): output devid=%d" +iommufd_cdev_getfd(const char *dev, int devfd) " %s (fd=%d)" +iommufd_cdev_attach_ioas_hwpt(int iommufd, const char *name, int devfd, int id) " [iommufd=%d] Successfully attached device %s (%d) to id=%d" +iommufd_cdev_detach_ioas_hwpt(int iommufd, const char *name) " [iommufd=%d] Successfully detached %s" +iommufd_cdev_fail_attach_existing_container(const char *msg) " %s" +iommufd_cdev_alloc_ioas(int iommufd, int ioas_id) " [iommufd=%d] new IOMMUFD container with ioasid=%d" +iommufd_cdev_device_info(char *name, int devfd, int num_irqs, int num_regions, int flags) " %s (%d) num_irqs=%d num_regions=%d flags=%d" +iommufd_cdev_pci_hot_reset_dep_devices(int domain, int bus, int slot, int function, int dev_id) "\t%04x:%02x:%02x.%x devid %d" diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build index c0055a78326c1344d2df79090443bdc4817a5cd0..67291563d369cb5cbe2c1abd170207306cea3c9f 100644 --- a/hw/virtio/meson.build +++ b/hw/virtio/meson.build @@ -5,7 +5,7 @@ system_virtio_ss.add(when: 'CONFIG_VIRTIO_MMIO', if_true: files('virtio-mmio.c') system_virtio_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('virtio-crypto.c')) system_virtio_ss.add(when: 'CONFIG_VHOST_VSOCK_COMMON', if_true: files('vhost-vsock-common.c')) system_virtio_ss.add(when: 'CONFIG_VIRTIO_IOMMU', if_true: files('virtio-iommu.c')) -system_virtio_ss.add(when: 'CONFIG_VHOST_VDPA_DEV', if_true: files('vdpa-dev.c')) +system_virtio_ss.add(when: 
'CONFIG_VHOST_VDPA_DEV', if_true: files('vdpa-dev.c', 'vdpa-dev-mig.c', 'vdpa-dev-iommufd.c')) specific_virtio_ss = ss.source_set() specific_virtio_ss.add(files('virtio.c')) diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events index 637cac4edf0f3459915d9d840bfc02501e7ca7a5..439f45a569bef1e56fb357900b64bd5ffc7835a2 100644 --- a/hw/virtio/trace-events +++ b/hw/virtio/trace-events @@ -48,11 +48,10 @@ vhost_vdpa_set_features(void *dev, uint64_t features) "dev: %p features: 0x%"PRI vhost_vdpa_get_device_id(void *dev, uint32_t device_id) "dev: %p device_id %"PRIu32 vhost_vdpa_reset_device(void *dev) "dev: %p" vhost_vdpa_get_vq_index(void *dev, int idx, int vq_idx) "dev: %p idx: %d vq idx: %d" -vhost_vdpa_set_vring_ready(void *dev, unsigned i, int r) "dev: %p, idx: %u, r: %d" +vhost_vdpa_set_vring_enable_one(void *dev, unsigned i, int enable, int r) "dev: %p, idx: %u, enable: %u, r: %d" vhost_vdpa_dump_config(void *dev, const char *line) "dev: %p %s" vhost_vdpa_set_config(void *dev, uint32_t offset, uint32_t size, uint32_t flags) "dev: %p offset: %"PRIu32" size: %"PRIu32" flags: 0x%"PRIx32 vhost_vdpa_get_config(void *dev, void *config, uint32_t config_len) "dev: %p config: %p config_len: %"PRIu32 -vhost_vdpa_suspend(void *dev) "dev: %p" vhost_vdpa_dev_start(void *dev, bool started) "dev: %p started: %d" vhost_vdpa_set_log_base(void *dev, uint64_t base, unsigned long long size, int refcnt, int fd, void *log) "dev: %p base: 0x%"PRIx64" size: %llu refcnt: %d fd: %d log: %p" vhost_vdpa_set_vring_addr(void *dev, unsigned int index, unsigned int flags, uint64_t desc_user_addr, uint64_t used_user_addr, uint64_t avail_user_addr, uint64_t log_guest_addr) "dev: %p index: %u flags: 0x%x desc_user_addr: 0x%"PRIx64" used_user_addr: 0x%"PRIx64" avail_user_addr: 0x%"PRIx64" log_guest_addr: 0x%"PRIx64 diff --git a/hw/virtio/vdpa-dev-iommufd.c b/hw/virtio/vdpa-dev-iommufd.c new file mode 100644 index 0000000000000000000000000000000000000000..f5718bae99d431ff60aa2d9ba0ada0a0b6df46de --- /dev/null +++ b/hw/virtio/vdpa-dev-iommufd.c @@ -0,0 +1,483 @@ +/* + * vhost vdpa device iommufd backend + * + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All Rights Reserved. 
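+ * + * vhost-vdpa devices bound to the same iommufd backend share one + * VDPAIOMMUFDContainer: guest memory is mapped once per container into + * an IOMMUFD IOAS, and each device attaches to a hardware page table + * (HWPT) allocated from that IOAS, rather than using the per-device + * vhost-vdpa memory listener.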
+ */ + +#include "qemu/osdep.h" +#include "qemu/log.h" +#include +#include +#include "qapi/error.h" +#include "exec/target_page.h" +#include "exec/address-spaces.h" +#include "hw/virtio/vdpa-dev-iommufd.h" +#include "migration/migration.h" +#include "qapi/qapi-commands-migration.h" +#include "hw/virtio/vhost.h" + +static QLIST_HEAD(, VDPAIOMMUFDContainer) vdpa_container_list = + QLIST_HEAD_INITIALIZER(vdpa_container_list); + +static int vhost_vdpa_iommufd_container_dma_map(VDPAIOMMUFDContainer *container, hwaddr iova, + hwaddr size, void *vaddr, bool readonly) +{ + return iommufd_backend_map_dma(container->iommufd, container->ioas_id, iova, size, vaddr, readonly); + +} +static int vhost_vdpa_iommufd_container_dma_unmap(VDPAIOMMUFDContainer *container, + hwaddr iova, hwaddr size) +{ + return iommufd_backend_unmap_dma(container->iommufd, container->ioas_id, iova, size); +} + +static void vhost_vdpa_iommufd_container_region_add(MemoryListener *listener, + MemoryRegionSection *section) +{ + VDPAIOMMUFDContainer *container = container_of(listener, VDPAIOMMUFDContainer, listener); + hwaddr iova; + Int128 llend, llsize; + void *vaddr; + int page_size = qemu_target_page_size(); + int page_mask = -page_size; + int ret; + + if (vhost_vdpa_listener_skipped_section(section, 0, ULLONG_MAX, page_mask)) { + return; + } + + if (unlikely((section->offset_within_address_space & ~page_mask) != + (section->offset_within_region & ~page_mask))) { + return; + } + + iova = ROUND_UP(section->offset_within_address_space, page_size); + llend = vhost_vdpa_section_end(section, page_mask); + if (int128_ge(int128_make64(iova), llend)) { + return; + } + + memory_region_ref(section->mr); + vaddr = memory_region_get_ram_ptr(section->mr) + + section->offset_within_region + + (iova - section->offset_within_address_space); + + llsize = int128_sub(llend, int128_make64(iova)); + + ret = vhost_vdpa_iommufd_container_dma_map(container, iova, int128_get64(llsize), + vaddr, section->readonly); + if (ret) { + qemu_log("vhost vdpa iommufd container dma map failed: %d\n", ret); + } +} + +static void vhost_vdpa_iommufd_container_region_del(MemoryListener *listener, + MemoryRegionSection *section) +{ + VDPAIOMMUFDContainer *container = container_of(listener, VDPAIOMMUFDContainer, listener); + hwaddr iova; + Int128 llend, llsize; + int page_size = qemu_target_page_size(); + int page_mask = -page_size; + int ret; + + if (vhost_vdpa_listener_skipped_section(section, 0, ULLONG_MAX, page_mask)) { + return; + } + + if (unlikely((section->offset_within_address_space & ~page_mask) != + (section->offset_within_region & ~page_mask))) { + return; + } + + iova = ROUND_UP(section->offset_within_address_space, page_size); + llend = vhost_vdpa_section_end(section, page_mask); + + if (int128_ge(int128_make64(iova), llend)) { + return; + } + + llsize = int128_sub(llend, int128_make64(iova)); + /* + * The unmap ioctl doesn't accept a full 64-bit. 
need to check it + */ + if (int128_eq(llsize, int128_2_64())) { + llsize = int128_rshift(llsize, 1); + ret = vhost_vdpa_iommufd_container_dma_unmap(container, iova, int128_get64(llsize)); + + if (ret) { + qemu_log("vhost vdpa iommufd container unmap failed(0x%" HWADDR_PRIx ", " + "0x%" HWADDR_PRIx ") = %d (%m)", iova, int128_get64(llsize), ret); + } + iova += int128_get64(llsize); + } + ret = vhost_vdpa_iommufd_container_dma_unmap(container, iova, int128_get64(llsize)); + + if (ret) { + qemu_log("vhost vdpa iommufd container unmap failed(0x%" HWADDR_PRIx ", " + "0x%" HWADDR_PRIx ") = %d (%m)", iova, int128_get64(llsize), ret); + } + + memory_region_unref(section->mr); +} + +static void vhost_vdpa_iommufd_container_log_sync(MemoryListener *listener, + MemoryRegionSection *section) +{ + VDPAIOMMUFDContainer *container = container_of(listener, VDPAIOMMUFDContainer, listener); + IOMMUFDHWPT *hwpt; + VhostVdpaDevice *vdev; + MigrationState *ms = migrate_get_current(); + + QLIST_FOREACH(hwpt, &container->hwpt_list, next) { + QLIST_FOREACH(vdev, &hwpt->device_list, next) { + if (!vdev->dev.log_enabled || !vdev->dev.log) { + continue; + } + + /** + * For the vhost-vdpa device, log_sync is performed on the entire VM, + * that is, this sync is for the entire flatview. + * Therefore, the first MemoryRegionSection of flatview needs to be + * synchronized. The rest of the mrs do not need to be synchronized. + */ + if (is_first_section(section)) { + int r = vdev->dev.vhost_ops->vhost_log_sync(&vdev->dev); + if (r < 0) { + qemu_log("Failed to sync dirty log: %d\n", r); + if (migration_is_running(ms->state)) { + qmp_migrate_cancel(NULL); + } + return; + } + } + + /** + * Dirty maps are merged separately by MRS, so each MRS needs to be iterated. + */ + if (vhost_bytemap_log_support(&vdev->dev)) { + vhost_sync_dirty_bytemap(&vdev->dev, section); + } else { + vhost_sync_dirty_bitmap(&vdev->dev, section, 0x0, ~0x0ULL); + } + return; + } + } +} + + +/* + * IOTLB API used by vhost vdpa iommufd container + */ +const MemoryListener vhost_vdpa_iommufd_container_listener = { + .name = "vhost-vdpa-iommufd-container", + .region_add = vhost_vdpa_iommufd_container_region_add, + .region_del = vhost_vdpa_iommufd_container_region_del, + .log_sync = vhost_vdpa_iommufd_container_log_sync, +}; + +static int vhost_vdpa_container_connect_iommufd(VDPAIOMMUFDContainer *container) +{ + IOMMUFDBackend *iommufd = container->iommufd; + uint32_t ioas_id; + Error *err = NULL; + + if (!iommufd) { + return -1; + } + + if (iommufd_backend_connect(iommufd, &err)) { + error_report_err(err); + return -1; + } + + if (iommufd_backend_alloc_ioas(iommufd, &ioas_id, &err)) { + error_report_err(err); + iommufd_backend_disconnect(iommufd); + return -1; + } + container->ioas_id = ioas_id; + return 0; +} + +static void vhost_vdpa_container_disconnect_iommufd(VDPAIOMMUFDContainer *container) +{ + IOMMUFDBackend *iommufd = container->iommufd; + uint32_t ioas_id = container->ioas_id; + + if (!iommufd) { + return; + } + + iommufd_backend_free_id(iommufd, ioas_id); + iommufd_backend_disconnect(iommufd); +} + +static IOMMUFDHWPT *vhost_vdpa_find_hwpt(VDPAIOMMUFDContainer *container, + VhostVdpaDevice *vdev) +{ + IOMMUFDHWPT *hwpt = NULL; + VhostVdpaDevice *tmp = NULL; + + QLIST_FOREACH(hwpt, &container->hwpt_list, next) { + QLIST_FOREACH(tmp, &hwpt->device_list, next) { + if (tmp == vdev) { + return hwpt; + } + } + } + + return NULL; +} + +static VDPAIOMMUFDContainer *vhost_vdpa_find_container(VhostVdpaDevice *vdev) +{ + VDPAIOMMUFDContainer *container = 
NULL; + + QLIST_FOREACH(container, &vdpa_container_list, next) { + if (container->iommufd == vdev->iommufd) { + return container; + } + } + + return NULL; +} + +static VDPAIOMMUFDContainer *vhost_vdpa_create_container(VhostVdpaDevice *vdev) +{ + VDPAIOMMUFDContainer *container = NULL; + + container = g_new0(VDPAIOMMUFDContainer, 1); + container->iommufd = vdev->iommufd; + container->listener = vhost_vdpa_iommufd_container_listener; + QLIST_INIT(&container->hwpt_list); + + QLIST_INSERT_HEAD(&vdpa_container_list, container, next); + + return container; +} + +static void vhost_vdpa_destroy_container(VDPAIOMMUFDContainer *container) +{ + if (!container) { + return; + } + + container->iommufd = NULL; + QLIST_SAFE_REMOVE(container, next); + g_free(container); +} + +static void vhost_vdpa_device_unbind_iommufd(VhostVdpaDevice *vdev) +{ + int ret; + ret = ioctl(vdev->vhostfd, VHOST_VDPA_UNBIND_IOMMUFD, 0); + if (ret) { + qemu_log("vhost vdpa device unbind iommufd failed: %d, devid: %d\n", + ret, vdev->iommufd_devid); + } +} + +static int vhost_vdpa_device_bind_iommufd(VhostVdpaDevice *vdev) +{ + IOMMUFDBackend *iommufd = vdev->iommufd; + struct vdpa_dev_bind_iommufd bind = { + .iommufd = iommufd->fd, + .out_devid = -1, + }; + int ret; + + /* iommufd auto unbind when vdev->vhostfd close */ + ret = ioctl(vdev->vhostfd, VHOST_VDPA_BIND_IOMMUFD, &bind); + if (ret) { + qemu_log("vhost vdpa device bind iommufd failed: %d\n", ret); + return ret; + } + vdev->iommufd_devid = bind.out_devid; + return 0; +} + +static int vhost_vdpa_container_attach_device(VDPAIOMMUFDContainer *container, VhostVdpaDevice *vdev) +{ + IOMMUFDBackend *iommufd = NULL; + IOMMUFDHWPT *hwpt = NULL; + Error *err = NULL; + uint32_t pt_id; + int ret; + + if (!container || !container->iommufd || container->iommufd != vdev->iommufd) { + return -1; + } + + iommufd = container->iommufd; + + /* try to find an available hwpt */ + QLIST_FOREACH(hwpt, &container->hwpt_list, next) { + pt_id = hwpt->hwpt_id; + ret = ioctl(vdev->vhostfd, VHOST_VDPA_ATTACH_IOMMUFD_PT, &pt_id); + if (ret == 0) { + QLIST_INSERT_HEAD(&hwpt->device_list, vdev, next); + vdev->dev.has_container = true; + return 0; + } + } + + /* available hwpt not found in the container, create a new one */ + hwpt = g_new0(IOMMUFDHWPT, 1); + QLIST_INIT(&hwpt->device_list); + + if (!iommufd_backend_alloc_hwpt(iommufd, vdev->iommufd_devid, + container->ioas_id, 0, 0, 0, NULL, + &pt_id, NULL, &err)) { + error_report_err(err); + ret = -1; + goto free_mem; + } + + hwpt->hwpt_id = pt_id; + + ret = ioctl(vdev->vhostfd, VHOST_VDPA_ATTACH_IOMMUFD_PT, &pt_id); + if (ret) { + qemu_log("vhost vdpa device attach iommufd pt failed: %d\n", ret); + goto free_hwpt; + } + + QLIST_INSERT_HEAD(&hwpt->device_list, vdev, next); + vdev->dev.has_container = true; + QLIST_INSERT_HEAD(&container->hwpt_list, hwpt, next); + + return 0; + +free_hwpt: + iommufd_backend_free_id(iommufd, hwpt->hwpt_id); +free_mem: + g_free(hwpt); + return ret; +} + +static void vhost_vdpa_container_detach_device(VDPAIOMMUFDContainer *container, VhostVdpaDevice *vdev) +{ + IOMMUFDBackend *iommufd = vdev->iommufd; + IOMMUFDHWPT *hwpt = NULL; + + /* find the hwpt using by this device */ + hwpt = vhost_vdpa_find_hwpt(container, vdev); + if (!hwpt) { + return; + } + + ioctl(vdev->vhostfd, VHOST_VDPA_DETACH_IOMMUFD_PT, &hwpt->hwpt_id); + + QLIST_SAFE_REMOVE(vdev, next); + vdev->dev.has_container = false; + + /* No device using this hwpt, free it */ + if (QLIST_EMPTY(&hwpt->device_list)) { + iommufd_backend_free_id(iommufd, hwpt->hwpt_id); + 
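/* + * HWPT lifetime follows its device_list: attach reuses any HWPT the + * kernel accepts the device into, and detaching the last device frees + * the HWPT id (above) and then drops the list entry. + */ +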
QLIST_SAFE_REMOVE(hwpt, next); + g_free(hwpt); + } +} + +static int vhost_vdpa_container_get_dev_count(VDPAIOMMUFDContainer *container) +{ + IOMMUFDHWPT *hwpt; + VhostVdpaDevice *dev; + int dev_count = 0; + + QLIST_FOREACH(hwpt, &container->hwpt_list, next) { + QLIST_FOREACH(dev, &hwpt->device_list, next) { + dev_count++; + } + } + + return dev_count; +} + +int vhost_vdpa_attach_container(VhostVdpaDevice *vdev) +{ + VDPAIOMMUFDContainer *container = NULL; + IOMMUFDBackend *iommufd = vdev->iommufd; + bool new_container = false; + int dev_count = 0; + int ret = 0; + + if (!iommufd) { + return 0; + } + + container = vhost_vdpa_find_container(vdev); + if (!container) { + container = vhost_vdpa_create_container(vdev); + if (!container) { + qemu_log("vdpa create container failed\n"); + return -1; + } + ret = vhost_vdpa_container_connect_iommufd(container); + if (ret) { + qemu_log("vdpa container connect iommufd failed\n"); + goto destroy; + } + new_container = true; + } + + ret = vhost_vdpa_device_bind_iommufd(vdev); + if (ret) { + qemu_log("vdpa device bind iommufd failed\n"); + goto disconnect; + } + + ret = vhost_vdpa_container_attach_device(container, vdev); + if (ret) { + qemu_log("vdpa container attach device failed\n"); + goto unbind; + } + + /* register the container memory listener when attaching the first device */ + dev_count = vhost_vdpa_container_get_dev_count(container); + if (dev_count == 1) { + memory_listener_register(&container->listener, &address_space_memory); + } + + return 0; + +unbind: + vhost_vdpa_device_unbind_iommufd(vdev); +disconnect: + if (!new_container) { + return ret; + } + vhost_vdpa_container_disconnect_iommufd(container); +destroy: + vhost_vdpa_destroy_container(container); + + return ret; +} + +void vhost_vdpa_detach_container(VhostVdpaDevice *vdev) +{ + VDPAIOMMUFDContainer *container = NULL; + IOMMUFDBackend *iommufd = vdev->iommufd; + + if (!iommufd) { + return; + } + + container = vhost_vdpa_find_container(vdev); + if (!container) { + return; + } + + vhost_vdpa_container_detach_device(container, vdev); + + vhost_vdpa_device_unbind_iommufd(vdev); + + if (!QLIST_EMPTY(&container->hwpt_list)) { + return; + } + /* No HWPT in this container, destroy it */ + memory_listener_unregister(&container->listener); + vhost_vdpa_container_disconnect_iommufd(container); + + vhost_vdpa_destroy_container(container); +} diff --git a/hw/virtio/vdpa-dev-mig.c b/hw/virtio/vdpa-dev-mig.c new file mode 100644 index 0000000000000000000000000000000000000000..7de996c83552efe1ccc7edcfd82c584c6ec3e1db --- /dev/null +++ b/hw/virtio/vdpa-dev-mig.c @@ -0,0 +1,349 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2023. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + + * You should have received a copy of the GNU General Public License along + * with this program; if not, see . 
+ */ + +#include +#include +#include "qemu/osdep.h" +#include "hw/virtio/vhost.h" +#include "hw/virtio/vdpa-dev.h" +#include "hw/virtio/virtio-bus.h" +#include "migration/register.h" +#include "migration/migration.h" +#include "migration/misc.h" +#include "qemu/error-report.h" +#include "hw/virtio/vdpa-dev-mig.h" +#include "migration/qemu-file-types.h" +#include "qemu/main-loop.h" + +/* + * Flags used as delimiter: + * 0xffffffff => MSB 32-bit all 1s + * 0xef10 => emulated (virtual) function IO + * 0x0000 => 16-bits reserved for flags + */ +#define VDPA_MIG_FLAG_END_OF_STATE (0xffffffffef100001ULL) +#define VDPA_MIG_FLAG_DEV_CONFIG_STATE (0xffffffffef100002ULL) +#define VDPA_MIG_FLAG_DEV_SETUP_STATE (0xffffffffef100003ULL) + +static int vhost_vdpa_call(struct vhost_dev *dev, unsigned long int request, + void *arg) +{ + struct vhost_vdpa *v = dev->opaque; + int fd = v->device_fd; + + if (dev->vhost_ops->backend_type != VHOST_BACKEND_TYPE_VDPA) { + error_report("backend type isn't VDPA. Operation not permitted!"); + return -EPERM; + } + + return ioctl(fd, request, arg); +} + +static int vhost_vdpa_set_mig_state(struct vhost_dev *dev, uint8_t state) +{ + return vhost_vdpa_call(dev, VHOST_VDPA_SET_MIG_STATE, &state); +} + +static int vhost_vdpa_dev_buffer_size(struct vhost_dev *dev, uint32_t *size) +{ + return vhost_vdpa_call(dev, VHOST_GET_DEV_BUFFER_SIZE, size); +} + +static int vhost_vdpa_dev_buffer_save(struct vhost_dev *dev, QEMUFile *f) +{ + struct vhost_vdpa_config *config; + unsigned long config_size = offsetof(struct vhost_vdpa_config, buf); + uint32_t buffer_size = 0; + int ret; + + ret = vhost_vdpa_dev_buffer_size(dev, &buffer_size); + if (ret) { + error_report("get dev buffer size failed: %d", ret); + return ret; + } + + qemu_put_be32(f, buffer_size); + + config = g_malloc(buffer_size + config_size); + config->off = 0; + config->len = buffer_size; + + ret = vhost_vdpa_call(dev, VHOST_GET_DEV_BUFFER, config); + if (ret) { + error_report("get dev buffer failed: %d", ret); + goto free; + } + + qemu_put_buffer(f, config->buf, buffer_size); +free: + g_free(config); + + return ret; +} + +static int vhost_vdpa_dev_buffer_load(struct vhost_dev *dev, QEMUFile *f) +{ + struct vhost_vdpa_config *config; + unsigned long config_size = offsetof(struct vhost_vdpa_config, buf); + uint32_t buffer_size, recv_size; + int ret; + + buffer_size = qemu_get_be32(f); + + config = g_malloc(buffer_size + config_size); + config->off = 0; + config->len = buffer_size; + + recv_size = qemu_get_buffer(f, config->buf, buffer_size); + if (recv_size != buffer_size) { + error_report("read dev mig buffer failed, buffer_size: %u, " + "recv_size: %u", buffer_size, recv_size); + ret = -EINVAL; + goto free; + } + + ret = vhost_vdpa_call(dev, VHOST_SET_DEV_BUFFER, config); + if (ret) { + error_report("set dev buffer failed: %d", ret); + } + +free: + g_free(config); + + return ret; +} + +static int vhost_vdpa_device_suspend(VhostVdpaDevice *vdpa) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(vdpa); + + if (!vdpa->started || vdpa->suspended) { + return 0; + } + + vdpa->suspended = true; + + return vhost_dev_suspend(&vdpa->dev, vdev, false); +} + +static int vhost_vdpa_device_resume(VhostVdpaDevice *vdpa) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(vdpa); + MigrationIncomingState *mis = migration_incoming_get_current(); + int ret; + + if (!vdpa->started || + (!vdpa->suspended && mis->state != RUN_STATE_RESTORE_VM)) { + return 0; + } + + ret = vhost_dev_resume(&vdpa->dev, vdev, false); + if (ret < 0) { + return ret; + } + +
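/* + * Clear the suspended flag only once the backend has acknowledged the + * resume; vhost_vdpa_device_set_status() short-circuits while + * suspended is set, so this is what re-enables normal start/stop + * handling. + */ +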
vdpa->suspended = false; + return ret; +} + +static void vdpa_dev_migration_handle_incoming_bh(void *opaque) +{ + struct vhost_dev *hdev = opaque; + int ret; + + /* Post-start the device; rollback is not supported if this fails. */ + ret = vhost_vdpa_set_mig_state(hdev, VDPA_DEVICE_POST_START); + if (ret) { + error_report("Failed to set state: POST_START"); + } +} + +static void vdpa_dev_vmstate_change(void *opaque, bool running, RunState state) +{ + VhostVdpaDevice *vdpa = VHOST_VDPA_DEVICE(opaque); + struct vhost_dev *hdev = &vdpa->dev; + int ret; + MigrationState *ms = migrate_get_current(); + MigrationIncomingState *mis = migration_incoming_get_current(); + + if (!running) { + if (state == RUN_STATE_FINISH_MIGRATE || state == RUN_STATE_PAUSED) { + ret = vhost_vdpa_device_suspend(vdpa); + if (ret) { + error_report("suspend vdpa device failed: %d", ret); + if (ms->migration_thread_running) { + migrate_fd_cancel(ms); + } + } + } + } else { + if (vdpa->suspended) { + ret = vhost_vdpa_device_resume(vdpa); + if (ret) { + error_report("vhost vdpa device resume failed: %d", ret); + } + } + + if (mis->state == RUN_STATE_RESTORE_VM) { + ret = vhost_vdpa_call(hdev, VHOST_VDPA_RESUME, NULL); + if (ret) { + error_report("migration dest resume device failed: %d", ret); + exit(EXIT_FAILURE); + } + /* post resume */ + mis->bh = qemu_bh_new(vdpa_dev_migration_handle_incoming_bh, + hdev); + qemu_bh_schedule(mis->bh); + } + } +} + +static int vdpa_save_setup(QEMUFile *f, void *opaque) +{ + qemu_put_be64(f, VDPA_MIG_FLAG_DEV_SETUP_STATE); + qemu_put_be64(f, VDPA_MIG_FLAG_END_OF_STATE); + + return qemu_file_get_error(f); +} + +static int vdpa_save_complete_precopy(QEMUFile *f, void *opaque) +{ + VhostVdpaDevice *vdev = VHOST_VDPA_DEVICE(opaque); + struct vhost_dev *hdev = &vdev->dev; + int ret; + + qemu_put_be64(f, VDPA_MIG_FLAG_DEV_CONFIG_STATE); + qemu_put_be16(f, (uint16_t)vdev->suspended); + if (vdev->suspended) { + ret = vhost_vdpa_dev_buffer_save(hdev, f); + if (ret) { + error_report("Save vdpa device buffer failed: %d", ret); + return ret; + } + } + qemu_put_be64(f, VDPA_MIG_FLAG_END_OF_STATE); + + return qemu_file_get_error(f); +} + +static int vdpa_load_state(QEMUFile *f, void *opaque, int version_id) +{ + VhostVdpaDevice *vdev = VHOST_VDPA_DEVICE(opaque); + struct vhost_dev *hdev = &vdev->dev; + + int ret; + uint64_t data; + uint16_t suspended; + + data = qemu_get_be64(f); + while (data != VDPA_MIG_FLAG_END_OF_STATE) { + if (data == VDPA_MIG_FLAG_DEV_SETUP_STATE) { + data = qemu_get_be64(f); + if (data == VDPA_MIG_FLAG_END_OF_STATE) { + return 0; + } else { + error_report("SETUP STATE: EOS not found 0x%" PRIx64, data); + return -EINVAL; + } + } else if (data == VDPA_MIG_FLAG_DEV_CONFIG_STATE) { + suspended = qemu_get_be16(f); + if (suspended) { + ret = vhost_vdpa_dev_buffer_load(hdev, f); + if (ret) { + error_report("failed to restore device buffer"); + return ret; + } + } + } + + ret = qemu_file_get_error(f); + if (ret) { + error_report("qemu file error: %d", ret); + return ret; + } + data = qemu_get_be64(f); + } + + return 0; +} + +static int vdpa_load_setup(QEMUFile *f, void *opaque) +{ + VhostVdpaDevice *v = VHOST_VDPA_DEVICE(opaque); + struct vhost_dev *hdev = &v->dev; + int ret = 0; + + ret = vhost_vdpa_set_mig_state(hdev, VDPA_DEVICE_PRE_START); + if (ret) { + error_report("pre start device failed: %d", ret); + goto out; + } + + return qemu_file_get_error(f); +out: + return ret; +} + +static SaveVMHandlers savevm_vdpa_handlers = { + .save_setup = vdpa_save_setup, + .save_live_complete_precopy
+    .load_state = vdpa_load_state,
+    .load_setup = vdpa_load_setup,
+};
+
+static void vdpa_migration_state_notifier(Notifier *notifier, void *data)
+{
+    MigrationState *s = data;
+    VhostVdpaDevice *vdev = container_of(notifier,
+                                         VhostVdpaDevice,
+                                         migration_state);
+    struct vhost_dev *hdev = &vdev->dev;
+    int ret;
+
+    switch (s->state) {
+    case MIGRATION_STATUS_CANCELLING:
+    case MIGRATION_STATUS_CANCELLED:
+    case MIGRATION_STATUS_FAILED:
+        ret = vhost_vdpa_set_mig_state(hdev, VDPA_DEVICE_CANCEL);
+        if (ret) {
+            error_report("Failed to set state: CANCEL");
+        }
+
+        break;
+    case MIGRATION_STATUS_COMPLETED:
+    default:
+        break;
+    }
+}
+
+void vdpa_migration_register(VhostVdpaDevice *vdev)
+{
+    vdev->vmstate = qdev_add_vm_change_state_handler(DEVICE(vdev),
+                                                     vdpa_dev_vmstate_change,
+                                                     DEVICE(vdev));
+    register_savevm_live("vdpa", -1, 1,
+                         &savevm_vdpa_handlers, DEVICE(vdev));
+    migration_add_notifier(&vdev->migration_state,
+                           vdpa_migration_state_notifier);
+}
+
+void vdpa_migration_unregister(VhostVdpaDevice *vdev)
+{
+    migration_remove_notifier(&vdev->migration_state);
+    unregister_savevm(NULL, "vdpa", DEVICE(vdev));
+    qemu_del_vm_change_state_handler(vdev->vmstate);
+}
diff --git a/hw/virtio/vdpa-dev.c b/hw/virtio/vdpa-dev.c
index f22d5d5bc0ac66e8997037d84c732271d45e0ba0..7ce8547419a4c807df89b4a58dd10c5217754ddc 100644
--- a/hw/virtio/vdpa-dev.c
+++ b/hw/virtio/vdpa-dev.c
@@ -28,6 +28,11 @@
 #include "hw/virtio/vdpa-dev.h"
 #include "sysemu/sysemu.h"
 #include "sysemu/runstate.h"
+#include "hw/virtio/vdpa-dev-mig.h"
+#include "migration/migration.h"
+#include "exec/address-spaces.h"
+#include "standard-headers/linux/virtio_ids.h"
+#include "hw/virtio/vdpa-dev-iommufd.h"
 
 static void vhost_vdpa_device_dummy_handle_output(VirtIODevice *vdev,
                                                   VirtQueue *vq)
@@ -106,6 +111,7 @@ static void vhost_vdpa_device_realize(DeviceState *dev, Error **errp)
     v->dev.vq_index = 0;
     v->dev.vq_index_end = v->dev.nvqs;
     v->dev.backend_features = 0;
+    v->dev.has_container = false;
     v->started = false;
 
     ret = vhost_vdpa_get_iova_range(v->vhostfd, &iova_range);
@@ -123,6 +129,17 @@ static void vhost_vdpa_device_realize(DeviceState *dev, Error **errp)
         goto free_vqs;
     }
 
+    /* If the vdpa device is associated with an iommufd, attach it to the
+     * container instead of registering a memory listener. */
+    if (v->iommufd) {
+        ret = vhost_vdpa_attach_container(v);
+        if (ret < 0) {
+            error_setg(errp, "vhost vdpa device attach container failed: %s",
+                       strerror(-ret));
+            goto free_vqs;
+        }
+    } else {
+        memory_listener_register(&v->vdpa.listener, &address_space_memory);
+    }
     v->config_size = vhost_vdpa_device_get_u32(v->vhostfd,
                                                VHOST_VDPA_GET_CONFIG_SIZE,
                                                errp);
@@ -154,12 +171,18 @@ static void vhost_vdpa_device_realize(DeviceState *dev, Error **errp)
                                          vhost_vdpa_device_dummy_handle_output);
     }
 
+    vdpa_migration_register(v);
+
     return;
 
 free_config:
     g_free(v->config);
 vhost_cleanup:
+    memory_listener_unregister(&v->vdpa.listener);
     vhost_dev_cleanup(&v->dev);
+    if (v->iommufd) {
+        vhost_vdpa_detach_container(v);
+    }
 free_vqs:
     g_free(vqs);
 out:
@@ -173,6 +196,7 @@ static void vhost_vdpa_device_unrealize(DeviceState *dev)
     VhostVdpaDevice *s = VHOST_VDPA_DEVICE(vdev);
     int i;
 
+    vdpa_migration_unregister(s);
     virtio_set_status(vdev, 0);
 
     for (i = 0; i < s->num_queues; i++) {
@@ -183,7 +207,11 @@ static void vhost_vdpa_device_unrealize(DeviceState *dev)
     g_free(s->config);
     g_free(s->dev.vqs);
+    memory_listener_unregister(&s->vdpa.listener);
     vhost_dev_cleanup(&s->dev);
+    if (s->iommufd) {
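+        /*
+         * Mirror of the realize() path: an iommufd-backed device was
+         * attached to a container instead of registering a memory
+         * listener, so detach it again on unrealize.
+         */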
vhost_vdpa_detach_container(s); + } qemu_close(s->vhostfd); s->vhostfd = -1; } @@ -192,7 +220,23 @@ static void vhost_vdpa_device_get_config(VirtIODevice *vdev, uint8_t *config) { VhostVdpaDevice *s = VHOST_VDPA_DEVICE(vdev); + uint8_t *new_config; + int ret; + if (s->vdev_id != VIRTIO_ID_BLOCK) { + goto out; + } + + new_config = g_malloc0(s->config_size); + ret = vhost_dev_get_config(&s->dev, new_config, s->config_size, NULL); + if (ret < 0) { + error_report("vhost-vdpa-device: get config failed(%d)\n", ret); + goto free; + } + memcpy(s->config, new_config, s->config_size); +free: + g_free(new_config); +out: memcpy(config, s->config, s->config_size); } @@ -250,14 +294,11 @@ static int vhost_vdpa_device_start(VirtIODevice *vdev, Error **errp) s->dev.acked_features = vdev->guest_features; - ret = vhost_dev_start(&s->dev, vdev, false); + ret = vhost_dev_start(&s->dev, vdev, true); if (ret < 0) { error_setg_errno(errp, -ret, "Error starting vhost"); goto err_guest_notifiers; } - for (i = 0; i < s->dev.nvqs; ++i) { - vhost_vdpa_set_vring_ready(&s->vdpa, i); - } s->started = true; /* @@ -316,7 +357,7 @@ static void vhost_vdpa_device_set_status(VirtIODevice *vdev, uint8_t status) should_start = false; } - if (s->started == should_start) { + if (s->started == should_start || s->suspended) { return; } @@ -333,12 +374,13 @@ static void vhost_vdpa_device_set_status(VirtIODevice *vdev, uint8_t status) static Property vhost_vdpa_device_properties[] = { DEFINE_PROP_STRING("vhostdev", VhostVdpaDevice, vhostdev), DEFINE_PROP_UINT16("queue-size", VhostVdpaDevice, queue_size, 0), + DEFINE_PROP_LINK("iommufd", VhostVdpaDevice, iommufd, TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *), DEFINE_PROP_END_OF_LIST(), }; static const VMStateDescription vmstate_vhost_vdpa_device = { .name = "vhost-vdpa-device", - .unmigratable = 1, + .unmigratable = 0, .minimum_version_id = 1, .version_id = 1, .fields = (VMStateField[]) { diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index f214df804b27fb29effee67a58e9437f303ff12b..224e7d2379cda51f7a4657dcb64685a1d9306c55 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -28,6 +28,7 @@ #include "sysemu/cryptodev.h" #include "migration/migration.h" #include "migration/postcopy-ram.h" +#include "migration/register.h" #include "trace.h" #include "exec/ramblock.h" @@ -2119,6 +2120,28 @@ static int vhost_user_postcopy_notifier(NotifierWithReturn *notifier, return 0; } +static int vhost_user_load_setup(QEMUFile *f, void *opaque) +{ + struct vhost_dev *hdev = opaque; + int r; + + if (hdev->vhost_ops && hdev->vhost_ops->vhost_set_mem_table) { + r = hdev->vhost_ops->vhost_set_mem_table(hdev, hdev->mem); + if (r < 0) { + qemu_log("error: vhost_set_mem_table failed: %s(%d)\n", + strerror(errno), errno); + return r; + } else { + qemu_log("info: vhost_set_mem_table OK\n"); + } + } + return 0; +} + +SaveVMHandlers savevm_vhost_user_handlers = { + .load_setup = vhost_user_load_setup, +}; + static int vhost_user_backend_init(struct vhost_dev *dev, void *opaque, Error **errp) { @@ -2126,9 +2149,15 @@ static int vhost_user_backend_init(struct vhost_dev *dev, void *opaque, struct vhost_user *u; VhostUserState *vus = (VhostUserState *) opaque; int err; + Chardev *chr; assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER); + chr = qemu_chr_fe_get_driver(((VhostUserState *)opaque)->chr); + if (chr) { + chr->chr_for_flag = CHR_FOR_VHOST_USER; + } + u = g_new0(struct vhost_user, 1); u->user = vus; u->dev = dev; @@ -2255,6 +2284,7 @@ static int 
vhost_user_backend_init(struct vhost_dev *dev, void *opaque, u->postcopy_notifier.notify = vhost_user_postcopy_notifier; postcopy_add_notifier(&u->postcopy_notifier); + register_savevm_live("vhost-user", -1, 1, &savevm_vhost_user_handlers, dev); return 0; } @@ -2286,6 +2316,7 @@ static int vhost_user_backend_cleanup(struct vhost_dev *dev) u->region_rb_len = 0; g_free(u); dev->opaque = 0; + unregister_savevm(NULL, "vhost-user", dev); return 0; } diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index 819b2d811af4247e52f7fd9f109e844218836d3c..b5fb89b98e15fe871d16b237390aa0ed581022cc 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -26,13 +26,14 @@ #include "qemu/main-loop.h" #include "trace.h" #include "qapi/error.h" +#include "hw/virtio/vdpa-dev-iommufd.h" /* * Return one past the end of the end of section. Be careful with uint64_t * conversions! */ -static Int128 vhost_vdpa_section_end(const MemoryRegionSection *section, - int page_mask) +Int128 vhost_vdpa_section_end(const MemoryRegionSection *section, + int page_mask) { Int128 llend = int128_make64(section->offset_within_address_space); llend = int128_add(llend, section->size); @@ -41,10 +42,10 @@ static Int128 vhost_vdpa_section_end(const MemoryRegionSection *section, return llend; } -static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section, - uint64_t iova_min, - uint64_t iova_max, - int page_mask) +bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section, + uint64_t iova_min, + uint64_t iova_max, + int page_mask) { Int128 llend; @@ -596,21 +597,6 @@ static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp) return 0; } - /* - * If dev->shadow_vqs_enabled at initialization that means the device has - * been started with x-svq=on, so don't block migration - */ - if (dev->migration_blocker == NULL && !v->shadow_vqs_enabled) { - /* We don't have dev->features yet */ - uint64_t features; - ret = vhost_vdpa_get_dev_features(dev, &features); - if (unlikely(ret)) { - error_setg_errno(errp, -ret, "Could not get device features"); - return ret; - } - vhost_svq_valid_features(features, &dev->migration_blocker); - } - /* * Similar to VFIO, we end up pinning all guest memory and have to * disable discarding of RAM. 
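The vdpa and vhost-user hunks above both tie a named SaveVMHandlers registration to backend lifetime: it is registered in backend init and removed in backend cleanup, so a torn-down backend can never be called back by the migration core. A minimal sketch of that pairing, assuming QEMU's migration/register.h API as used above; the "my-backend" name and the trivial handler are illustrative, not part of the patch:

    /* Illustrative only: pair register_savevm_live() with
     * unregister_savevm() over the backend's lifetime. */
    static int my_backend_load_setup(QEMUFile *f, void *opaque)
    {
        return 0; /* reprogram backend state before the stream is parsed */
    }

    static SaveVMHandlers savevm_my_backend_handlers = {
        .load_setup = my_backend_load_setup,
    };

    static int my_backend_init(struct vhost_dev *dev)
    {
        /* instance_id -1 asks the core to allocate one; version_id 1 */
        register_savevm_live("my-backend", -1, 1,
                             &savevm_my_backend_handlers, dev);
        return 0;
    }

    static int my_backend_cleanup(struct vhost_dev *dev)
    {
        unregister_savevm(NULL, "my-backend", dev);
        return 0;
    }

vhost-user only needs .load_setup, which replays the memory table to the backend before the incoming stream is parsed; a richer backend such as the vdpa device also hooks .save_setup, .save_live_complete_precopy and .load_state.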
@@ -829,10 +815,11 @@ static int vhost_vdpa_set_features(struct vhost_dev *dev, static int vhost_vdpa_set_backend_cap(struct vhost_dev *dev) { uint64_t features; - uint64_t f = 0x1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2 | - 0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH | - 0x1ULL << VHOST_BACKEND_F_IOTLB_ASID | - 0x1ULL << VHOST_BACKEND_F_SUSPEND; + uint64_t f = BIT_ULL(VHOST_BACKEND_F_IOTLB_MSG_V2) | + BIT_ULL(VHOST_BACKEND_F_IOTLB_BATCH) | + BIT_ULL(VHOST_BACKEND_F_IOTLB_ASID) | + BIT_ULL(VHOST_BACKEND_F_SUSPEND) | + BIT_ULL(VHOST_BACKEND_F_BYTEMAPLOG); int r; if (vhost_vdpa_call(dev, VHOST_GET_BACKEND_FEATURES, &features)) { @@ -864,13 +851,11 @@ static int vhost_vdpa_get_device_id(struct vhost_dev *dev, static int vhost_vdpa_reset_device(struct vhost_dev *dev) { - struct vhost_vdpa *v = dev->opaque; int ret; uint8_t status = 0; ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status); trace_vhost_vdpa_reset_device(dev); - v->suspended = false; return ret; } @@ -882,19 +867,46 @@ static int vhost_vdpa_get_vq_index(struct vhost_dev *dev, int idx) return idx; } -int vhost_vdpa_set_vring_ready(struct vhost_vdpa *v, unsigned idx) +static int vhost_vdpa_set_vring_enable_one(struct vhost_vdpa *v, unsigned idx, + int enable) { struct vhost_dev *dev = v->dev; struct vhost_vring_state state = { .index = idx, - .num = 1, + .num = enable, }; + hwaddr addr = virtio_queue_get_desc_addr(dev->vdev, idx); + if (addr == 0) { + return 0; + } + int r = vhost_vdpa_call(dev, VHOST_VDPA_SET_VRING_ENABLE, &state); - trace_vhost_vdpa_set_vring_ready(dev, idx, r); + trace_vhost_vdpa_set_vring_enable_one(dev, idx, enable, r); return r; } +static int vhost_vdpa_set_vring_enable(struct vhost_dev *dev, int enable) +{ + struct vhost_vdpa *v = dev->opaque; + unsigned int i; + int ret; + + for (i = 0; i < dev->nvqs; ++i) { + ret = vhost_vdpa_set_vring_enable_one(v, i, enable); + if (ret < 0) { + return ret; + } + } + + return 0; +} + +int vhost_vdpa_set_vring_ready(struct vhost_vdpa *v, unsigned idx) +{ + return vhost_vdpa_set_vring_enable_one(v, idx, 1); +} + static int vhost_vdpa_set_config_call(struct vhost_dev *dev, int fd) { @@ -1268,29 +1280,6 @@ static void vhost_vdpa_svqs_stop(struct vhost_dev *dev) } } -static void vhost_vdpa_suspend(struct vhost_dev *dev) -{ - struct vhost_vdpa *v = dev->opaque; - int r; - - if (!vhost_vdpa_first_dev(dev)) { - return; - } - - if (dev->backend_cap & BIT_ULL(VHOST_BACKEND_F_SUSPEND)) { - trace_vhost_vdpa_suspend(dev); - r = ioctl(v->device_fd, VHOST_VDPA_SUSPEND); - if (unlikely(r)) { - error_report("Cannot suspend: %s(%d)", g_strerror(errno), errno); - } else { - v->suspended = true; - return; - } - } - - vhost_vdpa_reset_device(dev); -} - static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started) { struct vhost_vdpa *v = dev->opaque; @@ -1304,7 +1293,6 @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started) return -1; } } else { - vhost_vdpa_suspend(dev); vhost_vdpa_svqs_stop(dev); vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs); } @@ -1319,8 +1307,6 @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started) "IOMMU and try again"); return -1; } - memory_listener_register(&v->listener, dev->vdev->dma_as); - return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK); } @@ -1329,8 +1315,6 @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started) static void vhost_vdpa_reset_status(struct vhost_dev *dev) { - struct vhost_vdpa *v = dev->opaque; - if (dev->vq_index + dev->nvqs != dev->vq_index_end) { return; } @@ -1338,7 +1322,6 @@ 
static void vhost_vdpa_reset_status(struct vhost_dev *dev) vhost_vdpa_reset_device(dev); vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE | VIRTIO_CONFIG_S_DRIVER); - memory_listener_unregister(&v->listener); } static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base, @@ -1354,6 +1337,30 @@ static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base, return vhost_vdpa_call(dev, VHOST_SET_LOG_BASE, &base); } +static int vhost_vdpa_set_log_fd(struct vhost_dev *dev, int fd, + struct vhost_log *log) +{ + struct vhost_vdpa *v = dev->opaque; + if (v->shadow_vqs_enabled || !vhost_vdpa_first_dev(dev)) { + return 0; + } + + return vhost_vdpa_call(dev, VHOST_SET_LOG_FD, &fd); +} + +static int vhost_vdpa_set_log_size(struct vhost_dev *dev, uint64_t size, + struct vhost_log *log) +{ + struct vhost_vdpa *v = dev->opaque; + uint64_t logsize = size * sizeof(*(log->log)); + + if (v->shadow_vqs_enabled || !vhost_vdpa_first_dev(dev)) { + return 0; + } + + return vhost_vdpa_call(dev, VHOST_SET_LOG_SIZE, &logsize); +} + static int vhost_vdpa_set_vring_addr(struct vhost_dev *dev, struct vhost_vring_addr *addr) { @@ -1404,14 +1411,6 @@ static int vhost_vdpa_get_vring_base(struct vhost_dev *dev, return 0; } - if (!v->suspended) { - /* - * Cannot trust in value returned by device, let vhost recover used - * idx from guest. - */ - return -1; - } - ret = vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring); trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num); return ret; @@ -1488,11 +1487,59 @@ static bool vhost_vdpa_force_iommu(struct vhost_dev *dev) return true; } +static int vhost_vdpa_suspend_device(struct vhost_dev *dev) +{ + int ret; + + vhost_vdpa_svqs_stop(dev); + vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs); + + if (dev->vq_index + dev->nvqs != dev->vq_index_end) { + return 0; + } + + ret = vhost_vdpa_call(dev, VHOST_VDPA_SUSPEND, NULL); + return ret; +} + +static int vhost_vdpa_resume_device(struct vhost_dev *dev) +{ + struct vhost_vdpa *v = dev->opaque; + bool ok; + + vhost_vdpa_host_notifiers_init(dev); + ok = vhost_vdpa_svqs_start(dev); + if (unlikely(!ok)) { + return -1; + } + for (int i = 0; i < v->dev->nvqs; ++i) { + vhost_vdpa_set_vring_ready(v, v->dev->vq_index + i); + } + + if (dev->vq_index + dev->nvqs != dev->vq_index_end) { + return 0; + } + + return vhost_vdpa_call(dev, VHOST_VDPA_RESUME, NULL); +} + +static int vhost_vdpa_log_sync(struct vhost_dev *dev) +{ + struct vhost_vdpa *v = dev->opaque; + if (v->shadow_vqs_enabled || !vhost_vdpa_first_dev(dev)) { + return 0; + } + + return vhost_vdpa_call(dev, VHOST_LOG_SYNC, NULL); +} + const VhostOps vdpa_ops = { .backend_type = VHOST_BACKEND_TYPE_VDPA, .vhost_backend_init = vhost_vdpa_init, .vhost_backend_cleanup = vhost_vdpa_cleanup, .vhost_set_log_base = vhost_vdpa_set_log_base, + .vhost_set_log_size = vhost_vdpa_set_log_size, + .vhost_set_log_fd = vhost_vdpa_set_log_fd, .vhost_set_vring_addr = vhost_vdpa_set_vring_addr, .vhost_set_vring_num = vhost_vdpa_set_vring_num, .vhost_set_vring_base = vhost_vdpa_set_vring_base, @@ -1508,6 +1555,7 @@ const VhostOps vdpa_ops = { .vhost_set_features = vhost_vdpa_set_features, .vhost_reset_device = vhost_vdpa_reset_device, .vhost_get_vq_index = vhost_vdpa_get_vq_index, + .vhost_set_vring_enable = vhost_vdpa_set_vring_enable, .vhost_get_config = vhost_vdpa_get_config, .vhost_set_config = vhost_vdpa_set_config, .vhost_requires_shm_log = NULL, @@ -1519,6 +1567,9 @@ const VhostOps vdpa_ops = { .vhost_get_device_id = vhost_vdpa_get_device_id, .vhost_vq_get_addr = 
vhost_vdpa_vq_get_addr,
     .vhost_force_iommu = vhost_vdpa_force_iommu,
+    .vhost_log_sync = vhost_vdpa_log_sync,
     .vhost_set_config_call = vhost_vdpa_set_config_call,
     .vhost_reset_status = vhost_vdpa_reset_status,
+    .vhost_dev_suspend = vhost_vdpa_suspend_device,
+    .vhost_dev_resume = vhost_vdpa_resume_device,
 };
diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index 2c9ac794680ea9b65eba6cc22e70cf141e90aa73..58207e472b082c3a2c10a139de86a6445239abba 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -26,8 +26,11 @@
 #include "hw/mem/memory-device.h"
 #include "migration/blocker.h"
 #include "migration/qemu-file-types.h"
+#include "migration/migration.h"
 #include "sysemu/dma.h"
 #include "trace.h"
+#include "qapi/qapi-commands-migration.h"
+#include "sysemu/kvm.h"
 
 /* enabled until disconnected backend stabilizes */
 #define _VHOST_DEBUG 1
@@ -43,6 +46,11 @@
 do { } while (0)
 #endif
 
+bool vhost_bytemap_log_support(struct vhost_dev *dev)
+{
+    return (dev->backend_cap & BIT_ULL(VHOST_BACKEND_F_BYTEMAPLOG));
+}
+
 static struct vhost_log *vhost_log;
 static struct vhost_log *vhost_log_shm;
 
@@ -55,6 +63,8 @@ static unsigned int used_shared_memslots;
 static QLIST_HEAD(, vhost_dev) vhost_devices =
     QLIST_HEAD_INITIALIZER(vhost_devices);
 
+bool used_memslots_exceeded;
+
 unsigned int vhost_get_max_memslots(void)
 {
     unsigned int max = UINT_MAX;
@@ -149,10 +159,10 @@ bool vhost_dev_has_iommu(struct vhost_dev *dev)
     }
 }
 
-static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
-                                   MemoryRegionSection *section,
-                                   hwaddr first,
-                                   hwaddr last)
+int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
+                            MemoryRegionSection *section,
+                            hwaddr first,
+                            hwaddr last)
 {
     int i;
     hwaddr start_addr;
@@ -229,12 +239,40 @@ static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
     return 0;
 }
 
+int vhost_sync_dirty_bytemap(struct vhost_dev *dev,
+                             MemoryRegionSection *section)
+{
+    unsigned long *bytemap = dev->log->log;
+    return memory_section_set_dirty_bytemap(section, bytemap);
+}
+
 static void vhost_log_sync(MemoryListener *listener,
                            MemoryRegionSection *section)
 {
     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                          memory_listener);
-    vhost_sync_dirty_bitmap(dev, section, 0x0, ~0x0ULL);
+    MigrationState *ms = migrate_get_current();
+
+    if (!dev->log_enabled || !dev->log || dev->has_container) {
+        return;
+    }
+
+    if (dev->vhost_ops->vhost_log_sync) {
+        int r = dev->vhost_ops->vhost_log_sync(dev);
+        if (r < 0) {
+            error_report("Failed to sync dirty log: %d", r);
+            if (migration_is_running(ms->state)) {
+                qmp_migrate_cancel(NULL);
+            }
+            return;
+        }
+    }
+
+    if (vhost_bytemap_log_support(dev)) {
+        vhost_sync_dirty_bytemap(dev, section);
+    } else {
+        vhost_sync_dirty_bitmap(dev, section, 0x0, ~0x0ULL);
+    }
 }
 
 static void vhost_log_sync_range(struct vhost_dev *dev,
@@ -244,7 +282,11 @@ static void vhost_log_sync_range(struct vhost_dev *dev,
     /* FIXME: this is N^2 in number of sections */
     for (i = 0; i < dev->n_mem_sections; ++i) {
         MemoryRegionSection *section = &dev->mem_sections[i];
-        vhost_sync_dirty_bitmap(dev, section, first, last);
+        if (vhost_bytemap_log_support(dev)) {
+            vhost_sync_dirty_bytemap(dev, section);
+        } else {
+            vhost_sync_dirty_bitmap(dev, section, first, last);
+        }
     }
 }
 
@@ -252,11 +294,19 @@ static uint64_t vhost_get_log_size(struct vhost_dev *dev)
 {
     uint64_t log_size = 0;
     int i;
+    uint64_t vhost_log_chunk_size;
+
+    if (vhost_bytemap_log_support(dev)) {
+        vhost_log_chunk_size = VHOST_LOG_CHUNK_BYTES;
+    } else {
+        vhost_log_chunk_size = VHOST_LOG_CHUNK;
+    }
+
     for (i = 0; i < dev->mem->nregions; ++i) {
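+        /*
+         * The divisor below is why the two log formats differ in size:
+         * a bytemap spends one byte per tracked page where the bitmap
+         * spends one bit, so VHOST_LOG_CHUNK_BYTES presumably covers 8x
+         * fewer pages per chunk than VHOST_LOG_CHUNK and the log grows
+         * accordingly.
+         */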
        struct vhost_memory_region *reg = dev->mem->regions + i;
         uint64_t last = range_get_last(reg->guest_phys_addr, reg->memory_size);
-        log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
+        log_size = MAX(log_size, last / vhost_log_chunk_size + 1);
     }
     return log_size;
 }
@@ -374,12 +424,21 @@ static bool vhost_dev_log_is_shared(struct vhost_dev *dev)
            dev->vhost_ops->vhost_requires_shm_log(dev);
 }
 
-static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
+static inline int vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
 {
     struct vhost_log *log = vhost_log_get(size, vhost_dev_log_is_shared(dev));
-    uint64_t log_base = (uintptr_t)log->log;
+    uint64_t log_base;
     int r;
 
+    if (!log) {
+        r = -ENOMEM;
+        goto out;
+    }
+
+    log_base = (uintptr_t)log->log;
+
     /* inform backend of log switching, this must be done before
        releasing the current log, to ensure no logging is lost */
     r = dev->vhost_ops->vhost_set_log_base(dev, log_base, log);
@@ -387,9 +446,19 @@ static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
         VHOST_OPS_DEBUG(r, "vhost_set_log_base failed");
     }
 
+    if (dev->vhost_ops->vhost_set_log_size) {
+        r = dev->vhost_ops->vhost_set_log_size(dev, size, dev->log);
+        if (r < 0) {
+            VHOST_OPS_DEBUG(r, "vhost_set_log_size failed");
+        }
+    }
+
     vhost_log_put(dev, true);
     dev->log = log;
     dev->log_size = size;
+
+out:
+    return r;
 }
 
 static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr,
@@ -1015,7 +1084,11 @@ static int vhost_migration_log(MemoryListener *listener, bool enable)
         }
         vhost_log_put(dev, false);
     } else {
-        vhost_dev_log_resize(dev, vhost_get_log_size(dev));
+        r = vhost_dev_log_resize(dev, vhost_get_log_size(dev));
+        if (r < 0) {
+            return r;
+        }
+
         r = vhost_dev_set_log(dev, true);
         if (r < 0) {
             goto check_dev_state;
@@ -1047,20 +1120,24 @@ check_dev_state:
 
 static void vhost_log_global_start(MemoryListener *listener)
 {
     int r;
+    Error *errp = NULL;
 
     r = vhost_migration_log(listener, true);
     if (r < 0) {
-        abort();
+        error_setg(&errp, "Failed to start vhost migration log");
+        migrate_fd_error(migrate_get_current(), errp);
     }
 }
 
 static void vhost_log_global_stop(MemoryListener *listener)
 {
     int r;
+    Error *errp = NULL;
 
     r = vhost_migration_log(listener, false);
     if (r < 0) {
-        abort();
+        error_setg(&errp, "Failed to stop vhost migration log");
+        migrate_fd_error(migrate_get_current(), errp);
     }
 }
 
@@ -1540,7 +1617,12 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
     hdev->log_size = 0;
     hdev->log_enabled = false;
     hdev->started = false;
-    memory_listener_register(&hdev->memory_listener, &address_space_memory);
+    if (virtcca_cvm_enabled() && virtcca_shared_hugepage &&
+        virtcca_shared_hugepage->ram_block) {
+        memory_listener_register(&hdev->memory_listener,
+                                 &address_space_virtcca_shared_memory);
+    } else {
+        memory_listener_register(&hdev->memory_listener, &address_space_memory);
+    }
     QLIST_INSERT_HEAD(&vhost_devices, hdev, entry);
 
     /*
@@ -1564,8 +1646,11 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
         error_setg(errp, "vhost backend memory slots limit (%d) is less"
                    " than current number of used (%d) and reserved (%d)"
                    " memory slots for memory devices.", limit, used, reserved);
+        used_memslots_exceeded = true;
         r = -EINVAL;
         goto fail_busyloop;
+    } else {
+        used_memslots_exceeded = false;
     }
 
     return 0;
@@ -1984,7 +2069,13 @@ static int vhost_dev_set_vring_enable(struct vhost_dev *hdev, int enable)
     return hdev->vhost_ops->vhost_set_vring_enable(hdev, enable);
 }
 
-/* Host notifiers must be enabled at this point.
*/ +/* + * Host notifiers must be enabled at this point. + * + * If @vrings is true, this function will enable all vrings before starting the + * device. If it is false, the vring initialization is left to be done by the + * caller. + */ int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings) { int i, r; @@ -2047,6 +2138,14 @@ int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings) VHOST_OPS_DEBUG(r, "vhost_set_log_base failed"); goto fail_log; } + + if (hdev->vhost_ops->vhost_set_log_size) { + r = hdev->vhost_ops->vhost_set_log_size(hdev, hdev->log_size, hdev->log); + if (r < 0) { + VHOST_OPS_DEBUG(r, "vhost_set_log_size failed"); + goto fail_log; + } + } } if (vrings) { r = vhost_dev_set_vring_enable(hdev, true); @@ -2400,3 +2499,145 @@ fail: return ret; } + +bool used_memslots_is_exceeded(void) +{ + return used_memslots_exceeded; +} + +int vhost_dev_resume(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings) +{ + int i, r; + EventNotifier *e = &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier; + + /* should only be called after backend is connected */ + if (!hdev->vhost_ops) { + error_report("Missing vhost_ops! Operation not permitted!\n"); + return -EPERM; + } + + vdev->vhost_started = true; + hdev->started = true; + hdev->vdev = vdev; + + if (vhost_dev_has_iommu(hdev)) { + memory_listener_register(&hdev->iommu_listener, vdev->dma_as); + } + + r = hdev->vhost_ops->vhost_set_mem_table(hdev, hdev->mem); + if (r < 0) { + VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed"); + goto fail_mem; + } + for (i = 0; i < hdev->nvqs; ++i) { + r = vhost_virtqueue_start(hdev, + vdev, + hdev->vqs + i, + hdev->vq_index + i); + if (r < 0) { + goto fail_vq; + } + } + + r = event_notifier_init(e, 0); + if (r < 0) { + return r; + } + event_notifier_test_and_clear(e); + if (!vdev->use_guest_notifier_mask) { + vhost_config_mask(hdev, vdev, true); + } + if (vrings) { + r = vhost_dev_set_vring_enable(hdev, true); + if (r) { + goto fail_vq; + } + } + if (hdev->vhost_ops->vhost_dev_resume) { + r = hdev->vhost_ops->vhost_dev_resume(hdev); + if (r) { + goto fail_start; + } + } + if (vhost_dev_has_iommu(hdev)) { + hdev->vhost_ops->vhost_set_iotlb_callback(hdev, true); + + /* + * Update used ring information for IOTLB to work correctly, + * vhost-kernel code requires for this. + */ + for (i = 0; i < hdev->nvqs; ++i) { + struct vhost_virtqueue *vq = hdev->vqs + i; + vhost_device_iotlb_miss(hdev, vq->used_phys, true); + } + } + vhost_start_config_intr(hdev); + return 0; +fail_start: + if (vrings) { + vhost_dev_set_vring_enable(hdev, false); + } +fail_vq: + while (--i >= 0) { + vhost_virtqueue_stop(hdev, + vdev, + hdev->vqs + i, + hdev->vq_index + i); + } + +fail_mem: + vdev->vhost_started = false; + hdev->started = false; + return r; +} + +int vhost_dev_suspend(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings) +{ + int i; + int ret = 0; + EventNotifier *e = &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier; + + /* should only be called after backend is connected */ + if (!hdev->vhost_ops) { + error_report("Missing vhost_ops! 
Operation not permitted!\n"); + return -EPERM; + } + + event_notifier_test_and_clear(e); + event_notifier_test_and_clear(&vdev->config_notifier); + + if (hdev->vhost_ops->vhost_dev_suspend) { + ret = hdev->vhost_ops->vhost_dev_suspend(hdev); + if (ret) { + goto fail_suspend; + } + } + if (vrings) { + ret = vhost_dev_set_vring_enable(hdev, false); + if (ret) { + goto fail_suspend; + } + } + for (i = 0; i < hdev->nvqs; ++i) { + vhost_virtqueue_stop(hdev, + vdev, + hdev->vqs + i, + hdev->vq_index + i); + } + + if (vhost_dev_has_iommu(hdev)) { + hdev->vhost_ops->vhost_set_iotlb_callback(hdev, false); + memory_listener_unregister(&hdev->iommu_listener); + } + vhost_stop_config_intr(hdev); + hdev->started = false; + vdev->vhost_started = false; + hdev->vdev = NULL; + + return ret; + +fail_suspend: + event_notifier_test_and_clear(e); + + return ret; +} diff --git a/hw/virtio/virtio-bus.c b/hw/virtio/virtio-bus.c index 896feb37a1caa805543e971c150d3673675b9a6b..749df6478ea0c112f0123dbe32e9b9a5d685b843 100644 --- a/hw/virtio/virtio-bus.c +++ b/hw/virtio/virtio-bus.c @@ -25,10 +25,12 @@ #include "qemu/osdep.h" #include "qemu/error-report.h" #include "qemu/module.h" +#include "sysemu/kvm.h" #include "qapi/error.h" #include "hw/virtio/virtio-bus.h" #include "hw/virtio/virtio.h" #include "exec/address-spaces.h" +#include "sysemu/kvm.h" /* #define DEBUG_VIRTIO_BUS */ @@ -70,6 +72,12 @@ void virtio_bus_device_plugged(VirtIODevice *vdev, Error **errp) return; } + if (virtcca_cvm_enabled() && (strcmp(vdev->name, "vhost-user-fs") == 0)) { + /* VIRTIO_F_IOMMU_PLATFORM should be enabled for vhost-user-fs using swiotlb */ + error_setg(errp, "iommu_platform is not supported by this device"); + return; + } + if (klass->device_plugged != NULL) { klass->device_plugged(qbus->parent, &local_err); } @@ -81,6 +89,11 @@ void virtio_bus_device_plugged(VirtIODevice *vdev, Error **errp) vdev->dma_as = &address_space_memory; if (has_iommu) { vdev_has_iommu = virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM); + + if (virtcca_cvm_enabled() && (strcmp(vdev->name, "virtio-user-fs") == 0)) { + vdev_has_iommu = true; + } + /* * Present IOMMU_PLATFORM to the driver iff iommu_plattform=on and * device operational. 
If the driver does not accept IOMMU_PLATFORM
diff --git a/hw/virtio/virtio-crypto.c b/hw/virtio/virtio-crypto.c
index 0e2cc8d5a890c021c9b900186195f37d093c54f6..4aaced74be120d6e8d35606328f2202c3a2caad6 100644
--- a/hw/virtio/virtio-crypto.c
+++ b/hw/virtio/virtio-crypto.c
@@ -1080,8 +1080,8 @@ static void virtio_crypto_device_realize(DeviceState *dev, Error **errp)
         vcrypto->vqs[i].dataq =
             virtio_add_queue(vdev, 1024, virtio_crypto_handle_dataq_bh);
         vcrypto->vqs[i].dataq_bh =
-            qemu_bh_new_guarded(virtio_crypto_dataq_bh, &vcrypto->vqs[i],
-                                &dev->mem_reentrancy_guard);
+            virtio_bh_new_guarded(dev, virtio_crypto_dataq_bh,
+                                  &vcrypto->vqs[i]);
         vcrypto->vqs[i].vcrypto = vcrypto;
     }
diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index e4338795423d56b13bd67fa12fc5c808525aef69..13220c258d7c8fa52da3d1334414f754757fc584 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -49,6 +49,90 @@
  * configuration space */
 #define VIRTIO_PCI_CONFIG_SIZE(dev) VIRTIO_PCI_CONFIG_OFF(msix_enabled(dev))
 
+static KVMRouteChange virtio_pci_route_change;
+
+static int kvm_virtio_pci_irqfd_use(VirtIOPCIProxy *proxy,
+                                    EventNotifier *n,
+                                    unsigned int vector);
+
+static inline void virtio_pci_begin_route_changes(VirtIODevice *vdev)
+{
+    if (!vdev->defer_kvm_irq_routing) {
+        virtio_pci_route_change = kvm_irqchip_begin_route_changes(kvm_state);
+    }
+}
+
+static inline void virtio_pci_commit_route_changes(VirtIODevice *vdev)
+{
+    if (!vdev->defer_kvm_irq_routing) {
+        kvm_irqchip_commit_route_changes(&virtio_pci_route_change);
+    }
+}
+
+static void virtio_pci_prepare_kvm_msi_virq_batch(VirtIOPCIProxy *proxy)
+{
+    VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+
+    if (vdev->defer_kvm_irq_routing) {
+        qemu_log("invalid defer kvm irq routing state: %d\n",
+                 vdev->defer_kvm_irq_routing);
+        return;
+    }
+    virtio_pci_route_change = kvm_irqchip_begin_route_changes(kvm_state);
+    vdev->defer_kvm_irq_routing = true;
+}
+
+static void virtio_pci_commit_kvm_msi_virq_batch(VirtIOPCIProxy *proxy)
+{
+    VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+    VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
+    EventNotifier *n;
+    VirtQueue *vq;
+    int vector, index, ret;
+
+    if (!vdev->defer_kvm_irq_routing) {
+        qemu_log("invalid defer kvm irq routing state: %d\n",
+                 vdev->defer_kvm_irq_routing);
+        return;
+    }
+    vdev->defer_kvm_irq_routing = false;
+    kvm_irqchip_commit_route_changes(&virtio_pci_route_change);
+
+    if (vdev->use_guest_notifier_mask && k->guest_notifier_mask) {
+        return;
+    }
+
+    for (vector = 0; vector < proxy->pci_dev.msix_entries_nr; vector++) {
+        if (msix_is_masked(&proxy->pci_dev, vector)) {
+            continue;
+        }
+
+        if (vector == vdev->config_vector) {
+            n = virtio_config_get_guest_notifier(vdev);
+            ret = kvm_virtio_pci_irqfd_use(proxy, n, vector);
+            if (ret) {
+                qemu_log("config irqfd use failed: %d\n", ret);
+            }
+            continue;
+        }
+
+        vq = virtio_vector_first_queue(vdev, vector);
+
+        while (vq) {
+            index = virtio_get_queue_index(vq);
+            if (!virtio_queue_get_num(vdev, index)) {
+                break;
+            }
+            if (index < proxy->nvqs_with_notifiers) {
+                n = virtio_queue_get_guest_notifier(vq);
+                ret = kvm_virtio_pci_irqfd_use(proxy, n, vector);
+                if (ret < 0) {
+                    qemu_log("Error: irqfd use failed: %d\n", ret);
+                }
+            }
+            vq = virtio_vector_next_queue(vq);
+        }
+    }
+}
+
 static void virtio_pci_bus_new(VirtioBusState *bus, size_t bus_size,
                                VirtIOPCIProxy *dev);
 static void virtio_pci_reset(DeviceState *qdev);
@@ -815,12 +899,10 @@ static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy,
     int ret;
     if
(irqfd->users == 0) { - KVMRouteChange c = kvm_irqchip_begin_route_changes(kvm_state); - ret = kvm_irqchip_add_msi_route(&c, vector, &proxy->pci_dev); + ret = kvm_irqchip_add_msi_route(&virtio_pci_route_change, vector, &proxy->pci_dev); if (ret < 0) { return ret; } - kvm_irqchip_commit_route_changes(&c); irqfd->virq = ret; } irqfd->users++; @@ -860,6 +942,9 @@ static int virtio_pci_get_notifier(VirtIOPCIProxy *proxy, int queue_no, VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); VirtQueue *vq; + if (!proxy->vector_irqfd && vdev->status & VIRTIO_CONFIG_S_DRIVER_OK) + return -1; + if (queue_no == VIRTIO_CONFIG_IRQ_IDX) { *n = virtio_config_get_guest_notifier(vdev); *vector = vdev->config_vector; @@ -922,18 +1007,57 @@ undo: } return ret; } + +#ifdef __aarch64__ +int __attribute__((weak)) kvm_create_shadow_device(PCIDevice *dev) +{ + return 0; +} + +int __attribute__((weak)) kvm_delete_shadow_device(PCIDevice *dev) +{ + return 0; +} +#endif + +#ifdef __aarch64__ +static bool shadow_device_supported(VirtIODevice *vdev) +{ + return !strcmp(vdev->name, "virtio-net") || + !strcmp(vdev->name, "virtio-blk") || + !strcmp(vdev->name, "virtio-scsi"); +} +#endif + static int kvm_virtio_pci_vector_vq_use(VirtIOPCIProxy *proxy, int nvqs) { int queue_no; int ret = 0; VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); +#ifdef __aarch64__ + if (shadow_device_supported(vdev)) { + kvm_create_shadow_device(&proxy->pci_dev); + } +#endif for (queue_no = 0; queue_no < nvqs; queue_no++) { if (!virtio_queue_get_num(vdev, queue_no)) { return -1; } + } + + virtio_pci_begin_route_changes(vdev); + for (queue_no = 0; queue_no < nvqs; queue_no++) { ret = kvm_virtio_pci_vector_use_one(proxy, queue_no); } + virtio_pci_commit_route_changes(vdev); + +#ifdef __aarch64__ + if (shadow_device_supported(vdev) && ret != 0) { + kvm_delete_shadow_device(&proxy->pci_dev); + } +#endif + return ret; } @@ -976,6 +1100,12 @@ static void kvm_virtio_pci_vector_vq_release(VirtIOPCIProxy *proxy, int nvqs) } kvm_virtio_pci_vector_release_one(proxy, queue_no); } + +#ifdef __aarch64__ + if (shadow_device_supported(vdev)) { + kvm_delete_shadow_device(&proxy->pci_dev); + } +#endif } static void kvm_virtio_pci_vector_config_release(VirtIOPCIProxy *proxy) @@ -997,12 +1127,13 @@ static int virtio_pci_one_vector_unmask(VirtIOPCIProxy *proxy, if (proxy->vector_irqfd) { irqfd = &proxy->vector_irqfd[vector]; if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) { - ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg, + virtio_pci_begin_route_changes(vdev); + ret = kvm_irqchip_update_msi_route(&virtio_pci_route_change, irqfd->virq, msg, &proxy->pci_dev); if (ret < 0) { return ret; } - kvm_irqchip_commit_routes(kvm_state); + virtio_pci_commit_route_changes(vdev); } } @@ -1017,7 +1148,9 @@ static int virtio_pci_one_vector_unmask(VirtIOPCIProxy *proxy, event_notifier_set(n); } } else { - ret = kvm_virtio_pci_irqfd_use(proxy, n, vector); + if (!vdev->defer_kvm_irq_routing) { + ret = kvm_virtio_pci_irqfd_use(proxy, n, vector); + } } return ret; } @@ -1274,6 +1407,8 @@ static int virtio_pci_set_guest_notifiers(DeviceState *d, int nvqs, bool assign) if ((with_irqfd || (vdev->use_guest_notifier_mask && k->guest_notifier_mask)) && assign) { + + virtio_pci_prepare_kvm_msi_virq_batch(proxy); if (with_irqfd) { proxy->vector_irqfd = g_malloc0(sizeof(*proxy->vector_irqfd) * @@ -1291,6 +1426,7 @@ static int virtio_pci_set_guest_notifiers(DeviceState *d, int nvqs, bool assign) r = msix_set_vector_notifiers(&proxy->pci_dev, 
virtio_pci_vector_unmask, virtio_pci_vector_mask, virtio_pci_vector_poll); + virtio_pci_commit_kvm_msi_virq_batch(proxy); if (r < 0) { goto notifiers_error; } @@ -1424,6 +1560,38 @@ static int virtio_pci_add_mem_cap(VirtIOPCIProxy *proxy, return offset; } +static void virtio_pci_set_vector(VirtIODevice *vdev, + VirtIOPCIProxy *proxy, + int queue_no, uint16_t old_vector, + uint16_t new_vector) +{ + bool kvm_irqfd = (vdev->status & VIRTIO_CONFIG_S_DRIVER_OK) && + msix_enabled(&proxy->pci_dev) && kvm_msi_via_irqfd_enabled(); + + if (new_vector == old_vector) { + return; + } + + /* + * If the device uses irqfd and the vector changes after DRIVER_OK is + * set, we need to release the old vector and set up the new one. + * Otherwise just need to set the new vector on the device. + */ + if (kvm_irqfd && old_vector != VIRTIO_NO_VECTOR) { + kvm_virtio_pci_vector_release_one(proxy, queue_no); + } + /* Set the new vector on the device. */ + if (queue_no == VIRTIO_CONFIG_IRQ_IDX) { + vdev->config_vector = new_vector; + } else { + virtio_queue_set_vector(vdev, queue_no, new_vector); + } + /* If the new vector changed need to set it up. */ + if (kvm_irqfd && new_vector != VIRTIO_NO_VECTOR) { + kvm_virtio_pci_vector_use_one(proxy, queue_no); + } +} + int virtio_pci_add_shm_cap(VirtIOPCIProxy *proxy, uint8_t bar, uint64_t offset, uint64_t length, uint8_t id) @@ -1570,7 +1738,8 @@ static void virtio_pci_common_write(void *opaque, hwaddr addr, } else { val = VIRTIO_NO_VECTOR; } - vdev->config_vector = val; + virtio_pci_set_vector(vdev, proxy, VIRTIO_CONFIG_IRQ_IDX, + vdev->config_vector, val); break; case VIRTIO_PCI_COMMON_STATUS: if (!(val & VIRTIO_CONFIG_S_DRIVER_OK)) { @@ -1610,7 +1779,7 @@ static void virtio_pci_common_write(void *opaque, hwaddr addr, } else { val = VIRTIO_NO_VECTOR; } - virtio_queue_set_vector(vdev, vdev->queue_sel, val); + virtio_pci_set_vector(vdev, proxy, vdev->queue_sel, vector, val); break; case VIRTIO_PCI_COMMON_Q_ENABLE: if (val == 1) { @@ -2082,7 +2251,9 @@ static void virtio_pci_device_unplugged(DeviceState *d) VirtIOPCIProxy *proxy = VIRTIO_PCI(d); bool modern = virtio_pci_modern(proxy); bool modern_pio = proxy->flags & VIRTIO_PCI_FLAG_MODERN_PIO_NOTIFY; + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + qemu_log("unplug device name: %s\n", !vdev ? "NULL" : vdev->name); virtio_pci_stop_ioeventfd(proxy); if (modern) { diff --git a/hw/virtio/virtio-scsi-pci.c b/hw/virtio/virtio-scsi-pci.c index e8e3442f38287c6b8ad28ebee16ecac39d933a2c..e542d471622ab50b866122a8388cd3cfc4e8a9aa 100644 --- a/hw/virtio/virtio-scsi-pci.c +++ b/hw/virtio/virtio-scsi-pci.c @@ -20,6 +20,7 @@ #include "qemu/module.h" #include "hw/virtio/virtio-pci.h" #include "qom/object.h" +#include "qemu/log.h" typedef struct VirtIOSCSIPCI VirtIOSCSIPCI; @@ -51,6 +52,8 @@ static void virtio_scsi_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) VirtIOSCSIConf *conf = &dev->vdev.parent_obj.conf; char *bus_name; + qemu_log("virtio scsi HBA %s begin to initialize.\n", + !proxy->id ? 
"NULL" : proxy->id); if (conf->num_queues == VIRTIO_SCSI_AUTO_NUM_QUEUES) { conf->num_queues = virtio_pci_optimal_num_queues(VIRTIO_SCSI_VQ_NUM_FIXED); diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 3a160f86ed02ddaee4a501f4468a20f3ecd696f2..f57b6c955e1c4df0283ffd34589ebaddd80fa03a 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -204,6 +204,15 @@ static const char *virtio_id_to_name(uint16_t device_id) return name; } +static void virtio_check_indirect_feature(VirtIODevice *vdev) +{ + if (!virtio_vdev_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC)) { + qemu_log_mask(LOG_GUEST_ERROR, + "Device %s: indirect_desc was not negotiated!\n", + vdev->name); + } +} + /* Called within call_rcu(). */ static void virtio_free_region_cache(VRingMemoryRegionCaches *caches) { @@ -322,7 +331,6 @@ static void vring_packed_event_read(VirtIODevice *vdev, /* Make sure flags is seen before off_wrap */ smp_rmb(); e->off_wrap = virtio_lduw_phys_cached(vdev, cache, off_off); - virtio_tswap16s(vdev, &e->flags); } static void vring_packed_off_wrap_write(VirtIODevice *vdev, @@ -744,6 +752,60 @@ int virtio_queue_empty(VirtQueue *vq) } } +static bool virtio_queue_split_poll(VirtQueue *vq, unsigned shadow_idx) +{ + if (unlikely(!vq->vring.avail)) { + return false; + } + + return (uint16_t)shadow_idx != vring_avail_idx(vq); +} + +static bool virtio_queue_packed_poll(VirtQueue *vq, unsigned shadow_idx) +{ + VRingPackedDesc desc; + VRingMemoryRegionCaches *caches; + + if (unlikely(!vq->vring.desc)) { + return false; + } + + caches = vring_get_region_caches(vq); + if (!caches) { + return false; + } + + vring_packed_desc_read(vq->vdev, &desc, &caches->desc, + shadow_idx, true); + + return is_desc_avail(desc.flags, vq->shadow_avail_wrap_counter); +} + +static bool virtio_queue_poll(VirtQueue *vq, unsigned shadow_idx) +{ + if (virtio_device_disabled(vq->vdev)) { + return false; + } + + if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED)) { + return virtio_queue_packed_poll(vq, shadow_idx); + } else { + return virtio_queue_split_poll(vq, shadow_idx); + } +} + +bool virtio_queue_enable_notification_and_check(VirtQueue *vq, + int opaque) +{ + virtio_queue_set_notification(vq, 1); + + if (opaque >= 0) { + return virtio_queue_poll(vq, (unsigned)opaque); + } else { + return false; + } +} + static void virtqueue_unmap_sg(VirtQueue *vq, const VirtQueueElement *elem, unsigned int len) { @@ -1323,9 +1385,9 @@ err: goto done; } -void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes, - unsigned int *out_bytes, - unsigned max_in_bytes, unsigned max_out_bytes) +int virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes, + unsigned int *out_bytes, unsigned max_in_bytes, + unsigned max_out_bytes) { uint16_t desc_size; VRingMemoryRegionCaches *caches; @@ -1358,7 +1420,7 @@ void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes, caches); } - return; + return (int)vq->shadow_avail_idx; err: if (in_bytes) { *in_bytes = 0; @@ -1366,6 +1428,8 @@ err: if (out_bytes) { *out_bytes = 0; } + + return -1; } int virtqueue_avail_bytes(VirtQueue *vq, unsigned int in_bytes, @@ -1559,6 +1623,8 @@ static void *virtqueue_split_pop(VirtQueue *vq, size_t sz) goto done; } + virtio_check_indirect_feature(vdev); + /* loop over the indirect descriptor table */ len = address_space_cache_init(&indirect_desc_cache, vdev->dma_as, desc.addr, desc.len, false); @@ -1689,6 +1755,8 @@ static void *virtqueue_packed_pop(VirtQueue *vq, size_t sz) goto done; } + virtio_check_indirect_feature(vdev); + /* loop over 
the indirect descriptor table */ len = address_space_cache_init(&indirect_desc_cache, vdev->dma_as, desc.addr, desc.len, false); @@ -2048,7 +2116,14 @@ int virtio_set_status(VirtIODevice *vdev, uint8_t val) k->set_status(vdev, val); } vdev->status = val; - + if (val) { + qemu_log("%s device status is %d that means %s\n", + vdev->name, val, + (val & VIRTIO_CONFIG_S_DRIVER_OK) ? "DRIVER OK" : + (val & VIRTIO_CONFIG_S_DRIVER) ? "DRIVER" : + (val & VIRTIO_CONFIG_S_ACKNOWLEDGE) ? "ACKNOWLEDGE" : + (val & VIRTIO_CONFIG_S_FAILED) ? "FAILED" : "UNKNOWN"); + } return 0; } @@ -2189,12 +2264,17 @@ void virtio_queue_set_rings(VirtIODevice *vdev, int n, hwaddr desc, void virtio_queue_set_num(VirtIODevice *vdev, int n, int num) { + int vq_max_size = VIRTQUEUE_MAX_SIZE; + + if (!strcmp(vdev->name, "virtio-net")) { + vq_max_size = VIRTIO_NET_VQ_MAX_SIZE; + } + /* Don't allow guest to flip queue between existent and * nonexistent states, or to set it to an invalid size. */ if (!!num != !!vdev->vq[n].vring.num || - num > VIRTQUEUE_MAX_SIZE || - num < 0) { + num > vq_max_size || num < 0) { return; } vdev->vq[n].vring.num = num; @@ -2326,8 +2406,11 @@ VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size, break; } - if (i == VIRTIO_QUEUE_MAX || queue_size > VIRTQUEUE_MAX_SIZE) + if (i == VIRTIO_QUEUE_MAX) { + qemu_log("unacceptable queue_size (%d) or num (%d)\n", + queue_size, i); abort(); + } vdev->vq[i].vring.num = queue_size; vdev->vq[i].vring.num_default = queue_size; @@ -2787,6 +2870,35 @@ static const VMStateDescription vmstate_virtio = { } }; +static void check_vring_avail_num(VirtIODevice *vdev, int index) +{ + uint16_t nheads; + VRingMemoryRegionCaches *caches; + + rcu_read_lock(); + caches = qatomic_rcu_read(&vdev->vq[index].vring.caches); + if (caches == NULL) { + /* + * caches may be NULL if virtio_reset is called at the same time, + * such as when the virtual machine starts. + */ + rcu_read_unlock(); + return; + } + + /* Check it isn't doing strange things with descriptor numbers. 
*/ + nheads = vring_avail_idx(&vdev->vq[index]) - vdev->vq[index].last_avail_idx; + if (nheads > vdev->vq[index].vring.num) { + qemu_log("VQ %d size 0x%x Guest index 0x%x " + "inconsistent with Host index 0x%x: " + "delta 0x%x\n", + index, vdev->vq[index].vring.num, + vring_avail_idx(&vdev->vq[index]), + vdev->vq[index].last_avail_idx, nheads); + } + rcu_read_unlock(); +} + int virtio_save(VirtIODevice *vdev, QEMUFile *f) { BusState *qbus = qdev_get_parent_bus(DEVICE(vdev)); @@ -2817,6 +2929,8 @@ int virtio_save(VirtIODevice *vdev, QEMUFile *f) if (vdev->vq[i].vring.num == 0) break; + check_vring_avail_num(vdev, i); + qemu_put_be32(f, vdev->vq[i].vring.num); if (k->has_variable_vring_alignment) { qemu_put_be32(f, vdev->vq[i].vring.align); @@ -2875,6 +2989,13 @@ static int virtio_set_features_nocheck(VirtIODevice *vdev, uint64_t val) { VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev); bool bad = (val & ~(vdev->host_features)) != 0; + uint64_t feat = val & ~(vdev->host_features); + + if (bad && k->print_features) { + qemu_log("error: Please check host config, "\ + "because host does not support required feature bits 0x%" PRIx64 "\n", feat); + k->print_features(feat); + } val &= vdev->host_features; if (k->set_features) { @@ -4095,3 +4216,13 @@ static void virtio_register_types(void) } type_init(virtio_register_types) + +QEMUBH *virtio_bh_new_guarded_full(DeviceState *dev, + QEMUBHFunc *cb, void *opaque, + const char *name) +{ + DeviceState *transport = qdev_get_parent_bus(dev)->parent; + + return qemu_bh_new_full(cb, opaque, name, + &transport->mem_reentrancy_guard); +} diff --git a/hw/xen/xen-bus.c b/hw/xen/xen-bus.c index 4973e7d9c95a6909c7fc69b867e933362f6e1e8b..c10b0899143a4377912840a58fa10d0092514e26 100644 --- a/hw/xen/xen-bus.c +++ b/hw/xen/xen-bus.c @@ -352,6 +352,7 @@ static void xen_bus_realize(BusState *bus, Error **errp) error_reportf_err(local_err, "failed to set up '%s' enumeration watch: ", type[i]); + local_err = NULL; } g_free(node); diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h index 4e31d161c57aeb5f51117973be8ff763d0d6548e..a6e2436524f02325670cf136e76eeee11e7431a3 100644 --- a/include/block/block_int-common.h +++ b/include/block/block_int-common.h @@ -57,6 +57,8 @@ #define BLOCK_OPT_DATA_FILE_RAW "data_file_raw" #define BLOCK_OPT_COMPRESSION_TYPE "compression_type" #define BLOCK_OPT_EXTL2 "extended_l2" +#define BLOCK_OPT_CACHE "cache" +#define BLOCK_OPT_BUFFER_SIZE "buffer_size" #define BLOCK_PROBE_BUF_SIZE 512 diff --git a/include/block/nbd.h b/include/block/nbd.h index 4e7bd6342f9cc39378e913cb47969331ca462ee0..d4f8b21aecc379cb0049a28cc2d39039791373e7 100644 --- a/include/block/nbd.h +++ b/include/block/nbd.h @@ -33,6 +33,19 @@ typedef struct NBDMetaContexts NBDMetaContexts; extern const BlockExportDriver blk_exp_nbd; +/* + * NBD_DEFAULT_HANDSHAKE_MAX_SECS: Number of seconds in which client must + * succeed at NBD_OPT_GO before being forcefully dropped as too slow. + */ +#define NBD_DEFAULT_HANDSHAKE_MAX_SECS 10 + +/* + * NBD_DEFAULT_MAX_CONNECTIONS: Number of client sockets to allow at + * once; must be large enough to allow a MULTI_CONN-aware client like + * nbdcopy to create its typical number of 8-16 sockets. 
+ */ +#define NBD_DEFAULT_MAX_CONNECTIONS 100 + /* Handshake phase structs - this struct is passed on the wire */ typedef struct NBDOption { @@ -403,9 +416,12 @@ AioContext *nbd_export_aio_context(NBDExport *exp); NBDExport *nbd_export_find(const char *name); void nbd_client_new(QIOChannelSocket *sioc, + uint32_t handshake_max_secs, QCryptoTLSCreds *tlscreds, const char *tlsauthz, - void (*close_fn)(NBDClient *, bool)); + void (*close_fn)(NBDClient *, bool), + void *owner); +void *nbd_client_owner(NBDClient *client); void nbd_client_get(NBDClient *client); void nbd_client_put(NBDClient *client); diff --git a/include/chardev/char.h b/include/chardev/char.h index 01df55f9e8c8da5dfd40cafa195cfa606e172fb7..f8bd4694664d20155236f65448840117d9a84f8a 100644 --- a/include/chardev/char.h +++ b/include/chardev/char.h @@ -14,6 +14,8 @@ #define IAC_SB 250 #define IAC 255 +#define CHR_FOR_VHOST_USER 0x32a1 + /* character device */ typedef struct CharBackend CharBackend; @@ -70,6 +72,7 @@ struct Chardev { GSource *gsource; GMainContext *gcontext; DECLARE_BITMAP(features, QEMU_CHAR_FEATURE_LAST); + int chr_for_flag; }; /** @@ -227,6 +230,16 @@ int qemu_chr_write(Chardev *s, const uint8_t *buf, int len, bool write_all); #define qemu_chr_write_all(s, buf, len) qemu_chr_write(s, buf, len, true) int qemu_chr_wait_connected(Chardev *chr, Error **errp); +/** + * @qemu_chr_set_reconnect_time: + * + * Set reconnect time for char disconnect. + * Currently, only vhost user will call it. + * + * @reconnect_time the reconnect_time to be set + */ +void qemu_chr_set_reconnect_time(Chardev *chr, int64_t reconnect_time); + #define TYPE_CHARDEV "chardev" OBJECT_DECLARE_TYPE(Chardev, ChardevClass, CHARDEV) @@ -306,6 +319,9 @@ struct ChardevClass { /* handle various events */ void (*chr_be_event)(Chardev *s, QEMUChrEvent event); + + /* set reconnect time */ + void (*chr_set_reconnect_time)(Chardev *chr, int64_t reconnect_time); }; Chardev *qemu_chardev_new(const char *id, const char *typename, diff --git a/include/exec/address-spaces.h b/include/exec/address-spaces.h index 0d0aa61d68966a56e65e35cc6f38687b7e16816f..4518b5da868283e2ed1fc45f68988e5fec9c6e17 100644 --- a/include/exec/address-spaces.h +++ b/include/exec/address-spaces.h @@ -33,6 +33,9 @@ MemoryRegion *get_system_io(void); extern AddressSpace address_space_memory; extern AddressSpace address_space_io; +extern AddressSpace address_space_virtcca_shared_memory; + +extern MemoryRegion *virtcca_shared_hugepage; #endif diff --git a/include/exec/confidential-guest-support.h b/include/exec/confidential-guest-support.h index ba2dd4b5dfc4a0c2d039b3e67718afc5844a6317..2cba27642f765a7ccd6ee33c1d366e2b12e0de81 100644 --- a/include/exec/confidential-guest-support.h +++ b/include/exec/confidential-guest-support.h @@ -53,8 +53,54 @@ struct ConfidentialGuestSupport { bool ready; }; +/** + * The functions registers with ConfidentialGuestMemoryEncryptionOps will be + * used during the encrypted guest migration. 
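+ * The hooks below cover per-page transfer, tracking of the shared
+ * (unencrypted) region list, batched transfer of queued pages, and the
+ * encrypted CPU state.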
+ */ +struct ConfidentialGuestMemoryEncryptionOps { + /* Initialize the platform specific state before starting the migration */ + int (*save_setup)(const char *pdh, const char *plat_cert, + const char *amd_cert); + + /* Write the encrypted page and metadata associated with it */ + int (*save_outgoing_page)(QEMUFile *f, uint8_t *ptr, uint32_t size, + uint64_t *bytes_sent); + + /* Load the incoming encrypted page into guest memory */ + int (*load_incoming_page)(QEMUFile *f, uint8_t *ptr); + + /* Check if gfn is in shared/unencrypted region */ + bool (*is_gfn_in_unshared_region)(unsigned long gfn); + + /* Write the shared regions list */ + int (*save_outgoing_shared_regions_list)(QEMUFile *f, uint64_t *bytes_sent); + + /* Load the shared regions list */ + int (*load_incoming_shared_regions_list)(QEMUFile *f); + + /* Queue the encrypted page and metadata associated with it into a list */ + int (*queue_outgoing_page)(uint8_t *ptr, uint32_t size, uint64_t addr); + + /* Write the list queued with encrypted pages and metadata associated + * with them */ + int (*save_queued_outgoing_pages)(QEMUFile *f, uint64_t *bytes_sent); + + /* Queue the incoming encrypted page into a list */ + int (*queue_incoming_page)(QEMUFile *f, uint8_t *ptr); + + /* Load the incoming encrypted pages queued in list into guest memory */ + int (*load_queued_incoming_pages)(QEMUFile *f); + + /* Write the encrypted cpu state */ + int (*save_outgoing_cpu_state)(QEMUFile *f, uint64_t *bytes_sent); + + /* Load the encrypted cpu state */ + int (*load_incoming_cpu_state)(QEMUFile *f); +}; + typedef struct ConfidentialGuestSupportClass { ObjectClass parent; + struct ConfidentialGuestMemoryEncryptionOps *memory_encryption_ops; } ConfidentialGuestSupportClass; #endif /* !CONFIG_USER_ONLY */ diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h index 41115d8919407109d83b9e633d0aacfa18dd03ac..d21d9990ad0e548b59bcf846c0dc919d45050f25 100644 --- a/include/exec/cpu-common.h +++ b/include/exec/cpu-common.h @@ -28,6 +28,7 @@ typedef uint64_t vaddr; void cpu_exec_init_all(void); void cpu_exec_step_atomic(CPUState *cpu); +void virtcca_shared_memory_address_space_init(void); /* Using intptr_t ensures that qemu_*_page_mask is sign-extended even * when intptr_t is 32-bit and we are aligning a long long. @@ -139,6 +140,14 @@ size_t qemu_ram_pagesize_largest(void); */ void cpu_address_space_init(CPUState *cpu, int asidx, const char *prefix, MemoryRegion *mr); +/** + * cpu_address_space_destroy: + * @cpu: CPU for which address space needs to be destroyed + * @asidx: integer index of this address space + * + * Note that with KVM only one address space is supported. 
+ */ +void cpu_address_space_destroy(CPUState *cpu, int asidx); void cpu_physical_memory_rw(hwaddr addr, void *buf, hwaddr len, bool is_write); @@ -157,8 +166,6 @@ void *cpu_physical_memory_map(hwaddr addr, bool is_write); void cpu_physical_memory_unmap(void *buffer, hwaddr len, bool is_write, hwaddr access_len); -void cpu_register_map_client(QEMUBH *bh); -void cpu_unregister_map_client(QEMUBH *bh); bool cpu_physical_memory_is_io(hwaddr phys_addr); diff --git a/include/exec/gdbstub.h b/include/exec/gdbstub.h index d8a3c56fa2b8f10c261d2d46f146a72202e557c3..e2e8dff051dcbbbb269cc3a7631153259453ab9c 100644 --- a/include/exec/gdbstub.h +++ b/include/exec/gdbstub.h @@ -40,6 +40,12 @@ void gdb_register_coprocessor(CPUState *cpu, gdb_get_reg_cb get_reg, gdb_set_reg_cb set_reg, int num_regs, const char *xml, int g_pos); +/** + * gdb_unregister_coprocessor_all() - unregisters supplemental set of registers + * @cpu - the CPU associated with registers + */ +void gdb_unregister_coprocessor_all(CPUState *cpu); + /** * gdbserver_start: start the gdb server * @port_or_device: connection spec for gdb diff --git a/include/exec/memop.h b/include/exec/memop.h index a86dc6743a620f021c0a34370a2affbdad25ae99..5b9064819ca91b4e643a5533833002291bf9498a 100644 --- a/include/exec/memop.h +++ b/include/exec/memop.h @@ -164,10 +164,4 @@ static inline MemOp size_memop(unsigned size) return ctz32(size); } -/* Big endianness from MemOp. */ -static inline bool memop_big_endian(MemOp op) -{ - return (op & MO_BSWAP) == MO_BE; -} - #endif diff --git a/include/exec/memory.h b/include/exec/memory.h index 831f7c996d9da49cdf9884fdeffa32959865cb07..0361ec20547bfa5fa9d856a7f2d4bbc23e06443f 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -243,6 +243,20 @@ typedef struct IOMMUTLBEvent { /* RAM FD is opened read-only */ #define RAM_READONLY_FD (1 << 11) +/* The address limit of the VirtCCA bounce buffer is 4GB. */ +#define VIRTCCA_SHARED_HUGEPAGE_ADDR_LIMIT 0x100000000ULL + +/* The VirtCCA shared hugepage memory granularity is 1GB */ +#define VIRTCCA_SHARED_HUGEPAGE_ALIGN 0x40000000ULL + +/* The default GPA starting address of VM is 1GB */ +#define DEFAULT_VM_GPA_START 0x40000000ULL + +/* The GPA starting address of the VirtCCA CVM is 1GB or 3GB */ +extern uint64_t virtcca_cvm_gpa_start; + +extern uint64_t virtcca_cvm_ram_size; + static inline void iommu_notifier_init(IOMMUNotifier *n, IOMMUNotify fn, IOMMUNotifierFlag flags, hwaddr start, hwaddr end, @@ -775,6 +789,17 @@ bool memory_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, ram_addr_t *ram_addr, bool *read_only, bool *mr_has_discard_manager); +typedef struct SharedRegionListener SharedRegionListener; +struct SharedRegionListener { + MemoryListener *listener; + AddressSpace *as; + QTAILQ_ENTRY(SharedRegionListener) next; +}; + +void shared_region_register_listener(SharedRegionListener *shl); +void shared_region_unregister_listener(SharedRegionListener *shl); +void *shared_region_listeners_get(void); + typedef struct CoalescedMemoryRange CoalescedMemoryRange; typedef struct MemoryRegionIoeventfd MemoryRegionIoeventfd; @@ -1054,6 +1079,27 @@ struct MemoryListener { void (*eventfd_del)(MemoryListener *listener, MemoryRegionSection *section, bool match_data, uint64_t data, EventNotifier *e); + /** + * @eventfd_begin: + * + * Called during an address space begin to update ioeventfd, + * notify kvm that ioeventfd will be update in batches. + * + * @listener: The #MemoryListener. 
+ */ + void (*eventfd_begin)(MemoryListener *listener); + + /** + * @eventfd_end: + * + * Called at the end of an address space ioeventfd update, to + * notify KVM that all ioeventfd modifications have been submitted + * and batch processing can begin. + * + * @listener: The #MemoryListener. + */ + void (*eventfd_end)(MemoryListener *listener); + /** * @coalesced_io_add: @@ -1106,6 +1152,13 @@ struct MemoryListener { QTAILQ_ENTRY(MemoryListener) link_as; }; +typedef struct AddressSpaceMapClient { + QEMUBH *bh; + QLIST_ENTRY(AddressSpaceMapClient) link; +} AddressSpaceMapClient; + +#define DEFAULT_MAX_BOUNCE_BUFFER_SIZE (4096) + /** * struct AddressSpace: describes a mapping of addresses to #MemoryRegion objects */ @@ -1114,6 +1167,7 @@ struct AddressSpace { struct rcu_head rcu; char *name; MemoryRegion *root; + bool free_in_rcu; /* Accessed via RCU. */ struct FlatView *current_map; @@ -1123,6 +1177,14 @@ struct AddressSpace { struct MemoryRegionIoeventfd *ioeventfds; QTAILQ_HEAD(, MemoryListener) listeners; QTAILQ_ENTRY(AddressSpace) address_spaces_link; + + /* Maximum DMA bounce buffer size used for indirect memory map requests */ + size_t max_bounce_buffer_size; + /* Total size of bounce buffers currently allocated, atomically accessed */ + size_t bounce_buffer_size; + /* List of callbacks to invoke when buffers free up */ + QemuMutex map_client_list_lock; + QLIST_HEAD(, AddressSpaceMapClient) map_client_list; }; typedef struct AddressSpaceDispatch AddressSpaceDispatch; @@ -1139,6 +1201,8 @@ struct FlatView { unsigned nr_allocated; struct AddressSpaceDispatch *dispatch; MemoryRegion *root; + #define FLATVIEW_FLAG_LAST_PROCESSED (1 << 0) + unsigned flags; }; static inline FlatView *address_space_to_flatview(AddressSpace *as) @@ -2044,6 +2108,16 @@ void memory_region_set_log(MemoryRegion *mr, bool log, unsigned client); void memory_region_set_dirty(MemoryRegion *mr, hwaddr addr, hwaddr size); +/** + * is_first_section: Determine whether a MemoryRegionSection is the first + * section of its corresponding parent MemoryRegion. + * + * @section: MemoryRegionSection + */ +bool is_first_section(MemoryRegionSection *section); + /** * memory_region_clear_dirty_bitmap - clear dirty bitmap for memory range * @@ -2526,6 +2600,11 @@ void memory_region_transaction_begin(void); */ void memory_region_transaction_commit(void); +/** + * memory_region_commit: Force an immediate commit of the memory region + * transaction. + */ +void memory_region_commit(void); + /** * memory_listener_register: register callbacks to be called when memory * sections are mapped or unmapped into an address @@ -2594,6 +2673,15 @@ MemTxResult memory_region_dispatch_write(MemoryRegion *mr, MemOp op, MemTxAttrs attrs); +/** + * memory_section_set_dirty_bytemap: Mark a range of pages as dirty for a + * memory section, using a bytemap + * + * @section: the memory section being dirtied. + * @bytemap: bytemap that stores dirty page range information. + */ +int64_t memory_section_set_dirty_bytemap(MemoryRegionSection *section, unsigned long *bytemap); + /** * address_space_init: initializes an address space * @@ -2906,8 +2994,8 @@ bool address_space_access_valid(AddressSpace *as, hwaddr addr, hwaddr len, * May return %NULL and set *@plen to zero(0), if resources needed to perform * the mapping are exhausted. * Use only for reads OR writes - not for read-modify-write operations. - * Use cpu_register_map_client() to know when retrying the map operation is - * likely to succeed.
+ * Use address_space_register_map_client() to know when retrying the map + * operation is likely to succeed. * * @as: #AddressSpace to be accessed * @addr: address within that address space @@ -2932,6 +3020,28 @@ void *address_space_map(AddressSpace *as, hwaddr addr, void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len, bool is_write, hwaddr access_len); +/* + * address_space_register_map_client: Register a callback to invoke when + * resources for address_space_map() are available again. + * + * address_space_map may fail when there are not enough resources available, + * such as when bounce buffer memory would exceed the limit. The callback can + * be used to retry the address_space_map operation. Note that the callback + * gets automatically removed after firing. + * + * @as: #AddressSpace to be accessed + * @bh: callback to invoke when address_space_map() retry is appropriate + */ +void address_space_register_map_client(AddressSpace *as, QEMUBH *bh); + +/* + * address_space_unregister_map_client: Unregister a callback that has + * previously been registered and not fired yet. + * + * @as: #AddressSpace to be accessed + * @bh: callback to unregister + */ +void address_space_unregister_map_client(AddressSpace *as, QEMUBH *bh); /* Internal functions, part of the implementation of address_space_read. */ MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr, diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h index 90676093f5d50c3e4fe6afc55e3ed26587eaf47e..ef6988b445e1734767551f59ff8a36d8cca8b1b3 100644 --- a/include/exec/ram_addr.h +++ b/include/exec/ram_addr.h @@ -535,5 +535,49 @@ uint64_t cpu_physical_memory_sync_dirty_bitmap(RAMBlock *rb, return num_dirty; } + +#define BYTES_PER_LONG (sizeof(unsigned long)) +#define BYTE_WORD(nr) ((nr) / BYTES_PER_LONG) +#define BYTES_TO_LONGS(nr) DIV_ROUND_UP(nr, BYTES_PER_LONG) + +static inline int64_t _set_dirty_bytemap_atomic(unsigned long *bytemap, unsigned long cur_pfn) +{ + char *byte_of_long = (char *)bytemap; + int i; + int64_t dirty_num = 0; + + for (i = 0; i < BYTES_PER_LONG; i++) { + if (byte_of_long[i]) { + cpu_physical_memory_set_dirty_range((cur_pfn + i) << TARGET_PAGE_BITS, + TARGET_PAGE_SIZE, + 1 << DIRTY_MEMORY_MIGRATION); + /* Per byte ops, no need to atomic_xchg */ + byte_of_long[i] = 0; + dirty_num++; + } + } + + return dirty_num; +} + +static inline int64_t cpu_physical_memory_set_dirty_bytemap(unsigned long *bytemap, + ram_addr_t start, + ram_addr_t pages) +{ + unsigned long i; + unsigned long len = BYTES_TO_LONGS(pages); + unsigned long pfn = (start >> TARGET_PAGE_BITS) / + BYTES_PER_LONG * BYTES_PER_LONG; + int64_t dirty_mig_bits = 0; + + for (i = 0; i < len; i++) { + if (bytemap[i]) { + dirty_mig_bits += _set_dirty_bytemap_atomic(&bytemap[i], + pfn + BYTES_PER_LONG * i); + } + } + + return dirty_mig_bits; +} #endif #endif diff --git a/include/exec/ramblock.h b/include/exec/ramblock.h index 69c6a53902939a2de7b38284baff5ed4a86daa63..8f9579ed705b5f6327e0dca9ffd3fda9bdefc725 100644 --- a/include/exec/ramblock.h +++ b/include/exec/ramblock.h @@ -44,7 +44,7 @@ struct RAMBlock { size_t page_size; /* dirty bitmap used during migration */ unsigned long *bmap; - /* bitmap of already received pages in postcopy */ + /* Bitmap of already received pages. Only used on destination side. 
*/ unsigned long *receivedmap; /* diff --git a/include/exec/ramlist.h b/include/exec/ramlist.h index 2ad2a81accfb6fc4ae4534e009d72be9cc558909..d9cfe530beaf386087d65730cdacf0bd43986edb 100644 --- a/include/exec/ramlist.h +++ b/include/exec/ramlist.h @@ -50,6 +50,7 @@ typedef struct RAMList { /* RCU-enabled, writes protected by the ramlist lock. */ QLIST_HEAD(, RAMBlock) blocks; DirtyMemoryBlocks *dirty_memory[DIRTY_MEMORY_NUM]; + unsigned int num_dirty_blocks; uint32_t version; QLIST_HEAD(, RAMBlockNotifier) ramblock_notifiers; } RAMList; diff --git a/include/hw/acpi/acpi-defs.h b/include/hw/acpi/acpi-defs.h index 2b42e4192bf62e01d34d8c148b86eb8b2322187f..7a8b708cdaadb44b850f39a147aa2ba6c70e2fc4 100644 --- a/include/hw/acpi/acpi-defs.h +++ b/include/hw/acpi/acpi-defs.h @@ -90,7 +90,80 @@ typedef struct AcpiFadtData { unsigned *xdsdt_tbl_offset; } AcpiFadtData; +typedef struct AcpiGas { + uint8_t id; /* Address space ID */ + uint8_t width; /* Register bit width */ + uint8_t offset; /* Register bit offset */ + uint8_t size; /* Access size */ + uint64_t addr; /* Address */ +} AcpiGas; + +/* SPCR (Serial Port Console Redirection table) */ +typedef struct AcpiSpcrData { + uint8_t interface_type; + uint8_t reserved[3]; + struct AcpiGas base_addr; + uint8_t interrupt_type; + uint8_t pc_interrupt; + uint32_t interrupt; /* Global system interrupt */ + uint8_t baud_rate; + uint8_t parity; + uint8_t stop_bits; + uint8_t flow_control; + uint8_t terminal_type; + uint8_t language; + uint8_t reserved1; + uint16_t pci_device_id; /* Must be 0xffff if not PCI device */ + uint16_t pci_vendor_id; /* Must be 0xffff if not PCI device */ + uint8_t pci_bus; + uint8_t pci_device; + uint8_t pci_function; + uint32_t pci_flags; + uint8_t pci_segment; + uint32_t reserved2; +} AcpiSpcrData; + #define ACPI_FADT_ARM_PSCI_COMPLIANT (1 << 0) #define ACPI_FADT_ARM_PSCI_USE_HVC (1 << 1) +/* + * CPPC register definition from kernel header + * include/acpi/cppc_acpi.h + * The last element is newly added for convenience + */ +enum cppc_regs { + HIGHEST_PERF, + NOMINAL_PERF, + LOW_NON_LINEAR_PERF, + LOWEST_PERF, + GUARANTEED_PERF, + DESIRED_PERF, + MIN_PERF, + MAX_PERF, + PERF_REDUC_TOLERANCE, + TIME_WINDOW, + CTR_WRAP_TIME, + REFERENCE_CTR, + DELIVERED_CTR, + PERF_LIMITED, + ENABLE, + AUTO_SEL_ENABLE, + AUTO_ACT_WINDOW, + ENERGY_PERF, + REFERENCE_PERF, + LOWEST_FREQ, + NOMINAL_FREQ, + CPPC_REG_COUNT, +}; + +#define CPPC_REG_PER_CPU_STRIDE 0x40 + +/* + * Offset for each CPPC register; -1 for unavailable. + * The whole register space is unavailable if the desired perf offset is -1. + */ +extern int cppc_regs_offset[CPPC_REG_COUNT]; + #endif diff --git a/include/hw/acpi/aml-build.h b/include/hw/acpi/aml-build.h index ff2a310270a44f25821f324479a1390379b9d894..381ad4a8a1782b52b62a0e2c5289bc0e5d780f27 100644 --- a/include/hw/acpi/aml-build.h +++ b/include/hw/acpi/aml-build.h @@ -382,6 +382,9 @@ Aml *aml_dma(AmlDmaType typ, AmlDmaBusMaster bm, AmlTransferSize sz, uint8_t channel); Aml *aml_sleep(uint64_t msec); Aml *aml_i2c_serial_bus_device(uint16_t address, const char *resource_source); +Aml *aml_generic_register(AmlRegionSpace rs, uint8_t reg_width, + uint8_t reg_offset, AmlAccessType type, + uint64_t addr); /* Block AML object primitives */ Aml *aml_scope(const char *name_format, ...)
G_GNUC_PRINTF(1, 2); @@ -412,6 +415,7 @@ Aml *aml_sizeof(Aml *arg); Aml *aml_concatenate(Aml *source1, Aml *source2, Aml *target); Aml *aml_object_type(Aml *object); +void build_append_byte(GArray *array, uint8_t val); void build_append_int_noprefix(GArray *table, uint64_t value, int size); typedef struct AcpiTable { @@ -489,6 +493,11 @@ void build_srat_memory(GArray *table_data, uint64_t base, void build_slit(GArray *table_data, BIOSLinker *linker, MachineState *ms, const char *oem_id, const char *oem_table_id); +void build_processor_hierarchy_node(GArray *tbl, uint32_t flags, + uint32_t parent, uint32_t id, + uint32_t *priv_rsrc, + uint32_t priv_num); + void build_pptt(GArray *table_data, BIOSLinker *linker, MachineState *ms, const char *oem_id, const char *oem_table_id); @@ -497,4 +506,8 @@ void build_fadt(GArray *tbl, BIOSLinker *linker, const AcpiFadtData *f, void build_tpm2(GArray *table_data, BIOSLinker *linker, GArray *tcpalog, const char *oem_id, const char *oem_table_id); + +void build_spcr(GArray *table_data, BIOSLinker *linker, + const AcpiSpcrData *f, const uint8_t rev, + const char *oem_id, const char *oem_table_id); #endif diff --git a/include/hw/acpi/cpu.h b/include/hw/acpi/cpu.h index bc901660fb6fd6f585c553636c1fcb183d2c1276..fa5b5e5f0136e617461593dda2a1c7539b7e16d5 100644 --- a/include/hw/acpi/cpu.h +++ b/include/hw/acpi/cpu.h @@ -18,11 +18,15 @@ #include "hw/boards.h" #include "hw/hotplug.h" +#define ACPI_CPU_HOTPLUG_REG_LEN 12 + typedef struct AcpiCpuStatus { - struct CPUState *cpu; + CPUState *cpu; uint64_t arch_id; bool is_inserting; bool is_removing; + bool is_present; + bool is_enabled; bool fw_remove; uint32_t ost_event; uint32_t ost_status; @@ -59,10 +63,15 @@ typedef struct CPUHotplugFeatures { typedef void (*build_madt_cpu_fn)(int uid, const CPUArchIdList *apic_ids, GArray *entry, bool force_enabled); +typedef void (*build_cpu_cppc_fn)(int uid, int num_cpu, Aml *dev); + void build_cpus_aml(Aml *table, MachineState *machine, CPUHotplugFeatures opts, - build_madt_cpu_fn build_madt_cpu, hwaddr io_base, + build_madt_cpu_fn build_madt_cpu, + build_cpu_cppc_fn build_cpu_cppc, + hwaddr base_addr, const char *res_root, - const char *event_handler_method); + const char *event_handler_method, + AmlRegionSpace rs); void acpi_cpu_ospm_status(CPUHotplugState *cpu_st, ACPIOSTInfoList ***list); diff --git a/include/hw/acpi/cpu_hotplug.h b/include/hw/acpi/cpu_hotplug.h index 3b932abbbbeeba3e5c2dc00b0ae1d99312682925..ef631750b4441e92cbf218f2d40373339202a31c 100644 --- a/include/hw/acpi/cpu_hotplug.h +++ b/include/hw/acpi/cpu_hotplug.h @@ -19,6 +19,10 @@ #include "hw/hotplug.h" #include "hw/acpi/cpu.h" +#define ACPI_CPU_HOTPLUG_REG_LEN 12 +#define ACPI_CPU_SCAN_METHOD "CSCN" +#define ACPI_CPU_CONTAINER "\\_SB.CPUS" + typedef struct AcpiCpuHotplug { Object *device; MemoryRegion io; diff --git a/include/hw/acpi/generic_event_device.h b/include/hw/acpi/generic_event_device.h index ba84ce02147726309c489256f69d7b9be9ef5a7b..d1df3c12e535a4ca0d1e1f63f69aa107d5c7adfa 100644 --- a/include/hw/acpi/generic_event_device.h +++ b/include/hw/acpi/generic_event_device.h @@ -60,8 +60,10 @@ #define HW_ACPI_GENERIC_EVENT_DEVICE_H #include "hw/sysbus.h" +#include "hw/acpi/cpu_hotplug.h" #include "hw/acpi/memory_hotplug.h" #include "hw/acpi/ghes.h" +#include "hw/acpi/cpu.h" #include "qom/object.h" #define ACPI_POWER_BUTTON_DEVICE "PWRB" @@ -80,8 +82,11 @@ OBJECT_DECLARE_SIMPLE_TYPE(AcpiGedState, ACPI_GED) /* ACPI_GED_REG_RESET value for reset*/ #define ACPI_GED_RESET_VALUE 0x42 -/* 
ACPI_GED_REG_SLEEP_CTL.SLP_TYP value for S5 (aka poweroff) */ -#define ACPI_GED_SLP_TYP_S5 0x05 +/* [ACPI 5.0 Chapter 4.8.3.7] Sleep Control and Status Register */ +#define ACPI_GED_SLP_TYP_POS 0x2 /* SLP_TYPx Bit Offset */ +#define ACPI_GED_SLP_TYP_MASK 0x07 /* SLP_TYPx 3-bit mask */ +#define ACPI_GED_SLP_TYP_S5 0x05 /* System _S5 State (Soft Off) */ +#define ACPI_GED_SLP_EN 0x20 /* SLP_EN write-only bit */ #define GED_DEVICE "GED" #define AML_GED_EVT_REG "EREG" @@ -95,6 +100,7 @@ OBJECT_DECLARE_SIMPLE_TYPE(AcpiGedState, ACPI_GED) #define ACPI_GED_MEM_HOTPLUG_EVT 0x1 #define ACPI_GED_PWR_DOWN_EVT 0x2 #define ACPI_GED_NVDIMM_HOTPLUG_EVT 0x4 +#define ACPI_GED_CPU_HOTPLUG_EVT 0x8 typedef struct GEDState { MemoryRegion evt; @@ -106,6 +112,8 @@ struct AcpiGedState { SysBusDevice parent_obj; MemHotplugState memhp_state; MemoryRegion container_memhp; + CPUHotplugState cpuhp_state; + MemoryRegion container_cpuhp; GEDState ged_state; uint32_t ged_event_bitmap; qemu_irq irq; diff --git a/include/hw/arm/boot.h b/include/hw/arm/boot.h index 80c492d7421e840174acadd71d9e525fa84fcc86..06ca1d90b222483ffb769ab3c83f6056f0ebbb11 100644 --- a/include/hw/arm/boot.h +++ b/include/hw/arm/boot.h @@ -39,6 +39,7 @@ void armv7m_load_kernel(ARMCPU *cpu, const char *kernel_filename, /* arm_boot.c */ struct arm_boot_info { uint64_t ram_size; + void *numa_info; const char *kernel_filename; const char *kernel_cmdline; const char *initrd_filename; @@ -132,6 +133,9 @@ struct arm_boot_info { bool secure_board_setup; arm_endianness endianness; + hwaddr firmware_base; + hwaddr firmware_max_size; + bool confidential; }; /** @@ -178,6 +182,8 @@ AddressSpace *arm_boot_address_space(ARMCPU *cpu, int arm_load_dtb(hwaddr addr, const struct arm_boot_info *binfo, hwaddr addr_limit, AddressSpace *as, MachineState *ms); +void do_cpu_reset(void *opaque); + /* Write a secure board setup routine with a dummy handler for SMCs */ void arm_write_secure_board_setup_dummy_smc(ARMCPU *cpu, const struct arm_boot_info *info, diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h index fd8d772da117060d9bccc9b1ea05a394e40e8421..8ae33c375349f3236a2777ff5159112aeec81b78 100644 --- a/include/hw/arm/smmu-common.h +++ b/include/hw/arm/smmu-common.h @@ -22,6 +22,8 @@ #include "hw/sysbus.h" #include "hw/pci/pci.h" #include "qom/object.h" +#include "sysemu/iommufd.h" +#include #define SMMU_PCI_BUS_MAX 256 #define SMMU_PCI_DEVFN_MAX 256 @@ -49,6 +51,13 @@ typedef enum { SMMU_PTW_ERR_PERMISSION, /* Permission fault */ } SMMUPTWEventType; +/* SMMU Stage */ +typedef enum { + SMMU_STAGE_1 = 1, + SMMU_STAGE_2, + SMMU_NESTED, +} SMMUStage; + typedef struct SMMUPTWEventInfo { int stage; SMMUPTWEventType type; @@ -106,14 +115,73 @@ typedef struct SMMUTransCfg { struct SMMUS2Cfg s2cfg; } SMMUTransCfg; +typedef struct SMMUS2Hwpt { + IOMMUFDBackend *iommufd; + uint32_t hwpt_id; + uint32_t ioas_id; +} SMMUS2Hwpt; + +typedef struct SMMUViommu { + void *smmu; + IOMMUFDBackend *iommufd; + IOMMUFDViommu *core; + SMMUS2Hwpt *s2_hwpt; + uint32_t bypass_hwpt_id; + uint32_t abort_hwpt_id; + QLIST_HEAD(, SMMUDevice) device_list; + QLIST_ENTRY(SMMUViommu) next; +} SMMUViommu; + +typedef struct SMMUVdev { + SMMUViommu *vsmmu; + IOMMUFDVdev *core; + uint32_t sid; +}SMMUVdev; + +typedef struct PendFaultEntry { + struct iommu_hwpt_pgfault fault; + QTAILQ_ENTRY(PendFaultEntry) entry; +} PendFaultEntry; + +typedef struct PageRespEntry { + struct iommu_hwpt_page_response resp; + QTAILQ_ENTRY(PageRespEntry) entry; +} PageRespEntry; + +typedef struct SMMUS1Hwpt { + 
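/* Editor's note: opaque back-pointers, presumably to the owning
+ * SMMUDevice and SMMUState (an assumption from the field names). */ +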
void *sdev; + void *smmu; + IOMMUFDBackend *iommufd; + SMMUViommu *viommu; + uint32_t hwpt_id; + uint32_t out_fault_fd; + QLIST_HEAD(, SMMUDevice) device_list; + QLIST_ENTRY(SMMUViommu) next; + /* fault handling */ + struct io_uring fault_ring; + QemuThread read_fault_thread; + QemuThread write_fault_thread; + QemuMutex fault_mutex; + QemuCond fault_cond; + QTAILQ_HEAD(, PageRespEntry) pageresp; + QTAILQ_HEAD(, PendFaultEntry) pendfault; + bool exiting; +} SMMUS1Hwpt; + typedef struct SMMUDevice { void *smmu; PCIBus *bus; int devfn; IOMMUMemoryRegion iommu; + HostIOMMUDeviceIOMMUFD *idev; + SMMUViommu *viommu; + SMMUVdev *vdev; + SMMUS1Hwpt *s1_hwpt; AddressSpace as; + AddressSpace as_sysmem; uint32_t cfg_cache_hits; uint32_t cfg_cache_misses; + struct iommu_hw_info_arm_smmuv3 info; QLIST_ENTRY(SMMUDevice) next; } SMMUDevice; @@ -134,7 +202,13 @@ struct SMMUState { /* */ SysBusDevice dev; const char *mrtypename; + MemoryRegion root; MemoryRegion iomem; + MemoryRegion sysmem; + + /* Nested SMMU */ + bool nested; + SMMUViommu *viommu; GHashTable *smmu_pcibus_by_busptr; GHashTable *configs; /* cache for configuration data */ @@ -181,8 +255,8 @@ int smmu_ptw(SMMUTransCfg *cfg, dma_addr_t iova, IOMMUAccessFlags perm, */ SMMUTransTableInfo *select_tt(SMMUTransCfg *cfg, dma_addr_t iova); -/* Return the iommu mr associated to @sid, or NULL if none */ -IOMMUMemoryRegion *smmu_iommu_mr(SMMUState *s, uint32_t sid); +/* Return the SMMUDevice associated to @sid, or NULL if none */ +SMMUDevice *smmu_find_sdev(SMMUState *s, uint32_t sid); #define SMMU_IOTLB_MAX_SIZE 256 @@ -200,4 +274,15 @@ void smmu_iotlb_inv_iova(SMMUState *s, int asid, int vmid, dma_addr_t iova, /* Unmap the range of all the notifiers registered to any IOMMU mr */ void smmu_inv_notifiers_all(SMMUState *s); +/* IOMMUFD helpers */ +int smmu_dev_get_info(SMMUDevice *sdev, uint32_t *data_type, + uint32_t data_len, uint8_t *pasid, void *data); +void smmu_dev_uninstall_nested_ste(SMMUDevice *sdev, bool abort); +int smmu_dev_install_nested_ste(SMMUDevice *sdev, uint32_t data_type, + uint32_t data_len, void *data, + bool req_fault_fd); +int smmu_hwpt_invalidate_cache(SMMUS1Hwpt *s1_hwpt, uint32_t type, uint32_t len, + uint32_t *num, void *reqs); +int smmu_viommu_invalidate_cache(IOMMUFDViommu *viommu, uint32_t type, + uint32_t len, uint32_t *num, void *reqs); #endif /* HW_ARM_SMMU_COMMON_H */ diff --git a/include/hw/arm/smmuv3.h b/include/hw/arm/smmuv3.h index d183a627669b5164dc5fa9446f4da203498fcb06..79b6fcd8e7b79f60d83ba8dd0d2f7e7e58e11a14 100644 --- a/include/hw/arm/smmuv3.h +++ b/include/hw/arm/smmuv3.h @@ -84,6 +84,23 @@ struct SMMUv3Class { #define TYPE_ARM_SMMUV3 "arm-smmuv3" OBJECT_DECLARE_TYPE(SMMUv3State, SMMUv3Class, ARM_SMMUV3) +#define TYPE_ARM_SMMUV3_ACCEL "arm-smmuv3-accel" +OBJECT_DECLARE_TYPE(SMMUv3AccelState, SMMUv3AccelClass, ARM_SMMUV3_ACCEL) + +struct SMMUv3AccelState { + SMMUv3State smmuv3_state; + + char *pci_bus; +}; + +struct SMMUv3AccelClass { + /*< private >*/ + SMMUv3Class smmuv3_class; + /*< public >*/ + + DeviceRealize parent_realize; +}; + #define STAGE1_SUPPORTED(s) FIELD_EX32(s->idr[0], IDR0, S1P) #define STAGE2_SUPPORTED(s) FIELD_EX32(s->idr[0], IDR0, S2P) diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h index f69239850e617332ec3ed223edcf4a69019ec973..3e2759d225066c96e9a408dda79e1520639c06f1 100644 --- a/include/hw/arm/virt.h +++ b/include/hw/arm/virt.h @@ -47,6 +47,71 @@ /* See Linux kernel arch/arm64/include/asm/pvclock-abi.h */ #define PVTIME_SIZE_PER_CPU 64 +/* ARM CLIDR_EL1 related definitions 
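+ * Editor's note: the CLIDR_CTYPE() helper below extracts the 3-bit cache
+ * type field for a given level, e.g. CLIDR_CTYPE(clidr, 2) yields
+ * CTYPE_UNIFIED (0b100) when the L2 cache is unified.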
*/ +/* Ctypen, bits[3(n - 1) + 2 : 3(n - 1)], for n = 1 to 7 */ +#define CTYPE_NONE 0b000 +#define CTYPE_INS 0b001 +#define CTYPE_DATA 0b010 +#define CTYPE_INS_DATA 0b011 +#define CTYPE_UNIFIED 0b100 + +#define ARM64_REG_CLIDR_EL1 ARM64_SYS_REG(3, 1, 0, 0, 1) + +#define CLIDR_CTYPE_SHIFT(level) (3 * (level - 1)) +#define CLIDR_CTYPE_MASK(level) (7 << CLIDR_CTYPE_SHIFT(level)) +#define CLIDR_CTYPE(clidr, level) \ + (((clidr) & CLIDR_CTYPE_MASK(level)) >> CLIDR_CTYPE_SHIFT(level)) + +/* L1 data cache */ +#define ARM_L1DCACHE_SIZE 65536 +#define ARM_L1DCACHE_SETS 256 +#define ARM_L1DCACHE_ASSOCIATIVITY 4 +#define ARM_L1DCACHE_ATTRIBUTES 2 +#define ARM_L1DCACHE_LINE_SIZE 64 + +/* L1 instruction cache */ +#define ARM_L1ICACHE_SIZE 65536 +#define ARM_L1ICACHE_SETS 256 +#define ARM_L1ICACHE_ASSOCIATIVITY 4 +#define ARM_L1ICACHE_ATTRIBUTES 4 +#define ARM_L1ICACHE_LINE_SIZE 64 + +/* L1 unified cache */ +#define ARM_L1CACHE_SIZE 131072 +#define ARM_L1CACHE_SETS 256 +#define ARM_L1CACHE_ASSOCIATIVITY 4 +#define ARM_L1CACHE_ATTRIBUTES 10 +#define ARM_L1CACHE_LINE_SIZE 128 + +/* L2 unified cache */ +#define ARM_L2CACHE_SIZE 524288 +#define ARM_L2CACHE_SETS 1024 +#define ARM_L2CACHE_ASSOCIATIVITY 8 +#define ARM_L2CACHE_ATTRIBUTES 10 +#define ARM_L2CACHE_LINE_SIZE 64 + +/* L3 unified cache */ +#define ARM_L3CACHE_SIZE 33554432 +#define ARM_L3CACHE_SETS 2048 +#define ARM_L3CACHE_ASSOCIATIVITY 15 +#define ARM_L3CACHE_ATTRIBUTES 10 +#define ARM_L3CACHE_LINE_SIZE 128 + +/* Definitions of the hardcoded cache info */ +typedef enum { + ARM_L1D_CACHE, + ARM_L1I_CACHE, + ARM_L1_CACHE, + ARM_L2_CACHE, + ARM_L3_CACHE +} ArmCacheType; + +/* MMIO region size for SMMUv3 */ +#define SMMU_IO_LEN 0x20000 + +/* Max supported nested SMMUv3 */ +#define MAX_SMMU_ACCEL 64 + enum { VIRT_FLASH, VIRT_MEM, @@ -59,12 +124,16 @@ enum { VIRT_GIC_ITS, VIRT_GIC_REDIST, VIRT_SMMU, + VIRT_SMMU_ACCEL, VIRT_UART, + VIRT_CPUFREQ, VIRT_MMIO, + VIRT_CVM_MSI, VIRT_RTC, VIRT_FW_CFG, VIRT_PCIE, VIRT_PCIE_MMIO, + VIRT_KAE_DEVICE, VIRT_PCIE_PIO, VIRT_PCIE_ECAM, VIRT_PLATFORM_BUS, @@ -75,6 +144,7 @@ enum { VIRT_PCDIMM_ACPI, VIRT_ACPI_GED, VIRT_NVDIMM_ACPI, + VIRT_CPUHP_ACPI, VIRT_PVTIME, VIRT_LOWMEMMAP_LAST, }; @@ -89,6 +159,7 @@ enum { typedef enum VirtIOMMUType { VIRT_IOMMU_NONE, VIRT_IOMMU_SMMUV3, + VIRT_IOMMU_SMMUV3_ACCEL, VIRT_IOMMU_VIRTIO, } VirtIOMMUType; @@ -138,6 +209,10 @@ struct VirtMachineState { DeviceState *platform_bus_dev; FWCfgState *fw_cfg; PFlashCFI01 *flash[2]; + MemoryRegion *sysmem; + MemoryRegion *secure_sysmem; + MemoryRegion *tag_sysmem; + MemoryRegion *secure_tag_sysmem; bool secure; bool highmem; bool highmem_compact; @@ -147,9 +222,12 @@ struct VirtMachineState { bool its; bool tcg_its; bool virt; + bool cpu_hotplug_enabled; bool ras; bool mte; bool dtb_randomness; + bool pmu; + int smmu_accel_count; OnOffAuto acpi; VirtGICType gic_version; VirtIOMMUType iommu; @@ -160,6 +238,7 @@ struct VirtMachineState { MemMapEntry *memmap; char *pciehb_nodename; const int *irqmap; + uint16_t boot_cpus; int fdt_size; uint32_t clock_phandle; uint32_t gic_phandle; @@ -173,6 +252,8 @@ struct VirtMachineState { PCIBus *bus; char *oem_id; char *oem_table_id; + char *kvm_type; + NotifierList cpuhp_notifiers; }; #define VIRT_ECAM_ID(high) (high ? 
VIRT_HIGH_PCIE_ECAM : VIRT_PCIE_ECAM) @@ -182,6 +263,7 @@ OBJECT_DECLARE_TYPE(VirtMachineState, VirtMachineClass, VIRT_MACHINE) void virt_acpi_setup(VirtMachineState *vms); bool virt_is_acpi_enabled(VirtMachineState *vms); +bool cpu_l1_cache_unified(int cpu); /* Return number of redistributors that fit in the specified region */ static uint32_t virt_redist_capacity(VirtMachineState *vms, int region) @@ -200,11 +282,23 @@ static inline int virt_gicv3_redist_region_count(VirtMachineState *vms) { uint32_t redist0_capacity = virt_redist_capacity(vms, VIRT_GIC_REDIST); + MachineState *ms = MACHINE(vms); + unsigned int max_cpus = ms->smp.max_cpus; + + if (!vms->cpu_hotplug_enabled) { + max_cpus = ms->smp.cpus; + } assert(vms->gic_version != VIRT_GIC_VERSION_2); - return (MACHINE(vms)->smp.cpus > redist0_capacity && + return (max_cpus > redist0_capacity && vms->highmem_redists) ? 2 : 1; } +static inline bool virt_has_smmuv3(const VirtMachineState *vms) +{ + return vms->iommu == VIRT_IOMMU_SMMUV3 || + vms->iommu == VIRT_IOMMU_SMMUV3_ACCEL; +} + #endif /* QEMU_ARM_VIRT_H */ diff --git a/include/hw/block/block.h b/include/hw/block/block.h index 15fff6643502e51a0896189d12709e1622c2b339..844e87495ad81407f9bf29f78c8612ab3626784a 100644 --- a/include/hw/block/block.h +++ b/include/hw/block/block.h @@ -34,6 +34,8 @@ typedef struct BlockConf { OnOffAuto account_invalid, account_failed; BlockdevOnError rerror; BlockdevOnError werror; + int64_t retry_interval; + int64_t retry_timeout; } BlockConf; static inline unsigned int get_physical_block_exp(BlockConf *conf) @@ -84,7 +86,11 @@ static inline unsigned int get_physical_block_exp(BlockConf *conf) DEFINE_PROP_BLOCKDEV_ON_ERROR("rerror", _state, _conf.rerror, \ BLOCKDEV_ON_ERROR_AUTO), \ DEFINE_PROP_BLOCKDEV_ON_ERROR("werror", _state, _conf.werror, \ - BLOCKDEV_ON_ERROR_AUTO) + BLOCKDEV_ON_ERROR_AUTO), \ + DEFINE_PROP_BLOCKDEV_RETRY_INTERVAL("retry_interval", _state, \ + _conf.retry_interval, 1000), \ + DEFINE_PROP_BLOCKDEV_RETRY_TIMEOUT("retry_timeout", _state, \ + _conf.retry_timeout, 0) /* Backend access helpers */ diff --git a/include/hw/boards.h b/include/hw/boards.h index da85f86efb9185df05a444bd22eeec0557a3e3c8..8ac8cad2a2225e97df1475c2f20118ccf2af394e 100644 --- a/include/hw/boards.h +++ b/include/hw/boards.h @@ -389,6 +389,8 @@ struct MachineState { ram_addr_t ram_size; ram_addr_t maxram_size; + ram_addr_t ram2_base; + ram_addr_t ram2_size; uint64_t ram_slots; BootConfiguration boot_config; char *kernel_filename; diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h index c0c8320413e5f66f3c026ec6b4b89374b9cde7b3..37f3a469c897e09f73fc23a1fa999bee7f834238 100644 --- a/include/hw/core/cpu.h +++ b/include/hw/core/cpu.h @@ -495,7 +495,8 @@ struct CPUState { QemuMutex work_mutex; QSIMPLEQ_HEAD(, qemu_work_item) work_list; - CPUAddressSpace *cpu_ases; + struct CPUAddressSpace *cpu_ases; + int cpu_ases_count; int num_ases; AddressSpace *as; MemoryRegion *memory; @@ -528,6 +529,7 @@ struct CPUState { uint32_t kvm_fetch_index; uint64_t dirty_pages; int kvm_vcpu_stats_fd; + VMChangeStateEntry *vmcse; /* Use by accel-block: CPU is executing an ioctl() */ QemuLockCnt in_ioctl_lock; @@ -538,6 +540,24 @@ struct CPUState { GArray *plugin_mem_cbs; #endif + /* + * Some architectures do not allow the *presence* of vCPUs to be changed + * after the guest has booted, because presence is conveyed to the guest + * by the VMM/firmware via the ACPI MADT at boot time.
Thus, to enable vCPU hotplug on these + * architectures, a possible vCPU can have its CPUState object in a + * 'disabled' state, or can have no CPUState object at all. This is + * possible when vCPU hotplug is supported and vCPUs are + * 'yet-to-be-plugged' in the QOM or have been hot-unplugged. + * By default, every CPUState is currently enabled across all + * architectures. + */ + bool disabled; + /* + * On certain architectures, to give the guest a persistent view of vCPU + * 'presence', ACPI might need to fake the 'presence' of the vCPUs while + * keeping them ACPI-disabled for the guest. This is done by returning + * _STA.PRES=True and _STA.Ena=False for the unplugged vCPUs in QEMU QoM. + */ + bool acpi_persistent; /* TODO Move common fields from CPUArchState here. */ int cpu_index; int cluster_index; @@ -545,6 +565,8 @@ struct CPUState { uint32_t halted; int32_t exception_index; + bool cold_booted; + AccelCPUState *accel; /* shared by kvm and hvf */ bool vcpu_dirty; @@ -913,6 +935,61 @@ static inline bool cpu_in_exclusive_context(const CPUState *cpu) */ CPUState *qemu_get_cpu(int index); +/** + * qemu_get_possible_cpu: + * @index: The CPUState@cpu_index value of the CPU to obtain. + * The input index MUST be in the range [0, Max Possible CPUs) + * + * If a CPUState object exists, then it gets the CPU matching + * @index in the possible CPU array. + * + * Returns: The possible CPU or %NULL if the CPU does not exist. + */ +CPUState *qemu_get_possible_cpu(int index); + +/** + * qemu_present_cpu: + * @cpu: The vCPU to check + * + * Checks if the vCPU is amongst the present possible vCPUs. + * + * Returns: True if it is a present possible vCPU, else false + */ +bool qemu_present_cpu(CPUState *cpu); + +/** + * qemu_enabled_cpu: + * @cpu: The vCPU to check + * + * Checks if the vCPU is enabled. + * + * Returns: True if it is 'enabled', else false + */ +bool qemu_enabled_cpu(CPUState *cpu); + +/** + * qemu_persistent_cpu: + * @cpu: The vCPU to check + * + * Checks if the vCPU state should always be reflected as *present* via ACPI + * to the Guest. By default, this is False on all architectures and has to be + * explicitly set during initialization. + * + * Returns: True if it is an ACPI 'persistent' CPU + */ +bool qemu_persistent_cpu(CPUState *cpu); + +/** + * qemu_get_cpu_archid: + * @cpu_index: possible vCPU for which the arch-id needs to be retrieved + * + * Fetches the vCPU arch-id from the present possible vCPUs. + * + * Returns: arch-id of the possible vCPU + */ +uint64_t qemu_get_cpu_archid(int cpu_index); + /** * cpu_exists: * @id: Guest-exposed CPU ID to lookup.
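/*
 * Editor's illustration (not part of the patch): one plausible way for board
 * code to walk the possible-vCPU space with the helpers declared above. The
 * function name and the use of ms->smp.max_cpus as the bound are assumptions
 * made for this sketch; qemu_get_possible_cpu() may return NULL for vCPUs
 * that are yet-to-be-plugged or have been hot-unplugged.
 */
static int count_enabled_vcpus(MachineState *ms)
{
    int enabled = 0;

    for (int i = 0; i < ms->smp.max_cpus; i++) {
        CPUState *cs = qemu_get_possible_cpu(i);

        if (cs && qemu_enabled_cpu(cs)) {
            enabled++; /* present and ACPI-enabled */
        }
    }
    return enabled;
}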
diff --git a/include/hw/elf_ops.h b/include/hw/elf_ops.h index 0a5c258fe6898c58b5b7aa03dee4286e70c20541..9c35d1b9da6c36198b6f49e99db6eb49cdfae058 100644 --- a/include/hw/elf_ops.h +++ b/include/hw/elf_ops.h @@ -500,7 +500,7 @@ static ssize_t glue(load_elf, SZ)(const char *name, int fd, } if (data_swab) { - int j; + elf_word j; for (j = 0; j < file_size; j += (1 << data_swab)) { uint8_t *dp = data + j; switch (data_swab) { diff --git a/include/hw/firmware/smbios.h b/include/hw/firmware/smbios.h index 7f3259a6300af0d7b8a359b879f5dcb99cf4d485..6e514982d49c40fef516fe1220aaed7f31062435 100644 --- a/include/hw/firmware/smbios.h +++ b/include/hw/firmware/smbios.h @@ -295,6 +295,7 @@ void smbios_set_cpuid(uint32_t version, uint32_t features); void smbios_set_defaults(const char *manufacturer, const char *product, const char *version, bool legacy_mode, bool uuid_encoded, SmbiosEntryPointType ep_type); +void smbios_set_default_processor_family(uint16_t processor_family); uint8_t *smbios_get_table_legacy(MachineState *ms, size_t *length); void smbios_get_tables(MachineState *ms, const struct smbios_phys_mem_area *mem_array, diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h index 7fa0a695c87bb8569fd6985e299fc0a1cc4b0c0c..1eb05c29fc9c703a61f06d90616694e74fb61c15 100644 --- a/include/hw/i386/intel_iommu.h +++ b/include/hw/i386/intel_iommu.h @@ -292,6 +292,8 @@ struct IntelIOMMUState { /* list of registered notifiers */ QLIST_HEAD(, VTDAddressSpace) vtd_as_with_notifiers; + GHashTable *vtd_host_iommu_dev; /* HostIOMMUDevice */ + /* interrupt remapping */ bool intr_enabled; /* Whether guest enabled IR */ dma_addr_t intr_root; /* Interrupt remapping table pointer */ diff --git a/include/hw/intc/arm_gicv3_common.h b/include/hw/intc/arm_gicv3_common.h index 4e2fb518e7240074465b18aab9ea96d5752146a9..b5f8ba17ff30c80b70eaa1e2e31060970f437b9b 100644 --- a/include/hw/intc/arm_gicv3_common.h +++ b/include/hw/intc/arm_gicv3_common.h @@ -280,6 +280,7 @@ struct GICv3State { GICv3CPUState *gicd_irouter_target[GICV3_MAXIRQ]; uint32_t gicd_nsacr[DIV_ROUND_UP(GICV3_MAXIRQ, 16)]; + Notifier cpu_update_notifier; GICv3CPUState *cpu; /* List of all ITSes connected to this GIC */ GPtrArray *itslist; @@ -324,10 +325,32 @@ struct ARMGICv3CommonClass { void (*pre_save)(GICv3State *s); void (*post_load)(GICv3State *s); + void (*init_cpu_reginfo)(CPUState *cs); }; void gicv3_init_irqs_and_mmio(GICv3State *s, qemu_irq_handler handler, const MemoryRegionOps *ops); +/** + * Structure used by the GICv3 CPU hotplug notifier + */ +typedef struct GICv3CPUHotplugInfo { + DeviceState *gic; /* GICv3State */ + CPUState *cpu; +} GICv3CPUHotplugInfo; + +/** + * gicv3_cpuhp_notifier: + * @dev: the GICv3 #DeviceState + * + * Returns the CPU hotplug notifier, which can be used to update the GIC + * about any CPU hot(un)plug events.
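+ *
+ * Editor's sketch: a board would typically chain this onto its own hotplug
+ * notifier list, e.g. (vms and gicdev being hypothetical names):
+ *   notifier_list_add(&vms->cpuhp_notifiers, gicv3_cpuhp_notifier(gicdev));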
+ * + * Returns: Notifier initialized with CPU Hot(un)plug update function + */ +static inline Notifier *gicv3_cpuhp_notifier(DeviceState *dev) +{ + GICv3State *s = ARM_GICV3_COMMON(dev); + return &s->cpu_update_notifier; +} /** * gicv3_class_name diff --git a/include/hw/intc/loongarch_extioi.h b/include/hw/intc/loongarch_extioi.h index fbdef9a7b3b74ec70cd12878b5a2680d90d4a651..92b38d5c38bea88f7b7c107199b910201d72281d 100644 --- a/include/hw/intc/loongarch_extioi.h +++ b/include/hw/intc/loongarch_extioi.h @@ -15,7 +15,7 @@ #define EXTIOI_IRQS (256) #define EXTIOI_IRQS_BITMAP_SIZE (256 / 8) /* irq from EXTIOI is routed to no more than 4 cpus */ -#define EXTIOI_CPUS (4) +#define EXTIOI_CPUS (256) /* map to ipnum per 32 irqs */ #define EXTIOI_IRQS_IPMAP_SIZE (256 / 32) #define EXTIOI_IRQS_COREMAP_SIZE 256 @@ -39,26 +39,88 @@ #define EXTIOI_COREISR_END (0xB20 - APIC_OFFSET) #define EXTIOI_COREMAP_START (0xC00 - APIC_OFFSET) #define EXTIOI_COREMAP_END (0xD00 - APIC_OFFSET) +#define EXTIOI_SIZE 0x800 + +#define EXTIOI_VIRT_BASE (0x40000000) +#define EXTIOI_VIRT_SIZE (0x1000) +#define EXTIOI_VIRT_FEATURES (0x0) +#define EXTIOI_HAS_VIRT_EXTENSION (0) +#define EXTIOI_HAS_ENABLE_OPTION (1) +#define EXTIOI_HAS_INT_ENCODE (2) +#define EXTIOI_HAS_CPU_ENCODE (3) +#define EXTIOI_VIRT_HAS_FEATURES (BIT(EXTIOI_HAS_VIRT_EXTENSION) \ + | BIT(EXTIOI_HAS_ENABLE_OPTION)\ + | BIT(EXTIOI_HAS_INT_ENCODE) \ + | BIT(EXTIOI_HAS_CPU_ENCODE)) +#define EXTIOI_VIRT_CONFIG (0x4) +#define EXTIOI_ENABLE (1) +#define EXTIOI_ENABLE_INT_ENCODE (2) +#define EXTIOI_ENABLE_CPU_ENCODE (3) +#define EXTIOI_VIRT_COREMAP_START (0x40) +#define EXTIOI_VIRT_COREMAP_END (0x240) + +#define EXTIOI_SW_COREMAP_FLAG (1 << 0) + +typedef struct ExtIOICore { + uint32_t coreisr[EXTIOI_IRQS_GROUP_COUNT]; + DECLARE_BITMAP(sw_isr[LS3A_INTC_IP], EXTIOI_IRQS); + qemu_irq parent_irq[LS3A_INTC_IP]; +} ExtIOICore; + +#define TYPE_LOONGARCH_EXTIOI "loongarch-extioi" +#define TYPE_KVM_LOONGARCH_EXTIOI "loongarch-kvm-extioi" -#define TYPE_LOONGARCH_EXTIOI "loongarch.extioi" OBJECT_DECLARE_SIMPLE_TYPE(LoongArchExtIOI, LOONGARCH_EXTIOI) struct LoongArchExtIOI { SysBusDevice parent_obj; + uint32_t num_cpu; + uint32_t features; + uint32_t status; /* hardware state */ uint32_t nodetype[EXTIOI_IRQS_NODETYPE_COUNT / 2]; uint32_t bounce[EXTIOI_IRQS_GROUP_COUNT]; uint32_t isr[EXTIOI_IRQS / 32]; - uint32_t coreisr[EXTIOI_CPUS][EXTIOI_IRQS_GROUP_COUNT]; uint32_t enable[EXTIOI_IRQS / 32]; uint32_t ipmap[EXTIOI_IRQS_IPMAP_SIZE / 4]; uint32_t coremap[EXTIOI_IRQS / 4]; uint32_t sw_pending[EXTIOI_IRQS / 32]; - DECLARE_BITMAP(sw_isr[EXTIOI_CPUS][LS3A_INTC_IP], EXTIOI_IRQS); uint8_t sw_ipmap[EXTIOI_IRQS_IPMAP_SIZE]; uint8_t sw_coremap[EXTIOI_IRQS]; - qemu_irq parent_irq[EXTIOI_CPUS][LS3A_INTC_IP]; qemu_irq irq[EXTIOI_IRQS]; - MemoryRegion extioi_iocsr_mem[EXTIOI_CPUS]; + ExtIOICore *cpu; MemoryRegion extioi_system_mem; + MemoryRegion virt_extend; +}; + +struct KVMLoongArchExtIOI { + SysBusDevice parent_obj; + uint32_t num_cpu; + uint32_t features; + uint32_t status; + + /* hardware state */ + uint32_t nodetype[EXTIOI_IRQS_NODETYPE_COUNT / 2]; + uint32_t bounce[EXTIOI_IRQS_GROUP_COUNT]; + uint32_t isr[EXTIOI_IRQS / 32]; + uint32_t coreisr[EXTIOI_CPUS][EXTIOI_IRQS_GROUP_COUNT]; + uint32_t enable[EXTIOI_IRQS / 32]; + uint32_t ipmap[EXTIOI_IRQS_IPMAP_SIZE / 4]; + uint32_t coremap[EXTIOI_IRQS / 4]; + uint8_t sw_coremap[EXTIOI_IRQS]; }; +typedef struct KVMLoongArchExtIOI KVMLoongArchExtIOI; +DECLARE_INSTANCE_CHECKER(KVMLoongArchExtIOI, KVM_LOONGARCH_EXTIOI, + 
TYPE_KVM_LOONGARCH_EXTIOI) + +struct KVMLoongArchExtIOIClass { + SysBusDeviceClass parent_class; + DeviceRealize parent_realize; + + bool is_created; + int dev_fd; +}; +typedef struct KVMLoongArchExtIOIClass KVMLoongArchExtIOIClass; +DECLARE_CLASS_CHECKERS(KVMLoongArchExtIOIClass, KVM_LOONGARCH_EXTIOI, + TYPE_KVM_LOONGARCH_EXTIOI) + #endif /* LOONGARCH_EXTIOI_H */ diff --git a/include/hw/intc/loongarch_ipi.h b/include/hw/intc/loongarch_ipi.h index 6c6194786e807317a3aa800e94e36ee1bef4652d..601b4f18a7b835457ad12578d836c001ca23797b 100644 --- a/include/hw/intc/loongarch_ipi.h +++ b/include/hw/intc/loongarch_ipi.h @@ -32,6 +32,7 @@ #define TYPE_LOONGARCH_IPI "loongarch_ipi" OBJECT_DECLARE_SIMPLE_TYPE(LoongArchIPI, LOONGARCH_IPI) +#define TYPE_KVM_LOONGARCH_IPI "loongarch-ipi-kvm" typedef struct IPICore { uint32_t status; @@ -47,7 +48,30 @@ struct LoongArchIPI { SysBusDevice parent_obj; MemoryRegion ipi_iocsr_mem; MemoryRegion ipi64_iocsr_mem; - IPICore ipi_core; + uint32_t num_cpu; + IPICore *cpu; }; +struct KVMLoongArchIPI { + SysBusDevice parent_obj; + uint32_t num_cpu; + IPICore *cpu; +}; +typedef struct KVMLoongArchIPI KVMLoongArchIPI; +DECLARE_INSTANCE_CHECKER(KVMLoongArchIPI, KVM_LOONGARCH_IPI, + TYPE_KVM_LOONGARCH_IPI) + +struct KVMLoongArchIPIClass { + SysBusDeviceClass parent_class; + DeviceRealize parent_realize; + + bool is_created; + int dev_fd; + +}; +typedef struct KVMLoongArchIPIClass KVMLoongArchIPIClass; +DECLARE_CLASS_CHECKERS(KVMLoongArchIPIClass, KVM_LOONGARCH_IPI, + TYPE_KVM_LOONGARCH_IPI) + + #endif diff --git a/include/hw/intc/loongarch_pch_msi.h b/include/hw/intc/loongarch_pch_msi.h index b8586fb3b6f6b170915d4b126ad4d562dec1890a..fd4ea97a83b3e7032efec0126179e5f0eb73ba2a 100644 --- a/include/hw/intc/loongarch_pch_msi.h +++ b/include/hw/intc/loongarch_pch_msi.h @@ -7,7 +7,7 @@ #include "hw/sysbus.h" -#define TYPE_LOONGARCH_PCH_MSI "loongarch_pch_msi" +#define TYPE_LOONGARCH_PCH_MSI "loongarch_pch_msi" OBJECT_DECLARE_SIMPLE_TYPE(LoongArchPCHMSI, LOONGARCH_PCH_MSI) /* MSI irq start from 32 to 255 */ diff --git a/include/hw/intc/loongarch_pch_pic.h b/include/hw/intc/loongarch_pch_pic.h index d5437e88f289880b1ab4d93f143e32eb4608c743..77f4cd74a129755a3f580b742655a1f6e915f26e 100644 --- a/include/hw/intc/loongarch_pch_pic.h +++ b/include/hw/intc/loongarch_pch_pic.h @@ -7,7 +7,8 @@ #include "hw/sysbus.h" -#define TYPE_LOONGARCH_PCH_PIC "loongarch_pch_pic" +#define TYPE_LOONGARCH_PCH_PIC "loongarch_pch_pic" +#define TYPE_KVM_LOONGARCH_PCH_PIC "loongarch_kvm_pch_pic" #define PCH_PIC_NAME(name) TYPE_LOONGARCH_PCH_PIC#name OBJECT_DECLARE_SIMPLE_TYPE(LoongArchPCHPIC, LOONGARCH_PCH_PIC) @@ -37,6 +38,19 @@ OBJECT_DECLARE_SIMPLE_TYPE(LoongArchPCHPIC, LOONGARCH_PCH_PIC) #define PCH_PIC_INT_POL_LO 0x3e0 #define PCH_PIC_INT_POL_HI 0x3e4 +#define PCH_PIC_INT_ID_START PCH_PIC_INT_ID_LO +#define PCH_PIC_MASK_START PCH_PIC_INT_MASK_LO +#define PCH_PIC_HTMSI_EN_START PCH_PIC_HTMSI_EN_LO +#define PCH_PIC_EDGE_START PCH_PIC_INT_EDGE_LO +#define PCH_PIC_CLEAR_START PCH_PIC_INT_CLEAR_LO +#define PCH_PIC_AUTO_CTRL0_START PCH_PIC_AUTO_CTRL0_LO +#define PCH_PIC_AUTO_CTRL1_START PCH_PIC_AUTO_CTRL1_LO +#define PCH_PIC_ROUTE_ENTRY_START PCH_PIC_ROUTE_ENTRY_OFFSET +#define PCH_PIC_HTMSI_VEC_START PCH_PIC_HTMSI_VEC_OFFSET +#define PCH_PIC_INT_IRR_START 0x380 +#define PCH_PIC_INT_ISR_START PCH_PIC_INT_STATUS_LO +#define PCH_PIC_POLARITY_START PCH_PIC_INT_POL_LO + #define STATUS_LO_START 0 #define STATUS_HI_START 0x4 #define POL_LO_START 0x40 @@ -67,3 +81,38 @@ struct LoongArchPCHPIC { MemoryRegion iomem8; 
unsigned int irq_num; }; + +struct KVMLoongArchPCHPIC { + SysBusDevice parent_obj; + uint64_t int_mask; /* 0x020 interrupt mask register */ + uint64_t htmsi_en; /* 0x040 1 = MSI */ + uint64_t intedge; /* 0x060 edge = 1, level = 0 */ + uint64_t intclr; /* 0x080 clears edge interrupts: writing 1 clears, writing 0 is ignored */ + uint64_t auto_crtl0; /* 0x0c0 */ + uint64_t auto_crtl1; /* 0x0e0 */ + uint64_t last_intirr; /* edge detection */ + uint64_t intirr; /* 0x380 interrupt request register */ + uint64_t intisr; /* 0x3a0 interrupt service register */ + /* + * 0x3e0 interrupt level polarity selection + * register; 0 for high-level trigger + */ + uint64_t int_polarity; + + uint8_t route_entry[64]; /* 0x100 - 0x138 */ + uint8_t htmsi_vector[64]; /* 0x200 - 0x238 */ +}; +typedef struct KVMLoongArchPCHPIC KVMLoongArchPCHPIC; +DECLARE_INSTANCE_CHECKER(KVMLoongArchPCHPIC, KVM_LOONGARCH_PCH_PIC, + TYPE_KVM_LOONGARCH_PCH_PIC) + +struct KVMLoongArchPCHPICClass { + SysBusDeviceClass parent_class; + DeviceRealize parent_realize; + + bool is_created; + int dev_fd; +}; +typedef struct KVMLoongArchPCHPICClass KVMLoongArchPCHPICClass; +DECLARE_CLASS_CHECKERS(KVMLoongArchPCHPICClass, KVM_LOONGARCH_PCH_PIC, + TYPE_KVM_LOONGARCH_PCH_PIC) diff --git a/include/hw/loongarch/boot.h b/include/hw/loongarch/boot.h new file mode 100644 index 0000000000000000000000000000000000000000..b3b870df1f097662dff7fc7c173f152da583d374 --- /dev/null +++ b/include/hw/loongarch/boot.h @@ -0,0 +1,119 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Definitions for LoongArch boot. + * + * Copyright (C) 2023 Loongson Technology Corporation Limited + */ + +#ifndef HW_LOONGARCH_BOOT_H +#define HW_LOONGARCH_BOOT_H + +/* UEFI 2.10 */ +#define EFI_SYSTEM_TABLE_SIGNATURE 0x5453595320494249 +#define EFI_2_100_SYSTEM_TABLE_REVISION ((2<<16) | (100)) +#define EFI_SPECIFICATION_VERSION EFI_SYSTEM_TABLE_REVISION +#define EFI_SYSTEM_TABLE_REVISION EFI_2_100_SYSTEM_TABLE_REVISION + +#define FW_VERSION 0x1 +#define FW_PATCHLEVEL 0x0 + +typedef struct { + uint8_t b[16]; +} efi_guid_t QEMU_ALIGNED(8); + +#define EFI_GUID(a, b, c, d...)
(efi_guid_t){ { \ + (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \ + (b) & 0xff, ((b) >> 8) & 0xff, \ + (c) & 0xff, ((c) >> 8) & 0xff, d } } + +#define LINUX_EFI_BOOT_MEMMAP_GUID \ + EFI_GUID(0x800f683f, 0xd08b, 0x423a, 0xa2, 0x93, \ + 0x96, 0x5c, 0x3c, 0x6f, 0xe2, 0xb4) + +#define LINUX_EFI_INITRD_MEDIA_GUID \ + EFI_GUID(0x5568e427, 0x68fc, 0x4f3d, 0xac, 0x74, \ + 0xca, 0x55, 0x52, 0x31, 0xcc, 0x68) + +#define DEVICE_TREE_GUID \ + EFI_GUID(0xb1b621d5, 0xf19c, 0x41a5, 0x83, 0x0b, \ + 0xd9, 0x15, 0x2c, 0x69, 0xaa, 0xe0) + +struct efi_config_table { + efi_guid_t guid; + uint64_t *ptr; + const char name[16]; +}; + +typedef struct { + uint64_t signature; + uint32_t revision; + uint32_t headersize; + uint32_t crc32; + uint32_t reserved; +} efi_table_hdr_t; + +struct efi_configuration_table { + efi_guid_t guid; + void *table; +}; + +struct efi_system_table { + efi_table_hdr_t hdr; + uint64_t fw_vendor; /* physical addr of CHAR16 vendor string */ + uint32_t fw_revision; + uint64_t con_in_handle; + uint64_t *con_in; + uint64_t con_out_handle; + uint64_t *con_out; + uint64_t stderr_handle; + uint64_t stderr_placeholder; + uint64_t *runtime; + uint64_t *boottime; + uint64_t nr_tables; + struct efi_configuration_table *tables; +}; + +typedef struct { + uint32_t type; + uint32_t pad; + uint64_t phys_addr; + uint64_t virt_addr; + uint64_t num_pages; + uint64_t attribute; +} efi_memory_desc_t; + +struct efi_boot_memmap { + uint64_t map_size; + uint64_t desc_size; + uint32_t desc_ver; + uint64_t map_key; + uint64_t buff_size; + efi_memory_desc_t map[32]; +}; + +struct efi_initrd { + uint64_t base; + uint64_t size; +}; + +struct loongarch_boot_info { + uint64_t ram_size; + const char *kernel_filename; + const char *kernel_cmdline; + const char *initrd_filename; + uint64_t a0, a1, a2; +}; + +extern struct memmap_entry *memmap_table; +extern unsigned memmap_entries; + +struct memmap_entry { + uint64_t address; + uint64_t length; + uint32_t type; + uint32_t reserved; +}; + +void loongarch_load_kernel(MachineState *ms, struct loongarch_boot_info *info); + +#endif /* HW_LOONGARCH_BOOT_H */ diff --git a/include/hw/loongarch/virt.h b/include/hw/loongarch/virt.h index 674f4655e0ee974c215a23234cff2e67fa2d03e1..0edce84d6db7dca7f004f57a475ca35216dea46a 100644 --- a/include/hw/loongarch/virt.h +++ b/include/hw/loongarch/virt.h @@ -8,29 +8,51 @@ #ifndef HW_LOONGARCH_H #define HW_LOONGARCH_H -#include "target/loongarch/cpu.h" #include "hw/boards.h" #include "qemu/queue.h" #include "hw/intc/loongarch_ipi.h" #include "hw/block/flash.h" +#include "hw/loongarch/boot.h" #define LOONGARCH_MAX_CPUS 256 #define VIRT_FWCFG_BASE 0x1e020000UL #define VIRT_BIOS_BASE 0x1c000000UL -#define VIRT_BIOS_SIZE (4 * MiB) +#define VIRT_BIOS_SIZE (16 * MiB) #define VIRT_FLASH_SECTOR_SIZE (128 * KiB) -#define VIRT_FLASH_BASE 0x1d000000UL -#define VIRT_FLASH_SIZE (16 * MiB) +#define VIRT_FLASH0_BASE VIRT_BIOS_BASE +#define VIRT_FLASH0_SIZE VIRT_BIOS_SIZE +#define VIRT_FLASH1_BASE 0x1d000000UL +#define VIRT_FLASH1_SIZE (16 * MiB) #define VIRT_LOWMEM_BASE 0 #define VIRT_LOWMEM_SIZE 0x10000000 -#define VIRT_HIGHMEM_BASE 0x90000000 +#define VIRT_HIGHMEM_BASE 0x80000000 #define VIRT_GED_EVT_ADDR 0x100e0000 #define VIRT_GED_MEM_ADDR (VIRT_GED_EVT_ADDR + ACPI_GED_EVT_SEL_LEN) #define VIRT_GED_REG_ADDR (VIRT_GED_MEM_ADDR + MEMORY_HOTPLUG_IO_LEN) +#define VIRT_GED_CPUHP_ADDR (VIRT_GED_REG_ADDR + ACPI_GED_REG_COUNT) -struct LoongArchMachineState { +#define COMMAND_LINE_SIZE 512 + +#define FDT_BASE 0x100000 + +/* KVM_IRQ_LINE 
irq field index values */ +#define KVM_LOONGARCH_IRQ_TYPE_SHIFT 24 +#define KVM_LOONGARCH_IRQ_TYPE_MASK 0xff +#define KVM_LOONGARCH_IRQ_VCPU_SHIFT 16 +#define KVM_LOONGARCH_IRQ_VCPU_MASK 0xff +#define KVM_LOONGARCH_IRQ_NUM_SHIFT 0 +#define KVM_LOONGARCH_IRQ_NUM_MASK 0xffff + +/* irq_type field */ +#define KVM_LOONGARCH_IRQ_TYPE_CPU_IP 0 +#define KVM_LOONGARCH_IRQ_TYPE_CPU_IO 1 +#define KVM_LOONGARCH_IRQ_TYPE_HT 2 +#define KVM_LOONGARCH_IRQ_TYPE_MSI 3 +#define KVM_LOONGARCH_IRQ_TYPE_IOAPIC 4 + +struct LoongArchVirtMachineState { /*< private >*/ MachineState parent_obj; @@ -43,17 +65,25 @@ struct LoongArchMachineState { Notifier machine_done; Notifier powerdown_notifier; OnOffAuto acpi; + OnOffAuto veiointc; char *oem_id; char *oem_table_id; DeviceState *acpi_ged; int fdt_size; DeviceState *platform_bus_dev; + DeviceState *extioi; PCIBus *pci_bus; - PFlashCFI01 *flash; + PFlashCFI01 *flash[2]; + MemoryRegion system_iocsr; + MemoryRegion iocsr_mem; + AddressSpace as_iocsr; + int features; + struct loongarch_boot_info bootinfo; + DeviceState *ipi; }; -#define TYPE_LOONGARCH_MACHINE MACHINE_TYPE_NAME("virt") -OBJECT_DECLARE_SIMPLE_TYPE(LoongArchMachineState, LOONGARCH_MACHINE) -bool loongarch_is_acpi_enabled(LoongArchMachineState *lams); -void loongarch_acpi_setup(LoongArchMachineState *lams); +#define TYPE_LOONGARCH_VIRT_MACHINE MACHINE_TYPE_NAME("virt") +OBJECT_DECLARE_SIMPLE_TYPE(LoongArchVirtMachineState, LOONGARCH_VIRT_MACHINE) +void loongarch_acpi_setup(LoongArchVirtMachineState *lvms); +void reset_load_elf(void *opaque); #endif diff --git a/include/hw/misc/mos6522.h b/include/hw/misc/mos6522.h index fba45668abae32b06b82deb4df404b3f046d81d3..920871a598bc68254f6116e15424cd23b5ebcd0d 100644 --- a/include/hw/misc/mos6522.h +++ b/include/hw/misc/mos6522.h @@ -154,7 +154,7 @@ struct MOS6522State { OBJECT_DECLARE_TYPE(MOS6522State, MOS6522DeviceClass, MOS6522) struct MOS6522DeviceClass { - DeviceClass parent_class; + SysBusDeviceClass parent_class; ResettablePhases parent_phases; void (*portB_write)(MOS6522State *dev); diff --git a/include/hw/pci-host/gpex.h b/include/hw/pci-host/gpex.h index b0240bd7681fb36db4e6b64ed056855e28c363de..65475f7f9d128640b876c317ef7daff9287168e7 100644 --- a/include/hw/pci-host/gpex.h +++ b/include/hw/pci-host/gpex.h @@ -64,6 +64,7 @@ struct GPEXConfig { MemMapEntry pio; int irq; PCIBus *bus; + bool preserve_config; }; int gpex_set_irq_num(GPEXHost *s, int index, int gsi); diff --git a/include/hw/pci-host/ls7a.h b/include/hw/pci-host/ls7a.h index e753449593bf035d0b82cd99e6a0d338556aba45..79d4ea85012747bf27449b7d02c29d9a577d49c9 100644 --- a/include/hw/pci-host/ls7a.h +++ b/include/hw/pci-host/ls7a.h @@ -24,6 +24,8 @@ #define VIRT_PCH_REG_BASE 0x10000000UL #define VIRT_IOAPIC_REG_BASE (VIRT_PCH_REG_BASE) #define VIRT_PCH_MSI_ADDR_LOW 0x2FF00000UL +#define VIRT_PCH_REG_SIZE 0x400 +#define VIRT_PCH_MSI_SIZE 0x8 /* * GSI_BASE is hard-coded with 64 in linux kernel, else kernel fails to boot @@ -34,17 +36,18 @@ #define VIRT_PCH_PIC_IRQ_NUM 32 #define VIRT_GSI_BASE 64 #define VIRT_DEVICE_IRQS 16 +#define VIRT_UART_COUNT 4 #define VIRT_UART_IRQ (VIRT_GSI_BASE + 2) #define VIRT_UART_BASE 0x1fe001e0 -#define VIRT_UART_SIZE 0X100 -#define VIRT_RTC_IRQ (VIRT_GSI_BASE + 3) +#define VIRT_UART_SIZE 0x100 +#define VIRT_RTC_IRQ (VIRT_GSI_BASE + 6) #define VIRT_MISC_REG_BASE (VIRT_PCH_REG_BASE + 0x00080000) #define VIRT_RTC_REG_BASE (VIRT_MISC_REG_BASE + 0x00050100) #define VIRT_RTC_LEN 0x100 -#define VIRT_SCI_IRQ (VIRT_GSI_BASE + 4) +#define VIRT_SCI_IRQ (VIRT_GSI_BASE + 7) #define 
VIRT_PLATFORM_BUS_BASEADDRESS 0x16000000 #define VIRT_PLATFORM_BUS_SIZE 0x2000000 #define VIRT_PLATFORM_BUS_NUM_IRQS 2 -#define VIRT_PLATFORM_BUS_IRQ (VIRT_GSI_BASE + 5) +#define VIRT_PLATFORM_BUS_IRQ (VIRT_GSI_BASE + 8) #endif diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h index fa6313aabc43b81594ee95d1d17bd9653aeb7a69..0dfe274c33f29c3f327d1f4a68a5fc77acbd8512 100644 --- a/include/hw/pci/pci.h +++ b/include/hw/pci/pci.h @@ -3,6 +3,7 @@ #include "exec/memory.h" #include "sysemu/dma.h" +#include "sysemu/host_iommu_device.h" /* PCI includes legacy ISA access. */ #include "hw/isa/isa.h" @@ -15,7 +16,7 @@ extern bool pci_available; #define PCI_BUS_NUM(x) (((x) >> 8) & 0xff) #define PCI_SLOT(devfn) (((devfn) >> 3) & 0x1f) #define PCI_FUNC(devfn) ((devfn) & 0x07) -#define PCI_BUILD_BDF(bus, devfn) ((bus << 8) | (devfn)) +#define PCI_BUILD_BDF(bus, devfn) (((bus) << 8) | (devfn)) #define PCI_BDF_TO_DEVFN(x) ((x) & 0xff) #define PCI_BUS_MAX 256 #define PCI_DEVFN_MAX 256 @@ -384,10 +385,58 @@ typedef struct PCIIOMMUOps { * * @devfn: device and function number */ - AddressSpace * (*get_address_space)(PCIBus *bus, void *opaque, int devfn); + AddressSpace * (*get_address_space)(PCIBus *bus, void *opaque, int devfn); + /** + * @set_iommu_device: attach a HostIOMMUDevice to a vIOMMU + * + * Optional callback, if not implemented in vIOMMU, then vIOMMU can't + * retrieve host information from the associated HostIOMMUDevice. + * + * @bus: the #PCIBus of the PCI device. + * + * @opaque: the data passed to pci_setup_iommu(). + * + * @devfn: device and function number of the PCI device. + * + * @dev: the #HostIOMMUDevice to attach. + * + * @errp: pass an Error out only when return false + * + * Returns: true if HostIOMMUDevice is attached or else false with errp set. + */ + bool (*set_iommu_device)(PCIBus *bus, void *opaque, int devfn, + HostIOMMUDevice *dev, Error **errp); + /** + * @unset_iommu_device: detach a HostIOMMUDevice from a vIOMMU + * + * Optional callback. + * + * @bus: the #PCIBus of the PCI device. + * + * @opaque: the data passed to pci_setup_iommu(). + * + * @devfn: device and function number of the PCI device. + */ + void (*unset_iommu_device)(PCIBus *bus, void *opaque, int devfn); + /** + * @get_pasid_cap: get pasid capability from vIOMMU + * + * Optional callback. + * + * @bus: the #PCIBus of the PCI device. + * + * @opaque: the data passed to pci_setup_iommu(). + * + * @devfn: device and function number of the PCI device. + */ + bool (*get_pasid_cap)(PCIBus *bus, void *opaque, int devfn); } PCIIOMMUOps; AddressSpace *pci_device_iommu_address_space(PCIDevice *dev); +bool pci_device_set_iommu_device(PCIDevice *dev, HostIOMMUDevice *hiod, + Error **errp); +void pci_device_unset_iommu_device(PCIDevice *dev); +bool pci_device_get_pasid_cap(PCIDevice *dev); /** * pci_setup_iommu: Initialize specific IOMMU handlers for a PCIBus @@ -632,16 +681,6 @@ static inline void pci_irq_deassert(PCIDevice *pci_dev) pci_set_irq(pci_dev, 0); } -/* - * FIXME: PCI does not work this way. - * All the callers to this method should be fixed. 
- */ -static inline void pci_irq_pulse(PCIDevice *pci_dev) -{ - pci_irq_assert(pci_dev); - pci_irq_deassert(pci_dev); -} - MSIMessage pci_get_msi_message(PCIDevice *dev, int vector); void pci_set_power(PCIDevice *pci_dev, bool state); diff --git a/include/hw/pci/pci_device.h b/include/hw/pci/pci_device.h index d3dd0f64b273ad1d192e43afd27c020a414f0a6b..253b48a688b06a356375e4c5c2d12b054d371f87 100644 --- a/include/hw/pci/pci_device.h +++ b/include/hw/pci/pci_device.h @@ -160,6 +160,9 @@ struct PCIDevice { /* ID of standby device in net_failover pair */ char *failover_pair_id; uint32_t acpi_index; + + /* Maximum DMA bounce buffer size used for indirect memory map requests */ + uint32_t max_bounce_buffer_size; }; static inline int pci_intx(PCIDevice *pci_dev) diff --git a/include/hw/pci/pcie.h b/include/hw/pci/pcie.h index 11f5a91bbb7ec318912e2440daec4cf7d3e5d669..41ee27f0234aa5b19bb158c0c49504af2650e729 100644 --- a/include/hw/pci/pcie.h +++ b/include/hw/pci/pcie.h @@ -79,6 +79,9 @@ struct PCIExpressDevice { uint16_t sriov_cap; PCIESriovPF sriov_pf; PCIESriovVF sriov_vf; + + /* Offset of PASID capability in config space */ + uint16_t pasid_cap; }; #define COMPAT_PROP_PCP "power_controller_present" @@ -147,4 +150,5 @@ void pcie_cap_slot_unplug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp); void pcie_cap_slot_unplug_request_cb(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp); +void pcie_pasid_init(PCIDevice *dev, uint16_t offset, uint16_t caps); #endif /* QEMU_PCIE_H */ diff --git a/include/hw/pci/pcie_port.h b/include/hw/pci/pcie_port.h index 90e6cf45b873721bf07a61af7b55af8ecf1f0387..7148a0959bff3a1d0babdf9872f98b1895829f0a 100644 --- a/include/hw/pci/pcie_port.h +++ b/include/hw/pci/pcie_port.h @@ -56,6 +56,9 @@ struct PCIESlot { uint8_t chassis; uint16_t slot; + uint8_t fast_plug; + uint8_t fast_unplug; + PCIExpLinkSpeed speed; PCIExpLinkWidth width; diff --git a/include/hw/ppc/mac_dbdma.h b/include/hw/ppc/mac_dbdma.h index 4a3f644516b3bbbb12d5b8b42d9b1b078d68e051..c774f6bf84f1a4f6b40306ad1f76da8e3bd42041 100644 --- a/include/hw/ppc/mac_dbdma.h +++ b/include/hw/ppc/mac_dbdma.h @@ -44,10 +44,6 @@ struct DBDMA_io { DBDMA_end dma_end; /* DMA is in progress, don't start another one */ bool processing; - /* DMA request */ - void *dma_mem; - dma_addr_t dma_len; - DMADirection dir; }; /* diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h index f120874e0ff192a4676b275364146c267fadd3f7..00023c02338cd51239d7a584ceacfbd3e8377641 100644 --- a/include/hw/ppc/xive.h +++ b/include/hw/ppc/xive.h @@ -218,7 +218,7 @@ static inline bool xive_source_esb_has_2page(XiveSource *xsrc) xsrc->esb_shift == XIVE_ESB_4K_2PAGE; } -static inline size_t xive_source_esb_len(XiveSource *xsrc) +static inline uint64_t xive_source_esb_len(XiveSource *xsrc) { return (1ull << xsrc->esb_shift) * xsrc->nr_irqs; } diff --git a/include/hw/qdev-core.h b/include/hw/qdev-core.h index 151d9682380d292357a5b28c77824e15050fcb55..2d3661d6cdde5f88539b568742ea0a2b2d2b05be 100644 --- a/include/hw/qdev-core.h +++ b/include/hw/qdev-core.h @@ -739,6 +739,8 @@ qemu_irq qdev_get_gpio_out_connector(DeviceState *dev, const char *name, int n); */ qemu_irq qdev_intercept_gpio_out(DeviceState *dev, qemu_irq icpt, const char *name, int n); +qemu_irq qdev_disconnect_gpio_out_named(DeviceState *dev, + const char *name, int n); BusState *qdev_get_child_bus(DeviceState *dev, const char *name); diff --git a/include/hw/qdev-properties-system.h b/include/hw/qdev-properties-system.h index 
91f7a2452d94b777293b5da92fd096b84ff113b4..63dcf69978fd136f4da43cd5a394b3eb73d6de51 100644 --- a/include/hw/qdev-properties-system.h +++ b/include/hw/qdev-properties-system.h @@ -8,8 +8,11 @@ extern const PropertyInfo qdev_prop_macaddr; extern const PropertyInfo qdev_prop_reserved_region; extern const PropertyInfo qdev_prop_multifd_compression; extern const PropertyInfo qdev_prop_mig_mode; +extern const PropertyInfo qdev_prop_zero_page_detection; extern const PropertyInfo qdev_prop_losttickpolicy; extern const PropertyInfo qdev_prop_blockdev_on_error; +extern const PropertyInfo qdev_prop_blockdev_retry_interval; +extern const PropertyInfo qdev_prop_blockdev_retry_timeout; extern const PropertyInfo qdev_prop_bios_chs_trans; extern const PropertyInfo qdev_prop_fdc_drive_type; extern const PropertyInfo qdev_prop_drive; @@ -46,12 +49,21 @@ extern const PropertyInfo qdev_prop_cpus390entitlement; #define DEFINE_PROP_MIG_MODE(_n, _s, _f, _d) \ DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_mig_mode, \ MigMode) +#define DEFINE_PROP_ZERO_PAGE_DETECTION(_n, _s, _f, _d) \ + DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_zero_page_detection, \ + ZeroPageDetection) #define DEFINE_PROP_LOSTTICKPOLICY(_n, _s, _f, _d) \ DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_losttickpolicy, \ LostTickPolicy) #define DEFINE_PROP_BLOCKDEV_ON_ERROR(_n, _s, _f, _d) \ DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_blockdev_on_error, \ BlockdevOnError) +#define DEFINE_PROP_BLOCKDEV_RETRY_INTERVAL(_n, _s, _f, _d) \ + DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_blockdev_retry_interval, \ + int64_t) +#define DEFINE_PROP_BLOCKDEV_RETRY_TIMEOUT(_n, _s, _f, _d) \ + DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_blockdev_retry_timeout, \ + int64_t) #define DEFINE_PROP_BIOS_CHS_TRANS(_n, _s, _f, _d) \ DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_bios_chs_trans, int) #define DEFINE_PROP_BLOCKSIZE(_n, _s, _f) \ diff --git a/include/hw/qdev-properties.h b/include/hw/qdev-properties.h index 25743a29a000b15d7a15b15e6d83fa3a279d1edd..63602c2c74bac745ad7c124359604b2ea46052e3 100644 --- a/include/hw/qdev-properties.h +++ b/include/hw/qdev-properties.h @@ -60,6 +60,7 @@ extern const PropertyInfo qdev_prop_int64; extern const PropertyInfo qdev_prop_size; extern const PropertyInfo qdev_prop_string; extern const PropertyInfo qdev_prop_on_off_auto; +extern const PropertyInfo qdev_prop_compress_method; extern const PropertyInfo qdev_prop_size32; extern const PropertyInfo qdev_prop_array; extern const PropertyInfo qdev_prop_link; @@ -168,6 +169,9 @@ extern const PropertyInfo qdev_prop_link; DEFINE_PROP(_n, _s, _f, qdev_prop_string, char*) #define DEFINE_PROP_ON_OFF_AUTO(_n, _s, _f, _d) \ DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_on_off_auto, OnOffAuto) +#define DEFINE_PROP_COMPRESS_METHOD(_n, _s, _f, _d) \ + DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_compress_method, \ + CompressMethod) #define DEFINE_PROP_SIZE32(_n, _s, _f, _d) \ DEFINE_PROP_UNSIGNED(_n, _s, _f, _d, qdev_prop_size32, uint32_t) diff --git a/include/hw/scsi/scsi.h b/include/hw/scsi/scsi.h index 3692ca82f3154064e48f811b448bf7eb80303de3..6ec18bf12b1218810abd86c431d6b040f481c804 100644 --- a/include/hw/scsi/scsi.h +++ b/include/hw/scsi/scsi.h @@ -226,6 +226,7 @@ void scsi_req_cancel_complete(SCSIRequest *req); void scsi_req_cancel(SCSIRequest *req); void scsi_req_cancel_async(SCSIRequest *req, Notifier *notifier); void scsi_req_retry(SCSIRequest *req); +void scsi_retry_requests(SCSIDevice *s); void scsi_device_drained_begin(SCSIDevice *sdev); void scsi_device_drained_end(SCSIDevice 
*sdev); void scsi_device_purge_requests(SCSIDevice *sdev, SCSISense sense); diff --git a/include/hw/usb.h b/include/hw/usb.h index 32c23a5ca2a0559328800bc117137d25b3ed8e8e..911179158d496c4e94ff04e9650aec28859e2108 100644 --- a/include/hw/usb.h +++ b/include/hw/usb.h @@ -142,6 +142,7 @@ #define USB_DEVICE_SELF_POWERED 0 #define USB_DEVICE_REMOTE_WAKEUP 1 +#define USB_DEVICE_REMOTE_WAKEUP_IS_SUPPORTED 2 #define USB_DT_DEVICE 0x01 #define USB_DT_CONFIG 0x02 diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index a4a22accb9434c5d04bef73172a0ccbe182405cb..abae8655c4c9df3f3c948da83f9852d339f87528 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -30,6 +30,9 @@ #include #endif #include "sysemu/sysemu.h" +#include "hw/vfio/vfio-container-base.h" +#include "sysemu/host_iommu_device.h" +#include "sysemu/iommufd.h" #define VFIO_MSG_PREFIX "vfio %s: " @@ -72,54 +75,29 @@ typedef struct VFIOMigration { bool initial_data_sent; } VFIOMigration; -typedef struct VFIOAddressSpace { - AddressSpace *as; - QLIST_HEAD(, VFIOContainer) containers; - QLIST_ENTRY(VFIOAddressSpace) list; -} VFIOAddressSpace; - struct VFIOGroup; +#define TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO TYPE_HOST_IOMMU_DEVICE "-legacy-vfio" +#define TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO \ + TYPE_HOST_IOMMU_DEVICE_IOMMUFD "-vfio" + +typedef struct VFIODMARange { + QLIST_ENTRY(VFIODMARange) next; + hwaddr iova; + size_t size; + void *vaddr; /* unused */ + unsigned long *bitmap; /* dirty bitmap cache for this range */ +} VFIODMARange; + typedef struct VFIOContainer { - VFIOAddressSpace *space; + VFIOContainerBase bcontainer; int fd; /* /dev/vfio/vfio, empowered by the attached groups */ - MemoryListener listener; - MemoryListener prereg_listener; unsigned iommu_type; - Error *error; - bool initialized; - bool dirty_pages_supported; - uint64_t dirty_pgsizes; - uint64_t max_dirty_bitmap_size; - unsigned long pgsizes; - unsigned int dma_max_mappings; - QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; - QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; + bool dirty_log_manual_clear; QLIST_HEAD(, VFIOGroup) group_list; - QLIST_HEAD(, VFIORamDiscardListener) vrdl_list; - QLIST_ENTRY(VFIOContainer) next; - QLIST_HEAD(, VFIODevice) device_list; - GList *iova_ranges; + QLIST_HEAD(, VFIODMARange) dma_list; } VFIOContainer; -typedef struct VFIOGuestIOMMU { - VFIOContainer *container; - IOMMUMemoryRegion *iommu_mr; - hwaddr iommu_offset; - IOMMUNotifier n; - QLIST_ENTRY(VFIOGuestIOMMU) giommu_next; -} VFIOGuestIOMMU; - -typedef struct VFIORamDiscardListener { - VFIOContainer *container; - MemoryRegion *mr; - hwaddr offset_within_address_space; - hwaddr size; - uint64_t granularity; - RamDiscardListener listener; - QLIST_ENTRY(VFIORamDiscardListener) next; -} VFIORamDiscardListener; - typedef struct VFIOHostDMAWindow { hwaddr min_iova; hwaddr max_iova; @@ -127,6 +105,22 @@ typedef struct VFIOHostDMAWindow { QLIST_ENTRY(VFIOHostDMAWindow) hostwin_next; } VFIOHostDMAWindow; +typedef struct IOMMUFDBackend IOMMUFDBackend; + +typedef struct VFIOIOASHwpt { + uint32_t hwpt_id; + uint32_t hwpt_flags; + QLIST_HEAD(, VFIODevice) device_list; + QLIST_ENTRY(VFIOIOASHwpt) next; +} VFIOIOASHwpt; + +typedef struct VFIOIOMMUFDContainer { + VFIOContainerBase bcontainer; + IOMMUFDBackend *be; + uint32_t ioas_id; + QLIST_HEAD(, VFIOIOASHwpt) hwpt_list; +} VFIOIOMMUFDContainer; + typedef struct VFIODeviceOps VFIODeviceOps; typedef struct VFIODevice { @@ -134,12 +128,13 @@ typedef struct VFIODevice { QLIST_ENTRY(VFIODevice) container_next; 
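The vfio-common.h hunk above replaces the open-coded VFIOContainer bookkeeping with a VFIOContainerBase plus a per-container dma_list of VFIODMARange entries, each caching a dirty bitmap for one IOVA range. A minimal sketch, assuming host-page granularity, of how such a range's bitmap cache could be sized; the helper name below is hypothetical (the real initializer is the vfio_dma_range_init_dirty_bitmap() declared further down in this header):

```c
#include "qemu/osdep.h"
#include "qemu/bitmap.h"
#include "hw/vfio/vfio-common.h"

/* Hypothetical sketch: size and allocate the dirty bitmap cache for one
 * VFIODMARange, one bit per host page in [iova, iova + size). */
static void dma_range_alloc_bitmap(VFIODMARange *qrange)
{
    uint64_t pages = QEMU_ALIGN_UP(qrange->size, qemu_real_host_page_size()) /
                     qemu_real_host_page_size();

    qrange->bitmap = bitmap_new(pages); /* bitmap_new() zero-fills */
}
```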
QLIST_ENTRY(VFIODevice) global_next; struct VFIOGroup *group; - VFIOContainer *container; + VFIOContainerBase *bcontainer; char *sysfsdev; char *name; DeviceState *dev; int fd; int type; + bool mdev; bool reset_works; bool needs_reset; bool no_mmap; @@ -152,8 +147,15 @@ typedef struct VFIODevice { VFIOMigration *migration; Error *migration_blocker; OnOffAuto pre_copy_dirty_page_tracking; + OnOffAuto device_dirty_page_tracking; bool dirty_pages_supported; bool dirty_tracking; + bool iommu_dirty_tracking; + HostIOMMUDevice *hiod; + int devid; + IOMMUFDBackend *iommufd; + VFIOIOASHwpt *hwpt; + QLIST_ENTRY(VFIODevice) hwpt_next; } VFIODevice; struct VFIODeviceOps { @@ -201,31 +203,14 @@ typedef struct VFIODisplay { } dmabuf; } VFIODisplay; -typedef struct { - unsigned long *bitmap; - hwaddr size; - hwaddr pages; -} VFIOBitmap; - VFIOAddressSpace *vfio_get_address_space(AddressSpace *as); void vfio_put_address_space(VFIOAddressSpace *space); -bool vfio_devices_all_running_and_saving(VFIOContainer *container); -/* container->fd */ -int vfio_dma_unmap(VFIOContainer *container, hwaddr iova, - ram_addr_t size, IOMMUTLBEntry *iotlb); -int vfio_dma_map(VFIOContainer *container, hwaddr iova, - ram_addr_t size, void *vaddr, bool readonly); -int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start); -int vfio_query_dirty_bitmap(VFIOContainer *container, VFIOBitmap *vbmap, - hwaddr iova, hwaddr size); +VFIODMARange *vfio_lookup_match_range(const VFIOContainer *container, + hwaddr start_addr, hwaddr size); +void vfio_dma_range_init_dirty_bitmap(VFIODMARange *qrange); /* SPAPR specific */ -int vfio_container_add_section_window(VFIOContainer *container, - MemoryRegionSection *section, - Error **errp); -void vfio_container_del_section_window(VFIOContainer *container, - MemoryRegionSection *section); int vfio_spapr_container_init(VFIOContainer *container, Error **errp); void vfio_spapr_container_deinit(VFIOContainer *container); @@ -247,6 +232,8 @@ void vfio_region_exit(VFIORegion *region); void vfio_region_finalize(VFIORegion *region); void vfio_reset_handler(void *opaque); struct vfio_device_info *vfio_get_device_info(int fd); +bool vfio_device_is_mdev(VFIODevice *vbasedev); +bool vfio_device_hiod_realize(VFIODevice *vbasedev, Error **errp); int vfio_attach_device(char *name, VFIODevice *vbasedev, AddressSpace *as, Error **errp); void vfio_detach_device(VFIODevice *vbasedev); @@ -259,7 +246,6 @@ typedef QLIST_HEAD(VFIOGroupList, VFIOGroup) VFIOGroupList; typedef QLIST_HEAD(VFIODeviceList, VFIODevice) VFIODeviceList; extern VFIOGroupList vfio_group_list; extern VFIODeviceList vfio_device_list; - extern const MemoryListener vfio_memory_listener; extern int vfio_kvm_device_fd; @@ -292,11 +278,20 @@ bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp); void vfio_migration_exit(VFIODevice *vbasedev); int vfio_bitmap_alloc(VFIOBitmap *vbmap, hwaddr size); -bool vfio_devices_all_running_and_mig_active(VFIOContainer *container); -bool vfio_devices_all_device_dirty_tracking(VFIOContainer *container); -int vfio_devices_query_dirty_bitmap(VFIOContainer *container, +bool +vfio_devices_all_running_and_mig_active(const VFIOContainerBase *bcontainer); +bool +vfio_devices_all_device_dirty_tracking(const VFIOContainerBase *bcontainer); +int vfio_devices_query_dirty_bitmap(const VFIOContainerBase *bcontainer, VFIOBitmap *vbmap, hwaddr iova, hwaddr size); -int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, - uint64_t size, ram_addr_t ram_addr); +int vfio_get_dirty_bitmap(const 
VFIOContainerBase *bcontainer, uint64_t iova, + uint64_t size, ram_addr_t ram_addr); + +/* Returns 0 on success, or a negative errno. */ +int vfio_device_get_name(VFIODevice *vbasedev, Error **errp); +void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp); +void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops, + DeviceState *dev, bool ram_discard); +int vfio_device_get_aw_bits(VFIODevice *vdev); #endif /* HW_VFIO_VFIO_COMMON_H */ diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h new file mode 100644 index 0000000000000000000000000000000000000000..7a4c575115a74112bff1eb21890b2e7f7b1014e6 --- /dev/null +++ b/include/hw/vfio/vfio-container-base.h @@ -0,0 +1,143 @@ +/* + * VFIO BASE CONTAINER + * + * Copyright (C) 2023 Intel Corporation. + * Copyright Red Hat, Inc. 2023 + * + * Authors: Yi Liu + * Eric Auger + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef HW_VFIO_VFIO_CONTAINER_BASE_H +#define HW_VFIO_VFIO_CONTAINER_BASE_H + +#include "exec/memory.h" + +typedef struct VFIODevice VFIODevice; +typedef struct VFIOIOMMUClass VFIOIOMMUClass; + +typedef struct { + unsigned long *bitmap; + hwaddr size; + hwaddr pages; +} VFIOBitmap; + +typedef struct VFIOAddressSpace { + AddressSpace *as; + QLIST_HEAD(, VFIOContainerBase) containers; + QLIST_ENTRY(VFIOAddressSpace) list; +} VFIOAddressSpace; + +/* + * This is the base object for vfio container backends + */ +typedef struct VFIOContainerBase { + const VFIOIOMMUClass *ops; + VFIOAddressSpace *space; + MemoryListener listener; + Error *error; + bool initialized; + uint64_t dirty_pgsizes; + uint64_t max_dirty_bitmap_size; + unsigned long pgsizes; + unsigned int dma_max_mappings; + bool dirty_pages_supported; + QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; + QLIST_HEAD(, VFIORamDiscardListener) vrdl_list; + QLIST_ENTRY(VFIOContainerBase) next; + QLIST_HEAD(, VFIODevice) device_list; + GList *iova_ranges; +} VFIOContainerBase; + +typedef struct VFIOGuestIOMMU { + VFIOContainerBase *bcontainer; + IOMMUMemoryRegion *iommu_mr; + hwaddr iommu_offset; + IOMMUNotifier n; + QLIST_ENTRY(VFIOGuestIOMMU) giommu_next; +} VFIOGuestIOMMU; + +typedef struct VFIORamDiscardListener { + VFIOContainerBase *bcontainer; + MemoryRegion *mr; + hwaddr offset_within_address_space; + hwaddr size; + uint64_t granularity; + RamDiscardListener listener; + QLIST_ENTRY(VFIORamDiscardListener) next; +} VFIORamDiscardListener; + +int vfio_container_dma_map(VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + void *vaddr, bool readonly); +int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + IOMMUTLBEntry *iotlb); +int vfio_container_add_section_window(VFIOContainerBase *bcontainer, + MemoryRegionSection *section, + Error **errp); +void vfio_container_del_section_window(VFIOContainerBase *bcontainer, + MemoryRegionSection *section); +int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer, + bool start); +int vfio_container_query_dirty_bitmap(const VFIOContainerBase *bcontainer, + VFIOBitmap *vbmap, + hwaddr iova, hwaddr size); + +void vfio_container_init(VFIOContainerBase *bcontainer, + VFIOAddressSpace *space, + const VFIOIOMMUClass *ops); +void vfio_container_destroy(VFIOContainerBase *bcontainer); + + +#define TYPE_VFIO_IOMMU "vfio-iommu" +#define TYPE_VFIO_IOMMU_LEGACY TYPE_VFIO_IOMMU "-legacy" +#define TYPE_VFIO_IOMMU_SPAPR TYPE_VFIO_IOMMU "-spapr" +#define TYPE_VFIO_IOMMU_IOMMUFD TYPE_VFIO_IOMMU "-iommufd" + +/* + * 
VFIOContainerBase is not an abstract QOM object because it felt + * unnecessary to expose all the IOMMU backends to the QEMU machine + * and human interface. However, we can still abstract the IOMMU + * backend handlers using a QOM interface class. This provides more + * flexibility when referencing the various implementations. + */ +DECLARE_CLASS_CHECKERS(VFIOIOMMUClass, VFIO_IOMMU, TYPE_VFIO_IOMMU) + +struct VFIOIOMMUClass { + InterfaceClass parent_class; + + /* Properties */ + const char *hiod_typename; + + /* basic feature */ + int (*setup)(VFIOContainerBase *bcontainer, Error **errp); + int (*dma_map)(const VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + void *vaddr, bool readonly); + int (*dma_unmap)(const VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + IOMMUTLBEntry *iotlb); + int (*attach_device)(const char *name, VFIODevice *vbasedev, + AddressSpace *as, Error **errp); + void (*detach_device)(VFIODevice *vbasedev); + /* migration feature */ + int (*set_dirty_page_tracking)(const VFIOContainerBase *bcontainer, + bool start); + int (*query_dirty_bitmap)(const VFIOContainerBase *bcontainer, + VFIOBitmap *vbmap, + hwaddr iova, hwaddr size); + /* PCI specific */ + int (*pci_hot_reset)(VFIODevice *vbasedev, bool single); + + /* SPAPR specific */ + int (*add_window)(VFIOContainerBase *bcontainer, + MemoryRegionSection *section, + Error **errp); + void (*del_window)(VFIOContainerBase *bcontainer, + MemoryRegionSection *section); + void (*release)(VFIOContainerBase *bcontainer); +}; +#endif /* HW_VFIO_VFIO_CONTAINER_BASE_H */ diff --git a/include/hw/virtio/vdpa-dev-iommufd.h b/include/hw/virtio/vdpa-dev-iommufd.h new file mode 100644 index 0000000000000000000000000000000000000000..8e56647690a0bf8a92ee82384d6b80a75b530728 --- /dev/null +++ b/include/hw/virtio/vdpa-dev-iommufd.h @@ -0,0 +1,41 @@ +/* + * vhost vDPA device support iommufd header + * + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All Rights Reserved. + */ + +#ifndef _VHOST_VDPA_IOMMUFD_H +#define _VHOST_VDPA_IOMMUFD_H + +#include "hw/virtio/vdpa-dev.h" + +/* + * A HW pagetable is called an iommu_domain inside the kernel. + * This user object allows directly creating and inspecting the + * domains. Domains that have kernel owned page tables will be + * associated with an iommufd_ioas that provides the IOVA to + * PFN map. + */ +typedef struct IOMMUFDHWPT { + uint32_t hwpt_id; + QLIST_HEAD(, VhostVdpaDevice) device_list; + QLIST_ENTRY(IOMMUFDHWPT) next; +} IOMMUFDHWPT; + +typedef struct VDPAIOMMUFDContainer { + MemoryListener listener; + struct IOMMUFDBackend *iommufd; + uint32_t ioas_id; + QLIST_HEAD(, IOMMUFDHWPT) hwpt_list; + QLIST_ENTRY(VDPAIOMMUFDContainer) next; +} VDPAIOMMUFDContainer; + +struct vdpa_dev_bind_iommufd { + __s32 iommufd; + __u32 out_devid; +}; + +int vhost_vdpa_attach_container(VhostVdpaDevice *vdev); +void vhost_vdpa_detach_container(VhostVdpaDevice *vdev); + +#endif /* _VHOST_VDPA_IOMMUFD_H */ diff --git a/include/hw/virtio/vdpa-dev-mig.h b/include/hw/virtio/vdpa-dev-mig.h new file mode 100644 index 0000000000000000000000000000000000000000..adc1d657f70aa500cb9a6578c6a9444c7c0677b9 --- /dev/null +++ b/include/hw/virtio/vdpa-dev-mig.h @@ -0,0 +1,29 @@ +/* + * Vhost Vdpa Device Migration Header + * + * Copyright (c) Huawei Technologies Co., Ltd. 2023. All Rights Reserved.
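Since VFIOIOMMUClass is a QOM interface rather than an object hierarchy, a backend plugs in by registering a type whose parent is TYPE_VFIO_IOMMU and filling the callbacks in class_init. A hedged sketch of that registration; the type name and stub below are illustrative, not the actual hw/vfio backends:

```c
#include "qemu/osdep.h"
#include "hw/vfio/vfio-container-base.h"

/* Illustrative stub; a real backend would issue its map ioctls here. */
static int demo_dma_map(const VFIOContainerBase *bcontainer, hwaddr iova,
                        ram_addr_t size, void *vaddr, bool readonly)
{
    return -ENOTSUP;
}

static void demo_iommu_class_init(ObjectClass *klass, void *data)
{
    VFIOIOMMUClass *ops = VFIO_IOMMU_CLASS(klass);

    ops->dma_map = demo_dma_map;
    /* ...dma_unmap, attach_device, detach_device, and so on */
}

static const TypeInfo demo_iommu_info = {
    .name = TYPE_VFIO_IOMMU "-demo",        /* hypothetical backend name */
    .parent = TYPE_VFIO_IOMMU,
    .class_init = demo_iommu_class_init,
};

static void demo_register_types(void)
{
    type_register_static(&demo_iommu_info);
}

type_init(demo_register_types);
```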
+ */ + +#ifndef _VHOST_VDPA_MIGRATION_H +#define _VHOST_VDPA_MIGRATION_H + +#include "hw/virtio/vdpa-dev.h" + +enum { + VDPA_DEVICE_START, + VDPA_DEVICE_STOP, + VDPA_DEVICE_PRE_START, + VDPA_DEVICE_PRE_STOP, + VDPA_DEVICE_CANCEL, + VDPA_DEVICE_POST_START, + VDPA_DEVICE_START_ASYNC, + VDPA_DEVICE_STOP_ASYNC, + VDPA_DEVICE_PRE_START_ASYNC, + VDPA_DEVICE_QUERY_OP_STATE, +}; + +void vdpa_migration_register(VhostVdpaDevice *vdev); + +void vdpa_migration_unregister(VhostVdpaDevice *vdev); + +#endif /* _VHOST_VDPA_MIGRATION_H */ diff --git a/include/hw/virtio/vdpa-dev.h b/include/hw/virtio/vdpa-dev.h index 4dbf98195c4cc1348494c16275cb71717b9482ed..872e630546c251fe18a9345ffc302cc2930d17ef 100644 --- a/include/hw/virtio/vdpa-dev.h +++ b/include/hw/virtio/vdpa-dev.h @@ -18,6 +18,7 @@ #include "hw/virtio/vhost.h" #include "hw/virtio/vhost-vdpa.h" #include "qom/object.h" +#include "sysemu/iommufd.h" #define TYPE_VHOST_VDPA_DEVICE "vhost-vdpa-device" @@ -37,7 +38,13 @@ struct VhostVdpaDevice { int config_size; uint16_t queue_size; bool started; + bool suspended; int (*post_init)(VhostVdpaDevice *v, Error **errp); + VMChangeStateEntry *vmstate; + Notifier migration_state; + IOMMUFDBackend *iommufd; + uint32_t iommufd_devid; + QLIST_ENTRY(VhostVdpaDevice) next; }; #endif diff --git a/include/hw/virtio/vhost-backend.h b/include/hw/virtio/vhost-backend.h index a86d103f8245b2b8602bd534960bf012e52cceb5..84b8fa1075763ce5961517034094b80e9e683621 100644 --- a/include/hw/virtio/vhost-backend.h +++ b/include/hw/virtio/vhost-backend.h @@ -65,6 +65,11 @@ typedef int (*vhost_scsi_get_abi_version_op)(struct vhost_dev *dev, int *version); typedef int (*vhost_set_log_base_op)(struct vhost_dev *dev, uint64_t base, struct vhost_log *log); +typedef int (*vhost_set_log_size_op)(struct vhost_dev *dev, uint64_t size, + struct vhost_log *log); +typedef int (*vhost_set_log_fd_op)(struct vhost_dev *dev, int fd, + struct vhost_log *log); +typedef int (*vhost_log_sync_op)(struct vhost_dev *dev); typedef int (*vhost_set_mem_table_op)(struct vhost_dev *dev, struct vhost_memory *mem); typedef int (*vhost_set_vring_addr_op)(struct vhost_dev *dev, @@ -150,6 +155,9 @@ typedef int (*vhost_set_device_state_fd_op)(struct vhost_dev *dev, Error **errp); typedef int (*vhost_check_device_state_op)(struct vhost_dev *dev, Error **errp); +typedef int (*vhost_dev_suspend_op)(struct vhost_dev *dev); +typedef int (*vhost_dev_resume_op)(struct vhost_dev *dev); + typedef struct VhostOps { VhostBackendType backend_type; vhost_backend_init vhost_backend_init; @@ -162,6 +170,9 @@ typedef struct VhostOps { vhost_scsi_clear_endpoint_op vhost_scsi_clear_endpoint; vhost_scsi_get_abi_version_op vhost_scsi_get_abi_version; vhost_set_log_base_op vhost_set_log_base; + vhost_set_log_size_op vhost_set_log_size; + vhost_set_log_fd_op vhost_set_log_fd; + vhost_log_sync_op vhost_log_sync; vhost_set_mem_table_op vhost_set_mem_table; vhost_set_vring_addr_op vhost_set_vring_addr; vhost_set_vring_endian_op vhost_set_vring_endian; @@ -200,6 +211,8 @@ typedef struct VhostOps { vhost_supports_device_state_op vhost_supports_device_state; vhost_set_device_state_fd_op vhost_set_device_state_fd; vhost_check_device_state_op vhost_check_device_state; + vhost_dev_suspend_op vhost_dev_suspend; + vhost_dev_resume_op vhost_dev_resume; } VhostOps; int vhost_backend_update_device_iotlb(struct vhost_dev *dev, diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h index 5407d54fd79bb6904729055fb05a8927d19ab17d..e32effc6e11feafa27420ac7dc631c6303e2ca21 100644 --- 
a/include/hw/virtio/vhost-vdpa.h +++ b/include/hw/virtio/vhost-vdpa.h @@ -42,8 +42,6 @@ typedef struct vhost_vdpa { bool shadow_vqs_enabled; /* Vdpa must send shadow addresses as IOTLB key for data queues, not GPA */ bool shadow_data; - /* Device suspended successfully */ - bool suspended; /* IOVA mapping used by the Shadow Virtqueue */ VhostIOVATree *iova_tree; GPtrArray *shadow_vqs; @@ -59,6 +57,13 @@ typedef struct vhost_vdpa { int vhost_vdpa_get_iova_range(int fd, struct vhost_vdpa_iova_range *iova_range); int vhost_vdpa_set_vring_ready(struct vhost_vdpa *v, unsigned idx); +Int128 vhost_vdpa_section_end(const MemoryRegionSection *section, + int page_mask); +bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section, + uint64_t iova_min, + uint64_t iova_max, + int page_mask); + int vhost_vdpa_dma_map(struct vhost_vdpa *v, uint32_t asid, hwaddr iova, hwaddr size, void *vaddr, bool readonly); int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, uint32_t asid, hwaddr iova, diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h index 02477788df5eb370ac1d7ddb1cf02859a07fa150..598ae137575304dac90c6bb5889901656d015b70 100644 --- a/include/hw/virtio/vhost.h +++ b/include/hw/virtio/vhost.h @@ -43,6 +43,7 @@ typedef unsigned long vhost_log_chunk_t; #define VHOST_LOG_PAGE 0x1000 #define VHOST_LOG_BITS (8 * sizeof(vhost_log_chunk_t)) #define VHOST_LOG_CHUNK (VHOST_LOG_PAGE * VHOST_LOG_BITS) +#define VHOST_LOG_CHUNK_BYTES (VHOST_LOG_PAGE * sizeof(vhost_log_chunk_t)) #define VHOST_INVALID_FEATURE_BIT (0xff) #define VHOST_QUEUE_NUM_CONFIG_INR 0 @@ -132,6 +133,7 @@ struct vhost_dev { QLIST_HEAD(, vhost_iommu) iommu_list; IOMMUNotifier n; const VhostDevConfigOps *config_ops; + bool has_container; }; extern const VhostOps kernel_ops; @@ -205,6 +207,14 @@ static inline bool vhost_dev_is_started(struct vhost_dev *hdev) return hdev->started; } +/** + * vhost_bytemap_log_support() - check if the vhost device supports dirty bytemap + * @dev: common vhost_dev structure + * + * Return: true if the vhost device supports dirty bytemap, false otherwise. 
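The vhost.h hunk above adds VHOST_LOG_CHUNK_BYTES alongside the existing VHOST_LOG_CHUNK, which makes the bitmap versus bytemap trade-off easy to quantify. A standalone arithmetic sketch, assuming an LP64 host where vhost_log_chunk_t is an 8-byte unsigned long:

```c
#include <stdio.h>

#define VHOST_LOG_PAGE        0x1000ULL                 /* 4 KiB */
#define VHOST_LOG_CHUNK_T     8ULL                      /* sizeof(vhost_log_chunk_t) on LP64 */
#define VHOST_LOG_BITS        (8 * VHOST_LOG_CHUNK_T)   /* 64 */
#define VHOST_LOG_CHUNK       (VHOST_LOG_PAGE * VHOST_LOG_BITS)     /* 256 KiB per chunk */
#define VHOST_LOG_CHUNK_BYTES (VHOST_LOG_PAGE * VHOST_LOG_CHUNK_T)  /* 32 KiB per chunk */

int main(void)
{
    unsigned long long mem = 4ULL << 30;   /* assume 4 GiB of guest memory */

    /* bitmap log: one bit per page, so a chunk covers VHOST_LOG_CHUNK bytes */
    printf("bitmap chunks:  %llu\n", mem / VHOST_LOG_CHUNK);        /* 16384 */
    /* bytemap log: one byte per page, so a chunk covers only 8 pages */
    printf("bytemap chunks: %llu\n", mem / VHOST_LOG_CHUNK_BYTES);  /* 131072 */
    return 0;
}
```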
+ */ +bool vhost_bytemap_log_support(struct vhost_dev *dev); + /** * vhost_dev_start() - start the vhost device * @hdev: common vhost_dev structure @@ -340,7 +350,14 @@ int vhost_dev_set_inflight(struct vhost_dev *dev, struct vhost_inflight *inflight); int vhost_dev_get_inflight(struct vhost_dev *dev, uint16_t queue_size, struct vhost_inflight *inflight); +bool used_memslots_is_exceeded(void); bool vhost_dev_has_iommu(struct vhost_dev *dev); +int vhost_sync_dirty_bitmap(struct vhost_dev *dev, + MemoryRegionSection *section, + hwaddr first, + hwaddr last); +int vhost_sync_dirty_bytemap(struct vhost_dev *dev, + MemoryRegionSection *section); #ifdef CONFIG_VHOST int vhost_reset_device(struct vhost_dev *hdev); @@ -464,4 +481,7 @@ int vhost_save_backend_state(struct vhost_dev *dev, QEMUFile *f, Error **errp); */ int vhost_load_backend_state(struct vhost_dev *dev, QEMUFile *f, Error **errp); +int vhost_dev_resume(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings); +int vhost_dev_suspend(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings); + #endif diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h index c8f72850bc05f4c6ea7b51aae4b572b04882725f..672f7445dd0e489d805c49d0b16ef94cca83519c 100644 --- a/include/hw/virtio/virtio.h +++ b/include/hw/virtio/virtio.h @@ -22,6 +22,7 @@ #include "standard-headers/linux/virtio_config.h" #include "standard-headers/linux/virtio_ring.h" #include "qom/object.h" +#include "block/aio.h" /* * A guest should never accept this. It implies negotiation is broken @@ -60,6 +61,7 @@ size_t virtio_get_config_size(const VirtIOConfigSizeParams *params, typedef struct VirtQueue VirtQueue; #define VIRTQUEUE_MAX_SIZE 1024 +#define VIRTIO_NET_VQ_MAX_SIZE (4096) typedef struct VirtQueueElement { @@ -145,6 +147,7 @@ struct VirtIODevice bool use_started; bool started; bool start_on_kick; /* when virtio 1.0 feature has not been negotiated */ + bool defer_kvm_irq_routing; bool disable_legacy_check; bool vhost_started; VMChangeStateEntry *vmstate; @@ -182,6 +185,7 @@ struct VirtioDeviceClass { int (*validate_features)(VirtIODevice *vdev); void (*get_config)(VirtIODevice *vdev, uint8_t *config); void (*set_config)(VirtIODevice *vdev, const uint8_t *config); + void (*print_features)(uint64_t features); void (*reset)(VirtIODevice *vdev); void (*set_status)(VirtIODevice *vdev, uint8_t val); /* Device must validate queue_index. */ @@ -270,9 +274,13 @@ void qemu_put_virtqueue_element(VirtIODevice *vdev, QEMUFile *f, VirtQueueElement *elem); int virtqueue_avail_bytes(VirtQueue *vq, unsigned int in_bytes, unsigned int out_bytes); -void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes, - unsigned int *out_bytes, - unsigned max_in_bytes, unsigned max_out_bytes); +/** + * Return <0 on error or an opaque >=0 to pass to + * virtio_queue_enable_notification_and_check on success. + */ +int virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes, + unsigned int *out_bytes, unsigned max_in_bytes, + unsigned max_out_bytes); void virtio_notify_irqfd(VirtIODevice *vdev, VirtQueue *vq); void virtio_notify(VirtIODevice *vdev, VirtQueue *vq); @@ -306,6 +314,15 @@ int virtio_queue_ready(VirtQueue *vq); int virtio_queue_empty(VirtQueue *vq); +/** + * Enable notification and check whether guest has added some + * buffers since last call to virtqueue_get_avail_bytes. + * + * @opaque: value returned from virtqueue_get_avail_bytes + */ +bool virtio_queue_enable_notification_and_check(VirtQueue *vq, + int opaque); + /* Host binding interface. 
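The virtio.h changes above pair the new int-returning virtqueue_get_avail_bytes() with virtio_queue_enable_notification_and_check(): the opaque token snapshots the avail ring, so the check can detect buffers the guest queued while notifications were disabled. A hedged sketch of the calling pattern in a device's queue handler; process_buffers() is a hypothetical device-specific helper:

```c
/* Sketch of the re-check pattern the comments above describe. */
static void demo_handle_output(VirtIODevice *vdev, VirtQueue *vq)
{
    unsigned int in_bytes, out_bytes;
    int opaque;

    do {
        virtio_queue_set_notification(vq, 0);   /* suppress while we work */
        process_buffers(vdev, vq);              /* drain what is visible now */

        /* Snapshot the avail ring; the return value is the opaque token. */
        opaque = virtqueue_get_avail_bytes(vq, &in_bytes, &out_bytes, 0, 0);
        if (opaque < 0) {
            virtio_queue_set_notification(vq, 1);
            break;
        }
        /* Re-arm notifications and ask whether the guest added buffers
         * after the snapshot; if so, loop instead of going idle. */
    } while (virtio_queue_enable_notification_and_check(vq, opaque));
}
```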
*/ uint32_t virtio_config_readb(VirtIODevice *vdev, uint32_t addr); @@ -508,4 +525,10 @@ static inline bool virtio_device_disabled(VirtIODevice *vdev) bool virtio_legacy_allowed(VirtIODevice *vdev); bool virtio_legacy_check_disabled(VirtIODevice *vdev); +QEMUBH *virtio_bh_new_guarded_full(DeviceState *dev, + QEMUBHFunc *cb, void *opaque, + const char *name); +#define virtio_bh_new_guarded(dev, cb, opaque) \ + virtio_bh_new_guarded_full((dev), (cb), (opaque), (stringify(cb))) + #endif diff --git a/include/monitor/monitor.h b/include/monitor/monitor.h index 965f5d5450034541ca548b3f11e1246a3695e0aa..60079086a800b2d49ba3a95f040866b00e83a7db 100644 --- a/include/monitor/monitor.h +++ b/include/monitor/monitor.h @@ -63,4 +63,6 @@ void monitor_register_hmp_info_hrt(const char *name, int error_vprintf_unless_qmp(const char *fmt, va_list ap) G_GNUC_PRINTF(1, 0); int error_printf_unless_qmp(const char *fmt, ...) G_GNUC_PRINTF(1, 2); +void monitor_qapi_event_discard_io_error(void); + #endif /* MONITOR_H */ diff --git a/include/qemu/bswap.h b/include/qemu/bswap.h index 933a66ee87e258db1245bf9d0b56b74f47664fe0..49e4944457f5b024dc7811833790d408bb0ca058 100644 --- a/include/qemu/bswap.h +++ b/include/qemu/bswap.h @@ -138,6 +138,8 @@ CPU_CONVERT(le, 16, uint16_t) CPU_CONVERT(le, 32, uint32_t) CPU_CONVERT(le, 64, uint64_t) +#undef CPU_CONVERT + /* * Same as cpu_to_le{16,32,64}, except that gcc will figure the result is * a compile-time constant if you pass in a constant. So this can be diff --git a/include/qemu/chardev_open.h b/include/qemu/chardev_open.h new file mode 100644 index 0000000000000000000000000000000000000000..64e8fcfdcb239e47774d55dfc57efff9101ce0a8 --- /dev/null +++ b/include/qemu/chardev_open.h @@ -0,0 +1,16 @@ +/* + * QEMU Chardev Helper + * + * Copyright (C) 2023 Intel Corporation. + * + * Authors: Yi Liu + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#ifndef QEMU_CHARDEV_OPEN_H +#define QEMU_CHARDEV_OPEN_H + +int open_cdev(const char *devpath, dev_t cdev); +#endif diff --git a/include/qemu/coroutine_int.h b/include/qemu/coroutine_int.h index 1da148552f74b542959dd48448eb6bbc3aead935..11b550a0fcbf55cf67c2ed0eaf7317e1a6128475 100644 --- a/include/qemu/coroutine_int.h +++ b/include/qemu/coroutine_int.h @@ -73,5 +73,6 @@ Coroutine *qemu_coroutine_new(void); void qemu_coroutine_delete(Coroutine *co); CoroutineAction qemu_coroutine_switch(Coroutine *from, Coroutine *to, CoroutineAction action); - +void qemu_coroutine_info_add(const Coroutine *co_); +void qemu_coroutine_info_delete(const Coroutine *co_); #endif diff --git a/include/qemu/mmap-alloc.h b/include/qemu/mmap-alloc.h index 8344daaa03f2d48819fdd3d6f1fef862b8eacd3a..63e4edfd2f9c041538274c51cf721332950e1426 100644 --- a/include/qemu/mmap-alloc.h +++ b/include/qemu/mmap-alloc.h @@ -1,6 +1,10 @@ #ifndef QEMU_MMAP_ALLOC_H #define QEMU_MMAP_ALLOC_H +#define HUGETLBFS_MAGIC 0x958458f6 + +size_t qemu_fd_getfiletype(int fd); + typedef enum { QEMU_FS_TYPE_UNKNOWN = 0, QEMU_FS_TYPE_TMPFS, diff --git a/include/qemu/range.h b/include/qemu/range.h index 205e1da76dc5b29327f590b8293a826ced63c25d..4ce694a398311a32e9b5e3ca97aed97c90c0ae09 100644 --- a/include/qemu/range.h +++ b/include/qemu/range.h @@ -20,6 +20,8 @@ #ifndef QEMU_RANGE_H #define QEMU_RANGE_H +#include "qemu/bitops.h" + /* * Operations on 64 bit address ranges. 
* Notes: @@ -217,6 +219,15 @@ static inline int ranges_overlap(uint64_t first1, uint64_t len1, return !(last2 < first1 || last1 < first2); } +/* Get highest non-zero bit position of a range */ +static inline int range_get_last_bit(Range *range) +{ + if (range_is_empty(range)) { + return -1; + } + return 63 - clz64(range->upb); +} + /* * Return -1 if @a < @b, 1 @a > @b, and 0 if they touch or overlap. * Both @a and @b must not be empty. diff --git a/include/qemu/timer.h b/include/qemu/timer.h index 9a366e551fb36670112e4c21a62cfe41e9f530da..475c2a3f182220e097f912a39340e739c5f09a6c 100644 --- a/include/qemu/timer.h +++ b/include/qemu/timer.h @@ -91,6 +91,34 @@ struct QEMUTimer { int scale; }; +#define QEMU_USB_NORMAL_FREQ 1000 +#define QEMU_USB_LAZY_FREQ 10 +#define MAX_USB_CONTROLLER_TYPES 4 +#define QEMU_USB_CONTROLLER_OHCI 0 +#define QEMU_USB_CONTROLLER_UHCI 1 +#define QEMU_USB_CONTROLLER_EHCI 2 +#define QEMU_USB_CONTROLLER_XHCI 3 + +typedef void (*QEMUSetFreqHandler) (int freq); + +typedef struct qemu_usb_controller { + const char *name; + QEMUSetFreqHandler qemu_set_freq; +} qemu_usb_controller; + +typedef qemu_usb_controller* qemu_usb_controller_ptr; + +enum qemu_timer_mode { + QEMU_TIMER_USB_NORMAL_MODE = 1 << 0, /* Set when VNC connect or + * with usb dev passthrough + */ + QEMU_TIMER_USB_LAZY_MODE = 1 << 1, /* Set when VNC disconnect */ +}; + +int qemu_register_usb_controller(qemu_usb_controller_ptr controller, + unsigned int type); +int qemu_timer_set_mode(enum qemu_timer_mode mode, unsigned int type); + extern QEMUTimerListGroup main_loop_tlg; /* diff --git a/include/qemu/userfaultfd.h b/include/qemu/userfaultfd.h index 18a4314212d7bf604920f554cafb2ad879ee5db7..a1979308d7e1ce1a6e07284a58af511da534562a 100644 --- a/include/qemu/userfaultfd.h +++ b/include/qemu/userfaultfd.h @@ -39,7 +39,6 @@ int uffd_copy_page(int uffd_fd, void *dst_addr, void *src_addr, int uffd_zero_page(int uffd_fd, void *addr, uint64_t length, bool dont_wake); int uffd_wakeup(int uffd_fd, void *addr, uint64_t length); int uffd_read_events(int uffd_fd, struct uffd_msg *msgs, int count); -bool uffd_poll_events(int uffd_fd, int tmo); #endif /* CONFIG_LINUX */ diff --git a/include/standard-headers/drm/drm_fourcc.h b/include/standard-headers/drm/drm_fourcc.h index 72279f4d25d4b01fb876256511605e4aa78b5720..3afb70160f0bf5feaf7ae284ea19e6e51e822cf5 100644 --- a/include/standard-headers/drm/drm_fourcc.h +++ b/include/standard-headers/drm/drm_fourcc.h @@ -322,6 +322,8 @@ extern "C" { * index 1 = Cr:Cb plane, [39:0] Cr1:Cb1:Cr0:Cb0 little endian */ #define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5') /* 2x2 subsampled Cr:Cb plane */ +#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0') /* 2x1 subsampled Cr:Cb plane */ +#define DRM_FORMAT_NV30 fourcc_code('N', 'V', '3', '0') /* non-subsampled Cr:Cb plane */ /* * 2 plane YCbCr MSB aligned diff --git a/include/standard-headers/linux/fuse.h b/include/standard-headers/linux/fuse.h index 6b9793842c98496feff47cba2f0a49eecdfac7d6..fc0dcd10aededdb8ea0f2ddb7144dcf44854727d 100644 --- a/include/standard-headers/linux/fuse.h +++ b/include/standard-headers/linux/fuse.h @@ -209,7 +209,7 @@ * - add FUSE_HAS_EXPIRE_ONLY * * 7.39 - * - add FUSE_DIRECT_IO_RELAX + * - add FUSE_DIRECT_IO_ALLOW_MMAP * - add FUSE_STATX and related structures */ @@ -405,8 +405,7 @@ struct fuse_file_lock { * FUSE_CREATE_SUPP_GROUP: add supplementary group info to create, mkdir, * symlink and mknod (single group that matches parent) * FUSE_HAS_EXPIRE_ONLY: kernel supports expiry-only entry invalidation - * 
FUSE_DIRECT_IO_RELAX: relax restrictions in FOPEN_DIRECT_IO mode, for now - * allow shared mmap + * FUSE_DIRECT_IO_ALLOW_MMAP: allow shared mmap in FOPEN_DIRECT_IO mode. */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -445,7 +444,10 @@ struct fuse_file_lock { #define FUSE_HAS_INODE_DAX (1ULL << 33) #define FUSE_CREATE_SUPP_GROUP (1ULL << 34) #define FUSE_HAS_EXPIRE_ONLY (1ULL << 35) -#define FUSE_DIRECT_IO_RELAX (1ULL << 36) +#define FUSE_DIRECT_IO_ALLOW_MMAP (1ULL << 36) + +/* Obsolete alias for FUSE_DIRECT_IO_ALLOW_MMAP */ +#define FUSE_DIRECT_IO_RELAX FUSE_DIRECT_IO_ALLOW_MMAP /** * CUSE INIT request/reply flags diff --git a/include/standard-headers/linux/pci_regs.h b/include/standard-headers/linux/pci_regs.h index e5f558d9649396faed564c9156ab3d03b9aaa7ab..a39193213ff25ba24233fe28bc4977f38c294702 100644 --- a/include/standard-headers/linux/pci_regs.h +++ b/include/standard-headers/linux/pci_regs.h @@ -80,6 +80,7 @@ #define PCI_HEADER_TYPE_NORMAL 0 #define PCI_HEADER_TYPE_BRIDGE 1 #define PCI_HEADER_TYPE_CARDBUS 2 +#define PCI_HEADER_TYPE_MFD 0x80 /* Multi-Function Device (possible) */ #define PCI_BIST 0x0f /* 8 bits */ #define PCI_BIST_CODE_MASK 0x0f /* Return result */ @@ -637,6 +638,7 @@ #define PCI_EXP_RTCAP 0x1e /* Root Capabilities */ #define PCI_EXP_RTCAP_CRSVIS 0x0001 /* CRS Software Visibility capability */ #define PCI_EXP_RTSTA 0x20 /* Root Status */ +#define PCI_EXP_RTSTA_PME_RQ_ID 0x0000ffff /* PME Requester ID */ #define PCI_EXP_RTSTA_PME 0x00010000 /* PME status */ #define PCI_EXP_RTSTA_PENDING 0x00020000 /* PME pending */ /* @@ -930,12 +932,13 @@ /* Process Address Space ID */ #define PCI_PASID_CAP 0x04 /* PASID feature register */ -#define PCI_PASID_CAP_EXEC 0x02 /* Exec permissions Supported */ -#define PCI_PASID_CAP_PRIV 0x04 /* Privilege Mode Supported */ +#define PCI_PASID_CAP_EXEC 0x0002 /* Exec permissions Supported */ +#define PCI_PASID_CAP_PRIV 0x0004 /* Privilege Mode Supported */ +#define PCI_PASID_CAP_WIDTH 0x1f00 #define PCI_PASID_CTRL 0x06 /* PASID control register */ -#define PCI_PASID_CTRL_ENABLE 0x01 /* Enable bit */ -#define PCI_PASID_CTRL_EXEC 0x02 /* Exec permissions Enable */ -#define PCI_PASID_CTRL_PRIV 0x04 /* Privilege Mode Enable */ +#define PCI_PASID_CTRL_ENABLE 0x0001 /* Enable bit */ +#define PCI_PASID_CTRL_EXEC 0x0002 /* Exec permissions Enable */ +#define PCI_PASID_CTRL_PRIV 0x0004 /* Privilege Mode Enable */ #define PCI_EXT_CAP_PASID_SIZEOF 8 /* Single Root I/O Virtualization */ @@ -975,6 +978,8 @@ #define PCI_LTR_VALUE_MASK 0x000003ff #define PCI_LTR_SCALE_MASK 0x00001c00 #define PCI_LTR_SCALE_SHIFT 10 +#define PCI_LTR_NOSNOOP_VALUE 0x03ff0000 /* Max No-Snoop Latency Value */ +#define PCI_LTR_NOSNOOP_SCALE 0x1c000000 /* Scale for Max Value */ #define PCI_EXT_CAP_LTR_SIZEOF 8 /* Access Control Service */ @@ -1042,9 +1047,16 @@ #define PCI_EXP_DPC_STATUS 0x08 /* DPC Status */ #define PCI_EXP_DPC_STATUS_TRIGGER 0x0001 /* Trigger Status */ #define PCI_EXP_DPC_STATUS_TRIGGER_RSN 0x0006 /* Trigger Reason */ +#define PCI_EXP_DPC_STATUS_TRIGGER_RSN_UNCOR 0x0000 /* Uncorrectable error */ +#define PCI_EXP_DPC_STATUS_TRIGGER_RSN_NFE 0x0002 /* Rcvd ERR_NONFATAL */ +#define PCI_EXP_DPC_STATUS_TRIGGER_RSN_FE 0x0004 /* Rcvd ERR_FATAL */ +#define PCI_EXP_DPC_STATUS_TRIGGER_RSN_IN_EXT 0x0006 /* Reason in Trig Reason Extension field */ #define PCI_EXP_DPC_STATUS_INTERRUPT 0x0008 /* Interrupt Status */ #define PCI_EXP_DPC_RP_BUSY 0x0010 /* Root Port Busy */ #define PCI_EXP_DPC_STATUS_TRIGGER_RSN_EXT 0x0060 /* Trig Reason Extension */ 
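The pci_regs.h hunk above widens the PASID capability masks to their full 16-bit width and adds PCI_PASID_CAP_WIDTH for the Max PASID Width field in bits [12:8]. A small, self-contained decode of that field; the helper name is illustrative:

```c
#include <stdint.h>

#define PCI_PASID_CAP_WIDTH 0x1f00   /* as defined in the hunk above */

/* Max PASID Width lives in bits [12:8] of the PASID capability register;
 * a device supporting 2^20 PASIDs reports the value 20 here. */
static inline unsigned int pasid_cap_max_width(uint16_t pasid_cap_reg)
{
    return (pasid_cap_reg & PCI_PASID_CAP_WIDTH) >> 8;
}
```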
+#define PCI_EXP_DPC_STATUS_TRIGGER_RSN_RP_PIO 0x0000 /* RP PIO error */ +#define PCI_EXP_DPC_STATUS_TRIGGER_RSN_SW_TRIGGER 0x0020 /* DPC SW Trigger bit */ +#define PCI_EXP_DPC_RP_PIO_FEP 0x1f00 /* RP PIO First Err Ptr */ #define PCI_EXP_DPC_SOURCE_ID 0x0A /* DPC Source Identifier */ @@ -1088,6 +1100,8 @@ #define PCI_L1SS_CTL1_LTR_L12_TH_VALUE 0x03ff0000 /* LTR_L1.2_THRESHOLD_Value */ #define PCI_L1SS_CTL1_LTR_L12_TH_SCALE 0xe0000000 /* LTR_L1.2_THRESHOLD_Scale */ #define PCI_L1SS_CTL2 0x0c /* Control 2 Register */ +#define PCI_L1SS_CTL2_T_PWR_ON_SCALE 0x00000003 /* T_POWER_ON Scale */ +#define PCI_L1SS_CTL2_T_PWR_ON_VALUE 0x000000f8 /* T_POWER_ON Value */ /* Designated Vendor-Specific (DVSEC, PCI_EXT_CAP_ID_DVSEC) */ #define PCI_DVSEC_HEADER1 0x4 /* Designated Vendor-Specific Header1 */ diff --git a/include/standard-headers/linux/vhost_types.h b/include/standard-headers/linux/vhost_types.h index 5ad07e134aed3525943388a18b5bded129624f58..46fc53cd83ec4d60beeff4eeca06b8ae83c807df 100644 --- a/include/standard-headers/linux/vhost_types.h +++ b/include/standard-headers/linux/vhost_types.h @@ -185,5 +185,14 @@ struct vhost_vdpa_iova_range { * DRIVER_OK */ #define VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK 0x6 +/* Device may expose the virtqueue's descriptor area, driver area and + * device area to a different group for ASID binding than where its + * buffers may reside. Requires VHOST_BACKEND_F_IOTLB_ASID. + */ +#define VHOST_BACKEND_F_DESC_ASID 0x7 +/* IOTLB don't flush memory mapping across device reset */ +#define VHOST_BACKEND_F_IOTLB_PERSIST 0x8 +/* device can use bytemap log */ +#define VHOST_BACKEND_F_BYTEMAPLOG 0x3f #endif diff --git a/include/standard-headers/linux/virtio_config.h b/include/standard-headers/linux/virtio_config.h index 8a7d0dc8b0070343636a72926dbea66ceb01b184..bfd1ca643e7f4a62127c295bdac8ccc4da068cc0 100644 --- a/include/standard-headers/linux/virtio_config.h +++ b/include/standard-headers/linux/virtio_config.h @@ -103,6 +103,11 @@ */ #define VIRTIO_F_NOTIFICATION_DATA 38 +/* This feature indicates that the driver uses the data provided by the device + * as a virtqueue identifier in available buffer notifications. + */ +#define VIRTIO_F_NOTIF_CONFIG_DATA 39 + /* * This feature indicates that the driver can reset a queue individually. */ diff --git a/include/standard-headers/linux/virtio_pci.h b/include/standard-headers/linux/virtio_pci.h index be912cfc957cd78f0cade4c99ad03c0513e03c4d..b7fdfd066878cb89a091edcfe79c11196f3361d7 100644 --- a/include/standard-headers/linux/virtio_pci.h +++ b/include/standard-headers/linux/virtio_pci.h @@ -166,6 +166,17 @@ struct virtio_pci_common_cfg { uint32_t queue_used_hi; /* read-write */ }; +/* + * Warning: do not use sizeof on this: use offsetofend for + * specific fields you need. 
+ */ +struct virtio_pci_modern_common_cfg { + struct virtio_pci_common_cfg cfg; + + uint16_t queue_notify_data; /* read-write */ + uint16_t queue_reset; /* read-write */ +}; + /* Fields in VIRTIO_PCI_CAP_PCI_CFG: */ struct virtio_pci_cfg_cap { struct virtio_pci_cap cap; diff --git a/include/sysemu/accel-ops.h b/include/sysemu/accel-ops.h index ef91fc28bbdfa3dab181e11c5a4dc42fce1bad92..7a32e7f820325ed677db482893faa60c7d43b5d4 100644 --- a/include/sysemu/accel-ops.h +++ b/include/sysemu/accel-ops.h @@ -53,6 +53,9 @@ struct AccelOpsClass { int (*insert_breakpoint)(CPUState *cpu, int type, vaddr addr, vaddr len); int (*remove_breakpoint)(CPUState *cpu, int type, vaddr addr, vaddr len); void (*remove_all_breakpoints)(CPUState *cpu); + + void (*control_pre_system_reset)(void); + void (*control_post_system_reset)(void); }; #endif /* ACCEL_OPS_H */ diff --git a/include/sysemu/block-backend-common.h b/include/sysemu/block-backend-common.h index 780cea7305aadd131e3e2974053ab33e86a00ac9..5a1cdac9c4a1040da20f79c6721cb627b7321267 100644 --- a/include/sysemu/block-backend-common.h +++ b/include/sysemu/block-backend-common.h @@ -16,6 +16,9 @@ #include "qemu/iov.h" #include "block/throttle-groups.h" +/* block backend default retry interval */ +#define BLOCK_BACKEND_DEFAULT_RETRY_INTERVAL 1000 + /* * TODO Have to include block/block.h for a bunch of block layer * types. Unfortunately, this pulls in the whole BlockDriverState @@ -71,6 +74,10 @@ typedef struct BlockDevOps { * Is the device still busy? */ bool (*drained_poll)(void *opaque); + /* + * Runs when retrying failed requests. + */ + void (*retry_request_cb)(void *opaque); /* * I/O API functions. These functions are thread-safe. diff --git a/include/sysemu/block-backend-global-state.h b/include/sysemu/block-backend-global-state.h index 49c12b0fa9387f5d9be6c18408ff60592c981a50..d56592c22ed91b57c615eacbcd3192ac7dbfe305 100644 --- a/include/sysemu/block-backend-global-state.h +++ b/include/sysemu/block-backend-global-state.h @@ -84,6 +84,9 @@ int blk_commit_all(void); bool blk_in_drain(BlockBackend *blk); void blk_drain(BlockBackend *blk); void blk_drain_all(void); +void blk_set_on_error_retry_interval(BlockBackend *blk, int64_t interval); +void blk_set_on_error_retry_timeout(BlockBackend *blk, int64_t timeout); +void blk_error_retry_reset_timeout(BlockBackend *blk); void blk_set_on_error(BlockBackend *blk, BlockdevOnError on_read_error, BlockdevOnError on_write_error); bool blk_supports_write_perm(BlockBackend *blk); diff --git a/include/sysemu/cpus.h b/include/sysemu/cpus.h index b4a566cfe75274f6d8f4bfe8abb9e409a4c2e440..f24d27daf5269d658e2249cb1482d88d23741bea 100644 --- a/include/sysemu/cpus.h +++ b/include/sysemu/cpus.h @@ -44,6 +44,8 @@ extern int icount_align_option; void qemu_cpu_kick_self(void); bool cpus_are_resettable(void); +void cpus_control_pre_system_reset(void); +void cpus_control_post_system_reset(void); void cpu_synchronize_all_states(void); void cpu_synchronize_all_post_reset(void); diff --git a/include/sysemu/dma.h b/include/sysemu/dma.h index a1ac5bc1b5435be8ef97d39acb0bf99cfb94e032..5a49a306284d26af967254ba064a134ab68305ec 100644 --- a/include/sysemu/dma.h +++ b/include/sysemu/dma.h @@ -152,7 +152,7 @@ static inline MemTxResult dma_memory_read(AddressSpace *as, dma_addr_t addr, } /** - * address_space_write: Write to address space from DMA controller. + * dma_memory_write: Write to address space from DMA controller. 
* * Return a MemTxResult indicating whether the operation succeeded * or failed (eg unassigned memory, device rejected the transaction, @@ -189,7 +189,7 @@ MemTxResult dma_memory_set(AddressSpace *as, dma_addr_t addr, uint8_t c, dma_addr_t len, MemTxAttrs attrs); /** - * address_space_map: Map a physical memory region into a host virtual address. + * dma_memory_map: Map a physical memory region into a host virtual address. * * May map a subset of the requested range, given by and returned in @plen. * May return %NULL and set *@plen to zero(0), if resources needed to perform @@ -216,16 +216,15 @@ static inline void *dma_memory_map(AddressSpace *as, } /** - * address_space_unmap: Unmaps a memory region previously mapped - * by dma_memory_map() + * dma_memory_unmap: Unmaps a memory region previously mapped by dma_memory_map() * * Will also mark the memory as dirty if @dir == %DMA_DIRECTION_FROM_DEVICE. * @access_len gives the amount of memory that was actually read or written * by the caller. * * @as: #AddressSpace used - * @buffer: host pointer as returned by address_space_map() - * @len: buffer length as returned by address_space_map() + * @buffer: host pointer as returned by dma_memory_map() + * @len: buffer length as returned by dma_memory_map() * @dir: indicates the transfer direction * @access_len: amount of data actually transferred */ diff --git a/include/sysemu/host_iommu_device.h b/include/sysemu/host_iommu_device.h new file mode 100644 index 0000000000000000000000000000000000000000..22c76a37a79504d7de552655545395a2a7544ed7 --- /dev/null +++ b/include/sysemu/host_iommu_device.h @@ -0,0 +1,111 @@ +/* + * Host IOMMU device abstract declaration + * + * Copyright (C) 2024 Intel Corporation. + * + * Authors: Zhenzhong Duan + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#ifndef HOST_IOMMU_DEVICE_H +#define HOST_IOMMU_DEVICE_H + +#include "qom/object.h" +#include "qapi/error.h" + +/** + * struct HostIOMMUDeviceCaps - Define host IOMMU device capabilities. + * + * @type: host platform IOMMU type. + * + * @hw_caps: host platform IOMMU capabilities (e.g. on IOMMUFD this represents + * the @out_capabilities value returned from IOMMU_GET_HW_INFO ioctl) + */ +typedef struct HostIOMMUDeviceCaps { + uint32_t type; + uint64_t hw_caps; + uint8_t max_pasid_log2; +} HostIOMMUDeviceCaps; + +#define TYPE_HOST_IOMMU_DEVICE "host-iommu-device" +OBJECT_DECLARE_TYPE(HostIOMMUDevice, HostIOMMUDeviceClass, HOST_IOMMU_DEVICE) + +struct HostIOMMUDevice { + Object parent_obj; + + char *name; + void *agent; /* pointer to agent device, ie. VFIO or VDPA device */ + HostIOMMUDeviceCaps caps; +}; + +/** + * struct HostIOMMUDeviceClass - The base class for all host IOMMU devices. + * + * Different types of host devices (e.g., VFIO or VDPA device) or devices + * with different backend (e.g., VFIO legacy container or IOMMUFD backend) + * will have different implementations of the HostIOMMUDeviceClass. + */ +struct HostIOMMUDeviceClass { + ObjectClass parent_class; + + /** + * @realize: initialize host IOMMU device instance further. + * + * Mandatory callback. + * + * @hiod: pointer to a host IOMMU device instance. + * + * @opaque: pointer to agent device of this host IOMMU device, + * e.g., VFIO base device or VDPA device. + * + * @errp: pass an Error out when realize fails. + * + * Returns: true on success, false on failure. 
+ */ + bool (*realize)(HostIOMMUDevice *hiod, void *opaque, Error **errp); + /** + * @realize_late: initialize host IOMMU device instance after attachment, + * some elements, e.g., ioas, are ready only after attachment. + * This callback initializes them. + * + * Optional callback. + * + * @hiod: pointer to a host IOMMU device instance. + * + * @opaque: pointer to agent device of this host IOMMU device, + * e.g., VFIO base device or VDPA device. + * + * @errp: pass an Error out when realize fails. + * + * Returns: true on success, false on failure. + */ + bool (*realize_late)(HostIOMMUDevice *hiod, void *opaque, Error **errp); + /** + * @get_cap: check if a host IOMMU device capability is supported. + * + * Optional callback; if not implemented, queries of @cap are + * not supported. + * + * @hiod: pointer to a host IOMMU device instance. + * + * @cap: capability to check. + * + * @errp: pass an Error out when the capability query fails. + * + * Returns: <0 on failure, 0 if a @cap is unsupported, or else + * 1 or some positive value for some special @cap, + * e.g., HOST_IOMMU_DEVICE_CAP_AW_BITS. + */ + int (*get_cap)(HostIOMMUDevice *hiod, int cap, Error **errp); +}; + +/* + * Host IOMMU device capability list. + */ +#define HOST_IOMMU_DEVICE_CAP_IOMMU_TYPE 0 +#define HOST_IOMMU_DEVICE_CAP_AW_BITS 1 + +#define HOST_IOMMU_DEVICE_CAP_AW_BITS_MAX 64 +#endif diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h new file mode 100644 index 0000000000000000000000000000000000000000..908c94d81145a69293677d06ec09e42b3bc235cd --- /dev/null +++ b/include/sysemu/iommufd.h @@ -0,0 +1,144 @@ +/* + * iommufd container backend declaration + * + * Copyright (C) 2024 Intel Corporation. + * Copyright Red Hat, Inc. 2024 + * + * Authors: Yi Liu + * Eric Auger + * Zhenzhong Duan + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef SYSEMU_IOMMUFD_H +#define SYSEMU_IOMMUFD_H + +#include "qom/object.h" +#include "exec/hwaddr.h" +#include "exec/cpu-common.h" +#include "sysemu/host_iommu_device.h" + +#define TYPE_IOMMUFD_BACKEND "iommufd" +OBJECT_DECLARE_TYPE(IOMMUFDBackend, IOMMUFDBackendClass, IOMMUFD_BACKEND) + +struct IOMMUFDBackendClass { + ObjectClass parent_class; +}; + +struct IOMMUFDBackend { + Object parent; + + /*< protected >*/ + int fd; /* /dev/iommu file descriptor */ + bool owned; /* is the /dev/iommu opened internally */ + uint32_t users; + + /*< public >*/ +}; + +typedef struct IOMMUFDViommu { + IOMMUFDBackend *iommufd; + uint32_t s2_hwpt_id; + uint32_t viommu_id; +} IOMMUFDViommu; + +int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp); +void iommufd_backend_disconnect(IOMMUFDBackend *be); + +int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id, + Error **errp); +void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id); +int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova, + ram_addr_t size, void *vaddr, bool readonly); +int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, + hwaddr iova, ram_addr_t size); +bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, + uint32_t *type, void *data, uint32_t len, + uint64_t *caps, uint8_t *max_pasid_log2, + Error **errp); +bool iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id, + uint32_t pt_id, uint32_t flags, + uint32_t data_type, uint32_t data_len, + void *data_ptr, uint32_t *out_hwpt, + uint32_t *out_fault_fd, Error **errp); +bool iommufd_backend_set_dirty_tracking(IOMMUFDBackend *be, uint32_t hwpt_id, + bool start, Error **errp);
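Taken together, the iommufd backend entry points above suggest a connect, allocate, map lifecycle against /dev/iommu. A hedged sketch of that flow under the signatures declared here; demo_map_one_range() is illustrative, not a QEMU function:

```c
#include "qemu/osdep.h"
#include "qapi/error.h"
#include "sysemu/iommufd.h"

/* Sketch: connect to /dev/iommu, carve out an IOAS, map one range,
 * then tear everything down again. */
static int demo_map_one_range(IOMMUFDBackend *be, void *vaddr,
                              hwaddr iova, ram_addr_t size, Error **errp)
{
    uint32_t ioas_id;
    int ret;

    if (iommufd_backend_connect(be, errp) < 0) {
        return -1;
    }
    if (iommufd_backend_alloc_ioas(be, &ioas_id, errp) < 0) {
        iommufd_backend_disconnect(be);
        return -1;
    }
    ret = iommufd_backend_map_dma(be, ioas_id, iova, size, vaddr,
                                  /* readonly */ false);
    if (!ret) {
        ret = iommufd_backend_unmap_dma(be, ioas_id, iova, size);
    }
    iommufd_backend_free_id(be, ioas_id);
    iommufd_backend_disconnect(be);
    return ret;
}
```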
+bool iommufd_backend_get_dirty_bitmap(IOMMUFDBackend *be, uint32_t hwpt_id, + uint64_t iova, ram_addr_t size, + uint64_t page_size, uint64_t *data, + Error **errp); +int iommufd_backend_invalidate_cache(IOMMUFDBackend *be, uint32_t hwpt_id, + uint32_t data_type, uint32_t entry_len, + uint32_t *entry_num, void *data_ptr); +struct IOMMUFDViommu *iommufd_backend_alloc_viommu(IOMMUFDBackend *be, + uint32_t dev_id, + uint32_t viommu_type, + uint32_t hwpt_id); +int iommufd_viommu_invalidate_cache(IOMMUFDBackend *be, uint32_t viommu_id, + uint32_t data_type, uint32_t entry_len, + uint32_t *entry_num, void *data_ptr); + +#define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd" +OBJECT_DECLARE_TYPE(HostIOMMUDeviceIOMMUFD, HostIOMMUDeviceIOMMUFDClass, + HOST_IOMMU_DEVICE_IOMMUFD) + +/* Abstract of host IOMMU device with iommufd backend */ +struct HostIOMMUDeviceIOMMUFD { + HostIOMMUDevice parent_obj; + + IOMMUFDBackend *iommufd; + uint32_t devid; + uint32_t ioas_id; +}; + +struct HostIOMMUDeviceIOMMUFDClass { + HostIOMMUDeviceClass parent_class; + + /** + * @attach_hwpt: attach host IOMMU device to IOMMUFD hardware page table. + * VFIO and VDPA devices can have different implementations. + * + * Mandatory callback. + * + * @idev: host IOMMU device backed by IOMMUFD backend. + * + * @hwpt_id: ID of IOMMUFD hardware page table. + * + * @errp: pass an Error out when attachment fails. + * + * Returns: true on success, false on failure. + */ + bool (*attach_hwpt)(HostIOMMUDeviceIOMMUFD *idev, uint32_t hwpt_id, + Error **errp); + /** + * @detach_hwpt: detach host IOMMU device from IOMMUFD hardware page table. + * VFIO and VDPA devices can have different implementations. + * + * Mandatory callback. + * + * @idev: host IOMMU device backed by IOMMUFD backend. + * + * @errp: pass an Error out when detachment fails. + * + * Returns: true on success, false on failure.
+ */ + bool (*detach_hwpt)(HostIOMMUDeviceIOMMUFD *idev, Error **errp); +}; + +bool host_iommu_device_iommufd_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + uint32_t hwpt_id, Error **errp); +bool host_iommu_device_iommufd_detach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + Error **errp); + +typedef struct IOMMUFDVdev { + HostIOMMUDeviceIOMMUFD *idev; + IOMMUFDViommu *viommu; + uint32_t vdev_id; + uint64_t virt_id; +} IOMMUFDVdev; + +struct IOMMUFDVdev *iommufd_backend_alloc_vdev(HostIOMMUDeviceIOMMUFD *idev, + IOMMUFDViommu *viommu, + uint64_t virt_id); +#endif diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index d6148781642107de0995af400331bbde126791b8..098257e72f08dff9155d36fe39df1a5786c2fa7c 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -19,6 +19,7 @@ #include "exec/memattrs.h" #include "qemu/accel.h" #include "qom/object.h" +#include "linux-headers/linux/kvm.h" #ifdef NEED_CPU_H # ifdef CONFIG_KVM @@ -32,6 +33,7 @@ #ifdef CONFIG_KVM_IS_POSSIBLE extern bool kvm_allowed; +extern bool virtcca_cvm_allowed; extern bool kvm_kernel_irqchip; extern bool kvm_split_irqchip; extern bool kvm_async_interrupts_allowed; @@ -42,8 +44,11 @@ extern bool kvm_gsi_routing_allowed; extern bool kvm_gsi_direct_mapping; extern bool kvm_readonly_mem_allowed; extern bool kvm_msi_use_devid; +extern bool kvm_csv3_allowed; #define kvm_enabled() (kvm_allowed) +#define virtcca_cvm_enabled() (virtcca_cvm_allowed) +#define VIRTCCA_CVM_TYPE (1UL << 8) /** * kvm_irqchip_in_kernel: * @@ -143,9 +148,25 @@ extern bool kvm_msi_use_devid; */ #define kvm_msi_devid_required() (kvm_msi_use_devid) +/** + * kvm_csv3_enabled: + * Returns: true if CSV3 feature is used for the VM. + */ +#define kvm_csv3_enabled() (kvm_csv3_allowed) + +/** + * kvm_csv3_should_set_priv_mem: + * Returns: true if we should explicitly request + * KVM_CSV3_SET_GUEST_PRIVATE_MEMORY. + */ +#define kvm_csv3_should_set_priv_mem() \ + (kvm_hygon_coco_ext_inuse & KVM_CAP_HYGON_COCO_EXT_CSV3_SET_PRIV_MEM) + #else #define kvm_enabled() (0) +#define virtcca_cvm_enabled() (0) +#define VIRTCCA_CVM_TYPE (0) #define kvm_irqchip_in_kernel() (false) #define kvm_irqchip_is_split() (false) #define kvm_async_interrupts_enabled() (false) @@ -157,6 +178,8 @@ extern bool kvm_msi_use_devid; #define kvm_gsi_direct_mapping() (false) #define kvm_readonly_mem_enabled() (false) #define kvm_msi_devid_required() (false) +#define kvm_csv3_enabled() (false) +#define kvm_csv3_should_set_priv_mem() (false) #endif /* CONFIG_KVM_IS_POSSIBLE */ @@ -206,6 +229,14 @@ int kvm_has_gsi_routing(void); */ bool kvm_arm_supports_user_irq(void); +/* + * The default HDBSS size. The value ranges [0, 9]. + * Set to 0 to disable the HDBSS feature. + */ +#define DEFAULT_HDBSS_BUFFER_SIZE 0 +#define MAX_HDBSS_BUFFER_SIZE 9 + +void kvm_update_hdbss_cap(bool enable, int hdbss_buffer_size); int kvm_on_sigbus_vcpu(CPUState *cpu, int code, void *addr); int kvm_on_sigbus(int code, void *addr); @@ -313,6 +344,31 @@ int kvm_create_device(KVMState *s, uint64_t type, bool test); */ bool kvm_device_supported(int vmfd, uint64_t type); +/** + * kvm_create_vcpu - Gets a parked KVM vCPU or creates a KVM vCPU + * @cpu: QOM CPUState object for which KVM vCPU has to be fetched/created. + * + * @returns: 0 when success, errno (<0) when failed. + */ +int kvm_create_vcpu(CPUState *cpu); + +/** + * kvm_park_vcpu - Park QEMU KVM vCPU context + * @cpu: QOM CPUState object for which QEMU KVM vCPU context has to be parked. 
+ * + * @returns: none + */ +void kvm_park_vcpu(CPUState *cpu); + +/** + * kvm_unpark_vcpu - unpark QEMU KVM vCPU context + * @s: KVM State + * @vcpu_id: Architecture vCPU ID of the parked vCPU + * + * @returns: KVM fd + */ +int kvm_unpark_vcpu(KVMState *s, unsigned long vcpu_id); + /* Arch specific hooks */ extern const KVMCapabilityInfo kvm_arch_required_capabilities[]; @@ -453,7 +509,7 @@ void kvm_init_cpu_signals(CPUState *cpu); * @return: virq (>=0) when success, errno (<0) when failed. */ int kvm_irqchip_add_msi_route(KVMRouteChange *c, int vector, PCIDevice *dev); -int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg, +int kvm_irqchip_update_msi_route(KVMRouteChange *c, int virq, MSIMessage msg, PCIDevice *dev); void kvm_irqchip_commit_routes(KVMState *s); @@ -490,6 +546,8 @@ bool kvm_kernel_irqchip_allowed(void); bool kvm_kernel_irqchip_required(void); bool kvm_kernel_irqchip_split(void); +bool kvm_smccc_filter_enabled(void); + /** * kvm_arch_irqchip_create: * @KVMState: The KVMState pointer @@ -538,4 +596,12 @@ bool kvm_arch_cpu_check_are_resettable(void); bool kvm_dirty_ring_enabled(void); uint32_t kvm_dirty_ring_size(void); + +int kvm_load_user_data(hwaddr loader_start, hwaddr image_end, hwaddr initrd_start, hwaddr dtb_end, hwaddr ram_size, + struct kvm_numa_info *numa_info); + +#ifdef __aarch64__ +int kvm_create_shadow_device(PCIDevice *dev); +int kvm_delete_shadow_device(PCIDevice *dev); +#endif #endif diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h index fd846394be104b9b56b8f4c0b7b56305ac486a9a..b2d2c5947795868a2c5e003b3f7bd45c6ef4372a 100644 --- a/include/sysemu/kvm_int.h +++ b/include/sysemu/kvm_int.h @@ -112,6 +112,7 @@ struct KVMState uint64_t kvm_dirty_ring_bytes; /* Size of the per-vcpu dirty ring */ uint32_t kvm_dirty_ring_size; /* Number of dirty GFNs per ring */ bool kvm_dirty_ring_with_bitmap; + bool kvm_smccc_filter_enabled; uint64_t kvm_eager_split_size; /* Eager Page Splitting chunk size */ struct KVMDirtyRingReaper reaper; NotifyVmexitOption notify_vmexit; diff --git a/include/sysemu/rtc.h b/include/sysemu/rtc.h index 0fc8ad6fdf1e38ca0c7d1488b7899797c18bb1aa..3edae762d46a64ceb1bc06b5b27235ce38a07b14 100644 --- a/include/sysemu/rtc.h +++ b/include/sysemu/rtc.h @@ -54,5 +54,7 @@ void qemu_get_timedate(struct tm *tm, time_t offset); * then this function will return 3600. 
*/ time_t qemu_timedate_diff(struct tm *tm); - +time_t get_rtc_date_diff(void); +void set_rtc_date_diff(time_t diff); +int64_t qmp_query_rtc_date_diff(Error **errp); #endif diff --git a/include/tcg/startup.h b/include/tcg/startup.h index f71305765c0291138a2b01ab6605dfa12e0cd783..c6cb1d92a76b36ef059368f9606f28eca256f30f 100644 --- a/include/tcg/startup.h +++ b/include/tcg/startup.h @@ -45,6 +45,11 @@ void tcg_init(size_t tb_size, int splitwx, unsigned max_cpus); */ void tcg_register_thread(void); +/** + * tcg_unregister_thread: Unregister this thread with the TCG runtime + */ +void tcg_unregister_thread(void); + /** * tcg_prologue_init(): Generate the code for the TCG prologue * diff --git a/include/ui/rect.h b/include/ui/rect.h index 94898f92d0d4e893d4590b1e6d41bf51d07a13d9..68f05d78a8e6b0cadc80c229dc2a1fce889e1adf 100644 --- a/include/ui/rect.h +++ b/include/ui/rect.h @@ -19,7 +19,7 @@ static inline void qemu_rect_init(QemuRect *rect, uint16_t width, uint16_t height) { rect->x = x; - rect->y = x; + rect->y = y; rect->width = width; rect->height = height; } diff --git a/io/channel-tls.c b/io/channel-tls.c index 58fe1aceeeafd473dfb3fde267f18eccee6bdb65..a8ad89c3d11053c44633c68d78f56d77854225dc 100644 --- a/io/channel-tls.c +++ b/io/channel-tls.c @@ -69,37 +69,40 @@ qio_channel_tls_new_server(QIOChannel *master, const char *aclname, Error **errp) { - QIOChannelTLS *ioc; + QIOChannelTLS *tioc; + QIOChannel *ioc; - ioc = QIO_CHANNEL_TLS(object_new(TYPE_QIO_CHANNEL_TLS)); + tioc = QIO_CHANNEL_TLS(object_new(TYPE_QIO_CHANNEL_TLS)); + ioc = QIO_CHANNEL(tioc); - ioc->master = master; + tioc->master = master; + ioc->follow_coroutine_ctx = master->follow_coroutine_ctx; if (qio_channel_has_feature(master, QIO_CHANNEL_FEATURE_SHUTDOWN)) { - qio_channel_set_feature(QIO_CHANNEL(ioc), QIO_CHANNEL_FEATURE_SHUTDOWN); + qio_channel_set_feature(ioc, QIO_CHANNEL_FEATURE_SHUTDOWN); } object_ref(OBJECT(master)); - ioc->session = qcrypto_tls_session_new( + tioc->session = qcrypto_tls_session_new( creds, NULL, aclname, QCRYPTO_TLS_CREDS_ENDPOINT_SERVER, errp); - if (!ioc->session) { + if (!tioc->session) { goto error; } qcrypto_tls_session_set_callbacks( - ioc->session, + tioc->session, qio_channel_tls_write_handler, qio_channel_tls_read_handler, - ioc); + tioc); - trace_qio_channel_tls_new_server(ioc, master, creds, aclname); - return ioc; + trace_qio_channel_tls_new_server(tioc, master, creds, aclname); + return tioc; error: - object_unref(OBJECT(ioc)); + object_unref(OBJECT(tioc)); return NULL; } @@ -116,6 +119,7 @@ qio_channel_tls_new_client(QIOChannel *master, ioc = QIO_CHANNEL(tioc); tioc->master = master; + ioc->follow_coroutine_ctx = master->follow_coroutine_ctx; if (qio_channel_has_feature(master, QIO_CHANNEL_FEATURE_SHUTDOWN)) { qio_channel_set_feature(ioc, QIO_CHANNEL_FEATURE_SHUTDOWN); } diff --git a/io/channel-websock.c b/io/channel-websock.c index a12acc27cf25d312564c316469949482cb7020d6..de39f0d182d2610162c9e9a13c909b51e3d70518 100644 --- a/io/channel-websock.c +++ b/io/channel-websock.c @@ -883,6 +883,7 @@ qio_channel_websock_new_server(QIOChannel *master) ioc = QIO_CHANNEL(wioc); wioc->master = master; + ioc->follow_coroutine_ctx = master->follow_coroutine_ctx; if (qio_channel_has_feature(master, QIO_CHANNEL_FEATURE_SHUTDOWN)) { qio_channel_set_feature(ioc, QIO_CHANNEL_FEATURE_SHUTDOWN); } diff --git a/io/trace-events index 3cc5cf1efdf398b25701f3ea3aba61c1b6c6b435..79e1a19af7b70ce4870528c92b15f983066029a3 100644 --- a/io/trace-events +++ b/io/trace-events @@ -38,7 +38,7 @@
qio_channel_file_new_path(void *ioc, const char *path, int flags, int mode, int # channel-tls.c qio_channel_tls_new_client(void *ioc, void *master, void *creds, const char *hostname) "TLS new client ioc=%p master=%p creds=%p hostname=%s" -qio_channel_tls_new_server(void *ioc, void *master, void *creds, const char *aclname) "TLS new client ioc=%p master=%p creds=%p acltname=%s" +qio_channel_tls_new_server(void *ioc, void *master, void *creds, const char *aclname) "TLS new client ioc=%p master=%p creds=%p aclname=%s" qio_channel_tls_handshake_start(void *ioc) "TLS handshake start ioc=%p" qio_channel_tls_handshake_pending(void *ioc, int status) "TLS handshake pending ioc=%p status=%d" qio_channel_tls_handshake_fail(void *ioc) "TLS handshake fail ioc=%p" diff --git a/linux-headers/asm-arm64/kvm.h b/linux-headers/asm-arm64/kvm.h index 38e5957526c26a124df7d99e79a7cbfed6b25f6d..552fdcb18f290e4e7a2217ba03a8dc73a5463674 100644 --- a/linux-headers/asm-arm64/kvm.h +++ b/linux-headers/asm-arm64/kvm.h @@ -110,6 +110,7 @@ struct kvm_regs { #define KVM_ARM_VCPU_PTRAUTH_ADDRESS 5 /* VCPU uses address authentication */ #define KVM_ARM_VCPU_PTRAUTH_GENERIC 6 /* VCPU uses generic authentication */ #define KVM_ARM_VCPU_HAS_EL2 7 /* Support nested virtualization */ +#define KVM_ARM_VCPU_TEC 8 /* VCPU TEC state as part of cvm */ struct kvm_vcpu_init { __u32 target; @@ -491,6 +492,109 @@ struct kvm_smccc_filter { #define KVM_HYPERCALL_EXIT_SMC (1U << 0) #define KVM_HYPERCALL_EXIT_16BIT (1U << 1) +/* + * Get feature ID registers userspace writable mask. + * + * From DDI0487J.a, D19.2.66 ("ID_AA64MMFR2_EL1, AArch64 Memory Model + * Feature Register 2"): + * + * "The Feature ID space is defined as the System register space in + * AArch64 with op0==3, op1=={0, 1, 3}, CRn==0, CRm=={0-7}, + * op2=={0-7}." + * + * This covers all currently known R/O registers that indicate + * anything useful feature wise, including the ID registers. + * + * If we ever need to introduce a new range, it will be described as + * such in the range field. 
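+ * + * For instance, ID_AA64PFR0_EL1 (op0==3, op1==0, CRn==0, CRm==4, op2==0) maps to index (0 << 6) | (4 << 3) | 0 == 32 in the mask array, and the three permitted op1 values {0, 1, 3} collapse to {0, 1, 2} under the macro below, which is why the range holds 3 * 8 * 8 entries.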
+ */ +#define KVM_ARM_FEATURE_ID_RANGE_IDX(op0, op1, crn, crm, op2) \ + ({ \ + __u64 __op1 = (op1) & 3; \ + __op1 -= (__op1 == 3); \ + (__op1 << 6 | ((crm) & 7) << 3 | (op2)); \ + }) + +#define KVM_ARM_FEATURE_ID_RANGE 0 +#define KVM_ARM_FEATURE_ID_RANGE_SIZE (3 * 8 * 8) + +struct reg_mask_range { + __u64 addr; /* Pointer to mask array */ + __u32 range; /* Requested range */ + __u32 reserved[13]; +}; + +/* KVM_CAP_ARM_TMM on VM fd */ +#define KVM_CAP_ARM_TMM_CONFIG_CVM 0 +#define KVM_CAP_ARM_TMM_CREATE_RD 1 +#define KVM_CAP_ARM_TMM_POPULATE_CVM 2 +#define KVM_CAP_ARM_TMM_ACTIVATE_CVM 3 + +#define KVM_CAP_ARM_TMM_MEASUREMENT_ALGO_SHA256 0 +#define KVM_CAP_ARM_TMM_MEASUREMENT_ALGO_SHA512 1 + +#define KVM_CAP_ARM_TMM_RPV_SIZE 64 + +/* List of configuration items accepted for KVM_CAP_ARM_TMM_CONFIG_CVM */ +#define KVM_CAP_ARM_TMM_CFG_RPV 0 +#define KVM_CAP_ARM_TMM_CFG_HASH_ALGO 1 +#define KVM_CAP_ARM_TMM_CFG_SVE 2 +#define KVM_CAP_ARM_TMM_CFG_DBG 3 +#define KVM_CAP_ARM_TMM_CFG_PMU 4 +#define KVM_CAP_ARM_TMM_CFG_KAE 5 + +#define KVM_ARM_TMM_MAX_KAE_VF_NUM 11 + +struct kvm_cap_arm_tmm_config_item { + __u32 cfg; + union { + /* cfg == KVM_CAP_ARM_TMM_CFG_RPV */ + struct { + __u8 rpv[KVM_CAP_ARM_TMM_RPV_SIZE]; + }; + + /* cfg == KVM_CAP_ARM_TMM_CFG_HASH_ALGO */ + struct { + __u32 hash_algo; + }; + + /* cfg == KVM_CAP_ARM_TMM_CFG_SVE */ + struct { + __u32 sve_vq; + }; + + /* cfg == KVM_CAP_ARM_TMM_CFG_DBG */ + struct { + __u32 num_brps; + __u32 num_wrps; + }; + + /* cfg == KVM_CAP_ARM_TMM_CFG_PMU */ + struct { + __u32 num_pmu_cntrs; + }; + + /* cfg == KVM_CAP_ARM_TMM_CFG_KAE */ + struct { + __u32 kae_vf_num; + __u64 sec_addr[KVM_ARM_TMM_MAX_KAE_VF_NUM]; + __u64 hpre_addr[KVM_ARM_TMM_MAX_KAE_VF_NUM]; + }; + /* Fix the size of the union */ + __u8 reserved[256]; + }; +}; + +#define KVM_ARM_TMM_POPULATE_FLAGS_MEASURE (1U << 0) +struct kvm_cap_arm_tmm_populate_region_args { + __u64 populate_ipa_base1; + __u64 populate_ipa_size1; + __u64 populate_ipa_base2; + __u64 populate_ipa_size2; + __u32 flags; + __u32 reserved[3]; +}; + #endif #endif /* __ARM_KVM_H__ */ diff --git a/linux-headers/asm-generic/unistd.h b/linux-headers/asm-generic/unistd.h index abe087c53b4b04348431716c1cb638b99d5a788a..756b013fb8324bd7a320e60cebec2ca692faa149 100644 --- a/linux-headers/asm-generic/unistd.h +++ b/linux-headers/asm-generic/unistd.h @@ -71,7 +71,7 @@ __SYSCALL(__NR_fremovexattr, sys_fremovexattr) #define __NR_getcwd 17 __SYSCALL(__NR_getcwd, sys_getcwd) #define __NR_lookup_dcookie 18 -__SC_COMP(__NR_lookup_dcookie, sys_lookup_dcookie, compat_sys_lookup_dcookie) +__SYSCALL(__NR_lookup_dcookie, sys_ni_syscall) #define __NR_eventfd2 19 __SYSCALL(__NR_eventfd2, sys_eventfd2) #define __NR_epoll_create1 20 @@ -816,15 +816,21 @@ __SYSCALL(__NR_process_mrelease, sys_process_mrelease) __SYSCALL(__NR_futex_waitv, sys_futex_waitv) #define __NR_set_mempolicy_home_node 450 __SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node) - #define __NR_cachestat 451 __SYSCALL(__NR_cachestat, sys_cachestat) - #define __NR_fchmodat2 452 __SYSCALL(__NR_fchmodat2, sys_fchmodat2) +#define __NR_map_shadow_stack 453 +__SYSCALL(__NR_map_shadow_stack, sys_map_shadow_stack) +#define __NR_futex_wake 454 +__SYSCALL(__NR_futex_wake, sys_futex_wake) +#define __NR_futex_wait 455 +__SYSCALL(__NR_futex_wait, sys_futex_wait) +#define __NR_futex_requeue 456 +__SYSCALL(__NR_futex_requeue, sys_futex_requeue) #undef __NR_syscalls -#define __NR_syscalls 453 +#define __NR_syscalls 457 /* * 32 bit systems traditionally used different diff --git
a/linux-headers/asm-loongarch/bitsperlong.h b/linux-headers/asm-loongarch/bitsperlong.h new file mode 100644 index 0000000000000000000000000000000000000000..6dc0bb0c13b29dd814f403f2fd4efb3b36be0619 --- /dev/null +++ b/linux-headers/asm-loongarch/bitsperlong.h @@ -0,0 +1 @@ +#include <asm-generic/bitsperlong.h> diff --git a/linux-headers/asm-loongarch/kvm.h b/linux-headers/asm-loongarch/kvm.h new file mode 100644 index 0000000000000000000000000000000000000000..d22b19e134fb993c19e2484a2172cd5843cb4e5c --- /dev/null +++ b/linux-headers/asm-loongarch/kvm.h @@ -0,0 +1,156 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright (C) 2020-2023 Loongson Technology Corporation Limited + */ + +#ifndef __UAPI_ASM_LOONGARCH_KVM_H +#define __UAPI_ASM_LOONGARCH_KVM_H + +#include <linux/types.h> + +/* + * KVM LoongArch specific structures and definitions. + * + * Some parts derived from the x86 version of this file. + */ + +#define __KVM_HAVE_READONLY_MEM +#define __KVM_HAVE_GUEST_DEBUG + +#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 +#define KVM_DIRTY_LOG_PAGE_OFFSET 64 +#define __KVM_HAVE_IRQ_LINE + +#define KVM_GUESTDBG_USE_SW_BP 0x00010000 +/* + * for KVM_GET_REGS and KVM_SET_REGS + */ +struct kvm_regs { + /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ + __u64 gpr[32]; + __u64 pc; +}; + +/* + * for KVM_GET_FPU and KVM_SET_FPU + */ +struct kvm_fpu { + __u32 fcsr; + __u64 fcc; /* 8x8 */ + struct kvm_fpureg { + __u64 val64[4]; + } fpr[32]; +}; + +/* + * For LoongArch, we use KVM_SET_ONE_REG and KVM_GET_ONE_REG to access various + * registers. The id field is broken down as follows: + * + * bits[63..52] - As per linux/kvm.h + * bits[51..32] - Must be zero. + * bits[31..16] - Register set. + * + * Register set = 0: GP registers from kvm_regs (see definitions below). + * + * Register set = 1: CSR registers. + * + * Register set = 2: KVM specific registers (see definitions below). + * + * Register set = 3: FPU / SIMD registers (see definitions below). + * + * Other register sets may be added in the future. Each set would + * have its own identifier in bits[31..16]. + */ + +#define KVM_REG_LOONGARCH_GPR (KVM_REG_LOONGARCH | 0x00000ULL) +#define KVM_REG_LOONGARCH_CSR (KVM_REG_LOONGARCH | 0x10000ULL) +#define KVM_REG_LOONGARCH_KVM (KVM_REG_LOONGARCH | 0x20000ULL) +#define KVM_REG_LOONGARCH_FPSIMD (KVM_REG_LOONGARCH | 0x30000ULL) +#define KVM_REG_LOONGARCH_CPUCFG (KVM_REG_LOONGARCH | 0x40000ULL) +#define KVM_REG_LOONGARCH_LBT (KVM_REG_LOONGARCH | 0x50000ULL) +#define KVM_REG_LOONGARCH_MASK (KVM_REG_LOONGARCH | 0x70000ULL) +#define KVM_CSR_IDX_MASK 0x7fff +#define KVM_CPUCFG_IDX_MASK 0x7fff + +/* + * KVM_REG_LOONGARCH_KVM - KVM specific control registers.
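+ * + * As a worked example of the encoding (a sketch, not normative): the counter register defined below is accessed through KVM_GET_ONE_REG/KVM_SET_ONE_REG with id KVM_REG_LOONGARCH_KVM | KVM_REG_SIZE_U64 | 1, i.e. register set 2 in bits[31..16], the 64-bit size code from linux/kvm.h, and register number 1.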
+ */ + +#define KVM_REG_LOONGARCH_COUNTER (KVM_REG_LOONGARCH_KVM | KVM_REG_SIZE_U64 | 1) +#define KVM_REG_LOONGARCH_VCPU_RESET (KVM_REG_LOONGARCH_KVM | KVM_REG_SIZE_U64 | 2) +/* Debugging: Special instruction for software breakpoint */ +#define KVM_REG_LOONGARCH_DEBUG_INST (KVM_REG_LOONGARCH_KVM | KVM_REG_SIZE_U64 | 3) + +/* LBT registers */ +#define KVM_REG_LOONGARCH_LBT_SCR0 (KVM_REG_LOONGARCH_LBT | KVM_REG_SIZE_U64 | 1) +#define KVM_REG_LOONGARCH_LBT_SCR1 (KVM_REG_LOONGARCH_LBT | KVM_REG_SIZE_U64 | 2) +#define KVM_REG_LOONGARCH_LBT_SCR2 (KVM_REG_LOONGARCH_LBT | KVM_REG_SIZE_U64 | 3) +#define KVM_REG_LOONGARCH_LBT_SCR3 (KVM_REG_LOONGARCH_LBT | KVM_REG_SIZE_U64 | 4) +#define KVM_REG_LOONGARCH_LBT_EFLAGS (KVM_REG_LOONGARCH_LBT | KVM_REG_SIZE_U64 | 5) +#define KVM_REG_LOONGARCH_LBT_FTOP (KVM_REG_LOONGARCH_LBT | KVM_REG_SIZE_U64 | 6) + +#define LOONGARCH_REG_SHIFT 3 +#define LOONGARCH_REG_64(TYPE, REG) (TYPE | KVM_REG_SIZE_U64 | (REG << LOONGARCH_REG_SHIFT)) +#define KVM_IOC_CSRID(REG) LOONGARCH_REG_64(KVM_REG_LOONGARCH_CSR, REG) +#define KVM_IOC_CPUCFG(REG) LOONGARCH_REG_64(KVM_REG_LOONGARCH_CPUCFG, REG) + +/* Device Control API on vm fd */ +#define KVM_LOONGARCH_VM_FEAT_CTRL 0 +#define KVM_LOONGARCH_VM_FEAT_LSX 0 +#define KVM_LOONGARCH_VM_FEAT_LASX 1 +#define KVM_LOONGARCH_VM_FEAT_X86BT 2 +#define KVM_LOONGARCH_VM_FEAT_ARMBT 3 +#define KVM_LOONGARCH_VM_FEAT_MIPSBT 4 +#define KVM_LOONGARCH_VM_FEAT_PMU 5 +#define KVM_LOONGARCH_VM_FEAT_PV_IPI 6 +#define KVM_LOONGARCH_VM_FEAT_PV_STEALTIME 7 + +/* Device Control API on vcpu fd */ +#define KVM_LOONGARCH_VCPU_CPUCFG 0 +#define KVM_LOONGARCH_VCPU_PVTIME_CTRL 1 +#define KVM_LOONGARCH_VCPU_PVTIME_GPA 0 + +struct kvm_debug_exit_arch { +}; + +/* for KVM_SET_GUEST_DEBUG */ +struct kvm_guest_debug_arch { +}; + +/* definition of registers in kvm_run */ +struct kvm_sync_regs { +}; + +/* dummy definition */ +struct kvm_sregs { +}; + +struct kvm_iocsr_entry { + __u32 addr; + __u32 pad; + __u64 data; +}; + +#define KVM_NR_IRQCHIPS 1 +#define KVM_IRQCHIP_NUM_PINS 64 +#define KVM_MAX_CORES 256 + +#define KVM_DEV_LOONGARCH_IPI_GRP_REGS 0x40000001 + +#define KVM_DEV_LOONGARCH_EXTIOI_GRP_REGS 0x40000002 + +#define KVM_DEV_LOONGARCH_EXTIOI_GRP_SW_STATUS 0x40000003 +#define KVM_DEV_LOONGARCH_EXTIOI_SW_STATUS_NUM_CPU 0x0 +#define KVM_DEV_LOONGARCH_EXTIOI_SW_STATUS_FEATURE 0x1 +#define KVM_DEV_LOONGARCH_EXTIOI_SW_STATUS_STATE 0x2 + +#define KVM_DEV_LOONGARCH_EXTIOI_GRP_CTRL 0x40000004 +#define KVM_DEV_LOONGARCH_EXTIOI_CTRL_INIT_NUM_CPU 0x0 +#define KVM_DEV_LOONGARCH_EXTIOI_CTRL_INIT_FEATURE 0x1 +#define KVM_DEV_LOONGARCH_EXTIOI_CTRL_LOAD_FINISHED 0x3 + +#define KVM_DEV_LOONGARCH_PCH_PIC_GRP_REGS 0x40000005 +#define KVM_DEV_LOONGARCH_PCH_PIC_GRP_CTRL 0x40000006 +#define KVM_DEV_LOONGARCH_PCH_PIC_CTRL_INIT 0 + +#endif /* __UAPI_ASM_LOONGARCH_KVM_H */ diff --git a/linux-headers/asm-loongarch/mman.h b/linux-headers/asm-loongarch/mman.h new file mode 100644 index 0000000000000000000000000000000000000000..8eebf89f5ab17884a98543f3b37a3b710355083b --- /dev/null +++ b/linux-headers/asm-loongarch/mman.h @@ -0,0 +1 @@ +#include <asm-generic/mman.h> diff --git a/linux-headers/asm-loongarch/unistd.h b/linux-headers/asm-loongarch/unistd.h new file mode 100644 index 0000000000000000000000000000000000000000..b344b1f917153b6d70ab7d434f05598b116d3642 --- /dev/null +++ b/linux-headers/asm-loongarch/unistd.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#define __ARCH_WANT_NEW_STAT +#define __ARCH_WANT_SYS_CLONE +#define __ARCH_WANT_SYS_CLONE3 + +#include <asm-generic/unistd.h> diff --git
a/linux-headers/asm-mips/unistd_n32.h b/linux-headers/asm-mips/unistd_n32.h index 46d8500654c3c0648868c8d5d2a355acbd90dd97..994b6f008f5414d5f1cb6f6d08c5e5298bb70261 100644 --- a/linux-headers/asm-mips/unistd_n32.h +++ b/linux-headers/asm-mips/unistd_n32.h @@ -381,5 +381,9 @@ #define __NR_set_mempolicy_home_node (__NR_Linux + 450) #define __NR_cachestat (__NR_Linux + 451) #define __NR_fchmodat2 (__NR_Linux + 452) +#define __NR_map_shadow_stack (__NR_Linux + 453) +#define __NR_futex_wake (__NR_Linux + 454) +#define __NR_futex_wait (__NR_Linux + 455) +#define __NR_futex_requeue (__NR_Linux + 456) #endif /* _ASM_UNISTD_N32_H */ diff --git a/linux-headers/asm-mips/unistd_n64.h b/linux-headers/asm-mips/unistd_n64.h index c2f7ac673bb551c144d09e8e64b174995efec8c4..41dcf5877a1b80bbb01d20e8f3b77e16d9803d6d 100644 --- a/linux-headers/asm-mips/unistd_n64.h +++ b/linux-headers/asm-mips/unistd_n64.h @@ -357,5 +357,9 @@ #define __NR_set_mempolicy_home_node (__NR_Linux + 450) #define __NR_cachestat (__NR_Linux + 451) #define __NR_fchmodat2 (__NR_Linux + 452) +#define __NR_map_shadow_stack (__NR_Linux + 453) +#define __NR_futex_wake (__NR_Linux + 454) +#define __NR_futex_wait (__NR_Linux + 455) +#define __NR_futex_requeue (__NR_Linux + 456) #endif /* _ASM_UNISTD_N64_H */ diff --git a/linux-headers/asm-mips/unistd_o32.h b/linux-headers/asm-mips/unistd_o32.h index 757c68f2add872eb6a2ed66aa51229f7950b4530..ae9d334d96e3444c4eb39f924f5fec183100a094 100644 --- a/linux-headers/asm-mips/unistd_o32.h +++ b/linux-headers/asm-mips/unistd_o32.h @@ -427,5 +427,9 @@ #define __NR_set_mempolicy_home_node (__NR_Linux + 450) #define __NR_cachestat (__NR_Linux + 451) #define __NR_fchmodat2 (__NR_Linux + 452) +#define __NR_map_shadow_stack (__NR_Linux + 453) +#define __NR_futex_wake (__NR_Linux + 454) +#define __NR_futex_wait (__NR_Linux + 455) +#define __NR_futex_requeue (__NR_Linux + 456) #endif /* _ASM_UNISTD_O32_H */ diff --git a/linux-headers/asm-powerpc/unistd_32.h b/linux-headers/asm-powerpc/unistd_32.h index 8ef94bbac13839ce53ea6c6b79c6886f29055030..b9b23d66d7d9ae5f052f70577c38d573f3225ee9 100644 --- a/linux-headers/asm-powerpc/unistd_32.h +++ b/linux-headers/asm-powerpc/unistd_32.h @@ -434,6 +434,10 @@ #define __NR_set_mempolicy_home_node 450 #define __NR_cachestat 451 #define __NR_fchmodat2 452 +#define __NR_map_shadow_stack 453 +#define __NR_futex_wake 454 +#define __NR_futex_wait 455 +#define __NR_futex_requeue 456 #endif /* _ASM_UNISTD_32_H */ diff --git a/linux-headers/asm-powerpc/unistd_64.h b/linux-headers/asm-powerpc/unistd_64.h index 0e7ee43e884fdded1759437a3efad8da9751467d..cbb4b3e8f7c2f1fa9c572b4c37bdd99637c8925b 100644 --- a/linux-headers/asm-powerpc/unistd_64.h +++ b/linux-headers/asm-powerpc/unistd_64.h @@ -406,6 +406,10 @@ #define __NR_set_mempolicy_home_node 450 #define __NR_cachestat 451 #define __NR_fchmodat2 452 +#define __NR_map_shadow_stack 453 +#define __NR_futex_wake 454 +#define __NR_futex_wait 455 +#define __NR_futex_requeue 456 #endif /* _ASM_UNISTD_64_H */ diff --git a/linux-headers/asm-riscv/kvm.h b/linux-headers/asm-riscv/kvm.h index 992c5e407104958532d7ba930d50447ebe56fa25..60d3b21dead7d8846050d20a96ef1a0b3ad1ba20 100644 --- a/linux-headers/asm-riscv/kvm.h +++ b/linux-headers/asm-riscv/kvm.h @@ -80,6 +80,7 @@ struct kvm_riscv_csr { unsigned long sip; unsigned long satp; unsigned long scounteren; + unsigned long senvcfg; }; /* AIA CSR registers for KVM_GET_ONE_REG and KVM_SET_ONE_REG */ @@ -93,6 +94,11 @@ struct kvm_riscv_aia_csr { unsigned long iprio2h; }; +/* Smstateen CSR for 
KVM_GET_ONE_REG and KVM_SET_ONE_REG */ +struct kvm_riscv_smstateen_csr { + unsigned long sstateen0; +}; + /* TIMER registers for KVM_GET_ONE_REG and KVM_SET_ONE_REG */ struct kvm_riscv_timer { __u64 frequency; @@ -131,6 +137,8 @@ enum KVM_RISCV_ISA_EXT_ID { KVM_RISCV_ISA_EXT_ZICSR, KVM_RISCV_ISA_EXT_ZIFENCEI, KVM_RISCV_ISA_EXT_ZIHPM, + KVM_RISCV_ISA_EXT_SMSTATEEN, + KVM_RISCV_ISA_EXT_ZICOND, KVM_RISCV_ISA_EXT_MAX, }; @@ -148,6 +156,7 @@ enum KVM_RISCV_SBI_EXT_ID { KVM_RISCV_SBI_EXT_PMU, KVM_RISCV_SBI_EXT_EXPERIMENTAL, KVM_RISCV_SBI_EXT_VENDOR, + KVM_RISCV_SBI_EXT_DBCN, KVM_RISCV_SBI_EXT_MAX, }; @@ -178,10 +187,13 @@ enum KVM_RISCV_SBI_EXT_ID { #define KVM_REG_RISCV_CSR (0x03 << KVM_REG_RISCV_TYPE_SHIFT) #define KVM_REG_RISCV_CSR_GENERAL (0x0 << KVM_REG_RISCV_SUBTYPE_SHIFT) #define KVM_REG_RISCV_CSR_AIA (0x1 << KVM_REG_RISCV_SUBTYPE_SHIFT) +#define KVM_REG_RISCV_CSR_SMSTATEEN (0x2 << KVM_REG_RISCV_SUBTYPE_SHIFT) #define KVM_REG_RISCV_CSR_REG(name) \ (offsetof(struct kvm_riscv_csr, name) / sizeof(unsigned long)) #define KVM_REG_RISCV_CSR_AIA_REG(name) \ (offsetof(struct kvm_riscv_aia_csr, name) / sizeof(unsigned long)) +#define KVM_REG_RISCV_CSR_SMSTATEEN_REG(name) \ + (offsetof(struct kvm_riscv_smstateen_csr, name) / sizeof(unsigned long)) /* Timer registers are mapped as type 4 */ #define KVM_REG_RISCV_TIMER (0x04 << KVM_REG_RISCV_TYPE_SHIFT) diff --git a/linux-headers/asm-s390/unistd_32.h b/linux-headers/asm-s390/unistd_32.h index 716fa368ca711658c7fa54fa815b370f34089fd7..c093e6d5f9111ae5be58b0702251e19ecffc98cc 100644 --- a/linux-headers/asm-s390/unistd_32.h +++ b/linux-headers/asm-s390/unistd_32.h @@ -425,5 +425,9 @@ #define __NR_set_mempolicy_home_node 450 #define __NR_cachestat 451 #define __NR_fchmodat2 452 +#define __NR_map_shadow_stack 453 +#define __NR_futex_wake 454 +#define __NR_futex_wait 455 +#define __NR_futex_requeue 456 #endif /* _ASM_S390_UNISTD_32_H */ diff --git a/linux-headers/asm-s390/unistd_64.h b/linux-headers/asm-s390/unistd_64.h index b2a11b1d139f0787b3b7201a46326b6f821278f4..114c0569a49aa5e8a50f9834e2487f0d9abc12b7 100644 --- a/linux-headers/asm-s390/unistd_64.h +++ b/linux-headers/asm-s390/unistd_64.h @@ -373,5 +373,9 @@ #define __NR_set_mempolicy_home_node 450 #define __NR_cachestat 451 #define __NR_fchmodat2 452 +#define __NR_map_shadow_stack 453 +#define __NR_futex_wake 454 +#define __NR_futex_wait 455 +#define __NR_futex_requeue 456 #endif /* _ASM_S390_UNISTD_64_H */ diff --git a/linux-headers/asm-x86/unistd_32.h b/linux-headers/asm-x86/unistd_32.h index d749ad1c24ec2dfee13a0665d781e03c235465cf..329649c377be129711ce0f9c76433ace56df4706 100644 --- a/linux-headers/asm-x86/unistd_32.h +++ b/linux-headers/asm-x86/unistd_32.h @@ -443,6 +443,10 @@ #define __NR_set_mempolicy_home_node 450 #define __NR_cachestat 451 #define __NR_fchmodat2 452 +#define __NR_map_shadow_stack 453 +#define __NR_futex_wake 454 +#define __NR_futex_wait 455 +#define __NR_futex_requeue 456 #endif /* _ASM_UNISTD_32_H */ diff --git a/linux-headers/asm-x86/unistd_64.h b/linux-headers/asm-x86/unistd_64.h index cea67282ebfeadf28cb631222e535ab1ef8c520c..4583606ce68420d75bccf5c411da675f860744b3 100644 --- a/linux-headers/asm-x86/unistd_64.h +++ b/linux-headers/asm-x86/unistd_64.h @@ -366,6 +366,9 @@ #define __NR_cachestat 451 #define __NR_fchmodat2 452 #define __NR_map_shadow_stack 453 +#define __NR_futex_wake 454 +#define __NR_futex_wait 455 +#define __NR_futex_requeue 456 #endif /* _ASM_UNISTD_64_H */ diff --git a/linux-headers/asm-x86/unistd_x32.h b/linux-headers/asm-x86/unistd_x32.h index 
5b2e79bf4c4610246dfb9962052825d0be85806d..146d74d8e4b0146d80ead022189149e27441b8a1 100644 --- a/linux-headers/asm-x86/unistd_x32.h +++ b/linux-headers/asm-x86/unistd_x32.h @@ -318,6 +318,9 @@ #define __NR_set_mempolicy_home_node (__X32_SYSCALL_BIT + 450) #define __NR_cachestat (__X32_SYSCALL_BIT + 451) #define __NR_fchmodat2 (__X32_SYSCALL_BIT + 452) +#define __NR_futex_wake (__X32_SYSCALL_BIT + 454) +#define __NR_futex_wait (__X32_SYSCALL_BIT + 455) +#define __NR_futex_requeue (__X32_SYSCALL_BIT + 456) #define __NR_rt_sigaction (__X32_SYSCALL_BIT + 512) #define __NR_rt_sigreturn (__X32_SYSCALL_BIT + 513) #define __NR_ioctl (__X32_SYSCALL_BIT + 514) diff --git a/linux-headers/linux/iommufd.h b/linux-headers/linux/iommufd.h index 218bf7ac98d07c589058af8cb1156582d47423db..3e57fee01cc7ec52083e30bfe789d88f39d6751f 100644 --- a/linux-headers/linux/iommufd.h +++ b/linux-headers/linux/iommufd.h @@ -37,16 +37,22 @@ enum { IOMMUFD_CMD_BASE = 0x80, IOMMUFD_CMD_DESTROY = IOMMUFD_CMD_BASE, - IOMMUFD_CMD_IOAS_ALLOC, - IOMMUFD_CMD_IOAS_ALLOW_IOVAS, - IOMMUFD_CMD_IOAS_COPY, - IOMMUFD_CMD_IOAS_IOVA_RANGES, - IOMMUFD_CMD_IOAS_MAP, - IOMMUFD_CMD_IOAS_UNMAP, - IOMMUFD_CMD_OPTION, - IOMMUFD_CMD_VFIO_IOAS, - IOMMUFD_CMD_HWPT_ALLOC, - IOMMUFD_CMD_GET_HW_INFO, + IOMMUFD_CMD_IOAS_ALLOC = 0x81, + IOMMUFD_CMD_IOAS_ALLOW_IOVAS = 0x82, + IOMMUFD_CMD_IOAS_COPY = 0x83, + IOMMUFD_CMD_IOAS_IOVA_RANGES = 0x84, + IOMMUFD_CMD_IOAS_MAP = 0x85, + IOMMUFD_CMD_IOAS_UNMAP = 0x86, + IOMMUFD_CMD_OPTION = 0x87, + IOMMUFD_CMD_VFIO_IOAS = 0x88, + IOMMUFD_CMD_HWPT_ALLOC = 0x89, + IOMMUFD_CMD_GET_HW_INFO = 0x8a, + IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING = 0x8b, + IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP = 0x8c, + IOMMUFD_CMD_HWPT_INVALIDATE = 0x8d, + IOMMUFD_CMD_FAULT_QUEUE_ALLOC = 0x8e, + IOMMUFD_CMD_VIOMMU_ALLOC = 0x90, + IOMMUFD_CMD_VDEVICE_ALLOC = 0x91, }; /** @@ -347,20 +353,116 @@ struct iommu_vfio_ioas { }; #define IOMMU_VFIO_IOAS _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VFIO_IOAS) +/** + * enum iommufd_hwpt_alloc_flags - Flags for HWPT allocation + * @IOMMU_HWPT_ALLOC_NEST_PARENT: If set, allocate a HWPT that can serve as + * the parent HWPT in a nesting configuration. + * @IOMMU_HWPT_ALLOC_DIRTY_TRACKING: Dirty tracking support for device IOMMU is + * enforced on device attachment + * @IOMMU_HWPT_FAULT_ID_VALID: The fault_id field of hwpt allocation data is + * valid. + */ +enum iommufd_hwpt_alloc_flags { + IOMMU_HWPT_ALLOC_NEST_PARENT = 1 << 0, + IOMMU_HWPT_ALLOC_DIRTY_TRACKING = 1 << 1, + IOMMU_HWPT_FAULT_ID_VALID = 1 << 2, +}; + +/** + * enum iommu_hwpt_vtd_s1_flags - Intel VT-d stage-1 page table + * entry attributes + * @IOMMU_VTD_S1_SRE: Supervisor request + * @IOMMU_VTD_S1_EAFE: Extended access enable + * @IOMMU_VTD_S1_WPE: Write protect enable + */ +enum iommu_hwpt_vtd_s1_flags { + IOMMU_VTD_S1_SRE = 1 << 0, + IOMMU_VTD_S1_EAFE = 1 << 1, + IOMMU_VTD_S1_WPE = 1 << 2, +}; + +/** + * struct iommu_hwpt_vtd_s1 - Intel VT-d stage-1 page table + * info (IOMMU_HWPT_DATA_VTD_S1) + * @flags: Combination of enum iommu_hwpt_vtd_s1_flags + * @pgtbl_addr: The base address of the stage-1 page table. + * @addr_width: The address width of the stage-1 page table + * @__reserved: Must be 0 + */ +struct iommu_hwpt_vtd_s1 { + __aligned_u64 flags; + __aligned_u64 pgtbl_addr; + __u32 addr_width; + __u32 __reserved; +}; + +/** + * struct iommu_hwpt_arm_smmuv3 - ARM SMMUv3 nested STE + * (IOMMU_HWPT_DATA_ARM_SMMUV3) + * + * @ste: The first two double words of the user space Stream Table Entry for + * the translation. Must be little-endian. 
+ * Allowed fields: (Refer to "5.2 Stream Table Entry" in SMMUv3 HW Spec) + * - word-0: V, Cfg, S1Fmt, S1ContextPtr, S1CDMax + * - word-1: EATS, S1DSS, S1CIR, S1COR, S1CSH, S1STALLD + * + * -EIO will be returned if @ste is not legal or contains any non-allowed field. + * Cfg can be used to select a S1, Bypass or Abort configuration. A Bypass + * nested domain will translate the same as the nesting parent. The S1 will + * install a Context Descriptor Table pointing at userspace memory translated + * by the nesting parent. + */ +struct iommu_hwpt_arm_smmuv3 { + __aligned_le64 ste[2]; +}; + +/** + * enum iommu_hwpt_data_type - IOMMU HWPT Data Type + * @IOMMU_HWPT_DATA_NONE: no data + * @IOMMU_HWPT_DATA_VTD_S1: Intel VT-d stage-1 page table + * @IOMMU_HWPT_DATA_ARM_SMMUV3: ARM SMMUv3 Context Descriptor Table + */ +enum iommu_hwpt_data_type { + IOMMU_HWPT_DATA_NONE = 0, + IOMMU_HWPT_DATA_VTD_S1 = 1, + IOMMU_HWPT_DATA_ARM_SMMUV3 = 2, +}; + /** * struct iommu_hwpt_alloc - ioctl(IOMMU_HWPT_ALLOC) * @size: sizeof(struct iommu_hwpt_alloc) - * @flags: Must be 0 + * @flags: Combination of enum iommufd_hwpt_alloc_flags * @dev_id: The device to allocate this HWPT for - * @pt_id: The IOAS to connect this HWPT to + * @pt_id: The IOAS or HWPT or vIOMMU to connect this HWPT to * @out_hwpt_id: The ID of the new HWPT * @__reserved: Must be 0 + * @data_type: One of enum iommu_hwpt_data_type + * @data_len: Length of the type specific data + * @data_uptr: User pointer to the type specific data + * @fault_id: The ID of IOMMUFD_FAULT object. Valid only if flags field of + * IOMMU_HWPT_FAULT_ID_VALID is set. + * @__reserved2: Padding to 64-bit alignment. Must be 0. * * Explicitly allocate a hardware page table object. This is the same object * type that is returned by iommufd_device_attach() and represents the * underlying iommu driver's iommu_domain kernel object. * - * A HWPT will be created with the IOVA mappings from the given IOAS. + * A kernel-managed HWPT will be created with the mappings from the given + * IOAS via the @pt_id. The @data_type for this allocation must be set to + * IOMMU_HWPT_DATA_NONE. The HWPT can be allocated as a parent HWPT for a + * nesting configuration by passing IOMMU_HWPT_ALLOC_NEST_PARENT via @flags. + * + * A user-managed nested HWPT will be created from a given vIOMMU (wrapping a + * parent HWPT) or a parent HWPT via @pt_id, in which the parent HWPT must be + * allocated previously via the same ioctl from a given IOAS (@pt_id). In this + * case, the @data_type must be set to a pre-defined type corresponding to an + * I/O page table type supported by the underlying IOMMU hardware. The device + * via @dev_id and the vIOMMU via @pt_id must be associated to the same IOMMU + * instance. + * + * If the @data_type is set to IOMMU_HWPT_DATA_NONE, @data_len and + * @data_uptr should be zero. Otherwise, both @data_len and @data_uptr + * must be given. */ struct iommu_hwpt_alloc { __u32 size; @@ -369,13 +471,28 @@ struct iommu_hwpt_alloc { __u32 pt_id; __u32 out_hwpt_id; __u32 __reserved; + __u32 data_type; + __u32 data_len; + __aligned_u64 data_uptr; + __u32 fault_id; + __u32 __reserved2; }; #define IOMMU_HWPT_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_ALLOC) +/** + * enum iommu_hw_info_vtd_flags - Flags for VT-d hw_info + * @IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17: If set, disallow read-only mappings + * on a nested_parent domain. 
+ https://www.intel.com/content/www/us/en/content-details/772415/content-details.html + */ +enum iommu_hw_info_vtd_flags { + IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17 = 1 << 0, +}; + /** * struct iommu_hw_info_vtd - Intel VT-d hardware information * - * @flags: Must be 0 + * @flags: Combination of enum iommu_hw_info_vtd_flags * @__reserved: Must be 0 * * @cap_reg: Value of Intel VT-d capability register defined in VT-d spec @@ -393,15 +510,72 @@ struct iommu_hw_info_vtd { __aligned_u64 ecap_reg; }; +/** + * struct iommu_hw_info_arm_smmuv3 - ARM SMMUv3 hardware information + * (IOMMU_HW_INFO_TYPE_ARM_SMMUV3) + * + * @flags: Must be set to 0 + * @__reserved: Must be 0 + * @idr: Implemented features for ARM SMMU Non-secure programming interface + * @iidr: Information about the implementation and implementer of ARM SMMU, + * and architecture version supported + * @aidr: ARM SMMU architecture version + * + * For the details of @idr, @iidr and @aidr, please refer to the chapters + * from 6.3.1 to 6.3.6 in the SMMUv3 Spec. + * + * User space should read the underlying ARM SMMUv3 hardware information for + * the list of supported features. + * + * Note that these values reflect the raw HW capability, without any insight if + * any required kernel driver support is present. Bits may be set indicating the + * HW has functionality that is lacking kernel software support, such as BTM. If + * a VMM is using this information to construct emulated copies of these + * registers it should only forward bits that it knows it can support. + * + * In future, presence of required kernel support will be indicated in flags. + */ +struct iommu_hw_info_arm_smmuv3 { + __u32 flags; + __u32 __reserved; + __u32 idr[6]; + __u32 iidr; + __u32 aidr; +}; + /** * enum iommu_hw_info_type - IOMMU Hardware Info Types * @IOMMU_HW_INFO_TYPE_NONE: Used by the drivers that do not report hardware * info * @IOMMU_HW_INFO_TYPE_INTEL_VTD: Intel VT-d iommu info type + * @IOMMU_HW_INFO_TYPE_ARM_SMMUV3: ARM SMMUv3 iommu info type */ enum iommu_hw_info_type { - IOMMU_HW_INFO_TYPE_NONE, - IOMMU_HW_INFO_TYPE_INTEL_VTD, + IOMMU_HW_INFO_TYPE_NONE = 0, + IOMMU_HW_INFO_TYPE_INTEL_VTD = 1, + IOMMU_HW_INFO_TYPE_ARM_SMMUV3 = 2, +}; + +/** + * enum iommufd_hw_capabilities + * @IOMMU_HW_CAP_DIRTY_TRACKING: IOMMU hardware support for dirty tracking + * If available, it means the following APIs + * are supported: + * + * IOMMU_HWPT_GET_DIRTY_BITMAP + * IOMMU_HWPT_SET_DIRTY_TRACKING + * + * @IOMMU_HW_CAP_PCI_PASID_EXEC: Execute Permission Supported, user ignores it + * when the struct iommu_hw_info::out_max_pasid_log2 + * is zero. + * @IOMMU_HW_CAP_PCI_PASID_PRIV: Privileged Mode Supported, user ignores it + * when the struct iommu_hw_info::out_max_pasid_log2 + * is zero. + */ +enum iommufd_hw_capabilities { + IOMMU_HW_CAP_DIRTY_TRACKING = 1 << 0, + IOMMU_HW_CAP_PCI_PASID_EXEC = 1 << 1, + IOMMU_HW_CAP_PCI_PASID_PRIV = 1 << 2, }; /** @@ -415,6 +589,11 @@ enum iommu_hw_info_type { * the iommu type specific hardware information data * @out_data_type: Output the iommu hardware info type as defined in the enum * iommu_hw_info_type. + * @out_capabilities: Output the generic iommu capability info type as defined + * in the enum iommufd_hw_capabilities. + * @out_max_pasid_log2: Output the width of PASIDs. 0 means no PASID support. + * PCI devices turn to out_capabilities to check if the + * specific capability is supported or not.
* @__reserved: Must be 0 * * Query an iommu type specific hardware information data from an iommu behind @@ -438,7 +617,346 @@ struct iommu_hw_info { __u32 data_len; __aligned_u64 data_uptr; __u32 out_data_type; - __u32 __reserved; + __u8 out_max_pasid_log2; + __u8 __reserved[3]; + __aligned_u64 out_capabilities; }; #define IOMMU_GET_HW_INFO _IO(IOMMUFD_TYPE, IOMMUFD_CMD_GET_HW_INFO) + +/* + * enum iommufd_hwpt_set_dirty_tracking_flags - Flags for steering dirty + * tracking + * @IOMMU_HWPT_DIRTY_TRACKING_ENABLE: Enable dirty tracking + */ +enum iommufd_hwpt_set_dirty_tracking_flags { + IOMMU_HWPT_DIRTY_TRACKING_ENABLE = 1, +}; + +/** + * struct iommu_hwpt_set_dirty_tracking - ioctl(IOMMU_HWPT_SET_DIRTY_TRACKING) + * @size: sizeof(struct iommu_hwpt_set_dirty_tracking) + * @flags: Combination of enum iommufd_hwpt_set_dirty_tracking_flags + * @hwpt_id: HW pagetable ID that represents the IOMMU domain + * @__reserved: Must be 0 + * + * Toggle dirty tracking on an HW pagetable. + */ +struct iommu_hwpt_set_dirty_tracking { + __u32 size; + __u32 flags; + __u32 hwpt_id; + __u32 __reserved; +}; +#define IOMMU_HWPT_SET_DIRTY_TRACKING _IO(IOMMUFD_TYPE, \ + IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING) + +/** + * enum iommufd_hwpt_get_dirty_bitmap_flags - Flags for getting dirty bits + * @IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR: Just read the PTEs without clearing + * any dirty bits metadata. This flag + * can be passed in the expectation + * where the next operation is an unmap + * of the same IOVA range. + * + */ +enum iommufd_hwpt_get_dirty_bitmap_flags { + IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR = 1, +}; + +/** + * struct iommu_hwpt_get_dirty_bitmap - ioctl(IOMMU_HWPT_GET_DIRTY_BITMAP) + * @size: sizeof(struct iommu_hwpt_get_dirty_bitmap) + * @hwpt_id: HW pagetable ID that represents the IOMMU domain + * @flags: Combination of enum iommufd_hwpt_get_dirty_bitmap_flags + * @__reserved: Must be 0 + * @iova: base IOVA of the bitmap first bit + * @length: IOVA range size + * @page_size: page size granularity of each bit in the bitmap + * @data: bitmap where to set the dirty bits. The bitmap bits each + * represent a page_size which you deviate from an arbitrary iova. + * + * Checking a given IOVA is dirty: + * + * data[(iova / page_size) / 64] & (1ULL << ((iova / page_size) % 64)) + * + * Walk the IOMMU pagetables for a given IOVA range to return a bitmap + * with the dirty IOVAs. In doing so it will also by default clear any + * dirty bit metadata set in the IOPTE. + */ +struct iommu_hwpt_get_dirty_bitmap { + __u32 size; + __u32 hwpt_id; + __u32 flags; + __u32 __reserved; + __aligned_u64 iova; + __aligned_u64 length; + __aligned_u64 page_size; + __aligned_u64 data; +}; +#define IOMMU_HWPT_GET_DIRTY_BITMAP _IO(IOMMUFD_TYPE, \ + IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP) + +/** + * enum iommu_hwpt_invalidate_data_type - IOMMU HWPT Cache Invalidation + * Data Type + * @IOMMU_HWPT_INVALIDATE_DATA_VTD_S1: Invalidation data for VTD_S1 + * @IOMMU_VIOMMU_INVALIDATE_DATA_ARM_SMMUV3: Invalidation data for ARM SMMUv3 + */ +enum iommu_hwpt_invalidate_data_type { + IOMMU_HWPT_INVALIDATE_DATA_VTD_S1 = 0, + IOMMU_VIOMMU_INVALIDATE_DATA_ARM_SMMUV3 = 1, +}; + +/** + * enum iommu_hwpt_vtd_s1_invalidate_flags - Flags for Intel VT-d + * stage-1 cache invalidation + * @IOMMU_VTD_INV_FLAGS_LEAF: Indicates whether the invalidation applies + * to all-levels page structure cache or just + * the leaf PTE cache. 
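+ * + * A hedged usage sketch (the values are illustrative, not from the kernel docs): after changing only leaf stage-1 PTEs, userspace could queue + * + * struct iommu_hwpt_vtd_s1_invalidate inv = { + * .addr = 0x100000, + * .npages = 16, + * .flags = IOMMU_VTD_INV_FLAGS_LEAF, + * }; + * + * as one entry of the IOMMU_HWPT_INVALIDATE request array described further below, avoiding a full page-structure cache flush.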
+ */ +enum iommu_hwpt_vtd_s1_invalidate_flags { + IOMMU_VTD_INV_FLAGS_LEAF = 1 << 0, +}; + +/** + * struct iommu_hwpt_vtd_s1_invalidate - Intel VT-d cache invalidation + * (IOMMU_HWPT_INVALIDATE_DATA_VTD_S1) + * @addr: The start address of the range to be invalidated. It needs to + * be 4KB aligned. + * @npages: Number of contiguous 4K pages to be invalidated. + * @flags: Combination of enum iommu_hwpt_vtd_s1_invalidate_flags + * @__reserved: Must be 0 + * + * The Intel VT-d specific invalidation data for user-managed stage-1 cache + * invalidation in nested translation. Userspace uses this structure to + * tell the impacted cache scope after modifying the stage-1 page table. + * + * Invalidating all the caches related to the page table by setting @addr + * to be 0 and @npages to be U64_MAX. + * + * The device TLB will be invalidated automatically if ATS is enabled. + */ +struct iommu_hwpt_vtd_s1_invalidate { + __aligned_u64 addr; + __aligned_u64 npages; + __u32 flags; + __u32 __reserved; +}; + +/** + * struct iommu_viommu_arm_smmuv3_invalidate - ARM SMMUv3 cache invalidation + * (IOMMU_VIOMMU_INVALIDATE_DATA_ARM_SMMUV3) + * @cmd: 128-bit cache invalidation command that runs in SMMU CMDQ. + * Must be little-endian. + * + * Supported command list only when passing in a vIOMMU via @hwpt_id: + * CMDQ_OP_TLBI_NSNH_ALL + * CMDQ_OP_TLBI_NH_VA + * CMDQ_OP_TLBI_NH_VAA + * CMDQ_OP_TLBI_NH_ALL + * CMDQ_OP_TLBI_NH_ASID + * CMDQ_OP_ATC_INV + * CMDQ_OP_CFGI_CD + * CMDQ_OP_CFGI_CD_ALL + * + * -EIO will be returned if the command is not supported. + */ +struct iommu_viommu_arm_smmuv3_invalidate { + __aligned_le64 cmd[2]; +}; + +/** + * struct iommu_hwpt_invalidate - ioctl(IOMMU_HWPT_INVALIDATE) + * @size: sizeof(struct iommu_hwpt_invalidate) + * @hwpt_id: ID of a nested HWPT or a vIOMMU, for cache invalidation + * @data_uptr: User pointer to an array of driver-specific cache invalidation + * data. + * @data_type: One of enum iommu_hwpt_invalidate_data_type, defining the data + * type of all the entries in the invalidation request array. It + * should be a type supported by the hwpt pointed by @hwpt_id. + * @entry_len: Length (in bytes) of a request entry in the request array + * @entry_num: Input the number of cache invalidation requests in the array. + * Output the number of requests successfully handled by kernel. + * @__reserved: Must be 0. + * + * Invalidate iommu cache for user-managed page table or vIOMMU. Modifications + * on a user-managed page table should be followed by this operation, if a HWPT + * is passed in via @hwpt_id. Other caches, such as device cache or descriptor + * cache can be flushed if a vIOMMU is passed in via the @hwpt_id field. + * + * Each ioctl can support one or more cache invalidation requests in the array + * that has a total size of @entry_len * @entry_num. + * + * An empty invalidation request array by setting @entry_num==0 is allowed, and + * @entry_len and @data_uptr would be ignored in this case. This can be used to + * check if the given @data_type is supported or not by kernel. + */ +struct iommu_hwpt_invalidate { + __u32 size; + __u32 hwpt_id; + __aligned_u64 data_uptr; + __u32 data_type; + __u32 entry_len; + __u32 entry_num; + __u32 __reserved; +}; +#define IOMMU_HWPT_INVALIDATE _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_INVALIDATE) + +/** + * enum iommu_hwpt_pgfault_flags - flags for struct iommu_hwpt_pgfault + * @IOMMU_PGFAULT_FLAGS_PASID_VALID: The pasid field of the fault data is + * valid. + * @IOMMU_PGFAULT_FLAGS_LAST_PAGE: It's the last fault of a fault group.
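+ * + * To sketch the intended flow (inferred from the fault queue objects below, not a normative description): userspace reads fault records from the fd returned in struct iommu_fault_alloc::out_fault_fd, resolves them, and writes back a response echoing the cookie, e.g. + * + * struct iommu_hwpt_pgfault fault; + * struct iommu_hwpt_page_response resp; + * read(fault_fd, &fault, sizeof(fault)); + * ... populate the page tables for fault.addr ... + * resp.cookie = fault.cookie; + * resp.code = IOMMUFD_PAGE_RESP_SUCCESS; + * write(fault_fd, &resp, sizeof(resp));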
+ */ +enum iommu_hwpt_pgfault_flags { + IOMMU_PGFAULT_FLAGS_PASID_VALID = (1 << 0), + IOMMU_PGFAULT_FLAGS_LAST_PAGE = (1 << 1), +}; + +/** + * enum iommu_hwpt_pgfault_perm - perm bits for struct iommu_hwpt_pgfault + * @IOMMU_PGFAULT_PERM_READ: request for read permission + * @IOMMU_PGFAULT_PERM_WRITE: request for write permission + * @IOMMU_PGFAULT_PERM_EXEC: (PCIE 10.4.1) request with a PASID that has the + * Execute Requested bit set in PASID TLP Prefix. + * @IOMMU_PGFAULT_PERM_PRIV: (PCIE 10.4.1) request with a PASID that has the + * Privileged Mode Requested bit set in PASID TLP + * Prefix. + */ +enum iommu_hwpt_pgfault_perm { + IOMMU_PGFAULT_PERM_READ = (1 << 0), + IOMMU_PGFAULT_PERM_WRITE = (1 << 1), + IOMMU_PGFAULT_PERM_EXEC = (1 << 2), + IOMMU_PGFAULT_PERM_PRIV = (1 << 3), +}; + +/** + * struct iommu_hwpt_pgfault - iommu page fault data + * @flags: Combination of enum iommu_hwpt_pgfault_flags + * @dev_id: id of the originating device + * @pasid: Process Address Space ID + * @grpid: Page Request Group Index + * @perm: Combination of enum iommu_hwpt_pgfault_perm + * @addr: Fault address + * @length: a hint of how much data the requestor is expecting to fetch. For + * example, if the PRI initiator knows it is going to do a 10MB + * transfer, it could fill in 10MB and the OS could pre-fault in + * 10MB of IOVA. It defaults to 0 if there's no such hint. + * @cookie: kernel-managed cookie identifying a group of fault messages. The + * cookie number encoded in the last page fault of the group should + * be echoed back in the response message. + */ +struct iommu_hwpt_pgfault { + __u32 flags; + __u32 dev_id; + __u32 pasid; + __u32 grpid; + __u32 perm; + __u64 addr; + __u32 length; + __u32 cookie; +}; + +/** + * enum iommufd_page_response_code - Return status of fault handlers + * @IOMMUFD_PAGE_RESP_SUCCESS: Fault has been handled and the page tables + * populated, retry the access. This is the + * "Success" defined in PCI 10.4.2.1. + * @IOMMUFD_PAGE_RESP_INVALID: Could not handle this fault, don't retry the + * access. This is the "Invalid Request" in PCI + * 10.4.2.1. + */ +enum iommufd_page_response_code { + IOMMUFD_PAGE_RESP_SUCCESS = 0, + IOMMUFD_PAGE_RESP_INVALID = 1, +}; + +/** + * struct iommu_hwpt_page_response - IOMMU page fault response + * @cookie: The kernel-managed cookie reported in the fault message. + * @code: One of the response codes in enum iommufd_page_response_code. + */ +struct iommu_hwpt_page_response { + __u32 cookie; + __u32 code; +}; + +/** + * struct iommu_fault_alloc - ioctl(IOMMU_FAULT_QUEUE_ALLOC) + * @size: sizeof(struct iommu_fault_alloc) + * @flags: Must be 0 + * @out_fault_id: The ID of the new FAULT + * @out_fault_fd: The fd of the new FAULT + * + * Explicitly allocate a fault handling object. + */ +struct iommu_fault_alloc { + __u32 size; + __u32 flags; + __u32 out_fault_id; + __u32 out_fault_fd; +}; +#define IOMMU_FAULT_QUEUE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_FAULT_QUEUE_ALLOC) + +/** + * enum iommu_viommu_type - Virtual IOMMU Type + * @IOMMU_VIOMMU_TYPE_DEFAULT: Reserved for future use + * @IOMMU_VIOMMU_TYPE_ARM_SMMUV3: ARM SMMUv3 driver specific type + */ +enum iommu_viommu_type { + IOMMU_VIOMMU_TYPE_DEFAULT = 0, + IOMMU_VIOMMU_TYPE_ARM_SMMUV3 = 1, +}; + +/** + * struct iommu_viommu_alloc - ioctl(IOMMU_VIOMMU_ALLOC) + * @size: sizeof(struct iommu_viommu_alloc) + * @flags: Must be 0 + * @type: Type of the virtual IOMMU.
Must be defined in enum iommu_viommu_type + * @dev_id: The device's physical IOMMU will be used to back the virtual IOMMU + * @hwpt_id: ID of a nesting parent HWPT to associate to + * @out_viommu_id: Output virtual IOMMU ID for the allocated object + * + * Allocate a virtual IOMMU object, representing the underlying physical IOMMU's + * virtualization support that is a security-isolated slice of the real IOMMU HW + * that is unique to a specific VM. Operations global to the IOMMU are connected + * to the vIOMMU, such as: + * - Security namespace for guest owned ID, e.g. guest-controlled cache tags + * - Non-device-affiliated event reporting, e.g. invalidation queue errors + * - Access to a sharable nesting parent pagetable across physical IOMMUs + * - Virtualization of various platform IDs, e.g. RIDs and others + * - Delivery of paravirtualized invalidation + * - Direct assigned invalidation queues + * - Direct assigned interrupts + */ +struct iommu_viommu_alloc { + __u32 size; + __u32 flags; + __u32 type; + __u32 dev_id; + __u32 hwpt_id; + __u32 out_viommu_id; +}; +#define IOMMU_VIOMMU_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VIOMMU_ALLOC) + +/** + * struct iommu_vdevice_alloc - ioctl(IOMMU_VDEVICE_ALLOC) + * @size: sizeof(struct iommu_vdevice_alloc) + * @viommu_id: vIOMMU ID to associate with the virtual device + * @dev_id: The physical device to allocate a virtual instance on the vIOMMU + * @out_vdevice_id: Object handle for the vDevice. Pass to IOMMU_DESTROY + * @virt_id: Virtual device ID per vIOMMU, e.g. vSID of ARM SMMUv3, vDeviceID + * of AMD IOMMU, and vRID of a nested Intel VT-d to a Context Table + * + * Allocate a virtual device instance (for a physical device) against a vIOMMU. + * This instance holds the device's information (related to its vIOMMU) in a VM. + */ +struct iommu_vdevice_alloc { + __u32 size; + __u32 viommu_id; + __u32 dev_id; + __u32 out_vdevice_id; + __aligned_u64 virt_id; +}; +#define IOMMU_VDEVICE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VDEVICE_ALLOC) #endif diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h index 0d74ee999aa9e1fbd4e62ee6ef28e759e34b5adc..d3bf7fac004bde381bf0ed20947b5a8ffa9dc8b0 100644 --- a/linux-headers/linux/kvm.h +++ b/linux-headers/linux/kvm.h @@ -14,6 +14,8 @@ #include #include +#include "sysemu/numa.h" + #define KVM_API_VERSION 12 /* *** Deprecated interfaces *** */ @@ -264,6 +266,7 @@ struct kvm_xen_exit { #define KVM_EXIT_RISCV_SBI 35 #define KVM_EXIT_RISCV_CSR 36 #define KVM_EXIT_NOTIFY 37 +#define KVM_EXIT_LOONGARCH_IOCSR 38 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed.
*/ @@ -336,8 +339,16 @@ struct kvm_run { __u32 len; __u8 is_write; } mmio; + /* KVM_EXIT_LOONGARCH_IOCSR */ + struct { + __u64 phys_addr; + __u8 data[8]; + __u32 len; + __u8 is_write; + } iocsr_io; /* KVM_EXIT_HYPERCALL */ struct { +#define KVM_HC_MAP_GPA_RANGE 12 __u64 nr; __u64 args[6]; __u64 ret; @@ -808,6 +819,8 @@ enum { kvm_ioeventfd_flag_nr_deassign, kvm_ioeventfd_flag_nr_virtio_ccw_notify, kvm_ioeventfd_flag_nr_fast_mmio, + kvm_ioeventfd_flag_nr_batch_begin, + kvm_ioeventfd_flag_nr_batch_end, kvm_ioeventfd_flag_nr_max, }; @@ -816,6 +829,10 @@ enum { #define KVM_IOEVENTFD_FLAG_DEASSIGN (1 << kvm_ioeventfd_flag_nr_deassign) #define KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY \ (1 << kvm_ioeventfd_flag_nr_virtio_ccw_notify) +#define KVM_IOEVENTFD_FLAG_BATCH_BEGIN \ + (1<< kvm_ioeventfd_flag_nr_batch_begin) +#define KVM_IOEVENTFD_FLAG_BATCH_END \ + (1 << kvm_ioeventfd_flag_nr_batch_end) #define KVM_IOEVENTFD_VALID_FLAG_MASK ((1 << kvm_ioeventfd_flag_nr_max) - 1) @@ -1188,6 +1205,25 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_COUNTER_OFFSET 227 #define KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE 228 #define KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES 229 +#define KVM_CAP_ARM_SUPPORTED_REG_MASK_RANGES 230 + +#define KVM_CAP_ARM_TMM 300 + +#define KVM_CAP_SEV_ES_GHCB 500 +#define KVM_CAP_HYGON_COCO_EXT 501 +/* support userspace to request firmware to build CSV3 guest's memory space */ +#define KVM_CAP_HYGON_COCO_EXT_CSV3_SET_PRIV_MEM (1 << 0) +/* support request to update CSV3 guest's memory region multiple times */ +#define KVM_CAP_HYGON_COCO_EXT_CSV3_MULT_LUP_DATA (1 << 1) +/* support request to inject secret to CSV3 guest */ +#define KVM_CAP_HYGON_COCO_EXT_CSV3_INJ_SECRET (1 << 2) + +#define KVM_CAP_ARM_HW_DIRTY_STATE_TRACK 502 + +#define KVM_CAP_ARM_HISI_IPIV 798 +#define KVM_CAP_ARM_VIRT_MSI_BYPASS 799 + +#define KVM_EXIT_HYPERCALL_VALID_MASK (1 << KVM_HC_MAP_GPA_RANGE) #ifdef KVM_CAP_IRQ_ROUTING @@ -1358,6 +1394,7 @@ struct kvm_dirty_tlb { #define KVM_REG_ARM64 0x6000000000000000ULL #define KVM_REG_MIPS 0x7000000000000000ULL #define KVM_REG_RISCV 0x8000000000000000ULL +#define KVM_REG_LOONGARCH 0x9000000000000000ULL #define KVM_REG_SIZE_SHIFT 52 #define KVM_REG_SIZE_MASK 0x00f0000000000000ULL @@ -1449,6 +1486,13 @@ enum kvm_device_type { #define KVM_DEV_TYPE_ARM_PV_TIME KVM_DEV_TYPE_ARM_PV_TIME KVM_DEV_TYPE_RISCV_AIA, #define KVM_DEV_TYPE_RISCV_AIA KVM_DEV_TYPE_RISCV_AIA + KVM_DEV_TYPE_LOONGARCH_IPI, +#define KVM_DEV_TYPE_LOONGARCH_IPI KVM_DEV_TYPE_LOONGARCH_IPI + KVM_DEV_TYPE_LOONGARCH_EIOINTC, +#define KVM_DEV_TYPE_LOONGARCH_EIOINTC KVM_DEV_TYPE_LOONGARCH_EIOINTC + KVM_DEV_TYPE_LOONGARCH_PCHPIC, +#define KVM_DEV_TYPE_LOONGARCH_PCHPIC KVM_DEV_TYPE_LOONGARCH_PCHPIC + KVM_DEV_TYPE_MAX, }; @@ -1457,6 +1501,38 @@ struct kvm_vfio_spapr_tce { __s32 tablefd; }; +#define MAX_NUMA_NODE 8 +#define MAX_CPU_BIT_MAP 4 +#define MAX_NODE_BIT_MAP (MAX_NODES / BITS_PER_LONG) + +struct kvm_numa_node { + __u64 numa_id; + __u64 ipa_start; + __u64 ipa_size; + __u64 host_numa_nodes[MAX_NODE_BIT_MAP]; + __u64 cpu_id[MAX_CPU_BIT_MAP]; +}; + +struct kvm_numa_info { + __u64 numa_cnt; + struct kvm_numa_node numa_nodes[MAX_NUMA_NODE]; +}; + +struct kvm_user_data { + __u64 loader_start; + /* + * When the lowest bit of dtb_info is 0, the value of dtb_info represents the size of the DTB, + * and data_start and data_size represent the address base and size of the MMIO. + * When the lowest bit of dtb_info is 1, data_start and data_size represent the address base + * and size of the DTB. 
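+ * + * For example (an illustrative reading of the rule above, not an authoritative one): to hand over a DTB at 0x44000000 of size 0x10000, set dtb_info = 1 (lowest bit set), data_start = 0x44000000 and data_size = 0x10000; with the lowest bit clear, dtb_info instead carries the DTB size and data_start/data_size describe the MMIO window.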
+ */ + __u64 dtb_info; + __u64 data_start; + __u64 data_size; + __u64 ram_size; + struct kvm_numa_info numa_info; +}; + /* * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns * a vcpu fd. @@ -1469,7 +1545,7 @@ struct kvm_vfio_spapr_tce { struct kvm_userspace_memory_region) #define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47) #define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO, 0x48, __u64) - +#define KVM_LOAD_USER_DATA _IOW(KVMIO, 0x49, struct kvm_user_data) /* enable ucontrol for s390 */ struct kvm_s390_ucas_mapping { __u64 user_addr; @@ -1514,6 +1590,17 @@ struct kvm_s390_ucas_mapping { #define KVM_XEN_HVM_CONFIG _IOW(KVMIO, 0x7a, struct kvm_xen_hvm_config) #define KVM_SET_CLOCK _IOW(KVMIO, 0x7b, struct kvm_clock_data) #define KVM_GET_CLOCK _IOR(KVMIO, 0x7c, struct kvm_clock_data) + +#ifdef __aarch64__ +struct kvm_master_dev_info +{ + __u32 nvectors; /* number of msi vectors */ + struct kvm_msi msi[0]; +}; +#define KVM_CREATE_SHADOW_DEV _IOW(KVMIO, 0xf0, struct kvm_master_dev_info) +#define KVM_DEL_SHADOW_DEV _IOW(KVMIO, 0xf1, __u32) +#endif + /* Available with KVM_CAP_PIT_STATE2 */ #define KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct kvm_pit_state2) #define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, struct kvm_pit_state2) @@ -1558,6 +1645,7 @@ struct kvm_s390_ucas_mapping { #define KVM_ARM_MTE_COPY_TAGS _IOR(KVMIO, 0xb4, struct kvm_arm_copy_mte_tags) /* Available with KVM_CAP_COUNTER_OFFSET */ #define KVM_ARM_SET_COUNTER_OFFSET _IOW(KVMIO, 0xb5, struct kvm_arm_counter_offset) +#define KVM_ARM_GET_REG_WRITABLE_MASKS _IOR(KVMIO, 0xb6, struct reg_mask_range) /* ioctl for vm fd */ #define KVM_CREATE_DEVICE _IOWR(KVMIO, 0xe0, struct kvm_create_device) @@ -1567,6 +1655,10 @@ struct kvm_s390_ucas_mapping { #define KVM_GET_DEVICE_ATTR _IOW(KVMIO, 0xe2, struct kvm_device_attr) #define KVM_HAS_DEVICE_ATTR _IOW(KVMIO, 0xe3, struct kvm_device_attr) +/* ioctls for control vcpu setup during system reset */ +#define KVM_CONTROL_VCPU_PRE_SYSTEM_RESET _IO(KVMIO, 0xe8) +#define KVM_CONTROL_VCPU_POST_SYSTEM_RESET _IO(KVMIO, 0xe9) + /* * ioctls for vcpu fds */ @@ -1914,6 +2006,9 @@ enum sev_cmd_id { /* Guest Migration Extension */ KVM_SEV_SEND_CANCEL, + /* Hygon CSV batch command */ + KVM_CSV_COMMAND_BATCH = 0x18, + KVM_SEV_NR_MAX, }; @@ -1992,6 +2087,14 @@ struct kvm_sev_send_update_data { __u32 trans_len; }; +struct kvm_sev_send_update_vmsa { + __u32 vcpu_id; + __u64 hdr_uaddr; + __u32 hdr_len; + __u64 trans_uaddr; + __u32 trans_len; +}; + struct kvm_sev_receive_start { __u32 handle; __u32 policy; @@ -2010,6 +2113,98 @@ struct kvm_sev_receive_update_data { __u32 trans_len; }; +struct kvm_sev_receive_update_vmsa { + __u32 vcpu_id; + __u64 hdr_uaddr; + __u32 hdr_len; + __u64 trans_uaddr; + __u32 trans_len; +}; + +struct kvm_csv_batch_list_node { + __u64 cmd_data_addr; + __u64 addr; + __u64 next_cmd_addr; +}; + +struct kvm_csv_command_batch { + __u32 command_id; + __u64 csv_batch_list_uaddr; +}; + +struct kvm_csv_init { + __u64 userid_addr; + __u32 len; +}; + +/* CSV3 command */ +enum csv3_cmd_id { + KVM_CSV3_NR_MIN = 0xc0, + + KVM_CSV3_INIT = KVM_CSV3_NR_MIN, + KVM_CSV3_LAUNCH_ENCRYPT_DATA, + KVM_CSV3_LAUNCH_ENCRYPT_VMCB, + KVM_CSV3_SEND_ENCRYPT_DATA, + KVM_CSV3_SEND_ENCRYPT_CONTEXT, + KVM_CSV3_RECEIVE_ENCRYPT_DATA, + KVM_CSV3_RECEIVE_ENCRYPT_CONTEXT, + KVM_CSV3_HANDLE_MEMORY, + + KVM_CSV3_SET_GUEST_PRIVATE_MEMORY = 0xc8, + + KVM_CSV3_NR_MAX, +}; + +struct kvm_csv3_launch_encrypt_data { + __u64 gpa; + __u64 uaddr; + __u32 len; +}; + +struct kvm_csv3_init_data { + __u64 nodemask; +}; + +struct kvm_csv3_send_encrypt_data { + 
__u64 hdr_uaddr; + __u32 hdr_len; + __u64 guest_addr_data; + __u32 guest_addr_len; + __u64 trans_uaddr; + __u32 trans_len; +}; + +struct kvm_csv3_send_encrypt_context { + __u64 hdr_uaddr; + __u32 hdr_len; + __u64 trans_uaddr; + __u32 trans_len; +}; + +struct kvm_csv3_receive_encrypt_data { + __u64 hdr_uaddr; + __u32 hdr_len; + __u64 guest_addr_data; + __u32 guest_addr_len; + __u64 trans_uaddr; + __u32 trans_len; +}; + +struct kvm_csv3_receive_encrypt_context { + __u64 hdr_uaddr; + __u32 hdr_len; + __u64 trans_uaddr; + __u32 trans_len; +}; + +#define KVM_CSV3_RELEASE_SHARED_MEMORY (0x0001) + +struct kvm_csv3_handle_memory { + __u64 gpa; + __u32 num_pages; + __u32 opcode; +}; + #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) #define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) #define KVM_DEV_ASSIGN_MASK_INTX (1 << 2) @@ -2252,4 +2447,8 @@ struct kvm_s390_zpci_op { /* flags for kvm_s390_zpci_op->u.reg_aen.flags */ #define KVM_S390_ZPCIOP_REGAEN_HOST (1 << 0) +/* get tmi version */ +#define KVM_GET_TMI_VERSION _IOR(KVMIO, 0xd2, uint64_t) +#define MIN_TMI_VERSION_FOR_UEFI_BOOTED_CVM 0x20001 + #endif /* __LINUX_KVM_H */ diff --git a/linux-headers/linux/psp-sev.h b/linux-headers/linux/psp-sev.h index 12ccb70099d417a8870fe38a73dd762a8ab3855c..bcb21339ee39fae6739e70c27e98b2d336dcc253 100644 --- a/linux-headers/linux/psp-sev.h +++ b/linux-headers/linux/psp-sev.h @@ -68,6 +68,7 @@ typedef enum { SEV_RET_INVALID_PARAM, SEV_RET_RESOURCE_LIMIT, SEV_RET_SECURE_DATA_INVALID, + SEV_RET_INVALID_KEY = 0x27, SEV_RET_MAX, } sev_ret_code; diff --git a/linux-headers/linux/stddef.h b/linux-headers/linux/stddef.h index 9bb07083ac89e7ea6d51903e19804cd403435978..bf9749dd1422607dd57e1f64b8a39f0b32ab0ede 100644 --- a/linux-headers/linux/stddef.h +++ b/linux-headers/linux/stddef.h @@ -27,8 +27,13 @@ union { \ struct { MEMBERS } ATTRS; \ struct TAG { MEMBERS } ATTRS NAME; \ - } + } ATTRS +#ifdef __cplusplus +/* sizeof(struct{}) is 1 in C++, not 0, can't use C version of the macro. */ +#define __DECLARE_FLEX_ARRAY(T, member) \ + T member[0] +#else /** * __DECLARE_FLEX_ARRAY() - Declare a flexible array usable in a union * @@ -49,3 +54,5 @@ #ifndef __counted_by #define __counted_by(m) #endif + +#endif /* _LINUX_STDDEF_H */ diff --git a/linux-headers/linux/userfaultfd.h b/linux-headers/linux/userfaultfd.h index 59978fbaae3379174fe21a00ce668b2db5eba302..953c75fedae9eac851fed94cac653c29aa493148 100644 --- a/linux-headers/linux/userfaultfd.h +++ b/linux-headers/linux/userfaultfd.h @@ -40,7 +40,8 @@ UFFD_FEATURE_EXACT_ADDRESS | \ UFFD_FEATURE_WP_HUGETLBFS_SHMEM | \ UFFD_FEATURE_WP_UNPOPULATED | \ - UFFD_FEATURE_POISON) + UFFD_FEATURE_POISON | \ + UFFD_FEATURE_WP_ASYNC) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -216,6 +217,11 @@ struct uffdio_api { * (i.e. empty ptes). This will be the default behavior for shmem * & hugetlbfs, so this flag only affects anonymous memory behavior * when userfault write-protection mode is registered. + * + * UFFD_FEATURE_WP_ASYNC indicates that userfaultfd write-protection + * asynchronous mode is supported in which the write fault is + * automatically resolved and write-protection is un-set. + * It implies UFFD_FEATURE_WP_UNPOPULATED. 
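+ * + * A minimal negotiation sketch (illustrative; error handling omitted, uffd is an already created userfaultfd): + * + * struct uffdio_api api = { + * .api = UFFD_API, + * .features = UFFD_FEATURE_WP_ASYNC, + * }; + * ioctl(uffd, UFFDIO_API, &api); + * + * after which write-protect faults on registered ranges are resolved by the kernel itself rather than reported to the fault handler.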
*/ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) @@ -232,6 +238,7 @@ struct uffdio_api { #define UFFD_FEATURE_WP_HUGETLBFS_SHMEM (1<<12) #define UFFD_FEATURE_WP_UNPOPULATED (1<<13) #define UFFD_FEATURE_POISON (1<<14) +#define UFFD_FEATURE_WP_ASYNC (1<<15) __u64 features; __u64 ioctls; diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h index acf72b4999fa75dfcd4c49426c9fdbc135251248..5b1e2871affae86f15b71e3fb84f73ba0015346e 100644 --- a/linux-headers/linux/vfio.h +++ b/linux-headers/linux/vfio.h @@ -25,6 +25,7 @@ #define VFIO_TYPE1_IOMMU 1 #define VFIO_SPAPR_TCE_IOMMU 2 #define VFIO_TYPE1v2_IOMMU 3 +#define VFIO_TYPE1v2_S_IOMMU 12 /* * IOMMU enforces DMA cache coherence (ex. PCIe NoSnoop stripping). This * capability is subject to change as groups are added or removed. @@ -56,6 +57,16 @@ */ #define VFIO_UPDATE_VADDR 10 +/* + * The vfio_iommu driver may allow the user to clear the dirty log manually, + * which means the dirty log can be requested not to be cleared automatically + * after it is copied to userspace; it is then the user's duty to clear the + * dirty log. + * + * Note: please refer to VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR and + * VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP. + */ +#define VFIO_DIRTY_LOG_MANUAL_CLEAR 11 + /* * The IOCTL interface is designed for extensibility by embedding the * structure length (argsz) and flags into structures passed between @@ -214,6 +225,7 @@ struct vfio_device_info { #define VFIO_DEVICE_FLAGS_FSL_MC (1 << 6) /* vfio-fsl-mc device */ #define VFIO_DEVICE_FLAGS_CAPS (1 << 7) /* Info supports caps */ #define VFIO_DEVICE_FLAGS_CDX (1 << 8) /* vfio-cdx device */ +#define VFIO_DEVICE_FLAGS_SECURE (1 << 15) /* secure pci device */ __u32 num_regions; /* Max region index + 1 */ __u32 num_irqs; /* Max IRQ index + 1 */ __u32 cap_offset; /* Offset within info struct of first cap */ @@ -277,8 +289,8 @@ struct vfio_region_info { #define VFIO_REGION_INFO_FLAG_CAPS (1 << 3) /* Info supports caps */ __u32 index; /* Region index */ __u32 cap_offset; /* Offset within info struct of first cap */ - __u64 size; /* Region size (bytes) */ - __u64 offset; /* Region offset from start of device fd */ + __aligned_u64 size; /* Region size (bytes) */ + __aligned_u64 offset; /* Region offset from start of device fd */ }; #define VFIO_DEVICE_GET_REGION_INFO _IO(VFIO_TYPE, VFIO_BASE + 8) @@ -294,8 +306,8 @@ struct vfio_region_info { #define VFIO_REGION_INFO_CAP_SPARSE_MMAP 1 struct vfio_region_sparse_mmap_area { - __u64 offset; /* Offset of mmap'able area within region */ - __u64 size; /* Size of mmap'able area */ + __aligned_u64 offset; /* Offset of mmap'able area within region */ + __aligned_u64 size; /* Size of mmap'able area */ }; struct vfio_region_info_cap_sparse_mmap { @@ -450,9 +462,9 @@ struct vfio_device_migration_info { VFIO_DEVICE_STATE_V1_RESUMING) __u32 reserved; - __u64 pending_bytes; - __u64 data_offset; - __u64 data_size; + __aligned_u64 pending_bytes; + __aligned_u64 data_offset; + __aligned_u64 data_size; }; /* @@ -476,7 +488,7 @@ struct vfio_device_migration_info { struct vfio_region_info_cap_nvlink2_ssatgt { struct vfio_info_cap_header header; - __u64 tgt; + __aligned_u64 tgt; }; /* @@ -816,7 +828,7 @@ struct vfio_device_gfx_plane_info { __u32 drm_plane_type; /* type of plane: DRM_PLANE_TYPE_* */ /* out */ __u32 drm_format; /* drm format of plane */ - __u64 drm_format_mod; /* tiled mode */ + __aligned_u64 drm_format_mod; /* tiled mode */ __u32 width; /* width of plane */ __u32 height; /* height of plane */ __u32 stride;
/* stride of plane */ @@ -829,6 +841,7 @@ struct vfio_device_gfx_plane_info { __u32 region_index; /* region index */ __u32 dmabuf_id; /* dma-buf id */ }; + __u32 reserved; }; #define VFIO_DEVICE_QUERY_GFX_PLANE _IO(VFIO_TYPE, VFIO_BASE + 14) @@ -863,9 +876,10 @@ struct vfio_device_ioeventfd { #define VFIO_DEVICE_IOEVENTFD_32 (1 << 2) /* 4-byte write */ #define VFIO_DEVICE_IOEVENTFD_64 (1 << 3) /* 8-byte write */ #define VFIO_DEVICE_IOEVENTFD_SIZE_MASK (0xf) - __u64 offset; /* device fd offset of write */ - __u64 data; /* data to be written */ + __aligned_u64 offset; /* device fd offset of write */ + __aligned_u64 data; /* data to be written */ __s32 fd; /* -1 for de-assignment */ + __u32 reserved; }; #define VFIO_DEVICE_IOEVENTFD _IO(VFIO_TYPE, VFIO_BASE + 16) @@ -1434,6 +1448,27 @@ struct vfio_device_feature_mig_data_size { #define VFIO_DEVICE_FEATURE_MIG_DATA_SIZE 9 +/** + * Upon VFIO_DEVICE_FEATURE_SET, set or clear the BUS mastering for the device + * based on the operation specified in op flag. + * + * The functionality is incorporated for devices that need bus master control, + * but the in-band device interface lacks the support. Consequently, it is not + * applicable to PCI devices, as bus master control for PCI devices is managed + * in-band through the configuration space. At present, this feature is supported + * only for CDX devices. + * When the device's BUS MASTER setting is configured as CLEAR, it will result in + * blocking all incoming DMA requests from the device. On the other hand, configuring + * the device's BUS MASTER setting as SET (enable) will grant the device the + * capability to perform DMA to the host memory. + */ +struct vfio_device_feature_bus_master { + __u32 op; +#define VFIO_DEVICE_FEATURE_CLEAR_MASTER 0 /* Clear Bus Master */ +#define VFIO_DEVICE_FEATURE_SET_MASTER 1 /* Set Bus Master */ +}; +#define VFIO_DEVICE_FEATURE_BUS_MASTER 10 + /* -------- API for Type1 VFIO IOMMU -------- */ /** @@ -1449,7 +1484,7 @@ struct vfio_iommu_type1_info { __u32 flags; #define VFIO_IOMMU_INFO_PGSIZES (1 << 0) /* supported page sizes info */ #define VFIO_IOMMU_INFO_CAPS (1 << 1) /* Info supports caps */ - __u64 iova_pgsizes; /* Bitmap of supported page sizes */ + __aligned_u64 iova_pgsizes; /* Bitmap of supported page sizes */ __u32 cap_offset; /* Offset within info struct of first cap */ __u32 pad; }; @@ -1628,8 +1663,30 @@ struct vfio_iommu_type1_dma_unmap { * actual bitmap. If dirty pages logging is not enabled, an error will be * returned. * - * Only one of the flags _START, _STOP and _GET may be specified at a time. + * The VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR flag is almost the same + * as VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP, except that the underlying dirty + * bitmap is not cleared automatically. The user can clear it manually by + * calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP flag set. * + * Calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP flag set, + * instructs the IOMMU driver to clear the dirty status of pages in a bitmap + * for IOMMU container for a given IOVA range. The user must specify the IOVA + * range, the bitmap and the pgsize through the structure + * vfio_iommu_type1_dirty_bitmap_get in the data[] portion. This interface + * supports clearing a bitmap of the smallest supported pgsize only and can be + * modified in future to clear a bitmap of any specified supported pgsize. The + * user must provide a memory area for the bitmap memory and specify its size + * in bitmap.size.
One bit is used to represent one page consecutively starting + * from iova offset. The user should provide page size in bitmap.pgsize field. + * A bit set in the bitmap indicates that the page at that offset from iova is + * cleared the dirty status, and dirty tracking is re-enabled for that page. The + * caller must set argsz to a value including the size of structure + * vfio_iommu_dirty_bitmap_get, but excluing the size of the actual bitmap. If + * dirty pages logging is not enabled, an error will be returned. Note: user + * should clear dirty log before handle corresponding dirty pages. + * + * Only one of the flags _START, _STOP, _GET, _GET_NOCLEAR_, and _CLEAR may be + * specified at a time. */ struct vfio_iommu_type1_dirty_bitmap { __u32 argsz; @@ -1637,6 +1694,8 @@ struct vfio_iommu_type1_dirty_bitmap { #define VFIO_IOMMU_DIRTY_PAGES_FLAG_START (1 << 0) #define VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP (1 << 1) #define VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP (1 << 2) +#define VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR (1 << 3) +#define VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP (1 << 4) __u8 data[]; }; diff --git a/linux-headers/linux/vhost.h b/linux-headers/linux/vhost.h index f5c48b61ab62244104bbf1b2100d3db7286f8c82..f5c05abe8b0a5bba7857318437c375a12014bda3 100644 --- a/linux-headers/linux/vhost.h +++ b/linux-headers/linux/vhost.h @@ -43,6 +43,10 @@ * The bit is set using an atomic 32 bit operation. */ /* Set base address for logging. */ #define VHOST_SET_LOG_BASE _IOW(VHOST_VIRTIO, 0x04, __u64) +/* Set buffer size for logging */ +#define VHOST_SET_LOG_SIZE _IOW(VHOST_VIRTIO, 0x05, __u64) +/* Logging sync */ +#define VHOST_LOG_SYNC _IO(VHOST_VIRTIO, 0x06) /* Specify an eventfd file descriptor to signal on log write. */ #define VHOST_SET_LOG_FD _IOW(VHOST_VIRTIO, 0x07, int) /* By default, a device gets one vhost_worker that its virtqueues share. This @@ -219,4 +223,49 @@ */ #define VHOST_VDPA_RESUME _IO(VHOST_VIRTIO, 0x7E) +/* Get the group for the descriptor table including driver & device areas + * of a virtqueue: read index, write group in num. + * The virtqueue index is stored in the index field of vhost_vring_state. + * The group ID of the descriptor table for this specific virtqueue + * is returned via num field of vhost_vring_state. + */ +#define VHOST_VDPA_GET_VRING_DESC_GROUP _IOWR(VHOST_VIRTIO, 0x7F, \ + struct vhost_vring_state) + +/* Bind a vDPA device to the specified iommufd + * + * After the return of this ioctl, the vDPA device is binded to the specified + * iommufd, and the device id is also returned. + */ +#define VHOST_VDPA_BIND_IOMMUFD _IO(VHOST_VIRTIO, 0x90) + +/* Unbind a vDPA device from the specified iommufd + * + * After the return of this ioctl, the vDPA device is unbinded from the specified + * iommufd. + */ +#define VHOST_VDPA_UNBIND_IOMMUFD _IO(VHOST_VIRTIO, 0x91) + +/* Associate the vDPA device with an address space within the bound iommufd + * + * After the return of this ioctl, the vDPA device is attached to the bound + * iommufd. + */ +#define VHOST_VDPA_ATTACH_IOMMUFD_PT _IO(VHOST_VIRTIO, 0x92) + +/* Detach the vDPA device from an address space within the bound iommufd. + * + * After the return of this ioctl, the vDPA device is detached from the address + * space within the bound iommufd. 
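Before moving on to vhost.h, it is worth spelling out how the new manual-clear flags are meant to be driven from userspace. The following is a minimal sketch of one dirty-log sync iteration; the container fd, IOVA range, page size and bitmap buffer are assumed to come from an already-configured type1 container with dirty tracking started, and error handling is trimmed:

    #include <stdlib.h>
    #include <sys/ioctl.h>
    #include <linux/vfio.h>

    /*
     * Fetch the dirty bitmap without clearing it, then clear it explicitly
     * before handling the pages, as the header comment above requires.
     */
    static int sync_dirty_log(int container_fd, __u64 iova, __u64 size,
                              __u64 pgsize, __u64 *bitmap)
    {
        size_t argsz = sizeof(struct vfio_iommu_type1_dirty_bitmap) +
                       sizeof(struct vfio_iommu_type1_dirty_bitmap_get);
        struct vfio_iommu_type1_dirty_bitmap *db = calloc(1, argsz);
        struct vfio_iommu_type1_dirty_bitmap_get *range;
        int ret;

        if (!db) {
            return -1;
        }
        range = (void *)db->data;
        db->argsz = argsz;          /* excludes the bitmap itself */
        db->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR;
        range->iova = iova;
        range->size = size;
        range->bitmap.pgsize = pgsize;
        range->bitmap.size = (size / pgsize + 7) / 8;   /* one bit per page */
        range->bitmap.data = bitmap;

        ret = ioctl(container_fd, VFIO_IOMMU_DIRTY_PAGES, db);
        if (!ret) {
            db->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP;
            ret = ioctl(container_fd, VFIO_IOMMU_DIRTY_PAGES, db);
        }
        free(db);
        return ret;     /* caller then copies the pages whose bits are set */
    }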
diff --git a/linux-headers/linux/vhost.h b/linux-headers/linux/vhost.h
index f5c48b61ab62244104bbf1b2100d3db7286f8c82..f5c05abe8b0a5bba7857318437c375a12014bda3 100644
--- a/linux-headers/linux/vhost.h
+++ b/linux-headers/linux/vhost.h
@@ -43,6 +43,10 @@
  * The bit is set using an atomic 32 bit operation. */
 /* Set base address for logging. */
 #define VHOST_SET_LOG_BASE _IOW(VHOST_VIRTIO, 0x04, __u64)
+/* Set buffer size for logging */
+#define VHOST_SET_LOG_SIZE _IOW(VHOST_VIRTIO, 0x05, __u64)
+/* Logging sync */
+#define VHOST_LOG_SYNC _IO(VHOST_VIRTIO, 0x06)
 /* Specify an eventfd file descriptor to signal on log write. */
 #define VHOST_SET_LOG_FD _IOW(VHOST_VIRTIO, 0x07, int)
 /* By default, a device gets one vhost_worker that its virtqueues share. This
@@ -219,4 +223,49 @@
  */
 #define VHOST_VDPA_RESUME		_IO(VHOST_VIRTIO, 0x7E)

+/* Get the group for the descriptor table including driver & device areas
+ * of a virtqueue: read index, write group in num.
+ * The virtqueue index is stored in the index field of vhost_vring_state.
+ * The group ID of the descriptor table for this specific virtqueue
+ * is returned via num field of vhost_vring_state.
+ */
+#define VHOST_VDPA_GET_VRING_DESC_GROUP	_IOWR(VHOST_VIRTIO, 0x7F,	\
+					      struct vhost_vring_state)
+
+/* Bind a vDPA device to the specified iommufd
+ *
+ * After the return of this ioctl, the vDPA device is bound to the specified
+ * iommufd, and the device id is also returned.
+ */
+#define VHOST_VDPA_BIND_IOMMUFD	_IO(VHOST_VIRTIO, 0x90)
+
+/* Unbind a vDPA device from the specified iommufd
+ *
+ * After the return of this ioctl, the vDPA device is unbound from the
+ * specified iommufd.
+ */
+#define VHOST_VDPA_UNBIND_IOMMUFD	_IO(VHOST_VIRTIO, 0x91)
+
+/* Associate the vDPA device with an address space within the bound iommufd
+ *
+ * After the return of this ioctl, the vDPA device is attached to the bound
+ * iommufd.
+ */
+#define VHOST_VDPA_ATTACH_IOMMUFD_PT	_IO(VHOST_VIRTIO, 0x92)
+
+/* Detach the vDPA device from an address space within the bound iommufd.
+ *
+ * After the return of this ioctl, the vDPA device is detached from the address
+ * space within the bound iommufd.
+ */
+#define VHOST_VDPA_DETACH_IOMMUFD_PT	_IO(VHOST_VIRTIO, 0x93)
+
+/* set and get device buffer */
+#define VHOST_GET_DEV_BUFFER	_IOR(VHOST_VIRTIO, 0xb0, struct vhost_vdpa_config)
+#define VHOST_SET_DEV_BUFFER	_IOW(VHOST_VIRTIO, 0xb1, struct vhost_vdpa_config)
+#define VHOST_GET_DEV_BUFFER_SIZE	_IOR(VHOST_VIRTIO, 0xb3, __u32)
+
+/* set device migration state */
+#define VHOST_VDPA_SET_MIG_STATE	_IOW(VHOST_VIRTIO, 0xb2, __u8)
+
 #endif
diff --git a/linux-user/elfload.c b/linux-user/elfload.c
index cf9e74468b11c7dd35dc6e6c3c57ef9d6336d01d..87eb318b46eb850646892e998bb116700a75459c 100644
--- a/linux-user/elfload.c
+++ b/linux-user/elfload.c
@@ -2980,7 +2980,7 @@ static uintptr_t pgb_try_itree(const PGBAddrs *ga, uintptr_t base,
 static uintptr_t pgb_find_itree(const PGBAddrs *ga, IntervalTreeRoot *root,
                                 uintptr_t align, uintptr_t brk)
 {
-    uintptr_t last = mmap_min_addr;
+    uintptr_t last = sizeof(uintptr_t) == 4 ? MiB : GiB;
     uintptr_t base, skip;

     while (true) {
@@ -3263,7 +3263,8 @@ static void load_elf_image(const char *image_name, const ImageSource *src,
                            char **pinterp_name)
 {
     g_autofree struct elf_phdr *phdr = NULL;
-    abi_ulong load_addr, load_bias, loaddr, hiaddr, error;
+    abi_ulong load_addr, load_bias, loaddr, hiaddr, error, align;
+    size_t reserve_size, align_size;
     int i, prot_exec;
     Error *err = NULL;

@@ -3347,6 +3348,9 @@ static void load_elf_image(const char *image_name, const ImageSource *src,

     load_addr = loaddr;

+    align = pow2ceil(info->alignment);
+    info->alignment = align;
+
     if (pinterp_name != NULL) {
         if (ehdr->e_type == ET_EXEC) {
             /*
@@ -3355,8 +3359,6 @@ static void load_elf_image(const char *image_name, const ImageSource *src,
              */
             probe_guest_base(image_name, loaddr, hiaddr);
         } else {
-            abi_ulong align;
-
             /*
              * The binary is dynamic, but we still need to
              * select guest_base. In this case we pass a size.
@@ -3374,10 +3376,7 @@ static void load_elf_image(const char *image_name, const ImageSource *src,
      * Since we do not have complete control over the guest
      * address space, we prefer the kernel to choose some address
      * rather than force the use of LOAD_ADDR via MAP_FIXED.
-     * But without MAP_FIXED we cannot guarantee alignment,
-     * only suggest it.
      */
-    align = pow2ceil(info->alignment);
     if (align) {
         load_addr &= -align;
     }
@@ -3401,13 +3400,35 @@ static void load_elf_image(const char *image_name, const ImageSource *src,
      * In both cases, we will overwrite pages in this range with mappings
      * from the executable.
      */
-    load_addr = target_mmap(load_addr, (size_t)hiaddr - loaddr + 1, PROT_NONE,
+    reserve_size = (size_t)hiaddr - loaddr + 1;
+    align_size = reserve_size;
+
+    if (ehdr->e_type != ET_EXEC && align > qemu_real_host_page_size()) {
+        align_size += align - 1;
+    }
+
+    load_addr = target_mmap(load_addr, align_size, PROT_NONE,
                             MAP_PRIVATE | MAP_ANON | MAP_NORESERVE |
                             (ehdr->e_type == ET_EXEC ?
MAP_FIXED_NOREPLACE : 0), -1, 0); if (load_addr == -1) { goto exit_mmap; } + + if (align_size != reserve_size) { + abi_ulong align_addr = ROUND_UP(load_addr, align); + abi_ulong align_end = align_addr + reserve_size; + abi_ulong load_end = load_addr + align_size; + + if (align_addr != load_addr) { + target_munmap(load_addr, align_addr - load_addr); + } + if (align_end != load_end) { + target_munmap(align_end, load_end - align_end); + } + load_addr = align_addr; + } + load_bias = load_addr - loaddr; if (elf_is_fdpic(ehdr)) { diff --git a/linux-user/syscall.c b/linux-user/syscall.c index e384e14248901c686a3aa747c5f0f2620382b8b6..513996e6fa26a226c7c907d381d42ea2ee909654 100644 --- a/linux-user/syscall.c +++ b/linux-user/syscall.c @@ -53,7 +53,6 @@ #include #include #include -//#include #include #include #include diff --git a/meson.build b/meson.build index 6c77d9687ded405fcd6cd8b231929663644c890a..d221f5cad53ec5864b35a8b4132d46df4d3c8b57 100644 --- a/meson.build +++ b/meson.build @@ -114,6 +114,8 @@ elif cpu in ['riscv32'] kvm_targets = ['riscv32-softmmu'] elif cpu in ['riscv64'] kvm_targets = ['riscv64-softmmu'] +elif cpu in ['loongarch64'] + kvm_targets = ['loongarch64-softmmu'] else kvm_targets = [] endif @@ -1041,6 +1043,32 @@ if not get_option('zstd').auto() or have_block required: get_option('zstd'), method: 'pkg-config') endif +qpl = not_found +if not get_option('qpl').auto() or have_system + qpl = dependency('qpl', version: '>=1.5.0', + required: get_option('qpl'), + method: 'pkg-config') +endif +uadk = not_found +if not get_option('uadk').auto() or have_system + libwd = dependency('libwd', version: '>=2.6', + required: get_option('uadk'), + method: 'pkg-config') + libwd_comp = dependency('libwd_comp', version: '>=2.6', + required: get_option('uadk'), + method: 'pkg-config') + if libwd.found() and libwd_comp.found() + uadk = declare_dependency(dependencies: [libwd, libwd_comp]) + endif +endif + +qatzip = not_found +if not get_option('qatzip').auto() or have_system + qatzip = dependency('qatzip', version: '>=1.1.2', + required: get_option('qatzip'), + method: 'pkg-config') +endif + virgl = not_found have_vhost_user_gpu = have_tools and targetos == 'linux' and pixman.found() @@ -1137,7 +1165,7 @@ iconv = not_found curses = not_found if have_system and get_option('curses').allowed() curses_test = ''' - #if defined(__APPLE__) || defined(__OpenBSD__) + #ifdef __APPLE__ #define _XOPEN_SOURCE_EXTENDED 1 #endif #include @@ -1483,6 +1511,8 @@ endif gcrypt = not_found nettle = not_found hogweed = not_found +crypto_sm4 = not_found +crypto_sm3 = not_found xts = 'none' if get_option('nettle').enabled() and get_option('gcrypt').enabled() @@ -1508,6 +1538,28 @@ if not gnutls_crypto.found() cc.find_library('gpg-error', required: true)], version: gcrypt.version()) endif + crypto_sm4 = gcrypt + # SM4 ALG is available in libgcrypt >= 1.9 + if gcrypt.found() and not cc.links(''' + #include + int main(void) { + gcry_cipher_hd_t handler; + gcry_cipher_open(&handler, GCRY_CIPHER_SM4, GCRY_CIPHER_MODE_ECB, 0); + return 0; + }''', dependencies: gcrypt) + crypto_sm4 = not_found + endif + crypto_sm3 = gcrypt + # SM3 ALG is available in libgcrypt >= 1.8 + if gcrypt.found() and not cc.links(''' + #include + int main(void) { + gcry_md_hd_t handler; + gcry_md_open(&handler, GCRY_MD_SM3, 0); + return 0; + }''', dependencies: gcrypt) + crypto_sm3 = not_found + endif endif if (not get_option('nettle').auto() or have_system) and not gcrypt.found() nettle = dependency('nettle', version: '>=3.4', @@ -1516,6 +1568,43 @@ 
if not gnutls_crypto.found() if nettle.found() and not cc.has_header('nettle/xts.h', dependencies: nettle) xts = 'private' endif + crypto_sm4 = nettle + # SM4 ALG is available in nettle >= 3.9 + if nettle.found() and not cc.links(''' + #include + int main(void) { + struct sm4_ctx ctx; + unsigned char key[16] = {0}; + sm4_set_encrypt_key(&ctx, key); + return 0; + }''', dependencies: nettle) + crypto_sm4 = not_found + endif + crypto_sm3 = nettle + # SM3 ALG is available in nettle >= 3.4 + if nettle.found() and not cc.links(''' + #include + #include + int main(void) { + struct sm3_ctx ctx; + struct hmac_sm3_ctx hmac_ctx; + unsigned char data[64] = {0}; + unsigned char output[32]; + + // SM3 hash function test + sm3_init(&ctx); + sm3_update(&ctx, 64, data); + sm3_digest(&ctx, 32, data); + + // HMAC-SM3 test + hmac_sm3_set_key(&hmac_ctx, 32, data); + hmac_sm3_update(&hmac_ctx, 64, data); + hmac_sm3_digest(&hmac_ctx, 32, output); + + return 0; + }''', dependencies: nettle) + crypto_sm3 = not_found + endif endif endif @@ -1909,8 +1998,14 @@ endif # libxdp libxdp = not_found if not get_option('af_xdp').auto() or have_system - libxdp = dependency('libxdp', required: get_option('af_xdp'), - version: '>=1.4.0', method: 'pkg-config') + if libbpf.found() + libxdp = dependency('libxdp', required: get_option('af_xdp'), + version: '>=1.4.0', method: 'pkg-config') + else + if get_option('af_xdp').enabled() + error('libxdp requested, but libbpf is not available') + endif + endif endif # libdw @@ -2118,6 +2213,8 @@ config_host_data.set('CONFIG_BLKIO', blkio.found()) if blkio.found() config_host_data.set('CONFIG_BLKIO_VHOST_VDPA_FD', blkio.version().version_compare('>=1.3.0')) + config_host_data.set('CONFIG_BLKIO_WRITE_ZEROS_FUA', + blkio.version().version_compare('>=1.4.0')) endif config_host_data.set('CONFIG_CURL', curl.found()) config_host_data.set('CONFIG_CURSES', curses.found()) @@ -2202,12 +2299,17 @@ config_host_data.set('CONFIG_GNUTLS_CRYPTO', gnutls_crypto.found()) config_host_data.set('CONFIG_TASN1', tasn1.found()) config_host_data.set('CONFIG_GCRYPT', gcrypt.found()) config_host_data.set('CONFIG_NETTLE', nettle.found()) +config_host_data.set('CONFIG_CRYPTO_SM4', crypto_sm4.found()) +config_host_data.set('CONFIG_CRYPTO_SM3', crypto_sm3.found()) config_host_data.set('CONFIG_HOGWEED', hogweed.found()) config_host_data.set('CONFIG_QEMU_PRIVATE_XTS', xts == 'private') config_host_data.set('CONFIG_MALLOC_TRIM', has_malloc_trim) config_host_data.set('CONFIG_STATX', has_statx) config_host_data.set('CONFIG_STATX_MNT_ID', has_statx_mnt_id) config_host_data.set('CONFIG_ZSTD', zstd.found()) +config_host_data.set('CONFIG_QPL', qpl.found()) +config_host_data.set('CONFIG_UADK', uadk.found()) +config_host_data.set('CONFIG_QATZIP', qatzip.found()) config_host_data.set('CONFIG_FUSE', fuse.found()) config_host_data.set('CONFIG_FUSE_LSEEK', fuse_lseek.found()) config_host_data.set('CONFIG_SPICE_PROTOCOL', spice_protocol.found()) @@ -3358,6 +3460,7 @@ if have_system or have_user 'target/hppa', 'target/i386', 'target/i386/kvm', + 'target/loongarch', 'target/mips/tcg', 'target/nios2', 'target/ppc', @@ -4277,6 +4380,8 @@ summary_info += {'nettle': nettle} if nettle.found() summary_info += {' XTS': xts != 'private'} endif +summary_info += {'SM4 ALG support': crypto_sm4} +summary_info += {'SM3 ALG support': crypto_sm3} summary_info += {'AF_ALG support': have_afalg} summary_info += {'rng-none': get_option('rng_none')} summary_info += {'Linux keyring': have_keyring} @@ -4379,6 +4484,9 @@ summary_info += {'snappy support': 
snappy} summary_info += {'bzip2 support': libbzip2} summary_info += {'lzfse support': liblzfse} summary_info += {'zstd support': zstd} +summary_info += {'Query Processing Library support': qpl} +summary_info += {'UADK Library support': uadk} +summary_info += {'qatzip support': qatzip} summary_info += {'NUMA host support': numa} summary_info += {'capstone': capstone} summary_info += {'libpmem support': libpmem} diff --git a/meson_options.txt b/meson_options.txt index c9baeda6395634c3478a3c2a3a8c8f57fbe0b592..61996300d57ee34c0a9843d1100b6309bf5bd260 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -121,7 +121,7 @@ option('avx512f', type: 'feature', value: 'disabled', description: 'AVX512F optimizations') option('avx512bw', type: 'feature', value: 'auto', description: 'AVX512BW optimizations') -option('keyring', type: 'feature', value: 'auto', +option('keyring', type: 'feature', value: 'disabled', description: 'Linux keyring support') option('libkeyutils', type: 'feature', value: 'auto', description: 'Linux keyutils support') @@ -259,6 +259,12 @@ option('xkbcommon', type : 'feature', value : 'auto', description: 'xkbcommon support') option('zstd', type : 'feature', value : 'auto', description: 'zstd compression support') +option('qpl', type : 'feature', value : 'auto', + description: 'Query Processing Library support') +option('uadk', type : 'feature', value : 'auto', + description: 'UADK Library support') +option('qatzip', type: 'feature', value: 'auto', + description: 'QATzip compression support') option('fuse', type: 'feature', value: 'auto', description: 'FUSE block device export') option('fuse_lseek', type : 'feature', value : 'auto', diff --git a/migration/block.c b/migration/block.c index a15f9bddcb9d8abaa8e9a31dd9f72ff732395b59..710ef6f4909b905a82cd0926c39659f0a6ee14ef 100644 --- a/migration/block.c +++ b/migration/block.c @@ -409,7 +409,10 @@ static int init_blk_migration(QEMUFile *f) } sectors = bdrv_nb_sectors(bs); - if (sectors <= 0) { + if (sectors == 0) { + continue; + } + if (sectors < 0) { ret = sectors; bdrv_next_cleanup(&it); goto out; diff --git a/migration/channel.c b/migration/channel.c index ca3319a30985c420020281391206ef418a014e3c..f9de064f3b134dfe9a2128e9189d5565dd7f81e7 100644 --- a/migration/channel.c +++ b/migration/channel.c @@ -117,9 +117,12 @@ int migration_channel_read_peek(QIOChannel *ioc, len = qio_channel_readv_full(ioc, &iov, 1, NULL, NULL, QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp); - if (len <= 0 && len != QIO_CHANNEL_ERR_BLOCK) { - error_setg(errp, - "Failed to peek at channel"); + if (len < 0 && len != QIO_CHANNEL_ERR_BLOCK) { + return -1; + } + + if (len == 0) { + error_setg(errp, "Failed to peek at channel"); return -1; } diff --git a/migration/colo.c b/migration/colo.c index 4447e349149a19bf1ed9969e923b1ea8b2296989..8f301b7e57164d484b0ffb28778ba82269611e60 100644 --- a/migration/colo.c +++ b/migration/colo.c @@ -830,6 +830,16 @@ static void *colo_process_incoming_thread(void *opaque) return NULL; } + /* Make sure all file formats throw away their mutable metadata */ + qemu_mutex_lock_iothread(); + bdrv_activate_all(&local_err); + if (local_err) { + qemu_mutex_unlock_iothread(); + error_report_err(local_err); + return NULL; + } + qemu_mutex_unlock_iothread(); + failover_init_state(); mis->to_src_file = qemu_file_get_return_path(mis->from_src_file); @@ -917,7 +927,6 @@ out: int coroutine_fn colo_incoming_co(void) { MigrationIncomingState *mis = migration_incoming_get_current(); - Error *local_err = NULL; QemuThread th; 
assert(qemu_mutex_iothread_locked()); @@ -926,13 +935,6 @@ int coroutine_fn colo_incoming_co(void) return 0; } - /* Make sure all file formats throw away their mutable metadata */ - bdrv_activate_all(&local_err); - if (local_err) { - error_report_err(local_err); - return -EINVAL; - } - qemu_thread_create(&th, "COLO incoming", colo_process_incoming_thread, mis, QEMU_THREAD_JOINABLE); diff --git a/migration/file.c b/migration/file.c index 5d4975f43e198b6dfdc86a6c96ddcd1b2f09e3c1..fb3f743e54e6699ce037bc501f9cd136a0fc43be 100644 --- a/migration/file.c +++ b/migration/file.c @@ -46,12 +46,19 @@ void file_start_outgoing_migration(MigrationState *s, trace_migration_file_outgoing(filename); - fioc = qio_channel_file_new_path(filename, O_CREAT | O_WRONLY | O_TRUNC, - 0600, errp); + fioc = qio_channel_file_new_path(filename, O_CREAT | O_WRONLY, 0600, errp); if (!fioc) { return; } + if (ftruncate(fioc->fd, offset)) { + error_setg_errno(errp, errno, + "failed to truncate migration file to offset %" PRIx64, + offset); + object_unref(OBJECT(fioc)); + return; + } + ioc = QIO_CHANNEL(fioc); if (offset && qio_channel_io_seek(ioc, offset, SEEK_SET, errp) < 0) { return; diff --git a/migration/meson.build b/migration/meson.build index 92b1cc429783cf040fec324ebb111105d1b68284..aba2581705cb53de18765bbe854926a58a2c21cc 100644 --- a/migration/meson.build +++ b/migration/meson.build @@ -22,7 +22,7 @@ system_ss.add(files( 'migration.c', 'multifd.c', 'multifd-zlib.c', - 'ram-compress.c', + 'multifd-zero-page.c', 'options.c', 'postcopy-ram.c', 'savevm.c', @@ -40,7 +40,11 @@ if get_option('live_block_migration').allowed() system_ss.add(files('block.c')) endif system_ss.add(when: zstd, if_true: files('multifd-zstd.c')) +system_ss.add(when: qpl, if_true: files('multifd-qpl.c')) +system_ss.add(when: uadk, if_true: files('multifd-uadk.c')) +system_ss.add(when: qatzip, if_true: files('multifd-qatzip.c')) specific_ss.add(when: 'CONFIG_SYSTEM_ONLY', if_true: files('ram.c', - 'target.c')) + 'target.c', + 'ram-compress.c')) diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c index 86ae832176a03512e4d5a4c9896facebc55edf4d..d6d5f373a198505cf6a36ac679ab80af8f3dc10b 100644 --- a/migration/migration-hmp-cmds.c +++ b/migration/migration-hmp-cmds.c @@ -30,6 +30,7 @@ #include "sysemu/runstate.h" #include "ui/qemu-spice.h" #include "sysemu/sysemu.h" +#include "sysemu/kvm.h" #include "options.h" #include "migration.h" @@ -291,6 +292,9 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict) MigrationParameter_str(MIGRATION_PARAMETER_DECOMPRESS_THREADS), params->decompress_threads); assert(params->has_throttle_trigger_threshold); + monitor_printf(mon, "%s: %s\n", + MigrationParameter_str(MIGRATION_PARAMETER_COMPRESS_METHOD), + CompressMethod_str(params->compress_method)); monitor_printf(mon, "%s: %u\n", MigrationParameter_str(MIGRATION_PARAMETER_THROTTLE_TRIGGER_THRESHOLD), params->throttle_trigger_threshold); @@ -344,6 +348,11 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict) monitor_printf(mon, "%s: %s\n", MigrationParameter_str(MIGRATION_PARAMETER_MULTIFD_COMPRESSION), MultiFDCompression_str(params->multifd_compression)); + assert(params->has_zero_page_detection); + monitor_printf(mon, "%s: %s\n", + MigrationParameter_str(MIGRATION_PARAMETER_ZERO_PAGE_DETECTION), + qapi_enum_lookup(&ZeroPageDetection_lookup, + params->zero_page_detection)); monitor_printf(mon, "%s: %" PRIu64 " bytes\n", MigrationParameter_str(MIGRATION_PARAMETER_XBZRLE_CACHE_SIZE), params->xbzrle_cache_size); 
@@ -392,6 +401,24 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict) monitor_printf(mon, "%s: %s\n", MigrationParameter_str(MIGRATION_PARAMETER_MODE), qapi_enum_lookup(&MigMode_lookup, params->mode)); + + assert(params->sev_pdh); + monitor_printf(mon, "%s: %s\n", + MigrationParameter_str(MIGRATION_PARAMETER_SEV_PDH), + params->sev_pdh); + assert(params->sev_plat_cert); + monitor_printf(mon, "%s: %s\n", + MigrationParameter_str(MIGRATION_PARAMETER_SEV_PLAT_CERT), + params->sev_plat_cert); + assert(params->sev_amd_cert); + monitor_printf(mon, "%s: %s\n", + MigrationParameter_str(MIGRATION_PARAMETER_SEV_AMD_CERT), + params->sev_amd_cert); + + assert(params->has_hdbss_buffer_size); + monitor_printf(mon, "%s: %u\n", + MigrationParameter_str(MIGRATION_PARAMETER_HDBSS_BUFFER_SIZE), + params->hdbss_buffer_size); } qapi_free_MigrationParameters(params); @@ -403,6 +430,11 @@ void hmp_loadvm(Monitor *mon, const QDict *qdict) const char *name = qdict_get_str(qdict, "name"); Error *err = NULL; + if (virtcca_cvm_enabled()) { + error_setg(&err, "The loadvm command is temporarily unsupported in cvm."); + return; + } + vm_stop(RUN_STATE_RESTORE_VM); if (load_snapshot(name, NULL, false, NULL, &err) && saved_vm_running) { @@ -519,6 +551,7 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) MigrateSetParameters *p = g_new0(MigrateSetParameters, 1); uint64_t valuebw = 0; uint64_t cache_size; + CompressMethod compress_method; Error *err = NULL; int val, ret; @@ -544,6 +577,14 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) p->has_decompress_threads = true; visit_type_uint8(v, param, &p->decompress_threads, &err); break; + case MIGRATION_PARAMETER_COMPRESS_METHOD: + p->has_compress_method = true; + visit_type_CompressMethod(v, param, &compress_method, &err); + if (err) { + break; + } + p->compress_method = compress_method; + break; case MIGRATION_PARAMETER_THROTTLE_TRIGGER_THRESHOLD: p->has_throttle_trigger_threshold = true; visit_type_uint8(v, param, &p->throttle_trigger_threshold, &err); @@ -628,10 +669,18 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) p->has_multifd_zlib_level = true; visit_type_uint8(v, param, &p->multifd_zlib_level, &err); break; + case MIGRATION_PARAMETER_MULTIFD_QATZIP_LEVEL: + p->has_multifd_qatzip_level = true; + visit_type_uint8(v, param, &p->multifd_qatzip_level, &err); + break; case MIGRATION_PARAMETER_MULTIFD_ZSTD_LEVEL: p->has_multifd_zstd_level = true; visit_type_uint8(v, param, &p->multifd_zstd_level, &err); break; + case MIGRATION_PARAMETER_ZERO_PAGE_DETECTION: + p->has_zero_page_detection = true; + visit_type_ZeroPageDetection(v, param, &p->zero_page_detection, &err); + break; case MIGRATION_PARAMETER_XBZRLE_CACHE_SIZE: p->has_xbzrle_cache_size = true; if (!visit_type_size(v, param, &cache_size, &err)) { @@ -679,6 +728,25 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) p->has_mode = true; visit_type_MigMode(v, param, &p->mode, &err); break; + case MIGRATION_PARAMETER_SEV_PDH: + p->sev_pdh = g_new0(StrOrNull, 1); + p->sev_pdh->type = QTYPE_QSTRING; + visit_type_str(v, param, &p->sev_pdh->u.s, &err); + break; + case MIGRATION_PARAMETER_SEV_PLAT_CERT: + p->sev_plat_cert = g_new0(StrOrNull, 1); + p->sev_plat_cert->type = QTYPE_QSTRING; + visit_type_str(v, param, &p->sev_plat_cert->u.s, &err); + break; + case MIGRATION_PARAMETER_SEV_AMD_CERT: + p->sev_amd_cert = g_new0(StrOrNull, 1); + p->sev_amd_cert->type = QTYPE_QSTRING; + visit_type_str(v, param, &p->sev_amd_cert->u.s, &err); + break; 
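+        /*
+         * Illustrative HMP usage for two of the parameters handled above
+         * (names follow the usual QAPI dash convention; the values are
+         * examples, not defaults):
+         *
+         *   (qemu) migrate_set_parameter zero-page-detection multifd
+         *   (qemu) migrate_set_parameter multifd-qatzip-level 5
+         */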
+ case MIGRATION_PARAMETER_HDBSS_BUFFER_SIZE: + p->has_hdbss_buffer_size = true; + visit_type_uint8(v, param, &p->hdbss_buffer_size, &err); + break; default: assert(0); } diff --git a/migration/migration.c b/migration/migration.c index 3ce04b2aafe2cf40ca47af3743938a54626d2383..526f926b797279958db61c1b163e60f76577e384 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -68,6 +68,8 @@ #include "sysemu/dirtylimit.h" #include "qemu/sockets.h" +#define DEFAULT_FD_MAX 4096 + static NotifierList migration_state_notifiers = NOTIFIER_LIST_INITIALIZER(migration_state_notifiers); @@ -99,7 +101,6 @@ static bool migration_object_check(MigrationState *ms, Error **errp); static int migration_maybe_pause(MigrationState *s, int *current_active_state, int new_state); -static void migrate_fd_cancel(MigrationState *s); static bool close_return_path_on_source(MigrationState *s); static void migration_downtime_start(MigrationState *s) @@ -128,11 +129,17 @@ static bool migration_needs_multiple_sockets(void) return migrate_multifd() || migrate_postcopy_preempt(); } -static bool transport_supports_multi_channels(SocketAddress *saddr) +static bool transport_supports_multi_channels(MigrationAddress *addr) { - return saddr->type == SOCKET_ADDRESS_TYPE_INET || - saddr->type == SOCKET_ADDRESS_TYPE_UNIX || - saddr->type == SOCKET_ADDRESS_TYPE_VSOCK; + if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET) { + SocketAddress *saddr = &addr->u.socket; + + return saddr->type == SOCKET_ADDRESS_TYPE_INET || + saddr->type == SOCKET_ADDRESS_TYPE_UNIX || + saddr->type == SOCKET_ADDRESS_TYPE_VSOCK; + } + + return false; } static bool @@ -140,8 +147,7 @@ migration_channels_and_transport_compatible(MigrationAddress *addr, Error **errp) { if (migration_needs_multiple_sockets() && - (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET) && - !transport_supports_multi_channels(&addr->u.socket)) { + !transport_supports_multi_channels(addr)) { error_setg(errp, "Migration requires multi-channel URIs (e.g. tcp)"); return false; } @@ -270,8 +276,13 @@ void migration_incoming_state_destroy(void) { struct MigrationIncomingState *mis = migration_incoming_get_current(); - multifd_load_cleanup(); + multifd_recv_cleanup(); compress_threads_load_cleanup(); + /* + * RAM state cleanup needs to happen after multifd cleanup, because + * multifd threads can use some of its states (receivedmap). 
+     */
+    qemu_loadvm_state_cleanup();

     if (mis->to_src_file) {
         /* Tell source that we are done */
@@ -623,7 +634,7 @@ static void process_incoming_migration_bh(void *opaque)

     trace_vmstate_downtime_checkpoint("dst-precopy-bh-announced");

-    multifd_load_shutdown();
+    multifd_recv_shutdown();

     dirty_bitmap_mig_before_vm_start();

@@ -699,6 +710,13 @@ process_incoming_migration_co(void *opaque)
     }

     if (ret < 0) {
+        MigrationState *s = migrate_get_current();
+
+        if (migrate_has_error(s)) {
+            WITH_QEMU_LOCK_GUARD(&s->error_mutex) {
+                error_report_err(s->error);
+            }
+        }
         error_report("load of migration failed: %s", strerror(-ret));
         goto fail;
     }
@@ -715,7 +733,7 @@ fail:
                       MIGRATION_STATUS_FAILED);
     qemu_fclose(mis->from_src_file);

-    multifd_load_cleanup();
+    multifd_recv_cleanup();
     compress_threads_load_cleanup();

     exit(EXIT_FAILURE);
@@ -848,8 +866,7 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
         default_channel = !mis->from_src_file;
     }

-    if (multifd_load_setup(errp) != 0) {
-        error_setg(errp, "Failed to setup multifd channels");
+    if (multifd_recv_setup(errp) != 0) {
         return;
     }

@@ -1301,7 +1318,7 @@ static void migrate_fd_cleanup(MigrationState *s)
     }
     qemu_mutex_lock_iothread();

-    multifd_save_cleanup();
+    multifd_send_shutdown();
     qemu_mutex_lock(&s->qemu_file_lock);
     tmp = s->to_dst_file;
     s->to_dst_file = NULL;
@@ -1377,7 +1394,7 @@ static void migrate_error_free(MigrationState *s)
     }
 }

-static void migrate_fd_error(MigrationState *s, const Error *error)
+void migrate_fd_error(MigrationState *s, const Error *error)
 {
     trace_migrate_fd_error(error_get_pretty(error));
     assert(s->to_dst_file == NULL);
@@ -1386,7 +1403,7 @@ static void migrate_fd_error(MigrationState *s, const Error *error)
     migrate_set_error(s, error);
 }

-static void migrate_fd_cancel(MigrationState *s)
+void migrate_fd_cancel(MigrationState *s)
 {
     int old_state ;

@@ -1713,6 +1730,31 @@ void migrate_del_blocker(Error **reasonp)
     }
 }

+/*
+ * The kernel expands the fd table allocated to the QEMU process whenever the
+ * number of fds held by the process exceeds a power of 2 (starting from 64).
+ * Each expansion introduces tens of ms of latency due to RCU synchronization.
+ * The expansion is therefore completed during QEMU process initialization, to
+ * avoid triggering it during the migration downtime phase.
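+ * (Illustratively: with DEFAULT_FD_MAX at 4096, the dup() loop below walks
+ * the fd table through its 64, 128, ..., 4096-entry growth steps once at
+ * startup, so no expansion remains to be triggered during downtime.)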
+ */ +static void qemu_pre_extend_fdtable(void) +{ + int buffer[DEFAULT_FD_MAX] = {0}; + int i; + + /* expand fdtable */ + for (i = 0; i < DEFAULT_FD_MAX; i++) { + buffer[i] = qemu_dup(STDIN_FILENO); + } + + /* close tmp fd */ + for (i = 0; i < DEFAULT_FD_MAX; i++) { + if (buffer[i] > 0) { + (void)qemu_close(buffer[i]); + } + } +} + void qmp_migrate_incoming(const char *uri, bool has_channels, MigrationChannelList *channels, Error **errp) { @@ -1732,6 +1774,8 @@ void qmp_migrate_incoming(const char *uri, bool has_channels, return; } + qemu_pre_extend_fdtable(); + qemu_start_incoming_migration(uri, has_channels, channels, &local_err); if (local_err) { @@ -3299,6 +3343,9 @@ static void *migration_thread(void *opaque) MigThrError thr_error; bool urgent = false; + /* report migration thread pid to libvirt */ + qapi_event_send_migration_pid(qemu_get_thread_id()); + thread = migration_threads_add("live_migration", qemu_get_thread_id()); rcu_register_thread(); @@ -3306,6 +3353,10 @@ static void *migration_thread(void *opaque) object_ref(OBJECT(s)); update_iteration_initial_status(s); + if (!multifd_send_setup()) { + goto out; + } + qemu_mutex_lock_iothread(); qemu_savevm_state_header(s->to_dst_file); qemu_mutex_unlock_iothread(); @@ -3377,6 +3428,7 @@ static void *migration_thread(void *opaque) urgent = migration_rate_limit(); } +out: trace_migration_thread_after_loop(); migration_iteration_finish(s); object_unref(OBJECT(s)); @@ -3630,15 +3682,6 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) return; } - if (multifd_save_setup(&local_err) != 0) { - migrate_set_error(s, local_err); - error_report_err(local_err); - migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, - MIGRATION_STATUS_FAILED); - migrate_fd_cleanup(s); - return; - } - if (migrate_background_snapshot()) { qemu_thread_create(&s->thread, "bg_snapshot", bg_migration_thread, s, QEMU_THREAD_JOINABLE); diff --git a/migration/migration.h b/migration/migration.h index cf2c9c88e01d670b5b4c9d3de315a1b37a4f0d21..eeddb7c0bde3ec700200820aca6b99bb2544ee9d 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -482,6 +482,7 @@ bool migration_has_all_channels(void); uint64_t migrate_max_downtime(void); +void migrate_fd_error(MigrationState *s, const Error *error); void migrate_set_error(MigrationState *s, const Error *error); bool migrate_has_error(MigrationState *s); @@ -550,4 +551,8 @@ void migration_rp_kick(MigrationState *s); int migration_stop_vm(RunState state); +void migrate_fd_cancel(MigrationState *s); + +bool memcrypt_enabled(void); + #endif diff --git a/migration/multifd-qatzip.c b/migration/multifd-qatzip.c new file mode 100644 index 0000000000000000000000000000000000000000..88b6fb44ad66263629edf1e79f15fe39900c43c3 --- /dev/null +++ b/migration/multifd-qatzip.c @@ -0,0 +1,395 @@ +/* + * Multifd QATzip compression implementation + * + * Copyright (c) Bytedance + * + * Authors: + * Bryan Zhang + * Hao Xiang + * Yichen Wang + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ */ + +#include "qemu/osdep.h" +#include "exec/ramblock.h" +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "qapi/qapi-types-migration.h" +#include "options.h" +#include "multifd.h" +#include + +typedef struct { + /* + * Unique session for use with QATzip API + */ + QzSession_T sess; + + /* + * For compression: Buffer for pages to compress + * For decompression: Buffer for data to decompress + */ + uint8_t *in_buf; + uint32_t in_len; + + /* + * For compression: Output buffer of compressed data + * For decompression: Output buffer of decompressed data + */ + uint8_t *out_buf; + uint32_t out_len; +} QatzipData; + +/** + * qatzip_send_setup: Set up QATzip session and private buffers. + * + * @param p Multifd channel params + * @param errp Pointer to error, which will be set in case of error + * @return 0 on success, -1 on error (and *errp will be set) + */ +static int qatzip_send_setup(MultiFDSendParams *p, Error **errp) +{ + QatzipData *q; + QzSessionParamsDeflate_T params; + const char *err_msg; + int ret; + + q = g_new0(QatzipData, 1); + p->compress_data = q; + /* We need one extra place for the packet header */ + p->iov = g_new0(struct iovec, 2); + + /* + * Initialize QAT device with software fallback by default. This allows + * QATzip to use CPU path when QAT hardware reaches maximum throughput. + */ + ret = qzInit(&q->sess, true); + if (ret != QZ_OK && ret != QZ_DUPLICATE) { + err_msg = "qzInit failed"; + goto err; + } + + ret = qzGetDefaultsDeflate(¶ms); + if (ret != QZ_OK) { + err_msg = "qzGetDefaultsDeflate failed"; + goto err; + } + + /* Make sure to use configured QATzip compression level. */ + params.common_params.comp_lvl = migrate_multifd_qatzip_level(); + ret = qzSetupSessionDeflate(&q->sess, ¶ms); + if (ret != QZ_OK && ret != QZ_DUPLICATE) { + err_msg = "qzSetupSessionDeflate failed"; + goto err; + } + + if (MULTIFD_PACKET_SIZE > UINT32_MAX) { + err_msg = "packet size too large for QAT"; + goto err; + } + + q->in_len = MULTIFD_PACKET_SIZE; + /* + * PINNED_MEM is an enum from qatzip headers, which means to use + * kzalloc_node() to allocate memory for QAT DMA purposes. When QAT device + * is not available or software fallback is used, the malloc flag needs to + * be set as COMMON_MEM. + */ + q->in_buf = qzMalloc(q->in_len, 0, PINNED_MEM); + if (!q->in_buf) { + q->in_buf = qzMalloc(q->in_len, 0, COMMON_MEM); + if (!q->in_buf) { + err_msg = "qzMalloc failed"; + goto err; + } + } + + q->out_len = qzMaxCompressedLength(MULTIFD_PACKET_SIZE, &q->sess); + q->out_buf = qzMalloc(q->out_len, 0, PINNED_MEM); + if (!q->out_buf) { + q->out_buf = qzMalloc(q->out_len, 0, COMMON_MEM); + if (!q->out_buf) { + err_msg = "qzMalloc failed"; + goto err; + } + } + + return 0; + +err: + error_setg(errp, "multifd %u: [sender] %s", p->id, err_msg); + return -1; +} + +/** + * qatzip_send_cleanup: Tear down QATzip session and release private buffers. + * + * @param p Multifd channel params + * @param errp Pointer to error, which will be set in case of error + * @return None + */ +static void qatzip_send_cleanup(MultiFDSendParams *p, Error **errp) +{ + QatzipData *q = p->compress_data; + + if (q) { + if (q->in_buf) { + qzFree(q->in_buf); + } + if (q->out_buf) { + qzFree(q->out_buf); + } + (void)qzTeardownSession(&q->sess); + (void)qzClose(&q->sess); + g_free(q); + } + + g_free(p->iov); + p->iov = NULL; + p->compress_data = NULL; +} + +/** + * qatzip_send_prepare: Compress pages and update IO channel info. 
+ * + * @param p Multifd channel params + * @param errp Pointer to error, which will be set in case of error + * @return 0 on success, -1 on error (and *errp will be set) + */ +static int qatzip_send_prepare(MultiFDSendParams *p, Error **errp) +{ + MultiFDPages_t *pages = p->pages; + QatzipData *q = p->compress_data; + int ret; + unsigned int in_len, out_len; + + if (!multifd_send_prepare_common(p)) { + goto out; + } + + /* + * Unlike other multifd compression implementations, we use a non-streaming + * API and place all the data into one buffer, rather than sending each + * page to the compression API at a time. Based on initial benchmarks, the + * non-streaming API outperforms the streaming API. Plus, the logic in QEMU + * is friendly to using the non-streaming API anyway. If either of these + * statements becomes no longer true, we can revisit adding a streaming + * implementation. + */ + for (int i = 0; i < pages->normal_num; i++) { + memcpy(q->in_buf + (i * p->page_size), + pages->block->host + pages->offset[i], + p->page_size); + } + + in_len = pages->normal_num * p->page_size; + if (in_len > q->in_len) { + error_setg(errp, "multifd %u: unexpectedly large input", p->id); + return -1; + } + out_len = q->out_len; + + ret = qzCompress(&q->sess, q->in_buf, &in_len, q->out_buf, &out_len, 1); + if (ret != QZ_OK) { + error_setg(errp, "multifd %u: QATzip returned %d instead of QZ_OK", + p->id, ret); + return -1; + } + if (in_len != pages->normal_num * p->page_size) { + error_setg(errp, "multifd %u: QATzip failed to compress all input", + p->id); + return -1; + } + + p->iov[p->iovs_num].iov_base = q->out_buf; + p->iov[p->iovs_num].iov_len = out_len; + p->iovs_num++; + p->next_packet_size = out_len; + +out: + p->flags |= MULTIFD_FLAG_QATZIP; + multifd_send_fill_packet(p); + return 0; +} + +/** + * qatzip_recv_setup: Set up QATzip session and allocate private buffers. + * + * @param p Multifd channel params + * @param errp Pointer to error, which will be set in case of error + * @return 0 on success, -1 on error (and *errp will be set) + */ +static int qatzip_recv_setup(MultiFDRecvParams *p, Error **errp) +{ + QatzipData *q; + QzSessionParamsDeflate_T params; + const char *err_msg; + int ret; + + q = g_new0(QatzipData, 1); + p->compress_data = q; + + /* + * Initialize QAT device with software fallback by default. This allows + * QATzip to use CPU path when QAT hardware reaches maximum throughput. + */ + ret = qzInit(&q->sess, true); + if (ret != QZ_OK && ret != QZ_DUPLICATE) { + err_msg = "qzInit failed"; + goto err; + } + + ret = qzGetDefaultsDeflate(¶ms); + if (ret != QZ_OK) { + err_msg = "qzGetDefaultsDeflate failed"; + goto err; + } + + ret = qzSetupSessionDeflate(&q->sess, ¶ms); + if (ret != QZ_OK && ret != QZ_DUPLICATE) { + err_msg = "qzSetupSessionDeflate failed"; + goto err; + } + + /* + * Reserve extra spaces for the incoming packets. Current implementation + * doesn't send uncompressed pages in case the compression gets too big. + */ + q->in_len = MULTIFD_PACKET_SIZE * 2; + /* + * PINNED_MEM is an enum from qatzip headers, which means to use + * kzalloc_node() to allocate memory for QAT DMA purposes. When QAT device + * is not available or software fallback is used, the malloc flag needs to + * be set as COMMON_MEM. 
+ */ + q->in_buf = qzMalloc(q->in_len, 0, PINNED_MEM); + if (!q->in_buf) { + q->in_buf = qzMalloc(q->in_len, 0, COMMON_MEM); + if (!q->in_buf) { + err_msg = "qzMalloc failed"; + goto err; + } + } + + q->out_len = MULTIFD_PACKET_SIZE; + q->out_buf = qzMalloc(q->out_len, 0, PINNED_MEM); + if (!q->out_buf) { + q->out_buf = qzMalloc(q->out_len, 0, COMMON_MEM); + if (!q->out_buf) { + err_msg = "qzMalloc failed"; + goto err; + } + } + + return 0; + +err: + error_setg(errp, "multifd %u: [receiver] %s", p->id, err_msg); + return -1; +} + +/** + * qatzip_recv_cleanup: Tear down QATzip session and release private buffers. + * + * @param p Multifd channel params + * @return None + */ +static void qatzip_recv_cleanup(MultiFDRecvParams *p) +{ + QatzipData *q = p->compress_data; + + if (q) { + if (q->in_buf) { + qzFree(q->in_buf); + } + if (q->out_buf) { + qzFree(q->out_buf); + } + (void)qzTeardownSession(&q->sess); + (void)qzClose(&q->sess); + g_free(q); + } + p->compress_data = NULL; +} + + +/** + * qatzip_recv: Decompress pages and copy them to the appropriate + * locations. + * + * @param p Multifd channel params + * @param errp Pointer to error, which will be set in case of error + * @return 0 on success, -1 on error (and *errp will be set) + */ +static int qatzip_recv(MultiFDRecvParams *p, Error **errp) +{ + QatzipData *q = p->compress_data; + int ret; + unsigned int in_len, out_len; + uint32_t in_size = p->next_packet_size; + uint32_t expected_size = p->normal_num * p->page_size; + uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; + + if (in_size > q->in_len) { + error_setg(errp, "multifd %u: received unexpectedly large packet", + p->id); + return -1; + } + + if (flags != MULTIFD_FLAG_QATZIP) { + error_setg(errp, "multifd %u: flags received %x flags expected %x", + p->id, flags, MULTIFD_FLAG_QATZIP); + return -1; + } + + multifd_recv_zero_page_process(p); + if (!p->normal_num) { + assert(in_size == 0); + return 0; + } + + ret = qio_channel_read_all(p->c, (void *)q->in_buf, in_size, errp); + if (ret != 0) { + return ret; + } + + in_len = in_size; + out_len = q->out_len; + ret = qzDecompress(&q->sess, q->in_buf, &in_len, q->out_buf, &out_len); + if (ret != QZ_OK) { + error_setg(errp, "multifd %u: qzDecompress failed", p->id); + return -1; + } + if (out_len != expected_size) { + error_setg(errp, "multifd %u: packet size received %u size expected %u", + p->id, out_len, expected_size); + return -1; + } + + /* Copy each page to its appropriate location. 
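+     * (The payload is one deflate stream covering all normal_num pages back
+     * to back, so page i of the decompressed output lives at
+     * out_buf + i * page_size.)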
+     */
+    for (int i = 0; i < p->normal_num; i++) {
+        memcpy(p->host + p->normal[i],
+               q->out_buf + p->page_size * i,
+               p->page_size);
+        ramblock_recv_bitmap_set_offset(p->block, p->normal[i]);
+    }
+    return 0;
+}
+
+static MultiFDMethods multifd_qatzip_ops = {
+    .send_setup = qatzip_send_setup,
+    .send_cleanup = qatzip_send_cleanup,
+    .send_prepare = qatzip_send_prepare,
+    .recv_setup = qatzip_recv_setup,
+    .recv_cleanup = qatzip_recv_cleanup,
+    .recv = qatzip_recv
+};
+
+static void multifd_qatzip_register(void)
+{
+    multifd_register_ops(MULTIFD_COMPRESSION_QATZIP, &multifd_qatzip_ops);
+}
+
+migration_init(multifd_qatzip_register);
diff --git a/migration/multifd-qpl.c b/migration/multifd-qpl.c
new file mode 100644
index 0000000000000000000000000000000000000000..fea60e39379511dac03d63ada0fb0499f31e48ff
--- /dev/null
+++ b/migration/multifd-qpl.c
@@ -0,0 +1,763 @@
+/*
+ * Multifd qpl compression accelerator implementation
+ *
+ * Copyright (c) 2023 Intel Corporation
+ *
+ * Authors:
+ *  Yuan Liu
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/module.h"
+#include "qapi/error.h"
+#include "qapi/qapi-types-migration.h"
+#include "exec/ramblock.h"
+#include "multifd.h"
+#include "qpl/qpl.h"
+
+/* Maximum number of retries to resubmit a job if IAA work queues are full */
+#define MAX_SUBMIT_RETRY_NUM (3)
+
+typedef struct {
+    /* the QPL hardware path job */
+    qpl_job *job;
+    /* indicates if fallback to software path is required */
+    bool fallback_sw_path;
+    /* output data from the software path */
+    uint8_t *sw_output;
+    /* output data length from the software path */
+    uint32_t sw_output_len;
+} QplHwJob;
+
+typedef struct {
+    /* array of hardware jobs, the number of jobs equals the number of pages */
+    QplHwJob *hw_jobs;
+    /* the QPL software job for the slow path and software fallback */
+    qpl_job *sw_job;
+    /* the number of pages that the QPL needs to process at one time */
+    uint32_t page_num;
+    /* array of compressed page buffers */
+    uint8_t *zbuf;
+    /* array of compressed page lengths */
+    uint32_t *zlen;
+    /* the status of the hardware device */
+    bool hw_avail;
+} QplData;
+
+/**
+ * check_hw_avail: check if IAA hardware is available
+ *
+ * If the IAA hardware does not exist or is unavailable,
+ * the QPL hardware job initialization will fail.
+ *
+ * Returns true if IAA hardware is available, otherwise false.
+ *
+ * @job_size: indicates the hardware job size if hardware is available
+ */
+static bool check_hw_avail(uint32_t *job_size)
+{
+    qpl_path_t path = qpl_path_hardware;
+    uint32_t size = 0;
+    qpl_job *job;
+
+    if (qpl_get_job_size(path, &size) != QPL_STS_OK) {
+        return false;
+    }
+    assert(size > 0);
+    job = g_malloc0(size);
+    if (qpl_init_job(path, job) != QPL_STS_OK) {
+        g_free(job);
+        return false;
+    }
+    g_free(job);
+    *job_size = size;
+    return true;
+}
+
+/**
+ * multifd_qpl_free_sw_job: clean up the software job
+ *
+ * Free the software job resources.
+ *
+ * @qpl: pointer to the QplData structure
+ */
+static void multifd_qpl_free_sw_job(QplData *qpl)
+{
+    assert(qpl);
+    if (qpl->sw_job) {
+        qpl_fini_job(qpl->sw_job);
+        g_free(qpl->sw_job);
+        qpl->sw_job = NULL;
+    }
+}
+
+/**
+ * multifd_qpl_free_hw_job: clean up the hardware jobs
+ *
+ * Free all hardware job resources.
+ * + * @qpl: pointer to the QplData structure + */ +static void multifd_qpl_free_hw_job(QplData *qpl) +{ + assert(qpl); + if (qpl->hw_jobs) { + for (int i = 0; i < qpl->page_num; i++) { + qpl_fini_job(qpl->hw_jobs[i].job); + g_free(qpl->hw_jobs[i].job); + qpl->hw_jobs[i].job = NULL; + } + g_free(qpl->hw_jobs); + qpl->hw_jobs = NULL; + } +} + +/** + * multifd_qpl_init_sw_job: initialize a software job + * + * Use the QPL software path to initialize a job + * + * @qpl: pointer to the QplData structure + * @errp: pointer to an error + */ +static int multifd_qpl_init_sw_job(QplData *qpl, Error **errp) +{ + qpl_path_t path = qpl_path_software; + uint32_t size = 0; + qpl_job *job = NULL; + qpl_status status; + + status = qpl_get_job_size(path, &size); + if (status != QPL_STS_OK) { + error_setg(errp, "qpl_get_job_size failed with error %d", status); + return -1; + } + job = g_malloc0(size); + status = qpl_init_job(path, job); + if (status != QPL_STS_OK) { + error_setg(errp, "qpl_init_job failed with error %d", status); + g_free(job); + return -1; + } + qpl->sw_job = job; + return 0; +} + +/** + * multifd_qpl_init_jobs: initialize hardware jobs + * + * Use the QPL hardware path to initialize jobs + * + * @qpl: pointer to the QplData structure + * @size: the size of QPL hardware path job + * @errp: pointer to an error + */ +static void multifd_qpl_init_hw_job(QplData *qpl, uint32_t size, Error **errp) +{ + qpl_path_t path = qpl_path_hardware; + qpl_job *job = NULL; + qpl_status status; + + qpl->hw_jobs = g_new0(QplHwJob, qpl->page_num); + for (int i = 0; i < qpl->page_num; i++) { + job = g_malloc0(size); + status = qpl_init_job(path, job); + /* the job initialization should succeed after check_hw_avail */ + assert(status == QPL_STS_OK); + qpl->hw_jobs[i].job = job; + } +} + +/** + * multifd_qpl_init: initialize QplData structure + * + * Allocate and initialize a QplData structure + * + * Returns a QplData pointer on success or NULL on error + * + * @num: the number of pages + * @size: the page size + * @errp: pointer to an error + */ +static QplData *multifd_qpl_init(uint32_t num, uint32_t size, Error **errp) +{ + uint32_t job_size = 0; + QplData *qpl; + + qpl = g_new0(QplData, 1); + qpl->page_num = num; + if (multifd_qpl_init_sw_job(qpl, errp) != 0) { + g_free(qpl); + return NULL; + } + qpl->hw_avail = check_hw_avail(&job_size); + if (qpl->hw_avail) { + multifd_qpl_init_hw_job(qpl, job_size, errp); + } + qpl->zbuf = g_malloc0(size * num); + qpl->zlen = g_new0(uint32_t, num); + return qpl; +} + +/** + * multifd_qpl_deinit: clean up QplData structure + * + * Free jobs, buffers and the QplData structure + * + * @qpl: pointer to the QplData structure + */ +static void multifd_qpl_deinit(QplData *qpl) +{ + if (qpl) { + multifd_qpl_free_sw_job(qpl); + multifd_qpl_free_hw_job(qpl); + g_free(qpl->zbuf); + g_free(qpl->zlen); + g_free(qpl); + } +} + +/** + * multifd_qpl_send_setup: set up send side + * + * Set up the channel with QPL compression. + * + * Returns 0 on success or -1 on error + * + * @p: Params for the channel being used + * @errp: pointer to an error + */ +static int multifd_qpl_send_setup(MultiFDSendParams *p, Error **errp) +{ + QplData *qpl; + + qpl = multifd_qpl_init(p->page_count, p->page_size, errp); + if (!qpl) { + return -1; + } + p->compress_data = qpl; + + /* + * the page will be compressed independently and sent using an IOV. 
The + * additional two IOVs are used to store packet header and compressed data + * length + */ + p->iov = g_new0(struct iovec, p->page_count + 2); + return 0; +} + +/** + * multifd_qpl_send_cleanup: clean up send side + * + * Close the channel and free memory. + * + * @p: Params for the channel being used + * @errp: pointer to an error + */ +static void multifd_qpl_send_cleanup(MultiFDSendParams *p, Error **errp) +{ + multifd_qpl_deinit(p->compress_data); + p->compress_data = NULL; + g_free(p->iov); + p->iov = NULL; +} + +/** + * multifd_qpl_prepare_job: prepare the job + * + * Set the QPL job parameters and properties. + * + * @job: pointer to the qpl_job structure + * @is_compression: indicates compression and decompression + * @input: pointer to the input data buffer + * @input_len: the length of the input data + * @output: pointer to the output data buffer + * @output_len: the length of the output data + */ +static void multifd_qpl_prepare_job(qpl_job *job, bool is_compression, + uint8_t *input, uint32_t input_len, + uint8_t *output, uint32_t output_len) +{ + job->op = is_compression ? qpl_op_compress : qpl_op_decompress; + job->next_in_ptr = input; + job->next_out_ptr = output; + job->available_in = input_len; + job->available_out = output_len; + job->flags = QPL_FLAG_FIRST | QPL_FLAG_LAST | QPL_FLAG_OMIT_VERIFY; + /* only supports compression level 1 */ + job->level = 1; +} + +/** + * multifd_qpl_prepare_comp_job: prepare the compression job + * + * Set the compression job parameters and properties. + * + * @job: pointer to the qpl_job structure + * @input: pointer to the input data buffer + * @output: pointer to the output data buffer + * @size: the page size + */ +static void multifd_qpl_prepare_comp_job(qpl_job *job, uint8_t *input, + uint8_t *output, uint32_t size) +{ + /* + * Set output length to less than the page size to force the job to + * fail in case it compresses to a larger size. We'll send that page + * without compression and skip the decompression operation on the + * destination. + */ + multifd_qpl_prepare_job(job, true, input, size, output, size - 1); +} + +/** + * multifd_qpl_prepare_decomp_job: prepare the decompression job + * + * Set the decompression job parameters and properties. 
+ * + * @job: pointer to the qpl_job structure + * @input: pointer to the input data buffer + * @len: the length of the input data + * @output: pointer to the output data buffer + * @size: the page size + */ +static void multifd_qpl_prepare_decomp_job(qpl_job *job, uint8_t *input, + uint32_t len, uint8_t *output, + uint32_t size) +{ + multifd_qpl_prepare_job(job, false, input, len, output, size); +} + +/** + * multifd_qpl_fill_iov: fill in the IOV + * + * Fill in the QPL packet IOV + * + * @p: Params for the channel being used + * @data: pointer to the IOV data + * @len: The length of the IOV data + */ +static void multifd_qpl_fill_iov(MultiFDSendParams *p, uint8_t *data, + uint32_t len) +{ + p->iov[p->iovs_num].iov_base = data; + p->iov[p->iovs_num].iov_len = len; + p->iovs_num++; + p->next_packet_size += len; +} + +/** + * multifd_qpl_fill_packet: fill the compressed page into the QPL packet + * + * Fill the compressed page length and IOV into the QPL packet + * + * @idx: The index of the compressed length array + * @p: Params for the channel being used + * @data: pointer to the compressed page buffer + * @len: The length of the compressed page + */ +static void multifd_qpl_fill_packet(uint32_t idx, MultiFDSendParams *p, + uint8_t *data, uint32_t len) +{ + QplData *qpl = p->compress_data; + + qpl->zlen[idx] = cpu_to_be32(len); + multifd_qpl_fill_iov(p, data, len); +} + +/** + * multifd_qpl_submit_job: submit a job to the hardware + * + * Submit a QPL hardware job to the IAA device + * + * Returns true if the job is submitted successfully, otherwise false. + * + * @job: pointer to the qpl_job structure + */ +static bool multifd_qpl_submit_job(qpl_job *job) +{ + qpl_status status; + uint32_t num = 0; + +retry: + status = qpl_submit_job(job); + if (status == QPL_STS_QUEUES_ARE_BUSY_ERR) { + if (num < MAX_SUBMIT_RETRY_NUM) { + num++; + goto retry; + } + } + return (status == QPL_STS_OK); +} + +/** + * multifd_qpl_compress_pages_slow_path: compress pages using slow path + * + * Compress the pages using software. If compression fails, the uncompressed + * page will be sent. + * + * @p: Params for the channel being used + */ +static void multifd_qpl_compress_pages_slow_path(MultiFDSendParams *p) +{ + QplData *qpl = p->compress_data; + uint32_t size = p->page_size; + qpl_job *job = qpl->sw_job; + uint8_t *zbuf = qpl->zbuf; + uint8_t *buf; + + for (int i = 0; i < p->pages->normal_num; i++) { + buf = p->pages->block->host + p->pages->offset[i]; + multifd_qpl_prepare_comp_job(job, buf, zbuf, size); + if (qpl_execute_job(job) == QPL_STS_OK) { + multifd_qpl_fill_packet(i, p, zbuf, job->total_out); + } else { + /* send the uncompressed page */ + multifd_qpl_fill_packet(i, p, buf, size); + } + zbuf += size; + } +} + +/** + * multifd_qpl_compress_pages: compress pages + * + * Submit the pages to the IAA hardware for compression. If hardware + * compression fails, it falls back to software compression. If software + * compression also fails, the uncompressed page is sent. 
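+ *
+ * For reference, the packet payload assembled via multifd_qpl_send_prepare()
+ * below is a big-endian length table followed by the per-page data:
+ *
+ *   | be32 zlen[normal_num] | page 0 data | page 1 data | ... |
+ *
+ * where zlen[i] == page_size marks a page stored uncompressed.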
+ * + * @p: Params for the channel being used + */ +static void multifd_qpl_compress_pages(MultiFDSendParams *p) +{ + QplData *qpl = p->compress_data; + MultiFDPages_t *pages = p->pages; + uint32_t size = p->page_size; + QplHwJob *hw_job; + uint8_t *buf; + uint8_t *zbuf; + + for (int i = 0; i < pages->normal_num; i++) { + buf = pages->block->host + pages->offset[i]; + zbuf = qpl->zbuf + (size * i); + hw_job = &qpl->hw_jobs[i]; + multifd_qpl_prepare_comp_job(hw_job->job, buf, zbuf, size); + if (multifd_qpl_submit_job(hw_job->job)) { + hw_job->fallback_sw_path = false; + } else { + /* + * The IAA work queue is full, any immediate subsequent job + * submission is likely to fail, sending the page via the QPL + * software path at this point gives us a better chance of + * finding the queue open for the next pages. + */ + hw_job->fallback_sw_path = true; + multifd_qpl_prepare_comp_job(qpl->sw_job, buf, zbuf, size); + if (qpl_execute_job(qpl->sw_job) == QPL_STS_OK) { + hw_job->sw_output = zbuf; + hw_job->sw_output_len = qpl->sw_job->total_out; + } else { + hw_job->sw_output = buf; + hw_job->sw_output_len = size; + } + } + } + + for (int i = 0; i < pages->normal_num; i++) { + buf = pages->block->host + pages->offset[i]; + zbuf = qpl->zbuf + (size * i); + hw_job = &qpl->hw_jobs[i]; + if (hw_job->fallback_sw_path) { + multifd_qpl_fill_packet(i, p, hw_job->sw_output, + hw_job->sw_output_len); + continue; + } + if (qpl_wait_job(hw_job->job) == QPL_STS_OK) { + multifd_qpl_fill_packet(i, p, zbuf, hw_job->job->total_out); + } else { + /* send the uncompressed page */ + multifd_qpl_fill_packet(i, p, buf, size); + } + } +} + +/** + * multifd_qpl_send_prepare: prepare data to be able to send + * + * Create a compressed buffer with all the pages that we are going to + * send. + * + * Returns 0 on success or -1 on error + * + * @p: Params for the channel being used + * @errp: pointer to an error + */ +static int multifd_qpl_send_prepare(MultiFDSendParams *p, Error **errp) +{ + QplData *qpl = p->compress_data; + uint32_t len = 0; + + if (!multifd_send_prepare_common(p)) { + goto out; + } + + /* The first IOV is used to store the compressed page lengths */ + len = p->pages->normal_num * sizeof(uint32_t); + multifd_qpl_fill_iov(p, (uint8_t *) qpl->zlen, len); + if (qpl->hw_avail) { + multifd_qpl_compress_pages(p); + } else { + multifd_qpl_compress_pages_slow_path(p); + } + +out: + p->flags |= MULTIFD_FLAG_QPL; + multifd_send_fill_packet(p); + return 0; +} + +/** + * multifd_qpl_recv_setup: set up receive side + * + * Create the compressed channel and buffer. + * + * Returns 0 on success or -1 on error + * + * @p: Params for the channel being used + * @errp: pointer to an error + */ +static int multifd_qpl_recv_setup(MultiFDRecvParams *p, Error **errp) +{ + QplData *qpl; + + qpl = multifd_qpl_init(p->page_count, p->page_size, errp); + if (!qpl) { + return -1; + } + p->compress_data = qpl; + return 0; +} + +/** + * multifd_qpl_recv_cleanup: set up receive side + * + * Close the channel and free memory. + * + * @p: Params for the channel being used + */ +static void multifd_qpl_recv_cleanup(MultiFDRecvParams *p) +{ + multifd_qpl_deinit(p->compress_data); + p->compress_data = NULL; +} + +/** + * multifd_qpl_process_and_check_job: process and check a QPL job + * + * Process the job and check whether the job output length is the + * same as the specified length + * + * Returns true if the job execution succeeded and the output length + * is equal to the specified length, otherwise false. 
+ * + * @job: pointer to the qpl_job structure + * @is_hardware: indicates whether the job is a hardware job + * @len: Specified output length + * @errp: pointer to an error + */ +static bool multifd_qpl_process_and_check_job(qpl_job *job, bool is_hardware, + uint32_t len, Error **errp) +{ + qpl_status status; + + status = (is_hardware ? qpl_wait_job(job) : qpl_execute_job(job)); + if (status != QPL_STS_OK) { + error_setg(errp, "qpl job failed with error %d", status); + return false; + } + if (job->total_out != len) { + error_setg(errp, "qpl decompressed len %u, expected len %u", + job->total_out, len); + return false; + } + return true; +} + +/** + * multifd_qpl_decompress_pages_slow_path: decompress pages using slow path + * + * Decompress the pages using software + * + * Returns 0 on success or -1 on error + * + * @p: Params for the channel being used + * @errp: pointer to an error + */ +static int multifd_qpl_decompress_pages_slow_path(MultiFDRecvParams *p, + Error **errp) +{ + QplData *qpl = p->compress_data; + uint32_t size = p->page_size; + qpl_job *job = qpl->sw_job; + uint8_t *zbuf = qpl->zbuf; + uint8_t *addr; + uint32_t len; + + for (int i = 0; i < p->normal_num; i++) { + len = qpl->zlen[i]; + addr = p->host + p->normal[i]; + /* the page is uncompressed, load it */ + if (len == size) { + memcpy(addr, zbuf, size); + zbuf += size; + continue; + } + multifd_qpl_prepare_decomp_job(job, zbuf, len, addr, size); + if (!multifd_qpl_process_and_check_job(job, false, size, errp)) { + return -1; + } + zbuf += len; + } + return 0; +} + +/** + * multifd_qpl_decompress_pages: decompress pages + * + * Decompress the pages using the IAA hardware. If hardware + * decompression fails, it falls back to software decompression. + * + * Returns 0 on success or -1 on error + * + * @p: Params for the channel being used + * @errp: pointer to an error + */ +static int multifd_qpl_decompress_pages(MultiFDRecvParams *p, Error **errp) +{ + QplData *qpl = p->compress_data; + uint32_t size = p->page_size; + uint8_t *zbuf = qpl->zbuf; + uint8_t *addr; + uint32_t len; + qpl_job *job; + + for (int i = 0; i < p->normal_num; i++) { + addr = p->host + p->normal[i]; + len = qpl->zlen[i]; + /* the page is uncompressed if received length equals the page size */ + if (len == size) { + memcpy(addr, zbuf, size); + zbuf += size; + continue; + } + + job = qpl->hw_jobs[i].job; + multifd_qpl_prepare_decomp_job(job, zbuf, len, addr, size); + if (multifd_qpl_submit_job(job)) { + qpl->hw_jobs[i].fallback_sw_path = false; + } else { + /* + * The IAA work queue is full, any immediate subsequent job + * submission is likely to fail, sending the page via the QPL + * software path at this point gives us a better chance of + * finding the queue open for the next pages. + */ + qpl->hw_jobs[i].fallback_sw_path = true; + job = qpl->sw_job; + multifd_qpl_prepare_decomp_job(job, zbuf, len, addr, size); + if (!multifd_qpl_process_and_check_job(job, false, size, errp)) { + return -1; + } + } + zbuf += len; + } + + for (int i = 0; i < p->normal_num; i++) { + /* ignore pages that have already been processed */ + if (qpl->zlen[i] == size || qpl->hw_jobs[i].fallback_sw_path) { + continue; + } + + job = qpl->hw_jobs[i].job; + if (!multifd_qpl_process_and_check_job(job, true, size, errp)) { + return -1; + } + } + return 0; +} +/** + * multifd_qpl_recv: read the data from the channel into actual pages + * + * Read the compressed buffer, and uncompress it into the actual + * pages. 
+ * + * Returns 0 on success or -1 on error + * + * @p: Params for the channel being used + * @errp: pointer to an error + */ +static int multifd_qpl_recv(MultiFDRecvParams *p, Error **errp) +{ + QplData *qpl = p->compress_data; + uint32_t in_size = p->next_packet_size; + uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; + uint32_t len = 0; + uint32_t zbuf_len = 0; + int ret; + + if (flags != MULTIFD_FLAG_QPL) { + error_setg(errp, "multifd %u: flags received %x flags expected %x", + p->id, flags, MULTIFD_FLAG_QPL); + return -1; + } + multifd_recv_zero_page_process(p); + if (!p->normal_num) { + assert(in_size == 0); + return 0; + } + + /* read compressed page lengths */ + len = p->normal_num * sizeof(uint32_t); + assert(len < in_size); + ret = qio_channel_read_all(p->c, (void *) qpl->zlen, len, errp); + if (ret != 0) { + return ret; + } + for (int i = 0; i < p->normal_num; i++) { + qpl->zlen[i] = be32_to_cpu(qpl->zlen[i]); + assert(qpl->zlen[i] <= p->page_size); + zbuf_len += qpl->zlen[i]; + ramblock_recv_bitmap_set_offset(p->block, p->normal[i]); + } + + /* read compressed pages */ + assert(in_size == len + zbuf_len); + ret = qio_channel_read_all(p->c, (void *) qpl->zbuf, zbuf_len, errp); + if (ret != 0) { + return ret; + } + + if (qpl->hw_avail) { + return multifd_qpl_decompress_pages(p, errp); + } + return multifd_qpl_decompress_pages_slow_path(p, errp); +} + +static MultiFDMethods multifd_qpl_ops = { + .send_setup = multifd_qpl_send_setup, + .send_cleanup = multifd_qpl_send_cleanup, + .send_prepare = multifd_qpl_send_prepare, + .recv_setup = multifd_qpl_recv_setup, + .recv_cleanup = multifd_qpl_recv_cleanup, + .recv = multifd_qpl_recv, +}; + +static void multifd_qpl_register(void) +{ + multifd_register_ops(MULTIFD_COMPRESSION_QPL, &multifd_qpl_ops); +} + +migration_init(multifd_qpl_register); diff --git a/migration/multifd-uadk.c b/migration/multifd-uadk.c new file mode 100644 index 0000000000000000000000000000000000000000..9a582fc91986981a612e77f1da03af3cacd6e7e6 --- /dev/null +++ b/migration/multifd-uadk.c @@ -0,0 +1,371 @@ +/* + * Multifd UADK compression accelerator implementation + * + * Copyright (c) 2024 Huawei Technologies R & D (UK) Ltd + * + * Authors: + * Shameer Kolothum + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/module.h"
+#include "qapi/error.h"
+#include "exec/ramblock.h"
+#include "migration.h"
+#include "multifd.h"
+#include "options.h"
+#include "qemu/error-report.h"
+#include "uadk/wd_comp.h"
+#include "uadk/wd_sched.h"
+
+struct wd_data {
+    handle_t handle;
+    uint8_t *buf;
+    uint32_t *buf_hdr;
+};
+
+static bool uadk_hw_init(void)
+{
+    char alg[] = "zlib";
+    int ret;
+
+    ret = wd_comp_init2(alg, SCHED_POLICY_RR, TASK_HW);
+    if (ret && ret != -WD_EEXIST) {
+        return false;
+    } else {
+        return true;
+    }
+}
+
+static struct wd_data *multifd_uadk_init_sess(uint32_t count,
+                                              uint32_t page_size,
+                                              bool compress, Error **errp)
+{
+    struct wd_comp_sess_setup ss = {0};
+    struct sched_params param = {0};
+    uint32_t size = count * page_size;
+    struct wd_data *wd;
+
+    wd = g_new0(struct wd_data, 1);
+
+    if (uadk_hw_init()) {
+        ss.alg_type = WD_ZLIB;
+        if (compress) {
+            ss.op_type = WD_DIR_COMPRESS;
+            /* Add an additional page for handling output > input */
+            size += page_size;
+        } else {
+            ss.op_type = WD_DIR_DECOMPRESS;
+        }
+        /* We use default level 1 compression and 4K window size */
+        param.type = ss.op_type;
+        ss.sched_param = &param;
+
+        wd->handle = wd_comp_alloc_sess(&ss);
+        if (!wd->handle) {
+            error_setg(errp, "multifd: failed wd_comp_alloc_sess");
+            goto out;
+        }
+    } else {
+        /* For CI test use */
+        warn_report_once("UADK hardware not available. Switching to no compression mode");
+    }
+
+    wd->buf = g_try_malloc(size);
+    if (!wd->buf) {
+        error_setg(errp, "multifd: out of mem for uadk buf");
+        goto out_free_sess;
+    }
+    wd->buf_hdr = g_new0(uint32_t, count);
+    return wd;
+
+out_free_sess:
+    if (wd->handle) {
+        wd_comp_free_sess(wd->handle);
+    }
+out:
+    wd_comp_uninit2();
+    g_free(wd);
+    return NULL;
+}
+
+static void multifd_uadk_uninit_sess(struct wd_data *wd)
+{
+    if (wd->handle) {
+        wd_comp_free_sess(wd->handle);
+    }
+    wd_comp_uninit2();
+    g_free(wd->buf);
+    g_free(wd->buf_hdr);
+    g_free(wd);
+}
+
+/**
+ * multifd_uadk_send_setup: setup send side
+ *
+ * Returns 0 for success or -1 for error
+ *
+ * @p: Params for the channel that we are using
+ * @errp: pointer to an error
+ */
+static int multifd_uadk_send_setup(MultiFDSendParams *p, Error **errp)
+{
+    struct wd_data *wd;
+
+    wd = multifd_uadk_init_sess(p->page_count, p->page_size, true, errp);
+    if (!wd) {
+        return -1;
+    }
+
+    p->compress_data = wd;
+    assert(p->iov == NULL);
+    /*
+     * Each page will be compressed independently and sent using an IOV. The
+     * additional two IOVs are used to store packet header and compressed data
+     * length
+     */
+
+    p->iov = g_new0(struct iovec, p->page_count + 2);
+    return 0;
+}
+
+/**
+ * multifd_uadk_send_cleanup: cleanup send side
+ *
+ * Close the channel and return memory.
+ *
+ * @p: Params for the channel that we are using
+ * @errp: pointer to an error
+ */
+static void multifd_uadk_send_cleanup(MultiFDSendParams *p, Error **errp)
+{
+    struct wd_data *wd = p->compress_data;
+
+    multifd_uadk_uninit_sess(wd);
+    p->compress_data = NULL;
+    g_free(p->iov);
+    p->iov = NULL;
+}
+
+static inline void prepare_next_iov(MultiFDSendParams *p, void *base,
+                                    uint32_t len)
+{
+    p->iov[p->iovs_num].iov_base = (uint8_t *)base;
+    p->iov[p->iovs_num].iov_len = len;
+    p->next_packet_size += len;
+    p->iovs_num++;
+}
+
+/**
+ * multifd_uadk_send_prepare: prepare data to be able to send
+ *
+ * Create a compressed buffer with all the pages that we are going to
+ * send.
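The IOV accounting described above maps onto plain POSIX scatter-gather I/O. A sketch (standard writev(), not the QIOChannel API the patch actually uses) of sending one header slot plus a run of pages in a single call:

#include <sys/uio.h>
#include <stdint.h>
#include <stddef.h>

enum { MAX_PAGES = 128 };

/* Gather a header and up to MAX_PAGES page buffers into one writev(). */
static ssize_t send_batch(int fd, void *hdr, size_t hdr_len,
                          uint8_t **pages, size_t n, size_t page_size)
{
    struct iovec iov[1 + MAX_PAGES];
    int iovs = 0;

    iov[iovs].iov_base = hdr;          /* slot 0: the packet header */
    iov[iovs].iov_len = hdr_len;
    iovs++;
    for (size_t i = 0; i < n && i < MAX_PAGES; i++) {
        iov[iovs].iov_base = pages[i]; /* one slot per page, no copying */
        iov[iovs].iov_len = page_size;
        iovs++;
    }
    /* A short write is possible; a real caller must loop until done. */
    return writev(fd, iov, iovs);
}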
+ * + * Returns 0 for success or -1 for error + * + * @p: Params for the channel that we are using + * @errp: pointer to an error + */ +static int multifd_uadk_send_prepare(MultiFDSendParams *p, Error **errp) +{ + struct wd_data *uadk_data = p->compress_data; + uint32_t hdr_size; + uint8_t *buf = uadk_data->buf; + int ret = 0; + + if (!multifd_send_prepare_common(p)) { + goto out; + } + + hdr_size = p->pages->normal_num * sizeof(uint32_t); + /* prepare the header that stores the lengths of all compressed data */ + prepare_next_iov(p, uadk_data->buf_hdr, hdr_size); + + for (int i = 0; i < p->pages->normal_num; i++) { + struct wd_comp_req creq = { + .op_type = WD_DIR_COMPRESS, + .src = p->pages->block->host + p->pages->offset[i], + .src_len = p->page_size, + .dst = buf, + /* Set dst_len to double the src in case compressed out >= page_size */ + .dst_len = p->page_size * 2, + }; + + if (uadk_data->handle) { + ret = wd_do_comp_sync(uadk_data->handle, &creq); + if (ret || creq.status) { + error_setg(errp, "multifd %u: failed compression, ret %d status %d", + p->id, ret, creq.status); + return -1; + } + if (creq.dst_len < p->page_size) { + uadk_data->buf_hdr[i] = cpu_to_be32(creq.dst_len); + prepare_next_iov(p, buf, creq.dst_len); + buf += creq.dst_len; + } + } + /* + * Send raw data if no UADK hardware or if compressed out >= page_size. + * We might be better off sending raw data if output is slightly less + * than page_size as well because at the receive end we can skip the + * decompression. But it is tricky to find the right number here. + */ + if (!uadk_data->handle || creq.dst_len >= p->page_size) { + uadk_data->buf_hdr[i] = cpu_to_be32(p->page_size); + prepare_next_iov(p, p->pages->block->host + p->pages->offset[i], + p->page_size); + buf += p->page_size; + } + } +out: + p->flags |= MULTIFD_FLAG_UADK; + multifd_send_fill_packet(p); + return 0; +} + +/** + * multifd_uadk_recv_setup: setup receive side + * + * Create the compressed channel and buffer. + * + * Returns 0 for success or -1 for error + * + * @p: Params for the channel that we are using + * @errp: pointer to an error + */ +static int multifd_uadk_recv_setup(MultiFDRecvParams *p, Error **errp) +{ + struct wd_data *wd; + + wd = multifd_uadk_init_sess(p->page_count, p->page_size, false, errp); + if (!wd) { + return -1; + } + p->compress_data = wd; + return 0; +} + +/** + * multifd_uadk_recv_cleanup: cleanup receive side + * + * Close the channel and return memory. + * + * @p: Params for the channel that we are using + */ +static void multifd_uadk_recv_cleanup(MultiFDRecvParams *p) +{ + struct wd_data *wd = p->compress_data; + + multifd_uadk_uninit_sess(wd); + p->compress_data = NULL; +} + +/** + * multifd_uadk_recv: read the data from the channel into actual pages + * + * Read the compressed buffer, and uncompress it into the actual + * pages. 
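Since the receive side trusts nothing in the stream, the length header has to be validated before any buffer is sized from it, which is what the asserts in the function below enforce. A standalone sketch of the same check (plain ntohl() standing in for be32_to_cpu(); check_frame() is a hypothetical helper):

#include <stdint.h>
#include <stdbool.h>
#include <arpa/inet.h>   /* ntohl */

/* Validate a frame of n big-endian 32-bit lengths plus payload_len
 * bytes of payload: every length must fit one page, and the lengths
 * must account for every payload byte. */
static bool check_frame(const uint32_t *be_lens, uint32_t n,
                        uint32_t page_size, uint32_t payload_len)
{
    uint64_t total = 0;   /* 64-bit so the sum cannot wrap */

    for (uint32_t i = 0; i < n; i++) {
        uint32_t len = ntohl(be_lens[i]);
        if (len == 0 || len > page_size) {
            return false;             /* corrupt or hostile length field */
        }
        total += len;
    }
    return total == payload_len;
}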
+ *
+ * Returns 0 for success or -1 for error
+ *
+ * @p: Params for the channel that we are using
+ * @errp: pointer to an error
+ */
+static int multifd_uadk_recv(MultiFDRecvParams *p, Error **errp)
+{
+    struct wd_data *uadk_data = p->compress_data;
+    uint32_t in_size = p->next_packet_size;
+    uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK;
+    uint32_t hdr_len = p->normal_num * sizeof(uint32_t);
+    uint32_t data_len = 0;
+    uint8_t *buf = uadk_data->buf;
+    int ret = 0;
+
+    if (flags != MULTIFD_FLAG_UADK) {
+        error_setg(errp, "multifd %u: flags received %x flags expected %x",
+                   p->id, flags, MULTIFD_FLAG_UADK);
+        return -1;
+    }
+
+    multifd_recv_zero_page_process(p);
+    if (!p->normal_num) {
+        assert(in_size == 0);
+        return 0;
+    }
+
+    /* read compressed data lengths */
+    assert(hdr_len < in_size);
+    ret = qio_channel_read_all(p->c, (void *) uadk_data->buf_hdr,
+                               hdr_len, errp);
+    if (ret != 0) {
+        return ret;
+    }
+
+    for (int i = 0; i < p->normal_num; i++) {
+        uadk_data->buf_hdr[i] = be32_to_cpu(uadk_data->buf_hdr[i]);
+        data_len += uadk_data->buf_hdr[i];
+        assert(uadk_data->buf_hdr[i] <= p->page_size);
+    }
+
+    /* read compressed data */
+    assert(in_size == hdr_len + data_len);
+    ret = qio_channel_read_all(p->c, (void *)buf, data_len, errp);
+    if (ret != 0) {
+        return ret;
+    }
+
+    for (int i = 0; i < p->normal_num; i++) {
+        struct wd_comp_req creq = {
+            .op_type = WD_DIR_DECOMPRESS,
+            .src = buf,
+            .src_len = uadk_data->buf_hdr[i],
+            .dst = p->host + p->normal[i],
+            .dst_len = p->page_size,
+        };
+
+        if (uadk_data->buf_hdr[i] == p->page_size) {
+            memcpy(p->host + p->normal[i], buf, p->page_size);
+            buf += p->page_size;
+            continue;
+        }
+
+        if (unlikely(!uadk_data->handle)) {
+            error_setg(errp, "multifd %u: UADK HW not available for decompression",
+                       p->id);
+            return -1;
+        }
+
+        ret = wd_do_comp_sync(uadk_data->handle, &creq);
+        if (ret || creq.status) {
+            error_setg(errp, "multifd %u: failed decompression, ret %d status %d",
+                       p->id, ret, creq.status);
+            return -1;
+        }
+        if (creq.dst_len != p->page_size) {
+            error_setg(errp, "multifd %u: decompressed length error", p->id);
+            return -1;
+        }
+        buf += uadk_data->buf_hdr[i];
+    }
+
+    return 0;
+}
+
+static MultiFDMethods multifd_uadk_ops = {
+    .send_setup = multifd_uadk_send_setup,
+    .send_cleanup = multifd_uadk_send_cleanup,
+    .send_prepare = multifd_uadk_send_prepare,
+    .recv_setup = multifd_uadk_recv_setup,
+    .recv_cleanup = multifd_uadk_recv_cleanup,
+    .recv = multifd_uadk_recv,
+};
+
+static void multifd_uadk_register(void)
+{
+    multifd_register_ops(MULTIFD_COMPRESSION_UADK, &multifd_uadk_ops);
+}
+
+migration_init(multifd_uadk_register);
diff --git a/migration/multifd-zero-page.c b/migration/multifd-zero-page.c
new file mode 100644
index 0000000000000000000000000000000000000000..e1b8370f88e1ef91d98eb2375b242327c7c5bdc4
--- /dev/null
+++ b/migration/multifd-zero-page.c
@@ -0,0 +1,89 @@
+/*
+ * Multifd zero page detection implementation.
+ *
+ * Copyright (c) 2024 Bytedance Inc
+ *
+ * Authors:
+ *  Hao Xiang
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */ + +#include "qemu/osdep.h" +#include "qemu/cutils.h" +#include "exec/ramblock.h" +#include "migration.h" +#include "multifd.h" +#include "options.h" +#include "ram.h" + +static bool multifd_zero_page_enabled(void) +{ + return migrate_zero_page_detection() == ZERO_PAGE_DETECTION_MULTIFD; +} + +static void swap_page_offset(ram_addr_t *pages_offset, int a, int b) +{ + ram_addr_t temp; + + if (a == b) { + return; + } + + temp = pages_offset[a]; + pages_offset[a] = pages_offset[b]; + pages_offset[b] = temp; +} + +/** + * multifd_send_zero_page_detect: Perform zero page detection on all pages. + * + * Sorts normal pages before zero pages in p->pages->offset and updates + * p->pages->normal_num. + * + * @param p A pointer to the send params. + */ +void multifd_send_zero_page_detect(MultiFDSendParams *p) +{ + MultiFDPages_t *pages = p->pages; + RAMBlock *rb = pages->block; + int i = 0; + int j = pages->num - 1; + + if (!multifd_zero_page_enabled()) { + pages->normal_num = pages->num; + return; + } + + /* + * Sort the page offset array by moving all normal pages to + * the left and all zero pages to the right of the array. + */ + while (i <= j) { + uint64_t offset = pages->offset[i]; + + if (!buffer_is_zero(rb->host + offset, p->page_size)) { + i++; + continue; + } + + swap_page_offset(pages->offset, i, j); + ram_release_page(rb->idstr, offset); + j--; + } + + pages->normal_num = i; +} + +void multifd_recv_zero_page_process(MultiFDRecvParams *p) +{ + for (int i = 0; i < p->zero_num; i++) { + void *page = p->host + p->zero[i]; + if (ramblock_recv_bitmap_test_byte_offset(p->block, p->zero[i])) { + memset(page, 0, p->page_size); + } else { + ramblock_recv_bitmap_set_offset(p->block, p->zero[i]); + } + } +} diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c index 37ce48621e7866da0995a44be6977daa3fde591a..2df498378023db1f6727ba4eb562e529f4c849e5 100644 --- a/migration/multifd-zlib.c +++ b/migration/multifd-zlib.c @@ -69,7 +69,11 @@ static int zlib_send_setup(MultiFDSendParams *p, Error **errp) err_msg = "out of memory for buf"; goto err_free_zbuff; } - p->data = z; + p->compress_data = z; + + /* Needs 2 IOVs, one for packet header and one for compressed data */ + p->iov = g_new0(struct iovec, 2); + return 0; err_free_zbuff: @@ -92,15 +96,18 @@ err_free_z: */ static void zlib_send_cleanup(MultiFDSendParams *p, Error **errp) { - struct zlib_data *z = p->data; + struct zlib_data *z = p->compress_data; deflateEnd(&z->zs); g_free(z->zbuff); z->zbuff = NULL; g_free(z->buf); z->buf = NULL; - g_free(p->data); - p->data = NULL; + g_free(p->compress_data); + p->compress_data = NULL; + + g_free(p->iov); + p->iov = NULL; } /** @@ -116,17 +123,22 @@ static void zlib_send_cleanup(MultiFDSendParams *p, Error **errp) */ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) { - struct zlib_data *z = p->data; + MultiFDPages_t *pages = p->pages; + struct zlib_data *z = p->compress_data; z_stream *zs = &z->zs; uint32_t out_size = 0; int ret; uint32_t i; - for (i = 0; i < p->normal_num; i++) { + if (!multifd_send_prepare_common(p)) { + goto out; + } + + for (i = 0; i < pages->normal_num; i++) { uint32_t available = z->zbuff_len - out_size; int flush = Z_NO_FLUSH; - if (i == p->normal_num - 1) { + if (i == pages->normal_num - 1) { flush = Z_SYNC_FLUSH; } @@ -135,7 +147,7 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) * with compression. zlib does not guarantee that this is safe, * therefore copy the page before calling deflate(). 
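The detection loop in multifd_send_zero_page_detect() above is a classic two-pointer partition. A self-contained sketch of the same idea, with a memcmp-based page_is_zero() standing in for QEMU's buffer_is_zero():

#include <stdbool.h>
#include <stddef.h>
#include <string.h>

/* Stand-in for buffer_is_zero(): true if the page is all zero bytes. */
static bool page_is_zero(const unsigned char *page, size_t size)
{
    return page[0] == 0 && memcmp(page, page + 1, size - 1) == 0;
}

/* Partition offsets[] so non-zero pages end up on the left. Returns the
 * number of non-zero ("normal") pages. Relative order is not preserved,
 * which is fine because each page carries its own offset. */
static size_t partition_zero_pages(const unsigned char *base, size_t page_size,
                                   size_t *offsets, size_t num)
{
    size_t i = 0, j = num;          /* j is one past the last candidate */

    while (i < j) {
        if (!page_is_zero(base + offsets[i], page_size)) {
            i++;                    /* normal page: already on the left */
            continue;
        }
        size_t tmp = offsets[i];    /* zero page: swap it to the tail */
        offsets[i] = offsets[--j];
        offsets[j] = tmp;
    }
    return i;
}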
*/ - memcpy(z->buf, p->pages->block->host + p->normal[i], p->page_size); + memcpy(z->buf, p->pages->block->host + pages->offset[i], p->page_size); zs->avail_in = p->page_size; zs->next_in = z->buf; @@ -169,8 +181,10 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) p->iov[p->iovs_num].iov_len = out_size; p->iovs_num++; p->next_packet_size = out_size; - p->flags |= MULTIFD_FLAG_ZLIB; +out: + p->flags |= MULTIFD_FLAG_ZLIB; + multifd_send_fill_packet(p); return 0; } @@ -189,7 +203,7 @@ static int zlib_recv_setup(MultiFDRecvParams *p, Error **errp) struct zlib_data *z = g_new0(struct zlib_data, 1); z_stream *zs = &z->zs; - p->data = z; + p->compress_data = z; zs->zalloc = Z_NULL; zs->zfree = Z_NULL; zs->opaque = Z_NULL; @@ -219,17 +233,17 @@ static int zlib_recv_setup(MultiFDRecvParams *p, Error **errp) */ static void zlib_recv_cleanup(MultiFDRecvParams *p) { - struct zlib_data *z = p->data; + struct zlib_data *z = p->compress_data; inflateEnd(&z->zs); g_free(z->zbuff); z->zbuff = NULL; - g_free(p->data); - p->data = NULL; + g_free(p->compress_data); + p->compress_data = NULL; } /** - * zlib_recv_pages: read the data from the channel into actual pages + * zlib_recv: read the data from the channel into actual pages * * Read the compressed buffer, and uncompress it into the actual * pages. @@ -239,9 +253,9 @@ static void zlib_recv_cleanup(MultiFDRecvParams *p) * @p: Params for the channel that we are using * @errp: pointer to an error */ -static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp) +static int zlib_recv(MultiFDRecvParams *p, Error **errp) { - struct zlib_data *z = p->data; + struct zlib_data *z = p->compress_data; z_stream *zs = &z->zs; uint32_t in_size = p->next_packet_size; /* we measure the change of total_out */ @@ -256,6 +270,14 @@ static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp) p->id, flags, MULTIFD_FLAG_ZLIB); return -1; } + + multifd_recv_zero_page_process(p); + + if (!p->normal_num) { + assert(in_size == 0); + return 0; + } + ret = qio_channel_read_all(p->c, (void *)z->zbuff, in_size, errp); if (ret != 0) { @@ -269,6 +291,7 @@ static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp) int flush = Z_NO_FLUSH; unsigned long start = zs->total_out; + ramblock_recv_bitmap_set_offset(p->block, p->normal[i]); if (i == p->normal_num - 1) { flush = Z_SYNC_FLUSH; } @@ -305,6 +328,7 @@ static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp) p->id, out_size, expected_size); return -1; } + return 0; } @@ -314,7 +338,7 @@ static MultiFDMethods multifd_zlib_ops = { .send_prepare = zlib_send_prepare, .recv_setup = zlib_recv_setup, .recv_cleanup = zlib_recv_cleanup, - .recv_pages = zlib_recv_pages + .recv = zlib_recv }; static void multifd_zlib_register(void) diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c index b471daadcd083a04fb495b3d1e06436d119d4c24..46ee68b6cef1cf85f23b946bbd50d9eebdb82dde 100644 --- a/migration/multifd-zstd.c +++ b/migration/multifd-zstd.c @@ -52,7 +52,6 @@ static int zstd_send_setup(MultiFDSendParams *p, Error **errp) struct zstd_data *z = g_new0(struct zstd_data, 1); int res; - p->data = z; z->zcs = ZSTD_createCStream(); if (!z->zcs) { g_free(z); @@ -77,6 +76,10 @@ static int zstd_send_setup(MultiFDSendParams *p, Error **errp) error_setg(errp, "multifd %u: out of memory for zbuff", p->id); return -1; } + p->compress_data = z; + + /* Needs 2 IOVs, one for packet header and one for compressed data */ + p->iov = g_new0(struct iovec, 2); return 0; } @@ -90,14 +93,17 @@ static int 
zstd_send_setup(MultiFDSendParams *p, Error **errp) */ static void zstd_send_cleanup(MultiFDSendParams *p, Error **errp) { - struct zstd_data *z = p->data; + struct zstd_data *z = p->compress_data; ZSTD_freeCStream(z->zcs); z->zcs = NULL; g_free(z->zbuff); z->zbuff = NULL; - g_free(p->data); - p->data = NULL; + g_free(p->compress_data); + p->compress_data = NULL; + + g_free(p->iov); + p->iov = NULL; } /** @@ -113,21 +119,26 @@ static void zstd_send_cleanup(MultiFDSendParams *p, Error **errp) */ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) { - struct zstd_data *z = p->data; + MultiFDPages_t *pages = p->pages; + struct zstd_data *z = p->compress_data; int ret; uint32_t i; + if (!multifd_send_prepare_common(p)) { + goto out; + } + z->out.dst = z->zbuff; z->out.size = z->zbuff_len; z->out.pos = 0; - for (i = 0; i < p->normal_num; i++) { + for (i = 0; i < pages->normal_num; i++) { ZSTD_EndDirective flush = ZSTD_e_continue; - if (i == p->normal_num - 1) { + if (i == pages->normal_num - 1) { flush = ZSTD_e_flush; } - z->in.src = p->pages->block->host + p->normal[i]; + z->in.src = p->pages->block->host + pages->offset[i]; z->in.size = p->page_size; z->in.pos = 0; @@ -141,9 +152,9 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) */ do { ret = ZSTD_compressStream2(z->zcs, &z->out, &z->in, flush); - } while (ret > 0 && (z->in.size - z->in.pos > 0) - && (z->out.size - z->out.pos > 0)); - if (ret > 0 && (z->in.size - z->in.pos > 0)) { + } while (ret > 0 && (z->in.size > z->in.pos) + && (z->out.size > z->out.pos)); + if (ret > 0 && (z->in.size > z->in.pos)) { error_setg(errp, "multifd %u: compressStream buffer too small", p->id); return -1; @@ -158,8 +169,10 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) p->iov[p->iovs_num].iov_len = z->out.pos; p->iovs_num++; p->next_packet_size = z->out.pos; - p->flags |= MULTIFD_FLAG_ZSTD; +out: + p->flags |= MULTIFD_FLAG_ZSTD; + multifd_send_fill_packet(p); return 0; } @@ -178,7 +191,7 @@ static int zstd_recv_setup(MultiFDRecvParams *p, Error **errp) struct zstd_data *z = g_new0(struct zstd_data, 1); int ret; - p->data = z; + p->compress_data = z; z->zds = ZSTD_createDStream(); if (!z->zds) { g_free(z); @@ -216,18 +229,18 @@ static int zstd_recv_setup(MultiFDRecvParams *p, Error **errp) */ static void zstd_recv_cleanup(MultiFDRecvParams *p) { - struct zstd_data *z = p->data; + struct zstd_data *z = p->compress_data; ZSTD_freeDStream(z->zds); z->zds = NULL; g_free(z->zbuff); z->zbuff = NULL; - g_free(p->data); - p->data = NULL; + g_free(p->compress_data); + p->compress_data = NULL; } /** - * zstd_recv_pages: read the data from the channel into actual pages + * zstd_recv: read the data from the channel into actual pages * * Read the compressed buffer, and uncompress it into the actual * pages. 
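The ZSTD_decompressStream() loop shape that the hunks below adjust can be exercised standalone. A minimal round-trip sketch against the public zstd streaming API (buffer sizes are illustrative):

#include <zstd.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    char src[4096] = "page payload", comp[8192], out[4096];
    size_t clen = ZSTD_compress(comp, sizeof(comp), src, sizeof(src), 1);
    ZSTD_DStream *zds = ZSTD_createDStream();
    ZSTD_inBuffer in = { comp, clen, 0 };
    ZSTD_outBuffer o = { out, sizeof(out), 0 };
    size_t ret;

    if (ZSTD_isError(clen) || !zds) {
        return 1;
    }
    /* Same loop shape as the patch: keep feeding while zstd asks for
     * more and both buffers still have input/room left. */
    do {
        ret = ZSTD_decompressStream(zds, &o, &in);
    } while (ret > 0 && !ZSTD_isError(ret)
             && in.size > in.pos && o.pos < sizeof(out));

    printf("decompressed %zu bytes, match=%d\n", o.pos,
           memcmp(src, out, sizeof(src)) == 0);
    ZSTD_freeDStream(zds);
    return ZSTD_isError(ret) ? 1 : 0;
}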
@@ -237,13 +250,13 @@ static void zstd_recv_cleanup(MultiFDRecvParams *p)
  * @p: Params for the channel that we are using
  * @errp: pointer to an error
  */
-static int zstd_recv_pages(MultiFDRecvParams *p, Error **errp)
+static int zstd_recv(MultiFDRecvParams *p, Error **errp)
 {
     uint32_t in_size = p->next_packet_size;
     uint32_t out_size = 0;
     uint32_t expected_size = p->normal_num * p->page_size;
     uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK;
-    struct zstd_data *z = p->data;
+    struct zstd_data *z = p->compress_data;
     int ret;
     int i;
 
@@ -252,6 +265,14 @@ static int zstd_recv_pages(MultiFDRecvParams *p, Error **errp)
                    p->id, flags, MULTIFD_FLAG_ZSTD);
         return -1;
     }
+
+    multifd_recv_zero_page_process(p);
+
+    if (!p->normal_num) {
+        assert(in_size == 0);
+        return 0;
+    }
+
     ret = qio_channel_read_all(p->c, (void *)z->zbuff, in_size, errp);
 
     if (ret != 0) {
@@ -263,6 +284,7 @@ static int zstd_recv_pages(MultiFDRecvParams *p, Error **errp)
     z->in.pos = 0;
 
     for (i = 0; i < p->normal_num; i++) {
+        ramblock_recv_bitmap_set_offset(p->block, p->normal[i]);
         z->out.dst = p->host + p->normal[i];
         z->out.size = p->page_size;
         z->out.pos = 0;
@@ -277,7 +299,7 @@ static int zstd_recv_pages(MultiFDRecvParams *p, Error **errp)
          */
         do {
             ret = ZSTD_decompressStream(z->zds, &z->out, &z->in);
-        } while (ret > 0 && (z->in.size - z->in.pos > 0)
+        } while (ret > 0 && (z->in.size > z->in.pos)
                  && (z->out.pos < p->page_size));
         if (ret > 0 && (z->out.pos < p->page_size)) {
             error_setg(errp, "multifd %u: decompressStream buffer too small",
@@ -305,7 +327,7 @@ static MultiFDMethods multifd_zstd_ops = {
     .send_prepare = zstd_send_prepare,
     .recv_setup = zstd_recv_setup,
     .recv_cleanup = zstd_recv_cleanup,
-    .recv_pages = zstd_recv_pages
+    .recv = zstd_recv
 };
 
 static void multifd_zstd_register(void)
diff --git a/migration/multifd.c b/migration/multifd.c
index 409460684f29a4671f3da69fe5da20c02322de62..4c310deb61b080bf91a7eee6321bcfc1d2ac1c34 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -11,12 +11,14 @@
  */
 
 #include "qemu/osdep.h"
+#include "qemu/cutils.h"
 #include "qemu/rcu.h"
 #include "exec/target_page.h"
 #include "sysemu/sysemu.h"
 #include "exec/ramblock.h"
 #include "qemu/error-report.h"
 #include "qapi/error.h"
+#include "qapi/qapi-events-migration.h"
 #include "ram.h"
 #include "migration.h"
 #include "migration-stats.h"
@@ -45,20 +48,79 @@ typedef struct {
     uint64_t unused2[4];    /* Reserved for future use */
 } __attribute__((packed)) MultiFDInit_t;
 
+struct {
+    MultiFDSendParams *params;
+    /* array of pages to send */
+    MultiFDPages_t *pages;
+    /*
+     * Global number of generated multifd packets.
+     *
+     * Note that we use 'uintptr_t' because it naturally supports atomic
+     * operations on both 32-bit and 64-bit hosts.  It means that on
+     * 32-bit systems multifd will overflow packet_num sooner, but that
+     * should be fine.
+     *
+     * Another option is to use QEMU's Stat64, which would be 64 bits on
+     * all hosts, but so far it does not support atomic fetch_add().
+     * Keep it simple for now.
+     */
+    uintptr_t packet_num;
+    /*
+     * Synchronization point past which no more channels will be
+     * created.
+     */
+    QemuSemaphore channels_created;
+    /* send channels ready */
+    QemuSemaphore channels_ready;
+    /*
+     * Have we already terminated the threads?  There is a race when we
+     * get an error while we are exiting.
+     * We will use atomic operations.  Only valid values are 0 and 1.
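The packet_num counter described above can be reproduced with C11 atomics; a minimal sketch of a uintptr_t-wide fetch-and-increment, with atomic_uintptr_t standing in for QEMU's qatomic helpers:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Pointer-width counter: lock-free fetch-and-increment on both
 * 32-bit and 64-bit hosts. */
static atomic_uintptr_t packet_num;

static uintptr_t next_packet_num(void)
{
    return atomic_fetch_add(&packet_num, 1);
}

int main(void)
{
    /* Prints 0 then 1; every caller gets a unique value even when
     * called from multiple threads concurrently. */
    printf("%lu %lu\n", (unsigned long)next_packet_num(),
           (unsigned long)next_packet_num());
    return 0;
}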
+ */ + int exiting; + /* multifd ops */ + MultiFDMethods *ops; +} *multifd_send_state; + +struct { + MultiFDRecvParams *params; + /* number of created threads */ + int count; + /* syncs main thread and channels */ + QemuSemaphore sem_sync; + /* global number of generated multifd packets */ + uint64_t packet_num; + int exiting; + /* multifd ops */ + MultiFDMethods *ops; +} *multifd_recv_state; + +static bool multifd_use_packets(void) +{ + return true; +} + /* Multifd without compression */ /** * nocomp_send_setup: setup send side * - * For no compression this function does nothing. - * - * Returns 0 for success or -1 for error - * * @p: Params for the channel that we are using * @errp: pointer to an error */ static int nocomp_send_setup(MultiFDSendParams *p, Error **errp) { + if (migrate_zero_copy_send()) { + p->write_flags |= QIO_CHANNEL_WRITE_FLAG_ZERO_COPY; + } + + if (multifd_use_packets()) { + /* We need one extra place for the packet header */ + p->iov = g_new0(struct iovec, p->page_count + 1); + } else { + p->iov = g_new0(struct iovec, p->page_count); + } + return 0; } @@ -72,9 +134,24 @@ static int nocomp_send_setup(MultiFDSendParams *p, Error **errp) */ static void nocomp_send_cleanup(MultiFDSendParams *p, Error **errp) { + g_free(p->iov); + p->iov = NULL; return; } +static void multifd_send_prepare_iovs(MultiFDSendParams *p) +{ + MultiFDPages_t *pages = p->pages; + + for (int i = 0; i < pages->normal_num; i++) { + p->iov[p->iovs_num].iov_base = pages->block->host + pages->offset[i]; + p->iov[p->iovs_num].iov_len = p->page_size; + p->iovs_num++; + } + + p->next_packet_size = pages->normal_num * p->page_size; +} + /** * nocomp_send_prepare: prepare date to be able to send * @@ -88,16 +165,38 @@ static void nocomp_send_cleanup(MultiFDSendParams *p, Error **errp) */ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp) { - MultiFDPages_t *pages = p->pages; + bool use_zero_copy_send = migrate_zero_copy_send(); + int ret; - for (int i = 0; i < p->normal_num; i++) { - p->iov[p->iovs_num].iov_base = pages->block->host + p->normal[i]; - p->iov[p->iovs_num].iov_len = p->page_size; - p->iovs_num++; + multifd_send_zero_page_detect(p); + + if (!multifd_use_packets()) { + multifd_send_prepare_iovs(p); + return 0; + } + + if (!use_zero_copy_send) { + /* + * Only !zerocopy needs the header in IOV; zerocopy will + * send it separately. + */ + multifd_send_prepare_header(p); } - p->next_packet_size = p->normal_num * p->page_size; + multifd_send_prepare_iovs(p); p->flags |= MULTIFD_FLAG_NOCOMP; + + multifd_send_fill_packet(p); + + if (use_zero_copy_send) { + /* Send header first, without zerocopy */ + ret = qio_channel_write_all(p->c, (void *)p->packet, + p->packet_len, errp); + if (ret != 0) { + return -1; + } + } + return 0; } @@ -113,6 +212,7 @@ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp) */ static int nocomp_recv_setup(MultiFDRecvParams *p, Error **errp) { + p->iov = g_new0(struct iovec, p->page_count); return 0; } @@ -125,10 +225,12 @@ static int nocomp_recv_setup(MultiFDRecvParams *p, Error **errp) */ static void nocomp_recv_cleanup(MultiFDRecvParams *p) { + g_free(p->iov); + p->iov = NULL; } /** - * nocomp_recv_pages: read the data from the channel into actual pages + * nocomp_recv: read the data from the channel * * For no compression we just need to read things into the correct place. 
* @@ -137,18 +239,32 @@ static void nocomp_recv_cleanup(MultiFDRecvParams *p) * @p: Params for the channel that we are using * @errp: pointer to an error */ -static int nocomp_recv_pages(MultiFDRecvParams *p, Error **errp) +static int nocomp_recv(MultiFDRecvParams *p, Error **errp) { - uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; + uint32_t flags; + + if (!multifd_use_packets()) { + return 0; + } + + flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; if (flags != MULTIFD_FLAG_NOCOMP) { error_setg(errp, "multifd %u: flags received %x flags expected %x", p->id, flags, MULTIFD_FLAG_NOCOMP); return -1; } + + multifd_recv_zero_page_process(p); + + if (!p->normal_num) { + return 0; + } + for (int i = 0; i < p->normal_num; i++) { p->iov[i].iov_base = p->host + p->normal[i]; p->iov[i].iov_len = p->page_size; + ramblock_recv_bitmap_set_offset(p->block, p->normal[i]); } return qio_channel_readv_all(p->c, p->iov, p->normal_num, errp); } @@ -159,7 +275,7 @@ static MultiFDMethods multifd_nocomp_ops = { .send_prepare = nocomp_send_prepare, .recv_setup = nocomp_recv_setup, .recv_cleanup = nocomp_recv_cleanup, - .recv_pages = nocomp_recv_pages + .recv = nocomp_recv }; static MultiFDMethods *multifd_ops[MULTIFD_COMPRESSION__MAX] = { @@ -172,6 +288,18 @@ void multifd_register_ops(int method, MultiFDMethods *ops) multifd_ops[method] = ops; } +/* Reset a MultiFDPages_t* object for the next use */ +static void multifd_pages_reset(MultiFDPages_t *pages) +{ + /* + * We don't need to touch offset[] array, because it will be + * overwritten later when reused. + */ + pages->num = 0; + pages->normal_num = 0; + pages->block = NULL; +} + static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp) { MultiFDInit_t msg = {}; @@ -228,56 +356,68 @@ static int multifd_recv_initial_packet(QIOChannel *c, Error **errp) } if (msg.id > migrate_multifd_channels()) { - error_setg(errp, "multifd: received channel version %u " - "expected %u", msg.version, MULTIFD_VERSION); + error_setg(errp, "multifd: received channel id %u is greater than " + "number of channels %u", msg.id, migrate_multifd_channels()); return -1; } return msg.id; } -static MultiFDPages_t *multifd_pages_init(size_t size) +static MultiFDPages_t *multifd_pages_init(uint32_t n) { MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1); - pages->allocated = size; - pages->offset = g_new0(ram_addr_t, size); + pages->allocated = n; + pages->offset = g_new0(ram_addr_t, n); return pages; } static void multifd_pages_clear(MultiFDPages_t *pages) { - pages->num = 0; + multifd_pages_reset(pages); pages->allocated = 0; - pages->packet_num = 0; - pages->block = NULL; g_free(pages->offset); pages->offset = NULL; g_free(pages); } -static void multifd_send_fill_packet(MultiFDSendParams *p) +void multifd_send_fill_packet(MultiFDSendParams *p) { MultiFDPacket_t *packet = p->packet; + MultiFDPages_t *pages = p->pages; + uint64_t packet_num; + uint32_t zero_num = pages->num - pages->normal_num; int i; packet->flags = cpu_to_be32(p->flags); packet->pages_alloc = cpu_to_be32(p->pages->allocated); - packet->normal_pages = cpu_to_be32(p->normal_num); + packet->normal_pages = cpu_to_be32(pages->normal_num); + packet->zero_pages = cpu_to_be32(zero_num); packet->next_packet_size = cpu_to_be32(p->next_packet_size); - packet->packet_num = cpu_to_be64(p->packet_num); - if (p->pages->block) { - strncpy(packet->ramblock, p->pages->block->idstr, 256); + packet_num = qatomic_fetch_inc(&multifd_send_state->packet_num); + packet->packet_num = cpu_to_be64(packet_num); + + if 
(pages->block) { + pstrcpy(packet->ramblock, sizeof(packet->ramblock), + pages->block->idstr); } - for (i = 0; i < p->normal_num; i++) { + for (i = 0; i < pages->num; i++) { /* there are architectures where ram_addr_t is 32 bit */ - uint64_t temp = p->normal[i]; + uint64_t temp = pages->offset[i]; packet->offset[i] = cpu_to_be64(temp); } + + p->packets_sent++; + p->total_normal_pages += pages->normal_num; + p->total_zero_pages += zero_num; + + trace_multifd_send(p->id, packet_num, pages->normal_num, zero_num, + p->flags, p->next_packet_size); } static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) @@ -318,15 +458,29 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) p->normal_num = be32_to_cpu(packet->normal_pages); if (p->normal_num > packet->pages_alloc) { error_setg(errp, "multifd: received packet " - "with %u pages and expected maximum pages are %u", + "with %u normal pages and expected maximum pages are %u", p->normal_num, packet->pages_alloc) ; return -1; } + p->zero_num = be32_to_cpu(packet->zero_pages); + if (p->zero_num > packet->pages_alloc - p->normal_num) { + error_setg(errp, "multifd: received packet " + "with %u zero pages and expected maximum zero pages are %u", + p->zero_num, packet->pages_alloc - p->normal_num) ; + return -1; + } + p->next_packet_size = be32_to_cpu(packet->next_packet_size); p->packet_num = be64_to_cpu(packet->packet_num); + p->packets_recved++; + p->total_normal_pages += p->normal_num; + p->total_zero_pages += p->zero_num; + + trace_multifd_recv(p->id, p->packet_num, p->normal_num, p->zero_num, + p->flags, p->next_packet_size); - if (p->normal_num == 0) { + if (p->normal_num == 0 && p->zero_num == 0) { return 0; } @@ -352,26 +506,42 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) p->normal[i] = offset; } + for (i = 0; i < p->zero_num; i++) { + uint64_t offset = be64_to_cpu(packet->offset[p->normal_num + i]); + + if (offset > (p->block->used_length - p->page_size)) { + error_setg(errp, "multifd: offset too long %" PRIu64 + " (max " RAM_ADDR_FMT ")", + offset, p->block->used_length); + return -1; + } + p->zero[i] = offset; + } + return 0; } -struct { - MultiFDSendParams *params; - /* array of pages to sent */ - MultiFDPages_t *pages; - /* global number of generated multifd packets */ - uint64_t packet_num; - /* send channels ready */ - QemuSemaphore channels_ready; - /* - * Have we already run terminate threads. There is a race when it - * happens that we got one error while we are exiting. - * We will use atomic operations. Only valid values are 0 and 1. - */ - int exiting; - /* multifd ops */ - MultiFDMethods *ops; -} *multifd_send_state; +static bool multifd_send_should_exit(void) +{ + return qatomic_read(&multifd_send_state->exiting); +} + +static bool multifd_recv_should_exit(void) +{ + return qatomic_read(&multifd_recv_state->exiting); +} + +/* + * The migration thread can wait on either of the two semaphores. This + * function can be used to kick the main thread out of waiting on either of + * them. Should mostly only be called when something wrong happened with + * the current multifd send thread. + */ +static void multifd_send_kick_main(MultiFDSendParams *p) +{ + qemu_sem_post(&p->sem_sync); + qemu_sem_post(&multifd_send_state->channels_ready); +} /* * How we use multifd_send_state->pages and channel->pages? 
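The unfill checks above follow one rule: every count and offset taken from the wire is range-checked against local invariants before it is used. A standalone sketch of the same rule (validate_header() is a hypothetical helper, not a QEMU function; used_length is assumed to be at least one page):

#include <stdbool.h>
#include <stdint.h>

/* Validate counts and offsets from an untrusted packet header. */
static bool validate_header(uint32_t pages_alloc, uint32_t normal_num,
                            uint32_t zero_num, const uint64_t *offsets,
                            uint64_t used_length, uint64_t page_size)
{
    if (normal_num > pages_alloc ||
        zero_num > pages_alloc - normal_num) {
        return false;                   /* counts exceed the allocation */
    }
    for (uint32_t i = 0; i < normal_num + zero_num; i++) {
        if (offsets[i] > used_length - page_size) {
            return false;               /* page would fall outside the block */
        }
    }
    return true;
}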
@@ -389,20 +559,23 @@ struct {
  * thread is using the channel mutex when changing it, and the channel
  * have to had finish with its own, otherwise pending_job can't be
  * false.
+ *
+ * Returns true on success, false otherwise.
  */
-
-static int multifd_send_pages(QEMUFile *f)
+static bool multifd_send_pages(void)
 {
     int i;
     static int next_channel;
     MultiFDSendParams *p = NULL; /* make happy gcc */
     MultiFDPages_t *pages = multifd_send_state->pages;
 
-    if (qatomic_read(&multifd_send_state->exiting)) {
-        return -1;
+    if (multifd_send_should_exit()) {
+        return false;
     }
 
+    /* We wait here, until at least one channel is ready */
     qemu_sem_wait(&multifd_send_state->channels_ready);
+
     /*
      * next_channel can remain from a previous migration that was
      * using more channels, so ensure it doesn't overflow if the
@@ -410,69 +583,100 @@ static int multifd_send_pages(QEMUFile *f)
      */
     next_channel %= migrate_multifd_channels();
     for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
-        p = &multifd_send_state->params[i];
-
-        qemu_mutex_lock(&p->mutex);
-        if (p->quit) {
-            error_report("%s: channel %d has already quit!", __func__, i);
-            qemu_mutex_unlock(&p->mutex);
-            return -1;
+        if (multifd_send_should_exit()) {
+            return false;
         }
-        if (!p->pending_job) {
-            p->pending_job++;
+        p = &multifd_send_state->params[i];
+        /*
+         * Lockless read of p->pending_job is safe, because only the
+         * multifd sender thread can clear it.
+         */
+        if (qatomic_read(&p->pending_job) == false) {
             next_channel = (i + 1) % migrate_multifd_channels();
             break;
         }
-        qemu_mutex_unlock(&p->mutex);
     }
 
-    assert(!p->pages->num);
-    assert(!p->pages->block);
-    p->packet_num = multifd_send_state->packet_num++;
+    /*
+     * Make sure we read p->pending_job before all the rest.  Pairs with
+     * qatomic_store_release() in multifd_send_thread().
+     */
+    smp_mb_acquire();
+    assert(!p->pages->num);
     multifd_send_state->pages = p->pages;
     p->pages = pages;
-    qemu_mutex_unlock(&p->mutex);
+    /*
+     * Making sure p->pages is setup before marking pending_job=true. Pairs
+     * with the qatomic_load_acquire() in multifd_send_thread().
+     */
+    qatomic_store_release(&p->pending_job, true);
     qemu_sem_post(&p->sem);
 
-    return 1;
+    return true;
 }
 
-int multifd_queue_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
+static inline bool multifd_queue_empty(MultiFDPages_t *pages)
 {
-    MultiFDPages_t *pages = multifd_send_state->pages;
-    bool changed = false;
+    return pages->num == 0;
+}
 
-    if (!pages->block) {
-        pages->block = block;
-    }
+static inline bool multifd_queue_full(MultiFDPages_t *pages)
+{
+    return pages->num == pages->allocated;
+}
 
-    if (pages->block == block) {
-        pages->offset[pages->num] = offset;
-        pages->num++;
+static inline void multifd_enqueue(MultiFDPages_t *pages, ram_addr_t offset)
+{
+    pages->offset[pages->num++] = offset;
+}
 
-        if (pages->num < pages->allocated) {
-            return 1;
-        }
-    } else {
-        changed = true;
-    }
+/* Returns true if enqueue successful, false otherwise */
+bool multifd_queue_page(RAMBlock *block, ram_addr_t offset)
+{
+    MultiFDPages_t *pages;
 
-    if (multifd_send_pages(f) < 0) {
-        return -1;
+retry:
+    pages = multifd_send_state->pages;
+
+    /* If the queue is empty, we can already enqueue now */
+    if (multifd_queue_empty(pages)) {
+        pages->block = block;
+        multifd_enqueue(pages, offset);
+        return true;
     }
 
-    if (changed) {
-        return multifd_queue_page(f, block, offset);
+    /*
+     * Not empty; meanwhile we need a flush.  It can be because of either:
+     *
+     * (1) The page is not on the same ramblock as previous ones, or,
+     * (2) The queue is full.
+     *
+     * After flush, always retry.
+     */
+    if (pages->block != block || multifd_queue_full(pages)) {
+        if (!multifd_send_pages()) {
+            return false;
+        }
+        goto retry;
+    }
 
-    return 1;
+    /* Not empty, and we still have space, do it! */
+    multifd_enqueue(pages, offset);
+    return true;
 }
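The flush-and-retry contract of multifd_queue_page() is small enough to capture standalone. A sketch under the same rules, with a hypothetical Queue type and a stubbed flush_batch() standing in for multifd_send_pages():

#include <stdbool.h>
#include <stddef.h>

typedef struct {
    int block;              /* which RAM block the queued offsets belong to */
    size_t num, cap;        /* cap must be set by the owner before use */
    size_t offsets[64];
} Queue;

/* Stand-in: hand the batch to a sender thread, then reset the queue. */
static bool flush_batch(Queue *q)
{
    q->num = 0;
    return true;
}

/* Queue one page; flush first when the block changes or the queue is
 * full, then retry, mirroring the enqueue logic above. */
static bool queue_page(Queue *q, int block, size_t offset)
{
retry:
    if (q->num == 0) {                  /* empty: adopt the block, enqueue */
        q->block = block;
        q->offsets[q->num++] = offset;
        return true;
    }
    if (q->block != block || q->num == q->cap) {
        if (!flush_batch(q)) {
            return false;
        }
        goto retry;                     /* queue is empty again now */
    }
    q->offsets[q->num++] = offset;      /* same block, space left */
    return true;
}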
 
-static void multifd_send_terminate_threads(Error *err)
+/* Multifd send side hit an error; remember it and prepare to quit */
+static void multifd_send_set_error(Error *err)
 {
-    int i;
-
-    trace_multifd_send_terminate_threads(err != NULL);
+    /*
+     * We don't want to exit the threads twice.  Depending on where we
+     * get the error, or if there are two independent errors in two
+     * threads at the same time, we can end up calling this function
+     * twice.
+     */
+    if (qatomic_xchg(&multifd_send_state->exiting, 1)) {
+        return;
+    }
 
     if (err) {
         MigrationState *s = migrate_get_current();
@@ -485,86 +689,104 @@ static void multifd_send_terminate_threads(Error *err)
             MIGRATION_STATUS_FAILED);
         }
     }
+}
+
+static void multifd_send_terminate_threads(void)
+{
+    int i;
+
+    trace_multifd_send_terminate_threads();
 
     /*
-     * We don't want to exit each threads twice. Depending on where
-     * we get the error, or if there are two independent errors in two
-     * threads at the same time, we can end calling this function
-     * twice.
+     * Tell everyone we're quitting.  No xchg() needed here; we simply
+     * always set it.
      */
-    if (qatomic_xchg(&multifd_send_state->exiting, 1)) {
-        return;
-    }
+    qatomic_set(&multifd_send_state->exiting, 1);
 
+    /*
+     * Firstly, kick all threads out, no matter whether they are just
+     * idle or blocked in an IO system call.
+     */
     for (i = 0; i < migrate_multifd_channels(); i++) {
         MultiFDSendParams *p = &multifd_send_state->params[i];
 
-        qemu_mutex_lock(&p->mutex);
-        p->quit = true;
         qemu_sem_post(&p->sem);
         if (p->c) {
             qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
         }
-        qemu_mutex_unlock(&p->mutex);
     }
+
+    /*
+     * Finally recycle all the threads.
+ */ + for (i = 0; i < migrate_multifd_channels(); i++) { + MultiFDSendParams *p = &multifd_send_state->params[i]; + + if (p->tls_thread_created) { + qemu_thread_join(&p->tls_thread); + } + + if (p->thread_created) { + qemu_thread_join(&p->thread); + } + } +} + +static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) +{ + if (p->c) { + migration_ioc_unregister_yank(p->c); + object_unref(OBJECT(p->c)); + p->c = NULL; + } + qemu_sem_destroy(&p->sem); + qemu_sem_destroy(&p->sem_sync); + g_free(p->name); + p->name = NULL; + multifd_pages_clear(p->pages); + p->pages = NULL; + p->packet_len = 0; + g_free(p->packet); + p->packet = NULL; + multifd_send_state->ops->send_cleanup(p, errp); + + return *errp == NULL; } -static int multifd_send_channel_destroy(QIOChannel *send) +static void multifd_send_cleanup_state(void) { - return socket_send_channel_destroy(send); + socket_cleanup_outgoing_migration(); + qemu_sem_destroy(&multifd_send_state->channels_created); + qemu_sem_destroy(&multifd_send_state->channels_ready); + g_free(multifd_send_state->params); + multifd_send_state->params = NULL; + multifd_pages_clear(multifd_send_state->pages); + multifd_send_state->pages = NULL; + g_free(multifd_send_state); + multifd_send_state = NULL; } -void multifd_save_cleanup(void) +void multifd_send_shutdown(void) { int i; if (!migrate_multifd()) { return; } - multifd_send_terminate_threads(NULL); - for (i = 0; i < migrate_multifd_channels(); i++) { - MultiFDSendParams *p = &multifd_send_state->params[i]; - if (p->running) { - qemu_thread_join(&p->thread); - } - } + multifd_send_terminate_threads(); + for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; Error *local_err = NULL; - if (p->registered_yank) { - migration_ioc_unregister_yank(p->c); - } - multifd_send_channel_destroy(p->c); - p->c = NULL; - qemu_mutex_destroy(&p->mutex); - qemu_sem_destroy(&p->sem); - qemu_sem_destroy(&p->sem_sync); - g_free(p->name); - p->name = NULL; - multifd_pages_clear(p->pages); - p->pages = NULL; - p->packet_len = 0; - g_free(p->packet); - p->packet = NULL; - g_free(p->iov); - p->iov = NULL; - g_free(p->normal); - p->normal = NULL; - multifd_send_state->ops->send_cleanup(p, &local_err); - if (local_err) { + if (!multifd_send_cleanup_channel(p, &local_err)) { migrate_set_error(migrate_get_current(), local_err); error_free(local_err); } } - qemu_sem_destroy(&multifd_send_state->channels_ready); - g_free(multifd_send_state->params); - multifd_send_state->params = NULL; - multifd_pages_clear(multifd_send_state->pages); - multifd_send_state->pages = NULL; - g_free(multifd_send_state); - multifd_send_state = NULL; + + multifd_send_cleanup_state(); } static int multifd_zero_copy_flush(QIOChannel *c) @@ -584,7 +806,7 @@ static int multifd_zero_copy_flush(QIOChannel *c) return ret; } -int multifd_send_sync_main(QEMUFile *f) +int multifd_send_sync_main(void) { int i; bool flush_zero_copy; @@ -593,47 +815,38 @@ int multifd_send_sync_main(QEMUFile *f) return 0; } if (multifd_send_state->pages->num) { - if (multifd_send_pages(f) < 0) { + if (!multifd_send_pages()) { error_report("%s: multifd_send_pages fail", __func__); return -1; } } - /* - * When using zero-copy, it's necessary to flush the pages before any of - * the pages can be sent again, so we'll make sure the new version of the - * pages will always arrive _later_ than the old pages. 
- * - * Currently we achieve this by flushing the zero-page requested writes - * per ram iteration, but in the future we could potentially optimize it - * to be less frequent, e.g. only after we finished one whole scanning of - * all the dirty bitmaps. - */ - flush_zero_copy = migrate_zero_copy_send(); for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; - trace_multifd_send_sync_main_signal(p->id); - - qemu_mutex_lock(&p->mutex); - - if (p->quit) { - error_report("%s: channel %d has already quit", __func__, i); - qemu_mutex_unlock(&p->mutex); + if (multifd_send_should_exit()) { return -1; } - p->packet_num = multifd_send_state->packet_num++; - p->flags |= MULTIFD_FLAG_SYNC; - p->pending_job++; - qemu_mutex_unlock(&p->mutex); + trace_multifd_send_sync_main_signal(p->id); + + /* + * We should be the only user so far, so not possible to be set by + * others concurrently. + */ + assert(qatomic_read(&p->pending_sync) == false); + qatomic_set(&p->pending_sync, true); qemu_sem_post(&p->sem); } for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; + if (multifd_send_should_exit()) { + return -1; + } + qemu_sem_wait(&multifd_send_state->channels_ready); trace_multifd_send_sync_main_wait(p->id); qemu_sem_wait(&p->sem_sync); @@ -653,75 +866,45 @@ static void *multifd_send_thread(void *opaque) MigrationThread *thread = NULL; Error *local_err = NULL; int ret = 0; - bool use_zero_copy_send = migrate_zero_copy_send(); + bool use_packets = multifd_use_packets(); thread = migration_threads_add(p->name, qemu_get_thread_id()); + /* report multifd thread pid to libvirt */ + qapi_event_send_migration_multifd_pid(qemu_get_thread_id()); + trace_multifd_send_thread_start(p->id); rcu_register_thread(); - if (multifd_send_initial_packet(p, &local_err) < 0) { - ret = -1; - goto out; + if (use_packets) { + if (multifd_send_initial_packet(p, &local_err) < 0) { + ret = -1; + goto out; + } } - /* initial packet */ - p->num_packets = 1; while (true) { qemu_sem_post(&multifd_send_state->channels_ready); qemu_sem_wait(&p->sem); - if (qatomic_read(&multifd_send_state->exiting)) { + if (multifd_send_should_exit()) { break; } - qemu_mutex_lock(&p->mutex); - - if (p->pending_job) { - uint64_t packet_num = p->packet_num; - uint32_t flags; - p->normal_num = 0; - - if (use_zero_copy_send) { - p->iovs_num = 0; - } else { - p->iovs_num = 1; - } - for (int i = 0; i < p->pages->num; i++) { - p->normal[p->normal_num] = p->pages->offset[i]; - p->normal_num++; - } + /* + * Read pending_job flag before p->pages. Pairs with the + * qatomic_store_release() in multifd_send_pages(). 
+ */ + if (qatomic_load_acquire(&p->pending_job)) { + MultiFDPages_t *pages = p->pages; - if (p->normal_num) { - ret = multifd_send_state->ops->send_prepare(p, &local_err); - if (ret != 0) { - qemu_mutex_unlock(&p->mutex); - break; - } - } - multifd_send_fill_packet(p); - flags = p->flags; p->flags = 0; - p->num_packets++; - p->total_normal_pages += p->normal_num; - p->pages->num = 0; - p->pages->block = NULL; - qemu_mutex_unlock(&p->mutex); - - trace_multifd_send(p->id, packet_num, p->normal_num, flags, - p->next_packet_size); + p->iovs_num = 0; + assert(pages->num); - if (use_zero_copy_send) { - /* Send header first, without zerocopy */ - ret = qio_channel_write_all(p->c, (void *)p->packet, - p->packet_len, &local_err); - if (ret != 0) { - break; - } - } else { - /* Send header using the same writev call */ - p->iov[0].iov_len = p->packet_len; - p->iov[0].iov_base = p->packet; + ret = multifd_send_state->ops->send_prepare(p, &local_err); + if (ret != 0) { + break; } ret = qio_channel_writev_full_all(p->c, p->iov, p->iovs_num, NULL, @@ -731,18 +914,41 @@ static void *multifd_send_thread(void *opaque) } stat64_add(&mig_stats.multifd_bytes, - p->next_packet_size + p->packet_len); + (uint64_t)p->next_packet_size + p->packet_len); + stat64_add(&mig_stats.normal_pages, pages->normal_num); + stat64_add(&mig_stats.zero_pages, pages->num - pages->normal_num); + + multifd_pages_reset(p->pages); p->next_packet_size = 0; - qemu_mutex_lock(&p->mutex); - p->pending_job--; - qemu_mutex_unlock(&p->mutex); - if (flags & MULTIFD_FLAG_SYNC) { - qemu_sem_post(&p->sem_sync); - } + /* + * Making sure p->pages is published before saying "we're + * free". Pairs with the smp_mb_acquire() in + * multifd_send_pages(). + */ + qatomic_store_release(&p->pending_job, false); } else { - qemu_mutex_unlock(&p->mutex); - /* sometimes there are spurious wakeups */ + /* + * If not a normal job, must be a sync request. Note that + * pending_sync is a standalone flag (unlike pending_job), so + * it doesn't require explicit memory barriers. 
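The pending_job handshake relies on this release/acquire pairing rather than on locks. A self-contained C11 sketch of the same publication pattern, with a plain spin loop standing in for the semaphore wait:

#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>

static int payload;                 /* stands in for p->pages */
static atomic_bool pending_job;

static void *worker(void *arg)
{
    (void)arg;
    /* The acquire load pairs with the release store in main(): once
     * pending_job reads true, the payload write is visible here too. */
    while (!atomic_load_explicit(&pending_job, memory_order_acquire)) {
        /* spin; the real code sleeps on a semaphore instead */
    }
    printf("worker got %d\n", payload);
    return NULL;
}

int main(void)
{
    pthread_t th;

    pthread_create(&th, NULL, worker, NULL);
    payload = 42;                                   /* publish data first */
    atomic_store_explicit(&pending_job, true,
                          memory_order_release);    /* then raise the flag */
    pthread_join(th, NULL);
    return 0;
}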
+ */ + assert(qatomic_read(&p->pending_sync)); + + if (use_packets) { + p->flags = MULTIFD_FLAG_SYNC; + multifd_send_fill_packet(p); + ret = qio_channel_write_all(p->c, (void *)p->packet, + p->packet_len, &local_err); + if (ret != 0) { + break; + } + /* p->next_packet_size will always be zero for a SYNC packet */ + stat64_add(&mig_stats.multifd_bytes, p->packet_len); + } + + qatomic_set(&p->pending_sync, false); + qemu_sem_post(&p->sem_sync); } } @@ -750,62 +956,37 @@ out: if (ret) { assert(local_err); trace_multifd_send_error(p->id); - multifd_send_terminate_threads(local_err); - qemu_sem_post(&p->sem_sync); - qemu_sem_post(&multifd_send_state->channels_ready); + multifd_send_set_error(local_err); + multifd_send_kick_main(p); error_free(local_err); } - qemu_mutex_lock(&p->mutex); - p->running = false; - qemu_mutex_unlock(&p->mutex); - rcu_unregister_thread(); migration_threads_remove(thread); - trace_multifd_send_thread_end(p->id, p->num_packets, p->total_normal_pages); + trace_multifd_send_thread_end(p->id, p->packets_sent, p->total_normal_pages, + p->total_zero_pages); return NULL; } -static bool multifd_channel_connect(MultiFDSendParams *p, - QIOChannel *ioc, - Error **errp); - -static void multifd_tls_outgoing_handshake(QIOTask *task, - gpointer opaque) -{ - MultiFDSendParams *p = opaque; - QIOChannel *ioc = QIO_CHANNEL(qio_task_get_source(task)); - Error *err = NULL; - - if (!qio_task_propagate_error(task, &err)) { - trace_multifd_tls_outgoing_handshake_complete(ioc); - if (multifd_channel_connect(p, ioc, &err)) { - return; - } - } - - trace_multifd_tls_outgoing_handshake_error(ioc, error_get_pretty(err)); +static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque); - /* - * Error happen, mark multifd_send_thread status as 'quit' although it - * is not created, and then tell who pay attention to me. - */ - p->quit = true; - qemu_sem_post(&multifd_send_state->channels_ready); - qemu_sem_post(&p->sem_sync); -} +typedef struct { + MultiFDSendParams *p; + QIOChannelTLS *tioc; +} MultiFDTLSThreadArgs; static void *multifd_tls_handshake_thread(void *opaque) { - MultiFDSendParams *p = opaque; - QIOChannelTLS *tioc = QIO_CHANNEL_TLS(p->c); + MultiFDTLSThreadArgs *args = opaque; - qio_channel_tls_handshake(tioc, - multifd_tls_outgoing_handshake, - p, + qio_channel_tls_handshake(args->tioc, + multifd_new_send_channel_async, + args->p, NULL, NULL); + g_free(args); + return NULL; } @@ -815,6 +996,7 @@ static bool multifd_tls_channel_connect(MultiFDSendParams *p, { MigrationState *s = migrate_get_current(); const char *hostname = s->hostname; + MultiFDTLSThreadArgs *args; QIOChannelTLS *tioc; tioc = migration_tls_client_create(ioc, hostname, errp); @@ -822,76 +1004,91 @@ static bool multifd_tls_channel_connect(MultiFDSendParams *p, return false; } + /* + * Ownership of the socket channel now transfers to the newly + * created TLS channel, which has already taken a reference. 
+ */ object_unref(OBJECT(ioc)); trace_multifd_tls_outgoing_handshake_start(ioc, tioc, hostname); qio_channel_set_name(QIO_CHANNEL(tioc), "multifd-tls-outgoing"); - p->c = QIO_CHANNEL(tioc); - qemu_thread_create(&p->thread, "multifd-tls-handshake-worker", - multifd_tls_handshake_thread, p, + + args = g_new0(MultiFDTLSThreadArgs, 1); + args->tioc = tioc; + args->p = p; + + p->tls_thread_created = true; + qemu_thread_create(&p->tls_thread, "multifd-tls-handshake-worker", + multifd_tls_handshake_thread, args, QEMU_THREAD_JOINABLE); return true; } -static bool multifd_channel_connect(MultiFDSendParams *p, - QIOChannel *ioc, - Error **errp) +static void multifd_channel_connect(MultiFDSendParams *p, QIOChannel *ioc) { - trace_multifd_set_outgoing_channel( - ioc, object_get_typename(OBJECT(ioc)), - migrate_get_current()->hostname); - - if (migrate_channel_requires_tls_upgrade(ioc)) { - /* - * tls_channel_connect will call back to this - * function after the TLS handshake, - * so we mustn't call multifd_send_thread until then - */ - return multifd_tls_channel_connect(p, ioc, errp); + qio_channel_set_delay(ioc, false); - } else { - migration_ioc_register_yank(ioc); - p->registered_yank = true; - p->c = ioc; - qemu_thread_create(&p->thread, p->name, multifd_send_thread, p, - QEMU_THREAD_JOINABLE); - } - return true; -} + migration_ioc_register_yank(ioc); + /* Setup p->c only if the channel is completely setup */ + p->c = ioc; -static void multifd_new_send_channel_cleanup(MultiFDSendParams *p, - QIOChannel *ioc, Error *err) -{ - migrate_set_error(migrate_get_current(), err); - /* Error happen, we need to tell who pay attention to me */ - qemu_sem_post(&multifd_send_state->channels_ready); - qemu_sem_post(&p->sem_sync); - /* - * Although multifd_send_thread is not created, but main migration - * thread need to judge whether it is running, so we need to mark - * its status. - */ - p->quit = true; - object_unref(OBJECT(ioc)); - error_free(err); + p->thread_created = true; + qemu_thread_create(&p->thread, p->name, multifd_send_thread, p, + QEMU_THREAD_JOINABLE); } +/* + * When TLS is enabled this function is called once to establish the + * TLS connection and a second time after the TLS handshake to create + * the multifd channel. Without TLS it goes straight into the channel + * creation. + */ static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque) { MultiFDSendParams *p = opaque; QIOChannel *ioc = QIO_CHANNEL(qio_task_get_source(task)); Error *local_err = NULL; + bool ret; trace_multifd_new_send_channel_async(p->id); - if (!qio_task_propagate_error(task, &local_err)) { - qio_channel_set_delay(ioc, false); - p->running = true; - if (multifd_channel_connect(p, ioc, &local_err)) { + + if (qio_task_propagate_error(task, &local_err)) { + ret = false; + goto out; + } + + trace_multifd_set_outgoing_channel(ioc, object_get_typename(OBJECT(ioc)), + migrate_get_current()->hostname); + + if (migrate_channel_requires_tls_upgrade(ioc)) { + ret = multifd_tls_channel_connect(p, ioc, &local_err); + if (ret) { return; } + } else { + multifd_channel_connect(p, ioc); + ret = true; + } + +out: + /* + * Here we're not interested whether creation succeeded, only that + * it happened at all. 
+     */
+    qemu_sem_post(&multifd_send_state->channels_created);
+
+    if (ret) {
+        return;
+    }
 
     trace_multifd_new_send_channel_async_error(p->id, local_err);
-    multifd_new_send_channel_cleanup(p, ioc, local_err);
+    multifd_send_set_error(local_err);
+    /*
+     * For error cases (TLS or non-TLS), the IO channel is always freed
+     * here rather than in multifd cleanup: since p->c is not set, the
+     * multifd cleanup code doesn't even know it exists.
+     */
+    object_unref(OBJECT(ioc));
+    error_free(local_err);
 }
 
 static void multifd_new_send_channel_create(gpointer opaque)
@@ -899,20 +1096,24 @@ static void multifd_new_send_channel_create(gpointer opaque)
     socket_send_channel_create(multifd_new_send_channel_async, opaque);
 }
 
-int multifd_save_setup(Error **errp)
+bool multifd_send_setup(void)
 {
-    int thread_count;
+    MigrationState *s = migrate_get_current();
+    Error *local_err = NULL;
+    int thread_count, ret = 0;
     uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
+    bool use_packets = multifd_use_packets();
     uint8_t i;
 
     if (!migrate_multifd()) {
-        return 0;
+        return true;
     }
 
     thread_count = migrate_multifd_channels();
     multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
     multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
     multifd_send_state->pages = multifd_pages_init(page_count);
+    qemu_sem_init(&multifd_send_state->channels_created, 0);
     qemu_sem_init(&multifd_send_state->channels_ready, 0);
     qatomic_set(&multifd_send_state->exiting, 0);
     multifd_send_state->ops = multifd_ops[migrate_multifd_compression()];
@@ -920,59 +1121,53 @@ int multifd_save_setup(Error **errp)
     for (i = 0; i < thread_count; i++) {
         MultiFDSendParams *p = &multifd_send_state->params[i];
 
-        qemu_mutex_init(&p->mutex);
         qemu_sem_init(&p->sem, 0);
         qemu_sem_init(&p->sem_sync, 0);
-        p->quit = false;
-        p->pending_job = 0;
         p->id = i;
         p->pages = multifd_pages_init(page_count);
-        p->packet_len = sizeof(MultiFDPacket_t)
-                      + sizeof(uint64_t) * page_count;
-        p->packet = g_malloc0(p->packet_len);
-        p->packet->magic = cpu_to_be32(MULTIFD_MAGIC);
-        p->packet->version = cpu_to_be32(MULTIFD_VERSION);
+
+        if (use_packets) {
+            p->packet_len = sizeof(MultiFDPacket_t)
+                          + sizeof(uint64_t) * page_count;
+            p->packet = g_malloc0(p->packet_len);
+            p->packet->magic = cpu_to_be32(MULTIFD_MAGIC);
+            p->packet->version = cpu_to_be32(MULTIFD_VERSION);
+        }
         p->name = g_strdup_printf("multifdsend_%d", i);
-        /* We need one extra place for the packet header */
-        p->iov = g_new0(struct iovec, page_count + 1);
-        p->normal = g_new0(ram_addr_t, page_count);
         p->page_size = qemu_target_page_size();
         p->page_count = page_count;
-
-        if (migrate_zero_copy_send()) {
-            p->write_flags = QIO_CHANNEL_WRITE_FLAG_ZERO_COPY;
-        } else {
-            p->write_flags = 0;
-        }
-
+        p->write_flags = 0;
         multifd_new_send_channel_create(p);
     }
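The channels_created accounting used below is a plain counting-semaphore rendezvous: every creation attempt posts exactly once, success or failure, and the main thread waits for that many posts. A standalone POSIX sketch with a hypothetical create_channel() worker:

#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>

#define NCHANNELS 4

static sem_t channels_created;

static void *create_channel(void *arg)
{
    (void)arg;
    /* ... channel setup here, which may succeed or fail ... */
    sem_post(&channels_created);   /* post in every case: the main thread
                                    * only needs to know creation was tried */
    return NULL;
}

int main(void)
{
    pthread_t th[NCHANNELS];

    sem_init(&channels_created, 0, 0);
    for (int i = 0; i < NCHANNELS; i++) {
        pthread_create(&th[i], NULL, create_channel, NULL);
    }
    /* Wait until creation has at least started everywhere; past this
     * point no new channels can appear, so later cleanup is race-free. */
    for (int i = 0; i < NCHANNELS; i++) {
        sem_wait(&channels_created);
    }
    for (int i = 0; i < NCHANNELS; i++) {
        pthread_join(th[i], NULL);
    }
    puts("all channel creations attempted");
    return 0;
}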
+ */ + for (i = 0; i < thread_count; i++) { + qemu_sem_wait(&multifd_send_state->channels_created); + } + for (i = 0; i < thread_count; i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; - Error *local_err = NULL; - int ret; ret = multifd_send_state->ops->send_setup(p, &local_err); if (ret) { - error_propagate(errp, local_err); - return ret; + break; } } - return 0; -} -struct { - MultiFDRecvParams *params; - /* number of created threads */ - int count; - /* syncs main thread and channels */ - QemuSemaphore sem_sync; - /* global number of generated multifd packets */ - uint64_t packet_num; - /* multifd ops */ - MultiFDMethods *ops; -} *multifd_recv_state; + if (ret) { + migrate_set_error(s, local_err); + error_report_err(local_err); + migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, + MIGRATION_STATUS_FAILED); + return false; + } + + return true; +} static void multifd_recv_terminate_threads(Error *err) { @@ -980,6 +1175,10 @@ static void multifd_recv_terminate_threads(Error *err) trace_multifd_recv_terminate_threads(err != NULL); + if (qatomic_xchg(&multifd_recv_state->exiting, 1)) { + return; + } + if (err) { MigrationState *s = migrate_get_current(); migrate_set_error(s, err); @@ -993,8 +1192,14 @@ static void multifd_recv_terminate_threads(Error *err) for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDRecvParams *p = &multifd_recv_state->params[i]; - qemu_mutex_lock(&p->mutex); - p->quit = true; + /* + * multifd_recv_thread may hung at MULTIFD_FLAG_SYNC handle code, + * however try to wakeup it without harm in cleanup phase. + */ + if (multifd_use_packets()) { + qemu_sem_post(&p->sem_sync); + } + /* * We could arrive here for two reasons: * - normal quit, i.e. everything went fine, just finished @@ -1004,18 +1209,45 @@ static void multifd_recv_terminate_threads(Error *err) if (p->c) { qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL); } - qemu_mutex_unlock(&p->mutex); } } -void multifd_load_shutdown(void) +void multifd_recv_shutdown(void) { if (migrate_multifd()) { multifd_recv_terminate_threads(NULL); } } -void multifd_load_cleanup(void) +static void multifd_recv_cleanup_channel(MultiFDRecvParams *p) +{ + migration_ioc_unregister_yank(p->c); + object_unref(OBJECT(p->c)); + p->c = NULL; + qemu_mutex_destroy(&p->mutex); + qemu_sem_destroy(&p->sem_sync); + g_free(p->name); + p->name = NULL; + p->packet_len = 0; + g_free(p->packet); + p->packet = NULL; + g_free(p->normal); + p->normal = NULL; + g_free(p->zero); + p->zero = NULL; + multifd_recv_state->ops->recv_cleanup(p); +} + +static void multifd_recv_cleanup_state(void) +{ + qemu_sem_destroy(&multifd_recv_state->sem_sync); + g_free(multifd_recv_state->params); + multifd_recv_state->params = NULL; + g_free(multifd_recv_state); + multifd_recv_state = NULL; +} + +void multifd_recv_cleanup(void) { int i; @@ -1026,56 +1258,39 @@ void multifd_load_cleanup(void) for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDRecvParams *p = &multifd_recv_state->params[i]; - if (p->running) { - /* - * multifd_recv_thread may hung at MULTIFD_FLAG_SYNC handle code, - * however try to wakeup it without harm in cleanup phase. 
- */ - qemu_sem_post(&p->sem_sync); + if (p->thread_created) { + qemu_thread_join(&p->thread); } - - qemu_thread_join(&p->thread); } for (i = 0; i < migrate_multifd_channels(); i++) { - MultiFDRecvParams *p = &multifd_recv_state->params[i]; - - migration_ioc_unregister_yank(p->c); - object_unref(OBJECT(p->c)); - p->c = NULL; - qemu_mutex_destroy(&p->mutex); - qemu_sem_destroy(&p->sem_sync); - g_free(p->name); - p->name = NULL; - p->packet_len = 0; - g_free(p->packet); - p->packet = NULL; - g_free(p->iov); - p->iov = NULL; - g_free(p->normal); - p->normal = NULL; - multifd_recv_state->ops->recv_cleanup(p); + multifd_recv_cleanup_channel(&multifd_recv_state->params[i]); } - qemu_sem_destroy(&multifd_recv_state->sem_sync); - g_free(multifd_recv_state->params); - multifd_recv_state->params = NULL; - g_free(multifd_recv_state); - multifd_recv_state = NULL; + multifd_recv_cleanup_state(); } void multifd_recv_sync_main(void) { + int thread_count = migrate_multifd_channels(); int i; - if (!migrate_multifd()) { + if (!migrate_multifd() || !multifd_use_packets()) { return; } - for (i = 0; i < migrate_multifd_channels(); i++) { - MultiFDRecvParams *p = &multifd_recv_state->params[i]; - trace_multifd_recv_sync_main_wait(p->id); + /* + * Initiate the synchronization by waiting for all channels. + * For socket-based migration this means each channel has received + * the SYNC packet on the stream. + */ + for (i = 0; i < thread_count; i++) { + trace_multifd_recv_sync_main_wait(i); qemu_sem_wait(&multifd_recv_state->sem_sync); } - for (i = 0; i < migrate_multifd_channels(); i++) { + + /* + * Sync done. Release the channels for the next iteration. + */ + for (i = 0; i < thread_count; i++) { MultiFDRecvParams *p = &multifd_recv_state->params[i]; WITH_QEMU_LOCK_GUARD(&p->mutex) { @@ -1093,50 +1308,54 @@ static void *multifd_recv_thread(void *opaque) { MultiFDRecvParams *p = opaque; Error *local_err = NULL; + bool use_packets = multifd_use_packets(); int ret; trace_multifd_recv_thread_start(p->id); rcu_register_thread(); while (true) { - uint32_t flags; + uint32_t flags = 0; + bool has_data = false; + p->normal_num = 0; - if (p->quit) { + if (multifd_recv_should_exit()) { break; } - ret = qio_channel_read_all_eof(p->c, (void *)p->packet, - p->packet_len, &local_err); - if (ret == 0 || ret == -1) { /* 0: EOF -1: Error */ - break; - } + if (use_packets) { + ret = qio_channel_read_all_eof(p->c, (void *)p->packet, + p->packet_len, &local_err); + if (ret == 0 || ret == -1) { /* 0: EOF -1: Error */ + break; + } - qemu_mutex_lock(&p->mutex); - ret = multifd_recv_unfill_packet(p, &local_err); - if (ret) { + qemu_mutex_lock(&p->mutex); + ret = multifd_recv_unfill_packet(p, &local_err); + if (ret) { + qemu_mutex_unlock(&p->mutex); + break; + } + + flags = p->flags; + /* recv methods don't know how to handle the SYNC flag */ + p->flags &= ~MULTIFD_FLAG_SYNC; + has_data = p->normal_num || p->zero_num; qemu_mutex_unlock(&p->mutex); - break; } - flags = p->flags; - /* recv methods don't know how to handle the SYNC flag */ - p->flags &= ~MULTIFD_FLAG_SYNC; - trace_multifd_recv(p->id, p->packet_num, p->normal_num, flags, - p->next_packet_size); - p->num_packets++; - p->total_normal_pages += p->normal_num; - qemu_mutex_unlock(&p->mutex); - - if (p->normal_num) { - ret = multifd_recv_state->ops->recv_pages(p, &local_err); + if (has_data) { + ret = multifd_recv_state->ops->recv(p, &local_err); if (ret != 0) { break; } } - if (flags & MULTIFD_FLAG_SYNC) { - qemu_sem_post(&multifd_recv_state->sem_sync); - 
qemu_sem_wait(&p->sem_sync); + if (use_packets) { + if (flags & MULTIFD_FLAG_SYNC) { + qemu_sem_post(&multifd_recv_state->sem_sync); + qemu_sem_wait(&p->sem_sync); + } } } @@ -1144,20 +1363,20 @@ static void *multifd_recv_thread(void *opaque) multifd_recv_terminate_threads(local_err); error_free(local_err); } - qemu_mutex_lock(&p->mutex); - p->running = false; - qemu_mutex_unlock(&p->mutex); rcu_unregister_thread(); - trace_multifd_recv_thread_end(p->id, p->num_packets, p->total_normal_pages); + trace_multifd_recv_thread_end(p->id, p->packets_recved, + p->total_normal_pages, + p->total_zero_pages); return NULL; } -int multifd_load_setup(Error **errp) +int multifd_recv_setup(Error **errp) { int thread_count; uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size(); + bool use_packets = multifd_use_packets(); uint8_t i; /* @@ -1172,6 +1391,7 @@ int multifd_load_setup(Error **errp) multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state)); multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count); qatomic_set(&multifd_recv_state->count, 0); + qatomic_set(&multifd_recv_state->exiting, 0); qemu_sem_init(&multifd_recv_state->sem_sync, 0); multifd_recv_state->ops = multifd_ops[migrate_multifd_compression()]; @@ -1180,26 +1400,26 @@ int multifd_load_setup(Error **errp) qemu_mutex_init(&p->mutex); qemu_sem_init(&p->sem_sync, 0); - p->quit = false; p->id = i; - p->packet_len = sizeof(MultiFDPacket_t) - + sizeof(uint64_t) * page_count; - p->packet = g_malloc0(p->packet_len); + + if (use_packets) { + p->packet_len = sizeof(MultiFDPacket_t) + + sizeof(uint64_t) * page_count; + p->packet = g_malloc0(p->packet_len); + } p->name = g_strdup_printf("multifdrecv_%d", i); - p->iov = g_new0(struct iovec, page_count); p->normal = g_new0(ram_addr_t, page_count); + p->zero = g_new0(ram_addr_t, page_count); p->page_count = page_count; p->page_size = qemu_target_page_size(); } for (i = 0; i < thread_count; i++) { MultiFDRecvParams *p = &multifd_recv_state->params[i]; - Error *local_err = NULL; int ret; - ret = multifd_recv_state->ops->recv_setup(p, &local_err); + ret = multifd_recv_state->ops->recv_setup(p, errp); if (ret) { - error_propagate(errp, local_err); return ret; } } @@ -1230,18 +1450,24 @@ void multifd_recv_new_channel(QIOChannel *ioc, Error **errp) { MultiFDRecvParams *p; Error *local_err = NULL; + bool use_packets = multifd_use_packets(); int id; - id = multifd_recv_initial_packet(ioc, &local_err); - if (id < 0) { - multifd_recv_terminate_threads(local_err); - error_propagate_prepend(errp, local_err, - "failed to receive packet" - " via multifd channel %d: ", - qatomic_read(&multifd_recv_state->count)); - return; + if (use_packets) { + id = multifd_recv_initial_packet(ioc, &local_err); + if (id < 0) { + multifd_recv_terminate_threads(local_err); + error_propagate_prepend(errp, local_err, + "failed to receive packet" + " via multifd channel %d: ", + qatomic_read(&multifd_recv_state->count)); + return; + } + trace_multifd_recv_new_channel(id); + } else { + /* next patch gives this a meaningful value */ + id = 0; } - trace_multifd_recv_new_channel(id); p = &multifd_recv_state->params[id]; if (p->c != NULL) { @@ -1253,11 +1479,22 @@ void multifd_recv_new_channel(QIOChannel *ioc, Error **errp) } p->c = ioc; object_ref(OBJECT(ioc)); - /* initial packet */ - p->num_packets = 1; - p->running = true; + p->thread_created = true; qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p, QEMU_THREAD_JOINABLE); qatomic_inc(&multifd_recv_state->count); } + +bool 
multifd_send_prepare_common(MultiFDSendParams *p) +{ + multifd_send_prepare_header(p); + multifd_send_zero_page_detect(p); + + if (!p->pages->normal_num) { + p->next_packet_size = 0; + return false; + } + + return true; +} diff --git a/migration/multifd.h b/migration/multifd.h index a835643b48c85ccbfc20d705eb67af5b09323d17..57c13347881eb557f1dcb032a6674cb3ca0e8036 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -13,26 +13,31 @@ #ifndef QEMU_MIGRATION_MULTIFD_H #define QEMU_MIGRATION_MULTIFD_H -int multifd_save_setup(Error **errp); -void multifd_save_cleanup(void); -int multifd_load_setup(Error **errp); -void multifd_load_cleanup(void); -void multifd_load_shutdown(void); +#include "ram.h" + +bool multifd_send_setup(void); +void multifd_send_shutdown(void); +int multifd_recv_setup(Error **errp); +void multifd_recv_cleanup(void); +void multifd_recv_shutdown(void); bool multifd_recv_all_channels_created(void); void multifd_recv_new_channel(QIOChannel *ioc, Error **errp); void multifd_recv_sync_main(void); -int multifd_send_sync_main(QEMUFile *f); -int multifd_queue_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset); +int multifd_send_sync_main(void); +bool multifd_queue_page(RAMBlock *block, ram_addr_t offset); /* Multifd Compression flags */ #define MULTIFD_FLAG_SYNC (1 << 0) -/* We reserve 3 bits for compression methods */ -#define MULTIFD_FLAG_COMPRESSION_MASK (7 << 1) +/* We reserve 5 bits for compression methods */ +#define MULTIFD_FLAG_COMPRESSION_MASK (0x1f << 1) /* we need to be compatible. Before compression value was 0 */ #define MULTIFD_FLAG_NOCOMP (0 << 1) #define MULTIFD_FLAG_ZLIB (1 << 1) #define MULTIFD_FLAG_ZSTD (2 << 1) +#define MULTIFD_FLAG_QPL (4 << 1) +#define MULTIFD_FLAG_UADK (8 << 1) +#define MULTIFD_FLAG_QATZIP (16 << 1) /* This value needs to be a multiple of qemu_target_page_size() */ #define MULTIFD_PACKET_SIZE (512 * 1024) @@ -48,18 +53,26 @@ typedef struct { /* size of the next packet that contains pages */ uint32_t next_packet_size; uint64_t packet_num; - uint64_t unused[4]; /* Reserved for future use */ + /* zero pages */ + uint32_t zero_pages; + uint32_t unused32[1]; /* Reserved for future use */ + uint64_t unused64[3]; /* Reserved for future use */ char ramblock[256]; + /* + * This array contains the pointers to: + * - normal pages (initial normal_pages entries) + * - zero pages (following zero_pages entries) + */ uint64_t offset[]; } __attribute__((packed)) MultiFDPacket_t; typedef struct { /* number of used pages */ uint32_t num; + /* number of normal pages */ + uint32_t normal_num; /* number of allocated pages */ uint32_t allocated; - /* global number of generated multifd packets */ - uint64_t packet_num; /* offset of each page */ ram_addr_t *offset; RAMBlock *block; @@ -75,10 +88,11 @@ typedef struct { char *name; /* channel thread id */ QemuThread thread; + bool thread_created; + QemuThread tls_thread; + bool tls_thread_created; /* communication channel */ QIOChannel *c; - /* is the yank function registered */ - bool registered_yank; /* packet allocated len */ uint32_t packet_len; /* guest page size */ @@ -93,18 +107,19 @@ typedef struct { /* syncs main thread and channels */ QemuSemaphore sem_sync; - /* this mutex protects the following parameters */ - QemuMutex mutex; - /* is this channel thread running */ - bool running; - /* should this thread finish */ - bool quit; /* multifd flags for each packet */ uint32_t flags; - /* global number of generated multifd packets */ - uint64_t packet_num; - /* thread has work to do */ - int 
pending_job; + /* + * The sender thread has work to do if either of the booleans below is set. + * + * @pending_job: a job is pending + * @pending_sync: a sync request is pending + * + * For both of these fields, they're only set by the requesters, and + * cleared by the multifd sender threads. + */ + bool pending_job; + bool pending_sync; /* array of pages to sent. * The owner of 'pages' depends of 'pending_job' value: * pending_job == 0 -> migration_thread can use it. @@ -119,19 +134,17 @@ typedef struct { /* size of the next packet that contains pages */ uint32_t next_packet_size; /* packets sent through this channel */ - uint64_t num_packets; + uint64_t packets_sent; /* non zero pages sent through this channel */ uint64_t total_normal_pages; + /* zero pages sent through this channel */ + uint64_t total_zero_pages; /* buffers to send */ struct iovec *iov; /* number of iovs used */ uint32_t iovs_num; - /* Pages that are not zero */ - ram_addr_t *normal; - /* num of non zero pages */ - uint32_t normal_num; /* used for compression methods */ - void *data; + void *compress_data; } MultiFDSendParams; typedef struct { @@ -144,6 +157,7 @@ typedef struct { char *name; /* channel thread id */ QemuThread thread; + bool thread_created; /* communication channel */ QIOChannel *c; /* packet allocated len */ @@ -158,8 +172,6 @@ typedef struct { /* this mutex protects the following parameters */ QemuMutex mutex; - /* is this channel thread running */ - bool running; /* should this thread finish */ bool quit; /* multifd flags for each packet */ @@ -173,22 +185,28 @@ typedef struct { MultiFDPacket_t *packet; /* size of the next packet that contains pages */ uint32_t next_packet_size; - /* packets sent through this channel */ - uint64_t num_packets; + /* packets received through this channel */ + uint64_t packets_recved; /* ramblock */ RAMBlock *block; /* ramblock host address */ uint8_t *host; /* non zero pages recv through this channel */ uint64_t total_normal_pages; + /* zero pages recv through this channel */ + uint64_t total_zero_pages; /* buffers to recv */ struct iovec *iov; /* Pages that are not zero */ ram_addr_t *normal; /* num of non zero pages */ uint32_t normal_num; + /* Pages that are zero */ + ram_addr_t *zero; + /* num of zero pages */ + uint32_t zero_num; /* used for de-compression methods */ - void *data; + void *compress_data; } MultiFDRecvParams; typedef struct { @@ -202,11 +220,22 @@ typedef struct { int (*recv_setup)(MultiFDRecvParams *p, Error **errp); /* Cleanup for receiving side */ void (*recv_cleanup)(MultiFDRecvParams *p); - /* Read all pages */ - int (*recv_pages)(MultiFDRecvParams *p, Error **errp); + /* Read all data */ + int (*recv)(MultiFDRecvParams *p, Error **errp); } MultiFDMethods; void multifd_register_ops(int method, MultiFDMethods *ops); +void multifd_send_fill_packet(MultiFDSendParams *p); +bool multifd_send_prepare_common(MultiFDSendParams *p); +void multifd_send_zero_page_detect(MultiFDSendParams *p); +void multifd_recv_zero_page_process(MultiFDRecvParams *p); -#endif +static inline void multifd_send_prepare_header(MultiFDSendParams *p) +{ + p->iov[0].iov_len = p->packet_len; + p->iov[0].iov_base = p->packet; + p->iovs_num++; +} + +#endif diff --git a/migration/options.c b/migration/options.c index 8d8ec73ad95bd1dc7db9cc9881099524085544ae..6ba7ff65a379e5bed488df699d5e80c0b2072d55 100644 --- a/migration/options.c +++ b/migration/options.c @@ -47,6 +47,7 @@ #define DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT 2 /*0: means nocompress, 1: best speed, ...
9: best compress ratio */ #define DEFAULT_MIGRATE_COMPRESS_LEVEL 1 +#define DEFAULT_MIGRATE_COMPRESS_METHOD COMPRESS_METHOD_ZLIB /* Define default autoconverge cpu throttle migration parameters */ #define DEFAULT_MIGRATE_THROTTLE_TRIGGER_THRESHOLD 50 #define DEFAULT_MIGRATE_CPU_THROTTLE_INITIAL 20 @@ -62,6 +63,13 @@ #define DEFAULT_MIGRATE_MULTIFD_COMPRESSION MULTIFD_COMPRESSION_NONE /* 0: means nocompress, 1: best speed, ... 9: best compress ratio */ #define DEFAULT_MIGRATE_MULTIFD_ZLIB_LEVEL 1 +/* + * 1: best speed, ... 9: best compress ratio + * There is some nuance here. Refer to QATzip documentation to understand + * the mapping of QATzip levels to standard deflate levels. + */ +#define DEFAULT_MIGRATE_MULTIFD_QATZIP_LEVEL 1 + /* 0: means nocompress, 1: best speed, ... 20: best compress ratio */ #define DEFAULT_MIGRATE_MULTIFD_ZSTD_LEVEL 1 @@ -113,6 +121,9 @@ Property migration_properties[] = { DEFINE_PROP_UINT8("x-decompress-threads", MigrationState, parameters.decompress_threads, DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT), + DEFINE_PROP_COMPRESS_METHOD("compress-method", MigrationState, + parameters.compress_method, + DEFAULT_MIGRATE_COMPRESS_METHOD), DEFINE_PROP_UINT8("x-throttle-trigger-threshold", MigrationState, parameters.throttle_trigger_threshold, DEFAULT_MIGRATE_THROTTLE_TRIGGER_THRESHOLD), @@ -143,6 +154,9 @@ Property migration_properties[] = { DEFINE_PROP_UINT8("multifd-zlib-level", MigrationState, parameters.multifd_zlib_level, DEFAULT_MIGRATE_MULTIFD_ZLIB_LEVEL), + DEFINE_PROP_UINT8("multifd-qatzip-level", MigrationState, + parameters.multifd_qatzip_level, + DEFAULT_MIGRATE_MULTIFD_QATZIP_LEVEL), DEFINE_PROP_UINT8("multifd-zstd-level", MigrationState, parameters.multifd_zstd_level, DEFAULT_MIGRATE_MULTIFD_ZSTD_LEVEL), @@ -179,6 +193,15 @@ Property migration_properties[] = { DEFINE_PROP_MIG_MODE("mode", MigrationState, parameters.mode, MIG_MODE_NORMAL), + DEFINE_PROP_ZERO_PAGE_DETECTION("zero-page-detection", MigrationState, + parameters.zero_page_detection, + ZERO_PAGE_DETECTION_LEGACY), + DEFINE_PROP_STRING("sev-pdh", MigrationState, parameters.sev_pdh), + DEFINE_PROP_STRING("sev-plat-cert", MigrationState, parameters.sev_plat_cert), + DEFINE_PROP_STRING("sev-amd-cert", MigrationState, parameters.sev_amd_cert), + DEFINE_PROP_UINT8("hdbss-buffer-size", MigrationState, + parameters.hdbss_buffer_size, + DEFAULT_HDBSS_BUFFER_SIZE), /* Migration capabilities */ DEFINE_PROP_MIG_CAP("x-xbzrle", MIGRATION_CAPABILITY_XBZRLE), @@ -795,6 +818,15 @@ int migrate_decompress_threads(void) return s->parameters.decompress_threads; } +CompressMethod migrate_compress_method(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->parameters.compress_method; +} + uint64_t migrate_downtime_limit(void) { MigrationState *s = migrate_get_current(); @@ -837,6 +869,13 @@ MigMode migrate_mode(void) return s->parameters.mode; } +int migrate_hdbss_buffer_size(void) +{ + MigrationState *s = migrate_get_current(); + + return s->parameters.hdbss_buffer_size; +} + int migrate_multifd_channels(void) { MigrationState *s = migrate_get_current(); @@ -859,6 +898,13 @@ int migrate_multifd_zlib_level(void) return s->parameters.multifd_zlib_level; } +int migrate_multifd_qatzip_level(void) +{ + MigrationState *s = migrate_get_current(); + + return s->parameters.multifd_qatzip_level; +} + int migrate_multifd_zstd_level(void) { MigrationState *s = migrate_get_current(); @@ -901,6 +947,13 @@ uint64_t migrate_xbzrle_cache_size(void) return s->parameters.xbzrle_cache_size; } +ZeroPageDetection 
migrate_zero_page_detection(void) +{ + MigrationState *s = migrate_get_current(); + + return s->parameters.zero_page_detection; +} + /* parameter setters */ void migrate_set_block_incremental(bool value) @@ -953,6 +1006,8 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp) params->compress_wait_thread = s->parameters.compress_wait_thread; params->has_decompress_threads = true; params->decompress_threads = s->parameters.decompress_threads; + params->has_compress_method = true; + params->compress_method = s->parameters.compress_method; params->has_throttle_trigger_threshold = true; params->throttle_trigger_threshold = s->parameters.throttle_trigger_threshold; params->has_cpu_throttle_initial = true; @@ -981,6 +1036,8 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp) params->multifd_compression = s->parameters.multifd_compression; params->has_multifd_zlib_level = true; params->multifd_zlib_level = s->parameters.multifd_zlib_level; + params->has_multifd_qatzip_level = true; + params->multifd_qatzip_level = s->parameters.multifd_qatzip_level; params->has_multifd_zstd_level = true; params->multifd_zstd_level = s->parameters.multifd_zstd_level; params->has_xbzrle_cache_size = true; @@ -997,6 +1054,9 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp) params->announce_rounds = s->parameters.announce_rounds; params->has_announce_step = true; params->announce_step = s->parameters.announce_step; + params->sev_pdh = g_strdup(s->parameters.sev_pdh); + params->sev_plat_cert = g_strdup(s->parameters.sev_plat_cert); + params->sev_amd_cert = g_strdup(s->parameters.sev_amd_cert); if (s->parameters.has_block_bitmap_mapping) { params->has_block_bitmap_mapping = true; @@ -1011,6 +1071,10 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp) params->vcpu_dirty_limit = s->parameters.vcpu_dirty_limit; params->has_mode = true; params->mode = s->parameters.mode; + params->has_zero_page_detection = true; + params->zero_page_detection = s->parameters.zero_page_detection; + params->has_hdbss_buffer_size = true; + params->hdbss_buffer_size = s->parameters.hdbss_buffer_size; return params; } @@ -1025,6 +1089,7 @@ void migrate_params_init(MigrationParameters *params) params->has_compress_threads = true; params->has_compress_wait_thread = true; params->has_decompress_threads = true; + params->has_compress_method = true; params->has_throttle_trigger_threshold = true; params->has_cpu_throttle_initial = true; params->has_cpu_throttle_increment = true; @@ -1036,6 +1101,7 @@ void migrate_params_init(MigrationParameters *params) params->has_multifd_channels = true; params->has_multifd_compression = true; params->has_multifd_zlib_level = true; + params->has_multifd_qatzip_level = true; params->has_multifd_zstd_level = true; params->has_xbzrle_cache_size = true; params->has_max_postcopy_bandwidth = true; @@ -1047,6 +1113,39 @@ void migrate_params_init(MigrationParameters *params) params->has_x_vcpu_dirty_limit_period = true; params->has_vcpu_dirty_limit = true; params->has_mode = true; + params->has_zero_page_detection = true; + params->has_hdbss_buffer_size = true; + + params->sev_pdh = g_strdup(""); + params->sev_plat_cert = g_strdup(""); + params->sev_amd_cert = g_strdup(""); +} + +static bool compress_level_check(MigrationParameters *params, Error **errp) +{ + switch (params->compress_method) { + case COMPRESS_METHOD_ZLIB: + if (params->compress_level > 9 || params->compress_level < 1) { + error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "compress_level", + "a value in 
the range of 1 to 9 for Zlib method"); + return false; + } + break; +#ifdef CONFIG_ZSTD + case COMPRESS_METHOD_ZSTD: + if (params->compress_level > 19 || params->compress_level < 1) { + error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "compress_level", + "a value in the range of 1 to 19 for Zstd method"); + return false; + } + break; +#endif + default: + error_setg(errp, "compress_level check failed: unknown compression method"); + return false; + } + + return true; } /* @@ -1055,10 +1154,7 @@ void migrate_params_init(MigrationParameters *params) */ bool migrate_params_check(MigrationParameters *params, Error **errp) { - if (params->has_compress_level && - (params->compress_level > 9)) { - error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "compress_level", - "a value between 0 and 9"); + if (params->has_compress_level && !compress_level_check(params, errp)) { return false; } @@ -1145,6 +1241,14 @@ bool migrate_params_check(MigrationParameters *params, Error **errp) return false; } + if (params->has_multifd_qatzip_level && + ((params->multifd_qatzip_level > 9) || + (params->multifd_qatzip_level < 1))) { + error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "multifd_qatzip_level", + "a value between 1 and 9"); + return false; + } + if (params->has_multifd_zstd_level && (params->multifd_zstd_level > 20)) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "multifd_zstd_level", @@ -1259,6 +1363,10 @@ static void migrate_params_test_apply(MigrateSetParameters *params, dest->decompress_threads = params->decompress_threads; } + if (params->has_compress_method) { + dest->compress_method = params->compress_method; + } + if (params->has_throttle_trigger_threshold) { dest->throttle_trigger_threshold = params->throttle_trigger_threshold; } @@ -1310,6 +1418,15 @@ static void migrate_params_test_apply(MigrateSetParameters *params, if (params->has_multifd_compression) { dest->multifd_compression = params->multifd_compression; } + if (params->has_multifd_qatzip_level) { + dest->multifd_qatzip_level = params->multifd_qatzip_level; + } + if (params->has_multifd_zlib_level) { + dest->multifd_zlib_level = params->multifd_zlib_level; + } + if (params->has_multifd_zstd_level) { + dest->multifd_zstd_level = params->multifd_zstd_level; + } if (params->has_xbzrle_cache_size) { dest->xbzrle_cache_size = params->xbzrle_cache_size; } @@ -1348,6 +1465,27 @@ static void migrate_params_test_apply(MigrateSetParameters *params, if (params->has_mode) { dest->mode = params->mode; } + + if (params->has_zero_page_detection) { + dest->zero_page_detection = params->zero_page_detection; + } + + if (params->sev_pdh) { + assert(params->sev_pdh->type == QTYPE_QSTRING); + dest->sev_pdh = params->sev_pdh->u.s; + } + if (params->sev_plat_cert) { + assert(params->sev_plat_cert->type == QTYPE_QSTRING); + dest->sev_plat_cert = params->sev_plat_cert->u.s; + } + if (params->sev_amd_cert) { + assert(params->sev_amd_cert->type == QTYPE_QSTRING); + dest->sev_amd_cert = params->sev_amd_cert->u.s; + } + + if (params->has_hdbss_buffer_size) { + dest->hdbss_buffer_size = params->hdbss_buffer_size; + } } static void migrate_params_apply(MigrateSetParameters *params, Error **errp) @@ -1380,6 +1518,10 @@ static void migrate_params_apply(MigrateSetParameters *params, Error **errp) s->parameters.decompress_threads = params->decompress_threads; } + if (params->has_compress_method) { + s->parameters.compress_method = params->compress_method; + } + if (params->has_throttle_trigger_threshold) { s->parameters.throttle_trigger_threshold = params->throttle_trigger_threshold; }
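The dispatch in compress_level_check() above exists because each compressor accepts a different level range (1..9 for zlib, 1..19 for zstd; the multifd QATzip level is bounded to 1..9 separately). A minimal standalone sketch of that per-method range check, with illustrative names rather than the patch's actual types:

```c
/*
 * Minimal sketch (not QEMU code): validate a compression level against
 * the range of the selected method, mirroring the zlib (1..9) and
 * zstd (1..19) bounds enforced by compress_level_check() above.
 */
#include <stdbool.h>
#include <stdio.h>

typedef enum { METHOD_ZLIB, METHOD_ZSTD } Method;

static bool level_in_range(Method m, int level)
{
    switch (m) {
    case METHOD_ZLIB:
        return level >= 1 && level <= 9;
    case METHOD_ZSTD:
        return level >= 1 && level <= 19;
    default:
        return false; /* unknown method: reject, as the patch does */
    }
}

int main(void)
{
    printf("%d\n", level_in_range(METHOD_ZSTD, 15)); /* 1: valid for zstd */
    printf("%d\n", level_in_range(METHOD_ZLIB, 15)); /* 0: zlib tops out at 9 */
    return 0;
}
```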
@@ -1445,6 +1587,15 @@ static void migrate_params_apply(MigrateSetParameters *params, Error **errp) if (params->has_multifd_compression) { s->parameters.multifd_compression = params->multifd_compression; } + if (params->has_multifd_qatzip_level) { + s->parameters.multifd_qatzip_level = params->multifd_qatzip_level; + } + if (params->has_multifd_zlib_level) { + s->parameters.multifd_zlib_level = params->multifd_zlib_level; + } + if (params->has_multifd_zstd_level) { + s->parameters.multifd_zstd_level = params->multifd_zstd_level; + } if (params->has_xbzrle_cache_size) { s->parameters.xbzrle_cache_size = params->xbzrle_cache_size; xbzrle_cache_resize(params->xbzrle_cache_size, errp); @@ -1492,6 +1643,30 @@ static void migrate_params_apply(MigrateSetParameters *params, Error **errp) if (params->has_mode) { s->parameters.mode = params->mode; } + + if (params->has_zero_page_detection) { + s->parameters.zero_page_detection = params->zero_page_detection; + } + + if (params->sev_pdh) { + g_free(s->parameters.sev_pdh); + assert(params->sev_pdh->type == QTYPE_QSTRING); + s->parameters.sev_pdh = g_strdup(params->sev_pdh->u.s); + } + if (params->sev_plat_cert) { + g_free(s->parameters.sev_plat_cert); + assert(params->sev_plat_cert->type == QTYPE_QSTRING); + s->parameters.sev_plat_cert = g_strdup(params->sev_plat_cert->u.s); + } + if (params->sev_amd_cert) { + g_free(s->parameters.sev_amd_cert); + assert(params->sev_amd_cert->type == QTYPE_QSTRING); + s->parameters.sev_amd_cert = g_strdup(params->sev_amd_cert->u.s); + } + + if (params->has_hdbss_buffer_size) { + s->parameters.hdbss_buffer_size = params->hdbss_buffer_size; + } } void qmp_migrate_set_parameters(MigrateSetParameters *params, Error **errp) @@ -1517,6 +1692,27 @@ void qmp_migrate_set_parameters(MigrateSetParameters *params, Error **errp) params->tls_authz->type = QTYPE_QSTRING; params->tls_authz->u.s = strdup(""); } + /* TODO Rewrite "" to null instead */ + if (params->sev_pdh + && params->sev_pdh->type == QTYPE_QNULL) { + qobject_unref(params->sev_pdh->u.n); + params->sev_pdh->type = QTYPE_QSTRING; + params->sev_pdh->u.s = strdup(""); + } + /* TODO Rewrite "" to null instead */ + if (params->sev_plat_cert + && params->sev_plat_cert->type == QTYPE_QNULL) { + qobject_unref(params->sev_plat_cert->u.n); + params->sev_plat_cert->type = QTYPE_QSTRING; + params->sev_plat_cert->u.s = strdup(""); + } + /* TODO Rewrite "" to null instead */ + if (params->sev_amd_cert + && params->sev_amd_cert->type == QTYPE_QNULL) { + qobject_unref(params->sev_amd_cert->u.n); + params->sev_amd_cert->type = QTYPE_QSTRING; + params->sev_amd_cert->u.s = strdup(""); + } migrate_params_test_apply(params, &tmp); diff --git a/migration/options.h b/migration/options.h index 246c160aeeea4c75f104e66a2008004d0ffd7693..6b2a893217f0458018d70a17d7655c86f396a03e 100644 --- a/migration/options.h +++ b/migration/options.h @@ -78,21 +78,25 @@ uint8_t migrate_cpu_throttle_increment(void); uint8_t migrate_cpu_throttle_initial(void); bool migrate_cpu_throttle_tailslow(void); int migrate_decompress_threads(void); +CompressMethod migrate_compress_method(void); uint64_t migrate_downtime_limit(void); uint8_t migrate_max_cpu_throttle(void); uint64_t migrate_max_bandwidth(void); uint64_t migrate_avail_switchover_bandwidth(void); uint64_t migrate_max_postcopy_bandwidth(void); MigMode migrate_mode(void); +int migrate_hdbss_buffer_size(void); int migrate_multifd_channels(void); MultiFDCompression migrate_multifd_compression(void); int migrate_multifd_zlib_level(void); +int 
migrate_multifd_qatzip_level(void); int migrate_multifd_zstd_level(void); uint8_t migrate_throttle_trigger_threshold(void); const char *migrate_tls_authz(void); const char *migrate_tls_creds(void); const char *migrate_tls_hostname(void); uint64_t migrate_xbzrle_cache_size(void); +ZeroPageDetection migrate_zero_page_detection(void); /* parameters setters */ diff --git a/migration/qemu-file.c b/migration/qemu-file.c index 94231ff2955c80b3d0fab11a40510d34c334a826..bd1dbc3db175ff75e08ef65fac7978ac802dafb7 100644 --- a/migration/qemu-file.c +++ b/migration/qemu-file.c @@ -669,55 +669,6 @@ uint64_t qemu_get_be64(QEMUFile *f) return v; } -/* return the size after compression, or negative value on error */ -static int qemu_compress_data(z_stream *stream, uint8_t *dest, size_t dest_len, - const uint8_t *source, size_t source_len) -{ - int err; - - err = deflateReset(stream); - if (err != Z_OK) { - return -1; - } - - stream->avail_in = source_len; - stream->next_in = (uint8_t *)source; - stream->avail_out = dest_len; - stream->next_out = dest; - - err = deflate(stream, Z_FINISH); - if (err != Z_STREAM_END) { - return -1; - } - - return stream->next_out - dest; -} - -/* Compress size bytes of data start at p and store the compressed - * data to the buffer of f. - * - * Since the file is dummy file with empty_ops, return -1 if f has no space to - * save the compressed data. - */ -ssize_t qemu_put_compression_data(QEMUFile *f, z_stream *stream, - const uint8_t *p, size_t size) -{ - ssize_t blen = IO_BUF_SIZE - f->buf_index - sizeof(int32_t); - - if (blen < compressBound(size)) { - return -1; - } - - blen = qemu_compress_data(stream, f->buf + f->buf_index + sizeof(int32_t), - blen, p, size); - if (blen < 0) { - return -1; - } - - qemu_put_be32(f, blen); - add_buf_to_iovec(f, blen); - return blen + sizeof(int32_t); -} /* Put the data in the buffer of f_src to the buffer of f_des, and * then reset the buf_index of f_src to 0. 
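The hunk above removes the monolithic qemu_compress_data()/qemu_put_compression_data() pair; the split qemu_put_compress_start()/qemu_put_compress_end() API added in the next hunk instead hands the caller a pointer into the QEMUFile buffer just past a reserved 4-byte length slot, and the caller commits the big-endian length once compression has finished. A standalone sketch of that reserve-then-commit pattern using plain zlib (buffer handling here is illustrative, not the patch's code):

```c
/*
 * Standalone sketch of the reserve-then-commit pattern that the split
 * API enables: leave a 4-byte slot for the big-endian length, compress
 * straight into the buffer, then fill the slot in afterwards.
 */
#include <arpa/inet.h>   /* htonl */
#include <stdint.h>
#include <string.h>
#include <zlib.h>

/* Returns total bytes written to out (4-byte header + payload), or -1. */
static long put_compressed(uint8_t *out, size_t out_cap,
                           const uint8_t *page, size_t page_len)
{
    uint8_t *dest = out + sizeof(uint32_t);          /* skip the length slot */
    uLongf clen = out_cap - sizeof(uint32_t);

    if (clen < compressBound(page_len)) {
        return -1;                                   /* not enough room */
    }
    if (compress2(dest, &clen, page, page_len, 1) != Z_OK) {
        return -1;
    }
    uint32_t be_len = htonl((uint32_t)clen);         /* commit the header */
    memcpy(out, &be_len, sizeof(be_len));
    return (long)(clen + sizeof(uint32_t));
}
```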
@@ -834,3 +785,15 @@ int qemu_file_get_to_fd(QEMUFile *f, int fd, size_t size) return 0; } + +ssize_t qemu_put_compress_start(QEMUFile *f, uint8_t **dest_ptr) +{ + *dest_ptr = f->buf + f->buf_index + sizeof(int32_t); + return IO_BUF_SIZE - f->buf_index - sizeof(int32_t); +} + +void qemu_put_compress_end(QEMUFile *f, unsigned int v) +{ + qemu_put_be32(f, v); + add_buf_to_iovec(f, v); +} diff --git a/migration/qemu-file.h b/migration/qemu-file.h index 8aec9fabf7f15bc05ef8bc31eef8928ec36827b8..8afa95732bc8b4ba59792c599c699d95585c9448 100644 --- a/migration/qemu-file.h +++ b/migration/qemu-file.h @@ -54,8 +54,8 @@ void qemu_put_buffer_async(QEMUFile *f, const uint8_t *buf, size_t size, size_t coroutine_mixed_fn qemu_peek_buffer(QEMUFile *f, uint8_t **buf, size_t size, size_t offset); size_t coroutine_mixed_fn qemu_get_buffer_in_place(QEMUFile *f, uint8_t **buf, size_t size); -ssize_t qemu_put_compression_data(QEMUFile *f, z_stream *stream, - const uint8_t *p, size_t size); +ssize_t qemu_put_compress_start(QEMUFile *f, uint8_t **dest_ptr); +void qemu_put_compress_end(QEMUFile *f, unsigned int v); int qemu_put_qemu_file(QEMUFile *f_des, QEMUFile *f_src); bool qemu_file_buffer_empty(QEMUFile *file); diff --git a/migration/ram-compress.c b/migration/ram-compress.c index fa4388f6a61cc420bd2450038d7410a6a4e5694a..74703f0ec4593f1cd60b8d497f6e63383f68e0ee 100644 --- a/migration/ram-compress.c +++ b/migration/ram-compress.c @@ -28,7 +28,6 @@ #include "qemu/osdep.h" #include "qemu/cutils.h" - #include "ram-compress.h" #include "qemu/error-report.h" @@ -40,6 +39,7 @@ #include "exec/ramblock.h" #include "ram.h" #include "migration-stats.h" +#include "exec/ram_addr.h" static struct { int64_t pages; @@ -65,46 +65,293 @@ static QemuThread *compress_threads; static QemuMutex comp_done_lock; static QemuCond comp_done_cond; -struct DecompressParam { - bool done; - bool quit; - QemuMutex mutex; - QemuCond cond; - void *des; - uint8_t *compbuf; - int len; - z_stream stream; -}; -typedef struct DecompressParam DecompressParam; - static QEMUFile *decomp_file; static DecompressParam *decomp_param; static QemuThread *decompress_threads; +MigrationCompressOps *compress_ops; +MigrationDecompressOps *decompress_ops; static QemuMutex decomp_done_lock; static QemuCond decomp_done_cond; -static CompressResult do_compress_ram_page(QEMUFile *f, z_stream *stream, - RAMBlock *block, ram_addr_t offset, - uint8_t *source_buf); +static CompressResult do_compress_ram_page(CompressParam *param, RAMBlock *block); + +static int zlib_save_setup(CompressParam *param) +{ + if (deflateInit(&param->stream, + migrate_compress_level()) != Z_OK) { + return -1; + } + + return 0; +} + +static ssize_t zlib_compress_data(CompressParam *param, size_t size) +{ + int err; + uint8_t *dest = NULL; + z_stream *stream = &param->stream; + uint8_t *p = param->originbuf; + QEMUFile *f = param->file; + ssize_t blen = qemu_put_compress_start(f, &dest); + + if (blen < compressBound(size)) { + return -1; + } + + err = deflateReset(stream); + if (err != Z_OK) { + return -1; + } + + stream->avail_in = size; + stream->next_in = p; + stream->avail_out = blen; + stream->next_out = dest; + + err = deflate(stream, Z_FINISH); + if (err != Z_STREAM_END) { + return -1; + } + + blen = stream->next_out - dest; + if (blen < 0) { + return -1; + } + + qemu_put_compress_end(f, blen); + return blen + sizeof(int32_t); +} + +static void zlib_save_cleanup(CompressParam *param) +{ + deflateEnd(&param->stream); +} + +static int zlib_load_setup(DecompressParam *param) +{
+ if (inflateInit(&param->stream) != Z_OK) { + return -1; + } + + return 0; +} + +static int +zlib_decompress_data(DecompressParam *param, uint8_t *dest, size_t size) +{ + int err; + + z_stream *stream = &param->stream; + + err = inflateReset(stream); + if (err != Z_OK) { + return -1; + } + + stream->avail_in = param->len; + stream->next_in = param->compbuf; + stream->avail_out = size; + stream->next_out = dest; + + err = inflate(stream, Z_NO_FLUSH); + if (err != Z_STREAM_END) { + return -1; + } + + return stream->total_out; +} + +static void zlib_load_cleanup(DecompressParam *param) +{ + inflateEnd(&param->stream); +} + +static int zlib_check_len(int len) +{ + return len < 0 || len > compressBound(TARGET_PAGE_SIZE); +} + +#ifdef CONFIG_ZSTD +static int zstd_save_setup(CompressParam *param) +{ + int res; + param->zstd_cs = ZSTD_createCStream(); + if (!param->zstd_cs) { + return -1; + } + res = ZSTD_initCStream(param->zstd_cs, migrate_compress_level()); + if (ZSTD_isError(res)) { + return -1; + } + return 0; +} +static void zstd_save_cleanup(CompressParam *param) +{ + ZSTD_freeCStream(param->zstd_cs); + param->zstd_cs = NULL; +} +static ssize_t zstd_compress_data(CompressParam *param, size_t size) +{ + int ret; + uint8_t *dest = NULL; + uint8_t *p = param->originbuf; + QEMUFile *f = param->file; + ssize_t blen = qemu_put_compress_start(f, &dest); + if (blen < ZSTD_compressBound(size)) { + return -1; + } + param->out.dst = dest; + param->out.size = blen; + param->out.pos = 0; + param->in.src = p; + param->in.size = size; + param->in.pos = 0; + do { + ret = ZSTD_compressStream2(param->zstd_cs, &param->out, + &param->in, ZSTD_e_end); + } while (ret > 0 && (param->in.size - param->in.pos > 0) + && (param->out.size - param->out.pos > 0)); + if (ret > 0 && (param->in.size - param->in.pos > 0)) { + return -1; + } + if (ZSTD_isError(ret)) { + return -1; + } + blen = param->out.pos; + qemu_put_compress_end(f, blen); + return blen + sizeof(int32_t); +} + +static int zstd_load_setup(DecompressParam *param) +{ + int ret; + param->zstd_ds = ZSTD_createDStream(); + if (!param->zstd_ds) { + return -1; + } + ret = ZSTD_initDStream(param->zstd_ds); + if (ZSTD_isError(ret)) { + return -1; + } + return 0; +} +static void zstd_load_cleanup(DecompressParam *param) +{ + ZSTD_freeDStream(param->zstd_ds); + param->zstd_ds = NULL; +} +static int +zstd_decompress_data(DecompressParam *param, uint8_t *dest, size_t size) +{ + int ret; + param->out.dst = dest; + param->out.size = size; + param->out.pos = 0; + param->in.src = param->compbuf; + param->in.size = param->len; + param->in.pos = 0; + do { + ret = ZSTD_decompressStream(param->zstd_ds, &param->out, &param->in); + } while (ret > 0 && (param->in.size - param->in.pos > 0) + && (param->out.size - param->out.pos > 0)); + if (ret > 0 && (param->in.size - param->in.pos > 0)) { + return -1; + } + if (ZSTD_isError(ret)) { + return -1; + } + return ret; +} +static int zstd_check_len(int len) +{ + return len < 0 || len > ZSTD_compressBound(TARGET_PAGE_SIZE); +} +#endif + +static int set_compress_ops(void) +{ + compress_ops = g_new0(MigrationCompressOps, 1); + + switch (migrate_compress_method()) { + case COMPRESS_METHOD_ZLIB: + compress_ops->save_setup = zlib_save_setup; + compress_ops->save_cleanup = zlib_save_cleanup; + compress_ops->compress_data = zlib_compress_data; + break; +#ifdef CONFIG_ZSTD + case COMPRESS_METHOD_ZSTD: + compress_ops->save_setup = zstd_save_setup; + compress_ops->save_cleanup = zstd_save_cleanup; + compress_ops->compress_data = zstd_compress_data; + break; +#endif + default:
+ return -1; + } + + return 0; +} + +static int set_decompress_ops(void) +{ + decompress_ops = g_new0(MigrationDecompressOps, 1); + + switch (migrate_compress_method()) { + case COMPRESS_METHOD_ZLIB: + decompress_ops->load_setup = zlib_load_setup; + decompress_ops->load_cleanup = zlib_load_cleanup; + decompress_ops->decompress_data = zlib_decompress_data; + decompress_ops->check_len = zlib_check_len; + break; +#ifdef CONFIG_ZSTD + case COMPRESS_METHOD_ZSTD: + decompress_ops->load_setup = zstd_load_setup; + decompress_ops->load_cleanup = zstd_load_cleanup; + decompress_ops->decompress_data = zstd_decompress_data; + decompress_ops->check_len = zstd_check_len; + break; +#endif + default: + return -1; + } + + return 0; +} + +static void clean_compress_ops(void) +{ + compress_ops->save_setup = NULL; + compress_ops->save_cleanup = NULL; + compress_ops->compress_data = NULL; + + g_free(compress_ops); + compress_ops = NULL; +} + +static void clean_decompress_ops(void) +{ + decompress_ops->load_setup = NULL; + decompress_ops->load_cleanup = NULL; + decompress_ops->decompress_data = NULL; + + g_free(decompress_ops); + decompress_ops = NULL; +} static void *do_data_compress(void *opaque) { CompressParam *param = opaque; RAMBlock *block; - ram_addr_t offset; CompressResult result; qemu_mutex_lock(&param->mutex); while (!param->quit) { if (param->trigger) { block = param->block; - offset = param->offset; param->trigger = false; qemu_mutex_unlock(&param->mutex); - result = do_compress_ram_page(param->file, &param->stream, - block, offset, param->originbuf); - + result = do_compress_ram_page(param, block); qemu_mutex_lock(&comp_done_lock); param->done = true; param->result = result; @@ -147,7 +394,7 @@ void compress_threads_save_cleanup(void) qemu_thread_join(compress_threads + i); qemu_mutex_destroy(&comp_param[i].mutex); qemu_cond_destroy(&comp_param[i].cond); - deflateEnd(&comp_param[i].stream); + compress_ops->save_cleanup(&comp_param[i]); g_free(comp_param[i].originbuf); qemu_fclose(comp_param[i].file); comp_param[i].file = NULL; @@ -158,6 +405,7 @@ void compress_threads_save_cleanup(void) g_free(comp_param); compress_threads = NULL; comp_param = NULL; + clean_compress_ops(); } int compress_threads_save_setup(void) @@ -167,6 +415,12 @@ int compress_threads_save_setup(void) if (!migrate_compress()) { return 0; } + + if (set_compress_ops() < 0) { + clean_compress_ops(); + return -1; + } + thread_count = migrate_compress_threads(); compress_threads = g_new0(QemuThread, thread_count); comp_param = g_new0(CompressParam, thread_count); @@ -178,8 +432,7 @@ int compress_threads_save_setup(void) goto exit; } - if (deflateInit(&comp_param[i].stream, - migrate_compress_level()) != Z_OK) { + if (compress_ops->save_setup(&comp_param[i]) < 0) { g_free(comp_param[i].originbuf); goto exit; } @@ -204,15 +457,13 @@ exit: return -1; } -static CompressResult do_compress_ram_page(QEMUFile *f, z_stream *stream, - RAMBlock *block, ram_addr_t offset, - uint8_t *source_buf) +static CompressResult do_compress_ram_page(CompressParam *param, RAMBlock *block) { - uint8_t *p = block->host + offset; + uint8_t *p = block->host + (param->offset & TARGET_PAGE_MASK); size_t page_size = qemu_target_page_size(); int ret; - assert(qemu_file_buffer_empty(f)); + assert(qemu_file_buffer_empty(param->file)); if (buffer_is_zero(p, page_size)) { return RES_ZEROPAGE; @@ -223,12 +474,12 @@ static CompressResult do_compress_ram_page(QEMUFile *f, z_stream *stream, * so that we can catch up the error during compression and * decompression */
- memcpy(source_buf, p, page_size); - ret = qemu_put_compression_data(f, stream, source_buf, page_size); + memcpy(param->originbuf, p, page_size); + ret = compress_ops->compress_data(param, page_size); if (ret < 0) { qemu_file_set_error(migrate_get_current()->to_dst_file, ret); error_report("compressed data failed!"); - qemu_fflush(f); + qemu_fflush(param->file); return RES_NONE; } return RES_COMPRESS; @@ -320,50 +571,23 @@ bool compress_page_with_multi_thread(RAMBlock *block, ram_addr_t offset, } } -/* return the size after decompression, or negative value on error */ -static int -qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len, - const uint8_t *source, size_t source_len) -{ - int err; - - err = inflateReset(stream); - if (err != Z_OK) { - return -1; - } - - stream->avail_in = source_len; - stream->next_in = (uint8_t *)source; - stream->avail_out = dest_len; - stream->next_out = dest; - - err = inflate(stream, Z_NO_FLUSH); - if (err != Z_STREAM_END) { - return -1; - } - - return stream->total_out; -} - static void *do_data_decompress(void *opaque) { DecompressParam *param = opaque; unsigned long pagesize; uint8_t *des; - int len, ret; + int ret; qemu_mutex_lock(&param->mutex); while (!param->quit) { if (param->des) { des = param->des; - len = param->len; param->des = 0; qemu_mutex_unlock(&param->mutex); pagesize = qemu_target_page_size(); - ret = qemu_uncompress_data(&param->stream, des, pagesize, - param->compbuf, len); + ret = decompress_ops->decompress_data(param, des, pagesize); if (ret < 0 && migrate_get_current()->decompress_error_check) { error_report("decompress data failed"); qemu_file_set_error(decomp_file, ret); @@ -431,7 +655,7 @@ void compress_threads_load_cleanup(void) qemu_thread_join(decompress_threads + i); qemu_mutex_destroy(&decomp_param[i].mutex); qemu_cond_destroy(&decomp_param[i].cond); - inflateEnd(&decomp_param[i].stream); + decompress_ops->load_cleanup(&decomp_param[i]); g_free(decomp_param[i].compbuf); decomp_param[i].compbuf = NULL; } @@ -440,6 +664,7 @@ void compress_threads_load_cleanup(void) decompress_threads = NULL; decomp_param = NULL; decomp_file = NULL; + clean_decompress_ops(); } int compress_threads_load_setup(QEMUFile *f) @@ -450,6 +675,11 @@ int compress_threads_load_setup(QEMUFile *f) return 0; } + if (set_decompress_ops() < 0) { + clean_decompress_ops(); + return -1; + } + /* * set compression_counters memory to zero for a new migration */ @@ -462,7 +692,7 @@ int compress_threads_load_setup(QEMUFile *f) qemu_cond_init(&decomp_done_cond); decomp_file = f; for (i = 0; i < thread_count; i++) { - if (inflateInit(&decomp_param[i].stream) != Z_OK) { + if (decompress_ops->load_setup(&decomp_param[i]) < 0) { goto exit; } diff --git a/migration/ram-compress.h b/migration/ram-compress.h index 0d89a2f55e763e3a4eedb2724600154a016b37e7..e8700eb36faa6eb020c9a2f212ad3549da3881a7 100644 --- a/migration/ram-compress.h +++ b/migration/ram-compress.h @@ -29,6 +29,10 @@ #ifndef QEMU_MIGRATION_COMPRESS_H #define QEMU_MIGRATION_COMPRESS_H +#ifdef CONFIG_ZSTD +#include +#include +#endif #include "qemu-file.h" #include "qapi/qapi-types-migration.h" @@ -39,6 +43,25 @@ enum CompressResult { }; typedef enum CompressResult CompressResult; +struct DecompressParam { + bool done; + bool quit; + QemuMutex mutex; + QemuCond cond; + void *des; + uint8_t *compbuf; + int len; + + /* for zlib compression */ + z_stream stream; +#ifdef CONFIG_ZSTD + ZSTD_DStream *zstd_ds; + ZSTD_inBuffer in; + ZSTD_outBuffer out; +#endif +}; +typedef struct DecompressParam DecompressParam; +
struct CompressParam { bool done; bool quit; @@ -51,11 +74,32 @@ struct CompressParam { ram_addr_t offset; /* internally used fields */ - z_stream stream; uint8_t *originbuf; + + /* for zlib compression */ + z_stream stream; + +#ifdef CONFIG_ZSTD + ZSTD_CStream *zstd_cs; + ZSTD_inBuffer in; + ZSTD_outBuffer out; +#endif }; typedef struct CompressParam CompressParam; +typedef struct { + int (*save_setup)(CompressParam *param); + void (*save_cleanup)(CompressParam *param); + ssize_t (*compress_data)(CompressParam *param, size_t size); +} MigrationCompressOps; + +typedef struct { + int (*load_setup)(DecompressParam *param); + void (*load_cleanup)(DecompressParam *param); + int (*decompress_data)(DecompressParam *param, uint8_t *dest, size_t size); + int (*check_len)(int len); +} MigrationDecompressOps; + void compress_threads_save_cleanup(void); int compress_threads_save_setup(void); diff --git a/migration/ram.c b/migration/ram.c index 8c7886ab797b8a91d9ecf060e8ca016bd436ae46..91bec89a6e80d74712594809a1cbe9dada709c89 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -39,6 +39,7 @@ #include "migration-stats.h" #include "migration/register.h" #include "migration/misc.h" +#include "migration/options.h" #include "qemu-file.h" #include "postcopy-ram.h" #include "page_cache.h" @@ -63,6 +64,12 @@ #include "options.h" #include "sysemu/dirtylimit.h" #include "sysemu/kvm.h" +#include "exec/confidential-guest-support.h" + +/* Defines RAM_SAVE_ENCRYPTED_PAGE and RAM_SAVE_SHARED_REGION_LIST */ +#include "target/i386/sev.h" +#include "target/i386/csv.h" +#include "sysemu/kvm.h" #include "hw/boards.h" /* for machine_dump_guest_core() */ @@ -92,10 +99,21 @@ /* 0x80 is reserved in rdma.h for RAM_SAVE_FLAG_HOOK */ #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100 #define RAM_SAVE_FLAG_MULTIFD_FLUSH 0x200 -/* We can't use any flag that is bigger than 0x200 */ +#define RAM_SAVE_FLAG_ENCRYPTED_DATA 0x400 + +bool memcrypt_enabled(void) +{ + MachineState *ms = MACHINE(qdev_get_machine()); + if(ms->cgs) + return ms->cgs->ready; + else + return false; +} XBZRLECacheStats xbzrle_counters; +extern MigrationDecompressOps *decompress_ops; + /* used by the search for pages to send */ struct PageSearchStatus { /* The migration channel used for a specific host page */ @@ -257,6 +275,10 @@ void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr, nr); } +void ramblock_recv_bitmap_set_offset(RAMBlock *rb, uint64_t byte_offset) +{ + set_bit_atomic(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap); +} #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL) /* @@ -1123,6 +1145,10 @@ static int save_zero_page(RAMState *rs, PageSearchStatus *pss, QEMUFile *file = pss->pss_channel; int len = 0; + if (migrate_zero_page_detection() == ZERO_PAGE_DETECTION_NONE) { + return 0; + } + if (!buffer_is_zero(p, TARGET_PAGE_SIZE)) { return 0; } @@ -1204,6 +1230,125 @@ static int save_normal_page(PageSearchStatus *pss, RAMBlock *block, return 1; } +/** + * ram_save_encrypted_page - send the given encrypted page to the stream + */ +static int ram_save_encrypted_page(RAMState *rs, PageSearchStatus *pss) +{ + QEMUFile *file = pss->pss_channel; + int ret; + uint8_t *p; + RAMBlock *block = pss->block; + ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; + uint64_t bytes_xmit = 0; + MachineState *ms = MACHINE(qdev_get_machine()); + ConfidentialGuestSupportClass *cgs_class = + (ConfidentialGuestSupportClass *) object_get_class(OBJECT(ms->cgs)); + struct ConfidentialGuestMemoryEncryptionOps *ops = + 
cgs_class->memory_encryption_ops; + + p = block->host + offset; + trace_ram_save_page(block->idstr, (uint64_t)offset, p); + + ram_transferred_add(save_page_header(pss, file, block, + offset | RAM_SAVE_FLAG_ENCRYPTED_DATA)); + qemu_put_be32(file, RAM_SAVE_ENCRYPTED_PAGE); + ret = ops->save_outgoing_page(file, p, TARGET_PAGE_SIZE, &bytes_xmit); + if (ret) { + return -1; + } + ram_transferred_add(4 + bytes_xmit); + stat64_add(&mig_stats.normal_pages, 1); + + return 1; +} + +/** + * ram_save_shared_region_list: send the shared region list + */ +static int ram_save_shared_region_list(RAMState *rs, QEMUFile *f) +{ + int ret; + uint64_t bytes_xmit = 0; + PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY]; + MachineState *ms = MACHINE(qdev_get_machine()); + ConfidentialGuestSupportClass *cgs_class = + (ConfidentialGuestSupportClass *) object_get_class(OBJECT(ms->cgs)); + struct ConfidentialGuestMemoryEncryptionOps *ops = + cgs_class->memory_encryption_ops; + + ram_transferred_add(save_page_header(pss, f, + pss->last_sent_block, + RAM_SAVE_FLAG_ENCRYPTED_DATA)); + qemu_put_be32(f, RAM_SAVE_SHARED_REGIONS_LIST); + ret = ops->save_outgoing_shared_regions_list(f, &bytes_xmit); + if (ret < 0) { + return ret; + } + ram_transferred_add(4 + bytes_xmit); + + return 0; +} + +/** + * ram_save_encrypted_cpu_state: send the encrypted cpu state + */ +static int ram_save_encrypted_cpu_state(RAMState *rs, QEMUFile *f) +{ + int ret; + uint64_t bytes_xmit = 0; + PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY]; + MachineState *ms = MACHINE(qdev_get_machine()); + ConfidentialGuestSupportClass *cgs_class = + (ConfidentialGuestSupportClass *) object_get_class(OBJECT(ms->cgs)); + struct ConfidentialGuestMemoryEncryptionOps *ops = + cgs_class->memory_encryption_ops; + + ram_transferred_add(save_page_header(pss, f, + pss->last_sent_block, + RAM_SAVE_FLAG_ENCRYPTED_DATA)); + qemu_put_be32(f, RAM_SAVE_ENCRYPTED_CPU_STATE); + ret = ops->save_outgoing_cpu_state(f, &bytes_xmit); + if (ret < 0) { + return ret; + } + ram_transferred_add(4 + bytes_xmit); + + return 0; +} + +static int load_encrypted_data(QEMUFile *f, uint8_t *ptr) +{ + MachineState *ms = MACHINE(qdev_get_machine()); + ConfidentialGuestSupportClass *cgs_class = + (ConfidentialGuestSupportClass *) object_get_class(OBJECT(ms->cgs)); + struct ConfidentialGuestMemoryEncryptionOps *ops = + cgs_class->memory_encryption_ops; + + int flag; + + flag = qemu_get_be32(f); + + if (flag == RAM_SAVE_ENCRYPTED_PAGE) { + return ops->load_incoming_page(f, ptr); + } else if (flag == RAM_SAVE_SHARED_REGIONS_LIST) { + return ops->load_incoming_shared_regions_list(f); + } else if (flag == RAM_SAVE_ENCRYPTED_PAGE_BATCH) { + return ops->queue_incoming_page(f, ptr); + } else if (flag == RAM_SAVE_ENCRYPTED_PAGE_BATCH_END) { + if (ops->queue_incoming_page(f, ptr)) { + error_report("Failed to queue incoming data"); + return -EINVAL; + } + return ops->load_queued_incoming_pages(f); + } else if (flag == RAM_SAVE_ENCRYPTED_CPU_STATE) { + return ops->load_incoming_cpu_state(f); + } else { + error_report("unknown encrypted flag %x", flag); + return 1; + } +} + /** * ram_save_page: send the given page to the stream * @@ -1250,13 +1395,11 @@ static int ram_save_page(RAMState *rs, PageSearchStatus *pss) return pages; } -static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block, - ram_addr_t offset) +static int ram_save_multifd_page(RAMBlock *block, ram_addr_t offset) { - if (multifd_queue_page(file, block, offset) < 0) { + if (!multifd_queue_page(block, offset)) { return -1; } - 
stat64_add(&mig_stats.normal_pages, 1); return 1; } @@ -1336,7 +1479,7 @@ static int find_dirty_block(RAMState *rs, PageSearchStatus *pss) if (migrate_multifd() && !migrate_multifd_flush_after_each_section()) { QEMUFile *f = rs->pss[RAM_CHANNEL_PRECOPY].pss_channel; - int ret = multifd_send_sync_main(f); + int ret = multifd_send_sync_main(); if (ret < 0) { return ret; } @@ -2034,6 +2177,56 @@ static bool save_compress_page(RAMState *rs, PageSearchStatus *pss, compress_send_queued_data); } +/** + * encrypted_test_list: check if the page is encrypted + * + * Returns a bool indicating whether the page is encrypted. + */ +static bool encrypted_test_list(RAMState *rs, RAMBlock *block, + unsigned long page) +{ + MachineState *ms = MACHINE(qdev_get_machine()); + ConfidentialGuestSupportClass *cgs_class = + (ConfidentialGuestSupportClass *) object_get_class(OBJECT(ms->cgs)); + struct ConfidentialGuestMemoryEncryptionOps *ops = + cgs_class->memory_encryption_ops; + unsigned long gfn; + hwaddr paddr = 0; + int ret; + + /* ROM devices contains the unencrypted data */ + if (memory_region_is_rom(block->mr)) { + return false; + } + + if (!strcmp(memory_region_name(block->mr), "system.flash0")) { + return true; + } + + if (!strcmp(memory_region_name(block->mr), "system.flash1")) { + return false; + } + + if (!strcmp(memory_region_name(block->mr), "vga.vram")) { + return false; + } + + /* + * Translate page in ram_addr_t address space to GPA address + * space using memory region. + */ + if (kvm_enabled()) { + ret = kvm_physical_memory_addr_from_host(kvm_state, + block->host + (page << TARGET_PAGE_BITS), &paddr); + if (ret == 0) { + return false; + } + } + gfn = paddr >> TARGET_PAGE_BITS; + + return ops->is_gfn_in_unshared_region(gfn); +} + /** * ram_save_target_page_legacy: save one target page * @@ -2044,7 +2237,6 @@ static bool save_compress_page(RAMState *rs, PageSearchStatus *pss, */ static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss) { - RAMBlock *block = pss->block; ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; int res; @@ -2052,6 +2244,17 @@ static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss) return res; } + /* + * If memory encryption is enabled then use memory encryption APIs + * to write the outgoing buffer to the wire. The encryption APIs + * will take care of accessing the guest memory and re-encrypt it + * for the transport purposes. + */ + if (memcrypt_enabled() && + encrypted_test_list(rs, pss->block, pss->page)) { + return ram_save_encrypted_page(rs, pss); + } + if (save_compress_page(rs, pss, offset)) { return 1; } @@ -2060,17 +2263,33 @@ static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss) return 1; } + return ram_save_page(rs, pss); +} + +/** + * ram_save_target_page_multifd: send one target page to multifd workers + * + * Returns 1 if the page was queued, -1 otherwise. + * + * @rs: current RAM state + * @pss: data about the page we want to send + */ +static int ram_save_target_page_multifd(RAMState *rs, PageSearchStatus *pss) +{ + RAMBlock *block = pss->block; + ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; + /* - * Do not use multifd in postcopy as one whole host page should be - * placed. Meanwhile postcopy requires atomic update of pages, so even - * if host page size == guest page size the dest guest during run may - * still see partially copied pages which is data corruption. 
+/**
+ * ram_save_target_page_multifd: send one target page to multifd workers
+ *
+ * Returns 1 if the page was queued, -1 otherwise.
+ *
+ * @rs: current RAM state
+ * @pss: data about the page we want to send
+ */
+static int ram_save_target_page_multifd(RAMState *rs, PageSearchStatus *pss)
+{
+    RAMBlock *block = pss->block;
+    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
+
     /*
-     * Do not use multifd in postcopy as one whole host page should be
-     * placed. Meanwhile postcopy requires atomic update of pages, so even
-     * if host page size == guest page size the dest guest during run may
-     * still see partially copied pages which is data corruption.
+     * While using multifd live migration, we still need to handle zero
+     * page checking on the migration main thread.
      */
-    if (migrate_multifd() && !migration_in_postcopy()) {
-        return ram_save_multifd_page(pss->pss_channel, block, offset);
+    if (migrate_zero_page_detection() == ZERO_PAGE_DETECTION_LEGACY) {
+        if (save_zero_page(rs, pss, offset)) {
+            return 1;
+        }
     }

-    return ram_save_page(rs, pss);
+    return ram_save_multifd_page(block, offset);
 }

 /* Should be called before sending a host page */
@@ -2177,6 +2396,196 @@ out:
     return ret;
 }

+#ifdef CONFIG_HYGON_CSV_MIG_ACCEL
+/**
+ * ram_save_encrypted_pages_in_batch: send the given encrypted pages to
+ * the stream.
+ *
+ * Pages of 4K size are sent in batches; saving stops at the end of
+ * the block.
+ *
+ * The caller must hold ram_state.bitmap_mutex when calling this
+ * function.
+ *
+ * Returns the number of pages written, or negative on error
+ *
+ * @rs: current RAM state
+ * @pss: data about the page we want to send
+ */
+static int
+ram_save_encrypted_pages_in_batch(RAMState *rs, PageSearchStatus *pss)
+{
+    bool page_dirty;
+    int ret;
+    int tmppages, pages = 0;
+    uint8_t *p;
+    uint32_t host_len = 0;
+    uint64_t bytes_xmit = 0;
+    ram_addr_t offset, start_offset = 0;
+    MachineState *ms = MACHINE(qdev_get_machine());
+    ConfidentialGuestSupportClass *cgs_class =
+        (ConfidentialGuestSupportClass *)object_get_class(OBJECT(ms->cgs));
+    struct ConfidentialGuestMemoryEncryptionOps *ops =
+        cgs_class->memory_encryption_ops;
+
+    do {
+        page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
+
+        /* Check whether the page is dirty and, if so, send it */
+        if (page_dirty) {
+            /* Process the unencrypted page */
+            if (!encrypted_test_list(rs, pss->block, pss->page)) {
+                tmppages = migration_ops->ram_save_target_page(rs, pss);
+            } else {
+                /* Calculate the offset and host virtual address of the page */
+                offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
+                p = pss->block->host + offset;
+
+                /* Record the offset and host virtual address of the first
+                 * page in this loop; they are used below.
+                 */
+                if (host_len == 0) {
+                    start_offset = offset | RAM_SAVE_FLAG_ENCRYPTED_DATA;
+                } else {
+                    offset |= (RAM_SAVE_FLAG_ENCRYPTED_DATA | RAM_SAVE_FLAG_CONTINUE);
+                }
+
+                /* Queue the outgoing page if it is not a zero page.
+                 * Once the queued pages fill the outgoing page window,
+                 * process them below.
+                 */
+                if (ops->queue_outgoing_page(p, TARGET_PAGE_SIZE, offset))
+                    return -1;
+
+                tmppages = 1;
+                host_len += TARGET_PAGE_SIZE;
+
+                stat64_add(&mig_stats.normal_pages, 1);
+            }
+        } else {
+            tmppages = 0;
+        }
+
+        if (tmppages >= 0) {
+            pages += tmppages;
+        } else {
+            return tmppages;
+        }
+
+        pss_find_next_dirty(pss);
+    } while (offset_in_ramblock(pss->block,
+                                ((ram_addr_t)pss->page) << TARGET_PAGE_BITS) &&
+             host_len < CSV_OUTGOING_PAGE_WINDOW_SIZE);
+
+    /* Check if there are any queued pages */
+    if (host_len != 0) {
+        ram_transferred_add(save_page_header(pss, pss->pss_channel,
+                                             pss->block, start_offset));
+        /* if more than one page is queued, the flag is BATCH, else BATCH_END */
+        if (host_len > TARGET_PAGE_SIZE)
+            qemu_put_be32(pss->pss_channel, RAM_SAVE_ENCRYPTED_PAGE_BATCH);
+        else
+            qemu_put_be32(pss->pss_channel, RAM_SAVE_ENCRYPTED_PAGE_BATCH_END);
+        ram_transferred_add(4);
+        /* Process the queued pages in batch */
+        ret = ops->save_queued_outgoing_pages(pss->pss_channel, &bytes_xmit);
+        if (ret) {
+            return -1;
+        }
+        ram_transferred_add(bytes_xmit);
+    }
+
+    /* The offset we leave with is the last one we looked at */
+    pss->page--;
+
+    return pages;
+}
+#endif
+
+/**
+ * ram_save_csv3_pages - send the given csv3 VM pages to the stream
+ */
+static int ram_save_csv3_pages(RAMState *rs, PageSearchStatus *pss)
+{
+    bool page_dirty;
+    int ret;
+    int tmppages, pages = 0;
+    uint8_t *p;
+    uint32_t host_len = 0;
+    uint64_t bytes_xmit = 0;
+    RAMBlock *block = pss->block;
+    ram_addr_t offset = 0;
+    hwaddr paddr = RAM_ADDR_INVALID;
+    MachineState *ms = MACHINE(qdev_get_machine());
+    ConfidentialGuestSupportClass *cgs_class =
+        (ConfidentialGuestSupportClass *) object_get_class(OBJECT(ms->cgs));
+    struct ConfidentialGuestMemoryEncryptionOps *ops =
+        cgs_class->memory_encryption_ops;
+
+    if (!kvm_csv3_enabled())
+        return 0;
+
+    do {
+        page_dirty = migration_bitmap_clear_dirty(rs, block, pss->page);
+
+        /* Check whether the page is dirty and, if so, send it */
+        if (page_dirty) {
+            ret = kvm_physical_memory_addr_from_host(kvm_state,
+                      block->host + (pss->page << TARGET_PAGE_BITS), &paddr);
+            /* Process ROM or MMIO */
+            if (paddr == RAM_ADDR_INVALID ||
+                memory_region_is_rom(block->mr)) {
+                tmppages = migration_ops->ram_save_target_page(rs, pss);
+            } else {
+                /* Calculate the offset and host virtual address of the page */
+                offset = pss->page << TARGET_PAGE_BITS;
+                p = block->host + offset;
+
+                if (ops->queue_outgoing_page(p, TARGET_PAGE_SIZE, offset))
+                    return -1;
+
+                tmppages = 1;
+                host_len += TARGET_PAGE_SIZE;
+
+                stat64_add(&mig_stats.normal_pages, 1);
+            }
+        } else {
+            tmppages = 0;
+        }
+
+        if (tmppages >= 0) {
+            pages += tmppages;
+        } else {
+            return tmppages;
+        }
+
+        pss_find_next_dirty(pss);
+    } while (offset_in_ramblock(block,
+                                ((ram_addr_t)pss->page) << TARGET_PAGE_BITS) &&
+             host_len < CSV3_OUTGOING_PAGE_WINDOW_SIZE);
+
+    /* Check if there are any queued pages */
+    if (host_len != 0) {
+        /* Always set offset to 0 for csv3. */
+        ram_transferred_add(save_page_header(pss, pss->pss_channel,
+                                             block, 0 | RAM_SAVE_FLAG_ENCRYPTED_DATA));
+
+        qemu_put_be32(pss->pss_channel, RAM_SAVE_ENCRYPTED_PAGE);
+        ram_transferred_add(4);
+        /* Process the queued pages in batch */
+        ret = ops->save_queued_outgoing_pages(pss->pss_channel, &bytes_xmit);
+        if (ret) {
+            return -1;
+        }
+        ram_transferred_add(bytes_xmit);
+    }
+
+    /* The offset we leave with is the last one we looked at */
+    pss->page--;
+
+    return pages;
+}
+
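
For orientation, the destination consumes such a window as the mirror image of the loop above: each RAM_SAVE_ENCRYPTED_PAGE_BATCH record queues one guest page, and the closing RAM_SAVE_ENCRYPTED_PAGE_BATCH_END record queues the last page and then decrypts the whole window in a single backend call. A sketch using the callbacks this patch introduces; the helper itself is illustrative:

    /* Sketch only: destination-side handling of one record of a batch. */
    static int load_batched_record(QEMUFile *f, uint8_t *host, uint32_t subtype,
                                   struct ConfidentialGuestMemoryEncryptionOps *ops)
    {
        if (ops->queue_incoming_page(f, host)) {
            error_report("Failed to queue incoming page");
            return -EINVAL;
        }
        if (subtype == RAM_SAVE_ENCRYPTED_PAGE_BATCH) {
            return 0;     /* more pages follow in this window */
        }
        /* RAM_SAVE_ENCRYPTED_PAGE_BATCH_END: decrypt the queued window */
        return ops->load_queued_incoming_pages(f);
    }
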
 /**
  * ram_save_host_page: save a whole host page
  *
@@ -2212,6 +2621,21 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
         return 0;
     }

+    if (kvm_csv3_enabled())
+        return ram_save_csv3_pages(rs, pss);
+
+#ifdef CONFIG_HYGON_CSV_MIG_ACCEL
+    /*
+     * If the command batch function is enabled and memory encryption is
+     * enabled, use the command batch APIs to accelerate writing the
+     * outgoing buffer to the wire. The encryption APIs re-encrypt the
+     * data with a transport key so that the data is protected on the
+     * wire.
+     */
+    if (memcrypt_enabled() && is_hygon_cpu() && !migration_in_postcopy())
+        return ram_save_encrypted_pages_in_batch(rs, pss);
+#endif
+
     /* Update host page boundary information */
     pss_host_page_prepare(pss);

@@ -2404,6 +2828,11 @@ static void ram_save_cleanup(void *opaque)
          * memory_global_dirty_log_stop will assert that
          * memory_global_dirty_log_start/stop used in pairs
          */
+#ifdef TARGET_AARCH64
+        if (kvm_enabled()) {
+            kvm_update_hdbss_cap(false, 0);
+        }
+#endif
         memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
     }
 }
@@ -2807,6 +3236,11 @@ static void ram_init_bitmaps(RAMState *rs)
     ram_list_init_bitmaps();
     /* We don't use dirty log with background snapshots */
     if (!migrate_background_snapshot()) {
+#ifdef TARGET_AARCH64
+        if (kvm_enabled()) {
+            kvm_update_hdbss_cap(true, migrate_hdbss_buffer_size());
+        }
+#endif
         memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
         migration_bitmap_sync_precopy(rs, false);
     }
@@ -2917,6 +3351,18 @@ void qemu_guest_free_page_hint(void *addr, size_t len)
     }
 }

+static int ram_encrypted_save_setup(void)
+{
+    MachineState *ms = MACHINE(qdev_get_machine());
+    ConfidentialGuestSupportClass *cgs_class =
+        (ConfidentialGuestSupportClass *) object_get_class(OBJECT(ms->cgs));
+    struct ConfidentialGuestMemoryEncryptionOps *ops =
+        cgs_class->memory_encryption_ops;
+    MigrationParameters *p = &migrate_get_current()->parameters;
+
+    return ops->save_setup(p->sev_pdh, p->sev_plat_cert, p->sev_amd_cert);
+}
+
 /*
  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
  * long-running RCU critical section.
When rcu-reclaims in the code @@ -2952,6 +3398,13 @@ static int ram_save_setup(QEMUFile *f, void *opaque) (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f; WITH_RCU_READ_LOCK_GUARD() { + + if (memcrypt_enabled()) { + if (ram_encrypted_save_setup()) { + return -1; + } + } + qemu_put_be64(f, ram_bytes_total_with_ignored() | RAM_SAVE_FLAG_MEM_SIZE); @@ -2982,10 +3435,15 @@ static int ram_save_setup(QEMUFile *f, void *opaque) } migration_ops = g_malloc0(sizeof(MigrationOps)); - migration_ops->ram_save_target_page = ram_save_target_page_legacy; + + if (migrate_multifd()) { + migration_ops->ram_save_target_page = ram_save_target_page_multifd; + } else { + migration_ops->ram_save_target_page = ram_save_target_page_legacy; + } qemu_mutex_unlock_iothread(); - ret = multifd_send_sync_main(f); + ret = multifd_send_sync_main(); qemu_mutex_lock_iothread(); if (ret < 0) { return ret; @@ -3109,7 +3567,7 @@ out: if (ret >= 0 && migration_is_setup_or_active(migrate_get_current()->state)) { if (migrate_multifd() && migrate_multifd_flush_after_each_section()) { - ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel); + ret = multifd_send_sync_main(); if (ret < 0) { return ret; } @@ -3181,9 +3639,31 @@ static int ram_save_complete(QEMUFile *f, void *opaque) qemu_file_set_error(f, ret); return ret; } + + /* send the shared regions list */ + if (memcrypt_enabled()) { + ret = ram_save_shared_region_list(rs, f); + if (ret < 0) { + qemu_file_set_error(f, ret); + return ret; + } + + /* + * send the encrypted cpu state, for example, CSV2 guest's + * vmsa for each vcpu. + */ + if (is_hygon_cpu()) { + ret = ram_save_encrypted_cpu_state(rs, f); + if (ret < 0) { + error_report("Failed to save encrypted cpu state"); + qemu_file_set_error(f, ret); + return ret; + } + } + } } - ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel); + ret = multifd_send_sync_main(); if (ret < 0) { return ret; } @@ -3918,7 +4398,8 @@ static int ram_load_precopy(QEMUFile *f) } if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE | - RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) { + RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE | + RAM_SAVE_FLAG_ENCRYPTED_DATA)) { RAMBlock *block = ram_block_from_stream(mis, f, flags, RAM_CHANNEL_PRECOPY); @@ -3979,7 +4460,7 @@ static int ram_load_precopy(QEMUFile *f) case RAM_SAVE_FLAG_COMPRESS_PAGE: len = qemu_get_be32(f); - if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) { + if (decompress_ops->check_len(len)) { error_report("Invalid compressed data length: %d", len); ret = -EINVAL; break; @@ -4011,6 +4492,12 @@ static int ram_load_precopy(QEMUFile *f) qemu_file_set_error(f, ret); } break; + case RAM_SAVE_FLAG_ENCRYPTED_DATA: + if (load_encrypted_data(f, host)) { + error_report("Failed to load encrypted data"); + ret = -EINVAL; + } + break; default: error_report("Unknown combination of migration flags: 0x%x", flags); ret = -EINVAL; diff --git a/migration/ram.h b/migration/ram.h index 9b937a446b7382ddbea7194a88d44e9fa8b54462..cd263df02662d0d69180c22e0c6950f27f86abea 100644 --- a/migration/ram.h +++ b/migration/ram.h @@ -69,6 +69,7 @@ int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr); bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset); void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr); void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr, size_t nr); +void ramblock_recv_bitmap_set_offset(RAMBlock *rb, uint64_t byte_offset); int64_t ramblock_recv_bitmap_send(QEMUFile *file, const char 
*block_name); bool ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *rb, Error **errp); diff --git a/migration/savevm.c b/migration/savevm.c index eec5503a422081a5e2fce7238820ec6fc8db1b8b..bde05eebe4754feb537e7b23de0bee0e48f89cd7 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -61,6 +61,7 @@ #include "sysemu/replay.h" #include "sysemu/runstate.h" #include "sysemu/sysemu.h" +#include "sysemu/kvm.h" #include "sysemu/xen.h" #include "migration/colo.h" #include "qemu/bitmap.h" @@ -1553,6 +1554,7 @@ int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, ret = vmstate_save(f, se, vmdesc); if (ret) { qemu_file_set_error(f, ret); + json_writer_free(vmdesc); return ret; } @@ -1572,6 +1574,7 @@ int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, migrate_set_error(ms, local_err); error_report_err(local_err); qemu_file_set_error(f, ret); + json_writer_free(vmdesc); return ret; } } @@ -2854,6 +2857,10 @@ int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis) uint8_t section_type; int ret = 0; + if (qemu_mutex_iothread_locked()) { + memory_region_transaction_begin(); + } + retry: while (true) { section_type = qemu_get_byte(f); @@ -2897,6 +2904,9 @@ retry: } out: + if (qemu_mutex_iothread_locked()) { + memory_region_transaction_commit(); + } if (ret < 0) { qemu_file_set_error(f, ret); @@ -2956,7 +2966,10 @@ int qemu_loadvm_state(QEMUFile *f) trace_qemu_loadvm_state_post_main(ret); if (mis->have_listen_thread) { - /* Listen thread still going, can't clean up yet */ + /* + * Postcopy listen thread still going, don't synchronize the + * cpus yet. + */ return ret; } @@ -2999,7 +3012,6 @@ int qemu_loadvm_state(QEMUFile *f) } } - qemu_loadvm_state_cleanup(); cpu_synchronize_all_post_init(); return ret; @@ -3042,6 +3054,11 @@ int qemu_loadvm_approve_switchover(void) bool save_snapshot(const char *name, bool overwrite, const char *vmstate, bool has_devices, strList *devices, Error **errp) { + if (virtcca_cvm_enabled()) { + error_setg(errp, "The savevm command is temporarily unsupported in cvm."); + return false; + } + BlockDriverState *bs; QEMUSnapshotInfo sn1, *sn = &sn1; int ret = -1, ret2; diff --git a/migration/socket.c b/migration/socket.c index 98e3ea1514719d7059c30a192ff265e1dc1ba28c..9ab89b1e089b6053078de50db067fab08ef610e3 100644 --- a/migration/socket.c +++ b/migration/socket.c @@ -60,17 +60,6 @@ QIOChannel *socket_send_channel_create_sync(Error **errp) return QIO_CHANNEL(sioc); } -int socket_send_channel_destroy(QIOChannel *send) -{ - /* Remove channel */ - object_unref(OBJECT(send)); - if (outgoing_args.saddr) { - qapi_free_SocketAddress(outgoing_args.saddr); - outgoing_args.saddr = NULL; - } - return 0; -} - struct SocketConnectData { MigrationState *s; char *hostname; @@ -137,6 +126,14 @@ void socket_start_outgoing_migration(MigrationState *s, NULL); } +void socket_cleanup_outgoing_migration(void) +{ + if (outgoing_args.saddr) { + qapi_free_SocketAddress(outgoing_args.saddr); + outgoing_args.saddr = NULL; + } +} + static void socket_accept_incoming_migration(QIONetListener *listener, QIOChannelSocket *cioc, gpointer opaque) diff --git a/migration/socket.h b/migration/socket.h index 5e4c33b8ea541120912310944e6689f92d47a881..46c233ecd29e5e4977c978abbaf7ec553a9c8576 100644 --- a/migration/socket.h +++ b/migration/socket.h @@ -23,10 +23,11 @@ void socket_send_channel_create(QIOTaskFunc f, void *data); QIOChannel *socket_send_channel_create_sync(Error **errp); -int socket_send_channel_destroy(QIOChannel *send); void 
socket_start_incoming_migration(SocketAddress *saddr, Error **errp); void socket_start_outgoing_migration(MigrationState *s, SocketAddress *saddr, Error **errp); +void socket_cleanup_outgoing_migration(void); + #endif diff --git a/migration/trace-events b/migration/trace-events index de4a743c8a77ccc7b28160c50e714ebae66566bf..f0e1cb80c75bbcb71b443df782058cddb5b895c0 100644 --- a/migration/trace-events +++ b/migration/trace-events @@ -128,21 +128,21 @@ postcopy_preempt_reset_channel(void) "" # multifd.c multifd_new_send_channel_async(uint8_t id) "channel %u" multifd_new_send_channel_async_error(uint8_t id, void *err) "channel=%u err=%p" -multifd_recv(uint8_t id, uint64_t packet_num, uint32_t used, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " pages %u flags 0x%x next packet size %u" +multifd_recv(uint8_t id, uint64_t packet_num, uint32_t normal, uint32_t zero, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " normal pages %u zero pages %u flags 0x%x next packet size %u" multifd_recv_new_channel(uint8_t id) "channel %u" multifd_recv_sync_main(long packet_num) "packet num %ld" multifd_recv_sync_main_signal(uint8_t id) "channel %u" -multifd_recv_sync_main_wait(uint8_t id) "channel %u" +multifd_recv_sync_main_wait(uint8_t id) "iter %u" multifd_recv_terminate_threads(bool error) "error %d" -multifd_recv_thread_end(uint8_t id, uint64_t packets, uint64_t pages) "channel %u packets %" PRIu64 " pages %" PRIu64 +multifd_recv_thread_end(uint8_t id, uint64_t packets, uint64_t normal_pages, uint64_t zero_pages) "channel %u packets %" PRIu64 " normal pages %" PRIu64 " zero pages %" PRIu64 multifd_recv_thread_start(uint8_t id) "%u" -multifd_send(uint8_t id, uint64_t packet_num, uint32_t normal, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " normal pages %u flags 0x%x next packet size %u" +multifd_send(uint8_t id, uint64_t packet_num, uint32_t normal_pages, uint32_t zero_pages, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " normal pages %u zero pages %u flags 0x%x next packet size %u" multifd_send_error(uint8_t id) "channel %u" multifd_send_sync_main(long packet_num) "packet num %ld" multifd_send_sync_main_signal(uint8_t id) "channel %u" multifd_send_sync_main_wait(uint8_t id) "channel %u" -multifd_send_terminate_threads(bool error) "error %d" -multifd_send_thread_end(uint8_t id, uint64_t packets, uint64_t normal_pages) "channel %u packets %" PRIu64 " normal pages %" PRIu64 +multifd_send_terminate_threads(void) "" +multifd_send_thread_end(uint8_t id, uint64_t packets, uint64_t normal_pages, uint64_t zero_pages) "channel %u packets %" PRIu64 " normal pages %" PRIu64 " zero pages %" PRIu64 multifd_send_thread_start(uint8_t id) "%u" multifd_tls_outgoing_handshake_start(void *ioc, void *tioc, const char *hostname) "ioc=%p tioc=%p hostname=%s" multifd_tls_outgoing_handshake_error(void *ioc, const char *err) "ioc=%p err=%s" diff --git a/migration/vmstate.c b/migration/vmstate.c index b7723a4187144d4e978713a8994ee200c1249afb..e621d8ddb7159d484e54dc00f1d3616090c42f6e 100644 --- a/migration/vmstate.c +++ b/migration/vmstate.c @@ -20,9 +20,11 @@ #include "qemu/bitops.h" #include "qemu/error-report.h" #include "trace.h" +#include "exec/memory.h" static int vmstate_subsection_save(QEMUFile *f, const VMStateDescription *vmsd, - void *opaque, JSONWriter *vmdesc); + void *opaque, JSONWriter *vmdesc, + Error **errp); static int vmstate_subsection_load(QEMUFile *f, const VMStateDescription *vmsd, void 
*opaque);
@@ -183,6 +185,13 @@ int vmstate_load_state(QEMUFile *f, const VMStateDescription *vmsd,
             return ret;
         }
         if (vmsd->post_load) {
+            /*
+             * We call memory_region_transaction_begin() in
+             * qemu_loadvm_state_main(), so the address space is not
+             * updated while vm state is being loaded. But some devices
+             * need an up-to-date address space here, so force a commit
+             * of the memory region transaction before calling post_load.
+             */
+            memory_region_commit();
             ret = vmsd->post_load(opaque, version_id);
         }
         trace_vmstate_load_state_end(vmsd->name, "end", ret);
@@ -440,12 +449,13 @@ int vmstate_save_state_v(QEMUFile *f, const VMStateDescription *vmsd,
         json_writer_end_array(vmdesc);
     }

-    ret = vmstate_subsection_save(f, vmsd, opaque, vmdesc);
+    ret = vmstate_subsection_save(f, vmsd, opaque, vmdesc, errp);

     if (vmsd->post_save) {
         int ps_ret = vmsd->post_save(opaque);
-        if (!ret) {
+        if (!ret && ps_ret) {
             ret = ps_ret;
+            error_setg(errp, "post-save failed: %s", vmsd->name);
         }
     }
     return ret;
@@ -515,7 +525,8 @@ static int vmstate_subsection_load(QEMUFile *f, const VMStateDescription *vmsd,
 }

 static int vmstate_subsection_save(QEMUFile *f, const VMStateDescription *vmsd,
-                                   void *opaque, JSONWriter *vmdesc)
+                                   void *opaque, JSONWriter *vmdesc,
+                                   Error **errp)
 {
     const VMStateDescription **sub = vmsd->subsections;
     bool vmdesc_has_subsections = false;
@@ -543,7 +554,7 @@ static int vmstate_subsection_save(QEMUFile *f, const VMStateDescription *vmsd,
             qemu_put_byte(f, len);
             qemu_put_buffer(f, (uint8_t *)vmsdsub->name, len);
             qemu_put_be32(f, vmsdsub->version_id);
-            ret = vmstate_save_state(f, vmsdsub, opaque, vmdesc);
+            ret = vmstate_save_state_with_err(f, vmsdsub, opaque, vmdesc, errp);
             if (ret) {
                 return ret;
             }
diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c
index 871898ac46b77dab291bd59d155703337376b3e6..5bb3c9cd461a17a5bae1f1009c919230beda4310 100644
--- a/monitor/hmp-cmds.c
+++ b/monitor/hmp-cmds.c
@@ -24,6 +24,7 @@
 #include "qapi/qapi-commands-control.h"
 #include "qapi/qapi-commands-misc.h"
 #include "qapi/qmp/qdict.h"
+#include "qapi/qapi-visit-migration.h"
 #include "qemu/cutils.h"
 #include "hw/intc/intc.h"
 #include "qemu/log.h"
diff --git a/monitor/monitor-internal.h b/monitor/monitor-internal.h
index 252de856812f41eb45a88866240950861b943822..d7842fa464fc244a6e8586dd6f42f90e1b953b60 100644
--- a/monitor/monitor-internal.h
+++ b/monitor/monitor-internal.h
@@ -144,6 +144,7 @@ typedef struct {
     const QmpCommandList *commands;
     bool capab_offered[QMP_CAPABILITY__MAX]; /* capabilities offered */
     bool capab[QMP_CAPABILITY__MAX];         /* offered and accepted */
+    uint64_t qmp_client_id; /* QMP client id, updated when the peer disconnects */
     /*
      * Protects qmp request/response queue.
      * Take monitor_lock first when you need both.
diff --git a/monitor/monitor.c b/monitor/monitor.c
index 01ede1babd3d5ce004f528d870d54b9a0a1279c1..8d59a7661204787922232b6eded826cd10f2f5fe 100644
--- a/monitor/monitor.c
+++ b/monitor/monitor.c
@@ -29,10 +29,13 @@
 #include "qapi/qapi-emit-events.h"
 #include "qapi/qapi-visit-control.h"
 #include "qapi/qmp/qdict.h"
+#include "qapi/qmp/qjson.h"
 #include "qemu/error-report.h"
 #include "qemu/option.h"
 #include "sysemu/qtest.h"
 #include "trace.h"
+#include "qemu/log.h"
+#include "qapi/qmp/qobject.h"

 /*
  * To prevent flooding clients, events can be throttled.
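
The effect of the forced commit above is easiest to see from a device's perspective: a post_load hook that reads guest memory would otherwise resolve addresses against the stale FlatView, because the transaction opened in qemu_loadvm_state_main() has not been flushed yet. Everything named below is hypothetical and only illustrates the ordering constraint:

    /* Hypothetical device hook, for illustration only. */
    static int mydev_post_load(void *opaque, int version_id)
    {
        MyDevState *s = opaque;
        uint8_t desc[16];

        /*
         * Without the memory_region_commit() above, this read could be
         * resolved against the address space as it looked before the
         * memory sections of this migration stream were applied.
         */
        address_space_read(&address_space_memory, s->desc_gpa,
                           MEMTXATTRS_UNSPECIFIED, desc, sizeof(desc));
        return 0;
    }
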
The @@ -338,6 +341,7 @@ static void monitor_qapi_event_emit(QAPIEvent event, QDict *qdict) { Monitor *mon; MonitorQMP *qmp_mon; + GString *json; trace_monitor_protocol_event_emit(event, qdict); QTAILQ_FOREACH(mon, &mon_list, entry) { @@ -348,6 +352,13 @@ static void monitor_qapi_event_emit(QAPIEvent event, QDict *qdict) qmp_mon = container_of(mon, MonitorQMP, common); if (qmp_mon->commands != &qmp_cap_negotiation_commands) { qmp_send_response(qmp_mon, qdict); + json = qobject_to_json(QOBJECT(qdict)); + if (json) { + if (!strstr(json->str, "RTC_CHANGE")) { + qemu_log("%s\n", json->str); + } + g_string_free(json, true); + } } } } @@ -778,6 +789,33 @@ int monitor_init_opts(QemuOpts *opts, Error **errp) return ret; } +void monitor_qapi_event_discard_io_error(void) +{ + GHashTableIter event_iter; + MonitorQAPIEventState *evstate; + gpointer key, value; + GString *json; + + qemu_mutex_lock(&monitor_lock); + g_hash_table_iter_init(&event_iter, monitor_qapi_event_state); + while (g_hash_table_iter_next(&event_iter, &key, &value)) { + evstate = key; + /* Only QAPI_EVENT_BLOCK_IO_ERROR is discarded */ + if (evstate->event == QAPI_EVENT_BLOCK_IO_ERROR) { + g_hash_table_iter_remove(&event_iter); + json = qobject_to_json(QOBJECT(evstate->qdict)); + qemu_log(" %s event discarded\n", json->str); + timer_del(evstate->timer); + timer_free(evstate->timer); + qobject_unref(evstate->data); + qobject_unref(evstate->qdict); + g_string_free(json, true); + g_free(evstate); + } + } + qemu_mutex_unlock(&monitor_lock); +} + QemuOptsList qemu_mon_opts = { .name = "mon", .implied_opt_name = "chardev", diff --git a/monitor/qmp-cmds.c b/monitor/qmp-cmds.c index b0f948d33766bb781b5bb1cfdfacb3d190828b9a..c0b66f11bf88ab61a8a195ead686acb94a9e1548 100644 --- a/monitor/qmp-cmds.c +++ b/monitor/qmp-cmds.c @@ -23,6 +23,7 @@ #include "sysemu/runstate.h" #include "sysemu/runstate-action.h" #include "sysemu/block-backend.h" +#include "sysemu/kvm.h" #include "qapi/error.h" #include "qapi/qapi-init-commands.h" #include "qapi/qapi-commands-control.h" @@ -32,6 +33,7 @@ #include "hw/mem/memory-device.h" #include "hw/intc/intc.h" #include "hw/rdma/rdma.h" +#include "qemu/log.h" NameInfo *qmp_query_name(Error **errp) { @@ -49,6 +51,11 @@ void qmp_quit(Error **errp) void qmp_stop(Error **errp) { + if (virtcca_cvm_enabled()) { + error_setg(errp, "The stop command is temporarily unsupported in cvm."); + return; + } + /* if there is a dump in background, we should wait until the dump * finished */ if (qemu_system_dump_in_progress()) { @@ -110,8 +117,10 @@ void qmp_cont(Error **errp) } if (runstate_check(RUN_STATE_INMIGRATE)) { + qemu_log("qmp cont is received in migration\n"); autostart = 1; } else { + qemu_log("qmp cont is received and vm is started\n"); vm_start(); } } diff --git a/monitor/qmp.c b/monitor/qmp.c index 6eee450fe40c120e9b39105a780247ae495f84c7..8f7671c5f12513a3c81ecc60ff86ada45f08a066 100644 --- a/monitor/qmp.c +++ b/monitor/qmp.c @@ -149,18 +149,19 @@ void qmp_send_response(MonitorQMP *mon, const QDict *rsp) * Null @rsp can only happen for commands with QCO_NO_SUCCESS_RESP. * Nothing is emitted then. */ -static void monitor_qmp_respond(MonitorQMP *mon, QDict *rsp) +static void monitor_qmp_respond(MonitorQMP *mon, QDict *rsp, uint64_t req_client_id) { - if (rsp) { - qmp_send_response(mon, rsp); + if (!rsp || (mon->qmp_client_id != req_client_id)) { + return; } + qmp_send_response(mon, rsp); } /* * Runs outside of coroutine context for OOB commands, but in * coroutine context for everything else. 
*/ -static void monitor_qmp_dispatch(MonitorQMP *mon, QObject *req) +static void monitor_qmp_dispatch(MonitorQMP *mon, QObject *req, uint64_t req_client_id) { QDict *rsp; QDict *error; @@ -180,7 +181,7 @@ static void monitor_qmp_dispatch(MonitorQMP *mon, QObject *req) } } - monitor_qmp_respond(mon, rsp); + monitor_qmp_respond(mon, rsp, req_client_id); qobject_unref(rsp); } @@ -340,13 +341,13 @@ void coroutine_fn monitor_qmp_dispatcher_co(void *data) trace_monitor_qmp_cmd_in_band(id_json->str); g_string_free(id_json, true); } - monitor_qmp_dispatch(mon, req_obj->req); + monitor_qmp_dispatch(mon, req_obj->req, mon->qmp_client_id); } else { assert(req_obj->err); trace_monitor_qmp_err_in_band(error_get_pretty(req_obj->err)); rsp = qmp_error_response(req_obj->err); req_obj->err = NULL; - monitor_qmp_respond(mon, rsp); + monitor_qmp_respond(mon, rsp, mon->qmp_client_id); qobject_unref(rsp); } @@ -402,7 +403,7 @@ static void handle_qmp_command(void *opaque, QObject *req, Error *err) trace_monitor_qmp_cmd_out_of_band(id_json->str); g_string_free(id_json, true); } - monitor_qmp_dispatch(mon, req); + monitor_qmp_dispatch(mon, req, mon->qmp_client_id); qobject_unref(req); return; } @@ -486,6 +487,7 @@ static void monitor_qmp_event(void *opaque, QEMUChrEvent event) mon_refcount++; break; case CHR_EVENT_CLOSED: + mon->qmp_client_id++; /* * Note: this is only useful when the output of the chardev * backend is still open. For example, when the backend is @@ -539,6 +541,7 @@ void monitor_init_qmp(Chardev *chr, bool pretty, Error **errp) } qemu_chr_fe_set_echo(&mon->common.chr, true); + mon->qmp_client_id = 1; /* Note: we run QMP monitor in I/O thread when @chr supports that */ monitor_data_init(&mon->common, true, false, qemu_chr_has_feature(chr, QEMU_CHAR_FEATURE_GCONTEXT)); diff --git a/nbd/client.c b/nbd/client.c index 29ffc609a4b7dd0b01e758ff02839f0e1a651421..987dde43c7d55fd2997dd7cdf3f42e1a2546eff1 100644 --- a/nbd/client.c +++ b/nbd/client.c @@ -24,6 +24,8 @@ #include "nbd-internal.h" #include "qemu/cutils.h" +#define NBD_TIMEOUT_SECONDS 30 + /* Definitions for opaque data types */ static QTAILQ_HEAD(, NBDExport) exports = QTAILQ_HEAD_INITIALIZER(exports); @@ -1310,6 +1312,12 @@ int nbd_init(int fd, QIOChannelSocket *sioc, NBDExportInfo *info, } } + if (ioctl(fd, NBD_SET_TIMEOUT, NBD_TIMEOUT_SECONDS) < 0) { + int serrno = errno; + error_setg(errp, "Failed setting timeout"); + return -serrno; + } + trace_nbd_init_finish(); return 0; diff --git a/nbd/server.c b/nbd/server.c index 895cf0a7525bdf85996564faa05d515042c7a45e..d1b3c35b59dd8a40d050dacd353fa7c22eec923a 100644 --- a/nbd/server.c +++ b/nbd/server.c @@ -124,10 +124,12 @@ struct NBDMetaContexts { struct NBDClient { int refcount; void (*close_fn)(NBDClient *client, bool negotiated); + void *owner; NBDExport *exp; QCryptoTLSCreds *tlscreds; char *tlsauthz; + uint32_t handshake_max_secs; QIOChannelSocket *sioc; /* The underlying data channel */ QIOChannel *ioc; /* The current I/O channel which may differ (eg TLS) */ @@ -1865,7 +1867,7 @@ static void nbd_export_request_shutdown(BlockExport *blk_exp) blk_exp_ref(&exp->common); /* - * TODO: Should we expand QMP NbdServerRemoveNode enum to allow a + * TODO: Should we expand QMP BlockExportRemoveMode enum to allow a * close mode that stops advertising the export to new clients but * still permits existing clients to run to completion? 
Because of
 * that possibility, nbd_export_close() can be called more than
@@ -2939,6 +2941,7 @@ static coroutine_fn void nbd_trip(void *opaque)
     NBDRequestData *req;
     NBDRequest request = { 0 };    /* GCC thinks it can be used uninitialized */
     int ret;
+    bool client_closing;
     Error *local_err = NULL;

     trace_nbd_trip();
@@ -3023,8 +3026,11 @@ disconnect:
     if (local_err) {
         error_reportf_err(local_err, "Disconnect client, due to: ");
     }
+    client_closing = client->closing;
     nbd_request_put(req);
-    client_close(client, true);
+    if (!client_closing) {
+        client_close(client, true);
+    }
     nbd_client_put(client);
 }

@@ -3038,33 +3044,63 @@ static void nbd_client_receive_next_request(NBDClient *client)
     }
 }

+static void nbd_handshake_timer_cb(void *opaque)
+{
+    QIOChannel *ioc = opaque;
+
+    trace_nbd_handshake_timer_cb();
+    qio_channel_shutdown(ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
+}
+
 static coroutine_fn void nbd_co_client_start(void *opaque)
 {
     NBDClient *client = opaque;
     Error *local_err = NULL;
+    QEMUTimer *handshake_timer = NULL;

     qemu_co_mutex_init(&client->send_lock);

+    /*
+     * Create a timer to bound the time spent in negotiation. If the
+     * timer expires, it is likely nbd_negotiate will fail because the
+     * socket was shut down.
+     */
+    if (client->handshake_max_secs > 0) {
+        handshake_timer = aio_timer_new(qemu_get_aio_context(),
+                                        QEMU_CLOCK_REALTIME,
+                                        SCALE_NS,
+                                        nbd_handshake_timer_cb,
+                                        client->sioc);
+        timer_mod(handshake_timer,
+                  qemu_clock_get_ns(QEMU_CLOCK_REALTIME) +
+                  client->handshake_max_secs * NANOSECONDS_PER_SECOND);
+    }
+
     if (nbd_negotiate(client, &local_err)) {
         if (local_err) {
             error_report_err(local_err);
         }
+        timer_free(handshake_timer);
         client_close(client, false);
         return;
     }

+    timer_free(handshake_timer);
     nbd_client_receive_next_request(client);
 }

 /*
- * Create a new client listener using the given channel @sioc.
+ * Create a new client listener using the given channel @sioc and @owner.
  * Begin servicing it in a coroutine. When the connection closes, call
- * @close_fn with an indication of whether the client completed negotiation.
+ * @close_fn with an indication of whether the client completed negotiation.
+ * Negotiation must finish within @handshake_max_secs seconds (0 for
+ * unbounded).
 */
 void nbd_client_new(QIOChannelSocket *sioc,
+                    uint32_t handshake_max_secs,
                     QCryptoTLSCreds *tlscreds,
                     const char *tlsauthz,
-                    void (*close_fn)(NBDClient *, bool))
+                    void (*close_fn)(NBDClient *, bool),
+                    void *owner)
 {
     NBDClient *client;
     Coroutine *co;
@@ -3076,13 +3112,21 @@ void nbd_client_new(QIOChannelSocket *sioc,
         object_ref(OBJECT(client->tlscreds));
     }
     client->tlsauthz = g_strdup(tlsauthz);
+    client->handshake_max_secs = handshake_max_secs;
     client->sioc = sioc;
     qio_channel_set_delay(QIO_CHANNEL(sioc), false);
     object_ref(OBJECT(client->sioc));
     client->ioc = QIO_CHANNEL(sioc);
     object_ref(OBJECT(client->ioc));
     client->close_fn = close_fn;
+    client->owner = owner;

     co = qemu_coroutine_create(nbd_co_client_start, client);
     qemu_coroutine_enter(co);
 }
+
+void *
+nbd_client_owner(NBDClient *client)
+{
+    return client->owner;
+}
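
Seen from a caller, the two new parameters travel together: @handshake_max_secs bounds negotiation, and @owner is an opaque handle that nbd_client_owner() returns later, typically from the close callback. A sketch of an embedding server — MyServer and both callbacks are hypothetical:

    /* Hypothetical embedding server, for illustration only. */
    static void my_client_closed(NBDClient *client, bool negotiated)
    {
        MyServer *srv = nbd_client_owner(client);   /* the @owner we passed */

        srv->nr_clients--;
    }

    static void my_accept(QIONetListener *listener, QIOChannelSocket *cioc,
                          gpointer opaque)
    {
        MyServer *srv = opaque;

        srv->nr_clients++;
        /* bound the handshake to 10s; 0 would disable the timer */
        nbd_client_new(cioc, 10, srv->tlscreds, NULL, my_client_closed, srv);
    }
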
diff --git a/nbd/trace-events b/nbd/trace-events
index 00ae3216a11148a39cfd8b7074bc01e6d5212fc9..cbd0a4ab7e4b9c7de605544ecc20204572e836f6 100644
--- a/nbd/trace-events
+++ b/nbd/trace-events
@@ -76,6 +76,7 @@ nbd_co_receive_request_payload_received(uint64_t cookie, uint64_t len) "Payload
 nbd_co_receive_ext_payload_compliance(uint64_t from, uint64_t len) "client sent non-compliant write without payload flag: from=0x%" PRIx64 ", len=0x%" PRIx64
 nbd_co_receive_align_compliance(const char *op, uint64_t from, uint64_t len, uint32_t align) "client sent non-compliant unaligned %s request: from=0x%" PRIx64 ", len=0x%" PRIx64 ", align=0x%" PRIx32
 nbd_trip(void) "Reading request"
+nbd_handshake_timer_cb(void) "client took too long to negotiate"

 # client-connection.c
 nbd_connect_thread_sleep(uint64_t timeout) "timeout %" PRIu64
diff --git a/net/colo-compare.c b/net/colo-compare.c
index 7f9e6f89ce058fdaa89b0084cf350298ed20b8ab..d4e51cb306e018123bcdf3250ad1e9a62820d192 100644
--- a/net/colo-compare.c
+++ b/net/colo-compare.c
@@ -413,8 +413,7 @@ static void colo_compare_tcp(CompareState *s, Connection *conn)
      * can ensure that the packet's payload is acknowledged by
      * primary and secondary.
      */
-    uint32_t min_ack = conn->pack - conn->sack > 0 ?
-                       conn->sack : conn->pack;
+    uint32_t min_ack = MIN(conn->pack, conn->sack);

 pri:
     if (g_queue_is_empty(&conn->primary_list)) {
diff --git a/net/dump.c b/net/dump.c
index 16073f245828f9c8083be6690e759cdcef110746..d880a7e299c550974e1d7066999dd6cde692b381 100644
--- a/net/dump.c
+++ b/net/dump.c
@@ -87,7 +87,7 @@ static ssize_t dump_receive_iov(DumpState *s, const struct iovec *iov, int cnt,
     dumpiov[0].iov_len = sizeof(hdr);
     cnt = iov_copy(&dumpiov[1], cnt, iov, cnt, offset, caplen);

-    if (writev(s->fd, dumpiov, cnt + 1) != sizeof(hdr) + caplen) {
+    if (writev(s->fd, &dumpiov[0], cnt + 1) != sizeof(hdr) + caplen) {
         error_report("network dump write error - stopping dump");
         close(s->fd);
         s->fd = -1;
diff --git a/net/meson.build b/net/meson.build
index ce99bd4447f484e8842d7a5c47403217343dd0b5..7264479242c21737e86fbc55bc244c3da3e417a1 100644
--- a/net/meson.build
+++ b/net/meson.build
@@ -37,7 +37,7 @@ if have_netmap
   system_ss.add(files('netmap.c'))
 endif

-system_ss.add(when: libxdp, if_true: files('af-xdp.c'))
+system_ss.add(when: [libxdp, libbpf], if_true: files('af-xdp.c'))

 if have_vhost_net_user
   system_ss.add(when: 'CONFIG_VIRTIO_NET', if_true: files('vhost-user.c'), if_false: files('vhost-user-stub.c'))
diff --git a/net/net.c b/net/net.c
index 0520bc1681a469545cfb56160c70d513384f964b..bcd3d7e04cce5b4a7c00345ffdd22b48ffd1f525 100644
--- a/net/net.c
+++ b/net/net.c
@@ -1322,6 +1322,12 @@ void qmp_netdev_del(const char *id, Error **errp)
         return;
     }

+    if (nc->info->type == NET_CLIENT_DRIVER_VHOST_USER && nc->peer) {
+        error_setg(errp, "Device '%s' is a netdev for vhost-user, "
+                   "please delete the peer front-end device (virtio-net) first.", id);
+        return;
+    }
+
     qemu_del_net_client(nc);

     /*
diff --git a/net/vhost-user.c b/net/vhost-user.c
index 12555518e83887be0a6fbf133bf354f7f57d8b05..86fd5056ab6ce3082678fc0b8e53379b440818a5 100644
--- a/net/vhost-user.c
+++ b/net/vhost-user.c
@@ -20,6 +20,9 @@
 #include "qemu/error-report.h"
 #include "qemu/option.h"
 #include "trace.h"
+#include "include/hw/virtio/vhost.h"
+
+#define VHOST_USER_RECONNECT_TIME (3)

 typedef struct NetVhostUserState {
     NetClientState nc;
@@ -292,6 +295,7 @@ static void net_vhost_user_event(void *opaque, QEMUChrEvent event)
     trace_vhost_user_event(chr->label, event);
     switch (event) {
     case CHR_EVENT_OPENED:
+        qemu_chr_set_reconnect_time(chr, VHOST_USER_RECONNECT_TIME);
         if (vhost_user_start(queues, ncs, s->vhost_user) < 0) {
             qemu_chr_fe_disconnect(&s->chr);
             return;
@@ -370,6 +374,10 @@ static int net_vhost_user_init(NetClientState *peer, const char *device,
         qemu_chr_fe_set_handlers(&s->chr, NULL, NULL,
                                  net_vhost_user_event, NULL, nc0->name, NULL,
                                  true);
+        if (used_memslots_is_exceeded()) {
+            error_report("used memslots exceeded the backend limit, quit loop");
+            goto err;
+        }
     } while (!s->started);

     assert(s->vhost_net);
diff --git a/os-posix.c b/os-posix.c
index 52ef6990ff9d4c4e2cd52b6f6a820cdbb8efb9fb..8f70ee0534a920ab0761f7abfaaeb29c9b028393 100644
--- a/os-posix.c
+++ b/os-posix.c
@@ -306,6 +306,7 @@ int os_mlock(void)
 #ifdef HAVE_MLOCKALL
     int ret = 0;

+    qemu_log("do mlockall\n");
     ret = mlockall(MCL_CURRENT | MCL_FUTURE);
     if (ret < 0) {
         error_report("mlockall: %s", strerror(errno));
diff --git a/qapi/block-core.json b/qapi/block-core.json
index ca390c570022a82355d50d295120347a9e5ed2b4..0fa184698a1f2ef08448d9bd1c615233e45176fc 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -1286,10 +1286,12 @@
 #
 # @auto: inherit the error handling policy of the backend (since: 2.7)
 #
+# @retry: retry I/O operations that encounter errors
+#
 # Since: 1.3
 ##
 { 'enum': 'BlockdevOnError',
-  'data': ['report', 'ignore', 'enospc', 'stop', 'auto'] }
+  'data': ['report', 'ignore', 'enospc', 'stop', 'auto', 'retry'] }
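
For context on how a device model might honour the new value: blk_get_error_action() maps the configured policy to a BlockErrorAction, and a retry result re-queues the request instead of reporting the error or stopping the VM. The request type, timer and done-helper below are hypothetical; only the two blk_*_action() calls are existing API:

    /* Hypothetical completion callback, for illustration only. */
    static void mydev_rw_complete(void *opaque, int ret)
    {
        MyRequest *req = opaque;

        if (ret < 0) {
            BlockErrorAction action =
                blk_get_error_action(req->blk, req->is_read, -ret);

            if (action == BLOCK_ERROR_ACTION_RETRY) {
                /* back off briefly, then resubmit the same request */
                timer_mod(req->retry_timer,
                          qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + 100);
                return;
            }
            blk_error_action(req->blk, action, req->is_read, -ret);
        }
        mydev_request_done(req);
    }
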

 ##
 # @MirrorSyncMode:
@@ -1660,8 +1662,6 @@
 #
 # Takes a synchronous snapshot of a block device.
 #
-# For the arguments, see the documentation of BlockdevSnapshotSync.
-#
 # Returns:
 #     - nothing on success
 #     - If @device is not a valid block device, DeviceNotFound
@@ -1691,8 +1691,6 @@
 # device, the block device changes to using 'overlay' as its new
 # active image.
 #
-# For the arguments, see the documentation of BlockdevSnapshot.
-#
 # Features:
 #
 # @allow-write-only-overlay: If present, the check whether this
@@ -4906,6 +4904,8 @@
 #
 # @extent-size-hint: Extent size hint to add to the image file; 0 for
 #     not adding an extent size hint (default: 1 MB, since 5.1)
+# @cache: Cache mode used to write the output disk image
+# @buffersize: Buffer size used when creating the image
 #
 # Since: 2.12
 ##
@@ -4914,7 +4914,9 @@
             'size':                 'size',
             '*preallocation':       'PreallocMode',
             '*nocow':               'bool',
-            '*extent-size-hint':    'size'} }
+            '*extent-size-hint':    'size',
+            '*cache':               'str',
+            '*buffersize':          'size'} }

 ##
 # @BlockdevCreateOptionsGluster:
@@ -5476,10 +5478,12 @@
 #
 # @stop: error caused VM to be stopped
 #
+# @retry: retry I/O operations that encounter errors
+#
 # Since: 2.1
 ##
 { 'enum': 'BlockErrorAction',
-  'data': [ 'ignore', 'report', 'stop' ] }
+  'data': [ 'ignore', 'report', 'stop', 'retry' ] }

 ##
 # @BLOCK_IMAGE_CORRUPTED:
@@ -6029,9 +6033,6 @@
 # string, or a snapshot with name already exists, the operation will
 # fail.
 #
-# For the arguments, see the documentation of
-# BlockdevSnapshotInternal.
-#
 # Returns:
 #     - nothing on success
 #     - If @device is not a valid block device, GenericError
diff --git a/qapi/block-export.json b/qapi/block-export.json
index 7874a49ba71320a849052ffd8ea335722fff83fa..1d255d77e329fd0b15cd5f0642167227eb34c280 100644
--- a/qapi/block-export.json
+++ b/qapi/block-export.json
@@ -28,7 +28,7 @@
 # @max-connections: The maximum number of connections to allow at the
 #     same time, 0 for unlimited. Setting this to 1 also stops the
 #     server from advertising multiple client support (since 5.2;
-#     default: 0)
+#     default: 100)
 #
 # Since: 4.2
 ##
@@ -63,7 +63,7 @@
 # @max-connections: The maximum number of connections to allow at the
 #     same time, 0 for unlimited. Setting this to 1 also stops the
 #     server from advertising multiple client support (since 5.2;
-#     default: 0).
+#     default: 100).
 #
 # Returns: error if the server is already running.
 #
diff --git a/qapi/crypto.json b/qapi/crypto.json
index fd3d46ebd12821fbed62196e76b87ef25be6802e..af38f0a4bd9e021a382a2a247c887f4df2596cf9 100644
--- a/qapi/crypto.json
+++ b/qapi/crypto.json
@@ -58,11 +58,13 @@
 #
 # @ripemd160: RIPEMD-160. (since 2.7)
 #
+# @sm3: SM3.
(since 8.2.0)
+#
 # Since: 2.6
 ##
 { 'enum': 'QCryptoHashAlgorithm',
   'prefix': 'QCRYPTO_HASH_ALG',
-  'data': ['md5', 'sha1', 'sha224', 'sha256', 'sha384', 'sha512', 'ripemd160']}
+  'data': ['md5', 'sha1', 'sha224', 'sha256', 'sha384', 'sha512', 'ripemd160', 'sm3']}

 ##
 # @QCryptoCipherAlgorithm:
@@ -94,6 +96,8 @@
 #
 # @twofish-256: Twofish with 256 bit / 32 byte keys
 #
+# @sm4: SM4 with 128 bit / 16 byte keys (since 9.0)
+#
 # Since: 2.6
 ##
 { 'enum': 'QCryptoCipherAlgorithm',
@@ -102,7 +106,8 @@
             'des', '3des', 'cast5-128',
             'serpent-128', 'serpent-192', 'serpent-256',
-            'twofish-128', 'twofish-192', 'twofish-256']}
+            'twofish-128', 'twofish-192', 'twofish-256',
+            'sm4']}

 ##
 # @QCryptoCipherMode:
diff --git a/qapi/migration.json b/qapi/migration.json
index eb2f8835133480e8439716303a1b97a87a1f67c8..37e1d4857e5cef6b44d9eeea54db56daf97e9015 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -625,11 +625,23 @@
 #
 # @zstd: use zstd compression method.
 #
+# @qatzip: use qatzip compression method. (Since 9.2)
+#
+# @qpl: use qpl compression method. The Query Processing Library (qpl)
+#     is based on the deflate compression algorithm and uses the Intel
+#     In-Memory Analytics Accelerator (IAA) for accelerated compression
+#     and decompression. (Since 9.1)
+#
+# @uadk: use UADK library compression method. (Since 9.1)
+#
 # Since: 5.0
 ##
 { 'enum': 'MultiFDCompression',
   'data': [ 'none', 'zlib',
-            { 'name': 'zstd', 'if': 'CONFIG_ZSTD' } ] }
+            { 'name': 'zstd', 'if': 'CONFIG_ZSTD' },
+            { 'name': 'qatzip', 'if': 'CONFIG_QATZIP'},
+            { 'name': 'qpl', 'if': 'CONFIG_QPL' },
+            { 'name': 'uadk', 'if': 'CONFIG_UADK' } ] }

 ##
 # @MigMode:
@@ -653,6 +665,23 @@
 { 'enum': 'MigMode',
   'data': [ 'normal', 'cpr-reboot' ] }

+##
+# @ZeroPageDetection:
+#
+# @none: Do not perform zero page checking.
+#
+# @legacy: Perform zero page checking in the main migration thread.
+#
+# @multifd: Perform zero page checking in the multifd sender thread if
+#     multifd migration is enabled, else in the main migration
+#     thread as for @legacy.
+#
+# Since: 9.0
+##
+{ 'enum': 'ZeroPageDetection',
+  'data': [ 'none', 'legacy', 'multifd' ] }
+
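What @multifd moves off the main thread is the scan itself: each sender thread can test its queued pages with buffer_is_zero() and transfer payload only for non-zero pages, recording the rest in the packet header. A simplified sketch — the MultiFDSendParams fields used here are illustrative, not the exact layout:

    /* Sketch only: zero-page scan inside a multifd sender thread. */
    static void multifd_scan_zero_pages(MultiFDSendParams *p)
    {
        size_t page_size = qemu_target_page_size();
        uint32_t i;

        for (i = 0; i < p->pages->num; i++) {
            uint8_t *page = p->pages->block->host + p->pages->offset[i];

            if (buffer_is_zero(page, page_size)) {
                /* header-only: no payload bytes for this page */
                p->zero[p->zero_num++] = p->pages->offset[i];
            } else {
                p->normal[p->normal_num++] = p->pages->offset[i];
            }
        }
    }
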
 ##
 # @BitmapMigrationBitmapAliasTransform:
 #
@@ -708,6 +737,20 @@
               'bitmaps': [ 'BitmapMigrationBitmapAlias' ] } }

+##
+# @CompressMethod:
+#
+# An enumeration of multi-thread compression methods.
+#
+# @zlib: use zlib compression method.
+#
+# @zstd: use zstd compression method.
+#
+# Since: 5.0
+##
+{ 'enum': 'CompressMethod',
+  'data': [ 'zlib', { 'name': 'zstd', 'if': 'CONFIG_ZSTD' } ] }
+
 ##
 # @MigrationParameter:
 #
@@ -746,6 +789,9 @@
 #     fast as compression, so set the decompress-threads to the number
 #     about 1/4 of compress-threads is adequate.
 #
+# @compress-method: Which multi-thread compression method to use.
+#     Defaults to none. (Since 5.0)
+#
 # @throttle-trigger-threshold: The ratio of bytes_dirty_period and
 #     bytes_xfer_period to trigger throttling. It is expressed as
 #     percentage. The default value is 50. (Since 5.0)
@@ -842,6 +888,11 @@
 #     speed, and 9 means best compression ratio which will consume
 #     more CPU. Defaults to 1. (Since 5.0)
 #
+# @multifd-qatzip-level: Set the compression level to be used in live
+#     migration. The level is an integer between 1 and 9, where 1 means
+#     the best compression speed, and 9 means the best compression
+#     ratio which will consume more CPU. Defaults to 1. (Since 9.2)
+#
 # @multifd-zstd-level: Set the compression level to be used in live
 #     migration, the compression level is an integer between 0 and 20,
 #     where 0 means no compression, 1 means the best compression
@@ -874,6 +925,24 @@
 # @mode: Migration mode. See description in @MigMode. Default is 'normal'.
 #     (Since 8.2)
 #
+# @zero-page-detection: Whether and how to detect zero pages.
+#     See description in @ZeroPageDetection. Default is 'legacy'.
+#     (since 9.0)
+#
+# @sev-pdh: The target host platform Diffie-Hellman key encoded in base64,
+#     or the PDH filename for Hygon (Since 4.2)
+#
+# @sev-plat-cert: The target host platform certificate chain encoded in
+#     base64, or the platform cert filename for Hygon (Since 4.2)
+#
+# @sev-amd-cert: AMD certificate chain which includes the ASK and OCA,
+#     encoded in base64, or the vendor cert filename for Hygon (Since 4.2)
+#
+# @hdbss-buffer-size: Size of the HDBSS (Hardware Dirty state tracking
+#     Structure). Defaults to 0. (Since 8.6)
+#
 # Features:
 #
 # @deprecated: Member @block-incremental is deprecated. Use
@@ -892,6 +961,7 @@
            { 'name': 'compress-level', 'features': [ 'deprecated' ] },
            { 'name': 'compress-threads', 'features': [ 'deprecated' ] },
            { 'name': 'decompress-threads', 'features': [ 'deprecated' ] },
+           { 'name': 'compress-method', 'features': [ 'deprecated' ] },
            { 'name': 'compress-wait-thread', 'features': [ 'deprecated' ] },
            'throttle-trigger-threshold', 'cpu-throttle-initial',
            'cpu-throttle-increment',
@@ -904,10 +974,13 @@
            'xbzrle-cache-size', 'max-postcopy-bandwidth',
            'max-cpu-throttle', 'multifd-compression',
            'multifd-zlib-level', 'multifd-zstd-level',
+           'multifd-qatzip-level',
            'block-bitmap-mapping',
            { 'name': 'x-vcpu-dirty-limit-period', 'features': ['unstable'] },
            'vcpu-dirty-limit',
-           'mode'] }
+           'mode',
+           'zero-page-detection',
+           'sev-pdh', 'sev-plat-cert', 'sev-amd-cert', 'hdbss-buffer-size'] }

 ##
 # @MigrateSetParameters:
@@ -935,6 +1008,9 @@
 #
 # @decompress-threads: decompression thread count
 #
+# @compress-method: Set the compression method to use in multi-thread
+#     compression. Defaults to none. (Since 5.0)
+#
 # @throttle-trigger-threshold: The ratio of bytes_dirty_period and
 #     bytes_xfer_period to trigger throttling. It is expressed as
 #     percentage. The default value is 50. (Since 5.0)
@@ -1030,6 +1106,11 @@
 #     speed, and 9 means best compression ratio which will consume
 #     more CPU. Defaults to 1. (Since 5.0)
 #
+# @multifd-qatzip-level: Set the compression level to be used in live
+#     migration. The level is an integer between 1 and 9, where 1 means
+#     the best compression speed, and 9 means the best compression
+#     ratio which will consume more CPU. Defaults to 1. (Since 9.2)
+#
 # @multifd-zstd-level: Set the compression level to be used in live
 #     migration, the compression level is an integer between 0 and 20,
 #     where 0 means no compression, 1 means the best compression
@@ -1062,12 +1143,31 @@
 # @mode: Migration mode. See description in @MigMode. Default is 'normal'.
 #     (Since 8.2)
 #
+# @zero-page-detection: Whether and how to detect zero pages.
+#     See description in @ZeroPageDetection. Default is 'legacy'.
+#     (since 9.0)
+#
+# @sev-pdh: The target host platform Diffie-Hellman key encoded in base64,
+#     or the PDH filename for Hygon (Since 4.2)
+#
+# @sev-plat-cert: The target host platform certificate chain encoded in
+#     base64, or the platform cert filename for Hygon (Since 4.2)
+#
+# @sev-amd-cert: AMD certificate chain which includes the ASK and OCA,
+#     encoded in base64, or the vendor cert filename for Hygon (Since 4.2)
+#
+# @hdbss-buffer-size: Size of the HDBSS (Hardware Dirty state tracking
+#     Structure). Defaults to 0. (Since 8.6)
+#
 # Features:
 #
 # @deprecated: Member @block-incremental is deprecated. Use
 #     blockdev-mirror with NBD instead. Members @compress-level,
-#     @compress-threads, @decompress-threads and @compress-wait-thread
-#     are deprecated because @compression is deprecated.
+#     @compress-threads, @decompress-threads, @compress-method
+#     and @compress-wait-thread are deprecated because
+#     @compression is deprecated.
 #
 # @unstable: Members @x-checkpoint-delay and @x-vcpu-dirty-limit-period
 #     are experimental.
@@ -1090,6 +1190,8 @@
             'features': [ 'deprecated' ] },
             '*decompress-threads': { 'type': 'uint8',
                                      'features': [ 'deprecated' ] },
+            '*compress-method': { 'type': 'CompressMethod',
+                                  'features': [ 'deprecated' ] },
             '*throttle-trigger-threshold': 'uint8',
             '*cpu-throttle-initial': 'uint8',
             '*cpu-throttle-increment': 'uint8',
@@ -1110,12 +1212,18 @@
             '*max-cpu-throttle': 'uint8',
             '*multifd-compression': 'MultiFDCompression',
             '*multifd-zlib-level': 'uint8',
+            '*multifd-qatzip-level': 'uint8',
             '*multifd-zstd-level': 'uint8',
             '*block-bitmap-mapping': [ 'BitmapMigrationNodeAlias' ],
             '*x-vcpu-dirty-limit-period': { 'type': 'uint64',
                                             'features': [ 'unstable' ] },
             '*vcpu-dirty-limit': 'uint64',
-            '*mode': 'MigMode'} }
+            '*mode': 'MigMode',
+            '*zero-page-detection': 'ZeroPageDetection',
+            '*sev-pdh': 'StrOrNull',
+            '*sev-plat-cert': 'StrOrNull',
+            '*sev-amd-cert' : 'StrOrNull',
+            '*hdbss-buffer-size': 'uint8'} }

 ##
 # @migrate-set-parameters:
@@ -1161,6 +1269,9 @@
 #
 # @decompress-threads: decompression thread count
 #
+# @compress-method: Which multi-thread compression method to use.
+#     Defaults to none. (Since 5.0)
+#
 # @throttle-trigger-threshold: The ratio of bytes_dirty_period and
 #     bytes_xfer_period to trigger throttling. It is expressed as
 #     percentage. The default value is 50. (Since 5.0)
@@ -1258,6 +1369,11 @@
 #     speed, and 9 means best compression ratio which will consume
 #     more CPU. Defaults to 1. (Since 5.0)
 #
+# @multifd-qatzip-level: Set the compression level to be used in live
+#     migration. The level is an integer between 1 and 9, where 1 means
+#     the best compression speed, and 9 means the best compression
+#     ratio which will consume more CPU. Defaults to 1. (Since 9.2)
+#
 # @multifd-zstd-level: Set the compression level to be used in live
 #     migration, the compression level is an integer between 0 and 20,
 #     where 0 means no compression, 1 means the best compression
@@ -1290,6 +1406,24 @@
 # @mode: Migration mode. See description in @MigMode. Default is 'normal'.
 #     (Since 8.2)
 #
+# @zero-page-detection: Whether and how to detect zero pages.
+#     See description in @ZeroPageDetection. Default is 'legacy'.
+#     (since 9.0)
+#
+# @sev-pdh: The target host platform Diffie-Hellman key encoded in base64,
+#     or the PDH filename for Hygon (Since 4.2)
+#
+# @sev-plat-cert: The target host platform certificate chain encoded in
+#     base64, or the platform cert filename for Hygon (Since 4.2)
+#
+# @sev-amd-cert: AMD certificate chain which includes the ASK and OCA,
+#     encoded in base64, or the vendor cert filename for Hygon (Since 4.2)
+#
+# @hdbss-buffer-size: Size of the HDBSS (Hardware Dirty state tracking
+#     Structure). Defaults to 0. (Since 8.6)
+#
 # Features:
 #
 # @deprecated: Member @block-incremental is deprecated. Use
@@ -1315,6 +1449,8 @@
             'features': [ 'deprecated' ] },
             '*decompress-threads': { 'type': 'uint8',
                                      'features': [ 'deprecated' ] },
+            '*compress-method': { 'type': 'CompressMethod',
+                                  'features': [ 'deprecated' ] },
             '*throttle-trigger-threshold': 'uint8',
             '*cpu-throttle-initial': 'uint8',
             '*cpu-throttle-increment': 'uint8',
@@ -1335,12 +1471,18 @@
             '*max-cpu-throttle': 'uint8',
             '*multifd-compression': 'MultiFDCompression',
             '*multifd-zlib-level': 'uint8',
+            '*multifd-qatzip-level': 'uint8',
             '*multifd-zstd-level': 'uint8',
             '*block-bitmap-mapping': [ 'BitmapMigrationNodeAlias' ],
             '*x-vcpu-dirty-limit-period': { 'type': 'uint64',
                                             'features': [ 'unstable' ] },
             '*vcpu-dirty-limit': 'uint64',
-            '*mode': 'MigMode'} }
+            '*mode': 'MigMode',
+            '*zero-page-detection': 'ZeroPageDetection',
+            '*sev-pdh': 'str',
+            '*sev-plat-cert': 'str',
+            '*sev-amd-cert' : 'str',
+            '*hdbss-buffer-size': 'uint8'} }

 ##
 # @query-migrate-parameters:
@@ -1418,6 +1560,30 @@
 { 'event': 'MIGRATION_PASS',
   'data': { 'pass': 'int' } }

+##
+# @MIGRATION_MULTIFD_PID:
+#
+# Emitted when a multifd thread is created
+#
+# @pid: PID of the multifd thread
+#
+# Since: EulerOS Virtual
+##
+{ 'event': 'MIGRATION_MULTIFD_PID',
+  'data': { 'pid': 'int' } }
+
+##
+# @MIGRATION_PID:
+#
+# Emitted when the migration thread is created
+#
+# @pid: PID of the migration thread
+#
+# Since: EulerOS Virtual
+##
+{ 'event': 'MIGRATION_PID',
+  'data': { 'pid': 'int' } }
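
The QAPI generator turns each event definition into a qapi_event_send_*() helper, so emitting these from migration code reduces to a single call once a thread knows its id. A sketch under that assumption (the wrapper function is illustrative):

    /* Sketch only: raising the new event from a migration thread. */
    #include "qapi/qapi-events-migration.h"

    static void migration_announce_thread_pid(void)
    {
        /* lets management software pin or account the worker thread;
         * multifd threads would use qapi_event_send_migration_multifd_pid() */
        qapi_event_send_migration_pid(qemu_get_thread_id());
    }
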

 ##
 # @COLOMessage:
 #
diff --git a/qapi/misc-target.json b/qapi/misc-target.json
index 88291453ba476a26a4b5583f79a553857aa061f5..3df0062f3dc872bd50c0e014b243f6e3fb088322 100644
--- a/qapi/misc-target.json
+++ b/qapi/misc-target.json
@@ -487,3 +487,34 @@
 { 'command': 'xen-event-inject',
   'data': { 'port': 'uint32' },
   'if': 'TARGET_I386' }
+
+##
+# @VirtccaCapability:
+#
+# This struct describes the capability of the VIRTCCA feature.
+#
+# Since: 8.2.0
+##
+{ 'struct': 'VirtccaCapability',
+  'data': { 'enabled': 'bool' },
+  'if': { 'all': ['TARGET_AARCH64' , 'CONFIG_KVM'] }
+}
+
+##
+# @query-virtcca-capabilities:
+#
+# This command is used to get the VIRTCCA capabilities, and is supported
+# on HiSilicon AArch64 platforms only.
+#
+# Returns: a VirtccaCapability object.
+#
+# Since: 8.2.0
+#
+# Example:
+#
+# -> { "execute": "query-virtcca-capabilities" }
+# <- { "return": { "enabled": true } }
+##
+{ 'command': 'query-virtcca-capabilities', 'returns': 'VirtccaCapability',
+  'if': { 'all': ['TARGET_AARCH64' , 'CONFIG_KVM'] }
+}
\ No newline at end of file
diff --git a/qapi/misc.json b/qapi/misc.json
index cda2effa8155a8a6175036f678877ec2c7cdd1bf..1832d5f4608b86bd9d1e9a85fd291a386b6a4d15 100644
--- a/qapi/misc.json
+++ b/qapi/misc.json
@@ -550,6 +550,15 @@
  'returns': ['CommandLineOptionInfo'],
  'allow-preconfig': true}

+##
+# @query-rtc-date-diff:
+#
+# Get the VM's RTC time offset
+#
+# Since: 2.8
+##
+{ 'command': 'query-rtc-date-diff', 'returns': 'int64' }
+
 ##
 # @RTC_CHANGE:
 #
diff --git a/qapi/pragma.json b/qapi/pragma.json
index 0aa4eeddd3893bee760fb28ab1d7222dfb3c13ed..7a07b44bb1089d02e56ed185af1368a3bb41fb1b 100644
--- a/qapi/pragma.json
+++ b/qapi/pragma.json
@@ -30,7 +30,8 @@
         'qom-get',
         'query-tpm-models',
         'query-tpm-types',
-        'ringbuf-read' ],
+        'ringbuf-read',
+        'query-rtc-date-diff'],
     # Externally visible types whose member names may use uppercase
     'member-name-exceptions': [     # visible in:
         'ACPISlotType',             # query-acpi-ospm-status
diff --git a/qapi/qmp-dispatch.c b/qapi/qmp-dispatch.c
index 555528b6bbd357fbbc9048647decfa70e36038a2..e33efd3740cbc12bdf9036d5eaa0d011d2defb9f 100644
--- a/qapi/qmp-dispatch.c
+++ b/qapi/qmp-dispatch.c
@@ -24,6 +24,8 @@
 #include "qapi/qmp/qbool.h"
 #include "qemu/coroutine.h"
 #include "qemu/main-loop.h"
+#include "qemu/log.h"
+#include "qapi/qmp/qstring.h"

 Visitor *qobject_input_visitor_new_qmp(QObject *obj)
 {
@@ -146,6 +148,7 @@ QDict *coroutine_mixed_fn qmp_dispatch(const QmpCommandList *cmds, QObject *requ
     QObject *id;
     QObject *ret = NULL;
     QDict *rsp = NULL;
+    GString *json;

     dict = qobject_to(QDict, request);
     if (!dict) {
@@ -203,8 +206,22 @@ QDict *coroutine_mixed_fn qmp_dispatch(const QmpCommandList *cmds, QObject *requ
         qobject_ref(args);
     }

+    json = qobject_to_json(QOBJECT(args));
+    if (json) {
+        if ((strcmp(command, "query-block-jobs") != 0)
+            && (strcmp(command, "query-migrate") != 0)
+            && (strcmp(command, "query-blockstats") != 0)
+            && (strcmp(command, "query-balloon") != 0)
+            && (strcmp(command, "set_password") != 0)) {
+            qemu_log("qmp_cmd_name: %s, arguments: %s\n",
+                     command, json->str);
+        }
+        g_string_free(json, true);
+    }
+
     assert(!(oob && qemu_in_coroutine()));
     assert(monitor_cur() == NULL);
     if (!!(cmd->options & QCO_COROUTINE) == qemu_in_coroutine()) {
         monitor_set_cur(qemu_coroutine_self(), cur_mon);
         cmd->fn(args, &ret, &err);
diff --git a/qapi/qom.json b/qapi/qom.json
index c53ef978ff7e6a81f8c926159fe52c0d349696ac..a5336e6b11accaf35668f3a7ea4f499486556603 100644
--- a/qapi/qom.json
+++ b/qapi/qom.json
@@ -794,6 +794,23 @@
 { 'struct': 'VfioUserServerProperties',
   'data': { 'socket': 'SocketAddress', 'device': 'str' } }

+##
+# @IOMMUFDProperties:
+#
+# Properties for iommufd objects.
+#
+# @fd: file descriptor name previously passed via 'getfd' command,
+#     which represents a pre-opened /dev/iommu. This allows the
+#     iommufd object to be shared across several subsystems
+#     (VFIO, VDPA, ...), and the file descriptor to be shared
+#     with other processes, e.g. DPDK.  (default: QEMU opens
+#     /dev/iommu by itself)
+#
+# Since: 9.0
+##
+{ 'struct': 'IOMMUFDProperties',
+  'data': { '*fd': 'str' } }
+
 ##
 # @RngProperties:
 #
@@ -866,6 +883,14 @@
 #     designated guest firmware page for measured boot with -kernel
 #     (default: false) (since 6.2)
 #
+# @user-id: the user id of the guest owner; only supported on Hygon CPUs
+#     (since 8.2)
+#
+# @secret-header-file: the header file of the guest owner's secret; only
+#     supported on Hygon CPUs (since 8.2)
+#
+# @secret-file: the file containing the guest owner's secret; only
+#     supported on Hygon CPUs (since 8.2)
+#
 # Since: 2.12
 ##
 { 'struct': 'SevGuestProperties',
@@ -876,7 +901,10 @@
             '*handle': 'uint32',
             '*cbitpos': 'uint32',
             'reduced-phys-bits': 'uint32',
-            '*kernel-hashes': 'bool' } }
+            '*kernel-hashes': 'bool',
+            '*user-id': 'str',
+            '*secret-header-file': 'str',
+            '*secret-file': 'str' } }

 ##
 # @ThreadContextProperties:
@@ -899,6 +927,30 @@
   'data': { '*cpu-affinity': ['uint16'],
             '*node-affinity': ['uint16'] } }

+##
+# @TmmGuestMeasurementAlgo:
+#
+# Algorithm to use for CVM measurements
+#
+# Since: FIXME
+##
+{ 'enum': 'TmmGuestMeasurementAlgo',
+  'data': ['default', 'sha256', 'sha512'] }
+
+##
+# @TmmGuestProperties:
+#
+# Properties for tmm-guest objects.
+#
+# @sve-vector-length: SVE vector length (default: 0, SVE disabled)
+#
+# Since: FIXME
+##
+{ 'struct': 'TmmGuestProperties',
+  'data': { '*sve-vector-length': 'uint32',
+            '*num-pmu-counters': 'uint32',
+            '*kae': 'uint32',
+            '*measurement-algo': 'TmmGuestMeasurementAlgo' } }

 ##
 # @ObjectType:
@@ -934,6 +986,7 @@
     'input-barrier',
     { 'name': 'input-linux',
       'if': 'CONFIG_LINUX' },
+    'iommufd',
     'iothread',
     'main-loop',
     { 'name': 'memory-backend-epc',
@@ -962,7 +1015,8 @@
     'tls-creds-x509',
     'tls-cipher-suites',
     { 'name': 'x-remote-object', 'features': [ 'unstable' ] },
-    { 'name': 'x-vfio-user-server', 'features': [ 'unstable' ] }
+    { 'name': 'x-vfio-user-server', 'features': [ 'unstable' ] },
+    'tmm-guest'
   ] }

 ##
@@ -1003,6 +1057,7 @@
     'input-barrier':                'InputBarrierProperties',
     'input-linux':                  { 'type': 'InputLinuxProperties',
                                       'if': 'CONFIG_LINUX' },
+    'iommufd':                      'IOMMUFDProperties',
     'iothread':                     'IothreadProperties',
     'main-loop':                    'MainLoopProperties',
     'memory-backend-epc':           { 'type': 'MemoryBackendEpcProperties',
@@ -1029,7 +1084,8 @@
     'tls-creds-x509':               'TlsCredsX509Properties',
     'tls-cipher-suites':            'TlsCredsProperties',
     'x-remote-object':              'RemoteObjectProperties',
-    'x-vfio-user-server':           'VfioUserServerProperties'
+    'x-vfio-user-server':           'VfioUserServerProperties',
+    'tmm-guest':                    'TmmGuestProperties'
   } }

 ##
diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx
index 068692d13ebd944b78ded2c76b5f1b616b351b4d..20bdcd7b8244a1b55ca30fed49dcb83847775f98 100644
--- a/qemu-img-cmds.hx
+++ b/qemu-img-cmds.hx
@@ -52,9 +52,9 @@
 SRST
 ERST

 DEF("create", img_create,
-    "create [--object objectdef] [-q] [-f fmt] [-b backing_file [-F backing_fmt]] [-u] [-o options] filename [size]")
+    "create [--object objectdef] [-q] [-f fmt] [-b backing_file [-F backing_fmt]] [-u] [-t cache] [-o options] filename [size]")
 SRST
-.. option:: create [--object OBJECTDEF] [-q] [-f FMT] [-b BACKING_FILE [-F BACKING_FMT]] [-u] [-o OPTIONS] FILENAME [SIZE]
diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx
index 068692d13ebd944b78ded2c76b5f1b616b351b4d..20bdcd7b8244a1b55ca30fed49dcb83847775f98 100644
--- a/qemu-img-cmds.hx
+++ b/qemu-img-cmds.hx
@@ -52,9 +52,9 @@ SRST
 ERST
 
 DEF("create", img_create,
-    "create [--object objectdef] [-q] [-f fmt] [-b backing_file [-F backing_fmt]] [-u] [-o options] filename [size]")
+    "create [--object objectdef] [-q] [-f fmt] [-b backing_file [-F backing_fmt]] [-u] [-t cache] [-o options] filename [size]")
 SRST
-.. option:: create [--object OBJECTDEF] [-q] [-f FMT] [-b BACKING_FILE [-F BACKING_FMT]] [-u] [-o OPTIONS] FILENAME [SIZE]
+.. option:: create [--object OBJECTDEF] [-q] [-f FMT] [-b BACKING_FILE [-F BACKING_FMT]] [-u] [-t CACHE] [-o OPTIONS] FILENAME [SIZE]
 ERST
 
 DEF("dd", img_dd,
diff --git a/qemu-img.c b/qemu-img.c
index 5a77f67719316a103255189bea89d3702a0e1eda..49d914c9c4f7ca8fe4a0a0a7bdf4282bc3cabdd8 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -508,6 +508,22 @@ static int64_t cvtnum(const char *name, const char *value)
     return cvtnum_full(name, value, 0, INT64_MAX);
 }
 
+static bool is_reg_file(const char *filename)
+{
+    struct stat st;
+
+    /* file does not exist; it will be created later, so it's a regular file */
+    if (access(filename, F_OK) == -1) {
+        return true;
+    }
+
+    /* file exists; check its type */
+    if (stat(filename, &st) >= 0 && S_ISREG(st.st_mode)) {
+        return true;
+    }
+    return false;
+}
+
 static int img_create(int argc, char **argv)
 {
     int c;
@@ -516,6 +532,7 @@ static int img_create(int argc, char **argv)
     const char *base_fmt = NULL;
     const char *filename;
     const char *base_filename = NULL;
+    const char *cache = BDRV_DEFAULT_CACHE;
     char *options = NULL;
     Error *local_err = NULL;
     bool quiet = false;
@@ -527,7 +544,7 @@ static int img_create(int argc, char **argv)
             {"object", required_argument, 0, OPTION_OBJECT},
             {0, 0, 0, 0}
         };
-        c = getopt_long(argc, argv, ":F:b:f:ho:qu",
+        c = getopt_long(argc, argv, ":F:b:f:t:ho:qu",
                         long_options, NULL);
         if (c == -1) {
             break;
@@ -551,6 +568,9 @@ static int img_create(int argc, char **argv)
         case 'f':
            fmt = optarg;
            break;
+        case 't':
+            cache = optarg;
+            break;
         case 'o':
             if (accumulate_options(&options, optarg) < 0) {
                 goto fail;
@@ -594,6 +614,16 @@ static int img_create(int argc, char **argv)
         error_exit("Unexpected argument: %s", argv[optind]);
     }
 
+    if (is_reg_file(filename)) {
+        if (!options) {
+            options = g_strdup_printf(BLOCK_OPT_CACHE"=%s", cache);
+        } else {
+            char *old_options = options;
+            options = g_strdup_printf("%s,"BLOCK_OPT_CACHE"=%s", options, cache);
+            g_free(old_options);
+        }
+    }
+
     bdrv_img_create(filename, fmt, base_filename, base_fmt, options,
                     img_size, flags, quiet, &local_err);
     if (local_err) {
diff --git a/qemu-nbd.c b/qemu-nbd.c
index 186e6468b1abf2fde0d911bececb1e60d9fd266a..8b09cb5e2a8a904113800d58d1755f4f91e4f1b0 100644
--- a/qemu-nbd.c
+++ b/qemu-nbd.c
@@ -389,7 +389,9 @@ static void nbd_accept(QIONetListener *listener, QIOChannelSocket *cioc,
     nb_fds++;
     nbd_update_server_watch();
 
-    nbd_client_new(cioc, tlscreds, tlsauthz, nbd_client_closed);
+    /* TODO - expose handshake timeout as command line option */
+    nbd_client_new(cioc, NBD_DEFAULT_HANDSHAKE_MAX_SECS,
+                   tlscreds, tlsauthz, nbd_client_closed, NULL);
 }
 
 static void nbd_update_server_watch(void)
@@ -587,7 +589,8 @@ int main(int argc, char **argv)
     pthread_t client_thread;
     const char *fmt = NULL;
     Error *local_err = NULL;
-    BlockdevDetectZeroesOptions detect_zeroes = BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF;
+    BlockdevDetectZeroesOptions detect_zeroes =
+        BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF;
     QDict *options = NULL;
     const char *export_name = NULL; /* defaults to "" later for server mode */
     const char *export_description = NULL;
@@ -843,6 +846,10 @@ int main(int argc, char **argv)
     trace_init_file();
     qemu_set_log(LOG_TRACE, &error_fatal);
 
+    if (!seen_aio && (flags & BDRV_O_NOCACHE)) {
+        flags |= BDRV_O_NATIVE_AIO;
+    }
+
     socket_activation = check_socket_activation();
     if (socket_activation == 0) {
         if (!sockpath) {
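A quick sketch of the new ``-t`` option wired up above; per the img_create() hunk, the cache mode is only folded into the creation options for regular files and defaults to BDRV_DEFAULT_CACHE (the mode names follow the usual qemu-img cache modes such as writeback or none):

    # create a qcow2 image, bypassing the host page cache while creating it
    qemu-img create -f qcow2 -t none test.qcow2 10G

diff --git a/qemu-options.hx b/qemu-options.hx
index 42fd09e4de96e962cd5873c49501f6e1dbb5e346..b09d692d5b4b9d51a3af27bef86aefac8550d24c 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -149,14 +149,14 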
@@ SRST platform and configuration dependent. ``interleave-granularity=granularity`` sets the granularity of - interleave. Default 256KiB. Only 256KiB, 512KiB, 1024KiB, 2048KiB - 4096KiB, 8192KiB and 16384KiB granularities supported. + interleave. Default 256 (bytes). Only 256, 512, 1k, 2k, + 4k, 8k and 16k granularities supported. Example: :: - -machine cxl-fmw.0.targets.0=cxl.0,cxl-fmw.0.targets.1=cxl.1,cxl-fmw.0.size=128G,cxl-fmw.0.interleave-granularity=512k + -machine cxl-fmw.0.targets.0=cxl.0,cxl-fmw.0.targets.1=cxl.1,cxl-fmw.0.size=128G,cxl-fmw.0.interleave-granularity=512 ERST DEF("M", HAS_ARG, QEMU_OPTION_M, @@ -2679,7 +2679,7 @@ DEF("smbios", HAS_ARG, QEMU_OPTION_smbios, " specify SMBIOS type 3 fields\n" "-smbios type=4[,sock_pfx=str][,manufacturer=str][,version=str][,serial=str]\n" " [,asset=str][,part=str][,max-speed=%d][,current-speed=%d]\n" - " [,processor-id=%d]\n" + " [,processor-family=%d][,processor-id=%d]\n" " specify SMBIOS type 4 fields\n" "-smbios type=8[,external_reference=str][,internal_reference=str][,connector_type=%d][,port_type=%d]\n" " specify SMBIOS type 8 fields\n" @@ -2690,7 +2690,7 @@ DEF("smbios", HAS_ARG, QEMU_OPTION_smbios, " specify SMBIOS type 17 fields\n" "-smbios type=41[,designation=str][,kind=str][,instance=%d][,pcidev=str]\n" " specify SMBIOS type 41 fields\n", - QEMU_ARCH_I386 | QEMU_ARCH_ARM | QEMU_ARCH_LOONGARCH) + QEMU_ARCH_I386 | QEMU_ARCH_ARM | QEMU_ARCH_LOONGARCH | QEMU_ARCH_RISCV) SRST ``-smbios file=binary`` Load SMBIOS entry from binary file. @@ -2707,7 +2707,7 @@ SRST ``-smbios type=3[,manufacturer=str][,version=str][,serial=str][,asset=str][,sku=str]`` Specify SMBIOS type 3 fields -``-smbios type=4[,sock_pfx=str][,manufacturer=str][,version=str][,serial=str][,asset=str][,part=str][,processor-id=%d]`` +``-smbios type=4[,sock_pfx=str][,manufacturer=str][,version=str][,serial=str][,asset=str][,part=str][,processor-family=%d][,processor-id=%d]`` Specify SMBIOS type 4 fields ``-smbios type=11[,value=str][,path=filename]`` @@ -5224,6 +5224,18 @@ SRST The ``share`` boolean option is on by default with memfd. + ``-object iommufd,id=id[,fd=fd]`` + Creates an iommufd backend which allows control of DMA mapping + through the ``/dev/iommu`` device. + + The ``id`` parameter is a unique ID which frontends (such as + vfio-pci of vdpa) will use to connect with the iommufd backend. + + The ``fd`` parameter is an optional pre-opened file descriptor + resulting from ``/dev/iommu`` opening. Usually the iommufd is shared + across all subsystems, bringing the benefit of centralized + reference counting. + ``-object rng-builtin,id=id`` Creates a random number generator backend which obtains entropy from QEMU builtin functions. 
The ``id`` parameter is a unique ID @@ -5637,7 +5649,7 @@ SRST -object secret,id=sec0,keyid=secmaster0,format=base64,\\ data=$SECRET,iv=$(= 0) { ga_unset_frozen(ga_state); + slog("guest-fsthaw called"); execute_fsfreeze_hook(FSFREEZE_HOOK_THAW, errp); } else { ret = 0; diff --git a/qga/commands-win32.c b/qga/commands-win32.c index 697c65507caa06889e4aae01cebe9f87e35eb4b0..656d1459f14858ee8aa04a3bfb6d27261e20a853 100644 --- a/qga/commands-win32.c +++ b/qga/commands-win32.c @@ -1275,6 +1275,9 @@ int64_t qmp_guest_fsfreeze_thaw(Error **errp) qga_vss_fsfreeze(&i, false, NULL, errp); ga_unset_frozen(ga_state); + + slog("guest-fsthaw called"); + return i; } diff --git a/qga/main.c b/qga/main.c index 8668b9f3d39ea224b61fd6201b0b19e258b6c2c7..8d341ffdf1e1d45016c4f1ccd4534cd2a235382b 100644 --- a/qga/main.c +++ b/qga/main.c @@ -1399,7 +1399,7 @@ static GAState *initialize_agent(GAConfig *config, int socket_activation) if (g_mkdir_with_parents(config->state_dir, S_IRWXU) == -1) { g_critical("unable to create (an ancestor of) the state directory" " '%s': %s", config->state_dir, strerror(errno)); - return NULL; + goto failed; } #endif @@ -1407,7 +1407,6 @@ static GAState *initialize_agent(GAConfig *config, int socket_activation) if (config->daemonize) { /* delay opening/locking of pidfile till filesystems are unfrozen */ s->deferred_options.pid_filepath = config->pid_filepath; - become_daemon(NULL); } if (config->log_filepath) { /* delay opening the log file till filesystems are unfrozen */ @@ -1416,15 +1415,12 @@ static GAState *initialize_agent(GAConfig *config, int socket_activation) ga_disable_logging(s); qmp_for_each_command(&ga_commands, ga_disable_not_allowed_freeze, NULL); } else { - if (config->daemonize) { - become_daemon(config->pid_filepath); - } if (config->log_filepath) { FILE *log_file = ga_open_logfile(config->log_filepath); if (!log_file) { g_critical("unable to open specified log file: %s", strerror(errno)); - return NULL; + goto failed; } s->log_file = log_file; } @@ -1435,7 +1431,7 @@ static GAState *initialize_agent(GAConfig *config, int socket_activation) s->pstate_filepath, ga_is_frozen(s))) { g_critical("failed to load persistent state"); - return NULL; + goto failed; } if (config->allowedrpcs) { @@ -1465,7 +1461,7 @@ static GAState *initialize_agent(GAConfig *config, int socket_activation) #ifndef _WIN32 if (!register_signal_handlers()) { g_critical("failed to register signal handlers"); - return NULL; + goto failed; } #endif @@ -1478,12 +1474,34 @@ static GAState *initialize_agent(GAConfig *config, int socket_activation) s->wakeup_event = CreateEvent(NULL, TRUE, FALSE, TEXT("WakeUp")); if (s->wakeup_event == NULL) { g_critical("CreateEvent failed"); - return NULL; + goto failed; } #endif + if (!channel_init(s, s->config->method, s->config->channel_path, + s->socket_activation ? FIRST_SOCKET_ACTIVATION_FD : -1)) { + g_critical("failed to initialize guest agent channel"); + return NULL; + } + + if (config->daemonize) { + if (ga_is_frozen(s)) { + become_daemon(NULL); + } else { + become_daemon(config->pid_filepath); + } + } + ga_state = s; return s; +failed: + g_free(s->pstate_filepath); + g_free(s->state_filepath_isfrozen); + if (s->log_file) { + fclose(s->log_file); + } + g_free(s); + return NULL; } static void cleanup_agent(GAState *s) @@ -1508,8 +1526,9 @@ static void cleanup_agent(GAState *s) static int run_agent_once(GAState *s) { - if (!channel_init(s, s->config->method, s->config->channel_path, - s->socket_activation ? 
FIRST_SOCKET_ACTIVATION_FD : -1)) {
+    if (!s->channel &&
+        !channel_init(s, s->config->method, s->config->channel_path,
+                      s->socket_activation ? FIRST_SOCKET_ACTIVATION_FD : -1)) {
         g_critical("failed to initialize guest agent channel");
         return EXIT_FAILURE;
     }
@@ -1518,6 +1537,7 @@ static int run_agent_once(GAState *s)
 
     if (s->channel) {
         ga_channel_free(s->channel);
+        s->channel = NULL;
     }
 
     return EXIT_SUCCESS;
diff --git a/scripts/kvm/vmxcap b/scripts/kvm/vmxcap
index 3fb4d5b3425c3e2364b7715032adc750a2300e1e..508be19c75816fa27a687d0e1946e62e9f38d5e5 100755
--- a/scripts/kvm/vmxcap
+++ b/scripts/kvm/vmxcap
@@ -24,6 +24,7 @@ MSR_IA32_VMX_TRUE_EXIT_CTLS = 0x48F
 MSR_IA32_VMX_TRUE_ENTRY_CTLS = 0x490
 MSR_IA32_VMX_VMFUNC = 0x491
 MSR_IA32_VMX_PROCBASED_CTLS3 = 0x492
+MSR_IA32_VMX_EXIT_CTLS2 = 0x493
 
 class msr(object):
     def __init__(self):
@@ -116,6 +117,7 @@ controls = [
             54: 'INS/OUTS instruction information',
             55: 'IA32_VMX_TRUE_*_CTLS support',
             56: 'Skip checks on event error code',
+            58: 'VMX nested exception support',
         },
         msr = MSR_IA32_VMX_BASIC,
     ),
@@ -219,11 +221,21 @@ controls = [
             23: 'Clear IA32_BNDCFGS',
             24: 'Conceal VM exits from PT',
             25: 'Clear IA32_RTIT_CTL',
+            31: 'Activate secondary VM-exit controls',
         },
         cap_msr = MSR_IA32_VMX_EXIT_CTLS,
         true_cap_msr = MSR_IA32_VMX_TRUE_EXIT_CTLS,
     ),
 
+    Allowed1Control(
+        name = 'secondary VM-Exit controls',
+        bits = {
+            0: 'Save IA32 FRED MSRs',
+            1: 'Load IA32 FRED MSRs',
+        },
+        cap_msr = MSR_IA32_VMX_EXIT_CTLS2,
+    ),
+
     Control(
         name = 'VM-Entry controls',
         bits = {
@@ -237,6 +249,7 @@ controls = [
             16: 'Load IA32_BNDCFGS',
             17: 'Conceal VM entries from PT',
             18: 'Load IA32_RTIT_CTL',
+            23: 'Load IA32 FRED MSRs',
         },
         cap_msr = MSR_IA32_VMX_ENTRY_CTLS,
         true_cap_msr = MSR_IA32_VMX_TRUE_ENTRY_CTLS,
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
index 680fa3f581dafcc43d90e2884d5845fc8b9e1017..8604fe8ffa8e668952fc8c761db1da8fd83702e5 100644
--- a/scripts/meson-buildoptions.sh
+++ b/scripts/meson-buildoptions.sh
@@ -163,6 +163,7 @@ meson_options_help() {
   printf "%s\n" '  pixman          pixman support'
   printf "%s\n" '  plugins         TCG plugins via shared library loading'
   printf "%s\n" '  png             PNG support with libpng'
+  printf "%s\n" '  qatzip          QATzip compression support'
   printf "%s\n" '  pvrdma          Enable PVRDMA support'
   printf "%s\n" '  qcow1           qcow1 image format support'
   printf "%s\n" '  qed             qed image format support'
@@ -222,6 +223,8 @@ meson_options_help() {
   printf "%s\n" '                  Xen PCI passthrough support'
   printf "%s\n" '  xkbcommon       xkbcommon support'
   printf "%s\n" '  zstd            zstd compression support'
+  printf "%s\n" '  qpl             Query Processing Library support'
+  printf "%s\n" '  uadk            UADK Library support'
 }
 _meson_option_parse() {
   case $1 in
@@ -428,6 +431,8 @@ _meson_option_parse() {
     --enable-png) printf "%s" -Dpng=enabled ;;
     --disable-png) printf "%s" -Dpng=disabled ;;
     --prefix=*) quote_sh "-Dprefix=$2" ;;
+    --enable-qatzip) printf "%s" -Dqatzip=enabled ;;
+    --disable-qatzip) printf "%s" -Dqatzip=disabled ;;
    --enable-pvrdma) printf "%s" -Dpvrdma=enabled ;;
    --disable-pvrdma) printf "%s" -Dpvrdma=disabled ;;
    --enable-qcow1) printf "%s" -Dqcow1=enabled ;;
@@ -562,6 +567,10 @@ _meson_option_parse() {
     --disable-xkbcommon) printf "%s" -Dxkbcommon=disabled ;;
     --enable-zstd) printf "%s" -Dzstd=enabled ;;
     --disable-zstd) printf "%s" -Dzstd=disabled ;;
+    --enable-qpl) printf "%s" -Dqpl=enabled ;;
+    --disable-qpl) printf "%s" -Dqpl=disabled ;;
+    --enable-uadk) printf "%s" -Duadk=enabled ;;
+    --disable-uadk) printf "%s" -Duadk=disabled ;;
     *) return 1 ;;
   esac
 }
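The new build switches registered above are driven through configure in the usual way; a minimal sketch, assuming the QATzip, QPL and UADK development packages are installed on the build host:

    ./configure --enable-qatzip --enable-qpl --enable-uadk
    make -j"$(nproc)"
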
diff --git a/scripts/update-linux-headers.sh b/scripts/update-linux-headers.sh
index 34295c0fe55b72bbf4db013c4e96262fa1ebfeac..88c76b8f69b07e9b7b587e27e87578686f99a7a9 100755
--- a/scripts/update-linux-headers.sh
+++ b/scripts/update-linux-headers.sh
@@ -156,6 +156,10 @@ for arch in $ARCHLIST; do
         cp_portable "$tmpdir/bootparam.h" \
                     "$output/include/standard-headers/asm-$arch"
     fi
+    if [ $arch = loongarch ]; then
+        cp "$hdrdir/include/asm/kvm_para.h" "$output/linux-headers/asm-loongarch/"
+        cp "$hdrdir/include/asm/unistd_64.h" "$output/linux-headers/asm-loongarch/"
+    fi
 done
 
 rm -rf "$output/linux-headers/linux"
diff --git a/scsi/qemu-pr-helper.c b/scsi/qemu-pr-helper.c
index c6c6347e9b6a5554772908bc4d45f077b9907934..655404fd078147ca05db9d038792e67052c9eceb 100644
--- a/scsi/qemu-pr-helper.c
+++ b/scsi/qemu-pr-helper.c
@@ -285,9 +285,12 @@ static void multipath_pr_init(void)
 
 static int is_mpath(int fd)
 {
-    struct dm_ioctl dm = { .flags = DM_NOFLUSH_FLAG };
+    struct dm_ioctl dm;
     struct dm_target_spec *tgt;
 
+    memset(&dm, 0, sizeof(struct dm_ioctl));
+    dm.flags = DM_NOFLUSH_FLAG;
+
     tgt = dm_dev_ioctl(fd, DM_TABLE_STATUS, &dm);
     if (!tgt) {
         if (errno == ENXIO) {
diff --git a/system/cpus.c b/system/cpus.c
index a444a747f0164ee9a998ef4b25b49a2efc4607a8..d9de09b9e87b5f5d4eab095e38f7ca2ed942b2ba 100644
--- a/system/cpus.c
+++ b/system/cpus.c
@@ -193,6 +193,20 @@ void cpu_synchronize_pre_loadvm(CPUState *cpu)
     }
 }
 
+void cpus_control_pre_system_reset(void)
+{
+    if (cpus_accel->control_pre_system_reset) {
+        cpus_accel->control_pre_system_reset();
+    }
+}
+
+void cpus_control_post_system_reset(void)
+{
+    if (cpus_accel->control_post_system_reset) {
+        cpus_accel->control_post_system_reset();
+    }
+}
+
 bool cpus_are_resettable(void)
 {
     if (cpus_accel->cpus_are_resettable) {
@@ -551,12 +565,14 @@ static bool all_vcpus_paused(void)
     return true;
 }
 
-void pause_all_vcpus(void)
+static void request_pause_all_vcpus(void)
 {
     CPUState *cpu;
 
-    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
     CPU_FOREACH(cpu) {
+        if (cpu->stopped) {
+            continue;
+        }
         if (qemu_cpu_is_self(cpu)) {
             qemu_cpu_stop(cpu, true);
         } else {
@@ -564,6 +580,14 @@ void pause_all_vcpus(void)
             qemu_cpu_kick(cpu);
         }
     }
+}
+
+void pause_all_vcpus(void)
+{
+    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
+
+retry:
+    request_pause_all_vcpus();
 
     /* We need to drop the replay_lock so any vCPU threads woken up
      * can finish their replay tasks
@@ -572,14 +596,23 @@ void pause_all_vcpus(void)
 
     while (!all_vcpus_paused()) {
         qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
-        CPU_FOREACH(cpu) {
-            qemu_cpu_kick(cpu);
-        }
+        /* While we waited on qemu_pause_cond the BQL was unlocked, so a
+         * vCPU's state may have been changed by another thread; request
+         * the pause state on all vCPUs again.
+         */
+        request_pause_all_vcpus();
     }
 
     qemu_mutex_unlock_iothread();
     replay_mutex_lock();
     qemu_mutex_lock_iothread();
+
+    /* While the BQL was unlocked, a vCPU's state may have been
+     * changed by another thread, so we must retry.
+ */ + if (!all_vcpus_paused()) { + goto retry; + } } void cpu_resume(CPUState *cpu) @@ -599,6 +632,9 @@ void resume_all_vcpus(void) qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true); CPU_FOREACH(cpu) { + if (!object_property_get_bool(OBJECT(cpu), "realized", &error_abort)) { + continue; + } cpu_resume(cpu); } } diff --git a/system/dma-helpers.c b/system/dma-helpers.c index 36211acc7eaeb2a9ac16375daf5f3f59bd64c329..611ea04ffb7459dd485f30f1fff8a5f2f5d27c5c 100644 --- a/system/dma-helpers.c +++ b/system/dma-helpers.c @@ -167,7 +167,7 @@ static void dma_blk_cb(void *opaque, int ret) if (dbs->iov.size == 0) { trace_dma_map_wait(dbs); dbs->bh = aio_bh_new(ctx, reschedule_dma, dbs); - cpu_register_map_client(dbs->bh); + address_space_register_map_client(dbs->sg->as, dbs->bh); goto out; } @@ -197,7 +197,7 @@ static void dma_aio_cancel(BlockAIOCB *acb) } if (dbs->bh) { - cpu_unregister_map_client(dbs->bh); + address_space_unregister_map_client(dbs->sg->as, dbs->bh); qemu_bh_delete(dbs->bh); dbs->bh = NULL; } diff --git a/system/main.c b/system/main.c index 9b91d21ea8c9186f2c80ac604542731c12cf05f1..28bb283ebfd609c1c7590cf360cbe05112b78f1c 100644 --- a/system/main.c +++ b/system/main.c @@ -23,6 +23,7 @@ */ #include "qemu/osdep.h" +#include "qemu/log.h" #include "qemu-main.h" #include "sysemu/sysemu.h" @@ -34,6 +35,7 @@ int qemu_default_main(void) { int status; + qemu_log("qemu enter main_loop\n"); status = qemu_main_loop(); qemu_cleanup(status); diff --git a/system/memory.c b/system/memory.c index 798b6c0a171b71db8acb9aea503ea21ac26a07fe..607ce9cf60c4117cd08e8a1ac6e0a33cfdf63127 100644 --- a/system/memory.c +++ b/system/memory.c @@ -48,6 +48,9 @@ static QTAILQ_HEAD(, MemoryListener) memory_listeners static QTAILQ_HEAD(, AddressSpace) address_spaces = QTAILQ_HEAD_INITIALIZER(address_spaces); +static QTAILQ_HEAD(, SharedRegionListener) shared_region_listeners + = QTAILQ_HEAD_INITIALIZER(shared_region_listeners); + static GHashTable *flat_views; typedef struct AddrRange AddrRange; @@ -853,6 +856,13 @@ static void address_space_update_ioeventfds(AddressSpace *as) return; } + view = address_space_get_flatview(as); + if (view->flags & FLATVIEW_FLAG_LAST_PROCESSED) { + flatview_unref(view); + return; + } + view->flags |= FLATVIEW_FLAG_LAST_PROCESSED; + /* * It is likely that the number of ioeventfds hasn't changed much, so use * the previous size as the starting value, with some headroom to avoid @@ -861,7 +871,6 @@ static void address_space_update_ioeventfds(AddressSpace *as) ioeventfd_max = QEMU_ALIGN_UP(as->ioeventfd_nb, 4); ioeventfds = g_new(MemoryRegionIoeventfd, ioeventfd_max); - view = address_space_get_flatview(as); FOR_EACH_FLAT_RANGE(fr, view) { for (i = 0; i < fr->mr->ioeventfd_nb; ++i) { tmp = addrrange_shift(fr->mr->ioeventfds[i].addr, @@ -1108,40 +1117,64 @@ static void address_space_update_topology(AddressSpace *as) address_space_set_flatview(as); } +static void address_space_update_view(AddressSpace *as) +{ + FlatView *view; + + view = address_space_get_flatview(as); + if (view->flags & FLATVIEW_FLAG_LAST_PROCESSED) { + view->flags &= ~FLATVIEW_FLAG_LAST_PROCESSED; + } + flatview_unref(view); +} + void memory_region_transaction_begin(void) { qemu_flush_coalesced_mmio_buffer(); ++memory_region_transaction_depth; } -void memory_region_transaction_commit(void) +void memory_region_commit(void) { AddressSpace *as; + if (memory_region_update_pending) { + flatviews_reset(); + + MEMORY_LISTENER_CALL_GLOBAL(begin, Forward); + + QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) { + 
address_space_set_flatview(as); + address_space_update_ioeventfds(as); + } + memory_region_update_pending = false; + ioeventfd_update_pending = false; + QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) { + address_space_update_view(as); + } + MEMORY_LISTENER_CALL_GLOBAL(commit, Forward); + } else if (ioeventfd_update_pending) { + MEMORY_LISTENER_CALL_GLOBAL(eventfd_begin, Forward); + QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) { + address_space_update_ioeventfds(as); + } + ioeventfd_update_pending = false; + QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) { + address_space_update_view(as); + } + MEMORY_LISTENER_CALL_GLOBAL(eventfd_end, Forward); + } +} + +void memory_region_transaction_commit(void) +{ assert(memory_region_transaction_depth); assert(qemu_mutex_iothread_locked()); --memory_region_transaction_depth; if (!memory_region_transaction_depth) { - if (memory_region_update_pending) { - flatviews_reset(); - - MEMORY_LISTENER_CALL_GLOBAL(begin, Forward); - - QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) { - address_space_set_flatview(as); - address_space_update_ioeventfds(as); - } - memory_region_update_pending = false; - ioeventfd_update_pending = false; - MEMORY_LISTENER_CALL_GLOBAL(commit, Forward); - } else if (ioeventfd_update_pending) { - QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) { - address_space_update_ioeventfds(as); - } - ioeventfd_update_pending = false; - } - } + memory_region_commit(); + } } static void memory_region_destructor_none(MemoryRegion *mr) @@ -2226,6 +2259,21 @@ bool memory_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, return true; } +void shared_region_register_listener(SharedRegionListener *shl) +{ + QTAILQ_INSERT_TAIL(&shared_region_listeners, shl, next); +} + +void shared_region_unregister_listener(SharedRegionListener *shl) +{ + QTAILQ_REMOVE(&shared_region_listeners, shl, next); +} + +void *shared_region_listeners_get(void) +{ + return &shared_region_listeners; +} + void memory_region_set_log(MemoryRegion *mr, bool log, unsigned client) { uint8_t mask = 1 << client; @@ -2253,6 +2301,12 @@ void memory_region_set_dirty(MemoryRegion *mr, hwaddr addr, memory_region_get_dirty_log_mask(mr)); } +bool is_first_section(MemoryRegionSection *section) +{ + return section->fv->ranges->addr.start == section->offset_within_address_space && + section->fv->ranges->addr.size == section->size; +} + /* * If memory region `mr' is NULL, do global sync. Otherwise, sync * dirty bitmap for the specified memory region. @@ -3117,19 +3171,31 @@ void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name) as->ioeventfds = NULL; QTAILQ_INIT(&as->listeners); QTAILQ_INSERT_TAIL(&address_spaces, as, address_spaces_link); + as->max_bounce_buffer_size = DEFAULT_MAX_BOUNCE_BUFFER_SIZE; + as->bounce_buffer_size = 0; + qemu_mutex_init(&as->map_client_list_lock); + QLIST_INIT(&as->map_client_list); as->name = g_strdup(name ? 
name : "anonymous"); address_space_update_topology(as); address_space_update_ioeventfds(as); + address_space_update_view(as); } static void do_address_space_destroy(AddressSpace *as) { + assert(qatomic_read(&as->bounce_buffer_size) == 0); + assert(QLIST_EMPTY(&as->map_client_list)); + qemu_mutex_destroy(&as->map_client_list_lock); + assert(QTAILQ_EMPTY(&as->listeners)); flatview_unref(as->current_map); g_free(as->name); g_free(as->ioeventfds); memory_region_unref(as->root); + if (as->free_in_rcu) { + g_free(as); + } } void address_space_destroy(AddressSpace *as) diff --git a/system/physmem.c b/system/physmem.c index a63853a7bc9d5c4fc2ee06f5fe56222c36490fe0..8f4be2d1310c6cb007ae7b4db6ff135d475cf6de 100644 --- a/system/physmem.c +++ b/system/physmem.c @@ -89,9 +89,17 @@ RAMList ram_list = { .blocks = QLIST_HEAD_INITIALIZER(ram_list.blocks) }; static MemoryRegion *system_memory; static MemoryRegion *system_io; +static MemoryRegion *virtcca_shared_memory; + +/* + * Serves as the sub-MR of the root MR (virtcca_shared_memory) + * and is associated with the RAMBlock. + */ +MemoryRegion *virtcca_shared_hugepage; AddressSpace address_space_io; AddressSpace address_space_memory; +AddressSpace address_space_virtcca_shared_memory; static MemoryRegion io_mem_unassigned; @@ -761,6 +769,7 @@ void cpu_address_space_init(CPUState *cpu, int asidx, if (!cpu->cpu_ases) { cpu->cpu_ases = g_new0(CPUAddressSpace, cpu->num_ases); + cpu->cpu_ases_count = cpu->num_ases; } newas = &cpu->cpu_ases[asidx]; @@ -774,6 +783,34 @@ void cpu_address_space_init(CPUState *cpu, int asidx, } } +void cpu_address_space_destroy(CPUState *cpu, int asidx) +{ + CPUAddressSpace *cpuas; + + assert(cpu->cpu_ases); + assert(asidx >= 0 && asidx < cpu->num_ases); + /* KVM cannot currently support multiple address spaces. 
*/
+    assert(asidx == 0 || !kvm_enabled());
+
+    cpuas = &cpu->cpu_ases[asidx];
+    if (tcg_enabled()) {
+        memory_listener_unregister(&cpuas->tcg_as_listener);
+    }
+
+    address_space_destroy(cpuas->as);
+    g_free_rcu(cpuas->as, rcu);
+
+    if (asidx == 0) {
+        /* reset the convenience alias for address space 0 */
+        cpu->as = NULL;
+    }
+
+    if (--cpu->cpu_ases_count == 0) {
+        g_free(cpu->cpu_ases);
+        cpu->cpu_ases = NULL;
+    }
+}
+
 AddressSpace *cpu_get_address_space(CPUState *cpu, int asidx)
 {
     /* Return the AddressSpace corresponding to the specified index */
@@ -1329,7 +1366,14 @@ static int file_ram_open(const char *path,
             /* @path names a file that doesn't exist, create it */
             fd = open(path, O_RDWR | O_CREAT | O_EXCL, 0644);
             if (fd >= 0) {
-                *created = true;
+                info_report("opened %s successfully", path);
+                /* If the file lives on hugetlbfs, unlink it right away */
+                /* so no file is left behind in case QEMU is killed. */
+                if (qemu_fd_getfiletype(fd) == HUGETLBFS_MAGIC) {
+                    unlink(path);
+                } else {
+                    *created = true;
+                }
                 break;
             }
         } else if (errno == EISDIR) {
@@ -1499,18 +1543,6 @@ static ram_addr_t find_ram_offset(ram_addr_t size)
     return offset;
 }
 
-static unsigned long last_ram_page(void)
-{
-    RAMBlock *block;
-    ram_addr_t last = 0;
-
-    RCU_READ_LOCK_GUARD();
-    RAMBLOCK_FOREACH(block) {
-        last = MAX(last, block->offset + block->max_length);
-    }
-    return last >> TARGET_PAGE_BITS;
-}
-
 static void qemu_ram_setup_dump(void *addr, ram_addr_t size)
 {
     int ret;
@@ -1763,13 +1795,11 @@ void qemu_ram_msync(RAMBlock *block, ram_addr_t start, ram_addr_t length)
 }
 
 /* Called with ram_list.mutex held */
-static void dirty_memory_extend(ram_addr_t old_ram_size,
-                                ram_addr_t new_ram_size)
+static void dirty_memory_extend(ram_addr_t new_ram_size)
 {
-    ram_addr_t old_num_blocks = DIV_ROUND_UP(old_ram_size,
-                                             DIRTY_MEMORY_BLOCK_SIZE);
-    ram_addr_t new_num_blocks = DIV_ROUND_UP(new_ram_size,
-                                             DIRTY_MEMORY_BLOCK_SIZE);
+    unsigned int old_num_blocks = ram_list.num_dirty_blocks;
+    unsigned int new_num_blocks = DIV_ROUND_UP(new_ram_size,
+                                               DIRTY_MEMORY_BLOCK_SIZE);
     int i;
 
     /* Only need to extend if block count increased */
@@ -1801,6 +1831,8 @@ static void dirty_memory_extend(ram_addr_t old_ram_size,
             g_free_rcu(old_blocks, rcu);
         }
     }
+
+    ram_list.num_dirty_blocks = new_num_blocks;
 }
 
 static void ram_block_add(RAMBlock *new_block, Error **errp)
@@ -1809,11 +1841,9 @@ static void ram_block_add(RAMBlock *new_block, Error **errp)
     const bool shared = qemu_ram_is_shared(new_block);
     RAMBlock *block;
     RAMBlock *last_block = NULL;
-    ram_addr_t old_ram_size, new_ram_size;
+    ram_addr_t ram_size;
     Error *err = NULL;
 
-    old_ram_size = last_ram_page();
-
     qemu_mutex_lock_ramlist();
 
     new_block->offset = find_ram_offset(new_block->max_length);
@@ -1841,11 +1871,8 @@ static void ram_block_add(RAMBlock *new_block, Error **errp)
         }
     }
 
-    new_ram_size = MAX(old_ram_size,
-                       (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS);
-    if (new_ram_size > old_ram_size) {
-        dirty_memory_extend(old_ram_size, new_ram_size);
-    }
+    ram_size = (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS;
+    dirty_memory_extend(ram_size);
     /* Keep the list sorted from biggest to smallest block.  Unlike QTAILQ,
      * QLIST (which has an RCU-friendly variant) does not have insertion at
      * tail, so save the last element in last_block.
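The file_ram_open() change above unlinks an explicitly named hugetlbfs backing file right after creating it, so a killed QEMU cannot leave a stale file behind. A typical affected invocation looks like this (path and sizes are illustrative):

    qemu-system-x86_64 -m 2G \
        -object memory-backend-file,id=ram0,size=2G,mem-path=/dev/hugepages/vm0.ram,share=on \
        -machine memory-backend=ram0

Note that because the file is unlinked as soon as it is created, another process can no longer attach to it by path.
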
@@ -2231,6 +2258,10 @@ RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset, ram_addr_t ram_addr; RCU_READ_LOCK_GUARD(); ram_addr = xen_ram_addr_from_mapcache(ptr); + if (ram_addr == RAM_ADDR_INVALID) { + return NULL; + } + block = qemu_get_ram_block(ram_addr); if (block) { *offset = ram_addr - block->offset; @@ -2563,6 +2594,15 @@ static void memory_map_init(void) address_space_init(&address_space_io, system_io, "I/O"); } +void virtcca_shared_memory_address_space_init(void) +{ + virtcca_shared_memory = g_malloc(sizeof(*virtcca_shared_memory)); + memory_region_init(virtcca_shared_memory, NULL, + "virtcca_shared_memory", UINT64_MAX); + address_space_init(&address_space_virtcca_shared_memory, + virtcca_shared_memory, "virtcca_shared_memory"); +} + MemoryRegion *get_system_memory(void) { return system_memory; @@ -2595,6 +2635,17 @@ static void invalidate_and_set_dirty(MemoryRegion *mr, hwaddr addr, cpu_physical_memory_set_dirty_range(addr, length, dirty_log_mask); } +int64_t memory_section_set_dirty_bytemap(MemoryRegionSection *section, unsigned long *bytemap) +{ + ram_addr_t start = section->offset_within_region + + memory_region_get_ram_addr(section->mr); + ram_addr_t pages = int128_get64(section->size) >> TARGET_PAGE_BITS; + + hwaddr idx = BYTE_WORD( + section->offset_within_address_space >> TARGET_PAGE_BITS); + return cpu_physical_memory_set_dirty_bytemap(bytemap + idx, start, pages); +} + void memory_region_flush_rom_device(MemoryRegion *mr, hwaddr addr, hwaddr size) { /* @@ -2974,55 +3025,51 @@ void cpu_flush_icache_range(hwaddr start, hwaddr len) NULL, len, FLUSH_CACHE); } +/* + * A magic value stored in the first 8 bytes of the bounce buffer struct. Used + * to detect illegal pointers passed to address_space_unmap. + */ +#define BOUNCE_BUFFER_MAGIC 0xb4017ceb4ffe12ed + typedef struct { + uint64_t magic; MemoryRegion *mr; - void *buffer; hwaddr addr; - hwaddr len; - bool in_use; + size_t len; + uint8_t buffer[]; } BounceBuffer; -static BounceBuffer bounce; - -typedef struct MapClient { - QEMUBH *bh; - QLIST_ENTRY(MapClient) link; -} MapClient; - -QemuMutex map_client_list_lock; -static QLIST_HEAD(, MapClient) map_client_list - = QLIST_HEAD_INITIALIZER(map_client_list); - -static void cpu_unregister_map_client_do(MapClient *client) +static void +address_space_unregister_map_client_do(AddressSpaceMapClient *client) { QLIST_REMOVE(client, link); g_free(client); } -static void cpu_notify_map_clients_locked(void) +static void address_space_notify_map_clients_locked(AddressSpace *as) { - MapClient *client; + AddressSpaceMapClient *client; - while (!QLIST_EMPTY(&map_client_list)) { - client = QLIST_FIRST(&map_client_list); + while (!QLIST_EMPTY(&as->map_client_list)) { + client = QLIST_FIRST(&as->map_client_list); qemu_bh_schedule(client->bh); - cpu_unregister_map_client_do(client); + address_space_unregister_map_client_do(client); } } -void cpu_register_map_client(QEMUBH *bh) +void address_space_register_map_client(AddressSpace *as, QEMUBH *bh) { - MapClient *client = g_malloc(sizeof(*client)); + AddressSpaceMapClient *client = g_malloc(sizeof(*client)); - qemu_mutex_lock(&map_client_list_lock); + qemu_mutex_lock(&as->map_client_list_lock); client->bh = bh; - QLIST_INSERT_HEAD(&map_client_list, client, link); - /* Write map_client_list before reading in_use. */ + QLIST_INSERT_HEAD(&as->map_client_list, client, link); + /* Write map_client_list before reading bounce_buffer_size. 
*/ smp_mb(); - if (!qatomic_read(&bounce.in_use)) { - cpu_notify_map_clients_locked(); + if (qatomic_read(&as->bounce_buffer_size) < as->max_bounce_buffer_size) { + address_space_notify_map_clients_locked(as); } - qemu_mutex_unlock(&map_client_list_lock); + qemu_mutex_unlock(&as->map_client_list_lock); } void cpu_exec_init_all(void) @@ -3038,28 +3085,27 @@ void cpu_exec_init_all(void) finalize_target_page_bits(); io_mem_init(); memory_map_init(); - qemu_mutex_init(&map_client_list_lock); } -void cpu_unregister_map_client(QEMUBH *bh) +void address_space_unregister_map_client(AddressSpace *as, QEMUBH *bh) { - MapClient *client; + AddressSpaceMapClient *client; - qemu_mutex_lock(&map_client_list_lock); - QLIST_FOREACH(client, &map_client_list, link) { + qemu_mutex_lock(&as->map_client_list_lock); + QLIST_FOREACH(client, &as->map_client_list, link) { if (client->bh == bh) { - cpu_unregister_map_client_do(client); + address_space_unregister_map_client_do(client); break; } } - qemu_mutex_unlock(&map_client_list_lock); + qemu_mutex_unlock(&as->map_client_list_lock); } -static void cpu_notify_map_clients(void) +static void address_space_notify_map_clients(AddressSpace *as) { - qemu_mutex_lock(&map_client_list_lock); - cpu_notify_map_clients_locked(); - qemu_mutex_unlock(&map_client_list_lock); + qemu_mutex_lock(&as->map_client_list_lock); + address_space_notify_map_clients_locked(as); + qemu_mutex_unlock(&as->map_client_list_lock); } static bool flatview_access_valid(FlatView *fv, hwaddr addr, hwaddr len, @@ -3126,8 +3172,8 @@ flatview_extend_translation(FlatView *fv, hwaddr addr, * May map a subset of the requested range, given by and returned in *plen. * May return NULL if resources needed to perform the mapping are exhausted. * Use only for reads OR writes - not for read-modify-write operations. - * Use cpu_register_map_client() to know when retrying the map operation is - * likely to succeed. + * Use address_space_register_map_client() to know when retrying the map + * operation is likely to succeed. 
*/ void *address_space_map(AddressSpace *as, hwaddr addr, @@ -3150,28 +3196,40 @@ void *address_space_map(AddressSpace *as, mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs); if (!memory_access_is_direct(mr, is_write)) { - if (qatomic_xchg(&bounce.in_use, true)) { + size_t used = qatomic_read(&as->bounce_buffer_size); + for (;;) { + hwaddr alloc = MIN(as->max_bounce_buffer_size - used, l); + size_t new_size = used + alloc; + size_t actual = + qatomic_cmpxchg(&as->bounce_buffer_size, used, new_size); + if (actual == used) { + l = alloc; + break; + } + used = actual; + } + + if (l == 0) { *plen = 0; return NULL; } - /* Avoid unbounded allocations */ - l = MIN(l, TARGET_PAGE_SIZE); - bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l); - bounce.addr = addr; - bounce.len = l; + BounceBuffer *bounce = g_malloc0(l + sizeof(BounceBuffer)); + bounce->magic = BOUNCE_BUFFER_MAGIC; memory_region_ref(mr); - bounce.mr = mr; + bounce->mr = mr; + bounce->addr = addr; + bounce->len = l; + if (!is_write) { flatview_read(fv, addr, MEMTXATTRS_UNSPECIFIED, - bounce.buffer, l); + bounce->buffer, l); } *plen = l; - return bounce.buffer; + return bounce->buffer; } - memory_region_ref(mr); *plen = flatview_extend_translation(fv, addr, len, mr, xlat, l, is_write, attrs); @@ -3186,12 +3244,11 @@ void *address_space_map(AddressSpace *as, void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len, bool is_write, hwaddr access_len) { - if (buffer != bounce.buffer) { - MemoryRegion *mr; - ram_addr_t addr1; + MemoryRegion *mr; + ram_addr_t addr1; - mr = memory_region_from_host(buffer, &addr1); - assert(mr != NULL); + mr = memory_region_from_host(buffer, &addr1); + if (mr != NULL) { if (is_write) { invalidate_and_set_dirty(mr, addr1, access_len); } @@ -3201,16 +3258,23 @@ void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len, memory_region_unref(mr); return; } + + + BounceBuffer *bounce = container_of(buffer, BounceBuffer, buffer); + assert(bounce->magic == BOUNCE_BUFFER_MAGIC); + if (is_write) { - address_space_write(as, bounce.addr, MEMTXATTRS_UNSPECIFIED, - bounce.buffer, access_len); - } - qemu_vfree(bounce.buffer); - bounce.buffer = NULL; - memory_region_unref(bounce.mr); - /* Clear in_use before reading map_client_list. */ - qatomic_set_mb(&bounce.in_use, false); - cpu_notify_map_clients(); + address_space_write(as, bounce->addr, MEMTXATTRS_UNSPECIFIED, + bounce->buffer, access_len); + } + + qatomic_sub(&as->bounce_buffer_size, bounce->len); + bounce->magic = ~BOUNCE_BUFFER_MAGIC; + memory_region_unref(bounce->mr); + g_free(bounce); + /* Write bounce_buffer_size before reading map_client_list. 
*/ + smp_mb(); + address_space_notify_map_clients(as); } void *cpu_physical_memory_map(hwaddr addr, diff --git a/system/qdev-monitor.c b/system/qdev-monitor.c index a13db763e5dd0fbdc3690ef1bc3bb1bbfffade33..5b35704b5ea54efb2ef25f04249c7b49cec4852f 100644 --- a/system/qdev-monitor.c +++ b/system/qdev-monitor.c @@ -36,6 +36,7 @@ #include "qemu/option.h" #include "qemu/qemu-print.h" #include "qemu/option_int.h" +#include "qemu/log.h" #include "sysemu/block-backend.h" #include "migration/misc.h" #include "migration/migration.h" @@ -643,6 +644,7 @@ DeviceState *qdev_device_add_from_qdict(const QDict *opts, if (path != NULL) { bus = qbus_find(path, errp); if (!bus) { + qemu_log("can not find bus for %s\n", driver); return NULL; } if (!object_dynamic_cast(OBJECT(bus), dc->bus_type)) { @@ -713,9 +715,11 @@ DeviceState *qdev_device_add_from_qdict(const QDict *opts, object_set_properties_from_keyval(&dev->parent_obj, dev->opts, from_json, errp); if (*errp) { + qemu_log("the bus %s -driver %s set property failed\n", + bus ? bus->name : "None", driver); goto err_del_dev; } - + qemu_log("add qdev %s:%s success\n", driver, dev->id ? dev->id : "none"); if (!qdev_realize(dev, bus, errp)) { goto err_del_dev; } @@ -737,6 +741,8 @@ DeviceState *qdev_device_add(QemuOpts *opts, Error **errp) ret = qdev_device_add_from_qdict(qdict, false, errp); if (ret) { + qemu_log("add qdev %s:%s success\n", qemu_opt_get(opts, "driver"), + qemu_opts_id(opts) ? qemu_opts_id(opts) : "none"); qemu_opts_del(opts); } qobject_unref(qdict); diff --git a/system/rtc.c b/system/rtc.c index 4904581abeb8accb97fd5d9b20c337c8dbec0b48..e16b5fffc59c8036f08d1c950f5365e703f270de 100644 --- a/system/rtc.c +++ b/system/rtc.c @@ -44,6 +44,7 @@ static time_t rtc_ref_start_datetime; static int rtc_realtime_clock_offset; /* used only with QEMU_CLOCK_REALTIME */ static int rtc_host_datetime_offset = -1; /* valid & used only with RTC_BASE_DATETIME */ +static time_t rtc_date_diff = 0; QEMUClockType rtc_clock; /***********************************************************/ /* RTC reference time/date access */ @@ -108,6 +109,16 @@ time_t qemu_timedate_diff(struct tm *tm) return seconds - qemu_ref_timedate(QEMU_CLOCK_HOST); } +time_t get_rtc_date_diff(void) +{ + return rtc_date_diff; +} + +void set_rtc_date_diff(time_t diff) +{ + rtc_date_diff = diff; +} + static void configure_rtc_base_datetime(const char *startdate) { time_t rtc_start_datetime; diff --git a/system/runstate.c b/system/runstate.c index ea9d6c2a32a45541a87cecaba569583f60624ea2..7e41626bb15b54b4a86fe4d50ac5388aa168544c 100644 --- a/system/runstate.c +++ b/system/runstate.c @@ -116,6 +116,7 @@ static const RunStateTransition runstate_transitions_def[] = { { RUN_STATE_PRELAUNCH, RUN_STATE_RUNNING }, { RUN_STATE_PRELAUNCH, RUN_STATE_FINISH_MIGRATE }, { RUN_STATE_PRELAUNCH, RUN_STATE_INMIGRATE }, + { RUN_STATE_PRELAUNCH, RUN_STATE_POSTMIGRATE }, { RUN_STATE_FINISH_MIGRATE, RUN_STATE_RUNNING }, { RUN_STATE_FINISH_MIGRATE, RUN_STATE_PAUSED }, @@ -486,6 +487,8 @@ void qemu_system_reset(ShutdownCause reason) mc = current_machine ? 
MACHINE_GET_CLASS(current_machine) : NULL;
 
+    cpus_control_pre_system_reset();
+
     cpu_synchronize_all_states();
 
     if (mc && mc->reset) {
@@ -502,6 +505,10 @@ void qemu_system_reset(ShutdownCause reason)
         qapi_event_send_reset(shutdown_caused_by_guest(reason), reason);
     }
     cpu_synchronize_all_post_reset();
+
+    cpus_control_post_system_reset();
+
+    monitor_qapi_event_discard_io_error();
 }
 
 /*
@@ -767,9 +774,11 @@ static bool main_loop_should_exit(int *status)
     }
     if (qemu_powerdown_requested()) {
         qemu_system_powerdown();
+        qemu_log("domain is powered down by outside operation\n");
     }
     if (qemu_vmstop_requested(&r)) {
         vm_stop(r);
+        qemu_log("domain is stopped by outside operation\n");
     }
     return false;
 }
diff --git a/system/vl.c b/system/vl.c
index 2bcd9efb9a64396520576ae5ceaf0523c991a4ca..7c10cd13372302f5069279b7669e30e9c857eee7 100644
--- a/system/vl.c
+++ b/system/vl.c
@@ -26,6 +26,7 @@
 #include "qemu/help-texts.h"
 #include "qemu/datadir.h"
 #include "qemu/units.h"
+#include "qemu/log.h"
 #include "exec/cpu-common.h"
 #include "exec/page-vary.h"
 #include "hw/qdev-properties.h"
@@ -172,6 +173,8 @@ static QemuPluginList plugin_list = QTAILQ_HEAD_INITIALIZER(plugin_list);
 static BlockdevOptionsQueue bdo_queue = QSIMPLEQ_HEAD_INITIALIZER(bdo_queue);
 static bool nographic = false;
 static int mem_prealloc; /* force preallocation of physical target memory */
+static ram_addr_t ram2_base;
+static ram_addr_t ram2_size;
 static const char *vga_model = NULL;
 static DisplayOptions dpy;
 static int num_serial_hds;
@@ -503,6 +506,23 @@ static QemuOptsList qemu_action_opts = {
     },
 };
 
+static QemuOptsList qemu_mem2_opts = {
+    .name = "mem2",
+    .merge_lists = true,
+    .head = QTAILQ_HEAD_INITIALIZER(qemu_mem2_opts.head),
+    .desc = {
+        {
+            .name = "base",
+            .type = QEMU_OPT_SIZE,
+        },
+        {
+            .name = "size",
+            .type = QEMU_OPT_SIZE,
+        },
+        { /* end of list */ }
+    },
+};
+
 const char *qemu_get_vm_name(void)
 {
     return qemu_name;
@@ -993,9 +1013,16 @@ static bool vga_interface_available(VGAInterfaceType t)
     const VGAInterfaceInfo *ti = &vga_interfaces[t];
 
     assert(t < VGA_TYPE_MAX);
-    return !ti->class_names[0] ||
-           module_object_class_by_name(ti->class_names[0]) ||
-           module_object_class_by_name(ti->class_names[1]);
+
+    if (!ti->class_names[0] || module_object_class_by_name(ti->class_names[0])) {
+        return true;
+    }
+
+    if (ti->class_names[1] && module_object_class_by_name(ti->class_names[1])) {
+        return true;
+    }
+
+    return false;
 }
 
 static const char *
@@ -1924,6 +1951,9 @@ static void qemu_apply_machine_options(QDict *qdict)
 {
     object_set_properties_from_keyval(OBJECT(current_machine), qdict,
                                       false, &error_fatal);
+    current_machine->ram2_size = ram2_size;
+    current_machine->ram2_base = ram2_base;
+
     if (semihosting_enabled(false) && !semihosting_get_argc()) {
         /* fall back to the -kernel/-append */
         semihosting_arg_fallback(current_machine->kernel_filename, current_machine->kernel_cmdline);
@@ -2086,11 +2116,57 @@ static void parse_memory_options(void)
     loc_pop(&loc);
 }
 
+static void set_mem2_options(void)
+{
+    uint64_t sz, base;
+    const char *mem_str;
+    QemuOpts *opts = qemu_find_opts_singleton("mem2");
+    Location loc;
+
+    loc_push_none(&loc);
+    qemu_opts_loc_restore(opts);
+
+    mem_str = qemu_opt_get(opts, "base");
+    if (mem_str) {
+        if (!*mem_str) {
+            error_report("missing 'base' option value");
+            exit(EXIT_FAILURE);
+        }
+
+        base = qemu_opt_get_size(opts, "base", ram2_base);
+        ram2_base = base;
+    }
+
+    mem_str = qemu_opt_get(opts, "size");
+    if (mem_str) {
+        if (!*mem_str) {
+            error_report("missing 'size' option value");
+            
exit(EXIT_FAILURE); + } + + sz = qemu_opt_get_size(opts, "size", ram2_size); + ram2_size = sz; + } + + if (ram2_base && !ram2_size){ + error_report("missing 'size' option value"); + exit(EXIT_FAILURE); + } + if (!ram2_base && ram2_size){ + error_report("missing 'base' option value"); + exit(EXIT_FAILURE); + } + + loc_pop(&loc); +} + static void qemu_create_machine(QDict *qdict) { MachineClass *machine_class = select_machine(qdict, &error_fatal); object_set_machine_compat_props(machine_class->compat_props); + set_mem2_options(); + current_machine = MACHINE(object_new_with_class(OBJECT_CLASS(machine_class))); object_property_add_child(object_get_root(), "machine", OBJECT(current_machine)); @@ -2633,6 +2709,7 @@ static void qemu_create_cli_devices(void) } /* init generic devices */ + qemu_log("device init start\n"); rom_set_order_override(FW_CFG_ORDER_OVERRIDE_DEVICE); qemu_opts_foreach(qemu_find_opts("device"), device_init_func, NULL, &error_fatal); @@ -2768,6 +2845,7 @@ void qemu_init(int argc, char **argv) qemu_add_opts(&qemu_semihosting_config_opts); qemu_add_opts(&qemu_fw_cfg_opts); qemu_add_opts(&qemu_action_opts); + qemu_add_opts(&qemu_mem2_opts); qemu_add_run_with_opts(); module_call_init(MODULE_INIT_OPTS); @@ -2778,6 +2856,7 @@ void qemu_init(int argc, char **argv) qemu_init_subsystems(); + qemu_log("qemu pid is %d, options parsing start\n", getpid()); /* first pass of option parsing */ optind = 1; while (optind < argc) { @@ -2997,6 +3076,7 @@ void qemu_init(int argc, char **argv) exit(0); break; case QEMU_OPTION_m: + qemu_log("memory options parse start\n"); opts = qemu_opts_parse_noisily(qemu_find_opts("memory"), optarg, true); if (opts == NULL) { exit(1); @@ -3585,6 +3665,13 @@ void qemu_init(int argc, char **argv) case QEMU_OPTION_nouserconfig: /* Nothing to be parsed here. Especially, do not error out below. */ break; + case QEMU_OPTION_mem2: + opts = qemu_opts_parse_noisily(qemu_find_opts("mem2"), + optarg, false); + if (!opts) { + exit(EXIT_FAILURE); + } + break; #if defined(CONFIG_POSIX) case QEMU_OPTION_runas: if (!os_set_runas(optarg)) { @@ -3697,6 +3784,15 @@ void qemu_init(int argc, char **argv) configure_accelerators(argv[0]); phase_advance(PHASE_ACCEL_CREATED); + /* + * Must run after kvm_init completes, as virtcca_cvm_enabled() + * depends on initialization performed in kvm_init. + */ + if (virtcca_cvm_enabled()) { + virtcca_cvm_ram_size = current_machine->ram_size; + virtcca_shared_memory_address_space_init(); + } + /* * Beware, QOM objects created before this point miss global and * compat properties. 
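The mem2 option block parsed by set_mem2_options() above is exposed on the command line through QEMU_OPTION_mem2; a hypothetical invocation follows (the base/size values are illustrative, and per the checks above both suboptions must be supplied together):

    qemu-system-aarch64 -m 4G -mem2 base=0x2000000000,size=2G ...
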
@@ -3714,6 +3810,7 @@ void qemu_init(int argc, char **argv) */ machine_class = MACHINE_GET_CLASS(current_machine); + qemu_log("configure accelerator %s start\n", machine_class->name); if (!qtest_enabled() && machine_class->deprecation_reason) { warn_report("Machine type '%s' is deprecated: %s", machine_class->name, machine_class->deprecation_reason); @@ -3732,6 +3829,7 @@ void qemu_init(int argc, char **argv) */ migration_object_init(); + qemu_log("machine init start\n"); /* parse features once if machine provides default cpu_type */ current_machine->cpu_type = machine_class->default_cpu_type; if (cpu_option) { diff --git a/target/arm/arm-powerctl.c b/target/arm/arm-powerctl.c index c078849403c7751e303a620d15f63da549f75919..fb19b04189293722795432903fe7da4dbff0cf2a 100644 --- a/target/arm/arm-powerctl.c +++ b/target/arm/arm-powerctl.c @@ -16,6 +16,7 @@ #include "qemu/log.h" #include "qemu/main-loop.h" #include "sysemu/tcg.h" +#include "hw/boards.h" #ifndef DEBUG_ARM_POWERCTL #define DEBUG_ARM_POWERCTL 0 @@ -28,18 +29,37 @@ } \ } while (0) +static CPUArchId *arm_get_archid_by_id(uint64_t id) +{ + int n; + CPUArchId *arch_id; + MachineState *ms = MACHINE(qdev_get_machine()); + + /* + * At this point disabled CPUs don't have a CPUState, but their CPUArchId + * exists. + * + * TODO: Is arch_id == mp_affinity? This needs work. + */ + for (n = 0; n < ms->possible_cpus->len; n++) { + arch_id = &ms->possible_cpus->cpus[n]; + + if (arch_id->arch_id == id) { + return arch_id; + } + } + return NULL; +} + CPUState *arm_get_cpu_by_id(uint64_t id) { - CPUState *cpu; + CPUArchId *arch_id; DPRINTF("cpu %" PRId64 "\n", id); - CPU_FOREACH(cpu) { - ARMCPU *armcpu = ARM_CPU(cpu); - - if (armcpu->mp_affinity == id) { - return cpu; - } + arch_id = arm_get_archid_by_id(id); + if (arch_id && arch_id->cpu) { + return CPU(arch_id->cpu); } qemu_log_mask(LOG_GUEST_ERROR, @@ -97,6 +117,7 @@ int arm_set_cpu_on(uint64_t cpuid, uint64_t entry, uint64_t context_id, { CPUState *target_cpu_state; ARMCPU *target_cpu; + CPUArchId *arch_id; struct CpuOnInfo *info; assert(qemu_mutex_iothread_locked()); @@ -117,12 +138,24 @@ int arm_set_cpu_on(uint64_t cpuid, uint64_t entry, uint64_t context_id, } /* Retrieve the cpu we are powering up */ - target_cpu_state = arm_get_cpu_by_id(cpuid); - if (!target_cpu_state) { + arch_id = arm_get_archid_by_id(cpuid); + if (!arch_id) { /* The cpu was not found */ return QEMU_ARM_POWERCTL_INVALID_PARAM; } + target_cpu_state = CPU(arch_id->cpu); + if (!qemu_enabled_cpu(target_cpu_state)) { + /* + * The cpu is not plugged in or disabled. 
We should return appropriate + * value as introduced in DEN0022E PSCI 1.2 issue E + */ + qemu_log_mask(LOG_GUEST_ERROR, + "[ARM]%s: Denying attempt to online removed/disabled " + "CPU%" PRId64"\n", __func__, cpuid); + return QEMU_ARM_POWERCTL_IS_OFF; + } + target_cpu = ARM_CPU(target_cpu_state); if (target_cpu->power_state == PSCI_ON) { qemu_log_mask(LOG_GUEST_ERROR, diff --git a/target/arm/cpu-features.h b/target/arm/cpu-features.h index 954d3582685c44eb156809f013a520603d721d08..165a497f7b9151562a143152f7f0dbd822e25a26 100644 --- a/target/arm/cpu-features.h +++ b/target/arm/cpu-features.h @@ -771,7 +771,7 @@ static inline bool isar_feature_aa64_hcx(const ARMISARegisters *id) static inline bool isar_feature_aa64_tidcp1(const ARMISARegisters *id) { - return FIELD_EX64(id->id_aa64mmfr2, ID_AA64MMFR1, TIDCP1) != 0; + return FIELD_EX64(id->id_aa64mmfr1, ID_AA64MMFR1, TIDCP1) != 0; } static inline bool isar_feature_aa64_hafs(const ARMISARegisters *id) diff --git a/target/arm/cpu.c b/target/arm/cpu.c index efb22a87f9eda26e212b6a7895d35d67461c6c08..09d391bd348aaf9c7e59a58d35487a1e68b540e6 100644 --- a/target/arm/cpu.c +++ b/target/arm/cpu.c @@ -142,6 +142,16 @@ void arm_register_pre_el_change_hook(ARMCPU *cpu, ARMELChangeHookFn *hook, QLIST_INSERT_HEAD(&cpu->pre_el_change_hooks, entry, node); } +void arm_unregister_pre_el_change_hooks(ARMCPU *cpu) +{ + ARMELChangeHook *entry, *next; + + QLIST_FOREACH_SAFE(entry, &cpu->pre_el_change_hooks, node, next) { + QLIST_REMOVE(entry, node); + g_free(entry); + } +} + void arm_register_el_change_hook(ARMCPU *cpu, ARMELChangeHookFn *hook, void *opaque) { @@ -153,6 +163,16 @@ void arm_register_el_change_hook(ARMCPU *cpu, ARMELChangeHookFn *hook, QLIST_INSERT_HEAD(&cpu->el_change_hooks, entry, node); } +void arm_unregister_el_change_hooks(ARMCPU *cpu) +{ + ARMELChangeHook *entry, *next; + + QLIST_FOREACH_SAFE(entry, &cpu->el_change_hooks, node, next) { + QLIST_REMOVE(entry, node); + g_free(entry); + } +} + static void cp_reg_reset(gpointer key, gpointer value, gpointer opaque) { /* Reset a single ARMCPRegInfo register */ @@ -1615,6 +1635,10 @@ void arm_cpu_post_init(Object *obj) } } else if (cpu_isar_feature(aa32_vfp, cpu)) { cpu->has_vfp = true; + if (tcg_enabled() || qtest_enabled()) { + qdev_property_add_static(DEVICE(obj), + &arm_cpu_has_vfp_property); + } if (cpu_isar_feature(aa32_simd_r32, cpu)) { cpu->has_vfp_d32 = true; /* @@ -2390,6 +2414,94 @@ static void arm_cpu_realizefn(DeviceState *dev, Error **errp) acc->parent_realize(dev, errp); } +static void arm_cpu_unrealizefn(DeviceState *dev) +{ + ARMCPUClass *acc = ARM_CPU_GET_CLASS(dev); + ARMCPU *cpu = ARM_CPU(dev); + CPUARMState *env = &cpu->env; + CPUState *cs = CPU(dev); + bool has_secure; + +#ifndef CONFIG_USER_ONLY + has_secure = cpu->has_el3 || arm_feature(env, ARM_FEATURE_M_SECURITY); + + /* rock 'n' un-roll, whatever happened in the arm_cpu_realizefn cleanly */ + cpu_address_space_destroy(cs, ARMASIdx_NS); + + if (cpu->tag_memory != NULL) { + cpu_address_space_destroy(cs, ARMASIdx_TagNS); + if (has_secure) { + cpu_address_space_destroy(cs, ARMASIdx_TagS); + } + } + + if (has_secure) { + cpu_address_space_destroy(cs, ARMASIdx_S); + } +#endif + + destroy_cpreg_list(cpu); + arm_cpu_unregister_gdb_regs(cpu); + unregister_cp_regs_for_features(cpu); + +#ifndef CONFIG_USER_ONLY + if (tcg_enabled() && cpu_isar_feature(aa64_rme, cpu)) { + arm_unregister_el_change_hooks(cpu); + } +#endif + + if (cpu->sau_sregion && arm_feature(env, ARM_FEATURE_M_SECURITY)) { + g_free(env->sau.rbar); + 
g_free(env->sau.rlar); + } + + if (arm_feature(env, ARM_FEATURE_PMSA) && + arm_feature(env, ARM_FEATURE_V7)) { + if (cpu->pmsav7_dregion) { + if (arm_feature(env, ARM_FEATURE_V8)) { + g_free(env->pmsav8.rbar[M_REG_NS]); + g_free(env->pmsav8.rlar[M_REG_NS]); + if (arm_feature(env, ARM_FEATURE_M_SECURITY)) { + g_free(env->pmsav8.rbar[M_REG_S]); + g_free(env->pmsav8.rlar[M_REG_S]); + } + } else { + g_free(env->pmsav7.drbar); + g_free(env->pmsav7.drsr); + g_free(env->pmsav7.dracr); + } + } + if (cpu->pmsav8r_hdregion) { + g_free(env->pmsav8.hprbar); + g_free(env->pmsav8.hprlar); + } + } + + if (arm_feature(env, ARM_FEATURE_PMU)) { + if (!kvm_enabled()) { + arm_unregister_pre_el_change_hooks(cpu); + arm_unregister_el_change_hooks(cpu); + } + +#ifndef CONFIG_USER_ONLY + if (cpu->pmu_timer) { + timer_del(cpu->pmu_timer); + } +#endif + } + + cpu_remove_sync(CPU(dev)); + acc->parent_unrealize(dev); + +#ifndef CONFIG_USER_ONLY + timer_del(cpu->gt_timer[GTIMER_PHYS]); + timer_del(cpu->gt_timer[GTIMER_VIRT]); + timer_del(cpu->gt_timer[GTIMER_HYP]); + timer_del(cpu->gt_timer[GTIMER_SEC]); + timer_del(cpu->gt_timer[GTIMER_HYPVIRT]); +#endif +} + static ObjectClass *arm_cpu_class_by_name(const char *cpu_model) { ObjectClass *oc; @@ -2422,6 +2534,10 @@ static Property arm_cpu_properties[] = { DEFINE_PROP_UINT64("mp-affinity", ARMCPU, mp_affinity, ARM64_AFFINITY_INVALID), DEFINE_PROP_INT32("node-id", ARMCPU, node_id, CPU_UNSET_NUMA_NODE_ID), + DEFINE_PROP_INT32("socket-id", ARMCPU, socket_id, 0), + DEFINE_PROP_INT32("cluster-id", ARMCPU, cluster_id, 0), + DEFINE_PROP_INT32("core-id", ARMCPU, core_id, 0), + DEFINE_PROP_INT32("thread-id", ARMCPU, thread_id, 0), DEFINE_PROP_INT32("core-count", ARMCPU, core_count, -1), DEFINE_PROP_END_OF_LIST() }; @@ -2473,6 +2589,12 @@ static const struct TCGCPUOps arm_tcg_ops = { }; #endif /* CONFIG_TCG */ +static int64_t arm_cpu_get_arch_id(CPUState *cs) +{ + ARMCPU *cpu = ARM_CPU(cs); + return cpu->mp_affinity; +} + static void arm_cpu_class_init(ObjectClass *oc, void *data) { ARMCPUClass *acc = ARM_CPU_CLASS(oc); @@ -2482,6 +2604,8 @@ static void arm_cpu_class_init(ObjectClass *oc, void *data) device_class_set_parent_realize(dc, arm_cpu_realizefn, &acc->parent_realize); + device_class_set_parent_unrealize(dc, arm_cpu_unrealizefn, + &acc->parent_unrealize); device_class_set_props(dc, arm_cpu_properties); @@ -2491,6 +2615,7 @@ static void arm_cpu_class_init(ObjectClass *oc, void *data) cc->class_by_name = arm_cpu_class_by_name; cc->has_work = arm_cpu_has_work; cc->dump_state = arm_cpu_dump_state; + cc->get_arch_id = arm_cpu_get_arch_id; cc->set_pc = arm_cpu_set_pc; cc->get_pc = arm_cpu_get_pc; cc->gdb_read_register = arm_cpu_gdb_read_register; diff --git a/target/arm/cpu.h b/target/arm/cpu.h index a0282e0d2817e513db8eb5a159155f52258e6d52..a5ba7f2a2657dbe871b755f4afc2355eb57f5914 100644 --- a/target/arm/cpu.h +++ b/target/arm/cpu.h @@ -971,6 +971,9 @@ struct ArchCPU { /* KVM steal time */ OnOffAuto kvm_steal_time; + + /* KVM SVE has been finalized for this CPU */ + bool kvm_sve_finalized; #endif /* CONFIG_KVM */ /* Uniprocessor system with MP extensions */ @@ -1096,6 +1099,10 @@ struct ArchCPU { QLIST_HEAD(, ARMELChangeHook) el_change_hooks; int32_t node_id; /* NUMA node this CPU belongs to */ + int32_t socket_id; + int32_t cluster_id; + int32_t core_id; + int32_t thread_id; /* Used to synchronize KVM and QEMU in-kernel device levels */ uint8_t device_irq_level; @@ -1134,6 +1141,7 @@ struct ARMCPUClass { const ARMCPUInfo *info; DeviceRealize parent_realize; + DeviceUnrealize 
parent_unrealize; ResettablePhases parent_phases; }; @@ -3355,6 +3363,13 @@ static inline AddressSpace *arm_addressspace(CPUState *cs, MemTxAttrs attrs) */ void arm_register_pre_el_change_hook(ARMCPU *cpu, ARMELChangeHookFn *hook, void *opaque); +/** + * arm_unregister_pre_el_change_hook: + * unregister all pre EL change hook functions. Generally called during + * unrealize'ing leg + */ +void arm_unregister_pre_el_change_hooks(ARMCPU *cpu); + /** * arm_register_el_change_hook: * Register a hook function which will be called immediately after this @@ -3367,6 +3382,12 @@ void arm_register_pre_el_change_hook(ARMCPU *cpu, ARMELChangeHookFn *hook, */ void arm_register_el_change_hook(ARMCPU *cpu, ARMELChangeHookFn *hook, void *opaque); +/** + * arm_unregister_el_change_hook: + * unregister all EL change hook functions. Generally called during + * unrealize'ing leg + */ +void arm_unregister_el_change_hooks(ARMCPU *cpu); /** * arm_rebuild_hflags: diff --git a/target/arm/cpu64.c b/target/arm/cpu64.c index 1e9c6c85aece12172665e13d9d0f1122250b1dc6..4a1d2daeb6484e5611f922f2fd7652b4c9550e5d 100644 --- a/target/arm/cpu64.c +++ b/target/arm/cpu64.c @@ -110,6 +110,11 @@ void arm_cpu_sve_finalize(ARMCPU *cpu, Error **errp) */ if (!cpu_isar_feature(aa64_sve, cpu)) { /* SVE is disabled and so are all vector lengths. Good. */ + /* + * SVE is disabled and so are all vector lengths. Good. + * Disable all SVE extensions as well. + */ + cpu->isar.id_aa64zfr0 = 0; return; } @@ -705,6 +710,77 @@ static void aarch64_a53_initfn(Object *obj) define_cortex_a72_a57_a53_cp_reginfo(cpu); } +static void aarch64_a72_initfn(Object *obj) +{ + ARMCPU *cpu = ARM_CPU(obj); + + cpu->dtb_compatible = "arm,cortex-a72"; + cpu->kvm_target = QEMU_KVM_ARM_TARGET_GENERIC_V8; + set_feature(&cpu->env, ARM_FEATURE_V8); + set_feature(&cpu->env, ARM_FEATURE_NEON); + set_feature(&cpu->env, ARM_FEATURE_GENERIC_TIMER); + set_feature(&cpu->env, ARM_FEATURE_AARCH64); + set_feature(&cpu->env, ARM_FEATURE_CBAR_RO); + set_feature(&cpu->env, ARM_FEATURE_EL2); + set_feature(&cpu->env, ARM_FEATURE_EL3); + set_feature(&cpu->env, ARM_FEATURE_PMU); + cpu->midr = 0x410fd083; + cpu->revidr = 0x00000000; + cpu->reset_fpsid = 0x41034080; + cpu->isar.mvfr0 = 0x10110222; + cpu->isar.mvfr1 = 0x12111111; + cpu->isar.mvfr2 = 0x00000043; + cpu->ctr = 0x8444c004; + cpu->reset_sctlr = 0x00c50838; + cpu->isar.id_pfr0 = 0x00000131; + cpu->isar.id_pfr1 = 0x00011011; + cpu->isar.id_dfr0 = 0x03010066; + cpu->id_afr0 = 0x00000000; + cpu->isar.id_mmfr0 = 0x10201105; + cpu->isar.id_mmfr1 = 0x40000000; + cpu->isar.id_mmfr2 = 0x01260000; + cpu->isar.id_mmfr3 = 0x02102211; + cpu->isar.id_isar0 = 0x02101110; + cpu->isar.id_isar1 = 0x13112111; + cpu->isar.id_isar2 = 0x21232042; + cpu->isar.id_isar3 = 0x01112131; + cpu->isar.id_isar4 = 0x00011142; + cpu->isar.id_isar5 = 0x00011121; + cpu->isar.id_aa64pfr0 = 0x00002222; + cpu->isar.id_aa64dfr0 = 0x10305106; + cpu->isar.id_aa64isar0 = 0x00011120; + cpu->isar.id_aa64mmfr0 = 0x00001124; + cpu->isar.dbgdidr = 0x3516d000; + cpu->clidr = 0x0a200023; + cpu->ccsidr[0] = 0x701fe00a; /* 32KB L1 dcache */ + cpu->ccsidr[1] = 0x201fe012; /* 48KB L1 icache */ + cpu->ccsidr[2] = 0x707fe07a; /* 1MB L2 cache */ + cpu->dcz_blocksize = 4; /* 64 bytes */ + cpu->gic_num_lrs = 4; + cpu->gic_vpribits = 5; + cpu->gic_vprebits = 5; + define_cortex_a72_a57_a53_cp_reginfo(cpu); +} + +static void aarch64_kunpeng_920_initfn(Object *obj) +{ + ARMCPU *cpu = ARM_CPU(obj); + + /* + * Hisilicon Kunpeng-920 CPU is similar to cortex-a72, + * so first initialize cpu 
data as a cortex-a72, + * then update the ID registers that differ. + */ + aarch64_a72_initfn(obj); + + cpu->midr = 0x480fd010; + cpu->ctr = 0x84448004; + cpu->isar.id_aa64pfr0 = 0x11001111; + cpu->isar.id_aa64dfr0 = 0x110305408; + cpu->isar.id_aa64isar0 = 0x10211120; + cpu->isar.id_aa64mmfr0 = 0x101125; +} + static void aarch64_host_initfn(Object *obj) { #if defined(CONFIG_KVM) @@ -723,6 +799,34 @@ static void aarch64_host_initfn(Object *obj) #endif } +static void aarch64_max_ft2000plus_initfn(Object *obj) +{ + ARMCPU *cpu = ARM_CPU(obj); + + if (kvm_enabled()) { + kvm_arm_set_cpu_features_from_host(cpu); + } else { + aarch64_a72_initfn(obj); + cpu->midr = 0x70186622; + } + aarch64_add_sve_properties(obj); + aarch64_add_pauth_properties(obj); +} + +static void aarch64_max_tengyun_s2500_initfn(Object *obj) +{ + ARMCPU *cpu = ARM_CPU(obj); + + if (kvm_enabled()) { + kvm_arm_set_cpu_features_from_host(cpu); + } else { + aarch64_a72_initfn(obj); + cpu->midr = 0x70186632; + } + aarch64_add_sve_properties(obj); + aarch64_add_pauth_properties(obj); +} + static void aarch64_max_initfn(Object *obj) { if (kvm_enabled() || hvf_enabled()) { @@ -744,6 +848,9 @@ static const ARMCPUInfo aarch64_cpus[] = { { .name = "cortex-a57", .initfn = aarch64_a57_initfn }, { .name = "cortex-a53", .initfn = aarch64_a53_initfn }, + { .name = "Kunpeng-920", .initfn = aarch64_kunpeng_920_initfn }, + { .name = "FT-2000+", .initfn = aarch64_max_ft2000plus_initfn }, + { .name = "Tengyun-S2500", .initfn = aarch64_max_tengyun_s2500_initfn }, { .name = "max", .initfn = aarch64_max_initfn }, #if defined(CONFIG_KVM) || defined(CONFIG_HVF) { .name = "host", .initfn = aarch64_host_initfn }, @@ -778,6 +885,18 @@ static void aarch64_cpu_set_aarch64(Object *obj, bool value, Error **errp) } } +static void aarch64_cpu_initfn(Object *obj) +{ + CPUState *cs = CPU(obj); + + /* + * Every ARM64 vCPU starts as a disabled possible vCPU.
It needs to be + * enabled explicitly + */ + cs->disabled = true; + cs->thread_id = 0; +} + static void aarch64_cpu_finalizefn(Object *obj) { } @@ -790,7 +909,9 @@ static const gchar *aarch64_gdb_arch_name(CPUState *cs) static void aarch64_cpu_class_init(ObjectClass *oc, void *data) { CPUClass *cc = CPU_CLASS(oc); + DeviceClass *dc = DEVICE_CLASS(oc); + dc->user_creatable = true; cc->gdb_read_register = aarch64_cpu_gdb_read_register; cc->gdb_write_register = aarch64_cpu_gdb_write_register; cc->gdb_num_core_regs = 34; @@ -836,6 +957,7 @@ void aarch64_cpu_register(const ARMCPUInfo *info) static const TypeInfo aarch64_cpu_type_info = { .name = TYPE_AARCH64_CPU, .parent = TYPE_ARM_CPU, + .instance_init = aarch64_cpu_initfn, .instance_finalize = aarch64_cpu_finalizefn, .abstract = true, .class_init = aarch64_cpu_class_init, diff --git a/target/arm/gdbstub.c b/target/arm/gdbstub.c index 28f546a5ff9525f9512ac1cd518ebbb2fd06e86f..5ba1e28e346d79e335c6fe22d2581a885b151fef 100644 --- a/target/arm/gdbstub.c +++ b/target/arm/gdbstub.c @@ -553,3 +553,9 @@ void arm_cpu_register_gdb_regs_for_features(ARMCPU *cpu) } #endif /* CONFIG_TCG */ } + +void arm_cpu_unregister_gdb_regs(ARMCPU *cpu) +{ + CPUState *cs = CPU(cpu); + gdb_unregister_coprocessor_all(cs); +} diff --git a/target/arm/helper.c b/target/arm/helper.c index 2746d3fdac884ab96a7aa07e9ed6d62195cf7630..0370a739e3bbfcf3b04c548190764dd5ac3c4637 100644 --- a/target/arm/helper.c +++ b/target/arm/helper.c @@ -263,6 +263,19 @@ void init_cpreg_list(ARMCPU *cpu) g_list_free(keys); } +void destroy_cpreg_list(ARMCPU *cpu) +{ + assert(cpu->cpreg_indexes); + assert(cpu->cpreg_values); + assert(cpu->cpreg_vmstate_indexes); + assert(cpu->cpreg_vmstate_values); + + g_free(cpu->cpreg_indexes); + g_free(cpu->cpreg_values); + g_free(cpu->cpreg_vmstate_indexes); + g_free(cpu->cpreg_vmstate_values); +} + /* * Some registers are not accessible from AArch32 EL3 if SCR.NS == 0. */ @@ -1169,13 +1182,21 @@ static bool pmu_counter_enabled(CPUARMState *env, uint8_t counter) bool enabled, prohibited = false, filtered; bool secure = arm_is_secure(env); int el = arm_current_el(env); - uint64_t mdcr_el2 = arm_mdcr_el2_eff(env); - uint8_t hpmn = mdcr_el2 & MDCR_HPMN; + uint64_t mdcr_el2; + uint8_t hpmn; + /* + * We might be called for M-profile cores where MDCR_EL2 doesn't + * exist and arm_mdcr_el2_eff() will assert, so this early-exit check + * must be before we read that value. + */ if (!arm_feature(env, ARM_FEATURE_PMU)) { return false; } + mdcr_el2 = arm_mdcr_el2_eff(env); + hpmn = mdcr_el2 & MDCR_HPMN; + if (!arm_feature(env, ARM_FEATURE_EL2) || (counter < hpmn || counter == 31)) { e = env->cp15.c9_pmcr & PMCRE; @@ -9438,6 +9459,18 @@ void register_cp_regs_for_features(ARMCPU *cpu) #endif } +void unregister_cp_regs_for_features(ARMCPU *cpu) +{ + CPUARMState *env = &cpu->env; + if (arm_feature(env, ARM_FEATURE_M)) { + /* M profile has no coprocessor registers */ + return; + } + + /* empty it all. unregister all the coprocessor registers */ + g_hash_table_remove_all(cpu->cp_regs); +} + /* Sort alphabetically by type name, except for "any". 
*/ static gint arm_cpu_list_compare(gconstpointer a, gconstpointer b) { @@ -10823,6 +10856,24 @@ static void arm_cpu_do_interrupt_aarch32(CPUState *cs) } if (env->exception.target_el == 2) { + /* Debug exceptions are reported differently on AArch32 */ + switch (syn_get_ec(env->exception.syndrome)) { + case EC_BREAKPOINT: + case EC_BREAKPOINT_SAME_EL: + case EC_AA32_BKPT: + case EC_VECTORCATCH: + env->exception.syndrome = syn_insn_abort(arm_current_el(env) == 2, + 0, 0, 0x22); + break; + case EC_WATCHPOINT: + env->exception.syndrome = syn_set_ec(env->exception.syndrome, + EC_DATAABORT); + break; + case EC_WATCHPOINT_SAME_EL: + env->exception.syndrome = syn_set_ec(env->exception.syndrome, + EC_DATAABORT_SAME_EL); + break; + } arm_cpu_do_interrupt_aarch32_hyp(cs); return; } @@ -11321,7 +11372,7 @@ void arm_cpu_do_interrupt(CPUState *cs) env->exception.syndrome); } - if (tcg_enabled() && arm_is_psci_call(cpu, cs->exception_index)) { + if (arm_is_psci_call(cpu, cs->exception_index)) { arm_handle_psci_call(cpu); qemu_log_mask(CPU_LOG_INT, "...handled as PSCI call\n"); return; diff --git a/target/arm/hvf/hvf.c b/target/arm/hvf/hvf.c index 757e13b0f904e7390d431158342d9350457e583e..d7cc00a084b413fc6df6c7322b37a542b3780be1 100644 --- a/target/arm/hvf/hvf.c +++ b/target/arm/hvf/hvf.c @@ -392,85 +392,85 @@ struct hvf_sreg_match { }; static struct hvf_sreg_match hvf_sreg_match[] = { - { HV_SYS_REG_DBGBVR0_EL1, HVF_SYSREG(0, 0, 14, 0, 4) }, - { HV_SYS_REG_DBGBCR0_EL1, HVF_SYSREG(0, 0, 14, 0, 5) }, - { HV_SYS_REG_DBGWVR0_EL1, HVF_SYSREG(0, 0, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR0_EL1, HVF_SYSREG(0, 0, 14, 0, 7) }, - - { HV_SYS_REG_DBGBVR1_EL1, HVF_SYSREG(0, 1, 14, 0, 4) }, - { HV_SYS_REG_DBGBCR1_EL1, HVF_SYSREG(0, 1, 14, 0, 5) }, - { HV_SYS_REG_DBGWVR1_EL1, HVF_SYSREG(0, 1, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR1_EL1, HVF_SYSREG(0, 1, 14, 0, 7) }, - - { HV_SYS_REG_DBGBVR2_EL1, HVF_SYSREG(0, 2, 14, 0, 4) }, - { HV_SYS_REG_DBGBCR2_EL1, HVF_SYSREG(0, 2, 14, 0, 5) }, - { HV_SYS_REG_DBGWVR2_EL1, HVF_SYSREG(0, 2, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR2_EL1, HVF_SYSREG(0, 2, 14, 0, 7) }, - - { HV_SYS_REG_DBGBVR3_EL1, HVF_SYSREG(0, 3, 14, 0, 4) }, - { HV_SYS_REG_DBGBCR3_EL1, HVF_SYSREG(0, 3, 14, 0, 5) }, - { HV_SYS_REG_DBGWVR3_EL1, HVF_SYSREG(0, 3, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR3_EL1, HVF_SYSREG(0, 3, 14, 0, 7) }, - - { HV_SYS_REG_DBGBVR4_EL1, HVF_SYSREG(0, 4, 14, 0, 4) }, - { HV_SYS_REG_DBGBCR4_EL1, HVF_SYSREG(0, 4, 14, 0, 5) }, - { HV_SYS_REG_DBGWVR4_EL1, HVF_SYSREG(0, 4, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR4_EL1, HVF_SYSREG(0, 4, 14, 0, 7) }, - - { HV_SYS_REG_DBGBVR5_EL1, HVF_SYSREG(0, 5, 14, 0, 4) }, - { HV_SYS_REG_DBGBCR5_EL1, HVF_SYSREG(0, 5, 14, 0, 5) }, - { HV_SYS_REG_DBGWVR5_EL1, HVF_SYSREG(0, 5, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR5_EL1, HVF_SYSREG(0, 5, 14, 0, 7) }, - - { HV_SYS_REG_DBGBVR6_EL1, HVF_SYSREG(0, 6, 14, 0, 4) }, - { HV_SYS_REG_DBGBCR6_EL1, HVF_SYSREG(0, 6, 14, 0, 5) }, - { HV_SYS_REG_DBGWVR6_EL1, HVF_SYSREG(0, 6, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR6_EL1, HVF_SYSREG(0, 6, 14, 0, 7) }, - - { HV_SYS_REG_DBGBVR7_EL1, HVF_SYSREG(0, 7, 14, 0, 4) }, - { HV_SYS_REG_DBGBCR7_EL1, HVF_SYSREG(0, 7, 14, 0, 5) }, - { HV_SYS_REG_DBGWVR7_EL1, HVF_SYSREG(0, 7, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR7_EL1, HVF_SYSREG(0, 7, 14, 0, 7) }, - - { HV_SYS_REG_DBGBVR8_EL1, HVF_SYSREG(0, 8, 14, 0, 4) }, - { HV_SYS_REG_DBGBCR8_EL1, HVF_SYSREG(0, 8, 14, 0, 5) }, - { HV_SYS_REG_DBGWVR8_EL1, HVF_SYSREG(0, 8, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR8_EL1, HVF_SYSREG(0, 8, 14, 0, 7) }, - - { HV_SYS_REG_DBGBVR9_EL1, HVF_SYSREG(0, 9, 14, 0, 4) 
}, - { HV_SYS_REG_DBGBCR9_EL1, HVF_SYSREG(0, 9, 14, 0, 5) }, - { HV_SYS_REG_DBGWVR9_EL1, HVF_SYSREG(0, 9, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR9_EL1, HVF_SYSREG(0, 9, 14, 0, 7) }, - - { HV_SYS_REG_DBGBVR10_EL1, HVF_SYSREG(0, 10, 14, 0, 4) }, - { HV_SYS_REG_DBGBCR10_EL1, HVF_SYSREG(0, 10, 14, 0, 5) }, - { HV_SYS_REG_DBGWVR10_EL1, HVF_SYSREG(0, 10, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR10_EL1, HVF_SYSREG(0, 10, 14, 0, 7) }, - - { HV_SYS_REG_DBGBVR11_EL1, HVF_SYSREG(0, 11, 14, 0, 4) }, - { HV_SYS_REG_DBGBCR11_EL1, HVF_SYSREG(0, 11, 14, 0, 5) }, - { HV_SYS_REG_DBGWVR11_EL1, HVF_SYSREG(0, 11, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR11_EL1, HVF_SYSREG(0, 11, 14, 0, 7) }, - - { HV_SYS_REG_DBGBVR12_EL1, HVF_SYSREG(0, 12, 14, 0, 4) }, - { HV_SYS_REG_DBGBCR12_EL1, HVF_SYSREG(0, 12, 14, 0, 5) }, - { HV_SYS_REG_DBGWVR12_EL1, HVF_SYSREG(0, 12, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR12_EL1, HVF_SYSREG(0, 12, 14, 0, 7) }, - - { HV_SYS_REG_DBGBVR13_EL1, HVF_SYSREG(0, 13, 14, 0, 4) }, - { HV_SYS_REG_DBGBCR13_EL1, HVF_SYSREG(0, 13, 14, 0, 5) }, - { HV_SYS_REG_DBGWVR13_EL1, HVF_SYSREG(0, 13, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR13_EL1, HVF_SYSREG(0, 13, 14, 0, 7) }, - - { HV_SYS_REG_DBGBVR14_EL1, HVF_SYSREG(0, 14, 14, 0, 4) }, - { HV_SYS_REG_DBGBCR14_EL1, HVF_SYSREG(0, 14, 14, 0, 5) }, - { HV_SYS_REG_DBGWVR14_EL1, HVF_SYSREG(0, 14, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR14_EL1, HVF_SYSREG(0, 14, 14, 0, 7) }, - - { HV_SYS_REG_DBGBVR15_EL1, HVF_SYSREG(0, 15, 14, 0, 4) }, - { HV_SYS_REG_DBGBCR15_EL1, HVF_SYSREG(0, 15, 14, 0, 5) }, - { HV_SYS_REG_DBGWVR15_EL1, HVF_SYSREG(0, 15, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR15_EL1, HVF_SYSREG(0, 15, 14, 0, 7) }, + { HV_SYS_REG_DBGBVR0_EL1, HVF_SYSREG(0, 0, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR0_EL1, HVF_SYSREG(0, 0, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR0_EL1, HVF_SYSREG(0, 0, 2, 0, 6) }, + { HV_SYS_REG_DBGWCR0_EL1, HVF_SYSREG(0, 0, 2, 0, 7) }, + + { HV_SYS_REG_DBGBVR1_EL1, HVF_SYSREG(0, 1, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR1_EL1, HVF_SYSREG(0, 1, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR1_EL1, HVF_SYSREG(0, 1, 2, 0, 6) }, + { HV_SYS_REG_DBGWCR1_EL1, HVF_SYSREG(0, 1, 2, 0, 7) }, + + { HV_SYS_REG_DBGBVR2_EL1, HVF_SYSREG(0, 2, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR2_EL1, HVF_SYSREG(0, 2, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR2_EL1, HVF_SYSREG(0, 2, 2, 0, 6) }, + { HV_SYS_REG_DBGWCR2_EL1, HVF_SYSREG(0, 2, 2, 0, 7) }, + + { HV_SYS_REG_DBGBVR3_EL1, HVF_SYSREG(0, 3, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR3_EL1, HVF_SYSREG(0, 3, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR3_EL1, HVF_SYSREG(0, 3, 2, 0, 6) }, + { HV_SYS_REG_DBGWCR3_EL1, HVF_SYSREG(0, 3, 2, 0, 7) }, + + { HV_SYS_REG_DBGBVR4_EL1, HVF_SYSREG(0, 4, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR4_EL1, HVF_SYSREG(0, 4, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR4_EL1, HVF_SYSREG(0, 4, 2, 0, 6) }, + { HV_SYS_REG_DBGWCR4_EL1, HVF_SYSREG(0, 4, 2, 0, 7) }, + + { HV_SYS_REG_DBGBVR5_EL1, HVF_SYSREG(0, 5, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR5_EL1, HVF_SYSREG(0, 5, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR5_EL1, HVF_SYSREG(0, 5, 2, 0, 6) }, + { HV_SYS_REG_DBGWCR5_EL1, HVF_SYSREG(0, 5, 2, 0, 7) }, + + { HV_SYS_REG_DBGBVR6_EL1, HVF_SYSREG(0, 6, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR6_EL1, HVF_SYSREG(0, 6, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR6_EL1, HVF_SYSREG(0, 6, 2, 0, 6) }, + { HV_SYS_REG_DBGWCR6_EL1, HVF_SYSREG(0, 6, 2, 0, 7) }, + + { HV_SYS_REG_DBGBVR7_EL1, HVF_SYSREG(0, 7, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR7_EL1, HVF_SYSREG(0, 7, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR7_EL1, HVF_SYSREG(0, 7, 2, 0, 6) }, + { HV_SYS_REG_DBGWCR7_EL1, HVF_SYSREG(0, 7, 2, 0, 7) }, + + { HV_SYS_REG_DBGBVR8_EL1, HVF_SYSREG(0, 8, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR8_EL1, 
HVF_SYSREG(0, 8, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR8_EL1, HVF_SYSREG(0, 8, 2, 0, 6) }, + { HV_SYS_REG_DBGWCR8_EL1, HVF_SYSREG(0, 8, 2, 0, 7) }, + + { HV_SYS_REG_DBGBVR9_EL1, HVF_SYSREG(0, 9, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR9_EL1, HVF_SYSREG(0, 9, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR9_EL1, HVF_SYSREG(0, 9, 2, 0, 6) }, + { HV_SYS_REG_DBGWCR9_EL1, HVF_SYSREG(0, 9, 2, 0, 7) }, + + { HV_SYS_REG_DBGBVR10_EL1, HVF_SYSREG(0, 10, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR10_EL1, HVF_SYSREG(0, 10, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR10_EL1, HVF_SYSREG(0, 10, 2, 0, 6) }, + { HV_SYS_REG_DBGWCR10_EL1, HVF_SYSREG(0, 10, 2, 0, 7) }, + + { HV_SYS_REG_DBGBVR11_EL1, HVF_SYSREG(0, 11, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR11_EL1, HVF_SYSREG(0, 11, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR11_EL1, HVF_SYSREG(0, 11, 2, 0, 6) }, + { HV_SYS_REG_DBGWCR11_EL1, HVF_SYSREG(0, 11, 2, 0, 7) }, + + { HV_SYS_REG_DBGBVR12_EL1, HVF_SYSREG(0, 12, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR12_EL1, HVF_SYSREG(0, 12, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR12_EL1, HVF_SYSREG(0, 12, 2, 0, 6) }, + { HV_SYS_REG_DBGWCR12_EL1, HVF_SYSREG(0, 12, 2, 0, 7) }, + + { HV_SYS_REG_DBGBVR13_EL1, HVF_SYSREG(0, 13, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR13_EL1, HVF_SYSREG(0, 13, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR13_EL1, HVF_SYSREG(0, 13, 2, 0, 6) }, + { HV_SYS_REG_DBGWCR13_EL1, HVF_SYSREG(0, 13, 2, 0, 7) }, + + { HV_SYS_REG_DBGBVR14_EL1, HVF_SYSREG(0, 14, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR14_EL1, HVF_SYSREG(0, 14, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR14_EL1, HVF_SYSREG(0, 14, 2, 0, 6) }, + { HV_SYS_REG_DBGWCR14_EL1, HVF_SYSREG(0, 14, 2, 0, 7) }, + + { HV_SYS_REG_DBGBVR15_EL1, HVF_SYSREG(0, 15, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR15_EL1, HVF_SYSREG(0, 15, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR15_EL1, HVF_SYSREG(0, 15, 2, 0, 6) }, + { HV_SYS_REG_DBGWCR15_EL1, HVF_SYSREG(0, 15, 2, 0, 7) }, #ifdef SYNC_NO_RAW_REGS /* @@ -482,7 +482,7 @@ static struct hvf_sreg_match hvf_sreg_match[] = { { HV_SYS_REG_MPIDR_EL1, HVF_SYSREG(0, 0, 3, 0, 5) }, { HV_SYS_REG_ID_AA64PFR0_EL1, HVF_SYSREG(0, 4, 3, 0, 0) }, #endif - { HV_SYS_REG_ID_AA64PFR1_EL1, HVF_SYSREG(0, 4, 3, 0, 2) }, + { HV_SYS_REG_ID_AA64PFR1_EL1, HVF_SYSREG(0, 4, 3, 0, 1) }, { HV_SYS_REG_ID_AA64DFR0_EL1, HVF_SYSREG(0, 5, 3, 0, 0) }, { HV_SYS_REG_ID_AA64DFR1_EL1, HVF_SYSREG(0, 5, 3, 0, 1) }, { HV_SYS_REG_ID_AA64ISAR0_EL1, HVF_SYSREG(0, 6, 3, 0, 0) }, @@ -1272,6 +1272,7 @@ static int hvf_sysreg_read(CPUState *cpu, uint32_t reg, uint32_t rt) /* Call the TCG sysreg handler. This is only safe for GICv3 regs. */ if (!hvf_sysreg_read_cp(cpu, reg, &val)) { hvf_raise_exception(cpu, EXCP_UDEF, syn_uncategorized()); + return 1; } break; case SYSREG_DBGBVR0_EL1: diff --git a/target/arm/internals.h b/target/arm/internals.h index 143d57c0fe46a04d22c8b1fcfa1629b5f7322855..ed9bfb29c8731b028eb77df3860989e8b08d0655 100644 --- a/target/arm/internals.h +++ b/target/arm/internals.h @@ -187,9 +187,12 @@ void arm_cpu_register(const ARMCPUInfo *info); void aarch64_cpu_register(const ARMCPUInfo *info); void register_cp_regs_for_features(ARMCPU *cpu); +void unregister_cp_regs_for_features(ARMCPU *cpu); void init_cpreg_list(ARMCPU *cpu); +void destroy_cpreg_list(ARMCPU *cpu); void arm_cpu_register_gdb_regs_for_features(ARMCPU *cpu); +void arm_cpu_unregister_gdb_regs(ARMCPU *cpu); void arm_translate_init(void); void arm_restore_state_to_opc(CPUState *cs, @@ -311,21 +314,10 @@ vaddr arm_adjust_watchpoint_address(CPUState *cs, vaddr addr, int len); /* Callback function for when a watchpoint or breakpoint triggers. 
*/ void arm_debug_excp_handler(CPUState *cs); -#if defined(CONFIG_USER_ONLY) || !defined(CONFIG_TCG) -static inline bool arm_is_psci_call(ARMCPU *cpu, int excp_type) -{ - return false; -} -static inline void arm_handle_psci_call(ARMCPU *cpu) -{ - g_assert_not_reached(); -} -#else /* Return true if the r0/x0 value indicates that this SMC/HVC is a PSCI call. */ bool arm_is_psci_call(ARMCPU *cpu, int excp_type); /* Actually handle a PSCI call */ void arm_handle_psci_call(ARMCPU *cpu); -#endif /** * arm_clear_exclusive: clear the exclusive monitor @@ -1273,7 +1265,7 @@ FIELD(MTEDESC, TBI, 4, 2) FIELD(MTEDESC, TCMA, 6, 2) FIELD(MTEDESC, WRITE, 8, 1) FIELD(MTEDESC, ALIGN, 9, 3) -FIELD(MTEDESC, SIZEM1, 12, SIMD_DATA_BITS - 12) /* size - 1 */ +FIELD(MTEDESC, SIZEM1, 12, SIMD_DATA_BITS - SVE_MTEDESC_SHIFT - 12) /* size - 1 */ bool mte_probe(CPUARMState *env, uint32_t desc, uint64_t ptr); uint64_t mte_check(CPUARMState *env, uint32_t desc, uint64_t ptr, uintptr_t ra); diff --git a/target/arm/kvm-consts.h b/target/arm/kvm-consts.h index 7c6adc14f6e8dc8870316f64a158f1b95d654678..c034823170f46c20465c81ebde89b87a14577347 100644 --- a/target/arm/kvm-consts.h +++ b/target/arm/kvm-consts.h @@ -133,6 +133,8 @@ MISMATCH_CHECK(QEMU_PSCI_RET_DISABLED, PSCI_RET_DISABLED); #define QEMU_KVM_ARM_TARGET_CORTEX_A57 2 #define QEMU_KVM_ARM_TARGET_XGENE_POTENZA 3 #define QEMU_KVM_ARM_TARGET_CORTEX_A53 4 +/* Generic ARM v8 target */ +#define QEMU_KVM_ARM_TARGET_GENERIC_V8 5 /* There's no kernel define for this: sentinel value which * matches no KVM target value for either 64 or 32 bit @@ -144,6 +146,7 @@ MISMATCH_CHECK(QEMU_KVM_ARM_TARGET_FOUNDATION_V8, KVM_ARM_TARGET_FOUNDATION_V8); MISMATCH_CHECK(QEMU_KVM_ARM_TARGET_CORTEX_A57, KVM_ARM_TARGET_CORTEX_A57); MISMATCH_CHECK(QEMU_KVM_ARM_TARGET_XGENE_POTENZA, KVM_ARM_TARGET_XGENE_POTENZA); MISMATCH_CHECK(QEMU_KVM_ARM_TARGET_CORTEX_A53, KVM_ARM_TARGET_CORTEX_A53); +MISMATCH_CHECK(QEMU_KVM_ARM_TARGET_GENERIC_V8, KVM_ARM_TARGET_GENERIC_V8); #define CP_REG_ARM64 0x6000000000000000ULL #define CP_REG_ARM_COPROC_MASK 0x000000000FFF0000 diff --git a/target/arm/kvm-tmm.c b/target/arm/kvm-tmm.c new file mode 100644 index 0000000000000000000000000000000000000000..d18ac1089647fbf035a896c469e6d2901dda3ce7 --- /dev/null +++ b/target/arm/kvm-tmm.c @@ -0,0 +1,441 @@ +/* + * QEMU add virtcca cvm feature. + * + * Copyright (c) Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ * + */ + +#include "qemu/osdep.h" +#include "exec/confidential-guest-support.h" +#include "hw/boards.h" +#include "hw/core/cpu.h" +#include "kvm_arm.h" +#include "migration/blocker.h" +#include "qapi/error.h" +#include "qapi/qapi-commands-misc-target.h" +#include "qom/object_interfaces.h" +#include "sysemu/kvm.h" +#include "sysemu/runstate.h" +#include "hw/loader.h" +#include "linux-headers/asm-arm64/kvm.h" +#include + +#define TYPE_TMM_GUEST "tmm-guest" +OBJECT_DECLARE_SIMPLE_TYPE(TmmGuest, TMM_GUEST) + +#define TMM_PAGE_SIZE qemu_real_host_page_size() +#define TMM_MAX_PMU_CTRS 0x20 +#define TMM_MAX_CFG 6 +#define TMM_MEMORY_INFO_SYSFS "/sys/kernel/tmm/memory_info" + +typedef struct { + uint32_t kae_vf_num; + hwaddr sec_addr[KVM_ARM_TMM_MAX_KAE_VF_NUM]; + hwaddr hpre_addr[KVM_ARM_TMM_MAX_KAE_VF_NUM]; +} KaeDeviceInfo; + +struct TmmGuest { + ConfidentialGuestSupport parent_obj; + GSList *ram_regions; + TmmGuestMeasurementAlgo measurement_algo; + uint32_t sve_vl; + uint32_t num_pmu_cntrs; + KaeDeviceInfo kae_device_info; +}; + +typedef struct { + hwaddr base1; + hwaddr len1; + hwaddr base2; + hwaddr len2; + bool populate; +} TmmRamRegion; + +static TmmGuest *tmm_guest; + +bool kvm_arm_tmm_enabled(void) +{ + return !!tmm_guest; +} + +static int tmm_configure_one(TmmGuest *guest, uint32_t cfg, Error **errp) +{ + int ret = 1; + const char *cfg_str; + struct kvm_cap_arm_tmm_config_item args = { + .cfg = cfg, + }; + + switch (cfg) { + case KVM_CAP_ARM_TMM_CFG_RPV: + return 0; + case KVM_CAP_ARM_TMM_CFG_HASH_ALGO: + switch (guest->measurement_algo) { + case TMM_GUEST_MEASUREMENT_ALGO_DEFAULT: + return 0; + case TMM_GUEST_MEASUREMENT_ALGO_SHA256: + args.hash_algo = KVM_CAP_ARM_TMM_MEASUREMENT_ALGO_SHA256; + break; + case TMM_GUEST_MEASUREMENT_ALGO_SHA512: + args.hash_algo = KVM_CAP_ARM_TMM_MEASUREMENT_ALGO_SHA512; + break; + default: + g_assert_not_reached(); + } + cfg_str = "hash algorithm"; + break; + case KVM_CAP_ARM_TMM_CFG_SVE: + if (!guest->sve_vl) { + return 0; + } + args.sve_vq = guest->sve_vl / 128; + cfg_str = "SVE"; + break; + case KVM_CAP_ARM_TMM_CFG_DBG: + return 0; + case KVM_CAP_ARM_TMM_CFG_PMU: + if (!guest->num_pmu_cntrs) { + return 0; + } + args.num_pmu_cntrs = guest->num_pmu_cntrs; + cfg_str = "PMU"; + break; + case KVM_CAP_ARM_TMM_CFG_KAE: + if (!guest->kae_device_info.kae_vf_num) { + return 0; + } + args.kae_vf_num= guest->kae_device_info.kae_vf_num; + for (int i = 0; i < guest->kae_device_info.kae_vf_num; i++) { + args.sec_addr[i] = guest->kae_device_info.sec_addr[i]; + args.hpre_addr[i] = guest->kae_device_info.hpre_addr[i]; + } + cfg_str = "KAE"; + break; + default: + g_assert_not_reached(); + } + + ret = kvm_vm_enable_cap(kvm_state, KVM_CAP_ARM_TMM, 0, + KVM_CAP_ARM_TMM_CONFIG_CVM, (intptr_t)&args); + if (ret) { + error_setg_errno(errp, -ret, "TMM: failed to configure %s", cfg_str); + } + + return ret; +} + +static gint tmm_compare_ram_regions(gconstpointer a, gconstpointer b) +{ + const TmmRamRegion *ra = a; + const TmmRamRegion *rb = b; + + g_assert(ra->base1 != rb->base1); + return ra->base1 < rb->base1 ? 
-1 : 1; +} + +void tmm_add_ram_region(hwaddr base1, hwaddr len1, hwaddr base2, hwaddr len2, bool populate) +{ + TmmRamRegion *region; + + region = g_new0(TmmRamRegion, 1); + region->base1 = QEMU_ALIGN_DOWN(base1, TMM_PAGE_SIZE); + region->len1 = QEMU_ALIGN_UP(len1, TMM_PAGE_SIZE); + region->base2 = QEMU_ALIGN_DOWN(base2, TMM_PAGE_SIZE); + region->len2 = QEMU_ALIGN_UP(len2, TMM_PAGE_SIZE); + region->populate = populate; + + tmm_guest->ram_regions = g_slist_insert_sorted(tmm_guest->ram_regions, + region, tmm_compare_ram_regions); +} + +static void tmm_populate_region(gpointer data, gpointer unused) +{ + int ret; + const TmmRamRegion *region = data; + struct kvm_cap_arm_tmm_populate_region_args populate_args = { + .populate_ipa_base1 = region->base1, + .populate_ipa_size1 = region->len1, + .populate_ipa_base2 = region->base2, + .populate_ipa_size2 = region->len2, + .flags = KVM_ARM_TMM_POPULATE_FLAGS_MEASURE, + }; + + if (!region->populate) { + return; + } + + ret = kvm_vm_enable_cap(kvm_state, KVM_CAP_ARM_TMM, 0, + KVM_CAP_ARM_TMM_POPULATE_CVM, + (intptr_t)&populate_args); + if (ret) { + error_report("TMM: failed to populate cvm region (0x%"HWADDR_PRIx", 0x%"HWADDR_PRIx", 0x%"HWADDR_PRIx", 0x%"HWADDR_PRIx"): %s", + region->base1, region->len1, region->base2, region->len2, strerror(-ret)); + exit(1); + } +} + +static int tmm_create_rd(Error **errp) +{ + int ret = kvm_vm_enable_cap(kvm_state, KVM_CAP_ARM_TMM, 0, + KVM_CAP_ARM_TMM_CREATE_RD); + if (ret) { + error_setg_errno(errp, -ret, "TMM: failed to create tmm Descriptor"); + } + return ret; +} + +static void tmm_vm_state_change(void *opaque, bool running, RunState state) +{ + int ret; + CPUState *cs; + + if (!running) { + return; + } + + g_slist_foreach(tmm_guest->ram_regions, tmm_populate_region, NULL); + g_slist_free_full(g_steal_pointer(&tmm_guest->ram_regions), g_free); + + CPU_FOREACH(cs) { + ret = kvm_arm_vcpu_finalize(cs, KVM_ARM_VCPU_TEC); + if (ret) { + error_report("TMM: failed to finalize vCPU: %s", strerror(-ret)); + exit(1); + } + } + + ret = kvm_vm_enable_cap(kvm_state, KVM_CAP_ARM_TMM, 0, + KVM_CAP_ARM_TMM_ACTIVATE_CVM); + if (ret) { + error_report("TMM: failed to activate cvm: %s", strerror(-ret)); + exit(1); + } +} + +int kvm_arm_tmm_init(ConfidentialGuestSupport *cgs, Error **errp) +{ + int ret; + int cfg; + + if (!tmm_guest) { + return -ENODEV; + } + + if (!kvm_check_extension(kvm_state, KVM_CAP_ARM_TMM)) { + error_setg(errp, "KVM does not support TMM"); + return -ENODEV; + } + + for (cfg = 0; cfg < TMM_MAX_CFG; cfg++) { + ret = tmm_configure_one(tmm_guest, cfg, &error_abort); + if (ret) { + return ret; + } + } + + ret = tmm_create_rd(&error_abort); + if (ret) { + return ret; + } + + qemu_add_vm_change_state_handler(tmm_vm_state_change, NULL); + return 0; +} + +static void tmm_get_sve_vl(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + TmmGuest *guest = TMM_GUEST(obj); + + visit_type_uint32(v, name, &guest->sve_vl, errp); +} + +static void tmm_set_sve_vl(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + TmmGuest *guest = TMM_GUEST(obj); + uint32_t value; + + if (!visit_type_uint32(v, name, &value, errp)) { + return; + } + + if (value & 0x7f || value >= ARM_MAX_VQ * 128) { + error_setg(errp, "invalid SVE vector length"); + return; + } + + guest->sve_vl = value; +} + +static void tmm_get_num_pmu_cntrs(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + TmmGuest *guest = TMM_GUEST(obj); + + visit_type_uint32(v, name, &guest->num_pmu_cntrs, 
errp); +} + +static void tmm_set_num_pmu_cntrs(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + TmmGuest *guest = TMM_GUEST(obj); + uint32_t value; + + if (!visit_type_uint32(v, name, &value, errp)) { + return; + } + + if (value >= TMM_MAX_PMU_CTRS) { + error_setg(errp, "invalid number of PMU counters"); + return; + } + + guest->num_pmu_cntrs = value; +} + +static int tmm_get_measurement_algo(Object *obj, Error **errp G_GNUC_UNUSED) +{ + TmmGuest *guest = TMM_GUEST(obj); + + return guest->measurement_algo; +} + +static void tmm_set_measurement_algo(Object *obj, int algo, Error **errp G_GNUC_UNUSED) +{ + TmmGuest *guest = TMM_GUEST(obj); + + guest->measurement_algo = algo; +} + +static void tmm_get_kae_vf_num(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + TmmGuest *guest = TMM_GUEST(obj); + + visit_type_uint32(v, name, &guest->kae_device_info.kae_vf_num, errp); +} + +static void tmm_set_kae_vf_num(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + TmmGuest *guest = TMM_GUEST(obj); + uint32_t value; + + if (!visit_type_uint32(v, name, &value, errp)) { + return; + } + + if (value > KVM_ARM_TMM_MAX_KAE_VF_NUM) { + error_setg(errp, "invalid number of KAE VFs"); + return; + } + + guest->kae_device_info.kae_vf_num = value; +} + +int tmm_get_kae_num(void) +{ + return tmm_guest->kae_device_info.kae_vf_num; +} + +void tmm_set_sec_addr(hwaddr base, int num) +{ + tmm_guest->kae_device_info.sec_addr[num] = base; +} + +void tmm_set_hpre_addr(hwaddr base, int num) +{ + tmm_guest->kae_device_info.hpre_addr[num] = base; +} + +static void tmm_guest_class_init(ObjectClass *oc, void *data) +{ + object_class_property_add_enum(oc, "measurement-algo", + "TmmGuestMeasurementAlgo", + &TmmGuestMeasurementAlgo_lookup, + tmm_get_measurement_algo, + tmm_set_measurement_algo); + object_class_property_set_description(oc, "measurement-algo", + "CVM measurement algorithm ('sha256', 'sha512')"); + /* + * This is not ideal. Normally SVE parameters are given to -cpu, but the + * CVM parameters are needed much earlier than CPU initialization. We also + * don't have a way to discover what is supported at the moment; the idea + * is that the user knows exactly what hardware they are running on, + * because these parameters are part of the measurement and play a part + * in the attestation. + */ + object_class_property_add(oc, "sve-vector-length", "uint32", tmm_get_sve_vl, + tmm_set_sve_vl, NULL, NULL); + object_class_property_set_description(oc, "sve-vector-length", + "SVE vector length. 0 disables SVE (the default)"); + object_class_property_add(oc, "num-pmu-counters", "uint32", + tmm_get_num_pmu_cntrs, tmm_set_num_pmu_cntrs, + NULL, NULL); + object_class_property_set_description(oc, "num-pmu-counters", + "Number of PMU counters"); + object_class_property_add(oc, "kae", "uint32", tmm_get_kae_vf_num, + tmm_set_kae_vf_num, NULL, NULL); + object_class_property_set_description(oc, "kae", + "Number of KAE virtual functions.
0 disables KAE (the default)"); +} + +static void tmm_guest_instance_init(Object *obj) +{ + if (tmm_guest) { + error_report("only a single instance of TmmGuest is supported"); + exit(1); + } + tmm_guest = TMM_GUEST(obj); +} + +static const TypeInfo tmm_guest_info = { + .parent = TYPE_CONFIDENTIAL_GUEST_SUPPORT, + .name = TYPE_TMM_GUEST, + .instance_size = sizeof(struct TmmGuest), + .instance_init = tmm_guest_instance_init, + .class_init = tmm_guest_class_init, + .interfaces = (InterfaceInfo[]) { + { TYPE_USER_CREATABLE }, + { } + } +}; + +static void tmm_register_types(void) +{ + type_register_static(&tmm_guest_info); +} +type_init(tmm_register_types); + +static VirtccaCapability *virtcca_get_capabilities(Error **errp) +{ + VirtccaCapability *cap = NULL; + uint64_t tmi_version = 0; + int rc = 0; + + if (kvm_ioctl(kvm_state, KVM_GET_TMI_VERSION, &tmi_version) < 0) { + error_setg(errp, "VIRTCCA is not enabled in KVM"); + return NULL; + } + + rc = access(TMM_MEMORY_INFO_SYSFS, R_OK); + if (rc < 0) { + error_setg_errno(errp, errno, "VIRTCCA: Failed to read %s", + TMM_MEMORY_INFO_SYSFS); + return NULL; + } + + cap = g_new0(VirtccaCapability, 1); + + cap->enabled = true; + + return cap; +} + +VirtccaCapability *qmp_query_virtcca_capabilities(Error **errp) +{ + return virtcca_get_capabilities(errp); +} \ No newline at end of file diff --git a/target/arm/kvm.c b/target/arm/kvm.c index 7903e2ddde1b70bbd0db7f27992afe86763899d5..ab31515a2af6500374ec9676117c162e13692365 100644 --- a/target/arm/kvm.c +++ b/target/arm/kvm.c @@ -26,6 +26,8 @@ #include "trace.h" #include "internals.h" #include "hw/pci/pci.h" +#include "hw/pci/msi.h" +#include "hw/pci/msix.h" #include "exec/memattrs.h" #include "exec/address-spaces.h" #include "hw/boards.h" @@ -255,9 +257,27 @@ int kvm_arch_get_default_type(MachineState *ms) return fixed_ipa ? 0 : size; } +static void kvm_update_ipiv_cap(KVMState *s) +{ + int ret; + + if (!kvm_check_extension(s, KVM_CAP_ARM_HISI_IPIV)) { + return; + } + + ret = kvm_vm_enable_cap(s, KVM_CAP_ARM_HISI_IPIV, 0); + if (ret) { + fprintf(stderr, "Could not enable KVM_CAP_ARM_HISI_IPIV: %d\n", ret); + } + + return; +} + int kvm_arch_init(MachineState *ms, KVMState *s) { + MachineClass *mc = MACHINE_GET_CLASS(ms); int ret = 0; + /* For ARM interrupt delivery is always asynchronous, * whether we are using an in-kernel VGIC or not. */ @@ -308,7 +328,25 @@ int kvm_arch_init(MachineState *ms, KVMState *s) } } + /* + * To be able to handle PSCI CPU ON calls in QEMU, we need to install an + * SMCCC filter in the host KVM. This is required to support features + * such as virtual CPU hotplug on ARM platforms.
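+ * For example, a guest started with "-smp cpus=2,maxcpus=4" leaves two
+ * vCPUs unplugged; a later PSCI CPU_ON call for one of them is then
+ * forwarded to QEMU instead of being completed entirely in-kernel.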
+ */ + if (mc->has_hotpluggable_cpus && ms->smp.max_cpus > ms->smp.cpus) { + if (kvm_arm_set_smccc_filter(PSCI_0_2_FN64_CPU_ON, + KVM_SMCCC_FILTER_FWD_TO_USER)) { + error_report("CPU On PSCI-to-user-space fwd filter install failed"); + } else if (kvm_arm_set_smccc_filter(PSCI_0_2_FN_CPU_OFF, + KVM_SMCCC_FILTER_FWD_TO_USER)) { + error_report("CPU Off PSCI-to-user-space fwd filter install failed"); + } else { + s->kvm_smccc_filter_enabled = true; + } + } + kvm_arm_init_debug(s); + kvm_update_ipiv_cap(s); return ret; } @@ -592,6 +630,10 @@ bool write_list_to_kvmstate(ARMCPU *cpu, int level) continue; } + if (virtcca_cvm_enabled() && regidx == KVM_REG_ARM_TIMER_CNT) { + continue; + } + switch (regidx & KVM_REG_SIZE_MASK) { case KVM_REG_SIZE_U32: v32 = cpu->cpreg_values[i]; @@ -634,11 +676,12 @@ void kvm_arm_cpu_post_load(ARMCPU *cpu) void kvm_arm_reset_vcpu(ARMCPU *cpu) { int ret; + CPUState *cs = CPU(cpu); /* Re-init VCPU so that all registers are set to * their respective reset values. */ - ret = kvm_arm_vcpu_init(CPU(cpu)); + ret = kvm_arm_vcpu_init(cs); if (ret < 0) { fprintf(stderr, "kvm_arm_vcpu_init failed: %s\n", strerror(-ret)); abort(); @@ -655,6 +698,45 @@ void kvm_arm_reset_vcpu(ARMCPU *cpu) * for the same reason we do so in kvm_arch_get_registers(). */ write_list_to_cpustate(cpu); + + /* + * Ensure we call kvm_arch_put_registers(). The vCPU isn't marked dirty if + * it was parked in KVM and is now booting from a PSCI CPU_ON call. + */ + cs->vcpu_dirty = true; +} + +void kvm_arm_create_host_vcpu(ARMCPU *cpu) +{ + CPUState *cs = CPU(cpu); + unsigned long vcpu_id = cs->cpu_index; + int ret; + + ret = kvm_create_vcpu(cs); + if (ret < 0) { + error_report("Failed to create host vcpu %ld", vcpu_id); + abort(); + } + + /* + * Initialize the vCPU in the host. This resets the sys regs for this + * vCPU; related registers like MPIDR_EL1 also get programmed during + * this call into the host. These are referred to later while setting + * device attributes of the GICR during GICv3 reset. + */ + arm_cpu_finalize_features(cpu, &error_abort); + ret = kvm_arch_init_vcpu(cs); + if (ret < 0) { + error_report("Failed to initialize host vcpu %ld", vcpu_id); + abort(); + } + + /* + * Park the created vCPU; it shall be picked up again via kvm_get_vcpu() + * when threads are created during realization of the ARM vCPUs. + */ + kvm_park_vcpu(cs); } /* @@ -925,6 +1007,38 @@ static int kvm_arm_handle_dabt_nisv(CPUState *cs, uint64_t esr_iss, return -1; } +static int kvm_arm_handle_hypercall(CPUState *cs, struct kvm_run *run) +{ + ARMCPU *cpu = ARM_CPU(cs); + CPUARMState *env = &cpu->env; + + kvm_cpu_synchronize_state(cs); + + /* + * Hard-code the immediate to 0, as we don't expect a non-zero value for + * now. This might change in future versions; KVM_GET_ONE_REG could then + * be used, but only once synchronization is enhanced to also fetch the + * ESR_EL2 value. + */ + if (run->hypercall.flags == KVM_HYPERCALL_EXIT_SMC) { + cs->exception_index = EXCP_SMC; + env->exception.syndrome = syn_aa64_smc(0); + } else { + cs->exception_index = EXCP_HVC; + env->exception.syndrome = syn_aa64_hvc(0); + } + env->exception.target_el = 1; + qemu_mutex_lock_iothread(); + arm_cpu_do_interrupt(cs); + qemu_mutex_unlock_iothread(); + + /* + * For PSCI, exit the kvm_run loop and process the work. Especially + * important if this was a CPU_OFF command and we can't return to the guest.
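+ * Returning EXCP_INTERRUPT (a positive value) causes kvm_cpu_exec() to
+ * leave its run loop rather than immediately re-entering the guest.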
+ */ + return EXCP_INTERRUPT; +} + int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) { int ret = 0; @@ -940,6 +1054,9 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) ret = kvm_arm_handle_dabt_nisv(cs, run->arm_nisv.esr_iss, run->arm_nisv.fault_ipa); break; + case KVM_EXIT_HYPERCALL: + ret = kvm_arm_handle_hypercall(cs, run); + break; default: qemu_log_mask(LOG_UNIMP, "%s: un-handled exit reason %d\n", __func__, run->exit_reason); @@ -1053,6 +1170,51 @@ int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route, return 0; } +int kvm_create_shadow_device(PCIDevice *dev) +{ + KVMState *s = kvm_state; + struct kvm_master_dev_info *mdi; + MSIMessage msg; + uint32_t vector, nvectors = msix_nr_vectors_allocated(dev); + uint32_t request_id; + int ret; + + if (!kvm_vm_check_extension(s, KVM_CAP_ARM_VIRT_MSI_BYPASS) || !nvectors) { + return 0; + } + + mdi = g_malloc0(sizeof(uint32_t) + sizeof(struct kvm_msi) * nvectors); + mdi->nvectors = nvectors; + request_id = pci_requester_id(dev); + + for (vector = 0; vector < nvectors; vector++) { + msg = msix_get_message(dev, vector); + mdi->msi[vector].address_lo = extract64(msg.address, 0, 32); + mdi->msi[vector].address_hi = extract64(msg.address, 32, 32); + mdi->msi[vector].data = le32_to_cpu(msg.data); + mdi->msi[vector].flags = KVM_MSI_VALID_DEVID; + mdi->msi[vector].devid = request_id; + memset(mdi->msi[vector].pad, 0, sizeof(mdi->msi[vector].pad)); + } + + ret = kvm_vm_ioctl(s, KVM_CREATE_SHADOW_DEV, mdi); + g_free(mdi); + return ret; +} + +int kvm_delete_shadow_device(PCIDevice *dev) +{ + KVMState *s = kvm_state; + uint32_t request_id, nvectors = msix_nr_vectors_allocated(dev); + + if (!kvm_vm_check_extension(s, KVM_CAP_ARM_VIRT_MSI_BYPASS) || !nvectors) { + return 0; + } + + request_id = pci_requester_id(dev); + return kvm_vm_ioctl(s, KVM_DEL_SHADOW_DEV, &request_id); +} + int kvm_arch_add_msi_route_post(struct kvm_irq_routing_entry *route, int vector, PCIDevice *dev) { @@ -1071,7 +1233,7 @@ int kvm_arch_msi_data_to_gsi(uint32_t data) bool kvm_arch_cpu_check_are_resettable(void) { - return true; + return !virtcca_cvm_enabled(); } static void kvm_arch_get_eager_split_size(Object *obj, Visitor *v, diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c index 3c175c93a7a8f29a7a157c9bf059db601f546b7f..b099287ed0645d466604456236171c6b1dc3dd52 100644 --- a/target/arm/kvm64.c +++ b/target/arm/kvm64.c @@ -111,6 +111,25 @@ bool kvm_arm_hw_debug_active(CPUState *cs) return ((cur_hw_wps > 0) || (cur_hw_bps > 0)); } +static bool kvm_arm_set_vm_attr(struct kvm_device_attr *attr, const char *name) +{ + int err; + + err = kvm_vm_ioctl(kvm_state, KVM_HAS_DEVICE_ATTR, attr); + if (err != 0) { + error_report("%s: KVM_HAS_DEVICE_ATTR: %s", name, strerror(-err)); + return false; + } + + err = kvm_vm_ioctl(kvm_state, KVM_SET_DEVICE_ATTR, attr); + if (err != 0) { + error_report("%s: KVM_SET_DEVICE_ATTR: %s", name, strerror(-err)); + return false; + } + + return true; +} + static bool kvm_arm_set_device_attr(CPUState *cs, struct kvm_device_attr *attr, const char *name) { @@ -181,6 +200,28 @@ void kvm_arm_pvtime_init(CPUState *cs, uint64_t ipa) } } +int kvm_arm_set_smccc_filter(uint64_t func, uint8_t faction) +{ + struct kvm_smccc_filter filter = { + .base = func, + .nr_functions = 1, + .action = faction, + }; + struct kvm_device_attr attr = { + .group = KVM_ARM_VM_SMCCC_CTRL, + .attr = KVM_ARM_VM_SMCCC_FILTER, + .flags = 0, + .addr = (uintptr_t)&filter, + }; + + if (!kvm_arm_set_vm_attr(&attr, "SMCCC Filter")) { + error_report("failed to set 
SMCCC filter in KVM Host"); + return -1; + } + + return 0; +} + static int read_sys_reg32(int fd, uint32_t *pret, uint64_t id) { uint64_t ret; @@ -543,6 +584,11 @@ static int kvm_arm_sve_set_vls(CPUState *cs) assert(cpu->sve_max_vq <= KVM_ARM64_SVE_VQ_MAX); + if (virtcca_cvm_enabled()) { + /* Already set through tmm config */ + return 0; + } + return kvm_set_one_reg(cs, KVM_REG_ARM64_SVE_VLS, &vls[0]); } @@ -562,7 +608,14 @@ int kvm_arch_init_vcpu(CPUState *cs) return -EINVAL; } - qemu_add_vm_change_state_handler(kvm_arm_vm_state_change, cs); + /* + * Install the VM change state handler only when the vCPU thread has + * been spawned, i.e. the vCPU is being realized. + */ + if (cs->thread_id) { + cs->vmcse = qemu_add_vm_change_state_handler(kvm_arm_vm_state_change, + cs); + } /* Determine init features for this CPU */ memset(cpu->kvm_init_features, 0, sizeof(cpu->kvm_init_features)); @@ -599,7 +652,7 @@ int kvm_arch_init_vcpu(CPUState *cs) return ret; } - if (cpu_isar_feature(aa64_sve, cpu)) { + if (cpu_isar_feature(aa64_sve, cpu) && !cpu->kvm_sve_finalized) { ret = kvm_arm_sve_set_vls(cs); if (ret) { return ret; @@ -622,9 +675,8 @@ } /* - * When KVM is in use, PSCI is emulated in-kernel and not by qemu. - * Currently KVM has its own idea about MPIDR assignment, so we - * override our defaults with what we get from KVM. + * KVM may emulate PSCI in-kernel. Currently KVM has its own idea about + * MPIDR assignment, so we override our defaults with what we get from KVM. */ ret = kvm_get_one_reg(cs, ARM64_SYS_REG(ARM_CPU_ID_MPIDR), &mpidr); if (ret) { @@ -640,6 +692,10 @@ int kvm_arch_destroy_vcpu(CPUState *cs) { + if (cs->thread_id) { + qemu_del_vm_change_state_handler(cs->vmcse); + } + return 0; } diff --git a/target/arm/kvm_arm.h b/target/arm/kvm_arm.h index 051a0da41c4929c178134f2b2a67622e84ef42b5..a29d4548f4ce39a767ac59c694beda3ed7b61506 100644 --- a/target/arm/kvm_arm.h +++ b/target/arm/kvm_arm.h @@ -73,6 +73,8 @@ int kvm_arm_vcpu_finalize(CPUState *cs, int feature); void kvm_arm_register_device(MemoryRegion *mr, uint64_t devid, uint64_t group, uint64_t attr, int dev_fd, uint64_t addr_ormask); +void virtcca_kvm_get_mmio_addr(hwaddr *mmio_start, hwaddr *mmio_size); + /** * kvm_arm_init_cpreg_list: * @cpu: ARMCPU @@ -163,6 +165,17 @@ void kvm_arm_cpu_post_load(ARMCPU *cpu); */ void kvm_arm_reset_vcpu(ARMCPU *cpu); +/** + * kvm_arm_create_host_vcpu: + * @cpu: ARMCPU + * + * Called to pre-create all possible KVM vCPUs within the host at virt + * machine init time. This also inits the pre-created vCPU and hence + * results in a vCPU reset in the host. These pre-created and initialized + * vCPUs shall be parked for use when the ARM vCPUs are actually realized.
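+ * (The parked vCPU fds are managed via kvm_park_vcpu() and looked up
+ * again through kvm_get_vcpu() at realize time.)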
+ */ +void kvm_arm_create_host_vcpu(ARMCPU *cpu); + /** * kvm_arm_init_serror_injection: * @cs: CPUState @@ -377,6 +390,24 @@ void kvm_arm_pvtime_init(CPUState *cs, uint64_t ipa); int kvm_arm_set_irq(int cpu, int irqtype, int irq, int level); +void tmm_add_ram_region(hwaddr base1, hwaddr len1, hwaddr base2, hwaddr len2, bool populate); + +int tmm_get_kae_num(void); +void tmm_set_sec_addr(hwaddr base, int num); +void tmm_set_hpre_addr(hwaddr base, int num); + +int kvm_arm_tmm_init(ConfidentialGuestSupport *cgs, Error **errp); +bool kvm_arm_tmm_enabled(void); + +/** + * kvm_arm_set_smccc_filter: + * @func: function + * @faction: SMCCC filter action (handle, deny, fwd-to-user) to be deployed + * + * Sets the ARM SMCCC filter in the KVM host for selective hypercall exits + */ +int kvm_arm_set_smccc_filter(uint64_t func, uint8_t faction); + #else /* @@ -451,6 +482,36 @@ static inline uint32_t kvm_arm_sve_get_vls(CPUState *cs) g_assert_not_reached(); } +static inline int kvm_arm_set_smccc_filter(uint64_t func, uint8_t faction) +{ + g_assert_not_reached(); +} + +static inline int kvm_arm_tmm_init(ConfidentialGuestSupport *cgs, Error **errp G_GNUC_UNUSED) +{ + g_assert_not_reached(); +} + +static inline void tmm_add_ram_region(hwaddr base1, hwaddr len1, hwaddr base2, + hwaddr len2, bool populate) +{ + g_assert_not_reached(); +} + +static inline void tmm_set_sec_addr(hwaddr base, int num) +{ + g_assert_not_reached(); +} + +static inline void tmm_set_hpre_addr(hwaddr base, int num) +{ + g_assert_not_reached(); +} + +static inline int tmm_get_kae_num(void) +{ + g_assert_not_reached(); +} #endif /** diff --git a/target/arm/meson.build b/target/arm/meson.build index 5d04a8e94f2ebace7dd56ee3c92923b4e1b8d55d..389ee5465882e31718e27dedd878036eeae2ea8a 100644 --- a/target/arm/meson.build +++ b/target/arm/meson.build @@ -10,6 +10,7 @@ arm_ss.add(zlib) arm_ss.add(when: 'CONFIG_KVM', if_true: files('hyp_gdbstub.c', 'kvm.c', 'kvm64.c'), if_false: files('kvm-stub.c')) arm_ss.add(when: 'CONFIG_HVF', if_true: files('hyp_gdbstub.c')) +arm_ss.add(when: 'CONFIG_KVM', if_true: files('kvm-tmm.c')) arm_ss.add(when: 'TARGET_AARCH64', if_true: files( 'cpu64.c', @@ -23,6 +24,7 @@ arm_system_ss.add(files( 'arm-qmp-cmds.c', 'cortex-regs.c', 'machine.c', + 'psci.c', 'ptw.c', )) diff --git a/target/arm/tcg/psci.c b/target/arm/psci.c similarity index 97% rename from target/arm/tcg/psci.c rename to target/arm/psci.c index 6c1239bb9685a00ef7d9ad883ba4c1a0e87ada8b..a8690a16afc53c314411959fe9ebd81bd4ba4652 100644 --- a/target/arm/tcg/psci.c +++ b/target/arm/psci.c @@ -21,7 +21,9 @@ #include "exec/helper-proto.h" #include "kvm-consts.h" #include "qemu/main-loop.h" +#include "qemu/error-report.h" #include "sysemu/runstate.h" +#include "sysemu/tcg.h" #include "internals.h" #include "arm-powerctl.h" @@ -157,6 +159,11 @@ void arm_handle_psci_call(ARMCPU *cpu) case QEMU_PSCI_0_1_FN_CPU_SUSPEND: case QEMU_PSCI_0_2_FN_CPU_SUSPEND: case QEMU_PSCI_0_2_FN64_CPU_SUSPEND: + if (!tcg_enabled()) { + warn_report("CPU suspend not supported in non-tcg mode"); + break; + } +#ifdef CONFIG_TCG /* Affinity levels are not supported in QEMU */ if (param[1] & 0xfffe0000) { ret = QEMU_PSCI_RET_INVALID_PARAMS; @@ -169,6 +176,7 @@ void arm_handle_psci_call(ARMCPU *cpu) env->regs[0] = 0; } helper_wfi(env, 4); +#endif break; case QEMU_PSCI_1_0_FN_PSCI_FEATURES: switch (param[1]) { diff --git a/target/arm/syndrome.h b/target/arm/syndrome.h index
95454b5b3bb01be2800f0933da43635c9f6fcde5..eccb759da6bb982f532b220867fd0898d2b16c92 100644 --- a/target/arm/syndrome.h +++ b/target/arm/syndrome.h @@ -25,6 +25,8 @@ #ifndef TARGET_ARM_SYNDROME_H #define TARGET_ARM_SYNDROME_H +#include "qemu/bitops.h" + /* Valid Syndrome Register EC field values */ enum arm_exception_class { EC_UNCATEGORIZED = 0x00, @@ -80,6 +82,7 @@ typedef enum { SME_ET_InactiveZA, } SMEExceptionType; +#define ARM_EL_EC_LENGTH 6 #define ARM_EL_EC_SHIFT 26 #define ARM_EL_IL_SHIFT 25 #define ARM_EL_ISV_SHIFT 24 @@ -91,6 +94,11 @@ static inline uint32_t syn_get_ec(uint32_t syn) return syn >> ARM_EL_EC_SHIFT; } +static inline uint32_t syn_set_ec(uint32_t syn, uint32_t ec) +{ + return deposit32(syn, ARM_EL_EC_SHIFT, ARM_EL_EC_LENGTH, ec); +} + /* * Utility functions for constructing various kinds of syndrome value. * Note that in general we follow the AArch64 syndrome values; in a diff --git a/target/arm/tcg/meson.build b/target/arm/tcg/meson.build index 6fca38f2ccb1d37ecdd15ba2a082b744231828da..ad3cfcb3bda90eede7a8bed922c38c04f388fdce 100644 --- a/target/arm/tcg/meson.build +++ b/target/arm/tcg/meson.build @@ -51,7 +51,3 @@ arm_ss.add(when: 'TARGET_AARCH64', if_true: files( 'sme_helper.c', 'sve_helper.c', )) - -arm_system_ss.add(files( - 'psci.c', -)) diff --git a/target/arm/tcg/sme_helper.c b/target/arm/tcg/sme_helper.c index 1ee2690ceb5a293fdc23146b1a9b294b2b2c49a5..ae4f39ed02f13b641f5e1ceb4c83d313d9209d89 100644 --- a/target/arm/tcg/sme_helper.c +++ b/target/arm/tcg/sme_helper.c @@ -573,8 +573,8 @@ void sme_ld1_mte(CPUARMState *env, void *za, uint64_t *vg, desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); /* Perform gross MTE suppression early. */ - if (!tbi_check(desc, bit55) || - tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { + if (!tbi_check(mtedesc, bit55) || + tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) { mtedesc = 0; } @@ -750,8 +750,8 @@ void sme_st1_mte(CPUARMState *env, void *za, uint64_t *vg, target_ulong addr, desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); /* Perform gross MTE suppression early. */ - if (!tbi_check(desc, bit55) || - tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { + if (!tbi_check(mtedesc, bit55) || + tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) { mtedesc = 0; } @@ -916,7 +916,7 @@ void HELPER(sme_fmopa_s)(void *vza, void *vzn, void *vzm, void *vpn, if (pb & 1) { uint32_t *a = vza_row + H1_4(col); uint32_t *m = vzm + H1_4(col); - *a = float32_muladd(n, *m, *a, 0, vst); + *a = float32_muladd(n, *m, *a, 0, &fpst); } col += 4; pb >>= 4; @@ -1134,10 +1134,10 @@ static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \ uint64_t sum = 0; \ /* Apply P to N as a mask, making the inactive elements 0. */ \ n &= expand_pred_h(p); \ - sum += (NTYPE)(n >> 0) * (MTYPE)(m >> 0); \ - sum += (NTYPE)(n >> 16) * (MTYPE)(m >> 16); \ - sum += (NTYPE)(n >> 32) * (MTYPE)(m >> 32); \ - sum += (NTYPE)(n >> 48) * (MTYPE)(m >> 48); \ + sum += (int64_t)(NTYPE)(n >> 0) * (MTYPE)(m >> 0); \ + sum += (int64_t)(NTYPE)(n >> 16) * (MTYPE)(m >> 16); \ + sum += (int64_t)(NTYPE)(n >> 32) * (MTYPE)(m >> 32); \ + sum += (int64_t)(NTYPE)(n >> 48) * (MTYPE)(m >> 48); \ return neg ? 
a - sum : a + sum; \ } diff --git a/target/arm/tcg/sve_helper.c b/target/arm/tcg/sve_helper.c index f006d152cc9dc8581cbf94918e7e8768d5b6e341..9694201550aa69251cf704aa8e7fe71fc469dba0 100644 --- a/target/arm/tcg/sve_helper.c +++ b/target/arm/tcg/sve_helper.c @@ -5800,8 +5800,8 @@ void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); /* Perform gross MTE suppression early. */ - if (!tbi_check(desc, bit55) || - tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { + if (!tbi_check(mtedesc, bit55) || + tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) { mtedesc = 0; } @@ -6156,8 +6156,8 @@ void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr, desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); /* Perform gross MTE suppression early. */ - if (!tbi_check(desc, bit55) || - tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { + if (!tbi_check(mtedesc, bit55) || + tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) { mtedesc = 0; } @@ -6306,9 +6306,6 @@ void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr, flags = info.page[0].flags | info.page[1].flags; if (unlikely(flags != 0)) { -#ifdef CONFIG_USER_ONLY - g_assert_not_reached(); -#else /* * At least one page includes MMIO. * Any bus operation can fail with cpu_transaction_failed, @@ -6339,7 +6336,6 @@ void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr, } while (reg_off & 63); } while (reg_off <= reg_last); return; -#endif } mem_off = info.mem_off_first[0]; @@ -6410,8 +6406,8 @@ void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); /* Perform gross MTE suppression early. 
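 * (These checks must consult mtedesc rather than desc: after the
 * extract32() above, desc retains only the SIMD descriptor bits, while
 * the TBI and TCMA fields now live in mtedesc.)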
*/ - if (!tbi_check(desc, bit55) || - tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { + if (!tbi_check(mtedesc, bit55) || + tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) { mtedesc = 0; } diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c index a2e49c39f9f3caf17b1bc1fdbba6018121962777..5beac07b602a26e8b655097d6e59702f6b1119af 100644 --- a/target/arm/tcg/translate-a64.c +++ b/target/arm/tcg/translate-a64.c @@ -3306,7 +3306,7 @@ static bool trans_LDAPR(DisasContext *s, arg_LDAPR *a) if (a->rn == 31) { gen_check_sp_alignment(s); } - mop = check_atomic_align(s, a->rn, a->sz); + mop = check_ordered_align(s, a->rn, 0, false, a->sz); clean_addr = gen_mte_check1(s, cpu_reg_sp(s, a->rn), false, a->rn != 31, mop); /* @@ -8221,7 +8221,7 @@ static void handle_vec_simd_sqshrn(DisasContext *s, bool is_scalar, bool is_q, narrowfn(tcg_rd_narrowed, tcg_env, tcg_rd); tcg_gen_extu_i32_i64(tcg_rd, tcg_rd_narrowed); if (i == 0) { - tcg_gen_mov_i64(tcg_final, tcg_rd); + tcg_gen_extract_i64(tcg_final, tcg_rd, 0, esize); } else { tcg_gen_deposit_i64(tcg_final, tcg_final, tcg_rd, esize * i, esize); } @@ -10141,6 +10141,7 @@ static void handle_vec_simd_wshli(DisasContext *s, bool is_q, bool is_u, tcg_gen_shli_i64(tcg_rd, tcg_rd, shift); write_vec_element(s, tcg_rd, rd, i, size + 1); } + clear_vec_high(s, true, rd); } /* SHRN/RSHRN - Shift right with narrowing (and potential rounding) */ diff --git a/target/arm/tcg/translate-sme.c b/target/arm/tcg/translate-sme.c index 8f0dfc884ec6a0c5dd5ae213649bb18e04b37839..1e89516736612042f037bdc4011c20d33d7cfa66 100644 --- a/target/arm/tcg/translate-sme.c +++ b/target/arm/tcg/translate-sme.c @@ -49,7 +49,15 @@ static TCGv_ptr get_tile_rowcol(DisasContext *s, int esz, int rs, /* Prepare a power-of-two modulo via extraction of @len bits. */ len = ctz32(streaming_vec_reg_size(s)) - esz; - if (vertical) { + if (!len) { + /* + * SVL is 128 and the element size is 128. There is exactly + * one 128x128 tile in the ZA storage, and so we calculate + * (Rs + imm) MOD 1, which is always 0. We need to special case + * this because TCG doesn't allow deposit ops with len 0. + */ + tcg_gen_movi_i32(tmp, 0); + } else if (vertical) { /* * Compute the byte offset of the index within the tile: * (index % (svl / size)) * size diff --git a/target/arm/tcg/translate-sve.c b/target/arm/tcg/translate-sve.c index 296e7d1ce223b6c7e33a6dbe6519d2d17002fd17..1b722ae75d2d8c51b6dec3d207da6fc67c27cd52 100644 --- a/target/arm/tcg/translate-sve.c +++ b/target/arm/tcg/translate-sve.c @@ -50,13 +50,27 @@ static int tszimm_esz(DisasContext *s, int x) static int tszimm_shr(DisasContext *s, int x) { - return (16 << tszimm_esz(s, x)) - x; + /* + * We won't use the tszimm_shr() value if tszimm_esz() returns -1 (the + * trans function will check for esz < 0), so we can return any + * value we like from here in that case as long as we avoid UB. + */ + int esz = tszimm_esz(s, x); + if (esz < 0) { + return esz; + } + return (16 << esz) - x; } /* See e.g. LSL (immediate, predicated). */ static int tszimm_shl(DisasContext *s, int x) { - return x - (8 << tszimm_esz(s, x)); + /* As with tszimm_shr(), value will be unused if esz < 0 */ + int esz = tszimm_esz(s, x); + if (esz < 0) { + return esz; + } + return x - (8 << esz); } /* The SH bit is in bit 8. Extract the low 8 and shift. 
*/ @@ -4443,26 +4457,28 @@ static void do_mem_zpa(DisasContext *s, int zt, int pg, TCGv_i64 addr, { unsigned vsz = vec_full_reg_size(s); TCGv_ptr t_pg; + uint32_t sizem1; int desc = 0; - /* - * For e.g. LD4, there are not enough arguments to pass all 4 - * registers as pointers, so encode the regno into the data field. - * For consistency, do this even for LD1. - */ + assert(mte_n >= 1 && mte_n <= 4); + sizem1 = (mte_n << dtype_msz(dtype)) - 1; + assert(sizem1 <= R_MTEDESC_SIZEM1_MASK >> R_MTEDESC_SIZEM1_SHIFT); if (s->mte_active[0]) { - int msz = dtype_msz(dtype); - desc = FIELD_DP32(desc, MTEDESC, MIDX, get_mem_index(s)); desc = FIELD_DP32(desc, MTEDESC, TBI, s->tbid); desc = FIELD_DP32(desc, MTEDESC, TCMA, s->tcma); desc = FIELD_DP32(desc, MTEDESC, WRITE, is_write); - desc = FIELD_DP32(desc, MTEDESC, SIZEM1, (mte_n << msz) - 1); + desc = FIELD_DP32(desc, MTEDESC, SIZEM1, sizem1); desc <<= SVE_MTEDESC_SHIFT; } else { addr = clean_data_tbi(s, addr); } + /* + * For e.g. LD4, there are not enough arguments to pass all 4 + * registers as pointers, so encode the regno into the data field. + * For consistency, do this even for LD1. + */ desc = simd_desc(vsz, vsz, zt | desc); t_pg = tcg_temp_new_ptr(); @@ -4600,7 +4616,7 @@ static void do_ld_zpa(DisasContext *s, int zt, int pg, * accessible via the instruction encoding. */ assert(fn != NULL); - do_mem_zpa(s, zt, pg, addr, dtype, nreg, false, fn); + do_mem_zpa(s, zt, pg, addr, dtype, nreg + 1, false, fn); } static bool trans_LD_zprr(DisasContext *s, arg_rprr_load *a) @@ -5168,14 +5184,13 @@ static void do_st_zpa(DisasContext *s, int zt, int pg, TCGv_i64 addr, if (nreg == 0) { /* ST1 */ fn = fn_single[s->mte_active[0]][be][msz][esz]; - nreg = 1; } else { /* ST2, ST3, ST4 -- msz == esz, enforced by encoding */ assert(msz == esz); fn = fn_multiple[s->mte_active[0]][be][nreg - 1][msz]; } assert(fn != NULL); - do_mem_zpa(s, zt, pg, addr, msz_dtype(s, msz), nreg, true, fn); + do_mem_zpa(s, zt, pg, addr, msz_dtype(s, msz), nreg + 1, true, fn); } static bool trans_ST_zprr(DisasContext *s, arg_rprr_store *a) diff --git a/target/arm/tcg/translate.c b/target/arm/tcg/translate.c index b3660173d1d07aa7c765a9d8e7789ccc3bcc114d..e555e885a13edf0deb39ac9ffc86b57c6a9e5a26 100644 --- a/target/arm/tcg/translate.c +++ b/target/arm/tcg/translate.c @@ -4584,7 +4584,7 @@ static void do_coproc_insn(DisasContext *s, int cpnum, int is64, tcg_gen_andi_i32(t, t, 1u << maskbit); tcg_gen_brcondi_i32(TCG_COND_EQ, t, 0, over.label); - gen_exception_insn(s, 0, EXCP_UDEF, syndrome); + gen_exception_insn_el(s, 0, EXCP_UDEF, syndrome, 2); /* * gen_exception_insn() will set is_jmp to DISAS_NORETURN, * but since we're conditionally branching over it, we want diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c index 1f93510b85cd352f093552b2897a87d6f7e98ea3..83b49ef0092cb9fa78e386565950c2bb377dd1a5 100644 --- a/target/arm/tcg/vec_helper.c +++ b/target/arm/tcg/vec_helper.c @@ -692,6 +692,13 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ { \ intptr_t i = 0, opr_sz = simd_oprsz(desc); \ intptr_t opr_sz_n = opr_sz / sizeof(TYPED); \ + /* \ + * Special case: opr_sz == 8 from AA64/AA32 advsimd means the \ + * first iteration might not be a full 16 byte segment. But \ + * for vector lengths beyond that this must be SVE and we know \ + * opr_sz is a multiple of 16, so we need not clamp segend \ + * to opr_sz_n when we advance it at the end of the loop. 
\ + */ \ intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n); \ intptr_t index = simd_data(desc); \ TYPED *d = vd, *a = va; \ @@ -709,7 +716,7 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ n[i * 4 + 2] * m2 + \ n[i * 4 + 3] * m3); \ } while (++i < segend); \ - segend = i + 4; \ + segend = i + (16 / sizeof(TYPED)); \ } while (i < opr_sz_n); \ clear_tail(d, opr_sz, simd_maxsz(desc)); \ } @@ -843,7 +850,7 @@ void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va, intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2); uint32_t neg_real = flip ^ neg_imag; intptr_t elements = opr_sz / sizeof(float16); - intptr_t eltspersegment = 16 / sizeof(float16); + intptr_t eltspersegment = MIN(16 / sizeof(float16), elements); intptr_t i, j; /* Shift boolean to the sign bit so we can xor to negate. */ @@ -905,7 +912,7 @@ void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va, intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2); uint32_t neg_real = flip ^ neg_imag; intptr_t elements = opr_sz / sizeof(float32); - intptr_t eltspersegment = 16 / sizeof(float32); + intptr_t eltspersegment = MIN(16 / sizeof(float32), elements); intptr_t i, j; /* Shift boolean to the sign bit so we can xor to negate. */ diff --git a/target/arm/vfp_helper.c b/target/arm/vfp_helper.c index 3e5e37abbe8787652fa04bc8e968a08f39ec7426..ff59bc552228339cb2f0baececf0e18b7b6856a0 100644 --- a/target/arm/vfp_helper.c +++ b/target/arm/vfp_helper.c @@ -1121,8 +1121,8 @@ const FloatRoundMode arm_rmode_to_sf_map[] = { uint64_t HELPER(fjcvtzs)(float64 value, void *vstatus) { float_status *status = vstatus; - uint32_t inexact, frac; - uint32_t e_old, e_new; + uint32_t frac, e_old, e_new; + bool inexact; e_old = get_float_exception_flags(status); set_float_exception_flags(0, status); @@ -1130,13 +1130,13 @@ uint64_t HELPER(fjcvtzs)(float64 value, void *vstatus) e_new = get_float_exception_flags(status); set_float_exception_flags(e_old | e_new, status); - if (value == float64_chs(float64_zero)) { - /* While not inexact for IEEE FP, -0.0 is inexact for JavaScript. */ - inexact = 1; - } else { - /* Normal inexact or overflow or NaN */ - inexact = e_new & (float_flag_inexact | float_flag_invalid); - } + /* Normal inexact, denormal with flush-to-zero, or overflow or NaN */ + inexact = e_new & (float_flag_inexact | + float_flag_input_denormal | + float_flag_invalid); + + /* While not inexact for IEEE FP, -0.0 is inexact for JavaScript. */ + inexact |= value == float64_chs(float64_zero); /* Pack the result and the env->ZF representation of Z together. */ return deposit64(frac, 32, 32, inexact); diff --git a/target/hexagon/idef-parser/parser-helpers.c b/target/hexagon/idef-parser/parser-helpers.c index 4af020933aaa3809ce6e23b56c1893df38bd8b6d..a83099de6b4959275d5b8b2ffd5de5706d3fbae7 100644 --- a/target/hexagon/idef-parser/parser-helpers.c +++ b/target/hexagon/idef-parser/parser-helpers.c @@ -2123,9 +2123,16 @@ void free_instruction(Context *c) g_string_free(g_array_index(c->inst.strings, GString*, i), TRUE); } g_array_free(c->inst.strings, TRUE); + /* + * Free list of arguments that might need initialization, if they haven't + * already been freed. 
+ */ + if (c->inst.init_list) { + g_array_free(c->inst.init_list, TRUE); + } /* Free INAME token value */ g_string_free(c->inst.name, TRUE); - /* Free variables and registers */ + /* Free declared TCGv variables */ g_array_free(c->inst.allocated, TRUE); /* Initialize instruction-specific portion of the context */ memset(&(c->inst), 0, sizeof(Inst)); diff --git a/target/hexagon/meson.build b/target/hexagon/meson.build index da8e608d0065defad0f18d437e57783286c313cd..436217f25a13b295f7ab0940c5cd7bbed0051efa 100644 --- a/target/hexagon/meson.build +++ b/target/hexagon/meson.build @@ -188,7 +188,7 @@ if idef_parser_enabled and 'hexagon-linux-user' in target_dirs arguments: ['@INPUT@', '--defines=@OUTPUT1@', '--output=@OUTPUT0@'] ) - glib_dep = dependency('glib-2.0', native: true) + glib_dep = dependency('glib-2.0', native: true, static: false) idef_parser = executable( 'idef-parser', diff --git a/target/hppa/cpu.h b/target/hppa/cpu.h index 8be45c69c99ab53e5da3d9d6be304e082a14e2e5..ba100c21a264d1041d2c05d38ed15313d411f422 100644 --- a/target/hppa/cpu.h +++ b/target/hppa/cpu.h @@ -188,7 +188,7 @@ typedef struct CPUArchState { target_ulong psw; /* All psw bits except the following: */ target_ulong psw_n; /* boolean */ - target_long psw_v; /* in most significant bit */ + target_long psw_v; /* in bit 31 */ /* Splitting the carry-borrow field into the MSB and "the rest", allows * for "the rest" to be deleted when it is unused, but the MSB is in use. diff --git a/target/hppa/helper.c b/target/hppa/helper.c index 859644c47af1d709772277950cb6cfcf94b6ad9c..9e35b65f2939d05713c13640ad41d85b9813a46e 100644 --- a/target/hppa/helper.c +++ b/target/hppa/helper.c @@ -53,7 +53,7 @@ target_ulong cpu_hppa_get_psw(CPUHPPAState *env) } psw |= env->psw_n * PSW_N; - psw |= (env->psw_v < 0) * PSW_V; + psw |= ((env->psw_v >> 31) & 1) * PSW_V; psw |= env->psw; return psw; diff --git a/target/i386/cpu.c b/target/i386/cpu.c index cd16cb893daf8d2a2be94a91f5ad47de787a6080..f79d0c9abf317fd363d521de58294c872f5749d6 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -29,6 +29,7 @@ #include "hvf/hvf-i386.h" #include "kvm/kvm_i386.h" #include "sev.h" +#include "csv.h" #include "qapi/error.h" #include "qemu/error-report.h" #include "qapi/qapi-visit-machine.h" @@ -905,9 +906,9 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { .type = CPUID_FEATURE_WORD, .feat_names = { "fsgsbase", "tsc-adjust", "sgx", "bmi1", - "hle", "avx2", NULL, "smep", + "hle", "avx2", "fdp-excptn-only", "smep", "bmi2", "erms", "invpcid", "rtm", - NULL, NULL, "mpx", NULL, + NULL, "zero-fcs-fds", "mpx", NULL, "avx512f", "avx512dq", "rdseed", "adx", "smap", "avx512ifma", "pcommit", "clflushopt", "clwb", "intel-pt", "avx512pf", "avx512er", @@ -961,13 +962,13 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { [FEAT_7_1_EAX] = { .type = CPUID_FEATURE_WORD, .feat_names = { - NULL, NULL, NULL, NULL, + "sha512", "sm3", "sm4", NULL, "avx-vnni", "avx512-bf16", NULL, "cmpccxadd", NULL, NULL, "fzrm", "fsrs", "fsrc", NULL, NULL, NULL, - NULL, NULL, NULL, NULL, + NULL, "fred", "lkgs", "wrmsrns", NULL, "amx-fp16", NULL, "avx-ifma", - NULL, NULL, NULL, NULL, + NULL, NULL, "lam", NULL, NULL, NULL, NULL, NULL, }, .cpuid = { @@ -999,8 +1000,8 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { [FEAT_7_2_EDX] = { .type = CPUID_FEATURE_WORD, .feat_names = { - NULL, NULL, NULL, NULL, - NULL, "mcdt-no", NULL, NULL, + "intel-psfd", "ipred-ctrl", "rrsba-ctrl", "ddpd-u", + "bhi-ctrl", "mcdt-no", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, NULL, @@ -1156,9 +1157,9 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { "taa-no", NULL, NULL, NULL, NULL, "sbdr-ssdp-no", "fbsdp-no", "psdp-no", NULL, "fb-clear", NULL, NULL, - NULL, NULL, NULL, NULL, - "pbrsb-no", NULL, "gds-no", NULL, - NULL, NULL, NULL, NULL, + "bhi-no", NULL, NULL, NULL, + "pbrsb-no", NULL, "gds-no", "rfds-no", + "rfds-clear", NULL, NULL, NULL, }, .msr = { .index = MSR_IA32_ARCH_CAPABILITIES, @@ -1270,7 +1271,7 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { "vmx-exit-save-efer", "vmx-exit-load-efer", "vmx-exit-save-preemption-timer", "vmx-exit-clear-bndcfgs", NULL, "vmx-exit-clear-rtit-ctl", NULL, NULL, - NULL, "vmx-exit-load-pkrs", NULL, NULL, + NULL, "vmx-exit-load-pkrs", NULL, "vmx-exit-secondary-ctls", }, .msr = { .index = MSR_IA32_VMX_TRUE_EXIT_CTLS, @@ -1285,7 +1286,7 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { NULL, "vmx-entry-ia32e-mode", NULL, NULL, NULL, "vmx-entry-load-perf-global-ctrl", "vmx-entry-load-pat", "vmx-entry-load-efer", "vmx-entry-load-bndcfgs", NULL, "vmx-entry-load-rtit-ctl", NULL, - NULL, NULL, "vmx-entry-load-pkrs", NULL, + NULL, NULL, "vmx-entry-load-pkrs", "vmx-entry-load-fred", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, }, @@ -1343,6 +1344,7 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { [54] = "vmx-ins-outs", [55] = "vmx-true-ctls", [56] = "vmx-any-errcode", + [58] = "vmx-nested-exception", }, .msr = { .index = MSR_IA32_VMX_BASIC, @@ -1549,8 +1551,20 @@ static FeatureDep feature_dependencies[] = { .to = { FEAT_SVM, ~0ull }, }, { - .from = { FEAT_VMX_SECONDARY_CTLS, VMX_SECONDARY_EXEC_ENABLE_USER_WAIT_PAUSE }, - .to = { FEAT_7_0_ECX, CPUID_7_0_ECX_WAITPKG }, + .from = { FEAT_7_0_ECX, CPUID_7_0_ECX_WAITPKG }, + .to = { FEAT_VMX_SECONDARY_CTLS, VMX_SECONDARY_EXEC_ENABLE_USER_WAIT_PAUSE }, + }, + { + .from = { FEAT_8000_0001_EDX, CPUID_EXT2_LM }, + .to = { FEAT_7_1_EAX, CPUID_7_1_EAX_FRED }, + }, + { + .from = { FEAT_7_1_EAX, CPUID_7_1_EAX_LKGS }, + .to = { FEAT_7_1_EAX, CPUID_7_1_EAX_FRED }, + }, + { + .from = { FEAT_7_1_EAX, CPUID_7_1_EAX_WRMSRNS }, + .to = { FEAT_7_1_EAX, CPUID_7_1_EAX_FRED }, }, }; @@ -1671,9 +1685,10 @@ static inline uint64_t x86_cpu_xsave_xss_components(X86CPU *cpu) * Returns the set of feature flags that are supported and migratable by * QEMU, for a given FeatureWord. 
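* For example, bits that have no name in feat_names, or that are listed in * a word's unmigratable_flags, are dropped from this set; callers that pass * cpu == NULL to x86_cpu_get_supported_feature_word() skip this filtering * entirely.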
*/ -static uint64_t x86_cpu_get_migratable_flags(FeatureWord w) +static uint64_t x86_cpu_get_migratable_flags(X86CPU *cpu, FeatureWord w) { FeatureWordInfo *wi = &feature_word_info[w]; + CPUX86State *env = &cpu->env; uint64_t r = 0; int i; @@ -1687,6 +1702,12 @@ static uint64_t x86_cpu_get_migratable_flags(FeatureWord w) r |= f; } } + + /* when tsc-khz is set explicitly, invtsc is migratable */ + if ((w == FEAT_8000_0007_EDX) && env->user_tsc_khz) { + r |= CPUID_APM_INVTSC; + } + return r; } @@ -2162,6 +2183,56 @@ static const CPUCaches epyc_genoa_cache_info = { }, }; +static const CPUCaches dharma_cache_info = { + .l1d_cache = &(CPUCacheInfo) { + .type = DATA_CACHE, + .level = 1, + .size = 32 * KiB, + .line_size = 64, + .associativity = 8, + .partitions = 1, + .sets = 64, + .lines_per_tag = 1, + .self_init = 1, + .no_invd_sharing = true, + }, + .l1i_cache = &(CPUCacheInfo) { + .type = INSTRUCTION_CACHE, + .level = 1, + .size = 32 * KiB, + .line_size = 64, + .associativity = 8, + .partitions = 1, + .sets = 64, + .lines_per_tag = 1, + .self_init = 1, + .no_invd_sharing = true, + }, + .l2_cache = &(CPUCacheInfo) { + .type = UNIFIED_CACHE, + .level = 2, + .size = 512 * KiB, + .line_size = 64, + .associativity = 8, + .partitions = 1, + .sets = 1024, + .lines_per_tag = 1, + }, + .l3_cache = &(CPUCacheInfo) { + .type = UNIFIED_CACHE, + .level = 3, + .size = 16 * MiB, + .line_size = 64, + .associativity = 16, + .partitions = 1, + .sets = 16384, + .lines_per_tag = 1, + .self_init = true, + .inclusive = true, + .complex_indexing = true, + }, +}; + /* The following VMX features are not supported by KVM and are left out in the * CPU definitions: * @@ -3402,6 +3473,7 @@ static const X86CPUDefinition builtin_x86_defs[] = { }, { .version = 4, + .note = "IBRS, EPT switching, no TSX", .props = (PropValue[]) { { "vmx-eptp-switching", "on" }, { /* end of list */ } @@ -3536,7 +3608,7 @@ static const X86CPUDefinition builtin_x86_defs[] = { }, }, { .version = 4, - .note = "ARCH_CAPABILITIES, no TSX", + .note = "ARCH_CAPABILITIES, EPT switching, no TSX", .props = (PropValue[]) { { "vmx-eptp-switching", "on" }, { /* end of list */ } @@ -3822,6 +3894,16 @@ static const X86CPUDefinition builtin_x86_defs[] = { { /* end of list */ } }, }, + { + .version = 7, + .note = "TSX, taa-no", + .props = (PropValue[]) { + /* Restore TSX features removed by -v2 above */ + { "hle", "on" }, + { "rtm", "on" }, + { /* end of list */ } + }, + }, { /* end of list */ } } }, @@ -3960,6 +4042,17 @@ static const X86CPUDefinition builtin_x86_defs[] = { { /* end of list */ } } }, + { + .version = 3, + .props = (PropValue[]) { + { "ss", "on" }, + { "tsc-adjust", "on" }, + { "cldemote", "on" }, + { "movdiri", "on" }, + { "movdir64b", "on" }, + { /* end of list */ } + } + }, { /* end of list */ } } }, @@ -4099,6 +4192,286 @@ static const X86CPUDefinition builtin_x86_defs[] = { { /* end of list */ }, }, }, + { + .name = "SierraForest", + .level = 0x23, + .vendor = CPUID_VENDOR_INTEL, + .family = 6, + .model = 175, + .stepping = 0, + /* + * please keep the ascending order so that we can have a clear view of + * bit position of each feature. 
+ */ + .features[FEAT_1_EDX] = + CPUID_FP87 | CPUID_VME | CPUID_DE | CPUID_PSE | CPUID_TSC | + CPUID_MSR | CPUID_PAE | CPUID_MCE | CPUID_CX8 | CPUID_APIC | + CPUID_SEP | CPUID_MTRR | CPUID_PGE | CPUID_MCA | CPUID_CMOV | + CPUID_PAT | CPUID_PSE36 | CPUID_CLFLUSH | CPUID_MMX | CPUID_FXSR | + CPUID_SSE | CPUID_SSE2, + .features[FEAT_1_ECX] = + CPUID_EXT_SSE3 | CPUID_EXT_PCLMULQDQ | CPUID_EXT_SSSE3 | + CPUID_EXT_FMA | CPUID_EXT_CX16 | CPUID_EXT_PCID | CPUID_EXT_SSE41 | + CPUID_EXT_SSE42 | CPUID_EXT_X2APIC | CPUID_EXT_MOVBE | + CPUID_EXT_POPCNT | CPUID_EXT_TSC_DEADLINE_TIMER | CPUID_EXT_AES | + CPUID_EXT_XSAVE | CPUID_EXT_AVX | CPUID_EXT_F16C | CPUID_EXT_RDRAND, + .features[FEAT_8000_0001_EDX] = + CPUID_EXT2_SYSCALL | CPUID_EXT2_NX | CPUID_EXT2_PDPE1GB | + CPUID_EXT2_RDTSCP | CPUID_EXT2_LM, + .features[FEAT_8000_0001_ECX] = + CPUID_EXT3_LAHF_LM | CPUID_EXT3_ABM | CPUID_EXT3_3DNOWPREFETCH, + .features[FEAT_8000_0008_EBX] = + CPUID_8000_0008_EBX_WBNOINVD, + .features[FEAT_7_0_EBX] = + CPUID_7_0_EBX_FSGSBASE | CPUID_7_0_EBX_BMI1 | CPUID_7_0_EBX_AVX2 | + CPUID_7_0_EBX_SMEP | CPUID_7_0_EBX_BMI2 | CPUID_7_0_EBX_ERMS | + CPUID_7_0_EBX_INVPCID | CPUID_7_0_EBX_RDSEED | CPUID_7_0_EBX_ADX | + CPUID_7_0_EBX_SMAP | CPUID_7_0_EBX_CLFLUSHOPT | CPUID_7_0_EBX_CLWB | + CPUID_7_0_EBX_SHA_NI, + .features[FEAT_7_0_ECX] = + CPUID_7_0_ECX_UMIP | CPUID_7_0_ECX_PKU | CPUID_7_0_ECX_GFNI | + CPUID_7_0_ECX_VAES | CPUID_7_0_ECX_VPCLMULQDQ | + CPUID_7_0_ECX_RDPID | CPUID_7_0_ECX_BUS_LOCK_DETECT, + .features[FEAT_7_0_EDX] = + CPUID_7_0_EDX_FSRM | CPUID_7_0_EDX_SERIALIZE | + CPUID_7_0_EDX_SPEC_CTRL | CPUID_7_0_EDX_ARCH_CAPABILITIES | + CPUID_7_0_EDX_SPEC_CTRL_SSBD, + .features[FEAT_ARCH_CAPABILITIES] = + MSR_ARCH_CAP_RDCL_NO | MSR_ARCH_CAP_IBRS_ALL | + MSR_ARCH_CAP_SKIP_L1DFL_VMENTRY | MSR_ARCH_CAP_MDS_NO | + MSR_ARCH_CAP_PSCHANGE_MC_NO | MSR_ARCH_CAP_SBDR_SSDP_NO | + MSR_ARCH_CAP_FBSDP_NO | MSR_ARCH_CAP_PSDP_NO | + MSR_ARCH_CAP_PBRSB_NO, + .features[FEAT_XSAVE] = + CPUID_XSAVE_XSAVEOPT | CPUID_XSAVE_XSAVEC | + CPUID_XSAVE_XGETBV1 | CPUID_XSAVE_XSAVES, + .features[FEAT_6_EAX] = + CPUID_6_EAX_ARAT, + .features[FEAT_7_1_EAX] = + CPUID_7_1_EAX_AVX_VNNI | CPUID_7_1_EAX_CMPCCXADD | + CPUID_7_1_EAX_FSRS | CPUID_7_1_EAX_AVX_IFMA, + .features[FEAT_7_1_EDX] = + CPUID_7_1_EDX_AVX_VNNI_INT8 | CPUID_7_1_EDX_AVX_NE_CONVERT, + .features[FEAT_7_2_EDX] = + CPUID_7_2_EDX_MCDT_NO, + .features[FEAT_VMX_BASIC] = + MSR_VMX_BASIC_INS_OUTS | MSR_VMX_BASIC_TRUE_CTLS, + .features[FEAT_VMX_ENTRY_CTLS] = + VMX_VM_ENTRY_LOAD_DEBUG_CONTROLS | VMX_VM_ENTRY_IA32E_MODE | + VMX_VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | + VMX_VM_ENTRY_LOAD_IA32_PAT | VMX_VM_ENTRY_LOAD_IA32_EFER, + .features[FEAT_VMX_EPT_VPID_CAPS] = + MSR_VMX_EPT_EXECONLY | MSR_VMX_EPT_PAGE_WALK_LENGTH_4 | + MSR_VMX_EPT_WB | MSR_VMX_EPT_2MB | MSR_VMX_EPT_1GB | + MSR_VMX_EPT_INVEPT | MSR_VMX_EPT_AD_BITS | + MSR_VMX_EPT_INVEPT_SINGLE_CONTEXT | MSR_VMX_EPT_INVEPT_ALL_CONTEXT | + MSR_VMX_EPT_INVVPID | MSR_VMX_EPT_INVVPID_SINGLE_ADDR | + MSR_VMX_EPT_INVVPID_SINGLE_CONTEXT | + MSR_VMX_EPT_INVVPID_ALL_CONTEXT | + MSR_VMX_EPT_INVVPID_SINGLE_CONTEXT_NOGLOBALS, + .features[FEAT_VMX_EXIT_CTLS] = + VMX_VM_EXIT_SAVE_DEBUG_CONTROLS | + VMX_VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | + VMX_VM_EXIT_ACK_INTR_ON_EXIT | VMX_VM_EXIT_SAVE_IA32_PAT | + VMX_VM_EXIT_LOAD_IA32_PAT | VMX_VM_EXIT_SAVE_IA32_EFER | + VMX_VM_EXIT_LOAD_IA32_EFER | VMX_VM_EXIT_SAVE_VMX_PREEMPTION_TIMER, + .features[FEAT_VMX_MISC] = + MSR_VMX_MISC_STORE_LMA | MSR_VMX_MISC_ACTIVITY_HLT | + MSR_VMX_MISC_VMWRITE_VMEXIT, + .features[FEAT_VMX_PINBASED_CTLS] 
= + VMX_PIN_BASED_EXT_INTR_MASK | VMX_PIN_BASED_NMI_EXITING | + VMX_PIN_BASED_VIRTUAL_NMIS | VMX_PIN_BASED_VMX_PREEMPTION_TIMER | + VMX_PIN_BASED_POSTED_INTR, + .features[FEAT_VMX_PROCBASED_CTLS] = + VMX_CPU_BASED_VIRTUAL_INTR_PENDING | + VMX_CPU_BASED_USE_TSC_OFFSETING | VMX_CPU_BASED_HLT_EXITING | + VMX_CPU_BASED_INVLPG_EXITING | VMX_CPU_BASED_MWAIT_EXITING | + VMX_CPU_BASED_RDPMC_EXITING | VMX_CPU_BASED_RDTSC_EXITING | + VMX_CPU_BASED_CR3_LOAD_EXITING | VMX_CPU_BASED_CR3_STORE_EXITING | + VMX_CPU_BASED_CR8_LOAD_EXITING | VMX_CPU_BASED_CR8_STORE_EXITING | + VMX_CPU_BASED_TPR_SHADOW | VMX_CPU_BASED_VIRTUAL_NMI_PENDING | + VMX_CPU_BASED_MOV_DR_EXITING | VMX_CPU_BASED_UNCOND_IO_EXITING | + VMX_CPU_BASED_USE_IO_BITMAPS | VMX_CPU_BASED_MONITOR_TRAP_FLAG | + VMX_CPU_BASED_USE_MSR_BITMAPS | VMX_CPU_BASED_MONITOR_EXITING | + VMX_CPU_BASED_PAUSE_EXITING | + VMX_CPU_BASED_ACTIVATE_SECONDARY_CONTROLS, + .features[FEAT_VMX_SECONDARY_CTLS] = + VMX_SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + VMX_SECONDARY_EXEC_ENABLE_EPT | VMX_SECONDARY_EXEC_DESC | + VMX_SECONDARY_EXEC_RDTSCP | + VMX_SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | + VMX_SECONDARY_EXEC_ENABLE_VPID | VMX_SECONDARY_EXEC_WBINVD_EXITING | + VMX_SECONDARY_EXEC_UNRESTRICTED_GUEST | + VMX_SECONDARY_EXEC_APIC_REGISTER_VIRT | + VMX_SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | + VMX_SECONDARY_EXEC_RDRAND_EXITING | + VMX_SECONDARY_EXEC_ENABLE_INVPCID | + VMX_SECONDARY_EXEC_ENABLE_VMFUNC | VMX_SECONDARY_EXEC_SHADOW_VMCS | + VMX_SECONDARY_EXEC_RDSEED_EXITING | VMX_SECONDARY_EXEC_ENABLE_PML | + VMX_SECONDARY_EXEC_XSAVES, + .features[FEAT_VMX_VMFUNC] = + MSR_VMX_VMFUNC_EPT_SWITCHING, + .xlevel = 0x80000008, + .model_id = "Intel Xeon Processor (SierraForest)", + .versions = (X86CPUVersionDefinition[]) { + { .version = 1 }, + { + .version = 2, + .props = (PropValue[]) { + { "ss", "on" }, + { "tsc-adjust", "on" }, + { "cldemote", "on" }, + { "movdiri", "on" }, + { "movdir64b", "on" }, + { "gds-no", "on" }, + { "rfds-no", "on" }, + { "lam", "on" }, + { "intel-psfd", "on"}, + { "ipred-ctrl", "on"}, + { "rrsba-ctrl", "on"}, + { "bhi-ctrl", "on"}, + { "stepping", "3" }, + { /* end of list */ } + } + }, + { /* end of list */ }, + }, + }, + { + .name = "ClearwaterForest", + .level = 0x23, + .xlevel = 0x80000008, + .vendor = CPUID_VENDOR_INTEL, + .family = 6, + .model = 221, + .stepping = 0, + /* + * please keep the ascending order so that we can have a clear view of + * bit position of each feature. 
+ */ + .features[FEAT_1_EDX] = + CPUID_FP87 | CPUID_VME | CPUID_DE | CPUID_PSE | CPUID_TSC | + CPUID_MSR | CPUID_PAE | CPUID_MCE | CPUID_CX8 | CPUID_APIC | + CPUID_SEP | CPUID_MTRR | CPUID_PGE | CPUID_MCA | CPUID_CMOV | + CPUID_PAT | CPUID_PSE36 | CPUID_CLFLUSH | CPUID_MMX | CPUID_FXSR | + CPUID_SSE | CPUID_SSE2 | CPUID_SS, + .features[FEAT_1_ECX] = + CPUID_EXT_SSE3 | CPUID_EXT_PCLMULQDQ | CPUID_EXT_SSSE3 | + CPUID_EXT_FMA | CPUID_EXT_CX16 | CPUID_EXT_PCID | CPUID_EXT_SSE41 | + CPUID_EXT_SSE42 | CPUID_EXT_X2APIC | CPUID_EXT_MOVBE | + CPUID_EXT_POPCNT | CPUID_EXT_TSC_DEADLINE_TIMER | CPUID_EXT_AES | + CPUID_EXT_XSAVE | CPUID_EXT_AVX | CPUID_EXT_F16C | CPUID_EXT_RDRAND, + .features[FEAT_8000_0001_EDX] = + CPUID_EXT2_SYSCALL | CPUID_EXT2_NX | CPUID_EXT2_PDPE1GB | + CPUID_EXT2_RDTSCP | CPUID_EXT2_LM, + .features[FEAT_8000_0001_ECX] = + CPUID_EXT3_LAHF_LM | CPUID_EXT3_ABM | CPUID_EXT3_3DNOWPREFETCH, + .features[FEAT_8000_0008_EBX] = + CPUID_8000_0008_EBX_WBNOINVD, + .features[FEAT_7_0_EBX] = + CPUID_7_0_EBX_FSGSBASE | CPUID_7_0_EBX_TSC_ADJUST | + CPUID_7_0_EBX_BMI1 | CPUID_7_0_EBX_AVX2 | CPUID_7_0_EBX_SMEP | + CPUID_7_0_EBX_BMI2 | CPUID_7_0_EBX_ERMS | CPUID_7_0_EBX_INVPCID | + CPUID_7_0_EBX_RDSEED | CPUID_7_0_EBX_ADX | CPUID_7_0_EBX_SMAP | + CPUID_7_0_EBX_CLFLUSHOPT | CPUID_7_0_EBX_CLWB | + CPUID_7_0_EBX_SHA_NI, + .features[FEAT_7_0_ECX] = + CPUID_7_0_ECX_UMIP | CPUID_7_0_ECX_PKU | CPUID_7_0_ECX_GFNI | + CPUID_7_0_ECX_VAES | CPUID_7_0_ECX_VPCLMULQDQ | + CPUID_7_0_ECX_RDPID | CPUID_7_0_ECX_BUS_LOCK_DETECT | + CPUID_7_0_ECX_CLDEMOTE | CPUID_7_0_ECX_MOVDIRI | + CPUID_7_0_ECX_MOVDIR64B, + .features[FEAT_7_0_EDX] = + CPUID_7_0_EDX_FSRM | CPUID_7_0_EDX_SERIALIZE | + CPUID_7_0_EDX_SPEC_CTRL | CPUID_7_0_EDX_ARCH_CAPABILITIES | + CPUID_7_0_EDX_SPEC_CTRL_SSBD, + .features[FEAT_ARCH_CAPABILITIES] = + MSR_ARCH_CAP_RDCL_NO | MSR_ARCH_CAP_IBRS_ALL | + MSR_ARCH_CAP_SKIP_L1DFL_VMENTRY | MSR_ARCH_CAP_MDS_NO | + MSR_ARCH_CAP_PSCHANGE_MC_NO | MSR_ARCH_CAP_SBDR_SSDP_NO | + MSR_ARCH_CAP_FBSDP_NO | MSR_ARCH_CAP_PSDP_NO | + MSR_ARCH_CAP_BHI_NO | MSR_ARCH_CAP_PBRSB_NO | + MSR_ARCH_CAP_GDS_NO | MSR_ARCH_CAP_RFDS_NO, + .features[FEAT_XSAVE] = + CPUID_XSAVE_XSAVEOPT | CPUID_XSAVE_XSAVEC | + CPUID_XSAVE_XGETBV1 | CPUID_XSAVE_XSAVES, + .features[FEAT_6_EAX] = + CPUID_6_EAX_ARAT, + .features[FEAT_7_1_EAX] = + CPUID_7_1_EAX_SHA512 | CPUID_7_1_EAX_SM3 | CPUID_7_1_EAX_SM4 | + CPUID_7_1_EAX_AVX_VNNI | CPUID_7_1_EAX_CMPCCXADD | + CPUID_7_1_EAX_FSRS | CPUID_7_1_EAX_AVX_IFMA | + CPUID_7_1_EAX_LAM, + .features[FEAT_7_1_EDX] = + CPUID_7_1_EDX_AVX_VNNI_INT8 | CPUID_7_1_EDX_AVX_NE_CONVERT | + CPUID_7_1_EDX_AVX_VNNI_INT16 | CPUID_7_1_EDX_PREFETCHITI, + .features[FEAT_7_2_EDX] = + CPUID_7_2_EDX_PSFD | CPUID_7_2_EDX_IPRED_CTRL | + CPUID_7_2_EDX_RRSBA_CTRL | CPUID_7_2_EDX_DDPD_U | + CPUID_7_2_EDX_BHI_CTRL | CPUID_7_2_EDX_MCDT_NO, + .features[FEAT_VMX_BASIC] = + MSR_VMX_BASIC_INS_OUTS | MSR_VMX_BASIC_TRUE_CTLS, + .features[FEAT_VMX_ENTRY_CTLS] = + VMX_VM_ENTRY_LOAD_DEBUG_CONTROLS | VMX_VM_ENTRY_IA32E_MODE | + VMX_VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | + VMX_VM_ENTRY_LOAD_IA32_PAT | VMX_VM_ENTRY_LOAD_IA32_EFER, + .features[FEAT_VMX_EPT_VPID_CAPS] = + MSR_VMX_EPT_EXECONLY | MSR_VMX_EPT_PAGE_WALK_LENGTH_4 | + MSR_VMX_EPT_WB | MSR_VMX_EPT_2MB | MSR_VMX_EPT_1GB | + MSR_VMX_EPT_INVEPT | MSR_VMX_EPT_AD_BITS | + MSR_VMX_EPT_INVEPT_SINGLE_CONTEXT | MSR_VMX_EPT_INVEPT_ALL_CONTEXT | + MSR_VMX_EPT_INVVPID | MSR_VMX_EPT_INVVPID_SINGLE_ADDR | + MSR_VMX_EPT_INVVPID_SINGLE_CONTEXT | + MSR_VMX_EPT_INVVPID_ALL_CONTEXT | + 
MSR_VMX_EPT_INVVPID_SINGLE_CONTEXT_NOGLOBALS, + .features[FEAT_VMX_EXIT_CTLS] = + VMX_VM_EXIT_SAVE_DEBUG_CONTROLS | + VMX_VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | + VMX_VM_EXIT_ACK_INTR_ON_EXIT | VMX_VM_EXIT_SAVE_IA32_PAT | + VMX_VM_EXIT_LOAD_IA32_PAT | VMX_VM_EXIT_SAVE_IA32_EFER | + VMX_VM_EXIT_LOAD_IA32_EFER | VMX_VM_EXIT_SAVE_VMX_PREEMPTION_TIMER, + .features[FEAT_VMX_MISC] = + MSR_VMX_MISC_STORE_LMA | MSR_VMX_MISC_ACTIVITY_HLT | + MSR_VMX_MISC_VMWRITE_VMEXIT, + .features[FEAT_VMX_PINBASED_CTLS] = + VMX_PIN_BASED_EXT_INTR_MASK | VMX_PIN_BASED_NMI_EXITING | + VMX_PIN_BASED_VIRTUAL_NMIS | VMX_PIN_BASED_VMX_PREEMPTION_TIMER | + VMX_PIN_BASED_POSTED_INTR, + .features[FEAT_VMX_PROCBASED_CTLS] = + VMX_CPU_BASED_VIRTUAL_INTR_PENDING | + VMX_CPU_BASED_USE_TSC_OFFSETING | VMX_CPU_BASED_HLT_EXITING | + VMX_CPU_BASED_INVLPG_EXITING | VMX_CPU_BASED_MWAIT_EXITING | + VMX_CPU_BASED_RDPMC_EXITING | VMX_CPU_BASED_RDTSC_EXITING | + VMX_CPU_BASED_CR3_LOAD_EXITING | VMX_CPU_BASED_CR3_STORE_EXITING | + VMX_CPU_BASED_CR8_LOAD_EXITING | VMX_CPU_BASED_CR8_STORE_EXITING | + VMX_CPU_BASED_TPR_SHADOW | VMX_CPU_BASED_VIRTUAL_NMI_PENDING | + VMX_CPU_BASED_MOV_DR_EXITING | VMX_CPU_BASED_UNCOND_IO_EXITING | + VMX_CPU_BASED_USE_IO_BITMAPS | VMX_CPU_BASED_MONITOR_TRAP_FLAG | + VMX_CPU_BASED_USE_MSR_BITMAPS | VMX_CPU_BASED_MONITOR_EXITING | + VMX_CPU_BASED_PAUSE_EXITING | + VMX_CPU_BASED_ACTIVATE_SECONDARY_CONTROLS, + .features[FEAT_VMX_SECONDARY_CTLS] = + VMX_SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + VMX_SECONDARY_EXEC_ENABLE_EPT | VMX_SECONDARY_EXEC_DESC | + VMX_SECONDARY_EXEC_RDTSCP | + VMX_SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | + VMX_SECONDARY_EXEC_ENABLE_VPID | VMX_SECONDARY_EXEC_WBINVD_EXITING | + VMX_SECONDARY_EXEC_UNRESTRICTED_GUEST | + VMX_SECONDARY_EXEC_APIC_REGISTER_VIRT | + VMX_SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | + VMX_SECONDARY_EXEC_RDRAND_EXITING | + VMX_SECONDARY_EXEC_ENABLE_INVPCID | + VMX_SECONDARY_EXEC_ENABLE_VMFUNC | VMX_SECONDARY_EXEC_SHADOW_VMCS | + VMX_SECONDARY_EXEC_RDSEED_EXITING | VMX_SECONDARY_EXEC_ENABLE_PML | + VMX_SECONDARY_EXEC_XSAVES, + .features[FEAT_VMX_VMFUNC] = + MSR_VMX_VMFUNC_EPT_SWITCHING, + .model_id = "Intel Xeon Processor (ClearwaterForest)", + .versions = (X86CPUVersionDefinition[]) { + { .version = 1 }, + { /* end of list */ }, + }, + }, { .name = "Denverton", .level = 21, @@ -4657,6 +5030,20 @@ static const X86CPUDefinition builtin_x86_defs[] = { { /* end of list */ } }, }, + { .version = 3, + .props = (PropValue[]) { + { "xsaves", "off" }, + { "perfctr-core", "on" }, + { "clzero", "on" }, + { "xsaveerptr", "on" }, + { "aes", "on" }, + { "pclmulqdq", "on" }, + { "sha-ni", "on" }, + { "model-id", + "Hygon Dhyana-v3 processor" }, + { /* end of list */ } + }, + }, { /* end of list */ } } }, @@ -4852,7 +5239,7 @@ static const X86CPUDefinition builtin_x86_defs[] = { CPUID_8000_0008_EBX_STIBP_ALWAYS_ON | CPUID_8000_0008_EBX_AMD_SSBD | CPUID_8000_0008_EBX_AMD_PSFD, .features[FEAT_8000_0021_EAX] = - CPUID_8000_0021_EAX_No_NESTED_DATA_BP | + CPUID_8000_0021_EAX_NO_NESTED_DATA_BP | CPUID_8000_0021_EAX_LFENCE_ALWAYS_SERIALIZING | CPUID_8000_0021_EAX_NULL_SEL_CLR_BASE | CPUID_8000_0021_EAX_AUTO_IBRS, @@ -4888,6 +5275,55 @@ static const X86CPUDefinition builtin_x86_defs[] = { .model_id = "AMD EPYC-Genoa Processor", .cache_info = &epyc_genoa_cache_info, }, + { + .name = "Dharma", + .level = 0xd, + .vendor = CPUID_VENDOR_HYGON, + .family = 24, + .model = 4, + .stepping = 0, + .features[FEAT_1_EDX] = + CPUID_SSE2 | CPUID_SSE | CPUID_FXSR | CPUID_MMX | CPUID_CLFLUSH | + CPUID_PSE36 | 
CPUID_PAT | CPUID_CMOV | CPUID_MCA | CPUID_PGE | + CPUID_MTRR | CPUID_SEP | CPUID_APIC | CPUID_CX8 | CPUID_MCE | + CPUID_PAE | CPUID_MSR | CPUID_TSC | CPUID_PSE | CPUID_DE | + CPUID_VME | CPUID_FP87, + .features[FEAT_1_ECX] = + CPUID_EXT_RDRAND | CPUID_EXT_F16C | CPUID_EXT_AVX | + CPUID_EXT_XSAVE | CPUID_EXT_AES | CPUID_EXT_POPCNT | + CPUID_EXT_MOVBE | CPUID_EXT_SSE42 | CPUID_EXT_SSE41 | + CPUID_EXT_CX16 | CPUID_EXT_FMA | CPUID_EXT_SSSE3 | + CPUID_EXT_MONITOR | CPUID_EXT_PCLMULQDQ | CPUID_EXT_SSE3, + .features[FEAT_8000_0001_EDX] = + CPUID_EXT2_LM | CPUID_EXT2_RDTSCP | CPUID_EXT2_PDPE1GB | + CPUID_EXT2_FFXSR | CPUID_EXT2_MMXEXT | CPUID_EXT2_NX | + CPUID_EXT2_SYSCALL, + .features[FEAT_8000_0001_ECX] = + CPUID_EXT3_OSVW | CPUID_EXT3_3DNOWPREFETCH | + CPUID_EXT3_MISALIGNSSE | CPUID_EXT3_SSE4A | CPUID_EXT3_ABM | + CPUID_EXT3_CR8LEG | CPUID_EXT3_SVM | CPUID_EXT3_LAHF_LM | + CPUID_EXT3_TOPOEXT | CPUID_EXT3_PERFCORE, + .features[FEAT_8000_0008_EBX] = + CPUID_8000_0008_EBX_CLZERO | CPUID_8000_0008_EBX_XSAVEERPTR | + CPUID_8000_0008_EBX_IBPB | CPUID_8000_0008_EBX_IBRS | + CPUID_8000_0008_EBX_STIBP | CPUID_8000_0008_EBX_AMD_SSBD, + .features[FEAT_7_0_EBX] = + CPUID_7_0_EBX_FSGSBASE | CPUID_7_0_EBX_BMI1 | CPUID_7_0_EBX_AVX2 | + CPUID_7_0_EBX_SMEP | CPUID_7_0_EBX_BMI2 | CPUID_7_0_EBX_RDSEED | + CPUID_7_0_EBX_ADX | CPUID_7_0_EBX_SMAP | CPUID_7_0_EBX_CLFLUSHOPT | + CPUID_7_0_EBX_SHA_NI, + .features[FEAT_7_0_ECX] = CPUID_7_0_ECX_UMIP, + .features[FEAT_XSAVE] = + CPUID_XSAVE_XSAVEOPT | CPUID_XSAVE_XSAVEC | + CPUID_XSAVE_XGETBV1, + .features[FEAT_6_EAX] = + CPUID_6_EAX_ARAT, + .features[FEAT_SVM] = + CPUID_SVM_NPT | CPUID_SVM_NRIPSAVE, + .xlevel = 0x8000001E, + .model_id = "Hygon Dharma Processor", + .cache_info = &dharma_cache_info, + }, }; /* @@ -5475,6 +5911,7 @@ static void x86_cpu_get_unavailable_features(Object *obj, Visitor *v, x86_cpu_list_feature_names(xc->filtered_features, &result); visit_type_strList(v, "unavailable-features", &result, errp); + qapi_free_strList(result); } /* Print all cpuid feature names in featureset @@ -5683,8 +6120,7 @@ CpuDefinitionInfoList *qmp_query_cpu_definitions(Error **errp) #endif /* !CONFIG_USER_ONLY */ -uint64_t x86_cpu_get_supported_feature_word(FeatureWord w, - bool migratable_only) +uint64_t x86_cpu_get_supported_feature_word(X86CPU *cpu, FeatureWord w) { FeatureWordInfo *wi = &feature_word_info[w]; uint64_t r = 0; @@ -5726,8 +6162,8 @@ uint64_t x86_cpu_get_supported_feature_word(FeatureWord w, r &= ~unavail; } #endif - if (migratable_only) { - r &= x86_cpu_get_migratable_flags(w); + if (cpu && cpu->migratable) { + r &= x86_cpu_get_migratable_flags(cpu, w); } return r; } @@ -6566,6 +7002,7 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count, if (env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_LM) { /* 64 bit processor */ *eax |= (cpu_x86_virtual_addr_width(env) << 8); + *eax |= (cpu->guest_phys_bits << 16); } *ebx = env->features[FEAT_8000_0008_EBX]; if (cs->nr_cores * cs->nr_threads > 1) { @@ -6596,9 +7033,31 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count, } break; case 0x8000001D: + /* Populate AMD Processor Cache Information */ *eax = 0; if (cpu->cache_info_passthrough) { x86_cpu_get_cache_cpuid(index, count, eax, ebx, ecx, edx); + + /* + * Clear BITs[25:14] and then update them based on the guest + * vCPU topology, like what we do in encode_cache_cpuid8000001d + * when cache_info_passthrough is not enabled. 
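+ * + * Worked example: with threads_per_core = 2 and cores_per_die = 4, + * EAX[25:14] becomes 2 - 1 = 1 for the L1/L2 leaves and + * 2 * 4 - 1 = 7 for the L3 leaf.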
+ */ + *eax &= ~0x03FFC000; + switch (count) { + case 0: /* L1 dcache info */ + case 1: /* L1 icache info */ + case 2: /* L2 cache info */ + *eax |= ((topo_info.threads_per_core - 1) << 14); + break; + case 3: /* L3 cache info */ + *eax |= ((topo_info.cores_per_die * + topo_info.threads_per_core - 1) << 14); + break; + default: /* end of info */ + *eax = *ebx = *ecx = *edx = 0; + break; + } break; } switch (count) { @@ -6660,6 +7119,7 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count, if (sev_enabled()) { *eax = 0x2; *eax |= sev_es_enabled() ? 0x8 : 0; + *eax |= csv3_enabled() ? 0x40000000 : 0; /* bit 30 for CSV3 */ *ebx = sev_get_cbit_position() & 0x3f; /* EBX[5:0] */ *ebx |= (sev_get_reduced_phys_bits() & 0x3f) << 6; /* EBX[11:6] */ } @@ -6927,6 +7387,8 @@ static void x86_cpu_enable_xsave_components(X86CPU *cpu) if (!(env->features[FEAT_1_ECX] & CPUID_EXT_XSAVE)) { env->features[FEAT_XSAVE_XCR0_LO] = 0; env->features[FEAT_XSAVE_XCR0_HI] = 0; + env->features[FEAT_XSAVE_XSS_LO] = 0; + env->features[FEAT_XSAVE_XSS_HI] = 0; return; } @@ -6945,9 +7407,9 @@ static void x86_cpu_enable_xsave_components(X86CPU *cpu) } env->features[FEAT_XSAVE_XCR0_LO] = mask & CPUID_XSTATE_XCR0_MASK; - env->features[FEAT_XSAVE_XCR0_HI] = mask >> 32; + env->features[FEAT_XSAVE_XCR0_HI] = (mask & CPUID_XSTATE_XCR0_MASK) >> 32; env->features[FEAT_XSAVE_XSS_LO] = mask & CPUID_XSTATE_XSS_MASK; - env->features[FEAT_XSAVE_XSS_HI] = mask >> 32; + env->features[FEAT_XSAVE_XSS_HI] = (mask & CPUID_XSTATE_XSS_MASK) >> 32; } /***** Steps involved on loading and filtering CPUID data @@ -7022,7 +7484,7 @@ void x86_cpu_expand_features(X86CPU *cpu, Error **errp) * by the user. */ env->features[w] |= - x86_cpu_get_supported_feature_word(w, cpu->migratable) & + x86_cpu_get_supported_feature_word(cpu, w) & ~env->user_features[w] & ~feature_word_info[w].no_autoenable_flags; } @@ -7148,7 +7610,7 @@ static void x86_cpu_filter_features(X86CPU *cpu, bool verbose) for (w = 0; w < FEATURE_WORDS; w++) { uint64_t host_feat = - x86_cpu_get_supported_feature_word(w, false); + x86_cpu_get_supported_feature_word(NULL, w); uint64_t requested_features = env->features[w]; uint64_t unavailable_features = requested_features & ~host_feat; mark_unavailable_features(cpu, w, unavailable_features, prefix); @@ -7264,7 +7726,7 @@ static void x86_cpu_realizefn(DeviceState *dev, Error **errp) env->features[FEAT_PERF_CAPABILITIES] & PERF_CAP_LBR_FMT; if (requested_lbr_fmt && kvm_enabled()) { uint64_t host_perf_cap = - x86_cpu_get_supported_feature_word(FEAT_PERF_CAPABILITIES, false); + x86_cpu_get_supported_feature_word(NULL, FEAT_PERF_CAPABILITIES); unsigned host_lbr_fmt = host_perf_cap & PERF_CAP_LBR_FMT; if (!cpu->enable_pmu) { @@ -7318,6 +7780,14 @@ static void x86_cpu_realizefn(DeviceState *dev, Error **errp) goto out; } + if (cpu->guest_phys_bits == -1) { + /* + * If it was not set by the user, or by the accelerator via + * cpu_exec_realizefn, clear. + */ + cpu->guest_phys_bits = 0; + } + if (cpu->ucode_rev == 0) { /* * The default is the same as KVM's. 
Note that this check @@ -7368,6 +7838,14 @@ static void x86_cpu_realizefn(DeviceState *dev, Error **errp) if (cpu->phys_bits == 0) { cpu->phys_bits = TCG_PHYS_ADDR_BITS; } + if (cpu->guest_phys_bits && + (cpu->guest_phys_bits > cpu->phys_bits || + cpu->guest_phys_bits < 32)) { + error_setg(errp, "guest-phys-bits should be between 32 and %u " + " (but is %u)", + cpu->phys_bits, cpu->guest_phys_bits); + return; + } } else { /* For 32 bit systems don't use the user set value, but keep * phys_bits consistent with what we tell the guest. @@ -7376,6 +7854,10 @@ static void x86_cpu_realizefn(DeviceState *dev, Error **errp) error_setg(errp, "phys-bits is not user-configurable in 32 bit"); return; } + if (cpu->guest_phys_bits != 0) { + error_setg(errp, "guest-phys-bits is not user-configurable in 32 bit"); + return; + } if (env->features[FEAT_1_EDX] & (CPUID_PSE36 | CPUID_PAE)) { cpu->phys_bits = 36; @@ -7664,6 +8146,24 @@ static void x86_cpu_set_pc(CPUState *cs, vaddr value) cpu->env.eip = value; } + +/* At present, we check the vm is *LARGE* or not, i.e. whether + * the memory size is more than 4T or not. + */ +const uint64_t large_vm_mem_size = 0x40000000000UL; +void x86_cpu_adjuest_by_ram_size(ram_addr_t ram_size, X86CPU *cpu) +{ + /* If there is not a large vm, we set the phys_bits to 42 bits, + * otherwise, we increase the phys_bits to 46 bits. + */ + if (ram_size < large_vm_mem_size) { + cpu->phys_bits = DEFAULT_VM_CPU_PHYS_BITS; + } else { + cpu->phys_bits = LARGE_VM_CPU_PHYS_BITS; + cpu->fill_mtrr_mask = true; + } +} + static vaddr x86_cpu_get_pc(CPUState *cs) { X86CPU *cpu = X86_CPU(cs); @@ -7864,9 +8364,10 @@ static Property x86_cpu_properties[] = { DEFINE_PROP_BOOL("x-force-features", X86CPU, force_features, false), DEFINE_PROP_BOOL("kvm", X86CPU, expose_kvm, true), DEFINE_PROP_UINT32("phys-bits", X86CPU, phys_bits, 0), + DEFINE_PROP_UINT32("guest-phys-bits", X86CPU, guest_phys_bits, -1), DEFINE_PROP_BOOL("host-phys-bits", X86CPU, host_phys_bits, false), DEFINE_PROP_UINT8("host-phys-bits-limit", X86CPU, host_phys_bits_limit, 0), - DEFINE_PROP_BOOL("fill-mtrr-mask", X86CPU, fill_mtrr_mask, true), + DEFINE_PROP_BOOL("fill-mtrr-mask", X86CPU, fill_mtrr_mask, false), DEFINE_PROP_UINT32("level-func7", X86CPU, env.cpuid_level_func7, UINT32_MAX), DEFINE_PROP_UINT32("level", X86CPU, env.cpuid_level, UINT32_MAX), diff --git a/target/i386/cpu.h b/target/i386/cpu.h index ef987f344cff2409b7746b12f654ed7e3fb560dd..4424e58d1ba24fd9578623c92ca169e48e73e22b 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -24,6 +24,7 @@ #include "cpu-qom.h" #include "kvm/hyperv-proto.h" #include "exec/cpu-defs.h" +#include "exec/cpu-common.h" #include "qapi/qapi-types-common.h" #include "qemu/cpu-float.h" #include "qemu/timer.h" @@ -261,6 +262,13 @@ typedef enum X86Seg { #define CR4_SMAP_MASK (1U << 21) #define CR4_PKE_MASK (1U << 22) #define CR4_PKS_MASK (1U << 24) +#define CR4_LAM_SUP_MASK (1U << 28) + +#ifdef TARGET_X86_64 +#define CR4_FRED_MASK (1ULL << 32) +#else +#define CR4_FRED_MASK 0 +#endif #define CR4_RESERVED_MASK \ (~(target_ulong)(CR4_VME_MASK | CR4_PVI_MASK | CR4_TSD_MASK \ @@ -269,7 +277,8 @@ typedef enum X86Seg { | CR4_OSFXSR_MASK | CR4_OSXMMEXCPT_MASK | CR4_UMIP_MASK \ | CR4_LA57_MASK \ | CR4_FSGSBASE_MASK | CR4_PCIDE_MASK | CR4_OSXSAVE_MASK \ - | CR4_SMEP_MASK | CR4_SMAP_MASK | CR4_PKE_MASK | CR4_PKS_MASK)) + | CR4_SMEP_MASK | CR4_SMAP_MASK | CR4_PKE_MASK | CR4_PKS_MASK \ + | CR4_LAM_SUP_MASK | CR4_FRED_MASK)) #define DR6_BD (1 << 13) #define DR6_BS (1 << 14) @@ -519,9 +528,22 @@ typedef 
enum X86Seg { #define MSR_VM_HSAVE_PA 0xc0010117 +#define MSR_AMD64_SEV_ES_GHCB 0xc0010130 + #define MSR_IA32_XFD 0x000001c4 #define MSR_IA32_XFD_ERR 0x000001c5 +/* FRED MSRs */ +#define MSR_IA32_FRED_RSP0 0x000001cc /* Stack level 0 regular stack pointer */ +#define MSR_IA32_FRED_RSP1 0x000001cd /* Stack level 1 regular stack pointer */ +#define MSR_IA32_FRED_RSP2 0x000001ce /* Stack level 2 regular stack pointer */ +#define MSR_IA32_FRED_RSP3 0x000001cf /* Stack level 3 regular stack pointer */ +#define MSR_IA32_FRED_STKLVLS 0x000001d0 /* FRED exception stack levels */ +#define MSR_IA32_FRED_SSP1 0x000001d1 /* Stack level 1 shadow stack pointer in ring 0 */ +#define MSR_IA32_FRED_SSP2 0x000001d2 /* Stack level 2 shadow stack pointer in ring 0 */ +#define MSR_IA32_FRED_SSP3 0x000001d3 /* Stack level 3 shadow stack pointer in ring 0 */ +#define MSR_IA32_FRED_CONFIG 0x000001d4 /* FRED Entrypoint and interrupt stack level */ + #define MSR_IA32_BNDCFGS 0x00000d90 #define MSR_IA32_XSS 0x00000da0 #define MSR_IA32_UMWAIT_CONTROL 0xe1 @@ -633,8 +655,7 @@ typedef enum FeatureWord { } FeatureWord; typedef uint64_t FeatureWordArray[FEATURE_WORDS]; -uint64_t x86_cpu_get_supported_feature_word(FeatureWord w, - bool migratable_only); +uint64_t x86_cpu_get_supported_feature_word(X86CPU *cpu, FeatureWord w); /* cpuid_features bits */ #define CPUID_FP87 (1U << 0) @@ -780,6 +801,8 @@ uint64_t x86_cpu_get_supported_feature_word(FeatureWord w, /* Support RDFSBASE/RDGSBASE/WRFSBASE/WRGSBASE */ #define CPUID_7_0_EBX_FSGSBASE (1U << 0) +/* Support TSC adjust MSR */ +#define CPUID_7_0_EBX_TSC_ADJUST (1U << 1) /* Support SGX */ #define CPUID_7_0_EBX_SGX (1U << 2) /* 1st Group of Advanced Bit Manipulation Extensions */ @@ -788,6 +811,8 @@ uint64_t x86_cpu_get_supported_feature_word(FeatureWord w, #define CPUID_7_0_EBX_HLE (1U << 4) /* Intel Advanced Vector Extensions 2 */ #define CPUID_7_0_EBX_AVX2 (1U << 5) +/* FPU data pointer updated only on x87 exceptions */ +#define CPUID_7_0_EBX_FDP_EXCPTN_ONLY (1u << 6) /* Supervisor-mode Execution Prevention */ #define CPUID_7_0_EBX_SMEP (1U << 7) /* 2nd Group of Advanced Bit Manipulation Extensions */ @@ -798,6 +823,8 @@ uint64_t x86_cpu_get_supported_feature_word(FeatureWord w, #define CPUID_7_0_EBX_INVPCID (1U << 10) /* Restricted Transactional Memory */ #define CPUID_7_0_EBX_RTM (1U << 11) +/* Zero out FPU CS and FPU DS */ +#define CPUID_7_0_EBX_ZERO_FCS_FDS (1U << 13) /* Memory Protection Extension */ #define CPUID_7_0_EBX_MPX (1U << 14) /* AVX-512 Foundation */ @@ -909,6 +936,12 @@ uint64_t x86_cpu_get_supported_feature_word(FeatureWord w, /* Speculative Store Bypass Disable */ #define CPUID_7_0_EDX_SPEC_CTRL_SSBD (1U << 31) +/* SHA512 Instruction */ +#define CPUID_7_1_EAX_SHA512 (1U << 0) +/* SM3 Instruction */ +#define CPUID_7_1_EAX_SM3 (1U << 1) +/* SM4 Instruction */ +#define CPUID_7_1_EAX_SM4 (1U << 2) /* AVX VNNI Instruction */ #define CPUID_7_1_EAX_AVX_VNNI (1U << 4) /* AVX512 BFloat16 Instruction */ @@ -921,20 +954,40 @@ uint64_t x86_cpu_get_supported_feature_word(FeatureWord w, #define CPUID_7_1_EAX_FSRS (1U << 11) /* Fast Short REP CMPS/SCAS */ #define CPUID_7_1_EAX_FSRC (1U << 12) +/* Flexible return and event delivery (FRED) */ +#define CPUID_7_1_EAX_FRED (1U << 17) +/* Load into IA32_KERNEL_GS_BASE (LKGS) */ +#define CPUID_7_1_EAX_LKGS (1U << 18) +/* Non-Serializing Write to Model Specific Register (WRMSRNS) */ +#define CPUID_7_1_EAX_WRMSRNS (1U << 19) /* Support Tile Computational Operations on FP16 Numbers */ #define CPUID_7_1_EAX_AMX_FP16 (1U << 21) 
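/* * Illustrative sketch (not part of the enumeration above): consumers gate * CPU state on these bits; e.g. cr4_reserved_bits() below only permits * CR4.FRED once FRED is enumerated: * * if (!(env->features[FEAT_7_1_EAX] & CPUID_7_1_EAX_FRED)) { * reserved_bits |= CR4_FRED_MASK; * } */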
/* Support for VPMADD52[H,L]UQ */ #define CPUID_7_1_EAX_AVX_IFMA (1U << 23) +/* Linear Address Masking */ +#define CPUID_7_1_EAX_LAM (1U << 26) /* Support for VPDPB[SU,UU,SS]D[,S] */ #define CPUID_7_1_EDX_AVX_VNNI_INT8 (1U << 4) /* AVX NE CONVERT Instructions */ #define CPUID_7_1_EDX_AVX_NE_CONVERT (1U << 5) +/* AVX-VNNI-INT16 Instructions */ +#define CPUID_7_1_EDX_AVX_VNNI_INT16 (1U << 10) /* AMX COMPLEX Instructions */ #define CPUID_7_1_EDX_AMX_COMPLEX (1U << 8) /* PREFETCHIT0/1 Instructions */ #define CPUID_7_1_EDX_PREFETCHITI (1U << 14) +/* Indicate bit 7 of the IA32_SPEC_CTRL MSR is supported */ +#define CPUID_7_2_EDX_PSFD (1U << 0) +/* Indicate bits 3 and 4 of the IA32_SPEC_CTRL MSR are supported */ +#define CPUID_7_2_EDX_IPRED_CTRL (1U << 1) +/* Indicate bits 5 and 6 of the IA32_SPEC_CTRL MSR are supported */ +#define CPUID_7_2_EDX_RRSBA_CTRL (1U << 2) +/* Indicate bit 8 of the IA32_SPEC_CTRL MSR is supported */ +#define CPUID_7_2_EDX_DDPD_U (1U << 3) +/* Indicate bit 10 of the IA32_SPEC_CTRL MSR is supported */ +#define CPUID_7_2_EDX_BHI_CTRL (1U << 4) /* Do not exhibit MXCSR Configuration Dependent Timing (MCDT) behavior */ #define CPUID_7_2_EDX_MCDT_NO (1U << 5) @@ -964,7 +1017,7 @@ uint64_t x86_cpu_get_supported_feature_word(FeatureWord w, #define CPUID_8000_0008_EBX_AMD_PSFD (1U << 28) /* Processor ignores nested data breakpoints */ -#define CPUID_8000_0021_EAX_No_NESTED_DATA_BP (1U << 0) +#define CPUID_8000_0021_EAX_NO_NESTED_DATA_BP (1U << 0) /* LFENCE is always serializing */ #define CPUID_8000_0021_EAX_LFENCE_ALWAYS_SERIALIZING (1U << 2) /* Null Selector Clears Base */ @@ -1028,7 +1081,10 @@ uint64_t x86_cpu_get_supported_feature_word(FeatureWord w, #define MSR_ARCH_CAP_FBSDP_NO (1U << 14) #define MSR_ARCH_CAP_PSDP_NO (1U << 15) #define MSR_ARCH_CAP_FB_CLEAR (1U << 17) +#define MSR_ARCH_CAP_BHI_NO (1U << 20) #define MSR_ARCH_CAP_PBRSB_NO (1U << 24) +#define MSR_ARCH_CAP_GDS_NO (1U << 26) +#define MSR_ARCH_CAP_RFDS_NO (1U << 27) #define MSR_CORE_CAP_SPLIT_LOCK_DETECT (1U << 5) @@ -1040,6 +1096,7 @@ uint64_t x86_cpu_get_supported_feature_word(FeatureWord w, #define MSR_VMX_BASIC_INS_OUTS (1ULL << 54) #define MSR_VMX_BASIC_TRUE_CTLS (1ULL << 55) #define MSR_VMX_BASIC_ANY_ERRCODE (1ULL << 56) +#define MSR_VMX_BASIC_NESTED_EXCEPTION (1ULL << 58) #define MSR_VMX_MISC_PREEMPTION_TIMER_SHIFT_MASK 0x1Full #define MSR_VMX_MISC_STORE_LMA (1ULL << 5) @@ -1135,6 +1192,7 @@ uint64_t x86_cpu_get_supported_feature_word(FeatureWord w, #define VMX_VM_EXIT_PT_CONCEAL_PIP 0x01000000 #define VMX_VM_EXIT_CLEAR_IA32_RTIT_CTL 0x02000000 #define VMX_VM_EXIT_LOAD_IA32_PKRS 0x20000000 +#define VMX_VM_EXIT_ACTIVATE_SECONDARY_CONTROLS 0x80000000 #define VMX_VM_ENTRY_LOAD_DEBUG_CONTROLS 0x00000004 #define VMX_VM_ENTRY_IA32E_MODE 0x00000200 @@ -1672,6 +1730,17 @@ typedef struct CPUArchState { target_ulong cstar; target_ulong fmask; target_ulong kernelgsbase; + + /* FRED MSRs */ + uint64_t fred_rsp0; + uint64_t fred_rsp1; + uint64_t fred_rsp2; + uint64_t fred_rsp3; + uint64_t fred_stklvls; + uint64_t fred_ssp1; + uint64_t fred_ssp2; + uint64_t fred_ssp3; + uint64_t fred_config; #endif uint64_t tsc_adjust; @@ -1884,6 +1953,9 @@ typedef struct CPUArchState { /* Number of dies within this CPU package. */ unsigned nr_dies; + + /* GHCB guest physical address info */ + uint64_t ghcb_gpa; } CPUX86State; struct kvm_msrs; @@ -2019,6 +2091,14 @@ struct ArchCPU { /* Number of physical address bits supported */ uint32_t phys_bits; + /* + * Number of guest physical address bits available. 
Usually this is + * identical to host physical address bits. With NPT or EPT 4-level + * paging, guest physical address space might be restricted to 48 bits + * even if the host cpu supports more physical address bits. + */ + uint32_t guest_phys_bits; + /* in order to simplify APIC support, we leave this pointer to the user */ struct DeviceState *apic_state; @@ -2081,6 +2161,11 @@ struct X86CPUClass { extern const VMStateDescription vmstate_x86_cpu; #endif +#define DEFAULT_VM_CPU_PHYS_BITS 42 +#define LARGE_VM_CPU_PHYS_BITS 46 + +void x86_cpu_adjuest_by_ram_size(ram_addr_t ram_size, X86CPU *cpu); + int x86_cpu_pending_interrupt(CPUState *cs, int interrupt_request); int x86_cpu_write_elf64_note(WriteCoreDumpFunction f, CPUState *cpu, @@ -2519,6 +2604,12 @@ static inline uint64_t cr4_reserved_bits(CPUX86State *env) if (!(env->features[FEAT_7_0_ECX] & CPUID_7_0_ECX_PKS)) { reserved_bits |= CR4_PKS_MASK; } + if (!(env->features[FEAT_7_1_EAX] & CPUID_7_1_EAX_LAM)) { + reserved_bits |= CR4_LAM_SUP_MASK; + } + if (!(env->features[FEAT_7_1_EAX] & CPUID_7_1_EAX_FRED)) { + reserved_bits |= CR4_FRED_MASK; + } return reserved_bits; } diff --git a/target/i386/csv-sysemu-stub.c b/target/i386/csv-sysemu-stub.c new file mode 100644 index 0000000000000000000000000000000000000000..735cce0e4b78c5e13818a8c508c1d5f4304432b5 --- /dev/null +++ b/target/i386/csv-sysemu-stub.c @@ -0,0 +1,51 @@ +/* + * QEMU CSV system stub + * + * Copyright: Hygon Info Technologies Ltd. 2022 + * + * Author: + * Jiang Xin + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" +#include "sev.h" +#include "csv.h" + +int csv3_init(uint32_t policy, int fd, void *state, struct sev_ops *ops) +{ + return 0; +} + +int csv3_load_data(uint64_t gpa, uint8_t *ptr, uint64_t len, Error **errp) +{ + g_assert_not_reached(); +} + +int csv3_launch_encrypt_vmcb(void) +{ + g_assert_not_reached(); +} + +int csv3_shared_region_dma_map(uint64_t start, uint64_t end) +{ + return 0; +} + +void csv3_shared_region_dma_unmap(uint64_t start, uint64_t end) +{ + +} + +void csv3_shared_region_release(uint64_t gpa, uint32_t num_pages) +{ + +} + +int csv3_set_guest_private_memory(Error **errp) +{ + g_assert_not_reached(); +} diff --git a/target/i386/csv.c b/target/i386/csv.c new file mode 100644 index 0000000000000000000000000000000000000000..b229f7c317789abb5f72ea5f7a959ed259c915a7 --- /dev/null +++ b/target/i386/csv.c @@ -0,0 +1,757 @@ +/* + * QEMU CSV support + * + * Copyright: Hygon Info Technologies Ltd. 2022 + * + * Author: + * Jiang Xin + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ * + */ + +#include "qemu/osdep.h" +#include "qemu/error-report.h" +#include "qapi/error.h" +#include "sysemu/kvm.h" +#include "exec/address-spaces.h" +#include "migration/blocker.h" +#include "migration/qemu-file.h" +#include "migration/misc.h" +#include "monitor/monitor.h" + +#include <linux/kvm.h> +#include <linux/psp-sev.h> + +#ifdef CONFIG_NUMA +#include <numaif.h> +#endif + +#include "trace.h" +#include "cpu.h" +#include "sev.h" +#include "csv.h" + +bool csv_kvm_cpu_reset_inhibit; +uint32_t kvm_hygon_coco_ext; +uint32_t kvm_hygon_coco_ext_inuse; + +struct ConfidentialGuestMemoryEncryptionOps csv3_memory_encryption_ops = { + .save_setup = sev_save_setup, + .save_outgoing_page = NULL, + .load_incoming_page = csv3_load_incoming_page, + .is_gfn_in_unshared_region = NULL, + .save_outgoing_shared_regions_list = sev_save_outgoing_shared_regions_list, + .load_incoming_shared_regions_list = sev_load_incoming_shared_regions_list, + .queue_outgoing_page = csv3_queue_outgoing_page, + .save_queued_outgoing_pages = csv3_save_queued_outgoing_pages, + .queue_incoming_page = NULL, + .load_queued_incoming_pages = NULL, + .save_outgoing_cpu_state = csv3_save_outgoing_context, + .load_incoming_cpu_state = csv3_load_incoming_context, +}; + +#define CSV3_OUTGOING_PAGE_NUM \ + (CSV3_OUTGOING_PAGE_WINDOW_SIZE / TARGET_PAGE_SIZE) + +Csv3GuestState csv3_guest = { 0 }; + +int +csv3_init(uint32_t policy, int fd, void *state, struct sev_ops *ops) +{ + int fw_error; + int ret; + struct kvm_csv3_init_data data = { 0 }; + +#ifdef CONFIG_NUMA + int mode; + unsigned long nodemask; + + /* Set flags as 0 to retrieve the default NUMA policy. */ + ret = get_mempolicy(&mode, &nodemask, sizeof(nodemask) * 8, NULL, 0); + if (ret == 0 && mode == MPOL_BIND) + data.nodemask = nodemask; +#endif + + if (!ops || !ops->sev_ioctl || !ops->fw_error_to_str) + return -1; + + csv3_guest.policy = policy; + if (csv3_enabled()) { + ret = ops->sev_ioctl(fd, KVM_CSV3_INIT, &data, &fw_error); + if (ret) { + csv3_guest.policy = 0; + error_report("%s: Failed to initialize ret=%d fw_error=%d '%s'", + __func__, ret, fw_error, ops->fw_error_to_str(fw_error)); + return -1; + } + + kvm_csv3_allowed = true; + + csv3_guest.sev_fd = fd; + csv3_guest.state = state; + csv3_guest.sev_ioctl = ops->sev_ioctl; + csv3_guest.fw_error_to_str = ops->fw_error_to_str; + QTAILQ_INIT(&csv3_guest.dma_map_regions_list); + qemu_mutex_init(&csv3_guest.dma_map_regions_list_mutex); + csv3_guest.sev_send_start = ops->sev_send_start; + csv3_guest.sev_receive_start = ops->sev_receive_start; + } + return 0; +} + +bool +csv3_enabled(void) +{ + if (!is_hygon_cpu()) + return false; + + return sev_es_enabled() && (csv3_guest.policy & GUEST_POLICY_CSV3_BIT); +} + +static bool +csv3_check_state(SevState state) +{ + return *((SevState *)csv3_guest.state) == state; +} + +static int +csv3_ioctl(int cmd, void *data, int *error) +{ + if (csv3_guest.sev_ioctl) + return csv3_guest.sev_ioctl(csv3_guest.sev_fd, cmd, data, error); + else + return -1; +} + +static const char * +fw_error_to_str(int code) +{ + if (csv3_guest.fw_error_to_str) + return csv3_guest.fw_error_to_str(code); + else + return NULL; +} + +static int +csv3_launch_encrypt_data(uint64_t gpa, uint8_t *addr, uint64_t len) +{ + int ret, fw_error; + struct kvm_csv3_launch_encrypt_data update; + + if (!addr || !len) { + return 1; + } + + update.gpa = (__u64)gpa; + update.uaddr = (__u64)(unsigned long)addr; + update.len = len; + trace_kvm_csv3_launch_encrypt_data(gpa, addr, len); + ret = csv3_ioctl(KVM_CSV3_LAUNCH_ENCRYPT_DATA, &update, &fw_error); + if (ret) { +
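/* fw_error_to_str() decodes the raw firmware status code */ +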
error_report("%s: CSV3 LAUNCH_ENCRYPT_DATA ret=%d fw_error=%d '%s'", + __func__, ret, fw_error, fw_error_to_str(fw_error)); + } + + return ret; +} + +int +csv3_load_data(uint64_t gpa, uint8_t *ptr, uint64_t len, Error **errp) +{ + int ret = 0; + + if (!csv3_enabled()) { + error_setg(errp, "%s: CSV3 is not enabled", __func__); + return -1; + } + + /* if CSV3 is in update state then load the data to secure memory */ + if (csv3_check_state(SEV_STATE_LAUNCH_UPDATE)) { + ret = csv3_launch_encrypt_data(gpa, ptr, len); + if (ret) + error_setg(errp, "%s: CSV3 fail to encrypt data", __func__); + } + + return ret; +} + +int +csv3_launch_encrypt_vmcb(void) +{ + int ret, fw_error; + + if (!csv3_enabled()) { + error_report("%s: CSV3 is not enabled", __func__); + return -1; + } + + ret = csv3_ioctl(KVM_CSV3_LAUNCH_ENCRYPT_VMCB, NULL, &fw_error); + if (ret) { + error_report("%s: CSV3 LAUNCH_ENCRYPT_VMCB ret=%d fw_error=%d '%s'", + __func__, ret, fw_error, fw_error_to_str(fw_error)); + goto err; + } + +err: + return ret; +} + +int csv3_shared_region_dma_map(uint64_t start, uint64_t end) +{ + MemoryRegionSection section; + AddressSpace *as; + QTAILQ_HEAD(, SharedRegionListener) *shared_region_listeners; + SharedRegionListener *shl; + MemoryListener *listener; + uint64_t size; + Csv3GuestState *s = &csv3_guest; + struct dma_map_region *region, *pos; + int ret = 0; + + if (!csv3_enabled()) + return 0; + + if (end <= start) + return 0; + + shared_region_listeners = shared_region_listeners_get(); + if (QTAILQ_EMPTY(shared_region_listeners)) + return 0; + + size = end - start; + + qemu_mutex_lock(&s->dma_map_regions_list_mutex); + QTAILQ_FOREACH(pos, &s->dma_map_regions_list, list) { + if (start >= (pos->start + pos->size)) { + continue; + } else if ((start + size) <= pos->start) { + break; + } else { + goto end; + } + } + QTAILQ_FOREACH(shl, shared_region_listeners, next) { + listener = shl->listener; + as = shl->as; + section = memory_region_find(as->root, start, size); + if (!section.mr) { + goto end; + } + + if (!memory_region_is_ram(section.mr)) { + memory_region_unref(section.mr); + goto end; + } + + if (listener->region_add) { + listener->region_add(listener, §ion); + } + memory_region_unref(section.mr); + } + + region = g_malloc0(sizeof(*region)); + if (!region) { + ret = -1; + goto end; + } + region->start = start; + region->size = size; + + if (pos) { + QTAILQ_INSERT_BEFORE(pos, region, list); + } else { + QTAILQ_INSERT_TAIL(&s->dma_map_regions_list, region, list); + } + +end: + qemu_mutex_unlock(&s->dma_map_regions_list_mutex); + return ret; +} + +void csv3_shared_region_release(uint64_t gpa, uint32_t num_pages) +{ + struct kvm_csv3_handle_memory mem = { 0 }; + MemoryRegion *mr = NULL; + void *hva; + int ret; + + if (!csv3_enabled()) + return; + + if (!gpa || !num_pages) + return; + + mem.gpa = (__u64)gpa; + mem.num_pages = (__u32)num_pages; + mem.opcode = (__u32)KVM_CSV3_RELEASE_SHARED_MEMORY; + + /* unpin the pages */ + ret = csv3_ioctl(KVM_CSV3_HANDLE_MEMORY, &mem, NULL); + if (ret <= 0) { + if (ret < 0) + error_report("%s: CSV3 unpin failed ret %d", __func__, ret); + return; + } + + /* drop the pages */ + hva = gpa2hva(&mr, gpa, num_pages << TARGET_PAGE_BITS, NULL); + if (hva) { + ret = madvise(hva, num_pages << TARGET_PAGE_BITS, MADV_DONTNEED); + if (ret) + error_report("%s: madvise failed %d", __func__, ret); + } +} + +void csv3_shared_region_dma_unmap(uint64_t start, uint64_t end) +{ + MemoryRegionSection section; + AddressSpace *as; + QTAILQ_HEAD(, SharedRegionListener) 
*shared_region_listeners; + SharedRegionListener *shl; + MemoryListener *listener; + uint64_t size; + Csv3GuestState *s = &csv3_guest; + struct dma_map_region *pos, *next_pos; + + if (!csv3_enabled()) + return; + + if (end <= start) + return; + + shared_region_listeners = shared_region_listeners_get(); + if (QTAILQ_EMPTY(shared_region_listeners)) + return; + + size = end - start; + + qemu_mutex_lock(&s->dma_map_regions_list_mutex); + QTAILQ_FOREACH_SAFE(pos, &s->dma_map_regions_list, list, next_pos) { + uint64_t l, r; + uint64_t curr_end = pos->start + pos->size; + + l = MAX(start, pos->start); + r = MIN(start + size, pos->start + pos->size); + if (l < r) { + if ((start <= pos->start) && (start + size >= pos->start + pos->size)) { + QTAILQ_FOREACH(shl, shared_region_listeners, next) { + listener = shl->listener; + as = shl->as; + section = memory_region_find(as->root, pos->start, pos->size); + if (!section.mr) { + goto end; + } + if (listener->region_del) { + listener->region_del(listener, §ion); + } + memory_region_unref(section.mr); + } + + QTAILQ_REMOVE(&s->dma_map_regions_list, pos, list); + g_free(pos); + } + break; + } + if ((start + size) <= curr_end) { + break; + } + } +end: + qemu_mutex_unlock(&s->dma_map_regions_list_mutex); + return; +} + +static inline hwaddr csv3_hva_to_gfn(uint8_t *ptr) +{ + ram_addr_t offset = RAM_ADDR_INVALID; + + kvm_physical_memory_addr_from_host(kvm_state, ptr, &offset); + + return offset >> TARGET_PAGE_BITS; +} + +static int +csv3_send_start(QEMUFile *f, uint64_t *bytes_sent) +{ + if (csv3_guest.sev_send_start) + return csv3_guest.sev_send_start(f, bytes_sent); + else + return -1; +} + +static int +csv3_send_get_packet_len(int *fw_err) +{ + int ret; + struct kvm_csv3_send_encrypt_data update = {0}; + + update.hdr_len = 0; + update.trans_len = 0; + ret = csv3_ioctl(KVM_CSV3_SEND_ENCRYPT_DATA, &update, fw_err); + if (*fw_err != SEV_RET_INVALID_LEN) { + error_report("%s: failed to get session length ret=%d fw_error=%d '%s'", + __func__, ret, *fw_err, fw_error_to_str(*fw_err)); + ret = 0; + goto err; + } + + if (update.hdr_len <= INT_MAX) + ret = update.hdr_len; + else + ret = 0; + +err: + return ret; +} + +static int +csv3_send_encrypt_data(Csv3GuestState *s, QEMUFile *f, + uint8_t *ptr, uint32_t size, uint64_t *bytes_sent) +{ + int ret, fw_error = 0; + guchar *trans; + uint32_t guest_addr_entry_num; + uint32_t i; + struct kvm_csv3_send_encrypt_data update = { }; + + /* + * If this is first call then query the packet header bytes and allocate + * the packet buffer. 
+ */ + if (!s->send_packet_hdr) { + s->send_packet_hdr_len = csv3_send_get_packet_len(&fw_error); + if (s->send_packet_hdr_len < 1) { + error_report("%s: SEND_UPDATE fw_error=%d '%s'", + __func__, fw_error, fw_error_to_str(fw_error)); + return 1; + } + + s->send_packet_hdr = g_new(gchar, s->send_packet_hdr_len); + } + + if (!s->guest_addr_len || !s->guest_addr_data) { + error_report("%s: invalid host address or size", __func__); + return 1; + } else { + guest_addr_entry_num = s->guest_addr_len / sizeof(struct guest_addr_entry); + } + + /* allocate transport buffer */ + trans = g_new(guchar, guest_addr_entry_num * TARGET_PAGE_SIZE); + + update.hdr_uaddr = (uintptr_t)s->send_packet_hdr; + update.hdr_len = s->send_packet_hdr_len; + update.guest_addr_data = (uintptr_t)s->guest_addr_data; + update.guest_addr_len = s->guest_addr_len; + update.trans_uaddr = (uintptr_t)trans; + update.trans_len = guest_addr_entry_num * TARGET_PAGE_SIZE; + + trace_kvm_csv3_send_encrypt_data(trans, update.trans_len); + + ret = csv3_ioctl(KVM_CSV3_SEND_ENCRYPT_DATA, &update, &fw_error); + if (ret) { + error_report("%s: SEND_ENCRYPT_DATA ret=%d fw_error=%d '%s'", + __func__, ret, fw_error, fw_error_to_str(fw_error)); + goto err; + } + + for (i = 0; i < guest_addr_entry_num; i++) { + if (s->guest_addr_data[i].share) + memcpy(trans + i * TARGET_PAGE_SIZE, (guchar *)s->guest_hva_data[i].hva, + TARGET_PAGE_SIZE); + } + + qemu_put_be32(f, update.hdr_len); + qemu_put_buffer(f, (uint8_t *)update.hdr_uaddr, update.hdr_len); + *bytes_sent += 4 + update.hdr_len; + + qemu_put_be32(f, update.guest_addr_len); + qemu_put_buffer(f, (uint8_t *)update.guest_addr_data, update.guest_addr_len); + *bytes_sent += 4 + update.guest_addr_len; + + qemu_put_be32(f, update.trans_len); + qemu_put_buffer(f, (uint8_t *)update.trans_uaddr, update.trans_len); + *bytes_sent += (4 + update.trans_len); + +err: + s->guest_addr_len = 0; + g_free(trans); + return ret; +} + +int +csv3_queue_outgoing_page(uint8_t *ptr, uint32_t sz, uint64_t addr) +{ + Csv3GuestState *s = &csv3_guest; + uint32_t i = 0; + + if (!s->guest_addr_data) { + s->guest_hva_data = g_new0(struct guest_hva_entry, CSV3_OUTGOING_PAGE_NUM); + s->guest_addr_data = g_new0(struct guest_addr_entry, CSV3_OUTGOING_PAGE_NUM); + s->guest_addr_len = 0; + } + + if (s->guest_addr_len >= sizeof(struct guest_addr_entry) * CSV3_OUTGOING_PAGE_NUM) { + error_report("Failed to queue outgoing page"); + return 1; + } + + i = s->guest_addr_len / sizeof(struct guest_addr_entry); + s->guest_hva_data[i].hva = (uintptr_t)ptr; + s->guest_addr_data[i].share = 0; + s->guest_addr_data[i].reserved = 0; + s->guest_addr_data[i].gfn = csv3_hva_to_gfn(ptr); + s->guest_addr_len += sizeof(struct guest_addr_entry); + + return 0; +} + +int +csv3_save_queued_outgoing_pages(QEMUFile *f, uint64_t *bytes_sent) +{ + Csv3GuestState *s = &csv3_guest; + + /* + * If this is a first buffer then create outgoing encryption context + * and write our PDH, policy and session data. 
+ */ + if (!csv3_check_state(SEV_STATE_SEND_UPDATE) && + csv3_send_start(f, bytes_sent)) { + error_report("Failed to create outgoing context"); + return 1; + } + + return csv3_send_encrypt_data(s, f, NULL, 0, bytes_sent); +} + +static int +csv3_receive_start(QEMUFile *f) +{ + if (csv3_guest.sev_receive_start) + return csv3_guest.sev_receive_start(f); + else + return -1; +} + +static int csv3_receive_encrypt_data(QEMUFile *f, uint8_t *ptr) +{ + int ret = 1, fw_error = 0; + uint32_t i, guest_addr_entry_num; + gchar *hdr = NULL, *trans = NULL; + struct guest_addr_entry *guest_addr_data; + struct kvm_csv3_receive_encrypt_data update = {}; + void *hva = NULL; + MemoryRegion *mr = NULL; + + /* get packet header */ + update.hdr_len = qemu_get_be32(f); + + hdr = g_new(gchar, update.hdr_len); + qemu_get_buffer(f, (uint8_t *)hdr, update.hdr_len); + update.hdr_uaddr = (uintptr_t)hdr; + + /* get guest addr data */ + update.guest_addr_len = qemu_get_be32(f); + + guest_addr_data = (struct guest_addr_entry *)g_new(gchar, update.guest_addr_len); + qemu_get_buffer(f, (uint8_t *)guest_addr_data, update.guest_addr_len); + update.guest_addr_data = (uintptr_t)guest_addr_data; + + /* get transport buffer */ + update.trans_len = qemu_get_be32(f); + + trans = g_new(gchar, update.trans_len); + update.trans_uaddr = (uintptr_t)trans; + qemu_get_buffer(f, (uint8_t *)update.trans_uaddr, update.trans_len); + + /* update share memory. */ + guest_addr_entry_num = update.guest_addr_len / sizeof(struct guest_addr_entry); + for (i = 0; i < guest_addr_entry_num; i++) { + if (guest_addr_data[i].share) { + hva = gpa2hva(&mr, + ((uint64_t)guest_addr_data[i].gfn << TARGET_PAGE_BITS), + TARGET_PAGE_SIZE, + NULL); + if (hva) + memcpy(hva, trans + i * TARGET_PAGE_SIZE, TARGET_PAGE_SIZE); + } + } + + trace_kvm_csv3_receive_encrypt_data(trans, update.trans_len, hdr, update.hdr_len); + + ret = csv3_ioctl(KVM_CSV3_RECEIVE_ENCRYPT_DATA, &update, &fw_error); + if (ret) { + error_report("Error RECEIVE_ENCRYPT_DATA ret=%d fw_error=%d '%s'", + ret, fw_error, fw_error_to_str(fw_error)); + goto err; + } + +err: + g_free(trans); + g_free(guest_addr_data); + g_free(hdr); + return ret; +} + +int csv3_load_incoming_page(QEMUFile *f, uint8_t *ptr) +{ + /* + * If this is first buffer and SEV is not in recieiving state then + * use RECEIVE_START command to create a encryption context. 
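+ * + * The incoming stream is framed as be32 length prefixes, each followed + * by its payload (packet header, guest address table, transport buffer), + * mirroring the qemu_put_be32()/qemu_put_buffer() sequence in + * csv3_send_encrypt_data() on the source side.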
+     */
+    if (!csv3_check_state(SEV_STATE_RECEIVE_UPDATE) &&
+        csv3_receive_start(f)) {
+        return 1;
+    }
+
+    return csv3_receive_encrypt_data(f, ptr);
+}
+
+static int
+csv3_send_get_context_len(int *fw_err, int *context_len, int *hdr_len)
+{
+    int ret = 0;
+    struct kvm_csv3_send_encrypt_context update = { 0 };
+
+    ret = csv3_ioctl(KVM_CSV3_SEND_ENCRYPT_CONTEXT, &update, fw_err);
+    if (*fw_err != SEV_RET_INVALID_LEN) {
+        error_report("%s: failed to get context length ret=%d fw_error=%d '%s'",
+                     __func__, ret, *fw_err, fw_error_to_str(*fw_err));
+        ret = -1;
+        goto err;
+    }
+
+    if (update.trans_len <= INT_MAX && update.hdr_len <= INT_MAX) {
+        *context_len = update.trans_len;
+        *hdr_len = update.hdr_len;
+    }
+    ret = 0;
+err:
+    return ret;
+}
+
+static int
+csv3_send_encrypt_context(Csv3GuestState *s, QEMUFile *f, uint64_t *bytes_sent)
+{
+    int ret, fw_error = 0;
+    int context_len = 0;
+    int hdr_len = 0;
+    guchar *trans;
+    guchar *hdr;
+    struct kvm_csv3_send_encrypt_context update = { };
+
+    ret = csv3_send_get_context_len(&fw_error, &context_len, &hdr_len);
+    if (context_len < 1 || hdr_len < 1) {
+        error_report("%s: failed to get context length fw_error=%d '%s'",
+                     __func__, fw_error, fw_error_to_str(fw_error));
+        return 1;
+    }
+
+    /* allocate transport buffer */
+    trans = g_new(guchar, context_len);
+    hdr = g_new(guchar, hdr_len);
+
+    update.hdr_uaddr = (uintptr_t)hdr;
+    update.hdr_len = hdr_len;
+    update.trans_uaddr = (uintptr_t)trans;
+    update.trans_len = context_len;
+
+    trace_kvm_csv3_send_encrypt_context(trans, update.trans_len);
+
+    ret = csv3_ioctl(KVM_CSV3_SEND_ENCRYPT_CONTEXT, &update, &fw_error);
+    if (ret) {
+        error_report("%s: SEND_ENCRYPT_CONTEXT ret=%d fw_error=%d '%s'",
+                     __func__, ret, fw_error, fw_error_to_str(fw_error));
+        goto err;
+    }
+
+    qemu_put_be32(f, update.hdr_len);
+    qemu_put_buffer(f, (uint8_t *)update.hdr_uaddr, update.hdr_len);
+    *bytes_sent += 4 + update.hdr_len;
+
+    qemu_put_be32(f, update.trans_len);
+    qemu_put_buffer(f, (uint8_t *)update.trans_uaddr, update.trans_len);
+    *bytes_sent += 4 + update.trans_len;
+
+err:
+    g_free(trans);
+    g_free(hdr);
+    return ret;
+}
+
+static int
+csv3_receive_encrypt_context(Csv3GuestState *s, QEMUFile *f)
+{
+    int ret = 1, fw_error = 0;
+    gchar *hdr = NULL, *trans = NULL;
+    struct kvm_csv3_receive_encrypt_context update = {};
+
+    /* get packet header */
+    update.hdr_len = qemu_get_be32(f);
+
+    hdr = g_new(gchar, update.hdr_len);
+    qemu_get_buffer(f, (uint8_t *)hdr, update.hdr_len);
+    update.hdr_uaddr = (uintptr_t)hdr;
+
+    /* get transport buffer */
+    update.trans_len = qemu_get_be32(f);
+
+    trans = g_new(gchar, update.trans_len);
+    update.trans_uaddr = (uintptr_t)trans;
+    qemu_get_buffer(f, (uint8_t *)update.trans_uaddr, update.trans_len);
+
+    trace_kvm_csv3_receive_encrypt_context(trans, update.trans_len, hdr, update.hdr_len);
+
+    ret = csv3_ioctl(KVM_CSV3_RECEIVE_ENCRYPT_CONTEXT, &update, &fw_error);
+    if (ret) {
+        error_report("Error RECEIVE_ENCRYPT_CONTEXT ret=%d fw_error=%d '%s'",
+                     ret, fw_error, fw_error_to_str(fw_error));
+        goto err;
+    }
+
+err:
+    g_free(trans);
+    g_free(hdr);
+    return ret;
+}
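csv3_send_get_context_len() relies on the usual SEV firmware calling convention: issue the command with zero-length buffers, let it fail with SEV_RET_INVALID_LEN, and read the required sizes back out of the request structure. A self-contained sketch of that probe-allocate-retry pattern; fw_cmd() and its request struct are hypothetical stand-ins for the ioctl:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define RET_INVALID_LEN 1        /* stand-in for SEV_RET_INVALID_LEN */

struct req { void *buf; uint32_t len; };

/* Mock firmware command: reports the needed size when len is too small. */
static int fw_cmd(struct req *r)
{
    if (r->len < 16) {
        r->len = 16;
        return RET_INVALID_LEN;
    }
    return 0;                    /* pretend the command succeeded */
}

static int run_with_sized_buffer(void)
{
    struct req r = { 0 };        /* first call: len == 0 on purpose */
    int err = fw_cmd(&r);

    if (err != RET_INVALID_LEN) {
        return -1;               /* anything else is a real failure */
    }

    r.buf = malloc(r.len);       /* firmware told us how much it needs */
    if (!r.buf) {
        return -1;
    }

    err = fw_cmd(&r);            /* second call does the actual work */
    free(r.buf);
    return err ? -1 : 0;
}

int main(void)
{
    printf("result: %d\n", run_with_sized_buffer());
    return 0;
}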
+
+int csv3_save_outgoing_context(QEMUFile *f, uint64_t *bytes_sent)
+{
+    Csv3GuestState *s = &csv3_guest;
+
+    /* send csv3 context. */
+    return csv3_send_encrypt_context(s, f, bytes_sent);
+}
+
+int csv3_load_incoming_context(QEMUFile *f)
+{
+    Csv3GuestState *s = &csv3_guest;
+
+    /* receive csv3 context. */
+    return csv3_receive_encrypt_context(s, f);
+}
+
+int csv3_set_guest_private_memory(Error **errp)
+{
+    int fw_error;
+    int ret = 0;
+
+    if (!csv3_enabled()) {
+        error_setg(errp, "%s: CSV3 is not enabled", __func__);
+        return -1;
+    }
+
+    /* if CSV3 is in update state then load the data to secure memory */
+    if (csv3_check_state(SEV_STATE_LAUNCH_UPDATE)) {
+        trace_kvm_csv3_set_guest_private_memory();
+        ret = csv3_ioctl(KVM_CSV3_SET_GUEST_PRIVATE_MEMORY, NULL, &fw_error);
+        if (ret)
+            error_setg(errp, "%s: CSV3 failed to set private memory, ret=%d"
+                       " fw_error=%d '%s'",
+                       __func__, ret, fw_error, fw_error_to_str(fw_error));
+    }
+
+    return ret;
+}
diff --git a/target/i386/csv.h b/target/i386/csv.h
new file mode 100644
index 0000000000000000000000000000000000000000..70f9933d3b29d83b5e88bf4b17b26e8df4ca0986
--- /dev/null
+++ b/target/i386/csv.h
@@ -0,0 +1,136 @@
+/*
+ * QEMU CSV support
+ *
+ * Copyright: Hygon Info Technologies Ltd. 2022
+ *
+ * Author:
+ *      Jiang Xin
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef I386_CSV_H
+#define I386_CSV_H
+
+#include "qapi/qapi-commands-misc-target.h"
+#include "qemu/thread.h"
+#include "qemu/queue.h"
+#include "sev.h"
+
+#define GUEST_POLICY_CSV3_BIT   (1 << 6)
+#define GUEST_POLICY_REUSE_ASID (1 << 7)
+
+#ifdef CONFIG_CSV
+
+#include "cpu.h"
+
+#define CPUID_VENDOR_HYGON_EBX  0x6f677948 /* "Hygo" */
+#define CPUID_VENDOR_HYGON_ECX  0x656e6975 /* "uine" */
+#define CPUID_VENDOR_HYGON_EDX  0x6e65476e /* "nGen" */
+
+static bool __attribute__((unused)) is_hygon_cpu(void)
+{
+    uint32_t ebx = 0;
+    uint32_t ecx = 0;
+    uint32_t edx = 0;
+
+    host_cpuid(0, 0, NULL, &ebx, &ecx, &edx);
+
+    if (ebx == CPUID_VENDOR_HYGON_EBX &&
+        ecx == CPUID_VENDOR_HYGON_ECX &&
+        edx == CPUID_VENDOR_HYGON_EDX)
+        return true;
+    else
+        return false;
+}
+
+bool csv3_enabled(void);
+
+#else
+
+#define is_hygon_cpu() (false)
+#define csv3_enabled() (false)
+
+#endif
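The is_hygon_cpu() check above compares the three CPUID leaf-0 vendor registers against the little-endian encoding of "HygonGenuine" (EBX/EDX/ECX carry "Hygo"/"nGen"/"uine"). A standalone sketch of the same test, using GCC's cpuid.h instead of QEMU's host_cpuid() helper:

#include <cpuid.h>
#include <stdbool.h>
#include <stdio.h>

/* Vendor string "HygonGenuine" as it appears in EBX/EDX/ECX of CPUID.0. */
static bool host_is_hygon(void)
{
    unsigned int eax, ebx, ecx, edx;

    if (!__get_cpuid(0, &eax, &ebx, &ecx, &edx)) {
        return false;
    }
    return ebx == 0x6f677948 &&   /* "Hygo" */
           ecx == 0x656e6975 &&   /* "uine" */
           edx == 0x6e65476e;     /* "nGen" */
}

int main(void)
{
    printf("Hygon host: %s\n", host_is_hygon() ? "yes" : "no");
    return 0;
}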
+
+#define CSV_OUTGOING_PAGE_WINDOW_SIZE (4094 * TARGET_PAGE_SIZE)
+
+extern bool csv_kvm_cpu_reset_inhibit;
+extern uint32_t kvm_hygon_coco_ext;
+extern uint32_t kvm_hygon_coco_ext_inuse;
+
+typedef struct CsvBatchCmdList CsvBatchCmdList;
+typedef void (*CsvDestroyCmdNodeFn) (void *data);
+
+struct CsvBatchCmdList {
+    struct kvm_csv_batch_list_node *head;
+    struct kvm_csv_batch_list_node *tail;
+    CsvDestroyCmdNodeFn destroy_fn;
+};
+
+int csv_queue_outgoing_page(uint8_t *ptr, uint32_t sz, uint64_t addr);
+int csv_save_queued_outgoing_pages(QEMUFile *f, uint64_t *bytes_sent);
+int csv_queue_incoming_page(QEMUFile *f, uint8_t *ptr);
+int csv_load_queued_incoming_pages(QEMUFile *f);
+int csv_save_outgoing_cpu_state(QEMUFile *f, uint64_t *bytes_sent);
+int csv_load_incoming_cpu_state(QEMUFile *f);
+
+/* CSV3 */
+struct dma_map_region {
+    uint64_t start, size;
+    QTAILQ_ENTRY(dma_map_region) list;
+};
+
+#define CSV3_OUTGOING_PAGE_WINDOW_SIZE (512 * TARGET_PAGE_SIZE)
+
+struct guest_addr_entry {
+    uint64_t share:    1;
+    uint64_t reserved: 11;
+    uint64_t gfn:      52;
+};
+
+struct guest_hva_entry {
+    uint64_t hva;
+};
+
+struct Csv3GuestState {
+    uint32_t policy;
+    int sev_fd;
+    void *state;
+    int (*sev_ioctl)(int fd, int cmd, void *data, int *error);
+    const char *(*fw_error_to_str)(int code);
+    QTAILQ_HEAD(, dma_map_region) dma_map_regions_list;
+    QemuMutex dma_map_regions_list_mutex;
+    gchar *send_packet_hdr;
+    size_t send_packet_hdr_len;
+    struct guest_hva_entry *guest_hva_data;
+    struct guest_addr_entry *guest_addr_data;
+    size_t guest_addr_len;
+
+    int (*sev_send_start)(QEMUFile *f, uint64_t *bytes_sent);
+    int (*sev_receive_start)(QEMUFile *f);
+};
+
+typedef struct Csv3GuestState Csv3GuestState;
+
+extern struct Csv3GuestState csv3_guest;
+extern struct ConfidentialGuestMemoryEncryptionOps csv3_memory_encryption_ops;
+extern int csv3_init(uint32_t policy, int fd, void *state, struct sev_ops *ops);
+extern int csv3_launch_encrypt_vmcb(void);
+
+int csv3_load_data(uint64_t gpa, uint8_t *ptr, uint64_t len, Error **errp);
+
+int csv3_shared_region_dma_map(uint64_t start, uint64_t end);
+void csv3_shared_region_dma_unmap(uint64_t start, uint64_t end);
+void csv3_shared_region_release(uint64_t gpa, uint32_t num_pages);
+int csv3_load_incoming_page(QEMUFile *f, uint8_t *ptr);
+int csv3_load_incoming_context(QEMUFile *f);
+int csv3_queue_outgoing_page(uint8_t *ptr, uint32_t sz, uint64_t addr);
+int csv3_save_queued_outgoing_pages(QEMUFile *f, uint64_t *bytes_sent);
+int csv3_save_outgoing_context(QEMUFile *f, uint64_t *bytes_sent);
+
+int csv3_set_guest_private_memory(Error **errp);
+
+#endif
diff --git a/target/i386/helper.c b/target/i386/helper.c
index 2070dd0dda1f4c8e740698779cd79a038c42ffe1..1da7a7d315c3e61cdfc4d97526834db3eb566661 100644
--- a/target/i386/helper.c
+++ b/target/i386/helper.c
@@ -219,6 +219,10 @@ void cpu_x86_update_cr4(CPUX86State *env, uint32_t new_cr4)
         new_cr4 &= ~CR4_PKS_MASK;
     }
 
+    if (!(env->features[FEAT_7_1_EAX] & CPUID_7_1_EAX_LAM)) {
+        new_cr4 &= ~CR4_LAM_SUP_MASK;
+    }
+
     env->cr[4] = new_cr4;
     env->hflags = hflags;
diff --git a/target/i386/host-cpu.c b/target/i386/host-cpu.c
index 92ecb7254b8363de8c82f4800573ba990381e2ce..07738bf85740e8785a207cbd32cb6744eb78a699 100644
--- a/target/i386/host-cpu.c
+++ b/target/i386/host-cpu.c
@@ -13,6 +13,7 @@
 #include "qapi/error.h"
 #include "qemu/error-report.h"
 #include "sysemu/sysemu.h"
+#include "hw/boards.h"
 
 /* Note: Only safe for use on x86(-64) hosts */
 static uint32_t host_cpu_phys_bits(void)
@@ -57,14 +58,14 @@ static uint32_t host_cpu_adjust_phys_bits(X86CPU *cpu)
     uint32_t phys_bits = cpu->phys_bits;
     static bool warned;
 
-    /*
-     * Print a warning if the user set it to a value that's not the
-     * host value.
-     */
-    if (phys_bits != host_phys_bits && phys_bits != 0 &&
+    /* adjust x86 cpu phys_bits according to ram_size. */
+    x86_cpu_adjuest_by_ram_size(current_machine->ram_size, cpu);
+
+    /* Print a warning if the host value is less than the user-set value. */
+    if (phys_bits > host_phys_bits && phys_bits != 0 &&
         !warned) {
         warn_report("Host physical bits (%u)"
-                    " does not match phys-bits property (%u)",
+                    " is less than phys-bits property (%u)",
                     host_phys_bits, phys_bits);
         warned = true;
     }
diff --git a/target/i386/hvf/x86_task.c b/target/i386/hvf/x86_task.c
index f09bfbdda5bebbdfc21d5a6a0a8c6d46029d29fd..cdea2ea69d97486dd4b282bc2469c13f66906444 100644
--- a/target/i386/hvf/x86_task.c
+++ b/target/i386/hvf/x86_task.c
@@ -122,7 +122,6 @@ void vmx_handle_task_switch(CPUState *cpu, x68_segment_selector tss_sel, int rea
     load_regs(cpu);
 
     struct x86_segment_descriptor curr_tss_desc, next_tss_desc;
-    int ret;
     x68_segment_selector old_tss_sel = vmx_read_segment_selector(cpu, R_TR);
     uint64_t old_tss_base = vmx_read_segment_base(cpu, R_TR);
     uint32_t desc_limit;
@@ -138,7 +137,7 @@ void vmx_handle_task_switch(CPUState *cpu, x68_segment_selector tss_sel, int rea
     if (reason == TSR_IDT_GATE && gate_valid) {
         int dpl;
 
-        ret = x86_read_call_gate(cpu, &task_gate_desc, gate);
+        x86_read_call_gate(cpu, &task_gate_desc, gate);
 
         dpl = task_gate_desc.dpl;
         x68_segment_selector cs = vmx_read_segment_selector(cpu, R_CS);
@@ -167,11 +166,12 @@ void vmx_handle_task_switch(CPUState *cpu, x68_segment_selector tss_sel, int rea
         x86_write_segment_descriptor(cpu, &next_tss_desc, tss_sel);
     }
 
-    if (next_tss_desc.type & 8)
-        ret = task_switch_32(cpu, tss_sel, old_tss_sel, old_tss_base, &next_tss_desc);
-    else
+    if (next_tss_desc.type & 8) {
+        task_switch_32(cpu, tss_sel, old_tss_sel, old_tss_base, &next_tss_desc);
+    } else {
         //ret = task_switch_16(cpu, tss_sel, old_tss_sel, old_tss_base, &next_tss_desc);
         VM_PANIC("task_switch_16");
+    }
 
     macvm_set_cr0(cpu->accel->fd, rvmcs(cpu->accel->fd, VMCS_GUEST_CR0) | CR0_TS_MASK);
diff --git a/target/i386/kvm/csv-stub.c b/target/i386/kvm/csv-stub.c
new file mode 100644
index 0000000000000000000000000000000000000000..8662d33206076c6686fb0e3bb65e6b4eba60ad61
--- /dev/null
+++ b/target/i386/kvm/csv-stub.c
@@ -0,0 +1,19 @@
+/*
+ * QEMU CSV stub
+ *
+ * Copyright Hygon Info Technologies Ltd. 2024
+ *
+ * Authors:
+ *      Han Liyang
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "csv.h"
+
+bool csv_kvm_cpu_reset_inhibit;
+uint32_t kvm_hygon_coco_ext;
+uint32_t kvm_hygon_coco_ext_inuse;
diff --git a/target/i386/kvm/kvm-cpu.c b/target/i386/kvm/kvm-cpu.c
index 9c791b7b0520089c15404eb1a2826c932b80d1ee..a3bc8d8f832c1358270846b473769d872a14463b 100644
--- a/target/i386/kvm/kvm-cpu.c
+++ b/target/i386/kvm/kvm-cpu.c
@@ -18,10 +18,32 @@
 #include "kvm_i386.h"
 #include "hw/core/accel-cpu.h"
 
+static void kvm_set_guest_phys_bits(CPUState *cs)
+{
+    X86CPU *cpu = X86_CPU(cs);
+    uint32_t eax, guest_phys_bits;
+
+    eax = kvm_arch_get_supported_cpuid(cs->kvm_state, 0x80000008, 0, R_EAX);
+    guest_phys_bits = (eax >> 16) & 0xff;
+    if (!guest_phys_bits) {
+        return;
+    }
+    cpu->guest_phys_bits = guest_phys_bits;
+    if (cpu->guest_phys_bits > cpu->phys_bits) {
+        cpu->guest_phys_bits = cpu->phys_bits;
+    }
+
+    if (cpu->host_phys_bits && cpu->host_phys_bits_limit &&
+        cpu->guest_phys_bits > cpu->host_phys_bits_limit) {
+        cpu->guest_phys_bits = cpu->host_phys_bits_limit;
+    }
+}
+
 static bool kvm_cpu_realizefn(CPUState *cs, Error **errp)
 {
     X86CPU *cpu = X86_CPU(cs);
     CPUX86State *env = &cpu->env;
+    bool ret;
 
     /*
      * The realize order is important, since x86_cpu_realize() checks if
@@ -32,13 +54,15 @@ static bool kvm_cpu_realizefn(CPUState *cs, Error **errp)
      *
      * realize order:
      *
-     * x86_cpu_realize():
-     *  -> x86_cpu_expand_features()
-     *  -> cpu_exec_realizefn():
-     *     -> accel_cpu_common_realize()
-     *        kvm_cpu_realizefn() -> host_cpu_realizefn()
-     *  -> cpu_common_realizefn()
-     *  -> check/update ucode_rev, phys_bits, mwait
+     * x86_cpu_realizefn():
+     *  x86_cpu_expand_features()
+     *  cpu_exec_realizefn():
+     *   accel_cpu_common_realize()
+     *    kvm_cpu_realizefn()
+     *     host_cpu_realizefn()
+     *     kvm_set_guest_phys_bits()
+     *    check/update ucode_rev, phys_bits, guest_phys_bits, mwait
+     *  cpu_common_realizefn() (via xcc->parent_realize)
      */
     if (cpu->max_features) {
         if (enable_cpu_pm && kvm_has_waitpkg()) {
@@ -50,7 +74,17 @@ static bool kvm_cpu_realizefn(CPUState *cs, Error **errp)
             MSR_IA32_UCODE_REV);
         }
     }
-    return host_cpu_realizefn(cs, errp);
+    ret = host_cpu_realizefn(cs, errp);
+    if (!ret) {
+        return ret;
+    }
+
+    if ((env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_LM) &&
+        cpu->guest_phys_bits == -1) {
+        kvm_set_guest_phys_bits(cs);
+    }
+
+    return true;
 }
 
 static bool lmce_supported(void)
@@ -103,7 +137,7 @@ static void kvm_cpu_xsave_init(void)
         if (!esa->size) {
             continue;
         }
-        if ((x86_cpu_get_supported_feature_word(esa->feature, false) & esa->bits)
+        if ((x86_cpu_get_supported_feature_word(NULL, esa->feature) & esa->bits)
             != esa->bits) {
            continue;
         }
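kvm_set_guest_phys_bits() reads CPUID leaf 0x80000008: EAX[7:0] is the host physical address width, EAX[15:8] the linear address width, and EAX[23:16] the guest physical address width (zero meaning "same as host"). A hypothetical standalone decoder for those fields, using GCC's cpuid.h rather than KVM's filtered view:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
    unsigned int eax, ebx, ecx, edx;

    if (!__get_cpuid(0x80000008, &eax, &ebx, &ecx, &edx)) {
        return 1;                       /* leaf not available */
    }
    printf("host phys bits:  %u\n", eax & 0xff);
    printf("linear bits:     %u\n", (eax >> 8) & 0xff);
    printf("guest phys bits: %u\n", (eax >> 16) & 0xff);  /* 0 = same as host */
    return 0;
}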
diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index 4ce80555b45c14fe8f683c1ee6c5bde5db216a9b..3a88e65635698ec3043a90b05cd984a2e93e7a58 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -32,6 +32,7 @@
 #include "sysemu/runstate.h"
 #include "kvm_i386.h"
 #include "sev.h"
+#include "csv.h"
 #include "xen-emu.h"
 #include "hyperv.h"
 #include "hyperv-proto.h"
@@ -148,6 +149,7 @@ static int has_xcrs;
 static int has_sregs2;
 static int has_exception_payload;
 static int has_triple_fault_event;
+static int has_map_gpa_range;
 
 static bool has_msr_mcg_ext_ctl;
 
@@ -1894,10 +1896,12 @@ int kvm_arch_init_vcpu(CPUState *cs)
             int times;
 
             c->function = i;
-            c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC |
-                       KVM_CPUID_FLAG_STATE_READ_NEXT;
             cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
             times = c->eax & 0xff;
+            if (times > 1) {
+                c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC |
+                           KVM_CPUID_FLAG_STATE_READ_NEXT;
+            }
 
             for (j = 1; j < times; ++j) {
                 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
@@ -1914,6 +1918,7 @@ int kvm_arch_init_vcpu(CPUState *cs)
             }
         case 0x1f:
             if (env->nr_dies < 2) {
+                cpuid_i--;
                 break;
             }
             /* fallthrough */
@@ -1921,14 +1926,6 @@ int kvm_arch_init_vcpu(CPUState *cs)
         case 0xb:
         case 0xd:
             for (j = 0; ; j++) {
-                if (i == 0xd && j == 64) {
-                    break;
-                }
-
-                if (i == 0x1f && j == 64) {
-                    break;
-                }
-
                 c->function = i;
                 c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
                 c->index = j;
@@ -1944,7 +1941,12 @@ int kvm_arch_init_vcpu(CPUState *cs)
                     break;
                 }
                 if (i == 0xd && c->eax == 0) {
-                    continue;
+                    if (j < 63) {
+                        continue;
+                    } else {
+                        cpuid_i--;
+                        break;
+                    }
                 }
                 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
                     fprintf(stderr, "cpuid_data is full, no space for "
@@ -1954,7 +1956,6 @@ int kvm_arch_init_vcpu(CPUState *cs)
                 c = &cpuid_data.entries[cpuid_i++];
             }
             break;
-        case 0x7:
         case 0x12:
             for (j = 0; ; j++) {
                 c->function = i;
@@ -1974,6 +1975,7 @@ int kvm_arch_init_vcpu(CPUState *cs)
                 c = &cpuid_data.entries[cpuid_i++];
             }
             break;
+        case 0x7:
         case 0x14:
         case 0x1d:
         case 0x1e: {
@@ -2190,6 +2192,17 @@ int kvm_arch_init_vcpu(CPUState *cs)
         c->eax = MAX(c->eax, KVM_CPUID_SIGNATURE | 0x10);
     }
 
+    if (sev_enabled()) {
+        c = cpuid_find_entry(&cpuid_data.cpuid,
+                             KVM_CPUID_FEATURES | kvm_base, 0);
+        if (c) {
+            c->eax |= (1 << KVM_FEATURE_MIGRATION_CONTROL);
+            if (has_map_gpa_range) {
+                c->eax |= (1 << KVM_FEATURE_HC_MAP_GPA_RANGE);
+            }
+        }
+    }
+
     cpuid_data.cpuid.nent = cpuid_i;
     cpuid_data.cpuid.padding = 0;
@@ -2257,6 +2270,11 @@ void kvm_arch_reset_vcpu(X86CPU *cpu)
         env->mp_state = KVM_MP_STATE_RUNNABLE;
     }
 
+    if (cpu_is_bsp(cpu) &&
+        sev_enabled() && has_map_gpa_range) {
+        sev_remove_shared_regions_list(0, -1);
+    }
+
     /* enabled by default */
     env->poll_control_msr = 1;
 
@@ -2475,6 +2493,32 @@ static bool kvm_rdmsr_core_thread_count(X86CPU *cpu, uint32_t msr,
     return true;
 }
 
+/*
+ * Currently this exit is only used by SEV guests for
+ * MSR_KVM_MIGRATION_CONTROL to indicate if the guest
+ * is ready for migration.
+ */
+static uint64_t msr_kvm_migration_control;
+
+static bool kvm_rdmsr_kvm_migration_control(X86CPU *cpu, uint32_t msr,
+                                            uint64_t *val)
+{
+    *val = msr_kvm_migration_control;
+
+    return true;
+}
+
+static bool kvm_wrmsr_kvm_migration_control(X86CPU *cpu, uint32_t msr,
+                                            uint64_t val)
+{
+    msr_kvm_migration_control = val;
+
+    if (val == KVM_MIGRATION_READY)
+        sev_del_migrate_blocker();
+
+    return true;
+}
+
 static Notifier smram_machine_done;
 static KVMMemoryListener smram_listener;
 static AddressSpace smram_address_space;
@@ -2583,6 +2627,34 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
 #endif
     }
 
+    has_map_gpa_range = kvm_check_extension(s, KVM_CAP_EXIT_HYPERCALL);
+    if (has_map_gpa_range) {
+        ret = kvm_vm_enable_cap(s, KVM_CAP_EXIT_HYPERCALL, 0,
+                                KVM_EXIT_HYPERCALL_VALID_MASK);
+        if (ret < 0) {
+            error_report("kvm: Failed to enable MAP_GPA_RANGE cap: %s",
+                         strerror(-ret));
+            return ret;
+        }
+    }
+
+    if (is_hygon_cpu()) {
+        /* check and enable Hygon coco extensions */
+        kvm_hygon_coco_ext = (uint32_t)kvm_vm_check_extension(s,
+                                                KVM_CAP_HYGON_COCO_EXT);
+        if (kvm_hygon_coco_ext) {
+            ret = kvm_vm_enable_cap(s, KVM_CAP_HYGON_COCO_EXT, 0,
+                                    (uint64_t)kvm_hygon_coco_ext);
+            if (ret == -EINVAL) {
+                error_report("kvm: Failed to enable KVM_CAP_HYGON_COCO_EXT cap: %s",
+                             strerror(-ret));
+                kvm_hygon_coco_ext_inuse = 0;
+            } else {
+                kvm_hygon_coco_ext_inuse = (uint32_t)ret;
+            }
+        }
+    }
+
     ret = kvm_get_supported_msrs(s);
     if (ret < 0) {
         return ret;
@@ -2711,6 +2783,15 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
                          strerror(-ret));
             exit(1);
         }
+
+        r = kvm_filter_msr(s, MSR_KVM_MIGRATION_CONTROL,
+                           kvm_rdmsr_kvm_migration_control,
+                           kvm_wrmsr_kvm_migration_control);
+        if (!r) {
+            error_report("Could not install MSR_KVM_MIGRATION_CONTROL handler: %s",
+                         strerror(-ret));
+            exit(1);
+        }
     }
 
     return 0;
@@ -3172,7 +3253,14 @@ static void kvm_msr_entry_add_vmx(X86CPU *cpu, FeatureWordArray f)
     kvm_msr_entry_add(cpu, MSR_IA32_VMX_CR4_FIXED0, CR4_VMXE_MASK);
 
-    if (f[FEAT_VMX_SECONDARY_CTLS] & VMX_SECONDARY_EXEC_TSC_SCALING) {
+    if (f[FEAT_7_1_EAX] & CPUID_7_1_EAX_FRED) {
+        /* FRED injected-event data (0x2052). */
+        kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMCS_ENUM, 0x52);
+    } else if (f[FEAT_VMX_EXIT_CTLS] &
+               VMX_VM_EXIT_ACTIVATE_SECONDARY_CONTROLS) {
+        /* Secondary VM-exit controls (0x2044). */
+        kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMCS_ENUM, 0x44);
+    } else if (f[FEAT_VMX_SECONDARY_CTLS] & VMX_SECONDARY_EXEC_TSC_SCALING) {
         /* TSC multiplier (0x2032). */
         kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMCS_ENUM, 0x32);
     } else {
@@ -3309,6 +3397,17 @@ static int kvm_put_msrs(X86CPU *cpu, int level)
         kvm_msr_entry_add(cpu, MSR_KERNELGSBASE, env->kernelgsbase);
         kvm_msr_entry_add(cpu, MSR_FMASK, env->fmask);
         kvm_msr_entry_add(cpu, MSR_LSTAR, env->lstar);
+        if (env->features[FEAT_7_1_EAX] & CPUID_7_1_EAX_FRED) {
+            kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP0, env->fred_rsp0);
+            kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP1, env->fred_rsp1);
+            kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP2, env->fred_rsp2);
+            kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP3, env->fred_rsp3);
+            kvm_msr_entry_add(cpu, MSR_IA32_FRED_STKLVLS, env->fred_stklvls);
+            kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP1, env->fred_ssp1);
+            kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP2, env->fred_ssp2);
+            kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP3, env->fred_ssp3);
+            kvm_msr_entry_add(cpu, MSR_IA32_FRED_CONFIG, env->fred_config);
+        }
     }
 #endif
@@ -3561,6 +3660,10 @@ static int kvm_put_msrs(X86CPU *cpu, int level)
         }
     }
 
+    if (sev_kvm_has_msr_ghcb) {
+        kvm_msr_entry_add(cpu, MSR_AMD64_SEV_ES_GHCB, env->ghcb_gpa);
+    }
+
     return kvm_buf_set_msrs(cpu);
 }
 
@@ -3781,6 +3884,17 @@ static int kvm_get_msrs(X86CPU *cpu)
         kvm_msr_entry_add(cpu, MSR_KERNELGSBASE, 0);
         kvm_msr_entry_add(cpu, MSR_FMASK, 0);
         kvm_msr_entry_add(cpu, MSR_LSTAR, 0);
+        if (env->features[FEAT_7_1_EAX] & CPUID_7_1_EAX_FRED) {
+            kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP0, 0);
+            kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP1, 0);
+            kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP2, 0);
+            kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP3, 0);
+            kvm_msr_entry_add(cpu, MSR_IA32_FRED_STKLVLS, 0);
+            kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP1, 0);
+            kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP2, 0);
+            kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP3, 0);
+            kvm_msr_entry_add(cpu, MSR_IA32_FRED_CONFIG, 0);
+        }
     }
 #endif
     kvm_msr_entry_add(cpu, MSR_KVM_SYSTEM_TIME, 0);
@@ -3935,6 +4049,10 @@ static int kvm_get_msrs(X86CPU *cpu)
         }
     }
 
+    if (sev_kvm_has_msr_ghcb) {
+        kvm_msr_entry_add(cpu, MSR_AMD64_SEV_ES_GHCB, 0);
+    }
+
     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, cpu->kvm_msr_buf);
     if (ret < 0) {
         return ret;
@@ -4002,6 +4120,33 @@ static int kvm_get_msrs(X86CPU *cpu)
         case MSR_LSTAR:
             env->lstar = msrs[i].data;
             break;
+        case MSR_IA32_FRED_RSP0:
+            env->fred_rsp0 = msrs[i].data;
+            break;
+        case MSR_IA32_FRED_RSP1:
+            env->fred_rsp1 = msrs[i].data;
+            break;
+        case MSR_IA32_FRED_RSP2:
+            env->fred_rsp2 = msrs[i].data;
+            break;
+        case MSR_IA32_FRED_RSP3:
+            env->fred_rsp3 = msrs[i].data;
+            break;
+        case MSR_IA32_FRED_STKLVLS:
+            env->fred_stklvls = msrs[i].data;
+            break;
+        case MSR_IA32_FRED_SSP1:
+            env->fred_ssp1 = msrs[i].data;
+            break;
+        case MSR_IA32_FRED_SSP2:
+            env->fred_ssp2 = msrs[i].data;
+            break;
+        case MSR_IA32_FRED_SSP3:
+            env->fred_ssp3 = msrs[i].data;
+            break;
+        case MSR_IA32_FRED_CONFIG:
+            env->fred_config = msrs[i].data;
+            break;
 #endif
         case MSR_IA32_TSC:
             env->tsc = msrs[i].data;
@@ -4255,6 +4400,9 @@ static int kvm_get_msrs(X86CPU *cpu)
         case MSR_ARCH_LBR_INFO_0 ... MSR_ARCH_LBR_INFO_0 + 31:
             env->lbr_records[index - MSR_ARCH_LBR_INFO_0].info = msrs[i].data;
             break;
+        case MSR_AMD64_SEV_ES_GHCB:
+            env->ghcb_gpa = msrs[i].data;
+            break;
         }
     }
 
@@ -4935,6 +5083,31 @@ static int kvm_handle_tpr_access(X86CPU *cpu)
     return 1;
 }
 
+static int kvm_handle_exit_hypercall(X86CPU *cpu, struct kvm_run *run)
+{
+    /*
+     * Currently this exit is only used by SEV guests for
+     * guest page encryption status tracking.
+     */
+    if (run->hypercall.nr == KVM_HC_MAP_GPA_RANGE) {
+        unsigned long enc = run->hypercall.args[2];
+        unsigned long gpa = run->hypercall.args[0];
+        unsigned long npages = run->hypercall.args[1];
+        unsigned long gfn_start = gpa >> TARGET_PAGE_BITS;
+        unsigned long gfn_end = gfn_start + npages;
+
+        if (enc) {
+            sev_remove_shared_regions_list(gfn_start, gfn_end);
+            csv3_shared_region_dma_unmap(gpa, gfn_end << TARGET_PAGE_BITS);
+            csv3_shared_region_release(gpa, npages);
+        } else {
+            sev_add_shared_regions_list(gfn_start, gfn_end);
+            csv3_shared_region_dma_map(gpa, gfn_end << TARGET_PAGE_BITS);
+        }
+    }
+    return 0;
+}
+
 int kvm_arch_insert_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp)
 {
     static const uint8_t int3 = 0xcc;
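The KVM_HC_MAP_GPA_RANGE layout used above (args[0] = GPA, args[1] = page count, args[2] = attributes carrying the encrypted bit) is easy to get wrong by one shift. A tiny self-contained check of the gfn arithmetic this handler relies on:

#include <assert.h>
#include <stdint.h>

#define PAGE_BITS 12

/* Mirror of the conversion done in kvm_handle_exit_hypercall(). */
static void gpa_range_to_gfns(uint64_t gpa, uint64_t npages,
                              uint64_t *gfn_start, uint64_t *gfn_end)
{
    *gfn_start = gpa >> PAGE_BITS;
    *gfn_end = *gfn_start + npages;   /* exclusive upper bound */
}

int main(void)
{
    uint64_t s, e;

    gpa_range_to_gfns(0x100000, 16, &s, &e);
    assert(s == 0x100 && e == 0x110);

    /* The range is half-open: [s, e) never includes e itself. */
    gpa_range_to_gfns(0, 1, &s, &e);
    assert(s == 0 && e == 1);
    return 0;
}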
@@ -5358,6 +5531,9 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
         ret = kvm_xen_handle_exit(cpu, &run->xen);
         break;
 #endif
+    case KVM_EXIT_HYPERCALL:
+        ret = kvm_handle_exit_hypercall(cpu, run);
+        break;
     default:
         fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason);
         ret = -1;
@@ -5524,9 +5700,11 @@ void kvm_update_msi_routes_all(void *private, bool global,
 {
     int cnt = 0, vector;
     MSIRouteEntry *entry;
+    KVMRouteChange c;
     MSIMessage msg;
     PCIDevice *dev;
 
+    c = kvm_irqchip_begin_route_changes(kvm_state);
     /* TODO: explicit route update */
     QLIST_FOREACH(entry, &msi_route_list, list) {
         cnt++;
@@ -5543,9 +5721,9 @@ void kvm_update_msi_routes_all(void *private, bool global,
              */
             continue;
         }
-        kvm_irqchip_update_msi_route(kvm_state, entry->virq, msg, dev);
+        kvm_irqchip_update_msi_route(&c, entry->virq, msg, dev);
     }
-    kvm_irqchip_commit_routes(kvm_state);
+    kvm_irqchip_commit_route_changes(&c);
     trace_kvm_x86_update_msi_routes(cnt);
 }
 
@@ -5610,6 +5788,9 @@ bool kvm_has_waitpkg(void)
 
 bool kvm_arch_cpu_check_are_resettable(void)
 {
+    if (is_hygon_cpu())
+        return !csv_kvm_cpu_reset_inhibit;
+
     return !sev_es_enabled();
 }
 
diff --git a/target/i386/kvm/meson.build b/target/i386/kvm/meson.build
index 84d9143e60296d4388a57184b29693231e01205c..3c3f8cf93c1bfe17638beae51942163fc7f4003d 100644
--- a/target/i386/kvm/meson.build
+++ b/target/i386/kvm/meson.build
@@ -8,6 +8,7 @@ i386_kvm_ss.add(files(
 i386_kvm_ss.add(when: 'CONFIG_XEN_EMU', if_true: files('xen-emu.c'))
 
 i386_kvm_ss.add(when: 'CONFIG_SEV', if_false: files('sev-stub.c'))
+i386_kvm_ss.add(when: 'CONFIG_CSV', if_false: files('csv-stub.c'))
 
 i386_system_ss.add(when: 'CONFIG_HYPERV', if_true: files('hyperv.c'), if_false: files('hyperv-stub.c'))
 
diff --git a/target/i386/kvm/sev-stub.c b/target/i386/kvm/sev-stub.c
index 1be5341e8a6a0e82997d88679f9cecce025d9e71..a0aac1117fbaca51154fc85553ea242cbc8cd647 100644
--- a/target/i386/kvm/sev-stub.c
+++ b/target/i386/kvm/sev-stub.c
@@ -14,8 +14,25 @@
 #include "qemu/osdep.h"
 #include "sev.h"
 
+bool sev_kvm_has_msr_ghcb;
+
 int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
 {
     /* If we get here, cgs must be some non-SEV thing */
     return 0;
 }
+
+int sev_remove_shared_regions_list(unsigned long gfn_start,
+                                   unsigned long gfn_end)
+{
+    return 0;
+}
+
+int sev_add_shared_regions_list(unsigned long gfn_start, unsigned long gfn_end)
+{
+    return 0;
+}
+
+void sev_del_migrate_blocker(void)
+{
+}
diff --git a/target/i386/machine.c b/target/i386/machine.c
index a1041ef828cb2670e913738a44853b7eeadd1e7e..7cbfbc0efb051c899d59582f82eca8975e1d20bf 100644
--- a/target/i386/machine.c
+++ b/target/i386/machine.c
@@ -1544,6 +1544,33 @@ static const VMStateDescription vmstate_msr_xfd = {
 };
 
 #ifdef TARGET_X86_64
+static bool intel_fred_msrs_needed(void *opaque)
+{
+    X86CPU *cpu = opaque;
+    CPUX86State *env = &cpu->env;
+
+    return !!(env->features[FEAT_7_1_EAX] & CPUID_7_1_EAX_FRED);
+}
+
+static const VMStateDescription vmstate_msr_fred = {
+    .name = "cpu/fred",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = intel_fred_msrs_needed,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT64(env.fred_rsp0, X86CPU),
+        VMSTATE_UINT64(env.fred_rsp1, X86CPU),
+        VMSTATE_UINT64(env.fred_rsp2, X86CPU),
+        VMSTATE_UINT64(env.fred_rsp3, X86CPU),
+        VMSTATE_UINT64(env.fred_stklvls, X86CPU),
+        VMSTATE_UINT64(env.fred_ssp1, X86CPU),
+        VMSTATE_UINT64(env.fred_ssp2, X86CPU),
+        VMSTATE_UINT64(env.fred_ssp3, X86CPU),
+        VMSTATE_UINT64(env.fred_config, X86CPU),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
 static bool amx_xtile_needed(void *opaque)
 {
     X86CPU *cpu = opaque;
@@ -1605,6 +1632,27 @@ static const VMStateDescription vmstate_triple_fault = {
     }
 };
 
+#if defined(CONFIG_KVM) && defined(TARGET_X86_64)
+static bool msr_ghcb_gpa_needed(void *opaque)
+{
+    X86CPU *cpu = opaque;
+    CPUX86State *env = &cpu->env;
+
+    return env->ghcb_gpa != 0;
+}
+
+static const VMStateDescription vmstate_msr_ghcb_gpa = {
+    .name = "cpu/svm_msr_ghcb_gpa",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = msr_ghcb_gpa_needed,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT64(env.ghcb_gpa, X86CPU),
+        VMSTATE_END_OF_LIST()
+    }
+};
+#endif
+
 const VMStateDescription vmstate_x86_cpu = {
     .name = "cpu",
     .version_id = 12,
@@ -1747,10 +1795,14 @@ const VMStateDescription vmstate_x86_cpu = {
         &vmstate_pdptrs,
         &vmstate_msr_xfd,
 #ifdef TARGET_X86_64
+        &vmstate_msr_fred,
         &vmstate_amx_xtile,
 #endif
         &vmstate_arch_lbr,
         &vmstate_triple_fault,
+#if defined(CONFIG_KVM) && defined(TARGET_X86_64)
+        &vmstate_msr_ghcb_gpa,
+#endif
         NULL
     }
 };
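Both new blocks follow QEMU's standard VMState subsection pattern: a .needed predicate keeps the wire format backward compatible, because the subsection is only emitted when the feature is actually in use. A minimal sketch of that pattern for a hypothetical device counter (MyDev and the "mydev/counter" name are invented for illustration; this compiles only inside the QEMU tree against "migration/vmstate.h"):

/* Assumes "migration/vmstate.h"; MyDev is a hypothetical struct. */
typedef struct MyDev {
    uint64_t counter;
} MyDev;

static bool my_counter_needed(void *opaque)
{
    MyDev *d = opaque;

    /* Only ship the subsection when there is state worth migrating. */
    return d->counter != 0;
}

static const VMStateDescription vmstate_my_counter = {
    .name = "mydev/counter",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = my_counter_needed,
    .fields = (VMStateField[]) {
        VMSTATE_UINT64(counter, MyDev),
        VMSTATE_END_OF_LIST()
    }
};

A destination QEMU that has never heard of the subsection still accepts streams from sources where .needed returned false, which is what makes the FRED and GHCB additions above safe for cross-version migration.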
diff --git a/target/i386/meson.build b/target/i386/meson.build
index 7c74bfa8591b279e506f64451de406f9d718691d..594a0a6abf7bdc028fa7e94390890c2547ead7db 100644
--- a/target/i386/meson.build
+++ b/target/i386/meson.build
@@ -21,6 +21,7 @@ i386_system_ss.add(files(
   'cpu-sysemu.c',
 ))
 i386_system_ss.add(when: 'CONFIG_SEV', if_true: files('sev.c'), if_false: files('sev-sysemu-stub.c'))
+i386_system_ss.add(when: 'CONFIG_CSV', if_true: files('csv.c'), if_false: files('csv-sysemu-stub.c'))
 
 i386_user_ss = ss.source_set()
 
diff --git a/target/i386/sev.c b/target/i386/sev.c
index 9a71246682584da3204ce2723c0d26b830cb0b16..b4b42fd71650c05751983cd87ccf905abd9ea1bb 100644
--- a/target/i386/sev.c
+++ b/target/i386/sev.c
@@ -27,10 +27,13 @@
 #include "crypto/hash.h"
 #include "sysemu/kvm.h"
 #include "sev.h"
+#include "csv.h"
 #include "sysemu/sysemu.h"
 #include "sysemu/runstate.h"
 #include "trace.h"
 #include "migration/blocker.h"
+#include "migration/qemu-file.h"
+#include "migration/misc.h"
 #include "qom/object.h"
 #include "monitor/monitor.h"
 #include "monitor/hmp-target.h"
@@ -42,6 +45,11 @@
 #define TYPE_SEV_GUEST "sev-guest"
 OBJECT_DECLARE_SIMPLE_TYPE(SevGuestState, SEV_GUEST)
 
+struct shared_region {
+    unsigned long gfn_start, gfn_end;
+    QTAILQ_ENTRY(shared_region) list;
+};
+
 
 /**
  * SevGuestState:
@@ -64,6 +72,9 @@ struct SevGuestState {
     uint32_t cbitpos;
     uint32_t reduced_phys_bits;
     bool kernel_hashes;
+    char *user_id;
+    char *secret_header_file;
+    char *secret_file;
 
     /* runtime state */
     uint32_t handle;
@@ -73,10 +84,27 @@ struct SevGuestState {
     int sev_fd;
     SevState state;
     gchar *measurement;
+    guchar *remote_pdh;
+    size_t remote_pdh_len;
+    guchar *remote_plat_cert;
+    size_t remote_plat_cert_len;
+    guchar *amd_cert;
+    size_t amd_cert_len;
+    gchar *send_packet_hdr;
+    size_t send_packet_hdr_len;
+
+    /* needed by live migration of HYGON CSV2 guest */
+    gchar *send_vmsa_packet_hdr;
+    size_t send_vmsa_packet_hdr_len;
 
     uint32_t reset_cs;
     uint32_t reset_ip;
     bool reset_data_valid;
+
+    QTAILQ_HEAD(, shared_region) shared_regions_list;
+
+    /* linked list used for HYGON CSV */
+    CsvBatchCmdList *csv_batch_cmd_list;
 };
 
 #define DEFAULT_GUEST_POLICY 0x1 /* disable debug */
@@ -127,6 +155,8 @@ QEMU_BUILD_BUG_ON(sizeof(PaddedSevHashTable) % 16 != 0);
 static SevGuestState *sev_guest;
 static Error *sev_mig_blocker;
 
+bool sev_kvm_has_msr_ghcb;
+
 static const char *const sev_fw_errlist[] = {
     [SEV_RET_SUCCESS]                = "",
     [SEV_RET_INVALID_PLATFORM_STATE] = "Platform state is invalid",
@@ -157,6 +187,29 @@ static const char *const sev_fw_errlist[] = {
 
 #define SEV_FW_MAX_ERROR      ARRAY_SIZE(sev_fw_errlist)
 
+#define SEV_FW_BLOB_MAX_SIZE  0x4000 /* 16KB */
+
+#define SHARED_REGION_LIST_CONT 0x1
+#define SHARED_REGION_LIST_END  0x2
+
+#define ENCRYPTED_CPU_STATE_CONT 0x1
+#define ENCRYPTED_CPU_STATE_END  0x2
+
+static struct ConfidentialGuestMemoryEncryptionOps sev_memory_encryption_ops = {
+    .save_setup = sev_save_setup,
+    .save_outgoing_page = sev_save_outgoing_page,
+    .load_incoming_page = sev_load_incoming_page,
+    .is_gfn_in_unshared_region = sev_is_gfn_in_unshared_region,
+    .save_outgoing_shared_regions_list = sev_save_outgoing_shared_regions_list,
+    .load_incoming_shared_regions_list = sev_load_incoming_shared_regions_list,
+    .queue_outgoing_page = csv_queue_outgoing_page,
+    .save_queued_outgoing_pages = csv_save_queued_outgoing_pages,
+    .queue_incoming_page = csv_queue_incoming_page,
+    .load_queued_incoming_pages = csv_load_queued_incoming_pages,
+    .save_outgoing_cpu_state = csv_save_outgoing_cpu_state,
+    .load_incoming_cpu_state = csv_load_incoming_cpu_state,
+};
+
 static int
 sev_ioctl(int fd, int cmd, void *data, int *error)
 {
@@ -323,6 +376,54 @@ sev_guest_set_dh_cert_file(Object *obj, const char *value, Error **errp)
     s->dh_cert_file = g_strdup(value);
 }
 
+static char *
+sev_guest_get_user_id(Object *obj, Error **errp)
+{
+    SevGuestState *s = SEV_GUEST(obj);
+
+    return g_strdup(s->user_id);
+}
+
+static void
+sev_guest_set_user_id(Object *obj, const char *value, Error **errp)
+{
+    SevGuestState *s = SEV_GUEST(obj);
+
+    s->user_id = g_strdup(value);
+}
+
+static char *
+sev_guest_get_secret_header_file(Object *obj, Error **errp)
+{
+    SevGuestState *s = SEV_GUEST(obj);
+
+    return g_strdup(s->secret_header_file);
+}
+
+static void
+sev_guest_set_secret_header_file(Object *obj, const char *value, Error **errp)
+{
+    SevGuestState *s = SEV_GUEST(obj);
+
+    s->secret_header_file = g_strdup(value);
+}
+
+static char *
+sev_guest_get_secret_file(Object *obj, Error **errp)
+{
+    SevGuestState *s = SEV_GUEST(obj);
+
+    return g_strdup(s->secret_file);
+}
+
+static void
+sev_guest_set_secret_file(Object *obj, const char *value, Error **errp)
+{
+    SevGuestState *s = SEV_GUEST(obj);
+
+    s->secret_file = g_strdup(value);
+}
+
 static char *
 sev_guest_get_sev_device(Object *obj, Error **errp)
 {
@@ -376,6 +477,21 @@ sev_guest_class_init(ObjectClass *oc, void *data)
                                    sev_guest_set_kernel_hashes);
     object_class_property_set_description(oc, "kernel-hashes",
             "add kernel hashes to guest firmware for measured Linux boot");
+    object_class_property_add_str(oc, "user-id",
+                                  sev_guest_get_user_id,
+                                  sev_guest_set_user_id);
+    object_class_property_set_description(oc, "user-id",
+            "user id of the guest owner");
+    object_class_property_add_str(oc, "secret-header-file",
+                                  sev_guest_get_secret_header_file,
+                                  sev_guest_set_secret_header_file);
+    object_class_property_set_description(oc, "secret-header-file",
+            "header file of the guest owner's secret");
+    object_class_property_add_str(oc, "secret-file",
+                                  sev_guest_get_secret_file,
+                                  sev_guest_set_secret_file);
+    object_class_property_set_description(oc, "secret-file",
+            "file of the guest owner's secret");
 }
 
 static void
@@ -539,7 +655,8 @@ static int sev_get_cpu0_id(int fd, guchar **id, size_t *id_len, Error **errp)
 
     /* query the ID length */
     r = sev_platform_ioctl(fd, SEV_GET_ID2, &get_id2, &err);
-    if (r < 0 && err != SEV_RET_INVALID_LEN) {
+    if (r < 0 && err != SEV_RET_INVALID_LEN &&
+        !(is_hygon_cpu() && err == SEV_RET_INVALID_PARAM)) {
         error_setg(errp, "SEV: Failed to get ID ret=%d fw_err=%d (%s)",
                    r, err, fw_error_to_str(err));
         return 1;
@@ -794,6 +911,9 @@ sev_launch_update_vmsa(SevGuestState *sev)
     return ret;
 }
 
+static int
+csv_load_launch_secret(const char *secret_header_file, const char *secret_file);
+
 static void
 sev_launch_get_measure(Notifier *notifier, void *unused)
 {
@@ -807,8 +927,12 @@ sev_launch_get_measure(Notifier *notifier, void *unused)
     }
 
     if (sev_es_enabled()) {
-        /* measure all the VM save areas before getting launch_measure */
-        ret = sev_launch_update_vmsa(sev);
+        if (csv3_enabled()) {
+            ret = csv3_launch_encrypt_vmcb();
+        } else {
+            /* measure all the VM save areas before getting launch_measure */
+            ret = sev_launch_update_vmsa(sev);
+        }
         if (ret) {
             exit(1);
         }
@@ -840,6 +964,15 @@ sev_launch_get_measure(Notifier *notifier, void *unused)
     /* encode the measurement value and emit the event */
     sev->measurement = g_base64_encode(data, measurement.len);
     trace_kvm_sev_launch_measurement(sev->measurement);
+
+    /* Hygon CSV will automatically load the guest owner's secret */
+    if (is_hygon_cpu()) {
+        if (sev->secret_header_file &&
+            strlen(sev->secret_header_file) &&
+            sev->secret_file &&
+            strlen(sev->secret_file))
+            csv_load_launch_secret(sev->secret_header_file, sev->secret_file);
+    }
 }
 
 static char *sev_get_launch_measurement(void)
@@ -894,18 +1027,142 @@ sev_launch_finish(SevGuestState *sev)
     migrate_add_blocker(&sev_mig_blocker, &error_fatal);
 }
 
+void
+sev_del_migrate_blocker(void)
+{
+    migrate_del_blocker(&sev_mig_blocker);
+}
+
+static int
+sev_receive_finish(SevGuestState *s)
+{
+    int error, ret = 1;
+
+    trace_kvm_sev_receive_finish();
+    ret = sev_ioctl(s->sev_fd, KVM_SEV_RECEIVE_FINISH, 0, &error);
+    if (ret) {
+        error_report("%s: RECEIVE_FINISH ret=%d fw_error=%d '%s'",
+                     __func__, ret, error, fw_error_to_str(error));
+        goto err;
+    }
+
+    sev_set_guest_state(s, SEV_STATE_RUNNING);
+err:
+    return ret;
+}
+
 static void
 sev_vm_state_change(void *opaque, bool running, RunState state)
 {
     SevGuestState *sev = opaque;
 
     if (running) {
-        if (!sev_check_state(sev, SEV_STATE_RUNNING)) {
+        if (sev_check_state(sev, SEV_STATE_RECEIVE_UPDATE)) {
+            sev_receive_finish(sev);
+        } else if (!sev_check_state(sev, SEV_STATE_RUNNING)) {
             sev_launch_finish(sev);
         }
     }
 }
 
+static inline bool check_blob_length(size_t value)
+{
+    if (value > SEV_FW_BLOB_MAX_SIZE) {
+        error_report("invalid length max=%d got=%ld",
+                     SEV_FW_BLOB_MAX_SIZE, value);
+        return false;
+    }
+
+    return true;
+}
+
+int sev_save_setup(const char *pdh, const char *plat_cert,
+                   const char *amd_cert)
+{
+    SevGuestState *s = sev_guest;
+
+    if (is_hygon_cpu()) {
+        if (sev_read_file_base64(pdh, &s->remote_pdh,
+                                 &s->remote_pdh_len) < 0) {
+            goto error;
+        }
+    } else {
+        s->remote_pdh = g_base64_decode(pdh, &s->remote_pdh_len);
+    }
+    if (!check_blob_length(s->remote_pdh_len)) {
+        goto error;
+    }
+
+    if (is_hygon_cpu()) {
+        if (sev_read_file_base64(plat_cert, &s->remote_plat_cert,
+                                 &s->remote_plat_cert_len) < 0) {
+            goto error;
+        }
+    } else {
+        s->remote_plat_cert = g_base64_decode(plat_cert,
+                                              &s->remote_plat_cert_len);
+    }
+    if (!check_blob_length(s->remote_plat_cert_len)) {
+        goto error;
+    }
+
+    if (is_hygon_cpu()) {
+        if (sev_read_file_base64(amd_cert, &s->amd_cert,
+                                 &s->amd_cert_len) < 0) {
+            goto error;
+        }
+    } else {
+        s->amd_cert = g_base64_decode(amd_cert, &s->amd_cert_len);
+    }
+    if (!check_blob_length(s->amd_cert_len)) {
+        goto error;
+    }
+
+    return 0;
+
+error:
+    g_free(s->remote_pdh);
+    g_free(s->remote_plat_cert);
+    g_free(s->amd_cert);
+
+    return 1;
+}
+
+static void
+sev_send_finish(void)
+{
+    int ret, error;
+
+    trace_kvm_sev_send_finish();
+    ret = sev_ioctl(sev_guest->sev_fd, KVM_SEV_SEND_FINISH, 0, &error);
+    if (ret) {
+        error_report("%s: SEND_FINISH ret=%d fw_error=%d '%s'",
+                     __func__, ret, error, fw_error_to_str(error));
+    }
+
+    g_free(sev_guest->send_packet_hdr);
+    if (sev_es_enabled() && is_hygon_cpu()) {
+        g_free(sev_guest->send_vmsa_packet_hdr);
+    }
+    sev_set_guest_state(sev_guest, SEV_STATE_RUNNING);
+}
+
+static void
+sev_migration_state_notifier(Notifier *notifier, void *data)
+{
+    MigrationState *s = data;
+
+    if (migration_has_finished(s) ||
+        migration_in_postcopy_after_devices(s) ||
+        migration_has_failed(s)) {
+        if (sev_check_state(sev_guest, SEV_STATE_SEND_UPDATE)) {
+            sev_send_finish();
+        }
+    }
+}
+
+static Notifier sev_migration_state;
+
 int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
 {
     SevGuestState *sev
@@ -920,6 +1177,9 @@ int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
         return 0;
     }
 
+    ConfidentialGuestSupportClass *cgs_class =
+        (ConfidentialGuestSupportClass *) object_get_class(OBJECT(cgs));
+
     ret = ram_block_discard_disable(true);
     if (ret) {
         error_report("%s: cannot disable RAM discard", __func__);
@@ -996,22 +1256,90 @@ int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
     }
 
     trace_kvm_sev_init();
-    ret = sev_ioctl(sev->sev_fd, cmd, NULL, &fw_error);
+
+    /* Only support reuse asid for CSV/CSV2 guest */
+    if (is_hygon_cpu() &&
+        (sev_guest->policy & GUEST_POLICY_REUSE_ASID)) {
+        char *user_id = NULL;
+        struct kvm_csv_init *init_cmd_buf = NULL;
+
+        user_id = object_property_get_str(OBJECT(sev), "user-id", NULL);
+        if (user_id && strlen(user_id)) {
+            init_cmd_buf = g_new0(struct kvm_csv_init, 1);
+            init_cmd_buf->len = strlen(user_id);
+            init_cmd_buf->userid_addr = (__u64)user_id;
+        }
+        ret = sev_ioctl(sev->sev_fd, cmd, init_cmd_buf, &fw_error);
+
+        if (user_id) {
+            g_free(user_id);
+            g_free(init_cmd_buf);
+        }
+    } else {
+        ret = sev_ioctl(sev->sev_fd, cmd, NULL, &fw_error);
+    }
+
     if (ret) {
         error_setg(errp, "%s: failed to initialize ret=%d fw_error=%d '%s'",
                    __func__, ret, fw_error, fw_error_to_str(fw_error));
         goto err;
     }
 
-    ret = sev_launch_start(sev);
-    if (ret) {
-        error_setg(errp, "%s: failed to create encryption context", __func__);
-        goto err;
+    /* Support CSV3 */
+    if (!ret && cmd == KVM_SEV_ES_INIT) {
+        ret = csv3_init(sev_guest->policy, sev->sev_fd, (void *)&sev->state, &sev_ops);
+        if (ret) {
+            error_setg(errp, "%s: failed to init csv3 context", __func__);
+            goto err;
+        }
+        /* The CSV3 guest is not resettable */
+        if (csv3_enabled())
+            csv_kvm_cpu_reset_inhibit = true;
+    }
+
+    /*
+     * The LAUNCH context is used for a new guest; if it's an incoming guest,
+     * the RECEIVE context will be created after the connection is established.
+     */
+    if (!runstate_check(RUN_STATE_INMIGRATE)) {
+        ret = sev_launch_start(sev);
+        if (ret) {
+            error_setg(errp, "%s: failed to create encryption context", __func__);
+            goto err;
+        }
+    } else {
+        /*
+         * The CSV2 guest is not resettable after being migrated to the target
+         * machine; set csv_kvm_cpu_reset_inhibit to true to indicate that the
+         * CSV2 guest is not resettable.
+         */
+        if (is_hygon_cpu() && sev_es_enabled()) {
+            csv_kvm_cpu_reset_inhibit = true;
+        }
     }
 
-    ram_block_notifier_add(&sev_ram_notifier);
+    /* CSV3 guests do not need a notifier to register/unregister memory */
+    if (!csv3_enabled()) {
+        ram_block_notifier_add(&sev_ram_notifier);
+    }
     qemu_add_machine_init_done_notifier(&sev_machine_done_notify);
     qemu_add_vm_change_state_handler(sev_vm_state_change, sev);
+    migration_add_notifier(&sev_migration_state, sev_migration_state_notifier);
+
+    if (csv3_enabled()) {
+        cgs_class->memory_encryption_ops = &csv3_memory_encryption_ops;
+    } else {
+        cgs_class->memory_encryption_ops = &sev_memory_encryption_ops;
+    }
+    QTAILQ_INIT(&sev->shared_regions_list);
+
+    /* Determine whether MSR_AMD64_SEV_ES_GHCB is supported */
+    if (sev_es_enabled()) {
+        sev_kvm_has_msr_ghcb =
+            kvm_vm_check_extension(kvm_state, KVM_CAP_SEV_ES_GHCB);
+    } else {
+        sev_kvm_has_msr_ghcb = false;
+    }
 
     cgs->ready = true;
 
@@ -1044,6 +1372,7 @@ sev_encrypt_flash(uint8_t *ptr, uint64_t len, Error **errp)
 int sev_inject_launch_secret(const char *packet_hdr, const char *secret,
                              uint64_t gpa, Error **errp)
 {
+    ERRP_GUARD();
     struct kvm_sev_launch_secret input;
     g_autofree guchar *data = NULL, *hdr = NULL;
     int error, ret = 1;
@@ -1087,7 +1416,17 @@ int sev_inject_launch_secret(const char *packet_hdr, const char *secret,
     input.trans_uaddr = (uint64_t)(unsigned long)data;
     input.trans_len = data_sz;
 
-    input.guest_uaddr = (uint64_t)(unsigned long)hva;
+    /* For Hygon CSV3 guest, the guest_uaddr should be the gpa */
+    if (csv3_enabled()) {
+        if (kvm_hygon_coco_ext_inuse & KVM_CAP_HYGON_COCO_EXT_CSV3_INJ_SECRET) {
+            input.guest_uaddr = gpa;
+        } else {
+            error_setg(errp, "CSV3 inject secret unsupported!");
+            return 1;
+        }
+    } else {
+        input.guest_uaddr = (uint64_t)(unsigned long)hva;
+    }
     input.guest_len = data_sz;
 
     trace_kvm_sev_launch_secret(gpa, input.guest_uaddr,
@@ -1251,74 +1590,1120 @@ int sev_es_save_reset_vector(void *flash_ptr, uint64_t flash_size)
     return 0;
 }
 
-static const QemuUUID sev_hash_table_header_guid = {
-    .data = UUID_LE(0x9438d606, 0x4f22, 0x4cc9, 0xb4, 0x79, 0xa7, 0x93,
-                    0xd4, 0x11, 0xfd, 0x21)
-};
+static int
+sev_get_send_session_length(void)
+{
+    int ret, fw_err = 0;
+    struct kvm_sev_send_start start = {};
+
+    ret = sev_ioctl(sev_guest->sev_fd, KVM_SEV_SEND_START, &start, &fw_err);
+    if (fw_err != SEV_RET_INVALID_LEN) {
+        ret = -1;
+        error_report("%s: failed to get session length ret=%d fw_error=%d '%s'",
+                     __func__, ret, fw_err, fw_error_to_str(fw_err));
+        goto err;
+    }
 
-static const QemuUUID sev_kernel_entry_guid = {
-    .data = UUID_LE(0x4de79437, 0xabd2, 0x427f, 0xb8, 0x35, 0xd5, 0xb1,
-                    0x72, 0xd2, 0x04, 0x5b)
-};
-static const QemuUUID sev_initrd_entry_guid = {
-    .data = UUID_LE(0x44baf731, 0x3a2f, 0x4bd7, 0x9a, 0xf1, 0x41, 0xe2,
-                    0x91, 0x69, 0x78, 0x1d)
-};
-static const QemuUUID sev_cmdline_entry_guid = {
-    .data = UUID_LE(0x97d02dd8, 0xbd20, 0x4c94, 0xaa, 0x78, 0xe7, 0x71,
-                    0x4d, 0x36, 0xab, 0x2a)
-};
+    ret = start.session_len;
+err:
+    return ret;
+}
 
-/*
- * Add the hashes of the linux kernel/initrd/cmdline to an encrypted guest page
- * which is included in SEV's initial memory measurement.
- */
-bool sev_add_kernel_loader_hashes(SevKernelLoaderContext *ctx, Error **errp)
+static int
+sev_send_start(SevGuestState *s, QEMUFile *f, uint64_t *bytes_sent)
 {
-    uint8_t *data;
-    SevHashTableDescriptor *area;
-    SevHashTable *ht;
-    PaddedSevHashTable *padded_ht;
-    uint8_t cmdline_hash[HASH_SIZE];
-    uint8_t initrd_hash[HASH_SIZE];
-    uint8_t kernel_hash[HASH_SIZE];
-    uint8_t *hashp;
-    size_t hash_len = HASH_SIZE;
-    hwaddr mapped_len = sizeof(*padded_ht);
-    MemTxAttrs attrs = { 0 };
-    bool ret = true;
+    gsize pdh_len = 0, plat_cert_len;
+    int session_len, ret, fw_error;
+    struct kvm_sev_send_start start = { };
+    guchar *pdh = NULL, *plat_cert = NULL, *session = NULL;
+    Error *local_err = NULL;
+
+    if (!s->remote_pdh || !s->remote_plat_cert || !s->amd_cert_len) {
+        error_report("%s: missing remote PDH or PLAT_CERT", __func__);
+        return 1;
+    }
+
+    start.pdh_cert_uaddr = (uintptr_t) s->remote_pdh;
+    start.pdh_cert_len = s->remote_pdh_len;
+
+    start.plat_certs_uaddr = (uintptr_t)s->remote_plat_cert;
+    start.plat_certs_len = s->remote_plat_cert_len;
+
+    start.amd_certs_uaddr = (uintptr_t)s->amd_cert;
+    start.amd_certs_len = s->amd_cert_len;
+
+    /* get the session length */
+    session_len = sev_get_send_session_length();
+    if (session_len < 0) {
+        ret = 1;
+        goto err;
+    }
+
+    session = g_new0(guchar, session_len);
+    start.session_uaddr = (unsigned long)session;
+    start.session_len = session_len;
+
+    /* Get our PDH certificate */
+    ret = sev_get_pdh_info(s->sev_fd, &pdh, &pdh_len,
+                           &plat_cert, &plat_cert_len, &local_err);
+    if (ret) {
+        error_report("Failed to get our PDH cert");
+        goto err;
+    }
+
+    trace_kvm_sev_send_start(start.pdh_cert_uaddr, start.pdh_cert_len,
+                             start.plat_certs_uaddr, start.plat_certs_len,
+                             start.amd_certs_uaddr, start.amd_certs_len);
+
+    ret = sev_ioctl(s->sev_fd, KVM_SEV_SEND_START, &start, &fw_error);
+    if (ret < 0) {
+        error_report("%s: SEND_START ret=%d fw_error=%d '%s'",
+                     __func__, ret, fw_error, fw_error_to_str(fw_error));
+        goto err;
+    }
+
+    qemu_put_be32(f, start.policy);
+    qemu_put_be32(f, pdh_len);
+    qemu_put_buffer(f, (uint8_t *)pdh, pdh_len);
+    qemu_put_be32(f, start.session_len);
+    qemu_put_buffer(f, (uint8_t *)start.session_uaddr, start.session_len);
+    *bytes_sent = 12 + pdh_len + start.session_len;
+
+    sev_set_guest_state(s, SEV_STATE_SEND_UPDATE);
+
+err:
+    g_free(pdh);
+    g_free(plat_cert);
+    return ret;
+}
+
+static int
+sev_send_get_packet_len(int *fw_err)
+{
+    int ret;
+    struct kvm_sev_send_update_data update = { 0, };
+
+    ret = sev_ioctl(sev_guest->sev_fd, KVM_SEV_SEND_UPDATE_DATA,
+                    &update, fw_err);
+    if (*fw_err != SEV_RET_INVALID_LEN) {
+        ret = 0;
+        error_report("%s: failed to get session length ret=%d fw_error=%d '%s'",
+                     __func__, ret, *fw_err, fw_error_to_str(*fw_err));
+        goto err;
+    }
+
+    ret = update.hdr_len;
+
+err:
+    return ret;
+}
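sev_send_start() frames the migration start packet as three big-endian 32-bit fields (policy, pdh_len, session_len) plus the two blobs, which is exactly why *bytes_sent starts at 12. A self-contained sketch of the matching parser, assuming the same be32 framing (parse_start_packet() is an illustrative helper, not a QEMU API):

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Layout: be32 policy | be32 pdh_len | pdh | be32 session_len | session. */
static size_t parse_start_packet(const uint8_t *buf, uint32_t *policy,
                                 const uint8_t **pdh, uint32_t *pdh_len,
                                 const uint8_t **session, uint32_t *session_len)
{
    uint32_t v;
    size_t off = 0;

    memcpy(&v, buf + off, 4); *policy = ntohl(v); off += 4;
    memcpy(&v, buf + off, 4); *pdh_len = ntohl(v); off += 4;
    *pdh = buf + off; off += *pdh_len;
    memcpy(&v, buf + off, 4); *session_len = ntohl(v); off += 4;
    *session = buf + off; off += *session_len;
    return off;   /* == 12 + pdh_len + session_len, i.e. bytes_sent */
}

int main(void)
{
    uint8_t buf[32] = { 0 };
    uint32_t be = htonl(3); memcpy(buf, &be, 4);       /* policy */
    be = htonl(2); memcpy(buf + 4, &be, 4);            /* pdh_len */
    buf[8] = 0xaa; buf[9] = 0xbb;                      /* pdh blob */
    be = htonl(1); memcpy(buf + 10, &be, 4);           /* session_len */
    buf[14] = 0xcc;                                    /* session blob */

    uint32_t policy, pdh_len, session_len;
    const uint8_t *pdh, *session;
    size_t n = parse_start_packet(buf, &policy, &pdh, &pdh_len,
                                  &session, &session_len);
    printf("consumed %zu bytes (12 + %u + %u)\n", n, pdh_len, session_len);
    return 0;
}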
*/ - if (!sev_guest->kernel_hashes) { - return false; - } + if (!s->send_packet_hdr) { + s->send_packet_hdr_len = sev_send_get_packet_len(&fw_error); + if (s->send_packet_hdr_len < 1) { + error_report("%s: SEND_UPDATE fw_error=%d '%s'", + __func__, fw_error, fw_error_to_str(fw_error)); + return 1; + } - if (!pc_system_ovmf_table_find(SEV_HASH_TABLE_RV_GUID, &data, NULL)) { - error_setg(errp, "SEV: kernel specified but guest firmware " - "has no hashes table GUID"); - return false; + s->send_packet_hdr = g_new(gchar, s->send_packet_hdr_len); } - area = (SevHashTableDescriptor *)data; - if (!area->base || area->size < sizeof(PaddedSevHashTable)) { - error_setg(errp, "SEV: guest firmware hashes table area is invalid " - "(base=0x%x size=0x%x)", area->base, area->size); - return false; + + /* allocate transport buffer */ + trans = g_new(guchar, size); + + update.hdr_uaddr = (uintptr_t)s->send_packet_hdr; + update.hdr_len = s->send_packet_hdr_len; + update.guest_uaddr = (uintptr_t)ptr; + update.guest_len = size; + update.trans_uaddr = (uintptr_t)trans; + update.trans_len = size; + + trace_kvm_sev_send_update_data(ptr, trans, size); + + ret = sev_ioctl(s->sev_fd, KVM_SEV_SEND_UPDATE_DATA, &update, &fw_error); + if (ret) { + error_report("%s: SEND_UPDATE_DATA ret=%d fw_error=%d '%s'", + __func__, ret, fw_error, fw_error_to_str(fw_error)); + goto err; } + qemu_put_be32(f, update.hdr_len); + qemu_put_buffer(f, (uint8_t *)update.hdr_uaddr, update.hdr_len); + *bytes_sent = 4 + update.hdr_len; + + qemu_put_be32(f, update.trans_len); + qemu_put_buffer(f, (uint8_t *)update.trans_uaddr, update.trans_len); + *bytes_sent += (4 + update.trans_len); + +err: + g_free(trans); + return ret; +} + +int sev_save_outgoing_page(QEMUFile *f, uint8_t *ptr, + uint32_t sz, uint64_t *bytes_sent) +{ + SevGuestState *s = sev_guest; + /* - * Calculate hash of kernel command-line with the terminating null byte. If - * the user doesn't supply a command-line via -append, the 1-byte "\0" will - * be used. + * If this is a first buffer then create outgoing encryption context + * and write our PDH, policy and session data. 
*/ - hashp = cmdline_hash; - if (qcrypto_hash_bytes(QCRYPTO_HASH_ALG_SHA256, ctx->cmdline_data, - ctx->cmdline_size, &hashp, &hash_len, errp) < 0) { - return false; + if (!sev_check_state(s, SEV_STATE_SEND_UPDATE) && + sev_send_start(s, f, bytes_sent)) { + error_report("Failed to create outgoing context"); + return 1; } - assert(hash_len == HASH_SIZE); + + return sev_send_update_data(s, f, ptr, sz, bytes_sent); +} + +static int +sev_receive_start(SevGuestState *sev, QEMUFile *f) +{ + int ret = 1; + int fw_error; + struct kvm_sev_receive_start start = { }; + gchar *session = NULL, *pdh_cert = NULL; + + /* get SEV guest handle */ + start.handle = object_property_get_int(OBJECT(sev), "handle", + &error_abort); + + /* get the source policy */ + start.policy = qemu_get_be32(f); + + /* get source PDH key */ + start.pdh_len = qemu_get_be32(f); + if (!check_blob_length(start.pdh_len)) { + return 1; + } + + pdh_cert = g_new(gchar, start.pdh_len); + qemu_get_buffer(f, (uint8_t *)pdh_cert, start.pdh_len); + start.pdh_uaddr = (uintptr_t)pdh_cert; + + /* get source session data */ + start.session_len = qemu_get_be32(f); + if (!check_blob_length(start.session_len)) { + return 1; + } + session = g_new(gchar, start.session_len); + qemu_get_buffer(f, (uint8_t *)session, start.session_len); + start.session_uaddr = (uintptr_t)session; + + trace_kvm_sev_receive_start(start.policy, session, pdh_cert); + + ret = sev_ioctl(sev_guest->sev_fd, KVM_SEV_RECEIVE_START, + &start, &fw_error); + if (ret < 0) { + error_report("Error RECEIVE_START ret=%d fw_error=%d '%s'", + ret, fw_error, fw_error_to_str(fw_error)); + goto err; + } + + object_property_set_int(OBJECT(sev), "handle", start.handle, &error_abort); + sev_set_guest_state(sev, SEV_STATE_RECEIVE_UPDATE); +err: + g_free(session); + g_free(pdh_cert); + + return ret; +} + +static int sev_receive_update_data(QEMUFile *f, uint8_t *ptr) +{ + int ret = 1, fw_error = 0; + gchar *hdr = NULL, *trans = NULL; + struct kvm_sev_receive_update_data update = {}; + + /* get packet header */ + update.hdr_len = qemu_get_be32(f); + if (!check_blob_length(update.hdr_len)) { + return 1; + } + + hdr = g_new(gchar, update.hdr_len); + qemu_get_buffer(f, (uint8_t *)hdr, update.hdr_len); + update.hdr_uaddr = (uintptr_t)hdr; + + /* get transport buffer */ + update.trans_len = qemu_get_be32(f); + if (!check_blob_length(update.trans_len)) { + goto err; + } + + trans = g_new(gchar, update.trans_len); + update.trans_uaddr = (uintptr_t)trans; + qemu_get_buffer(f, (uint8_t *)update.trans_uaddr, update.trans_len); + + update.guest_uaddr = (uintptr_t) ptr; + update.guest_len = update.trans_len; + + trace_kvm_sev_receive_update_data(trans, ptr, update.guest_len, + hdr, update.hdr_len); + + ret = sev_ioctl(sev_guest->sev_fd, KVM_SEV_RECEIVE_UPDATE_DATA, + &update, &fw_error); + if (ret) { + error_report("Error RECEIVE_UPDATE_DATA ret=%d fw_error=%d '%s'", + ret, fw_error, fw_error_to_str(fw_error)); + goto err; + } +err: + g_free(trans); + g_free(hdr); + return ret; +} + +int sev_load_incoming_page(QEMUFile *f, uint8_t *ptr) +{ + SevGuestState *s = sev_guest; + + /* + * If this is first buffer and SEV is not in recieiving state then + * use RECEIVE_START command to create a encryption context. 
+     */
+    if (!sev_check_state(s, SEV_STATE_RECEIVE_UPDATE) &&
+        sev_receive_start(s, f)) {
+        return 1;
+    }
+
+    return sev_receive_update_data(f, ptr);
+}
+
+int sev_remove_shared_regions_list(unsigned long start, unsigned long end)
+{
+    SevGuestState *s = sev_guest;
+    struct shared_region *pos, *next_pos;
+
+    QTAILQ_FOREACH_SAFE(pos, &s->shared_regions_list, list, next_pos) {
+        unsigned long l, r;
+        unsigned long curr_gfn_end = pos->gfn_end;
+
+        /*
+         * Find if any intersection exists?
+         * left bound for intersecting segment
+         */
+        l = MAX(start, pos->gfn_start);
+        /* right bound for intersecting segment */
+        r = MIN(end, pos->gfn_end);
+        if (l <= r) {
+            if (pos->gfn_start == l && pos->gfn_end == r) {
+                QTAILQ_REMOVE(&s->shared_regions_list, pos, list);
+                g_free(pos);
+            } else if (l == pos->gfn_start) {
+                pos->gfn_start = r;
+            } else if (r == pos->gfn_end) {
+                pos->gfn_end = l;
+            } else {
+                /* Do a de-merge -- split linked list nodes */
+                struct shared_region *shrd_region;
+
+                pos->gfn_end = l;
+                shrd_region = g_malloc0(sizeof(*shrd_region));
+                if (!shrd_region) {
+                    return 0;
+                }
+                shrd_region->gfn_start = r;
+                shrd_region->gfn_end = curr_gfn_end;
+                QTAILQ_INSERT_AFTER(&s->shared_regions_list, pos,
+                                    shrd_region, list);
+            }
+        }
+        if (end <= curr_gfn_end) {
+            break;
+        }
+    }
+    return 0;
+}
+
+int sev_add_shared_regions_list(unsigned long start, unsigned long end)
+{
+    struct shared_region *shrd_region;
+    struct shared_region *pos;
+    SevGuestState *s = sev_guest;
+
+    if (QTAILQ_EMPTY(&s->shared_regions_list)) {
+        shrd_region = g_malloc0(sizeof(*shrd_region));
+        if (!shrd_region) {
+            return -1;
+        }
+        shrd_region->gfn_start = start;
+        shrd_region->gfn_end = end;
+        QTAILQ_INSERT_TAIL(&s->shared_regions_list, shrd_region, list);
+        return 0;
+    }
+
+    /*
+     * shared regions list is a sorted list in ascending order
+     * of guest PA's and also merges consecutive ranges of guest PA's
+     */
+    QTAILQ_FOREACH(pos, &s->shared_regions_list, list) {
+        /* handle duplicate overlapping regions */
+        if (start >= pos->gfn_start && end <= pos->gfn_end) {
+            return 0;
+        }
+        if (pos->gfn_end < start) {
+            continue;
+        }
+        /* merge consecutive guest PA(s) -- forward merge */
+        if (pos->gfn_start <= start && pos->gfn_end >= start) {
+            pos->gfn_end = end;
+            return 0;
+        }
+        break;
+    }
+    /*
+     * Add a new node
+     */
+    shrd_region = g_malloc0(sizeof(*shrd_region));
+    if (!shrd_region) {
+        return -1;
+    }
+    shrd_region->gfn_start = start;
+    shrd_region->gfn_end = end;
+    if (pos) {
+        QTAILQ_INSERT_BEFORE(pos, shrd_region, list);
+    } else {
+        QTAILQ_INSERT_TAIL(&s->shared_regions_list, shrd_region, list);
+    }
+    return 1;
+}
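The shared-regions list is an ordered set of half-open gfn intervals: an add may merge with a neighbour, and a removal may split one node into two (the "de-merge" branch above). A compact self-contained model of that split logic, useful for sanity-checking the MAX/MIN bound arithmetic:

#include <assert.h>

/* One interval [start, end); mirrors struct shared_region's gfn bounds. */
struct range { unsigned long start, end; };

/*
 * Remove [rs, re) from *r. Returns how many pieces survive (0, 1 or 2)
 * and stores a possible right-hand remainder in *tail.
 */
static int range_remove(struct range *r, unsigned long rs, unsigned long re,
                        struct range *tail)
{
    unsigned long l = rs > r->start ? rs : r->start;   /* MAX */
    unsigned long rr = re < r->end ? re : r->end;      /* MIN */

    if (l > rr) {
        return 1;                 /* no overlap: node unchanged */
    }
    if (l == r->start && rr == r->end) {
        return 0;                 /* fully covered: node disappears */
    }
    if (l == r->start) {
        r->start = rr;            /* trim from the left */
        return 1;
    }
    if (rr == r->end) {
        r->end = l;               /* trim from the right */
        return 1;
    }
    /* hole in the middle: split into [start, l) and [rr, end) */
    tail->start = rr;
    tail->end = r->end;
    r->end = l;
    return 2;
}

int main(void)
{
    struct range r = { 10, 20 }, tail;

    assert(range_remove(&r, 12, 15, &tail) == 2);
    assert(r.start == 10 && r.end == 12);
    assert(tail.start == 15 && tail.end == 20);
    return 0;
}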
+
+int sev_save_outgoing_shared_regions_list(QEMUFile *f, uint64_t *bytes_sent)
+{
+    SevGuestState *s = sev_guest;
+    struct shared_region *pos;
+
+    QTAILQ_FOREACH(pos, &s->shared_regions_list, list) {
+        qemu_put_be32(f, SHARED_REGION_LIST_CONT);
+        qemu_put_be32(f, pos->gfn_start);
+        qemu_put_be32(f, pos->gfn_end);
+        *bytes_sent += 12;
+    }
+
+    qemu_put_be32(f, SHARED_REGION_LIST_END);
+    *bytes_sent += 4;
+    return 0;
+}
+
+int sev_load_incoming_shared_regions_list(QEMUFile *f)
+{
+    SevGuestState *s = sev_guest;
+    struct shared_region *shrd_region;
+    int status;
+
+    status = qemu_get_be32(f);
+    while (status == SHARED_REGION_LIST_CONT) {
+
+        shrd_region = g_malloc0(sizeof(*shrd_region));
+        if (!shrd_region) {
+            return 0;
+        }
+        shrd_region->gfn_start = qemu_get_be32(f);
+        shrd_region->gfn_end = qemu_get_be32(f);
+
+        QTAILQ_INSERT_TAIL(&s->shared_regions_list, shrd_region, list);
+
+        status = qemu_get_be32(f);
+    }
+    return 0;
+}
+
+bool sev_is_gfn_in_unshared_region(unsigned long gfn)
+{
+    SevGuestState *s = sev_guest;
+    struct shared_region *pos;
+
+    QTAILQ_FOREACH(pos, &s->shared_regions_list, list) {
+        if (gfn >= pos->gfn_start && gfn < pos->gfn_end) {
+            return false;
+        }
+    }
+    return true;
+}
+
+static CsvBatchCmdList *
+csv_batch_cmd_list_create(struct kvm_csv_batch_list_node *head,
+                          CsvDestroyCmdNodeFn func)
+{
+    CsvBatchCmdList *csv_batch_cmd_list =
+        g_malloc0(sizeof(*csv_batch_cmd_list));
+
+    if (!csv_batch_cmd_list) {
+        return NULL;
+    }
+
+    csv_batch_cmd_list->head = head;
+    csv_batch_cmd_list->tail = head;
+    csv_batch_cmd_list->destroy_fn = func;
+
+    return csv_batch_cmd_list;
+}
+
+static int
+csv_batch_cmd_list_add_after(CsvBatchCmdList *list,
+                             struct kvm_csv_batch_list_node *new_node)
+{
+    list->tail->next_cmd_addr = (__u64)new_node;
+    list->tail = new_node;
+
+    return 0;
+}
+
+static struct kvm_csv_batch_list_node *
+csv_batch_cmd_list_node_create(uint64_t cmd_data_addr, uint64_t addr)
+{
+    struct kvm_csv_batch_list_node *new_node =
+        g_malloc0(sizeof(struct kvm_csv_batch_list_node));
+
+    if (!new_node) {
+        return NULL;
+    }
+
+    new_node->cmd_data_addr = cmd_data_addr;
+    new_node->addr = addr;
+    new_node->next_cmd_addr = 0;
+
+    return new_node;
+}
+
+static int csv_batch_cmd_list_destroy(CsvBatchCmdList *list)
+{
+    struct kvm_csv_batch_list_node *node = list->head;
+
+    while (node != NULL) {
+        if (list->destroy_fn != NULL)
+            list->destroy_fn((void *)node->cmd_data_addr);
+
+        list->head = (struct kvm_csv_batch_list_node *)node->next_cmd_addr;
+        g_free(node);
+        node = list->head;
+    }
+
+    g_free(list);
+    return 0;
+}
+
+static void send_update_data_free(void *data)
+{
+    struct kvm_sev_send_update_data *update =
+        (struct kvm_sev_send_update_data *)data;
+    g_free((guchar *)update->hdr_uaddr);
+    g_free((guchar *)update->trans_uaddr);
+    g_free(update);
+}
+
+static void receive_update_data_free(void *data)
+{
+    struct kvm_sev_receive_update_data *update =
+        (struct kvm_sev_receive_update_data *)data;
+    g_free((guchar *)update->hdr_uaddr);
+    g_free((guchar *)update->trans_uaddr);
+    g_free(update);
+}
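The csv_batch_cmd_list_* helpers implement an intrusive singly linked list whose links (next_cmd_addr) are raw userspace addresses, so the kernel can walk the entire batch starting from nothing but the head pointer passed to KVM_CSV_COMMAND_BATCH. A stripped-down, self-contained model of that layout:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Same shape as kvm_csv_batch_list_node: links stored as raw u64 addresses. */
struct node {
    uint64_t cmd_data_addr;
    uint64_t addr;
    uint64_t next_cmd_addr;   /* 0 terminates the chain */
};

int main(void)
{
    struct node *head = NULL, *tail = NULL;

    /* Build a three-entry batch the same way the queueing code does. */
    for (int i = 0; i < 3; i++) {
        struct node *n = calloc(1, sizeof(*n));
        n->addr = (uint64_t)i * 4096;
        if (!head) {
            head = tail = n;
        } else {
            tail->next_cmd_addr = (uint64_t)(uintptr_t)n;
            tail = n;
        }
    }

    /* Walking from head needs nothing but the embedded addresses. */
    for (struct node *n = head; n; ) {
        struct node *next = (struct node *)(uintptr_t)n->next_cmd_addr;
        printf("node at gpa offset %llu\n", (unsigned long long)n->addr);
        free(n);
        n = next;
    }
    return 0;
}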
+ */ + if (s->send_packet_hdr_len < 1) { + s->send_packet_hdr_len = sev_send_get_packet_len(&fw_error); + if (s->send_packet_hdr_len < 1) { + error_report("%s: SEND_UPDATE fw_error=%d '%s'", + __func__, fw_error, fw_error_to_str(fw_error)); + return 1; + } + } + + packet_hdr = g_new(guchar, s->send_packet_hdr_len); + memset(packet_hdr, 0, s->send_packet_hdr_len); + + update = g_new0(struct kvm_sev_send_update_data, 1); + + /* allocate transport buffer */ + trans = g_new(guchar, size); + + update->hdr_uaddr = (unsigned long)packet_hdr; + update->hdr_len = s->send_packet_hdr_len; + update->guest_uaddr = (unsigned long)ptr; + update->guest_len = size; + update->trans_uaddr = (unsigned long)trans; + update->trans_len = size; + + new_node = csv_batch_cmd_list_node_create((uint64_t)update, addr); + if (!new_node) { + ret = -ENOMEM; + goto err; + } + + if (s->csv_batch_cmd_list == NULL) { + s->csv_batch_cmd_list = csv_batch_cmd_list_create(new_node, + send_update_data_free); + if (s->csv_batch_cmd_list == NULL) { + ret = -ENOMEM; + goto err; + } + } else { + /* Add new_node's command address to the last_node */ + csv_batch_cmd_list_add_after(s->csv_batch_cmd_list, new_node); + } + + trace_kvm_sev_send_update_data(ptr, trans, size); + + return ret; + +err: + g_free(trans); + g_free(update); + g_free(packet_hdr); + g_free(new_node); + if (s->csv_batch_cmd_list) { + csv_batch_cmd_list_destroy(s->csv_batch_cmd_list); + s->csv_batch_cmd_list = NULL; + } + return ret; +} + +static int +csv_receive_queue_data(SevGuestState *s, QEMUFile *f, uint8_t *ptr) +{ + int ret = 0; + gchar *hdr = NULL, *trans = NULL; + struct kvm_sev_receive_update_data *update; + struct kvm_csv_batch_list_node *new_node = NULL; + + update = g_new0(struct kvm_sev_receive_update_data, 1); + /* get packet header */ + update->hdr_len = qemu_get_be32(f); + hdr = g_new(gchar, update->hdr_len); + qemu_get_buffer(f, (uint8_t *)hdr, update->hdr_len); + update->hdr_uaddr = (unsigned long)hdr; + + /* get transport buffer */ + update->trans_len = qemu_get_be32(f); + trans = g_new(gchar, update->trans_len); + update->trans_uaddr = (unsigned long)trans; + qemu_get_buffer(f, (uint8_t *)update->trans_uaddr, update->trans_len); + + /* set guest address,guest len is page_size */ + update->guest_uaddr = (uint64_t)ptr; + update->guest_len = TARGET_PAGE_SIZE; + + new_node = csv_batch_cmd_list_node_create((uint64_t)update, 0); + if (!new_node) { + ret = -ENOMEM; + goto err; + } + + if (s->csv_batch_cmd_list == NULL) { + s->csv_batch_cmd_list = csv_batch_cmd_list_create(new_node, + receive_update_data_free); + if (s->csv_batch_cmd_list == NULL) { + ret = -ENOMEM; + goto err; + } + } else { + /* Add new_node's command address to the last_node */ + csv_batch_cmd_list_add_after(s->csv_batch_cmd_list, new_node); + } + + trace_kvm_sev_receive_update_data(trans, (void *)ptr, update->guest_len, + (void *)hdr, update->hdr_len); + + return ret; + +err: + g_free(trans); + g_free(update); + g_free(hdr); + g_free(new_node); + if (s->csv_batch_cmd_list) { + csv_batch_cmd_list_destroy(s->csv_batch_cmd_list); + s->csv_batch_cmd_list = NULL; + } + return ret; +} + +static int +csv_command_batch(uint32_t cmd_id, uint64_t head_uaddr, int *fw_err) +{ + int ret; + struct kvm_csv_command_batch command_batch = { }; + + command_batch.command_id = cmd_id; + command_batch.csv_batch_list_uaddr = head_uaddr; + + ret = sev_ioctl(sev_guest->sev_fd, KVM_CSV_COMMAND_BATCH, + &command_batch, fw_err); + if (ret) { + error_report("%s: COMMAND_BATCH ret=%d fw_err=%d '%s'", + __func__, 
ret, *fw_err, fw_error_to_str(*fw_err));
+    }
+
+    return ret;
+}
+
+static int
+csv_send_update_data_batch(SevGuestState *s, QEMUFile *f, uint64_t *bytes_sent)
+{
+    int ret, fw_error = 0;
+    struct kvm_sev_send_update_data *update;
+    struct kvm_csv_batch_list_node *node;
+
+    ret = csv_command_batch(KVM_SEV_SEND_UPDATE_DATA,
+                            (uint64_t)s->csv_batch_cmd_list->head, &fw_error);
+    if (ret) {
+        error_report("%s: csv_command_batch ret=%d fw_error=%d '%s'",
+                     __func__, ret, fw_error, fw_error_to_str(fw_error));
+        goto err;
+    }
+
+    for (node = s->csv_batch_cmd_list->head;
+         node != NULL;
+         node = (struct kvm_csv_batch_list_node *)node->next_cmd_addr) {
+        if (node != s->csv_batch_cmd_list->head) {
+            /* head's page header is saved before send_update_data */
+            qemu_put_be64(f, node->addr);
+            *bytes_sent += 8;
+            if (node->next_cmd_addr != 0) {
+                qemu_put_be32(f, RAM_SAVE_ENCRYPTED_PAGE_BATCH);
+            } else {
+                qemu_put_be32(f, RAM_SAVE_ENCRYPTED_PAGE_BATCH_END);
+            }
+            *bytes_sent += 4;
+        }
+        update = (struct kvm_sev_send_update_data *)node->cmd_data_addr;
+        qemu_put_be32(f, update->hdr_len);
+        qemu_put_buffer(f, (uint8_t *)update->hdr_uaddr, update->hdr_len);
+        *bytes_sent += (4 + update->hdr_len);
+
+        qemu_put_be32(f, update->trans_len);
+        qemu_put_buffer(f, (uint8_t *)update->trans_uaddr, update->trans_len);
+        *bytes_sent += (4 + update->trans_len);
+    }
+
+err:
+    csv_batch_cmd_list_destroy(s->csv_batch_cmd_list);
+    s->csv_batch_cmd_list = NULL;
+    return ret;
+}
+
+static int
+csv_receive_update_data_batch(SevGuestState *s)
+{
+    int ret;
+    int fw_error;
+
+    ret = csv_command_batch(KVM_SEV_RECEIVE_UPDATE_DATA,
+                            (uint64_t)s->csv_batch_cmd_list->head, &fw_error);
+    if (ret) {
+        error_report("%s: csv_command_batch ret=%d fw_error=%d '%s'",
+                     __func__, ret, fw_error, fw_error_to_str(fw_error));
+    }
+
+    csv_batch_cmd_list_destroy(s->csv_batch_cmd_list);
+    s->csv_batch_cmd_list = NULL;
+    return ret;
+}
+
+int
+csv_queue_outgoing_page(uint8_t *ptr, uint32_t sz, uint64_t addr)
+{
+    SevGuestState *s = sev_guest;
+
+    /* Only supported for HYGON CSV */
+    if (!is_hygon_cpu()) {
+        error_report("Only support enqueue pages for HYGON CSV");
+        return -EINVAL;
+    }
+
+    return csv_send_queue_data(s, ptr, sz, addr);
+}
+
+int csv_queue_incoming_page(QEMUFile *f, uint8_t *ptr)
+{
+    SevGuestState *s = sev_guest;
+
+    /* Only supported for HYGON CSV */
+    if (!is_hygon_cpu()) {
+        error_report("Only support enqueue received pages for HYGON CSV");
+        return -EINVAL;
+    }
+
+    /*
+     * If this is the first buffer and SEV is not in the receiving state,
+     * use the RECEIVE_START command to create an encryption context.
+     */
+    if (!sev_check_state(s, SEV_STATE_RECEIVE_UPDATE) &&
+        sev_receive_start(s, f)) {
+        return 1;
+    }
+
+    return csv_receive_queue_data(s, f, ptr);
+}
+
+int
+csv_save_queued_outgoing_pages(QEMUFile *f, uint64_t *bytes_sent)
+{
+    SevGuestState *s = sev_guest;
+
+    /* Only supported for HYGON CSV */
+    if (!is_hygon_cpu()) {
+        error_report("Only support transfer queued pages for HYGON CSV");
+        return -EINVAL;
+    }
+
+    /*
+     * If this is the first buffer, create the outgoing encryption context
+     * and write our PDH, policy and session data.
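+     * sev_send_start() also moves the guest into SEV_STATE_SEND_UPDATE,
+     * so the state check above creates the context only once per
+     * outgoing migration stream.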
+ */ + if (!sev_check_state(s, SEV_STATE_SEND_UPDATE) && + sev_send_start(s, f, bytes_sent)) { + error_report("Failed to create outgoing context"); + return 1; + } + + return csv_send_update_data_batch(s, f, bytes_sent); +} + +int csv_load_queued_incoming_pages(QEMUFile *f) +{ + SevGuestState *s = sev_guest; + + /* Only support for HYGON CSV */ + if (!is_hygon_cpu()) { + error_report("Only support load queued pages for HYGON CSV"); + return -EINVAL; + } + + return csv_receive_update_data_batch(s); +} + +static int +sev_send_vmsa_get_packet_len(int *fw_err) +{ + int ret; + struct kvm_sev_send_update_vmsa update = { 0, }; + + ret = sev_ioctl(sev_guest->sev_fd, KVM_SEV_SEND_UPDATE_VMSA, + &update, fw_err); + if (*fw_err != SEV_RET_INVALID_LEN) { + ret = 0; + error_report("%s: failed to get session length ret=%d fw_error=%d '%s'", + __func__, ret, *fw_err, fw_error_to_str(*fw_err)); + goto err; + } + + ret = update.hdr_len; + +err: + return ret; +} + +static int +sev_send_update_vmsa(SevGuestState *s, QEMUFile *f, uint32_t cpu_id, + uint32_t cpu_index, uint32_t size, uint64_t *bytes_sent) +{ + int ret, fw_error; + guchar *trans = NULL; + struct kvm_sev_send_update_vmsa update = {}; + + /* + * If this is first call then query the packet header bytes and allocate + * the packet buffer. + */ + if (!s->send_vmsa_packet_hdr) { + s->send_vmsa_packet_hdr_len = sev_send_vmsa_get_packet_len(&fw_error); + if (s->send_vmsa_packet_hdr_len < 1) { + error_report("%s: SEND_UPDATE_VMSA fw_error=%d '%s'", + __func__, fw_error, fw_error_to_str(fw_error)); + return 1; + } + + s->send_vmsa_packet_hdr = g_new(gchar, s->send_vmsa_packet_hdr_len); + } + + /* allocate transport buffer */ + trans = g_new(guchar, size); + + update.vcpu_id = cpu_id; + update.hdr_uaddr = (uintptr_t)s->send_vmsa_packet_hdr; + update.hdr_len = s->send_vmsa_packet_hdr_len; + update.trans_uaddr = (uintptr_t)trans; + update.trans_len = size; + + trace_kvm_sev_send_update_vmsa(cpu_id, cpu_index, trans, size); + + ret = sev_ioctl(s->sev_fd, KVM_SEV_SEND_UPDATE_VMSA, &update, &fw_error); + if (ret) { + error_report("%s: SEND_UPDATE_VMSA ret=%d fw_error=%d '%s'", + __func__, ret, fw_error, fw_error_to_str(fw_error)); + goto err; + } + + /* + * Migration of vCPU's VMState according to the instance_id + * (i.e. 
CPUState.cpu_index) + */ + qemu_put_be32(f, sizeof(uint32_t)); + qemu_put_buffer(f, (uint8_t *)&cpu_index, sizeof(uint32_t)); + *bytes_sent += 4 + sizeof(uint32_t); + + qemu_put_be32(f, update.hdr_len); + qemu_put_buffer(f, (uint8_t *)update.hdr_uaddr, update.hdr_len); + *bytes_sent += 4 + update.hdr_len; + + qemu_put_be32(f, update.trans_len); + qemu_put_buffer(f, (uint8_t *)update.trans_uaddr, update.trans_len); + *bytes_sent += 4 + update.trans_len; + +err: + g_free(trans); + return ret; +} + +int csv_save_outgoing_cpu_state(QEMUFile *f, uint64_t *bytes_sent) +{ + SevGuestState *s = sev_guest; + CPUState *cpu; + int ret = 0; + + /* Only support migrate VMSAs for HYGON CSV2 guest */ + if (!sev_es_enabled() || !is_hygon_cpu()) { + return 0; + } + + CPU_FOREACH(cpu) { + qemu_put_be32(f, ENCRYPTED_CPU_STATE_CONT); + *bytes_sent += 4; + ret = sev_send_update_vmsa(s, f, kvm_arch_vcpu_id(cpu), + cpu->cpu_index, TARGET_PAGE_SIZE, bytes_sent); + if (ret) { + goto err; + } + } + + qemu_put_be32(f, ENCRYPTED_CPU_STATE_END); + *bytes_sent += 4; + +err: + return ret; +} + +static int sev_receive_update_vmsa(QEMUFile *f) +{ + int ret = 1, fw_error = 0; + CPUState *cpu; + uint32_t cpu_index, cpu_id = 0; + gchar *hdr = NULL, *trans = NULL; + struct kvm_sev_receive_update_vmsa update = {}; + + /* get cpu index buffer */ + assert(qemu_get_be32(f) == sizeof(uint32_t)); + qemu_get_buffer(f, (uint8_t *)&cpu_index, sizeof(uint32_t)); + + CPU_FOREACH(cpu) { + if (cpu->cpu_index == cpu_index) { + cpu_id = kvm_arch_vcpu_id(cpu); + break; + } + } + update.vcpu_id = cpu_id; + + /* get packet header */ + update.hdr_len = qemu_get_be32(f); + if (!check_blob_length(update.hdr_len)) { + return 1; + } + + hdr = g_new(gchar, update.hdr_len); + qemu_get_buffer(f, (uint8_t *)hdr, update.hdr_len); + update.hdr_uaddr = (uintptr_t)hdr; + + /* get transport buffer */ + update.trans_len = qemu_get_be32(f); + if (!check_blob_length(update.trans_len)) { + goto err; + } + + trans = g_new(gchar, update.trans_len); + update.trans_uaddr = (uintptr_t)trans; + qemu_get_buffer(f, (uint8_t *)update.trans_uaddr, update.trans_len); + + trace_kvm_sev_receive_update_vmsa(cpu_id, cpu_index, + trans, update.trans_len, hdr, update.hdr_len); + + ret = sev_ioctl(sev_guest->sev_fd, KVM_SEV_RECEIVE_UPDATE_VMSA, + &update, &fw_error); + if (ret) { + error_report("Error RECEIVE_UPDATE_VMSA ret=%d fw_error=%d '%s'", + ret, fw_error, fw_error_to_str(fw_error)); + } + +err: + g_free(trans); + g_free(hdr); + return ret; +} + +int csv_load_incoming_cpu_state(QEMUFile *f) +{ + int status, ret = 0; + + /* Only support migrate VMSAs for HYGON CSV2 guest */ + if (!sev_es_enabled() || !is_hygon_cpu()) { + return 0; + } + + status = qemu_get_be32(f); + while (status == ENCRYPTED_CPU_STATE_CONT) { + ret = sev_receive_update_vmsa(f); + if (ret) { + break; + } + + status = qemu_get_be32(f); + } + + return ret; +} + +static int +csv_load_launch_secret(const char *secret_header_file, const char *secret_file) +{ + gsize secret_header_size, secret_size; + gchar *secret_header = NULL, *secret = NULL; + uint8_t *data; + struct sev_secret_area *area; + uint64_t gpa; + GError *error = NULL; + Error *local_err = NULL; + int ret = 0; + + if (!g_file_get_contents(secret_header_file, + &secret_header, + &secret_header_size, &error)) { + error_report("CSV: Failed to read '%s' (%s)", + secret_header_file, error->message); + g_error_free(error); + return -1; + } + + if (!g_file_get_contents(secret_file, &secret, &secret_size, &error)) { + error_report("CSV: Failed to read '%s' 
(%s)", secret_file, error->message); + g_error_free(error); + return -1; + } + + if (!pc_system_ovmf_table_find(SEV_SECRET_GUID, &data, NULL)) { + error_report("CSV: no secret area found in OVMF, gpa must be" + " specified."); + return -1; + } + area = (struct sev_secret_area *)data; + gpa = area->base; + + ret = sev_inject_launch_secret((char *)secret_header, + (char *)secret, gpa, &local_err); + + if (local_err) { + error_report_err(local_err); + } + return ret; +} + +static const QemuUUID sev_hash_table_header_guid = { + .data = UUID_LE(0x9438d606, 0x4f22, 0x4cc9, 0xb4, 0x79, 0xa7, 0x93, + 0xd4, 0x11, 0xfd, 0x21) +}; + +static const QemuUUID sev_kernel_entry_guid = { + .data = UUID_LE(0x4de79437, 0xabd2, 0x427f, 0xb8, 0x35, 0xd5, 0xb1, + 0x72, 0xd2, 0x04, 0x5b) +}; +static const QemuUUID sev_initrd_entry_guid = { + .data = UUID_LE(0x44baf731, 0x3a2f, 0x4bd7, 0x9a, 0xf1, 0x41, 0xe2, + 0x91, 0x69, 0x78, 0x1d) +}; +static const QemuUUID sev_cmdline_entry_guid = { + .data = UUID_LE(0x97d02dd8, 0xbd20, 0x4c94, 0xaa, 0x78, 0xe7, 0x71, + 0x4d, 0x36, 0xab, 0x2a) +}; + +/* + * Add the hashes of the linux kernel/initrd/cmdline to an encrypted guest page + * which is included in SEV's initial memory measurement. + */ +bool sev_add_kernel_loader_hashes(SevKernelLoaderContext *ctx, Error **errp) +{ + uint8_t *data; + SevHashTableDescriptor *area; + SevHashTable *ht; + PaddedSevHashTable *padded_ht; + uint8_t cmdline_hash[HASH_SIZE]; + uint8_t initrd_hash[HASH_SIZE]; + uint8_t kernel_hash[HASH_SIZE]; + uint8_t *hashp; + size_t hash_len = HASH_SIZE; + hwaddr mapped_len = sizeof(*padded_ht); + MemTxAttrs attrs = { 0 }; + bool ret = true; + + /* + * Only add the kernel hashes if the sev-guest configuration explicitly + * stated kernel-hashes=on. + */ + if (!sev_guest->kernel_hashes) { + return false; + } + + if (!pc_system_ovmf_table_find(SEV_HASH_TABLE_RV_GUID, &data, NULL)) { + error_setg(errp, "SEV: kernel specified but guest firmware " + "has no hashes table GUID"); + return false; + } + area = (SevHashTableDescriptor *)data; + if (!area->base || area->size < sizeof(PaddedSevHashTable)) { + error_setg(errp, "SEV: guest firmware hashes table area is invalid " + "(base=0x%x size=0x%x)", area->base, area->size); + return false; + } + + /* + * Calculate hash of kernel command-line with the terminating null byte. If + * the user doesn't supply a command-line via -append, the 1-byte "\0" will + * be used. + */ + hashp = cmdline_hash; + if (qcrypto_hash_bytes(QCRYPTO_HASH_ALG_SHA256, ctx->cmdline_data, + ctx->cmdline_size, &hashp, &hash_len, errp) < 0) { + return false; + } + assert(hash_len == HASH_SIZE); /* * Calculate hash of initrd. 
If the user doesn't supply an initrd via @@ -1373,7 +2758,17 @@ bool sev_add_kernel_loader_hashes(SevKernelLoaderContext *ctx, Error **errp) /* zero the excess data so the measurement can be reliably calculated */ memset(padded_ht->padding, 0, sizeof(padded_ht->padding)); - if (sev_encrypt_flash((uint8_t *)padded_ht, sizeof(*padded_ht), errp) < 0) { + if (csv3_enabled()) { + if (kvm_hygon_coco_ext_inuse & KVM_CAP_HYGON_COCO_EXT_CSV3_MULT_LUP_DATA) { + if (csv3_load_data(area->base, (uint8_t *)padded_ht, + sizeof(*padded_ht), errp) < 0) { + ret = false; + } + } else { + error_report("%s: CSV3 load kernel hashes unsupported!", __func__); + ret = false; + } + } else if (sev_encrypt_flash((uint8_t *)padded_ht, sizeof(*padded_ht), errp) < 0) { ret = false; } @@ -1383,6 +2778,27 @@ bool sev_add_kernel_loader_hashes(SevKernelLoaderContext *ctx, Error **errp) return ret; } +static int _sev_send_start(QEMUFile *f, uint64_t *bytes_sent) +{ + SevGuestState *s = sev_guest; + + return sev_send_start(s, f, bytes_sent); +} + +static int _sev_receive_start(QEMUFile *f) +{ + SevGuestState *s = sev_guest; + + return sev_receive_start(s, f); +} + +struct sev_ops sev_ops = { + .sev_ioctl = sev_ioctl, + .fw_error_to_str = fw_error_to_str, + .sev_send_start = _sev_send_start, + .sev_receive_start = _sev_receive_start, +}; + static void sev_register_types(void) { diff --git a/target/i386/sev.h b/target/i386/sev.h index e7499c95b1e87cd6ecdff2c28009b65807b52946..647b426b16f4b4c4d003a8a59ac07f42f966f112 100644 --- a/target/i386/sev.h +++ b/target/i386/sev.h @@ -38,6 +38,13 @@ typedef struct SevKernelLoaderContext { size_t cmdline_size; } SevKernelLoaderContext; +#define RAM_SAVE_ENCRYPTED_PAGE 0x1 +#define RAM_SAVE_SHARED_REGIONS_LIST 0x2 + +#define RAM_SAVE_ENCRYPTED_PAGE_BATCH 0x4 +#define RAM_SAVE_ENCRYPTED_PAGE_BATCH_END 0x5 +#define RAM_SAVE_ENCRYPTED_CPU_STATE 0x6 + #ifdef CONFIG_SEV bool sev_enabled(void); bool sev_es_enabled(void); @@ -51,12 +58,35 @@ uint32_t sev_get_reduced_phys_bits(void); bool sev_add_kernel_loader_hashes(SevKernelLoaderContext *ctx, Error **errp); int sev_encrypt_flash(uint8_t *ptr, uint64_t len, Error **errp); +int sev_save_setup(const char *pdh, const char *plat_cert, + const char *amd_cert); +int sev_save_outgoing_page(QEMUFile *f, uint8_t *ptr, + uint32_t size, uint64_t *bytes_sent); +int sev_load_incoming_page(QEMUFile *f, uint8_t *ptr); int sev_inject_launch_secret(const char *hdr, const char *secret, uint64_t gpa, Error **errp); int sev_es_save_reset_vector(void *flash_ptr, uint64_t flash_size); void sev_es_set_reset_vector(CPUState *cpu); +int sev_remove_shared_regions_list(unsigned long gfn_start, + unsigned long gfn_end); +int sev_add_shared_regions_list(unsigned long gfn_start, unsigned long gfn_end); +int sev_save_outgoing_shared_regions_list(QEMUFile *f, uint64_t *bytes_sent); +int sev_load_incoming_shared_regions_list(QEMUFile *f); +bool sev_is_gfn_in_unshared_region(unsigned long gfn); +void sev_del_migrate_blocker(void); int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp); +extern bool sev_kvm_has_msr_ghcb; + +struct sev_ops { + int (*sev_ioctl)(int fd, int cmd, void *data, int *error); + const char *(*fw_error_to_str)(int code); + int (*sev_send_start)(QEMUFile *f, uint64_t *bytes_sent); + int (*sev_receive_start)(QEMUFile *f); +}; + +extern struct sev_ops sev_ops; + #endif diff --git a/target/i386/tcg/sysemu/excp_helper.c b/target/i386/tcg/sysemu/excp_helper.c index 5b86f439add2a920cd3ee0054e3bf329339854b5..294dbc50e2b0bcf73f1ddddea65f0ebb51db21b3 100644 --- 
a/target/i386/tcg/sysemu/excp_helper.c +++ b/target/i386/tcg/sysemu/excp_helper.c @@ -107,6 +107,10 @@ static bool ptw_setl_slow(const PTETranslate *in, uint32_t old, uint32_t new) { uint32_t cmp; + CPUState *cpu = env_cpu(in->env); + /* We are in cpu_exec, and start_exclusive can't be called directly.*/ + g_assert(cpu->running); + cpu_exec_end(cpu); /* Does x86 really perform a rmw cycle on mmio for ptw? */ start_exclusive(); cmp = cpu_ldl_mmuidx_ra(in->env, in->gaddr, in->ptw_idx, 0); @@ -114,6 +118,7 @@ static bool ptw_setl_slow(const PTETranslate *in, uint32_t old, uint32_t new) cpu_stl_mmuidx_ra(in->env, in->gaddr, new, in->ptw_idx, 0); } end_exclusive(); + cpu_exec_start(cpu); return cmp == old; } diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index 037bc47e7c27ca01e955aa92e9c3e660cd9a9a77..19b8250452c30fbde3c755675dfed39cf7558897 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -2661,7 +2661,7 @@ static void gen_enter(DisasContext *s, int esp_addend, int level) } /* Copy the FrameTemp value to EBP. */ - gen_op_mov_reg_v(s, a_ot, R_EBP, s->T1); + gen_op_mov_reg_v(s, d_ot, R_EBP, s->T1); /* Compute the final value of ESP. */ tcg_gen_subi_tl(s->T1, s->T1, esp_addend + size * level); @@ -2790,7 +2790,7 @@ do_gen_eob_worker(DisasContext *s, bool inhibit, bool recheck_tf, bool jr) if (recheck_tf) { gen_helper_rechecking_single_step(tcg_env); tcg_gen_exit_tb(NULL, 0); - } else if (s->flags & HF_TF_MASK) { + } else if ((s->flags & HF_TF_MASK) && !inhibit) { gen_helper_single_step(tcg_env); } else if (jr) { tcg_gen_lookup_and_goto_ptr(); diff --git a/target/i386/trace-events b/target/i386/trace-events index 2cd8726eebb7d42b70a14ba3c128290b428847da..5d4a709a39dedc3f4b0a6ad27f4e51fe25fcdac9 100644 --- a/target/i386/trace-events +++ b/target/i386/trace-events @@ -11,3 +11,19 @@ kvm_sev_launch_measurement(const char *value) "data %s" kvm_sev_launch_finish(void) "" kvm_sev_launch_secret(uint64_t hpa, uint64_t hva, uint64_t secret, int len) "hpa 0x%" PRIx64 " hva 0x%" PRIx64 " data 0x%" PRIx64 " len %d" kvm_sev_attestation_report(const char *mnonce, const char *data) "mnonce %s data %s" +kvm_sev_send_start(uint64_t pdh, int l1, uint64_t plat, int l2, uint64_t amd, int l3) "pdh 0x%" PRIx64 " len %d plat 0x%" PRIx64 " len %d amd 0x%" PRIx64 " len %d" +kvm_sev_send_update_data(void *src, void *dst, int len) "guest %p trans %p len %d" +kvm_sev_send_finish(void) "" +kvm_sev_receive_start(int policy, void *session, void *pdh) "policy 0x%x session %p pdh %p" +kvm_sev_receive_update_data(void *src, void *dst, int len, void *hdr, int hdr_len) "guest %p trans %p len %d hdr %p hdr_len %d" +kvm_sev_receive_finish(void) "" +kvm_sev_send_update_vmsa(uint32_t cpu_id, uint32_t cpu_index, void *dst, int len) "cpu_id %d cpu_index %d trans %p len %d" +kvm_sev_receive_update_vmsa(uint32_t cpu_id, uint32_t cpu_index, void *src, int len, void *hdr, int hdr_len) "cpu_id %d cpu_index %d trans %p len %d hdr %p hdr_len %d" + +# csv.c +kvm_csv3_launch_encrypt_data(uint64_t gpa, void *addr, uint64_t len) "gpa 0x%" PRIx64 " addr %p len 0x%" PRIx64 +kvm_csv3_send_encrypt_data(void *dst, int len) "trans %p len %d" +kvm_csv3_send_encrypt_context(void *dst, int len) "trans %p len %d" +kvm_csv3_receive_encrypt_data(void *dst, int len, void *hdr, int hdr_len) "trans %p len %d hdr %p hdr_len %d" +kvm_csv3_receive_encrypt_context(void *dst, int len, void *hdr, int hdr_len) "trans %p len %d hdr %p hdr_len %d" +kvm_csv3_set_guest_private_memory(void) "" diff --git 
a/target/loongarch/arch_dump.c b/target/loongarch/arch_dump.c
new file mode 100644
index 0000000000000000000000000000000000000000..d9e1120333ce1a345e0153118cd6f18f54e7308b
--- /dev/null
+++ b/target/loongarch/arch_dump.c
@@ -0,0 +1,163 @@
+/*
+ * Support for writing ELF notes for LoongArch architectures
+ *
+ * Copyright (c) 2023 Loongarch Technology
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2 or later, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "cpu.h"
+#include "elf.h"
+#include "sysemu/dump.h"
+#include "internals.h"
+
+/* struct user_pt_regs from arch/loongarch/include/uapi/asm/ptrace.h */
+struct loongarch_user_regs {
+    uint64_t gpr[32];
+    uint64_t pad1[1];
+    /* Special CSR registers. */
+    uint64_t csr_era;
+    uint64_t csr_badv;
+    uint64_t pad2[10];
+} QEMU_PACKED;
+
+QEMU_BUILD_BUG_ON(sizeof(struct loongarch_user_regs) != 360);
+
+/* struct elf_prstatus from include/uapi/linux/elfcore.h */
+struct loongarch_elf_prstatus {
+    char pad1[32]; /* 32 == offsetof(struct elf_prstatus, pr_pid) */
+    uint32_t pr_pid;
+    /*
+     * 76 == offsetof(struct elf_prstatus, pr_reg) -
+     * offsetof(struct elf_prstatus, pr_ppid)
+     */
+    char pad2[76];
+    struct loongarch_user_regs pr_reg;
+    uint32_t pr_fpvalid;
+    char pad3[4];
+} QEMU_PACKED;
+
+QEMU_BUILD_BUG_ON(sizeof(struct loongarch_elf_prstatus) != 480);
+
+/* struct user_fp_state from arch/loongarch/include/uapi/asm/ptrace.h */
+struct loongarch_fpu_struct {
+    uint64_t fpr[32];
+    uint64_t fcc;
+    unsigned int fcsr;
+} QEMU_PACKED;
+
+QEMU_BUILD_BUG_ON(sizeof(struct loongarch_fpu_struct) != 268);
+
+struct loongarch_note {
+    Elf64_Nhdr hdr;
+    char name[8]; /* align_up(sizeof("CORE"), 4) */
+    union {
+        struct loongarch_elf_prstatus prstatus;
+        struct loongarch_fpu_struct fpu;
+    };
+} QEMU_PACKED;
+
+#define LOONGARCH_NOTE_HEADER_SIZE offsetof(struct loongarch_note, prstatus)
+#define LOONGARCH_PRSTATUS_NOTE_SIZE \
+    (LOONGARCH_NOTE_HEADER_SIZE + sizeof(struct loongarch_elf_prstatus))
+#define LOONGARCH_PRFPREG_NOTE_SIZE \
+    (LOONGARCH_NOTE_HEADER_SIZE + sizeof(struct loongarch_fpu_struct))
+
+static void loongarch_note_init(struct loongarch_note *note, DumpState *s,
+                                const char *name, Elf64_Word namesz,
+                                Elf64_Word type, Elf64_Word descsz)
+{
+    memset(note, 0, sizeof(*note));
+
+    note->hdr.n_namesz = cpu_to_dump32(s, namesz);
+    note->hdr.n_descsz = cpu_to_dump32(s, descsz);
+    note->hdr.n_type = cpu_to_dump32(s, type);
+
+    memcpy(note->name, name, namesz);
+}
+
+static int loongarch_write_elf64_fprpreg(WriteCoreDumpFunction f,
+                                         CPULoongArchState *env, int cpuid,
+                                         DumpState *s)
+{
+    struct loongarch_note note;
+    int ret, i;
+
+    loongarch_note_init(&note, s, "CORE", 5, NT_PRFPREG, sizeof(note.fpu));
+    note.fpu.fcsr = cpu_to_dump64(s, env->fcsr0);
+    note.fpu.fcc = cpu_to_dump64(s, read_fcc(env));
+
+    for (i = 0; i < 32; ++i) {
+        note.fpu.fpr[i] = cpu_to_dump64(s, env->fpr[i].vreg.UD[0]);
+    }
+
+    ret = f(&note, LOONGARCH_PRFPREG_NOTE_SIZE, s);
+    if (ret < 0) {
+        return -1;
+    }
+
+    return 0;
+}
+
+int
loongarch_cpu_write_elf64_note(WriteCoreDumpFunction f, CPUState *cs,
+                                   int cpuid, DumpState *s)
+{
+    struct loongarch_note note;
+    CPULoongArchState *env = &LOONGARCH_CPU(cs)->env;
+    int ret, i;
+
+    loongarch_note_init(&note, s, "CORE", 5, NT_PRSTATUS,
+                        sizeof(note.prstatus));
+    note.prstatus.pr_pid = cpu_to_dump32(s, cpuid);
+    note.prstatus.pr_fpvalid = cpu_to_dump32(s, 1);
+
+    for (i = 0; i < 32; ++i) {
+        note.prstatus.pr_reg.gpr[i] = cpu_to_dump64(s, env->gpr[i]);
+    }
+    note.prstatus.pr_reg.csr_era = cpu_to_dump64(s, env->CSR_ERA);
+    note.prstatus.pr_reg.csr_badv = cpu_to_dump64(s, env->CSR_BADV);
+    ret = f(&note, LOONGARCH_PRSTATUS_NOTE_SIZE, s);
+    if (ret < 0) {
+        return -1;
+    }
+
+    ret = loongarch_write_elf64_fprpreg(f, env, cpuid, s);
+    if (ret < 0) {
+        return -1;
+    }
+
+    return ret;
+}
+
+int cpu_get_dump_info(ArchDumpInfo *info,
+                      const GuestPhysBlockList *guest_phys_blocks)
+{
+    info->d_machine = EM_LOONGARCH;
+    info->d_endian = ELFDATA2LSB;
+    info->d_class = ELFCLASS64;
+
+    return 0;
+}
+
+ssize_t cpu_get_note_size(int class, int machine, int nr_cpus)
+{
+    size_t note_size = 0;
+
+    if (class == ELFCLASS64) {
+        note_size = LOONGARCH_PRSTATUS_NOTE_SIZE + LOONGARCH_PRFPREG_NOTE_SIZE;
+    }
+
+    return note_size * nr_cpus;
+}
diff --git a/target/loongarch/cpu-csr.h b/target/loongarch/cpu-csr.h
index c59d7a9fcbc4c570d7b0c410e7c05768cc39d24f..0834e91f30e3bfcfd894f8e7e1e49e930f7025bb 100644
--- a/target/loongarch/cpu-csr.h
+++ b/target/loongarch/cpu-csr.h
@@ -67,6 +67,9 @@ FIELD(TLBENTRY, D, 1, 1)
 FIELD(TLBENTRY, PLV, 2, 2)
 FIELD(TLBENTRY, MAT, 4, 2)
 FIELD(TLBENTRY, G, 6, 1)
+FIELD(TLBENTRY, HUGE, 6, 1)
+FIELD(TLBENTRY, HGLOBAL, 12, 1)
+FIELD(TLBENTRY, LEVEL, 13, 2)
 FIELD(TLBENTRY_32, PPN, 8, 24)
 FIELD(TLBENTRY_64, PPN, 12, 36)
 FIELD(TLBENTRY_64, NR, 61, 1)
diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c
index fc075952e6356060fe46b8530ea1267765d7a867..561566f3a04971760acea2c48c6f1ec325b706e6 100644
--- a/target/loongarch/cpu.c
+++ b/target/loongarch/cpu.c
@@ -8,18 +8,29 @@
 #include "qemu/osdep.h"
 #include "qemu/log.h"
 #include "qemu/qemu-print.h"
+#include "qemu/error-report.h"
 #include "qapi/error.h"
 #include "qemu/module.h"
 #include "sysemu/qtest.h"
-#include "exec/cpu_ldst.h"
+#include "sysemu/tcg.h"
+#include "sysemu/kvm.h"
+#include "kvm/kvm_loongarch.h"
 #include "exec/exec-all.h"
 #include "cpu.h"
+#include "hw/qdev-properties.h"
 #include "internals.h"
 #include "fpu/softfloat-helpers.h"
 #include "cpu-csr.h"
+#include "qapi/visitor.h"
 #include "sysemu/reset.h"
-#include "tcg/tcg.h"
 #include "vec.h"
+#ifdef CONFIG_KVM
+#include <linux/kvm.h>
+#endif
+#ifdef CONFIG_TCG
+#include "exec/cpu_ldst.h"
+#include "tcg/tcg.h"
+#endif
 
 const char * const regnames[32] = {
     "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7",
@@ -35,33 +46,45 @@ const char * const fregnames[32] = {
     "f24", "f25", "f26", "f27", "f28", "f29", "f30", "f31",
 };
 
-static const char * const excp_names[] = {
-    [EXCCODE_INT] = "Interrupt",
-    [EXCCODE_PIL] = "Page invalid exception for load",
-    [EXCCODE_PIS] = "Page invalid exception for store",
-    [EXCCODE_PIF] = "Page invalid exception for fetch",
-    [EXCCODE_PME] = "Page modified exception",
-    [EXCCODE_PNR] = "Page Not Readable exception",
-    [EXCCODE_PNX] = "Page Not Executable exception",
-    [EXCCODE_PPI] = "Page Privilege error",
-    [EXCCODE_ADEF] = "Address error for instruction fetch",
-    [EXCCODE_ADEM] = "Address error for Memory access",
-    [EXCCODE_SYS] = "Syscall",
-    [EXCCODE_BRK] = "Break",
-    [EXCCODE_INE] = "Instruction Non-Existent",
-    [EXCCODE_IPE] = "Instruction privilege
error", - [EXCCODE_FPD] = "Floating Point Disabled", - [EXCCODE_FPE] = "Floating Point Exception", - [EXCCODE_DBP] = "Debug breakpoint", - [EXCCODE_BCE] = "Bound Check Exception", - [EXCCODE_SXD] = "128 bit vector instructions Disable exception", - [EXCCODE_ASXD] = "256 bit vector instructions Disable exception", +struct TypeExcp { + int32_t exccode; + const char * const name; +}; + +static const struct TypeExcp excp_names[] = { + {EXCCODE_INT, "Interrupt"}, + {EXCCODE_PIL, "Page invalid exception for load"}, + {EXCCODE_PIS, "Page invalid exception for store"}, + {EXCCODE_PIF, "Page invalid exception for fetch"}, + {EXCCODE_PME, "Page modified exception"}, + {EXCCODE_PNR, "Page Not Readable exception"}, + {EXCCODE_PNX, "Page Not Executable exception"}, + {EXCCODE_PPI, "Page Privilege error"}, + {EXCCODE_ADEF, "Address error for instruction fetch"}, + {EXCCODE_ADEM, "Address error for Memory access"}, + {EXCCODE_SYS, "Syscall"}, + {EXCCODE_BRK, "Break"}, + {EXCCODE_INE, "Instruction Non-Existent"}, + {EXCCODE_IPE, "Instruction privilege error"}, + {EXCCODE_FPD, "Floating Point Disabled"}, + {EXCCODE_FPE, "Floating Point Exception"}, + {EXCCODE_DBP, "Debug breakpoint"}, + {EXCCODE_BCE, "Bound Check Exception"}, + {EXCCODE_SXD, "128 bit vector instructions Disable exception"}, + {EXCCODE_ASXD, "256 bit vector instructions Disable exception"}, + {EXCP_HLT, "EXCP_HLT"}, }; const char *loongarch_exception_name(int32_t exception) { - assert(excp_names[exception]); - return excp_names[exception]; + int i; + + for (i = 0; i < ARRAY_SIZE(excp_names); i++) { + if (excp_names[i].exccode == exception) { + return excp_names[i].name; + } + } + return "Unknown"; } void G_NORETURN do_raise_exception(CPULoongArchState *env, @@ -70,7 +93,7 @@ void G_NORETURN do_raise_exception(CPULoongArchState *env, { CPUState *cs = env_cpu(env); - qemu_log_mask(CPU_LOG_INT, "%s: %d (%s)\n", + qemu_log_mask(CPU_LOG_INT, "%s: exception: %d (%s)\n", __func__, exception, loongarch_exception_name(exception)); @@ -108,12 +131,15 @@ void loongarch_cpu_set_irq(void *opaque, int irq, int level) return; } - env->CSR_ESTAT = deposit64(env->CSR_ESTAT, irq, 1, level != 0); - - if (FIELD_EX64(env->CSR_ESTAT, CSR_ESTAT, IS)) { - cpu_interrupt(cs, CPU_INTERRUPT_HARD); - } else { - cpu_reset_interrupt(cs, CPU_INTERRUPT_HARD); + if (kvm_enabled()) { + kvm_loongarch_set_interrupt(cpu, irq, level); + } else if (tcg_enabled()) { + env->CSR_ESTAT = deposit64(env->CSR_ESTAT, irq, 1, level != 0); + if (FIELD_EX64(env->CSR_ESTAT, CSR_ESTAT, IS)) { + cpu_interrupt(cs, CPU_INTERRUPT_HARD); + } else { + cpu_reset_interrupt(cs, CPU_INTERRUPT_HARD); + } } } @@ -138,29 +164,26 @@ static inline bool cpu_loongarch_hw_interrupts_pending(CPULoongArchState *env) return (pending & status) != 0; } +#endif +#ifdef CONFIG_TCG +#ifndef CONFIG_USER_ONLY static void loongarch_cpu_do_interrupt(CPUState *cs) { LoongArchCPU *cpu = LOONGARCH_CPU(cs); CPULoongArchState *env = &cpu->env; bool update_badinstr = 1; int cause = -1; - const char *name; bool tlbfill = FIELD_EX64(env->CSR_TLBRERA, CSR_TLBRERA, ISTLBR); uint32_t vec_size = FIELD_EX64(env->CSR_ECFG, CSR_ECFG, VS); if (cs->exception_index != EXCCODE_INT) { - if (cs->exception_index < 0 || - cs->exception_index >= ARRAY_SIZE(excp_names)) { - name = "unknown"; - } else { - name = excp_names[cs->exception_index]; - } - qemu_log_mask(CPU_LOG_INT, "%s enter: pc " TARGET_FMT_lx " ERA " TARGET_FMT_lx - " TLBRERA " TARGET_FMT_lx " %s exception\n", __func__, - env->pc, env->CSR_ERA, env->CSR_TLBRERA, name); + " TLBRERA " 
TARGET_FMT_lx " exception: %d (%s)\n", + __func__, env->pc, env->CSR_ERA, env->CSR_TLBRERA, + cs->exception_index, + loongarch_exception_name(cs->exception_index)); } switch (cs->exception_index) { @@ -320,7 +343,6 @@ static bool loongarch_cpu_exec_interrupt(CPUState *cs, int interrupt_request) } #endif -#ifdef CONFIG_TCG static void loongarch_cpu_synchronize_from_tb(CPUState *cs, const TranslationBlock *tb) { @@ -407,6 +429,14 @@ static void loongarch_la464_initfn(Object *obj) data = FIELD_DP32(data, CPUCFG5, CC_DIV, 1); env->cpucfg[5] = data; + if (kvm_enabled()) { + data = 0; + data = FIELD_DP32(data, CPUCFG6, PMP, 1); + data = FIELD_DP32(data, CPUCFG6, PMNUM, 3); + data = FIELD_DP32(data, CPUCFG6, PMBITS, 63); + env->cpucfg[6] = data; + } + data = 0; data = FIELD_DP32(data, CPUCFG16, L1_IUPRE, 1); data = FIELD_DP32(data, CPUCFG16, L1_DPRE, 1); @@ -443,6 +473,18 @@ static void loongarch_la464_initfn(Object *obj) env->cpucfg[20] = data; env->CSR_ASID = FIELD_DP64(0, CSR_ASID, ASIDBITS, 0xa); + + env->CSR_PRCFG1 = FIELD_DP64(env->CSR_PRCFG1, CSR_PRCFG1, SAVE_NUM, 8); + env->CSR_PRCFG1 = FIELD_DP64(env->CSR_PRCFG1, CSR_PRCFG1, TIMER_BITS, 0x2f); + env->CSR_PRCFG1 = FIELD_DP64(env->CSR_PRCFG1, CSR_PRCFG1, VSMAX, 7); + + env->CSR_PRCFG2 = 0x3ffff000; + + env->CSR_PRCFG3 = FIELD_DP64(env->CSR_PRCFG3, CSR_PRCFG3, TLB_TYPE, 2); + env->CSR_PRCFG3 = FIELD_DP64(env->CSR_PRCFG3, CSR_PRCFG3, MTLB_ENTRY, 63); + env->CSR_PRCFG3 = FIELD_DP64(env->CSR_PRCFG3, CSR_PRCFG3, STLB_WAYS, 7); + env->CSR_PRCFG3 = FIELD_DP64(env->CSR_PRCFG3, CSR_PRCFG3, STLB_SETS, 8); + loongarch_cpu_post_init(obj); } @@ -507,17 +549,19 @@ static void loongarch_cpu_reset_hold(Object *obj) lacc->parent_phases.hold(obj); } +#ifdef CONFIG_TCG env->fcsr0_mask = FCSR0_M1 | FCSR0_M2 | FCSR0_M3; +#endif env->fcsr0 = 0x0; int n; - /* Set csr registers value after reset */ + /* Set csr registers value after reset, see the manual 6.4. 
+     */
     env->CSR_CRMD = FIELD_DP64(env->CSR_CRMD, CSR_CRMD, PLV, 0);
     env->CSR_CRMD = FIELD_DP64(env->CSR_CRMD, CSR_CRMD, IE, 0);
     env->CSR_CRMD = FIELD_DP64(env->CSR_CRMD, CSR_CRMD, DA, 1);
     env->CSR_CRMD = FIELD_DP64(env->CSR_CRMD, CSR_CRMD, PG, 0);
-    env->CSR_CRMD = FIELD_DP64(env->CSR_CRMD, CSR_CRMD, DATF, 1);
-    env->CSR_CRMD = FIELD_DP64(env->CSR_CRMD, CSR_CRMD, DATM, 1);
+    env->CSR_CRMD = FIELD_DP64(env->CSR_CRMD, CSR_CRMD, DATF, 0);
+    env->CSR_CRMD = FIELD_DP64(env->CSR_CRMD, CSR_CRMD, DATM, 0);
 
     env->CSR_EUEN = FIELD_DP64(env->CSR_EUEN, CSR_EUEN, FPE, 0);
     env->CSR_EUEN = FIELD_DP64(env->CSR_EUEN, CSR_EUEN, SXE, 0);
@@ -529,17 +573,28 @@
     env->CSR_ECFG = FIELD_DP64(env->CSR_ECFG, CSR_ECFG, VS, 0);
     env->CSR_ECFG = FIELD_DP64(env->CSR_ECFG, CSR_ECFG, LIE, 0);
 
-    env->CSR_ESTAT = env->CSR_ESTAT & (~MAKE_64BIT_MASK(0, 2));
+    env->CSR_ESTAT = 0;
 
     env->CSR_RVACFG = FIELD_DP64(env->CSR_RVACFG, CSR_RVACFG, RBITS, 0);
+    env->CSR_CPUID = cs->cpu_index;
     env->CSR_TCFG = FIELD_DP64(env->CSR_TCFG, CSR_TCFG, EN, 0);
     env->CSR_LLBCTL = FIELD_DP64(env->CSR_LLBCTL, CSR_LLBCTL, KLO, 0);
     env->CSR_TLBRERA = FIELD_DP64(env->CSR_TLBRERA, CSR_TLBRERA, ISTLBR, 0);
     env->CSR_MERRCTL = FIELD_DP64(env->CSR_MERRCTL, CSR_MERRCTL, ISMERR, 0);
-
-    env->CSR_PRCFG3 = FIELD_DP64(env->CSR_PRCFG3, CSR_PRCFG3, TLB_TYPE, 2);
-    env->CSR_PRCFG3 = FIELD_DP64(env->CSR_PRCFG3, CSR_PRCFG3, MTLB_ENTRY, 63);
-    env->CSR_PRCFG3 = FIELD_DP64(env->CSR_PRCFG3, CSR_PRCFG3, STLB_WAYS, 7);
-    env->CSR_PRCFG3 = FIELD_DP64(env->CSR_PRCFG3, CSR_PRCFG3, STLB_SETS, 8);
+    env->CSR_TID = cs->cpu_index;
+    /*
+     * Workaround for edk2-stable202408: the CSR PGD register is set only
+     * if its current value is zero for the boot CPU, which causes a
+     * reboot issue.
+     *
+     * So clear the TLB-related CSR registers here.
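+     * That way a warm reboot starts from the same state as a cold boot
+     * and the firmware initializes the PGD (and the rest) again.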
+ */ + env->CSR_PGDH = 0; + env->CSR_PGDL = 0; + env->CSR_PWCL = 0; + env->CSR_PWCH = 0; + env->CSR_STLBPS = 0; + env->CSR_EENTRY = 0; + env->CSR_TLBRENTRY = 0; + env->CSR_MERRENTRY = 0; for (n = 0; n < 4; n++) { env->CSR_DMW[n] = FIELD_DP64(env->CSR_DMW[n], CSR_DMW, PLV0, 0); @@ -550,10 +605,17 @@ static void loongarch_cpu_reset_hold(Object *obj) #ifndef CONFIG_USER_ONLY env->pc = 0x1c000000; +#ifdef CONFIG_TCG memset(env->tlb, 0, sizeof(env->tlb)); #endif + if (kvm_enabled()) { + kvm_arch_reset_vcpu(cs); + } +#endif +#ifdef CONFIG_TCG restore_fp_status(env); +#endif cs->exception_index = -1; } @@ -576,53 +638,23 @@ static void loongarch_cpu_realizefn(DeviceState *dev, Error **errp) loongarch_cpu_register_gdb_regs_for_features(cs); - cpu_reset(cs); qemu_init_vcpu(cs); + cpu_reset(cs); lacc->parent_realize(dev, errp); } -#ifndef CONFIG_USER_ONLY -static void loongarch_qemu_write(void *opaque, hwaddr addr, - uint64_t val, unsigned size) -{ - qemu_log_mask(LOG_UNIMP, "[%s]: Unimplemented reg 0x%" HWADDR_PRIx "\n", - __func__, addr); -} - -static uint64_t loongarch_qemu_read(void *opaque, hwaddr addr, unsigned size) -{ - switch (addr) { - case VERSION_REG: - return 0x11ULL; - case FEATURE_REG: - return 1ULL << IOCSRF_MSI | 1ULL << IOCSRF_EXTIOI | - 1ULL << IOCSRF_CSRIPI; - case VENDOR_REG: - return 0x6e6f73676e6f6f4cULL; /* "Loongson" */ - case CPUNAME_REG: - return 0x303030354133ULL; /* "3A5000" */ - case MISC_FUNC_REG: - return 1ULL << IOCSRM_EXTIOI_EN; - } - return 0ULL; -} +static void loongarch_cpu_unrealizefn(DeviceState *dev) +{ + LoongArchCPUClass *mcc = LOONGARCH_CPU_GET_CLASS(dev); -static const MemoryRegionOps loongarch_qemu_ops = { - .read = loongarch_qemu_read, - .write = loongarch_qemu_write, - .endianness = DEVICE_LITTLE_ENDIAN, - .valid = { - .min_access_size = 4, - .max_access_size = 8, - }, - .impl = { - .min_access_size = 8, - .max_access_size = 8, - }, -}; +#ifndef CONFIG_USER_ONLY + cpu_remove_sync(CPU(dev)); #endif + mcc->parent_unrealize(dev); +} + static bool loongarch_get_lsx(Object *obj, Error **errp) { LoongArchCPU *cpu = LOONGARCH_CPU(obj); @@ -675,17 +707,54 @@ static void loongarch_set_lasx(Object *obj, bool value, Error **errp) } } +static bool loongarch_get_lbt(Object *obj, Error **errp) +{ + return LOONGARCH_CPU(obj)->lbt != ON_OFF_AUTO_OFF; +} + +static void loongarch_set_lbt(Object *obj, bool value, Error **errp) +{ + LoongArchCPU *cpu = LOONGARCH_CPU(obj); + + cpu->lbt = value ? ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF; +} + +static bool loongarch_get_pmu(Object *obj, Error **errp) +{ + return LOONGARCH_CPU(obj)->pmu != ON_OFF_AUTO_OFF; +} + +static void loongarch_set_pmu(Object *obj, bool value, Error **errp) +{ + LoongArchCPU *cpu = LOONGARCH_CPU(obj); + + cpu->pmu = value ? 
ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF;
+}
+
 void loongarch_cpu_post_init(Object *obj)
 {
     LoongArchCPU *cpu = LOONGARCH_CPU(obj);
 
-    if (FIELD_EX32(cpu->env.cpucfg[2], CPUCFG2, LSX)) {
-        object_property_add_bool(obj, "lsx", loongarch_get_lsx,
-                                 loongarch_set_lsx);
-    }
-    if (FIELD_EX32(cpu->env.cpucfg[2], CPUCFG2, LASX)) {
-        object_property_add_bool(obj, "lasx", loongarch_get_lasx,
-                                 loongarch_set_lasx);
+    object_property_add_bool(obj, "lsx", loongarch_get_lsx,
+                             loongarch_set_lsx);
+    object_property_add_bool(obj, "lasx", loongarch_get_lasx,
+                             loongarch_set_lasx);
+
+    if (kvm_enabled()) {
+        cpu->lbt = ON_OFF_AUTO_AUTO;
+        object_property_add_bool(obj, "lbt", loongarch_get_lbt,
+                                 loongarch_set_lbt);
+        object_property_set_description(obj, "lbt",
+                                        "Set off to disable Binary Translation.");
+
+        cpu->pmu = ON_OFF_AUTO_AUTO;
+        object_property_add_bool(obj, "pmu", loongarch_get_pmu,
+                                 loongarch_set_pmu);
+        object_property_set_description(obj, "pmu",
+                                        "Set off to disable performance monitor unit.");
+
+    } else {
+        cpu->lbt = ON_OFF_AUTO_OFF;
     }
 }
 
@@ -693,17 +762,12 @@
 static void loongarch_cpu_init(Object *obj)
 {
 #ifndef CONFIG_USER_ONLY
     LoongArchCPU *cpu = LOONGARCH_CPU(obj);
-    CPULoongArchState *env = &cpu->env;
 
     qdev_init_gpio_in(DEVICE(cpu), loongarch_cpu_set_irq, N_IRQS);
+#ifdef CONFIG_TCG
     timer_init_ns(&cpu->timer, QEMU_CLOCK_VIRTUAL,
                   &loongarch_constant_timer_cb, cpu);
-    memory_region_init_io(&env->system_iocsr, OBJECT(cpu), NULL,
-                          env, "iocsr", UINT64_MAX);
-    address_space_init(&env->address_space_iocsr, &env->system_iocsr, "IOCSR");
-    memory_region_init_io(&env->iocsr_mem, OBJECT(cpu), &loongarch_qemu_ops,
-                          NULL, "iocsr_misc", 0x428);
-    memory_region_add_subregion(&env->system_iocsr, 0, &env->iocsr_mem);
+#endif
 #endif
 }
 
@@ -734,8 +798,7 @@ void loongarch_cpu_dump_state(CPUState *cs, FILE *f, int flags)
     int i;
 
     qemu_fprintf(f, " PC=%016" PRIx64 " ", env->pc);
-    qemu_fprintf(f, " FCSR0 0x%08x fp_status 0x%02x\n", env->fcsr0,
-                 get_float_exception_flags(&env->fp_status));
+    qemu_fprintf(f, " FCSR0 0x%08x\n", env->fcsr0);
 
     /* gpr */
     for (i = 0; i < 32; i++) {
@@ -758,10 +821,12 @@
     qemu_fprintf(f, "EENTRY=%016" PRIx64 "\n", env->CSR_EENTRY);
     qemu_fprintf(f, "PRCFG1=%016" PRIx64 ", PRCFG2=%016" PRIx64 ","
                  " PRCFG3=%016" PRIx64 "\n",
-                 env->CSR_PRCFG1, env->CSR_PRCFG3, env->CSR_PRCFG3);
+                 env->CSR_PRCFG1, env->CSR_PRCFG2, env->CSR_PRCFG3);
     qemu_fprintf(f, "TLBRENTRY=%016" PRIx64 "\n", env->CSR_TLBRENTRY);
     qemu_fprintf(f, "TLBRBADV=%016" PRIx64 "\n", env->CSR_TLBRBADV);
     qemu_fprintf(f, "TLBRERA=%016" PRIx64 "\n", env->CSR_TLBRERA);
+    qemu_fprintf(f, "TCFG=%016" PRIx64 "\n", env->CSR_TCFG);
+    qemu_fprintf(f, "TVAL=%016" PRIx64 "\n", env->CSR_TVAL);
 
     /* fpr */
     if (flags & CPU_DUMP_FPU) {
@@ -795,6 +860,7 @@ static struct TCGCPUOps loongarch_tcg_ops = {
 #include "hw/core/sysemu-cpu-ops.h"
 
 static const struct SysemuCPUOps loongarch_sysemu_ops = {
+    .write_elf64_note = loongarch_cpu_write_elf64_note,
     .get_phys_page_debug = loongarch_cpu_get_phys_page_debug,
 };
 
@@ -806,6 +872,15 @@ static int64_t loongarch_cpu_get_arch_id(CPUState *cs)
 {
 }
 #endif
 
+static Property loongarch_cpu_properties[] = {
+    DEFINE_PROP_INT32("socket-id", LoongArchCPU, socket_id, 0),
+    DEFINE_PROP_INT32("core-id", LoongArchCPU, core_id, 0),
+    DEFINE_PROP_INT32("thread-id", LoongArchCPU, thread_id, 0),
+    DEFINE_PROP_INT32("node-id", LoongArchCPU, node_id, CPU_UNSET_NUMA_NODE_ID),
+
+    DEFINE_PROP_END_OF_LIST()
+};
+
 static void loongarch_cpu_class_init(ObjectClass *c, void *data)
 {
LoongArchCPUClass *lacc = LOONGARCH_CPU_CLASS(c); @@ -813,8 +888,11 @@ static void loongarch_cpu_class_init(ObjectClass *c, void *data) DeviceClass *dc = DEVICE_CLASS(c); ResettableClass *rc = RESETTABLE_CLASS(c); + device_class_set_props(dc, loongarch_cpu_properties); device_class_set_parent_realize(dc, loongarch_cpu_realizefn, &lacc->parent_realize); + device_class_set_parent_unrealize(dc, loongarch_cpu_unrealizefn, + &lacc->parent_unrealize); resettable_class_set_parent_phases(rc, NULL, loongarch_cpu_reset_hold, NULL, &lacc->parent_phases); @@ -836,6 +914,7 @@ static void loongarch_cpu_class_init(ObjectClass *c, void *data) #ifdef CONFIG_TCG cc->tcg_ops = &loongarch_tcg_ops; #endif + dc->user_creatable = true; } static const gchar *loongarch32_gdb_arch_name(CPUState *cs) diff --git a/target/loongarch/cpu.h b/target/loongarch/cpu.h index 00d1fba597f8eb791d99bc937115515e58eba610..6cc717c5eaffb1dec9bec4917acb66c55bb4524b 100644 --- a/target/loongarch/cpu.h +++ b/target/loongarch/cpu.h @@ -18,6 +18,7 @@ #endif #include "cpu-csr.h" #include "cpu-qom.h" +#include "qapi/qapi-types-common.h" #define IOCSRF_TEMP 0 #define IOCSRF_NODECNT 1 @@ -36,6 +37,7 @@ #define CPUNAME_REG 0x20 #define MISC_FUNC_REG 0x420 #define IOCSRM_EXTIOI_EN 48 +#define IOCSRM_EXTIOI_INT_ENCODE 49 #define IOCSR_MEM_SIZE 0x428 @@ -154,6 +156,7 @@ FIELD(CPUCFG2, LLFTP_VER, 15, 3) FIELD(CPUCFG2, LBT_X86, 18, 1) FIELD(CPUCFG2, LBT_ARM, 19, 1) FIELD(CPUCFG2, LBT_MIPS, 20, 1) +FIELD(CPUCFG2, LBT_ALL, 18, 3) FIELD(CPUCFG2, LSPW, 21, 1) FIELD(CPUCFG2, LAM, 22, 1) @@ -185,6 +188,8 @@ FIELD(CPUCFG6, PMNUM, 4, 4) FIELD(CPUCFG6, PMBITS, 8, 6) FIELD(CPUCFG6, UPM, 14, 1) +#define PMNUM_MAX 16 + /* cpucfg[16] bits */ FIELD(CPUCFG16, L1_IUPRE, 0, 1) FIELD(CPUCFG16, L1_IUUNIFY, 1, 1) @@ -272,6 +277,7 @@ union fpr_t { VReg vreg; }; +#ifdef CONFIG_TCG struct LoongArchTLB { uint64_t tlb_misc; /* Fields corresponding to CSR_TLBELO0/1 */ @@ -279,23 +285,35 @@ struct LoongArchTLB { uint64_t tlb_entry1; }; typedef struct LoongArchTLB LoongArchTLB; +#endif + +enum loongarch_features { + LOONGARCH_FEATURE_LBT, /* loongson binary translation extension */ + LOONGARCH_FEATURE_PMU, +}; + +typedef struct LoongArchBT { + /* scratch registers */ + uint64_t scr0; + uint64_t scr1; + uint64_t scr2; + uint64_t scr3; + /* loongarch eflags */ + uint32_t eflags; + uint32_t ftop; +} lbt_t; typedef struct CPUArchState { uint64_t gpr[32]; uint64_t pc; fpr_t fpr[32]; - float_status fp_status; bool cf[8]; - uint32_t fcsr0; - uint32_t fcsr0_mask; + lbt_t lbt; uint32_t cpucfg[21]; - uint64_t lladdr; /* LL virtual address compared against SC */ - uint64_t llval; - /* LoongArch CSRs */ uint64_t CSR_CRMD; uint64_t CSR_PRMD; @@ -319,6 +337,7 @@ typedef struct CPUArchState { uint64_t CSR_PWCH; uint64_t CSR_STLBPS; uint64_t CSR_RVACFG; + uint64_t CSR_CPUID; uint64_t CSR_PRCFG1; uint64_t CSR_PRCFG2; uint64_t CSR_PRCFG3; @@ -350,21 +369,41 @@ typedef struct CPUArchState { uint64_t CSR_DBG; uint64_t CSR_DERA; uint64_t CSR_DSAVE; - uint64_t CSR_CPUID; + struct { + uint64_t guest_addr; + } stealtime; +#ifdef CONFIG_TCG + float_status fp_status; + uint32_t fcsr0_mask; + uint64_t lladdr; /* LL virtual address compared against SC */ + uint64_t llval; +#endif #ifndef CONFIG_USER_ONLY +#ifdef CONFIG_TCG LoongArchTLB tlb[LOONGARCH_TLB_MAX]; +#endif - AddressSpace address_space_iocsr; - MemoryRegion system_iocsr; - MemoryRegion iocsr_mem; + AddressSpace *address_space_iocsr; bool load_elf; uint64_t elf_address; + uint32_t mp_state; /* Store ipistate to access from this struct */ 
DeviceState *ipistate; + + struct loongarch_boot_info *boot_info; #endif + struct { + uint64_t guest_addr; + } st; } CPULoongArchState; +typedef struct LoongArchCPUTopo { + int32_t socket_id; /* socket-id of this VCPU */ + int32_t core_id; /* core-id of this VCPU */ + int32_t thread_id; /* thread-id of this VCPU */ +} LoongArchCPUTopo; + /** * LoongArchCPU: * @env: #CPULoongArchState @@ -377,9 +416,18 @@ struct ArchCPU { CPULoongArchState env; QEMUTimer timer; uint32_t phy_id; + OnOffAuto lbt; + OnOffAuto pmu; + int32_t socket_id; /* socket-id of this VCPU */ + int32_t core_id; /* core-id of this VCPU */ + int32_t thread_id; /* thread-id of this VCPU */ + int32_t node_id; /* NUMA node this CPU belongs to */ /* 'compatible' string for this CPU for Linux device trees */ const char *dtb_compatible; + /* used by KVM_REG_LOONGARCH_COUNTER ioctl to access guest time counters */ + uint64_t kvm_state_counter; + VMChangeStateEntry *vmsentry; }; /** @@ -393,6 +441,7 @@ struct LoongArchCPUClass { CPUClass parent_class; DeviceRealize parent_realize; + DeviceUnrealize parent_unrealize; ResettablePhases parent_phases; }; diff --git a/target/loongarch/cpu_helper.c b/target/loongarch/cpu_helper.c new file mode 100644 index 0000000000000000000000000000000000000000..39037eecb4bd566b838e808ad2aa3875f1eebd02 --- /dev/null +++ b/target/loongarch/cpu_helper.c @@ -0,0 +1,240 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * LoongArch CPU helpers for qemu + * + * Copyright (c) 2024 Loongson Technology Corporation Limited + * + */ + +#include "qemu/osdep.h" +#include "cpu.h" +#include "internals.h" +#include "cpu-csr.h" + +#ifdef CONFIG_TCG +static int loongarch_map_tlb_entry(CPULoongArchState *env, hwaddr *physical, + int *prot, target_ulong address, + int access_type, int index, int mmu_idx) +{ + LoongArchTLB *tlb = &env->tlb[index]; + uint64_t plv = mmu_idx; + uint64_t tlb_entry, tlb_ppn; + uint8_t tlb_ps, n, tlb_v, tlb_d, tlb_plv, tlb_nx, tlb_nr, tlb_rplv; + + if (index >= LOONGARCH_STLB) { + tlb_ps = FIELD_EX64(tlb->tlb_misc, TLB_MISC, PS); + } else { + tlb_ps = FIELD_EX64(env->CSR_STLBPS, CSR_STLBPS, PS); + } + n = (address >> tlb_ps) & 0x1;/* Odd or even */ + + tlb_entry = n ? 
tlb->tlb_entry1 : tlb->tlb_entry0; + tlb_v = FIELD_EX64(tlb_entry, TLBENTRY, V); + tlb_d = FIELD_EX64(tlb_entry, TLBENTRY, D); + tlb_plv = FIELD_EX64(tlb_entry, TLBENTRY, PLV); + if (is_la64(env)) { + tlb_ppn = FIELD_EX64(tlb_entry, TLBENTRY_64, PPN); + tlb_nx = FIELD_EX64(tlb_entry, TLBENTRY_64, NX); + tlb_nr = FIELD_EX64(tlb_entry, TLBENTRY_64, NR); + tlb_rplv = FIELD_EX64(tlb_entry, TLBENTRY_64, RPLV); + } else { + tlb_ppn = FIELD_EX64(tlb_entry, TLBENTRY_32, PPN); + tlb_nx = 0; + tlb_nr = 0; + tlb_rplv = 0; + } + + /* Remove sw bit between bit12 -- bit PS*/ + tlb_ppn = tlb_ppn & ~(((0x1UL << (tlb_ps - 12)) -1)); + + /* Check access rights */ + if (!tlb_v) { + return TLBRET_INVALID; + } + + if (access_type == MMU_INST_FETCH && tlb_nx) { + return TLBRET_XI; + } + + if (access_type == MMU_DATA_LOAD && tlb_nr) { + return TLBRET_RI; + } + + if (((tlb_rplv == 0) && (plv > tlb_plv)) || + ((tlb_rplv == 1) && (plv != tlb_plv))) { + return TLBRET_PE; + } + + if ((access_type == MMU_DATA_STORE) && !tlb_d) { + return TLBRET_DIRTY; + } + + *physical = (tlb_ppn << R_TLBENTRY_64_PPN_SHIFT) | + (address & MAKE_64BIT_MASK(0, tlb_ps)); + *prot = PAGE_READ; + if (tlb_d) { + *prot |= PAGE_WRITE; + } + if (!tlb_nx) { + *prot |= PAGE_EXEC; + } + return TLBRET_MATCH; +} + +/* + * One tlb entry holds an adjacent odd/even pair, the vpn is the + * content of the virtual page number divided by 2. So the + * compare vpn is bit[47:15] for 16KiB page. while the vppn + * field in tlb entry contains bit[47:13], so need adjust. + * virt_vpn = vaddr[47:13] + */ +bool loongarch_tlb_search(CPULoongArchState *env, target_ulong vaddr, + int *index) +{ + LoongArchTLB *tlb; + uint16_t csr_asid, tlb_asid, stlb_idx; + uint8_t tlb_e, tlb_ps, tlb_g, stlb_ps; + int i, compare_shift; + uint64_t vpn, tlb_vppn; + + csr_asid = FIELD_EX64(env->CSR_ASID, CSR_ASID, ASID); + stlb_ps = FIELD_EX64(env->CSR_STLBPS, CSR_STLBPS, PS); + vpn = (vaddr & TARGET_VIRT_MASK) >> (stlb_ps + 1); + stlb_idx = vpn & 0xff; /* VA[25:15] <==> TLBIDX.index for 16KiB Page */ + compare_shift = stlb_ps + 1 - R_TLB_MISC_VPPN_SHIFT; + + /* Search STLB */ + for (i = 0; i < 8; ++i) { + tlb = &env->tlb[i * 256 + stlb_idx]; + tlb_e = FIELD_EX64(tlb->tlb_misc, TLB_MISC, E); + if (tlb_e) { + tlb_vppn = FIELD_EX64(tlb->tlb_misc, TLB_MISC, VPPN); + tlb_asid = FIELD_EX64(tlb->tlb_misc, TLB_MISC, ASID); + tlb_g = FIELD_EX64(tlb->tlb_entry0, TLBENTRY, G); + + if ((tlb_g == 1 || tlb_asid == csr_asid) && + (vpn == (tlb_vppn >> compare_shift))) { + *index = i * 256 + stlb_idx; + return true; + } + } + } + + /* Search MTLB */ + for (i = LOONGARCH_STLB; i < LOONGARCH_TLB_MAX; ++i) { + tlb = &env->tlb[i]; + tlb_e = FIELD_EX64(tlb->tlb_misc, TLB_MISC, E); + if (tlb_e) { + tlb_vppn = FIELD_EX64(tlb->tlb_misc, TLB_MISC, VPPN); + tlb_ps = FIELD_EX64(tlb->tlb_misc, TLB_MISC, PS); + tlb_asid = FIELD_EX64(tlb->tlb_misc, TLB_MISC, ASID); + tlb_g = FIELD_EX64(tlb->tlb_entry0, TLBENTRY, G); + compare_shift = tlb_ps + 1 - R_TLB_MISC_VPPN_SHIFT; + vpn = (vaddr & TARGET_VIRT_MASK) >> (tlb_ps + 1); + if ((tlb_g == 1 || tlb_asid == csr_asid) && + (vpn == (tlb_vppn >> compare_shift))) { + *index = i; + return true; + } + } + } + return false; +} + +static int loongarch_map_address(CPULoongArchState *env, hwaddr *physical, + int *prot, target_ulong address, + MMUAccessType access_type, int mmu_idx) +{ + int index, match; + + match = loongarch_tlb_search(env, address, &index); + if (match) { + return loongarch_map_tlb_entry(env, physical, prot, + address, access_type, index, mmu_idx); + } + + 
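+    /* No TLB entry matched: the caller turns this into a TLB refill. */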
return TLBRET_NOMATCH; +} +#else +static int loongarch_map_address(CPULoongArchState *env, hwaddr *physical, + int *prot, target_ulong address, + MMUAccessType access_type, int mmu_idx) +{ + return TLBRET_NOMATCH; +} +#endif + +static hwaddr dmw_va2pa(CPULoongArchState *env, target_ulong va, + target_ulong dmw) +{ + if (is_la64(env)) { + return va & TARGET_VIRT_MASK; + } else { + uint32_t pseg = FIELD_EX32(dmw, CSR_DMW_32, PSEG); + return (va & MAKE_64BIT_MASK(0, R_CSR_DMW_32_VSEG_SHIFT)) | \ + (pseg << R_CSR_DMW_32_VSEG_SHIFT); + } +} + +int get_physical_address(CPULoongArchState *env, hwaddr *physical, + int *prot, target_ulong address, + MMUAccessType access_type, int mmu_idx) +{ + int user_mode = mmu_idx == MMU_IDX_USER; + int kernel_mode = mmu_idx == MMU_IDX_KERNEL; + uint32_t plv, base_c, base_v; + int64_t addr_high; + uint8_t da = FIELD_EX64(env->CSR_CRMD, CSR_CRMD, DA); + uint8_t pg = FIELD_EX64(env->CSR_CRMD, CSR_CRMD, PG); + + /* Check PG and DA */ + if (da & !pg) { + *physical = address & TARGET_PHYS_MASK; + *prot = PAGE_READ | PAGE_WRITE | PAGE_EXEC; + return TLBRET_MATCH; + } + + plv = kernel_mode | (user_mode << R_CSR_DMW_PLV3_SHIFT); + if (is_la64(env)) { + base_v = address >> R_CSR_DMW_64_VSEG_SHIFT; + } else { + base_v = address >> R_CSR_DMW_32_VSEG_SHIFT; + } + /* Check direct map window */ + for (int i = 0; i < 4; i++) { + if (is_la64(env)) { + base_c = FIELD_EX64(env->CSR_DMW[i], CSR_DMW_64, VSEG); + } else { + base_c = FIELD_EX64(env->CSR_DMW[i], CSR_DMW_32, VSEG); + } + if ((plv & env->CSR_DMW[i]) && (base_c == base_v)) { + *physical = dmw_va2pa(env, address, env->CSR_DMW[i]); + *prot = PAGE_READ | PAGE_WRITE | PAGE_EXEC; + return TLBRET_MATCH; + } + } + + /* Check valid extension */ + addr_high = sextract64(address, TARGET_VIRT_ADDR_SPACE_BITS, 16); + if (!(addr_high == 0 || addr_high == -1)) { + return TLBRET_BADADDR; + } + + /* Mapped address */ + return loongarch_map_address(env, physical, prot, address, + access_type, mmu_idx); +} + +hwaddr loongarch_cpu_get_phys_page_debug(CPUState *cs, vaddr addr) +{ + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + CPULoongArchState *env = &cpu->env; + hwaddr phys_addr; + int prot; + + if (get_physical_address(env, &phys_addr, &prot, addr, MMU_DATA_LOAD, + cpu_mmu_index(env, false)) != 0) { + return -1; + } + return phys_addr; +} diff --git a/target/loongarch/gdbstub.c b/target/loongarch/gdbstub.c index 5fc2f19e965e101a269c923a101c9be05d2bd2ac..cc72680c38b282bdcc2889537fa902483510089b 100644 --- a/target/loongarch/gdbstub.c +++ b/target/loongarch/gdbstub.c @@ -33,28 +33,29 @@ void write_fcc(CPULoongArchState *env, uint64_t val) int loongarch_cpu_gdb_read_register(CPUState *cs, GByteArray *mem_buf, int n) { - LoongArchCPU *cpu = LOONGARCH_CPU(cs); - CPULoongArchState *env = &cpu->env; - uint64_t val; - - if (0 <= n && n < 32) { - val = env->gpr[n]; - } else if (n == 32) { - /* orig_a0 */ - val = 0; - } else if (n == 33) { - val = env->pc; - } else if (n == 34) { - val = env->CSR_BADV; - } + CPULoongArchState *env = cpu_env(cs); if (0 <= n && n <= 34) { + uint64_t val; + + if (n < 32) { + val = env->gpr[n]; + } else if (n == 32) { + /* orig_a0 */ + val = 0; + } else if (n == 33) { + val = env->pc; + } else /* if (n == 34) */ { + val = env->CSR_BADV; + } + if (is_la64(env)) { return gdb_get_reg64(mem_buf, val); } else { return gdb_get_reg32(mem_buf, val); } } + return 0; } @@ -67,10 +68,10 @@ int loongarch_cpu_gdb_write_register(CPUState *cs, uint8_t *mem_buf, int n) int length = 0; if (is_la64(env)) { - tmp = ldq_p(mem_buf); + tmp = 
ldq_le_p(mem_buf); read_length = 8; } else { - tmp = ldl_p(mem_buf); + tmp = ldl_le_p(mem_buf); read_length = 4; } @@ -103,13 +104,13 @@ static int loongarch_gdb_set_fpu(CPULoongArchState *env, int length = 0; if (0 <= n && n < 32) { - env->fpr[n].vreg.D(0) = ldq_p(mem_buf); + env->fpr[n].vreg.D(0) = ldq_le_p(mem_buf); length = 8; } else if (32 <= n && n < 40) { env->cf[n - 32] = ldub_p(mem_buf); length = 1; } else if (n == 40) { - env->fcsr0 = ldl_p(mem_buf); + env->fcsr0 = ldl_le_p(mem_buf); length = 4; } return length; diff --git a/target/loongarch/internals.h b/target/loongarch/internals.h index c492863cc5defa89d0392de53355e2cc4d0f6922..1a02427627d8e968a1b829bdb7915a212860856e 100644 --- a/target/loongarch/internals.h +++ b/target/loongarch/internals.h @@ -16,11 +16,6 @@ #define TARGET_PHYS_MASK MAKE_64BIT_MASK(0, TARGET_PHYS_ADDR_SPACE_BITS) #define TARGET_VIRT_MASK MAKE_64BIT_MASK(0, TARGET_VIRT_ADDR_SPACE_BITS) -/* Global bit used for lddir/ldpte */ -#define LOONGARCH_PAGE_HUGE_SHIFT 6 -/* Global bit for huge page */ -#define LOONGARCH_HGLOBAL_SHIFT 12 - void loongarch_translate_init(void); void loongarch_cpu_dump_state(CPUState *cpu, FILE *f, int flags); @@ -31,10 +26,23 @@ void G_NORETURN do_raise_exception(CPULoongArchState *env, const char *loongarch_exception_name(int32_t exception); +#ifdef CONFIG_TCG int ieee_ex_to_loongarch(int xcpt); void restore_fp_status(CPULoongArchState *env); +#endif #ifndef CONFIG_USER_ONLY +enum { + TLBRET_MATCH = 0, + TLBRET_BADADDR = 1, + TLBRET_NOMATCH = 2, + TLBRET_INVALID = 3, + TLBRET_DIRTY = 4, + TLBRET_RI = 5, + TLBRET_XI = 6, + TLBRET_PE = 7, +}; + extern const VMStateDescription vmstate_loongarch_cpu; void loongarch_cpu_set_irq(void *opaque, int irq, int level); @@ -44,12 +52,18 @@ uint64_t cpu_loongarch_get_constant_timer_counter(LoongArchCPU *cpu); uint64_t cpu_loongarch_get_constant_timer_ticks(LoongArchCPU *cpu); void cpu_loongarch_store_constant_timer_config(LoongArchCPU *cpu, uint64_t value); +bool loongarch_tlb_search(CPULoongArchState *env, target_ulong vaddr, + int *index); +int get_physical_address(CPULoongArchState *env, hwaddr *physical, + int *prot, target_ulong address, + MMUAccessType access_type, int mmu_idx); +hwaddr loongarch_cpu_get_phys_page_debug(CPUState *cpu, vaddr addr); +#ifdef CONFIG_TCG bool loongarch_cpu_tlb_fill(CPUState *cs, vaddr address, int size, MMUAccessType access_type, int mmu_idx, bool probe, uintptr_t retaddr); - -hwaddr loongarch_cpu_get_phys_page_debug(CPUState *cpu, vaddr addr); +#endif #endif /* !CONFIG_USER_ONLY */ uint64_t read_fcc(CPULoongArchState *env); @@ -58,5 +72,7 @@ void write_fcc(CPULoongArchState *env, uint64_t val); int loongarch_cpu_gdb_read_register(CPUState *cs, GByteArray *mem_buf, int n); int loongarch_cpu_gdb_write_register(CPUState *cs, uint8_t *mem_buf, int n); void loongarch_cpu_register_gdb_regs_for_features(CPUState *cs); +int loongarch_cpu_write_elf64_note(WriteCoreDumpFunction f, CPUState *cpu, + int cpuid, DumpState *s); #endif diff --git a/target/loongarch/kvm/kvm.c b/target/loongarch/kvm/kvm.c new file mode 100644 index 0000000000000000000000000000000000000000..f724e77a1bb1d1615453ab0a7b24f9096a4272c2 --- /dev/null +++ b/target/loongarch/kvm/kvm.c @@ -0,0 +1,1121 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * QEMU LoongArch KVM + * + * Copyright (c) 2023 Loongson Technology Corporation Limited + */ + +#include "qemu/osdep.h" +#include +#include + +#include "qapi/error.h" +#include "qemu/timer.h" +#include "qemu/error-report.h" +#include "qemu/main-loop.h" 
+#include "sysemu/sysemu.h" +#include "sysemu/kvm.h" +#include "sysemu/kvm_int.h" +#include "hw/pci/pci.h" +#include "exec/memattrs.h" +#include "exec/address-spaces.h" +#include "hw/boards.h" +#include "hw/irq.h" +#include "qemu/log.h" +#include "hw/loader.h" +#include "migration/migration.h" +#include "sysemu/runstate.h" +#include "cpu-csr.h" +#include "kvm_loongarch.h" +#include "trace.h" + +static bool cap_has_mp_state; +static unsigned int brk_insn; +const KVMCapabilityInfo kvm_arch_required_capabilities[] = { + KVM_CAP_LAST_INFO +}; + +static int kvm_get_stealtime(CPUState *cs) +{ + CPULoongArchState *env = cpu_env(cs); + int err; + struct kvm_device_attr attr = { + .group = KVM_LOONGARCH_VCPU_PVTIME_CTRL, + .attr = KVM_LOONGARCH_VCPU_PVTIME_GPA, + .addr = (uint64_t)&env->stealtime.guest_addr, + }; + + err = kvm_vcpu_ioctl(cs, KVM_HAS_DEVICE_ATTR, attr); + if (err) { + return 0; + } + + err = kvm_vcpu_ioctl(cs, KVM_GET_DEVICE_ATTR, attr); + if (err) { + error_report("PVTIME: KVM_GET_DEVICE_ATTR: %s", strerror(errno)); + return err; + } + + return 0; +} + +static int kvm_set_stealtime(CPUState *cs) +{ + CPULoongArchState *env = cpu_env(cs); + int err; + struct kvm_device_attr attr = { + .group = KVM_LOONGARCH_VCPU_PVTIME_CTRL, + .attr = KVM_LOONGARCH_VCPU_PVTIME_GPA, + .addr = (uint64_t)&env->stealtime.guest_addr, + }; + + err = kvm_vcpu_ioctl(cs, KVM_HAS_DEVICE_ATTR, attr); + if (err) { + return 0; + } + + err = kvm_vcpu_ioctl(cs, KVM_SET_DEVICE_ATTR, attr); + if (err) { + error_report("PVTIME: KVM_SET_DEVICE_ATTR %s with gpa "TARGET_FMT_lx, + strerror(errno), env->stealtime.guest_addr); + return err; + } + + return 0; +} + +static int kvm_loongarch_get_regs_core(CPUState *cs) +{ + int ret = 0; + int i; + struct kvm_regs regs; + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + CPULoongArchState *env = &cpu->env; + + /* Get the current register set as KVM seems it */ + ret = kvm_vcpu_ioctl(cs, KVM_GET_REGS, ®s); + if (ret < 0) { + trace_kvm_failed_get_regs_core(strerror(errno)); + return ret; + } + /* gpr[0] value is always 0 */ + env->gpr[0] = 0; + for (i = 1; i < 32; i++) { + env->gpr[i] = regs.gpr[i]; + } + + env->pc = regs.pc; + return ret; +} + +static int kvm_loongarch_put_regs_core(CPUState *cs) +{ + int ret = 0; + int i; + struct kvm_regs regs; + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + CPULoongArchState *env = &cpu->env; + + /* Set the registers based on QEMU's view of things */ + for (i = 0; i < 32; i++) { + regs.gpr[i] = env->gpr[i]; + } + + regs.pc = env->pc; + ret = kvm_vcpu_ioctl(cs, KVM_SET_REGS, ®s); + if (ret < 0) { + trace_kvm_failed_put_regs_core(strerror(errno)); + } + + return ret; +} + +static int kvm_loongarch_get_csr(CPUState *cs) +{ + int ret = 0; + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + CPULoongArchState *env = &cpu->env; + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_CRMD), + &env->CSR_CRMD); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PRMD), + &env->CSR_PRMD); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_EUEN), + &env->CSR_EUEN); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_MISC), + &env->CSR_MISC); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_ECFG), + &env->CSR_ECFG); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_ESTAT), + &env->CSR_ESTAT); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_ERA), + &env->CSR_ERA); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_BADV), + &env->CSR_BADV); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_BADI), + 
&env->CSR_BADI); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_EENTRY), + &env->CSR_EENTRY); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBIDX), + &env->CSR_TLBIDX); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBEHI), + &env->CSR_TLBEHI); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBELO0), + &env->CSR_TLBELO0); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBELO1), + &env->CSR_TLBELO1); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_ASID), + &env->CSR_ASID); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PGDL), + &env->CSR_PGDL); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PGDH), + &env->CSR_PGDH); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PGD), + &env->CSR_PGD); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PWCL), + &env->CSR_PWCL); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PWCH), + &env->CSR_PWCH); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_STLBPS), + &env->CSR_STLBPS); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_RVACFG), + &env->CSR_RVACFG); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_CPUID), + &env->CSR_CPUID); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PRCFG1), + &env->CSR_PRCFG1); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PRCFG2), + &env->CSR_PRCFG2); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PRCFG3), + &env->CSR_PRCFG3); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(0)), + &env->CSR_SAVE[0]); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(1)), + &env->CSR_SAVE[1]); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(2)), + &env->CSR_SAVE[2]); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(3)), + &env->CSR_SAVE[3]); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(4)), + &env->CSR_SAVE[4]); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(5)), + &env->CSR_SAVE[5]); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(6)), + &env->CSR_SAVE[6]); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(7)), + &env->CSR_SAVE[7]); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TID), + &env->CSR_TID); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_CNTC), + &env->CSR_CNTC); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TICLR), + &env->CSR_TICLR); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_LLBCTL), + &env->CSR_LLBCTL); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_IMPCTL1), + &env->CSR_IMPCTL1); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_IMPCTL2), + &env->CSR_IMPCTL2); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBRENTRY), + &env->CSR_TLBRENTRY); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBRBADV), + &env->CSR_TLBRBADV); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBRERA), + &env->CSR_TLBRERA); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBRSAVE), + &env->CSR_TLBRSAVE); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBRELO0), + &env->CSR_TLBRELO0); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBRELO1), + &env->CSR_TLBRELO1); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBREHI), + &env->CSR_TLBREHI); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBRPRMD), + &env->CSR_TLBRPRMD); + + ret |= kvm_get_one_reg(cs, 
KVM_IOC_CSRID(LOONGARCH_CSR_DMW(0)), + &env->CSR_DMW[0]); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_DMW(1)), + &env->CSR_DMW[1]); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_DMW(2)), + &env->CSR_DMW[2]); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_DMW(3)), + &env->CSR_DMW[3]); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TVAL), + &env->CSR_TVAL); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TCFG), + &env->CSR_TCFG); + + return ret; +} + +static int kvm_loongarch_put_csr(CPUState *cs, int level) +{ + int ret = 0; + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + CPULoongArchState *env = &cpu->env; + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_CRMD), + &env->CSR_CRMD); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PRMD), + &env->CSR_PRMD); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_EUEN), + &env->CSR_EUEN); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_MISC), + &env->CSR_MISC); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_ECFG), + &env->CSR_ECFG); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_ESTAT), + &env->CSR_ESTAT); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_ERA), + &env->CSR_ERA); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_BADV), + &env->CSR_BADV); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_BADI), + &env->CSR_BADI); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_EENTRY), + &env->CSR_EENTRY); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBIDX), + &env->CSR_TLBIDX); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBEHI), + &env->CSR_TLBEHI); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBELO0), + &env->CSR_TLBELO0); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBELO1), + &env->CSR_TLBELO1); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_ASID), + &env->CSR_ASID); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PGDL), + &env->CSR_PGDL); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PGDH), + &env->CSR_PGDH); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PGD), + &env->CSR_PGD); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PWCL), + &env->CSR_PWCL); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PWCH), + &env->CSR_PWCH); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_STLBPS), + &env->CSR_STLBPS); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_RVACFG), + &env->CSR_RVACFG); + + /* CPUID is constant after poweron, it should be set only once */ + if (level >= KVM_PUT_FULL_STATE) { + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_CPUID), + &env->CSR_CPUID); + } + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PRCFG1), + &env->CSR_PRCFG1); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PRCFG2), + &env->CSR_PRCFG2); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PRCFG3), + &env->CSR_PRCFG3); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(0)), + &env->CSR_SAVE[0]); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(1)), + &env->CSR_SAVE[1]); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(2)), + &env->CSR_SAVE[2]); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(3)), + &env->CSR_SAVE[3]); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(4)), + &env->CSR_SAVE[4]); + + ret |= kvm_set_one_reg(cs, 
KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(5)), + &env->CSR_SAVE[5]); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(6)), + &env->CSR_SAVE[6]); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(7)), + &env->CSR_SAVE[7]); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TID), + &env->CSR_TID); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_CNTC), + &env->CSR_CNTC); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TICLR), + &env->CSR_TICLR); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_LLBCTL), + &env->CSR_LLBCTL); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_IMPCTL1), + &env->CSR_IMPCTL1); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_IMPCTL2), + &env->CSR_IMPCTL2); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBRENTRY), + &env->CSR_TLBRENTRY); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBRBADV), + &env->CSR_TLBRBADV); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBRERA), + &env->CSR_TLBRERA); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBRSAVE), + &env->CSR_TLBRSAVE); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBRELO0), + &env->CSR_TLBRELO0); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBRELO1), + &env->CSR_TLBRELO1); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBREHI), + &env->CSR_TLBREHI); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBRPRMD), + &env->CSR_TLBRPRMD); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_DMW(0)), + &env->CSR_DMW[0]); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_DMW(1)), + &env->CSR_DMW[1]); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_DMW(2)), + &env->CSR_DMW[2]); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_DMW(3)), + &env->CSR_DMW[3]); + /* + * timer cfg must be put at last since it is used to enable + * guest timer + */ + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TVAL), + &env->CSR_TVAL); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TCFG), + &env->CSR_TCFG); + return ret; +} + +static int kvm_loongarch_get_regs_fp(CPUState *cs) +{ + int ret, i; + struct kvm_fpu fpu; + + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + CPULoongArchState *env = &cpu->env; + + ret = kvm_vcpu_ioctl(cs, KVM_GET_FPU, &fpu); + if (ret < 0) { + trace_kvm_failed_get_fpu(strerror(errno)); + return ret; + } + + env->fcsr0 = fpu.fcsr; + for (i = 0; i < 32; i++) { + env->fpr[i].vreg.UD[0] = fpu.fpr[i].val64[0]; + env->fpr[i].vreg.UD[1] = fpu.fpr[i].val64[1]; + env->fpr[i].vreg.UD[2] = fpu.fpr[i].val64[2]; + env->fpr[i].vreg.UD[3] = fpu.fpr[i].val64[3]; + } + for (i = 0; i < 8; i++) { + env->cf[i] = fpu.fcc & 0xFF; + fpu.fcc = fpu.fcc >> 8; + } + + return ret; +} + +static int kvm_loongarch_put_regs_fp(CPUState *cs) +{ + int ret, i; + struct kvm_fpu fpu; + + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + CPULoongArchState *env = &cpu->env; + + fpu.fcsr = env->fcsr0; + fpu.fcc = 0; + for (i = 0; i < 32; i++) { + fpu.fpr[i].val64[0] = env->fpr[i].vreg.UD[0]; + fpu.fpr[i].val64[1] = env->fpr[i].vreg.UD[1]; + fpu.fpr[i].val64[2] = env->fpr[i].vreg.UD[2]; + fpu.fpr[i].val64[3] = env->fpr[i].vreg.UD[3]; + } + + for (i = 0; i < 8; i++) { + fpu.fcc |= env->cf[i] << (8 * i); + } + + ret = kvm_vcpu_ioctl(cs, KVM_SET_FPU, &fpu); + if (ret < 0) { + trace_kvm_failed_put_fpu(strerror(errno)); + } + + return ret; +} + +static int kvm_loongarch_put_lbt(CPUState *cs) +{ + CPULoongArchState *env = cpu_env(cs); + uint64_t val; 
+ int ret; + + /* Check whether the VM supports LBT first */ + if (FIELD_EX32(env->cpucfg[2], CPUCFG2, LBT_ALL) != 7) { + return 0; + } + + /* Set the six LBT registers: scr0-scr3, eflags and ftop */ + ret = kvm_set_one_reg(cs, KVM_REG_LOONGARCH_LBT_SCR0, &env->lbt.scr0); + ret |= kvm_set_one_reg(cs, KVM_REG_LOONGARCH_LBT_SCR1, &env->lbt.scr1); + ret |= kvm_set_one_reg(cs, KVM_REG_LOONGARCH_LBT_SCR2, &env->lbt.scr2); + ret |= kvm_set_one_reg(cs, KVM_REG_LOONGARCH_LBT_SCR3, &env->lbt.scr3); + /* + * Note that KVM_REG_LOONGARCH_LBT_FTOP is defined as 64-bit while + * lbt.ftop is 32-bit; the same applies to the KVM_REG_LOONGARCH_LBT_EFLAGS + * register. + */ + val = env->lbt.eflags; + ret |= kvm_set_one_reg(cs, KVM_REG_LOONGARCH_LBT_EFLAGS, &val); + val = env->lbt.ftop; + ret |= kvm_set_one_reg(cs, KVM_REG_LOONGARCH_LBT_FTOP, &val); + + return ret; +} + +static int kvm_loongarch_get_lbt(CPUState *cs) +{ + CPULoongArchState *env = cpu_env(cs); + uint64_t val; + int ret; + + /* Check whether the VM supports LBT first */ + if (FIELD_EX32(env->cpucfg[2], CPUCFG2, LBT_ALL) != 7) { + return 0; + } + + /* Get the six LBT registers: scr0-scr3, eflags and ftop */ + ret = kvm_get_one_reg(cs, KVM_REG_LOONGARCH_LBT_SCR0, &env->lbt.scr0); + ret |= kvm_get_one_reg(cs, KVM_REG_LOONGARCH_LBT_SCR1, &env->lbt.scr1); + ret |= kvm_get_one_reg(cs, KVM_REG_LOONGARCH_LBT_SCR2, &env->lbt.scr2); + ret |= kvm_get_one_reg(cs, KVM_REG_LOONGARCH_LBT_SCR3, &env->lbt.scr3); + ret |= kvm_get_one_reg(cs, KVM_REG_LOONGARCH_LBT_EFLAGS, &val); + env->lbt.eflags = (uint32_t)val; + ret |= kvm_get_one_reg(cs, KVM_REG_LOONGARCH_LBT_FTOP, &val); + env->lbt.ftop = (uint32_t)val; + + return ret; +} + +void kvm_arch_reset_vcpu(CPUState *cs) +{ + CPULoongArchState *env = cpu_env(cs); + int ret = 0; + uint64_t unused = 0; + + env->mp_state = KVM_MP_STATE_RUNNABLE; + ret = kvm_set_one_reg(cs, KVM_REG_LOONGARCH_VCPU_RESET, &unused); + if (ret) { + error_report("Failed to set KVM_REG_LOONGARCH_VCPU_RESET: %s", + strerror(errno)); + exit(EXIT_FAILURE); + } +} + +static int kvm_loongarch_get_mpstate(CPUState *cs) +{ + int ret = 0; + struct kvm_mp_state mp_state; + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + CPULoongArchState *env = &cpu->env; + + if (cap_has_mp_state) { + ret = kvm_vcpu_ioctl(cs, KVM_GET_MP_STATE, &mp_state); + if (ret) { + trace_kvm_failed_get_mpstate(strerror(errno)); + return ret; + } + env->mp_state = mp_state.mp_state; + } + + return ret; +} + +static int kvm_loongarch_put_mpstate(CPUState *cs) +{ + int ret = 0; + + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + CPULoongArchState *env = &cpu->env; + + struct kvm_mp_state mp_state = { + .mp_state = env->mp_state + }; + + if (cap_has_mp_state) { + ret = kvm_vcpu_ioctl(cs, KVM_SET_MP_STATE, &mp_state); + if (ret) { + trace_kvm_failed_put_mpstate(strerror(errno)); + } + } + + return ret; +} + +static int kvm_loongarch_get_cpucfg(CPUState *cs) +{ + int i, ret = 0; + uint64_t val; + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + CPULoongArchState *env = &cpu->env; + + for (i = 0; i < 21; i++) { + ret = kvm_get_one_reg(cs, KVM_IOC_CPUCFG(i), &val); + if (ret < 0) { + trace_kvm_failed_get_cpucfg(strerror(errno)); + } + env->cpucfg[i] = (uint32_t)val; + } + return ret; +} + +static int kvm_check_cpucfg2(CPUState *cs) +{ + int ret; + uint64_t val; + struct kvm_device_attr attr = { + .group = KVM_LOONGARCH_VCPU_CPUCFG, + .attr = 2, + .addr = (uint64_t)&val, + }; + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + CPULoongArchState *env = &cpu->env; + + ret = kvm_vcpu_ioctl(cs, KVM_HAS_DEVICE_ATTR, &attr); + + if
(!ret) { + kvm_vcpu_ioctl(cs, KVM_GET_DEVICE_ATTR, &attr); + env->cpucfg[2] &= val; + + if (FIELD_EX32(env->cpucfg[2], CPUCFG2, FP)) { + /* The FP minimal version is 1. */ + env->cpucfg[2] = FIELD_DP32(env->cpucfg[2], CPUCFG2, FP_VER, 1); + } + + if (FIELD_EX32(env->cpucfg[2], CPUCFG2, LLFTP)) { + /* The LLFTP minimal version is 1. */ + env->cpucfg[2] = FIELD_DP32(env->cpucfg[2], CPUCFG2, LLFTP_VER, 1); + } + } + + return ret; +} + +static int kvm_loongarch_put_cpucfg(CPUState *cs) +{ + int i, ret = 0; + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + CPULoongArchState *env = &cpu->env; + uint64_t val; + + for (i = 0; i < 21; i++) { + if (i == 2) { + ret = kvm_check_cpucfg2(cs); + if (ret) { + return ret; + } + } + val = env->cpucfg[i]; + ret = kvm_set_one_reg(cs, KVM_IOC_CPUCFG(i), &val); + if (ret < 0) { + trace_kvm_failed_put_cpucfg(strerror(errno)); + } + } + return ret; +} + +int kvm_arch_get_registers(CPUState *cs) +{ + int ret; + + ret = kvm_loongarch_get_regs_core(cs); + if (ret) { + return ret; + } + + ret = kvm_loongarch_get_cpucfg(cs); + if (ret) { + return ret; + } + + ret = kvm_loongarch_get_csr(cs); + if (ret) { + return ret; + } + + ret = kvm_loongarch_get_regs_fp(cs); + if (ret) { + return ret; + } + + ret = kvm_loongarch_get_lbt(cs); + if (ret) { + return ret; + } + + ret = kvm_get_stealtime(cs); + if (ret) { + return ret; + } + + ret = kvm_loongarch_get_mpstate(cs); + return ret; +} + +int kvm_arch_put_registers(CPUState *cs, int level) +{ + int ret; + + ret = kvm_loongarch_put_regs_core(cs); + if (ret) { + return ret; + } + + ret = kvm_loongarch_put_cpucfg(cs); + if (ret) { + return ret; + } + + ret = kvm_loongarch_put_csr(cs, level); + if (ret) { + return ret; + } + + ret = kvm_loongarch_put_regs_fp(cs); + if (ret) { + return ret; + } + + ret = kvm_loongarch_put_lbt(cs); + if (ret) { + return ret; + } + + if (level >= KVM_PUT_FULL_STATE) { + /* + * only KVM_PUT_FULL_STATE is required, kvm kernel will clear + * guest_addr for KVM_PUT_RESET_STATE + */ + ret = kvm_set_stealtime(cs); + if (ret) { + return ret; + } + } + + ret = kvm_loongarch_put_mpstate(cs); + return ret; +} + +static void kvm_loongarch_vm_stage_change(void *opaque, bool running, + RunState state) +{ + int ret; + CPUState *cs = opaque; + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + + if (running) { + ret = kvm_set_one_reg(cs, KVM_REG_LOONGARCH_COUNTER, + &cpu->kvm_state_counter); + if (ret < 0) { + trace_kvm_failed_put_counter(strerror(errno)); + } + } else { + ret = kvm_get_one_reg(cs, KVM_REG_LOONGARCH_COUNTER, + &cpu->kvm_state_counter); + if (ret < 0) { + trace_kvm_failed_get_counter(strerror(errno)); + } + } +} + +static bool kvm_feature_supported(CPUState *cs, enum loongarch_features feature) +{ + int ret; + struct kvm_device_attr attr; + + switch (feature) { + case LOONGARCH_FEATURE_LBT: + /* + * Return all if all the LBT features are supported such as: + * KVM_LOONGARCH_VM_FEAT_X86BT + * KVM_LOONGARCH_VM_FEAT_ARMBT + * KVM_LOONGARCH_VM_FEAT_MIPSBT + */ + attr.group = KVM_LOONGARCH_VM_FEAT_CTRL; + attr.attr = KVM_LOONGARCH_VM_FEAT_X86BT; + ret = kvm_vm_ioctl(kvm_state, KVM_HAS_DEVICE_ATTR, &attr); + attr.attr = KVM_LOONGARCH_VM_FEAT_ARMBT; + ret |= kvm_vm_ioctl(kvm_state, KVM_HAS_DEVICE_ATTR, &attr); + attr.attr = KVM_LOONGARCH_VM_FEAT_MIPSBT; + ret |= kvm_vm_ioctl(kvm_state, KVM_HAS_DEVICE_ATTR, &attr); + return (ret == 0); + + case LOONGARCH_FEATURE_PMU: + attr.group = KVM_LOONGARCH_VM_FEAT_CTRL; + attr.attr = KVM_LOONGARCH_VM_FEAT_PMU; + ret = kvm_vm_ioctl(kvm_state, KVM_HAS_DEVICE_ATTR, &attr); + return 
(ret == 0); + + default: + return false; + } + + return false; +} + +static int kvm_cpu_check_lbt(CPUState *cs, Error **errp) +{ + CPULoongArchState *env = cpu_env(cs); + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + bool kvm_supported; + + kvm_supported = kvm_feature_supported(cs, LOONGARCH_FEATURE_LBT); + if (cpu->lbt == ON_OFF_AUTO_ON) { + if (kvm_supported) { + env->cpucfg[2] = FIELD_DP32(env->cpucfg[2], CPUCFG2, LBT_ALL, 7); + } else { + error_setg(errp, "'lbt' feature not supported by KVM on this host"); + return -ENOTSUP; + } + } else if ((cpu->lbt == ON_OFF_AUTO_AUTO) && kvm_supported) { + env->cpucfg[2] = FIELD_DP32(env->cpucfg[2], CPUCFG2, LBT_ALL, 7); + } + + return 0; +} + +static int kvm_cpu_check_pmu(CPUState *cs, Error **errp) +{ + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + CPULoongArchState *env = cpu_env(cs); + bool kvm_supported; + + kvm_supported = kvm_feature_supported(cs, LOONGARCH_FEATURE_PMU); + if (cpu->pmu == ON_OFF_AUTO_ON) { + if (!kvm_supported) { + error_setg(errp, "'pmu' feature not supported by KVM on the host"); + return -ENOTSUP; + } + } else if (cpu->pmu != ON_OFF_AUTO_AUTO) { + /* disable pmu if ON_OFF_AUTO_OFF is set */ + kvm_supported = false; + } + + if (kvm_supported) { + env->cpucfg[6] = FIELD_DP32(env->cpucfg[6], CPUCFG6, PMP, 1); + env->cpucfg[6] = FIELD_DP32(env->cpucfg[6], CPUCFG6, PMNUM, 3); + env->cpucfg[6] = FIELD_DP32(env->cpucfg[6], CPUCFG6, PMBITS, 63); + env->cpucfg[6] = FIELD_DP32(env->cpucfg[6], CPUCFG6, UPM, 1); + } + return 0; +} + +int kvm_arch_init_vcpu(CPUState *cs) +{ + uint64_t val; + int ret; + Error *local_err = NULL; + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + + ret = 0; + cpu->vmsentry = qemu_add_vm_change_state_handler(kvm_loongarch_vm_stage_change, cs); + + if (!kvm_get_one_reg(cs, KVM_REG_LOONGARCH_DEBUG_INST, &val)) { + brk_insn = val; + } + + ret = kvm_cpu_check_lbt(cs, &local_err); + if (ret < 0) { + error_report_err(local_err); + } + + ret = kvm_cpu_check_pmu(cs, &local_err); + if (ret < 0) { + error_report_err(local_err); + } + + return ret; +} + +int kvm_arch_destroy_vcpu(CPUState *cs) +{ + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + qemu_del_vm_change_state_handler(cpu->vmsentry); + return 0; +} + +unsigned long kvm_arch_vcpu_id(CPUState *cs) +{ + return cs->cpu_index; +} + +int kvm_arch_release_virq_post(int virq) +{ + return 0; +} + +int kvm_arch_msi_data_to_gsi(uint32_t data) +{ + abort(); +} + +int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route, + uint64_t address, uint32_t data, PCIDevice *dev) +{ + return 0; +} + +int kvm_arch_add_msi_route_post(struct kvm_irq_routing_entry *route, + int vector, PCIDevice *dev) +{ + return 0; +} + +void kvm_arch_init_irq_routing(KVMState *s) +{ +} + +int kvm_arch_get_default_type(MachineState *ms) +{ + return 0; +} + +int kvm_arch_init(MachineState *ms, KVMState *s) +{ + cap_has_mp_state = kvm_check_extension(s, KVM_CAP_MP_STATE); + return 0; +} + +int kvm_arch_irqchip_create(KVMState *s) +{ + return 0; +} + +void kvm_arch_pre_run(CPUState *cs, struct kvm_run *run) +{ +} + +MemTxAttrs kvm_arch_post_run(CPUState *cs, struct kvm_run *run) +{ + return MEMTXATTRS_UNSPECIFIED; +} + +int kvm_arch_process_async_events(CPUState *cs) +{ + return cs->halted; +} + +bool kvm_arch_stop_on_emulation_error(CPUState *cs) +{ + return true; +} + +bool kvm_arch_cpu_check_are_resettable(void) +{ + return true; +} + + +void kvm_arch_update_guest_debug(CPUState *cpu, struct kvm_guest_debug *dbg) +{ + if (kvm_sw_breakpoints_active(cpu)) { + dbg->control |= KVM_GUESTDBG_ENABLE | 
KVM_GUESTDBG_USE_SW_BP; + } +} + +int kvm_arch_insert_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp) +{ + if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn, 4, 0) || + cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&brk_insn, 4, 1)) { + error_report("%s failed", __func__); + return -EINVAL; + } + return 0; +} + +int kvm_arch_remove_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp) +{ + static uint32_t brk; + + if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&brk, 4, 0) || + brk != brk_insn || + cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn, 4, 1)) { + error_report("%s failed", __func__); + return -EINVAL; + } + return 0; +} + +int kvm_arch_insert_hw_breakpoint(vaddr addr, vaddr len, int type) +{ + return -ENOSYS; +} + +int kvm_arch_remove_hw_breakpoint(vaddr addr, vaddr len, int type) +{ + return -ENOSYS; +} + +void kvm_arch_remove_all_hw_breakpoints(void) +{ +} + +static bool kvm_loongarch_handle_debug(CPUState *cs, struct kvm_run *run) +{ + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + CPULoongArchState *env = &cpu->env; + + kvm_cpu_synchronize_state(cs); + if (cs->singlestep_enabled) { + return true; + } + + if (kvm_find_sw_breakpoint(cs, env->pc)) { + return true; + } + + return false; +} + +int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) +{ + int ret = 0; + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + CPULoongArchState *env = &cpu->env; + MemTxAttrs attrs = {}; + + attrs.requester_id = env_cpu(env)->cpu_index; + + trace_kvm_arch_handle_exit(run->exit_reason); + switch (run->exit_reason) { + case KVM_EXIT_LOONGARCH_IOCSR: + address_space_rw(env->address_space_iocsr, + run->iocsr_io.phys_addr, + attrs, + run->iocsr_io.data, + run->iocsr_io.len, + run->iocsr_io.is_write); + break; + + case KVM_EXIT_DEBUG: + if (kvm_loongarch_handle_debug(cs, run)) { + ret = EXCP_DEBUG; + } + break; + + default: + ret = -1; + warn_report("KVM: unknown exit reason %d", run->exit_reason); + break; + } + return ret; +} + +int kvm_loongarch_set_interrupt(LoongArchCPU *cpu, int irq, int level) +{ + struct kvm_interrupt intr; + CPUState *cs = CPU(cpu); + + if (level) { + intr.irq = irq; + } else { + intr.irq = -irq; + } + + trace_kvm_set_intr(irq, level); + return kvm_vcpu_ioctl(cs, KVM_INTERRUPT, &intr); +} + +void kvm_arch_accel_class_init(ObjectClass *oc) +{ +} diff --git a/target/loongarch/kvm/kvm_loongarch.h b/target/loongarch/kvm/kvm_loongarch.h new file mode 100644 index 0000000000000000000000000000000000000000..1051a341ec2633647e55f3c37ad5998c4f02ba06 --- /dev/null +++ b/target/loongarch/kvm/kvm_loongarch.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * QEMU LoongArch kvm interface + * + * Copyright (c) 2023 Loongson Technology Corporation Limited + */ + +#include "cpu.h" + +#ifndef QEMU_KVM_LOONGARCH_H +#define QEMU_KVM_LOONGARCH_H + +int kvm_loongarch_set_interrupt(LoongArchCPU *cpu, int irq, int level); +void kvm_arch_reset_vcpu(CPUState *cs); + +#endif diff --git a/target/loongarch/kvm/meson.build b/target/loongarch/kvm/meson.build new file mode 100644 index 0000000000000000000000000000000000000000..2266de6ca97c3122bbf437a129fae97771010529 --- /dev/null +++ b/target/loongarch/kvm/meson.build @@ -0,0 +1 @@ +loongarch_ss.add(when: 'CONFIG_KVM', if_true: files('kvm.c')) diff --git a/target/loongarch/loongarch-qmp-cmds.c b/target/loongarch/loongarch-qmp-cmds.c index 645672ff593fb202216ac135ad14dfc148aa936c..dc78a3ffa22b90f73f695f24b2d22b8f5221f13e 100644 --- a/target/loongarch/loongarch-qmp-cmds.c +++ 
b/target/loongarch/loongarch-qmp-cmds.c @@ -42,7 +42,7 @@ CpuDefinitionInfoList *qmp_query_cpu_definitions(Error **errp) } static const char *cpu_model_advertised_features[] = { - "lsx", "lasx", NULL + "lsx", "lasx", "lbt", "pmu", NULL }; CpuModelExpansionInfo *qmp_query_cpu_model_expansion(CpuModelExpansionType type, diff --git a/target/loongarch/machine.c b/target/loongarch/machine.c index 1c4e01d07695ac67e22fa77166d48dabd7959da2..57abdddc09d4f129a1a9dd89e73e8eed340a2129 100644 --- a/target/loongarch/machine.c +++ b/target/loongarch/machine.c @@ -8,7 +8,10 @@ #include "qemu/osdep.h" #include "cpu.h" #include "migration/cpu.h" +#include "sysemu/tcg.h" #include "vec.h" +#include "kvm/kvm_loongarch.h" +#include "sysemu/kvm.h" static const VMStateDescription vmstate_fpu_reg = { .name = "fpu_reg", @@ -109,9 +112,38 @@ static const VMStateDescription vmstate_lasx = { }, }; +static bool lbt_needed(void *opaque) +{ + LoongArchCPU *cpu = opaque; + + return !!FIELD_EX64(cpu->env.cpucfg[2], CPUCFG2, LBT_ALL); +} + +static const VMStateDescription vmstate_lbt = { + .name = "cpu/lbt", + .version_id = 0, + .minimum_version_id = 0, + .needed = lbt_needed, + .fields = (const VMStateField[]) { + VMSTATE_UINT64(env.lbt.scr0, LoongArchCPU), + VMSTATE_UINT64(env.lbt.scr1, LoongArchCPU), + VMSTATE_UINT64(env.lbt.scr2, LoongArchCPU), + VMSTATE_UINT64(env.lbt.scr3, LoongArchCPU), + VMSTATE_UINT32(env.lbt.eflags, LoongArchCPU), + VMSTATE_UINT32(env.lbt.ftop, LoongArchCPU), + VMSTATE_END_OF_LIST() + }, +}; + +#if defined(CONFIG_TCG) && !defined(CONFIG_USER_ONLY) +static bool tlb_needed(void *opaque) +{ + return tcg_enabled(); +} + /* TLB state */ -const VMStateDescription vmstate_tlb = { - .name = "cpu/tlb", +static const VMStateDescription vmstate_tlb_entry = { + .name = "cpu/tlb_entry", .version_id = 0, .minimum_version_id = 0, .fields = (VMStateField[]) { @@ -122,15 +154,31 @@ const VMStateDescription vmstate_tlb = { } }; +static const VMStateDescription vmstate_tlb = { + .name = "cpu/tlb", + .version_id = 0, + .minimum_version_id = 0, + .needed = tlb_needed, + .fields = (const VMStateField[]) { + VMSTATE_STRUCT_ARRAY(env.tlb, LoongArchCPU, LOONGARCH_TLB_MAX, + 0, vmstate_tlb_entry, LoongArchTLB), + VMSTATE_END_OF_LIST() + } +}; +#endif + /* LoongArch CPU state */ const VMStateDescription vmstate_loongarch_cpu = { .name = "cpu", - .version_id = 1, - .minimum_version_id = 1, - .fields = (VMStateField[]) { + .version_id = 3, + .minimum_version_id = 3, + .fields = (const VMStateField[]) { VMSTATE_UINTTL_ARRAY(env.gpr, LoongArchCPU, 32), VMSTATE_UINTTL(env.pc, LoongArchCPU), + /* PV time */ + VMSTATE_UINT64(env.st.guest_addr, LoongArchCPU), + /* Remaining CSRs */ VMSTATE_UINT64(env.CSR_CRMD, LoongArchCPU), VMSTATE_UINT64(env.CSR_PRMD, LoongArchCPU), @@ -187,9 +235,10 @@ const VMStateDescription vmstate_loongarch_cpu = { VMSTATE_UINT64(env.CSR_DBG, LoongArchCPU), VMSTATE_UINT64(env.CSR_DERA, LoongArchCPU), VMSTATE_UINT64(env.CSR_DSAVE, LoongArchCPU), - /* TLB */ - VMSTATE_STRUCT_ARRAY(env.tlb, LoongArchCPU, LOONGARCH_TLB_MAX, - 0, vmstate_tlb, LoongArchTLB), + + VMSTATE_UINT64(kvm_state_counter, LoongArchCPU), + /* PV steal time */ + VMSTATE_UINT64(env.stealtime.guest_addr, LoongArchCPU), VMSTATE_END_OF_LIST() }, @@ -197,6 +246,10 @@ const VMStateDescription vmstate_loongarch_cpu = { &vmstate_fpu, &vmstate_lsx, &vmstate_lasx, +#if defined(CONFIG_TCG) && !defined(CONFIG_USER_ONLY) + &vmstate_tlb, +#endif + &vmstate_lbt, NULL } }; diff --git a/target/loongarch/meson.build b/target/loongarch/meson.build index 
18e8191e2b67ea369ea43796c471e3d7a89a9d01..7817318287d0f8c460218bbc63ea60614efae50c 100644 --- a/target/loongarch/meson.build +++ b/target/loongarch/meson.build @@ -3,31 +3,21 @@ gen = decodetree.process('insns.decode') loongarch_ss = ss.source_set() loongarch_ss.add(files( 'cpu.c', -)) -loongarch_tcg_ss = ss.source_set() -loongarch_tcg_ss.add(gen) -loongarch_tcg_ss.add(files( - 'fpu_helper.c', - 'op_helper.c', - 'translate.c', 'gdbstub.c', - 'vec_helper.c', )) -loongarch_tcg_ss.add(zlib) loongarch_system_ss = ss.source_set() loongarch_system_ss.add(files( + 'arch_dump.c', + 'cpu_helper.c', 'loongarch-qmp-cmds.c', 'machine.c', - 'tlb_helper.c', - 'constant_timer.c', - 'csr_helper.c', - 'iocsr_helper.c', )) common_ss.add(when: 'CONFIG_LOONGARCH_DIS', if_true: [files('disas.c'), gen]) -loongarch_ss.add_all(when: 'CONFIG_TCG', if_true: [loongarch_tcg_ss]) +subdir('tcg') target_arch += {'loongarch': loongarch_ss} target_system_arch += {'loongarch': loongarch_system_ss} +subdir('kvm') diff --git a/target/loongarch/constant_timer.c b/target/loongarch/tcg/constant_timer.c similarity index 100% rename from target/loongarch/constant_timer.c rename to target/loongarch/tcg/constant_timer.c diff --git a/target/loongarch/csr_helper.c b/target/loongarch/tcg/csr_helper.c similarity index 100% rename from target/loongarch/csr_helper.c rename to target/loongarch/tcg/csr_helper.c diff --git a/target/loongarch/fpu_helper.c b/target/loongarch/tcg/fpu_helper.c similarity index 100% rename from target/loongarch/fpu_helper.c rename to target/loongarch/tcg/fpu_helper.c diff --git a/target/loongarch/insn_trans/trans_arith.c.inc b/target/loongarch/tcg/insn_trans/trans_arith.c.inc similarity index 100% rename from target/loongarch/insn_trans/trans_arith.c.inc rename to target/loongarch/tcg/insn_trans/trans_arith.c.inc diff --git a/target/loongarch/insn_trans/trans_atomic.c.inc b/target/loongarch/tcg/insn_trans/trans_atomic.c.inc similarity index 95% rename from target/loongarch/insn_trans/trans_atomic.c.inc rename to target/loongarch/tcg/insn_trans/trans_atomic.c.inc index 80c2e286fd889192ce11b4513a7d16df533d1dea..974bc2a70feddbf021a07b19a0859781eb3a11c4 100644 --- a/target/loongarch/insn_trans/trans_atomic.c.inc +++ b/target/loongarch/tcg/insn_trans/trans_atomic.c.inc @@ -5,14 +5,14 @@ static bool gen_ll(DisasContext *ctx, arg_rr_i *a, MemOp mop) { - TCGv dest = gpr_dst(ctx, a->rd, EXT_NONE); + TCGv t1 = tcg_temp_new(); TCGv src1 = gpr_src(ctx, a->rj, EXT_NONE); TCGv t0 = make_address_i(ctx, src1, a->imm); - tcg_gen_qemu_ld_i64(dest, t0, ctx->mem_idx, mop); + tcg_gen_qemu_ld_i64(t1, t0, ctx->mem_idx, mop); tcg_gen_st_tl(t0, tcg_env, offsetof(CPULoongArchState, lladdr)); - tcg_gen_st_tl(dest, tcg_env, offsetof(CPULoongArchState, llval)); - gen_set_gpr(a->rd, dest, EXT_NONE); + tcg_gen_st_tl(t1, tcg_env, offsetof(CPULoongArchState, llval)); + gen_set_gpr(a->rd, t1, EXT_NONE); return true; } diff --git a/target/loongarch/insn_trans/trans_bit.c.inc b/target/loongarch/tcg/insn_trans/trans_bit.c.inc similarity index 100% rename from target/loongarch/insn_trans/trans_bit.c.inc rename to target/loongarch/tcg/insn_trans/trans_bit.c.inc diff --git a/target/loongarch/insn_trans/trans_branch.c.inc b/target/loongarch/tcg/insn_trans/trans_branch.c.inc similarity index 100% rename from target/loongarch/insn_trans/trans_branch.c.inc rename to target/loongarch/tcg/insn_trans/trans_branch.c.inc diff --git a/target/loongarch/insn_trans/trans_extra.c.inc b/target/loongarch/tcg/insn_trans/trans_extra.c.inc similarity index 100% rename 
from target/loongarch/insn_trans/trans_extra.c.inc rename to target/loongarch/tcg/insn_trans/trans_extra.c.inc diff --git a/target/loongarch/insn_trans/trans_farith.c.inc b/target/loongarch/tcg/insn_trans/trans_farith.c.inc similarity index 100% rename from target/loongarch/insn_trans/trans_farith.c.inc rename to target/loongarch/tcg/insn_trans/trans_farith.c.inc diff --git a/target/loongarch/insn_trans/trans_fcmp.c.inc b/target/loongarch/tcg/insn_trans/trans_fcmp.c.inc similarity index 100% rename from target/loongarch/insn_trans/trans_fcmp.c.inc rename to target/loongarch/tcg/insn_trans/trans_fcmp.c.inc diff --git a/target/loongarch/insn_trans/trans_fcnv.c.inc b/target/loongarch/tcg/insn_trans/trans_fcnv.c.inc similarity index 100% rename from target/loongarch/insn_trans/trans_fcnv.c.inc rename to target/loongarch/tcg/insn_trans/trans_fcnv.c.inc diff --git a/target/loongarch/insn_trans/trans_fmemory.c.inc b/target/loongarch/tcg/insn_trans/trans_fmemory.c.inc similarity index 100% rename from target/loongarch/insn_trans/trans_fmemory.c.inc rename to target/loongarch/tcg/insn_trans/trans_fmemory.c.inc diff --git a/target/loongarch/insn_trans/trans_fmov.c.inc b/target/loongarch/tcg/insn_trans/trans_fmov.c.inc similarity index 100% rename from target/loongarch/insn_trans/trans_fmov.c.inc rename to target/loongarch/tcg/insn_trans/trans_fmov.c.inc diff --git a/target/loongarch/insn_trans/trans_memory.c.inc b/target/loongarch/tcg/insn_trans/trans_memory.c.inc similarity index 100% rename from target/loongarch/insn_trans/trans_memory.c.inc rename to target/loongarch/tcg/insn_trans/trans_memory.c.inc diff --git a/target/loongarch/insn_trans/trans_privileged.c.inc b/target/loongarch/tcg/insn_trans/trans_privileged.c.inc similarity index 100% rename from target/loongarch/insn_trans/trans_privileged.c.inc rename to target/loongarch/tcg/insn_trans/trans_privileged.c.inc diff --git a/target/loongarch/insn_trans/trans_shift.c.inc b/target/loongarch/tcg/insn_trans/trans_shift.c.inc similarity index 88% rename from target/loongarch/insn_trans/trans_shift.c.inc rename to target/loongarch/tcg/insn_trans/trans_shift.c.inc index 2f4bd6ff28819eb4c665a3775e6867f37294e6a7..377307785aab4837bc181f1691632e7970a9889d 100644 --- a/target/loongarch/insn_trans/trans_shift.c.inc +++ b/target/loongarch/tcg/insn_trans/trans_shift.c.inc @@ -67,19 +67,9 @@ static void gen_rotr_d(TCGv dest, TCGv src1, TCGv src2) tcg_gen_rotr_tl(dest, src1, t0); } -static bool trans_srai_w(DisasContext *ctx, arg_srai_w *a) +static void gen_sari_w(TCGv dest, TCGv src1, target_long imm) { - TCGv dest = gpr_dst(ctx, a->rd, EXT_NONE); - TCGv src1 = gpr_src(ctx, a->rj, EXT_ZERO); - - if (!avail_64(ctx)) { - return false; - } - - tcg_gen_sextract_tl(dest, src1, a->imm, 32 - a->imm); - gen_set_gpr(a->rd, dest, EXT_NONE); - - return true; + tcg_gen_sextract_tl(dest, src1, imm, 32 - imm); } TRANS(sll_w, ALL, gen_rrr, EXT_ZERO, EXT_NONE, EXT_SIGN, gen_sll_w) @@ -94,6 +84,7 @@ TRANS(slli_w, ALL, gen_rri_c, EXT_NONE, EXT_SIGN, tcg_gen_shli_tl) TRANS(slli_d, 64, gen_rri_c, EXT_NONE, EXT_NONE, tcg_gen_shli_tl) TRANS(srli_w, ALL, gen_rri_c, EXT_ZERO, EXT_SIGN, tcg_gen_shri_tl) TRANS(srli_d, 64, gen_rri_c, EXT_NONE, EXT_NONE, tcg_gen_shri_tl) +TRANS(srai_w, ALL, gen_rri_c, EXT_NONE, EXT_NONE, gen_sari_w) TRANS(srai_d, 64, gen_rri_c, EXT_NONE, EXT_NONE, tcg_gen_sari_tl) TRANS(rotri_w, 64, gen_rri_v, EXT_NONE, EXT_NONE, gen_rotr_w) TRANS(rotri_d, 64, gen_rri_c, EXT_NONE, EXT_NONE, tcg_gen_rotri_tl) diff --git a/target/loongarch/insn_trans/trans_vec.c.inc 
b/target/loongarch/tcg/insn_trans/trans_vec.c.inc similarity index 100% rename from target/loongarch/insn_trans/trans_vec.c.inc rename to target/loongarch/tcg/insn_trans/trans_vec.c.inc diff --git a/target/loongarch/iocsr_helper.c b/target/loongarch/tcg/iocsr_helper.c similarity index 76% rename from target/loongarch/iocsr_helper.c rename to target/loongarch/tcg/iocsr_helper.c index 6cd01d5f0940bede6de45f6ed35442e08770e937..b6916f53d20ca133f0000e773685cb94240bafe2 100644 --- a/target/loongarch/iocsr_helper.c +++ b/target/loongarch/tcg/iocsr_helper.c @@ -17,52 +17,52 @@ uint64_t helper_iocsrrd_b(CPULoongArchState *env, target_ulong r_addr) { - return address_space_ldub(&env->address_space_iocsr, r_addr, + return address_space_ldub(env->address_space_iocsr, r_addr, GET_MEMTXATTRS(env), NULL); } uint64_t helper_iocsrrd_h(CPULoongArchState *env, target_ulong r_addr) { - return address_space_lduw(&env->address_space_iocsr, r_addr, + return address_space_lduw(env->address_space_iocsr, r_addr, GET_MEMTXATTRS(env), NULL); } uint64_t helper_iocsrrd_w(CPULoongArchState *env, target_ulong r_addr) { - return address_space_ldl(&env->address_space_iocsr, r_addr, + return address_space_ldl(env->address_space_iocsr, r_addr, GET_MEMTXATTRS(env), NULL); } uint64_t helper_iocsrrd_d(CPULoongArchState *env, target_ulong r_addr) { - return address_space_ldq(&env->address_space_iocsr, r_addr, + return address_space_ldq(env->address_space_iocsr, r_addr, GET_MEMTXATTRS(env), NULL); } void helper_iocsrwr_b(CPULoongArchState *env, target_ulong w_addr, target_ulong val) { - address_space_stb(&env->address_space_iocsr, w_addr, + address_space_stb(env->address_space_iocsr, w_addr, val, GET_MEMTXATTRS(env), NULL); } void helper_iocsrwr_h(CPULoongArchState *env, target_ulong w_addr, target_ulong val) { - address_space_stw(&env->address_space_iocsr, w_addr, + address_space_stw(env->address_space_iocsr, w_addr, val, GET_MEMTXATTRS(env), NULL); } void helper_iocsrwr_w(CPULoongArchState *env, target_ulong w_addr, target_ulong val) { - address_space_stl(&env->address_space_iocsr, w_addr, + address_space_stl(env->address_space_iocsr, w_addr, val, GET_MEMTXATTRS(env), NULL); } void helper_iocsrwr_d(CPULoongArchState *env, target_ulong w_addr, target_ulong val) { - address_space_stq(&env->address_space_iocsr, w_addr, + address_space_stq(env->address_space_iocsr, w_addr, val, GET_MEMTXATTRS(env), NULL); } diff --git a/target/loongarch/tcg/meson.build b/target/loongarch/tcg/meson.build new file mode 100644 index 0000000000000000000000000000000000000000..1a3cd589fb3c24d0513d004c8d3009d8489e7e57 --- /dev/null +++ b/target/loongarch/tcg/meson.build @@ -0,0 +1,19 @@ +if 'CONFIG_TCG' not in config_all + subdir_done() +endif + +loongarch_ss.add([zlib, gen]) + +loongarch_ss.add(files( + 'fpu_helper.c', + 'op_helper.c', + 'translate.c', + 'vec_helper.c', +)) + +loongarch_system_ss.add(files( + 'constant_timer.c', + 'csr_helper.c', + 'iocsr_helper.c', + 'tlb_helper.c', +)) diff --git a/target/loongarch/op_helper.c b/target/loongarch/tcg/op_helper.c similarity index 100% rename from target/loongarch/op_helper.c rename to target/loongarch/tcg/op_helper.c diff --git a/target/loongarch/tlb_helper.c b/target/loongarch/tcg/tlb_helper.c similarity index 67% rename from target/loongarch/tlb_helper.c rename to target/loongarch/tcg/tlb_helper.c index 449043c68be02cc2280f194419462360a3baa045..eedd1ac376265cb8ba08e74ca4de089ad970bb6a 100644 --- a/target/loongarch/tlb_helper.c +++ b/target/loongarch/tcg/tlb_helper.c @@ -17,234 +17,32 @@ #include 
"exec/log.h" #include "cpu-csr.h" -enum { - TLBRET_MATCH = 0, - TLBRET_BADADDR = 1, - TLBRET_NOMATCH = 2, - TLBRET_INVALID = 3, - TLBRET_DIRTY = 4, - TLBRET_RI = 5, - TLBRET_XI = 6, - TLBRET_PE = 7, -}; - -static int loongarch_map_tlb_entry(CPULoongArchState *env, hwaddr *physical, - int *prot, target_ulong address, - int access_type, int index, int mmu_idx) +static void get_dir_base_width(CPULoongArchState *env, uint64_t *dir_base, + uint64_t *dir_width, target_ulong level) { - LoongArchTLB *tlb = &env->tlb[index]; - uint64_t plv = mmu_idx; - uint64_t tlb_entry, tlb_ppn; - uint8_t tlb_ps, n, tlb_v, tlb_d, tlb_plv, tlb_nx, tlb_nr, tlb_rplv; - - if (index >= LOONGARCH_STLB) { - tlb_ps = FIELD_EX64(tlb->tlb_misc, TLB_MISC, PS); - } else { - tlb_ps = FIELD_EX64(env->CSR_STLBPS, CSR_STLBPS, PS); - } - n = (address >> tlb_ps) & 0x1;/* Odd or even */ - - tlb_entry = n ? tlb->tlb_entry1 : tlb->tlb_entry0; - tlb_v = FIELD_EX64(tlb_entry, TLBENTRY, V); - tlb_d = FIELD_EX64(tlb_entry, TLBENTRY, D); - tlb_plv = FIELD_EX64(tlb_entry, TLBENTRY, PLV); - if (is_la64(env)) { - tlb_ppn = FIELD_EX64(tlb_entry, TLBENTRY_64, PPN); - tlb_nx = FIELD_EX64(tlb_entry, TLBENTRY_64, NX); - tlb_nr = FIELD_EX64(tlb_entry, TLBENTRY_64, NR); - tlb_rplv = FIELD_EX64(tlb_entry, TLBENTRY_64, RPLV); - } else { - tlb_ppn = FIELD_EX64(tlb_entry, TLBENTRY_32, PPN); - tlb_nx = 0; - tlb_nr = 0; - tlb_rplv = 0; - } - - /* Remove sw bit between bit12 -- bit PS*/ - tlb_ppn = tlb_ppn & ~(((0x1UL << (tlb_ps - 12)) -1)); - - /* Check access rights */ - if (!tlb_v) { - return TLBRET_INVALID; - } - - if (access_type == MMU_INST_FETCH && tlb_nx) { - return TLBRET_XI; - } - - if (access_type == MMU_DATA_LOAD && tlb_nr) { - return TLBRET_RI; - } - - if (((tlb_rplv == 0) && (plv > tlb_plv)) || - ((tlb_rplv == 1) && (plv != tlb_plv))) { - return TLBRET_PE; - } - - if ((access_type == MMU_DATA_STORE) && !tlb_d) { - return TLBRET_DIRTY; - } - - *physical = (tlb_ppn << R_TLBENTRY_64_PPN_SHIFT) | - (address & MAKE_64BIT_MASK(0, tlb_ps)); - *prot = PAGE_READ; - if (tlb_d) { - *prot |= PAGE_WRITE; - } - if (!tlb_nx) { - *prot |= PAGE_EXEC; - } - return TLBRET_MATCH; -} - -/* - * One tlb entry holds an adjacent odd/even pair, the vpn is the - * content of the virtual page number divided by 2. So the - * compare vpn is bit[47:15] for 16KiB page. while the vppn - * field in tlb entry contains bit[47:13], so need adjust. 
- * virt_vpn = vaddr[47:13] - */ -static bool loongarch_tlb_search(CPULoongArchState *env, target_ulong vaddr, - int *index) -{ - LoongArchTLB *tlb; - uint16_t csr_asid, tlb_asid, stlb_idx; - uint8_t tlb_e, tlb_ps, tlb_g, stlb_ps; - int i, compare_shift; - uint64_t vpn, tlb_vppn; - - csr_asid = FIELD_EX64(env->CSR_ASID, CSR_ASID, ASID); - stlb_ps = FIELD_EX64(env->CSR_STLBPS, CSR_STLBPS, PS); - vpn = (vaddr & TARGET_VIRT_MASK) >> (stlb_ps + 1); - stlb_idx = vpn & 0xff; /* VA[25:15] <==> TLBIDX.index for 16KiB Page */ - compare_shift = stlb_ps + 1 - R_TLB_MISC_VPPN_SHIFT; - - /* Search STLB */ - for (i = 0; i < 8; ++i) { - tlb = &env->tlb[i * 256 + stlb_idx]; - tlb_e = FIELD_EX64(tlb->tlb_misc, TLB_MISC, E); - if (tlb_e) { - tlb_vppn = FIELD_EX64(tlb->tlb_misc, TLB_MISC, VPPN); - tlb_asid = FIELD_EX64(tlb->tlb_misc, TLB_MISC, ASID); - tlb_g = FIELD_EX64(tlb->tlb_entry0, TLBENTRY, G); - - if ((tlb_g == 1 || tlb_asid == csr_asid) && - (vpn == (tlb_vppn >> compare_shift))) { - *index = i * 256 + stlb_idx; - return true; - } - } - } - - /* Search MTLB */ - for (i = LOONGARCH_STLB; i < LOONGARCH_TLB_MAX; ++i) { - tlb = &env->tlb[i]; - tlb_e = FIELD_EX64(tlb->tlb_misc, TLB_MISC, E); - if (tlb_e) { - tlb_vppn = FIELD_EX64(tlb->tlb_misc, TLB_MISC, VPPN); - tlb_ps = FIELD_EX64(tlb->tlb_misc, TLB_MISC, PS); - tlb_asid = FIELD_EX64(tlb->tlb_misc, TLB_MISC, ASID); - tlb_g = FIELD_EX64(tlb->tlb_entry0, TLBENTRY, G); - compare_shift = tlb_ps + 1 - R_TLB_MISC_VPPN_SHIFT; - vpn = (vaddr & TARGET_VIRT_MASK) >> (tlb_ps + 1); - if ((tlb_g == 1 || tlb_asid == csr_asid) && - (vpn == (tlb_vppn >> compare_shift))) { - *index = i; - return true; - } - } - } - return false; -} - -static int loongarch_map_address(CPULoongArchState *env, hwaddr *physical, - int *prot, target_ulong address, - MMUAccessType access_type, int mmu_idx) -{ - int index, match; - - match = loongarch_tlb_search(env, address, &index); - if (match) { - return loongarch_map_tlb_entry(env, physical, prot, - address, access_type, index, mmu_idx); - } - - return TLBRET_NOMATCH; -} - -static hwaddr dmw_va2pa(CPULoongArchState *env, target_ulong va, - target_ulong dmw) -{ - if (is_la64(env)) { - return va & TARGET_VIRT_MASK; - } else { - uint32_t pseg = FIELD_EX32(dmw, CSR_DMW_32, PSEG); - return (va & MAKE_64BIT_MASK(0, R_CSR_DMW_32_VSEG_SHIFT)) | \ - (pseg << R_CSR_DMW_32_VSEG_SHIFT); - } -} - -static int get_physical_address(CPULoongArchState *env, hwaddr *physical, - int *prot, target_ulong address, - MMUAccessType access_type, int mmu_idx) -{ - int user_mode = mmu_idx == MMU_IDX_USER; - int kernel_mode = mmu_idx == MMU_IDX_KERNEL; - uint32_t plv, base_c, base_v; - int64_t addr_high; - uint8_t da = FIELD_EX64(env->CSR_CRMD, CSR_CRMD, DA); - uint8_t pg = FIELD_EX64(env->CSR_CRMD, CSR_CRMD, PG); - - /* Check PG and DA */ - if (da & !pg) { - *physical = address & TARGET_PHYS_MASK; - *prot = PAGE_READ | PAGE_WRITE | PAGE_EXEC; - return TLBRET_MATCH; - } - - plv = kernel_mode | (user_mode << R_CSR_DMW_PLV3_SHIFT); - if (is_la64(env)) { - base_v = address >> R_CSR_DMW_64_VSEG_SHIFT; - } else { - base_v = address >> R_CSR_DMW_32_VSEG_SHIFT; - } - /* Check direct map window */ - for (int i = 0; i < 4; i++) { - if (is_la64(env)) { - base_c = FIELD_EX64(env->CSR_DMW[i], CSR_DMW_64, VSEG); - } else { - base_c = FIELD_EX64(env->CSR_DMW[i], CSR_DMW_32, VSEG); - } - if ((plv & env->CSR_DMW[i]) && (base_c == base_v)) { - *physical = dmw_va2pa(env, address, env->CSR_DMW[i]); - *prot = PAGE_READ | PAGE_WRITE | PAGE_EXEC; - return TLBRET_MATCH; - } - } - - /* 
Check valid extension */ - addr_high = sextract64(address, TARGET_VIRT_ADDR_SPACE_BITS, 16); - if (!(addr_high == 0 || addr_high == -1)) { - return TLBRET_BADADDR; - } - - /* Mapped address */ - return loongarch_map_address(env, physical, prot, address, - access_type, mmu_idx); -} - -hwaddr loongarch_cpu_get_phys_page_debug(CPUState *cs, vaddr addr) -{ - LoongArchCPU *cpu = LOONGARCH_CPU(cs); - CPULoongArchState *env = &cpu->env; - hwaddr phys_addr; - int prot; - - if (get_physical_address(env, &phys_addr, &prot, addr, MMU_DATA_LOAD, - cpu_mmu_index(env, false)) != 0) { - return -1; + switch (level) { + case 1: + *dir_base = FIELD_EX64(env->CSR_PWCL, CSR_PWCL, DIR1_BASE); + *dir_width = FIELD_EX64(env->CSR_PWCL, CSR_PWCL, DIR1_WIDTH); + break; + case 2: + *dir_base = FIELD_EX64(env->CSR_PWCL, CSR_PWCL, DIR2_BASE); + *dir_width = FIELD_EX64(env->CSR_PWCL, CSR_PWCL, DIR2_WIDTH); + break; + case 3: + *dir_base = FIELD_EX64(env->CSR_PWCH, CSR_PWCH, DIR3_BASE); + *dir_width = FIELD_EX64(env->CSR_PWCH, CSR_PWCH, DIR3_WIDTH); + break; + case 4: + *dir_base = FIELD_EX64(env->CSR_PWCH, CSR_PWCH, DIR4_BASE); + *dir_width = FIELD_EX64(env->CSR_PWCH, CSR_PWCH, DIR4_WIDTH); + break; + default: + /* level may be zero for ldpte */ + *dir_base = FIELD_EX64(env->CSR_PWCL, CSR_PWCL, PTBASE); + *dir_width = FIELD_EX64(env->CSR_PWCL, CSR_PWCL, PTWIDTH); + break; } - return phys_addr; } static void raise_mmu_exception(CPULoongArchState *env, target_ulong address, @@ -716,7 +514,25 @@ target_ulong helper_lddir(CPULoongArchState *env, target_ulong base, target_ulong badvaddr, index, phys, ret; int shift; uint64_t dir_base, dir_width; - bool huge = (base >> LOONGARCH_PAGE_HUGE_SHIFT) & 0x1; + + if (unlikely((level == 0) || (level > 4))) { + qemu_log_mask(LOG_GUEST_ERROR, + "Attempted LDDIR with level %"PRId64"\n", level); + return base; + } + + if (FIELD_EX64(base, TLBENTRY, HUGE)) { + if (unlikely(level == 4)) { + qemu_log_mask(LOG_GUEST_ERROR, + "Attempted use of level 4 huge page\n"); + } + + if (FIELD_EX64(base, TLBENTRY, LEVEL)) { + return base; + } else { + return FIELD_DP64(base, TLBENTRY, LEVEL, level); + } + } badvaddr = env->CSR_TLBRBADV; base = base & TARGET_PHYS_MASK; @@ -725,30 +541,7 @@ target_ulong helper_lddir(CPULoongArchState *env, target_ulong base, shift = FIELD_EX64(env->CSR_PWCL, CSR_PWCL, PTEWIDTH); shift = (shift + 1) * 3; - if (huge) { - return base; - } - switch (level) { - case 1: - dir_base = FIELD_EX64(env->CSR_PWCL, CSR_PWCL, DIR1_BASE); - dir_width = FIELD_EX64(env->CSR_PWCL, CSR_PWCL, DIR1_WIDTH); - break; - case 2: - dir_base = FIELD_EX64(env->CSR_PWCL, CSR_PWCL, DIR2_BASE); - dir_width = FIELD_EX64(env->CSR_PWCL, CSR_PWCL, DIR2_WIDTH); - break; - case 3: - dir_base = FIELD_EX64(env->CSR_PWCH, CSR_PWCH, DIR3_BASE); - dir_width = FIELD_EX64(env->CSR_PWCH, CSR_PWCH, DIR3_WIDTH); - break; - case 4: - dir_base = FIELD_EX64(env->CSR_PWCH, CSR_PWCH, DIR4_BASE); - dir_width = FIELD_EX64(env->CSR_PWCH, CSR_PWCH, DIR4_WIDTH); - break; - default: - do_raise_exception(env, EXCCODE_INE, GETPC()); - return 0; - } + get_dir_base_width(env, &dir_base, &dir_width, level); index = (badvaddr >> dir_base) & ((1 << dir_width) - 1); phys = base | index << shift; ret = ldq_phys(cs->as, phys) & TARGET_PHYS_MASK; @@ -761,20 +554,42 @@ void helper_ldpte(CPULoongArchState *env, target_ulong base, target_ulong odd, CPUState *cs = env_cpu(env); target_ulong phys, tmp0, ptindex, ptoffset0, ptoffset1, ps, badv; int shift; - bool huge = (base >> LOONGARCH_PAGE_HUGE_SHIFT) & 0x1; uint64_t ptbase =
FIELD_EX64(env->CSR_PWCL, CSR_PWCL, PTBASE); uint64_t ptwidth = FIELD_EX64(env->CSR_PWCL, CSR_PWCL, PTWIDTH); + uint64_t dir_base, dir_width; + /* + * The parameter "base" comes in two forms: + * either a page table base address, + * in which bit 6 is 0, + * or a huge page entry, + * in which bit 6 is 1. + */ base = base & TARGET_PHYS_MASK; + if (FIELD_EX64(base, TLBENTRY, HUGE)) { + /* + * Get the huge page level and the huge page size, + * clear the huge page level information in the entry, + * clear the huge page bit, + * and move the HGLOBAL bit to the GLOBAL bit. + */ + get_dir_base_width(env, &dir_base, &dir_width, + FIELD_EX64(base, TLBENTRY, LEVEL)); + + base = FIELD_DP64(base, TLBENTRY, LEVEL, 0); + base = FIELD_DP64(base, TLBENTRY, HUGE, 0); + if (FIELD_EX64(base, TLBENTRY, HGLOBAL)) { + base = FIELD_DP64(base, TLBENTRY, HGLOBAL, 0); + base = FIELD_DP64(base, TLBENTRY, G, 1); + } - if (huge) { - /* Huge Page. base is paddr */ - tmp0 = base ^ (1 << LOONGARCH_PAGE_HUGE_SHIFT); - /* Move Global bit */ - tmp0 = ((tmp0 & (1 << LOONGARCH_HGLOBAL_SHIFT)) >> - LOONGARCH_HGLOBAL_SHIFT) << R_TLBENTRY_G_SHIFT | - (tmp0 & (~(1 << LOONGARCH_HGLOBAL_SHIFT))); - ps = ptbase + ptwidth - 1; + ps = dir_base + dir_width - 1; + /* + * A huge page is split evenly into an odd/even pair of pages + * when it is loaded into the TLB, + * so the TLB page size is half the huge page size. + */ + tmp0 = base; if (odd) { tmp0 += MAKE_64BIT_MASK(ps, 1); } diff --git a/target/loongarch/translate.c b/target/loongarch/tcg/translate.c similarity index 100% rename from target/loongarch/translate.c rename to target/loongarch/tcg/translate.c diff --git a/target/loongarch/vec_helper.c b/target/loongarch/tcg/vec_helper.c similarity index 100% rename from target/loongarch/vec_helper.c rename to target/loongarch/tcg/vec_helper.c diff --git a/target/loongarch/trace-events b/target/loongarch/trace-events new file mode 100644 index 0000000000000000000000000000000000000000..dea11edc0f1164318642cbed4306ca23a29ed3a8 --- /dev/null +++ b/target/loongarch/trace-events @@ -0,0 +1,15 @@ +# See docs/devel/tracing.rst for syntax documentation.
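+# Each line below declares one trace point: event_name(typed args) "printf-style format string".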
+ +#kvm.c +kvm_failed_get_regs_core(const char *msg) "Failed to get core regs from KVM: %s" +kvm_failed_put_regs_core(const char *msg) "Failed to put core regs into KVM: %s" +kvm_failed_get_fpu(const char *msg) "Failed to get fpu from KVM: %s" +kvm_failed_put_fpu(const char *msg) "Failed to put fpu into KVM: %s" +kvm_failed_get_mpstate(const char *msg) "Failed to get mp_state from KVM: %s" +kvm_failed_put_mpstate(const char *msg) "Failed to put mp_state into KVM: %s" +kvm_failed_get_counter(const char *msg) "Failed to get counter from KVM: %s" +kvm_failed_put_counter(const char *msg) "Failed to put counter into KVM: %s" +kvm_failed_get_cpucfg(const char *msg) "Failed to get cpucfg from KVM: %s" +kvm_failed_put_cpucfg(const char *msg) "Failed to put cpucfg into KVM: %s" +kvm_arch_handle_exit(int num) "kvm arch handle exit, the reason number: %d" +kvm_set_intr(int irq, int level) "kvm set interrupt, irq num: %d, level: %d" diff --git a/target/loongarch/trace.h b/target/loongarch/trace.h new file mode 100644 index 0000000000000000000000000000000000000000..c2ecb78f0843e24884d8279aec544ecc0f07fec4 --- /dev/null +++ b/target/loongarch/trace.h @@ -0,0 +1 @@ +#include "trace/trace-target_loongarch.h" diff --git a/target/m68k/cpu.c b/target/m68k/cpu.c index 11c7e0a790203ed71662e85d6cdb312cc46af3e2..d95deaafcd31d05f164a0671f9d6e28c8e6e2c9d 100644 --- a/target/m68k/cpu.c +++ b/target/m68k/cpu.c @@ -396,12 +396,19 @@ static const VMStateDescription vmstate_freg = { } }; -static int fpu_post_load(void *opaque, int version) +static int fpu_pre_save(void *opaque) { M68kCPU *s = opaque; - cpu_m68k_restore_fp_status(&s->env); + s->env.fpsr = cpu_m68k_get_fpsr(&s->env); + return 0; +} + +static int fpu_post_load(void *opaque, int version) +{ + M68kCPU *s = opaque; + cpu_m68k_set_fpsr(&s->env, s->env.fpsr); return 0; } @@ -410,6 +417,7 @@ const VMStateDescription vmmstate_fpu = { .version_id = 1, .minimum_version_id = 1, .needed = fpu_needed, + .pre_save = fpu_pre_save, .post_load = fpu_post_load, .fields = (VMStateField[]) { VMSTATE_UINT32(env.fpcr, M68kCPU), diff --git a/target/m68k/cpu.h b/target/m68k/cpu.h index 6cfc696d2ba81e06c367b70678eb8585931e8d20..4d78da9d5fc3135ba0d0f95743a8782ae314e18a 100644 --- a/target/m68k/cpu.h +++ b/target/m68k/cpu.h @@ -199,7 +199,8 @@ void cpu_m68k_set_ccr(CPUM68KState *env, uint32_t); void cpu_m68k_set_sr(CPUM68KState *env, uint32_t); void cpu_m68k_restore_fp_status(CPUM68KState *env); void cpu_m68k_set_fpcr(CPUM68KState *env, uint32_t val); - +uint32_t cpu_m68k_get_fpsr(CPUM68KState *env); +void cpu_m68k_set_fpsr(CPUM68KState *env, uint32_t val); /* * Instead of computing the condition codes after each m68k instruction, diff --git a/target/m68k/fpu_helper.c b/target/m68k/fpu_helper.c index ab120b5f59b1f349af12a781fa10539d82769fc6..8314791f50428a5d32d6d1c81e8755be43f37ada 100644 --- a/target/m68k/fpu_helper.c +++ b/target/m68k/fpu_helper.c @@ -164,6 +164,78 @@ void HELPER(set_fpcr)(CPUM68KState *env, uint32_t val) cpu_m68k_set_fpcr(env, val); } +/* Convert host exception flags to cpu_m68k form. 
 */ +static int cpu_m68k_exceptbits_from_host(int host_bits) +{ + int target_bits = 0; + + if (host_bits & float_flag_invalid) { + target_bits |= 0x80; + } + if (host_bits & float_flag_overflow) { + target_bits |= 0x40; + } + if (host_bits & (float_flag_underflow | float_flag_output_denormal)) { + target_bits |= 0x20; + } + if (host_bits & float_flag_divbyzero) { + target_bits |= 0x10; + } + if (host_bits & float_flag_inexact) { + target_bits |= 0x08; + } + return target_bits; +} + +/* Convert cpu_m68k exception flags to host form. */ +static int cpu_m68k_exceptbits_to_host(int target_bits) +{ + int host_bits = 0; + + if (target_bits & 0x80) { + host_bits |= float_flag_invalid; + } + if (target_bits & 0x40) { + host_bits |= float_flag_overflow; + } + if (target_bits & 0x20) { + host_bits |= float_flag_underflow; + } + if (target_bits & 0x10) { + host_bits |= float_flag_divbyzero; + } + if (target_bits & 0x08) { + host_bits |= float_flag_inexact; + } + return host_bits; +} + +uint32_t cpu_m68k_get_fpsr(CPUM68KState *env) +{ + int host_flags = get_float_exception_flags(&env->fp_status); + int target_flags = cpu_m68k_exceptbits_from_host(host_flags); + int except = (env->fpsr & ~(0xf8)) | target_flags; + return except; +} + +uint32_t HELPER(get_fpsr)(CPUM68KState *env) +{ + return cpu_m68k_get_fpsr(env); +} + +void cpu_m68k_set_fpsr(CPUM68KState *env, uint32_t val) +{ + env->fpsr = val; + + int host_flags = cpu_m68k_exceptbits_to_host((int) env->fpsr); + set_float_exception_flags(host_flags, &env->fp_status); +} + +void HELPER(set_fpsr)(CPUM68KState *env, uint32_t val) +{ + cpu_m68k_set_fpsr(env, val); +} + #define PREC_BEGIN(prec) \ do { \ FloatX80RoundPrec old = \ diff --git a/target/m68k/helper.c b/target/m68k/helper.c index 0a1544cd68d9dc7cff8983283d07c83d773c32d6..beab4b96bc3be62372a17c7c44e4291586f8e504 100644 --- a/target/m68k/helper.c +++ b/target/m68k/helper.c @@ -118,7 +118,7 @@ static int m68k_fpu_gdb_get_reg(CPUM68KState *env, GByteArray *mem_buf, int n) case 8: /* fpcontrol */ return gdb_get_reg32(mem_buf, env->fpcr); case 9: /* fpstatus */ - return gdb_get_reg32(mem_buf, env->fpsr); + return gdb_get_reg32(mem_buf, cpu_m68k_get_fpsr(env)); case 10: /* fpiar, not implemented */ return gdb_get_reg32(mem_buf, 0); } @@ -137,7 +137,7 @@ static int m68k_fpu_gdb_set_reg(CPUM68KState *env, uint8_t *mem_buf, int n) cpu_m68k_set_fpcr(env, ldl_p(mem_buf)); return 4; case 9: /* fpstatus */ - env->fpsr = ldl_p(mem_buf); + cpu_m68k_set_fpsr(env, ldl_p(mem_buf)); return 4; case 10: /* fpiar, not implemented */ return 4; diff --git a/target/m68k/helper.h b/target/m68k/helper.h index 2bbe0dc03253d323078994f8a102f825b386a365..95aa5e53bb97b38084588d52cc90222b5c141851 100644 --- a/target/m68k/helper.h +++ b/target/m68k/helper.h @@ -54,6 +54,8 @@ DEF_HELPER_4(fsdiv, void, env, fp, fp, fp) DEF_HELPER_4(fddiv, void, env, fp, fp, fp) DEF_HELPER_4(fsgldiv, void, env, fp, fp, fp) DEF_HELPER_FLAGS_3(fcmp, TCG_CALL_NO_RWG, void, env, fp, fp) +DEF_HELPER_2(set_fpsr, void, env, i32) +DEF_HELPER_1(get_fpsr, i32, env) DEF_HELPER_FLAGS_2(set_fpcr, TCG_CALL_NO_RWG, void, env, i32) DEF_HELPER_FLAGS_2(ftst, TCG_CALL_NO_RWG, void, env, fp) DEF_HELPER_3(fconst, void, env, fp, i32) diff --git a/target/m68k/translate.c b/target/m68k/translate.c index 4a0b0b270364f2d91e7cdb91ecfc167d32ef4308..f8eeb70379853723f932978e0088a839357a7936 100644 --- a/target/m68k/translate.c +++ b/target/m68k/translate.c @@ -4686,7 +4686,7 @@ static void gen_load_fcr(DisasContext *s, TCGv res, int reg) tcg_gen_movi_i32(res, 0); break; case 
M68K_FPSR: - tcg_gen_ld_i32(res, tcg_env, offsetof(CPUM68KState, fpsr)); + gen_helper_get_fpsr(res, tcg_env); break; case M68K_FPCR: tcg_gen_ld_i32(res, tcg_env, offsetof(CPUM68KState, fpcr)); @@ -4700,7 +4700,7 @@ static void gen_store_fcr(DisasContext *s, TCGv val, int reg) case M68K_FPIAR: break; case M68K_FPSR: - tcg_gen_st_i32(val, tcg_env, offsetof(CPUM68KState, fpsr)); + gen_helper_set_fpsr(tcg_env, val); break; case M68K_FPCR: gen_helper_set_fpcr(tcg_env, val); diff --git a/target/ppc/machine.c b/target/ppc/machine.c index 68cbdffecd4f5042464473a5eb00f2326fab1c84..3e010f3a074b2af33e7e2aacdf41de22094d1ecf 100644 --- a/target/ppc/machine.c +++ b/target/ppc/machine.c @@ -621,7 +621,7 @@ static bool tlbemb_needed(void *opaque) } static const VMStateDescription vmstate_tlbemb = { - .name = "cpu/tlb6xx", + .name = "cpu/tlbemb", .version_id = 1, .minimum_version_id = 1, .needed = tlbemb_needed, diff --git a/target/ppc/translate/vsx-impl.c.inc b/target/ppc/translate/vsx-impl.c.inc index 6db87ab3367f9fa6d427fe60812c8f1deb19ba64..986453d35f0a8028ca2b31668bee3070ed83cfc4 100644 --- a/target/ppc/translate/vsx-impl.c.inc +++ b/target/ppc/translate/vsx-impl.c.inc @@ -2268,7 +2268,7 @@ static bool do_lstxv(DisasContext *ctx, int ra, TCGv displ, static bool do_lstxv_D(DisasContext *ctx, arg_D *a, bool store, bool paired) { - if (paired || a->rt >= 32) { + if (paired || a->rt < 32) { REQUIRE_VSX(ctx); } else { REQUIRE_VECTOR(ctx); @@ -2292,7 +2292,7 @@ static bool do_lstxv_PLS_D(DisasContext *ctx, arg_PLS_D *a, static bool do_lstxv_X(DisasContext *ctx, arg_X *a, bool store, bool paired) { - if (paired || a->rt >= 32) { + if (paired || a->rt < 32) { REQUIRE_VSX(ctx); } else { REQUIRE_VECTOR(ctx); diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c index 83c7c0cf07be15fe68ec512de670cccf9de56949..77cb59b8a17e4a71b18d600446b36e376d7ef091 100644 --- a/target/riscv/cpu.c +++ b/target/riscv/cpu.c @@ -1359,7 +1359,7 @@ const RISCVCPUMultiExtConfig riscv_cpu_extensions[] = { /* Vector cryptography extensions */ MULTI_EXT_CFG_BOOL("zvbb", ext_zvbb, false), MULTI_EXT_CFG_BOOL("zvbc", ext_zvbc, false), - MULTI_EXT_CFG_BOOL("zvkb", ext_zvkg, false), + MULTI_EXT_CFG_BOOL("zvkb", ext_zvkb, false), MULTI_EXT_CFG_BOOL("zvkg", ext_zvkg, false), MULTI_EXT_CFG_BOOL("zvkned", ext_zvkned, false), MULTI_EXT_CFG_BOOL("zvknha", ext_zvknha, false), diff --git a/target/riscv/cpu_helper.c b/target/riscv/cpu_helper.c index e7e23b34f455d4b9d01acdfe4808325c86d2169a..4d8f1248dd08f3f6e5a44b121b45450acf1d8dd2 100644 --- a/target/riscv/cpu_helper.c +++ b/target/riscv/cpu_helper.c @@ -1644,10 +1644,10 @@ void riscv_cpu_do_interrupt(CPUState *cs) bool async = !!(cs->exception_index & RISCV_EXCP_INT_FLAG); target_ulong cause = cs->exception_index & RISCV_EXCP_INT_MASK; uint64_t deleg = async ? 
env->mideleg : env->medeleg; - bool s_injected = env->mvip & (1 << cause) & env->mvien && - !(env->mip & (1 << cause)); - bool vs_injected = env->hvip & (1 << cause) & env->hvien && - !(env->mip & (1 << cause)); + bool s_injected = env->mvip & (1ULL << cause) & env->mvien && + !(env->mip & (1ULL << cause)); + bool vs_injected = env->hvip & (1ULL << cause) & env->hvien && + !(env->mip & (1ULL << cause)); target_ulong tval = 0; target_ulong tinst = 0; target_ulong htval = 0; diff --git a/target/riscv/csr.c b/target/riscv/csr.c index fde7ce1a5336b5d744c6676edadfe64b1688a2df..d1bb7bc0d3df7e2da52bba2ee90ab45c3e171dbd 100644 --- a/target/riscv/csr.c +++ b/target/riscv/csr.c @@ -704,7 +704,7 @@ static RISCVException write_vxrm(CPURISCVState *env, int csrno, static RISCVException read_vxsat(CPURISCVState *env, int csrno, target_ulong *val) { - *val = env->vxsat; + *val = env->vxsat & BIT(0); return RISCV_EXCP_NONE; } @@ -714,7 +714,7 @@ static RISCVException write_vxsat(CPURISCVState *env, int csrno, #if !defined(CONFIG_USER_ONLY) env->mstatus |= MSTATUS_VS; #endif - env->vxsat = val; + env->vxsat = val & BIT(0); return RISCV_EXCP_NONE; } diff --git a/target/riscv/kvm/kvm-cpu.c b/target/riscv/kvm/kvm-cpu.c index 45b6cf1cfa04a87dd9526c8b017ecca78a956e4f..b3dc2070f9e1c16808978ee8580e1aa7bb748477 100644 --- a/target/riscv/kvm/kvm-cpu.c +++ b/target/riscv/kvm/kvm-cpu.c @@ -369,10 +369,14 @@ static void kvm_riscv_update_cpu_cfg_isa_ext(RISCVCPU *cpu, CPUState *cs) reg = kvm_cpu_cfg_get(cpu, multi_ext_cfg); ret = kvm_set_one_reg(cs, id, ®); if (ret != 0) { - error_report("Unable to %s extension %s in KVM, error %d", - reg ? "enable" : "disable", - multi_ext_cfg->name, ret); - exit(EXIT_FAILURE); + if (!reg && ret == -EINVAL) { + warn_report("KVM cannot disable extension %s", + multi_ext_cfg->name); + } else { + error_report("Unable to enable extension %s in KVM, error %d", + multi_ext_cfg->name, ret); + exit(EXIT_FAILURE); + } } } } diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c index c1c3a4d1eabc706e3a58eaa4227f076ba8e59593..0602b623e0fe6664c7cfc5db071db28091ef84e2 100644 --- a/target/riscv/vector_helper.c +++ b/target/riscv/vector_helper.c @@ -196,7 +196,7 @@ vext_ldst_stride(void *vd, void *v0, target_ulong base, uint32_t esz = 1 << log2_esz; uint32_t vma = vext_vma(desc); - for (i = env->vstart; i < env->vl; i++, env->vstart++) { + for (i = env->vstart; i < env->vl; env->vstart = ++i) { k = 0; while (k < nf) { if (!vm && !vext_elem_mask(v0, i)) { @@ -262,7 +262,7 @@ vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc, uint32_t esz = 1 << log2_esz; /* load bytes from guest memory */ - for (i = env->vstart; i < evl; i++, env->vstart++) { + for (i = env->vstart; i < evl; env->vstart = ++i) { k = 0; while (k < nf) { target_ulong addr = base + ((i * nf + k) << log2_esz); @@ -376,7 +376,7 @@ vext_ldst_index(void *vd, void *v0, target_ulong base, uint32_t vma = vext_vma(desc); /* load bytes from guest memory */ - for (i = env->vstart; i < env->vl; i++, env->vstart++) { + for (i = env->vstart; i < env->vl; env->vstart = ++i) { k = 0; while (k < nf) { if (!vm && !vext_elem_mask(v0, i)) { @@ -4770,6 +4770,7 @@ void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \ } \ *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset)); \ } \ + env->vstart = 0; \ /* set tail elements to 1s */ \ vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \ } @@ -5045,7 +5046,7 @@ void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2, \ } \ env->vstart = 0; \ 
/* set tail elements to 1s */ \ - vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \ + vext_set_elems_1s(vd, vta, num * esz, total_elems * esz); \ } /* Compress into vd elements of vs2 where vs1 is enabled */ @@ -5063,9 +5064,17 @@ void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc) uint32_t startb = env->vstart * sewb; uint32_t i = startb; + if (HOST_BIG_ENDIAN && i % 8 != 0) { + uint32_t j = ROUND_UP(i, 8); + memcpy((uint8_t *)vd + H1(j - 1), + (uint8_t *)vs2 + H1(j - 1), + j - i); + i = j; + } + memcpy((uint8_t *)vd + H1(i), (uint8_t *)vs2 + H1(i), - maxsz - startb); + maxsz - i); env->vstart = 0; } diff --git a/target/riscv/vector_internals.c b/target/riscv/vector_internals.c index 9cf5c17cdeacad77952935a99d9e2021f813047e..be6eb040d223bc01372f31e55584a85698b170a9 100644 --- a/target/riscv/vector_internals.c +++ b/target/riscv/vector_internals.c @@ -29,6 +29,28 @@ void vext_set_elems_1s(void *base, uint32_t is_agnostic, uint32_t cnt, if (tot - cnt == 0) { return ; } + + if (HOST_BIG_ENDIAN) { + /* + * Handle the case where the elements are inside + * only one uint64_t block, including setting the + * masked-off element. + */ + if (((tot - 1) ^ cnt) < 8) { + memset(base + H1(tot - 1), -1, tot - cnt); + return; + } + /* + * Otherwise, the elements cross at least two uint64_t blocks. + * Set the first, unaligned block. + */ + if (cnt % 8 != 0) { + uint32_t j = ROUND_UP(cnt, 8); + memset(base + H1(j - 1), -1, j - cnt); + cnt = j; + } + /* Set the remaining 64-bit aligned blocks. */ + } memset(base + cnt, -1, tot - cnt); } diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c index 6acfa1c91b20451222abbacfbfec7d0bfae3a35c..5e64f24cc2e576bdb15fc7bca8ce99604927b9cc 100644 --- a/target/s390x/cpu.c +++ b/target/s390x/cpu.c @@ -350,7 +350,7 @@ static void s390_cpu_class_init(ObjectClass *oc, void *data) device_class_set_parent_reset(dc, s390_cpu_reset_full, &scc->parent_reset); scc->reset = s390_cpu_reset; - cc->class_by_name = s390_cpu_class_by_name, + cc->class_by_name = s390_cpu_class_by_name; cc->has_work = s390_cpu_has_work; cc->dump_state = s390_cpu_dump_state; cc->query_cpu_fast = s390_query_cpu_fast; diff --git a/target/sparc/helper.c b/target/sparc/helper.c index bd10b60e4bfb2bdb42dec24ea30045a62621f297..8820c59e7cbbace98972a63e2e8017dbdb2bad2e 100644 --- a/target/sparc/helper.c +++ b/target/sparc/helper.c @@ -121,7 +121,7 @@ uint64_t helper_sdiv(CPUSPARCState *env, target_ulong a, target_ulong b) return (uint32_t)(b32 < 0 ? INT32_MAX : INT32_MIN) | (-1ull << 32); } - a64 /= b; + a64 /= b32; r = a64; if (unlikely(r != a64)) { return (uint32_t)(a64 < 0 ? INT32_MIN : INT32_MAX) | (-1ull << 32); } diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc index bab0a173a37378535bfe8434a32201cdcbbc6b65..ad2690b90d22f36896ba5abd701ec7bfee83e3b4 100644 --- a/tcg/loongarch64/tcg-target.c.inc +++ b/tcg/loongarch64/tcg-target.c.inc @@ -365,8 +365,7 @@ static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd, * back to the slow path. */ - intptr_t pc_offset; - tcg_target_long val_lo, val_hi, pc_hi, offset_hi; + intptr_t src_rx, pc_offset; tcg_target_long hi12, hi32, hi52; /* Value fits in signed i32. */ @@ -376,24 +375,23 @@ static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd, } /* PC-relative cases. */ - pc_offset = tcg_pcrel_diff(s, (void *)val); - if (pc_offset == sextreg(pc_offset, 0, 22) && (pc_offset & 3) == 0) { - /* Single pcaddu2i. 
*/ - tcg_out_opc_pcaddu2i(s, rd, pc_offset >> 2); - return; + src_rx = (intptr_t)tcg_splitwx_to_rx(s->code_ptr); + if ((val & 3) == 0) { + pc_offset = val - src_rx; + if (pc_offset == sextreg(pc_offset, 0, 22)) { + /* Single pcaddu2i. */ + tcg_out_opc_pcaddu2i(s, rd, pc_offset >> 2); + return; + } } - if (pc_offset == (int32_t)pc_offset) { - /* Offset within 32 bits; load with pcalau12i + ori. */ - val_lo = sextreg(val, 0, 12); - val_hi = val >> 12; - pc_hi = (val - pc_offset) >> 12; - offset_hi = val_hi - pc_hi; - - tcg_debug_assert(offset_hi == sextreg(offset_hi, 0, 20)); - tcg_out_opc_pcalau12i(s, rd, offset_hi); + pc_offset = (val >> 12) - (src_rx >> 12); + if (pc_offset == sextreg(pc_offset, 0, 20)) { + /* Load with pcalau12i + ori. */ + tcg_target_long val_lo = val & 0xfff; + tcg_out_opc_pcalau12i(s, rd, pc_offset); if (val_lo != 0) { - tcg_out_opc_ori(s, rd, rd, val_lo & 0xfff); + tcg_out_opc_ori(s, rd, rd, val_lo); } return; } diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c index bb88943f79cfb635f574a1a7cd390c4301a6b3e2..733b44f105f2f2eb22b0a2153a618060c3190911 100644 --- a/tcg/tcg-op-gvec.c +++ b/tcg/tcg-op-gvec.c @@ -88,7 +88,20 @@ uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data) uint32_t desc = 0; check_size_align(oprsz, maxsz, 0); - tcg_debug_assert(data == sextract32(data, 0, SIMD_DATA_BITS)); + + /* + * We want to check that 'data' will fit into SIMD_DATA_BITS. + * However, some callers want to treat the data as a signed + * value (which they can later get back with simd_data()) + * and some want to treat it as an unsigned value. + * So here we assert only that the data will fit into the + * field in at least one way. This means that some invalid + * values from the caller will not be detected, e.g. if the + * caller wants to handle the value as a signed integer but + * incorrectly passes us 1 << (SIMD_DATA_BITS - 1). + */ + tcg_debug_assert(data == sextract32(data, 0, SIMD_DATA_BITS) || + data == extract32(data, 0, SIMD_DATA_BITS)); oprsz = (oprsz / 8) - 1; maxsz = (maxsz / 8) - 1; diff --git a/tcg/tcg.c b/tcg/tcg.c index 896a36caeba6ce0c480106a77e118c6a05417216..61fcf8597d1fbef96099514e5ec85b210acecbc0 100644 --- a/tcg/tcg.c +++ b/tcg/tcg.c @@ -764,6 +764,14 @@ static void alloc_tcg_plugin_context(TCGContext *s) #endif } +static void free_tcg_plugin_context(TCGContext *s) +{ +#ifdef CONFIG_PLUGIN + g_ptr_array_unref(s->plugin_tb->insns); + g_free(s->plugin_tb); +#endif +} + /* * All TCG threads except the parent (i.e. the one that called tcg_context_init * and registered the target's TCG globals) must register with this function @@ -814,6 +822,21 @@ void tcg_register_thread(void) tcg_ctx = s; } + +void tcg_unregister_thread(void) +{ + TCGContext *s = tcg_ctx; + unsigned int n; + + /* Unclaim an entry in tcg_ctxs */ + n = qatomic_fetch_dec(&tcg_cur_ctxs); + g_assert(n > 1); + qatomic_store_release(&tcg_ctxs[n - 1], 0); + + free_tcg_plugin_context(s); + + g_free(s); +} #endif /* !CONFIG_USER_ONLY */ /* pool based memory allocation */ diff --git a/tests/avocado/acpi-bits.py b/tests/avocado/acpi-bits.py index 68b9e98d4ea9d4092905e0ca9eb98aae082c0d3b..efe4f52ee05c17800f9be3958c60ac58691df0cc 100644 --- a/tests/avocado/acpi-bits.py +++ b/tests/avocado/acpi-bits.py @@ -54,6 +54,8 @@ deps = ["xorriso", "mformat"] # dependent tools needed in the test setup/box. supported_platforms = ['x86_64'] # supported test platforms. +# default timeout of 120 secs is sometimes not enough for bits test. 
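[Editor's note, referring back to the relaxed simd_desc() assertion in the tcg/tcg-op-gvec.c hunk above: a stand-alone sketch of why the same bit pattern can pass the unsigned check yet read back as negative through a signed extract. The 22-bit field width is assumed purely for illustration; the real SIMD_DATA_BITS is defined in tcg's gvec descriptor header.]

#include <stdint.h>
#include <stdio.h>

#define DATA_BITS 22   /* assumed field width, for illustration only */

/* sign-extract the low DATA_BITS bits, in the spirit of sextract32() */
static int32_t sext(uint32_t v)
{
    return (int32_t)(v << (32 - DATA_BITS)) >> (32 - DATA_BITS);
}

/* zero-extract the low DATA_BITS bits, in the spirit of extract32() */
static uint32_t zext(uint32_t v)
{
    return v & ((1u << DATA_BITS) - 1);
}

int main(void)
{
    uint32_t data = 1u << (DATA_BITS - 1);   /* 0x200000 */

    /* The value fits as unsigned, so the relaxed assertion passes... */
    printf("fits unsigned: %d\n", data == zext(data));
    /* ...but a caller treating it as signed later reads back -2097152. */
    printf("signed view: %d\n", (int)sext(data));
    return 0;
}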
+BITS_TIMEOUT = 200 def which(tool): """ looks up the full path for @tool, returns None if not found @@ -133,7 +135,7 @@ class AcpiBitsTest(QemuBaseTest): #pylint: disable=too-many-instance-attributes """ # in slower systems the test can take as long as 3 minutes to complete. - timeout = 200 + timeout = BITS_TIMEOUT def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -400,7 +402,8 @@ def test_acpi_smbios_bits(self): # biosbits has been configured to run all the specified test suites # in batch mode and then automatically initiate a vm shutdown. - # Rely on avocado's unit test timeout. - self._vm.event_wait('SHUTDOWN') + # Set timeout to BITS_TIMEOUT for the SHUTDOWN event from the bits VM, + # on par with the avocado test timeout. + self._vm.event_wait('SHUTDOWN', timeout=BITS_TIMEOUT) self._vm.wait(timeout=None) self.parse_log() diff --git a/tests/avocado/replay_linux.py b/tests/avocado/replay_linux.py index 270ccc1eae81340ff9b539f1fc6f713eac263f88..e95bff329999186728510a2663feebb29d888120 100644 --- a/tests/avocado/replay_linux.py +++ b/tests/avocado/replay_linux.py @@ -94,7 +94,7 @@ def launch_and_wait(self, record, args, shift): else: vm.event_wait('SHUTDOWN', self.timeout) vm.wait() - logger.info('successfully fihished the replay') + logger.info('successfully finished the replay') elapsed = time.time() - start_time logger.info('elapsed time %.2f sec' % elapsed) return elapsed diff --git a/tests/data/acpi/microvm/DSDT.pcie b/tests/data/acpi/microvm/DSDT.pcie index 765f14ef3d1e54d3cadccbf0a880f8adb73b3f1f..af4c3eb38866d5a32928a73b01ea49a946073aa6 100644 Binary files a/tests/data/acpi/microvm/DSDT.pcie and b/tests/data/acpi/microvm/DSDT.pcie differ diff --git a/tests/data/acpi/pc/DSDT b/tests/data/acpi/pc/DSDT index c93ad6b7f83a168a1833d7dba1112dd2ab8a431f..714279255b1c17241109cfaaf25a1e1ccb41259e 100644 Binary files a/tests/data/acpi/pc/DSDT and b/tests/data/acpi/pc/DSDT differ diff --git a/tests/data/acpi/pc/DSDT.acpierst b/tests/data/acpi/pc/DSDT.acpierst index f643fa2d034053fa07f74f095565b64f021d4290..c1655914728edabdf356ce41d076c7eeb84705ca 100644 Binary files a/tests/data/acpi/pc/DSDT.acpierst and b/tests/data/acpi/pc/DSDT.acpierst differ diff --git a/tests/data/acpi/pc/DSDT.acpihmat b/tests/data/acpi/pc/DSDT.acpihmat index 9d3695ff289036856886a093733926667a32a058..295e3d7bf86d080cefb9d1571cb84c7325a1d98d 100644 Binary files a/tests/data/acpi/pc/DSDT.acpihmat and b/tests/data/acpi/pc/DSDT.acpihmat differ diff --git a/tests/data/acpi/pc/DSDT.bridge b/tests/data/acpi/pc/DSDT.bridge index 840b45f354ac14c858d0af8fbd31e97949a65d4b..ff9694ee4cbd7213b1f9f9e684c6c9feb9564327 100644 Binary files a/tests/data/acpi/pc/DSDT.bridge and b/tests/data/acpi/pc/DSDT.bridge differ diff --git a/tests/data/acpi/pc/DSDT.cphp b/tests/data/acpi/pc/DSDT.cphp index dbc0141b2bbc77a6d806ff046dc137992c59a899..dfc25a2cb677843817768ed947df1a1ff5672c67 100644 Binary files a/tests/data/acpi/pc/DSDT.cphp and b/tests/data/acpi/pc/DSDT.cphp differ diff --git a/tests/data/acpi/pc/DSDT.dimmpxm b/tests/data/acpi/pc/DSDT.dimmpxm index 1294f655d418dbdccc095e0d47ab220869a61a07..6b74c6cdcb01aa2cb781715a8b595072da5c2fab 100644 Binary files a/tests/data/acpi/pc/DSDT.dimmpxm and b/tests/data/acpi/pc/DSDT.dimmpxm differ diff --git a/tests/data/acpi/pc/DSDT.hpbridge b/tests/data/acpi/pc/DSDT.hpbridge index 8012b5eb3155377dc7995b73059ecb267d19232c..39e93ce02b6cd6db664f5469c6b974675205ced8 100644 Binary files a/tests/data/acpi/pc/DSDT.hpbridge and b/tests/data/acpi/pc/DSDT.hpbridge differ diff --git 
a/tests/data/acpi/pc/DSDT.hpbrroot b/tests/data/acpi/pc/DSDT.hpbrroot index 4fa0c6fe720f7859f0541b82f828c0329a3c0548..06e1f09d7069c5917bdb56101551551273410adb 100644 Binary files a/tests/data/acpi/pc/DSDT.hpbrroot and b/tests/data/acpi/pc/DSDT.hpbrroot differ diff --git a/tests/data/acpi/pc/DSDT.ipmikcs b/tests/data/acpi/pc/DSDT.ipmikcs index 0a891baf458abee4a772ffba7a31914ec22418ec..950eab96f517ade58f5c479684d449191288e98c 100644 Binary files a/tests/data/acpi/pc/DSDT.ipmikcs and b/tests/data/acpi/pc/DSDT.ipmikcs differ diff --git a/tests/data/acpi/pc/DSDT.memhp b/tests/data/acpi/pc/DSDT.memhp index 9b442a64cf711b33d80691fe84f1d3a6256f943b..81533f080774c483a87d31b947e4ac6c7d4b81a8 100644 Binary files a/tests/data/acpi/pc/DSDT.memhp and b/tests/data/acpi/pc/DSDT.memhp differ diff --git a/tests/data/acpi/pc/DSDT.nohpet b/tests/data/acpi/pc/DSDT.nohpet index 1754c6878839fc657230e1e714cd7c5142e0a77e..b8b37e810e17176cfc3ce105bf3a8c97de627f8d 100644 Binary files a/tests/data/acpi/pc/DSDT.nohpet and b/tests/data/acpi/pc/DSDT.nohpet differ diff --git a/tests/data/acpi/pc/DSDT.numamem b/tests/data/acpi/pc/DSDT.numamem index 9fc731d3d2bcde5e2612a8ccd81e12098134afe9..6283080304b6b6d80346457a0f428867a9e4a09b 100644 Binary files a/tests/data/acpi/pc/DSDT.numamem and b/tests/data/acpi/pc/DSDT.numamem differ diff --git a/tests/data/acpi/pc/DSDT.roothp b/tests/data/acpi/pc/DSDT.roothp index e654c83ebe40c413b204c711adcefe3f04655e8c..19b5f0ceea9ad696e6fff21f0cb9955217b017ac 100644 Binary files a/tests/data/acpi/pc/DSDT.roothp and b/tests/data/acpi/pc/DSDT.roothp differ diff --git a/tests/data/acpi/q35/DSDT b/tests/data/acpi/q35/DSDT index fb89ae0ac6d4346e33156e9e4d3718698a0a1a8e..b0bbff7686c9a56129bfa3408e62f142cc482713 100644 Binary files a/tests/data/acpi/q35/DSDT and b/tests/data/acpi/q35/DSDT differ diff --git a/tests/data/acpi/q35/DSDT.acpierst b/tests/data/acpi/q35/DSDT.acpierst index 46fd25400b7c00ee9149ddb64cb5d5bd73f6a82b..f91cbe55fcfeea319babf7c9a0c6a6ccdc3320d1 100644 Binary files a/tests/data/acpi/q35/DSDT.acpierst and b/tests/data/acpi/q35/DSDT.acpierst differ diff --git a/tests/data/acpi/q35/DSDT.acpihmat b/tests/data/acpi/q35/DSDT.acpihmat index 61c5bd52a42242e85090934e8e45bf01642609d6..0949fb9d67c70dc882e50501ece421114ad8080b 100644 Binary files a/tests/data/acpi/q35/DSDT.acpihmat and b/tests/data/acpi/q35/DSDT.acpihmat differ diff --git a/tests/data/acpi/q35/DSDT.acpihmat-noinitiator b/tests/data/acpi/q35/DSDT.acpihmat-noinitiator index 3aaa2bbdf54a0d0cade14421e84c6ec5a42f96fa..0fa4daa35cf95f93ba8c15f478460fe4e14e6d9e 100644 Binary files a/tests/data/acpi/q35/DSDT.acpihmat-noinitiator and b/tests/data/acpi/q35/DSDT.acpihmat-noinitiator differ diff --git a/tests/data/acpi/q35/DSDT.applesmc b/tests/data/acpi/q35/DSDT.applesmc index 944209adeaa5bbb722431161c404cb51b8209993..a5d032b7d96113c9393036b2ba831adb6d584142 100644 Binary files a/tests/data/acpi/q35/DSDT.applesmc and b/tests/data/acpi/q35/DSDT.applesmc differ diff --git a/tests/data/acpi/q35/DSDT.bridge b/tests/data/acpi/q35/DSDT.bridge index d9938dba8fa5d405f7696c0dbdc24f3ae42ec934..3464f552974672bde25eb15f1c93c309c57ef5cb 100644 Binary files a/tests/data/acpi/q35/DSDT.bridge and b/tests/data/acpi/q35/DSDT.bridge differ diff --git a/tests/data/acpi/q35/DSDT.cphp b/tests/data/acpi/q35/DSDT.cphp index 20955d0aa30120553da35d5a6640055d26255cf9..7fd59bf6702c04a622f05ae356a2ea37312ab403 100644 Binary files a/tests/data/acpi/q35/DSDT.cphp and b/tests/data/acpi/q35/DSDT.cphp differ diff --git a/tests/data/acpi/q35/DSDT.cxl 
b/tests/data/acpi/q35/DSDT.cxl index 145301c52af9a17242bb306c210f8a7e0f01b827..27a1726af2d71417e0cd41b2a14bea0d9c410225 100644 Binary files a/tests/data/acpi/q35/DSDT.cxl and b/tests/data/acpi/q35/DSDT.cxl differ diff --git a/tests/data/acpi/q35/DSDT.dimmpxm b/tests/data/acpi/q35/DSDT.dimmpxm index 228374b55bd544116e359f659e546fc66cf8a895..1db0bf454a203006f866e6752d06422ae675cbd3 100644 Binary files a/tests/data/acpi/q35/DSDT.dimmpxm and b/tests/data/acpi/q35/DSDT.dimmpxm differ diff --git a/tests/data/acpi/q35/DSDT.ipmibt b/tests/data/acpi/q35/DSDT.ipmibt index 45f911ada5645f158f3d6c0c430ec1d52cadc5d8..25f43ae8efb55364a739e6b5e3cb4e71e61862b0 100644 Binary files a/tests/data/acpi/q35/DSDT.ipmibt and b/tests/data/acpi/q35/DSDT.ipmibt differ diff --git a/tests/data/acpi/q35/DSDT.ipmismbus b/tests/data/acpi/q35/DSDT.ipmismbus index e5d6811bee1233d74236453c49060390d74d4416..32bcd25bda9e9d2775790385f8da6a11e9d5cb46 100644 Binary files a/tests/data/acpi/q35/DSDT.ipmismbus and b/tests/data/acpi/q35/DSDT.ipmismbus differ diff --git a/tests/data/acpi/q35/DSDT.ivrs b/tests/data/acpi/q35/DSDT.ivrs index 46fd25400b7c00ee9149ddb64cb5d5bd73f6a82b..f91cbe55fcfeea319babf7c9a0c6a6ccdc3320d1 100644 Binary files a/tests/data/acpi/q35/DSDT.ivrs and b/tests/data/acpi/q35/DSDT.ivrs differ diff --git a/tests/data/acpi/q35/DSDT.memhp b/tests/data/acpi/q35/DSDT.memhp index 5ce081187a578ba7145a9ba20d30be36c13b7663..be90eb71d8dda8fe54c79ffffe103986ee06ae3a 100644 Binary files a/tests/data/acpi/q35/DSDT.memhp and b/tests/data/acpi/q35/DSDT.memhp differ diff --git a/tests/data/acpi/q35/DSDT.mmio64 b/tests/data/acpi/q35/DSDT.mmio64 index bdf36c4d575bfc4eb2eac3f00c9b7b4270f88677..01f276a6aff38a1d4f58640a9e6d120fc9a04b61 100644 Binary files a/tests/data/acpi/q35/DSDT.mmio64 and b/tests/data/acpi/q35/DSDT.mmio64 differ diff --git a/tests/data/acpi/q35/DSDT.multi-bridge b/tests/data/acpi/q35/DSDT.multi-bridge index 1db43a69e4c2affd8bd678bbef4d3c228380288e..1bd2ee8d2ebd3c9e0ed89a86478691f2e06f2590 100644 Binary files a/tests/data/acpi/q35/DSDT.multi-bridge and b/tests/data/acpi/q35/DSDT.multi-bridge differ diff --git a/tests/data/acpi/q35/DSDT.noacpihp b/tests/data/acpi/q35/DSDT.noacpihp index 8bc16887e1c963c61aaecf71712a09c0554f6d67..45cc2bcffa42d73db110afd5075556dcfe5d9936 100644 Binary files a/tests/data/acpi/q35/DSDT.noacpihp and b/tests/data/acpi/q35/DSDT.noacpihp differ diff --git a/tests/data/acpi/q35/DSDT.nohpet b/tests/data/acpi/q35/DSDT.nohpet index c13e45e3612646cc2e30f00b3b7e53335da816ea..f110504b9c813aa07802fc17d2869596a2eeca6f 100644 Binary files a/tests/data/acpi/q35/DSDT.nohpet and b/tests/data/acpi/q35/DSDT.nohpet differ diff --git a/tests/data/acpi/q35/DSDT.numamem b/tests/data/acpi/q35/DSDT.numamem index ba6669437e65952f24516ded954b33fe54bdedfb..6090958f39875f5806e72e23f32cb4b3ae840627 100644 Binary files a/tests/data/acpi/q35/DSDT.numamem and b/tests/data/acpi/q35/DSDT.numamem differ diff --git a/tests/data/acpi/q35/DSDT.pvpanic-isa b/tests/data/acpi/q35/DSDT.pvpanic-isa index 6ad42873e91c80cef5a42224cb4d31936dad59b4..7a8e568315a43f1fa98068d8e78995c98064fb91 100644 Binary files a/tests/data/acpi/q35/DSDT.pvpanic-isa and b/tests/data/acpi/q35/DSDT.pvpanic-isa differ diff --git a/tests/data/acpi/q35/DSDT.tis.tpm12 b/tests/data/acpi/q35/DSDT.tis.tpm12 index e381ce4cbf2b11f56a2d0537db4d21acc97450c9..29a416f0508655d2bfde01fff4d25ad7f89581d9 100644 Binary files a/tests/data/acpi/q35/DSDT.tis.tpm12 and b/tests/data/acpi/q35/DSDT.tis.tpm12 differ diff --git a/tests/data/acpi/q35/DSDT.tis.tpm2 
b/tests/data/acpi/q35/DSDT.tis.tpm2 index a09253042ce4a715922027245de8a2ab7449c5b7..59288f02c43cf2efc1555599131fde05dbbaa1cd 100644 Binary files a/tests/data/acpi/q35/DSDT.tis.tpm2 and b/tests/data/acpi/q35/DSDT.tis.tpm2 differ diff --git a/tests/data/acpi/q35/DSDT.viot b/tests/data/acpi/q35/DSDT.viot index 64e81f571120e3eb2b8c6c9545293a78c75b7bbd..d4d05ed620114ce793a59285e6237b566dd1390a 100644 Binary files a/tests/data/acpi/q35/DSDT.viot and b/tests/data/acpi/q35/DSDT.viot differ diff --git a/tests/data/acpi/virt/DSDT b/tests/data/acpi/virt/DSDT index c47503990715d389914fdf9c8bccb510761741ac..404bc5ac2188d885e28b6c80d499193865cf1c9c 100644 Binary files a/tests/data/acpi/virt/DSDT and b/tests/data/acpi/virt/DSDT differ diff --git a/tests/data/acpi/virt/DSDT.acpihmatvirt b/tests/data/acpi/virt/DSDT.acpihmatvirt index aee6ba017cd730948bfa93e91551eb10a6809293..5f9c0b2d3cdc55949c32d564c92309aa54529d8d 100644 Binary files a/tests/data/acpi/virt/DSDT.acpihmatvirt and b/tests/data/acpi/virt/DSDT.acpihmatvirt differ diff --git a/tests/data/acpi/virt/DSDT.memhp b/tests/data/acpi/virt/DSDT.memhp index bae36cdd397473afe3923c52f030641a5ab19d5d..d85565e3b7801de346e302e6e88bc76d88f33f27 100644 Binary files a/tests/data/acpi/virt/DSDT.memhp and b/tests/data/acpi/virt/DSDT.memhp differ diff --git a/tests/data/acpi/virt/DSDT.pxb b/tests/data/acpi/virt/DSDT.pxb index fbd78f44c4785d19759daea909fe6d6f9a6e6b01..ccb43ab242521cdfc80f6d6b170d2e0818186632 100644 Binary files a/tests/data/acpi/virt/DSDT.pxb and b/tests/data/acpi/virt/DSDT.pxb differ diff --git a/tests/data/acpi/virt/DSDT.topology b/tests/data/acpi/virt/DSDT.topology index 501314c91be01d927fd125e0c72e919fdd85592e..27fee07b86f6ab1b5672960b9e040fdc82185137 100644 Binary files a/tests/data/acpi/virt/DSDT.topology and b/tests/data/acpi/virt/DSDT.topology differ diff --git a/tests/data/acpi/virt/IORT b/tests/data/acpi/virt/IORT index 7efd0ce8a6b3928efa7e1373f688ab4c5f50543b..9f0958b3df5aa6b9c3092885f79a20d82da8f011 100644 Binary files a/tests/data/acpi/virt/IORT and b/tests/data/acpi/virt/IORT differ diff --git a/tests/data/acpi/virt/PPTT b/tests/data/acpi/virt/PPTT index 7a1258ecf123555b24462c98ccbb76b4ac1d0c2b..b89b2a9c71e0bc2713fc38f5de68fbc39b6302cb 100644 Binary files a/tests/data/acpi/virt/PPTT and b/tests/data/acpi/virt/PPTT differ diff --git a/tests/data/acpi/virt/PPTT.acpihmatvirt b/tests/data/acpi/virt/PPTT.acpihmatvirt index 4eef303a5b6168c6bc3795c2e2c53f65b4c4cfd4..945a73fff3b5c93367999199f06dbe7294f594b4 100644 Binary files a/tests/data/acpi/virt/PPTT.acpihmatvirt and b/tests/data/acpi/virt/PPTT.acpihmatvirt differ diff --git a/tests/data/acpi/virt/PPTT.topology b/tests/data/acpi/virt/PPTT.topology index 3fbcae5ff08aaf16fedf4da45e941661d79c1174..b0762c0a73eab106bab475f2a177e159b0b01c67 100644 Binary files a/tests/data/acpi/virt/PPTT.topology and b/tests/data/acpi/virt/PPTT.topology differ diff --git a/tests/docker/dockerfiles/debian-i686-cross.docker b/tests/docker/dockerfiles/debian-i686-cross.docker index 3fc4e15acdccf942975d70af5a2d87a269b0f5d6..e1c8e2b4948f14943ab68995729f5c3d3b70099f 100644 --- a/tests/docker/dockerfiles/debian-i686-cross.docker +++ b/tests/docker/dockerfiles/debian-i686-cross.docker @@ -1,10 +1,10 @@ # THIS FILE WAS AUTO-GENERATED # -# $ lcitool dockerfile --layers all --cross-arch i686 debian-11 qemu +# $ lcitool dockerfile --layers all --cross-arch i686 debian-12 qemu # # https://gitlab.com/libvirt/libvirt-ci -FROM docker.io/library/debian:11-slim +FROM docker.io/library/debian:12-slim RUN export 
DEBIAN_FRONTEND=noninteractive && \ apt-get update && \ @@ -47,16 +47,15 @@ RUN export DEBIAN_FRONTEND=noninteractive && \ python3-opencv \ python3-pillow \ python3-pip \ - python3-setuptools \ python3-sphinx \ python3-sphinx-rtd-theme \ python3-venv \ - python3-wheel \ python3-yaml \ rpm2cpio \ sed \ socat \ sparse \ + swtpm \ tar \ tesseract-ocr \ tesseract-ocr-eng \ @@ -67,8 +66,6 @@ RUN export DEBIAN_FRONTEND=noninteractive && \ sed -Ei 's,^# (en_US\.UTF-8 .*)$,\1,' /etc/locale.gen && \ dpkg-reconfigure locales -RUN /usr/bin/pip3 install tomli - ENV CCACHE_WRAPPERSDIR "/usr/libexec/ccache-wrappers" ENV LANG "en_US.UTF-8" ENV MAKE "/usr/bin/make" @@ -145,6 +142,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \ libvdeplug-dev:i386 \ libvirglrenderer-dev:i386 \ libvte-2.91-dev:i386 \ + libxdp-dev:i386 \ libzstd-dev:i386 \ nettle-dev:i386 \ systemtap-sdt-dev:i386 \ diff --git a/tests/docker/dockerfiles/debian-mipsel-cross.docker b/tests/docker/dockerfiles/debian-mipsel-cross.docker index 5fcd641f1552a8538d9510949b02ad77d6990472..79ce4ae503cd0c01bf262ec6450c3fa054409a51 100644 --- a/tests/docker/dockerfiles/debian-mipsel-cross.docker +++ b/tests/docker/dockerfiles/debian-mipsel-cross.docker @@ -1,10 +1,10 @@ # THIS FILE WAS AUTO-GENERATED # -# $ lcitool dockerfile --layers all --cross-arch mipsel debian-11 qemu +# $ lcitool dockerfile --layers all --cross-arch mipsel debian-12 qemu # # https://gitlab.com/libvirt/libvirt-ci -FROM docker.io/library/debian:11-slim +FROM docker.io/library/debian:12-slim RUN export DEBIAN_FRONTEND=noninteractive && \ apt-get update && \ @@ -47,16 +47,15 @@ RUN export DEBIAN_FRONTEND=noninteractive && \ python3-opencv \ python3-pillow \ python3-pip \ - python3-setuptools \ python3-sphinx \ python3-sphinx-rtd-theme \ python3-venv \ - python3-wheel \ python3-yaml \ rpm2cpio \ sed \ socat \ sparse \ + swtpm \ tar \ tesseract-ocr \ tesseract-ocr-eng \ @@ -67,8 +66,6 @@ RUN export DEBIAN_FRONTEND=noninteractive && \ sed -Ei 's,^# (en_US\.UTF-8 .*)$,\1,' /etc/locale.gen && \ dpkg-reconfigure locales -RUN /usr/bin/pip3 install tomli - ENV CCACHE_WRAPPERSDIR "/usr/libexec/ccache-wrappers" ENV LANG "en_US.UTF-8" ENV MAKE "/usr/bin/make" @@ -143,6 +140,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \ libvdeplug-dev:mipsel \ libvirglrenderer-dev:mipsel \ libvte-2.91-dev:mipsel \ + libxdp-dev:mipsel \ libzstd-dev:mipsel \ nettle-dev:mipsel \ systemtap-sdt-dev:mipsel \ diff --git a/tests/lcitool/refresh b/tests/lcitool/refresh index 0c93557ad6777fd845beafe1ddcfa32304425108..42ed7eba1dbc5881640f8cf56848de6336b05cff 100755 --- a/tests/lcitool/refresh +++ b/tests/lcitool/refresh @@ -159,7 +159,7 @@ try: trailer=cross_build("arm-linux-gnueabihf-", "arm-softmmu,arm-linux-user")) - generate_dockerfile("debian-i686-cross", "debian-11", + generate_dockerfile("debian-i686-cross", "debian-12", cross="i686", trailer=cross_build("x86_64-linux-gnu-", "x86_64-softmmu," @@ -171,7 +171,7 @@ try: trailer=cross_build("mips64el-linux-gnuabi64-", "mips64el-softmmu,mips64el-linux-user")) - generate_dockerfile("debian-mipsel-cross", "debian-11", + generate_dockerfile("debian-mipsel-cross", "debian-12", cross="mipsel", trailer=cross_build("mipsel-linux-gnu-", "mipsel-softmmu,mipsel-linux-user")) diff --git a/tests/qemu-iotests/049.out b/tests/qemu-iotests/049.out index 34e1b452e6e1725d18eece31b7339a11edc01adf..b4a9705ec2785c8d9f1f0126b8988ba83cccaaef 100644 --- a/tests/qemu-iotests/049.out +++ b/tests/qemu-iotests/049.out @@ -4,90 +4,90 @@ QA output created by 049 == 1. 
Traditional size parameter == qemu-img create -f qcow2 TEST_DIR/t.qcow2 1024 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 TEST_DIR/t.qcow2 1024b -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 TEST_DIR/t.qcow2 1k -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 TEST_DIR/t.qcow2 1K -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 TEST_DIR/t.qcow2 1M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1048576 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1048576 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 TEST_DIR/t.qcow2 1G -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1073741824 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1073741824 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 TEST_DIR/t.qcow2 1T -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1099511627776 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1099511627776 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 TEST_DIR/t.qcow2 1024.0 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 TEST_DIR/t.qcow2 1024.0b -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 TEST_DIR/t.qcow2 1.5k -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1536 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1536 
lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 TEST_DIR/t.qcow2 1.5K -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1536 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1536 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 TEST_DIR/t.qcow2 1.5M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1572864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1572864 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 TEST_DIR/t.qcow2 1.5G -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1610612736 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1610612736 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 TEST_DIR/t.qcow2 1.5T -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1649267441664 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1649267441664 lazy_refcounts=off refcount_bits=16 cache=writeback == 2. Specifying size via -o == qemu-img create -f qcow2 -o size=1024 TEST_DIR/t.qcow2 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o size=1024b TEST_DIR/t.qcow2 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o size=1k TEST_DIR/t.qcow2 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o size=1K TEST_DIR/t.qcow2 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o size=1M TEST_DIR/t.qcow2 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1048576 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1048576 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o size=1G TEST_DIR/t.qcow2 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1073741824 lazy_refcounts=off refcount_bits=16 +Formatting 
'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1073741824 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o size=1T TEST_DIR/t.qcow2 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1099511627776 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1099511627776 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o size=1024.0 TEST_DIR/t.qcow2 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o size=1024.0b TEST_DIR/t.qcow2 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o size=1.5k TEST_DIR/t.qcow2 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1536 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1536 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o size=1.5K TEST_DIR/t.qcow2 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1536 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1536 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o size=1.5M TEST_DIR/t.qcow2 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1572864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1572864 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o size=1.5G TEST_DIR/t.qcow2 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1610612736 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1610612736 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o size=1.5T TEST_DIR/t.qcow2 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1649267441664 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1649267441664 lazy_refcounts=off refcount_bits=16 cache=writeback == 3. 
Invalid sizes == @@ -132,84 +132,84 @@ qemu-img: TEST_DIR/t.qcow2: The image size must be specified only once == Check correct interpretation of suffixes for cluster size == qemu-img create -f qcow2 -o cluster_size=1024 TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=1024 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=1024 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o cluster_size=1024b TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=1024 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=1024 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o cluster_size=1k TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=1024 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=1024 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o cluster_size=1K TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=1024 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=1024 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o cluster_size=1M TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=1048576 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=1048576 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o cluster_size=1024.0 TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=1024 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=1024 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o cluster_size=1024.0b TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=1024 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=1024 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o cluster_size=0.5k TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=512 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=512 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o cluster_size=0.5K TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=512 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=512 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off 
refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o cluster_size=0.5M TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=524288 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=524288 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 cache=writeback == Check compat level option == qemu-img create -f qcow2 -o compat=0.10 TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=0.10 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=0.10 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o compat=1.1 TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=1.1 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=1.1 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o compat=0.42 TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=0.42 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=0.42 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img: TEST_DIR/t.qcow2: Parameter 'version' does not accept value '0.42' qemu-img create -f qcow2 -o compat=foobar TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=foobar lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=foobar lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img: TEST_DIR/t.qcow2: Parameter 'version' does not accept value 'foobar' == Check preallocation option == qemu-img create -f qcow2 -o preallocation=off TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off preallocation=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off preallocation=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o preallocation=metadata TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off preallocation=metadata compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off preallocation=metadata compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o preallocation=1234 TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off preallocation=1234 compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off preallocation=1234 compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img: TEST_DIR/t.qcow2: Parameter 
'preallocation' does not accept value '1234' == Check encryption option == qemu-img create -f qcow2 -o encryption=off TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 encryption=off cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 encryption=off cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 --object secret,id=sec0,data=123456 -o encryption=on,encrypt.key-secret=sec0 TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 encryption=on encrypt.key-secret=sec0 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 encryption=on encrypt.key-secret=sec0 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 cache=writeback == Check lazy_refcounts option (only with v3) == qemu-img create -f qcow2 -o compat=1.1,lazy_refcounts=off TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=1.1 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=1.1 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o compat=1.1,lazy_refcounts=on TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=1.1 lazy_refcounts=on refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=1.1 lazy_refcounts=on refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o compat=0.10,lazy_refcounts=off TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=0.10 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=0.10 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o compat=0.10,lazy_refcounts=on TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=0.10 lazy_refcounts=on refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=0.10 lazy_refcounts=on refcount_bits=16 cache=writeback qemu-img: TEST_DIR/t.qcow2: Lazy refcounts only supported with compatibility level 1.1 and above (use version=v3 or greater) == Expect error when backing file name is empty string == diff --git a/tests/qemu-iotests/061 b/tests/qemu-iotests/061 index 53c7d428e38548fac9659b5f710dade4419fae60..b71ac097d14bba1e6e3978459019e0c83e56f0e8 100755 --- a/tests/qemu-iotests/061 +++ b/tests/qemu-iotests/061 @@ -326,12 +326,14 @@ $QEMU_IMG amend -o "data_file=foo" "$TEST_IMG" echo _make_test_img -o "compat=1.1,data_file=$TEST_IMG.data" 64M $QEMU_IMG amend -o "data_file=foo" "$TEST_IMG" -_img_info --format-specific +$QEMU_IO -c "read 0 4k" "$TEST_IMG" 2>&1 | _filter_testdir | _filter_imgfmt +$QEMU_IO -c "open -o data-file.filename=$TEST_IMG.data,file.filename=$TEST_IMG" -c "read 0 4k" | _filter_qemu_io 
TEST_IMG="data-file.filename=$TEST_IMG.data,file.filename=$TEST_IMG" _img_info --format-specific --image-opts echo $QEMU_IMG amend -o "data_file=" --image-opts "data-file.filename=$TEST_IMG.data,file.filename=$TEST_IMG" -_img_info --format-specific +$QEMU_IO -c "read 0 4k" "$TEST_IMG" 2>&1 | _filter_testdir | _filter_imgfmt +$QEMU_IO -c "open -o data-file.filename=$TEST_IMG.data,file.filename=$TEST_IMG" -c "read 0 4k" | _filter_qemu_io TEST_IMG="data-file.filename=$TEST_IMG.data,file.filename=$TEST_IMG" _img_info --format-specific --image-opts echo diff --git a/tests/qemu-iotests/061.out b/tests/qemu-iotests/061.out index 139fc68177de57e33567094b84d6403cf27b3376..24c33add7ce6030ea8a12a38b128831b293245e9 100644 --- a/tests/qemu-iotests/061.out +++ b/tests/qemu-iotests/061.out @@ -545,7 +545,9 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 qemu-img: data-file can only be set for images that use an external data file Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 data_file=TEST_DIR/t.IMGFMT.data -qemu-img: Could not open 'TEST_DIR/t.IMGFMT': Could not open 'foo': No such file or directory +qemu-io: can't open device TEST_DIR/t.IMGFMT: Could not open 'foo': No such file or directory +read 4096/4096 bytes at offset 0 +4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) image: TEST_DIR/t.IMGFMT file format: IMGFMT virtual size: 64 MiB (67108864 bytes) @@ -560,7 +562,9 @@ Format specific information: corrupt: false extended l2: false -qemu-img: Could not open 'TEST_DIR/t.IMGFMT': 'data-file' is required for this image +qemu-io: can't open device TEST_DIR/t.IMGFMT: 'data-file' is required for this image +read 4096/4096 bytes at offset 0 +4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) image: TEST_DIR/t.IMGFMT file format: IMGFMT virtual size: 64 MiB (67108864 bytes) diff --git a/tests/qemu-iotests/099.out b/tests/qemu-iotests/099.out index 8cce6275295fb66acffdbd27c9a4ecfb910a2657..f6f8f259574f8d2e705ebbd551817c6c5875da32 100644 --- a/tests/qemu-iotests/099.out +++ b/tests/qemu-iotests/099.out @@ -1,6 +1,6 @@ QA output created by 099 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=131072 -Formatting 'TEST_DIR/t.IMGFMT.compare', fmt=raw size=131072 +Formatting 'TEST_DIR/t.IMGFMT.compare', fmt=raw size=131072 cache=writeback === Testing simple filename for blkverify === diff --git a/tests/qemu-iotests/108 b/tests/qemu-iotests/108 index 54e935acf28a4d5006eab2e1c58c0a61e32257f6..a6fe261265845f8241fe1772dd433a16527ae720 100755 --- a/tests/qemu-iotests/108 +++ b/tests/qemu-iotests/108 @@ -55,7 +55,7 @@ _supported_os Linux _unsupported_imgopts 'refcount_bits=\([^1]\|.\([^6]\|$\)\)' data_file # This test either needs sudo -n losetup or FUSE exports to work -if sudo -n losetup &>/dev/null; then +if test -c "/dev/loop-control" && sudo -n losetup &>/dev/null; then loopdev=true else loopdev=false diff --git a/tests/qemu-iotests/244 b/tests/qemu-iotests/244 index 3e61fa25bb6047b59fc2e8d3ed5596fc7317b02d..bb9cc6512f3545fc03a9f2089d34bd1270360c6b 100755 --- a/tests/qemu-iotests/244 +++ b/tests/qemu-iotests/244 @@ -215,9 +215,22 @@ $QEMU_IMG convert -f $IMGFMT -O $IMGFMT -n -C "$TEST_IMG.src" "$TEST_IMG" $QEMU_IMG compare -f $IMGFMT -F $IMGFMT "$TEST_IMG.src" "$TEST_IMG" # blkdebug doesn't support copy offloading, so this tests the error path -$QEMU_IMG amend -f $IMGFMT -o "data_file=blkdebug::$TEST_IMG.data" "$TEST_IMG" -$QEMU_IMG convert -f $IMGFMT -O $IMGFMT -n -C "$TEST_IMG.src" "$TEST_IMG" -$QEMU_IMG compare -f $IMGFMT -F $IMGFMT "$TEST_IMG.src" "$TEST_IMG" 
+test_img_with_blkdebug="json:{ + 'driver': 'qcow2', + 'file': { + 'driver': 'file', + 'filename': '$TEST_IMG' + }, + 'data-file': { + 'driver': 'blkdebug', + 'image': { + 'driver': 'file', + 'filename': '$TEST_IMG.data' + } + } +}" +$QEMU_IMG convert -f $IMGFMT -O $IMGFMT -n -C "$TEST_IMG.src" "$test_img_with_blkdebug" +$QEMU_IMG compare -f $IMGFMT -F $IMGFMT "$TEST_IMG.src" "$test_img_with_blkdebug" echo echo "=== Flushing should flush the data file ===" diff --git a/tests/qemu-iotests/270 b/tests/qemu-iotests/270 index 74352342db58c000b992d04a31cc5b23b246cefb..c37b674aa2e15e24840b68c15af98dcc2a601702 100755 --- a/tests/qemu-iotests/270 +++ b/tests/qemu-iotests/270 @@ -60,8 +60,16 @@ _make_test_img -o cluster_size=2M,data_file="$TEST_IMG.orig" \ # "write" 2G of data without using any space. # (qemu-img create does not like it, though, because null-co does not # support image creation.) -$QEMU_IMG amend -o data_file="json:{'driver':'null-co',,'size':'4294967296'}" \ - "$TEST_IMG" +test_img_with_null_data="json:{ + 'driver': '$IMGFMT', + 'file': { + 'filename': '$TEST_IMG' + }, + 'data-file': { + 'driver': 'null-co', + 'size':'4294967296' + } +}" # This gives us a range of: # 2^31 - 512 + 768 - 1 = 2^31 + 255 > 2^31 @@ -74,7 +82,7 @@ $QEMU_IMG amend -o data_file="json:{'driver':'null-co',,'size':'4294967296'}" \ # on L2 boundaries, we need large L2 tables; hence the cluster size of # 2 MB. (Anything from 256 kB should work, though, because then one L2 # table covers 8 GB.) -$QEMU_IO -c "write 768 $((2 ** 31 - 512))" "$TEST_IMG" | _filter_qemu_io +$QEMU_IO -c "write 768 $((2 ** 31 - 512))" "$test_img_with_null_data" | _filter_qemu_io _check_test_img diff --git a/tests/qemu-iotests/tests/write-zeroes-unmap b/tests/qemu-iotests/tests/write-zeroes-unmap new file mode 100644 index 0000000000000000000000000000000000000000..7cfeeaf8391c918c90e60712cccf5b59d10add7a --- /dev/null +++ b/tests/qemu-iotests/tests/write-zeroes-unmap @@ -0,0 +1,127 @@ +#!/usr/bin/env bash +# group: quick +# +# Test write zeros unmap. +# +# Copyright (C) Red Hat, Inc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +seq="$(basename $0)" +echo "QA output created by $seq" + +trap _cleanup_test_img exit + +# get standard environment, filters and checks +cd .. +. ./common.rc +. 
./common.filter + +_supported_fmt raw +_supported_proto file +_supported_os Linux + +create_test_image() { + _make_test_img -f $IMGFMT 1m +} + +filter_command() { + _filter_testdir | _filter_qemu_io | _filter_qemu | _filter_hmp +} + +print_disk_usage() { + du -sh $TEST_IMG | _filter_testdir +} + +echo +echo "=== defaults - write zeros ===" +echo + +create_test_image +echo -e 'qemu-io none0 "write -z 0 1m"\nquit' \ + | $QEMU -monitor stdio -drive if=none,file=$TEST_IMG,format=$IMGFMT \ + | filter_command +print_disk_usage + +echo +echo "=== defaults - write zeros unmap ===" +echo + +create_test_image +echo -e 'qemu-io none0 "write -zu 0 1m"\nquit' \ + | $QEMU -monitor stdio -drive if=none,file=$TEST_IMG,format=$IMGFMT \ + | filter_command +print_disk_usage + + +echo +echo "=== defaults - write actual zeros ===" +echo + +create_test_image +echo -e 'qemu-io none0 "write -P 0 0 1m"\nquit' \ + | $QEMU -monitor stdio -drive if=none,file=$TEST_IMG,format=$IMGFMT \ + | filter_command +print_disk_usage + +echo +echo "=== discard=off - write zeroes unmap ===" +echo + +create_test_image +echo -e 'qemu-io none0 "write -zu 0 1m"\nquit' \ + | $QEMU -monitor stdio -drive if=none,file=$TEST_IMG,format=$IMGFMT,discard=off \ + | filter_command +print_disk_usage + +echo +echo "=== detect-zeroes=on - write actual zeros ===" +echo + +create_test_image +echo -e 'qemu-io none0 "write -P 0 0 1m"\nquit' \ + | $QEMU -monitor stdio -drive if=none,file=$TEST_IMG,format=$IMGFMT,detect-zeroes=on \ + | filter_command +print_disk_usage + +echo +echo "=== detect-zeroes=on,discard=on - write actual zeros ===" +echo + +create_test_image +echo -e 'qemu-io none0 "write -P 0 0 1m"\nquit' \ + | $QEMU -monitor stdio -drive if=none,file=$TEST_IMG,format=$IMGFMT,detect-zeroes=on,discard=on \ + | filter_command +print_disk_usage + +echo +echo "=== discard=on - write zeroes ===" +echo + +create_test_image +echo -e 'qemu-io none0 "write -z 0 1m"\nquit' \ + | $QEMU -monitor stdio -drive if=none,file=$TEST_IMG,format=$IMGFMT,discard=on \ + | filter_command +print_disk_usage + +echo +echo "=== discard=on - write zeroes unmap ===" +echo + +create_test_image +echo -e 'qemu-io none0 "write -zu 0 1m"\nquit' \ + | $QEMU -monitor stdio -drive if=none,file=$TEST_IMG,format=$IMGFMT,discard=on \ + | filter_command +print_disk_usage diff --git a/tests/qemu-iotests/tests/write-zeroes-unmap.out b/tests/qemu-iotests/tests/write-zeroes-unmap.out new file mode 100644 index 0000000000000000000000000000000000000000..c9319948976fd8d256e8979e8557fc783dcdde29 --- /dev/null +++ b/tests/qemu-iotests/tests/write-zeroes-unmap.out @@ -0,0 +1,81 @@ +QA output created by write-zeroes-unmap + +=== defaults - write zeros === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 +QEMU X.Y.Z monitor - type 'help' for more information +(qemu) qemu-io none0 "write -z 0 1m" +wrote 1048576/1048576 bytes at offset 0 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +(qemu) quit +1.0M TEST_DIR/t.raw + +=== defaults - write zeros unmap === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 +QEMU X.Y.Z monitor - type 'help' for more information +(qemu) qemu-io none0 "write -zu 0 1m" +wrote 1048576/1048576 bytes at offset 0 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +(qemu) quit +1.0M TEST_DIR/t.raw + +=== defaults - write actual zeros === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 +QEMU X.Y.Z monitor - type 'help' for more information +(qemu) qemu-io none0 "write -P 0 0 1m" +wrote 1048576/1048576 bytes at offset 0 +1 MiB, X 
ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +(qemu) quit +1.0M TEST_DIR/t.raw + +=== discard=off - write zeroes unmap === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 +QEMU X.Y.Z monitor - type 'help' for more information +(qemu) qemu-io none0 "write -zu 0 1m" +wrote 1048576/1048576 bytes at offset 0 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +(qemu) quit +1.0M TEST_DIR/t.raw + +=== detect-zeroes=on - write actual zeros === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 +QEMU X.Y.Z monitor - type 'help' for more information +(qemu) qemu-io none0 "write -P 0 0 1m" +wrote 1048576/1048576 bytes at offset 0 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +(qemu) quit +1.0M TEST_DIR/t.raw + +=== detect-zeroes=on,discard=on - write actual zeros === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 +QEMU X.Y.Z monitor - type 'help' for more information +(qemu) qemu-io none0 "write -P 0 0 1m" +wrote 1048576/1048576 bytes at offset 0 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +(qemu) quit +1.0M TEST_DIR/t.raw + +=== discard=on - write zeroes === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 +QEMU X.Y.Z monitor - type 'help' for more information +(qemu) qemu-io none0 "write -z 0 1m" +wrote 1048576/1048576 bytes at offset 0 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +(qemu) quit +1.0M TEST_DIR/t.raw + +=== discard=on - write zeroes unmap === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 +QEMU X.Y.Z monitor - type 'help' for more information +(qemu) qemu-io none0 "write -zu 0 1m" +wrote 1048576/1048576 bytes at offset 0 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +(qemu) quit +0 TEST_DIR/t.raw diff --git a/tests/qtest/bios-tables-test.c b/tests/qtest/bios-tables-test.c index fe6a9a8563ced0b7bd75b6d52c41f0ef9b813013..21811a1ab5c01b46102f357c138b87212271988c 100644 --- a/tests/qtest/bios-tables-test.c +++ b/tests/qtest/bios-tables-test.c @@ -1015,7 +1015,7 @@ static void test_acpi_q35_tcg(void) free_test_data(&data); } -static void test_acpi_q35_tcg_type4_count(void) +static void test_acpi_q35_kvm_type4_count(void) { test_data data = { .machine = MACHINE_Q35, @@ -1031,7 +1031,7 @@ static void test_acpi_q35_tcg_type4_count(void) free_test_data(&data); } -static void test_acpi_q35_tcg_core_count(void) +static void test_acpi_q35_kvm_core_count(void) { test_data data = { .machine = MACHINE_Q35, @@ -1048,7 +1048,7 @@ static void test_acpi_q35_tcg_core_count(void) free_test_data(&data); } -static void test_acpi_q35_tcg_core_count2(void) +static void test_acpi_q35_kvm_core_count2(void) { test_data data = { .machine = MACHINE_Q35, @@ -1065,7 +1065,7 @@ static void test_acpi_q35_tcg_core_count2(void) free_test_data(&data); } -static void test_acpi_q35_tcg_thread_count(void) +static void test_acpi_q35_kvm_thread_count(void) { test_data data = { .machine = MACHINE_Q35, @@ -1082,7 +1082,7 @@ static void test_acpi_q35_tcg_thread_count(void) free_test_data(&data); } -static void test_acpi_q35_tcg_thread_count2(void) +static void test_acpi_q35_kvm_thread_count2(void) { test_data data = { .machine = MACHINE_Q35, @@ -2262,15 +2262,15 @@ int main(int argc, char *argv[]) qtest_add_func("acpi/q35/kvm/xapic", test_acpi_q35_kvm_xapic); qtest_add_func("acpi/q35/kvm/dmar", test_acpi_q35_kvm_dmar); qtest_add_func("acpi/q35/type4-count", - test_acpi_q35_tcg_type4_count); + test_acpi_q35_kvm_type4_count); qtest_add_func("acpi/q35/core-count", - test_acpi_q35_tcg_core_count); + test_acpi_q35_kvm_core_count); 
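Editor's note: the write-zeroes-unmap expectations just above pin down the semantics the new test checks; the raw file only drops to 0 bytes of host usage when the guest issues a zero write with the unmap hint and the drive was configured with discard=on. At the block layer that hint is BDRV_REQ_MAY_UNMAP, which the layer masks out when discards are disabled. A sketch against QEMU's internal API (editor's example; assumes an already-open BlockBackend):

#include "qemu/osdep.h"
#include "qemu/units.h"
#include "sysemu/block-backend.h"

static int zero_first_mib(BlockBackend *blk, bool allow_unmap)
{
    BdrvRequestFlags flags = allow_unmap ? BDRV_REQ_MAY_UNMAP : 0;

    /*
     * With flags == 0 the driver must keep zeroed blocks allocated;
     * with BDRV_REQ_MAY_UNMAP (and discard enabled on the node) it
     * may punch a hole instead, which is what the test observes via du.
     */
    return blk_pwrite_zeroes(blk, 0, 1 * MiB, flags);
}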
qtest_add_func("acpi/q35/core-count2", - test_acpi_q35_tcg_core_count2); + test_acpi_q35_kvm_core_count2); qtest_add_func("acpi/q35/thread-count", - test_acpi_q35_tcg_thread_count); + test_acpi_q35_kvm_thread_count); qtest_add_func("acpi/q35/thread-count2", - test_acpi_q35_tcg_thread_count2); + test_acpi_q35_kvm_thread_count2); } if (qtest_has_device("virtio-iommu-pci")) { qtest_add_func("acpi/q35/viot", test_acpi_q35_viot); diff --git a/tests/qtest/libqos/loongarch-virt-machine.c b/tests/qtest/libqos/loongarch-virt-machine.c new file mode 100644 index 0000000000000000000000000000000000000000..c12089c015df6d7f18144e3fe99b588354e45329 --- /dev/null +++ b/tests/qtest/libqos/loongarch-virt-machine.c @@ -0,0 +1,114 @@ +/* + * libqos driver framework + * + * Copyright (c) 2018 Emanuele Giuseppe Esposito + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1 as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see + */ + +#include "qemu/osdep.h" +#include "../libqtest.h" +#include "qemu/module.h" +#include "libqos-malloc.h" +#include "qgraph.h" +#include "virtio-mmio.h" +#include "generic-pcihost.h" +#include "hw/pci/pci_regs.h" + +#define LOONGARCH_PAGE_SIZE 0x1000 +#define LOONGARCH_VIRT_RAM_ADDR 0x100000 +#define LOONGARCH_VIRT_RAM_SIZE 0xFF00000 + +#define LOONGARCH_VIRT_PIO_BASE 0x18000000 +#define LOONGARCH_VIRT_PCIE_PIO_OFFSET 0x4000 +#define LOONGARCH_VIRT_PCIE_PIO_LIMIT 0x10000 +#define LOONGARCH_VIRT_PCIE_ECAM_BASE 0x20000000 +#define LOONGARCH_VIRT_PCIE_MMIO32_BASE 0x40000000 +#define LOONGARCH_VIRT_PCIE_MMIO32_LIMIT 0x80000000 + +typedef struct QVirtMachine QVirtMachine; + +struct QVirtMachine { + QOSGraphObject obj; + QGuestAllocator alloc; + QVirtioMMIODevice virtio_mmio; + QGenericPCIHost bridge; +}; + +static void virt_destructor(QOSGraphObject *obj) +{ + QVirtMachine *machine = (QVirtMachine *) obj; + alloc_destroy(&machine->alloc); +} + +static void *virt_get_driver(void *object, const char *interface) +{ + QVirtMachine *machine = object; + if (!g_strcmp0(interface, "memory")) { + return &machine->alloc; + } + + fprintf(stderr, "%s not present in loongarch/virtio\n", interface); + g_assert_not_reached(); +} + +static QOSGraphObject *virt_get_device(void *obj, const char *device) +{ + QVirtMachine *machine = obj; + if (!g_strcmp0(device, "generic-pcihost")) { + return &machine->bridge.obj; + } else if (!g_strcmp0(device, "virtio-mmio")) { + return &machine->virtio_mmio.obj; + } + + fprintf(stderr, "%s not present in loongarch/virt\n", device); + g_assert_not_reached(); +} + +static void loongarch_config_qpci_bus(QGenericPCIBus *qpci) +{ + qpci->gpex_pio_base = LOONGARCH_VIRT_PIO_BASE; + qpci->bus.pio_alloc_ptr = LOONGARCH_VIRT_PCIE_PIO_OFFSET; + qpci->bus.pio_limit = LOONGARCH_VIRT_PCIE_PIO_LIMIT; + qpci->bus.mmio_alloc_ptr = LOONGARCH_VIRT_PCIE_MMIO32_BASE; + qpci->bus.mmio_limit = LOONGARCH_VIRT_PCIE_MMIO32_LIMIT; + qpci->ecam_alloc_ptr = LOONGARCH_VIRT_PCIE_ECAM_BASE; +} + +static void *qos_create_machine_loongarch_virt(QTestState *qts) +{ + QVirtMachine *machine = g_new0(QVirtMachine, 1); + + 
alloc_init(&machine->alloc, 0, + LOONGARCH_VIRT_RAM_ADDR, + LOONGARCH_VIRT_RAM_ADDR + LOONGARCH_VIRT_RAM_SIZE, + LOONGARCH_PAGE_SIZE); + + qos_create_generic_pcihost(&machine->bridge, qts, &machine->alloc); + loongarch_config_qpci_bus(&machine->bridge.pci); + + machine->obj.get_device = virt_get_device; + machine->obj.get_driver = virt_get_driver; + machine->obj.destructor = virt_destructor; + return machine; +} + +static void virt_machine_register_nodes(void) +{ + qos_node_create_machine_args("loongarch64/virt", + qos_create_machine_loongarch_virt, + " -cpu la464"); + qos_node_contains("loongarch64/virt", "generic-pcihost", NULL); +} + +libqos_init(virt_machine_register_nodes); diff --git a/tests/qtest/libqos/meson.build b/tests/qtest/libqos/meson.build index 90aae42a22535f0dd8fe2fe7f9a615dd1b9a805b..482c9b2aab99dc0ef0dc4d1c7a2ce6468ecb640a 100644 --- a/tests/qtest/libqos/meson.build +++ b/tests/qtest/libqos/meson.build @@ -60,6 +60,7 @@ libqos_srcs = files( 'arm-xilinx-zynq-a9-machine.c', 'ppc64_pseries-machine.c', 'x86_64_pc-machine.c', + 'loongarch-virt-machine.c', ) if have_virtfs diff --git a/tests/qtest/libqos/qgraph.h b/tests/qtest/libqos/qgraph.h index 287022a67c1db3dcb347e98bfc577e5ac2901883..1b5de02e7bebf3feaf368124d8fb81ae2c1003e6 100644 --- a/tests/qtest/libqos/qgraph.h +++ b/tests/qtest/libqos/qgraph.h @@ -24,7 +24,7 @@ #include "libqos-malloc.h" /* maximum path length */ -#define QOS_PATH_MAX_ELEMENT_SIZE 64 +#define QOS_PATH_MAX_ELEMENT_SIZE 128 typedef struct QOSGraphObject QOSGraphObject; typedef struct QOSGraphNode QOSGraphNode; diff --git a/tests/qtest/migration-helpers.c b/tests/qtest/migration-helpers.c index 24fb7b3525cf4ea023649dfc5de45f7c395b0876..164e09c299aaa9342803226e8ce8c9aa5b0f0315 100644 --- a/tests/qtest/migration-helpers.c +++ b/tests/qtest/migration-helpers.c @@ -118,6 +118,12 @@ void migrate_incoming_qmp(QTestState *to, const char *uri, const char *fmt, ...) 
rsp = qtest_qmp(to, "{ 'execute': 'migrate-incoming', 'arguments': %p}", args); + + if (!qdict_haskey(rsp, "return")) { + g_autoptr(GString) s = qobject_to_json_pretty(QOBJECT(rsp), true); + g_test_message("%s", s->str); + } + g_assert(qdict_haskey(rsp, "return")); qobject_unref(rsp); @@ -292,3 +298,35 @@ char *resolve_machine_version(const char *alias, const char *var1, return find_common_machine_version(machine_name, var1, var2); } + +typedef struct { + char *name; + void (*func)(void); +} MigrationTest; + +static void migration_test_destroy(gpointer data) +{ + MigrationTest *test = (MigrationTest *)data; + + g_free(test->name); + g_free(test); +} + +static void migration_test_wrapper(const void *data) +{ + MigrationTest *test = (MigrationTest *)data; + + g_test_message("Running /%s%s", qtest_get_arch(), test->name); + test->func(); +} + +void migration_test_add(const char *path, void (*fn)(void)) +{ + MigrationTest *test = g_new0(MigrationTest, 1); + + test->func = fn; + test->name = g_strdup(path); + + qtest_add_data_func_full(path, test, migration_test_wrapper, + migration_test_destroy); +} diff --git a/tests/qtest/migration-helpers.h b/tests/qtest/migration-helpers.h index e31dc85cc7524ba1830f30f066b273e51db3b44f..0d9a02edc7e451f436a7424ff2f67331aa264ad0 100644 --- a/tests/qtest/migration-helpers.h +++ b/tests/qtest/migration-helpers.h @@ -47,4 +47,5 @@ char *find_common_machine_version(const char *mtype, const char *var1, const char *var2); char *resolve_machine_version(const char *alias, const char *var1, const char *var2); +void migration_test_add(const char *path, void (*fn)(void)); #endif /* MIGRATION_HELPERS_H */ diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c index 0fbaa6a90fd6c1dfe2fc3c4e7f05f6b1b3fc10d8..3385ca1f15386e1d0b50b21a24ee601180a2241b 100644 --- a/tests/qtest/migration-test.c +++ b/tests/qtest/migration-test.c @@ -2560,6 +2560,13 @@ static void * test_migrate_precopy_tcp_multifd_zlib_start(QTestState *from, QTestState *to) { + /* + * Overloading this test to also check that set_parameter does not error. + * This is also done in the tests for the other compression methods. 
+ */ + migrate_set_parameter_int(from, "multifd-zlib-level", 2); + migrate_set_parameter_int(to, "multifd-zlib-level", 2); + return test_migrate_precopy_tcp_multifd_start_common(from, to, "zlib"); } @@ -2568,10 +2575,42 @@ static void * test_migrate_precopy_tcp_multifd_zstd_start(QTestState *from, QTestState *to) { + migrate_set_parameter_int(from, "multifd-zstd-level", 2); + migrate_set_parameter_int(to, "multifd-zstd-level", 2); + return test_migrate_precopy_tcp_multifd_start_common(from, to, "zstd"); } #endif /* CONFIG_ZSTD */ +#ifdef CONFIG_QATZIP +static void * +test_migrate_precopy_tcp_multifd_qatzip_start(QTestState *from, + QTestState *to) +{ + migrate_set_parameter_int(from, "multifd-qatzip-level", 2); + migrate_set_parameter_int(to, "multifd-qatzip-level", 2); + + return test_migrate_precopy_tcp_multifd_start_common(from, to, "qatzip"); +} +#endif + +#ifdef CONFIG_QPL +static void * +test_migrate_precopy_tcp_multifd_qpl_start(QTestState *from, + QTestState *to) +{ + return test_migrate_precopy_tcp_multifd_start_common(from, to, "qpl"); +} +#endif /* CONFIG_QPL */ +#ifdef CONFIG_UADK +static void * +test_migrate_precopy_tcp_multifd_uadk_start(QTestState *from, + QTestState *to) +{ + return test_migrate_precopy_tcp_multifd_start_common(from, to, "uadk"); +} +#endif /* CONFIG_UADK */ + static void test_multifd_tcp_none(void) { MigrateCommon args = { @@ -2607,6 +2646,39 @@ static void test_multifd_tcp_zstd(void) } #endif +#ifdef CONFIG_QATZIP +static void test_multifd_tcp_qatzip(void) +{ + MigrateCommon args = { + .listen_uri = "defer", + .start_hook = test_migrate_precopy_tcp_multifd_qatzip_start, + }; + test_precopy_common(&args); +} +#endif + +#ifdef CONFIG_QPL +static void test_multifd_tcp_qpl(void) +{ + MigrateCommon args = { + .listen_uri = "defer", + .start_hook = test_migrate_precopy_tcp_multifd_qpl_start, + }; + test_precopy_common(&args); +} +#endif + +#ifdef CONFIG_UADK +static void test_multifd_tcp_uadk(void) +{ + MigrateCommon args = { + .listen_uri = "defer", + .start_hook = test_migrate_precopy_tcp_multifd_uadk_start, + }; + test_precopy_common(&args); +} +#endif + #ifdef CONFIG_GNUTLS static void * test_migrate_multifd_tcp_tls_psk_start_match(QTestState *from, @@ -3339,62 +3411,64 @@ int main(int argc, char **argv) module_call_init(MODULE_INIT_QOM); if (has_uffd) { - qtest_add_func("/migration/postcopy/plain", test_postcopy); - qtest_add_func("/migration/postcopy/recovery/plain", - test_postcopy_recovery); - qtest_add_func("/migration/postcopy/preempt/plain", test_postcopy_preempt); - qtest_add_func("/migration/postcopy/preempt/recovery/plain", - test_postcopy_preempt_recovery); + migration_test_add("/migration/postcopy/plain", test_postcopy); + migration_test_add("/migration/postcopy/recovery/plain", + test_postcopy_recovery); + migration_test_add("/migration/postcopy/preempt/plain", + test_postcopy_preempt); + migration_test_add("/migration/postcopy/preempt/recovery/plain", + test_postcopy_preempt_recovery); if (getenv("QEMU_TEST_FLAKY_TESTS")) { - qtest_add_func("/migration/postcopy/compress/plain", - test_postcopy_compress); - qtest_add_func("/migration/postcopy/recovery/compress/plain", - test_postcopy_recovery_compress); + migration_test_add("/migration/postcopy/compress/plain", + test_postcopy_compress); + migration_test_add("/migration/postcopy/recovery/compress/plain", + test_postcopy_recovery_compress); } #ifndef _WIN32 - qtest_add_func("/migration/postcopy/recovery/double-failures", - test_postcopy_recovery_double_fail); + 
migration_test_add("/migration/postcopy/recovery/double-failures", + test_postcopy_recovery_double_fail); #endif /* _WIN32 */ - } - qtest_add_func("/migration/bad_dest", test_baddest); + migration_test_add("/migration/bad_dest", test_baddest); #ifndef _WIN32 if (!g_str_equal(arch, "s390x")) { - qtest_add_func("/migration/analyze-script", test_analyze_script); + migration_test_add("/migration/analyze-script", test_analyze_script); } #endif - qtest_add_func("/migration/precopy/unix/plain", test_precopy_unix_plain); - qtest_add_func("/migration/precopy/unix/xbzrle", test_precopy_unix_xbzrle); + migration_test_add("/migration/precopy/unix/plain", + test_precopy_unix_plain); + migration_test_add("/migration/precopy/unix/xbzrle", + test_precopy_unix_xbzrle); /* * Compression fails from time to time. * Put test here but don't enable it until everything is fixed. */ if (getenv("QEMU_TEST_FLAKY_TESTS")) { - qtest_add_func("/migration/precopy/unix/compress/wait", - test_precopy_unix_compress); - qtest_add_func("/migration/precopy/unix/compress/nowait", - test_precopy_unix_compress_nowait); + migration_test_add("/migration/precopy/unix/compress/wait", + test_precopy_unix_compress); + migration_test_add("/migration/precopy/unix/compress/nowait", + test_precopy_unix_compress_nowait); } - qtest_add_func("/migration/precopy/file", - test_precopy_file); - qtest_add_func("/migration/precopy/file/offset", - test_precopy_file_offset); - qtest_add_func("/migration/precopy/file/offset/bad", - test_precopy_file_offset_bad); + migration_test_add("/migration/precopy/file", + test_precopy_file); + migration_test_add("/migration/precopy/file/offset", + test_precopy_file_offset); + migration_test_add("/migration/precopy/file/offset/bad", + test_precopy_file_offset_bad); /* * Our CI system has problems with shared memory. * Don't run this test until we find a workaround. */ if (getenv("QEMU_TEST_FLAKY_TESTS")) { - qtest_add_func("/migration/mode/reboot", test_mode_reboot); + migration_test_add("/migration/mode/reboot", test_mode_reboot); } #ifdef CONFIG_GNUTLS - qtest_add_func("/migration/precopy/unix/tls/psk", - test_precopy_unix_tls_psk); + migration_test_add("/migration/precopy/unix/tls/psk", + test_precopy_unix_tls_psk); if (has_uffd) { /* @@ -3402,110 +3476,120 @@ int main(int argc, char **argv) * channels are tested under precopy. Here what we want to test is the * general postcopy path that has TLS channel enabled. 
*/ - qtest_add_func("/migration/postcopy/tls/psk", test_postcopy_tls_psk); - qtest_add_func("/migration/postcopy/recovery/tls/psk", - test_postcopy_recovery_tls_psk); - qtest_add_func("/migration/postcopy/preempt/tls/psk", - test_postcopy_preempt_tls_psk); - qtest_add_func("/migration/postcopy/preempt/recovery/tls/psk", - test_postcopy_preempt_all); + migration_test_add("/migration/postcopy/tls/psk", + test_postcopy_tls_psk); + migration_test_add("/migration/postcopy/recovery/tls/psk", + test_postcopy_recovery_tls_psk); + migration_test_add("/migration/postcopy/preempt/tls/psk", + test_postcopy_preempt_tls_psk); + migration_test_add("/migration/postcopy/preempt/recovery/tls/psk", + test_postcopy_preempt_all); } #ifdef CONFIG_TASN1 - qtest_add_func("/migration/precopy/unix/tls/x509/default-host", - test_precopy_unix_tls_x509_default_host); - qtest_add_func("/migration/precopy/unix/tls/x509/override-host", - test_precopy_unix_tls_x509_override_host); + migration_test_add("/migration/precopy/unix/tls/x509/default-host", + test_precopy_unix_tls_x509_default_host); + migration_test_add("/migration/precopy/unix/tls/x509/override-host", + test_precopy_unix_tls_x509_override_host); #endif /* CONFIG_TASN1 */ #endif /* CONFIG_GNUTLS */ - qtest_add_func("/migration/precopy/tcp/plain", test_precopy_tcp_plain); + migration_test_add("/migration/precopy/tcp/plain", test_precopy_tcp_plain); - qtest_add_func("/migration/precopy/tcp/plain/switchover-ack", - test_precopy_tcp_switchover_ack); + migration_test_add("/migration/precopy/tcp/plain/switchover-ack", + test_precopy_tcp_switchover_ack); #ifdef CONFIG_GNUTLS - qtest_add_func("/migration/precopy/tcp/tls/psk/match", - test_precopy_tcp_tls_psk_match); - qtest_add_func("/migration/precopy/tcp/tls/psk/mismatch", - test_precopy_tcp_tls_psk_mismatch); + migration_test_add("/migration/precopy/tcp/tls/psk/match", + test_precopy_tcp_tls_psk_match); + migration_test_add("/migration/precopy/tcp/tls/psk/mismatch", + test_precopy_tcp_tls_psk_mismatch); #ifdef CONFIG_TASN1 - qtest_add_func("/migration/precopy/tcp/tls/x509/default-host", - test_precopy_tcp_tls_x509_default_host); - qtest_add_func("/migration/precopy/tcp/tls/x509/override-host", - test_precopy_tcp_tls_x509_override_host); - qtest_add_func("/migration/precopy/tcp/tls/x509/mismatch-host", - test_precopy_tcp_tls_x509_mismatch_host); - qtest_add_func("/migration/precopy/tcp/tls/x509/friendly-client", - test_precopy_tcp_tls_x509_friendly_client); - qtest_add_func("/migration/precopy/tcp/tls/x509/hostile-client", - test_precopy_tcp_tls_x509_hostile_client); - qtest_add_func("/migration/precopy/tcp/tls/x509/allow-anon-client", - test_precopy_tcp_tls_x509_allow_anon_client); - qtest_add_func("/migration/precopy/tcp/tls/x509/reject-anon-client", - test_precopy_tcp_tls_x509_reject_anon_client); + migration_test_add("/migration/precopy/tcp/tls/x509/default-host", + test_precopy_tcp_tls_x509_default_host); + migration_test_add("/migration/precopy/tcp/tls/x509/override-host", + test_precopy_tcp_tls_x509_override_host); + migration_test_add("/migration/precopy/tcp/tls/x509/mismatch-host", + test_precopy_tcp_tls_x509_mismatch_host); + migration_test_add("/migration/precopy/tcp/tls/x509/friendly-client", + test_precopy_tcp_tls_x509_friendly_client); + migration_test_add("/migration/precopy/tcp/tls/x509/hostile-client", + test_precopy_tcp_tls_x509_hostile_client); + migration_test_add("/migration/precopy/tcp/tls/x509/allow-anon-client", + test_precopy_tcp_tls_x509_allow_anon_client); + 
migration_test_add("/migration/precopy/tcp/tls/x509/reject-anon-client", + test_precopy_tcp_tls_x509_reject_anon_client); #endif /* CONFIG_TASN1 */ #endif /* CONFIG_GNUTLS */ - /* qtest_add_func("/migration/ignore_shared", test_ignore_shared); */ + /* migration_test_add("/migration/ignore_shared", test_ignore_shared); */ #ifndef _WIN32 - qtest_add_func("/migration/fd_proto", test_migrate_fd_proto); + migration_test_add("/migration/fd_proto", test_migrate_fd_proto); #endif - qtest_add_func("/migration/validate_uuid", test_validate_uuid); - qtest_add_func("/migration/validate_uuid_error", test_validate_uuid_error); - qtest_add_func("/migration/validate_uuid_src_not_set", - test_validate_uuid_src_not_set); - qtest_add_func("/migration/validate_uuid_dst_not_set", - test_validate_uuid_dst_not_set); + migration_test_add("/migration/validate_uuid", test_validate_uuid); + migration_test_add("/migration/validate_uuid_error", + test_validate_uuid_error); + migration_test_add("/migration/validate_uuid_src_not_set", + test_validate_uuid_src_not_set); + migration_test_add("/migration/validate_uuid_dst_not_set", + test_validate_uuid_dst_not_set); /* * See explanation why this test is slow on function definition */ if (g_test_slow()) { - qtest_add_func("/migration/auto_converge", test_migrate_auto_converge); + migration_test_add("/migration/auto_converge", + test_migrate_auto_converge); if (g_str_equal(arch, "x86_64") && has_kvm && kvm_dirty_ring_supported()) { - qtest_add_func("/migration/dirty_limit", test_migrate_dirty_limit); + migration_test_add("/migration/dirty_limit", + test_migrate_dirty_limit); } } - qtest_add_func("/migration/multifd/tcp/plain/none", - test_multifd_tcp_none); - /* - * This test is flaky and sometimes fails in CI and otherwise: - * don't run unless user opts in via environment variable. 
- */ - if (getenv("QEMU_TEST_FLAKY_TESTS")) { - qtest_add_func("/migration/multifd/tcp/plain/cancel", + migration_test_add("/migration/multifd/tcp/plain/none", + test_multifd_tcp_none); + migration_test_add("/migration/multifd/tcp/plain/cancel", test_multifd_tcp_cancel); - } - qtest_add_func("/migration/multifd/tcp/plain/zlib", - test_multifd_tcp_zlib); + migration_test_add("/migration/multifd/tcp/plain/zlib", + test_multifd_tcp_zlib); #ifdef CONFIG_ZSTD - qtest_add_func("/migration/multifd/tcp/plain/zstd", - test_multifd_tcp_zstd); + migration_test_add("/migration/multifd/tcp/plain/zstd", + test_multifd_tcp_zstd); +#endif +#ifdef CONFIG_QATZIP + migration_test_add("/migration/multifd/tcp/plain/qatzip", + test_multifd_tcp_qatzip); +#endif +#ifdef CONFIG_QPL + migration_test_add("/migration/multifd/tcp/plain/qpl", + test_multifd_tcp_qpl); +#endif +#ifdef CONFIG_UADK + migration_test_add("/migration/multifd/tcp/plain/uadk", + test_multifd_tcp_uadk); #endif #ifdef CONFIG_GNUTLS - qtest_add_func("/migration/multifd/tcp/tls/psk/match", - test_multifd_tcp_tls_psk_match); - qtest_add_func("/migration/multifd/tcp/tls/psk/mismatch", - test_multifd_tcp_tls_psk_mismatch); + migration_test_add("/migration/multifd/tcp/tls/psk/match", + test_multifd_tcp_tls_psk_match); + migration_test_add("/migration/multifd/tcp/tls/psk/mismatch", + test_multifd_tcp_tls_psk_mismatch); #ifdef CONFIG_TASN1 - qtest_add_func("/migration/multifd/tcp/tls/x509/default-host", - test_multifd_tcp_tls_x509_default_host); - qtest_add_func("/migration/multifd/tcp/tls/x509/override-host", - test_multifd_tcp_tls_x509_override_host); - qtest_add_func("/migration/multifd/tcp/tls/x509/mismatch-host", - test_multifd_tcp_tls_x509_mismatch_host); - qtest_add_func("/migration/multifd/tcp/tls/x509/allow-anon-client", - test_multifd_tcp_tls_x509_allow_anon_client); - qtest_add_func("/migration/multifd/tcp/tls/x509/reject-anon-client", - test_multifd_tcp_tls_x509_reject_anon_client); + migration_test_add("/migration/multifd/tcp/tls/x509/default-host", + test_multifd_tcp_tls_x509_default_host); + migration_test_add("/migration/multifd/tcp/tls/x509/override-host", + test_multifd_tcp_tls_x509_override_host); + migration_test_add("/migration/multifd/tcp/tls/x509/mismatch-host", + test_multifd_tcp_tls_x509_mismatch_host); + migration_test_add("/migration/multifd/tcp/tls/x509/allow-anon-client", + test_multifd_tcp_tls_x509_allow_anon_client); + migration_test_add("/migration/multifd/tcp/tls/x509/reject-anon-client", + test_multifd_tcp_tls_x509_reject_anon_client); #endif /* CONFIG_TASN1 */ #endif /* CONFIG_GNUTLS */ if (g_str_equal(arch, "x86_64") && has_kvm && kvm_dirty_ring_supported()) { - qtest_add_func("/migration/dirty_ring", - test_precopy_unix_dirty_ring); - qtest_add_func("/migration/vcpu_dirty_limit", - test_vcpu_dirty_limit); + migration_test_add("/migration/dirty_ring", + test_precopy_unix_dirty_ring); + migration_test_add("/migration/vcpu_dirty_limit", + test_vcpu_dirty_limit); } ret = g_test_run(); diff --git a/tests/qtest/qmp-cmd-test.c b/tests/qtest/qmp-cmd-test.c index 2c15f6095848fea7789bf62f41ee6b9128f67fde..df1f93ea6aade3d23ee294ea3795979151dd8f93 100644 --- a/tests/qtest/qmp-cmd-test.c +++ b/tests/qtest/qmp-cmd-test.c @@ -110,6 +110,7 @@ static bool query_is_ignored(const char *cmd) "query-sev-capabilities", "query-sgx", "query-sgx-capabilities", + "query-virtcca-capabilities", /* Success depends on enabling dirty page rate limit */ "query-vcpu-dirty-limit", NULL diff --git a/tests/qtest/tpm-tests.c b/tests/qtest/tpm-tests.c 
index fb94496bbd8614219ae8416e97c37740ad5e09ed..197714f8d99aeffebb7e32465e04489c267c7372 100644 --- a/tests/qtest/tpm-tests.c +++ b/tests/qtest/tpm-tests.c @@ -114,7 +114,7 @@ void tpm_test_swtpm_migration_test(const char *src_tpm_path, sizeof(tpm_pcrread_resp)); tpm_util_migrate(src_qemu, uri); - tpm_util_wait_for_migration_complete(src_qemu); + tpm_util_wait_for_migration_complete(dst_qemu); tpm_util_pcrread(dst_qemu, tx, tpm_pcrread_resp, sizeof(tpm_pcrread_resp)); diff --git a/tests/qtest/vhost-user-test.c b/tests/qtest/vhost-user-test.c index d4e437265f6630d376540e19051d22f2bc568151..fadf3f0f2e9d064962068c237c0ba20fc8a7cee7 100644 --- a/tests/qtest/vhost-user-test.c +++ b/tests/qtest/vhost-user-test.c @@ -799,6 +799,23 @@ static void test_read_guest_mem(void *obj, void *arg, QGuestAllocator *alloc) read_guest_mem_server(global_qtest, server); } +static void wait_for_rings_started(TestServer *s, size_t count) +{ + gint64 end_time; + + g_mutex_lock(&s->data_mutex); + end_time = g_get_monotonic_time() + 5 * G_TIME_SPAN_SECOND; + while (ctpop64(s->rings) != count) { + if (!g_cond_wait_until(&s->data_cond, &s->data_mutex, end_time)) { + /* timeout has passed */ + g_assert_cmpint(ctpop64(s->rings), ==, count); + break; + } + } + + g_mutex_unlock(&s->data_mutex); +} + static void test_migrate(void *obj, void *arg, QGuestAllocator *alloc) { TestServer *s = arg; @@ -869,6 +886,7 @@ static void test_migrate(void *obj, void *arg, QGuestAllocator *alloc) qtest_qmp_eventwait(to, "RESUME"); g_assert(wait_for_fds(dest)); + wait_for_rings_started(dest, 2); read_guest_mem_server(to, dest); g_source_destroy(source); @@ -880,23 +898,6 @@ static void test_migrate(void *obj, void *arg, QGuestAllocator *alloc) g_string_free(dest_cmdline, true); } -static void wait_for_rings_started(TestServer *s, size_t count) -{ - gint64 end_time; - - g_mutex_lock(&s->data_mutex); - end_time = g_get_monotonic_time() + 5 * G_TIME_SPAN_SECOND; - while (ctpop64(s->rings) != count) { - if (!g_cond_wait_until(&s->data_cond, &s->data_mutex, end_time)) { - /* timeout has passed */ - g_assert_cmpint(ctpop64(s->rings), ==, count); - break; - } - } - - g_mutex_unlock(&s->data_mutex); -} - static inline void test_server_connect(TestServer *server) { test_server_create_chr(server, ",reconnect=1"); diff --git a/tests/tcg/aarch64/Makefile.target b/tests/tcg/aarch64/Makefile.target index cded1d01fcdc91e6cd4028b1951cb3c249874c75..6d593c63928853ebdf1ac374df1bf786c9a74af5 100644 --- a/tests/tcg/aarch64/Makefile.target +++ b/tests/tcg/aarch64/Makefile.target @@ -40,8 +40,9 @@ endif # Pauth Tests ifneq ($(CROSS_CC_HAS_ARMV8_3),) -AARCH64_TESTS += pauth-1 pauth-2 pauth-4 pauth-5 +AARCH64_TESTS += pauth-1 pauth-2 pauth-4 pauth-5 test-2375 pauth-%: CFLAGS += -march=armv8.3-a +test-2375: CFLAGS += -march=armv8.3-a run-pauth-1: QEMU_OPTS += -cpu max run-pauth-2: QEMU_OPTS += -cpu max # Choose a cpu with FEAT_Pauth but without FEAT_FPAC for pauth-[45]. 
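The Makefile hunk above builds test-2375.c (added next) with -march=armv8.3-a because FJCVTZS arrived with ARMv8.3, alongside pointer authentication. FJCVTZS performs JavaScript's ToInt32 conversion (truncate toward zero, wrap modulo 2^32) and sets the Z/EQ flag only when the conversion is exact; issue 2375 concerned a denormal input while FPCR.FZ is set. A standalone C model of just the value and exactness computation (editor's sketch; flag plumbing and FZ flushing are deliberately not modelled):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

static int32_t js_to_int32(double d, int *exact)
{
    double t = trunc(d);              /* round toward zero */
    double m = fmod(t, 4294967296.0); /* wrap modulo 2^32 */

    if (m < 0) {
        m += 4294967296.0;
    }
    /* The uint32 -> int32 cast wraps on common ABIs; fine for a model. */
    int32_t r = (int32_t)(uint32_t)m;

    *exact = ((double)r == d);        /* what the Z/EQ flag reports */
    return r;
}

int main(void)
{
    int exact;

    printf("%d (exact=%d)\n", js_to_int32(4294967301.0, &exact), exact);
    /* prints: 5 (exact=0), the out-of-range input wrapped and was inexact */
    return 0;
}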
diff --git a/tests/tcg/aarch64/test-2375.c b/tests/tcg/aarch64/test-2375.c new file mode 100644 index 0000000000000000000000000000000000000000..84c7e7de7163fa5615c8912c18b4321b95f98078 --- /dev/null +++ b/tests/tcg/aarch64/test-2375.c @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Copyright (c) 2024 Linaro Ltd */ +/* See https://gitlab.com/qemu-project/qemu/-/issues/2375 */ + +#include <assert.h> + +int main(void) +{ + int r, z; + + asm("msr fpcr, %2\n\t" + "fjcvtzs %w0, %d3\n\t" + "cset %1, eq" + : "=r"(r), "=r"(z) + : "r"(0x01000000L), /* FZ = 1 */ + "w"(0xfcff00L)); /* denormal */ + + assert(r == 0); + assert(z == 0); + return 0; +} diff --git a/tests/unit/meson.build b/tests/unit/meson.build index a05d471090401cab996c76e2d229699e99096e9b..598ba41bb93096dc674fb3193d4877c22046046e 100644 --- a/tests/unit/meson.build +++ b/tests/unit/meson.build @@ -142,9 +142,6 @@ if have_system 'test-vmstate': [migration, io], 'test-yank': ['socket-helpers.c', qom, io, chardev] } - if config_host_data.get('CONFIG_INOTIFY1') - tests += {'test-util-filemonitor': []} - endif # Some tests: test-char, test-qdev-global-props, and test-qga, # are not runnable under TSan due to a known issue. diff --git a/tests/unit/ptimer-test.c b/tests/unit/ptimer-test.c index 04b5f4e3d03148010748956bf894c4db3cb55b0f..08240594bbd65bc501629e6e06b7773a0c453e39 100644 --- a/tests/unit/ptimer-test.c +++ b/tests/unit/ptimer-test.c @@ -763,6 +763,33 @@ static void check_oneshot_with_load_0(gconstpointer arg) ptimer_free(ptimer); } +static void check_freq_more_than_1000M(gconstpointer arg) +{ + const uint8_t *policy = arg; + ptimer_state *ptimer = ptimer_init(ptimer_trigger, NULL, *policy); + bool no_round_down = (*policy & PTIMER_POLICY_NO_COUNTER_ROUND_DOWN); + + triggered = false; + + ptimer_transaction_begin(ptimer); + ptimer_set_freq(ptimer, 2000000000); + ptimer_set_limit(ptimer, 8, 1); + ptimer_run(ptimer, 1); + ptimer_transaction_commit(ptimer); + + qemu_clock_step(3); + + g_assert_cmpuint(ptimer_get_count(ptimer), ==, no_round_down ?
3 : 2); + g_assert_false(triggered); + + qemu_clock_step(1); + + g_assert_cmpuint(ptimer_get_count(ptimer), ==, 0); + g_assert_true(triggered); + + ptimer_free(ptimer); +} + static void add_ptimer_tests(uint8_t policy) { char policy_name[256] = ""; @@ -857,6 +884,12 @@ static void add_ptimer_tests(uint8_t policy) policy_name), g_memdup2(&policy, 1), check_oneshot_with_load_0, g_free); g_free(tmp); + + g_test_add_data_func_full( + tmp = g_strdup_printf("/ptimer/freq_more_than_1000M policy=%s", + policy_name), + g_memdup2(&policy, 1), check_freq_more_than_1000M, g_free); + g_free(tmp); } static void add_all_ptimer_policies_comb_tests(void) diff --git a/tests/unit/test-char.c b/tests/unit/test-char.c index 649fdf64e1964998f88dec9089e673172e88e476..0cb26331900a642912155575f9763ca317ea259e 100644 --- a/tests/unit/test-char.c +++ b/tests/unit/test-char.c @@ -574,7 +574,7 @@ static void char_udp_test_internal(Chardev *reuse_chr, int sock) struct sockaddr_in other; SocketIdleData d = { 0, }; Chardev *chr; - CharBackend *be; + CharBackend stack_be, *be = &stack_be; socklen_t alen = sizeof(other); int ret; char buf[10]; @@ -590,7 +590,6 @@ static void char_udp_test_internal(Chardev *reuse_chr, int sock) chr = qemu_chr_new("client", tmp, NULL); g_assert_nonnull(chr); - be = g_alloca(sizeof(CharBackend)); qemu_chr_fe_init(be, chr, &error_abort); } diff --git a/tests/unit/test-crypto-cipher.c b/tests/unit/test-crypto-cipher.c index d9d9d078ff11c9942922d5228c7d3448a0320731..11ab1a54fca3991fe68c8f7ce180cd436e9194c0 100644 --- a/tests/unit/test-crypto-cipher.c +++ b/tests/unit/test-crypto-cipher.c @@ -382,6 +382,19 @@ static QCryptoCipherTestData test_data[] = { .plaintext = "90afe91bb288544f2c32dc239b2635e6", .ciphertext = "6cb4561c40bf0a9705931cb6d408e7fa", }, +#ifdef CONFIG_CRYPTO_SM4 + { + /* SM4, GB/T 32907-2016, Appendix A.1 */ + .path = "/crypto/cipher/sm4", + .alg = QCRYPTO_CIPHER_ALG_SM4, + .mode = QCRYPTO_CIPHER_MODE_ECB, + .key = "0123456789abcdeffedcba9876543210", + .plaintext = + "0123456789abcdeffedcba9876543210", + .ciphertext = + "681edf34d206965e86b3e94f536e4246", + }, +#endif { /* #1 32 byte key, 32 byte PTX */ .path = "/crypto/cipher/aes-xts-128-1", diff --git a/tests/unit/test-crypto-hash.c b/tests/unit/test-crypto-hash.c index 1f4abb822b6140bc49200921e686078c90fa11a6..61908e1769eaf56320e716a49e7a8e5070b80d56 100644 --- a/tests/unit/test-crypto-hash.c +++ b/tests/unit/test-crypto-hash.c @@ -42,6 +42,9 @@ "63b54e4cb2d2032b393994aa263c0dbb" \ "e00a9f2fe9ef6037352232a1eec55ee7" #define OUTPUT_RIPEMD160 "f3d658fad3fdfb2b52c9369cf0d441249ddfa8a0" +#ifdef CONFIG_CRYPTO_SM3 +#define OUTPUT_SM3 "d4a97db105b477b84c4f20ec9c31a6c814e2705a0b83a5a89748d75f0ef456a1" +#endif #define OUTPUT_MD5_B64 "Yo0gY3FWMDWrjvYvSSveyQ==" #define OUTPUT_SHA1_B64 "sudPJnWKOkIeUJzuBFJEt4dTzAI=" @@ -54,6 +57,10 @@ "7sVe5w==" #define OUTPUT_RIPEMD160_B64 "89ZY+tP9+ytSyTac8NRBJJ3fqKA=" +#ifdef CONFIG_CRYPTO_SM3 +#define OUTPUT_SM3_B64 "1Kl9sQW0d7hMTyDsnDGmyBTicFoLg6Wol0jXXw70VqE=" +#endif + static const char *expected_outputs[] = { [QCRYPTO_HASH_ALG_MD5] = OUTPUT_MD5, [QCRYPTO_HASH_ALG_SHA1] = OUTPUT_SHA1, @@ -62,6 +69,9 @@ static const char *expected_outputs[] = { [QCRYPTO_HASH_ALG_SHA384] = OUTPUT_SHA384, [QCRYPTO_HASH_ALG_SHA512] = OUTPUT_SHA512, [QCRYPTO_HASH_ALG_RIPEMD160] = OUTPUT_RIPEMD160, +#ifdef CONFIG_CRYPTO_SM3 + [QCRYPTO_HASH_ALG_SM3] = OUTPUT_SM3, +#endif }; static const char *expected_outputs_b64[] = { [QCRYPTO_HASH_ALG_MD5] = OUTPUT_MD5_B64, @@ -71,6 +81,9 @@ static const char *expected_outputs_b64[] = { 
[QCRYPTO_HASH_ALG_SHA384] = OUTPUT_SHA384_B64, [QCRYPTO_HASH_ALG_SHA512] = OUTPUT_SHA512_B64, [QCRYPTO_HASH_ALG_RIPEMD160] = OUTPUT_RIPEMD160_B64, +#ifdef CONFIG_CRYPTO_SM3 + [QCRYPTO_HASH_ALG_SM3] = OUTPUT_SM3_B64, +#endif }; static const int expected_lens[] = { [QCRYPTO_HASH_ALG_MD5] = 16, @@ -80,6 +93,9 @@ static const int expected_lens[] = { [QCRYPTO_HASH_ALG_SHA384] = 48, [QCRYPTO_HASH_ALG_SHA512] = 64, [QCRYPTO_HASH_ALG_RIPEMD160] = 20, +#ifdef CONFIG_CRYPTO_SM3 + [QCRYPTO_HASH_ALG_SM3] = 32, +#endif }; static const char hex[] = "0123456789abcdef"; diff --git a/tests/unit/test-crypto-hmac.c b/tests/unit/test-crypto-hmac.c index 23eb724d94250f5c8a37c21fb4fd9f1dd874fca9..b1d04e9fcccafbde5bf8fbc114c9d67be9dfd4a1 100644 --- a/tests/unit/test-crypto-hmac.c +++ b/tests/unit/test-crypto-hmac.c @@ -76,6 +76,14 @@ static QCryptoHmacTestData test_data[] = { "94964ed4c1155b62b668c241d67279e5" "8a711676", }, +#ifdef CONFIG_CRYPTO_SM3 + { + .alg = QCRYPTO_HASH_ALG_SM3, + .hex_digest = + "760e3799332bc913819b930085360ddb" + "c05529261313d5b15b75bab4fd7ae91e", + }, +#endif }; static const char hex[] = "0123456789abcdef"; diff --git a/tests/unit/test-crypto-pbkdf.c b/tests/unit/test-crypto-pbkdf.c index 43c417f6b43389e848d3828174b0f2492c771825..3d76593c86c5e155b581a82c7a2d9a49c926f5e6 100644 --- a/tests/unit/test-crypto-pbkdf.c +++ b/tests/unit/test-crypto-pbkdf.c @@ -326,6 +326,22 @@ static QCryptoPbkdfTestData test_data[] = { "\xce\xbf\x91\x14\x8b\x5c\x48\x41", .nout = 32 }, +#ifdef CONFIG_CRYPTO_SM3 + { + .path = "/crypto/pbkdf/nonrfc/sm3/iter2", + .hash = QCRYPTO_HASH_ALG_SM3, + .iterations = 2, + .key = "password", + .nkey = 8, + .salt = "ATHENA.MIT.EDUraeburn", + .nsalt = 21, + .out = "\x48\x71\x1b\x58\xa3\xcb\xce\x06" + "\xba\xad\x77\xa8\xb5\xb9\xd8\x07" + "\x6a\xe2\xb3\x5b\x95\xce\xc8\xce" + "\xe7\xb1\xcb\xee\x61\xdf\x04\xea", + .nout = 32 + }, +#endif #if 0 { .path = "/crypto/pbkdf/nonrfc/whirlpool/iter1200", diff --git a/tests/unit/test-vmstate.c b/tests/unit/test-vmstate.c index 0b7d5ecd6838da416318210b39f83fbf9657277d..22c586eee02a2ad84fc1986463ca00bb9080d2ee 100644 --- a/tests/unit/test-vmstate.c +++ b/tests/unit/test-vmstate.c @@ -31,6 +31,7 @@ #include "../migration/savevm.h" #include "qemu/module.h" #include "io/channel-file.h" +#include "exec/memory.h" static int temp_fd; @@ -1479,6 +1480,11 @@ static void test_tmp_struct(void) g_assert_cmpint(obj.f, ==, 8); /* From the child->parent */ } +/* stub for ut */ +void memory_region_commit(void) +{ +} + int main(int argc, char **argv) { g_autofree char *temp_file = g_strdup_printf("%s/vmst.test.XXXXXX", diff --git a/ui/clipboard.c b/ui/clipboard.c index 3d14bffaf80fd446fd38bdb812075366fe02179c..b3f6fa3c9e1fa83889372e330ef4422e6c7fc587 100644 --- a/ui/clipboard.c +++ b/ui/clipboard.c @@ -163,9 +163,15 @@ void qemu_clipboard_set_data(QemuClipboardPeer *peer, } g_free(info->types[type].data); - info->types[type].data = g_memdup(data, size); - info->types[type].size = size; - info->types[type].available = true; + if (size) { + info->types[type].data = g_memdup2(data, size); + info->types[type].size = size; + info->types[type].available = true; + } else { + info->types[type].data = NULL; + info->types[type].size = 0; + info->types[type].available = false; + } if (update) { qemu_clipboard_update(info); diff --git a/ui/console-vc.c b/ui/console-vc.c index 9c13cc2981b02317b224be52166cfcb16418c4bf..b1903c3e48001a748a84ca752663aa311b1bf9f5 100644 --- a/ui/console-vc.c +++ b/ui/console-vc.c @@ -648,7 +648,7 @@ static void vc_putchar(VCChardev *vc, 
int ch) { QemuTextConsole *s = vc->console; int i; int x, y; - char response[40]; + g_autofree char *response = NULL; switch(vc->state) { case TTY_STATE_NORM: @@ -821,7 +821,7 @@ static void vc_putchar(VCChardev *vc, int ch) break; case 6: /* report cursor position */ - sprintf(response, "\033[%d;%dR", + response = g_strdup_printf("\033[%d;%dR", (s->y_base + s->y) % s->total_height + 1, s->x + 1); vc_respond_str(vc, response); diff --git a/ui/curses.c b/ui/curses.c index 8bde8c5cf7c35ca85295cd1a46fef27df00ce1a5..26438486fc97a2fc1b2e872a41fe0a9d73c1ac7e 100644 --- a/ui/curses.c +++ b/ui/curses.c @@ -38,7 +38,7 @@ #include "ui/input.h" #include "sysemu/sysemu.h" -#if defined(__APPLE__) || defined(__OpenBSD__) +#ifdef __APPLE__ #define _XOPEN_SOURCE_EXTENDED 1 #endif diff --git a/ui/gtk-egl.c b/ui/gtk-egl.c index 3af5ac5bcf336244efbdb42ef08d5754364d0927..75f6b9011a5caf049dee4cafe15f0b1d3f8c6adf 100644 --- a/ui/gtk-egl.c +++ b/ui/gtk-egl.c @@ -150,6 +150,7 @@ void gd_egl_refresh(DisplayChangeListener *dcl) vc, vc->window ? vc->window : vc->gfx.drawing_area); if (vc->gfx.guest_fb.dmabuf && vc->gfx.guest_fb.dmabuf->draw_submitted) { + gd_egl_draw(vc); return; } diff --git a/ui/gtk-gl-area.c b/ui/gtk-gl-area.c index 52dcac161e25759e06460109430f4a8ce993895e..4fff957c3f08028337a7db4e43531fad9eefe10c 100644 --- a/ui/gtk-gl-area.c +++ b/ui/gtk-gl-area.c @@ -126,6 +126,7 @@ void gd_gl_area_refresh(DisplayChangeListener *dcl) gd_update_monitor_refresh_rate(vc, vc->window ? vc->window : vc->gfx.drawing_area); if (vc->gfx.guest_fb.dmabuf && vc->gfx.guest_fb.dmabuf->draw_submitted) { + gd_gl_area_draw(vc); return; } diff --git a/ui/gtk.c b/ui/gtk.c index 810d7fc7960735e03bde904a62147743b935ff28..1a69f6fc3784876273a47ce096d38ed4cc7cf9d9 100644 --- a/ui/gtk.c +++ b/ui/gtk.c @@ -887,7 +887,7 @@ static gboolean gd_motion_event(GtkWidget *widget, GdkEventMotion *motion, int x, y; int mx, my; int fbh, fbw; - int ww, wh, ws; + int ww, wh; if (!vc->gfx.ds) { return TRUE; @@ -898,8 +898,13 @@ static gboolean gd_motion_event(GtkWidget *widget, GdkEventMotion *motion, ww = gtk_widget_get_allocated_width(widget); wh = gtk_widget_get_allocated_height(widget); - ws = gtk_widget_get_scale_factor(widget); + /* + * `widget` may not have the same size as the frame buffer. + * In such cases, some padding is needed around the `vc`. + * To achieve that, `vc` is displayed at (mx, my) + * so that it sits at the center of the widget. + */ mx = my = 0; if (ww > fbw) { mx = (ww - fbw) / 2; @@ -908,8 +913,12 @@ my = (wh - fbh) / 2; } - x = (motion->x - mx) / vc->gfx.scale_x * ws; - y = (motion->y - my) / vc->gfx.scale_y * ws; + /* + * `motion` is reported in `widget` coordinates, + * so translate it to the coordinates in `vc`.
+ */ + x = (motion->x - mx) / vc->gfx.scale_x; + y = (motion->y - my) / vc->gfx.scale_y; if (qemu_input_is_absolute(vc->gfx.dcl.con)) { if (x < 0 || y < 0 || diff --git a/ui/qemu-pixman.c b/ui/qemu-pixman.c index 5ca55dd19984969ec113fb9d8b105f33e7f4632a..6cada8b45e17bf6fd7fdf5d52b5a88f4be934358 100644 --- a/ui/qemu-pixman.c +++ b/ui/qemu-pixman.c @@ -49,7 +49,6 @@ PixelFormat qemu_pixelformat_from_pixman(pixman_format_code_t format) break; default: g_assert_not_reached(); - break; } pf.amax = (1 << pf.abits) - 1; diff --git a/ui/sdl2.c b/ui/sdl2.c index 4971963f0082b98a327117cd9180640d80600462..cc44d2708b698024d9a3e599c216449e38365981 100644 --- a/ui/sdl2.c +++ b/ui/sdl2.c @@ -115,6 +115,7 @@ void sdl2_window_create(struct sdl2_console *scon) SDL_SetHint(SDL_HINT_RENDER_BATCHING, "1"); scon->winctx = SDL_GL_CreateContext(scon->real_window); + SDL_GL_SetSwapInterval(0); } else { /* The SDL renderer is only used by sdl2-2D, when OpenGL is disabled */ scon->real_renderer = SDL_CreateRenderer(scon->real_window, -1, 0); diff --git a/ui/vnc-auth-sasl.c b/ui/vnc-auth-sasl.c index 47fdae5b215b93b7ea3c9c52ce7bdeee051e9dfc..e321c9decce52d9f070e12b67dc013aeda089f5f 100644 --- a/ui/vnc-auth-sasl.c +++ b/ui/vnc-auth-sasl.c @@ -674,6 +674,13 @@ void start_auth_sasl(VncState *vs) } trace_vnc_auth_sasl_mech_list(vs, mechlist); + if (g_str_equal(mechlist, "")) { + trace_vnc_auth_fail(vs, vs->auth, "no available SASL mechanisms", ""); + sasl_dispose(&vs->sasl.conn); + vs->sasl.conn = NULL; + goto authabort; + } + vs->sasl.mechlist = g_strdup(mechlist); mechlistlen = strlen(mechlist); vnc_write_u32(vs, mechlistlen); diff --git a/ui/vnc.c b/ui/vnc.c index 4f23a0fa79f5d7f979dad0d54b485d1d37c97692..5dd77e73cbb4b1962fcd19d20e01300713616eff 100644 --- a/ui/vnc.c +++ b/ui/vnc.c @@ -1365,6 +1365,8 @@ void vnc_disconnect_finish(VncState *vs) g_free(vs->zrle); g_free(vs->tight); g_free(vs); + + qemu_timer_set_mode(QEMU_TIMER_USB_LAZY_MODE, QEMU_USB_CONTROLLER_UHCI); } size_t vnc_client_io_error(VncState *vs, ssize_t ret, Error *err) @@ -3341,6 +3343,8 @@ static void vnc_connect(VncDisplay *vd, QIOChannelSocket *sioc, } } } + + qemu_timer_set_mode(QEMU_TIMER_USB_NORMAL_MODE, QEMU_USB_CONTROLLER_UHCI); } void vnc_start_protocol(VncState *vs) diff --git a/util/chardev_open.c b/util/chardev_open.c new file mode 100644 index 0000000000000000000000000000000000000000..f7764297882f59411c81b304a1ab5d16e903a389 --- /dev/null +++ b/util/chardev_open.c @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2019, Mellanox Technologies. All rights reserved. + * Copyright (C) 2023 Intel Corporation. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: Yi Liu + * + * Copied from + * https://github.com/linux-rdma/rdma-core/blob/master/util/open_cdev.c + * + */ + +#include "qemu/osdep.h" +#include "qemu/chardev_open.h" + +static int open_cdev_internal(const char *path, dev_t cdev) +{ + struct stat st; + int fd; + + fd = qemu_open_old(path, O_RDWR); + if (fd == -1) { + return -1; + } + if (fstat(fd, &st) || !S_ISCHR(st.st_mode) || + (cdev != 0 && st.st_rdev != cdev)) { + close(fd); + return -1; + } + return fd; +} + +static int open_cdev_robust(dev_t cdev) +{ + g_autofree char *devpath = NULL; + + /* + * This assumes that udev is being used and is creating the /dev/char/ + * symlinks. + */ + devpath = g_strdup_printf("/dev/char/%u:%u", major(cdev), minor(cdev)); + return open_cdev_internal(devpath, cdev); +} + +int open_cdev(const char *devpath, dev_t cdev) +{ + int fd; + + fd = open_cdev_internal(devpath, cdev); + if (fd == -1 && cdev != 0) { + return open_cdev_robust(cdev); + } + return fd; +} diff --git a/util/coroutine-ucontext.c b/util/coroutine-ucontext.c index 7b304c79d94204e78022211b6be49257ebc1fab8..650c21846dd19567dc2ec8fa6f2f2cf10db1e2fe 100644 --- a/util/coroutine-ucontext.c +++ b/util/coroutine-ucontext.c @@ -80,6 +80,19 @@ union cc_arg { int i[2]; }; +/** + * coroutines list for libcare + */ +struct CoroutineInformation { + sigjmp_buf *env; + QLIST_ENTRY(CoroutineInformation) next; +}; + +static QemuMutex coro_mtx; +QLIST_HEAD(, CoroutineInformation) coro_info_list = QLIST_HEAD_INITIALIZER(pool); +int coro_env_offset = offsetof(struct CoroutineInformation, env); +int coro_next_offset = offsetof(struct CoroutineInformation, next); + /* * QEMU_ALWAYS_INLINE only does so if __OPTIMIZE__, so we cannot use it. * always_inline is required to avoid TSan runtime fatal errors. 
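Editor's note: the CoroutineInformation list above is bookkeeping for an out-of-process live-patching tool (libcare), which walks it via the exported coro_env_offset/coro_next_offset to locate every saved sigjmp_buf. In-process, the traversal is an ordinary QLIST walk; a hypothetical debug helper, if it were added to coroutine-ucontext.c alongside the functions below, might look like:

/* Count coroutines currently tracked for live patching (editor's sketch). */
static size_t qemu_coroutine_info_count(void)
{
    struct CoroutineInformation *coro_info;
    size_t n = 0;

    qemu_mutex_lock(&coro_mtx);
    QLIST_FOREACH(coro_info, &coro_info_list, next) {
        n++;
    }
    qemu_mutex_unlock(&coro_mtx);

    return n;
}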
@@ -340,3 +353,42 @@ bool qemu_in_coroutine(void) return self && self->caller; } + +static void __attribute__((constructor)) coro_mutex_init(void) +{ + qemu_mutex_init(&coro_mtx); +} + +void qemu_coroutine_info_add(const Coroutine *co_) +{ + CoroutineUContext *co; + struct CoroutineInformation *coro_info; + + /* save coroutine env to coro_info_list */ + co = DO_UPCAST(CoroutineUContext, base, co_); + coro_info = g_malloc0(sizeof(struct CoroutineInformation)); + coro_info->env = &co->env; + + qemu_mutex_lock(&coro_mtx); + QLIST_INSERT_HEAD(&coro_info_list, coro_info, next); + qemu_mutex_unlock(&coro_mtx); +} + +void qemu_coroutine_info_delete(const Coroutine *co_) +{ + CoroutineUContext *co; + struct CoroutineInformation *coro_info; + + /* Remove relative coroutine env info from coro_info_list */ + co = DO_UPCAST(CoroutineUContext, base, co_); + + qemu_mutex_lock(&coro_mtx); + QLIST_FOREACH(coro_info, &coro_info_list, next) { + if (coro_info->env == &co->env) { + QLIST_REMOVE(coro_info, next); + g_free(coro_info); + break; + } + } + qemu_mutex_unlock(&coro_mtx); +} diff --git a/util/log.c b/util/log.c index d36c98da0b4ee251bd79f8ff5eda3250ec3bd967..78b6cf225f8de78a0d0092d78d72c3aeac16138d 100644 --- a/util/log.c +++ b/util/log.c @@ -143,6 +143,12 @@ void qemu_log_unlock(FILE *logfile) } } +#ifdef CONFIG_DISABLE_QEMU_LOG +void qemu_log(const char *fmt, ...) +{ + return; +} +#else void qemu_log(const char *fmt, ...) { FILE *f = qemu_log_trylock(); @@ -155,6 +161,7 @@ void qemu_log(const char *fmt, ...) qemu_log_unlock(f); } } +#endif static void __attribute__((__constructor__)) startup(void) { diff --git a/util/meson.build b/util/meson.build index c2322ef6e71a1de643d44dd3ad4bad497bc975ec..174c133368a8584315f488786a3d4b59f6cede80 100644 --- a/util/meson.build +++ b/util/meson.build @@ -108,6 +108,7 @@ if have_block util_ss.add(files('filemonitor-stub.c')) endif util_ss.add(when: 'CONFIG_LINUX', if_true: files('vfio-helpers.c')) + util_ss.add(when: 'CONFIG_LINUX', if_true: files('chardev_open.c')) endif if cpu == 'aarch64' diff --git a/util/mmap-alloc.c b/util/mmap-alloc.c index ed14f9c64de510e1add0e1b1e29844e43a36f012..6890ad676c434cd8047c739d8bb9086a75113a0a 100644 --- a/util/mmap-alloc.c +++ b/util/mmap-alloc.c @@ -30,6 +30,28 @@ #include #endif +size_t qemu_fd_getfiletype(int fd) +{ + struct statfs fs; + int ret; + + if (fd != -1) { + do { + ret = fstatfs(fd, &fs); + } while (ret != 0 && errno == EINTR); + + if (ret != 0) { + fprintf(stderr, "Couldn't fstatfs() fd: %s\n", + strerror(errno)); + return -1; + } + return fs.f_type; + } else { + fprintf(stderr, "fd is invalid \n"); + return -1; + } +} + QemuFsType qemu_fd_getfs(int fd) { #ifdef CONFIG_LINUX diff --git a/util/module.c b/util/module.c index 32e263163c75dfa8e93ca67c739d4a501162353c..3eb0f06df165503f81c3cbc6657557666307506c 100644 --- a/util/module.c +++ b/util/module.c @@ -354,13 +354,13 @@ int module_load_qom(const char *type, Error **errp) void module_load_qom_all(void) { const QemuModinfo *modinfo; - Error *local_err = NULL; if (module_loaded_qom_all) { return; } for (modinfo = module_info; modinfo->name != NULL; modinfo++) { + Error *local_err = NULL; if (!modinfo->objs) { continue; } diff --git a/util/oslib-posix.c b/util/oslib-posix.c index e86fd64e099877cec2818d948da659d261fde040..43af077fedd3257b7991b2dcf28864a4d723c8fc 100644 --- a/util/oslib-posix.c +++ b/util/oslib-posix.c @@ -88,6 +88,8 @@ static QemuMutex sigbus_mutex; static QemuMutex page_mutex; static QemuCond page_cond; +static int started_num_threads; + int 
qemu_get_thread_id(void) { #if defined(__linux__) @@ -344,6 +346,10 @@ static void *do_touch_pages(void *arg) } qemu_mutex_unlock(&page_mutex); + while (started_num_threads != memset_args->context->num_threads) { + smp_mb(); + } + /* unblock SIGBUS */ sigemptyset(&set); sigaddset(&set, SIGBUS); @@ -448,7 +454,7 @@ static int touch_all_pages(char *area, size_t hpagesize, size_t numpages, context.threads = g_new0(MemsetThread, context.num_threads); numpages_per_thread = numpages / context.num_threads; leftover = numpages % context.num_threads; - for (i = 0; i < context.num_threads; i++) { + for (i = 0, started_num_threads = 0; i < context.num_threads; i++) { context.threads[i].addr = addr; context.threads[i].numpages = numpages_per_thread + (i < leftover); context.threads[i].hpagesize = hpagesize; @@ -464,6 +470,7 @@ static int touch_all_pages(char *area, size_t hpagesize, size_t numpages, QEMU_THREAD_JOINABLE); } addr += context.threads[i].numpages * hpagesize; + started_num_threads++; } if (!use_madv_populate_write) { diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c index 5fd2dbaf8bb7818f6b57059f90fea5305796612e..f550214484ffff5e1cc5f512abaf1c2aa40926d7 100644 --- a/util/qemu-coroutine.c +++ b/util/qemu-coroutine.c @@ -89,6 +89,8 @@ Coroutine *qemu_coroutine_create(CoroutineEntry *entry, void *opaque) co = qemu_coroutine_new(); } + qemu_coroutine_info_add(co); + co->entry = entry; co->entry_arg = opaque; QSIMPLEQ_INIT(&co->co_queue_wakeup); @@ -99,6 +101,8 @@ static void coroutine_delete(Coroutine *co) { co->caller = NULL; + qemu_coroutine_info_delete(co); + if (IS_ENABLED(CONFIG_COROUTINE_POOL)) { if (release_pool_size < qatomic_read(&pool_max_size) * 2) { QSLIST_INSERT_HEAD_ATOMIC(&release_pool, co, pool_next); diff --git a/util/qemu-timer.c b/util/qemu-timer.c index 6a0de33dd2b86c339dc9dc5abe6548e44c918cb4..dc891cc557db84d354fc476b8fc002a1d3c4e67b 100644 --- a/util/qemu-timer.c +++ b/util/qemu-timer.c @@ -23,6 +23,7 @@ */ #include "qemu/osdep.h" +#include "qemu/log.h" #include "qemu/main-loop.h" #include "qemu/timer.h" #include "qemu/lockable.h" @@ -75,6 +76,74 @@ struct QEMUTimerList { QemuEvent timers_done_ev; }; +typedef struct qemu_controller_timer_state { + qemu_usb_controller_ptr controller; + int refs; +} controller_timer_state; + +typedef controller_timer_state* controller_timer_state_ptr; + +static controller_timer_state uhci_timer_state = { + .controller = NULL, + .refs = 0, +}; + +static controller_timer_state_ptr + qemu_usb_controller_tab[MAX_USB_CONTROLLER_TYPES] = {NULL, + &uhci_timer_state, + NULL, NULL}; + +int qemu_register_usb_controller(qemu_usb_controller_ptr controller, + unsigned int type) +{ + if (type != QEMU_USB_CONTROLLER_UHCI) { + return 0; + } + + /* A companion EHCI controller creates three UHCI controllers; + * initialize the table entry only once.
+ */ + if (!qemu_usb_controller_tab[type]->controller) { + qemu_log("the usb controller (%d) registered a frame handler\n", type); + qemu_usb_controller_tab[type]->controller = controller; + } + + return 0; +} + +int qemu_timer_set_mode(enum qemu_timer_mode mode, unsigned int type) +{ + if (type != QEMU_USB_CONTROLLER_UHCI) { + qemu_log("the usb controller (%d) has no need to change frame freq\n", type); + return 0; + } + + if (!qemu_usb_controller_tab[type]->controller) { + qemu_log("the usb controller (%d) is not registered yet\n", type); + return 0; + } + + if (mode == QEMU_TIMER_USB_NORMAL_MODE) { + if (qemu_usb_controller_tab[type]->refs++ > 0) { + return 0; + } + qemu_usb_controller_tab[type]->controller-> + qemu_set_freq(QEMU_USB_NORMAL_FREQ); + qemu_log("Set the controller (%d) freq to %d Hz\n", + type, QEMU_USB_NORMAL_FREQ); + } else { + if (--qemu_usb_controller_tab[type]->refs > 0) { + return 0; + } + qemu_usb_controller_tab[type]->controller-> + qemu_set_freq(QEMU_USB_LAZY_FREQ); + qemu_log("Set the controller (type:%d) freq to %d Hz\n", + type, QEMU_USB_LAZY_FREQ); + } + + return 0; +} + /** * qemu_clock_ptr: * @type: type of clock diff --git a/util/range.c b/util/range.c index f3f40098d547b99af662c0ec89f5eb312d296167..2ea640662b193b9c08699e21a36c06b37f1b535b 100644 --- a/util/range.c +++ b/util/range.c @@ -61,6 +61,7 @@ GList *range_list_insert(GList *list, Range *data) range_extend(l->data, l->next->data); g_free(l->next->data); new_l = g_list_delete_link(list, l->next); + l->next = NULL; assert(new_l == list); } diff --git a/util/userfaultfd.c b/util/userfaultfd.c index fdff4867e8bd8c0adaac44a4aec468c198214897..b7d320d0b1318842c6f75184e4851fa087f69588 100644 --- a/util/userfaultfd.c +++ b/util/userfaultfd.c @@ -356,31 +356,3 @@ int uffd_read_events(int uffd_fd, struct uffd_msg *msgs, int count) return (int) (res / sizeof(struct uffd_msg)); } - -/** - * uffd_poll_events: poll UFFD file descriptor for read - * - * Returns true if events are available for read, false otherwise - * - * @uffd_fd: UFFD file descriptor - * @tmo: timeout value - */ -bool uffd_poll_events(int uffd_fd, int tmo) -{ - int res; - struct pollfd poll_fd = { .fd = uffd_fd, .events = POLLIN, .revents = 0 }; - - do { - res = poll(&poll_fd, 1, tmo); - } while (res < 0 && errno == EINTR); - - if (res == 0) { - return false; - } - if (res < 0) { - error_report("uffd_poll_events() failed: errno=%i", errno); - return false; - } - - return (poll_fd.revents & POLLIN) != 0; -}
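Editor's note to close out the UHCI lazy-timer additions above (util/qemu-timer.c together with the ui/vnc.c hunks earlier): a device model registers a frequency callback once, and each VNC connect/disconnect then adjusts a refcount that switches the frame rate between QEMU_USB_NORMAL_FREQ and QEMU_USB_LAZY_FREQ. A sketch of the device side under stated assumptions: the qemu_usb_controller struct layout and the callback signature are not visible in this patch, and uhci_set_frame_freq() is hypothetical.

/* Hypothetical UHCI-side hookup for the lazy frame timer (editor's sketch). */
static void uhci_set_frame_freq(int freq)
{
    /* Reprogram the frame timer period, e.g. NANOSECONDS_PER_SECOND / freq. */
}

static struct qemu_usb_controller uhci_frame_ops = {
    .qemu_set_freq = uhci_set_frame_freq,   /* member name as used above */
};

static void uhci_register_frame_ops(void)
{
    /* Register once at realize time... */
    qemu_register_usb_controller(&uhci_frame_ops, QEMU_USB_CONTROLLER_UHCI);

    /* ...then clients toggle the rate: VNC connect raises it, disconnect
     * lowers it again once the last connection is gone (refcounted). */
    qemu_timer_set_mode(QEMU_TIMER_USB_NORMAL_MODE, QEMU_USB_CONTROLLER_UHCI);
    qemu_timer_set_mode(QEMU_TIMER_USB_LAZY_MODE, QEMU_USB_CONTROLLER_UHCI);
}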