From a73c1e105a199027e9a1d7bc9789188f7da78bda Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 11 Sep 2024 17:34:38 +0100 Subject: [PATCH 01/28] io_uring/cmd: give inline space in request to cmds ANBZ: #20938 commit a6ccb48e13662bcb98282e051512b9686b02d353 upstream. Some io_uring commands can use some inline space in io_kiocb. We have 32 bytes in struct io_uring_cmd, expose it. [Fix conflict] there is no include/linux/io_uring/cmd.h, so move it into io_uring.h Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/7ca779a61ee5e166e535d70df9c7f07b15d8a0ce.1726072086.git.asml.silence@gmail.com Signed-off-by: Jens Axboe Signed-off-by: Ferry Meng Reviewed-by: Guixin Liu Link: https://gitee.com/anolis/cloud-kernel/pulls/4523 --- include/linux/io_uring.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 108cffa3ae5b..cfa98ad8e943 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -41,6 +41,15 @@ static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) return sqe->cmd; } +static inline void io_uring_cmd_private_sz_check(size_t cmd_sz) +{ + BUILD_BUG_ON(cmd_sz > sizeof_field(struct io_uring_cmd, pdu)); +} +#define io_uring_cmd_to_pdu(cmd, pdu_type) ( \ + io_uring_cmd_private_sz_check(sizeof(pdu_type)), \ + ((pdu_type *)&(cmd)->pdu) \ +) + #if defined(CONFIG_IO_URING) int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, struct iov_iter *iter, void *ioucmd); -- Gitee From feb38cd050faf5bc0fc9955bcb3f967bda9f3f40 Mon Sep 17 00:00:00 2001 From: Ferry Meng Date: Sat, 10 May 2025 09:30:00 +0800 Subject: [PATCH 02/28] anolis: virtio-blk: add virtio-blk chardev support ANBZ: #20938 Introduce a character interface for each block device, facilitating access to block devices through io_uring I/O passthrough. Besides, vblk initialization previously used only kmalloc with the GFP_KERNEL flag, but for char device support the cdev kobj must be zeroed before initialization, so it is better to allocate this struct with the __GFP_ZERO flag. For now, the character devices are only named as - /dev/vdXc0 Currently, only one character interface is created per actual virtblk device, even if that device has been partitioned. ----- ANCK-6.6 Since virtio_blk no longer maintains its own reference count but instead relies on the refcount of the gendisk, we use a hack approach that borrows the gendisk's refcount mechanism to ensure that the gendisk and vblk structures remain valid while the char device is in use. 
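For reference, here is a minimal sketch (not part of the patches; names are illustrative) of how a command provider is expected to consume the pdu area exposed by PATCH 01. Virtio-blk does exactly this in PATCH 03 with struct virtblk_uring_cmd_pdu:

/* Illustrative only: per-command driver state lives in the 32-byte pdu
 * area instead of a separate allocation. The BUILD_BUG_ON() behind
 * io_uring_cmd_to_pdu() rejects PDUs larger than the inline space.
 */
struct demo_uring_cmd_pdu {
	struct bio *bio;
	u8 status;
};

static int demo_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
{
	struct demo_uring_cmd_pdu *pdu =
		io_uring_cmd_to_pdu(ioucmd, struct demo_uring_cmd_pdu);

	pdu->bio = NULL;
	pdu->status = 0;
	/* queue async work here, complete later via io_uring_cmd_done() */
	return -EIOCBQUEUED;
}
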
Signed-off-by: Ferry Meng --- drivers/block/virtio_blk.c | 112 ++++++++++++++++++++++++++++++++++++- 1 file changed, 110 insertions(+), 2 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 65a1f1576e55..5878775d0d95 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -17,6 +17,7 @@ #include #include #include +#include #define PART_BITS 4 #define VQ_NAME_LEN 16 @@ -25,6 +26,8 @@ /* The maximum number of sg elements that fit into a virtqueue */ #define VIRTIO_BLK_MAX_SG_ELEMS 32768 +#define VIRTBLK_MINORS (1U << MINORBITS) + #ifdef CONFIG_ARCH_NO_SG_CHAIN #define VIRTIO_BLK_INLINE_SG_CNT 0 #else @@ -45,6 +48,10 @@ MODULE_PARM_DESC(poll_queues, "The number of dedicated virtqueues for polling I/ static int major; static DEFINE_IDA(vd_index_ida); +static DEFINE_IDA(vd_chr_minor_ida); +static dev_t vd_chr_devt; +static struct class *vd_chr_class; + static struct workqueue_struct *virtblk_wq; struct virtio_blk_vq { @@ -84,6 +91,10 @@ struct virtio_blk { /* For zoned device */ unsigned int zone_sectors; + + /* For passthrough cmd */ + struct cdev cdev; + struct device cdev_device; }; struct virtblk_req { @@ -1294,6 +1305,81 @@ static const struct blk_mq_ops virtio_mq_ops = { .poll = virtblk_poll, }; +static void virtblk_cdev_rel(struct device *dev) +{ + ida_free(&vd_chr_minor_ida, MINOR(dev->devt)); +} + +static void virtblk_cdev_del(struct cdev *cdev, struct device *cdev_device) +{ + cdev_device_del(cdev, cdev_device); + put_device(cdev_device); +} + +static int virtblk_cdev_add(struct virtio_blk *vblk, + const struct file_operations *fops) +{ + struct cdev *cdev = &vblk->cdev; + struct device *cdev_device = &vblk->cdev_device; + int minor, ret; + + minor = ida_alloc(&vd_chr_minor_ida, GFP_KERNEL); + if (minor < 0) + return minor; + + cdev_device->parent = &vblk->vdev->dev; + cdev_device->devt = MKDEV(MAJOR(vd_chr_devt), minor); + cdev_device->class = vd_chr_class; + cdev_device->release = virtblk_cdev_rel; + device_initialize(cdev_device); + + ret = dev_set_name(cdev_device, "%sc0", vblk->disk->disk_name); + if (ret) + goto fail; + + cdev_init(cdev, fops); + ret = cdev_device_add(cdev, cdev_device); + if (ret) + goto fail; + + return 0; + +fail: + put_device(cdev_device); + return ret; +} + +static int virtblk_chr_open(struct inode *inode, struct file *file) +{ + int ret = 0; + struct virtio_blk *vblk = container_of(inode->i_cdev, struct virtio_blk, cdev); + + if (vblk->disk) + get_device(disk_to_dev(vblk->disk)); + else + ret = -ENXIO; + + return ret; +} + +static int virtblk_chr_release(struct inode *inode, struct file *file) +{ + struct virtio_blk *vblk = container_of(inode->i_cdev, struct virtio_blk, cdev); + + if (!vblk->disk) + WARN_ON(1); + else + put_device(disk_to_dev(vblk->disk)); + + return 0; +} + +static const struct file_operations virtblk_chr_fops = { + .owner = THIS_MODULE, + .open = virtblk_chr_open, + .release = virtblk_chr_release, +}; + static unsigned int virtblk_queue_depth; module_param_named(queue_depth, virtblk_queue_depth, uint, 0444); @@ -1335,7 +1421,7 @@ static int virtblk_probe(struct virtio_device *vdev) /* Prevent integer overflows and honor max vq size */ sg_elems = min_t(u32, sg_elems, VIRTIO_BLK_MAX_SG_ELEMS - 2); - vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL); + vdev->priv = vblk = kzalloc(sizeof(*vblk), GFP_KERNEL); if (!vblk) { err = -ENOMEM; goto out_free_index; @@ -1577,6 +1663,8 @@ static int virtblk_probe(struct virtio_device *vdev) if (err) goto out_cleanup_disk; + 
WARN_ON(virtblk_cdev_add(vblk, &virtblk_chr_fops)); + return 0; out_cleanup_disk: @@ -1601,6 +1689,8 @@ static void virtblk_remove(struct virtio_device *vdev) /* Make sure no work handler is accessing the device. */ flush_work(&vblk->config_work); + virtblk_cdev_del(&vblk->cdev, &vblk->cdev_device); + del_gendisk(vblk->disk); blk_mq_free_tag_set(&vblk->tag_set); @@ -1711,11 +1801,27 @@ static int __init virtio_blk_init(void) goto out_destroy_workqueue; } + error = alloc_chrdev_region(&vd_chr_devt, 0, VIRTBLK_MINORS, + "vblk-generic"); + if (error < 0) + goto out_unregister_blkdev; + + vd_chr_class = class_create("vblk-generic"); + if (IS_ERR(vd_chr_class)) { + error = PTR_ERR(vd_chr_class); + goto out_unregister_chardev; + } + error = register_virtio_driver(&virtio_blk); if (error) - goto out_unregister_blkdev; + goto out_destroy_class; + return 0; +out_destroy_class: + class_destroy(vd_chr_class); +out_unregister_chardev: + unregister_chrdev_region(vd_chr_devt, VIRTBLK_MINORS); out_unregister_blkdev: unregister_blkdev(major, "virtblk"); out_destroy_workqueue: @@ -1726,6 +1832,8 @@ static int __init virtio_blk_init(void) static void __exit virtio_blk_fini(void) { unregister_virtio_driver(&virtio_blk); + class_destroy(vd_chr_class); + unregister_chrdev_region(vd_chr_devt, VIRTBLK_MINORS); unregister_blkdev(major, "virtblk"); destroy_workqueue(virtblk_wq); } -- Gitee From 895d097b964e5cdebe88869f8ed946e8a6c37015 Mon Sep 17 00:00:00 2001 From: Ferry Meng Date: Sat, 10 May 2025 09:30:00 +0800 Subject: [PATCH 03/28] anolis: virtio-blk: add uring-cmd support for I/O passthru on chardev ANBZ: #20938 Add ->uring_cmd() support for virtio-blk chardev (/dev/vdXc0). According to virtio spec, in addition to passing 'hdr' info into kernel, we also need to pass vaddr & data length of the 'iov' requeired for the writev/readv op. Signed-off-by: Ferry Meng --- drivers/block/virtio_blk.c | 176 +++++++++++++++++++++++++++++++- include/uapi/linux/virtio_blk.h | 16 +++ 2 files changed, 189 insertions(+), 3 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 5878775d0d95..b25331e23c51 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -18,6 +18,9 @@ #include #include #include +#include +#include +#include #define PART_BITS 4 #define VQ_NAME_LEN 16 @@ -54,6 +57,11 @@ static struct class *vd_chr_class; static struct workqueue_struct *virtblk_wq; +struct virtblk_uring_cmd_pdu { + struct bio *bio; + u8 status; +}; + struct virtio_blk_vq { struct virtqueue *vq; spinlock_t lock; @@ -259,9 +267,6 @@ static blk_status_t virtblk_setup_cmd(struct virtio_device *vdev, if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) && op_is_zone_mgmt(req_op(req))) return BLK_STS_NOTSUPP; - /* Set fields for all request types */ - vbr->out_hdr.ioprio = cpu_to_virtio32(vdev, req_get_ioprio(req)); - switch (req_op(req)) { case REQ_OP_READ: type = VIRTIO_BLK_T_IN; @@ -309,6 +314,7 @@ static blk_status_t virtblk_setup_cmd(struct virtio_device *vdev, type = VIRTIO_BLK_T_ZONE_RESET_ALL; break; case REQ_OP_DRV_IN: + case REQ_OP_DRV_OUT: /* * Out header has already been prepared by the caller (virtblk_get_id() * or virtblk_submit_zone_report()), nothing to do here. 
@@ -323,6 +329,7 @@ static blk_status_t virtblk_setup_cmd(struct virtio_device *vdev, vbr->in_hdr_len = in_hdr_len; vbr->out_hdr.type = cpu_to_virtio32(vdev, type); vbr->out_hdr.sector = cpu_to_virtio64(vdev, sector); + vbr->out_hdr.ioprio = cpu_to_virtio32(vdev, req_get_ioprio(req)); if (type == VIRTIO_BLK_T_DISCARD || type == VIRTIO_BLK_T_WRITE_ZEROES || type == VIRTIO_BLK_T_SECURE_ERASE) { @@ -885,6 +892,7 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str) vbr = blk_mq_rq_to_pdu(req); vbr->in_hdr_len = sizeof(vbr->in_hdr.status); vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_GET_ID); + vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(req)); vbr->out_hdr.sector = 0; err = blk_rq_map_kern(q, req, id_str, VIRTIO_BLK_ID_BYTES, GFP_KERNEL); @@ -1305,6 +1313,167 @@ static const struct blk_mq_ops virtio_mq_ops = { .poll = virtblk_poll, }; +static inline struct virtblk_uring_cmd_pdu *virtblk_uring_cmd_pdu( + struct io_uring_cmd *ioucmd) +{ + return io_uring_cmd_to_pdu(ioucmd, struct virtblk_uring_cmd_pdu); +} + +static void virtblk_uring_task_cb(struct io_uring_cmd *ioucmd, + unsigned issue_flags) +{ + struct virtblk_uring_cmd_pdu *pdu = virtblk_uring_cmd_pdu(ioucmd); + + if (pdu->bio) + blk_rq_unmap_user(pdu->bio); + + /* currently result has no use, it should be zero as cqe->res */ + io_uring_cmd_done(ioucmd, pdu->status, 0, issue_flags); +} + +static enum rq_end_io_ret virtblk_uring_cmd_end_io(struct request *req, blk_status_t err) +{ + struct io_uring_cmd *ioucmd = req->end_io_data; + struct virtblk_uring_cmd_pdu *pdu = virtblk_uring_cmd_pdu(ioucmd); + struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); + + req->bio = pdu->bio; + pdu->status = vbr->in_hdr.status; + if (!pdu->status) + pdu->status = blk_status_to_errno(err); + + io_uring_cmd_do_in_task_lazy(ioucmd, virtblk_uring_task_cb); + + return RQ_END_IO_FREE; +} + +static int virtblk_map_user_request(struct request *req, uintptr_t ubuffer, + unsigned int bufflen, struct io_uring_cmd *ioucmd, bool vec) +{ + struct request_queue *q = req->q; + int ret; + + if (ioucmd && (ioucmd->flags & IORING_URING_CMD_FIXED)) { + struct iov_iter iter; + + /* fixedbufs is only for non-vectored io */ + if (vec) + return -EINVAL; + ret = io_uring_cmd_import_fixed(ubuffer, bufflen, + rq_data_dir(req), &iter, ioucmd); + if (ret < 0) + goto out; + ret = blk_rq_map_user_iov(q, req, NULL, &iter, GFP_KERNEL); + } else { + ret = blk_rq_map_user_io(req, NULL, (void __user *)ubuffer, + bufflen, GFP_KERNEL, vec, 0, + 0, rq_data_dir(req)); + } + + if (ret) + goto out; + + return ret; +out: + blk_mq_free_request(req); + return ret; +} + +static int virtblk_uring_cmd_io(struct virtio_blk *vblk, + struct io_uring_cmd *ioucmd, unsigned int issue_flags, bool vec) +{ + struct virtblk_uring_cmd_pdu *pdu = virtblk_uring_cmd_pdu(ioucmd); + const struct virtblk_uring_cmd *cmd = io_uring_sqe_cmd(ioucmd->sqe); + struct request_queue *q = vblk->disk->queue; + struct virtblk_req *vbr; + struct request *req; + blk_opf_t rq_flags = REQ_ALLOC_CACHE; + blk_mq_req_flags_t blk_flags = 0; + u32 type; + uintptr_t data; + unsigned long data_len, flag; + int ret; + + type = READ_ONCE(cmd->type); + flag = READ_ONCE(cmd->flag); + data = READ_ONCE(cmd->data); + data_len = READ_ONCE(cmd->data_len); + + /* Only support OUT and IN for uring_cmd currently */ + if ((type != VIRTIO_BLK_T_OUT) && (type != VIRTIO_BLK_T_IN)) + return -EOPNOTSUPP; + + if (issue_flags & IO_URING_F_NONBLOCK) { + rq_flags |= REQ_NOWAIT; + blk_flags = BLK_MQ_REQ_NOWAIT; + } + + 
rq_flags |= (type & VIRTIO_BLK_T_OUT) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN; + + req = blk_mq_alloc_request(q, rq_flags, blk_flags); + if (IS_ERR(req)) + return PTR_ERR(req); + + req->rq_flags |= RQF_DONTPREP; + vbr = blk_mq_rq_to_pdu(req); + vbr->in_hdr_len = sizeof(vbr->in_hdr.status); + vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, READ_ONCE(cmd->ioprio)); + vbr->out_hdr.sector = cpu_to_virtio64(vblk->vdev, READ_ONCE(cmd->sector)); + vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, type); + + if (data && data_len) { + ret = virtblk_map_user_request(req, data, data_len, ioucmd, vec); + if (ret) + return ret; + } else { + /* user should ensure passthrough command have data */ + blk_mq_free_request(req); + return -EINVAL; + } + + /* to free bio on completion, as req->bio will be null at that time */ + pdu->bio = req->bio; + req->end_io_data = ioucmd; + bio_set_dev(req->bio, vblk->disk->part0); + + req->end_io = virtblk_uring_cmd_end_io; + blk_execute_rq_nowait(req, false); + return -EIOCBQUEUED; +} + +static int virtblk_uring_cmd(struct virtio_blk *vblk, struct io_uring_cmd *ioucmd, + unsigned int issue_flags) +{ + int ret; + + BUILD_BUG_ON(sizeof(struct virtblk_uring_cmd_pdu) > sizeof(ioucmd->pdu)); + + /* currently we need 128 bytes sqe and 16 bytes cqe */ + if ((issue_flags & IO_URING_F_SQE128) != IO_URING_F_SQE128) + return -EOPNOTSUPP; + + switch (ioucmd->cmd_op) { + case VIRTBLK_URING_CMD_IO: + ret = virtblk_uring_cmd_io(vblk, ioucmd, issue_flags, false); + break; + case VIRTBLK_URING_CMD_IO_VEC: + ret = virtblk_uring_cmd_io(vblk, ioucmd, issue_flags, true); + break; + default: + ret = -ENOTTY; + } + + return ret; +} + +static int virtblk_chr_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags) +{ + struct virtio_blk *vblk = container_of(file_inode(ioucmd->file)->i_cdev, + struct virtio_blk, cdev); + + return virtblk_uring_cmd(vblk, ioucmd, issue_flags); +} + static void virtblk_cdev_rel(struct device *dev) { ida_free(&vd_chr_minor_ida, MINOR(dev->devt)); @@ -1378,6 +1547,7 @@ static const struct file_operations virtblk_chr_fops = { .owner = THIS_MODULE, .open = virtblk_chr_open, .release = virtblk_chr_release, + .uring_cmd = virtblk_chr_uring_cmd, }; static unsigned int virtblk_queue_depth; diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h index 3744e4da1b2a..93b6e1b5b9a4 100644 --- a/include/uapi/linux/virtio_blk.h +++ b/include/uapi/linux/virtio_blk.h @@ -313,6 +313,22 @@ struct virtio_scsi_inhdr { }; #endif /* !VIRTIO_BLK_NO_LEGACY */ +struct virtblk_uring_cmd { + /* VIRTIO_BLK_T* */ + __u32 type; + /* io priority. */ + __u32 ioprio; + /* Sector (ie. 512 byte offset) */ + __u64 sector; + + __u64 data; + __u32 data_len; + __u32 flag; +}; + +#define VIRTBLK_URING_CMD_IO 1 +#define VIRTBLK_URING_CMD_IO_VEC 2 + /* And this is the final byte of the write scatter-gather list. */ #define VIRTIO_BLK_S_OK 0 #define VIRTIO_BLK_S_IOERR 1 -- Gitee From 2a9fee98df3cbb63b8bea484b205cefde66ef9c6 Mon Sep 17 00:00:00 2001 From: Ferry Meng Date: Sat, 10 May 2025 09:30:00 +0800 Subject: [PATCH 04/28] anolis: virtio-blk: add uring_cmd iopoll support ANBZ: #20938 Add polling support for uring_cmd polling support for virtblk, which will be called during completion-polling. 
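Patches 03 and 04 cover the kernel side of the interface. As a hedged sketch of the userspace side (the device path, queue depth and error handling are assumptions, not part of this series), a read through the new passthrough command could be issued with liburing roughly as follows; note the driver only accepts rings created with IORING_SETUP_SQE128, since the command payload lives in the extended SQE area:

/* Illustrative userspace submission; assumes liburing plus the uapi
 * struct virtblk_uring_cmd / VIRTBLK_URING_CMD_IO added by this series.
 */
#include <fcntl.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <liburing.h>
#include <linux/virtio_blk.h>

static int virtblk_pt_read(const char *path, void *buf, __u32 len, __u64 sector)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	struct virtblk_uring_cmd *cmd;
	int fd, ret;

	fd = open(path, O_RDWR);		/* e.g. the /dev/vdXc0 node */
	if (fd < 0)
		return -1;
	if (io_uring_queue_init(8, &ring, IORING_SETUP_SQE128)) {
		close(fd);
		return -1;
	}

	sqe = io_uring_get_sqe(&ring);
	memset(sqe, 0, 2 * sizeof(*sqe));	/* big SQE: 128 bytes */
	sqe->opcode = IORING_OP_URING_CMD;
	sqe->fd = fd;
	sqe->cmd_op = VIRTBLK_URING_CMD_IO;

	cmd = (struct virtblk_uring_cmd *)sqe->cmd;
	cmd->type = VIRTIO_BLK_T_IN;		/* device-to-driver, i.e. read */
	cmd->sector = sector;
	cmd->data = (__u64)(uintptr_t)buf;
	cmd->data_len = len;

	io_uring_submit(&ring);
	ret = io_uring_wait_cqe(&ring, &cqe);
	if (!ret) {
		ret = cqe->res;			/* 0 on success (VIRTIO_BLK_S_OK) */
		io_uring_cqe_seen(&ring, cqe);
	}
	io_uring_queue_exit(&ring);
	close(fd);
	return ret;
}
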
Signed-off-by: Ferry Meng --- drivers/block/virtio_blk.c | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index b25331e23c51..550da5a88083 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -1342,7 +1342,15 @@ static enum rq_end_io_ret virtblk_uring_cmd_end_io(struct request *req, blk_stat if (!pdu->status) pdu->status = blk_status_to_errno(err); - io_uring_cmd_do_in_task_lazy(ioucmd, virtblk_uring_task_cb); + /* + * For iopoll, complete it directly. + * Otherwise, move the completion to task work. + */ + if (blk_rq_is_poll(req)) { + WRITE_ONCE(ioucmd->cookie, NULL); + virtblk_uring_task_cb(ioucmd, IO_URING_F_UNLOCKED); + } else + io_uring_cmd_do_in_task_lazy(ioucmd, virtblk_uring_task_cb); return RQ_END_IO_FREE; } @@ -1407,6 +1415,8 @@ static int virtblk_uring_cmd_io(struct virtio_blk *vblk, rq_flags |= REQ_NOWAIT; blk_flags = BLK_MQ_REQ_NOWAIT; } + if (issue_flags & IO_URING_F_IOPOLL) + rq_flags |= REQ_POLLED; rq_flags |= (type & VIRTIO_BLK_T_OUT) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN; @@ -1431,6 +1441,11 @@ static int virtblk_uring_cmd_io(struct virtio_blk *vblk, return -EINVAL; } + if (blk_rq_is_poll(req)) { + ioucmd->flags |= IORING_URING_CMD_POLLED; + WRITE_ONCE(ioucmd->cookie, req); + } + /* to free bio on completion, as req->bio will be null at that time */ pdu->bio = req->bio; req->end_io_data = ioucmd; @@ -1474,6 +1489,22 @@ static int virtblk_chr_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue return virtblk_uring_cmd(vblk, ioucmd, issue_flags); } +static int virtblk_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd, + struct io_comp_batch *iob, + unsigned int poll_flags) +{ + struct request *req; + int ret = 0; + + if (!(ioucmd->flags & IORING_URING_CMD_POLLED)) + return 0; + + req = READ_ONCE(ioucmd->cookie); + if (req && blk_rq_is_poll(req)) + ret = blk_rq_poll(req, iob, poll_flags); + return ret; +} + static void virtblk_cdev_rel(struct device *dev) { ida_free(&vd_chr_minor_ida, MINOR(dev->devt)); @@ -1548,6 +1579,7 @@ static const struct file_operations virtblk_chr_fops = { .open = virtblk_chr_open, .release = virtblk_chr_release, .uring_cmd = virtblk_chr_uring_cmd, + .uring_cmd_iopoll = virtblk_chr_uring_cmd_iopoll, }; static unsigned int virtblk_queue_depth; -- Gitee From 6f3c01e441f2a58659b9aa4bc94da6a6ef53b874 Mon Sep 17 00:00:00 2001 From: Ferry Meng Date: Sat, 10 May 2025 09:30:00 +0800 Subject: [PATCH 05/28] anolis: virtio-blk: add blk_rq_map_sg_bidirectional helper ANBZ: #20938 If a request is bidirectional, we should divide bios into different sglist, according to its IO direction. Signed-off-by: Ferry Meng --- block/blk-merge.c | 69 ++++++++++++++++++++++++++++++++++++++++++ include/linux/blk-mq.h | 4 +++ 2 files changed, 73 insertions(+) diff --git a/block/blk-merge.c b/block/blk-merge.c index 889ac59759a2..11e4e70b910a 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -595,6 +595,75 @@ int __blk_rq_map_sg(struct request_queue *q, struct request *rq, } EXPORT_SYMBOL(__blk_rq_map_sg); +static int __blk_bios_map_sg_bidir(struct request_queue *q, struct bio *bio, + struct scatterlist *sglist[], struct scatterlist **sg[]) +{ + struct bio_vec bvec, bvprv = { NULL }; + struct bvec_iter iter; + int nsegs = 0; + bool new_bio = false, write = false, prev_write = false; + /* we have ensure that a bidir req only have two bio in the list, + * what we do here is to map the two bio to two scatterlist. 
+ */ + + for_each_bio(bio) { + write = op_is_write(bio_op(bio)); + bio_for_each_bvec(bvec, bio, iter) { + /* + * Only try to merge bvecs from two bios given we + * have done bio internal merge when adding pages + * to bio. + * For first time enter this loop, 'new_bio' is + * false, ignore prev_write and write until next + * loop. + */ + if (new_bio && prev_write == write && + __blk_segment_map_sg_merge(q, &bvec, &bvprv, sg[prev_write])) + goto next_bvec; + + if (bvec.bv_offset + bvec.bv_len <= PAGE_SIZE) + nsegs += __blk_bvec_map_sg(bvec, sglist[write], sg[write]); + else + nsegs += blk_bvec_map_sg(q, &bvec, sglist[write], sg[write]); +next_bvec: + new_bio = false; + } + if (likely(bio->bi_iter.bi_size)) { + bvprv = bvec; + new_bio = true; + prev_write = write; + } + } + + return nsegs; +} + +int blk_rq_map_sg_bidir(struct request_queue *q, struct request *rq, + struct scatterlist *sglist_write, struct scatterlist *sglist_read) +{ + int nsegs = 0; + struct scatterlist *sglist[2] = {sglist_read, sglist_write}; + struct scatterlist *last_sg_write = NULL, *last_sg_read = NULL; + struct scatterlist **sglist_last[2] = {&last_sg_write, &last_sg_read}; + + if (rq->bio) + nsegs = __blk_bios_map_sg_bidir(q, rq->bio, sglist, sglist_last); + + if (last_sg_write) + sg_mark_end(last_sg_write); + + if (last_sg_read) + sg_mark_end(last_sg_read); + /* + * Something must have been wrong if the figured number of + * segment is bigger than number of req's physical segments + */ + WARN_ON(nsegs > blk_rq_nr_phys_segments(rq)); + + return nsegs; +} +EXPORT_SYMBOL(blk_rq_map_sg_bidir); + static inline unsigned int blk_rq_get_max_sectors(struct request *rq, sector_t offset) { diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index ea73b8189e1a..afe40766a912 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -1176,6 +1176,10 @@ static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq, return __blk_rq_map_sg(q, rq, sglist, &last_sg); } + +int blk_rq_map_sg_bidir(struct request_queue *q, struct request *rq, + struct scatterlist *sglist_write, struct scatterlist *sglist_read); + void blk_dump_rq_flags(struct request *, char *); #ifdef CONFIG_BLK_DEV_ZONED -- Gitee From 10c303d6d7d5641a4559026227db685242744ec7 Mon Sep 17 00:00:00 2001 From: Ferry Meng Date: Sat, 10 May 2025 09:30:00 +0800 Subject: [PATCH 06/28] anolis: virtio-blk: add bidirectional request support ANBZ: #20938 We support virtio-blk bidirectional request, which contains both read and write bios. Now we assume that A bidirectional request only contains two bios, 1st is write and 2nd is read. We should divide 'bios' into two sglists, so virtblk-req need extend its structure, use an extra sglist to store them. In order to avoid corruption of the logic of 'request', using a trick in 'virtblk_map_user_bidirectional': when map iovec from usermode, set REQ_OP_WRITE for req->cmd_flags first, remove it, set read flag and repeat above steps. For block I/O request, add bidirectional flag for passthrough command. If a request is bidirectional, its bio contains read and write iovecs. Add extra and flag in virtblk_uring_cmd, which will be used in bidirectional request. 'write_iov_count' refers to 'num of write iovec' in a bidirectional request. 'flag' indicate this is a bidirectional req or not. We assume that a bidirectional request should satisfy the model "first write then read", and they need to be continuous in iovec. 
In other words, they should be like this: write - write - write - read - read In this example, virtblk_uring_cmd is: data: iovec addr base data_len: 0x5 flag: 0x1 write_iov_count: 0x3 Signed-off-by: Ferry Meng --- drivers/block/virtio_blk.c | 149 ++++++++++++++++++++++++++++++-- include/linux/blk_types.h | 7 ++ include/uapi/linux/virtio_blk.h | 3 + 3 files changed, 154 insertions(+), 5 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 550da5a88083..4bd43a719d66 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -127,6 +127,7 @@ struct virtblk_req { size_t in_hdr_len; struct sg_table sg_table; + struct sg_table sg_table_extra; struct scatterlist sg[]; }; @@ -156,11 +157,47 @@ static inline struct virtio_blk_vq *get_virtio_blk_vq(struct blk_mq_hw_ctx *hctx return vq; } +static inline bool vbr_is_bidirectional(struct virtblk_req *vbr) +{ + struct request *req = blk_mq_rq_from_pdu(vbr); + + return op_is_bidirectional(req->cmd_flags); +} + +static int virtblk_add_req_bidirectional(struct virtqueue *vq, + struct virtblk_req *vbr, struct scatterlist *data_sg, + struct scatterlist *data_sg_extra) +{ + struct scatterlist out_hdr, in_hdr, *sgs[4]; + unsigned int num_out = 0, num_in = 0; + + /* + * vritblk_add_req use 'bool' have_data, while we use int num to + * validate both OUT and IN direction have data. For bidirectional + * request, __blk_bios_map_sg_bidir() should map at least 2 segments. + */ + if ((sg_nents(data_sg) == 0) || (sg_nents(data_sg_extra) == 0)) + return -EINVAL; + + sg_init_one(&out_hdr, &vbr->out_hdr, sizeof(vbr->out_hdr)); + sg_init_one(&in_hdr, &vbr->in_hdr.status, vbr->in_hdr_len); + sgs[num_out++] = &out_hdr; + sgs[num_out++] = data_sg; + sgs[num_out + num_in++] = data_sg_extra; + sgs[num_out + num_in++] = &in_hdr; + + return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC); +} + static int virtblk_add_req(struct virtqueue *vq, struct virtblk_req *vbr) { struct scatterlist out_hdr, in_hdr, *sgs[3]; unsigned int num_out = 0, num_in = 0; + if (vbr_is_bidirectional(vbr)) + return virtblk_add_req_bidirectional(vq, vbr, + vbr->sg_table.sgl, vbr->sg_table_extra.sgl); + sg_init_one(&out_hdr, &vbr->out_hdr, sizeof(vbr->out_hdr)); sgs[num_out++] = &out_hdr; @@ -223,13 +260,57 @@ static int virtblk_setup_discard_write_zeroes_erase(struct request *req, bool un return 0; } +static void virtblk_unmap_data_bidirectional(struct request *req, + struct virtblk_req *vbr) +{ + if (blk_rq_nr_phys_segments(req)) { + sg_free_table_chained(&vbr->sg_table, + VIRTIO_BLK_INLINE_SG_CNT); + sg_free_table_chained(&vbr->sg_table_extra, + VIRTIO_BLK_INLINE_SG_CNT); + } +} + static void virtblk_unmap_data(struct request *req, struct virtblk_req *vbr) { + if (vbr_is_bidirectional(vbr)) { + virtblk_unmap_data_bidirectional(req, vbr); + return; + } + if (blk_rq_nr_phys_segments(req)) sg_free_table_chained(&vbr->sg_table, VIRTIO_BLK_INLINE_SG_CNT); } +static int virtblk_map_data_bidirectional(struct blk_mq_hw_ctx *hctx, + struct request *req, struct virtblk_req *vbr) +{ + int err; + + vbr->sg_table.sgl = vbr->sg; + err = sg_alloc_table_chained(&vbr->sg_table, + blk_rq_nr_phys_segments(req), + vbr->sg_table.sgl, + VIRTIO_BLK_INLINE_SG_CNT); + if (unlikely(err)) + return -ENOMEM; + + vbr->sg_table_extra.sgl = &vbr->sg[VIRTIO_BLK_INLINE_SG_CNT]; + err = sg_alloc_table_chained(&vbr->sg_table_extra, + blk_rq_nr_phys_segments(req), + vbr->sg_table_extra.sgl, + VIRTIO_BLK_INLINE_SG_CNT); + if (unlikely(err)) { + 
sg_free_table_chained(&vbr->sg_table, + VIRTIO_BLK_INLINE_SG_CNT); + return -ENOMEM; + } + + return blk_rq_map_sg_bidir(hctx->queue, req, + vbr->sg_table.sgl, vbr->sg_table_extra.sgl); +} + static int virtblk_map_data(struct blk_mq_hw_ctx *hctx, struct request *req, struct virtblk_req *vbr) { @@ -238,6 +319,9 @@ static int virtblk_map_data(struct blk_mq_hw_ctx *hctx, struct request *req, if (!blk_rq_nr_phys_segments(req)) return 0; + if (vbr_is_bidirectional(vbr)) + return virtblk_map_data_bidirectional(hctx, req, vbr); + vbr->sg_table.sgl = vbr->sg; err = sg_alloc_table_chained(&vbr->sg_table, blk_rq_nr_phys_segments(req), @@ -1355,12 +1439,54 @@ static enum rq_end_io_ret virtblk_uring_cmd_end_io(struct request *req, blk_stat return RQ_END_IO_FREE; } +static int virtblk_map_user_bidirectional(struct request *req, uintptr_t ubuffer, + struct io_uring_cmd *ioucmd, unsigned int iov_count, + unsigned int write_iov_count) +{ + int ret; + + /* + * USER command should ensure write_iov_count < iov_count + */ + if (write_iov_count >= iov_count) + return -EINVAL; + + if (ioucmd && (ioucmd->flags & IORING_URING_CMD_FIXED)) + return -EINVAL; + /* + * now bidirectional only support READ-after-WRITE mode, + * set WRITE first and clear it later. + */ + req->cmd_flags |= WRITE; + ret = blk_rq_map_user_io(req, NULL, (void __user *)ubuffer, + write_iov_count, GFP_KERNEL, true, + 0, false, rq_data_dir(req)); + if (ret) + return ret; + + ubuffer += write_iov_count * sizeof(struct iovec); + req->cmd_flags &= ~WRITE; + + ret = blk_rq_map_user_io(req, NULL, (void __user *)ubuffer, + (iov_count - write_iov_count), GFP_KERNEL, + true, 0, false, rq_data_dir(req)); + if (ret) + blk_rq_unmap_user(req->bio); + + return ret; +} static int virtblk_map_user_request(struct request *req, uintptr_t ubuffer, - unsigned int bufflen, struct io_uring_cmd *ioucmd, bool vec) + unsigned int bufflen, struct io_uring_cmd *ioucmd, + bool vec, unsigned int num) { struct request_queue *q = req->q; + struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); int ret; + if (vbr_is_bidirectional(vbr)) + return virtblk_map_user_bidirectional(req, ubuffer, ioucmd, + bufflen, num); + if (ioucmd && (ioucmd->flags & IORING_URING_CMD_FIXED)) { struct iov_iter iter; @@ -1395,17 +1521,19 @@ static int virtblk_uring_cmd_io(struct virtio_blk *vblk, struct request_queue *q = vblk->disk->queue; struct virtblk_req *vbr; struct request *req; + struct bio *bio; blk_opf_t rq_flags = REQ_ALLOC_CACHE; blk_mq_req_flags_t blk_flags = 0; u32 type; uintptr_t data; - unsigned long data_len, flag; + unsigned long data_len, flag, write_iov_count; int ret; type = READ_ONCE(cmd->type); flag = READ_ONCE(cmd->flag); data = READ_ONCE(cmd->data); data_len = READ_ONCE(cmd->data_len); + write_iov_count = READ_ONCE(cmd->write_iov_count); /* Only support OUT and IN for uring_cmd currently */ if ((type != VIRTIO_BLK_T_OUT) && (type != VIRTIO_BLK_T_IN)) @@ -1417,6 +1545,8 @@ static int virtblk_uring_cmd_io(struct virtio_blk *vblk, } if (issue_flags & IO_URING_F_IOPOLL) rq_flags |= REQ_POLLED; + if (flag & VIRTBLK_URING_F_BIDIR) + rq_flags |= REQ_BIDIR; rq_flags |= (type & VIRTIO_BLK_T_OUT) ? 
REQ_OP_DRV_OUT : REQ_OP_DRV_IN; @@ -1432,7 +1562,8 @@ static int virtblk_uring_cmd_io(struct virtio_blk *vblk, vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, type); if (data && data_len) { - ret = virtblk_map_user_request(req, data, data_len, ioucmd, vec); + ret = virtblk_map_user_request(req, data, data_len, ioucmd, + vec, write_iov_count); if (ret) return ret; } else { @@ -1449,7 +1580,9 @@ static int virtblk_uring_cmd_io(struct virtio_blk *vblk, /* to free bio on completion, as req->bio will be null at that time */ pdu->bio = req->bio; req->end_io_data = ioucmd; - bio_set_dev(req->bio, vblk->disk->part0); + /* for bid command, req have more than one bio, should associate all */ + for (bio = req->bio; bio; bio = bio->bi_next) + bio_set_dev(bio, vblk->disk->part0); req->end_io = virtblk_uring_cmd_end_io; blk_execute_rq_nowait(req, false); @@ -1654,9 +1787,15 @@ static int virtblk_probe(struct virtio_device *vdev) vblk->tag_set.queue_depth = queue_depth; vblk->tag_set.numa_node = NUMA_NO_NODE; vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; + /* For bidirectional passthrough vblk request, both WRITE and READ + * operations need pre-alloc inline SGs. So we should prealloc twice + * the size than original ways. Due to the inability to predict whether + * a request is bidirectional, there may be memory wastage, but won't + * be significant. + */ vblk->tag_set.cmd_size = sizeof(struct virtblk_req) + - sizeof(struct scatterlist) * VIRTIO_BLK_INLINE_SG_CNT; + sizeof(struct scatterlist) * 2 * VIRTIO_BLK_INLINE_SG_CNT; vblk->tag_set.driver_data = vblk; vblk->tag_set.nr_hw_queues = vblk->num_vqs; vblk->tag_set.nr_maps = 1; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index d8f5999e4748..1d8ec39cd0f3 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -473,6 +473,7 @@ enum req_flag_bits { __REQ_SWAP, /* swap I/O */ __REQ_DRV, /* for driver use */ __REQ_FS_PRIVATE, /* for file system (submitter) use */ + __REQ_BIDIR, /* request is a bi-directional */ /* * Command specific flags, keep last: @@ -505,6 +506,7 @@ enum req_flag_bits { #define REQ_SWAP (__force blk_opf_t)(1ULL << __REQ_SWAP) #define REQ_DRV (__force blk_opf_t)(1ULL << __REQ_DRV) #define REQ_FS_PRIVATE (__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE) +#define REQ_BIDIR ((__force blk_opf_t)(1ULL << __REQ_BIDIR)) #define REQ_NOUNMAP (__force blk_opf_t)(1ULL << __REQ_NOUNMAP) @@ -533,6 +535,11 @@ static inline bool op_is_write(blk_opf_t op) return !!(op & (__force blk_opf_t)1); } +static inline bool op_is_bidirectional(blk_opf_t op) +{ + return op & REQ_BIDIR; +} + /* * Check if the bio or request is one that needs special treatment in the * flush state machine. diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h index 93b6e1b5b9a4..014aaf6ca5a9 100644 --- a/include/uapi/linux/virtio_blk.h +++ b/include/uapi/linux/virtio_blk.h @@ -324,11 +324,14 @@ struct virtblk_uring_cmd { __u64 data; __u32 data_len; __u32 flag; + __u32 write_iov_count; }; #define VIRTBLK_URING_CMD_IO 1 #define VIRTBLK_URING_CMD_IO_VEC 2 +#define VIRTBLK_URING_F_BIDIR (1 << 0) + /* And this is the final byte of the write scatter-gather list. 
*/ #define VIRTIO_BLK_S_OK 0 #define VIRTIO_BLK_S_IOERR 1 -- Gitee From b49dad29edc4e5b6beca004cc1ede81fc3575bc3 Mon Sep 17 00:00:00 2001 From: Ferry Meng Date: Sat, 10 May 2025 09:30:00 +0800 Subject: [PATCH 07/28] anolis: virtio_ring: introduce dma map page api for virtqueue ANBZ: #20938 Wrapping new API "virtqueue_dma_map_page_attrs" above dma_map_page_attrs, which checks vq use dma_address or not. Signed-off-by: Ferry Meng Signed-off-by: Jingbo Xu --- drivers/virtio/virtio_ring.c | 53 ++++++++++++++++++++++++++++++++++++ include/linux/virtio.h | 5 ++++ 2 files changed, 58 insertions(+) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 219d0e6bf5c1..34f0d2d305e8 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -3191,6 +3191,59 @@ void virtqueue_dma_unmap_single_attrs(struct virtqueue *_vq, dma_addr_t addr, } EXPORT_SYMBOL_GPL(virtqueue_dma_unmap_single_attrs); +/** + * virtqueue_dma_map_page_attrs - map DMA for _vq + * @_vq: the struct virtqueue we're talking about. + * @page: the page descriptor of the buffer to do dma + * @offset: the offset of the buffer to do dma inside the page + * @size: the size of the buffer to do dma + * @dir: DMA direction + * @attrs: DMA Attrs + * + * The caller calls this to do dma mapping in advance. The DMA address can be + * passed to this _vq when it is in pre-mapped mode. + * + * return DMA address. Caller should check that by virtqueue_dma_mapping_error(). + */ +dma_addr_t virtqueue_dma_map_page_attrs(struct virtqueue *_vq, struct page *page, + size_t offset, size_t size, + enum dma_data_direction dir, + unsigned long attrs) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + if (!vq->use_dma_api) + return (dma_addr_t)(page_to_phys(page) + offset); + + return dma_map_page_attrs(vring_dma_dev(vq), page, offset, + size, dir, attrs); +} +EXPORT_SYMBOL_GPL(virtqueue_dma_map_page_attrs); + +/** + * virtqueue_dma_unmap_page_attrs - unmap DMA for _vq + * @_vq: the struct virtqueue we're talking about. + * @addr: the dma address to unmap + * @size: the size of the buffer + * @dir: DMA direction + * @attrs: DMA Attrs + * + * Unmap the address that is mapped by the virtqueue_dma_map_* APIs. + * + */ +void virtqueue_dma_unmap_page_attrs(struct virtqueue *_vq, dma_addr_t addr, + size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + if (!vq->use_dma_api) + return; + + dma_unmap_page_attrs(vring_dma_dev(vq), addr, size, dir, attrs); +} +EXPORT_SYMBOL_GPL(virtqueue_dma_unmap_page_attrs); + /** * virtqueue_dma_mapping_error - check dma address * @_vq: the struct virtqueue we're talking about. 
diff --git a/include/linux/virtio.h b/include/linux/virtio.h index dfea88de4749..69832c02adff 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -231,6 +231,11 @@ dma_addr_t virtqueue_dma_map_single_attrs(struct virtqueue *_vq, void *ptr, size void virtqueue_dma_unmap_single_attrs(struct virtqueue *_vq, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs); +dma_addr_t virtqueue_dma_map_page_attrs(struct virtqueue *_vq, struct page *page, + size_t offset, size_t size, + enum dma_data_direction dir, unsigned long attrs); +void virtqueue_dma_unmap_page_attrs(struct virtqueue *_vq, dma_addr_t addr, size_t size, + enum dma_data_direction dir, unsigned long attrs); int virtqueue_dma_mapping_error(struct virtqueue *_vq, dma_addr_t addr); bool virtqueue_dma_need_sync(struct virtqueue *_vq, dma_addr_t addr); -- Gitee From 40fdc63c2549701c4fafba1a3465eb08c255c72d Mon Sep 17 00:00:00 2001 From: Ferry Meng Date: Sat, 10 May 2025 09:30:00 +0800 Subject: [PATCH 08/28] anolis: virtio-blk: add VIRTIO_BLK_RING_PAIR kconfig ANBZ: #20938 Add a new Kconfig option for the following ring pair feature. Virtio-blk can use two neighboring virtqueues to serve one request queue. This feature needs backend support, so it is recommended to disable it if your environment does not have a matching backend. Signed-off-by: Ferry Meng --- drivers/block/Kconfig | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 5b9d4aaebb81..12c473455096 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -354,6 +354,15 @@ config VIRTIO_BLK This is the virtual block driver for virtio. It can be used with QEMU based VMMs (like KVM or Xen). Say Y or M. +config VIRTIO_BLK_RING_PAIR + bool "Virtio block driver ring pair support" + depends on VIRTIO_BLK + help + This enables virtio-blk use two virtqueues per request queue. Must + be supported by backend. + + If unsure, say N. + config BLK_DEV_RBD tristate "Rados block device (RBD)" depends on INET && BLOCK -- Gitee From 2be70906549086748a69b9916a5dfd7f48ec53da Mon Sep 17 00:00:00 2001 From: Ferry Meng Date: Sat, 10 May 2025 09:30:00 +0800 Subject: [PATCH 09/28] anolis: virtio-blk: duplicate functions to cleanup for ring pair ANBZ: #20938 This is in preparation for the virtio-blk ring pair feature. If enabled, two neighboring virtqueues are bound together to serve I/O: one is used for dispatching (SQ) and the other fetches completions (CQ). The first queue is not driven by interrupts; the SQ is polled to recycle used ring entries. The second queue responds to the completion interrupt and reaps the CQ. As preparation for the virtio-blk ring_pair feature, we duplicate the related functions for later modification. 
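As an illustration of how this is meant to be consumed (the values below are an example, not a requirement): the paired-queue mode is compiled in via the new Kconfig symbol, and on hosts whose backend lacks ring-pair support it can still be turned off at boot through the rpair_disable module parameter introduced later in this patch:

  # build-time
  CONFIG_VIRTIO_BLK=y
  CONFIG_VIRTIO_BLK_RING_PAIR=y

  # boot-time opt-out when the backend does not implement paired queues
  virtio_blk.rpair_disable=1
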
Signed-off-by: Ferry Meng --- drivers/block/virtio_blk.c | 454 +++++++++++++++++++++++++++++++++++++ 1 file changed, 454 insertions(+) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 4bd43a719d66..9c84aae982d2 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -57,6 +57,16 @@ static struct class *vd_chr_class; static struct workqueue_struct *virtblk_wq; +#ifdef CONFIG_VIRTIO_BLK_RING_PAIR +enum virtblk_ring_t { + /* ring_pair submission queue */ + VIRTBLK_RING_SQ = 0, + /* ring_pair completion queue */ + VIRTBLK_RING_CQ = 1, + VIRTBLK_RING_NUM = 2 +}; +#endif + struct virtblk_uring_cmd_pdu { struct bio *bio; u8 status; @@ -164,6 +174,66 @@ static inline bool vbr_is_bidirectional(struct virtblk_req *vbr) return op_is_bidirectional(req->cmd_flags); } +#ifdef CONFIG_VIRTIO_BLK_RING_PAIR +static inline struct virtio_blk_vq *get_virtio_blk_vq_rpair(struct blk_mq_hw_ctx *hctx) +{ + struct virtio_blk *vblk = hctx->queue->queuedata; + struct virtio_blk_vq *vq = &vblk->vqs[hctx->queue_num]; + + return vq; +} + +static int virtblk_add_req_bidirectional_rpair(struct virtqueue *vq, + struct virtblk_req *vbr, struct scatterlist *data_sg, + struct scatterlist *data_sg_extra) +{ + struct scatterlist out_hdr, in_hdr, *sgs[4]; + unsigned int num_out = 0, num_in = 0; + + /* + * vritblk_add_req use 'bool' have_data, while we use int num to + * validate both OUT and IN direction have data. For bidirectional + * request, __blk_bios_map_sg_bidir() should map at least 2 segments. + */ + if ((sg_nents(data_sg) == 0) || (sg_nents(data_sg_extra) == 0)) + return -EINVAL; + + sg_init_one(&out_hdr, &vbr->out_hdr, sizeof(vbr->out_hdr)); + sg_init_one(&in_hdr, &vbr->in_hdr.status, vbr->in_hdr_len); + sgs[num_out++] = &out_hdr; + sgs[num_out++] = data_sg; + sgs[num_out + num_in++] = data_sg_extra; + sgs[num_out + num_in++] = &in_hdr; + + return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC); +} + +static int virtblk_add_req_rpair(struct virtqueue *vq, struct virtblk_req *vbr) +{ + struct scatterlist out_hdr, in_hdr, *sgs[3]; + unsigned int num_out = 0, num_in = 0; + + if (vbr_is_bidirectional(vbr)) + return virtblk_add_req_bidirectional_rpair(vq, vbr, + vbr->sg_table.sgl, vbr->sg_table_extra.sgl); + + sg_init_one(&out_hdr, &vbr->out_hdr, sizeof(vbr->out_hdr)); + sgs[num_out++] = &out_hdr; + + if (vbr->sg_table.nents) { + if (vbr->out_hdr.type & cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_OUT)) + sgs[num_out++] = vbr->sg_table.sgl; + else + sgs[num_out + num_in++] = vbr->sg_table.sgl; + } + + sg_init_one(&in_hdr, &vbr->in_hdr.status, vbr->in_hdr_len); + sgs[num_out + num_in++] = &in_hdr; + + return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC); +} +#endif + static int virtblk_add_req_bidirectional(struct virtqueue *vq, struct virtblk_req *vbr, struct scatterlist *data_sg, struct scatterlist *data_sg_extra) @@ -339,6 +409,93 @@ static void virtblk_cleanup_cmd(struct request *req) kfree(bvec_virt(&req->special_vec)); } +#ifdef CONFIG_VIRTIO_BLK_RING_PAIR +static blk_status_t virtblk_setup_cmd_rpair(struct virtio_device *vdev, + struct request *req, + struct virtblk_req *vbr) +{ + size_t in_hdr_len = sizeof(vbr->in_hdr.status); + bool unmap = false; + u32 type; + u64 sector = 0; + + if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) && op_is_zone_mgmt(req_op(req))) + return BLK_STS_NOTSUPP; + + switch (req_op(req)) { + case REQ_OP_READ: + type = VIRTIO_BLK_T_IN; + sector = blk_rq_pos(req); + break; + case REQ_OP_WRITE: + type = VIRTIO_BLK_T_OUT; + sector = 
blk_rq_pos(req); + break; + case REQ_OP_FLUSH: + type = VIRTIO_BLK_T_FLUSH; + break; + case REQ_OP_DISCARD: + type = VIRTIO_BLK_T_DISCARD; + break; + case REQ_OP_WRITE_ZEROES: + type = VIRTIO_BLK_T_WRITE_ZEROES; + unmap = !(req->cmd_flags & REQ_NOUNMAP); + break; + case REQ_OP_SECURE_ERASE: + type = VIRTIO_BLK_T_SECURE_ERASE; + break; + case REQ_OP_ZONE_OPEN: + type = VIRTIO_BLK_T_ZONE_OPEN; + sector = blk_rq_pos(req); + break; + case REQ_OP_ZONE_CLOSE: + type = VIRTIO_BLK_T_ZONE_CLOSE; + sector = blk_rq_pos(req); + break; + case REQ_OP_ZONE_FINISH: + type = VIRTIO_BLK_T_ZONE_FINISH; + sector = blk_rq_pos(req); + break; + case REQ_OP_ZONE_APPEND: + type = VIRTIO_BLK_T_ZONE_APPEND; + sector = blk_rq_pos(req); + in_hdr_len = sizeof(vbr->in_hdr.zone_append); + break; + case REQ_OP_ZONE_RESET: + type = VIRTIO_BLK_T_ZONE_RESET; + sector = blk_rq_pos(req); + break; + case REQ_OP_ZONE_RESET_ALL: + type = VIRTIO_BLK_T_ZONE_RESET_ALL; + break; + case REQ_OP_DRV_IN: + case REQ_OP_DRV_OUT: + /* + * Out header has already been prepared by the caller (virtblk_get_id() + * or virtblk_submit_zone_report()), nothing to do here. + */ + return 0; + default: + WARN_ON_ONCE(1); + return BLK_STS_IOERR; + } + + /* Set fields for non-REQ_OP_DRV_IN request types */ + vbr->in_hdr_len = in_hdr_len; + vbr->out_hdr.type = cpu_to_virtio32(vdev, type); + vbr->out_hdr.sector = cpu_to_virtio64(vdev, sector); + vbr->out_hdr.ioprio = cpu_to_virtio32(vdev, req_get_ioprio(req)); + + if (type == VIRTIO_BLK_T_DISCARD || type == VIRTIO_BLK_T_WRITE_ZEROES || + type == VIRTIO_BLK_T_SECURE_ERASE) { + if (virtblk_setup_discard_write_zeroes_erase(req, unmap)) + return BLK_STS_RESOURCE; + } + + return 0; +} +#endif + static blk_status_t virtblk_setup_cmd(struct virtio_device *vdev, struct request *req, struct virtblk_req *vbr) @@ -450,6 +607,37 @@ static inline void virtblk_request_done(struct request *req) blk_mq_end_request(req, status); } +#ifdef CONFIG_VIRTIO_BLK_RING_PAIR +static void virtblk_done_rpair(struct virtqueue *vq) +{ + struct virtio_blk *vblk = vq->vdev->priv; + bool req_done = false; + int qid = vq->index; + struct virtblk_req *vbr; + unsigned long flags; + unsigned int len; + + spin_lock_irqsave(&vblk->vqs[qid].lock, flags); + do { + virtqueue_disable_cb(vq); + while ((vbr = virtqueue_get_buf(vblk->vqs[qid].vq, &len)) != NULL) { + struct request *req = blk_mq_rq_from_pdu(vbr); + + if (likely(!blk_should_fake_timeout(req->q))) + blk_mq_complete_request(req); + req_done = true; + } + if (unlikely(virtqueue_is_broken(vq))) + break; + } while (!virtqueue_enable_cb(vq)); + + /* In case queue is stopped waiting for more buffers. 
*/ + if (req_done) + blk_mq_start_stopped_hw_queues(vblk->disk->queue, true); + spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags); +} +#endif + static void virtblk_done(struct virtqueue *vq) { struct virtio_blk *vblk = vq->vdev->priv; @@ -506,6 +694,29 @@ static blk_status_t virtblk_fail_to_queue(struct request *req, int rc) } } +#ifdef CONFIG_VIRTIO_BLK_RING_PAIR +static blk_status_t virtblk_prep_rq_rpair(struct blk_mq_hw_ctx *hctx, + struct virtio_blk *vblk, + struct request *req, + struct virtblk_req *vbr) +{ + blk_status_t status; + int num; + + status = virtblk_setup_cmd_rpair(vblk->vdev, req, vbr); + if (unlikely(status)) + return status; + + num = virtblk_map_data(hctx, req, vbr); + if (unlikely(num < 0)) + return virtblk_fail_to_queue(req, -ENOMEM); + vbr->sg_table.nents = num; + + blk_mq_start_request(req); + + return BLK_STS_OK; +} +#endif static blk_status_t virtblk_prep_rq(struct blk_mq_hw_ctx *hctx, struct virtio_blk *vblk, struct request *req, @@ -528,6 +739,111 @@ static blk_status_t virtblk_prep_rq(struct blk_mq_hw_ctx *hctx, return BLK_STS_OK; } +#ifdef CONFIG_VIRTIO_BLK_RING_PAIR +static blk_status_t virtio_queue_rq_rpair(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct virtio_blk *vblk = hctx->queue->queuedata; + struct request *req = bd->rq; + struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); + unsigned long flags; + int qid = hctx->queue_num; + bool notify = false; + blk_status_t status; + int err; + + status = virtblk_prep_rq_rpair(hctx, vblk, req, vbr); + if (unlikely(status)) + return status; + + spin_lock_irqsave(&vblk->vqs[qid].lock, flags); + err = virtblk_add_req_rpair(vblk->vqs[qid].vq, vbr); + if (err) { + virtqueue_kick(vblk->vqs[qid].vq); + /* Don't stop the queue if -ENOMEM: we may have failed to + * bounce the buffer due to global resource outage. 
+ */ + if (err == -ENOSPC) + blk_mq_stop_hw_queue(hctx); + spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags); + virtblk_unmap_data(req, vbr); + return virtblk_fail_to_queue(req, err); + } + + if (bd->last && virtqueue_kick_prepare(vblk->vqs[qid].vq)) + notify = true; + spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags); + + if (notify) + virtqueue_notify(vblk->vqs[qid].vq); + return BLK_STS_OK; +} + +static bool virtblk_prep_rq_batch_rpair(struct request *req) +{ + struct virtio_blk *vblk = req->mq_hctx->queue->queuedata; + struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); + + req->mq_hctx->tags->rqs[req->tag] = req; + + return virtblk_prep_rq_rpair(req->mq_hctx, vblk, req, vbr) == BLK_STS_OK; +} + +static void virtblk_add_req_batch_rpair(struct virtio_blk_vq *vq, + struct request **rqlist) +{ + struct request *req; + unsigned long flags; + bool kick; + + spin_lock_irqsave(&vq->lock, flags); + + while ((req = rq_list_pop(rqlist))) { + struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); + int err; + + err = virtblk_add_req_rpair(vq->vq, vbr); + if (err) { + virtblk_unmap_data(req, vbr); + virtblk_cleanup_cmd(req); + blk_mq_requeue_request(req, true); + } + } + + kick = virtqueue_kick_prepare(vq->vq); + spin_unlock_irqrestore(&vq->lock, flags); + + if (kick) + virtqueue_notify(vq->vq); +} + +static void virtio_queue_rqs_rpair(struct request **rqlist) +{ + struct request *submit_list = NULL; + struct request *requeue_list = NULL; + struct request **requeue_lastp = &requeue_list; + struct virtio_blk_vq *vq = NULL; + struct request *req; + + while ((req = rq_list_pop(rqlist))) { + struct virtio_blk_vq *this_vq = get_virtio_blk_vq_rpair(req->mq_hctx); + + if (vq && vq != this_vq) + virtblk_add_req_batch_rpair(vq, &submit_list); + vq = this_vq; + + if (virtblk_prep_rq_batch_rpair(req)) + rq_list_add(&submit_list, req); /* reverse order */ + else + rq_list_add_tail(&requeue_lastp, req); + } + + if (vq) + virtblk_add_req_batch_rpair(vq, &submit_list); + *rqlist = requeue_list; +} +#endif + static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { @@ -1117,6 +1433,94 @@ static void virtblk_config_changed(struct virtio_device *vdev) queue_work(virtblk_wq, &vblk->config_work); } +#ifdef CONFIG_VIRTIO_BLK_RING_PAIR +bool virtblk_rpair_disable; +module_param_named(rpair_disable, virtblk_rpair_disable, bool, 0444); +MODULE_PARM_DESC(rpair_disable, "disable vring pair detective. 
(0=Not [default], 1=Yes)"); + +static int init_vq_rpair(struct virtio_blk *vblk) +{ + int err; + unsigned short i; + vq_callback_t **callbacks; + const char **names; + struct virtqueue **vqs; + unsigned short num_vqs; + unsigned short num_poll_vqs; + struct virtio_device *vdev = vblk->vdev; + struct irq_affinity desc = { 0, }; + + err = virtio_cread_feature(vdev, VIRTIO_BLK_F_MQ, + struct virtio_blk_config, num_queues, + &num_vqs); + if (err) + num_vqs = 1; + + if (!err && !num_vqs) { + dev_err(&vdev->dev, "MQ advertised but zero queues reported\n"); + return -EINVAL; + } + + num_vqs = min_t(unsigned int, + min_not_zero(num_request_queues, nr_cpu_ids), + num_vqs); + + num_poll_vqs = min_t(unsigned int, poll_queues, num_vqs - 1); + + vblk->io_queues[HCTX_TYPE_DEFAULT] = num_vqs - num_poll_vqs; + vblk->io_queues[HCTX_TYPE_READ] = 0; + vblk->io_queues[HCTX_TYPE_POLL] = num_poll_vqs; + + dev_info(&vdev->dev, "%d/%d/%d default/read/poll queues\n", + vblk->io_queues[HCTX_TYPE_DEFAULT], + vblk->io_queues[HCTX_TYPE_READ], + vblk->io_queues[HCTX_TYPE_POLL]); + + vblk->vqs = kmalloc_array(num_vqs, sizeof(*vblk->vqs), GFP_KERNEL); + if (!vblk->vqs) + return -ENOMEM; + + names = kmalloc_array(num_vqs, sizeof(*names), GFP_KERNEL); + callbacks = kmalloc_array(num_vqs, sizeof(*callbacks), GFP_KERNEL); + vqs = kmalloc_array(num_vqs, sizeof(*vqs), GFP_KERNEL); + if (!names || !callbacks || !vqs) { + err = -ENOMEM; + goto out; + } + + for (i = 0; i < num_vqs - num_poll_vqs; i++) { + callbacks[i] = virtblk_done; + snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req.%u", i); + names[i] = vblk->vqs[i].name; + } + + for (; i < num_vqs; i++) { + callbacks[i] = NULL; + snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req_poll.%u", i); + names[i] = vblk->vqs[i].name; + } + + /* Discover virtqueues and write information to configuration. 
*/ err = virtio_find_vqs(vdev, num_vqs, vqs, callbacks, names, &desc); + if (err) + goto out; + + for (i = 0; i < num_vqs; i++) { + spin_lock_init(&vblk->vqs[i].lock); + vblk->vqs[i].vq = vqs[i]; + } + vblk->num_vqs = num_vqs; + +out: + kfree(vqs); + kfree(callbacks); + kfree(names); + if (err) + kfree(vblk->vqs); + return err; +} +#endif + static int init_vq(struct virtio_blk *vblk) { int err; @@ -1129,6 +1533,14 @@ static int init_vq(struct virtio_blk *vblk) struct virtio_device *vdev = vblk->vdev; struct irq_affinity desc = { 0, }; +#ifdef CONFIG_VIRTIO_BLK_RING_PAIR + /* if virtblk_rpair_disabe = 1, init_vq() should fall back + * to orginal use, so err needs a positive initial value + */ + if (!virtblk_rpair_disable) + return init_vq_rpair(vblk); + +#endif err = virtio_cread_feature(vdev, VIRTIO_BLK_F_MQ, struct virtio_blk_config, num_queues, &num_vqs); @@ -1359,6 +1771,37 @@ static void virtblk_complete_batch(struct io_comp_batch *iob) blk_mq_end_request_batch(iob); } +#ifdef CONFIG_VIRTIO_BLK_RING_PAIR +static int virtblk_poll_rpair(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) +{ + struct virtio_blk *vblk = hctx->queue->queuedata; + struct virtio_blk_vq *vq = get_virtio_blk_vq_rpair(hctx); + struct virtblk_req *vbr; + unsigned long flags; + unsigned int len; + int found = 0; + + spin_lock_irqsave(&vq->lock, flags); + + while ((vbr = virtqueue_get_buf(vq->vq, &len)) != NULL) { + struct request *req = blk_mq_rq_from_pdu(vbr); + + found++; + if (!blk_mq_complete_request_remote(req) && + !blk_mq_add_to_batch(req, iob, virtblk_vbr_status(vbr), + virtblk_complete_batch)) + virtblk_request_done(req); + } + + if (found) + blk_mq_start_stopped_hw_queues(vblk->disk->queue, true); + + spin_unlock_irqrestore(&vq->lock, flags); + + return found; +} +#endif + static int virtblk_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) { struct virtio_blk *vblk = hctx->queue->queuedata; @@ -1388,6 +1831,17 @@ static int virtblk_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) return found; } +#ifdef CONFIG_VIRTIO_BLK_RING_PAIR +static const struct blk_mq_ops virtio_mq_pair_ops = { + .queue_rq = virtio_queue_rq_rpair, + .queue_rqs = virtio_queue_rqs_rpair, + .commit_rqs = virtio_commit_rqs, + .complete = virtblk_request_done, + .map_queues = virtblk_map_queues, + .poll = virtblk_poll_rpair, +}; +#endif + static const struct blk_mq_ops virtio_mq_ops = { .queue_rq = virtio_queue_rq, .queue_rqs = virtio_queue_rqs, -- Gitee From 1ef00e98c202fb3aeaf485edd2767ccb01037cce Mon Sep 17 00:00:00 2001 From: Ferry Meng Date: Sat, 10 May 2025 09:30:00 +0800 Subject: [PATCH 10/28] anolis: virtio-blk: premap DMA buf ANBZ: #20938 If the vring pair feature is enabled, the mapping of the sgs is managed by the virtio-blk driver itself. The backend responds on the SQ vring before the I/O is fully executed (the completion comes back later via the CQ interrupt), so we can recycle the SQ slot at that point but cannot yet unmap the request. Besides, we maintain scatterlists for the header and status in virtblk_req. The DMA buffers are unmapped once the CQ returns the I/O. 
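The diff below applies the PATCH 07 mapping helpers per scatterlist entry. As a minimal illustration of that pattern (names illustrative, error handling trimmed), each entry is premapped and checked, and is unmapped only after the CQ has returned the request; the full driver-side loops follow in the diff:

/* Sketch of the per-sg premap/unmap pairing this patch relies on. */
static int demo_premap_sg(struct virtqueue *vq, struct scatterlist *sg,
			  enum dma_data_direction dir)
{
	sg_dma_address(sg) = virtqueue_dma_map_page_attrs(vq, sg_page(sg),
							  sg->offset,
							  sg->length, dir, 0);
	if (virtqueue_dma_mapping_error(vq, sg_dma_address(sg)))
		return -ENOMEM;
	sg_dma_len(sg) = sg->length;
	return 0;
}

static void demo_unmap_sg(struct virtqueue *vq, struct scatterlist *sg,
			  enum dma_data_direction dir)
{
	/* only safe once the CQ completion for the request has arrived */
	virtqueue_dma_unmap_page_attrs(vq, sg_dma_address(sg), sg->length,
				       dir, 0);
}
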
Signed-off-by: Ferry Meng --- drivers/block/virtio_blk.c | 134 +++++++++++++++++++++++++++++++++---- 1 file changed, 122 insertions(+), 12 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 9c84aae982d2..1253b1cc045c 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -136,6 +136,9 @@ struct virtblk_req { size_t in_hdr_len; +#ifdef CONFIG_VIRTIO_BLK_RING_PAIR + struct scatterlist inline_sg[2]; +#endif struct sg_table sg_table; struct sg_table sg_table_extra; struct scatterlist sg[]; @@ -183,12 +186,102 @@ static inline struct virtio_blk_vq *get_virtio_blk_vq_rpair(struct blk_mq_hw_ctx return vq; } +static int virtblk_map_sg(struct virtqueue *vq, struct scatterlist *sglist, + enum dma_data_direction dir) +{ + struct scatterlist *sg, *last; + + for (sg = sglist; sg; sg = sg_next(sg)) { + sg_dma_address(sg) = virtqueue_dma_map_page_attrs(vq, sg_page(sg), + sg->offset, sg->length, dir, 0); + sg_dma_len(sg) = sg->length; + if (virtqueue_dma_mapping_error(vq, sg->dma_address)) { + last = sg; + goto out; + } + } + return 0; +out: + for (sg = sglist; sg && sg != last; sg = sg_next(sg)) + virtqueue_dma_unmap_page_attrs(vq, sg->dma_address, + sg->length, dir, 0); + return -ENOMEM; +} + +static void virtblk_unmap_sg(struct virtqueue *vq, struct scatterlist *sglist, + enum dma_data_direction dir) +{ + struct scatterlist *sg; + + for (sg = sglist; sg; sg = sg_next(sg)) + virtqueue_dma_unmap_page_attrs(vq, sg->dma_address, + sg->length, dir, 0); +} + +static int virtblk_rq_map(struct virtqueue *vq, struct scatterlist *sgs[], + unsigned int out_sgs, unsigned int in_sgs) +{ + int i, ret, done_out_sgs, done_in_sgs; + + for (i = 0; i < out_sgs; i++) { + ret = virtblk_map_sg(vq, sgs[i], DMA_TO_DEVICE); + if (ret < 0) { + done_out_sgs = i; + goto cleanup_out_map; + } + } + + for (; i < out_sgs + in_sgs; i++) { + ret = virtblk_map_sg(vq, sgs[i], DMA_FROM_DEVICE); + if (ret < 0) { + done_out_sgs = out_sgs; + done_in_sgs = i - out_sgs; + goto cleanup_in_map; + } + } + return 0; + +cleanup_in_map: + for (i = out_sgs; i < out_sgs + done_in_sgs; i++) + virtblk_unmap_sg(vq, sgs[i], DMA_FROM_DEVICE); +cleanup_out_map: + for (i = 0; i < done_out_sgs; i++) + virtblk_unmap_sg(vq, sgs[i], DMA_TO_DEVICE); + return -ENOMEM; +} + +static void virtblk_rq_unmap(struct virtqueue *vq, struct virtblk_req *vbr) +{ + struct request *req = blk_mq_rq_from_pdu(vbr); + int dir; + + virtblk_unmap_sg(vq, &vbr->inline_sg[0], DMA_TO_DEVICE); + virtblk_unmap_sg(vq, &vbr->inline_sg[1], DMA_FROM_DEVICE); + + if (!blk_rq_nr_phys_segments(req)) + return; + + if (vbr_is_bidirectional(vbr)) { + virtblk_unmap_sg(vq, vbr->sg_table.sgl, DMA_TO_DEVICE); + virtblk_unmap_sg(vq, vbr->sg_table_extra.sgl, DMA_FROM_DEVICE); + } else { + if (vbr->out_hdr.type & cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_OUT)) + dir = DMA_TO_DEVICE; + else + dir = DMA_FROM_DEVICE; + virtblk_unmap_sg(vq, vbr->sg_table.sgl, dir); + } +} + static int virtblk_add_req_bidirectional_rpair(struct virtqueue *vq, struct virtblk_req *vbr, struct scatterlist *data_sg, struct scatterlist *data_sg_extra) { - struct scatterlist out_hdr, in_hdr, *sgs[4]; + struct scatterlist *sgs[4]; + struct scatterlist *out_hdr = &vbr->inline_sg[0]; + struct scatterlist *in_hdr = &vbr->inline_sg[1]; unsigned int num_out = 0, num_in = 0; + int ret; /* * vritblk_add_req use 'bool' have_data, while we use int num to @@ -198,27 +291,37 @@ static int virtblk_add_req_bidirectional_rpair(struct virtqueue *vq, if ((sg_nents(data_sg) == 0) || 
(sg_nents(data_sg_extra) == 0)) return -EINVAL; - sg_init_one(&out_hdr, &vbr->out_hdr, sizeof(vbr->out_hdr)); - sg_init_one(&in_hdr, &vbr->in_hdr.status, vbr->in_hdr_len); - sgs[num_out++] = &out_hdr; + sg_init_one(out_hdr, &vbr->out_hdr, sizeof(vbr->out_hdr)); + sg_init_one(in_hdr, &vbr->in_hdr.status, vbr->in_hdr_len); + sgs[num_out++] = out_hdr; sgs[num_out++] = data_sg; sgs[num_out + num_in++] = data_sg_extra; - sgs[num_out + num_in++] = &in_hdr; + sgs[num_out + num_in++] = in_hdr; - return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC); + ret = virtblk_rq_map(vq, sgs, num_out, num_in); + if (ret < 0) + return ret; + + ret = virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC); + if (ret < 0) + virtblk_rq_unmap(vq, vbr); + return ret; } static int virtblk_add_req_rpair(struct virtqueue *vq, struct virtblk_req *vbr) { - struct scatterlist out_hdr, in_hdr, *sgs[3]; + struct scatterlist *sgs[3]; + struct scatterlist *out_hdr = &vbr->inline_sg[0]; + struct scatterlist *in_hdr = &vbr->inline_sg[1]; unsigned int num_out = 0, num_in = 0; + int ret; if (vbr_is_bidirectional(vbr)) return virtblk_add_req_bidirectional_rpair(vq, vbr, vbr->sg_table.sgl, vbr->sg_table_extra.sgl); - sg_init_one(&out_hdr, &vbr->out_hdr, sizeof(vbr->out_hdr)); - sgs[num_out++] = &out_hdr; + sg_init_one(out_hdr, &vbr->out_hdr, sizeof(vbr->out_hdr)); + sgs[num_out++] = out_hdr; if (vbr->sg_table.nents) { if (vbr->out_hdr.type & cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_OUT)) @@ -227,10 +330,17 @@ static int virtblk_add_req_rpair(struct virtqueue *vq, struct virtblk_req *vbr) sgs[num_out + num_in++] = vbr->sg_table.sgl; } - sg_init_one(&in_hdr, &vbr->in_hdr.status, vbr->in_hdr_len); - sgs[num_out + num_in++] = &in_hdr; + sg_init_one(in_hdr, &vbr->in_hdr.status, vbr->in_hdr_len); + sgs[num_out + num_in++] = in_hdr; - return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC); + ret = virtblk_rq_map(vq, sgs, num_out, num_in); + if (ret < 0) + return ret; + + ret = virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC); + if (ret < 0) + virtblk_rq_unmap(vq, vbr); + return ret; } #endif -- Gitee From 8318a3524e6684be4b3fe3ead56e5eb2a1b6139b Mon Sep 17 00:00:00 2001 From: Ferry Meng Date: Sat, 10 May 2025 09:30:00 +0800 Subject: [PATCH 11/28] anolis: virtio-blk/virtio_ring: separate ring_pair add_sgs functions ANBZ: #20938 This is a preparation patch for the following two patches. We want to separate "add_sgs" related functions for vring pair into individual functions, which will be modified later. Please note that in the current version, our support for ring_pair is limited to split_queue with indirect enabled. ---- ANCK-6.6 add virtqueue_add_sgs_premapped() because there is no vq->premmaped flag. 
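For clarity, the three entry points differ only in who owns the DMA mapping; the callers
below are hypothetical and shown for illustration only (the real users are the virtio-blk
changes later in this series):

	/* legacy path: virtio_ring maps and unmaps the buffers itself */
	err = virtqueue_add_sgs(vq, sgs, out_sgs, in_sgs, data, GFP_ATOMIC);

	/* premapped path: the caller already did the DMA mapping, so the core
	 * picks the addresses up from sg_dma_address() instead of mapping again
	 */
	err = virtqueue_add_sgs_premapped(vq, sgs, out_sgs, in_sgs, data, GFP_ATOMIC);

	/* ring-pair path: routes into the dedicated split-ring implementation
	 * (virtqueue_add_split_rpair) that the next patches keep extending
	 */
	err = virtqueue_add_sgs_rpair(vq, sgs, out_sgs, in_sgs, data, GFP_ATOMIC);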
Signed-off-by: Ferry Meng --- drivers/virtio/virtio_ring.c | 282 +++++++++++++++++++++++++++++++++++ include/linux/virtio.h | 14 ++ 2 files changed, 296 insertions(+) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 34f0d2d305e8..d96d56497f8a 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -745,6 +745,197 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, return -ENOMEM; } +static inline int virtqueue_add_split_rpair(struct virtqueue *_vq, + struct scatterlist *sgs[], + unsigned int total_sg, + unsigned int out_sgs, + unsigned int in_sgs, + void *data, + void *ctx, + bool premapped, + gfp_t gfp) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + struct vring_desc_extra *extra; + struct scatterlist *sg; + struct vring_desc *desc; + unsigned int i, n, avail, descs_used, prev, err_idx; + int head; + bool indirect; + + START_USE(vq); + + BUG_ON(data == NULL); + BUG_ON(ctx && vq->indirect); + + if (unlikely(vq->broken)) { + END_USE(vq); + return -EIO; + } + + LAST_ADD_TIME_UPDATE(vq); + + BUG_ON(total_sg == 0); + + head = vq->free_head; + + if (virtqueue_use_indirect(vq, total_sg)) + desc = alloc_indirect_split(_vq, total_sg, gfp); + else { + desc = NULL; + WARN_ON_ONCE(total_sg > vq->split.vring.num && !vq->indirect); + } + + if (desc) { + /* Use a single buffer which doesn't continue */ + indirect = true; + /* Set up rest to use this indirect table. */ + i = 0; + descs_used = 1; + extra = (struct vring_desc_extra *)&desc[total_sg]; + } else { + indirect = false; + desc = vq->split.vring.desc; + extra = vq->split.desc_extra; + i = head; + descs_used = total_sg; + } + + if (unlikely(vq->vq.num_free < descs_used)) { + pr_debug("Can't add buf len %i - avail = %i\n", + descs_used, vq->vq.num_free); + /* FIXME: for historical reasons, we force a notify here if + * there are outgoing parts to the buffer. Presumably the + * host should service the ring ASAP. + */ + if (out_sgs) + vq->notify(&vq->vq); + if (indirect) + kfree(desc); + END_USE(vq); + return -ENOSPC; + } + + for (n = 0; n < out_sgs; n++) { + for (sg = sgs[n]; sg; sg = sg_next(sg)) { + dma_addr_t addr; + u32 len; + + if (vring_map_one_sg(vq, sg, DMA_TO_DEVICE, &addr, &len, premapped)) + goto unmap_release; + + prev = i; + /* Note that we trust indirect descriptor + * table since it use stream DMA mapping. + */ + i = virtqueue_add_desc_split(_vq, desc, extra, i, addr, len, + VRING_DESC_F_NEXT, + premapped); + } + } + for (; n < (out_sgs + in_sgs); n++) { + for (sg = sgs[n]; sg; sg = sg_next(sg)) { + dma_addr_t addr; + u32 len; + + if (vring_map_one_sg(vq, sg, DMA_FROM_DEVICE, &addr, &len, premapped)) + goto unmap_release; + + prev = i; + /* Note that we trust indirect descriptor + * table since it use stream DMA mapping. + */ + i = virtqueue_add_desc_split(_vq, desc, extra, i, addr, len, + VRING_DESC_F_NEXT | + VRING_DESC_F_WRITE, + premapped); + } + } + /* Last one doesn't continue. */ + desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT); + if (!indirect && vring_need_unmap_buffer(vq, &extra[prev])) + vq->split.desc_extra[prev & (vq->split.vring.num - 1)].flags &= + ~VRING_DESC_F_NEXT; + + if (indirect) { + /* Now that the indirect table is filled in, map it. 
*/ + dma_addr_t addr = vring_map_single( + vq, desc, total_sg * sizeof(struct vring_desc), + DMA_TO_DEVICE); + if (vring_mapping_error(vq, addr)) + goto unmap_release; + + virtqueue_add_desc_split(_vq, vq->split.vring.desc, + vq->split.desc_extra, + head, addr, + total_sg * sizeof(struct vring_desc), + VRING_DESC_F_INDIRECT, false); + } + + /* We're using some buffers from the free list. */ + vq->vq.num_free -= descs_used; + + /* Update free pointer */ + if (indirect) + vq->free_head = vq->split.desc_extra[head].next; + else + vq->free_head = i; + + /* Store token and indirect buffer state. */ + vq->split.desc_state[head].data = data; + if (indirect) + vq->split.desc_state[head].indir_desc = desc; + else + vq->split.desc_state[head].indir_desc = ctx; + + /* Put entry in available array (but don't update avail->idx until they + * do sync). + */ + avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1); + vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head); + + /* Descriptors and available array need to be set before we expose the + * new available array entries. + */ + virtio_wmb(vq->weak_barriers); + vq->split.avail_idx_shadow++; + vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev, + vq->split.avail_idx_shadow); + vq->num_added++; + + pr_debug("Added buffer head %i to %p\n", head, vq); + END_USE(vq); + + /* This is very unlikely, but theoretically possible. Kick + * just in case. + */ + if (unlikely(vq->num_added == (1 << 16) - 1)) + virtqueue_kick(_vq); + + return 0; + +unmap_release: + err_idx = i; + + if (indirect) + i = 0; + else + i = head; + + for (n = 0; n < total_sg; n++) { + if (i == err_idx) + break; + + i = vring_unmap_one_split(vq, &extra[i]); + } + + if (indirect) + kfree(desc); + + END_USE(vq); + return -ENOMEM; +} + static bool virtqueue_kick_prepare_split(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); @@ -2238,6 +2429,24 @@ static inline int virtqueue_add(struct virtqueue *_vq, out_sgs, in_sgs, data, ctx, premapped, gfp); } +/* + * Generic functions and exported symbols for ringpair mode. + */ + +static inline int virtqueue_add_rpair(struct virtqueue *_vq, + struct scatterlist *sgs[], + unsigned int total_sg, + unsigned int out_sgs, + unsigned int in_sgs, + void *data, + void *ctx, + bool premapped, + gfp_t gfp) +{ + return virtqueue_add_split_rpair(_vq, sgs, total_sg, + out_sgs, in_sgs, data, ctx, premapped, gfp); +} + /** * virtqueue_add_sgs - expose buffers to other end * @_vq: the struct virtqueue we're talking about. @@ -2273,6 +2482,79 @@ int virtqueue_add_sgs(struct virtqueue *_vq, } EXPORT_SYMBOL_GPL(virtqueue_add_sgs); +/** + * virtqueue_add_sgs_premapped - expose buffers to other end + * @_vq: the struct virtqueue we're talking about. + * @sgs: array of terminated scatterlists. + * @out_sgs: the number of scatterlists readable by other side + * @in_sgs: the number of scatterlists which are writable (after readable ones) + * @data: the token identifying the buffer. + * @gfp: how to do memory allocations (if necessary). + * + * Caller must ensure we don't call this with other virtqueue operations + * at the same time (except where noted). + * Difference: add sgs with premapped buffers + * + * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO). + */ +int virtqueue_add_sgs_premapped(struct virtqueue *_vq, + struct scatterlist *sgs[], + unsigned int out_sgs, + unsigned int in_sgs, + void *data, + gfp_t gfp) +{ + unsigned int i, total_sg = 0; + + /* Count them first. 
*/ + for (i = 0; i < out_sgs + in_sgs; i++) { + struct scatterlist *sg; + + for (sg = sgs[i]; sg; sg = sg_next(sg)) + total_sg++; + } + return virtqueue_add(_vq, sgs, total_sg, out_sgs, in_sgs, + data, NULL, true, gfp); +} +EXPORT_SYMBOL_GPL(virtqueue_add_sgs_premapped); + +/** + * virtqueue_add_sgs_rpair - expose buffers to other end + * @_vq: the struct virtqueue we're talking about. + * @sgs: array of terminated scatterlists. + * @out_sgs: the number of scatterlists readable by other side + * @in_sgs: the number of scatterlists which are writable (after readable ones) + * @data: the token identifying the buffer. + * @gfp: how to do memory allocations (if necessary). + * + * Caller must ensure we don't call this with other virtqueue operations + * at the same time (except where noted). + * + * Only work for ring pair mode + * + * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO). + */ +int virtqueue_add_sgs_rpair(struct virtqueue *_vq, + struct scatterlist *sgs[], + unsigned int out_sgs, + unsigned int in_sgs, + void *data, + gfp_t gfp) +{ + unsigned int i, total_sg = 0; + + /* Count them first. */ + for (i = 0; i < out_sgs + in_sgs; i++) { + struct scatterlist *sg; + + for (sg = sgs[i]; sg; sg = sg_next(sg)) + total_sg++; + } + return virtqueue_add_rpair(_vq, sgs, total_sg, out_sgs, in_sgs, + data, NULL, false, gfp); +} +EXPORT_SYMBOL_GPL(virtqueue_add_sgs_rpair); + /** * virtqueue_add_outbuf - expose output buffers to other end * @vq: the struct virtqueue we're talking about. diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 69832c02adff..781586186b99 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -73,6 +73,20 @@ int virtqueue_add_sgs(struct virtqueue *vq, void *data, gfp_t gfp); +int virtqueue_add_sgs_premapped(struct virtqueue *vq, + struct scatterlist *sgs[], + unsigned int out_sgs, + unsigned int in_sgs, + void *data, + gfp_t gfp); + +int virtqueue_add_sgs_rpair(struct virtqueue *vq, + struct scatterlist *sgs[], + unsigned int out_sgs, + unsigned int in_sgs, + void *data, + gfp_t gfp); + struct device *virtqueue_dma_dev(struct virtqueue *vq); bool virtqueue_kick(struct virtqueue *vq); -- Gitee From 97a172c440509f240fb7c78283d983965244c5ce Mon Sep 17 00:00:00 2001 From: Ferry Meng Date: Sat, 10 May 2025 09:30:00 +0800 Subject: [PATCH 12/28] anolis: virtio-blk/virtio_ring: vring_pair pass indirect descriptor ANBZ: #20938 After enabling the vring pair feature, there is an issue. As indir_desc slot is freed when "submission_queue is finished but completion_queue not response yet", if a error occurs, both 'DRIVER' and 'BACKEND' can't locate 'unfinished I/Os'. Thus we need a method to save indirect_desc message until I/O done. This patch achieve this by passing indir_desc to backend, in other words, let backend save indir_desc. Details are following: When enabled INDIRECT virtqueue and vring pair, L1 indirect descriptor (infact the sg filled into virtqueue) will be also passed with L2. This feature is only needed by virtio-blk, and strongly depends on backend driver support. As a customized solution, the current form is: Original -> Now Content Dir | Contnet Dir out_hdr OUT | out_hdr OUT out_sg1 OUT | L1_desc OUT ... | out_sg1 OUT ... | ... status IN | ... | status IN As we can see, there are two major modifications: 1. total_sg +=1 2. 
L1 descriptor need to be mapped in advance (If error occurred, should call unmap cautiously) Finally, We should emphasize that current version is only compatible with the scene when the following conditions are all satisfied: 1. vring pair feature enabled. 2. virtio-blk with backend driver support. 3. only "split + indirect" virtqueue. 4. only SQ pass indir_desc, CQ not involved. ---- Fix conflict: Due to XuanZhuo's virtio_ring premapped related patches, there is no vq->do_unmap anymore. vring_unmap_one_split() is also changed. In this case, save_indir is used for preventing dma_unmap in ring_pair mode, which should be done in virtio_blk driver. Signed-off-by: Ferry Meng --- drivers/block/virtio_blk.c | 4 +-- drivers/virtio/virtio_ring.c | 70 ++++++++++++++++++++++++++++-------- include/linux/virtio.h | 1 + 3 files changed, 59 insertions(+), 16 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 1253b1cc045c..16bdc2aa6ba6 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -302,7 +302,7 @@ static int virtblk_add_req_bidirectional_rpair(struct virtqueue *vq, if (ret < 0) return ret; - ret = virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC); + ret = virtqueue_add_sgs_rpair(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC); if (ret < 0) virtblk_rq_unmap(vq, vbr); return ret; @@ -337,7 +337,7 @@ static int virtblk_add_req_rpair(struct virtqueue *vq, struct virtblk_req *vbr) if (ret < 0) return ret; - ret = virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC); + ret = virtqueue_add_sgs_rpair(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC); if (ret < 0) virtblk_rq_unmap(vq, vbr); return ret; diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index d96d56497f8a..bd3d6dacfeab 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -181,6 +181,11 @@ struct vring_virtqueue { /* Host publishes avail event idx */ bool event; + /* If enable vring pair, Virtqueue will save the indirect desc + * pointer and avoid the pre-unmap. + */ + bool save_indir; + /* Head of free buffer list. */ unsigned int free_head; /* Number we've added since last sync. */ @@ -481,7 +486,7 @@ static unsigned int vring_unmap_one_split(const struct vring_virtqueue *vq, flags = extra->flags; if (flags & VRING_DESC_F_INDIRECT) { - if (!vq->use_dma_api) + if (!vq->use_dma_api || vq->save_indir) goto out; dma_unmap_single(vring_dma_dev(vq), @@ -490,7 +495,7 @@ static unsigned int vring_unmap_one_split(const struct vring_virtqueue *vq, (flags & VRING_DESC_F_WRITE) ? 
DMA_FROM_DEVICE : DMA_TO_DEVICE); } else { - if (!vring_need_unmap_buffer(vq, extra)) + if (vq->save_indir || !vring_need_unmap_buffer(vq, extra)) goto out; dma_unmap_page(vring_dma_dev(vq), @@ -762,6 +767,7 @@ static inline int virtqueue_add_split_rpair(struct virtqueue *_vq, unsigned int i, n, avail, descs_used, prev, err_idx; int head; bool indirect; + dma_addr_t l1_addr; START_USE(vq); @@ -779,9 +785,10 @@ static inline int virtqueue_add_split_rpair(struct virtqueue *_vq, head = vq->free_head; - if (virtqueue_use_indirect(vq, total_sg)) + if (virtqueue_use_indirect(vq, total_sg)) { + total_sg += 1; desc = alloc_indirect_split(_vq, total_sg, gfp); - else { + } else { desc = NULL; WARN_ON_ONCE(total_sg > vq->split.vring.num && !vq->indirect); } @@ -816,6 +823,14 @@ static inline int virtqueue_add_split_rpair(struct virtqueue *_vq, return -ENOSPC; } + if (indirect && vq->save_indir) { + l1_addr = vring_map_single(vq, desc, + total_sg * sizeof(struct vring_desc), + DMA_TO_DEVICE); + if (vring_mapping_error(vq, l1_addr)) + goto unmap_release; + } + for (n = 0; n < out_sgs; n++) { for (sg = sgs[n]; sg; sg = sg_next(sg)) { dma_addr_t addr; @@ -832,6 +847,12 @@ static inline int virtqueue_add_split_rpair(struct virtqueue *_vq, VRING_DESC_F_NEXT, premapped); } + if ((n == 0) && indirect && vq->save_indir) { + prev = i; + i = virtqueue_add_desc_split(_vq, desc, extra, i, l1_addr, + total_sg * sizeof(struct vring_desc), + VRING_DESC_F_NEXT, premapped); + } } for (; n < (out_sgs + in_sgs); n++) { for (sg = sgs[n]; sg; sg = sg_next(sg)) { @@ -858,16 +879,17 @@ static inline int virtqueue_add_split_rpair(struct virtqueue *_vq, ~VRING_DESC_F_NEXT; if (indirect) { - /* Now that the indirect table is filled in, map it. */ - dma_addr_t addr = vring_map_single( - vq, desc, total_sg * sizeof(struct vring_desc), - DMA_TO_DEVICE); - if (vring_mapping_error(vq, addr)) - goto unmap_release; - + if (!vq->save_indir) { + /* Now that the indirect table is filled in, map it. 
*/ + l1_addr = vring_map_single( + vq, desc, total_sg * sizeof(struct vring_desc), + DMA_TO_DEVICE); + if (vring_mapping_error(vq, l1_addr)) + goto unmap_release; + } virtqueue_add_desc_split(_vq, vq->split.vring.desc, vq->split.desc_extra, - head, addr, + head, l1_addr, total_sg * sizeof(struct vring_desc), VRING_DESC_F_INDIRECT, false); } @@ -1018,7 +1040,8 @@ static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head, vring_unmap_one_split(vq, &extra[j]); } - kfree(indir_desc); + if (!vq->save_indir) + kfree(indir_desc); vq->split.desc_state[head].indir_desc = NULL; } else if (ctx) { *ctx = vq->split.desc_state[head].indir_desc; @@ -2309,6 +2332,7 @@ static struct virtqueue *vring_create_virtqueue_packed( vq->packed_ring = true; vq->dma_dev = dma_dev; vq->use_dma_api = vring_use_dma_api(vdev); + vq->save_indir = false; vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) && !context; @@ -2551,7 +2575,7 @@ int virtqueue_add_sgs_rpair(struct virtqueue *_vq, total_sg++; } return virtqueue_add_rpair(_vq, sgs, total_sg, out_sgs, in_sgs, - data, NULL, false, gfp); + data, NULL, true, gfp); } EXPORT_SYMBOL_GPL(virtqueue_add_sgs_rpair); @@ -2992,6 +3016,7 @@ static struct virtqueue *__vring_new_virtqueue(unsigned int index, #endif vq->dma_dev = dma_dev; vq->use_dma_api = vring_use_dma_api(vdev); + vq->save_indir = false; vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) && !context; @@ -3122,6 +3147,23 @@ int virtqueue_resize(struct virtqueue *_vq, u32 num, } EXPORT_SYMBOL_GPL(virtqueue_resize); +/** + * virtqueue_set_save_indir - set the vring save_indir + * @_vq: the struct virtqueue we're talking about. + * + * Enable the save_indir mode of the vq. + * + */ +void virtqueue_set_save_indir(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + START_USE(vq); + vq->save_indir = true; + END_USE(vq); +} +EXPORT_SYMBOL_GPL(virtqueue_set_save_indir); + /** * virtqueue_reset - detach and recycle all unused buffers * @_vq: the struct virtqueue we're talking about. diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 781586186b99..96b620e94792 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -120,6 +120,7 @@ const struct vring *virtqueue_get_vring(const struct virtqueue *vq); dma_addr_t virtqueue_get_desc_addr(const struct virtqueue *vq); dma_addr_t virtqueue_get_avail_addr(const struct virtqueue *vq); dma_addr_t virtqueue_get_used_addr(const struct virtqueue *vq); +void virtqueue_set_save_indir(struct virtqueue *_vq); int virtqueue_resize(struct virtqueue *vq, u32 num, void (*recycle)(struct virtqueue *vq, void *buf), -- Gitee From e460012b4eb549c9feacb8c60f64db98ccb03ee8 Mon Sep 17 00:00:00 2001 From: Ferry Meng Date: Sat, 10 May 2025 09:30:00 +0800 Subject: [PATCH 13/28] anolis: virtio-blk: add ring pair support ANBZ: #20938 Provide a new way to use virtqueue named "ring_pair". Two adjacent queues are called a pair of queues, with even numbered queues (e.g 0) are responsible for sending requests (called SQ) and odd queues harvests the requests (called CQ). There are some points: 1.SQ ret only means that backend has gotten the req, not finished. And this queue will not trigger irq. Res_q's ret means the request is done, triggered by irq. 2.How to match CQ's vbr to the real request? Now we reuse ioprio to pass request 'TAG', and 'len' in CQ is used for pass 'TAG' back. 
3.Driver should recycle SQ voluntarily, and shouldn't unmap sgs until I/O really done, which means 'detach_buf_xxx' should be carefully handled. We can't free indir_desc either. 4.According to 3, to free indir_desc at I/O end, we should save it in driver. Correspondingly, driver should do dma_unmap for indir_desc area in 'virtblk_unmap_and_clear_desc'. 5.num_vqs must be Multiple of 2, or ring_pair mode can't be established. ------- This meams that backend handlers need to make corresponding modifications: 1.support basic vring pair. 2.After get request from SQ, update last_used_index directly. 3.Record tag value, ret with 'len' in virtqueue. ------- anck-6.6 fix conflict For CQ, also use virtqueue_add_sgs_rpair() to prefill and refill. This function means "premapped" virtqueue_add_sgs() function. vq->save_indir is set for SQ, and indir will be saved ; flag not set in CQ, and only "premapped = true" effect. So virtqueue_add_sgs_rpair() is not restrict to only setting save_indir. Signed-off-by: Ferry Meng --- drivers/block/virtio_blk.c | 408 ++++++++++++++++++++++++++++++-- drivers/virtio/virtio_ring.c | 23 ++ include/linux/virtio.h | 3 + include/uapi/linux/virtio_blk.h | 8 +- 4 files changed, 415 insertions(+), 27 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 16bdc2aa6ba6..7d385d818197 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -65,6 +65,19 @@ enum virtblk_ring_t { VIRTBLK_RING_CQ = 1, VIRTBLK_RING_NUM = 2 }; + +struct virtblk_cq_req { + struct virtio_blk_outhdr out_hdr; + u8 status; + struct scatterlist inline_sg[2]; + struct scatterlist *sgs[2]; +}; + +struct virtblk_indir_desc { + struct vring_desc *desc; + dma_addr_t dma_addr; + u32 len; +}; #endif struct virtblk_uring_cmd_pdu { @@ -76,6 +89,10 @@ struct virtio_blk_vq { struct virtqueue *vq; spinlock_t lock; char name[VQ_NAME_LEN]; +#ifdef CONFIG_VIRTIO_BLK_RING_PAIR + /* prealloced prefill req for CQ */ + struct virtblk_cq_req *cq_req; +#endif } ____cacheline_aligned_in_smp; struct virtio_blk { @@ -113,6 +130,12 @@ struct virtio_blk { /* For passthrough cmd */ struct cdev cdev; struct device cdev_device; + +#ifdef CONFIG_VIRTIO_BLK_RING_PAIR + bool ring_pair; + /* saved indirect desc pointer, dma_addr and dma_len for SQ */ + struct virtblk_indir_desc **indir_desc; +#endif }; struct virtblk_req { @@ -178,10 +201,20 @@ static inline bool vbr_is_bidirectional(struct virtblk_req *vbr) } #ifdef CONFIG_VIRTIO_BLK_RING_PAIR +static int virtblk_qid_to_sq_qid(int qid) +{ + return qid * VIRTBLK_RING_NUM; +} + +static int virtblk_qid_to_cq_qid(int qid) +{ + return qid * VIRTBLK_RING_NUM + 1; +} + static inline struct virtio_blk_vq *get_virtio_blk_vq_rpair(struct blk_mq_hw_ctx *hctx) { struct virtio_blk *vblk = hctx->queue->queuedata; - struct virtio_blk_vq *vq = &vblk->vqs[hctx->queue_num]; + struct virtio_blk_vq *vq = &vblk->vqs[virtblk_qid_to_sq_qid(hctx->queue_num)]; return vq; } @@ -273,6 +306,150 @@ static void virtblk_rq_unmap(struct virtqueue *vq, struct virtblk_req *vbr) } } +static inline void virtblk_save_desc(struct virtqueue *vq, struct virtblk_req *vbr, + struct vring_desc *desc, dma_addr_t dma_addr, + u32 len) +{ + struct virtio_blk *vblk = vq->vdev->priv; + struct request *req = blk_mq_rq_from_pdu(vbr); + int tag = req->tag, qid = vq->index / VIRTBLK_RING_NUM; + struct virtblk_indir_desc *indir_desc = &vblk->indir_desc[qid][tag]; + + indir_desc->desc = desc; + indir_desc->dma_addr = dma_addr; + indir_desc->len = len; +} + +static inline void 
virtblk_unmap_and_clear_desc(struct virtqueue *vq, + struct virtblk_req *vbr) +{ + struct virtio_blk *vblk = vq->vdev->priv; + struct request *req = blk_mq_rq_from_pdu(vbr); + int tag = req->tag, qid = vq->index / VIRTBLK_RING_NUM; + struct virtblk_indir_desc *indir_desc = &vblk->indir_desc[qid][tag]; + + WARN_ON(!indir_desc->desc); + virtqueue_dma_unmap_page_attrs(vq, indir_desc->dma_addr, + indir_desc->len, DMA_TO_DEVICE, 0); + + kfree(indir_desc->desc); + indir_desc->desc = NULL; +} + +static void virtblk_recycle_buf(struct virtqueue *vq) +{ + unsigned int unused; + + while (virtqueue_get_buf(vq, &unused)) + ; +} + +static inline int virtblk_cq_rq_map(struct virtqueue *vq, struct scatterlist *sgs[]) +{ + int ret; + + ret = virtblk_map_sg(vq, sgs[0], DMA_TO_DEVICE); + if (ret < 0) + return ret; + ret = virtblk_map_sg(vq, sgs[1], DMA_FROM_DEVICE); + if (ret < 0) + virtblk_unmap_sg(vq, sgs[0], DMA_TO_DEVICE); + + return ret; +} + +static void virtblk_cq_rq_unmap(struct virtqueue *vq, struct scatterlist *sgs[]) +{ + virtblk_unmap_sg(vq, sgs[0], DMA_TO_DEVICE); + virtblk_unmap_sg(vq, sgs[1], DMA_FROM_DEVICE); +} + +static inline void virtblk_kfree_vqs_cq_reqs(struct virtio_blk *vblk) +{ + int i; + + if (!vblk->ring_pair) + return; + + if (vblk->vqs != NULL) { + for (i = 0; i < vblk->num_vqs; i++) { + if ((i % VIRTBLK_RING_NUM) == VIRTBLK_RING_CQ) + kfree(vblk->vqs[i].cq_req); + } + } +} + +static inline void virtblk_kfree_vblk_indir_descs(struct virtio_blk *vblk) +{ + int i; + + if (!vblk->ring_pair) + return; + + if (vblk->indir_desc != NULL) { + for (i = 0; i < vblk->num_vqs / VIRTBLK_RING_NUM; i++) + kfree(vblk->indir_desc[i]); + } + kfree(vblk->indir_desc); +} + +static int virtblk_prefill_res(struct virtio_blk *vblk, + struct virtqueue **vqs, int num_vqs) +{ + int i, j, ret, fail_i, fail_j; + unsigned int vring_size; + unsigned long flags; + struct virtblk_cq_req *vbr_res; + + for (i = 1; i < num_vqs; i += VIRTBLK_RING_NUM) { + vring_size = virtqueue_get_vring_size(vqs[i]); + + spin_lock_irqsave(&vblk->vqs[i].lock, flags); + for (j = 0; j < vring_size; j++) { + vbr_res = &vblk->vqs[i].cq_req[j]; + sg_init_one(&vbr_res->inline_sg[0], &vbr_res->out_hdr, + sizeof(struct virtio_blk_outhdr)); + sg_init_one(&vbr_res->inline_sg[1], &vbr_res->status, sizeof(u8)); + + vbr_res->sgs[0] = &vbr_res->inline_sg[0]; + vbr_res->sgs[1] = &vbr_res->inline_sg[1]; + + ret = virtblk_cq_rq_map(vqs[i], vbr_res->sgs); + if (ret < 0) { + spin_unlock_irqrestore(&vblk->vqs[i].lock, flags); + goto err; + } + + ret = virtqueue_add_sgs_premapped(vqs[i], vbr_res->sgs, + 1, 1, vbr_res, GFP_ATOMIC); + if (ret < 0) { + virtblk_cq_rq_unmap(vqs[i], vbr_res->sgs); + spin_unlock_irqrestore(&vblk->vqs[i].lock, flags); + goto err; + } + } + virtqueue_kick(vqs[i]); + spin_unlock_irqrestore(&vblk->vqs[i].lock, flags); + } + return 0; + +err: + fail_i = i; + fail_j = j; + for (i = 1; i <= fail_i; i += VIRTBLK_RING_NUM) { + if (i == fail_i) + vring_size = fail_j; + else + vring_size = virtqueue_get_vring_size(vqs[i]); + + for (j = 0; j < vring_size; j++) { + vbr_res = &vblk->vqs[i].cq_req[j]; + virtblk_cq_rq_unmap(vqs[i], vbr_res->sgs); + } + } + return -1; +} + static int virtblk_add_req_bidirectional_rpair(struct virtqueue *vq, struct virtblk_req *vbr, struct scatterlist *data_sg, struct scatterlist *data_sg_extra) @@ -280,7 +457,10 @@ static int virtblk_add_req_bidirectional_rpair(struct virtqueue *vq, struct scatterlist *sgs[4]; struct scatterlist *out_hdr = &vbr->inline_sg[0]; struct scatterlist *in_hdr = &vbr->inline_sg[1]; 
+ struct vring_desc *desc; unsigned int num_out = 0, num_in = 0; + dma_addr_t dma_addr; + u32 dma_len; int ret; /* @@ -298,13 +478,19 @@ static int virtblk_add_req_bidirectional_rpair(struct virtqueue *vq, sgs[num_out + num_in++] = data_sg_extra; sgs[num_out + num_in++] = in_hdr; + virtblk_recycle_buf(vq); ret = virtblk_rq_map(vq, sgs, num_out, num_in); if (ret < 0) return ret; ret = virtqueue_add_sgs_rpair(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC); - if (ret < 0) + if (ret < 0) { virtblk_rq_unmap(vq, vbr); + return ret; + } + desc = virtqueue_indir_get_last_desc_split(vq, &dma_addr, &dma_len); + virtblk_save_desc(vq, vbr, desc, dma_addr, dma_len); + return ret; } @@ -313,7 +499,10 @@ static int virtblk_add_req_rpair(struct virtqueue *vq, struct virtblk_req *vbr) struct scatterlist *sgs[3]; struct scatterlist *out_hdr = &vbr->inline_sg[0]; struct scatterlist *in_hdr = &vbr->inline_sg[1]; + struct vring_desc *desc; unsigned int num_out = 0, num_in = 0; + dma_addr_t dma_addr; + u32 dma_len; int ret; if (vbr_is_bidirectional(vbr)) @@ -333,15 +522,56 @@ static int virtblk_add_req_rpair(struct virtqueue *vq, struct virtblk_req *vbr) sg_init_one(in_hdr, &vbr->in_hdr.status, vbr->in_hdr_len); sgs[num_out + num_in++] = in_hdr; + virtblk_recycle_buf(vq); ret = virtblk_rq_map(vq, sgs, num_out, num_in); if (ret < 0) return ret; ret = virtqueue_add_sgs_rpair(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC); - if (ret < 0) + if (ret < 0) { virtblk_rq_unmap(vq, vbr); + return ret; + } + desc = virtqueue_indir_get_last_desc_split(vq, &dma_addr, &dma_len); + virtblk_save_desc(vq, vbr, desc, dma_addr, dma_len); + return ret; } + +static inline void *virtblk_get_buf(struct virtio_blk *vblk, struct virtqueue *vq, u32 *len) +{ + struct virtblk_req *vbr; + struct virtqueue *sq_vq; + + vbr = virtqueue_get_buf(vq, len); + if (vbr) { + /* get request from paired req ring in ring_pair mode */ + int qid = vq->index / VIRTBLK_RING_NUM; + int tag = *len; + struct request *req = blk_mq_tag_to_rq(vblk->tag_set.tags[qid], tag); + struct virtblk_cq_req *vbr_res = (void *)vbr; + int ret; + + sq_vq = vblk->vqs[vq->index - 1].vq; + if (!req) { + pr_err("could not locate request for tag %#x, queue %d\n", + tag, qid); + return NULL; + } + + vbr = blk_mq_rq_to_pdu(req); + /* set status to the real response status. 
*/ + vbr->in_hdr.status = vbr_res->status; + virtblk_rq_unmap(sq_vq, vbr); + virtblk_unmap_and_clear_desc(sq_vq, vbr); + + ret = virtqueue_add_sgs_premapped(vq, vbr_res->sgs, 1, 1, vbr_res, GFP_ATOMIC); + if (ret < 0) + pr_err("failed to refill res ring %d\n", ret); + + } + return vbr; +} #endif static int virtblk_add_req_bidirectional(struct virtqueue *vq, @@ -528,6 +758,10 @@ static blk_status_t virtblk_setup_cmd_rpair(struct virtio_device *vdev, bool unmap = false; u32 type; u64 sector = 0; + u32 ioprio; + + /* for ring_pair, tag is used and occupied high 16bit of ioprio*/ + vbr->out_hdr.rpair.tag = cpu_to_virtio16(vdev, req->tag); if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) && op_is_zone_mgmt(req_op(req))) return BLK_STS_NOTSUPP; @@ -592,9 +826,10 @@ static blk_status_t virtblk_setup_cmd_rpair(struct virtio_device *vdev, /* Set fields for non-REQ_OP_DRV_IN request types */ vbr->in_hdr_len = in_hdr_len; + ioprio = req_get_ioprio(req); vbr->out_hdr.type = cpu_to_virtio32(vdev, type); vbr->out_hdr.sector = cpu_to_virtio64(vdev, sector); - vbr->out_hdr.ioprio = cpu_to_virtio32(vdev, req_get_ioprio(req)); + vbr->out_hdr.rpair.ioprio = cpu_to_virtio16(vdev, (u16)ioprio); if (type == VIRTIO_BLK_T_DISCARD || type == VIRTIO_BLK_T_WRITE_ZEROES || type == VIRTIO_BLK_T_SECURE_ERASE) { @@ -726,11 +961,12 @@ static void virtblk_done_rpair(struct virtqueue *vq) struct virtblk_req *vbr; unsigned long flags; unsigned int len; + bool kick = false; spin_lock_irqsave(&vblk->vqs[qid].lock, flags); do { virtqueue_disable_cb(vq); - while ((vbr = virtqueue_get_buf(vblk->vqs[qid].vq, &len)) != NULL) { + while ((vbr = virtblk_get_buf(vblk, vblk->vqs[qid].vq, &len)) != NULL) { struct request *req = blk_mq_rq_from_pdu(vbr); if (likely(!blk_should_fake_timeout(req->q))) @@ -742,9 +978,14 @@ static void virtblk_done_rpair(struct virtqueue *vq) } while (!virtqueue_enable_cb(vq)); /* In case queue is stopped waiting for more buffers. 
*/ - if (req_done) + if (req_done) { blk_mq_start_stopped_hw_queues(vblk->disk->queue, true); + kick = virtqueue_kick_prepare(vq); + } spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags); + + if (kick) + virtqueue_notify(vq); } #endif @@ -857,11 +1098,12 @@ static blk_status_t virtio_queue_rq_rpair(struct blk_mq_hw_ctx *hctx, struct request *req = bd->rq; struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); unsigned long flags; - int qid = hctx->queue_num; + int qid; bool notify = false; blk_status_t status; int err; + qid = virtblk_qid_to_sq_qid(hctx->queue_num); status = virtblk_prep_rq_rpair(hctx, vblk, req, vbr); if (unlikely(status)) return status; @@ -1454,6 +1696,9 @@ static void virtblk_free_disk(struct gendisk *disk) ida_free(&vd_index_ida, vblk->index); mutex_destroy(&vblk->vdev_mutex); +#ifdef CONFIG_VIRTIO_BLK_RING_PAIR + virtblk_kfree_vblk_indir_descs(vblk); +#endif kfree(vblk); } @@ -1556,7 +1801,8 @@ static int init_vq_rpair(struct virtio_blk *vblk) const char **names; struct virtqueue **vqs; unsigned short num_vqs; - unsigned short num_poll_vqs; + unsigned short num_poll_vqs, num_queues, num_poll_queues; + unsigned int vring_size; struct virtio_device *vdev = vblk->vdev; struct irq_affinity desc = { 0, }; @@ -1571,22 +1817,42 @@ static int init_vq_rpair(struct virtio_blk *vblk) return -EINVAL; } - num_vqs = min_t(unsigned int, - min_not_zero(num_request_queues, nr_cpu_ids), - num_vqs); + if (num_vqs % VIRTBLK_RING_NUM) { + dev_err(&vdev->dev, + "RING_PAIR advertised but odd queues reported\n"); + vblk->ring_pair = false; + } - num_poll_vqs = min_t(unsigned int, poll_queues, num_vqs - 1); + /* ring pair only support split virtqueue + indirect enabled */ + if (virtio_has_feature(vdev, VIRTIO_F_RING_PACKED) || + !virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC)) { + dev_err(&vdev->dev, "rpair only support indir+split queue\n"); + vblk->ring_pair = false; + } - vblk->io_queues[HCTX_TYPE_DEFAULT] = num_vqs - num_poll_vqs; + /* If vring pair is not enabled, fall back to orig virtqueue use. 
*/ + if (!vblk->ring_pair) + return 1; + + num_queues = num_vqs / VIRTBLK_RING_NUM; + num_queues = min_t(unsigned int, + min_not_zero(num_request_queues, nr_cpu_ids), + num_queues); + num_poll_queues = min_t(unsigned int, poll_queues, num_queues - 1); + num_poll_vqs = num_poll_queues * VIRTBLK_RING_NUM; + num_vqs = num_queues * VIRTBLK_RING_NUM; + + vblk->io_queues[HCTX_TYPE_DEFAULT] = num_queues - num_poll_queues; vblk->io_queues[HCTX_TYPE_READ] = 0; - vblk->io_queues[HCTX_TYPE_POLL] = num_poll_vqs; + vblk->io_queues[HCTX_TYPE_POLL] = num_poll_queues; dev_info(&vdev->dev, "%d/%d/%d default/read/poll queues\n", vblk->io_queues[HCTX_TYPE_DEFAULT], vblk->io_queues[HCTX_TYPE_READ], vblk->io_queues[HCTX_TYPE_POLL]); - vblk->vqs = kmalloc_array(num_vqs, sizeof(*vblk->vqs), GFP_KERNEL); + vblk->vqs = kmalloc_array(num_vqs, sizeof(*vblk->vqs), + GFP_KERNEL | __GFP_ZERO); if (!vblk->vqs) return -ENOMEM; @@ -1599,14 +1865,28 @@ static int init_vq_rpair(struct virtio_blk *vblk) } for (i = 0; i < num_vqs - num_poll_vqs; i++) { - callbacks[i] = virtblk_done; - snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req.%u", i); + unsigned int index = i / VIRTBLK_RING_NUM; + unsigned int role = i % VIRTBLK_RING_NUM; + + if (role == VIRTBLK_RING_SQ) { + callbacks[i] = NULL; + snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req.%u", index); + } else { + callbacks[i] = virtblk_done_rpair; + snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "res.%u", index); + } names[i] = vblk->vqs[i].name; } for (; i < num_vqs; i++) { + unsigned int index = i / VIRTBLK_RING_NUM; + unsigned int role = i % VIRTBLK_RING_NUM; + + if (role == VIRTBLK_RING_SQ) + snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req-poll.%u", index); + else + snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "res-poll.%u", index); callbacks[i] = NULL; - snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req_poll.%u", i); names[i] = vblk->vqs[i].name; } @@ -1616,17 +1896,37 @@ static int init_vq_rpair(struct virtio_blk *vblk) goto out; for (i = 0; i < num_vqs; i++) { + vring_size = virtqueue_get_vring_size(vqs[i]); + if ((i % VIRTBLK_RING_NUM) == VIRTBLK_RING_CQ) { + vblk->vqs[i].cq_req = kmalloc_array(vring_size, + sizeof(struct virtblk_cq_req), + GFP_KERNEL | __GFP_ZERO); + if (!vblk->vqs[i].cq_req) { + err = -ENOMEM; + goto out; + } + } else { + virtqueue_set_save_indir(vqs[i]); + vblk->vqs[i].cq_req = NULL; + } spin_lock_init(&vblk->vqs[i].lock); vblk->vqs[i].vq = vqs[i]; } + + err = virtblk_prefill_res(vblk, vqs, num_vqs); + if (err < 0) + vdev->config->del_vqs(vdev); + vblk->num_vqs = num_vqs; out: kfree(vqs); kfree(callbacks); kfree(names); - if (err) + if (err < 0) { + virtblk_kfree_vqs_cq_reqs(vblk); kfree(vblk->vqs); + } return err; } #endif @@ -1647,8 +1947,14 @@ static int init_vq(struct virtio_blk *vblk) /* if virtblk_rpair_disabe = 1, init_vq() should fall back * to orginal use, so err needs a positive initial value */ - if (!virtblk_rpair_disable) - return init_vq_rpair(vblk); + vblk->ring_pair = false; + + if (!virtblk_rpair_disable) { + err = init_vq_rpair(vblk); + /* if err > 0, then vring pair fall back to original virtqueue use*/ + if (err <= 0) + return err; + } #endif err = virtio_cread_feature(vdev, VIRTIO_BLK_F_MQ, @@ -1885,15 +2191,17 @@ static void virtblk_complete_batch(struct io_comp_batch *iob) static int virtblk_poll_rpair(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) { struct virtio_blk *vblk = hctx->queue->queuedata; - struct virtio_blk_vq *vq = get_virtio_blk_vq_rpair(hctx); + struct virtio_blk_vq *vq = &vblk->vqs[virtblk_qid_to_cq_qid(hctx->queue_num)]; 
struct virtblk_req *vbr; unsigned long flags; unsigned int len; int found = 0; + bool kick = false; + /* get buf from paired CQ ring in ring_pair mode */ spin_lock_irqsave(&vq->lock, flags); - while ((vbr = virtqueue_get_buf(vq->vq, &len)) != NULL) { + while ((vbr = virtblk_get_buf(vblk, vq->vq, &len)) != NULL) { struct request *req = blk_mq_rq_from_pdu(vbr); found++; @@ -1903,11 +2211,16 @@ static int virtblk_poll_rpair(struct blk_mq_hw_ctx *hctx, struct io_comp_batch * virtblk_request_done(req); } - if (found) + if (found) { blk_mq_start_stopped_hw_queues(vblk->disk->queue, true); + kick = virtqueue_kick_prepare(vq->vq); + } spin_unlock_irqrestore(&vq->lock, flags); + if (kick) + virtqueue_notify(vq->vq); + return found; } #endif @@ -2286,7 +2599,7 @@ static int virtblk_probe(struct virtio_device *vdev) { struct virtio_blk *vblk; struct request_queue *q; - int err, index; + int err, index, i; u32 v, blk_size, max_size, sg_elems, opt_io_size; u32 max_discard_segs = 0; @@ -2347,7 +2660,15 @@ static int virtblk_probe(struct virtio_device *vdev) } memset(&vblk->tag_set, 0, sizeof(vblk->tag_set)); +#ifdef CONFIG_VIRTIO_BLK_RING_PAIR + vblk->tag_set.ops = vblk->ring_pair ? &virtio_mq_pair_ops : + &virtio_mq_ops; + vblk->tag_set.nr_hw_queues = vblk->ring_pair ? vblk->num_vqs / VIRTBLK_RING_NUM : + vblk->num_vqs; +#else vblk->tag_set.ops = &virtio_mq_ops; + vblk->tag_set.nr_hw_queues = vblk->num_vqs; +#endif vblk->tag_set.queue_depth = queue_depth; vblk->tag_set.numa_node = NUMA_NO_NODE; vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; @@ -2361,11 +2682,36 @@ static int virtblk_probe(struct virtio_device *vdev) sizeof(struct virtblk_req) + sizeof(struct scatterlist) * 2 * VIRTIO_BLK_INLINE_SG_CNT; vblk->tag_set.driver_data = vblk; - vblk->tag_set.nr_hw_queues = vblk->num_vqs; vblk->tag_set.nr_maps = 1; if (vblk->io_queues[HCTX_TYPE_POLL]) vblk->tag_set.nr_maps = 3; +#ifdef CONFIG_VIRTIO_BLK_RING_PAIR + /* Beginning here, we know queue_depth of tag_set, so we should alloc + * vblk->indir_desc here. If alloc goes -ENOMEM, kfree will be + * executed. 
+ */ + if (vblk->ring_pair) { + vblk->indir_desc = kmalloc_array(vblk->num_vqs / VIRTBLK_RING_NUM, + sizeof(struct virtblk_indir_desc *), + GFP_KERNEL | __GFP_ZERO); + if (!vblk->indir_desc) { + err = -ENOMEM; + goto out_free_vq; + } + for (i = 0; i < vblk->num_vqs / VIRTBLK_RING_NUM ; i++) { + vblk->indir_desc[i] = kmalloc_array(vblk->tag_set.queue_depth, + sizeof(struct virtblk_indir_desc), + GFP_KERNEL | __GFP_ZERO); + if (!vblk->indir_desc[i]) { + err = -ENOMEM; + goto out_free_vq; + } + } + } + +#endif + err = blk_mq_alloc_tag_set(&vblk->tag_set); if (err) goto out_free_vq; @@ -2578,6 +2924,10 @@ static int virtblk_probe(struct virtio_device *vdev) blk_mq_free_tag_set(&vblk->tag_set); out_free_vq: vdev->config->del_vqs(vdev); +#ifdef CONFIG_VIRTIO_BLK_RING_PAIR + virtblk_kfree_vblk_indir_descs(vblk); + virtblk_kfree_vqs_cq_reqs(vblk); +#endif kfree(vblk->vqs); out_free_vblk: kfree(vblk); @@ -2608,6 +2958,9 @@ static void virtblk_remove(struct virtio_device *vdev) vblk->vdev = NULL; vdev->config->del_vqs(vdev); +#ifdef CONFIG_VIRTIO_BLK_RING_PAIR + virtblk_kfree_vqs_cq_reqs(vblk); +#endif kfree(vblk->vqs); mutex_unlock(&vblk->vdev_mutex); @@ -2633,6 +2986,9 @@ static int virtblk_freeze(struct virtio_device *vdev) flush_work(&vblk->config_work); vdev->config->del_vqs(vdev); +#ifdef CONFIG_VIRTIO_BLK_RING_PAIR + virtblk_kfree_vqs_cq_reqs(vblk); +#endif kfree(vblk->vqs); return 0; diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index bd3d6dacfeab..6d6b493c1af0 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -538,6 +538,29 @@ static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq, return desc; } +struct vring_desc *virtqueue_indir_get_last_desc_split(struct virtqueue *_vq, + dma_addr_t *dma_addr, u32 *len) +{ + int tmp, idx; + struct vring_virtqueue *vq = to_vvq(_vq); + /* + * we should ensure this func is called after virtqueue_add_desc_split + * and before virtqueue_kick_prepare. + */ + if (!vq->indirect) + return NULL; + idx = (vq->split.avail_idx_shadow - 1) & (vq->split.vring.num - 1); + tmp = virtio16_to_cpu(_vq->vdev, vq->split.vring.avail->ring[idx]); + + /* get the last desc's dma_addr and dma_len + */ + *dma_addr = vq->split.desc_extra[tmp].addr; + *len = vq->split.desc_extra[tmp].len; + + return vq->split.desc_state[tmp].indir_desc; +} +EXPORT_SYMBOL(virtqueue_indir_get_last_desc_split); + static inline unsigned int virtqueue_add_desc_split(struct virtqueue *vq, struct vring_desc *desc, struct vring_desc_extra *extra, diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 96b620e94792..a78542ac9733 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -87,6 +87,9 @@ int virtqueue_add_sgs_rpair(struct virtqueue *vq, void *data, gfp_t gfp); +struct vring_desc *virtqueue_indir_get_last_desc_split(struct virtqueue *_vq, + dma_addr_t *dma_addr, u32 *len); + struct device *virtqueue_dma_dev(struct virtqueue *vq); bool virtqueue_kick(struct virtqueue *vq); diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h index 014aaf6ca5a9..9f35e0ee4bc7 100644 --- a/include/uapi/linux/virtio_blk.h +++ b/include/uapi/linux/virtio_blk.h @@ -220,7 +220,13 @@ struct virtio_blk_outhdr { /* VIRTIO_BLK_T* */ __virtio32 type; /* io priority. */ - __virtio32 ioprio; + union { + struct { + __virtio16 ioprio; + __virtio16 tag; + } rpair; + __virtio32 ioprio; + }; /* Sector (ie. 
512 byte offset) */ __virtio64 sector; }; -- Gitee From 7350689f2380286c1694068a82d23b5d2e863717 Mon Sep 17 00:00:00 2001 From: Ferry Meng Date: Sat, 10 May 2025 09:30:00 +0800 Subject: [PATCH 14/28] anolis: virtio_blk: reuse CQ ioprio to locate error position ANBZ: #20938 After enabling vring_pair feature, if we harvest a CQE from CQ, we add it into sgs immediately. This may causes backend don't know if a slot has been harvest. In order to locate error position more accurately and execute failover, backend need to account how many I/Os it has responsed to and which CQ slot is free. We reused CQ out_hdr ioprio to store a counter value. After we harvest a CQE, increase counter and save it in ioprio. From the view of backend, it can knows that last I/O is finished and this is a empty slot. Thus backend won't do failover for this I/O later. Signed-off-by: Ferry Meng --- drivers/block/virtio_blk.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 7d385d818197..7f8a07d01ae9 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -90,6 +90,8 @@ struct virtio_blk_vq { spinlock_t lock; char name[VQ_NAME_LEN]; #ifdef CONFIG_VIRTIO_BLK_RING_PAIR + /* check num for CQ */ + u16 counter; /* prealloced prefill req for CQ */ struct virtblk_cq_req *cq_req; #endif @@ -403,10 +405,14 @@ static int virtblk_prefill_res(struct virtio_blk *vblk, for (i = 1; i < num_vqs; i += VIRTBLK_RING_NUM) { vring_size = virtqueue_get_vring_size(vqs[i]); + vblk->vqs[i].counter = 0; spin_lock_irqsave(&vblk->vqs[i].lock, flags); for (j = 0; j < vring_size; j++) { vbr_res = &vblk->vqs[i].cq_req[j]; + vbr_res->out_hdr.rpair.tag = cpu_to_virtio16(vblk->vdev, + vblk->vqs[i].counter); + vblk->vqs[i].counter += 1; sg_init_one(&vbr_res->inline_sg[0], &vbr_res->out_hdr, sizeof(struct virtio_blk_outhdr)); sg_init_one(&vbr_res->inline_sg[1], &vbr_res->status, sizeof(u8)); @@ -565,6 +571,8 @@ static inline void *virtblk_get_buf(struct virtio_blk *vblk, struct virtqueue *v virtblk_rq_unmap(sq_vq, vbr); virtblk_unmap_and_clear_desc(sq_vq, vbr); + vbr_res->out_hdr.rpair.tag = cpu_to_virtio16(vblk->vdev, + vblk->vqs[vq->index].counter++); ret = virtqueue_add_sgs_premapped(vq, vbr_res->sgs, 1, 1, vbr_res, GFP_ATOMIC); if (ret < 0) pr_err("failed to refill res ring %d\n", ret); -- Gitee From 2fea5a744e7a6537e7514e43511dd743a761d2d8 Mon Sep 17 00:00:00 2001 From: Ferry Meng Date: Sat, 10 May 2025 09:30:00 +0800 Subject: [PATCH 15/28] anolis: virtio-blk: add ext_feature negotiation method and ring-pair feature bit ANBZ: #20938 Use external feature bits to management ring pair feature. This needs backend specific support. Driver read host_ext_features, and writes to guest_ext_features. Now ring pair feature holds virtio-blk extra feature bit 0. 
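The negotiation itself is a short read-modify-write over a vendor-specific register window
inside BAR0; the function below is a hypothetical condensation of check_ext_feature() /
virtblk_get_ext_feature() / virtblk_set_ext_feature() added in this patch (error handling
trimmed):

static void virtblk_negotiate_ext(struct virtio_blk *vblk, void __iomem *ioaddr)
{
	u32 host, guest = 0;

	/* a magic number at offset 0 proves the window really is the vsf area */
	if (ioread32(ioaddr + VIRTIO_PCI_VSF_MAGIC_NUM) != VIRTIO_PCI_VSF_MAGIC_NUM_VAL)
		return;					/* no ext features: fall back */

	/* read which vendor-specific features the host offers */
	iowrite32(0, ioaddr + VIRTIO_PCI_HOST_VNDR_SPEC_FEATURE_SELECT);
	host = ioread32(ioaddr + VIRTIO_PCI_HOST_VNDR_SPEC_FEATURES);

	vblk->ring_pair = !!(host & VIRTIO_BLK_EXT_F_RING_PAIR);	/* bit 0 */
	if (vblk->ring_pair)
		guest |= VIRTIO_BLK_EXT_F_RING_PAIR;

	/* write back what the guest actually activates */
	iowrite32(0, ioaddr + VIRTIO_PCI_GUEST_VNDR_SPEC_FEATURE_SELECT);
	iowrite32(guest, ioaddr + VIRTIO_PCI_GUEST_VNDR_SPEC_FEATURES);
}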
Signed-off-by: Ferry Meng Reviewed-by: Yifei Zhou --- drivers/block/virtio_blk.c | 67 +++++++++++-- drivers/block/virtio_blk_ext.c | 174 +++++++++++++++++++++++++++++++++ 2 files changed, 234 insertions(+), 7 deletions(-) create mode 100644 drivers/block/virtio_blk_ext.c diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 7f8a07d01ae9..dc4806303b5f 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -21,6 +21,9 @@ #include #include #include +#ifdef CONFIG_VIRTIO_BLK_RING_PAIR +#include "virtio_blk_ext.c" +#endif #define PART_BITS 4 #define VQ_NAME_LEN 16 @@ -1801,6 +1804,23 @@ bool virtblk_rpair_disable; module_param_named(rpair_disable, virtblk_rpair_disable, bool, 0444); MODULE_PARM_DESC(rpair_disable, "disable vring pair detective. (0=Not [default], 1=Yes)"); +int check_ext_feature(struct virtio_blk *vblk, void __iomem *ioaddr, + u32 *host_ext_features, + u32 *guest_ext_features) +{ + int ret = 0; + + ret = virtblk_get_ext_feature(ioaddr, host_ext_features); + if (ret < 0) + return ret; + + vblk->ring_pair = !!(*host_ext_features & VIRTIO_BLK_EXT_F_RING_PAIR); + if (vblk->ring_pair) + *guest_ext_features |= (VIRTIO_BLK_EXT_F_RING_PAIR); + + return 0; +} + static int init_vq_rpair(struct virtio_blk *vblk) { int err; @@ -1811,8 +1831,27 @@ static int init_vq_rpair(struct virtio_blk *vblk) unsigned short num_vqs; unsigned short num_poll_vqs, num_queues, num_poll_queues; unsigned int vring_size; + u32 ext_host_features = 0, ext_guest_features = 0, ext_bar_offset = 0; struct virtio_device *vdev = vblk->vdev; struct irq_affinity desc = { 0, }; + void __iomem *ioaddr = NULL; + + err = virtblk_get_ext_feature_bar(vdev, &ext_bar_offset); + /* if check ext feature error, fall back to orig virtqueue use. */ + if ((err < 0) || !ext_bar_offset) + return 1; + + ioaddr = pci_iomap_range(to_vp_device(vdev)->pci_dev, 0, ext_bar_offset, 16); + if (!ioaddr) { + err = 1; + goto negotiate_err; + } + + err = check_ext_feature(vblk, ioaddr, &ext_host_features, &ext_guest_features); + if ((err < 0) || !vblk->ring_pair) { + err = 1; + goto negotiate_err; + } err = virtio_cread_feature(vdev, VIRTIO_BLK_F_MQ, struct virtio_blk_config, num_queues, @@ -1822,25 +1861,29 @@ static int init_vq_rpair(struct virtio_blk *vblk) if (!err && !num_vqs) { dev_err(&vdev->dev, "MQ advertised but zero queues reported\n"); - return -EINVAL; + err = -EINVAL; + goto negotiate_err; } if (num_vqs % VIRTBLK_RING_NUM) { dev_err(&vdev->dev, "RING_PAIR advertised but odd queues reported\n"); - vblk->ring_pair = false; + err = 1; + goto negotiate_err; } /* ring pair only support split virtqueue + indirect enabled */ if (virtio_has_feature(vdev, VIRTIO_F_RING_PACKED) || !virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC)) { dev_err(&vdev->dev, "rpair only support indir+split queue\n"); - vblk->ring_pair = false; + err = 1; + goto negotiate_err; } - /* If vring pair is not enabled, fall back to orig virtqueue use. 
*/ - if (!vblk->ring_pair) - return 1; + virtblk_set_ext_feature(ioaddr, ext_guest_features); + pci_iounmap(to_vp_device(vdev)->pci_dev, ioaddr); + dev_info(&vdev->dev, "rpair enabled, ext_guest_feature set 0x%x\n", + ext_guest_features); num_queues = num_vqs / VIRTBLK_RING_NUM; num_queues = min_t(unsigned int, @@ -1936,6 +1979,15 @@ static int init_vq_rpair(struct virtio_blk *vblk) kfree(vblk->vqs); } return err; + +negotiate_err: + if (ioaddr) { + ext_guest_features &= ~VIRTIO_BLK_EXT_F_RING_PAIR; + virtblk_set_ext_feature(ioaddr, ext_guest_features); + pci_iounmap(to_vp_device(vdev)->pci_dev, ioaddr); + } + vblk->ring_pair = false; + return err; } #endif @@ -1957,7 +2009,8 @@ static int init_vq(struct virtio_blk *vblk) */ vblk->ring_pair = false; - if (!virtblk_rpair_disable) { + /* ext feature only support for virtio_blk over pci device currently */ + if (!virtblk_rpair_disable && dev_is_pci(vblk->vdev->dev.parent)) { err = init_vq_rpair(vblk); /* if err > 0, then vring pair fall back to original virtqueue use*/ if (err <= 0) diff --git a/drivers/block/virtio_blk_ext.c b/drivers/block/virtio_blk_ext.c new file mode 100644 index 000000000000..74237f05bcec --- /dev/null +++ b/drivers/block/virtio_blk_ext.c @@ -0,0 +1,174 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include "../virtio/virtio_pci_common.h" +#include + +/* ext feature bit definition */ +#define VIRTIO_BLK_EXT_F_RING_PAIR (1U << 0) +#define VIRTIO_BLK_EXT_F_INVAL (-1) + +#define VIRTIO_PCI_VSF_MAGIC_NUM 0x0 +#define VIRTIO_PCI_VSF_MAGIC_NUM_VAL 0x7D4FEE9D +#define VIRTIO_PCI_HOST_VNDR_SPEC_FEATURE_SELECT 0x04 +/* A 32-bit r/o bitmask of the vendor specific features supported by the host */ +#define VIRTIO_PCI_HOST_VNDR_SPEC_FEATURES 0x08 + +#define VIRTIO_PCI_GUEST_VNDR_SPEC_FEATURE_SELECT 0x0c +/* A 32-bit r/w bitmask of the vendor specific features activated by the guest */ +#define VIRTIO_PCI_GUEST_VNDR_SPEC_FEATURES 0x10 + + +/* xdragon vsc */ +#define PCI_CAP_ID_VNDR 0x09 /* Vendor specific */ +#define PCI_XDRAGON_VSC_CFGTYPE 0xff + +/* xdragon vsec */ +#define PCI_EXT_CAP_ID_VNDR 0x0B +#define PCI_EXP_XDRAGON_VSEC_CFGTYPE 0xff +#define XDRAGON_VSEC_VERSION 2 + +#define XDRAGON_XVCS_MAGIC 0x53435658 +#define XDRAGON_XVCS_VSF_KEY "xvcs-vsf" +#define XDRAGON_XVCS_VERSION 1 +#define XDRAGON_XVCS_NUM_MAX 32U +#define XDRAGON_XVCS_KEY_MAX 16 + +#define XDRAGON_XVCS_O_MAGIC 0 +#define XDRAGON_XVCS_O_VER 4 +#define XDRAGON_XVCS_O_ADDR 12 +#define XDRAGON_XVCS_O_F_CNT 16 +#define XDRAGON_XVCS_O_CUR 16 +#define XDRAGON_XVCS_O_NEXT 20 +#define XDRAGON_XVCS_O_VSF 32 +static void xdragon_read_xvcs(struct pci_dev *d, u32 pos, + u32 cap_len, u32 addr, u32 num, void *data) +{ + u32 idx, where; + + for (idx = 0; idx < num; idx += 4) { + where = addr + idx; + pci_write_config_dword(d, pos + cap_len - 8, where); + pci_read_config_dword(d, pos + cap_len - 4, (u32 *)((u8 *)data + idx)); + } +} + +static int xdragon_vcs_find_vsf_bar0_offset(struct pci_dev *dev, uint32_t cap_len, + uint32_t pos, u32 *bar0_offset) +{ + u8 buf[XDRAGON_XVCS_KEY_MAX+1]; + u32 where; + u32 idx, num; + u32 reg; + + /* check xvcs magic */ + xdragon_read_xvcs(dev, pos, cap_len, XDRAGON_XVCS_O_MAGIC, sizeof(reg), ®); + if (reg != XDRAGON_XVCS_MAGIC) { + pr_err("%s: xvcs magic 0x%x not match\n", __func__, reg); + return -1; + } + /* check xvcs version */ + xdragon_read_xvcs(dev, pos, cap_len, XDRAGON_XVCS_O_VER, sizeof(reg), ®); + if (reg != XDRAGON_XVCS_VERSION) { + pr_err("%s: xvcs version 0x%x not match\n", __func__, reg); + return 
-1; + } + /* xvcs feat block addr */ + xdragon_read_xvcs(dev, pos, cap_len, XDRAGON_XVCS_O_ADDR, sizeof(reg), ®); + where = reg; + /* xvcs feat cnt */ + xdragon_read_xvcs(dev, pos, cap_len, XDRAGON_XVCS_O_F_CNT, sizeof(reg), ®); + num = reg; + for (idx = 0; (idx < min(XDRAGON_XVCS_NUM_MAX, num)) && (where > 0); idx++) { + memset(buf, 0, sizeof(buf)); + + /* self addr check */ + xdragon_read_xvcs(dev, pos, cap_len, + where + XDRAGON_XVCS_O_CUR, sizeof(reg), ®); + if (reg != where) + return -1; + + /* check key */ + xdragon_read_xvcs(dev, pos, cap_len, where, XDRAGON_XVCS_KEY_MAX, buf); + + /* found vsf */ + if (strncmp(buf, XDRAGON_XVCS_VSF_KEY, sizeof(XDRAGON_XVCS_VSF_KEY)) == 0) { + xdragon_read_xvcs(dev, pos, cap_len, where + XDRAGON_XVCS_O_VSF, + sizeof(reg), ®); + *bar0_offset = reg; + return 0; + } + /* next vcs feat */ + xdragon_read_xvcs(dev, pos, cap_len, + where + XDRAGON_XVCS_O_NEXT, sizeof(reg), ®); + where = reg; + } + pr_err("%s: vsf offset not found\n", __func__); + return -1; +} + +int virtblk_get_ext_feature_bar(struct virtio_device *vdev, u32 *bar_offset) +{ + struct pci_dev *dev = to_vp_device(vdev)->pci_dev; + int cap_len, vsec = 0; + u16 val; + u8 type, len = 0; + bool found = false; + + /* try to find vsc */ + for (vsec = pci_find_capability(dev, PCI_CAP_ID_VNDR); + vsec > 0; + vsec = pci_find_next_capability(dev, vsec, PCI_CAP_ID_VNDR)) { + pci_read_config_byte(dev, vsec + offsetof(struct virtio_pci_cap, cfg_type), &type); + if (type == PCI_XDRAGON_VSC_CFGTYPE) { + pci_read_config_byte(dev, + vsec + offsetof(struct virtio_pci_cap, cap_len), &len); + cap_len = len; + found = true; + break; + } + } + + /* try to find vsec */ + if (!found) { + vsec = 0; + while ((vsec = pci_find_next_ext_capability(dev, vsec, + PCI_EXT_CAP_ID_VNDR))) { + pci_read_config_word(dev, vsec + 0x4, &val); + /* vsec found */ + if (val == PCI_EXP_XDRAGON_VSEC_CFGTYPE) { + /* get vsec cap len */ + pci_read_config_word(dev, vsec + 0x6, &val); + if ((val & 0xF) != XDRAGON_VSEC_VERSION) + continue; + cap_len = (val >> 4) & (0xFFF); + found = true; + break; + } + } + } + + return found ? xdragon_vcs_find_vsf_bar0_offset(dev, cap_len, vsec, bar_offset) : -1; +} + +int virtblk_get_ext_feature(void __iomem *ioaddr, u32 *host_features) +{ + int ret; + + /* read ext bar magci number */ + ret = ioread32(ioaddr); + if (ret != VIRTIO_PCI_VSF_MAGIC_NUM_VAL) + return -EOPNOTSUPP; + + iowrite32(0, ioaddr + VIRTIO_PCI_HOST_VNDR_SPEC_FEATURE_SELECT); + *host_features = ioread32(ioaddr + VIRTIO_PCI_HOST_VNDR_SPEC_FEATURES); + + return 0; +} + +void virtblk_set_ext_feature(void __iomem *ioaddr, u32 guest_ext_features) +{ + iowrite32(0, ioaddr + VIRTIO_PCI_GUEST_VNDR_SPEC_FEATURE_SELECT); + iowrite32(guest_ext_features, ioaddr + VIRTIO_PCI_GUEST_VNDR_SPEC_FEATURES); +} -- Gitee From dfe774a5c50075d34d20cff59f5f6be9db5705c3 Mon Sep 17 00:00:00 2001 From: Ferry Meng Date: Sat, 10 May 2025 09:30:00 +0800 Subject: [PATCH 16/28] anolis: virtio-blk: add no_align extra feature bit ANBZ: #20938 Add NO_ALIGN support feature bit at ext-feature bit-1. This controls blk-mq dma alignment(0). 
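At runtime this amounts to a single queue-limit change (quoted from the hunk below);
queue_dma_alignment() acts as a bitmask when passthrough buffers are checked, so a mask of
0 admits byte-aligned user buffers instead of the stricter default:

	if (vblk->no_align)
		blk_queue_dma_alignment(q, 0);	/* mask 0: no alignment restriction */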
Signed-off-by: Ferry Meng --- drivers/block/virtio_blk.c | 10 ++++++++++ drivers/block/virtio_blk_ext.c | 1 + 2 files changed, 11 insertions(+) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index dc4806303b5f..348eeaacd8b8 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -138,6 +138,7 @@ struct virtio_blk { #ifdef CONFIG_VIRTIO_BLK_RING_PAIR bool ring_pair; + bool no_align; /* saved indirect desc pointer, dma_addr and dma_len for SQ */ struct virtblk_indir_desc **indir_desc; #endif @@ -1817,6 +1818,9 @@ int check_ext_feature(struct virtio_blk *vblk, void __iomem *ioaddr, vblk->ring_pair = !!(*host_ext_features & VIRTIO_BLK_EXT_F_RING_PAIR); if (vblk->ring_pair) *guest_ext_features |= (VIRTIO_BLK_EXT_F_RING_PAIR); + vblk->no_align = !!(*host_ext_features & VIRTIO_BLK_EXT_F_RING_NO_ALIGN); + if (vblk->no_align) + *guest_ext_features |= (VIRTIO_BLK_EXT_F_RING_NO_ALIGN); return 0; } @@ -2008,6 +2012,7 @@ static int init_vq(struct virtio_blk *vblk) * to orginal use, so err needs a positive initial value */ vblk->ring_pair = false; + vblk->no_align = false; /* ext feature only support for virtio_blk over pci device currently */ if (!virtblk_rpair_disable && dev_is_pci(vblk->vdev->dev.parent)) { @@ -2818,6 +2823,11 @@ static int virtblk_probe(struct virtio_device *vdev) blk_queue_max_segment_size(q, max_size); +#ifdef CONFIG_VIRTIO_BLK_RING_PAIR + if (vblk->no_align) + blk_queue_dma_alignment(q, 0); +#endif + /* Host can optionally specify the block size of the device */ err = virtio_cread_feature(vdev, VIRTIO_BLK_F_BLK_SIZE, struct virtio_blk_config, blk_size, diff --git a/drivers/block/virtio_blk_ext.c b/drivers/block/virtio_blk_ext.c index 74237f05bcec..b09eeec1b9bb 100644 --- a/drivers/block/virtio_blk_ext.c +++ b/drivers/block/virtio_blk_ext.c @@ -7,6 +7,7 @@ /* ext feature bit definition */ #define VIRTIO_BLK_EXT_F_RING_PAIR (1U << 0) +#define VIRTIO_BLK_EXT_F_RING_NO_ALIGN (1U << 1) #define VIRTIO_BLK_EXT_F_INVAL (-1) #define VIRTIO_PCI_VSF_MAGIC_NUM 0x0 -- Gitee From 9b67a8da6488d0600c6e29d585b244ad76c595b1 Mon Sep 17 00:00:00 2001 From: Ferry Meng Date: Sat, 10 May 2025 09:30:00 +0800 Subject: [PATCH 17/28] anolis: virtio-blk: add hide_bdev extra feature bit ANBZ: #20938 Add hide block device feature bit at ext-feature bit-2. This controls hide /dev/vdX, chardev still exists. 
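In probe the change boils down to making disk registration conditional (sketch matching
the hunk below):

	if (!vblk->hide_bdev)
		err = device_add_disk(&vdev->dev, vblk->disk, virtblk_attr_groups);
	/*
	 * When the bit is negotiated no /dev/vdX node is registered, but the
	 * gendisk is still allocated, so the /dev/vdXc0 char device added
	 * earlier in this series can keep taking references on it.
	 */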
Signed-off-by: Ferry Meng --- drivers/block/virtio_blk.c | 10 ++++++++++ drivers/block/virtio_blk_ext.c | 1 + 2 files changed, 11 insertions(+) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 348eeaacd8b8..f383eb3db0f9 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -139,6 +139,7 @@ struct virtio_blk { #ifdef CONFIG_VIRTIO_BLK_RING_PAIR bool ring_pair; bool no_align; + bool hide_bdev; /* saved indirect desc pointer, dma_addr and dma_len for SQ */ struct virtblk_indir_desc **indir_desc; #endif @@ -1821,6 +1822,9 @@ int check_ext_feature(struct virtio_blk *vblk, void __iomem *ioaddr, vblk->no_align = !!(*host_ext_features & VIRTIO_BLK_EXT_F_RING_NO_ALIGN); if (vblk->no_align) *guest_ext_features |= (VIRTIO_BLK_EXT_F_RING_NO_ALIGN); + vblk->hide_bdev = !!(*host_ext_features & VIRTIO_BLK_EXT_F_HIDE_BLOCK); + if (vblk->hide_bdev) + *guest_ext_features |= (VIRTIO_BLK_EXT_F_HIDE_BLOCK); return 0; } @@ -2013,6 +2017,7 @@ static int init_vq(struct virtio_blk *vblk) */ vblk->ring_pair = false; vblk->no_align = false; + vblk->hide_bdev = false; /* ext feature only support for virtio_blk over pci device currently */ if (!virtblk_rpair_disable && dev_is_pci(vblk->vdev->dev.parent)) { @@ -2981,7 +2986,12 @@ static int virtblk_probe(struct virtio_device *vdev) goto out_cleanup_disk; } +#ifdef CONFIG_VIRTIO_BLK_RING_PAIR + if (!vblk->hide_bdev) + err = device_add_disk(&vdev->dev, vblk->disk, virtblk_attr_groups); +#else err = device_add_disk(&vdev->dev, vblk->disk, virtblk_attr_groups); +#endif if (err) goto out_cleanup_disk; diff --git a/drivers/block/virtio_blk_ext.c b/drivers/block/virtio_blk_ext.c index b09eeec1b9bb..3846056f2b9a 100644 --- a/drivers/block/virtio_blk_ext.c +++ b/drivers/block/virtio_blk_ext.c @@ -8,6 +8,7 @@ /* ext feature bit definition */ #define VIRTIO_BLK_EXT_F_RING_PAIR (1U << 0) #define VIRTIO_BLK_EXT_F_RING_NO_ALIGN (1U << 1) +#define VIRTIO_BLK_EXT_F_HIDE_BLOCK (1U << 2) #define VIRTIO_BLK_EXT_F_INVAL (-1) #define VIRTIO_PCI_VSF_MAGIC_NUM 0x0 -- Gitee From 41222807f127f30f6a59b0f5491e88a491aabaec Mon Sep 17 00:00:00 2001 From: Ferry Meng Date: Sat, 10 May 2025 09:30:00 +0800 Subject: [PATCH 18/28] anolis: virtio-blk: enable CONFIG_VIRTIO_BLK_RING_PAIR default ANBZ: #20938 For ak kernel from ANCK-6.6-005, enable this feature by default. To use virtio-blk vring pair, you need: 1. backend ring pair support. 2. modprobe virtio-blk with params "rp_enable" 3. 
(optional) dynamic request queue configuration via "nr_pre_rqs"

Signed-off-by: Ferry Meng
---
 anolis/configs/L0-MANDATORY/arm64/CONFIG_VIRTIO_BLK_RING_PAIR | 1 +
 .../configs/L0-MANDATORY/loongarch/CONFIG_VIRTIO_BLK_RING_PAIR | 1 +
 anolis/configs/L0-MANDATORY/x86/CONFIG_VIRTIO_BLK_RING_PAIR | 1 +
 3 files changed, 3 insertions(+)
 create mode 100644 anolis/configs/L0-MANDATORY/arm64/CONFIG_VIRTIO_BLK_RING_PAIR
 create mode 100644 anolis/configs/L0-MANDATORY/loongarch/CONFIG_VIRTIO_BLK_RING_PAIR
 create mode 100644 anolis/configs/L0-MANDATORY/x86/CONFIG_VIRTIO_BLK_RING_PAIR

diff --git a/anolis/configs/L0-MANDATORY/arm64/CONFIG_VIRTIO_BLK_RING_PAIR b/anolis/configs/L0-MANDATORY/arm64/CONFIG_VIRTIO_BLK_RING_PAIR
new file mode 100644
index 000000000000..621ab9591ddf
--- /dev/null
+++ b/anolis/configs/L0-MANDATORY/arm64/CONFIG_VIRTIO_BLK_RING_PAIR
@@ -0,0 +1 @@
+CONFIG_VIRTIO_BLK_RING_PAIR=y
diff --git a/anolis/configs/L0-MANDATORY/loongarch/CONFIG_VIRTIO_BLK_RING_PAIR b/anolis/configs/L0-MANDATORY/loongarch/CONFIG_VIRTIO_BLK_RING_PAIR
new file mode 100644
index 000000000000..621ab9591ddf
--- /dev/null
+++ b/anolis/configs/L0-MANDATORY/loongarch/CONFIG_VIRTIO_BLK_RING_PAIR
@@ -0,0 +1 @@
+CONFIG_VIRTIO_BLK_RING_PAIR=y
diff --git a/anolis/configs/L0-MANDATORY/x86/CONFIG_VIRTIO_BLK_RING_PAIR b/anolis/configs/L0-MANDATORY/x86/CONFIG_VIRTIO_BLK_RING_PAIR
new file mode 100644
index 000000000000..621ab9591ddf
--- /dev/null
+++ b/anolis/configs/L0-MANDATORY/x86/CONFIG_VIRTIO_BLK_RING_PAIR
@@ -0,0 +1 @@
+CONFIG_VIRTIO_BLK_RING_PAIR=y
--
Gitee

From e5200d7549d3d9239f0d64c85324c979f54dc757 Mon Sep 17 00:00:00 2001
From: Ferry Meng
Date: Sat, 10 May 2025 09:30:00 +0800
Subject: [PATCH 19/28] anolis: blk-mq: support dynamic request alloc

ANBZ: #20938

In the blk-mq layer, tag_set->static_rqs is preallocated up front and
handed out later. When queue_depth is very large, we want to avoid the
excessive memory overhead caused by this static preallocation.

Introduce a new tag_set flag "BLK_MQ_F_DYN_ALLOC" and a new tag_set
member "nr_static_rqs". With the flag set, only the first
'nr_static_rqs' requests (struct request + driver-defined pdu) are
preallocated; requests with higher tag numbers are allocated on demand
in blk_mq_alloc_request() (a usage sketch follows below, and the
page-count comparison comes after it).
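A minimal usage sketch with hypothetical numbers; virtio-blk wires this up for ring-pair mode in a later patch of this series, and the other mandatory tag_set fields (ops, nr_hw_queues, cmd_size, ...) are set up as usual:

static int example_init_tag_set(struct blk_mq_tag_set *set)
{
	/* expose the full tag space, but only preallocate a small slice */
	set->flags |= BLK_MQ_F_DYN_ALLOC;
	set->queue_depth = 16384;	/* total tags visible to blk-mq */
	set->nr_static_rqs = 256;	/* requests preallocated up front */

	return blk_mq_alloc_tag_set(set);
}

With only nr_static_rqs requests preallocated instead of the full queue_depth, the static footprint shrinks roughly as: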
old_pages = [sizeof(struct request) + sizeof(struct pdu)] * queue_depth / page_size new_pages = [sizeof(struct request) + sizeof(struct pdu)] * nr_static_rqs / page_size Signed-off-by: Ferry Meng --- block/blk-mq.c | 41 ++++++++++++++++++++++++++++++++++++----- include/linux/blk-mq.h | 5 ++++- 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 058dd2bb3761..68f546b24b8c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -381,8 +381,14 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, struct blk_mq_ctx *ctx = data->ctx; struct blk_mq_hw_ctx *hctx = data->hctx; struct request_queue *q = data->q; - struct request *rq = tags->static_rqs[tag]; + struct request *rq; + struct blk_mq_tag_set *set = data->q->tag_set; + + if ((set->flags & BLK_MQ_F_DYN_ALLOC) && (tag >= set->nr_static_rqs)) + tags->static_rqs[tag] = kmalloc(sizeof(struct request) + + set->cmd_size, GFP_KERNEL | __GFP_ZERO); + rq = tags->static_rqs[tag]; rq->q = q; rq->mq_ctx = ctx; rq->mq_hctx = hctx; @@ -732,7 +738,7 @@ static void __blk_mq_free_request(struct request *rq) struct request_queue *q = rq->q; struct blk_mq_ctx *ctx = rq->mq_ctx; struct blk_mq_hw_ctx *hctx = rq->mq_hctx; - const int sched_tag = rq->internal_tag; + const int sched_tag = rq->internal_tag, tag = rq->tag; blk_crypto_free_request(rq); blk_pm_mark_last_busy(rq); @@ -741,10 +747,23 @@ static void __blk_mq_free_request(struct request *rq) if (rq->rq_flags & RQF_MQ_INFLIGHT) __blk_mq_dec_active_requests(hctx); - if (rq->tag != BLK_MQ_NO_TAG) - blk_mq_put_tag(hctx->tags, ctx, rq->tag); - if (sched_tag != BLK_MQ_NO_TAG) + if (tag != BLK_MQ_NO_TAG) { + if ((q->tag_set->flags & BLK_MQ_F_DYN_ALLOC) && + tag >= q->tag_set->nr_static_rqs) { + hctx->tags->static_rqs[tag] = NULL; + kfree(rq); + } + blk_mq_put_tag(hctx->tags, ctx, tag); + } + if (sched_tag != BLK_MQ_NO_TAG) { + if ((q->tag_set->flags & BLK_MQ_F_DYN_ALLOC) && + sched_tag >= q->tag_set->nr_static_rqs) { + hctx->sched_tags->static_rqs[sched_tag] = NULL; + kfree(rq); + } blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag); + } + blk_mq_sched_restart(hctx); blk_queue_exit(q); } @@ -3358,6 +3377,9 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, if (!rq) continue; set->ops->exit_request(set, rq, hctx_idx); + if ((set->flags & BLK_MQ_F_DYN_ALLOC) && + (i >= set->nr_static_rqs)) + kfree(rq); tags->static_rqs[i] = NULL; } } @@ -3484,6 +3506,14 @@ static int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, */ rq_size = round_up(sizeof(struct request) + set->cmd_size, cache_line_size()); + + if (set->flags & BLK_MQ_F_DYN_ALLOC) { + if (!set->nr_static_rqs || (set->nr_static_rqs > depth)) + set->nr_static_rqs = depth; + + depth = set->nr_static_rqs; + } + left = rq_size * depth; for (i = 0; i < depth; ) { @@ -4732,6 +4762,7 @@ int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set, set->queue_depth = queue_depth; set->numa_node = NUMA_NO_NODE; set->flags = set_flags; + set->nr_static_rqs = 0; return blk_mq_alloc_tag_set(set); } EXPORT_SYMBOL_GPL(blk_mq_alloc_sq_tag_set); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index afe40766a912..5d4745901cce 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -522,7 +522,8 @@ struct blk_mq_tag_set { struct list_head tag_list; struct srcu_struct *srcu; - CK_KABI_RESERVE(1) + /* number of static alloc rqs if dyn_alloc flag is set */ + CK_KABI_REPLACE(CK_KABI_RESERVE(1), unsigned int nr_static_rqs) CK_KABI_RESERVE(2) CK_KABI_RESERVE(3) 
CK_KABI_RESERVE(4) @@ -685,6 +686,8 @@ enum { BLK_MQ_F_ALLOC_POLICY_START_BIT = 8, BLK_MQ_F_ALLOC_POLICY_BITS = 1, + BLK_MQ_F_DYN_ALLOC = 1 << 31, + BLK_MQ_S_STOPPED = 0, BLK_MQ_S_TAG_ACTIVE = 1, BLK_MQ_S_SCHED_RESTART = 2, -- Gitee From 79aa8e6c4e5d7dd6c05a79079936825f32fd1d74 Mon Sep 17 00:00:00 2001 From: Ferry Meng Date: Sat, 10 May 2025 09:30:00 +0800 Subject: [PATCH 20/28] anolis: virtio-blk: dyn request alloc when enabled vring pair ANBZ: #20938 Enable dyn request alloc if one block device enable vring pair. Besides, we also set BLK_MQ_F_NO_SCHED to avoid using tag scheduler. Providing an extra module param 'dyn_max_rqs' for virtio-blk driver. After enabling ring_pair mode, dyn_max_rqs will be the real queue_depth, original queue_depth indicates the amount of pre_alloc 'struct requests +pdu' in static_rqs. If param not set, use default value(16384). Signed-off-by: Ferry Meng --- drivers/block/virtio_blk.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index f383eb3db0f9..edf1ad5bbf3f 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -2666,6 +2666,12 @@ static const struct file_operations virtblk_chr_fops = { static unsigned int virtblk_queue_depth; module_param_named(queue_depth, virtblk_queue_depth, uint, 0444); +#ifdef CONFIG_VIRTIO_BLK_RING_PAIR +static unsigned short virtblk_dyn_max_rqs = 16384; +module_param_named(dyn_max_rqs, virtblk_dyn_max_rqs, short, 0444); +MODULE_PARM_DESC(dyn_max_rqs, "Max requests per rpair(0~65535), default 2^14"); +#endif + static int virtblk_probe(struct virtio_device *vdev) { struct virtio_blk *vblk; @@ -2732,17 +2738,28 @@ static int virtblk_probe(struct virtio_device *vdev) memset(&vblk->tag_set, 0, sizeof(vblk->tag_set)); #ifdef CONFIG_VIRTIO_BLK_RING_PAIR - vblk->tag_set.ops = vblk->ring_pair ? &virtio_mq_pair_ops : - &virtio_mq_ops; - vblk->tag_set.nr_hw_queues = vblk->ring_pair ? vblk->num_vqs / VIRTBLK_RING_NUM : - vblk->num_vqs; + if (vblk->ring_pair) { + vblk->tag_set.ops = &virtio_mq_pair_ops; + vblk->tag_set.nr_hw_queues = vblk->num_vqs / VIRTBLK_RING_NUM; + /* For ring pair, we don't want to use io scheduler. So we set + * NO_SCHED flag, in this case BLK_MQ_F_SHOULD_MERGE is unused. + */ + vblk->tag_set.flags = BLK_MQ_F_DYN_ALLOC | BLK_MQ_F_NO_SCHED; + vblk->tag_set.queue_depth = virtblk_dyn_max_rqs; + vblk->tag_set.nr_static_rqs = queue_depth; + } else { + vblk->tag_set.ops = &virtio_mq_ops; + vblk->tag_set.nr_hw_queues = vblk->num_vqs; + vblk->tag_set.queue_depth = queue_depth; + vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; + } #else vblk->tag_set.ops = &virtio_mq_ops; vblk->tag_set.nr_hw_queues = vblk->num_vqs; -#endif vblk->tag_set.queue_depth = queue_depth; - vblk->tag_set.numa_node = NUMA_NO_NODE; vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; +#endif + vblk->tag_set.numa_node = NUMA_NO_NODE; /* For bidirectional passthrough vblk request, both WRITE and READ * operations need pre-alloc inline SGs. So we should prealloc twice * the size than original ways. 
Due to the inability to predict whether -- Gitee From 4bd0f4b23a5f5b00de853f480bdab3002ab71721 Mon Sep 17 00:00:00 2001 From: Ferry Meng Date: Sat, 10 May 2025 09:30:00 +0800 Subject: [PATCH 21/28] anolis: virtio-blk: add trace events ANBZ: #20938 add trace events for virtio-blk Signed-off-by: Ferry Meng --- drivers/block/virtio_blk.c | 8 +++ include/trace/events/virtio_blk.h | 91 +++++++++++++++++++++++++++++++ 2 files changed, 99 insertions(+) create mode 100644 include/trace/events/virtio_blk.h diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index edf1ad5bbf3f..49e659389534 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -25,6 +25,9 @@ #include "virtio_blk_ext.c" #endif +#define CREATE_TRACE_POINTS +#include + #define PART_BITS 4 #define VQ_NAME_LEN 16 #define MAX_DISCARD_SEGMENTS 256u @@ -955,6 +958,7 @@ static inline void virtblk_request_done(struct request *req) blk_status_t status = virtblk_result(virtblk_vbr_status(vbr)); struct virtio_blk *vblk = req->mq_hctx->queue->queuedata; + trace_virtblk_request_done(req, vbr->in_hdr.status); virtblk_unmap_data(req, vbr); virtblk_cleanup_cmd(req); @@ -1072,6 +1076,7 @@ static blk_status_t virtblk_prep_rq_rpair(struct blk_mq_hw_ctx *hctx, return status; num = virtblk_map_data(hctx, req, vbr); + trace_virtio_prep_rq(req, vbr_is_bidirectional(vbr), num); if (unlikely(num < 0)) return virtblk_fail_to_queue(req, -ENOMEM); vbr->sg_table.nents = num; @@ -1094,6 +1099,7 @@ static blk_status_t virtblk_prep_rq(struct blk_mq_hw_ctx *hctx, return status; num = virtblk_map_data(hctx, req, vbr); + trace_virtio_prep_rq(req, vbr_is_bidirectional(vbr), num); if (unlikely(num < 0)) return virtblk_fail_to_queue(req, -ENOMEM); vbr->sg_table.nents = num; @@ -2525,6 +2531,8 @@ static int virtblk_uring_cmd_io(struct virtio_blk *vblk, WRITE_ONCE(ioucmd->cookie, req); } + trace_virtblk_uring_cmd_io(req, type, cmd->sector); + /* to free bio on completion, as req->bio will be null at that time */ pdu->bio = req->bio; req->end_io_data = ioucmd; diff --git a/include/trace/events/virtio_blk.h b/include/trace/events/virtio_blk.h new file mode 100644 index 000000000000..02d769b22bd6 --- /dev/null +++ b/include/trace/events/virtio_blk.h @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM virtio_blk + +#if !defined(_TRACE_VIRTIO_BLK_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_VIRTIO_BLK_H + +#include + +struct request; + +TRACE_EVENT(virtblk_request_done, + + TP_PROTO(struct request *req, u8 ret), + + TP_ARGS(req, ret), + + TP_STRUCT__entry( + __field(struct request *, req) + __field(u32, tag) + __field(int, qid) + __field(void *, end_io_data) + __field(u8, ret) + ), + + TP_fast_assign( + __entry->req = req; + __entry->tag = req->tag; + __entry->qid = req->q->id; + __entry->end_io_data = req->end_io_data; + __entry->ret = ret; + ), + + TP_printk("DONE: req=%p qid=%d tag=%d ret=%d ioucmd=%p", + __entry->req, __entry->qid, __entry->tag, + __entry->ret, __entry->end_io_data) +); + +TRACE_EVENT(virtblk_uring_cmd_io, + + TP_PROTO(struct request *req, u32 type, u64 sector), + + TP_ARGS(req, type, sector), + + TP_STRUCT__entry( + __field(struct request *, req) + __field(u32, tag) + __field(u32, type) + __field(u64, sector) + ), + + TP_fast_assign( + __entry->req = req; + __entry->tag = req->tag; + __entry->type = type; + __entry->sector = sector; + ), + + TP_printk("URING: req=%p tag=%d type=%d sector=%llu", + __entry->req, __entry->tag, __entry->type, + __entry->sector) +); 
+ +TRACE_EVENT(virtio_prep_rq, + + TP_PROTO(struct request *req, bool bid, int num), + + TP_ARGS(req, bid, num), + + TP_STRUCT__entry( + __field(struct request *, req) + __field(u32, tag) + __field(bool, bid) + __field(int, num) + ), + + TP_fast_assign( + __entry->req = req; + __entry->tag = req->tag; + __entry->bid = bid; + __entry->num = num; + ), + + TP_printk("QUEUE: req=%p tag=%d bid=%d sgs=%d", + __entry->req, __entry->tag, __entry->bid, __entry->num) +); + +#endif /* _TRACE_VIRTIO_BLK_H */ + +/* This part must be outside protection */ +#include -- Gitee From 5ed9ccfe1f6caa48ca53370698620103e5d400a2 Mon Sep 17 00:00:00 2001 From: Ferry Meng Date: Sat, 10 May 2025 09:30:00 +0800 Subject: [PATCH 22/28] anolis: virtio_ring: add split queue seq_printf interface ANBZ: #20938 Provide ability for split virtqueue to show descriptors condition. Pay attention to isolating concurrent access before call this function. Signed-off-by: Ferry Meng --- drivers/virtio/virtio_ring.c | 98 ++++++++++++++++++++++++++++++++++++ include/linux/virtio.h | 1 + 2 files changed, 99 insertions(+) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 6d6b493c1af0..3449af8d507a 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -15,6 +15,7 @@ #include #include #include +#include #ifdef DEBUG /* For development, we want to crash whenever the ring is screwed. */ @@ -3683,4 +3684,101 @@ void virtqueue_dma_sync_single_range_for_device(struct virtqueue *_vq, } EXPORT_SYMBOL_GPL(virtqueue_dma_sync_single_range_for_device); +/** + * virtqueue_show_split_message - print split queue structure + * @_vq: the struct virtqueue we're talking about. + * @s: the struct seq_file + * Before calling this function, get lock to confirm that + * the virtqueue is not in use. 
+ */ +void virtqueue_show_split_message(struct virtqueue *_vq, struct seq_file *s) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + struct vring_virtqueue_split *split = &vq->split; + u16 last_used_idx, used_idx, idx, idx_in_used_ring, flags; + struct vring_desc *desc; + int len, i; + + last_used_idx = vq->last_used_idx; + used_idx = virtio16_to_cpu(vq->vq.vdev, split->vring.used->idx); + + seq_printf(s, "Virtqueue %d (0x%px): num %d\n", _vq->index, + vq, split->vring.num); + seq_printf(s, "Descriptor Table: num_free %d, free_head %d\n", + _vq->num_free, vq->free_head); + seq_printf(s, "Available Ring: flags 0x%x, avail_idx %d\n", + split->avail_flags_shadow, split->vring.avail->idx); + seq_printf(s, "Used Ring: used %d, last_used_index %d\n", + used_idx, last_used_idx); + + if (last_used_idx == used_idx) + goto out; + + seq_puts(s, "---------- ---------------- -------\n"); + seq_puts(s, "USED_INDEX DESC_TABLE_INDEX DRVDATA\n"); + while (last_used_idx != used_idx) { + idx = last_used_idx & (split->vring.num - 1); + idx_in_used_ring = virtio32_to_cpu(vq->vq.vdev, + split->vring.used->ring[idx].id); + + seq_printf(s, "%10d %16d 0x%px\n", idx, idx_in_used_ring, + split->desc_state[idx_in_used_ring].data); + last_used_idx++; + } + seq_puts(s, "---------- ---------------- -------\n"); + last_used_idx = vq->last_used_idx; + while (last_used_idx != used_idx) { + idx = last_used_idx & (split->vring.num - 1); + idx_in_used_ring = virtio32_to_cpu(vq->vq.vdev, + split->vring.used->ring[idx].id); + + if (!vq->indirect) { + seq_printf(s, "Direct desc[%d]\n", idx_in_used_ring); + i = idx_in_used_ring; + do { + desc = &split->vring.desc[i]; + flags = virtio16_to_cpu(vq->vq.vdev, desc->flags); + + seq_printf(s, " desc[%d] ", i); + seq_printf(s, "dma_addr=0x%-16llx ", + virtio64_to_cpu(vq->vq.vdev, desc->addr)); + seq_printf(s, "flags=0x%-4x ", flags); + seq_printf(s, "len=%-8d ", + virtio32_to_cpu(vq->vq.vdev, desc->len)); + seq_printf(s, "next=%-4d\n", + virtio16_to_cpu(vq->vq.vdev, desc->next)); + i = desc->next; + } while (flags & VRING_DESC_F_NEXT); + } else { + desc = &split->vring.desc[idx_in_used_ring]; + len = split->desc_extra[idx_in_used_ring].len; + seq_printf(s, "P{0x%px} desc[%d]", desc, idx_in_used_ring); + seq_printf(s, "dma_addr=0x%-16llx len=%-8d\n", + virtio64_to_cpu(vq->vq.vdev, desc[i].addr), + virtio32_to_cpu(vq->vq.vdev, desc[i].len)); + + /* print indir_descs */ + desc = split->desc_state[idx_in_used_ring].indir_desc; + for (i = 0; i < len / sizeof(struct vring_desc); i++) { + seq_printf(s, " indir_desc[%d] ", i); + seq_printf(s, "dma_addr=0x%-16llx ", + virtio64_to_cpu(vq->vq.vdev, desc[i].addr)); + seq_printf(s, "flags=0x%-4x ", + virtio16_to_cpu(vq->vq.vdev, desc[i].flags)); + seq_printf(s, "len=%-8d ", + virtio32_to_cpu(vq->vq.vdev, desc[i].len)); + seq_printf(s, "next=%-4d\n", + virtio16_to_cpu(vq->vq.vdev, desc[i].next)); + } + } + last_used_idx++; + } + +out: + seq_puts(s, "=======================================\n"); + return; + +} +EXPORT_SYMBOL_GPL(virtqueue_show_split_message); + MODULE_LICENSE("GPL"); diff --git a/include/linux/virtio.h b/include/linux/virtio.h index a78542ac9733..7af722611255 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -263,4 +263,5 @@ void virtqueue_dma_sync_single_range_for_cpu(struct virtqueue *_vq, dma_addr_t a void virtqueue_dma_sync_single_range_for_device(struct virtqueue *_vq, dma_addr_t addr, unsigned long offset, size_t size, enum dma_data_direction dir); +void virtqueue_show_split_message(struct virtqueue *_vq, struct 
seq_file *s); #endif /* _LINUX_VIRTIO_H */ -- Gitee From 5305e6f571e91a72ff54c66f4e96f7297998a814 Mon Sep 17 00:00:00 2001 From: Ferry Meng Date: Sat, 10 May 2025 09:30:00 +0800 Subject: [PATCH 23/28] anolis: virtio-blk: add debugfs interface for ring_pair ANBZ: #20938 Providing debugfs interface for virtio-blk ring pair feature. Path is /sys/kernel/debug/block/vdX/insight/[rpair|virtqueue] For virtqueue, it shows desc info of every request stay in virtqueue. For rpair, we can see every unfinished I/O's indirect_desc, dma_addr and dma_len. Signed-off-by: Ferry Meng --- drivers/block/virtio_blk.c | 107 +++++++++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 49e659389534..7678ba315061 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -21,6 +21,7 @@ #include #include #include +#include #ifdef CONFIG_VIRTIO_BLK_RING_PAIR #include "virtio_blk_ext.c" #endif @@ -146,6 +147,11 @@ struct virtio_blk { /* saved indirect desc pointer, dma_addr and dma_len for SQ */ struct virtblk_indir_desc **indir_desc; #endif + +#ifdef CONFIG_DEBUG_FS + struct dentry *dbg_dir; +#endif + }; struct virtblk_req { @@ -2671,6 +2677,105 @@ static const struct file_operations virtblk_chr_fops = { .uring_cmd_iopoll = virtblk_chr_uring_cmd_iopoll, }; +#ifdef CONFIG_DEBUG_FS +static int virtblk_dbg_virtqueues_show(struct seq_file *s, void *unused) +{ + struct virtio_blk *vblk = s->private; + unsigned long flags; + int i; + + for (i = 0; i < vblk->num_vqs; i++) { + spin_lock_irqsave(&vblk->vqs[i].lock, flags); + virtqueue_show_split_message(vblk->vqs[i].vq, s); + spin_unlock_irqrestore(&vblk->vqs[i].lock, flags); + } + return 0; +} + +static int virtblk_dbg_virtqueues_open(struct inode *inode, struct file *file) +{ + return single_open(file, virtblk_dbg_virtqueues_show, inode->i_private); +} + +static const struct file_operations virtblk_dbg_virtqueue_ops = { + .open = virtblk_dbg_virtqueues_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +#ifdef CONFIG_VIRTIO_BLK_RING_PAIR +static int virtblk_dbg_rqs_show(struct seq_file *s, void *unused) +{ + struct virtio_blk *vblk = s->private; + struct virtblk_indir_desc *indir_desc; + int i, j; + + seq_printf(s, "ring_pair is %d\n", vblk->ring_pair); + if (!vblk->ring_pair) + return 0; + + for (i = 0; i < vblk->num_vqs / VIRTBLK_RING_NUM; i++) { + for (j = 0; j < vblk->tag_set.queue_depth; j++) { + indir_desc = &vblk->indir_desc[i][j]; + if (indir_desc->desc) { + seq_printf(s, "hctx %d, tag %d, desc 0x%px, ", + i / VIRTBLK_RING_NUM, j, + indir_desc->desc); + seq_printf(s, "dma_addr 0x%llx, len 0x%x\n", + indir_desc->dma_addr, indir_desc->len); + } + } + } + + return 0; +} + +static int virtblk_dbg_rqs_open(struct inode *inode, struct file *file) +{ + return single_open(file, virtblk_dbg_rqs_show, inode->i_private); +} + +static const struct file_operations virtblk_dbg_rqs_ops = { + .open = virtblk_dbg_rqs_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + +static int virtio_blk_dev_dbg_init(struct virtio_blk *vblk) +{ + struct dentry *dir, *parent_block_dir; + + parent_block_dir = vblk->disk->queue->debugfs_dir; + if (!parent_block_dir) + return -EIO; + + dir = debugfs_create_dir("insight", parent_block_dir); + if (IS_ERR(dir)) { + dev_err(&vblk->vdev->dev, "Failed to get debugfs dir for '%s'\n", + vblk->disk->disk_name); + return -EIO; + } + + debugfs_create_file("virtqueues", 0444, dir, vblk, 
&virtblk_dbg_virtqueue_ops);
+#ifdef CONFIG_VIRTIO_BLK_RING_PAIR
+	debugfs_create_file("rpair", 0444, dir, vblk, &virtblk_dbg_rqs_ops);
+#endif
+	vblk->dbg_dir = dir;
+	return 0;
+}
+
+static void virtblk_dev_dbg_close(struct virtio_blk *vblk)
+{
+	debugfs_remove_recursive(vblk->dbg_dir);
+}
+#else
+static int virtio_blk_dev_dbg_init(struct virtio_blk *vblk) { return 0; }
+static void virtblk_dev_dbg_close(struct virtio_blk *vblk) { }
+#endif
+
 static unsigned int virtblk_queue_depth;
 module_param_named(queue_depth, virtblk_queue_depth, uint, 0444);
@@ -3020,6 +3125,7 @@ static int virtblk_probe(struct virtio_device *vdev)
 	if (err)
 		goto out_cleanup_disk;
 
+	virtio_blk_dev_dbg_init(vblk);
 	WARN_ON(virtblk_cdev_add(vblk, &virtblk_chr_fops));
 
 	return 0;
@@ -3047,6 +3153,7 @@ static void virtblk_remove(struct virtio_device *vdev)
 {
 	struct virtio_blk *vblk = vdev->priv;
 
+	virtblk_dev_dbg_close(vblk);
 	/* Make sure no work handler is accessing the device. */
 	flush_work(&vblk->config_work);
--
Gitee

From 92798bd9410829df5eb7022ed999ec86674d41bf Mon Sep 17 00:00:00 2001
From: Ferry Meng
Date: Sat, 10 May 2025 09:30:00 +0800
Subject: [PATCH 24/28] anolis: blk-mq: bump max tag depth to 64K

ANBZ: #20938

For virtio-blk ring pair mode, the tag map can be huge, so increase the
maximum number of tags to the limit (1 << 16).

Signed-off-by: Ferry Meng
---
 include/linux/blk-mq.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 5d4745901cce..9401d15d8937 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -695,7 +695,7 @@ enum {
 	/* hw queue is inactive after all its CPUs become offline */
 	BLK_MQ_S_INACTIVE = 3,
 
-	BLK_MQ_MAX_DEPTH = 10240,
+	BLK_MQ_MAX_DEPTH = 65536,
 
 	BLK_MQ_CPU_WORK_BATCH = 8,
 };
--
Gitee

From 3c3ad8c103ad653e5381bcfeb9446a9e33a2362d Mon Sep 17 00:00:00 2001
From: Ferry Meng
Date: Sat, 10 May 2025 09:30:00 +0800
Subject: [PATCH 25/28] anolis: virtio-blk: choose right vq in map_queues for ring pair mode

ANBZ: #20938

For ring pair mode, only the odd virtqueues have callbacks (and
allocate irqs); for the even ones, ->get_vq_affinity() returns a NULL
mask, which makes blk_mq_virtio_map_queues() take its fallback path.

To avoid this, add a separate virtblk_map_queues() variant for
ring_pair that uses virtblk_qid_to_cq_qid() to map each hw_queue to its
CQ virtqueue, so the correct affinity mask is obtained.
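virtblk_qid_to_cq_qid() is introduced earlier in the series and its body is not shown here; under the assumption that each hardware queue i owns an SQ/CQ virtqueue pair laid out as (2*i, 2*i+1), it would behave roughly like this illustrative sketch:

/* Assumption for illustration: VIRTBLK_RING_NUM == 2 and the CQ is the
 * odd virtqueue of each pair, i.e. the one that owns the irq affinity.
 */
static inline unsigned int example_qid_to_cq_qid(unsigned int qid)
{
	return qid * 2 + 1;
}

Querying ->get_vq_affinity() on that CQ index is what keeps the ring-pair mapping from falling back to blk_mq_map_queues().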
Signed-off-by: Ferry Meng --- drivers/block/virtio_blk.c | 55 +++++++++++++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 7678ba315061..000c3476ad83 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -2232,6 +2232,59 @@ static const struct attribute_group *virtblk_attr_groups[] = { NULL, }; +#ifdef CONFIG_VIRTIO_BLK_RING_PAIR +void blk_mq_virtio_map_queues_rpair(struct blk_mq_queue_map *qmap, + struct virtio_device *vdev, int first_vec) +{ + const struct cpumask *mask; + unsigned int queue, cpu; + + if (!vdev->config->get_vq_affinity) + goto fallback; + + for (queue = 0; queue < qmap->nr_queues; queue++) { + mask = vdev->config->get_vq_affinity(vdev, first_vec + + virtblk_qid_to_cq_qid(queue)); + if (!mask) + goto fallback; + + for_each_cpu(cpu, mask) + qmap->mq_map[cpu] = qmap->queue_offset + queue; + } + + return; +fallback: + blk_mq_map_queues(qmap); +} + +static void virtblk_map_queues_rpair(struct blk_mq_tag_set *set) +{ + struct virtio_blk *vblk = set->driver_data; + int i, qoff; + + for (i = 0, qoff = 0; i < set->nr_maps; i++) { + struct blk_mq_queue_map *map = &set->map[i]; + + map->nr_queues = vblk->io_queues[i]; + map->queue_offset = qoff; + qoff += map->nr_queues; + + if (map->nr_queues == 0) + continue; + + /* + * Regular queues have interrupts and hence CPU affinity is + * defined by the core virtio code, but polling queues have + * no interrupts so we let the block layer assign CPU affinity. + */ + if (i == HCTX_TYPE_POLL) + blk_mq_map_queues(&set->map[i]); + else + blk_mq_virtio_map_queues_rpair(&set->map[i], vblk->vdev, 0); + } +} +#endif + static void virtblk_map_queues(struct blk_mq_tag_set *set) { struct virtio_blk *vblk = set->driver_data; @@ -2343,7 +2396,7 @@ static const struct blk_mq_ops virtio_mq_pair_ops = { .queue_rqs = virtio_queue_rqs_rpair, .commit_rqs = virtio_commit_rqs, .complete = virtblk_request_done, - .map_queues = virtblk_map_queues, + .map_queues = virtblk_map_queues_rpair, .poll = virtblk_poll_rpair, }; #endif -- Gitee From d20adb504583e115ce0983899bb3483773678f53 Mon Sep 17 00:00:00 2001 From: Ferry Meng Date: Tue, 17 Jun 2025 15:40:39 +0800 Subject: [PATCH 26/28] anolis: virtio-blk: make chardev only exposed to specific devices ANBZ: #20938 Considering that the current ring pair and passthrough features are only open to specific devices, we'd better hide the chardev interface for compatibility. Signed-off-by: Ferry Meng Reviewed-by: Joseph Qi Link: https://gitee.com/anolis/cloud-kernel/pulls/5446 --- drivers/block/virtio_blk.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 000c3476ad83..5d64c1399270 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -3179,7 +3179,11 @@ static int virtblk_probe(struct virtio_device *vdev) goto out_cleanup_disk; virtio_blk_dev_dbg_init(vblk); - WARN_ON(virtblk_cdev_add(vblk, &virtblk_chr_fops)); + +#ifdef CONFIG_VIRTIO_BLK_RING_PAIR + if (vblk->no_align) + WARN_ON(virtblk_cdev_add(vblk, &virtblk_chr_fops)); +#endif return 0; @@ -3210,7 +3214,10 @@ static void virtblk_remove(struct virtio_device *vdev) /* Make sure no work handler is accessing the device. 
*/ flush_work(&vblk->config_work); - virtblk_cdev_del(&vblk->cdev, &vblk->cdev_device); +#ifdef CONFIG_VIRTIO_BLK_RING_PAIR + if (vblk->no_align) + virtblk_cdev_del(&vblk->cdev, &vblk->cdev_device); +#endif del_gendisk(vblk->disk); blk_mq_free_tag_set(&vblk->tag_set); -- Gitee From 4adf668018d472eceb64ab91f98397f30238f44a Mon Sep 17 00:00:00 2001 From: Ferry Meng Date: Tue, 17 Jun 2025 20:30:33 +0800 Subject: [PATCH 27/28] anolis: virtio-blk: rename no_align to pt_enable ANBZ: #20938 As we hide generic chardev, the actual meaning of no_alignment bit is passthrough feature enable. So we rename it. Signed-off-by: Ferry Meng Reviewed-by: Joseph Qi Link: https://gitee.com/anolis/cloud-kernel/pulls/5446 --- drivers/block/virtio_blk.c | 16 ++++++++-------- drivers/block/virtio_blk_ext.c | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 5d64c1399270..bbcd516c2d72 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -142,7 +142,7 @@ struct virtio_blk { #ifdef CONFIG_VIRTIO_BLK_RING_PAIR bool ring_pair; - bool no_align; + bool pt_enable; bool hide_bdev; /* saved indirect desc pointer, dma_addr and dma_len for SQ */ struct virtblk_indir_desc **indir_desc; @@ -1831,9 +1831,9 @@ int check_ext_feature(struct virtio_blk *vblk, void __iomem *ioaddr, vblk->ring_pair = !!(*host_ext_features & VIRTIO_BLK_EXT_F_RING_PAIR); if (vblk->ring_pair) *guest_ext_features |= (VIRTIO_BLK_EXT_F_RING_PAIR); - vblk->no_align = !!(*host_ext_features & VIRTIO_BLK_EXT_F_RING_NO_ALIGN); - if (vblk->no_align) - *guest_ext_features |= (VIRTIO_BLK_EXT_F_RING_NO_ALIGN); + vblk->pt_enable = !!(*host_ext_features & VIRTIO_BLK_EXT_F_PT_ENABLE); + if (vblk->pt_enable) + *guest_ext_features |= (VIRTIO_BLK_EXT_F_PT_ENABLE); vblk->hide_bdev = !!(*host_ext_features & VIRTIO_BLK_EXT_F_HIDE_BLOCK); if (vblk->hide_bdev) *guest_ext_features |= (VIRTIO_BLK_EXT_F_HIDE_BLOCK); @@ -2028,7 +2028,7 @@ static int init_vq(struct virtio_blk *vblk) * to orginal use, so err needs a positive initial value */ vblk->ring_pair = false; - vblk->no_align = false; + vblk->pt_enable = false; vblk->hide_bdev = false; /* ext feature only support for virtio_blk over pci device currently */ @@ -3012,7 +3012,7 @@ static int virtblk_probe(struct virtio_device *vdev) blk_queue_max_segment_size(q, max_size); #ifdef CONFIG_VIRTIO_BLK_RING_PAIR - if (vblk->no_align) + if (vblk->pt_enable) blk_queue_dma_alignment(q, 0); #endif @@ -3181,7 +3181,7 @@ static int virtblk_probe(struct virtio_device *vdev) virtio_blk_dev_dbg_init(vblk); #ifdef CONFIG_VIRTIO_BLK_RING_PAIR - if (vblk->no_align) + if (vblk->pt_enable) WARN_ON(virtblk_cdev_add(vblk, &virtblk_chr_fops)); #endif @@ -3215,7 +3215,7 @@ static void virtblk_remove(struct virtio_device *vdev) flush_work(&vblk->config_work); #ifdef CONFIG_VIRTIO_BLK_RING_PAIR - if (vblk->no_align) + if (vblk->pt_enable) virtblk_cdev_del(&vblk->cdev, &vblk->cdev_device); #endif diff --git a/drivers/block/virtio_blk_ext.c b/drivers/block/virtio_blk_ext.c index 3846056f2b9a..0e996a9ee56c 100644 --- a/drivers/block/virtio_blk_ext.c +++ b/drivers/block/virtio_blk_ext.c @@ -7,7 +7,7 @@ /* ext feature bit definition */ #define VIRTIO_BLK_EXT_F_RING_PAIR (1U << 0) -#define VIRTIO_BLK_EXT_F_RING_NO_ALIGN (1U << 1) +#define VIRTIO_BLK_EXT_F_PT_ENABLE (1U << 1) #define VIRTIO_BLK_EXT_F_HIDE_BLOCK (1U << 2) #define VIRTIO_BLK_EXT_F_INVAL (-1) -- Gitee From f5959db372e8f0751bad440e86150edc67ecc4c6 Mon Sep 17 00:00:00 2001 From: Ferry Meng Date: 
Fri, 22 Aug 2025 15:50:49 +0800 Subject: [PATCH 28/28] anolis: virtio-blk: delete zone related ops in virtblk_setup_cmd_rpair() ANBZ: #20938 now ring_pair has no support for zoned ops, delete related codes. Signed-off-by: Ferry Meng --- drivers/block/virtio_blk.c | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index bbcd516c2d72..643810895ba9 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -785,9 +785,6 @@ static blk_status_t virtblk_setup_cmd_rpair(struct virtio_device *vdev, /* for ring_pair, tag is used and occupied high 16bit of ioprio*/ vbr->out_hdr.rpair.tag = cpu_to_virtio16(vdev, req->tag); - if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) && op_is_zone_mgmt(req_op(req))) - return BLK_STS_NOTSUPP; - switch (req_op(req)) { case REQ_OP_READ: type = VIRTIO_BLK_T_IN; @@ -810,30 +807,6 @@ static blk_status_t virtblk_setup_cmd_rpair(struct virtio_device *vdev, case REQ_OP_SECURE_ERASE: type = VIRTIO_BLK_T_SECURE_ERASE; break; - case REQ_OP_ZONE_OPEN: - type = VIRTIO_BLK_T_ZONE_OPEN; - sector = blk_rq_pos(req); - break; - case REQ_OP_ZONE_CLOSE: - type = VIRTIO_BLK_T_ZONE_CLOSE; - sector = blk_rq_pos(req); - break; - case REQ_OP_ZONE_FINISH: - type = VIRTIO_BLK_T_ZONE_FINISH; - sector = blk_rq_pos(req); - break; - case REQ_OP_ZONE_APPEND: - type = VIRTIO_BLK_T_ZONE_APPEND; - sector = blk_rq_pos(req); - in_hdr_len = sizeof(vbr->in_hdr.zone_append); - break; - case REQ_OP_ZONE_RESET: - type = VIRTIO_BLK_T_ZONE_RESET; - sector = blk_rq_pos(req); - break; - case REQ_OP_ZONE_RESET_ALL: - type = VIRTIO_BLK_T_ZONE_RESET_ALL; - break; case REQ_OP_DRV_IN: case REQ_OP_DRV_OUT: /* -- Gitee