From 3ab10a5ad9bf1cbf3b4603f5a930a7924a07ad5a Mon Sep 17 00:00:00 2001 From: Yan Wang Date: Tue, 29 Mar 2022 12:05:56 +0800 Subject: [PATCH 1/2] scsi-bus: fix incorrect call for blk_error_retry_reset_timeout() Fix commit 52115ca0 ("scsi-disk: Add support for retry on errors"). Call Stack: ... scsi_read_data() scsi_do_read(r, 0) scsi_disk_req_check_error() blk_error_retry_reset_timeout() blk->retry_start_time = 0; This causes an IO hang when the storage network is disconnected: before the storage network has recovered, the call stack above resets retry_start_time, so the next IO operation does not return immediately. Signed-off-by: Yan Wang --- hw/scsi/scsi-disk.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c index 8661932a15..a66d2b0a98 100644 --- a/hw/scsi/scsi-disk.c +++ b/hw/scsi/scsi-disk.c @@ -255,10 +255,8 @@ static bool scsi_handle_rw_error(SCSIDiskReq *r, int ret, bool acct_failed) } } -static bool scsi_disk_req_check_error(SCSIDiskReq *r, int ret, bool acct_failed) +static bool scsi_disk_req_handle_error(SCSIDiskReq *r, int ret, bool acct_failed) { - SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev); - if (r->req.io_canceled) { scsi_req_cancel_complete(&r->req); return true; @@ -268,6 +266,17 @@ static bool scsi_disk_req_check_error(SCSIDiskReq *r, int ret, bool acct_failed) return scsi_handle_rw_error(r, ret, acct_failed); } + return false; +} + +static bool scsi_disk_req_check_error(SCSIDiskReq *r, int ret, bool acct_failed) +{ + SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev); + + if (r->req.io_canceled || ret < 0) { + return scsi_disk_req_handle_error(r, ret, acct_failed); + } + blk_error_retry_reset_timeout(s->qdev.conf.blk); return false; } @@ -418,7 +427,7 @@ static void scsi_do_read(SCSIDiskReq *r, int ret) SCSIDiskClass *sdc = (SCSIDiskClass *) object_get_class(OBJECT(s)); assert (r->req.aiocb == NULL); - if (scsi_disk_req_check_error(r, 
ret, false)) { + if (scsi_disk_req_handle_error(r, ret, false)) { goto done; } @@ -458,6 +467,9 @@ static void scsi_do_read_cb(void *opaque, int ret) block_acct_failed(blk_get_stats(s->qdev.conf.blk), &r->acct); } else { block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct); + if (!r->req.io_canceled) { + blk_error_retry_reset_timeout(s->qdev.conf.blk); + } } scsi_do_read(opaque, ret); aio_context_release(blk_get_aio_context(s->qdev.conf.blk)); -- Gitee From e42b57adeac96c7d39b1c032ab3b66b7eff18cc8 Mon Sep 17 00:00:00 2001 From: Yan Wang Date: Tue, 29 Mar 2022 15:18:56 +0800 Subject: [PATCH 2/2] Revert "monitor: limit io error qmp event to at most once per 60s" This reverts commit 44f45b5c163efed5387dac40e229e0a50bf5921a. Commit 44f45b5c reduces the IO-hang related logging, which is useful for diagnosing the problem. Signed-off-by: Yan Wang --- monitor/monitor.c | 1 - 1 file changed, 1 deletion(-) diff --git a/monitor/monitor.c b/monitor/monitor.c index 28206bedc4..257ef4ee54 100644 --- a/monitor/monitor.c +++ b/monitor/monitor.c @@ -301,7 +301,6 @@ static MonitorQAPIEventConf monitor_qapi_event_conf[QAPI_EVENT__MAX] = { [QAPI_EVENT_QUORUM_FAILURE] = { 1000 * SCALE_MS }, [QAPI_EVENT_VSERPORT_CHANGE] = { 1000 * SCALE_MS }, [QAPI_EVENT_MEMORY_DEVICE_SIZE_CHANGE] = { 1000 * SCALE_MS }, - [QAPI_EVENT_BLOCK_IO_ERROR] = { 60L * 1000 * SCALE_MS }, }; /* -- Gitee