From nobody Wed Oct 29 17:12:36 2025 Delivered-To: importer@patchew.org Received-SPF: pass (zoho.com: domain of gnu.org designates 208.118.235.17 as permitted sender) client-ip=208.118.235.17; envelope-from=qemu-devel-bounces+importer=patchew.org@nongnu.org; helo=lists.gnu.org; Authentication-Results: mx.zohomail.com; spf=pass (zoho.com: domain of gnu.org designates 208.118.235.17 as permitted sender) smtp.mailfrom=qemu-devel-bounces+importer=patchew.org@nongnu.org; dmarc=fail(p=none dis=none) header.from=virtuozzo.com Return-Path: Received: from lists.gnu.org (lists.gnu.org [208.118.235.17]) by mx.zohomail.com with SMTPS id 1524575517356169.78810376425804; Tue, 24 Apr 2018 06:11:57 -0700 (PDT) Received: from localhost ([::1]:58444 helo=lists.gnu.org) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1fAxjc-0001RX-9U for importer@patchew.org; Tue, 24 Apr 2018 09:11:52 -0400 Received: from eggs.gnu.org ([2001:4830:134:3::10]:35573) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1fAxgL-0007gm-6n for qemu-devel@nongnu.org; Tue, 24 Apr 2018 09:08:34 -0400 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1fAxgG-0005V0-Gy for qemu-devel@nongnu.org; Tue, 24 Apr 2018 09:08:29 -0400 Received: from relay.sw.ru ([185.231.240.75]:45498) by eggs.gnu.org with esmtps (TLS1.0:DHE_RSA_AES_256_CBC_SHA1:32) (Exim 4.71) (envelope-from ) id 1fAxgG-0005PC-8h; Tue, 24 Apr 2018 09:08:24 -0400 Received: from msk-vpn.virtuozzo.com ([195.214.232.6] helo=kvm.sw.ru) by relay.sw.ru with esmtp (Exim 4.90_1) (envelope-from ) id 1fAxgE-0007sT-5G; Tue, 24 Apr 2018 16:08:22 +0300 From: Vladimir Sementsov-Ogievskiy To: qemu-devel@nongnu.org, qemu-block@nongnu.org Date: Tue, 24 Apr 2018 16:08:21 +0300 Message-Id: <20180424130821.50987-4-vsementsov@virtuozzo.com> X-Mailer: git-send-email 2.11.1 In-Reply-To: <20180424130821.50987-1-vsementsov@virtuozzo.com> References: <20180424130821.50987-1-vsementsov@virtuozzo.com> 
X-detected-operating-system: by eggs.gnu.org: GNU/Linux 3.x [fuzzy] X-Received-From: 185.231.240.75 Subject: [Qemu-devel] [RFC 3/3] blk: add 'reconnect' error action X-BeenThere: qemu-devel@nongnu.org X-Mailman-Version: 2.1.21 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Cc: kwolf@redhat.com, vsementsov@virtuozzo.com, famz@redhat.com, armbru@redhat.com, mreitz@redhat.com, den@openvz.org, pbonzini@redhat.com Errors-To: qemu-devel-bounces+importer=patchew.org@nongnu.org Sender: "Qemu-devel" X-ZohoMail: RSF_0 Z_629925259 SPT_0 Content-Transfer-Encoding: quoted-printable MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" The new action works as follows: first, without stopping the VM, it tries to bdrv_reconnect several times, with a given pause before each attempt. Then, if reconnecting fails, it falls through to the 'stop' error action. TODO: - qapi docs - support other disks (only scsi here) - support block jobs - add configuration of the timeout and retry count parameters Signed-off-by: Vladimir Sementsov-Ogievskiy --- qapi/block-core.json | 4 ++-- block/block-backend.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++- hw/scsi/scsi-disk.c | 4 +++- 3 files changed, 52 insertions(+), 4 deletions(-) diff --git a/qapi/block-core.json b/qapi/block-core.json index c50517bff3..d4d87dbd4f 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -1028,7 +1028,7 @@ # Since: 1.3 ## { 'enum': 'BlockdevOnError', - 'data': ['report', 'ignore', 'enospc', 'stop', 'auto'] } + 'data': ['report', 'ignore', 'enospc', 'stop', 'auto', 'reconnect'] } =20 ## # @MirrorSyncMode: @@ -4351,7 +4351,7 @@ # Since: 2.1 ## { 'enum': 'BlockErrorAction', - 'data': [ 'ignore', 'report', 'stop' ] } + 'data': [ 'ignore', 'report', 'stop', 'reconnect' ] } =20 =20 ## diff --git a/block/block-backend.c b/block/block-backend.c index 681b240b12..81eb9a7bd0 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -89,6 +89,11 @@ struct BlockBackend { */ unsigned 
int in_flight; AioWait wait; + + bool reconnect_failed; /* TODO: worth tri-state variable? */ + bool reconnecting; + unsigned int reconnect_max; + uint64_t reconnect_ns; }; =20 typedef struct BlockBackendAIOCB { @@ -322,6 +327,8 @@ BlockBackend *blk_new(uint64_t perm, uint64_t shared_pe= rm) blk->refcnt =3D 1; blk->perm =3D perm; blk->shared_perm =3D shared_perm; + blk->reconnect_max =3D 10; /* TODO configure */ + blk->reconnect_ns =3D 5000000000; /* 5 seconds, TODO configure */ blk_set_enable_write_cache(blk, true); =20 block_acct_init(&blk->stats); @@ -1079,6 +1086,7 @@ void blk_iostatus_disable(BlockBackend *blk) =20 void blk_iostatus_reset(BlockBackend *blk) { + blk->reconnect_failed =3D false; if (blk_iostatus_is_enabled(blk)) { BlockDriverState *bs =3D blk_bs(blk); blk->iostatus =3D BLOCK_DEVICE_IO_STATUS_OK; @@ -1635,6 +1643,9 @@ BlockErrorAction blk_get_error_action(BlockBackend *b= lk, bool is_read, BlockdevOnError on_err =3D blk_get_on_error(blk, is_read); =20 switch (on_err) { + case BLOCKDEV_ON_ERROR_RECONNECT: + return blk->reconnect_failed ? BLOCK_ERROR_ACTION_STOP : + BLOCK_ERROR_ACTION_RECONNECT; case BLOCKDEV_ON_ERROR_ENOSPC: return (error =3D=3D ENOSPC) ? BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT; @@ -1665,6 +1676,29 @@ static void send_qmp_error_event(BlockBackend *blk, &error_abort); } =20 + +static void coroutine_fn blk_reconnect_co(void *opaque) +{ + BlockBackend *blk =3D opaque; + int i; + + for (i =3D 0; i < blk->reconnect_max; i++) { + int ret; + + qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, blk->reconnect_ns); + + ret =3D bdrv_reconnect(blk_bs(blk), NULL); + if (ret =3D=3D 0) { + blk->reconnecting =3D false; + blk_iostatus_reset(blk); + return; + } + } + + blk->reconnecting =3D false; + blk->reconnect_failed =3D true; +} + /* This is done by device models because, while the block layer knows * about the error, it does not know whether an operation comes from * the device or the block layer (from a job, for example). 
@@ -1674,7 +1708,19 @@ void blk_error_action(BlockBackend *blk, BlockErrorA= ction action, { assert(error >=3D 0); =20 - if (action =3D=3D BLOCK_ERROR_ACTION_STOP) { + if (action =3D=3D BLOCK_ERROR_ACTION_RECONNECT) { + Coroutine *co; + blk_iostatus_set_err(blk, error); + + if (blk->reconnecting || blk->reconnect_failed) { + return; + } + + blk->reconnecting =3D true; + + co =3D qemu_coroutine_create(blk_reconnect_co, blk); + aio_co_enter(blk_get_aio_context(blk), co); + } else if (action =3D=3D BLOCK_ERROR_ACTION_STOP) { /* First set the iostatus, so that "info block" returns an iostatus * that matches the events raised so far (an additional error iost= atus * is fine, but not a lost one). diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c index ded23d36ca..f1c166dfda 100644 --- a/hw/scsi/scsi-disk.c +++ b/hw/scsi/scsi-disk.c @@ -474,7 +474,9 @@ static bool scsi_handle_rw_error(SCSIDiskReq *r, int er= ror, bool acct_failed) } =20 blk_error_action(s->qdev.conf.blk, action, is_read, error); - if (action =3D=3D BLOCK_ERROR_ACTION_STOP) { + if (action =3D=3D BLOCK_ERROR_ACTION_STOP || + action =3D=3D BLOCK_ERROR_ACTION_RECONNECT) + { scsi_req_retry(&r->req); } return action !=3D BLOCK_ERROR_ACTION_IGNORE; --=20 2.11.1