After failover the Secondary side of replication shouldn't change state, because
it now functions as our primary disk.
In replication_start, replication_do_checkpoint, replication_stop, ignore
the request if the current state is BLOCK_REPLICATION_DONE (successful
failover) or BLOCK_REPLICATION_FAILOVER (failover in progress, i.e. currently
merging active and hidden images into the base image).
Signed-off-by: Lukas Straub <lukasstraub2@web.de>
---
block/replication.c | 38 +++++++++++++++++++++++++++++++++++---
1 file changed, 35 insertions(+), 3 deletions(-)
diff --git a/block/replication.c b/block/replication.c
index 3d4dedddfc..97cc65c0cf 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -454,6 +454,17 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
aio_context_acquire(aio_context);
s = bs->opaque;
+ if (s->stage == BLOCK_REPLICATION_DONE ||
+ s->stage == BLOCK_REPLICATION_FAILOVER) {
+ /*
+ * This case happens when a secondary is promoted to primary.
+ * Ignore the request because the secondary side of replication
+ * doesn't have to do anything anymore.
+ */
+ aio_context_release(aio_context);
+ return;
+ }
+
if (s->stage != BLOCK_REPLICATION_NONE) {
error_setg(errp, "Block replication is running or done");
aio_context_release(aio_context);
@@ -529,8 +540,7 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
"Block device is in use by internal backup job");
top_bs = bdrv_lookup_bs(s->top_id, s->top_id, NULL);
- if (!top_bs || !bdrv_is_root_node(top_bs) ||
- !check_top_bs(top_bs, bs)) {
+ if (!top_bs || !check_top_bs(top_bs, bs)) {
error_setg(errp, "No top_bs or it is invalid");
reopen_backing_file(bs, false, NULL);
aio_context_release(aio_context);
@@ -577,6 +587,17 @@ static void replication_do_checkpoint(ReplicationState *rs, Error **errp)
aio_context_acquire(aio_context);
s = bs->opaque;
+ if (s->stage == BLOCK_REPLICATION_DONE ||
+ s->stage == BLOCK_REPLICATION_FAILOVER) {
+ /*
+ * This case happens when a secondary was promoted to primary.
+ * Ignore the request because the secondary side of replication
+ * doesn't have to do anything anymore.
+ */
+ aio_context_release(aio_context);
+ return;
+ }
+
if (s->mode == REPLICATION_MODE_SECONDARY) {
secondary_do_checkpoint(s, errp);
}
@@ -593,7 +614,7 @@ static void replication_get_error(ReplicationState *rs, Error **errp)
aio_context_acquire(aio_context);
s = bs->opaque;
- if (s->stage != BLOCK_REPLICATION_RUNNING) {
+ if (s->stage == BLOCK_REPLICATION_NONE) {
error_setg(errp, "Block replication is not running");
aio_context_release(aio_context);
return;
@@ -635,6 +656,17 @@ static void replication_stop(ReplicationState *rs, bool failover, Error **errp)
aio_context_acquire(aio_context);
s = bs->opaque;
+ if (s->stage == BLOCK_REPLICATION_DONE ||
+ s->stage == BLOCK_REPLICATION_FAILOVER) {
+ /*
+ * This case happens when a secondary was promoted to primary.
+ * Ignore the request because the secondary side of replication
+ * doesn't have to do anything anymore.
+ */
+ aio_context_release(aio_context);
+ return;
+ }
+
if (s->stage != BLOCK_REPLICATION_RUNNING) {
error_setg(errp, "Block replication is not running");
aio_context_release(aio_context);
--
2.20.1
> -----Original Message-----
> From: Lukas Straub <lukasstraub2@web.de>
> Sent: Monday, September 16, 2019 3:20 AM
> To: qemu-devel <qemu-devel@nongnu.org>
> Cc: Zhang, Chen <chen.zhang@intel.com>; Jason Wang
> <jasowang@redhat.com>; Wen Congyang <wencongyang2@huawei.com>;
> Xie Changlong <xiechanglong.d@gmail.com>; kwolf@redhat.com;
> mreitz@redhat.com
> Subject: [PATCH v5 1/4] block/replication.c: Ignore requests after failover
>
> After failover the Secondary side of replication shouldn't change state,
> because it now functions as our primary disk.
>
> In replication_start, replication_do_checkpoint, replication_stop, ignore the
> request if the current state is BLOCK_REPLICATION_DONE (successful failover)
> or BLOCK_REPLICATION_FAILOVER (failover in progress, i.e. currently merging
> active and hidden images into the base image).
>
It looks good to me, and this patch works well in COLO continuous backup status in my tests.
Reviewed-by: Zhang Chen <chen.zhang@intel.com>
But I think this patch still needs to be reviewed by Xie Changlong or Kevin Wolf.
Thanks
Zhang Chen
> Signed-off-by: Lukas Straub <lukasstraub2@web.de>
> ---
> block/replication.c | 38 +++++++++++++++++++++++++++++++++++---
> 1 file changed, 35 insertions(+), 3 deletions(-)
>
> diff --git a/block/replication.c b/block/replication.c
> index 3d4dedddfc..97cc65c0cf 100644
> --- a/block/replication.c
> +++ b/block/replication.c
> @@ -454,6 +454,17 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
> aio_context_acquire(aio_context);
> s = bs->opaque;
>
> + if (s->stage == BLOCK_REPLICATION_DONE ||
> + s->stage == BLOCK_REPLICATION_FAILOVER) {
> + /*
> + * This case happens when a secondary is promoted to primary.
> + * Ignore the request because the secondary side of replication
> + * doesn't have to do anything anymore.
> + */
> + aio_context_release(aio_context);
> + return;
> + }
> +
> if (s->stage != BLOCK_REPLICATION_NONE) {
> error_setg(errp, "Block replication is running or done");
> aio_context_release(aio_context);
> @@ -529,8 +540,7 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
> "Block device is in use by internal backup job");
>
> top_bs = bdrv_lookup_bs(s->top_id, s->top_id, NULL);
> - if (!top_bs || !bdrv_is_root_node(top_bs) ||
> - !check_top_bs(top_bs, bs)) {
> + if (!top_bs || !check_top_bs(top_bs, bs)) {
> error_setg(errp, "No top_bs or it is invalid");
> reopen_backing_file(bs, false, NULL);
> aio_context_release(aio_context);
> @@ -577,6 +587,17 @@ static void replication_do_checkpoint(ReplicationState *rs, Error **errp)
> aio_context_acquire(aio_context);
> s = bs->opaque;
>
> + if (s->stage == BLOCK_REPLICATION_DONE ||
> + s->stage == BLOCK_REPLICATION_FAILOVER) {
> + /*
> + * This case happens when a secondary was promoted to primary.
> + * Ignore the request because the secondary side of replication
> + * doesn't have to do anything anymore.
> + */
> + aio_context_release(aio_context);
> + return;
> + }
> +
> if (s->mode == REPLICATION_MODE_SECONDARY) {
> secondary_do_checkpoint(s, errp);
> }
> @@ -593,7 +614,7 @@ static void replication_get_error(ReplicationState *rs, Error **errp)
> aio_context_acquire(aio_context);
> s = bs->opaque;
>
> - if (s->stage != BLOCK_REPLICATION_RUNNING) {
> + if (s->stage == BLOCK_REPLICATION_NONE) {
> error_setg(errp, "Block replication is not running");
> aio_context_release(aio_context);
> return;
> @@ -635,6 +656,17 @@ static void replication_stop(ReplicationState *rs, bool failover, Error **errp)
> aio_context_acquire(aio_context);
> s = bs->opaque;
>
> + if (s->stage == BLOCK_REPLICATION_DONE ||
> + s->stage == BLOCK_REPLICATION_FAILOVER) {
> + /*
> + * This case happens when a secondary was promoted to primary.
> + * Ignore the request because the secondary side of replication
> + * doesn't have to do anything anymore.
> + */
> + aio_context_release(aio_context);
> + return;
> + }
> +
> if (s->stage != BLOCK_REPLICATION_RUNNING) {
> error_setg(errp, "Block replication is not running");
> aio_context_release(aio_context);
> --
> 2.20.1
© 2016 - 2025 Red Hat, Inc.