The fleecing scheme works as follows: we want a kind of temporary
snapshot of the active drive A. We create a temporary image B with
B->backing = A, and then start backup(sync=none) from A to B. From
this point on, B reads as a point-in-time snapshot of A (A continues
to be the active drive, accepting guest I/O).

This scheme needs some additional synchronization between reads from
B and the backup COW operations; otherwise, the following situation
is theoretically possible (assume B is qcow2 and the client is an NBD
client reading from B):

1. The client starts reading: it takes the qcow2 mutex in
   qcow2_co_preadv() and gets as far as loading an L2 table (assume a
   cache miss).

2. A guest write arrives => backup COW => qcow2 write =>
   tries to take the qcow2 mutex => waits.

3. The L2 table is loaded; we see that the cluster is UNALLOCATED, go
   to "case QCOW2_CLUSTER_UNALLOCATED" and unlock the mutex before
   bdrv_co_preadv(bs->backing, ...).

4. With the mutex unlocked, the backup COW continues, the guest write
   finishes, and the cluster in the active disk A is changed.

5. Only now do we perform bdrv_co_preadv(bs->backing, ...), and we
   read the _new, updated_ data. (A simplified sketch of this qcow2
   code path follows.)

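For context, a simplified sketch of the qcow2 read path in question
(paraphrased from the qcow2_co_preadv() of this era; exact variable
names and error handling differ, so treat this as an illustration
rather than the authoritative code):

    case QCOW2_CLUSTER_UNALLOCATED:
        if (bs->backing) {
            /*
             * The qcow2 lock is dropped around the read from the
             * backing file: exactly the window in which the backup
             * COW write of steps 4-5 can slip in.
             */
            qemu_co_mutex_unlock(&s->lock);
            ret = bdrv_co_preadv(bs->backing, offset, cur_bytes,
                                 &hd_qiov, 0);
            qemu_co_mutex_lock(&s->lock);
            if (ret < 0) {
                goto fail;
            }
        } else {
            /* No backing file: unallocated clusters read as zeroes */
            qemu_iovec_memset(&hd_qiov, 0, 0, cur_bytes);
        }
        break;
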
To avoid this, let's make backup writes serializing, so that they
cannot intersect with reads from B.

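Roughly why the flags achieve this (paraphrased from
bdrv_aligned_pwritev() and bdrv_aligned_preadv() in block/io.c;
simplified, and details may differ between versions):

    /*
     * Write path: a BDRV_REQ_SERIALISING write is marked
     * serialising, so overlapping requests must wait for it.
     */
    if (flags & BDRV_REQ_SERIALISING) {
        mark_request_serialising(req, bdrv_get_cluster_size(bs));
    }

    /*
     * Read path: unless the read opted out with
     * BDRV_REQ_NO_SERIALISING, it waits for overlapping serialising
     * requests.
     */
    if (!(flags & BDRV_REQ_NO_SERIALISING)) {
        wait_serialising_requests(req);
    }

The COW write to B and an in-flight read from B therefore can no
longer interleave, while the job's read from the source keeps
BDRV_REQ_NO_SERIALISING, as before.
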
Note: we expand the range of handled cases from (sync=none and
B->backing = A) to just (A is in the backing chain of B), to finally
allow safe reading from B during backup in all cases where A is in
the backing chain of B, i.e. whenever B formally looks like a
point-in-time snapshot of A.

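(For reference, bdrv_chain_contains(), used below to detect this, is
just a walk up the backing chain; roughly, paraphrasing block.c:

    bool bdrv_chain_contains(BlockDriverState *top,
                             BlockDriverState *base)
    {
        while (top && top != base) {
            top = backing_bs(top);
        }
        return top != NULL;
    }

so bdrv_chain_contains(target, bs) covers B->backing = A as well as
longer chains.)
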
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
---
block/backup.c | 20 ++++++++++++++------
1 file changed, 14 insertions(+), 6 deletions(-)
 
diff --git a/block/backup.c b/block/backup.c
index f3e4e814b6..319fc922e8 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -47,6 +47,8 @@ typedef struct BackupBlockJob {
HBitmap *copy_bitmap;
bool use_copy_range;
int64_t copy_range_size;
+
+ bool serialize_target_writes;
} BackupBlockJob;
 
static const BlockJobDriver backup_job_driver;
@@ -102,6 +104,8 @@ static int coroutine_fn backup_cow_with_bounce_buffer(BackupBlockJob *job,
QEMUIOVector qiov;
BlockBackend *blk = job->common.blk;
int nbytes;
+ int read_flags = is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0;
+ int write_flags = job->serialize_target_writes ? BDRV_REQ_SERIALISING : 0;
 
hbitmap_reset(job->copy_bitmap, start / job->cluster_size, 1);
nbytes = MIN(job->cluster_size, job->len - start);
@@ -112,8 +116,7 @@ static int coroutine_fn backup_cow_with_bounce_buffer(BackupBlockJob *job,
iov.iov_len = nbytes;
qemu_iovec_init_external(&qiov, &iov, 1);
 
- ret = blk_co_preadv(blk, start, qiov.size, &qiov,
- is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0);
+ ret = blk_co_preadv(blk, start, qiov.size, &qiov, read_flags);
if (ret < 0) {
trace_backup_do_cow_read_fail(job, start, ret);
if (error_is_read) {
@@ -124,11 +127,11 @@ static int coroutine_fn backup_cow_with_bounce_buffer(BackupBlockJob *job,
 
if (qemu_iovec_is_zero(&qiov)) {
ret = blk_co_pwrite_zeroes(job->target, start,
- qiov.size, BDRV_REQ_MAY_UNMAP);
+ qiov.size, write_flags | BDRV_REQ_MAY_UNMAP);
} else {
ret = blk_co_pwritev(job->target, start,
- qiov.size, &qiov,
- job->compress ? BDRV_REQ_WRITE_COMPRESSED : 0);
+ qiov.size, &qiov, write_flags |
+ (job->compress ? BDRV_REQ_WRITE_COMPRESSED : 0));
}
if (ret < 0) {
trace_backup_do_cow_write_fail(job, start, ret);
@@ -156,6 +159,8 @@ static int coroutine_fn backup_cow_with_offload(BackupBlockJob *job,
int nr_clusters;
BlockBackend *blk = job->common.blk;
int nbytes;
+ int read_flags = is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0;
+ int write_flags = job->serialize_target_writes ? BDRV_REQ_SERIALISING : 0;
 
assert(QEMU_IS_ALIGNED(job->copy_range_size, job->cluster_size));
nbytes = MIN(job->copy_range_size, end - start);
@@ -163,7 +168,7 @@ static int coroutine_fn backup_cow_with_offload(BackupBlockJob *job,
hbitmap_reset(job->copy_bitmap, start / job->cluster_size,
nr_clusters);
ret = blk_co_copy_range(blk, start, job->target, start, nbytes,
- is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0, 0);
+ read_flags, write_flags);
if (ret < 0) {
trace_backup_do_cow_copy_range_fail(job, start, ret);
hbitmap_set(job->copy_bitmap, start / job->cluster_size,
@@ -701,6 +706,9 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
sync_bitmap : NULL;
job->compress = compress;
 
+ /* Detect image-fleecing (and similar) schemes */
+ job->serialize_target_writes = bdrv_chain_contains(target, bs);
+
/* If there is no backing file on the target, we cannot rely on COW if our
* backup cluster size is smaller than the target cluster size. Even for
* targets with a backing file, try to avoid COW if possible. */
--
2.11.1
On Mon, 07/09 19:37, Vladimir Sementsov-Ogievskiy wrote:
> [...]

I always thought mirror was by far the most complicated job, but now
backup is catching up quickly!

Reviewed-by: Fam Zheng <famz@redhat.com>