QCOW2: Add zeroes discard option

[RFC PATCH] QCOW2: Add zeroes discard option

Posted by Jean-Louis Dupond 2 years, 9 months ago

When we have a sparse qcow2 image and discard: unmap is enabled, there
is a lot of fragmentation in the image after some time. Surely on VM's
that do a lot of writes/deletes.
This causes the qcow2 image to grow even over 110% of its virtual size,
this because the free gaps in the image get to small to allocate new
continuous clusters.

There are multiple ways to properly resolve this. One way is to not
discard the blocks on a discard request, but just zero them. This causes
the allocation the still exist, and results in no gaps.
This should also cause a perfectly continuous image when using full
preallocation.

RFC because my knowledge of qcow2 is limited, and I think its best some
developer with qcow2 knowledge has a look at it :)
Tested and seems to zero the blocks correctly instead of unmapping them.

Fixes: https://gitlab.com/qemu-project/qemu/-/issues/1621
Signed-off-by: Jean-Louis Dupond <jean-louis@dupond.be>
---
 block.c                              |  2 +
 block/io.c                           |  2 +-
 block/qcow2-cluster.c                | 85 +++++++++++++++-------------
 include/block/block-common.h         |  1 +
 qapi/block-core.json                 |  3 +-
 qemu-nbd.c                           |  2 +-
 qemu-options.hx                      |  2 +-
 storage-daemon/qemu-storage-daemon.c |  2 +-
 8 files changed, 54 insertions(+), 45 deletions(-)

diff --git a/block.c b/block.c
index 5ec1a3897e..ed21d115dd 100644
--- a/block.c
+++ b/block.c
@@ -1146,6 +1146,8 @@ int bdrv_parse_discard_flags(const char *mode, int *flags)
         /* do nothing */
     } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
         *flags |= BDRV_O_UNMAP;
+    } else if (!strcmp(mode, "zeroes")) {
+        *flags |= BDRV_O_UNMAP_ZERO;
     } else {
         return -1;
     }
diff --git a/block/io.c b/block/io.c
index 6fa1993374..2afc4b56b0 100644
--- a/block/io.c
+++ b/block/io.c
@@ -2988,7 +2988,7 @@ int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset,
     }
 
     /* Do nothing if disabled.  */
-    if (!(bs->open_flags & BDRV_O_UNMAP)) {
+    if (!(bs->open_flags & BDRV_O_UNMAP) && !(bs->open_flags & BDRV_O_UNMAP_ZERO)) {
         return 0;
     }
 
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 39cda7f907..9e52537f50 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -1969,46 +1969,6 @@ static int discard_in_l2_slice(BlockDriverState *bs, uint64_t offset,
     return nb_clusters;
 }
 
-int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset,
-                          uint64_t bytes, enum qcow2_discard_type type,
-                          bool full_discard)
-{
-    BDRVQcow2State *s = bs->opaque;
-    uint64_t end_offset = offset + bytes;
-    uint64_t nb_clusters;
-    int64_t cleared;
-    int ret;
-
-    /* Caller must pass aligned values, except at image end */
-    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
-    assert(QEMU_IS_ALIGNED(end_offset, s->cluster_size) ||
-           end_offset == bs->total_sectors << BDRV_SECTOR_BITS);
-
-    nb_clusters = size_to_clusters(s, bytes);
-
-    s->cache_discards = true;
-
-    /* Each L2 slice is handled by its own loop iteration */
-    while (nb_clusters > 0) {
-        cleared = discard_in_l2_slice(bs, offset, nb_clusters, type,
-                                      full_discard);
-        if (cleared < 0) {
-            ret = cleared;
-            goto fail;
-        }
-
-        nb_clusters -= cleared;
-        offset += (cleared * s->cluster_size);
-    }
-
-    ret = 0;
-fail:
-    s->cache_discards = false;
-    qcow2_process_discards(bs, ret);
-
-    return ret;
-}
-
 /*
  * This zeroes as many clusters of nb_clusters as possible at once (i.e.
  * all clusters in the same L2 slice) and returns the number of zeroed
@@ -2069,6 +2029,51 @@ static int zero_in_l2_slice(BlockDriverState *bs, uint64_t offset,
     return nb_clusters;
 }
 
+int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset,
+                          uint64_t bytes, enum qcow2_discard_type type,
+                          bool full_discard)
+{
+    BDRVQcow2State *s = bs->opaque;
+    uint64_t end_offset = offset + bytes;
+    uint64_t nb_clusters;
+    int64_t cleared;
+    int ret;
+
+    /* Caller must pass aligned values, except at image end */
+    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
+    assert(QEMU_IS_ALIGNED(end_offset, s->cluster_size) ||
+           end_offset == bs->total_sectors << BDRV_SECTOR_BITS);
+
+    nb_clusters = size_to_clusters(s, bytes);
+
+    s->cache_discards = true;
+
+    /* Each L2 slice is handled by its own loop iteration */
+    while (nb_clusters > 0) {
+        cleared = 0;
+        if (bs->open_flags & BDRV_O_UNMAP_ZERO) {
+            cleared = zero_in_l2_slice(bs, offset, nb_clusters, 0);
+        } else {
+            cleared = discard_in_l2_slice(bs, offset, nb_clusters, type,
+                                        full_discard);
+        }
+        if (cleared < 0) {
+            ret = cleared;
+            goto fail;
+        }
+
+        nb_clusters -= cleared;
+        offset += (cleared * s->cluster_size);
+    }
+
+    ret = 0;
+fail:
+    s->cache_discards = false;
+    qcow2_process_discards(bs, ret);
+
+    return ret;
+}
+
 static int coroutine_fn
 zero_l2_subclusters(BlockDriverState *bs, uint64_t offset,
                     unsigned nb_subclusters)
diff --git a/include/block/block-common.h b/include/block/block-common.h
index b5122ef8ab..d90206087e 100644
--- a/include/block/block-common.h
+++ b/include/block/block-common.h
@@ -179,6 +179,7 @@ typedef enum {
 #define BDRV_O_AUTO_RDONLY 0x20000 /* degrade to read-only if opening
                                       read-write fails */
 #define BDRV_O_IO_URING    0x40000 /* use io_uring instead of the thread pool */
+#define BDRV_O_UNMAP_ZERO  0x80000  /* execute guest UNMAP/TRIM operations but zero instead of deref */
 
 #define BDRV_O_CACHE_MASK  (BDRV_O_NOCACHE | BDRV_O_NO_FLUSH)
 
diff --git a/qapi/block-core.json b/qapi/block-core.json
index c05ad0c07e..0f91d1a6b6 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -2971,11 +2971,12 @@
 #
 # @ignore: Ignore the request
 # @unmap: Forward as an unmap request
+# @zeroes: Zero the clusters instead of unmapping (since 8.1)
 #
 # Since: 2.9
 ##
 { 'enum': 'BlockdevDiscardOptions',
-  'data': [ 'ignore', 'unmap' ] }
+  'data': [ 'ignore', 'unmap', 'zeroes' ] }
 
 ##
 # @BlockdevDetectZeroesOptions:
diff --git a/qemu-nbd.c b/qemu-nbd.c
index 6ff45308a9..6c0b326db4 100644
--- a/qemu-nbd.c
+++ b/qemu-nbd.c
@@ -148,7 +148,7 @@ static void usage(const char *name)
 "                            valid options are: 'none', 'writeback' (default),\n"
 "                            'writethrough', 'directsync' and 'unsafe'\n"
 "      --aio=MODE            set AIO mode (native, io_uring or threads)\n"
-"      --discard=MODE        set discard mode (ignore, unmap)\n"
+"      --discard=MODE        set discard mode (ignore, unmap, zeroes)\n"
 "      --detect-zeroes=MODE  set detect-zeroes mode (off, on, unmap)\n"
 "      --image-opts          treat FILE as a full set of image options\n"
 "\n"
diff --git a/qemu-options.hx b/qemu-options.hx
index b5efa648ba..7e9d383499 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -1209,7 +1209,7 @@ SRST
 ERST
 
 DEF("blockdev", HAS_ARG, QEMU_OPTION_blockdev,
-    "-blockdev [driver=]driver[,node-name=N][,discard=ignore|unmap]\n"
+    "-blockdev [driver=]driver[,node-name=N][,discard=ignore|unmap|zeroes]\n"
     "          [,cache.direct=on|off][,cache.no-flush=on|off]\n"
     "          [,read-only=on|off][,auto-read-only=on|off]\n"
     "          [,force-share=on|off][,detect-zeroes=on|off|unmap]\n"
diff --git a/storage-daemon/qemu-storage-daemon.c b/storage-daemon/qemu-storage-daemon.c
index 0e9354faa6..08e8b1b3d9 100644
--- a/storage-daemon/qemu-storage-daemon.c
+++ b/storage-daemon/qemu-storage-daemon.c
@@ -86,7 +86,7 @@ static void help(void)
 "                         specify tracing options\n"
 "  -V, --version          output version information and exit\n"
 "\n"
-"  --blockdev [driver=]<driver>[,node-name=<N>][,discard=ignore|unmap]\n"
+"  --blockdev [driver=]<driver>[,node-name=<N>][,discard=ignore|unmap|zeroes]\n"
 "             [,cache.direct=on|off][,cache.no-flush=on|off]\n"
 "             [,read-only=on|off][,auto-read-only=on|off]\n"
 "             [,force-share=on|off][,detect-zeroes=on|off|unmap]\n"
-- 
2.40.1

Re: [RFC PATCH] QCOW2: Add zeroes discard option

Posted by Hanna Czenczek 2 years, 9 months ago

Thanks for the patch!

As a note for hopefully many future patches ( :) ), you should run 
scripts/checkpatch.pl on them to check for coding style issues. Here, it 
reports two lines that are longer than 80 characters.

Second, ideally, patches are not just sent to the qemu-devel list, but 
also to the maintainers for the code base in question (because they’re 
the ones who will have to look at patches eventually, and because 
qemu-devel has so much traffic, more often than not they don’t monitor 
it).  scripts/get_maintainer.pl can tell you who they are (including 
their email addresses).

On 09.05.23 11:01, Jean-Louis Dupond wrote:
> When we have a sparse qcow2 image and discard: unmap is enabled, there
> is a lot of fragmentation in the image after some time. Surely on VM's
> that do a lot of writes/deletes.
> This causes the qcow2 image to grow even over 110% of its virtual size,
> this because the free gaps in the image get to small to allocate new
> continuous clusters.
>
> There are multiple ways to properly resolve this. One way is to not
> discard the blocks on a discard request, but just zero them. This causes
> the allocation the still exist, and results in no gaps.
> This should also cause a perfectly continuous image when using full
> preallocation.

I think you should also mention why you want discard to work at all.  As 
far as I remember, it was that guests tend to prefer discard over 
write-zero when they don’t care about the data in some unused block, and 
you want those blocks to be efficiently copied when doing a back-up.  
You’ve explained why discard=unmap doesn’t work well in your case, so 
you want it to be a write-zero operation, which will still have 
everything be handled efficiently while retaining the allocation.

Did I get that right?

> RFC because my knowledge of qcow2 is limited, and I think its best some
> developer with qcow2 knowledge has a look at it :)
> Tested and seems to zero the blocks correctly instead of unmapping them.

Now that I’m actually reviewing this patch I wonder whether we need to 
do it in qcow2 at all.  Would it be simpler and better (because it would 
work not only for qcow2) to have bdrv_co_pdiscard() translate a discard 
request with UNMAP_ZERO into a bdrv_co_pwrite_zeroes()?

> Fixes: https://gitlab.com/qemu-project/qemu/-/issues/1621
> Signed-off-by: Jean-Louis Dupond <jean-louis@dupond.be>
> ---
>   block.c                              |  2 +
>   block/io.c                           |  2 +-
>   block/qcow2-cluster.c                | 85 +++++++++++++++-------------
>   include/block/block-common.h         |  1 +
>   qapi/block-core.json                 |  3 +-
>   qemu-nbd.c                           |  2 +-
>   qemu-options.hx                      |  2 +-
>   storage-daemon/qemu-storage-daemon.c |  2 +-
>   8 files changed, 54 insertions(+), 45 deletions(-)

I have a question for the patch as-is, though, too: What is supposed to 
happen if you use discard=zero on a non-qcow2 block device?  As far as I 
understand, they will just unmap the area then, not leaving it 
allocated.  But I think that isn’t the intention; I don’t think we want 
to have discard=zero fall back to discard=unmap, so for drivers that 
don’t support discard=zero, we should just do nothing and not call their 
discard function.  (We could achieve this by having bdrv_co_pdiscard() 
call bdrv_co_pwrite_zeroes(), as described above.)

I think the behavior we want should also be explicitly described in the 
documentation, as well as a guideline on when people may want to use 
this option over discard=unmap (i.e. what I’d also like to have in the 
commit message).

Finally, a technical thing: To keep the diff smaller, I wouldn’t move 
the whole qcow2_cluster_discard() function down, but instead 
forward-declare zero_in_l2_slice() above it.

Hanna

Re: [RFC PATCH] QCOW2: Add zeroes discard option

Posted by Jean-Louis Dupond 2 years, 9 months ago

Thanks for the feedback! :)

On 10/05/2023 12:45, Hanna Czenczek wrote:
> Thanks for the patch!
>
> As a note for hopefully many future patches ( :) ), you should run 
> scripts/checkpatch.pl on them to check for coding style issues. Here, 
> it reports two lines that are longer than 80 characters.
Fixed!
>
> Second, ideally, patches are not just sent to the qemu-devel list, but 
> also to the maintainers for the code base in question (because they’re 
> the ones who will have to look at patches eventually, and because 
> qemu-devel has so much traffic, more often than not they don’t monitor 
> it).  scripts/get_maintainer.pl can tell you who they are (including 
> their email addresses).
Check!
>
> On 09.05.23 11:01, Jean-Louis Dupond wrote:
>> When we have a sparse qcow2 image and discard: unmap is enabled, there
>> is a lot of fragmentation in the image after some time. Surely on VM's
>> that do a lot of writes/deletes.
>> This causes the qcow2 image to grow even over 110% of its virtual size,
>> this because the free gaps in the image get to small to allocate new
>> continuous clusters.
>>
>> There are multiple ways to properly resolve this. One way is to not
>> discard the blocks on a discard request, but just zero them. This causes
>> the allocation the still exist, and results in no gaps.
>> This should also cause a perfectly continuous image when using full
>> preallocation.
>
> I think you should also mention why you want discard to work at all.  
> As far as I remember, it was that guests tend to prefer discard over 
> write-zero when they don’t care about the data in some unused block, 
> and you want those blocks to be efficiently copied when doing a 
> back-up.  You’ve explained why discard=unmap doesn’t work well in your 
> case, so you want it to be a write-zero operation, which will still 
> have everything be handled efficiently while retaining the allocation.
>
> Did I get that right?

The main thing discard is needed is indeed to keep the incremental 
backup size low.
Without doing discards, the blocks are just still there in the qcow2 
image, and marked as dirty by the dirty-block-tracking. But as the 
blocks still contain data, this causes big incremental backups, which 
are not wanted.
The way to resolve this is to discard, which causes the blocks to be 
removed and not included in the incremental backup.

>
>> RFC because my knowledge of qcow2 is limited, and I think its best some
>> developer with qcow2 knowledge has a look at it :)
>> Tested and seems to zero the blocks correctly instead of unmapping them.
>
> Now that I’m actually reviewing this patch I wonder whether we need to 
> do it in qcow2 at all.  Would it be simpler and better (because it 
> would work not only for qcow2) to have bdrv_co_pdiscard() translate a 
> discard request with UNMAP_ZERO into a bdrv_co_pwrite_zeroes()?
>
While the use-case here is currently only relevant on qcow2, I think its 
not an issue to do just the same on other block backends.
So it's a good idea to just move this into io.c
>> Fixes: https://gitlab.com/qemu-project/qemu/-/issues/1621
>> Signed-off-by: Jean-Louis Dupond <jean-louis@dupond.be>
>> ---
>>   block.c                              |  2 +
>>   block/io.c                           |  2 +-
>>   block/qcow2-cluster.c                | 85 +++++++++++++++-------------
>>   include/block/block-common.h         |  1 +
>>   qapi/block-core.json                 |  3 +-
>>   qemu-nbd.c                           |  2 +-
>>   qemu-options.hx                      |  2 +-
>>   storage-daemon/qemu-storage-daemon.c |  2 +-
>>   8 files changed, 54 insertions(+), 45 deletions(-)
>
> I have a question for the patch as-is, though, too: What is supposed 
> to happen if you use discard=zero on a non-qcow2 block device?  As far 
> as I understand, they will just unmap the area then, not leaving it 
> allocated.  But I think that isn’t the intention; I don’t think we 
> want to have discard=zero fall back to discard=unmap, so for drivers 
> that don’t support discard=zero, we should just do nothing and not 
> call their discard function.  (We could achieve this by having 
> bdrv_co_pdiscard() call bdrv_co_pwrite_zeroes(), as described above.)
>
> I think the behavior we want should also be explicitly described in 
> the documentation, as well as a guideline on when people may want to 
> use this option over discard=unmap (i.e. what I’d also like to have in 
> the commit message).
>
> Finally, a technical thing: To keep the diff smaller, I wouldn’t move 
> the whole qcow2_cluster_discard() function down, but instead 
> forward-declare zero_in_l2_slice() above it.

I'll post a new patch!

>
> Hanna
>

Thanks
Jean-Louis