[PATCH 3/5] block: introduce preallocate filter

Vladimir Sementsov-Ogievskiy posted 5 patches 5 years, 7 months ago
Maintainers: Stefan Hajnoczi <stefanha@redhat.com>, Kevin Wolf <kwolf@redhat.com>, Markus Armbruster <armbru@redhat.com>, Fam Zheng <fam@euphon.net>, Max Reitz <mreitz@redhat.com>, Eric Blake <eblake@redhat.com>
There is a newer version of this series
[PATCH 3/5] block: introduce preallocate filter
Posted by Vladimir Sementsov-Ogievskiy 5 years, 7 months ago
It may be used for file-systems with slow allocation.

Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
---
 qapi/block-core.json |   3 +-
 block/preallocate.c  | 255 +++++++++++++++++++++++++++++++++++++++++++
 block/Makefile.objs  |   1 +
 3 files changed, 258 insertions(+), 1 deletion(-)
 create mode 100644 block/preallocate.c

diff --git a/qapi/block-core.json b/qapi/block-core.json
index 0e1c6a59f2..a0bda399d6 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -2805,7 +2805,7 @@
             'cloop', 'compress', 'copy-on-read', 'dmg', 'file', 'ftp', 'ftps',
             'gluster', 'host_cdrom', 'host_device', 'http', 'https', 'iscsi',
             'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels',
-            'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
+            'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
             { 'name': 'replication', 'if': 'defined(CONFIG_REPLICATION)' },
             'sheepdog',
             'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', 'vxhs' ] }
@@ -3995,6 +3995,7 @@
       'null-co':    'BlockdevOptionsNull',
       'nvme':       'BlockdevOptionsNVMe',
       'parallels':  'BlockdevOptionsGenericFormat',
+      'preallocate':'BlockdevOptionsGenericFormat',
       'qcow2':      'BlockdevOptionsQcow2',
       'qcow':       'BlockdevOptionsQcow',
       'qed':        'BlockdevOptionsGenericCOWFormat',
diff --git a/block/preallocate.c b/block/preallocate.c
new file mode 100644
index 0000000000..c272a6e41d
--- /dev/null
+++ b/block/preallocate.c
@@ -0,0 +1,255 @@
+/*
+ * preallocate filter driver
+ *
+ * The driver performs preallocate operation: it is injected above
+ * some node, and before each write over EOF it does additional preallocating
+ * write-zeroes request.
+ *
+ * Copyright (c) 2020 Virtuozzo International GmbH.
+ *
+ * Author:
+ *  Sementsov-Ogievskiy Vladimir <vsementsov@virtuozzo.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+
+#include "qemu/module.h"
+#include "qemu/units.h"
+#include "block/block_int.h"
+
+
+typedef struct BDRVPreallocateState {
+    int64_t prealloc_size;
+    int64_t prealloc_align;
+
+    /*
+     * Track real data end, to crop preallocation on close  data_end may be
+     * negative, which means that actual status is unknown (nothing cropped in
+     * this case)
+     */
+    int64_t data_end;
+} BDRVPreallocateState;
+
+
+static int preallocate_open(BlockDriverState *bs, QDict *options, int flags,
+                            Error **errp)
+{
+    BDRVPreallocateState *s = bs->opaque;
+
+    /*
+     * Parameters are hardcoded now. May need to add corresponding options in
+     * future.
+     */
+    s->prealloc_align = 1 * MiB;
+    s->prealloc_size = 128 * MiB;
+
+    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
+                               BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY,
+                               false, errp);
+    if (!bs->file) {
+        return -EINVAL;
+    }
+
+    s->data_end = bdrv_getlength(bs->file->bs);
+    if (s->data_end < 0) {
+        return s->data_end;
+    }
+
+    bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED |
+        (BDRV_REQ_FUA & bs->file->bs->supported_write_flags);
+
+    bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED |
+        ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) &
+            bs->file->bs->supported_zero_flags);
+
+    return 0;
+}
+
+static void preallocate_close(BlockDriverState *bs)
+{
+    BDRVPreallocateState *s = bs->opaque;
+
+    if (s->data_end >= 0 && bdrv_getlength(bs->file->bs) > s->data_end) {
+        bdrv_truncate(bs->file, s->data_end, true, PREALLOC_MODE_OFF, 0, NULL);
+    }
+}
+
+static void preallocate_child_perm(BlockDriverState *bs, BdrvChild *c,
+                                   BdrvChildRole role,
+                                   BlockReopenQueue *reopen_queue,
+                                   uint64_t perm, uint64_t shared,
+                                   uint64_t *nperm, uint64_t *nshared)
+{
+    bdrv_default_perms(bs, c, role, reopen_queue, perm, shared, nperm, nshared);
+
+    /* Force RESIZE permission, to be able to crop file on close() */
+    *nperm |= BLK_PERM_RESIZE;
+}
+
+static coroutine_fn int preallocate_co_preadv_part(
+        BlockDriverState *bs, uint64_t offset, uint64_t bytes,
+        QEMUIOVector *qiov, size_t qiov_offset, int flags)
+{
+    return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset,
+                               flags);
+}
+
+static int coroutine_fn preallocate_co_pdiscard(BlockDriverState *bs,
+                                               int64_t offset, int bytes)
+{
+    return bdrv_co_pdiscard(bs->file, offset, bytes);
+}
+
+static bool coroutine_fn do_preallocate(BlockDriverState *bs, int64_t offset,
+                                       int64_t bytes, bool write_zero)
+{
+    BDRVPreallocateState *s = bs->opaque;
+    int64_t len, start, end;
+    BdrvTrackedRequest *lock;
+    int ret;
+
+    if (s->data_end >= 0) {
+        s->data_end = MAX(s->data_end,
+                          QEMU_ALIGN_UP(offset + bytes, BDRV_SECTOR_SIZE));
+    }
+
+    len = bdrv_getlength(bs->file->bs);
+    if (len < 0) {
+        return false;
+    }
+
+    if (s->data_end < 0) {
+        s->data_end = MAX(len,
+                          QEMU_ALIGN_UP(offset + bytes, BDRV_SECTOR_SIZE));
+    }
+
+    if (offset + bytes <= len) {
+        return false;
+    }
+
+    lock = bdrv_co_range_try_lock(bs->file->bs, len, INT64_MAX - len);
+    if (!lock) {
+        /* There are already preallocating requests in-fligth */
+        return false;
+    }
+
+    /* Length should not have changed */
+    assert(len == bdrv_getlength(bs->file->bs));
+
+    start = write_zero ? MIN(offset, len) : len;
+    end = QEMU_ALIGN_UP(offset + bytes + s->prealloc_size, s->prealloc_align);
+
+    ret = bdrv_co_pwrite_zeroes_locked(bs->file, start, end - start,
+                                       BDRV_REQ_NO_FALLBACK, lock);
+
+    bdrv_co_range_unlock(lock);
+
+    return !ret;
+}
+
+static int coroutine_fn preallocate_co_pwrite_zeroes(BlockDriverState *bs,
+        int64_t offset, int bytes, BdrvRequestFlags flags)
+{
+    if (do_preallocate(bs, offset, bytes, true)) {
+        return 0;
+    }
+
+    return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags);
+}
+
+static coroutine_fn int preallocate_co_pwritev_part(BlockDriverState *bs,
+                                                    uint64_t offset,
+                                                    uint64_t bytes,
+                                                    QEMUIOVector *qiov,
+                                                    size_t qiov_offset,
+                                                    int flags)
+{
+    do_preallocate(bs, offset, bytes, false);
+
+    return bdrv_co_pwritev_part(bs->file, offset, bytes, qiov, qiov_offset,
+                                flags);
+}
+
+static int coroutine_fn
+preallocate_co_truncate(BlockDriverState *bs, int64_t offset,
+                        bool exact, PreallocMode prealloc,
+                        BdrvRequestFlags flags, Error **errp)
+{
+    BDRVPreallocateState *s = bs->opaque;
+    int ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, flags, errp);
+
+    /* s->data_end may become negative here, which means unknown data end */
+    s->data_end = bdrv_getlength(bs->file->bs);
+
+    return ret;
+}
+
+static int coroutine_fn preallocate_co_flush(BlockDriverState *bs)
+{
+    if (!bs->file) {
+        return 0;
+    }
+
+    return bdrv_co_flush(bs->file->bs);
+}
+
+static int64_t preallocate_getlength(BlockDriverState *bs)
+{
+    /*
+     * We probably can return s->data_end here, but seems safer to return real
+     * file length, not trying to hide the preallocation.
+     *
+     * Still, don't miss the chance to restore s->data_end if it is broken.
+     */
+    BDRVPreallocateState *s = bs->opaque;
+    int64_t ret = bdrv_getlength(bs->file->bs);
+
+    if (s->data_end < 0) {
+        s->data_end = ret;
+    }
+
+    return ret;
+}
+
+BlockDriver bdrv_preallocate_filter = {
+    .format_name = "preallocate",
+    .instance_size = sizeof(BDRVPreallocateState),
+
+    .bdrv_getlength = preallocate_getlength,
+    .bdrv_open = preallocate_open,
+    .bdrv_close = preallocate_close,
+
+    .bdrv_co_preadv_part = preallocate_co_preadv_part,
+    .bdrv_co_pwritev_part = preallocate_co_pwritev_part,
+    .bdrv_co_pwrite_zeroes = preallocate_co_pwrite_zeroes,
+    .bdrv_co_pdiscard = preallocate_co_pdiscard,
+    .bdrv_co_flush = preallocate_co_flush,
+    .bdrv_co_truncate = preallocate_co_truncate,
+
+    .bdrv_co_block_status = bdrv_co_block_status_from_file,
+
+    .bdrv_child_perm = preallocate_child_perm,
+
+    .has_variable_length = true,
+    .is_filter = true,
+};
+
+static void bdrv_preallocate_init(void)
+{
+    bdrv_register(&bdrv_preallocate_filter);
+}
+
+block_init(bdrv_preallocate_init);
diff --git a/block/Makefile.objs b/block/Makefile.objs
index 3635b6b4c1..f46a353a35 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -45,6 +45,7 @@ block-obj-y += crypto.o
 block-obj-y += aio_task.o
 block-obj-y += backup-top.o
 block-obj-y += filter-compress.o
+block-obj-y += preallocate.o
 common-obj-y += monitor/
 
 block-obj-y += stream.o
-- 
2.18.0


Re: [PATCH 3/5] block: introduce preallocate filter
Posted by Stefan Hajnoczi 5 years, 7 months ago
On Sat, Jun 20, 2020 at 05:36:47PM +0300, Vladimir Sementsov-Ogievskiy wrote:
> It may be used for file-systems with slow allocation.
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
> ---
>  qapi/block-core.json |   3 +-
>  block/preallocate.c  | 255 +++++++++++++++++++++++++++++++++++++++++++
>  block/Makefile.objs  |   1 +
>  3 files changed, 258 insertions(+), 1 deletion(-)
>  create mode 100644 block/preallocate.c

Please add documentation to docs/system/qemu-block-drivers.rst.inc
describing the purpose of this block driver and how to use it.

Since this filter grows the file I guess it's intended to be below an
image format?

> diff --git a/qapi/block-core.json b/qapi/block-core.json
> index 0e1c6a59f2..a0bda399d6 100644
> --- a/qapi/block-core.json
> +++ b/qapi/block-core.json
> @@ -2805,7 +2805,7 @@
>              'cloop', 'compress', 'copy-on-read', 'dmg', 'file', 'ftp', 'ftps',
>              'gluster', 'host_cdrom', 'host_device', 'http', 'https', 'iscsi',
>              'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels',
> -            'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
> +            'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
>              { 'name': 'replication', 'if': 'defined(CONFIG_REPLICATION)' },
>              'sheepdog',
>              'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', 'vxhs' ] }
> @@ -3995,6 +3995,7 @@
>        'null-co':    'BlockdevOptionsNull',
>        'nvme':       'BlockdevOptionsNVMe',
>        'parallels':  'BlockdevOptionsGenericFormat',
> +      'preallocate':'BlockdevOptionsGenericFormat',
>        'qcow2':      'BlockdevOptionsQcow2',
>        'qcow':       'BlockdevOptionsQcow',
>        'qed':        'BlockdevOptionsGenericCOWFormat',
> diff --git a/block/preallocate.c b/block/preallocate.c
> new file mode 100644
> index 0000000000..c272a6e41d
> --- /dev/null
> +++ b/block/preallocate.c
> @@ -0,0 +1,255 @@
> +/*
> + * preallocate filter driver
> + *
> + * The driver performs preallocate operation: it is injected above
> + * some node, and before each write over EOF it does additional preallocating
> + * write-zeroes request.
> + *
> + * Copyright (c) 2020 Virtuozzo International GmbH.
> + *
> + * Author:
> + *  Sementsov-Ogievskiy Vladimir <vsementsov@virtuozzo.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program. If not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "qemu/osdep.h"
> +
> +#include "qemu/module.h"
> +#include "qemu/units.h"
> +#include "block/block_int.h"
> +
> +
> +typedef struct BDRVPreallocateState {
> +    int64_t prealloc_size;
> +    int64_t prealloc_align;
> +
> +    /*
> +     * Track real data end, to crop preallocation on close  data_end may be
> +     * negative, which means that actual status is unknown (nothing cropped in
> +     * this case)
> +     */
> +    int64_t data_end;
> +} BDRVPreallocateState;
> +
> +
> +static int preallocate_open(BlockDriverState *bs, QDict *options, int flags,
> +                            Error **errp)
> +{
> +    BDRVPreallocateState *s = bs->opaque;
> +
> +    /*
> +     * Parameters are hardcoded now. May need to add corresponding options in
> +     * future.
> +     */

The code for .bdrv_open() options is quick to write. If you add the
options right away then it will be much easier for users who need to
tweak them in the future.

> +    s->prealloc_align = 1 * MiB;
> +    s->prealloc_size = 128 * MiB;
> +
> +    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
> +                               BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY,
> +                               false, errp);
> +    if (!bs->file) {
> +        return -EINVAL;
> +    }
> +
> +    s->data_end = bdrv_getlength(bs->file->bs);
> +    if (s->data_end < 0) {
> +        return s->data_end;
> +    }
> +
> +    bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED |
> +        (BDRV_REQ_FUA & bs->file->bs->supported_write_flags);
> +
> +    bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED |
> +        ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) &
> +            bs->file->bs->supported_zero_flags);
> +
> +    return 0;
> +}
> +
> +static void preallocate_close(BlockDriverState *bs)
> +{
> +    BDRVPreallocateState *s = bs->opaque;
> +
> +    if (s->data_end >= 0 && bdrv_getlength(bs->file->bs) > s->data_end) {
> +        bdrv_truncate(bs->file, s->data_end, true, PREALLOC_MODE_OFF, 0, NULL);
> +    }
> +}
> +
> +static void preallocate_child_perm(BlockDriverState *bs, BdrvChild *c,
> +                                   BdrvChildRole role,
> +                                   BlockReopenQueue *reopen_queue,
> +                                   uint64_t perm, uint64_t shared,
> +                                   uint64_t *nperm, uint64_t *nshared)
> +{
> +    bdrv_default_perms(bs, c, role, reopen_queue, perm, shared, nperm, nshared);
> +
> +    /* Force RESIZE permission, to be able to crop file on close() */
> +    *nperm |= BLK_PERM_RESIZE;
> +}
> +
> +static coroutine_fn int preallocate_co_preadv_part(
> +        BlockDriverState *bs, uint64_t offset, uint64_t bytes,
> +        QEMUIOVector *qiov, size_t qiov_offset, int flags)
> +{
> +    return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset,
> +                               flags);
> +}
> +
> +static int coroutine_fn preallocate_co_pdiscard(BlockDriverState *bs,
> +                                               int64_t offset, int bytes)
> +{
> +    return bdrv_co_pdiscard(bs->file, offset, bytes);
> +}
> +
> +static bool coroutine_fn do_preallocate(BlockDriverState *bs, int64_t offset,
> +                                       int64_t bytes, bool write_zero)
> +{
> +    BDRVPreallocateState *s = bs->opaque;
> +    int64_t len, start, end;
> +    BdrvTrackedRequest *lock;
> +    int ret;
> +
> +    if (s->data_end >= 0) {
> +        s->data_end = MAX(s->data_end,
> +                          QEMU_ALIGN_UP(offset + bytes, BDRV_SECTOR_SIZE));
> +    }
> +
> +    len = bdrv_getlength(bs->file->bs);
> +    if (len < 0) {
> +        return false;
> +    }
> +
> +    if (s->data_end < 0) {
> +        s->data_end = MAX(len,
> +                          QEMU_ALIGN_UP(offset + bytes, BDRV_SECTOR_SIZE));
> +    }
> +
> +    if (offset + bytes <= len) {
> +        return false;
> +    }
> +
> +    lock = bdrv_co_range_try_lock(bs->file->bs, len, INT64_MAX - len);
> +    if (!lock) {
> +        /* There are already preallocating requests in-fligth */

s/fligth/flight/

> +        return false;
> +    }
> +
> +    /* Length should not have changed */
> +    assert(len == bdrv_getlength(bs->file->bs));
> +
> +    start = write_zero ? MIN(offset, len) : len;
> +    end = QEMU_ALIGN_UP(offset + bytes + s->prealloc_size, s->prealloc_align);
> +
> +    ret = bdrv_co_pwrite_zeroes_locked(bs->file, start, end - start,
> +                                       BDRV_REQ_NO_FALLBACK, lock);
> +
> +    bdrv_co_range_unlock(lock);

Hmm...if this piece of code is the only user of bdrv_co_range_try_lock()
then a BDRV_REQ_NO_WAIT flag might be a simpler API.

I thought the lock request would be used to perform multiple operations,
but if it's just for a single operation then I think it's less code and
easier to understand without the lock request.

> +
> +    return !ret;
> +}
> +
> +static int coroutine_fn preallocate_co_pwrite_zeroes(BlockDriverState *bs,
> +        int64_t offset, int bytes, BdrvRequestFlags flags)
> +{
> +    if (do_preallocate(bs, offset, bytes, true)) {
> +        return 0;
> +    }
> +
> +    return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags);
> +}
> +
> +static coroutine_fn int preallocate_co_pwritev_part(BlockDriverState *bs,
> +                                                    uint64_t offset,
> +                                                    uint64_t bytes,
> +                                                    QEMUIOVector *qiov,
> +                                                    size_t qiov_offset,
> +                                                    int flags)
> +{
> +    do_preallocate(bs, offset, bytes, false);
> +
> +    return bdrv_co_pwritev_part(bs->file, offset, bytes, qiov, qiov_offset,
> +                                flags);
> +}
> +
> +static int coroutine_fn
> +preallocate_co_truncate(BlockDriverState *bs, int64_t offset,
> +                        bool exact, PreallocMode prealloc,
> +                        BdrvRequestFlags flags, Error **errp)
> +{
> +    BDRVPreallocateState *s = bs->opaque;
> +    int ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, flags, errp);
> +
> +    /* s->data_end may become negative here, which means unknown data end */
> +    s->data_end = bdrv_getlength(bs->file->bs);
> +
> +    return ret;
> +}
> +
> +static int coroutine_fn preallocate_co_flush(BlockDriverState *bs)
> +{
> +    if (!bs->file) {
> +        return 0;
> +    }

When does this happen? It's surprising to see the !bs->file check here
but not in other functions.

> +
> +    return bdrv_co_flush(bs->file->bs);
> +}
> +
> +static int64_t preallocate_getlength(BlockDriverState *bs)
> +{
> +    /*
> +     * We probably can return s->data_end here, but seems safer to return real
> +     * file length, not trying to hide the preallocation.
> +     *
> +     * Still, don't miss the chance to restore s->data_end if it is broken.
> +     */
> +    BDRVPreallocateState *s = bs->opaque;
> +    int64_t ret = bdrv_getlength(bs->file->bs);
> +
> +    if (s->data_end < 0) {
> +        s->data_end = ret;
> +    }
> +
> +    return ret;
> +}
> +
> +BlockDriver bdrv_preallocate_filter = {
> +    .format_name = "preallocate",
> +    .instance_size = sizeof(BDRVPreallocateState),
> +
> +    .bdrv_getlength = preallocate_getlength,
> +    .bdrv_open = preallocate_open,
> +    .bdrv_close = preallocate_close,
> +
> +    .bdrv_co_preadv_part = preallocate_co_preadv_part,
> +    .bdrv_co_pwritev_part = preallocate_co_pwritev_part,
> +    .bdrv_co_pwrite_zeroes = preallocate_co_pwrite_zeroes,
> +    .bdrv_co_pdiscard = preallocate_co_pdiscard,
> +    .bdrv_co_flush = preallocate_co_flush,
> +    .bdrv_co_truncate = preallocate_co_truncate,
> +
> +    .bdrv_co_block_status = bdrv_co_block_status_from_file,
> +
> +    .bdrv_child_perm = preallocate_child_perm,
> +
> +    .has_variable_length = true,
> +    .is_filter = true,
> +};
> +
> +static void bdrv_preallocate_init(void)
> +{
> +    bdrv_register(&bdrv_preallocate_filter);
> +}
> +
> +block_init(bdrv_preallocate_init);
> diff --git a/block/Makefile.objs b/block/Makefile.objs
> index 3635b6b4c1..f46a353a35 100644
> --- a/block/Makefile.objs
> +++ b/block/Makefile.objs
> @@ -45,6 +45,7 @@ block-obj-y += crypto.o
>  block-obj-y += aio_task.o
>  block-obj-y += backup-top.o
>  block-obj-y += filter-compress.o
> +block-obj-y += preallocate.o
>  common-obj-y += monitor/
>  
>  block-obj-y += stream.o
> -- 
> 2.18.0
> 
Re: [PATCH 3/5] block: introduce preallocate filter
Posted by Vladimir Sementsov-Ogievskiy 5 years, 7 months ago
08.07.2020 15:07, Stefan Hajnoczi wrote:
> On Sat, Jun 20, 2020 at 05:36:47PM +0300, Vladimir Sementsov-Ogievskiy wrote:
>> It may be used for file-systems with slow allocation.
>>
>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>> ---
>>   qapi/block-core.json |   3 +-
>>   block/preallocate.c  | 255 +++++++++++++++++++++++++++++++++++++++++++
>>   block/Makefile.objs  |   1 +
>>   3 files changed, 258 insertions(+), 1 deletion(-)
>>   create mode 100644 block/preallocate.c
> 
> Please add documentation to docs/system/qemu-block-drivers.rst.inc
> describing the purpose of this block driver and how to use it.

This implies adding new section "Filters", yes?

> 
> Since this filter grows the file I guess it's intended to be below an
> image format?

Yes, between format and protocol nodes.

> 
>> diff --git a/qapi/block-core.json b/qapi/block-core.json
>> index 0e1c6a59f2..a0bda399d6 100644
>> --- a/qapi/block-core.json
>> +++ b/qapi/block-core.json
>> @@ -2805,7 +2805,7 @@
>>               'cloop', 'compress', 'copy-on-read', 'dmg', 'file', 'ftp', 'ftps',
>>               'gluster', 'host_cdrom', 'host_device', 'http', 'https', 'iscsi',
>>               'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels',
>> -            'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
>> +            'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
>>               { 'name': 'replication', 'if': 'defined(CONFIG_REPLICATION)' },
>>               'sheepdog',
>>               'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', 'vxhs' ] }
>> @@ -3995,6 +3995,7 @@
>>         'null-co':    'BlockdevOptionsNull',
>>         'nvme':       'BlockdevOptionsNVMe',
>>         'parallels':  'BlockdevOptionsGenericFormat',
>> +      'preallocate':'BlockdevOptionsGenericFormat',
>>         'qcow2':      'BlockdevOptionsQcow2',
>>         'qcow':       'BlockdevOptionsQcow',
>>         'qed':        'BlockdevOptionsGenericCOWFormat',
>> diff --git a/block/preallocate.c b/block/preallocate.c
>> new file mode 100644
>> index 0000000000..c272a6e41d
>> --- /dev/null
>> +++ b/block/preallocate.c
>> @@ -0,0 +1,255 @@
>> +/*
>> + * preallocate filter driver
>> + *
>> + * The driver performs preallocate operation: it is injected above
>> + * some node, and before each write over EOF it does additional preallocating
>> + * write-zeroes request.
>> + *
>> + * Copyright (c) 2020 Virtuozzo International GmbH.
>> + *
>> + * Author:
>> + *  Sementsov-Ogievskiy Vladimir <vsementsov@virtuozzo.com>
>> + *
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License as published by
>> + * the Free Software Foundation; either version 2 of the License, or
>> + * (at your option) any later version.
>> + *
>> + * This program is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> + * GNU General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU General Public License
>> + * along with this program. If not, see <http://www.gnu.org/licenses/>.
>> + */
>> +
>> +#include "qemu/osdep.h"
>> +
>> +#include "qemu/module.h"
>> +#include "qemu/units.h"
>> +#include "block/block_int.h"
>> +
>> +
>> +typedef struct BDRVPreallocateState {
>> +    int64_t prealloc_size;
>> +    int64_t prealloc_align;
>> +
>> +    /*
>> +     * Track real data end, to crop preallocation on close  data_end may be
>> +     * negative, which means that actual status is unknown (nothing cropped in
>> +     * this case)
>> +     */
>> +    int64_t data_end;
>> +} BDRVPreallocateState;
>> +
>> +
>> +static int preallocate_open(BlockDriverState *bs, QDict *options, int flags,
>> +                            Error **errp)
>> +{
>> +    BDRVPreallocateState *s = bs->opaque;
>> +
>> +    /*
>> +     * Parameters are hardcoded now. May need to add corresponding options in
>> +     * future.
>> +     */
> 
> The code for .bdrv_open() options is quick to write. If you add the
> options right away then it will be much easier for users who need to
> tweak them in the future.

OK

> 
>> +    s->prealloc_align = 1 * MiB;
>> +    s->prealloc_size = 128 * MiB;
>> +
>> +    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
>> +                               BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY,
>> +                               false, errp);
>> +    if (!bs->file) {
>> +        return -EINVAL;
>> +    }
>> +
>> +    s->data_end = bdrv_getlength(bs->file->bs);
>> +    if (s->data_end < 0) {
>> +        return s->data_end;
>> +    }
>> +
>> +    bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED |
>> +        (BDRV_REQ_FUA & bs->file->bs->supported_write_flags);
>> +
>> +    bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED |
>> +        ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) &
>> +            bs->file->bs->supported_zero_flags);
>> +
>> +    return 0;
>> +}
>> +
>> +static void preallocate_close(BlockDriverState *bs)
>> +{
>> +    BDRVPreallocateState *s = bs->opaque;
>> +
>> +    if (s->data_end >= 0 && bdrv_getlength(bs->file->bs) > s->data_end) {
>> +        bdrv_truncate(bs->file, s->data_end, true, PREALLOC_MODE_OFF, 0, NULL);
>> +    }
>> +}
>> +
>> +static void preallocate_child_perm(BlockDriverState *bs, BdrvChild *c,
>> +                                   BdrvChildRole role,
>> +                                   BlockReopenQueue *reopen_queue,
>> +                                   uint64_t perm, uint64_t shared,
>> +                                   uint64_t *nperm, uint64_t *nshared)
>> +{
>> +    bdrv_default_perms(bs, c, role, reopen_queue, perm, shared, nperm, nshared);
>> +
>> +    /* Force RESIZE permission, to be able to crop file on close() */
>> +    *nperm |= BLK_PERM_RESIZE;
>> +}
>> +
>> +static coroutine_fn int preallocate_co_preadv_part(
>> +        BlockDriverState *bs, uint64_t offset, uint64_t bytes,
>> +        QEMUIOVector *qiov, size_t qiov_offset, int flags)
>> +{
>> +    return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset,
>> +                               flags);
>> +}
>> +
>> +static int coroutine_fn preallocate_co_pdiscard(BlockDriverState *bs,
>> +                                               int64_t offset, int bytes)
>> +{
>> +    return bdrv_co_pdiscard(bs->file, offset, bytes);
>> +}
>> +
>> +static bool coroutine_fn do_preallocate(BlockDriverState *bs, int64_t offset,
>> +                                       int64_t bytes, bool write_zero)
>> +{
>> +    BDRVPreallocateState *s = bs->opaque;
>> +    int64_t len, start, end;
>> +    BdrvTrackedRequest *lock;
>> +    int ret;
>> +
>> +    if (s->data_end >= 0) {
>> +        s->data_end = MAX(s->data_end,
>> +                          QEMU_ALIGN_UP(offset + bytes, BDRV_SECTOR_SIZE));
>> +    }
>> +
>> +    len = bdrv_getlength(bs->file->bs);
>> +    if (len < 0) {
>> +        return false;
>> +    }
>> +
>> +    if (s->data_end < 0) {
>> +        s->data_end = MAX(len,
>> +                          QEMU_ALIGN_UP(offset + bytes, BDRV_SECTOR_SIZE));
>> +    }
>> +
>> +    if (offset + bytes <= len) {
>> +        return false;
>> +    }
>> +
>> +    lock = bdrv_co_range_try_lock(bs->file->bs, len, INT64_MAX - len);
>> +    if (!lock) {
>> +        /* There are already preallocating requests in-fligth */
> 
> s/fligth/flight/
> 
>> +        return false;
>> +    }
>> +
>> +    /* Length should not have changed */
>> +    assert(len == bdrv_getlength(bs->file->bs));
>> +
>> +    start = write_zero ? MIN(offset, len) : len;
>> +    end = QEMU_ALIGN_UP(offset + bytes + s->prealloc_size, s->prealloc_align);
>> +
>> +    ret = bdrv_co_pwrite_zeroes_locked(bs->file, start, end - start,
>> +                                       BDRV_REQ_NO_FALLBACK, lock);
>> +
>> +    bdrv_co_range_unlock(lock);
> 
> Hmm...if this piece of code is the only user of bdrv_co_range_try_lock()
> then a BDRV_REQ_NO_WAIT flag might be a simpler API.
> 
> I thought the lock request would be used to perform multiple operations,
> but if it's just for a single operation then I think it's less code and
> easier to understand without the lock request.

Hmm, again, I don't remember exact reasons. Firstly, I was afraid of length
change during try_lock and have a double check for bdrv_getlength(). Then
I decided that it's impossible and change the check to an assertion.
Probably, the only reason to leave locked range was "I already have the code,
it will help with copy-on-read, why not to use it".. OK, I'll try rewrite it
with help of new flag.

> 
>> +
>> +    return !ret;
>> +}
>> +
>> +static int coroutine_fn preallocate_co_pwrite_zeroes(BlockDriverState *bs,
>> +        int64_t offset, int bytes, BdrvRequestFlags flags)
>> +{
>> +    if (do_preallocate(bs, offset, bytes, true)) {
>> +        return 0;
>> +    }
>> +
>> +    return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags);
>> +}
>> +
>> +static coroutine_fn int preallocate_co_pwritev_part(BlockDriverState *bs,
>> +                                                    uint64_t offset,
>> +                                                    uint64_t bytes,
>> +                                                    QEMUIOVector *qiov,
>> +                                                    size_t qiov_offset,
>> +                                                    int flags)
>> +{
>> +    do_preallocate(bs, offset, bytes, false);
>> +
>> +    return bdrv_co_pwritev_part(bs->file, offset, bytes, qiov, qiov_offset,
>> +                                flags);
>> +}
>> +
>> +static int coroutine_fn
>> +preallocate_co_truncate(BlockDriverState *bs, int64_t offset,
>> +                        bool exact, PreallocMode prealloc,
>> +                        BdrvRequestFlags flags, Error **errp)
>> +{
>> +    BDRVPreallocateState *s = bs->opaque;
>> +    int ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, flags, errp);
>> +
>> +    /* s->data_end may become negative here, which means unknown data end */
>> +    s->data_end = bdrv_getlength(bs->file->bs);
>> +
>> +    return ret;
>> +}
>> +
>> +static int coroutine_fn preallocate_co_flush(BlockDriverState *bs)
>> +{
>> +    if (!bs->file) {
>> +        return 0;
>> +    }
> 
> When does this happen? It's surprising to see the !bs->file check here
> but not in other functions.

It's just done line in mirror-top and backup-top.. But seems there should not be such an issue. Will drop.

> 
>> +
>> +    return bdrv_co_flush(bs->file->bs);
>> +}
>> +
>> +static int64_t preallocate_getlength(BlockDriverState *bs)
>> +{
>> +    /*
>> +     * We probably can return s->data_end here, but seems safer to return real
>> +     * file length, not trying to hide the preallocation.
>> +     *
>> +     * Still, don't miss the chance to restore s->data_end if it is broken.
>> +     */
>> +    BDRVPreallocateState *s = bs->opaque;
>> +    int64_t ret = bdrv_getlength(bs->file->bs);
>> +
>> +    if (s->data_end < 0) {
>> +        s->data_end = ret;
>> +    }
>> +
>> +    return ret;
>> +}
>> +
>> +BlockDriver bdrv_preallocate_filter = {
>> +    .format_name = "preallocate",
>> +    .instance_size = sizeof(BDRVPreallocateState),
>> +
>> +    .bdrv_getlength = preallocate_getlength,
>> +    .bdrv_open = preallocate_open,
>> +    .bdrv_close = preallocate_close,
>> +
>> +    .bdrv_co_preadv_part = preallocate_co_preadv_part,
>> +    .bdrv_co_pwritev_part = preallocate_co_pwritev_part,
>> +    .bdrv_co_pwrite_zeroes = preallocate_co_pwrite_zeroes,
>> +    .bdrv_co_pdiscard = preallocate_co_pdiscard,
>> +    .bdrv_co_flush = preallocate_co_flush,
>> +    .bdrv_co_truncate = preallocate_co_truncate,
>> +
>> +    .bdrv_co_block_status = bdrv_co_block_status_from_file,
>> +
>> +    .bdrv_child_perm = preallocate_child_perm,
>> +
>> +    .has_variable_length = true,
>> +    .is_filter = true,
>> +};
>> +
>> +static void bdrv_preallocate_init(void)
>> +{
>> +    bdrv_register(&bdrv_preallocate_filter);
>> +}
>> +
>> +block_init(bdrv_preallocate_init);
>> diff --git a/block/Makefile.objs b/block/Makefile.objs
>> index 3635b6b4c1..f46a353a35 100644
>> --- a/block/Makefile.objs
>> +++ b/block/Makefile.objs
>> @@ -45,6 +45,7 @@ block-obj-y += crypto.o
>>   block-obj-y += aio_task.o
>>   block-obj-y += backup-top.o
>>   block-obj-y += filter-compress.o
>> +block-obj-y += preallocate.o
>>   common-obj-y += monitor/
>>   
>>   block-obj-y += stream.o
>> -- 
>> 2.18.0
>>


-- 
Best regards,
Vladimir

Re: [PATCH 3/5] block: introduce preallocate filter
Posted by Stefan Hajnoczi 5 years, 7 months ago
On Wed, Jul 08, 2020 at 07:17:52PM +0300, Vladimir Sementsov-Ogievskiy wrote:
> 08.07.2020 15:07, Stefan Hajnoczi wrote:
> > On Sat, Jun 20, 2020 at 05:36:47PM +0300, Vladimir Sementsov-Ogievskiy wrote:
> > > It may be used for file-systems with slow allocation.
> > > 
> > > Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
> > > ---
> > >   qapi/block-core.json |   3 +-
> > >   block/preallocate.c  | 255 +++++++++++++++++++++++++++++++++++++++++++
> > >   block/Makefile.objs  |   1 +
> > >   3 files changed, 258 insertions(+), 1 deletion(-)
> > >   create mode 100644 block/preallocate.c
> > 
> > Please add documentation to docs/system/qemu-block-drivers.rst.inc
> > describing the purpose of this block driver and how to use it.
> 
> This implies adding new section "Filters", yes?

Yes, please.

> > 
> > Since this filter grows the file I guess it's intended to be below an
> > image format?
> 
> Yes, between format and protocol nodes.

Thanks for confirming. Please include this in the documentation. While
reading the code I was thinking about how this can work if the guest is
directly exposed to the filter. Users might wonder about the same thing.

> > > +    /* Length should not have changed */
> > > +    assert(len == bdrv_getlength(bs->file->bs));
> > > +
> > > +    start = write_zero ? MIN(offset, len) : len;
> > > +    end = QEMU_ALIGN_UP(offset + bytes + s->prealloc_size, s->prealloc_align);
> > > +
> > > +    ret = bdrv_co_pwrite_zeroes_locked(bs->file, start, end - start,
> > > +                                       BDRV_REQ_NO_FALLBACK, lock);
> > > +
> > > +    bdrv_co_range_unlock(lock);
> > 
> > Hmm...if this piece of code is the only user of bdrv_co_range_try_lock()
> > then a BDRV_REQ_NO_WAIT flag might be a simpler API.
> > 
> > I thought the lock request would be used to perform multiple operations,
> > but if it's just for a single operation then I think it's less code and
> > easier to understand without the lock request.
> 
> Hmm, again, I don't remember exact reasons. Firstly, I was afraid of length
> change during try_lock and have a double check for bdrv_getlength(). Then
> I decided that it's impossible and change the check to an assertion.
> Probably, the only reason to leave locked range was "I already have the code,
> it will help with copy-on-read, why not to use it".. OK, I'll try rewrite it
> with help of new flag.

Thanks!

Stefan